Как объединить похожие записи с разными датами действия?

10

Таблица, над которой я работаю, состоит из трех компонентов:

  1. Столбец `ID` (первичный ключ в другой таблице)
  2. Некоторые столбцы данных
  3. Столбцы периода действия: `from` / `to`.

Значения:

ID   Data From        To  
1    a    2015-01-01  2015-01-05
1    a    2015-01-06  2015-01-10
1    b    2015-01-11  2015-01-15
1    a    2015-01-16  2015-01-20
2    c    2015-01-01  2015-01-05
2    c    2015-01-06  2015-01-10

Таблица обновляется путем создания «снимков» другого источника данных через определенные промежутки времени и назначения записям дат действия. Проблема заключается в том, что эти снимки создают дублирующие строки (с разными датами действия) для записей, которые вообще не изменились за этот интервал.

Я хочу уменьшить размер таблицы, ища строки с последовательными датами, объединяя их и назначая им один период действия. Например:

ID   Data From        To  
1    a    2015-01-01  2015-01-10
1    b    2015-01-11  2015-01-15
1    a    2015-01-16  2015-01-20
2    c    2015-01-01  2015-01-10

Моя текущая логика такая:

  1. Выделите и отсортируйте все строки по идентификатору, полям данных и полям 'valid from' (чтобы они были в группах последовательных строк).
  2. Используйте курсор для сравнения соседних строк на предмет сходства.
  3. Если они одинаковы, объедините строки и измените срок действия, чтобы включить обе строки.

Я понимаю, что курсоры очень неэффективны (у меня большой набор данных), поэтому я ищу другие подходы.

hazrmard
источник
2
Также: добавьте в вопрос инструкцию CREATE TABLE.
ypercubeᵀᴹ
2
Насколько велик «большой» набор данных? Почему бы вам не исправить импорт снимка, чтобы он не создавал проблему в первую очередь?
Пол Уайт 9
В порядке миллионов записей. У меня нет прав на изменение способа создания таблицы. Плюс, это не решает проблему с прошлыми записями.
Hazrmard

Ответы:

8

Если в таблице содержатся только диапазоны, идущие вплотную друг за другом, ваш случай можно рассматривать как классическую задачу «разрывов и островков» (gaps and islands): нужно лишь выделить островки последовательных диапазонов, а затем «сжать» их, взяв минимум [from] и максимум [to] для каждого островка.

Существует установленный метод решения этой проблемы с использованием двух вызовов ROW_NUMBER:

-- Collapses each run of consecutive rows (ordered by [from] within an id) that
-- carry the same data value into a single row spanning MIN([from])..MAX([to]).
-- The run key is the difference between a row's position within its id and its
-- position within its (id, data) group: that difference stays constant while
-- the data value does not change from one row to the next.
-- Note: only the ordering by [from] is used; the query does not check that one
-- row's [to] actually touches the next row's [from].
SELECT
  runs.id,
  runs.data,
  MIN(runs.[from]) AS [from],
  MAX(runs.[to])   AS [to]
FROM
(
  SELECT
    src.id,
    src.data,
    src.[from],
    src.[to],
    ROW_NUMBER() OVER (PARTITION BY src.id           ORDER BY src.[from])
  - ROW_NUMBER() OVER (PARTITION BY src.id, src.data ORDER BY src.[from]) AS run_key
  FROM
    #mergeTest AS src
) AS runs
GROUP BY
  runs.id,
  runs.data,
  runs.run_key
;

Этот запрос будет работать начиная с SQL Server 2005 и в более новых версиях.

Андрей М
источник
1

Мне удалось написать запрос для решения этой проблемы. Он использует несколько соединений (JOIN) и цикл WHILE для объединения записей. Этот код совместим с SQL Server 2008 R2.

-- Scratch table holding validity-dated rows used to exercise the merge script.
-- [data] is declared NULL explicitly: the seed rows below insert NULLs, and a
-- column with no NULL/NOT NULL keyword takes its nullability from the
-- session/database ANSI-null-default setting, which can make the INSERT fail.
CREATE TABLE #mergeTest
(
    [id] int NOT NULL,
    [data] date NULL,
    [from] date NOT NULL,
    [to] date NOT NULL
);

-- Seed rows; NULL [data] values are included to test NULL handling.
-- Each row's [from] equals the previous row's [to] within the same id.
INSERT INTO #mergeTest ([id],[data],[from],[to]) VALUES
    (1,NULL,'2015-01-01','2015-01-05'),         --1
    (1,NULL,'2015-01-05','2015-01-10'),         --2
    (1,'2000-01-01','2015-01-10','2015-01-14'), --3
    (1,'2000-01-03','2015-01-14','2015-01-15'), --4
    (1,'2000-01-01','2015-01-15','2015-01-20'), --5
    (1,'2000-01-01','2015-01-20','2015-01-22'), --6
    (1,'2000-01-01','2015-01-22','2015-01-25'), --7
    (1,'2000-01-01','2015-01-25','2015-01-30'), --8
    (1,NULL,'2015-01-30','2015-02-04'),         --9
    (2,'2000-01-05','2015-01-01','2015-01-05'), --10
    (2,'2000-01-05','2015-01-05','2015-01-10'); --11

-- Show the starting data.
SELECT * FROM #mergeTest
GO
;

-- Set aside the (id, data) combinations that occur on exactly one row.
-- Such rows have nothing to merge with, so they are parked in #tempSingle
-- (columns: id, data, from, to) and re-inserted unchanged at the end.
-- With a single row per group, MIN([from]) / MIN([to]) are that row's own dates.
SELECT
    [id],
    [data],
    MIN([from]) AS [from],
    MIN([to])   AS [to]
INTO #tempSingle
FROM #mergeTest
GROUP BY [id], [data]
HAVING COUNT([id]) = 1;
GO
;

-- Collect the (id, data) keys that occur on two or more rows; these are the
-- groups that may contain mergeable records.
-- The key list is built without a helper count column, so no ALTER TABLE is
-- needed afterwards (the previous ALTER spelled the table name with different
-- letter case than the SELECT ... INTO, which breaks on case-sensitive
-- tempdb collations).
SELECT [id], [data]
    INTO #tempRemainingtemp
    FROM #mergeTest
    GROUP BY [id], [data]
    HAVING COUNT([id]) >= 2;

-- Diagnostic output: the duplicated keys.
SELECT [id], [data] FROM #tempRemainingtemp;

-- Pull every source row belonging to one of those keys into #temp.
-- [data] is nullable, so equality is spelled out NULL-safely; the inner
-- parentheses make the intended AND/OR grouping explicit.
SELECT [b].[id], [b].[data], [b].[from], [b].[to]
    INTO #temp
    FROM #tempRemainingtemp AS [a]
    INNER JOIN #mergeTest AS [b]
    ON      [a].[id] = [b].[id]
        AND ([a].[data] = [b].[data] OR ([a].[data] IS NULL AND [b].[data] IS NULL));

DROP TABLE #tempRemainingtemp;
GO
-- Move the candidate rows from #temp into #tempRemaining, then release the
-- #temp name so it can be reused by the next stage of the script.
SELECT
    [id],
    [data],
    [from],
    [to]
INTO #tempRemaining
FROM #temp;

DROP TABLE #temp;
GO
;
-- Stage: tag every candidate row with its neighbours and keep only run boundaries.
-- Input : #tempRemaining (id, data, from, to).
-- Output: #temp (id, data, from, to) with the rows that have a neighbour on
--         exactly one side; rows with no neighbour at all are moved to #tempSingle.
-- "Neighbour" means a row with the same id and data (NULL data matches NULL data)
-- whose [to] equals this row's [from] (previous) or whose [from] equals this
-- row's [to] (next).

-- Diagnostic output: the candidate rows.
SELECT * FROM #tempRemaining
BEGIN
-- #temp0: rows that have a previous neighbour; [prevfrom] is that neighbour's [from].
SELECT t1.*, t2.[from] as [prevfrom] INTO #temp0        --filter in records where previous 'to' date matched current 'from' date when grouped by id and data
    FROM #tempRemaining AS t1
    JOIN #tempRemaining AS t2
    ON      t2.[to] = t1.[from]
        AND t1.[id] = t2.[id]
        AND ([t1].[data]=[t2].[data] OR [t1].[data] IS NULL AND [t2].[data] IS NULL)

-- #temp1: every candidate row; [prevfrom] is NULL when #temp0 has no matching row.
SELECT t1.*, t2.[prevfrom] INTO #temp1                  --add records that did not have a previous 'to' date b/c they were the extreme records in their group
    FROM #tempRemaining AS t1
    LEFT JOIN #temp0 AS t2
    ON      t1.[id]=t2.[id]
        AND ([t1].[data]=[t2].[data] OR [t1].[data] IS NULL AND [t2].[data] IS NULL)
        AND t1.[from] = t2.[from];

DROP TABLE #temp0;

-- #temp2: rows that have a next neighbour; [nextto] is that neighbour's [to].
SELECT t1.*, t2.[to] as [nextto] INTO #temp2            --filter in records where current 'to' date matched next 'from' date when grouped by id and data
    FROM #temp1 AS t1
    JOIN #temp1 AS t2
    ON      t2.[from] = t1.[to]
        AND t1.[id] = t2.[id]
        AND ([t1].[data]=[t2].[data] OR [t1].[data] IS NULL AND [t2].[data] IS NULL);

-- #temp: every candidate row with both helper columns; [nextto] is NULL when
-- #temp2 has no matching row.
SELECT t1.*, t2.[nextto] INTO #temp                     --add records that did not have a next 'from' date b/c they were the extreme records in their group
    FROM #temp1 AS t1
    LEFT JOIN #temp2 AS t2
    ON      t1.[id]=t2.[id]
        AND ([t1].[data]=[t2].[data] OR [t1].[data] IS NULL AND [t2].[data] IS NULL)
        AND t1.[from] = t2.[from];

DROP TABLE #temp2;
DROP TABLE #temp1;

-- Rows with a neighbour on both sides lie inside a run and add no information.
DELETE FROM #temp                                       --delete redundant records
    WHERE   [prevfrom] IS NOT NULL
        AND [nextto] IS NOT NULL;

-- Rows with no neighbour on either side are moved (DELETE ... OUTPUT) into
-- #tempSingle; the CTE restricts the moved columns to id, data, from, to.
WITH cte AS (                                           --select records that got reduced to singles and insert them into singles account
    SELECT [id], [data], [from], [to]
        FROM [#temp]
        WHERE   [prevfrom] IS NULL
            AND [nextto] IS NULL)
DELETE FROM cte
OUTPUT deleted.* INTO #tempSingle

/* ALL DUPLICATE RECORDS ARE NOW REDUCED TO PAIRS*/

-- Diagnostic output, then drop the helper columns so #temp is back to
-- (id, data, from, to) for the merge step that follows.
SELECT * FROM #temp;
ALTER TABLE #temp
    DROP COLUMN [nextto],[prevfrom]                     --remove helper columns
END

-- Merge the boundary rows left in #temp into one row per run and store the
-- result in #tempResult (id, data, from, to).
--
-- At this point #temp is expected to hold, for every run, its first and its
-- last row. Ordered by [from] within an (id, data) group, rows 1-2 belong to
-- the first run, rows 3-4 to the second, and so on.
--
-- This replaces a WHILE loop that removed and merged one "TOP 2" pair per
-- iteration (re-sorting #temp every time) with a single set-based statement.
-- Pairing is done per (id, data) group, so a malformed group can no longer be
-- paired with rows of another group; a row left without a partner is passed
-- through unchanged instead of being discarded.
-- Assumes [from] <= [to] on every row, so MAX([to]) is the closing row's [to].

-- Empty result table with the same column definitions as #temp.
SELECT [id], [data], [from], [to]
    INTO #tempResult
    FROM #temp
    WHERE 1 = 0;

WITH numbered AS (
    SELECT [id], [data], [from], [to],
           ROW_NUMBER() OVER (PARTITION BY [id], [data] ORDER BY [from]) AS [pos]
        FROM #temp)
INSERT INTO #tempResult ([id], [data], [from], [to])
    SELECT [id], [data], MIN([from]), MAX([to])
        FROM numbered
        GROUP BY [id], [data], ([pos] + 1) / 2;         --integer division: positions 1,2 -> pair 1; 3,4 -> pair 2; ...

-- The loop consumed #temp row by row; leave it empty here as well.
TRUNCATE TABLE #temp;

-- Replace the contents of #mergeTest with the processed rows:
-- merged runs from #tempResult plus the untouched single rows from #tempSingle.
TRUNCATE TABLE #mergeTest;

INSERT INTO #mergeTest ([id], [data], [from], [to])
    SELECT [id], [data], [from], [to]
    FROM #tempResult;

INSERT INTO #mergeTest ([id], [data], [from], [to])
    SELECT [id], [data], [from], [to]
    FROM #tempSingle;

-- Final result, ordered for reading.
SELECT [id], [data], [from], [to]
FROM #mergeTest
ORDER BY [id], [from];
hazrmard
источник
0

На случай, если у вас есть несмежные диапазоны дат, которые, хотя и идут по порядку, должны оставаться отдельными записями, я предложил следующее решение:

Смотри на SQL Fiddle

-- Merges consecutive rows of the same ID that carry the same Data and follow
-- each other without a gap; rows separated by a gap stay separate.
-- Steps: (1) look at the previous row per ID, (2) flag rows that start a new
-- interval, (3) number intervals with a running sum of the flags,
-- (4) collapse each interval to MIN([From]) .. MAX([To]).
WITH lag_info AS (
  SELECT
    ID,
    Data,
    [From],
    [To],
    -- previous row of the same ID in [From] order; NULL for the first row
    LAG([To]) OVER (PARTITION BY ID ORDER BY [From]) AS PrevTo,
    LAG(Data) OVER (PARTITION BY ID ORDER BY [From]) AS PrevData
  FROM dat
),
segmented AS (
  SELECT
    ID,
    Data,
    [From],
    [To],
    -- A row starts a new interval when
    --   * there is no previous [To] (first row of the ID), or
    --   * it begins more than one day after the previous row ended, or
    --   * its Data differs from the previous row's Data.
    -- The Data comparison is written NULL-aware: NULL equals NULL, and NULL
    -- differs from any value. A plain "PrevData IS NULL" test would start a
    -- new interval after every NULL row, so runs of NULL data never merged.
    CASE
      WHEN PrevTo IS NULL
        OR DATEDIFF(DAY, PrevTo, [From]) > 1
        OR Data <> PrevData
        OR (Data IS NULL AND PrevData IS NOT NULL)
        OR (Data IS NOT NULL AND PrevData IS NULL)
      THEN 1
      ELSE 0
    END AS is_new_interval
  FROM lag_info
),
segmented_marked AS (
  SELECT
    ID,
    [From],
    [To],
    Data,
    -- running count of interval starts = interval number within the ID
    SUM(s.is_new_interval)
      OVER (PARTITION BY ID ORDER BY [From]
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS interval_id
  FROM segmented AS s
)
SELECT
  ID,
  MIN([From]) AS [From],
  MAX([To])   AS [To],
  Data
FROM segmented_marked
GROUP BY ID, Data, interval_id;
AlexanderMP
источник
-1

Я написал запрос, который, кажется, работает. Он использует общие табличные выражения, операторы MERGE и аналитические функции. Однако он совместим только с SQL Server 2012+. Вы можете найти gist здесь: MergeRecordsByValidityDate.sql

/*  Requires SQL Server 2012 or later.
    Demo: merging identical records that have different validity dates.
    This part (re)creates the test table and loads sample rows.
*/
-- NOTE(review): the script switches to [master] and creates its table there;
-- presumably for demo convenience only - confirm before running on a shared server.
USE [master]


-- Start from a clean slate if the table is left over from an earlier run.
IF OBJECT_ID('mergeTest') IS NOT NULL
BEGIN
    DROP TABLE mergeTest
END

-- Test table: one row per (id, data) value and validity period.
CREATE TABLE mergeTest
(
    [id]   int     NOT NULL,
    [data] char(1) NOT NULL,
    [from] date    NOT NULL,
    [to]   date    NOT NULL
);

-- Sample rows; each row's [from] equals the [to] of the row before it
-- within the same id. The trailing number is a row label used in later comments.
INSERT INTO mergeTest ([id], [data], [from], [to])
VALUES
    (1, 'a', '2015-01-01', '2015-01-05'),  --1
    (1, 'a', '2015-01-05', '2015-01-10'),  --2
    (1, 'a', '2015-01-10', '2015-01-14'),  --3
    (1, 'b', '2015-01-14', '2015-01-15'),  --4
    (1, 'a', '2015-01-15', '2015-01-20'),  --5
    (1, 'a', '2015-01-20', '2015-01-25'),  --6
    (1, 'a', '2015-01-25', '2015-01-30'),  --7
    (1, 'a', '2015-01-30', '2015-02-04'),  --8
    (2, 'c', '2015-01-01', '2015-01-05'),  --9
    (2, 'c', '2015-01-05', '2015-01-10')   --10

-- Show the starting data.
SELECT [id], [data], [from], [to] FROM mergeTest

/*  Merge runs of back-to-back rows in mergeTest into one row per run.

    Two rows belong to the same run when they share [id] and [data] and the
    earlier row's [to] equals the later row's [from].

    Method (gaps and islands, SQL Server 2012+):
      1. flaggedRows  - within each (id, data) partition ordered by [from], a row
                        starts a new run unless the previous row's [to] equals
                        its [from] (LAG returns NULL for the first row, so the
                        first row always starts a run).
      2. numberedRuns - a running SUM of those flags numbers the runs.
      3. Each run collapses to MIN([from]) .. MAX([to]).

    This replaces a multi-pass LAG/LEAD + MERGE pipeline that kept only rows
    whose 'last' or 'next' marker was NULL. A merged run that was neither the
    first nor the last of its (id, data) partition was dropped, e.g. with
    a[1-5], a[5-10], b[10-15], a[15-20], a[20-25] the second 'a' run was lost.
    The intermediate diagnostic result sets of that pipeline are gone; the
    merged rows and the final table contents are still displayed.
*/

;WITH flaggedRows AS (
    SELECT [id],[data],[from],[to],
           CASE WHEN LAG([to]) OVER(PARTITION BY [id],[data] ORDER BY [from]) = [from]
                THEN 0
                ELSE 1
           END AS [startsRun]
    FROM mergeTest),
numberedRuns AS (
    SELECT [id],[data],[from],[to],
           SUM([startsRun]) OVER(PARTITION BY [id],[data] ORDER BY [from]
                                 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS [runNo]
    FROM flaggedRows)
SELECT [id],[data],MIN([from]) AS [from],MAX([to]) AS [to] INTO #temp
    FROM numberedRuns
    GROUP BY [id],[data],[runNo]
;

-- Diagnostic output: the merged rows before they are written back.
SELECT [id],[data],[from],[to] FROM #temp
;

/*  The merged rows are staged in #temp because the original table is emptied
    before the merged rows are inserted.
*/
TRUNCATE TABLE mergeTest        -- resetting original table
;

INSERT INTO mergeTest ([id],[data],[from],[to])     -- inserting merged records into table
    SELECT [id],[data],[from],[to]
    FROM #temp
;

SELECT * FROM mergeTest
;

DROP TABLE #temp
;
hazrmard
источник