Continuous Aggregate gets wrong data and duplicated time bucket after refreshing

pstreff · November 17, 2023, 9:03am

Hello,

I am having trouble with a monthly Continuous Aggregate on top of a daily one. I create the cagg with the WITH NO DATA option. At that point, when querying the cagg it has the correct data. But as soon as I manually refresh the cagg with refresh_continuous_aggregate it has wrong data and one time bucket is duplicated (once with the correct/expected data and once with the wrong data).

I am using timescaledb v2.10.2 and postgresql version 14 (running the timescale/timescaledb-ha:pg14-latest docker image)

Here is what I have done:

CREATE TABLE object_scans (
	id uuid NOT NULL,
	scanned_at timestamp(0) NOT NULL,
	product_version_id uuid NULL,
	campaign_version_id uuid NOT NULL,
	created_at timestamp(0) NULL,
	updated_at timestamp(0) NULL,
);

SELECT create_hypertable('object_scans', 'scanned_at', chunk_time_interval => INTERVAL '1 day');

/* Inserting some test data with copy to and copy from with the following query
SELECT 
    gen_random_uuid() as id,
    scanned_at,
    '99c15bea-39d0-454e-8c02-58076a6ddd2f' as product_version_id,
    '99c15bea-3763-40cf-891a-7ffbde74e38d' as campaign_version_id,
    scanned_at as created_at,
    scanned_at as updated_at
FROM generate_series('2023-01-01 00:00:00', '2023-11-16 13:00:00', INTERVAL '31 minutes') as scanned_at;
*/

-- Creating the daily cagg
CREATE MATERIALIZED VIEW scans_daily
WITH (timescaledb.continuous) AS
SELECT "object_scans"."campaign_version_id" as "campaign_version_id", 
       "object_scans"."product_version_id" AS "product_version_id",
       time_bucket(INTERVAL '1 day', "scanned_at") AS "time_bucket",
       COUNT(DISTINCT "object_scans"."id") AS "scans"
FROM "object_scans"
GROUP BY "campaign_version_id", "product_version_id", "time_bucket"
WITH NO DATA;

CALL refresh_continuous_aggregate('scans_daily', NULL, localtimestamp - INTERVAL '2 days');

-- This now has correct data
SELECT time_bucket, scans FROM scans_daily
WHERE time_bucket >= '2023-10-01';

Select results

time_bucket            |scans|
-----------------------+-----+
2023-10-01 00:00:00.000|   46|
2023-10-02 00:00:00.000|   47|
2023-10-03 00:00:00.000|   46|
2023-10-04 00:00:00.000|   47|
2023-10-05 00:00:00.000|   46|
2023-10-06 00:00:00.000|   46|
2023-10-07 00:00:00.000|   47|
2023-10-08 00:00:00.000|   46|
2023-10-09 00:00:00.000|   47|
2023-10-10 00:00:00.000|   46|
2023-10-11 00:00:00.000|   47|
2023-10-12 00:00:00.000|   46|
2023-10-13 00:00:00.000|   47|
2023-10-14 00:00:00.000|   46|
2023-10-15 00:00:00.000|   47|
2023-10-16 00:00:00.000|   46|
2023-10-17 00:00:00.000|   46|
2023-10-18 00:00:00.000|   47|
2023-10-19 00:00:00.000|   46|
2023-10-20 00:00:00.000|   47|
2023-10-21 00:00:00.000|   46|
2023-10-22 00:00:00.000|   47|
2023-10-23 00:00:00.000|   46|
2023-10-24 00:00:00.000|   47|
2023-10-25 00:00:00.000|   46|
2023-10-26 00:00:00.000|   47|
2023-10-27 00:00:00.000|   46|
2023-10-28 00:00:00.000|   46|
2023-10-29 00:00:00.000|   47|
2023-10-30 00:00:00.000|   46|
2023-10-31 00:00:00.000|   47|
2023-11-01 00:00:00.000|   46|
2023-11-02 00:00:00.000|   47|
2023-11-03 00:00:00.000|   46|
2023-11-04 00:00:00.000|   47|
2023-11-05 00:00:00.000|   46|
2023-11-06 00:00:00.000|   46|
2023-11-07 00:00:00.000|   47|
2023-11-08 00:00:00.000|   46|
2023-11-09 00:00:00.000|   47|
2023-11-10 00:00:00.000|   46|
2023-11-11 00:00:00.000|   47|
2023-11-12 00:00:00.000|   46|
2023-11-13 00:00:00.000|   47|
2023-11-14 00:00:00.000|   46|
2023-11-15 00:00:00.000|   47|
2023-11-16 00:00:00.000|   36|

-- Create monthly cagg
CREATE MATERIALIZED VIEW scans_monthly
WITH (timescaledb.continuous) AS
SELECT "scans_daily"."campaign_version_id" as "campaign_version_id", 
       "scans_daily"."product_version_id" AS "product_version_id",
       time_bucket(INTERVAL '1 month', "scans_daily"."time_bucket") AS "time_bucket",
       SUM("scans_daily"."scans") AS "scans"
FROM "scans_daily_by_product"
GROUP BY "scans_daily"."campaign_version_id", "scans_daily"."product_version_id", time_bucket(INTERVAL '1 month', "scans_daily"."time_bucket")
WITH NO DATA;

-- Querying the monthly cagg now gives the correct scans amount
SELECT time_bucket, scans from scans_monthly;

Select results

time_bucket            |scans|
-----------------------+-----+
2023-01-01 00:00:00.000| 1440|
2023-02-01 00:00:00.000| 1301|
2023-03-01 00:00:00.000| 1440|
2023-04-01 00:00:00.000| 1394|
2023-05-01 00:00:00.000| 1440|
2023-06-01 00:00:00.000| 1393|
2023-07-01 00:00:00.000| 1440|
2023-08-01 00:00:00.000| 1440|
2023-09-01 00:00:00.000| 1394|
2023-10-01 00:00:00.000| 1440|
2023-11-01 00:00:00.000|  733|

Now when refreshing the monthly cagg manually, strange things start to happen.

-- refresh cagg
CALL refresh_continuous_aggregate('scans_monthly', NULL, localtimestamp - INTERVAL '1 day');

-- Querying the monthly cagg now gives the incorrect values and duplicated time bucket
SELECT time_bucket, scans from scans_monthly;

Select results

time_bucket            |scans|
-----------------------+-----+
2023-01-01 00:00:00.000| 1440|
2023-02-01 00:00:00.000| 1301|
2023-03-01 00:00:00.000| 1440|
2023-04-01 00:00:00.000| 1394|
2023-05-01 00:00:00.000| 1440|
2023-06-01 00:00:00.000| 1393|
2023-07-01 00:00:00.000| 1440|
2023-08-01 00:00:00.000| 1440|
2023-09-01 00:00:00.000| 1394|
2023-10-01 00:00:00.000| 1440|
2023-10-01 00:00:00.000|  140|
2023-11-01 00:00:00.000|  733|

Notice how the october time bucket appears twice.

2023-10-01 00:00:00.000| 1440|
2023-10-01 00:00:00.000|  140|

I am not sure what is going on or what I might have configured/created wrongly here.

jonatasdp · November 29, 2023, 8:54pm

Hi @pstreff, it seems something is wrong locally, I tried your scripts on 2.12.0 / PG 14 and it worked very as expected with no duplications

look my results running here:

-- DROP TABLE object_scans CASCADE;
CREATE TABLE object_scans (
	id uuid NOT NULL,
	scanned_at timestamp(0) NOT NULL,
	product_version_id uuid NULL,
	campaign_version_id uuid NOT NULL,
	created_at timestamp(0) NULL,
	updated_at timestamp(0) NULL);

SELECT create_hypertable('object_scans', 'scanned_at', chunk_time_interval => INTERVAL '1 day');

INSERT INTO object_scans SELECT 
    gen_random_uuid() as id,
    scanned_at,
    '99c15bea-39d0-454e-8c02-58076a6ddd2f' as product_version_id,
    '99c15bea-3763-40cf-891a-7ffbde74e38d' as campaign_version_id,
    scanned_at as created_at,
    scanned_at as updated_at
FROM generate_series('2023-01-01 00:00:00', '2023-11-16 13:00:00', INTERVAL '31 minutes') as scanned_at;

-- Creating the daily cagg
CREATE MATERIALIZED VIEW scans_daily
WITH (timescaledb.continuous) AS
SELECT "object_scans"."campaign_version_id" as "campaign_version_id", 
       "object_scans"."product_version_id" AS "product_version_id",
       time_bucket(INTERVAL '1 day', "scanned_at") AS "time_bucket",
       COUNT(DISTINCT "object_scans"."id") AS "scans"
FROM "object_scans"
GROUP BY "campaign_version_id", "product_version_id", "time_bucket"
WITH NO DATA;

CALL refresh_continuous_aggregate('scans_daily', NULL, localtimestamp - INTERVAL '2 days');

CREATE MATERIALIZED VIEW scans_monthly
WITH (timescaledb.continuous) AS
SELECT "scans_daily"."campaign_version_id" as "campaign_version_id",
       "scans_daily"."product_version_id" AS "product_version_id",
       time_bucket(INTERVAL '1 month', "scans_daily"."time_bucket") AS "time_bucket",
       SUM("scans_daily"."scans") AS "scans"
FROM "scans_daily"
GROUP BY "scans_daily"."campaign_version_id", "scans_daily"."product_version_id", time_bucket(INTERVAL '1 month', "scans_daily"."time_bucket")
WITH NO DATA;

select time_bucket, scans from scans_monthly  order by time_bucket;
CALL refresh_continuous_aggregate('scans_monthly', NULL, localtimestamp - INTERVAL '1 day');
select time_bucket, scans from scans_monthly  order by time_bucket;

output of the last two queries:

┌─────────────────────┬───────┐
│     time_bucket     │ scans │
├─────────────────────┼───────┤
│ 2023-01-01 00:00:00 │  1440 │
│ 2023-02-01 00:00:00 │  1301 │
│ 2023-03-01 00:00:00 │  1440 │
│ 2023-04-01 00:00:00 │  1394 │
│ 2023-05-01 00:00:00 │  1440 │
│ 2023-06-01 00:00:00 │  1393 │
│ 2023-07-01 00:00:00 │  1440 │
│ 2023-08-01 00:00:00 │  1440 │
│ 2023-09-01 00:00:00 │  1394 │
│ 2023-10-01 00:00:00 │  1440 │
│ 2023-11-01 00:00:00 │   722 │
└─────────────────────┴───────┘
(11 rows)

CALL
┌─────────────────────┬───────┐
│     time_bucket     │ scans │
├─────────────────────┼───────┤
│ 2023-01-01 00:00:00 │  1440 │
│ 2023-02-01 00:00:00 │  1301 │
│ 2023-03-01 00:00:00 │  1440 │
│ 2023-04-01 00:00:00 │  1394 │
│ 2023-05-01 00:00:00 │  1440 │
│ 2023-06-01 00:00:00 │  1393 │
│ 2023-07-01 00:00:00 │  1440 │
│ 2023-08-01 00:00:00 │  1440 │
│ 2023-09-01 00:00:00 │  1394 │
│ 2023-10-01 00:00:00 │  1440 │
│ 2023-11-01 00:00:00 │   722 │
└─────────────────────┴───────┘
(11 rows)

pstreff · December 1, 2023, 11:55am

Hey @jonatasdp, thanks for verifying. I will check it locally once again.