Performance issues with materialized_only=false

petergledhillinclusi · January 12, 2024, 1:15pm

I’m trying to use the realtime aggregate feature but having major issues with performance. Perhaps it’s something I’m doing like a missing index but I just can’t see it.

I’m using timescale 2.13.0 on postgres 15.

I’ve create a test script to demonstrate this.

Test1: Real time agg disabled. Performance is great when joining another table

Test2: Real time agg enabled, simple filter. Performance isn’t great, the query seems to scan the whole aggregate but also the whole underlying hypertable. I expected it would only scan the data which isn’t present in the aggregate.

Test3: Real time agg enabled, filter on join table. Performance drops off a cliff. No indexes seem to be used. The same query with realtime disabled does use the indexes on the the agg table.

DROP MATERIALIZED VIEW IF EXISTS gas_telemetry_hourly;
DROP TABLE IF EXISTS device;
DROP TABLE IF EXISTS gas_telemetry;

-- Create tables
CREATE TABLE IF NOT EXISTS gas_telemetry
(
    "timestamp" timestamp with time zone NOT NULL,
    device_id text NOT NULL,
    flow integer,
    pressure integer,
    volume_delta bigint,
    CONSTRAINT gas_telemetry_pkey PRIMARY KEY (device_id, "timestamp")
);

SELECT create_hypertable('gas_telemetry', 'timestamp');

CREATE TABLE IF NOT EXISTS device
(
    device_id text NOT NULL,
	group_id text NOT NULL,
    CONSTRAINT device_pkey PRIMARY KEY (device_id)
);
CREATE INDEX ON device(group_id);

-- Add some data
INSERT INTO device(device_id, group_id)
SELECT 
	concat('device', generate_series(1,20)) device_id, 
	concat('group', generate_series(1,20)) group_id;


INSERT INTO gas_telemetry(timestamp, device_id, flow, pressure, volume_delta)
SELECT 
	timestamp, 
	concat('device', device_id),
	floor(random()*100)::int as flow,  
	floor(random()*100)::int as pressure, 
	floor(random()*100)::int as volume_delta
FROM generate_series(
	'2021-01-01 00:00:00',
    '2024-01-01 00:00:00',
    INTERVAL '1 minute'
  ) as timestamp,
generate_series(1,20) device_id;

-- Create continuous agg
CREATE MATERIALIZED VIEW gas_telemetry_hourly
	WITH (timescaledb.continuous) AS
	SELECT time_bucket(INTERVAL '1 hour', timestamp) AS bucket,
	device_id,
	MAX(pressure) AS pressure_max,
	MIN(pressure) AS pressure_min,
	MAX(flow) AS flow_max,
	MIN(flow) AS flow_min,
	SUM(volume_delta) AS volume_delta_sum
	FROM gas_telemetry t
	GROUP BY device_id, bucket
	WITH NO DATA;


-- RUN MANUALLY
CALL refresh_continuous_aggregate('gas_telemetry_hourly', '2021-01-01 00:00:00', '2023-01-01 00:00:00');


-- TEST 1: JOIN with filter on group_id - Realtime agg disabled -- 41ms - Index Scans all the way 
-- https://explain.depesz.com/s/EkoC
ALTER MATERIALIZED VIEW gas_telemetry_hourly set (timescaledb.materialized_only=true);

SELECT sum(volume_delta_sum)
FROM device d
JOIN gas_telemetry_hourly t using (device_id)
WHERE d.group_id = 'group1'
GROUP BY t.device_id;

-- TEST 2: filter on device_id - Realtime agg enabled - 1034ms 
-- Uses index scans but seems to scan underlying hypertable unnecessarily
-- https://explain.depesz.com/s/8cC6
ALTER MATERIALIZED VIEW gas_telemetry_hourly set (timescaledb.materialized_only=false);

SELECT sum(volume_delta_sum)
FROM gas_telemetry_hourly 
WHERE device_id = 'device1'
GROUP BY device_id;

-- TEST 3: JOIN with filter on group_id - Realtime agg enabled - 13 seconds -- Seq scans on aggregate table and hypertable
-- https://explain.depesz.com/s/8xyE
ALTER MATERIALIZED VIEW gas_telemetry_hourly set (timescaledb.materialized_only=true);

SELECT sum(volume_delta_sum)
FROM device d
JOIN gas_telemetry_hourly t using (device_id)
WHERE d.group_id = 'group1'
GROUP BY t.device_id;

Any help figuring this out would be very much appreciated