Skip to content

Commit 0eb50f5

Browse files
alambde-bgunter
authored and committed
Add tests for simplifying multiple aggregate expressions (apache#20723)
## Which issue does this PR close?

- Part of apache#15524
- Related to apache#20749

## Rationale for this change

As part of apache#15524 I am working on some optimizations for queries with multiple aggregates. To make it clear what is changing, and ensure I don't introduce regressions, I want to add the tests to main first.

Merging the tests first should also make apache#20749 easier to review.

## What changes are included in this PR?

1. Add new tests

## Are these changes tested?

Yes, all tests

## Are there any user-facing changes?

No
1 parent 2e57469 commit 0eb50f5

1 file changed

Lines changed: 344 additions & 0 deletions

File tree

Lines changed: 344 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,344 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
#######
19+
# Tests for aggregate optimizations / simplifications
20+
#######
21+
22+
statement ok
23+
CREATE TABLE sum_simplify_t AS VALUES (1, 100), (1, 200), (2, 100), (NULL, NULL);
24+
25+
# Baseline SUM of an expression
26+
query I
27+
SELECT SUM(column1 + 1) FROM sum_simplify_t;
28+
----
29+
7
30+
31+
query TT
32+
EXPLAIN SELECT SUM(column1 + 1) FROM sum_simplify_t;
33+
----
34+
logical_plan
35+
01)Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1 + Int64(1))]]
36+
02)--TableScan: sum_simplify_t projection=[column1]
37+
physical_plan
38+
01)AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1 + Int64(1))]
39+
02)--DataSourceExec: partitions=1, partition_sizes=[1]
40+
41+
42+
# Mixed aggregate expressions with type validation.
# The value check runs the same statement as the EXPLAIN record that follows,
# so the asserted result and the asserted plan describe one query.
# Expected values: SUM(column1) = 1 + 1 + 2 = 4 and
# SUM(column1 + 1) = 2 + 2 + 3 = 7; the (NULL, NULL) row is ignored by SUM.
43+
query TII
44+
SELECT arrow_typeof(SUM(column1)), SUM(column1), SUM(column1 + 1) FROM sum_simplify_t;
45+
----
46+
Int64 4 7
47+
48+
query TT
49+
EXPLAIN SELECT arrow_typeof(SUM(column1)), SUM(column1), SUM(column1 + 1) FROM sum_simplify_t;
50+
----
51+
logical_plan
52+
01)Projection: arrow_typeof(sum(sum_simplify_t.column1)), sum(sum_simplify_t.column1), sum(sum_simplify_t.column1 + Int64(1))
53+
02)--Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1), sum(sum_simplify_t.column1 + Int64(1))]]
54+
03)----TableScan: sum_simplify_t projection=[column1]
55+
physical_plan
56+
01)ProjectionExec: expr=[arrow_typeof(sum(sum_simplify_t.column1)@0) as arrow_typeof(sum(sum_simplify_t.column1)), sum(sum_simplify_t.column1)@0 as sum(sum_simplify_t.column1), sum(sum_simplify_t.column1 + Int64(1))@1 as sum(sum_simplify_t.column1 + Int64(1))]
57+
02)--AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1), sum(sum_simplify_t.column1 + Int64(1))]
58+
03)----DataSourceExec: partitions=1, partition_sizes=[1]
59+
60+
# Duplicate aggregate expressions
61+
query II
62+
SELECT SUM(column1 + 1) AS sum_plus_1_a, SUM(column1 + 1) AS sum_plus_1_b FROM sum_simplify_t;
63+
----
64+
7 7
65+
66+
query TT
67+
EXPLAIN SELECT SUM(column1 + 1) AS sum_plus_1_a, SUM(column1 + 1) AS sum_plus_1_b FROM sum_simplify_t;
68+
----
69+
logical_plan
70+
01)Projection: sum(sum_simplify_t.column1 + Int64(1)) AS sum_plus_1_a, sum(sum_simplify_t.column1 + Int64(1)) AS sum_plus_1_b
71+
02)--Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1 + Int64(1))]]
72+
03)----TableScan: sum_simplify_t projection=[column1]
73+
physical_plan
74+
01)ProjectionExec: expr=[sum(sum_simplify_t.column1 + Int64(1))@0 as sum_plus_1_a, sum(sum_simplify_t.column1 + Int64(1))@0 as sum_plus_1_b]
75+
02)--AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1 + Int64(1))]
76+
03)----DataSourceExec: partitions=1, partition_sizes=[1]
77+
78+
79+
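# Constant aggregate expressions: `2+1` is folded to `3`, so both outputs share a
# single sum(Int64(3)) aggregate; over the 4 rows of the table each sum is 12.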
80+
query II
81+
SELECT SUM(2+1), SUM(3) FROM sum_simplify_t;
82+
----
83+
12 12
84+
85+
query TT
86+
EXPLAIN SELECT SUM(2+1), SUM(3) FROM sum_simplify_t;
87+
----
88+
logical_plan
89+
01)Projection: __common_expr_1 AS sum(Int64(2) + Int64(1)), __common_expr_1 AS sum(Int64(3))
90+
02)--Aggregate: groupBy=[[]], aggr=[[sum(Int64(3)) AS __common_expr_1]]
91+
03)----TableScan: sum_simplify_t projection=[]
92+
physical_plan
93+
01)ProjectionExec: expr=[__common_expr_1@0 as sum(Int64(2) + Int64(1)), __common_expr_1@0 as sum(Int64(3))]
94+
02)--AggregateExec: mode=Single, gby=[], aggr=[__common_expr_1]
95+
03)----DataSourceExec: partitions=1, partition_sizes=[1]
96+
97+
98+
# Same column (`column1`) with different constant offsets across multiple aggregate arguments.
99+
query II
100+
SELECT SUM(column1 + 1), SUM(column1 + 2) FROM sum_simplify_t;
101+
----
102+
7 10
103+
104+
105+
query TT
106+
EXPLAIN SELECT SUM(column1 + 1), SUM(column1 + 2) FROM sum_simplify_t;
107+
----
108+
logical_plan
109+
01)Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1 + Int64(1)), sum(sum_simplify_t.column1 + Int64(2))]]
110+
02)--TableScan: sum_simplify_t projection=[column1]
111+
physical_plan
112+
01)AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1 + Int64(1)), sum(sum_simplify_t.column1 + Int64(2))]
113+
02)--DataSourceExec: partitions=1, partition_sizes=[1]
114+
115+
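# Reordered operands: `1 + column1` computes the same thing as `column1 + 1`,
# so the results match the previous `SUM(column1 + 1), SUM(column1 + 2)` query.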
116+
query II
117+
SELECT SUM(1 + column1), SUM(column1 + 2) FROM sum_simplify_t;
118+
----
119+
7 10
120+
121+
query TT
122+
EXPLAIN SELECT SUM(1 + column1), SUM(column1 + 2) FROM sum_simplify_t;
123+
----
124+
logical_plan
125+
01)Aggregate: groupBy=[[]], aggr=[[sum(Int64(1) + sum_simplify_t.column1), sum(sum_simplify_t.column1 + Int64(2))]]
126+
02)--TableScan: sum_simplify_t projection=[column1]
127+
physical_plan
128+
01)AggregateExec: mode=Single, gby=[], aggr=[sum(Int64(1) + sum_simplify_t.column1), sum(sum_simplify_t.column1 + Int64(2))]
129+
02)--DataSourceExec: partitions=1, partition_sizes=[1]
130+
131+
# DISTINCT aggregates with different arguments
132+
query II
133+
SELECT SUM(DISTINCT column1 + 1), SUM(DISTINCT column1 + 2) FROM sum_simplify_t;
134+
----
135+
5 7
136+
137+
query TT
138+
EXPLAIN SELECT SUM(DISTINCT column1 + 1), SUM(DISTINCT column1 + 2) FROM sum_simplify_t;
139+
----
140+
logical_plan
141+
01)Aggregate: groupBy=[[]], aggr=[[sum(DISTINCT sum_simplify_t.column1 + Int64(1)), sum(DISTINCT sum_simplify_t.column1 + Int64(2))]]
142+
02)--TableScan: sum_simplify_t projection=[column1]
143+
physical_plan
144+
01)AggregateExec: mode=Single, gby=[], aggr=[sum(DISTINCT sum_simplify_t.column1 + Int64(1)), sum(DISTINCT sum_simplify_t.column1 + Int64(2))]
145+
02)--DataSourceExec: partitions=1, partition_sizes=[1]
146+
147+
# DISTINCT and non-DISTINCT aggregates
148+
query II
149+
SELECT SUM(DISTINCT column1 + 1), SUM(column1 + 1) FROM sum_simplify_t;
150+
----
151+
5 7
152+
153+
query TT
154+
EXPLAIN SELECT SUM(DISTINCT column1 + 1), SUM(column1 + 1) FROM sum_simplify_t;
155+
----
156+
logical_plan
157+
01)Projection: sum(alias1) AS sum(DISTINCT sum_simplify_t.column1 + Int64(1)), sum(alias2) AS sum(sum_simplify_t.column1 + Int64(1))
158+
02)--Aggregate: groupBy=[[]], aggr=[[sum(alias1), sum(alias2)]]
159+
03)----Aggregate: groupBy=[[__common_expr_1 AS alias1]], aggr=[[sum(__common_expr_1) AS alias2]]
160+
04)------Projection: sum_simplify_t.column1 + Int64(1) AS __common_expr_1
161+
05)--------TableScan: sum_simplify_t projection=[column1]
162+
physical_plan
163+
01)ProjectionExec: expr=[sum(alias1)@0 as sum(DISTINCT sum_simplify_t.column1 + Int64(1)), sum(alias2)@1 as sum(sum_simplify_t.column1 + Int64(1))]
164+
02)--AggregateExec: mode=Final, gby=[], aggr=[sum(alias1), sum(alias2)]
165+
03)----CoalescePartitionsExec
166+
04)------AggregateExec: mode=Partial, gby=[], aggr=[sum(alias1), sum(alias2)]
167+
05)--------AggregateExec: mode=FinalPartitioned, gby=[alias1@0 as alias1], aggr=[alias2]
168+
06)----------RepartitionExec: partitioning=Hash([alias1@0], 4), input_partitions=1
169+
07)------------AggregateExec: mode=Partial, gby=[__common_expr_1@0 as alias1], aggr=[alias2]
170+
08)--------------ProjectionExec: expr=[column1@0 + 1 as __common_expr_1]
171+
09)----------------DataSourceExec: partitions=1, partition_sizes=[1]
172+
173+
# FILTER clauses with different aggregate arguments
174+
query II
175+
SELECT SUM(column1 + 1) FILTER (WHERE column1 > 1), SUM(column1 + 2) FILTER (WHERE column1 > 2) FROM sum_simplify_t;
176+
----
177+
3 NULL
178+
179+
query TT
180+
EXPLAIN SELECT SUM(column1 + 1) FILTER (WHERE column1 > 1), SUM(column1 + 2) FILTER (WHERE column1 > 2) FROM sum_simplify_t;
181+
----
182+
logical_plan
183+
01)Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1)), sum(sum_simplify_t.column1 + Int64(2)) FILTER (WHERE sum_simplify_t.column1 > Int64(2))]]
184+
02)--TableScan: sum_simplify_t projection=[column1]
185+
physical_plan
186+
01)AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1)), sum(sum_simplify_t.column1 + Int64(2)) FILTER (WHERE sum_simplify_t.column1 > Int64(2))]
187+
02)--DataSourceExec: partitions=1, partition_sizes=[1]
188+
189+
# FILTER clauses with the same aggregate argument
190+
query II
191+
SELECT
192+
SUM(column1 + 1) FILTER (WHERE column1 > 1) AS filtered_sum_a,
193+
SUM(column1 + 1) FILTER (WHERE column1 > 1) AS filtered_sum_b
194+
FROM sum_simplify_t;
195+
----
196+
3 3
197+
198+
query TT
199+
EXPLAIN SELECT
200+
SUM(column1 + 1) FILTER (WHERE column1 > 1) AS filtered_sum_a,
201+
SUM(column1 + 1) FILTER (WHERE column1 > 1) AS filtered_sum_b
202+
FROM sum_simplify_t;
203+
----
204+
logical_plan
205+
01)Projection: sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1)) AS filtered_sum_a, sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1)) AS filtered_sum_b
206+
02)--Aggregate: groupBy=[[]], aggr=[[sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1))]]
207+
03)----TableScan: sum_simplify_t projection=[column1]
208+
physical_plan
209+
01)ProjectionExec: expr=[sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1))@0 as filtered_sum_a, sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1))@0 as filtered_sum_b]
210+
02)--AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1))]
211+
03)----DataSourceExec: partitions=1, partition_sizes=[1]
212+
213+
# Same aggregate argument with different FILTER predicates
214+
query II
215+
SELECT SUM(column1 + 1) FILTER (WHERE column1 > 1), SUM(column1 + 1) FILTER (WHERE column1 > 0) FROM sum_simplify_t;
216+
----
217+
3 7
218+
219+
query TT
220+
EXPLAIN SELECT SUM(column1 + 1) FILTER (WHERE column1 > 1), SUM(column1 + 1) FILTER (WHERE column1 > 0) FROM sum_simplify_t;
221+
----
222+
logical_plan
223+
01)Aggregate: groupBy=[[]], aggr=[[sum(__common_expr_1 AS sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1)), sum(__common_expr_1 AS sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(0))]]
224+
02)--Projection: sum_simplify_t.column1 + Int64(1) AS __common_expr_1, sum_simplify_t.column1
225+
03)----TableScan: sum_simplify_t projection=[column1]
226+
physical_plan
227+
01)AggregateExec: mode=Single, gby=[], aggr=[sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(1)), sum(sum_simplify_t.column1 + Int64(1)) FILTER (WHERE sum_simplify_t.column1 > Int64(0))]
228+
02)--ProjectionExec: expr=[column1@0 + 1 as __common_expr_1, column1@0 as column1]
229+
03)----DataSourceExec: partitions=1, partition_sizes=[1]
230+
231+
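# Volatile aggregate arguments: the plan keeps the two SUMs as separate aggregates.
# With random() in [0, 1) and 4 rows, the first sum is below 8 and the second is
# at least 8, so the comparison is always true.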
232+
query B
233+
SELECT SUM(random() + 1) < SUM(random() + 2) FROM sum_simplify_t;
234+
----
235+
true
236+
237+
query TT
238+
EXPLAIN SELECT SUM(random() + 1) < SUM(random() + 2) FROM sum_simplify_t;
239+
----
240+
logical_plan
241+
01)Projection: sum(random() + Int64(2)) > sum(random() + Int64(1)) AS sum(random() + Int64(1)) < sum(random() + Int64(2))
242+
02)--Aggregate: groupBy=[[]], aggr=[[sum(random() + Float64(1)) AS sum(random() + Int64(1)), sum(random() + Float64(2)) AS sum(random() + Int64(2))]]
243+
03)----TableScan: sum_simplify_t projection=[]
244+
physical_plan
245+
01)ProjectionExec: expr=[sum(random() + Int64(2))@1 > sum(random() + Int64(1))@0 as sum(random() + Int64(1)) < sum(random() + Int64(2))]
246+
02)--AggregateExec: mode=Single, gby=[], aggr=[sum(random() + Int64(1)), sum(random() + Int64(2))]
247+
03)----DataSourceExec: partitions=1, partition_sizes=[1]
248+
249+
# Checks grouped aggregates with explicit ORDER BY return deterministic row order.
250+
query III
251+
SELECT column2, SUM(column1 + 1), SUM(column1 + 2) FROM sum_simplify_t GROUP BY column2 ORDER BY column2 DESC NULLS LAST;
252+
----
253+
200 2 3
254+
100 5 7
255+
NULL NULL NULL
256+
257+
query TT
258+
EXPLAIN SELECT column2, SUM(column1 + 1), SUM(column1 + 2) FROM sum_simplify_t GROUP BY column2 ORDER BY column2 DESC NULLS LAST;
259+
----
260+
logical_plan
261+
01)Sort: sum_simplify_t.column2 DESC NULLS LAST
262+
02)--Aggregate: groupBy=[[sum_simplify_t.column2]], aggr=[[sum(sum_simplify_t.column1 + Int64(1)), sum(sum_simplify_t.column1 + Int64(2))]]
263+
03)----TableScan: sum_simplify_t projection=[column1, column2]
264+
physical_plan
265+
01)SortPreservingMergeExec: [column2@0 DESC NULLS LAST]
266+
02)--SortExec: expr=[column2@0 DESC NULLS LAST], preserve_partitioning=[true]
267+
03)----AggregateExec: mode=FinalPartitioned, gby=[column2@0 as column2], aggr=[sum(sum_simplify_t.column1 + Int64(1)), sum(sum_simplify_t.column1 + Int64(2))]
268+
04)------RepartitionExec: partitioning=Hash([column2@0], 4), input_partitions=1
269+
05)--------AggregateExec: mode=Partial, gby=[column2@1 as column2], aggr=[sum(sum_simplify_t.column1 + Int64(1)), sum(sum_simplify_t.column1 + Int64(2))]
270+
06)----------DataSourceExec: partitions=1, partition_sizes=[1]
271+
272+
# Checks commutative forms of equivalent aggregate arguments are simplified consistently.
273+
query II
274+
SELECT SUM(1 + column1), SUM(column1 + 1) FROM sum_simplify_t;
275+
----
276+
7 7
277+
278+
query TT
279+
EXPLAIN SELECT SUM(1 + column1), SUM(column1 + 1) FROM sum_simplify_t;
280+
----
281+
logical_plan
282+
01)Aggregate: groupBy=[[]], aggr=[[sum(__common_expr_1 AS Int64(1) + sum_simplify_t.column1), sum(__common_expr_1 AS sum_simplify_t.column1 + Int64(1))]]
283+
02)--Projection: Int64(1) + sum_simplify_t.column1 AS __common_expr_1
284+
03)----TableScan: sum_simplify_t projection=[column1]
285+
physical_plan
286+
01)AggregateExec: mode=Single, gby=[], aggr=[sum(Int64(1) + sum_simplify_t.column1), sum(sum_simplify_t.column1 + Int64(1))]
287+
02)--ProjectionExec: expr=[1 + column1@0 as __common_expr_1]
288+
03)----DataSourceExec: partitions=1, partition_sizes=[1]
289+
290+
# Checks unsigned overflow edge case from PR discussion using transformed SUM arguments.
291+
statement ok
292+
CREATE TABLE IF NOT EXISTS tbl (val INTEGER UNSIGNED);
293+
294+
statement ok
295+
INSERT INTO tbl VALUES (4294967295);
296+
297+
statement ok
298+
INSERT INTO tbl VALUES (4294967295);
299+
300+
# Checks transformed SUM results for unsigned max values are preserved.
301+
query TII
302+
SELECT arrow_typeof(SUM(val + 1)), SUM(val + 1), SUM(val + 2) FROM tbl;
303+
----
304+
Int64 8589934592 8589934594
305+
306+
query TT
307+
EXPLAIN SELECT arrow_typeof(SUM(val + 1)), SUM(val + 1), SUM(val + 2) FROM tbl;
308+
----
309+
logical_plan
310+
01)Projection: arrow_typeof(sum(tbl.val + Int64(1))), sum(tbl.val + Int64(1)), sum(tbl.val + Int64(2))
311+
02)--Aggregate: groupBy=[[]], aggr=[[sum(__common_expr_1 AS tbl.val + Int64(1)), sum(__common_expr_1 AS tbl.val + Int64(2))]]
312+
03)----Projection: CAST(tbl.val AS Int64) AS __common_expr_1
313+
04)------TableScan: tbl projection=[val]
314+
physical_plan
315+
01)ProjectionExec: expr=[arrow_typeof(sum(tbl.val + Int64(1))@0) as arrow_typeof(sum(tbl.val + Int64(1))), sum(tbl.val + Int64(1))@0 as sum(tbl.val + Int64(1)), sum(tbl.val + Int64(2))@1 as sum(tbl.val + Int64(2))]
316+
02)--AggregateExec: mode=Single, gby=[], aggr=[sum(tbl.val + Int64(1)), sum(tbl.val + Int64(2))]
317+
03)----ProjectionExec: expr=[CAST(val@0 AS Int64) as __common_expr_1]
318+
04)------DataSourceExec: partitions=1, partition_sizes=[2]
319+
320+
# Checks equivalent rewritten form (SUM + COUNT terms) matches transformed SUM semantics.
321+
query RR
322+
SELECT SUM(val) + 1 * COUNT(val), SUM(val) + 2 * COUNT(val) FROM tbl;
323+
----
324+
8589934592 8589934594
325+
326+
query TT
327+
EXPLAIN SELECT SUM(val) + 1 * COUNT(val), SUM(val) + 2 * COUNT(val) FROM tbl;
328+
----
329+
logical_plan
330+
01)Projection: __common_expr_1 + CAST(count(tbl.val) AS Decimal128(20, 0)) AS sum(tbl.val) + Int64(1) * count(tbl.val), __common_expr_1 AS sum(tbl.val) + CAST(Int64(2) * count(tbl.val) AS Decimal128(20, 0))
331+
02)--Projection: CAST(sum(tbl.val) AS Decimal128(20, 0)) AS __common_expr_1, count(tbl.val)
332+
03)----Aggregate: groupBy=[[]], aggr=[[sum(CAST(tbl.val AS UInt64)), count(tbl.val)]]
333+
04)------TableScan: tbl projection=[val]
334+
physical_plan
335+
01)ProjectionExec: expr=[__common_expr_1@0 + CAST(count(tbl.val)@1 AS Decimal128(20, 0)) as sum(tbl.val) + Int64(1) * count(tbl.val), __common_expr_1@0 + CAST(2 * count(tbl.val)@1 AS Decimal128(20, 0)) as sum(tbl.val) + Int64(2) * count(tbl.val)]
336+
02)--ProjectionExec: expr=[CAST(sum(tbl.val)@0 AS Decimal128(20, 0)) as __common_expr_1, count(tbl.val)@1 as count(tbl.val)]
337+
03)----AggregateExec: mode=Single, gby=[], aggr=[sum(tbl.val), count(tbl.val)]
338+
04)------DataSourceExec: partitions=1, partition_sizes=[2]
339+
340+
statement ok
341+
DROP TABLE IF EXISTS tbl;
342+
343+
statement ok
344+
DROP TABLE sum_simplify_t;

0 commit comments

Comments
 (0)