PostgreSQL/src/test/regress/sql/incremental_sort.sql
David Rowley 4a29eabd1d Remove pessimistic cost penalization from Incremental Sort
When incremental sorts were added in v13 a 1.5x pessimism factor was added
to the cost modal.  Seemingly this was done because the cost modal only
has an estimate of the total number of input rows and the number of
presorted groups.  It assumes that the input rows will be evenly
distributed throughout the presorted groups.  The 1.5x pessimism factor
was added to slightly reduce the likelihood of incremental sorts being
used in the hope to avoid performance regressions where an incremental
sort plan was picked and turned out slower due to a large skew in the
number of rows in the presorted groups.

An additional quirk with the path generation code meant that we could
consider both a sort and an incremental sort on paths with presorted keys.
This meant that with the pessimism factor, it was possible that we opted
to perform a sort rather than an incremental sort when the given path had
presorted keys.

Here we remove the 1.5x pessimism factor to allow incremental sorts to
have a fairer chance at being chosen against a full sort.

Previously we would generally create a sort path on the cheapest input
path (if that wasn't sorted already) and incremental sort paths on any
path which had presorted keys.  This meant that if the cheapest input path
wasn't completely sorted but happened to have presorted keys, we would
create a full sort path *and* an incremental sort path on that input path.
Here we change this logic so that if there are presorted keys, we only
create an incremental sort path, and create sort paths only when a full
sort is required.

Both the removal of the cost pessimism factor and the changes made to the
path generation make it more likely that incremental sorts will now be
chosen.  That, of course, as with teaching the planner any new tricks,
means an increased likelihood that the planner will perform an incremental
sort when it's not the best method.  Our standard escape hatch for these
cases is an enable_* GUC.  enable_incremental_sort already exists for
this.

This came out of a report by Pavel Luzanov where he mentioned that the
master branch was choosing to perform a Seq Scan -> Sort -> Group
Aggregate for his query with an ORDER BY aggregate function.  The v15 plan
for his query performed an Index Scan -> Group Aggregate, of course, the
aggregate performed the final sort internally in nodeAgg.c for the
aggregate's ORDER BY.  The ideal plan would have been to use the index,
which provided partially sorted input then use an incremental sort to
provide the aggregate with the sorted input.  This was not being chosen
due to the pessimism in the incremental sort cost modal, so here we remove
that and rationalize the path generation so that sort and incremental sort
plans don't have to needlessly compete.  We assume that it's senseless
to ever use a full sort on a given input path where an incremental sort
can be performed.

Reported-by: Pavel Luzanov
Reviewed-by: Richard Guo
Discussion: https://postgr.es/m/9f61ddbf-2989-1536-b31e-6459370a6baa%40postgrespro.ru
2022-12-16 15:22:23 +13:00

279 lines
12 KiB
PL/PgSQL

-- When there is a LIMIT clause, incremental sort is beneficial because
-- it only has to sort some of the groups, and not the entire table.
explain (costs off)
select * from (select * from tenk1 order by four) t order by four, ten
limit 1;
-- When work_mem is not enough to sort the entire table, incremental sort
-- may be faster if individual groups still fit into work_mem.
set work_mem to '2MB';
explain (costs off)
select * from (select * from tenk1 order by four) t order by four, ten;
reset work_mem;
create table t(a integer, b integer);
create or replace function explain_analyze_without_memory(query text)
returns table (out_line text) language plpgsql
as
$$
declare
line text;
begin
for line in
execute 'explain (analyze, costs off, summary off, timing off) ' || query
loop
out_line := regexp_replace(line, '\d+kB', 'NNkB', 'g');
return next;
end loop;
end;
$$;
create or replace function explain_analyze_inc_sort_nodes(query text)
returns jsonb language plpgsql
as
$$
declare
elements jsonb;
element jsonb;
matching_nodes jsonb := '[]'::jsonb;
begin
execute 'explain (analyze, costs off, summary off, timing off, format ''json'') ' || query into strict elements;
while jsonb_array_length(elements) > 0 loop
element := elements->0;
elements := elements - 0;
case jsonb_typeof(element)
when 'array' then
if jsonb_array_length(element) > 0 then
elements := elements || element;
end if;
when 'object' then
if element ? 'Plan' then
elements := elements || jsonb_build_array(element->'Plan');
element := element - 'Plan';
else
if element ? 'Plans' then
elements := elements || jsonb_build_array(element->'Plans');
element := element - 'Plans';
end if;
if (element->>'Node Type')::text = 'Incremental Sort' then
matching_nodes := matching_nodes || element;
end if;
end if;
end case;
end loop;
return matching_nodes;
end;
$$;
create or replace function explain_analyze_inc_sort_nodes_without_memory(query text)
returns jsonb language plpgsql
as
$$
declare
nodes jsonb := '[]'::jsonb;
node jsonb;
group_key text;
space_key text;
begin
for node in select * from jsonb_array_elements(explain_analyze_inc_sort_nodes(query)) t loop
for group_key in select unnest(array['Full-sort Groups', 'Pre-sorted Groups']::text[]) t loop
for space_key in select unnest(array['Sort Space Memory', 'Sort Space Disk']::text[]) t loop
node := jsonb_set(node, array[group_key, space_key, 'Average Sort Space Used'], '"NN"', false);
node := jsonb_set(node, array[group_key, space_key, 'Peak Sort Space Used'], '"NN"', false);
end loop;
end loop;
nodes := nodes || node;
end loop;
return nodes;
end;
$$;
create or replace function explain_analyze_inc_sort_nodes_verify_invariants(query text)
returns bool language plpgsql
as
$$
declare
node jsonb;
group_stats jsonb;
group_key text;
space_key text;
begin
for node in select * from jsonb_array_elements(explain_analyze_inc_sort_nodes(query)) t loop
for group_key in select unnest(array['Full-sort Groups', 'Pre-sorted Groups']::text[]) t loop
group_stats := node->group_key;
for space_key in select unnest(array['Sort Space Memory', 'Sort Space Disk']::text[]) t loop
if (group_stats->space_key->'Peak Sort Space Used')::bigint < (group_stats->space_key->'Peak Sort Space Used')::bigint then
raise exception '% has invalid max space < average space', group_key;
end if;
end loop;
end loop;
end loop;
return true;
end;
$$;
-- A single large group tested around each mode transition point.
insert into t(a, b) select i/100 + 1, i + 1 from generate_series(0, 999) n(i);
analyze t;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 31;
select * from (select * from t order by a) s order by a, b limit 31;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 32;
select * from (select * from t order by a) s order by a, b limit 32;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 33;
select * from (select * from t order by a) s order by a, b limit 33;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 65;
select * from (select * from t order by a) s order by a, b limit 65;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 66;
select * from (select * from t order by a) s order by a, b limit 66;
delete from t;
-- An initial large group followed by a small group.
insert into t(a, b) select i/50 + 1, i + 1 from generate_series(0, 999) n(i);
analyze t;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 55;
select * from (select * from t order by a) s order by a, b limit 55;
-- Test EXPLAIN ANALYZE with only a fullsort group.
select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 55');
select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 55'));
select explain_analyze_inc_sort_nodes_verify_invariants('select * from (select * from t order by a) s order by a, b limit 55');
delete from t;
-- An initial small group followed by a large group.
insert into t(a, b) select (case when i < 5 then i else 9 end), i from generate_series(1, 1000) n(i);
analyze t;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 70;
select * from (select * from t order by a) s order by a, b limit 70;
-- Checks case where we hit a group boundary at the last tuple of a batch.
-- Because the full sort state is bounded, we scan 64 tuples (the mode
-- transition point) but only retain 5. Thus when we transition modes, all
-- tuples in the full sort state have different prefix keys.
explain (costs off) select * from (select * from t order by a) s order by a, b limit 5;
select * from (select * from t order by a) s order by a, b limit 5;
-- Test rescan.
begin;
-- We force the planner to choose a plan with incremental sort on the right side
-- of a nested loop join node. That way we trigger the rescan code path.
set local enable_hashjoin = off;
set local enable_mergejoin = off;
set local enable_material = off;
set local enable_sort = off;
explain (costs off) select * from t left join (select * from (select * from t order by a) v order by a, b) s on s.a = t.a where t.a in (1, 2);
select * from t left join (select * from (select * from t order by a) v order by a, b) s on s.a = t.a where t.a in (1, 2);
rollback;
-- Test EXPLAIN ANALYZE with both fullsort and presorted groups.
select explain_analyze_without_memory('select * from (select * from t order by a) s order by a, b limit 70');
select jsonb_pretty(explain_analyze_inc_sort_nodes_without_memory('select * from (select * from t order by a) s order by a, b limit 70'));
select explain_analyze_inc_sort_nodes_verify_invariants('select * from (select * from t order by a) s order by a, b limit 70');
delete from t;
-- Small groups of 10 tuples each tested around each mode transition point.
insert into t(a, b) select i / 10, i from generate_series(1, 1000) n(i);
analyze t;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 31;
select * from (select * from t order by a) s order by a, b limit 31;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 32;
select * from (select * from t order by a) s order by a, b limit 32;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 33;
select * from (select * from t order by a) s order by a, b limit 33;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 65;
select * from (select * from t order by a) s order by a, b limit 65;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 66;
select * from (select * from t order by a) s order by a, b limit 66;
delete from t;
-- Small groups of only 1 tuple each tested around each mode transition point.
insert into t(a, b) select i, i from generate_series(1, 1000) n(i);
analyze t;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 31;
select * from (select * from t order by a) s order by a, b limit 31;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 32;
select * from (select * from t order by a) s order by a, b limit 32;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 33;
select * from (select * from t order by a) s order by a, b limit 33;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 65;
select * from (select * from t order by a) s order by a, b limit 65;
explain (costs off) select * from (select * from t order by a) s order by a, b limit 66;
select * from (select * from t order by a) s order by a, b limit 66;
delete from t;
drop table t;
-- Incremental sort vs. parallel queries
set min_parallel_table_scan_size = '1kB';
set min_parallel_index_scan_size = '1kB';
set parallel_setup_cost = 0;
set parallel_tuple_cost = 0;
set max_parallel_workers_per_gather = 2;
create table t (a int, b int, c int);
insert into t select mod(i,10),mod(i,10),i from generate_series(1,10000) s(i);
create index on t (a);
analyze t;
set enable_incremental_sort = off;
explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1;
set enable_incremental_sort = on;
explain (costs off) select a,b,sum(c) from t group by 1,2 order by 1,2,3 limit 1;
-- Incremental sort vs. set operations with varno 0
set enable_hashagg to off;
explain (costs off) select * from t union select * from t order by 1,3;
-- Full sort, not just incremental sort can be pushed below a gather merge path
-- by generate_useful_gather_paths.
explain (costs off) select distinct a,b from t;
drop table t;
-- Sort pushdown can't go below where expressions are part of the rel target.
-- In particular this is interesting for volatile expressions which have to
-- go above joins since otherwise we'll incorrectly use expression evaluations
-- across multiple rows.
set enable_hashagg=off;
set enable_seqscan=off;
set enable_incremental_sort = off;
set parallel_tuple_cost=0;
set parallel_setup_cost=0;
set min_parallel_table_scan_size = 0;
set min_parallel_index_scan_size = 0;
-- Parallel sort below join.
explain (costs off) select distinct sub.unique1, stringu1
from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub;
explain (costs off) select sub.unique1, stringu1
from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub
order by 1, 2;
-- Parallel sort but with expression that can be safely generated at the base rel.
explain (costs off) select distinct sub.unique1, md5(stringu1)
from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub;
explain (costs off) select sub.unique1, md5(stringu1)
from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub
order by 1, 2;
-- Parallel sort with an aggregate that can be safely generated in parallel,
-- but we can't sort by partial aggregate values.
explain (costs off) select count(*)
from tenk1 t1
join tenk1 t2 on t1.unique1 = t2.unique2
join tenk1 t3 on t2.unique1 = t3.unique1
order by count(*);
-- Parallel sort but with expression (correlated subquery) that
-- is prohibited in parallel plans.
explain (costs off) select distinct
unique1,
(select t.unique1 from tenk1 where tenk1.unique1 = t.unique1)
from tenk1 t, generate_series(1, 1000);
explain (costs off) select
unique1,
(select t.unique1 from tenk1 where tenk1.unique1 = t.unique1)
from tenk1 t, generate_series(1, 1000)
order by 1, 2;
-- Parallel sort but with expression not available until the upper rel.
explain (costs off) select distinct sub.unique1, stringu1 || random()::text
from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub;
explain (costs off) select sub.unique1, stringu1 || random()::text
from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub
order by 1, 2;