mirror of
https://github.com/postgres/postgres.git
synced 2025-11-13 00:03:54 -05:00
When we first put in collations support, we basically punted on teaching pg_statistic, ANALYZE, and the planner selectivity functions about that. They've just used DEFAULT_COLLATION_OID independently of the actual collation of the data. It's time to improve that, so: * Add columns to pg_statistic that record the specific collation associated with each statistics slot. * Teach ANALYZE to use the column's actual collation when comparing values for statistical purposes, and record this in the appropriate slot. (Note that type-specific typanalyze functions are now expected to fill stats->stacoll with the appropriate collation, too.) * Teach assorted selectivity functions to use the actual collation of the stats they are looking at, instead of just assuming it's DEFAULT_COLLATION_OID. This should give noticeably better results in selectivity estimates for columns with nondefault collations, at least for query clauses that use that same collation (which would be the default behavior in most cases). It's still true that comparisons with explicit COLLATE clauses different from the stored data's collation won't be well-estimated, but that's no worse than before. Also, this patch does make the first step towards doing better with that, which is that it's now theoretically possible to collect stats for a collation other than the column's own collation. Patch by me; thanks to Peter Eisentraut for review. Discussion: https://postgr.es/m/14706.1544630227@sss.pgh.pa.us
517 lines
13 KiB
C
517 lines
13 KiB
C
/*-------------------------------------------------------------------------
 *
 * extended_stats.c
 *	  POSTGRES extended statistics
 *
 * Generic code supporting statistics objects created via CREATE STATISTICS.
 *
 *
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/statistics/extended_stats.c
 *
 *-------------------------------------------------------------------------
 */
|
|
#include "postgres.h"
|
|
|
|
#include "access/genam.h"
|
|
#include "access/heapam.h"
|
|
#include "access/htup_details.h"
|
|
#include "catalog/indexing.h"
|
|
#include "catalog/pg_collation.h"
|
|
#include "catalog/pg_statistic_ext.h"
|
|
#include "nodes/relation.h"
|
|
#include "postmaster/autovacuum.h"
|
|
#include "statistics/extended_stats_internal.h"
|
|
#include "statistics/statistics.h"
|
|
#include "utils/builtins.h"
|
|
#include "utils/fmgroids.h"
|
|
#include "utils/lsyscache.h"
|
|
#include "utils/memutils.h"
|
|
#include "utils/rel.h"
|
|
#include "utils/syscache.h"
|
|
|
|
|
|
/*
|
|
* Used internally to refer to an individual statistics object, i.e.,
|
|
* a pg_statistic_ext entry.
|
|
*/
|
|
typedef struct StatExtEntry
|
|
{
|
|
Oid statOid; /* OID of pg_statistic_ext entry */
|
|
char *schema; /* statistics object's schema */
|
|
char *name; /* statistics object's name */
|
|
Bitmapset *columns; /* attribute numbers covered by the object */
|
|
List *types; /* 'char' list of enabled statistic kinds */
|
|
} StatExtEntry;
|
|
|
|
|
|
static List *fetch_statentries_for_relation(Relation pg_statext, Oid relid);
|
|
static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
|
|
int nvacatts, VacAttrStats **vacatts);
|
|
static void statext_store(Relation pg_stext, Oid relid,
|
|
MVNDistinct *ndistinct, MVDependencies *dependencies,
|
|
VacAttrStats **stats);
|
|
|
|
|
|
/*
|
|
* Compute requested extended stats, using the rows sampled for the plain
|
|
* (single-column) stats.
|
|
*
|
|
* This fetches a list of stats types from pg_statistic_ext, computes the
|
|
* requested stats, and serializes them back into the catalog.
|
|
*/
|
|
void
|
|
BuildRelationExtStatistics(Relation onerel, double totalrows,
|
|
int numrows, HeapTuple *rows,
|
|
int natts, VacAttrStats **vacattrstats)
|
|
{
|
|
Relation pg_stext;
|
|
ListCell *lc;
|
|
List *stats;
|
|
MemoryContext cxt;
|
|
MemoryContext oldcxt;
|
|
|
|
cxt = AllocSetContextCreate(CurrentMemoryContext,
|
|
"BuildRelationExtStatistics",
|
|
ALLOCSET_DEFAULT_SIZES);
|
|
oldcxt = MemoryContextSwitchTo(cxt);
|
|
|
|
pg_stext = heap_open(StatisticExtRelationId, RowExclusiveLock);
|
|
stats = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel));
|
|
|
|
foreach(lc, stats)
|
|
{
|
|
StatExtEntry *stat = (StatExtEntry *) lfirst(lc);
|
|
MVNDistinct *ndistinct = NULL;
|
|
MVDependencies *dependencies = NULL;
|
|
VacAttrStats **stats;
|
|
ListCell *lc2;
|
|
|
|
/*
|
|
* Check if we can build these stats based on the column analyzed. If
|
|
* not, report this fact (except in autovacuum) and move on.
|
|
*/
|
|
stats = lookup_var_attr_stats(onerel, stat->columns,
|
|
natts, vacattrstats);
|
|
if (!stats)
|
|
{
|
|
if (!IsAutoVacuumWorkerProcess())
|
|
ereport(WARNING,
|
|
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
|
|
errmsg("statistics object \"%s.%s\" could not be computed for relation \"%s.%s\"",
|
|
stat->schema, stat->name,
|
|
get_namespace_name(onerel->rd_rel->relnamespace),
|
|
RelationGetRelationName(onerel)),
|
|
errtable(onerel)));
|
|
continue;
|
|
}
|
|
|
|
/* check allowed number of dimensions */
|
|
Assert(bms_num_members(stat->columns) >= 2 &&
|
|
bms_num_members(stat->columns) <= STATS_MAX_DIMENSIONS);
|
|
|
|
/* compute statistic of each requested type */
|
|
foreach(lc2, stat->types)
|
|
{
|
|
char t = (char) lfirst_int(lc2);
|
|
|
|
if (t == STATS_EXT_NDISTINCT)
|
|
ndistinct = statext_ndistinct_build(totalrows, numrows, rows,
|
|
stat->columns, stats);
|
|
else if (t == STATS_EXT_DEPENDENCIES)
|
|
dependencies = statext_dependencies_build(numrows, rows,
|
|
stat->columns, stats);
|
|
}
|
|
|
|
/* store the statistics in the catalog */
|
|
statext_store(pg_stext, stat->statOid, ndistinct, dependencies, stats);
|
|
}
|
|
|
|
heap_close(pg_stext, RowExclusiveLock);
|
|
|
|
MemoryContextSwitchTo(oldcxt);
|
|
MemoryContextDelete(cxt);
|
|
}
|
|
|
|
/*
|
|
* statext_is_kind_built
|
|
* Is this stat kind built in the given pg_statistic_ext tuple?
|
|
*/
|
|
bool
|
|
statext_is_kind_built(HeapTuple htup, char type)
|
|
{
|
|
AttrNumber attnum;
|
|
|
|
switch (type)
|
|
{
|
|
case STATS_EXT_NDISTINCT:
|
|
attnum = Anum_pg_statistic_ext_stxndistinct;
|
|
break;
|
|
|
|
case STATS_EXT_DEPENDENCIES:
|
|
attnum = Anum_pg_statistic_ext_stxdependencies;
|
|
break;
|
|
|
|
default:
|
|
elog(ERROR, "unexpected statistics type requested: %d", type);
|
|
}
|
|
|
|
return !heap_attisnull(htup, attnum, NULL);
|
|
}
|
|
|
|
/*
|
|
* Return a list (of StatExtEntry) of statistics objects for the given relation.
|
|
*/
|
|
static List *
|
|
fetch_statentries_for_relation(Relation pg_statext, Oid relid)
|
|
{
|
|
SysScanDesc scan;
|
|
ScanKeyData skey;
|
|
HeapTuple htup;
|
|
List *result = NIL;
|
|
|
|
/*
|
|
* Prepare to scan pg_statistic_ext for entries having stxrelid = this
|
|
* rel.
|
|
*/
|
|
ScanKeyInit(&skey,
|
|
Anum_pg_statistic_ext_stxrelid,
|
|
BTEqualStrategyNumber, F_OIDEQ,
|
|
ObjectIdGetDatum(relid));
|
|
|
|
scan = systable_beginscan(pg_statext, StatisticExtRelidIndexId, true,
|
|
NULL, 1, &skey);
|
|
|
|
while (HeapTupleIsValid(htup = systable_getnext(scan)))
|
|
{
|
|
StatExtEntry *entry;
|
|
Datum datum;
|
|
bool isnull;
|
|
int i;
|
|
ArrayType *arr;
|
|
char *enabled;
|
|
Form_pg_statistic_ext staForm;
|
|
|
|
entry = palloc0(sizeof(StatExtEntry));
|
|
staForm = (Form_pg_statistic_ext) GETSTRUCT(htup);
|
|
entry->statOid = staForm->oid;
|
|
entry->schema = get_namespace_name(staForm->stxnamespace);
|
|
entry->name = pstrdup(NameStr(staForm->stxname));
|
|
for (i = 0; i < staForm->stxkeys.dim1; i++)
|
|
{
|
|
entry->columns = bms_add_member(entry->columns,
|
|
staForm->stxkeys.values[i]);
|
|
}
|
|
|
|
/* decode the stxkind char array into a list of chars */
|
|
datum = SysCacheGetAttr(STATEXTOID, htup,
|
|
Anum_pg_statistic_ext_stxkind, &isnull);
|
|
Assert(!isnull);
|
|
arr = DatumGetArrayTypeP(datum);
|
|
if (ARR_NDIM(arr) != 1 ||
|
|
ARR_HASNULL(arr) ||
|
|
ARR_ELEMTYPE(arr) != CHAROID)
|
|
elog(ERROR, "stxkind is not a 1-D char array");
|
|
enabled = (char *) ARR_DATA_PTR(arr);
|
|
for (i = 0; i < ARR_DIMS(arr)[0]; i++)
|
|
{
|
|
Assert((enabled[i] == STATS_EXT_NDISTINCT) ||
|
|
(enabled[i] == STATS_EXT_DEPENDENCIES));
|
|
entry->types = lappend_int(entry->types, (int) enabled[i]);
|
|
}
|
|
|
|
result = lappend(result, entry);
|
|
}
|
|
|
|
systable_endscan(scan);
|
|
|
|
return result;
|
|
}
|
|
|
|
/*
|
|
* Using 'vacatts' of size 'nvacatts' as input data, return a newly built
|
|
* VacAttrStats array which includes only the items corresponding to
|
|
* attributes indicated by 'stxkeys'. If we don't have all of the per column
|
|
* stats available to compute the extended stats, then we return NULL to indicate
|
|
* to the caller that the stats should not be built.
|
|
*/
|
|
static VacAttrStats **
|
|
lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
|
|
int nvacatts, VacAttrStats **vacatts)
|
|
{
|
|
int i = 0;
|
|
int x = -1;
|
|
VacAttrStats **stats;
|
|
|
|
stats = (VacAttrStats **)
|
|
palloc(bms_num_members(attrs) * sizeof(VacAttrStats *));
|
|
|
|
/* lookup VacAttrStats info for the requested columns (same attnum) */
|
|
while ((x = bms_next_member(attrs, x)) >= 0)
|
|
{
|
|
int j;
|
|
|
|
stats[i] = NULL;
|
|
for (j = 0; j < nvacatts; j++)
|
|
{
|
|
if (x == vacatts[j]->tupattnum)
|
|
{
|
|
stats[i] = vacatts[j];
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!stats[i])
|
|
{
|
|
/*
|
|
* Looks like stats were not gathered for one of the columns
|
|
* required. We'll be unable to build the extended stats without
|
|
* this column.
|
|
*/
|
|
pfree(stats);
|
|
return NULL;
|
|
}
|
|
|
|
/*
|
|
* Sanity check that the column is not dropped - stats should have
|
|
* been removed in this case.
|
|
*/
|
|
Assert(!stats[i]->attr->attisdropped);
|
|
|
|
i++;
|
|
}
|
|
|
|
return stats;
|
|
}
|
|
|
|
/*
|
|
* statext_store
|
|
* Serializes the statistics and stores them into the pg_statistic_ext tuple.
|
|
*/
|
|
static void
|
|
statext_store(Relation pg_stext, Oid statOid,
|
|
MVNDistinct *ndistinct, MVDependencies *dependencies,
|
|
VacAttrStats **stats)
|
|
{
|
|
HeapTuple stup,
|
|
oldtup;
|
|
Datum values[Natts_pg_statistic_ext];
|
|
bool nulls[Natts_pg_statistic_ext];
|
|
bool replaces[Natts_pg_statistic_ext];
|
|
|
|
memset(nulls, true, sizeof(nulls));
|
|
memset(replaces, false, sizeof(replaces));
|
|
memset(values, 0, sizeof(values));
|
|
|
|
/*
|
|
* Construct a new pg_statistic_ext tuple, replacing the calculated stats.
|
|
*/
|
|
if (ndistinct != NULL)
|
|
{
|
|
bytea *data = statext_ndistinct_serialize(ndistinct);
|
|
|
|
nulls[Anum_pg_statistic_ext_stxndistinct - 1] = (data == NULL);
|
|
values[Anum_pg_statistic_ext_stxndistinct - 1] = PointerGetDatum(data);
|
|
}
|
|
|
|
if (dependencies != NULL)
|
|
{
|
|
bytea *data = statext_dependencies_serialize(dependencies);
|
|
|
|
nulls[Anum_pg_statistic_ext_stxdependencies - 1] = (data == NULL);
|
|
values[Anum_pg_statistic_ext_stxdependencies - 1] = PointerGetDatum(data);
|
|
}
|
|
|
|
/* always replace the value (either by bytea or NULL) */
|
|
replaces[Anum_pg_statistic_ext_stxndistinct - 1] = true;
|
|
replaces[Anum_pg_statistic_ext_stxdependencies - 1] = true;
|
|
|
|
/* there should already be a pg_statistic_ext tuple */
|
|
oldtup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid));
|
|
if (!HeapTupleIsValid(oldtup))
|
|
elog(ERROR, "cache lookup failed for statistics object %u", statOid);
|
|
|
|
/* replace it */
|
|
stup = heap_modify_tuple(oldtup,
|
|
RelationGetDescr(pg_stext),
|
|
values,
|
|
nulls,
|
|
replaces);
|
|
ReleaseSysCache(oldtup);
|
|
CatalogTupleUpdate(pg_stext, &stup->t_self, stup);
|
|
|
|
heap_freetuple(stup);
|
|
}
|
|
|
|
/* initialize multi-dimensional sort */
|
|
MultiSortSupport
|
|
multi_sort_init(int ndims)
|
|
{
|
|
MultiSortSupport mss;
|
|
|
|
Assert(ndims >= 2);
|
|
|
|
mss = (MultiSortSupport) palloc0(offsetof(MultiSortSupportData, ssup)
|
|
+ sizeof(SortSupportData) * ndims);
|
|
|
|
mss->ndims = ndims;
|
|
|
|
return mss;
|
|
}
|
|
|
|
/*
|
|
* Prepare sort support info using the given sort operator and collation
|
|
* at the position 'sortdim'
|
|
*/
|
|
void
|
|
multi_sort_add_dimension(MultiSortSupport mss, int sortdim,
|
|
Oid oper, Oid collation)
|
|
{
|
|
SortSupport ssup = &mss->ssup[sortdim];
|
|
|
|
ssup->ssup_cxt = CurrentMemoryContext;
|
|
ssup->ssup_collation = collation;
|
|
ssup->ssup_nulls_first = false;
|
|
|
|
PrepareSortSupportFromOrderingOp(oper, ssup);
|
|
}
|
|
|
|
/* compare all the dimensions in the selected order */
|
|
int
|
|
multi_sort_compare(const void *a, const void *b, void *arg)
|
|
{
|
|
MultiSortSupport mss = (MultiSortSupport) arg;
|
|
SortItem *ia = (SortItem *) a;
|
|
SortItem *ib = (SortItem *) b;
|
|
int i;
|
|
|
|
for (i = 0; i < mss->ndims; i++)
|
|
{
|
|
int compare;
|
|
|
|
compare = ApplySortComparator(ia->values[i], ia->isnull[i],
|
|
ib->values[i], ib->isnull[i],
|
|
&mss->ssup[i]);
|
|
|
|
if (compare != 0)
|
|
return compare;
|
|
}
|
|
|
|
/* equal by default */
|
|
return 0;
|
|
}
|
|
|
|
/* compare selected dimension */
|
|
int
|
|
multi_sort_compare_dim(int dim, const SortItem *a, const SortItem *b,
|
|
MultiSortSupport mss)
|
|
{
|
|
return ApplySortComparator(a->values[dim], a->isnull[dim],
|
|
b->values[dim], b->isnull[dim],
|
|
&mss->ssup[dim]);
|
|
}
|
|
|
|
int
|
|
multi_sort_compare_dims(int start, int end,
|
|
const SortItem *a, const SortItem *b,
|
|
MultiSortSupport mss)
|
|
{
|
|
int dim;
|
|
|
|
for (dim = start; dim <= end; dim++)
|
|
{
|
|
int r = ApplySortComparator(a->values[dim], a->isnull[dim],
|
|
b->values[dim], b->isnull[dim],
|
|
&mss->ssup[dim]);
|
|
|
|
if (r != 0)
|
|
return r;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* has_stats_of_kind
|
|
* Check whether the list contains statistic of a given kind
|
|
*/
|
|
bool
|
|
has_stats_of_kind(List *stats, char requiredkind)
|
|
{
|
|
ListCell *l;
|
|
|
|
foreach(l, stats)
|
|
{
|
|
StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l);
|
|
|
|
if (stat->kind == requiredkind)
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* choose_best_statistics
|
|
* Look for and return statistics with the specified 'requiredkind' which
|
|
* have keys that match at least two of the given attnums. Return NULL if
|
|
* there's no match.
|
|
*
|
|
* The current selection criteria is very simple - we choose the statistics
|
|
* object referencing the most of the requested attributes, breaking ties
|
|
* in favor of objects with fewer keys overall.
|
|
*
|
|
* XXX if multiple statistics objects tie on both criteria, then which object
|
|
* is chosen depends on the order that they appear in the stats list. Perhaps
|
|
* further tiebreakers are needed.
|
|
*/
|
|
StatisticExtInfo *
|
|
choose_best_statistics(List *stats, Bitmapset *attnums, char requiredkind)
|
|
{
|
|
ListCell *lc;
|
|
StatisticExtInfo *best_match = NULL;
|
|
int best_num_matched = 2; /* goal #1: maximize */
|
|
int best_match_keys = (STATS_MAX_DIMENSIONS + 1); /* goal #2: minimize */
|
|
|
|
foreach(lc, stats)
|
|
{
|
|
StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
|
|
int num_matched;
|
|
int numkeys;
|
|
Bitmapset *matched;
|
|
|
|
/* skip statistics that are not of the correct type */
|
|
if (info->kind != requiredkind)
|
|
continue;
|
|
|
|
/* determine how many attributes of these stats can be matched to */
|
|
matched = bms_intersect(attnums, info->keys);
|
|
num_matched = bms_num_members(matched);
|
|
bms_free(matched);
|
|
|
|
/*
|
|
* save the actual number of keys in the stats so that we can choose
|
|
* the narrowest stats with the most matching keys.
|
|
*/
|
|
numkeys = bms_num_members(info->keys);
|
|
|
|
/*
|
|
* Use this object when it increases the number of matched clauses or
|
|
* when it matches the same number of attributes but these stats have
|
|
* fewer keys than any previous match.
|
|
*/
|
|
if (num_matched > best_num_matched ||
|
|
(num_matched == best_num_matched && numkeys < best_match_keys))
|
|
{
|
|
best_match = info;
|
|
best_num_matched = num_matched;
|
|
best_match_keys = numkeys;
|
|
}
|
|
}
|
|
|
|
return best_match;
|
|
}
|