mirror of
https://github.com/postgres/postgres.git
synced 2025-06-05 00:02:04 -04:00
ineq_histogram_selectivity() can be invoked in situations where the ordering we care about is not that of the column's histogram. We could be considering some other collation, or even more drastically, the query operator might not agree at all with what was used to construct the histogram. (We'll get here for anything using scalarineqsel-based estimators, so that's quite likely to happen for extension operators.) Up to now we just ignored this issue and assumed we were dealing with an operator/collation whose sort order exactly matches the histogram, possibly resulting in junk estimates if the binary search gets confused. It's past time to improve that, since the use of nondefault collations is increasing. What we can do is verify that the given operator and collation match what's recorded in pg_statistic, and use the existing code only if so. When they don't match, instead execute the operator against each histogram entry, and take the fraction of successes as our selectivity estimate. This gives an estimate that is probably good to about 1/histogram_size, with no assumptions about ordering. (The quality of the estimate is likely to degrade near the ends of the value range, since the two orderings probably don't agree on what is an extremal value; but this is surely going to be more reliable than what we did before.) At some point we might further improve matters by storing more than one histogram calculated according to different orderings. But this code would still be good fallback logic when no matches exist, so that is not an argument for not doing this. While here, also improve get_variable_range() to deal more honestly with non-default collations. 
This isn't back-patchable, because it requires adding another argument to ineq_histogram_selectivity, and because it might have significant impact on the estimation results for extension operators relying on scalarineqsel --- mostly for the better, one hopes, but in any case destabilizing plan choices in back branches is best avoided. Per investigation of a report from James Lucas. Discussion: https://postgr.es/m/CAAFmbbOvfi=wMM=3qRsPunBSLb8BFREno2oOzSBS=mzfLPKABw@mail.gmail.com
224 lines
8.7 KiB
C
224 lines
8.7 KiB
C
/*-------------------------------------------------------------------------
 *
 * selfuncs.h
 *	  Selectivity functions for standard operators, and assorted
 *	  infrastructure for selectivity and cost estimation.
 *
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/utils/selfuncs.h
 *
 *-------------------------------------------------------------------------
 */
|
|
#ifndef SELFUNCS_H
|
|
#define SELFUNCS_H
|
|
|
|
#include "access/htup.h"
|
|
#include "fmgr.h"
|
|
#include "nodes/pathnodes.h"
|
|
|
|
|
|
/*
 * Note: the default selectivity estimates are not chosen entirely at random.
 * We want them to be small enough to ensure that indexscans will be used if
 * available, for typical table densities of ~100 tuples/page.  Thus, for
 * example, 0.01 is not quite small enough, since that makes it appear that
 * nearly all pages will be hit anyway.  Also, since we sometimes estimate
 * eqsel as 1/num_distinct, we probably want DEFAULT_NUM_DISTINCT to equal
 * 1/DEFAULT_EQ_SEL.
 */

/* default selectivity estimate for equalities such as "A = b" */
#define DEFAULT_EQ_SEL	0.005

/* default selectivity estimate for inequalities such as "A < b" */
#define DEFAULT_INEQ_SEL  0.3333333333333333

/* default selectivity estimate for range inequalities "A > b AND A < c" */
#define DEFAULT_RANGE_INEQ_SEL	0.005

/* default selectivity estimate for pattern-match operators such as LIKE */
#define DEFAULT_MATCH_SEL	0.005

/* default selectivity estimate for other matching operators */
#define DEFAULT_MATCHING_SEL	0.010

/* default number of distinct values in a table */
#define DEFAULT_NUM_DISTINCT  200

/*
 * default selectivity estimate for boolean and null test nodes;
 * DEFAULT_NOT_UNK_SEL is the complementary probability.
 */
#define DEFAULT_UNK_SEL			0.005
#define DEFAULT_NOT_UNK_SEL		(1.0 - DEFAULT_UNK_SEL)
|
|
|
|
|
|
/*
 * Clamp a computed probability estimate (which may suffer from roundoff or
 * estimation errors) to valid range.  Argument must be a float variable.
 *
 * The argument is both read and assigned, and it appears more than once in
 * the expansion, so it must be a modifiable lvalue without side effects.
 * The do { ... } while (0) wrapper lets the macro be used as a single
 * statement, e.g. as the unbraced body of an if/else.
 */
#define CLAMP_PROBABILITY(p) \
	do { \
		if ((p) < 0.0) \
			(p) = 0.0; \
		else if ((p) > 1.0) \
			(p) = 1.0; \
	} while (0)
|
|
|
|
|
|
/* Return data from examine_variable and friends */
|
|
typedef struct VariableStatData
|
|
{
|
|
Node *var; /* the Var or expression tree */
|
|
RelOptInfo *rel; /* Relation, or NULL if not identifiable */
|
|
HeapTuple statsTuple; /* pg_statistic tuple, or NULL if none */
|
|
/* NB: if statsTuple!=NULL, it must be freed when caller is done */
|
|
void (*freefunc) (HeapTuple tuple); /* how to free statsTuple */
|
|
Oid vartype; /* exposed type of expression */
|
|
Oid atttype; /* actual type (after stripping relabel) */
|
|
int32 atttypmod; /* actual typmod (after stripping relabel) */
|
|
bool isunique; /* matches unique index or DISTINCT clause */
|
|
bool acl_ok; /* result of ACL check on table or column */
|
|
} VariableStatData;
|
|
|
|
/*
 * Free the statistics tuple held by a VariableStatData, if there is one.
 *
 * "vardata" is the struct itself (not a pointer to it).  When its statsTuple
 * passes HeapTupleIsValid, the tuple is handed to the struct's own freefunc;
 * otherwise nothing happens.  The argument is expanded several times, so it
 * should be free of side effects.
 */
#define ReleaseVariableStats(vardata) \
	do { \
		if (HeapTupleIsValid((vardata).statsTuple)) \
			(vardata).freefunc((vardata).statsTuple); \
	} while(0)
|
|
|
|
|
|
/*
|
|
* genericcostestimate is a general-purpose estimator that can be used for
|
|
* most index types. In some cases we use genericcostestimate as the base
|
|
* code and then incorporate additional index-type-specific knowledge in
|
|
* the type-specific calling function. To avoid code duplication, we make
|
|
* genericcostestimate return a number of intermediate values as well as
|
|
* its preliminary estimates of the output cost values. The GenericCosts
|
|
* struct includes all these values.
|
|
*
|
|
* Callers should initialize all fields of GenericCosts to zero. In addition,
|
|
* they can set numIndexTuples to some positive value if they have a better
|
|
* than default way of estimating the number of leaf index tuples visited.
|
|
*/
|
|
typedef struct
|
|
{
|
|
/* These are the values the cost estimator must return to the planner */
|
|
Cost indexStartupCost; /* index-related startup cost */
|
|
Cost indexTotalCost; /* total index-related scan cost */
|
|
Selectivity indexSelectivity; /* selectivity of index */
|
|
double indexCorrelation; /* order correlation of index */
|
|
|
|
/* Intermediate values we obtain along the way */
|
|
double numIndexPages; /* number of leaf pages visited */
|
|
double numIndexTuples; /* number of leaf tuples visited */
|
|
double spc_random_page_cost; /* relevant random_page_cost value */
|
|
double num_sa_scans; /* # indexscans from ScalarArrayOpExprs */
|
|
} GenericCosts;
|
|
|
|
/* Hooks for plugins to get control when we ask for stats */
|
|
typedef bool (*get_relation_stats_hook_type) (PlannerInfo *root,
|
|
RangeTblEntry *rte,
|
|
AttrNumber attnum,
|
|
VariableStatData *vardata);
|
|
extern PGDLLIMPORT get_relation_stats_hook_type get_relation_stats_hook;
|
|
typedef bool (*get_index_stats_hook_type) (PlannerInfo *root,
|
|
Oid indexOid,
|
|
AttrNumber indexattnum,
|
|
VariableStatData *vardata);
|
|
extern PGDLLIMPORT get_index_stats_hook_type get_index_stats_hook;
|
|
|
|
/* Functions in selfuncs.c */
|
|
|
|
extern void examine_variable(PlannerInfo *root, Node *node, int varRelid,
|
|
VariableStatData *vardata);
|
|
extern bool statistic_proc_security_check(VariableStatData *vardata, Oid func_oid);
|
|
extern bool get_restriction_variable(PlannerInfo *root, List *args,
|
|
int varRelid,
|
|
VariableStatData *vardata, Node **other,
|
|
bool *varonleft);
|
|
extern void get_join_variables(PlannerInfo *root, List *args,
|
|
SpecialJoinInfo *sjinfo,
|
|
VariableStatData *vardata1,
|
|
VariableStatData *vardata2,
|
|
bool *join_is_reversed);
|
|
extern double get_variable_numdistinct(VariableStatData *vardata,
|
|
bool *isdefault);
|
|
extern double mcv_selectivity(VariableStatData *vardata,
|
|
FmgrInfo *opproc, Oid collation,
|
|
Datum constval, bool varonleft,
|
|
double *sumcommonp);
|
|
extern double histogram_selectivity(VariableStatData *vardata,
|
|
FmgrInfo *opproc, Oid collation,
|
|
Datum constval, bool varonleft,
|
|
int min_hist_size, int n_skip,
|
|
int *hist_size);
|
|
extern double generic_restriction_selectivity(PlannerInfo *root,
|
|
Oid oproid, Oid collation,
|
|
List *args, int varRelid,
|
|
double default_selectivity);
|
|
extern double ineq_histogram_selectivity(PlannerInfo *root,
|
|
VariableStatData *vardata,
|
|
Oid opoid, FmgrInfo *opproc,
|
|
bool isgt, bool iseq,
|
|
Oid collation,
|
|
Datum constval, Oid consttype);
|
|
extern double var_eq_const(VariableStatData *vardata,
|
|
Oid oproid, Oid collation,
|
|
Datum constval, bool constisnull,
|
|
bool varonleft, bool negate);
|
|
extern double var_eq_non_const(VariableStatData *vardata,
|
|
Oid oproid, Oid collation,
|
|
Node *other,
|
|
bool varonleft, bool negate);
|
|
|
|
extern Selectivity boolvarsel(PlannerInfo *root, Node *arg, int varRelid);
|
|
extern Selectivity booltestsel(PlannerInfo *root, BoolTestType booltesttype,
|
|
Node *arg, int varRelid,
|
|
JoinType jointype, SpecialJoinInfo *sjinfo);
|
|
extern Selectivity nulltestsel(PlannerInfo *root, NullTestType nulltesttype,
|
|
Node *arg, int varRelid,
|
|
JoinType jointype, SpecialJoinInfo *sjinfo);
|
|
extern Selectivity scalararraysel(PlannerInfo *root,
|
|
ScalarArrayOpExpr *clause,
|
|
bool is_join_clause,
|
|
int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo);
|
|
extern int estimate_array_length(Node *arrayexpr);
|
|
extern Selectivity rowcomparesel(PlannerInfo *root,
|
|
RowCompareExpr *clause,
|
|
int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo);
|
|
|
|
extern void mergejoinscansel(PlannerInfo *root, Node *clause,
|
|
Oid opfamily, int strategy, bool nulls_first,
|
|
Selectivity *leftstart, Selectivity *leftend,
|
|
Selectivity *rightstart, Selectivity *rightend);
|
|
|
|
extern double estimate_num_groups(PlannerInfo *root, List *groupExprs,
								  double input_rows, List **pgset);
|
|
|
|
extern void estimate_hash_bucket_stats(PlannerInfo *root,
|
|
Node *hashkey, double nbuckets,
|
|
Selectivity *mcv_freq,
|
|
Selectivity *bucketsize_frac);
|
|
extern double estimate_hashagg_tablesize(Path *path,
|
|
const AggClauseCosts *agg_costs,
|
|
double dNumGroups);
|
|
|
|
extern List *get_quals_from_indexclauses(List *indexclauses);
|
|
extern Cost index_other_operands_eval_cost(PlannerInfo *root,
|
|
List *indexquals);
|
|
extern List *add_predicate_to_index_quals(IndexOptInfo *index,
|
|
List *indexQuals);
|
|
extern void genericcostestimate(PlannerInfo *root, IndexPath *path,
|
|
double loop_count,
|
|
GenericCosts *costs);
|
|
|
|
/* Functions in array_selfuncs.c */
|
|
|
|
extern Selectivity scalararraysel_containment(PlannerInfo *root,
|
|
Node *leftop, Node *rightop,
|
|
Oid elemtype, bool isEquality, bool useOr,
|
|
int varRelid);
|
|
|
|
#endif /* SELFUNCS_H */
|