mirror of
https://github.com/postgres/postgres.git
synced 2025-05-18 00:02:16 -04:00
Edit SGML documentation related to extended statistics.
Use the "statistics object" terminology uniformly here too. Assorted copy-editing. Put new catalogs.sgml sections into alphabetical order.
This commit is contained in:
parent
e84c019598
commit
93ece9cc88
@ -221,13 +221,13 @@
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><link linkend="catalog-pg-pltemplate"><structname>pg_pltemplate</structname></link></entry>
|
||||
<entry>template data for procedural languages</entry>
|
||||
<entry><link linkend="catalog-pg-partitioned-table"><structname>pg_partitioned_table</structname></link></entry>
|
||||
<entry>information about partition key of tables</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><link linkend="catalog-pg-partitioned-table"><structname>pg_partitioned_table</structname></link></entry>
|
||||
<entry>information about partition key of tables</entry>
|
||||
<entry><link linkend="catalog-pg-pltemplate"><structname>pg_pltemplate</structname></link></entry>
|
||||
<entry>template data for procedural languages</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
@ -4271,108 +4271,6 @@ SCRAM-SHA-256$<replaceable><iteration count></>:<replaceable><salt><
|
||||
</table>
|
||||
</sect1>
|
||||
|
||||
<sect1 id="catalog-pg-statistic-ext">
|
||||
<title><structname>pg_statistic_ext</structname></title>
|
||||
|
||||
<indexterm zone="catalog-pg-statistic-ext">
|
||||
<primary>pg_statistic_ext</primary>
|
||||
</indexterm>
|
||||
|
||||
<para>
|
||||
The catalog <structname>pg_statistic_ext</structname>
|
||||
holds extended planner statistics.
|
||||
</para>
|
||||
|
||||
<table>
|
||||
<title><structname>pg_statistic_ext</> Columns</title>
|
||||
|
||||
<tgroup cols="4">
|
||||
<thead>
|
||||
<row>
|
||||
<entry>Name</entry>
|
||||
<entry>Type</entry>
|
||||
<entry>References</entry>
|
||||
<entry>Description</entry>
|
||||
</row>
|
||||
</thead>
|
||||
|
||||
<tbody>
|
||||
|
||||
<row>
|
||||
<entry><structfield>stxrelid</structfield></entry>
|
||||
<entry><type>oid</type></entry>
|
||||
<entry><literal><link linkend="catalog-pg-class"><structname>pg_class</structname></link>.oid</literal></entry>
|
||||
<entry>The table that the described columns belongs to</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>stxname</structfield></entry>
|
||||
<entry><type>name</type></entry>
|
||||
<entry></entry>
|
||||
<entry>Name of the statistic.</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>stxnamespace</structfield></entry>
|
||||
<entry><type>oid</type></entry>
|
||||
<entry><literal><link linkend="catalog-pg-namespace"><structname>pg_namespace</structname></link>.oid</literal></entry>
|
||||
<entry>
|
||||
The OID of the namespace that contains this statistic
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>stxowner</structfield></entry>
|
||||
<entry><type>oid</type></entry>
|
||||
<entry><literal><link linkend="catalog-pg-authid"><structname>pg_authid</structname></link>.oid</literal></entry>
|
||||
<entry>Owner of the statistic</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>stxkeys</structfield></entry>
|
||||
<entry><type>int2vector</type></entry>
|
||||
<entry><literal><link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.attnum</literal></entry>
|
||||
<entry>
|
||||
This is an array of values that indicate which table columns this
|
||||
statistic covers. For example a value of <literal>1 3</literal> would
|
||||
mean that the first and the third table columns make up the statistic key.
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>stxkind</structfield></entry>
|
||||
<entry><type>char[]</type></entry>
|
||||
<entry></entry>
|
||||
<entry>
|
||||
An array with the modes of the enabled statistic types. Valid values
|
||||
are:
|
||||
<literal>d</literal> for ndistinct coefficients,
|
||||
<literal>f</literal> for functional dependencies.
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>stxndistinct</structfield></entry>
|
||||
<entry><type>pg_ndistinct</type></entry>
|
||||
<entry></entry>
|
||||
<entry>
|
||||
N-distinct coefficients, serialized as <structname>pg_ndistinct</> type.
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>stxdependencies</structfield></entry>
|
||||
<entry><type>pg_dependencies</type></entry>
|
||||
<entry></entry>
|
||||
<entry>
|
||||
Functional dependencies, serialized as <structname>pg_dependencies</> type.
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
</tbody>
|
||||
</tgroup>
|
||||
</table>
|
||||
</sect1>
|
||||
|
||||
<sect1 id="catalog-pg-namespace">
|
||||
<title><structname>pg_namespace</structname></title>
|
||||
@ -4790,6 +4688,111 @@ SCRAM-SHA-256$<replaceable><iteration count></>:<replaceable><salt><
|
||||
</sect1>
|
||||
|
||||
|
||||
<sect1 id="catalog-pg-partitioned-table">
|
||||
<title><structname>pg_partitioned_table</structname></title>
|
||||
|
||||
<indexterm zone="catalog-pg-partitioned-table">
|
||||
<primary>pg_partitioned_table</primary>
|
||||
</indexterm>
|
||||
|
||||
<para>
|
||||
The catalog <structname>pg_partitioned_table</structname> stores
|
||||
information about how tables are partitioned.
|
||||
</para>
|
||||
|
||||
<table>
|
||||
<title><structname>pg_partitioned_table</> Columns</title>
|
||||
|
||||
<tgroup cols="4">
|
||||
<thead>
|
||||
<row>
|
||||
<entry>Name</entry>
|
||||
<entry>Type</entry>
|
||||
<entry>References</entry>
|
||||
<entry>Description</entry>
|
||||
</row>
|
||||
</thead>
|
||||
|
||||
<tbody>
|
||||
|
||||
<row>
|
||||
<entry><structfield>partrelid</structfield></entry>
|
||||
<entry><type>oid</type></entry>
|
||||
<entry><literal><link linkend="catalog-pg-class"><structname>pg_class</structname></link>.oid</literal></entry>
|
||||
<entry>The OID of the <structname>pg_class</> entry for this partitioned table</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>partstrat</structfield></entry>
|
||||
<entry><type>char</type></entry>
|
||||
<entry></entry>
|
||||
<entry>
|
||||
Partitioning strategy; <literal>l</> = list partitioned table,
|
||||
<literal>r</> = range partitioned table
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>partnatts</structfield></entry>
|
||||
<entry><type>int2</type></entry>
|
||||
<entry></entry>
|
||||
<entry>The number of columns in partition key</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>partattrs</structfield></entry>
|
||||
<entry><type>int2vector</type></entry>
|
||||
<entry><literal><link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.attnum</literal></entry>
|
||||
<entry>
|
||||
This is an array of <structfield>partnatts</structfield> values that
|
||||
indicate which table columns are part of the partition key. For
|
||||
example, a value of <literal>1 3</literal> would mean that the first
|
||||
and the third table columns make up the partition key. A zero in this
|
||||
array indicates that the corresponding partition key column is an
|
||||
expression, rather than a simple column reference.
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>partclass</structfield></entry>
|
||||
<entry><type>oidvector</type></entry>
|
||||
<entry><literal><link linkend="catalog-pg-opclass"><structname>pg_opclass</structname></link>.oid</literal></entry>
|
||||
<entry>
|
||||
For each column in the partition key, this contains the OID of the
|
||||
operator class to use. See
|
||||
<link linkend="catalog-pg-opclass"><structname>pg_opclass</structname></link> for details.
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>partcollation</structfield></entry>
|
||||
<entry><type>oidvector</type></entry>
|
||||
<entry><literal><link linkend="catalog-pg-opclass"><structname>pg_opclass</structname></link>.oid</literal></entry>
|
||||
<entry>
|
||||
For each column in the partition key, this contains the OID of the
|
||||
the collation to use for partitioning.
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>partexprs</structfield></entry>
|
||||
<entry><type>pg_node_tree</type></entry>
|
||||
<entry></entry>
|
||||
<entry>
|
||||
Expression trees (in <function>nodeToString()</function>
|
||||
representation) for partition key columns that are not simple column
|
||||
references. This is a list with one element for each zero
|
||||
entry in <structfield>partattrs</>. Null if all partition key columns
|
||||
are simple references.
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
</tbody>
|
||||
</tgroup>
|
||||
</table>
|
||||
</sect1>
|
||||
|
||||
|
||||
<sect1 id="catalog-pg-pltemplate">
|
||||
<title><structname>pg_pltemplate</structname></title>
|
||||
|
||||
@ -4896,109 +4899,6 @@ SCRAM-SHA-256$<replaceable><iteration count></>:<replaceable><salt><
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="catalog-pg-partitioned-table">
|
||||
<title><structname>pg_partitioned_table</structname></title>
|
||||
|
||||
<indexterm zone="catalog-pg-partitioned-table">
|
||||
<primary>pg_partitioned_table</primary>
|
||||
</indexterm>
|
||||
|
||||
<para>
|
||||
The catalog <structname>pg_partitioned_table</structname> stores
|
||||
information about how tables are partitioned.
|
||||
</para>
|
||||
|
||||
<table>
|
||||
<title><structname>pg_partitioned_table</> Columns</title>
|
||||
|
||||
<tgroup cols="4">
|
||||
<thead>
|
||||
<row>
|
||||
<entry>Name</entry>
|
||||
<entry>Type</entry>
|
||||
<entry>References</entry>
|
||||
<entry>Description</entry>
|
||||
</row>
|
||||
</thead>
|
||||
|
||||
<tbody>
|
||||
|
||||
<row>
|
||||
<entry><structfield>partrelid</structfield></entry>
|
||||
<entry><type>oid</type></entry>
|
||||
<entry><literal><link linkend="catalog-pg-class"><structname>pg_class</structname></link>.oid</literal></entry>
|
||||
<entry>The OID of the <structname>pg_class</> entry for this partitioned table</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>partstrat</structfield></entry>
|
||||
<entry><type>char</type></entry>
|
||||
<entry></entry>
|
||||
<entry>
|
||||
Partitioning strategy; <literal>l</> = list partitioned table,
|
||||
<literal>r</> = range partitioned table
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>partnatts</structfield></entry>
|
||||
<entry><type>int2</type></entry>
|
||||
<entry></entry>
|
||||
<entry>The number of columns in partition key</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>partattrs</structfield></entry>
|
||||
<entry><type>int2vector</type></entry>
|
||||
<entry><literal><link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.attnum</literal></entry>
|
||||
<entry>
|
||||
This is an array of <structfield>partnatts</structfield> values that
|
||||
indicate which table columns are part of the partition key. For
|
||||
example, a value of <literal>1 3</literal> would mean that the first
|
||||
and the third table columns make up the partition key. A zero in this
|
||||
array indicates that the corresponding partition key column is an
|
||||
expression, rather than a simple column reference.
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>partclass</structfield></entry>
|
||||
<entry><type>oidvector</type></entry>
|
||||
<entry><literal><link linkend="catalog-pg-opclass"><structname>pg_opclass</structname></link>.oid</literal></entry>
|
||||
<entry>
|
||||
For each column in the partition key, this contains the OID of the
|
||||
operator class to use. See
|
||||
<link linkend="catalog-pg-opclass"><structname>pg_opclass</structname></link> for details.
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>partcollation</structfield></entry>
|
||||
<entry><type>oidvector</type></entry>
|
||||
<entry><literal><link linkend="catalog-pg-opclass"><structname>pg_opclass</structname></link>.oid</literal></entry>
|
||||
<entry>
|
||||
For each column in the partition key, this contains the OID of the
|
||||
the collation to use for partitioning.
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>partexprs</structfield></entry>
|
||||
<entry><type>pg_node_tree</type></entry>
|
||||
<entry></entry>
|
||||
<entry>
|
||||
Expression trees (in <function>nodeToString()</function>
|
||||
representation) for partition key columns that are not simple column
|
||||
references. This is a list with one element for each zero
|
||||
entry in <structfield>partattrs</>. Null if all partition key columns
|
||||
are simple references.
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
</tbody>
|
||||
</tgroup>
|
||||
</table>
|
||||
</sect1>
|
||||
|
||||
<sect1 id="catalog-pg-policy">
|
||||
<title><structname>pg_policy</structname></title>
|
||||
@ -6466,6 +6366,120 @@ SCRAM-SHA-256$<replaceable><iteration count></>:<replaceable><salt><
|
||||
|
||||
</sect1>
|
||||
|
||||
<sect1 id="catalog-pg-statistic-ext">
|
||||
<title><structname>pg_statistic_ext</structname></title>
|
||||
|
||||
<indexterm zone="catalog-pg-statistic-ext">
|
||||
<primary>pg_statistic_ext</primary>
|
||||
</indexterm>
|
||||
|
||||
<para>
|
||||
The catalog <structname>pg_statistic_ext</structname>
|
||||
holds extended planner statistics.
|
||||
Each row in this catalog corresponds to a <firstterm>statistics object</>
|
||||
created with <xref linkend="sql-createstatistics">.
|
||||
</para>
|
||||
|
||||
<table>
|
||||
<title><structname>pg_statistic_ext</> Columns</title>
|
||||
|
||||
<tgroup cols="4">
|
||||
<thead>
|
||||
<row>
|
||||
<entry>Name</entry>
|
||||
<entry>Type</entry>
|
||||
<entry>References</entry>
|
||||
<entry>Description</entry>
|
||||
</row>
|
||||
</thead>
|
||||
|
||||
<tbody>
|
||||
|
||||
<row>
|
||||
<entry><structfield>stxrelid</structfield></entry>
|
||||
<entry><type>oid</type></entry>
|
||||
<entry><literal><link linkend="catalog-pg-class"><structname>pg_class</structname></link>.oid</literal></entry>
|
||||
<entry>Table containing the columns described by this object</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>stxname</structfield></entry>
|
||||
<entry><type>name</type></entry>
|
||||
<entry></entry>
|
||||
<entry>Name of the statistics object</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>stxnamespace</structfield></entry>
|
||||
<entry><type>oid</type></entry>
|
||||
<entry><literal><link linkend="catalog-pg-namespace"><structname>pg_namespace</structname></link>.oid</literal></entry>
|
||||
<entry>
|
||||
The OID of the namespace that contains this statistics object
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>stxowner</structfield></entry>
|
||||
<entry><type>oid</type></entry>
|
||||
<entry><literal><link linkend="catalog-pg-authid"><structname>pg_authid</structname></link>.oid</literal></entry>
|
||||
<entry>Owner of the statistics object</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>stxkeys</structfield></entry>
|
||||
<entry><type>int2vector</type></entry>
|
||||
<entry><literal><link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.attnum</literal></entry>
|
||||
<entry>
|
||||
An array of attribute numbers, indicating which table columns are
|
||||
covered by this statistics object;
|
||||
for example a value of <literal>1 3</literal> would
|
||||
mean that the first and the third table columns are covered
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>stxkind</structfield></entry>
|
||||
<entry><type>char[]</type></entry>
|
||||
<entry></entry>
|
||||
<entry>
|
||||
An array containing codes for the enabled statistic types;
|
||||
valid values are:
|
||||
<literal>d</literal> for n-distinct statistics,
|
||||
<literal>f</literal> for functional dependency statistics
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>stxndistinct</structfield></entry>
|
||||
<entry><type>pg_ndistinct</type></entry>
|
||||
<entry></entry>
|
||||
<entry>
|
||||
N-distinct counts, serialized as <structname>pg_ndistinct</> type
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
<row>
|
||||
<entry><structfield>stxdependencies</structfield></entry>
|
||||
<entry><type>pg_dependencies</type></entry>
|
||||
<entry></entry>
|
||||
<entry>
|
||||
Functional dependency statistics, serialized
|
||||
as <structname>pg_dependencies</> type
|
||||
</entry>
|
||||
</row>
|
||||
|
||||
</tbody>
|
||||
</tgroup>
|
||||
</table>
|
||||
|
||||
<para>
|
||||
The <structfield>stxkind</structfield> field is filled at creation of the
|
||||
statistics object, indicating which statistic type(s) are desired.
|
||||
The fields after it are initially NULL and are filled only when the
|
||||
corresponding statistic has been computed by <command>ANALYZE</>.
|
||||
</para>
|
||||
</sect1>
|
||||
|
||||
<sect1 id="catalog-pg-subscription">
|
||||
<title><structname>pg_subscription</structname></title>
|
||||
|
||||
|
@ -1071,25 +1071,41 @@ WHERE tablename = 'road';
|
||||
are independent of each other,
|
||||
an assumption that does not hold when column values are correlated.
|
||||
Regular statistics, because of their per-individual-column nature,
|
||||
do not capture the knowledge of cross-column correlation;
|
||||
<firstterm>multivariate statistics</firstterm> can be used to instruct
|
||||
the server to obtain statistics across such a set of columns,
|
||||
which are later used by the query optimizer
|
||||
to determine cardinality and selectivity
|
||||
of clauses involving those columns.
|
||||
Multivariate statistics are currently the only use of
|
||||
<firstterm>extended statistics</firstterm>.
|
||||
cannot capture any knowledge about cross-column correlation.
|
||||
However, <productname>PostgreSQL</> has the ability to compute
|
||||
<firstterm>multivariate statistics</firstterm>, which can capture
|
||||
such information.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Extended statistics are created using
|
||||
Because the number of possible column combinations is very large,
|
||||
it's impractical to compute multivariate statistics automatically.
|
||||
Instead, <firstterm>extended statistics objects</firstterm>, more often
|
||||
called just <firstterm>statistics objects</>, can be created to instruct
|
||||
the server to obtain statistics across interesting sets of columns.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Statistics objects are created using
|
||||
<xref linkend="sql-createstatistics">, which see for more details.
|
||||
Data collection is deferred until the next <command>ANALYZE</command>
|
||||
on the table, after which the stored values can be examined in the
|
||||
Creation of such an object merely creates a catalog entry expressing
|
||||
interest in the statistics. Actual data collection is performed
|
||||
by <command>ANALYZE</command> (either a manual command, or background
|
||||
auto-analyze). The collected values can be examined in the
|
||||
<link linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>
|
||||
catalog.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
<command>ANALYZE</command> computes extended statistics based on the same
|
||||
sample of table rows that it takes for computing regular single-column
|
||||
statistics. Since the sample size is increased by increasing the
|
||||
statistics target for the table or any of its columns (as described in
|
||||
the previous section), a larger statistics target will normally result in
|
||||
more accurate extended statistics, as well as more time spent calculating
|
||||
them.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
The following subsections describe the types of extended statistics
|
||||
that are currently supported.
|
||||
@ -1099,42 +1115,51 @@ WHERE tablename = 'road';
|
||||
<title>Functional Dependencies</title>
|
||||
|
||||
<para>
|
||||
The simplest type of extended statistics are functional dependencies,
|
||||
a concept used in definitions of database normal forms.
|
||||
Put simply, it is said that column <literal>b</> is functionally
|
||||
dependent on column <literal>a</> if knowledge of the value of
|
||||
<literal>a</> is sufficient to determine the value of <literal>b</>.
|
||||
In normalized databases, functional dependencies are allowed only on
|
||||
primary keys and superkeys. However, many data sets are in practice not
|
||||
fully normalized for various reasons; intentional denormalization for
|
||||
performance reasons is a common example.
|
||||
The simplest type of extended statistics tracks <firstterm>functional
|
||||
dependencies</>, a concept used in definitions of database normal forms.
|
||||
We say that column <structfield>b</> is functionally dependent on
|
||||
column <structfield>a</> if knowledge of the value of
|
||||
<structfield>a</> is sufficient to determine the value
|
||||
of <structfield>b</>, that is there are no two rows having the same value
|
||||
of <structfield>a</> but different values of <structfield>b</>.
|
||||
In a fully normalized database, functional dependencies should exist
|
||||
only on primary keys and superkeys. However, in practice many data sets
|
||||
are not fully normalized for various reasons; intentional
|
||||
denormalization for performance reasons is a common example.
|
||||
Even in a fully normalized database, there may be partial correlation
|
||||
between some columns, which can be expressed as partial functional
|
||||
dependency.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
The existance of functional dependencies directly affects the accuracy
|
||||
of estimates in certain queries.
|
||||
The reason is that conditions on the dependent columns do not
|
||||
restrict the result set, but the query planner (lacking functional
|
||||
dependency knowledge) considers them independent, resulting in
|
||||
underestimates.
|
||||
To inform the planner about the functional dependencies, we collect
|
||||
measurements of dependency during <command>ANALYZE</>. Assessing
|
||||
the degree of dependency between all sets of columns would be
|
||||
prohibitively expensive, so the search is limited to potential
|
||||
dependencies defined using the <literal>dependencies</> option of
|
||||
extended statistics. It is advisable to create
|
||||
<literal>dependencies</> statistics if and only if functional
|
||||
dependencies actually exist, to avoid unnecessary overhead on both
|
||||
<command>ANALYZE</> and query planning.
|
||||
The existence of functional dependencies directly affects the accuracy
|
||||
of estimates in certain queries. If a query contains conditions on
|
||||
both the independent and the dependent column(s), the
|
||||
conditions on the dependent columns do not further reduce the result
|
||||
size; but without knowledge of the functional dependency, the query
|
||||
planner will assume that the conditions are independent, resulting
|
||||
in underestimating the result size.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
To inspect functional dependencies on a statistics
|
||||
<literal>stts</literal>, you may do this:
|
||||
To inform the planner about functional dependencies, <command>ANALYZE</>
|
||||
can collect measurements of cross-column dependency. Assessing the
|
||||
degree of dependency between all sets of columns would be prohibitively
|
||||
expensive, so data collection is limited to those groups of columns
|
||||
appearing together in a statistics object defined with
|
||||
the <literal>dependencies</> option. It is advisable to create
|
||||
<literal>dependencies</> statistics only for column groups that are
|
||||
strongly correlated, to avoid unnecessary overhead in both
|
||||
<command>ANALYZE</> and later query planning.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Here is an example of collecting functional-dependency statistics:
|
||||
<programlisting>
|
||||
CREATE STATISTICS stts (dependencies)
|
||||
ON zip, city FROM zipcodes;
|
||||
CREATE STATISTICS stts (dependencies) ON zip, city FROM zipcodes;
|
||||
|
||||
ANALYZE zipcodes;
|
||||
|
||||
SELECT stxname, stxkeys, stxdependencies
|
||||
FROM pg_statistic_ext
|
||||
WHERE stxname = 'stts';
|
||||
@ -1143,85 +1168,88 @@ SELECT stxname, stxkeys, stxdependencies
|
||||
stts | 1 5 | {"1 => 5": 1.000000, "5 => 1": 0.423130}
|
||||
(1 row)
|
||||
</programlisting>
|
||||
where it can be seen that column 1 (a zip code) fully determines column
|
||||
Here it can be seen that column 1 (zip code) fully determines column
|
||||
5 (city) so the coefficient is 1.0, while city only determines zip code
|
||||
about 42% of the time, meaning that there are many cities (58%) that are
|
||||
represented by more than a single ZIP code.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
When computing the selectivity, the planner inspects all conditions and
|
||||
attempts to identify which conditions are already implied by other
|
||||
conditions. The selectivity estimates from any redundant conditions are
|
||||
ignored from a selectivity point of view. In the example query above,
|
||||
the selectivity estimates for either of the conditions may be eliminated,
|
||||
thus improving the overall estimate.
|
||||
When computing the selectivity for a query involving functionally
|
||||
dependent columns, the planner adjusts the per-condition selectivity
|
||||
estimates using the dependency coefficients so as not to produce
|
||||
an underestimate.
|
||||
</para>
|
||||
|
||||
<sect4>
|
||||
<title>Limitations of Functional Dependencies</title>
|
||||
|
||||
<para>
|
||||
Functional dependencies are a very simple type of statistics, and
|
||||
as such have several limitations. The first limitation is that they
|
||||
only work with simple equality conditions, comparing columns and constant
|
||||
values. It's not possible to use them to eliminate equality conditions
|
||||
comparing two columns or a column to an expression, range clauses,
|
||||
<literal>LIKE</> or any other type of conditions.
|
||||
Functional dependencies are currently only applied when considering
|
||||
simple equality conditions that compare columns to constant values.
|
||||
They are not used to improve estimates for equality conditions
|
||||
comparing two columns or comparing a column to an expression, nor for
|
||||
range clauses, <literal>LIKE</> or any other type of condition.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
When eliminating the implied conditions, the planner assumes that the
|
||||
conditions are compatible. Consider the following example, where
|
||||
this assumption does not hold:
|
||||
|
||||
When estimating with functional dependencies, the planner assumes that
|
||||
conditions on the involved columns are compatible and hence redundant.
|
||||
If they are incompatible, the correct estimate would be zero rows, but
|
||||
that possibility is not considered. For example, given a query like
|
||||
<programlisting>
|
||||
EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 10;
|
||||
QUERY PLAN
|
||||
-----------------------------------------------------------------------------
|
||||
Seq Scan on t (cost=0.00..195.00 rows=100 width=8) (actual rows=0 loops=1)
|
||||
Filter: ((a = 1) AND (b = 10))
|
||||
Rows Removed by Filter: 10000
|
||||
SELECT * FROM zipcodes WHERE city = 'San Francisco' AND zip = '94105';
|
||||
</programlisting>
|
||||
|
||||
While there are no rows with such combination of values, the planner
|
||||
is unable to verify whether the values match — it only knows that
|
||||
the columns are functionally dependent.
|
||||
the planner will disregard the <structfield>city</> clause as not
|
||||
changing the selectivity, which is correct. However, it will make
|
||||
the same assumption about
|
||||
<programlisting>
|
||||
SELECT * FROM zipcodes WHERE city = 'San Francisco' AND zip = '90210';
|
||||
</programlisting>
|
||||
even though there will really be zero rows satisfying this query.
|
||||
Functional dependency statistics do not provide enough information
|
||||
to conclude that, however.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
This assumption is related to queries executed on the database; in many
|
||||
cases, it's actually satisfied (e.g. when the GUI only allows selecting
|
||||
compatible values). But if that's not the case, functional dependencies
|
||||
may not be a viable option.
|
||||
In many practical situations, this assumption is usually satisfied;
|
||||
for example, there might be a GUI in the application that only allows
|
||||
selecting compatible city and zipcode values to use in a query.
|
||||
But if that's not the case, functional dependencies may not be a viable
|
||||
option.
|
||||
</para>
|
||||
</sect4>
|
||||
</sect3>
|
||||
|
||||
<sect3>
|
||||
<title>Multivariate N-Distinct Coefficients</title>
|
||||
<title>Multivariate N-Distinct Counts</title>
|
||||
|
||||
<para>
|
||||
Single-column statistics store the number of distinct values in each
|
||||
column. Estimates of the number of distinct values on more than one
|
||||
column (for example, for <literal>GROUP BY a, b</literal>) are
|
||||
column. Estimates of the number of distinct values when combining more
|
||||
than one column (for example, for <literal>GROUP BY a, b</literal>) are
|
||||
frequently wrong when the planner only has single-column statistical
|
||||
data, however, causing it to select bad plans.
|
||||
In order to improve n-distinct estimation when multiple columns are
|
||||
grouped together, the <literal>ndistinct</> option of extended statistics
|
||||
can be used, which instructs <command>ANALYZE</> to collect n-distinct
|
||||
estimates for all possible combinations of two or more columns of the set
|
||||
of columns in the statistics object (the per-column estimates are already
|
||||
available in <structname>pg_statistic</>).
|
||||
data, causing it to select bad plans.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Continuing the above example, the n-distinct coefficients in a ZIP
|
||||
code table may look like the following:
|
||||
To improve such estimates, <command>ANALYZE</> can collect n-distinct
|
||||
statistics for groups of columns. As before, it's impractical to do
|
||||
this for every possible column grouping, so data is collected only for
|
||||
those groups of columns appearing together in a statistics object
|
||||
defined with the <literal>ndistinct</> option. Data will be collected
|
||||
for each possible combination of two or more columns from the set of
|
||||
listed columns.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Continuing the previous example, the n-distinct counts in a
|
||||
table of ZIP codes might look like the following:
|
||||
<programlisting>
|
||||
CREATE STATISTICS stts2 (ndistinct)
|
||||
ON zip, state, city FROM zipcodes;
|
||||
CREATE STATISTICS stts2 (ndistinct) ON zip, state, city FROM zipcodes;
|
||||
|
||||
ANALYZE zipcodes;
|
||||
|
||||
SELECT stxkeys AS k, stxndistinct AS nd
|
||||
FROM pg_statistic_ext
|
||||
WHERE stxname = 'stts2';
|
||||
@ -1230,11 +1258,19 @@ k | 1 2 5
|
||||
nd | {"1, 2": 33178, "1, 5": 33178, "2, 5": 27435, "1, 2, 5": 33178}
|
||||
(1 row)
|
||||
</programlisting>
|
||||
which indicates that there are three combinations of columns that
|
||||
This indicates that there are three combinations of columns that
|
||||
have 33178 distinct values: ZIP code and state; ZIP code and city;
|
||||
and ZIP code, city and state (the fact that they are all equal is
|
||||
expected given the nature of ZIP-code data). On the other hand,
|
||||
the combination of city and state only has 27435 distinct values.
|
||||
expected given that ZIP code alone is unique in this table). On the
|
||||
other hand, the combination of city and state has only 27435 distinct
|
||||
values.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
It's advisable to create <literal>ndistinct</> statistics objects only
|
||||
on combinations of columns that are actually used for grouping, and
|
||||
for which misestimation of the number of groups is resulting in bad
|
||||
plans. Otherwise, the <command>ANALYZE</> cycles are just wasted.
|
||||
</para>
|
||||
</sect3>
|
||||
</sect2>
|
||||
|
@ -456,10 +456,11 @@ rows = (outer_cardinality * inner_cardinality) * selectivity
|
||||
</indexterm>
|
||||
|
||||
<sect2>
|
||||
<title>Functional dependencies</title>
|
||||
<title>Functional Dependencies</title>
|
||||
|
||||
<para>
|
||||
Multivariate correlation can be seen with a very simple data set — a
|
||||
table with two columns, both containing the same values:
|
||||
Multivariate correlation can be demonstrated with a very simple data set
|
||||
— a table with two columns, both containing the same values:
|
||||
|
||||
<programlisting>
|
||||
CREATE TABLE t (a INT, b INT);
|
||||
@ -501,8 +502,8 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1;
|
||||
number of rows, we see that the estimate is very accurate
|
||||
(in fact exact, as the table is very small). Changing the
|
||||
<literal>WHERE</> to use the <structfield>b</> column, an identical
|
||||
plan is generated. Observe what happens if we apply the same
|
||||
condition on both columns combining them with <literal>AND</>:
|
||||
plan is generated. But observe what happens if we apply the same
|
||||
condition on both columns, combining them with <literal>AND</>:
|
||||
|
||||
<programlisting>
|
||||
EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1;
|
||||
@ -514,15 +515,16 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1;
|
||||
</programlisting>
|
||||
|
||||
The planner estimates the selectivity for each condition individually,
|
||||
arriving to the 1% estimates as above, and then multiplies them, getting
|
||||
the final 0.01% estimate. The <quote>actual</quote> figures, however,
|
||||
show that this results in a significant underestimate, as the actual
|
||||
number of rows matching the conditions (100) is two orders of magnitude
|
||||
higher than the estimated value.
|
||||
arriving at the same 1% estimates as above. Then it assumes that the
|
||||
conditions are independent, and so it multiplies their selectivities,
|
||||
producing a final selectivity estimate of just 0.01%.
|
||||
This is a significant underestimate, as the actual number of rows
|
||||
matching the conditions (100) is two orders of magnitude higher.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
This problem can be fixed by applying functional-dependency
|
||||
This problem can be fixed by creating a statistics object that
|
||||
directs <command>ANALYZE</> to calculate functional-dependency
|
||||
multivariate statistics on the two columns:
|
||||
|
||||
<programlisting>
|
||||
@ -539,13 +541,15 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1;
|
||||
</sect2>
|
||||
|
||||
<sect2>
|
||||
<title>Multivariate N-Distinct coefficients</title>
|
||||
<title>Multivariate N-Distinct Counts</title>
|
||||
|
||||
<para>
|
||||
A similar problem occurs with estimation of the cardinality of distinct
|
||||
elements, used to determine the number of groups that would be generated
|
||||
by a <command>GROUP BY</command> clause. When <command>GROUP BY</command>
|
||||
lists a single column, the n-distinct estimate (which can be seen as the
|
||||
number of rows returned by the aggregate execution node) is very accurate:
|
||||
A similar problem occurs with estimation of the cardinality of sets of
|
||||
multiple columns, such as the number of groups that would be generated by
|
||||
a <command>GROUP BY</command> clause. When <command>GROUP BY</command>
|
||||
lists a single column, the n-distinct estimate (which is visible as the
|
||||
estimated number of rows returned by the HashAggregate node) is very
|
||||
accurate:
|
||||
<programlisting>
|
||||
EXPLAIN (ANALYZE, TIMING OFF) SELECT COUNT(*) FROM t GROUP BY a;
|
||||
QUERY PLAN
|
||||
@ -565,8 +569,8 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT COUNT(*) FROM t GROUP BY a, b;
|
||||
Group Key: a, b
|
||||
-> Seq Scan on t (cost=0.00..145.00 rows=10000 width=8) (actual rows=10000 loops=1)
|
||||
</programlisting>
|
||||
By dropping the existing statistics and re-creating it to include n-distinct
|
||||
calculation, the estimate is much improved:
|
||||
By redefining the statistics object to include n-distinct counts for the
|
||||
two columns, the estimate is much improved:
|
||||
<programlisting>
|
||||
DROP STATISTICS stts;
|
||||
CREATE STATISTICS stts (dependencies, ndistinct) ON a, b FROM t;
|
||||
|
@ -79,11 +79,13 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_na
|
||||
<term><replaceable class="PARAMETER">statistic_type</replaceable></term>
|
||||
<listitem>
|
||||
<para>
|
||||
A statistic type to be computed in this statistics object. Currently
|
||||
supported types are <literal>ndistinct</literal>, which enables
|
||||
n-distinct coefficient tracking,
|
||||
and <literal>dependencies</literal>, which enables functional
|
||||
dependencies.
|
||||
A statistic type to be computed in this statistics object.
|
||||
Currently supported types are
|
||||
<literal>ndistinct</literal>, which enables n-distinct statistics, and
|
||||
<literal>dependencies</literal>, which enables functional
|
||||
dependency statistics.
|
||||
For more information, see <xref linkend="planner-stats-extended">
|
||||
and <xref linkend="multivariate-statistics-examples">.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
@ -92,7 +94,8 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_na
|
||||
<term><replaceable class="PARAMETER">column_name</replaceable></term>
|
||||
<listitem>
|
||||
<para>
|
||||
The name of a table column to be included in the statistics object.
|
||||
The name of a table column to be covered by the computed statistics.
|
||||
At least two column names must be given.
|
||||
</para>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
@ -114,7 +117,9 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_na
|
||||
<title>Notes</title>
|
||||
|
||||
<para>
|
||||
You must be the owner of a table to create or change statistics on it.
|
||||
You must be the owner of a table to create a statistics object
|
||||
reading it. Once created, however, the ownership of the statistics
|
||||
object is independent of the underlying table(s).
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
@ -124,8 +129,8 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_na
|
||||
<para>
|
||||
Create table <structname>t1</> with two functionally dependent columns, i.e.
|
||||
knowledge of a value in the first column is sufficient for determining the
|
||||
value in the other column. Then functional dependencies are built on those
|
||||
columns:
|
||||
value in the other column. Then functional dependency statistics are built
|
||||
on those columns:
|
||||
|
||||
<programlisting>
|
||||
CREATE TABLE t1 (
|
||||
@ -136,21 +141,25 @@ CREATE TABLE t1 (
|
||||
INSERT INTO t1 SELECT i/100, i/500
|
||||
FROM generate_series(1,1000000) s(i);
|
||||
|
||||
ANALYZE t1;
|
||||
|
||||
-- the number of matching rows will be drastically underestimated:
|
||||
EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 0);
|
||||
|
||||
CREATE STATISTICS s1 (dependencies) ON a, b FROM t1;
|
||||
|
||||
ANALYZE t1;
|
||||
|
||||
-- valid combination of values
|
||||
-- now the rowcount estimate is more accurate:
|
||||
EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 0);
|
||||
|
||||
-- invalid combination of values
|
||||
EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 1);
|
||||
</programlisting>
|
||||
|
||||
Without functional-dependency statistics, the planner would make the
|
||||
same estimate of the number of matching rows for these two queries.
|
||||
With such statistics, it is able to tell that one case has matches
|
||||
and the other does not.
|
||||
Without functional-dependency statistics, the planner would assume
|
||||
that the two <literal>WHERE</> conditions are independent, and would
|
||||
multiply their selectivities together to arrive at a much-too-small
|
||||
rowcount estimate.
|
||||
With such statistics, the planner recognizes that the <literal>WHERE</>
|
||||
conditions are redundant and does not underestimate the rowcount.
|
||||
</para>
|
||||
|
||||
</refsect1>
|
||||
|
Loading…
x
Reference in New Issue
Block a user