Edit SGML documentation related to extended statistics.

Use the "statistics object" terminology uniformly here too. Assorted copy-editing. Put new catalogs.sgml sections into alphabetical order.
2025-05-18 00:02:16 -04:00 · 2017-05-14 19:15:52 -04:00 · 2017-05-14 19:15:52 -04:00 · 93ece9cc88
commit 93ece9cc88
parent e84c019598
4 changed files with 397 additions and 334 deletions
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@ -221,13 +221,13 @@
     </row>
     <row>
-      <entry><link linkend="catalog-pg-pltemplate"><structname>pg_pltemplate</structname></link></entry>
+      <entry><link linkend="catalog-pg-partitioned-table"><structname>pg_partitioned_table</structname></link></entry>
-      <entry>template data for procedural languages</entry>
+      <entry>information about partition key of tables</entry>
     </row>
     <row>
-      <entry><link linkend="catalog-pg-partitioned-table"><structname>pg_partitioned_table</structname></link></entry>
+      <entry><link linkend="catalog-pg-pltemplate"><structname>pg_pltemplate</structname></link></entry>
-      <entry>information about partition key of tables</entry>
+      <entry>template data for procedural languages</entry>
     </row>
     <row>
@ -4271,108 +4271,6 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</>:<replaceable>&lt;salt&gt;<
  </table>
 </sect1>
 <sect1 id="catalog-pg-statistic-ext">
  <title><structname>pg_statistic_ext</structname></title>
  <indexterm zone="catalog-pg-statistic-ext">
   <primary>pg_statistic_ext</primary>
  </indexterm>
  <para>
   The catalog <structname>pg_statistic_ext</structname>
   holds extended planner statistics.
  </para>
  <table>
   <title><structname>pg_statistic_ext</> Columns</title>
   <tgroup cols="4">
    <thead>
     <row>
      <entry>Name</entry>
      <entry>Type</entry>
      <entry>References</entry>
      <entry>Description</entry>
     </row>
    </thead>
    <tbody>
     <row>
      <entry><structfield>stxrelid</structfield></entry>
      <entry><type>oid</type></entry>
      <entry><literal><link linkend="catalog-pg-class"><structname>pg_class</structname></link>.oid</literal></entry>
      <entry>The table that the described columns belongs to</entry>
     </row>
     <row>
      <entry><structfield>stxname</structfield></entry>
      <entry><type>name</type></entry>
      <entry></entry>
      <entry>Name of the statistic.</entry>
     </row>
     <row>
      <entry><structfield>stxnamespace</structfield></entry>
      <entry><type>oid</type></entry>
      <entry><literal><link linkend="catalog-pg-namespace"><structname>pg_namespace</structname></link>.oid</literal></entry>
      <entry>
       The OID of the namespace that contains this statistic
      </entry>
     </row>
     <row>
      <entry><structfield>stxowner</structfield></entry>
      <entry><type>oid</type></entry>
      <entry><literal><link linkend="catalog-pg-authid"><structname>pg_authid</structname></link>.oid</literal></entry>
      <entry>Owner of the statistic</entry>
     </row>
     <row>
      <entry><structfield>stxkeys</structfield></entry>
      <entry><type>int2vector</type></entry>
      <entry><literal><link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.attnum</literal></entry>
      <entry>
       This is an array of values that indicate which table columns this
       statistic covers. For example a value of <literal>1 3</literal> would
       mean that the first and the third table columns make up the statistic key.
      </entry>
     </row>
     <row>
      <entry><structfield>stxkind</structfield></entry>
      <entry><type>char[]</type></entry>
      <entry></entry>
      <entry>
        An array with the modes of the enabled statistic types.  Valid values
        are:
        <literal>d</literal> for ndistinct coefficients,
        <literal>f</literal> for functional dependencies.
      </entry>
     </row>
     <row>
      <entry><structfield>stxndistinct</structfield></entry>
      <entry><type>pg_ndistinct</type></entry>
      <entry></entry>
      <entry>
       N-distinct coefficients, serialized as <structname>pg_ndistinct</> type.
      </entry>
     </row>
     <row>
      <entry><structfield>stxdependencies</structfield></entry>
      <entry><type>pg_dependencies</type></entry>
      <entry></entry>
      <entry>
       Functional dependencies, serialized as <structname>pg_dependencies</> type.
      </entry>
     </row>
    </tbody>
   </tgroup>
  </table>
 </sect1>
 <sect1 id="catalog-pg-namespace">
  <title><structname>pg_namespace</structname></title>
@ -4790,6 +4688,111 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</>:<replaceable>&lt;salt&gt;<
 </sect1>
 <sect1 id="catalog-pg-partitioned-table">
  <title><structname>pg_partitioned_table</structname></title>
  <indexterm zone="catalog-pg-partitioned-table">
   <primary>pg_partitioned_table</primary>
  </indexterm>
  <para>
   The catalog <structname>pg_partitioned_table</structname> stores
   information about how tables are partitioned.
  </para>
  <table>
   <title><structname>pg_partitioned_table</> Columns</title>
   <tgroup cols="4">
    <thead>
     <row>
      <entry>Name</entry>
      <entry>Type</entry>
      <entry>References</entry>
      <entry>Description</entry>
     </row>
    </thead>
    <tbody>
     <row>
      <entry><structfield>partrelid</structfield></entry>
      <entry><type>oid</type></entry>
      <entry><literal><link linkend="catalog-pg-class"><structname>pg_class</structname></link>.oid</literal></entry>
      <entry>The OID of the <structname>pg_class</> entry for this partitioned table</entry>
     </row>
     <row>
      <entry><structfield>partstrat</structfield></entry>
      <entry><type>char</type></entry>
      <entry></entry>
      <entry>
       Partitioning strategy; <literal>l</> = list partitioned table,
       <literal>r</> = range partitioned table
      </entry>
     </row>
     <row>
      <entry><structfield>partnatts</structfield></entry>
      <entry><type>int2</type></entry>
      <entry></entry>
      <entry>The number of columns in partition key</entry>
     </row>
     <row>
      <entry><structfield>partattrs</structfield></entry>
      <entry><type>int2vector</type></entry>
      <entry><literal><link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.attnum</literal></entry>
      <entry>
       This is an array of <structfield>partnatts</structfield> values that
       indicate which table columns are part of the partition key.  For
       example, a value of <literal>1 3</literal> would mean that the first
       and the third table columns make up the partition key.  A zero in this
       array indicates that the corresponding partition key column is an
       expression, rather than a simple column reference.
      </entry>
     </row>
     <row>
      <entry><structfield>partclass</structfield></entry>
      <entry><type>oidvector</type></entry>
      <entry><literal><link linkend="catalog-pg-opclass"><structname>pg_opclass</structname></link>.oid</literal></entry>
      <entry>
       For each column in the partition key, this contains the OID of the
       operator class to use.  See
       <link linkend="catalog-pg-opclass"><structname>pg_opclass</structname></link> for details.
      </entry>
     </row>
     <row>
      <entry><structfield>partcollation</structfield></entry>
      <entry><type>oidvector</type></entry>
      <entry><literal><link linkend="catalog-pg-opclass"><structname>pg_opclass</structname></link>.oid</literal></entry>
      <entry>
       For each column in the partition key, this contains the OID of the
       the collation to use for partitioning.
      </entry>
     </row>
     <row>
      <entry><structfield>partexprs</structfield></entry>
      <entry><type>pg_node_tree</type></entry>
      <entry></entry>
      <entry>
       Expression trees (in <function>nodeToString()</function>
       representation) for partition key columns that are not simple column
       references.  This is a list with one element for each zero
       entry in <structfield>partattrs</>.  Null if all partition key columns
       are simple references.
      </entry>
     </row>
    </tbody>
   </tgroup>
  </table>
 </sect1>
 <sect1 id="catalog-pg-pltemplate">
  <title><structname>pg_pltemplate</structname></title>
@ -4896,109 +4899,6 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</>:<replaceable>&lt;salt&gt;<
 </sect1>
 <sect1 id="catalog-pg-partitioned-table">
  <title><structname>pg_partitioned_table</structname></title>
  <indexterm zone="catalog-pg-partitioned-table">
   <primary>pg_partitioned_table</primary>
  </indexterm>
  <para>
   The catalog <structname>pg_partitioned_table</structname> stores
   information about how tables are partitioned.
  </para>
  <table>
   <title><structname>pg_partitioned_table</> Columns</title>
   <tgroup cols="4">
    <thead>
     <row>
      <entry>Name</entry>
      <entry>Type</entry>
      <entry>References</entry>
      <entry>Description</entry>
     </row>
    </thead>
    <tbody>
     <row>
      <entry><structfield>partrelid</structfield></entry>
      <entry><type>oid</type></entry>
      <entry><literal><link linkend="catalog-pg-class"><structname>pg_class</structname></link>.oid</literal></entry>
      <entry>The OID of the <structname>pg_class</> entry for this partitioned table</entry>
     </row>
     <row>
      <entry><structfield>partstrat</structfield></entry>
      <entry><type>char</type></entry>
      <entry></entry>
      <entry>
       Partitioning strategy; <literal>l</> = list partitioned table,
       <literal>r</> = range partitioned table
      </entry>
     </row>
     <row>
      <entry><structfield>partnatts</structfield></entry>
      <entry><type>int2</type></entry>
      <entry></entry>
      <entry>The number of columns in partition key</entry>
     </row>
     <row>
      <entry><structfield>partattrs</structfield></entry>
      <entry><type>int2vector</type></entry>
      <entry><literal><link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.attnum</literal></entry>
      <entry>
       This is an array of <structfield>partnatts</structfield> values that
       indicate which table columns are part of the partition key.  For
       example, a value of <literal>1 3</literal> would mean that the first
       and the third table columns make up the partition key.  A zero in this
       array indicates that the corresponding partition key column is an
       expression, rather than a simple column reference.
      </entry>
     </row>
     <row>
      <entry><structfield>partclass</structfield></entry>
      <entry><type>oidvector</type></entry>
      <entry><literal><link linkend="catalog-pg-opclass"><structname>pg_opclass</structname></link>.oid</literal></entry>
      <entry>
       For each column in the partition key, this contains the OID of the
       operator class to use.  See
       <link linkend="catalog-pg-opclass"><structname>pg_opclass</structname></link> for details.
      </entry>
     </row>
     <row>
      <entry><structfield>partcollation</structfield></entry>
      <entry><type>oidvector</type></entry>
      <entry><literal><link linkend="catalog-pg-opclass"><structname>pg_opclass</structname></link>.oid</literal></entry>
      <entry>
       For each column in the partition key, this contains the OID of the
       the collation to use for partitioning.
      </entry>
     </row>
     <row>
      <entry><structfield>partexprs</structfield></entry>
      <entry><type>pg_node_tree</type></entry>
      <entry></entry>
      <entry>
       Expression trees (in <function>nodeToString()</function>
       representation) for partition key columns that are not simple column
       references.  This is a list with one element for each zero
       entry in <structfield>partattrs</>.  Null if all partition key columns
       are simple references.
      </entry>
     </row>
    </tbody>
   </tgroup>
  </table>
 </sect1>
 <sect1 id="catalog-pg-policy">
  <title><structname>pg_policy</structname></title>
@ -6466,6 +6366,120 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</>:<replaceable>&lt;salt&gt;<
 </sect1>
 <sect1 id="catalog-pg-statistic-ext">
  <title><structname>pg_statistic_ext</structname></title>
  <indexterm zone="catalog-pg-statistic-ext">
   <primary>pg_statistic_ext</primary>
  </indexterm>
  <para>
   The catalog <structname>pg_statistic_ext</structname>
   holds extended planner statistics.
   Each row in this catalog corresponds to a <firstterm>statistics object</>
   created with <xref linkend="sql-createstatistics">.
  </para>
  <table>
   <title><structname>pg_statistic_ext</> Columns</title>
   <tgroup cols="4">
    <thead>
     <row>
      <entry>Name</entry>
      <entry>Type</entry>
      <entry>References</entry>
      <entry>Description</entry>
     </row>
    </thead>
    <tbody>
     <row>
      <entry><structfield>stxrelid</structfield></entry>
      <entry><type>oid</type></entry>
      <entry><literal><link linkend="catalog-pg-class"><structname>pg_class</structname></link>.oid</literal></entry>
      <entry>Table containing the columns described by this object</entry>
     </row>
     <row>
      <entry><structfield>stxname</structfield></entry>
      <entry><type>name</type></entry>
      <entry></entry>
      <entry>Name of the statistics object</entry>
     </row>
     <row>
      <entry><structfield>stxnamespace</structfield></entry>
      <entry><type>oid</type></entry>
      <entry><literal><link linkend="catalog-pg-namespace"><structname>pg_namespace</structname></link>.oid</literal></entry>
      <entry>
       The OID of the namespace that contains this statistics object
      </entry>
     </row>
     <row>
      <entry><structfield>stxowner</structfield></entry>
      <entry><type>oid</type></entry>
      <entry><literal><link linkend="catalog-pg-authid"><structname>pg_authid</structname></link>.oid</literal></entry>
      <entry>Owner of the statistics object</entry>
     </row>
     <row>
      <entry><structfield>stxkeys</structfield></entry>
      <entry><type>int2vector</type></entry>
      <entry><literal><link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.attnum</literal></entry>
      <entry>
       An array of attribute numbers, indicating which table columns are
       covered by this statistics object;
       for example a value of <literal>1 3</literal> would
       mean that the first and the third table columns are covered
      </entry>
     </row>
     <row>
      <entry><structfield>stxkind</structfield></entry>
      <entry><type>char[]</type></entry>
      <entry></entry>
      <entry>
        An array containing codes for the enabled statistic types;
        valid values are:
        <literal>d</literal> for n-distinct statistics,
        <literal>f</literal> for functional dependency statistics
      </entry>
     </row>
     <row>
      <entry><structfield>stxndistinct</structfield></entry>
      <entry><type>pg_ndistinct</type></entry>
      <entry></entry>
      <entry>
       N-distinct counts, serialized as <structname>pg_ndistinct</> type
      </entry>
     </row>
     <row>
      <entry><structfield>stxdependencies</structfield></entry>
      <entry><type>pg_dependencies</type></entry>
      <entry></entry>
      <entry>
       Functional dependency statistics, serialized
       as <structname>pg_dependencies</> type
      </entry>
     </row>
    </tbody>
   </tgroup>
  </table>
  <para>
   The <structfield>stxkind</structfield> field is filled at creation of the
   statistics object, indicating which statistic type(s) are desired.
   The fields after it are initially NULL and are filled only when the
   corresponding statistic has been computed by <command>ANALYZE</>.
  </para>
 </sect1>
 <sect1 id="catalog-pg-subscription">
  <title><structname>pg_subscription</structname></title>
--- a/doc/src/sgml/perform.sgml
+++ b/doc/src/sgml/perform.sgml
@ -1071,25 +1071,41 @@ WHERE tablename = 'road';
    are independent of each other,
    an assumption that does not hold when column values are correlated.
    Regular statistics, because of their per-individual-column nature,
-    do not capture the knowledge of cross-column correlation;
+    cannot capture any knowledge about cross-column correlation.
-    <firstterm>multivariate statistics</firstterm> can be used to instruct
+    However, <productname>PostgreSQL</> has the ability to compute
-    the server to obtain statistics across such a set of columns,
+    <firstterm>multivariate statistics</firstterm>, which can capture
-    which are later used by the query optimizer
+    such information.
    to determine cardinality and selectivity
    of clauses involving those columns.
    Multivariate statistics are currently the only use of
    <firstterm>extended statistics</firstterm>.
   </para>
   <para>
-    Extended statistics are created using
+    Because the number of possible column combinations is very large,
    it's impractical to compute multivariate statistics automatically.
    Instead, <firstterm>extended statistics objects</firstterm>, more often
    called just <firstterm>statistics objects</>, can be created to instruct
    the server to obtain statistics across interesting sets of columns.
   </para>
   <para>
    Statistics objects are created using
    <xref linkend="sql-createstatistics">, which see for more details.
-    Data collection is deferred until the next <command>ANALYZE</command>
+    Creation of such an object merely creates a catalog entry expressing
-    on the table, after which the stored values can be examined in the
+    interest in the statistics.  Actual data collection is performed
    by <command>ANALYZE</command> (either a manual command, or background
    auto-analyze).  The collected values can be examined in the
    <link linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>
    catalog.
   </para>
   <para>
    <command>ANALYZE</command> computes extended statistics based on the same
    sample of table rows that it takes for computing regular single-column
    statistics.  Since the sample size is increased by increasing the
    statistics target for the table or any of its columns (as described in
    the previous section), a larger statistics target will normally result in
    more accurate extended statistics, as well as more time spent calculating
    them.
   </para>
   <para>
    The following subsections describe the types of extended statistics
    that are currently supported.
@ -1099,142 +1115,162 @@ WHERE tablename = 'road';
    <title>Functional Dependencies</title>
    <para>
-     The simplest type of extended statistics are functional dependencies,
+     The simplest type of extended statistics tracks <firstterm>functional
-     a concept used in definitions of database normal forms.
+     dependencies</>, a concept used in definitions of database normal forms.
-     Put simply, it is said that column <literal>b</> is functionally
+     We say that column <structfield>b</> is functionally dependent on
-     dependent on column <literal>a</> if knowledge of the value of
+     column <structfield>a</> if knowledge of the value of
-     <literal>a</> is sufficient to determine the value of <literal>b</>.
+     <structfield>a</> is sufficient to determine the value
-     In normalized databases, functional dependencies are allowed only on
+     of <structfield>b</>, that is there are no two rows having the same value
-     primary keys and superkeys. However, many data sets are in practice not
+     of <structfield>a</> but different values of <structfield>b</>.
-     fully normalized for various reasons; intentional denormalization for
+     In a fully normalized database, functional dependencies should exist
-     performance reasons is a common example.
+     only on primary keys and superkeys. However, in practice many data sets
     are not fully normalized for various reasons; intentional
     denormalization for performance reasons is a common example.
     Even in a fully normalized database, there may be partial correlation
     between some columns, which can be expressed as partial functional
     dependency.
    </para>
    <para>
-     The existance of functional dependencies directly affects the accuracy
+     The existence of functional dependencies directly affects the accuracy
-     of estimates in certain queries.
+     of estimates in certain queries.  If a query contains conditions on
-     The reason is that conditions on the dependent columns do not
+     both the independent and the dependent column(s), the
-     restrict the result set, but the query planner (lacking functional
+     conditions on the dependent columns do not further reduce the result
-     dependency knowledge) considers them independent, resulting in
+     size; but without knowledge of the functional dependency, the query
-     underestimates.
+     planner will assume that the conditions are independent, resulting
-     To inform the planner about the functional dependencies, we collect
+     in underestimating the result size.
     measurements of dependency during <command>ANALYZE</>. Assessing
     the degree of dependency between all sets of columns would be
     prohibitively expensive, so the search is limited to potential
     dependencies defined using the <literal>dependencies</> option of
     extended statistics.  It is advisable to create
     <literal>dependencies</> statistics if and only if functional
     dependencies actually exist, to avoid unnecessary overhead on both
     <command>ANALYZE</> and query planning.
    </para>
    <para>
-     To inspect functional dependencies on a statistics
+     To inform the planner about functional dependencies, <command>ANALYZE</>
-     <literal>stts</literal>, you may do this:
+     can collect measurements of cross-column dependency. Assessing the
     degree of dependency between all sets of columns would be prohibitively
     expensive, so data collection is limited to those groups of columns
     appearing together in a statistics object defined with
     the <literal>dependencies</> option.  It is advisable to create
     <literal>dependencies</> statistics only for column groups that are
     strongly correlated, to avoid unnecessary overhead in both
     <command>ANALYZE</> and later query planning.
    </para>
    <para>
     Here is an example of collecting functional-dependency statistics:
 <programlisting>
-CREATE STATISTICS stts (dependencies)
+CREATE STATISTICS stts (dependencies) ON zip, city FROM zipcodes;
-               ON zip, city FROM zipcodes;
+
 ANALYZE zipcodes;
 SELECT stxname, stxkeys, stxdependencies
  FROM pg_statistic_ext
- WHERE stxname = 'stts';
+  WHERE stxname = 'stts';
 stxname | stxkeys |             stxdependencies               
 ---------+---------+------------------------------------------
 stts    | 1 5     | {"1 => 5": 1.000000, "5 => 1": 0.423130}
 (1 row)
 </programlisting>
-     where it can be seen that column 1 (a zip code) fully determines column
+     Here it can be seen that column 1 (zip code) fully determines column
     5 (city) so the coefficient is 1.0, while city only determines zip code
     about 42% of the time, meaning that there are many cities (58%) that are
     represented by more than a single ZIP code.
    </para>
    <para>
-     When computing the selectivity, the planner inspects all conditions and
+     When computing the selectivity for a query involving functionally
-     attempts to identify which conditions are already implied by other
+     dependent columns, the planner adjusts the per-condition selectivity
-     conditions.  The selectivity estimates from any redundant conditions are
+     estimates using the dependency coefficients so as not to produce
-     ignored from a selectivity point of view. In the example query above,
+     an underestimate.
     the selectivity estimates for either of the conditions may be eliminated,
     thus improving the overall estimate.
    </para>
    <sect4>
     <title>Limitations of Functional Dependencies</title>
     <para>
-      Functional dependencies are a very simple type of statistics, and
+      Functional dependencies are currently only applied when considering
-      as such have several limitations. The first limitation is that they
+      simple equality conditions that compare columns to constant values.
-      only work with simple equality conditions, comparing columns and constant
+      They are not used to improve estimates for equality conditions
-      values. It's not possible to use them to eliminate equality conditions
+      comparing two columns or comparing a column to an expression, nor for
-      comparing two columns or a column to an expression, range clauses,
+      range clauses, <literal>LIKE</> or any other type of condition.
      <literal>LIKE</> or any other type of conditions.
     </para>
     <para>
-      When eliminating the implied conditions, the planner assumes that the
+      When estimating with functional dependencies, the planner assumes that
-      conditions are compatible. Consider the following example, where
+      conditions on the involved columns are compatible and hence redundant.
-      this assumption does not hold:
+      If they are incompatible, the correct estimate would be zero rows, but
-
+      that possibility is not considered.  For example, given a query like
 <programlisting>
-EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 10;
+SELECT * FROM zipcodes WHERE city = 'San Francisco' AND zip = '94105';
                                 QUERY PLAN                                  
 -----------------------------------------------------------------------------
 Seq Scan on t  (cost=0.00..195.00 rows=100 width=8) (actual rows=0 loops=1)
   Filter: ((a = 1) AND (b = 10))
   Rows Removed by Filter: 10000
 </programlisting>
-
+      the planner will disregard the <structfield>city</> clause as not
-      While there are no rows with such combination of values, the planner
+      changing the selectivity, which is correct.  However, it will make
-      is unable to verify whether the values match &mdash; it only knows that
+      the same assumption about
-      the columns are functionally dependent.
+<programlisting>
 SELECT * FROM zipcodes WHERE city = 'San Francisco' AND zip = '90210';
 </programlisting>
      even though there will really be zero rows satisfying this query.
      Functional dependency statistics do not provide enough information
      to conclude that, however.
     </para>
     <para>
-      This assumption is related to queries executed on the database; in many
+      In many practical situations, this assumption is usually satisfied;
-      cases, it's actually satisfied (e.g. when the GUI only allows selecting
+      for example, there might be a GUI in the application that only allows
-      compatible values). But if that's not the case, functional dependencies
+      selecting compatible city and zipcode values to use in a query.
-      may not be a viable option.
+      But if that's not the case, functional dependencies may not be a viable
      option.
     </para>
    </sect4>
   </sect3>
   <sect3>
-    <title>Multivariate N-Distinct Coefficients</title>
+    <title>Multivariate N-Distinct Counts</title>
    <para>
     Single-column statistics store the number of distinct values in each
-     column.  Estimates of the number of distinct values on more than one
+     column.  Estimates of the number of distinct values when combining more
-     column (for example, for <literal>GROUP BY a, b</literal>) are
+     than one column (for example, for <literal>GROUP BY a, b</literal>) are
     frequently wrong when the planner only has single-column statistical
-     data, however, causing it to select bad plans.
+     data, causing it to select bad plans.
     In order to improve n-distinct estimation when multiple columns are
     grouped together, the <literal>ndistinct</> option of extended statistics
     can be used, which instructs <command>ANALYZE</> to collect n-distinct
     estimates for all possible combinations of two or more columns of the set
     of columns in the statistics object (the per-column estimates are already
     available in <structname>pg_statistic</>).
    </para>
    <para>
-     Continuing the above example, the n-distinct coefficients in a ZIP
+     To improve such estimates, <command>ANALYZE</> can collect n-distinct
-     code table may look like the following:
+     statistics for groups of columns.  As before, it's impractical to do
     this for every possible column grouping, so data is collected only for
     those groups of columns appearing together in a statistics object
     defined with the <literal>ndistinct</> option.  Data will be collected
     for each possible combination of two or more columns from the set of
     listed columns.
    </para>
    <para>
     Continuing the previous example, the n-distinct counts in a
     table of ZIP codes might look like the following:
 <programlisting>
-CREATE STATISTICS stts2 (ndistinct)
+CREATE STATISTICS stts2 (ndistinct) ON zip, state, city FROM zipcodes;
-               ON zip, state, city FROM zipcodes;
+
 ANALYZE zipcodes;
 SELECT stxkeys AS k, stxndistinct AS nd
  FROM pg_statistic_ext
- WHERE stxname = 'stts2';
+  WHERE stxname = 'stts2';
 -[ RECORD 1 ]--------------------------------------------------------
 k  | 1 2 5
 nd | {"1, 2": 33178, "1, 5": 33178, "2, 5": 27435, "1, 2, 5": 33178}
 (1 row)
 </programlisting>
-     which indicates that there are three combinations of columns that
+     This indicates that there are three combinations of columns that
     have 33178 distinct values: ZIP code and state; ZIP code and city;
     and ZIP code, city and state (the fact that they are all equal is
-     expected given the nature of ZIP-code data).  On the other hand,
+     expected given that ZIP code alone is unique in this table).  On the
-     the combination of city and state only has 27435 distinct values.
+     other hand, the combination of city and state has only 27435 distinct
     values.
    </para>
    <para>
     It's advisable to create <literal>ndistinct</> statistics objects only
     on combinations of columns that are actually used for grouping, and
     for which misestimation of the number of groups is resulting in bad
     plans.  Otherwise, the <command>ANALYZE</> cycles are just wasted.
    </para>
   </sect3>
  </sect2>
--- a/doc/src/sgml/planstats.sgml
+++ b/doc/src/sgml/planstats.sgml
@ -456,10 +456,11 @@ rows = (outer_cardinality * inner_cardinality) * selectivity
  </indexterm>
  <sect2>
-   <title>Functional dependencies</title>
+   <title>Functional Dependencies</title>
   <para>
-    Multivariate correlation can be seen with a very simple data set &mdash; a
+    Multivariate correlation can be demonstrated with a very simple data set
-    table with two columns, both containing the same values:
+    &mdash; a table with two columns, both containing the same values:
 <programlisting>
 CREATE TABLE t (a INT, b INT);
@ -501,8 +502,8 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1;
    number of rows, we see that the estimate is very accurate
    (in fact exact, as the table is very small).  Changing the
    <literal>WHERE</> to use the <structfield>b</> column, an identical
-    plan is generated.  Observe what happens if we apply the same
+    plan is generated.  But observe what happens if we apply the same
-    condition on both columns combining them with <literal>AND</>:
+    condition on both columns, combining them with <literal>AND</>:
 <programlisting>
 EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1;
@ -514,15 +515,16 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1;
 </programlisting>
    The planner estimates the selectivity for each condition individually,
-    arriving to the 1% estimates as above, and then multiplies them, getting
+    arriving at the same 1% estimates as above.  Then it assumes that the
-    the final 0.01% estimate. The <quote>actual</quote> figures, however,
+    conditions are independent, and so it multiplies their selectivities,
-    show that this results in a significant underestimate, as the actual
+    producing a final selectivity estimate of just 0.01%.
-    number of rows matching the conditions (100) is two orders of magnitude
+    This is a significant underestimate, as the actual number of rows
-    higher than the estimated value.
+    matching the conditions (100) is two orders of magnitude higher.
   </para>
   <para>
-    This problem can be fixed by applying functional-dependency
+    This problem can be fixed by creating a statistics object that
    directs <command>ANALYZE</> to calculate functional-dependency
    multivariate statistics on the two columns:
 <programlisting>
@ -539,13 +541,15 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT * FROM t WHERE a = 1 AND b = 1;
  </sect2>
  <sect2>
-   <title>Multivariate N-Distinct coefficients</title>
+   <title>Multivariate N-Distinct Counts</title>
   <para>
-    A similar problem occurs with estimation of the cardinality of distinct
+    A similar problem occurs with estimation of the cardinality of sets of
-    elements, used to determine the number of groups that would be generated
+    multiple columns, such as the number of groups that would be generated by
-    by a <command>GROUP BY</command> clause.  When <command>GROUP BY</command>
+    a <command>GROUP BY</command> clause.  When <command>GROUP BY</command>
-    lists a single column, the n-distinct estimate (which can be seen as the
+    lists a single column, the n-distinct estimate (which is visible as the
-    number of rows returned by the aggregate execution node) is very accurate:
+    estimated number of rows returned by the HashAggregate node) is very
    accurate:
 <programlisting>
 EXPLAIN (ANALYZE, TIMING OFF) SELECT COUNT(*) FROM t GROUP BY a;
                                       QUERY PLAN                                        
@ -565,8 +569,8 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT COUNT(*) FROM t GROUP BY a, b;
   Group Key: a, b
   -&gt;  Seq Scan on t  (cost=0.00..145.00 rows=10000 width=8) (actual rows=10000 loops=1)
 </programlisting>
-    By dropping the existing statistics and re-creating it to include n-distinct
+    By redefining the statistics object to include n-distinct counts for the
-    calculation, the estimate is much improved:
+    two columns, the estimate is much improved:
 <programlisting>
 DROP STATISTICS stts;
 CREATE STATISTICS stts (dependencies, ndistinct) ON a, b FROM t;
--- a/doc/src/sgml/ref/create_statistics.sgml
+++ b/doc/src/sgml/ref/create_statistics.sgml
@ -79,11 +79,13 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_na
    <term><replaceable class="PARAMETER">statistic_type</replaceable></term>
    <listitem>
     <para>
-      A statistic type to be computed in this statistics object.  Currently
+      A statistic type to be computed in this statistics object.
-      supported types are <literal>ndistinct</literal>, which enables
+      Currently supported types are
-      n-distinct coefficient tracking,
+      <literal>ndistinct</literal>, which enables n-distinct statistics, and
-      and <literal>dependencies</literal>, which enables functional
+      <literal>dependencies</literal>, which enables functional
-      dependencies.
+      dependency statistics.
      For more information, see <xref linkend="planner-stats-extended">
      and <xref linkend="multivariate-statistics-examples">.
     </para>
    </listitem>
   </varlistentry>
@ -92,7 +94,8 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_na
    <term><replaceable class="PARAMETER">column_name</replaceable></term>
    <listitem>
     <para>
-      The name of a table column to be included in the statistics object.
+      The name of a table column to be covered by the computed statistics.
      At least two column names must be given.
     </para>
    </listitem>
   </varlistentry>
@ -114,7 +117,9 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_na
  <title>Notes</title>
  <para>
-   You must be the owner of a table to create or change statistics on it.
+   You must be the owner of a table to create a statistics object
   reading it.  Once created, however, the ownership of the statistics
   object is independent of the underlying table(s).
  </para>
 </refsect1>
@ -124,8 +129,8 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_na
  <para>
   Create table <structname>t1</> with two functionally dependent columns, i.e.
   knowledge of a value in the first column is sufficient for determining the
-   value in the other column. Then functional dependencies are built on those
+   value in the other column. Then functional dependency statistics are built
-   columns:
+   on those columns:
 <programlisting>
 CREATE TABLE t1 (
@ -136,21 +141,25 @@ CREATE TABLE t1 (
 INSERT INTO t1 SELECT i/100, i/500
                 FROM generate_series(1,1000000) s(i);
 ANALYZE t1;
 -- the number of matching rows will be drastically underestimated:
 EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 0);
 CREATE STATISTICS s1 (dependencies) ON a, b FROM t1;
 ANALYZE t1;
-- valid combination of values
+-- now the rowcount estimate is more accurate:
 EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 0);
 -- invalid combination of values
 EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 1);
 </programlisting>
-   Without functional-dependency statistics, the planner would make the
+   Without functional-dependency statistics, the planner would assume
-   same estimate of the number of matching rows for these two queries.
+   that the two <literal>WHERE</> conditions are independent, and would
-   With such statistics, it is able to tell that one case has matches
+   multiply their selectivities together to arrive at a much-too-small
-   and the other does not.
+   rowcount estimate.
   With such statistics, the planner recognizes that the <literal>WHERE</>
   conditions are redundant and does not underestimate the rowcount.
  </para>
 </refsect1>