Change patternsel (LIKE/regex selectivity estimation) so that if there is a large enough histogram, it will use the number of matches in the histogram to derive a selectivity estimate, rather than the admittedly pretty bogus heuristics involving examining the pattern contents. I set 'large enough' at 100, but perhaps we should change that later. Also apply the same technique in contrib/ltree's <@ and @> estimator. Per discussion with Stefan Kaltenbrunner and Matteo Beccati.
2025-07-13 00:01:36 -04:00 · 2006-09-20 19:50:21 +00:00 · 2006-09-20 19:50:21 +00:00 · bfd1ffa948
commit bfd1ffa948
parent 06b33f0ee8
3 changed files with 245 additions and 116 deletions
--- a/contrib/ltree/ltree_op.c
+++ b/contrib/ltree/ltree_op.c
@ -1,13 +1,14 @@
 /*
 * op function for ltree
 * Teodor Sigaev <teodor@stack.net>
- * $PostgreSQL: pgsql/contrib/ltree/ltree_op.c,v 1.12 2006/05/30 22:12:13 tgl Exp $
+ * $PostgreSQL: pgsql/contrib/ltree/ltree_op.c,v 1.13 2006/09/20 19:50:21 tgl Exp $
 */
 #include "ltree.h"
 #include <ctype.h>
 #include "catalog/pg_statistic.h"
 #include "utils/lsyscache.h"
 #include "utils/selfuncs.h"
 #include "utils/syscache.h"
@ -606,6 +607,7 @@ ltreeparentsel(PG_FUNCTION_ARGS)
 		FmgrInfo	contproc;
 		double		mcvsum;
 		double		mcvsel;
 		double		nullfrac;
 		fmgr_info(get_opcode(operator), &contproc);
@ -616,10 +618,40 @@ ltreeparentsel(PG_FUNCTION_ARGS)
 								 &mcvsum);
 		/*
-		 * We have the exact selectivity for values appearing in the MCV list;
+		 * If the histogram is large enough, see what fraction of it the
-		 * use the default selectivity for the rest of the population.
+		 * constant is "<@" to, and assume that's representative of the
 		 * non-MCV population.  Otherwise use the default selectivity for
 		 * the non-MCV population.
 		 */
-		selec = mcvsel + DEFAULT_PARENT_SEL * (1.0 - mcvsum);
+		selec = histogram_selectivity(&vardata, &contproc,
 									  constval, varonleft,
 									  100, 1);
 		if (selec < 0)
 		{
 			/* Nope, fall back on default */
 			selec = DEFAULT_PARENT_SEL;
 		}
 		else
 		{
 			/* Yes, but don't believe extremely small or large estimates. */
 			if (selec < 0.0001)
 				selec = 0.0001;
 			else if (selec > 0.9999)
 				selec = 0.9999;
 		}
 		if (HeapTupleIsValid(vardata.statsTuple))
 			nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
 		else
 			nullfrac = 0.0;
 		/*
 		 * Now merge the results from the MCV and histogram calculations,
 		 * realizing that the histogram covers only the non-null values that
 		 * are not listed in MCV.
 		 */
 		selec *= 1.0 - nullfrac - mcvsum;
 		selec += mcvsel;
 	}
 	else
 		selec = DEFAULT_PARENT_SEL;
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@ -15,7 +15,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.212 2006/09/19 22:49:53 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.213 2006/09/20 19:50:21 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -235,7 +235,7 @@ eqsel(PG_FUNCTION_ARGS)
 			{
 				/*
 				 * Constant is "=" to this common value.  We know selectivity
-				 * exactly (or as exactly as VACUUM could calculate it,
+				 * exactly (or as exactly as ANALYZE could calculate it,
 				 * anyway).
 				 */
 				selec = numbers[i];
@ -315,7 +315,7 @@ eqsel(PG_FUNCTION_ARGS)
 	else
 	{
 		/*
-		 * No VACUUM ANALYZE stats available, so make a guess using estimated
+		 * No ANALYZE stats available, so make a guess using estimated
 		 * number of distinct values and assuming they are equally common.
 		 * (The guess is unlikely to be very good, but we do know a few
 		 * special cases.)
@ -446,7 +446,7 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
 }
 /*
- *	mcv_selectivity				- Examine the MCV list for scalarineqsel
+ *	mcv_selectivity			- Examine the MCV list for selectivity estimates
 *
 * Determine the fraction of the variable's MCV population that satisfies
 * the predicate (VAR OP CONST), or (CONST OP VAR) if !varonleft.  Also
@ -500,6 +500,80 @@ mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
 	return mcv_selec;
 }
 /*
 *	histogram_selectivity	- Estimate selectivity by sampling the histogram
 *
 * Counts how many of the variable's histogram entries satisfy the predicate
 * (VAR OP CONST) when varonleft is true, or (CONST OP VAR) otherwise, and
 * returns that count as a fraction of the entries examined.
 *
 * The operator need not be related to the histogram's sort operator: any
 * boolean-returning predicate is acceptable, because the histogram is
 * treated purely as a representative sample of the column.  A small
 * histogram is a poor sample, so the caller passes min_hist_size and must
 * be prepared to use some other estimation method when we decline.
 *
 * n_skip is the number of entries ignored at each end of the histogram;
 * the extreme entries are outliers and so are not very representative.
 * When unsure, min_hist_size = 100 and n_skip = 1 are reasonable choices.
 *
 * Returns the selectivity, or -1 when there is no histogram or it has
 * fewer than min_hist_size entries.
 *
 * The result says nothing about most-common-values entries or nulls; the
 * caller is expected to fold in statistics for those parts of the column
 * population.  Callers may also want to clamp the result, ie, not trust
 * an exact 0 or 1.
 */
 double
 histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
 					  Datum constval, bool varonleft,
 					  int min_hist_size, int n_skip)
 {
 	Datum	   *hist_values;
 	int			hist_count;
 	double		selec = -1;		/* "no usable histogram" unless proven otherwise */
 	/* the parameters must leave at least one entry to examine */
 	Assert(n_skip >= 0);
 	Assert(min_hist_size > 2 * n_skip);
 	/* without a stats tuple there is nothing to look at */
 	if (!HeapTupleIsValid(vardata->statsTuple))
 		return selec;
 	/* likewise if the stats tuple carries no histogram slot */
 	if (!get_attstatsslot(vardata->statsTuple,
 						  vardata->atttype, vardata->atttypmod,
 						  STATISTIC_KIND_HISTOGRAM, InvalidOid,
 						  &hist_values, &hist_count,
 						  NULL, NULL))
 		return selec;
 	if (hist_count >= min_hist_size)
 	{
 		int			stop = hist_count - n_skip; /* one past last entry used */
 		int			matches = 0;
 		int			idx;
 		for (idx = n_skip; idx < stop; idx++)
 		{
 			Datum		verdict;
 			/* put the histogram entry on the side the variable occupies */
 			if (varonleft)
 				verdict = FunctionCall2(opproc,
 										hist_values[idx],
 										constval);
 			else
 				verdict = FunctionCall2(opproc,
 										constval,
 										hist_values[idx]);
 			if (DatumGetBool(verdict))
 				matches++;
 		}
 		/* stop - n_skip entries were examined, ie, hist_count - 2 * n_skip */
 		selec = ((double) matches) / ((double) (stop - n_skip));
 	}
 	/* the slot was fetched successfully, so release it in either case */
 	free_attstatsslot(vardata->atttype, hist_values, hist_count, NULL, 0);
 	return selec;
 }
 /*
 *	ineq_histogram_selectivity	- Examine the histogram for scalarineqsel
 *
@ -521,12 +595,11 @@ ineq_histogram_selectivity(VariableStatData *vardata,
 	double		hist_selec;
 	Datum	   *values;
 	int			nvalues;
 	int			i;
 	hist_selec = 0.0;
 	/*
-	 * Someday, VACUUM might store more than one histogram per rel/att,
+	 * Someday, ANALYZE might store more than one histogram per rel/att,
 	 * corresponding to more than one possible sort ordering defined for the
 	 * column type.  However, to make that work we will need to figure out
 	 * which staop to search for --- it's not necessarily the one we have at
@ -544,105 +617,107 @@ ineq_histogram_selectivity(VariableStatData *vardata,
 	{
 		if (nvalues > 1)
 		{
-			double		histfrac;
+			/*
-			bool		ltcmp;
+			 * Use binary search to find proper location, ie, the first
 			 * slot at which the comparison fails.  (If the given operator
 			 * isn't actually sort-compatible with the histogram, you'll
 			 * get garbage results ... but probably not any more garbage-y
 			 * than you would from the old linear search.)
 			 */
 			double	histfrac;
 			int		lobound = 0;		/* first possible slot to search */
 			int		hibound = nvalues;	/* last+1 slot to search */
-			ltcmp = DatumGetBool(FunctionCall2(opproc,
+			while (lobound < hibound)
-											   values[0],
+			{
-											   constval));
+				int		probe = (lobound + hibound) / 2;
-			if (isgt)
+				bool	ltcmp;
-				ltcmp = !ltcmp;
+
-			if (!ltcmp)
+				ltcmp = DatumGetBool(FunctionCall2(opproc,
 												   values[probe],
 												   constval));
 				if (isgt)
 					ltcmp = !ltcmp;
 				if (ltcmp)
 					lobound = probe + 1;
 				else
 					hibound = probe;
 			}
 			if (lobound <= 0)
 			{
 				/* Constant is below lower histogram boundary. */
 				histfrac = 0.0;
 			}
 			else if (lobound >= nvalues)
 			{
 				/* Constant is above upper histogram boundary. */
 				histfrac = 1.0;
 			}
 			else
 			{
 				int			i = lobound;
 				double		val,
 							high,
 							low;
 				double		binfrac;
 				/*
-				 * Scan to find proper location.  This could be made faster by
+				 * We have values[i-1] < constant < values[i].
-				 * using a binary-search method, but it's probably not worth
+				 *
-				 * the trouble for typical histogram sizes.
+				 * Convert the constant and the two nearest bin boundary
 				 * values to a uniform comparison scale, and do a linear
 				 * interpolation within this bin.
 				 */
-				for (i = 1; i < nvalues; i++)
+				if (convert_to_scalar(constval, consttype, &val,
 									  values[i - 1], values[i],
 									  vardata->vartype,
 									  &low, &high))
 				{
-					ltcmp = DatumGetBool(FunctionCall2(opproc,
+					if (high <= low)
-													   values[i],
+					{
-													   constval));
+						/* cope if bin boundaries appear identical */
-					if (isgt)
+						binfrac = 0.5;
-						ltcmp = !ltcmp;
+					}
-					if (!ltcmp)
+					else if (val <= low)
-						break;
+						binfrac = 0.0;
-				}
+					else if (val >= high)
-				if (i >= nvalues)
+						binfrac = 1.0;
-				{
+					else
-					/* Constant is above upper histogram boundary. */
+					{
-					histfrac = 1.0;
+						binfrac = (val - low) / (high - low);
 						/*
 						 * Watch out for the possibility that we got a NaN
 						 * or Infinity from the division.  This can happen
 						 * despite the previous checks, if for example
 						 * "low" is -Infinity.
 						 */
 						if (isnan(binfrac) ||
 							binfrac < 0.0 || binfrac > 1.0)
 							binfrac = 0.5;
 					}
 				}
 				else
 				{
 					double		val,
 								high,
 								low;
 					double		binfrac;
 					/*
-					 * We have values[i-1] < constant < values[i].
+					 * Ideally we'd produce an error here, on the grounds
-					 *
+					 * that the given operator shouldn't have scalarXXsel
-					 * Convert the constant and the two nearest bin boundary
+					 * registered as its selectivity func unless we can
-					 * values to a uniform comparison scale, and do a linear
+					 * deal with its operand types.  But currently, all
-					 * interpolation within this bin.
+					 * manner of stuff is invoking scalarXXsel, so give a
 					 * default estimate until that can be fixed.
 					 */
-					if (convert_to_scalar(constval, consttype, &val,
+					binfrac = 0.5;
 										  values[i - 1], values[i],
 										  vardata->vartype,
 										  &low, &high))
 					{
 						if (high <= low)
 						{
 							/* cope if bin boundaries appear identical */
 							binfrac = 0.5;
 						}
 						else if (val <= low)
 							binfrac = 0.0;
 						else if (val >= high)
 							binfrac = 1.0;
 						else
 						{
 							binfrac = (val - low) / (high - low);
 							/*
 							 * Watch out for the possibility that we got a NaN
 							 * or Infinity from the division.  This can happen
 							 * despite the previous checks, if for example
 							 * "low" is -Infinity.
 							 */
 							if (isnan(binfrac) ||
 								binfrac < 0.0 || binfrac > 1.0)
 								binfrac = 0.5;
 						}
 					}
 					else
 					{
 						/*
 						 * Ideally we'd produce an error here, on the grounds
 						 * that the given operator shouldn't have scalarXXsel
 						 * registered as its selectivity func unless we can
 						 * deal with its operand types.  But currently, all
 						 * manner of stuff is invoking scalarXXsel, so give a
 						 * default estimate until that can be fixed.
 						 */
 						binfrac = 0.5;
 					}
 					/*
 					 * Now, compute the overall selectivity across the values
 					 * represented by the histogram.  We have i-1 full bins
 					 * and binfrac partial bin below the constant.
 					 */
 					histfrac = (double) (i - 1) + binfrac;
 					histfrac /= (double) (nvalues - 1);
 				}
 				/*
 				 * Now, compute the overall selectivity across the values
 				 * represented by the histogram.  We have i-1 full bins
 				 * and binfrac partial bin below the constant.
 				 */
 				histfrac = (double) (i - 1) + binfrac;
 				histfrac /= (double) (nvalues - 1);
 			}
 			/*
@ -970,35 +1045,50 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
 	else
 	{
 		/*
-		 * Not exact-match pattern.  We estimate selectivity of the fixed
+		 * Not exact-match pattern.  If we have a sufficiently large
-		 * prefix and remainder of pattern separately, then combine the two
+		 * histogram, estimate selectivity for the histogram part of the
-		 * to get an estimate of the selectivity for the part of the column
+		 * population by counting matches in the histogram.  If not, estimate
-		 * population represented by the histogram.  We then add up data for
+		 * selectivity of the fixed prefix and remainder of pattern
-		 * any most-common-values values; these are not in the histogram
+		 * separately, then combine the two to get an estimate of the
-		 * population, and we can get exact answers for them by applying
+		 * selectivity for the part of the column population represented by
-		 * the pattern operator, so there's no reason to approximate.
+		 * the histogram.  We then add up data for any most-common-values
-		 * (If the MCVs cover a significant part of the total population,
+		 * values; these are not in the histogram population, and we can get
-		 * this gives us a big leg up in accuracy.)
+		 * exact answers for them by applying the pattern operator, so there's
 		 * no reason to approximate.  (If the MCVs cover a significant part of
 		 * the total population, this gives us a big leg up in accuracy.)
 		 */
 		Selectivity prefixsel;
 		Selectivity restsel;
 		Selectivity selec;
 		FmgrInfo	opproc;
 		double		nullfrac,
 					mcv_selec,
 					sumcommon;
-		if (HeapTupleIsValid(vardata.statsTuple))
+		/* Try to use the histogram entries to get selectivity */
-			nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
+		fmgr_info(get_opcode(operator), &opproc);
 		else
 			nullfrac = 0.0;
-		if (pstatus == Pattern_Prefix_Partial)
+		selec = histogram_selectivity(&vardata, &opproc, constval, true,
-			prefixsel = prefix_selectivity(&vardata, opclass, prefix);
+									  100, 1);
 		if (selec < 0)
 		{
 			/* Nope, so fake it with the heuristic method */
 			Selectivity prefixsel;
 			Selectivity restsel;
 			if (pstatus == Pattern_Prefix_Partial)
 				prefixsel = prefix_selectivity(&vardata, opclass, prefix);
 			else
 				prefixsel = 1.0;
 			restsel = pattern_selectivity(rest, ptype);
 			selec = prefixsel * restsel;
 		}
 		else
-			prefixsel = 1.0;
+		{
-		restsel = pattern_selectivity(rest, ptype);
+			/* Yes, but don't believe extremely small or large estimates. */
-		selec = prefixsel * restsel;
+			if (selec < 0.0001)
 				selec = 0.0001;
 			else if (selec > 0.9999)
 				selec = 0.9999;
 		}
 		/*
 		 * If we have most-common-values info, add up the fractions of the MCV
@ -1006,10 +1096,14 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype)
 		 * directly to the result selectivity.  Also add up the total fraction
 		 * represented by MCV entries.
 		 */
 		fmgr_info(get_opcode(operator), &opproc);
 		mcv_selec = mcv_selectivity(&vardata, &opproc, constval, true,
 									&sumcommon);
 		if (HeapTupleIsValid(vardata.statsTuple))
 			nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
 		else
 			nullfrac = 0.0;
 		/*
 		 * Now merge the results from the MCV and histogram calculations,
 		 * realizing that the histogram covers only the non-null values that
@ -1332,7 +1426,7 @@ nulltestsel(PlannerInfo *root, NullTestType nulltesttype,
 	else
 	{
 		/*
-		 * No VACUUM ANALYZE stats available, so make a guess
+		 * No ANALYZE stats available, so make a guess
 		 */
 		switch (nulltesttype)
 		{
--- a/src/include/utils/selfuncs.h
+++ b/src/include/utils/selfuncs.h
@ -8,7 +8,7 @@
 * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.34 2006/07/01 22:07:23 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.35 2006/09/20 19:50:21 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -110,6 +110,9 @@ extern double get_variable_numdistinct(VariableStatData *vardata);
 extern double mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
 							  Datum constval, bool varonleft,
 							  double *sumcommonp);
 /* Fraction of histogram entries satisfying the operator, or -1 if the histogram is missing or smaller than min_hist_size */
 extern double histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
 									Datum constval, bool varonleft,
 									int min_hist_size, int n_skip);
 extern Pattern_Prefix_Status pattern_fixed_prefix(Const *patt,
 					 Pattern_Type ptype,