Compare commits

...

2 Commits

Michael Paquier
036decbba2 pgstattuple: Improve reports generated for indexes (hash, gist, btree)
pgstattuple checks the state of the pages retrieved for gist and hash
indexes using check functions from each index AM, respectively
gistcheckpage() and _hash_checkpage().  These functions fail when they
encounter data they consider incorrect (such as a mismatched opaque
area size, or an empty page), unlike btree, which simply discards such
pages and continues to aggregate data.

Zero pages can appear after a crash, and these AMs are able to clean
them up internally when such pages are seen.  The sporadic failures are
also annoying when running, for example, a large-scale diagnostic query
based on pgstattuple joined with pg_class, as they force one to use
tricks like quals that filter out hash and gist indexes, or a PL
wrapper able to catch the errors.

This commit changes the reports generated for btree, gist and hash to
be more user-friendly (a condensed sketch of the resulting page
classification follows this list):
- When seeing an empty page, report it as free space.  This new rule
applies to gist and hash, and already applied to btree.
- For btree, a check based on the size of BTPageOpaqueData is added.
- For gist indexes, gistcheckpage() is not called anymore, replaced by a
check based on the size of GISTPageOpaqueData.
- For hash indexes, instead of _hash_getbuf_with_strategy(), use a
direct call to ReadBufferExtended(), coupled with a check based on
HashPageOpaqueData.  The opaque area size check was already used.
- Pages that do not match these criteria are discarded from the
generated stats reports.
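
The three per-AM callbacks now classify pages the same way.  The
condensed sketch below is not the committed code (that is in the
pgstattuple.c hunks further down); classify_index_page and its
opaque_size parameter are illustrative stand-ins for the per-AM checks
against BTPageOpaqueData, HashPageOpaqueData and GISTPageOpaqueData,
and the AM-specific inspection of the opaque flags is omitted:

static void
classify_index_page(pgstattuple_type *stat, Page page, Size opaque_size)
{
	if (PageIsNew(page))
	{
		/* fully empty page: count the whole block as free space */
		stat->free_space += BLCKSZ;
	}
	else if (PageGetSpecialSize(page) == MAXALIGN(opaque_size))
	{
		/* plausible page of this AM: aggregate its item pointers */
		pgstat_index_page(stat, page, FirstOffsetNumber,
						  PageGetMaxOffsetNumber(page));
	}
	else
	{
		/* unexpected special-area size: leave the page out of the report */
	}
}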

There have been a couple of bug reports over the years complaining
that the current behavior for hash and gist is not very useful, with
nothing done about it.  Hence this change is backpatched down to v13.

Reported-by: Noah Misch <noah@leadboat.com>
Author: Nitin Motiani <nitinmotiani@google.com>
Reviewed-by: Dilip Kumar <dilipbalaut@gmail.com>
Discussion: https://postgr.es/m/CAH5HC95gT1J3dRYK4qEnaywG8RqjbwDdt04wuj8p39R=HukayA@mail.gmail.com
Backpatch-through: 13
2025-10-02 11:09:12 +09:00
Jacob Champion
c335775908 test_json_parser: Speed up 002_inline.pl
Some macOS machines are having trouble with 002_inline, which executes
the JSON parser test executables hundreds of times in a nested loop.
Both developer machines and buildfarm critters have shown excessive test
durations, upwards of 20 seconds.

Push the innermost loop of 002_inline, which iterates through differing
chunk sizes, down into the test executable. (I'd eventually like to push
all of the JSON unit tests down into C, but this is an easy win in the
short term.) Testers have reported a speedup of 4-9x.
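
As a rough illustration (not the actual test code, which is in the
test_json_parser_incremental.c hunk below), the executable now iterates
over every chunk size itself, from the requested size down to one byte,
and writes a NUL byte to stdout and stderr after each pass so that the
Perl driver can split the combined streams instead of spawning one
process per chunk size.  A minimal standalone C sketch of that output
convention, using the default chunk size of 60 bytes:

#include <stdio.h>

/* Stand-in for one full incremental parse at the given chunk size. */
static void
run_one_chunk_size(size_t chunk_size)
{
	(void) chunk_size;			/* a real run re-parses the input file */
	printf("SUCCESS!\n");
}

int
main(void)
{
	size_t		chunk_size = 60;	/* the test program's default */

	do
	{
		run_one_chunk_size(chunk_size);

		/* NUL separators let the driver split per-iteration output */
		fputc('\0', stdout);
		fputc('\0', stderr);
	} while (--chunk_size > 0);

	return 0;
}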

Reported-by: Robert Haas <robertmhaas@gmail.com>
Suggested-by: Andres Freund <andres@anarazel.de>
Tested-by: Andrew Dunstan <andrew@dunslane.net>
Tested-by: Tom Lane <tgl@sss.pgh.pa.us>
Tested-by: Robert Haas <robertmhaas@gmail.com>
Discussion: https://postgr.es/m/CA%2BTgmobKoG%2BgKzH9qB7uE4MFo-z1hn7UngqAe9b0UqNbn3_XGQ%40mail.gmail.com
Backpatch-through: 17
2025-10-01 09:25:21 -07:00
4 changed files with 134 additions and 65 deletions

contrib/pgstattuple/pgstattuple.c

@@ -422,7 +422,7 @@ pgstat_btree_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno,
/* fully empty page */
stat->free_space += BLCKSZ;
}
else
else if (PageGetSpecialSize(page) == MAXALIGN(sizeof(BTPageOpaqueData)))
{
BTPageOpaque opaque;
@@ -456,10 +456,16 @@ pgstat_hash_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno,
Buffer buf;
Page page;
buf = _hash_getbuf_with_strategy(rel, blkno, HASH_READ, 0, bstrategy);
buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
LockBuffer(buf, HASH_READ);
page = BufferGetPage(buf);
if (PageGetSpecialSize(page) == MAXALIGN(sizeof(HashPageOpaqueData)))
if (PageIsNew(page))
{
/* fully empty page */
stat->free_space += BLCKSZ;
}
else if (PageGetSpecialSize(page) == MAXALIGN(sizeof(HashPageOpaqueData)))
{
HashPageOpaque opaque;
@@ -500,17 +506,23 @@ pgstat_gist_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno,
buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
LockBuffer(buf, GIST_SHARE);
gistcheckpage(rel, buf);
page = BufferGetPage(buf);
if (GistPageIsLeaf(page))
if (PageIsNew(page))
{
pgstat_index_page(stat, page, FirstOffsetNumber,
PageGetMaxOffsetNumber(page));
/* fully empty page */
stat->free_space += BLCKSZ;
}
else
else if (PageGetSpecialSize(page) == MAXALIGN(sizeof(GISTPageOpaqueData)))
{
/* root or node */
if (GistPageIsLeaf(page))
{
pgstat_index_page(stat, page, FirstOffsetNumber,
PageGetMaxOffsetNumber(page));
}
else
{
/* root or node */
}
}
UnlockReleaseBuffer(buf);

src/test/modules/test_json_parser/README

@@ -6,10 +6,12 @@ This module contains two programs for testing the json parsers.
- `test_json_parser_incremental` is for testing the incremental parser, It
reads in a file and passes it in very small chunks (default is 60 bytes at a
time) to the incremental parser. It's not meant to be a speed test but to
test the accuracy of the incremental parser. There are two option arguments,
"-c nn" specifies an alternative chunk size, and "-s" specifies using
semantic routines. The semantic routines re-output the json, although not in
a very pretty form. The required non-option argument is the input file name.
test the accuracy of the incremental parser. The option "-c nn" specifies an
alternative chunk size, "-r nn" runs a range of chunk sizes down to one byte
on the same input (with output separated by null bytes), and "-s" specifies
using semantic routines. The semantic routines re-output the json, although
not in a very pretty form. The required non-option argument is the input file
name.
- `test_json_parser_perf` is for speed testing both the standard
recursive descent parser and the non-recursive incremental
parser. If given the `-i` flag it uses the non-recursive parser,

src/test/modules/test_json_parser/t/002_inline.pl

@@ -33,23 +33,37 @@ sub test
print $fh "$json";
close($fh);
# The -r mode runs the parser in a loop, with output separated by nulls.
# Unpack that as a list of null-terminated ASCII strings (Z*) and check that
# each run produces the same result.
my ($all_stdout, $all_stderr) =
run_command([ $exe, "-r", $chunk, $fname ]);
my @stdout = unpack("(Z*)*", $all_stdout);
my @stderr = unpack("(Z*)*", $all_stderr);
is(scalar @stdout, $chunk, "$name: stdout has correct number of entries");
is(scalar @stderr, $chunk, "$name: stderr has correct number of entries");
my $i = 0;
foreach my $size (reverse(1 .. $chunk))
{
my ($stdout, $stderr) = run_command([ $exe, "-c", $size, $fname ]);
if (defined($params{error}))
{
unlike($stdout, qr/SUCCESS/,
unlike($stdout[$i], qr/SUCCESS/,
"$name, chunk size $size: test fails");
like($stderr, $params{error},
like($stderr[$i], $params{error},
"$name, chunk size $size: correct error output");
}
else
{
like($stdout, qr/SUCCESS/,
like($stdout[$i], qr/SUCCESS/,
"$name, chunk size $size: test succeeds");
is($stderr, "", "$name, chunk size $size: no error output");
is($stderr[$i], "", "$name, chunk size $size: no error output");
}
$i++;
}
}

src/test/modules/test_json_parser/test_json_parser_incremental.c

@@ -12,9 +12,14 @@
* the parser in very small chunks. In practice you would normally use
* much larger chunks, but doing this makes it more likely that the
* full range of increment handling, especially in the lexer, is exercised.
*
* If the "-c SIZE" option is provided, that chunk size is used instead
* of the default of 60.
*
* If the "-r SIZE" option is provided, a range of chunk sizes from SIZE down to
* 1 are run sequentially. A null byte is printed to the streams after each
* iteration.
*
* If the -s flag is given, the program does semantic processing. This should
* just mirror back the json, albeit with white space changes.
*
@@ -82,20 +87,24 @@ main(int argc, char **argv)
StringInfoData json;
int n_read;
size_t chunk_size = DEFAULT_CHUNK_SIZE;
bool run_chunk_ranges = false;
struct stat statbuf;
off_t bytes_left;
JsonSemAction *testsem = &nullSemAction;
char *testfile;
int c;
bool need_strings = false;
int ret = 0;
pg_logging_init(argv[0]);
while ((c = getopt(argc, argv, "c:s")) != -1)
while ((c = getopt(argc, argv, "r:c:s")) != -1)
{
switch (c)
{
case 'c': /* chunksize */
case 'r': /* chunk range */
run_chunk_ranges = true;
/* fall through */
case 'c': /* chunk size */
chunk_size = strtou64(optarg, NULL, 10);
if (chunk_size > BUFSIZE)
pg_fatal("chunk size cannot exceed %d", BUFSIZE);
@@ -121,7 +130,6 @@ main(int argc, char **argv)
exit(1);
}
makeJsonLexContextIncremental(&lex, PG_UTF8, need_strings);
initStringInfo(&json);
if ((json_file = fopen(testfile, PG_BINARY_R)) == NULL)
@@ -130,57 +138,90 @@
if (fstat(fileno(json_file), &statbuf) != 0)
pg_fatal("error statting input: %m");
bytes_left = statbuf.st_size;
for (;;)
do
{
/* We will break when there's nothing left to read */
/*
* This outer loop only repeats in -r mode. Reset the parse state and
* our position in the input file for the inner loop, which performs
* the incremental parsing.
*/
off_t bytes_left = statbuf.st_size;
size_t to_read = chunk_size;
if (bytes_left < chunk_size)
chunk_size = bytes_left;
makeJsonLexContextIncremental(&lex, PG_UTF8, need_strings);
n_read = fread(buff, 1, chunk_size, json_file);
if (n_read < chunk_size)
pg_fatal("error reading input file: %d", ferror(json_file));
rewind(json_file);
resetStringInfo(&json);
appendBinaryStringInfo(&json, buff, n_read);
for (;;)
{
/* We will break when there's nothing left to read */
if (bytes_left < to_read)
to_read = bytes_left;
n_read = fread(buff, 1, to_read, json_file);
if (n_read < to_read)
pg_fatal("error reading input file: %d", ferror(json_file));
appendBinaryStringInfo(&json, buff, n_read);
/*
* Append some trailing junk to the buffer passed to the parser.
* This helps us ensure that the parser does the right thing even
* if the chunk isn't terminated with a '\0'.
*/
appendStringInfoString(&json, "1+23 trailing junk");
bytes_left -= n_read;
if (bytes_left > 0)
{
result = pg_parse_json_incremental(&lex, testsem,
json.data, n_read,
false);
if (result != JSON_INCOMPLETE)
{
fprintf(stderr, "%s\n", json_errdetail(result, &lex));
ret = 1;
goto cleanup;
}
resetStringInfo(&json);
}
else
{
result = pg_parse_json_incremental(&lex, testsem,
json.data, n_read,
true);
if (result != JSON_SUCCESS)
{
fprintf(stderr, "%s\n", json_errdetail(result, &lex));
ret = 1;
goto cleanup;
}
if (!need_strings)
printf("SUCCESS!\n");
break;
}
}
cleanup:
freeJsonLexContext(&lex);
/*
* Append some trailing junk to the buffer passed to the parser. This
* helps us ensure that the parser does the right thing even if the
* chunk isn't terminated with a '\0'.
* In -r mode, separate output with nulls so that the calling test can
* split it up, decrement the chunk size, and loop back to the top.
* All other modes immediately fall out of the loop and exit.
*/
appendStringInfoString(&json, "1+23 trailing junk");
bytes_left -= n_read;
if (bytes_left > 0)
if (run_chunk_ranges)
{
result = pg_parse_json_incremental(&lex, testsem,
json.data, n_read,
false);
if (result != JSON_INCOMPLETE)
{
fprintf(stderr, "%s\n", json_errdetail(result, &lex));
exit(1);
}
resetStringInfo(&json);
fputc('\0', stdout);
fputc('\0', stderr);
}
else
{
result = pg_parse_json_incremental(&lex, testsem,
json.data, n_read,
true);
if (result != JSON_SUCCESS)
{
fprintf(stderr, "%s\n", json_errdetail(result, &lex));
exit(1);
}
if (!need_strings)
printf("SUCCESS!\n");
break;
}
}
} while (run_chunk_ranges && (--chunk_size > 0));
fclose(json_file);
exit(0);
free(json.data);
return ret;
}
/*