mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-26 00:02:18 -04:00 
			
		
		
		
	
		
			
				
	
	
		
			944 lines
		
	
	
		
			27 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			944 lines
		
	
	
		
			27 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*-------------------------------------------------------------------------
 | |
|  *
 | |
|  * pg_visibility.c
 | |
|  *	  display visibility map information and page-level visibility bits
 | |
|  *
 | |
|  * Copyright (c) 2016-2025, PostgreSQL Global Development Group
 | |
|  *
 | |
|  *	  contrib/pg_visibility/pg_visibility.c
 | |
|  *-------------------------------------------------------------------------
 | |
|  */
 | |
| #include "postgres.h"
 | |
| 
 | |
| #include "access/heapam.h"
 | |
| #include "access/htup_details.h"
 | |
| #include "access/visibilitymap.h"
 | |
| #include "access/xloginsert.h"
 | |
| #include "catalog/pg_type.h"
 | |
| #include "catalog/storage_xlog.h"
 | |
| #include "funcapi.h"
 | |
| #include "miscadmin.h"
 | |
| #include "storage/bufmgr.h"
 | |
| #include "storage/proc.h"
 | |
| #include "storage/procarray.h"
 | |
| #include "storage/read_stream.h"
 | |
| #include "storage/smgr.h"
 | |
| #include "utils/rel.h"
 | |
| 
 | |
| PG_MODULE_MAGIC;
 | |
| 
 | |
| typedef struct vbits
 | |
| {
 | |
| 	BlockNumber next;
 | |
| 	BlockNumber count;
 | |
| 	uint8		bits[FLEXIBLE_ARRAY_MEMBER];
 | |
| } vbits;
 | |
| 
 | |
| typedef struct corrupt_items
 | |
| {
 | |
| 	BlockNumber next;
 | |
| 	BlockNumber count;
 | |
| 	ItemPointer tids;
 | |
| } corrupt_items;
 | |
| 
 | |
| /* for collect_corrupt_items_read_stream_next_block */
 | |
| struct collect_corrupt_items_read_stream_private
 | |
| {
 | |
| 	bool		all_frozen;
 | |
| 	bool		all_visible;
 | |
| 	BlockNumber current_blocknum;
 | |
| 	BlockNumber last_exclusive;
 | |
| 	Relation	rel;
 | |
| 	Buffer		vmbuffer;
 | |
| };
 | |
| 
 | |
| PG_FUNCTION_INFO_V1(pg_visibility_map);
 | |
| PG_FUNCTION_INFO_V1(pg_visibility_map_rel);
 | |
| PG_FUNCTION_INFO_V1(pg_visibility);
 | |
| PG_FUNCTION_INFO_V1(pg_visibility_rel);
 | |
| PG_FUNCTION_INFO_V1(pg_visibility_map_summary);
 | |
| PG_FUNCTION_INFO_V1(pg_check_frozen);
 | |
| PG_FUNCTION_INFO_V1(pg_check_visible);
 | |
| PG_FUNCTION_INFO_V1(pg_truncate_visibility_map);
 | |
| 
 | |
| static TupleDesc pg_visibility_tupdesc(bool include_blkno, bool include_pd);
 | |
| static vbits *collect_visibility_data(Oid relid, bool include_pd);
 | |
| static corrupt_items *collect_corrupt_items(Oid relid, bool all_visible,
 | |
| 											bool all_frozen);
 | |
| static void record_corrupt_item(corrupt_items *items, ItemPointer tid);
 | |
| static bool tuple_all_visible(HeapTuple tup, TransactionId OldestXmin,
 | |
| 							  Buffer buffer);
 | |
| static void check_relation_relkind(Relation rel);
 | |
| 
 | |
| /*
 | |
|  * Visibility map information for a single block of a relation.
 | |
|  *
 | |
|  * Note: the VM code will silently return zeroes for pages past the end
 | |
|  * of the map, so we allow probes up to MaxBlockNumber regardless of the
 | |
|  * actual relation size.
 | |
|  */
 | |
| Datum
 | |
| pg_visibility_map(PG_FUNCTION_ARGS)
 | |
| {
 | |
| 	Oid			relid = PG_GETARG_OID(0);
 | |
| 	int64		blkno = PG_GETARG_INT64(1);
 | |
| 	int32		mapbits;
 | |
| 	Relation	rel;
 | |
| 	Buffer		vmbuffer = InvalidBuffer;
 | |
| 	TupleDesc	tupdesc;
 | |
| 	Datum		values[2];
 | |
| 	bool		nulls[2] = {0};
 | |
| 
 | |
| 	rel = relation_open(relid, AccessShareLock);
 | |
| 
 | |
| 	/* Only some relkinds have a visibility map */
 | |
| 	check_relation_relkind(rel);
 | |
| 
 | |
| 	if (blkno < 0 || blkno > MaxBlockNumber)
 | |
| 		ereport(ERROR,
 | |
| 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 | |
| 				 errmsg("invalid block number")));
 | |
| 
 | |
| 	tupdesc = pg_visibility_tupdesc(false, false);
 | |
| 
 | |
| 	mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
 | |
| 	if (vmbuffer != InvalidBuffer)
 | |
| 		ReleaseBuffer(vmbuffer);
 | |
| 	values[0] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0);
 | |
| 	values[1] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0);
 | |
| 
 | |
| 	relation_close(rel, AccessShareLock);
 | |
| 
 | |
| 	PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Visibility map information for a single block of a relation, plus the
 | |
|  * page-level information for the same block.
 | |
|  */
 | |
| Datum
 | |
| pg_visibility(PG_FUNCTION_ARGS)
 | |
| {
 | |
| 	Oid			relid = PG_GETARG_OID(0);
 | |
| 	int64		blkno = PG_GETARG_INT64(1);
 | |
| 	int32		mapbits;
 | |
| 	Relation	rel;
 | |
| 	Buffer		vmbuffer = InvalidBuffer;
 | |
| 	Buffer		buffer;
 | |
| 	Page		page;
 | |
| 	TupleDesc	tupdesc;
 | |
| 	Datum		values[3];
 | |
| 	bool		nulls[3] = {0};
 | |
| 
 | |
| 	rel = relation_open(relid, AccessShareLock);
 | |
| 
 | |
| 	/* Only some relkinds have a visibility map */
 | |
| 	check_relation_relkind(rel);
 | |
| 
 | |
| 	if (blkno < 0 || blkno > MaxBlockNumber)
 | |
| 		ereport(ERROR,
 | |
| 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 | |
| 				 errmsg("invalid block number")));
 | |
| 
 | |
| 	tupdesc = pg_visibility_tupdesc(false, true);
 | |
| 
 | |
| 	mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
 | |
| 	if (vmbuffer != InvalidBuffer)
 | |
| 		ReleaseBuffer(vmbuffer);
 | |
| 	values[0] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0);
 | |
| 	values[1] = BoolGetDatum((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0);
 | |
| 
 | |
| 	/* Here we have to explicitly check rel size ... */
 | |
| 	if (blkno < RelationGetNumberOfBlocks(rel))
 | |
| 	{
 | |
| 		buffer = ReadBuffer(rel, blkno);
 | |
| 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
 | |
| 
 | |
| 		page = BufferGetPage(buffer);
 | |
| 		values[2] = BoolGetDatum(PageIsAllVisible(page));
 | |
| 
 | |
| 		UnlockReleaseBuffer(buffer);
 | |
| 	}
 | |
| 	else
 | |
| 	{
 | |
| 		/* As with the vismap, silently return 0 for pages past EOF */
 | |
| 		values[2] = BoolGetDatum(false);
 | |
| 	}
 | |
| 
 | |
| 	relation_close(rel, AccessShareLock);
 | |
| 
 | |
| 	PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Visibility map information for every block in a relation.
 | |
|  */
 | |
| Datum
 | |
| pg_visibility_map_rel(PG_FUNCTION_ARGS)
 | |
| {
 | |
| 	FuncCallContext *funcctx;
 | |
| 	vbits	   *info;
 | |
| 
 | |
| 	if (SRF_IS_FIRSTCALL())
 | |
| 	{
 | |
| 		Oid			relid = PG_GETARG_OID(0);
 | |
| 		MemoryContext oldcontext;
 | |
| 
 | |
| 		funcctx = SRF_FIRSTCALL_INIT();
 | |
| 		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
 | |
| 		funcctx->tuple_desc = pg_visibility_tupdesc(true, false);
 | |
| 		/* collect_visibility_data will verify the relkind */
 | |
| 		funcctx->user_fctx = collect_visibility_data(relid, false);
 | |
| 		MemoryContextSwitchTo(oldcontext);
 | |
| 	}
 | |
| 
 | |
| 	funcctx = SRF_PERCALL_SETUP();
 | |
| 	info = (vbits *) funcctx->user_fctx;
 | |
| 
 | |
| 	if (info->next < info->count)
 | |
| 	{
 | |
| 		Datum		values[3];
 | |
| 		bool		nulls[3] = {0};
 | |
| 		HeapTuple	tuple;
 | |
| 
 | |
| 		values[0] = Int64GetDatum(info->next);
 | |
| 		values[1] = BoolGetDatum((info->bits[info->next] & (1 << 0)) != 0);
 | |
| 		values[2] = BoolGetDatum((info->bits[info->next] & (1 << 1)) != 0);
 | |
| 		info->next++;
 | |
| 
 | |
| 		tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
 | |
| 		SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
 | |
| 	}
 | |
| 
 | |
| 	SRF_RETURN_DONE(funcctx);
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Visibility map information for every block in a relation, plus the page
 | |
|  * level information for each block.
 | |
|  */
 | |
| Datum
 | |
| pg_visibility_rel(PG_FUNCTION_ARGS)
 | |
| {
 | |
| 	FuncCallContext *funcctx;
 | |
| 	vbits	   *info;
 | |
| 
 | |
| 	if (SRF_IS_FIRSTCALL())
 | |
| 	{
 | |
| 		Oid			relid = PG_GETARG_OID(0);
 | |
| 		MemoryContext oldcontext;
 | |
| 
 | |
| 		funcctx = SRF_FIRSTCALL_INIT();
 | |
| 		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
 | |
| 		funcctx->tuple_desc = pg_visibility_tupdesc(true, true);
 | |
| 		/* collect_visibility_data will verify the relkind */
 | |
| 		funcctx->user_fctx = collect_visibility_data(relid, true);
 | |
| 		MemoryContextSwitchTo(oldcontext);
 | |
| 	}
 | |
| 
 | |
| 	funcctx = SRF_PERCALL_SETUP();
 | |
| 	info = (vbits *) funcctx->user_fctx;
 | |
| 
 | |
| 	if (info->next < info->count)
 | |
| 	{
 | |
| 		Datum		values[4];
 | |
| 		bool		nulls[4] = {0};
 | |
| 		HeapTuple	tuple;
 | |
| 
 | |
| 		values[0] = Int64GetDatum(info->next);
 | |
| 		values[1] = BoolGetDatum((info->bits[info->next] & (1 << 0)) != 0);
 | |
| 		values[2] = BoolGetDatum((info->bits[info->next] & (1 << 1)) != 0);
 | |
| 		values[3] = BoolGetDatum((info->bits[info->next] & (1 << 2)) != 0);
 | |
| 		info->next++;
 | |
| 
 | |
| 		tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
 | |
| 		SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
 | |
| 	}
 | |
| 
 | |
| 	SRF_RETURN_DONE(funcctx);
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Count the number of all-visible and all-frozen pages in the visibility
 | |
|  * map for a particular relation.
 | |
|  */
 | |
| Datum
 | |
| pg_visibility_map_summary(PG_FUNCTION_ARGS)
 | |
| {
 | |
| 	Oid			relid = PG_GETARG_OID(0);
 | |
| 	Relation	rel;
 | |
| 	BlockNumber nblocks;
 | |
| 	BlockNumber blkno;
 | |
| 	Buffer		vmbuffer = InvalidBuffer;
 | |
| 	int64		all_visible = 0;
 | |
| 	int64		all_frozen = 0;
 | |
| 	TupleDesc	tupdesc;
 | |
| 	Datum		values[2];
 | |
| 	bool		nulls[2] = {0};
 | |
| 
 | |
| 	rel = relation_open(relid, AccessShareLock);
 | |
| 
 | |
| 	/* Only some relkinds have a visibility map */
 | |
| 	check_relation_relkind(rel);
 | |
| 
 | |
| 	nblocks = RelationGetNumberOfBlocks(rel);
 | |
| 
 | |
| 	for (blkno = 0; blkno < nblocks; ++blkno)
 | |
| 	{
 | |
| 		int32		mapbits;
 | |
| 
 | |
| 		/* Make sure we are interruptible. */
 | |
| 		CHECK_FOR_INTERRUPTS();
 | |
| 
 | |
| 		/* Get map info. */
 | |
| 		mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
 | |
| 		if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0)
 | |
| 			++all_visible;
 | |
| 		if ((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0)
 | |
| 			++all_frozen;
 | |
| 	}
 | |
| 
 | |
| 	/* Clean up. */
 | |
| 	if (vmbuffer != InvalidBuffer)
 | |
| 		ReleaseBuffer(vmbuffer);
 | |
| 	relation_close(rel, AccessShareLock);
 | |
| 
 | |
| 	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
 | |
| 		elog(ERROR, "return type must be a row type");
 | |
| 
 | |
| 	values[0] = Int64GetDatum(all_visible);
 | |
| 	values[1] = Int64GetDatum(all_frozen);
 | |
| 
 | |
| 	PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Return the TIDs of non-frozen tuples present in pages marked all-frozen
 | |
|  * in the visibility map.  We hope no one will ever find any, but there could
 | |
|  * be bugs, database corruption, etc.
 | |
|  */
 | |
| Datum
 | |
| pg_check_frozen(PG_FUNCTION_ARGS)
 | |
| {
 | |
| 	FuncCallContext *funcctx;
 | |
| 	corrupt_items *items;
 | |
| 
 | |
| 	if (SRF_IS_FIRSTCALL())
 | |
| 	{
 | |
| 		Oid			relid = PG_GETARG_OID(0);
 | |
| 		MemoryContext oldcontext;
 | |
| 
 | |
| 		funcctx = SRF_FIRSTCALL_INIT();
 | |
| 		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
 | |
| 		/* collect_corrupt_items will verify the relkind */
 | |
| 		funcctx->user_fctx = collect_corrupt_items(relid, false, true);
 | |
| 		MemoryContextSwitchTo(oldcontext);
 | |
| 	}
 | |
| 
 | |
| 	funcctx = SRF_PERCALL_SETUP();
 | |
| 	items = (corrupt_items *) funcctx->user_fctx;
 | |
| 
 | |
| 	if (items->next < items->count)
 | |
| 		SRF_RETURN_NEXT(funcctx, PointerGetDatum(&items->tids[items->next++]));
 | |
| 
 | |
| 	SRF_RETURN_DONE(funcctx);
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Return the TIDs of not-all-visible tuples in pages marked all-visible
 | |
|  * in the visibility map.  We hope no one will ever find any, but there could
 | |
|  * be bugs, database corruption, etc.
 | |
|  */
 | |
| Datum
 | |
| pg_check_visible(PG_FUNCTION_ARGS)
 | |
| {
 | |
| 	FuncCallContext *funcctx;
 | |
| 	corrupt_items *items;
 | |
| 
 | |
| 	if (SRF_IS_FIRSTCALL())
 | |
| 	{
 | |
| 		Oid			relid = PG_GETARG_OID(0);
 | |
| 		MemoryContext oldcontext;
 | |
| 
 | |
| 		funcctx = SRF_FIRSTCALL_INIT();
 | |
| 		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
 | |
| 		/* collect_corrupt_items will verify the relkind */
 | |
| 		funcctx->user_fctx = collect_corrupt_items(relid, true, false);
 | |
| 		MemoryContextSwitchTo(oldcontext);
 | |
| 	}
 | |
| 
 | |
| 	funcctx = SRF_PERCALL_SETUP();
 | |
| 	items = (corrupt_items *) funcctx->user_fctx;
 | |
| 
 | |
| 	if (items->next < items->count)
 | |
| 		SRF_RETURN_NEXT(funcctx, PointerGetDatum(&items->tids[items->next++]));
 | |
| 
 | |
| 	SRF_RETURN_DONE(funcctx);
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Remove the visibility map fork for a relation.  If there turn out to be
 | |
|  * any bugs in the visibility map code that require rebuilding the VM, this
 | |
|  * provides users with a way to do it that is cleaner than shutting down the
 | |
|  * server and removing files by hand.
 | |
|  *
 | |
|  * This is a cut-down version of RelationTruncate.
 | |
|  */
 | |
| Datum
 | |
| pg_truncate_visibility_map(PG_FUNCTION_ARGS)
 | |
| {
 | |
| 	Oid			relid = PG_GETARG_OID(0);
 | |
| 	Relation	rel;
 | |
| 	ForkNumber	fork;
 | |
| 	BlockNumber block;
 | |
| 	BlockNumber old_block;
 | |
| 
 | |
| 	rel = relation_open(relid, AccessExclusiveLock);
 | |
| 
 | |
| 	/* Only some relkinds have a visibility map */
 | |
| 	check_relation_relkind(rel);
 | |
| 
 | |
| 	/* Forcibly reset cached file size */
 | |
| 	RelationGetSmgr(rel)->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = InvalidBlockNumber;
 | |
| 
 | |
| 	/* Compute new and old size before entering critical section. */
 | |
| 	fork = VISIBILITYMAP_FORKNUM;
 | |
| 	block = visibilitymap_prepare_truncate(rel, 0);
 | |
| 	old_block = BlockNumberIsValid(block) ? smgrnblocks(RelationGetSmgr(rel), fork) : 0;
 | |
| 
 | |
| 	/*
 | |
| 	 * WAL-logging, buffer dropping, file truncation must be atomic and all on
 | |
| 	 * one side of a checkpoint.  See RelationTruncate() for discussion.
 | |
| 	 */
 | |
| 	Assert((MyProc->delayChkptFlags & (DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE)) == 0);
 | |
| 	MyProc->delayChkptFlags |= DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE;
 | |
| 	START_CRIT_SECTION();
 | |
| 
 | |
| 	if (RelationNeedsWAL(rel))
 | |
| 	{
 | |
| 		XLogRecPtr	lsn;
 | |
| 		xl_smgr_truncate xlrec;
 | |
| 
 | |
| 		xlrec.blkno = 0;
 | |
| 		xlrec.rlocator = rel->rd_locator;
 | |
| 		xlrec.flags = SMGR_TRUNCATE_VM;
 | |
| 
 | |
| 		XLogBeginInsert();
 | |
| 		XLogRegisterData((char *) &xlrec, sizeof(xlrec));
 | |
| 
 | |
| 		lsn = XLogInsert(RM_SMGR_ID,
 | |
| 						 XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
 | |
| 		XLogFlush(lsn);
 | |
| 	}
 | |
| 
 | |
| 	if (BlockNumberIsValid(block))
 | |
| 		smgrtruncate(RelationGetSmgr(rel), &fork, 1, &old_block, &block);
 | |
| 
 | |
| 	END_CRIT_SECTION();
 | |
| 	MyProc->delayChkptFlags &= ~(DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE);
 | |
| 
 | |
| 	/*
 | |
| 	 * Release the lock right away, not at commit time.
 | |
| 	 *
 | |
| 	 * It would be a problem to release the lock prior to commit if this
 | |
| 	 * truncate operation sends any transactional invalidation messages. Other
 | |
| 	 * backends would potentially be able to lock the relation without
 | |
| 	 * processing them in the window of time between when we release the lock
 | |
| 	 * here and when we sent the messages at our eventual commit.  However,
 | |
| 	 * we're currently only sending a non-transactional smgr invalidation,
 | |
| 	 * which will have been posted to shared memory immediately from within
 | |
| 	 * smgr_truncate.  Therefore, there should be no race here.
 | |
| 	 *
 | |
| 	 * The reason why it's desirable to release the lock early here is because
 | |
| 	 * of the possibility that someone will need to use this to blow away many
 | |
| 	 * visibility map forks at once.  If we can't release the lock until
 | |
| 	 * commit time, the transaction doing this will accumulate
 | |
| 	 * AccessExclusiveLocks on all of those relations at the same time, which
 | |
| 	 * is undesirable. However, if this turns out to be unsafe we may have no
 | |
| 	 * choice...
 | |
| 	 */
 | |
| 	relation_close(rel, AccessExclusiveLock);
 | |
| 
 | |
| 	/* Nothing to return. */
 | |
| 	PG_RETURN_VOID();
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Helper function to construct whichever TupleDesc we need for a particular
 | |
|  * call.
 | |
|  */
 | |
| static TupleDesc
 | |
| pg_visibility_tupdesc(bool include_blkno, bool include_pd)
 | |
| {
 | |
| 	TupleDesc	tupdesc;
 | |
| 	AttrNumber	maxattr = 2;
 | |
| 	AttrNumber	a = 0;
 | |
| 
 | |
| 	if (include_blkno)
 | |
| 		++maxattr;
 | |
| 	if (include_pd)
 | |
| 		++maxattr;
 | |
| 	tupdesc = CreateTemplateTupleDesc(maxattr);
 | |
| 	if (include_blkno)
 | |
| 		TupleDescInitEntry(tupdesc, ++a, "blkno", INT8OID, -1, 0);
 | |
| 	TupleDescInitEntry(tupdesc, ++a, "all_visible", BOOLOID, -1, 0);
 | |
| 	TupleDescInitEntry(tupdesc, ++a, "all_frozen", BOOLOID, -1, 0);
 | |
| 	if (include_pd)
 | |
| 		TupleDescInitEntry(tupdesc, ++a, "pd_all_visible", BOOLOID, -1, 0);
 | |
| 	Assert(a == maxattr);
 | |
| 
 | |
| 	return BlessTupleDesc(tupdesc);
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Collect visibility data about a relation.
 | |
|  *
 | |
|  * Checks relkind of relid and will throw an error if the relation does not
 | |
|  * have a VM.
 | |
|  */
 | |
| static vbits *
 | |
| collect_visibility_data(Oid relid, bool include_pd)
 | |
| {
 | |
| 	Relation	rel;
 | |
| 	BlockNumber nblocks;
 | |
| 	vbits	   *info;
 | |
| 	BlockNumber blkno;
 | |
| 	Buffer		vmbuffer = InvalidBuffer;
 | |
| 	BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
 | |
| 	BlockRangeReadStreamPrivate p;
 | |
| 	ReadStream *stream = NULL;
 | |
| 
 | |
| 	rel = relation_open(relid, AccessShareLock);
 | |
| 
 | |
| 	/* Only some relkinds have a visibility map */
 | |
| 	check_relation_relkind(rel);
 | |
| 
 | |
| 	nblocks = RelationGetNumberOfBlocks(rel);
 | |
| 	info = palloc0(offsetof(vbits, bits) + nblocks);
 | |
| 	info->next = 0;
 | |
| 	info->count = nblocks;
 | |
| 
 | |
| 	/* Create a stream if reading main fork. */
 | |
| 	if (include_pd)
 | |
| 	{
 | |
| 		p.current_blocknum = 0;
 | |
| 		p.last_exclusive = nblocks;
 | |
| 		stream = read_stream_begin_relation(READ_STREAM_FULL,
 | |
| 											bstrategy,
 | |
| 											rel,
 | |
| 											MAIN_FORKNUM,
 | |
| 											block_range_read_stream_cb,
 | |
| 											&p,
 | |
| 											0);
 | |
| 	}
 | |
| 
 | |
| 	for (blkno = 0; blkno < nblocks; ++blkno)
 | |
| 	{
 | |
| 		int32		mapbits;
 | |
| 
 | |
| 		/* Make sure we are interruptible. */
 | |
| 		CHECK_FOR_INTERRUPTS();
 | |
| 
 | |
| 		/* Get map info. */
 | |
| 		mapbits = (int32) visibilitymap_get_status(rel, blkno, &vmbuffer);
 | |
| 		if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) != 0)
 | |
| 			info->bits[blkno] |= (1 << 0);
 | |
| 		if ((mapbits & VISIBILITYMAP_ALL_FROZEN) != 0)
 | |
| 			info->bits[blkno] |= (1 << 1);
 | |
| 
 | |
| 		/*
 | |
| 		 * Page-level data requires reading every block, so only get it if the
 | |
| 		 * caller needs it.  Use a buffer access strategy, too, to prevent
 | |
| 		 * cache-trashing.
 | |
| 		 */
 | |
| 		if (include_pd)
 | |
| 		{
 | |
| 			Buffer		buffer;
 | |
| 			Page		page;
 | |
| 
 | |
| 			buffer = read_stream_next_buffer(stream, NULL);
 | |
| 			LockBuffer(buffer, BUFFER_LOCK_SHARE);
 | |
| 
 | |
| 			page = BufferGetPage(buffer);
 | |
| 			if (PageIsAllVisible(page))
 | |
| 				info->bits[blkno] |= (1 << 2);
 | |
| 
 | |
| 			UnlockReleaseBuffer(buffer);
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	if (include_pd)
 | |
| 	{
 | |
| 		Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
 | |
| 		read_stream_end(stream);
 | |
| 	}
 | |
| 
 | |
| 	/* Clean up. */
 | |
| 	if (vmbuffer != InvalidBuffer)
 | |
| 		ReleaseBuffer(vmbuffer);
 | |
| 	relation_close(rel, AccessShareLock);
 | |
| 
 | |
| 	return info;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * The "strict" version of GetOldestNonRemovableTransactionId().  The
 | |
|  * pg_visibility check can tolerate false positives (don't report some of the
 | |
|  * errors), but can't tolerate false negatives (report false errors). Normally,
 | |
|  * horizons move forwards, but there are cases when it could move backward
 | |
|  * (see comment for ComputeXidHorizons()).
 | |
|  *
 | |
|  * This is why we have to implement our own function for xid horizon, which
 | |
|  * would be guaranteed to be newer or equal to any xid horizon computed before.
 | |
|  * We have to do the following to achieve this.
 | |
|  *
 | |
|  * 1. Ignore processes xmin's, because they consider connection to other
 | |
|  *    databases that were ignored before.
 | |
|  * 2. Ignore KnownAssignedXids, as they are not database-aware. Although we
 | |
|  *    now perform minimal checking on a standby by always using nextXid, this
 | |
|  *    approach is better than nothing and will at least catch extremely broken
 | |
|  *    cases where a xid is in the future.
 | |
|  * 3. Ignore walsender xmin, because it could go backward if some replication
 | |
|  *    connections don't use replication slots.
 | |
|  *
 | |
|  * While it might seem like we could use KnownAssignedXids for shared
 | |
|  * catalogs, since shared catalogs rely on a global horizon rather than a
 | |
|  * database-specific one - there are potential edge cases.  For example, a
 | |
|  * transaction may crash on the primary without writing a commit/abort record.
 | |
|  * This would lead to a situation where it appears to still be running on the
 | |
|  * standby, even though it has already ended on the primary.  For this reason,
 | |
|  * it's safer to ignore KnownAssignedXids, even for shared catalogs.
 | |
|  *
 | |
|  * As a result, we're using only currently running xids to compute the horizon.
 | |
|  * Surely these would significantly sacrifice accuracy.  But we have to do so
 | |
|  * to avoid reporting false errors.
 | |
|  */
 | |
| static TransactionId
 | |
| GetStrictOldestNonRemovableTransactionId(Relation rel)
 | |
| {
 | |
| 	RunningTransactions runningTransactions;
 | |
| 
 | |
| 	if (RecoveryInProgress())
 | |
| 	{
 | |
| 		TransactionId result;
 | |
| 
 | |
| 		/* As we ignore KnownAssignedXids on standby, just pick nextXid */
 | |
| 		LWLockAcquire(XidGenLock, LW_SHARED);
 | |
| 		result = XidFromFullTransactionId(TransamVariables->nextXid);
 | |
| 		LWLockRelease(XidGenLock);
 | |
| 		return result;
 | |
| 	}
 | |
| 	else if (rel == NULL || rel->rd_rel->relisshared)
 | |
| 	{
 | |
| 		/* Shared relation: take into account all running xids */
 | |
| 		runningTransactions = GetRunningTransactionData();
 | |
| 		LWLockRelease(ProcArrayLock);
 | |
| 		LWLockRelease(XidGenLock);
 | |
| 		return runningTransactions->oldestRunningXid;
 | |
| 	}
 | |
| 	else if (!RELATION_IS_LOCAL(rel))
 | |
| 	{
 | |
| 		/*
 | |
| 		 * Normal relation: take into account xids running within the current
 | |
| 		 * database
 | |
| 		 */
 | |
| 		runningTransactions = GetRunningTransactionData();
 | |
| 		LWLockRelease(ProcArrayLock);
 | |
| 		LWLockRelease(XidGenLock);
 | |
| 		return runningTransactions->oldestDatabaseRunningXid;
 | |
| 	}
 | |
| 	else
 | |
| 	{
 | |
| 		/*
 | |
| 		 * For temporary relations, ComputeXidHorizons() uses only
 | |
| 		 * TransamVariables->latestCompletedXid and MyProc->xid.  These two
 | |
| 		 * shouldn't go backwards.  So we're fine with this horizon.
 | |
| 		 */
 | |
| 		return GetOldestNonRemovableTransactionId(rel);
 | |
| 	}
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Callback function to get next block for read stream object used in
 | |
|  * collect_corrupt_items() function.
 | |
|  */
 | |
| static BlockNumber
 | |
| collect_corrupt_items_read_stream_next_block(ReadStream *stream,
 | |
| 											 void *callback_private_data,
 | |
| 											 void *per_buffer_data)
 | |
| {
 | |
| 	struct collect_corrupt_items_read_stream_private *p = callback_private_data;
 | |
| 
 | |
| 	for (; p->current_blocknum < p->last_exclusive; p->current_blocknum++)
 | |
| 	{
 | |
| 		bool		check_frozen = false;
 | |
| 		bool		check_visible = false;
 | |
| 
 | |
| 		/* Make sure we are interruptible. */
 | |
| 		CHECK_FOR_INTERRUPTS();
 | |
| 
 | |
| 		if (p->all_frozen && VM_ALL_FROZEN(p->rel, p->current_blocknum, &p->vmbuffer))
 | |
| 			check_frozen = true;
 | |
| 		if (p->all_visible && VM_ALL_VISIBLE(p->rel, p->current_blocknum, &p->vmbuffer))
 | |
| 			check_visible = true;
 | |
| 		if (!check_visible && !check_frozen)
 | |
| 			continue;
 | |
| 
 | |
| 		return p->current_blocknum++;
 | |
| 	}
 | |
| 
 | |
| 	return InvalidBlockNumber;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Returns a list of items whose visibility map information does not match
 | |
|  * the status of the tuples on the page.
 | |
|  *
 | |
|  * If all_visible is passed as true, this will include all items which are
 | |
|  * on pages marked as all-visible in the visibility map but which do not
 | |
|  * seem to in fact be all-visible.
 | |
|  *
 | |
|  * If all_frozen is passed as true, this will include all items which are
 | |
|  * on pages marked as all-frozen but which do not seem to in fact be frozen.
 | |
|  *
 | |
|  * Checks relkind of relid and will throw an error if the relation does not
 | |
|  * have a VM.
 | |
|  */
 | |
| static corrupt_items *
 | |
| collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
 | |
| {
 | |
| 	Relation	rel;
 | |
| 	corrupt_items *items;
 | |
| 	Buffer		vmbuffer = InvalidBuffer;
 | |
| 	BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
 | |
| 	TransactionId OldestXmin = InvalidTransactionId;
 | |
| 	struct collect_corrupt_items_read_stream_private p;
 | |
| 	ReadStream *stream;
 | |
| 	Buffer		buffer;
 | |
| 
 | |
| 	rel = relation_open(relid, AccessShareLock);
 | |
| 
 | |
| 	/* Only some relkinds have a visibility map */
 | |
| 	check_relation_relkind(rel);
 | |
| 
 | |
| 	if (all_visible)
 | |
| 		OldestXmin = GetStrictOldestNonRemovableTransactionId(rel);
 | |
| 
 | |
| 	/*
 | |
| 	 * Guess an initial array size. We don't expect many corrupted tuples, so
 | |
| 	 * start with a small array.  This function uses the "next" field to track
 | |
| 	 * the next offset where we can store an item (which is the same thing as
 | |
| 	 * the number of items found so far) and the "count" field to track the
 | |
| 	 * number of entries allocated.  We'll repurpose these fields before
 | |
| 	 * returning.
 | |
| 	 */
 | |
| 	items = palloc0(sizeof(corrupt_items));
 | |
| 	items->next = 0;
 | |
| 	items->count = 64;
 | |
| 	items->tids = palloc(items->count * sizeof(ItemPointerData));
 | |
| 
 | |
| 	p.current_blocknum = 0;
 | |
| 	p.last_exclusive = RelationGetNumberOfBlocks(rel);
 | |
| 	p.rel = rel;
 | |
| 	p.vmbuffer = InvalidBuffer;
 | |
| 	p.all_frozen = all_frozen;
 | |
| 	p.all_visible = all_visible;
 | |
| 	stream = read_stream_begin_relation(READ_STREAM_FULL,
 | |
| 										bstrategy,
 | |
| 										rel,
 | |
| 										MAIN_FORKNUM,
 | |
| 										collect_corrupt_items_read_stream_next_block,
 | |
| 										&p,
 | |
| 										0);
 | |
| 
 | |
| 	/* Loop over every block in the relation. */
 | |
| 	while ((buffer = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
 | |
| 	{
 | |
| 		bool		check_frozen = all_frozen;
 | |
| 		bool		check_visible = all_visible;
 | |
| 		Page		page;
 | |
| 		OffsetNumber offnum,
 | |
| 					maxoff;
 | |
| 		BlockNumber blkno;
 | |
| 
 | |
| 		/* Make sure we are interruptible. */
 | |
| 		CHECK_FOR_INTERRUPTS();
 | |
| 
 | |
| 		LockBuffer(buffer, BUFFER_LOCK_SHARE);
 | |
| 
 | |
| 		page = BufferGetPage(buffer);
 | |
| 		maxoff = PageGetMaxOffsetNumber(page);
 | |
| 		blkno = BufferGetBlockNumber(buffer);
 | |
| 
 | |
| 		/*
 | |
| 		 * The visibility map bits might have changed while we were acquiring
 | |
| 		 * the page lock.  Recheck to avoid returning spurious results.
 | |
| 		 */
 | |
| 		if (check_frozen && !VM_ALL_FROZEN(rel, blkno, &vmbuffer))
 | |
| 			check_frozen = false;
 | |
| 		if (check_visible && !VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
 | |
| 			check_visible = false;
 | |
| 		if (!check_visible && !check_frozen)
 | |
| 		{
 | |
| 			UnlockReleaseBuffer(buffer);
 | |
| 			continue;
 | |
| 		}
 | |
| 
 | |
| 		/* Iterate over each tuple on the page. */
 | |
| 		for (offnum = FirstOffsetNumber;
 | |
| 			 offnum <= maxoff;
 | |
| 			 offnum = OffsetNumberNext(offnum))
 | |
| 		{
 | |
| 			HeapTupleData tuple;
 | |
| 			ItemId		itemid;
 | |
| 
 | |
| 			itemid = PageGetItemId(page, offnum);
 | |
| 
 | |
| 			/* Unused or redirect line pointers are of no interest. */
 | |
| 			if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
 | |
| 				continue;
 | |
| 
 | |
| 			/* Dead line pointers are neither all-visible nor frozen. */
 | |
| 			if (ItemIdIsDead(itemid))
 | |
| 			{
 | |
| 				ItemPointerSet(&(tuple.t_self), blkno, offnum);
 | |
| 				record_corrupt_item(items, &tuple.t_self);
 | |
| 				continue;
 | |
| 			}
 | |
| 
 | |
| 			/* Initialize a HeapTupleData structure for checks below. */
 | |
| 			ItemPointerSet(&(tuple.t_self), blkno, offnum);
 | |
| 			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
 | |
| 			tuple.t_len = ItemIdGetLength(itemid);
 | |
| 			tuple.t_tableOid = relid;
 | |
| 
 | |
| 			/*
 | |
| 			 * If we're checking whether the page is all-visible, we expect
 | |
| 			 * the tuple to be all-visible.
 | |
| 			 */
 | |
| 			if (check_visible &&
 | |
| 				!tuple_all_visible(&tuple, OldestXmin, buffer))
 | |
| 			{
 | |
| 				TransactionId RecomputedOldestXmin;
 | |
| 
 | |
| 				/*
 | |
| 				 * Time has passed since we computed OldestXmin, so it's
 | |
| 				 * possible that this tuple is all-visible in reality even
 | |
| 				 * though it doesn't appear so based on our
 | |
| 				 * previously-computed value.  Let's compute a new value so we
 | |
| 				 * can be certain whether there is a problem.
 | |
| 				 *
 | |
| 				 * From a concurrency point of view, it sort of sucks to
 | |
| 				 * retake ProcArrayLock here while we're holding the buffer
 | |
| 				 * exclusively locked, but it should be safe against
 | |
| 				 * deadlocks, because surely
 | |
| 				 * GetStrictOldestNonRemovableTransactionId() should never
 | |
| 				 * take a buffer lock. And this shouldn't happen often, so
 | |
| 				 * it's worth being careful so as to avoid false positives.
 | |
| 				 */
 | |
| 				RecomputedOldestXmin = GetStrictOldestNonRemovableTransactionId(rel);
 | |
| 
 | |
| 				if (!TransactionIdPrecedes(OldestXmin, RecomputedOldestXmin))
 | |
| 					record_corrupt_item(items, &tuple.t_self);
 | |
| 				else
 | |
| 				{
 | |
| 					OldestXmin = RecomputedOldestXmin;
 | |
| 					if (!tuple_all_visible(&tuple, OldestXmin, buffer))
 | |
| 						record_corrupt_item(items, &tuple.t_self);
 | |
| 				}
 | |
| 			}
 | |
| 
 | |
| 			/*
 | |
| 			 * If we're checking whether the page is all-frozen, we expect the
 | |
| 			 * tuple to be in a state where it will never need freezing.
 | |
| 			 */
 | |
| 			if (check_frozen)
 | |
| 			{
 | |
| 				if (heap_tuple_needs_eventual_freeze(tuple.t_data))
 | |
| 					record_corrupt_item(items, &tuple.t_self);
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		UnlockReleaseBuffer(buffer);
 | |
| 	}
 | |
| 	read_stream_end(stream);
 | |
| 
 | |
| 	/* Clean up. */
 | |
| 	if (vmbuffer != InvalidBuffer)
 | |
| 		ReleaseBuffer(vmbuffer);
 | |
| 	if (p.vmbuffer != InvalidBuffer)
 | |
| 		ReleaseBuffer(p.vmbuffer);
 | |
| 	relation_close(rel, AccessShareLock);
 | |
| 
 | |
| 	/*
 | |
| 	 * Before returning, repurpose the fields to match caller's expectations.
 | |
| 	 * next is now the next item that should be read (rather than written) and
 | |
| 	 * count is now the number of items we wrote (rather than the number we
 | |
| 	 * allocated).
 | |
| 	 */
 | |
| 	items->count = items->next;
 | |
| 	items->next = 0;
 | |
| 
 | |
| 	return items;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Remember one corrupt item.
 | |
|  */
 | |
| static void
 | |
| record_corrupt_item(corrupt_items *items, ItemPointer tid)
 | |
| {
 | |
| 	/* enlarge output array if needed. */
 | |
| 	if (items->next >= items->count)
 | |
| 	{
 | |
| 		items->count *= 2;
 | |
| 		items->tids = repalloc(items->tids,
 | |
| 							   items->count * sizeof(ItemPointerData));
 | |
| 	}
 | |
| 	/* and add the new item */
 | |
| 	items->tids[items->next++] = *tid;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Check whether a tuple is all-visible relative to a given OldestXmin value.
 | |
|  * The buffer should contain the tuple and should be locked and pinned.
 | |
|  */
 | |
| static bool
 | |
| tuple_all_visible(HeapTuple tup, TransactionId OldestXmin, Buffer buffer)
 | |
| {
 | |
| 	HTSV_Result state;
 | |
| 	TransactionId xmin;
 | |
| 
 | |
| 	state = HeapTupleSatisfiesVacuum(tup, OldestXmin, buffer);
 | |
| 	if (state != HEAPTUPLE_LIVE)
 | |
| 		return false;			/* all-visible implies live */
 | |
| 
 | |
| 	/*
 | |
| 	 * Neither lazy_scan_heap nor heap_page_is_all_visible will mark a page
 | |
| 	 * all-visible unless every tuple is hinted committed. However, those hint
 | |
| 	 * bits could be lost after a crash, so we can't be certain that they'll
 | |
| 	 * be set here.  So just check the xmin.
 | |
| 	 */
 | |
| 
 | |
| 	xmin = HeapTupleHeaderGetXmin(tup->t_data);
 | |
| 	if (!TransactionIdPrecedes(xmin, OldestXmin))
 | |
| 		return false;			/* xmin not old enough for all to see */
 | |
| 
 | |
| 	return true;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * check_relation_relkind - convenience routine to check that relation
 | |
|  * is of the relkind supported by the callers
 | |
|  */
 | |
| static void
 | |
| check_relation_relkind(Relation rel)
 | |
| {
 | |
| 	if (!RELKIND_HAS_TABLE_AM(rel->rd_rel->relkind))
 | |
| 		ereport(ERROR,
 | |
| 				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
 | |
| 				 errmsg("relation \"%s\" is of wrong relation kind",
 | |
| 						RelationGetRelationName(rel)),
 | |
| 				 errdetail_relkind_not_supported(rel->rd_rel->relkind)));
 | |
| }
 |