More infrastructure for btree compaction project.

Tree-traversal code now knows what to do upon hitting a dead page (in theory
anyway, it's untested...).  Add a post-VACUUM-cleanup entry point for index
AMs, to provide a place for dead-page scavenging to happen.  Also, fix
oversight that broke btpo_prev links in temporary indexes.  initdb forced
due to additions in pg_am.
commit 799bc58dc7
parent 4fff132d1b
				| @ -1,6 +1,6 @@ | ||||
| <!-- | ||||
|  Documentation of the system catalogs, directed toward PostgreSQL developers | ||||
|  $Header: /cvsroot/pgsql/doc/src/sgml/catalogs.sgml,v 2.65 2003/01/19 00:13:28 momjian Exp $ | ||||
|  $Header: /cvsroot/pgsql/doc/src/sgml/catalogs.sgml,v 2.66 2003/02/22 00:45:03 tgl Exp $ | ||||
|  --> | ||||
| 
 | ||||
| <chapter id="catalogs"> | ||||
| @ -446,6 +446,13 @@ | ||||
|       <entry>bulk-delete function</entry> | ||||
|      </row> | ||||
| 
 | ||||
|      <row> | ||||
|       <entry>amvacuumcleanup</entry> | ||||
|       <entry><type>regproc</type></entry> | ||||
|       <entry>pg_proc.oid</entry> | ||||
|       <entry>post-VACUUM cleanup function</entry> | ||||
|      </row> | ||||
| 
 | ||||
|      <row> | ||||
|       <entry>amcostestimate</entry> | ||||
|       <entry><type>regproc</type></entry> | ||||
|  | ||||
| @ -8,7 +8,7 @@ | ||||
|  * Portions Copyright (c) 1994, Regents of the University of California | ||||
|  * | ||||
|  * IDENTIFICATION | ||||
|  *	  $Header: /cvsroot/pgsql/src/backend/access/gist/gist.c,v 1.99 2002/11/13 00:39:46 momjian Exp $ | ||||
|  *	  $Header: /cvsroot/pgsql/src/backend/access/gist/gist.c,v 1.100 2003/02/22 00:45:03 tgl Exp $ | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| @ -1650,8 +1650,9 @@ gistbulkdelete(PG_FUNCTION_ARGS) | ||||
| 
 | ||||
| 	result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult)); | ||||
| 	result->num_pages = num_pages; | ||||
| 	result->tuples_removed = tuples_removed; | ||||
| 	result->num_index_tuples = num_index_tuples; | ||||
| 	result->tuples_removed = tuples_removed; | ||||
| 	result->pages_free = 0; | ||||
| 
 | ||||
| 	PG_RETURN_POINTER(result); | ||||
| } | ||||
|  | ||||
| @ -8,7 +8,7 @@ | ||||
|  * | ||||
|  * | ||||
|  * IDENTIFICATION | ||||
|  *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.60 2002/09/04 20:31:09 momjian Exp $ | ||||
|  *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.61 2003/02/22 00:45:03 tgl Exp $ | ||||
|  * | ||||
|  * NOTES | ||||
|  *	  This file contains only the public interface routines. | ||||
| @ -491,8 +491,9 @@ hashbulkdelete(PG_FUNCTION_ARGS) | ||||
| 
 | ||||
| 	result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult)); | ||||
| 	result->num_pages = num_pages; | ||||
| 	result->tuples_removed = tuples_removed; | ||||
| 	result->num_index_tuples = num_index_tuples; | ||||
| 	result->tuples_removed = tuples_removed; | ||||
| 	result->pages_free = 0; | ||||
| 
 | ||||
| 	PG_RETURN_POINTER(result); | ||||
| } | ||||
|  | ||||
| @ -8,7 +8,7 @@ | ||||
|  * | ||||
|  * | ||||
|  * IDENTIFICATION | ||||
|  *	  $Header: /cvsroot/pgsql/src/backend/access/index/indexam.c,v 1.63 2003/01/08 19:41:40 tgl Exp $ | ||||
|  *	  $Header: /cvsroot/pgsql/src/backend/access/index/indexam.c,v 1.64 2003/02/22 00:45:03 tgl Exp $ | ||||
|  * | ||||
|  * INTERFACE ROUTINES | ||||
|  *		index_open		- open an index relation by relation OID | ||||
| @ -23,6 +23,7 @@ | ||||
|  *		index_restrpos	- restore a scan position | ||||
|  *		index_getnext	- get the next tuple from a scan | ||||
|  *		index_bulk_delete	- bulk deletion of index tuples | ||||
|  *		index_vacuum_cleanup	- post-deletion cleanup of an index | ||||
|  *		index_cost_estimator	- fetch amcostestimate procedure OID | ||||
|  *		index_getprocid - get a support procedure OID | ||||
|  * | ||||
| @ -579,6 +580,37 @@ index_bulk_delete(Relation indexRelation, | ||||
| 	return result; | ||||
| } | ||||
| 
 | ||||
| /* ----------------
 | ||||
|  *		index_vacuum_cleanup - do post-deletion cleanup of an index | ||||
|  * | ||||
|  *		return value is an optional palloc'd struct of statistics | ||||
|  * ---------------- | ||||
|  */ | ||||
| IndexBulkDeleteResult * | ||||
| index_vacuum_cleanup(Relation indexRelation, | ||||
| 					 IndexVacuumCleanupInfo *info, | ||||
| 					 IndexBulkDeleteResult *stats) | ||||
| { | ||||
| 	RegProcedure procedure; | ||||
| 	IndexBulkDeleteResult *result; | ||||
| 
 | ||||
| 	RELATION_CHECKS; | ||||
| 
 | ||||
| 	/* It's okay for an index AM not to have a vacuumcleanup procedure */ | ||||
| 	if (!RegProcedureIsValid(indexRelation->rd_am->amvacuumcleanup)) | ||||
| 		return stats; | ||||
| 
 | ||||
| 	GET_REL_PROCEDURE(vacuum_cleanup, amvacuumcleanup); | ||||
| 
 | ||||
| 	result = (IndexBulkDeleteResult *) | ||||
| 		DatumGetPointer(OidFunctionCall3(procedure, | ||||
| 										 PointerGetDatum(indexRelation), | ||||
| 										 PointerGetDatum((Pointer) info), | ||||
| 										 PointerGetDatum((Pointer) stats))); | ||||
| 
 | ||||
| 	return result; | ||||
| } | ||||
| 
 | ||||
| /* ----------------
 | ||||
|  *		index_cost_estimator | ||||
|  * | ||||
|  | ||||
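As a rough sketch of how the new dispatcher above is meant to be paired with index_bulk_delete from the VACUUM side (mirroring the vacuum.c and vacuumlazy.c hunks further down): the helper name scan_one_index, the no_tids_reaped callback, and the exact header paths are my assumptions, as is the bool (ItemPointer, void *) callback shape; the IndexVacuumCleanupInfo fields vacuum_full and message_level are the ones set in the vacuum.c hunks.

#include "postgres.h"
#include "access/genam.h"
#include "storage/itemptr.h"
#include "utils/rel.h"

/* hypothetical no-op callback: claims no heap TIDs are dead */
static bool
no_tids_reaped(ItemPointer itemptr, void *state)
{
	return false;
}

/*
 * Sketch (not part of the commit): bulk deletion followed by the new
 * post-VACUUM cleanup hook.  If the AM has no amvacuumcleanup proc,
 * index_vacuum_cleanup just hands the bulk-delete stats back unchanged.
 */
static IndexBulkDeleteResult *
scan_one_index(Relation indrel, bool vacuum_full, int message_level)
{
	IndexVacuumCleanupInfo vcinfo;
	IndexBulkDeleteResult *stats;

	/* scan the index even if nothing is being deleted */
	stats = index_bulk_delete(indrel, no_tids_reaped, NULL);

	/* then give the AM a chance to scavenge dead pages */
	vcinfo.vacuum_full = vacuum_full;
	vcinfo.message_level = message_level;
	stats = index_vacuum_cleanup(indrel, &vcinfo, stats);

	return stats;				/* palloc'd by the AM, or NULL */
}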
| @ -8,7 +8,7 @@ | ||||
|  * | ||||
|  * | ||||
|  * IDENTIFICATION | ||||
|  *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.97 2003/02/21 00:06:21 tgl Exp $ | ||||
|  *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.98 2003/02/22 00:45:03 tgl Exp $ | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| @ -280,12 +280,21 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel, | ||||
| 			if (!_bt_isequal(itupdesc, page, P_HIKEY, | ||||
| 							 natts, itup_scankey)) | ||||
| 				break; | ||||
| 			/* Advance to next non-dead page --- there must be one */ | ||||
| 			for (;;) | ||||
| 			{ | ||||
| 				nblkno = opaque->btpo_next; | ||||
| 				if (nbuf != InvalidBuffer) | ||||
| 					_bt_relbuf(rel, nbuf); | ||||
| 				nbuf = _bt_getbuf(rel, nblkno, BT_READ); | ||||
| 				page = BufferGetPage(nbuf); | ||||
| 				opaque = (BTPageOpaque) PageGetSpecialPointer(page); | ||||
| 				if (!P_IGNORE(opaque)) | ||||
| 					break; | ||||
| 				if (P_RIGHTMOST(opaque)) | ||||
| 					elog(ERROR, "_bt_check_unique: fell off the end of %s", | ||||
| 						 RelationGetRelationName(rel)); | ||||
| 			} | ||||
| 			maxoff = PageGetMaxOffsetNumber(page); | ||||
| 			offset = P_FIRSTDATAKEY(opaque); | ||||
| 		} | ||||
| @ -414,20 +423,34 @@ _bt_insertonpg(Relation rel, | ||||
| 			   _bt_compare(rel, keysz, scankey, page, P_HIKEY) == 0 && | ||||
| 			   random() > (MAX_RANDOM_VALUE / 100)) | ||||
| 		{ | ||||
| 			/* step right one page */ | ||||
| 			BlockNumber rblkno = lpageop->btpo_next; | ||||
| 			Buffer		rbuf; | ||||
| 
 | ||||
| 			/*
 | ||||
| 			 * must write-lock next page before releasing write lock on | ||||
| 			 * step right to next non-dead page | ||||
| 			 * | ||||
| 			 * must write-lock that page before releasing write lock on | ||||
| 			 * current page; else someone else's _bt_check_unique scan | ||||
| 			 * could fail to see our insertion. | ||||
| 			 * could fail to see our insertion.  write locks on intermediate | ||||
| 			 * dead pages won't do because we don't know when they will get | ||||
| 			 * de-linked from the tree. | ||||
| 			 */ | ||||
| 			Buffer		rbuf = InvalidBuffer; | ||||
| 
 | ||||
| 			for (;;) | ||||
| 			{ | ||||
| 				BlockNumber rblkno = lpageop->btpo_next; | ||||
| 
 | ||||
| 				if (rbuf != InvalidBuffer) | ||||
| 					_bt_relbuf(rel, rbuf); | ||||
| 				rbuf = _bt_getbuf(rel, rblkno, BT_WRITE); | ||||
| 				page = BufferGetPage(rbuf); | ||||
| 				lpageop = (BTPageOpaque) PageGetSpecialPointer(page); | ||||
| 				if (!P_IGNORE(lpageop)) | ||||
| 					break; | ||||
| 				if (P_RIGHTMOST(lpageop)) | ||||
| 					elog(ERROR, "_bt_insertonpg: fell off the end of %s", | ||||
| 						 RelationGetRelationName(rel)); | ||||
| 			} | ||||
| 			_bt_relbuf(rel, buf); | ||||
| 			buf = rbuf; | ||||
| 			page = BufferGetPage(buf); | ||||
| 			lpageop = (BTPageOpaque) PageGetSpecialPointer(page); | ||||
| 			movedright = true; | ||||
| 		} | ||||
| 
 | ||||
| @ -633,8 +656,9 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, | ||||
| 	BTPageOpaque ropaque, | ||||
| 				lopaque, | ||||
| 				oopaque; | ||||
| 	Buffer		sbuf = 0; | ||||
| 	Page		spage = 0; | ||||
| 	Buffer		sbuf = InvalidBuffer; | ||||
| 	Page		spage = NULL; | ||||
| 	BTPageOpaque sopaque = NULL; | ||||
| 	Size		itemsz; | ||||
| 	ItemId		itemid; | ||||
| 	BTItem		item; | ||||
| @ -792,6 +816,9 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, | ||||
| 	{ | ||||
| 		sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE); | ||||
| 		spage = BufferGetPage(sbuf); | ||||
| 		sopaque = (BTPageOpaque) PageGetSpecialPointer(spage); | ||||
| 		if (sopaque->btpo_prev != ropaque->btpo_prev) | ||||
| 			elog(PANIC, "btree: right sibling's left-link doesn't match"); | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| @ -802,6 +829,9 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, | ||||
| 	 */ | ||||
| 	START_CRIT_SECTION(); | ||||
| 
 | ||||
| 	if (!P_RIGHTMOST(ropaque)) | ||||
| 		sopaque->btpo_prev = BufferGetBlockNumber(rbuf); | ||||
| 
 | ||||
| 	/* XLOG stuff */ | ||||
| 	if (!rel->rd_istemp) | ||||
| 	{ | ||||
| @ -847,10 +877,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, | ||||
| 
 | ||||
| 		if (!P_RIGHTMOST(ropaque)) | ||||
| 		{ | ||||
| 			BTPageOpaque sopaque = (BTPageOpaque) PageGetSpecialPointer(spage); | ||||
| 
 | ||||
| 			sopaque->btpo_prev = BufferGetBlockNumber(rbuf); | ||||
| 
 | ||||
| 			rdata[2].next = &(rdata[3]); | ||||
| 			rdata[3].buffer = sbuf; | ||||
| 			rdata[3].data = NULL; | ||||
| @ -1250,15 +1276,19 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access) | ||||
| 		Buffer		buf; | ||||
| 		Page		page; | ||||
| 		BTPageOpaque opaque; | ||||
| 
 | ||||
| 		buf = _bt_getbuf(rel, blkno, access); | ||||
| 		page = BufferGetPage(buf); | ||||
| 		opaque = (BTPageOpaque) PageGetSpecialPointer(page); | ||||
| 
 | ||||
| 		if (!P_IGNORE(opaque)) | ||||
| 		{ | ||||
| 			OffsetNumber offnum, | ||||
| 						minoff, | ||||
| 						maxoff; | ||||
| 			ItemId		itemid; | ||||
| 			BTItem		item; | ||||
| 
 | ||||
| 		buf = _bt_getbuf(rel, blkno, access); | ||||
| 		page = BufferGetPage(buf); | ||||
| 		opaque = (BTPageOpaque) PageGetSpecialPointer(page); | ||||
| 			minoff = P_FIRSTDATAKEY(opaque); | ||||
| 			maxoff = PageGetMaxOffsetNumber(page); | ||||
| 
 | ||||
| @ -1271,9 +1301,9 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access) | ||||
| 				start = minoff; | ||||
| 
 | ||||
| 			/*
 | ||||
| 		 * These loops will check every item on the page --- but in an order | ||||
| 		 * that's attuned to the probability of where it actually is.  Scan | ||||
| 		 * to the right first, then to the left. | ||||
| 			 * These loops will check every item on the page --- but in an | ||||
| 			 * order that's attuned to the probability of where it actually | ||||
| 			 * is.  Scan to the right first, then to the left. | ||||
| 			 */ | ||||
| 			for (offnum = start; | ||||
| 				 offnum <= maxoff; | ||||
| @ -1304,6 +1334,7 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access) | ||||
| 					return buf; | ||||
| 				} | ||||
| 			} | ||||
| 		} | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * The item we're looking for moved right at least one page. | ||||
| @ -1365,6 +1396,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf) | ||||
| 	rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); | ||||
| 	rootpage = BufferGetPage(rootbuf); | ||||
| 	rootblknum = BufferGetBlockNumber(rootbuf); | ||||
| 
 | ||||
| 	/* acquire lock on the metapage */ | ||||
| 	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); | ||||
| 	metapg = BufferGetPage(metabuf); | ||||
| 	metad = BTPageGetMeta(metapg); | ||||
|  | ||||
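The two hunks above add essentially the same loop to _bt_check_unique and _bt_insertonpg: walk right past dead or half-dead pages while the caller's own page stays locked, erroring out if the rightmost page is reached.  A condensed sketch of that idiom, under the assumption that a hypothetical helper (bt_step_right_to_live is my name, not the commit's) receives the right-sibling block of a page the caller still holds:

#include "postgres.h"
#include "access/nbtree.h"

/*
 * Sketch (not in the commit): starting from blkno, walk right until a page
 * that is neither dead nor half-dead is found, and return it locked with
 * the requested access mode.  Intermediate dead pages are released as we
 * go; the caller's original page lock is never touched, which is what the
 * _bt_insertonpg comment about concurrent _bt_check_unique scans relies on.
 */
static Buffer
bt_step_right_to_live(Relation rel, BlockNumber blkno, int access)
{
	Buffer		buf = InvalidBuffer;

	for (;;)
	{
		Page		page;
		BTPageOpaque opaque;

		if (buf != InvalidBuffer)
			_bt_relbuf(rel, buf);		/* drop previous (dead) page */
		buf = _bt_getbuf(rel, blkno, access);
		page = BufferGetPage(buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (!P_IGNORE(opaque))
			break;						/* found a live page */
		if (P_RIGHTMOST(opaque))
			elog(ERROR, "bt_step_right_to_live: fell off the end of %s",
				 RelationGetRelationName(rel));
		blkno = opaque->btpo_next;		/* keep walking right */
	}
	return buf;
}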
| @ -9,7 +9,7 @@ | ||||
|  * | ||||
|  * | ||||
|  * IDENTIFICATION | ||||
|  *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.59 2003/02/21 00:06:21 tgl Exp $ | ||||
|  *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.60 2003/02/22 00:45:04 tgl Exp $ | ||||
|  * | ||||
|  *	NOTES | ||||
|  *	   Postgres btree pages look like ordinary relation pages.	The opaque | ||||
| @ -22,34 +22,17 @@ | ||||
|  */ | ||||
| #include "postgres.h" | ||||
| 
 | ||||
| #include <time.h> | ||||
| 
 | ||||
| #include "access/nbtree.h" | ||||
| #include "miscadmin.h" | ||||
| #include "storage/lmgr.h" | ||||
| 
 | ||||
| extern bool FixBTree;			/* comments in nbtree.c */ | ||||
| extern Buffer _bt_fixroot(Relation rel, Buffer oldrootbuf, bool release); | ||||
| 
 | ||||
| /*
 | ||||
|  *	We use high-concurrency locking on btrees.	There are two cases in | ||||
|  *	which we don't do locking.  One is when we're building the btree. | ||||
|  *	Since the creating transaction has not committed, no one can see | ||||
|  *	the index, and there's no reason to share locks.  The second case | ||||
|  *	is when we're just starting up the database system.  We use some | ||||
|  *	special-purpose initialization code in the relation cache manager | ||||
|  *	(see utils/cache/relcache.c) to allow us to do indexed scans on | ||||
|  *	the system catalogs before we'd normally be able to.  This happens | ||||
|  *	before the lock table is fully initialized, so we can't use it. | ||||
|  *	Strictly speaking, this violates 2pl, but we don't do 2pl on the | ||||
|  *	system catalogs anyway, so I declare this to be okay. | ||||
|  */ | ||||
| 
 | ||||
| #define USELOCKING		(!BuildingBtree && !IsInitProcessingMode()) | ||||
| 
 | ||||
| 
 | ||||
| /*
 | ||||
|  *	_bt_metapinit() -- Initialize the metadata page of a new btree. | ||||
|  * | ||||
|  * Note: there's no real need for any locking here.  Since the transaction | ||||
|  * creating the index hasn't committed yet, no one else can even see the index | ||||
|  * much less be trying to use it. | ||||
|  */ | ||||
| void | ||||
| _bt_metapinit(Relation rel) | ||||
| @ -59,10 +42,6 @@ _bt_metapinit(Relation rel) | ||||
| 	BTMetaPageData *metad; | ||||
| 	BTPageOpaque op; | ||||
| 
 | ||||
| 	/* can't be sharing this with anyone, now... */ | ||||
| 	if (USELOCKING) | ||||
| 		LockRelation(rel, AccessExclusiveLock); | ||||
| 
 | ||||
| 	if (RelationGetNumberOfBlocks(rel) != 0) | ||||
| 		elog(ERROR, "Cannot initialize non-empty btree %s", | ||||
| 			 RelationGetRelationName(rel)); | ||||
| @ -114,10 +93,6 @@ _bt_metapinit(Relation rel) | ||||
| 	END_CRIT_SECTION(); | ||||
| 
 | ||||
| 	WriteBuffer(buf); | ||||
| 
 | ||||
| 	/* all done */ | ||||
| 	if (USELOCKING) | ||||
| 		UnlockRelation(rel, AccessExclusiveLock); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
| @ -142,7 +117,8 @@ _bt_metapinit(Relation rel) | ||||
|  *		what we will return is the old root, which is now just the leftmost | ||||
|  *		page on a probably-not-very-wide level.  For most purposes this is | ||||
|  *		as good as or better than the true root, so we do not bother to | ||||
|  *		insist on finding the true root. | ||||
|  *		insist on finding the true root.  We do, however, guarantee to | ||||
|  *		return a live (not deleted or half-dead) page. | ||||
|  * | ||||
|  *		On successful return, the root page is pinned and read-locked. | ||||
|  *		The metadata page is not locked or pinned on exit. | ||||
| @ -157,6 +133,7 @@ _bt_getroot(Relation rel, int access) | ||||
| 	Page		rootpage; | ||||
| 	BTPageOpaque rootopaque; | ||||
| 	BlockNumber rootblkno; | ||||
| 	uint32		rootlevel; | ||||
| 	BTMetaPageData *metad; | ||||
| 
 | ||||
| 	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); | ||||
| @ -164,6 +141,7 @@ _bt_getroot(Relation rel, int access) | ||||
| 	metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg); | ||||
| 	metad = BTPageGetMeta(metapg); | ||||
| 
 | ||||
| 	/* sanity-check the metapage */ | ||||
| 	if (!(metaopaque->btpo_flags & BTP_META) || | ||||
| 		metad->btm_magic != BTREE_MAGIC) | ||||
| 		elog(ERROR, "Index %s is not a btree", | ||||
| @ -191,10 +169,20 @@ _bt_getroot(Relation rel, int access) | ||||
| 		/*
 | ||||
| 		 * Race condition:	if someone else initialized the metadata | ||||
| 		 * between the time we released the read lock and acquired the | ||||
| 		 * write lock, above, we must avoid doing it again. | ||||
| 		 * write lock, we must avoid doing it again. | ||||
| 		 */ | ||||
| 		if (metad->btm_root == P_NONE) | ||||
| 		if (metad->btm_root != P_NONE) | ||||
| 		{ | ||||
| 			/*
 | ||||
| 			 * Metadata initialized by someone else.  In order to | ||||
| 			 * guarantee no deadlocks, we have to release the metadata | ||||
| 			 * page and start all over again.  (Is that really true? | ||||
| 			 * But it's hardly worth trying to optimize this case.) | ||||
| 			 */ | ||||
| 			_bt_relbuf(rel, metabuf); | ||||
| 			return _bt_getroot(rel, access); | ||||
| 		} | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * Get, initialize, write, and leave a lock of the appropriate | ||||
| 		 * type on the new root page.  Since this is the first page in | ||||
| @ -258,23 +246,36 @@ _bt_getroot(Relation rel, int access) | ||||
| 		_bt_wrtbuf(rel, metabuf); | ||||
| 	} | ||||
| 	else | ||||
| 		{ | ||||
| 			/*
 | ||||
| 			 * Metadata initialized by someone else.  In order to | ||||
| 			 * guarantee no deadlocks, we have to release the metadata | ||||
| 			 * page and start all over again. | ||||
| 			 */ | ||||
| 			_bt_relbuf(rel, metabuf); | ||||
| 			return _bt_getroot(rel, access); | ||||
| 		} | ||||
| 	} | ||||
| 	else | ||||
| 	{ | ||||
| 		rootblkno = metad->btm_fastroot; | ||||
| 		Assert(rootblkno != P_NONE); | ||||
| 		rootlevel = metad->btm_fastlevel; | ||||
| 
 | ||||
| 		_bt_relbuf(rel, metabuf);		/* done with the meta page */ | ||||
| 
 | ||||
| 		for (;;) | ||||
| 		{ | ||||
| 			rootbuf = _bt_getbuf(rel, rootblkno, BT_READ); | ||||
| 			rootpage = BufferGetPage(rootbuf); | ||||
| 			rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); | ||||
| 
 | ||||
| 			if (!P_IGNORE(rootopaque)) | ||||
| 				break; | ||||
| 
 | ||||
| 			/* it's dead, Jim.  step right one page */ | ||||
| 			if (P_RIGHTMOST(rootopaque)) | ||||
| 				elog(ERROR, "No live root page found in %s", | ||||
| 					 RelationGetRelationName(rel)); | ||||
| 			rootblkno = rootopaque->btpo_next; | ||||
| 
 | ||||
| 			_bt_relbuf(rel, rootbuf); | ||||
| 		} | ||||
| 
 | ||||
| 		/* Note: can't check btpo.level on deleted pages */ | ||||
| 		if (rootopaque->btpo.level != rootlevel) | ||||
| 			elog(ERROR, "Root page %u of %s has level %u, expected %u", | ||||
| 				 rootblkno, RelationGetRelationName(rel), | ||||
| 				 rootopaque->btpo.level, rootlevel); | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| @ -305,7 +306,10 @@ _bt_gettrueroot(Relation rel) | ||||
| 	Page		metapg; | ||||
| 	BTPageOpaque metaopaque; | ||||
| 	Buffer		rootbuf; | ||||
| 	Page		rootpage; | ||||
| 	BTPageOpaque rootopaque; | ||||
| 	BlockNumber rootblkno; | ||||
| 	uint32		rootlevel; | ||||
| 	BTMetaPageData *metad; | ||||
| 
 | ||||
| 	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); | ||||
| @ -331,10 +335,33 @@ _bt_gettrueroot(Relation rel) | ||||
| 	} | ||||
| 
 | ||||
| 	rootblkno = metad->btm_root; | ||||
| 	rootlevel = metad->btm_level; | ||||
| 
 | ||||
| 	_bt_relbuf(rel, metabuf);	/* done with the meta page */ | ||||
| 
 | ||||
| 	for (;;) | ||||
| 	{ | ||||
| 		rootbuf = _bt_getbuf(rel, rootblkno, BT_READ); | ||||
| 		rootpage = BufferGetPage(rootbuf); | ||||
| 		rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage); | ||||
| 
 | ||||
| 		if (!P_IGNORE(rootopaque)) | ||||
| 			break; | ||||
| 
 | ||||
| 		/* it's dead, Jim.  step right one page */ | ||||
| 		if (P_RIGHTMOST(rootopaque)) | ||||
| 			elog(ERROR, "No live root page found in %s", | ||||
| 				 RelationGetRelationName(rel)); | ||||
| 		rootblkno = rootopaque->btpo_next; | ||||
| 
 | ||||
| 		_bt_relbuf(rel, rootbuf); | ||||
| 	} | ||||
| 
 | ||||
| 	/* Note: can't check btpo.level on deleted pages */ | ||||
| 	if (rootopaque->btpo.level != rootlevel) | ||||
| 		elog(ERROR, "Root page %u of %s has level %u, expected %u", | ||||
| 			 rootblkno, RelationGetRelationName(rel), | ||||
| 			 rootopaque->btpo.level, rootlevel); | ||||
| 
 | ||||
| 	return rootbuf; | ||||
| } | ||||
| @ -342,6 +369,8 @@ _bt_gettrueroot(Relation rel) | ||||
| /*
 | ||||
|  *	_bt_getbuf() -- Get a buffer by block number for read or write. | ||||
|  * | ||||
|  *		blkno == P_NEW means to get an unallocated index page. | ||||
|  * | ||||
|  *		When this routine returns, the appropriate lock is set on the | ||||
|  *		requested buffer and its reference count has been incremented | ||||
|  *		(ie, the buffer is "locked and pinned"). | ||||
| @ -359,19 +388,36 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access) | ||||
| 	} | ||||
| 	else | ||||
| 	{ | ||||
| 		bool		needLock; | ||||
| 		Page		page; | ||||
| 
 | ||||
| 		/* XXX soon: ask FSM about free space */ | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * Extend the relation by one page. | ||||
| 		 * | ||||
| 		 * Extend bufmgr code is unclean and so we have to use extra locking | ||||
| 		 * here. | ||||
| 		 * We have to use a lock to ensure no one else is extending the rel at | ||||
| 		 * the same time, else we will both try to initialize the same new | ||||
| 		 * page.  We can skip locking for new or temp relations, however, | ||||
| 		 * since no one else could be accessing them. | ||||
| 		 */ | ||||
| 		needLock = !(rel->rd_isnew || rel->rd_istemp); | ||||
| 
 | ||||
| 		if (needLock) | ||||
| 			LockPage(rel, 0, ExclusiveLock); | ||||
| 		buf = ReadBuffer(rel, blkno); | ||||
| 		LockBuffer(buf, access); | ||||
| 
 | ||||
| 		buf = ReadBuffer(rel, P_NEW); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * Release the file-extension lock; it's now OK for someone else to | ||||
| 		 * extend the relation some more. | ||||
| 		 */ | ||||
| 		if (needLock) | ||||
| 			UnlockPage(rel, 0, ExclusiveLock); | ||||
| 
 | ||||
| 		/* Acquire appropriate buffer lock on new page */ | ||||
| 		LockBuffer(buf, access); | ||||
| 
 | ||||
| 		/* Initialize the new page before returning it */ | ||||
| 		page = BufferGetPage(buf); | ||||
| 		_bt_pageinit(page, BufferGetPageSize(buf)); | ||||
| @ -403,10 +449,9 @@ _bt_relbuf(Relation rel, Buffer buf) | ||||
|  *		and a pin on the buffer. | ||||
|  * | ||||
|  * NOTE: actually, the buffer manager just marks the shared buffer page | ||||
|  * dirty here, the real I/O happens later.	Since we can't persuade the | ||||
|  * Unix kernel to schedule disk writes in a particular order, there's not | ||||
|  * much point in worrying about this.  The most we can say is that all the | ||||
|  * writes will occur before commit. | ||||
|  * dirty here; the real I/O happens later.  This is okay since we are not | ||||
|  * relying on write ordering anyway.  The WAL mechanism is responsible for | ||||
|  * guaranteeing correctness after a crash. | ||||
|  */ | ||||
| void | ||||
| _bt_wrtbuf(Relation rel, Buffer buf) | ||||
| @ -455,8 +500,9 @@ _bt_pageinit(Page page, Size size) | ||||
|  *		mistake.  On exit, metapage data is correct and we no longer have | ||||
|  *		a pin or lock on the metapage. | ||||
|  * | ||||
|  * XXX this is not used for splitting anymore, only in nbtsort.c at the | ||||
|  * completion of btree building. | ||||
|  * Actually this is not used for splitting on-the-fly anymore.  It's only used | ||||
|  * in nbtsort.c at the completion of btree building, where we know we have | ||||
|  * sole access to the index anyway. | ||||
|  */ | ||||
| void | ||||
| _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level) | ||||
| @ -512,6 +558,10 @@ _bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level) | ||||
| /*
 | ||||
|  * Delete an item from a btree page. | ||||
|  * | ||||
|  * This must only be used for deleting leaf items.  Deleting an item on a | ||||
|  * non-leaf page has to be done as part of an atomic action that includes | ||||
|  * deleting the page it points to. | ||||
|  * | ||||
|  * This routine assumes that the caller has pinned and locked the buffer, | ||||
|  * and will write the buffer afterwards. | ||||
|  */ | ||||
|  | ||||
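The nbtpage.c hunk above replaces the old "extend bufmgr code is unclean" remark with an explicit page-zero lock around relation extension.  A hedged sketch of that P_NEW path, pulled out of _bt_getbuf for readability (the helper name bt_extend_one_page and the bufmgr.h include are my assumptions; every call appears in the hunk itself):

#include "postgres.h"
#include "access/nbtree.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"

/*
 * Sketch (not in the commit): take the page-0 lock so two backends don't
 * both extend the relation and initialize the same new page; skip the lock
 * for new or temp relations, which nobody else can be touching; initialize
 * the page before handing it back.
 */
static Buffer
bt_extend_one_page(Relation rel, int access)
{
	Buffer		buf;
	Page		page;
	bool		needLock = !(rel->rd_isnew || rel->rd_istemp);

	if (needLock)
		LockPage(rel, 0, ExclusiveLock);	/* serialize extenders */

	buf = ReadBuffer(rel, P_NEW);			/* adds one page at the end */

	if (needLock)
		UnlockPage(rel, 0, ExclusiveLock);	/* others may extend again */

	LockBuffer(buf, access);				/* BT_READ or BT_WRITE */

	page = BufferGetPage(buf);
	_bt_pageinit(page, BufferGetPageSize(buf));
	return buf;
}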
| @ -12,7 +12,7 @@ | ||||
|  * Portions Copyright (c) 1994, Regents of the University of California | ||||
|  * | ||||
|  * IDENTIFICATION | ||||
|  *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.95 2003/02/21 00:06:21 tgl Exp $ | ||||
|  *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.96 2003/02/22 00:45:04 tgl Exp $ | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| @ -23,6 +23,7 @@ | ||||
| #include "access/nbtree.h" | ||||
| #include "catalog/index.h" | ||||
| #include "miscadmin.h" | ||||
| #include "storage/freespace.h" | ||||
| 
 | ||||
| 
 | ||||
| /* Working state for btbuild and its callback */ | ||||
| @ -44,7 +45,6 @@ typedef struct | ||||
| } BTBuildState; | ||||
| 
 | ||||
| 
 | ||||
| bool		BuildingBtree = false;		/* see comment in btbuild() */ | ||||
| bool		FastBuild = true;	/* use SORT instead of insertion build */ | ||||
| 
 | ||||
| /*
 | ||||
| @ -68,13 +68,7 @@ static void btbuildCallback(Relation index, | ||||
| void | ||||
| AtEOXact_nbtree(void) | ||||
| { | ||||
| 	/*
 | ||||
| 	 * Note: these actions should only be necessary during xact abort; but | ||||
| 	 * they can't hurt during a commit. | ||||
| 	 */ | ||||
| 
 | ||||
| 	/* If we were building a btree, we ain't anymore. */ | ||||
| 	BuildingBtree = false; | ||||
| 	/* nothing to do at the moment */ | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| @ -95,9 +89,6 @@ btbuild(PG_FUNCTION_ARGS) | ||||
| 	double		reltuples; | ||||
| 	BTBuildState buildstate; | ||||
| 
 | ||||
| 	/* set flag to disable locking */ | ||||
| 	BuildingBtree = true; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * bootstrap processing does something strange, so don't use | ||||
| 	 * sort/build for initial catalog indices.	at some point i need to | ||||
| @ -172,9 +163,6 @@ btbuild(PG_FUNCTION_ARGS) | ||||
| 	} | ||||
| #endif   /* BTREE_BUILD_STATS */ | ||||
| 
 | ||||
| 	/* all done */ | ||||
| 	BuildingBtree = false; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Since we just counted the tuples in the heap, we update its stats | ||||
| 	 * in pg_class to guarantee that the planner takes advantage of the | ||||
| @ -689,10 +677,6 @@ btbulkdelete(PG_FUNCTION_ARGS) | ||||
| 				 * We now need to back up the scan one item, so that the next | ||||
| 				 * cycle will re-examine the same offnum on this page (which | ||||
| 				 * now holds the next item). | ||||
| 				 * | ||||
| 				 * For now, just hack the current-item index.  Will need to | ||||
| 				 * be smarter when deletion includes removal of empty | ||||
| 				 * index pages. | ||||
| 				 */ | ||||
| 				current->ip_posid--; | ||||
| 			} | ||||
| @ -708,12 +692,89 @@ btbulkdelete(PG_FUNCTION_ARGS) | ||||
| 
 | ||||
| 	result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult)); | ||||
| 	result->num_pages = num_pages; | ||||
| 	result->tuples_removed = tuples_removed; | ||||
| 	result->num_index_tuples = num_index_tuples; | ||||
| 	result->tuples_removed = tuples_removed; | ||||
| 	result->pages_free = 0;		/* not computed here */ | ||||
| 
 | ||||
| 	PG_RETURN_POINTER(result); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Post-VACUUM cleanup. | ||||
|  * | ||||
|  * Here, we scan looking for pages we can delete or return to the freelist. | ||||
|  * | ||||
|  * Result: a palloc'd struct containing statistical info for VACUUM displays. | ||||
|  */ | ||||
| Datum | ||||
| btvacuumcleanup(PG_FUNCTION_ARGS) | ||||
| { | ||||
| 	Relation	rel = (Relation) PG_GETARG_POINTER(0); | ||||
| #ifdef NOT_USED | ||||
| 	IndexVacuumCleanupInfo *info = (IndexVacuumCleanupInfo *) PG_GETARG_POINTER(1); | ||||
| #endif | ||||
| 	IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(2); | ||||
| 	BlockNumber num_pages; | ||||
| 	BlockNumber blkno; | ||||
| 	PageFreeSpaceInfo *pageSpaces; | ||||
| 	int			nFreePages, | ||||
| 				maxFreePages; | ||||
| 
 | ||||
| 	Assert(stats != NULL); | ||||
| 
 | ||||
| 	num_pages = RelationGetNumberOfBlocks(rel); | ||||
| 
 | ||||
| 	/* No point in remembering more than MaxFSMPages pages */ | ||||
| 	maxFreePages = MaxFSMPages; | ||||
| 	if ((BlockNumber) maxFreePages > num_pages) | ||||
| 		maxFreePages = (int) num_pages + 1;	/* +1 to avoid palloc(0) */ | ||||
| 	pageSpaces = (PageFreeSpaceInfo *) palloc(maxFreePages * sizeof(PageFreeSpaceInfo)); | ||||
| 	nFreePages = 0; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Scan through all pages of index, except metapage.  (Any pages added | ||||
| 	 * after we start the scan will not be examined; this should be fine, | ||||
| 	 * since they can't possibly be empty.) | ||||
| 	 */ | ||||
| 	for (blkno = BTREE_METAPAGE+1; blkno < num_pages; blkno++) | ||||
| 	{ | ||||
| 		Buffer	buf; | ||||
| 		Page	page; | ||||
| 		BTPageOpaque opaque; | ||||
| 
 | ||||
| 		buf = _bt_getbuf(rel, blkno, BT_READ); | ||||
| 		page = BufferGetPage(buf); | ||||
| 		opaque = (BTPageOpaque) PageGetSpecialPointer(page); | ||||
| 		if (P_ISDELETED(opaque)) | ||||
| 		{ | ||||
| 			/* XXX if safe-to-reclaim... */ | ||||
| 			if (nFreePages < maxFreePages) | ||||
| 			{ | ||||
| 				pageSpaces[nFreePages].blkno = blkno; | ||||
| 				/* The avail-space value is bogus, but must be < BLCKSZ */ | ||||
| 				pageSpaces[nFreePages].avail = BLCKSZ-1; | ||||
| 				nFreePages++; | ||||
| 			} | ||||
| 		} | ||||
| 		_bt_relbuf(rel, buf); | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Update the shared Free Space Map with the info we now have about | ||||
| 	 * free space in the index, discarding any old info the map may have. | ||||
| 	 * We do not need to sort the page numbers; they're in order already. | ||||
| 	 */ | ||||
| 	MultiRecordFreeSpace(&rel->rd_node, 0, nFreePages, pageSpaces); | ||||
| 
 | ||||
| 	pfree(pageSpaces); | ||||
| 
 | ||||
| 	/* update statistics */ | ||||
| 	stats->num_pages = num_pages; | ||||
| 	stats->pages_free = nFreePages; | ||||
| 
 | ||||
| 	PG_RETURN_POINTER(stats); | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * Restore scan position when btgettuple is called to continue a scan. | ||||
|  * | ||||
| @ -739,7 +800,7 @@ _bt_restscan(IndexScanDesc scan) | ||||
| 				maxoff; | ||||
| 	BTPageOpaque opaque; | ||||
| 	Buffer		nextbuf; | ||||
| 	ItemPointerData target = so->curHeapIptr; | ||||
| 	ItemPointer target = &(so->curHeapIptr); | ||||
| 	BTItem		item; | ||||
| 	BlockNumber blkno; | ||||
| 
 | ||||
| @ -759,7 +820,7 @@ _bt_restscan(IndexScanDesc scan) | ||||
| 	 * current->ip_posid before first index tuple on the current page | ||||
| 	 * (_bt_step will move it right)...  XXX still needed? | ||||
| 	 */ | ||||
| 	if (!ItemPointerIsValid(&target)) | ||||
| 	if (!ItemPointerIsValid(target)) | ||||
| 	{ | ||||
| 		ItemPointerSetOffsetNumber(current, | ||||
| 							   OffsetNumberPrev(P_FIRSTDATAKEY(opaque))); | ||||
| @ -778,11 +839,7 @@ _bt_restscan(IndexScanDesc scan) | ||||
| 			 offnum = OffsetNumberNext(offnum)) | ||||
| 		{ | ||||
| 			item = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); | ||||
| 			if (item->bti_itup.t_tid.ip_blkid.bi_hi == | ||||
| 				target.ip_blkid.bi_hi && | ||||
| 				item->bti_itup.t_tid.ip_blkid.bi_lo == | ||||
| 				target.ip_blkid.bi_lo && | ||||
| 				item->bti_itup.t_tid.ip_posid == target.ip_posid) | ||||
| 			if (BTTidSame(item->bti_itup.t_tid, *target)) | ||||
| 			{ | ||||
| 				/* Found it */ | ||||
| 				current->ip_posid = offnum; | ||||
| @ -793,22 +850,33 @@ _bt_restscan(IndexScanDesc scan) | ||||
| 		/*
 | ||||
| 		 * The item we're looking for moved right at least one page, so | ||||
| 		 * move right.  We are careful here to pin and read-lock the next | ||||
| 		 * page before releasing the current one.  This ensures that a | ||||
| 		 * concurrent btbulkdelete scan cannot pass our position --- if it | ||||
| 		 * non-dead page before releasing the current one.  This ensures that | ||||
| 		 * a concurrent btbulkdelete scan cannot pass our position --- if it | ||||
| 		 * did, it might be able to reach and delete our target item before | ||||
| 		 * we can find it again. | ||||
| 		 */ | ||||
| 		if (P_RIGHTMOST(opaque)) | ||||
| 			elog(FATAL, "_bt_restscan: my bits moved right off the end of the world!" | ||||
| 			elog(ERROR, "_bt_restscan: my bits moved right off the end of the world!" | ||||
| 				 "\n\tRecreate index %s.", RelationGetRelationName(rel)); | ||||
| 
 | ||||
| 		/* Advance to next non-dead page --- there must be one */ | ||||
| 		nextbuf = InvalidBuffer; | ||||
| 		for (;;) | ||||
| 		{ | ||||
| 			blkno = opaque->btpo_next; | ||||
| 			if (nextbuf != InvalidBuffer) | ||||
| 				_bt_relbuf(rel, nextbuf); | ||||
| 			nextbuf = _bt_getbuf(rel, blkno, BT_READ); | ||||
| 			page = BufferGetPage(nextbuf); | ||||
| 			opaque = (BTPageOpaque) PageGetSpecialPointer(page); | ||||
| 			if (!P_IGNORE(opaque)) | ||||
| 				break; | ||||
| 			if (P_RIGHTMOST(opaque)) | ||||
| 				elog(ERROR, "_bt_restscan: fell off the end of %s", | ||||
| 					 RelationGetRelationName(rel)); | ||||
| 		} | ||||
| 		_bt_relbuf(rel, buf); | ||||
| 		so->btso_curbuf = buf = nextbuf; | ||||
| 		page = BufferGetPage(buf); | ||||
| 		maxoff = PageGetMaxOffsetNumber(page); | ||||
| 		opaque = (BTPageOpaque) PageGetSpecialPointer(page); | ||||
| 		offnum = P_FIRSTDATAKEY(opaque); | ||||
| 		ItemPointerSet(current, blkno, offnum); | ||||
| 	} | ||||
|  | ||||
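Because the rendering above interleaves old and new lines, the heart of the new btvacuumcleanup is easy to miss: scan every page past the metapage, remember wholly deleted ones (capped at MaxFSMPages), and hand the list to the shared Free Space Map.  A condensed sketch, assuming a hypothetical helper name report_deleted_pages; the calls and field names all come from the hunk:

#include "postgres.h"
#include "access/nbtree.h"
#include "storage/freespace.h"

/*
 * Sketch (not in the commit): collect deleted pages and report them to the
 * FSM so later insertions can recycle them.  The avail value is bogus but
 * must stay below BLCKSZ; only the block numbers matter.
 */
static int
report_deleted_pages(Relation rel, BlockNumber num_pages)
{
	PageFreeSpaceInfo *pageSpaces;
	int			nFreePages = 0;
	int			maxFreePages = MaxFSMPages;
	BlockNumber blkno;

	if ((BlockNumber) maxFreePages > num_pages)
		maxFreePages = (int) num_pages + 1;		/* +1 avoids palloc(0) */
	pageSpaces = (PageFreeSpaceInfo *)
		palloc(maxFreePages * sizeof(PageFreeSpaceInfo));

	for (blkno = BTREE_METAPAGE + 1; blkno < num_pages; blkno++)
	{
		Buffer		buf = _bt_getbuf(rel, blkno, BT_READ);
		Page		page = BufferGetPage(buf);
		BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		if (P_ISDELETED(opaque) && nFreePages < maxFreePages)
		{
			pageSpaces[nFreePages].blkno = blkno;
			pageSpaces[nFreePages].avail = BLCKSZ - 1;
			nFreePages++;
		}
		_bt_relbuf(rel, buf);
	}

	/* replace whatever the map previously knew about this index */
	MultiRecordFreeSpace(&rel->rd_node, 0, nFreePages, pageSpaces);
	pfree(pageSpaces);
	return nFreePages;
}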
| @ -1,14 +1,14 @@ | ||||
| /*-------------------------------------------------------------------------
 | ||||
|  * | ||||
|  * nbtsearch.c | ||||
|  *	  search code for postgres btrees. | ||||
|  *	  Search code for postgres btrees. | ||||
|  * | ||||
|  * | ||||
|  * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group | ||||
|  * Portions Copyright (c) 1994, Regents of the University of California | ||||
|  * | ||||
|  * IDENTIFICATION | ||||
|  *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.73 2003/02/21 00:06:21 tgl Exp $ | ||||
|  *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.74 2003/02/22 00:45:04 tgl Exp $ | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| @ -19,6 +19,7 @@ | ||||
| #include "access/nbtree.h" | ||||
| 
 | ||||
| 
 | ||||
| static Buffer _bt_walk_left(Relation rel, Buffer buf); | ||||
| static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); | ||||
| 
 | ||||
| 
 | ||||
| @ -79,10 +80,11 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, | ||||
| 		par_blkno = BufferGetBlockNumber(*bufP); | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * We need to save the bit image of the index entry we chose in | ||||
| 		 * We need to save the location of the index entry we chose in | ||||
| 		 * the parent page on a stack. In case we split the tree, we'll | ||||
| 		 * use this bit image to figure out what our real parent page is, | ||||
| 		 * in case the parent splits while we're working lower in the | ||||
| 		 * use the stack to work back up to the parent page.  We also save | ||||
| 		 * the actual downlink (TID) to uniquely identify the index entry, | ||||
| 		 * in case it moves right while we're working lower in the | ||||
| 		 * tree.  See the paper by Lehman and Yao for how this is detected | ||||
| 		 * and handled. (We use the child link to disambiguate duplicate | ||||
| 		 * keys in the index -- Lehman and Yao disallow duplicate keys.) | ||||
| @ -114,7 +116,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, | ||||
| /*
 | ||||
|  *	_bt_moveright() -- move right in the btree if necessary. | ||||
|  * | ||||
|  *		When we drop and reacquire a pointer to a page, it is possible that | ||||
|  *		When we follow a pointer to reach a page, it is possible that | ||||
|  *		the page has changed in the meanwhile.	If this happens, we're | ||||
|  *		guaranteed that the page has "split right" -- that is, that any | ||||
|  *		data that appeared on the page originally is either on the page | ||||
| @ -148,9 +150,13 @@ _bt_moveright(Relation rel, | ||||
| 	 * right.  (If the scan key is equal to the high key, we might or | ||||
| 	 * might not need to move right; have to scan the page first anyway.) | ||||
| 	 * It could even have split more than once, so scan as far as needed. | ||||
| 	 * | ||||
| 	 * We also have to move right if we followed a link that brought us to | ||||
| 	 * a dead page. | ||||
| 	 */ | ||||
| 	while (!P_RIGHTMOST(opaque) && | ||||
| 		   _bt_compare(rel, keysz, scankey, page, P_HIKEY) > 0) | ||||
| 		   (P_IGNORE(opaque) || | ||||
| 			_bt_compare(rel, keysz, scankey, page, P_HIKEY) > 0)) | ||||
| 	{ | ||||
| 		/* step right one page */ | ||||
| 		BlockNumber rblkno = opaque->btpo_next; | ||||
| @ -161,6 +167,10 @@ _bt_moveright(Relation rel, | ||||
| 		opaque = (BTPageOpaque) PageGetSpecialPointer(page); | ||||
| 	} | ||||
| 
 | ||||
| 	if (P_IGNORE(opaque)) | ||||
| 		elog(ERROR, "_bt_moveright: fell off the end of %s", | ||||
| 			 RelationGetRelationName(rel)); | ||||
| 
 | ||||
| 	return buf; | ||||
| } | ||||
| 
 | ||||
| @ -796,7 +806,6 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) | ||||
| 	OffsetNumber offnum, | ||||
| 				maxoff; | ||||
| 	BlockNumber blkno; | ||||
| 	BlockNumber obknum; | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Don't use ItemPointerGetOffsetNumber or you risk to get assertion | ||||
| @ -814,7 +823,7 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) | ||||
| 			offnum = OffsetNumberNext(offnum); | ||||
| 		else | ||||
| 		{ | ||||
| 			/* walk right to the next page with data */ | ||||
| 			/* Walk right to the next page with data */ | ||||
| 			for (;;) | ||||
| 			{ | ||||
| 				/* if we're at end of scan, release the buffer and return */ | ||||
| @ -831,6 +840,8 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) | ||||
| 				*bufP = _bt_getbuf(rel, blkno, BT_READ); | ||||
| 				page = BufferGetPage(*bufP); | ||||
| 				opaque = (BTPageOpaque) PageGetSpecialPointer(page); | ||||
| 				if (!P_IGNORE(opaque)) | ||||
| 				{ | ||||
| 					maxoff = PageGetMaxOffsetNumber(page); | ||||
| 					/* done if it's not empty */ | ||||
| 					offnum = P_FIRSTDATAKEY(opaque); | ||||
| @ -839,53 +850,49 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) | ||||
| 				} | ||||
| 			} | ||||
| 		} | ||||
| 	else | ||||
| 	} | ||||
| 	else						/* backwards scan */ | ||||
| 	{ | ||||
| 		if (offnum > P_FIRSTDATAKEY(opaque)) | ||||
| 			offnum = OffsetNumberPrev(offnum); | ||||
| 		else | ||||
| 		{ | ||||
| 			/* walk left to the next page with data */ | ||||
| 			/*
 | ||||
| 			 * Walk left to the next page with data.  This is much more | ||||
| 			 * complex than the walk-right case because of the possibility | ||||
| 			 * that the page to our left splits while we are in flight to it, | ||||
| 			 * plus the possibility that the page we were on gets deleted | ||||
| 			 * after we leave it.  See nbtree/README for details. | ||||
| 			 */ | ||||
| 			for (;;) | ||||
| 			{ | ||||
| 				/* if we're at end of scan, release the buffer and return */ | ||||
| 				if (P_LEFTMOST(opaque)) | ||||
| 				*bufP = _bt_walk_left(rel, *bufP); | ||||
| 
 | ||||
| 				/* if we're at end of scan, return failure */ | ||||
| 				if (*bufP == InvalidBuffer) | ||||
| 				{ | ||||
| 					_bt_relbuf(rel, *bufP); | ||||
| 					ItemPointerSetInvalid(current); | ||||
| 					*bufP = so->btso_curbuf = InvalidBuffer; | ||||
| 					so->btso_curbuf = InvalidBuffer; | ||||
| 					return false; | ||||
| 				} | ||||
| 				/* step left */ | ||||
| 				obknum = BufferGetBlockNumber(*bufP); | ||||
| 				blkno = opaque->btpo_prev; | ||||
| 				_bt_relbuf(rel, *bufP); | ||||
| 				*bufP = _bt_getbuf(rel, blkno, BT_READ); | ||||
| 				page = BufferGetPage(*bufP); | ||||
| 				opaque = (BTPageOpaque) PageGetSpecialPointer(page); | ||||
| 
 | ||||
| 				/*
 | ||||
| 				 * If the adjacent page just split, then we have to walk | ||||
| 				 * right to find the block that's now adjacent to where we | ||||
| 				 * were.  Because pages only split right, we don't have to | ||||
| 				 * worry about this failing to terminate. | ||||
| 				 * Okay, we managed to move left to a non-deleted page. | ||||
| 				 * Done if it's not half-dead and not empty.  Else loop back | ||||
| 				 * and do it all again. | ||||
| 				 */ | ||||
| 				while (opaque->btpo_next != obknum) | ||||
| 				if (!P_IGNORE(opaque)) | ||||
| 				{ | ||||
| 					blkno = opaque->btpo_next; | ||||
| 					_bt_relbuf(rel, *bufP); | ||||
| 					*bufP = _bt_getbuf(rel, blkno, BT_READ); | ||||
| 					page = BufferGetPage(*bufP); | ||||
| 					opaque = (BTPageOpaque) PageGetSpecialPointer(page); | ||||
| 				} | ||||
| 				/* done if it's not empty */ | ||||
| 					maxoff = PageGetMaxOffsetNumber(page); | ||||
| 					offnum = maxoff; | ||||
| 				if (!PageIsEmpty(page) && maxoff >= P_FIRSTDATAKEY(opaque)) | ||||
| 					if (!PageIsEmpty(page) && | ||||
| 						maxoff >= P_FIRSTDATAKEY(opaque)) | ||||
| 						break; | ||||
| 				} | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	/* Update scan state */ | ||||
| 	so->btso_curbuf = *bufP; | ||||
| @ -895,11 +902,133 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) | ||||
| 	return true; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * _bt_walk_left() -- step left one page, if possible | ||||
|  * | ||||
|  * The given buffer must be pinned and read-locked.  This will be dropped | ||||
|  * before stepping left.  On return, we have pin and read lock on the | ||||
|  * returned page, instead. | ||||
|  * | ||||
|  * Returns InvalidBuffer if there is no page to the left (no lock is held | ||||
|  * in that case). | ||||
|  * | ||||
|  * When working on a non-leaf level, it is possible for the returned page | ||||
|  * to be half-dead; the caller should check that condition and step left | ||||
|  * again if it's important. | ||||
|  */ | ||||
| static Buffer | ||||
| _bt_walk_left(Relation rel, Buffer buf) | ||||
| { | ||||
| 	Page		page; | ||||
| 	BTPageOpaque opaque; | ||||
| 
 | ||||
| 	page = BufferGetPage(buf); | ||||
| 	opaque = (BTPageOpaque) PageGetSpecialPointer(page); | ||||
| 
 | ||||
| 	for (;;) | ||||
| 	{ | ||||
| 		BlockNumber obknum; | ||||
| 		BlockNumber lblkno; | ||||
| 		BlockNumber blkno; | ||||
| 		int			tries; | ||||
| 
 | ||||
| 		/* if we're at end of tree, release buf and return failure */ | ||||
| 		if (P_LEFTMOST(opaque)) | ||||
| 		{ | ||||
| 			_bt_relbuf(rel, buf); | ||||
| 			break; | ||||
| 		} | ||||
| 		/* remember original page we are stepping left from */ | ||||
| 		obknum = BufferGetBlockNumber(buf); | ||||
| 		/* step left */ | ||||
| 		blkno = lblkno = opaque->btpo_prev; | ||||
| 		_bt_relbuf(rel, buf); | ||||
| 		buf = _bt_getbuf(rel, blkno, BT_READ); | ||||
| 		page = BufferGetPage(buf); | ||||
| 		opaque = (BTPageOpaque) PageGetSpecialPointer(page); | ||||
| 		/*
 | ||||
| 		 * If this isn't the page we want, walk right till we find | ||||
| 		 * what we want --- but go no more than four hops (an | ||||
| 		 * arbitrary limit).  If we don't find the correct page by then, | ||||
| 		 * the most likely bet is that the original page got deleted | ||||
| 		 * and isn't in the sibling chain at all anymore, not that its | ||||
| 		 * left sibling got split more than four times. | ||||
| 		 * | ||||
| 		 * Note that it is correct to test P_ISDELETED not P_IGNORE | ||||
| 		 * here, because half-dead pages are still in the sibling | ||||
| 		 * chain.  Caller must reject half-dead pages if wanted. | ||||
| 		 */ | ||||
| 		tries = 0; | ||||
| 		for (;;) | ||||
| 		{ | ||||
| 			if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum) | ||||
| 			{ | ||||
| 				/* Found desired page, return it */ | ||||
| 				return buf; | ||||
| 			} | ||||
| 			if (P_RIGHTMOST(opaque) || ++tries > 4) | ||||
| 				break; | ||||
| 			blkno = opaque->btpo_next; | ||||
| 			_bt_relbuf(rel, buf); | ||||
| 			buf = _bt_getbuf(rel, blkno, BT_READ); | ||||
| 			page = BufferGetPage(buf); | ||||
| 			opaque = (BTPageOpaque) PageGetSpecialPointer(page); | ||||
| 		} | ||||
| 
 | ||||
| 		/* Return to the original page to see what's up */ | ||||
| 		_bt_relbuf(rel, buf); | ||||
| 		buf = _bt_getbuf(rel, obknum, BT_READ); | ||||
| 		page = BufferGetPage(buf); | ||||
| 		opaque = (BTPageOpaque) PageGetSpecialPointer(page); | ||||
| 		if (P_ISDELETED(opaque)) | ||||
| 		{ | ||||
| 			/*
 | ||||
| 			 * It was deleted.  Move right to first nondeleted page | ||||
| 			 * (there must be one); that is the page that has acquired the | ||||
| 			 * deleted one's keyspace, so stepping left from it will take | ||||
| 			 * us where we want to be. | ||||
| 			 */ | ||||
| 			for (;;) | ||||
| 			{ | ||||
| 				if (P_RIGHTMOST(opaque)) | ||||
| 					elog(ERROR, "_bt_walk_left: fell off the end of %s", | ||||
| 						 RelationGetRelationName(rel)); | ||||
| 				blkno = opaque->btpo_next; | ||||
| 				_bt_relbuf(rel, buf); | ||||
| 				buf = _bt_getbuf(rel, blkno, BT_READ); | ||||
| 				page = BufferGetPage(buf); | ||||
| 				opaque = (BTPageOpaque) PageGetSpecialPointer(page); | ||||
| 				if (!P_ISDELETED(opaque)) | ||||
| 					break; | ||||
| 			} | ||||
| 			/*
 | ||||
| 			 * Now return to top of loop, resetting obknum to | ||||
| 			 * point to this nondeleted page, and try again. | ||||
| 			 */ | ||||
| 		} | ||||
| 		else | ||||
| 		{ | ||||
| 			/*
 | ||||
| 			 * It wasn't deleted; the explanation had better be | ||||
| 			 * that the page to the left got split or deleted. | ||||
| 			 * Without this check, we'd go into an infinite loop | ||||
| 			 * if there's anything wrong. | ||||
| 			 */ | ||||
| 			if (opaque->btpo_prev == lblkno) | ||||
| 				elog(ERROR, "_bt_walk_left: can't find left sibling in %s", | ||||
| 					 RelationGetRelationName(rel)); | ||||
| 			/* Okay to try again with new lblkno value */ | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	return InvalidBuffer; | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * _bt_get_endpoint() -- Find the first or last page on a given tree level | ||||
|  * | ||||
|  * If the index is empty, we will return InvalidBuffer; any other failure | ||||
|  * condition causes elog(). | ||||
|  * condition causes elog().  We will not return a dead page. | ||||
|  * | ||||
|  * The returned buffer is pinned and read-locked. | ||||
|  */ | ||||
| @ -941,12 +1070,13 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost) | ||||
| 		 * step right if needed to get to it (this could happen if the | ||||
| 		 * page split since we obtained a pointer to it). | ||||
| 		 */ | ||||
| 		while (P_ISDELETED(opaque) || | ||||
| 		while (P_IGNORE(opaque) || | ||||
| 			   (rightmost && !P_RIGHTMOST(opaque))) | ||||
| 		{ | ||||
| 			blkno = opaque->btpo_next; | ||||
| 			if (blkno == P_NONE) | ||||
| 				elog(ERROR, "_bt_get_endpoint: ran off end of btree"); | ||||
| 				elog(ERROR, "_bt_get_endpoint: fell off the end of %s", | ||||
| 					 RelationGetRelationName(rel)); | ||||
| 			_bt_relbuf(rel, buf); | ||||
| 			buf = _bt_getbuf(rel, blkno, BT_READ); | ||||
| 			page = BufferGetPage(buf); | ||||
| @ -959,7 +1089,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost) | ||||
| 		if (opaque->btpo.level < level) | ||||
| 			elog(ERROR, "_bt_get_endpoint: btree level %u not found", level); | ||||
| 
 | ||||
| 		/* Step to leftmost or rightmost child page */ | ||||
| 		/* Descend to leftmost or rightmost child page */ | ||||
| 		if (rightmost) | ||||
| 			offnum = PageGetMaxOffsetNumber(page); | ||||
| 		else | ||||
|  | ||||
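The backward-scan part of the _bt_step hunk above is the hardest piece to read with the +/- markers lost in rendering.  What the new code does is keep calling the new _bt_walk_left helper (defined in the nbtsearch.c hunk) until it either fails at the left edge of the tree or lands on a live, non-empty page.  A hedged, condensed sketch; the wrapper name bt_step_left_to_data and its out-parameters are mine, and _bt_walk_left is actually static to nbtsearch.c:

#include "postgres.h"
#include "access/nbtree.h"

/*
 * Sketch (not in the commit): step left until a usable page is found.
 * Returns false when the scan has run off the start of the index.
 */
static bool
bt_step_left_to_data(Relation rel, Buffer *bufP,
					 OffsetNumber *offnumP, OffsetNumber *maxoffP)
{
	for (;;)
	{
		Page		page;
		BTPageOpaque opaque;

		*bufP = _bt_walk_left(rel, *bufP);
		if (*bufP == InvalidBuffer)
			return false;		/* no page to the left: scan is done */

		page = BufferGetPage(*bufP);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (!P_IGNORE(opaque))
		{
			*maxoffP = PageGetMaxOffsetNumber(page);
			*offnumP = *maxoffP;
			if (!PageIsEmpty(page) && *maxoffP >= P_FIRSTDATAKEY(opaque))
				return true;	/* found a usable item position */
		}
		/* dead, half-dead, or empty: loop and step left again */
	}
}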
| @ -1,4 +1,5 @@ | ||||
| /*-------------------------------------------------------------------------
 | ||||
|  * | ||||
|  * nbtsort.c | ||||
|  *		Build a btree from sorted input by loading leaf pages sequentially. | ||||
|  * | ||||
| @ -35,7 +36,7 @@ | ||||
|  * Portions Copyright (c) 1994, Regents of the University of California | ||||
|  * | ||||
|  * IDENTIFICATION | ||||
|  *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.71 2003/02/21 00:06:21 tgl Exp $ | ||||
|  *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.72 2003/02/22 00:45:04 tgl Exp $ | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| @ -164,8 +165,8 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) | ||||
| 		ResetUsage(); | ||||
| 	} | ||||
| #endif   /* BTREE_BUILD_STATS */ | ||||
| 	tuplesort_performsort(btspool->sortstate); | ||||
| 
 | ||||
| 	tuplesort_performsort(btspool->sortstate); | ||||
| 	if (btspool2) | ||||
| 		tuplesort_performsort(btspool2->sortstate); | ||||
| 	_bt_load(btspool->index, btspool, btspool2); | ||||
| @ -331,7 +332,7 @@ _bt_sortaddtup(Page page, | ||||
| 
 | ||||
| 	if (PageAddItem(page, (Item) btitem, itemsize, itup_off, | ||||
| 					LP_USED) == InvalidOffsetNumber) | ||||
| 		elog(FATAL, "btree: failed to add item to the page in _bt_sort"); | ||||
| 		elog(ERROR, "btree: failed to add item to the page in _bt_sort"); | ||||
| } | ||||
| 
 | ||||
| /*----------
 | ||||
| @ -470,8 +471,7 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti) | ||||
| 
 | ||||
| 		/*
 | ||||
| 		 * Write out the old page.	We never want to see it again, so we | ||||
| 		 * can give up our lock (if we had one; most likely BuildingBtree | ||||
| 		 * is set, so we aren't locking). | ||||
| 		 * can give up our lock. | ||||
| 		 */ | ||||
| 		_bt_blwritepage(index, obuf); | ||||
| 
 | ||||
| @ -534,7 +534,7 @@ _bt_uppershutdown(Relation index, BTPageState *state) | ||||
| 		if (s->btps_next == (BTPageState *) NULL) | ||||
| 		{ | ||||
| 			opaque->btpo_flags |= BTP_ROOT; | ||||
| 			_bt_metaproot(index, blkno, s->btps_level + 1); | ||||
| 			_bt_metaproot(index, blkno, s->btps_level); | ||||
| 		} | ||||
| 		else | ||||
| 		{ | ||||
|  | ||||
| @ -8,7 +8,7 @@ | ||||
|  * | ||||
|  * | ||||
|  * IDENTIFICATION | ||||
|  *	  $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtree.c,v 1.75 2002/09/04 20:31:13 momjian Exp $ | ||||
|  *	  $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtree.c,v 1.76 2003/02/22 00:45:04 tgl Exp $ | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| @ -1250,8 +1250,9 @@ rtbulkdelete(PG_FUNCTION_ARGS) | ||||
| 
 | ||||
| 	result = (IndexBulkDeleteResult *) palloc(sizeof(IndexBulkDeleteResult)); | ||||
| 	result->num_pages = num_pages; | ||||
| 	result->tuples_removed = tuples_removed; | ||||
| 	result->num_index_tuples = num_index_tuples; | ||||
| 	result->tuples_removed = tuples_removed; | ||||
| 	result->pages_free = 0; | ||||
| 
 | ||||
| 	PG_RETURN_POINTER(result); | ||||
| } | ||||
|  | ||||
| @ -13,7 +13,7 @@ | ||||
|  * | ||||
|  * | ||||
|  * IDENTIFICATION | ||||
|  *	  $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.247 2003/02/09 06:56:27 tgl Exp $ | ||||
|  *	  $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.248 2003/02/22 00:45:05 tgl Exp $ | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| @ -2603,17 +2603,25 @@ static void | ||||
| scan_index(Relation indrel, double num_tuples) | ||||
| { | ||||
| 	IndexBulkDeleteResult *stats; | ||||
| 	IndexVacuumCleanupInfo vcinfo; | ||||
| 	VacRUsage	ru0; | ||||
| 
 | ||||
| 	vac_init_rusage(&ru0); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Even though we're not planning to delete anything, use the | ||||
| 	 * ambulkdelete call, so that the scan happens within the index AM for | ||||
| 	 * more speed. | ||||
| 	 * Even though we're not planning to delete anything, we use the | ||||
| 	 * ambulkdelete call, because (a) the scan happens within the index AM | ||||
| 	 * for more speed, and (b) it may want to pass private statistics to | ||||
| 	 * the amvacuumcleanup call. | ||||
| 	 */ | ||||
| 	stats = index_bulk_delete(indrel, dummy_tid_reaped, NULL); | ||||
| 
 | ||||
| 	/* Do post-VACUUM cleanup, even though we deleted nothing */ | ||||
| 	vcinfo.vacuum_full = true; | ||||
| 	vcinfo.message_level = elevel; | ||||
| 
 | ||||
| 	stats = index_vacuum_cleanup(indrel, &vcinfo, stats); | ||||
| 
 | ||||
| 	if (!stats) | ||||
| 		return; | ||||
| 
 | ||||
| @ -2622,9 +2630,9 @@ scan_index(Relation indrel, double num_tuples) | ||||
| 						stats->num_pages, stats->num_index_tuples, | ||||
| 						false); | ||||
| 
 | ||||
| 	elog(elevel, "Index %s: Pages %u; Tuples %.0f.\n\t%s", | ||||
| 	elog(elevel, "Index %s: Pages %u, %u free; Tuples %.0f.\n\t%s", | ||||
| 		 RelationGetRelationName(indrel), | ||||
| 		 stats->num_pages, stats->num_index_tuples, | ||||
| 		 stats->num_pages, stats->pages_free, stats->num_index_tuples, | ||||
| 		 vac_show_rusage(&ru0)); | ||||
| 
 | ||||
| 	/*
 | ||||
| @ -2661,6 +2669,7 @@ vacuum_index(VacPageList vacpagelist, Relation indrel, | ||||
| 			 double num_tuples, int keep_tuples) | ||||
| { | ||||
| 	IndexBulkDeleteResult *stats; | ||||
| 	IndexVacuumCleanupInfo vcinfo; | ||||
| 	VacRUsage	ru0; | ||||
| 
 | ||||
| 	vac_init_rusage(&ru0); | ||||
| @ -2668,6 +2677,12 @@ vacuum_index(VacPageList vacpagelist, Relation indrel, | ||||
| 	/* Do bulk deletion */ | ||||
| 	stats = index_bulk_delete(indrel, tid_reaped, (void *) vacpagelist); | ||||
| 
 | ||||
| 	/* Do post-VACUUM cleanup */ | ||||
| 	vcinfo.vacuum_full = true; | ||||
| 	vcinfo.message_level = elevel; | ||||
| 
 | ||||
| 	stats = index_vacuum_cleanup(indrel, &vcinfo, stats); | ||||
| 
 | ||||
| 	if (!stats) | ||||
| 		return; | ||||
| 
 | ||||
| @ -2676,8 +2691,9 @@ vacuum_index(VacPageList vacpagelist, Relation indrel, | ||||
| 						stats->num_pages, stats->num_index_tuples, | ||||
| 						false); | ||||
| 
 | ||||
| 	elog(elevel, "Index %s: Pages %u; Tuples %.0f: Deleted %.0f.\n\t%s", | ||||
| 		 RelationGetRelationName(indrel), stats->num_pages, | ||||
| 	elog(elevel, "Index %s: Pages %u, %u free; Tuples %.0f: Deleted %.0f.\n\t%s", | ||||
| 		 RelationGetRelationName(indrel), | ||||
| 		 stats->num_pages, stats->pages_free, | ||||
| 		 stats->num_index_tuples - keep_tuples, stats->tuples_removed, | ||||
| 		 vac_show_rusage(&ru0)); | ||||
| 
 | ||||
|  | ||||
| @ -31,7 +31,7 @@ | ||||
|  * | ||||
|  * | ||||
|  * IDENTIFICATION | ||||
|  *	  $Header: /cvsroot/pgsql/src/backend/commands/vacuumlazy.c,v 1.23 2002/11/13 00:39:46 momjian Exp $ | ||||
|  *	  $Header: /cvsroot/pgsql/src/backend/commands/vacuumlazy.c,v 1.24 2003/02/22 00:45:05 tgl Exp $ | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| @ -200,7 +200,6 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, | ||||
| 				tups_vacuumed, | ||||
| 				nkeep, | ||||
| 				nunused; | ||||
| 	bool		did_vacuum_index = false; | ||||
| 	int			i; | ||||
| 	VacRUsage	ru0; | ||||
| 
 | ||||
| @ -244,7 +243,6 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, | ||||
| 			/* Remove index entries */ | ||||
| 			for (i = 0; i < nindexes; i++) | ||||
| 				lazy_vacuum_index(Irel[i], vacrelstats); | ||||
| 			did_vacuum_index = true; | ||||
| 			/* Remove tuples from heap */ | ||||
| 			lazy_vacuum_heap(onerel, vacrelstats); | ||||
| 			/* Forget the now-vacuumed tuples, and press on */ | ||||
| @ -415,7 +413,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, | ||||
| 	vacrelstats->rel_tuples = num_tuples; | ||||
| 
 | ||||
| 	/* If any tuples need to be deleted, perform final vacuum cycle */ | ||||
| 	/* XXX put a threshold on min nuber of tuples here? */ | ||||
| 	/* XXX put a threshold on min number of tuples here? */ | ||||
| 	if (vacrelstats->num_dead_tuples > 0) | ||||
| 	{ | ||||
| 		/* Remove index entries */ | ||||
| @ -424,9 +422,9 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, | ||||
| 		/* Remove tuples from heap */ | ||||
| 		lazy_vacuum_heap(onerel, vacrelstats); | ||||
| 	} | ||||
| 	else if (!did_vacuum_index) | ||||
| 	else | ||||
| 	{ | ||||
| 		/* Scan indexes just to update pg_class statistics about them */ | ||||
| 		/* Must do post-vacuum cleanup and statistics update anyway */ | ||||
| 		for (i = 0; i < nindexes; i++) | ||||
| 			lazy_scan_index(Irel[i], vacrelstats); | ||||
| 	} | ||||
| @ -551,42 +549,36 @@ static void | ||||
| lazy_scan_index(Relation indrel, LVRelStats *vacrelstats) | ||||
| { | ||||
| 	IndexBulkDeleteResult *stats; | ||||
| 	IndexVacuumCleanupInfo vcinfo; | ||||
| 	VacRUsage	ru0; | ||||
| 
 | ||||
| 	vac_init_rusage(&ru0); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If the index is not partial, skip the scan, and just assume it has | ||||
| 	 * the same number of tuples as the heap. | ||||
| 	 */ | ||||
| 	if (!vac_is_partial_index(indrel)) | ||||
| 	{ | ||||
| 		vac_update_relstats(RelationGetRelid(indrel), | ||||
| 							RelationGetNumberOfBlocks(indrel), | ||||
| 							vacrelstats->rel_tuples, | ||||
| 							false); | ||||
| 		return; | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If index is unsafe for concurrent access, must lock it; but a | ||||
| 	 * shared lock should be sufficient. | ||||
| 	 * If index is unsafe for concurrent access, must lock it. | ||||
| 	 */ | ||||
| 	if (!indrel->rd_am->amconcurrent) | ||||
| 		LockRelation(indrel, AccessShareLock); | ||||
| 		LockRelation(indrel, AccessExclusiveLock); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Even though we're not planning to delete anything, use the | ||||
| 	 * ambulkdelete call, so that the scan happens within the index AM for | ||||
| 	 * more speed. | ||||
| 	 * Even though we're not planning to delete anything, we use the | ||||
| 	 * ambulkdelete call, because (a) the scan happens within the index AM | ||||
| 	 * for more speed, and (b) it may want to pass private statistics to | ||||
| 	 * the amvacuumcleanup call. | ||||
| 	 */ | ||||
| 	stats = index_bulk_delete(indrel, dummy_tid_reaped, NULL); | ||||
| 
 | ||||
| 	/* Do post-VACUUM cleanup, even though we deleted nothing */ | ||||
| 	vcinfo.vacuum_full = false; | ||||
| 	vcinfo.message_level = elevel; | ||||
| 
 | ||||
| 	stats = index_vacuum_cleanup(indrel, &vcinfo, stats); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Release lock acquired above. | ||||
| 	 */ | ||||
| 	if (!indrel->rd_am->amconcurrent) | ||||
| 		UnlockRelation(indrel, AccessShareLock); | ||||
| 		UnlockRelation(indrel, AccessExclusiveLock); | ||||
| 
 | ||||
| 	if (!stats) | ||||
| 		return; | ||||
| @ -596,9 +588,9 @@ lazy_scan_index(Relation indrel, LVRelStats *vacrelstats) | ||||
| 						stats->num_pages, stats->num_index_tuples, | ||||
| 						false); | ||||
| 
 | ||||
| 	elog(elevel, "Index %s: Pages %u; Tuples %.0f.\n\t%s", | ||||
| 	elog(elevel, "Index %s: Pages %u, %u free; Tuples %.0f.\n\t%s", | ||||
| 		 RelationGetRelationName(indrel), | ||||
| 		 stats->num_pages, stats->num_index_tuples, | ||||
| 		 stats->num_pages, stats->pages_free, stats->num_index_tuples, | ||||
| 		 vac_show_rusage(&ru0)); | ||||
| 
 | ||||
| 	pfree(stats); | ||||
| @ -617,6 +609,7 @@ static void | ||||
| lazy_vacuum_index(Relation indrel, LVRelStats *vacrelstats) | ||||
| { | ||||
| 	IndexBulkDeleteResult *stats; | ||||
| 	IndexVacuumCleanupInfo vcinfo; | ||||
| 	VacRUsage	ru0; | ||||
| 
 | ||||
| 	vac_init_rusage(&ru0); | ||||
| @ -630,27 +623,34 @@ lazy_vacuum_index(Relation indrel, LVRelStats *vacrelstats) | ||||
| 	/* Do bulk deletion */ | ||||
| 	stats = index_bulk_delete(indrel, lazy_tid_reaped, (void *) vacrelstats); | ||||
| 
 | ||||
| 	/* Do post-VACUUM cleanup */ | ||||
| 	vcinfo.vacuum_full = false; | ||||
| 	vcinfo.message_level = elevel; | ||||
| 
 | ||||
| 	stats = index_vacuum_cleanup(indrel, &vcinfo, stats); | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * Release lock acquired above. | ||||
| 	 */ | ||||
| 	if (!indrel->rd_am->amconcurrent) | ||||
| 		UnlockRelation(indrel, AccessExclusiveLock); | ||||
| 
 | ||||
| 	if (!stats) | ||||
| 		return; | ||||
| 
 | ||||
| 	/* now update statistics in pg_class */ | ||||
| 	if (stats) | ||||
| 	{ | ||||
| 	vac_update_relstats(RelationGetRelid(indrel), | ||||
| 						stats->num_pages, stats->num_index_tuples, | ||||
| 						false); | ||||
| 
 | ||||
| 		elog(elevel, "Index %s: Pages %u; Tuples %.0f: Deleted %.0f.\n\t%s", | ||||
| 			 RelationGetRelationName(indrel), stats->num_pages, | ||||
| 	elog(elevel, "Index %s: Pages %u, %u free; Tuples %.0f: Deleted %.0f.\n\t%s", | ||||
| 		 RelationGetRelationName(indrel), | ||||
| 		 stats->num_pages, stats->pages_free, | ||||
| 		 stats->num_index_tuples, stats->tuples_removed, | ||||
| 		 vac_show_rusage(&ru0)); | ||||
| 
 | ||||
| 	pfree(stats); | ||||
| } | ||||
| } | ||||
| 
 | ||||
| /*
 | ||||
|  * lazy_truncate_heap - try to truncate off any empty pages at the end | ||||
|  | ||||
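With this patch, lazy_scan_index always drives ambulkdelete with a callback that never requests deletion, purely so the scan happens inside the index AM and its statistics can flow on to the amvacuumcleanup call. A standalone sketch of that reject-everything callback pattern follows; TupleId, scan_index and the counters are illustrative stand-ins, not backend code.

#include <stdio.h>
#include <stdbool.h>

/* Simplified stand-ins for ItemPointer and the bulk-delete callback type. */
typedef struct TupleId { unsigned int block; unsigned short offset; } TupleId;
typedef bool (*BulkDeleteCallback) (TupleId *tid, void *state);

/* Same spirit as dummy_tid_reaped: never ask for anything to be deleted. */
static bool
dummy_tid_reaped(TupleId *tid, void *state)
{
    (void) tid;
    (void) state;
    return false;
}

/* A toy "index scan" that consults the callback for each entry and counts
 * what survives; gathering these counts is the whole point of running the
 * scan with a dummy callback. */
static double
scan_index(const TupleId *entries, int n, BulkDeleteCallback callback, void *state)
{
    double remaining = 0;

    for (int i = 0; i < n; i++)
    {
        TupleId tid = entries[i];

        if (!callback(&tid, state))
            remaining += 1;     /* kept */
    }
    return remaining;
}

int
main(void)
{
    TupleId entries[] = { {1, 1}, {1, 2}, {2, 1} };

    double kept = scan_index(entries, 3, dummy_tid_reaped, NULL);
    printf("Tuples %.0f\n", kept);      /* all three survive */
    return 0;
}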
| @ -7,7 +7,7 @@ | ||||
|  * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group | ||||
|  * Portions Copyright (c) 1994, Regents of the University of California | ||||
|  * | ||||
|  * $Id: genam.h,v 1.37 2002/09/04 20:31:36 momjian Exp $ | ||||
|  * $Id: genam.h,v 1.38 2003/02/22 00:45:05 tgl Exp $ | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| @ -20,17 +20,32 @@ | ||||
| #include "nodes/primnodes.h" | ||||
| 
 | ||||
| 
 | ||||
| /* Struct for statistics returned by bulk-delete operation */ | ||||
| /*
 | ||||
|  * Struct for statistics returned by bulk-delete operation | ||||
|  * | ||||
|  * This is now also passed to the index AM's vacuum-cleanup operation, | ||||
|  * if it has one, which can modify the results as needed.  Note that | ||||
|  * an index AM could choose to have bulk-delete return a larger struct | ||||
|  * of which this is just the first field; this provides a way for bulk-delete | ||||
|  * to communicate additional private data to vacuum-cleanup. | ||||
|  */ | ||||
| typedef struct IndexBulkDeleteResult | ||||
| { | ||||
| 	BlockNumber num_pages;		/* pages remaining in index */ | ||||
| 	double		num_index_tuples;		/* tuples remaining */ | ||||
| 	double		tuples_removed; /* # removed by bulk-delete operation */ | ||||
| 	double		num_index_tuples;		/* # remaining */ | ||||
| 	BlockNumber	pages_free;		/* # unused pages in index */ | ||||
| } IndexBulkDeleteResult; | ||||
| 
 | ||||
| /* Typedef for callback function to determine if a tuple is bulk-deletable */ | ||||
| typedef bool (*IndexBulkDeleteCallback) (ItemPointer itemptr, void *state); | ||||
| 
 | ||||
| /* Struct for additional arguments passed to vacuum-cleanup operation */ | ||||
| typedef struct IndexVacuumCleanupInfo | ||||
| { | ||||
| 	bool		vacuum_full;	/* VACUUM FULL (we have exclusive lock) */ | ||||
| 	int			message_level;	/* elog level for progress messages */ | ||||
| } IndexVacuumCleanupInfo; | ||||
| 
 | ||||
| /* Struct for heap-or-index scans of system tables */ | ||||
| typedef struct SysScanDescData | ||||
| @ -72,6 +87,9 @@ extern bool index_getnext_indexitem(IndexScanDesc scan, | ||||
| extern IndexBulkDeleteResult *index_bulk_delete(Relation indexRelation, | ||||
| 				  IndexBulkDeleteCallback callback, | ||||
| 				  void *callback_state); | ||||
| extern IndexBulkDeleteResult *index_vacuum_cleanup(Relation indexRelation, | ||||
| 				  IndexVacuumCleanupInfo *info, | ||||
| 				  IndexBulkDeleteResult *stats); | ||||
| extern RegProcedure index_cost_estimator(Relation indexRelation); | ||||
| extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum, | ||||
| 				uint16 procnum); | ||||
|  | ||||
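The expanded comment on IndexBulkDeleteResult describes the extension trick: an AM may have bulk-delete return a larger struct whose first member is the public result, which lets it pass private data through to vacuum-cleanup. A standalone sketch of that first-member idiom; BTPrivateStats and half_dead_pages are hypothetical names for illustration, not anything this patch defines.

#include <stdio.h>
#include <stdlib.h>

typedef struct BulkDeleteResult
{
    unsigned int num_pages;
    double       num_index_tuples;
    double       tuples_removed;
    unsigned int pages_free;
} BulkDeleteResult;

/* Hypothetical AM-private extension: the base struct MUST be the first
 * member, so a pointer to the whole thing is also a valid pointer to the
 * public part. */
typedef struct BTPrivateStats
{
    BulkDeleteResult std;               /* public part, first */
    unsigned int     half_dead_pages;   /* private extra bookkeeping */
} BTPrivateStats;

static BulkDeleteResult *
am_bulk_delete(void)
{
    BTPrivateStats *priv = calloc(1, sizeof(BTPrivateStats));

    priv->std.num_pages = 64;
    priv->std.num_index_tuples = 5000;
    priv->half_dead_pages = 2;          /* remembered for the cleanup pass */
    return &priv->std;                  /* caller only sees the base type */
}

static BulkDeleteResult *
am_vacuum_cleanup(BulkDeleteResult *stats)
{
    /* The AM knows the real layout and can recover its private data. */
    BTPrivateStats *priv = (BTPrivateStats *) stats;

    priv->std.pages_free = priv->half_dead_pages;   /* pretend we freed them */
    return stats;
}

int
main(void)
{
    BulkDeleteResult *stats = am_bulk_delete();

    stats = am_vacuum_cleanup(stats);
    printf("pages_free = %u\n", stats->pages_free);
    free(stats);
    return 0;
}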
| @ -7,7 +7,7 @@ | ||||
|  * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group | ||||
|  * Portions Copyright (c) 1994, Regents of the University of California | ||||
|  * | ||||
|  * $Id: nbtree.h,v 1.64 2003/02/21 00:06:22 tgl Exp $ | ||||
|  * $Id: nbtree.h,v 1.65 2003/02/22 00:45:05 tgl Exp $ | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| @ -54,6 +54,7 @@ typedef BTPageOpaqueData *BTPageOpaque; | ||||
| #define BTP_ROOT		(1 << 1)	/* root page (has no parent) */ | ||||
| #define BTP_DELETED		(1 << 2)	/* page has been deleted from tree */ | ||||
| #define BTP_META		(1 << 3)	/* meta-page */ | ||||
| #define BTP_HALF_DEAD	(1 << 4)	/* empty, but still in tree */ | ||||
| 
 | ||||
| 
 | ||||
| /*
 | ||||
| @ -124,12 +125,13 @@ typedef BTItemData *BTItem; | ||||
| #define SizeOfBTItem	sizeof(BTItemData) | ||||
| 
 | ||||
| /* Test whether items are the "same" per the above notes */ | ||||
| #define BTItemSame(i1, i2)	  ( (i1)->bti_itup.t_tid.ip_blkid.bi_hi == \ | ||||
| 								(i2)->bti_itup.t_tid.ip_blkid.bi_hi && \ | ||||
| 								(i1)->bti_itup.t_tid.ip_blkid.bi_lo == \ | ||||
| 								(i2)->bti_itup.t_tid.ip_blkid.bi_lo && \ | ||||
| 								(i1)->bti_itup.t_tid.ip_posid == \ | ||||
| 								(i2)->bti_itup.t_tid.ip_posid ) | ||||
| #define BTTidSame(i1, i2)	\ | ||||
| 	( (i1).ip_blkid.bi_hi == (i2).ip_blkid.bi_hi && \ | ||||
| 	  (i1).ip_blkid.bi_lo == (i2).ip_blkid.bi_lo && \ | ||||
| 	  (i1).ip_posid == (i2).ip_posid ) | ||||
| #define BTItemSame(i1, i2)	\ | ||||
| 	BTTidSame((i1)->bti_itup.t_tid, (i2)->bti_itup.t_tid) | ||||
| 
 | ||||
| 
 | ||||
| /*
 | ||||
|  *	In general, the btree code tries to localize its knowledge about | ||||
| @ -150,6 +152,7 @@ typedef BTItemData *BTItem; | ||||
| #define P_ISLEAF(opaque)		((opaque)->btpo_flags & BTP_LEAF) | ||||
| #define P_ISROOT(opaque)		((opaque)->btpo_flags & BTP_ROOT) | ||||
| #define P_ISDELETED(opaque)		((opaque)->btpo_flags & BTP_DELETED) | ||||
| #define P_IGNORE(opaque)		((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) | ||||
| 
 | ||||
| /*
 | ||||
|  *	Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost | ||||
| @ -412,8 +415,6 @@ typedef BTScanOpaqueData *BTScanOpaque; | ||||
| /*
 | ||||
|  * prototypes for functions in nbtree.c (external entry points for btree) | ||||
|  */ | ||||
| extern bool BuildingBtree;		/* in nbtree.c */ | ||||
| 
 | ||||
| extern void AtEOXact_nbtree(void); | ||||
| 
 | ||||
| extern Datum btbuild(PG_FUNCTION_ARGS); | ||||
| @ -426,6 +427,7 @@ extern Datum btendscan(PG_FUNCTION_ARGS); | ||||
| extern Datum btmarkpos(PG_FUNCTION_ARGS); | ||||
| extern Datum btrestrpos(PG_FUNCTION_ARGS); | ||||
| extern Datum btbulkdelete(PG_FUNCTION_ARGS); | ||||
| extern Datum btvacuumcleanup(PG_FUNCTION_ARGS); | ||||
| 
 | ||||
| /*
 | ||||
|  * prototypes for functions in nbtinsert.c | ||||
|  | ||||
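The nbtree.h changes add the BTP_HALF_DEAD flag, fold it into a new P_IGNORE test, and rebuild BTItemSame on top of a BTTidSame macro that compares the two block-id halves and the offset. A standalone sketch of the same flag test and field-by-field TID comparison, using simplified stand-in types rather than the backend's ItemPointerData:

#include <stdio.h>

/* Simplified stand-ins for the btree page flag bits used above. */
#define BTP_DELETED    (1 << 2)
#define BTP_HALF_DEAD  (1 << 4)
#define P_IGNORE(flags)  ((flags) & (BTP_DELETED | BTP_HALF_DEAD))

/* Simplified TID: block number split hi/lo plus an offset. */
typedef struct Tid { unsigned short bi_hi, bi_lo, ip_posid; } Tid;

/* Same shape as the new BTTidSame macro: compare field by field. */
#define TID_SAME(a, b) \
    ((a).bi_hi == (b).bi_hi && \
     (a).bi_lo == (b).bi_lo && \
     (a).ip_posid == (b).ip_posid)

int
main(void)
{
    Tid a = {0, 7, 3};
    Tid b = {0, 7, 3};
    unsigned short flags = BTP_HALF_DEAD;

    printf("same tid:    %s\n", TID_SAME(a, b) ? "yes" : "no");
    printf("ignore page: %s\n", P_IGNORE(flags) ? "yes" : "no");
    return 0;
}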
| @ -6,7 +6,7 @@ | ||||
|  * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group | ||||
|  * Portions Copyright (c) 1994, Regents of the University of California | ||||
|  * | ||||
|  * $Id: xlog.h,v 1.41 2003/02/21 00:06:22 tgl Exp $ | ||||
|  * $Id: xlog.h,v 1.42 2003/02/22 00:45:05 tgl Exp $ | ||||
|  */ | ||||
| #ifndef XLOG_H | ||||
| #define XLOG_H | ||||
| @ -56,17 +56,18 @@ typedef struct XLogRecord | ||||
| #define XLR_INFO_MASK			0x0F | ||||
| 
 | ||||
| /*
 | ||||
|  * We support backup of up to 2 disk blocks per XLOG record (could support | ||||
|  * more if we cared to dedicate more xl_info bits for this purpose; currently | ||||
|  * do not need more than 2 anyway).  If we backed up any disk blocks then we | ||||
|  * use flag bits in xl_info to signal it. | ||||
|  * If we backed up any disk blocks with the XLOG record, we use flag bits in | ||||
|  * xl_info to signal it.  We support backup of up to 3 disk blocks per XLOG | ||||
|  * record.  (Could support 4 if we cared to dedicate all the xl_info bits for | ||||
|  * this purpose; currently bit 0 of xl_info is unused and available.) | ||||
|  */ | ||||
| #define XLR_BKP_BLOCK_MASK		0x0C	/* all info bits used for bkp | ||||
| #define XLR_BKP_BLOCK_MASK		0x0E	/* all info bits used for bkp | ||||
| 										 * blocks */ | ||||
| #define XLR_MAX_BKP_BLOCKS		2 | ||||
| #define XLR_MAX_BKP_BLOCKS		3 | ||||
| #define XLR_SET_BKP_BLOCK(iblk) (0x08 >> (iblk)) | ||||
| #define XLR_BKP_BLOCK_1			XLR_SET_BKP_BLOCK(0)	/* 0x08 */ | ||||
| #define XLR_BKP_BLOCK_2			XLR_SET_BKP_BLOCK(1)	/* 0x04 */ | ||||
| #define XLR_BKP_BLOCK_3			XLR_SET_BKP_BLOCK(2)	/* 0x02 */ | ||||
| 
 | ||||
| /*
 | ||||
|  * Sometimes we log records which are out of transaction control. | ||||
|  | ||||
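The xlog.h hunk widens the backup-block flag space from two to three blocks: XLR_SET_BKP_BLOCK assigns bits top-down starting at 0x08, and the mask grows from 0x0C to 0x0E, leaving bit 0x01 unused. A small self-checking C sketch of that bit layout (the macro values are copied from the hunk; the rest is scaffolding):

#include <stdio.h>
#include <assert.h>

#define XLR_BKP_BLOCK_MASK      0x0E
#define XLR_MAX_BKP_BLOCKS      3
#define XLR_SET_BKP_BLOCK(iblk) (0x08 >> (iblk))

int
main(void)
{
    /* Bits are assigned top-down: block 0 -> 0x08, block 1 -> 0x04, block 2 -> 0x02. */
    assert(XLR_SET_BKP_BLOCK(0) == 0x08);
    assert(XLR_SET_BKP_BLOCK(1) == 0x04);
    assert(XLR_SET_BKP_BLOCK(2) == 0x02);

    /* All three flag bits together form the mask; bit 0x01 stays free. */
    unsigned int all = 0;
    for (int i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
        all |= XLR_SET_BKP_BLOCK(i);
    assert(all == XLR_BKP_BLOCK_MASK);

    printf("mask = 0x%02X\n", all);
    return 0;
}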
| @ -37,7 +37,7 @@ | ||||
|  * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group | ||||
|  * Portions Copyright (c) 1994, Regents of the University of California | ||||
|  * | ||||
|  * $Id: catversion.h,v 1.178 2003/02/21 00:06:22 tgl Exp $ | ||||
|  * $Id: catversion.h,v 1.179 2003/02/22 00:45:05 tgl Exp $ | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
| @ -53,6 +53,6 @@ | ||||
|  */ | ||||
| 
 | ||||
| /*							yyyymmddN */ | ||||
| #define CATALOG_VERSION_NO	200302171 | ||||
| #define CATALOG_VERSION_NO	200302211 | ||||
| 
 | ||||
| #endif | ||||
|  | ||||
| @ -8,7 +8,7 @@ | ||||
|  * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group | ||||
|  * Portions Copyright (c) 1994, Regents of the University of California | ||||
|  * | ||||
|  * $Id: pg_am.h,v 1.23 2002/07/29 22:14:11 tgl Exp $ | ||||
|  * $Id: pg_am.h,v 1.24 2003/02/22 00:45:05 tgl Exp $ | ||||
|  * | ||||
|  * NOTES | ||||
|  *		the genbki.sh script reads this file and generates .bki | ||||
| @ -58,6 +58,7 @@ CATALOG(pg_am) | ||||
| 	regproc		amrestrpos;		/* "restore marked scan position" function */ | ||||
| 	regproc		ambuild;		/* "build new index" function */ | ||||
| 	regproc		ambulkdelete;	/* bulk-delete function */ | ||||
| 	regproc		amvacuumcleanup; /* post-VACUUM cleanup function */ | ||||
| 	regproc		amcostestimate; /* estimate cost of an indexscan */ | ||||
| } FormData_pg_am; | ||||
| 
 | ||||
| @ -72,7 +73,7 @@ typedef FormData_pg_am *Form_pg_am; | ||||
|  *		compiler constants for pg_am | ||||
|  * ---------------- | ||||
|  */ | ||||
| #define Natts_pg_am						19 | ||||
| #define Natts_pg_am						20 | ||||
| #define Anum_pg_am_amname				1 | ||||
| #define Anum_pg_am_amowner				2 | ||||
| #define Anum_pg_am_amstrategies			3 | ||||
| @ -91,21 +92,22 @@ typedef FormData_pg_am *Form_pg_am; | ||||
| #define Anum_pg_am_amrestrpos			16 | ||||
| #define Anum_pg_am_ambuild				17 | ||||
| #define Anum_pg_am_ambulkdelete			18 | ||||
| #define Anum_pg_am_amcostestimate		19 | ||||
| #define Anum_pg_am_amvacuumcleanup		19 | ||||
| #define Anum_pg_am_amcostestimate		20 | ||||
| 
 | ||||
| /* ----------------
 | ||||
|  *		initial contents of pg_am | ||||
|  * ---------------- | ||||
|  */ | ||||
| 
 | ||||
| DATA(insert OID = 402 (  rtree	PGUID	8 3 0 f f f f rtgettuple rtinsert rtbeginscan rtrescan rtendscan rtmarkpos rtrestrpos rtbuild rtbulkdelete rtcostestimate )); | ||||
| DATA(insert OID = 402 (  rtree	PGUID	8 3 0 f f f f rtgettuple rtinsert rtbeginscan rtrescan rtendscan rtmarkpos rtrestrpos rtbuild rtbulkdelete - rtcostestimate )); | ||||
| DESCR("r-tree index access method"); | ||||
| DATA(insert OID = 403 (  btree	PGUID	5 1 1 t t t t btgettuple btinsert btbeginscan btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btcostestimate )); | ||||
| DATA(insert OID = 403 (  btree	PGUID	5 1 1 t t t t btgettuple btinsert btbeginscan btrescan btendscan btmarkpos btrestrpos btbuild btbulkdelete btvacuumcleanup btcostestimate )); | ||||
| DESCR("b-tree index access method"); | ||||
| #define BTREE_AM_OID 403 | ||||
| DATA(insert OID = 405 (  hash	PGUID	1 1 0 f f f t hashgettuple hashinsert hashbeginscan hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete hashcostestimate )); | ||||
| DATA(insert OID = 405 (  hash	PGUID	1 1 0 f f f t hashgettuple hashinsert hashbeginscan hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbulkdelete - hashcostestimate )); | ||||
| DESCR("hash index access method"); | ||||
| DATA(insert OID = 783 (  gist	PGUID 100 7 0 f t f f gistgettuple gistinsert gistbeginscan gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete gistcostestimate )); | ||||
| DATA(insert OID = 783 (  gist	PGUID 100 7 0 f t f f gistgettuple gistinsert gistbeginscan gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbulkdelete - gistcostestimate )); | ||||
| DESCR("GiST index access method"); | ||||
| #define GIST_AM_OID 783 | ||||
| 
 | ||||
|  | ||||
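In the updated pg_am rows, only btree registers an amvacuumcleanup function; rtree, hash and gist carry a "-" in that slot, so the generic layer has to treat a missing cleanup function as a no-op and return the bulk-delete stats unchanged. A standalone sketch of that optional-entry-point dispatch, modeled with a plain function pointer instead of the catalog's regproc lookup (index_vacuum_cleanup_sketch and bt_cleanup are illustrative names, not backend functions):

#include <stdio.h>
#include <stddef.h>

typedef struct BulkDeleteResult { unsigned int num_pages, pages_free; } BulkDeleteResult;
typedef struct CleanupInfo { int message_level; } CleanupInfo;
typedef BulkDeleteResult *(*VacuumCleanupFn) (CleanupInfo *info, BulkDeleteResult *stats);

/* Modeled dispatcher: if the AM registered no cleanup function (a "-" in its
 * pg_am row), just hand the bulk-delete stats back unchanged. */
static BulkDeleteResult *
index_vacuum_cleanup_sketch(VacuumCleanupFn fn, CleanupInfo *info, BulkDeleteResult *stats)
{
    if (fn == NULL)
        return stats;
    return fn(info, stats);
}

/* Toy stand-in for a btree-style cleanup pass. */
static BulkDeleteResult *
bt_cleanup(CleanupInfo *info, BulkDeleteResult *stats)
{
    (void) info;
    if (stats != NULL)
        stats->pages_free = 1;      /* pretend one page was scavenged */
    return stats;
}

int
main(void)
{
    BulkDeleteResult stats = { .num_pages = 10, .pages_free = 0 };
    CleanupInfo info = { .message_level = 1 };

    /* rtree/hash/gist style: no cleanup function registered */
    index_vacuum_cleanup_sketch(NULL, &info, &stats);
    printf("without cleanup: pages_free = %u\n", stats.pages_free);

    /* btree style: a cleanup function is registered */
    index_vacuum_cleanup_sketch(bt_cleanup, &info, &stats);
    printf("with cleanup:    pages_free = %u\n", stats.pages_free);
    return 0;
}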
| @ -7,7 +7,7 @@ | ||||
|  * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group | ||||
|  * Portions Copyright (c) 1994, Regents of the University of California | ||||
|  * | ||||
|  * $Id: pg_proc.h,v 1.283 2003/02/13 05:24:02 momjian Exp $ | ||||
|  * $Id: pg_proc.h,v 1.284 2003/02/22 00:45:05 tgl Exp $ | ||||
|  * | ||||
|  * NOTES | ||||
|  *	  The script catalog/genbki.sh reads this file and generates .bki | ||||
| @ -710,6 +710,8 @@ DATA(insert OID = 338 (  btbuild		   PGNSP PGUID 12 f f t f v 3 2278 "2281 2281 | ||||
| DESCR("btree(internal)"); | ||||
| DATA(insert OID = 332 (  btbulkdelete	   PGNSP PGUID 12 f f t f v 3 2281 "2281 2281 2281" btbulkdelete - _null_ )); | ||||
| DESCR("btree(internal)"); | ||||
| DATA(insert OID = 972 (  btvacuumcleanup   PGNSP PGUID 12 f f t f v 3 2281 "2281 2281 2281" btvacuumcleanup - _null_ )); | ||||
| DESCR("btree(internal)"); | ||||
| DATA(insert OID = 1268 (  btcostestimate   PGNSP PGUID 12 f f t f v 8 2278 "2281 2281 2281 2281 2281 2281 2281 2281"  btcostestimate - _null_ )); | ||||
| DESCR("btree(internal)"); | ||||
| 
 | ||||
|  | ||||