PostgreSQL/src/backend/access/gin/ginentrypage.c
2008-01-01 19:46:01 +00:00

610 lines
15 KiB
C

/*-------------------------------------------------------------------------
*
* ginentrypage.c
* page utilities routines for the postgres inverted index access method.
*
*
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/gin/ginentrypage.c,v 1.12 2008/01/01 19:45:46 momjian Exp $
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/gin.h"
#include "access/tuptoaster.h"
/*
* forms tuple for entry tree. On leaf page, Index tuple has
* non-traditional layout. Tuple may contain posting list or
* root blocknumber of posting tree. Macros GinIsPostingTre: (itup) / GinSetPostingTree(itup, blkno)
* 1) Posting list
* - itup->t_info & INDEX_SIZE_MASK contains size of tuple as usual
* - ItemPointerGetBlockNumber(&itup->t_tid) contains original
* size of tuple (without posting list).
* Macroses: GinGetOrigSizePosting(itup) / GinSetOrigSizePosting(itup,n)
* - ItemPointerGetOffsetNumber(&itup->t_tid) contains number
* of elements in posting list (number of heap itempointer)
* Macroses: GinGetNPosting(itup) / GinSetNPosting(itup,n)
* - After usual part of tuple there is a posting list
* Macros: GinGetPosting(itup)
* 2) Posting tree
* - itup->t_info & INDEX_SIZE_MASK contains size of tuple as usual
* - ItemPointerGetBlockNumber(&itup->t_tid) contains block number of
* root of posting tree
* - ItemPointerGetOffsetNumber(&itup->t_tid) contains magic number GIN_TREE_POSTING
*/
IndexTuple
GinFormTuple(GinState *ginstate, Datum key, ItemPointerData *ipd, uint32 nipd)
{
bool isnull = FALSE;
IndexTuple itup;
itup = index_form_tuple(ginstate->tupdesc, &key, &isnull);
GinSetOrigSizePosting(itup, IndexTupleSize(itup));
if (nipd > 0)
{
uint32 newsize = MAXALIGN(SHORTALIGN(IndexTupleSize(itup)) + sizeof(ItemPointerData) * nipd);
if (newsize >= INDEX_SIZE_MASK)
return NULL;
if (newsize > TOAST_INDEX_TARGET && nipd > 1)
return NULL;
itup = repalloc(itup, newsize);
/* set new size */
itup->t_info &= ~INDEX_SIZE_MASK;
itup->t_info |= newsize;
if (ipd)
memcpy(GinGetPosting(itup), ipd, sizeof(ItemPointerData) * nipd);
GinSetNPosting(itup, nipd);
}
else
{
GinSetNPosting(itup, 0);
}
return itup;
}
/*
* Entry tree is a "static", ie tuple never deletes from it,
* so we don't use right bound, we use rightest key instead.
*/
static IndexTuple
getRightMostTuple(Page page)
{
OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
return (IndexTuple) PageGetItem(page, PageGetItemId(page, maxoff));
}
Datum
ginGetHighKey(GinState *ginstate, Page page)
{
IndexTuple itup;
bool isnull;
itup = getRightMostTuple(page);
return index_getattr(itup, FirstOffsetNumber, ginstate->tupdesc, &isnull);
}
static bool
entryIsMoveRight(GinBtree btree, Page page)
{
Datum highkey;
if (GinPageRightMost(page))
return FALSE;
highkey = ginGetHighKey(btree->ginstate, page);
if (compareEntries(btree->ginstate, btree->entryValue, highkey) > 0)
return TRUE;
return FALSE;
}
/*
* Find correct tuple in non-leaf page. It supposed that
* page correctly choosen and searching value SHOULD be on page
*/
static BlockNumber
entryLocateEntry(GinBtree btree, GinBtreeStack *stack)
{
OffsetNumber low,
high,
maxoff;
IndexTuple itup = NULL;
int result;
Page page = BufferGetPage(stack->buffer);
Assert(!GinPageIsLeaf(page));
Assert(!GinPageIsData(page));
if (btree->fullScan)
{
stack->off = FirstOffsetNumber;
stack->predictNumber *= PageGetMaxOffsetNumber(page);
return btree->getLeftMostPage(btree, page);
}
low = FirstOffsetNumber;
maxoff = high = PageGetMaxOffsetNumber(page);
Assert(high >= low);
high++;
while (high > low)
{
OffsetNumber mid = low + ((high - low) / 2);
if (mid == maxoff && GinPageRightMost(page))
/* Right infinity */
result = -1;
else
{
bool isnull;
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, mid));
result = compareEntries(btree->ginstate, btree->entryValue,
index_getattr(itup, FirstOffsetNumber, btree->ginstate->tupdesc, &isnull));
}
if (result == 0)
{
stack->off = mid;
Assert(GinItemPointerGetBlockNumber(&(itup)->t_tid) != GIN_ROOT_BLKNO);
return GinItemPointerGetBlockNumber(&(itup)->t_tid);
}
else if (result > 0)
low = mid + 1;
else
high = mid;
}
Assert(high >= FirstOffsetNumber && high <= maxoff);
stack->off = high;
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, high));
Assert(GinItemPointerGetBlockNumber(&(itup)->t_tid) != GIN_ROOT_BLKNO);
return GinItemPointerGetBlockNumber(&(itup)->t_tid);
}
/*
* Searches correct position for value on leaf page.
* Page should be corrrectly choosen.
* Returns true if value found on page.
*/
static bool
entryLocateLeafEntry(GinBtree btree, GinBtreeStack *stack)
{
Page page = BufferGetPage(stack->buffer);
OffsetNumber low,
high;
IndexTuple itup;
Assert(GinPageIsLeaf(page));
Assert(!GinPageIsData(page));
if (btree->fullScan)
{
stack->off = FirstOffsetNumber;
return TRUE;
}
low = FirstOffsetNumber;
high = PageGetMaxOffsetNumber(page);
if (high < low)
{
stack->off = FirstOffsetNumber;
return false;
}
high++;
while (high > low)
{
OffsetNumber mid = low + ((high - low) / 2);
bool isnull;
int result;
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, mid));
result = compareEntries(btree->ginstate, btree->entryValue,
index_getattr(itup, FirstOffsetNumber, btree->ginstate->tupdesc, &isnull));
if (result == 0)
{
stack->off = mid;
return true;
}
else if (result > 0)
low = mid + 1;
else
high = mid;
}
stack->off = high;
return false;
}
static OffsetNumber
entryFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber storedOff)
{
OffsetNumber i,
maxoff = PageGetMaxOffsetNumber(page);
IndexTuple itup;
Assert(!GinPageIsLeaf(page));
Assert(!GinPageIsData(page));
/* if page isn't changed, we returns storedOff */
if (storedOff >= FirstOffsetNumber && storedOff <= maxoff)
{
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, storedOff));
if (GinItemPointerGetBlockNumber(&(itup)->t_tid) == blkno)
return storedOff;
/*
* we hope, that needed pointer goes to right. It's true if there
* wasn't a deletion
*/
for (i = storedOff + 1; i <= maxoff; i++)
{
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
if (GinItemPointerGetBlockNumber(&(itup)->t_tid) == blkno)
return i;
}
maxoff = storedOff - 1;
}
/* last chance */
for (i = FirstOffsetNumber; i <= maxoff; i++)
{
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
if (GinItemPointerGetBlockNumber(&(itup)->t_tid) == blkno)
return i;
}
return InvalidOffsetNumber;
}
static BlockNumber
entryGetLeftMostPage(GinBtree btree, Page page)
{
IndexTuple itup;
Assert(!GinPageIsLeaf(page));
Assert(!GinPageIsData(page));
Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber);
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, FirstOffsetNumber));
return GinItemPointerGetBlockNumber(&(itup)->t_tid);
}
static bool
entryIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off)
{
Size itupsz = 0;
Page page = BufferGetPage(buf);
Assert(btree->entry);
Assert(!GinPageIsData(page));
if (btree->isDelete)
{
IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
itupsz = MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
}
if (PageGetFreeSpace(page) + itupsz >= MAXALIGN(IndexTupleSize(btree->entry)) + sizeof(ItemIdData))
return true;
return false;
}
/*
* Delete tuple on leaf page if tuples was existed and we
* should update it, update old child blkno to new right page
* if child split is occured
*/
static BlockNumber
entryPreparePage(GinBtree btree, Page page, OffsetNumber off)
{
BlockNumber ret = InvalidBlockNumber;
Assert(btree->entry);
Assert(!GinPageIsData(page));
if (btree->isDelete)
{
Assert(GinPageIsLeaf(page));
PageIndexTupleDelete(page, off);
}
if (!GinPageIsLeaf(page) && btree->rightblkno != InvalidBlockNumber)
{
IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
ItemPointerSet(&itup->t_tid, btree->rightblkno, InvalidOffsetNumber);
ret = btree->rightblkno;
}
btree->rightblkno = InvalidBlockNumber;
return ret;
}
/*
* Place tuple on page and fills WAL record
*/
static void
entryPlaceToPage(GinBtree btree, Buffer buf, OffsetNumber off, XLogRecData **prdata)
{
Page page = BufferGetPage(buf);
static XLogRecData rdata[3];
OffsetNumber placed;
static ginxlogInsert data;
int cnt = 0;
*prdata = rdata;
data.updateBlkno = entryPreparePage(btree, page, off);
placed = PageAddItem(page, (Item) btree->entry, IndexTupleSize(btree->entry), off, false, false);
if (placed != off)
elog(ERROR, "failed to add item to index page in \"%s\"",
RelationGetRelationName(btree->index));
data.node = btree->index->rd_node;
data.blkno = BufferGetBlockNumber(buf);
data.offset = off;
data.nitem = 1;
data.isDelete = btree->isDelete;
data.isData = false;
data.isLeaf = GinPageIsLeaf(page) ? TRUE : FALSE;
/*
* Prevent full page write if child's split occurs. That is needed to
* remove incomplete splits while replaying WAL
*
* data.updateBlkno contains new block number (of newly created right
* page) for recently splited page.
*/
if (data.updateBlkno == InvalidBlockNumber)
{
rdata[0].buffer = buf;
rdata[0].buffer_std = TRUE;
rdata[0].data = NULL;
rdata[0].len = 0;
rdata[0].next = &rdata[1];
cnt++;
}
rdata[cnt].buffer = InvalidBuffer;
rdata[cnt].data = (char *) &data;
rdata[cnt].len = sizeof(ginxlogInsert);
rdata[cnt].next = &rdata[cnt + 1];
cnt++;
rdata[cnt].buffer = InvalidBuffer;
rdata[cnt].data = (char *) btree->entry;
rdata[cnt].len = IndexTupleSize(btree->entry);
rdata[cnt].next = NULL;
btree->entry = NULL;
}
/*
* Returns new tuple with copied value from source tuple.
* New tuple will not store posting list
*/
static IndexTuple
copyIndexTuple(IndexTuple itup, Page page)
{
IndexTuple nitup;
if (GinPageIsLeaf(page) && !GinIsPostingTree(itup))
{
nitup = (IndexTuple) palloc(MAXALIGN(GinGetOrigSizePosting(itup)));
memcpy(nitup, itup, GinGetOrigSizePosting(itup));
nitup->t_info &= ~INDEX_SIZE_MASK;
nitup->t_info |= GinGetOrigSizePosting(itup);
}
else
{
nitup = (IndexTuple) palloc(MAXALIGN(IndexTupleSize(itup)));
memcpy(nitup, itup, IndexTupleSize(itup));
}
return nitup;
}
/*
* Place tuple and split page, original buffer(lbuf) leaves untouched,
* returns shadow page of lbuf filled new data.
* Tuples are distributed between pages by equal size on its, not
* an equal number!
*/
static Page
entrySplitPage(GinBtree btree, Buffer lbuf, Buffer rbuf, OffsetNumber off, XLogRecData **prdata)
{
static XLogRecData rdata[2];
OffsetNumber i,
maxoff,
separator = InvalidOffsetNumber;
Size totalsize = 0;
Size lsize = 0,
size;
static char tupstore[2 * BLCKSZ];
char *ptr;
IndexTuple itup,
leftrightmost = NULL;
static ginxlogSplit data;
Page page;
Page lpage = GinPageGetCopyPage(BufferGetPage(lbuf));
Page rpage = BufferGetPage(rbuf);
Size pageSize = PageGetPageSize(lpage);
*prdata = rdata;
data.leftChildBlkno = (GinPageIsLeaf(lpage)) ?
InvalidOffsetNumber : GinItemPointerGetBlockNumber(&(btree->entry->t_tid));
data.updateBlkno = entryPreparePage(btree, lpage, off);
maxoff = PageGetMaxOffsetNumber(lpage);
ptr = tupstore;
for (i = FirstOffsetNumber; i <= maxoff; i++)
{
if (i == off)
{
size = MAXALIGN(IndexTupleSize(btree->entry));
memcpy(ptr, btree->entry, size);
ptr += size;
totalsize += size + sizeof(ItemIdData);
}
itup = (IndexTuple) PageGetItem(lpage, PageGetItemId(lpage, i));
size = MAXALIGN(IndexTupleSize(itup));
memcpy(ptr, itup, size);
ptr += size;
totalsize += size + sizeof(ItemIdData);
}
if (off == maxoff + 1)
{
size = MAXALIGN(IndexTupleSize(btree->entry));
memcpy(ptr, btree->entry, size);
ptr += size;
totalsize += size + sizeof(ItemIdData);
}
GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize);
GinInitPage(lpage, GinPageGetOpaque(rpage)->flags, pageSize);
ptr = tupstore;
maxoff++;
lsize = 0;
page = lpage;
for (i = FirstOffsetNumber; i <= maxoff; i++)
{
itup = (IndexTuple) ptr;
if (lsize > totalsize / 2)
{
if (separator == InvalidOffsetNumber)
separator = i - 1;
page = rpage;
}
else
{
leftrightmost = itup;
lsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
}
if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
elog(ERROR, "failed to add item to index page in \"%s\"",
RelationGetRelationName(btree->index));
ptr += MAXALIGN(IndexTupleSize(itup));
}
btree->entry = copyIndexTuple(leftrightmost, lpage);
ItemPointerSet(&(btree->entry)->t_tid, BufferGetBlockNumber(lbuf), InvalidOffsetNumber);
btree->rightblkno = BufferGetBlockNumber(rbuf);
data.node = btree->index->rd_node;
data.rootBlkno = InvalidBlockNumber;
data.lblkno = BufferGetBlockNumber(lbuf);
data.rblkno = BufferGetBlockNumber(rbuf);
data.separator = separator;
data.nitem = maxoff;
data.isData = FALSE;
data.isLeaf = GinPageIsLeaf(lpage) ? TRUE : FALSE;
data.isRootSplit = FALSE;
rdata[0].buffer = InvalidBuffer;
rdata[0].data = (char *) &data;
rdata[0].len = sizeof(ginxlogSplit);
rdata[0].next = &rdata[1];
rdata[1].buffer = InvalidBuffer;
rdata[1].data = tupstore;
rdata[1].len = MAXALIGN(totalsize);
rdata[1].next = NULL;
return lpage;
}
/*
* return newly allocate rightmost tuple
*/
IndexTuple
ginPageGetLinkItup(Buffer buf)
{
IndexTuple itup,
nitup;
Page page = BufferGetPage(buf);
itup = getRightMostTuple(page);
nitup = copyIndexTuple(itup, page);
ItemPointerSet(&nitup->t_tid, BufferGetBlockNumber(buf), InvalidOffsetNumber);
return nitup;
}
/*
* Fills new root by rightest values from child.
* Also called from ginxlog, should not use btree
*/
void
entryFillRoot(GinBtree btree, Buffer root, Buffer lbuf, Buffer rbuf)
{
Page page;
IndexTuple itup;
page = BufferGetPage(root);
itup = ginPageGetLinkItup(lbuf);
if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
elog(ERROR, "failed to add item to index root page");
itup = ginPageGetLinkItup(rbuf);
if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
elog(ERROR, "failed to add item to index root page");
}
void
prepareEntryScan(GinBtree btree, Relation index, Datum value, GinState *ginstate)
{
memset(btree, 0, sizeof(GinBtreeData));
btree->isMoveRight = entryIsMoveRight;
btree->findChildPage = entryLocateEntry;
btree->findItem = entryLocateLeafEntry;
btree->findChildPtr = entryFindChildPtr;
btree->getLeftMostPage = entryGetLeftMostPage;
btree->isEnoughSpace = entryIsEnoughSpace;
btree->placeToPage = entryPlaceToPage;
btree->splitPage = entrySplitPage;
btree->fillRoot = entryFillRoot;
btree->index = index;
btree->ginstate = ginstate;
btree->entryValue = value;
btree->isDelete = FALSE;
btree->searchMode = FALSE;
btree->fullScan = FALSE;
btree->isBuild = FALSE;
}