PostgreSQL/src/backend/executor/execReplication.c
Andres Freund c2fe139c20 tableam: Add and use scan APIs.
To allow table accesses to not be directly dependent on heap, several
new abstractions are needed. Specifically:

1) Heap scans need to be generalized into table scans. Do this by
   introducing TableScanDesc, which will be the "base class" for
   individual AMs. This contains the AM independent fields from
   HeapScanDesc.

   The previous heap_{beginscan,rescan,endscan} et al. have been
   replaced with a table_ version.

   There's no direct replacement for heap_getnext(), as that returned
   a HeapTuple, which is undesirable for other AMs. Instead there's
   table_scan_getnextslot() (a usage sketch follows this list).  But
   note that heap_getnext() lives on; it's still widely used to access
   catalog tables.

   This is achieved by new scan_begin, scan_end, scan_rescan,
   scan_getnextslot callbacks.

2) The portion of a parallel scan's state that is shared between
   backends needs to be set up and managed without the user doing
   per-AM work. To achieve that, new parallelscan_{estimate,
   initialize, reinitialize} callbacks are introduced, which operate
   on a new ParallelTableScanDesc, which again can be subclassed by
   AMs (see the second sketch after this list).

   As several AMs are likely to be block oriented, block-oriented
   callbacks that can be shared between such AMs are provided and used
   by heap: table_block_parallelscan_{estimate, initialize,
   reinitialize} as callbacks, and table_block_parallelscan_{nextpage,
   init} for use inside AMs. These operate on a
   ParallelBlockTableScanDesc.

3) Index scans need to be able to access tables to return a tuple, and
   there needs to be state kept across individual accesses to the
   heap, e.g. buffers. That's now handled by introducing a
   sort-of-scan IndexFetchTable, which again is intended to be
   subclassed by individual AMs (for heap, IndexFetchHeap).

   The relevant callbacks for an AM are index_fetch_{end, begin,
   reset} to create the necessary state, and index_fetch_tuple to
   retrieve an indexed tuple.  Note that index_fetch_tuple
   implementations need to be smarter than just blindly fetching the
   tuples for AMs that have optimizations similar to heap's HOT - the
   currently alive tuple in the update chain needs to be fetched if
   appropriate.

   As with table_scan_getnextslot(), it's undesirable to continue
   returning HeapTuples here. Thus index_fetch_heap (might want to
   rename that later) now accepts a slot as an argument; the sketch
   after this list fetches index results into a slot via
   index_getnext_slot(). Core code doesn't have many call sites
   performing index scans without going through the systable_* API
   (in contrast to the many heap_getnext() calls working directly
   with HeapTuples).

   Index scans now store the result of a search in
   IndexScanDesc->xs_heaptid, rather than xs_ctup->t_self. As the
   target is not necessarily a HeapTuple anymore, that seems cleaner.
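
To make the shape of 1) and 3) concrete, here is a minimal sketch from
the caller's perspective, patterned on RelationFindReplTupleSeq() and
RelationFindReplTupleByIndex() in the file below. The function name is
made up for illustration; the relation, index, snapshot and scan keys
are assumed to be set up by the caller, and all locking and error
handling is elided:

    static void
    scan_example(Relation rel, Relation idxrel, Snapshot snapshot,
                 ScanKey skey, int nkeys)
    {
        /* a slot of whatever type rel's AM works with */
        TupleTableSlot *slot = table_slot_create(rel, NULL);
        TableScanDesc scan;
        IndexScanDesc iscan;

        /* AM-independent sequential scan; no HeapTuple is assumed */
        scan = table_beginscan(rel, snapshot, 0, NULL);
        while (table_scan_getnextslot(scan, ForwardScanDirection, slot))
        {
            /* process the current tuple through the slot */
        }
        table_endscan(scan);

        /* index scans likewise hand their results back in a slot */
        iscan = index_beginscan(rel, idxrel, snapshot, nkeys, 0);
        index_rescan(iscan, skey, nkeys, NULL, 0);
        while (index_getnext_slot(iscan, ForwardScanDirection, slot))
        {
            /* iscan->xs_heaptid holds the TID located by the index search */
        }
        index_endscan(iscan);

        ExecDropSingleTupleTableSlot(slot);
    }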
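
For 2), the sharing happens behind the table_parallelscan_* wrappers; a
rough leader-side sketch follows. The function name is hypothetical, and
the usual parallel-context/shm_toc plumbing (and its headers) is assumed
to exist in the surrounding code and is elided here:

    static TableScanDesc
    begin_shared_scan(Relation rel, Snapshot snapshot, shm_toc *toc)
    {
        /* let the AM's parallelscan_estimate callback size the shared state */
        Size        sz = table_parallelscan_estimate(rel, snapshot);
        ParallelTableScanDesc pscan;

        /* carve it out of dynamic shared memory and let the AM fill it in */
        pscan = (ParallelTableScanDesc) shm_toc_allocate(toc, sz);
        table_parallelscan_initialize(rel, pscan, snapshot);

        /* leader and workers then attach to the shared descriptor */
        return table_beginscan_parallel(rel, pscan);
    }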

To be able to sensibly adapt code to use the above, two further
callbacks have been introduced:

a) slot_callbacks returns a TupleTableSlotOps* suitable for creating
   slots capable of holding a tuple of the AM's type.
   table_slot_callbacks() and table_slot_create() are based upon that,
   but have additional logic to deal with views, foreign tables, etc.

   While this change could have been done separately, nearly all the
   call sites that needed to be adapted for the rest of this commit
   would also have needed to be adapted for table_slot_callbacks(),
   making separation not worthwhile.

b) tuple_satisfies_snapshot checks whether the tuple in a slot is
   currently visible according to a snapshot. That's required because
   a few places now don't have a buffer + HeapTuple around, but only a
   slot (which in heap's case internally carries that information); a
   sketch of the slot-based usage follows this list.
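
A short sketch of how a) and b) are consumed together. This is
illustrative only: the helper name and the choice of snapshots are made
up for the example, and the relation is assumed to be opened and locked
by the caller:

    static bool
    first_tuple_still_visible(Relation rel, Snapshot scan_snapshot,
                              Snapshot check_snapshot)
    {
        /* slot type chosen by rel's AM via its slot_callbacks */
        TupleTableSlot *slot = table_slot_create(rel, NULL);
        TableScanDesc scan = table_beginscan(rel, scan_snapshot, 0, NULL);
        bool        visible = false;

        if (table_scan_getnextslot(scan, ForwardScanDirection, slot))
        {
            /*
             * No buffer + HeapTuple pair is needed; the slot carries
             * whatever the AM requires to answer the visibility question.
             */
            visible = table_tuple_satisfies_snapshot(rel, slot,
                                                     check_snapshot);
        }

        table_endscan(scan);
        ExecDropSingleTupleTableSlot(slot);
        return visible;
    }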

Additionally a few infrastructure changes were needed:

I) SysScanDesc, as used by systable_{beginscan, getnext} et al., now
   internally uses a slot to keep track of tuples. While
   systable_getnext() still returns HeapTuples, and will do so for the
   foreseeable future, the index API (see 1) above) now only deals
   with slots (a sketch follows).
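
Callers of the systable_* API are unaffected by that; a sketched
catalog scan still reads as before, even though the scan's current
position now lives in a slot internally. The function name below is
made up, and the choice of pg_class and its OID index is purely for
illustration:

    static void
    walk_pg_class(void)
    {
        Relation    rel = table_open(RelationRelationId, AccessShareLock);
        SysScanDesc scan;
        HeapTuple   tup;

        scan = systable_beginscan(rel, ClassOidIndexId, true, NULL, 0, NULL);

        /* systable_getnext() still hands back a HeapTuple */
        while (HeapTupleIsValid(tup = systable_getnext(scan)))
        {
            /* inspect the pg_class row in tup */
        }

        systable_endscan(scan);
        table_close(rel, AccessShareLock);
    }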

The remainder, and largest part, of this commit is then adjusting all
scans in postgres to use the new APIs.

Author: Andres Freund, Haribabu Kommi, Alvaro Herrera
Discussion:
    https://postgr.es/m/20180703070645.wchpu5muyto5n647@alap3.anarazel.de
    https://postgr.es/m/20160812231527.GA690404@alvherre.pgsql
2019-03-11 12:46:41 -07:00

/*-------------------------------------------------------------------------
 *
 * execReplication.c
 *    miscellaneous executor routines for logical replication
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/executor/execReplication.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/genam.h"
#include "access/heapam.h"
#include "access/relscan.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/xact.h"
#include "commands/trigger.h"
#include "executor/executor.h"
#include "nodes/nodeFuncs.h"
#include "parser/parse_relation.h"
#include "parser/parsetree.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
#include "utils/builtins.h"
#include "utils/datum.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
#include "utils/typcache.h"

/*
 * Set up a ScanKey for a search in the relation 'rel' for a tuple 'key' that
 * is set up to match 'rel' (*NOT* idxrel!).
 *
 * Returns whether any column contains NULLs.
 *
 * This is not a generic routine - it expects the idxrel to be the replication
 * identity of 'rel' and to meet all limitations associated with that.
 */
static bool
build_replindex_scan_key(ScanKey skey, Relation rel, Relation idxrel,
                         TupleTableSlot *searchslot)
{
    int         attoff;
    bool        isnull;
    Datum       indclassDatum;
    oidvector  *opclass;
    int2vector *indkey = &idxrel->rd_index->indkey;
    bool        hasnulls = false;

    Assert(RelationGetReplicaIndex(rel) == RelationGetRelid(idxrel));

    indclassDatum = SysCacheGetAttr(INDEXRELID, idxrel->rd_indextuple,
                                    Anum_pg_index_indclass, &isnull);
    Assert(!isnull);
    opclass = (oidvector *) DatumGetPointer(indclassDatum);

    /* Build scankey for every attribute in the index. */
    for (attoff = 0; attoff < IndexRelationGetNumberOfKeyAttributes(idxrel); attoff++)
    {
        Oid         operator;
        Oid         opfamily;
        RegProcedure regop;
        int         pkattno = attoff + 1;
        int         mainattno = indkey->values[attoff];
        Oid         optype = get_opclass_input_type(opclass->values[attoff]);

        /*
         * Load the operator info. We need this to get the equality operator
         * function for the scan key.
         */
        opfamily = get_opclass_family(opclass->values[attoff]);

        operator = get_opfamily_member(opfamily, optype,
                                       optype,
                                       BTEqualStrategyNumber);
        if (!OidIsValid(operator))
            elog(ERROR, "missing operator %d(%u,%u) in opfamily %u",
                 BTEqualStrategyNumber, optype, optype, opfamily);

        regop = get_opcode(operator);

        /* Initialize the scankey. */
        ScanKeyInit(&skey[attoff],
                    pkattno,
                    BTEqualStrategyNumber,
                    regop,
                    searchslot->tts_values[mainattno - 1]);

        /* Check for null value. */
        if (searchslot->tts_isnull[mainattno - 1])
        {
            hasnulls = true;
            skey[attoff].sk_flags |= SK_ISNULL;
        }
    }

    return hasnulls;
}

/*
 * Search the relation 'rel' for a tuple using the index.
 *
 * If a matching tuple is found, lock it with lockmode, fill the slot with its
 * contents, and return true. Return false otherwise.
 */
bool
RelationFindReplTupleByIndex(Relation rel, Oid idxoid,
                             LockTupleMode lockmode,
                             TupleTableSlot *searchslot,
                             TupleTableSlot *outslot)
{
    ScanKeyData skey[INDEX_MAX_KEYS];
    IndexScanDesc scan;
    SnapshotData snap;
    TransactionId xwait;
    Relation    idxrel;
    bool        found;

    /* Open the index. */
    idxrel = index_open(idxoid, RowExclusiveLock);

    /* Start an index scan. */
    InitDirtySnapshot(snap);
    scan = index_beginscan(rel, idxrel, &snap,
                           IndexRelationGetNumberOfKeyAttributes(idxrel),
                           0);

    /* Build scan key. */
    build_replindex_scan_key(skey, rel, idxrel, searchslot);

retry:
    found = false;

    index_rescan(scan, skey, IndexRelationGetNumberOfKeyAttributes(idxrel), NULL, 0);

    /* Try to find the tuple */
    if (index_getnext_slot(scan, ForwardScanDirection, outslot))
    {
        found = true;
        ExecMaterializeSlot(outslot);

        xwait = TransactionIdIsValid(snap.xmin) ?
            snap.xmin : snap.xmax;

        /*
         * If the tuple is locked, wait for locking transaction to finish and
         * retry.
         */
        if (TransactionIdIsValid(xwait))
        {
            XactLockTableWait(xwait, NULL, NULL, XLTW_None);
            goto retry;
        }
    }

    /* Found tuple, try to lock it in the lockmode. */
    if (found)
    {
        Buffer      buf;
        HeapUpdateFailureData hufd;
        HTSU_Result res;
        HeapTupleData locktup;
        HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) outslot;

        /* Only a heap tuple has item pointers. */
        Assert(TTS_IS_HEAPTUPLE(outslot) || TTS_IS_BUFFERTUPLE(outslot));
        ItemPointerCopy(&hslot->tuple->t_self, &locktup.t_self);

        PushActiveSnapshot(GetLatestSnapshot());

        res = heap_lock_tuple(rel, &locktup, GetCurrentCommandId(false),
                              lockmode,
                              LockWaitBlock,
                              false /* don't follow updates */ ,
                              &buf, &hufd);
        /* the tuple slot already has the buffer pinned */
        ReleaseBuffer(buf);

        PopActiveSnapshot();

        switch (res)
        {
            case HeapTupleMayBeUpdated:
                break;
            case HeapTupleUpdated:
                /* XXX: Improve handling here */
                if (ItemPointerIndicatesMovedPartitions(&hufd.ctid))
                    ereport(LOG,
                            (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
                             errmsg("tuple to be locked was already moved to another partition due to concurrent update, retrying")));
                else
                    ereport(LOG,
                            (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
                             errmsg("concurrent update, retrying")));
                goto retry;
            case HeapTupleInvisible:
                elog(ERROR, "attempted to lock invisible tuple");
                break;
            default:
                elog(ERROR, "unexpected heap_lock_tuple status: %u", res);
                break;
        }
    }

    index_endscan(scan);

    /* Don't release lock until commit. */
    index_close(idxrel, NoLock);

    return found;
}

/*
 * Compare the tuples in the slots by checking if they have equal values.
 */
static bool
tuples_equal(TupleTableSlot *slot1, TupleTableSlot *slot2)
{
    int         attrnum;

    Assert(slot1->tts_tupleDescriptor->natts ==
           slot2->tts_tupleDescriptor->natts);

    slot_getallattrs(slot1);
    slot_getallattrs(slot2);

    /* Check equality of the attributes. */
    for (attrnum = 0; attrnum < slot1->tts_tupleDescriptor->natts; attrnum++)
    {
        Form_pg_attribute att;
        TypeCacheEntry *typentry;

        /*
         * If one value is NULL and other is not, then they are certainly not
         * equal
         */
        if (slot1->tts_isnull[attrnum] != slot2->tts_isnull[attrnum])
            return false;

        /*
         * If both are NULL, they can be considered equal.
         */
        if (slot1->tts_isnull[attrnum] || slot2->tts_isnull[attrnum])
            continue;

        att = TupleDescAttr(slot1->tts_tupleDescriptor, attrnum);

        typentry = lookup_type_cache(att->atttypid, TYPECACHE_EQ_OPR_FINFO);
        if (!OidIsValid(typentry->eq_opr_finfo.fn_oid))
            ereport(ERROR,
                    (errcode(ERRCODE_UNDEFINED_FUNCTION),
                     errmsg("could not identify an equality operator for type %s",
                            format_type_be(att->atttypid))));

        if (!DatumGetBool(FunctionCall2(&typentry->eq_opr_finfo,
                                        slot1->tts_values[attrnum],
                                        slot2->tts_values[attrnum])))
            return false;
    }

    return true;
}

/*
 * Search the relation 'rel' for a tuple using a sequential scan.
 *
 * If a matching tuple is found, lock it with lockmode, fill the slot with its
 * contents, and return true. Return false otherwise.
 *
 * Note that this stops on the first matching tuple.
 *
 * This can obviously be quite slow on tables that have more than a few rows.
 */
bool
RelationFindReplTupleSeq(Relation rel, LockTupleMode lockmode,
                         TupleTableSlot *searchslot, TupleTableSlot *outslot)
{
    TupleTableSlot *scanslot;
    TableScanDesc scan;
    SnapshotData snap;
    TransactionId xwait;
    bool        found;
    TupleDesc   desc PG_USED_FOR_ASSERTS_ONLY = RelationGetDescr(rel);

    Assert(equalTupleDescs(desc, outslot->tts_tupleDescriptor));

    /* Start a heap scan. */
    InitDirtySnapshot(snap);
    scan = table_beginscan(rel, &snap, 0, NULL);
    scanslot = table_slot_create(rel, NULL);

retry:
    found = false;

    table_rescan(scan, NULL);

    /* Try to find the tuple */
    while (table_scan_getnextslot(scan, ForwardScanDirection, scanslot))
    {
        if (!tuples_equal(scanslot, searchslot))
            continue;

        found = true;
        ExecCopySlot(outslot, scanslot);

        xwait = TransactionIdIsValid(snap.xmin) ?
            snap.xmin : snap.xmax;

        /*
         * If the tuple is locked, wait for locking transaction to finish and
         * retry.
         */
        if (TransactionIdIsValid(xwait))
        {
            XactLockTableWait(xwait, NULL, NULL, XLTW_None);
            goto retry;
        }
    }

    /* Found tuple, try to lock it in the lockmode. */
    if (found)
    {
        Buffer      buf;
        HeapUpdateFailureData hufd;
        HTSU_Result res;
        HeapTupleData locktup;
        HeapTupleTableSlot *hslot = (HeapTupleTableSlot *) outslot;

        /* Only a heap tuple has item pointers. */
        Assert(TTS_IS_HEAPTUPLE(outslot) || TTS_IS_BUFFERTUPLE(outslot));
        ItemPointerCopy(&hslot->tuple->t_self, &locktup.t_self);

        PushActiveSnapshot(GetLatestSnapshot());

        res = heap_lock_tuple(rel, &locktup, GetCurrentCommandId(false),
                              lockmode,
                              LockWaitBlock,
                              false /* don't follow updates */ ,
                              &buf, &hufd);
        /* the tuple slot already has the buffer pinned */
        ReleaseBuffer(buf);

        PopActiveSnapshot();

        switch (res)
        {
            case HeapTupleMayBeUpdated:
                break;
            case HeapTupleUpdated:
                /* XXX: Improve handling here */
                if (ItemPointerIndicatesMovedPartitions(&hufd.ctid))
                    ereport(LOG,
                            (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
                             errmsg("tuple to be locked was already moved to another partition due to concurrent update, retrying")));
                else
                    ereport(LOG,
                            (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
                             errmsg("concurrent update, retrying")));
                goto retry;
            case HeapTupleInvisible:
                elog(ERROR, "attempted to lock invisible tuple");
                break;
            default:
                elog(ERROR, "unexpected heap_lock_tuple status: %u", res);
                break;
        }
    }

    table_endscan(scan);
    ExecDropSingleTupleTableSlot(scanslot);

    return found;
}

/*
 * Insert tuple represented in the slot to the relation, update the indexes,
 * and execute any constraints and per-row triggers.
 *
 * Caller is responsible for opening the indexes.
 */
void
ExecSimpleRelationInsert(EState *estate, TupleTableSlot *slot)
{
    bool        skip_tuple = false;
    HeapTuple   tuple;
    ResultRelInfo *resultRelInfo = estate->es_result_relation_info;
    Relation    rel = resultRelInfo->ri_RelationDesc;

    /* For now we support only tables. */
    Assert(rel->rd_rel->relkind == RELKIND_RELATION);

    CheckCmdReplicaIdentity(rel, CMD_INSERT);

    /* BEFORE ROW INSERT Triggers */
    if (resultRelInfo->ri_TrigDesc &&
        resultRelInfo->ri_TrigDesc->trig_insert_before_row)
    {
        if (!ExecBRInsertTriggers(estate, resultRelInfo, slot))
            skip_tuple = true;  /* "do nothing" */
    }

    if (!skip_tuple)
    {
        List       *recheckIndexes = NIL;

        /* Check the constraints of the tuple */
        if (rel->rd_att->constr)
            ExecConstraints(resultRelInfo, slot, estate);

        if (resultRelInfo->ri_PartitionCheck)
            ExecPartitionCheck(resultRelInfo, slot, estate, true);

        /* Materialize slot into a tuple that we can scribble upon. */
        tuple = ExecFetchSlotHeapTuple(slot, true, NULL);

        /* OK, store the tuple and create index entries for it */
        simple_heap_insert(rel, tuple);
        ItemPointerCopy(&tuple->t_self, &slot->tts_tid);

        if (resultRelInfo->ri_NumIndices > 0)
            recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self),
                                                   estate, false, NULL,
                                                   NIL);

        /* AFTER ROW INSERT Triggers */
        ExecARInsertTriggers(estate, resultRelInfo, slot,
                             recheckIndexes, NULL);

        /*
         * XXX we should in theory pass a TransitionCaptureState object to the
         * above to capture transition tuples, but after statement triggers
         * don't actually get fired by replication yet anyway
         */

        list_free(recheckIndexes);
    }
}

/*
 * Find the searchslot tuple and update it with data in the slot,
 * update the indexes, and execute any constraints and per-row triggers.
 *
 * Caller is responsible for opening the indexes.
 */
void
ExecSimpleRelationUpdate(EState *estate, EPQState *epqstate,
                         TupleTableSlot *searchslot, TupleTableSlot *slot)
{
    bool        skip_tuple = false;
    HeapTuple   tuple;
    ResultRelInfo *resultRelInfo = estate->es_result_relation_info;
    Relation    rel = resultRelInfo->ri_RelationDesc;
    HeapTupleTableSlot *hsearchslot = (HeapTupleTableSlot *) searchslot;

    /* We expect the searchslot to contain a heap tuple. */
    Assert(TTS_IS_HEAPTUPLE(searchslot) || TTS_IS_BUFFERTUPLE(searchslot));

    /* For now we support only tables. */
    Assert(rel->rd_rel->relkind == RELKIND_RELATION);

    CheckCmdReplicaIdentity(rel, CMD_UPDATE);

    /* BEFORE ROW UPDATE Triggers */
    if (resultRelInfo->ri_TrigDesc &&
        resultRelInfo->ri_TrigDesc->trig_update_before_row)
    {
        if (!ExecBRUpdateTriggers(estate, epqstate, resultRelInfo,
                                  &hsearchslot->tuple->t_self,
                                  NULL, slot))
            skip_tuple = true;  /* "do nothing" */
    }

    if (!skip_tuple)
    {
        List       *recheckIndexes = NIL;

        /* Check the constraints of the tuple */
        if (rel->rd_att->constr)
            ExecConstraints(resultRelInfo, slot, estate);

        if (resultRelInfo->ri_PartitionCheck)
            ExecPartitionCheck(resultRelInfo, slot, estate, true);

        /* Materialize slot into a tuple that we can scribble upon. */
        tuple = ExecFetchSlotHeapTuple(slot, true, NULL);

        /* OK, update the tuple and index entries for it */
        simple_heap_update(rel, &hsearchslot->tuple->t_self, tuple);
        ItemPointerCopy(&tuple->t_self, &slot->tts_tid);

        if (resultRelInfo->ri_NumIndices > 0 &&
            !HeapTupleIsHeapOnly(tuple))
            recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self),
                                                   estate, false, NULL,
                                                   NIL);

        /* AFTER ROW UPDATE Triggers */
        ExecARUpdateTriggers(estate, resultRelInfo,
                             &(tuple->t_self),
                             NULL, slot,
                             recheckIndexes, NULL);

        list_free(recheckIndexes);
    }
}

/*
 * Find the searchslot tuple and delete it, and execute any constraints
 * and per-row triggers.
 *
 * Caller is responsible for opening the indexes.
 */
void
ExecSimpleRelationDelete(EState *estate, EPQState *epqstate,
                         TupleTableSlot *searchslot)
{
    bool        skip_tuple = false;
    ResultRelInfo *resultRelInfo = estate->es_result_relation_info;
    Relation    rel = resultRelInfo->ri_RelationDesc;
    HeapTupleTableSlot *hsearchslot = (HeapTupleTableSlot *) searchslot;

    /* For now we support only tables and heap tuples. */
    Assert(rel->rd_rel->relkind == RELKIND_RELATION);
    Assert(TTS_IS_HEAPTUPLE(searchslot) || TTS_IS_BUFFERTUPLE(searchslot));

    CheckCmdReplicaIdentity(rel, CMD_DELETE);

    /* BEFORE ROW DELETE Triggers */
    if (resultRelInfo->ri_TrigDesc &&
        resultRelInfo->ri_TrigDesc->trig_delete_before_row)
    {
        skip_tuple = !ExecBRDeleteTriggers(estate, epqstate, resultRelInfo,
                                           &hsearchslot->tuple->t_self,
                                           NULL, NULL);
    }

    if (!skip_tuple)
    {
        List       *recheckIndexes = NIL;

        /* OK, delete the tuple */
        simple_heap_delete(rel, &hsearchslot->tuple->t_self);

        /* AFTER ROW DELETE Triggers */
        ExecARDeleteTriggers(estate, resultRelInfo,
                             &hsearchslot->tuple->t_self, NULL, NULL);

        list_free(recheckIndexes);
    }
}

/*
 * Check if command can be executed with current replica identity.
 */
void
CheckCmdReplicaIdentity(Relation rel, CmdType cmd)
{
    PublicationActions *pubactions;

    /* We only need to do checks for UPDATE and DELETE. */
    if (cmd != CMD_UPDATE && cmd != CMD_DELETE)
        return;

    /* If relation has replica identity we are always good. */
    if (rel->rd_rel->relreplident == REPLICA_IDENTITY_FULL ||
        OidIsValid(RelationGetReplicaIndex(rel)))
        return;

    /*
     * This is either UPDATE OR DELETE and there is no replica identity.
     *
     * Check if the table publishes UPDATES or DELETES.
     */
    pubactions = GetRelationPublicationActions(rel);

    if (cmd == CMD_UPDATE && pubactions->pubupdate)
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("cannot update table \"%s\" because it does not have a replica identity and publishes updates",
                        RelationGetRelationName(rel)),
                 errhint("To enable updating the table, set REPLICA IDENTITY using ALTER TABLE.")));
    else if (cmd == CMD_DELETE && pubactions->pubdelete)
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("cannot delete from table \"%s\" because it does not have a replica identity and publishes deletes",
                        RelationGetRelationName(rel)),
                 errhint("To enable deleting from the table, set REPLICA IDENTITY using ALTER TABLE.")));
}

/*
 * Check if we support writing into specific relkind.
 *
 * The nspname and relname are only needed for error reporting.
 */
void
CheckSubscriptionRelkind(char relkind, const char *nspname,
                         const char *relname)
{
    /*
     * We currently only support writing to regular tables. However, give
     * a more specific error for partitioned and foreign tables.
     */
    if (relkind == RELKIND_PARTITIONED_TABLE)
        ereport(ERROR,
                (errcode(ERRCODE_WRONG_OBJECT_TYPE),
                 errmsg("cannot use relation \"%s.%s\" as logical replication target",
                        nspname, relname),
                 errdetail("\"%s.%s\" is a partitioned table.",
                           nspname, relname)));
    else if (relkind == RELKIND_FOREIGN_TABLE)
        ereport(ERROR,
                (errcode(ERRCODE_WRONG_OBJECT_TYPE),
                 errmsg("cannot use relation \"%s.%s\" as logical replication target",
                        nspname, relname),
                 errdetail("\"%s.%s\" is a foreign table.",
                           nspname, relname)));

    if (relkind != RELKIND_RELATION)
        ereport(ERROR,
                (errcode(ERRCODE_WRONG_OBJECT_TYPE),
                 errmsg("cannot use relation \"%s.%s\" as logical replication target",
                        nspname, relname),
                 errdetail("\"%s.%s\" is not a table.",
                           nspname, relname)));
}