Fix failures with incorrect epoch handling for 2PC files at recovery

At the beginning of recovery, an orphaned two-phase file in an epoch
different than the one defined in the checkpoint record could not be
removed based on the assumptions that AdjustToFullTransactionId() relies
on, assuming that all files would be either from the current epoch or
from the previous epoch.

If the checkpoint epoch was 0 while the 2PC file was orphaned and in the
future, AdjustToFullTransactionId() would underflow the epoch used to
build the 2PC file path.  In non-assert builds, this would create a
WARNING message referring to a 2PC file with an epoch of "FFFFFFFF" (or
UINT32_MAX), as an effect of the underflow calculation, leaving the
orphaned file around.

Some tests are added with dummy 2PC files in the past and the future,
checking that these are properly removed.

Issue introduced by 5a1dfde8334b, that has switched two-phase state
files to use FullTransactionIds.

Reported-by: Vitaly Davydov
Author: Michael Paquier
Reviewed-by: Vitaly Davydov
Discussion: https://postgr.es/m/13b5b6-676c3080-4d-531db900@47931709
Backpatch-through: 17
This commit is contained in:
Michael Paquier 2024-12-30 09:58:09 +09:00
parent 03c46e1f26
commit c3de0f9eed
2 changed files with 150 additions and 51 deletions

View File

@ -221,13 +221,13 @@ static void ProcessRecords(char *bufptr, TransactionId xid,
static void RemoveGXact(GlobalTransaction gxact); static void RemoveGXact(GlobalTransaction gxact);
static void XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len); static void XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len);
static char *ProcessTwoPhaseBuffer(TransactionId xid, static char *ProcessTwoPhaseBuffer(FullTransactionId xid,
XLogRecPtr prepare_start_lsn, XLogRecPtr prepare_start_lsn,
bool fromdisk, bool setParent, bool setNextXid); bool fromdisk, bool setParent, bool setNextXid);
static void MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, static void MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid,
const char *gid, TimestampTz prepared_at, Oid owner, const char *gid, TimestampTz prepared_at, Oid owner,
Oid databaseid); Oid databaseid);
static void RemoveTwoPhaseFile(TransactionId xid, bool giveWarning); static void RemoveTwoPhaseFile(FullTransactionId fxid, bool giveWarning);
static void RecreateTwoPhaseFile(TransactionId xid, void *content, int len); static void RecreateTwoPhaseFile(TransactionId xid, void *content, int len);
/* /*
@ -927,41 +927,26 @@ TwoPhaseGetDummyProc(TransactionId xid, bool lock_held)
/************************************************************************/ /************************************************************************/
/* /*
* Compute the FullTransactionId for the given TransactionId. * Compute FullTransactionId for the given TransactionId, using the current
* * epoch.
* The wrap logic is safe here because the span of active xids cannot exceed one
* epoch at any given time.
*/ */
static inline FullTransactionId static inline FullTransactionId
AdjustToFullTransactionId(TransactionId xid) FullTransactionIdFromCurrentEpoch(TransactionId xid)
{ {
FullTransactionId fxid;
FullTransactionId nextFullXid; FullTransactionId nextFullXid;
TransactionId nextXid;
uint32 epoch; uint32 epoch;
Assert(TransactionIdIsValid(xid)); nextFullXid = ReadNextFullTransactionId();
LWLockAcquire(XidGenLock, LW_SHARED);
nextFullXid = TransamVariables->nextXid;
LWLockRelease(XidGenLock);
nextXid = XidFromFullTransactionId(nextFullXid);
epoch = EpochFromFullTransactionId(nextFullXid); epoch = EpochFromFullTransactionId(nextFullXid);
if (unlikely(xid > nextXid))
{
/* Wraparound occurred, must be from a prev epoch. */
Assert(epoch > 0);
epoch--;
}
return FullTransactionIdFromEpochAndXid(epoch, xid); fxid = FullTransactionIdFromEpochAndXid(epoch, xid);
return fxid;
} }
static inline int static inline int
TwoPhaseFilePath(char *path, TransactionId xid) TwoPhaseFilePath(char *path, FullTransactionId fxid)
{ {
FullTransactionId fxid = AdjustToFullTransactionId(xid);
return snprintf(path, MAXPGPATH, TWOPHASE_DIR "/%08X%08X", return snprintf(path, MAXPGPATH, TWOPHASE_DIR "/%08X%08X",
EpochFromFullTransactionId(fxid), EpochFromFullTransactionId(fxid),
XidFromFullTransactionId(fxid)); XidFromFullTransactionId(fxid));
@ -1297,7 +1282,8 @@ RegisterTwoPhaseRecord(TwoPhaseRmgrId rmid, uint16 info,
* If it looks OK (has a valid magic number and CRC), return the palloc'd * If it looks OK (has a valid magic number and CRC), return the palloc'd
* contents of the file, issuing an error when finding corrupted data. If * contents of the file, issuing an error when finding corrupted data. If
* missing_ok is true, which indicates that missing files can be safely * missing_ok is true, which indicates that missing files can be safely
* ignored, then return NULL. This state can be reached when doing recovery. * ignored, then return NULL. This state can be reached when doing recovery
* after discarding two-phase files from other epochs.
*/ */
static char * static char *
ReadTwoPhaseFile(TransactionId xid, bool missing_ok) ReadTwoPhaseFile(TransactionId xid, bool missing_ok)
@ -1311,8 +1297,10 @@ ReadTwoPhaseFile(TransactionId xid, bool missing_ok)
pg_crc32c calc_crc, pg_crc32c calc_crc,
file_crc; file_crc;
int r; int r;
FullTransactionId fxid;
TwoPhaseFilePath(path, xid); fxid = FullTransactionIdFromCurrentEpoch(xid);
TwoPhaseFilePath(path, fxid);
fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
if (fd < 0) if (fd < 0)
@ -1677,10 +1665,16 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
AtEOXact_PgStat(isCommit, false); AtEOXact_PgStat(isCommit, false);
/* /*
* And now we can clean up any files we may have left. * And now we can clean up any files we may have left. These should be
* from the current epoch.
*/ */
if (ondisk) if (ondisk)
RemoveTwoPhaseFile(xid, true); {
FullTransactionId fxid;
fxid = FullTransactionIdFromCurrentEpoch(xid);
RemoveTwoPhaseFile(fxid, true);
}
MyLockedGxact = NULL; MyLockedGxact = NULL;
@ -1719,13 +1713,17 @@ ProcessRecords(char *bufptr, TransactionId xid,
* *
* If giveWarning is false, do not complain about file-not-present; * If giveWarning is false, do not complain about file-not-present;
* this is an expected case during WAL replay. * this is an expected case during WAL replay.
*
* This routine is used at early stages at recovery where future and
* past orphaned files are checked, hence the FullTransactionId to build
* a complete file name fit for the removal.
*/ */
static void static void
RemoveTwoPhaseFile(TransactionId xid, bool giveWarning) RemoveTwoPhaseFile(FullTransactionId fxid, bool giveWarning)
{ {
char path[MAXPGPATH]; char path[MAXPGPATH];
TwoPhaseFilePath(path, xid); TwoPhaseFilePath(path, fxid);
if (unlink(path)) if (unlink(path))
if (errno != ENOENT || giveWarning) if (errno != ENOENT || giveWarning)
ereport(WARNING, ereport(WARNING,
@ -1745,13 +1743,16 @@ RecreateTwoPhaseFile(TransactionId xid, void *content, int len)
char path[MAXPGPATH]; char path[MAXPGPATH];
pg_crc32c statefile_crc; pg_crc32c statefile_crc;
int fd; int fd;
FullTransactionId fxid;
/* Recompute CRC */ /* Recompute CRC */
INIT_CRC32C(statefile_crc); INIT_CRC32C(statefile_crc);
COMP_CRC32C(statefile_crc, content, len); COMP_CRC32C(statefile_crc, content, len);
FIN_CRC32C(statefile_crc); FIN_CRC32C(statefile_crc);
TwoPhaseFilePath(path, xid); /* Use current epoch */
fxid = FullTransactionIdFromCurrentEpoch(xid);
TwoPhaseFilePath(path, fxid);
fd = OpenTransientFile(path, fd = OpenTransientFile(path,
O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY); O_CREAT | O_TRUNC | O_WRONLY | PG_BINARY);
@ -1899,7 +1900,9 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon)
* Scan pg_twophase and fill TwoPhaseState depending on the on-disk data. * Scan pg_twophase and fill TwoPhaseState depending on the on-disk data.
* This is called once at the beginning of recovery, saving any extra * This is called once at the beginning of recovery, saving any extra
* lookups in the future. Two-phase files that are newer than the * lookups in the future. Two-phase files that are newer than the
* minimum XID horizon are discarded on the way. * minimum XID horizon are discarded on the way. Two-phase files with
* an epoch older or newer than the current checkpoint's record epoch
* are also discarded.
*/ */
void void
restoreTwoPhaseData(void) restoreTwoPhaseData(void)
@ -1914,14 +1917,11 @@ restoreTwoPhaseData(void)
if (strlen(clde->d_name) == 16 && if (strlen(clde->d_name) == 16 &&
strspn(clde->d_name, "0123456789ABCDEF") == 16) strspn(clde->d_name, "0123456789ABCDEF") == 16)
{ {
TransactionId xid;
FullTransactionId fxid; FullTransactionId fxid;
char *buf; char *buf;
fxid = FullTransactionIdFromU64(strtou64(clde->d_name, NULL, 16)); fxid = FullTransactionIdFromU64(strtou64(clde->d_name, NULL, 16));
xid = XidFromFullTransactionId(fxid); buf = ProcessTwoPhaseBuffer(fxid, InvalidXLogRecPtr,
buf = ProcessTwoPhaseBuffer(xid, InvalidXLogRecPtr,
true, false, false); true, false, false);
if (buf == NULL) if (buf == NULL)
continue; continue;
@ -1972,6 +1972,7 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p)
TransactionId origNextXid = XidFromFullTransactionId(nextXid); TransactionId origNextXid = XidFromFullTransactionId(nextXid);
TransactionId result = origNextXid; TransactionId result = origNextXid;
TransactionId *xids = NULL; TransactionId *xids = NULL;
uint32 epoch = EpochFromFullTransactionId(nextXid);
int nxids = 0; int nxids = 0;
int allocsize = 0; int allocsize = 0;
int i; int i;
@ -1980,6 +1981,7 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p)
for (i = 0; i < TwoPhaseState->numPrepXacts; i++) for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
{ {
TransactionId xid; TransactionId xid;
FullTransactionId fxid;
char *buf; char *buf;
GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
@ -1987,7 +1989,12 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p)
xid = gxact->xid; xid = gxact->xid;
buf = ProcessTwoPhaseBuffer(xid, /*
* All two-phase files with past and future epoch in pg_twophase are
* gone at this point, so we're OK to rely on only the current epoch.
*/
fxid = FullTransactionIdFromEpochAndXid(epoch, xid);
buf = ProcessTwoPhaseBuffer(fxid,
gxact->prepare_start_lsn, gxact->prepare_start_lsn,
gxact->ondisk, false, true); gxact->ondisk, false, true);
@ -2049,11 +2056,18 @@ void
StandbyRecoverPreparedTransactions(void) StandbyRecoverPreparedTransactions(void)
{ {
int i; int i;
uint32 epoch;
FullTransactionId nextFullXid;
/* get current epoch */
nextFullXid = ReadNextFullTransactionId();
epoch = EpochFromFullTransactionId(nextFullXid);
LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
for (i = 0; i < TwoPhaseState->numPrepXacts; i++) for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
{ {
TransactionId xid; TransactionId xid;
FullTransactionId fxid;
char *buf; char *buf;
GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
@ -2061,7 +2075,12 @@ StandbyRecoverPreparedTransactions(void)
xid = gxact->xid; xid = gxact->xid;
buf = ProcessTwoPhaseBuffer(xid, /*
* At this stage, we're OK to work with the current epoch as all past
* and future files have been already discarded.
*/
fxid = FullTransactionIdFromEpochAndXid(epoch, xid);
buf = ProcessTwoPhaseBuffer(fxid,
gxact->prepare_start_lsn, gxact->prepare_start_lsn,
gxact->ondisk, true, false); gxact->ondisk, true, false);
if (buf != NULL) if (buf != NULL)
@ -2090,11 +2109,18 @@ void
RecoverPreparedTransactions(void) RecoverPreparedTransactions(void)
{ {
int i; int i;
uint32 epoch;
FullTransactionId nextFullXid;
/* get current epoch */
nextFullXid = ReadNextFullTransactionId();
epoch = EpochFromFullTransactionId(nextFullXid);
LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
for (i = 0; i < TwoPhaseState->numPrepXacts; i++) for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
{ {
TransactionId xid; TransactionId xid;
FullTransactionId fxid;
char *buf; char *buf;
GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
char *bufptr; char *bufptr;
@ -2102,6 +2128,10 @@ RecoverPreparedTransactions(void)
TransactionId *subxids; TransactionId *subxids;
const char *gid; const char *gid;
/*
* At this stage, we're OK to work with the current epoch as all past
* and future files have been already discarded.
*/
xid = gxact->xid; xid = gxact->xid;
/* /*
@ -2113,7 +2143,8 @@ RecoverPreparedTransactions(void)
* SubTransSetParent has been set before, if the prepared transaction * SubTransSetParent has been set before, if the prepared transaction
* generated xid assignment records. * generated xid assignment records.
*/ */
buf = ProcessTwoPhaseBuffer(xid, fxid = FullTransactionIdFromEpochAndXid(epoch, xid);
buf = ProcessTwoPhaseBuffer(fxid,
gxact->prepare_start_lsn, gxact->prepare_start_lsn,
gxact->ondisk, true, false); gxact->ondisk, true, false);
if (buf == NULL) if (buf == NULL)
@ -2181,7 +2212,7 @@ RecoverPreparedTransactions(void)
/* /*
* ProcessTwoPhaseBuffer * ProcessTwoPhaseBuffer
* *
* Given a transaction id, read it either from disk or read it directly * Given a FullTransactionId, read it either from disk or read it directly
* via shmem xlog record pointer using the provided "prepare_start_lsn". * via shmem xlog record pointer using the provided "prepare_start_lsn".
* *
* If setParent is true, set up subtransaction parent linkages. * If setParent is true, set up subtransaction parent linkages.
@ -2190,32 +2221,35 @@ RecoverPreparedTransactions(void)
* value scanned. * value scanned.
*/ */
static char * static char *
ProcessTwoPhaseBuffer(TransactionId xid, ProcessTwoPhaseBuffer(FullTransactionId fxid,
XLogRecPtr prepare_start_lsn, XLogRecPtr prepare_start_lsn,
bool fromdisk, bool fromdisk,
bool setParent, bool setNextXid) bool setParent, bool setNextXid)
{ {
FullTransactionId nextXid = TransamVariables->nextXid; FullTransactionId nextXid = TransamVariables->nextXid;
TransactionId origNextXid = XidFromFullTransactionId(nextXid);
TransactionId *subxids; TransactionId *subxids;
char *buf; char *buf;
TwoPhaseFileHeader *hdr; TwoPhaseFileHeader *hdr;
int i; int i;
TransactionId xid = XidFromFullTransactionId(fxid);
Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE)); Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE));
if (!fromdisk) if (!fromdisk)
Assert(prepare_start_lsn != InvalidXLogRecPtr); Assert(prepare_start_lsn != InvalidXLogRecPtr);
/* Reject XID if too new */ /*
if (TransactionIdFollowsOrEquals(xid, origNextXid)) * Reject full XID if too new. Note that this discards files from future
* epochs.
*/
if (FullTransactionIdFollowsOrEquals(fxid, nextXid))
{ {
if (fromdisk) if (fromdisk)
{ {
ereport(WARNING, ereport(WARNING,
(errmsg("removing future two-phase state file for transaction %u", (errmsg("removing future two-phase state file of epoch %u for transaction %u",
xid))); EpochFromFullTransactionId(fxid), xid)));
RemoveTwoPhaseFile(xid, true); RemoveTwoPhaseFile(fxid, true);
} }
else else
{ {
@ -2227,6 +2261,26 @@ ProcessTwoPhaseBuffer(TransactionId xid,
return NULL; return NULL;
} }
/* Discard files from past epochs */
if (EpochFromFullTransactionId(fxid) < EpochFromFullTransactionId(nextXid))
{
if (fromdisk)
{
ereport(WARNING,
(errmsg("removing past two-phase state file of epoch %u for transaction %u",
EpochFromFullTransactionId(fxid), xid)));
RemoveTwoPhaseFile(fxid, true);
}
else
{
ereport(WARNING,
(errmsg("removing past two-phase state from memory for transaction %u",
xid)));
PrepareRedoRemove(xid, true);
}
return NULL;
}
/* Already processed? */ /* Already processed? */
if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid)) if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
{ {
@ -2235,7 +2289,7 @@ ProcessTwoPhaseBuffer(TransactionId xid,
ereport(WARNING, ereport(WARNING,
(errmsg("removing stale two-phase state file for transaction %u", (errmsg("removing stale two-phase state file for transaction %u",
xid))); xid)));
RemoveTwoPhaseFile(xid, true); RemoveTwoPhaseFile(fxid, true);
} }
else else
{ {
@ -2521,8 +2575,11 @@ PrepareRedoAdd(char *buf, XLogRecPtr start_lsn,
if (!XLogRecPtrIsInvalid(start_lsn)) if (!XLogRecPtrIsInvalid(start_lsn))
{ {
char path[MAXPGPATH]; char path[MAXPGPATH];
FullTransactionId fxid;
TwoPhaseFilePath(path, hdr->xid); /* Use current epoch */
fxid = FullTransactionIdFromCurrentEpoch(hdr->xid);
TwoPhaseFilePath(path, fxid);
if (access(path, F_OK) == 0) if (access(path, F_OK) == 0)
{ {
@ -2617,7 +2674,15 @@ PrepareRedoRemove(TransactionId xid, bool giveWarning)
*/ */
elog(DEBUG2, "removing 2PC data for transaction %u", xid); elog(DEBUG2, "removing 2PC data for transaction %u", xid);
if (gxact->ondisk) if (gxact->ondisk)
RemoveTwoPhaseFile(xid, giveWarning); {
FullTransactionId fxid;
/*
* We should deal with a file at the current epoch here.
*/
fxid = FullTransactionIdFromCurrentEpoch(xid);
RemoveTwoPhaseFile(fxid, giveWarning);
}
RemoveGXact(gxact); RemoveGXact(gxact);
} }

View File

@ -572,4 +572,38 @@ my $nsubtrans = $cur_primary->safe_psql('postgres',
); );
isnt($osubtrans, $nsubtrans, "contents of pg_subtrans/ have changed"); isnt($osubtrans, $nsubtrans, "contents of pg_subtrans/ have changed");
###############################################################################
# Check handling of orphaned 2PC files at recovery.
###############################################################################
$cur_standby->teardown_node;
$cur_primary->teardown_node;
# Grab location in logs of primary
my $log_offset = -s $cur_primary->logfile;
# Create fake files with a transaction ID large or low enough to be in the
# future or the past, in different epochs, then check that the primary is able
# to start and remove these files at recovery.
# First bump the epoch with pg_resetwal.
$cur_primary->command_ok(
[ 'pg_resetwal', '-e', 256, '-f', $cur_primary->data_dir ],
'bump epoch of primary');
my $future_2pc_file =
$cur_primary->data_dir . '/pg_twophase/000001FF00000FFF';
append_to_file $future_2pc_file, "";
my $past_2pc_file = $cur_primary->data_dir . '/pg_twophase/000000EE00000FFF';
append_to_file $past_2pc_file, "";
$cur_primary->start;
$cur_primary->log_check(
"two-phase files removed at recovery",
$log_offset,
log_like => [
qr/removing past two-phase state file of epoch 238 for transaction 4095/,
qr/removing future two-phase state file of epoch 511 for transaction 4095/
]);
done_testing(); done_testing();