Heikki Linnakangas 3c0fd64fec Split ProcSleep function into JoinWaitQueue and ProcSleep
Split ProcSleep into two functions: JoinWaitQueue and ProcSleep.
JoinWaitQueue is called while holding the partition lock, and inserts
the current process into the wait queue, while ProcSleep() does the
actual sleeping. ProcSleep() is now called without holding the
partition lock, and it no longer re-acquires the partition lock before
returning. That makes the wakeup a little cheaper. Once upon a time,
re-acquiring the partition lock was needed to prevent a signal handler
from longjmping out at a bad time, but these days our signal handlers
just set flags, and longjmping can only happen at points where we
explicitly run CHECK_FOR_INTERRUPTS().

If JoinWaitQueue detects an "early deadlock" before even joining the
wait queue, it returns without changing the shared lock entry, leaving
the cleanup of the shared lock entry to the caller. This makes the
handling of an early deadlock the same as the dontWait=true case.

One small user-visible side-effect of this refactoring is that we now
only set the 'ps' title to say "waiting" when we actually enter the
sleep, not when the lock is skipped because dontWait=true, or when a
deadlock is detected early before entering the sleep.

This eliminates the 'lockAwaited' global variable in proc.c, which was
largely redundant with 'awaitedLock' in lock.c.

Note: Updating the local lock table is now the caller's responsibility.
JoinWaitQueue and ProcSleep are now only responsible for modifying the
shared state. Seems a little nicer that way.

Based on Thomas Munro's earlier patch and observation that ProcSleep
doesn't really need to re-acquire the partition lock.

Reviewed-by: Maxim Orlov
Discussion: https://www.postgresql.org/message-id/7c2090cd-a72a-4e34-afaa-6dd2ef31440e@iki.fi
2024-11-04 17:59:24 +02:00

504 lines
20 KiB
C

/*-------------------------------------------------------------------------
*
* proc.h
* per-process shared memory data structures
*
*
* Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/storage/proc.h
*
*-------------------------------------------------------------------------
*/
#ifndef _PROC_H_
#define _PROC_H_
#include "access/clog.h"
#include "access/xlogdefs.h"
#include "lib/ilist.h"
#include "storage/latch.h"
#include "storage/lock.h"
#include "storage/pg_sema.h"
#include "storage/proclist_types.h"
#include "storage/procnumber.h"
/*
* Each backend advertises up to PGPROC_MAX_CACHED_SUBXIDS TransactionIds
* for non-aborted subtransactions of its current top transaction. These
* have to be treated as running XIDs by other backends.
*
* We also keep track of whether the cache overflowed (ie, the transaction has
* generated at least one subtransaction that didn't fit in the cache).
* If none of the caches have overflowed, we can assume that an XID that's not
* listed anywhere in the PGPROC array is not a running transaction. Else we
* have to look at pg_subtrans.
*
* The number of cached entries is tracked in XidCacheStatus.count, which is
* a uint8, so this value has to stay representable in that field.
*
* See src/test/isolation/specs/subxid-overflow.spec if you change this.
*/
#define PGPROC_MAX_CACHED_SUBXIDS 64 /* XXX guessed-at value */
/*
* Bookkeeping for a backend's subtransaction XID cache: how many entries are
* cached and whether the cache has overflowed. Stored in
* PGPROC->subxidStatus and mirrored in ProcGlobal->subxidStates[].
*/
typedef struct XidCacheStatus
{
/* number of cached subxids, never more than PGPROC_MAX_CACHED_SUBXIDS */
uint8 count;
/* has PGPROC->subxids overflowed */
bool overflowed;
} XidCacheStatus;
/*
* Fixed-size storage for the subtransaction XIDs cached by one backend
* (see PGPROC->subxids).
*
* NOTE(review): presumably only the first XidCacheStatus.count entries are
* meaningful -- confirm against the code that fills the cache.
*/
struct XidCache
{
TransactionId xids[PGPROC_MAX_CACHED_SUBXIDS];
};
/*
 * Bits that may be set in PGPROC->statusFlags and PROC_HDR->statusFlags[]
 */
#define PROC_IS_AUTOVACUUM			(1 << 0)	/* is it an autovac worker? */
#define PROC_IN_VACUUM				(1 << 1)	/* currently running lazy vacuum */
/*
 * currently running CREATE INDEX CONCURRENTLY or REINDEX CONCURRENTLY on a
 * non-expressional, non-partial index
 */
#define PROC_IN_SAFE_IC				(1 << 2)
#define PROC_VACUUM_FOR_WRAPAROUND	(1 << 3)	/* set by autovac only */
/* currently doing logical decoding outside xact */
#define PROC_IN_LOGICAL_DECODING	(1 << 4)
/* this proc's xmin must be included in vacuum horizons in all databases */
#define PROC_AFFECTS_ALL_HORIZONS	(1 << 5)

/* the subset of flags that is reset at EOXact */
#define PROC_VACUUM_STATE_MASK \
	(PROC_VACUUM_FOR_WRAPAROUND | PROC_IN_SAFE_IC | PROC_IN_VACUUM)

/*
 * Xmin-related flags.  Every flag that changes how VACUUM interprets the
 * process' Xmin value must be listed here.
 */
#define PROC_XMIN_FLAGS (PROC_IN_SAFE_IC | PROC_IN_VACUUM)
/*
* We allow a limited number of "weak" relation locks (AccessShareLock,
* RowShareLock, RowExclusiveLock) to be recorded in the PGPROC structure
* (or rather in shared memory referenced from PGPROC) rather than the main
* lock table. This eases contention on the lock manager LWLocks. See
* storage/lmgr/README for additional details.
*
* The slots are organized in groups of FP_LOCK_SLOTS_PER_GROUP; a backend
* has FastPathLockGroupsPerBackend such groups. Because that is a run-time
* variable, FP_LOCK_SLOTS_PER_BACKEND is not a compile-time constant.
*
* NOTE(review): FP_LOCK_GROUPS_PER_BACKEND_MAX is presumably the upper bound
* for FastPathLockGroupsPerBackend -- confirm where it is enforced.
*/
extern PGDLLIMPORT int FastPathLockGroupsPerBackend;
#define FP_LOCK_GROUPS_PER_BACKEND_MAX 1024
#define FP_LOCK_SLOTS_PER_GROUP 16 /* don't change */
#define FP_LOCK_SLOTS_PER_BACKEND (FP_LOCK_SLOTS_PER_GROUP * FastPathLockGroupsPerBackend)
/*
* Flags for PGPROC.delayChkptFlags
*
* These flags can be used to delay the start or completion of a checkpoint
* for short periods. A flag is in effect if the corresponding bit is set in
* the PGPROC of any backend.
*
* For our purposes here, a checkpoint has three phases: (1) determine the
* location to which the redo pointer will be moved, (2) write all the
* data durably to disk, and (3) WAL-log the checkpoint.
*
* Setting DELAY_CHKPT_START prevents the system from moving from phase 1
* to phase 2. This is useful when we are performing a WAL-logged modification
* of data that will be flushed to disk in phase 2. By setting this flag
* before writing WAL and clearing it after we've both written WAL and
* performed the corresponding modification, we ensure that if the WAL record
* is inserted prior to the new redo point, the corresponding data changes will
* also be flushed to disk before the checkpoint can complete. (In the
* extremely common case where the data being modified is in shared buffers
* and we acquire an exclusive content lock on the relevant buffers before
* writing WAL, this mechanism is not needed, because phase 2 will block
* until we release the content lock and then flush the modified data to
* disk.)
*
* Setting DELAY_CHKPT_COMPLETE prevents the system from moving from phase 2
* to phase 3. This is useful if we are performing a WAL-logged operation that
* might invalidate buffers, such as relation truncation. In this case, we need
* to ensure that any buffers which were invalidated and thus not flushed by
* the checkpoint are actually destroyed on disk. Replay can cope with a file
* or block that doesn't exist, but not with a block that has the wrong
* contents.
*/
#define DELAY_CHKPT_START		(1 << 0)	/* hold off phase 1 -> phase 2 */
#define DELAY_CHKPT_COMPLETE	(1 << 1)	/* hold off phase 2 -> phase 3 */
/*
 * Status of a lock wait.  A value of this type is kept in
 * PGPROC->waitStatus, returned by JoinWaitQueue() and ProcSleep(), and
 * handed to ProcWakeup().
 */
typedef enum
{
	PROC_WAIT_STATUS_OK = 0,
	PROC_WAIT_STATUS_WAITING = 1,
	PROC_WAIT_STATUS_ERROR = 2
} ProcWaitStatus;
/*
* Each backend has a PGPROC struct in shared memory. There is also a list of
* currently-unused PGPROC structs that will be reallocated to new backends.
*
* links: list link for any list the PGPROC is in. When waiting for a lock,
* the PGPROC is linked into that lock's waitProcs queue. A recycled PGPROC
* is linked into ProcGlobal's freeProcs list.
*
* Note: twophase.c also sets up a dummy PGPROC struct for each currently
* prepared transaction. These PGPROCs appear in the ProcArray data structure
* so that the prepared transactions appear to be still running and are
* correctly shown as holding locks. A prepared transaction PGPROC can be
* distinguished from a real one at need by the fact that it has pid == 0.
* The semaphore and lock-activity fields in a prepared-xact PGPROC are unused,
* but its myProcLocks[] lists are valid.
*
* We allow many fields of this struct to be accessed without locks, such as
* delayChkptFlags and isBackgroundWorker. However, keep in mind that writing
* mirrored ones (see below) requires holding ProcArrayLock or XidGenLock in
* at least shared mode, so that pgxactoff does not change concurrently.
*
* Mirrored fields:
*
* Some fields in PGPROC (see "mirrored in ..." comment) are mirrored into an
* element of more densely packed ProcGlobal arrays. These arrays are indexed
* by PGPROC->pgxactoff. Both copies need to be maintained coherently.
*
* NB: The pgxactoff indexed value can *never* be accessed without holding
* locks.
*
* See PROC_HDR for details.
*/
/*
* Per-backend shared memory state. See the overview comment above for the
* locking rules and for the meaning of "mirrored" fields.
*/
struct PGPROC
{
dlist_node links; /* list link if process is in a list */
dlist_head *procgloballist; /* procglobal list that owns this PGPROC */
PGSemaphore sem; /* ONE semaphore to sleep on */
ProcWaitStatus waitStatus; /* see ProcWaitStatus; NOTE(review):
* presumably the status handed over by
* ProcWakeup() -- confirm */
Latch procLatch; /* generic latch for process */
TransactionId xid; /* id of top-level transaction currently being
* executed by this proc, if running and XID
* is assigned; else InvalidTransactionId.
* mirrored in ProcGlobal->xids[pgxactoff] */
TransactionId xmin; /* minimal running XID as it was when we were
* starting our xact, excluding LAZY VACUUM:
* vacuum must not remove tuples deleted by
* xid >= xmin ! */
int pid; /* Backend's process ID; 0 if prepared xact */
int pgxactoff; /* offset into various ProcGlobal->arrays with
* data mirrored from this PGPROC */
/*
* Currently running top-level transaction's virtual xid. Together these
* form a VirtualTransactionId, but we don't use that struct because this
* is not atomically assignable as a whole, and we want to enforce code to
* consider both parts separately. See comments at VirtualTransactionId.
*/
struct
{
ProcNumber procNumber; /* For regular backends, equal to
* GetNumberFromPGProc(proc). For prepared
* xacts, ID of the original backend that
* processed the transaction. For unused
* PGPROC entries, INVALID_PROC_NUMBER. */
LocalTransactionId lxid; /* local id of top-level transaction
* currently being executed by this
* proc, if running; else
* InvalidLocalTransactionId */
} vxid;
/* These fields are zero while a backend is still starting up: */
Oid databaseId; /* OID of database this backend is using */
Oid roleId; /* OID of role using this backend */
Oid tempNamespaceId; /* OID of temp schema this backend is
* using */
bool isBackgroundWorker; /* true if background worker. */
/*
* While in hot standby mode, shows that a conflict signal has been sent
* for the current transaction. Set/cleared while holding ProcArrayLock,
* though not required. Accessed without lock, if needed.
*/
bool recoveryConflictPending;
/* Info about LWLock the process is currently waiting for, if any. */
uint8 lwWaiting; /* see LWLockWaitState */
uint8 lwWaitMode; /* lwlock mode being waited for */
proclist_node lwWaitLink; /* position in LW lock wait list */
/* Support for condition variables. */
proclist_node cvWaitLink; /* position in CV wait list */
/* Info about lock the process is currently waiting for, if any. */
/* waitLock and waitProcLock are NULL if not currently waiting. */
LOCK *waitLock; /* Lock object we're sleeping on ... */
PROCLOCK *waitProcLock; /* Per-holder info for awaited lock */
LOCKMODE waitLockMode; /* type of lock we're waiting for */
LOCKMASK heldLocks; /* bitmask for lock types already held on this
* lock object by this backend */
pg_atomic_uint64 waitStart; /* time at which wait for lock acquisition
* started */
int delayChkptFlags; /* for DELAY_CHKPT_* flags */
uint8 statusFlags; /* this backend's status flags, see PROC_*
* above. mirrored in
* ProcGlobal->statusFlags[pgxactoff] */
/*
* Info to allow us to wait for synchronous replication, if needed.
* waitLSN is InvalidXLogRecPtr if not waiting; set only by user backend.
* syncRepState must not be touched except by owning process or WALSender.
* syncRepLinks used only while holding SyncRepLock.
*/
XLogRecPtr waitLSN; /* waiting for this LSN or higher */
int syncRepState; /* wait state for sync rep */
dlist_node syncRepLinks; /* list link if process is in syncrep queue */
/*
* All PROCLOCK objects for locks held or awaited by this backend are
* linked into one of these lists, according to the partition number of
* their lock.
*/
dlist_head myProcLocks[NUM_LOCK_PARTITIONS];
XidCacheStatus subxidStatus; /* mirrored in
* ProcGlobal->subxidStates[pgxactoff] */
struct XidCache subxids; /* cache for subtransaction XIDs */
/* Support for group XID clearing. */
/* true, if member of ProcArray group waiting for XID clear */
bool procArrayGroupMember;
/* next ProcArray group member waiting for XID clear */
pg_atomic_uint32 procArrayGroupNext;
/*
* latest transaction id among the transaction's main XID and
* subtransactions
*/
TransactionId procArrayGroupMemberXid;
uint32 wait_event_info; /* proc's wait information */
/* Support for group transaction status update. */
bool clogGroupMember; /* true, if member of clog group */
pg_atomic_uint32 clogGroupNext; /* next clog group member */
TransactionId clogGroupMemberXid; /* transaction id of clog group member */
XidStatus clogGroupMemberXidStatus; /* transaction status of clog
* group member */
int64 clogGroupMemberPage; /* clog page corresponding to
* transaction id of clog group member */
XLogRecPtr clogGroupMemberLsn; /* WAL location of commit record for clog
* group member */
/*
* Lock manager data, recording fast-path locks taken by this backend.
* fpLockBits and fpRelId are pointers because the fast-path slots live in
* shared memory referenced from the PGPROC rather than in the struct
* itself.
*/
LWLock fpInfoLock; /* protects per-backend fast-path state */
uint64 *fpLockBits; /* lock modes held for each fast-path slot */
Oid *fpRelId; /* slots for rel oids */
bool fpVXIDLock; /* are we holding a fast-path VXID lock? */
LocalTransactionId fpLocalTransactionId; /* lxid for fast-path VXID
* lock */
/*
* Support for lock groups. Use LockHashPartitionLockByProc on the group
* leader to get the LWLock protecting these fields.
*/
PGPROC *lockGroupLeader; /* lock group leader, if I'm a member */
dlist_head lockGroupMembers; /* list of members, if I'm a leader */
dlist_node lockGroupLink; /* my member link, if I'm a member */
};
/* NOTE: "typedef struct PGPROC PGPROC" appears in storage/lock.h. */
/* The PGPROC of this process. */
extern PGDLLIMPORT PGPROC *MyProc;
/* Proc number of this backend. Equal to GetNumberFromPGProc(MyProc). */
extern PGDLLIMPORT ProcNumber MyProcNumber;
/* Our parallel session leader, or INVALID_PROC_NUMBER if none */
extern PGDLLIMPORT ProcNumber ParallelLeaderProcNumber;
/*
 * Proc number under which this session's temp relations are filed.  A
 * parallel worker must use its leader's number; everybody else uses their
 * own.
 */
#define ProcNumberForTempRelations() \
	(ParallelLeaderProcNumber != INVALID_PROC_NUMBER ? ParallelLeaderProcNumber : MyProcNumber)
/*
* There is one ProcGlobal struct for the whole database cluster.
*
* Adding/Removing an entry into the procarray requires holding *both*
* ProcArrayLock and XidGenLock in exclusive mode (in that order). Both are
* needed because the dense arrays (see below) are accessed from
* GetNewTransactionId() and GetSnapshotData(), and we don't want to add
* further contention by both using the same lock. Adding/Removing a procarray
* entry is much less frequent.
*
* Some fields in PGPROC are mirrored into more densely packed arrays (e.g.
* xids), with one entry for each backend. These arrays only contain entries
* for PGPROCs that have been added to the shared array with ProcArrayAdd()
* (in contrast to PGPROC array which has unused PGPROCs interspersed).
*
* The dense arrays are indexed by PGPROC->pgxactoff. Any concurrent
* ProcArrayAdd() / ProcArrayRemove() can cause the pgxactoff of a procarray
* member to change. Therefore it is only safe to use PGPROC->pgxactoff to
* access the dense array while holding either ProcArrayLock or XidGenLock.
*
* As long as a PGPROC is in the procarray, the mirrored values need to be
* maintained in both places in a coherent manner.
*
* The denser separate arrays are beneficial for three main reasons: First, to
* allow for as tight loops accessing the data as possible. Second, to prevent
* updates of frequently changing data (e.g. xmin) from invalidating
* cachelines also containing less frequently changing data (e.g. xid,
* statusFlags). Third, to condense frequently accessed data into as few
* cachelines as possible.
*
* There are two main reasons to have the data mirrored between these dense
* arrays and PGPROC. First, as explained above, a PGPROC's array entries can
* only be accessed with either ProcArrayLock or XidGenLock held, whereas the
* PGPROC entries do not require that (obviously there may still be locking
* requirements around the individual field, separate from the concerns
* here). That is particularly important for a backend to efficiently check
* its own values, which it often can safely do without locking. Second, the
* PGPROC fields allow avoiding unnecessary accesses and modifications to the
* dense arrays. A backend's own PGPROC is more likely to be in a local cache,
* whereas the cachelines for the dense array will be modified by other
* backends (often removing it from the cache for other cores/sockets). At
* commit/abort time a check of the PGPROC value can avoid accessing/dirtying
* the corresponding array value.
*
* Basically it makes sense to access the PGPROC variable when checking a
* single backend's data, especially when already looking at the PGPROC for
* other reasons. It makes sense to look at the "dense" arrays if we
* need to look at many / most entries, because we then benefit from the
* reduced indirection and better cross-process cache-ability.
*
* When entering a PGPROC for 2PC transactions with ProcArrayAdd(), the data
* in the dense arrays is initialized from the PGPROC while it already holds
* ProcArrayLock.
*/
/*
* Cluster-wide header for all PGPROC state: the PGPROC array itself, the
* dense arrays mirroring selected PGPROC fields (indexed by
* PGPROC->pgxactoff), the free lists, and a few global bookkeeping values.
*/
typedef struct PROC_HDR
{
/* Array of PGPROC structures (not including dummies for prepared txns) */
PGPROC *allProcs;
/* Array mirroring PGPROC.xid for each PGPROC currently in the procarray */
TransactionId *xids;
/*
* Array mirroring PGPROC.subxidStatus for each PGPROC currently in the
* procarray.
*/
XidCacheStatus *subxidStates;
/*
* Array mirroring PGPROC.statusFlags for each PGPROC currently in the
* procarray.
*/
uint8 *statusFlags;
/* Length of allProcs array */
uint32 allProcCount;
/* Head of list of free PGPROC structures */
dlist_head freeProcs;
/* Head of list of autovacuum's free PGPROC structures */
dlist_head autovacFreeProcs;
/* Head of list of bgworker free PGPROC structures */
dlist_head bgworkerFreeProcs;
/* Head of list of walsender free PGPROC structures */
dlist_head walsenderFreeProcs;
/*
* First pgproc waiting for group XID clear (the chain continues through
* PGPROC->procArrayGroupNext)
*/
pg_atomic_uint32 procArrayGroupFirst;
/*
* First pgproc waiting for group transaction status update (the chain
* continues through PGPROC->clogGroupNext)
*/
pg_atomic_uint32 clogGroupFirst;
/*
* Current slot numbers of some auxiliary processes. There can be only one
* of each of these running at a time.
*/
ProcNumber walwriterProc;
ProcNumber checkpointerProc;
/* Current shared estimate of appropriate spins_per_delay value */
int spins_per_delay;
/* Buffer id of the buffer that Startup process waits for pin on, or -1 */
int startupBufferPinWaitBufId;
} PROC_HDR;
/* The single, cluster-wide PROC_HDR. */
extern PGDLLIMPORT PROC_HDR *ProcGlobal;
/*
* NOTE(review): presumably the dummy PGPROCs set up for prepared
* transactions, which are not part of ProcGlobal->allProcs -- confirm.
*/
extern PGDLLIMPORT PGPROC *PreparedXactProcs;
/*
 * Translate between a ProcNumber and the PGPROC it designates.  A proc
 * number is simply the element's index in the ProcGlobal->allProcs array.
 */
#define GetPGProcByNumber(n) (ProcGlobal->allProcs + (n))
#define GetNumberFromPGProc(proc) ((proc) - ProcGlobal->allProcs)
/*
* We set aside some extra PGPROC structures for auxiliary processes,
* ie things that aren't full-fledged backends but need shmem access.
*
* Background writer, checkpointer, WAL writer, WAL summarizer, and archiver
* run during normal operation (5 slots). Startup process and WAL receiver
* need 2 more, but WAL writer is launched only after startup has exited, so
* those two never occupy a slot at the same time and 6 slots are enough.
*/
#define NUM_AUXILIARY_PROCS 6
/*
* configurable options
*
* NOTE(review): the *Timeout variables are presumably expressed in
* milliseconds -- confirm against the GUC definitions.
*/
extern PGDLLIMPORT int DeadlockTimeout;
extern PGDLLIMPORT int StatementTimeout;
extern PGDLLIMPORT int LockTimeout;
extern PGDLLIMPORT int IdleInTransactionSessionTimeout;
extern PGDLLIMPORT int TransactionTimeout;
extern PGDLLIMPORT int IdleSessionTimeout;
extern PGDLLIMPORT bool log_lock_waits;
/* These are declared here only in EXEC_BACKEND builds. */
#ifdef EXEC_BACKEND
extern PGDLLIMPORT slock_t *ProcStructLock;
extern PGDLLIMPORT PGPROC *AuxiliaryProcs;
#endif
/*
* Function Prototypes
*/
extern int ProcGlobalSemas(void);
extern Size ProcGlobalShmemSize(void);
extern void InitProcGlobal(void);
extern void InitProcess(void);
extern void InitProcessPhase2(void);
extern void InitAuxiliaryProcess(void);
/* Accessors for ProcGlobal->startupBufferPinWaitBufId (NOTE(review): presumably -- confirm) */
extern void SetStartupBufferPinWaitBufId(int bufid);
extern int GetStartupBufferPinWaitBufId(void);
extern bool HaveNFreeProcs(int n, int *nfree);
extern void ProcReleaseLocks(bool isCommit);
/*
* Waiting for a lock is done in two steps. JoinWaitQueue() is called while
* holding the lock's partition lock and inserts the current process into
* the wait queue; if it detects an "early deadlock" before joining the
* queue, it returns without changing the shared lock entry and leaves that
* cleanup to the caller, just like in the dontWait=true case. ProcSleep()
* does the actual sleeping; it is called without holding the partition lock
* and does not re-acquire it before returning.
*
* Both functions only modify shared state; updating the local lock table is
* the caller's responsibility.
*/
extern ProcWaitStatus JoinWaitQueue(LOCALLOCK *locallock,
LockMethod lockMethodTable, bool dontWait);
extern ProcWaitStatus ProcSleep(LOCALLOCK *locallock);
extern void ProcWakeup(PGPROC *proc, ProcWaitStatus waitStatus);
extern void ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock);
extern void CheckDeadLockAlert(void);
extern void LockErrorCleanup(void);
extern void ProcWaitForSignal(uint32 wait_event_info);
extern void ProcSendSignal(ProcNumber procNumber);
extern PGPROC *AuxiliaryPidGetProc(int pid);
/* Lock group support, see PGPROC->lockGroupLeader and related fields */
extern void BecomeLockGroupLeader(void);
extern bool BecomeLockGroupMember(PGPROC *leader, int pid);
#endif /* _PROC_H_ */