pgstat: Allow checksum errors to be reported in critical sections

For AIO we execute completion callbacks in critical sections. This ensures
that AIO can in the future be used for WAL, which in turn requires that
completion callbacks can be called in critical sections, to get the resources
for WAL IO. To report checksum errors, a backend now has to call
pgstat_prepare_report_checksum_failure() before entering a critical section.
That call guarantees that the relevant pgstats entry is in shared memory, that
the relevant DSM segment is mapped into the backend's memory, and that the
address is known via a PgStat_EntryRef.

Reviewed-by: Noah Misch <noah@leadboat.com>
Discussion: https://postgr.es/m/wkjj4p2rmkevutkwc6tewoovdqznj6c6nvjmvii4oo5wmbh5sr@retq7d6uqs4j
This commit is contained in:
Andres Freund 2025-03-30 16:10:51 -04:00
parent 4244cf6876
commit b96d3c3897
5 changed files with 52 additions and 3 deletions

View File

@ -1817,6 +1817,7 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename,
checksum_failures,
readfilename, checksum_failures)));
pgstat_prepare_report_checksum_failure(dboid);
pgstat_report_checksum_failures_in_db(dboid, checksum_failures);
}

View File

@ -524,6 +524,7 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
{
RelFileLocatorBackend rloc = src->smgr_rlocator;
pgstat_prepare_report_checksum_failure(rloc.locator.dbOid);
pgstat_report_checksum_failures_in_db(rloc.locator.dbOid, 1);
}

View File

@ -1590,6 +1590,7 @@ WaitReadBuffers(ReadBuffersOperation *operation)
{
RelFileLocatorBackend rloc = operation->smgr->smgr_rlocator;
pgstat_prepare_report_checksum_failure(rloc.locator.dbOid);
pgstat_report_checksum_failures_in_db(rloc.locator.dbOid, 1);
}

View File

@ -133,8 +133,34 @@ pgstat_report_deadlock(void)
dbent->deadlocks++;
}
/*
 * Prepare this backend for reporting checksum failures in database dboid
 * from within a critical section.
 *
 * A later report must not have to allocate an EntryRef or map in DSM
 * segments, since neither should happen in a critical section.  Calling this
 * function beforehand takes care of both.
 */
void
pgstat_prepare_report_checksum_failure(Oid dboid)
{
	/* The preparation itself has to happen outside of a critical section. */
	Assert(!CritSectionCount);

	/*
	 * Looking up the database's entry ref with create = true is all that is
	 * needed: once this backend holds the entry ref, a later report can find
	 * it without e.g. having to map in DSM segments.  The returned reference
	 * is deliberately not used here.
	 */
	(void) pgstat_get_entry_ref(PGSTAT_KIND_DATABASE, dboid, InvalidOid, true, NULL);
}
/*
* Report one or more checksum failures.
*
* To be allowed to report checksum failures in critical sections, we require
* pgstat_prepare_report_checksum_failure() to have been called before this
* function is called.
*/
void
pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
@ -147,10 +173,29 @@ pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
/*
* Update the shared stats directly - checksum failures should never be
* common enough for that to be a problem.
* common enough for that to be a problem. Note that we pass create=false
* here, as we want to be sure to not require memory allocations, so this
* can be called in critical sections.
*/
entry_ref =
pgstat_get_entry_ref_locked(PGSTAT_KIND_DATABASE, dboid, InvalidOid, false);
entry_ref = pgstat_get_entry_ref(PGSTAT_KIND_DATABASE, dboid, InvalidOid,
false, NULL);
/*
 * Should always have been created by
 * pgstat_prepare_report_checksum_failure().
 *
 * When not using assertions, we don't want to crash should something have
 * gone wrong, so just warn and return.  The message names checksum failures
 * explicitly, as that is the counter that is being dropped here.
 */
Assert(entry_ref);
if (!entry_ref)
{
	elog(WARNING, "could not report %d checksum failures for database %u",
		 failurecount, dboid);
	return;
}
pgstat_lock_entry(entry_ref, false);
sharedent = (PgStatShared_Database *) entry_ref->shared_stats;
sharedent->stats.checksum_failures += failurecount;

View File

@ -611,6 +611,7 @@ extern void pgstat_drop_database(Oid databaseid);
extern void pgstat_report_autovac(Oid dboid);
extern void pgstat_report_recovery_conflict(int reason);
extern void pgstat_report_deadlock(void);
/*
 * pgstat_prepare_report_checksum_failure() must be called outside of a
 * critical section, before pgstat_report_checksum_failures_in_db() is used
 * for the same database; the report itself may then be made from within a
 * critical section.
 */
extern void pgstat_prepare_report_checksum_failure(Oid dboid);
extern void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount);
extern void pgstat_report_connect(Oid dboid);
extern void pgstat_update_parallel_workers_stats(PgStat_Counter workers_to_launch,