Limit VACUUM truncation retries after scan interruption

This commit is contained in:
Shayon Mukherjee 2025-05-05 15:32:41 -04:00
parent 94b84a6072
commit 060c89a566
3 changed files with 68 additions and 0 deletions

View File

@ -179,6 +179,8 @@
#define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL 20 /* ms */
#define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL 50 /* ms */
#define VACUUM_TRUNCATE_LOCK_TIMEOUT 5000 /* ms */
/* Max retries for the truncation attempt if the backward scan is interrupted */
#define VACUUM_TRUNCATE_INTERRUPTION_MAX_RETRIES 3
/*
* Threshold that controls whether we bypass index vacuuming and heap
@ -3213,6 +3215,7 @@ lazy_truncate_heap(LVRelState *vacrel)
BlockNumber new_rel_pages;
bool lock_waiter_detected;
int lock_retry;
int truncate_interruption_retry_count = 0;
/* Report that we are now truncating */
pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
@ -3227,6 +3230,19 @@ lazy_truncate_heap(LVRelState *vacrel)
*/
do
{
/*
 * Check if we've retried too many times due to interruptions. We
 * check at the top of the loop so that at least one full attempt is
 * made even if VACUUM_TRUNCATE_INTERRUPTION_MAX_RETRIES is set low.
 */
if (truncate_interruption_retry_count >= VACUUM_TRUNCATE_INTERRUPTION_MAX_RETRIES)
{
ereport(vacrel->verbose ? INFO : DEBUG2,
(errmsg("table \"%s\": stopping truncate after %d retries due to repeated conflicting lock requests",
vacrel->relname, truncate_interruption_retry_count)));
break;
}
/*
* We need full exclusive lock on the relation in order to do
* truncation. If we can't get it, give up rather than waiting --- we
@ -3329,6 +3345,14 @@ lazy_truncate_heap(LVRelState *vacrel)
vacrel->relname,
orig_rel_pages, new_rel_pages)));
orig_rel_pages = new_rel_pages;
/*
* Increment retry count only if we were interrupted and will loop
* again
*/
if (lock_waiter_detected && new_rel_pages > vacrel->nonempty_pages)
truncate_interruption_retry_count++;
} while (new_rel_pages > vacrel->nonempty_pages && lock_waiter_detected);
}

View File

@ -116,3 +116,4 @@ test: serializable-parallel-2
test: serializable-parallel-3
test: matview-write-skew
test: lock-nowait
test: vacuum-truncate-retry-limit

View File

@ -0,0 +1,43 @@
# Test for VACUUM hitting the retry limit when truncation is repeatedly
# interrupted by conflicting locks during the backward scan phase.
#
# NOTE(review): the outcome is timing-dependent — s1's VACUUM may complete
# truncation before s2 acquires any lock, in which case the retry-limit
# message never appears. Confirm the expected output file tolerates both
# orderings, or that the isolation tester's step scheduling guarantees the
# interleaving this permutation intends.
setup
{
CREATE TABLE vac_retry_tab (a int, b char(100));
-- Need enough rows/pages to make truncation meaningful and the backward scan non-trivial
INSERT INTO vac_retry_tab SELECT g, 'foo' FROM generate_series(1, 20000) g;
ALTER TABLE vac_retry_tab SET (autovacuum_enabled = false);
-- Delete most rows, leaving only the first few pages populated
DELETE FROM vac_retry_tab WHERE a > 500;
-- Initial vacuum to clean up dead rows, setting up for truncation test
VACUUM vac_retry_tab;
}
teardown
{
DROP TABLE vac_retry_tab;
}
session s1
# VERBOSE so the retry-limit message (emitted at INFO in verbose mode) is
# visible in the test output.
step s1_vacuum { VACUUM (VERBOSE) vac_retry_tab; }
# Runs in the same session as s1_vacuum, so it implicitly waits for
# s1_vacuum to complete or error out before executing.
step s1_finish_check { SELECT 1; }
session s2
# Repeatedly take and release a lock that conflicts with AccessExclusiveLock.
# Each lock/unlock pair is intended to interrupt one backward-scan attempt
# in lazy_truncate_heap, incrementing VACUUM's interruption retry counter.
step s2_lock1 { BEGIN; LOCK vac_retry_tab IN ACCESS SHARE MODE; }
step s2_unlock1 { COMMIT; }
step s2_lock2 { BEGIN; LOCK vac_retry_tab IN ACCESS SHARE MODE; }
step s2_unlock2 { COMMIT; }
step s2_lock3 { BEGIN; LOCK vac_retry_tab IN ACCESS SHARE MODE; }
step s2_unlock3 { COMMIT; }
# This last lock might be held while VACUUM finally gives up or finishes
step s2_lock4 { BEGIN; LOCK vac_retry_tab IN ACCESS SHARE MODE; }
step s2_unlock4 { COMMIT; }
# The permutation aims to have s1 acquire the AccessExclusiveLock and start
# the count_nondeletable_pages scan, then s2 repeatedly interrupts it.
# s1_finish_check will wait until s1_vacuum finishes (either by completing
# truncation, hitting the retry limit, or erroring).
# We expect s1_vacuum to log the message about hitting the retry limit.
permutation s1_vacuum s2_lock1 s2_unlock1 s2_lock2 s2_unlock2 s2_lock3 s2_unlock3 s2_lock4 s1_finish_check s2_unlock4