diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index f65c17e5ae4..0e9617bcff4 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1919,11 +1919,6 @@ include_dir 'conf.d' too high. It may be useful to control for this by separately setting . - - Note that for the collection of dead tuple identifiers, - VACUUM is only able to utilize up to a maximum of - 1GB of memory. - @@ -1946,13 +1941,6 @@ include_dir 'conf.d' postgresql.conf file or on the server command line. - - For the collection of dead tuple identifiers, autovacuum is only able - to utilize up to a maximum of 1GB of memory, so - setting autovacuum_work_mem to a value higher than - that has no effect on the number of dead tuples that autovacuum can - collect while scanning a table. - diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 8736eac2841..6a74e4a24df 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -6237,10 +6237,10 @@ FROM pg_stat_get_backend_idset() AS backendid; - max_dead_tuples bigint + max_dead_tuple_bytes bigint - Number of dead tuples that we can store before needing to perform + Amount of dead tuple data that we can store before needing to perform an index vacuum cycle, based on . @@ -6248,10 +6248,10 @@ FROM pg_stat_get_backend_idset() AS backendid; - num_dead_tuples bigint + dead_tuple_bytes bigint - Number of dead tuples collected since the last index vacuum cycle. + Amount of dead tuple data collected since the last index vacuum cycle. diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 9adb33ce9d3..5fb8f7727b3 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -3,18 +3,17 @@ * vacuumlazy.c * Concurrent ("lazy") vacuuming. * - * The major space usage for vacuuming is storage for the array of dead TIDs - * that are to be removed from indexes. We want to ensure we can vacuum even - * the very largest relations with finite memory space usage. To do that, we - * set upper bounds on the number of TIDs we can keep track of at once. + * The major space usage for vacuuming is storage for the dead tuple IDs that + * are to be removed from indexes. We want to ensure we can vacuum even the + * very largest relations with finite memory space usage. To do that, we set + * upper bounds on the memory that can be used for keeping track of dead TIDs + * at once. * * We are willing to use at most maintenance_work_mem (or perhaps - * autovacuum_work_mem) memory space to keep track of dead TIDs. We initially - * allocate an array of TIDs of that size, with an upper limit that depends on - * table size (this limit ensures we don't allocate a huge area uselessly for - * vacuuming small tables). If the array threatens to overflow, we must call - * lazy_vacuum to vacuum indexes (and to vacuum the pages that we've pruned). - * This frees up the memory space dedicated to storing dead TIDs. + * autovacuum_work_mem) memory space to keep track of dead TIDs. If the + * TID store is full, we must call lazy_vacuum to vacuum indexes (and to vacuum + * the pages that we've pruned). This frees up the memory space dedicated to + * storing dead TIDs. * * In practice VACUUM will often complete its initial pass over the target * heap relation without ever running out of space to store TIDs. 
This means @@ -39,6 +38,7 @@ #include "access/heapam_xlog.h" #include "access/htup_details.h" #include "access/multixact.h" +#include "access/tidstore.h" #include "access/transam.h" #include "access/visibilitymap.h" #include "access/xloginsert.h" @@ -179,8 +179,13 @@ typedef struct LVRelState * that has been processed by lazy_scan_prune. Also needed by * lazy_vacuum_heap_rel, which marks the same LP_DEAD line pointers as * LP_UNUSED during second heap pass. + * + * Both dead_items and dead_items_info are allocated in shared memory in + * parallel vacuum cases. */ - VacDeadItems *dead_items; /* TIDs whose index tuples we'll delete */ + TidStore *dead_items; /* TIDs whose index tuples we'll delete */ + VacDeadItemsInfo *dead_items_info; + BlockNumber rel_pages; /* total number of pages */ BlockNumber scanned_pages; /* # pages examined (not skipped via VM) */ BlockNumber removed_pages; /* # pages removed by relation truncation */ @@ -239,8 +244,9 @@ static bool lazy_scan_noprune(LVRelState *vacrel, Buffer buf, static void lazy_vacuum(LVRelState *vacrel); static bool lazy_vacuum_all_indexes(LVRelState *vacrel); static void lazy_vacuum_heap_rel(LVRelState *vacrel); -static int lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, - Buffer buffer, int index, Buffer vmbuffer); +static void lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, + Buffer buffer, OffsetNumber *offsets, + int num_offsets, Buffer vmbuffer); static bool lazy_check_wraparound_failsafe(LVRelState *vacrel); static void lazy_cleanup_all_indexes(LVRelState *vacrel); static IndexBulkDeleteResult *lazy_vacuum_one_index(Relation indrel, @@ -257,6 +263,9 @@ static void lazy_truncate_heap(LVRelState *vacrel); static BlockNumber count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected); static void dead_items_alloc(LVRelState *vacrel, int nworkers); +static void dead_items_add(LVRelState *vacrel, BlockNumber blkno, OffsetNumber *offsets, + int num_offsets); +static void dead_items_reset(LVRelState *vacrel); static void dead_items_cleanup(LVRelState *vacrel); static bool heap_page_is_all_visible(LVRelState *vacrel, Buffer buf, TransactionId *visibility_cutoff_xid, bool *all_frozen); @@ -472,7 +481,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, } /* - * Allocate dead_items array memory using dead_items_alloc. This handles + * Allocate dead_items memory using dead_items_alloc. This handles * parallel VACUUM initialization as part of allocating shared memory * space used for dead_items. (But do a failsafe precheck first, to * ensure that parallel VACUUM won't be attempted at all when relfrozenxid @@ -782,7 +791,7 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, * have collected the TIDs whose index tuples need to be removed. * * Finally, invokes lazy_vacuum_heap_rel to vacuum heap pages, which - * largely consists of marking LP_DEAD items (from collected TID array) + * largely consists of marking LP_DEAD items (from vacrel->dead_items) * as LP_UNUSED. 
This has to happen in a second, final pass over the * heap, to preserve a basic invariant that all index AMs rely on: no * extant index tuple can ever be allowed to contain a TID that points to @@ -811,19 +820,20 @@ lazy_scan_heap(LVRelState *vacrel) next_fsm_block_to_vacuum = 0; bool all_visible_according_to_vm; - VacDeadItems *dead_items = vacrel->dead_items; + TidStore *dead_items = vacrel->dead_items; + VacDeadItemsInfo *dead_items_info = vacrel->dead_items_info; Buffer vmbuffer = InvalidBuffer; const int initprog_index[] = { PROGRESS_VACUUM_PHASE, PROGRESS_VACUUM_TOTAL_HEAP_BLKS, - PROGRESS_VACUUM_MAX_DEAD_TUPLES + PROGRESS_VACUUM_MAX_DEAD_TUPLE_BYTES }; int64 initprog_val[3]; /* Report that we're scanning the heap, advertising total # of blocks */ initprog_val[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP; initprog_val[1] = rel_pages; - initprog_val[2] = dead_items->max_items; + initprog_val[2] = dead_items_info->max_bytes; pgstat_progress_update_multi_param(3, initprog_index, initprog_val); /* Initialize for the first heap_vac_scan_next_block() call */ @@ -866,8 +876,7 @@ lazy_scan_heap(LVRelState *vacrel) * dead_items TIDs, pause and do a cycle of vacuuming before we tackle * this page. */ - Assert(dead_items->max_items >= MaxHeapTuplesPerPage); - if (dead_items->max_items - dead_items->num_items < MaxHeapTuplesPerPage) + if (TidStoreMemoryUsage(dead_items) > dead_items_info->max_bytes) { /* * Before beginning index vacuuming, we release any pin we may @@ -930,7 +939,7 @@ lazy_scan_heap(LVRelState *vacrel) /* * If we didn't get the cleanup lock, we can still collect LP_DEAD - * items in the dead_items array for later vacuuming, count live and + * items in the dead_items area for later vacuuming, count live and * recently dead tuples for vacuum logging, and determine if this * block could later be truncated. If we encounter any xid/mxids that * require advancing the relfrozenxid/relminxid, we'll have to wait @@ -958,9 +967,9 @@ lazy_scan_heap(LVRelState *vacrel) * Like lazy_scan_noprune(), lazy_scan_prune() will count * recently_dead_tuples and live tuples for vacuum logging, determine * if the block can later be truncated, and accumulate the details of - * remaining LP_DEAD line pointers on the page in the dead_items - * array. These dead items include those pruned by lazy_scan_prune() - * as well we line pointers previously marked LP_DEAD. + * remaining LP_DEAD line pointers on the page into dead_items. These + * dead items include those pruned by lazy_scan_prune() as well as + * line pointers previously marked LP_DEAD. 
*/ if (got_cleanup_lock) lazy_scan_prune(vacrel, buf, blkno, page, @@ -1037,7 +1046,7 @@ lazy_scan_heap(LVRelState *vacrel) * Do index vacuuming (call each index's ambulkdelete routine), then do * related heap vacuuming */ - if (dead_items->num_items > 0) + if (dead_items_info->num_items > 0) lazy_vacuum(vacrel); /* @@ -1766,22 +1775,9 @@ lazy_scan_prune(LVRelState *vacrel, */ if (lpdead_items > 0) { - VacDeadItems *dead_items = vacrel->dead_items; - ItemPointerData tmp; - vacrel->lpdead_item_pages++; - ItemPointerSetBlockNumber(&tmp, blkno); - - for (int i = 0; i < lpdead_items; i++) - { - ItemPointerSetOffsetNumber(&tmp, deadoffsets[i]); - dead_items->items[dead_items->num_items++] = tmp; - } - - Assert(dead_items->num_items <= dead_items->max_items); - pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES, - dead_items->num_items); + dead_items_add(vacrel, blkno, deadoffsets, lpdead_items); /* * It was convenient to ignore LP_DEAD items in all_visible earlier on @@ -1928,7 +1924,7 @@ lazy_scan_prune(LVRelState *vacrel, * lazy_scan_prune, which requires a full cleanup lock. While pruning isn't * performed here, it's quite possible that an earlier opportunistic pruning * operation left LP_DEAD items behind. We'll at least collect any such items - * in the dead_items array for removal from indexes. + * in dead_items for removal from indexes. * * For aggressive VACUUM callers, we may return false to indicate that a full * cleanup lock is required for processing by lazy_scan_prune. This is only @@ -2087,7 +2083,7 @@ lazy_scan_noprune(LVRelState *vacrel, vacrel->NewRelfrozenXid = NoFreezePageRelfrozenXid; vacrel->NewRelminMxid = NoFreezePageRelminMxid; - /* Save any LP_DEAD items found on the page in dead_items array */ + /* Save any LP_DEAD items found on the page in dead_items */ if (vacrel->nindexes == 0) { /* Using one-pass strategy (since table has no indexes) */ @@ -2107,9 +2103,6 @@ lazy_scan_noprune(LVRelState *vacrel, } else if (lpdead_items > 0) { - VacDeadItems *dead_items = vacrel->dead_items; - ItemPointerData tmp; - /* * Page has LP_DEAD items, and so any references/TIDs that remain in * indexes will be deleted during index vacuuming (and then marked @@ -2117,17 +2110,7 @@ lazy_scan_noprune(LVRelState *vacrel, */ vacrel->lpdead_item_pages++; - ItemPointerSetBlockNumber(&tmp, blkno); - - for (int i = 0; i < lpdead_items; i++) - { - ItemPointerSetOffsetNumber(&tmp, deadoffsets[i]); - dead_items->items[dead_items->num_items++] = tmp; - } - - Assert(dead_items->num_items <= dead_items->max_items); - pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES, - dead_items->num_items); + dead_items_add(vacrel, blkno, deadoffsets, lpdead_items); vacrel->lpdead_items += lpdead_items; } @@ -2177,7 +2160,7 @@ lazy_vacuum(LVRelState *vacrel) if (!vacrel->do_index_vacuuming) { Assert(!vacrel->do_index_cleanup); - vacrel->dead_items->num_items = 0; + dead_items_reset(vacrel); return; } @@ -2206,7 +2189,7 @@ lazy_vacuum(LVRelState *vacrel) BlockNumber threshold; Assert(vacrel->num_index_scans == 0); - Assert(vacrel->lpdead_items == vacrel->dead_items->num_items); + Assert(vacrel->lpdead_items == vacrel->dead_items_info->num_items); Assert(vacrel->do_index_vacuuming); Assert(vacrel->do_index_cleanup); @@ -2234,7 +2217,7 @@ lazy_vacuum(LVRelState *vacrel) */ threshold = (double) vacrel->rel_pages * BYPASS_THRESHOLD_PAGES; bypass = (vacrel->lpdead_item_pages < threshold && - vacrel->lpdead_items < MAXDEADITEMS(32L * 1024L * 1024L)); + (TidStoreMemoryUsage(vacrel->dead_items) < (32L * 
1024L * 1024L))); } if (bypass) @@ -2279,7 +2262,7 @@ lazy_vacuum(LVRelState *vacrel) * Forget the LP_DEAD items that we just vacuumed (or just decided to not * vacuum) */ - vacrel->dead_items->num_items = 0; + dead_items_reset(vacrel); } /* @@ -2371,7 +2354,7 @@ lazy_vacuum_all_indexes(LVRelState *vacrel) * place). */ Assert(vacrel->num_index_scans > 0 || - vacrel->dead_items->num_items == vacrel->lpdead_items); + vacrel->dead_items_info->num_items == vacrel->lpdead_items); Assert(allindexes || VacuumFailsafeActive); /* @@ -2393,9 +2376,8 @@ lazy_vacuum_all_indexes(LVRelState *vacrel) /* * lazy_vacuum_heap_rel() -- second pass over the heap for two pass strategy * - * This routine marks LP_DEAD items in vacrel->dead_items array as LP_UNUSED. - * Pages that never had lazy_scan_prune record LP_DEAD items are not visited - * at all. + * This routine marks LP_DEAD items in vacrel->dead_items as LP_UNUSED. Pages + * that never had lazy_scan_prune record LP_DEAD items are not visited at all. * * We may also be able to truncate the line pointer array of the heap pages we * visit. If there is a contiguous group of LP_UNUSED items at the end of the @@ -2411,10 +2393,11 @@ lazy_vacuum_all_indexes(LVRelState *vacrel) static void lazy_vacuum_heap_rel(LVRelState *vacrel) { - int index = 0; BlockNumber vacuumed_pages = 0; Buffer vmbuffer = InvalidBuffer; LVSavedErrInfo saved_err_info; + TidStoreIter *iter; + TidStoreIterResult *iter_result; Assert(vacrel->do_index_vacuuming); Assert(vacrel->do_index_cleanup); @@ -2429,7 +2412,8 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) VACUUM_ERRCB_PHASE_VACUUM_HEAP, InvalidBlockNumber, InvalidOffsetNumber); - while (index < vacrel->dead_items->num_items) + iter = TidStoreBeginIterate(vacrel->dead_items); + while ((iter_result = TidStoreIterateNext(iter)) != NULL) { BlockNumber blkno; Buffer buf; @@ -2438,7 +2422,7 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) vacuum_delay_point(); - blkno = ItemPointerGetBlockNumber(&vacrel->dead_items->items[index]); + blkno = iter_result->blkno; vacrel->blkno = blkno; /* @@ -2452,7 +2436,8 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL, vacrel->bstrategy); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); - index = lazy_vacuum_heap_page(vacrel, blkno, buf, index, vmbuffer); + lazy_vacuum_heap_page(vacrel, blkno, buf, iter_result->offsets, + iter_result->num_offsets, vmbuffer); /* Now that we've vacuumed the page, record its available space */ page = BufferGetPage(buf); @@ -2462,6 +2447,7 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) RecordPageWithFreeSpace(vacrel->rel, blkno, freespace); vacuumed_pages++; } + TidStoreEndIterate(iter); vacrel->blkno = InvalidBlockNumber; if (BufferIsValid(vmbuffer)) @@ -2471,14 +2457,14 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) * We set all LP_DEAD items from the first heap pass to LP_UNUSED during * the second heap pass. No more, no less. 
*/ - Assert(index > 0); Assert(vacrel->num_index_scans > 1 || - (index == vacrel->lpdead_items && + (vacrel->dead_items_info->num_items == vacrel->lpdead_items && vacuumed_pages == vacrel->lpdead_item_pages)); ereport(DEBUG2, (errmsg("table \"%s\": removed %lld dead item identifiers in %u pages", - vacrel->relname, (long long) index, vacuumed_pages))); + vacrel->relname, (long long) vacrel->dead_items_info->num_items, + vacuumed_pages))); /* Revert to the previous phase information for error traceback */ restore_vacuum_error_info(vacrel, &saved_err_info); @@ -2486,21 +2472,17 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) /* * lazy_vacuum_heap_page() -- free page's LP_DEAD items listed in the - * vacrel->dead_items array. + * vacrel->dead_items store. * * Caller must have an exclusive buffer lock on the buffer (though a full * cleanup lock is also acceptable). vmbuffer must be valid and already have * a pin on blkno's visibility map page. - * - * index is an offset into the vacrel->dead_items array for the first listed - * LP_DEAD item on the page. The return value is the first index immediately - * after all LP_DEAD items for the same page in the array. */ -static int +static void lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, - int index, Buffer vmbuffer) + OffsetNumber *deadoffsets, int num_offsets, + Buffer vmbuffer) { - VacDeadItems *dead_items = vacrel->dead_items; Page page = BufferGetPage(buffer); OffsetNumber unused[MaxHeapTuplesPerPage]; int nunused = 0; @@ -2519,16 +2501,11 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, START_CRIT_SECTION(); - for (; index < dead_items->num_items; index++) + for (int i = 0; i < num_offsets; i++) { - BlockNumber tblk; - OffsetNumber toff; ItemId itemid; + OffsetNumber toff = deadoffsets[i]; - tblk = ItemPointerGetBlockNumber(&dead_items->items[index]); - if (tblk != blkno) - break; /* past end of tuples for this block */ - toff = ItemPointerGetOffsetNumber(&dead_items->items[index]); itemid = PageGetItemId(page, toff); Assert(ItemIdIsDead(itemid) && !ItemIdHasStorage(itemid)); @@ -2592,7 +2569,6 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, /* Revert to the previous phase information for error traceback */ restore_vacuum_error_info(vacrel, &saved_err_info); - return index; } /* @@ -2719,8 +2695,8 @@ lazy_cleanup_all_indexes(LVRelState *vacrel) * lazy_vacuum_one_index() -- vacuum index relation. * * Delete all the index tuples containing a TID collected in - * vacrel->dead_items array. Also update running statistics. - * Exact details depend on index AM's ambulkdelete routine. + * vacrel->dead_items. Also update running statistics. Exact + * details depend on index AM's ambulkdelete routine. * * reltuples is the number of heap tuples to be passed to the * bulkdelete callback. It's always assumed to be estimated. 
@@ -2757,7 +2733,8 @@ lazy_vacuum_one_index(Relation indrel, IndexBulkDeleteResult *istat, InvalidBlockNumber, InvalidOffsetNumber); /* Do bulk deletion */ - istat = vac_bulkdel_one_index(&ivinfo, istat, (void *) vacrel->dead_items); + istat = vac_bulkdel_one_index(&ivinfo, istat, (void *) vacrel->dead_items, + vacrel->dead_items_info); /* Revert to the previous phase information for error traceback */ restore_vacuum_error_info(vacrel, &saved_err_info); @@ -3123,48 +3100,8 @@ count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected) } /* - * Returns the number of dead TIDs that VACUUM should allocate space to - * store, given a heap rel of size vacrel->rel_pages, and given current - * maintenance_work_mem setting (or current autovacuum_work_mem setting, - * when applicable). - * - * See the comments at the head of this file for rationale. - */ -static int -dead_items_max_items(LVRelState *vacrel) -{ - int64 max_items; - int vac_work_mem = AmAutoVacuumWorkerProcess() && - autovacuum_work_mem != -1 ? - autovacuum_work_mem : maintenance_work_mem; - - if (vacrel->nindexes > 0) - { - BlockNumber rel_pages = vacrel->rel_pages; - - max_items = MAXDEADITEMS(vac_work_mem * 1024L); - max_items = Min(max_items, INT_MAX); - max_items = Min(max_items, MAXDEADITEMS(MaxAllocSize)); - - /* curious coding here to ensure the multiplication can't overflow */ - if ((BlockNumber) (max_items / MaxHeapTuplesPerPage) > rel_pages) - max_items = rel_pages * MaxHeapTuplesPerPage; - - /* stay sane if small maintenance_work_mem */ - max_items = Max(max_items, MaxHeapTuplesPerPage); - } - else - { - /* One-pass case only stores a single heap page's TIDs at a time */ - max_items = MaxHeapTuplesPerPage; - } - - return (int) max_items; -} - -/* - * Allocate dead_items (either using palloc, or in dynamic shared memory). - * Sets dead_items in vacrel for caller. + * Allocate dead_items and dead_items_info (either using palloc, or in dynamic + * shared memory). Sets both in vacrel for caller. * * Also handles parallel initialization as part of allocating dead_items in * DSM when required. @@ -3172,11 +3109,10 @@ dead_items_max_items(LVRelState *vacrel) static void dead_items_alloc(LVRelState *vacrel, int nworkers) { - VacDeadItems *dead_items; - int max_items; - - max_items = dead_items_max_items(vacrel); - Assert(max_items >= MaxHeapTuplesPerPage); + VacDeadItemsInfo *dead_items_info; + int vac_work_mem = AmAutoVacuumWorkerProcess() && + autovacuum_work_mem != -1 ? + autovacuum_work_mem : maintenance_work_mem; /* * Initialize state for a parallel vacuum. As of now, only one worker can @@ -3203,24 +3139,72 @@ dead_items_alloc(LVRelState *vacrel, int nworkers) else vacrel->pvs = parallel_vacuum_init(vacrel->rel, vacrel->indrels, vacrel->nindexes, nworkers, - max_items, + vac_work_mem, vacrel->verbose ? INFO : DEBUG2, vacrel->bstrategy); - /* If parallel mode started, dead_items space is allocated in DSM */ + /* + * If parallel mode started, dead_items and dead_items_info spaces are + * allocated in DSM. + */ if (ParallelVacuumIsActive(vacrel)) { - vacrel->dead_items = parallel_vacuum_get_dead_items(vacrel->pvs); + vacrel->dead_items = parallel_vacuum_get_dead_items(vacrel->pvs, + &vacrel->dead_items_info); return; } } - /* Serial VACUUM case */ - dead_items = (VacDeadItems *) palloc(vac_max_items_to_alloc_size(max_items)); - dead_items->max_items = max_items; - dead_items->num_items = 0; + /* + * Serial VACUUM case. Allocate both dead_items and dead_items_info + * locally. 
+ */ - vacrel->dead_items = dead_items; + dead_items_info = (VacDeadItemsInfo *) palloc(sizeof(VacDeadItemsInfo)); + dead_items_info->max_bytes = vac_work_mem * 1024L; + dead_items_info->num_items = 0; + vacrel->dead_items_info = dead_items_info; + + vacrel->dead_items = TidStoreCreateLocal(dead_items_info->max_bytes); +} + +/* + * Add the given block number and offset numbers to dead_items. + */ +static void +dead_items_add(LVRelState *vacrel, BlockNumber blkno, OffsetNumber *offsets, + int num_offsets) +{ + TidStore *dead_items = vacrel->dead_items; + + TidStoreSetBlockOffsets(dead_items, blkno, offsets, num_offsets); + vacrel->dead_items_info->num_items += num_offsets; + + /* update the memory usage report */ + pgstat_progress_update_param(PROGRESS_VACUUM_DEAD_TUPLE_BYTES, + TidStoreMemoryUsage(dead_items)); +} + +/* + * Forget all collected dead items. + */ +static void +dead_items_reset(LVRelState *vacrel) +{ + TidStore *dead_items = vacrel->dead_items; + + if (ParallelVacuumIsActive(vacrel)) + { + parallel_vacuum_reset_dead_items(vacrel->pvs); + return; + } + + /* Recreate the tidstore with the same max_bytes limitation */ + TidStoreDestroy(dead_items); + vacrel->dead_items = TidStoreCreateLocal(vacrel->dead_items_info->max_bytes); + + /* Reset the counter */ + vacrel->dead_items_info->num_items = 0; } /* diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 401fb359478..2e61f6d74e7 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1223,7 +1223,7 @@ CREATE VIEW pg_stat_progress_vacuum AS END AS phase, S.param2 AS heap_blks_total, S.param3 AS heap_blks_scanned, S.param4 AS heap_blks_vacuumed, S.param5 AS index_vacuum_count, - S.param6 AS max_dead_tuples, S.param7 AS num_dead_tuples, + S.param6 AS max_dead_tuple_bytes, S.param7 AS dead_tuple_bytes, S.param8 AS indexes_total, S.param9 AS indexes_processed FROM pg_stat_get_progress_info('VACUUM') AS S LEFT JOIN pg_database D ON S.datid = D.oid; diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index e63c86cae45..b589279d49f 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -116,7 +116,6 @@ static bool vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, static double compute_parallel_delay(void); static VacOptValue get_vacoptval_from_boolean(DefElem *def); static bool vac_tid_reaped(ItemPointer itemptr, void *state); -static int vac_cmp_itemptr(const void *left, const void *right); /* * GUC check function to ensure GUC value specified is within the allowable @@ -2489,16 +2488,16 @@ get_vacoptval_from_boolean(DefElem *def) */ IndexBulkDeleteResult * vac_bulkdel_one_index(IndexVacuumInfo *ivinfo, IndexBulkDeleteResult *istat, - VacDeadItems *dead_items) + TidStore *dead_items, VacDeadItemsInfo *dead_items_info) { /* Do bulk deletion */ istat = index_bulk_delete(ivinfo, istat, vac_tid_reaped, (void *) dead_items); ereport(ivinfo->message_level, - (errmsg("scanned index \"%s\" to remove %d row versions", + (errmsg("scanned index \"%s\" to remove %lld row versions", RelationGetRelationName(ivinfo->index), - dead_items->num_items))); + (long long) dead_items_info->num_items))); return istat; } @@ -2529,82 +2528,15 @@ vac_cleanup_one_index(IndexVacuumInfo *ivinfo, IndexBulkDeleteResult *istat) return istat; } -/* - * Returns the total required space for VACUUM's dead_items array given a - * max_items value. 
- */ -Size -vac_max_items_to_alloc_size(int max_items) -{ - Assert(max_items <= MAXDEADITEMS(MaxAllocSize)); - - return offsetof(VacDeadItems, items) + sizeof(ItemPointerData) * max_items; -} - /* * vac_tid_reaped() -- is a particular tid deletable? * * This has the right signature to be an IndexBulkDeleteCallback. - * - * Assumes dead_items array is sorted (in ascending TID order). */ static bool vac_tid_reaped(ItemPointer itemptr, void *state) { - VacDeadItems *dead_items = (VacDeadItems *) state; - int64 litem, - ritem, - item; - ItemPointer res; + TidStore *dead_items = (TidStore *) state; - litem = itemptr_encode(&dead_items->items[0]); - ritem = itemptr_encode(&dead_items->items[dead_items->num_items - 1]); - item = itemptr_encode(itemptr); - - /* - * Doing a simple bound check before bsearch() is useful to avoid the - * extra cost of bsearch(), especially if dead items on the heap are - * concentrated in a certain range. Since this function is called for - * every index tuple, it pays to be really fast. - */ - if (item < litem || item > ritem) - return false; - - res = (ItemPointer) bsearch(itemptr, - dead_items->items, - dead_items->num_items, - sizeof(ItemPointerData), - vac_cmp_itemptr); - - return (res != NULL); -} - -/* - * Comparator routines for use with qsort() and bsearch(). - */ -static int -vac_cmp_itemptr(const void *left, const void *right) -{ - BlockNumber lblk, - rblk; - OffsetNumber loff, - roff; - - lblk = ItemPointerGetBlockNumber((ItemPointer) left); - rblk = ItemPointerGetBlockNumber((ItemPointer) right); - - if (lblk < rblk) - return -1; - if (lblk > rblk) - return 1; - - loff = ItemPointerGetOffsetNumber((ItemPointer) left); - roff = ItemPointerGetOffsetNumber((ItemPointer) right); - - if (loff < roff) - return -1; - if (loff > roff) - return 1; - - return 0; + return TidStoreIsMember(dead_items, itemptr); } diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c index befda1c1050..5174a4e9753 100644 --- a/src/backend/commands/vacuumparallel.c +++ b/src/backend/commands/vacuumparallel.c @@ -8,8 +8,8 @@ * * In a parallel vacuum, we perform both index bulk deletion and index cleanup * with parallel worker processes. Individual indexes are processed by one - * vacuum process. ParallelVacuumState contains shared information as well as - * the memory space for storing dead items allocated in the DSM segment. We + * vacuum process. ParallelVacuumState contains shared information as well as + * the memory space for storing dead items allocated in the DSA area. We * launch parallel worker processes at the start of parallel index * bulk-deletion and index cleanup and once all indexes are processed, the * parallel worker processes exit. Each time we process indexes in parallel, @@ -45,11 +45,10 @@ * use small integers. */ #define PARALLEL_VACUUM_KEY_SHARED 1 -#define PARALLEL_VACUUM_KEY_DEAD_ITEMS 2 -#define PARALLEL_VACUUM_KEY_QUERY_TEXT 3 -#define PARALLEL_VACUUM_KEY_BUFFER_USAGE 4 -#define PARALLEL_VACUUM_KEY_WAL_USAGE 5 -#define PARALLEL_VACUUM_KEY_INDEX_STATS 6 +#define PARALLEL_VACUUM_KEY_QUERY_TEXT 2 +#define PARALLEL_VACUUM_KEY_BUFFER_USAGE 3 +#define PARALLEL_VACUUM_KEY_WAL_USAGE 4 +#define PARALLEL_VACUUM_KEY_INDEX_STATS 5 /* * Shared information among parallel workers. 
So this is allocated in the DSM @@ -110,6 +109,15 @@ typedef struct PVShared /* Counter for vacuuming and cleanup */ pg_atomic_uint32 idx; + + /* DSA handle where the TidStore lives */ + dsa_handle dead_items_dsa_handle; + + /* DSA pointer to the shared TidStore */ + dsa_pointer dead_items_handle; + + /* Statistics of shared dead items */ + VacDeadItemsInfo dead_items_info; } PVShared; /* Status used during parallel index vacuum or cleanup */ @@ -176,7 +184,7 @@ struct ParallelVacuumState PVIndStats *indstats; /* Shared dead items space among parallel vacuum workers */ - VacDeadItems *dead_items; + TidStore *dead_items; /* Points to buffer usage area in DSM */ BufferUsage *buffer_usage; @@ -232,20 +240,19 @@ static void parallel_vacuum_error_callback(void *arg); */ ParallelVacuumState * parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, - int nrequested_workers, int max_items, + int nrequested_workers, int vac_work_mem, int elevel, BufferAccessStrategy bstrategy) { ParallelVacuumState *pvs; ParallelContext *pcxt; PVShared *shared; - VacDeadItems *dead_items; + TidStore *dead_items; PVIndStats *indstats; BufferUsage *buffer_usage; WalUsage *wal_usage; bool *will_parallel_vacuum; Size est_indstats_len; Size est_shared_len; - Size est_dead_items_len; int nindexes_mwm = 0; int parallel_workers = 0; int querylen; @@ -294,11 +301,6 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, shm_toc_estimate_chunk(&pcxt->estimator, est_shared_len); shm_toc_estimate_keys(&pcxt->estimator, 1); - /* Estimate size for dead_items -- PARALLEL_VACUUM_KEY_DEAD_ITEMS */ - est_dead_items_len = vac_max_items_to_alloc_size(max_items); - shm_toc_estimate_chunk(&pcxt->estimator, est_dead_items_len); - shm_toc_estimate_keys(&pcxt->estimator, 1); - /* * Estimate space for BufferUsage and WalUsage -- * PARALLEL_VACUUM_KEY_BUFFER_USAGE and PARALLEL_VACUUM_KEY_WAL_USAGE. @@ -371,6 +373,14 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, (nindexes_mwm > 0) ? 
maintenance_work_mem / Min(parallel_workers, nindexes_mwm) : maintenance_work_mem; + shared->dead_items_info.max_bytes = vac_work_mem * 1024L; + + /* Prepare DSA space for dead items */ + dead_items = TidStoreCreateShared(shared->dead_items_info.max_bytes, + LWTRANCHE_PARALLEL_VACUUM_DSA); + pvs->dead_items = dead_items; + shared->dead_items_handle = TidStoreGetHandle(dead_items); + shared->dead_items_dsa_handle = dsa_get_handle(TidStoreGetDSA(dead_items)); /* Use the same buffer size for all workers */ shared->ring_nbuffers = GetAccessStrategyBufferCount(bstrategy); @@ -382,15 +392,6 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_SHARED, shared); pvs->shared = shared; - /* Prepare the dead_items space */ - dead_items = (VacDeadItems *) shm_toc_allocate(pcxt->toc, - est_dead_items_len); - dead_items->max_items = max_items; - dead_items->num_items = 0; - MemSet(dead_items->items, 0, sizeof(ItemPointerData) * max_items); - shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_DEAD_ITEMS, dead_items); - pvs->dead_items = dead_items; - /* * Allocate space for each worker's BufferUsage and WalUsage; no need to * initialize @@ -448,6 +449,8 @@ parallel_vacuum_end(ParallelVacuumState *pvs, IndexBulkDeleteResult **istats) istats[i] = NULL; } + TidStoreDestroy(pvs->dead_items); + DestroyParallelContext(pvs->pcxt); ExitParallelMode(); @@ -455,13 +458,40 @@ parallel_vacuum_end(ParallelVacuumState *pvs, IndexBulkDeleteResult **istats) pfree(pvs); } -/* Returns the dead items space */ -VacDeadItems * -parallel_vacuum_get_dead_items(ParallelVacuumState *pvs) +/* + * Returns the dead items space and dead items information. + */ +TidStore * +parallel_vacuum_get_dead_items(ParallelVacuumState *pvs, VacDeadItemsInfo **dead_items_info_p) { + *dead_items_info_p = &(pvs->shared->dead_items_info); return pvs->dead_items; } +/* Forget all items in dead_items */ +void +parallel_vacuum_reset_dead_items(ParallelVacuumState *pvs) +{ + TidStore *dead_items = pvs->dead_items; + VacDeadItemsInfo *dead_items_info = &(pvs->shared->dead_items_info); + + /* + * Free the current tidstore and return allocated DSA segments to the + * operating system. Then we recreate the tidstore with the same max_bytes + * limitation we just used. + */ + TidStoreDestroy(dead_items); + pvs->dead_items = TidStoreCreateShared(dead_items_info->max_bytes, + LWTRANCHE_PARALLEL_VACUUM_DSA); + + /* Update the DSA pointer for dead_items to the new one; note that the old + * store was just destroyed, so these must reference the newly created + * pvs->dead_items rather than the stale local pointer. + */ + pvs->shared->dead_items_dsa_handle = dsa_get_handle(TidStoreGetDSA(pvs->dead_items)); + pvs->shared->dead_items_handle = TidStoreGetHandle(pvs->dead_items); + + /* Reset the counter */ + dead_items_info->num_items = 0; +} + /* * Do parallel index bulk-deletion with parallel workers. 
*/ @@ -861,7 +891,8 @@ parallel_vacuum_process_one_index(ParallelVacuumState *pvs, Relation indrel, switch (indstats->status) { case PARALLEL_INDVAC_STATUS_NEED_BULKDELETE: - istat_res = vac_bulkdel_one_index(&ivinfo, istat, pvs->dead_items); + istat_res = vac_bulkdel_one_index(&ivinfo, istat, pvs->dead_items, + &pvs->shared->dead_items_info); break; case PARALLEL_INDVAC_STATUS_NEED_CLEANUP: istat_res = vac_cleanup_one_index(&ivinfo, istat); @@ -961,7 +992,7 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) Relation *indrels; PVIndStats *indstats; PVShared *shared; - VacDeadItems *dead_items; + TidStore *dead_items; BufferUsage *buffer_usage; WalUsage *wal_usage; int nindexes; @@ -1005,10 +1036,9 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) PARALLEL_VACUUM_KEY_INDEX_STATS, false); - /* Set dead_items space */ - dead_items = (VacDeadItems *) shm_toc_lookup(toc, - PARALLEL_VACUUM_KEY_DEAD_ITEMS, - false); + /* Find dead_items in shared memory */ + dead_items = TidStoreAttach(shared->dead_items_dsa_handle, + shared->dead_items_handle); /* Set cost-based vacuum delay */ VacuumUpdateCosts(); @@ -1056,6 +1086,8 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber], &wal_usage[ParallelWorkerNumber]); + TidStoreDetach(dead_items); + /* Pop the error context stack */ error_context_stack = errcallback.previous; diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 83992725de3..b1e388dc7c9 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -168,6 +168,7 @@ static const char *const BuiltinTrancheNames[] = { [LWTRANCHE_SERIAL_SLRU] = "SerialSLRU", [LWTRANCHE_SUBTRANS_SLRU] = "SubtransSLRU", [LWTRANCHE_XACT_SLRU] = "XactSLRU", + [LWTRANCHE_PARALLEL_VACUUM_DSA] = "ParallelVacuumDSA", }; StaticAssertDecl(lengthof(BuiltinTrancheNames) == diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 8d0571a03d1..d39d8d7e878 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -376,6 +376,7 @@ NotifySLRU "Waiting to access the NOTIFY message SLRU cache." SerialSLRU "Waiting to access the serializable transaction conflict SLRU cache." SubtransSLRU "Waiting to access the sub-transaction SLRU cache." XactSLRU "Waiting to access the transaction status SLRU cache." +ParallelVacuumDSA "Waiting for parallel vacuum dynamic shared memory allocation." 
# diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 86ace33dbeb..950f00bed48 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202403302 +#define CATALOG_VERSION_NO 202404021 #endif diff --git a/src/include/commands/progress.h b/src/include/commands/progress.h index 73afa77a9c7..82a8fe6bd14 100644 --- a/src/include/commands/progress.h +++ b/src/include/commands/progress.h @@ -23,8 +23,8 @@ #define PROGRESS_VACUUM_HEAP_BLKS_SCANNED 2 #define PROGRESS_VACUUM_HEAP_BLKS_VACUUMED 3 #define PROGRESS_VACUUM_NUM_INDEX_VACUUMS 4 -#define PROGRESS_VACUUM_MAX_DEAD_TUPLES 5 -#define PROGRESS_VACUUM_NUM_DEAD_TUPLES 6 +#define PROGRESS_VACUUM_MAX_DEAD_TUPLE_BYTES 5 +#define PROGRESS_VACUUM_DEAD_TUPLE_BYTES 6 #define PROGRESS_VACUUM_INDEXES_TOTAL 7 #define PROGRESS_VACUUM_INDEXES_PROCESSED 8 diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index 68068dd9003..9514f8b2fd8 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -17,6 +17,7 @@ #include "access/htup.h" #include "access/genam.h" #include "access/parallel.h" +#include "access/tidstore.h" #include "catalog/pg_class.h" #include "catalog/pg_statistic.h" #include "catalog/pg_type.h" @@ -293,19 +294,14 @@ struct VacuumCutoffs }; /* - * VacDeadItems stores TIDs whose index tuples are deleted by index vacuuming. + * VacDeadItemsInfo stores supplemental information for dead tuple TID + * storage (i.e. TidStore). */ -typedef struct VacDeadItems +typedef struct VacDeadItemsInfo { - int max_items; /* # slots allocated in array */ - int num_items; /* current # of entries */ - - /* Sorted array of TIDs to delete from indexes */ - ItemPointerData items[FLEXIBLE_ARRAY_MEMBER]; -} VacDeadItems; - -#define MAXDEADITEMS(avail_mem) \ - (((avail_mem) - offsetof(VacDeadItems, items)) / sizeof(ItemPointerData)) + size_t max_bytes; /* the maximum bytes TidStore can use */ + int64 num_items; /* current # of entries */ +} VacDeadItemsInfo; /* GUC parameters */ extern PGDLLIMPORT int default_statistics_target; /* PGDLLIMPORT for PostGIS */ @@ -366,10 +362,10 @@ extern Relation vacuum_open_relation(Oid relid, RangeVar *relation, LOCKMODE lmode); extern IndexBulkDeleteResult *vac_bulkdel_one_index(IndexVacuumInfo *ivinfo, IndexBulkDeleteResult *istat, - VacDeadItems *dead_items); + TidStore *dead_items, + VacDeadItemsInfo *dead_items_info); extern IndexBulkDeleteResult *vac_cleanup_one_index(IndexVacuumInfo *ivinfo, IndexBulkDeleteResult *istat); -extern Size vac_max_items_to_alloc_size(int max_items); /* In postmaster/autovacuum.c */ extern void AutoVacuumUpdateCostLimit(void); @@ -378,10 +374,12 @@ extern void VacuumUpdateCosts(void); /* in commands/vacuumparallel.c */ extern ParallelVacuumState *parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, int nrequested_workers, - int max_items, int elevel, + int vac_work_mem, int elevel, BufferAccessStrategy bstrategy); extern void parallel_vacuum_end(ParallelVacuumState *pvs, IndexBulkDeleteResult **istats); -extern VacDeadItems *parallel_vacuum_get_dead_items(ParallelVacuumState *pvs); +extern TidStore *parallel_vacuum_get_dead_items(ParallelVacuumState *pvs, + VacDeadItemsInfo **dead_items_info_p); +extern void parallel_vacuum_reset_dead_items(ParallelVacuumState *pvs); extern void parallel_vacuum_bulkdel_all_indexes(ParallelVacuumState *pvs, long num_table_tuples, int num_index_scans); diff --git 
a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 3479b4cf522..d70e6d37e09 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -214,6 +214,7 @@ typedef enum BuiltinTrancheIds LWTRANCHE_SERIAL_SLRU, LWTRANCHE_SUBTRANS_SLRU, LWTRANCHE_XACT_SLRU, + LWTRANCHE_PARALLEL_VACUUM_DSA, LWTRANCHE_FIRST_USER_DEFINED, } BuiltinTrancheIds; diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 5e45ce64f7a..f4a0f363775 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2050,8 +2050,8 @@ pg_stat_progress_vacuum| SELECT s.pid, s.param3 AS heap_blks_scanned, s.param4 AS heap_blks_vacuumed, s.param5 AS index_vacuum_count, - s.param6 AS max_dead_tuples, - s.param7 AS num_dead_tuples, + s.param6 AS max_dead_tuple_bytes, + s.param7 AS dead_tuple_bytes, s.param8 AS indexes_total, s.param9 AS indexes_processed FROM (pg_stat_get_progress_info('VACUUM'::text) s(pid, datid, relid, param1, param2, param3, param4, param5, param6, param7, param8, param9, param10, param11, param12, param13, param14, param15, param16, param17, param18, param19, param20) diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 9add48f9924..79745ba9134 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2985,7 +2985,7 @@ UserMapping UserOpts VacAttrStats VacAttrStatsP -VacDeadItems +VacDeadItemsInfo VacErrPhase VacObjFilter VacOptValue
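For reference, the following is a minimal sketch (not part of the patch) of the dead-items life cycle that vacuumlazy.c follows after this change. It is written only against the TidStore calls as they appear in this diff (TidStoreCreateLocal, TidStoreSetBlockOffsets, TidStoreMemoryUsage, TidStoreIsMember, the iteration functions, and TidStoreDestroy); the byte budget, block number, and offsets are made up for illustration, and dead_items_sketch() is a hypothetical helper.

/*
 * Illustrative sketch only: the TidStore-based dead-items flow, mirroring
 * dead_items_add(), vac_tid_reaped(), and lazy_vacuum_heap_rel() above.
 */
#include "postgres.h"

#include "access/tidstore.h"
#include "storage/itemptr.h"

static void
dead_items_sketch(void)
{
	size_t		max_bytes = 32L * 1024L * 1024L;	/* arbitrary budget */
	OffsetNumber offsets[2] = {1, 4};	/* hypothetical LP_DEAD offsets */
	TidStore   *dead_items;
	TidStoreIter *iter;
	TidStoreIterResult *iter_result;
	ItemPointerData tid;

	/* First heap pass: record each block's dead offsets as it is pruned */
	dead_items = TidStoreCreateLocal(max_bytes);
	TidStoreSetBlockOffsets(dead_items, (BlockNumber) 7, offsets, 2);

	/* lazy_scan_heap() switches to index vacuuming once the budget is hit */
	if (TidStoreMemoryUsage(dead_items) > max_bytes)
	{
		/* lazy_vacuum() would run here, then recreate the store */
	}

	/* Index vacuuming: ambulkdelete probes each index tuple's heap TID */
	ItemPointerSet(&tid, 7, 4);
	Assert(TidStoreIsMember(dead_items, &tid));

	/* Second heap pass: visit only the blocks that have dead items */
	iter = TidStoreBeginIterate(dead_items);
	while ((iter_result = TidStoreIterateNext(iter)) != NULL)
	{
		/*
		 * Mark iter_result->offsets[0 .. num_offsets - 1] LP_UNUSED on
		 * block iter_result->blkno, as lazy_vacuum_heap_page() does.
		 */
	}
	TidStoreEndIterate(iter);

	TidStoreDestroy(dead_items);
}

Compared with the old sorted-array scheme, membership checks from vac_tid_reaped() go through the TID store rather than a bounds check plus bsearch(), and the second heap pass receives each block's offset list directly from the iterator instead of scanning a flat, sorted TID array.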