Downloaded smgr patch

This commit is contained in:
Zsolt Parragi 2024-04-07 18:45:18 +01:00 committed by Zsolt Parragi
parent 7689301f3f
commit ce431e03d1
4 changed files with 1405 additions and 0 deletions

View File

@ -0,0 +1,911 @@
From 5ffbc7c35bb3248501b2517d26f99afe02fb53d6 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <boekewurm+postgres@gmail.com>
Date: Tue, 27 Jun 2023 15:59:23 +0200
Subject: [PATCH v1 1/5] Expose f_smgr to extensions for manual implementation
There are various reasons why one would want to create their own
implementation of a storage manager, among which are block-level compression,
encryption and offloading to cold storage. This patch is a first patch that
allows extensions to register their own SMgr.
Note, however, that this SMgr is not yet used - only the first SMgr to register
is used, and this is currently the md.c smgr. Future commits will include
facilities to select an SMgr for each tablespace.
---
src/backend/postmaster/postmaster.c | 5 +
src/backend/storage/smgr/md.c | 172 +++++++++++++++++++---------
src/backend/storage/smgr/smgr.c | 129 ++++++++++-----------
src/backend/utils/init/miscinit.c | 13 +++
src/include/miscadmin.h | 1 +
src/include/storage/md.h | 4 +
src/include/storage/smgr.h | 59 ++++++++--
7 files changed, 252 insertions(+), 131 deletions(-)
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index feb471dd1d..a0e46fe1f2 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -1010,6 +1010,11 @@ PostmasterMain(int argc, char *argv[])
*/
ApplyLauncherRegister();
+ /*
+ * Register built-in managers that are not part of static arrays
+ */
+ register_builtin_dynamic_managers();
+
/*
* process any libraries that should be preloaded at postmaster start
*/
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index b1e9932a29..66a93101ab 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -87,6 +87,21 @@ typedef struct _MdfdVec
} MdfdVec;
static MemoryContext MdCxt; /* context for all MdfdVec objects */
+SMgrId MdSMgrId;
+
+typedef struct MdSMgrRelationData
+{
+ /* parent data */
+ SMgrRelationData reln;
+ /*
+ * for md.c; per-fork arrays of the number of open segments
+ * (md_num_open_segs) and the segments themselves (md_seg_fds).
+ */
+ int md_num_open_segs[MAX_FORKNUM + 1];
+ struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1];
+} MdSMgrRelationData;
+
+typedef MdSMgrRelationData *MdSMgrRelation;
/* Populate a file tag describing an md.c segment file. */
@@ -121,26 +136,52 @@ static MemoryContext MdCxt; /* context for all MdfdVec objects */
#define EXTENSION_DONT_OPEN (1 << 5)
+void mdsmgr_register(void)
+{
+ /* magnetic disk */
+ f_smgr md_smgr = (f_smgr) {
+ .name = "md",
+ .smgr_init = mdinit,
+ .smgr_shutdown = NULL,
+ .smgr_open = mdopen,
+ .smgr_close = mdclose,
+ .smgr_create = mdcreate,
+ .smgr_exists = mdexists,
+ .smgr_unlink = mdunlink,
+ .smgr_extend = mdextend,
+ .smgr_zeroextend = mdzeroextend,
+ .smgr_prefetch = mdprefetch,
+ .smgr_readv = mdreadv,
+ .smgr_writev = mdwritev,
+ .smgr_writeback = mdwriteback,
+ .smgr_nblocks = mdnblocks,
+ .smgr_truncate = mdtruncate,
+ .smgr_immedsync = mdimmedsync,
+ };
+
+ MdSMgrId = smgr_register(&md_smgr, sizeof(MdSMgrRelationData));
+}
+
/* local routines */
static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum,
bool isRedo);
-static MdfdVec *mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior);
-static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
+static MdfdVec *mdopenfork(MdSMgrRelation reln, ForkNumber forknum, int behavior);
+static void register_dirty_segment(MdSMgrRelation reln, ForkNumber forknum,
MdfdVec *seg);
static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum,
BlockNumber segno);
static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum,
BlockNumber segno);
-static void _fdvec_resize(SMgrRelation reln,
+static void _fdvec_resize(MdSMgrRelation reln,
ForkNumber forknum,
int nseg);
-static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
+static char *_mdfd_segpath(MdSMgrRelation reln, ForkNumber forknum,
BlockNumber segno);
-static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forknum,
+static MdfdVec *_mdfd_openseg(MdSMgrRelation reln, ForkNumber forknum,
BlockNumber segno, int oflags);
-static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum,
+static MdfdVec *_mdfd_getseg(MdSMgrRelation reln, ForkNumber forknum,
BlockNumber blkno, bool skipFsync, int behavior);
-static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
+static BlockNumber _mdnblocks(MdSMgrRelation reln, ForkNumber forknum,
MdfdVec *seg);
static inline int
@@ -173,6 +214,8 @@ mdinit(void)
bool
mdexists(SMgrRelation reln, ForkNumber forknum)
{
+ MdSMgrRelation mdreln = (MdSMgrRelation) reln;
+
/*
* Close it first, to ensure that we notice if the fork has been unlinked
* since we opened it. As an optimization, we can skip that in recovery,
@@ -181,7 +224,7 @@ mdexists(SMgrRelation reln, ForkNumber forknum)
if (!InRecovery)
mdclose(reln, forknum);
- return (mdopenfork(reln, forknum, EXTENSION_RETURN_NULL) != NULL);
+ return (mdopenfork(mdreln, forknum, EXTENSION_RETURN_NULL) != NULL);
}
/*
@@ -195,11 +238,13 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
MdfdVec *mdfd;
char *path;
File fd;
+ MdSMgrRelation mdreln = (MdSMgrRelation) reln;
+ // Assert(reln->smgr_which == MdSMgrId);
- if (isRedo && reln->md_num_open_segs[forknum] > 0)
+ if (isRedo && mdreln->md_num_open_segs[forknum] > 0)
return; /* created and opened already... */
- Assert(reln->md_num_open_segs[forknum] == 0);
+ Assert(mdreln->md_num_open_segs[forknum] == 0);
/*
* We may be using the target table space for the first time in this
@@ -236,13 +281,13 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
pfree(path);
- _fdvec_resize(reln, forknum, 1);
- mdfd = &reln->md_seg_fds[forknum][0];
+ _fdvec_resize(mdreln, forknum, 1);
+ mdfd = &mdreln->md_seg_fds[forknum][0];
mdfd->mdfd_vfd = fd;
mdfd->mdfd_segno = 0;
if (!SmgrIsTemp(reln))
- register_dirty_segment(reln, forknum, mdfd);
+ register_dirty_segment(mdreln, forknum, mdfd);
}
/*
@@ -466,6 +511,7 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
off_t seekpos;
int nbytes;
MdfdVec *v;
+ MdSMgrRelation mdreln = (MdSMgrRelation) reln;
/* If this build supports direct I/O, the buffer must be I/O aligned. */
if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
@@ -489,7 +535,7 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
relpath(reln->smgr_rlocator, forknum),
InvalidBlockNumber)));
- v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
+ v = _mdfd_getseg(mdreln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
@@ -513,9 +559,9 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
}
if (!skipFsync && !SmgrIsTemp(reln))
- register_dirty_segment(reln, forknum, v);
+ register_dirty_segment(mdreln, forknum, v);
- Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
+ Assert(_mdnblocks(mdreln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
}
/*
@@ -531,6 +577,7 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum,
MdfdVec *v;
BlockNumber curblocknum = blocknum;
int remblocks = nblocks;
+ MdSMgrRelation mdreln = (MdSMgrRelation) reln;
Assert(nblocks > 0);
@@ -562,7 +609,7 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum,
else
numblocks = remblocks;
- v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE);
+ v = _mdfd_getseg(mdreln, forknum, curblocknum, skipFsync, EXTENSION_CREATE);
Assert(segstartblock < RELSEG_SIZE);
Assert(segstartblock + numblocks <= RELSEG_SIZE);
@@ -617,9 +664,9 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum,
}
if (!skipFsync && !SmgrIsTemp(reln))
- register_dirty_segment(reln, forknum, v);
+ register_dirty_segment(mdreln, forknum, v);
- Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
+ Assert(_mdnblocks(mdreln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
remblocks -= numblocks;
curblocknum += numblocks;
@@ -637,7 +684,7 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum,
* invent one out of whole cloth.
*/
static MdfdVec *
-mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
+mdopenfork(MdSMgrRelation reln, ForkNumber forknum, int behavior)
{
MdfdVec *mdfd;
char *path;
@@ -647,7 +694,7 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
if (reln->md_num_open_segs[forknum] > 0)
return &reln->md_seg_fds[forknum][0];
- path = relpath(reln->smgr_rlocator, forknum);
+ path = relpath(reln->reln.smgr_rlocator, forknum);
fd = PathNameOpenFile(path, _mdfd_open_flags());
@@ -682,9 +729,10 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
void
mdopen(SMgrRelation reln)
{
+ MdSMgrRelation mdreln = (MdSMgrRelation) reln;
/* mark it not open */
for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
- reln->md_num_open_segs[forknum] = 0;
+ mdreln->md_num_open_segs[forknum] = 0;
}
/*
@@ -693,7 +741,8 @@ mdopen(SMgrRelation reln)
void
mdclose(SMgrRelation reln, ForkNumber forknum)
{
- int nopensegs = reln->md_num_open_segs[forknum];
+ MdSMgrRelation mdreln = (MdSMgrRelation) reln;
+ int nopensegs = mdreln->md_num_open_segs[forknum];
/* No work if already closed */
if (nopensegs == 0)
@@ -702,10 +751,10 @@ mdclose(SMgrRelation reln, ForkNumber forknum)
/* close segments starting from the end */
while (nopensegs > 0)
{
- MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1];
+ MdfdVec *v = &mdreln->md_seg_fds[forknum][nopensegs - 1];
FileClose(v->mdfd_vfd);
- _fdvec_resize(reln, forknum, nopensegs - 1);
+ _fdvec_resize(mdreln, forknum, nopensegs - 1);
nopensegs--;
}
}
@@ -718,6 +767,7 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
int nblocks)
{
#ifdef USE_PREFETCH
+ MdSMgrRelation mdreln = (MdSMgrRelation) reln;
Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
@@ -730,7 +780,7 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
MdfdVec *v;
int nblocks_this_segment;
- v = _mdfd_getseg(reln, forknum, blocknum, false,
+ v = _mdfd_getseg(mdreln, forknum, blocknum, false,
InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL);
if (v == NULL)
return false;
@@ -813,6 +863,8 @@ void
mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
void **buffers, BlockNumber nblocks)
{
+ MdSMgrRelation mdreln = (MdSMgrRelation) reln;
+
while (nblocks > 0)
{
struct iovec iov[PG_IOV_MAX];
@@ -824,7 +876,7 @@ mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
size_t transferred_this_segment;
size_t size_this_segment;
- v = _mdfd_getseg(reln, forknum, blocknum, false,
+ v = _mdfd_getseg(mdreln, forknum, blocknum, false,
EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
@@ -931,6 +983,8 @@ void
mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
const void **buffers, BlockNumber nblocks, bool skipFsync)
{
+ MdSMgrRelation mdreln = (MdSMgrRelation) reln;
+
/* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
Assert(blocknum < mdnblocks(reln, forknum));
@@ -947,7 +1001,7 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
size_t transferred_this_segment;
size_t size_this_segment;
- v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
+ v = _mdfd_getseg(mdreln, forknum, blocknum, skipFsync,
EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
@@ -1014,7 +1068,7 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
}
if (!skipFsync && !SmgrIsTemp(reln))
- register_dirty_segment(reln, forknum, v);
+ register_dirty_segment(mdreln, forknum, v);
nblocks -= nblocks_this_segment;
buffers += nblocks_this_segment;
@@ -1033,6 +1087,7 @@ void
mdwriteback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks)
{
+ MdSMgrRelation mdreln = (MdSMgrRelation) reln;
Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
/*
@@ -1047,7 +1102,7 @@ mdwriteback(SMgrRelation reln, ForkNumber forknum,
int segnum_start,
segnum_end;
- v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
+ v = _mdfd_getseg(mdreln, forknum, blocknum, true /* not used */ ,
EXTENSION_DONT_OPEN);
/*
@@ -1094,11 +1149,12 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
MdfdVec *v;
BlockNumber nblocks;
BlockNumber segno;
+ MdSMgrRelation mdreln = (MdSMgrRelation) reln;
- mdopenfork(reln, forknum, EXTENSION_FAIL);
+ mdopenfork(mdreln, forknum, EXTENSION_FAIL);
/* mdopen has opened the first segment */
- Assert(reln->md_num_open_segs[forknum] > 0);
+ Assert(mdreln->md_num_open_segs[forknum] > 0);
/*
* Start from the last open segments, to avoid redundant seeks. We have
@@ -1113,12 +1169,12 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
* that's OK because the checkpointer never needs to compute relation
* size.)
*/
- segno = reln->md_num_open_segs[forknum] - 1;
- v = &reln->md_seg_fds[forknum][segno];
+ segno = mdreln->md_num_open_segs[forknum] - 1;
+ v = &mdreln->md_seg_fds[forknum][segno];
for (;;)
{
- nblocks = _mdnblocks(reln, forknum, v);
+ nblocks = _mdnblocks(mdreln, forknum, v);
if (nblocks > ((BlockNumber) RELSEG_SIZE))
elog(FATAL, "segment too big");
if (nblocks < ((BlockNumber) RELSEG_SIZE))
@@ -1136,7 +1192,7 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
* undermines _mdfd_getseg's attempts to notice and report an error
* upon access to a missing segment.
*/
- v = _mdfd_openseg(reln, forknum, segno, 0);
+ v = _mdfd_openseg(mdreln, forknum, segno, 0);
if (v == NULL)
return segno * ((BlockNumber) RELSEG_SIZE);
}
@@ -1151,6 +1207,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
BlockNumber curnblk;
BlockNumber priorblocks;
int curopensegs;
+ MdSMgrRelation mdreln = (MdSMgrRelation) reln;
/*
* NOTE: mdnblocks makes sure we have opened all active segments, so that
@@ -1174,14 +1231,14 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
* Truncate segments, starting at the last one. Starting at the end makes
* managing the memory for the fd array easier, should there be errors.
*/
- curopensegs = reln->md_num_open_segs[forknum];
+ curopensegs = mdreln->md_num_open_segs[forknum];
while (curopensegs > 0)
{
MdfdVec *v;
priorblocks = (curopensegs - 1) * RELSEG_SIZE;
- v = &reln->md_seg_fds[forknum][curopensegs - 1];
+ v = &mdreln->md_seg_fds[forknum][curopensegs - 1];
if (priorblocks > nblocks)
{
@@ -1196,13 +1253,13 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
FilePathName(v->mdfd_vfd))));
if (!SmgrIsTemp(reln))
- register_dirty_segment(reln, forknum, v);
+ register_dirty_segment(mdreln, forknum, v);
/* we never drop the 1st segment */
- Assert(v != &reln->md_seg_fds[forknum][0]);
+ Assert(v != &mdreln->md_seg_fds[forknum][0]);
FileClose(v->mdfd_vfd);
- _fdvec_resize(reln, forknum, curopensegs - 1);
+ _fdvec_resize(mdreln, forknum, curopensegs - 1);
}
else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
{
@@ -1222,7 +1279,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
FilePathName(v->mdfd_vfd),
nblocks)));
if (!SmgrIsTemp(reln))
- register_dirty_segment(reln, forknum, v);
+ register_dirty_segment(mdreln, forknum, v);
}
else
{
@@ -1252,6 +1309,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
{
int segno;
int min_inactive_seg;
+ MdSMgrRelation mdreln = (MdSMgrRelation) reln;
/*
* NOTE: mdnblocks makes sure we have opened all active segments, so that
@@ -1259,7 +1317,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
*/
mdnblocks(reln, forknum);
- min_inactive_seg = segno = reln->md_num_open_segs[forknum];
+ min_inactive_seg = segno = mdreln->md_num_open_segs[forknum];
/*
* Temporarily open inactive segments, then close them after sync. There
@@ -1267,12 +1325,12 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
* is harmless. We don't bother to clean them up and take a risk of
* further trouble. The next mdclose() will soon close them.
*/
- while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
+ while (_mdfd_openseg(mdreln, forknum, segno, 0) != NULL)
segno++;
while (segno > 0)
{
- MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
+ MdfdVec *v = &mdreln->md_seg_fds[forknum][segno - 1];
/*
* fsyncs done through mdimmedsync() should be tracked in a separate
@@ -1293,7 +1351,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
if (segno > min_inactive_seg)
{
FileClose(v->mdfd_vfd);
- _fdvec_resize(reln, forknum, segno - 1);
+ _fdvec_resize(mdreln, forknum, segno - 1);
}
segno--;
@@ -1310,14 +1368,14 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
* enough to be a performance problem).
*/
static void
-register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
+register_dirty_segment(MdSMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
{
FileTag tag;
- INIT_MD_FILETAG(tag, reln->smgr_rlocator.locator, forknum, seg->mdfd_segno);
+ INIT_MD_FILETAG(tag, reln->reln.smgr_rlocator.locator, forknum, seg->mdfd_segno);
/* Temp relations should never be fsync'd */
- Assert(!SmgrIsTemp(reln));
+ Assert(!SmgrIsTemp(&reln->reln));
if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
{
@@ -1435,7 +1493,7 @@ DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo)
* _fdvec_resize() -- Resize the fork's open segments array
*/
static void
-_fdvec_resize(SMgrRelation reln,
+_fdvec_resize(MdSMgrRelation reln,
ForkNumber forknum,
int nseg)
{
@@ -1473,12 +1531,12 @@ _fdvec_resize(SMgrRelation reln,
* returned string is palloc'd.
*/
static char *
-_mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
+_mdfd_segpath(MdSMgrRelation reln, ForkNumber forknum, BlockNumber segno)
{
char *path,
*fullpath;
- path = relpath(reln->smgr_rlocator, forknum);
+ path = relpath(reln->reln.smgr_rlocator, forknum);
if (segno > 0)
{
@@ -1496,7 +1554,7 @@ _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
* and make a MdfdVec object for it. Returns NULL on failure.
*/
static MdfdVec *
-_mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
+_mdfd_openseg(MdSMgrRelation reln, ForkNumber forknum, BlockNumber segno,
int oflags)
{
MdfdVec *v;
@@ -1541,7 +1599,7 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
* EXTENSION_CREATE case.
*/
static MdfdVec *
-_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
+_mdfd_getseg(MdSMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
bool skipFsync, int behavior)
{
MdfdVec *v;
@@ -1615,7 +1673,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
char *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE,
MCXT_ALLOC_ZERO);
- mdextend(reln, forknum,
+ mdextend((SMgrRelation) reln, forknum,
nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
zerobuf, skipFsync);
pfree(zerobuf);
@@ -1672,7 +1730,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
* Get number of blocks present in a single disk file
*/
static BlockNumber
-_mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
+_mdnblocks(MdSMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
{
off_t len;
@@ -1695,7 +1753,7 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
int
mdsyncfiletag(const FileTag *ftag, char *path)
{
- SMgrRelation reln = smgropen(ftag->rlocator, InvalidBackendId);
+ MdSMgrRelation reln = (MdSMgrRelation) smgropen(ftag->rlocator, InvalidBackendId);
File file;
instr_time io_start;
bool need_to_close;
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 563a0be5c7..b586e6e25a 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -19,80 +19,23 @@
#include "access/xlogutils.h"
#include "lib/ilist.h"
+#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/md.h"
#include "storage/smgr.h"
+#include "port/atomics.h"
#include "utils/hsearch.h"
#include "utils/inval.h"
+#include "utils/memutils.h"
-/*
- * This struct of function pointers defines the API between smgr.c and
- * any individual storage manager module. Note that smgr subfunctions are
- * generally expected to report problems via elog(ERROR). An exception is
- * that smgr_unlink should use elog(WARNING), rather than erroring out,
- * because we normally unlink relations during post-commit/abort cleanup,
- * and so it's too late to raise an error. Also, various conditions that
- * would normally be errors should be allowed during bootstrap and/or WAL
- * recovery --- see comments in md.c for details.
- */
-typedef struct f_smgr
-{
- void (*smgr_init) (void); /* may be NULL */
- void (*smgr_shutdown) (void); /* may be NULL */
- void (*smgr_open) (SMgrRelation reln);
- void (*smgr_close) (SMgrRelation reln, ForkNumber forknum);
- void (*smgr_create) (SMgrRelation reln, ForkNumber forknum,
- bool isRedo);
- bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum);
- void (*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum,
- bool isRedo);
- void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
- BlockNumber blocknum, const void *buffer, bool skipFsync);
- void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum,
- BlockNumber blocknum, int nblocks, bool skipFsync);
- bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
- BlockNumber blocknum, int nblocks);
- void (*smgr_readv) (SMgrRelation reln, ForkNumber forknum,
- BlockNumber blocknum,
- void **buffers, BlockNumber nblocks);
- void (*smgr_writev) (SMgrRelation reln, ForkNumber forknum,
- BlockNumber blocknum,
- const void **buffers, BlockNumber nblocks,
- bool skipFsync);
- void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum,
- BlockNumber blocknum, BlockNumber nblocks);
- BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
- void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
- BlockNumber nblocks);
- void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
-} f_smgr;
-
-static const f_smgr smgrsw[] = {
- /* magnetic disk */
- {
- .smgr_init = mdinit,
- .smgr_shutdown = NULL,
- .smgr_open = mdopen,
- .smgr_close = mdclose,
- .smgr_create = mdcreate,
- .smgr_exists = mdexists,
- .smgr_unlink = mdunlink,
- .smgr_extend = mdextend,
- .smgr_zeroextend = mdzeroextend,
- .smgr_prefetch = mdprefetch,
- .smgr_readv = mdreadv,
- .smgr_writev = mdwritev,
- .smgr_writeback = mdwriteback,
- .smgr_nblocks = mdnblocks,
- .smgr_truncate = mdtruncate,
- .smgr_immedsync = mdimmedsync,
- }
-};
+static f_smgr *smgrsw;
-static const int NSmgr = lengthof(smgrsw);
+static int NSmgr = 0;
+
+static Size LargestSMgrRelationSize = 0;
/*
* Each backend has a hashtable that stores all extant SMgrRelation objects.
@@ -105,6 +48,57 @@ static dlist_head unowned_relns;
/* local function prototypes */
static void smgrshutdown(int code, Datum arg);
+SMgrId
+smgr_register(const f_smgr *smgr, Size smgrrelation_size)
+{
+ SMgrId my_id;
+ MemoryContext old;
+
+ if (process_shared_preload_libraries_done)
+ elog(FATAL, "SMgrs must be registered in the shared_preload_libraries phase");
+ if (NSmgr == MaxSMgrId)
+ elog(FATAL, "Too many smgrs registered");
+ if (smgr->name == NULL || *smgr->name == 0)
+ elog(FATAL, "smgr registered with invalid name");
+
+ Assert(smgr->smgr_open != NULL);
+ Assert(smgr->smgr_close != NULL);
+ Assert(smgr->smgr_create != NULL);
+ Assert(smgr->smgr_exists != NULL);
+ Assert(smgr->smgr_unlink != NULL);
+ Assert(smgr->smgr_extend != NULL);
+ Assert(smgr->smgr_zeroextend != NULL);
+ Assert(smgr->smgr_prefetch != NULL);
+ Assert(smgr->smgr_readv != NULL);
+ Assert(smgr->smgr_writev != NULL);
+ Assert(smgr->smgr_writeback != NULL);
+ Assert(smgr->smgr_nblocks != NULL);
+ Assert(smgr->smgr_truncate != NULL);
+ Assert(smgr->smgr_immedsync != NULL);
+ old = MemoryContextSwitchTo(TopMemoryContext);
+
+ my_id = NSmgr++;
+ if (my_id == 0)
+ smgrsw = palloc(sizeof(f_smgr));
+ else
+ smgrsw = repalloc(smgrsw, sizeof(f_smgr) * NSmgr);
+
+ MemoryContextSwitchTo(old);
+
+ pg_compiler_barrier();
+
+ if (!smgrsw)
+ {
+ NSmgr--;
+ elog(FATAL, "Failed to extend smgr array");
+ }
+
+ memcpy(&smgrsw[my_id], smgr, sizeof(f_smgr));
+
+ LargestSMgrRelationSize = Max(LargestSMgrRelationSize, smgrrelation_size);
+
+ return my_id;
+}
/*
* smgrinit(), smgrshutdown() -- Initialize or shut down storage
@@ -162,9 +156,11 @@ smgropen(RelFileLocator rlocator, BackendId backend)
{
/* First time through: initialize the hash table */
HASHCTL ctl;
+ LargestSMgrRelationSize = MAXALIGN(LargestSMgrRelationSize);
+ Assert(NSmgr > 0);
ctl.keysize = sizeof(RelFileLocatorBackend);
- ctl.entrysize = sizeof(SMgrRelationData);
+ ctl.entrysize = LargestSMgrRelationSize;
SMgrRelationHash = hash_create("smgr relation table", 400,
&ctl, HASH_ELEM | HASH_BLOBS);
dlist_init(&unowned_relns);
@@ -185,7 +181,8 @@ smgropen(RelFileLocator rlocator, BackendId backend)
reln->smgr_targblock = InvalidBlockNumber;
for (int i = 0; i <= MAX_FORKNUM; ++i)
reln->smgr_cached_nblocks[i] = InvalidBlockNumber;
- reln->smgr_which = 0; /* we only have md.c at present */
+
+ reln->smgr_which = MdSMgrId; /* we only have md.c at present */
/* implementation-specific initialization */
smgrsw[reln->smgr_which].smgr_open(reln);
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index 23f77a59e5..4ec7619302 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -42,6 +42,7 @@
#include "postmaster/postmaster.h"
#include "storage/fd.h"
#include "storage/ipc.h"
+#include "storage/md.h"
#include "storage/latch.h"
#include "storage/pg_shmem.h"
#include "storage/pmsignal.h"
@@ -198,6 +199,9 @@ InitStandaloneProcess(const char *argv0)
InitProcessLocalLatch();
InitializeLatchWaitSet();
+ /* Initialize smgrs */
+ register_builtin_dynamic_managers();
+
/*
* For consistency with InitPostmasterChild, initialize signal mask here.
* But we don't unblock SIGQUIT or provide a default handler for it.
@@ -1860,6 +1864,15 @@ process_session_preload_libraries(void)
true);
}
+/*
+ * Register any internal managers.
+ */
+void
+register_builtin_dynamic_managers(void)
+{
+ mdsmgr_register();
+}
+
/*
* process any shared memory requests from preloaded libraries
*/
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 0b01c1f093..d0d4ba38ef 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -493,6 +493,7 @@ extern void TouchSocketLockFiles(void);
extern void AddToDataDirLockFile(int target_line, const char *str);
extern bool RecheckDataDirLockFile(void);
extern void ValidatePgVersion(const char *path);
+extern void register_builtin_dynamic_managers(void);
extern void process_shared_preload_libraries(void);
extern void process_session_preload_libraries(void);
extern void process_shmem_requests(void);
diff --git a/src/include/storage/md.h b/src/include/storage/md.h
index 7c181e5a17..734bae07e1 100644
--- a/src/include/storage/md.h
+++ b/src/include/storage/md.h
@@ -19,6 +19,10 @@
#include "storage/smgr.h"
#include "storage/sync.h"
+/* registration function for md storage manager */
+extern void mdsmgr_register(void);
+extern SMgrId MdSMgrId;
+
/* md storage manager functionality */
extern void mdinit(void);
extern void mdopen(SMgrRelation reln);
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 527cd2a056..95927b8bdd 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -18,6 +18,10 @@
#include "storage/block.h"
#include "storage/relfilelocator.h"
+typedef uint8 SMgrId;
+
+#define MaxSMgrId UINT8_MAX
+
/*
* smgr.c maintains a table of SMgrRelation objects, which are essentially
* cached file handles. An SMgrRelation is created (if not already present)
@@ -59,14 +63,8 @@ typedef struct SMgrRelationData
* Fields below here are intended to be private to smgr.c and its
* submodules. Do not touch them from elsewhere.
*/
- int smgr_which; /* storage manager selector */
-
- /*
- * for md.c; per-fork arrays of the number of open segments
- * (md_num_open_segs) and the segments themselves (md_seg_fds).
- */
- int md_num_open_segs[MAX_FORKNUM + 1];
- struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1];
+ SMgrId smgr_which; /* storage manager selector */
+ int smgrrelation_size; /* size of this struct, incl. smgr-specific data */
/* if unowned, list link in list of all unowned SMgrRelations */
dlist_node node;
@@ -77,6 +75,51 @@ typedef SMgrRelationData *SMgrRelation;
#define SmgrIsTemp(smgr) \
RelFileLocatorBackendIsTemp((smgr)->smgr_rlocator)
+/*
+ * This struct of function pointers defines the API between smgr.c and
+ * any individual storage manager module. Note that smgr subfunctions are
+ * generally expected to report problems via elog(ERROR). An exception is
+ * that smgr_unlink should use elog(WARNING), rather than erroring out,
+ * because we normally unlink relations during post-commit/abort cleanup,
+ * and so it's too late to raise an error. Also, various conditions that
+ * would normally be errors should be allowed during bootstrap and/or WAL
+ * recovery --- see comments in md.c for details.
+ */
+typedef struct f_smgr
+{
+ const char *name;
+ void (*smgr_init) (void); /* may be NULL */
+ void (*smgr_shutdown) (void); /* may be NULL */
+ void (*smgr_open) (SMgrRelation reln);
+ void (*smgr_close) (SMgrRelation reln, ForkNumber forknum);
+ void (*smgr_create) (SMgrRelation reln, ForkNumber forknum,
+ bool isRedo);
+ bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum);
+ void (*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum,
+ bool isRedo);
+ void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, const void *buffer, bool skipFsync);
+ void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, int nblocks, bool skipFsync);
+ bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, int nblocks);
+ void (*smgr_readv) (SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum,
+ void **buffers, BlockNumber nblocks);
+ void (*smgr_writev) (SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum,
+ const void **buffers, BlockNumber nblocks,
+ bool skipFsync);
+ void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, BlockNumber nblocks);
+ BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
+ void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
+ BlockNumber nblocks);
+ void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
+} f_smgr;
+
+extern SMgrId smgr_register(const f_smgr *smgr, Size smgrrelation_size);
+
extern void smgrinit(void);
extern SMgrRelation smgropen(RelFileLocator rlocator, BackendId backend);
extern bool smgrexists(SMgrRelation reln, ForkNumber forknum);
--
Tristan Partin
Neon (https://neon.tech)

View File

@ -0,0 +1,93 @@
From 59a667f079c9b040c23921e4c43fae94b88776f2 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Fri, 13 Oct 2023 14:00:44 -0500
Subject: [PATCH v1 2/5] Allow extensions to override the global storage
manager
---
src/backend/storage/smgr/md.c | 2 +-
src/backend/storage/smgr/smgr.c | 5 ++++-
src/backend/utils/init/miscinit.c | 2 ++
src/include/storage/md.h | 2 ++
src/include/storage/smgr.h | 2 ++
5 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 66a93101ab..13ec9da236 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -140,7 +140,7 @@ void mdsmgr_register(void)
{
/* magnetic disk */
f_smgr md_smgr = (f_smgr) {
- .name = "md",
+ .name = MdSMgrName,
.smgr_init = mdinit,
.smgr_shutdown = NULL,
.smgr_open = mdopen,
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index b586e6e25a..0814330b8a 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -37,6 +37,9 @@ static int NSmgr = 0;
static Size LargestSMgrRelationSize = 0;
+char *storage_manager_string;
+SMgrId storage_manager_id;
+
/*
* Each backend has a hashtable that stores all extant SMgrRelation objects.
* In addition, "unowned" SMgrRelation objects are chained together in a list.
@@ -182,7 +185,7 @@ smgropen(RelFileLocator rlocator, BackendId backend)
for (int i = 0; i <= MAX_FORKNUM; ++i)
reln->smgr_cached_nblocks[i] = InvalidBlockNumber;
- reln->smgr_which = MdSMgrId; /* we only have md.c at present */
+ reln->smgr_which = storage_manager_id;
/* implementation-specific initialization */
smgrsw[reln->smgr_which].smgr_open(reln);
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index 4ec7619302..f44f511f69 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -1871,6 +1871,8 @@ void
register_builtin_dynamic_managers(void)
{
mdsmgr_register();
+
+ storage_manager_id = MdSMgrId;
}
/*
diff --git a/src/include/storage/md.h b/src/include/storage/md.h
index 734bae07e1..fdafb2c8e3 100644
--- a/src/include/storage/md.h
+++ b/src/include/storage/md.h
@@ -19,6 +19,8 @@
#include "storage/smgr.h"
#include "storage/sync.h"
+#define MdSMgrName "md"
+
/* registration function for md storage manager */
extern void mdsmgr_register(void);
extern SMgrId MdSMgrId;
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 95927b8bdd..ee4fc27265 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -22,6 +22,8 @@ typedef uint8 SMgrId;
#define MaxSMgrId UINT8_MAX
+extern PGDLLIMPORT SMgrId storage_manager_id;
+
/*
* smgr.c maintains a table of SMgrRelation objects, which are essentially
* cached file handles. An SMgrRelation is created (if not already present)
--
Tristan Partin
Neon (https://neon.tech)

View File

@ -0,0 +1,60 @@
From 9ed9b8ca36cdb75b44deccdfea619c7494fcc6ef Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Fri, 13 Oct 2023 13:57:18 -0500
Subject: [PATCH v1 3/5] Add checkpoint_create_hook
Allows an extension to hook into CheckPointCreate().
---
src/backend/access/transam/xlog.c | 5 +++++
src/include/access/xlog.h | 4 ++++
2 files changed, 9 insertions(+)
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 478377c4a2..61ae5b63b8 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -212,6 +212,8 @@ const struct config_enum_entry archive_mode_options[] = {
*/
CheckpointStatsData CheckpointStats;
+checkpoint_create_hook_type checkpoint_create_hook = NULL;
+
/*
* During recovery, lastFullPageWrites keeps track of full_page_writes that
* the replayed WAL records indicate. It's initialized with full_page_writes
@@ -6875,6 +6877,9 @@ CreateCheckPoint(int flags)
*/
END_CRIT_SECTION();
+ if (checkpoint_create_hook != NULL)
+ checkpoint_create_hook(&checkPoint);
+
/*
* In some cases there are groups of actions that must all occur on one
* side or the other of a checkpoint record. Before flushing the
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 301c5fa11f..437f2a994b 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -13,6 +13,7 @@
#include "access/xlogbackup.h"
#include "access/xlogdefs.h"
+#include "catalog/pg_control.h"
#include "datatype/timestamp.h"
#include "lib/stringinfo.h"
#include "nodes/pg_list.h"
@@ -57,6 +58,9 @@ extern PGDLLIMPORT int wal_decode_buffer_size;
extern PGDLLIMPORT int CheckPointSegments;
+typedef void (*checkpoint_create_hook_type)(const CheckPoint *);
+extern PGDLLIMPORT checkpoint_create_hook_type checkpoint_create_hook;
+
/* Archive modes */
typedef enum ArchiveMode
{
--
Tristan Partin
Neon (https://neon.tech)

View File

@ -0,0 +1,341 @@
From d46b41d7c89deb23a6a1afec9d7fe3544b9a3327 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Wed, 20 Sep 2023 14:23:38 -0500
Subject: [PATCH v1 4/5] Add contrib/fsync_checker
fsync_checker is an extension which overrides the global storage manager
to check for volatile relations, those which have been written but not
synced to disk.
---
contrib/Makefile | 1 +
contrib/fsync_checker/fsync_checker.control | 5 +
contrib/fsync_checker/fsync_checker_smgr.c | 249 ++++++++++++++++++++
contrib/fsync_checker/meson.build | 22 ++
contrib/meson.build | 1 +
5 files changed, 278 insertions(+)
create mode 100644 contrib/fsync_checker/fsync_checker.control
create mode 100644 contrib/fsync_checker/fsync_checker_smgr.c
create mode 100644 contrib/fsync_checker/meson.build
diff --git a/contrib/Makefile b/contrib/Makefile
index da4e2316a3..c55ced6ec0 100644
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -20,6 +20,7 @@ SUBDIRS = \
dict_int \
dict_xsyn \
earthdistance \
+ fsync_checker \
file_fdw \
fuzzystrmatch \
hstore \
diff --git a/contrib/fsync_checker/fsync_checker.control b/contrib/fsync_checker/fsync_checker.control
new file mode 100644
index 0000000000..7d0e36434b
--- /dev/null
+++ b/contrib/fsync_checker/fsync_checker.control
@@ -0,0 +1,5 @@
+# fsync_checker extension
+comment = 'SMGR extension for checking volatile writes'
+default_version = '1.0'
+module_pathname = '$libdir/fsync_checker'
+relocatable = true
diff --git a/contrib/fsync_checker/fsync_checker_smgr.c b/contrib/fsync_checker/fsync_checker_smgr.c
new file mode 100644
index 0000000000..feef2f7d3e
--- /dev/null
+++ b/contrib/fsync_checker/fsync_checker_smgr.c
@@ -0,0 +1,249 @@
+#include "postgres.h"
+
+#include "access/xlog.h"
+#include "fmgr.h"
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "storage/shmem.h"
+#include "storage/smgr.h"
+#include "storage/md.h"
+#include "utils/hsearch.h"
+
+PG_MODULE_MAGIC;
+
+typedef struct volatileRelnKey
+{
+ RelFileLocator locator;
+ ForkNumber forknum;
+} volatileRelnKey;
+
+typedef struct volatileRelnEntry
+{
+ volatileRelnKey key;
+ XLogRecPtr lsn;
+} volatileRelnEntry;
+
+void _PG_init(void);
+
+static void fsync_checker_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ const void *buffer, bool skipFsync);
+static void fsync_checker_immedsync(SMgrRelation reln, ForkNumber forknum);
+static void fsync_checker_writev(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, const void **buffers,
+ BlockNumber nblocks, bool skipFsync);
+static void fsync_checker_writeback(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, BlockNumber nblocks);
+static void fsync_checker_zeroextend(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, int nblocks, bool skipFsync);
+
+static void fsync_checker_checkpoint_create(const CheckPoint *checkPoint);
+static void fsync_checker_shmem_request(void);
+static void fsync_checker_shmem_startup(void);
+
+static void add_reln(SMgrRelation reln, ForkNumber forknum);
+static void remove_reln(SMgrRelation reln, ForkNumber forknum);
+
+static SMgrId fsync_checker_smgr_id;
+static const struct f_smgr fsync_checker_smgr = {
+ .name = "fsync_checker",
+ .smgr_init = mdinit,
+ .smgr_shutdown = NULL,
+ .smgr_open = mdopen,
+ .smgr_close = mdclose,
+ .smgr_create = mdcreate,
+ .smgr_exists = mdexists,
+ .smgr_unlink = mdunlink,
+ .smgr_extend = fsync_checker_extend,
+ .smgr_zeroextend = fsync_checker_zeroextend,
+ .smgr_prefetch = mdprefetch,
+ .smgr_readv = mdreadv,
+ .smgr_writev = fsync_checker_writev,
+ .smgr_writeback = fsync_checker_writeback,
+ .smgr_nblocks = mdnblocks,
+ .smgr_truncate = mdtruncate,
+ .smgr_immedsync = fsync_checker_immedsync,
+};
+
+static HTAB *volatile_relns;
+static LWLock *volatile_relns_lock;
+static shmem_request_hook_type prev_shmem_request_hook;
+static shmem_startup_hook_type prev_shmem_startup_hook;
+static checkpoint_create_hook_type prev_checkpoint_create_hook;
+
+void
+_PG_init(void)
+{
+ prev_checkpoint_create_hook = checkpoint_create_hook;
+ checkpoint_create_hook = fsync_checker_checkpoint_create;
+
+ prev_shmem_request_hook = shmem_request_hook;
+ shmem_request_hook = fsync_checker_shmem_request;
+
+ prev_shmem_startup_hook = shmem_startup_hook;
+ shmem_startup_hook = fsync_checker_shmem_startup;
+
+ /*
+ * Relation size of 0 means we can just defer to md, but it would be nice
+ * to just expose this functionality, so if I needed my own relation, I
+ * could use MdSmgrRelation as the parent.
+ */
+ fsync_checker_smgr_id = smgr_register(&fsync_checker_smgr, 0);
+
+ storage_manager_id = fsync_checker_smgr_id;
+}
+
+static void
+fsync_checker_checkpoint_create(const CheckPoint *checkPoint)
+{
+ long num_entries;
+ HASH_SEQ_STATUS status;
+ volatileRelnEntry *entry;
+
+ if (prev_checkpoint_create_hook)
+ prev_checkpoint_create_hook(checkPoint);
+
+ LWLockAcquire(volatile_relns_lock, LW_EXCLUSIVE);
+
+ hash_seq_init(&status, volatile_relns);
+
+ num_entries = hash_get_num_entries(volatile_relns);
+ elog(INFO, "Analyzing %ld volatile relations", num_entries);
+ while ((entry = hash_seq_search(&status)))
+ {
+ if (entry->lsn < checkPoint->redo)
+ {
+ char *path;
+
+ path = relpathperm(entry->key.locator, entry->key.forknum);
+
+ elog(WARNING, "Relation not previously synced: %s", path);
+
+ pfree(path);
+ }
+ }
+
+ LWLockRelease(volatile_relns_lock);
+}
+
+static void
+fsync_checker_shmem_request(void)
+{
+ if (prev_shmem_request_hook)
+ prev_shmem_request_hook();
+
+ RequestAddinShmemSpace(hash_estimate_size(1024, sizeof(volatileRelnEntry)));
+ RequestNamedLWLockTranche("fsync_checker volatile relns lock", 1);
+}
+
+static void
+fsync_checker_shmem_startup(void)
+{
+ HASHCTL ctl;
+
+ if (prev_shmem_startup_hook)
+ prev_shmem_startup_hook();
+
+ ctl.keysize = sizeof(volatileRelnKey);
+ ctl.entrysize = sizeof(volatileRelnEntry);
+ volatile_relns = NULL;
+ volatile_relns_lock = NULL;
+
+ /*
+ * Create or attach to the shared memory state, including hash table
+ */
+ LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+
+ volatile_relns = ShmemInitHash("fsync_checker volatile relns",
+ 1024, 1024, &ctl, HASH_BLOBS | HASH_ELEM);
+ volatile_relns_lock = &GetNamedLWLockTranche("fsync_checker volatile relns lock")->lock;
+
+ LWLockRelease(AddinShmemInitLock);
+}
+
+static void
+add_reln(SMgrRelation reln, ForkNumber forknum)
+{
+ bool found;
+ XLogRecPtr lsn;
+ volatileRelnKey key;
+ volatileRelnEntry *entry;
+
+ key.locator = reln->smgr_rlocator.locator;
+ key.forknum = forknum;
+
+ lsn = GetXLogWriteRecPtr();
+
+ LWLockAcquire(volatile_relns_lock, LW_EXCLUSIVE);
+
+ entry = hash_search(volatile_relns, &key, HASH_ENTER, &found);
+ if (!found)
+ entry->lsn = lsn;
+
+ LWLockRelease(volatile_relns_lock);
+}
+
+static void
+remove_reln(SMgrRelation reln, ForkNumber forknum)
+{
+ volatileRelnKey key;
+
+ key.locator = reln->smgr_rlocator.locator;
+ key.forknum = forknum;
+
+ LWLockAcquire(volatile_relns_lock, LW_EXCLUSIVE);
+
+ hash_search(volatile_relns, &key, HASH_REMOVE, NULL);
+
+ LWLockRelease(volatile_relns_lock);
+}
+
+static void
+fsync_checker_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ const void *buffer, bool skipFsync)
+{
+ if (!SmgrIsTemp(reln) && !skipFsync)
+ add_reln(reln, forknum);
+
+ mdextend(reln, forknum, blocknum, buffer, skipFsync);
+}
+
+static void
+fsync_checker_immedsync(SMgrRelation reln, ForkNumber forknum)
+{
+ if (!SmgrIsTemp(reln))
+ remove_reln(reln, forknum);
+
+ mdimmedsync(reln, forknum);
+}
+
+static void
+fsync_checker_writev(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, const void **buffers,
+ BlockNumber nblocks, bool skipFsync)
+{
+ if (!SmgrIsTemp(reln) && !skipFsync)
+ add_reln(reln, forknum);
+
+ mdwritev(reln, forknum, blocknum, buffers, nblocks, skipFsync);
+}
+
+static void
+fsync_checker_writeback(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, BlockNumber nblocks)
+{
+ if (!SmgrIsTemp(reln))
+ remove_reln(reln, forknum);
+
+ mdwriteback(reln, forknum, blocknum, nblocks);
+}
+
+static void
+fsync_checker_zeroextend(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, int nblocks, bool skipFsync)
+{
+ if (!SmgrIsTemp(reln) && !skipFsync)
+ add_reln(reln, forknum);
+
+ mdzeroextend(reln, forknum, blocknum, nblocks, skipFsync);
+}
diff --git a/contrib/fsync_checker/meson.build b/contrib/fsync_checker/meson.build
new file mode 100644
index 0000000000..ce6ed7fe90
--- /dev/null
+++ b/contrib/fsync_checker/meson.build
@@ -0,0 +1,22 @@
+# Copyright (c) 2023, PostgreSQL Global Development Group
+
+fsync_checker_sources = files(
+ 'fsync_checker_smgr.c',
+)
+
+if host_system == 'windows'
+ fsync_checker_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+ '--NAME', 'fsync_checker',
+ '--FILEDESC', 'fsync_checker - SMGR extension for checking volatile relations',])
+endif
+
+fsync_checker = shared_module('fsync_checker',
+ fsync_checker_sources,
+ kwargs: contrib_mod_args,
+)
+contrib_targets += fsync_checker
+
+install_data(
+ 'fsync_checker.control',
+ kwargs: contrib_data_args,
+)
diff --git a/contrib/meson.build b/contrib/meson.build
index c12dc906ca..e5d872494a 100644
--- a/contrib/meson.build
+++ b/contrib/meson.build
@@ -29,6 +29,7 @@ subdir('dict_int')
subdir('dict_xsyn')
subdir('earthdistance')
subdir('file_fdw')
+subdir('fsync_checker')
subdir('fuzzystrmatch')
subdir('hstore')
subdir('hstore_plperl')
--
Tristan Partin
Neon (https://neon.tech)