Add temporary debug info to help debug 019_replslot_limit.pl failures.

I have not been able to reproduce the occasional failures of
019_replslot_limit.pl we are seeing in the buildfarm and not for lack of
trying. The additional logging and increased log level will hopefully help.

Will be reverted once the cause is identified.

Discussion: https://postgr.es/m/20220218231415.c4plkp4i3reqcwip@alap3.anarazel.de
This commit is contained in:
Andres Freund 2022-02-22 18:02:34 -08:00
parent 9467321649
commit afdeff1052
3 changed files with 34 additions and 2 deletions

View File

@ -177,6 +177,10 @@ ReplicationSlotInitialize(void)
static void static void
ReplicationSlotShmemExit(int code, Datum arg) ReplicationSlotShmemExit(int code, Datum arg)
{ {
/* temp debugging aid to analyze 019_replslot_limit failures */
elog(DEBUG3, "replication slot exit hook, %s active slot",
MyReplicationSlot != NULL ? "with" : "without");
/* Make sure active replication slots are released */ /* Make sure active replication slots are released */
if (MyReplicationSlot != NULL) if (MyReplicationSlot != NULL)
ReplicationSlotRelease(); ReplicationSlotRelease();
@ -554,6 +558,9 @@ ReplicationSlotCleanup(void)
Assert(MyReplicationSlot == NULL); Assert(MyReplicationSlot == NULL);
restart: restart:
/* temp debugging aid to analyze 019_replslot_limit failures */
elog(DEBUG3, "temporary replication slot cleanup: begin");
LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
for (i = 0; i < max_replication_slots; i++) for (i = 0; i < max_replication_slots; i++)
{ {
@ -579,6 +586,8 @@ restart:
} }
LWLockRelease(ReplicationSlotControlLock); LWLockRelease(ReplicationSlotControlLock);
elog(DEBUG3, "temporary replication slot cleanup: done");
} }
/* /*
@ -1284,6 +1293,12 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN,
(void) kill(active_pid, SIGTERM); (void) kill(active_pid, SIGTERM);
last_signaled_pid = active_pid; last_signaled_pid = active_pid;
} }
else
{
/* temp debugging aid to analyze 019_replslot_limit failures */
elog(DEBUG3, "not signalling process %d during invalidation of slot \"%s\"",
active_pid, NameStr(slotname));
}
/* Wait until the slot is released. */ /* Wait until the slot is released. */
ConditionVariableSleep(&s->active_cv, ConditionVariableSleep(&s->active_cv,
@ -1347,6 +1362,10 @@ InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno)
XLogSegNoOffsetToRecPtr(oldestSegno, 0, wal_segment_size, oldestLSN); XLogSegNoOffsetToRecPtr(oldestSegno, 0, wal_segment_size, oldestLSN);
restart: restart:
/* temp debugging aid to analyze 019_replslot_limit failures */
elog(DEBUG3, "begin invalidating obsolete replication slots older than %X/%X",
LSN_FORMAT_ARGS(oldestLSN));
LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
for (int i = 0; i < max_replication_slots; i++) for (int i = 0; i < max_replication_slots; i++)
{ {
@ -1372,6 +1391,8 @@ restart:
ReplicationSlotsComputeRequiredLSN(); ReplicationSlotsComputeRequiredLSN();
} }
elog(DEBUG3, "done invalidating obsolete replication slots");
return invalidated; return invalidated;
} }

View File

@ -700,8 +700,16 @@ StartLogStreamer(char *startpos, uint32 timeline, char *sysidentifier)
bgchild = fork(); bgchild = fork();
if (bgchild == 0) if (bgchild == 0)
{ {
int ret;
/* in child process */ /* in child process */
exit(LogStreamerMain(param)); ret = LogStreamerMain(param);
/* temp debugging aid to analyze 019_replslot_limit failures */
if (verbose)
pg_log_info("log streamer with pid %d exiting", getpid());
exit(ret);
} }
else if (bgchild < 0) else if (bgchild < 0)
{ {

View File

@ -316,13 +316,16 @@ $node_primary3->append_conf(
max_wal_size = 2MB max_wal_size = 2MB
log_checkpoints = yes log_checkpoints = yes
max_slot_wal_keep_size = 1MB max_slot_wal_keep_size = 1MB
# temp debugging aid to analyze 019_replslot_limit failures
log_min_messages=debug3
)); ));
$node_primary3->start; $node_primary3->start;
$node_primary3->safe_psql('postgres', $node_primary3->safe_psql('postgres',
"SELECT pg_create_physical_replication_slot('rep3')"); "SELECT pg_create_physical_replication_slot('rep3')");
# Take backup # Take backup
$backup_name = 'my_backup'; $backup_name = 'my_backup';
$node_primary3->backup($backup_name); $node_primary3->backup($backup_name, backup_options => ['--verbose']);
# Create standby # Create standby
my $node_standby3 = PostgreSQL::Test::Cluster->new('standby_3'); my $node_standby3 = PostgreSQL::Test::Cluster->new('standby_3');
$node_standby3->init_from_backup($node_primary3, $backup_name, $node_standby3->init_from_backup($node_primary3, $backup_name,