mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-30 00:04:49 -04:00 
			
		
		
		
	Fix postmaster's handling of a startup-process crash.
Ordinarily, a failure (unexpected exit status) of the startup subprocess should be considered fatal, so the postmaster should just close up shop and quit. However, if we sent the startup process a SIGQUIT or SIGKILL signal, the failure is hardly "unexpected", and we should attempt restart; this is necessary for recovery from ordinary backend crashes in hot-standby scenarios. I attempted to implement the latter rule with a two-line patch in commit 442231d7f71764b8c628044e7ce2225f9aa43b67, but it now emerges that that patch was a few bricks shy of a load: it failed to distinguish the case of a signaled startup process from the case where the new startup process crashes before reaching database consistency. That resulted in infinitely respawning a new startup process only to have it crash again. To handle this properly, we really must track whether we have sent the *current* startup process a kill signal. Rather than add yet another ad-hoc boolean to the postmaster's state, I chose to unify this with the existing RecoveryError flag into an enum tracking the startup process's state. That seems more consistent with the postmaster's general state machine design. Back-patch to 9.0, like the previous patch.
This commit is contained in:
		
							parent
							
								
									6ba365aa46
								
							
						
					
					
						commit
						45811be94e
					
				| @ -249,6 +249,17 @@ static pid_t StartupPID = 0, | ||||
| 			PgStatPID = 0, | ||||
| 			SysLoggerPID = 0; | ||||
| 
 | ||||
| /* Startup process's status */ | ||||
| typedef enum | ||||
| { | ||||
| 	STARTUP_NOT_RUNNING, | ||||
| 	STARTUP_RUNNING, | ||||
| 	STARTUP_SIGNALED,			/* we sent it a SIGQUIT or SIGKILL */ | ||||
| 	STARTUP_CRASHED | ||||
| } StartupStatusEnum; | ||||
| 
 | ||||
| static StartupStatusEnum StartupStatus = STARTUP_NOT_RUNNING; | ||||
| 
 | ||||
| /* Startup/shutdown state */ | ||||
| #define			NoShutdown		0 | ||||
| #define			SmartShutdown	1 | ||||
| @ -258,7 +269,6 @@ static pid_t StartupPID = 0, | ||||
| static int	Shutdown = NoShutdown; | ||||
| 
 | ||||
| static bool FatalError = false; /* T if recovering from backend crash */ | ||||
| static bool RecoveryError = false;		/* T if WAL recovery failed */ | ||||
| 
 | ||||
| /*
 | ||||
|  * We use a simple state machine to control startup, shutdown, and | ||||
| @ -301,8 +311,6 @@ static bool RecoveryError = false;		/* T if WAL recovery failed */ | ||||
|  * states, nor in PM_SHUTDOWN states (because we don't enter those states | ||||
|  * when trying to recover from a crash).  It can be true in PM_STARTUP state, | ||||
|  * because we don't clear it until we've successfully started WAL redo. | ||||
|  * Similarly, RecoveryError means that we have crashed during recovery, and | ||||
|  * should not try to restart. | ||||
|  */ | ||||
| typedef enum | ||||
| { | ||||
| @ -1246,6 +1254,7 @@ PostmasterMain(int argc, char *argv[]) | ||||
| 	 */ | ||||
| 	StartupPID = StartupDataBase(); | ||||
| 	Assert(StartupPID != 0); | ||||
| 	StartupStatus = STARTUP_RUNNING; | ||||
| 	pmState = PM_STARTUP; | ||||
| 
 | ||||
| 	/* Some workers may be scheduled to start now */ | ||||
| @ -1666,7 +1675,7 @@ ServerLoop(void) | ||||
| 
 | ||||
| 		/* If we have lost the archiver, try to start a new one. */ | ||||
| 		if (PgArchPID == 0 && PgArchStartupAllowed()) | ||||
| 				PgArchPID = pgarch_start(); | ||||
| 			PgArchPID = pgarch_start(); | ||||
| 
 | ||||
| 		/* If we need to signal the autovacuum launcher, do so now */ | ||||
| 		if (avlauncher_needs_signal) | ||||
| @ -2591,6 +2600,7 @@ reaper(SIGNAL_ARGS) | ||||
| 			if (Shutdown > NoShutdown && | ||||
| 				(EXIT_STATUS_0(exitstatus) || EXIT_STATUS_1(exitstatus))) | ||||
| 			{ | ||||
| 				StartupStatus = STARTUP_NOT_RUNNING; | ||||
| 				pmState = PM_WAIT_BACKENDS; | ||||
| 				/* PostmasterStateMachine logic does the rest */ | ||||
| 				continue; | ||||
| @ -2600,6 +2610,7 @@ reaper(SIGNAL_ARGS) | ||||
| 			{ | ||||
| 				ereport(LOG, | ||||
| 						(errmsg("shutdown at recovery target"))); | ||||
| 				StartupStatus = STARTUP_NOT_RUNNING; | ||||
| 				Shutdown = SmartShutdown; | ||||
| 				TerminateChildren(SIGTERM); | ||||
| 				pmState = PM_WAIT_BACKENDS; | ||||
| @ -2624,16 +2635,18 @@ reaper(SIGNAL_ARGS) | ||||
| 			/*
 | ||||
| 			 * After PM_STARTUP, any unexpected exit (including FATAL exit) of | ||||
| 			 * the startup process is catastrophic, so kill other children, | ||||
| 			 * and set RecoveryError so we don't try to reinitialize after | ||||
| 			 * they're gone.  Exception: if FatalError is already set, that | ||||
| 			 * implies we previously sent the startup process a SIGQUIT, so | ||||
| 			 * and set StartupStatus so we don't try to reinitialize after | ||||
| 			 * they're gone.  Exception: if StartupStatus is STARTUP_SIGNALED, | ||||
| 			 * then we previously sent the startup process a SIGQUIT; so | ||||
| 			 * that's probably the reason it died, and we do want to try to | ||||
| 			 * restart in that case. | ||||
| 			 */ | ||||
| 			if (!EXIT_STATUS_0(exitstatus)) | ||||
| 			{ | ||||
| 				if (!FatalError) | ||||
| 					RecoveryError = true; | ||||
| 				if (StartupStatus == STARTUP_SIGNALED) | ||||
| 					StartupStatus = STARTUP_NOT_RUNNING; | ||||
| 				else | ||||
| 					StartupStatus = STARTUP_CRASHED; | ||||
| 				HandleChildCrash(pid, exitstatus, | ||||
| 								 _("startup process")); | ||||
| 				continue; | ||||
| @ -2642,6 +2655,7 @@ reaper(SIGNAL_ARGS) | ||||
| 			/*
 | ||||
| 			 * Startup succeeded, commence normal operations | ||||
| 			 */ | ||||
| 			StartupStatus = STARTUP_NOT_RUNNING; | ||||
| 			FatalError = false; | ||||
| 			Assert(AbortStartTime == 0); | ||||
| 			ReachedNormalRunning = true; | ||||
| @ -2962,7 +2976,7 @@ CleanupBackgroundWorker(int pid, | ||||
| 		ReportBackgroundWorkerPID(rw);	/* report child death */ | ||||
| 
 | ||||
| 		LogChildExit(EXIT_STATUS_0(exitstatus) ? DEBUG1 : LOG, | ||||
| 							    namebuf, pid, exitstatus); | ||||
| 					 namebuf, pid, exitstatus); | ||||
| 
 | ||||
| 		return true; | ||||
| 	} | ||||
| @ -3190,7 +3204,10 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) | ||||
| 
 | ||||
| 	/* Take care of the startup process too */ | ||||
| 	if (pid == StartupPID) | ||||
| 	{ | ||||
| 		StartupPID = 0; | ||||
| 		StartupStatus = STARTUP_CRASHED; | ||||
| 	} | ||||
| 	else if (StartupPID != 0 && take_action) | ||||
| 	{ | ||||
| 		ereport(DEBUG2, | ||||
| @ -3198,6 +3215,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) | ||||
| 								 (SendStop ? "SIGSTOP" : "SIGQUIT"), | ||||
| 								 (int) StartupPID))); | ||||
| 		signal_child(StartupPID, (SendStop ? SIGSTOP : SIGQUIT)); | ||||
| 		StartupStatus = STARTUP_SIGNALED; | ||||
| 	} | ||||
| 
 | ||||
| 	/* Take care of the bgwriter too */ | ||||
| @ -3589,13 +3607,14 @@ PostmasterStateMachine(void) | ||||
| 	} | ||||
| 
 | ||||
| 	/*
 | ||||
| 	 * If recovery failed, or the user does not want an automatic restart | ||||
| 	 * after backend crashes, wait for all non-syslogger children to exit, and | ||||
| 	 * then exit postmaster. We don't try to reinitialize when recovery fails, | ||||
| 	 * because more than likely it will just fail again and we will keep | ||||
| 	 * trying forever. | ||||
| 	 * If the startup process failed, or the user does not want an automatic | ||||
| 	 * restart after backend crashes, wait for all non-syslogger children to | ||||
| 	 * exit, and then exit postmaster.  We don't try to reinitialize when the | ||||
| 	 * startup process fails, because more than likely it will just fail again | ||||
| 	 * and we will keep trying forever. | ||||
| 	 */ | ||||
| 	if (pmState == PM_NO_CHILDREN && (RecoveryError || !restart_after_crash)) | ||||
| 	if (pmState == PM_NO_CHILDREN && | ||||
| 		(StartupStatus == STARTUP_CRASHED || !restart_after_crash)) | ||||
| 		ExitPostmaster(1); | ||||
| 
 | ||||
| 	/*
 | ||||
| @ -3615,6 +3634,7 @@ PostmasterStateMachine(void) | ||||
| 
 | ||||
| 		StartupPID = StartupDataBase(); | ||||
| 		Assert(StartupPID != 0); | ||||
| 		StartupStatus = STARTUP_RUNNING; | ||||
| 		pmState = PM_STARTUP; | ||||
| 		/* crash recovery started, reset SIGKILL flag */ | ||||
| 		AbortStartTime = 0; | ||||
| @ -3746,7 +3766,11 @@ TerminateChildren(int signal) | ||||
| { | ||||
| 	SignalChildren(signal); | ||||
| 	if (StartupPID != 0) | ||||
| 	{ | ||||
| 		signal_child(StartupPID, signal); | ||||
| 		if (signal == SIGQUIT || signal == SIGKILL) | ||||
| 			StartupStatus = STARTUP_SIGNALED; | ||||
| 	} | ||||
| 	if (BgWriterPID != 0) | ||||
| 		signal_child(BgWriterPID, signal); | ||||
| 	if (CheckpointerPID != 0) | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user