Magnus Hagander 1fde38beaa Allow on-line enabling and disabling of data checksums
This makes it possible to turn checksums on in a live cluster, without
the previous need for dump/reload or logical replication (and to turn it
off).

Enabling checkusm starts a background process in the form of a
launcher/worker combination that goes through the entire database and
recalculates checksums on each and every page. Only when all pages have
been checksummed are they fully enabled in the cluster. Any failure of
the process will revert to checksums off and the process has to be
started.

This adds a new WAL record that indicates the state of checksums, so
the process works across replicated clusters.

Authors: Magnus Hagander and Daniel Gustafsson
Review: Tomas Vondra, Michael Banck, Heikki Linnakangas, Andrey Borodin
2018-04-05 22:04:48 +02:00

760 lines
20 KiB
C

/*-------------------------------------------------------------------------
*
* xlogfuncs.c
*
* PostgreSQL write-ahead log manager user interface functions
*
* This file contains WAL control and information functions.
*
*
* Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/backend/access/transam/xlogfuncs.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/htup_details.h"
#include "access/xlog.h"
#include "access/xlog_internal.h"
#include "access/xlogutils.h"
#include "catalog/catalog.h"
#include "catalog/pg_type.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "postmaster/checksumhelper.h"
#include "replication/walreceiver.h"
#include "storage/smgr.h"
#include "utils/builtins.h"
#include "utils/memutils.h"
#include "utils/numeric.h"
#include "utils/guc.h"
#include "utils/pg_lsn.h"
#include "utils/timestamp.h"
#include "utils/tuplestore.h"
#include "storage/fd.h"
#include "storage/ipc.h"
/*
* Store label file and tablespace map during non-exclusive backups.
*/
static StringInfo label_file;
static StringInfo tblspc_map_file;
/*
* Called when the backend exits with a running non-exclusive base backup,
* to clean up state.
*/
static void
nonexclusive_base_backup_cleanup(int code, Datum arg)
{
do_pg_abort_backup();
ereport(WARNING,
(errmsg("aborting backup due to backend exiting before pg_stop_backup was called")));
}
/*
* pg_start_backup: set up for taking an on-line backup dump
*
* Essentially what this does is to create a backup label file in $PGDATA,
* where it will be archived as part of the backup dump. The label file
* contains the user-supplied label string (typically this would be used
* to tell where the backup dump will be stored) and the starting time and
* starting WAL location for the dump.
*
* Permission checking for this function is managed through the normal
* GRANT system.
*/
Datum
pg_start_backup(PG_FUNCTION_ARGS)
{
text *backupid = PG_GETARG_TEXT_PP(0);
bool fast = PG_GETARG_BOOL(1);
bool exclusive = PG_GETARG_BOOL(2);
char *backupidstr;
XLogRecPtr startpoint;
SessionBackupState status = get_backup_status();
backupidstr = text_to_cstring(backupid);
if (status == SESSION_BACKUP_NON_EXCLUSIVE)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("a backup is already in progress in this session")));
if (exclusive)
{
startpoint = do_pg_start_backup(backupidstr, fast, NULL, NULL,
NULL, NULL, false, true);
}
else
{
MemoryContext oldcontext;
/*
* Label file and tablespace map file need to be long-lived, since
* they are read in pg_stop_backup.
*/
oldcontext = MemoryContextSwitchTo(TopMemoryContext);
label_file = makeStringInfo();
tblspc_map_file = makeStringInfo();
MemoryContextSwitchTo(oldcontext);
startpoint = do_pg_start_backup(backupidstr, fast, NULL, label_file,
NULL, tblspc_map_file, false, true);
before_shmem_exit(nonexclusive_base_backup_cleanup, (Datum) 0);
}
PG_RETURN_LSN(startpoint);
}
/*
* pg_stop_backup: finish taking an on-line backup dump
*
* We write an end-of-backup WAL record, and remove the backup label file
* created by pg_start_backup, creating a backup history file in pg_wal
* instead (whence it will immediately be archived). The backup history file
* contains the same info found in the label file, plus the backup-end time
* and WAL location. Before 9.0, the backup-end time was read from the backup
* history file at the beginning of archive recovery, but we now use the WAL
* record for that and the file is for informational and debug purposes only.
*
* Note: different from CancelBackup which just cancels online backup mode.
*
* Note: this version is only called to stop an exclusive backup. The function
* pg_stop_backup_v2 (overloaded as pg_stop_backup in SQL) is called to
* stop non-exclusive backups.
*
* Permission checking for this function is managed through the normal
* GRANT system.
*/
Datum
pg_stop_backup(PG_FUNCTION_ARGS)
{
XLogRecPtr stoppoint;
SessionBackupState status = get_backup_status();
if (status == SESSION_BACKUP_NON_EXCLUSIVE)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("non-exclusive backup in progress"),
errhint("Did you mean to use pg_stop_backup('f')?")));
/*
* Exclusive backups were typically started in a different connection, so
* don't try to verify that status of backup is set to
* SESSION_BACKUP_EXCLUSIVE in this function. Actual verification that an
* exclusive backup is in fact running is handled inside
* do_pg_stop_backup.
*/
stoppoint = do_pg_stop_backup(NULL, true, NULL);
PG_RETURN_LSN(stoppoint);
}
/*
* pg_stop_backup_v2: finish taking exclusive or nonexclusive on-line backup.
*
* Works the same as pg_stop_backup, except for non-exclusive backups it returns
* the backup label and tablespace map files as text fields in as part of the
* resultset.
*
* The first parameter (variable 'exclusive') allows the user to tell us if
* this is an exclusive or a non-exclusive backup.
*
* The second parameter (variable 'waitforarchive'), which is optional,
* allows the user to choose if they want to wait for the WAL to be archived
* or if we should just return as soon as the WAL record is written.
*
* Permission checking for this function is managed through the normal
* GRANT system.
*/
Datum
pg_stop_backup_v2(PG_FUNCTION_ARGS)
{
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
TupleDesc tupdesc;
Tuplestorestate *tupstore;
MemoryContext per_query_ctx;
MemoryContext oldcontext;
Datum values[3];
bool nulls[3];
bool exclusive = PG_GETARG_BOOL(0);
bool waitforarchive = PG_GETARG_BOOL(1);
XLogRecPtr stoppoint;
SessionBackupState status = get_backup_status();
/* check to see if caller supports us returning a tuplestore */
if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("set-valued function called in context that cannot accept a set")));
if (!(rsinfo->allowedModes & SFRM_Materialize))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("materialize mode required, but it is not " \
"allowed in this context")));
/* Build a tuple descriptor for our result type */
if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
elog(ERROR, "return type must be a row type");
per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
oldcontext = MemoryContextSwitchTo(per_query_ctx);
tupstore = tuplestore_begin_heap(true, false, work_mem);
rsinfo->returnMode = SFRM_Materialize;
rsinfo->setResult = tupstore;
rsinfo->setDesc = tupdesc;
MemoryContextSwitchTo(oldcontext);
MemSet(values, 0, sizeof(values));
MemSet(nulls, 0, sizeof(nulls));
if (exclusive)
{
if (status == SESSION_BACKUP_NON_EXCLUSIVE)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("non-exclusive backup in progress"),
errhint("Did you mean to use pg_stop_backup('f')?")));
/*
* Stop the exclusive backup, and since we're in an exclusive backup
* return NULL for both backup_label and tablespace_map.
*/
stoppoint = do_pg_stop_backup(NULL, waitforarchive, NULL);
nulls[1] = true;
nulls[2] = true;
}
else
{
if (status != SESSION_BACKUP_NON_EXCLUSIVE)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("non-exclusive backup is not in progress"),
errhint("Did you mean to use pg_stop_backup('t')?")));
/*
* Stop the non-exclusive backup. Return a copy of the backup label
* and tablespace map so they can be written to disk by the caller.
*/
stoppoint = do_pg_stop_backup(label_file->data, waitforarchive, NULL);
cancel_before_shmem_exit(nonexclusive_base_backup_cleanup, (Datum) 0);
values[1] = CStringGetTextDatum(label_file->data);
values[2] = CStringGetTextDatum(tblspc_map_file->data);
/* Free structures allocated in TopMemoryContext */
pfree(label_file->data);
pfree(label_file);
label_file = NULL;
pfree(tblspc_map_file->data);
pfree(tblspc_map_file);
tblspc_map_file = NULL;
}
/* Stoppoint is included on both exclusive and nonexclusive backups */
values[0] = LSNGetDatum(stoppoint);
tuplestore_putvalues(tupstore, tupdesc, values, nulls);
tuplestore_donestoring(typstore);
return (Datum) 0;
}
/*
* pg_switch_wal: switch to next xlog file
*
* Permission checking for this function is managed through the normal
* GRANT system.
*/
Datum
pg_switch_wal(PG_FUNCTION_ARGS)
{
XLogRecPtr switchpoint;
if (RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is in progress"),
errhint("WAL control functions cannot be executed during recovery.")));
switchpoint = RequestXLogSwitch(false);
/*
* As a convenience, return the WAL location of the switch record
*/
PG_RETURN_LSN(switchpoint);
}
/*
* pg_create_restore_point: a named point for restore
*
* Permission checking for this function is managed through the normal
* GRANT system.
*/
Datum
pg_create_restore_point(PG_FUNCTION_ARGS)
{
text *restore_name = PG_GETARG_TEXT_PP(0);
char *restore_name_str;
XLogRecPtr restorepoint;
if (RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
(errmsg("recovery is in progress"),
errhint("WAL control functions cannot be executed during recovery."))));
if (!XLogIsNeeded())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL level not sufficient for creating a restore point"),
errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
restore_name_str = text_to_cstring(restore_name);
if (strlen(restore_name_str) >= MAXFNAMELEN)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("value too long for restore point (maximum %d characters)", MAXFNAMELEN - 1)));
restorepoint = XLogRestorePoint(restore_name_str);
/*
* As a convenience, return the WAL location of the restore point record
*/
PG_RETURN_LSN(restorepoint);
}
/*
* Report the current WAL write location (same format as pg_start_backup etc)
*
* This is useful for determining how much of WAL is visible to an external
* archiving process. Note that the data before this point is written out
* to the kernel, but is not necessarily synced to disk.
*/
Datum
pg_current_wal_lsn(PG_FUNCTION_ARGS)
{
XLogRecPtr current_recptr;
if (RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is in progress"),
errhint("WAL control functions cannot be executed during recovery.")));
current_recptr = GetXLogWriteRecPtr();
PG_RETURN_LSN(current_recptr);
}
/*
* Report the current WAL insert location (same format as pg_start_backup etc)
*
* This function is mostly for debugging purposes.
*/
Datum
pg_current_wal_insert_lsn(PG_FUNCTION_ARGS)
{
XLogRecPtr current_recptr;
if (RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is in progress"),
errhint("WAL control functions cannot be executed during recovery.")));
current_recptr = GetXLogInsertRecPtr();
PG_RETURN_LSN(current_recptr);
}
/*
* Report the current WAL flush location (same format as pg_start_backup etc)
*
* This function is mostly for debugging purposes.
*/
Datum
pg_current_wal_flush_lsn(PG_FUNCTION_ARGS)
{
XLogRecPtr current_recptr;
if (RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is in progress"),
errhint("WAL control functions cannot be executed during recovery.")));
current_recptr = GetFlushRecPtr();
PG_RETURN_LSN(current_recptr);
}
/*
* Report the last WAL receive location (same format as pg_start_backup etc)
*
* This is useful for determining how much of WAL is guaranteed to be received
* and synced to disk by walreceiver.
*/
Datum
pg_last_wal_receive_lsn(PG_FUNCTION_ARGS)
{
XLogRecPtr recptr;
recptr = GetWalRcvWriteRecPtr(NULL, NULL);
if (recptr == 0)
PG_RETURN_NULL();
PG_RETURN_LSN(recptr);
}
/*
* Report the last WAL replay location (same format as pg_start_backup etc)
*
* This is useful for determining how much of WAL is visible to read-only
* connections during recovery.
*/
Datum
pg_last_wal_replay_lsn(PG_FUNCTION_ARGS)
{
XLogRecPtr recptr;
recptr = GetXLogReplayRecPtr(NULL);
if (recptr == 0)
PG_RETURN_NULL();
PG_RETURN_LSN(recptr);
}
/*
* Compute an xlog file name and decimal byte offset given a WAL location,
* such as is returned by pg_stop_backup() or pg_switch_wal().
*
* Note that a location exactly at a segment boundary is taken to be in
* the previous segment. This is usually the right thing, since the
* expected usage is to determine which xlog file(s) are ready to archive.
*/
Datum
pg_walfile_name_offset(PG_FUNCTION_ARGS)
{
XLogSegNo xlogsegno;
uint32 xrecoff;
XLogRecPtr locationpoint = PG_GETARG_LSN(0);
char xlogfilename[MAXFNAMELEN];
Datum values[2];
bool isnull[2];
TupleDesc resultTupleDesc;
HeapTuple resultHeapTuple;
Datum result;
if (RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is in progress"),
errhint("pg_walfile_name_offset() cannot be executed during recovery.")));
/*
* Construct a tuple descriptor for the result row. This must match this
* function's pg_proc entry!
*/
resultTupleDesc = CreateTemplateTupleDesc(2, false);
TupleDescInitEntry(resultTupleDesc, (AttrNumber) 1, "file_name",
TEXTOID, -1, 0);
TupleDescInitEntry(resultTupleDesc, (AttrNumber) 2, "file_offset",
INT4OID, -1, 0);
resultTupleDesc = BlessTupleDesc(resultTupleDesc);
/*
* xlogfilename
*/
XLByteToPrevSeg(locationpoint, xlogsegno, wal_segment_size);
XLogFileName(xlogfilename, ThisTimeLineID, xlogsegno, wal_segment_size);
values[0] = CStringGetTextDatum(xlogfilename);
isnull[0] = false;
/*
* offset
*/
xrecoff = XLogSegmentOffset(locationpoint, wal_segment_size);
values[1] = UInt32GetDatum(xrecoff);
isnull[1] = false;
/*
* Tuple jam: Having first prepared your Datums, then squash together
*/
resultHeapTuple = heap_form_tuple(resultTupleDesc, values, isnull);
result = HeapTupleGetDatum(resultHeapTuple);
PG_RETURN_DATUM(result);
}
/*
* Compute an xlog file name given a WAL location,
* such as is returned by pg_stop_backup() or pg_switch_wal().
*/
Datum
pg_walfile_name(PG_FUNCTION_ARGS)
{
XLogSegNo xlogsegno;
XLogRecPtr locationpoint = PG_GETARG_LSN(0);
char xlogfilename[MAXFNAMELEN];
if (RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is in progress"),
errhint("pg_walfile_name() cannot be executed during recovery.")));
XLByteToPrevSeg(locationpoint, xlogsegno, wal_segment_size);
XLogFileName(xlogfilename, ThisTimeLineID, xlogsegno, wal_segment_size);
PG_RETURN_TEXT_P(cstring_to_text(xlogfilename));
}
/*
* pg_wal_replay_pause - pause recovery now
*
* Permission checking for this function is managed through the normal
* GRANT system.
*/
Datum
pg_wal_replay_pause(PG_FUNCTION_ARGS)
{
if (!RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is not in progress"),
errhint("Recovery control functions can only be executed during recovery.")));
SetRecoveryPause(true);
PG_RETURN_VOID();
}
/*
* pg_wal_replay_resume - resume recovery now
*
* Permission checking for this function is managed through the normal
* GRANT system.
*/
Datum
pg_wal_replay_resume(PG_FUNCTION_ARGS)
{
if (!RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is not in progress"),
errhint("Recovery control functions can only be executed during recovery.")));
SetRecoveryPause(false);
PG_RETURN_VOID();
}
/*
* pg_is_wal_replay_paused
*/
Datum
pg_is_wal_replay_paused(PG_FUNCTION_ARGS)
{
if (!RecoveryInProgress())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("recovery is not in progress"),
errhint("Recovery control functions can only be executed during recovery.")));
PG_RETURN_BOOL(RecoveryIsPaused());
}
/*
* Returns timestamp of latest processed commit/abort record.
*
* When the server has been started normally without recovery the function
* returns NULL.
*/
Datum
pg_last_xact_replay_timestamp(PG_FUNCTION_ARGS)
{
TimestampTz xtime;
xtime = GetLatestXTime();
if (xtime == 0)
PG_RETURN_NULL();
PG_RETURN_TIMESTAMPTZ(xtime);
}
/*
* Returns bool with current recovery mode, a global state.
*/
Datum
pg_is_in_recovery(PG_FUNCTION_ARGS)
{
PG_RETURN_BOOL(RecoveryInProgress());
}
/*
* Compute the difference in bytes between two WAL locations.
*/
Datum
pg_wal_lsn_diff(PG_FUNCTION_ARGS)
{
Datum result;
result = DirectFunctionCall2(pg_lsn_mi,
PG_GETARG_DATUM(0),
PG_GETARG_DATUM(1));
PG_RETURN_NUMERIC(result);
}
/*
* Returns bool with current on-line backup mode, a global state.
*/
Datum
pg_is_in_backup(PG_FUNCTION_ARGS)
{
PG_RETURN_BOOL(BackupInProgress());
}
/*
* Returns start time of an online exclusive backup.
*
* When there's no exclusive backup in progress, the function
* returns NULL.
*/
Datum
pg_backup_start_time(PG_FUNCTION_ARGS)
{
Datum xtime;
FILE *lfp;
char fline[MAXPGPATH];
char backup_start_time[30];
/*
* See if label file is present
*/
lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
if (lfp == NULL)
{
if (errno != ENOENT)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read file \"%s\": %m",
BACKUP_LABEL_FILE)));
PG_RETURN_NULL();
}
/*
* Parse the file to find the START TIME line.
*/
backup_start_time[0] = '\0';
while (fgets(fline, sizeof(fline), lfp) != NULL)
{
if (sscanf(fline, "START TIME: %25[^\n]\n", backup_start_time) == 1)
break;
}
/* Check for a read error. */
if (ferror(lfp))
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read file \"%s\": %m", BACKUP_LABEL_FILE)));
/* Close the backup label file. */
if (FreeFile(lfp))
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not close file \"%s\": %m", BACKUP_LABEL_FILE)));
if (strlen(backup_start_time) == 0)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
/*
* Convert the time string read from file to TimestampTz form.
*/
xtime = DirectFunctionCall3(timestamptz_in,
CStringGetDatum(backup_start_time),
ObjectIdGetDatum(InvalidOid),
Int32GetDatum(-1));
PG_RETURN_DATUM(xtime);
}
/*
* Disables checksums for the cluster, unless already disabled.
*
* Has immediate effect - the checksums are set to off right away.
*/
Datum
disable_data_checksums(PG_FUNCTION_ARGS)
{
/*
* If we don't need to write new checksums, then clearly they are already
* disabled.
*/
if (!DataChecksumsNeedWrite())
ereport(ERROR,
(errmsg("data checksums already disabled")));
ShutdownChecksumHelperIfRunning();
SetDataChecksumsOff();
PG_RETURN_VOID();
}
/*
* Enables checksums for the cluster, unless already enabled.
*
* Supports vacuum-like cost-based throttling, to limit system load.
* Starts a background worker that updates checksums on existing data.
*/
Datum
enable_data_checksums(PG_FUNCTION_ARGS)
{
int cost_delay = PG_GETARG_INT32(0);
int cost_limit = PG_GETARG_INT32(1);
if (cost_delay < 0)
ereport(ERROR,
(errmsg("cost delay cannot be less than zero")));
if (cost_limit <= 0)
ereport(ERROR,
(errmsg("cost limit must be a positive value")));
/*
* Allow state change from "off" or from "inprogress", since this is how
* we restart the worker if necessary.
*/
if (DataChecksumsNeedVerify())
ereport(ERROR,
(errmsg("data checksums already enabled")));
SetDataChecksumsInProgress();
if (!StartChecksumHelperLauncher(cost_delay, cost_limit))
ereport(ERROR,
(errmsg("failed to start checksum helper process")));
PG_RETURN_VOID();
}