PostgreSQL/src/backend/libpq/pqcomm.c

/*-------------------------------------------------------------------------
 *
 * pqcomm.c
 *	  Communication functions between the Frontend and the Backend
 *
 * These routines handle the low-level details of communication between
 * frontend and backend.  They just shove data across the communication
 * channel, and are ignorant of the semantics of the data --- or would be,
 * except for major brain damage in the design of the COPY OUT protocol.
 * Unfortunately, COPY OUT is designed to commandeer the communication
 * channel (it just transfers data without wrapping it into messages).
 * No other messages can be sent while COPY OUT is in progress; and if the
 * copy is aborted by an elog(ERROR), we need to close out the copy so that
 * the frontend gets back into sync.  Therefore, these routines have to be
 * aware of COPY OUT state.
 *
 * NOTE: generally, it's a bad idea to emit outgoing messages directly with
 * pq_putbytes(), especially if the message would require multiple calls
 * to send.  Instead, use the routines in pqformat.c to construct the message
 * in a buffer and then emit it in one call to pq_putmessage.  This helps
 * ensure that the channel will not be clogged by an incomplete message
 * if execution is aborted by elog(ERROR) partway through the message.
 * The only non-libpq code that should call pq_putbytes directly is COPY OUT.
 *
 * At one time, libpq was shared between frontend and backend, but now
 * the backend's "backend/libpq" is quite separate from "interfaces/libpq".
 * All that remains is similarities of names to trap the unwary...
 *
 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *	$Id: pqcomm.c,v 1.134 2002/06/14 03:56:46 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */

/*------------------------
 * INTERFACE ROUTINES
 *
 * setup/teardown:
 *		StreamServerPort	- Open postmaster's server port
 *		StreamConnection	- Create new connection with client
 *		StreamClose			- Close a client/backend connection
 *		pq_init			- initialize libpq at backend startup
 *		pq_close		- shutdown libpq at backend exit
 *
 * low-level I/O:
 *		pq_getbytes		- get a known number of bytes from connection
 *		pq_getstring	- get a null terminated string from connection
 *		pq_getbyte		- get next byte from connection
 *		pq_peekbyte		- peek at next byte from connection
 *		pq_putbytes		- send bytes to connection (not flushed until pq_flush)
 *		pq_flush		- flush pending output
 *
 * message-level I/O (and COPY OUT cruft):
 *		pq_putmessage	- send a normal message (suppressed in COPY OUT mode)
 *		pq_startcopyout - inform libpq that a COPY OUT transfer is beginning
 *		pq_endcopyout	- end a COPY OUT transfer
 *
 *------------------------
 */
#include "postgres.h"

#include <signal.h>
#include <errno.h>
#include <fcntl.h>
#include <grp.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <netdb.h>
#include <netinet/in.h>
#ifdef HAVE_NETINET_TCP_H
#include <netinet/tcp.h>
#endif
#include <arpa/inet.h>
#include <sys/file.h>

#include "libpq/libpq.h"
#include "miscadmin.h"
#include "storage/ipc.h"

/* these functions are misnamed - they handle both SSL and non-SSL case */
extern ssize_t read_SSL(Port *, void *ptr, size_t len);
extern ssize_t write_SSL(Port *, const void *ptr, size_t len);

#ifdef USE_SSL
extern void close_SSL(Port *);
#endif /* USE_SSL */


static void pq_close(void);


/*
 * Configuration options
 */
int			Unix_socket_permissions;
char	   *Unix_socket_group;


/*
 * Buffers for low-level I/O
 */

#define PQ_BUFFER_SIZE 8192

static unsigned char PqSendBuffer[PQ_BUFFER_SIZE];
static int	PqSendPointer;		/* Next index to store a byte in
								 * PqSendBuffer */

static unsigned char PqRecvBuffer[PQ_BUFFER_SIZE];
static int	PqRecvPointer;		/* Next index to read a byte from
								 * PqRecvBuffer */
static int	PqRecvLength;		/* End of data available in PqRecvBuffer */

/*
 * Message status
 */
static bool DoingCopyOut;


/* --------------------------------
 *		pq_init - initialize libpq at backend startup
 * --------------------------------
 */
void
pq_init(void)
{
	PqSendPointer = PqRecvPointer = PqRecvLength = 0;
	DoingCopyOut = false;
	on_proc_exit(pq_close, 0);
}


/* --------------------------------
 *		pq_close - shutdown libpq at backend exit
 *
 * Note: in a standalone backend MyProcPort will be null,
 * don't crash during exit...
 * --------------------------------
 */
static void
pq_close(void)
{
	if (MyProcPort != NULL)
	{
#ifdef USE_SSL
		close_SSL(MyProcPort);
#endif /* USE_SSL */
		close(MyProcPort->sock);
		/* make sure any subsequent attempts to do I/O fail cleanly */
		MyProcPort->sock = -1;
	}
}


/*
 * Streams -- wrapper around Unix socket system calls
 *
 *
 *		Stream functions are used for vanilla TCP connection protocol.
 */

static char sock_path[MAXPGPATH];


/* StreamDoUnlink()
 * Shutdown routine for backend connection
 * If a Unix socket is used for communication, explicitly close it.
 */
static void
StreamDoUnlink(void)
{
	Assert(sock_path[0]);
	unlink(sock_path);
}

/*
 * StreamServerPort -- open a sock stream "listening" port.
 *
 * This initializes the Postmaster's connection-accepting port *fdP.
 *
 * RETURNS: STATUS_OK or STATUS_ERROR
 */

int
StreamServerPort(int family, char *hostName, unsigned short portNumber,
				 char *unixSocketName, int *fdP)
{
	SockAddr	saddr;
	int			fd,
				err;
	int			maxconn;
	size_t		len = 0;
	int			one = 1;

	Assert(family == AF_INET || family == AF_UNIX);

	if ((fd = socket(family, SOCK_STREAM, 0)) < 0)
	{
		elog(LOG, "StreamServerPort: socket() failed: %m");
		return STATUS_ERROR;
	}

	if (family == AF_INET)
	{
		if ((setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char *) &one,
						sizeof(one))) == -1)
		{
			elog(LOG, "StreamServerPort: setsockopt(SO_REUSEADDR) failed: %m");
			return STATUS_ERROR;
		}
	}

	MemSet((char *) &saddr, 0, sizeof(saddr));
	saddr.sa.sa_family = family;

#ifdef HAVE_UNIX_SOCKETS
	if (family == AF_UNIX)
	{
		UNIXSOCK_PATH(saddr.un, portNumber, unixSocketName);
		len = UNIXSOCK_LEN(saddr.un);
		strcpy(sock_path, saddr.un.sun_path);

		/*
		 * Grab an interlock file associated with the socket file.
		 */
		if (!CreateSocketLockFile(sock_path, true))
			return STATUS_ERROR;

		/*
		 * Once we have the interlock, we can safely delete any
		 * pre-existing socket file to avoid failure at bind() time.
		 */
		unlink(sock_path);
	}
#endif   /* HAVE_UNIX_SOCKETS */

	if (family == AF_INET)
	{
		/* TCP/IP socket */
		if (hostName[0] == '\0')
			saddr.in.sin_addr.s_addr = htonl(INADDR_ANY);
		else
		{
			struct hostent *hp;

			hp = gethostbyname(hostName);
			if ((hp == NULL) || (hp->h_addrtype != AF_INET))
			{
				elog(LOG, "StreamServerPort: gethostbyname(%s) failed",
					 hostName);
				return STATUS_ERROR;
			}
			memmove((char *) &(saddr.in.sin_addr), (char *) hp->h_addr,
					hp->h_length);
		}

		saddr.in.sin_port = htons(portNumber);
		len = sizeof(struct sockaddr_in);
	}

	err = bind(fd, (struct sockaddr *) & saddr.sa, len);
	if (err < 0)
	{
		if (family == AF_UNIX)
			elog(LOG, "StreamServerPort: bind() failed: %m\n"
				 "\tIs another postmaster already running on port %d?\n"
				 "\tIf not, remove socket node (%s) and retry.",
				 (int) portNumber, sock_path);
		else
			elog(LOG, "StreamServerPort: bind() failed: %m\n"
				 "\tIs another postmaster already running on port %d?\n"
				 "\tIf not, wait a few seconds and retry.",
				 (int) portNumber);
		return STATUS_ERROR;
	}

#ifdef HAVE_UNIX_SOCKETS
	if (family == AF_UNIX)
	{
		/* Arrange to unlink the socket file at exit */
		on_proc_exit(StreamDoUnlink, 0);

		/*
		 * Fix socket ownership/permission if requested.  Note we must do
		 * this before we listen() to avoid a window where unwanted
		 * connections could get accepted.
		 */
		Assert(Unix_socket_group);
		if (Unix_socket_group[0] != '\0')
		{
			char	   *endptr;
			unsigned long int val;
			gid_t		gid;

			val = strtoul(Unix_socket_group, &endptr, 10);
			if (*endptr == '\0')
			{
				/* numeric group id */
				gid = val;
			}
			else
			{
				/* convert group name to id */
				struct group *gr;

				gr = getgrnam(Unix_socket_group);
				if (!gr)
				{
					elog(LOG, "No such group as '%s'",
						 Unix_socket_group);
					return STATUS_ERROR;
				}
				gid = gr->gr_gid;
			}
			if (chown(sock_path, -1, gid) == -1)
			{
				elog(LOG, "Could not set group of %s: %m",
					 sock_path);
				return STATUS_ERROR;
			}
		}

		if (chmod(sock_path, Unix_socket_permissions) == -1)
		{
			elog(LOG, "Could not set permissions on %s: %m",
				 sock_path);
			return STATUS_ERROR;
		}
	}
#endif   /* HAVE_UNIX_SOCKETS */

	/*
	 * Select appropriate accept-queue length limit.  PG_SOMAXCONN is only
	 * intended to provide a clamp on the request on platforms where an
	 * overly large request provokes a kernel error (are there any?).
	 */
	maxconn = MaxBackends * 2;
	if (maxconn > PG_SOMAXCONN)
		maxconn = PG_SOMAXCONN;

	err = listen(fd, maxconn);
	if (err < 0)
	{
		elog(LOG, "StreamServerPort: listen() failed: %m");
		return STATUS_ERROR;
	}

	*fdP = fd;

	return STATUS_OK;
}

/*
 * StreamConnection -- create a new connection with client using
 *		server port.
 *
 * ASSUME: that this doesn't need to be non-blocking because
 *		the Postmaster uses select() to tell when the server master
 *		socket is ready for accept().
 *
 * RETURNS: STATUS_OK or STATUS_ERROR
 */
int
StreamConnection(int server_fd, Port *port)
{
	ACCEPT_TYPE_ARG3 addrlen;

	/* accept connection (and fill in the client (remote) address) */
	addrlen = sizeof(port->raddr);
	if ((port->sock = accept(server_fd,
							 (struct sockaddr *) & port->raddr,
							 &addrlen)) < 0)
	{
		elog(LOG, "StreamConnection: accept() failed: %m");
		return STATUS_ERROR;
	}

#ifdef SCO_ACCEPT_BUG

	/*
	 * UnixWare 7+ and OpenServer 5.0.4 are known to have this bug, but it
	 * shouldn't hurt to catch it for all versions of those platforms.
	 */
	if (port->raddr.sa.sa_family == 0)
		port->raddr.sa.sa_family = AF_UNIX;
#endif

	/* fill in the server (local) address */
	addrlen = sizeof(port->laddr);
	if (getsockname(port->sock, (struct sockaddr *) & port->laddr,
					&addrlen) < 0)
	{
		elog(LOG, "StreamConnection: getsockname() failed: %m");
		return STATUS_ERROR;
	}

	/* select NODELAY and KEEPALIVE options if it's a TCP connection */
	if (port->laddr.sa.sa_family == AF_INET)
	{
		int			on = 1;

		if (setsockopt(port->sock, IPPROTO_TCP, TCP_NODELAY,
					   (char *) &on, sizeof(on)) < 0)
		{
			elog(LOG, "StreamConnection: setsockopt(TCP_NODELAY) failed: %m");
			return STATUS_ERROR;
		}
		if (setsockopt(port->sock, SOL_SOCKET, SO_KEEPALIVE,
					   (char *) &on, sizeof(on)) < 0)
		{
			elog(LOG, "StreamConnection: setsockopt(SO_KEEPALIVE) failed: %m");
			return STATUS_ERROR;
		}
	}

	return STATUS_OK;
}

/*
 * StreamClose -- close a client/backend connection
 */
void
StreamClose(int sock)
{
	/* FIXME - what about closing SSL connections? */
	close(sock);
}


/* --------------------------------
 * Low-level I/O routines begin here.
 *
 * These routines communicate with a frontend client across a connection
 * already established by the preceding routines.
 * --------------------------------
 */


/* --------------------------------
 *		pq_recvbuf - load some bytes into the input buffer
 *
 *		returns 0 if OK, EOF if trouble
 * --------------------------------
 */
static int
pq_recvbuf(void)
{
	if (PqRecvPointer > 0)
	{
		if (PqRecvLength > PqRecvPointer)
		{
			/* still some unread data, left-justify it in the buffer */
			memmove(PqRecvBuffer, PqRecvBuffer + PqRecvPointer,
					PqRecvLength - PqRecvPointer);
			PqRecvLength -= PqRecvPointer;
			PqRecvPointer = 0;
		}
		else
			PqRecvLength = PqRecvPointer = 0;
	}

	/* Can fill buffer from PqRecvLength and upwards */
	for (;;)
	{
		int			r;

		r = read_SSL(MyProcPort, PqRecvBuffer + PqRecvLength,
					 PQ_BUFFER_SIZE - PqRecvLength);

		if (r < 0)
		{
			if (errno == EINTR)
				continue;		/* Ok if interrupted */

			/*
			 * Careful: an elog() that tries to write to the client
			 * would cause recursion to here, leading to stack overflow
			 * and core dump!  This message must go *only* to the postmaster
			 * log.
			 */
			elog(COMMERROR, "pq_recvbuf: recv() failed: %m");
			return EOF;
		}
#ifdef USE_SSL
		if (r == 0 && !MyProcPort->ssl)
#else /* USE_SSL */
		if (r == 0)
#endif /* USE_SSL */
		{
			/* as above, only write to postmaster log */
			elog(COMMERROR, "pq_recvbuf: unexpected EOF on client connection");
			return EOF;
		}
		/* r contains number of bytes read, so just incr length */
		PqRecvLength += r;
		return 0;
	}
}

/* --------------------------------
 *		pq_getbyte	- get a single byte from connection, or return EOF
 * --------------------------------
 */
int
pq_getbyte(void)
{
	while (PqRecvPointer >= PqRecvLength)
	{
		if (pq_recvbuf())		/* If nothing in buffer, then recv some */
			return EOF;			/* Failed to recv data */
	}
	return PqRecvBuffer[PqRecvPointer++];
}

/* --------------------------------
 *		pq_peekbyte		- peek at next byte from connection
 *
 *	 Same as pq_getbyte() except we don't advance the pointer.
 * --------------------------------
 */
int
pq_peekbyte(void)
{
	while (PqRecvPointer >= PqRecvLength)
	{
		if (pq_recvbuf())		/* If nothing in buffer, then recv some */
			return EOF;			/* Failed to recv data */
	}
	return PqRecvBuffer[PqRecvPointer];
}

/* --------------------------------
 *		pq_getbytes		- get a known number of bytes from connection
 *
 *		returns 0 if OK, EOF if trouble
 * --------------------------------
 */
int
pq_getbytes(char *s, size_t len)
{
	size_t		amount;

	while (len > 0)
	{
		while (PqRecvPointer >= PqRecvLength)
		{
			if (pq_recvbuf())	/* If nothing in buffer, then recv some */
				return EOF;		/* Failed to recv data */
		}
		amount = PqRecvLength - PqRecvPointer;
		if (amount > len)
			amount = len;
		memcpy(s, PqRecvBuffer + PqRecvPointer, amount);
		PqRecvPointer += amount;
		s += amount;
		len -= amount;
	}
	return 0;
}

/* --------------------------------
 *		pq_getstring	- get a null terminated string from connection
 *
 *		The return value is placed in an expansible StringInfo.
 *		Note that space allocation comes from the current memory context!
 *
 *		NOTE: this routine does not do any MULTIBYTE conversion,
 *		even though it is presumably useful only for text, because
 *		no code in this module should depend on MULTIBYTE mode.
 *		See pq_getstr in pqformat.c for that.
 *
 *		returns 0 if OK, EOF if trouble
 * --------------------------------
 */
int
pq_getstring(StringInfo s)
{
	int			i;

	/* Reset string to empty */
	s->len = 0;
	s->data[0] = '\0';

	/* Read until we get the terminating '\0' */
	for(;;)
	{
		while (PqRecvPointer >= PqRecvLength)
		{
			if (pq_recvbuf())		/* If nothing in buffer, then recv some */
				return EOF;			/* Failed to recv data */
		}

		for (i = PqRecvPointer; i < PqRecvLength; i++)
		{
			if (PqRecvBuffer[i] == '\0')
			{
				/* does not copy the \0 */
				appendBinaryStringInfo(s, PqRecvBuffer + PqRecvPointer,
									   i - PqRecvPointer);
				PqRecvPointer = i + 1; /* advance past \0 */
				return 0;
			}
		}

		/* If we're here we haven't got the \0 in the buffer yet. */

		appendBinaryStringInfo(s, PqRecvBuffer + PqRecvPointer,
							   PqRecvLength - PqRecvPointer);
		PqRecvPointer = PqRecvLength;
	}
}


/* --------------------------------
 *		pq_putbytes		- send bytes to connection (not flushed until pq_flush)
 *
 *		returns 0 if OK, EOF if trouble
 * --------------------------------
 */
int
pq_putbytes(const char *s, size_t len)
{
	size_t		amount;

	while (len > 0)
	{
		if (PqSendPointer >= PQ_BUFFER_SIZE)
			if (pq_flush())		/* If buffer is full, then flush it out */
				return EOF;
		amount = PQ_BUFFER_SIZE - PqSendPointer;
		if (amount > len)
			amount = len;
		memcpy(PqSendBuffer + PqSendPointer, s, amount);
		PqSendPointer += amount;
		s += amount;
		len -= amount;
	}
	return 0;
}

/* --------------------------------
 *		pq_flush		- flush pending output
 *
 *		returns 0 if OK, EOF if trouble
 * --------------------------------
 */
int
pq_flush(void)
{
	static int last_reported_send_errno = 0;

	unsigned char *bufptr = PqSendBuffer;
	unsigned char *bufend = PqSendBuffer + PqSendPointer;

	while (bufptr < bufend)
	{
		int			r;

		r = write_SSL(MyProcPort, bufptr, bufend - bufptr);

#ifdef USE_SSL
		if (r < 0 || (r == 0 && !MyProcPort->ssl))
#else /* USE_SSL */
		if (r <= 0)
#endif /* USE_SSL */
		{
			if (errno == EINTR)
				continue;		/* Ok if we were interrupted */

			/*
			 * Careful: an elog() that tries to write to the client
			 * would cause recursion to here, leading to stack overflow
			 * and core dump!  This message must go *only* to the postmaster
			 * log.
			 *
			 * If a client disconnects while we're in the midst of output,
			 * we might write quite a bit of data before we get to a safe
			 * query abort point.  So, suppress duplicate log messages.
			 */
			if (errno != last_reported_send_errno)
			{
				last_reported_send_errno = errno;
				elog(COMMERROR, "pq_flush: send() failed: %m");
			}

			/*
			 * We drop the buffered data anyway so that processing can
			 * continue, even though we'll probably quit soon.
			 */
			PqSendPointer = 0;
			return EOF;
		}

		last_reported_send_errno = 0; /* reset after any successful send */
		bufptr += r;
	}

	PqSendPointer = 0;
	return 0;
}


/*
 * Return EOF if the connection has been broken, else 0.
 */
int
pq_eof(void)
{
	char		x;
	int			res = 1;

#ifndef USE_SSL /* not a good solution, but better than nothing */
	res = recv(MyProcPort->sock, &x, 1, MSG_PEEK);

	if (res < 0)
	{
		/* can log to postmaster log only */
		elog(COMMERROR, "pq_eof: recv() failed: %m");
		return EOF;
	}
#endif /* USE_SSL */

	if (res == 0)
		return EOF;
	else
		return 0;
}


/* --------------------------------
 * Message-level I/O routines begin here.
 *
 * These routines understand about COPY OUT protocol.
 * --------------------------------
 */


/* --------------------------------
 *		pq_putmessage	- send a normal message (suppressed in COPY OUT mode)
 *
 *		If msgtype is not '\0', it is a message type code to place before
 *		the message body (len counts only the body size!).
 *		If msgtype is '\0', then the buffer already includes the type code.
 *
 *		All normal messages are suppressed while COPY OUT is in progress.
 *		(In practice only a few messages might get emitted then; dropping
 *		them is annoying, but at least they will still appear in the
 *		postmaster log.)
 *
 *		returns 0 if OK, EOF if trouble
 * --------------------------------
 */
int
pq_putmessage(char msgtype, const char *s, size_t len)
{
	if (DoingCopyOut)
		return 0;
	if (msgtype)
		if (pq_putbytes(&msgtype, 1))
			return EOF;
	return pq_putbytes(s, len);
}

/* --------------------------------
 *		pq_startcopyout - inform libpq that a COPY OUT transfer is beginning
 * --------------------------------
 */
void
pq_startcopyout(void)
{
	DoingCopyOut = true;
}

/* --------------------------------
 *		pq_endcopyout	- end a COPY OUT transfer
 *
 *		If errorAbort is indicated, we are aborting a COPY OUT due to an error,
 *		and must send a terminator line.  Since a partial data line might have
 *		been emitted, send a couple of newlines first (the first one could
 *		get absorbed by a backslash...)
 * --------------------------------
 */
void
pq_endcopyout(bool errorAbort)
{
	if (!DoingCopyOut)
		return;
	if (errorAbort)
		pq_putbytes("\n\n\\.\n", 5);
	/* in non-error case, copy.c will have emitted the terminator line */
	DoingCopyOut = false;
}