Michael Paquier b1ec7f47e3 Cleanup more code and comments related to Windows NT4 (XP days)
All the code and comments cleaned up here are irrelevant since 495ed0e.
Note that this removes an assumption that CreateRestrictedToken() may
not exist, something that could have happened when running under Windows
NT as the code stated.  Rather than assuming that it may not exist, this
causes pg_ctl to fail hard if the function cannot be loaded.

Reported-by: Justin Pryzby
Discussion: https://postgr.es/m/20220826112637.GD2342@telsasoft.com
2022-08-30 09:52:58 +09:00

706 lines
16 KiB
C

/*-------------------------------------------------------------------------
*
* socket.c
* Microsoft Windows Win32 Socket Functions
*
* Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
*
* IDENTIFICATION
* src/backend/port/win32/socket.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
/*
* Indicate if pgwin32_recv() and pgwin32_send() should operate
* in non-blocking mode.
*
* Since the socket emulation layer always sets the actual socket to
* non-blocking mode in order to be able to deliver signals, we must
* specify this in a separate flag if we actually need non-blocking
* operation.
*
* When the flag is nonzero and the underlying Winsock call reports
* WSAEWOULDBLOCK, pgwin32_recv() and pgwin32_send() return -1 with
* errno = EWOULDBLOCK instead of waiting for the socket to become ready.
*
* This flag changes the behaviour *globally* for all socket operations,
* so it should only be set for very short periods of time.
*/
int pgwin32_noblock = 0;
/*
* Undef the macros defined in win32.h, so we can access system functions.
* For example, pgwin32_bind() and pgwin32_listen() in this file call the
* system bind() and listen() directly.
*/
#undef socket
#undef bind
#undef listen
#undef accept
#undef connect
#undef select
#undef recv
#undef send
/*
* Blocking socket functions implemented so they listen on both
* the socket and the signal event, required for signal handling.
*/
/*
 * Set errno from the most recent Winsock error (WSAGetLastError()).
 *
 * Where a WSAxxx code has an exact Berkeley counterpart, the assignment is
 * effectively a no-op: win32_port.h redefines the network-related Berkeley
 * error symbols to carry the values of their WSAxxx counterparts.  The table
 * therefore mostly exists to fold "near-miss" Winsock codes onto something
 * that is sensible to Berkeley-style callers.
 *
 * A code that is not in the table is reported at NOTICE level and mapped to
 * EINVAL.
 */
static void
TranslateSocketError(void)
{
	static const struct
	{
		int			wsa_code;
		int			unix_errno;
	}			errmap[] = {
		{WSAEINVAL, EINVAL},
		{WSANOTINITIALISED, EINVAL},
		{WSAEINVALIDPROVIDER, EINVAL},
		{WSAEINVALIDPROCTABLE, EINVAL},
		{WSAEDESTADDRREQ, EINVAL},
		{WSAEINPROGRESS, EINPROGRESS},
		{WSAEFAULT, EFAULT},
		{WSAEISCONN, EISCONN},
		{WSAEMSGSIZE, EMSGSIZE},
		{WSAEAFNOSUPPORT, EAFNOSUPPORT},
		{WSAEMFILE, EMFILE},
		{WSAENOBUFS, ENOBUFS},
		{WSAEPROTONOSUPPORT, EPROTONOSUPPORT},
		{WSAEPROTOTYPE, EPROTONOSUPPORT},
		{WSAESOCKTNOSUPPORT, EPROTONOSUPPORT},
		{WSAECONNABORTED, ECONNABORTED},
		{WSAECONNREFUSED, ECONNREFUSED},
		{WSAECONNRESET, ECONNRESET},
		{WSAEINTR, EINTR},
		{WSAENOTSOCK, ENOTSOCK},
		{WSAEOPNOTSUPP, EOPNOTSUPP},
		{WSAEWOULDBLOCK, EWOULDBLOCK},
		{WSAEACCES, EACCES},
		{WSAEADDRINUSE, EADDRINUSE},
		{WSAEADDRNOTAVAIL, EADDRNOTAVAIL},
		{WSAEHOSTDOWN, EHOSTDOWN},
		{WSAEHOSTUNREACH, EHOSTUNREACH},
		{WSAHOST_NOT_FOUND, EHOSTUNREACH},
		{WSAENETDOWN, ENETDOWN},
		{WSAENETUNREACH, ENETUNREACH},
		{WSAENETRESET, ENETRESET},
		{WSAENOTCONN, ENOTCONN},
		{WSAESHUTDOWN, ENOTCONN},
		{WSAEDISCON, ENOTCONN},
		{WSAETIMEDOUT, ETIMEDOUT}
	};
	int			wsaerr = WSAGetLastError();
	int			idx;

	for (idx = 0; idx < (int) (sizeof(errmap) / sizeof(errmap[0])); idx++)
	{
		if (errmap[idx].wsa_code == wsaerr)
		{
			errno = errmap[idx].unix_errno;
			return;
		}
	}

	/* Nothing matched: complain, then fall back to a generic error */
	ereport(NOTICE,
			(errmsg_internal("unrecognized win32 socket error code: %d",
							 WSAGetLastError())));
	errno = EINVAL;
}
/*
 * Check for queued signals that are not blocked and dispatch them.
 *
 * Returns 0 if UNBLOCKED_SIGNAL_QUEUE() is zero.  Otherwise calls
 * pgwin32_dispatch_queued_signals(), sets errno to EINTR and returns 1.
 */
static int
pgwin32_poll_signals(void)
{
	if (!UNBLOCKED_SIGNAL_QUEUE())
		return 0;

	pgwin32_dispatch_queued_signals();
	errno = EINTR;
	return 1;
}
/*
 * Report whether socket "s" is a datagram socket.
 *
 * Returns 1 if the socket's SO_TYPE is SOCK_DGRAM and 0 if it is anything
 * else.  If getsockopt() itself fails, 1 is returned as well.
 */
static int
isDataGram(SOCKET s)
{
	int			socktype;
	int			optlen = sizeof(socktype);

	if (getsockopt(s, SOL_SOCKET, SO_TYPE, (char *) &socktype, &optlen) != 0)
		return 1;

	if (socktype == SOCK_DGRAM)
		return 1;

	return 0;
}
/*
 * Wait for the network event(s) "what" (FD_xxx flags) on socket "s", while
 * also watching pgwin32_signal_event.
 *
 * "timeout" is handed to WaitForMultipleObjectsEx() as-is.  It is not used
 * when waiting for FD_WRITE on a UDP socket; that case waits in 100 ms
 * slices, probing the socket with a zero-length WSASend() after each slice.
 *
 * Returns 1 if a requested event was signaled (or, in the UDP write case, if
 * the probe send completed).  Returns 0 with errno set otherwise:
 *	EINTR		the signal event fired or the wait ended with
 *				WAIT_IO_COMPLETION; queued signals have been dispatched
 *	EWOULDBLOCK	the wait timed out
 *	other		a Winsock error translated by TranslateSocketError()
 *
 * Failure to create or reset the event object, or an unexpected wait result,
 * is raised with ereport(ERROR).
 */
int
pgwin32_waitforsinglesocket(SOCKET s, int what, int timeout)
{
	static HANDLE waitevent = INVALID_HANDLE_VALUE;
	static SOCKET current_socket = INVALID_SOCKET;
	static int	isUDP = 0;
	HANDLE		events[2];
	int			r;

	/* Create an event object just once and use it on all future calls */
	if (waitevent == INVALID_HANDLE_VALUE)
	{
		HANDLE		newevent = CreateEvent(NULL, TRUE, FALSE, NULL);

		/*
		 * CreateEvent() reports failure by returning NULL, not
		 * INVALID_HANDLE_VALUE.  Only store the handle once we know it is
		 * good, so that a later call will retry the creation.
		 */
		if (newevent == NULL)
			ereport(ERROR,
					(errmsg_internal("could not create socket waiting event: error code %lu", GetLastError())));

		waitevent = newevent;
	}
	else if (!ResetEvent(waitevent))
		ereport(ERROR,
				(errmsg_internal("could not reset socket waiting event: error code %lu", GetLastError())));

	/*
	 * Track whether socket is UDP or not.  (NB: most likely, this is both
	 * useless and wrong; there is no reason to think that the behavior of
	 * WSAEventSelect is different for TCP and UDP.)
	 */
	if (current_socket != s)
		isUDP = isDataGram(s);
	current_socket = s;

	/*
	 * Attach event to socket.  NOTE: we must detach it again before
	 * returning, since other bits of code may try to attach other events to
	 * the socket.
	 */
	if (WSAEventSelect(s, waitevent, what) != 0)
	{
		TranslateSocketError();
		return 0;
	}

	events[0] = pgwin32_signal_event;
	events[1] = waitevent;

	/*
	 * Just a workaround of unknown locking problem with writing in UDP socket
	 * under high load: Client's pgsql backend sleeps infinitely in
	 * WaitForMultipleObjectsEx, pgstat process sleeps in pgwin32_select().
	 * So, we will wait with small timeout(0.1 sec) and if socket is still
	 * blocked, try WSASend (see comments in pgwin32_select) and wait again.
	 */
	if ((what & FD_WRITE) && isUDP)
	{
		for (;;)
		{
			r = WaitForMultipleObjectsEx(2, events, FALSE, 100, TRUE);

			if (r == WAIT_TIMEOUT)
			{
				char		c;
				WSABUF		buf;
				DWORD		sent;

				/* zero-length probe send; "c" only supplies a valid pointer */
				buf.buf = &c;
				buf.len = 0;

				r = WSASend(s, &buf, 1, &sent, 0, NULL, NULL);
				if (r == 0)		/* Completed - means things are fine! */
				{
					WSAEventSelect(s, NULL, 0);
					return 1;
				}
				else if (WSAGetLastError() != WSAEWOULDBLOCK)
				{
					TranslateSocketError();
					WSAEventSelect(s, NULL, 0);
					return 0;
				}
			}
			else
				break;
		}
	}
	else
		r = WaitForMultipleObjectsEx(2, events, FALSE, timeout, TRUE);

	/* Detach the event from the socket again */
	WSAEventSelect(s, NULL, 0);

	if (r == WAIT_OBJECT_0 || r == WAIT_IO_COMPLETION)
	{
		pgwin32_dispatch_queued_signals();
		errno = EINTR;
		return 0;
	}
	if (r == WAIT_OBJECT_0 + 1)
		return 1;
	if (r == WAIT_TIMEOUT)
	{
		errno = EWOULDBLOCK;
		return 0;
	}
	ereport(ERROR,
			(errmsg_internal("unrecognized return value from WaitForMultipleObjects: %d (error code %lu)", r, GetLastError())));
	return 0;
}
/*
 * Create a socket, setting it to overlapped and non-blocking.
 *
 * Returns the new socket with errno set to 0.  On failure, returns
 * INVALID_SOCKET with errno set from the Winsock error; a socket that was
 * created but could not be switched to non-blocking mode is closed again
 * rather than leaked.
 */
SOCKET
pgwin32_socket(int af, int type, int protocol)
{
	SOCKET		s;
	unsigned long on = 1;

	s = WSASocket(af, type, protocol, NULL, 0, WSA_FLAG_OVERLAPPED);
	if (s == INVALID_SOCKET)
	{
		TranslateSocketError();
		return INVALID_SOCKET;
	}

	if (ioctlsocket(s, FIONBIO, &on))
	{
		int			save_errno;

		/* Translate first: closesocket() may replace the Winsock error */
		TranslateSocketError();
		save_errno = errno;
		closesocket(s);
		errno = save_errno;
		return INVALID_SOCKET;
	}
	errno = 0;

	return s;
}
/*
 * bind() wrapper: calls the system bind() and, if it reports a negative
 * result, sets errno from the Winsock error.  The result of bind() is
 * returned unchanged.
 */
int
pgwin32_bind(SOCKET s, struct sockaddr *addr, int addrlen)
{
	int			rc = bind(s, addr, addrlen);

	if (rc >= 0)
		return rc;

	TranslateSocketError();
	return rc;
}
/*
 * listen() wrapper: calls the system listen() and, if it reports a negative
 * result, sets errno from the Winsock error.  The result of listen() is
 * returned unchanged.
 */
int
pgwin32_listen(SOCKET s, int backlog)
{
	int			rc = listen(s, backlog);

	if (rc >= 0)
		return rc;

	TranslateSocketError();
	return rc;
}
/*
 * accept() wrapper built on WSAAccept().
 *
 * Returns the accepted socket, or INVALID_SOCKET with errno set from the
 * Winsock error.
 */
SOCKET
pgwin32_accept(SOCKET s, struct sockaddr *addr, int *addrlen)
{
	SOCKET		newsock;

	/*
	 * Give queued signals a chance to run, but deliberately ignore the
	 * result: we must not return with EINTR, since pqcomm.c does not handle
	 * that.
	 */
	(void) pgwin32_poll_signals();

	newsock = WSAAccept(s, addr, addrlen, NULL, 0);
	if (newsock == INVALID_SOCKET)
		TranslateSocketError();

	return newsock;
}
/*
 * connect() wrapper built on WSAConnect().
 *
 * If the connection cannot be completed immediately (WSAEWOULDBLOCK), wait
 * for FD_CONNECT on the socket.  Signals that arrive during the wait are
 * dispatched by pgwin32_waitforsinglesocket(), but they never make this
 * function return early with EINTR; we simply resume waiting.
 *
 * Returns 0 once WSAConnect() succeeds or FD_CONNECT is signaled.  Returns
 * -1 with errno set if WSAConnect() fails outright, or if the wait itself
 * fails for a reason other than signal delivery.
 *
 * NOTE(review): per the Winsock documentation FD_CONNECT is also recorded
 * when a connection attempt fails; this function does not inspect the
 * outcome after the event fires -- confirm that callers detect a failed
 * connection on their first I/O.
 */
int
pgwin32_connect(SOCKET s, const struct sockaddr *addr, int addrlen)
{
	int			r;

	r = WSAConnect(s, addr, addrlen, NULL, NULL, NULL, NULL);
	if (r == 0)
		return 0;

	if (WSAGetLastError() != WSAEWOULDBLOCK)
	{
		TranslateSocketError();
		return -1;
	}

	while (pgwin32_waitforsinglesocket(s, FD_CONNECT, INFINITE) == 0)
	{
		/*
		 * Keep looping only as long as we are just delivering signals.  Any
		 * other failure (e.g. WSAEventSelect() rejecting the socket) would
		 * recur on every iteration, so report it instead of spinning
		 * forever.  errno has been set by pgwin32_waitforsinglesocket().
		 */
		if (errno != EINTR)
			return -1;
	}

	return 0;
}
/*
 * recv() wrapper built on WSARecv().
 *
 * Returns the number of bytes received, or -1 with errno set:
 *	EINTR		queued signals were dispatched before or while waiting
 *	EWOULDBLOCK	no data was available and pgwin32_noblock is set, or the
 *				socket kept reporting WSAEWOULDBLOCK after all retries
 *	other		a Winsock error translated by TranslateSocketError()
 */
int
pgwin32_recv(SOCKET s, char *buf, int len, int f)
{
	WSABUF		wbuf;
	DWORD		nread;
	DWORD		flags = f;
	int			attempt = 0;

	if (pgwin32_poll_signals())
		return -1;

	wbuf.len = len;
	wbuf.buf = buf;

	/* First try: data may already be waiting */
	if (WSARecv(s, &wbuf, 1, &nread, &flags, NULL, NULL) != SOCKET_ERROR)
		return nread;			/* success */

	if (WSAGetLastError() != WSAEWOULDBLOCK)
	{
		TranslateSocketError();
		return -1;
	}

	/*
	 * Nothing to read.  In "emulated non-blocking mode" tell the caller that
	 * continuing would block.
	 */
	if (pgwin32_noblock)
	{
		errno = EWOULDBLOCK;
		return -1;
	}

	/* Blocking mode: wait until the socket is readable, then read again */
	while (attempt < 5)
	{
		if (pgwin32_waitforsinglesocket(s, FD_READ | FD_CLOSE | FD_ACCEPT,
										INFINITE) == 0)
			return -1;			/* errno already set */

		if (WSARecv(s, &wbuf, 1, &nread, &flags, NULL, NULL) != SOCKET_ERROR)
			return nread;		/* success */

		if (WSAGetLastError() != WSAEWOULDBLOCK)
		{
			TranslateSocketError();
			return -1;
		}

		/*
		 * There seem to be cases on win2k (at least) where WSARecv can return
		 * WSAEWOULDBLOCK even when pgwin32_waitforsinglesocket claims the
		 * socket is readable.  Sleep briefly and retry; after 5 tries in
		 * total we give up, since it is not likely to ever come back.
		 */
		pg_usleep(10000);
		attempt++;
	}

	ereport(NOTICE,
			(errmsg_internal("could not read from ready socket (after retries)")));
	errno = EWOULDBLOCK;
	return -1;
}
/*
 * send() wrapper built on WSASend().
 *
 * SUS declares the second argument of send() as "const void *", so this
 * function uses the same signature to keep compilers happy at call sites.
 * The buf member of WSABUF is a plain "char *", hence the cast when the
 * pointer is stored there.
 *
 * Returns the number of bytes sent (always > 0), or -1 with errno set:
 *	EINTR		queued signals were dispatched before or while waiting
 *	EWOULDBLOCK	nothing could be sent and pgwin32_noblock is set
 *	other		a Winsock error translated by TranslateSocketError()
 */
int
pgwin32_send(SOCKET s, const void *buf, int len, int flags)
{
	WSABUF		wbuf;
	DWORD		nsent;
	int			rc;

	if (pgwin32_poll_signals())
		return -1;

	wbuf.len = len;
	wbuf.buf = (char *) buf;

	/*
	 * A UDP socket reported as ready for sending can become busy again
	 * before we get to it, so keep trying until data is sent or an error
	 * occurs.
	 */
	for (;;)
	{
		rc = WSASend(s, &wbuf, 1, &nsent, flags, NULL, NULL);

		if (rc == SOCKET_ERROR)
		{
			if (WSAGetLastError() != WSAEWOULDBLOCK)
			{
				TranslateSocketError();
				return -1;
			}
		}
		else if (nsent > 0)
		{
			/* Write succeeded right away */
			return nsent;
		}

		/*
		 * Nothing was sent.  In "emulated non-blocking mode" tell the caller
		 * that continuing would block.
		 */
		if (pgwin32_noblock)
		{
			errno = EWOULDBLOCK;
			return -1;
		}

		/* No error, zero bytes: wait for the socket to become writable */
		if (pgwin32_waitforsinglesocket(s, FD_WRITE | FD_CLOSE, INFINITE) == 0)
			return -1;
	}

	return -1;
}
/*
 * Wait for activity on one or more sockets.
 * While waiting, allow signals to run
 *
 * NOTE! Currently does not implement exceptfds check,
 * since it is not used in postgresql!
 *
 * "nfds" is not used.  "exceptfds" must be NULL (asserted).  A NULL
 * "timeout" means wait indefinitely; otherwise it is converted to whole
 * milliseconds.
 *
 * Returns the number of ready entries and overwrites readfds/writefds with
 * the ready sockets.  Returns 0 with both sets cleared on timeout.  Returns
 * -1 with errno set to EINTR if signals were dispatched, or with a
 * translated Winsock error if an event could not be attached to a socket.
 * If any socket in writefds is immediately write-ready, only the write-ready
 * sockets are reported (readfds is cleared) without waiting.
 */
int
pgwin32_select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, const struct timeval *timeout)
{
	/*
	 * Worst case is readfds totally different from writefds, so
	 * 2*FD_SETSIZE sockets.  events[] needs one further slot, because the
	 * signal event is appended after the per-socket events.
	 */
	WSAEVENT	events[FD_SETSIZE * 2 + 1];
	SOCKET		sockets[FD_SETSIZE * 2];
	int			numevents = 0;
	int			i;
	int			r;
	DWORD		timeoutval = WSA_INFINITE;
	FD_SET		outreadfds;
	FD_SET		outwritefds;
	int			nummatches = 0;

	Assert(exceptfds == NULL);

	if (pgwin32_poll_signals())
		return -1;

	FD_ZERO(&outreadfds);
	FD_ZERO(&outwritefds);

	/*
	 * Windows does not guarantee to log an FD_WRITE network event indicating
	 * that more data can be sent unless the previous send() failed with
	 * WSAEWOULDBLOCK.  While our caller might well have made such a call, we
	 * cannot assume that here.  Therefore, if waiting for write-ready, force
	 * the issue by doing a dummy send().  If the dummy send() succeeds,
	 * assume that the socket is in fact write-ready, and return immediately.
	 * Also, if it fails with something other than WSAEWOULDBLOCK, return a
	 * write-ready indication to let our caller deal with the error condition.
	 */
	if (writefds != NULL)
	{
		for (i = 0; i < writefds->fd_count; i++)
		{
			char		c;
			WSABUF		buf;
			DWORD		sent;

			/* zero-length probe send; "c" only supplies a valid pointer */
			buf.buf = &c;
			buf.len = 0;

			r = WSASend(writefds->fd_array[i], &buf, 1, &sent, 0, NULL, NULL);
			if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
				FD_SET(writefds->fd_array[i], &outwritefds);
		}

		/* If we found any write-ready sockets, just return them immediately */
		if (outwritefds.fd_count > 0)
		{
			memcpy(writefds, &outwritefds, sizeof(fd_set));
			if (readfds)
				FD_ZERO(readfds);
			return outwritefds.fd_count;
		}
	}

	/* Now set up for an actual select */

	if (timeout != NULL)
	{
		/* timeoutval is in milliseconds */
		timeoutval = timeout->tv_sec * 1000 + timeout->tv_usec / 1000;
	}

	/* One event object per distinct socket: first everything in readfds ... */
	if (readfds != NULL)
	{
		for (i = 0; i < readfds->fd_count; i++)
		{
			events[numevents] = WSACreateEvent();
			sockets[numevents] = readfds->fd_array[i];
			numevents++;
		}
	}
	/* ... then the members of writefds that were not already covered */
	if (writefds != NULL)
	{
		for (i = 0; i < writefds->fd_count; i++)
		{
			if (!readfds ||
				!FD_ISSET(writefds->fd_array[i], readfds))
			{
				/* If the socket is not in the read list */
				events[numevents] = WSACreateEvent();
				sockets[numevents] = writefds->fd_array[i];
				numevents++;
			}
		}
	}

	for (i = 0; i < numevents; i++)
	{
		int			flags = 0;

		if (readfds && FD_ISSET(sockets[i], readfds))
			flags |= FD_READ | FD_ACCEPT | FD_CLOSE;

		if (writefds && FD_ISSET(sockets[i], writefds))
			flags |= FD_WRITE | FD_CLOSE;

		if (WSAEventSelect(sockets[i], events[i], flags) != 0)
		{
			TranslateSocketError();
			/* release already-assigned event objects */
			while (--i >= 0)
				WSAEventSelect(sockets[i], NULL, 0);
			for (i = 0; i < numevents; i++)
				WSACloseEvent(events[i]);
			return -1;
		}
	}

	/* The signal event goes last, so its wait index is numevents */
	events[numevents] = pgwin32_signal_event;
	r = WaitForMultipleObjectsEx(numevents + 1, events, FALSE, timeoutval, TRUE);
	if (r != WAIT_TIMEOUT && r != WAIT_IO_COMPLETION && r != (WAIT_OBJECT_0 + numevents))
	{
		/*
		 * We scan all events, even those not signaled, in case more than one
		 * event has been tagged but Wait.. can only return one.
		 */
		WSANETWORKEVENTS resEvents;

		for (i = 0; i < numevents; i++)
		{
			ZeroMemory(&resEvents, sizeof(resEvents));
			if (WSAEnumNetworkEvents(sockets[i], events[i], &resEvents) != 0)
			{
				int			save_wsaerr = WSAGetLastError();
				int			j;

				/*
				 * elog(ERROR) does not return, so detach and close every
				 * event object first; otherwise they would be leaked and
				 * left attached to the sockets.
				 */
				for (j = 0; j < numevents; j++)
				{
					WSAEventSelect(sockets[j], NULL, 0);
					WSACloseEvent(events[j]);
				}
				elog(ERROR, "failed to enumerate network events: error code %d",
					 save_wsaerr);
			}
			/* Read activity? */
			if (readfds && FD_ISSET(sockets[i], readfds))
			{
				if ((resEvents.lNetworkEvents & FD_READ) ||
					(resEvents.lNetworkEvents & FD_ACCEPT) ||
					(resEvents.lNetworkEvents & FD_CLOSE))
				{
					FD_SET(sockets[i], &outreadfds);

					nummatches++;
				}
			}
			/* Write activity? */
			if (writefds && FD_ISSET(sockets[i], writefds))
			{
				if ((resEvents.lNetworkEvents & FD_WRITE) ||
					(resEvents.lNetworkEvents & FD_CLOSE))
				{
					FD_SET(sockets[i], &outwritefds);

					nummatches++;
				}
			}
		}
	}

	/* Clean up all the event objects */
	for (i = 0; i < numevents; i++)
	{
		WSAEventSelect(sockets[i], NULL, 0);
		WSACloseEvent(events[i]);
	}

	if (r == WSA_WAIT_TIMEOUT)
	{
		if (readfds)
			FD_ZERO(readfds);
		if (writefds)
			FD_ZERO(writefds);
		return 0;
	}

	/* Signal-like events. */
	if (r == WAIT_OBJECT_0 + numevents || r == WAIT_IO_COMPLETION)
	{
		pgwin32_dispatch_queued_signals();
		errno = EINTR;
		if (readfds)
			FD_ZERO(readfds);
		if (writefds)
			FD_ZERO(writefds);
		return -1;
	}

	/* Overwrite socket sets with our resulting values */
	if (readfds)
		memcpy(readfds, &outreadfds, sizeof(fd_set));
	if (writefds)
		memcpy(writefds, &outwritefds, sizeof(fd_set));
	return nummatches;
}