PostgreSQL Source Code interpretation (155)-background process # 7 (walsender#3) 04/16 Update SLTechnology News&Howtos

PostgreSQL Source Code interpretation (155)-background process # 7 (walsender#3)

2025-04-16 Update From: SLTechnology News&Howtos shulou NAV: SLTechnology News&Howtos > Database >

Shulou(Shulou.com)06/01 Report--

< 0) { /* EINTR is okay, otherwise complain */ if (errno != EINTR) { waiting = false; ereport(ERROR, (errcode_for_socket_access(), errmsg("poll() failed: %m"))); } return 0; } else if (rc == 0) { /* timeout exceeded */ return -1; } for (cur_event = set->

Events, cur_pollfd = set- > pollfds; cur_event

< (set->

Events + set- > nevents) & & returned_events

< nevents; cur_event++, cur_pollfd++) { /* no activity on this FD, skip */ if (cur_pollfd->

Revents = = 0) continue; occurred_events- > pos = cur_event- > pos; occurred_events- > user_data = cur_event- > user_data; occurred_events- > events = 0; if (cur_event- > events = = WL_LATCH_SET & & (cur_pollfd- > revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)) {/ * There's data in the self-pipe, clear it. * / drainSelfPipe (); if (set- > latch- > is_set) {occurred_events- > fd = PGINVALID_SOCKET; occurred_events- > events = WL_LATCH_SET; occurred_events++; returned_events++ }} else if (cur_event- > events = = WL_POSTMASTER_DEATH & & (cur_pollfd- > revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)) {/ * * We expect an POLLHUP when the remote end is closed, but because * we don't expect the pipe to become readable or to have any * errors either, treat those cases as postmaster death Too. * Be paranoid about a spurious event signalling the postmaster as * being dead. There have been reports about that happening with * older primitives (select (2) to be specific), and a spurious * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't * cost much. * / if (! PostmasterIsAliveInternal ()) {if (set- > exit_on_postmaster_death) proc_exit (1); occurred_events- > fd = PGINVALID_SOCKET; occurred_events- > events = WL_POSTMASTER_DEATH; occurred_events++; returned_events++ }} else if (cur_event- > events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) {int errflags = POLLHUP | POLLERR | POLLNVAL; Assert (cur_event- > fd > = PGINVALID_SOCKET) If ((cur_event- > events & WL_SOCKET_READABLE) & & (cur_pollfd- > revents & (POLLIN | errflags)) {/ * data available in socket, or EOF * / occurred_events- > events | = WL_SOCKET_READABLE } if ((cur_event- > events & WL_SOCKET_WRITEABLE) & & (cur_pollfd- > revents & (POLLOUT | errflags)) {/ * writeable, or EOF * / occurred_events- > events | = WL_SOCKET_WRITEABLE } if (occurred_events- > events! 
= 0) {occurred_events- > fd = cur_event- > fd; occurred_events++; returned_events++;}} return returned_events;} # elif defined (WAIT_USE_WIN32) / * * Wait using Windows' WaitForMultipleObjects (). * * Unfortunately this will only ever return a single readiness notification at * a time. Note that while the official documentation for * WaitForMultipleObjects is ambiguous about multiple events being "consumed" * with a single bWaitAll = FALSE call, * https://blogs.msdn.microsoft.com/oldnewthing/20150409-00/?p=44273 confirms * that only one event is "consumed". * / static inline intWaitEventSetWaitBlock (WaitEventSet * set, int cur_timeout, WaitEvent * occurred_events, int nevents) {int returned_events = 0; DWORD rc; WaitEvent * cur_event; / * Reset any wait events that need it * / for (cur_event = set- > events; cur_event

< (set->

Events + set- > nevents); cur_event++) {if (cur_event- > reset) {WaitEventAdjustWin32 (set, cur_event); cur_event- > reset = false;} / * * Windows does not guarantee to log an FD_WRITE network event * indicating that more data can be sent unless the previous send () * failed with WSAEWOULDBLOCK. While our caller might well have made * such a call, we cannot assume that here. Therefore, if waiting for * write-ready, force the issue by doing a dummy send (). If the dummy * send () succeeds, assume that the socket is in fact write-ready, and * return immediately. Also, if it fails with something other than * WSAEWOULDBLOCK, return a write-ready indication to let our caller * deal with the error condition. * / if (cur_event- > events & WL_SOCKET_WRITEABLE) {char c; WSABUF buf; DWORD sent; int r; buf.buf = & c; buf.len = 0; r = WSASend (cur_event- > fd, & buf, 1, & sent, 0, NULL, NULL) If (r = = 0 | | WSAGetLastError ()! = WSAEWOULDBLOCK) {occurred_events- > pos = cur_event- > pos; occurred_events- > user_data = cur_event- > user_data; occurred_events- > events = WL_SOCKET_WRITEABLE; occurred_events- > fd = cur_event- > fd; return 1 }} / * * Sleep. * * Need to wait for-> nevents + 1, because signal handle is in [0]. * / rc = WaitForMultipleObjects (set- > nevents + 1, set- > handles, FALSE, cur_timeout); / * Check return code * / if (rc = = WAIT_FAILED) elog (ERROR, "WaitForMultipleObjects () failed: error code% lu", GetLastError ()); else if (rc = = WAIT_TIMEOUT) {/ * timeout exceeded * / return-1 } if (rc = = WAIT_OBJECT_0) {/ * Service newly-arrived signals * / pgwin32_dispatch_queued_signals (); return 0; / * retry * /} / * * With an offset of one, due to the always present pgwin32_signal_event, * the handle offset directly corresponds to a wait event. 
* / cur_event = (WaitEvent *) & set- > events [rc-WAIT_OBJECT_0-1]; occurred_events- > pos = cur_event- > pos; occurred_events- > user_data = cur_event- > user_data; occurred_events- > events = 0; if (cur_event- > events = = WL_LATCH_SET) {if (! ResetEvent (set- > latch- > event) elog (ERROR, "ResetEvent failed: error code% lu", GetLastError ()) If (set- > latch- > is_set) {occurred_events- > fd = PGINVALID_SOCKET; occurred_events- > events = WL_LATCH_SET; occurred_events++; returned_events++;}} else if (cur_event- > events = = WL_POSTMASTER_DEATH) {/ * * Postmaster apparently died. Since the consequences of falsely * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we take * the trouble to positively verify this with PostmasterIsAlive (), * even though there is no known reason to think that the event could * be falsely set on Windows. * / if (! PostmasterIsAliveInternal ()) {if (set- > exit_on_postmaster_death) proc_exit (1); occurred_events- > fd = PGINVALID_SOCKET; occurred_events- > events = WL_POSTMASTER_DEATH; occurred_events++; returned_events++ }} else if (cur_event- > events & WL_SOCKET_MASK) {WSANETWORKEVENTS resEvents; HANDLE handle = set- > handles [cur _ event- > pos + 1]; Assert (cur_event- > fd); occurred_events- > fd = cur_event- > fd; ZeroMemory (& resEvents, sizeof (resEvents)) If (WSAEnumNetworkEvents (cur_event- > fd, handle, & resEvents)! = 0) elog (ERROR, "failed to enumerate network events: error code% u", WSAGetLastError ()) If ((cur_event- > events & WL_SOCKET_READABLE) & & (resEvents.lNetworkEvents & FD_READ)) {/ * data available in socket * / occurred_events- > events | = WL_SOCKET_READABLE; / *-* WaitForMultipleObjects doesn't guarantee that a read event will * be returned if the latch is set at the same time. Even if it * did, the caller might drop that event expecting it to reoccur * on next call. So, we must force the event to be reset if this * WaitEventSet is used again in order to avoid an indefinite * hang. 
Refer https://msdn.microsoft.com/en-us/library/windows/desktop/ms741576(v=vs.85).aspx * for the behavior of socket events. *-* / cur_event- > reset = true;} if ((cur_event- > events & WL_SOCKET_WRITEABLE) & & (resEvents.lNetworkEvents & FD_WRITE)) {/ * writeable * / occurred_events- > events | = WL_SOCKET_WRITEABLE } if ((cur_event- > events & WL_SOCKET_CONNECTED) & & (resEvents.lNetworkEvents & FD_CONNECT)) {/ * connected * / occurred_events- > events | = WL_SOCKET_CONNECTED } if (resEvents.lNetworkEvents & FD_CLOSE) {/ * EOF/error, so signal all caller-requested socket flags * / occurred_events- > events | = (cur_event- > events & WL_SOCKET_MASK);} if (occurred_events- > events! = 0) {occurred_events++; returned_events++;}} return returned_events } # endif III. Tracking and analysis

Use gdb to attach to the postmaster on the primary node, set a breakpoint (on WalSndLoop, as shown in the session below), then start the standby node so that the breakpoint is hit.

[xdb@localhost ~]$ ps -ef | grep postgres
xdb 1376 1 1 14:16 pts/0 00:00:00 /appdb/xdb/pg11.2/bin/postgres
[xdb@localhost ~]$ gdb -p 1376
GNU gdb (GDB) Red Hat Enterprise Linux 7.6.1-100.el7
...
(gdb) set follow-fork-mode child
(gdb) b WalSndLoop
Breakpoint 1 at 0x853e63: file walsender.c, line 2111.
(gdb) c
Continuing.
[New process 1450]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
[Switching to Thread 0x7f17cfa9a8c0 (LWP 1450)]
Breakpoint 1, WalSndLoop (send_data=0x8547fe) at walsender.c:2111
2111        last_reply_timestamp = GetCurrentTimestamp();
(gdb)

Get the timestamp and set the related flag

(gdb) n
2112        waiting_for_ping_response = false;
(gdb) p last_reply_timestamp
$1 = 606818445090174
(gdb)

Reset MyLatch

(gdb) n
2124        if (!PostmasterIsAlive())
(gdb)
2128        ResetLatch(MyLatch);
(gdb) p MyLatch
$2 = (struct Latch *) 0x7f17c46994d4
(gdb) p *MyLatch
$3 = {is_set = 1, is_shared = true, owner_pid = 1465}
(gdb) n
2130        CHECK_FOR_INTERRUPTS();
(gdb) p *MyLatch
$4 = {is_set = 0, is_shared = true, owner_pid = 1465}
(gdb)

Process the recently received signal

(gdb) n
2133        if (ConfigReloadPending)
(gdb)
2141        ProcessRepliesIfAny();
(gdb)
[Inferior 2 (process 1465) exited normally]
(gdb)

The traced process exits, and a new walsender process (PID 1466) is started:

xdb 1466 1376 0 16:41 ? 00:00:00 postgres: walsender replicator 192.168.26.26(40516) streaming 0/5D032830

Attach gdb to process 1466

(gdb) attach 1466
Attaching to program: /appdb/xdb/pg11.2/bin/postgres, process 1466
Reading symbols from /lib64/libpthread.so.0...(no debugging symbols found).
[Thread debugging using libthread_db enabled]

Execute SQL

testdb=# drop table t1;
DROP TABLE

After receiving the signal SIGUSR1, check the call stack as follows

Program received signal SIGUSR1, User defined signal 1.
0x00007f17cde2d903 in __epoll_wait_nocancel () from /lib64/libc.so.6
(gdb) bt
#0  0x00007f17cde2d903 in __epoll_wait_nocancel () from /lib64/libc.so.6
#1  0x000000000088e668 in WaitEventSetWaitBlock (set=0x296e7c8, cur_timeout=29999, occurred_events=0x7fffed781d00, nevents=1) at latch.c:1048

DONE!

IV. Reference materials

PG Source Code

Subscribe to "Shulou Technology Information" for the latest news, interesting stories, and hot topics in the IT industry, and keep up with the latest Internet news, technology news, and IT industry trends.

*The comments in the above article only represent the author's personal views and do not represent the views and positions of this website. If you have more insights, please feel free to contribute and share.