Fix lost logcontext when using timeout_deferred(...) (#19090)

Fix lost logcontext when using `timeout_deferred(...)` and things
actually timeout.

Fix https://github.com/element-hq/synapse/issues/19087 (our HTTP client
times out requests using `timeout_deferred(...)`
Fix https://github.com/element-hq/synapse/issues/19066 (`/sync` uses
`notifier.wait_for_events()` which uses `timeout_deferred(...)` under
the hood)


### When/why did these lost logcontext warnings start happening?

```
synapse.logging.context - 107 - WARNING - sentinel - Expected logging context call_later but found POST-2453

synapse.logging.context - 107 - WARNING - sentinel - Expected logging context call_later was lost
```

In https://github.com/element-hq/synapse/pull/18828, we switched
`timeout_deferred(...)` from using `reactor.callLater(...)` to
[`clock.call_later(...)`](3b59ac3b69/synapse/util/clock.py (L224-L313))
under the hood. This meant it started dealing with logcontexts but our
`time_it_out()` callback didn't follow our [Synapse logcontext
rules](3b59ac3b69/docs/log_contexts.md).
This commit is contained in:
Eric Eastwood 2025-10-30 11:49:15 -05:00 committed by GitHub
parent f0aae62f85
commit c0b9437ab6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 89 additions and 10 deletions

1
changelog.d/19090.bugfix Normal file
View File

@ -0,0 +1 @@
Fix lost logcontext warnings from timeouts in sync and requests made by Synapse itself.

View File

@ -896,7 +896,7 @@ def run_in_background(
# If the function messes with logcontexts, we can assume it follows the Synapse
# logcontext rules (Rules for functions returning awaitables: "If the awaitable
# is already complete, the function returns with the same logcontext it started
# with."). If it function doesn't touch logcontexts at all, we can also assume
# with."). If the function doesn't touch logcontexts at all, we can also assume
# the logcontext is unchanged.
#
# Either way, the function should have maintained the calling logcontext, so we
@ -905,11 +905,21 @@ def run_in_background(
# to reset the logcontext to the sentinel logcontext as that would run
# immediately (remember our goal is to maintain the calling logcontext when we
# return).
logcontext_debug_logger.debug(
"run_in_background(%s): deferred already completed and the function should have maintained the logcontext %s",
instance_id,
calling_context,
)
if current_context() != calling_context:
logcontext_error(
"run_in_background(%s): deferred already completed but the function did not maintain the calling logcontext %s (found %s)"
% (
instance_id,
calling_context,
current_context(),
)
)
else:
logcontext_debug_logger.debug(
"run_in_background(%s): deferred already completed (maintained the calling logcontext %s)",
instance_id,
calling_context,
)
return d
# Since the function we called may follow the Synapse logcontext rules (Rules for

View File

@ -808,7 +808,8 @@ def timeout_deferred(
timed_out[0] = True
try:
deferred.cancel()
with PreserveLoggingContext():
deferred.cancel()
except Exception: # if we throw any exception it'll break time outs
logger.exception("Canceller failed during timeout")

View File

@ -266,7 +266,7 @@ class Clock:
# We use `PreserveLoggingContext` to prevent our new `call_later`
# logcontext from finishing as soon as we exit this function, in case `f`
# returns an awaitable/deferred which would continue running and may try to
# restore the `loop_call` context when it's done (because it's trying to
# restore the `call_later` context when it's done (because it's trying to
# adhere to the Synapse logcontext rules.)
#
# This also ensures that we return to the `sentinel` context when we exit

View File

@ -341,6 +341,9 @@ def logcontext_clean(target: TV) -> TV:
"""
def logcontext_error(msg: str) -> NoReturn:
# Log so we can still see it in the logs like normal
logger.warning(msg)
# But also fail the test
raise AssertionError("logcontext error: %s" % (msg))
patcher = patch("synapse.logging.context.logcontext_error", new=logcontext_error)

View File

@ -45,7 +45,7 @@ from synapse.util.async_helpers import (
)
from tests.server import get_clock
from tests.unittest import TestCase
from tests.unittest import TestCase, logcontext_clean
logger = logging.getLogger(__name__)
@ -198,7 +198,12 @@ class TimeoutDeferredTest(TestCase):
self.failureResultOf(timing_out_d, defer.TimeoutError)
async def test_logcontext_is_preserved_on_cancellation(self) -> None:
@logcontext_clean
async def test_logcontext_is_preserved_on_timeout_cancellation(self) -> None:
"""
Test that the logcontext is preserved when we timeout and the deferred is
cancelled.
"""
# Sanity check that we start in the sentinel context
self.assertEqual(current_context(), SENTINEL_CONTEXT)
@ -270,6 +275,65 @@ class TimeoutDeferredTest(TestCase):
# Back to the sentinel context
self.assertEqual(current_context(), SENTINEL_CONTEXT)
@logcontext_clean
async def test_logcontext_is_not_lost_when_awaiting_on_timeout_cancellation(
self,
) -> None:
"""
Test that the logcontext isn't lost when we `await make_deferred_yieldable(...)`
the deferred to complete/timeout and it times out.
"""
# Sanity check that we start in the sentinel context
self.assertEqual(current_context(), SENTINEL_CONTEXT)
# Create a deferred which we will never complete
incomplete_d: Deferred = Deferred()
async def competing_task() -> None:
with LoggingContext(
name="competing", server_name="test_server"
) as context_competing:
timing_out_d = timeout_deferred(
deferred=incomplete_d,
timeout=1.0,
clock=self.clock,
)
self.assertNoResult(timing_out_d)
# We should still be in the logcontext we started in
self.assertIs(current_context(), context_competing)
# Mimic the normal use case to wait for the work to complete or timeout.
#
# In this specific test, we expect the deferred to timeout and raise an
# exception at this point.
await make_deferred_yieldable(timing_out_d)
self.fail(
"We should not make it to this point as the `timing_out_d` should have been cancelled"
)
d = defer.ensureDeferred(competing_task())
# Still in the sentinel context
self.assertEqual(current_context(), SENTINEL_CONTEXT)
# Pump until we trigger the timeout
self.reactor.pump(
# We only need to pump `1.0` (seconds) as we set
# `timeout_deferred(timeout=1.0)` above
(1.0,)
)
# Still in the sentinel context
self.assertEqual(current_context(), SENTINEL_CONTEXT)
# We expect a failure due to the timeout
self.failureResultOf(d, defer.TimeoutError)
# Back to the sentinel context at the end of the day
self.assertEqual(current_context(), SENTINEL_CONTEXT)
class _TestException(Exception): #
pass