synapse/tests/util/test_logcontext.py
Eric Eastwood 5143f93dc9
Fix server_name in logging context for multiple Synapse instances in one process (#18868)
### Background

As part of Element's plan to support a light form of vhosting (virtual
host) (multiple instances of Synapse in the same Python process), we're
currently diving into the details and implications of running multiple
instances of Synapse in the same Python process.

"Per-tenant logging" tracked internally by
https://github.com/element-hq/synapse-small-hosts/issues/48

### Prior art

Previously, we exposed `server_name` by providing a static logging
`MetadataFilter` that injected the values:


205d9e4fc4/synapse/config/logger.py (L216)

While this can work fine for the normal case of one Synapse instance per
Python process, this configures things globally and isn't compatible
when we try to start multiple Synapse instances because each subsequent
tenant will overwrite the previous tenant.


### What does this PR do?

We remove the `MetadataFilter` and replace it by tracking the
`server_name` in the `LoggingContext` and expose it with our existing
[`LoggingContextFilter`](205d9e4fc4/synapse/logging/context.py (L584-L622))
that we already use to expose information about the `request`.

This means that the `server_name` value follows wherever we log as
expected even when we have multiple Synapse instances running in the
same process.


### A note on logcontext

Anywhere that Synapse mistakenly uses the `sentinel` logcontext to log
something, we won't know which server sent the log. We've been fixing up
`sentinel` logcontext usage as tracked by
https://github.com/element-hq/synapse/issues/18905

Any further `sentinel` logcontext usage we find in the future can be
fixed piecemeal as normal.


d2a966f922/docs/log_contexts.md (L71-L81)


### Testing strategy

1. Adjust your logging config to include `%(server_name)s` in the format
    ```yaml
    formatters:
        precise:
            format: '%(asctime)s - %(server_name)s - %(name)s - %(lineno)d - %(levelname)s - %(request)s - %(message)s'
    ```
1. Start Synapse: `poetry run synapse_homeserver --config-path homeserver.yaml`
1. Make some requests (`curl http://localhost:8008/_matrix/client/versions`, etc.)
1. Open the homeserver logs and notice the `server_name` in the logs as
expected. `unknown_server_from_sentinel_context` is expected for the
`sentinel` logcontext (things outside of Synapse).
2025-09-26 17:10:48 -05:00

700 lines
27 KiB
Python

#
# This file is licensed under the Affero General Public License (AGPL) version 3.
#
# Copyright 2014-2022 The Matrix.org Foundation C.I.C.
# Copyright (C) 2023 New Vector, Ltd
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# See the GNU Affero General Public License for more details:
# <https://www.gnu.org/licenses/agpl-3.0.html>.
#
# Originally licensed under the Apache License, Version 2.0:
# <http://www.apache.org/licenses/LICENSE-2.0>.
#
# [This file includes modifications made by New Vector Limited]
#
#
import logging
from typing import Callable, Generator, cast
from twisted.internet import defer, reactor as _reactor
from synapse.logging.context import (
SENTINEL_CONTEXT,
LoggingContext,
PreserveLoggingContext,
_Sentinel,
current_context,
make_deferred_yieldable,
nested_logging_context,
run_coroutine_in_background,
run_in_background,
)
from synapse.types import ISynapseReactor
from synapse.util.clock import Clock
from tests import unittest
from tests.unittest import logcontext_clean
logger = logging.getLogger(__name__)
reactor = cast(ISynapseReactor, _reactor)
class LoggingContextTestCase(unittest.TestCase):
def _check_test_key(self, value: str) -> None:
context = current_context()
assert isinstance(context, LoggingContext) or isinstance(context, _Sentinel), (
f"Expected LoggingContext({value}) but saw {context}"
)
self.assertEqual(
str(context), value, f"Expected LoggingContext({value}) but saw {context}"
)
@logcontext_clean
def test_with_context(self) -> None:
with LoggingContext(name="test", server_name="test_server"):
self._check_test_key("test")
@logcontext_clean
async def test_sleep(self) -> None:
"""
Test `Clock.sleep`
"""
clock = Clock(reactor, server_name="test_server")
# Sanity check that we start in the sentinel context
self._check_test_key("sentinel")
callback_finished = False
async def competing_callback() -> None:
nonlocal callback_finished
try:
# A callback from the reactor should start with the sentinel context. In
# other words, another task shouldn't have leaked their context to us.
self._check_test_key("sentinel")
with LoggingContext(name="competing", server_name="test_server"):
await clock.sleep(0)
self._check_test_key("competing")
self._check_test_key("sentinel")
finally:
# When exceptions happen, we still want to mark the callback as finished
# so that the test can complete and we see the underlying error.
callback_finished = True
reactor.callLater(0, lambda: defer.ensureDeferred(competing_callback()))
with LoggingContext(name="foo", server_name="test_server"):
await clock.sleep(0)
self._check_test_key("foo")
await clock.sleep(0)
self._check_test_key("foo")
self.assertTrue(
callback_finished,
"Callback never finished which means the test probably didn't wait long enough",
)
# Back to the sentinel context
self._check_test_key("sentinel")
@logcontext_clean
async def test_looping_call(self) -> None:
"""
Test `Clock.looping_call`
"""
clock = Clock(reactor, server_name="test_server")
# Sanity check that we start in the sentinel context
self._check_test_key("sentinel")
callback_finished = False
async def competing_callback() -> None:
nonlocal callback_finished
try:
# A `looping_call` callback should have *some* logcontext since we should know
# which server spawned this loop and which server the logs came from.
self._check_test_key("looping_call")
with LoggingContext(name="competing", server_name="test_server"):
await clock.sleep(0)
self._check_test_key("competing")
self._check_test_key("looping_call")
finally:
# When exceptions happen, we still want to mark the callback as finished
# so that the test can complete and we see the underlying error.
callback_finished = True
with LoggingContext(name="foo", server_name="test_server"):
lc = clock.looping_call(
lambda: defer.ensureDeferred(competing_callback()), 0
)
self._check_test_key("foo")
await clock.sleep(0)
self._check_test_key("foo")
await clock.sleep(0)
self._check_test_key("foo")
self.assertTrue(
callback_finished,
"Callback never finished which means the test probably didn't wait long enough",
)
# Back to the sentinel context
self._check_test_key("sentinel")
# Stop the looping call to prevent "Reactor was unclean" errors
lc.stop()
@logcontext_clean
async def test_looping_call_now(self) -> None:
"""
Test `Clock.looping_call_now`
"""
clock = Clock(reactor, server_name="test_server")
# Sanity check that we start in the sentinel context
self._check_test_key("sentinel")
callback_finished = False
async def competing_callback() -> None:
nonlocal callback_finished
try:
# A `looping_call` callback should have *some* logcontext since we should know
# which server spawned this loop and which server the logs came from.
self._check_test_key("looping_call")
with LoggingContext(name="competing", server_name="test_server"):
await clock.sleep(0)
self._check_test_key("competing")
self._check_test_key("looping_call")
finally:
# When exceptions happen, we still want to mark the callback as finished
# so that the test can complete and we see the underlying error.
callback_finished = True
with LoggingContext(name="foo", server_name="test_server"):
lc = clock.looping_call_now(
lambda: defer.ensureDeferred(competing_callback()), 0
)
self._check_test_key("foo")
await clock.sleep(0)
self._check_test_key("foo")
self.assertTrue(
callback_finished,
"Callback never finished which means the test probably didn't wait long enough",
)
# Back to the sentinel context
self._check_test_key("sentinel")
# Stop the looping call to prevent "Reactor was unclean" errors
lc.stop()
@logcontext_clean
async def test_call_later(self) -> None:
"""
Test `Clock.call_later`
"""
clock = Clock(reactor, server_name="test_server")
# Sanity check that we start in the sentinel context
self._check_test_key("sentinel")
callback_finished = False
async def competing_callback() -> None:
nonlocal callback_finished
try:
# A `call_later` callback should have *some* logcontext since we should know
# which server spawned this loop and which server the logs came from.
self._check_test_key("call_later")
with LoggingContext(name="competing", server_name="test_server"):
await clock.sleep(0)
self._check_test_key("competing")
self._check_test_key("call_later")
finally:
# When exceptions happen, we still want to mark the callback as finished
# so that the test can complete and we see the underlying error.
callback_finished = True
with LoggingContext(name="foo", server_name="test_server"):
clock.call_later(0, lambda: defer.ensureDeferred(competing_callback()))
self._check_test_key("foo")
await clock.sleep(0)
self._check_test_key("foo")
await clock.sleep(0)
self._check_test_key("foo")
self.assertTrue(
callback_finished,
"Callback never finished which means the test probably didn't wait long enough",
)
# Back to the sentinel context
self._check_test_key("sentinel")
@logcontext_clean
async def test_deferred_callback_await_in_current_logcontext(self) -> None:
"""
Test that calling the deferred callback in the current logcontext ("foo") and
waiting for it to finish in a logcontext blocks works as expected.
Works because "always await your awaitables".
Demonstrates one pattern that we can use fix the naive case where we just call
`d.callback(None)` without anything else. See the *Deferred callbacks* section
of docs/log_contexts.md for more details.
"""
clock = Clock(reactor, server_name="test_server")
# Sanity check that we start in the sentinel context
self._check_test_key("sentinel")
callback_finished = False
async def competing_callback() -> None:
nonlocal callback_finished
try:
# The deferred callback should have the same logcontext as the caller
self._check_test_key("foo")
with LoggingContext(name="competing", server_name="test_server"):
await clock.sleep(0)
self._check_test_key("competing")
self._check_test_key("foo")
finally:
# When exceptions happen, we still want to mark the callback as finished
# so that the test can complete and we see the underlying error.
callback_finished = True
with LoggingContext(name="foo", server_name="test_server"):
d: defer.Deferred[None] = defer.Deferred()
d.addCallback(lambda _: defer.ensureDeferred(competing_callback()))
self._check_test_key("foo")
d.callback(None)
# The fix for the naive case is here (i.e. things don't work correctly if we
# don't await here).
#
# Wait for `d` to finish before continuing so the "main" logcontext is
# still active. This works because `d` already follows our logcontext
# rules. If not, we would also have to use `make_deferred_yieldable(d)`.
await d
self._check_test_key("foo")
await clock.sleep(0)
self.assertTrue(
callback_finished,
"Callback never finished which means the test probably didn't wait long enough",
)
# Back to the sentinel context
self._check_test_key("sentinel")
@logcontext_clean
async def test_deferred_callback_preserve_logging_context(self) -> None:
"""
Test that calling the deferred callback inside `PreserveLoggingContext()` (in
the sentinel context) works as expected.
Demonstrates one pattern that we can use fix the naive case where we just call
`d.callback(None)` without anything else. See the *Deferred callbacks* section
of docs/log_contexts.md for more details.
"""
clock = Clock(reactor, server_name="test_server")
# Sanity check that we start in the sentinel context
self._check_test_key("sentinel")
callback_finished = False
async def competing_callback() -> None:
nonlocal callback_finished
try:
# The deferred callback should have the same logcontext as the caller
self._check_test_key("sentinel")
with LoggingContext(name="competing", server_name="test_server"):
await clock.sleep(0)
self._check_test_key("competing")
self._check_test_key("sentinel")
finally:
# When exceptions happen, we still want to mark the callback as finished
# so that the test can complete and we see the underlying error.
callback_finished = True
with LoggingContext(name="foo", server_name="test_server"):
d: defer.Deferred[None] = defer.Deferred()
d.addCallback(lambda _: defer.ensureDeferred(competing_callback()))
self._check_test_key("foo")
# The fix for the naive case is here (i.e. things don't work correctly if we
# don't `PreserveLoggingContext()` here).
#
# `PreserveLoggingContext` will reset the logcontext to the sentinel before
# calling the callback, and restore the "foo" logcontext afterwards before
# continuing the foo block. This solves the problem because when the
# "competing" logcontext exits, it will restore the sentinel logcontext
# which is never finished by its nature, so there is no warning and no
# leakage into the reactor.
with PreserveLoggingContext():
d.callback(None)
self._check_test_key("foo")
await clock.sleep(0)
self.assertTrue(
callback_finished,
"Callback never finished which means the test probably didn't wait long enough",
)
# Back to the sentinel context
self._check_test_key("sentinel")
@logcontext_clean
async def test_deferred_callback_fire_and_forget_with_current_context(self) -> None:
"""
Test that it's possible to call the deferred callback with the current context
while fire-and-forgetting the callback (no adverse effects like leaking the
logcontext into the reactor or restarting an already finished logcontext).
Demonstrates one pattern that we can use fix the naive case where we just call
`d.callback(None)` without anything else. See the *Deferred callbacks* section
of docs/log_contexts.md for more details.
"""
clock = Clock(reactor, server_name="test_server")
# Sanity check that we start in the sentinel context
self._check_test_key("sentinel")
callback_finished = False
async def competing_callback() -> None:
nonlocal callback_finished
try:
# The deferred callback should have the same logcontext as the caller
self._check_test_key("foo")
with LoggingContext(name="competing", server_name="test_server"):
await clock.sleep(0)
self._check_test_key("competing")
self._check_test_key("foo")
finally:
# When exceptions happen, we still want to mark the callback as finished
# so that the test can complete and we see the underlying error.
callback_finished = True
# Part of fix for the naive case is here (i.e. things don't work correctly if we
# don't `PreserveLoggingContext(...)` here).
#
# We can extend the lifetime of the "foo" logcontext is to avoid calling the
# context manager lifetime methods of `LoggingContext` (`__enter__`/`__exit__`).
# And we can still set the current logcontext by using `PreserveLoggingContext`
# and passing in the "foo" logcontext.
with PreserveLoggingContext(
LoggingContext(name="foo", server_name="test_server")
):
d: defer.Deferred[None] = defer.Deferred()
d.addCallback(lambda _: defer.ensureDeferred(competing_callback()))
self._check_test_key("foo")
# Other part of fix for the naive case is here (i.e. things don't work
# correctly if we don't `run_in_background(...)` here).
#
# `run_in_background(...)` will run the whole lambda in the current
# logcontext and it handles the magic behind the scenes of a) restoring the
# calling logcontext before returning to the caller and b) resetting the
# logcontext to the sentinel after the deferred completes and we yield
# control back to the reactor to avoid leaking the logcontext into the
# reactor.
#
# We're using a lambda here as a little trick so we can still get everything
# to run in the "foo" logcontext, but return the deferred `d` itself so that
# `run_in_background` will wait on that to complete before resetting the
# logcontext to the sentinel.
#
# type-ignore[call-overload]: This appears like a mypy type inference bug. A
# function that returns a deferred is exactly what `run_in_background`
# expects.
#
# type-ignore[func-returns-value]: This appears like a mypy type inference
# bug. We're always returning the deferred `d`.
run_in_background(lambda: (d.callback(None), d)[1]) # type: ignore[call-overload, func-returns-value]
self._check_test_key("foo")
await clock.sleep(0)
self.assertTrue(
callback_finished,
"Callback never finished which means the test probably didn't wait long enough",
)
# Back to the sentinel context
self._check_test_key("sentinel")
async def _test_run_in_background(self, function: Callable[[], object]) -> None:
clock = Clock(reactor, server_name="test_server")
# Sanity check that we start in the sentinel context
self._check_test_key("sentinel")
callback_finished = False
with LoggingContext(name="foo", server_name="test_server"):
# Fire off the function, but don't wait on it.
deferred = run_in_background(function)
self._check_test_key("foo")
def callback(result: object) -> object:
nonlocal callback_finished
callback_finished = True
# Pass through the result
return result
# We `addBoth` because when exceptions happen, we still want to mark the
# callback as finished so that the test can complete and we see the
# underlying error.
deferred.addBoth(callback)
self._check_test_key("foo")
# Now wait for the function under test to have run, and check that
# the logcontext is left in a sane state.
while not callback_finished:
await clock.sleep(0)
self._check_test_key("foo")
self.assertTrue(
callback_finished,
"Callback never finished which means the test probably didn't wait long enough",
)
# Back to the sentinel context
self._check_test_key("sentinel")
@logcontext_clean
async def test_run_in_background_with_blocking_fn(self) -> None:
async def blocking_function() -> None:
await Clock(reactor, server_name="test_server").sleep(0)
await self._test_run_in_background(blocking_function)
@logcontext_clean
async def test_run_in_background_with_non_blocking_fn(self) -> None:
@defer.inlineCallbacks
def nonblocking_function() -> Generator["defer.Deferred[object]", object, None]:
with PreserveLoggingContext():
yield defer.succeed(None)
await self._test_run_in_background(nonblocking_function)
@logcontext_clean
async def test_run_in_background_with_chained_deferred(self) -> None:
# a function which returns a deferred which looks like it has been
# called, but is actually paused
def testfunc() -> defer.Deferred:
return make_deferred_yieldable(_chained_deferred_function())
await self._test_run_in_background(testfunc)
@logcontext_clean
async def test_run_in_background_with_coroutine(self) -> None:
"""
Test `run_in_background` with a coroutine that yields control back to the
reactor.
This will stress the logic around incomplete deferreds in `run_in_background`.
"""
async def testfunc() -> None:
self._check_test_key("foo")
d = defer.ensureDeferred(Clock(reactor, server_name="test_server").sleep(0))
self.assertIs(current_context(), SENTINEL_CONTEXT)
await d
self._check_test_key("foo")
await self._test_run_in_background(testfunc)
@logcontext_clean
async def test_run_in_background_with_nonblocking_coroutine(self) -> None:
"""
Test `run_in_background` with a "nonblocking" coroutine (never yields control
back to the reactor).
This will stress the logic around completed deferreds in `run_in_background`.
"""
async def testfunc() -> None:
self._check_test_key("foo")
await self._test_run_in_background(testfunc)
@logcontext_clean
async def test_run_coroutine_in_background(self) -> None:
"""
Test `run_coroutine_in_background` with a coroutine that yields control back to the
reactor.
This will stress the logic around incomplete deferreds in `run_coroutine_in_background`.
"""
clock = Clock(reactor, server_name="test_server")
# Sanity check that we start in the sentinel context
self._check_test_key("sentinel")
callback_finished = False
async def competing_callback() -> None:
nonlocal callback_finished
try:
# The callback should have the same logcontext as the caller
self._check_test_key("foo")
with LoggingContext(name="competing", server_name="test_server"):
await clock.sleep(0)
self._check_test_key("competing")
self._check_test_key("foo")
finally:
# When exceptions happen, we still want to mark the callback as finished
# so that the test can complete and we see the underlying error.
callback_finished = True
with LoggingContext(name="foo", server_name="test_server"):
run_coroutine_in_background(competing_callback())
self._check_test_key("foo")
await clock.sleep(0)
self._check_test_key("foo")
self.assertTrue(
callback_finished,
"Callback never finished which means the test probably didn't wait long enough",
)
# Back to the sentinel context
self._check_test_key("sentinel")
@logcontext_clean
async def test_run_coroutine_in_background_with_nonblocking_coroutine(self) -> None:
"""
Test `run_coroutine_in_background` with a "nonblocking" coroutine (never yields control
back to the reactor).
This will stress the logic around completed deferreds in `run_coroutine_in_background`.
"""
# Sanity check that we start in the sentinel context
self._check_test_key("sentinel")
callback_finished = False
async def competing_callback() -> None:
nonlocal callback_finished
try:
# The callback should have the same logcontext as the caller
self._check_test_key("foo")
with LoggingContext(name="competing", server_name="test_server"):
# We `await` here but there is nothing to wait for here since the
# deferred is already complete so we should immediately continue
# executing in the same context.
await defer.succeed(None)
self._check_test_key("competing")
self._check_test_key("foo")
finally:
# When exceptions happen, we still want to mark the callback as finished
# so that the test can complete and we see the underlying error.
callback_finished = True
with LoggingContext(name="foo", server_name="test_server"):
run_coroutine_in_background(competing_callback())
self._check_test_key("foo")
self.assertTrue(
callback_finished,
"Callback never finished which means the test probably didn't wait long enough",
)
# Back to the sentinel context
self._check_test_key("sentinel")
@logcontext_clean
@defer.inlineCallbacks
def test_make_deferred_yieldable(
self,
) -> Generator["defer.Deferred[object]", object, None]:
# a function which returns an incomplete deferred, but doesn't follow
# the synapse rules.
def blocking_function() -> defer.Deferred:
d: defer.Deferred = defer.Deferred()
reactor.callLater(0, d.callback, None)
return d
sentinel_context = current_context()
with LoggingContext(name="foo", server_name="test_server"):
d1 = make_deferred_yieldable(blocking_function())
# make sure that the context was reset by make_deferred_yieldable
self.assertIs(current_context(), sentinel_context)
yield d1
# now it should be restored
self._check_test_key("foo")
@logcontext_clean
@defer.inlineCallbacks
def test_make_deferred_yieldable_with_chained_deferreds(
self,
) -> Generator["defer.Deferred[object]", object, None]:
sentinel_context = current_context()
with LoggingContext(name="foo", server_name="test_server"):
d1 = make_deferred_yieldable(_chained_deferred_function())
# make sure that the context was reset by make_deferred_yieldable
self.assertIs(current_context(), sentinel_context)
yield d1
# now it should be restored
self._check_test_key("foo")
@logcontext_clean
def test_nested_logging_context(self) -> None:
with LoggingContext(name="foo", server_name="test_server"):
nested_context = nested_logging_context(suffix="bar")
self.assertEqual(nested_context.name, "foo-bar")
# a function which returns a deferred which has been "called", but
# which had a function which returned another incomplete deferred on
# its callback list, so won't yet call any other new callbacks.
def _chained_deferred_function() -> defer.Deferred:
d = defer.succeed(None)
def cb(res: object) -> defer.Deferred:
d2: defer.Deferred = defer.Deferred()
reactor.callLater(0, d2.callback, res)
return d2
d.addCallback(cb)
return d