synapse/tests/handlers/test_worker_lock.py
gui-yue 07468a0f1c
Increase timeout for test_lock_contention on RISC-V (#18430)
This PR addresses a test failure for
`tests.handlers.test_worker_lock.WorkerLockTestCase.test_lock_contention`
which consistently times out on the RISC-V (specifically `riscv64`)
architecture.

The test simulates high lock contention and has a default timeout of 5
seconds, which seems sufficient for architectures like x86_64 but proves
too short for current RISC-V hardware/environment performance
characteristics, leading to spurious `tests.utils.TestTimeout` failures.

This fix introduces architecture detection using `platform.machine()`.
If a RISC-V architecture is detected:
* The timeout for this specific test is increased (e.g., to 15 seconds
).

The original, stricter timeout (5 seconds) and lock count (500) are
maintained for all other architectures to avoid masking potential
performance regressions elsewhere.

This change has been tested locally on RISC-V, where the test now passes
reliably, and on x86_64, where it continues to pass with the original
constraints.

---

### Pull Request Checklist

<!-- Please read
https://element-hq.github.io/synapse/latest/development/contributing_guide.html
before submitting your pull request -->

* [X] Pull request is based on the develop branch *(Assuming you based
it correctly)*
* [X] Pull request includes a [changelog
file](https://element-hq.github.io/synapse/latest/development/contributing_guide.html#changelog).
*(See below)*
* [X] [Code
style](https://element-hq.github.io/synapse/latest/code_style.html) is
correct (run the
[linters](https://element-hq.github.io/synapse/latest/development/contributing_guide.html#run-the-linters))
*(Please run linters locally)*
2025-05-27 17:17:04 +00:00

125 lines
4.3 KiB
Python

#
# This file is licensed under the Affero General Public License (AGPL) version 3.
#
# Copyright 2023 The Matrix.org Foundation C.I.C.
# Copyright (C) 2023 New Vector, Ltd
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# See the GNU Affero General Public License for more details:
# <https://www.gnu.org/licenses/agpl-3.0.html>.
#
# Originally licensed under the Apache License, Version 2.0:
# <http://www.apache.org/licenses/LICENSE-2.0>.
#
# [This file includes modifications made by New Vector Limited]
#
#
import logging
import platform
from twisted.internet import defer
from twisted.test.proto_helpers import MemoryReactor
from synapse.server import HomeServer
from synapse.util import Clock
from tests import unittest
from tests.replication._base import BaseMultiWorkerStreamTestCase
from tests.utils import test_timeout
logger = logging.getLogger(__name__)
class WorkerLockTestCase(unittest.HomeserverTestCase):
def prepare(
self, reactor: MemoryReactor, clock: Clock, homeserver: HomeServer
) -> None:
self.worker_lock_handler = self.hs.get_worker_locks_handler()
def test_wait_for_lock_locally(self) -> None:
"""Test waiting for a lock on a single worker"""
lock1 = self.worker_lock_handler.acquire_lock("name", "key")
self.get_success(lock1.__aenter__())
lock2 = self.worker_lock_handler.acquire_lock("name", "key")
d2 = defer.ensureDeferred(lock2.__aenter__())
self.assertNoResult(d2)
self.get_success(lock1.__aexit__(None, None, None))
self.get_success(d2)
self.get_success(lock2.__aexit__(None, None, None))
def test_lock_contention(self) -> None:
"""Test lock contention when a lot of locks wait on a single worker"""
nb_locks_to_test = 500
current_machine = platform.machine().lower()
if current_machine.startswith("riscv"):
# RISC-V specific settings
timeout_seconds = 15 # Increased timeout for RISC-V
# add a print or log statement here for visibility in CI logs
logger.info( # use logger.info
f"Detected RISC-V architecture ({current_machine}). "
f"Adjusting test_lock_contention: timeout={timeout_seconds}s"
)
else:
# Settings for other architectures
timeout_seconds = 5
# It takes around 0.5s on a 5+ years old laptop
with test_timeout(timeout_seconds): # Use the dynamically set timeout
d = self._take_locks(
nb_locks_to_test
) # Use the (potentially adjusted) number of locks
self.assertEqual(
self.get_success(d), nb_locks_to_test
) # Assert against the used number of locks
async def _take_locks(self, nb_locks: int) -> int:
locks = [
self.hs.get_worker_locks_handler().acquire_lock("test_lock", "")
for _ in range(nb_locks)
]
nb_locks_taken = 0
for lock in locks:
async with lock:
nb_locks_taken += 1
return nb_locks_taken
class WorkerLockWorkersTestCase(BaseMultiWorkerStreamTestCase):
def prepare(
self, reactor: MemoryReactor, clock: Clock, homeserver: HomeServer
) -> None:
self.main_worker_lock_handler = self.hs.get_worker_locks_handler()
def test_wait_for_lock_worker(self) -> None:
"""Test waiting for a lock on another worker"""
worker = self.make_worker_hs(
"synapse.app.generic_worker",
extra_config={
"redis": {"enabled": True},
},
)
worker_lock_handler = worker.get_worker_locks_handler()
lock1 = self.main_worker_lock_handler.acquire_lock("name", "key")
self.get_success(lock1.__aenter__())
lock2 = worker_lock_handler.acquire_lock("name", "key")
d2 = defer.ensureDeferred(lock2.__aenter__())
self.assertNoResult(d2)
self.get_success(lock1.__aexit__(None, None, None))
self.get_success(d2)
self.get_success(lock2.__aexit__(None, None, None))