Update metrics linting to be able to handle custom metrics (#18733)

Part of https://github.com/element-hq/synapse/issues/18592
Eric Eastwood 2025-08-01 15:34:11 -05:00 committed by GitHub
parent e43a1cec84
commit e16fbdcdcc
GPG Key ID: B5690EEEBB952194
10 changed files with 293 additions and 35 deletions

changelog.d/18733.misc (new file)

@@ -0,0 +1 @@
Update metrics linting to be able to handle custom metrics.


@@ -1,6 +1,17 @@
[mypy]
namespace_packages = True
plugins = pydantic.mypy, mypy_zope:plugin, scripts-dev/mypy_synapse_plugin.py
# Our custom mypy plugin should remain first in this list.
#
# mypy has a limitation where it only uses the first plugin that returns a non-None
# value for each hook (a known limitation, cf.
# https://github.com/python/mypy/issues/19524). We work around this by putting our
# custom plugin first in the plugin order and manually calling any conflicting hooks
# from the other plugins inside our own plugin, followed by our own checks.
#
# If you add a new plugin, check whether the hooks it uses conflict with our custom
# plugin's hooks and, if so, manually call the other plugin's hooks from our custom
# plugin. (This also applies if an existing plugin is updated in the future.)
plugins = scripts-dev/mypy_synapse_plugin.py, pydantic.mypy, mypy_zope:plugin
follow_imports = normal
show_error_codes = True
show_traceback = True
@@ -99,3 +110,6 @@ ignore_missing_imports = True
[mypy-multipart.*]
ignore_missing_imports = True
[mypy-mypy_zope.*]
ignore_missing_imports = True
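
For context on the ordering constraint described in the comment above, here is a rough sketch (not mypy's actual code) of how hook resolution across plugins behaves: mypy effectively asks each plugin in the configured order and uses the first non-None answer, so a plugin listed after ours would never get to run a conflicting hook.

    def resolve_base_class_hook(plugins, fullname):
        # Plugins are consulted in the order they appear in `plugins = ...` above.
        for plugin in plugins:
            hook = plugin.get_base_class_hook(fullname)
            if hook is not None:
                # The first non-None hook wins; the remaining plugins are never asked,
                # which is why our plugin has to forward to the others itself.
                return hook
        return None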


@@ -23,16 +23,21 @@
can crop up, e.g. the cache descriptors.
"""
from typing import Callable, Optional, Tuple, Type, Union
import enum
from typing import Callable, Mapping, Optional, Tuple, Type, Union
import attr
import mypy.types
from mypy.erasetype import remove_instance_last_known_values
from mypy.errorcodes import ErrorCode
from mypy.nodes import ARG_NAMED_OPT, ListExpr, NameExpr, TempNode, TupleExpr, Var
from mypy.plugin import (
ClassDefContext,
Context,
FunctionLike,
FunctionSigContext,
MethodSigContext,
MypyFile,
Plugin,
)
from mypy.typeops import bind_self
@@ -41,12 +46,15 @@ from mypy.types import (
CallableType,
Instance,
NoneType,
Options,
TupleType,
TypeAliasType,
TypeVarType,
UninhabitedType,
UnionType,
)
from mypy_zope import plugin as mypy_zope_plugin
from pydantic.mypy import plugin as mypy_pydantic_plugin
PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL = ErrorCode(
"missing-server-name-label",
@@ -54,19 +62,153 @@ PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL = ErrorCode(
category="per-homeserver-tenant-metrics",
)
PROMETHEUS_METRIC_MISSING_FROM_LIST_TO_CHECK = ErrorCode(
"metric-type-missing-from-list",
"Every Prometheus metric type must be included in the `prometheus_metric_fullname_to_label_arg_map`.",
category="per-homeserver-tenant-metrics",
)
class Sentinel(enum.Enum):
# defining a sentinel in this way allows mypy to correctly handle the
# type of a dictionary lookup and subsequent type narrowing.
UNSET_SENTINEL = object()
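
For background, a minimal sketch of the narrowing this enables (generic names, not from this diff):

    value = some_mapping.get(key, Sentinel.UNSET_SENTINEL)
    # `value` is inferred as a union of the mapping's value type and
    # `Literal[Sentinel.UNSET_SENTINEL]`.
    assert value is not Sentinel.UNSET_SENTINEL
    # After the assert, mypy narrows `value` back to the mapping's value type; a plain
    # `object()` sentinel would instead widen the inferred type and defeat the check.
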
@attr.s(auto_attribs=True)
class ArgLocation:
keyword_name: str
"""
The keyword argument name for this argument
"""
position: int
"""
The 0-based positional index of this argument
"""
prometheus_metric_fullname_to_label_arg_map: Mapping[str, Optional[ArgLocation]] = {
# `Collector` subclasses:
"prometheus_client.metrics.MetricWrapperBase": ArgLocation("labelnames", 2),
"prometheus_client.metrics.Counter": ArgLocation("labelnames", 2),
"prometheus_client.metrics.Histogram": ArgLocation("labelnames", 2),
"prometheus_client.metrics.Gauge": ArgLocation("labelnames", 2),
"prometheus_client.metrics.Summary": ArgLocation("labelnames", 2),
"prometheus_client.metrics.Info": ArgLocation("labelnames", 2),
"prometheus_client.metrics.Enum": ArgLocation("labelnames", 2),
"synapse.metrics.LaterGauge": ArgLocation("labelnames", 2),
"synapse.metrics.InFlightGauge": ArgLocation("labels", 2),
"synapse.metrics.GaugeBucketCollector": ArgLocation("labelnames", 2),
"prometheus_client.registry.Collector": None,
"prometheus_client.registry._EmptyCollector": None,
"prometheus_client.registry.CollectorRegistry": None,
"prometheus_client.process_collector.ProcessCollector": None,
"prometheus_client.platform_collector.PlatformCollector": None,
"prometheus_client.gc_collector.GCCollector": None,
"synapse.metrics._gc.GCCounts": None,
"synapse.metrics._gc.PyPyGCStats": None,
"synapse.metrics._reactor_metrics.ReactorLastSeenMetric": None,
"synapse.metrics.CPUMetrics": None,
"synapse.metrics.jemalloc.JemallocCollector": None,
"synapse.util.metrics.DynamicCollectorRegistry": None,
"synapse.metrics.background_process_metrics._Collector": None,
#
# `Metric` subclasses:
"prometheus_client.metrics_core.Metric": None,
"prometheus_client.metrics_core.UnknownMetricFamily": ArgLocation("labels", 3),
"prometheus_client.metrics_core.CounterMetricFamily": ArgLocation("labels", 3),
"prometheus_client.metrics_core.GaugeMetricFamily": ArgLocation("labels", 3),
"prometheus_client.metrics_core.SummaryMetricFamily": ArgLocation("labels", 3),
"prometheus_client.metrics_core.InfoMetricFamily": ArgLocation("labels", 3),
"prometheus_client.metrics_core.HistogramMetricFamily": ArgLocation("labels", 3),
"prometheus_client.metrics_core.GaugeHistogramMetricFamily": ArgLocation(
"labels", 4
),
"prometheus_client.metrics_core.StateSetMetricFamily": ArgLocation("labels", 3),
"synapse.metrics.GaugeHistogramMetricFamilyWithLabels": ArgLocation(
"labelnames", 4
),
}
"""
Map from the fullname of the Prometheus `Metric`/`Collector` classes to the keyword
argument name and positional index of the label names. This map is useful because
different metrics have different signatures for passing in label names and we just need
to know where to look.
This map should include any metric that we collect with Prometheus, which corresponds
to anything that inherits from `prometheus_client.registry.Collector`
(`synapse.metrics._types.Collector`) or `prometheus_client.metrics_core.Metric`. The
exhaustiveness of this list is enforced by `analyze_prometheus_metric_classes`.
The entries with `None` always fail the lint because they don't have a `labelnames`
argument (and therefore no `SERVER_NAME_LABEL`), but we include them here so that
people notice them and explicitly allow each usage with a type ignore comment; the
source of truth should live in the source code.
"""
# Unbound at this point because we don't know the mypy version yet.
# This is set in the `plugin(...)` function below.
MypyPydanticPluginClass: Type[Plugin]
MypyZopePluginClass: Type[Plugin]
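
To make the `ArgLocation` positions in the map above concrete, here is a short sketch against `prometheus_client`'s public constructors (the metric names are made up; the argument positions reflect the library's current signatures):

    from prometheus_client import Counter
    from prometheus_client.metrics_core import GaugeMetricFamily

    # `labelnames` is the third constructor argument (0-based index 2) on the
    # metric classes...
    example_counter = Counter("example_requests_total", "A made-up counter", ["method"])
    # ...while the `*MetricFamily` classes take `labels` as the fourth argument
    # (0-based index 3), after a `value` argument.
    example_family = GaugeMetricFamily("example_gauge", "A made-up gauge", None, ["method"])
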
class SynapsePlugin(Plugin):
def __init__(self, options: Options):
super().__init__(options)
self.mypy_pydantic_plugin = MypyPydanticPluginClass(options)
self.mypy_zope_plugin = MypyZopePluginClass(options)
def set_modules(self, modules: dict[str, MypyFile]) -> None:
"""
This is called by mypy internals. We have to override this to ensure it's also
called for any other plugins that we're manually handling.
Here is how mypy describes it:
> [`self._modules`] can't be set in `__init__` because it is executed too soon
> in `build.py`. Therefore, `build.py` *must* set it later before graph processing
> starts by calling `set_modules()`.
"""
super().set_modules(modules)
self.mypy_pydantic_plugin.set_modules(modules)
self.mypy_zope_plugin.set_modules(modules)
def get_base_class_hook(
self, fullname: str
) -> Optional[Callable[[ClassDefContext], None]]:
def _get_base_class_hook(ctx: ClassDefContext) -> None:
# Run any `get_base_class_hook` checks from other plugins first.
#
# Unfortunately, mypy only uses the first plugin that returns a
# non-None value (a known limitation, cf.
# https://github.com/python/mypy/issues/19524), so we work around this by
# putting our custom plugin first in the plugin order and calling the
# other plugins' hooks manually, followed by our own checks.
if callback := self.mypy_pydantic_plugin.get_base_class_hook(fullname):
callback(ctx)
if callback := self.mypy_zope_plugin.get_base_class_hook(fullname):
callback(ctx)
# Now run our own checks
analyze_prometheus_metric_classes(ctx)
return _get_base_class_hook
def get_function_signature_hook(
self, fullname: str
) -> Optional[Callable[[FunctionSigContext], FunctionLike]]:
if fullname in (
"prometheus_client.metrics.Counter",
"prometheus_client.metrics.Histogram",
"prometheus_client.metrics.Gauge",
# TODO: Add other prometheus_client metrics that need checking as we
# refactor, see https://github.com/element-hq/synapse/issues/18592
):
return check_prometheus_metric_instantiation
# Strip off the unique identifier for classes that are dynamically created inside
# functions, e.g. `synapse.metrics.jemalloc.JemallocCollector@185` (the suffix is the
# line number).
if "@" in fullname:
fullname = fullname.split("@", 1)[0]
# Look for any Prometheus metrics to make sure they have the `SERVER_NAME_LABEL`
# label.
if fullname in prometheus_metric_fullname_to_label_arg_map.keys():
# Because it's difficult to determine the `fullname` of the function in the
# callback, let's just pass it in while we have it.
return lambda ctx: check_prometheus_metric_instantiation(ctx, fullname)
return None
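
For reference, the fullname normalisation described in the comment above is just a prefix split; mypy appends `@<line number>` to the fullname of classes defined inside a function body:

    fullname = "synapse.metrics.jemalloc.JemallocCollector@185"
    if "@" in fullname:
        fullname = fullname.split("@", 1)[0]
    # fullname is now "synapse.metrics.jemalloc.JemallocCollector", matching the key
    # used in `prometheus_metric_fullname_to_label_arg_map`.
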
@@ -90,7 +232,44 @@ class SynapsePlugin(Plugin):
return None
def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableType:
def analyze_prometheus_metric_classes(ctx: ClassDefContext) -> None:
"""
Cross-check the list of Prometheus metric classes against the
`prometheus_metric_fullname_to_label_arg_map` to ensure the list is exhaustive and
up-to-date.
"""
fullname = ctx.cls.fullname
# Strip off the unique identifier for classes that are dynamically created inside
# functions, e.g. `synapse.metrics.jemalloc.JemallocCollector@185` (the suffix is the
# line number).
if "@" in fullname:
fullname = fullname.split("@", 1)[0]
if any(
ancestor_type.fullname
in (
# All of the Prometheus metric classes inherit from the `Collector`.
"prometheus_client.registry.Collector",
"synapse.metrics._types.Collector",
# And custom metrics that inherit from `Metric`.
"prometheus_client.metrics_core.Metric",
)
for ancestor_type in ctx.cls.info.mro
):
if fullname not in prometheus_metric_fullname_to_label_arg_map:
ctx.api.fail(
f"Expected {fullname} to be in `prometheus_metric_fullname_to_label_arg_map`, "
f"but it was not found. This is a problem with our custom mypy plugin. "
f"Please add it to the map.",
Context(),
code=PROMETHEUS_METRIC_MISSING_FROM_LIST_TO_CHECK,
)
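
Roughly what this exhaustiveness check catches, sketched with a hypothetical collector (the class, module, and fullname below are made up):

    from typing import Iterable

    from prometheus_client.metrics_core import Metric
    from prometheus_client.registry import Collector

    class ShinyNewCollector(Collector):
        def collect(self) -> Iterable[Metric]:
            return []

    # Until its fullname (say, "synapse.metrics.shiny.ShinyNewCollector") is added to
    # `prometheus_metric_fullname_to_label_arg_map`, mypy reports an error like:
    #   error: Expected synapse.metrics.shiny.ShinyNewCollector to be in
    #   `prometheus_metric_fullname_to_label_arg_map`, ...  [metric-type-missing-from-list]
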
def check_prometheus_metric_instantiation(
ctx: FunctionSigContext, fullname: str
) -> CallableType:
"""
Ensure that the `prometheus_client` metrics include the `SERVER_NAME_LABEL` label
when instantiated.
@@ -103,18 +282,49 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableType
Python garbage collection, and Twisted reactor tick time, which shouldn't have the
`SERVER_NAME_LABEL`. In those cases, use a type ignore comment to disable the
check, e.g. `# type: ignore[missing-server-name-label]`.
Args:
ctx: The `FunctionSigContext` from mypy.
fullname: The fully qualified name of the function being called,
e.g. `"prometheus_client.metrics.Counter"`
"""
# The true signature, this isn't being modified so this is what will be returned.
signature: CallableType = ctx.default_signature
signature = ctx.default_signature
# Find where the label names argument is in the function signature.
arg_location = prometheus_metric_fullname_to_label_arg_map.get(
fullname, Sentinel.UNSET_SENTINEL
)
assert arg_location is not Sentinel.UNSET_SENTINEL, (
f"Expected to find {fullname} in `prometheus_metric_fullname_to_label_arg_map`, "
f"but it was not found. This is a problem with our custom mypy plugin. "
f"Please add it to the map. Context: {ctx.context}"
)
# People should be using `# type: ignore[missing-server-name-label]` for
# process-level metrics that should not have the `SERVER_NAME_LABEL`.
if arg_location is None:
ctx.api.fail(
f"{signature.name} does not have a `labelnames`/`labels` argument "
"(if this is untrue, update `prometheus_metric_fullname_to_label_arg_map` "
"in our custom mypy plugin) and should probably have a type ignore comment, "
"e.g. `# type: ignore[missing-server-name-label]`. The reason we don't "
"automatically ignore this is that the source of truth should be in the source code.",
ctx.context,
code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL,
)
return signature
# Sanity-check that the arguments are still as expected in this version of
# `prometheus_client`, e.g. `Counter(name, documentation, labelnames, ...)`
#
# `signature.arg_names` should be: ["name", "documentation", "labelnames", ...]
if len(signature.arg_names) < 3 or signature.arg_names[2] != "labelnames":
if (
len(signature.arg_names) < (arg_location.position + 1)
or signature.arg_names[arg_location.position] != arg_location.keyword_name
):
ctx.api.fail(
f"Expected the 3rd argument of {signature.name} to be 'labelnames', but got "
f"{signature.arg_names[2]}",
f"Expected argument number {arg_location.position + 1} of {signature.name} to be `labelnames`/`labels`, "
f"but got {signature.arg_names[arg_location.position]}",
ctx.context,
)
return signature
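
A quick sketch of what the check means at a call site (the metric names are made up, and the `SERVER_NAME_LABEL` import path is an assumption):

    from prometheus_client import Counter

    from synapse.metrics import SERVER_NAME_LABEL  # assumed import location

    # Passes the lint: the label names include `SERVER_NAME_LABEL`.
    ok_counter = Counter(
        "synapse_example_ok_total",
        "A hypothetical per-homeserver counter",
        labelnames=["method", SERVER_NAME_LABEL],
    )

    # Fails with `missing-server-name-label` unless annotated with
    # `# type: ignore[missing-server-name-label]`, which is only appropriate for
    # process-level metrics.
    bad_counter = Counter(
        "synapse_example_bad_total",
        "A hypothetical process-wide counter",
        labelnames=["method"],
    )
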
@@ -137,7 +347,11 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableType
# ...
# ]
# ```
labelnames_arg_expression = ctx.args[2][0] if len(ctx.args[2]) > 0 else None
labelnames_arg_expression = (
ctx.args[arg_location.position][0]
if len(ctx.args[arg_location.position]) > 0
else None
)
if isinstance(labelnames_arg_expression, (ListExpr, TupleExpr)):
# Check if the `labelnames` argument includes the `server_name` label (`SERVER_NAME_LABEL`).
for labelname_expression in labelnames_arg_expression.items:
@@ -476,10 +690,13 @@ def is_cacheable(
def plugin(version: str) -> Type[SynapsePlugin]:
global MypyPydanticPluginClass, MypyZopePluginClass
# This is the entry point of the plugin, and lets us deal with the fact
# that the mypy plugin interface is *not* stable by looking at the version
# string.
#
# However, since we pin the version of mypy Synapse uses in CI, we don't
# really care.
MypyPydanticPluginClass = mypy_pydantic_plugin(version)
MypyZopePluginClass = mypy_zope_plugin(version)
return SynapsePlugin


@@ -170,7 +170,9 @@ class LaterGauge(Collector):
]
def collect(self) -> Iterable[Metric]:
g = GaugeMetricFamily(self.name, self.desc, labels=self.labelnames)
# The decision to add `SERVER_NAME_LABEL` is from the `LaterGauge` usage itself
# (we don't enforce it here, one level up).
g = GaugeMetricFamily(self.name, self.desc, labels=self.labelnames) # type: ignore[missing-server-name-label]
try:
calls = self.caller()
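
To illustrate the "one level up" comments in this file: the plugin checks the label names chosen where the gauge is constructed, which is why the internal `GaugeMetricFamily` call above can carry a type ignore. A hedged sketch of such a caller (the metric name and caller are made up; the keyword names are assumptions based on the map above and the attributes used in `collect()`):

    pending_things_gauge = LaterGauge(
        name="synapse_example_pending_things",    # hypothetical metric name
        desc="Number of pending things",          # keyword assumed from `self.desc`
        labelnames=[SERVER_NAME_LABEL],           # this is what the lint checks
        caller=lambda: {("example.com",): 1.0},   # assumed return shape for the caller
    )
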
@@ -304,7 +306,9 @@ class InFlightGauge(Generic[MetricsEntry], Collector):
Note: may be called by a separate thread.
"""
in_flight = GaugeMetricFamily(
# The decision to add `SERVER_NAME_LABEL` is from the `InFlightGauge` usage itself
# (we don't enforce it here, one level up).
in_flight = GaugeMetricFamily( # type: ignore[missing-server-name-label]
self.name + "_total", self.desc, labels=self.labels
)
@@ -328,7 +332,9 @@ class InFlightGauge(Generic[MetricsEntry], Collector):
yield in_flight
for name in self.sub_metrics:
gauge = GaugeMetricFamily(
# The decision to add `SERVER_NAME_LABEL` is from the `InFlightGauge` usage
# itself (we don't enforce it here, one level up).
gauge = GaugeMetricFamily( # type: ignore[missing-server-name-label]
"_".join([self.name, name]), "", labels=self.labels
)
for key, metrics in metrics_by_key.items():
@@ -483,7 +489,9 @@ class GaugeBucketCollector(Collector):
# that bucket or below.
accumulated_values = itertools.accumulate(bucket_values)
return GaugeHistogramMetricFamilyWithLabels(
# The decision to add `SERVER_NAME_LABEL` is from the `GaugeBucketCollector`
# usage itself (we don't enforce it here, one level up).
return GaugeHistogramMetricFamilyWithLabels( # type: ignore[missing-server-name-label]
name=self._name,
documentation=self._documentation,
labelnames=self._labelnames,
@@ -519,16 +527,19 @@ class CPUMetrics(Collector):
line = s.read()
raw_stats = line.split(") ", 1)[1].split(" ")
user = GaugeMetricFamily("process_cpu_user_seconds_total", "")
# This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`.
user = GaugeMetricFamily("process_cpu_user_seconds_total", "") # type: ignore[missing-server-name-label]
user.add_metric([], float(raw_stats[11]) / self.ticks_per_sec)
yield user
sys = GaugeMetricFamily("process_cpu_system_seconds_total", "")
# This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`.
sys = GaugeMetricFamily("process_cpu_system_seconds_total", "") # type: ignore[missing-server-name-label]
sys.add_metric([], float(raw_stats[12]) / self.ticks_per_sec)
yield sys
REGISTRY.register(CPUMetrics())
# This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`.
REGISTRY.register(CPUMetrics()) # type: ignore[missing-server-name-label]
#


@@ -83,7 +83,8 @@ gc_time = Histogram( # type: ignore[missing-server-name-label]
class GCCounts(Collector):
def collect(self) -> Iterable[Metric]:
cm = GaugeMetricFamily("python_gc_counts", "GC object counts", labels=["gen"])
# This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`.
cm = GaugeMetricFamily("python_gc_counts", "GC object counts", labels=["gen"]) # type: ignore[missing-server-name-label]
for n, m in enumerate(gc.get_count()):
cm.add_metric([str(n)], m)
@@ -102,7 +103,8 @@ def install_gc_manager() -> None:
if running_on_pypy:
return
REGISTRY.register(GCCounts())
# This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`.
REGISTRY.register(GCCounts()) # type: ignore[missing-server-name-label]
gc.disable()
@@ -177,7 +179,8 @@ class PyPyGCStats(Collector):
#
# Total time spent in GC: 0.073 # s.total_gc_time
pypy_gc_time = CounterMetricFamily(
# This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`.
pypy_gc_time = CounterMetricFamily( # type: ignore[missing-server-name-label]
"pypy_gc_time_seconds_total",
"Total time spent in PyPy GC",
labels=[],
@@ -185,7 +188,8 @@ class PyPyGCStats(Collector):
pypy_gc_time.add_metric([], s.total_gc_time / 1000)
yield pypy_gc_time
pypy_mem = GaugeMetricFamily(
# This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`.
pypy_mem = GaugeMetricFamily( # type: ignore[missing-server-name-label]
"pypy_memory_bytes",
"Memory tracked by PyPy allocator",
labels=["state", "class", "kind"],
@@ -209,4 +213,5 @@ class PyPyGCStats(Collector):
if running_on_pypy:
REGISTRY.register(PyPyGCStats())
# This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`.
REGISTRY.register(PyPyGCStats()) # type: ignore[missing-server-name-label]


@@ -115,7 +115,8 @@ class ReactorLastSeenMetric(Collector):
self._call_wrapper = call_wrapper
def collect(self) -> Iterable[Metric]:
cm = GaugeMetricFamily(
# This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`.
cm = GaugeMetricFamily( # type: ignore[missing-server-name-label]
"python_twisted_reactor_last_seen",
"Seconds since the Twisted reactor was last seen",
)
@@ -166,4 +167,5 @@ except Exception as e:
if wrapper:
REGISTRY.register(ReactorLastSeenMetric(wrapper))
# This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`.
REGISTRY.register(ReactorLastSeenMetric(wrapper)) # type: ignore[missing-server-name-label]


@@ -167,7 +167,9 @@ class _Collector(Collector):
yield from m.collect()
REGISTRY.register(_Collector())
# The `SERVER_NAME_LABEL` is included in the individual metrics added to this registry,
# so we don't need to worry about it on the collector itself.
REGISTRY.register(_Collector()) # type: ignore[missing-server-name-label]
class _BackgroundProcess:


@@ -188,7 +188,8 @@ def _setup_jemalloc_stats() -> None:
def collect(self) -> Iterable[Metric]:
stats.refresh_stats()
g = GaugeMetricFamily(
# This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`.
g = GaugeMetricFamily( # type: ignore[missing-server-name-label]
"jemalloc_stats_app_memory_bytes",
"The stats reported by jemalloc",
labels=["type"],
@@ -230,7 +231,8 @@ def _setup_jemalloc_stats() -> None:
yield g
REGISTRY.register(JemallocCollector())
# This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`.
REGISTRY.register(JemallocCollector()) # type: ignore[missing-server-name-label]
logger.debug("Added jemalloc stats")


@@ -42,7 +42,10 @@ TRACK_MEMORY_USAGE = False
# We track cache metrics in a special registry that lets us update the metrics
# just before they are returned from the scrape endpoint.
CACHE_METRIC_REGISTRY = DynamicCollectorRegistry()
#
# The `SERVER_NAME_LABEL` is included in the individual metrics added to this registry,
# so we don't need to worry about it on the collector itself.
CACHE_METRIC_REGISTRY = DynamicCollectorRegistry() # type: ignore[missing-server-name-label]
cache_size = Gauge(
"synapse_util_caches_cache_size",


@@ -59,7 +59,8 @@ class TestMauLimit(unittest.TestCase):
foo: int
bar: int
gauge: InFlightGauge[MetricEntry] = InFlightGauge(
# This is a test, so it does not matter whether it uses `SERVER_NAME_LABEL`.
gauge: InFlightGauge[MetricEntry] = InFlightGauge( # type: ignore[missing-server-name-label]
"test1", "", labels=["test_label"], sub_metrics=["foo", "bar"]
)