From: Nitzan Mordechai Date: Tue, 9 Dec 2025 12:34:07 +0000 (+0000) Subject: mgr/prometheus: Use RLock to fix deadlock in HealthHistory X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=0c4e68af8e2bc005109b99c3db7f5004eead3c59;p=ceph-ci.git mgr/prometheus: Use RLock to fix deadlock in HealthHistory The HealthHistory.check() method acquires the lock and then calls HealthHistory.save(), which also tries to acquire the same lock. With a regular Lock(), the same thread blocks trying to re-acquire it (deadlock). Switch to RLock to allow nested acquisition by the same thread. PR #65245 added the locks. Fixes: https://tracker.ceph.com/issues/74148 Signed-off-by: Nitzan Mordechai --- diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py index 836f64aa2b7..679c7fa894e 100644 --- a/src/pybind/mgr/prometheus/module.py +++ b/src/pybind/mgr/prometheus/module.py @@ -16,7 +16,7 @@ from mgr_util import get_default_addr, profile_method, build_url, test_port_allo from orchestrator import OrchestratorClientMixin, raise_if_exception, OrchestratorError from rbd import RBD -from typing import DefaultDict, Optional, Dict, Any, Set, cast, Tuple, Union, List, Callable, IO, TypeVar, Generic +from typing import DefaultDict, Optional, Dict, Any, Set, cast, Tuple, Union, List, Callable, IO, TypeVar, Iterator LabelValues = Tuple[str, ...] Number = Union[int, float] MetricValue = Dict[LabelValues, Number] @@ -188,8 +188,7 @@ K = TypeVar('K') V = TypeVar('V') -class LRUCacheDict(OrderedDict[K, V], Generic[K, V]): - maxsize: int +class LRUCacheDict(OrderedDict[K, V]): def __init__(self, maxsize: int, *args: Any, **kwargs: Any) -> None: self.maxsize = maxsize @@ -210,9 +209,9 @@ class HealthHistory: def __init__(self, mgr: MgrModule): self.mgr = mgr - self.lock = threading.Lock() + self.lock = threading.RLock() self.max_entries = cast(int, self.mgr.get_localized_module_option('healthcheck_history_max_entries', 1000)) - self.healthcheck: LRUCacheDict[str, HealthCheckEvent] = LRUCacheDict(maxsize=self.max_entries) + self.healthcheck: ThreadSafeLRUCacheDict[str, HealthCheckEvent] = ThreadSafeLRUCacheDict(maxsize=self.max_entries) self._load() def _load(self) -> None: @@ -294,6 +293,7 @@ class HealthHistory: if changes_made: self.save() + self.mgr.log.debug("check(): DONE") def __str__(self) -> str: """Print the healthcheck history. @@ -357,6 +357,47 @@ class HealthHistory: return yaml.safe_dump(self.as_dict(), explicit_start=True, default_flow_style=False) +class ThreadSafeLRUCacheDict(LRUCacheDict[K, V]): + maxsize: int + + def __init__(self, maxsize: int, *args: Any, **kwargs: Any) -> None: + self.maxsize = maxsize + self._lock = threading.RLock() + super().__init__(maxsize, *args, **kwargs) + + def __setitem__(self, key: K, value: V) -> None: + with self._lock: + super().__setitem__(key, value) + + def __getitem__(self, key: Any) -> V: + with self._lock: + return super().__getitem__(key) + + def __iter__(self) -> Iterator[K]: + with self._lock: + return iter(list(super().keys())) + + def __contains__(self, key: object) -> bool: + with self._lock: + return super().__contains__(key) + + def __len__(self) -> int: + with self._lock: + return super().__len__() + + def get(self, key: K, default: Optional[V] = None) -> Optional[V]: # type: ignore[override] + with self._lock: + return super().get(key, default) + + def clear(self) -> None: + with self._lock: + super().clear() + + def keys(self) -> List[K]: # type: ignore[override] + with self._lock: + return list(super().keys()) + + class Metric(object): def __init__(self, mtype: str, name: str, desc: str, labels: Optional[LabelValues] = None) -> None: self.mtype = mtype @@ -1026,6 +1067,7 @@ class Module(MgrModule, OrchestratorClientMixin): self.metrics[path].set(0) self.health_history.check(health) + for name, info in self.health_history.healthcheck.items(): v = 1 if info.active else 0 self.metrics['health_detail'].set(