]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
mgr/prometheus: Use RLock to fix deadlock in HealthHistory
authorNitzan Mordechai <nmordech@redhat.com>
Tue, 9 Dec 2025 12:34:07 +0000 (12:34 +0000)
committerNitzan Mordechai <nmordch@li-2bf86dcc-35b8-11b2-a85c-b1dfd4248b42.ibm.com>
Wed, 7 Jan 2026 11:16:40 +0000 (13:16 +0200)
The HealthHistory.check() method acquires the lock and then calls
HealthHistory.save(), which also tries to acquire the same lock.
With a regular Lock(), the same thread blocks trying to re-acquire it (deadlock).
Switch to RLock to allow nested acquisition by the same thread.
PR #65245 added the locks.

Fixes: https://tracker.ceph.com/issues/74148
Signed-off-by: Nitzan Mordechai <nmordech@ibm.com>
src/pybind/mgr/prometheus/module.py

index 836f64aa2b76238d55ef435105edbe9faaeee3c0..679c7fa894e5dabe7fc01a83687f4a9b36b8866e 100644 (file)
@@ -16,7 +16,7 @@ from mgr_util import get_default_addr, profile_method, build_url, test_port_allo
 from orchestrator import OrchestratorClientMixin, raise_if_exception, OrchestratorError
 from rbd import RBD
 
-from typing import DefaultDict, Optional, Dict, Any, Set, cast, Tuple, Union, List, Callable, IO, TypeVar, Generic
+from typing import DefaultDict, Optional, Dict, Any, Set, cast, Tuple, Union, List, Callable, IO, TypeVar, Iterator
 LabelValues = Tuple[str, ...]
 Number = Union[int, float]
 MetricValue = Dict[LabelValues, Number]
@@ -188,8 +188,7 @@ K = TypeVar('K')
 V = TypeVar('V')
 
 
-class LRUCacheDict(OrderedDict[K, V], Generic[K, V]):
-    maxsize: int
+class LRUCacheDict(OrderedDict[K, V]):
 
     def __init__(self, maxsize: int, *args: Any, **kwargs: Any) -> None:
         self.maxsize = maxsize
@@ -210,9 +209,9 @@ class HealthHistory:
 
     def __init__(self, mgr: MgrModule):
         self.mgr = mgr
-        self.lock = threading.Lock()
+        self.lock = threading.RLock()
         self.max_entries = cast(int, self.mgr.get_localized_module_option('healthcheck_history_max_entries', 1000))
-        self.healthcheck: LRUCacheDict[str, HealthCheckEvent] = LRUCacheDict(maxsize=self.max_entries)
+        self.healthcheck: ThreadSafeLRUCacheDict[str, HealthCheckEvent] = ThreadSafeLRUCacheDict(maxsize=self.max_entries)
         self._load()
 
     def _load(self) -> None:
@@ -294,6 +293,7 @@ class HealthHistory:
 
             if changes_made:
                 self.save()
+            self.mgr.log.debug("check(): DONE")
 
     def __str__(self) -> str:
         """Print the healthcheck history.
@@ -357,6 +357,47 @@ class HealthHistory:
         return yaml.safe_dump(self.as_dict(), explicit_start=True, default_flow_style=False)
 
 
+class ThreadSafeLRUCacheDict(LRUCacheDict[K, V]):
+    maxsize: int
+
+    def __init__(self, maxsize: int, *args: Any, **kwargs: Any) -> None:
+        self.maxsize = maxsize
+        self._lock = threading.RLock()
+        super().__init__(maxsize, *args, **kwargs)
+
+    def __setitem__(self, key: K, value: V) -> None:
+        with self._lock:
+            super().__setitem__(key, value)
+
+    def __getitem__(self, key: Any) -> V:
+        with self._lock:
+            return super().__getitem__(key)
+
+    def __iter__(self) -> Iterator[K]:
+        with self._lock:
+            return iter(list(super().keys()))
+
+    def __contains__(self, key: object) -> bool:
+        with self._lock:
+            return super().__contains__(key)
+
+    def __len__(self) -> int:
+        with self._lock:
+            return super().__len__()
+
+    def get(self, key: K, default: Optional[V] = None) -> Optional[V]:  # type: ignore[override]
+        with self._lock:
+            return super().get(key, default)
+
+    def clear(self) -> None:
+        with self._lock:
+            super().clear()
+
+    def keys(self) -> List[K]:  # type: ignore[override]
+        with self._lock:
+            return list(super().keys())
+
+
 class Metric(object):
     def __init__(self, mtype: str, name: str, desc: str, labels: Optional[LabelValues] = None) -> None:
         self.mtype = mtype
@@ -1026,6 +1067,7 @@ class Module(MgrModule, OrchestratorClientMixin):
                     self.metrics[path].set(0)
 
         self.health_history.check(health)
+
         for name, info in self.health_history.healthcheck.items():
             v = 1 if info.active else 0
             self.metrics['health_detail'].set(