git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
mgr/cephadm: make cache invalidate less racy
author Sage Weil <sage@redhat.com>
Fri, 21 Feb 2020 21:38:25 +0000 (15:38 -0600)
committer Sage Weil <sage@redhat.com>
Mon, 24 Feb 2020 16:35:26 +0000 (10:35 -0600)
Consider a cache invalidation that races with an actual update:

- serve() refresh starts
- refresh runs cephadm ls
- add_daemon creates a new daemon
- add_daemon returns and invalidates the list (set last_update=None)
- serve() stores its ls result in the cache

In such a case the add result will get lost.

Fix this by taking a conservative strategy:

- invalidate adds host to a refresh list
- serve() removes an item from the refresh list and then does the ls,
then stores the result.

Any racing update will invalidate *after* it does its work, which means
we will always do a final ls afterwards.

Signed-off-by: Sage Weil <sage@redhat.com>
src/pybind/mgr/cephadm/module.py

index 7d8ee5738a0598d18e414161190fe4cdd1fd9ae9..baaecdae31e5b6185ddd805398398047145cb607 100644 (file)
@@ -115,6 +115,8 @@ class HostCache():
         self.last_daemon_update = {}   # type: Dict[str, datetime.datetime]
         self.devices = {}              # type: Dict[str, List[inventory.Device]]
         self.last_device_update = {}   # type: Dict[str, datetime.datetime]
+        self.daemon_refresh_queue = [] # type: List[str]
+        self.device_refresh_queue = [] # type: List[str]
 
     def load(self):
         # type: () -> None
@@ -128,6 +130,8 @@ class HostCache():
                 j = json.loads(v)
                 # we do ignore the persisted last_*_update to trigger a new
                 # scrape on mgr restart
+                self.daemon_refresh_queue.append(host)
+                self.device_refresh_queue.append(host)
                 self.daemons[host] = {}
                 self.devices[host] = []
                 for name, d in j.get('daemons', {}).items():
@@ -159,15 +163,19 @@ class HostCache():
         """
         self.daemons[host] = {}
         self.devices[host] = []
+        self.daemon_refresh_queue.append(host)
+        self.device_refresh_queue.append(host)
 
     def invalidate_host_daemons(self, host):
         # type: (str) -> None
+        self.daemon_refresh_queue.append(host)
         if host in self.last_daemon_update:
             del self.last_daemon_update[host]
         self.mgr.event.set()
 
     def invalidate_host_devices(self, host):
         # type: (str) -> None
+        self.device_refresh_queue.append(host)
         if host in self.last_device_update:
             del self.last_device_update[host]
         self.mgr.event.set()
@@ -234,6 +242,9 @@ class HostCache():
 
     def host_needs_daemon_refresh(self, host):
         # type: (str) -> bool
+        if host in self.daemon_refresh_queue:
+            self.daemon_refresh_queue.remove(host)
+            return True
         cutoff = datetime.datetime.utcnow() - datetime.timedelta(
             seconds=self.mgr.daemon_cache_timeout)
         if host not in self.last_daemon_update or self.last_daemon_update[host] < cutoff:
@@ -242,6 +253,9 @@ class HostCache():
 
     def host_needs_device_refresh(self, host):
         # type: (str) -> bool
+        if host in self.device_refresh_queue:
+            self.device_refresh_queue.remove(host)
+            return True
         cutoff = datetime.datetime.utcnow() - datetime.timedelta(
             seconds=self.mgr.device_cache_timeout)
         if host not in self.last_device_update or self.last_device_update[host] < cutoff: