mgr/snap_schedule: handle volume delete
author Milind Changire <mchangir@redhat.com>
Thu, 11 Jan 2024 12:51:31 +0000 (18:21 +0530)
committer Milind Changire <mchangir@redhat.com>
Mon, 4 Nov 2024 13:15:10 +0000 (18:45 +0530)
Remove references to timers and db connections that point to paths in the
deleted volume.

This is a best-effort cleanup and not 100% safe. There can be race
conditions where the db connection is closed while the module is still
executing a query on the db as the volume itself is being deleted.

Hopefully there won't be many such tracebacks.

Fixes: https://tracker.ceph.com/issues/63999
Signed-off-by: Milind Changire <mchangir@redhat.com>
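
For context, here is a minimal sketch (not part of the commit) of the race the
message describes, assuming the scheduler's sqlite3 connection is closed by the
cleanup path while another thread still holds a reference to it; the in-flight
query then fails with sqlite3.ProgrammingError, which is the kind of traceback
this change hopes to make rare:

    # Hypothetical illustration only; table name and flow are made up.
    import sqlite3

    # Connection shared across threads, as the snap_schedule module does.
    conn = sqlite3.connect(':memory:', check_same_thread=False)
    conn.execute('CREATE TABLE schedules (path TEXT)')

    # Cleanup path closes the connection after the volume is deleted ...
    conn.close()

    # ... while a scheduler thread still tries to use it.
    try:
        conn.execute('SELECT * FROM schedules')
    except sqlite3.ProgrammingError as e:
        # sqlite3 raises "Cannot operate on a closed database."
        print(f'traceback the commit message refers to: {e}')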
src/pybind/mgr/snap_schedule/fs/schedule_client.py
src/pybind/mgr/snap_schedule/module.py

index b58f20f127534d56b81a111b82a60d18342eb847..12e5e98073766fb1c3917cc3376472d2d3b80a49 100644 (file)
@@ -163,6 +163,7 @@ class SnapSchedClient(CephfsClient):
         self.sqlite_connections: Dict[str, DBInfo] = {}
         self.active_timers: Dict[Tuple[str, str], List[Timer]] = {}
         self.conn_lock: Lock = Lock()  # lock to protect add/lookup db connections
+        self.timers_lock: Lock = Lock()
 
         # restart old schedules
         for fs_name in self.get_all_filesystems():
@@ -273,6 +274,27 @@ class SnapSchedClient(CephfsClient):
                     if self._is_allowed_repeat(r, path)][0:1]
             return rows
 
+    def delete_references_to_unavailable_fs(self, available_fs_names: Set[str]) -> None:
+        fs_to_remove: Set[str] = set()
+        self.timers_lock.acquire()
+        for fs, path in list(self.active_timers.keys()):  # each key is a tuple
+            if fs not in available_fs_names:
+                fs_to_remove.add(fs)
+                log.debug(f'Cancelled timers for "{fs}:{path}"')
+                for t in self.active_timers[(fs, path)]:
+                    t.cancel()
+                log.debug(f'Removed timer instance for "{fs}"')
+                del self.active_timers[(fs, path)]
+        self.timers_lock.release()
+
+        self.conn_lock.acquire()
+        for fs in fs_to_remove:
+            log.debug(f'Closed DB connection to "{fs}"')
+            self.sqlite_connections[fs].db.close()
+            log.debug(f'Removed DB connection to "{fs}"')
+            del self.sqlite_connections[fs]
+        self.conn_lock.release()
+
     def refresh_snap_timers(self, fs: str, path: str, olddb: Optional[sqlite3.Connection] = None) -> None:
         try:
             log.debug((f'SnapDB on {fs} changed for {path}, '
@@ -286,6 +308,7 @@ class SnapSchedClient(CephfsClient):
                 with self.get_schedule_db(fs) as conn_mgr:
                     db = conn_mgr.dbinfo.db
                     rows = self.fetch_schedules(db, path)
+            self.timers_lock.acquire()
             timers = self.active_timers.get((fs, path), [])
             for timer in timers:
                 timer.cancel()
@@ -299,6 +322,7 @@ class SnapSchedClient(CephfsClient):
                 timers.append(t)
                 log.debug(f'Will snapshot {path} in fs {fs} in {row[1]}s')
             self.active_timers[(fs, path)] = timers
+            self.timers_lock.release()
         except Exception:
             self._log_exception('refresh_snap_timers')
 
index d8f04a62b94ecaa6f2715de6ac61ee42879dc566..adf982448b1cf0c2200458706d98d701a5712b8b 100644 (file)
@@ -8,12 +8,14 @@ import json
 import sqlite3
 from typing import Any, Dict, Optional, Tuple, Union
 from .fs.schedule_client import SnapSchedClient
-from mgr_module import MgrModule, CLIReadCommand, CLIWriteCommand, Option
+from mgr_module import MgrModule, CLIReadCommand, CLIWriteCommand, Option, NotifyType
 from mgr_util import CephfsConnectionException
 from threading import Event
 
 
 class Module(MgrModule):
+    NOTIFY_TYPES = [NotifyType.fs_map]
+
     MODULE_OPTIONS = [
         Option(
             'allow_m_granularity',
@@ -37,6 +39,21 @@ class Module(MgrModule):
         self._initialized = Event()
         self.client = SnapSchedClient(self)
 
+    def notify(self, notify_type: NotifyType, notify_id: str) -> None:
+        if notify_type != NotifyType.fs_map:
+            return
+        fs_map = self.get('fs_map')
+        if not fs_map:
+            return
+
+        # we don't know for which fs config has been changed
+        fs_names = set()
+        for fs in fs_map['filesystems']:
+            fs_name = fs['mdsmap']['fs_name']
+            fs_names.add(fs_name)
+
+        self.client.delete_references_to_unavailable_fs(fs_names)
+
     def _subvolume_exist(self, fs: str, subvol: Union[str, None], group: Union[str, None]) -> bool:
         rc, subvolumes, err = self.remote('volumes', 'subvolume_ls', fs, group)
         if rc == 0: