]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: implement pause/resume to suspend non-monitoring background work 33930/head
author: Sage Weil <sage@redhat.com>
Thu, 12 Mar 2020 18:13:11 +0000 (13:13 -0500)
committer: Sage Weil <sage@redhat.com>
Thu, 12 Mar 2020 18:13:11 +0000 (13:13 -0500)
If the user does 'orch pause', suspend all background work that makes
actual changes.

Continue to do read-only operations, like checking host connectivity
and scraping daemon and device status.

Signed-off-by: Sage Weil <sage@redhat.com>
doc/cephadm/administration.rst
src/pybind/mgr/cephadm/module.py

index 855f2df940267a8d97b0b5e61b651bde675d2ab8..afc1dd21cfec0af2fda952f47188e4e18e3c588a 100644 (file)
@@ -31,6 +31,18 @@ To clear this value use the command:
 Health checks
 =============
 
+CEPHADM_PAUSED
+--------------
+
+Cephadm background work has been paused with ``ceph orch pause``.  Cephadm
+will continue to perform passive monitoring activities (like checking
+host and daemon status), but it will not make any changes (like deploying
+or removing daemons).
+
+You can resume cephadm work with::
+
+  ceph orch resume
+
 CEPHADM_STRAY_HOST
 ------------------
 
index 5d20c393e74d003c3df2ee3c89a6d4723b4b79cd..6d5c0c139d477cb251d4836eafcb298c71b74530 100644 (file)
@@ -607,6 +607,11 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule):
         self.run = True
         self.event = Event()
 
+        if self.get_store('pause'):
+            self.paused = True
+        else:
+            self.paused = False
+
         # for mypy which does not run the code
         if TYPE_CHECKING:
             self.ssh_config_file = None  # type: Optional[str]
@@ -1049,7 +1054,6 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule):
         self.log.debug("serve starting")
         while self.run:
             self._check_hosts()
-            self.rm_util._remove_osds_bg()
 
             # refresh daemons
             self.log.debug('refreshing hosts')
@@ -1079,14 +1083,29 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule):
 
             self._check_for_strays()
 
-            if self._apply_all_services():
-                continue  # did something, refresh
+            if self.paused:
+                self.health_checks['CEPHADM_PAUSED'] = {
+                    'severity': 'warning',
+                    'summary': 'cephadm background work is paused',
+                    'count': 1,
+                    'detail': ["'ceph orch resume' to resume"],
+                }
+                self.set_health_checks(self.health_checks)
+            else:
+                if 'CEPHADM_PAUSED' in self.health_checks:
+                    del self.health_checks['CEPHADM_PAUSED']
+                    self.set_health_checks(self.health_checks)
+
+                self.rm_util._remove_osds_bg()
 
-            self._check_daemons()
+                if self._apply_all_services():
+                    continue  # did something, refresh
 
-            if self.upgrade_state and not self.upgrade_state.get('paused'):
-                self._do_upgrade()
-                continue
+                self._check_daemons()
+
+                if self.upgrade_state and not self.upgrade_state.get('paused'):
+                    self._do_upgrade()
+                    continue
 
             self._serve_sleep()
         self.log.debug("serve exit")
@@ -1112,6 +1131,23 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule):
     def notify(self, notify_type, notify_id):
         pass
 
+    def pause(self):
+        if not self.paused:
+            self.log.info('Paused')
+            self.set_store('pause', 'true')
+            self.paused = True
+            # wake loop so we update the health status
+            self._kick_serve_loop()
+
+    def resume(self):
+        if self.paused:
+            self.log.info('Resumed')
+            self.paused = False
+            self.set_store('pause', None)
+        # unconditionally wake loop so that 'orch resume' can be used to kick
+        # cephadm
+        self._kick_serve_loop()
+
     def get_unique_name(self, daemon_type, host, existing, prefix=None,
                         forcename=None):
         # type: (str, str, List[orchestrator.DaemonDescription], Optional[str], Optional[str]) -> str