mgr/cephadm:improve validation of orch device zap

author Paul Cuzner <pcuzner@redhat.com>

Thu, 14 Oct 2021 00:06:30 +0000 (13:06 +1300)

committer Sebastian Wagner <sewagner@redhat.com>

Tue, 2 Nov 2021 09:02:20 +0000 (10:02 +0100)
author Paul Cuzner <pcuzner@redhat.com>
Thu, 14 Oct 2021 00:06:30 +0000 (13:06 +1300)
committer Sebastian Wagner <sewagner@redhat.com>
Tue, 2 Nov 2021 09:02:20 +0000 (10:02 +0100)
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py

index 403155efdf6f4a46dc7c65628bfdf672cf5642ed..5a0b4961ae749cdca783c757e18156bfd63bc80c 100644 (file)
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -1665,7 +1665,8 @@ Then run the following:
              self.remove_health_warning('HOST_IN_MAINTENANCE')
          else:
              s = "host is" if len(in_maintenance) == 1 else "hosts are"
-            self.set_health_warning("HOST_IN_MAINTENANCE", f"{len(in_maintenance)} {s} in maintenance mode", 1, [f"{h} is in maintenance" for h in in_maintenance])
+            self.set_health_warning("HOST_IN_MAINTENANCE", f"{len(in_maintenance)} {s} in maintenance mode", 1, [
+                                    f"{h} is in maintenance" for h in in_maintenance])
  
      @handle_orch_error
      @host_exists()
@@ -2101,32 +2102,88 @@ Then run the following:
  
      @handle_orch_error
      def zap_device(self, host: str, path: str) -> str:
+        """Zap a device on a managed host.
+
+        Use ceph-volume zap to return a device to an unused/free state
+
+        Args:
+            host (str): hostname of the cluster host
+            path (str): device path
+
+        Raises:
+            OrchestratorError: host is not a cluster host
+            OrchestratorError: host is in maintenance and therefore unavailable
+            OrchestratorError: device path not found on the host
+            OrchestratorError: device is known to a different ceph cluster
+            OrchestratorError: device holds active osd
+            OrchestratorError: device cache hasn't been populated yet..
+
+        Returns:
+            str: output from the zap command
+        """
+
          self.log.info('Zap device %s:%s' % (host, path))
  
-        out, err, code = CephadmServe(self)._run_cephadm(
-            host, 'osd.', 'ceph-volume',
-            ['--', 'lvm', 'list', path, '--format', 'json'],
-            error_ok=True)
-        if code:
-            raise OrchestratorError('Zap failed: ceph-volume lvm list %s' % '\n'.join(path))
-        osd_info = json.loads('\n'.join(out))
+        if host not in self.inventory.keys():
+            raise OrchestratorError(
+                f"Host '{host}' is not a member of the cluster")
  
-        for osd_id in osd_info.keys():
-            out, err, code = CephadmServe(self)._run_cephadm(
-                host, 'osd.', 'unit',
-                ['is-active', '--name', 'osd.%s' % osd_id],
-                error_ok=True)
-            if code == 0:
-                raise OrchestratorError('The systemctl unit of osd is active. Rm osd first')
+        host_info = self.inventory._inventory.get(host, {})
+        if host_info.get('status', '').lower() == 'maintenance':
+            raise OrchestratorError(
+                f"Host '{host}' is in maintenance mode, which prevents any actions against it.")
+
+        if host not in self.cache.devices:
+            raise OrchestratorError(
+                f"Host '{host} hasn't been scanned yet to determine it's inventory. Please try again later.")
+
+        host_devices = self.cache.devices[host]
+        path_found = False
+        osd_id_list: List[str] = []
+
+        for dev in host_devices:
+            if dev.path == path:
+                # match, so look a little deeper
+                if dev.lvs:
+                    for lv in cast(List[Dict[str, str]], dev.lvs):
+                        if lv.get('osd_id', ''):
+                            lv_fsid = lv.get('cluster_fsid')
+                            if lv_fsid != self._cluster_fsid:
+                                raise OrchestratorError(
+                                    f"device {path} has lv's from a different Ceph cluster ({lv_fsid})")
+                            osd_id_list.append(lv.get('osd_id', ''))
+                path_found = True
+                break
+        if not path_found:
+            raise OrchestratorError(
+                f"Device path '{path}' not found on host '{host}'")
+
+        if osd_id_list:
+            dev_name = os.path.basename(path)
+            active_osds: List[str] = []
+            for osd_id in osd_id_list:
+                metadata = self.get_metadata('osd', str(osd_id))
+                if metadata:
+                    if metadata.get('hostname', '') == host and dev_name in metadata.get('devices', '').split(','):
+                        active_osds.append("osd." + osd_id)
+            if active_osds:
+                raise OrchestratorError(
+                    f"Unable to zap: device '{path}' on {host} has {len(active_osds)} active "
+                    f"OSD{'s' if len(active_osds) > 1 else ''}"
+                    f" ({', '.join(active_osds)}). Use 'ceph orch osd rm' first.")
  
          out, err, code = CephadmServe(self)._run_cephadm(
              host, 'osd', 'ceph-volume',
              ['--', 'lvm', 'zap', '--destroy', path],
              error_ok=True)
+
          self.cache.invalidate_host_devices(host)
          if code:
              raise OrchestratorError('Zap failed: %s' % '\n'.join(out + err))
-        return '\n'.join(out + err)
+        msg = f'zap successful for {path} on {host}'
+        self.log.info(msg)
+
+        return msg + '\n'
  
      @handle_orch_error
      def blink_device_light(self, ident_fault: str, on: bool, locs: List[orchestrator.DeviceLightLoc]) -> List[str]:
author	Paul Cuzner <pcuzner@redhat.com>
	Thu, 14 Oct 2021 00:06:30 +0000 (13:06 +1300)
committer	Sebastian Wagner <sewagner@redhat.com>
	Tue, 2 Nov 2021 09:02:20 +0000 (10:02 +0100)