From: Paul Cuzner
Date: Thu, 14 Oct 2021 00:06:30 +0000 (+1300)
Subject: mgr/cephadm: improve validation of orch device zap
X-Git-Tag: v17.1.0~602^2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=b7782084ac9657be9b2da6ebd56b5029cf859225;p=ceph.git

mgr/cephadm: improve validation of orch device zap

Improve the logic in the function to:
1. check that the host exists
2. check the host isn't in maintenance
3. check the disk exists on the host
4. check the disk isn't supporting a known OSD
5. check that the disk doesn't belong to another cluster

In addition, the command now returns a completion message so you know it worked.

Fixes: https://tracker.ceph.com/issues/52919
Signed-off-by: Paul Cuzner
---

diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index 14bf025199cea..102a4cf63fbed 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -1503,7 +1503,8 @@ Then run the following:
             self.remove_health_warning('HOST_IN_MAINTENANCE')
         else:
             s = "host is" if len(in_maintenance) == 1 else "hosts are"
-            self.set_health_warning("HOST_IN_MAINTENANCE", f"{len(in_maintenance)} {s} in maintenance mode", 1, [f"{h} is in maintenance" for h in in_maintenance])
+            self.set_health_warning("HOST_IN_MAINTENANCE", f"{len(in_maintenance)} {s} in maintenance mode", 1, [
+                                    f"{h} is in maintenance" for h in in_maintenance])
 
     @handle_orch_error
     @host_exists()
@@ -1941,33 +1942,89 @@ Then run the following:
 
     @handle_orch_error
     def zap_device(self, host: str, path: str) -> str:
+        """Zap a device on a managed host.
+
+        Use ceph-volume zap to return a device to an unused/free state.
+
+        Args:
+            host (str): hostname of the cluster host
+            path (str): device path
+
+        Raises:
+            OrchestratorError: host is not a cluster host
+            OrchestratorError: host is in maintenance and therefore unavailable
+            OrchestratorError: device path not found on the host
+            OrchestratorError: device is known to a different ceph cluster
+            OrchestratorError: device holds an active OSD
+            OrchestratorError: device cache hasn't been populated yet
+
+        Returns:
+            str: output from the zap command
+        """
+
         self.log.info('Zap device %s:%s' % (host, path))
-        out, err, code = CephadmServe(self)._run_cephadm(
-            host, 'osd.', 'ceph-volume',
-            ['--', 'lvm', 'list', path, '--format', 'json'],
-            error_ok=True)
-        if code:
-            raise OrchestratorError('Zap failed: ceph-volume lvm list %s' % '\n'.join(path))
-        osd_info = json.loads('\n'.join(out))
+
+        if host not in self.inventory.keys():
+            raise OrchestratorError(
+                f"Host '{host}' is not a member of the cluster")
 
-        for osd_id in osd_info.keys():
-            out, err, code = CephadmServe(self)._run_cephadm(
-                host, 'osd.', 'unit',
-                ['is-active', '--name', 'osd.%s' % osd_id],
-                error_ok=True)
-            if code == 0:
-                raise OrchestratorError('The systemctl unit of osd is active. Rm osd first')
+        host_info = self.inventory._inventory.get(host, {})
+        if host_info.get('status', '').lower() == 'maintenance':
+            raise OrchestratorError(
+                f"Host '{host}' is in maintenance mode, which prevents any actions against it.")
+
+        if host not in self.cache.devices:
+            raise OrchestratorError(
+                f"Host '{host}' hasn't been scanned yet to determine its inventory. "
+                "Please try again later.")
+
+        host_devices = self.cache.devices[host]
+        path_found = False
+        osd_id_list: List[str] = []
+
+        for dev in host_devices:
+            if dev.path == path:
+                # match, so look a little deeper
+                if dev.lvs:
+                    for lv in cast(List[Dict[str, str]], dev.lvs):
+                        if lv.get('osd_id', ''):
+                            lv_fsid = lv.get('cluster_fsid')
+                            if lv_fsid != self._cluster_fsid:
+                                raise OrchestratorError(
+                                    f"device {path} has LVs from a different Ceph cluster ({lv_fsid})")
+                            osd_id_list.append(lv.get('osd_id', ''))
+                path_found = True
+                break
+        if not path_found:
+            raise OrchestratorError(
+                f"Device path '{path}' not found on host '{host}'")
+
+        if osd_id_list:
+            dev_name = os.path.basename(path)
+            active_osds: List[str] = []
+            for osd_id in osd_id_list:
+                metadata = self.get_metadata('osd', str(osd_id))
+                if metadata:
+                    if metadata.get('hostname', '') == host and dev_name in metadata.get('devices', '').split(','):
+                        active_osds.append("osd." + osd_id)
+            if active_osds:
+                raise OrchestratorError(
+                    f"Unable to zap: device '{path}' on {host} has {len(active_osds)} active "
+                    f"OSD{'s' if len(active_osds) > 1 else ''}"
+                    f" ({', '.join(active_osds)}). Use 'ceph orch osd rm' first.")
 
         out, err, code = CephadmServe(self)._run_cephadm(
             host, 'osd', 'ceph-volume',
             ['--', 'lvm', 'zap', '--destroy', path],
            error_ok=True)
+
         self.cache.invalidate_host_devices(host)
         self.cache.invalidate_host_networks(host)
         if code:
             raise OrchestratorError('Zap failed: %s' % '\n'.join(out + err))
-        return '\n'.join(out + err)
+        msg = f'zap successful for {path} on {host}'
+        self.log.info(msg)
+
+        return msg + '\n'
 
     @handle_orch_error
     def blink_device_light(self, ident_fault: str, on: bool, locs: List[orchestrator.DeviceLightLoc]) -> List[str]:
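
Note: the sketch below restates the validation order introduced by this patch as a small, self-contained function, purely for illustration. The plain dict/list data shapes, the ZapError exception, and the validate_zap_target name are assumptions made for this note; the real implementation operates on cephadm's inventory, the per-host device cache, and OrchestratorError, exactly as shown in the diff above.

from typing import Dict, List


class ZapError(Exception):
    """Stand-in for OrchestratorError, to keep this sketch self-contained."""


def validate_zap_target(inventory: Dict[str, Dict],
                        device_cache: Dict[str, List[Dict]],
                        cluster_fsid: str,
                        host: str,
                        path: str) -> List[str]:
    """Run the zap pre-checks and return the OSD ids backed by `path`."""
    # 1. the host must be a member of the cluster
    if host not in inventory:
        raise ZapError(f"Host '{host}' is not a member of the cluster")

    # 2. the host must not be in maintenance mode
    if inventory[host].get('status', '').lower() == 'maintenance':
        raise ZapError(f"Host '{host}' is in maintenance mode")

    # the device inventory for the host must already be populated
    if host not in device_cache:
        raise ZapError(f"Host '{host}' hasn't been scanned yet")

    # 3. the device path must exist on the host
    devices = {d.get('path'): d for d in device_cache[host]}
    if path not in devices:
        raise ZapError(f"Device path '{path}' not found on host '{host}'")

    # 5. any LVs on the device must belong to this cluster; collect their
    # OSD ids so the caller can apply check 4 (no active OSD) against
    # the mgr's OSD metadata.
    osd_ids: List[str] = []
    for lv in devices[path].get('lvs', []):
        if lv.get('osd_id'):
            if lv.get('cluster_fsid') != cluster_fsid:
                raise ZapError(
                    f"device {path} has LVs from a different Ceph cluster")
            osd_ids.append(lv['osd_id'])
    return osd_ids

The checks run from cheapest to most specific: cluster membership and maintenance state first, then the cached device list, then per-LV ownership. The active-OSD check in the patch additionally consults OSD metadata ('hostname' and 'devices') from the mgr, so the sketch only returns the OSD ids it finds and leaves that final step to the caller. From the CLI the path is exercised with `ceph orch device zap <host> <path> --force`, which on success now prints the completion message instead of raw ceph-volume output.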