From d2c4e639e142a1c8c66d2dcd1a1ca7e1de869a28 Mon Sep 17 00:00:00 2001 From: Kushal Deb Date: Tue, 18 Feb 2025 11:12:02 +0530 Subject: [PATCH] cephadm: Provide user friendly error message if osd device path is invalid Signed-off-by: Kushal Deb --- qa/tasks/cephadm.py | 12 +++++- src/pybind/mgr/cephadm/module.py | 45 +++++++++++++++++++- src/pybind/mgr/cephadm/tests/test_cephadm.py | 4 +- src/pybind/mgr/orchestrator/_interface.py | 2 +- src/pybind/mgr/orchestrator/module.py | 5 ++- 5 files changed, 61 insertions(+), 7 deletions(-) diff --git a/qa/tasks/cephadm.py b/qa/tasks/cephadm.py index 0eb7c839614..daf26c61721 100644 --- a/qa/tasks/cephadm.py +++ b/qa/tasks/cephadm.py @@ -1117,6 +1117,7 @@ def ceph_osds(ctx, config): cur = 0 raw = config.get('raw-osds', False) + use_skip_validation = True for osd_id in sorted(id_to_remote.keys()): if raw: raise ConfigError( @@ -1151,7 +1152,16 @@ def ceph_osds(ctx, config): osd_method = config.get('osd_method') if osd_method: add_osd_args.append(osd_method) - _shell(ctx, cluster_name, remote, add_osd_args) + if use_skip_validation: + try: + _shell(ctx, cluster_name, remote, add_osd_args + ['--skip-validation']) + except Exception as e: + log.warning(f"--skip-validation falied with error {e}. Retrying without it") + use_skip_validation = False + _shell(ctx, cluster_name, remote, add_osd_args) + else: + _shell(ctx, cluster_name, remote, add_osd_args) + ctx.daemons.register_daemon( remote, 'osd', id_, cluster=cluster_name, diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 4f01b78903d..566e9841f26 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -2979,8 +2979,45 @@ Then run the following: self.spec_store.save(osd_default_spec) self.apply([osd_default_spec]) + def validate_device(self, host_name: str, drive_group: DriveGroupSpec) -> str: + """ + Validates whether the specified device exists and is available for OSD creation. + Returns: + str: An error message if validation fails; an empty string if validation passes. + """ + try: + + if not drive_group.data_devices or not drive_group.data_devices.paths: + return "Error: No data devices specified." + + if self.cache.is_host_unreachable(host_name): + return f"Host {host_name} is not reachable (it may be offline or in maintenance mode)." + + host_cache = self.cache.devices.get(host_name, []) + if not host_cache: + return (f"Error: No devices found for host {host_name}. " + "You can check known devices with 'ceph orch device ls'. " + "If no devices appear, wait for an automatic refresh.") + + available_devices = { + dev.path: dev for dev in host_cache if dev.available + } + self.log.debug(f"Host {host_name} has {len(available_devices)} available devices.") + + for device in drive_group.data_devices.paths: + matching_device = next((dev for dev in host_cache if dev.path == device.path), None) + if not matching_device: + return f"Error: Device {device.path} is not found on host {host_name}" + if not matching_device.available: + return (f"Error: Device {device.path} is present but unavailable for OSD creation. " + f"Reason: {', '.join(matching_device.rejected_reasons) if matching_device.rejected_reasons else 'Unknown'}") + + return "" + except AttributeError as e: + return f"Error- Attribute issue: {e}" + @handle_orch_error - def create_osds(self, drive_group: DriveGroupSpec) -> str: + def create_osds(self, drive_group: DriveGroupSpec, skip_validation: bool = False) -> str: hosts: List[HostSpec] = self.inventory.all_specs() filtered_hosts: List[str] = drive_group.placement.filter_matching_hostspecs(hosts) if not filtered_hosts: @@ -2994,6 +3031,12 @@ Then run the following: self.create_osd_default_spec(drive_group) else: self.log.info("osd.default already exists.") + host_name = filtered_hosts[0] + if not skip_validation: + self.log.warning("Skipping the validation of device paths for osd daemon add command. Please make sure that the osd path is valid") + err_msg = self.validate_device(host_name, drive_group) + if err_msg: + return err_msg return self.osd_service.create_from_spec(drive_group) diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index 87fe08633fa..bbf9082804c 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -1208,7 +1208,7 @@ class TestCephadm(object): data_devices=DeviceSelection(paths=[''])) c = cephadm_module.create_osds(dg) out = wait(cephadm_module, c) - assert out == "Created no osd(s) on host test; already created?" + assert "Error: No devices found for host test." in out bad_dg = DriveGroupSpec(placement=PlacementSpec(host_pattern='invalid_host'), data_devices=DeviceSelection(paths=[''])) c = cephadm_module.create_osds(bad_dg) @@ -1222,7 +1222,7 @@ class TestCephadm(object): data_devices=DeviceSelection(paths=[''])) c = cephadm_module.create_osds(dg) out = wait(cephadm_module, c) - assert out == "Created no osd(s) on host test; already created?" + assert "Error: No devices found for host test." in out @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}')) @mock.patch('cephadm.services.osd.OSDService._run_ceph_volume_command') diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py index 0fc103344e3..90adece1856 100644 --- a/src/pybind/mgr/orchestrator/_interface.py +++ b/src/pybind/mgr/orchestrator/_interface.py @@ -737,7 +737,7 @@ class Orchestrator(object): # assert action in ["start", "stop", "reload, "restart", "redeploy"] raise NotImplementedError() - def create_osds(self, drive_group: DriveGroupSpec) -> OrchResult[str]: + def create_osds(self, drive_group: DriveGroupSpec, skip_validation: bool = False) -> OrchResult[str]: """ Create one or more OSDs within a single Drive Group. diff --git a/src/pybind/mgr/orchestrator/module.py b/src/pybind/mgr/orchestrator/module.py index 151616c8486..05327a52eee 100644 --- a/src/pybind/mgr/orchestrator/module.py +++ b/src/pybind/mgr/orchestrator/module.py @@ -1515,7 +1515,8 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule, @_cli_write_command('orch daemon add osd') def _daemon_add_osd(self, svc_arg: Optional[str] = None, - method: Optional[OSDMethod] = None) -> HandleCommandResult: + method: Optional[OSDMethod] = None, + skip_validation: bool = False) -> HandleCommandResult: """Create OSD daemon(s) on specified host and device(s) (e.g., ceph orch daemon add osd myhost:/dev/sdb)""" # Create one or more OSDs""" @@ -1570,7 +1571,7 @@ Usage: msg = f"Invalid 'host:device' spec: '{svc_arg}': {e}" + usage return HandleCommandResult(-errno.EINVAL, stderr=msg) - completion = self.create_osds(drive_group) + completion = self.create_osds(drive_group, skip_validation) raise_if_exception(completion) return HandleCommandResult(stdout=completion.result_str()) -- 2.47.3