]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
cephadm: Provide user friendly error message if osd device path is invalid 62023/head
authorKushal Deb <Kushal.Deb@ibm.com>
Tue, 18 Feb 2025 05:42:02 +0000 (11:12 +0530)
committerKushal Deb <Kushal.Deb@ibm.com>
Mon, 14 Apr 2025 02:50:58 +0000 (08:20 +0530)
Signed-off-by: Kushal Deb <Kushal.Deb@ibm.com>
qa/tasks/cephadm.py
src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/tests/test_cephadm.py
src/pybind/mgr/orchestrator/_interface.py
src/pybind/mgr/orchestrator/module.py

index 0eb7c839614ea45fa210e4e28ee1c7a4a2fe24a8..daf26c617219fa464cfae36aa3a9af4e1a6c35fc 100644 (file)
@@ -1117,6 +1117,7 @@ def ceph_osds(ctx, config):
 
         cur = 0
         raw = config.get('raw-osds', False)
+        use_skip_validation = True
         for osd_id in sorted(id_to_remote.keys()):
             if raw:
                 raise ConfigError(
@@ -1151,7 +1152,16 @@ def ceph_osds(ctx, config):
             osd_method = config.get('osd_method')
             if osd_method:
                 add_osd_args.append(osd_method)
-            _shell(ctx, cluster_name, remote, add_osd_args)
+            if use_skip_validation:
+                try:
+                    _shell(ctx, cluster_name, remote, add_osd_args + ['--skip-validation'])
+                except Exception as e:
+                    log.warning(f"--skip-validation failed with error {e}. Retrying without it")
+                    use_skip_validation = False
+                    _shell(ctx, cluster_name, remote, add_osd_args)
+            else:
+                _shell(ctx, cluster_name, remote, add_osd_args)
+
             ctx.daemons.register_daemon(
                 remote, 'osd', id_,
                 cluster=cluster_name,
index 4f01b78903d020e98bec949a401fcecc7cc10eb2..566e9841f2624e8e98fe0631ffe7bd502c555312 100644 (file)
@@ -2979,8 +2979,45 @@ Then run the following:
         self.spec_store.save(osd_default_spec)
         self.apply([osd_default_spec])
 
+    def validate_device(self, host_name: str, drive_group: DriveGroupSpec) -> str:
+        """
+        Validates whether the specified device exists and is available for OSD creation.
+        Returns:
+            str: An error message if validation fails; an empty string if validation passes.
+        """
+        try:
+
+            if not drive_group.data_devices or not drive_group.data_devices.paths:
+                return "Error: No data devices specified."
+
+            if self.cache.is_host_unreachable(host_name):
+                return f"Host {host_name} is not reachable (it may be offline or in maintenance mode)."
+
+            host_cache = self.cache.devices.get(host_name, [])
+            if not host_cache:
+                return (f"Error: No devices found for host {host_name}. "
+                        "You can check known devices with 'ceph orch device ls'. "
+                        "If no devices appear, wait for an automatic refresh.")
+
+            available_devices = {
+                dev.path: dev for dev in host_cache if dev.available
+            }
+            self.log.debug(f"Host {host_name} has {len(available_devices)} available devices.")
+
+            for device in drive_group.data_devices.paths:
+                matching_device = next((dev for dev in host_cache if dev.path == device.path), None)
+                if not matching_device:
+                    return f"Error: Device {device.path} is not found on host {host_name}"
+                if not matching_device.available:
+                    return (f"Error: Device {device.path} is present but unavailable for OSD creation. "
+                            f"Reason: {', '.join(matching_device.rejected_reasons) if matching_device.rejected_reasons else 'Unknown'}")
+
+            return ""
+        except AttributeError as e:
+            return f"Error: Attribute issue: {e}"
+
     @handle_orch_error
-    def create_osds(self, drive_group: DriveGroupSpec) -> str:
+    def create_osds(self, drive_group: DriveGroupSpec, skip_validation: bool = False) -> str:
         hosts: List[HostSpec] = self.inventory.all_specs()
         filtered_hosts: List[str] = drive_group.placement.filter_matching_hostspecs(hosts)
         if not filtered_hosts:
@@ -2994,6 +3031,12 @@ Then run the following:
             self.create_osd_default_spec(drive_group)
         else:
             self.log.info("osd.default already exists.")
+        host_name = filtered_hosts[0]
+        if skip_validation:
+            self.log.warning("Skipping the validation of device paths for osd daemon add command. Please make sure that the osd path is valid")
+        else:
+            if err_msg := self.validate_device(host_name, drive_group):
+                return err_msg
 
         return self.osd_service.create_from_spec(drive_group)
 
index 87fe08633fa5861d46b9c530f2a091c689fd25e0..bbf9082804cb974d3776459d0cc39e07e511d0f1 100644 (file)
@@ -1208,7 +1208,7 @@ class TestCephadm(object):
                                 data_devices=DeviceSelection(paths=['']))
             c = cephadm_module.create_osds(dg)
             out = wait(cephadm_module, c)
-            assert out == "Created no osd(s) on host test; already created?"
+            assert "Error: No devices found for host test." in out
             bad_dg = DriveGroupSpec(placement=PlacementSpec(host_pattern='invalid_host'),
                                     data_devices=DeviceSelection(paths=['']))
             c = cephadm_module.create_osds(bad_dg)
@@ -1222,7 +1222,7 @@ class TestCephadm(object):
                                 data_devices=DeviceSelection(paths=['']))
             c = cephadm_module.create_osds(dg)
             out = wait(cephadm_module, c)
-            assert out == "Created no osd(s) on host test; already created?"
+            assert "Error: No devices found for host test." in out
 
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
     @mock.patch('cephadm.services.osd.OSDService._run_ceph_volume_command')
index 0fc103344e39f0e5af7ab1fe677d7f0874455752..90adece1856bf520b44427aefad2722c9f9a60f6 100644 (file)
@@ -737,7 +737,7 @@ class Orchestrator(object):
         # assert action in ["start", "stop", "reload, "restart", "redeploy"]
         raise NotImplementedError()
 
-    def create_osds(self, drive_group: DriveGroupSpec) -> OrchResult[str]:
+    def create_osds(self, drive_group: DriveGroupSpec, skip_validation: bool = False) -> OrchResult[str]:
         """
         Create one or more OSDs within a single Drive Group.
 
index 151616c84860d8c4c69d47812db4684bd18c2310..05327a52eeebdce3cb1a52359f7b422c50a23713 100644 (file)
@@ -1515,7 +1515,8 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule,
     @_cli_write_command('orch daemon add osd')
     def _daemon_add_osd(self,
                         svc_arg: Optional[str] = None,
-                        method: Optional[OSDMethod] = None) -> HandleCommandResult:
+                        method: Optional[OSDMethod] = None,
+                        skip_validation: bool = False) -> HandleCommandResult:
         """Create OSD daemon(s) on specified host and device(s) (e.g., ceph orch daemon add osd myhost:/dev/sdb)"""
         # Create one or more OSDs"""
 
@@ -1570,7 +1571,7 @@ Usage:
             msg = f"Invalid 'host:device' spec: '{svc_arg}': {e}" + usage
             return HandleCommandResult(-errno.EINVAL, stderr=msg)
 
-        completion = self.create_osds(drive_group)
+        completion = self.create_osds(drive_group, skip_validation)
         raise_if_exception(completion)
         return HandleCommandResult(stdout=completion.result_str())