From c546948eba826e762f8bb049655cc9a928387240 Mon Sep 17 00:00:00 2001 From: Timothy Q Nguyen Date: Wed, 11 Mar 2026 11:45:38 -0700 Subject: [PATCH] src/ceph-volume: fast device unavailable as error Normally, when fast devices are passed to the batch command but no fast allocations can be found, the batch command does nothing and returns an empty plan. However, returning an empty plan makes the failure silent, which makes it hard to debug in certain scenarios. I propose changing this to raise an error, and have made changes in osd.py to better log the errors and process the exceptions. This shouldn't affect other processes much, and the change in osd.py ensures that the raised errors will not interrupt the returned output. I've also changed the unit tests to account for this change. Signed-off-by: Timothy Q Nguyen (cherry picked from commit 262175b107a86a0a330629645b4bc7a00a4fe047) --- .../ceph_volume/devices/lvm/batch.py | 8 +- .../tests/devices/lvm/test_batch.py | 74 +++++++++---------- src/pybind/mgr/cephadm/services/osd.py | 10 ++- 3 files changed, 44 insertions(+), 48 deletions(-) diff --git a/src/ceph-volume/ceph_volume/devices/lvm/batch.py b/src/ceph-volume/ceph_volume/devices/lvm/batch.py index 9ed46d7afc32..ff97009a61d9 100644 --- a/src/ceph-volume/ceph_volume/devices/lvm/batch.py +++ b/src/ceph-volume/ceph_volume/devices/lvm/batch.py @@ -438,8 +438,8 @@ class Batch(object): num_osds, fast_type) if fast_devices and not fast_allocations: - mlogger.info('{} fast devices were passed, but none are available'.format(len(fast_devices))) - return [] + mlogger.error('{} fast devices were passed, but none are available'.format(len(fast_devices))) + exit(1) if fast_devices and not len(fast_allocations) == num_osds: mlogger.error('{} fast allocations != {} num_osds'.format( len(fast_allocations), num_osds)) @@ -450,8 +450,8 @@ class Batch(object): num_osds, 'block_wal') if very_fast_devices and not very_fast_allocations: - mlogger.info('{} very fast devices 
were passed, but none are available'.format(len(very_fast_devices))) - return [] + mlogger.error('{} very fast devices were passed, but none are available'.format(len(very_fast_devices))) + exit(1) if very_fast_devices and not len(very_fast_allocations) == num_osds: mlogger.error('{} very fast allocations != {} num_osds'.format( len(very_fast_allocations), num_osds)) diff --git a/src/ceph-volume/ceph_volume/tests/devices/lvm/test_batch.py b/src/ceph-volume/ceph_volume/tests/devices/lvm/test_batch.py index 79c9f7122743..0300cb772d4c 100644 --- a/src/ceph-volume/ceph_volume/tests/devices/lvm/test_batch.py +++ b/src/ceph-volume/ceph_volume/tests/devices/lvm/test_batch.py @@ -43,73 +43,73 @@ class TestBatch(object): with pytest.raises(ArgumentError): arg_validators.ValidBatchDevice()('foo') - @pytest.mark.parametrize('format_', ['pretty', 'json', 'json-pretty']) - def test_report(self, format_, factory, conf_ceph_stub, mock_device_generator): - # just ensure reporting works + def test_exit_on_unavailable_fast_allocation(self, factory, conf_ceph_stub, mock_device_generator): conf_ceph_stub('[global]\nfsid=asdf-lkjh') devs = [mock_device_generator() for _ in range(5)] + fast_devs = [mock_device_generator()] + fast_devs[0].available_lvm = False args = factory(data_slots=1, osds_per_device=1, osd_ids=[], - report=True, - format=format_, devices=devs, - db_devices=[], + db_devices=fast_devs, wal_devices=[], objectstore='bluestore', - block_db_size=disk.Size(gb=1), - block_db_slots=1, + block_db_size="1G", + block_db_slots=1.0, dmcrypt=True, data_allocate_fraction=1.0, has_block_db_size_without_db_devices=None ) b = batch.Batch([]) b.args = args - plan = b.get_deployment_layout() - b.report(plan) + with pytest.raises(SystemExit) as err: + b.get_deployment_layout() + assert err.value.code == 1 - @pytest.mark.parametrize('format_', ['json', 'json-pretty']) - def test_json_report_valid_empty(self, format_, factory, conf_ceph_stub, mock_device_generator): + def 
test_exit_on_unavailable_very_fast_allocation(self, factory, conf_ceph_stub, mock_device_generator): # ensure json reports are valid when empty conf_ceph_stub('[global]\nfsid=asdf-lkjh') - devs = [] + devs = [mock_device_generator() for _ in range(5)] + fast_devs = [mock_device_generator()] + fast_devs[0].available_lvm = False + very_fast_devs = [mock_device_generator()] + very_fast_devs[0].available_lvm = False args = factory(data_slots=1, osds_per_device=1, osd_ids=[], - report=True, - format=format_, devices=devs, - db_devices=[], - wal_devices=[], + db_devices=fast_devs, + wal_devices=very_fast_devs, objectstore='bluestore', block_db_size="1G", + block_db_slots=5, dmcrypt=True, data_allocate_fraction=1.0, + has_block_db_size_without_db_devices=None ) b = batch.Batch([]) b.args = args - plan = b.get_deployment_layout() - report = b._create_report(plan) - json.loads(report) + with pytest.raises(SystemExit) as err: + b.get_deployment_layout() + assert err.value.code == 1 - @pytest.mark.parametrize('format_', ['json', 'json-pretty']) - def test_json_report_valid_empty_unavailable_fast(self, format_, factory, conf_ceph_stub, mock_device_generator): - # ensure json reports are valid when empty + @pytest.mark.parametrize('format_', ['pretty', 'json', 'json-pretty']) + def test_report(self, format_, factory, conf_ceph_stub, mock_device_generator): + # just ensure reporting works conf_ceph_stub('[global]\nfsid=asdf-lkjh') devs = [mock_device_generator() for _ in range(5)] - fast_devs = [mock_device_generator()] - fast_devs[0].available_lvm = False args = factory(data_slots=1, osds_per_device=1, osd_ids=[], report=True, format=format_, devices=devs, - db_devices=fast_devs, + db_devices=[], wal_devices=[], objectstore='bluestore', - block_db_size="1G", - block_db_slots=1.0, + block_db_size=disk.Size(gb=1), + block_db_slots=1, dmcrypt=True, data_allocate_fraction=1.0, has_block_db_size_without_db_devices=None @@ -117,33 +117,25 @@ class TestBatch(object): b = 
batch.Batch([]) b.args = args plan = b.get_deployment_layout() - report = b._create_report(plan) - json.loads(report) - + b.report(plan) @pytest.mark.parametrize('format_', ['json', 'json-pretty']) - def test_json_report_valid_empty_unavailable_very_fast(self, format_, factory, conf_ceph_stub, mock_device_generator): + def test_json_report_valid_empty(self, format_, factory, conf_ceph_stub, mock_device_generator): # ensure json reports are valid when empty conf_ceph_stub('[global]\nfsid=asdf-lkjh') - devs = [mock_device_generator() for _ in range(5)] - fast_devs = [mock_device_generator()] - fast_devs[0].available_lvm = False - very_fast_devs = [mock_device_generator()] - very_fast_devs[0].available_lvm = False + devs = [] args = factory(data_slots=1, osds_per_device=1, osd_ids=[], report=True, format=format_, devices=devs, - db_devices=fast_devs, - wal_devices=very_fast_devs, + db_devices=[], + wal_devices=[], objectstore='bluestore', block_db_size="1G", - block_db_slots=5, dmcrypt=True, data_allocate_fraction=1.0, - has_block_db_size_without_db_devices=None ) b = batch.Batch([]) b.args = args diff --git a/src/pybind/mgr/cephadm/services/osd.py b/src/pybind/mgr/cephadm/services/osd.py index 60a399149f9f..49a8d9df1d75 100644 --- a/src/pybind/mgr/cephadm/services/osd.py +++ b/src/pybind/mgr/cephadm/services/osd.py @@ -72,14 +72,18 @@ class OSDService(CephService): self.mgr.cache.save_host(host) return ret_msg - async def all_hosts() -> List[Optional[str]]: + async def all_hosts() -> List[str]: futures = [create_from_spec_one(h, ds) for h, ds in self.prepare_drivegroup(drive_group)] - return await gather(*futures) + results = await gather(*futures, return_exceptions=True) + for result in results: + if isinstance(result, Exception): + self.mgr.log.error(f'Failed to create OSD: {result}') + return [result for result in results if isinstance(result, str)] with self.mgr.async_timeout_handler('cephadm deploy (osd daemon)'): ret = self.mgr.wait_async(all_hosts()) - return 
", ".join(filter(None, ret)) + return ", ".join(ret) async def create_single_host(self, drive_group: DriveGroupSpec, -- 2.47.3