From: Ujjawal Anand Date: Fri, 6 Feb 2026 11:26:58 +0000 (+0530) Subject: ceph-volume: skip redundant NVMe mkfs discards X-Git-Tag: testing/wip-vshankar-testing-20260213.071255~2^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=daebcfb8944789a47da20ffec3e03a5b2737f711;p=ceph-ci.git ceph-volume: skip redundant NVMe mkfs discards - Avoid redundant discard during mkfs when discard is disabled - Reduces mkfs time on large NVMe devices by skipping long running discard operations Fixes: https://tracker.ceph.com/issues/74908 Signed-off-by: Ujjawal Anand --- diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 446297dad61..e5f12413838 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -1,3 +1,6 @@ +* ceph-volume: Raw BlueStore OSD preparation now pre-formats NVMe devices and + skips the slower BlueStore discard phase,reducing mkfs time on + very large namespaces. * RGW: Bucket Logging suppports creating log buckets in EC pools. Implicit logging object commits are now performed asynchronously. * RGW: OpenSSL engine support is deprecated in favor of provider support. diff --git a/src/ceph-volume/ceph_volume/objectstore/baseobjectstore.py b/src/ceph-volume/ceph_volume/objectstore/baseobjectstore.py index e3844efc282..aa7bca4e5e2 100644 --- a/src/ceph-volume/ceph_volume/objectstore/baseobjectstore.py +++ b/src/ceph-volume/ceph_volume/objectstore/baseobjectstore.py @@ -51,6 +51,9 @@ class BaseObjectStore: self.cephx_lockbox_secret = prepare_utils.create_key() self.secrets['cephx_lockbox_secret'] = \ self.cephx_lockbox_secret + # If set, we skip mkfs-time discards by overriding bdev_enable_discard flag + # for the `ceph-osd --mkfs` command. + self.skip_mkfs_discard: bool = False def get_ptuuid(self, argument: str) -> str: uuid = disk.get_partuuid(argument) @@ -161,6 +164,10 @@ class BaseObjectStore: '-i', self.osd_id, '--monmap', self.monmap, ] + # Skip mkfs discard if we have already formatted the device + # set bdev_enable_discard = false + if self.skip_mkfs_discard and self.objectstore == 'bluestore': + self.osd_mkfs_cmd.extend(['--bdev-enable-discard', 'false']) if self.cephx_secret is not None: self.osd_mkfs_cmd.extend(['--keyfile', '-']) diff --git a/src/ceph-volume/ceph_volume/objectstore/raw.py b/src/ceph-volume/ceph_volume/objectstore/raw.py index 33e79d34c15..25a577d3f95 100644 --- a/src/ceph-volume/ceph_volume/objectstore/raw.py +++ b/src/ceph-volume/ceph_volume/objectstore/raw.py @@ -6,6 +6,7 @@ from ceph_volume import terminal, decorators, conf, process from ceph_volume.util import system, disk from ceph_volume.util import prepare as prepare_utils from ceph_volume.util import encryption as encryption_utils +from ceph_volume.util import nvme as nvme_utils from ceph_volume.util.device import Device from ceph_volume.devices.lvm.common import rollback_osd from ceph_volume.devices.raw.list import direct_report @@ -98,6 +99,9 @@ class Raw(BaseObjectStore): self.osd_id = prepare_utils.create_id( self.osd_fsid, json.dumps(self.secrets), self.osd_id) + if self.precondition_block_device(): + self.skip_mkfs_discard = True + if self.encrypted: self.prepare_dmcrypt() @@ -106,6 +110,15 @@ class Raw(BaseObjectStore): # prepare the osd filesystem self.osd_mkfs() + def precondition_block_device(self) -> bool: + """ + Run a fast NVMe format on the main block device when applicable. + Returns True if the block device was formatted, False otherwise. + """ + if not self.block_device_path: + return False + return nvme_utils.preformat(self.block_device_path) + def _activate(self, osd_id: str, osd_fsid: str) -> None: # mount on tmpfs the osd directory self.osd_path = '/var/lib/ceph/osd/%s-%s' % (conf.cluster, osd_id) diff --git a/src/ceph-volume/ceph_volume/tests/objectstore/test_baseobjectstore.py b/src/ceph-volume/ceph_volume/tests/objectstore/test_baseobjectstore.py index d34e8cde06f..7a2f5cdf08a 100644 --- a/src/ceph-volume/ceph_volume/tests/objectstore/test_baseobjectstore.py +++ b/src/ceph-volume/ceph_volume/tests/objectstore/test_baseobjectstore.py @@ -150,6 +150,33 @@ class TestBaseObjectStore: '--setuser', 'ceph', '--setgroup', 'ceph'] + @patch('ceph_volume.conf.cluster', 'ceph') + def test_build_osd_mkfs_cmd_disables_discard(self): + bo = BaseObjectStore([]) + bo.osd_path = '/var/lib/ceph/osd/ceph-123/' + bo.osd_fsid = 'abcd-1234' + bo.objectstore = 'bluestore' + bo.osd_id = '123' + bo.monmap = '/etc/ceph/ceph.monmap' + bo.skip_mkfs_discard = True + result = bo.build_osd_mkfs_cmd() + + assert result == ['ceph-osd', + '--cluster', + 'ceph', + '--osd-objectstore', + 'bluestore', + '--mkfs', '-i', '123', + '--monmap', + '/etc/ceph/ceph.monmap', + '--bdev-enable-discard', 'false', + '--keyfile', '-', + '--osd-data', + '/var/lib/ceph/osd/ceph-123/', + '--osd-uuid', 'abcd-1234', + '--setuser', 'ceph', + '--setgroup', 'ceph'] + def test_osd_mkfs_ok(self, monkeypatch, fake_call, objectstore): args = objectstore(dmcrypt=False) bo = BaseObjectStore(args) @@ -171,6 +198,32 @@ class TestBaseObjectStore: } ] + def test_osd_mkfs_disables_discard_via_bdev_flag(self, monkeypatch, objectstore): + args = objectstore(dmcrypt=False) + bo = BaseObjectStore(args) + bo.objectstore = 'bluestore' + bo.skip_mkfs_discard = True + bo.osd_id = '123' + bo.osd_fsid = 'abcd-1234' + bo.get_osd_path = lambda: '/var/lib/ceph/osd/ceph-123/' + monkeypatch.setattr(system, 'chown', lambda path: 0) + captured = {} + + def fake_call(cmd, **kwargs): + captured['cmd'] = cmd + captured['env'] = kwargs.get('env') + return ([], [], 0) + + monkeypatch.setattr('ceph_volume.objectstore.baseobjectstore.process.call', fake_call) + + bo.osd_mkfs() + + assert captured.get('env') is None + cmd = captured.get('cmd', []) + assert '--bdev-enable-discard' in cmd + i = cmd.index('--bdev-enable-discard') + assert cmd[i + 1] == 'false' + @patch('ceph_volume.process.call', Mock(return_value=([], [], 999))) def test_osd_mkfs_fails(self, monkeypatch): bo = BaseObjectStore([]) diff --git a/src/ceph-volume/ceph_volume/tests/objectstore/test_raw.py b/src/ceph-volume/ceph_volume/tests/objectstore/test_raw.py index f49b09829d2..ba244d05e18 100644 --- a/src/ceph-volume/ceph_volume/tests/objectstore/test_raw.py +++ b/src/ceph-volume/ceph_volume/tests/objectstore/test_raw.py @@ -93,6 +93,54 @@ class TestRaw: assert self.raw_bs.osd_mkfs.called assert self.raw_bs.prepare_dmcrypt.called + @patch('ceph_volume.objectstore.raw.nvme_utils.preformat', return_value=True) + @patch('ceph_volume.objectstore.raw.prepare_utils.create_id') + @patch('ceph_volume.objectstore.raw.system.generate_uuid') + def test_prepare_sets_discard_flag_after_nvme_format(self, + m_generate_uuid, + m_create_id, + m_nvme, + is_root, + factory): + m_generate_uuid.return_value = 'fake-uuid' + m_create_id.return_value = MagicMock() + self.raw_bs.prepare_dmcrypt = MagicMock() + self.raw_bs.prepare_osd_req = MagicMock() + self.raw_bs.osd_mkfs = MagicMock() + args = factory(no_tmpfs=True, crush_device_class=None) + args.data = '/dev/nvme0n1' + self.raw_bs.args = args + self.raw_bs.block_device_path = args.data + self.raw_bs.secrets = dict() + self.raw_bs.encrypted = False + self.raw_bs.prepare() + assert self.raw_bs.skip_mkfs_discard is True + m_nvme.assert_called_once_with(args.data) + + @patch('ceph_volume.objectstore.raw.nvme_utils.preformat', return_value=False) + @patch('ceph_volume.objectstore.raw.prepare_utils.create_id') + @patch('ceph_volume.objectstore.raw.system.generate_uuid') + def test_prepare_keeps_discard_when_no_nvme_format(self, + m_generate_uuid, + m_create_id, + m_nvme, + is_root, + factory): + m_generate_uuid.return_value = 'fake-uuid' + m_create_id.return_value = MagicMock() + self.raw_bs.prepare_dmcrypt = MagicMock() + self.raw_bs.prepare_osd_req = MagicMock() + self.raw_bs.osd_mkfs = MagicMock() + args = factory(no_tmpfs=True, crush_device_class=None) + args.data = '/dev/nvme0n1' + self.raw_bs.args = args + self.raw_bs.block_device_path = args.data + self.raw_bs.secrets = dict() + self.raw_bs.encrypted = False + self.raw_bs.prepare() + assert self.raw_bs.skip_mkfs_discard is False + m_nvme.assert_called_once_with(args.data) + @patch('ceph_volume.conf.cluster', 'ceph') @patch('ceph_volume.objectstore.raw.prepare_utils.link_wal') @patch('ceph_volume.objectstore.raw.prepare_utils.link_db') diff --git a/src/ceph-volume/ceph_volume/tests/util/test_nvme.py b/src/ceph-volume/ceph_volume/tests/util/test_nvme.py new file mode 100644 index 00000000000..9a36cd7df70 --- /dev/null +++ b/src/ceph-volume/ceph_volume/tests/util/test_nvme.py @@ -0,0 +1,42 @@ +from unittest.mock import patch + +from ceph_volume.util import nvme + + +class TestNvmePreformat: + @patch('ceph_volume.util.nvme.process.call') + def test_non_nvme_device_skips_preformat(self, m_call): + assert nvme.preformat('/dev/sda') is False + m_call.assert_not_called() + + @patch('ceph_volume.util.nvme.disk.is_partition', return_value=False) + @patch('ceph_volume.util.nvme.disk.is_device', return_value=True) + @patch('ceph_volume.util.nvme.process.call', return_value=([], [], 0)) + def test_preformat_invokes_nvme_cli(self, m_call, *_m_disk): + assert nvme.preformat('/dev/nvme0n1') is True + m_call.assert_called_once_with( + ['nvme', 'format', '/dev/nvme0n1', '--force'], + run_on_host=False, + show_command=True, + terminal_verbose=True, + verbose_on_failure=True + ) + + @patch('ceph_volume.util.nvme.disk.is_partition', return_value=False) + @patch('ceph_volume.util.nvme.disk.is_device', return_value=True) + @patch('ceph_volume.util.nvme.process.call', return_value=([], [], 1)) + def test_preformat_handles_non_zero_rc(self, m_call, *_m_disk): + assert nvme.preformat('/dev/nvme0n1') is False + assert m_call.called + + @patch('ceph_volume.util.nvme.disk.is_partition', return_value=False) + @patch('ceph_volume.util.nvme.disk.is_device', return_value=True) + @patch('ceph_volume.util.nvme.process.call', side_effect=FileNotFoundError('missing nvme')) + def test_preformat_handles_missing_cli(self, m_call, *_m_disk): + assert nvme.preformat('/dev/nvme0n1') is False + assert m_call.called + + @patch('ceph_volume.util.nvme.disk.is_partition', return_value=True) + def test_partition_is_not_formatted(self, *_): + assert nvme.preformat('/dev/nvme0n1p1') is False + diff --git a/src/ceph-volume/ceph_volume/util/nvme.py b/src/ceph-volume/ceph_volume/util/nvme.py new file mode 100644 index 00000000000..120f75c8a2d --- /dev/null +++ b/src/ceph-volume/ceph_volume/util/nvme.py @@ -0,0 +1,82 @@ +import logging +import os + +from ceph_volume import process, terminal +from ceph_volume.util import disk + +logger = logging.getLogger(__name__) + + +def resolve(device: str) -> str: + """ + Resolve the device path. + + We expect a valid 'device' here. If it's missing, that's a caller bug, + so fail fast instead. + """ + if not device: + raise ValueError('device path is required') + return os.path.realpath(device) + + +def is_namespace(resolved_device: str) -> bool: + """ + Return True if this looks like a whole NVMe namespace we can format. + + We only format whole NVMe devices (e.g. /dev/nvme0n1). Partitions like + /dev/nvme0n1p1 and non-block-device paths are intentionally skipped. + """ + if not resolved_device.startswith('/dev/nvme'): + return False + if not disk.is_device(resolved_device): + # disk.is_device() already excludes partitions + logger.info('Skipping NVMe format for non-whole-disk device %s', resolved_device) + return False + return True + + +def format(resolved_device: str) -> bool: + """ + Best-effort NVMe namespace format. + + Returns True only if `nvme format` succeeds. Otherwise return False and + fall back to the normal mkfs flow. + """ + # When ceph-volume runs inside a container, it sets I_AM_IN_A_CONTAINER=1. + run_on_host = bool(os.environ.get('I_AM_IN_A_CONTAINER', '')) + command = ['nvme', 'format', resolved_device, '--force'] + logger.info('Formatting NVMe namespace %s prior to ceph-volume mkfs', resolved_device) + try: + _, _, rc = process.call( + command, + run_on_host=run_on_host, + show_command=True, + terminal_verbose=True, + verbose_on_failure=True, + ) + except (FileNotFoundError, PermissionError) as exc: + logger.warning('Unable to execute nvme CLI for %s: %s', resolved_device, exc) + return False + if rc != 0: + logger.warning( + 'nvme format failed for %s (rc=%s); using default mkfs workflow', + resolved_device, + rc, + ) + return False + terminal.info('nvme format completed for {}'.format(resolved_device)) + return True + + +def preformat(device: str) -> bool: + """ + Resolve, validate, then format an NVMe namespace (when applicable). + + This is the main entrypoint used by ceph-volume: it returns True only + when we actually formatted the device. + """ + resolved_device = resolve(device) + if not is_namespace(resolved_device): + return False + return format(resolved_device) +