]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
ceph-volume: skip redundant NVMe mkfs discards 66646/head
authorUjjawal Anand <ujjawal.anand@ibm.com>
Fri, 6 Feb 2026 11:26:58 +0000 (16:56 +0530)
committerGuillaume Abrioux <gabrioux@ibm.com>
Thu, 12 Feb 2026 15:43:52 +0000 (16:43 +0100)
- Avoid redundant discard during mkfs when discard is disabled
- Reduces mkfs time on large NVMe devices by skipping long running discard operations

Fixes: https://tracker.ceph.com/issues/74908
Signed-off-by: Ujjawal Anand <ujjawal.anand@ibm.com>
PendingReleaseNotes
src/ceph-volume/ceph_volume/objectstore/baseobjectstore.py
src/ceph-volume/ceph_volume/objectstore/raw.py
src/ceph-volume/ceph_volume/tests/objectstore/test_baseobjectstore.py
src/ceph-volume/ceph_volume/tests/objectstore/test_raw.py
src/ceph-volume/ceph_volume/tests/util/test_nvme.py [new file with mode: 0644]
src/ceph-volume/ceph_volume/util/nvme.py [new file with mode: 0644]

index 446297dad6179e1a3f8f3e387069f606ca6bd150..e5f12413838d24f586cf3a9778f08f82f297a7a7 100644 (file)
@@ -1,3 +1,6 @@
+* ceph-volume: Raw BlueStore OSD preparation now pre-formats NVMe devices and
+  skips the slower BlueStore discard phase,reducing mkfs time on
+  very large namespaces.
 * RGW: Bucket Logging suppports creating log buckets in EC pools.
   Implicit logging object commits are now performed asynchronously.
 * RGW: OpenSSL engine support is deprecated in favor of provider support.
index e3844efc2828e0c97663eb3707e472ea3b204bc2..aa7bca4e5e28cf65a88088d02c21ccb61a3a9afd 100644 (file)
@@ -51,6 +51,9 @@ class BaseObjectStore:
                 self.cephx_lockbox_secret = prepare_utils.create_key()
                 self.secrets['cephx_lockbox_secret'] = \
                     self.cephx_lockbox_secret
+        # If set, we skip mkfs-time discards by overriding bdev_enable_discard flag
+        # for the `ceph-osd --mkfs` command.
+        self.skip_mkfs_discard: bool = False
 
     def get_ptuuid(self, argument: str) -> str:
         uuid = disk.get_partuuid(argument)
@@ -161,6 +164,10 @@ class BaseObjectStore:
             '-i', self.osd_id,
             '--monmap', self.monmap,
         ]
+        # Skip mkfs discard if we have already formatted the device
+        # set bdev_enable_discard = false
+        if self.skip_mkfs_discard and self.objectstore == 'bluestore':
+            self.osd_mkfs_cmd.extend(['--bdev-enable-discard', 'false'])
         if self.cephx_secret is not None:
             self.osd_mkfs_cmd.extend(['--keyfile', '-'])
 
index 33e79d34c151c4d0e0d32a4436a5b184e980857d..25a577d3f95a80ca76338b0e574dd421caefc7e2 100644 (file)
@@ -6,6 +6,7 @@ from ceph_volume import terminal, decorators, conf, process
 from ceph_volume.util import system, disk
 from ceph_volume.util import prepare as prepare_utils
 from ceph_volume.util import encryption as encryption_utils
+from ceph_volume.util import nvme as nvme_utils
 from ceph_volume.util.device import Device
 from ceph_volume.devices.lvm.common import rollback_osd
 from ceph_volume.devices.raw.list import direct_report
@@ -98,6 +99,9 @@ class Raw(BaseObjectStore):
         self.osd_id = prepare_utils.create_id(
             self.osd_fsid, json.dumps(self.secrets), self.osd_id)
 
+        if self.precondition_block_device():
+            self.skip_mkfs_discard = True
+
         if self.encrypted:
             self.prepare_dmcrypt()
 
@@ -106,6 +110,15 @@ class Raw(BaseObjectStore):
         # prepare the osd filesystem
         self.osd_mkfs()
 
+    def precondition_block_device(self) -> bool:
+        """
+        Run a fast NVMe format on the main block device when applicable.
+        Returns True if the block device was formatted, False otherwise.
+        """
+        if not self.block_device_path:
+            return False
+        return nvme_utils.preformat(self.block_device_path)
+
     def _activate(self, osd_id: str, osd_fsid: str) -> None:
         # mount on tmpfs the osd directory
         self.osd_path = '/var/lib/ceph/osd/%s-%s' % (conf.cluster, osd_id)
index d34e8cde06faad30094df645488ada90b4b31a71..7a2f5cdf08a6b619e7f2e5c8da3b432e2d24d846 100644 (file)
@@ -150,6 +150,33 @@ class TestBaseObjectStore:
                           '--setuser', 'ceph',
                           '--setgroup', 'ceph']
 
+    @patch('ceph_volume.conf.cluster', 'ceph')
+    def test_build_osd_mkfs_cmd_disables_discard(self):
+        bo = BaseObjectStore([])
+        bo.osd_path = '/var/lib/ceph/osd/ceph-123/'
+        bo.osd_fsid = 'abcd-1234'
+        bo.objectstore = 'bluestore'
+        bo.osd_id = '123'
+        bo.monmap = '/etc/ceph/ceph.monmap'
+        bo.skip_mkfs_discard = True
+        result = bo.build_osd_mkfs_cmd()
+
+        assert result == ['ceph-osd',
+                          '--cluster',
+                          'ceph',
+                          '--osd-objectstore',
+                          'bluestore',
+                          '--mkfs', '-i', '123',
+                          '--monmap',
+                          '/etc/ceph/ceph.monmap',
+                          '--bdev-enable-discard', 'false',
+                          '--keyfile', '-',
+                          '--osd-data',
+                          '/var/lib/ceph/osd/ceph-123/',
+                          '--osd-uuid', 'abcd-1234',
+                          '--setuser', 'ceph',
+                          '--setgroup', 'ceph']
+
     def test_osd_mkfs_ok(self, monkeypatch, fake_call, objectstore):
         args = objectstore(dmcrypt=False)
         bo = BaseObjectStore(args)
@@ -171,6 +198,32 @@ class TestBaseObjectStore:
                 }
             ]
 
+    def test_osd_mkfs_disables_discard_via_bdev_flag(self, monkeypatch, objectstore):
+        args = objectstore(dmcrypt=False)
+        bo = BaseObjectStore(args)
+        bo.objectstore = 'bluestore'
+        bo.skip_mkfs_discard = True
+        bo.osd_id = '123'
+        bo.osd_fsid = 'abcd-1234'
+        bo.get_osd_path = lambda: '/var/lib/ceph/osd/ceph-123/'
+        monkeypatch.setattr(system, 'chown', lambda path: 0)
+        captured = {}
+
+        def fake_call(cmd, **kwargs):
+            captured['cmd'] = cmd
+            captured['env'] = kwargs.get('env')
+            return ([], [], 0)
+
+        monkeypatch.setattr('ceph_volume.objectstore.baseobjectstore.process.call', fake_call)
+
+        bo.osd_mkfs()
+
+        assert captured.get('env') is None
+        cmd = captured.get('cmd', [])
+        assert '--bdev-enable-discard' in cmd
+        i = cmd.index('--bdev-enable-discard')
+        assert cmd[i + 1] == 'false'
+
     @patch('ceph_volume.process.call', Mock(return_value=([], [], 999)))
     def test_osd_mkfs_fails(self, monkeypatch):
         bo = BaseObjectStore([])
index f49b09829d2501c76eb2f6b39f408fb235b96761..ba244d05e18dc82cda4aad3461d94e00a9719f6e 100644 (file)
@@ -93,6 +93,54 @@ class TestRaw:
         assert self.raw_bs.osd_mkfs.called
         assert self.raw_bs.prepare_dmcrypt.called
 
+    @patch('ceph_volume.objectstore.raw.nvme_utils.preformat', return_value=True)
+    @patch('ceph_volume.objectstore.raw.prepare_utils.create_id')
+    @patch('ceph_volume.objectstore.raw.system.generate_uuid')
+    def test_prepare_sets_discard_flag_after_nvme_format(self,
+                                                         m_generate_uuid,
+                                                         m_create_id,
+                                                         m_nvme,
+                                                         is_root,
+                                                         factory):
+        m_generate_uuid.return_value = 'fake-uuid'
+        m_create_id.return_value = MagicMock()
+        self.raw_bs.prepare_dmcrypt = MagicMock()
+        self.raw_bs.prepare_osd_req = MagicMock()
+        self.raw_bs.osd_mkfs = MagicMock()
+        args = factory(no_tmpfs=True, crush_device_class=None)
+        args.data = '/dev/nvme0n1'
+        self.raw_bs.args = args
+        self.raw_bs.block_device_path = args.data
+        self.raw_bs.secrets = dict()
+        self.raw_bs.encrypted = False
+        self.raw_bs.prepare()
+        assert self.raw_bs.skip_mkfs_discard is True
+        m_nvme.assert_called_once_with(args.data)
+
+    @patch('ceph_volume.objectstore.raw.nvme_utils.preformat', return_value=False)
+    @patch('ceph_volume.objectstore.raw.prepare_utils.create_id')
+    @patch('ceph_volume.objectstore.raw.system.generate_uuid')
+    def test_prepare_keeps_discard_when_no_nvme_format(self,
+                                                       m_generate_uuid,
+                                                       m_create_id,
+                                                       m_nvme,
+                                                       is_root,
+                                                       factory):
+        m_generate_uuid.return_value = 'fake-uuid'
+        m_create_id.return_value = MagicMock()
+        self.raw_bs.prepare_dmcrypt = MagicMock()
+        self.raw_bs.prepare_osd_req = MagicMock()
+        self.raw_bs.osd_mkfs = MagicMock()
+        args = factory(no_tmpfs=True, crush_device_class=None)
+        args.data = '/dev/nvme0n1'
+        self.raw_bs.args = args
+        self.raw_bs.block_device_path = args.data
+        self.raw_bs.secrets = dict()
+        self.raw_bs.encrypted = False
+        self.raw_bs.prepare()
+        assert self.raw_bs.skip_mkfs_discard is False
+        m_nvme.assert_called_once_with(args.data)
+
     @patch('ceph_volume.conf.cluster', 'ceph')
     @patch('ceph_volume.objectstore.raw.prepare_utils.link_wal')
     @patch('ceph_volume.objectstore.raw.prepare_utils.link_db')
diff --git a/src/ceph-volume/ceph_volume/tests/util/test_nvme.py b/src/ceph-volume/ceph_volume/tests/util/test_nvme.py
new file mode 100644 (file)
index 0000000..9a36cd7
--- /dev/null
@@ -0,0 +1,42 @@
+from unittest.mock import patch
+
+from ceph_volume.util import nvme
+
+
+class TestNvmePreformat:
+    @patch('ceph_volume.util.nvme.process.call')
+    def test_non_nvme_device_skips_preformat(self, m_call):
+        assert nvme.preformat('/dev/sda') is False
+        m_call.assert_not_called()
+
+    @patch('ceph_volume.util.nvme.disk.is_partition', return_value=False)
+    @patch('ceph_volume.util.nvme.disk.is_device', return_value=True)
+    @patch('ceph_volume.util.nvme.process.call', return_value=([], [], 0))
+    def test_preformat_invokes_nvme_cli(self, m_call, *_m_disk):
+        assert nvme.preformat('/dev/nvme0n1') is True
+        m_call.assert_called_once_with(
+            ['nvme', 'format', '/dev/nvme0n1', '--force'],
+            run_on_host=False,
+            show_command=True,
+            terminal_verbose=True,
+            verbose_on_failure=True
+        )
+
+    @patch('ceph_volume.util.nvme.disk.is_partition', return_value=False)
+    @patch('ceph_volume.util.nvme.disk.is_device', return_value=True)
+    @patch('ceph_volume.util.nvme.process.call', return_value=([], [], 1))
+    def test_preformat_handles_non_zero_rc(self, m_call, *_m_disk):
+        assert nvme.preformat('/dev/nvme0n1') is False
+        assert m_call.called
+
+    @patch('ceph_volume.util.nvme.disk.is_partition', return_value=False)
+    @patch('ceph_volume.util.nvme.disk.is_device', return_value=True)
+    @patch('ceph_volume.util.nvme.process.call', side_effect=FileNotFoundError('missing nvme'))
+    def test_preformat_handles_missing_cli(self, m_call, *_m_disk):
+        assert nvme.preformat('/dev/nvme0n1') is False
+        assert m_call.called
+
+    @patch('ceph_volume.util.nvme.disk.is_partition', return_value=True)
+    def test_partition_is_not_formatted(self, *_):
+        assert nvme.preformat('/dev/nvme0n1p1') is False
+
diff --git a/src/ceph-volume/ceph_volume/util/nvme.py b/src/ceph-volume/ceph_volume/util/nvme.py
new file mode 100644 (file)
index 0000000..120f75c
--- /dev/null
@@ -0,0 +1,82 @@
+import logging
+import os
+
+from ceph_volume import process, terminal
+from ceph_volume.util import disk
+
+logger = logging.getLogger(__name__)
+
+
+def resolve(device: str) -> str:
+    """
+    Resolve the device path.
+
+    We expect a valid 'device' here. If it's missing, that's a caller bug,
+    so fail fast instead.
+    """
+    if not device:
+        raise ValueError('device path is required')
+    return os.path.realpath(device)
+
+
+def is_namespace(resolved_device: str) -> bool:
+    """
+    Return True if this looks like a whole NVMe namespace we can format.
+
+    We only format whole NVMe devices (e.g. /dev/nvme0n1). Partitions like
+    /dev/nvme0n1p1 and non-block-device paths are intentionally skipped.
+    """
+    if not resolved_device.startswith('/dev/nvme'):
+        return False
+    if not disk.is_device(resolved_device):
+        # disk.is_device() already excludes partitions
+        logger.info('Skipping NVMe format for non-whole-disk device %s', resolved_device)
+        return False
+    return True
+
+
+def format(resolved_device: str) -> bool:
+    """
+    Best-effort NVMe namespace format.
+
+    Returns True only if `nvme format` succeeds. Otherwise return False and
+    fall back to the normal mkfs flow.
+    """
+    # When ceph-volume runs inside a container, it sets I_AM_IN_A_CONTAINER=1.
+    run_on_host = bool(os.environ.get('I_AM_IN_A_CONTAINER', ''))
+    command = ['nvme', 'format', resolved_device, '--force']
+    logger.info('Formatting NVMe namespace %s prior to ceph-volume mkfs', resolved_device)
+    try:
+        _, _, rc = process.call(
+            command,
+            run_on_host=run_on_host,
+            show_command=True,
+            terminal_verbose=True,
+            verbose_on_failure=True,
+        )
+    except (FileNotFoundError, PermissionError) as exc:
+        logger.warning('Unable to execute nvme CLI for %s: %s', resolved_device, exc)
+        return False
+    if rc != 0:
+        logger.warning(
+            'nvme format failed for %s (rc=%s); using default mkfs workflow',
+            resolved_device,
+            rc,
+        )
+        return False
+    terminal.info('nvme format completed for {}'.format(resolved_device))
+    return True
+
+
+def preformat(device: str) -> bool:
+    """
+    Resolve, validate, then format an NVMe namespace (when applicable).
+
+    This is the main entrypoint used by ceph-volume: it returns True only
+    when we actually formatted the device.
+    """
+    resolved_device = resolve(device)
+    if not is_namespace(resolved_device):
+        return False
+    return format(resolved_device)
+