+* ceph-volume: Raw BlueStore OSD preparation now pre-formats NVMe devices and
+ skips the slower BlueStore discard phase, reducing mkfs time on
+ very large namespaces.
* RGW: Bucket Logging suppports creating log buckets in EC pools.
Implicit logging object commits are now performed asynchronously.
* RGW: OpenSSL engine support is deprecated in favor of provider support.
self.cephx_lockbox_secret = prepare_utils.create_key()
self.secrets['cephx_lockbox_secret'] = \
self.cephx_lockbox_secret
+ # If set, we skip mkfs-time discards by overriding bdev_enable_discard flag
+ # for the `ceph-osd --mkfs` command.
+ self.skip_mkfs_discard: bool = False
def get_ptuuid(self, argument: str) -> str:
uuid = disk.get_partuuid(argument)
'-i', self.osd_id,
'--monmap', self.monmap,
]
+ # Skip mkfs discard if we have already formatted the device
+ # set bdev_enable_discard = false
+ if self.skip_mkfs_discard and self.objectstore == 'bluestore':
+ self.osd_mkfs_cmd.extend(['--bdev-enable-discard', 'false'])
if self.cephx_secret is not None:
self.osd_mkfs_cmd.extend(['--keyfile', '-'])
from ceph_volume.util import system, disk
from ceph_volume.util import prepare as prepare_utils
from ceph_volume.util import encryption as encryption_utils
+from ceph_volume.util import nvme as nvme_utils
from ceph_volume.util.device import Device
from ceph_volume.devices.lvm.common import rollback_osd
from ceph_volume.devices.raw.list import direct_report
self.osd_id = prepare_utils.create_id(
self.osd_fsid, json.dumps(self.secrets), self.osd_id)
+ if self.precondition_block_device():
+ self.skip_mkfs_discard = True
+
if self.encrypted:
self.prepare_dmcrypt()
# prepare the osd filesystem
self.osd_mkfs()
+ def precondition_block_device(self) -> bool:
+ """
+ Run a fast NVMe format on the main block device when applicable.
+ Returns True if the block device was formatted, False otherwise.
+ """
+ if not self.block_device_path:
+ return False
+ return nvme_utils.preformat(self.block_device_path)
+
def _activate(self, osd_id: str, osd_fsid: str) -> None:
# mount on tmpfs the osd directory
self.osd_path = '/var/lib/ceph/osd/%s-%s' % (conf.cluster, osd_id)
'--setuser', 'ceph',
'--setgroup', 'ceph']
+ @patch('ceph_volume.conf.cluster', 'ceph')
+ def test_build_osd_mkfs_cmd_disables_discard(self):
+ bo = BaseObjectStore([])
+ bo.osd_path = '/var/lib/ceph/osd/ceph-123/'
+ bo.osd_fsid = 'abcd-1234'
+ bo.objectstore = 'bluestore'
+ bo.osd_id = '123'
+ bo.monmap = '/etc/ceph/ceph.monmap'
+ bo.skip_mkfs_discard = True
+ result = bo.build_osd_mkfs_cmd()
+
+ assert result == ['ceph-osd',
+ '--cluster',
+ 'ceph',
+ '--osd-objectstore',
+ 'bluestore',
+ '--mkfs', '-i', '123',
+ '--monmap',
+ '/etc/ceph/ceph.monmap',
+ '--bdev-enable-discard', 'false',
+ '--keyfile', '-',
+ '--osd-data',
+ '/var/lib/ceph/osd/ceph-123/',
+ '--osd-uuid', 'abcd-1234',
+ '--setuser', 'ceph',
+ '--setgroup', 'ceph']
+
def test_osd_mkfs_ok(self, monkeypatch, fake_call, objectstore):
args = objectstore(dmcrypt=False)
bo = BaseObjectStore(args)
}
]
+ def test_osd_mkfs_disables_discard_via_bdev_flag(self, monkeypatch, objectstore):
+ args = objectstore(dmcrypt=False)
+ bo = BaseObjectStore(args)
+ bo.objectstore = 'bluestore'
+ bo.skip_mkfs_discard = True
+ bo.osd_id = '123'
+ bo.osd_fsid = 'abcd-1234'
+ bo.get_osd_path = lambda: '/var/lib/ceph/osd/ceph-123/'
+ monkeypatch.setattr(system, 'chown', lambda path: 0)
+ captured = {}
+
+ def fake_call(cmd, **kwargs):
+ captured['cmd'] = cmd
+ captured['env'] = kwargs.get('env')
+ return ([], [], 0)
+
+ monkeypatch.setattr('ceph_volume.objectstore.baseobjectstore.process.call', fake_call)
+
+ bo.osd_mkfs()
+
+ assert captured.get('env') is None
+ cmd = captured.get('cmd', [])
+ assert '--bdev-enable-discard' in cmd
+ i = cmd.index('--bdev-enable-discard')
+ assert cmd[i + 1] == 'false'
+
@patch('ceph_volume.process.call', Mock(return_value=([], [], 999)))
def test_osd_mkfs_fails(self, monkeypatch):
bo = BaseObjectStore([])
assert self.raw_bs.osd_mkfs.called
assert self.raw_bs.prepare_dmcrypt.called
+    @patch('ceph_volume.objectstore.raw.nvme_utils.preformat', return_value=True)
+    @patch('ceph_volume.objectstore.raw.prepare_utils.create_id')
+    @patch('ceph_volume.objectstore.raw.system.generate_uuid')
+    def test_prepare_sets_discard_flag_after_nvme_format(self,
+                                                         m_generate_uuid,
+                                                         m_create_id,
+                                                         m_nvme,
+                                                         is_root,
+                                                         factory):
+        # prepare() should flip skip_mkfs_discard on when the NVMe
+        # pre-format reports success, so mkfs can skip its discard pass.
+        # (Decorators map bottom-up: generate_uuid -> m_generate_uuid, etc.)
+        m_generate_uuid.return_value = 'fake-uuid'
+        m_create_id.return_value = MagicMock()
+        # Stub out the steps of prepare() that are irrelevant here.
+        self.raw_bs.prepare_dmcrypt = MagicMock()
+        self.raw_bs.prepare_osd_req = MagicMock()
+        self.raw_bs.osd_mkfs = MagicMock()
+        args = factory(no_tmpfs=True, crush_device_class=None)
+        args.data = '/dev/nvme0n1'
+        self.raw_bs.args = args
+        self.raw_bs.block_device_path = args.data
+        self.raw_bs.secrets = dict()
+        self.raw_bs.encrypted = False
+        self.raw_bs.prepare()
+        assert self.raw_bs.skip_mkfs_discard is True
+        # preformat must be handed the raw data device path exactly once.
+        m_nvme.assert_called_once_with(args.data)
+
+ @patch('ceph_volume.objectstore.raw.nvme_utils.preformat', return_value=False)
+ @patch('ceph_volume.objectstore.raw.prepare_utils.create_id')
+ @patch('ceph_volume.objectstore.raw.system.generate_uuid')
+ def test_prepare_keeps_discard_when_no_nvme_format(self,
+ m_generate_uuid,
+ m_create_id,
+ m_nvme,
+ is_root,
+ factory):
+ m_generate_uuid.return_value = 'fake-uuid'
+ m_create_id.return_value = MagicMock()
+ self.raw_bs.prepare_dmcrypt = MagicMock()
+ self.raw_bs.prepare_osd_req = MagicMock()
+ self.raw_bs.osd_mkfs = MagicMock()
+ args = factory(no_tmpfs=True, crush_device_class=None)
+ args.data = '/dev/nvme0n1'
+ self.raw_bs.args = args
+ self.raw_bs.block_device_path = args.data
+ self.raw_bs.secrets = dict()
+ self.raw_bs.encrypted = False
+ self.raw_bs.prepare()
+ assert self.raw_bs.skip_mkfs_discard is False
+ m_nvme.assert_called_once_with(args.data)
+
@patch('ceph_volume.conf.cluster', 'ceph')
@patch('ceph_volume.objectstore.raw.prepare_utils.link_wal')
@patch('ceph_volume.objectstore.raw.prepare_utils.link_db')
--- /dev/null
+from unittest.mock import patch
+
+from ceph_volume.util import nvme
+
+
+class TestNvmePreformat:
+    """Unit tests for ceph_volume.util.nvme.preformat()."""
+
+    @patch('ceph_volume.util.nvme.process.call')
+    def test_non_nvme_device_skips_preformat(self, m_call):
+        # Non-NVMe paths must bail out before any CLI invocation.
+        assert nvme.preformat('/dev/sda') is False
+        m_call.assert_not_called()
+
+    # NOTE(review): the tests below patch disk.is_partition — verify these
+    # patch targets match the collaborators is_namespace() actually calls.
+    @patch('ceph_volume.util.nvme.disk.is_partition', return_value=False)
+    @patch('ceph_volume.util.nvme.disk.is_device', return_value=True)
+    @patch('ceph_volume.util.nvme.process.call', return_value=([], [], 0))
+    def test_preformat_invokes_nvme_cli(self, m_call, *_m_disk):
+        # Expects run_on_host=False — assumes I_AM_IN_A_CONTAINER is unset
+        # in the test environment; TODO confirm or patch os.environ here.
+        assert nvme.preformat('/dev/nvme0n1') is True
+        m_call.assert_called_once_with(
+            ['nvme', 'format', '/dev/nvme0n1', '--force'],
+            run_on_host=False,
+            show_command=True,
+            terminal_verbose=True,
+            verbose_on_failure=True
+        )
+
+    @patch('ceph_volume.util.nvme.disk.is_partition', return_value=False)
+    @patch('ceph_volume.util.nvme.disk.is_device', return_value=True)
+    @patch('ceph_volume.util.nvme.process.call', return_value=([], [], 1))
+    def test_preformat_handles_non_zero_rc(self, m_call, *_m_disk):
+        # A failing nvme CLI must degrade to the normal mkfs flow.
+        assert nvme.preformat('/dev/nvme0n1') is False
+        assert m_call.called
+
+    @patch('ceph_volume.util.nvme.disk.is_partition', return_value=False)
+    @patch('ceph_volume.util.nvme.disk.is_device', return_value=True)
+    @patch('ceph_volume.util.nvme.process.call', side_effect=FileNotFoundError('missing nvme'))
+    def test_preformat_handles_missing_cli(self, m_call, *_m_disk):
+        # A missing nvme-cli binary is tolerated (best-effort semantics).
+        assert nvme.preformat('/dev/nvme0n1') is False
+        assert m_call.called
+
+    @patch('ceph_volume.util.nvme.disk.is_partition', return_value=True)
+    def test_partition_is_not_formatted(self, *_):
+        # Partitions are never formatted, only whole namespaces.
+        assert nvme.preformat('/dev/nvme0n1p1') is False
+
--- /dev/null
+import logging
+import os
+
+from ceph_volume import process, terminal
+from ceph_volume.util import disk
+
+logger = logging.getLogger(__name__)
+
+
+def resolve(device: str) -> str:
+ """
+ Resolve the device path.
+
+ We expect a valid 'device' here. If it's missing, that's a caller bug,
+ so fail fast instead.
+ """
+ if not device:
+ raise ValueError('device path is required')
+ return os.path.realpath(device)
+
+
+def is_namespace(resolved_device: str) -> bool:
+ """
+ Return True if this looks like a whole NVMe namespace we can format.
+
+ We only format whole NVMe devices (e.g. /dev/nvme0n1). Partitions like
+ /dev/nvme0n1p1 and non-block-device paths are intentionally skipped.
+ """
+ if not resolved_device.startswith('/dev/nvme'):
+ return False
+ if not disk.is_device(resolved_device):
+ # disk.is_device() already excludes partitions
+ logger.info('Skipping NVMe format for non-whole-disk device %s', resolved_device)
+ return False
+ return True
+
+
+def format(resolved_device: str) -> bool:
+    """
+    Best-effort NVMe namespace format.
+
+    Returns True only if `nvme format` succeeds. Otherwise return False and
+    fall back to the normal mkfs flow.
+
+    NOTE(review): the name shadows the ``format`` builtin; a rename such as
+    ``format_namespace`` would be cleaner, but callers in this module and
+    tests reference it, so it is left unchanged here.
+    """
+    # When ceph-volume runs inside a container, it sets I_AM_IN_A_CONTAINER=1.
+    # NOTE(review): confirm this env var matches what the container
+    # entrypoint exports; the unit tests assume it is unset (run_on_host=False).
+    run_on_host = bool(os.environ.get('I_AM_IN_A_CONTAINER', ''))
+    command = ['nvme', 'format', resolved_device, '--force']
+    logger.info('Formatting NVMe namespace %s prior to ceph-volume mkfs', resolved_device)
+    try:
+        _, _, rc = process.call(
+            command,
+            run_on_host=run_on_host,
+            show_command=True,
+            terminal_verbose=True,
+            verbose_on_failure=True,
+        )
+    except (FileNotFoundError, PermissionError) as exc:
+        # nvme-cli missing or not runnable: degrade gracefully; mkfs will
+        # simply perform its own (slower) discard pass.
+        logger.warning('Unable to execute nvme CLI for %s: %s', resolved_device, exc)
+        return False
+    if rc != 0:
+        logger.warning(
+            'nvme format failed for %s (rc=%s); using default mkfs workflow',
+            resolved_device,
+            rc,
+        )
+        return False
+    terminal.info('nvme format completed for {}'.format(resolved_device))
+    return True
+
+
+def preformat(device: str) -> bool:
+ """
+ Resolve, validate, then format an NVMe namespace (when applicable).
+
+ This is the main entrypoint used by ceph-volume: it returns True only
+ when we actually formatted the device.
+ """
+ resolved_device = resolve(device)
+ if not is_namespace(resolved_device):
+ return False
+ return format(resolved_device)
+