From 7dfeceed4a6996c32df6f56454d91d217ad32926 Mon Sep 17 00:00:00 2001
From: Guillaume Abrioux
Date: Tue, 11 Feb 2025 16:00:51 +0000
Subject: [PATCH] ceph-volume: support splitting db even in collocated scenarios

This change enables ceph-volume to create OSDs where the DB is
explicitly placed on a separate LVM logical volume, even in collocated
scenarios (i.e., block and DB on the same device). This helps mitigate
BlueStore fragmentation issues.

Since ceph-volume can't automatically predict a proper default size for
the DB device, the DB size has to be passed explicitly via the
`--block-db-size` parameter.

Passing `--block-db-size` together with `--db-devices` makes ceph-volume
create DB devices on dedicated devices (current implementation):

```
Total OSDs: 2

  Type            Path                                                    LV Size         % of device
----------------------------------------------------------------------------------------------------
  data            /dev/vdb                                              200.00 GB         100.00%
  block_db        /dev/vdd                                                4.00 GB         2.00%
----------------------------------------------------------------------------------------------------
  data            /dev/vdc                                              200.00 GB         100.00%
  block_db        /dev/vdd                                                4.00 GB         2.00%
```

Passing `--block-db-size` without `--db-devices` makes ceph-volume
create a separate LV for the DB on the same device as the data
(new behavior):

```
Total OSDs: 2

  Type            Path                                                    LV Size         % of device
----------------------------------------------------------------------------------------------------
  data            /dev/vdb                                              196.00 GB         98.00%
  block_db        /dev/vdb                                                4.00 GB         2.00%
----------------------------------------------------------------------------------------------------
  data            /dev/vdc                                              196.00 GB         98.00%
  block_db        /dev/vdc                                                4.00 GB         2.00%
```

The new behavior also works in combination with the `--osds-per-device`
parameter:

```
Total OSDs: 4

  Type            Path                                                    LV Size         % of device
----------------------------------------------------------------------------------------------------
  data            /dev/vdb                                               96.00 GB         48.00%
  block_db        /dev/vdb                                                4.00 GB         2.00%
----------------------------------------------------------------------------------------------------
  data            /dev/vdb                                               96.00 GB         48.00%
  block_db        /dev/vdb                                                4.00 GB         2.00%
----------------------------------------------------------------------------------------------------
  data            /dev/vdc                                               96.00 GB         48.00%
  block_db        /dev/vdc                                                4.00 GB         2.00%
----------------------------------------------------------------------------------------------------
  data            /dev/vdc                                               96.00 GB         48.00%
  block_db        /dev/vdc                                                4.00 GB         2.00%
```
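
For reference, reports like the ones above can be produced with
invocations along the following lines; `--report` only prints the
computed layout without deploying anything (the device paths and the
4G size are illustrative, assuming ~200 GB data devices as in the
tables above):

```
# db on dedicated devices (current implementation):
ceph-volume lvm batch --report --bluestore --block-db-size 4G \
    /dev/vdb /dev/vdc --db-devices /dev/vdd

# db split off from each data device itself (new behavior):
ceph-volume lvm batch --report --bluestore --block-db-size 4G \
    --osds-per-device 2 /dev/vdb /dev/vdc
```
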
Fixes: https://tracker.ceph.com/issues/69996

Signed-off-by: Guillaume Abrioux
---
 .../ceph_volume/devices/lvm/batch.py          | 45 ++++++++++---
 src/ceph-volume/ceph_volume/tests/conftest.py | 54 +++++++-------
 .../tests/devices/lvm/test_batch.py           | 76 ++++++++++++-------
 3 files changed, 112 insertions(+), 63 deletions(-)

diff --git a/src/ceph-volume/ceph_volume/devices/lvm/batch.py b/src/ceph-volume/ceph_volume/devices/lvm/batch.py
index 80803577bb482..57c715a2fb97c 100644
--- a/src/ceph-volume/ceph_volume/devices/lvm/batch.py
+++ b/src/ceph-volume/ceph_volume/devices/lvm/batch.py
@@ -19,9 +19,9 @@
 device_list_template = """
   * {path: <25} {size: <10} {state}"""
 
-def ensure_disjoint_device_lists(data: List[str],
-                                 db: Optional[List[str]] = None,
-                                 wal: Optional[List[str]] = None) -> None:
+def ensure_disjoint_device_lists(data: List[device.Device],
+                                 db: Optional[List[device.Device]] = None,
+                                 wal: Optional[List[device.Device]] = None) -> None:
     if db is None:
         db = []
     if wal is None:
@@ -341,6 +341,10 @@ class Batch(object):
             self.parser.print_help()
             raise SystemExit(0)
 
+        self.args.has_block_db_size_without_db_devices = (
+            self.args.block_db_size is not None and not self.args.db_devices
+        )
+
         if (self.args.auto and not self.args.db_devices and
                 not self.args.wal_devices):
             self._sort_rotational_disks()
@@ -393,7 +397,10 @@ class Batch(object):
         functions.
         '''
         devices = self.args.devices
-        fast_devices = self.args.db_devices
+        if self.args.block_db_size is not None:
+            fast_devices = self.args.db_devices or self.args.devices
+        else:
+            fast_devices = self.args.db_devices
         very_fast_devices = self.args.wal_devices
         plan = []
         phys_devs, lvm_devs = separate_devices_from_lvs(devices)
@@ -436,10 +443,20 @@
                               len(very_fast_allocations), num_osds))
                 exit(1)
 
         for osd in plan:
             if fast_devices:
-                osd.add_fast_device(*fast_allocations.pop(),
-                                    type_=fast_type)
+                fast_alloc: Optional[tuple[str, float, disk.Size, int]] = None
+                if self.args.has_block_db_size_without_db_devices:
+                    # collocated case: pick the db allocation that was carved
+                    # out of this OSD's own data device
+                    for i, _fast_alloc in enumerate(fast_allocations):
+                        if osd.data.path == _fast_alloc[0]:
+                            fast_alloc = fast_allocations.pop(i)
+                            break
+                else:
+                    fast_alloc = fast_allocations.pop() if fast_allocations else None
+                if fast_alloc:
+                    osd.add_fast_device(*fast_alloc, type_=fast_type)
             if very_fast_devices and self.args.objectstore == 'bluestore':
                 osd.add_very_fast_device(*very_fast_allocations.pop())
         return plan
@@ -586,13 +603,25 @@ def get_physical_osds(devices: List[device.Device], args: argparse.Namespace) ->
     data_slots = args.osds_per_device
     if args.data_slots:
         data_slots = max(args.data_slots, args.osds_per_device)
-    rel_data_size = args.data_allocate_fraction / data_slots
-    mlogger.debug('relative data size: {}'.format(rel_data_size))
     ret = []
     for dev in devices:
+        rel_data_size = args.data_allocate_fraction / data_slots
         if dev.available_lvm:
+            total_dev_size = dev.vg_size[0]
             dev_size = dev.vg_size[0]
+
+            if getattr(args, 'has_block_db_size_without_db_devices', False):
+                # reserve room for one db LV per data slot on this device
+                all_db_space = args.block_db_size * data_slots
+                dev_size -= all_db_space.b.as_int()
+
             abs_size = disk.Size(b=int(dev_size * rel_data_size))
+            mlogger.debug(f'dev_size: {dev_size}, abs_size: {abs_size}, rel_data_size: {rel_data_size}')
+
+            if getattr(args, 'has_block_db_size_without_db_devices', False):
+                # re-express the data size as a fraction of the whole device
+                rel_data_size = abs_size / disk.Size(b=total_dev_size)
+
             free_size = dev.vg_free[0]
             for _ in range(args.osds_per_device):
                 if abs_size > free_size:
diff --git a/src/ceph-volume/ceph_volume/tests/conftest.py b/src/ceph-volume/ceph_volume/tests/conftest.py
index ea080e1c52d1c..a32a85389bdef 100644
--- a/src/ceph-volume/ceph_volume/tests/conftest.py
+++ b/src/ceph-volume/ceph_volume/tests/conftest.py
@@ -1,6 +1,8 @@
 import os
 import pytest
-from mock.mock import patch, PropertyMock, create_autospec, Mock
+import argparse
+from typing import Any, Callable
+from mock.mock import patch, PropertyMock, create_autospec, Mock, MagicMock
 from ceph_volume.api import lvm
 from ceph_volume.util import disk
 from ceph_volume.util import device
@@ -29,14 +30,14 @@ class Capture(object):
 
 
 class Factory(object):
-    def __init__(self, **kw):
+    def __init__(self, **kw: Any) -> None:
         for k, v in kw.items():
             setattr(self, k, v)
 
 
 @pytest.fixture
-def factory():
-    return Factory
+def factory() -> Callable[..., argparse.Namespace]:
+    return argparse.Namespace
 
 def objectstore_bluestore_factory(**kw):
     o = objectstore.bluestore.BlueStore([])
@@ -70,29 +71,29 @@ def mock_lv_device_generator():
         return dev
     return mock_lv
 
-def mock_device(name='foo',
-                vg_name='vg_foo',
-                vg_size=None,
-                lv_name='lv_foo',
-                lv_size=None,
-                path='foo',
-                lv_path='',
-                number_lvs=0):
+def mock_device(**kw: Any) -> MagicMock:
+    number_lvs = kw.get('number_lvs', 0)
+    default_values = {
+        'vg_size': [21474836480],
+        'lv_size': kw.get('vg_size', [21474836480]),
+        'path': f"/dev/{kw.get('path', 'foo')}",
+        'vg_name': 'vg_foo',
+        'lv_name': 'lv_foo',
+        'symlink': None,
+        'available_lvm': True,
+        'vg_free': kw.get('vg_size', [21474836480]),
+        'lvs': [],
+        'lv_path': f"/dev/{kw.get('vg_name', 'vg_foo')}/{kw.get('lv_name', 'lv_foo')}",
+        'vgs': [lvm.VolumeGroup(vg_name=kw.get('vg_name', 'vg_foo'), lv_name=kw.get('lv_name', 'lv_foo'))],
+    }
+    for key, value in default_values.items():
+        kw.setdefault(key, value)
+
     dev = create_autospec(device.Device)
-    if vg_size is None:
-        dev.vg_size = [21474836480]
-    if lv_size is None:
-        lv_size = dev.vg_size
-    dev.lv_size = lv_size
-    dev.path = f'/dev/{path}'
-    dev.vg_name = f'{vg_name}'
-    dev.lv_name = f'{lv_name}'
-    dev.lv_path = lv_path if lv_path else f'/dev/{dev.vg_name}/{dev.lv_name}'
-    dev.symlink = None
-    dev.vgs = [lvm.VolumeGroup(vg_name=dev.vg_name, lv_name=dev.lv_name)]
-    dev.available_lvm = True
-    dev.vg_free = dev.vg_size
-    dev.lvs = []
+
+    for k, v in kw.items():
+        dev.__dict__[k] = v
+
     for n in range(0, number_lvs):
         dev.lvs.append(lvm.Volume(vg_name=f'{dev.vg_name}{n}',
                                   lv_name=f'{dev.lv_name}-{n}',
diff --git a/src/ceph-volume/ceph_volume/tests/devices/lvm/test_batch.py b/src/ceph-volume/ceph_volume/tests/devices/lvm/test_batch.py
index 7f9e20808b035..014d84010a767 100644
--- a/src/ceph-volume/ceph_volume/tests/devices/lvm/test_batch.py
+++ b/src/ceph-volume/ceph_volume/tests/devices/lvm/test_batch.py
@@ -2,11 +2,13 @@
 import pytest
 import json
 import random
-from argparse import ArgumentError
+from argparse import ArgumentError, Namespace
 from mock import MagicMock, patch
 from ceph_volume.devices.lvm import batch
-from ceph_volume.util import arg_validators
+from ceph_volume.util import arg_validators, disk, device
+from ceph_volume.configuration import Conf
+from typing import List, Callable
 
 
 class TestBatch(object):
 
@@ -20,9 +22,9 @@ class TestBatch(object):
         with pytest.raises(SystemExit):
             batch.Batch(argv=['--osd-ids', '1', 'foo']).main()
 
-    def test_disjoint_device_lists(self, factory):
-        device1 = factory(used_by_ceph=False, available=True, abspath="/dev/sda")
-        device2 = factory(used_by_ceph=False, available=True, abspath="/dev/sdb")
+    def test_disjoint_device_lists(self, mock_device_generator: Callable) -> None:
+        device1 = mock_device_generator(used_by_ceph=False, available=True, abspath='/dev/sda')
+        device2 = mock_device_generator(used_by_ceph=False, available=True, abspath='/dev/sdb')
         devices = [device1, device2]
         db_devices = [device2]
         with pytest.raises(Exception) as disjoint_ex:
@@ -55,7 +57,8 @@ class TestBatch(object):
             db_devices=[],
             wal_devices=[],
             objectstore='bluestore',
-            block_db_size="1G",
+            block_db_size=disk.Size(gb=1),
+            block_db_slots=1,
             dmcrypt=True,
             data_allocate_fraction=1.0,
         )
@@ -174,38 +177,53 @@ class TestBatch(object):
         assert len(b.args.devices) == 2
         assert len(b.args.db_devices) == 1
 
-    def test_get_physical_osds_return_len(self, factory,
-                                          mock_devices_available,
-                                          conf_ceph_stub,
-                                          osds_per_device):
+    def test_get_physical_osds_return_len(self,
+                                          factory: Callable[..., Namespace],
+                                          mock_devices_available: List[device.Device],
+                                          conf_ceph_stub: Callable[[str], Conf],
+                                          osds_per_device: int) -> None:
         conf_ceph_stub('[global]\nfsid=asdf-lkjh')
-        args = factory(data_slots=1, osds_per_device=osds_per_device,
-                       osd_ids=[], dmcrypt=False,
-                       data_allocate_fraction=1.0)
+        args = factory(data_slots=1,
+                       osds_per_device=osds_per_device,
+                       osd_ids=[],
+                       dmcrypt=False,
+                       data_allocate_fraction=1.0,
+                       block_db_size=None,
+                       db_devices=[])
         osds = batch.get_physical_osds(mock_devices_available, args)
         assert len(osds) == len(mock_devices_available) * osds_per_device
 
-    def test_get_physical_osds_rel_size(self, factory,
-                                        mock_devices_available,
-                                        conf_ceph_stub,
-                                        osds_per_device,
-                                        data_allocate_fraction):
-        args = factory(data_slots=1, osds_per_device=osds_per_device,
-                       osd_ids=[], dmcrypt=False,
-                       data_allocate_fraction=data_allocate_fraction)
+    def test_get_physical_osds_rel_size(self,
+                                        factory: Callable[..., Namespace],
+                                        mock_devices_available: List[device.Device],
+                                        conf_ceph_stub: Callable[[str], Conf],
+                                        osds_per_device: int,
+                                        data_allocate_fraction: float) -> None:
+        args = factory(data_slots=1,
+                       osds_per_device=osds_per_device,
+                       osd_ids=[],
+                       dmcrypt=False,
+                       data_allocate_fraction=data_allocate_fraction,
+                       block_db_size=None,
+                       db_devices=[])
         osds = batch.get_physical_osds(mock_devices_available, args)
         for osd in osds:
             assert osd.data[1] == data_allocate_fraction / osds_per_device
 
-    def test_get_physical_osds_abs_size(self, factory,
-                                        mock_devices_available,
-                                        conf_ceph_stub,
-                                        osds_per_device,
-                                        data_allocate_fraction):
+    def test_get_physical_osds_abs_size(self,
+                                        factory: Callable[..., Namespace],
+                                        mock_devices_available: List[device.Device],
+                                        conf_ceph_stub: Callable[[str], Conf],
+                                        osds_per_device: int,
+                                        data_allocate_fraction: float) -> None:
         conf_ceph_stub('[global]\nfsid=asdf-lkjh')
-        args = factory(data_slots=1, osds_per_device=osds_per_device,
-                       osd_ids=[], dmcrypt=False,
-                       data_allocate_fraction=data_allocate_fraction)
+        args = factory(data_slots=1,
+                       osds_per_device=osds_per_device,
+                       osd_ids=[],
+                       dmcrypt=False,
+                       data_allocate_fraction=data_allocate_fraction,
+                       block_db_size=None,
+                       db_devices=[])
         osds = batch.get_physical_osds(mock_devices_available, args)
         for osd, dev in zip(osds, mock_devices_available):
             assert osd.data[2] == int(dev.vg_size[0] * (data_allocate_fraction / osds_per_device))
-- 
2.39.5