]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
ceph-volume: support splitting db even on collocated scenario
authorGuillaume Abrioux <gabrioux@ibm.com>
Tue, 11 Feb 2025 16:00:51 +0000 (16:00 +0000)
committerGuillaume Abrioux <gabrioux@ibm.com>
Thu, 20 Feb 2025 12:14:47 +0000 (12:14 +0000)
This change enables ceph-volume to create OSDs where the DB is
explicitly placed on a separate LVM logical volume, even in collocated
scenarios (i.e., block and DB on the same device).
This helps mitigate BlueStore fragmentation issues.

Since ceph-volume cannot automatically predict a proper default size for the db device,
the `--block-db-size` parameter must be passed explicitly:

Passing `--block-db-size` and `--db-devices` makes ceph-volume create db devices
on dedicated devices (current implementation):

```
Total OSDs: 2

  Type            Path                                                    LV Size         % of device
----------------------------------------------------------------------------------------------------
  data            /dev/vdb                                                200.00 GB       100.00%
  block_db        /dev/vdd                                                4.00 GB         2.00%
----------------------------------------------------------------------------------------------------
  data            /dev/vdc                                                200.00 GB       100.00%
  block_db        /dev/vdd                                                4.00 GB         2.00%
```

Passing `--block-db-size` without `--db-devices` makes ceph-volume create a separate
LV for the db device on the same data device (new behavior):
```
Total OSDs: 2

  Type            Path                                                    LV Size         % of device
----------------------------------------------------------------------------------------------------
  data            /dev/vdb                                                196.00 GB       98.00%
  block_db        /dev/vdb                                                4.00 GB         2.00%
----------------------------------------------------------------------------------------------------
  data            /dev/vdc                                                196.00 GB       98.00%
  block_db        /dev/vdc                                                4.00 GB         2.00%
```

This new behavior also works in combination with the `--osds-per-device` parameter:

```
Total OSDs: 4

  Type            Path                                                    LV Size         % of device
----------------------------------------------------------------------------------------------------
  data            /dev/vdb                                                96.00 GB        48.00%
  block_db        /dev/vdb                                                4.00 GB         2.00%
----------------------------------------------------------------------------------------------------
  data            /dev/vdb                                                96.00 GB        48.00%
  block_db        /dev/vdb                                                4.00 GB         2.00%
----------------------------------------------------------------------------------------------------
  data            /dev/vdc                                                96.00 GB        48.00%
  block_db        /dev/vdc                                                4.00 GB         2.00%
----------------------------------------------------------------------------------------------------
  data            /dev/vdc                                                96.00 GB        48.00%
  block_db        /dev/vdc                                                4.00 GB         2.00%
```

Fixes: https://tracker.ceph.com/issues/69996
Signed-off-by: Guillaume Abrioux <gabrioux@ibm.com>
src/ceph-volume/ceph_volume/devices/lvm/batch.py
src/ceph-volume/ceph_volume/tests/conftest.py
src/ceph-volume/ceph_volume/tests/devices/lvm/test_batch.py

index 80803577bb48293d21ac4562533cd65ef5daecd5..57c715a2fb97c69a3cd4e8b85ce791c2e673e41a 100644 (file)
@@ -19,9 +19,9 @@ device_list_template = """
   * {path: <25} {size: <10} {state}"""
 
 
-def ensure_disjoint_device_lists(data: List[str],
-                                 db: Optional[List[str]] = None,
-                                 wal: Optional[List[str]] = None) -> None:
+def ensure_disjoint_device_lists(data: List[device.Device],
+                                 db: Optional[List[device.Device]] = None,
+                                 wal: Optional[List[device.Device]] = None) -> None:
     if db is None:
         db = []
     if wal is None:
@@ -341,6 +341,10 @@ class Batch(object):
             self.parser.print_help()
             raise SystemExit(0)
 
+        self.args.has_block_db_size_without_db_devices = (
+            self.args.block_db_size is not None and not self.args.db_devices
+        )
+
         if (self.args.auto and not self.args.db_devices and not
             self.args.wal_devices):
             self._sort_rotational_disks()
@@ -393,7 +397,10 @@ class Batch(object):
         functions.
         '''
         devices = self.args.devices
-        fast_devices = self.args.db_devices
+        if self.args.block_db_size is not None:
+            fast_devices = self.args.db_devices or self.args.devices
+        else:
+            fast_devices = self.args.db_devices
         very_fast_devices = self.args.wal_devices
         plan = []
         phys_devs, lvm_devs = separate_devices_from_lvs(devices)
@@ -436,10 +443,20 @@ class Batch(object):
                 len(very_fast_allocations), num_osds))
             exit(1)
 
-        for osd in plan:
-            if fast_devices:
-                osd.add_fast_device(*fast_allocations.pop(),
-                                    type_=fast_type)
+        if fast_devices:
+            fast_alloc: Optional[tuple[str, float, disk.Size, int]] = None
+            for osd in plan:
+                if self.args.has_block_db_size_without_db_devices:
+                    for i, _fast_alloc in enumerate(fast_allocations):
+                        if osd.data.path == _fast_alloc[0]:
+                            fast_alloc = fast_allocations.pop(i)
+                            break
+                else:
+                    fast_alloc = fast_allocations.pop() if fast_allocations else None
+
+                if fast_alloc:
+                    osd.add_fast_device(*fast_alloc, type_=fast_type)
+
             if very_fast_devices and self.args.objectstore == 'bluestore':
                 osd.add_very_fast_device(*very_fast_allocations.pop())
         return plan
@@ -586,13 +603,25 @@ def get_physical_osds(devices: List[device.Device], args: argparse.Namespace) ->
     data_slots = args.osds_per_device
     if args.data_slots:
         data_slots = max(args.data_slots, args.osds_per_device)
-    rel_data_size = args.data_allocate_fraction / data_slots
-    mlogger.debug('relative data size: {}'.format(rel_data_size))
+    #rel_data_size = args.data_allocate_fraction / data_slots
+    #mlogger.debug('relative data size: {}'.format(rel_data_size))
     ret = []
     for dev in devices:
+        rel_data_size = args.data_allocate_fraction / data_slots
         if dev.available_lvm:
+            total_dev_size = dev.vg_size[0]
             dev_size = dev.vg_size[0]
+
+            if args.has_block_db_size_without_db_devices:
+                all_db_space = args.block_db_size * data_slots
+                dev_size -= all_db_space.b.as_int()
+
             abs_size = disk.Size(b=int(dev_size * rel_data_size))
+            mlogger.error(f'{dev_size} {abs_size} {rel_data_size}')
+
+            if args.has_block_db_size_without_db_devices:
+                rel_data_size = abs_size / disk.Size(b=total_dev_size)
+
             free_size = dev.vg_free[0]
             for _ in range(args.osds_per_device):
                 if abs_size > free_size:
index ea080e1c52d1cc4fe4190098f90af4d8b0571fec..a32a85389bdeff1fa1ff8e9addb99ced17dacbcf 100644 (file)
@@ -1,6 +1,7 @@
 import os
 import pytest
-from mock.mock import patch, PropertyMock, create_autospec, Mock
+import argparse
+from mock.mock import patch, PropertyMock, create_autospec, Mock, MagicMock
 from ceph_volume.api import lvm
 from ceph_volume.util import disk
 from ceph_volume.util import device
@@ -29,14 +30,14 @@ class Capture(object):
 
 class Factory(object):
 
-    def __init__(self, **kw):
+    def __init__(self, **kw: Any) -> None:
         for k, v in kw.items():
             setattr(self, k, v)
 
 
 @pytest.fixture
-def factory():
-    return Factory
+def factory() -> Callable[..., argparse.Namespace]:
+    return argparse.Namespace
 
 def objectstore_bluestore_factory(**kw):
     o = objectstore.bluestore.BlueStore([])
@@ -70,29 +71,29 @@ def mock_lv_device_generator():
         return dev
     return mock_lv
 
-def mock_device(name='foo',
-                vg_name='vg_foo',
-                vg_size=None,
-                lv_name='lv_foo',
-                lv_size=None,
-                path='foo',
-                lv_path='',
-                number_lvs=0):
+def mock_device(**kw: Any) -> MagicMock:
+    number_lvs = kw.get('number_lvs', 0)
+    default_values = {
+        'vg_size': [21474836480],
+        'lv_size': kw.get('vg_size', [21474836480]),
+        'path': f"/dev/{kw.get('path', 'foo')}",
+        'vg_name': 'vg_foo',
+        'lv_name': 'lv_foo',
+        'symlink': None,
+        'available_lvm': True,
+        'vg_free': kw.get('vg_size', [21474836480]),
+        'lvs': [],
+        'lv_path': f"/dev/{kw.get('vg_name', 'vg_foo')}/{kw.get('lv_name', 'lv_foo')}",
+        'vgs': [lvm.VolumeGroup(vg_name=kw.get('vg_name', 'vg_foo'), lv_name=kw.get('lv_name', 'lv_foo'))],
+    }
+    for key, value in default_values.items():
+        kw.setdefault(key, value)
+
     dev = create_autospec(device.Device)
-    if vg_size is None:
-        dev.vg_size = [21474836480]
-    if lv_size is None:
-        lv_size = dev.vg_size
-    dev.lv_size = lv_size
-    dev.path = f'/dev/{path}'
-    dev.vg_name = f'{vg_name}'
-    dev.lv_name = f'{lv_name}'
-    dev.lv_path = lv_path if lv_path else f'/dev/{dev.vg_name}/{dev.lv_name}'
-    dev.symlink = None
-    dev.vgs = [lvm.VolumeGroup(vg_name=dev.vg_name, lv_name=dev.lv_name)]
-    dev.available_lvm = True
-    dev.vg_free = dev.vg_size
-    dev.lvs = []
+
+    for k, v in kw.items():
+        dev.__dict__[k] = v
+
     for n in range(0, number_lvs):
         dev.lvs.append(lvm.Volume(vg_name=f'{dev.vg_name}{n}',
                                   lv_name=f'{dev.lv_name}-{n}',
index 7f9e20808b0354af9f200e019c348c583ac3801a..014d84010a767ce88177bcf9fb88d1e518ede354 100644 (file)
@@ -2,11 +2,13 @@ import pytest
 import json
 import random
 
-from argparse import ArgumentError
+from argparse import ArgumentError, Namespace
 from mock import MagicMock, patch
 
 from ceph_volume.devices.lvm import batch
-from ceph_volume.util import arg_validators
+from ceph_volume.util import arg_validators, disk, device
+from ceph_volume.configuration import Conf
+from typing import List, Callable
 
 
 class TestBatch(object):
@@ -20,9 +22,9 @@ class TestBatch(object):
         with pytest.raises(SystemExit):
             batch.Batch(argv=['--osd-ids', '1', 'foo']).main()
 
-    def test_disjoint_device_lists(self, factory):
-        device1 = factory(used_by_ceph=False, available=True, abspath="/dev/sda")
-        device2 = factory(used_by_ceph=False, available=True, abspath="/dev/sdb")
+    def test_disjoint_device_lists(self, mock_device_generator: Callable) -> None:
+        device1 = mock_device_generator(used_by_ceph=False, available=True, abspath='/dev/sda')
+        device2 = mock_device_generator(used_by_ceph=False, available=True, abspath='/dev/sdb')
         devices = [device1, device2]
         db_devices = [device2]
         with pytest.raises(Exception) as disjoint_ex:
@@ -55,7 +57,8 @@ class TestBatch(object):
                        db_devices=[],
                        wal_devices=[],
                        objectstore='bluestore',
-                       block_db_size="1G",
+                       block_db_size=disk.Size(gb=1),
+                       block_db_slots=1,
                        dmcrypt=True,
                        data_allocate_fraction=1.0,
                       )
@@ -174,38 +177,53 @@ class TestBatch(object):
         assert len(b.args.devices) == 2
         assert len(b.args.db_devices) == 1
 
-    def test_get_physical_osds_return_len(self, factory,
-                                          mock_devices_available,
-                                          conf_ceph_stub,
-                                          osds_per_device):
+    def test_get_physical_osds_return_len(self,
+                                          factory: Callable[..., Namespace],
+                                          mock_devices_available: List[device.Device],
+                                          conf_ceph_stub: Callable[[str], Conf],
+                                          osds_per_device: int) -> None:
         conf_ceph_stub('[global]\nfsid=asdf-lkjh')
-        args = factory(data_slots=1, osds_per_device=osds_per_device,
-                       osd_ids=[], dmcrypt=False,
-                       data_allocate_fraction=1.0)
+        args = factory(data_slots=1,
+                       osds_per_device=osds_per_device,
+                       osd_ids=[],
+                       dmcrypt=False,
+                       data_allocate_fraction=1.0,
+                       block_db_size=None,
+                       db_devices=[])
         osds = batch.get_physical_osds(mock_devices_available, args)
         assert len(osds) == len(mock_devices_available) * osds_per_device
 
-    def test_get_physical_osds_rel_size(self, factory,
-                                          mock_devices_available,
-                                          conf_ceph_stub,
-                                          osds_per_device,
-                                          data_allocate_fraction):
-        args = factory(data_slots=1, osds_per_device=osds_per_device,
-                       osd_ids=[], dmcrypt=False,
-                       data_allocate_fraction=data_allocate_fraction)
+    def test_get_physical_osds_rel_size(self,
+                                        factory: Callable[..., Namespace],
+                                        mock_devices_available: List[device.Device],
+                                        conf_ceph_stub: Callable[[str], Conf],
+                                        osds_per_device: int,
+                                        data_allocate_fraction: float) -> None:
+        args = factory(data_slots=1,
+                       osds_per_device=osds_per_device,
+                       osd_ids=[],
+                       dmcrypt=False,
+                       data_allocate_fraction=data_allocate_fraction,
+                       block_db_size=None,
+                       db_devices=[])
         osds = batch.get_physical_osds(mock_devices_available, args)
         for osd in osds:
             assert osd.data[1] == data_allocate_fraction / osds_per_device
 
-    def test_get_physical_osds_abs_size(self, factory,
-                                          mock_devices_available,
-                                          conf_ceph_stub,
-                                          osds_per_device,
-                                          data_allocate_fraction):
+    def test_get_physical_osds_abs_size(self,
+                                        factory: Callable[..., Namespace],
+                                        mock_devices_available: List[device.Device],
+                                        conf_ceph_stub: Callable[[str], Conf],
+                                        osds_per_device: int,
+                                        data_allocate_fraction: float) -> None:
         conf_ceph_stub('[global]\nfsid=asdf-lkjh')
-        args = factory(data_slots=1, osds_per_device=osds_per_device,
-                       osd_ids=[], dmcrypt=False,
-                       data_allocate_fraction=data_allocate_fraction)
+        args = factory(data_slots=1,
+                       osds_per_device=osds_per_device,
+                       osd_ids=[],
+                       dmcrypt=False,
+                       data_allocate_fraction=data_allocate_fraction,
+                       block_db_size=None,
+                       db_devices=[])
         osds = batch.get_physical_osds(mock_devices_available, args)
         for osd, dev in zip(osds, mock_devices_available):
             assert osd.data[2] == int(dev.vg_size[0] * (data_allocate_fraction / osds_per_device))