python-common/drive_selection: fix limit with existing devices 54681/head
author Adam King <adking@redhat.com>
Mon, 27 Nov 2023 20:04:42 +0000 (15:04 -0500)
committer Adam King <adking@redhat.com>
Wed, 29 Nov 2023 18:57:00 +0000 (13:57 -0500)
When devices have already been used for OSDs, they are still
allowed to pass filtering, as they are still needed for the
resulting ceph-volume lvm batch command. This was causing an
issue with limit, however. The limit check counts both the
devices we've found that match the filter and the existing OSD
daemons tied to the spec. This allowed devices that have already
been used for OSDs to be double counted: once as an existing
daemon and once as a device matching the filter. To avoid this
issue, devices should only be counted towards the limit if they
are not already part of an OSD.
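
A minimal standalone sketch of the counting change (FakeDevice and the
two helper functions below are illustrative stand-ins, not the actual
cephadm code):

    from dataclasses import dataclass
    from typing import List

    @dataclass
    class FakeDevice:
        # simplified stand-in for ceph.deployment.inventory.Device
        path: str
        ceph_device: bool  # True if the device already backs an OSD

    def limit_reached_old(selected: List[FakeDevice], existing_daemons: int, limit: int) -> bool:
        # old behaviour: every already-selected device counts towards the
        # limit, even one that already backs an OSD and is therefore also
        # counted through existing_daemons
        return limit > 0 and len(selected) + existing_daemons >= limit

    def limit_reached_fixed(selected: List[FakeDevice], existing_daemons: int, limit: int) -> bool:
        # fixed behaviour: devices already used for an OSD are only
        # counted through existing_daemons
        new_devices = [d for d in selected if not d.ceph_device]
        return limit > 0 and len(new_devices) + existing_daemons >= limit

    # /dev/sda already backs an OSD (so it also shows up as an existing
    # daemon); with limit=2 there should still be room for one new device
    selected = [FakeDevice('/dev/sda', ceph_device=True)]
    assert limit_reached_old(selected, existing_daemons=1, limit=2)        # wrongly says the limit is hit
    assert not limit_reached_fixed(selected, existing_daemons=1, limit=2)  # room for one more device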

An additional note: the limit feature is only applied to
data devices, so there is no need to worry about the effect
of this change on the selection of db, wal, or journal devices.
Also, we would still not want to count these devices if they
ended up passing the data device filter but had previously
been used for a db/wal/journal device.
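
For context, a hypothetical spec showing where the limit applies. The
field names below are the real DriveGroupSpec/DeviceSelection ones; the
service id and host pattern are just illustrative:

    from ceph.deployment.drive_group import DeviceSelection, DriveGroupSpec
    from ceph.deployment.service_spec import PlacementSpec

    spec = DriveGroupSpec(
        service_id='limited_osds',                 # illustrative name
        placement=PlacementSpec(host_pattern='*'),
        data_devices=DeviceSelection(rotational=1, limit=5),  # limit is honoured here
        db_devices=DeviceSelection(rotational=0),  # no limit handling for db/wal/journal devices
    )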

Fixes: https://tracker.ceph.com/issues/63525
Signed-off-by: Adam King <adking@redhat.com>
src/pybind/mgr/cephadm/services/osd.py
src/pybind/mgr/cephadm/tests/test_cephadm.py
src/python-common/ceph/deployment/drive_selection/selector.py
src/python-common/ceph/tests/test_disk_selector.py

src/pybind/mgr/cephadm/services/osd.py
index 3616d42deee4e9fdea9638a835ad688fed90e5fa..75b3fc58c761e0a50544a2fe48adc8bdef9fbc05 100644 (file)
@@ -319,11 +319,16 @@ class OSDService(CephService):
                             logger.exception('Cannot decode JSON: \'%s\'' % ' '.join(out))
                             concat_out = {}
                         notes = []
-                        if osdspec.data_devices is not None and osdspec.data_devices.limit and len(concat_out) < osdspec.data_devices.limit:
+                        if (
+                            osdspec.data_devices is not None
+                            and osdspec.data_devices.limit
+                            and (len(concat_out) + ds.existing_daemons) < osdspec.data_devices.limit
+                        ):
                             found = len(concat_out)
                             limit = osdspec.data_devices.limit
                             notes.append(
-                                f'NOTE: Did not find enough disks matching filter on host {host} to reach data device limit (Found: {found} | Limit: {limit})')
+                                f'NOTE: Did not find enough disks matching filter on host {host} to reach data device limit\n'
+                                f'(New Devices: {found} | Existing Matching Daemons: {ds.existing_daemons} | Limit: {limit})')
                         ret_all.append({'data': concat_out,
                                         'osdspec': osdspec.service_id,
                                         'host': host,
src/pybind/mgr/cephadm/tests/test_cephadm.py
index ebdf7eea5aa0cba70bd6139161086cf950cc50c3..2477de13e00a0902fb28fbdc76f9e1b3194810ab 100644 (file)
@@ -1193,7 +1193,8 @@ class TestCephadm(object):
     @mock.patch('cephadm.services.osd.OSDService.driveselection_to_ceph_volume')
     @mock.patch('cephadm.services.osd.OsdIdClaims.refresh', lambda _: None)
     @mock.patch('cephadm.services.osd.OsdIdClaims.get', lambda _: {})
-    def test_limit_not_reached(self, d_to_cv, _run_cv_cmd, cephadm_module):
+    @mock.patch('cephadm.inventory.HostCache.get_daemons_by_service')
+    def test_limit_not_reached(self, _get_daemons_by_service, d_to_cv, _run_cv_cmd, cephadm_module):
         with with_host(cephadm_module, 'test'):
             dg = DriveGroupSpec(placement=PlacementSpec(host_pattern='test'),
                                 data_devices=DeviceSelection(limit=5, rotational=1),
@@ -1203,12 +1204,14 @@ class TestCephadm(object):
                 '[{"data": "/dev/vdb", "data_size": "50.00 GB", "encryption": "None"}, {"data": "/dev/vdc", "data_size": "50.00 GB", "encryption": "None"}]']
             d_to_cv.return_value = 'foo'
             _run_cv_cmd.side_effect = async_side_effect((disks_found, '', 0))
+            _get_daemons_by_service.return_value = [DaemonDescription(daemon_type='osd', hostname='test', service_name='not_enough')]
             preview = cephadm_module.osd_service.generate_previews([dg], 'test')
 
             for osd in preview:
                 assert 'notes' in osd
                 assert osd['notes'] == [
-                    'NOTE: Did not find enough disks matching filter on host test to reach data device limit (Found: 2 | Limit: 5)']
+                    ('NOTE: Did not find enough disks matching filter on host test to reach '
+                     'data device limit\n(New Devices: 2 | Existing Matching Daemons: 1 | Limit: 5)')]
 
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
     def test_prepare_drivegroup(self, cephadm_module):
src/python-common/ceph/deployment/drive_selection/selector.py
index 1b3bfbb4ee3c5772bb61a9d6cd502b83b49d1e28..31e330432cd1dae1ba37e36cc158575819d3d102 100644 (file)
@@ -53,9 +53,12 @@ class DriveSelection(object):
         # type: () -> List[Device]
         return self._journal
 
-    def _limit_reached(self, device_filter, len_devices,
-                       disk_path):
-        # type: (DeviceSelection, int, str) -> bool
+    def _limit_reached(
+        self,
+        device_filter: DeviceSelection,
+        devices: List[Device],
+        disk_path: str
+    ) -> bool:
         """ Check for the <limit> property and apply logic
 
         If a limit is set in 'device_attrs' we have to stop adding
@@ -63,14 +66,21 @@ class DriveSelection(object):
 
         If limit is set (>0) and len(devices) >= limit
 
-        :param int len_devices: Length of the already populated device set/list
+        :param List[Device] devices: Already populated device set/list
         :param str disk_path: The disk identifier (for logging purposes)
         :return: True/False if the device should be added to the list of devices
         :rtype: bool
         """
         limit = device_filter.limit or 0
-
-        if limit > 0 and (len_devices + self.existing_daemons >= limit):
+        # If device A is being used for an OSD already, it can still
+        # match the filter (this is necessary as we still want the
+        # device in the resulting ceph-volume lvm batch command).
+        # If that is the case, we don't want to count the device
+        # towards the limit as it will already be counted through the
+        # existing daemons
+        non_ceph_devices = [d for d in devices if not d.ceph_device]
+
+        if limit > 0 and (len(non_ceph_devices) + self.existing_daemons >= limit):
             logger.debug("Refuse to add {} due to limit policy of <{}>".format(
                 disk_path, limit))
             return True
@@ -147,7 +157,7 @@ class DriveSelection(object):
                 continue
 
             # break on this condition.
-            if self._limit_reached(device_filter, len(devices), disk.path):
+            if self._limit_reached(device_filter, devices, disk.path):
                 logger.debug("Ignoring disk {}. Limit reached".format(
                     disk.path))
                 break
src/python-common/ceph/tests/test_disk_selector.py
index b08236130e0666e89ef1da201daafa29218ff8fd..03bfcbe16c900ef2e71156fae65b5e0411fcf6d5 100644 (file)
@@ -557,4 +557,43 @@ class TestDriveSelection(object):
         inventory = _mk_inventory(_mk_device(rotational=True)*2)
         m = 'Failed to validate OSD spec "foobar.data_devices": No filters applied'
         with pytest.raises(DriveGroupValidationError, match=m):
-            drive_selection.DriveSelection(spec, inventory)
\ No newline at end of file
+            drive_selection.DriveSelection(spec, inventory)
+
+
+class TestDeviceSelectionLimit:
+
+    def test_limit_existing_devices(self):
+        # Initial setup for this test is meant to be that /dev/sda
+        # is already being used for an OSD, hence it being marked
+        # as a ceph_device. /dev/sdb and /dev/sdc are not being used
+        # for OSDs yet. The limit will be set to 2 and the DriveSelection
+        # is set to have 1 pre-existing device (corresponding to /dev/sda)
+        dev_a = Device('/dev/sda', ceph_device=True, available=False)
+        dev_b = Device('/dev/sdb', ceph_device=False, available=True)
+        dev_c = Device('/dev/sdc', ceph_device=False, available=True)
+        all_devices: List[Device] = [dev_a, dev_b, dev_c]
+        processed_devices: List[Device] = []
+        filter = DeviceSelection(all=True, limit=2)
+        dgs = DriveGroupSpec(data_devices=filter)
+        ds = drive_selection.DriveSelection(dgs, all_devices, existing_daemons=1)
+
+        # Check /dev/sda. It's already being used for an OSD and will
+        # be counted in existing_daemons. This check should return False
+        # as we are not over the limit.
+        assert not ds._limit_reached(filter, processed_devices, '/dev/sda')
+        processed_devices.append(dev_a)
+
+        # We should still not be over the limit here with /dev/sdb since
+        # we will have only one pre-existing daemon /dev/sdb itself. This
+        # case previously failed as devices that contributed to existing_daemons
+        # would be double counted both as devices and daemons.
+        assert not ds._limit_reached(filter, processed_devices, '/dev/sdb')
+        processed_devices.append(dev_b)
+
+        # Now, with /dev/sdb and the pre-existing daemon taking up the 2
+        # slots, /dev/sdc should be rejected for us being over the limit.
+        assert ds._limit_reached(filter, processed_devices, '/dev/sdc')
+
+        # DriveSelection does device assignment on initialization. Let's check
+        # it picked up the expected devices
+        assert ds._data == [dev_a, dev_b]