python-common/drive_selection: fix limit with existing devices 54681/head
author Adam King <adking@redhat.com>
Mon, 27 Nov 2023 20:04:42 +0000 (15:04 -0500)
committer Adam King <adking@redhat.com>
Wed, 29 Nov 2023 18:57:00 +0000 (13:57 -0500)
When devices have already been used for OSDs, they are still
allowed to pass filtering, as they are still needed for the
resulting ceph-volume lvm batch command. This was causing an
issue with limit, however. The limit check counts both the
devices we've found that match the filter and the existing OSD
daemons tied to the spec. This allowed devices that have already
been used for OSDs to be double counted: once as an existing
daemon and once as a device matching the filter. To avoid this
issue, devices should only be counted towards the limit if they
are not already part of an OSD.
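
A minimal standalone sketch of the counting change (FakeDevice and the
two helper functions below are illustrative stand-ins, not the actual
cephadm code):

    from dataclasses import dataclass
    from typing import List

    @dataclass
    class FakeDevice:
        # simplified stand-in for ceph.deployment.inventory.Device
        path: str
        ceph_device: bool  # True if the device already backs an OSD

    def limit_reached_old(selected: List[FakeDevice], existing_daemons: int, limit: int) -> bool:
        # old behaviour: every already-selected device counts towards the
        # limit, even one that already backs an OSD and is therefore also
        # counted through existing_daemons
        return limit > 0 and len(selected) + existing_daemons >= limit

    def limit_reached_fixed(selected: List[FakeDevice], existing_daemons: int, limit: int) -> bool:
        # fixed behaviour: devices already used for an OSD are only
        # counted through existing_daemons
        new_devices = [d for d in selected if not d.ceph_device]
        return limit > 0 and len(new_devices) + existing_daemons >= limit

    # /dev/sda already backs an OSD (so it also shows up as an existing
    # daemon); with limit=2 there should still be room for one new device
    selected = [FakeDevice('/dev/sda', ceph_device=True)]
    assert limit_reached_old(selected, existing_daemons=1, limit=2)        # wrongly says the limit is hit
    assert not limit_reached_fixed(selected, existing_daemons=1, limit=2)  # room for one more device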

An additional note: the limit feature is only applied to
data devices, so there is no need to worry about the effect
of this change on the selection of db, wal, or journal devices.
Also, we would still not want to count these devices if they
ended up passing the data device filter but had previously
been used for a db/wal/journal device.
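
For context, a hypothetical spec showing where the limit applies. The
field names below are the real DriveGroupSpec/DeviceSelection ones; the
service id and host pattern are just illustrative:

    from ceph.deployment.drive_group import DeviceSelection, DriveGroupSpec
    from ceph.deployment.service_spec import PlacementSpec

    spec = DriveGroupSpec(
        service_id='limited_osds',                 # illustrative name
        placement=PlacementSpec(host_pattern='*'),
        data_devices=DeviceSelection(rotational=1, limit=5),  # limit is honoured here
        db_devices=DeviceSelection(rotational=0),  # no limit handling for db/wal/journal devices
    )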

Fixes: https://tracker.ceph.com/issues/63525
Signed-off-by: Adam King <adking@redhat.com>
src/pybind/mgr/cephadm/services/osd.py
src/pybind/mgr/cephadm/tests/test_cephadm.py
src/python-common/ceph/deployment/drive_selection/selector.py
src/python-common/ceph/tests/test_disk_selector.py

src/pybind/mgr/cephadm/services/osd.py
index 3616d42deee4e9fdea9638a835ad688fed90e5fa..75b3fc58c761e0a50544a2fe48adc8bdef9fbc05 100644 (file)
@@ -319,11 +319,16 @@ class OSDService(CephService):
                             logger.exception('Cannot decode JSON: \'%s\'' % ' '.join(out))
                             concat_out = {}
                         notes = []
-                        if osdspec.data_devices is not None and osdspec.data_devices.limit and len(concat_out) < osdspec.data_devices.limit:
+                        if (
+                            osdspec.data_devices is not None
+                            and osdspec.data_devices.limit
+                            and (len(concat_out) + ds.existing_daemons) < osdspec.data_devices.limit
+                        ):
                             found = len(concat_out)
                             limit = osdspec.data_devices.limit
                             notes.append(
-                                f'NOTE: Did not find enough disks matching filter on host {host} to reach data device limit (Found: {found} | Limit: {limit})')
+                                f'NOTE: Did not find enough disks matching filter on host {host} to reach data device limit\n'
+                                f'(New Devices: {found} | Existing Matching Daemons: {ds.existing_daemons} | Limit: {limit})')
                         ret_all.append({'data': concat_out,
                                         'osdspec': osdspec.service_id,
                                         'host': host,
src/pybind/mgr/cephadm/tests/test_cephadm.py
index ebdf7eea5aa0cba70bd6139161086cf950cc50c3..2477de13e00a0902fb28fbdc76f9e1b3194810ab 100644 (file)
@@ -1193,7 +1193,8 @@ class TestCephadm(object):
     @mock.patch('cephadm.services.osd.OSDService.driveselection_to_ceph_volume')
     @mock.patch('cephadm.services.osd.OsdIdClaims.refresh', lambda _: None)
     @mock.patch('cephadm.services.osd.OsdIdClaims.get', lambda _: {})
-    def test_limit_not_reached(self, d_to_cv, _run_cv_cmd, cephadm_module):
+    @mock.patch('cephadm.inventory.HostCache.get_daemons_by_service')
+    def test_limit_not_reached(self, _get_daemons_by_service, d_to_cv, _run_cv_cmd, cephadm_module):
         with with_host(cephadm_module, 'test'):
             dg = DriveGroupSpec(placement=PlacementSpec(host_pattern='test'),
                                 data_devices=DeviceSelection(limit=5, rotational=1),
@@ -1203,12 +1204,14 @@ class TestCephadm(object):
                 '[{"data": "/dev/vdb", "data_size": "50.00 GB", "encryption": "None"}, {"data": "/dev/vdc", "data_size": "50.00 GB", "encryption": "None"}]']
             d_to_cv.return_value = 'foo'
             _run_cv_cmd.side_effect = async_side_effect((disks_found, '', 0))
+            _get_daemons_by_service.return_value = [DaemonDescription(daemon_type='osd', hostname='test', service_name='not_enough')]
             preview = cephadm_module.osd_service.generate_previews([dg], 'test')
 
             for osd in preview:
                 assert 'notes' in osd
                 assert osd['notes'] == [
-                    'NOTE: Did not find enough disks matching filter on host test to reach data device limit (Found: 2 | Limit: 5)']
+                    ('NOTE: Did not find enough disks matching filter on host test to reach '
+                     'data device limit\n(New Devices: 2 | Existing Matching Daemons: 1 | Limit: 5)')]
 
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
     def test_prepare_drivegroup(self, cephadm_module):
src/python-common/ceph/deployment/drive_selection/selector.py
index 1b3bfbb4ee3c5772bb61a9d6cd502b83b49d1e28..31e330432cd1dae1ba37e36cc158575819d3d102 100644 (file)
@@ -53,9 +53,12 @@ class DriveSelection(object):
         # type: () -> List[Device]
         return self._journal
 
-    def _limit_reached(self, device_filter, len_devices,
-                       disk_path):
-        # type: (DeviceSelection, int, str) -> bool
+    def _limit_reached(
+        self,
+        device_filter: DeviceSelection,
+        devices: List[Device],
+        disk_path: str
+    ) -> bool:
         """ Check for the <limit> property and apply logic
 
         If a limit is set in 'device_attrs' we have to stop adding
@@ -63,14 +66,21 @@ class DriveSelection(object):
 
         If limit is set (>0) and len(devices) >= limit
 
-        :param int len_devices: Length of the already populated device set/list
+        :param List[Device] devices: Already populated device set/list
         :param str disk_path: The disk identifier (for logging purposes)
         :return: True/False if the device should be added to the list of devices
         :rtype: bool
         """
         limit = device_filter.limit or 0
-
-        if limit > 0 and (len_devices + self.existing_daemons >= limit):
+        # If device A is being used for an OSD already, it can still
+        # match the filter (this is necessary as we still want the
+        # device in the resulting ceph-volume lvm batch command).
+        # If that is the case, we don't want to count the device
+        # towards the limit as it will already be counted through the
+        # existing daemons
+        non_ceph_devices = [d for d in devices if not d.ceph_device]
+
+        if limit > 0 and (len(non_ceph_devices) + self.existing_daemons >= limit):
             logger.debug("Refuse to add {} due to limit policy of <{}>".format(
                 disk_path, limit))
             return True
@@ -147,7 +157,7 @@ class DriveSelection(object):
                 continue
 
             # break on this condition.
-            if self._limit_reached(device_filter, len(devices), disk.path):
+            if self._limit_reached(device_filter, devices, disk.path):
                 logger.debug("Ignoring disk {}. Limit reached".format(
                     disk.path))
                 break
src/python-common/ceph/tests/test_disk_selector.py
index b08236130e0666e89ef1da201daafa29218ff8fd..03bfcbe16c900ef2e71156fae65b5e0411fcf6d5 100644 (file)
@@ -557,4 +557,43 @@ class TestDriveSelection(object):
         inventory = _mk_inventory(_mk_device(rotational=True)*2)
         m = 'Failed to validate OSD spec "foobar.data_devices": No filters applied'
         with pytest.raises(DriveGroupValidationError, match=m):
-            drive_selection.DriveSelection(spec, inventory)
\ No newline at end of file
+            drive_selection.DriveSelection(spec, inventory)
+
+
+class TestDeviceSelectionLimit:
+
+    def test_limit_existing_devices(self):
+        # Initial setup for this test is meant to be that /dev/sda
+        # is already being used for an OSD, hence it being marked
+        # as a ceph_device. /dev/sdb and /dev/sdc are not being used
+        # for OSDs yet. The limit will be set to 2 and the DriveSelection
+        # is set to have 1 pre-existing device (corresponding to /dev/sda)
+        dev_a = Device('/dev/sda', ceph_device=True, available=False)
+        dev_b = Device('/dev/sdb', ceph_device=False, available=True)
+        dev_c = Device('/dev/sdc', ceph_device=False, available=True)
+        all_devices: List[Device] = [dev_a, dev_b, dev_c]
+        processed_devices: List[Device] = []
+        filter = DeviceSelection(all=True, limit=2)
+        dgs = DriveGroupSpec(data_devices=filter)
+        ds = drive_selection.DriveSelection(dgs, all_devices, existing_daemons=1)
+
+        # Check /dev/sda. It's already being used for an OSD and will
+        # be counted in existing_daemons. This check should return False
+        # as we are not over the limit.
+        assert not ds._limit_reached(filter, processed_devices, '/dev/sda')
+        processed_devices.append(dev_a)
+
+        # We should still not be over the limit here with /dev/sdb since
+        # we will have only one pre-existing daemon /dev/sdb itself. This
+        # case previously failed as devices that contributed to existing_daemons
+        # would be double counted both as devices and daemons.
+        assert not ds._limit_reached(filter, processed_devices, '/dev/sdb')
+        processed_devices.append(dev_b)
+
+        # Now, with /dev/sdb and the pre-existing daemon taking up the 2
+        # slots, /dev/sdc should be rejected for us being over the limit.
+        assert ds._limit_reached(filter, processed_devices, '/dev/sdc')
+
+        # DriveSelection does device assignment on initialization. Let's check
+        # it picked up the expected devices
+        assert ds._data == [dev_a, dev_b]