mgr/cephadm: mgr or mds scale-down should prefer non-active daemons 36485/head
author    Adam King <adking@redhat.com>
          Wed, 5 Aug 2020 21:04:46 +0000 (17:04 -0400)
committer Adam King <adking@redhat.com>
          Tue, 18 Aug 2020 14:01:40 +0000 (10:01 -0400)
When removing daemons during a mgr/mds scale-down, prefer to remove
standby daemons so the active daemon is not killed

Fixes: https://tracker.ceph.com/issues/44252
Signed-off-by: Adam King <adking@redhat.com>
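
The change in one picture: DaemonDescription gains an is_active flag, and scale-down prefers to drop daemons whose flag is unset. A minimal standalone sketch of that selection rule, using a simplified stand-in rather than the cephadm classes:

    from dataclasses import dataclass
    from typing import List

    @dataclass
    class Daemon:
        daemon_id: str
        is_active: bool = False

    def pick_for_removal(daemons: List[Daemon], to_remove: int) -> List[Daemon]:
        # standbys sort before actives (False < True), so an active daemon
        # goes only when there are not enough standbys to satisfy the request
        return sorted(daemons, key=lambda d: d.is_active)[:to_remove]

    daemons = [Daemon('a', is_active=True), Daemon('b'), Daemon('c')]
    assert all(not d.is_active for d in pick_for_removal(daemons, 2))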
src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/schedule.py
src/pybind/mgr/cephadm/services/cephadmservice.py
src/pybind/mgr/cephadm/tests/test_cephadm.py
src/pybind/mgr/cephadm/tests/test_scheduling.py
src/pybind/mgr/cephadm/tests/test_spec.py
src/pybind/mgr/orchestrator/_interface.py
src/pybind/mgr/orchestrator/tests/test_orchestrator.py

index cbbdb5d819aad640f8eb6d7d9d6bd1ff66a68fd3..07e6c91c3c487505f83bb32290bb8b973d517564 100644 (file)
@@ -2135,6 +2135,12 @@ you may want to run:
             # These daemon types require additional configs after creation
             if dd.daemon_type in ['grafana', 'iscsi', 'prometheus', 'alertmanager', 'nfs']:
                 daemons_post[dd.daemon_type].append(dd)
+            
+            if self.cephadm_services[dd.daemon_type].get_active_daemon(
+               self.cache.get_daemons_by_service(dd.service_name())).daemon_id == dd.daemon_id:
+                dd.is_active = True
+            else:
+                dd.is_active = False
 
             deps = self._calc_daemon_deps(dd.daemon_type, dd.daemon_id)
             last_deps, last_config = self.cache.get_daemon_last_config_deps(
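
Aside: the if/else added above reduces to one boolean assignment. A hedged restatement of what the hunk computes per cached daemon (the surrounding CephadmOrchestrator attributes are assumed from context):

    active = self.cephadm_services[dd.daemon_type].get_active_daemon(
        self.cache.get_daemons_by_service(dd.service_name()))
    dd.is_active = (active.daemon_id == dd.daemon_id)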
index 81f6a5e029fdaf30ed5c6fac14ac42a326436ad5..17fbe6bcb24766c5266fe0bff1aba59236db5e2c 100644 (file)
@@ -130,8 +130,7 @@ class HostAssignment(object):
 
         # we don't need any additional hosts
         if need < 0:
-            # ask the scheduler to return a set of hosts with a up to the value of <count>
-            return self.scheduler.place(hosts_with_daemons, count)
+            return self.prefer_hosts_with_active_daemons(hosts_with_daemons, count)
         else:
             # exclusive to 'mon' daemons. Filter out hosts that don't have a public network assigned
             if self.filter_new_host:
@@ -146,6 +145,30 @@ class HostAssignment(object):
             # if a host already has the anticipated daemon, merge it with the candidates
             # to get a list of HostPlacementSpec that can be deployed on.
             return list(merge_hostspecs(hosts_with_daemons, others))
+    
+    def get_hosts_with_active_daemon(self, hosts: List[HostPlacementSpec]) -> List[HostPlacementSpec]:
+        active_hosts: List['HostPlacementSpec'] = []
+        for daemon in self.daemons:
+            if daemon.is_active:
+                for h in hosts:
+                    if h.hostname == daemon.hostname:
+                        active_hosts.append(h)
+        # remove duplicates before returning
+        return list(dict.fromkeys(active_hosts))
+    
+    def prefer_hosts_with_active_daemons(self, hosts: List[HostPlacementSpec], count: int) -> List[HostPlacementSpec]:
+        # try to prefer host with active daemon if possible
+        active_hosts = self.get_hosts_with_active_daemon(hosts)
+        if len(active_hosts) != 0 and count > 0:
+            for host in active_hosts:
+                hosts.remove(host)
+            if len(active_hosts) >= count:
+                return self.scheduler.place(active_hosts, count)
+            else:
+                return list(merge_hostspecs(self.scheduler.place(active_hosts, count),
+                            self.scheduler.place(hosts, count - len(active_hosts))))
+        # ask the scheduler to return a set of up to <count> hosts
+        return self.scheduler.place(hosts, count)
 
     def add_daemon_hosts(self, host_pool: List[HostPlacementSpec]) -> Set[HostPlacementSpec]:
         hosts_with_daemons = {d.hostname for d in self.daemons}
@@ -227,4 +250,3 @@ def difference_hostspecs(l: List[HostPlacementSpec], r: List[HostPlacementSpec])
     r_names = {h.hostname for h in r}
     return [h for h in l if h.hostname not in r_names]
 
-
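
To see what prefer_hosts_with_active_daemons buys, here is a self-contained toy version of the same flow. The scheduler stand-in keeps input order, and plain list concatenation approximates merge_hostspecs(); HostPlacementSpec and the real scheduler classes are not imported:

    from typing import List, NamedTuple

    class Host(NamedTuple):
        hostname: str

    class Daemon(NamedTuple):
        hostname: str
        is_active: bool

    def place(hosts: List[Host], count: int) -> List[Host]:
        # stand-in for self.scheduler.place(); real schedulers may shuffle
        return hosts[:count]

    def prefer_active(hosts: List[Host], daemons: List[Daemon],
                      count: int) -> List[Host]:
        active = [h for h in hosts
                  if any(d.is_active and d.hostname == h.hostname for d in daemons)]
        if active and count > 0:
            rest = [h for h in hosts if h not in active]
            if len(active) >= count:
                return place(active, count)
            # concatenation here approximates merge_hostspecs()
            return place(active, count) + place(rest, count - len(active))
        return place(hosts, count)

    hosts = [Host('host1'), Host('host2'), Host('host3')]
    daemons = [Daemon('host1', False), Daemon('host2', False), Daemon('host3', True)]
    assert prefer_active(hosts, daemons, 1) == [Host('host3')]  # active host kept
    assert prefer_active(hosts, daemons, 2) == [Host('host3'), Host('host1')]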
index e513e809f96c0c7bbd0fc47b0ca71260182be45d..ec4cba0bbb3234a9926ba5995fb02e5bc7b79010 100644 (file)
@@ -110,7 +110,9 @@ class CephadmService(metaclass=ABCMeta):
         raise NotImplementedError()
 
     def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription:
-        raise NotImplementedError()
+        # if this is called for a service type where it hasn't explicitly been
+        # defined, return an empty DaemonDescription
+        return DaemonDescription()
 
     def _inventory_get_addr(self, hostname: str) -> str:
         """Get a host's address with its hostname."""
@@ -324,6 +326,14 @@ class MgrService(CephadmService):
         daemon_spec.keyring = keyring
 
         return self.mgr._create_daemon(daemon_spec)
+    
+    def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription:
+        active_mgr_str = self.mgr.get('mgr_map')['active_name']
+        for daemon in daemon_descrs:
+            if daemon.daemon_id == active_mgr_str:
+                return daemon
+        # if no active mgr found, return an empty DaemonDescription
+        return DaemonDescription()
 
 
 class MdsService(CephadmService):
@@ -356,6 +366,21 @@ class MdsService(CephadmService):
         daemon_spec.keyring = keyring
 
         return self.mgr._create_daemon(daemon_spec)
+    
+    def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription:
+        active_mds_strs: List[str] = []
+        for fs in self.mgr.get('fs_map')['filesystems']:
+            mds_map = fs['mdsmap']
+            if mds_map is not None:
+                for mds_id, mds_status in mds_map['info'].items():
+                    if mds_status['state'] == 'up:active':
+                        active_mds_strs.append(mds_status['name'])
+        if len(active_mds_strs) != 0:
+            for daemon in daemon_descrs:
+                if daemon.daemon_id in active_mds_strs:
+                    return daemon
+        # if no active mds found, return an empty DaemonDescription
+        return DaemonDescription()
 
 
 class RgwService(CephadmService):
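
The fs_map traversal in MdsService.get_active_daemon reads only a handful of keys. A hedged illustration against a hand-built map; the excerpt is abbreviated from what `ceph fs dump` reports, and the gid keys are invented:

    fs_map = {'filesystems': [
        {'mdsmap': {'info': {
            'gid_4242': {'name': 'a', 'state': 'up:active'},
            'gid_4243': {'name': 'b', 'state': 'up:standby-replay'},
        }}},
    ]}

    active_mds_strs = [status['name']
                       for fs in fs_map['filesystems']
                       if fs['mdsmap'] is not None
                       for status in fs['mdsmap']['info'].values()
                       if status['state'] == 'up:active']
    assert active_mds_strs == ['a']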
index 1e9633173a6a23d1c81bef451f744eaed291f53f..d1072d4f250b58cadb19628d9efd281fc86278c4 100644 (file)
@@ -114,7 +114,8 @@ class TestCephadm(object):
                         'daemon_type': 'mds',
                         'hostname': 'test',
                         'status': 1,
-                        'status_desc': 'starting'}
+                        'status_desc': 'starting',
+                        'is_active': False}
                 ]
 
                 with with_service(cephadm_module, ServiceSpec('rgw', 'r.z'), CephadmOrchestrator.apply_rgw, 'test'):
index 806625934bf604b8457b53ab645c0e66d4074efd..76807f8fccad7591c35d018f78ae5df281689cbe 100644 (file)
@@ -625,3 +625,156 @@ def test_bad_specs(service_type, placement, hosts, daemons, expected):
             get_hosts_func=get_hosts_func,
             get_daemons_func=lambda _: daemons).place()
     assert str(e.value) == expected
+
+class ActiveAssignmentTest(NamedTuple):
+    service_type: str
+    placement: PlacementSpec
+    hosts: List[str]
+    daemons: List[DaemonDescription]
+    expected: List[List[str]]
+
+
+@pytest.mark.parametrize("service_type,placement,hosts,daemons,expected",
+                         [
+                             ActiveAssignmentTest(
+                                 'mgr',
+                                 PlacementSpec(count=2),
+                                 'host1 host2 host3'.split(),
+                                 [
+                                     DaemonDescription('mgr', 'a', 'host1', is_active=True),
+                                     DaemonDescription('mgr', 'b', 'host2'),
+                                     DaemonDescription('mgr', 'c', 'host3'),
+                                 ],
+                                 [['host1', 'host2'], ['host1', 'host3']]
+                             ),
+                             ActiveAssignmentTest(
+                                 'mgr',
+                                 PlacementSpec(count=2),
+                                 'host1 host2 host3'.split(),
+                                 [
+                                     DaemonDescription('mgr', 'a', 'host1'),
+                                     DaemonDescription('mgr', 'b', 'host2'),
+                                     DaemonDescription('mgr', 'c', 'host3', is_active=True),
+                                 ],
+                                 [['host1', 'host3'], ['host2', 'host3']]
+                             ),
+                             ActiveAssignmentTest(
+                                 'mgr',
+                                 PlacementSpec(count=1),
+                                 'host1 host2 host3'.split(),
+                                 [
+                                     DaemonDescription('mgr', 'a', 'host1'),
+                                     DaemonDescription('mgr', 'b', 'host2', is_active=True),
+                                     DaemonDescription('mgr', 'c', 'host3'),
+                                 ],
+                                 [['host2']]
+                             ),
+                             ActiveAssignmentTest(
+                                 'mgr',
+                                 PlacementSpec(count=1),
+                                 'host1 host2 host3'.split(),
+                                 [
+                                     DaemonDescription('mgr', 'a', 'host1'),
+                                     DaemonDescription('mgr', 'b', 'host2'),
+                                     DaemonDescription('mgr', 'c', 'host3', is_active=True),
+                                 ],
+                                 [['host3']]
+                             ),
+                             ActiveAssignmentTest(
+                                 'mgr',
+                                 PlacementSpec(count=1),
+                                 'host1 host2 host3'.split(),
+                                 [
+                                     DaemonDescription('mgr', 'a', 'host1', is_active=True),
+                                     DaemonDescription('mgr', 'b', 'host2'),
+                                     DaemonDescription('mgr', 'c', 'host3', is_active=True),
+                                 ],
+                                 [['host1'], ['host3']]
+                             ),
+                             ActiveAssignmentTest(
+                                 'mgr',
+                                 PlacementSpec(count=2),
+                                 'host1 host2 host3'.split(),
+                                 [
+                                     DaemonDescription('mgr', 'a', 'host1'),
+                                     DaemonDescription('mgr', 'b', 'host2', is_active=True),
+                                     DaemonDescription('mgr', 'c', 'host3', is_active=True),
+                                 ],
+                                 [['host2', 'host3']]
+                             ),
+                             ActiveAssignmentTest(
+                                 'mgr',
+                                 PlacementSpec(count=1),
+                                 'host1 host2 host3'.split(),
+                                 [
+                                     DaemonDescription('mgr', 'a', 'host1', is_active=True),
+                                     DaemonDescription('mgr', 'b', 'host2', is_active=True),
+                                     DaemonDescription('mgr', 'c', 'host3', is_active=True),
+                                 ],
+                                 [['host1'], ['host2'], ['host3']]
+                             ),
+                             ActiveAssignmentTest(
+                                 'mgr',
+                                 PlacementSpec(count=1),
+                                 'host1 host2 host3'.split(),
+                                 [
+                                     DaemonDescription('mgr', 'a', 'host1', is_active=True),
+                                     DaemonDescription('mgr', 'a2', 'host1'),
+                                     DaemonDescription('mgr', 'b', 'host2'),
+                                     DaemonDescription('mgr', 'c', 'host3'),
+                                 ],
+                                 [['host1']]
+                             ),
+                             ActiveAssignmentTest(
+                                 'mgr',
+                                 PlacementSpec(count=1),
+                                 'host1 host2 host3'.split(),
+                                 [
+                                     DaemonDescription('mgr', 'a', 'host1', is_active=True),
+                                     DaemonDescription('mgr', 'a2', 'host1', is_active=True),
+                                     DaemonDescription('mgr', 'b', 'host2'),
+                                     DaemonDescription('mgr', 'c', 'host3'),
+                                 ],
+                                 [['host1']]
+                             ),
+                             ActiveAssignmentTest(
+                                 'mgr',
+                                 PlacementSpec(count=2),
+                                 'host1 host2 host3'.split(),
+                                 [
+                                     DaemonDescription('mgr', 'a', 'host1', is_active=True),
+                                     DaemonDescription('mgr', 'a2', 'host1'),
+                                     DaemonDescription('mgr', 'b', 'host2'),
+                                     DaemonDescription('mgr', 'c', 'host3', is_active=True),
+                                 ],
+                                 [['host1', 'host3']]
+                             ),
+                             # Explicit placement should override preference for active daemon
+                             ActiveAssignmentTest(
+                                 'mgr',
+                                 PlacementSpec(count=1, hosts=['host1']),
+                                 'host1 host2 host3'.split(),
+                                 [
+                                     DaemonDescription('mgr', 'a', 'host1'),
+                                     DaemonDescription('mgr', 'b', 'host2'),
+                                     DaemonDescription('mgr', 'c', 'host3', is_active=True),
+                                 ],
+                                 [['host1']]
+                             ),
+
+                         ])
+def test_active_assignment(service_type, placement, hosts, daemons, expected):
+    def get_hosts_func(label=None, as_hostspec=False):
+        if as_hostspec:
+            return [HostSpec(h) for h in hosts]
+        return hosts
+
+    spec = ServiceSpec(service_type=service_type,
+                       service_id=None,
+                       placement=placement)
+
+    hosts = HostAssignment(
+        spec=spec,
+        get_hosts_func=get_hosts_func,
+        get_daemons_func=lambda _: daemons).place()
+    assert sorted([h.hostname for h in hosts]) in expected
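
The sorted(...) in expected assertion accommodates ties: when the scheduler may legitimately return more than one placement, expected lists every acceptable (sorted) outcome. In miniature:

    result = ['host3', 'host1']                          # scheduler order varies
    expected = [['host1', 'host2'], ['host1', 'host3']]  # acceptable outcomes
    assert sorted(result) in expected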
index 41b33ac9f185c46089474df89f7caa41c7029ede..2fa190a814fb98403371370c185326498fbec263 100644 (file)
@@ -122,7 +122,8 @@ def test_spec_octopus(spec_json):
         "status_desc": "running",
         "last_refresh": "2020-04-03T15:31:48.725856",
         "created": "2020-04-02T19:23:08.829543",
-        "started": "2020-04-03T07:29:16.932838" 
+        "started": "2020-04-03T07:29:16.932838",
+        "is_active": false 
     },
     {
         "hostname": "ceph-001",
@@ -136,7 +137,8 @@ def test_spec_octopus(spec_json):
         "status_desc": "running",
         "last_refresh": "2020-04-03T15:31:48.725903",
         "created": "2020-04-02T19:23:11.390694",
-        "started": "2020-04-03T07:29:16.910897" 
+        "started": "2020-04-03T07:29:16.910897",
+        "is_active": false 
     },
     {
         "hostname": "ceph-001",
@@ -150,7 +152,8 @@ def test_spec_octopus(spec_json):
         "status_desc": "running",
         "last_refresh": "2020-04-03T15:31:48.725950",
         "created": "2020-04-02T19:23:52.025088",
-        "started": "2020-04-03T07:29:16.847972" 
+        "started": "2020-04-03T07:29:16.847972",
+        "is_active": false 
     },
     {
         "hostname": "ceph-001",
@@ -164,7 +167,8 @@ def test_spec_octopus(spec_json):
         "status_desc": "running",
         "last_refresh": "2020-04-03T15:31:48.725807",
         "created": "2020-04-02T19:22:18.648584",
-        "started": "2020-04-03T07:29:16.856153" 
+        "started": "2020-04-03T07:29:16.856153",
+        "is_active": false 
     },
     {
         "hostname": "ceph-001",
@@ -178,7 +182,8 @@ def test_spec_octopus(spec_json):
         "status_desc": "running",
         "last_refresh": "2020-04-03T15:31:48.725715",
         "created": "2020-04-02T19:22:13.863300",
-        "started": "2020-04-03T07:29:17.206024" 
+        "started": "2020-04-03T07:29:17.206024",
+        "is_active": false 
     },
     {
         "hostname": "ceph-001",
@@ -192,7 +197,8 @@ def test_spec_octopus(spec_json):
         "status_desc": "running",
         "last_refresh": "2020-04-03T15:31:48.725996",
         "created": "2020-04-02T19:23:53.880197",
-        "started": "2020-04-03T07:29:16.880044" 
+        "started": "2020-04-03T07:29:16.880044",
+        "is_active": false 
     },
     {
         "hostname": "ceph-001",
@@ -206,7 +212,8 @@ def test_spec_octopus(spec_json):
         "status_desc": "running",
         "last_refresh": "2020-04-03T15:31:48.726088",
         "created": "2020-04-02T20:35:02.991435",
-        "started": "2020-04-03T07:29:19.373956" 
+        "started": "2020-04-03T07:29:19.373956",
+        "is_active": false 
     },
     {
         "hostname": "ceph-001",
@@ -220,7 +227,8 @@ def test_spec_octopus(spec_json):
         "status_desc": "running",
         "last_refresh": "2020-04-03T15:31:48.726134",
         "created": "2020-04-02T20:35:17.142272",
-        "started": "2020-04-03T07:29:19.374002" 
+        "started": "2020-04-03T07:29:19.374002",
+        "is_active": false 
     },
     {
         "hostname": "ceph-001",
@@ -234,14 +242,16 @@ def test_spec_octopus(spec_json):
         "status_desc": "running",
         "last_refresh": "2020-04-03T15:31:48.726042",
         "created": "2020-04-02T19:24:10.281163",
-        "started": "2020-04-03T07:29:16.926292" 
+        "started": "2020-04-03T07:29:16.926292",
+        "is_active": false 
     },
     {
         "hostname": "ceph-001",
         "daemon_id": "default-rgw-realm.eu-central-1.1.ceph-001.ytywjo",
         "daemon_type": "rgw",
         "status": 1,
-        "status_desc": "starting" 
+        "status_desc": "starting",
+        "is_active": false 
     }
 ]""")
 )
index 5af3503fef4806b49331982ff057e2c103b60aad..55c512aee449984e766db5354ee2fbf8350c5a43 100644 (file)
@@ -1200,7 +1200,8 @@ class DaemonDescription(object):
                  last_configured=None,
                  osdspec_affinity=None,
                  last_deployed=None,
-                 events: Optional[List['OrchestratorEvent']] = None):
+                 events: Optional[List['OrchestratorEvent']] = None,
+                 is_active: bool = False):
 
         # Host is at the same granularity as InventoryHost
         self.hostname: str = hostname
@@ -1242,6 +1243,8 @@ class DaemonDescription(object):
         self.osdspec_affinity: Optional[str] = osdspec_affinity
 
         self.events: List[OrchestratorEvent] = events or []
+        
+        self.is_active = is_active
 
     def name(self):
         return '%s.%s' % (self.daemon_type, self.daemon_id)
@@ -1318,6 +1321,7 @@ class DaemonDescription(object):
         out['status_desc'] = self.status_desc
         if self.daemon_type == 'osd':
             out['osdspec_affinity'] = self.osdspec_affinity
+        out['is_active'] = self.is_active
 
         for k in ['last_refresh', 'created', 'started', 'last_deployed',
                   'last_configured']:
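
With these hunks the flag round-trips through construction and to_json(). A minimal stand-in mirroring just the changed pieces (the real DaemonDescription carries many more fields):

    class DaemonDescriptionSketch:
        def __init__(self, daemon_type: str, daemon_id: str, hostname: str,
                     is_active: bool = False):
            self.daemon_type = daemon_type
            self.daemon_id = daemon_id
            self.hostname = hostname
            self.is_active = is_active

        def to_json(self) -> dict:
            out = {'daemon_type': self.daemon_type,
                   'daemon_id': self.daemon_id,
                   'hostname': self.hostname}
            out['is_active'] = self.is_active  # always emitted, as in the diff
            return out

    assert DaemonDescriptionSketch('mgr', 'a', 'host1').to_json()['is_active'] is False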
index cc1e6e401fc289ef7293ecc0d7333b187c6285aa..b5b45a2caa36c5211a8d0c8cd79b809d4c4f167e 100644 (file)
@@ -251,6 +251,7 @@ daemon_id: ubuntu
 hostname: ubuntu
 status: 1
 status_desc: starting
+is_active: false
 events:
 - 2020-06-10T10:08:22.933241 daemon:crash.ubuntu [INFO] "Deployed crash.ubuntu on
   host 'ubuntu'"