mgr/cephadm: don't remove and deploy new daemon if ports change during upgrade (63563/head)
author    Adam King <adking@redhat.com>
          Wed, 28 May 2025 16:08:27 +0000 (12:08 -0400)
committer Adam King <adking@redhat.com>
          Wed, 28 May 2025 16:08:27 +0000 (12:08 -0400)
If we're not mid-upgrade, a port/ip change most likely means
a user has changed the configuration for the daemon, and
starting a new daemon with the new ports can be justified.
During an upgrade, there are a number of different upgrade
paths (especially considering our N+2 upgrade support) where
something internal to cephadm has changed the ip we're binding
to or the ports being used. In these cases, the process of
upgrading the daemon will resolve the issue. However, by
having the scheduler unilaterally remove and deploy fresh
versions of daemons when it sees port changes, we may
effectively "upgrade" some daemons out of the intended order
just to make the ports match up. This was seen with nvmeof,
which needs to be upgraded after the mon daemons, but was
being removed and redeployed right after the mgr upgrade once
cephadm saw the set of expected ports had changed.

This patch adds a new "upgrade_in_progress" attribute to the
HostAssignment class in the scheduler to make it aware of an
ongoing upgrade. It also changes matches_daemon so that a
port/ip difference no longer counts as a mismatch while
"upgrade_in_progress" is set to True.

Signed-off-by: Adam King <adking@redhat.com>
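
The effect of the new flag on matching, as a minimal runnable
sketch (simplified stand-ins for cephadm's DaemonPlacement and
DaemonDescription, not the real classes; the nvmeof port numbers
are illustrative only):

    from typing import List, NamedTuple, Optional

    class Placement(NamedTuple):
        daemon_type: str
        hostname: str
        ports: List[int]
        ip: Optional[str] = None

        def matches_daemon(self, dd: 'Placement',
                           upgrade_in_progress: bool = False) -> bool:
            if self.daemon_type != dd.daemon_type:
                return False
            if self.hostname != dd.hostname:
                return False
            # Port/ip mismatches only force a remove/redeploy when no
            # upgrade is running; mid-upgrade, upgrading the daemon
            # will reconcile them anyway.
            if not upgrade_in_progress:
                if self.ports:
                    if dd.ports and self.ports != dd.ports:
                        return False
                    if dd.ip and self.ip != dd.ip:
                        return False
            return True

    expected = Placement('nvmeof', 'host1', ports=[5500, 4420, 8009])
    running = Placement('nvmeof', 'host1', ports=[5500, 4420])
    print(expected.matches_daemon(running))        # False -> remove/redeploy
    print(expected.matches_daemon(running, True))  # True  -> left for upgrade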
src/pybind/mgr/cephadm/schedule.py
src/pybind/mgr/cephadm/serve.py

diff --git a/src/pybind/mgr/cephadm/schedule.py b/src/pybind/mgr/cephadm/schedule.py
index 5cd68299ed42253b0c8f2adc21849b3dbfe772a9..e71c0345f75604a56fa2d7b63d2051b5e4068b44 100644
--- a/src/pybind/mgr/cephadm/schedule.py
+++ b/src/pybind/mgr/cephadm/schedule.py
@@ -97,7 +97,7 @@ class DaemonPlacement(NamedTuple):
             gen,
         )
 
-    def matches_daemon(self, dd: DaemonDescription) -> bool:
+    def matches_daemon(self, dd: DaemonDescription, upgrade_in_progress: bool = False) -> bool:
         if self.daemon_type != dd.daemon_type:
             return False
         if self.hostname != dd.hostname:
@@ -105,11 +105,16 @@ class DaemonPlacement(NamedTuple):
         # fixme: how to match against network?
         if self.name and self.name != dd.daemon_id:
             return False
-        if self.ports:
-            if self.ports != dd.ports and dd.ports:
-                return False
-            if self.ip != dd.ip and dd.ip:
-                return False
+        # only consider daemon "not matching" on port/ip
+        # differences if we're not mid upgrade. During upgrade
+        # it's very likely we'll deploy the daemon with the
+        # new port/ips as part of the upgrade process
+        if not upgrade_in_progress:
+            if self.ports:
+                if self.ports != dd.ports and dd.ports:
+                    return False
+                if self.ip != dd.ip and dd.ip:
+                    return False
         return True
 
     def matches_rank_map(
@@ -154,6 +159,7 @@ class HostAssignment(object):
                  per_host_daemon_type: Optional[str] = None,
                  rank_map: Optional[Dict[int, Dict[int, Optional[str]]]] = None,
                  blocking_daemon_hosts: Optional[List[orchestrator.HostSpec]] = None,
+                 upgrade_in_progress: bool = False
                  ):
         assert spec
         self.spec = spec  # type: ServiceSpec
@@ -171,6 +177,7 @@ class HostAssignment(object):
         self.per_host_daemon_type = per_host_daemon_type
         self.ports_start = spec.get_port_start()
         self.rank_map = rank_map
+        self.upgrade_in_progress = upgrade_in_progress
 
     def hosts_by_label(self, label: str) -> List[orchestrator.HostSpec]:
         return [h for h in self.hosts if label in h.labels]
@@ -234,7 +241,7 @@ class HostAssignment(object):
             for dd in existing:
                 found = False
                 for p in host_slots:
-                    if p.matches_daemon(dd):
+                    if p.matches_daemon(dd, self.upgrade_in_progress):
                         host_slots.remove(p)
                         found = True
                         break
@@ -311,7 +318,7 @@ class HostAssignment(object):
         for dd in daemons:
             found = False
             for p in others:
-                if p.matches_daemon(dd) and p.matches_rank_map(dd, self.rank_map, ranks):
+                if p.matches_daemon(dd, self.upgrade_in_progress) and p.matches_rank_map(dd, self.rank_map, ranks):
                     others.remove(p)
                     if dd.is_active:
                         existing_active.append(dd)
diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py
index 68dc5e38b34dcecc4eb927f70d565475413f3bf9..fa455c3cb68e0918495b8507a2383ea3219b3de0 100644
--- a/src/pybind/mgr/cephadm/serve.py
+++ b/src/pybind/mgr/cephadm/serve.py
@@ -826,6 +826,7 @@ class CephadmServe:
             primary_daemon_type=svc.primary_daemon_type(spec),
             per_host_daemon_type=svc.per_host_daemon_type(spec),
             rank_map=rank_map,
+            upgrade_in_progress=(self.mgr.upgrade.upgrade_state is not None)
         )
 
         try:
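
The serve.py hunk above derives the flag from whether cephadm has an
active upgrade recorded in mgr.upgrade.upgrade_state. A self-contained
sketch of that check (UpgradeState here is a hypothetical stand-in,
not the real cephadm class):

    from typing import Optional

    class UpgradeState:
        # Stand-in for the state object cephadm keeps while upgrading;
        # the real object carries more than a target image.
        def __init__(self, target_image: str) -> None:
            self.target_image = target_image

    class Upgrade:
        def __init__(self) -> None:
            self.upgrade_state: Optional[UpgradeState] = None

    upgrade = Upgrade()
    # No upgrade recorded: the scheduler enforces ports/ips as usual.
    print(upgrade.upgrade_state is not None)   # False
    upgrade.upgrade_state = UpgradeState('quay.io/ceph/ceph:v19')
    # Upgrade active: port/ip differences are deferred to the upgrade.
    print(upgrade.upgrade_state is not None)   # True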