git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
mgr/cephadm: Stop NFS service/daemon from starting automatically after reboot, cephad...
author Shweta Bhosale <Shweta.Bhosale1@ibm.com>
Thu, 23 Oct 2025 05:50:16 +0000 (11:20 +0530)
committer Shweta Bhosale <Shweta.Bhosale1@ibm.com>
Wed, 29 Oct 2025 06:34:06 +0000 (12:04 +0530)
Fixes: https://tracker.ceph.com/issues/73442
Signed-off-by: Shweta Bhosale <Shweta.Bhosale1@ibm.com>
Resolves: rhbz#2377090

 Conflicts:
src/pybind/mgr/cephadm/serve.py

src/cephadm/cephadm.py
src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/schedule.py
src/pybind/mgr/cephadm/serve.py
src/pybind/mgr/cephadm/tests/test_cephadm.py
src/pybind/mgr/cephadm/tests/test_spec.py
src/pybind/mgr/orchestrator/_interface.py
src/pybind/mgr/orchestrator/tests/test_orchestrator.py

index 6586b57ab68274c2d09bbf7e22efe774c687f882..2c7354ea5289b2c58be98e55aa72a4c6d211f1e9 100755 (executable)
@@ -979,12 +979,15 @@ def deploy_daemon(
             cephadm_agent.deploy_daemon_unit(config_js)
         else:
             if c:
+                # Disable automatic startup for NFS daemons
+                enable_daemon = daemon_type != 'nfs'
                 deploy_daemon_units(
                     ctx,
                     ident,
                     uid,
                     gid,
                     c,
+                    enable=enable_daemon,
                     osd_fsid=osd_fsid,
                     endpoints=endpoints,
                     init_containers=init_containers,
index 90da8143d8657e2220d8cb98ffb255d0c6d99060..845132a43bc4a830046017add140c86486a403f1 100644 (file)
@@ -47,7 +47,7 @@ from ceph.deployment.service_spec import (
 from ceph.deployment.drive_group import DeviceSelection
 from ceph.utils import str_to_datetime, datetime_to_str, datetime_now
 from ceph.cryptotools.select import choose_crypto_caller
-from cephadm.serve import CephadmServe, REQUIRES_POST_ACTIONS
+from cephadm.serve import CephadmServe
 from cephadm.services.cephadmservice import CephadmDaemonDeploySpec
 from cephadm.http_server import CephadmHttpServer
 from cephadm.agent import CephadmAgentHelpers
@@ -1077,6 +1077,12 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
                     'unknown': DaemonDescriptionStatus.error,
                 }[d['state']]
 
+            cached_dd = None
+            try:
+                cached_dd = self.cache.get_daemon(d['name'], host)
+            except OrchestratorError:
+                self.log.debug(f'Could not find daemon {d["name"]} in cache')
+
             sd = orchestrator.DaemonDescription(
                 daemon_type=daemon_type,
                 daemon_id='.'.join(d['name'].split('.')[1:]),
@@ -1106,16 +1112,10 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
                 rank_generation=rank_generation,
                 extra_container_args=d.get('extra_container_args'),
                 extra_entrypoint_args=d.get('extra_entrypoint_args'),
+                pending_daemon_config=cached_dd.pending_daemon_config if cached_dd else False,
+                user_stopped=cached_dd.user_stopped if cached_dd else False,
             )
 
-            if daemon_type in REQUIRES_POST_ACTIONS:
-                # If post action is required for daemon, then restore value of pending_daemon_config
-                try:
-                    cached_dd = self.cache.get_daemon(sd.name(), host)
-                    sd.update_pending_daemon_config(cached_dd.pending_daemon_config)
-                except orchestrator.OrchestratorError:
-                    pass
-
             dm[sd.name()] = sd
         self.log.debug('Refreshed host %s daemons (%d)' % (host, len(dm)))
         self.cache.update_host_daemons(host, dm)
@@ -1131,6 +1131,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
     def offline_hosts_remove(self, host: str) -> None:
         if host in self.offline_hosts:
             self.offline_hosts.remove(host)
+            self._invalidate_all_host_metadata_and_kick_serve(host)
 
     def update_failed_daemon_health_check(self) -> None:
         failed_daemons = []
@@ -2801,8 +2802,12 @@ Then run the following:
                     out, err, code = self.wait_async(CephadmServe(self)._run_cephadm(
                         daemon_spec.host, name, 'unit',
                         ['--name', name, a]))
-            except Exception:
-                self.log.exception(f'`{daemon_spec.host}: cephadm unit {name} {a}` failed')
+            except Exception as exp:
+                if a == 'reset-failed' and daemon_spec.daemon_type in ['nfs'] and 'not loaded' in str(exp):
+                    # Don't log exception if reset-failed fails because the unit is not loaded
+                    pass
+                else:
+                    self.log.exception(f'`{daemon_spec.host}: cephadm unit {name} {a}` failed')
         self.cache.invalidate_host_daemons(daemon_spec.host)
         msg = "{} {} from host '{}'".format(action, name, daemon_spec.host)
         self.events.for_daemon(name, 'INFO', msg)
@@ -2830,6 +2835,7 @@ Then run the following:
         d = self.cache.get_daemon(daemon_name)
         assert d.daemon_type is not None
         assert d.daemon_id is not None
+        assert d.hostname
 
         if (action == 'redeploy' or action == 'restart') and self.daemon_is_self(d.daemon_type, d.daemon_id) \
                 and not self.mgr_service.mgr_map_has_standby():
@@ -2849,6 +2855,14 @@ Then run the following:
                     f'key rotation not supported for {d.daemon_type}'
                 )
 
+        # Track user-initiated stop/start actions
+        if action == 'stop':
+            d.user_stopped = True
+            self.cache.update_host_daemons(d.hostname, {d.name(): d})
+        elif action in ['start', 'restart']:
+            d.user_stopped = False
+            self.cache.update_host_daemons(d.hostname, {d.name(): d})
+
         self._daemon_action_set_image(action, image, d.daemon_type, d.daemon_id)
 
         self.log.info(f'Schedule {action} daemon {daemon_name}')
index 7d43bffa25fab27b878ac32f024e66d99414784f..4ddaa01d393ba0efbe108031e662c20eb97b092c 100644 (file)
@@ -354,6 +354,7 @@ class HostAssignment(object):
 
         # get candidate hosts based on [hosts, label(s), host_pattern]
         candidates = self.get_candidates()  # type: List[DaemonPlacement]
+        all_candidates = candidates
         if self.primary_daemon_type in RESCHEDULE_FROM_OFFLINE_HOSTS_TYPES:
             # remove unreachable hosts that are not in maintenance so daemons
             # on these hosts will be rescheduled
@@ -396,7 +397,7 @@ class HostAssignment(object):
         existing_slots: List[DaemonPlacement] = []
         to_add: List[DaemonPlacement] = []
         to_remove: List[orchestrator.DaemonDescription] = []
-        ranks: List[int] = list(range(len(candidates)))
+        ranks: List[int] = list(range(len(all_candidates)))
         others: List[DaemonPlacement] = candidates.copy()
         for dd in daemons:
             found = False
index b4c972ad4ba939c6809d21cd0cdbd86b700a7f62..de04c50c0a3768d91672ade5c87afd2fbe1d3b50 100644 (file)
@@ -53,6 +53,7 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)
 
 REQUIRES_POST_ACTIONS = ['grafana', 'iscsi', 'prometheus', 'alertmanager', 'rgw', 'nvmeof', 'mgmt-gateway']
+DISABLED_SERVICES = ['nfs']
 
 WHICH = ssh.RemoteExecutable('which')
 CEPHADM_EXE = ssh.RemoteExecutable('/usr/bin/cephadm')
@@ -1413,6 +1414,10 @@ class CephadmServe:
                     dd.daemon_type in CEPH_TYPES:
                 self.log.info('Reconfiguring %s (extra config changed)...' % dd.name())
                 action = 'reconfig'
+            elif dd.daemon_type in DISABLED_SERVICES:
+                if dd.status == 0 and not dd.user_stopped:
+                    self.log.debug(f'Starting daemon {dd.name()}')
+                    action = 'start'
 
             if action:
                 if self.mgr.cache.get_scheduled_daemon_action(dd.hostname, dd.name()) == 'redeploy' \
index c62290da9425f189ce7895d98d76f894f42286ab..a27b1bf025f46cf75974e3648e699c5488833e13 100644 (file)
@@ -247,6 +247,7 @@ class TestCephadm(object):
                         'is_active': False,
                         'ports': [],
                         'pending_daemon_config': False,
+                        'user_stopped': False
                     }
                 ]
 
index c90b1e7ae7930980d2bb8d0a1f72fafd948b3891..e3f6e110ea5a088a1c09ead0faf726fe2761fb7c 100644 (file)
@@ -299,7 +299,7 @@ def test_dd_octopus(dd_json):
         del j['daemon_name']
         return j
 
-    dd_json.update({'pending_daemon_config': False})
+    dd_json.update({'pending_daemon_config': False, 'user_stopped': False})
     assert dd_json == convert_to_old_style_json(
         DaemonDescription.from_json(dd_json).to_json())
 
index 20c47ea9350cace5fe14a9ad5a94c6449fc39f6a..526dc663823acc21cb1aaca97f6ebc22f6b21ad0 100644 (file)
@@ -1283,7 +1283,8 @@ class DaemonDescription(object):
                  rank_generation: Optional[int] = None,
                  extra_container_args: Optional[GeneralArgList] = None,
                  extra_entrypoint_args: Optional[GeneralArgList] = None,
-                 pending_daemon_config: bool = False
+                 pending_daemon_config: bool = False,
+                 user_stopped: bool = False
                  ) -> None:
 
         #: Host is at the same granularity as InventoryHost
@@ -1359,6 +1360,7 @@ class DaemonDescription(object):
             self.extra_entrypoint_args = ArgumentSpec.from_general_args(
                 extra_entrypoint_args)
         self.pending_daemon_config = pending_daemon_config
+        self.user_stopped = user_stopped
 
     def __setattr__(self, name: str, value: Any) -> None:
         if value is not None and name in ('extra_container_args', 'extra_entrypoint_args'):
@@ -1517,6 +1519,7 @@ class DaemonDescription(object):
         out['rank_generation'] = self.rank_generation
         out['systemd_unit'] = self.systemd_unit
         out['pending_daemon_config'] = self.pending_daemon_config
+        out['user_stopped'] = self.user_stopped
 
         for k in ['last_refresh', 'created', 'started', 'last_deployed',
                   'last_configured']:
@@ -1555,6 +1558,7 @@ class DaemonDescription(object):
         out['ip'] = self.ip
         out['systemd_unit'] = self.systemd_unit
         out['pending_daemon_config'] = self.pending_daemon_config
+        out['user_stopped'] = self.user_stopped
 
         for k in ['last_refresh', 'created', 'started', 'last_deployed',
                   'last_configured']:
index 5448239215e0a58f6fa89377d1a7fa2f9c669157..38d087c17ea2c203eba3c1dc4b4b63a419f9d737 100644 (file)
@@ -93,6 +93,7 @@ status: 1
 status_desc: starting
 is_active: false
 pending_daemon_config: false
+user_stopped: false
 events:
 - 2020-06-10T10:08:22.933241Z daemon:crash.ubuntu [INFO] "Deployed crash.ubuntu on
   host 'ubuntu'"