From: Shweta Bhosale Date: Thu, 23 Oct 2025 05:50:16 +0000 (+0530) Subject: mgr/cephadm: Stop NFS service/daemon from starting automatically after reboot, cephad... X-Git-Tag: testing/wip-pdonnell-testing-20260323.122957-tentacle~390 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=5896f9bb3e9e535d5ef0d753b274bbedf3d225a3;p=ceph-ci.git mgr/cephadm: Stop NFS service/daemon from starting automatically after reboot, cephadm to manage startup Fixes: https://tracker.ceph.com/issues/73442 Signed-off-by: Shweta Bhosale Resolves: rhbz#2377090 Conflicts: src/pybind/mgr/cephadm/serve.py --- diff --git a/src/cephadm/cephadm.py b/src/cephadm/cephadm.py index 6586b57ab68..2c7354ea528 100755 --- a/src/cephadm/cephadm.py +++ b/src/cephadm/cephadm.py @@ -979,12 +979,15 @@ def deploy_daemon( cephadm_agent.deploy_daemon_unit(config_js) else: if c: + # Disable automatic startup for NFS daemons + enable_daemon = daemon_type != 'nfs' deploy_daemon_units( ctx, ident, uid, gid, c, + enable=enable_daemon, osd_fsid=osd_fsid, endpoints=endpoints, init_containers=init_containers, diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 90da8143d86..845132a43bc 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -47,7 +47,7 @@ from ceph.deployment.service_spec import ( from ceph.deployment.drive_group import DeviceSelection from ceph.utils import str_to_datetime, datetime_to_str, datetime_now from ceph.cryptotools.select import choose_crypto_caller -from cephadm.serve import CephadmServe, REQUIRES_POST_ACTIONS +from cephadm.serve import CephadmServe from cephadm.services.cephadmservice import CephadmDaemonDeploySpec from cephadm.http_server import CephadmHttpServer from cephadm.agent import CephadmAgentHelpers @@ -1077,6 +1077,12 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, 'unknown': DaemonDescriptionStatus.error, }[d['state']] + cached_dd = None + try: + cached_dd = self.cache.get_daemon(d['name'], host) + except OrchestratorError: + self.log.debug(f'Could not find daemon {d["name"]} in cache') + sd = orchestrator.DaemonDescription( daemon_type=daemon_type, daemon_id='.'.join(d['name'].split('.')[1:]), @@ -1106,16 +1112,10 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, rank_generation=rank_generation, extra_container_args=d.get('extra_container_args'), extra_entrypoint_args=d.get('extra_entrypoint_args'), + pending_daemon_config=cached_dd.pending_daemon_config if cached_dd else False, + user_stopped=cached_dd.user_stopped if cached_dd else False, ) - if daemon_type in REQUIRES_POST_ACTIONS: - # If post action is required for daemon, then restore value of pending_daemon_config - try: - cached_dd = self.cache.get_daemon(sd.name(), host) - sd.update_pending_daemon_config(cached_dd.pending_daemon_config) - except orchestrator.OrchestratorError: - pass - dm[sd.name()] = sd self.log.debug('Refreshed host %s daemons (%d)' % (host, len(dm))) self.cache.update_host_daemons(host, dm) @@ -1131,6 +1131,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, def offline_hosts_remove(self, host: str) -> None: if host in self.offline_hosts: self.offline_hosts.remove(host) + self._invalidate_all_host_metadata_and_kick_serve(host) def update_failed_daemon_health_check(self) -> None: failed_daemons = [] @@ -2801,8 +2802,12 @@ Then run the following: out, err, code = self.wait_async(CephadmServe(self)._run_cephadm( daemon_spec.host, name, 'unit', ['--name', name, a])) - except Exception: - self.log.exception(f'`{daemon_spec.host}: cephadm unit {name} {a}` failed') + except Exception as exp: + if a == 'reset-failed' and daemon_spec.daemon_type in ['nfs'] and 'not loaded' in str(exp): + # Don't log exception if reset-failed fails because the unit is not loaded + pass + else: + self.log.exception(f'`{daemon_spec.host}: cephadm unit {name} {a}` failed') self.cache.invalidate_host_daemons(daemon_spec.host) msg = "{} {} from host '{}'".format(action, name, daemon_spec.host) self.events.for_daemon(name, 'INFO', msg) @@ -2830,6 +2835,7 @@ Then run the following: d = self.cache.get_daemon(daemon_name) assert d.daemon_type is not None assert d.daemon_id is not None + assert d.hostname if (action == 'redeploy' or action == 'restart') and self.daemon_is_self(d.daemon_type, d.daemon_id) \ and not self.mgr_service.mgr_map_has_standby(): @@ -2849,6 +2855,14 @@ Then run the following: f'key rotation not supported for {d.daemon_type}' ) + # Track user-initiated stop/start actions + if action == 'stop': + d.user_stopped = True + self.cache.update_host_daemons(d.hostname, {d.name(): d}) + elif action in ['start', 'restart']: + d.user_stopped = False + self.cache.update_host_daemons(d.hostname, {d.name(): d}) + self._daemon_action_set_image(action, image, d.daemon_type, d.daemon_id) self.log.info(f'Schedule {action} daemon {daemon_name}') diff --git a/src/pybind/mgr/cephadm/schedule.py b/src/pybind/mgr/cephadm/schedule.py index 7d43bffa25f..4ddaa01d393 100644 --- a/src/pybind/mgr/cephadm/schedule.py +++ b/src/pybind/mgr/cephadm/schedule.py @@ -354,6 +354,7 @@ class HostAssignment(object): # get candidate hosts based on [hosts, label(s), host_pattern] candidates = self.get_candidates() # type: List[DaemonPlacement] + all_candidates = candidates if self.primary_daemon_type in RESCHEDULE_FROM_OFFLINE_HOSTS_TYPES: # remove unreachable hosts that are not in maintenance so daemons # on these hosts will be rescheduled @@ -396,7 +397,7 @@ class HostAssignment(object): existing_slots: List[DaemonPlacement] = [] to_add: List[DaemonPlacement] = [] to_remove: List[orchestrator.DaemonDescription] = [] - ranks: List[int] = list(range(len(candidates))) + ranks: List[int] = list(range(len(all_candidates))) others: List[DaemonPlacement] = candidates.copy() for dd in daemons: found = False diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index b4c972ad4ba..de04c50c0a3 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -53,6 +53,7 @@ if TYPE_CHECKING: logger = logging.getLogger(__name__) REQUIRES_POST_ACTIONS = ['grafana', 'iscsi', 'prometheus', 'alertmanager', 'rgw', 'nvmeof', 'mgmt-gateway'] +DISABLED_SERVICES = ['nfs'] WHICH = ssh.RemoteExecutable('which') CEPHADM_EXE = ssh.RemoteExecutable('/usr/bin/cephadm') @@ -1413,6 +1414,10 @@ class CephadmServe: dd.daemon_type in CEPH_TYPES: self.log.info('Reconfiguring %s (extra config changed)...' % dd.name()) action = 'reconfig' + elif dd.daemon_type in DISABLED_SERVICES: + if dd.status == 0 and not dd.user_stopped: + self.log.debug(f'Starting daemon {dd.name()}') + action = 'start' if action: if self.mgr.cache.get_scheduled_daemon_action(dd.hostname, dd.name()) == 'redeploy' \ diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index c62290da942..a27b1bf025f 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -247,6 +247,7 @@ class TestCephadm(object): 'is_active': False, 'ports': [], 'pending_daemon_config': False, + 'user_stopped': False } ] diff --git a/src/pybind/mgr/cephadm/tests/test_spec.py b/src/pybind/mgr/cephadm/tests/test_spec.py index c90b1e7ae79..e3f6e110ea5 100644 --- a/src/pybind/mgr/cephadm/tests/test_spec.py +++ b/src/pybind/mgr/cephadm/tests/test_spec.py @@ -299,7 +299,7 @@ def test_dd_octopus(dd_json): del j['daemon_name'] return j - dd_json.update({'pending_daemon_config': False}) + dd_json.update({'pending_daemon_config': False, 'user_stopped': False}) assert dd_json == convert_to_old_style_json( DaemonDescription.from_json(dd_json).to_json()) diff --git a/src/pybind/mgr/orchestrator/_interface.py b/src/pybind/mgr/orchestrator/_interface.py index 20c47ea9350..526dc663823 100644 --- a/src/pybind/mgr/orchestrator/_interface.py +++ b/src/pybind/mgr/orchestrator/_interface.py @@ -1283,7 +1283,8 @@ class DaemonDescription(object): rank_generation: Optional[int] = None, extra_container_args: Optional[GeneralArgList] = None, extra_entrypoint_args: Optional[GeneralArgList] = None, - pending_daemon_config: bool = False + pending_daemon_config: bool = False, + user_stopped: bool = False ) -> None: #: Host is at the same granularity as InventoryHost @@ -1359,6 +1360,7 @@ class DaemonDescription(object): self.extra_entrypoint_args = ArgumentSpec.from_general_args( extra_entrypoint_args) self.pending_daemon_config = pending_daemon_config + self.user_stopped = user_stopped def __setattr__(self, name: str, value: Any) -> None: if value is not None and name in ('extra_container_args', 'extra_entrypoint_args'): @@ -1517,6 +1519,7 @@ class DaemonDescription(object): out['rank_generation'] = self.rank_generation out['systemd_unit'] = self.systemd_unit out['pending_daemon_config'] = self.pending_daemon_config + out['user_stopped'] = self.user_stopped for k in ['last_refresh', 'created', 'started', 'last_deployed', 'last_configured']: @@ -1555,6 +1558,7 @@ class DaemonDescription(object): out['ip'] = self.ip out['systemd_unit'] = self.systemd_unit out['pending_daemon_config'] = self.pending_daemon_config + out['user_stopped'] = self.user_stopped for k in ['last_refresh', 'created', 'started', 'last_deployed', 'last_configured']: diff --git a/src/pybind/mgr/orchestrator/tests/test_orchestrator.py b/src/pybind/mgr/orchestrator/tests/test_orchestrator.py index 5448239215e..38d087c17ea 100644 --- a/src/pybind/mgr/orchestrator/tests/test_orchestrator.py +++ b/src/pybind/mgr/orchestrator/tests/test_orchestrator.py @@ -93,6 +93,7 @@ status: 1 status_desc: starting is_active: false pending_daemon_config: false +user_stopped: false events: - 2020-06-10T10:08:22.933241Z daemon:crash.ubuntu [INFO] "Deployed crash.ubuntu on host 'ubuntu'"