From: Shweta Bhosale Date: Mon, 22 Dec 2025 11:24:36 +0000 (+0530) Subject: mgr/cephadm: Handle cephadm set user configuration for add node X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=46bcde4f812afb6a822ecb782815c49156674ea9;p=ceph-ci.git mgr/cephadm: Handle cephadm set user configuration for add node Fixes: https://tracker.ceph.com/issues/74045 Signed-off-by: Shweta Bhosale --- diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 6c823e3a18d..c114e00ed0e 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -523,6 +523,10 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule): self.event = Event() self.ssh = ssh.SSHManager(self) + # Track hosts being added - these hosts will use root user temporarily + # even if cluster is configured to use non-root user + self.hosts_being_added: Set[str] = set() + if self.get_store('pause'): self.paused = True else: @@ -1983,44 +1987,111 @@ Then run the following: :param host: host name """ HostSpec.validate(spec) - ip_addr = self._check_valid_addr(spec.hostname, spec.addr) - if spec.addr == spec.hostname and ip_addr: - spec.addr = ip_addr - - if spec.hostname in self.inventory and self.inventory.get_addr(spec.hostname) != spec.addr: - self.cache.refresh_all_host_info(spec.hostname) - - if spec.oob: - if not spec.oob.get('addr'): - spec.oob['addr'] = self.oob_default_addr - if not spec.oob.get('port'): - spec.oob['port'] = '443' - host_oob_info = dict() - host_oob_info['addr'] = spec.oob['addr'] - host_oob_info['port'] = spec.oob['port'] - host_oob_info['username'] = spec.oob['username'] - host_oob_info['password'] = spec.oob['password'] - self.node_proxy_cache.update_oob(spec.hostname, host_oob_info) - - # prime crush map? - if spec.location: - self.check_mon_command({ - 'prefix': 'osd crush add-bucket', - 'name': spec.hostname, - 'type': 'host', - 'args': [f'{k}={v}' for k, v in spec.location.items()], - }) - if spec.hostname not in self.inventory: - self.cache.prime_empty_host(spec.hostname) - self.inventory.add_host(spec) - self.offline_hosts_remove(spec.hostname) - if spec.status == 'maintenance': - self.update_maintenance_healthcheck() - self.event.set() # refresh stray health check - self.log.info('Added host %s' % spec.hostname) + # Check if this is a new host BEFORE any SSH operations + is_new_host = spec.hostname not in self.inventory + if is_new_host and self.ssh_user and self.ssh_user != 'root': + try: + self.hosts_being_added.add(spec.hostname) + self.log.info(f'Adding new host {spec.hostname}, will use root user temporarily for setup') + except Exception as e: + self.log.warning(f'Failed to add {spec.hostname} to hosts_being_added tracking: {e}') + + try: + ip_addr = self._check_valid_addr(spec.hostname, spec.addr) + if spec.addr == spec.hostname and ip_addr: + spec.addr = ip_addr + if spec.hostname in self.inventory and self.inventory.get_addr(spec.hostname) != spec.addr: + self.cache.refresh_all_host_info(spec.hostname) + + if spec.oob: + if not spec.oob.get('addr'): + spec.oob['addr'] = self.oob_default_addr + if not spec.oob.get('port'): + spec.oob['port'] = '443' + host_oob_info = dict() + host_oob_info['addr'] = spec.oob['addr'] + host_oob_info['port'] = spec.oob['port'] + host_oob_info['username'] = spec.oob['username'] + host_oob_info['password'] = spec.oob['password'] + self.node_proxy_cache.update_oob(spec.hostname, host_oob_info) + + # prime crush map? + if spec.location: + self.check_mon_command({ + 'prefix': 'osd crush add-bucket', + 'name': spec.hostname, + 'type': 'host', + 'args': [f'{k}={v}' for k, v in spec.location.items()], + }) - return "Added host '{}' with addr '{}'".format(spec.hostname, spec.addr) + if spec.hostname not in self.inventory: + self.cache.prime_empty_host(spec.hostname) + self.inventory.add_host(spec) + self.offline_hosts_remove(spec.hostname) + if spec.status == 'maintenance': + self.update_maintenance_healthcheck() + self.event.set() # refresh stray health check + self.log.info('Added host %s' % spec.hostname) + + # If this is a new host and using non-root user, setup the user now + if is_new_host and self.ssh_user and self.ssh_user != 'root': + self.log.info(f'Setting up user {self.ssh_user} on new host {spec.hostname}') + try: + assert self.ssh_pub + self._setup_user_on_host(spec.hostname, self.ssh_user, self.ssh_pub, addr=spec.addr) + self.log.info(f'Successfully set up user {self.ssh_user} on {spec.hostname}') + except OrchestratorError as oe: + # OrchestratorError from user setup (user doesn't exist, SSH failures, etc.) + # Log warning but don't fail the add_host operation + self.log.warning(f'Failed to setup user {self.ssh_user} on {spec.hostname}: {oe}') + self.log.warning( + f'Host {spec.hostname} added but user setup incomplete. ' + f'Please manually setup user {self.ssh_user} on {spec.hostname}') + except Exception as e: + # Unexpected error during user setup + # Log warning but don't fail the add_host operation + self.log.error(f'Unexpected error setting up user {self.ssh_user} on {spec.hostname}: {e}') + self.log.warning(f'You may need to manually setup user {self.ssh_user} on {spec.hostname}') + finally: + # Always remove from hosts_being_added after setup attempt + try: + self.hosts_being_added.discard(spec.hostname) + except Exception as discard_err: + self.log.debug(f'Failed to remove {spec.hostname} from hosts_being_added: {discard_err}') + + # Reset SSH connection so next operations will use configured user + try: + self.ssh.reset_con(spec.hostname) + self.log.debug(f'Reset SSH connection for {spec.hostname} to use configured user') + except Exception as reset_err: + # Connection reset failure is not critical - connection will be recreated on next use + self.log.debug(f'Failed to reset SSH connection for {spec.hostname}: {reset_err}') + elif is_new_host: + # Root user, no need to track + try: + self.hosts_being_added.discard(spec.hostname) + except Exception as e: + self.log.debug(f'Failed to remove {spec.hostname} from hosts_being_added: {e}') + + return "Added host '{}' with addr '{}'".format(spec.hostname, spec.addr) + except Exception: + # If anything fails in core add_host operations, cleanup the new tracking mechanisms + if is_new_host and self.ssh_user and self.ssh_user != 'root': + # Cleanup tracking set + try: + self.hosts_being_added.discard(spec.hostname) + self.log.debug(f'Cleaned up hosts_being_added tracking for {spec.hostname} after error') + except Exception as cleanup_err: + self.log.debug(f'Failed to cleanup hosts_being_added for {spec.hostname}: {cleanup_err}') + + # Cleanup any stale SSH connection + try: + self.ssh.reset_con(spec.hostname) + self.log.debug(f'Reset SSH connection for {spec.hostname} after error') + except Exception as reset_err: + self.log.debug(f'Failed to reset SSH connection for {spec.hostname}: {reset_err}') + raise @handle_orch_error def add_host(self, spec: HostSpec) -> str: diff --git a/src/pybind/mgr/cephadm/ssh.py b/src/pybind/mgr/cephadm/ssh.py index acb5a77c51b..7cf0c4de395 100644 --- a/src/pybind/mgr/cephadm/ssh.py +++ b/src/pybind/mgr/cephadm/ssh.py @@ -151,14 +151,19 @@ class SSHManager: addr: Optional[str] = None, ) -> "SSHClientConnection": if not self.cons.get(host) or host not in self.mgr.inventory: + # If host is getting added, use root user. Once add node completes it will + # reset this connection + is_host_being_added = host in self.mgr.hosts_being_added + ssh_user = 'root' if is_host_being_added else self.mgr.ssh_user + if not addr and host in self.mgr.inventory: addr = self.mgr.inventory.get_addr(host) if not addr: raise OrchestratorError("host address is empty") - assert self.mgr.ssh_user - n = self.mgr.ssh_user + '@' + addr + assert ssh_user + n = ssh_user + '@' + addr logger.debug("Opening connection to {} with ssh options '{}'".format( n, self.mgr._ssh_options)) @@ -171,7 +176,7 @@ class SSHManager: keepalive_interval=self.mgr.ssh_keepalive_interval, keepalive_count_max=self.mgr.ssh_keepalive_count_max ) - conn = await asyncssh.connect(addr, username=self.mgr.ssh_user, client_keys=[self.mgr.tkey.name], + conn = await asyncssh.connect(addr, username=ssh_user, client_keys=[self.mgr.tkey.name], known_hosts=None, config=[self.mgr.ssh_config_fname], preferred_auth=['publickey'], options=ssh_options) except OSError: @@ -233,7 +238,14 @@ class SSHManager: ) -> Tuple[str, str, int]: conn = await self._remote_connection(host, addr) - use_sudo = (self.mgr.ssh_user != 'root') + + # For hosts being added, always use root (no sudo) even if cluster + # is configured to use non-root user. This allows initial setup. + is_host_being_added = host in self.mgr.hosts_being_added + use_sudo = (self.mgr.ssh_user != 'root') and not is_host_being_added + if is_host_being_added: + logger.debug(f'Host {host} is being added, using root user without sudo') + rcmd = RemoteSudoCommand.wrap(cmd_components, use_sudo=use_sudo) try: address = addr or self.mgr.inventory.get_addr(host)