From: Sage Weil
Date: Fri, 23 Apr 2021 19:33:23 +0000 (-0400)
Subject: mgr/cephadm: enable ranked daemons for nfs
X-Git-Tag: v16.2.5~87^2~61
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=8f2c20bf8d8ffd430c38b2155676b9e47b7cffeb;p=ceph.git

mgr/cephadm: enable ranked daemons for nfs

Use ranked daemons for NFS. Ganesha does not like it if multiple
instances start up with the same rank, but we need stable ranks so
that a rank can "fail over" to a new instance of a new daemon on
another host (with the same rank) for NFS client reclaim to work.

Specify a nodeid of '{service_name}.{rank}' for ganesha.

Include a unique id in the daemon_id just because this avoids some
issues with the create/destroy ordering, and because the daemon_id
doesn't matter much anymore since we are using a stable rank.

Signed-off-by: Sage Weil
(cherry picked from commit a7d65f00ea554078f6e8f058e9ec137230902964)
---

diff --git a/src/cephadm/cephadm b/src/cephadm/cephadm
index b025fe98675..e662194a6ed 100755
--- a/src/cephadm/cephadm
+++ b/src/cephadm/cephadm
@@ -516,7 +516,14 @@ class NFSGanesha(object):
             args += ['--ns', self.namespace]
         if self.userid:
             args += ['--userid', self.userid]
-        args += [action, self.get_daemon_name()]
+
+        meta = json.loads(self.ctx.meta_json)
+        if 'service_name' in meta and 'rank' in meta:
+            nodeid = f"{meta['service_name']}.{meta['rank']}"
+        else:
+            nodeid = self.daemon_id
+
+        args += [action, nodeid]

         data_dir = get_data_dir(self.fsid, self.ctx.data_dir,
                                 self.daemon_type, self.daemon_id)
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index 15e0ae60498..fb77d0ff58a 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -584,7 +584,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
         Generate a unique random service name
         """
         suffix = daemon_type not in [
-            'mon', 'crash', 'nfs',
+            'mon', 'crash',
             'prometheus', 'node-exporter', 'grafana', 'alertmanager',
             'container', 'cephadm-exporter',
         ]
diff --git a/src/pybind/mgr/cephadm/services/nfs.py b/src/pybind/mgr/cephadm/services/nfs.py
index e5773bfb194..4e4e0153e8d 100644
--- a/src/pybind/mgr/cephadm/services/nfs.py
+++ b/src/pybind/mgr/cephadm/services/nfs.py
@@ -4,7 +4,7 @@ from typing import Dict, Tuple, Any, List, cast, Optional

 from mgr_module import HandleCommandResult

-from ceph.deployment.service_spec import NFSServiceSpec
+from ceph.deployment.service_spec import ServiceSpec, NFSServiceSpec
 import rados

 from orchestrator import DaemonDescription
@@ -17,6 +17,38 @@ logger = logging.getLogger(__name__)
 class NFSService(CephService):
     TYPE = 'nfs'

+    def ranked(self) -> bool:
+        return True
+
+    def fence(self, daemon_id: str) -> None:
+        logger.info(f'Fencing old nfs.{daemon_id}')
+        ret, out, err = self.mgr.mon_command({
+            'prefix': 'auth rm',
+            'entity': f'client.nfs.{daemon_id}',
+        })
+
+        # TODO: block/fence this entity (in case it is still running somewhere)
+
+    def fence_old_ranks(self,
+                        spec: ServiceSpec,
+                        rank_map: Dict[int, Dict[int, Optional[str]]],
+                        num_ranks: int) -> None:
+        for rank, m in list(rank_map.items()):
+            if rank >= num_ranks:
+                for daemon_id in m.values():
+                    if daemon_id is not None:
+                        self.fence(daemon_id)
+                del rank_map[rank]
+                self.mgr.spec_store.save_rank_map(spec.service_name(), rank_map)
+            else:
+                max_gen = max(m.keys())
+                for gen, daemon_id in list(m.items()):
+                    if gen < max_gen:
+                        if daemon_id is not None:
+                            self.fence(daemon_id)
+                        del rank_map[rank][gen]
+                        self.mgr.spec_store.save_rank_map(spec.service_name(), rank_map)
+
     def config(self, spec: NFSServiceSpec, daemon_id: str) -> None:  # type: ignore
         assert self.TYPE == spec.service_type
         assert spec.pool
@@ -51,7 +83,7 @@ class NFSService(CephService):
         # generate the ganesha config
         def get_ganesha_conf() -> str:
             context = dict(user=rados_user,
-                           nodeid=daemon_spec.name(),
+                           nodeid=f'{daemon_spec.service_name}.{daemon_spec.rank}',
                            pool=spec.pool,
                            namespace=spec.namespace if spec.namespace else '',
                            rgw_user=rgw_user,
@@ -141,7 +173,7 @@ class NFSService(CephService):
         entity: AuthEntity = self.get_auth_entity(f'{daemon_id}-rgw')

         logger.info(f'Removing key for {entity}')
-        ret, out, err = self.mgr.check_mon_command({
+        self.mgr.check_mon_command({
             'prefix': 'auth rm',
             'entity': entity,
         })