From 7296e47e6f533e7014b14eea20c9e5a4c3e2327a Mon Sep 17 00:00:00 2001 From: Kritik Sachdeva Date: Fri, 28 Jun 2024 17:53:37 +0530 Subject: [PATCH] mgr/rgw: Adding a retry config while calling zone_create() Fixes https://tracker.ceph.com/issues/66750 Signed-off-by: Kritik Sachdeva (cherry picked from commit 480253eaea553b3827a9d584ba8b45a32c845386) Conflicts: src/pybind/mgr/rgw/module.py --- src/pybind/mgr/rgw/module.py | 17 ++++++++++++++--- src/python-common/ceph/rgw/rgwam_core.py | 22 ++++++++++++++++++++-- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/src/pybind/mgr/rgw/module.py b/src/pybind/mgr/rgw/module.py index fb2b9789956..e712099510c 100644 --- a/src/pybind/mgr/rgw/module.py +++ b/src/pybind/mgr/rgw/module.py @@ -101,7 +101,14 @@ def check_orchestrator(func: FuncT) -> FuncT: class Module(orchestrator.OrchestratorClientMixin, MgrModule): - MODULE_OPTIONS: List[Option] = [] + MODULE_OPTIONS: List[Option] = [ + Option( + 'secondary_zone_period_retry_limit', + type='int', + default=5, + desc='RGW module period update retry limit for secondary site' + ), + ] # These are "native" Ceph options that this module cares about. 
NATIVE_OPTIONS: List[Option] = [] @@ -115,6 +122,9 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule): # ensure config options members are initialized; see config_notify() self.config_notify() + if TYPE_CHECKING: + self.secondary_zone_period_retry_limit = 5 + with self.lock: self.inited = True self.env = EnvArgs(RGWAMOrchMgr(self)) @@ -300,7 +310,7 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule): try: created_zones = self.rgw_zone_create(zone_name, realm_token, port, placement, - start_radosgw, zone_endpoints, inbuf) + start_radosgw, zone_endpoints, self.secondary_zone_period_retry_limit, inbuf) return HandleCommandResult(retval=0, stdout=f"Zones {', '.join(created_zones)} created successfully") except RGWAMException as e: return HandleCommandResult(retval=e.retcode, stderr=f'Failed to create zone: {str(e)}') @@ -312,6 +322,7 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule): placement: Optional[Union[str, Dict[str, Any]]] = None, start_radosgw: Optional[bool] = True, zone_endpoints: Optional[str] = None, + secondary_zone_period_retry_limit: Optional[int] = None, inbuf: Optional[str] = None) -> List[str]: if inbuf: @@ -338,7 +349,7 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule): try: created_zones = [] for rgw_spec in rgw_specs: - RGWAM(self.env).zone_create(rgw_spec, start_radosgw) + RGWAM(self.env).zone_create(rgw_spec, start_radosgw, secondary_zone_period_retry_limit) if rgw_spec.rgw_zone is not None: created_zones.append(rgw_spec.rgw_zone) return created_zones diff --git a/src/python-common/ceph/rgw/rgwam_core.py b/src/python-common/ceph/rgw/rgwam_core.py index 64e7180ad57..83e13119a93 100644 --- a/src/python-common/ceph/rgw/rgwam_core.py +++ b/src/python-common/ceph/rgw/rgwam_core.py @@ -744,7 +744,7 @@ class RGWAM: "secret": secret}) return realms_info - def zone_create(self, rgw_spec, start_radosgw): + def zone_create(self, rgw_spec, start_radosgw, secondary_zone_period_retry_limit=5): if not 
rgw_spec.rgw_realm_token: raise RGWAMException('missing realm token') @@ -781,7 +781,25 @@ class RGWAM: zone = self.create_zone(realm, zonegroup, rgw_spec.rgw_zone, False, # secondary zone access_key, secret, endpoints=rgw_spec.zone_endpoints) - self.update_period(realm, zonegroup, zone) + + # Adding a retry limit for period update in case the default 10s timeout is not sufficient + rgw_limit = 0 + + while rgw_limit != int(secondary_zone_period_retry_limit): + try: + self.update_period(realm, zonegroup, zone) + break + except RGWAMException as e: + logging.info(f'Failed to update Period in 10s. Retrying with current limit \ + & retry-limit values {rgw_limit} {secondary_zone_period_retry_limit}') + rgw_limit += 1 + if rgw_limit == secondary_zone_period_retry_limit: + raise RGWAMException(f'Period Update failed for zone {zone}. \ + Exception raised while period update {e.message}') + continue + + # By default the above operation is expected to complete within the 10s timeout, but when + # updating the period for a secondary site it can take longer because of pool creation period = RGWPeriod(period_info) logging.debug(period.to_json()) -- 2.39.5