git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/rgw: Adding a retry config while calling zone_create() 61717/head
author Kritik Sachdeva <sachdevakritik.440@gmail.com>
Fri, 28 Jun 2024 12:23:37 +0000 (17:53 +0530)
committer Adam King <adking@redhat.com>
Wed, 19 Feb 2025 18:35:05 +0000 (13:35 -0500)
Fixes https://tracker.ceph.com/issues/66750

Signed-off-by: Kritik Sachdeva <sachdevakritik.440@gmail.com>
(cherry picked from commit 480253eaea553b3827a9d584ba8b45a32c845386)

Conflicts:
src/pybind/mgr/rgw/module.py

src/pybind/mgr/rgw/module.py
src/python-common/ceph/rgw/rgwam_core.py

index fb2b978995685c7f2eb37ce27df808253cf3d8b2..e712099510c40ae6e5346fb691e189e00fa8ab95 100644 (file)
@@ -101,7 +101,14 @@ def check_orchestrator(func: FuncT) -> FuncT:
 
 
 class Module(orchestrator.OrchestratorClientMixin, MgrModule):
-    MODULE_OPTIONS: List[Option] = []
+    MODULE_OPTIONS: List[Option] = [
+        Option(
+            'secondary_zone_period_retry_limit',
+            type='int',
+            default=5,
+            desc='RGW module period update retry limit for secondary site'
+        ),
+    ]
 
     # These are "native" Ceph options that this module cares about.
     NATIVE_OPTIONS: List[Option] = []
@@ -115,6 +122,9 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule):
         # ensure config options members are initialized; see config_notify()
         self.config_notify()
 
+        if TYPE_CHECKING:
+            self.secondary_zone_period_retry_limit = 5
+
         with self.lock:
             self.inited = True
             self.env = EnvArgs(RGWAMOrchMgr(self))
@@ -300,7 +310,7 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule):
 
         try:
             created_zones = self.rgw_zone_create(zone_name, realm_token, port, placement,
-                                                 start_radosgw, zone_endpoints, inbuf)
+                                                 start_radosgw, zone_endpoints, self.secondary_zone_period_retry_limit, inbuf)
             return HandleCommandResult(retval=0, stdout=f"Zones {', '.join(created_zones)} created successfully")
         except RGWAMException as e:
             return HandleCommandResult(retval=e.retcode, stderr=f'Failed to create zone: {str(e)}')
@@ -312,6 +322,7 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule):
                         placement: Optional[Union[str, Dict[str, Any]]] = None,
                         start_radosgw: Optional[bool] = True,
                         zone_endpoints: Optional[str] = None,
+                        secondary_zone_period_retry_limit: Optional[int] = None,
                         inbuf: Optional[str] = None) -> List[str]:
 
         if inbuf:
@@ -338,7 +349,7 @@ class Module(orchestrator.OrchestratorClientMixin, MgrModule):
         try:
             created_zones = []
             for rgw_spec in rgw_specs:
-                RGWAM(self.env).zone_create(rgw_spec, start_radosgw)
+                RGWAM(self.env).zone_create(rgw_spec, start_radosgw, secondary_zone_period_retry_limit)
                 if rgw_spec.rgw_zone is not None:
                     created_zones.append(rgw_spec.rgw_zone)
                     return created_zones
index 64e7180ad57045ce93bbedf581a468f9583712e8..83e13119a93d8593f46d97a6bff8ddcfe5fe837c 100644 (file)
@@ -744,7 +744,7 @@ class RGWAM:
                                 "secret": secret})
         return realms_info
 
-    def zone_create(self, rgw_spec, start_radosgw):
+    def zone_create(self, rgw_spec, start_radosgw, secondary_zone_period_retry_limit=5):
 
         if not rgw_spec.rgw_realm_token:
             raise RGWAMException('missing realm token')
@@ -781,7 +781,25 @@ class RGWAM:
         zone = self.create_zone(realm, zonegroup, rgw_spec.rgw_zone,
                                 False,  # secondary zone
                                 access_key, secret, endpoints=rgw_spec.zone_endpoints)
-        self.update_period(realm, zonegroup, zone)
+
+        # Adding a retry limit for period update in case the default 10s timeout is not sufficient
+        rgw_limit = 0
+
+        while rgw_limit != int(secondary_zone_period_retry_limit):
+            try:
+                self.update_period(realm, zonegroup, zone)
+                break
+            except RGWAMException as e:
+                logging.info(f'Failed to update Period in 10s. Retrying with current limit \
+                             & retry-limit values {rgw_limit} {secondary_zone_period_retry_limit}')
+                rgw_limit += 1
+                if rgw_limit == secondary_zone_period_retry_limit:
+                    raise RGWAMException(f'Period Update failed for zone {zone}. \
+                                          Exception raised while period update {e.message}')
+                continue
+
+        # By default the above operation is expected to complete within the 10s timeout, but
+        # this update can take longer for a secondary site because of pool creation
 
         period = RGWPeriod(period_info)
         logging.debug(period.to_json())