# Provenance of this test case:
#   From:    Oguzhan Ozmen
#   Date:    Mon, 24 Nov 2025 03:03:18 +0000 (+0000)
#   Subject: RGW/test_multi: testcase for rgw instance hang/crash during realm reload
#   Commit:  5e6a9c8f9de457406c3ef5f44bf2069c60bc6cd5 (ceph.git)
#   Tests:   https://tracker.ceph.com/issues/66100
#   Signed-off-by: Oguzhan Ozmen
#   Target:  src/test/rgw/rgw_multi/tests.py (added after test_object_lock_sync)
def test_period_update_commit():
    """Exercise period update/commit while client writes are in flight.

    Starts ``wkld_concurrency`` writer threads that keep re-uploading objects
    to a fresh bucket through the first read-write zone connection, issues
    ``number_of_period_updates`` ``period.update(..., commit=True)`` calls
    against ``zonegroup.zones[1]``, and after each one checks
    ``data_sync_making_progress``.  Finally the writers are stopped and a
    zonegroup data checkpoint is awaited.

    Cleanup always runs: writers are stopped and joined, the zones are
    started again if the test did not pass, and the objects and the bucket
    are deleted.

    Raises:
        Exception: any failure from the test body is logged and re-raised.
    """
    wkld_concurrency = 25
    num_objects_to_upload = 2500
    number_of_period_updates = 5
    test_passed = False

    zonegroup = realm.master_zonegroup()
    zonegroup_conns = ZonegroupConns(zonegroup)
    primary_zone_client_conn = zonegroup_conns.rw_zones[0]
    secondary_zone_cluster_conn = zonegroup.zones[1]

    bucket = primary_zone_client_conn.create_bucket(gen_bucket_name())
    log.info(f"created bucket={bucket.name}")

    def run_client_wkld(stop_event: threading.Event, key_range):
        """Repeatedly upload every key in ``key_range`` until ``stop_event`` is set.

        ``key_range`` is an inclusive ``(start, end)`` pair of object indices.
        Upload failures are logged at debug level and otherwise ignored, since
        the workload only exists to generate replication traffic.
        """
        log.info(f"upload objects within range {key_range} to bucket={bucket.name}")
        num_uploads = 0
        start, end = key_range
        while not stop_event.is_set():
            for i in range(start, end + 1):
                # Re-check per object so that a stop request does not have to
                # wait for the remainder of the pass (each request may be slow
                # if a gateway is unresponsive).
                if stop_event.is_set():
                    break
                try:
                    primary_zone_client_conn.s3_client.put_object(
                        Bucket=bucket.name, Key=f"obj-{i:04d}", Body="..."
                    )
                    num_uploads += 1
                except Exception as e:
                    log.debug(f"failed to upload object to bucket={bucket.name}: {e}")
        log.info(
            f"uploaded {num_uploads} times for the range {key_range} to bucket={bucket.name}"
        )

    # Bind these before entering the try block: the finally clause uses them
    # unconditionally, so they must exist even when the pre-flight health
    # check or the gateway restart raises.
    client_write_only_wkld_thread_stop = threading.Event()
    client_write_only_wkld_threads = []

    try:
        log.info("verify cluster is healthy before moving on")
        try:
            zonegroup_data_checkpoint(zonegroup_conns)
        except Exception:
            # Best effort: an unhealthy start is repaired rather than failed.
            # KeyboardInterrupt/SystemExit are deliberately not swallowed.
            log.info("restart gateways for a clean start")
            for z in zonegroup_conns.rw_zones:
                z.zone.start()

        log.info("start client write-only workload to generate replication traffic")
        # Each thread owns a disjoint, inclusive slice of the key space.
        # NOTE: integer division drops any remainder, so the two constants
        # above should stay evenly divisible.
        step = num_objects_to_upload // wkld_concurrency
        key_ranges = [(i * step, (i + 1) * step - 1) for i in range(wkld_concurrency)]
        for key_range in key_ranges:
            thread = threading.Thread(
                target=run_client_wkld,
                args=(client_write_only_wkld_thread_stop, key_range),
                daemon=False,
            )
            thread.start()
            # Only started threads are recorded, so every entry is joinable.
            client_write_only_wkld_threads.append(thread)

        log.info("run period-update-commits and verify rgw instances reload properly")
        for _ in range(number_of_period_updates):
            log.info("issue period update commit")
            zonegroup.period.update(secondary_zone_cluster_conn, commit=True)
            log.info("verify data sync is making progress")
            if not data_sync_making_progress(secondary_zone_cluster_conn):
                break  # the issue of realm reload freezing reproduced
        # NOTE(review): after the break above, the test still relies on the
        # checkpoint below raising to report the failure — confirm that
        # zonegroup_data_checkpoint cannot succeed once sync has stalled.
        client_write_only_wkld_thread_stop.set()  # stop client wkld
        zonegroup_data_checkpoint(zonegroup_conns)
        test_passed = True
    except Exception as e:
        log.error(f"test_period_update_commit failed: {e}")
        raise
    finally:
        client_write_only_wkld_thread_stop.set()
        for t in client_write_only_wkld_threads:
            t.join()

        # without a fix, an rgw instance may crash or freeze
        # so restart all instances before further cleaning up
        if not test_passed:
            for z in zonegroup_conns.rw_zones:
                z.zone.start()

        log.info(f"delete {num_objects_to_upload} objects from bucket={bucket.name}")
        for i in range(num_objects_to_upload):
            primary_zone_client_conn.s3_client.delete_object(
                Bucket=bucket.name,
                Key=f"obj-{i:04d}",
            )
        log.info(f"delete bucket={bucket.name}")
        primary_zone_client_conn.s3_client.delete_bucket(Bucket=bucket.name)