From f166f40dab2f1bedbc073c1c4adb5dcce0b01bfd Mon Sep 17 00:00:00 2001 From: Soumya Koduri Date: Sat, 24 May 2025 01:55:30 +0530 Subject: [PATCH] rgw/cloud-restore: Handle failure with adding restore entry In case adding restore entry to FIFO fails, reset the `restore_status` of that object as "RestoreFailed" so that restore process can be retried from the end S3 user. Reviewed-by: Adam Emerson Reviewed-by: Jiffin Tony Thottan Signed-off-by: Soumya Koduri (cherry picked from commit 9974f51eb61603b8117d7b50e6b0b4614fcce721) --- src/rgw/rgw_op.cc | 4 ++-- src/rgw/rgw_restore.cc | 42 ++++++++++++++++++++++++++---------------- src/rgw/rgw_restore.h | 4 +++- 3 files changed, 31 insertions(+), 19 deletions(-) diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 1bd8824fa4583..871359f1b68ba 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -1080,7 +1080,7 @@ int handle_cloudtier_obj(req_state* s, const DoutPrefixProvider *dpp, rgw::sal:: } op_ret = driver->get_rgwrestore()->restore_obj_from_cloud(s->bucket.get(), - s->object.get(), tier.get(), days, y); + s->object.get(), tier.get(), days, dpp, y); if (op_ret < 0) { ldpp_dout(dpp, 0) << "Restore of object " << s->object->get_key() << " failed" << op_ret << dendl; @@ -1088,7 +1088,7 @@ int handle_cloudtier_obj(req_state* s, const DoutPrefixProvider *dpp, rgw::sal:: return op_ret; } - ldpp_dout(dpp, 20) << "Restore of object " << s->object->get_key() << " succeed" << dendl; + ldpp_dout(dpp, 20) << "Restore of object " << s->object->get_key() << " initiated" << dendl; /* Even if restore is complete the first read through request will return * but actually downloaded object asyncronously. */ diff --git a/src/rgw/rgw_restore.cc b/src/rgw/rgw_restore.cc index fb734b33010fb..842cbd178b3ad 100644 --- a/src/rgw/rgw_restore.cc +++ b/src/rgw/rgw_restore.cc @@ -193,7 +193,7 @@ void *RGWRestore::RestoreWorker::entry() { int r = 0; r = restore->process(this, null_yield); if (r < 0) { - ldpp_dout(dpp, 0) << "ERROR: restore process() returned error r=" << r << dendl; + ldpp_dout(dpp, -1) << "ERROR: restore process() returned error r=" << r << dendl; } if (restore->going_down()) break; @@ -316,14 +316,14 @@ int RGWRestore::process(int index, int max_secs, optional_yield y) } ret = sal_restore->trim_entries(this, y, index, marker); if (ret < 0) { - ldpp_dout(this, 0) << "RGWRestore::process() failed to trim entries on " + ldpp_dout(this, -1) << "ERROR: RGWRestore::process() failed to trim entries on " << obj_names[index] << dendl; } if (!r_entries.empty()) { ret = sal_restore->add_entries(this, y, index, r_entries); if (ret < 0) { - ldpp_dout(this, 0) << "RGWRestore::process() failed to add entries on " + ldpp_dout(this, -1) << "ERROR: RGWRestore::process() failed to add entries on " << obj_names[index] << dendl; } } @@ -358,7 +358,7 @@ int RGWRestore::process_restore_entry(RGWRestoreEntry& entry, optional_yield y) // bucket, obj, days, state=in_progress ret = driver->load_bucket(this, entry.bucket, &bucket, null_yield); if (ret < 0) { - ldpp_dout(this, 0) << "Restore:get_bucket for " << bucket->get_name() + ldpp_dout(this, -1) << "ERROR: Restore:get_bucket for " << bucket->get_name() << " failed" << dendl; return ret; } @@ -367,7 +367,7 @@ int RGWRestore::process_restore_entry(RGWRestoreEntry& entry, optional_yield y) ret = obj->load_obj_state(this, null_yield, true); if (ret < 0) { - ldpp_dout(this, 0) << "Restore:get_object for " << entry.obj_key + ldpp_dout(this, 0) << "ERROR: Restore:get_object for " << entry.obj_key << " failed" << dendl; return ret; } @@ -385,7 +385,7 @@ int RGWRestore::process_restore_entry(RGWRestoreEntry& entry, optional_yield y) } if (restore_status == rgw::sal::RGWRestoreStatus::CloudRestored) { // XXX: Check if expiry-date needs to be update - ldpp_dout(this, 20) << "Restore of object " << obj->get_key() << " already done" << dendl; + ldpp_dout(this, 5) << "Restore of object " << obj->get_key() << " already done" << dendl; entry.status = rgw::sal::RGWRestoreStatus::CloudRestored; return 0; } @@ -397,7 +397,7 @@ int RGWRestore::process_restore_entry(RGWRestoreEntry& entry, optional_yield y) ret = driver->get_zone()->get_zonegroup().get_placement_tier(target_placement, &tier); if (ret < 0) { - ldpp_dout(this, -1) << "failed to fetch tier placement handle, ret = " << ret << dendl; + ldpp_dout(this, -1) << "ERROR: failed to fetch tier placement handle, ret = " << ret << dendl; return ret; } else { ldpp_dout(this, 20) << "getting tier placement handle cloud tier for " << @@ -426,10 +426,10 @@ int RGWRestore::process_restore_entry(RGWRestoreEntry& entry, optional_yield y) } if (in_progress) { - ldpp_dout(this, 20) << "Restore of object " << obj->get_key() << " still in progress" << dendl; + ldpp_dout(this, 15) << "Restore of object " << obj->get_key() << " is still in progress" << dendl; entry.status = rgw::sal::RGWRestoreStatus::RestoreAlreadyInProgress; } else { - ldpp_dout(this, 20) << "Restore of object " << obj->get_key() << " succeeded" << dendl; + ldpp_dout(this, 15) << "Restore of object " << obj->get_key() << " succeeded" << dendl; entry.status = rgw::sal::RGWRestoreStatus::RestoreFailed; } return ret; @@ -466,7 +466,9 @@ int RGWRestore::set_cloud_restore_status(const DoutPrefixProvider* dpp, int RGWRestore::restore_obj_from_cloud(rgw::sal::Bucket* pbucket, rgw::sal::Object* pobj, rgw::sal::PlacementTier* tier, - std::optional days, optional_yield y) + std::optional days, + const DoutPrefixProvider* dpp, + optional_yield y) { int ret = 0; @@ -484,20 +486,19 @@ int RGWRestore::restore_obj_from_cloud(rgw::sal::Bucket* pbucket, // now go ahead with restoring object bool in_progress = false; - ret = pobj->restore_obj_from_cloud(pbucket, tier, cct, days, in_progress, this, y); + ret = pobj->restore_obj_from_cloud(pbucket, tier, cct, days, in_progress, dpp, y); if (ret < 0) { - ldpp_dout(this, 0) << "object " << pobj->get_key() << " fetching failed" << ret << dendl; + ldpp_dout(this, 0) << "ERROR: object " << pobj->get_key() << " fetching failed" << ret << dendl; auto reset_ret = set_cloud_restore_status(this, pobj, y, rgw::sal::RGWRestoreStatus::RestoreFailed); if (reset_ret < 0) { - ldpp_dout(this, -1) << "Setting restore status ad RestoreFailed failed for object(" << pobj->get_key() << ") " << reset_ret << dendl; + ldpp_dout(this, -1) << "Setting restore status to RestoreFailed failed for object(" << pobj->get_key() << ") " << reset_ret << dendl; } return ret; } - ldpp_dout(this, 20) << "Restore of object " << pobj->get_key() << " succeeded" << dendl; if (in_progress) { // add restore entry to the list RGWRestoreEntry entry; @@ -507,14 +508,23 @@ int RGWRestore::restore_obj_from_cloud(rgw::sal::Bucket* pbucket, entry.days = days; entry.zone_id = driver->get_zone()->get_id(); + ldpp_dout(this, 10) << "RGWRestore:: Adding restore entry of object(" << pobj->get_key() << ") entry: " << entry << dendl; + int index = choose_oid(entry); ret = sal_restore->add_entry(this, y, index, entry); if (ret < 0) { - ldpp_dout(this, -1) << "Adding restore entry of object(" << pobj->get_key() << ") failed" << ret << dendl; + ldpp_dout(this, -1) << "ERROR: Adding restore entry of object(" << pobj->get_key() << ") failed" << ret << dendl; + + auto reset_ret = set_cloud_restore_status(this, pobj, y, rgw::sal::RGWRestoreStatus::RestoreFailed); + if (reset_ret < 0) { + ldpp_dout(this, -1) << "Setting restore status as RestoreFailed failed for object(" << pobj->get_key() << ") " << reset_ret << dendl; + } + + return ret; } } - ldpp_dout(this, 20) << "Restore of object " << pobj->get_key() << " succeeded" << dendl; + ldpp_dout(this, 10) << "Restore of object " << pobj->get_key() << (in_progress ? " is in progress" : " succeeded") << dendl; return ret; } diff --git a/src/rgw/rgw_restore.h b/src/rgw/rgw_restore.h index 662745d240bd1..d18366cb78730 100644 --- a/src/rgw/rgw_restore.h +++ b/src/rgw/rgw_restore.h @@ -136,5 +136,7 @@ public: * to be procesed later by RestoreWorker thread. */ int restore_obj_from_cloud(rgw::sal::Bucket* pbucket, rgw::sal::Object* pobj, rgw::sal::PlacementTier* tier, - std::optional days, optional_yield y); + std::optional days, + const DoutPrefixProvider* dpp, + optional_yield y); }; -- 2.39.5