From ed481ece74610a94a2cfc2280232e3daf82a0349 Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Tue, 13 Jul 2021 13:54:36 -0400 Subject: [PATCH] rgw: metadata sync treats all errors as 'transient' collect_children() had a special case for EAGAIN that it treated as a 'transient' error, which set the can_adjust_marker = false to bail out of RGWMetaSyncShardCR and retry from the previous marker but the http client doesn't return EAGAIN - rgw_http_error_to_errno() defaults to EIO - so this retry logic based on can_adjust_marker never runs. on any other error, RGWMetaSyncSingleEntryCR would not call marker_tracker->finish() to advance the sync status marker, and RGWMetaSyncShardCR would continue on with full- or incremental sync without ever attempting to retry the failed entries a detailed comment in collect_children() describes a different strategy for handling 'permanent' errors, but that was never fully elaborated. i also don't think there's a reasonable way to differentiate between transient and permanent errors, so this treats all errors as transient to be retried if an error really is permanent for a given metadata key, metadata sync will get stuck there and require manual intervention Fixes: https://tracker.ceph.com/issues/39657 Signed-off-by: Casey Bodley (cherry picked from commit 866d66b8749b28ec626a8d0adba3d14fdd8abead) --- src/rgw/rgw_sync.cc | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/src/rgw/rgw_sync.cc b/src/rgw/rgw_sync.cc index ac15956c6da99..32464e896e928 100644 --- a/src/rgw/rgw_sync.cc +++ b/src/rgw/rgw_sync.cc @@ -1297,7 +1297,7 @@ int RGWMetaSyncSingleEntryCR::operate(const DoutPrefixProvider *dpp) { break; } - if ((sync_status == -EAGAIN || sync_status == -ECANCELED) && (tries < NUM_TRANSIENT_ERROR_RETRIES - 1)) { + if (tries < NUM_TRANSIENT_ERROR_RETRIES - 1) { ldpp_dout(dpp, 20) << *this << ": failed to fetch remote metadata: " << section << ":" << key << ", will retry" << dendl; continue; } @@ -1322,7 +1322,7 @@ int RGWMetaSyncSingleEntryCR::operate(const DoutPrefixProvider *dpp) { tn->log(10, SSTR("removing local metadata entry")); yield call(new RGWMetaRemoveEntryCR(sync_env, raw_key)); } - if ((retcode == -EAGAIN || retcode == -ECANCELED) && (tries < NUM_TRANSIENT_ERROR_RETRIES - 1)) { + if (tries < NUM_TRANSIENT_ERROR_RETRIES - 1) { ldpp_dout(dpp, 20) << *this << ": failed to store metadata: " << section << ":" << key << ", got retcode=" << retcode << dendl; continue; } @@ -1514,20 +1514,17 @@ public: if (child_ret < 0) { ldpp_dout(sync_env->dpp, 0) << *this << ": child operation stack=" << child << " entry=" << pos << " returned " << child_ret << dendl; + // on any error code from RGWMetaSyncSingleEntryCR, we do not advance + // the sync status marker past this entry, and set + // can_adjust_marker=false to exit out of RGWMetaSyncShardCR. + // RGWMetaSyncShardControlCR will rerun RGWMetaSyncShardCR from the + // previous marker and retry + can_adjust_marker = false; } map::iterator prev_iter = pos_to_prev.find(pos); ceph_assert(prev_iter != pos_to_prev.end()); - /* - * we should get -EAGAIN for transient errors, for which we want to retry, so we don't - * update the marker and abort. We'll get called again for these. Permanent errors will be - * handled by marking the entry at the error log shard, so that we retry on it separately - */ - if (child_ret == -EAGAIN) { - can_adjust_marker = false; - } - if (pos_to_prev.size() == 1) { if (can_adjust_marker) { sync_marker.marker = pos; @@ -2236,7 +2233,7 @@ int RGWRemoteMetaLog::run_sync(const DoutPrefixProvider *dpp, optional_yield y) case rgw_meta_sync_info::StateBuildingFullSyncMaps: tn->log(20, "building full sync maps"); r = run(dpp, new RGWFetchAllMetaCR(&sync_env, num_shards, sync_status.sync_markers, tn)); - if (r == -EBUSY || r == -EAGAIN) { + if (r == -EBUSY || r == -EIO) { backoff.backoff_sleep(); continue; } -- 2.39.5