From: Casey Bodley Date: Tue, 13 Jul 2021 17:54:36 +0000 (-0400) Subject: rgw: metadata sync treats all errors as 'transient' X-Git-Tag: v17.1.0~1336^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=866d66b8749b28ec626a8d0adba3d14fdd8abead;p=ceph.git rgw: metadata sync treats all errors as 'transient' collect_children() had a special case for EAGAIN that it treated as a 'transient' error, which set the can_adjust_marker = false to bail out of RGWMetaSyncShardCR and retry from the previous marker but the http client doesn't return EAGAIN - rgw_http_error_to_errno() defaults to EIO - so this retry logic based on can_adjust_marker never runs. on any other error, RGWMetaSyncSingleEntryCR would not call marker_tracker->finish() to advance the sync status marker, and RGWMetaSyncShardCR would continue on with full- or incremental sync without ever attempting to retry the failed entries a detailed comment in collect_children() describes a different strategy for handling 'permanent' errors, but that was never fully elaborated. i also don't think there's a reasonable way to differentiate between transient and permanent errors, so this treats all errors as transient to be retried if an error really is permanent for a given metadata key, metadata sync will get stuck there and require manual intervention Fixes: https://tracker.ceph.com/issues/39657 Signed-off-by: Casey Bodley --- diff --git a/src/rgw/rgw_sync.cc b/src/rgw/rgw_sync.cc index 5c652e07e8e5..233a827ab326 100644 --- a/src/rgw/rgw_sync.cc +++ b/src/rgw/rgw_sync.cc @@ -1296,7 +1296,7 @@ int RGWMetaSyncSingleEntryCR::operate(const DoutPrefixProvider *dpp) { break; } - if ((sync_status == -EAGAIN || sync_status == -ECANCELED) && (tries < NUM_TRANSIENT_ERROR_RETRIES - 1)) { + if (tries < NUM_TRANSIENT_ERROR_RETRIES - 1) { ldpp_dout(dpp, 20) << *this << ": failed to fetch remote metadata: " << section << ":" << key << ", will retry" << dendl; continue; } @@ -1321,7 +1321,7 @@ int RGWMetaSyncSingleEntryCR::operate(const DoutPrefixProvider *dpp) { tn->log(10, SSTR("removing local metadata entry")); yield call(new RGWMetaRemoveEntryCR(sync_env, raw_key)); } - if ((retcode == -EAGAIN || retcode == -ECANCELED) && (tries < NUM_TRANSIENT_ERROR_RETRIES - 1)) { + if (tries < NUM_TRANSIENT_ERROR_RETRIES - 1) { ldpp_dout(dpp, 20) << *this << ": failed to store metadata: " << section << ":" << key << ", got retcode=" << retcode << dendl; continue; } @@ -1513,20 +1513,17 @@ public: if (child_ret < 0) { ldpp_dout(sync_env->dpp, 0) << *this << ": child operation stack=" << child << " entry=" << pos << " returned " << child_ret << dendl; + // on any error code from RGWMetaSyncSingleEntryCR, we do not advance + // the sync status marker past this entry, and set + // can_adjust_marker=false to exit out of RGWMetaSyncShardCR. + // RGWMetaSyncShardControlCR will rerun RGWMetaSyncShardCR from the + // previous marker and retry + can_adjust_marker = false; } map::iterator prev_iter = pos_to_prev.find(pos); ceph_assert(prev_iter != pos_to_prev.end()); - /* - * we should get -EAGAIN for transient errors, for which we want to retry, so we don't - * update the marker and abort. We'll get called again for these. Permanent errors will be - * handled by marking the entry at the error log shard, so that we retry on it separately - */ - if (child_ret == -EAGAIN) { - can_adjust_marker = false; - } - if (pos_to_prev.size() == 1) { if (can_adjust_marker) { sync_marker.marker = pos; @@ -2235,7 +2232,7 @@ int RGWRemoteMetaLog::run_sync(const DoutPrefixProvider *dpp, optional_yield y) case rgw_meta_sync_info::StateBuildingFullSyncMaps: tn->log(20, "building full sync maps"); r = run(dpp, new RGWFetchAllMetaCR(&sync_env, num_shards, sync_status.sync_markers, tn)); - if (r == -EBUSY || r == -EAGAIN) { + if (r == -EBUSY || r == -EIO) { backoff.backoff_sleep(); continue; }