rgw: metadata sync treats all errors as 'transient'

author Casey Bodley <cbodley@redhat.com>

Tue, 13 Jul 2021 17:54:36 +0000 (13:54 -0400)

committer Cory Snyder <csnyder@iland.com>

Wed, 4 Aug 2021 16:38:25 +0000 (12:38 -0400)
author Casey Bodley <cbodley@redhat.com>
Tue, 13 Jul 2021 17:54:36 +0000 (13:54 -0400)
committer Cory Snyder <csnyder@iland.com>
Wed, 4 Aug 2021 16:38:25 +0000 (12:38 -0400)
diff --git a/src/rgw/rgw_sync.cc b/src/rgw/rgw_sync.cc

index ac15956c6da99806739152da66c57361313733d6..32464e896e9285ff893080f25542076d9c22c93c 100644 (file)
--- a/src/rgw/rgw_sync.cc
+++ b/src/rgw/rgw_sync.cc
@@ -1297,7 +1297,7 @@ int RGWMetaSyncSingleEntryCR::operate(const DoutPrefixProvider *dpp) {
          break;
        }
  
-      if ((sync_status == -EAGAIN || sync_status == -ECANCELED) && (tries < NUM_TRANSIENT_ERROR_RETRIES - 1)) {
+      if (tries < NUM_TRANSIENT_ERROR_RETRIES - 1) {
          ldpp_dout(dpp, 20) << *this << ": failed to fetch remote metadata: " << section << ":" << key << ", will retry" << dendl;
          continue;
        }
@@ -1322,7 +1322,7 @@ int RGWMetaSyncSingleEntryCR::operate(const DoutPrefixProvider *dpp) {
          tn->log(10, SSTR("removing local metadata entry"));
            yield call(new RGWMetaRemoveEntryCR(sync_env, raw_key));
        }
-      if ((retcode == -EAGAIN || retcode == -ECANCELED) && (tries < NUM_TRANSIENT_ERROR_RETRIES - 1)) {
+      if (tries < NUM_TRANSIENT_ERROR_RETRIES - 1) {
          ldpp_dout(dpp, 20) << *this << ": failed to store metadata: " << section << ":" << key << ", got retcode=" << retcode << dendl;
          continue;
        }
@@ -1514,20 +1514,17 @@ public:
  
        if (child_ret < 0) {
          ldpp_dout(sync_env->dpp, 0) << *this << ": child operation stack=" << child << " entry=" << pos << " returned " << child_ret << dendl;
+        // on any error code from RGWMetaSyncSingleEntryCR, we do not advance
+        // the sync status marker past this entry, and set
+        // can_adjust_marker=false to exit out of RGWMetaSyncShardCR.
+        // RGWMetaSyncShardControlCR will rerun RGWMetaSyncShardCR from the
+        // previous marker and retry
+        can_adjust_marker = false;
        }
  
        map<string, string>::iterator prev_iter = pos_to_prev.find(pos);
        ceph_assert(prev_iter != pos_to_prev.end());
  
-      /*
-       * we should get -EAGAIN for transient errors, for which we want to retry, so we don't
-       * update the marker and abort. We'll get called again for these. Permanent errors will be
-       * handled by marking the entry at the error log shard, so that we retry on it separately
-       */
-      if (child_ret == -EAGAIN) {
-        can_adjust_marker = false;
-      }
-
        if (pos_to_prev.size() == 1) {
          if (can_adjust_marker) {
            sync_marker.marker = pos;
@@ -2236,7 +2233,7 @@ int RGWRemoteMetaLog::run_sync(const DoutPrefixProvider *dpp, optional_yield y)
        case rgw_meta_sync_info::StateBuildingFullSyncMaps:
          tn->log(20, "building full sync maps");
          r = run(dpp, new RGWFetchAllMetaCR(&sync_env, num_shards, sync_status.sync_markers, tn));
-        if (r == -EBUSY || r == -EAGAIN) {
+        if (r == -EBUSY || r == -EIO) {
            backoff.backoff_sleep();
            continue;
          }
author	Casey Bodley <cbodley@redhat.com>
	Tue, 13 Jul 2021 17:54:36 +0000 (13:54 -0400)
committer	Cory Snyder <csnyder@iland.com>
	Wed, 4 Aug 2021 16:38:25 +0000 (12:38 -0400)