]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
rgw: prevent spurious/lost notifications in the index completion thread
authorYuval Lifshitz <ylifshit@redhat.com>
Wed, 23 Feb 2022 15:21:10 +0000 (17:21 +0200)
committerCasey Bodley <cbodley@redhat.com>
Fri, 27 May 2022 19:47:34 +0000 (15:47 -0400)
this was happening when asyn completions happened during reshard.
more information about testing:
https://gist.github.com/yuvalif/d526c0a3a4c5b245b9e951a6c5a10517

we also add more logs to the completion manager.
should allow finding unhandled completions due to reshards.

Signed-off-by: Yuval Lifshitz <ylifshit@redhat.com>
src/rgw/rgw_rados.cc

index 741faa3ba921947f86522fdadeb0e2de674fa445..9189bec7c22b0824f90fbefbba3d95867306aa7a 100644 (file)
@@ -948,12 +948,7 @@ void RGWIndexCompletionManager::process()
         continue;
       }
 
-      r = store->svc.datalog_rados->add_entry(&dpp, bucket_info,
-                                             bucket_info.layout.logs.back(),
-                                             bs.shard_id);
-      if (r < 0) {
-        ldpp_dout(&dpp, -1) << "ERROR: failed writing data log" << dendl;
-      }
+      add_datalog_entry(&dpp, store->svc.datalog_rados, bucket_info, bs.shard_id);
     }
   }
 }
@@ -6191,27 +6186,34 @@ int RGWRados::Bucket::UpdateIndex::guard_reshard(const DoutPrefixProvider *dpp,
   for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
     int ret = get_bucket_shard(&bs, dpp);
     if (ret < 0) {
-      ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl;
+      ldpp_dout(dpp, 0) << "ERROR: failed to get BucketShard object. obj=" << 
+        obj_instance.key << ". ret=" << ret << dendl;
       return ret;
     }
     r = call(bs);
     if (r != -ERR_BUSY_RESHARDING) {
       break;
     }
-    ldpp_dout(dpp, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
+    ldpp_dout(dpp, 0) << "NOTICE: resharding operation on bucket index detected, blocking. obj=" << 
+      obj_instance.key << dendl;
     r = store->block_while_resharding(bs, obj_instance, target->bucket_info, null_yield, dpp);
     if (r == -ERR_BUSY_RESHARDING) {
+      ldpp_dout(dpp, 0) << "ERROR: block_while_resharding() still busy. obj=" << dendl;
       continue;
     }
     if (r < 0) {
+      ldpp_dout(dpp, 0) << "ERROR: block_while_resharding() failed. obj=" << 
+        obj_instance.key << ". ret=" << r << dendl;
       return r;
     }
-    ldpp_dout(dpp, 20) << "reshard completion identified" << dendl;
+    ldpp_dout(dpp, 20) << "reshard completion identified. obj=" << obj_instance.key << dendl;
     i = 0; /* resharding is finished, make sure we can retry */
     invalidate_bs();
   } // for loop
 
   if (r < 0) {
+    ldpp_dout(dpp, 0) << "ERROR: bucket shard callback failed. obj=" << 
+      obj_instance.key << ". ret=" << r << dendl;
     return r;
   }