]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
crimson/osd: don't assume a pull must happen if there is no push. 44223/head
authorRadoslaw Zarzynski <rzarzyns@redhat.com>
Mon, 6 Dec 2021 16:54:41 +0000 (16:54 +0000)
committerRadoslaw Zarzynski <rzarzyns@redhat.com>
Mon, 6 Dec 2021 17:34:20 +0000 (17:34 +0000)
In the classical OSD the `ReplicatedRecoveryBackend::recover_object()`
divides into two main flows: pull and push:

```cpp
int ReplicatedBackend::recover_object(
  const hobject_t &hoid,
  // ...
  )
{
  dout(10) << __func__ << ": " << hoid << dendl;
  RPGHandle *h = static_cast<RPGHandle *>(_h);
  if (get_parent()->get_local_missing().is_missing(hoid)) {
    ceph_assert(!obc);
    // pull
    prepare_pull(
      v,
      hoid,
      head,
      h);
  } else {
    ceph_assert(obc);
    int started = start_pushes(
      hoid,
      obc,
      h);
    // ...
  }
  return 0;
}
```

Pulls may also enter the push path (`C_ReplicatedBackend_OnPullComplete`)
but push handling doesn't draw any assumption on that. What's important,
`recover_object()` may result in no pulls and pushes.

This isn't the case of crimson as its implementation of the push path
asserts that, if no push is scheduled, `PullInfo` must be allocated.
This patch reworks this logic to reflects the classical one and to avoid
crashes like the following one:

```
DEBUG 2021-12-01 18:43:00,220 [shard 0] osd - recover_object: loaded obc: 3:4e058a2e:::smithi13839607-45:head
WARN  2021-12-01 18:43:00,220 [shard 0] none - intrusive_ptr_add_ref(p=0x6190000d7f80, use_count=3)
WARN  2021-12-01 18:43:00,220 [shard 0] none - intrusive_ptr_release(p=0x6190000d7f80, use_count=4)
TRACE 2021-12-01 18:43:00,220 [shard 0] osd - call_with_interruption_impl clearing interrupt_cond: 0x60300012b210,N7crimson3osd20IOInterruptConditionE
TRACE 2021-12-01 18:43:00,220 [shard 0] osd - call_with_interruption_impl: may_interrupt: false, local interrupt_condintion: 0x60300012b210, global interrupt_cond: 0x0,N7crimson3osd20IOInterruptConditionE
TRACE 2021-12-01 18:43:00,220 [shard 0] osd - set: interrupt_cond: 0x60300012b210, ref_count: 1
ceph-osd: /home/jenkins-build/build/workspace/ceph-dev-new-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.0.0-8902-g52fd47fe/rpm/el8/BUILD/ceph-17.0.
0-8902-g52fd47fe/src/crimson/osd/replicated_recovery_backend.cc:84: ReplicatedRecoveryBackend::maybe_push_shards(const hobject_t&, eversion_t)::<lambda()>: Assertion `recovery.pi' failed.
Aborting on shard 0.
```

Signed-off-by: Radoslaw Zarzynski <rzarzyns@redhat.com>
src/crimson/osd/replicated_recovery_backend.cc

index 19cf5d2ef27a377f1f1a35fe2a324f2aabfd690e..2347ae260fd6ffba1769b557b9e4df06a45414e5 100644 (file)
@@ -75,16 +75,19 @@ ReplicatedRecoveryBackend::maybe_push_shards(
     });
   }).then_interruptible([this, soid] {
     auto &recovery = recovering.at(soid);
-    auto push_info = recovery.pushing.begin();
-    object_stat_sum_t stat = {};
-    if (push_info != recovery.pushing.end()) {
-      stat = push_info->second.stat;
+    if (auto push_info = recovery.pushing.begin();
+        push_info != recovery.pushing.end()) {
+      pg.get_recovery_handler()->on_global_recover(soid,
+                                                   push_info->second.stat,
+                                                   false);
+    } else if (recovery.pi) {
+      // no push happened (empty get_shards_to_push()) but pull actually did
+      pg.get_recovery_handler()->on_global_recover(soid,
+                                                   recovery.pi->stat,
+                                                   false);
     } else {
-      // no push happened, take pull_info's stat
-      assert(recovery.pi);
-      stat = recovery.pi->stat;
+      // no pulls, no pushes
     }
-    pg.get_recovery_handler()->on_global_recover(soid, stat, false);
     return seastar::make_ready_future<>();
   }).handle_exception_interruptible([this, soid](auto e) {
     auto &recovery = recovering.at(soid);