From 930ab185c7876ed5193561b8183d09ac6e9f257d Mon Sep 17 00:00:00 2001 From: "Adam C. Emerson" Date: Wed, 14 Jul 2021 11:02:21 -0400 Subject: [PATCH] rgw: Robust notify invalidates on cache timeout This avoids a potential race condition in which updates are delayed. Fixes: https://tracker.ceph.com/issues/51674 Signed-off-by: Adam C. Emerson (cherry picked from commit 76247990ff38049ee32dd47d31482b9648353673) Conflicts: src/rgw/services/svc_notify.cc - Skip the renaming, since this is a backport and that's mostly a matter of futureproofing. Backport: https://tracker.ceph.com/issues/51679 Signed-off-by: Adam C. Emerson --- src/rgw/services/svc_notify.cc | 107 +++++++-------------------------- src/rgw/services/svc_notify.h | 2 +- 2 files changed, 24 insertions(+), 85 deletions(-) diff --git a/src/rgw/services/svc_notify.cc b/src/rgw/services/svc_notify.cc index c912e0e6a512a..c634934e48440 100644 --- a/src/rgw/services/svc_notify.cc +++ b/src/rgw/services/svc_notify.cc @@ -392,98 +392,37 @@ int RGWSI_Notify::robust_notify(const DoutPrefixProvider *dpp, const RGWCacheNotifyInfo& cni, optional_yield y) { - // The reply of every machine that acks goes in here. - boost::container::flat_set> acks; - bufferlist bl, rbl; + bufferlist bl; encode(cni, bl); // First, try to send, without being fancy about it. - auto r = notify_obj.notify(dpp, bl, 0, &rbl, y); + auto r = notify_obj.notify(dpp, bl, 0, nullptr, y); - // If that doesn't work, get serious. if (r < 0) { - ldpp_dout(dpp, 1) << "robust_notify: If at first you don't succeed: " + ldpp_dout(dpp, 1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " Notify failed on object " << cni.obj << ": " << cpp_strerror(-r) << dendl; + } - - auto p = rbl.cbegin(); - // Gather up the replies to the first attempt. - try { - uint32_t num_acks; - decode(num_acks, p); - // Doing this ourselves since we don't care about the payload; - for (auto i = 0u; i < num_acks; ++i) { - std::pair id; - decode(id, p); - acks.insert(id); - ldpp_dout(dpp, 20) << "robust_notify: acked by " << id << dendl; - uint32_t blen; - decode(blen, p); - p += blen; - } - } catch (const buffer::error& e) { - ldpp_dout(dpp, 0) << "robust_notify: notify response parse failed: " - << e.what() << dendl; - acks.clear(); // Throw away junk on failed parse. - } - - - // Every machine that fails to reply and hasn't acked a previous - // attempt goes in here. - boost::container::flat_set> timeouts; - - auto tries = 1u; - while (r < 0 && tries < max_notify_retries) { - ++tries; - rbl.clear(); - // Reset the timeouts, we're only concerned with new ones. - timeouts.clear(); - r = notify_obj.notify(dpp, bl, 0, &rbl, y); + // If we timed out, get serious. + if (r == -ETIMEDOUT) { + RGWCacheNotifyInfo info; + info.op = REMOVE_OBJ; + info.obj = cni.obj; + bufferlist retrybl; + encode(info, retrybl); + + for (auto tries = 0u; + r == -ETIMEDOUT && tries < max_notify_retries; + ++tries) { + ldpp_dout(dpp, 1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " Invalidating obj=" << info.obj << " tries=" + << tries << dendl; + r = notify_obj.notify(dpp, bl, 0, nullptr, y); if (r < 0) { - ldpp_dout(dpp, 1) << "robust_notify: retry " << tries << " failed: " - << cpp_strerror(-r) << dendl; - p = rbl.begin(); - try { - uint32_t num_acks; - decode(num_acks, p); - // Not only do we not care about the payload, but we don't - // want to empty the container; we just want to augment it - // with any new members. - for (auto i = 0u; i < num_acks; ++i) { - std::pair id; - decode(id, p); - auto ir = acks.insert(id); - if (ir.second) { - ldpp_dout(dpp, 20) << "robust_notify: acked by " << id << dendl; - } - uint32_t blen; - decode(blen, p); - p += blen; - } - - uint32_t num_timeouts; - decode(num_timeouts, p); - for (auto i = 0u; i < num_timeouts; ++i) { - std::pair id; - decode(id, p); - // Only track timeouts from hosts that haven't acked previously. - if (acks.find(id) != acks.cend()) { - ldpp_dout(dpp, 20) << "robust_notify: " << id << " timed out." - << dendl; - timeouts.insert(id); - } - } - } catch (const buffer::error& e) { - ldpp_dout(dpp, 0) << "robust_notify: notify response parse failed: " - << e.what() << dendl; - continue; - } - // If we got a good parse and timeouts is empty, that means - // everyone who timed out in one call received the update in a - // previous one. - if (timeouts.empty()) { - r = 0; - } + ldpp_dout(dpp, 1) << __PRETTY_FUNCTION__ << ":" << __LINE__ + << " invalidation attempt " << tries << " failed: " + << cpp_strerror(-r) << dendl; } } } diff --git a/src/rgw/services/svc_notify.h b/src/rgw/services/svc_notify.h index e269677a361f9..d4e3e2f510210 100644 --- a/src/rgw/services/svc_notify.h +++ b/src/rgw/services/svc_notify.h @@ -42,7 +42,7 @@ private: bool enabled{false}; double inject_notify_timeout_probability{0}; - unsigned max_notify_retries{0}; + static constexpr unsigned max_notify_retries = 10; string get_control_oid(int i); RGWSI_RADOS::Obj pick_control_obj(const string& key); -- 2.39.5