From b729e6288f1e914f3fa457916493f257e82b901f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 13 Feb 2017 18:02:39 -0500 Subject: [PATCH] osd: fix backoff vs reset race In OSD::ms_handle_reset, we clear session->con before removing any backoffs. That means we have to check if con has been cleared after any call to have_backoff, lest we race with ms_handle_reset and it removes the backoffs but we don't realize our client session is disconnected. Introduce a helper to do both these checks in a safe way, simplifying callers while we're at it. Signed-off-by: Sage Weil --- src/osd/OSD.cc | 2 +- src/osd/PrimaryLogPG.cc | 12 +++--------- src/osd/Session.cc | 20 ++++++++++++++++++++ src/osd/Session.h | 3 +++ 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 77f8630f213..33b8b759824 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -4946,7 +4946,7 @@ bool OSD::ms_handle_reset(Connection *con) session->con.reset(NULL); // break con <-> session ref cycle // note that we break session->con *before* the session_handle_reset // cleanup below. this avoids a race between us and - // PG::add_backoff, split_backoff, etc. + // PG::add_backoff, Session::check_backoff, etc. session_handle_reset(session); session->put(); return true; diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 5ca2a8eced2..64711f2091e 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -1621,11 +1621,8 @@ void PrimaryLogPG::do_request( session->put(); // get_priv takes a ref, and so does the SessionRef if (op->get_req()->get_type() == CEPH_MSG_OSD_OP) { - BackoffRef b(session->have_backoff(info.pgid, - info.pgid.pgid.get_hobj_start())); - if (b) { - dout(10) << " have backoff " << *b << " " << *m << dendl; - assert(!b->is_acked() || !g_conf->osd_debug_crash_on_ignored_backoff); + if (session->check_backoff(cct, info.pgid, + info.pgid.pgid.get_hobj_start(), m)) { return; } @@ -1784,10 +1781,7 @@ void PrimaryLogPG::do_op(OpRequestRef& op) } session->put(); // get_priv() takes a ref, and so does the intrusive_ptr - BackoffRef b(session->have_backoff(info.pgid, head)); - if (b) { - dout(10) << __func__ << " have backoff " << *b << " " << *m << dendl; - assert(!b->is_acked() || !g_conf->osd_debug_crash_on_ignored_backoff); + if (session->check_backoff(cct, info.pgid, head, m)) { return; } } diff --git a/src/osd/Session.cc b/src/osd/Session.cc index abe1a0c9eba..2de44271344 100644 --- a/src/osd/Session.cc +++ b/src/osd/Session.cc @@ -81,3 +81,23 @@ void Session::ack_backoff( } assert(!backoff_count == backoffs.empty()); } + +bool Session::check_backoff( + CephContext *cct, spg_t pgid, const hobject_t& oid, Message *m) +{ + BackoffRef b(have_backoff(pgid, oid)); + if (b) { + dout(10) << __func__ << " session " << this << " has backoff " << *b + << " for " << *m << dendl; + assert(!b->is_acked() || !g_conf->osd_debug_crash_on_ignored_backoff); + return true; + } + // we may race with ms_handle_reset. it clears session->con before removing + // backoffs, so if we see con is cleared here we have to abort this + // request. + if (!con) { + dout(10) << __func__ << " session " << this << " disconnected" << dendl; + return true; + } + return false; +} diff --git a/src/osd/Session.h b/src/osd/Session.h index 91648e60b73..257bc34a155 100644 --- a/src/osd/Session.h +++ b/src/osd/Session.h @@ -194,6 +194,9 @@ struct Session : public RefCountedObject { return nullptr; } + bool check_backoff( + CephContext *cct, spg_t pgid, const hobject_t& oid, Message *m); + void add_backoff(BackoffRef b) { Mutex::Locker l(backoff_lock); assert(!backoff_count == backoffs.empty()); -- 2.47.3