From 5825a1340374bb41491e0814782bd9f2f6a4b8aa Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 13 Feb 2017 09:13:04 -0500 Subject: [PATCH] osd: manage backoffs per-pg; drop special split logic Switch backoffs to be owned by a specific spg_t. Instead of wonky split logic, just clear them. This is mostly just for convenience; we could conceivably only clear the range belonging to children (just to stay tidy--we'll never get a request in that range) but why bother. The full pg backoffs are still defined by the range for the pg, although it's a bit redundant--we could just as easily do [min,max). This way we get readable hobject ranges in the messages that go by without having to map to/from pgids. Add Session::add_backoff() helper to keep Session internals out of PG.h. Signed-off-by: Sage Weil --- src/osd/OSD.cc | 36 ++--------------- src/osd/PG.cc | 82 ++----------------------------------- src/osd/PG.h | 1 - src/osd/PrimaryLogPG.cc | 7 ++-- src/osd/Session.cc | 89 ++++++++++++++++++++++------------------- src/osd/Session.h | 77 ++++++++++++++++++++++------------- 6 files changed, 108 insertions(+), 184 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index fad41ca2868..77f8630f213 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -8867,38 +8867,10 @@ void OSD::handle_backoff(OpRequestRef& op, OSDMapRef& osdmap) } // map hobject range to PG(s) - bool queued = false; - hobject_t pos = m->begin; - do { - pg_t _pgid(pos.get_hash(), pos.pool); - if (osdmap->have_pg_pool(pos.pool)) { - _pgid = osdmap->raw_pg_to_pg(_pgid); - } - if (!osdmap->have_pg_pool(_pgid.pool())) { - // missing pool -- drop - return; - } - spg_t pgid; - if (osdmap->get_primary_shard(_pgid, &pgid)) { - dout(10) << __func__ << " pos " << pos << " pgid " << pgid << dendl; - PGRef pg = get_pg_or_queue_for_pg(pgid, op, s); - if (pg) { - if (!queued) { - enqueue_op(pg, op); - queued = true; - } else { - // use a fresh OpRequest - m->get(); // take a ref for the new OpRequest - OpRequestRef newop(op_tracker.create_request(m)); - newop->mark_event("duplicated original op for another pg"); - enqueue_op(pg, newop); - } - } - } - // advance - pos = _pgid.get_hobj_end(osdmap->get_pg_pool(pos.pool)->get_pg_num()); - dout(20) << __func__ << " next pg " << pos << dendl; - } while (pos < m->end); + PGRef pg = get_pg_or_queue_for_pg(m->pgid, op, s); + if (pg) { + enqueue_op(pg, op); + } } template diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 8cab55fb7b4..c6fadd997c9 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -2285,11 +2285,8 @@ void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits) _split_into(child_pgid, child, split_bits); - // release all backoffs so that Objecter doesn't need to handle unblock - // on split backoffs + // release all backoffs for simplicity release_backoffs(hobject_t(), hobject_t::get_max()); - // split any remaining (deleting) backoffs among child PGs - split_backoffs(child, split_bits); child->on_new_interval(); @@ -2304,7 +2301,7 @@ void PG::add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end) ConnectionRef con = s->con; if (!con) // OSD::ms_handle_reset clears s->con without a lock return; - Backoff *b = s->have_backoff(begin); + Backoff *b = s->have_backoff(info.pgid, begin); if (b) { derr << __func__ << " already have backoff for " << s << " begin " << begin << " " << *b << dendl; @@ -2312,15 +2309,9 @@ void PG::add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end) } Mutex::Locker l(backoff_lock); { - Mutex::Locker l(s->backoff_lock); - b = new Backoff(this, s, ++s->backoff_seq, begin, end); - auto& ls = s->backoffs[begin]; - if (ls.empty()) { - ++s->backoff_count; - } - assert(s->backoff_count == (int)s->backoffs.size()); - ls.insert(b); + b = new Backoff(info.pgid, this, s, ++s->backoff_seq, begin, end); backoffs[begin].insert(b); + s->add_backoff(b); dout(10) << __func__ << " session " << s << " added " << *b << dendl; } con->send_message( @@ -2395,71 +2386,6 @@ void PG::release_backoffs(const hobject_t& begin, const hobject_t& end) } } -void PG::split_backoffs(PG *child, unsigned split_bits) -{ - dout(10) << __func__ << " split_bits " << split_bits << " child " - << child->info.pgid.pgid << dendl; - unsigned mask = ~((~0)< backoffs_to_dup; // pg backoffs - vector backoffs_to_move; // oid backoffs - { - Mutex::Locker l(backoff_lock); - auto p = backoffs.begin(); - while (p != backoffs.end()) { - if (p->first == info.pgid.pgid.get_hobj_start()) { - // if it is a full PG we always dup it for the child. - for (auto& q : p->second) { - dout(10) << __func__ << " pg backoff " << p->first - << " " << q << dendl; - backoffs_to_dup.push_back(q); - } - } else { - // otherwise, we move it to the child only if falls into the - // childs hash range. - if ((p->first.get_hash() & mask) == child->info.pgid.pgid.ps()) { - for (auto& q : p->second) { - dout(20) << __func__ << " will move " << p->first - << " " << q << dendl; - backoffs_to_move.push_back(q); - } - p = backoffs.erase(p); - continue; - } else { - dout(20) << __func__ << " will not move " << p->first - << " " << p->second << dendl; - } - } - ++p; - } - } - for (auto b : backoffs_to_dup) { - SessionRef s; - { - Mutex::Locker l(b->lock); - b->end = info.pgid.pgid.get_hobj_end(split_bits); - dout(10) << __func__ << " pg backoff " << *b << dendl; - s = b->session; - } - if (s) { - child->add_pg_backoff(b->session); - } else { - dout(20) << __func__ << " didn't dup pg backoff, session is null" - << dendl; - } - } - for (auto b : backoffs_to_move) { - Mutex::Locker l(b->lock); - if (b->pg == this) { - dout(10) << __func__ << " move backoff " << *b << " to child" << dendl; - b->pg = child; - child->backoffs[b->begin].insert(b); - } else { - dout(20) << __func__ << " move backoff " << *b << " nowhere... pg is null" - << dendl; - } - } -} - void PG::clear_backoffs() { dout(10) << __func__ << " " << dendl; diff --git a/src/osd/PG.h b/src/osd/PG.h index e4531e56fa7..1944bbc1bcc 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -1074,7 +1074,6 @@ public: release_backoffs(o, o); } void clear_backoffs(); - void split_backoffs(PG *child, unsigned split_bits); void add_pg_backoff(SessionRef s) { hobject_t begin = info.pgid.pgid.get_hobj_start(); diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 56954dba83b..09d85098ea0 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -1600,7 +1600,7 @@ void PrimaryLogPG::handle_backoff(OpRequestRef& op) } dout(10) << __func__ << " backoff ack id " << m->id << " [" << begin << "," << end << ")" << dendl; - session->ack_backoff(cct, m->id, begin, end); + session->ack_backoff(cct, m->pgid, m->id, begin, end); } void PrimaryLogPG::do_request( @@ -1621,7 +1621,8 @@ void PrimaryLogPG::do_request( session->put(); // get_priv takes a ref, and so does the SessionRef if (op->get_req()->get_type() == CEPH_MSG_OSD_OP) { - Backoff *b = session->have_backoff(info.pgid.pgid.get_hobj_start()); + Backoff *b = session->have_backoff(info.pgid, + info.pgid.pgid.get_hobj_start()); if (b) { dout(10) << " have backoff " << *b << " " << *m << dendl; assert(!b->is_acked() || !g_conf->osd_debug_crash_on_ignored_backoff); @@ -1783,7 +1784,7 @@ void PrimaryLogPG::do_op(OpRequestRef& op) } session->put(); // get_priv() takes a ref, and so does the intrusive_ptr - Backoff *b = session->have_backoff(head); + Backoff *b = session->have_backoff(info.pgid, head); if (b) { dout(10) << __func__ << " have backoff " << *b << " " << *m << dendl; assert(!b->is_acked() || !g_conf->osd_debug_crash_on_ignored_backoff); diff --git a/src/osd/Session.cc b/src/osd/Session.cc index 125bd1f1579..abe1a0c9eba 100644 --- a/src/osd/Session.cc +++ b/src/osd/Session.cc @@ -11,25 +11,27 @@ void Session::clear_backoffs() { - map> ls; + map>> ls; { Mutex::Locker l(backoff_lock); ls.swap(backoffs); backoff_count = 0; } - for (auto& p : ls) { - for (auto& b : p.second) { - Mutex::Locker l(b->lock); - if (b->pg) { - assert(b->session == this); - assert(b->is_new() || b->is_acked()); - b->pg->rm_backoff(b); - b->pg.reset(); - b->session.reset(); - } else if (b->session) { - assert(b->session == this); - assert(b->is_deleting()); - b->session.reset(); + for (auto& i : ls) { + for (auto& p : i.second) { + for (auto& b : p.second) { + Mutex::Locker l(b->lock); + if (b->pg) { + assert(b->session == this); + assert(b->is_new() || b->is_acked()); + b->pg->rm_backoff(b); + b->pg.reset(); + b->session.reset(); + } else if (b->session) { + assert(b->session == this); + assert(b->is_deleting()); + b->session.reset(); + } } } } @@ -37,42 +39,45 @@ void Session::clear_backoffs() void Session::ack_backoff( CephContext *cct, + spg_t pgid, uint64_t id, const hobject_t& begin, const hobject_t& end) { Mutex::Locker l(backoff_lock); - // NOTE that ack may be for [a,c] but osd may now have [a,b) and - // [b,c) due to a PG split. - auto p = backoffs.lower_bound(begin); - while (p != backoffs.end()) { - // note: must still examine begin=end=p->first case - int r = cmp(p->first, end); - if (r > 0 || (r == 0 && begin < end)) { - break; - } - auto q = p->second.begin(); - while (q != p->second.end()) { - Backoff *b = (*q).get(); - if (b->id == id) { - if (b->is_new()) { - b->state = Backoff::STATE_ACKED; - dout(20) << __func__ << " now " << *b << dendl; - } else if (b->is_deleting()) { - dout(20) << __func__ << " deleting " << *b << dendl; - q = p->second.erase(q); - continue; - } + auto p = backoffs.find(pgid); + if (p == backoffs.end()) { + dout(20) << __func__ << " " << pgid << " " << id << " [" << begin << "," + << end << ") pg not found" << dendl; + return; + } + auto q = p->second.find(begin); + if (q == p->second.end()) { + dout(20) << __func__ << " " << pgid << " " << id << " [" << begin << "," + << end << ") begin not found" << dendl; + return; + } + for (auto i = q->second.begin(); i != q->second.end(); ++i) { + Backoff *b = (*i).get(); + if (b->id == id) { + if (b->is_new()) { + b->state = Backoff::STATE_ACKED; + dout(20) << __func__ << " now " << *b << dendl; + } else if (b->is_deleting()) { + dout(20) << __func__ << " deleting " << *b << dendl; + q->second.erase(i); + --backoff_count; } - ++q; + break; } + } + if (q->second.empty()) { + dout(20) << __func__ << " clearing begin bin " << q->first << dendl; + p->second.erase(q); if (p->second.empty()) { - dout(20) << __func__ << " clearing bin " << p->first << dendl; - p = backoffs.erase(p); - --backoff_count; - } else { - ++p; + dout(20) << __func__ << " clearing pg bin " << p->first << dendl; + backoffs.erase(p); } } - assert(backoff_count == (int)backoffs.size()); + assert(!backoff_count == backoffs.empty()); } diff --git a/src/osd/Session.h b/src/osd/Session.h index 6492c4bd4e4..d29448550b2 100644 --- a/src/osd/Session.h +++ b/src/osd/Session.h @@ -67,6 +67,7 @@ struct Backoff : public RefCountedObject { STATE_DELETING = 3 ///< backoff deleted, but un-acked }; std::atomic_int state = {STATE_NEW}; + spg_t pgid; ///< owning pgid uint64_t id = 0; ///< unique id (within the Session) bool is_new() const { @@ -96,10 +97,11 @@ struct Backoff : public RefCountedObject { SessionRef session; ///< owning session hobject_t begin, end; ///< [) range to block, unless ==, then single obj - Backoff(PGRef pg, SessionRef s, + Backoff(spg_t pgid, PGRef pg, SessionRef s, uint64_t i, const hobject_t& b, const hobject_t& e) : RefCountedObject(g_ceph_context, 0), + pgid(pgid), id(i), lock("Backoff::lock"), pg(pg), @@ -108,7 +110,7 @@ struct Backoff : public RefCountedObject { end(e) {} friend ostream& operator<<(ostream& out, const Backoff& b) { - return out << "Backoff(" << &b << " " << b.id + return out << "Backoff(" << &b << " " << b.pgid << " " << b.id << " " << b.get_state_name() << " [" << b.begin << "," << b.end << ") " << " session " << b.session @@ -139,7 +141,7 @@ struct Session : public RefCountedObject { /// protects backoffs; orders inside Backoff::lock *and* PG::backoff_lock Mutex backoff_lock; std::atomic_int backoff_count= {0}; ///< simple count of backoffs - map> backoffs; + map>> backoffs; std::atomic backoff_seq = {0}; @@ -159,26 +161,32 @@ struct Session : public RefCountedObject { void ack_backoff( CephContext *cct, + spg_t pgid, uint64_t id, const hobject_t& start, const hobject_t& end); - Backoff *have_backoff(const hobject_t& oid) { - if (backoff_count.load()) { - Mutex::Locker l(backoff_lock); - assert(backoff_count == (int)backoffs.size()); - auto p = backoffs.lower_bound(oid); - if (p != backoffs.begin() && - p->first > oid) { - --p; - } - if (p != backoffs.end()) { - int r = cmp(oid, p->first); - if (r == 0 || r > 0) { - for (auto& q : p->second) { - if (r == 0 || oid < q->end) { - return &(*q); - } + Backoff *have_backoff(spg_t pgid, const hobject_t& oid) { + if (!backoff_count.load()) { + return nullptr; + } + Mutex::Locker l(backoff_lock); + assert(!backoff_count == backoffs.empty()); + auto i = backoffs.find(pgid); + if (i == backoffs.end()) { + return nullptr; + } + auto p = i->second.lower_bound(oid); + if (p != i->second.begin() && + p->first > oid) { + --p; + } + if (p != i->second.end()) { + int r = cmp(oid, p->first); + if (r == 0 || r > 0) { + for (auto& q : p->second) { + if (r == 0 || oid < q->end) { + return &(*q); } } } @@ -186,24 +194,37 @@ struct Session : public RefCountedObject { return nullptr; } + void add_backoff(Backoff *b) { + Mutex::Locker l(backoff_lock); + assert(!backoff_count == backoffs.empty()); + backoffs[b->pgid][b->begin].insert(b); + ++backoff_count; + } + // called by PG::release_*_backoffs and PG::clear_backoffs() void rm_backoff(BackoffRef b) { Mutex::Locker l(backoff_lock); assert(b->lock.is_locked_by_me()); assert(b->session == this); - auto p = backoffs.find(b->begin); - // may race with clear_backoffs() - if (p != backoffs.end()) { - auto q = p->second.find(b); - if (q != p->second.end()) { - p->second.erase(q); - if (p->second.empty()) { - backoffs.erase(p); + auto i = backoffs.find(b->pgid); + if (i != backoffs.end()) { + // may race with clear_backoffs() + auto p = i->second.find(b->begin); + if (p != i->second.end()) { + auto q = p->second.find(b); + if (q != p->second.end()) { + p->second.erase(q); --backoff_count; + if (p->second.empty()) { + i->second.erase(p); + if (i->second.empty()) { + backoffs.erase(i); + } + } } } } - assert(backoff_count == (int)backoffs.size()); + assert(!backoff_count == backoffs.empty()); } void clear_backoffs(); }; -- 2.39.5