From 0f884fdb31a26f241401ce2a9329dc0f2c4eccc7 Mon Sep 17 00:00:00 2001 From: Guang Yang Date: Mon, 15 Sep 2014 11:41:06 +0000 Subject: [PATCH] For pgls OP, get/put budget on per list session basis, instead of per OP basis, which could lead to deadlock. Signed-off-by: Guang Yang (yguang@yahoo-inc.com) --- src/librados/IoCtxImpl.cc | 4 ++-- src/osdc/Objecter.cc | 33 ++++++++++++++++++++++----- src/osdc/Objecter.h | 48 +++++++++++++++++++++++++++++++-------- 3 files changed, 68 insertions(+), 17 deletions(-) diff --git a/src/librados/IoCtxImpl.cc b/src/librados/IoCtxImpl.cc index aadd04fe6fdb9..210575bed7ad8 100644 --- a/src/librados/IoCtxImpl.cc +++ b/src/librados/IoCtxImpl.cc @@ -759,7 +759,7 @@ int librados::IoCtxImpl::hit_set_list(uint32_t hash, AioCompletionImpl *c, ::ObjectOperation rd; rd.hit_set_ls(pls, NULL); object_locator_t oloc(poolid); - c->tid = objecter->pg_read(hash, oloc, rd, NULL, 0, onack, NULL); + c->tid = objecter->pg_read(hash, oloc, rd, NULL, 0, onack, NULL, NULL); return 0; } @@ -774,7 +774,7 @@ int librados::IoCtxImpl::hit_set_get(uint32_t hash, AioCompletionImpl *c, ::ObjectOperation rd; rd.hit_set_get(utime_t(stamp, 0), pbl, 0); object_locator_t oloc(poolid); - c->tid = objecter->pg_read(hash, oloc, rd, NULL, 0, onack, NULL); + c->tid = objecter->pg_read(hash, oloc, rd, NULL, 0, onack, NULL, NULL); return 0; } diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index 952b2dc492989..7af51e3d268c3 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -1669,14 +1669,14 @@ public: } }; -ceph_tid_t Objecter::op_submit(Op *op) +ceph_tid_t Objecter::op_submit(Op *op, int *ctx_budget) { RWLock::RLocker rl(rwlock); RWLock::Context lc(rwlock, RWLock::Context::TakenForRead); - return _op_submit_with_budget(op, lc); + return _op_submit_with_budget(op, lc, ctx_budget); } -ceph_tid_t Objecter::_op_submit_with_budget(Op *op, RWLock::Context& lc) +ceph_tid_t Objecter::_op_submit_with_budget(Op *op, RWLock::Context& lc, int *ctx_budget) { assert(initialized.read()); @@ -1691,7 +1691,14 @@ ceph_tid_t Objecter::_op_submit_with_budget(Op *op, RWLock::Context& lc) // throttle. before we look at any state, because // take_op_budget() may drop our lock while it blocks. - _take_op_budget(op); + if (!op->ctx_budgeted || (ctx_budget && (*ctx_budget == -1))) { + int op_budget = _take_op_budget(op); + // take and pass out the budget for the first OP + // in the context session + if (ctx_budget && (*ctx_budget == -1)) { + *ctx_budget = op_budget; + } + } return _op_submit(op, lc); } @@ -2287,7 +2294,7 @@ void Objecter::_finish_op(Op *op) assert(op->session->lock.is_wlocked()); - if (op->budgeted) + if (!op->ctx_budgeted && op->budgeted) put_op_budget(op); _session_op_remove(op->session, op); @@ -2690,6 +2697,10 @@ void Objecter::list_objects(ListContext *list_context, Context *onfinish) } } if (list_context->at_end_of_pool) { + // release the listing context's budget once all + // OPs (in the session) are finished + put_list_context_budget(list_context); + onfinish->complete(0); return; } @@ -2721,7 +2732,7 @@ void Objecter::list_objects(ListContext *list_context, Context *onfinish) object_locator_t oloc(list_context->pool_id, list_context->nspace); pg_read(list_context->current_pg, oloc, op, - &list_context->bl, 0, onack, &onack->epoch); + &list_context->bl, 0, onack, &onack->epoch, &list_context->ctx_budget); } void Objecter::_list_reply(ListContext *list_context, int r, @@ -2767,6 +2778,9 @@ void Objecter::_list_reply(ListContext *list_context, int r, } if (!list_context->list.empty()) { ldout(cct, 20) << " returning results so far" << dendl; + // release the listing context's budget once all + // OPs (in the session) are finished + put_list_context_budget(list_context); final_finish->complete(0); return; } @@ -2775,6 +2789,13 @@ void Objecter::_list_reply(ListContext *list_context, int r, list_objects(list_context, final_finish); } +void Objecter::put_list_context_budget(ListContext *list_context) { + if (list_context->ctx_budget >= 0) { + ldout(cct, 10) << " release listing context's budget " << list_context->ctx_budget << dendl; + put_op_budget_bytes(list_context->ctx_budget); + list_context->ctx_budget = -1; + } + } //snapshots diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index 342dda5e3cc05..ca8411ca46c00 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -1137,6 +1137,11 @@ public: /// true if we should resend this message on failure bool should_resend; + /// true if the throttle budget is get/put on a series of OPs, instead of + /// per OP basis, when this flag is set, the budget is acquired before sending + /// the very first OP of the series and released upon receiving the last OP reply. + bool ctx_budgeted; + Op(const object_t& o, const object_locator_t& ol, vector& op, int f, Context *ac, Context *co, version_t *ov) : session(NULL), incarnation(0), @@ -1150,7 +1155,8 @@ public: objver(ov), reply_epoch(NULL), map_dne_bound(0), budgeted(false), - should_resend(true) { + should_resend(true), + ctx_budgeted(false) { ops.swap(op); /* initialize out_* to match op vector */ @@ -1255,11 +1261,24 @@ public: bufferlist extra_info; + // The budget associated with this context, once it is set (>= 0), + // the budget is not get/released on OP basis, instead the budget + // is acquired before sending the first OP and released upon receiving + // the last op reply. + int ctx_budget; + ListContext() : current_pg(0), current_pg_epoch(0), starting_pg_num(0), at_end_of_pool(false), at_end_of_pg(false), pool_id(0), - pool_snap_seq(0), max_entries(0) {} + pool_snap_seq(0), + max_entries(0), + nspace(), + bl(), + list(), + filter(), + extra_info(), + ctx_budget(-1) {} bool at_end() const { return at_end_of_pool; @@ -1567,7 +1586,7 @@ public: */ int calc_op_budget(Op *op); void _throttle_op(Op *op, int op_size=0); - void _take_op_budget(Op *op) { + int _take_op_budget(Op *op) { assert(rwlock.is_locked()); int op_budget = calc_op_budget(op); if (keep_balanced_budget) { @@ -1577,13 +1596,19 @@ public: op_throttle_ops.take(1); } op->budgeted = true; + return op_budget; + } + void put_op_budget_bytes(int op_budget) { + assert(op_budget >= 0); + op_throttle_bytes.put(op_budget); + op_throttle_ops.put(1); } void put_op_budget(Op *op) { assert(op->budgeted); int op_budget = calc_op_budget(op); - op_throttle_bytes.put(op_budget); - op_throttle_ops.put(1); + put_op_budget_bytes(op_budget); } + void put_list_context_budget(ListContext *list_context); Throttle op_throttle_bytes, op_throttle_ops; public: @@ -1679,12 +1704,12 @@ private: // low-level ceph_tid_t _op_submit(Op *op, RWLock::Context& lc); - ceph_tid_t _op_submit_with_budget(Op *op, RWLock::Context& lc); + ceph_tid_t _op_submit_with_budget(Op *op, RWLock::Context& lc, int *ctx_budget = NULL); inline void unregister_op(Op *op); // public interface public: - ceph_tid_t op_submit(Op *op); + ceph_tid_t op_submit(Op *op, int *ctx_budget = NULL); bool is_active() { return !((!inflight_ops.read()) && linger_ops.empty() && poolstat_ops.empty() && statfs_ops.empty()); } @@ -1800,7 +1825,8 @@ public: ObjectOperation& op, bufferlist *pbl, int flags, Context *onack, - epoch_t *reply_epoch) { + epoch_t *reply_epoch, + int *ctx_budget) { Op *o = new Op(object_t(), oloc, op.ops, flags | global_op_flags.read() | CEPH_OSD_FLAG_READ, onack, NULL, NULL); @@ -1813,7 +1839,11 @@ public: o->out_handler.swap(op.out_handler); o->out_rval.swap(op.out_rval); o->reply_epoch = reply_epoch; - return op_submit(o); + if (ctx_budget) { + // budget is tracked by listing context + o->ctx_budgeted = true; + } + return op_submit(o, ctx_budget); } ceph_tid_t linger_mutate(const object_t& oid, const object_locator_t& oloc, ObjectOperation& op, -- 2.39.5