From: Sridhar Seshasayee <sseshasa@redhat.com>
Date: Fri, 3 Feb 2023 12:17:38 +0000 (+0530)
Subject: osd: update OSDService::queue_recovery_context to specify cost
X-Git-Tag: v17.2.7~418^2~27
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=c30f2729b4815e56229c711cf0789cb704aec5dd;p=ceph.git

osd: update OSDService::queue_recovery_context to specify cost

Previously, we always queued this with cost osd_recovery_cost, which
defaults to 20M. With mclock, this caused these items to be delayed
heavily. Instead, base the cost on the operation being queued.

Fixes: https://tracker.ceph.com/issues/58606
Signed-off-by: Samuel Just <sjust@redhat.com>
Signed-off-by: Sridhar Seshasayee <sseshasa@redhat.com>
---

diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
index b061e0f69ca1..dc4fa3d0558c 100644
--- a/src/osd/ECBackend.cc
+++ b/src/osd/ECBackend.cc
@@ -1433,9 +1433,25 @@ void ECBackend::filter_read_op(
   }
 
   if (op.in_progress.empty()) {
+    /* This case is odd.  filter_read_op gets called while processing
+     * an OSDMap.  Normal, non-recovery reads only happen from acting
+     * set osds.  For this op to have had a read source go down and
+     * there not be an interval change, it must be part of a pull during
+     * log-based recovery.
+     *
+     * This callback delays calling complete_read_op until later to avoid
+     * dealing with recovery while handling an OSDMap.  We assign a
+     * cost here of 1 because:
+     * 1) This should be very rare, and the operation itself was already
+     *    throttled.
+     * 2) It shouldn't result in IO, rather it should result in restarting
+     *    the pull on the affected objects and pushes from in-memory buffers
+     *    on any now complete unaffected objects.
+     */
     get_parent()->schedule_recovery_work(
       get_parent()->bless_unlocked_gencontext(
-        new FinishReadOp(this, op.tid)));
+        new FinishReadOp(this, op.tid)),
+      1);
   }
 }

diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index aed5057b4559..d3cd2a116325 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -1639,14 +1639,29 @@ void OSDService::enqueue_front(OpSchedulerItem&& qi)
 
 void OSDService::queue_recovery_context(
   PG *pg,
-  GenContext<ThreadPool::TPHandle&> *c)
+  GenContext<ThreadPool::TPHandle&> *c,
+  uint64_t cost)
 {
   epoch_t e = get_osdmap_epoch();
+
+  uint64_t cost_for_queue = [this, cost] {
+    if (cct->_conf->osd_op_queue == "mclock_scheduler") {
+      return cost;
+    } else {
+      /* We retain this legacy behavior for WeightedPriorityQueue. It seems to
+       * require very large costs for several messages in order to do any
+       * meaningful amount of throttling. This branch should be removed after
+       * Reef.
+       */
+      return cct->_conf->osd_recovery_cost;
+    }
+  }();
+
   enqueue_back(
     OpSchedulerItem(
       unique_ptr<OpSchedulerItem::OpQueueable>(
         new PGRecoveryContext(pg->get_pgid(), c, e)),
-      cct->_conf->osd_recovery_cost,
+      cost_for_queue,
       cct->_conf->osd_recovery_priority,
       ceph_clock_now(),
       0,
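The cost_for_queue lambda above is the heart of the change: under mclock the
caller-supplied cost flows through, while WeightedPriorityQueue keeps the old
flat osd_recovery_cost. A standalone sketch of that selection, for readers
outside the tree (the Config struct and its literal values are illustrative
stand-ins for cct->_conf, not Ceph's ConfigProxy API):

    #include <cstdint>
    #include <iostream>
    #include <string>

    // Hypothetical stand-in for the two config options the patch consults.
    struct Config {
      std::string osd_op_queue = "mclock_scheduler";
      uint64_t osd_recovery_cost = 20 << 20;  // legacy default: 20M
    };

    // mclock sees the per-item cost the caller computed (bytes to push,
    // or 1 for the rare ECBackend case); wpq keeps the legacy flat cost.
    uint64_t cost_for_queue(const Config &conf, uint64_t cost) {
      if (conf.osd_op_queue == "mclock_scheduler") {
        return cost;
      }
      return conf.osd_recovery_cost;
    }

    int main() {
      Config conf;
      std::cout << cost_for_queue(conf, 1) << "\n";  // 1 under mclock
      conf.osd_op_queue = "wpq";
      std::cout << cost_for_queue(conf, 1) << "\n";  // 20971520 under wpq
    }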
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 18946f1e9678..8890c700e17a 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -536,7 +536,9 @@ public:
   void send_pg_created();
 
   AsyncReserver<spg_t> snap_reserver;
-  void queue_recovery_context(PG *pg, GenContext<ThreadPool::TPHandle&> *c);
+  void queue_recovery_context(PG *pg,
+                              GenContext<ThreadPool::TPHandle&> *c,
+                              uint64_t cost);
   void queue_for_snap_trim(PG *pg);
   void queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority);

diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
index 34d74025173e..400b00347198 100644
--- a/src/osd/PGBackend.h
+++ b/src/osd/PGBackend.h
@@ -260,7 +260,8 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
       const pg_stat_t &stat) = 0;
 
     virtual void schedule_recovery_work(
-      GenContext<ThreadPool::TPHandle&> *c) = 0;
+      GenContext<ThreadPool::TPHandle&> *c,
+      uint64_t cost) = 0;
 
     virtual pg_shard_t whoami_shard() const = 0;
     int whoami() const {

diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc
index c86f40bb392d..bc2c4e0ca84d 100644
--- a/src/osd/PrimaryLogPG.cc
+++ b/src/osd/PrimaryLogPG.cc
@@ -525,9 +525,10 @@ void PrimaryLogPG::on_global_recover(
 }
 
 void PrimaryLogPG::schedule_recovery_work(
-  GenContext<ThreadPool::TPHandle&> *c)
+  GenContext<ThreadPool::TPHandle&> *c,
+  uint64_t cost)
 {
-  osd->queue_recovery_context(this, c);
+  osd->queue_recovery_context(this, c, cost);
 }
 
 void PrimaryLogPG::replica_clear_repop_obc(

diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h
index 1a2c1149c7b7..14b9aeb25d70 100644
--- a/src/osd/PrimaryLogPG.h
+++ b/src/osd/PrimaryLogPG.h
@@ -545,7 +545,8 @@ public:
   }
 
   void schedule_recovery_work(
-    GenContext<ThreadPool::TPHandle&> *c) override;
+    GenContext<ThreadPool::TPHandle&> *c,
+    uint64_t cost) override;
 
   pg_shard_t whoami_shard() const override {
     return pg_whoami;
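The three header changes above are pure plumbing: schedule_recovery_work now
carries a cost from whichever backend computed it down to
OSDService::queue_recovery_context. A pared-down sketch of that chain (class
names here are simplified stand-ins, not the real Ceph types):

    #include <cstdint>
    #include <iostream>

    // Stand-in for GenContext<ThreadPool::TPHandle&>.
    struct RecoveryCtx {};

    // Mirrors PGBackend::Listener: callers now state how much work the
    // queued context represents instead of relying on a flat config value.
    struct Listener {
      virtual ~Listener() = default;
      virtual void schedule_recovery_work(RecoveryCtx *c, uint64_t cost) = 0;
    };

    // Mirrors PrimaryLogPG: the override just forwards the cost to the queue.
    struct PG : Listener {
      void schedule_recovery_work(RecoveryCtx *c, uint64_t cost) override {
        std::cout << "enqueue recovery context, cost=" << cost << "\n";
        delete c;
      }
    };

    int main() {
      PG pg;
      Listener &l = pg;
      l.schedule_recovery_work(new RecoveryCtx, 1);  // e.g. ECBackend's cost of 1
    }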
diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc
index 50a6c60d7f0f..718f526c4ce7 100644
--- a/src/osd/ReplicatedBackend.cc
+++ b/src/osd/ReplicatedBackend.cc
@@ -68,12 +68,14 @@ class PG_SendMessageOnConn: public Context {
 class PG_RecoveryQueueAsync : public Context {
   PGBackend::Listener *pg;
   unique_ptr<GenContext<ThreadPool::TPHandle&>> c;
+  uint64_t cost;
   public:
   PG_RecoveryQueueAsync(
     PGBackend::Listener *pg,
-    GenContext<ThreadPool::TPHandle&> *c) : pg(pg), c(c) {}
+    GenContext<ThreadPool::TPHandle&> *c,
+    uint64_t cost) : pg(pg), c(c), cost(cost) {}
   void finish(int) override {
-    pg->schedule_recovery_work(c.release());
+    pg->schedule_recovery_work(c.release(), cost);
   }
 };
 }
@@ -819,8 +821,11 @@ struct C_ReplicatedBackend_OnPullComplete : GenContext<ThreadPool::TPHandle&> {
   ReplicatedBackend *bc;
   list<ReplicatedBackend::pull_complete_info> to_continue;
   int priority;
-  C_ReplicatedBackend_OnPullComplete(ReplicatedBackend *bc, int priority)
-    : bc(bc), priority(priority) {}
+  C_ReplicatedBackend_OnPullComplete(
+    ReplicatedBackend *bc,
+    int priority,
+    list<ReplicatedBackend::pull_complete_info> &&to_continue)
+    : bc(bc), to_continue(std::move(to_continue)), priority(priority) {}
 
   void finish(ThreadPool::TPHandle &handle) override {
     ReplicatedBackend::RPGHandle *h = bc->_open_recovery_op();
@@ -843,6 +848,15 @@ struct C_ReplicatedBackend_OnPullComplete : GenContext<ThreadPool::TPHandle&> {
     }
     bc->run_recovery_op(h, priority);
   }
+
+  /// Estimate total data reads required to perform pushes
+  uint64_t estimate_push_costs() const {
+    uint64_t cost = 0;
+    for (const auto &i: to_continue) {
+      cost += i.stat.num_bytes_recovered;
+    }
+    return cost;
+  }
 };
 
 void ReplicatedBackend::_do_pull_response(OpRequestRef op)
@@ -872,12 +886,13 @@ void ReplicatedBackend::_do_pull_response(OpRequestRef op)
     C_ReplicatedBackend_OnPullComplete *c =
       new C_ReplicatedBackend_OnPullComplete(
         this,
-        m->get_priority());
-    c->to_continue.swap(to_continue);
+        m->get_priority(),
+        std::move(to_continue));
     t.register_on_complete(
       new PG_RecoveryQueueAsync(
         get_parent(),
-        get_parent()->bless_unlocked_gencontext(c)));
+        get_parent()->bless_unlocked_gencontext(c),
+        std::max<uint64_t>(1, c->estimate_push_costs())));
   }
   replies.erase(replies.end() - 1);
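estimate_push_costs sums num_bytes_recovered over the pulls that just
completed, and the call site clamps the result to at least 1 so a
metadata-only batch never enqueues a zero-cost item. A self-contained sketch
of that arithmetic (pull_complete_info here is a pared-down stand-in for the
real struct):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <list>

    // Pared-down stand-in for ReplicatedBackend::pull_complete_info.
    struct pull_complete_info {
      struct { uint64_t num_bytes_recovered; } stat;
    };

    // Same aggregation as estimate_push_costs(): the bytes pulled are a
    // proxy for the data the follow-up pushes will have to move.
    uint64_t estimate_push_costs(const std::list<pull_complete_info> &to_continue) {
      uint64_t cost = 0;
      for (const auto &i : to_continue) {
        cost += i.stat.num_bytes_recovered;
      }
      return cost;
    }

    int main() {
      std::list<pull_complete_info> done = {{{4096}}, {{8192}}};
      // The call site clamps to >= 1 so the scheduler never sees cost 0.
      std::cout << std::max<uint64_t>(1, estimate_push_costs(done)) << "\n";  // 12288
      std::cout << std::max<uint64_t>(1, estimate_push_costs({})) << "\n";    // 1
    }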