From: Sridhar Seshasayee <sseshasa@redhat.com>
Date: Fri, 3 Feb 2023 12:17:38 +0000 (+0530)
Subject: osd: update OSDService::queue_recovery_context to specify cost
X-Git-Tag: v17.2.7~418^2~27
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=c30f2729b4815e56229c711cf0789cb704aec5dd;p=ceph.git

osd: update OSDService::queue_recovery_context to specify cost

Previously, we always queued this with cost osd_recovery_cost, which
defaults to 20M. With mclock, this caused these items to be delayed
heavily. Instead, base the cost on the operation being queued.

Fixes: https://tracker.ceph.com/issues/58606
Signed-off-by: Samuel Just <sjust@redhat.com>
Signed-off-by: Sridhar Seshasayee <sseshasa@redhat.com>
---

diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
index b061e0f69ca1..dc4fa3d0558c 100644
--- a/src/osd/ECBackend.cc
+++ b/src/osd/ECBackend.cc
@@ -1433,9 +1433,25 @@ void ECBackend::filter_read_op(
   }
 
   if (op.in_progress.empty()) {
+    /* This case is odd.  filter_read_op gets called while processing
+     * an OSDMap.  Normal, non-recovery reads only happen from acting
+     * set osds.  For this op to have had a read source go down and
+     * there not be an interval change, it must be part of a pull during
+     * log-based recovery.
+     *
+     * This callback delays calling complete_read_op until later to avoid
+     * dealing with recovery while handling an OSDMap.  We assign a
+     * cost here of 1 because:
+     * 1) This should be very rare, and the operation itself was already
+     *    throttled.
+     * 2) It shouldn't result in IO, rather it should result in restarting
+     *    the pull on the affected objects and pushes from in-memory buffers
+     *    on any now complete unaffected objects.
+     */
     get_parent()->schedule_recovery_work(
       get_parent()->bless_unlocked_gencontext(
-        new FinishReadOp(this, op.tid)));
+        new FinishReadOp(this, op.tid)),
+      1);
   }
 }

diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index aed5057b4559..d3cd2a116325 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -1639,14 +1639,29 @@ void OSDService::enqueue_front(OpSchedulerItem&& qi)
 
 void OSDService::queue_recovery_context(
   PG *pg,
-  GenContext<ThreadPool::TPHandle&> *c)
+  GenContext<ThreadPool::TPHandle&> *c,
+  uint64_t cost)
 {
   epoch_t e = get_osdmap_epoch();
+
+  uint64_t cost_for_queue = [this, cost] {
+    if (cct->_conf->osd_op_queue == "mclock_scheduler") {
+      return cost;
+    } else {
+      /* We retain this legacy behavior for WeightedPriorityQueue. It seems to
+       * require very large costs for several messages in order to do any
+       * meaningful amount of throttling. This branch should be removed after
+       * Reef.
+       */
+      return cct->_conf->osd_recovery_cost;
+    }
+  }();
+
   enqueue_back(
     OpSchedulerItem(
       unique_ptr<OpSchedulerItem::OpQueueable>(
         new PGRecoveryContext(pg->get_pgid(), c, e)),
-      cct->_conf->osd_recovery_cost,
+      cost_for_queue,
       cct->_conf->osd_recovery_priority,
       ceph_clock_now(),
       0,
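The cost_for_queue lambda above is the heart of the change: under mclock the
caller-supplied cost flows through, while WeightedPriorityQueue keeps the old
flat osd_recovery_cost. A standalone sketch of that selection, for readers
outside the tree (the Config struct and its literal values are illustrative
stand-ins for cct->_conf, not Ceph's ConfigProxy API):

    #include <cstdint>
    #include <iostream>
    #include <string>

    // Hypothetical stand-in for the two config options the patch consults.
    struct Config {
      std::string osd_op_queue = "mclock_scheduler";
      uint64_t osd_recovery_cost = 20 << 20;  // legacy default: 20M
    };

    // mclock sees the per-item cost the caller computed (bytes to push,
    // or 1 for the rare ECBackend case); wpq keeps the legacy flat cost.
    uint64_t cost_for_queue(const Config &conf, uint64_t cost) {
      if (conf.osd_op_queue == "mclock_scheduler") {
        return cost;
      }
      return conf.osd_recovery_cost;
    }

    int main() {
      Config conf;
      std::cout << cost_for_queue(conf, 1) << "\n";  // 1 under mclock
      conf.osd_op_queue = "wpq";
      std::cout << cost_for_queue(conf, 1) << "\n";  // 20971520 under wpq
    }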
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 18946f1e9678..8890c700e17a 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -536,7 +536,9 @@ public:
   void send_pg_created();
 
   AsyncReserver<spg_t> snap_reserver;
-  void queue_recovery_context(PG *pg, GenContext<ThreadPool::TPHandle&> *c);
+  void queue_recovery_context(PG *pg,
+                              GenContext<ThreadPool::TPHandle&> *c,
+                              uint64_t cost);
   void queue_for_snap_trim(PG *pg);
   void queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority);

diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
index 34d74025173e..400b00347198 100644
--- a/src/osd/PGBackend.h
+++ b/src/osd/PGBackend.h
@@ -260,7 +260,8 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
       const pg_stat_t &stat) = 0;
 
     virtual void schedule_recovery_work(
-      GenContext<ThreadPool::TPHandle&> *c) = 0;
+      GenContext<ThreadPool::TPHandle&> *c,
+      uint64_t cost) = 0;
 
     virtual pg_shard_t whoami_shard() const = 0;
     int whoami() const {

diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc
index c86f40bb392d..bc2c4e0ca84d 100644
--- a/src/osd/PrimaryLogPG.cc
+++ b/src/osd/PrimaryLogPG.cc
@@ -525,9 +525,10 @@ void PrimaryLogPG::on_global_recover(
 }
 
 void PrimaryLogPG::schedule_recovery_work(
-  GenContext<ThreadPool::TPHandle&> *c)
+  GenContext<ThreadPool::TPHandle&> *c,
+  uint64_t cost)
 {
-  osd->queue_recovery_context(this, c);
+  osd->queue_recovery_context(this, c, cost);
 }
 
 void PrimaryLogPG::replica_clear_repop_obc(

diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h
index 1a2c1149c7b7..14b9aeb25d70 100644
--- a/src/osd/PrimaryLogPG.h
+++ b/src/osd/PrimaryLogPG.h
@@ -545,7 +545,8 @@ public:
   }
 
   void schedule_recovery_work(
-    GenContext<ThreadPool::TPHandle&> *c) override;
+    GenContext<ThreadPool::TPHandle&> *c,
+    uint64_t cost) override;
 
   pg_shard_t whoami_shard() const override {
     return pg_whoami;
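The three header changes above are pure plumbing: schedule_recovery_work now
carries a cost from whichever backend computed it down to
OSDService::queue_recovery_context. A pared-down sketch of that chain (class
names here are simplified stand-ins, not the real Ceph types):

    #include <cstdint>
    #include <iostream>

    // Stand-in for GenContext<ThreadPool::TPHandle&>.
    struct RecoveryCtx {};

    // Mirrors PGBackend::Listener: callers now state how much work the
    // queued context represents instead of relying on a flat config value.
    struct Listener {
      virtual ~Listener() = default;
      virtual void schedule_recovery_work(RecoveryCtx *c, uint64_t cost) = 0;
    };

    // Mirrors PrimaryLogPG: the override just forwards the cost to the queue.
    struct PG : Listener {
      void schedule_recovery_work(RecoveryCtx *c, uint64_t cost) override {
        std::cout << "enqueue recovery context, cost=" << cost << "\n";
        delete c;
      }
    };

    int main() {
      PG pg;
      Listener &l = pg;
      l.schedule_recovery_work(new RecoveryCtx, 1);  // e.g. ECBackend's cost of 1
    }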
diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc
index 50a6c60d7f0f..718f526c4ce7 100644
--- a/src/osd/ReplicatedBackend.cc
+++ b/src/osd/ReplicatedBackend.cc
@@ -68,12 +68,14 @@ class PG_SendMessageOnConn: public Context {
 class PG_RecoveryQueueAsync : public Context {
   PGBackend::Listener *pg;
   unique_ptr<GenContext<ThreadPool::TPHandle&>> c;
+  uint64_t cost;
   public:
   PG_RecoveryQueueAsync(
     PGBackend::Listener *pg,
-    GenContext<ThreadPool::TPHandle&> *c) : pg(pg), c(c) {}
+    GenContext<ThreadPool::TPHandle&> *c,
+    uint64_t cost) : pg(pg), c(c), cost(cost) {}
   void finish(int) override {
-    pg->schedule_recovery_work(c.release());
+    pg->schedule_recovery_work(c.release(), cost);
   }
 };
 }
@@ -819,8 +821,11 @@ struct C_ReplicatedBackend_OnPullComplete : GenContext<ThreadPool::TPHandle&> {
   ReplicatedBackend *bc;
   list<ReplicatedBackend::pull_complete_info> to_continue;
   int priority;
-  C_ReplicatedBackend_OnPullComplete(ReplicatedBackend *bc, int priority)
-    : bc(bc), priority(priority) {}
+  C_ReplicatedBackend_OnPullComplete(
+    ReplicatedBackend *bc,
+    int priority,
+    list<ReplicatedBackend::pull_complete_info> &&to_continue)
+    : bc(bc), to_continue(std::move(to_continue)), priority(priority) {}
 
   void finish(ThreadPool::TPHandle &handle) override {
     ReplicatedBackend::RPGHandle *h = bc->_open_recovery_op();
@@ -843,6 +848,15 @@ struct C_ReplicatedBackend_OnPullComplete : GenContext<ThreadPool::TPHandle&> {
     }
     bc->run_recovery_op(h, priority);
   }
+
+  /// Estimate total data reads required to perform pushes
+  uint64_t estimate_push_costs() const {
+    uint64_t cost = 0;
+    for (const auto &i: to_continue) {
+      cost += i.stat.num_bytes_recovered;
+    }
+    return cost;
+  }
 };
 
 void ReplicatedBackend::_do_pull_response(OpRequestRef op)
@@ -872,12 +886,13 @@ void ReplicatedBackend::_do_pull_response(OpRequestRef op)
     C_ReplicatedBackend_OnPullComplete *c =
       new C_ReplicatedBackend_OnPullComplete(
         this,
-        m->get_priority());
-    c->to_continue.swap(to_continue);
+        m->get_priority(),
+        std::move(to_continue));
     t.register_on_complete(
       new PG_RecoveryQueueAsync(
         get_parent(),
-        get_parent()->bless_unlocked_gencontext(c)));
+        get_parent()->bless_unlocked_gencontext(c),
+        std::max<uint64_t>(1, c->estimate_push_costs())));
   }
   replies.erase(replies.end() - 1);
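estimate_push_costs sums num_bytes_recovered over the pulls that just
completed, and the call site clamps the result to at least 1 so a
metadata-only batch never enqueues a zero-cost item. A self-contained sketch
of that arithmetic (pull_complete_info here is a pared-down stand-in for the
real struct):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <list>

    // Pared-down stand-in for ReplicatedBackend::pull_complete_info.
    struct pull_complete_info {
      struct { uint64_t num_bytes_recovered; } stat;
    };

    // Same aggregation as estimate_push_costs(): the bytes pulled are a
    // proxy for the data the follow-up pushes will have to move.
    uint64_t estimate_push_costs(const std::list<pull_complete_info> &to_continue) {
      uint64_t cost = 0;
      for (const auto &i : to_continue) {
        cost += i.stat.num_bytes_recovered;
      }
      return cost;
    }

    int main() {
      std::list<pull_complete_info> done = {{{4096}}, {{8192}}};
      // The call site clamps to >= 1 so the scheduler never sees cost 0.
      std::cout << std::max<uint64_t>(1, estimate_push_costs(done)) << "\n";  // 12288
      std::cout << std::max<uint64_t>(1, estimate_push_costs({})) << "\n";    // 1
    }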