git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: update OSDService::queue_recovery_context to specify cost
author Sridhar Seshasayee <sseshasa@redhat.com>
Thu, 2 Feb 2023 08:12:39 +0000 (13:42 +0530)
committer Sridhar Seshasayee <sseshasa@redhat.com>
Thu, 27 Apr 2023 13:11:06 +0000 (18:41 +0530)
Previously, we always queued recovery contexts with cost osd_recovery_cost,
which defaults to 20M. With mclock, this caused these items to be heavily
delayed. Instead, base the cost on the operation being queued.

Fixes: https://tracker.ceph.com/issues/58606
Signed-off-by: Samuel Just <sjust@redhat.com>
Signed-off-by: Sridhar Seshasayee <sseshasa@redhat.com>
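
For orientation, the sketch below is a self-contained illustration (not Ceph
code; names are simplified and the sizes are hypothetical) of the cost
selection this commit introduces, mirroring estimate_push_costs() and the
cost_for_queue lambda in the diffs that follow:

    // Minimal standalone sketch; illustrative names and values only.
    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <vector>

    // Stand-in for the per-pull stats summed by
    // C_ReplicatedBackend_OnPullComplete::estimate_push_costs() below.
    struct PullStat {
      uint64_t num_bytes_recovered;
    };

    // Total bytes the follow-on pushes will have to move.
    uint64_t estimate_push_costs(const std::vector<PullStat> &to_continue) {
      uint64_t cost = 0;
      for (const auto &i : to_continue) {
        cost += i.num_bytes_recovered;
      }
      return cost;
    }

    // Mirrors the cost_for_queue lambda added to
    // OSDService::queue_recovery_context(): mclock uses the caller-supplied
    // cost, the legacy WeightedPriorityQueue keeps the fixed osd_recovery_cost.
    uint64_t cost_for_queue(const std::string &osd_op_queue,
                            uint64_t caller_cost,
                            uint64_t osd_recovery_cost = 20 << 20 /* 20M */) {
      if (osd_op_queue == "mclock_scheduler") {
        return caller_cost;
      }
      return osd_recovery_cost;
    }

    int main() {
      // Hypothetical pull response completing two 4 MiB objects.
      std::vector<PullStat> to_continue{{4u << 20}, {4u << 20}};
      uint64_t cost = std::max<uint64_t>(1, estimate_push_costs(to_continue));

      std::cout << "mclock cost: " << cost_for_queue("mclock_scheduler", cost) << "\n";
      std::cout << "wpq cost:    " << cost_for_queue("wpq", cost) << "\n";
      return 0;
    }

With the two hypothetical 4 MiB pulls above, mclock queues the follow-on
pushes at roughly 8 MiB, while the legacy WeightedPriorityQueue keeps the
fixed osd_recovery_cost (20M).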
src/osd/ECBackend.cc
src/osd/OSD.cc
src/osd/OSD.h
src/osd/PGBackend.h
src/osd/PrimaryLogPG.cc
src/osd/PrimaryLogPG.h
src/osd/ReplicatedBackend.cc

diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
index c6c34160a273e8e4b2d97a2ef748cc59219eac3b..685af573ea8099ef7646e64ca0be805c8bcc5b70 100644 (file)
--- a/src/osd/ECBackend.cc
+++ b/src/osd/ECBackend.cc
@@ -1431,9 +1431,25 @@ void ECBackend::filter_read_op(
   }
 
   if (op.in_progress.empty()) {
+    /* This case is odd.  filter_read_op gets called while processing
+     * an OSDMap.  Normal, non-recovery reads only happen from acting
+     * set osds.  For this op to have had a read source go down and
+     * there not be an interval change, it must be part of a pull during
+     * log-based recovery.
+     *
+     * This callback delays calling complete_read_op until later to avoid
+     * dealing with recovery while handling an OSDMap.  We assign a
+     * cost here of 1 because:
+     * 1) This should be very rare, and the operation itself was already
+     *    throttled.
+     * 2) It shouldn't result in IO; rather, it should result in restarting
+     *    the pull on the affected objects and pushes from in-memory buffers
+     *    on any now-complete unaffected objects.
+     */
     get_parent()->schedule_recovery_work(
       get_parent()->bless_unlocked_gencontext(
-       new FinishReadOp(this, op.tid)));
+        new FinishReadOp(this, op.tid)),
+      1);
   }
 }
 
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 7ab35bf0e080b1d02c7f639b0bf6b91aceb37a38..68ff1702d9a667a678fb61d2b269e184843f5277 100644 (file)
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -1696,14 +1696,29 @@ void OSDService::enqueue_front(OpSchedulerItem&& qi)
 
 void OSDService::queue_recovery_context(
   PG *pg,
-  GenContext<ThreadPool::TPHandle&> *c)
+  GenContext<ThreadPool::TPHandle&> *c,
+  uint64_t cost)
 {
   epoch_t e = get_osdmap_epoch();
+
+  uint64_t cost_for_queue = [this, cost] {
+    if (cct->_conf->osd_op_queue == "mclock_scheduler") {
+      return cost;
+    } else {
+      /* We retain this legacy behavior for WeightedPriorityQueue. It seems to
+       * require very large costs for several messages in order to do any
+       * meaningful amount of throttling.  This branch should be removed after
+       * Reef.
+       */
+      return cct->_conf->osd_recovery_cost;
+    }
+  }();
+
   enqueue_back(
     OpSchedulerItem(
       unique_ptr<OpSchedulerItem::OpQueueable>(
        new PGRecoveryContext(pg->get_pgid(), c, e)),
-      cct->_conf->osd_recovery_cost,
+      cost_for_queue,
       cct->_conf->osd_recovery_priority,
       ceph_clock_now(),
       0,
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 2d65098f037bfcb4c1339272b144e7f6638270ea..88f75cc93eb9885b5fc09e2aa7071a37665a0e31 100644 (file)
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -507,7 +507,9 @@ public:
   void send_pg_created();
 
   AsyncReserver<spg_t, Finisher> snap_reserver;
-  void queue_recovery_context(PG *pg, GenContext<ThreadPool::TPHandle&> *c);
+  void queue_recovery_context(PG *pg,
+                              GenContext<ThreadPool::TPHandle&> *c,
+                              uint64_t cost);
   void queue_for_snap_trim(PG *pg);
   void queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority);
 
diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
index 82cb23a95ffa6e47f4ef04f12f310aef4191e6ed..a0919e1d87f57bb3a101d31ceadb413aaf6ec6cc 100644 (file)
--- a/src/osd/PGBackend.h
+++ b/src/osd/PGBackend.h
@@ -260,7 +260,8 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
        const pg_stat_t &stat) = 0;
 
      virtual void schedule_recovery_work(
-       GenContext<ThreadPool::TPHandle&> *c) = 0;
+       GenContext<ThreadPool::TPHandle&> *c,
+       uint64_t cost) = 0;
 
      virtual pg_shard_t whoami_shard() const = 0;
      int whoami() const {
diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc
index 77acdecd0a4c44ebfa3a9e4afeb038bacbd7565c..9fc2a049331b58a803f0536ef8774cab507ecb1b 100644 (file)
--- a/src/osd/PrimaryLogPG.cc
+++ b/src/osd/PrimaryLogPG.cc
@@ -523,9 +523,10 @@ void PrimaryLogPG::on_global_recover(
 }
 
 void PrimaryLogPG::schedule_recovery_work(
-  GenContext<ThreadPool::TPHandle&> *c)
+  GenContext<ThreadPool::TPHandle&> *c,
+  uint64_t cost)
 {
-  osd->queue_recovery_context(this, c);
+  osd->queue_recovery_context(this, c, cost);
 }
 
 void PrimaryLogPG::replica_clear_repop_obc(
diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h
index e854bfc661063a541b236ff6f6526b59f329c3d1..f7356be8e9f010fab06a3781fa766eaa4f1a81c3 100644 (file)
--- a/src/osd/PrimaryLogPG.h
+++ b/src/osd/PrimaryLogPG.h
@@ -543,7 +543,8 @@ public:
   }
 
   void schedule_recovery_work(
-    GenContext<ThreadPool::TPHandle&> *c) override;
+    GenContext<ThreadPool::TPHandle&> *c,
+    uint64_t cost) override;
 
   pg_shard_t whoami_shard() const override {
     return pg_whoami;
diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc
index 4855ddd931ce057457519bbc43313f505d0f27da..99535ed5feb97c2a4e0158975e597dd0d48553bf 100644 (file)
--- a/src/osd/ReplicatedBackend.cc
+++ b/src/osd/ReplicatedBackend.cc
@@ -68,12 +68,14 @@ class PG_SendMessageOnConn: public Context {
 class PG_RecoveryQueueAsync : public Context {
   PGBackend::Listener *pg;
   unique_ptr<GenContext<ThreadPool::TPHandle&>> c;
+  uint64_t cost;
   public:
   PG_RecoveryQueueAsync(
     PGBackend::Listener *pg,
-    GenContext<ThreadPool::TPHandle&> *c) : pg(pg), c(c) {}
+    GenContext<ThreadPool::TPHandle&> *c,
+    uint64_t cost) : pg(pg), c(c), cost(cost) {}
   void finish(int) override {
-    pg->schedule_recovery_work(c.release());
+    pg->schedule_recovery_work(c.release(), cost);
   }
 };
 }
@@ -817,8 +819,11 @@ struct C_ReplicatedBackend_OnPullComplete : GenContext<ThreadPool::TPHandle&> {
   ReplicatedBackend *bc;
   list<ReplicatedBackend::pull_complete_info> to_continue;
   int priority;
-  C_ReplicatedBackend_OnPullComplete(ReplicatedBackend *bc, int priority)
-    : bc(bc), priority(priority) {}
+  C_ReplicatedBackend_OnPullComplete(
+    ReplicatedBackend *bc,
+    int priority,
+    list<ReplicatedBackend::pull_complete_info> &&to_continue)
+    : bc(bc), to_continue(std::move(to_continue)), priority(priority) {}
 
   void finish(ThreadPool::TPHandle &handle) override {
     ReplicatedBackend::RPGHandle *h = bc->_open_recovery_op();
@@ -841,6 +846,15 @@ struct C_ReplicatedBackend_OnPullComplete : GenContext<ThreadPool::TPHandle&> {
     }
     bc->run_recovery_op(h, priority);
   }
+
+  /// Estimate total data reads required to perform pushes
+  uint64_t estimate_push_costs() const {
+    uint64_t cost = 0;
+    for (const auto &i: to_continue) {
+      cost += i.stat.num_bytes_recovered;
+    }
+    return cost;
+  }
 };
 
 void ReplicatedBackend::_do_pull_response(OpRequestRef op)
@@ -870,12 +884,13 @@ void ReplicatedBackend::_do_pull_response(OpRequestRef op)
     C_ReplicatedBackend_OnPullComplete *c =
       new C_ReplicatedBackend_OnPullComplete(
        this,
-       m->get_priority());
-    c->to_continue.swap(to_continue);
+       m->get_priority(),
+       std::move(to_continue));
     t.register_on_complete(
       new PG_RecoveryQueueAsync(
        get_parent(),
-       get_parent()->bless_unlocked_gencontext(c)));
+       get_parent()->bless_unlocked_gencontext(c),
+        std::max<uint64_t>(1, c->estimate_push_costs())));
   }
   replies.erase(replies.end() - 1);