git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: update OSDService::queue_recovery_context to specify cost
author Sridhar Seshasayee <sseshasa@redhat.com>
Thu, 2 Feb 2023 08:12:39 +0000 (13:42 +0530)
committer Sridhar Seshasayee <sseshasa@redhat.com>
Thu, 27 Apr 2023 13:11:06 +0000 (18:41 +0530)
Previously, we always queued recovery contexts with cost osd_recovery_cost,
which defaults to 20M. With mclock, this caused these items to be heavily
delayed. Instead, base the cost on the operation being queued.

Fixes: https://tracker.ceph.com/issues/58606
Signed-off-by: Samuel Just <sjust@redhat.com>
Signed-off-by: Sridhar Seshasayee <sseshasa@redhat.com>
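
For orientation, the sketch below is a self-contained illustration (not Ceph
code; names are simplified and the sizes are hypothetical) of the cost
selection this commit introduces, mirroring estimate_push_costs() and the
cost_for_queue lambda in the diffs that follow:

    // Minimal standalone sketch; illustrative names and values only.
    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <vector>

    // Stand-in for the per-pull stats summed by
    // C_ReplicatedBackend_OnPullComplete::estimate_push_costs() below.
    struct PullStat {
      uint64_t num_bytes_recovered;
    };

    // Total bytes the follow-on pushes will have to move.
    uint64_t estimate_push_costs(const std::vector<PullStat> &to_continue) {
      uint64_t cost = 0;
      for (const auto &i : to_continue) {
        cost += i.num_bytes_recovered;
      }
      return cost;
    }

    // Mirrors the cost_for_queue lambda added to
    // OSDService::queue_recovery_context(): mclock uses the caller-supplied
    // cost, the legacy WeightedPriorityQueue keeps the fixed osd_recovery_cost.
    uint64_t cost_for_queue(const std::string &osd_op_queue,
                            uint64_t caller_cost,
                            uint64_t osd_recovery_cost = 20 << 20 /* 20M */) {
      if (osd_op_queue == "mclock_scheduler") {
        return caller_cost;
      }
      return osd_recovery_cost;
    }

    int main() {
      // Hypothetical pull response completing two 4 MiB objects.
      std::vector<PullStat> to_continue{{4u << 20}, {4u << 20}};
      uint64_t cost = std::max<uint64_t>(1, estimate_push_costs(to_continue));

      std::cout << "mclock cost: " << cost_for_queue("mclock_scheduler", cost) << "\n";
      std::cout << "wpq cost:    " << cost_for_queue("wpq", cost) << "\n";
      return 0;
    }

With the two hypothetical 4 MiB pulls above, mclock queues the follow-on
pushes at roughly 8 MiB, while the legacy WeightedPriorityQueue keeps the
fixed osd_recovery_cost (20M).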
src/osd/ECBackend.cc
src/osd/OSD.cc
src/osd/OSD.h
src/osd/PGBackend.h
src/osd/PrimaryLogPG.cc
src/osd/PrimaryLogPG.h
src/osd/ReplicatedBackend.cc

diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
index c6c34160a273e8e4b2d97a2ef748cc59219eac3b..685af573ea8099ef7646e64ca0be805c8bcc5b70 100644 (file)
--- a/src/osd/ECBackend.cc
+++ b/src/osd/ECBackend.cc
@@ -1431,9 +1431,25 @@ void ECBackend::filter_read_op(
   }
 
   if (op.in_progress.empty()) {
+    /* This case is odd.  filter_read_op gets called while processing
+     * an OSDMap.  Normal, non-recovery reads only happen from acting
+     * set osds.  For this op to have had a read source go down and
+     * there not be an interval change, it must be part of a pull during
+     * log-based recovery.
+     *
+     * This callback delays calling complete_read_op until later to avoid
+     * dealing with recovery while handling an OSDMap.  We assign a
+     * cost here of 1 because:
+     * 1) This should be very rare, and the operation itself was already
+     *    throttled.
+     * 2) It shouldn't result in IO; rather, it should result in restarting
+     *    the pull on the affected objects and pushes from in-memory buffers
+     *    on any now-complete unaffected objects.
+     */
     get_parent()->schedule_recovery_work(
       get_parent()->bless_unlocked_gencontext(
-       new FinishReadOp(this, op.tid)));
+        new FinishReadOp(this, op.tid)),
+      1);
   }
 }
 
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 7ab35bf0e080b1d02c7f639b0bf6b91aceb37a38..68ff1702d9a667a678fb61d2b269e184843f5277 100644 (file)
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -1696,14 +1696,29 @@ void OSDService::enqueue_front(OpSchedulerItem&& qi)
 
 void OSDService::queue_recovery_context(
   PG *pg,
-  GenContext<ThreadPool::TPHandle&> *c)
+  GenContext<ThreadPool::TPHandle&> *c,
+  uint64_t cost)
 {
   epoch_t e = get_osdmap_epoch();
+
+  uint64_t cost_for_queue = [this, cost] {
+    if (cct->_conf->osd_op_queue == "mclock_scheduler") {
+      return cost;
+    } else {
+      /* We retain this legacy behavior for WeightedPriorityQueue. It seems to
+       * require very large costs for several messages in order to do any
+       * meaningful amount of throttling.  This branch should be removed after
+       * Reef.
+       */
+      return cct->_conf->osd_recovery_cost;
+    }
+  }();
+
   enqueue_back(
     OpSchedulerItem(
       unique_ptr<OpSchedulerItem::OpQueueable>(
        new PGRecoveryContext(pg->get_pgid(), c, e)),
-      cct->_conf->osd_recovery_cost,
+      cost_for_queue,
       cct->_conf->osd_recovery_priority,
       ceph_clock_now(),
       0,
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 2d65098f037bfcb4c1339272b144e7f6638270ea..88f75cc93eb9885b5fc09e2aa7071a37665a0e31 100644 (file)
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -507,7 +507,9 @@ public:
   void send_pg_created();
 
   AsyncReserver<spg_t, Finisher> snap_reserver;
-  void queue_recovery_context(PG *pg, GenContext<ThreadPool::TPHandle&> *c);
+  void queue_recovery_context(PG *pg,
+                              GenContext<ThreadPool::TPHandle&> *c,
+                              uint64_t cost);
   void queue_for_snap_trim(PG *pg);
   void queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority);
 
diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
index 82cb23a95ffa6e47f4ef04f12f310aef4191e6ed..a0919e1d87f57bb3a101d31ceadb413aaf6ec6cc 100644 (file)
--- a/src/osd/PGBackend.h
+++ b/src/osd/PGBackend.h
@@ -260,7 +260,8 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
        const pg_stat_t &stat) = 0;
 
      virtual void schedule_recovery_work(
-       GenContext<ThreadPool::TPHandle&> *c) = 0;
+       GenContext<ThreadPool::TPHandle&> *c,
+       uint64_t cost) = 0;
 
      virtual pg_shard_t whoami_shard() const = 0;
      int whoami() const {
diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc
index 77acdecd0a4c44ebfa3a9e4afeb038bacbd7565c..9fc2a049331b58a803f0536ef8774cab507ecb1b 100644 (file)
--- a/src/osd/PrimaryLogPG.cc
+++ b/src/osd/PrimaryLogPG.cc
@@ -523,9 +523,10 @@ void PrimaryLogPG::on_global_recover(
 }
 
 void PrimaryLogPG::schedule_recovery_work(
-  GenContext<ThreadPool::TPHandle&> *c)
+  GenContext<ThreadPool::TPHandle&> *c,
+  uint64_t cost)
 {
-  osd->queue_recovery_context(this, c);
+  osd->queue_recovery_context(this, c, cost);
 }
 
 void PrimaryLogPG::replica_clear_repop_obc(
diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h
index e854bfc661063a541b236ff6f6526b59f329c3d1..f7356be8e9f010fab06a3781fa766eaa4f1a81c3 100644 (file)
--- a/src/osd/PrimaryLogPG.h
+++ b/src/osd/PrimaryLogPG.h
@@ -543,7 +543,8 @@ public:
   }
 
   void schedule_recovery_work(
-    GenContext<ThreadPool::TPHandle&> *c) override;
+    GenContext<ThreadPool::TPHandle&> *c,
+    uint64_t cost) override;
 
   pg_shard_t whoami_shard() const override {
     return pg_whoami;
diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc
index 4855ddd931ce057457519bbc43313f505d0f27da..99535ed5feb97c2a4e0158975e597dd0d48553bf 100644 (file)
--- a/src/osd/ReplicatedBackend.cc
+++ b/src/osd/ReplicatedBackend.cc
@@ -68,12 +68,14 @@ class PG_SendMessageOnConn: public Context {
 class PG_RecoveryQueueAsync : public Context {
   PGBackend::Listener *pg;
   unique_ptr<GenContext<ThreadPool::TPHandle&>> c;
+  uint64_t cost;
   public:
   PG_RecoveryQueueAsync(
     PGBackend::Listener *pg,
-    GenContext<ThreadPool::TPHandle&> *c) : pg(pg), c(c) {}
+    GenContext<ThreadPool::TPHandle&> *c,
+    uint64_t cost) : pg(pg), c(c), cost(cost) {}
   void finish(int) override {
-    pg->schedule_recovery_work(c.release());
+    pg->schedule_recovery_work(c.release(), cost);
   }
 };
 }
@@ -817,8 +819,11 @@ struct C_ReplicatedBackend_OnPullComplete : GenContext<ThreadPool::TPHandle&> {
   ReplicatedBackend *bc;
   list<ReplicatedBackend::pull_complete_info> to_continue;
   int priority;
-  C_ReplicatedBackend_OnPullComplete(ReplicatedBackend *bc, int priority)
-    : bc(bc), priority(priority) {}
+  C_ReplicatedBackend_OnPullComplete(
+    ReplicatedBackend *bc,
+    int priority,
+    list<ReplicatedBackend::pull_complete_info> &&to_continue)
+    : bc(bc), to_continue(std::move(to_continue)), priority(priority) {}
 
   void finish(ThreadPool::TPHandle &handle) override {
     ReplicatedBackend::RPGHandle *h = bc->_open_recovery_op();
@@ -841,6 +846,15 @@ struct C_ReplicatedBackend_OnPullComplete : GenContext<ThreadPool::TPHandle&> {
     }
     bc->run_recovery_op(h, priority);
   }
+
+  /// Estimate total data reads required to perform pushes
+  uint64_t estimate_push_costs() const {
+    uint64_t cost = 0;
+    for (const auto &i: to_continue) {
+      cost += i.stat.num_bytes_recovered;
+    }
+    return cost;
+  }
 };
 
 void ReplicatedBackend::_do_pull_response(OpRequestRef op)
@@ -870,12 +884,13 @@ void ReplicatedBackend::_do_pull_response(OpRequestRef op)
     C_ReplicatedBackend_OnPullComplete *c =
       new C_ReplicatedBackend_OnPullComplete(
        this,
-       m->get_priority());
-    c->to_continue.swap(to_continue);
+       m->get_priority(),
+       std::move(to_continue));
     t.register_on_complete(
       new PG_RecoveryQueueAsync(
        get_parent(),
-       get_parent()->bless_unlocked_gencontext(c)));
+       get_parent()->bless_unlocked_gencontext(c),
+        std::max<uint64_t>(1, c->estimate_push_costs())));
   }
   replies.erase(replies.end() - 1);