osd: update OSDService::queue_recovery_context to specify cost
author    Sridhar Seshasayee <sseshasa@redhat.com>
          Fri, 3 Feb 2023 12:17:38 +0000 (17:47 +0530)
committer Sridhar Seshasayee <sseshasa@redhat.com>
          Mon, 8 May 2023 09:16:25 +0000 (14:46 +0530)
Previously, we always queued recovery contexts with a fixed cost of
osd_recovery_cost, which defaults to 20M. With mclock, this caused these
items to be heavily delayed. Instead, base the cost on the operation
being queued.

Fixes: https://tracker.ceph.com/issues/58606
Signed-off-by: Samuel Just <sjust@redhat.com>
Signed-off-by: Sridhar Seshasayee <sseshasa@redhat.com>
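
For illustration only (not part of the commit): a compilable sketch that
mirrors the cost selection introduced below in OSD.cc and
ReplicatedBackend.cc. The helper names (recovery_queue_cost,
estimate_push_cost, bytes_per_object) are hypothetical stand-ins, not
identifiers from the tree.

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <string>
#include <vector>

// Mirrors the cost_for_queue lambda added to OSDService::queue_recovery_context:
// under mclock_scheduler the caller-supplied estimate is used as-is, while the
// WeightedPriorityQueue path keeps the legacy fixed osd_recovery_cost (default 20M).
uint64_t recovery_queue_cost(const std::string &op_queue,
                             uint64_t caller_cost,
                             uint64_t osd_recovery_cost) {
  return op_queue == "mclock_scheduler" ? caller_cost : osd_recovery_cost;
}

// Mirrors C_ReplicatedBackend_OnPullComplete::estimate_push_costs() together
// with the std::max<uint64_t>(1, ...) floor applied at its call site: the cost
// is the total bytes expected to be pushed, never less than 1.
uint64_t estimate_push_cost(const std::vector<uint64_t> &bytes_per_object) {
  uint64_t sum = std::accumulate(bytes_per_object.begin(),
                                 bytes_per_object.end(), uint64_t(0));
  return std::max<uint64_t>(1, sum);
}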
src/osd/ECBackend.cc
src/osd/OSD.cc
src/osd/OSD.h
src/osd/PGBackend.h
src/osd/PrimaryLogPG.cc
src/osd/PrimaryLogPG.h
src/osd/ReplicatedBackend.cc

index b061e0f69ca14160bff2947ffb7bad2740b17e1c..dc4fa3d0558c626c7a298f6ca5e555a3bbed9f44 100644 (file)
@@ -1433,9 +1433,25 @@ void ECBackend::filter_read_op(
   }
 
   if (op.in_progress.empty()) {
+    /* This case is odd.  filter_read_op gets called while processing
+     * an OSDMap.  Normal, non-recovery reads only happen from acting
+     * set osds.  For this op to have had a read source go down and
+     * there not be an interval change, it must be part of a pull during
+     * log-based recovery.
+     *
+     * This callback delays calling complete_read_op until later to avoid
+     * dealing with recovery while handling an OSDMap.  We assign a
+     * cost here of 1 because:
+     * 1) This should be very rare, and the operation itself was already
+     *    throttled.
+     * 2) It shouldn't result in IO, rather it should result in restarting
+     *    the pull on the affected objects and pushes from in-memory buffers
+     *    on any now complete unaffected objects.
+     */
     get_parent()->schedule_recovery_work(
       get_parent()->bless_unlocked_gencontext(
-       new FinishReadOp(this, op.tid)));
+        new FinishReadOp(this, op.tid)),
+      1);
   }
 }
 
index aed5057b455970b7c79b8f9a8d04352f54b489c8..d3cd2a1163250f92add3fc8f1fee463eefe64dc6 100644 (file)
@@ -1639,14 +1639,29 @@ void OSDService::enqueue_front(OpSchedulerItem&& qi)
 
 void OSDService::queue_recovery_context(
   PG *pg,
-  GenContext<ThreadPool::TPHandle&> *c)
+  GenContext<ThreadPool::TPHandle&> *c,
+  uint64_t cost)
 {
   epoch_t e = get_osdmap_epoch();
+
+  uint64_t cost_for_queue = [this, cost] {
+    if (cct->_conf->osd_op_queue == "mclock_scheduler") {
+      return cost;
+    } else {
+      /* We retain this legacy behavior for WeightedPriorityQueue. It seems to
+       * require very large costs for several messages in order to do any
+       * meaningful amount of throttling.  This branch should be removed after
+       * Reef.
+       */
+      return cct->_conf->osd_recovery_cost;
+    }
+  }();
+
   enqueue_back(
     OpSchedulerItem(
       unique_ptr<OpSchedulerItem::OpQueueable>(
        new PGRecoveryContext(pg->get_pgid(), c, e)),
-      cct->_conf->osd_recovery_cost,
+      cost_for_queue,
       cct->_conf->osd_recovery_priority,
       ceph_clock_now(),
       0,
index 18946f1e967829300e17250e7231a5c10bc15945..8890c700e17adb0244c86ea59eb88e58506aa62d 100644 (file)
@@ -536,7 +536,9 @@ public:
   void send_pg_created();
 
   AsyncReserver<spg_t, Finisher> snap_reserver;
-  void queue_recovery_context(PG *pg, GenContext<ThreadPool::TPHandle&> *c);
+  void queue_recovery_context(PG *pg,
+                              GenContext<ThreadPool::TPHandle&> *c,
+                              uint64_t cost);
   void queue_for_snap_trim(PG *pg);
   void queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority);
 
index 34d74025173e0239d4364bc226244da5add91d5a..400b00347198d2466f067af8c31973b91d497372 100644 (file)
@@ -260,7 +260,8 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
        const pg_stat_t &stat) = 0;
 
      virtual void schedule_recovery_work(
-       GenContext<ThreadPool::TPHandle&> *c) = 0;
+       GenContext<ThreadPool::TPHandle&> *c,
+       uint64_t cost) = 0;
 
      virtual pg_shard_t whoami_shard() const = 0;
      int whoami() const {
index c86f40bb392df19e066b8e78967eb08c3a6261d7..bc2c4e0ca84dc3c11810e74047988d16c9971e35 100644 (file)
@@ -525,9 +525,10 @@ void PrimaryLogPG::on_global_recover(
 }
 
 void PrimaryLogPG::schedule_recovery_work(
-  GenContext<ThreadPool::TPHandle&> *c)
+  GenContext<ThreadPool::TPHandle&> *c,
+  uint64_t cost)
 {
-  osd->queue_recovery_context(this, c);
+  osd->queue_recovery_context(this, c, cost);
 }
 
 void PrimaryLogPG::replica_clear_repop_obc(
index 1a2c1149c7b75c97c7edf780618c83c055575bb4..14b9aeb25d70bdaf6d68c1bf138e5052c533c0a1 100644 (file)
@@ -545,7 +545,8 @@ public:
   }
 
   void schedule_recovery_work(
-    GenContext<ThreadPool::TPHandle&> *c) override;
+    GenContext<ThreadPool::TPHandle&> *c,
+    uint64_t cost) override;
 
   pg_shard_t whoami_shard() const override {
     return pg_whoami;
index 50a6c60d7f0f5f12ee350dd66b89332415ab7879..718f526c4ce76ea591094efa8851ed123a16cb0b 100644 (file)
@@ -68,12 +68,14 @@ class PG_SendMessageOnConn: public Context {
 class PG_RecoveryQueueAsync : public Context {
   PGBackend::Listener *pg;
   unique_ptr<GenContext<ThreadPool::TPHandle&>> c;
+  uint64_t cost;
   public:
   PG_RecoveryQueueAsync(
     PGBackend::Listener *pg,
-    GenContext<ThreadPool::TPHandle&> *c) : pg(pg), c(c) {}
+    GenContext<ThreadPool::TPHandle&> *c,
+    uint64_t cost) : pg(pg), c(c), cost(cost) {}
   void finish(int) override {
-    pg->schedule_recovery_work(c.release());
+    pg->schedule_recovery_work(c.release(), cost);
   }
 };
 }
@@ -819,8 +821,11 @@ struct C_ReplicatedBackend_OnPullComplete : GenContext<ThreadPool::TPHandle&> {
   ReplicatedBackend *bc;
   list<ReplicatedBackend::pull_complete_info> to_continue;
   int priority;
-  C_ReplicatedBackend_OnPullComplete(ReplicatedBackend *bc, int priority)
-    : bc(bc), priority(priority) {}
+  C_ReplicatedBackend_OnPullComplete(
+    ReplicatedBackend *bc,
+    int priority,
+    list<ReplicatedBackend::pull_complete_info> &&to_continue)
+    : bc(bc), to_continue(std::move(to_continue)), priority(priority) {}
 
   void finish(ThreadPool::TPHandle &handle) override {
     ReplicatedBackend::RPGHandle *h = bc->_open_recovery_op();
@@ -843,6 +848,15 @@ struct C_ReplicatedBackend_OnPullComplete : GenContext<ThreadPool::TPHandle&> {
     }
     bc->run_recovery_op(h, priority);
   }
+
+  /// Estimate total data reads required to perform pushes
+  uint64_t estimate_push_costs() const {
+    uint64_t cost = 0;
+    for (const auto &i: to_continue) {
+      cost += i.stat.num_bytes_recovered;
+    }
+    return cost;
+  }
 };
 
 void ReplicatedBackend::_do_pull_response(OpRequestRef op)
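
Illustrative arithmetic only (not part of the commit): if to_continue holds
two completed pulls whose stat.num_bytes_recovered are 4 MiB and 8 MiB,
estimate_push_costs() returns 12 MiB; the std::max<uint64_t>(1, ...) at the
call site in the next hunk only matters when the total would otherwise be 0.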
@@ -872,12 +886,13 @@ void ReplicatedBackend::_do_pull_response(OpRequestRef op)
     C_ReplicatedBackend_OnPullComplete *c =
       new C_ReplicatedBackend_OnPullComplete(
        this,
-       m->get_priority());
-    c->to_continue.swap(to_continue);
+       m->get_priority(),
+       std::move(to_continue));
     t.register_on_complete(
       new PG_RecoveryQueueAsync(
        get_parent(),
-       get_parent()->bless_unlocked_gencontext(c)));
+       get_parent()->bless_unlocked_gencontext(c),
+        std::max<uint64_t>(1, c->estimate_push_costs())));
   }
   replies.erase(replies.end() - 1);