From: Samuel Just
Date: Fri, 29 Mar 2019 22:36:55 +0000 (-0700)
Subject: osd/: move the backfill space reservation back into PG
X-Git-Tag: v15.1.0~2774^2~54
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=cabe5b52ccc08d9401967baaab244f540b56aba1;p=ceph-ci.git

osd/: move the backfill space reservation back into PG

PeeringState really only needs an interface for requesting it.

Signed-off-by: Samuel Just
---

diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 52661e59300..06d2da98647 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -4082,34 +4082,96 @@ void PG::handle_scrub_reserve_release(OpRequestRef op)
   clear_scrub_reserved();
 }
 
+// Compute pending backfill data
+static int64_t pending_backfill(CephContext *cct, int64_t bf_bytes, int64_t local_bytes)
+{
+  lgeneric_dout(cct, 20) << __func__ << " Adjust local usage "
+                         << (local_bytes >> 10) << "KiB"
+                         << " primary usage " << (bf_bytes >> 10)
+                         << "KiB" << dendl;
+
+  return std::max((int64_t)0, bf_bytes - local_bytes);
+}
+
+
 // We can zero the value of primary num_bytes as just an atomic.
 // However, setting above zero reserves space for backfill and requires
 // the OSDService::stat_lock which protects all OSD usage
-void PG::set_reserved_num_bytes(int64_t primary, int64_t local) {
-  ceph_assert(osd->stat_lock.is_locked_by_me());
-  primary_num_bytes.store(primary);
-  local_num_bytes.store(local);
-  return;
+bool PG::try_reserve_recovery_space(
+  int64_t primary_bytes, int64_t local_bytes) {
+  // Use tentative_backfill_full() to make sure enough
+  // space is available to handle target bytes from primary.
+
+  // TODO: If we passed num_objects from primary we could account for
+  // an estimate of the metadata overhead.
+
+  // TODO: If we had compressed_allocated and compressed_original from primary
+  // we could compute compression ratio and adjust accordingly.
+
+  // XXX: There is no way to get omap overhead and this would only apply
+  // to whatever possibly different partition that is storing the database.
+
+  // update_osd_stat() from heartbeat will do this on a new
+  // statfs using ps->primary_bytes.
+  uint64_t pending_adjustment = 0;
+  if (primary_bytes) {
+    // For an erasure coded pool, overestimate by a full stripe per object
+    // because we don't know how each object is rounded to the nearest stripe
+    if (pool.info.is_erasure()) {
+      primary_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();
+      primary_bytes += get_pgbackend()->get_ec_stripe_chunk_size() *
+        info.stats.stats.sum.num_objects;
+      local_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();
+      local_bytes += get_pgbackend()->get_ec_stripe_chunk_size() *
+        info.stats.stats.sum.num_objects;
+    }
+    pending_adjustment = pending_backfill(
+      cct,
+      primary_bytes,
+      local_bytes);
+    dout(10) << __func__ << " primary_bytes " << (primary_bytes >> 10)
+             << "KiB"
+             << " local " << (local_bytes >> 10) << "KiB"
+             << " pending_adjustment " << (pending_adjustment >> 10) << "KiB"
+             << dendl;
+  }
+
+  // This lock protects not only the stats OSDService but also setting the
+  // pg primary_bytes.  That's why we don't immediately unlock
+  Mutex::Locker l(osd->stat_lock);
+  osd_stat_t cur_stat = osd->osd_stat;
+  if (cct->_conf->osd_debug_reject_backfill_probability > 0 &&
+      (rand()%1000 < (cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
+    dout(10) << "backfill reservation rejected: failure injection"
+             << dendl;
+    return false;
+  } else if (!cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
+             osd->tentative_backfill_full(this, pending_adjustment, cur_stat)) {
+    dout(10) << "backfill reservation rejected: backfill full"
+             << dendl;
+    return false;
+  } else {
+    // Don't reserve space if skipped reservation check, this is used
+    // to test the other backfill full check AND in case a corruption
+    // of num_bytes requires ignoring that value and trying the
+    // backfill anyway.
+    if (primary_bytes &&
+        !cct->_conf->osd_debug_skip_full_check_in_backfill_reservation) {
+      primary_num_bytes.store(primary_bytes);
+      local_num_bytes.store(local_bytes);
+    } else {
+      unreserve_recovery_space();
+    }
+    return true;
+  }
 }
 
-void PG::clear_reserved_num_bytes() {
+void PG::unreserve_recovery_space() {
   primary_num_bytes.store(0);
   local_num_bytes.store(0);
   return;
 }
 
-void PG::reject_reservation()
-{
-  clear_reserved_num_bytes();
-  osd->send_message_osd_cluster(
-    primary.osd,
-    new MBackfillReserve(
-      MBackfillReserve::REJECT,
-      spg_t(info.pgid.pgid, primary.shard),
-      get_osdmap_epoch()),
-    get_osdmap_epoch());
-}
-
 void PG::clear_scrub_reserved()
 {
   scrubber.reserved_peers.clear();
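The is_erasure() branch of try_reserve_recovery_space() above divides the logical byte count by the number of data chunks to get one shard's share, then pads by a full stripe chunk per object because each object's rounding to a stripe boundary is unknown. A minimal stand-alone sketch of the same arithmetic, with made-up inputs (k=4 data chunks, a 64 KiB stripe chunk, 1000 objects; illustrative values, not from this commit):

#include <cstdint>
#include <iostream>

// Same adjustment as the is_erasure() branch above, in isolation.
int64_t ec_adjust(int64_t bytes, int64_t data_chunks,
                  int64_t stripe_chunk_size, int64_t num_objects)
{
  bytes /= data_chunks;                      // one shard's share of the bytes
  bytes += stripe_chunk_size * num_objects;  // overestimate: one chunk per object
  return bytes;
}

int main()
{
  int64_t primary_bytes = 10LL << 30;        // primary reports 10 GiB
  int64_t adjusted = ec_adjust(primary_bytes, 4, 64 << 10, 1000);
  std::cout << (adjusted >> 20) << " MiB\n"; // 2560 MiB share + 62.5 MiB pad = 2622 MiB
  return 0;
}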
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 76188e151c5..785a742436a 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -789,8 +789,8 @@ public:
     return primary_num_bytes.load() > 0;
   }
 
-  void set_reserved_num_bytes(int64_t primary, int64_t local);
-  void clear_reserved_num_bytes();
+  bool try_reserve_recovery_space(int64_t primary, int64_t local) override;
+  void unreserve_recovery_space() override;
 
   // If num_bytes are inconsistent and local_num- goes negative
   // it's ok, because it would then be ignored.
@@ -1429,8 +1429,6 @@ protected:
   void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from);
   void handle_scrub_reserve_release(OpRequestRef op);
 
-  void reject_reservation();
-
   // -- recovery state --
 
   struct QueuePeeringEvt : Context {
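The pair of declarations above carries a small contract: try_reserve_recovery_space() either records the reservation and returns true, or leaves nothing reserved and returns false; unreserve_recovery_space() zeroes the accounting and is safe to call on any exit path. A minimal stand-alone sketch of that contract, assuming a hypothetical capacity field in place of the real tentative_backfill_full() check (only the two method names come from this commit):

#include <atomic>
#include <cstdint>

// Hypothetical holder type; only the two method names below are from the commit.
struct RecoverySpaceHolder {
  std::atomic<int64_t> primary_num_bytes{0};
  std::atomic<int64_t> local_num_bytes{0};
  int64_t capacity;  // stand-in for the free-space state the real check consults

  explicit RecoverySpaceHolder(int64_t cap) : capacity(cap) {}

  // Either reserve and return true, or leave nothing reserved and return false.
  bool try_reserve_recovery_space(int64_t primary, int64_t local) {
    if (primary - local > capacity)  // simplified fullness check
      return false;
    primary_num_bytes.store(primary);
    local_num_bytes.store(local);
    return true;
  }

  // Idempotent: safe on every exit path of the state machine.
  void unreserve_recovery_space() {
    primary_num_bytes.store(0);
    local_num_bytes.store(0);
  }
};

int main()
{
  RecoverySpaceHolder h(1LL << 30);                         // 1 GiB free
  if (h.try_reserve_recovery_space(3LL << 30, 1LL << 30))   // 2 GiB pending: rejected
    h.unreserve_recovery_space();
  return 0;
}

The override specifiers compile because the same two signatures are declared as pure virtuals on the listener interface (see the PeeringState.h hunk below).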
diff --git a/src/osd/PeeringState.cc b/src/osd/PeeringState.cc
index 4829edd7143..4d3451f18aa 100644
--- a/src/osd/PeeringState.cc
+++ b/src/osd/PeeringState.cc
@@ -1115,6 +1115,20 @@ bool PeeringState::all_unfound_are_queried_or_lost(
 }
 
+void PeeringState::reject_reservation()
+{
+  pl->unreserve_recovery_space();
+  pl->send_cluster_message(
+    primary.osd,
+    new MBackfillReserve(
+      MBackfillReserve::REJECT,
+      spg_t(info.pgid.pgid, primary.shard),
+      get_osdmap_epoch()),
+    get_osdmap_epoch());
+}
+
+
+
 /*------------ Peering State Machine----------------*/
 #undef dout_prefix
 #define dout_prefix (context< PeeringMachine >().dpp->gen_prefix(*_dout) \
@@ -1860,7 +1874,7 @@ boost::statechart::result
 PeeringState::RepNotRecovering::react(const RejectRemoteReservation &evt)
 {
   DECLARE_LOCALS
-  pg->reject_reservation();
+  ps->reject_reservation();
   post_event(RemoteReservationRejected());
   return discard_event();
 }
@@ -1900,7 +1914,7 @@ PeeringState::RepWaitRecoveryReserved::react(
   const RemoteReservationCanceled &evt)
 {
   DECLARE_LOCALS
-  pg->clear_reserved_num_bytes();
+  pl->unreserve_recovery_space();
   pl->cancel_remote_recovery_reservation();
   return transit<RepNotRecovering>();
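The new PeeringState::reject_reservation() above keeps the protocol reply (the REJECT message to the primary) in peering code while routing space accounting through the pl listener, which is the point of the commit. A stand-alone sketch of that split with hypothetical simplified types; only unreserve_recovery_space() and reject_reservation() are names from this commit:

#include <iostream>

struct Listener {
  virtual void unreserve_recovery_space() = 0;
  virtual void send_reject_to_primary() = 0;  // stands in for send_cluster_message()
  virtual ~Listener() = default;
};

struct PrintListener : Listener {
  void unreserve_recovery_space() override { std::cout << "unreserved\n"; }
  void send_reject_to_primary() override   { std::cout << "REJECT sent\n"; }
};

struct PeeringCore {
  Listener *pl;
  void reject_reservation() {
    pl->unreserve_recovery_space();  // accounting lives behind the listener
    pl->send_reject_to_primary();    // protocol reply stays with peering logic
  }
};

int main()
{
  PrintListener l;
  PeeringCore core{&l};
  core.reject_reservation();
  return 0;
}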
@@ -1922,77 +1936,16 @@ PeeringState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
   context< PeeringMachine >().log_enter(state_name);
 }
 
-// Compute pending backfill data
-static int64_t pending_backfill(CephContext *cct, int64_t bf_bytes, int64_t local_bytes)
-{
-  lgeneric_dout(cct, 20) << __func__ << " Adjust local usage " << (local_bytes >> 10) << "KiB"
-                         << " primary usage " << (bf_bytes >> 10) << "KiB" << dendl;
-  return std::max((int64_t)0, bf_bytes - local_bytes);
-}
-
 boost::statechart::result
 PeeringState::RepNotRecovering::react(const RequestBackfillPrio &evt)
 {
   DECLARE_LOCALS
 
-  // Use tentative_bacfill_full() to make sure enough
-  // space is available to handle target bytes from primary.
-
-  // TODO: If we passed num_objects from primary we could account for
-  // an estimate of the metadata overhead.
-
-  // TODO: If we had compressed_allocated and compressed_original from primary
-  // we could compute compression ratio and adjust accordingly.
-
-  // XXX: There is no way to get omap overhead and this would only apply
-  // to whatever possibly different partition that is storing the database.
-  // update_osd_stat() from heartbeat will do this on a new
-  // statfs using ps->primary_num_bytes.
-  uint64_t pending_adjustment = 0;
-  int64_t primary_num_bytes = evt.primary_num_bytes;
-  int64_t local_num_bytes = evt.local_num_bytes;
-  if (primary_num_bytes) {
-    // For erasure coded pool overestimate by a full stripe per object
-    // because we don't know how each objected rounded to the nearest stripe
-    if (ps->pool.info.is_erasure()) {
-      primary_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
-      primary_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * ps->info.stats.stats.sum.num_objects;
-      local_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
-      local_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * ps->info.stats.stats.sum.num_objects;
-    }
-    pending_adjustment = pending_backfill(
-      context< PeeringMachine >().cct,
-      primary_num_bytes,
-      local_num_bytes);
-    psdout(10) << __func__ << " primary_num_bytes " << (primary_num_bytes >> 10) << "KiB"
-               << " local " << (local_num_bytes >> 10) << "KiB"
-               << " pending_adjustments " << (pending_adjustment >> 10) << "KiB"
-               << dendl;
-  }
-  // This lock protects not only the stats OSDService but also setting the pg primary_num_bytes
-  // That's why we don't immediately unlock
-  Mutex::Locker l(pg->osd->stat_lock);
-  osd_stat_t cur_stat = pg->osd->osd_stat;
-  if (ps->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
-      (rand()%1000 < (ps->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
-    psdout(10) << "backfill reservation rejected: failure injection"
-               << dendl;
-    post_event(RejectRemoteReservation());
-  } else if (!ps->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
-             pg->osd->tentative_backfill_full(pg, pending_adjustment, cur_stat)) {
-    psdout(10) << "backfill reservation rejected: backfill full"
-               << dendl;
+  if (!pl->try_reserve_recovery_space(
+        evt.primary_num_bytes, evt.local_num_bytes)) {
     post_event(RejectRemoteReservation());
   } else {
-    // Don't reserve space if skipped reservation check, this is used
-    // to test the other backfill full check AND in case a corruption
-    // of num_bytes requires ignoring that value and trying the
-    // backfill anyway.
-    if (primary_num_bytes && !ps->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation)
-      pg->set_reserved_num_bytes(primary_num_bytes, local_num_bytes);
-    else
-      pg->clear_reserved_num_bytes();
     // Use un-ec-adjusted bytes for stats.
     ps->info.stats.stats.sum.num_bytes = evt.local_num_bytes;
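The block removed above is the logic that moved into PG::try_reserve_recovery_space(). Its core is the clamp in pending_backfill(): only the bytes the primary holds beyond what is already stored locally count against free space, and stale stats can never produce a negative reservation. A stand-alone restatement with made-up sizes:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Same clamp as pending_backfill() above, minus the logging.
static int64_t pending_backfill(int64_t bf_bytes, int64_t local_bytes)
{
  return std::max<int64_t>(0, bf_bytes - local_bytes);
}

int main()
{
  // Replica already holds 6 GiB of an 8 GiB PG: 2 GiB still pending.
  assert(pending_backfill(8LL << 30, 6LL << 30) == (2LL << 30));
  // Local copy larger than the primary's (stale stats): clamped to zero,
  // never a negative reservation.
  assert(pending_backfill(4LL << 30, 6LL << 30) == 0);
  return 0;
}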
@@ -2072,7 +2025,7 @@ PeeringState::RepWaitBackfillReserved::react(
   const RejectRemoteReservation &evt)
 {
   DECLARE_LOCALS
-  pg->reject_reservation();
+  ps->reject_reservation();
   post_event(RemoteReservationRejected());
   return discard_event();
 }
@@ -2082,7 +2035,7 @@ PeeringState::RepWaitBackfillReserved::react(
   const RemoteReservationRejected &evt)
 {
   DECLARE_LOCALS
-  pg->clear_reserved_num_bytes();
+  pl->unreserve_recovery_space();
   pl->cancel_remote_recovery_reservation();
   return transit<RepNotRecovering>();
@@ -2093,7 +2046,7 @@ PeeringState::RepWaitBackfillReserved::react(
   const RemoteReservationCanceled &evt)
 {
   DECLARE_LOCALS
-  pg->clear_reserved_num_bytes();
+  pl->unreserve_recovery_space();
   pl->cancel_remote_recovery_reservation();
   return transit<RepNotRecovering>();
@@ -2113,7 +2066,7 @@ PeeringState::RepRecovering::react(const RemoteRecoveryPreempted &)
 {
   DECLARE_LOCALS
 
-  pg->clear_reserved_num_bytes();
+  pl->unreserve_recovery_space();
   pl->send_cluster_message(
     ps->primary.osd,
     new MRecoveryReserve(
@@ -2130,7 +2083,7 @@ PeeringState::RepRecovering::react(const BackfillTooFull &)
 {
   DECLARE_LOCALS
 
-  pg->clear_reserved_num_bytes();
+  pl->unreserve_recovery_space();
   pl->send_cluster_message(
     ps->primary.osd,
     new MBackfillReserve(
@@ -2147,7 +2100,7 @@ PeeringState::RepRecovering::react(const RemoteBackfillPreempted &)
 {
   DECLARE_LOCALS
 
-  pg->clear_reserved_num_bytes();
+  pl->unreserve_recovery_space();
   pl->send_cluster_message(
     ps->primary.osd,
     new MBackfillReserve(
@@ -2162,7 +2115,7 @@ void PeeringState::RepRecovering::exit()
 {
   context< PeeringMachine >().log_exit(state_name, enter_time);
   DECLARE_LOCALS
-  pg->clear_reserved_num_bytes();
+  pl->unreserve_recovery_space();
   pl->cancel_remote_recovery_reservation();
 
   utime_t dur = ceph_clock_now() - enter_time;
@@ -2894,7 +2847,7 @@ void PeeringState::ReplicaActive::exit()
 {
   context< PeeringMachine >().log_exit(state_name, enter_time);
   DECLARE_LOCALS
-  pg->clear_reserved_num_bytes();
+  pl->unreserve_recovery_space();
   pl->cancel_remote_recovery_reservation();
 
   utime_t dur = ceph_clock_now() - enter_time;
diff --git a/src/osd/PeeringState.h b/src/osd/PeeringState.h
index b89f1f6780a..18e30ac8fa1 100644
--- a/src/osd/PeeringState.h
+++ b/src/osd/PeeringState.h
@@ -131,6 +131,11 @@ public:
   virtual void on_backfill_canceled() = 0;
   virtual void on_recovery_reserved() = 0;
 
+  // recovery space accounting
+  virtual bool try_reserve_recovery_space(
+    int64_t primary_num_bytes, int64_t local_num_bytes) = 0;
+  virtual void unreserve_recovery_space() = 0;
+
   virtual epoch_t oldest_stored_osdmap() = 0;
   virtual LogChannel &get_clog() = 0;
 
@@ -1249,6 +1254,8 @@ public:
   bool adjust_need_up_thru(const OSDMapRef osdmap);
   PastIntervals::PriorSet build_prior();
 
+  void reject_reservation();
+
 public:
   PeeringState(
     CephContext *cct,
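For reference, the osd_debug_reject_backfill_probability gate retained in PG::try_reserve_recovery_space() (first hunk above) reduces to a plain probability check: a uniform draw in [0,1000) compared against p*1000. A stand-alone sketch; the helper name is hypothetical, and rand() is kept only to mirror the committed code:

#include <cstdlib>
#include <iostream>

// Reject with probability `probability` (0.0 disables injection entirely).
static bool inject_reject(double probability)
{
  return probability > 0 &&
    (rand() % 1000 < probability * 1000.0);
}

int main()
{
  srand(42);
  int rejected = 0;
  for (int i = 0; i < 100000; ++i)
    rejected += inject_reject(0.25);  // expect roughly 25% rejections
  std::cout << rejected << " rejected of 100000\n";
  return 0;
}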