osd/: move the backfill space reservation back into PG
author    Samuel Just <sjust@redhat.com>
          Fri, 29 Mar 2019 22:36:55 +0000 (15:36 -0700)
committer sjust@redhat.com <sjust@redhat.com>
          Wed, 1 May 2019 18:22:21 +0000 (11:22 -0700)
PeeringState really only needs an interface for requesting the reservation.

Signed-off-by: Samuel Just <sjust@redhat.com>
src/osd/PG.cc
src/osd/PG.h
src/osd/PeeringState.cc
src/osd/PeeringState.h

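In outline, the commit pulls the space-reservation bookkeeping back into PG and leaves PeeringState with nothing but a pure-virtual pair on its listener. A minimal sketch of that seam, using the names from the diff below (the real classes carry many more members, elided here):

    #include <cstdint>

    // Sketch of the listener seam this commit introduces: PG implements
    // these two methods, PeeringState only calls them.
    struct PeeringListenerSketch {
      // Ask the implementation to reserve backfill space; false means the
      // remote reservation must be rejected.
      virtual bool try_reserve_recovery_space(
        int64_t primary_num_bytes, int64_t local_num_bytes) = 0;
      // Drop whatever was reserved above.
      virtual void unreserve_recovery_space() = 0;
      virtual ~PeeringListenerSketch() {}
    };
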
index 52661e593008cde869f93ceb34edf08fdb805ca4..06d2da9864755e90a7fe54a8e75afe4961e85673 100644 (file)
@@ -4082,34 +4082,96 @@ void PG::handle_scrub_reserve_release(OpRequestRef op)
   clear_scrub_reserved();
 }
 
+// Compute pending backfill data
+static int64_t pending_backfill(CephContext *cct, int64_t bf_bytes, int64_t local_bytes)
+{
+  lgeneric_dout(cct, 20) << __func__ << " Adjust local usage "
+                        << (local_bytes >> 10) << "KiB"
+                        << " primary usage " << (bf_bytes >> 10)
+                        << "KiB" << dendl;
+
+  return std::max((int64_t)0, bf_bytes - local_bytes);
+}
+
+
 // We can zero the value of primary_num_bytes with just an atomic store.
 // However, setting it above zero reserves space for backfill and requires
 // the OSDService::stat_lock, which protects all OSD usage.
-void PG::set_reserved_num_bytes(int64_t primary, int64_t local) {
-  ceph_assert(osd->stat_lock.is_locked_by_me());
-  primary_num_bytes.store(primary);
-  local_num_bytes.store(local);
-  return;
+bool PG::try_reserve_recovery_space(
+  int64_t primary_bytes, int64_t local_bytes) {
+  // Use tentative_backfill_full() to make sure enough
+  // space is available to handle target bytes from primary.
+
+  // TODO: If we passed num_objects from primary we could account for
+  // an estimate of the metadata overhead.
+
+  // TODO: If we had compressed_allocated and compressed_original from primary
+  // we could compute compression ratio and adjust accordingly.
+
+  // XXX: There is no way to get omap overhead and this would only apply
+  // to whatever possibly different partition that is storing the database.
+
+  // update_osd_stat() from heartbeat will do this on a new
+  // statfs using primary_num_bytes.
+  uint64_t pending_adjustment = 0;
+  if (primary_bytes) {
+    // For an erasure-coded pool, overestimate by a full stripe per object
+    // because we don't know how each object rounded to the nearest stripe
+    if (pool.info.is_erasure()) {
+      primary_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();
+      primary_bytes += get_pgbackend()->get_ec_stripe_chunk_size() *
+       info.stats.stats.sum.num_objects;
+      local_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();
+      local_bytes += get_pgbackend()->get_ec_stripe_chunk_size() *
+       info.stats.stats.sum.num_objects;
+    }
+    pending_adjustment = pending_backfill(
+      cct,
+      primary_bytes,
+      local_bytes);
+  dout(10) << __func__ << " primary_bytes " << (primary_bytes >> 10)
+            << "KiB"
+            << " local " << (local_bytes >> 10) << "KiB"
+            << " pending_adjustment " << (pending_adjustment >> 10) << "KiB"
+            << dendl;
+  }
+
+  // This lock protects not only the OSDService stats but also the setting
+  // of the pg's primary_num_bytes.  That's why we don't immediately unlock.
+  Mutex::Locker l(osd->stat_lock);
+  osd_stat_t cur_stat = osd->osd_stat;
+  if (cct->_conf->osd_debug_reject_backfill_probability > 0 &&
+      (rand()%1000 < (cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
+    dout(10) << "backfill reservation rejected: failure injection"
+            << dendl;
+    return false;
+  } else if (!cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
+      osd->tentative_backfill_full(this, pending_adjustment, cur_stat)) {
+    dout(10) << "backfill reservation rejected: backfill full"
+            << dendl;
+    return false;
+  } else {
+    // Don't reserve space if the reservation check was skipped; this is
+    // used to test the other backfill-full check AND in case corruption
+    // of num_bytes requires ignoring that value and trying the backfill
+    // anyway.
+    if (primary_bytes &&
+       !cct->_conf->osd_debug_skip_full_check_in_backfill_reservation) {
+      primary_num_bytes.store(primary_bytes);
+      local_num_bytes.store(local_bytes);
+    } else {
+      unreserve_recovery_space();
+    }
+    return true;
+  }
 }
 
-void PG::clear_reserved_num_bytes() {
+void PG::unreserve_recovery_space() {
   primary_num_bytes.store(0);
   local_num_bytes.store(0);
   return;
 }
 
-void PG::reject_reservation()
-{
-  clear_reserved_num_bytes();
-  osd->send_message_osd_cluster(
-    primary.osd,
-    new MBackfillReserve(
-      MBackfillReserve::REJECT,
-      spg_t(info.pgid.pgid, primary.shard),
-      get_osdmap_epoch()),
-    get_osdmap_epoch());
-}
-
 void PG::clear_scrub_reserved()
 {
   scrubber.reserved_peers.clear();
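The arithmetic in try_reserve_recovery_space() is easy to check by hand. A standalone sketch with illustrative numbers (the chunk count, chunk size, and byte totals are made-up examples, not values from a real cluster):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    // Mirrors the EC overestimation above: divide by the data chunk count,
    // then add one full stripe chunk per object, since we don't know how
    // each object rounded to the nearest stripe.
    int64_t adjust_for_ec(int64_t bytes, int64_t data_chunks,
                          int64_t stripe_chunk_size, int64_t num_objects) {
      return bytes / data_chunks + stripe_chunk_size * num_objects;
    }

    int main() {
      const int64_t k = 4, chunk = 4096, objects = 1000;  // assumed values
      int64_t primary = adjust_for_ec(100 << 20, k, chunk, objects);
      int64_t local = adjust_for_ec(40 << 20, k, chunk, objects);
      // pending_backfill(): only the shortfall counts, never negative.
      int64_t pending = std::max<int64_t>(0, primary - local);
      std::cout << (pending >> 10) << "KiB pending" << std::endl;  // 15360KiB
      return 0;
    }
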
index 76188e151c5a6dda7992cd77880286e252be3377..785a742436a360b902e465a3563ac3ddc40a1daf 100644 (file)
@@ -789,8 +789,8 @@ public:
     return primary_num_bytes.load() > 0;
   }
 
-  void set_reserved_num_bytes(int64_t primary, int64_t local);
-  void clear_reserved_num_bytes();
+  bool try_reserve_recovery_space(int64_t primary, int64_t local) override;
+  void unreserve_recovery_space() override;
 
  // If num_bytes are inconsistent and local_num_bytes goes negative
  // it's ok, because it would then be ignored.
@@ -1429,8 +1429,6 @@ protected:
   void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from);
   void handle_scrub_reserve_release(OpRequestRef op);
 
-  void reject_reservation();
-
   // -- recovery state --
 
   struct QueuePeeringEvt : Context {
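The locking rule spelled out in PG.cc (the counters may be zeroed as bare atomics but raised only under OSDService::stat_lock) reduces to a small pattern. A sketch, with std::mutex standing in for Ceph's Mutex; the struct is illustrative, not the PG class:

    #include <atomic>
    #include <cstdint>
    #include <mutex>

    struct ReservationCounters {
      std::mutex stat_lock;  // stands in for OSDService::stat_lock
      std::atomic<int64_t> primary_num_bytes{0};
      std::atomic<int64_t> local_num_bytes{0};

      // Clearing needs no lock; cf. PG::unreserve_recovery_space().
      void unreserve() {
        primary_num_bytes.store(0);
        local_num_bytes.store(0);
      }
      // Raising the counters happens under the same lock that guards
      // OSD-wide usage stats, so full-checks see a consistent view.
      void reserve(int64_t primary, int64_t local) {
        std::lock_guard<std::mutex> l(stat_lock);
        primary_num_bytes.store(primary);
        local_num_bytes.store(local);
      }
    };
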
index 4829edd7143fe5d4e0caf6041339af4b70487e9b..4d3451f18aab7b90f40644ef0345b2a4cca98434 100644 (file)
@@ -1115,6 +1115,20 @@ bool PeeringState::all_unfound_are_queried_or_lost(
 }
 
 
+void PeeringState::reject_reservation()
+{
+  pl->unreserve_recovery_space();
+  pl->send_cluster_message(
+    primary.osd,
+    new MBackfillReserve(
+      MBackfillReserve::REJECT,
+      spg_t(info.pgid.pgid, primary.shard),
+      get_osdmap_epoch()),
+    get_osdmap_epoch());
+}
+
+
+
 /*------------ Peering State Machine----------------*/
 #undef dout_prefix
 #define dout_prefix (context< PeeringMachine >().dpp->gen_prefix(*_dout) \
@@ -1860,7 +1874,7 @@ boost::statechart::result
 PeeringState::RepNotRecovering::react(const RejectRemoteReservation &evt)
 {
   DECLARE_LOCALS
-  pg->reject_reservation();
+  ps->reject_reservation();
   post_event(RemoteReservationRejected());
   return discard_event();
 }
@@ -1900,7 +1914,7 @@ PeeringState::RepWaitRecoveryReserved::react(
   const RemoteReservationCanceled &evt)
 {
   DECLARE_LOCALS
-  pg->clear_reserved_num_bytes();
+  pl->unreserve_recovery_space();
 
   pl->cancel_remote_recovery_reservation();
   return transit<RepNotRecovering>();
@@ -1922,77 +1936,16 @@ PeeringState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
   context< PeeringMachine >().log_enter(state_name);
 }
 
-// Compute pending backfill data
-static int64_t pending_backfill(CephContext *cct, int64_t bf_bytes, int64_t local_bytes)
-{
-    lgeneric_dout(cct, 20) << __func__ << " Adjust local usage " << (local_bytes >> 10) << "KiB"
-                              << " primary usage " << (bf_bytes >> 10) << "KiB" << dendl;
-    return std::max((int64_t)0, bf_bytes - local_bytes);
-}
-
 boost::statechart::result
 PeeringState::RepNotRecovering::react(const RequestBackfillPrio &evt)
 {
 
   DECLARE_LOCALS
-  // Use tentative_bacfill_full() to make sure enough
-  // space is available to handle target bytes from primary.
-
-  // TODO: If we passed num_objects from primary we could account for
-  // an estimate of the metadata overhead.
-
-  // TODO: If we had compressed_allocated and compressed_original from primary
-  // we could compute compression ratio and adjust accordingly.
-
-  // XXX: There is no way to get omap overhead and this would only apply
-  // to whatever possibly different partition that is storing the database.
 
-  // update_osd_stat() from heartbeat will do this on a new
-  // statfs using ps->primary_num_bytes.
-  uint64_t pending_adjustment = 0;
-  int64_t primary_num_bytes = evt.primary_num_bytes;
-  int64_t local_num_bytes = evt.local_num_bytes;
-  if (primary_num_bytes) {
-    // For erasure coded pool overestimate by a full stripe per object
-    // because we don't know how each objected rounded to the nearest stripe
-    if (ps->pool.info.is_erasure()) {
-      primary_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
-      primary_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * ps->info.stats.stats.sum.num_objects;
-      local_num_bytes /= (int)pg->get_pgbackend()->get_ec_data_chunk_count();
-      local_num_bytes += pg->get_pgbackend()->get_ec_stripe_chunk_size() * ps->info.stats.stats.sum.num_objects;
-    }
-    pending_adjustment = pending_backfill(
-      context< PeeringMachine >().cct,
-      primary_num_bytes,
-      local_num_bytes);
-    psdout(10) << __func__ << " primary_num_bytes " << (primary_num_bytes >> 10) << "KiB"
-                       << " local " << (local_num_bytes >> 10) << "KiB"
-                       << " pending_adjustments " << (pending_adjustment >> 10) << "KiB"
-                       << dendl;
-  }
-  // This lock protects not only the stats OSDService but also setting the pg primary_num_bytes
-  // That's why we don't immediately unlock
-  Mutex::Locker l(pg->osd->stat_lock);
-  osd_stat_t cur_stat = pg->osd->osd_stat;
-  if (ps->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
-      (rand()%1000 < (ps->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
-    psdout(10) << "backfill reservation rejected: failure injection"
-                      << dendl;
-    post_event(RejectRemoteReservation());
-  } else if (!ps->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
-      pg->osd->tentative_backfill_full(pg, pending_adjustment, cur_stat)) {
-    psdout(10) << "backfill reservation rejected: backfill full"
-                      << dendl;
+  if (!pl->try_reserve_recovery_space(
+       evt.primary_num_bytes, evt.local_num_bytes)) {
     post_event(RejectRemoteReservation());
   } else {
-    // Don't reserve space if skipped reservation check, this is used
-    // to test the other backfill full check AND in case a corruption
-    // of num_bytes requires ignoring that value and trying the
-    // backfill anyway.
-    if (primary_num_bytes && !ps->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation)
-      pg->set_reserved_num_bytes(primary_num_bytes, local_num_bytes);
-    else
-      pg->clear_reserved_num_bytes();
     // Use un-ec-adjusted bytes for stats.
     ps->info.stats.stats.sum.num_bytes = evt.local_num_bytes;
 
@@ -2072,7 +2025,7 @@ PeeringState::RepWaitBackfillReserved::react(
   const RejectRemoteReservation &evt)
 {
   DECLARE_LOCALS
-  pg->reject_reservation();
+  ps->reject_reservation();
   post_event(RemoteReservationRejected());
   return discard_event();
 }
@@ -2082,7 +2035,7 @@ PeeringState::RepWaitBackfillReserved::react(
   const RemoteReservationRejected &evt)
 {
   DECLARE_LOCALS
-  pg->clear_reserved_num_bytes();
+  pl->unreserve_recovery_space();
 
   pl->cancel_remote_recovery_reservation();
   return transit<RepNotRecovering>();
@@ -2093,7 +2046,7 @@ PeeringState::RepWaitBackfillReserved::react(
   const RemoteReservationCanceled &evt)
 {
   DECLARE_LOCALS
-  pg->clear_reserved_num_bytes();
+  pl->unreserve_recovery_space();
 
   pl->cancel_remote_recovery_reservation();
   return transit<RepNotRecovering>();
@@ -2113,7 +2066,7 @@ PeeringState::RepRecovering::react(const RemoteRecoveryPreempted &)
   DECLARE_LOCALS
 
 
-  pg->clear_reserved_num_bytes();
+  pl->unreserve_recovery_space();
   pl->send_cluster_message(
     ps->primary.osd,
     new MRecoveryReserve(
@@ -2130,7 +2083,7 @@ PeeringState::RepRecovering::react(const BackfillTooFull &)
   DECLARE_LOCALS
 
 
-  pg->clear_reserved_num_bytes();
+  pl->unreserve_recovery_space();
   pl->send_cluster_message(
     ps->primary.osd,
     new MBackfillReserve(
@@ -2147,7 +2100,7 @@ PeeringState::RepRecovering::react(const RemoteBackfillPreempted &)
   DECLARE_LOCALS
 
 
-  pg->clear_reserved_num_bytes();
+  pl->unreserve_recovery_space();
   pl->send_cluster_message(
     ps->primary.osd,
     new MBackfillReserve(
@@ -2162,7 +2115,7 @@ void PeeringState::RepRecovering::exit()
 {
   context< PeeringMachine >().log_exit(state_name, enter_time);
   DECLARE_LOCALS
-  pg->clear_reserved_num_bytes();
+  pl->unreserve_recovery_space();
 
   pl->cancel_remote_recovery_reservation();
   utime_t dur = ceph_clock_now() - enter_time;
@@ -2894,7 +2847,7 @@ void PeeringState::ReplicaActive::exit()
 {
   context< PeeringMachine >().log_exit(state_name, enter_time);
   DECLARE_LOCALS
-  pg->clear_reserved_num_bytes();
+  pl->unreserve_recovery_space();
 
   pl->cancel_remote_recovery_reservation();
   utime_t dur = ceph_clock_now() - enter_time;
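Every rejection, preemption, and exit path in these hunks repeats the same pair of calls: pl->unreserve_recovery_space() followed by pl->cancel_remote_recovery_reservation(). Purely to illustrate that invariant (this is not something the commit does), the pair could be captured in a scope guard; PeeringListenerSketch refers to the sketch near the top of this page, and the cancel hook is an assumed stand-in:

    // Hypothetical guard, not part of the commit: runs the cleanup pair
    // that the RepRecovering/ReplicaActive exit paths repeat.
    struct UnreserveGuard {
      PeeringListenerSketch *pl;
      ~UnreserveGuard() {
        pl->unreserve_recovery_space();
        // pl->cancel_remote_recovery_reservation();  // assumed hook
      }
    };
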
index b89f1f6780aee8cf276a78fd9455d9f1bbeb0adf..18e30ac8fa168b686326a73f08097012ce65649d 100644 (file)
@@ -131,6 +131,11 @@ public:
     virtual void on_backfill_canceled() = 0;
     virtual void on_recovery_reserved() = 0;
 
+    // recovery space accounting
+    virtual bool try_reserve_recovery_space(
+      int64_t primary_num_bytes, int64_t local_num_bytes) = 0;
+    virtual void unreserve_recovery_space() = 0;
+
     virtual epoch_t oldest_stored_osdmap() = 0;
     virtual LogChannel &get_clog() = 0;
 
@@ -1249,6 +1254,8 @@ public:
   bool adjust_need_up_thru(const OSDMapRef osdmap);
   PastIntervals::PriorSet build_prior();
 
+  void reject_reservation();
+
 public:
   PeeringState(
     CephContext *cct,
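
Since the two new listener methods are pure virtual, every PeeringListener implementation has to provide them. A minimal always-grant stub, as one might write for a unit test (hypothetical test code, reusing PeeringListenerSketch from the sketch near the top of this page):

    #include <cstdint>

    struct AlwaysGrantListener : PeeringListenerSketch {
      bool reserved = false;
      bool try_reserve_recovery_space(int64_t, int64_t) override {
        reserved = true;  // always grant in tests
        return true;
      }
      void unreserve_recovery_space() override { reserved = false; }
    };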