git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
PG, OSD: reject backfills when an OSD is nearly full
authorMike Ryan <mike.ryan@inktank.com>
Fri, 14 Sep 2012 17:31:42 +0000 (10:31 -0700)
committerMike Ryan <mike.ryan@inktank.com>
Wed, 26 Sep 2012 18:57:31 +0000 (11:57 -0700)
Reject backfills when an OSD reaches a configurable full ratio. Retry
backfilling periodically in the hopes that the OSD has become less full.

This changeset introduces two configuration options for dealing with
this: osd_backfill_full_ratio and osd_backfill_retry_interval.

We also introduce two new state transitions in the PG's Active state.

Signed-off-by: Mike Ryan <mike.ryan@inktank.com>
src/common/config_opts.h
src/messages/MBackfillReserve.h
src/mon/PGMonitor.cc
src/osd/OSD.cc
src/osd/OSD.h
src/osd/PG.cc
src/osd/PG.h
src/osd/osd_types.cc
src/osd/osd_types.h

index 7143361326c9920563e0dafc41bc14a646cb1574..1c09c5c5e659e6b59ef559a19e1de180284df74e 100644 (file)
@@ -267,6 +267,12 @@ OPTION(osd_tmapput_sets_uses_tmap, OPT_BOOL, false)
 // Maximum number of backfills to or from a single osd
 OPTION(osd_max_backfills, OPT_U64, 5)
 
+// Refuse backfills when OSD full ratio is above this value
+OPTION(osd_backfill_full_ratio, OPT_FLOAT, 0.85)
+
+// Seconds to wait before retrying refused backfills
+OPTION(osd_backfill_retry_interval, OPT_DOUBLE, 10.0)
+
 OPTION(osd_uuid, OPT_UUID, uuid_d())
 OPTION(osd_data, OPT_STR, "/var/lib/ceph/osd/$cluster-$id")
 OPTION(osd_journal, OPT_STR, "/var/lib/ceph/osd/$cluster-$id/journal")
index 23308c22be0938e3b2ccdaeb40ad02857d72d7f4..47eb1d3781387e8e4b35ddb378662d01753457a4 100644 (file)
@@ -25,7 +25,8 @@ public:
   epoch_t query_epoch;
   enum {
     REQUEST = 0,
-    GRANT = 1
+    GRANT = 1,
+    REJECT = 2,
   };
   int type;
   MBackfillReserve(int type,
index 30ada2f95a50f8c3883cd60f4598cbf7ebe719d2..c30760a8404e432186a7cf4d83e574731ae3405b 100644 (file)
@@ -1228,6 +1228,8 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
       note["backfill"] += p->second;
     if (p->first & PG_STATE_BACKFILL)
       note["backfilling"] += p->second;
+    if (p->first & PG_STATE_BACKFILL_TOOFULL)
+      note["backfill_toofull"] += p->second;
   }
 
   hash_map<pg_t, pg_stat_t> stuck_pgs;
@@ -1277,7 +1279,8 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
                               PG_STATE_RECOVERING |
                               PG_STATE_INCOMPLETE |
                               PG_STATE_BACKFILL_WAIT |
-                              PG_STATE_BACKFILL) &&
+                              PG_STATE_BACKFILL |
+                              PG_STATE_BACKFILL_TOOFULL) &&
            stuck_pgs.count(p->first) == 0) {
          ostringstream ss;
          ss << "pg " << p->first << " is " << pg_state_string(p->second.state);
index 606b5c3cc46ae97e10bbbe66350f166b6ff109e3..1dc552a08cc3f54dd4d2ea40e9a23686fae4bd0b 100644 (file)
@@ -162,6 +162,8 @@ OSDService::OSDService(OSD *osd) :
   watch_lock("OSD::watch_lock"),
   watch_timer(osd->client_messenger->cct, watch_lock),
   watch(NULL),
+  backfill_request_lock("OSD::backfill_request_lock"),
+  backfill_request_timer(g_ceph_context, backfill_request_lock, false),
   last_tid(0),
   tid_lock("OSDService::tid_lock"),
   reserver_finisher(g_ceph_context),
@@ -819,6 +821,7 @@ int OSD::init()
   Mutex::Locker lock(osd_lock);
 
   timer.init();
+  service.backfill_request_timer.init();
 
   // mount.
   dout(2) << "mounting " << dev_path << " "
@@ -1048,6 +1051,10 @@ int OSD::shutdown()
 
   timer.shutdown();
 
+  service.backfill_request_lock.Lock();
+  service.backfill_request_timer.shutdown();
+  service.backfill_request_lock.Unlock();
+
   heartbeat_lock.Lock();
   heartbeat_stop = true;
   heartbeat_cond.Signal();
@@ -4904,6 +4911,13 @@ void OSD::handle_pg_backfill_reserve(OpRequestRef op)
          m->query_epoch,
          m->query_epoch,
          PG::RemoteBackfillReserved())));
+  } else if (m->type == MBackfillReserve::REJECT) {
+    pg->queue_peering_event(
+      PG::CephPeeringEvtRef(
+       new PG::CephPeeringEvt(
+         m->query_epoch,
+         m->query_epoch,
+         PG::RemoteReservationRejected())));
   } else {
     assert(0);
   }
index 1409158d05457fce36e1e5ea3a735498af0e75cf..f59910f41dad975e4f58302d2e81682212b97ca9 100644 (file)
@@ -244,6 +244,10 @@ public:
   SafeTimer watch_timer;
   Watch *watch;
 
+  // -- Backfill Request Scheduling --
+  Mutex backfill_request_lock;
+  SafeTimer backfill_request_timer;
+
   // -- tids --
   // for ops i issue
   tid_t last_tid;
index 1861130c65fd9954ca217785f6b2e1bc3d050244..5c1d838463c74bce66c8abc248812d6461d0ec41 100644 (file)
@@ -1670,6 +1670,7 @@ void PG::all_activated_and_committed()
 
   // make sure CLEAN is marked if we've been clean in this interval
   if (info.last_complete == info.last_update &&
+      !state_test(PG_STATE_BACKFILL_TOOFULL) &&
       !state_test(PG_STATE_BACKFILL_WAIT) &&
       !state_test(PG_STATE_RECOVERING)) {
     mark_clean();
@@ -1733,6 +1734,7 @@ void PG::finish_recovery(ObjectStore::Transaction& t, list<Context*>& tfin)
   dout(10) << "finish_recovery" << dendl;
   assert(info.last_complete == info.last_update);
 
+  state_clear(PG_STATE_BACKFILL_TOOFULL);
   state_clear(PG_STATE_BACKFILL_WAIT);
   state_clear(PG_STATE_RECOVERING);
 
@@ -5093,6 +5095,24 @@ void PG::RecoveryState::Backfilling::exit()
   pg->state_clear(PG_STATE_BACKFILL);
 }
 
+template <class EVT>
+struct QueuePeeringEvt : Context {
+  boost::intrusive_ptr<PG> pg;
+  epoch_t epoch;
+  EVT evt;
+  QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) :
+    pg(pg), epoch(epoch), evt(evt) {}
+  void finish(int r) {
+    pg->lock();
+    pg->queue_peering_event(PG::CephPeeringEvtRef(
+       new PG::CephPeeringEvt(
+         epoch,
+         epoch,
+         evt)));
+    pg->unlock();
+  }
+};
+
 /*--WaitRemoteBackfillReserved--*/
 
 PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
@@ -5121,26 +5141,33 @@ void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
   context< RecoveryMachine >().log_exit(state_name, enter_time);
 }
 
+boost::statechart::result
+PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
+{
+  PG *pg = context< RecoveryMachine >().pg;
+  pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
+  return transit<Backfilling>();
+}
 
-/*--WaitLocalBackfillReserved--*/
-template <class EVT>
-struct C_QueuePeeringEvt : Context {
-  boost::intrusive_ptr<PG> pg;
-  epoch_t epoch;
-  EVT evt;
-  C_QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) :
-    pg(pg), epoch(epoch), evt(evt) {}
-  void finish(int r) {
-    pg->lock();
-    pg->queue_peering_event(PG::CephPeeringEvtRef(
-       new PG::CephPeeringEvt(
-         epoch,
-         epoch,
-         evt)));
-    pg->unlock();
-  }
-};
+boost::statechart::result
+PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected &evt)
+{
+  PG *pg = context< RecoveryMachine >().pg;
+  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
+  pg->state_clear(PG_STATE_BACKFILL_WAIT);
+  pg->state_set(PG_STATE_BACKFILL_TOOFULL);
+
+  Mutex::Locker lock(pg->osd->backfill_request_lock);
+  pg->osd->backfill_request_timer.add_event_after(
+    g_conf->osd_backfill_retry_interval,
+    new QueuePeeringEvt<RequestBackfill>(
+      pg, pg->get_osdmap()->get_epoch(),
+      RequestBackfill()));
 
+  return transit<NotBackfilling>();
+}
+
+/*--WaitLocalBackfillReserved--*/
 PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
   : my_base(ctx)
 {
@@ -5149,7 +5176,7 @@ PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_conte
   PG *pg = context< RecoveryMachine >().pg;
   pg->osd->local_reserver.request_reservation(
     pg->info.pgid,
-    new C_QueuePeeringEvt<LocalBackfillReserved>(
+    new QueuePeeringEvt<LocalBackfillReserved>(
       pg, pg->get_osdmap()->get_epoch(),
       LocalBackfillReserved()));
 }
@@ -5192,11 +5219,21 @@ PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context c
   state_name = "Started/Primary/Active/RepWaitBackfillReserved";
   context< RecoveryMachine >().log_enter(state_name);
   PG *pg = context< RecoveryMachine >().pg;
-  pg->osd->remote_reserver.request_reservation(
-    pg->info.pgid,
-    new C_QueuePeeringEvt<RemoteBackfillReserved>(
-      pg, pg->get_osdmap()->get_epoch(),
-      RemoteBackfillReserved()));
+
+  int64_t kb      = pg->osd->osd->osd_stat.kb,
+          kb_used = pg->osd->osd->osd_stat.kb_used;
+  int64_t max = kb * g_conf->osd_backfill_full_ratio;
+  if (kb_used >= max) {
+    dout(10) << "backfill reservation rejected: kb used >= max: "
+             << kb_used << " >= " << max << dendl;
+    post_event(RemoteReservationRejected());
+  } else {
+    pg->osd->remote_reserver.request_reservation(
+      pg->info.pgid,
+      new QueuePeeringEvt<RemoteBackfillReserved>(
+        pg, pg->get_osdmap()->get_epoch(),
+        RemoteBackfillReserved()));
+  }
 }
 
 void PG::RecoveryState::RepWaitBackfillReserved::exit()
@@ -5217,6 +5254,19 @@ PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &
   return transit<RepBackfilling>();
 }
 
+boost::statechart::result
+PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteReservationRejected &evt)
+{
+  PG *pg = context< RecoveryMachine >().pg;
+  pg->osd->cluster_messenger->send_message(
+    new MBackfillReserve(
+      MBackfillReserve::REJECT,
+      pg->info.pgid,
+      pg->get_osdmap()->get_epoch()),
+    pg->get_osdmap()->get_cluster_inst(pg->acting[0]));
+  return transit<RepNotBackfilling>();
+}
+
 /*---RepBackfilling-------*/
 PG::RecoveryState::RepBackfilling::RepBackfilling(my_context ctx)
   : my_base(ctx)
@@ -5383,6 +5433,7 @@ boost::statechart::result PG::RecoveryState::Active::react(const RecoveryComplet
 
   int newest_update_osd;
 
+  pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
   pg->state_clear(PG_STATE_BACKFILL_WAIT);
   pg->state_clear(PG_STATE_RECOVERING);
 
@@ -5470,6 +5521,7 @@ void PG::RecoveryState::Active::exit()
   pg->backfill_reserved = false;
   pg->backfill_reserving = false;
   pg->state_clear(PG_STATE_DEGRADED);
+  pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
   pg->state_clear(PG_STATE_BACKFILL_WAIT);
   pg->state_clear(PG_STATE_REPLAY);
 }
index bf3b2afab940e029cd7dee4afd2197c74010261b..aa10fe101748e2391795b8056be899c863cec091 100644 (file)
@@ -1046,6 +1046,7 @@ public:
   TrivialEvent(Backfilled)
   TrivialEvent(LocalBackfillReserved)
   TrivialEvent(RemoteBackfillReserved)
+  TrivialEvent(RemoteReservationRejected)
   TrivialEvent(RequestBackfill)
 
   /* Encapsulates PG recovery process */
@@ -1318,10 +1319,13 @@ public:
 
     struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
       typedef boost::mpl::list<
-       boost::statechart::transition< RemoteBackfillReserved, Backfilling >
+       boost::statechart::custom_reaction< RemoteBackfillReserved >,
+       boost::statechart::custom_reaction< RemoteReservationRejected >
        > reactions;
       WaitRemoteBackfillReserved(my_context ctx);
       void exit();
+      boost::statechart::result react(const RemoteBackfillReserved& evt);
+      boost::statechart::result react(const RemoteReservationRejected& evt);
     };
 
     struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
@@ -1371,11 +1375,13 @@ public:
 
     struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
       typedef boost::mpl::list<
-       boost::statechart::custom_reaction< RemoteBackfillReserved >
+       boost::statechart::custom_reaction< RemoteBackfillReserved >,
+       boost::statechart::custom_reaction< RemoteReservationRejected >
        > reactions;
       RepWaitBackfillReserved(my_context ctx);
       void exit();
       boost::statechart::result react(const RemoteBackfillReserved &evt);
+      boost::statechart::result react(const RemoteReservationRejected &evt);
     };
 
     struct RepNotBackfilling : boost::statechart::state< RepNotBackfilling, ReplicaActive>, NamedState {
index 0fe106286f5d2cac85cbe371383e4e83cc3fb0af..4cf2622cf803127e38e46b1e00cb5d273d8f7436 100644 (file)
@@ -421,6 +421,8 @@ std::string pg_state_string(int state)
     oss << "wait_backfill+";
   if (state & PG_STATE_BACKFILL)
     oss << "backfilling+";
+  if (state & PG_STATE_BACKFILL_TOOFULL)
+    oss << "backfill_toofull+";
   if (state & PG_STATE_INCOMPLETE)
     oss << "incomplete+";
   string ret(oss.str());
index 1f74a05c9c5eb2eeb3ec95fe8f108473ddfcc3ed..55d9be3f9e3ceabdbf0de4b64630b27b9a6b7353 100644 (file)
@@ -566,6 +566,7 @@ inline ostream& operator<<(ostream& out, const osd_stat_t& s) {
 #define PG_STATE_REMAPPED     (1<<18) // pg is explicitly remapped to different OSDs than CRUSH
 #define PG_STATE_DEEP_SCRUB   (1<<19) // deep scrub: check CRC32 on files
 #define PG_STATE_BACKFILL  (1<<20) // [active] backfilling pg content
+#define PG_STATE_BACKFILL_TOOFULL (1<<21) // backfill can't proceed: too full
 
 std::string pg_state_string(int state);