osd: restructure and simplify internal fullness checks

author Sage Weil <sage@redhat.com>

Thu, 23 Feb 2017 20:55:35 +0000 (15:55 -0500)

committer Sage Weil <sage@redhat.com>

Mon, 6 Mar 2017 22:21:21 +0000 (17:21 -0500)
author Sage Weil <sage@redhat.com>
Thu, 23 Feb 2017 20:55:35 +0000 (15:55 -0500)
committer Sage Weil <sage@redhat.com>
Mon, 6 Mar 2017 22:21:21 +0000 (17:21 -0500)
diff --git a/src/common/config_opts.h b/src/common/config_opts.h

index 0cbec2006d3bc44f613de6e4ebaeef58ec564134..ef4dc9ebceba9dbced29b92360726d04a49ae936 100644 (file)
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -875,7 +875,6 @@ OPTION(osd_op_history_size, OPT_U32, 20)    // Max number of completed ops to tr
  OPTION(osd_op_history_duration, OPT_U32, 600) // Oldest completed op to track
  OPTION(osd_target_transaction_size, OPT_INT, 30)     // to adjust various transactions that batch smaller items
  OPTION(osd_failsafe_full_ratio, OPT_FLOAT, .97) // what % full makes an OSD "full" (failsafe)
-OPTION(osd_failsafe_nearfull_ratio, OPT_FLOAT, .90) // what % full makes an OSD near full (failsafe)
  OPTION(osd_fast_fail_on_connection_refused, OPT_BOOL, true) // immediately mark OSDs as down once they refuse to accept connections
  
  OPTION(osd_pg_object_context_cache_count, OPT_INT, 64)
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc

index d1df4799b88a35e877deccd3a98ae93b1944c798..5c2cf7975024d4e7bbf9a57700713f79bdefd4e4 100644 (file)
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -278,7 +278,6 @@ OSDService::OSDService(OSD *osd) :
    stat_lock("OSDService::stat_lock"),
    full_status_lock("OSDService::full_status_lock"),
    cur_state(NONE),
-  last_msg(0),
    cur_ratio(0),
    epoch_lock("OSDService::epoch_lock"),
    boot_epoch(0), up_epoch(0), bind_epoch(0),
@@ -707,19 +706,9 @@ float OSDService::get_failsafe_full_ratio()
    return full_ratio;
  }
  
-float OSDService::get_failsafe_nearfull_ratio()
-{
-  float nearfull_ratio = cct->_conf->osd_failsafe_nearfull_ratio;
-  if (nearfull_ratio > 1.0) nearfull_ratio /= 100.0;
-  return nearfull_ratio;
-}
-
-void OSDService::check_nearfull_warning(const osd_stat_t &osd_stat)
+void OSDService::check_full_status(const osd_stat_t &osd_stat)
  {
    Mutex::Locker l(full_status_lock);
-  enum s_names new_state;
-
-  time_t now = ceph_clock_gettime();
  
    // We base ratio on kb_avail rather than kb_used because they can
    // differ significantly e.g. on btrfs volumes with a large number of
@@ -728,39 +717,80 @@ void OSDService::check_nearfull_warning(const osd_stat_t &osd_stat)
    // much space is available to use than how much we've already used.
    float ratio = ((float)(osd_stat.kb - osd_stat.kb_avail)) /
      ((float)osd_stat.kb);
-  float nearfull_ratio = get_failsafe_nearfull_ratio();
-  float full_ratio = get_failsafe_full_ratio();
    cur_ratio = ratio;
  
-  if (full_ratio > 0 && ratio > full_ratio) {
-    new_state = FULL;
-  } else if (nearfull_ratio > 0 && ratio > nearfull_ratio) {
-    new_state = NEAR;
-  } else {
+  // The OSDMap ratios take precendence.  So if the failsafe is .95 and
+  // the admin sets the cluster full to .96, the failsafe moves up to .96
+  // too.  (Not that having failsafe == full is ideal, but it's better than
+  // dropping writes before the clusters appears full.)
+  OSDMapRef osdmap = get_osdmap();
+  if (!osdmap || osdmap->get_epoch() == 0) {
      cur_state = NONE;
      return;
    }
+  float nearfull_ratio = osdmap->get_nearfull_ratio();
+  float full_ratio = std::max(osdmap->get_full_ratio(), nearfull_ratio);
+  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);
  
+  if (full_ratio <= 0 ||
+      nearfull_ratio <= 0) {
+    derr << __func__ << " full_ratio or nearfull_ratio is <= 0" << dendl;
+    cur_state = NONE;
+    return;
+  }
+
+  enum s_names new_state;
+  if (ratio > failsafe_ratio) {
+    new_state = FAILSAFE;
+  } else if (ratio > full_ratio) {
+    new_state = FULL;
+  } else if (ratio > nearfull_ratio) {
+    new_state = NEARFULL;
+  } else {
+    new_state = NONE;
+  }
+  dout(20) << __func__ << " cur ratio " << ratio
+          << ". nearfull_ratio " << nearfull_ratio
+          << ", full_ratio " << full_ratio
+          << ", failsafe_ratio " << failsafe_ratio
+          << ", new state " << get_full_state_name(new_state)
+          << dendl;
+
+  // warn
    if (cur_state != new_state) {
+    dout(10) << __func__ << " " << get_full_state_name(cur_state)
+            << " -> " << get_full_state_name(new_state) << dendl;
+    if (new_state == FAILSAFE) {
+      clog->error() << "failsafe engaged, dropping updates, now "
+                   << (int)roundf(ratio * 100) << "% full";
+    } else if (cur_state == FAILSAFE) {
+      clog->error() << "failsafe disengaged, no longer dropping updates, now "
+                   << (int)roundf(ratio * 100) << "% full";
+    }
      cur_state = new_state;
-  } else if (now - last_msg < cct->_conf->osd_op_complaint_time) {
-    return;
    }
-  last_msg = now;
-  if (cur_state == FULL)
-    clog->error() << "OSD full dropping all updates " << (int)roundf(ratio * 100) << "% full";
-  else
-    clog->warn() << "OSD near full (" << (int)roundf(ratio * 100) << "%)";
  }
  
  bool OSDService::check_failsafe_full()
  {
    Mutex::Locker l(full_status_lock);
-  if (cur_state == FULL)
+  if (cur_state == FAILSAFE)
      return true;
    return false;
  }
  
+bool OSDService::is_nearfull()
+{
+  Mutex::Locker l(full_status_lock);
+  return cur_state == NEARFULL;
+}
+
+bool OSDService::is_full()
+{
+  Mutex::Locker l(full_status_lock);
+  return cur_state >= FULL;
+}
+
  bool OSDService::too_full_for_backfill(double *_ratio, double *_max_ratio)
  {
    Mutex::Locker l(full_status_lock);
@@ -801,9 +831,9 @@ void OSDService::update_osd_stat(vector<int>& hb_peers)
    osd->logger->set(l_osd_stat_bytes_used, used);
    osd->logger->set(l_osd_stat_bytes_avail, avail);
  
-  check_nearfull_warning(osd_stat);
-
    dout(20) << "update_osd_stat " << osd_stat << dendl;
+
+  check_full_status(osd_stat);
  }
  
  void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
diff --git a/src/osd/OSD.h b/src/osd/OSD.h

index 6e16d633cbdd2d1bc4e5aa0bd09b126498cfe3f5..e305d7ef5e41eb07ee528918f8476ed909af4543 100644 (file)
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -1136,14 +1136,23 @@ public:
    // -- OSD Full Status --
  private:
    Mutex full_status_lock;
-  enum s_names { NONE, NEAR, FULL } cur_state;
-  time_t last_msg;
-  double cur_ratio;
+  enum s_names { NONE, NEARFULL, FULL, FAILSAFE } cur_state;  // ascending
+  const char *get_full_state_name(s_names s) {
+    switch (s) {
+    case NONE: return "none";
+    case NEARFULL: return "nearfull";
+    case FULL: return "full";
+    case FAILSAFE: return "failsafe";
+    default: return "???";
+    }
+  }
+  double cur_ratio;  ///< current utilization
    float get_failsafe_full_ratio();
-  float get_failsafe_nearfull_ratio();
-  void check_nearfull_warning(const osd_stat_t &stat);
+  void check_full_status(const osd_stat_t &stat);
  public:
    bool check_failsafe_full();
+  bool is_nearfull();
+  bool is_full();
    bool too_full_for_backfill(double *ratio, double *max_ratio);
  
    // -- epochs --
author	Sage Weil <sage@redhat.com>
	Thu, 23 Feb 2017 20:55:35 +0000 (15:55 -0500)
committer	Sage Weil <sage@redhat.com>
	Mon, 6 Mar 2017 22:21:21 +0000 (17:21 -0500)
src/common/config_opts.h		patch \| blob \| history
src/osd/OSD.cc		patch \| blob \| history
src/osd/OSD.h		patch \| blob \| history