osd: wait for pgs to catch up in handle_osd_map

author Sage Weil <sage@redhat.com>

Sun, 5 Nov 2017 16:44:57 +0000 (10:44 -0600)

committer Sage Weil <sage@redhat.com>

Mon, 4 Dec 2017 18:45:17 +0000 (12:45 -0600)
author Sage Weil <sage@redhat.com>
Sun, 5 Nov 2017 16:44:57 +0000 (10:44 -0600)
committer Sage Weil <sage@redhat.com>
Mon, 4 Dec 2017 18:45:17 +0000 (12:45 -0600)
diff --git a/src/common/options.cc b/src/common/options.cc

index 2b311db7b93de85f8dd77aa17ec26a5124e1a4ce..e75f67b08b7f0cca799553622746307efb59f2ae 100644 (file)
--- a/src/common/options.cc
+++ b/src/common/options.cc
@@ -1911,6 +1911,11 @@ std::vector<Option> get_global_options() {
      .set_default(40)
      .set_description(""),
  
+    Option("osd_pg_epoch_max_lag_factor", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
+    .set_default(2.0)
+    .set_description("Max multiple of the map cache that PGs can lag before we throttle map injest")
+    .add_see_also("osd_map_cache_size"),
+
      Option("osd_inject_bad_map_crc_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
      .set_default(0)
      .set_description(""),
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc

index 6841a4017deb44dead7957561d9f2a48e500f519..ccc11628868def35bd849f94124a39fc48f961b9 100644 (file)
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -1980,6 +1980,8 @@ OSD::OSD(CephContext *cct_, ObjectStore *store_,
    store_is_rotational(store->is_rotational()),
    trace_endpoint("0.0.0.0", 0, "osd"),
    asok_hook(NULL),
+  m_osd_pg_epoch_max_lag_factor(cct->_conf->get_val<double>(
+                                 "osd_pg_epoch_max_lag_factor")),
    osd_compat(get_osd_compat_set()),
    osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
             get_num_op_threads()),
@@ -7347,6 +7349,29 @@ void OSD::handle_osd_map(MOSDMap *m)
      skip_maps = true;
    }
  
+  // wait for pgs to catch up
+  {
+    // we extend the map cache pins to accommodate pgs slow to consume maps
+    // for some period, until we hit the max_lag_factor bound, at which point
+    // we block here to stop ingesting more maps than they are able to keep
+    // up with.
+    epoch_t max_lag = cct->_conf->osd_map_cache_size *
+      m_osd_pg_epoch_max_lag_factor;
+    assert(max_lag > 0);
+    if (osdmap->get_epoch() > max_lag) {
+      epoch_t min = service.get_min_pg_epoch();
+      epoch_t need = osdmap->get_epoch() - max_lag;
+      if (need > min) {
+       dout(10) << __func__ << " waiting for pgs to consume " << need
+                << " (current min " << min
+                << ", map cache is " << cct->_conf->osd_map_cache_size
+                << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
+                << ")" << dendl;
+       service.wait_min_pg_epoch(need);
+      }
+    }
+  }
+
    ObjectStore::Transaction t;
    uint64_t txn_size = 0;
  
@@ -9314,6 +9339,7 @@ const char** OSD::get_tracked_conf_keys() const
      "osd_op_history_slow_op_threshold",
      "osd_enable_op_tracker",
      "osd_map_cache_size",
+    "osd_pg_epoch_max_lag_factor",
      "osd_pg_epoch_persisted_max_stale",
      "osd_disk_thread_ioprio_class",
      "osd_disk_thread_ioprio_priority",
@@ -9390,6 +9416,10 @@ void OSD::handle_conf_change(const struct md_config_t *conf,
        changed.count("fsid")) {
      update_log_config();
    }
+  if (changed.count("osd_pg_epoch_max_lag_factor")) {
+    m_osd_pg_epoch_max_lag_factor = conf->get_val<double>(
+      "osd_pg_epoch_max_lag_factor");
+  }
  
  #ifdef HAVE_LIBFUSE
    if (changed.count("osd_objectstore_fuse")) {
diff --git a/src/osd/OSD.h b/src/osd/OSD.h

index 02b01f6376d4aa1246a94bbbb3d58560f2e07aaf..cb3d8af8e43e54458b8649b2e4ae8554a7174fcb 100644 (file)
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -384,6 +384,7 @@ public:
  private:
    // -- map epoch lower bound --
    Mutex pg_epoch_lock;
+  Cond pg_cond;
    multiset<epoch_t> pg_epochs;
    map<spg_t,epoch_t> pg_epoch;
  
@@ -394,11 +395,19 @@ public:
      assert(t == pg_epoch.end());
      pg_epoch[pgid] = epoch;
      pg_epochs.insert(epoch);
+    if (*pg_epochs.begin() == epoch) {
+      // we are the (new?) blocking epoch
+      pg_cond.Signal();
+    }
    }
    void pg_update_epoch(spg_t pgid, epoch_t epoch) {
      Mutex::Locker l(pg_epoch_lock);
      map<spg_t,epoch_t>::iterator t = pg_epoch.find(pgid);
      assert(t != pg_epoch.end());
+    if (*pg_epochs.begin() == t->second) {
+      // we were on the blocking epoch
+      pg_cond.Signal();
+    }
      pg_epochs.erase(pg_epochs.find(t->second));
      t->second = epoch;
      pg_epochs.insert(epoch);
@@ -407,6 +416,10 @@ public:
      Mutex::Locker l(pg_epoch_lock);
      map<spg_t,epoch_t>::iterator t = pg_epoch.find(pgid);
      if (t != pg_epoch.end()) {
+      if (*pg_epochs.begin() == t->second) {
+       // we were on the blocking epoch
+       pg_cond.Signal();
+      }
        pg_epochs.erase(pg_epochs.find(t->second));
        pg_epoch.erase(t);
      }
@@ -419,6 +432,14 @@ public:
        return *pg_epochs.begin();
    }
  
+  void wait_min_pg_epoch(epoch_t e) {  // block until no tracked pg epoch is below e
+    Mutex::Locker l(pg_epoch_lock);  // pg_epochs is only examined with pg_epoch_lock held
+    while (!pg_epochs.empty() &&  // nothing tracked: nothing to wait for
+          *pg_epochs.begin() < e) {  // begin() is the smallest tracked epoch
+      pg_cond.Wait(pg_epoch_lock);  // re-check after each wakeup; pg_cond is signalled by the pg_*_epoch updaters
+    }
+  }
+
  private:
    // -- superblock --
    Mutex publish_lock, pre_publish_lock; // pre-publish orders before publish
@@ -1283,6 +1304,9 @@ private:
    class C_Tick;
    class C_Tick_WithoutOSDLock;
  
+  // -- config settings --
+  float m_osd_pg_epoch_max_lag_factor;
+
    // -- superblock --
    OSDSuperblock superblock;
author	Sage Weil <sage@redhat.com>
	Sun, 5 Nov 2017 16:44:57 +0000 (10:44 -0600)
committer	Sage Weil <sage@redhat.com>
	Mon, 4 Dec 2017 18:45:17 +0000 (12:45 -0600)
src/common/options.cc		patch \| blob \| history
src/osd/OSD.cc		patch \| blob \| history
src/osd/OSD.h		patch \| blob \| history