osd: allow limited scrub preemption
author    Sage Weil <sage@redhat.com>
          Fri, 5 Jan 2018 14:35:34 +0000 (08:35 -0600)
committer Sage Weil <sage@redhat.com>
          Wed, 17 Jan 2018 03:52:09 +0000 (21:52 -0600)
If we receive a write within the scrub range, preempt the scrub: abort the
current chunk and shrink the chunk size.  If we are preempted too many times,
stop preempting and allow the scrub to complete (to avoid scrub starvation
due to client IO).

Signed-off-by: Sage Weil <sage@redhat.com>
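
The policy above is a small backoff loop. A minimal standalone sketch of it
(hypothetical names; the real implementation is the PG::chunky_scrub changes
in src/osd/PG.cc below):

    #include <cstdint>

    // Hypothetical sketch of the preemption policy described above.
    struct ScrubPreemptState {
      uint64_t preempt_left = 5;     // budget, from osd_scrub_max_preemptions
      uint64_t preempt_divisor = 1;  // doubled per preemption to shrink chunks
    };

    // Called when a client write lands inside the chunk being scrubbed.
    // Returns true if the scrub should yield (abort and redo the chunk).
    bool maybe_preempt(ScrubPreemptState& s) {
      if (s.preempt_left == 0)
        return false;                // budget exhausted: block client IO instead
      --s.preempt_left;
      s.preempt_divisor *= 2;        // retry the range with a smaller chunk
      return true;
    }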
src/common/options.cc
src/messages/MOSDRepScrub.h
src/messages/MOSDRepScrubMap.h
src/osd/ECBackend.cc
src/osd/PG.cc
src/osd/PG.h
src/osd/PGBackend.h
src/osd/PrimaryLogPG.cc
src/osd/PrimaryLogPG.h
src/osd/ReplicatedBackend.cc

diff --git a/src/common/options.cc b/src/common/options.cc
index 7f9222d35e707b14ba5399622947298b168ab9b9..fac7b6c52fde28c77bb381ae0f419590e008da04 100644
@@ -2681,6 +2681,10 @@ std::vector<Option> get_global_options() {
     .set_default(5)
     .set_description(""),
 
+    Option("osd_scrub_max_preemptions", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("Set the maximum number of times we will preempt a deep scrub due to a client operation before blocking client IO to complete the scrub"),
+
     Option("osd_deep_scrub_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(7_day)
     .set_description(""),
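
The default of 5 bounds how many times a single scrub chunk yields to client
writes before it starts blocking them; the counter is reset as each chunk
completes (see the second reset in the PG.cc hunks below). The option is
consumed via the generic accessor:

    // How the option is read in PG::chunky_scrub below (cct is the CephContext):
    uint64_t budget = cct->_conf->get_val<uint64_t>("osd_scrub_max_preemptions");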
diff --git a/src/messages/MOSDRepScrub.h b/src/messages/MOSDRepScrub.h
index 134c40835dd2cb06c339a6e5b86b3b86dac86aca..35d1b3e9fee685ec27a7d46cf2091703c59d0ad8 100644
@@ -24,7 +24,7 @@
 
 struct MOSDRepScrub : public MOSDFastDispatchOp {
 
-  static const int HEAD_VERSION = 7;
+  static const int HEAD_VERSION = 8;
   static const int COMPAT_VERSION = 6;
 
   spg_t pgid;             // PG to scrub
@@ -36,6 +36,7 @@ struct MOSDRepScrub : public MOSDFastDispatchOp {
   hobject_t end;         // upper bound of scrub, exclusive
   bool deep;             // true if scrub should be deep
   uint32_t seed;         // seed value for digest calculation
+  bool allow_preemption = false;
 
   epoch_t get_map_epoch() const override {
     return map_epoch;
@@ -54,7 +55,8 @@ struct MOSDRepScrub : public MOSDFastDispatchOp {
       seed(0) { }
 
   MOSDRepScrub(spg_t pgid, eversion_t scrub_to, epoch_t map_epoch, epoch_t min_epoch,
-               hobject_t start, hobject_t end, bool deep, uint32_t seed)
+               hobject_t start, hobject_t end, bool deep, uint32_t seed,
+              bool preemption)
     : MOSDFastDispatchOp(MSG_OSD_REP_SCRUB, HEAD_VERSION, COMPAT_VERSION),
       pgid(pgid),
       scrub_to(scrub_to),
@@ -64,7 +66,8 @@ struct MOSDRepScrub : public MOSDFastDispatchOp {
       start(start),
       end(end),
       deep(deep),
-      seed(seed) { }
+      seed(seed),
+      allow_preemption(preemption) { }
 
 
 private:
@@ -73,15 +76,17 @@ private:
 public:
   const char *get_type_name() const override { return "replica scrub"; }
   void print(ostream& out) const override {
-    out << "replica scrub(pg: ";
-    out << pgid << ",from:" << scrub_from << ",to:" << scrub_to
+    out << "replica_scrub(pg: "        << pgid
+       << ",from:" << scrub_from
+       << ",to:" << scrub_to
         << ",epoch:" << map_epoch << "/" << min_epoch
        << ",start:" << start << ",end:" << end
         << ",chunky:" << chunky
         << ",deep:" << deep
        << ",seed:" << seed
-        << ",version:" << header.version;
-    out << ")";
+        << ",version:" << header.version
+       << ",allow_preemption:" << (int)allow_preemption
+       << ")";
   }
 
   void encode_payload(uint64_t features) override {
@@ -97,6 +102,7 @@ public:
     encode(pgid.shard, payload);
     encode(seed, payload);
     encode(min_epoch, payload);
+    encode(allow_preemption, payload);
   }
   void decode_payload() override {
     bufferlist::iterator p = payload.begin();
@@ -115,6 +121,9 @@ public:
     } else {
       min_epoch = map_epoch;
     }
+    if (header.version >= 8) {
+      decode(allow_preemption, p);
+    }
   }
 };
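
Bumping HEAD_VERSION to 8 while leaving COMPAT_VERSION at 6 is the usual Ceph
wire-compatibility pattern for appending an optional field: new senders always
encode it, and decoders read it only when the sender's header version says it
is present. A sketch of the pattern with an illustrative type (not a real
message; encode()/decode() are the helpers from include/encoding.h):

    struct VersionedExample {
      static const int HEAD_VERSION = 8;    // bumped: a field was appended
      static const int COMPAT_VERSION = 6;  // unchanged: old peers still decode

      bool allow_preemption = false;        // the new trailing field

      void encode_payload(bufferlist& payload) {
        // ... encode all pre-existing fields first, in the old order ...
        encode(allow_preemption, payload);  // always appended by new senders
      }
      void decode_payload(bufferlist::iterator& p, int sender_version) {
        // ... decode all pre-existing fields first ...
        if (sender_version >= 8) {
          decode(allow_preemption, p);      // v7 senders omit it; default stands
        }
      }
    };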
 
diff --git a/src/messages/MOSDRepScrubMap.h b/src/messages/MOSDRepScrubMap.h
index 9cec29c6857de770b62f89d28e4976e3231ed039..04ed9fa09ec4fe369887b58f912ca62f392683e4 100644
 
 struct MOSDRepScrubMap : public MOSDFastDispatchOp {
 
-  static const int HEAD_VERSION = 1;
+  static const int HEAD_VERSION = 2;
   static const int COMPAT_VERSION = 1;
 
   spg_t pgid;            // primary spg_t
   epoch_t map_epoch = 0;
   pg_shard_t from;   // whose scrubmap this is
   bufferlist scrub_map_bl;
+  bool preempted = false;
 
   epoch_t get_map_epoch() const override {
     return map_epoch;
@@ -54,7 +55,8 @@ public:
   const char *get_type_name() const override { return "rep_scrubmap"; }
   void print(ostream& out) const override {
     out << "rep_scrubmap(" << pgid << " e" << map_epoch
-       << " from shard " << from << ")";
+       << " from shard " << from
+       << (preempted ? " PREEMPTED":"") << ")";
   }
 
   void encode_payload(uint64_t features) override {
@@ -62,12 +64,16 @@ public:
     encode(pgid, payload);
     encode(map_epoch, payload);
     encode(from, payload);
+    encode(preempted, payload);
   }
   void decode_payload() override {
     bufferlist::iterator p = payload.begin();
     decode(pgid, p);
     decode(map_epoch, p);
     decode(from, p);
+    if (header.version >= 2) {
+      decode(preempted, p);
+    }
   }
 };
 
diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
index d0df2a8a094d3c8c8be041d5912f4e57d5365316..d74fbf33ce7d05b871db29d6f65253b1f40df416 100644
@@ -759,6 +759,7 @@ bool ECBackend::_handle_message(
     // not conflict with ECSubWrite's operator<<.
     MOSDECSubOpWrite *op = static_cast<MOSDECSubOpWrite*>(
       _op->get_nonconst_req());
+    parent->maybe_preempt_replica_scrub(op->op.soid);
     handle_sub_write(op->op.from, _op, op->op, _op->pg_trace);
     return true;
   }
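
On erasure-coded pools every replica write arrives as an MOSDECSubOpWrite and
passes through this _handle_message branch, so this single hook (mirrored for
replicated pools in ReplicatedBackend::do_repop below) lets a replica notice
any write landing in its scrub range. Note the return value is ignored: a
replica only flags preemption, it never blocks the write.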
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 256a15a5ed19a1d424c338b52eff23c177bb34ef..8d8f7714d7e22ea0492d025cb6b3087b586310b5 100644
@@ -3859,6 +3859,10 @@ void PG::do_replica_scrub_map(OpRequestRef op)
           << dendl;
   assert(scrubber.waiting_on_whom.count(m->from));
   scrubber.waiting_on_whom.erase(m->from);
+  if (m->preempted) {
+    dout(10) << __func__ << " replica was preempted, setting flag" << dendl;
+    scrub_preempted = true;
+  }
   if (scrubber.waiting_on_whom.empty()) {
     if (ops_blocked_by_scrub()) {
       requeue_scrub(true);
@@ -3872,7 +3876,8 @@ void PG::do_replica_scrub_map(OpRequestRef op)
 void PG::_request_scrub_map(
   pg_shard_t replica, eversion_t version,
   hobject_t start, hobject_t end,
-  bool deep, uint32_t seed)
+  bool deep, uint32_t seed,
+  bool allow_preemption)
 {
   assert(replica != pg_whoami);
   dout(10) << "scrub  requesting scrubmap from osd." << replica
@@ -3881,7 +3886,8 @@ void PG::_request_scrub_map(
     spg_t(info.pgid.pgid, replica.shard), version,
     get_osdmap()->get_epoch(),
     get_last_peering_reset(),
-    start, end, deep, seed);
+    start, end, deep, seed,
+    allow_preemption);
   // default priority, we want the rep scrub processed prior to any recovery
   // or client io messages (we are holding a lock!)
   osd->send_message_osd_cluster(
@@ -4350,6 +4356,8 @@ void PG::replica_scrub(
   scrubber.deep = msg->deep;
   scrubber.epoch_start = info.history.same_interval_since;
 
+  scrub_can_preempt = msg->allow_preemption;
+  scrub_preempted = false;
   scrubber.replica_scrubmap_pos.reset();
 
   requeue_scrub(false);
@@ -4570,13 +4578,26 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
        }
 
        scrubber.seed = -1;
-
+       scrubber.preempt_left = cct->_conf->get_val<uint64_t>(
+         "osd_scrub_max_preemptions");
+       scrubber.preempt_divisor = 1;
         break;
 
       case PG::Scrubber::NEW_CHUNK:
         scrubber.primary_scrubmap = ScrubMap();
         scrubber.received_maps.clear();
 
+       // begin (possible) preemption window
+       if (scrub_preempted) {
+         scrubber.preempt_left--;
+         scrubber.preempt_divisor *= 2;
+         dout(10) << __func__ << " preempted, " << scrubber.preempt_left
+                  << " left" << dendl;
+         scrubber.state = PG::Scrubber::NEW_CHUNK;
+       }
+       scrub_preempted = false;
+       scrub_can_preempt = scrubber.preempt_left > 0;
+
         {
           /* get the start and end of our scrub chunk
           *
@@ -4676,14 +4697,14 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
          if (*i == pg_whoami) continue;
           _request_scrub_map(*i, scrubber.subset_last_update,
                              scrubber.start, scrubber.end, scrubber.deep,
-                            scrubber.seed);
+                            scrubber.seed,
+                            scrubber.preempt_left > 0);
           scrubber.waiting_on_whom.insert(*i);
         }
        dout(10) << __func__ << " waiting_on_whom " << scrubber.waiting_on_whom
                 << dendl;
 
         scrubber.state = PG::Scrubber::WAIT_PUSHES;
-
         break;
 
       case PG::Scrubber::WAIT_PUSHES:
@@ -4711,6 +4732,11 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
         assert(last_update_applied >= scrubber.subset_last_update);
 
         // build my own scrub map
+       if (scrub_preempted) {
+         dout(10) << __func__ << " preempted" << dendl;
+         scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
+         break;
+       }
        ret = build_scrub_map_chunk(
          scrubber.primary_scrubmap,
          scrubber.primary_scrubmap_pos,
@@ -4746,7 +4772,14 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
           // will be requeued by sub_op_scrub_map
           dout(10) << "wait for replicas to build scrub map" << dendl;
           done = true;
-        } else {
+         break;
+       }
+       // end (possible) preemption window
+       scrub_can_preempt = false;
+       if (scrub_preempted) {
+         dout(10) << __func__ << " preempted, restarting chunk" << dendl;
+         scrubber.state = PG::Scrubber::NEW_CHUNK;
+       } else {
           scrubber.state = PG::Scrubber::COMPARE_MAPS;
         }
         break;
@@ -4775,8 +4808,12 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
          break;
        }
 
+       scrubber.preempt_left = cct->_conf->get_val<uint64_t>(
+         "osd_scrub_max_preemptions");
+       scrubber.preempt_divisor = 1;
+
        if (!(scrubber.end.is_max())) {
-          scrubber.state = PG::Scrubber::NEW_CHUNK;
+         scrubber.state = PG::Scrubber::NEW_CHUNK;
          requeue_scrub();
           done = true;
         } else {
@@ -4799,12 +4836,17 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
 
       case PG::Scrubber::BUILD_MAP_REPLICA:
         // build my own scrub map
+       if (scrub_preempted) {
+         dout(10) << __func__ << " preempted" << dendl;
+         ret = 0;
+       } else {
          ret = build_scrub_map_chunk(
            scrubber.replica_scrubmap,
            scrubber.replica_scrubmap_pos,
            scrubber.start, scrubber.end,
            scrubber.deep, scrubber.seed,
            handle);
+       }
        if (ret == -EINPROGRESS) {
          requeue_scrub();
          done = true;
@@ -4816,11 +4858,14 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
            spg_t(info.pgid.pgid, get_primary().shard),
            scrubber.replica_scrub_start,
            pg_whoami);
+         reply->preempted = scrub_preempted;
          ::encode(scrubber.replica_scrubmap, reply->get_data());
          osd->send_message_osd_cluster(
            get_primary().osd, reply,
            scrubber.replica_scrub_start);
        }
+       scrub_preempted = false;
+       scrub_can_preempt = false;
        scrubber.state = PG::Scrubber::INACTIVE;
        scrubber.replica_scrubmap = ScrubMap();
        scrubber.replica_scrubmap_pos = ScrubMapBuilder();
@@ -4837,6 +4882,23 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
           << " [" << scrubber.start << "," << scrubber.end << ")" << dendl;
 }
 
+bool PG::write_blocked_by_scrub(const hobject_t& soid)
+{
+  if (soid < scrubber.start || soid >= scrubber.end) {
+    return false;
+  }
+  if (scrub_can_preempt) {
+    if (!scrub_preempted) {
+      dout(10) << __func__ << " " << soid << " preempted" << dendl;
+      scrub_preempted = true;
+    } else {
+      dout(10) << __func__ << " " << soid << " already preempted" << dendl;
+    }
+    return false;
+  }
+  return true;
+}
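
The preemption window on the primary spans NEW_CHUNK through receipt of the
last scrub map: scrub_can_preempt is raised while maps are built, and a write
into [scrubber.start, scrubber.end) flips scrub_preempted instead of blocking.
A self-contained condensation of just that window logic (hypothetical type;
the real code is the chunky_scrub state machine above):

    #include <cstdint>

    struct PrimaryScrubWindow {
      bool scrub_can_preempt = false;
      bool scrub_preempted = false;
      uint64_t preempt_left = 5;          // osd_scrub_max_preemptions
      uint64_t preempt_divisor = 1;

      void open_window_for_chunk() {      // NEW_CHUNK
        if (scrub_preempted) {            // previous attempt was preempted
          --preempt_left;
          preempt_divisor *= 2;           // scan a smaller range this time
        }
        scrub_preempted = false;
        scrub_can_preempt = preempt_left > 0;
      }

      bool close_window_and_check() {     // all replica maps received
        scrub_can_preempt = false;
        return scrub_preempted;           // true => redo the chunk
      }
    };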
+
 void PG::scrub_clear_state()
 {
   assert(is_locked());
diff --git a/src/osd/PG.h b/src/osd/PG.h
index d5a74d0f45bee6e3f73c2a993d03f17d26cb244d..5bf41c6e039f12a903ceeb9aaae21bc7f09ce165 100644
@@ -1517,6 +1517,8 @@ public:
     // deep scrub
     bool deep;
     uint32_t seed;
+    int preempt_left;
+    int preempt_divisor;
 
     list<Context*> callbacks;
     void add_callback(Context *context) {
@@ -1553,12 +1555,6 @@ public:
 
     bool is_chunky_scrub_active() const { return state != INACTIVE; }
 
-    // classic (non chunk) scrubs block all writes
-    // chunky scrubs only block writes to a range
-    bool write_blocked_by_scrub(const hobject_t &soid) {
-      return (soid >= start && soid < end);
-    }
-
     // clear all state
     void reset() {
       active = false;
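
The helper also moves from the Scrubber struct to PG itself: the range test is
unchanged, but the method now consults the PG-level scrub_can_preempt and
scrub_preempted flags, so every former scrubber.write_blocked_by_scrub()
caller in PrimaryLogPG below gains preemption support just by switching to the
PG method.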
@@ -1607,6 +1603,14 @@ protected:
 
   int active_pushes;
 
+  bool scrub_can_preempt = false;
+  bool scrub_preempted = false;
+
+  // we allow some number of preemptions of the scrub, which means we do
+  // not block.  then we start to block.  once we start blocking, we do
+  // not stop until the scrub range is completed.
+  bool write_blocked_by_scrub(const hobject_t &soid);
+
   void repair_object(
     const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
     pg_shard_t bad_peer);
@@ -1627,7 +1631,7 @@ protected:
     ThreadPool::TPHandle &handle);
   void _request_scrub_map(pg_shard_t replica, eversion_t version,
                           hobject_t start, hobject_t end, bool deep,
-                         uint32_t seed);
+                         uint32_t seed, bool allow_preemption);
   int build_scrub_map_chunk(
     ScrubMap &map,
     ScrubMapBuilder &pos,
diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
index bf57936940b6541d3a92b0d8d6397308acca6c6b..7db632ed6443a83c5e0c84fcb512b9c8061f07f2 100644
@@ -291,6 +291,7 @@ typedef ceph::shared_ptr<const OSDMap> OSDMapRef;
 
      virtual bool check_osdmap_full(const set<pg_shard_t> &missing_on) = 0;
 
+     virtual bool maybe_preempt_replica_scrub(const hobject_t& oid) = 0;
      virtual ~Listener() {}
    };
    Listener *parent;
diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc
index 7310c8699c940b04883d2cb68dc6760f4958f9be..5de76ba3f4928750e7ef664e5166806bf3e20857 100644
@@ -2059,7 +2059,7 @@ void PrimaryLogPG::do_op(OpRequestRef& op)
   }
 
   if (write_ordered && scrubber.is_chunky_scrub_active() &&
-      scrubber.write_blocked_by_scrub(head)) {
+      write_blocked_by_scrub(head)) {
     dout(20) << __func__ << ": waiting for scrub" << dendl;
     waiting_for_scrub.push_back(op);
     op->mark_delayed("waiting for scrub");
@@ -2407,7 +2407,7 @@ PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
        return cache_result_t::BLOCKED_RECOVERY;
       }
 
-      if (scrubber.write_blocked_by_scrub(head)) {
+      if (write_blocked_by_scrub(head)) {
        dout(20) << __func__ << ": waiting for scrub" << dendl;
        waiting_for_scrub.push_back(op);
        op->mark_delayed("waiting for scrub");
@@ -3580,7 +3580,7 @@ void PrimaryLogPG::promote_object(ObjectContextRef obc,
 {
   hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
   assert(hoid != hobject_t());
-  if (scrubber.write_blocked_by_scrub(hoid)) {
+  if (write_blocked_by_scrub(hoid)) {
     dout(10) << __func__ << " " << hoid
             << " blocked by scrub" << dendl;
     if (op) {
@@ -9646,7 +9646,7 @@ int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
   }
 
   if (!fop->blocking &&
-      scrubber.write_blocked_by_scrub(oid)) {
+      write_blocked_by_scrub(oid)) {
     if (fop->op) {
       dout(10) << __func__ << " blocked by scrub" << dendl;
       requeue_op(fop->op);
@@ -10385,7 +10385,7 @@ void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
     return;
   }
 
-  if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
+  if (write_blocked_by_scrub(obc->obs.oi.soid)) {
     dout(10) << "handle_watch_timeout waiting for scrub on obj "
             << obc->obs.oi.soid
             << dendl;
@@ -13109,7 +13109,7 @@ void PrimaryLogPG::hit_set_remove_all()
     // Once we hit a degraded object just skip
     if (is_degraded_or_backfilling_object(aoid))
       return;
-    if (scrubber.write_blocked_by_scrub(aoid))
+    if (write_blocked_by_scrub(aoid))
       return;
   }
 
@@ -13228,7 +13228,7 @@ void PrimaryLogPG::hit_set_persist()
     // Once we hit a degraded object just skip further trim
     if (is_degraded_or_backfilling_object(aoid))
       return;
-    if (scrubber.write_blocked_by_scrub(aoid))
+    if (write_blocked_by_scrub(aoid))
       return;
   }
 
@@ -13262,7 +13262,7 @@ void PrimaryLogPG::hit_set_persist()
     new_hset.using_gmt);
 
   // If the current object is degraded we skip this persist request
-  if (scrubber.write_blocked_by_scrub(oid))
+  if (write_blocked_by_scrub(oid))
     return;
 
   hit_set->seal();
@@ -13506,7 +13506,7 @@ bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
       osd->logger->inc(l_osd_agent_skip);
       continue;
     }
-    if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
+    if (write_blocked_by_scrub(obc->obs.oi.soid)) {
       dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
       osd->logger->inc(l_osd_agent_skip);
       continue;
diff --git a/src/osd/PrimaryLogPG.h b/src/osd/PrimaryLogPG.h
index 9e97c42e86c8713b65e030cb9dc26c62139e578c..39ebd5660f1b699373ca2b58eb9c55f6b7907e1a 100644
@@ -850,7 +850,7 @@ protected:
     if (!to_req.empty()) {
       // requeue at front of scrub blocking queue if we are blocked by scrub
       for (auto &&p: to_req) {
-       if (scrubber.write_blocked_by_scrub(p.first.get_head())) {
+       if (write_blocked_by_scrub(p.first.get_head())) {
           for (auto& op : p.second) {
             op->mark_delayed("waiting for scrub");
           }
@@ -1841,6 +1841,9 @@ public:
   void on_shutdown();
   bool check_failsafe_full(ostream &ss) override;
   bool check_osdmap_full(const set<pg_shard_t> &missing_on) override;
+  bool maybe_preempt_replica_scrub(const hobject_t& oid) override {
+    return write_blocked_by_scrub(oid);
+  }
   int rep_repair_primary_object(const hobject_t& soid, OpRequestRef op);
 
   // attr cache handling
diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc
index 43acf6af587d347418b2acfcf571a4a557bf90ca..769ff5ae1321de5499fb2cc9287598fdd6a09949 100644
@@ -1084,6 +1084,8 @@ void ReplicatedBackend::do_repop(OpRequestRef op)
   // we better not be missing this.
   assert(!parent->get_log().get_missing().is_missing(soid));
 
+  parent->maybe_preempt_replica_scrub(soid);
+
   int ackerosd = m->get_source().num();
 
   op->mark_started();
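
Taken together, the replica-side flow mirrors the primary's but never blocks;
condensed as comments (all names from the hunks above):

    // Replica-side flow, end to end:
    // 1. ReplicatedBackend::do_repop() / ECBackend::_handle_message() call
    //    parent->maybe_preempt_replica_scrub(soid); the result is ignored,
    //    so the write always proceeds on the replica.
    // 2. write_blocked_by_scrub() sees the oid inside [start, end) and,
    //    because scrub_can_preempt was set from msg->allow_preemption,
    //    sets scrub_preempted instead of reporting "blocked".
    // 3. BUILD_MAP_REPLICA notices scrub_preempted, skips building the
    //    chunk, and replies with MOSDRepScrubMap::preempted = true.
    // 4. The primary's do_replica_scrub_map() sets its own scrub_preempted,
    //    and NEW_CHUNK redoes the range with a doubled preempt_divisor.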