osd: allow limited scrub preemption
author Sage Weil <sage@redhat.com>
Fri, 19 Jan 2018 17:29:19 +0000 (11:29 -0600)
committer David Zafman <dzafman@redhat.com>
Fri, 18 May 2018 16:37:56 +0000 (09:37 -0700)
If we receive a write within the scrub range, abort the scrub chunk and
shrink the chunk size.  If this happens too many times, stop preempting and
let the scrub complete, to avoid scrub starvation due to client I/O.

Signed-off-by: Sage Weil <sage@redhat.com>
(cherry picked from commit 6dd42392c0f00011059ffa5de74cace7d1e911bd)
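
In outline: the primary advertises allow_preemption when requesting replica
scrub maps; a write landing inside the current chunk's [start, end) range
flags the scrub preempted instead of blocking; the chunk is then restarted
smaller, and once the preemption budget is spent writes block as before.
A minimal sketch of the write-side decision (a hypothetical free-standing
rendering of the logic; the real code is PG::write_blocked_by_scrub in the
PG.cc hunk below):

    // Minimal sketch, using ints as a stand-in for hobject_t ordering.
    struct Chunk { int start, end; };  // current scrub chunk, end exclusive

    bool write_blocked_by_scrub(int soid, const Chunk& c,
                                bool can_preempt, bool& preempted) {
      if (soid < c.start || soid >= c.end)
        return false;                  // outside the chunk: write proceeds
      if (can_preempt) {
        preempted = true;              // abort the chunk instead of blocking
        return false;                  // ...and let the write through
      }
      return true;                     // budget spent: block until chunk done
    }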

Conflicts:
src/messages/MOSDRepScrub.h
src/messages/MOSDRepScrubMap.h
src/osd/PrimaryLogPG.cc
src/osd/PrimaryLogPG.h

- encode vs ::encode, etc.
- dragged in the waiting-for-scrub events from 508ea640e3b
- ignore the change in the chunked manifest code (which does not exist in luminous)

src/common/options.cc
src/messages/MOSDRepScrub.h
src/messages/MOSDRepScrubMap.h
src/osd/ECBackend.cc
src/osd/PG.cc
src/osd/PG.h
src/osd/PGBackend.h
src/osd/PrimaryLogPG.cc
src/osd/PrimaryLogPG.h
src/osd/ReplicatedBackend.cc

index 7f3bff84523143e888e643a403f21382e60230f0..3f7bc12367a955c71367ed4121566901ca1c0035 100644 (file)
@@ -2551,6 +2551,10 @@ std::vector<Option> get_global_options() {
     .set_default(5)
     .set_description(""),
 
+    Option("osd_scrub_max_preemptions", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(5)
+    .set_description("Set the maximum number of times we will preempt a deep scrub due to a client operation before blocking client IO to complete the scrub"),
+
     Option("osd_deep_scrub_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
     .set_default(7_day)
     .set_description(""),
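
For reference, the new knob is an ordinary OSD option and can be set like any
other, e.g. in ceph.conf (the value below is illustrative, not a
recommendation):

    [osd]
    # allow at most 3 preemptions before the scrub starts blocking writes
    osd_scrub_max_preemptions = 3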
index ce800aac068b3543fb981a595d0b27578912affd..63749e24f4cc9c542b9171f42ed1e5d26bce7b05 100644 (file)
@@ -24,7 +24,7 @@
 
 struct MOSDRepScrub : public MOSDFastDispatchOp {
 
-  static const int HEAD_VERSION = 7;
+  static const int HEAD_VERSION = 8;
   static const int COMPAT_VERSION = 6;
 
   spg_t pgid;             // PG to scrub
@@ -36,6 +36,7 @@ struct MOSDRepScrub : public MOSDFastDispatchOp {
   hobject_t end;         // upper bound of scrub, exclusive
   bool deep;             // true if scrub should be deep
   uint32_t seed;         // seed value for digest calculation
+  bool allow_preemption = false;
 
   epoch_t get_map_epoch() const override {
     return map_epoch;
@@ -54,7 +55,8 @@ struct MOSDRepScrub : public MOSDFastDispatchOp {
       seed(0) { }
 
   MOSDRepScrub(spg_t pgid, eversion_t scrub_to, epoch_t map_epoch, epoch_t min_epoch,
-               hobject_t start, hobject_t end, bool deep, uint32_t seed)
+               hobject_t start, hobject_t end, bool deep, uint32_t seed,
+              bool preemption)
     : MOSDFastDispatchOp(MSG_OSD_REP_SCRUB, HEAD_VERSION, COMPAT_VERSION),
       pgid(pgid),
       scrub_to(scrub_to),
@@ -64,7 +66,8 @@ struct MOSDRepScrub : public MOSDFastDispatchOp {
       start(start),
       end(end),
       deep(deep),
-      seed(seed) { }
+      seed(seed),
+      allow_preemption(preemption) { }
 
 
 private:
@@ -73,15 +76,17 @@ private:
 public:
   const char *get_type_name() const override { return "replica scrub"; }
   void print(ostream& out) const override {
-    out << "replica scrub(pg: ";
-    out << pgid << ",from:" << scrub_from << ",to:" << scrub_to
+    out << "replica_scrub(pg: "        << pgid
+       << ",from:" << scrub_from
+       << ",to:" << scrub_to
         << ",epoch:" << map_epoch << "/" << min_epoch
        << ",start:" << start << ",end:" << end
         << ",chunky:" << chunky
         << ",deep:" << deep
        << ",seed:" << seed
-        << ",version:" << header.version;
-    out << ")";
+        << ",version:" << header.version
+       << ",allow_preemption:" << (int)allow_preemption
+       << ")";
   }
 
   void encode_payload(uint64_t features) override {
@@ -96,6 +101,7 @@ public:
     ::encode(pgid.shard, payload);
     ::encode(seed, payload);
     ::encode(min_epoch, payload);
+    ::encode(allow_preemption, payload);
   }
   void decode_payload() override {
     bufferlist::iterator p = payload.begin();
@@ -114,6 +120,9 @@ public:
     } else {
       min_epoch = map_epoch;
     }
+    if (header.version >= 8) {
+      ::decode(allow_preemption, p);
+    }
   }
 };
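
The wire change follows the usual Ceph message-versioning pattern: bump
HEAD_VERSION, leave COMPAT_VERSION alone, append the new field at the end of
the payload, and gate its decode on the sender's header version so that v6/v7
peers interoperate. A toy model of that pattern (simplified types, not Ceph's
actual bufferlist API):

    #include <cstdint>
    #include <vector>

    struct WireMsg {
      static const int HEAD_VERSION = 8;    // bumped for allow_preemption
      static const int COMPAT_VERSION = 6;  // unchanged: old decoders still work
      bool allow_preemption = false;        // default when the field is absent

      void decode_payload(int sender_version,
                          std::vector<std::uint8_t>::const_iterator& p) {
        // ... fields present since v6 are decoded first, in order ...
        if (sender_version >= 8)
          allow_preemption = (*p++ != 0);   // appended field, v8 onward
      }
    };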
 
index f17bb0c2cf30f1da0cc73ffeb24977a0ec8d45a5..a3835914b45f45867b7545058ce4fb6d6b4f65ff 100644 (file)
 
 struct MOSDRepScrubMap : public MOSDFastDispatchOp {
 
-  static const int HEAD_VERSION = 1;
+  static const int HEAD_VERSION = 2;
   static const int COMPAT_VERSION = 1;
 
   spg_t pgid;            // primary spg_t
   epoch_t map_epoch = 0;
   pg_shard_t from;   // whose scrubmap this is
   bufferlist scrub_map_bl;
+  bool preempted = false;
 
   epoch_t get_map_epoch() const override {
     return map_epoch;
@@ -54,19 +55,24 @@ public:
   const char *get_type_name() const { return "rep_scrubmap"; }
   void print(ostream& out) const {
     out << "rep_scrubmap(" << pgid << " e" << map_epoch
-       << " from shard " << from << ")";
+       << " from shard " << from
+       << (preempted ? " PREEMPTED":"") << ")";
   }
 
   void encode_payload(uint64_t features) {
     ::encode(pgid, payload);
     ::encode(map_epoch, payload);
     ::encode(from, payload);
+    ::encode(preempted, payload);
   }
   void decode_payload() {
     bufferlist::iterator p = payload.begin();
     ::decode(pgid, p);
     ::decode(map_epoch, p);
     ::decode(from, p);
+    if (header.version >= 2) {
+      ::decode(preempted, p);
+    }
   }
 };
 
index 030d175fe6178753ddb60386eba6597e70cdd9d8..9f51b468bdb27b34dce24aa650470810bf9ea6c3 100644 (file)
@@ -763,6 +763,7 @@ bool ECBackend::_handle_message(
     // not conflict with ECSubWrite's operator<<.
     MOSDECSubOpWrite *op = static_cast<MOSDECSubOpWrite*>(
       _op->get_nonconst_req());
+    parent->maybe_preempt_replica_scrub(op->op.soid);
     handle_sub_write(op->op.from, _op, op->op, _op->pg_trace);
     return true;
   }
index c3b4db01966a55ce377fdbba1e75a67dbdcd3cfd..2bca3b5a9ed3c72df643c4c1d1a2e0d0cddc1350 100644 (file)
@@ -3792,6 +3792,10 @@ void PG::do_replica_scrub_map(OpRequestRef op)
           << dendl;
   assert(scrubber.waiting_on_whom.count(m->from));
   scrubber.waiting_on_whom.erase(m->from);
+  if (m->preempted) {
+    dout(10) << __func__ << " replica was preempted, setting flag" << dendl;
+    scrub_preempted = true;
+  }
   if (scrubber.waiting_on_whom.empty()) {
     if (ops_blocked_by_scrub()) {
       requeue_scrub(true);
@@ -3844,7 +3848,8 @@ void PG::sub_op_scrub_map(OpRequestRef op)
 void PG::_request_scrub_map(
   pg_shard_t replica, eversion_t version,
   hobject_t start, hobject_t end,
-  bool deep, uint32_t seed)
+  bool deep, uint32_t seed,
+  bool allow_preemption)
 {
   assert(replica != pg_whoami);
   dout(10) << "scrub  requesting scrubmap from osd." << replica
@@ -3853,7 +3858,8 @@ void PG::_request_scrub_map(
     spg_t(info.pgid.pgid, replica.shard), version,
     get_osdmap()->get_epoch(),
     get_last_peering_reset(),
-    start, end, deep, seed);
+    start, end, deep, seed,
+    allow_preemption);
   // default priority, we want the rep scrub processed prior to any recovery
   // or client io messages (we are holding a lock!)
   osd->send_message_osd_cluster(
@@ -4392,6 +4398,8 @@ void PG::replica_scrub(
   scrubber.deep = msg->deep;
   scrubber.epoch_start = info.history.same_interval_since;
 
+  scrub_can_preempt = msg->allow_preemption;
+  scrub_preempted = false;
   scrubber.replica_scrubmap_pos.reset();
 
   requeue_scrub(false);
@@ -4612,13 +4620,26 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
        }
 
        scrubber.seed = -1;
-
+       scrubber.preempt_left = cct->_conf->get_val<uint64_t>(
+         "osd_scrub_max_preemptions");
+       scrubber.preempt_divisor = 1;
         break;
 
       case PG::Scrubber::NEW_CHUNK:
         scrubber.primary_scrubmap = ScrubMap();
         scrubber.received_maps.clear();
 
+       // begin (possible) preemption window
+       if (scrub_preempted) {
+         scrubber.preempt_left--;
+         scrubber.preempt_divisor *= 2;
+         dout(10) << __func__ << " preempted, " << scrubber.preempt_left
+                  << " left" << dendl;
+         scrubber.state = PG::Scrubber::NEW_CHUNK;
+       }
+       scrub_preempted = false;
+       scrub_can_preempt = scrubber.preempt_left > 0;
+
         {
           /* get the start and end of our scrub chunk
           *
@@ -4718,14 +4739,14 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
          if (*i == pg_whoami) continue;
           _request_scrub_map(*i, scrubber.subset_last_update,
                              scrubber.start, scrubber.end, scrubber.deep,
-                            scrubber.seed);
+                            scrubber.seed,
+                            scrubber.preempt_left > 0);
           scrubber.waiting_on_whom.insert(*i);
         }
        dout(10) << __func__ << " waiting_on_whom " << scrubber.waiting_on_whom
                 << dendl;
 
         scrubber.state = PG::Scrubber::WAIT_PUSHES;
-
         break;
 
       case PG::Scrubber::WAIT_PUSHES:
@@ -4753,6 +4774,11 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
         assert(last_update_applied >= scrubber.subset_last_update);
 
         // build my own scrub map
+       if (scrub_preempted) {
+         dout(10) << __func__ << " preempted" << dendl;
+         scrubber.state = PG::Scrubber::BUILD_MAP_DONE;
+         break;
+       }
        ret = build_scrub_map_chunk(
          scrubber.primary_scrubmap,
          scrubber.primary_scrubmap_pos,
@@ -4788,7 +4814,14 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
           // will be requeued by sub_op_scrub_map
           dout(10) << "wait for replicas to build scrub map" << dendl;
           done = true;
-        } else {
+         break;
+       }
+       // end (possible) preemption window
+       scrub_can_preempt = false;
+       if (scrub_preempted) {
+         dout(10) << __func__ << " preempted, restarting chunk" << dendl;
+         scrubber.state = PG::Scrubber::NEW_CHUNK;
+       } else {
           scrubber.state = PG::Scrubber::COMPARE_MAPS;
         }
         break;
@@ -4817,8 +4850,12 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
          break;
        }
 
+       scrubber.preempt_left = cct->_conf->get_val<uint64_t>(
+         "osd_scrub_max_preemptions");
+       scrubber.preempt_divisor = 1;
+
        if (!(scrubber.end.is_max())) {
-          scrubber.state = PG::Scrubber::NEW_CHUNK;
+         scrubber.state = PG::Scrubber::NEW_CHUNK;
          requeue_scrub();
           done = true;
         } else {
@@ -4841,12 +4878,17 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
 
       case PG::Scrubber::BUILD_MAP_REPLICA:
         // build my own scrub map
+       if (scrub_preempted) {
+         dout(10) << __func__ << " preempted" << dendl;
+         ret = 0;
+       } else {
          ret = build_scrub_map_chunk(
            scrubber.replica_scrubmap,
            scrubber.replica_scrubmap_pos,
            scrubber.start, scrubber.end,
            scrubber.deep, scrubber.seed,
            handle);
+       }
        if (ret == -EINPROGRESS) {
          requeue_scrub();
          done = true;
@@ -4858,6 +4900,7 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
            spg_t(info.pgid.pgid, get_primary().shard),
            scrubber.replica_scrub_start,
            pg_whoami);
+         reply->preempted = scrub_preempted;
          ::encode(scrubber.replica_scrubmap, reply->get_data());
          osd->send_message_osd_cluster(
            get_primary().osd, reply,
@@ -4884,6 +4927,8 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
            get_primary().osd, subop,
            scrubber.replica_scrub_start);
        }
+       scrub_preempted = false;
+       scrub_can_preempt = false;
        scrubber.state = PG::Scrubber::INACTIVE;
        scrubber.replica_scrubmap = ScrubMap();
        scrubber.replica_scrubmap_pos = ScrubMapBuilder();
@@ -4900,6 +4945,23 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
           << " [" << scrubber.start << "," << scrubber.end << ")" << dendl;
 }
 
+bool PG::write_blocked_by_scrub(const hobject_t& soid)
+{
+  if (soid < scrubber.start || soid >= scrubber.end) {
+    return false;
+  }
+  if (scrub_can_preempt) {
+    if (!scrub_preempted) {
+      dout(10) << __func__ << " " << soid << " preempted" << dendl;
+      scrub_preempted = true;
+    } else {
+      dout(10) << __func__ << " " << soid << " already preempted" << dendl;
+    }
+    return false;
+  }
+  return true;
+}
+
 void PG::scrub_clear_state()
 {
   assert(is_locked());
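
Taken together, the default budget of 5 preemptions yields an exponential
backoff: preempt_divisor doubles on each restart, so successive preemptible
attempts at the chunk cover 1/1, 1/2, 1/4, 1/8, 1/16 of the configured chunk
size; the next attempt then blocks writes until it completes. A quick check
of that arithmetic (assuming, per the commit message, that NEW_CHUNK divides
the chunk size by preempt_divisor; chunk_max of 25 matches
osd_scrub_chunk_max's default):

    #include <algorithm>
    #include <cstdio>

    int main() {
      const int max_preemptions = 5;  // osd_scrub_max_preemptions default
      const int chunk_max = 25;       // osd_scrub_chunk_max default
      int divisor = 1;
      for (int left = max_preemptions; left > 0; --left) {
        std::printf("chunk <= %d objects, %d preemption(s) left\n",
                    std::max(1, chunk_max / divisor), left);
        divisor *= 2;                 // doubled after every preemption
      }
      std::printf("budget spent: the next attempt blocks client writes\n");
      return 0;
    }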
index c7d70d049675deab0ea966a97871a4343e4117b3..7637912a54c8ab96847f77c5221b7f8499f23d5c 100644 (file)
@@ -1275,6 +1275,8 @@ public:
     // deep scrub
     bool deep;
     uint32_t seed;
+    int preempt_left;
+    int preempt_divisor;
 
     list<Context*> callbacks;
     void add_callback(Context *context) {
@@ -1311,12 +1313,6 @@ public:
 
     bool is_chunky_scrub_active() const { return state != INACTIVE; }
 
-    // classic (non chunk) scrubs block all writes
-    // chunky scrubs only block writes to a range
-    bool write_blocked_by_scrub(const hobject_t &soid) {
-      return (soid >= start && soid < end);
-    }
-
     // clear all state
     void reset() {
       active = false;
@@ -1364,6 +1360,14 @@ public:
 
   int active_pushes;
 
+  bool scrub_can_preempt = false;
+  bool scrub_preempted = false;
+
+  // we allow some number of preemptions of the scrub, which means we do
+  // not block.  then we start to block.  once we start blocking, we do
+  // not stop until the scrub range is completed.
+  bool write_blocked_by_scrub(const hobject_t &soid);
+
   void repair_object(
     const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
     pg_shard_t bad_peer);
@@ -1385,7 +1389,7 @@ public:
     ThreadPool::TPHandle &handle);
   void _request_scrub_map(pg_shard_t replica, eversion_t version,
                           hobject_t start, hobject_t end, bool deep,
-                         uint32_t seed);
+                         uint32_t seed, bool allow_preemption);
   int build_scrub_map_chunk(
     ScrubMap &map,
     ScrubMapBuilder &pos,
index 80adb8060ebecb3fece2969d3443bdfefb907bc3..eb80b7ce294ff70e75fab394a9021dbeaaa8c6bb 100644 (file)
@@ -289,6 +289,7 @@ typedef ceph::shared_ptr<const OSDMap> OSDMapRef;
 
      virtual bool check_osdmap_full(const set<pg_shard_t> &missing_on) = 0;
 
+     virtual bool maybe_preempt_replica_scrub(const hobject_t& oid) = 0;
      virtual ~Listener() {}
    };
    Listener *parent;
index 98a998f5739cad229b7bf9f450cbf27dbae9cd57..99b2b1ca114f5d45c95ae84fc471ff4bd5693288 100644 (file)
@@ -2060,7 +2060,7 @@ void PrimaryLogPG::do_op(OpRequestRef& op)
   }
 
   if (write_ordered && scrubber.is_chunky_scrub_active() &&
-      scrubber.write_blocked_by_scrub(head)) {
+      write_blocked_by_scrub(head)) {
     dout(20) << __func__ << ": waiting for scrub" << dendl;
     waiting_for_scrub.push_back(op);
     op->mark_delayed("waiting for scrub");
@@ -3127,7 +3127,7 @@ void PrimaryLogPG::promote_object(ObjectContextRef obc,
 {
   hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
   assert(hoid != hobject_t());
-  if (scrubber.write_blocked_by_scrub(hoid)) {
+  if (write_blocked_by_scrub(hoid)) {
     dout(10) << __func__ << " " << hoid
             << " blocked by scrub" << dendl;
     if (op) {
@@ -9052,7 +9052,7 @@ int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
   }
 
   if (!fop->blocking &&
-      scrubber.write_blocked_by_scrub(oid)) {
+      write_blocked_by_scrub(oid)) {
     if (fop->op) {
       dout(10) << __func__ << " blocked by scrub" << dendl;
       requeue_op(fop->op);
@@ -9794,7 +9794,7 @@ void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
     return;
   }
 
-  if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
+  if (write_blocked_by_scrub(obc->obs.oi.soid)) {
     dout(10) << "handle_watch_timeout waiting for scrub on obj "
             << obc->obs.oi.soid
             << dendl;
@@ -12681,7 +12681,7 @@ void PrimaryLogPG::hit_set_remove_all()
     // Once we hit a degraded object just skip
     if (is_degraded_or_backfilling_object(aoid))
       return;
-    if (scrubber.write_blocked_by_scrub(aoid))
+    if (write_blocked_by_scrub(aoid))
       return;
   }
 
@@ -12800,7 +12800,7 @@ void PrimaryLogPG::hit_set_persist()
     // Once we hit a degraded object just skip further trim
     if (is_degraded_or_backfilling_object(aoid))
       return;
-    if (scrubber.write_blocked_by_scrub(aoid))
+    if (write_blocked_by_scrub(aoid))
       return;
   }
 
@@ -12834,7 +12834,7 @@ void PrimaryLogPG::hit_set_persist()
     new_hset.using_gmt);
 
   // If the current object is degraded we skip this persist request
-  if (scrubber.write_blocked_by_scrub(oid))
+  if (write_blocked_by_scrub(oid))
     return;
 
   hit_set->seal();
@@ -13078,7 +13078,7 @@ bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
       osd->logger->inc(l_osd_agent_skip);
       continue;
     }
-    if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
+    if (write_blocked_by_scrub(obc->obs.oi.soid)) {
       dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
       osd->logger->inc(l_osd_agent_skip);
       continue;
index e59f8c662daa6a9a473c855b786fe1f6483973d3..daeac1e6a6eaf08d28fa1cef36543f774b15c6c9 100644 (file)
@@ -829,7 +829,10 @@ protected:
     if (!to_req.empty()) {
       // requeue at front of scrub blocking queue if we are blocked by scrub
       for (auto &&p: to_req) {
-       if (scrubber.write_blocked_by_scrub(p.first.get_head())) {
+       if (write_blocked_by_scrub(p.first.get_head())) {
+          for (auto& op : p.second) {
+            op->mark_delayed("waiting for scrub");
+          }
          waiting_for_scrub.splice(
            waiting_for_scrub.begin(),
            p.second,
@@ -1785,6 +1788,9 @@ public:
   void on_shutdown() override;
   bool check_failsafe_full(ostream &ss) override;
   bool check_osdmap_full(const set<pg_shard_t> &missing_on) override;
+  bool maybe_preempt_replica_scrub(const hobject_t& oid) override {
+    return write_blocked_by_scrub(oid);
+  }
   int rep_repair_primary_object(const hobject_t& soid, OpRequestRef op);
 
   // attr cache handling
index a6ec2d313adbff21d99523c0d10e2de0021f709b..ba24c7a2639dff653bfa01e876344a27a24adba6 100644 (file)
@@ -1136,6 +1136,8 @@ void ReplicatedBackend::do_repop(OpRequestRef op)
   // we better not be missing this.
   assert(!parent->get_log().get_missing().is_missing(soid));
 
+  parent->maybe_preempt_replica_scrub(soid);
+
   int ackerosd = m->get_source().num();
 
   op->mark_started();
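
A closing note on the wiring: both replication backends funnel incoming
writes through the existing PGBackend::Listener interface, so the preemption
check lives in one place. ReplicatedBackend::do_repop (above) and
ECBackend::_handle_message both call parent->maybe_preempt_replica_scrub() on
the target object, and PrimaryLogPG implements that hook as a thin wrapper,
as shown in the PrimaryLogPG.h hunk:

    // From the patch: replicas reuse the same range test as the primary; on
    // a replica, scrub_can_preempt comes from the primary's allow_preemption
    // flag in MOSDRepScrub, and write_blocked_by_scrub() flags
    // scrub_preempted as a side effect when preemption is allowed.
    bool maybe_preempt_replica_scrub(const hobject_t& oid) override {
      return write_blocked_by_scrub(oid);
    }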