git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
ReplicatedPG: Don't cache recovery and scrub data 3595/head
author Haomai Wang <haomaiwang@gmail.com>
Sat, 27 Jun 2015 03:57:32 +0000 (11:57 +0800)
committer Haomai Wang <haomaiwang@gmail.com>
Fri, 14 Aug 2015 02:14:40 +0000 (10:14 +0800)
Signed-off-by: Haomai Wang <haomaiwang@gmail.com>
src/osd/ECBackend.cc
src/osd/PGBackend.h
src/osd/ReplicatedBackend.cc
src/osd/ReplicatedBackend.h
src/osd/ReplicatedPG.cc

index a64c0f2a079fe8c28c0a64adeecbb1223cf09f11..6cb63ecce12ec69218d43eb9da575a73a8cbad99 100644 (file)
@@ -1778,6 +1778,9 @@ void ECBackend::be_deep_scrub(
   if (stride % sinfo.get_chunk_size())
     stride += sinfo.get_chunk_size() - (stride % sinfo.get_chunk_size());
   uint64_t pos = 0;
+
+  uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
+
   while (true) {
     bufferlist bl;
     handle.reset_tp_timeout();
@@ -1787,7 +1790,7 @@ void ECBackend::be_deep_scrub(
        poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
       pos,
       stride, bl,
-      true);
+      fadvise_flags, true);
     if (r < 0)
       break;
     if (bl.length() % sinfo.get_chunk_size()) {
index 518fdf4be850b186cf830b0f51f75fd360ff7826..dad8224c8399e4a398f9c28cfa33ae062274995e 100644 (file)
     * the pending recovery operations.
     */
    struct RecoveryHandle {
+     bool cache_dont_need;
+
+     RecoveryHandle(): cache_dont_need(false) {}
      virtual ~RecoveryHandle() {}
    };
 
index 2015dd0b6f5e08ee820fd975ff564e3ce34e3a5b..73906b4bec60fb3b92b07d77fd1f7af8549d41d4 100644 (file)
@@ -742,13 +742,16 @@ void ReplicatedBackend::be_deep_scrub(
   bufferlist bl, hdrbl;
   int r;
   __u64 pos = 0;
+
+  uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
+
   while ( (r = store->read(
-            coll,
-            ghobject_t(
-              poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
-            pos,
-            cct->_conf->osd_deep_scrub_stride, bl,
-            true)) > 0) {
+             coll,
+             ghobject_t(
+               poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
+             pos,
+             cct->_conf->osd_deep_scrub_stride, bl,
+             fadvise_flags, true)) > 0) {
     handle.reset_tp_timeout();
     h << bl;
     pos += bl.length();
@@ -1518,6 +1521,7 @@ void ReplicatedBackend::prepare_pull(
   pi.head_ctx = headctx;
   pi.recovery_info = op.recovery_info;
   pi.recovery_progress = op.recovery_progress;
+  pi.cache_dont_need = h->cache_dont_need;
 }
 
 /*
@@ -1526,7 +1530,7 @@ void ReplicatedBackend::prepare_pull(
  */
 void ReplicatedBackend::prep_push_to_replica(
   ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer,
-  PushOp *pop)
+  PushOp *pop, bool cache_dont_need)
 {
   const object_info_t& oi = obc->obs.oi;
   uint64_t size = obc->obs.oi.size;
@@ -1582,7 +1586,7 @@ void ReplicatedBackend::prep_push_to_replica(
       data_subset, clone_subsets);
   }
 
-  prep_push(obc, soid, peer, oi.version, data_subset, clone_subsets, pop);
+  prep_push(obc, soid, peer, oi.version, data_subset, clone_subsets, pop, cache_dont_need);
 }
 
 void ReplicatedBackend::prep_push(ObjectContextRef obc,
@@ -1605,7 +1609,7 @@ void ReplicatedBackend::prep_push(
   eversion_t version,
   interval_set<uint64_t> &data_subset,
   map<hobject_t, interval_set<uint64_t> >& clone_subsets,
-  PushOp *pop)
+  PushOp *pop, bool cache_dont_need)
 {
   get_parent()->begin_peer_recover(peer, soid);
   // take note.
@@ -1627,7 +1631,7 @@ void ReplicatedBackend::prep_push(
                        pi.recovery_progress,
                        &new_progress,
                        pop,
-                       &(pi.stat));
+                       &(pi.stat), cache_dont_need);
   assert(r == 0);
   pi.recovery_progress = new_progress;
 }
@@ -1671,6 +1675,7 @@ void ReplicatedBackend::submit_push_data(
   ObjectRecoveryInfo &recovery_info,
   bool first,
   bool complete,
+  bool cache_dont_need,
   const interval_set<uint64_t> &intervals_included,
   bufferlist data_included,
   bufferlist omap_header,
@@ -1698,13 +1703,16 @@ void ReplicatedBackend::submit_push_data(
     t->omap_setheader(coll, ghobject_t(target_oid), omap_header);
   }
   uint64_t off = 0;
+  uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL;
+  if (cache_dont_need)
+    fadvise_flags |= CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
   for (interval_set<uint64_t>::const_iterator p = intervals_included.begin();
        p != intervals_included.end();
        ++p) {
     bufferlist bit;
     bit.substr_of(data_included, off, p.get_len());
     t->write(coll, ghobject_t(target_oid),
-            p.get_start(), p.get_len(), bit);
+            p.get_start(), p.get_len(), bit, fadvise_flags);
     off += p.get_len();
   }
 
@@ -1827,7 +1835,7 @@ bool ReplicatedBackend::handle_pull_response(
   bool complete = pi.is_complete();
 
   submit_push_data(pi.recovery_info, first,
-                  complete,
+                  complete, pi.cache_dont_need,
                   data_included, data,
                   pop.omap_header,
                   pop.attrset,
@@ -1871,6 +1879,7 @@ void ReplicatedBackend::handle_push(
   submit_push_data(pop.recovery_info,
                   first,
                   complete,
+                  true, // must be replicate
                   pop.data_included,
                   data,
                   pop.omap_header,
@@ -1950,7 +1959,8 @@ int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info,
                                     const ObjectRecoveryProgress &progress,
                                     ObjectRecoveryProgress *out_progress,
                                     PushOp *out_op,
-                                    object_stat_sum_t *stat)
+                                    object_stat_sum_t *stat,
+                                     bool cache_dont_need)
 {
   ObjectRecoveryProgress _new_progress;
   if (!out_progress)
@@ -2042,7 +2052,8 @@ int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info,
        ++p) {
     bufferlist bit;
     store->read(coll, ghobject_t(recovery_info.soid),
-                    p.get_start(), p.get_len(), bit);
+               p.get_start(), p.get_len(), bit,
+                cache_dont_need ? CEPH_OSD_OP_FLAG_FADVISE_DONTNEED: 0);
     if (p.get_len() != bit.length()) {
       dout(10) << " extent " << p.get_start() << "~" << p.get_len()
               << " is actually " << p.get_start() << "~" << bit.length()
@@ -2387,8 +2398,7 @@ int ReplicatedBackend::start_pushes(
       ++pushes;
       h->pushes[peer].push_back(PushOp());
       prep_push_to_replica(obc, soid, peer,
-                          &(h->pushes[peer].back())
-       );
+                          &(h->pushes[peer].back()), h->cache_dont_need);
     }
   }
   return pushes;
index 55bef8de5dabfcef48871a335a612dc5e4d45562..f66c461a86844626976e8c64574e8f77c8d3064e 100644 (file)
@@ -191,6 +191,7 @@ private:
     ObjectContextRef head_ctx;
     ObjectContextRef obc;
     object_stat_sum_t stat;
+    bool cache_dont_need;
 
     void dump(Formatter *f) const {
       {
@@ -262,10 +263,12 @@ private:
                    const ObjectRecoveryProgress &progress,
                    ObjectRecoveryProgress *out_progress,
                    PushOp *out_op,
-                   object_stat_sum_t *stat = 0);
+                   object_stat_sum_t *stat = 0,
+                    bool cache_dont_need = true);
   void submit_push_data(ObjectRecoveryInfo &recovery_info,
                        bool first,
                        bool complete,
+                       bool cache_dont_need,
                        const interval_set<uint64_t> &intervals_included,
                        bufferlist data_included,
                        bufferlist omap_header,
@@ -291,7 +294,7 @@ private:
     RPGHandle *h);
   void prep_push_to_replica(
     ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer,
-    PushOp *pop);
+    PushOp *pop, bool cache_dont_need = true);
   void prep_push(ObjectContextRef obc,
                 const hobject_t& oid, pg_shard_t dest,
                 PushOp *op);
@@ -300,7 +303,8 @@ private:
                 eversion_t version,
                 interval_set<uint64_t> &data_subset,
                 map<hobject_t, interval_set<uint64_t> >& clone_subsets,
-                PushOp *op);
+                PushOp *op,
+                 bool cache = false);
   void calc_head_subsets(ObjectContextRef obc, SnapSet& snapset, const hobject_t& head,
                         const pg_missing_t& missing,
                         const hobject_t &last_backfill,
index dd760487c8ef9c503b9e250b7776166a0fdbd7b0..d34ad92696b8202f7d8559f24c975e75f9e9ef82 100644 (file)
@@ -388,6 +388,7 @@ void ReplicatedPG::wait_for_unreadable_object(
   } else {
     dout(7) << "missing " << soid << " v " << v << ", recovering." << dendl;
     PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
+    h->cache_dont_need = false;
     if (is_missing_object(soid)) {
       recover_missing(soid, v, cct->_conf->osd_client_op_priority, h);
     } else {
@@ -464,6 +465,7 @@ void ReplicatedPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef
       }
     }
     PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
+    h->cache_dont_need = false;
     prep_object_replica_pushes(soid, v, h);
     pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
   }