]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: fadvise replica data we don't use
author: David Zafman <david.zafman@inktank.com>
Fri, 8 Mar 2013 03:36:07 +0000 (19:36 -0800)
committer: David Zafman <david.zafman@inktank.com>
Fri, 8 Mar 2013 03:36:07 +0000 (19:36 -0800)
Add a transaction flag indicating whether the op is being applied at a replica.
After the write has been synced, use posix_fadvise() to drop the written range from the kernel cache.

Feature: #2733

Signed-off-by: David Zafman <david.zafman@inktank.com>
Reviewed-by: Sam Just <sam.just@inktank.com>
src/os/FileStore.cc
src/os/FileStore.h
src/os/ObjectStore.h
src/osd/ReplicatedPG.cc

index b35bf160b07e6939bf0df70e3498fc98576d98d0..c15450b6429580e7611bbda8e9ba697d144944b4 100644 (file)
@@ -2348,10 +2348,11 @@ unsigned FileStore::_do_transaction(Transaction& t, uint64_t op_seq, int trans_n
        hobject_t oid = i.get_oid();
        uint64_t off = i.get_length();
        uint64_t len = i.get_length();
+       bool replica = i.get_replica();
        bufferlist bl;
        i.get_bl(bl);
        if (_check_replay_guard(cid, oid, spos) > 0)
-         r = _write(cid, oid, off, len, bl);
+         r = _write(cid, oid, off, len, bl, replica);
       }
       break;
       
@@ -2869,7 +2870,7 @@ int FileStore::_touch(coll_t cid, const hobject_t& oid)
 
 int FileStore::_write(coll_t cid, const hobject_t& oid, 
                      uint64_t offset, size_t len,
-                     const bufferlist& bl)
+                     const bufferlist& bl, bool replica)
 {
   dout(15) << "write " << cid << "/" << oid << " " << offset << "~" << len << dendl;
   int r;
@@ -2908,21 +2909,36 @@ int FileStore::_write(coll_t cid, const hobject_t& oid,
   // flush?
   {
     bool should_flush = (ssize_t)len >= m_filestore_flush_min;
+    bool local_flush = false;
+    bool async_done = false;
 #ifdef HAVE_SYNC_FILE_RANGE
     if (!should_flush ||
        !m_filestore_flusher ||
-       !queue_flusher(fd, offset, len)) {
-      if (should_flush && m_filestore_sync_flush)
+       !(async_done = queue_flusher(fd, offset, len, replica))) {
+      if (should_flush && m_filestore_sync_flush) {
        ::sync_file_range(fd, offset, len, SYNC_FILE_RANGE_WRITE);
-      lfn_close(fd);
+       local_flush = true;
+      }
     }
+    //Both lfn_close() and possible posix_fadvise() done by flusher
+    if (async_done) fd = -1;
 #else
     // no sync_file_range; (maybe) flush inline and close.
-    if (should_flush && m_filestore_sync_flush)
+    if (should_flush && m_filestore_sync_flush) {
       ::fdatasync(fd);
-    lfn_close(fd);
+      local_flush = true;
+    }
 #endif
+    if (local_flush && replica) {
+      int fa_r = posix_fadvise(fd, offset, len, POSIX_FADV_DONTNEED);
+      if (fa_r) {
+       dout(0) << "posic_fadvise failed: " << cpp_strerror(fa_r) << dendl;
+      } else {
+       dout(10) << "posix_fadvise performed after local flush" << dendl;
+      }
+    }
   }
+  if (fd >= 0) lfn_close(fd);
 
  out:
   dout(10) << "write " << cid << "/" << oid << " " << offset << "~" << len << " = " << r << dendl;
@@ -3211,7 +3227,7 @@ int FileStore::_clone_range(coll_t cid, const hobject_t& oldoid, const hobject_t
 }
 
 
-bool FileStore::queue_flusher(int fd, uint64_t off, uint64_t len)
+bool FileStore::queue_flusher(int fd, uint64_t off, uint64_t len, bool replica)
 {
   bool queued;
   lock.Lock();
@@ -3220,6 +3236,7 @@ bool FileStore::queue_flusher(int fd, uint64_t off, uint64_t len)
     flusher_queue.push_back(fd);
     flusher_queue.push_back(off);
     flusher_queue.push_back(len);
+    flusher_queue.push_back(replica);
     flusher_queue_len++;
     flusher_cond.Signal();
     dout(10) << "queue_flusher ep " << sync_epoch << " fd " << fd << " " << off << "~" << len
@@ -3259,13 +3276,23 @@ void FileStore::flusher_entry()
        q.pop_front();
        uint64_t len = q.front();
        q.pop_front();
+       bool replica = q.front();
+       q.pop_front();
        if (!stop && ep == sync_epoch) {
          dout(10) << "flusher_entry flushing+closing " << fd << " ep " << ep << dendl;
          ::sync_file_range(fd, off, len, SYNC_FILE_RANGE_WRITE);
+         if (replica) {
+           int fa_r = posix_fadvise(fd, off, len, POSIX_FADV_DONTNEED);
+           if (fa_r) {
+             dout(0) << "posic_fadvise failed: " << cpp_strerror(fa_r) << dendl;
+           } else {
+             dout(10) << "posix_fadvise performed after local flush" << dendl;
+           }
+         }
        } else 
          dout(10) << "flusher_entry JUST closing " << fd << " (stop=" << stop << ", ep=" << ep
                   << ", sync_epoch=" << sync_epoch << ")" << dendl;
-       TEMP_FAILURE_RETRY(::close(fd));
+       lfn_close(fd);
       }
       lock.Lock();
       flusher_queue_len -= num;   // they're definitely closed, forget
index e739447afc0bfa06224037433ea323e42157107c..d7c69837e403cd30bf1184d164a8944c57306048 100644 (file)
@@ -263,7 +263,7 @@ private:
       return 0;
     }
   } flusher_thread;
-  bool queue_flusher(int fd, uint64_t off, uint64_t len);
+  bool queue_flusher(int fd, uint64_t off, uint64_t len, bool replica);
 
   int open_journal();
 
@@ -372,7 +372,8 @@ public:
   int fiemap(coll_t cid, const hobject_t& oid, uint64_t offset, size_t len, bufferlist& bl);
 
   int _touch(coll_t cid, const hobject_t& oid);
-  int _write(coll_t cid, const hobject_t& oid, uint64_t offset, size_t len, const bufferlist& bl);
+  int _write(coll_t cid, const hobject_t& oid, uint64_t offset, size_t len, const bufferlist& bl,
+      bool replica = false);
   int _zero(coll_t cid, const hobject_t& oid, uint64_t offset, size_t len);
   int _truncate(coll_t cid, const hobject_t& oid, uint64_t size);
   int _clone(coll_t cid, const hobject_t& oldoid, const hobject_t& newoid,
index e88a67fe66b133bd542adac12f0767d8dbf3f1fd..2f3d73133949998af5ca03edf5696a622732d022 100644 (file)
@@ -165,11 +165,16 @@ public:
     bool sobject_encoding;
     int64_t pool_override;
     bool use_pool_override;
+    bool replica;
 
   public:
     void set_pool_override(int64_t pool) {
       pool_override = pool;
     }
+    void set_replica() {
+      replica = true;
+    }
+    bool get_replica() { return replica; }
 
     void swap(Transaction& other) {
       std::swap(ops, other.ops);
@@ -237,12 +242,14 @@ public:
       bool sobject_encoding;
       int64_t pool_override;
       bool use_pool_override;
+      bool replica;
 
       iterator(Transaction *t)
        : p(t->tbl.begin()),
          sobject_encoding(t->sobject_encoding),
          pool_override(t->pool_override),
-         use_pool_override(t->use_pool_override) {}
+         use_pool_override(t->use_pool_override),
+         replica(t->replica) {}
 
       friend class Transaction;
 
@@ -303,6 +310,7 @@ public:
        ::decode(bits, p);
        return bits;
       }
+      bool get_replica() { return replica; }
     };
 
     iterator begin() {
@@ -569,15 +577,15 @@ public:
     // etc.
     Transaction() :
       ops(0), pad_unused_bytes(0), largest_data_len(0), largest_data_off(0), largest_data_off_in_tbl(0),
-      sobject_encoding(false), pool_override(-1), use_pool_override(false) {}
+      sobject_encoding(false), pool_override(-1), use_pool_override(false), replica(false) {}
     Transaction(bufferlist::iterator &dp) :
       ops(0), pad_unused_bytes(0), largest_data_len(0), largest_data_off(0), largest_data_off_in_tbl(0),
-      sobject_encoding(false), pool_override(-1), use_pool_override(false) {
+      sobject_encoding(false), pool_override(-1), use_pool_override(false), replica(false) {
       decode(dp);
     }
     Transaction(bufferlist &nbl) :
       ops(0), pad_unused_bytes(0), largest_data_len(0), largest_data_off(0), largest_data_off_in_tbl(0),
-      sobject_encoding(false), pool_override(-1), use_pool_override(false) {
+      sobject_encoding(false), pool_override(-1), use_pool_override(false), replica(false) {
       bufferlist::iterator dp = nbl.begin();
       decode(dp); 
     }
index ea71f2b81e8af952fe3f5eb77e557ac35943c911..55aec87e897f029f726ad7615b2ebe5178dabec8 100644 (file)
@@ -4581,6 +4581,7 @@ void ReplicatedPG::sub_op_modify(OpRequestRef op)
        }
        rm->opt.set_pool_override(info.pgid.pool());
       }
+      rm->opt.set_replica();
       
       info.stats = m->pg_stats;
       if (!rm->opt.empty()) {