os/bluestore/BlueFS: periodically compact log

author Sage Weil <sage@redhat.com>

Thu, 10 Dec 2015 21:38:35 +0000 (16:38 -0500)

committer Sage Weil <sage@redhat.com>

Fri, 1 Jan 2016 18:06:55 +0000 (13:06 -0500)
author Sage Weil <sage@redhat.com>
Thu, 10 Dec 2015 21:38:35 +0000 (16:38 -0500)
committer Sage Weil <sage@redhat.com>
Fri, 1 Jan 2016 18:06:55 +0000 (13:06 -0500)
diff --git a/src/common/config_opts.h b/src/common/config_opts.h

index 0b32947140854a88344476b8c66eaebbe9d1a3a6..4822bb16d6b4e01372711721b38cce0d09fe59ec 100644 (file)
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -837,6 +837,8 @@ OPTION(bluefs_alloc_size, OPT_U64, 1048576)
  OPTION(bluefs_max_prefetch, OPT_U64, 1048576)
  OPTION(bluefs_min_log_runway, OPT_U64, 1048576)  // alloc when we get this low
  OPTION(bluefs_max_log_runway, OPT_U64, 4194304)  // alloc this much at a time
+OPTION(bluefs_log_compact_min_ratio, OPT_FLOAT, 5.0)      // min current/estimated log size ratio before we consider compacting
+OPTION(bluefs_log_compact_min_size, OPT_U64, 16*1048576)  // min log size (bytes) before we consider compacting
  
  OPTION(bluestore_bluefs, OPT_BOOL, false)
  OPTION(bluestore_bluefs_mirror, OPT_BOOL, false) // mirror to normal Env for debug
diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc

index 0321812d648cfc430873a2d1dc8f2a9f7cbeba3d..8e19e25eda77887fd94d15616c327e82e7093992 100644 (file)
--- a/src/os/bluestore/BlueFS.cc
+++ b/src/os/bluestore/BlueFS.cc
@@ -225,6 +225,14 @@ void BlueFS::umount()
    log_t.clear();
  }
  
+// Consistency-check entry point.  Nothing is actually verified here yet:
+// the call takes the lock, logs itself, and reports zero errors.
+int BlueFS::fsck()
+{
+  Mutex::Locker guard(lock);
+  dout(1) << __func__ << dendl;
+  // hrm, i think we check everything on mount...
+  const int errors = 0;
+  return errors;
+}
+
  int BlueFS::_write_super()
  {
    // build superblock
@@ -392,6 +400,17 @@ int BlueFS::_replay()
         assert(t.seq == 1);
         break;
  
+      case bluefs_transaction_t::OP_JUMP_SEQ:  // written by _compact_log() to carry log_seq across a rewrite
+        {
+         uint64_t next_seq;
+         ::decode(next_seq, p);
+         dout(20) << __func__ << " " << pos << ":  op_jump_seq "
+                  << next_seq << dendl;
+         assert(next_seq >= log_seq);  // the sequence may only move forward
+         log_seq = next_seq;  // adopt the recorded sequence number
+       }
+       break;
+
        case bluefs_transaction_t::OP_ALLOC_ADD:
          {
           __u8 id;
@@ -643,6 +662,126 @@ void BlueFS::_invalidate_cache(FileRef f, uint64_t offset, uint64_t length)
  #warning implement _invalidate_cache
  }
  
+// Estimate how many bytes the log would occupy if it were rewritten from
+// the current in-memory state.  The result is a rough heuristic built from
+// fixed per-entry guesses, rounded up to a multiple of super.block_size.
+// It is used by _maybe_compact_log() as the "expected" log size.
+uint64_t BlueFS::_estimate_log_size()
+{
+  int avg_dir_size = 40;  // fixme
+  int avg_file_size = 12;
+  uint64_t size = 4096 * 2;  // fixed base overhead
+  // one file-update record per file (presumably 1 op byte + the fnode)
+  size += file_map.size() * (1 + sizeof(bluefs_fnode_t));
+  // one alloc-add record per interval on every device; iterate by
+  // reference so each interval_set is not copied just to be counted
+  for (const auto& p : block_all)
+    size += p.num_intervals() * (1 + 1 + sizeof(uint64_t) * 2);
+  // one dir-create record per directory (count * per-entry cost, like the
+  // other terms)
+  size += dir_map.size() * (1 + avg_dir_size);
+  // one dir-link record per file
+  size += file_map.size() * (1 + avg_dir_size + avg_file_size);
+  return ROUND_UP_TO(size, super.block_size);
+}
+
+// Compact the log when it is both larger than bluefs_log_compact_min_size
+// and at least bluefs_log_compact_min_ratio times its estimated compacted
+// size; otherwise do nothing.
+void BlueFS::_maybe_compact_log()
+{
+  const uint64_t cur_size = log_writer->file->fnode.size;
+  const uint64_t est_size = _estimate_log_size();
+  const float ratio = (float)cur_size / (float)est_size;
+  dout(10) << __func__ << " current " << cur_size
+          << " expected " << est_size
+          << " ratio " << ratio << dendl;
+
+  // log is still too small to be worth rewriting
+  if (cur_size < g_conf->bluefs_log_compact_min_size)
+    return;
+  // log is not yet bloated enough relative to the estimate
+  if (ratio < g_conf->bluefs_log_compact_min_ratio)
+    return;
+
+  _compact_log();
+  // log_writer was replaced by _compact_log(), so re-read the size from it
+  dout(20) << __func__ << " done, actual " << log_writer->file->fnode.size
+          << " vs expected " << est_size << dendl;
+}
+
+// Rewrite the log as a single transaction that recreates the current
+// in-memory state (device allocations, file metadata, directories and
+// their links), write it into freshly allocated extents, point the
+// superblock at the new log, and finally release the old log's extents.
+//
+// Any ops pending in log_t are discarded at the start (see below).
+//
+// NOTE(review): the int results of _flush() and _write_super() are not
+// checked here; confirm whether a failure should stop us before the old
+// extents are released.
+void BlueFS::_compact_log()
+{
+#warning smarter _compact_log
+  // FIXME: we currently hold the lock while writing out the compacted log,
+  // which may mean a latency spike.  we could drop the lock while writing out
+  // the big compacted log, while continuing to log at the end of the old log
+  // file, and once it's done swap out the old log extents for the new ones.
+  dout(10) << __func__ << dendl;
+  File *log_file = log_writer->file.get();
+
+  // clear out log (be careful who calls us!!!)
+  log_t.clear();
+
+  // build one transaction describing everything we currently know
+  bluefs_transaction_t t;
+  t.seq = 1;
+  t.uuid = super.uuid;
+  dout(20) << __func__ << " op_init" << dendl;
+  t.op_init();
+  for (unsigned bdev = 0; bdev < block_all.size(); ++bdev) {
+    interval_set<uint64_t>& p = block_all[bdev];
+    for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
+      dout(20) << __func__ << " op_alloc_add " << bdev << " " << q.get_start()
+              << "~" << q.get_len() << dendl;
+      t.op_alloc_add(bdev, q.get_start(), q.get_len());
+    }
+  }
+  // iterate the maps by reference: copying each entry would copy the key
+  // and the ref-counted pointer on every pass for no benefit
+  for (const auto& p : file_map) {
+    if (p.first == 1)
+      continue;  // skip ino 1 (presumably the log file itself)
+    dout(20) << __func__ << " op_file_update " << p.second->fnode << dendl;
+    t.op_file_update(p.second->fnode);
+  }
+  for (const auto& p : dir_map) {
+    dout(20) << __func__ << " op_dir_create " << p.first << dendl;
+    t.op_dir_create(p.first);
+    for (const auto& q : p.second->file_map) {
+      dout(20) << __func__ << " op_dir_link " << p.first << "/" << q.first
+              << " to " << q.second->fnode.ino << dendl;
+      t.op_dir_link(p.first, q.first, q.second->fnode.ino);
+    }
+  }
+  // the transaction above carries seq 1; record the real sequence number
+  // so replay can pick it up again
+  dout(20) << __func__ << " op_jump_seq " << log_seq << dendl;
+  t.op_jump_seq(log_seq);
+
+  bufferlist bl;
+  ::encode(t, bl);
+  _pad_bl(bl);
+
+  // room for the compacted log plus some runway for future appends
+  uint64_t need = bl.length() + g_conf->bluefs_max_log_runway;
+  dout(20) << __func__ << " need " << need << dendl;
+
+  // detach the old extents (released at the very end) and allocate new ones
+  vector<bluefs_extent_t> old_extents;
+  old_extents.swap(log_file->fnode.extents);
+  while (log_file->fnode.get_allocated() < need) {
+    int r = _allocate(0, need - log_file->fnode.get_allocated(),
+                     &log_file->fnode.extents);
+    assert(r == 0);
+  }
+
+  // replace the writer and write the compacted log into the new extents
+  delete log_writer;
+
+  log_file->fnode.size = bl.length();
+  log_writer = new FileWriter(log_file);
+  log_writer->append(bl);
+  _flush(log_writer);
+
+  // publish the new log location in the superblock
+  dout(10) << __func__ << " writing super" << dendl;
+  super.log_fnode = log_file->fnode;
+  ++super.version;
+  _write_super();
+  _flush_bdev();
+
+  // only after the super has been written and flushed do we free the old log
+  dout(10) << __func__ << " release old log extents " << old_extents << dendl;
+  for (const auto& r : old_extents) {
+    alloc[r.bdev]->release(r.offset, r.length);
+  }
+}
+
+// Append zeros to bl so that its length becomes a multiple of
+// super.block_size.  A bufferlist that is already aligned is left untouched.
+void BlueFS::_pad_bl(bufferlist& bl)
+{
+  uint64_t partial = bl.length() % super.block_size;
+  if (partial) {
+    // the zero-filled tail covers the remainder of the last block
+    bufferptr z(super.block_size - partial);
+    dout(10) << __func__ << " padding with " << z.length() << " zeros" << dendl;
+    z.zero();
+    bl.append(z);
+  }
+}
+
  int BlueFS::_flush_log()
  {
    log_t.seq = ++log_seq;
@@ -665,15 +804,7 @@ int BlueFS::_flush_log()
    ::encode(log_t, bl);
  
    // pad to block boundary
-  uint64_t partial = bl.length() % super.block_size;
-  if (partial) {
-    bufferptr z(super.block_size - partial);
-    dout(10) << __func__ << " padding with " << z.length() << " zeros" << dendl;
-    z.zero();
-    bufferlist zbl;
-    zbl.append(z);
-    bl.append(z);
-  }
+  _pad_bl(bl);
    log_writer->append(bl);
  
    log_t.clear();
@@ -934,6 +1065,7 @@ void BlueFS::sync_metadata()
    utime_t end = ceph_clock_now(NULL);
    utime_t dur = end - start;
    dout(10) << __func__ << " done in " << dur << dendl;
+  _maybe_compact_log();
  }
  
  int BlueFS::open_for_write(
diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h

index c276fbacb9547b1915e1023a88da24fdd55b18a9..c983f510d644ed3595c44ce0743a7c0ddd2d41ee 100644 (file)
--- a/src/os/bluestore/BlueFS.h
+++ b/src/os/bluestore/BlueFS.h
@@ -149,15 +149,21 @@ private:
  
    void _init_alloc();
  
+  void _pad_bl(bufferlist& bl);  ///< pad bufferlist to block size w/ zeros
+
    FileRef _get_file(uint64_t ino);
    void _drop_link(FileRef f);
  
    int _allocate(unsigned bdev, uint64_t len, vector<bluefs_extent_t> *ev);
    int _flush_range(FileWriter *h, uint64_t offset, uint64_t length);
    int _flush(FileWriter *h);
-  int _flush_log();
    void _fsync(FileWriter *h);
  
+  int _flush_log();
+  uint64_t _estimate_log_size();
+  void _maybe_compact_log();
+  void _compact_log();
+
    void _submit_bdev();
    void _flush_bdev();
  
@@ -186,6 +192,8 @@ public:
    int mount(uint64_t super_offset_a, uint64_t super_offset_b);
    void umount();
  
+  int fsck();
+
    uint64_t get_total(unsigned id);
    uint64_t get_free(unsigned id);
  
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc

index 0d9249d161c2d2e629f397c6a9ab98e50d6ce706..4d9b2bc6d760889655f73c2935759db0bd86bef9 100644 (file)
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -951,19 +951,20 @@ int BlueStore::_open_db(bool create)
        rocksdb::Env *b = rocksdb::Env::Default();
        if (create) {
         string cmd = "rm -r " + path + "/db";
-       system(cmd.c_str());
+       int r = system(cmd.c_str());
+       (void)r;
        }
        env = new rocksdb::MirrorEnv(b, a);
      } else {
        env = new BlueRocksEnv(bluefs);
+
+      // simplify the dir names, too, as "seen" by rocksdb
+      strcpy(fn, "db");
      }
  
      if (create) {
        env->CreateDir(fn);
      }
-
-    // simplify the dir names, too, as "seen" by rocksdb
-    //strcpy(fn, "bluefs/db");
    } else if (create) {
      int r = ::mkdir(fn, 0755);
      if (r < 0)
@@ -1468,10 +1469,24 @@ int BlueStore::fsck()
    if (r < 0)
      goto out_db;
  
+  r = _open_super_meta();
+  if (r < 0)
+    goto out_alloc;
+
    r = _open_collections(&errors);
    if (r < 0)
      goto out_alloc;
  
+  if (bluefs) {
+    used_blocks.insert(0, g_conf->bluestore_bluefs_initial_offset); // fixme
+    used_blocks.insert(bluefs_extents);
+    r = bluefs->fsck();
+    if (r < 0)
+      goto out_alloc;
+    if (r > 0)
+      errors += r;
+  }
+
    // walk collections, objects
    for (ceph::unordered_map<coll_t, CollectionRef>::iterator p = coll_map.begin();
         p != coll_map.end() && !errors;
diff --git a/src/os/bluestore/bluefs_types.h b/src/os/bluestore/bluefs_types.h

index 48737c3176f73ea37ab324d40a544df834f574d9..c86f8b762e4527887f5c55b98a237d54bd841bc5 100644 (file)
--- a/src/os/bluestore/bluefs_types.h
+++ b/src/os/bluestore/bluefs_types.h
@@ -97,6 +97,7 @@ struct bluefs_transaction_t {
      OP_DIR_REMOVE,  ///< remove a dir (dirname)
      OP_FILE_UPDATE, ///< set/update file metadata (file)
      OP_FILE_REMOVE, ///< remove file (ino)
+    OP_JUMP_SEQ,    ///< jump the seq #
    } op_t;
  
    uuid_d uuid;          ///< fs uuid
@@ -154,6 +155,10 @@ struct bluefs_transaction_t {
      ::encode((__u8)OP_FILE_REMOVE, op_bl);
      ::encode(ino, op_bl);
    }
+  /// append an op that jumps the log sequence number to next_seq
+  void op_jump_seq(uint64_t next_seq) {
+    const __u8 opcode = OP_JUMP_SEQ;
+    ::encode(opcode, op_bl);
+    ::encode(next_seq, op_bl);
+  }
  
    void encode(bufferlist& bl) const;
    void decode(bufferlist::iterator& p);
author	Sage Weil <sage@redhat.com>
	Thu, 10 Dec 2015 21:38:35 +0000 (16:38 -0500)
committer	Sage Weil <sage@redhat.com>
	Fri, 1 Jan 2016 18:06:55 +0000 (13:06 -0500)
src/common/config_opts.h		patch \| blob \| history
src/os/bluestore/BlueFS.cc		patch \| blob \| history
src/os/bluestore/BlueFS.h		patch \| blob \| history
src/os/bluestore/BlueStore.cc		patch \| blob \| history
src/os/bluestore/bluefs_types.h		patch \| blob \| history