git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore: check bluefs allocations when replaying log
author Igor Fedotov <ifedotov@suse.com>
Sun, 10 Nov 2019 23:35:04 +0000 (02:35 +0300)
committer Igor Fedotov <ifedotov@suse.com>
Mon, 18 Nov 2019 09:20:13 +0000 (12:20 +0300)
Signed-off-by: Igor Fedotov <ifedotov@suse.com>
src/common/legacy_config_opts.h
src/common/options.cc
src/os/bluestore/BlueFS.cc
src/os/bluestore/BlueFS.h
src/test/objectstore/test_bluefs.cc

index a30e206fd5c2a842dbd4de77b41415669e6da4b2..4fc78c79c83afaf8530d1434719bc116476d162f 100644 (file)
@@ -912,6 +912,7 @@ OPTION(bluefs_buffered_io, OPT_BOOL)
 OPTION(bluefs_sync_write, OPT_BOOL)
 OPTION(bluefs_allocator, OPT_STR)     // stupid | bitmap
 OPTION(bluefs_preextend_wal_files, OPT_BOOL)  // this *requires* that rocksdb has recycling enabled
+OPTION(bluefs_log_replay_check_allocations, OPT_BOOL)
 
 OPTION(bluestore_bluefs, OPT_BOOL)
 OPTION(bluestore_bluefs_env_mirror, OPT_BOOL) // mirror to normal Env for debug
index 046d62e916b45ff30d045ab771404d4fee445504..b4c8c28b4f8871a9808e6b2636a22f4d886f5f4b 100644 (file)
@@ -3973,6 +3973,10 @@ std::vector<Option> get_global_options() {
     .set_default(true)
     .set_description("Preextent rocksdb wal files on mkfs to avoid performance penalty"),
 
+    Option("bluefs_log_replay_check_allocations", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+      .set_default(true)
+      .set_description("Enables space allocations checking during log replay"),
+
     Option("bluestore_bluefs", Option::TYPE_BOOL, Option::LEVEL_DEV)
     .set_default(true)
     .set_flag(Option::FLAG_CREATE)
index 855a93bc4cdf3fa0e52e3b15670509e0172b3521..e156ca2485bf7563d6cbd46a89a5b27f58731d2b 100644 (file)
@@ -2,6 +2,8 @@
 // vim: ts=8 sw=2 smarttab
 
 #include "boost/algorithm/string.hpp" 
+#include <boost/dynamic_bitset.hpp>
+#include "bluestore_common.h"
 #include "BlueFS.h"
 
 #include "common/debug.h"
@@ -783,6 +785,15 @@ int BlueFS::_replay(bool noop, bool to_stdout)
     true);  // ignore eof
 
   bool seen_recs = false;
+
+  boost::dynamic_bitset<uint64_t> used_blocks[MAX_BDEV];
+  if (cct->_conf->bluefs_log_replay_check_allocations) {
+    for (size_t i = 0; i < MAX_BDEV; ++i) {
+      if (alloc_size[i] != 0 && bdev[i] != nullptr) {
+        used_blocks[i].resize(bdev[i]->get_size() / alloc_size[i]);
+      }
+    }
+  }
   while (true) {
     ceph_assert((log_reader->buf.pos & ~super.block_mask()) == 0);
     uint64_t pos = log_reader->buf.pos;
@@ -961,6 +972,24 @@ int BlueFS::_replay(bool noop, bool to_stdout)
          if (!noop) {
            block_all[id].insert(offset, length);
            alloc[id]->init_add_free(offset, length);
+
+            if (cct->_conf->bluefs_log_replay_check_allocations) {
+              bool fail = false;
+              apply(offset, length, alloc_size[id], used_blocks[id],
+                [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
+                  ceph_assert(pos < bs.size());
+                  
+                  if (bs.test(pos)) {
+                    fail = true;
+                  }
+                }
+              );
+              if (fail) {
+                derr << __func__ << " invalid extent " << id << ": 0x" << std::hex << offset << "~" << length
+                  << std::dec << ": already in use" << dendl;
+                return -EFAULT;
+              }
+            }
          }
        }
        break;
@@ -986,7 +1015,23 @@ int BlueFS::_replay(bool noop, bool to_stdout)
          if (!noop) {
            block_all[id].erase(offset, length);
            alloc[id]->init_rm_free(offset, length);
-         }
+            if (cct->_conf->bluefs_log_replay_check_allocations) {
+              bool fail = false;
+              apply(offset, length, alloc_size[id], used_blocks[id],
+                [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
+                  ceph_assert(pos < bs.size());
+                  if (bs.test(pos)) {
+                    fail = true;
+                  }
+                }
+              );
+              if (fail) {
+                derr << __func__ << " invalid extent " << id << ": 0x" << std::hex << offset << "~" << length
+                  << std::dec << ": still in use" << dendl;
+                return -EFAULT;
+              }
+            }
+          }
        }
        break;
 
@@ -1097,14 +1142,48 @@ int BlueFS::_replay(bool noop, bool to_stdout)
                       << ":  op_file_update " << " " << fnode << std::endl;
           }
 
-         if (!noop) {
+          if (!noop) {
            FileRef f = _get_file(fnode.ino);
+            if (cct->_conf->bluefs_log_replay_check_allocations) {
+              auto& fnode_extents = f->fnode.extents;
+              for (auto e : fnode_extents) {
+                auto id = e.bdev;
+                apply(e.offset, e.length, alloc_size[id], used_blocks[id],
+                  [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
+                    ceph_assert(pos < bs.size());
+                    bs.reset(pos);
+                  }
+                );
+              }
+            }
+
            f->fnode = fnode;
            if (fnode.ino > ino_last) {
              ino_last = fnode.ino;
            }
+            if (cct->_conf->bluefs_log_replay_check_allocations) {
+              auto& fnode_extents = f->fnode.extents;
+              for (auto e : fnode_extents) {
+                auto id = e.bdev;
+                bool fail = false;
+                apply(e.offset, e.length, alloc_size[id], used_blocks[id],
+                  [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
+                    ceph_assert(pos < bs.size());
+                    if (bs.test(pos)) {
+                      fail = true;
+                    }
+                    bs.set(pos);
+                  }
+                );
+                if (fail) {
+                  derr << __func__ << " invalid extent " << e.bdev << ": 0x" << std::hex << e.offset << "~" << e.length
+                    << std::dec << ": duplicate reference, ino " << fnode.ino << dendl;
+                  return -EFAULT;
+                }
+              }
+            }
          }
-       }
+        }
        break;
 
       case bluefs_transaction_t::OP_FILE_REMOVE:
@@ -1118,12 +1197,33 @@ int BlueFS::_replay(bool noop, bool to_stdout)
                       << ":  op_file_remove " << ino << std::endl;
           }
 
-         if (!noop) {
-           auto p = file_map.find(ino);
-           ceph_assert(p != file_map.end());
-           file_map.erase(p);
-         }
-       }
+          if (!noop) {
+            auto p = file_map.find(ino);
+            ceph_assert(p != file_map.end());
+            if (cct->_conf->bluefs_log_replay_check_allocations) {
+              auto& fnode_extents = p->second->fnode.extents;
+              for (auto e : fnode_extents) {
+                auto id = e.bdev;
+                bool fail = false;
+                apply(e.offset, e.length, alloc_size[id], used_blocks[id],
+                  [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
+                    ceph_assert(pos < bs.size());
+                    if (!bs.test(pos)) {
+                      fail = true;
+                    }
+                    bs.reset(pos);
+                  }
+                );
+                if (fail) {
+                  derr << __func__ << " invalid extent " << e.bdev << ": 0x" << std::hex << e.offset << "~" << e.length
+                    << std::dec << ": not in use while releasing for ino " << ino << dendl;
+                  return -EFAULT;
+                }
+              }
+            }
+            file_map.erase(p);
+          }
+        }
        break;
 
       default:
@@ -3190,3 +3290,15 @@ bool BlueFS::wal_is_rotational()
   }
   return bdev[BDEV_SLOW]->is_rotational();
 }
+
+void BlueFS::debug_inject_duplicate_gift(unsigned id,  // index into alloc; out-of-range values are ignored
+  uint64_t offset,  // start of the range passed to init_add_free()
+  uint64_t len)     // length of the range passed to init_add_free()
+{
+  dout(0) << __func__ << dendl;  // record that the injection hook was invoked
+  if (id < alloc.size() && alloc[id]) {  // no-op when there is no allocator at this index
+    // NOTE(review): the log record stays disabled -- //log_t.op_alloc_add(id, offset, len); -- so presumably the gift is never written to the BlueFS log; confirm
+    alloc[id]->init_add_free(offset, len);  // only the allocator is told about the range
+  }
+}
+
index 69506b80cdc843af844903480f9dcedfeb2ae8f8..e631b4ac14563656595931f351a52ac64476696f 100644 (file)
@@ -544,6 +544,8 @@ public:
     return _truncate(h, offset);
   }
 
+  /// Test-only hook: passes [offset, offset+len) to init_add_free() on the allocator selected by bdev.
+  void debug_inject_duplicate_gift(unsigned bdev, uint64_t offset, uint64_t len);
 };
 
 #endif
index 4ce3ac7b2c70528430ef6ea265ca3592a0a0ec49..a22b50a09e7ac4d9d83eba3af87f1b81e8cc6440 100644 (file)
@@ -79,6 +79,61 @@ TEST(BlueFS, mkfs_mount) {
   fs.umount();
 }
 
+TEST(BlueFS, mkfs_mount_duplicate_gift) {  // expects the final mount to fail after a duplicate gift was injected
+  uint64_t size = 1048576 * 128;  // 128 MiB backing device
+  TempBdev bdev{ size };
+  {  // phase 1: mkfs, mount, write dir/file1, umount
+    BlueFS fs(g_ceph_context);
+    ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
+    fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);  // everything past the first 1 MiB
+    uuid_d fsid;
+    ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+    ASSERT_EQ(0, fs.mount());
+
+    {
+      BlueFS::FileWriter *h;
+      ASSERT_EQ(0, fs.mkdir("dir"));
+      ASSERT_EQ(0, fs.open_for_write("dir", "file1", &h, false));
+      h->append("foo", 3);
+      h->append("bar", 3);
+      h->append("baz", 3);
+      fs.fsync(h);
+      fs.close_writer(h);
+    }
+
+    fs.umount();
+  }
+
+  {  // phase 2: remount (must still succeed), inject the gift, write dir/file2, umount
+    BlueFS fs(g_ceph_context);
+    ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
+    ASSERT_EQ(0, fs.mount());
+    // Hand the 1 MiB range at offset 5 MiB back to the allocator; presumably this is the extent backing file1.
+    fs.debug_inject_duplicate_gift(BlueFS::BDEV_DB, 5 * 1048576, 1048576);
+    {
+      // file2's data is expected to land on the range that file1 still references.
+      BlueFS::FileWriter *h;
+      ASSERT_EQ(0, fs.open_for_write("dir", "file2", &h, false));
+      h->append("foo", 3);
+      h->append("bar", 3);
+      h->append("baz", 3);
+      fs.fsync(h);
+      fs.close_writer(h);
+    }
+    fs.umount();
+  }
+
+  g_ceph_context->_conf.set_val_or_die("bluefs_log_replay_check_allocations", "true");  // make sure the replay check is on for the last mount
+  g_ceph_context->_conf.apply_changes(nullptr);
+
+  {  // phase 3: mount once more with the check enabled
+    // The mount is expected to return a non-zero result (see ASSERT_NE below).
+    BlueFS fs(g_ceph_context);
+    ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
+    ASSERT_NE(0, fs.mount());
+  }
+}
+
 TEST(BlueFS, write_read) {
   uint64_t size = 1048576 * 128;
   TempBdev bdev{size};