git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore: verify disk layout of BlueFS. 30109/head
author Radoslaw Zarzynski <rzarzyns@redhat.com>
Thu, 29 Aug 2019 10:17:34 +0000 (12:17 +0200)
committer Radoslaw Zarzynski <rzarzyns@redhat.com>
Thu, 5 Sep 2019 09:57:51 +0000 (11:57 +0200)
Fixes: http://tracker.ceph.com/issues/25098
Signed-off-by: Radoslaw Zarzynski <rzarzyns@redhat.com>
src/os/bluestore/BlueFS.cc
src/os/bluestore/BlueFS.h
src/os/bluestore/BlueStore.cc
src/os/bluestore/bluefs_types.cc
src/os/bluestore/bluefs_types.h
src/test/objectstore/test_bluefs.cc

index 4ea80634a13ac7f9598c955c9f3aca9b59c25394..e9e6c5a9d41e38f7fd6743e24df7004cca75c1fe 100644 (file)
@@ -446,7 +446,7 @@ int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents)
   return 0;
 }
 
-int BlueFS::mkfs(uuid_d osd_uuid)
+int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
 {
   std::unique_lock l(lock);
   dout(1) << __func__
@@ -490,6 +490,7 @@ int BlueFS::mkfs(uuid_d osd_uuid)
 
   // write supers
   super.log_fnode = log_file->fnode;
+  super.memorized_layout = layout;
   _write_super(BDEV_DB);
   flush_bdev();
 
@@ -616,6 +617,23 @@ int BlueFS::mount()
   return r;
 }
 
+int BlueFS::maybe_verify_layout(const bluefs_layout_t& layout) const
+{
+  if (super.memorized_layout) {
+    if (layout == *super.memorized_layout) {
+      dout(10) << __func__ << " bluefs layout verified positively" << dendl;
+    } else {
+      derr << __func__ << " memorized layout doesn't fit current one" << dendl;
+      return -EIO;
+    }
+  } else {
+    dout(10) << __func__ << " no memorized_layout in bluefs superblock"
+             << dendl;
+  }
+
+  return 0;
+}
+
 void BlueFS::umount()
 {
   dout(1) << __func__ << dendl;
@@ -693,7 +711,7 @@ int BlueFS::_write_super(int dev)
   dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl;
   dout(10) << __func__ << " superblock " << super.version << dendl;
   dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
-  ceph_assert(bl.length() <= get_super_length());
+  ceph_assert_always(bl.length() <= get_super_length());
   bl.append_zero(get_super_length() - bl.length());
 
   bdev[dev]->write(get_super_offset(), bl, false, WRITE_LIFE_SHORT);
index f458f8912658068b502b0022db793e306321033e..07e3334902e280bc746c638d52d8df2ddac644d9 100644 (file)
@@ -415,8 +415,9 @@ public:
   ~BlueFS();
 
   // the super is always stored on bdev 0
-  int mkfs(uuid_d osd_uuid);
+  int mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout);
   int mount();
+  int maybe_verify_layout(const bluefs_layout_t& layout) const;
   void umount();
   int prepare_new_device(int id);
   
index 6e5a135bf0efb966e672daf97036751391674da1..e29a0c5db84b890d42680bff5d3d929deec27f74 100644 (file)
@@ -5304,12 +5304,13 @@ int BlueStore::_open_bluefs(bool create)
     return r;
   }
   if (create) {
-    bluefs->mkfs(fsid);
+    bluefs->mkfs(fsid, bluefs_layout);
   }
   r = bluefs->mount();
   if (r < 0) {
     derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl;
   }
+  ceph_assert_always(bluefs->maybe_verify_layout(bluefs_layout) == 0);
   return r;
 }
 
index aad2fc11ed9d10a9adfaaed183528bfaea7317a9..eb51eca0ac099c693a1f87aa8d7758e2355566fa 100644 (file)
@@ -30,27 +30,58 @@ ostream& operator<<(ostream& out, const bluefs_extent_t& e)
             << std::dec;
 }
 
+// bluefs_layout_t
+
+void bluefs_layout_t::encode(bufferlist& bl) const
+{
+  ENCODE_START(1, 1, bl);
+  encode(shared_bdev, bl);
+  encode(dedicated_db, bl);
+  encode(dedicated_wal, bl);
+  ENCODE_FINISH(bl);
+}
+
+void bluefs_layout_t::decode(bufferlist::const_iterator& p)
+{
+  DECODE_START(1, p);
+  decode(shared_bdev, p);
+  decode(dedicated_db, p);
+  decode(dedicated_wal, p);
+  DECODE_FINISH(p);
+}
+
+void bluefs_layout_t::dump(Formatter *f) const
+{
+  f->dump_stream("shared_bdev") << shared_bdev;
+  f->dump_stream("dedicated_db") << dedicated_db;
+  f->dump_stream("dedicated_wal") << dedicated_wal;
+}
+
 // bluefs_super_t
 
 void bluefs_super_t::encode(bufferlist& bl) const
 {
-  ENCODE_START(1, 1, bl);
+  ENCODE_START(2, 1, bl);
   encode(uuid, bl);
   encode(osd_uuid, bl);
   encode(version, bl);
   encode(block_size, bl);
   encode(log_fnode, bl);
+  encode(memorized_layout, bl);
   ENCODE_FINISH(bl);
 }
 
 void bluefs_super_t::decode(bufferlist::const_iterator& p)
 {
-  DECODE_START(1, p);
+  DECODE_START(2, p);
   decode(uuid, p);
   decode(osd_uuid, p);
   decode(version, p);
   decode(block_size, p);
   decode(log_fnode, p);
+  if (struct_v >= 2) {
+    decode(memorized_layout, p);
+  }
   DECODE_FINISH(p);
 }
 
index 9046e511d9d0c3dbb36a795cf6862130c7f1840b..fde03842c8d053e8471e17923b3033a7c93d05cb 100644 (file)
@@ -3,6 +3,8 @@
 #ifndef CEPH_OS_BLUESTORE_BLUEFS_TYPES_H
 #define CEPH_OS_BLUESTORE_BLUEFS_TYPES_H
 
+#include <optional>
+
 #include "bluestore_types.h"
 #include "include/utime.h"
 #include "include/encoding.h"
@@ -138,7 +140,18 @@ struct bluefs_layout_t {
   bool single_shared_device() const {
     return !dedicated_db && !dedicated_wal;
   }
+
+  bool operator==(const bluefs_layout_t& other) const {
+    return shared_bdev == other.shared_bdev &&
+           dedicated_db == other.dedicated_db &&
+           dedicated_wal == other.dedicated_wal;
+  }
+
+  void encode(ceph::bufferlist& bl) const;
+  void decode(ceph::bufferlist::const_iterator& p);
+  void dump(Formatter *f) const;
 };
+WRITE_CLASS_ENCODER(bluefs_layout_t)
 
 struct bluefs_super_t {
   uuid_d uuid;      ///< unique to this bluefs instance
@@ -148,6 +161,8 @@ struct bluefs_super_t {
 
   bluefs_fnode_t log_fnode;
 
+  std::optional<bluefs_layout_t> memorized_layout;
+
   bluefs_super_t()
     : version(0),
       block_size(4096) { }
index 1e65a155fa3cf6c88143bbffa3dabaa7b9bb5cfc..4ce3ac7b2c70528430ef6ea265ca3592a0a0ec49 100644 (file)
@@ -61,7 +61,7 @@ TEST(BlueFS, mkfs) {
   BlueFS fs(g_ceph_context);
   ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
   fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
-  ASSERT_EQ(0, fs.mkfs(fsid));
+  ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
 }
 
 TEST(BlueFS, mkfs_mount) {
@@ -71,8 +71,9 @@ TEST(BlueFS, mkfs_mount) {
   ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
   fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
   uuid_d fsid;
-  ASSERT_EQ(0, fs.mkfs(fsid));
+  ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
   ASSERT_EQ(0, fs.mount());
+  ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
   ASSERT_EQ(fs.get_total(BlueFS::BDEV_DB), size - 1048576);
   ASSERT_LT(fs.get_free(BlueFS::BDEV_DB), size - 1048576);
   fs.umount();
@@ -85,8 +86,9 @@ TEST(BlueFS, write_read) {
   ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
   fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
   uuid_d fsid;
-  ASSERT_EQ(0, fs.mkfs(fsid));
+  ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
   ASSERT_EQ(0, fs.mount());
+  ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
   {
     BlueFS::FileWriter *h;
     ASSERT_EQ(0, fs.mkdir("dir"));
@@ -116,8 +118,9 @@ TEST(BlueFS, small_appends) {
   ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
   fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
   uuid_d fsid;
-  ASSERT_EQ(0, fs.mkfs(fsid));
+  ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
   ASSERT_EQ(0, fs.mount());
+  ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
   {
     BlueFS::FileWriter *h;
     ASSERT_EQ(0, fs.mkdir("dir"));
@@ -152,8 +155,9 @@ TEST(BlueFS, very_large_write) {
   ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
   fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
   uuid_d fsid;
-  ASSERT_EQ(0, fs.mkfs(fsid));
+  ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
   ASSERT_EQ(0, fs.mount());
+  ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
   char buf[1048571]; // this is biggish, but intentionally not evenly aligned
   for (unsigned i = 0; i < sizeof(buf); ++i) {
     buf[i] = i;
@@ -314,8 +318,9 @@ TEST(BlueFS, test_flush_1) {
   ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
   fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
   uuid_d fsid;
-  ASSERT_EQ(0, fs.mkfs(fsid));
+  ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
   ASSERT_EQ(0, fs.mount());
+  ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
   {
     std::vector<std::thread> write_thread_multiple;
     uint64_t effective_size = size - (32 * 1048576); // leaving the last 32 MB for log compaction
@@ -348,8 +353,9 @@ TEST(BlueFS, test_flush_2) {
   ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
   fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
   uuid_d fsid;
-  ASSERT_EQ(0, fs.mkfs(fsid));
+  ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
   ASSERT_EQ(0, fs.mount());
+  ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
   {
     uint64_t effective_size = size - (128 * 1048576); // leaving the last 32 MB for log compaction
     uint64_t per_thread_bytes = (effective_size/(NUM_WRITERS));
@@ -375,8 +381,9 @@ TEST(BlueFS, test_flush_3) {
   ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
   fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
   uuid_d fsid;
-  ASSERT_EQ(0, fs.mkfs(fsid));
+  ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
   ASSERT_EQ(0, fs.mount());
+  ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
   {
     std::vector<std::thread> write_threads;
     uint64_t effective_size = size - (64 * 1048576); // leaving the last 11 MB for log compaction
@@ -408,8 +415,9 @@ TEST(BlueFS, test_simple_compaction_sync) {
   ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
   fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
   uuid_d fsid;
-  ASSERT_EQ(0, fs.mkfs(fsid));
+  ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
   ASSERT_EQ(0, fs.mount());
+  ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
   {
     for (int i=0; i<10; i++) {
        string dir = "dir.";
@@ -460,8 +468,9 @@ TEST(BlueFS, test_simple_compaction_async) {
   ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
   fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
   uuid_d fsid;
-  ASSERT_EQ(0, fs.mkfs(fsid));
+  ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
   ASSERT_EQ(0, fs.mount());
+  ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
   {
     for (int i=0; i<10; i++) {
        string dir = "dir.";
@@ -515,8 +524,9 @@ TEST(BlueFS, test_compaction_sync) {
   ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
   fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
   uuid_d fsid;
-  ASSERT_EQ(0, fs.mkfs(fsid));
+  ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
   ASSERT_EQ(0, fs.mount());
+  ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
   {
     std::vector<std::thread> write_threads;
     uint64_t effective_size = size - (32 * 1048576); // leaving the last 32 MB for log compaction
@@ -552,8 +562,9 @@ TEST(BlueFS, test_compaction_async) {
   ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
   fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
   uuid_d fsid;
-  ASSERT_EQ(0, fs.mkfs(fsid));
+  ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
   ASSERT_EQ(0, fs.mount());
+  ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
   {
     std::vector<std::thread> write_threads;
     uint64_t effective_size = size - (32 * 1048576); // leaving the last 32 MB for log compaction
@@ -589,8 +600,9 @@ TEST(BlueFS, test_replay) {
   ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false));
   fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
   uuid_d fsid;
-  ASSERT_EQ(0, fs.mkfs(fsid));
+  ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
   ASSERT_EQ(0, fs.mount());
+  ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
   {
     std::vector<std::thread> write_threads;
     uint64_t effective_size = size - (32 * 1048576); // leaving the last 32 MB for log compaction
@@ -612,6 +624,7 @@ TEST(BlueFS, test_replay) {
   fs.umount();
   // remount and check log can replay safe?
   ASSERT_EQ(0, fs.mount());
+  ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
   fs.umount();
 }