From 54ac2bedd2b5ae7b4a8bc6b74bf614796515fcad Mon Sep 17 00:00:00 2001 From: Radoslaw Zarzynski Date: Thu, 29 Aug 2019 12:17:34 +0200 Subject: [PATCH] os/bluestore: verify disk layout of BlueFS. Fixes: http://tracker.ceph.com/issues/25098 Signed-off-by: Radoslaw Zarzynski --- src/os/bluestore/BlueFS.cc | 22 ++++++++++++++-- src/os/bluestore/BlueFS.h | 3 ++- src/os/bluestore/BlueStore.cc | 3 ++- src/os/bluestore/bluefs_types.cc | 35 ++++++++++++++++++++++++-- src/os/bluestore/bluefs_types.h | 15 +++++++++++ src/test/objectstore/test_bluefs.cc | 39 +++++++++++++++++++---------- 6 files changed, 98 insertions(+), 19 deletions(-) diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 4ea80634a13..e9e6c5a9d41 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -446,7 +446,7 @@ int BlueFS::get_block_extents(unsigned id, interval_set *extents) return 0; } -int BlueFS::mkfs(uuid_d osd_uuid) +int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout) { std::unique_lock l(lock); dout(1) << __func__ @@ -490,6 +490,7 @@ int BlueFS::mkfs(uuid_d osd_uuid) // write supers super.log_fnode = log_file->fnode; + super.memorized_layout = layout; _write_super(BDEV_DB); flush_bdev(); @@ -616,6 +617,23 @@ int BlueFS::mount() return r; } +int BlueFS::maybe_verify_layout(const bluefs_layout_t& layout) const +{ + if (super.memorized_layout) { + if (layout == *super.memorized_layout) { + dout(10) << __func__ << " bluefs layout verified positively" << dendl; + } else { + derr << __func__ << " memorized layout doesn't fit current one" << dendl; + return -EIO; + } + } else { + dout(10) << __func__ << " no memorized_layout in bluefs superblock" + << dendl; + } + + return 0; +} + void BlueFS::umount() { dout(1) << __func__ << dendl; @@ -693,7 +711,7 @@ int BlueFS::_write_super(int dev) dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl; dout(10) << __func__ << " superblock " << super.version << dendl; dout(10) << 
__func__ << " log_fnode " << super.log_fnode << dendl; - ceph_assert(bl.length() <= get_super_length()); + ceph_assert_always(bl.length() <= get_super_length()); bl.append_zero(get_super_length() - bl.length()); bdev[dev]->write(get_super_offset(), bl, false, WRITE_LIFE_SHORT); diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index f458f891265..07e3334902e 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -415,8 +415,9 @@ public: ~BlueFS(); // the super is always stored on bdev 0 - int mkfs(uuid_d osd_uuid); + int mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout); int mount(); + int maybe_verify_layout(const bluefs_layout_t& layout) const; void umount(); int prepare_new_device(int id); diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 6e5a135bf0e..e29a0c5db84 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -5304,12 +5304,13 @@ int BlueStore::_open_bluefs(bool create) return r; } if (create) { - bluefs->mkfs(fsid); + bluefs->mkfs(fsid, bluefs_layout); } r = bluefs->mount(); if (r < 0) { derr << __func__ << " failed bluefs mount: " << cpp_strerror(r) << dendl; } + ceph_assert_always(bluefs->maybe_verify_layout(bluefs_layout) == 0); return r; } diff --git a/src/os/bluestore/bluefs_types.cc b/src/os/bluestore/bluefs_types.cc index aad2fc11ed9..eb51eca0ac0 100644 --- a/src/os/bluestore/bluefs_types.cc +++ b/src/os/bluestore/bluefs_types.cc @@ -30,27 +30,58 @@ ostream& operator<<(ostream& out, const bluefs_extent_t& e) << std::dec; } +// bluefs_layout_t + +void bluefs_layout_t::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + encode(shared_bdev, bl); + encode(dedicated_db, bl); + encode(dedicated_wal, bl); + ENCODE_FINISH(bl); +} + +void bluefs_layout_t::decode(bufferlist::const_iterator& p) +{ + DECODE_START(1, p); + decode(shared_bdev, p); + decode(dedicated_db, p); + decode(dedicated_wal, p); + DECODE_FINISH(p); +} + +void 
bluefs_layout_t::dump(Formatter *f) const +{ + f->dump_stream("shared_bdev") << shared_bdev; + f->dump_stream("dedicated_db") << dedicated_db; + f->dump_stream("dedicated_wal") << dedicated_wal; +} + // bluefs_super_t void bluefs_super_t::encode(bufferlist& bl) const { - ENCODE_START(1, 1, bl); + ENCODE_START(2, 1, bl); encode(uuid, bl); encode(osd_uuid, bl); encode(version, bl); encode(block_size, bl); encode(log_fnode, bl); + encode(memorized_layout, bl); ENCODE_FINISH(bl); } void bluefs_super_t::decode(bufferlist::const_iterator& p) { - DECODE_START(1, p); + DECODE_START(2, p); decode(uuid, p); decode(osd_uuid, p); decode(version, p); decode(block_size, p); decode(log_fnode, p); + if (struct_v >= 2) { + decode(memorized_layout, p); + } DECODE_FINISH(p); } diff --git a/src/os/bluestore/bluefs_types.h b/src/os/bluestore/bluefs_types.h index 9046e511d9d..fde03842c8d 100644 --- a/src/os/bluestore/bluefs_types.h +++ b/src/os/bluestore/bluefs_types.h @@ -3,6 +3,8 @@ #ifndef CEPH_OS_BLUESTORE_BLUEFS_TYPES_H #define CEPH_OS_BLUESTORE_BLUEFS_TYPES_H +#include <optional> + #include "bluestore_types.h" #include "include/utime.h" #include "include/encoding.h" @@ -138,7 +140,18 @@ struct bluefs_layout_t { bool single_shared_device() const { return !dedicated_db && !dedicated_wal; } + + bool operator==(const bluefs_layout_t& other) const { + return shared_bdev == other.shared_bdev && + dedicated_db == other.dedicated_db && + dedicated_wal == other.dedicated_wal; + } + + void encode(ceph::bufferlist& bl) const; + void decode(ceph::bufferlist::const_iterator& p); + void dump(Formatter *f) const; }; +WRITE_CLASS_ENCODER(bluefs_layout_t) struct bluefs_super_t { uuid_d uuid; ///< unique to this bluefs instance @@ -148,6 +161,8 @@ struct bluefs_super_t { bluefs_fnode_t log_fnode; + std::optional<bluefs_layout_t> memorized_layout; + bluefs_super_t() : version(0), block_size(4096) { } diff --git a/src/test/objectstore/test_bluefs.cc b/src/test/objectstore/test_bluefs.cc index 1e65a155fa3..4ce3ac7b2c7 100644 --- 
a/src/test/objectstore/test_bluefs.cc +++ b/src/test/objectstore/test_bluefs.cc @@ -61,7 +61,7 @@ TEST(BlueFS, mkfs) { BlueFS fs(g_ceph_context); ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576); - ASSERT_EQ(0, fs.mkfs(fsid)); + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); } TEST(BlueFS, mkfs_mount) { @@ -71,8 +71,9 @@ TEST(BlueFS, mkfs_mount) { ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576); uuid_d fsid; - ASSERT_EQ(0, fs.mkfs(fsid)); + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); ASSERT_EQ(fs.get_total(BlueFS::BDEV_DB), size - 1048576); ASSERT_LT(fs.get_free(BlueFS::BDEV_DB), size - 1048576); fs.umount(); @@ -85,8 +86,9 @@ TEST(BlueFS, write_read) { ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576); uuid_d fsid; - ASSERT_EQ(0, fs.mkfs(fsid)); + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); { BlueFS::FileWriter *h; ASSERT_EQ(0, fs.mkdir("dir")); @@ -116,8 +118,9 @@ TEST(BlueFS, small_appends) { ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576); uuid_d fsid; - ASSERT_EQ(0, fs.mkfs(fsid)); + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); { BlueFS::FileWriter *h; ASSERT_EQ(0, fs.mkdir("dir")); @@ -152,8 +155,9 @@ TEST(BlueFS, very_large_write) { ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 
1048576); uuid_d fsid; - ASSERT_EQ(0, fs.mkfs(fsid)); + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); char buf[1048571]; // this is biggish, but intentionally not evenly aligned for (unsigned i = 0; i < sizeof(buf); ++i) { buf[i] = i; @@ -314,8 +318,9 @@ TEST(BlueFS, test_flush_1) { ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576); uuid_d fsid; - ASSERT_EQ(0, fs.mkfs(fsid)); + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); { std::vector write_thread_multiple; uint64_t effective_size = size - (32 * 1048576); // leaving the last 32 MB for log compaction @@ -348,8 +353,9 @@ TEST(BlueFS, test_flush_2) { ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576); uuid_d fsid; - ASSERT_EQ(0, fs.mkfs(fsid)); + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); { uint64_t effective_size = size - (128 * 1048576); // leaving the last 32 MB for log compaction uint64_t per_thread_bytes = (effective_size/(NUM_WRITERS)); @@ -375,8 +381,9 @@ TEST(BlueFS, test_flush_3) { ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576); uuid_d fsid; - ASSERT_EQ(0, fs.mkfs(fsid)); + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); { std::vector write_threads; uint64_t effective_size = size - (64 * 1048576); // leaving the last 11 MB for log compaction @@ -408,8 +415,9 @@ TEST(BlueFS, 
test_simple_compaction_sync) { ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576); uuid_d fsid; - ASSERT_EQ(0, fs.mkfs(fsid)); + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); { for (int i=0; i<10; i++) { string dir = "dir."; @@ -460,8 +468,9 @@ TEST(BlueFS, test_simple_compaction_async) { ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576); uuid_d fsid; - ASSERT_EQ(0, fs.mkfs(fsid)); + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); { for (int i=0; i<10; i++) { string dir = "dir."; @@ -515,8 +524,9 @@ TEST(BlueFS, test_compaction_sync) { ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576); uuid_d fsid; - ASSERT_EQ(0, fs.mkfs(fsid)); + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); { std::vector write_threads; uint64_t effective_size = size - (32 * 1048576); // leaving the last 32 MB for log compaction @@ -552,8 +562,9 @@ TEST(BlueFS, test_compaction_async) { ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576); uuid_d fsid; - ASSERT_EQ(0, fs.mkfs(fsid)); + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); { std::vector write_threads; uint64_t effective_size = size - (32 * 1048576); // leaving the last 32 MB for log compaction @@ -589,8 +600,9 @@ TEST(BlueFS, test_replay) { ASSERT_EQ(0, 
fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576); uuid_d fsid; - ASSERT_EQ(0, fs.mkfs(fsid)); + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false })); ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); { std::vector write_threads; uint64_t effective_size = size - (32 * 1048576); // leaving the last 32 MB for log compaction @@ -612,6 +624,7 @@ TEST(BlueFS, test_replay) { fs.umount(); // remount and check log can replay safe? ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false })); fs.umount(); } -- 2.39.5