git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore: Add fsck procedure for bdev multi labels
author Adam Kupczyk <akupczyk@ibm.com>
Thu, 8 Feb 2024 22:28:22 +0000 (22:28 +0000)
committer Pere Diaz Bou <pere-altea@hotmail.com>
Fri, 23 Aug 2024 09:49:23 +0000 (11:49 +0200)
Now fsck can properly detect collisions between bdev labels and object data / BlueFS files.
Additional labels have lower precedence; they never overwrite other data.
If a label collides with object data, the object is moved somewhere else.
If a label collides with a BlueFS file, the collision is left unresolved.

Signed-off-by: Adam Kupczyk <akupczyk@ibm.com>
(cherry picked from commit 7ecaede175a13f600f50fd9c877132a4f130c321)

src/os/bluestore/BlueFS.cc
src/os/bluestore/BlueStore.cc
src/os/bluestore/BlueStore.h

index a8b1fb25ee85ee09528b080231e1813658949cf6..e199a9f8d9cc528c936423ff228f7f161aa0939f 100644 (file)
@@ -490,7 +490,7 @@ int BlueFS::add_block_device(unsigned id, const string& path, bool trim,
       break;
     case BDEV_DB:
     case BDEV_NEWDB:
-      reserved = DB_SUPER_RESERVED;
+      reserved = SUPER_RESERVED;
       break;
     case BDEV_SLOW:
       reserved = 0;
index b0c3c5b278533b048ea128282619e91e000799c4..f6fd2b3331a1fbbfb0031317fc5e9ce599e8161c 100644 (file)
@@ -10232,6 +10232,16 @@ int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
       depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
     << dendl;
 
+  {
+    string p = path + "/block";
+    int r = _read_main_bdev_label(cct, p, &bdev_label,
+      &bdev_label_valid_locations, &bdev_label_multi);
+    if (r < 0) {
+      derr << __func__ << " fsck error: no valid block device label found" << dendl;
+      return r;
+    }
+  }
+
   // in deep mode we need R/W write access to be able to replay deferred ops
   const bool read_only = !(repair || depth == FSCK_DEEP);
   int r = _open_db_and_around(read_only);
@@ -10292,6 +10302,8 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
   int64_t warnings = 0;
   unsigned repaired = 0;
 
+  std::vector<uint64_t> bdev_labels_broken;
+  std::vector<uint64_t> bdev_labels_in_repair;
   uint64_t_btree_t used_omap_head;
   uint64_t_btree_t used_sbids;
 
@@ -10319,6 +10331,26 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
 
   auto alloc_size = fm->get_alloc_size();
 
+  // Delayed action, we could not do it in _fsck().
+  if (bdev_label_multi) {
+    for (size_t i = 0; i < bdev_label_positions.size(); i++) {
+      uint64_t location = bdev_label_positions[i];
+      if (location > bdev->get_size()) {
+        continue;
+      }
+      if (std::find(
+        bdev_label_valid_locations.begin(),
+        bdev_label_valid_locations.end(),
+        location) == bdev_label_valid_locations.end()) {
+        derr << "fsck error: bdev label at 0x" << std::hex << location << std::dec
+             << " corrupted" << dendl;
+        errors++;
+        bdev_labels_broken.push_back(location);
+      }
+    }
+    // We have to wait for allocations check to know if we can fix.
+  }
+
   utime_t start = ceph_clock_now();
 
   _fsck_collections(&errors);
@@ -10342,8 +10374,46 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
 
   bluefs_used_blocks = used_blocks;
 
+  if (bdev_label_multi) {
+    // Forcibly mark regions of bdev label clones as used.
+    // If an object happens to be using it we will get an error and a repair applied.
+    // We can move away data only if it was allocated for object in BlueStore,
+    // we are unable to move away BlueFS data.
+
+    // skip first bdev label in this check
+    for (uint64_t position : bdev_labels_broken) {
+      uint64_t length = std::max<uint64_t>(BDEV_LABEL_BLOCK_SIZE, alloc_size);
+      bool is_taken_by_bluefs = false;
+      apply_for_bitset_range(position, length, alloc_size, bluefs_used_blocks,
+        [&](uint64_t pos, mempool_dynamic_bitset& bs) {
+          is_taken_by_bluefs |= bs.test_set(pos);
+        }
+      );
+      if (is_taken_by_bluefs) {
+        // We are unable to fix it.
+        dout(1) << "fsck bdev label at 0x" << std::hex << position << std::dec
+                <<  "taken by bluefs, cannot be fixed" << dendl;
+      } else {
+        if (repair) {
+          // Mark blocks so we could move offending objects away.
+          bdev_labels_in_repair.push_back(position);
+        }
+      }
+    }
+    // Mark bits or locations of all bdev labels.
+    for (size_t i = 0; i < bdev_label_positions.size(); i++) {
+      uint64_t position = bdev_label_positions[i];
+      uint64_t length = std::max<uint64_t>(BDEV_LABEL_BLOCK_SIZE, alloc_size);
+      apply_for_bitset_range(position, length, alloc_size, used_blocks,
+        [&](uint64_t pos, mempool_dynamic_bitset& bs) {
+          bs.set(pos);
+        }
+      );
+    }
+  }
+
   apply_for_bitset_range(
-    0, std::max<uint64_t>(min_alloc_size, DB_SUPER_RESERVED), alloc_size, used_blocks,
+    BDEV_LABEL_POSITION, std::max<uint64_t>(min_alloc_size, SUPER_RESERVED), alloc_size, used_blocks,
     [&](uint64_t pos, mempool_dynamic_bitset &bs) {
       bs.set(pos);
     }
@@ -10913,14 +10983,14 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
           [&](uint64_t pos, mempool_dynamic_bitset &bs) {
             ceph_assert(pos < bs.size());
             if (bs.test(pos) && !bluefs_used_blocks.test(pos)) {
-              if (offset == DB_SUPER_RESERVED &&
-                  length == min_alloc_size - DB_SUPER_RESERVED) {
+              if (offset == SUPER_RESERVED &&
+                  length == min_alloc_size - SUPER_RESERVED) {
                 // this is due to the change just after luminous to min_alloc_size
                 // granularity allocations, and our baked in assumption at the top
-                // of _fsck that 0~round_up_to(DB_SUPER_RESERVED,min_alloc_size) is used
-                // (vs luminous's round_up_to(DB_SUPER_RESERVED,block_size)).  harmless,
+                // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
+                // (vs luminous's round_up_to(SUPER_RESERVED,block_size)).  harmless,
                 // since we will never allocate this region below min_alloc_size.
-                dout(10) << __func__ << " ignoring free extent between DB_SUPER_RESERVED"
+                dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
                          << " and min_alloc_size, 0x" << std::hex << offset << "~"
                          << length << std::dec << dendl;
               } else {
@@ -10987,6 +11057,12 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
     repaired = repairer.apply(db);
     dout(5) << __func__ << " repair applied" << dendl;
   }
+  if (repair) {
+    // Now fix bdev_labels that were detected to be broken & repairable.
+    string p = path + "/block";
+    _write_bdev_label(cct, p, bdev_label, bdev_labels_in_repair);
+    repaired += bdev_labels_in_repair.size();
+  }
 
 out_scan:
   dout(2) << __func__ << " " << num_objects << " objects, "
@@ -13180,9 +13256,10 @@ ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator(
 // write helpers
 
 uint64_t BlueStore::_get_ondisk_reserved() const {
+  static_assert(BDEV_LABEL_POSITION == 0);
   ceph_assert(min_alloc_size);
-  return round_up_to(
-    std::max<uint64_t>(DB_SUPER_RESERVED, min_alloc_size), min_alloc_size);
+  uint64_t size = p2roundup(BDEV_LABEL_BLOCK_SIZE + BLUEFS_SUPER_BLOCK_SIZE, min_alloc_size);
+  return size;
 }
 
 void BlueStore::_prepare_ondisk_format_super(KeyValueDB::Transaction& t)
@@ -19701,7 +19778,7 @@ int BlueStore::read_allocation_from_onodes(SimpleBitmap *sbmap, read_alloc_stats
 int BlueStore::reconstruct_allocations(SimpleBitmap *sbmap, read_alloc_stats_t &stats)
 {
   // first set space used by superblock
-  auto super_length = std::max<uint64_t>(min_alloc_size, DB_SUPER_RESERVED);
+  auto super_length = std::max<uint64_t>(min_alloc_size, SUPER_RESERVED);
   set_allocation_in_simple_bmap(sbmap, 0, super_length);
   stats.extent_count++;
 
index 89bf15e4de08ec4921cae64d3e809dcaf84e17bf..89bf2a2b9ef9159d0ab2b8259b80eb591295603e 100644 (file)
@@ -52,6 +52,7 @@
 #include "os/ObjectStore.h"
 
 #include "bluestore_types.h"
+#include "bluestore_common.h"
 #include "BlueFS.h"
 #include "common/EventTrace.h"