git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore: fix startup vs device write pointers
author Sage Weil <sage@newdream.net>
Tue, 7 Sep 2021 18:53:06 +0000 (13:53 -0500)
committer Sage Weil <sage@newdream.net>
Fri, 29 Oct 2021 13:55:57 +0000 (09:55 -0400)
Compare freelist write pointers to the device on startup.

Signed-off-by: Sage Weil <sage@newdream.net>
src/os/bluestore/BlueStore.cc
src/os/bluestore/ZonedAllocator.cc
src/os/bluestore/ZonedAllocator.h
src/os/bluestore/zoned_types.h

index 2004f7286457ffee0bb14b37b75319884e96a79e..459f7070b1bca0697b704880db86625db4b9b72a 100644 (file)
@@ -5592,7 +5592,36 @@ int BlueStore::_init_alloc()
     ceph_assert(a);
     auto f = dynamic_cast<ZonedFreelistManager*>(fm);
     ceph_assert(f);
-    a->init_from_zone_pointers(f->get_zone_states(db),
+    vector<uint64_t> wp = bdev->get_zones();
+    vector<zone_state_t> zones = f->get_zone_states(db);
+    ceph_assert(wp.size() == zones.size());
+
+    // reconcile zone state
+    auto num_zones = bdev->get_size() / zone_size;
+    for (unsigned i = first_sequential_zone; i < num_zones; ++i) {
+      ceph_assert(wp[i] >= i * zone_size);
+      ceph_assert(wp[i] <= (i + 1) * zone_size); // pos might be at start of next zone
+      uint64_t p = wp[i] - i * zone_size;
+      if (zones[i].write_pointer > p) {
+       derr << __func__ << " zone 0x" << std::hex << i
+            << " bluestore write pointer 0x" << zones[i].write_pointer
+            << " > device write pointer 0x" << p
+            << std::dec << dendl;
+       ceph_abort("bad write pointer");
+      } else if (zones[i].write_pointer < p) {
+       // this is "normal" in that it can happen after any crash (if we have a
+       // write in flight but did not manage to commit the transaction)
+       auto delta = p - zones[i].write_pointer;
+       dout(1) << __func__ << " zone 0x" << std::hex << i
+                << " device write pointer 0x" << p
+                << " > bluestore pointer 0x" << zones[i].write_pointer
+                << ", advancing 0x" << delta << std::dec << dendl;
+       zones[i].num_dead_bytes += delta;
+       zones[i].write_pointer = p;
+      }
+    }
+
+    a->init_from_zone_pointers(zones,
                               &zoned_cleaner_lock,
                               &zoned_cleaner_cond);
     dout(1) << __func__
index 0ec8000e17b6eef0de05726f0c49b2e99d87475e..03326d2e5fc5b9140663e37e7c9c87bc453dd168 100644 (file)
@@ -151,7 +151,7 @@ void ZonedAllocator::dump(std::function<void(uint64_t offset,
 }
 
 void ZonedAllocator::init_from_zone_pointers(
-  std::vector<zone_state_t> &&_zone_states,
+  std::vector<zone_state_t> _zone_states,
   ceph::mutex *_cleaner_lock,
   ceph::condition_variable *_cleaner_cond)
 {
index 73697e4cfd5f011b3fa6f7d94239f49428728b50..903b62fd1a09a7da72f3e10b0ca41f2cf64b64ba 100644 (file)
@@ -96,7 +96,7 @@ public:
   void mark_zones_to_clean_free(void);
 
   void init_from_zone_pointers(
-    std::vector<zone_state_t> &&_zone_states,
+    std::vector<zone_state_t> _zone_states,
     ceph::mutex *_cleaner_lock,
     ceph::condition_variable *_cleaner_cond);
   void init_add_free(uint64_t offset, uint64_t length) override {}
index c92f2ada0996724e369a463a37a6e69f49fd9516..93d3138399a0c0d7f965a91224bb6f8140a906d3 100644 (file)
@@ -14,8 +14,8 @@
 // We use the same struct for an on-disk and in-memory representation of the
 // state.
 struct zone_state_t {
-  uint64_t num_dead_bytes = 0;
-  uint64_t write_pointer = 0;
+  uint64_t num_dead_bytes = 0;  ///< dead bytes deallocated (behind the write pointer)
+  uint64_t write_pointer = 0;   ///< relative offset within the zone
 
   void encode(ceph::buffer::list &bl) const {
     using ceph::encode;