]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore: use dev's block size as a minimal BlueFS allocation unit.
authorIgor Fedotov <igor.fedotov@croit.io>
Fri, 28 Feb 2025 09:40:33 +0000 (12:40 +0300)
committerIgor Fedotov <igor.fedotov@croit.io>
Tue, 29 Apr 2025 09:27:38 +0000 (12:27 +0300)
Additionally, this locks the tail of DB/WAL volumes when it is unaligned to
the configured (not minimal!) BlueFS allocation unit.

Effectively replaces changes from
https://github.com/ceph/ceph/pull/57015

Fixes: https://tracker.ceph.com/issues/68772
Signed-off-by: Igor Fedotov <igor.fedotov@croit.io>
(cherry picked from commit effaa686f38b9eff2f7b9c8df2ffaf76c9a49aff)

src/os/bluestore/BlueFS.cc
src/os/bluestore/BlueFS.h
src/os/bluestore/bluefs_types.cc
src/os/bluestore/bluefs_types.h

index b7cfe352c887298b60ab9a634c5aaefb12fa21f4..ebc49cff7352605b60b21008edcf604f310afc8e 100644 (file)
@@ -187,9 +187,9 @@ BlueFS::BlueFS(CephContext* cct)
   : cct(cct),
     bdev(MAX_BDEV),
     ioc(MAX_BDEV),
-    block_reserved(MAX_BDEV),
     alloc(MAX_BDEV),
-    alloc_size(MAX_BDEV, 0)
+    alloc_size(MAX_BDEV, 0),
+    locked_alloc(MAX_BDEV)
 {
   dirty.pending_release.resize(MAX_BDEV);
   discard_cb[BDEV_WAL] = wal_discard_cb;
@@ -482,33 +482,28 @@ void BlueFS::_update_logger_stats()
 int BlueFS::add_block_device(unsigned id, const string& path, bool trim,
                              bluefs_shared_alloc_context_t* _shared_alloc)
 {
-  uint64_t reserved;
   string dev_name;
   switch(id) {
     case BDEV_WAL:
     case BDEV_NEWWAL:
-      reserved = BDEV_LABEL_BLOCK_SIZE;
       dev_name = "wal";
       break;
     case BDEV_DB:
     case BDEV_NEWDB:
-      reserved = SUPER_RESERVED;
       dev_name = "db";
       break;
     case BDEV_SLOW:
-      reserved = 0;
       dev_name = "slow";
       break;
     default:
       ceph_assert(false);
   }
   dout(10) << __func__ << " bdev " << id << " path " << path << " "
-           << " reserved " << reserved << dendl;
+           << dendl;
   ceph_assert(id < bdev.size());
   ceph_assert(bdev[id] == NULL);
   BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL,
                                       discard_cb[id], static_cast<void*>(this), dev_name.c_str());
-  block_reserved[id] = reserved;
   if (_shared_alloc) {
     b->set_no_exclusive_lock();
   }
@@ -614,6 +609,35 @@ uint64_t BlueFS::get_free(unsigned id)
   return alloc[id]->get_free();
 }
 
+uint64_t BlueFS::_get_minimal_reserved(unsigned id) const
+{
+  uint64_t reserved = 0;
+  switch(id) {
+    case BDEV_WAL:
+    case BDEV_NEWWAL:
+      reserved = BDEV_LABEL_BLOCK_SIZE;
+      break;
+    case BDEV_DB:
+    case BDEV_NEWDB:
+      reserved = SUPER_RESERVED;
+      break;
+    case BDEV_SLOW:
+      reserved = 0;
+      break;
+    default:
+      ceph_assert(false);
+  }
+  return reserved;
+}
+
+// Reserved bytes on device `id`: the locked extent length plus the minimal
+// reservation when the device has its own allocator, 0 when it is shared.
+uint64_t BlueFS::get_full_reserved(unsigned id)
+{
+  const bool shared = is_shared_alloc(id);
+  return shared ? 0 : locked_alloc[id].length + _get_minimal_reserved(id);
+}
+
 void BlueFS::dump_perf_counters(Formatter *f)
 {
   f->open_object_section("bluefs_perf_counters");
@@ -670,13 +694,13 @@ int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
   }
 
   _init_logger();
-  _init_alloc();
 
   super.version = 0;
   super.block_size = bdev[BDEV_DB]->get_block_size();
   super.osd_uuid = osd_uuid;
   super.uuid.generate_random();
-  dout(1) << __func__ << " uuid " << super.uuid << dendl;
+
+  _init_alloc();
 
   // init log
   FileRef log_file = ceph::make_ref<File>();
@@ -701,6 +725,7 @@ int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
   super.log_fnode = log_file->fnode;
   super.memorized_layout = layout;
   _write_super(BDEV_DB);
+  dout(1) << __func__ << " super " << super << dendl;
   _flush_bdev();
 
   // clean up
@@ -761,6 +786,8 @@ void BlueFS::_init_alloc()
       continue;
     }
     ceph_assert(bdev[id]->get_size());
+    locked_alloc[id] = bluefs_extent_t();
+
     if (is_shared_alloc(id)) {
       dout(1) << __func__ << " shared, id " << id << std::hex
               << ", capacity 0x" << bdev[id]->get_size()
@@ -774,21 +801,39 @@ void BlueFS::_init_alloc()
         name += devnames[id];
       else
         name += to_string(uintptr_t(this));
-      string alloc_type = cct->_conf->bluefs_allocator;
 
+      auto reserved = _get_minimal_reserved(id);
+      uint64_t locked_offs = 0;
+      {
+        // Try to lock tailing space at device if allocator controlled space
+        // isn't aligned with recommended alloc unit.
+        // Final decision whether locked tail to be maintained is made after
+        // BlueFS replay depending on existing allocations.
+        uint64_t size0 = _get_total(id);
+        uint64_t size = size0 - reserved;
+        size = p2align(size, alloc_size[id]) + reserved;
+        if (size < size0) {
+          locked_offs = size;
+          locked_alloc[id] = bluefs_extent_t(id, locked_offs, uint32_t(size0 - size));
+        }
+      }
+      string alloc_type = cct->_conf->bluefs_allocator;
       dout(1) << __func__ << " new, id " << id << std::hex
               << ", allocator name " << name
               << ", allocator type " << alloc_type
               << ", capacity 0x" << bdev[id]->get_size()
-              << ", reserved 0x" << block_reserved[id]
-              << ", block size 0x" << alloc_size[id]
+              << ", reserved 0x" << reserved
+              << ", locked 0x" << locked_alloc[id].offset
+              << "~" << locked_alloc[id].length
+              << ", block size 0x" << bdev[id]->get_block_size()
+              << ", alloc unit 0x" << alloc_size[id]
               << std::dec << dendl;
       alloc[id] = Allocator::create(cct, alloc_type,
                                    bdev[id]->get_size(),
-                                   alloc_size[id],
+                                   bdev[id]->get_block_size(),
                                    name);
-      auto reserved = block_reserved[id];
-      alloc[id]->init_add_free(reserved, _get_total(id) - reserved);
+      uint64_t free_len = locked_offs ? locked_offs : _get_total(id) - reserved;
+      alloc[id]->init_add_free(reserved, free_len);
     }
   }
 }
@@ -992,6 +1037,7 @@ int BlueFS::mount()
     derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
     goto out;
   }
+  dout(5) << __func__ << " super: " << super << dendl;
 
   // set volume selector if not provided before/outside
   if (vselector == nullptr) {
@@ -1021,6 +1067,20 @@ int BlueFS::mount()
         shared_alloc->bluefs_used += q.length;
         alloc[q.bdev]->init_rm_free(q.offset, q.length);
       } else if (!is_shared) {
+        if (locked_alloc[q.bdev].length) {
+          auto locked_offs = locked_alloc[q.bdev].offset;
+          if (q.offset + q.length > locked_offs) {
+            // we already have allocated extents in locked range,
+            // do not enforce this lock then.
+            bluefs_extent_t dummy;
+            std::swap(locked_alloc[q.bdev], dummy);
+            alloc[q.bdev]->init_add_free(dummy.offset, dummy.length);
+            dout(1) << __func__ << std::hex
+                    << " unlocked at " << q.bdev
+                    << " 0x" << dummy.offset << "~" << dummy.length
+                    << std::dec << dendl;
+          }
+        }
         alloc[q.bdev]->init_rm_free(q.offset, q.length);
       }
     }
@@ -1283,9 +1343,10 @@ int BlueFS::_replay(bool noop, bool to_stdout)
   bool seen_recs = false;
 
   boost::dynamic_bitset<uint64_t> used_blocks[MAX_BDEV];
+  bool check_allocations = cct->_conf->bluefs_log_replay_check_allocations;
 
   if (!noop) {
-    if (cct->_conf->bluefs_log_replay_check_allocations) {
+    if (check_allocations) {
       for (size_t i = 0; i < MAX_BDEV; ++i) {
        if (bdev[i] != nullptr) {
           // let's use minimal allocation unit we can have
@@ -1617,7 +1678,7 @@ int BlueFS::_replay(bool noop, bool to_stdout)
           }
           if (!noop) {
            FileRef f = _get_file(fnode.ino);
-           if (cct->_conf->bluefs_log_replay_check_allocations) {
+           if (check_allocations) {
               int r = _check_allocations(f->fnode,
                used_blocks, false, "OP_FILE_UPDATE");
               if (r < 0) {
@@ -1633,7 +1694,7 @@ int BlueFS::_replay(bool noop, bool to_stdout)
            if (fnode.ino > ino_last) {
              ino_last = fnode.ino;
            }
-            if (cct->_conf->bluefs_log_replay_check_allocations) {
+            if (check_allocations) {
               int r = _check_allocations(f->fnode,
                used_blocks, true, "OP_FILE_UPDATE");
               if (r < 0) {
@@ -1667,7 +1728,7 @@ int BlueFS::_replay(bool noop, bool to_stdout)
              // be leanient, if there is no extents just produce error message
              ceph_assert(delta.offset == fnode.allocated || delta.extents.empty());
            }
-           if (cct->_conf->bluefs_log_replay_check_allocations) {
+           if (check_allocations) {
               int r = _check_allocations(fnode,
                used_blocks, false, "OP_FILE_UPDATE_INC");
               if (r < 0) {
@@ -1692,7 +1753,7 @@ int BlueFS::_replay(bool noop, bool to_stdout)
            if (fnode.ino > ino_last) {
              ino_last = fnode.ino;
            }
-           if (cct->_conf->bluefs_log_replay_check_allocations) {
+           if (check_allocations) {
               int r = _check_allocations(f->fnode,
                used_blocks, true, "OP_FILE_UPDATE_INC");
               if (r < 0) {
@@ -1726,7 +1787,7 @@ int BlueFS::_replay(bool noop, bool to_stdout)
             auto p = nodes.file_map.find(ino);
             ceph_assert(p != nodes.file_map.end());
             vselector->sub_usage(p->second->vselector_hint, p->second->fnode);
-            if (cct->_conf->bluefs_log_replay_check_allocations) {
+            if (check_allocations) {
              int r = _check_allocations(p->second->fnode,
                used_blocks, false, "OP_FILE_REMOVE");
               if (r < 0) {
index c185b25301d1468b117e91cfba31aad981116d85..67a262aeaa11f531ccf992f3885809aaa49c17b0 100644 (file)
@@ -521,9 +521,12 @@ private:
    */
   std::vector<BlockDevice*> bdev;                  ///< block devices we can use
   std::vector<IOContext*> ioc;                     ///< IOContexts for bdevs
-  std::vector<uint64_t> block_reserved;            ///< starting reserve extent per device
   std::vector<Allocator*> alloc;                   ///< allocators for bdevs
   std::vector<uint64_t> alloc_size;                ///< alloc size for each device
+  std::vector<bluefs_extent_t> locked_alloc;       ///< candidate extents for locked alocations,
+                                                   ///< no alloc/release reqs matching these space
+                                                   ///< to be issued to allocator.
+
 
   //std::vector<interval_set<uint64_t>> block_unused_too_granular;
 
@@ -555,7 +558,7 @@ private:
 
   uint64_t _get_used(unsigned id) const;
   uint64_t _get_total(unsigned id) const;
-
+  uint64_t _get_minimal_reserved(unsigned id) const;
 
   FileRef _get_file(uint64_t ino);
   void _drop_link_DF(FileRef f);
@@ -711,6 +714,7 @@ public:
   uint64_t get_total(unsigned id);
   uint64_t get_free(unsigned id);
   uint64_t get_used(unsigned id);
+  uint64_t get_full_reserved(unsigned id);
   void dump_perf_counters(ceph::Formatter *f);
 
   void dump_block_extents(std::ostream& out);
index 8a574e9c3d6c2c6f5f738e36d0db3cd67dc28e22..0ba3a70d2e9ff9a1cf72861459bf5fe47c68a11b 100644 (file)
@@ -74,6 +74,8 @@ void bluefs_layout_t::generate_test_instances(list<bluefs_layout_t*>& ls)
 }
 
 // bluefs_super_t
+// Out-of-line default constructor: version 0, block size 4096.
+bluefs_super_t::bluefs_super_t() : version(0), block_size(4096) {}
 
 void bluefs_super_t::encode(bufferlist& bl) const
 {
index 99ce1c3c1461eeadec635a62a37eed9118708285..2d293d2a9ee4e2879c0bb35ce9efbde199d870a2 100644 (file)
@@ -220,9 +220,7 @@ struct bluefs_super_t {
 
   std::optional<bluefs_layout_t> memorized_layout;
 
-  bluefs_super_t()
-    : version(0),
-      block_size(4096) { }
+  bluefs_super_t();
 
   uint64_t block_mask() const {
     return ~((uint64_t)block_size - 1);