]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore: rework vselector calls
authorIgor Fedotov <igor.fedotov@croit.io>
Thu, 19 Oct 2023 11:57:23 +0000 (14:57 +0300)
committerIgor Fedotov <igor.fedotov@croit.io>
Tue, 15 Oct 2024 05:39:22 +0000 (08:39 +0300)
We can provide fnode delta to vseector now. Which is a bit more
effective.

Signed-off-by: Igor Fedotov <igor.fedotov@croit.io>
(cherry picked from commit 3e04ac48779f62394ebd9587298823bf6b20a570)

 Conflicts:
src/os/bluestore/BlueFS.cc
 * trival

src/os/bluestore/BlueFS.cc
src/os/bluestore/BlueFS.h
src/os/bluestore/BlueStore.cc
src/os/bluestore/BlueStore.h

index 4c6a2042ab8829a9d9ca70bc6fd01b91f7f5a23c..1c14cd24d1d29fc5baae82373e5ddeaec47f0d60 100644 (file)
@@ -1545,11 +1545,9 @@ int BlueFS::_replay(bool noop, bool to_stdout)
             }
             if (fnode.ino != 1) {
               vselector->sub_usage(f->vselector_hint, f->fnode);
-            }
+             vselector->add_usage(f->vselector_hint, fnode);
+           }
             f->fnode = fnode;
-            if (fnode.ino != 1) {
-              vselector->add_usage(f->vselector_hint, f->fnode);
-            }
 
            if (fnode.ino > ino_last) {
              ino_last = fnode.ino;
@@ -1791,7 +1789,7 @@ int BlueFS::device_migrate_to_existing(
 
       // write entire file
       auto l = _allocate(dev_target, bl.length(), 0,
-        &file_ref->fnode, 0, false);
+        &file_ref->fnode, nullptr, 0, false);
       if (l < 0) {
        derr << __func__ << " unable to allocate len 0x" << std::hex
             << bl.length() << std::dec << " from " << (int)dev_target
@@ -1931,7 +1929,7 @@ int BlueFS::device_migrate_to_new(
 
       // write entire file
       auto l = _allocate(dev_target, bl.length(), 0,
-        &file_ref->fnode, 0, false);
+        &file_ref->fnode, nullptr, 0, false);
       if (l < 0) {
        derr << __func__ << " unable to allocate len 0x" << std::hex
             << bl.length() << std::dec << " from " << (int)dev_target
@@ -2521,7 +2519,6 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool permit_dev_fallback,
   auto t0 = mono_clock::now();
 
   File *log_file = log.writer->file.get();
-  bluefs_fnode_t fnode_tail;
   // log.t.seq is always set to current live seq
   ceph_assert(log.t.seq == log.seq_live);
   // Capturing entire state. Dump anything that has been stored there.
@@ -2576,7 +2573,8 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool permit_dev_fallback,
 
   dout(20) << __func__ << " compacted_meta_need " << compacted_meta_need << dendl;
 
-  int r = _allocate(log_dev, compacted_meta_need, 0, &fnode_tail, 0,
+  bluefs_fnode_t fnode_tail;
+  int r = _allocate(log_dev, compacted_meta_need, 0, &fnode_tail, nullptr, 0,
     permit_dev_fallback);
   ceph_assert(r == 0);
 
@@ -2587,7 +2585,7 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool permit_dev_fallback,
   uint64_t starter_need = _make_initial_transaction(starter_seq, fnode_tail, 0, nullptr);
 
   bluefs_fnode_t fnode_starter(log_file->fnode.ino, 0, mtime);
-  r = _allocate(log_dev, starter_need, 0, &fnode_starter, 0,
+  r = _allocate(log_dev, starter_need, 0, &fnode_starter, nullptr, 0,
     permit_dev_fallback);
   ceph_assert(r == 0);
 
@@ -3087,16 +3085,17 @@ void BlueFS::_extend_log(uint64_t amount) {
   }
   ll.release();
   uint64_t allocated_before_extension = log.writer->file->fnode.get_allocated();
-  vselector->sub_usage(log.writer->file->vselector_hint, log.writer->file->fnode);
   amount = round_up_to(amount, super.block_size);
   int r = _allocate(
       vselector->select_prefer_bdev(log.writer->file->vselector_hint),
       amount,
       0,
-      &log.writer->file->fnode);
+      &log.writer->file->fnode,
+      [&](const bluefs_extent_t& e) {
+        vselector->add_usage(log.writer->file->vselector_hint, e);
+      });
   ceph_assert(r == 0);
   dout(10) << "extended log by 0x" << std::hex << amount << " bytes " << dendl;
-  vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode);
 
   bluefs_transaction_t log_extend_transaction;
   log_extend_transaction.seq = log.t.seq;
@@ -3369,7 +3368,9 @@ int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length)
 
   dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
           << " 0x" << offset << "~" << length << std::dec
-          << " to " << h->file->fnode << dendl;
+          << " to " << h->file->fnode
+          << " hint " << h->file->vselector_hint
+           << dendl;
   if (h->file->deleted) {
     dout(10) << __func__ << "  deleted, no-op" << dendl;
     return 0;
@@ -3390,7 +3391,6 @@ int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length)
   ceph_assert(offset <= h->file->fnode.size);
 
   uint64_t allocated = h->file->fnode.get_allocated();
-  vselector->sub_usage(h->file->vselector_hint, h->file->fnode);
   // do not bother to dirty the file if we are overwriting
   // previously allocated extents.
   if (allocated < offset + length) {
@@ -3399,25 +3399,26 @@ int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length)
     int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint),
                      offset + length - allocated,
                       0,
-                     &h->file->fnode);
+                     &h->file->fnode,
+                     [&](const bluefs_extent_t& e) {
+                       vselector->add_usage(h->file->vselector_hint, e);
+                     });
     if (r < 0) {
       derr << __func__ << " allocated: 0x" << std::hex << allocated
            << " offset: 0x" << offset << " length: 0x" << length << std::dec
            << dendl;
-      vselector->add_usage(h->file->vselector_hint, h->file->fnode); // undo
       ceph_abort_msg("bluefs enospc");
       return r;
     }
     h->file->is_dirty = true;
   }
   if (h->file->fnode.size < offset + length) {
+    vselector->add_usage(h->file->vselector_hint, offset + length - h->file->fnode.size);
     h->file->fnode.size = offset + length;
     h->file->is_dirty = true;
   }
-
   dout(20) << __func__ << " file now, unflushed " << h->file->fnode << dendl;
   int res = _flush_data(h, offset, length, buffered);
-  vselector->add_usage(h->file->vselector_hint, h->file->fnode);
   return res;
 }
 
@@ -3672,10 +3673,9 @@ int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/
   _flush_bdev(h);
 
   std::lock_guard ll(log.lock);
-  vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size);
+  vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size - offset);
   h->file->fnode.size = offset;
   h->file->is_dirty = true;
-  vselector->add_usage(h->file->vselector_hint, h->file->fnode.size);
   log.t.op_file_update_inc(h->file->fnode);
   return 0;
 }
@@ -3769,6 +3769,7 @@ const char* BlueFS::get_device_name(unsigned id)
 int BlueFS::_allocate(uint8_t id, uint64_t len,
                      uint64_t alloc_unit,
                      bluefs_fnode_t* node,
+                      update_fn_t cb,
                       size_t alloc_attempts,
                       bool permit_dev_fallback)
 {
@@ -3852,6 +3853,7 @@ int BlueFS::_allocate(uint8_t id, uint64_t len,
                        len,
                        alloc_unit,
                        node,
+                      cb,
                        alloc_attempts,
                        permit_dev_fallback);
     } else if (permit_dev_fallback && id != BDEV_SLOW && alloc[id + 1]) {
@@ -3865,6 +3867,7 @@ int BlueFS::_allocate(uint8_t id, uint64_t len,
                        len,
                        0, // back to default alloc unit
                        node,
+                      cb,
                        alloc_attempts,
                        permit_dev_fallback);
     } else {
@@ -3884,9 +3887,12 @@ int BlueFS::_allocate(uint8_t id, uint64_t len,
   }
 
   for (auto& p : extents) {
-    node->append_extent(bluefs_extent_t(id, p.offset, p.length));
+    bluefs_extent_t e(id, p.offset, p.length);
+    node->append_extent(e);
+    if (cb) {
+      cb(e);
+    }
   }
-   
   return 0;
 }
 
@@ -3905,12 +3911,13 @@ int BlueFS::preallocate(FileRef f, uint64_t off, uint64_t len)/*_LF*/
   if (off + len > allocated) {
     uint64_t want = off + len - allocated;
 
-    vselector->sub_usage(f->vselector_hint, f->fnode);
     int r = _allocate(vselector->select_prefer_bdev(f->vselector_hint),
       want,
       0,
-      &f->fnode);
-    vselector->add_usage(f->vselector_hint, f->fnode);
+      &f->fnode,
+      [&](const bluefs_extent_t& e) {
+       vselector->add_usage(f->vselector_hint, e);
+      });
     if (r < 0)
       return r;
 
@@ -3967,7 +3974,6 @@ int BlueFS::open_for_write(
   _maybe_check_vselector_LNF();
   FileRef file;
   bool create = false;
-  bool truncate = false;
   mempool::bluefs::vector<bluefs_extent_t> pending_release_extents;
   {
   std::lock_guard ll(log.lock);
@@ -3994,10 +4000,12 @@ int BlueFS::open_for_write(
     }
     file = ceph::make_ref<File>();
     file->fnode.ino = ++ino_last;
+    file->vselector_hint = vselector->get_hint_by_dir(dirname);
     nodes.file_map[ino_last] = file;
     dir->file_map[string{filename}] = file;
     ++file->refs;
     create = true;
+    vselector->add_usage(file->vselector_hint, file->fnode.size, true); // update file count
     logger->set(l_bluefs_num_files, nodes.file_map.size());
   } else {
     // overwrite existing file?
@@ -4012,8 +4020,8 @@ int BlueFS::open_for_write(
               << " already exists, truncate + overwrite" << dendl;
       vselector->sub_usage(file->vselector_hint, file->fnode);
       file->fnode.size = 0;
+      vselector->add_usage(file->vselector_hint, file->fnode.size, true); // restore file count
       pending_release_extents.swap(file->fnode.extents);
-      truncate = true;
 
       file->fnode.clear_extents();
     }
@@ -4021,11 +4029,6 @@ int BlueFS::open_for_write(
   ceph_assert(file->fnode.ino > 1);
 
   file->fnode.mtime = ceph_clock_now();
-  file->vselector_hint = vselector->get_hint_by_dir(dirname);
-  if (create || truncate) {
-    vselector->add_usage(file->vselector_hint, file->fnode); // update file count
-  }
-
   dout(20) << __func__ << " mapping " << dirname << "/" << filename
           << " vsel_hint " << file->vselector_hint
           << dendl;
index 615ea49143e9804b268bdf88b954f4528a4184e7..aa43db8946f4e04ab7ab0f589489175c7e5548e6 100644 (file)
@@ -85,10 +85,22 @@ public:
   virtual void* get_hint_for_log() const = 0;
   virtual void* get_hint_by_dir(std::string_view dirname) const = 0;
 
-  virtual void add_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0;
-  virtual void sub_usage(void* file_hint, const bluefs_fnode_t& fnode) = 0;
-  virtual void add_usage(void* file_hint, uint64_t fsize) = 0;
-  virtual void sub_usage(void* file_hint, uint64_t fsize) = 0;
+  void add_usage(void* hint, const bluefs_fnode_t& fnode) {
+    for (auto& e : fnode.extents) {
+      add_usage(hint, e);
+    }
+    add_usage(hint, fnode.size, true);
+  }
+  void sub_usage(void* hint, const bluefs_fnode_t& fnode) {
+    for (auto& e : fnode.extents) {
+      sub_usage(hint, e);
+    }
+    sub_usage(hint, fnode.size, true);
+  }
+  virtual void add_usage(void* hint, const bluefs_extent_t& extent) = 0;
+  virtual void sub_usage(void* hint, const bluefs_extent_t& extent) = 0;
+  virtual void add_usage(void* hint, uint64_t fsize, bool upd_files = false) = 0;
+  virtual void sub_usage(void* hint, uint64_t fsize, bool upd_files = false) = 0;
   virtual uint8_t select_prefer_bdev(void* hint) = 0;
   virtual void get_paths(const std::string& base, paths& res) const = 0;
   virtual void dump(std::ostream& sout) = 0;
@@ -433,9 +445,12 @@ private:
     return bdev[BDEV_SLOW] ? BDEV_SLOW : BDEV_DB;
   }
   const char* get_device_name(unsigned id);
+
+  typedef std::function<void(const bluefs_extent_t)> update_fn_t;
   int _allocate(uint8_t bdev, uint64_t len,
                 uint64_t alloc_unit,
                bluefs_fnode_t* node,
+                update_fn_t cb = nullptr,
                 size_t alloc_attempts = 0,
                 bool permit_dev_fallback = true);
 
@@ -710,19 +725,19 @@ public:
   void* get_hint_for_log() const override;
   void* get_hint_by_dir(std::string_view dirname) const override;
 
-  void add_usage(void* hint, const bluefs_fnode_t& fnode) override {
+  void add_usage(void* hint, const bluefs_extent_t& extent) override {
     // do nothing
     return;
   }
-  void sub_usage(void* hint, const bluefs_fnode_t& fnode) override {
+  void sub_usage(void* hint, const bluefs_extent_t& extent) override {
     // do nothing
     return;
   }
-  void add_usage(void* hint, uint64_t fsize) override {
+  void add_usage(void*, uint64_t, bool) override {
     // do nothing
     return;
   }
-  void sub_usage(void* hint, uint64_t fsize) override {
+  void sub_usage(void*, uint64_t, bool) override {
     // do nothing
     return;
   }
index 1eba5dcdd9881036f39a19d326b69e69f5ed94b2..aa7c7eb5c1d7b1507e4e87597d84805d2aec5c88 100644 (file)
@@ -6299,7 +6299,7 @@ int BlueStore::_open_bluefs(bool create, bool read_only)
           bluefs->get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
           bluefs->get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
           bluefs->get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100,
-          1024 * 1024 * 1024, //FIXME: set expected l0 size here
+         rocks_opts.write_buffer_size * rocks_opts.max_write_buffer_number,
           rocks_opts.max_bytes_for_level_base,
           rocks_opts.max_bytes_for_level_multiplier,
           reserved_factor,
@@ -18126,7 +18126,13 @@ void RocksDBBlueFSVolumeSelector::dump(ostream& sout) {
   auto max_x = per_level_per_dev_usage.get_max_x();
   auto max_y = per_level_per_dev_usage.get_max_y();
 
-  sout << "RocksDBBlueFSVolumeSelector Usage Matrix:" << std::endl;
+  sout << "RocksDBBlueFSVolumeSelector " << std::endl;
+  sout << ">>Settings<<"
+       << " extra=" << byte_u_t(db_avail4slow)
+       << ", l0_size=" << byte_u_t(level0_size)
+       << ", l_base=" << byte_u_t(level_base)
+       << ", l_multi=" << byte_u_t(level_multiplier)
+       << std::endl;
   constexpr std::array<const char*, 8> names{ {
     "DEV/LEV",
     "WAL",
index 6613e301756f2d85225f0a502ea2cacd057a1e2f..cf60ece0ca996a310b565552a60607c240274356 100644 (file)
@@ -4140,6 +4140,9 @@ class RocksDBBlueFSVolumeSelector : public BlueFSVolumeSelector
 
   uint64_t l_totals[LEVEL_MAX - LEVEL_FIRST];
   uint64_t db_avail4slow = 0;
+  uint64_t level0_size = 0;
+  uint64_t level_base = 0;
+  uint64_t level_multiplier = 0;
   enum {
     OLD_POLICY,
     USE_SOME_EXTRA
@@ -4165,21 +4168,24 @@ public:
     if (!new_pol) {
       return;
     }
-
     // Calculating how much extra space is available at DB volume.
     // Depending on the presence of explicit reserved size specification it might be either
     // * DB volume size - reserved
     // or
     // * DB volume size - sum_max_level_size(0, L-1) - max_level_size(L) * reserved_factor
     if (!reserved) {
+      level0_size = _level0_size;
+      level_base = _level_base;
+      level_multiplier = _level_multiplier;
       uint64_t prev_levels = _level0_size;
       uint64_t cur_level = _level_base;
-      uint64_t cur_threshold = 0;
+      uint64_t cur_threshold = prev_levels + cur_level;
       do {
-        uint64_t next_level = cur_level * _level_multiplier;
-        uint64_t next_threshold = prev_levels + cur_level + next_level * reserved_factor;
+       uint64_t next_level = cur_level * _level_multiplier;
+        uint64_t next_threshold = prev_levels + cur_level + next_level;
         if (_db_total <= next_threshold) {
-          db_avail4slow = cur_threshold ? _db_total - cur_threshold : 0;
+         cur_threshold *= reserved_factor;
+          db_avail4slow = cur_threshold < _db_total ? _db_total - cur_threshold : 0;
           break;
         } else {
           prev_levels += cur_level;
@@ -4188,7 +4194,7 @@ public:
         }
       } while (true);
     } else {
-      db_avail4slow = _db_total - reserved;
+      db_avail4slow = reserved < _db_total ? _db_total - reserved : 0;
     }
   }
 
@@ -4197,63 +4203,40 @@ public:
   }
   void* get_hint_by_dir(std::string_view dirname) const override;
 
-  void add_usage(void* hint, const bluefs_fnode_t& fnode) override {
+  void add_usage(void* hint, const bluefs_extent_t& extent) override {
     if (hint == nullptr)
       return;
     size_t pos = (size_t)hint - LEVEL_FIRST;
-    for (auto& p : fnode.extents) {
-      auto& cur = per_level_per_dev_usage.at(p.bdev, pos);
-      auto& max = per_level_per_dev_max.at(p.bdev, pos);
-      uint64_t v = cur.fetch_add(p.length) + p.length;
-      while (v > max) {
-       max.exchange(v);
-      }
-      {
-        //update per-device totals
-        auto& cur = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
-        auto& max = per_level_per_dev_max.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
-        uint64_t v = cur.fetch_add(p.length) + p.length;
-       while (v > max) {
-         max.exchange(v);
-       }
-      }
+    auto& cur = per_level_per_dev_usage.at(extent.bdev, pos);
+    auto& max = per_level_per_dev_max.at(extent.bdev, pos);
+    uint64_t v = cur.fetch_add(extent.length) + extent.length;
+    while (v > max) {
+      max.exchange(v);
     }
     {
-      //update per-level actual totals
-      auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
-      auto& max = per_level_per_dev_max.at(BlueFS::MAX_BDEV, pos);
-      uint64_t v = cur.fetch_add(fnode.size) + fnode.size;
+      //update per-device totals
+      auto& cur = per_level_per_dev_usage.at(extent.bdev, LEVEL_MAX - LEVEL_FIRST);
+      auto& max = per_level_per_dev_max.at(extent.bdev, LEVEL_MAX - LEVEL_FIRST);
+      uint64_t v = cur.fetch_add(extent.length) + extent.length;
       while (v > max) {
        max.exchange(v);
       }
     }
-    ++per_level_files[pos];
-    ++per_level_files[LEVEL_MAX - LEVEL_FIRST];
   }
-  void sub_usage(void* hint, const bluefs_fnode_t& fnode) override {
+  void sub_usage(void* hint, const bluefs_extent_t& extent) override {
     if (hint == nullptr)
       return;
     size_t pos = (size_t)hint - LEVEL_FIRST;
-    for (auto& p : fnode.extents) {
-      auto& cur = per_level_per_dev_usage.at(p.bdev, pos);
-      ceph_assert(cur >= p.length);
-      cur -= p.length;
-
-      //update per-device totals
-      auto& cur2 = per_level_per_dev_usage.at(p.bdev, LEVEL_MAX - LEVEL_FIRST);
-      ceph_assert(cur2 >= p.length);
-      cur2 -= p.length;
-    }
-    //update per-level actual totals
-    auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
-    ceph_assert(cur >= fnode.size);
-    cur -= fnode.size;
-    ceph_assert(per_level_files[pos] > 0);
-    --per_level_files[pos];
-    ceph_assert(per_level_files[LEVEL_MAX - LEVEL_FIRST] > 0);
-    --per_level_files[LEVEL_MAX - LEVEL_FIRST];
+    auto& cur = per_level_per_dev_usage.at(extent.bdev, pos);
+    ceph_assert(cur >= extent.length);
+    cur -= extent.length;
+
+    //update per-device totals
+    auto& cur2 = per_level_per_dev_usage.at(extent.bdev, LEVEL_MAX - LEVEL_FIRST);
+    ceph_assert(cur2 >= extent.length);
+    cur2 -= extent.length;
   }
-  void add_usage(void* hint, uint64_t size_more) override {
+  void add_usage(void* hint, uint64_t size_more, bool upd_files) override {
     if (hint == nullptr)
       return;
     size_t pos = (size_t)hint - LEVEL_FIRST;
@@ -4264,8 +4247,12 @@ public:
     while (v > max) {
       max.exchange(v);
     }
+    if (upd_files) {
+      ++per_level_files[pos];
+      ++per_level_files[LEVEL_MAX - LEVEL_FIRST];
+    }
   }
-  void sub_usage(void* hint, uint64_t size_less) override {
+  void sub_usage(void* hint, uint64_t size_less, bool upd_files) override {
     if (hint == nullptr)
       return;
     size_t pos = (size_t)hint - LEVEL_FIRST;
@@ -4273,6 +4260,12 @@ public:
     auto& cur = per_level_per_dev_usage.at(BlueFS::MAX_BDEV, pos);
     ceph_assert(cur >= size_less);
     cur -= size_less;
+    if (upd_files) {
+      ceph_assert(per_level_files[pos] > 0);
+      --per_level_files[pos];
+      ceph_assert(per_level_files[LEVEL_MAX - LEVEL_FIRST] > 0);
+      --per_level_files[LEVEL_MAX - LEVEL_FIRST];
+    }
   }
 
   uint8_t select_prefer_bdev(void* h) override;