From: myoungwon oh Date: Thu, 1 Jan 2026 09:23:47 +0000 (+0900) Subject: crimson/os/seastore: make _fastinfo overwritable to minimize space overhead X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=2919f9585ae0bcb9efc8ab72adacad34121016bf;p=ceph-ci.git crimson/os/seastore: make _fastinfo overwritable to minimize space overhead This commit forces _fastinfo to be stored at the last position of a LogNode. By doing so, _fastinfo can be overwritten by the next pg_log_entry. Since _fastinfo has a fixed key with varying contents and is included in every write transaction, placing it at the tail enables efficient overwrites. As a result, this change reduces LogNode allocation and deallocation, thereby lowering space overhead. Moreover, garbage collection for obsolete key-value pairs is unnecessary due to overwrite semantics. Signed-off-by: Myoungwon Oh --- diff --git a/src/crimson/os/seastore/omap_manager/log/log_manager.cc b/src/crimson/os/seastore/omap_manager/log/log_manager.cc index 6fc0655bf85..95d77670290 100644 --- a/src/crimson/os/seastore/omap_manager/log/log_manager.cc +++ b/src/crimson/os/seastore/omap_manager/log/log_manager.cc @@ -65,8 +65,9 @@ LogManager::omap_set_keys( auto ext = co_await log_load_extent( t, log_root.addr, BEGIN_KEY, END_KEY); ceph_assert(ext); - std::map dup_kvs; - for (auto &p : kvs) { + std::pair ow_kv; + auto f = [&](const std::string &k, const bufferlist &v, bool has_ow_key) + -> omap_set_key_ret { CachedExtentRef node; Transaction::get_extent_ret ret; // To find mutable extent in the same transaction @@ -74,13 +75,55 @@ LogManager::omap_set_keys( assert(ret == Transaction::get_extent_ret::PRESENT); assert(node); LogNodeRef log_node = node->template cast(); + bool can_ow = has_ow_key && log_node->can_ow(); + co_await _log_set_key(log_root, t, log_node, k, v, can_ow); + co_return; + }; + /* + * During a normal write transaction, pgmeta_oid receives two key–value pairs: + * _fastinfo and pg_log_entry. 
Unlike pg_log_entry, _fastinfo is likely to be + * overwritten in the near future. Storing _fastinfo in an append-only manner + * within a LogNode causes unnecessary space overhead and requires garbage + * collection. + * To mitigate this, LogManager adjusts the write sequence of _fastinfo and + * pg_log_entry by placing _fastinfo at the last position of the LogNode. + * As a result, _fastinfo can be overwritten by the next pg_log_entry, and a new + * _fastinfo is appended afterward. + * + * | pg_log_entry #1 | _fastinfo #1 | -> + * | pg_log_entry #1 | pg_log_entry #2 | _fastinfo #2 | + * + * Furthermore, if we ensure that the last entry of each LogNode is always + * _fastinfo, garbage collection is unnecessary, because the new _fastinfo + * will be appended to a new LogNode. + */ + bool has_ow_key = false; + if (kvs.size() == OW_SIZE) { + for (auto &p : kvs) { + if (is_ow_key(p.first)) { + ow_kv.first = p.first; + ow_kv.second = p.second; + has_ow_key = true; + break; + } + } + } + std::map dup_kvs; + for (auto &p : kvs) { + if (is_ow_key(p.first) && has_ow_key) { + continue; + } if (is_dup_log_key(p.first)) { dup_kvs[p.first] = p.second; continue; } - co_await _log_set_key(log_root, t, log_node, p.first, p.second); - co_return; - }; + co_await f(p.first, p.second, has_ow_key); + } + + if (!ow_kv.first.empty()) { + co_await f(ow_kv.first, ow_kv.second, has_ow_key); + } + if (!dup_kvs.empty()) { ext = co_await log_load_extent( @@ -89,10 +132,9 @@ LogManager::omap_set_keys( BEGIN_KEY, END_KEY); for (auto &p: dup_kvs) { - co_await _log_set_key(log_root, t, ext, p.first, p.second); + co_await f(p.first, p.second, false); } } - co_return; } @@ -114,16 +156,31 @@ LogManager::omap_set_key( LogManager::omap_set_key_ret LogManager::_log_set_key(omap_root_t &log_root, Transaction &t, LogNodeRef tail, - const std::string &key, const ceph::bufferlist &value) + const std::string &key, const ceph::bufferlist &value, bool can_ow) { LOG_PREFIX(LogManager::_log_set_key); 
DEBUGT("enter key={}", t, key); assert(tail); - if (!tail->expect_overflow(key.size(), value.length())) { + if (!tail->expect_overflow(key, value.length(), can_ow)) { auto mut = tm.get_mutable_extent(t, tail)->cast(); - mut->append_kv(t, key, value); + if (can_ow) { + mut->overwrite_kv(t, key, value); + } else { + mut->append_kv(t, key, value); + } co_return; } + + // This means the first entry of the new LogNode is not _fastinfo + if (!is_ow_key(key) && can_ow) { + // remove _fastinfo in old LogNode + auto e = co_await tail->get_value(key); + if (e != std::nullopt) { + auto mut = tm.get_mutable_extent(t, tail)->template cast(); + mut->remove_entry(get_ow_key()); + } + } + auto extent = co_await tm.alloc_non_data_extent( t, log_root.hint, LOG_NODE_BLOCK_SIZE ).handle_error_interruptible( diff --git a/src/crimson/os/seastore/omap_manager/log/log_manager.h b/src/crimson/os/seastore/omap_manager/log/log_manager.h index d11dfb90401..ee8d9cea5db 100644 --- a/src/crimson/os/seastore/omap_manager/log/log_manager.h +++ b/src/crimson/os/seastore/omap_manager/log/log_manager.h @@ -16,6 +16,7 @@ namespace crimson::os::seastore::log_manager{ class LogNode; using LogNodeRef = TCachedExtentRef; +constexpr uint8_t OW_SIZE = 2; /* * @@ -222,7 +223,7 @@ public: */ omap_set_key_ret _log_set_key(omap_root_t &log_root, Transaction &t, LogNodeRef e, const std::string &key, - const ceph::bufferlist &value); + const ceph::bufferlist &value, bool can_ow = false); /** * remove_kv @@ -296,7 +297,7 @@ public: TransactionManager &tm; }; -inline bool is_log_key(std::string s) { +inline bool is_log_key(const std::string &s) { pg_log_entry_t e; return (s.size() == e.get_key_name().size() && (s[0] >= (0 + '0') && s[0] <= (9 + '0'))) || @@ -312,4 +313,13 @@ inline bool is_pg_log_key(const std::string &s) { inline bool is_dup_log_key(const std::string &s) { return s.starts_with("dup_"); } + +inline bool is_ow_key(const std::string &s) { + return s == fastinfo_key; +} + +inline std::string 
get_ow_key() { + return std::string(fastinfo_key); +} + } diff --git a/src/crimson/os/seastore/omap_manager/log/log_node.cc b/src/crimson/os/seastore/omap_manager/log/log_node.cc index b994635ee36..87de649d90e 100644 --- a/src/crimson/os/seastore/omap_manager/log/log_node.cc +++ b/src/crimson/os/seastore/omap_manager/log/log_node.cc @@ -29,8 +29,9 @@ void delta_t::replay(LogKVNodeLayout &l) { auto biter = val.cbegin(); ceph::decode(bitmap, biter); l._set_d_bitmap(bitmap); + } else if (op == op_t::OVERWRITE) { + l._overwrite(key, val); } - } void LogNode::append_kv(Transaction &t, const std::string &key, @@ -41,6 +42,21 @@ void LogNode::append_kv(Transaction &t, const std::string &key, return; } append(key, val); + +} + +void LogNode::overwrite_kv(Transaction &t, const std::string &key, + const ceph::bufferlist &val) { + auto p = maybe_get_delta_buffer(); + if (p) { + int gap = ow_gap_from_last_entry(key.size(), val.length()); + journal_overwrite(key, val, p); + if (gap > 0) { + reserved_len += gap; + } + return; + } + overwrite(key, val); } void LogNode::set_prev_addr(laddr_t l) { @@ -218,7 +234,6 @@ bool LogNode::remove_entry(const std::string key) return false; } - bool LogNode::log_less_than(std::string_view str) const { std::string last_key = get_last_key(); @@ -253,10 +268,78 @@ bool LogNode::log_has_larger_than(std::string_view str) const return false; } +bool LogNode::can_ow() +{ + auto p = maybe_get_delta_buffer(); + if (p) { + auto ret = p->get_latest_write_delta(); + if (ret && (*ret).key == get_ow_key()) { + return true; + } else if (ret && (*ret).key != get_ow_key()) { + return false; + } + } + if (is_ow_key(get_last_key())) { + return true; + } + return false; +} + +int LogKVNodeLayout::_ow_gap_from_last_entry(const size_t key, const size_t val) +{ + iterator iter(this, get_last_pos()); + auto last = iter->get_node_key(); + assert(iter->get_key() == get_ow_key()); + return get_entry_size(key, val) + - get_entry_size(last.key_len, last.val_len); +} + 
void LogKVNodeLayout::journal_append_remove( delta_buffer_t *recorder, ceph::bufferlist bl) { recorder->insert_remove(bl); } +bool LogNode::expect_overflow(const std::string &key, + size_t vsize, bool can_ow) { + size_t ksize = key.size(); + if (can_ow) { + int gap = ow_gap_from_last_entry(key.size(), vsize); + uint64_t remain = capacity() - get_last_pos() - reserved_len; + if (gap >= 0) { + gap += static_cast(gap); + } else { + uint64_t d = static_cast(-gap); + gap -= d; + } + return remain < get_entry_size(ksize, vsize); + } else if (is_ow_key(key) && !can_ow) { + // Estimate whether there is enough space to store a further entry in this node. + // This makes sure that the last entry of this node is a non-ow entry, + // which reduces garbage collection for _fastinfo + size_t next_expected_size = get_entry_size(ksize, vsize) + reserved_len; + return free_space() < + get_entry_size(ksize, vsize) + reserved_len + next_expected_size; + } + return free_space() < get_entry_size(ksize, vsize) + reserved_len; +} + +int LogNode::ow_gap_from_last_entry(const size_t key, const size_t val) { + int gap = 0; + auto p = maybe_get_delta_buffer(); + if (p) { + auto ret = p->get_latest_write_delta(); + if (ret && (*ret).key == get_ow_key()) { + if ((*ret).val.length() < val) { + gap = val - (*ret).val.length(); + } + } else { + gap = _ow_gap_from_last_entry(key, val); + } + } else { + gap = _ow_gap_from_last_entry(key, val); + } + return gap; +} + } diff --git a/src/crimson/os/seastore/omap_manager/log/log_node.h b/src/crimson/os/seastore/omap_manager/log/log_node.h index 7153516c8bf..2c200de466e 100644 --- a/src/crimson/os/seastore/omap_manager/log/log_node.h +++ b/src/crimson/os/seastore/omap_manager/log/log_node.h @@ -27,6 +27,7 @@ struct delta_t { ADD_PREV, ADD_DUP_ADDR, INIT, + OVERWRITE } op; std::string key; ceph::bufferlist val; @@ -108,6 +109,17 @@ public: } } + void insert_overwrite( + const std::string &key, + const ceph::bufferlist &val) { + buffer.push_back( + delta_t{ +
delta_t::op_t::OVERWRITE, + key, + val + }); + } + void clear() { buffer.clear(); } @@ -146,6 +158,18 @@ public: return ret; } + std::optional get_latest_write_delta() { + std::optional ret = std::nullopt; + for (auto it = buffer.rbegin(); it != buffer.rend(); ++it) { + if (it->op == delta_t::op_t::APPEND || + it->op == delta_t::op_t::OVERWRITE) { + ret = *it; + return ret; + } + } + return ret; + } + DENC(delta_buffer_t, v, p) { DENC_START(1, 1, p); denc(v.buffer, p); @@ -632,6 +656,11 @@ public: set_size(get_size() + 1); } + void _overwrite(const std::string &key, const ceph::bufferlist &val) { + iterator iter(this, get_last_pos()); + iter.set_node_key(log_key_t(key.size(), val.length())); + iter.set_node_val(key, val); + } void journal_append( const std::string &key, @@ -661,12 +690,25 @@ public: void journal_append_remove(delta_buffer_t *recorder, ceph::bufferlist bl); + void journal_overwrite( + const std::string &key, + const ceph::bufferlist &val, + delta_buffer_t *recorder) { + recorder->insert_overwrite(key, val); + } + void append( const std::string &key, const ceph::bufferlist &val) { _append(key, val); } + void overwrite( + const std::string &key, + const ceph::bufferlist &val) { + _overwrite(key, val); + } + void init_vars() { init_bitmap(); set_last_pos(0); @@ -678,14 +720,13 @@ public: } - bool expect_overflow(size_t ksize, size_t vsize) const { - return free_space() < get_entry_size(ksize, vsize) + reserved_len; - } - std::string get_last_key() const { const_iterator iter(this, get_last_pos()); return iter->get_key(); } + + int _ow_gap_from_last_entry(const size_t key, const size_t val); + friend class LogNode; }; struct LogNode @@ -743,6 +784,9 @@ struct LogNode void append_kv(Transaction &t, const std::string &key, const ceph::bufferlist &val); + void overwrite_kv(Transaction &t, const std::string &key, + const ceph::bufferlist &val); + /* * * set laddr directly if LogNode is not mutating @@ -869,6 +913,12 @@ struct LogNode return 
this->capacity(); } + bool can_ow(); + + int ow_gap_from_last_entry(const size_t key, const size_t val); + + bool expect_overflow(const std::string &key, size_t vsize, bool can_ow); + void update_delta() { if (!delta_buffer.empty()) { delta_buffer.replay(*this);