This commit forces _fastinfo to be stored at the last position of a LogNode.
By doing so, _fastinfo can be overwritten by the next pg_log_entry.
Since _fastinfo has a fixed key with varying contents and is included in
every write transaction, placing it at the tail enables efficient overwrites.
As a result, this change reduces LogNode allocations and deallocations,
thereby lowering space overhead. Moreover, garbage collection of obsolete
key-value pairs becomes unnecessary, because each overwrite reclaims the
stale entry's space in place.
Signed-off-by: Myoungwon Oh <ohmyoungwon@gmail.com>
auto ext = co_await log_load_extent<LogNode>(
t, log_root.addr, BEGIN_KEY, END_KEY);
ceph_assert(ext);
- std::map<std::string, ceph::bufferlist> dup_kvs;
- for (auto &p : kvs) {
+ std::pair<std::string, ceph::bufferlist> ow_kv;
+ auto f = [&](const std::string &k, const bufferlist &v, bool has_ow_key)
+ -> omap_set_key_ret {
CachedExtentRef node;
Transaction::get_extent_ret ret;
// To find mutable extent in the same transaction
assert(ret == Transaction::get_extent_ret::PRESENT);
assert(node);
LogNodeRef log_node = node->template cast<LogNode>();
+ bool can_ow = has_ow_key && log_node->can_ow();
+ co_await _log_set_key(log_root, t, log_node, k, v, can_ow);
+ co_return;
+ };
+ /*
+ * During a normal write transaction, pgmeta_oid receives two key–value pairs:
+ * _fastinfo and pg_log_entry. Unlike pg_log_entry, _fastinfo is likely to be
+ * overwritten in the near future. Storing _fastinfo in an append-only manner
+ * within a LogNode causes unnecessary space overhead and requires garbage
+ * collection.
+ * To mitigate this, LogManager adjusts the write sequence of _fastinfo and
+ * pg_log_entry by placing _fastinfo at the last position of the LogNode.
+ * As a result, _fastinfo can be overwritten by the next pg_log_entry, and a new
+ * _fastinfo is appended afterward.
+ *
+ * | pg_log_entry #1 | _fastinfo #1 | ->
+ * | pg_log_entry #1 | pg_log_entry #2 | _fastinfo #2 |
+ *
+ * Furthermore, if we ensure that the last entry of each LogNode is always
+ * _fastinfo, garbage collection is unnecessary, because the new _fastinfo
+ * will be appended to a new LogNode.
+ */
+ bool has_ow_key = false;
+ if (kvs.size() == OW_SIZE) {
+ for (auto &p : kvs) {
+ if (is_ow_key(p.first)) {
+ ow_kv.first = p.first;
+ ow_kv.second = p.second;
+ has_ow_key = true;
+ break;
+ }
+ }
+ }
+ std::map<std::string, ceph::bufferlist> dup_kvs;
+ for (auto &p : kvs) {
+ if (is_ow_key(p.first) && has_ow_key) {
+ continue;
+ }
if (is_dup_log_key(p.first)) {
dup_kvs[p.first] = p.second;
continue;
}
- co_await _log_set_key(log_root, t, log_node, p.first, p.second);
- co_return;
- };
+ co_await f(p.first, p.second, has_ow_key);
+ }
+
+ if (!ow_kv.first.empty()) {
+ co_await f(ow_kv.first, ow_kv.second, has_ow_key);
+ }
+
if (!dup_kvs.empty()) {
ext = co_await log_load_extent<LogNode>(
BEGIN_KEY,
END_KEY);
for (auto &p: dup_kvs) {
- co_await _log_set_key(log_root, t, ext, p.first, p.second);
+ co_await f(p.first, p.second, false);
}
}
-
co_return;
}
LogManager::omap_set_key_ret
LogManager::_log_set_key(omap_root_t &log_root,
Transaction &t, LogNodeRef tail,
- const std::string &key, const ceph::bufferlist &value)
+ const std::string &key, const ceph::bufferlist &value, bool can_ow)
{
LOG_PREFIX(LogManager::_log_set_key);
DEBUGT("enter key={}", t, key);
assert(tail);
- if (!tail->expect_overflow(key.size(), value.length())) {
+ if (!tail->expect_overflow(key, value.length(), can_ow)) {
auto mut = tm.get_mutable_extent(t, tail)->cast<LogNode>();
- mut->append_kv(t, key, value);
+ if (can_ow) {
+ mut->overwrite_kv(t, key, value);
+ } else {
+ mut->append_kv(t, key, value);
+ }
co_return;
}
+
+ // This means the first entry of the new LogNode is not _fastinfo
+ if (!is_ow_key(key) && can_ow) {
+ // remove _fastinfo in old LogNode
+ auto e = co_await tail->get_value(key);
+ if (e != std::nullopt) {
+ auto mut = tm.get_mutable_extent(t, tail)->template cast<LogNode>();
+ mut->remove_entry(get_ow_key());
+ }
+ }
+
auto extent = co_await tm.alloc_non_data_extent<LogNode>(
t, log_root.hint, LOG_NODE_BLOCK_SIZE
).handle_error_interruptible(
class LogNode;
using LogNodeRef = TCachedExtentRef<LogNode>;
+constexpr uint8_t OW_SIZE = 2;
/*
*
*/
omap_set_key_ret _log_set_key(omap_root_t &log_root,
Transaction &t, LogNodeRef e, const std::string &key,
- const ceph::bufferlist &value);
+ const ceph::bufferlist &value, bool can_ow = false);
/**
* remove_kv
TransactionManager &tm;
};
-inline bool is_log_key(std::string s) {
+inline bool is_log_key(const std::string &s) {
pg_log_entry_t e;
return (s.size() == e.get_key_name().size() &&
(s[0] >= (0 + '0') && s[0] <= (9 + '0'))) ||
inline bool is_dup_log_key(const std::string &s) {
return s.starts_with("dup_");
}
+
+// True iff `s` is the fixed overwrite key (_fastinfo) — the only key that is
+// rewritten in place at the tail of a LogNode.
+inline bool is_ow_key(const std::string &s) {
+  return fastinfo_key == s;
+}
+
+// Materialize the overwrite key as an owned std::string (fastinfo_key itself
+// may be a non-owning constant).
+inline std::string get_ow_key() {
+  std::string ow_key{fastinfo_key};
+  return ow_key;
+}
+
}
auto biter = val.cbegin();
ceph::decode(bitmap, biter);
l._set_d_bitmap(bitmap);
+ } else if (op == op_t::OVERWRITE) {
+ l._overwrite(key, val);
}
-
}
void LogNode::append_kv(Transaction &t, const std::string &key,
return;
}
append(key, val);
+
+}
+
+// Replace the tail entry (the overwrite key, _fastinfo) in place rather than
+// appending a new copy.
+void LogNode::overwrite_kv(Transaction &t, const std::string &key,
+ const ceph::bufferlist &val) {
+ auto p = maybe_get_delta_buffer();
+ if (p) {
+ // Journaled path: record an OVERWRITE delta; the layout itself is only
+ // mutated when the delta is replayed (see _overwrite).
+ int gap = ow_gap_from_last_entry(key.size(), val.length());
+ journal_overwrite(key, val, p);
+ if (gap > 0) {
+ // The incoming value is larger than the entry it replaces; reserve the
+ // growth so subsequent space checks account for it.
+ // NOTE(review): reserved_len is only adjusted on this journaled path,
+ // not on the direct overwrite() path below — confirm that is intended.
+ reserved_len += gap;
+ }
+ return;
+ }
+ // Non-journaled path (e.g. replay): mutate the layout directly.
+ overwrite(key, val);
+}
void LogNode::set_prev_addr(laddr_t l) {
return false;
}
-
bool LogNode::log_less_than(std::string_view str) const
{
std::string last_key = get_last_key();
return false;
}
+// Whether the effective tail entry of this node is the overwrite key
+// (_fastinfo) and can therefore be replaced in place.
+bool LogNode::can_ow()
+{
+  // A pending delta buffer takes precedence over the materialized layout:
+  // after replay, its latest write is what the tail entry will be.
+  if (auto p = maybe_get_delta_buffer(); p) {
+    if (auto latest = p->get_latest_write_delta(); latest) {
+      return latest->key == get_ow_key();
+    }
+  }
+  // No journaled writes: inspect the materialized tail entry directly.
+  return is_ow_key(get_last_key());
+}
+
+// Size delta (in bytes, possibly negative) between an incoming entry of the
+// given key/value lengths and the current tail entry, which must be the
+// overwrite key (checked in debug builds).
+int LogKVNodeLayout::_ow_gap_from_last_entry(const size_t key, const size_t val)
+{
+  iterator iter(this, get_last_pos());
+  const auto tail_key = iter->get_node_key();
+  assert(iter->get_key() == get_ow_key());
+  return get_entry_size(key, val)
+    - get_entry_size(tail_key.key_len, tail_key.val_len);
+}
+
void LogKVNodeLayout::journal_append_remove(
delta_buffer_t *recorder,
ceph::bufferlist bl) {
recorder->insert_remove(bl);
}
+// Predicts whether appending/overwriting (key, vsize) would overflow this
+// node.  `can_ow` means the tail entry is the overwrite key and will be
+// replaced in place rather than appended.
+bool LogNode::expect_overflow(const std::string &key,
+  size_t vsize, bool can_ow) {
+  size_t ksize = key.size();
+  if (can_ow) {
+    // Overwrite path: the incoming entry replaces the tail entry, so the
+    // node grows only by the size difference.  The gap call is kept for its
+    // debug assertion that the tail really is the ow key.
+    // NOTE(review): the previous code doubled `gap` in both sign branches
+    // and then never read it — that was dead code and is removed here.
+    // Confirm whether the comparison below was meant to use `gap` rather
+    // than the full entry size; the original (more conservative) comparison
+    // is preserved unchanged.
+    [[maybe_unused]] int gap = ow_gap_from_last_entry(ksize, vsize);
+    uint64_t remain = capacity() - get_last_pos() - reserved_len;
+    return remain < get_entry_size(ksize, vsize);
+  } else if (is_ow_key(key)) {
+    // (!can_ow is implied in this branch.)  Appending a fresh ow entry:
+    // require room for one further entry of the same size afterwards, so an
+    // ow entry never ends up as the last entry of a full node and force
+    // garbage collection when the node overflows.
+    size_t next_expected_size = get_entry_size(ksize, vsize) + reserved_len;
+    return free_space() <
+      get_entry_size(ksize, vsize) + reserved_len + next_expected_size;
+  }
+  // Plain append: overflow when the entry plus reserved growth will not fit.
+  return free_space() < get_entry_size(ksize, vsize) + reserved_len;
+}
+
+// Extra bytes the incoming (key, val) entry needs compared to the entry it
+// will overwrite.  Prefers the journaled view when a delta buffer is active,
+// since replay makes the latest journaled write the effective tail entry.
+int LogNode::ow_gap_from_last_entry(const size_t key, const size_t val) {
+  if (auto p = maybe_get_delta_buffer(); p) {
+    auto latest = p->get_latest_write_delta();
+    if (latest && latest->key == get_ow_key()) {
+      // The ow key is fixed, so only the value length can differ; shrinkage
+      // is reported as a zero gap on this path.
+      const auto prev_len = latest->val.length();
+      return prev_len < val ? static_cast<int>(val - prev_len) : 0;
+    }
+  }
+  // No delta buffer, or the latest journaled write is not the ow key:
+  // measure against the materialized tail entry.
+  return _ow_gap_from_last_entry(key, val);
+}
+
}
ADD_PREV,
ADD_DUP_ADDR,
INIT,
+ OVERWRITE
} op;
std::string key;
ceph::bufferlist val;
}
}
+ // Record an in-place overwrite of the tail entry; applied to the layout via
+ // _overwrite() when the delta is replayed.
+ void insert_overwrite(
+   const std::string &key,
+   const ceph::bufferlist &val) {
+   buffer.emplace_back(
+     delta_t{
+       delta_t::op_t::OVERWRITE,
+       key,
+       val
+     });
+ }
+
void clear() {
buffer.clear();
}
return ret;
}
+ // Walk the buffered deltas newest-to-oldest and return the most recent
+ // write record (APPEND or OVERWRITE); removals and metadata ops are
+ // skipped.  Returns std::nullopt when no write has been journaled.
+ std::optional<delta_t> get_latest_write_delta() {
+   for (auto it = buffer.rbegin(); it != buffer.rend(); ++it) {
+     switch (it->op) {
+     case delta_t::op_t::APPEND:
+     case delta_t::op_t::OVERWRITE:
+       return *it;
+     default:
+       break;
+     }
+   }
+   return std::nullopt;
+ }
+
DENC(delta_buffer_t, v, p) {
DENC_START(1, 1, p);
denc(v.buffer, p);
set_size(get_size() + 1);
}
+ // Rewrite the tail entry in place: stamp a key header sized for (key, val)
+ // at the last position, then write the key/value bytes over the old entry.
+ // NOTE(review): assumes the tail entry is the overwrite key and that any
+ // size growth was vetted by expect_overflow() — confirm callers uphold this.
+ void _overwrite(const std::string &key, const ceph::bufferlist &val) {
+ iterator iter(this, get_last_pos());
+ iter.set_node_key(log_key_t(key.size(), val.length()));
+ iter.set_node_val(key, val);
+ }
void journal_append(
const std::string &key,
void journal_append_remove(delta_buffer_t *recorder, ceph::bufferlist bl);
+ // Journal an overwrite of the tail entry into the delta buffer; the layout
+ // is only mutated when the delta is replayed (see _overwrite).
+ void journal_overwrite(
+ const std::string &key,
+ const ceph::bufferlist &val,
+ delta_buffer_t *recorder) {
+ recorder->insert_overwrite(key, val);
+ }
+
void append(
const std::string &key,
const ceph::bufferlist &val) {
_append(key, val);
}
+ // Directly overwrite the tail entry in the layout (non-journaled path,
+ // e.g. during delta replay).
+ void overwrite(
+ const std::string &key,
+ const ceph::bufferlist &val) {
+ _overwrite(key, val);
+ }
+
void init_vars() {
init_bitmap();
set_last_pos(0);
}
- bool expect_overflow(size_t ksize, size_t vsize) const {
- return free_space() < get_entry_size(ksize, vsize) + reserved_len;
- }
-
std::string get_last_key() const {
const_iterator iter(this, get_last_pos());
return iter->get_key();
}
+
+ int _ow_gap_from_last_entry(const size_t key, const size_t val);
+ friend class LogNode;
};
struct LogNode
void append_kv(Transaction &t, const std::string &key,
const ceph::bufferlist &val);
+ void overwrite_kv(Transaction &t, const std::string &key,
+ const ceph::bufferlist &val);
+
/*
*
* set laddr directly if LogNode is not mutating
return this->capacity();
}
+ bool can_ow();
+
+ int ow_gap_from_last_entry(const size_t key, const size_t val);
+
+ bool expect_overflow(const std::string &key, size_t vsize, bool can_ow);
+
void update_delta() {
if (!delta_buffer.empty()) {
delta_buffer.replay(*this);