From d2c55356283bad64e267fc2f53ff62f6de08c7b7 Mon Sep 17 00:00:00 2001 From: Igor Fedotov Date: Tue, 9 Feb 2021 18:29:01 +0300 Subject: [PATCH] os/bluestore: cap omap naming scheme upgrade transaction. We shouldn't use a single per-onode transaction for such an upgrade when the onode's omap list is huge. This results in similarly sized WAL/SST files, which are inefficient, might cause high memory usage, and are sometimes error-prone. Fixes: https://tracker.ceph.com/issues/49170 Signed-off-by: Igor Fedotov (cherry picked from commit e897fa243c1dd38329733b452872616023f14ac8) --- src/os/bluestore/BlueStore.cc | 165 ++++++++++++++++++----------- src/os/bluestore/BlueStore.h | 25 ++++- src/os/bluestore/bluestore_types.h | 14 ++- 3 files changed, 138 insertions(+), 66 deletions(-) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index eb5aa0cb4ab13..795e9cb09c28b 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -542,6 +542,7 @@ static int get_key_pool_stat(const string& key, uint64_t* pool_id) return 0; } + template void _dump_extent_map(CephContext *cct, const BlueStore::ExtentMap &em) { @@ -3611,48 +3612,52 @@ void BlueStore::Onode::dump(Formatter* f) const extent_map.dump(f); } - -const string& BlueStore::Onode::get_omap_prefix() +const std::string& BlueStore::Onode::calc_omap_prefix(uint8_t flags) { - if (onode.is_pgmeta_omap()) { + if (bluestore_onode_t::is_pgmeta_omap(flags)) { return PREFIX_PGMETA_OMAP; } - if (onode.is_perpg_omap()) { + if (bluestore_onode_t::is_perpg_omap(flags)) { return PREFIX_PERPG_OMAP; } - if (onode.is_perpool_omap()) { + if (bluestore_onode_t::is_perpool_omap(flags)) { return PREFIX_PERPOOL_OMAP; } return PREFIX_OMAP; } // '-' < '.' 
< '~' - -void BlueStore::Onode::get_omap_header(string *out) +void BlueStore::Onode::calc_omap_header( + uint8_t flags, + const Onode* o, + std::string* out) { - if (!onode.is_pgmeta_omap()) { - if (onode.is_perpg_omap()) { - _key_encode_u64(c->pool(), out); - _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out); - } else if (onode.is_perpool_omap()) { - _key_encode_u64(c->pool(), out); + if (!bluestore_onode_t::is_pgmeta_omap(flags)) { + if (bluestore_onode_t::is_perpg_omap(flags)) { + _key_encode_u64(o->c->pool(), out); + _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out); + } else if (bluestore_onode_t::is_perpool_omap(flags)) { + _key_encode_u64(o->c->pool(), out); } } - _key_encode_u64(onode.nid, out); + _key_encode_u64(o->onode.nid, out); out->push_back('-'); } -void BlueStore::Onode::get_omap_key(const string& key, string *out) +void BlueStore::Onode::calc_omap_key(uint8_t flags, + const Onode* o, + const std::string& key, + std::string* out) { - if (!onode.is_pgmeta_omap()) { - if (onode.is_perpg_omap()) { - _key_encode_u64(c->pool(), out); - _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out); - } else if (onode.is_perpool_omap()) { - _key_encode_u64(c->pool(), out); + if (!bluestore_onode_t::is_pgmeta_omap(flags)) { + if (bluestore_onode_t::is_perpg_omap(flags)) { + _key_encode_u64(o->c->pool(), out); + _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out); + } else if (bluestore_onode_t::is_perpool_omap(flags)) { + _key_encode_u64(o->c->pool(), out); } } - _key_encode_u64(onode.nid, out); + _key_encode_u64(o->onode.nid, out); out->push_back('.'); out->append(key); } @@ -3671,17 +3676,20 @@ void BlueStore::Onode::rewrite_omap_key(const string& old, string *out) out->append(old.c_str() + out->length(), old.size() - out->length()); } -void BlueStore::Onode::get_omap_tail(string *out) +void BlueStore::Onode::calc_omap_tail( + uint8_t flags, + const Onode* o, + std::string* out) { - if (!onode.is_pgmeta_omap()) { - if (onode.is_perpg_omap()) { - 
_key_encode_u64(c->pool(), out); - _key_encode_u32(oid.hobj.get_bitwise_key_u32(), out); - } else if (onode.is_perpool_omap()) { - _key_encode_u64(c->pool(), out); + if (!bluestore_onode_t::is_pgmeta_omap(flags)) { + if (bluestore_onode_t::is_perpg_omap(flags)) { + _key_encode_u64(o->c->pool(), out); + _key_encode_u32(o->oid.hobj.get_bitwise_key_u32(), out); + } else if (bluestore_onode_t::is_perpool_omap(flags)) { + _key_encode_u64(o->c->pool(), out); } } - _key_encode_u64(onode.nid, out); + _key_encode_u64(o->onode.nid, out); out->push_back('~'); } @@ -3698,7 +3706,6 @@ void BlueStore::Onode::decode_omap_key(const string& key, string *user_key) *user_key = key.substr(pos); } - // ======================================================= // WriteContext @@ -7819,12 +7826,71 @@ void BlueStore::_fsck_check_object_omap(FSCKDepth depth, !o->onode.is_perpg_omap() && !o->onode.is_pgmeta_omap()) { dout(10) << "fsck converting " << o->oid << " omap to per-pg" << dendl; - bufferlist h; + bufferlist header; map kv; - int r = _onode_omap_get(o, &h, &kv); - if (r < 0) { - derr << " got " << r << " " << cpp_strerror(r) << dendl; - } else { + { + KeyValueDB::Transaction txn = db->get_transaction(); + uint64_t txn_cost = 0; + const string& prefix = Onode::calc_omap_prefix(o->onode.flags); + uint8_t new_flags = o->onode.flags | + bluestore_onode_t::FLAG_PERPOOL_OMAP | + bluestore_onode_t::FLAG_PERPG_OMAP; + const string& new_omap_prefix = Onode::calc_omap_prefix(new_flags); + + KeyValueDB::Iterator it = db->get_iterator(prefix); + string head, tail; + o->get_omap_header(&head); + o->get_omap_tail(&tail); + it->lower_bound(head); + // head + if (it->valid() && it->key() == head) { + dout(30) << __func__ << " got header" << dendl; + header = it->value(); + if (header.length()) { + string new_head; + Onode::calc_omap_header(new_flags, o.get(), &new_head); + txn->set(new_omap_prefix, new_head, header); + txn_cost += new_head.length() + header.length(); + } + } + // tail + { + string 
new_tail; + Onode::calc_omap_tail(new_flags, o.get(), &new_tail); + bufferlist empty; + txn->set(new_omap_prefix, new_tail, empty); + txn_cost += new_tail.length() + new_tail.length(); + } + // values + string final_key; + Onode::calc_omap_key(new_flags, o.get(), string(), &final_key); + size_t base_key_len = final_key.size(); + while (it->valid() && it->key() < tail) { + string user_key; + o->decode_omap_key(it->key(), &user_key); + dout(20) << __func__ << " got " << pretty_binary_string(it->key()) + << " -> " << user_key << dendl; + + final_key.resize(base_key_len); + final_key += user_key; // append the decoded user key, not the whole legacy DB key + auto v = it->value(); + txn->set(new_omap_prefix, final_key, v); + txn_cost += final_key.length() + v.length(); + + // submit a portion if cost exceeds 16MB + if (txn_cost >= 16 * (1 << 20) ) { + db->submit_transaction_sync(txn); + txn = db->get_transaction(); + txn_cost = 0; + } + it->next(); + } + if (txn_cost > 0) { + db->submit_transaction_sync(txn); + } + } + // finalize: remove legacy data + { KeyValueDB::Transaction txn = db->get_transaction(); // remove old keys const string& old_omap_prefix = o->get_omap_prefix(); @@ -7836,27 +7902,6 @@ void BlueStore::_fsck_check_object_omap(FSCKDepth depth, // set flag o->onode.set_flag(bluestore_onode_t::FLAG_PERPOOL_OMAP | bluestore_onode_t::FLAG_PERPG_OMAP); _record_onode(o, txn); - const string& new_omap_prefix = o->get_omap_prefix(); - // head - if (h.length()) { - string new_head; - o->get_omap_header(&new_head); - txn->set(new_omap_prefix, new_head, h); - } - // tail - string new_tail; - o->get_omap_tail(&new_tail); - bufferlist empty; - txn->set(new_omap_prefix, new_tail, empty); - // values - string final_key; - o->get_omap_key(string(), &final_key); - size_t base_key_len = final_key.size(); - for (auto& i : kv) { - final_key.resize(base_key_len); - final_key += i.first; - txn->set(new_omap_prefix, final_key, i.second); - } db->submit_transaction_sync(txn); repairer->inc_repaired(); repairer->request_compaction(); @@ 
-15009,7 +15054,7 @@ int BlueStore::_omap_setkeys(TransContext *txc, if (o->oid.is_pgmeta()) { o->onode.set_omap_flags_pgmeta(); } else { - o->onode.set_omap_flags(); + o->onode.set_omap_flags(per_pool_omap == OMAP_BULK); } txc->write_onode(o); @@ -15054,7 +15099,7 @@ int BlueStore::_omap_setheader(TransContext *txc, if (o->oid.is_pgmeta()) { o->onode.set_omap_flags_pgmeta(); } else { - o->onode.set_omap_flags(); + o->onode.set_omap_flags(per_pool_omap == OMAP_BULK); } txc->write_onode(o); @@ -15210,7 +15255,7 @@ int BlueStore::_clone(TransContext *txc, if (newo->oid.is_pgmeta()) { newo->onode.set_omap_flags_pgmeta(); } else { - newo->onode.set_omap_flags(); + newo->onode.set_omap_flags(per_pool_omap == OMAP_BULK); } const string& prefix = newo->get_omap_prefix(); KeyValueDB::Iterator it = db->get_iterator(prefix); diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index eff8bb8a5076c..0690717c8b4b4 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -1147,11 +1147,28 @@ public: return !pinned; } - const std::string& get_omap_prefix(); - void get_omap_header(std::string *out); - void get_omap_key(const std::string& key, std::string *out); + static const std::string& calc_omap_prefix(uint8_t flags); + static void calc_omap_header(uint8_t flags, const Onode* o, + std::string* out); + static void calc_omap_key(uint8_t flags, const Onode* o, + const std::string& key, std::string* out); + static void calc_omap_tail(uint8_t flags, const Onode* o, + std::string* out); + + const std::string& get_omap_prefix() { + return calc_omap_prefix(onode.flags); + } + void get_omap_header(std::string* out) { + calc_omap_header(onode.flags, this, out); + } + void get_omap_key(const std::string& key, std::string* out) { + calc_omap_key(onode.flags, this, key, out); + } + void get_omap_tail(std::string* out) { + calc_omap_tail(onode.flags, this, out); + } + void rewrite_omap_key(const std::string& old, std::string *out); - void 
get_omap_tail(std::string *out); void decode_omap_key(const std::string& key, std::string *user_key); // Return the offset of an object on disk. This function is intended *only* diff --git a/src/os/bluestore/bluestore_types.h b/src/os/bluestore/bluestore_types.h index 9a10304e3e2db..656449b19e14a 100644 --- a/src/os/bluestore/bluestore_types.h +++ b/src/os/bluestore/bluestore_types.h @@ -999,6 +999,16 @@ struct bluestore_onode_t { bool has_omap() const { return has_flag(FLAG_OMAP); } + + static bool is_pgmeta_omap(uint8_t flags) { + return flags & FLAG_PGMETA_OMAP; + } + static bool is_perpool_omap(uint8_t flags) { + return flags & FLAG_PERPOOL_OMAP; + } + static bool is_perpg_omap(uint8_t flags) { + return flags & FLAG_PERPG_OMAP; + } bool is_pgmeta_omap() const { return has_flag(FLAG_PGMETA_OMAP); } @@ -1009,8 +1019,8 @@ struct bluestore_onode_t { return has_flag(FLAG_PERPG_OMAP); } - void set_omap_flags() { - set_flag(FLAG_OMAP | FLAG_PERPOOL_OMAP | FLAG_PERPG_OMAP); + void set_omap_flags(bool legacy) { + set_flag(FLAG_OMAP | (legacy ? 0 : (FLAG_PERPOOL_OMAP | FLAG_PERPG_OMAP))); } void set_omap_flags_pgmeta() { set_flag(FLAG_OMAP | FLAG_PGMETA_OMAP); -- 2.39.5