From: Mykola Golub Date: Mon, 13 Jul 2020 06:33:07 +0000 (+0100) Subject: os/bluestore: fix collection_list ordering X-Git-Tag: v12.2.14~2^2~9 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=cf9585f867dc553ed893b372e5866d305d8ad22b;p=ceph.git os/bluestore: fix collection_list ordering Fixes: https://tracker.ceph.com/issues/43174 Signed-off-by: Mykola Golub (cherry picked from commit a3d94deddaa5a56ade4a4a8a94d31424238b62ee) Conflicts: src/os/bluestore/BlueStore.cc: trivial src/test/objectstore/store_test.cc: different signature for collection_list, apply_transaction instead of queue_transaction, c++11 compatibility --- diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 64a6022441d..6f3b2a92cc6 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -132,6 +132,11 @@ const string PREFIX_SHARED_BLOB = "X"; // u64 offset -> shared_blob_t * We use ! as a terminator for strings; this works because it is < # * and will get escaped if it is present in the string. * + * NOTE: There is a bug in this implementation: due to implicit + * character type conversion in comparison it may produce unexpected + * ordering. Unfortunately fixing the bug would mean invalidating the + * keys in existing deployments. Instead we do additional sorting + * where it is needed. */ template static void append_escaped(const string &in, S *out) @@ -139,11 +144,11 @@ static void append_escaped(const string &in, S *out) char hexbyte[in.length() * 3 + 1]; char* ptr = &hexbyte[0]; for (string::const_iterator i = in.begin(); i != in.end(); ++i) { - if (*i <= '#') { + if (*i <= '#') { // bug: unexpected result for *i > 0x7f *ptr++ = '#'; *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f]; *ptr++ = "0123456789abcdef"[*i & 0x0f]; - } else if (*i >= '~') { + } else if (*i >= '~') { // bug: unexpected result for *i > 0x7f *ptr++ = '~'; *ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f]; *ptr++ = "0123456789abcdef"[*i & 0x0f]; @@ -324,13 +329,15 @@ static int get_key_shared_blob(const string& key, uint64_t *sbid) } template -static int get_key_object(const S& key, ghobject_t *oid) +static void _key_encode_prefix(const ghobject_t& oid, S *key) { - int r; - const char *p = key.c_str(); + _key_encode_shard(oid.shard_id, key); + _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key); + _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key); +} - if (key.length() < 1 + 8 + 4) - return -1; +static const char *_key_decode_prefix(const char *p, ghobject_t *oid) +{ p = _key_decode_shard(p, &oid->shard_id); uint64_t pool; @@ -342,6 +349,22 @@ static int get_key_object(const S& key, ghobject_t *oid) oid->hobj.set_bitwise_key_u32(hash); + return p; +} + +#define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4) + +template +static int get_key_object(const S& key, ghobject_t *oid) +{ + int r; + const char *p = key.c_str(); + + if (key.length() < ENCODED_KEY_PREFIX_LEN) + return -1; + + p = _key_decode_prefix(p, oid); + r = decode_escaped(p, &oid->hobj.nspace); if (r < 0) return -2; @@ -390,7 +413,7 @@ static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key) { key->clear(); - size_t max_len = 1 + 8 + 4 + + size_t max_len = ENCODED_KEY_PREFIX_LEN + (oid.hobj.nspace.length() * 3 + 1) + (oid.hobj.get_key().length() * 3 + 1) + 1 + // for '<', '=', or '>' @@ -398,9 +421,7 @@ static void get_object_key(CephContext *cct, const ghobject_t& oid, S *key) 8 + 8 + 1; key->reserve(max_len); - _key_encode_shard(oid.shard_id, key); - _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key); - _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key); + _key_encode_prefix(oid, key); append_escaped(oid.hobj.nspace, key); @@ -643,6 +664,122 @@ ostream& operator<<(ostream& out, const BlueStore::Buffer& b) return out << ")"; } +namespace { + +/* + * Due to a bug in key string encoding (see a comment for append_escaped) + * the KeyValueDB iterator does not lexicographically sort the same + * way that ghobject_t does: objects with the same hash may have wrong order. + * + * This is the iterator wrapper that fixes the keys order. + */ + +class CollectionListIterator { +public: + CollectionListIterator(const KeyValueDB::Iterator &it) + : m_it(it), m_chunk_iter(m_chunk.end()) { + } + + bool valid() const { + return m_chunk_iter != m_chunk.end(); + } + + const ghobject_t &oid() const { + ceph_assert(valid()); + + return m_chunk_iter->first; + } + + const std::string &key() const { + ceph_assert(valid()); + + return m_chunk_iter->second; + } + + void lower_bound(const std::string &key) { + ghobject_t oid; + int r = get_key_object(key, &oid); + ceph_assert(r != -1); + + std::string pkey; + _key_encode_prefix(oid, &pkey); + + m_it->lower_bound(pkey); + m_chunk_iter = m_chunk.end(); + if (!get_next_chunk()) { + return; + } + + if (this->oid().shard_id != oid.shard_id || + this->oid().hobj.pool != oid.hobj.pool || + this->oid().hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) { + return; + } + + m_chunk_iter = m_chunk.lower_bound(oid); + if (m_chunk_iter == m_chunk.end()) { + get_next_chunk(); + } + } + + void upper_bound(const std::string &key) { + lower_bound(key); + + if (valid() && this->key() == key) { + next(); + } + } + + void next() { + ceph_assert(valid()); + + m_chunk_iter++; + if (m_chunk_iter == m_chunk.end()) { + get_next_chunk(); + } + } + +private: + KeyValueDB::Iterator m_it; + std::map m_chunk; + std::map::iterator m_chunk_iter; + + bool get_next_chunk() { + if (!m_it->valid()) { + return false; + } + + ghobject_t oid; + int r = get_key_object(m_it->key(), &oid); + ceph_assert(r != -1); + + m_chunk.clear(); + while (true) { + m_chunk.insert({oid, m_it->key()}); + + m_it->next(); + if (!m_it->valid()) { + break; + } + + ghobject_t next; + r = get_key_object(m_it->key(), &next); + ceph_assert(r != -1); + if (next.shard_id != oid.shard_id || + next.hobj.pool != oid.hobj.pool || + next.hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) { + break; + } + oid = next; + } + + m_chunk_iter = m_chunk.begin(); + return true; + } +}; + +} // anonymous namespace + // Garbage Collector void BlueStore::GarbageCollector::process_protrusive_extents( @@ -7697,11 +7834,11 @@ int BlueStore::_collection_list( int r = 0; ghobject_t static_next; - KeyValueDB::Iterator it; + std::unique_ptr it; string temp_start_key, temp_end_key; string start_key, end_key; bool set_next = false; - string pend; + ghobject_t pend; bool temp; if (!pnext) @@ -7719,7 +7856,7 @@ int BlueStore::_collection_list( << " and " << pretty_binary_string(start_key) << " to " << pretty_binary_string(end_key) << " start " << start << dendl; - it = db->get_iterator(PREFIX_OBJ); + it.reset(new CollectionListIterator(db->get_iterator(PREFIX_OBJ))); if (start == ghobject_t() || start.hobj == hobject_t() || start == c->cid.get_min_hobj()) { @@ -7740,26 +7877,30 @@ int BlueStore::_collection_list( it->lower_bound(k); } if (end.hobj.is_max()) { - pend = temp ? temp_end_key : end_key; + if (temp) + get_key_object(temp_end_key, &pend); + else + get_key_object(end_key, &pend); } else { - get_object_key(cct, end, &end_key); if (end.hobj.is_temp()) { if (temp) - pend = end_key; + pend = end; else - goto out; + goto out; } else { - pend = temp ? temp_end_key : end_key; + if (temp) + get_key_object(temp_end_key, &pend); + else + pend = end; } } - dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl; + dout(20) << __func__ << " pend " << pend << dendl; while (true) { - if (!it->valid() || it->key() >= pend) { + if (!it->valid() || it->oid() >= pend) { if (!it->valid()) dout(20) << __func__ << " iterator not valid (end of db?)" << dendl; else - dout(20) << __func__ << " key " << pretty_binary_string(it->key()) - << " >= " << end << dendl; + dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl; if (temp) { if (end.hobj.is_temp()) { break; @@ -7767,8 +7908,11 @@ int BlueStore::_collection_list( dout(30) << __func__ << " switch to non-temp namespace" << dendl; temp = false; it->upper_bound(start_key); - pend = end_key; - dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl; + if (end.hobj.is_max()) + get_key_object(end_key, &pend); + else + pend = end; + dout(30) << __func__ << " pend " << pend << dendl; continue; } break; @@ -7778,9 +7922,7 @@ int BlueStore::_collection_list( it->next(); continue; } - ghobject_t oid; - int r = get_key_object(it->key(), &oid); - assert(r == 0); + ghobject_t oid = it->oid(); dout(20) << __func__ << " oid " << oid << " end " << end << dendl; if (ls->size() >= (unsigned)max) { dout(20) << __func__ << " reached max " << max << dendl; diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc index 052aa41a1c2..3d8e3528444 100644 --- a/src/test/objectstore/store_test.cc +++ b/src/test/objectstore/store_test.cc @@ -4610,6 +4610,122 @@ TEST_P(StoreTest, HashCollisionTest) { ASSERT_EQ(r, 0); } +TEST_P(StoreTest, HashCollisionSorting) { + if (string(GetParam()) == "kstore") // TODO: fix kstore + return; + + char buf121664318_1[] = {18, -119, -121, -111, 0}; + char buf121664318_2[] = {19, 127, -121, 32, 0}; + char buf121664318_3[] = {19, -118, 15, 19, 0}; + char buf121664318_4[] = {28, 27, -116, -113, 0}; + char buf121664318_5[] = {28, 27, -115, -124, 0}; + + char buf121666222_1[] = {18, -119, -120, -111, 0}; + char buf121666222_2[] = {19, 127, -120, 32, 0}; + char buf121666222_3[] = {19, -118, 15, 30, 0}; + char buf121666222_4[] = {29, 17, -126, -113, 0}; + char buf121666222_5[] = {29, 17, -125, -124, 0}; + + std::map> object_names = { + {121664318, {{buf121664318_1}, + {buf121664318_2}, + {buf121664318_3}, + {buf121664318_4}, + {buf121664318_5}}}, + {121666222, {{buf121666222_1}, + {buf121666222_2}, + {buf121666222_3}, + {buf121666222_4}, + {buf121666222_5}}}}; + + ObjectStore::Sequencer osr("test"); + int64_t poolid = 111; + coll_t cid = coll_t(spg_t(pg_t(0, poolid), shard_id_t::NO_SHARD)); + int r; + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = apply_transaction(store, &osr, std::move(t)); + ASSERT_EQ(r, 0); + } + + std::set created; + for (auto &p : object_names) { + auto &hash = p.first; + auto &names = p.second; + for (auto &name : names) { + ghobject_t hoid(hobject_t(sobject_t(name, CEPH_NOSNAP), + string(), + hash, + poolid, + string())); + ASSERT_EQ(hash, hoid.hobj.get_hash()); + ObjectStore::Transaction t; + t.touch(cid, hoid); + r = apply_transaction(store, &osr, std::move(t)); + ASSERT_EQ(r, 0); + created.insert(hoid); + } + } + + vector objects; + r = store->collection_list(cid, ghobject_t(), ghobject_t::get_max(), INT_MAX, + &objects, 0); + ASSERT_EQ(r, 0); + ASSERT_EQ(created.size(), objects.size()); + auto it = objects.begin(); + for (auto &hoid : created) { + ASSERT_EQ(hoid, *it); + it++; + } + + for (auto i = created.begin(); i != created.end(); i++) { + auto j = i; + for (j++; j != created.end(); j++) { + std::set created_sub(i, j); + objects.clear(); + ghobject_t next; + r = store->collection_list(cid, *i, ghobject_t::get_max(), + created_sub.size(), + &objects, &next); + ASSERT_EQ(r, 0); + ASSERT_EQ(created_sub.size(), objects.size()); + it = objects.begin(); + for (auto &hoid : created_sub) { + ASSERT_EQ(hoid, *it); + it++; + } + if (j == created.end()) { + ASSERT_TRUE(next.is_max()); + } else { + ASSERT_EQ(*j, next); + } + } + } + + for (auto i = created.begin(); i != created.end(); i++) { + auto j = i; + for (j++; j != created.end(); j++) { + std::set created_sub(i, j); + objects.clear(); + ghobject_t next; + r = store->collection_list(cid, *i, *j, INT_MAX, &objects, &next); + ASSERT_EQ(r, 0); + ASSERT_EQ(created_sub.size(), objects.size()); + it = objects.begin(); + for (auto &hoid : created_sub) { + ASSERT_EQ(hoid, *it); + it++; + } + if (j == created.end()) { + ASSERT_TRUE(next.is_max()); + } else { + ASSERT_EQ(*j, next); + } + } + } +} + TEST_P(StoreTest, ScrubTest) { ObjectStore::Sequencer osr("test"); int64_t poolid = 111;