* We use ! as a terminator for strings; this works because it is < #
* and will get escaped if it is present in the string.
*
+ * NOTE: There is a bug in this implementation: due to implicit
+ * character type conversion in comparison it may produce unexpected
+ * ordering. Unfortunately fixing the bug would mean invalidating the
+ * keys in existing deployments. Instead we do additional sorting
+ * where it is needed.
*/
template<typename S>
static void append_escaped(const string &in, S *out)
char hexbyte[in.length() * 3 + 1];
char* ptr = &hexbyte[0];
for (string::const_iterator i = in.begin(); i != in.end(); ++i) {
- if (*i <= '#') {
+ if (*i <= '#') { // bug: unexpected result for *i > 0x7f
*ptr++ = '#';
*ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
*ptr++ = "0123456789abcdef"[*i & 0x0f];
- } else if (*i >= '~') {
+ } else if (*i >= '~') { // bug: unexpected result for *i > 0x7f
*ptr++ = '~';
*ptr++ = "0123456789abcdef"[(*i >> 4) & 0x0f];
*ptr++ = "0123456789abcdef"[*i & 0x0f];
}
template<typename S>
-static int get_key_object(const S& key, ghobject_t *oid)
+static void _key_encode_prefix(const ghobject_t& oid, S *key)
{
- int r;
- const char *p = key.c_str();
+ _key_encode_shard(oid.shard_id, key);
+ _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
+ _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
+}
- if (key.length() < 1 + 8 + 4)
- return -1;
+static const char *_key_decode_prefix(const char *p, ghobject_t *oid)
+{
p = _key_decode_shard(p, &oid->shard_id);
uint64_t pool;
oid->hobj.set_bitwise_key_u32(hash);
+ return p;
+}
+
+#define ENCODED_KEY_PREFIX_LEN (1 + 8 + 4)
+
+template<typename S>
+static int get_key_object(const S& key, ghobject_t *oid)
+{
+ int r;
+ const char *p = key.c_str();
+
+ if (key.length() < ENCODED_KEY_PREFIX_LEN)
+ return -1;
+
+ p = _key_decode_prefix(p, oid);
+
r = decode_escaped(p, &oid->hobj.nspace);
if (r < 0)
return -2;
{
key->clear();
- size_t max_len = 1 + 8 + 4 +
+ size_t max_len = ENCODED_KEY_PREFIX_LEN +
(oid.hobj.nspace.length() * 3 + 1) +
(oid.hobj.get_key().length() * 3 + 1) +
1 + // for '<', '=', or '>'
8 + 8 + 1;
key->reserve(max_len);
- _key_encode_shard(oid.shard_id, key);
- _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key);
- _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key);
+ _key_encode_prefix(oid, key);
append_escaped(oid.hobj.nspace, key);
return out << ")";
}
+namespace {
+
+/*
+ * Due to a bug in key string encoding (see a comment for append_escaped)
+ * the KeyValueDB iterator does not lexicographically sort the same
+ * way that ghobject_t does: objects with the same hash may have wrong order.
+ *
+ * This is the iterator wrapper that fixes the keys order.
+ */
+
+class CollectionListIterator {
+public:
+ CollectionListIterator(const KeyValueDB::Iterator &it)
+ : m_it(it), m_chunk_iter(m_chunk.end()) {
+ }
+
+ bool valid() const {
+ return m_chunk_iter != m_chunk.end();
+ }
+
+ const ghobject_t &oid() const {
+ ceph_assert(valid());
+
+ return m_chunk_iter->first;
+ }
+
+ const std::string &key() const {
+ ceph_assert(valid());
+
+ return m_chunk_iter->second;
+ }
+
+ void lower_bound(const std::string &key) {
+ ghobject_t oid;
+ int r = get_key_object(key, &oid);
+ ceph_assert(r != -1);
+
+ std::string pkey;
+ _key_encode_prefix(oid, &pkey);
+
+ m_it->lower_bound(pkey);
+ m_chunk_iter = m_chunk.end();
+ if (!get_next_chunk()) {
+ return;
+ }
+
+ if (this->oid().shard_id != oid.shard_id ||
+ this->oid().hobj.pool != oid.hobj.pool ||
+ this->oid().hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
+ return;
+ }
+
+ m_chunk_iter = m_chunk.lower_bound(oid);
+ if (m_chunk_iter == m_chunk.end()) {
+ get_next_chunk();
+ }
+ }
+
+ void upper_bound(const std::string &key) {
+ lower_bound(key);
+
+ if (valid() && this->key() == key) {
+ next();
+ }
+ }
+
+ void next() {
+ ceph_assert(valid());
+
+ m_chunk_iter++;
+ if (m_chunk_iter == m_chunk.end()) {
+ get_next_chunk();
+ }
+ }
+
+private:
+ KeyValueDB::Iterator m_it;
+ std::map<ghobject_t, std::string> m_chunk;
+ std::map<ghobject_t, std::string>::iterator m_chunk_iter;
+
+ bool get_next_chunk() {
+ if (!m_it->valid()) {
+ return false;
+ }
+
+ ghobject_t oid;
+ int r = get_key_object(m_it->key(), &oid);
+ ceph_assert(r != -1);
+
+ m_chunk.clear();
+ while (true) {
+ m_chunk.insert({oid, m_it->key()});
+
+ m_it->next();
+ if (!m_it->valid()) {
+ break;
+ }
+
+ ghobject_t next;
+ r = get_key_object(m_it->key(), &next);
+ ceph_assert(r != -1);
+ if (next.shard_id != oid.shard_id ||
+ next.hobj.pool != oid.hobj.pool ||
+ next.hobj.get_bitwise_key_u32() != oid.hobj.get_bitwise_key_u32()) {
+ break;
+ }
+ oid = next;
+ }
+
+ m_chunk_iter = m_chunk.begin();
+ return true;
+ }
+};
+
+} // anonymous namespace
+
// Garbage Collector
void BlueStore::GarbageCollector::process_protrusive_extents(
auto start_time = mono_clock::now();
int r = 0;
ghobject_t static_next;
- KeyValueDB::Iterator it;
+ std::unique_ptr<CollectionListIterator> it;
string temp_start_key, temp_end_key;
string start_key, end_key;
bool set_next = false;
- string pend;
+ ghobject_t pend;
bool temp;
if (!pnext)
<< " and " << pretty_binary_string(start_key)
<< " to " << pretty_binary_string(end_key)
<< " start " << start << dendl;
- it = db->get_iterator(PREFIX_OBJ);
+ it = std::make_unique<CollectionListIterator>(db->get_iterator(PREFIX_OBJ));
if (start == ghobject_t() ||
start.hobj == hobject_t() ||
start == c->cid.get_min_hobj()) {
it->lower_bound(k);
}
if (end.hobj.is_max()) {
- pend = temp ? temp_end_key : end_key;
+ if (temp)
+ get_key_object(temp_end_key, &pend);
+ else
+ get_key_object(end_key, &pend);
} else {
- get_object_key(cct, end, &end_key);
if (end.hobj.is_temp()) {
if (temp)
- pend = end_key;
+ pend = end;
else
- goto out;
+ goto out;
} else {
- pend = temp ? temp_end_key : end_key;
+ if (temp)
+ get_key_object(temp_end_key, &pend);
+ else
+ pend = end;
}
}
- dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
+ dout(20) << __func__ << " pend " << pend << dendl;
while (true) {
- if (!it->valid() || it->key() >= pend) {
+ if (!it->valid() || it->oid() >= pend) {
if (!it->valid())
dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
else
- dout(20) << __func__ << " key " << pretty_binary_string(it->key())
- << " >= " << end << dendl;
+ dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl;
if (temp) {
if (end.hobj.is_temp()) {
break;
dout(30) << __func__ << " switch to non-temp namespace" << dendl;
temp = false;
it->upper_bound(start_key);
- pend = end_key;
- dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl;
+ if (end.hobj.is_max())
+ get_key_object(end_key, &pend);
+ else
+ pend = end;
+ dout(30) << __func__ << " pend " << pend << dendl;
continue;
}
break;
it->next();
continue;
}
- ghobject_t oid;
- int r = get_key_object(it->key(), &oid);
- ceph_assert(r == 0);
+ ghobject_t oid = it->oid();
dout(20) << __func__ << " oid " << oid << " end " << end << dendl;
if (ls->size() >= (unsigned)max) {
dout(20) << __func__ << " reached max " << max << dendl;
ASSERT_EQ(r, 0);
}
+TEST_P(StoreTest, HashCollisionSorting) {
+ if (string(GetParam()) == "kstore") // TODO: fix kstore
+ return;
+
+ char buf121664318_1[] = {18, -119, -121, -111, 0};
+ char buf121664318_2[] = {19, 127, -121, 32, 0};
+ char buf121664318_3[] = {19, -118, 15, 19, 0};
+ char buf121664318_4[] = {28, 27, -116, -113, 0};
+ char buf121664318_5[] = {28, 27, -115, -124, 0};
+
+ char buf121666222_1[] = {18, -119, -120, -111, 0};
+ char buf121666222_2[] = {19, 127, -120, 32, 0};
+ char buf121666222_3[] = {19, -118, 15, 30, 0};
+ char buf121666222_4[] = {29, 17, -126, -113, 0};
+ char buf121666222_5[] = {29, 17, -125, -124, 0};
+
+ std::map<uint32_t, std::vector<std::string>> object_names = {
+ {121664318, {{buf121664318_1},
+ {buf121664318_2},
+ {buf121664318_3},
+ {buf121664318_4},
+ {buf121664318_5}}},
+ {121666222, {{buf121666222_1},
+ {buf121666222_2},
+ {buf121666222_3},
+ {buf121666222_4},
+ {buf121666222_5}}}};
+
+ int64_t poolid = 111;
+ coll_t cid = coll_t(spg_t(pg_t(0, poolid), shard_id_t::NO_SHARD));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ int r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+
+ std::set<ghobject_t> created;
+ for (auto &[hash, names] : object_names) {
+ for (auto &name : names) {
+ ghobject_t hoid(hobject_t(sobject_t(name, CEPH_NOSNAP),
+ string(),
+ hash,
+ poolid,
+ string()));
+ ASSERT_EQ(hash, hoid.hobj.get_hash());
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ int r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ created.insert(hoid);
+ }
+ }
+
+ vector<ghobject_t> objects;
+ int r = store->collection_list(ch, ghobject_t(), ghobject_t::get_max(),
+ INT_MAX, &objects, 0);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(created.size(), objects.size());
+ auto it = objects.begin();
+ for (auto &hoid : created) {
+ ASSERT_EQ(hoid, *it);
+ it++;
+ }
+
+ for (auto i = created.begin(); i != created.end(); i++) {
+ auto j = i;
+ for (j++; j != created.end(); j++) {
+ std::set<ghobject_t> created_sub(i, j);
+ objects.clear();
+ ghobject_t next;
+ r = store->collection_list(ch, *i, ghobject_t::get_max(),
+ created_sub.size(),
+ &objects, &next);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(created_sub.size(), objects.size());
+ it = objects.begin();
+ for (auto &hoid : created_sub) {
+ ASSERT_EQ(hoid, *it);
+ it++;
+ }
+ if (j == created.end()) {
+ ASSERT_TRUE(next.is_max());
+ } else {
+ ASSERT_EQ(*j, next);
+ }
+ }
+ }
+
+ for (auto i = created.begin(); i != created.end(); i++) {
+ auto j = i;
+ for (j++; j != created.end(); j++) {
+ std::set<ghobject_t> created_sub(i, j);
+ objects.clear();
+ ghobject_t next;
+ r = store->collection_list(ch, *i, *j, INT_MAX, &objects, &next);
+ ASSERT_EQ(r, 0);
+ ASSERT_EQ(created_sub.size(), objects.size());
+ it = objects.begin();
+ for (auto &hoid : created_sub) {
+ ASSERT_EQ(hoid, *it);
+ it++;
+ }
+ if (j == created.end()) {
+ ASSERT_TRUE(next.is_max());
+ } else {
+ ASSERT_EQ(*j, next);
+ }
+ }
+ }
+}
+
TEST_P(StoreTest, ScrubTest) {
int64_t poolid = 111;
coll_t cid(spg_t(pg_t(0, poolid),shard_id_t(1)));