From 19c39faead87655d174965d054de8bbbe4d79067 Mon Sep 17 00:00:00 2001 From: Haomai Wang Date: Fri, 22 Jan 2016 10:28:42 +0800 Subject: [PATCH] KeyValueStore: Kill this Signed-off-by: Haomai Wang --- .../keyvaluestore-config-ref.rst | 94 - src/CMakeLists.txt | 2 - src/common/config_opts.h | 16 +- src/kv/KineticStore.cc | 2 +- src/os/Makefile.am | 4 - src/os/ObjectStore.cc | 5 - src/os/keyvaluestore/GenericObjectMap.cc | 1127 ------ src/os/keyvaluestore/GenericObjectMap.h | 429 --- src/os/keyvaluestore/KeyValueStore.cc | 3015 ----------------- src/os/keyvaluestore/KeyValueStore.h | 700 ---- src/test/ceph_objectstore_tool.py | 4 +- src/test/objectstore/store_test.cc | 3 - src/tools/ceph_objectstore_tool.cc | 9 +- 13 files changed, 6 insertions(+), 5404 deletions(-) delete mode 100644 doc/rados/configuration/keyvaluestore-config-ref.rst delete mode 100644 src/os/keyvaluestore/GenericObjectMap.cc delete mode 100644 src/os/keyvaluestore/GenericObjectMap.h delete mode 100644 src/os/keyvaluestore/KeyValueStore.cc delete mode 100644 src/os/keyvaluestore/KeyValueStore.h diff --git a/doc/rados/configuration/keyvaluestore-config-ref.rst b/doc/rados/configuration/keyvaluestore-config-ref.rst deleted file mode 100644 index 103180792510..000000000000 --- a/doc/rados/configuration/keyvaluestore-config-ref.rst +++ /dev/null @@ -1,94 +0,0 @@ -=============================== - KeyValueStore Config Reference -=============================== - -``KeyValueStore`` is an alternative OSD backend compared to FileStore. -Currently, it uses LevelDB as backend. ``KeyValueStore`` doesn't need journal -device. Each operation will flush into the backend directly. - - -``keyvaluestore backend`` - -:Description: The backend used by ``KeyValueStore``. -:Type: String -:Required: No -:Default: ``leveldb`` - - -.. index:: keyvaluestore; queue - -Queue -===== - -The following settings provide limits on the size of the ``KeyValueStore`` -queue. - -``keyvaluestore queue max ops`` - -:Description: Defines the maximum number of operations in progress the - ``KeyValueStore`` accepts before blocking on queuing new operations. - -:Type: Integer -:Required: No. Minimal impact on performance. -:Default: ``50`` - - -``keyvaluestore queue max bytes`` - -:Description: The maximum number of bytes for an operation. -:Type: Integer -:Required: No -:Default: ``100 << 20`` - -.. index:: keyvaluestore; thread - -Thread -======== - - -``keyvaluestore op threads`` - -:Description: The number of ``KeyValueStore`` operation threads that execute in parallel. -:Type: Integer -:Required: No -:Default: ``2`` - - -``keyvaluestore op thread timeout`` - -:Description: The timeout for a ``KeyValueStore`` operation thread (in seconds). -:Type: Integer -:Required: No -:Default: ``60`` - - -``keyvaluestore op thread suicide timeout`` - -:Description: The timeout for a commit operation before canceling the commit (in seconds). -:Type: Integer -:Required: No -:Default: ``180`` - - -Misc -==== - - -``keyvaluestore default strip size`` - -:Description: Each object will be split into multiple key/value pairs and - stored in the backend. **Note:** The size of the workload has - a significant impact on performance. -:Type: Integer -:Required: No -:Default: ``4096`` - - -``keyvaluestore header cache size`` - -:Description: The size of the header cache (identical to ``inode`` in the local - filesystem). A larger cache size enhances performance. - -:Type: Integer -:Required: No -:Default: ``4096`` diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1be14fbc7fee..429992efe5f4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -641,8 +641,6 @@ set(libos_srcs os/filestore/LFNIndex.cc os/filestore/WBThrottle.cc os/filestore/ZFSFileStoreBackend.cc - os/keyvaluestore/GenericObjectMap.cc - os/keyvaluestore/KeyValueStore.cc os/memstore/MemStore.cc os/kstore/KStore.cc os/kstore/kstore_types.cc diff --git a/src/common/config_opts.h b/src/common/config_opts.h index dcbb7e027c26..d6d406255db4 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -117,7 +117,6 @@ SUBSYS(osd, 0, 5) SUBSYS(optracker, 0, 5) SUBSYS(objclass, 0, 5) SUBSYS(filestore, 1, 3) -SUBSYS(keyvaluestore, 1, 3) SUBSYS(journal, 1, 3) SUBSYS(ms, 0, 5) SUBSYS(mon, 1, 5) @@ -144,6 +143,7 @@ SUBSYS(bdev, 1, 3) SUBSYS(kstore, 1, 5) SUBSYS(rocksdb, 4, 5) SUBSYS(leveldb, 4, 5) +SUBSYS(kinetic, 1, 5) OPTION(key, OPT_STR, "") OPTION(keyfile, OPT_STR, "") @@ -788,8 +788,6 @@ OPTION(kinetic_use_ssl, OPT_BOOL, false) // whether to secure kinetic traffic wi OPTION(rocksdb_separate_wal_dir, OPT_BOOL, false) // use $path.wal for wal OPTION(rocksdb_db_paths, OPT_STR, "") // path,size( path,size)* OPTION(rocksdb_log_to_ceph_log, OPT_BOOL, true) // log to ceph log -// rocksdb options that will be used for keyvaluestore(if backend is rocksdb) -OPTION(keyvaluestore_rocksdb_options, OPT_STR, "") // rocksdb options that will be used for omap(if omap_backend is rocksdb) OPTION(filestore_rocksdb_options, OPT_STR, "") // rocksdb options that will be used in monstore @@ -1014,18 +1012,6 @@ OPTION(journal_dio, OPT_BOOL, true) OPTION(journal_aio, OPT_BOOL, true) OPTION(journal_force_aio, OPT_BOOL, false) -OPTION(keyvaluestore_queue_max_ops, OPT_INT, 50) -OPTION(keyvaluestore_queue_max_bytes, OPT_INT, 100 << 20) -OPTION(keyvaluestore_debug_check_backend, OPT_BOOL, 0) // Expensive debugging check on sync -OPTION(keyvaluestore_op_threads, OPT_INT, 2) -OPTION(keyvaluestore_op_thread_timeout, OPT_INT, 60) -OPTION(keyvaluestore_op_thread_suicide_timeout, OPT_INT, 180) -OPTION(keyvaluestore_default_strip_size, OPT_INT, 4096) // Only affect new object -OPTION(keyvaluestore_max_expected_write_size, OPT_U64, 1ULL << 24) // bytes -OPTION(keyvaluestore_header_cache_size, OPT_INT, 4096) // Header cache size -OPTION(keyvaluestore_backend, OPT_STR, "leveldb") -OPTION(keyvaluestore_dump_file, OPT_STR, "") // file onto which store transaction dumps - // max bytes to search ahead in journal searching for corruption OPTION(journal_max_corrupt_search, OPT_U64, 10<<20) OPTION(journal_block_align, OPT_BOOL, true) diff --git a/src/kv/KineticStore.cc b/src/kv/KineticStore.cc index 71559f03d3ee..7a23714d4d52 100644 --- a/src/kv/KineticStore.cc +++ b/src/kv/KineticStore.cc @@ -11,7 +11,7 @@ using std::string; #include "common/perf_counters.h" -#define dout_subsys ceph_subsys_keyvaluestore +#define dout_subsys ceph_subsys_kinetic int KineticStore::init() { diff --git a/src/os/Makefile.am b/src/os/Makefile.am index d7cdbf10331a..4221eb8424aa 100644 --- a/src/os/Makefile.am +++ b/src/os/Makefile.am @@ -24,8 +24,6 @@ libos_a_SOURCES = \ os/filestore/LFNIndex.cc \ os/filestore/WBThrottle.cc \ os/fs/FS.cc \ - os/keyvaluestore/GenericObjectMap.cc \ - os/keyvaluestore/KeyValueStore.cc \ os/kstore/kv.cc \ os/kstore/KStore.cc \ os/memstore/MemStore.cc \ @@ -86,8 +84,6 @@ noinst_HEADERS += \ os/fs/btrfs_ioctl.h \ os/fs/FS.h \ os/fs/XFS.h \ - os/keyvaluestore/GenericObjectMap.h \ - os/keyvaluestore/KeyValueStore.h \ os/kstore/kstore_types.h \ os/kstore/KStore.h \ os/kstore/kv.h \ diff --git a/src/os/ObjectStore.cc b/src/os/ObjectStore.cc index 62ec739e5ab6..93afd84896ef 100644 --- a/src/os/ObjectStore.cc +++ b/src/os/ObjectStore.cc @@ -20,7 +20,6 @@ #include "filestore/FileStore.h" #include "memstore/MemStore.h" -#include "keyvaluestore/KeyValueStore.h" #if defined(HAVE_LIBAIO) #include "bluestore/BlueStore.h" #endif @@ -73,10 +72,6 @@ ObjectStore *ObjectStore::create(CephContext *cct, if (type == "memstore") { return new MemStore(cct, data); } - if (type == "keyvaluestore" && - cct->check_experimental_feature_enabled("keyvaluestore")) { - return new KeyValueStore(data); - } #if defined(HAVE_LIBAIO) if (type == "bluestore" && cct->check_experimental_feature_enabled("bluestore")) { diff --git a/src/os/keyvaluestore/GenericObjectMap.cc b/src/os/keyvaluestore/GenericObjectMap.cc deleted file mode 100644 index 3453bc042be0..000000000000 --- a/src/os/keyvaluestore/GenericObjectMap.cc +++ /dev/null @@ -1,1127 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2013 UnitedStack - * - * Author: Haomai Wang - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "include/int_types.h" -#include "include/buffer.h" - -#include -#include -#include -#include -#include - -#include - -#include "GenericObjectMap.h" -#include "common/debug.h" -#include "common/config.h" -#include "include/assert.h" - -#define dout_subsys ceph_subsys_keyvaluestore - -const string GenericObjectMap::GLOBAL_STATE_KEY = "HEADER"; - -const string GenericObjectMap::USER_PREFIX = "_SEQ_"; -const string GenericObjectMap::INTERN_PREFIX = "_INTERN_"; -const string GenericObjectMap::COMPLETE_PREFIX = "_COMPLETE_"; -const string GenericObjectMap::GHOBJECT_TO_SEQ_PREFIX = "_GHOBJTOSEQ_"; -const string GenericObjectMap::PARENT_KEY = "_PARENT_HEADER_"; - -// In order to make right ordering for leveldb matching with hobject_t, -// so use "!" to separated -const string GenericObjectMap::GHOBJECT_KEY_SEP_S = "!"; -const char GenericObjectMap::GHOBJECT_KEY_SEP_C = '!'; -const char GenericObjectMap::GHOBJECT_KEY_ENDING = 0xFF; - -// ============== GenericObjectMap Key Function ================= - -static void append_escaped(const string &in, string *out) -{ - for (string::const_iterator i = in.begin(); i != in.end(); ++i) { - if (*i == '%') { - out->push_back('%'); - out->push_back('p'); - } else if (*i == '.') { - out->push_back('%'); - out->push_back('e'); - } else if (*i == GenericObjectMap::GHOBJECT_KEY_SEP_C) { - out->push_back('%'); - out->push_back('u'); - } else if (*i == '!') { - out->push_back('%'); - out->push_back('s'); - } else { - out->push_back(*i); - } - } -} - -static bool append_unescaped(string::const_iterator begin, - string::const_iterator end, - string *out) -{ - for (string::const_iterator i = begin; i != end; ++i) { - if (*i == '%') { - ++i; - if (*i == 'p') - out->push_back('%'); - else if (*i == 'e') - out->push_back('.'); - else if (*i == 'u') - out->push_back(GenericObjectMap::GHOBJECT_KEY_SEP_C); - else if (*i == 's') - out->push_back('!'); - else - return false; - } else { - out->push_back(*i); - } - } - return true; -} - -string GenericObjectMap::header_key(const coll_t &cid) -{ - string full_name; - - append_escaped(cid.to_str(), &full_name); - full_name.append(GHOBJECT_KEY_SEP_S); - return full_name; -} - -string GenericObjectMap::header_key(const coll_t &cid, const ghobject_t &oid) -{ - string full_name; - - append_escaped(cid.to_str(), &full_name); - full_name.append(GHOBJECT_KEY_SEP_S); - - char buf[PATH_MAX]; - char *t; - char *end; - - // make field ordering match with ghobject_t compare operations - t = buf; - end = t + sizeof(buf); - if (oid.shard_id == shard_id_t::NO_SHARD) { - // otherwise ff will sort *after* 0, not before. - full_name += "--"; - } else { - t += snprintf(t, end - t, "%02x", (int)oid.shard_id); - full_name += string(buf); - } - full_name.append(GHOBJECT_KEY_SEP_S); - - t = buf; - t += snprintf(t, end - t, "%016llx", - (long long)(oid.hobj.pool + 0x8000000000000000)); - full_name += string(buf); - full_name.append(GHOBJECT_KEY_SEP_S); - - t = buf; - snprintf(t, end - t, "%.*X", (int)(sizeof(oid.hobj.get_hash())*2), - (uint32_t)oid.hobj.get_bitwise_key_u32()); - full_name += string(buf); - full_name.append(GHOBJECT_KEY_SEP_S); - - append_escaped(oid.hobj.nspace, &full_name); - full_name.append(GHOBJECT_KEY_SEP_S); - - append_escaped(oid.hobj.get_key(), &full_name); - full_name.append(GHOBJECT_KEY_SEP_S); - - append_escaped(oid.hobj.oid.name, &full_name); - full_name.append(GHOBJECT_KEY_SEP_S); - - t = buf; - if (oid.hobj.snap == CEPH_NOSNAP) - t += snprintf(t, end - t, "head"); - else if (oid.hobj.snap == CEPH_SNAPDIR) - t += snprintf(t, end - t, "snapdir"); - else - // Keep length align - t += snprintf(t, end - t, "%016llx", (long long unsigned)oid.hobj.snap); - full_name += string(buf); - - if (oid.generation != ghobject_t::NO_GEN) { - full_name.append(GHOBJECT_KEY_SEP_S); - - t = buf; - end = t + sizeof(buf); - t += snprintf(t, end - t, "%016llx", (long long unsigned)oid.generation); - full_name += string(buf); - } - - full_name.append(1, GHOBJECT_KEY_ENDING); - - return full_name; -} - -bool GenericObjectMap::parse_header_key(const string &long_name, - coll_t *out_coll, ghobject_t *out) -{ - string coll; - string name; - string key; - string ns; - uint32_t hash; - snapid_t snap; - int64_t pool; - gen_t generation = ghobject_t::NO_GEN; - shard_id_t shard_id = shard_id_t::NO_SHARD; - - string::const_iterator current = long_name.begin(); - string::const_iterator end; - - for (end = current; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ; - if (!append_unescaped(current, end, &coll)) - return false; - - current = ++end; - for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ; - if (end == long_name.end()) - return false; - string shardstring = string(current, end); - if (shardstring == "--") - shard_id = shard_id_t::NO_SHARD; - else - shard_id = (shard_id_t)strtoul(shardstring.c_str(), NULL, 16); - - current = ++end; - for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ; - if (end == long_name.end()) - return false; - string pstring(current, end); - pool = strtoull(pstring.c_str(), NULL, 16); - pool -= 0x8000000000000000; - - current = ++end; - for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ; - if (end == long_name.end()) - return false; - string hash_str(current, end); - sscanf(hash_str.c_str(), "%X", &hash); - - current = ++end; - for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ; - if (end == long_name.end()) - return false; - if (!append_unescaped(current, end, &ns)) - return false; - - current = ++end; - for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ; - if (end == long_name.end()) - return false; - if (!append_unescaped(current, end, &key)) - return false; - - current = ++end; - for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ; - if (end == long_name.end()) - return false; - if (!append_unescaped(current, end, &name)) - return false; - - current = ++end; - for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C && - *end != GHOBJECT_KEY_ENDING; ++end) ; - if (end == long_name.end()) - return false; - string snap_str(current, end); - if (snap_str == "head") - snap = CEPH_NOSNAP; - else if (snap_str == "snapdir") - snap = CEPH_SNAPDIR; - else - snap = strtoull(snap_str.c_str(), NULL, 16); - - // Optional generation/shard_id - string genstring; - if (*end == GHOBJECT_KEY_SEP_C) { - current = ++end; - for ( ; end != long_name.end() && *end != GHOBJECT_KEY_ENDING; ++end) ; - if (end != long_name.end()) - return false; - genstring = string(current, end); - generation = (gen_t)strtoull(genstring.c_str(), NULL, 16); - } - - if (out) { - (*out) = ghobject_t(hobject_t(name, key, snap, - hobject_t::_reverse_bits(hash), - (int64_t)pool, ns), - generation, shard_id); - } - - if (out_coll) { - bool valid = out_coll->parse(coll); - assert(valid); - } - - return true; -} - - -// ============== GenericObjectMap Prefix ================= - -string GenericObjectMap::user_prefix(Header header, const string &prefix) -{ - return USER_PREFIX + seq_key(header->seq) + prefix; -} - -string GenericObjectMap::complete_prefix(Header header) -{ - return INTERN_PREFIX + seq_key(header->seq) + COMPLETE_PREFIX; -} - -string GenericObjectMap::parent_seq_prefix(uint64_t seq) -{ - return INTERN_PREFIX + seq_key(seq) + PARENT_KEY; -} - - -// ============== GenericObjectMapIteratorImpl ================= - -int GenericObjectMap::GenericObjectMapIteratorImpl::init() -{ - invalid = false; - if (ready) { - return 0; - } - - assert(!parent_iter); - if (header->parent) { - Header parent = map->lookup_parent(header); - if (!parent) { - assert(0); - return -EINVAL; - } - parent_iter.reset(new GenericObjectMapIteratorImpl(map, parent, prefix)); - } - - key_iter = map->db->get_iterator(map->user_prefix(header, prefix)); - assert(key_iter); - complete_iter = map->db->get_iterator(map->complete_prefix(header)); - assert(complete_iter); - cur_iter = key_iter; - assert(cur_iter); - ready = true; - return 0; -} - -ObjectMap::ObjectMapIterator GenericObjectMap::get_iterator( - const coll_t &cid, const ghobject_t &oid, const string &prefix) -{ - Header header = lookup_header(cid, oid); - if (!header) - return ObjectMap::ObjectMapIterator(new EmptyIteratorImpl()); - return _get_iterator(header, prefix); -} - -int GenericObjectMap::GenericObjectMapIteratorImpl::seek_to_first() -{ - init(); - r = 0; - if (parent_iter) { - r = parent_iter->seek_to_first(); - if (r < 0) - return r; - } - r = key_iter->seek_to_first(); - if (r < 0) - return r; - return adjust(); -} - -int GenericObjectMap::GenericObjectMapIteratorImpl::seek_to_last() -{ - init(); - r = 0; - if (parent_iter) { - r = parent_iter->seek_to_last(); - if (r < 0) - return r; - if (parent_iter->valid()) - r = parent_iter->next(); - if (r < 0) - return r; - } - r = key_iter->seek_to_last(); - if (r < 0) - return r; - if (key_iter->valid()) - r = key_iter->next(); - if (r < 0) - return r; - return adjust(); -} - -int GenericObjectMap::GenericObjectMapIteratorImpl::lower_bound(const string &to) -{ - init(); - r = 0; - if (parent_iter) { - r = parent_iter->lower_bound(to); - if (r < 0) - return r; - } - r = key_iter->lower_bound(to); - if (r < 0) - return r; - return adjust(); -} - -int GenericObjectMap::GenericObjectMapIteratorImpl::upper_bound(const string &after) -{ - init(); - r = 0; - if (parent_iter) { - r = parent_iter->upper_bound(after); - if (r < 0) - return r; - } - r = key_iter->upper_bound(after); - if (r < 0) - return r; - return adjust(); -} - -bool GenericObjectMap::GenericObjectMapIteratorImpl::valid() -{ - bool valid = !invalid && ready; - assert(!valid || cur_iter->valid()); - return valid; -} - -bool GenericObjectMap::GenericObjectMapIteratorImpl::valid_parent() -{ - if (parent_iter && parent_iter->valid() && - (!key_iter->valid() || key_iter->key() > parent_iter->key())) - return true; - return false; -} - -int GenericObjectMap::GenericObjectMapIteratorImpl::next(bool validate) -{ - assert(cur_iter->valid()); - assert(valid()); - cur_iter->next(); - return adjust(); -} - -int GenericObjectMap::GenericObjectMapIteratorImpl::next_parent() -{ - if (!parent_iter || !parent_iter->valid()) { - invalid = true; - return 0; - } - r = next(); - if (r < 0) - return r; - if (!valid() || on_parent() || !parent_iter->valid()) - return 0; - - return lower_bound(parent_iter->key()); -} - -int GenericObjectMap::GenericObjectMapIteratorImpl::in_complete_region( - const string &to_test, string *begin, string *end) -{ - complete_iter->upper_bound(to_test); - if (complete_iter->valid()) - complete_iter->prev(); - else - complete_iter->seek_to_last(); - - if (!complete_iter->valid()) - return false; - - string _end; - if (begin) - *begin = complete_iter->key(); - _end = string(complete_iter->value().c_str()); - if (end) - *end = _end; - return (to_test >= complete_iter->key()) && (!_end.size() || _end > to_test); -} - -/** - * Moves parent_iter to the next position both out of the complete_region and - * not equal to key_iter. Then, we set cur_iter to parent_iter if valid and - * less than key_iter and key_iter otherwise. - */ -int GenericObjectMap::GenericObjectMapIteratorImpl::adjust() -{ - string begin, end; - while (parent_iter && parent_iter->valid()) { - if (in_complete_region(parent_iter->key(), &begin, &end)) { - if (end.size() == 0) { - parent_iter->seek_to_last(); - if (parent_iter->valid()) - parent_iter->next(); - } else { - parent_iter->lower_bound(end); - } - } else if (key_iter->valid() && key_iter->key() == parent_iter->key()) { - parent_iter->next(); - } else { - break; - } - } - if (valid_parent()) { - cur_iter = parent_iter; - } else if (key_iter->valid()) { - cur_iter = key_iter; - } else { - invalid = true; - } - assert(invalid || cur_iter->valid()); - return 0; -} - -string GenericObjectMap::GenericObjectMapIteratorImpl::key() -{ - return cur_iter->key(); -} - -bufferlist GenericObjectMap::GenericObjectMapIteratorImpl::value() -{ - return cur_iter->value(); -} - -int GenericObjectMap::GenericObjectMapIteratorImpl::status() -{ - return r; -} - - -// ============== GenericObjectMap Public API ================= - -void GenericObjectMap::set_keys(const Header header, - const string &prefix, - const map &set, - KeyValueDB::Transaction t) -{ - t->set(user_prefix(header, prefix), set); -} - -int GenericObjectMap::clear(const Header header, - KeyValueDB::Transaction t) -{ - remove_header(header->cid, header->oid, header, t); - assert(header->num_children > 0); - header->num_children--; - int r = _clear(header, t); - if (r < 0) - return r; - return 0; -} - -int GenericObjectMap::rm_keys(const Header header, - const string &prefix, - const set &buffered_keys, - const set &to_clear, - KeyValueDB::Transaction t) -{ - t->rmkeys(user_prefix(header, prefix), to_clear); - if (!header->parent) { - return 0; - } - - // Copy up keys from parent around to_clear - int keep_parent; - { - GenericObjectMapIterator iter = _get_iterator(header, prefix); - iter->seek_to_first(); - map new_complete; - map to_write; - for(set::const_iterator i = to_clear.begin(); - i != to_clear.end(); ) { - unsigned copied = 0; - iter->lower_bound(*i); - ++i; - if (!iter->valid()) - break; - string begin = iter->key(); - if (!iter->on_parent()) - iter->next_parent(); - if (new_complete.size() && new_complete.rbegin()->second == begin) { - begin = new_complete.rbegin()->first; - } - while (iter->valid() && copied < 20) { - if (!to_clear.count(iter->key()) && !buffered_keys.count(iter->key())) - to_write[iter->key()].append(iter->value()); - if (i != to_clear.end() && *i <= iter->key()) { - ++i; - copied = 0; - } - - iter->next_parent(); - copied++; - } - if (iter->valid()) { - new_complete[begin] = iter->key(); - } else { - new_complete[begin] = ""; - break; - } - } - t->set(user_prefix(header, prefix), to_write); - merge_new_complete(header, new_complete, iter, t); - keep_parent = need_parent(iter); - if (keep_parent < 0) - return keep_parent; - } - - if (!keep_parent) { - Header parent = lookup_parent(header); - if (!parent) - return -EINVAL; - parent->num_children--; - _clear(parent, t); - header->parent = 0; - set_header(header->cid, header->oid, *header, t); - t->rmkeys_by_prefix(complete_prefix(header)); - } - - return 0; -} - -int GenericObjectMap::get(const coll_t &cid, const ghobject_t &oid, - const string &prefix, - map *out) -{ - Header header = lookup_header(cid, oid); - if (!header) - return -ENOENT; - - ObjectMap::ObjectMapIterator iter = _get_iterator(header, prefix); - for (iter->seek_to_first(); iter->valid(); iter->next()) { - if (iter->status()) - return iter->status(); - out->insert(make_pair(iter->key(), iter->value())); - } - - return 0; -} - -int GenericObjectMap::get_keys(const coll_t &cid, const ghobject_t &oid, - const string &prefix, - set *keys) -{ - Header header = lookup_header(cid, oid); - if (!header) - return -ENOENT; - - ObjectMap::ObjectMapIterator iter = _get_iterator(header, prefix); - for (iter->seek_to_first(); iter->valid(); iter->next()) { - if (iter->status()) - return iter->status(); - keys->insert(iter->key()); - } - return 0; -} - -int GenericObjectMap::get_values(const coll_t &cid, const ghobject_t &oid, - const string &prefix, - const set &keys, - map *out) -{ - Header header = lookup_header(cid, oid); - if (!header) - return -ENOENT; - return scan(header, prefix, keys, 0, out); -} - -int GenericObjectMap::check_keys(const coll_t &cid, const ghobject_t &oid, - const string &prefix, - const set &keys, - set *out) -{ - Header header = lookup_header(cid, oid); - if (!header) - return -ENOENT; - return scan(header, prefix, keys, out, 0); -} - -void GenericObjectMap::clone(const Header parent, const coll_t &cid, - const ghobject_t &target, - KeyValueDB::Transaction t, - Header *old_header, Header *new_header) -{ - { - Header destination = lookup_header(cid, target); - if (destination) { - remove_header(cid, target, destination, t); - destination->num_children--; - _clear(destination, t); - } - } - - Header source = generate_new_header(parent->cid, parent->oid, parent, t); - Header destination = generate_new_header(cid, target, parent, t); - - destination->data = parent->data; - source->data = parent->data; - - parent->num_children = 2; - set_parent_header(parent, t); - set_header(parent->cid, parent->oid, *source, t); - set_header(cid, target, *destination, t); - - if (new_header) - *old_header = source; - if (new_header) - *new_header = destination; - - // Clone will set parent header and rm_keys wll lookup_parent which will try - // to find parent header. So it will let lookup_parent fail when "clone" and - // "rm_keys" in one transaction. Here have to sync transaction to make - // visiable for lookup_parent - // FIXME: Clear transaction operations here - int r = submit_transaction_sync(t); - assert(r == 0); -} - -void GenericObjectMap::rename(const Header old_header, const coll_t &cid, - const ghobject_t &target, - KeyValueDB::Transaction t) -{ - if (old_header->oid == target && old_header->cid == cid) - return ; - - remove_header(old_header->cid, old_header->oid, old_header, t); - old_header->cid = cid; - old_header->oid = target; - set_header(cid, target, *old_header, t); -} - -int GenericObjectMap::init(bool do_upgrade) -{ - map result; - set to_get; - to_get.insert(GLOBAL_STATE_KEY); - int r = db->get(INTERN_PREFIX, to_get, &result); - if (r < 0) - return r; - if (!result.empty()) { - bufferlist::iterator bliter = result.begin()->second.begin(); - state.decode(bliter); - if (state.v < 1) { // Needs upgrade - if (!do_upgrade) { - dout(1) << "GenericObjbectMap requires an upgrade," - << " set filestore_update_to" - << dendl; - return -ENOTSUP; - } else { - r = upgrade(); - if (r < 0) - return r; - } - } - } else { - // New store - state.v = 1; - state.seq = 1; - } - dout(20) << "(init)genericobjectmap: seq is " << state.seq << dendl; - return 0; -} - -bool GenericObjectMap::check(std::ostream &out) -{ - bool retval = true; - map parent_to_num_children; - map parent_to_actual_num_children; - KeyValueDB::Iterator iter = db->get_iterator(GHOBJECT_TO_SEQ_PREFIX); - - for (iter->seek_to_first(); iter->valid(); iter->next()) { - _Header header; - assert(header.num_children == 1); - header.num_children = 0; // Hack for leaf node - bufferlist bl = iter->value(); - while (true) { - bufferlist::iterator bliter = bl.begin(); - header.decode(bliter); - if (header.seq != 0) - parent_to_actual_num_children[header.seq] = header.num_children; - if (header.parent == 0) - break; - - if (!parent_to_num_children.count(header.parent)) - parent_to_num_children[header.parent] = 0; - parent_to_num_children[header.parent]++; - if (parent_to_actual_num_children.count(header.parent)) - break; - - set to_get; - map got; - to_get.insert(PARENT_KEY); - db->get(parent_seq_prefix(header.parent), to_get, &got); - if (got.empty()) { - out << "Missing: seq " << header.parent << std::endl; - retval = false; - break; - } else { - bl = got.begin()->second; - } - } - } - - for (map::iterator i = parent_to_num_children.begin(); - i != parent_to_num_children.end(); - parent_to_num_children.erase(i++)) { - if (!parent_to_actual_num_children.count(i->first)) - continue; - if (parent_to_actual_num_children[i->first] != i->second) { - out << "Invalid: seq " << i->first << " recorded children: " - << parent_to_actual_num_children[i->first] << " found: " - << i->second << std::endl; - retval = false; - } - parent_to_actual_num_children.erase(i->first); - } - return retval; -} - - -// ============== GenericObjectMap Intern Implementation ================= - -int GenericObjectMap::scan(Header header, - const string &prefix, - const set &in_keys, - set *out_keys, - map *out_values) -{ - ObjectMap::ObjectMapIterator db_iter = _get_iterator(header, prefix); - for (set::const_iterator key_iter = in_keys.begin(); - key_iter != in_keys.end(); - ++key_iter) { - db_iter->lower_bound(*key_iter); - if (db_iter->status()) - return db_iter->status(); - - if (db_iter->valid() && db_iter->key() == *key_iter) { - if (out_keys) - out_keys->insert(*key_iter); - if (out_values) - out_values->insert(make_pair(db_iter->key(), db_iter->value())); - } - } - return 0; -} - -int GenericObjectMap::_clear(Header header, KeyValueDB::Transaction t) -{ - while (1) { - if (header->num_children) { - set_parent_header(header, t); - break; - } - - clear_header(header, t); - if (!header->parent) - break; - - Header parent = lookup_parent(header); - if (!parent) { - return -EINVAL; - } - assert(parent->num_children > 0); - parent->num_children--; - header.swap(parent); - } - return 0; -} - -int GenericObjectMap::merge_new_complete( - Header header, const map &new_complete, - GenericObjectMapIterator iter, KeyValueDB::Transaction t) -{ - KeyValueDB::Iterator complete_iter = db->get_iterator( - complete_prefix(header)); - map::const_iterator i = new_complete.begin(); - set to_remove; - map to_add; - - string begin, end; - while (i != new_complete.end()) { - string new_begin = i->first; - string new_end = i->second; - int r = iter->in_complete_region(new_begin, &begin, &end); - if (r < 0) - return r; - if (r) { - to_remove.insert(begin); - new_begin = begin; - } - ++i; - while (i != new_complete.end()) { - if (!new_end.size() || i->first <= new_end) { - if (!new_end.size() && i->second > new_end) { - new_end = i->second; - } - ++i; - continue; - } - - r = iter->in_complete_region(new_end, &begin, &end); - if (r < 0) - return r; - if (r) { - to_remove.insert(begin); - new_end = end; - continue; - } - break; - } - bufferlist bl; - bl.append(bufferptr(new_end.c_str(), new_end.size() + 1)); - to_add.insert(make_pair(new_begin, bl)); - } - t->rmkeys(complete_prefix(header), to_remove); - t->set(complete_prefix(header), to_add); - return 0; -} - -int GenericObjectMap::need_parent(GenericObjectMapIterator iter) -{ - int r = iter->seek_to_first(); - if (r < 0) - return r; - - if (!iter->valid()) - return 0; - - string begin, end; - if (iter->in_complete_region(iter->key(), &begin, &end) && end == "") { - return 0; - } - return 1; -} - -int GenericObjectMap::write_state(KeyValueDB::Transaction t) -{ - dout(20) << __func__ << " seq is " << state.seq << dendl; - bufferlist bl; - state.encode(bl); - map to_write; - to_write[GLOBAL_STATE_KEY] = bl; - t->set(INTERN_PREFIX, to_write); - return 0; -} - -// NOTE(haomai): It may occur dead lock if thread A hold header A try to header -// B and thread hold header B try to get header A -GenericObjectMap::Header GenericObjectMap::_lookup_header( - const coll_t &cid, const ghobject_t &oid) -{ - set to_get; - to_get.insert(header_key(cid, oid)); - _Header header; - - map out; - - int r = db->get(GHOBJECT_TO_SEQ_PREFIX, to_get, &out); - if (r < 0) - return Header(); - if (out.empty()) - return Header(); - - bufferlist::iterator iter = out.begin()->second.begin(); - header.decode(iter); - - Header ret = Header(new _Header(header)); - return ret; -} - -GenericObjectMap::Header GenericObjectMap::_generate_new_header( - const coll_t &cid, const ghobject_t &oid, Header parent, - KeyValueDB::Transaction t) -{ - Header header = Header(new _Header()); - header->seq = state.seq++; - if (parent) { - header->parent = parent->seq; - } - header->num_children = 1; - header->oid = oid; - header->cid = cid; - - write_state(t); - return header; -} - -GenericObjectMap::Header GenericObjectMap::lookup_parent(Header input) -{ - Mutex::Locker l(header_lock); - map out; - set keys; - keys.insert(PARENT_KEY); - - dout(20) << "lookup_parent: parent " << input->parent - << " for seq " << input->seq << dendl; - - int r = db->get(parent_seq_prefix(input->parent), keys, &out); - if (r < 0) { - assert(0); - return Header(); - } - if (out.empty()) { - assert(0); - return Header(); - } - - Header header = Header(new _Header()); - header->seq = input->parent; - bufferlist::iterator iter = out.begin()->second.begin(); - header->decode(iter); - dout(20) << "lookup_parent: parent seq is " << header->seq << " with parent " - << header->parent << dendl; - return header; -} - -GenericObjectMap::Header GenericObjectMap::lookup_create_header( - const coll_t &cid, const ghobject_t &oid, KeyValueDB::Transaction t) -{ - Mutex::Locker l(header_lock); - Header header = _lookup_header(cid, oid); - if (!header) { - header = _generate_new_header(cid, oid, Header(), t); - set_header(cid, oid, *header, t); - } - return header; -} - -void GenericObjectMap::set_parent_header(Header header, KeyValueDB::Transaction t) -{ - dout(20) << __func__ << " setting seq " << header->seq << dendl; - map to_write; - header->encode(to_write[PARENT_KEY]); - t->set(parent_seq_prefix(header->seq), to_write); -} - -void GenericObjectMap::clear_header(Header header, KeyValueDB::Transaction t) -{ - dout(20) << __func__ << " clearing seq " << header->seq << dendl; - t->rmkeys_by_prefix(user_prefix(header, string())); - t->rmkeys_by_prefix(complete_prefix(header)); - set keys; - keys.insert(PARENT_KEY); - t->rmkeys(parent_seq_prefix(header->seq), keys); -} - -// only remove GHOBJECT_TO_SEQ -void GenericObjectMap::remove_header(const coll_t &cid, - const ghobject_t &oid, Header header, - KeyValueDB::Transaction t) -{ - dout(20) << __func__ << " removing " << header->seq - << " cid " << cid << " oid " << oid << dendl; - set to_remove; - to_remove.insert(header_key(cid, oid)); - t->rmkeys(GHOBJECT_TO_SEQ_PREFIX, to_remove); -} - -void GenericObjectMap::set_header(const coll_t &cid, const ghobject_t &oid, - _Header &header, KeyValueDB::Transaction t) -{ - dout(20) << __func__ << " setting " << header.seq - << " cid " << cid << " oid " << oid << " parent seq " - << header.parent << dendl; - map to_set; - header.encode(to_set[header_key(cid, oid)]); - t->set(GHOBJECT_TO_SEQ_PREFIX, to_set); -} - -int GenericObjectMap::list_objects(const coll_t &cid, ghobject_t start, ghobject_t end, int max, - vector *out, ghobject_t *next) -{ - // FIXME - Mutex::Locker l(header_lock); - if (start.is_max()) - return 0; - - if (start.is_min()) { - vector oids; - - KeyValueDB::Iterator iter = db->get_iterator(GHOBJECT_TO_SEQ_PREFIX); - for (iter->lower_bound(header_key(cid)); iter->valid(); iter->next()) { - bufferlist bl = iter->value(); - bufferlist::iterator bliter = bl.begin(); - _Header header; - header.decode(bliter); - - if (header.cid == cid) - oids.push_back(header.oid); - - break; - } - - if (oids.empty()) { - if (next) - *next = ghobject_t::get_max(); - return 0; - } - start = oids[0]; - } - - int size = 0; - KeyValueDB::Iterator iter = db->get_iterator(GHOBJECT_TO_SEQ_PREFIX); - for (iter->lower_bound(header_key(cid, start)); iter->valid(); iter->next()) { - bufferlist bl = iter->value(); - bufferlist::iterator bliter = bl.begin(); - _Header header; - header.decode(bliter); - - if (header.cid != cid) { - if (next) - *next = ghobject_t::get_max(); - break; - } - - if (max && size >= max) { - if (next) - *next = header.oid; - break; - } - - if (cmp_bitwise(header.oid, end) >= 0) { - if (next) - *next = ghobject_t::get_max(); - break; - } - - assert(cmp_bitwise(start, header.oid) <= 0); - assert(cmp_bitwise(header.oid, end) < 0); - - - size++; - if (out) - out->push_back(header.oid); - start = header.oid; - } - - if (out->size()) - dout(20) << "objects: " << *out << dendl; - - if (!iter->valid()) - if (next) - *next = ghobject_t::get_max(); - - return 0; -} diff --git a/src/os/keyvaluestore/GenericObjectMap.h b/src/os/keyvaluestore/GenericObjectMap.h deleted file mode 100644 index 9417937aea40..000000000000 --- a/src/os/keyvaluestore/GenericObjectMap.h +++ /dev/null @@ -1,429 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2013 UnitedStack - * - * Author: Haomai Wang - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef CEPH_GENERICOBJECTMAP_H -#define CEPH_GENERICOBJECTMAP_H - -#include "include/buffer.h" -#include -#include -#include -#include -#include - -#include "include/memory.h" -#include "os/ObjectMap.h" -#include "kv/KeyValueDB.h" -#include "osd/osd_types.h" -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/simple_cache.hpp" - - -/** - * Genericobjectmap: Provide with key/value associated to ghobject_t APIs to caller - * and avoid concerning too much. Wrap and combine KeyValueDB/ObjectMap APIs - * with ghobject_t and adding clone capacity. - * - * Prefix space structure: - * - * - GHOBJECT_TO_SEQ: Contains leaf mapping from ghobject_t->Header(including - * hobj.seq and related metadata) - * - INTERN_PREFIX: GLOBAL_STATE_KEY - contains the global state - * @see State - * @see write_state - * @see init - * @see generate_new_header - * - INTERN_PREFIX + header_key(header->seq) + COMPLETE_PREFIX: see below - * - INTERN_PREFIX + header_key(header->seq) + PARENT_KEY - * : used to store parent header(same as headers in GHOBJECT_TO_SEQ) - * - USER_PREFIX + header_key(header->seq) + [CUSTOM_PREFIX] - * : key->value which set by callers - * - * For each node (represented by a header), we - * store three mappings: the key mapping, the complete mapping, and the parent. - * The complete mapping (COMPLETE_PREFIX space) is key->key. Each x->y entry in - * this mapping indicates that the key mapping contains all entries on [x,y). - * Note, max string is represented by "", so ""->"" indicates that the parent - * is unnecessary (@see rm_keys). When looking up a key not contained in the - * the complete set, we have to check the parent if we don't find it in the - * key set. During rm_keys, we copy keys from the parent and update the - * complete set to reflect the change @see rm_keys. - */ - -// This class only provide basic read capacity, suggest inherit it to -// implement write transaction to use it. @see StripObjectMap -class GenericObjectMap { - public: - boost::scoped_ptr db; - - /** - * Serializes access to next_seq as well as the in_use set - */ - Mutex header_lock; - - GenericObjectMap(KeyValueDB *db) : db(db), header_lock("GenericObjectMap") {} - - int get( - const coll_t &cid, - const ghobject_t &oid, - const string &prefix, - map *out - ); - - int get_keys( - const coll_t &cid, - const ghobject_t &oid, - const string &prefix, - set *keys - ); - - int get_values( - const coll_t &cid, - const ghobject_t &oid, - const string &prefix, - const set &keys, - map *out - ); - - int check_keys( - const coll_t &cid, - const ghobject_t &oid, - const string &prefix, - const set &keys, - set *out - ); - - /// Read initial state from backing store - int init(bool upgrade = false); - - /// Upgrade store to current version - int upgrade() {return 0;} - - /// Consistency check, debug, there must be no parallel writes - bool check(std::ostream &out); - - /// Util, list all objects, there must be no other concurrent access - int list_objects(const coll_t &cid, ghobject_t start, ghobject_t end, int max, - vector *objs, ///< [out] objects - ghobject_t *next); - - ObjectMap::ObjectMapIterator get_iterator(const coll_t &cid, - const ghobject_t &oid, - const string &prefix); - - KeyValueDB::Transaction get_transaction() { return db->get_transaction(); } - int submit_transaction(KeyValueDB::Transaction t) { - return db->submit_transaction(t); - } - int submit_transaction_sync(KeyValueDB::Transaction t) { - return db->submit_transaction_sync(t); - } - - /// persistent state for store @see generate_header - struct State { - __u8 v; - uint64_t seq; - State() : v(0), seq(1) {} - State(uint64_t seq) : v(0), seq(seq) {} - - void encode(bufferlist &bl) const { - ENCODE_START(1, 1, bl); - ::encode(v, bl); - ::encode(seq, bl); - ENCODE_FINISH(bl); - } - - void decode(bufferlist::iterator &bl) { - DECODE_START(1, bl); - ::decode(v, bl); - ::decode(seq, bl); - DECODE_FINISH(bl); - } - - void dump(Formatter *f) const { - f->dump_unsigned("seq", seq); - } - - static void generate_test_instances(list &o) { - o.push_back(new State(0)); - o.push_back(new State(20)); - } - } state; - - struct _Header { - uint64_t seq; - uint64_t parent; - uint64_t num_children; - - coll_t cid; - ghobject_t oid; - - // Used by successor - bufferlist data; - - void encode(bufferlist &bl) const { - ENCODE_START(1, 1, bl); - ::encode(seq, bl); - ::encode(parent, bl); - ::encode(num_children, bl); - ::encode(cid, bl); - ::encode(oid, bl); - ::encode(data, bl); - ENCODE_FINISH(bl); - } - - void decode(bufferlist::iterator &bl) { - DECODE_START(1, bl); - ::decode(seq, bl); - ::decode(parent, bl); - ::decode(num_children, bl); - ::decode(cid, bl); - ::decode(oid, bl); - ::decode(data, bl); - DECODE_FINISH(bl); - } - - void dump(Formatter *f) const { - f->dump_unsigned("seq", seq); - f->dump_unsigned("parent", parent); - f->dump_unsigned("num_children", num_children); - f->dump_stream("coll") << cid; - f->dump_stream("oid") << oid; - } - - _Header() : seq(0), parent(0), num_children(1) {} - }; - - typedef ceph::shared_ptr<_Header> Header; - - Header lookup_header(const coll_t &cid, const ghobject_t &oid) { - Mutex::Locker l(header_lock); - return _lookup_header(cid, oid); - } - - /// Lookup or create header for c oid - Header lookup_create_header(const coll_t &cid, const ghobject_t &oid, - KeyValueDB::Transaction t); - - /// Set leaf node for c and oid to the value of header - void set_header(const coll_t &cid, const ghobject_t &oid, _Header &header, - KeyValueDB::Transaction t); - - // Move all modify member function to "protect", in order to indicate these - // should be made use of by sub-class - void set_keys( - const Header header, - const string &prefix, - const map &set, - KeyValueDB::Transaction t - ); - - int clear( - const Header header, - KeyValueDB::Transaction t - ); - - int rm_keys( - const Header header, - const string &prefix, - const set &buffered_keys, - const set &to_clear, - KeyValueDB::Transaction t - ); - - void clone( - const Header origin_header, - const coll_t &cid, - const ghobject_t &target, - KeyValueDB::Transaction t, - Header *old_header, - Header *new_header - ); - - void rename( - const Header header, - const coll_t &cid, - const ghobject_t &target, - KeyValueDB::Transaction t - ); - - static const string GLOBAL_STATE_KEY; - static const string PARENT_KEY; - - static const string USER_PREFIX; - static const string INTERN_PREFIX; - static const string PARENT_PREFIX; - static const string COMPLETE_PREFIX; - static const string GHOBJECT_TO_SEQ_PREFIX; - - static const string GHOBJECT_KEY_SEP_S; - static const char GHOBJECT_KEY_SEP_C; - static const char GHOBJECT_KEY_ENDING; - -private: - /// Implicit lock on Header->seq - - static string header_key(const coll_t &cid); - static string header_key(const coll_t &cid, const ghobject_t &oid); - static bool parse_header_key(const string &in, coll_t *c, ghobject_t *oid); - - string seq_key(uint64_t seq) { - char buf[100]; - snprintf(buf, sizeof(buf), "%.*" PRId64, (int)(2*sizeof(seq)), seq); - return string(buf); - } - - string user_prefix(Header header, const string &prefix); - string complete_prefix(Header header); - string parent_seq_prefix(uint64_t seq); - - class EmptyIteratorImpl : public ObjectMap::ObjectMapIteratorImpl { - public: - int seek_to_first() { return 0; } - int seek_to_last() { return 0; } - int upper_bound(const string &after) { return 0; } - int lower_bound(const string &to) { return 0; } - bool valid() { return false; } - int next(bool validate=true) { assert(0); return 0; } - string key() { assert(0); return ""; } - bufferlist value() { assert(0); return bufferlist(); } - int status() { return 0; } - }; - - - /// Iterator - class GenericObjectMapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl { - public: - GenericObjectMap *map; - - /// NOTE: implicit lock on header->seq AND for all ancestors - Header header; - - /// parent_iter == NULL iff no parent - ceph::shared_ptr parent_iter; - KeyValueDB::Iterator key_iter; - KeyValueDB::Iterator complete_iter; - - /// cur_iter points to currently valid iterator - ceph::shared_ptr cur_iter; - int r; - - /// init() called, key_iter, complete_iter, parent_iter filled in - bool ready; - /// past end - bool invalid; - - string prefix; - - GenericObjectMapIteratorImpl(GenericObjectMap *map, Header header, - const string &_prefix) : map(map), header(header), r(0), ready(false), - invalid(true), prefix(_prefix) { } - int seek_to_first(); - int seek_to_last(); - int upper_bound(const string &after); - int lower_bound(const string &to); - bool valid(); - int next(bool validate=true); - string key(); - bufferlist value(); - int status(); - - bool on_parent() { - return cur_iter == parent_iter; - } - - /// skips to next valid parent entry - int next_parent(); - - /// Tests whether to_test is in complete region - int in_complete_region(const string &to_test, ///[in] key to test - string *begin, ///[out] beginning of region - string *end ///[out] end of region - ); ///< @returns true if to_test is in the complete region, else false - - private: - int init(); - bool valid_parent(); - int adjust(); - }; - -protected: - typedef ceph::shared_ptr GenericObjectMapIterator; - GenericObjectMapIterator _get_iterator(Header header, string prefix) { - return GenericObjectMapIterator(new GenericObjectMapIteratorImpl(this, header, prefix)); - } - - Header generate_new_header(const coll_t &cid, const ghobject_t &oid, - Header parent, KeyValueDB::Transaction t) { - Mutex::Locker l(header_lock); - return _generate_new_header(cid, oid, parent, t); - } - - // Scan keys in header into out_keys and out_values (if nonnull) - int scan(Header header, const string &prefix, const set &in_keys, - set *out_keys, map *out_values); - - private: - - /// Removes node corresponding to header - void clear_header(Header header, KeyValueDB::Transaction t); - - /// Set node containing input to new contents - void set_parent_header(Header input, KeyValueDB::Transaction t); - - /// Remove leaf node corresponding to oid in c - void remove_header(const coll_t &cid, const ghobject_t &oid, Header header, - KeyValueDB::Transaction t); - - /** - * Generate new header for c oid with new seq number - * - * Has the side effect of syncronously saving the new GenericObjectMap state - */ - Header _generate_new_header(const coll_t &cid, const ghobject_t &oid, - Header parent, KeyValueDB::Transaction t); - - // Lookup leaf header for c oid - Header _lookup_header(const coll_t &cid, const ghobject_t &oid); - - // Lookup header node for input - Header lookup_parent(Header input); - - // Remove header and all related prefixes - int _clear(Header header, KeyValueDB::Transaction t); - - // Adds to t operations necessary to add new_complete to the complete set - int merge_new_complete(Header header, const map &new_complete, - GenericObjectMapIterator iter, KeyValueDB::Transaction t); - - // Writes out State (mainly next_seq) - int write_state(KeyValueDB::Transaction _t); - - // 0 if the complete set now contains all of key space, < 0 on error, 1 else - int need_parent(GenericObjectMapIterator iter); - - // Copies header entry from parent @see rm_keys - int copy_up_header(Header header, KeyValueDB::Transaction t); - - // Sets header @see set_header - void _set_header(Header header, const bufferlist &bl, - KeyValueDB::Transaction t); -}; -WRITE_CLASS_ENCODER(GenericObjectMap::_Header) -WRITE_CLASS_ENCODER(GenericObjectMap::State) - -#endif diff --git a/src/os/keyvaluestore/KeyValueStore.cc b/src/os/keyvaluestore/KeyValueStore.cc deleted file mode 100644 index 5738c50153a9..000000000000 --- a/src/os/keyvaluestore/KeyValueStore.cc +++ /dev/null @@ -1,3015 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2013 UnitedStack - * - * Author: Haomai Wang - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "include/int_types.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "include/compat.h" - -#include -#include - -#include "KeyValueStore.h" -#include "common/BackTrace.h" -#include "include/types.h" - -#include "osd/osd_types.h" -#include "include/color.h" -#include "include/buffer.h" - -#include "common/debug.h" -#include "common/errno.h" -#include "common/run_cmd.h" -#include "common/safe_io.h" -#include "common/perf_counters.h" -#include "common/sync_filesystem.h" - -#include "common/ceph_crypto.h" -using ceph::crypto::SHA1; - -#include "include/assert.h" - -#include "common/config.h" - -#define dout_subsys ceph_subsys_keyvaluestore - -const string KeyValueStore::OBJECT_STRIP_PREFIX = "_STRIP_"; -const string KeyValueStore::OBJECT_XATTR = "__OBJATTR__"; -const string KeyValueStore::OBJECT_OMAP = "__OBJOMAP__"; -const string KeyValueStore::OBJECT_OMAP_HEADER = "__OBJOMAP_HEADER__"; -const string KeyValueStore::OBJECT_OMAP_HEADER_KEY = "__OBJOMAP_HEADER__KEY_"; -const string KeyValueStore::COLLECTION = "__COLLECTION__"; -const string KeyValueStore::COLLECTION_ATTR = "__COLL_ATTR__"; - - -//Initial features in new superblock. -static CompatSet get_kv_initial_compat_set() { - CompatSet::FeatureSet ceph_osd_feature_compat; - CompatSet::FeatureSet ceph_osd_feature_ro_compat; - CompatSet::FeatureSet ceph_osd_feature_incompat; - return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat, - ceph_osd_feature_incompat); -} - -//Features are added here that this KeyValueStore supports. -static CompatSet get_kv_supported_compat_set() { - CompatSet compat = get_kv_initial_compat_set(); - //Any features here can be set in code, but not in initial superblock - return compat; -} - - -// ============== StripObjectMap Implementation ================= - -int StripObjectMap::save_strip_header(StripObjectHeaderRef strip_header, - KeyValueDB::Transaction t) -{ - if (strip_header->updated) { - strip_header->header->data.clear(); - ::encode(*strip_header, strip_header->header->data); - - set_header(strip_header->cid, strip_header->oid, *(strip_header->header), t); - strip_header->updated = false; - } - return 0; -} - -int StripObjectMap::create_strip_header(const coll_t &cid, - const ghobject_t &oid, - StripObjectHeaderRef *strip_header, - KeyValueDB::Transaction t) -{ - Header header = generate_new_header(cid, oid, Header(), t); - if (!header) - return -EINVAL; - - StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader()); - tmp->oid = oid; - tmp->cid = cid; - tmp->header = header; - tmp->updated = true; - if (strip_header) - *strip_header = tmp; - - return 0; -} - -int StripObjectMap::lookup_strip_header(const coll_t &cid, - const ghobject_t &oid, - StripObjectHeaderRef *strip_header) -{ - { - Mutex::Locker l(lock); - pair p; - if (caches.lookup(oid, &p)) { - if (p.first == cid) { - *strip_header = p.second; - return 0; - } - } - } - Header header = lookup_header(cid, oid); - - if (!header) { - dout(20) << "lookup_strip_header failed to get strip_header " - << " cid " << cid <<" oid " << oid << dendl; - return -ENOENT; - } - - - StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader()); - if (header->data.length()) { - bufferlist::iterator bliter = header->data.begin(); - ::decode(*tmp, bliter); - } - - if (tmp->strip_size == 0) { - tmp->strip_size = default_strip_size; - tmp->updated = true; - } - - tmp->oid = oid; - tmp->cid = cid; - tmp->header = header; - - { - Mutex::Locker l(lock); - caches.add(oid, make_pair(cid, tmp)); - } - *strip_header = tmp; - dout(10) << "lookup_strip_header done " << " cid " << cid << " oid " - << oid << dendl; - return 0; -} - -int StripObjectMap::file_to_extents(uint64_t offset, size_t len, - uint64_t strip_size, - vector &extents) -{ - if (len == 0) - return 0; - - uint64_t start, end, strip_offset; - start = offset / strip_size; - end = (offset + len) / strip_size; - strip_offset = start * strip_size; - - // "offset" may in the middle of first strip object - if (offset > strip_offset) { - uint64_t extent_offset, extent_len; - extent_offset = offset - strip_offset; - if (extent_offset + len <= strip_size) - extent_len = len; - else - extent_len = strip_size - extent_offset; - extents.push_back(StripExtent(start, extent_offset, extent_len)); - start++; - strip_offset += strip_size; - } - - for (; start < end; ++start) { - extents.push_back(StripExtent(start, 0, strip_size)); - strip_offset += strip_size; - } - - // The end of strip object may be partial - if (offset + len > strip_offset) - extents.push_back(StripExtent(start, 0, offset+len-strip_offset)); - - assert(extents.size()); - dout(10) << "file_to_extents done " << dendl; - return 0; -} - -void StripObjectMap::clone_wrap(StripObjectHeaderRef old_header, - const coll_t &cid, const ghobject_t &oid, - KeyValueDB::Transaction t, - StripObjectHeaderRef *target_header) -{ - Header new_origin_header; - StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader()); - - clone(old_header->header, cid, oid, t, &new_origin_header, - &tmp->header); - - tmp->oid = oid; - tmp->cid = cid; - tmp->strip_size = old_header->strip_size; - tmp->max_size = old_header->max_size; - tmp->bits = old_header->bits; - tmp->updated = true; - old_header->header = new_origin_header; - old_header->updated = true; - - if (target_header) - *target_header = tmp; -} - -void StripObjectMap::rename_wrap(StripObjectHeaderRef old_header, const coll_t &cid, const ghobject_t &oid, - KeyValueDB::Transaction t, - StripObjectHeaderRef *new_header) -{ - rename(old_header->header, cid, oid, t); - - StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader()); - tmp->strip_size = old_header->strip_size; - tmp->max_size = old_header->max_size; - tmp->bits = old_header->bits; - tmp->header = old_header->header; - tmp->oid = oid; - tmp->cid = cid; - tmp->updated = true; - - if (new_header) - *new_header = tmp; - - old_header->header = Header(); - old_header->deleted = true; -} - -int StripObjectMap::get_values_with_header(const StripObjectHeaderRef header, - const string &prefix, - const set &keys, - map *out) -{ - return scan(header->header, prefix, keys, 0, out); -} - -int StripObjectMap::get_keys_with_header(const StripObjectHeaderRef header, - const string &prefix, - set *keys) -{ - ObjectMap::ObjectMapIterator iter = _get_iterator(header->header, prefix); - for (iter->seek_to_first(); iter->valid(); iter->next()) { - assert(!iter->status()); - keys->insert(iter->key()); - } - return 0; -} - -int StripObjectMap::get_with_header(const StripObjectHeaderRef header, - const string &prefix, map *out) -{ - ObjectMap::ObjectMapIterator iter = _get_iterator(header->header, prefix); - for (iter->seek_to_first(); iter->valid(); iter->next()) { - assert(!iter->status()); - out->insert(make_pair(iter->key(), iter->value())); - } - - return 0; -} - -// ========= KeyValueStore::BufferTransaction Implementation ============ - -int KeyValueStore::BufferTransaction::lookup_cached_header( - const coll_t &cid, const ghobject_t &oid, - StripObjectMap::StripObjectHeaderRef *strip_header, - bool create_if_missing) -{ - uniq_id uid = make_pair(cid, oid); - StripObjectMap::StripObjectHeaderRef header; - int r = 0; - - StripHeaderMap::iterator it = strip_headers.find(uid); - if (it != strip_headers.end()) { - - if (!it->second->deleted) { - if (strip_header) - *strip_header = it->second; - return 0; - } else if (!create_if_missing) { - return -ENOENT; - } - - // If (it->second.deleted && create_if_missing) go down - r = -ENOENT; - } else { - r = store->backend->lookup_strip_header(cid, oid, &header); - } - - if (r == -ENOENT && create_if_missing) { - r = store->backend->create_strip_header(cid, oid, &header, t); - } - - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oid << " " - << " r = " << r << dendl; - return r; - } - - strip_headers[uid] = header; - if (strip_header) - *strip_header = header; - return r; -} - -int KeyValueStore::BufferTransaction::get_buffer_keys( - StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix, - const set &keys, map *out) -{ - set need_lookup; - - uniq_id uid = make_pair(strip_header->cid, strip_header->oid); - for (set::iterator it = keys.begin(); it != keys.end(); ++it) { - map< uniq_id, map, bufferlist> >::iterator obj_it = buffers.find(uid); - if ( obj_it != buffers.end() ) { - map, bufferlist>::iterator i = - obj_it->second.find(make_pair(prefix, *it)); - if (i != obj_it->second.end()) { - (*out)[*it].swap(i->second); - } else { - need_lookup.insert(*it); - } - }else { - need_lookup.insert(*it); - } - } - - if (!need_lookup.empty()) { - int r = store->backend->get_values_with_header(strip_header, prefix, - need_lookup, out); - if (r < 0) { - dout(10) << __func__ << " " << strip_header->cid << "/" - << strip_header->oid << " " << " r = " << r << dendl; - return r; - } - } - - return 0; -} - -void KeyValueStore::BufferTransaction::set_buffer_keys( - StripObjectMap::StripObjectHeaderRef strip_header, - const string &prefix, map &values) -{ - store->backend->set_keys(strip_header->header, prefix, values, t); - - uniq_id uid = make_pair(strip_header->cid, strip_header->oid); - map, bufferlist> &uid_buffers = buffers[uid]; - for (map::iterator iter = values.begin(); - iter != values.end(); ++iter) { - uid_buffers[make_pair(prefix, iter->first)].swap(iter->second); - } -} - -int KeyValueStore::BufferTransaction::remove_buffer_keys( - StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix, - const set &keys) -{ - uniq_id uid = make_pair(strip_header->cid, strip_header->oid); - map< uniq_id, map, bufferlist> >::iterator obj_it = buffers.find(uid); - set buffered_keys; - if ( obj_it != buffers.end() ) { - // TODO: Avoid use empty bufferlist to indicate the key is removed - for (set::iterator iter = keys.begin(); iter != keys.end(); ++iter) { - obj_it->second[make_pair(prefix, *iter)] = bufferlist(); - } - // TODO: Avoid collect all buffered keys when remove keys - if (strip_header->header->parent) { - for (map, bufferlist>::iterator iter = obj_it->second.begin(); - iter != obj_it->second.end(); ++iter) { - buffered_keys.insert(iter->first.second); - } - } - } - - return store->backend->rm_keys(strip_header->header, prefix, buffered_keys, keys, t); -} - -void KeyValueStore::BufferTransaction::clear_buffer_keys( - StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix) -{ - uniq_id uid = make_pair(strip_header->cid, strip_header->oid); - map< uniq_id, map, bufferlist> >::iterator obj_it = buffers.find(uid); - if ( obj_it != buffers.end() ) { - for (map, bufferlist>::iterator iter = obj_it->second.begin(); - iter != obj_it->second.end(); ++iter) { - if (iter->first.first == prefix) - iter->second = bufferlist(); - } - } -} - -int KeyValueStore::BufferTransaction::clear_buffer( - StripObjectMap::StripObjectHeaderRef strip_header) -{ - strip_header->deleted = true; - - InvalidateCacheContext *c = new InvalidateCacheContext(store, strip_header->cid, strip_header->oid); - finishes.push_back(c); - return store->backend->clear(strip_header->header, t); -} - -void KeyValueStore::BufferTransaction::clone_buffer( - StripObjectMap::StripObjectHeaderRef old_header, - const coll_t &cid, const ghobject_t &oid) -{ - // Remove target ahead to avoid dead lock - strip_headers.erase(make_pair(cid, oid)); - - StripObjectMap::StripObjectHeaderRef new_target_header; - - store->backend->clone_wrap(old_header, cid, oid, t, &new_target_header); - - // FIXME: Lacking of lock for origin header(now become parent), it will - // cause other operation can get the origin header while submitting - // transactions - strip_headers[make_pair(cid, oid)] = new_target_header; -} - -void KeyValueStore::BufferTransaction::rename_buffer( - StripObjectMap::StripObjectHeaderRef old_header, - const coll_t &cid, const ghobject_t &oid) -{ - // FIXME: Lacking of lock for origin header, it will cause other operation - // can get the origin header while submitting transactions - StripObjectMap::StripObjectHeaderRef new_header; - store->backend->rename_wrap(old_header, cid, oid, t, &new_header); - - InvalidateCacheContext *c = new InvalidateCacheContext(store, old_header->cid, old_header->oid); - finishes.push_back(c); - strip_headers[make_pair(cid, oid)] = new_header; -} - -int KeyValueStore::BufferTransaction::submit_transaction() -{ - int r = 0; - - for (StripHeaderMap::iterator header_iter = strip_headers.begin(); - header_iter != strip_headers.end(); ++header_iter) { - StripObjectMap::StripObjectHeaderRef header = header_iter->second; - - if (header->deleted) - continue; - - if (header->updated) { - r = store->backend->save_strip_header(header, t); - - if (r < 0) { - dout(10) << __func__ << " save strip header failed " << dendl; - goto out; - } - } - } - - r = store->backend->submit_transaction_sync(t); - for (list::iterator it = finishes.begin(); it != finishes.end(); ++it) { - (*it)->complete(r); - } - -out: - dout(5) << __func__ << " r = " << r << dendl; - return r; -} - -// =========== KeyValueStore Intern Helper Implementation ============== - -ostream& operator<<(ostream& out, const KeyValueStore::OpSequencer& s) -{ - assert(&out); - return out << *s.parent; -} - -int KeyValueStore::_create_current() -{ - struct stat st; - int ret = ::stat(current_fn.c_str(), &st); - if (ret == 0) { - // current/ exists - if (!S_ISDIR(st.st_mode)) { - dout(0) << "_create_current: current/ exists but is not a directory" << dendl; - ret = -EINVAL; - } - } else { - ret = ::mkdir(current_fn.c_str(), 0755); - if (ret < 0) { - ret = -errno; - dout(0) << "_create_current: mkdir " << current_fn << " failed: "<< cpp_strerror(ret) << dendl; - } - } - - return ret; -} - - - -// =========== KeyValueStore API Implementation ============== - -KeyValueStore::KeyValueStore(const std::string &base, - const char *name, bool do_update) : - ObjectStore(base), - internal_name(name), - basedir(base), - fsid_fd(-1), current_fd(-1), - backend(NULL), - ondisk_finisher(g_ceph_context), - collections_lock("KeyValueStore::collections_lock"), - lock("KeyValueStore::lock"), - throttle_ops(g_ceph_context, "keyvaluestore_ops", g_conf->keyvaluestore_queue_max_ops), - throttle_bytes(g_ceph_context, "keyvaluestore_bytes", g_conf->keyvaluestore_queue_max_bytes), - op_finisher(g_ceph_context), - op_tp(g_ceph_context, "KeyValueStore::op_tp", "tp_kvstore", - g_conf->keyvaluestore_op_threads, "keyvaluestore_op_threads"), - op_wq(this, g_conf->keyvaluestore_op_thread_timeout, - g_conf->keyvaluestore_op_thread_suicide_timeout, &op_tp), - perf_logger(NULL), - m_keyvaluestore_queue_max_ops(g_conf->keyvaluestore_queue_max_ops), - m_keyvaluestore_queue_max_bytes(g_conf->keyvaluestore_queue_max_bytes), - m_keyvaluestore_strip_size(g_conf->keyvaluestore_default_strip_size), - m_keyvaluestore_max_expected_write_size(g_conf->keyvaluestore_max_expected_write_size), - do_update(do_update), - m_keyvaluestore_do_dump(false), - m_keyvaluestore_dump_fmt(true) -{ - ostringstream oss; - oss << basedir << "/current"; - current_fn = oss.str(); - - // initialize perf_logger - PerfCountersBuilder plb(g_ceph_context, internal_name, l_os_commit_len, l_os_last); - - plb.add_u64(l_os_oq_max_ops, "op_queue_max_ops", "Max operations count in queue"); - plb.add_u64(l_os_oq_ops, "op_queue_ops", "Operations count in queue"); - plb.add_u64_counter(l_os_ops, "ops", "Operations"); - plb.add_u64(l_os_oq_max_bytes, "op_queue_max_bytes", "Max size of queue"); - plb.add_u64(l_os_oq_bytes, "op_queue_bytes", "Size of queue"); - plb.add_u64_counter(l_os_bytes, "bytes", "Data written to store"); - plb.add_time_avg(l_os_commit_lat, "commit_latency", "Commit latency"); - plb.add_time_avg(l_os_apply_lat, "apply_latency", "Apply latency"); - plb.add_time_avg(l_os_queue_lat, "queue_transaction_latency_avg", "Store operation queue latency"); - - perf_logger = plb.create_perf_counters(); - - g_ceph_context->get_perfcounters_collection()->add(perf_logger); - g_ceph_context->_conf->add_observer(this); - - superblock.compat_features = get_kv_initial_compat_set(); -} - -KeyValueStore::~KeyValueStore() -{ - g_ceph_context->_conf->remove_observer(this); - g_ceph_context->get_perfcounters_collection()->remove(perf_logger); - - delete perf_logger; - - if (m_keyvaluestore_do_dump) { - dump_stop(); - } -} - -int KeyValueStore::statfs(struct statfs *buf) -{ - int r = backend->db->get_statfs(buf); - if (r < 0) { - if (::statfs(basedir.c_str(), buf) < 0) { - int r = -errno; - return r; - } - } - return 0; -} - -void KeyValueStore::collect_metadata(map *pm) -{ - (*pm)["keyvaluestore_backend"] = superblock.backend; -} - -int KeyValueStore::mkfs() -{ - int ret = 0; - char fsid_fn[PATH_MAX]; - uuid_d old_fsid; - - dout(1) << "mkfs in " << basedir << dendl; - - // open+lock fsid - snprintf(fsid_fn, sizeof(fsid_fn), "%s/fsid", basedir.c_str()); - fsid_fd = ::open(fsid_fn, O_RDWR|O_CREAT, 0644); - if (fsid_fd < 0) { - ret = -errno; - derr << "mkfs: failed to open " << fsid_fn << ": " << cpp_strerror(ret) << dendl; - return ret; - } - - if (lock_fsid() < 0) { - ret = -EBUSY; - goto close_fsid_fd; - } - - if (read_fsid(fsid_fd, &old_fsid) < 0 || old_fsid.is_zero()) { - if (fsid.is_zero()) { - fsid.generate_random(); - dout(1) << "mkfs generated fsid " << fsid << dendl; - } else { - dout(1) << "mkfs using provided fsid " << fsid << dendl; - } - - char fsid_str[40]; - fsid.print(fsid_str); - strcat(fsid_str, "\n"); - ret = ::ftruncate(fsid_fd, 0); - if (ret < 0) { - ret = -errno; - derr << "mkfs: failed to truncate fsid: " << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } - ret = safe_write(fsid_fd, fsid_str, strlen(fsid_str)); - if (ret < 0) { - derr << "mkfs: failed to write fsid: " << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } - if (::fsync(fsid_fd) < 0) { - ret = -errno; - derr << "mkfs: close failed: can't write fsid: " - << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } - dout(10) << "mkfs fsid is " << fsid << dendl; - } else { - if (!fsid.is_zero() && fsid != old_fsid) { - derr << "mkfs on-disk fsid " << old_fsid << " != provided " << fsid << dendl; - ret = -EINVAL; - goto close_fsid_fd; - } - fsid = old_fsid; - dout(1) << "mkfs fsid is already set to " << fsid << dendl; - } - - // version stamp - ret = write_version_stamp(); - if (ret < 0) { - derr << "mkfs: write_version_stamp() failed: " - << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } - - ret = _create_current(); - if (ret < 0) { - derr << "mkfs: failed to create current/ " << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } - - // superblock - superblock.backend = g_conf->keyvaluestore_backend; - ret = write_superblock(); - if (ret < 0) { - derr << "KeyValueStore::mkfs write_superblock() failed: " - << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } - - { - KeyValueDB *store = KeyValueDB::create(g_ceph_context, - superblock.backend, - current_fn.c_str()); - if (!store) { - derr << __func__ << " failed to create backend type " - << g_conf->keyvaluestore_backend << "." << dendl; - ret = -1; - goto close_fsid_fd; - } - - ostringstream err; - if (store->create_and_open(err)) { - derr << __func__ << " failed to create/open backend type " - << g_conf->keyvaluestore_backend << "." << dendl; - ret = -1; - delete store; - goto close_fsid_fd; - } - - bufferlist bl; - ::encode(collections, bl); - KeyValueDB::Transaction t = store->get_transaction(); - t->set("meta", "collections", bl); - store->submit_transaction_sync(t); - - dout(1) << g_conf->keyvaluestore_backend << " backend exists/created" << dendl; - delete store; - } - - ret = write_meta("type", "keyvaluestore"); - if (ret < 0) - goto close_fsid_fd; - - dout(1) << "mkfs done in " << basedir << dendl; - ret = 0; - - close_fsid_fd: - VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); - fsid_fd = -1; - return ret; -} - -int KeyValueStore::read_fsid(int fd, uuid_d *uuid) -{ - char fsid_str[40]; - memset(fsid_str, 0, sizeof(fsid_str)); - int ret = safe_read(fd, fsid_str, sizeof(fsid_str)); - if (ret < 0) - return ret; - if (ret == 8) { - // old 64-bit fsid... mirror it. - *(uint64_t*)&uuid->bytes()[0] = *(uint64_t*)fsid_str; - *(uint64_t*)&uuid->bytes()[8] = *(uint64_t*)fsid_str; - return 0; - } - - if (ret > 36) - fsid_str[36] = 0; - if (!uuid->parse(fsid_str)) - return -EINVAL; - return 0; -} - -int KeyValueStore::lock_fsid() -{ - struct flock l; - memset(&l, 0, sizeof(l)); - l.l_type = F_WRLCK; - l.l_whence = SEEK_SET; - l.l_start = 0; - l.l_len = 0; - int r = ::fcntl(fsid_fd, F_SETLK, &l); - if (r < 0) { - int err = errno; - dout(0) << "lock_fsid failed to lock " << basedir - << "/fsid, is another ceph-osd still running? " - << cpp_strerror(err) << dendl; - return -err; - } - return 0; -} - -bool KeyValueStore::test_mount_in_use() -{ - dout(5) << "test_mount basedir " << basedir << dendl; - char fn[PATH_MAX]; - snprintf(fn, sizeof(fn), "%s/fsid", basedir.c_str()); - - // verify fs isn't in use - - fsid_fd = ::open(fn, O_RDWR, 0644); - if (fsid_fd < 0) - return 0; // no fsid, ok. - bool inuse = lock_fsid() < 0; - VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); - fsid_fd = -1; - return inuse; -} - -int KeyValueStore::write_superblock() -{ - bufferlist bl; - ::encode(superblock, bl); - return safe_write_file(basedir.c_str(), "superblock", - bl.c_str(), bl.length()); -} - -int KeyValueStore::read_superblock() -{ - bufferptr bp(PATH_MAX); - int ret = safe_read_file(basedir.c_str(), "superblock", - bp.c_str(), bp.length()); - if (ret < 0) { - if (ret == -ENOENT) { - // If the file doesn't exist write initial CompatSet - return write_superblock(); - } - return ret; - } - - bufferlist bl; - bl.push_back(bp); - bufferlist::iterator i = bl.begin(); - ::decode(superblock, i); - return 0; -} - - - -int KeyValueStore::update_version_stamp() -{ - return write_version_stamp(); -} - -int KeyValueStore::version_stamp_is_valid(uint32_t *version) -{ - bufferptr bp(PATH_MAX); - int ret = safe_read_file(basedir.c_str(), "store_version", - bp.c_str(), bp.length()); - if (ret < 0) { - if (ret == -ENOENT) - return 0; - return ret; - } - bufferlist bl; - bl.push_back(bp); - bufferlist::iterator i = bl.begin(); - ::decode(*version, i); - if (*version == target_version) - return 1; - else - return 0; -} - -int KeyValueStore::write_version_stamp() -{ - bufferlist bl; - ::encode(target_version, bl); - - return safe_write_file(basedir.c_str(), "store_version", - bl.c_str(), bl.length()); -} - -int KeyValueStore::mount() -{ - int ret; - char buf[PATH_MAX]; - CompatSet supported_compat_set = get_kv_supported_compat_set(); - - dout(5) << "basedir " << basedir << dendl; - - // make sure global base dir exists - if (::access(basedir.c_str(), R_OK | W_OK)) { - ret = -errno; - derr << "KeyValueStore::mount: unable to access basedir '" << basedir - << "': " << cpp_strerror(ret) << dendl; - goto done; - } - - // get fsid - snprintf(buf, sizeof(buf), "%s/fsid", basedir.c_str()); - fsid_fd = ::open(buf, O_RDWR, 0644); - if (fsid_fd < 0) { - ret = -errno; - derr << "KeyValueStore::mount: error opening '" << buf << "': " - << cpp_strerror(ret) << dendl; - goto done; - } - - ret = read_fsid(fsid_fd, &fsid); - if (ret < 0) { - derr << "KeyValueStore::mount: error reading fsid_fd: " - << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } - - if (lock_fsid() < 0) { - derr << "KeyValueStore::mount: lock_fsid failed" << dendl; - ret = -EBUSY; - goto close_fsid_fd; - } - - dout(10) << "mount fsid is " << fsid << dendl; - - uint32_t version_stamp; - ret = version_stamp_is_valid(&version_stamp); - if (ret < 0) { - derr << "KeyValueStore::mount : error in version_stamp_is_valid: " - << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } else if (ret == 0) { - if (do_update) { - derr << "KeyValueStore::mount : stale version stamp detected: " - << version_stamp << ". Proceeding, do_update " - << "is set, performing disk format upgrade." << dendl; - } else { - ret = -EINVAL; - derr << "KeyValueStore::mount : stale version stamp " << version_stamp - << ". Please run the KeyValueStore update script before starting " - << "the OSD, or set keyvaluestore_update_to to " << target_version - << dendl; - goto close_fsid_fd; - } - } - - superblock.backend = g_conf->keyvaluestore_backend; - ret = read_superblock(); - if (ret < 0) { - ret = -EINVAL; - goto close_fsid_fd; - } - - // Check if this KeyValueStore supports all the necessary features to mount - if (supported_compat_set.compare(superblock.compat_features) == -1) { - derr << "KeyValueStore::mount : Incompatible features set " - << superblock.compat_features << dendl; - ret = -EINVAL; - goto close_fsid_fd; - } - - current_fd = ::open(current_fn.c_str(), O_RDONLY); - if (current_fd < 0) { - ret = -errno; - derr << "KeyValueStore::mount: error opening: " << current_fn << ": " - << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } - - assert(current_fd >= 0); - - { - if (superblock.backend.empty()) - superblock.backend = g_conf->keyvaluestore_backend; - KeyValueDB *store = KeyValueDB::create(g_ceph_context, - superblock.backend, - current_fn.c_str()); - if(!store) - { - derr << "KeyValueStore::mount backend type " - << superblock.backend << " error" << dendl; - ret = -1; - goto close_current_fd; - - } - - if (superblock.backend == "rocksdb") - store->init(g_conf->keyvaluestore_rocksdb_options); - else - store->init(); - stringstream err; - if (store->open(err)) { - derr << "KeyValueStore::mount Error initializing keyvaluestore backend " - << superblock.backend << ": " << err.str() << dendl; - ret = -1; - delete store; - goto close_current_fd; - } - - // get collection list - set keys; - keys.insert("collections"); - map values; - store->get("meta", keys, &values); - if (values.empty()) { - ret = -EIO; - derr << "Error no collection list; old store?" << dendl; - goto close_current_fd; - } - bufferlist::iterator p = values["collections"].begin(); - ::decode(collections, p); - dout(20) << "collections: " << collections << dendl; - - StripObjectMap *dbomap = new StripObjectMap(store); - ret = dbomap->init(do_update); - if (ret < 0) { - delete dbomap; - derr << "Error initializing StripObjectMap: " << ret << dendl; - goto close_current_fd; - } - stringstream err2; - - if (g_conf->keyvaluestore_debug_check_backend && !dbomap->check(err2)) { - derr << err2.str() << dendl; - delete dbomap; - ret = -EINVAL; - goto close_current_fd; - } - - default_strip_size = m_keyvaluestore_strip_size; - backend.reset(dbomap); - } - - op_tp.start(); - op_finisher.start(); - ondisk_finisher.start(); - - // all okay. - return 0; - -close_current_fd: - VOID_TEMP_FAILURE_RETRY(::close(current_fd)); - current_fd = -1; -close_fsid_fd: - VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); - fsid_fd = -1; -done: - return ret; -} - -int KeyValueStore::umount() -{ - dout(5) << "umount " << basedir << dendl; - - op_tp.stop(); - op_finisher.stop(); - ondisk_finisher.stop(); - - if (fsid_fd >= 0) { - VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); - fsid_fd = -1; - } - if (current_fd >= 0) { - VOID_TEMP_FAILURE_RETRY(::close(current_fd)); - current_fd = -1; - } - - backend.reset(); - - // nothing - return 0; -} - -int KeyValueStore::queue_transactions(Sequencer *posr, list &tls, - TrackedOpRef osd_op, - ThreadPool::TPHandle *handle) -{ - utime_t start = ceph_clock_now(g_ceph_context); - Context *onreadable; - Context *ondisk; - Context *onreadable_sync; - ObjectStore::Transaction::collect_contexts( - tls, &onreadable, &ondisk, &onreadable_sync); - - // set up the sequencer - OpSequencer *osr; - assert(posr); - if (posr->p) { - osr = static_cast(posr->p.get()); - dout(5) << "queue_transactions existing " << osr << " " << *osr << "/" << osr->parent - << dendl; //<< " w/ q " << osr->q << dendl; - } else { - osr = new OpSequencer; - osr->parent = posr; - posr->p = osr; - dout(5) << "queue_transactions new " << osr << " " << *osr << "/" << osr->parent << dendl; - } - - Op *o = build_op(tls, ondisk, onreadable, onreadable_sync, osd_op); - op_queue_reserve_throttle(o, handle); - if (m_keyvaluestore_do_dump) - dump_transactions(o->tls, o->op, osr); - dout(5) << "queue_transactions (trailing journal) " << " " << tls <tinc(l_os_queue_lat, end - start); - return 0; -} - - -// ============== KeyValueStore Op Handler ================= - -KeyValueStore::Op *KeyValueStore::build_op(list& tls, - Context *ondisk, Context *onreadable, Context *onreadable_sync, - TrackedOpRef osd_op) -{ - uint64_t bytes = 0, ops = 0; - for (list::iterator p = tls.begin(); - p != tls.end(); - ++p) { - bytes += (*p)->get_num_bytes(); - ops += (*p)->get_num_ops(); - } - - Op *o = new Op; - o->start = ceph_clock_now(g_ceph_context); - o->tls.swap(tls); - o->ondisk = ondisk; - o->onreadable = onreadable; - o->onreadable_sync = onreadable_sync; - o->ops = ops; - o->bytes = bytes; - o->osd_op = osd_op; - return o; -} - -void KeyValueStore::queue_op(OpSequencer *osr, Op *o) -{ - // queue op on sequencer, then queue sequencer for the threadpool, - // so that regardless of which order the threads pick up the - // sequencer, the op order will be preserved. - - osr->queue(o); - - perf_logger->inc(l_os_ops); - perf_logger->inc(l_os_bytes, o->bytes); - - dout(5) << "queue_op " << o << " seq " << o->op << " " << *osr << " " - << o->bytes << " bytes" << " (queue has " << throttle_ops.get_current() - << " ops and " << throttle_bytes.get_current() << " bytes)" << dendl; - op_wq.queue(osr); -} - -void KeyValueStore::op_queue_reserve_throttle(Op *o, ThreadPool::TPHandle *handle) -{ - uint64_t max_ops = m_keyvaluestore_queue_max_ops; - uint64_t max_bytes = m_keyvaluestore_queue_max_bytes; - - perf_logger->set(l_os_oq_max_ops, max_ops); - perf_logger->set(l_os_oq_max_bytes, max_bytes); - - if (handle) - handle->suspend_tp_timeout(); - if (throttle_ops.should_wait(1) || - (throttle_bytes.get_current() // let single large ops through! - && throttle_bytes.should_wait(o->bytes))) { - dout(2) << "waiting " << throttle_ops.get_current() + 1 << " > " << max_ops << " ops || " - << throttle_bytes.get_current() + o->bytes << " > " << max_bytes << dendl; - } - throttle_ops.get(); - throttle_bytes.get(o->bytes); - if (handle) - handle->reset_tp_timeout(); - - perf_logger->set(l_os_oq_ops, throttle_ops.get_current()); - perf_logger->set(l_os_oq_bytes, throttle_bytes.get_current()); -} - -void KeyValueStore::op_queue_release_throttle(Op *o) -{ - throttle_ops.put(); - throttle_bytes.put(o->bytes); - perf_logger->set(l_os_oq_ops, throttle_ops.get_current()); - perf_logger->set(l_os_oq_bytes, throttle_bytes.get_current()); -} - -void KeyValueStore::_do_op(OpSequencer *osr, ThreadPool::TPHandle &handle) -{ - // FIXME: Suppose the collection of transaction only affect objects in the - // one PG, so this lock will ensure no other concurrent write operation - osr->apply_lock.Lock(); - Op *o = osr->peek_queue(); - dout(5) << "_do_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << " start" << dendl; - int r = _do_transactions(o->tls, o->op, &handle); - dout(10) << "_do_op " << o << " seq " << o->op << " r = " << r - << ", finisher " << o->onreadable << " " << o->onreadable_sync << dendl; - - if (o->ondisk) { - if (r < 0) { - delete o->ondisk; - o->ondisk = 0; - } else { - ondisk_finisher.queue(o->ondisk, r); - } - } -} - -void KeyValueStore::_finish_op(OpSequencer *osr) -{ - list to_queue; - Op *o = osr->dequeue(&to_queue); - - utime_t lat = ceph_clock_now(g_ceph_context); - lat -= o->start; - - dout(10) << "_finish_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << " lat " << lat << dendl; - osr->apply_lock.Unlock(); // locked in _do_op - op_queue_release_throttle(o); - - perf_logger->tinc(l_os_commit_lat, lat); - perf_logger->tinc(l_os_apply_lat, lat); - - if (o->onreadable_sync) { - o->onreadable_sync->complete(0); - } - if (o->onreadable) - op_finisher.queue(o->onreadable); - if (!to_queue.empty()) - op_finisher.queue(to_queue); - delete o; -} - -// Combine all the ops in the same transaction using "BufferTransaction" and -// cache the middle results in order to make visible to the following ops. -// -// Lock: KeyValueStore use "in_use" in GenericObjectMap to avoid concurrent -// operation on the same object. Not sure ReadWrite lock should be applied to -// improve concurrent performance. In the future, I'd like to remove apply_lock -// on "osr" and introduce PG RWLock. -int KeyValueStore::_do_transactions(list &tls, uint64_t op_seq, - ThreadPool::TPHandle *handle) -{ - int r = 0; - int trans_num = 0; - BufferTransaction bt(this); - - for (list::iterator p = tls.begin(); - p != tls.end(); - ++p, trans_num++) { - _do_transaction(**p, bt, handle); - if (handle) - handle->reset_tp_timeout(); - } - - r = bt.submit_transaction(); - if (r < 0) { - assert(0 == "unexpected error"); // FIXME - } - - return r; -} - -void KeyValueStore::_do_transaction(Transaction& transaction, - BufferTransaction &t, - ThreadPool::TPHandle *handle) -{ - dout(10) << "_do_transaction on " << &transaction << dendl; - - Transaction::iterator i = transaction.begin(); - uint64_t op_num = 0; - bool exist_clone = false; - - while (i.have_op()) { - if (handle) - handle->reset_tp_timeout(); - - Transaction::Op *op = i.decode_op(); - int r = 0; - - switch (op->op) { - case Transaction::OP_NOP: - break; - - case Transaction::OP_TOUCH: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - r = _touch(cid, oid, t); - } - break; - - case Transaction::OP_WRITE: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - uint64_t off = op->off; - uint64_t len = op->len; - uint32_t fadvise_flags = i.get_fadvise_flags(); - bufferlist bl; - i.decode_bl(bl); - r = _write(cid, oid, off, len, bl, t, fadvise_flags); - } - break; - - case Transaction::OP_ZERO: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - uint64_t off = op->off; - uint64_t len = op->len; - r = _zero(cid, oid, off, len, t); - } - break; - - case Transaction::OP_TRIMCACHE: - { - // deprecated, no-op - } - break; - - case Transaction::OP_TRUNCATE: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - uint64_t off = op->off; - r = _truncate(cid, oid, off, t); - } - break; - - case Transaction::OP_REMOVE: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - r = _remove(cid, oid, t); - } - break; - - case Transaction::OP_SETATTR: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - string name = i.decode_string(); - bufferlist bl; - i.decode_bl(bl); - map to_set; - to_set[name] = bufferptr(bl.c_str(), bl.length()); - r = _setattrs(cid, oid, to_set, t); - if (r == -ENOSPC) - dout(0) << " ENOSPC on setxattr on " << cid << "/" << oid - << " name " << name << " size " << bl.length() << dendl; - } - break; - - case Transaction::OP_SETATTRS: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - map aset; - i.decode_attrset(aset); - r = _setattrs(cid, oid, aset, t); - if (r == -ENOSPC) - dout(0) << " ENOSPC on setxattrs on " << cid << "/" << oid << dendl; - } - break; - - case Transaction::OP_RMATTR: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - string name = i.decode_string(); - r = _rmattr(cid, oid, name.c_str(), t); - } - break; - - case Transaction::OP_RMATTRS: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - r = _rmattrs(cid, oid, t); - } - break; - - case Transaction::OP_CLONE: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - ghobject_t noid = i.get_oid(op->dest_oid); - exist_clone = true; - r = _clone(cid, oid, noid, t); - } - break; - - case Transaction::OP_CLONERANGE: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - ghobject_t noid = i.get_oid(op->dest_oid); - uint64_t off = op->off; - uint64_t len = op->len; - exist_clone = true; - r = _clone_range(cid, oid, noid, off, len, off, t); - } - break; - - case Transaction::OP_CLONERANGE2: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - ghobject_t noid = i.get_oid(op->dest_oid); - uint64_t srcoff = op->off; - uint64_t len = op->len; - uint64_t dstoff = op->dest_off; - exist_clone = true; - r = _clone_range(cid, oid, noid, srcoff, len, dstoff, t); - } - break; - - case Transaction::OP_MKCOLL: - { - coll_t cid = i.get_cid(op->cid); - r = _create_collection(cid, t); - } - break; - - case Transaction::OP_COLL_HINT: - { - coll_t cid = i.get_cid(op->cid); - uint32_t type = op->hint_type; - bufferlist hint; - i.decode_bl(hint); - bufferlist::iterator hiter = hint.begin(); - if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) { - uint32_t pg_num; - uint64_t num_objs; - ::decode(pg_num, hiter); - ::decode(num_objs, hiter); - r = _collection_hint_expected_num_objs(cid, pg_num, num_objs); - } else { - // Ignore the hint - dout(10) << "Unrecognized collection hint type: " << type << dendl; - } - } - break; - - case Transaction::OP_RMCOLL: - { - coll_t cid = i.get_cid(op->cid); - r = _destroy_collection(cid, t); - } - break; - - case Transaction::OP_COLL_ADD: - { - coll_t ocid = i.get_cid(op->cid); - coll_t ncid = i.get_cid(op->dest_cid); - ghobject_t oid = i.get_oid(op->oid); - r = _collection_add(ncid, ocid, oid, t); - } - break; - - case Transaction::OP_COLL_REMOVE: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - r = _remove(cid, oid, t); - } - break; - - case Transaction::OP_COLL_MOVE: - { - // WARNING: this is deprecated and buggy; only here to replay old journals. - coll_t ocid = i.get_cid(op->cid); - coll_t ncid = i.get_cid(op->dest_cid); - ghobject_t oid = i.get_oid(op->oid); - r = _collection_move_rename(ocid, oid, ncid, oid, t); - } - break; - - case Transaction::OP_COLL_MOVE_RENAME: - { - coll_t oldcid = i.get_cid(op->cid); - ghobject_t oldoid = i.get_oid(op->oid); - coll_t newcid = i.get_cid(op->dest_cid); - ghobject_t newoid = i.get_oid(op->dest_oid); - r = _collection_move_rename(oldcid, oldoid, newcid, newoid, t); - } - break; - - case Transaction::OP_COLL_SETATTR: - case Transaction::OP_COLL_RMATTR: - assert(0 == "coll attrs no longer supported"); - break; - - case Transaction::OP_STARTSYNC: - { - start_sync(); - break; - } - - case Transaction::OP_COLL_RENAME: - { - assert(0 == "not implemented"); - } - break; - - case Transaction::OP_OMAP_CLEAR: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - r = _omap_clear(cid, oid, t); - } - break; - case Transaction::OP_OMAP_SETKEYS: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - map aset; - i.decode_attrset(aset); - r = _omap_setkeys(cid, oid, aset, t); - } - break; - case Transaction::OP_OMAP_RMKEYS: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - set keys; - i.decode_keyset(keys); - r = _omap_rmkeys(cid, oid, keys, t); - } - break; - case Transaction::OP_OMAP_RMKEYRANGE: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - string first, last; - first = i.decode_string(); - last = i.decode_string(); - r = _omap_rmkeyrange(cid, oid, first, last, t); - } - break; - case Transaction::OP_OMAP_SETHEADER: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - bufferlist bl; - i.decode_bl(bl); - r = _omap_setheader(cid, oid, bl, t); - } - break; - case Transaction::OP_SPLIT_COLLECTION: - { - coll_t cid = i.get_cid(op->cid); - uint32_t bits = op->split_bits; - uint32_t rem = op->split_rem; - coll_t dest = i.get_cid(op->dest_cid); - r = _split_collection_create(cid, bits, rem, dest, t); - } - break; - case Transaction::OP_SPLIT_COLLECTION2: - { - coll_t cid = i.get_cid(op->cid); - uint32_t bits = op->split_bits; - uint32_t rem = op->split_rem; - coll_t dest = i.get_cid(op->dest_cid); - r = _split_collection(cid, bits, rem, dest, t); - } - break; - - case Transaction::OP_SETALLOCHINT: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - uint64_t expected_object_size = op->expected_object_size; - uint64_t expected_write_size = op->expected_write_size; - r = _set_alloc_hint(cid, oid, expected_object_size, - expected_write_size, t); - } - break; - - default: - derr << "bad op " << op->op << dendl; - assert(0); - } - - if (r < 0) { - bool ok = false; - - if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE || - op->op == Transaction::OP_CLONE || - op->op == Transaction::OP_CLONERANGE2 || - op->op == Transaction::OP_COLL_ADD)) - // -ENOENT is normally okay - // ...including on a replayed OP_RMCOLL with checkpoint mode - ok = true; - if (r == -ENODATA) - ok = true; - - if (!ok) { - const char *msg = "unexpected error code"; - - if (exist_clone) { - dout(0) << "BUG: clone failed will lead to paritial transaction applied" << dendl; - } - - if (r == -ENOSPC) - // For now, if we hit _any_ ENOSPC, crash, before we do any damage - // by partially applying transactions. - msg = "ENOSPC handling not implemented"; - - if (r == -ENOTEMPTY) { - msg = "ENOTEMPTY suggests garbage data in osd data dir"; - } - - dout(0) << " error " << cpp_strerror(r) << " not handled on operation " - << op->op << " op " << op_num << ", counting from 0)" << dendl; - dout(0) << msg << dendl; - dout(0) << " transaction dump:\n"; - JSONFormatter f(true); - f.open_object_section("transaction"); - transaction.dump(&f); - f.close_section(); - f.flush(*_dout); - *_dout << dendl; - - if (r == -EMFILE) { - dump_open_fds(g_ceph_context); - } - - assert(0 == "unexpected error"); - } - } - - op_num++; - } -} - - -// =========== KeyValueStore Op Implementation ============== -// objects - -bool KeyValueStore::exists(coll_t cid, const ghobject_t& oid) -{ - dout(10) << __func__ << "collection: " << cid << " object: " << oid - << dendl; - int r; - StripObjectMap::StripObjectHeaderRef header; - - r = backend->lookup_strip_header(cid, oid, &header); - if (r < 0) { - return false; - } - - return true; -} - -int KeyValueStore::stat(coll_t cid, const ghobject_t& oid, - struct stat *st, bool allow_eio) -{ - dout(10) << "stat " << cid << "/" << oid << dendl; - - StripObjectMap::StripObjectHeaderRef header; - - int r = backend->lookup_strip_header(cid, oid, &header); - if (r < 0) { - dout(10) << "stat " << cid << "/" << oid << "=" << r << dendl; - return -ENOENT; - } - - st->st_blocks = header->max_size / header->strip_size; - if (header->max_size % header->strip_size) - st->st_blocks++; - st->st_nlink = 1; - st->st_size = header->max_size; - st->st_blksize = header->strip_size; - - return r; -} - -int KeyValueStore::_generic_read(StripObjectMap::StripObjectHeaderRef header, - uint64_t offset, size_t len, bufferlist& bl, - bool allow_eio, BufferTransaction *bt) -{ - if (header->max_size < offset) { - dout(10) << __func__ << " " << header->cid << "/" << header->oid << ")" - << " offset exceed the length of bl"<< dendl; - return 0; - } - - if (len == 0) - len = header->max_size - offset; - - if (offset + len > header->max_size) - len = header->max_size - offset; - - vector extents; - StripObjectMap::file_to_extents(offset, len, header->strip_size, - extents); - map out; - set keys; - pair uid = make_pair(header->cid, header->oid); - - for (vector::iterator iter = extents.begin(); - iter != extents.end(); ++iter) { - bufferlist old; - string key = strip_object_key(iter->no); - - map< pair, map, bufferlist> >::iterator obj_it; - if ( bt ){ - obj_it = bt->buffers.find(uid); - } - if ( bt && obj_it != bt->buffers.end() && obj_it->second.count(make_pair(OBJECT_STRIP_PREFIX, key))) { - // use strip_header buffer - assert(header->bits[iter->no]); - out[key] = obj_it->second[make_pair(OBJECT_STRIP_PREFIX, key)]; - }else if (header->bits[iter->no]) { - keys.insert(key); - } - } - - - int r = backend->get_values_with_header(header, OBJECT_STRIP_PREFIX, keys, &out); - r = check_get_rc(header->cid, header->oid, r, out.size() == keys.size()); - if (r < 0) - return r; - - for (vector::iterator iter = extents.begin(); - iter != extents.end(); ++iter) { - string key = strip_object_key(iter->no); - - if (header->bits[iter->no]) { - if (iter->len == header->strip_size) { - bl.claim_append(out[key]); - } else { - out[key].copy(iter->offset, iter->len, bl); - } - } else { - bl.append_zero(iter->len); - } - } - - dout(10) << __func__ << " " << header->cid << "/" << header->oid << " " - << offset << "~" << bl.length() << "/" << len << " r = " << r - << dendl; - - return bl.length(); -} - - -int KeyValueStore::read(coll_t cid, const ghobject_t& oid, uint64_t offset, - size_t len, bufferlist& bl, uint32_t op_flags, - bool allow_eio) -{ - dout(15) << __func__ << " " << cid << "/" << oid << " " << offset << "~" - << len << dendl; - - StripObjectMap::StripObjectHeaderRef header; - - int r = backend->lookup_strip_header(cid, oid, &header); - - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oid << " " << offset << "~" - << len << " header isn't exist: r = " << r << dendl; - return r; - } - - return _generic_read(header, offset, len, bl, allow_eio); -} - -int KeyValueStore::fiemap(coll_t cid, const ghobject_t& oid, - uint64_t offset, size_t len, bufferlist& bl) -{ - dout(10) << __func__ << " " << cid << " " << oid << " " << offset << "~" - << len << dendl; - int r; - StripObjectMap::StripObjectHeaderRef header; - - r = backend->lookup_strip_header(cid, oid, &header); - if (r < 0) { - dout(10) << "fiemap " << cid << "/" << oid << " " << offset << "~" << len - << " failed to get header: r = " << r << dendl; - return r; - } - - vector extents; - StripObjectMap::file_to_extents(offset, len, header->strip_size, - extents); - - map m; - for (vector::iterator iter = extents.begin(); - iter != extents.end(); ++iter) { - if (header->bits[iter->no]) { - uint64_t off = iter->no * header->strip_size + iter->offset; - m[off] = iter->len; - } - } - ::encode(m, bl); - return 0; -} - -int KeyValueStore::_remove(coll_t cid, const ghobject_t& oid, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << oid << dendl; - - int r; - StripObjectMap::StripObjectHeaderRef header; - - r = t.lookup_cached_header(cid, oid, &header, false); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oid << " " - << " failed to get header: r = " << r << dendl; - return r; - } - - header->max_size = 0; - header->bits.clear(); - header->updated = true; - r = t.clear_buffer(header); - - dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl; - return r; -} - -int KeyValueStore::_truncate(coll_t cid, const ghobject_t& oid, uint64_t size, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << oid << " size " << size - << dendl; - - int r; - StripObjectMap::StripObjectHeaderRef header; - - r = t.lookup_cached_header(cid, oid, &header, false); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oid << " " << size - << " failed to get header: r = " << r << dendl; - return r; - } - - if (header->max_size == size) - return 0; - - if (header->max_size > size) { - vector extents; - StripObjectMap::file_to_extents(size, header->max_size-size, - header->strip_size, extents); - assert(extents.size()); - - vector::iterator iter = extents.begin(); - if (header->bits[iter->no] && iter->offset != 0) { - bufferlist value; - map values; - set lookup_keys; - string key = strip_object_key(iter->no); - - lookup_keys.insert(key); - r = t.get_buffer_keys(header, OBJECT_STRIP_PREFIX, - lookup_keys, &values); - r = check_get_rc(cid, oid, r, lookup_keys.size() == values.size()); - if (r < 0) - return r; - - values[key].copy(0, iter->offset, value); - value.append_zero(header->strip_size-iter->offset); - assert(value.length() == header->strip_size); - value.swap(values[key]); - - t.set_buffer_keys(header, OBJECT_STRIP_PREFIX, values); - ++iter; - } - - set keys; - for (; iter != extents.end(); ++iter) { - if (header->bits[iter->no]) { - keys.insert(strip_object_key(iter->no)); - header->bits[iter->no] = 0; - } - } - r = t.remove_buffer_keys(header, OBJECT_STRIP_PREFIX, keys); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oid << " " - << size << " = " << r << dendl; - return r; - } - } - - header->bits.resize(size/header->strip_size+1); - header->max_size = size; - header->updated = true; - - dout(10) << __func__ << " " << cid << "/" << oid << " size " << size << " = " - << r << dendl; - return r; -} - -int KeyValueStore::_touch(coll_t cid, const ghobject_t& oid, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << oid << dendl; - - int r; - StripObjectMap::StripObjectHeaderRef header; - - r = t.lookup_cached_header(cid, oid, &header, true); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oid << " " - << " failed to get header: r = " << r << dendl; - r = -EINVAL; - return r; - } - - dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl; - return r; -} - -int KeyValueStore::_generic_write(StripObjectMap::StripObjectHeaderRef header, - uint64_t offset, size_t len, - const bufferlist& bl, BufferTransaction &t, - uint32_t fadvise_flags) -{ - if (len > bl.length()) - len = bl.length(); - - if (len + offset > header->max_size) { - header->max_size = len + offset; - header->bits.resize(header->max_size/header->strip_size+1); - header->updated = true; - } - - vector extents; - StripObjectMap::file_to_extents(offset, len, header->strip_size, - extents); - - map out; - set keys; - for (vector::iterator iter = extents.begin(); - iter != extents.end(); ++iter) { - if (header->bits[iter->no] && !(iter->offset == 0 && - iter->len == header->strip_size)) - keys.insert(strip_object_key(iter->no)); - } - - int r = t.get_buffer_keys(header, OBJECT_STRIP_PREFIX, keys, &out); - r = check_get_rc(header->cid, header->oid, r, keys.size() == out.size()); - if (r < 0) - return r; - - uint64_t bl_offset = 0; - map values; - for (vector::iterator iter = extents.begin(); - iter != extents.end(); ++iter) { - bufferlist value; - string key = strip_object_key(iter->no); - if (header->bits[iter->no]) { - if (iter->offset == 0 && iter->len == header->strip_size) { - bl.copy(bl_offset, iter->len, value); - bl_offset += iter->len; - } else { - assert(out[key].length() == header->strip_size); - - out[key].copy(0, iter->offset, value); - bl.copy(bl_offset, iter->len, value); - bl_offset += iter->len; - - if (value.length() != header->strip_size) - out[key].copy(value.length(), header->strip_size-value.length(), - value); - } - } else { - if (iter->offset) - value.append_zero(iter->offset); - bl.copy(bl_offset, iter->len, value); - bl_offset += iter->len; - - if (value.length() < header->strip_size) - value.append_zero(header->strip_size-value.length()); - - header->bits[iter->no] = 1; - header->updated = true; - } - assert(value.length() == header->strip_size); - values[key].swap(value); - } - assert(bl_offset == len); - - t.set_buffer_keys(header, OBJECT_STRIP_PREFIX, values); - dout(10) << __func__ << " " << header->cid << "/" << header->oid << " " - << offset << "~" << len << " = " << r << dendl; - - return r; -} - -int KeyValueStore::_write(coll_t cid, const ghobject_t& oid, - uint64_t offset, size_t len, const bufferlist& bl, - BufferTransaction &t, uint32_t fadvise_flags) -{ - dout(15) << __func__ << " " << cid << "/" << oid << " " << offset << "~" - << len << dendl; - - int r; - StripObjectMap::StripObjectHeaderRef header; - - r = t.lookup_cached_header(cid, oid, &header, true); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oid << " " << offset - << "~" << len << " failed to get header: r = " << r << dendl; - return r; - } - - return _generic_write(header, offset, len, bl, t, fadvise_flags); -} - -int KeyValueStore::_zero(coll_t cid, const ghobject_t& oid, uint64_t offset, - size_t len, BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << oid << " " << offset << "~" << len << dendl; - int r; - StripObjectMap::StripObjectHeaderRef header; - - r = t.lookup_cached_header(cid, oid, &header, true); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oid << " " << offset - << "~" << len << " failed to get header: r = " << r << dendl; - return r; - } - - if (len + offset > header->max_size) { - header->max_size = len + offset; - header->bits.resize(header->max_size/header->strip_size+1); - header->updated = true; - } - - vector extents; - StripObjectMap::file_to_extents(offset, len, header->strip_size, - extents); - set rm_keys; - set lookup_keys; - map values; - map > off_len; - for (vector::iterator iter = extents.begin(); - iter != extents.end(); ++iter) { - string key = strip_object_key(iter->no); - if (header->bits[iter->no]) { - if (iter->offset == 0 && iter->len == header->strip_size) { - rm_keys.insert(key); - header->bits[iter->no] = 0; - header->updated = true; - } else { - lookup_keys.insert(key); - off_len[key] = make_pair(iter->offset, iter->len); - } - } - } - r = t.get_buffer_keys(header, OBJECT_STRIP_PREFIX, - lookup_keys, &values); - r = check_get_rc(header->cid, header->oid, r, lookup_keys.size() == values.size()); - if (r < 0) - return r; - for(set::iterator it = lookup_keys.begin(); it != lookup_keys.end(); ++it) - { - pair p = off_len[*it]; - values[*it].zero(p.first, p.second); - } - t.set_buffer_keys(header, OBJECT_STRIP_PREFIX, values); - t.remove_buffer_keys(header, OBJECT_STRIP_PREFIX, rm_keys); - dout(10) << __func__ << " " << cid << "/" << oid << " " << offset << "~" - << len << " = " << r << dendl; - return r; -} - -int KeyValueStore::_clone(coll_t cid, const ghobject_t& oldoid, - const ghobject_t& newoid, BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/" - << newoid << dendl; - - if (oldoid == newoid) - return 0; - - int r; - StripObjectMap::StripObjectHeaderRef old_header; - - r = t.lookup_cached_header(cid, oldoid, &old_header, false); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/" - << newoid << " = " << r << dendl; - return r; - } - - t.clone_buffer(old_header, cid, newoid); - - dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/" - << newoid << " = " << r << dendl; - return r; -} - -int KeyValueStore::_clone_range(coll_t cid, const ghobject_t& oldoid, - const ghobject_t& newoid, uint64_t srcoff, - uint64_t len, uint64_t dstoff, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/" - << newoid << " " << srcoff << "~" << len << " to " << dstoff - << dendl; - - int r; - bufferlist bl; - - StripObjectMap::StripObjectHeaderRef old_header, new_header; - - r = t.lookup_cached_header(cid, oldoid, &old_header, false); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/" - << newoid << " " << srcoff << "~" << len << " to " << dstoff - << " header isn't exist: r = " << r << dendl; - return r; - } - - r = t.lookup_cached_header(cid, newoid, &new_header, true); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/" - << newoid << " " << srcoff << "~" << len << " to " << dstoff - << " can't create header: r = " << r << dendl; - return r; - } - - r = _generic_read(old_header, srcoff, len, bl, &t); - if (r < 0) - goto out; - - r = _generic_write(new_header, dstoff, len, bl, t); - - out: - dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/" - << newoid << " " << srcoff << "~" << len << " to " << dstoff - << " = " << r << dendl; - return r; -} - -// attrs - -int KeyValueStore::getattr(coll_t cid, const ghobject_t& oid, const char *name, - bufferptr &bp) -{ - dout(15) << __func__ << " " << cid << "/" << oid << " '" << name << "'" - << dendl; - - int r; - map got; - set to_get; - StripObjectMap::StripObjectHeaderRef header; - - to_get.insert(string(name)); - - r = backend->lookup_strip_header(cid, oid, &header); - if (r < 0) { - dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl; - return r; - } - - r = backend->get_values_with_header(header, OBJECT_XATTR, to_get, &got); - if (r < 0 && r != -ENOENT) { - dout(10) << __func__ << " get_xattrs err r =" << r << dendl; - goto out; - } - if (got.empty()) { - dout(10) << __func__ << " got.size() is 0" << dendl; - return -ENODATA; - } - bp = bufferptr(got.begin()->second.c_str(), - got.begin()->second.length()); - r = 0; - - out: - dout(10) << __func__ << " " << cid << "/" << oid << " '" << name << "' = " - << r << dendl; - return r; -} - -int KeyValueStore::getattrs(coll_t cid, const ghobject_t& oid, - map& aset) -{ - map attr_aset; - int r; - StripObjectMap::StripObjectHeaderRef header; - - r = backend->lookup_strip_header(cid, oid, &header); - if (r < 0) { - dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl; - return r; - } - - r = backend->get_with_header(header, OBJECT_XATTR, &attr_aset); - if (r < 0) { - dout(10) << __func__ << " could not get attrs r = " << r << dendl; - goto out; - } - - for (map::iterator i = attr_aset.begin(); - i != attr_aset.end(); ++i) { - string key; - key = i->first; - aset.insert(make_pair(key, - bufferptr(i->second.c_str(), i->second.length()))); - } - - out: - dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl; - - return r; -} - -int KeyValueStore::_setattrs(coll_t cid, const ghobject_t& oid, - map& aset, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << oid << dendl; - - int r; - - StripObjectMap::StripObjectHeaderRef header; - map attrs; - - r = t.lookup_cached_header(cid, oid, &header, false); - if (r < 0) - goto out; - - for (map::iterator it = aset.begin(); - it != aset.end(); ++it) { - attrs[it->first].push_back(it->second); - } - - t.set_buffer_keys(header, OBJECT_XATTR, attrs); - -out: - dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl; - return r; -} - - -int KeyValueStore::_rmattr(coll_t cid, const ghobject_t& oid, const char *name, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << oid << " '" << name << "'" - << dendl; - - int r; - set to_remove; - StripObjectMap::StripObjectHeaderRef header; - - r = t.lookup_cached_header(cid, oid, &header, false); - if (r < 0) { - dout(10) << __func__ << " could not find header r = " << r - << dendl; - return r; - } - - to_remove.insert(string(name)); - r = t.remove_buffer_keys(header, OBJECT_XATTR, to_remove); - - dout(10) << __func__ << " " << cid << "/" << oid << " '" << name << "' = " - << r << dendl; - return r; -} - -int KeyValueStore::_rmattrs(coll_t cid, const ghobject_t& oid, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << oid << dendl; - - int r; - set attrs; - - StripObjectMap::StripObjectHeaderRef header; - - r = t.lookup_cached_header(cid, oid, &header, false); - if (r < 0) { - dout(10) << __func__ << " could not find header r = " << r - << dendl; - return r; - } - - r = backend->get_keys_with_header(header, OBJECT_XATTR, &attrs); - if (r < 0) { - dout(10) << __func__ << " could not get attrs r = " << r << dendl; - return r; - } - - r = t.remove_buffer_keys(header, OBJECT_XATTR, attrs); - t.clear_buffer_keys(header, OBJECT_XATTR); - - dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl; - return r; -} - - -// collections - -int KeyValueStore::_create_collection(coll_t c, BufferTransaction &t) -{ - dout(15) << __func__ << " " << c << dendl; - int r = 0; - bufferlist bl; - - RWLock::WLocker l(collections_lock); - if (collections.count(c)) { - r = -EEXIST; - goto out; - } - - collections.insert(c); - t.set_collections(collections); - - out: - dout(10) << __func__ << " cid " << c << " r = " << r << dendl; - return r; -} - -int KeyValueStore::_destroy_collection(coll_t c, BufferTransaction &t) -{ - dout(15) << __func__ << " " << c << dendl; - - int r; - uint64_t modified_object = 0; - vector oids; - bufferlist bl; - - { - RWLock::RLocker l(collections_lock); - if (!collections.count(c)) { - r = -ENOENT; - goto out; - } - } - - // All modified objects are marked deleted - for (BufferTransaction::StripHeaderMap::iterator iter = t.strip_headers.begin(); - iter != t.strip_headers.end(); ++iter) { - // sum the total modified object in this PG - if (iter->first.first != c) - continue; - - modified_object++; - if (!iter->second->deleted) { - r = -ENOTEMPTY; - goto out; - } - } - - r = backend->list_objects(c, ghobject_t(), ghobject_t::get_max(), modified_object+1, &oids, - 0); - // No other object - if (oids.size() != modified_object && oids.size() != 0) { - r = -ENOTEMPTY; - goto out; - } - - for (vector::iterator iter = oids.begin(); - iter != oids.end(); ++iter) { - if (!t.strip_headers.count(make_pair(c, *iter))) { - r = -ENOTEMPTY; - goto out; - } - } - - { - RWLock::WLocker l(collections_lock); - collections.erase(c); - t.set_collections(collections); - } - r = 0; - -out: - dout(10) << __func__ << " " << c << " = " << r << dendl; - return r; -} - - -int KeyValueStore::_collection_add(coll_t c, coll_t oldcid, - const ghobject_t& o, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" - << o << dendl; - - bufferlist bl; - StripObjectMap::StripObjectHeaderRef header, old_header; - - int r = t.lookup_cached_header(oldcid, o, &old_header, false); - if (r < 0) { - goto out; - } - - r = t.lookup_cached_header(c, o, &header, false); - if (r == 0) { - r = -EEXIST; - dout(10) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" - << o << " already exist " << dendl; - goto out; - } - - r = _generic_read(old_header, 0, old_header->max_size, bl, &t); - if (r < 0) { - r = -EINVAL; - goto out; - } - - r = _generic_write(header, 0, bl.length(), bl, t); - if (r < 0) { - r = -EINVAL; - } - -out: - dout(10) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" - << o << " = " << r << dendl; - return r; -} - -int KeyValueStore::_collection_move_rename(coll_t oldcid, - const ghobject_t& oldoid, - coll_t c, const ghobject_t& o, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" - << oldoid << dendl; - int r; - StripObjectMap::StripObjectHeaderRef header; - - r = t.lookup_cached_header(c, o, &header, false); - if (r == 0) { - dout(10) << __func__ << " " << oldcid << "/" << oldoid << " -> " << c - << "/" << o << " = " << r << dendl; - return -EEXIST; - } - - r = t.lookup_cached_header(oldcid, oldoid, &header, false); - if (r < 0) { - dout(10) << __func__ << " " << oldcid << "/" << oldoid << " -> " << c - << "/" << o << " = " << r << dendl; - return r; - } - - t.rename_buffer(header, c, o); - - dout(10) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" - << oldoid << " = " << r << dendl; - return r; -} - -int KeyValueStore::_collection_remove_recursive(const coll_t &cid, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << dendl; - int r = 0; - - { - RWLock::RLocker l(collections_lock); - if (collections.count(cid) == 0) - return -ENOENT; - } - - vector objects; - ghobject_t max; - while (!max.is_max()) { - r = collection_list(cid, max, ghobject_t::get_max(), true, 300, &objects, &max); - if (r < 0) - goto out; - - for (vector::iterator i = objects.begin(); - i != objects.end(); ++i) { - r = _remove(cid, *i, t); - if (r < 0) - goto out; - } - } - - { - RWLock::WLocker l(collections_lock); - collections.erase(cid); - t.set_collections(collections); - } - - out: - dout(10) << __func__ << " " << cid << " r = " << r << dendl; - return r; -} - -int KeyValueStore::list_collections(vector& ls) -{ - dout(10) << __func__ << " " << dendl; - RWLock::RLocker l(collections_lock); - for (set::iterator p = collections.begin(); p != collections.end(); - ++p) { - ls.push_back(*p); - } - return 0; -} - -bool KeyValueStore::collection_exists(coll_t c) -{ - dout(10) << __func__ << " " << dendl; - RWLock::RLocker l(collections_lock); - return collections.count(c); -} - -bool KeyValueStore::collection_empty(coll_t c) -{ - dout(10) << __func__ << " " << dendl; - - vector oids; - backend->list_objects(c, ghobject_t(), ghobject_t::get_max(), 1, &oids, 0); - - return oids.empty(); -} - -int KeyValueStore::collection_list(coll_t c, ghobject_t start, - ghobject_t end, bool sort_bitwise, int max, - vector *ls, ghobject_t *next) -{ - if (!sort_bitwise) - return -EOPNOTSUPP; - - if (max < 0) - return -EINVAL; - - if (start.is_max()) - return 0; - - int r = backend->list_objects(c, start, end, max, ls, next); - return r; -} - -int KeyValueStore::collection_version_current(coll_t c, uint32_t *version) -{ - *version = COLLECTION_VERSION; - if (*version == target_version) - return 1; - else - return 0; -} - -// omap - -int KeyValueStore::omap_get(coll_t c, const ghobject_t &hoid, - bufferlist *bl, map *out) -{ - dout(15) << __func__ << " " << c << "/" << hoid << dendl; - - StripObjectMap::StripObjectHeaderRef header; - - int r = backend->lookup_strip_header(c, hoid, &header); - if (r < 0) { - dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl; - return r; - } - - r = backend->get_with_header(header, OBJECT_OMAP, out); - if (r < 0) { - dout(10) << __func__ << " err r =" << r << dendl; - return r; - } - - set keys; - map got; - - keys.insert(OBJECT_OMAP_HEADER_KEY); - r = backend->get_values_with_header(header, OBJECT_OMAP_HEADER, keys, &got); - if (r < 0 && r != -ENOENT) { - dout(10) << __func__ << " err r =" << r << dendl; - return r; - } - - if (!got.empty()) { - assert(got.size() == 1); - bl->swap(got.begin()->second); - } - - return 0; -} - -int KeyValueStore::omap_get_header(coll_t c, const ghobject_t &hoid, - bufferlist *bl, bool allow_eio) -{ - dout(15) << __func__ << " " << c << "/" << hoid << dendl; - - set keys; - map got; - StripObjectMap::StripObjectHeaderRef header; - - int r = backend->lookup_strip_header(c, hoid, &header); - if (r < 0) { - dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl; - return r; - } - - keys.insert(OBJECT_OMAP_HEADER_KEY); - r = backend->get_values_with_header(header, OBJECT_OMAP_HEADER, keys, &got); - if (r < 0 && r != -ENOENT) { - dout(10) << __func__ << " err r =" << r << dendl; - return r; - } - - if (!got.empty()) { - assert(got.size() == 1); - bl->swap(got.begin()->second); - } - - return 0; -} - -int KeyValueStore::omap_get_keys(coll_t c, const ghobject_t &hoid, set *keys) -{ - dout(15) << __func__ << " " << c << "/" << hoid << dendl; - - StripObjectMap::StripObjectHeaderRef header; - int r = backend->lookup_strip_header(c, hoid, &header); - if (r < 0) { - dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl; - return r; - } - - r = backend->get_keys_with_header(header, OBJECT_OMAP, keys); - if (r < 0) { - return r; - } - return 0; -} - -int KeyValueStore::omap_get_values(coll_t c, const ghobject_t &hoid, - const set &keys, - map *out) -{ - dout(15) << __func__ << " " << c << "/" << hoid << dendl; - - StripObjectMap::StripObjectHeaderRef header; - int r = backend->lookup_strip_header(c, hoid, &header); - if (r < 0) { - dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl; - return r; - } - - r = backend->get_values_with_header(header, OBJECT_OMAP, keys, out); - if (r < 0 && r != -ENOENT) { - return r; - } - return 0; -} - -int KeyValueStore::omap_check_keys(coll_t c, const ghobject_t &hoid, - const set &keys, set *out) -{ - dout(15) << __func__ << " " << c << "/" << hoid << dendl; - - int r = backend->check_keys(c, hoid, OBJECT_OMAP, keys, out); - if (r < 0 && r != -ENOENT) { - return r; - } - return 0; -} - -ObjectMap::ObjectMapIterator KeyValueStore::get_omap_iterator( - coll_t c, const ghobject_t &hoid) -{ - dout(15) << __func__ << " " << c << "/" << hoid << dendl; - return backend->get_iterator(c, hoid, OBJECT_OMAP); -} - -int KeyValueStore::_omap_clear(coll_t cid, const ghobject_t &hoid, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << hoid << dendl; - - StripObjectMap::StripObjectHeaderRef header; - - int r = t.lookup_cached_header(cid, hoid, &header, false); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << hoid << " " - << " failed to get header: r = " << r << dendl; - return r; - } - - set keys; - r = backend->get_keys_with_header(header, OBJECT_OMAP, &keys); - if (r < 0) { - dout(10) << __func__ << " could not get omap_keys r = " << r << dendl; - return r; - } - - r = t.remove_buffer_keys(header, OBJECT_OMAP, keys); - if (r < 0) { - dout(10) << __func__ << " could not remove keys r = " << r << dendl; - return r; - } - - keys.clear(); - keys.insert(OBJECT_OMAP_HEADER_KEY); - r = t.remove_buffer_keys(header, OBJECT_OMAP_HEADER, keys); - if (r < 0) { - dout(10) << __func__ << " could not remove keys r = " << r << dendl; - return r; - } - - t.clear_buffer_keys(header, OBJECT_OMAP_HEADER); - - dout(10) << __func__ << " " << cid << "/" << hoid << " r = " << r << dendl; - return 0; -} - -int KeyValueStore::_omap_setkeys(coll_t cid, const ghobject_t &hoid, - map &aset, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << hoid << dendl; - - StripObjectMap::StripObjectHeaderRef header; - - int r = t.lookup_cached_header(cid, hoid, &header, false); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << hoid << " " - << " failed to get header: r = " << r << dendl; - return r; - } - - t.set_buffer_keys(header, OBJECT_OMAP, aset); - - return 0; -} - -int KeyValueStore::_omap_rmkeys(coll_t cid, const ghobject_t &hoid, - const set &keys, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << hoid << dendl; - - StripObjectMap::StripObjectHeaderRef header; - - int r = t.lookup_cached_header(cid, hoid, &header, false); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << hoid << " " - << " failed to get header: r = " << r << dendl; - return r; - } - - r = t.remove_buffer_keys(header, OBJECT_OMAP, keys); - - dout(10) << __func__ << " " << cid << "/" << hoid << " r = " << r << dendl; - return r; -} - -int KeyValueStore::_omap_rmkeyrange(coll_t cid, const ghobject_t &hoid, - const string& first, const string& last, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << hoid << " [" << first << "," - << last << "]" << dendl; - - set keys; - { - ObjectMap::ObjectMapIterator iter = get_omap_iterator(cid, hoid); - if (!iter) - return -ENOENT; - - for (iter->lower_bound(first); iter->valid() && iter->key() < last; - iter->next(false)) { - keys.insert(iter->key()); - } - } - return _omap_rmkeys(cid, hoid, keys, t); -} - -int KeyValueStore::_omap_setheader(coll_t cid, const ghobject_t &hoid, - const bufferlist &bl, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << hoid << dendl; - - map sets; - StripObjectMap::StripObjectHeaderRef header; - - int r = t.lookup_cached_header(cid, hoid, &header, false); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << hoid << " " - << " failed to get header: r = " << r << dendl; - return r; - } - - sets[OBJECT_OMAP_HEADER_KEY] = bl; - t.set_buffer_keys(header, OBJECT_OMAP_HEADER, sets); - return 0; -} - -int KeyValueStore::_split_collection(coll_t cid, uint32_t bits, uint32_t rem, - coll_t dest, BufferTransaction &t) -{ - { - dout(15) << __func__ << " " << cid << " bits: " << bits << dendl; - - StripObjectMap::StripObjectHeaderRef header; - - { - RWLock::RLocker l(collections_lock); - if (collections.count(cid) == 0) - return -ENOENT; - if (collections.count(dest) == 0) - return -ENOENT; - } - - vector objects; - ghobject_t next, current; - int move_size = 0; - while (1) { - collection_list(cid, current, ghobject_t::get_max(), true, - get_ideal_list_max(), &objects, &next); - - dout(20) << __func__ << cid << "objects size: " << objects.size() - << dendl; - - if (objects.empty()) - break; - - for (vector::iterator i = objects.begin(); - i != objects.end(); ++i) { - if (i->match(bits, rem)) { - if (_collection_move_rename(cid, *i, dest, *i, t) < 0) { - return -1; - } - move_size++; - } - } - - objects.clear(); - current = next; - } - - dout(20) << __func__ << "move" << move_size << " object from " << cid - << "to " << dest << dendl; - } - - if (g_conf->filestore_debug_verify_split) { - vector objects; - ghobject_t next; - while (1) { - collection_list(cid, next, ghobject_t::get_max(), true, - get_ideal_list_max(), &objects, &next); - if (objects.empty()) - break; - - for (vector::iterator i = objects.begin(); - i != objects.end(); ++i) { - dout(20) << __func__ << ": " << *i << " still in source " - << cid << dendl; - assert(!i->match(bits, rem)); - } - objects.clear(); - } - - next = ghobject_t(); - while (1) { - collection_list(dest, next, ghobject_t::get_max(), true, - get_ideal_list_max(), &objects, &next); - if (objects.empty()) - break; - - for (vector::iterator i = objects.begin(); - i != objects.end(); ++i) { - dout(20) << __func__ << ": " << *i << " now in dest " - << *i << dendl; - assert(i->match(bits, rem)); - } - objects.clear(); - } - } - return 0; -} - -int KeyValueStore::_set_alloc_hint(coll_t cid, const ghobject_t& oid, - uint64_t expected_object_size, - uint64_t expected_write_size, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << oid << " object_size " - << expected_object_size << " write_size " - << expected_write_size << dendl; - - int r = 0; - StripObjectMap::StripObjectHeaderRef header; - - r = t.lookup_cached_header(cid, oid, &header, false); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oid - << " failed to get header: r = " << r << dendl; - return r; - } - - bool blank = true; - for (vector::iterator it = header->bits.begin(); - it != header->bits.end(); ++it) { - if (*it) { - blank = false; - break; - } - } - - // Now only consider to change "strip_size" when the object is blank, - // because set_alloc_hint is expected to be very lightweight - if (blank) { - // header->strip_size = MIN(expected_write_size, m_keyvaluestore_max_expected_write_size); - // dout(20) << __func__ << " hint " << header->strip_size << " success" << dendl; - } - - dout(10) << __func__ << "" << cid << "/" << oid << " object_size " - << expected_object_size << " write_size " - << expected_write_size << " = " << r << dendl; - - return r; -} - -const char** KeyValueStore::get_tracked_conf_keys() const -{ - static const char* KEYS[] = { - "keyvaluestore_queue_max_ops", - "keyvaluestore_queue_max_bytes", - "keyvaluestore_default_strip_size", - "keyvaluestore_dump_file", - NULL - }; - return KEYS; -} - -void KeyValueStore::handle_conf_change(const struct md_config_t *conf, - const std::set &changed) -{ - if (changed.count("keyvaluestore_queue_max_ops") || - changed.count("keyvaluestore_queue_max_bytes") || - changed.count("keyvaluestore_max_expected_write_size")) { - m_keyvaluestore_queue_max_ops = conf->keyvaluestore_queue_max_ops; - m_keyvaluestore_queue_max_bytes = conf->keyvaluestore_queue_max_bytes; - m_keyvaluestore_max_expected_write_size = conf->keyvaluestore_max_expected_write_size; - throttle_ops.reset_max(conf->keyvaluestore_queue_max_ops); - throttle_bytes.reset_max(conf->keyvaluestore_queue_max_bytes); - } - if (changed.count("keyvaluestore_default_strip_size")) { - m_keyvaluestore_strip_size = conf->keyvaluestore_default_strip_size; - default_strip_size = m_keyvaluestore_strip_size; - } - if (changed.count("keyvaluestore_dump_file")) { - if (conf->keyvaluestore_dump_file.length() && - conf->keyvaluestore_dump_file != "-") { - dump_start(conf->keyvaluestore_dump_file); - } else { - dump_stop(); - } - } -} - -int KeyValueStore::check_get_rc(const coll_t cid, const ghobject_t& oid, int r, bool is_equal_size) -{ - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oid << " " - << " get rc = " << r << dendl; - } else if (!is_equal_size) { - dout(0) << __func__ << " broken header or missing data in backend " - << cid << "/" << oid << " get rc = " << r << dendl; - r = -EBADF; - } - return r; -} - -void KeyValueStore::dump_start(const std::string &file) -{ - dout(10) << "dump_start " << file << dendl; - if (m_keyvaluestore_do_dump) { - dump_stop(); - } - m_keyvaluestore_dump_fmt.reset(); - m_keyvaluestore_dump_fmt.open_array_section("dump"); - m_keyvaluestore_dump.open(file.c_str()); - m_keyvaluestore_do_dump = true; -} - -void KeyValueStore::dump_stop() -{ - dout(10) << "dump_stop" << dendl; - m_keyvaluestore_do_dump = false; - if (m_keyvaluestore_dump.is_open()) { - m_keyvaluestore_dump_fmt.close_section(); - m_keyvaluestore_dump_fmt.flush(m_keyvaluestore_dump); - m_keyvaluestore_dump.flush(); - m_keyvaluestore_dump.close(); - } -} -void KeyValueStore::dump_transactions(list& ls, uint64_t seq, OpSequencer *osr) -{ - m_keyvaluestore_dump_fmt.open_array_section("transactions"); - unsigned trans_num = 0; - for (list::iterator i = ls.begin(); i != ls.end(); ++i, ++trans_num) { - m_keyvaluestore_dump_fmt.open_object_section("transaction"); - m_keyvaluestore_dump_fmt.dump_string("osr", osr->get_name()); - m_keyvaluestore_dump_fmt.dump_unsigned("seq", seq); - m_keyvaluestore_dump_fmt.dump_unsigned("trans_num", trans_num); - (*i)->dump(&m_keyvaluestore_dump_fmt); - m_keyvaluestore_dump_fmt.close_section(); - } - m_keyvaluestore_dump_fmt.close_section(); - m_keyvaluestore_dump_fmt.flush(m_keyvaluestore_dump); - m_keyvaluestore_dump.flush(); -} - - -// -- KVSuperblock -- - -void KVSuperblock::encode(bufferlist &bl) const -{ - ENCODE_START(1, 1, bl); - compat_features.encode(bl); - ::encode(backend, bl); - ENCODE_FINISH(bl); -} - -void KVSuperblock::decode(bufferlist::iterator &bl) -{ - DECODE_START(1, bl); - compat_features.decode(bl); - ::decode(backend, bl); - DECODE_FINISH(bl); -} - -void KVSuperblock::dump(Formatter *f) const -{ - f->open_object_section("compat"); - compat_features.dump(f); - f->dump_string("backend", backend); - f->close_section(); -} - -void KVSuperblock::generate_test_instances(list& o) -{ - KVSuperblock z; - o.push_back(new KVSuperblock(z)); - CompatSet::FeatureSet feature_compat; - CompatSet::FeatureSet feature_ro_compat; - CompatSet::FeatureSet feature_incompat; - z.compat_features = CompatSet(feature_compat, feature_ro_compat, - feature_incompat); - o.push_back(new KVSuperblock(z)); - z.backend = "rocksdb"; - o.push_back(new KVSuperblock(z)); -} diff --git a/src/os/keyvaluestore/KeyValueStore.h b/src/os/keyvaluestore/KeyValueStore.h deleted file mode 100644 index 9e245ee3808d..000000000000 --- a/src/os/keyvaluestore/KeyValueStore.h +++ /dev/null @@ -1,700 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2013 UnitedStack - * - * Author: Haomai Wang - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef CEPH_KEYVALUESTORE_H -#define CEPH_KEYVALUESTORE_H - -#include "include/types.h" - -#include -#include -#include -#include -using namespace std; - -#include "include/assert.h" - -#include "os/ObjectStore.h" - -#include "common/WorkQueue.h" -#include "common/Finisher.h" -#include "common/fd.h" - -#include "common/Mutex.h" -#include "GenericObjectMap.h" -#include "kv/KeyValueDB.h" -#include "common/random_cache.hpp" - -#include "include/uuid.h" - -static uint64_t default_strip_size = 1024; - -class StripObjectMap: public GenericObjectMap { - public: - - struct StripExtent { - uint64_t no; - uint64_t offset; // in key - uint64_t len; // in key - StripExtent(uint64_t n, uint64_t off, size_t len): - no(n), offset(off), len(len) {} - }; - - // -- strip object -- - struct StripObjectHeader { - // Persistent state - uint64_t strip_size; - uint64_t max_size; - vector bits; - - // soft state - Header header; // FIXME: Hold lock to avoid concurrent operations, it will - // also block read operation which not should be permitted. - coll_t cid; - ghobject_t oid; - bool updated; - bool deleted; - - StripObjectHeader(): strip_size(default_strip_size), max_size(0), updated(false), deleted(false) {} - - void encode(bufferlist &bl) const { - ENCODE_START(1, 1, bl); - ::encode(strip_size, bl); - ::encode(max_size, bl); - ::encode(bits, bl); - ENCODE_FINISH(bl); - } - - void decode(bufferlist::iterator &bl) { - DECODE_START(1, bl); - ::decode(strip_size, bl); - ::decode(max_size, bl); - ::decode(bits, bl); - DECODE_FINISH(bl); - } - }; - typedef ceph::shared_ptr StripObjectHeaderRef; - - static int file_to_extents(uint64_t offset, size_t len, uint64_t strip_size, - vector &extents); - int lookup_strip_header(const coll_t & cid, const ghobject_t &oid, - StripObjectHeaderRef *header); - int save_strip_header(StripObjectHeaderRef header, KeyValueDB::Transaction t); - int create_strip_header(const coll_t &cid, const ghobject_t &oid, - StripObjectHeaderRef *strip_header, - KeyValueDB::Transaction t); - void clone_wrap(StripObjectHeaderRef old_header, - const coll_t &cid, const ghobject_t &oid, - KeyValueDB::Transaction t, - StripObjectHeaderRef *target_header); - void rename_wrap(StripObjectHeaderRef old_header, const coll_t &cid, const ghobject_t &oid, - KeyValueDB::Transaction t, - StripObjectHeaderRef *new_header); - // Already hold header to avoid lock header seq again - int get_with_header( - const StripObjectHeaderRef header, - const string &prefix, - map *out - ); - - int get_values_with_header( - const StripObjectHeaderRef header, - const string &prefix, - const set &keys, - map *out - ); - int get_keys_with_header( - const StripObjectHeaderRef header, - const string &prefix, - set *keys - ); - - Mutex lock; - void invalidate_cache(const coll_t &c, const ghobject_t &oid) { - Mutex::Locker l(lock); - caches.clear(oid); - } - - RandomCache > caches; - StripObjectMap(KeyValueDB *db): GenericObjectMap(db), - lock("StripObjectMap::lock"), - caches(g_conf->keyvaluestore_header_cache_size) - {} -}; - - -class KVSuperblock { -public: - CompatSet compat_features; - string backend; - - KVSuperblock() { } - - void encode(bufferlist &bl) const; - void decode(bufferlist::iterator &bl); - void dump(Formatter *f) const; - static void generate_test_instances(list& o); -}; -WRITE_CLASS_ENCODER(KVSuperblock) - - -inline ostream& operator<<(ostream& out, const KVSuperblock& sb) -{ - return out << "sb(" << sb.compat_features << " " << sb.backend << ")"; -} - - -class KeyValueStore : public ObjectStore, - public md_config_obs_t { - public: - struct KVPerfTracker { - PerfCounters::avg_tracker os_commit_latency; - PerfCounters::avg_tracker os_apply_latency; - - objectstore_perf_stat_t get_cur_stats() const { - objectstore_perf_stat_t ret; - ret.filestore_commit_latency = os_commit_latency.avg(); - ret.filestore_apply_latency = os_apply_latency.avg(); - return ret; - } - - void update_from_perfcounters(PerfCounters &logger) { - os_commit_latency.consume_next( - logger.get_tavg_ms( - l_os_commit_lat)); - os_apply_latency.consume_next( - logger.get_tavg_ms( - l_os_apply_lat)); - } - - } perf_tracker; - - objectstore_perf_stat_t get_cur_stats() { - perf_tracker.update_from_perfcounters(*perf_logger); - return perf_tracker.get_cur_stats(); - } - - static const uint32_t target_version = 1; - - private: - string internal_name; // internal name, used to name the perfcounter instance - string basedir; - std::string current_fn; - uuid_d fsid; - - int fsid_fd, current_fd; - - deque snaps; - - // ObjectMap - boost::scoped_ptr backend; - - Finisher ondisk_finisher; - - RWLock collections_lock; - set collections; - - Mutex lock; - - int _create_current(); - - /// read a uuid from fd - int read_fsid(int fd, uuid_d *uuid); - - /// lock fsid_fd - int lock_fsid(); - - string strip_object_key(uint64_t no) { - char n[100]; - snprintf(n, 100, "%08lld", (long long)no); - return string(n); - } - - // Each transaction has side effect which may influent the following - // operations, we need to make it visible for the following within - // transaction by caching middle result. - // Side effects contains: - // 1. Creating/Deleting collection - // 2. Creating/Deleting object - // 3. Object modify(including omap, xattr) - // 4. Clone or rename - struct BufferTransaction { - typedef pair uniq_id; - - struct CollGhobjectPairBitwiseComparator { - bool operator()(const uniq_id& l, - const uniq_id& r) const { - if (l.first < r.first) - return true; - if (l.first != r.first) - return false; - if (cmp_bitwise(l.second, r.second) < 0) - return true; - return false; - } - }; - - typedef map StripHeaderMap; - - //Dirty records - StripHeaderMap strip_headers; - map< uniq_id, map, bufferlist>, - CollGhobjectPairBitwiseComparator> buffers; // pair(prefix, key),to buffer updated data in one transaction - - list finishes; - - KeyValueStore *store; - - KeyValueDB::Transaction t; - - void set_collections(const set& collections) { - bufferlist collections_bl; - ::encode(collections, collections_bl); - t->set("meta", "collections", collections_bl); - } - - int lookup_cached_header(const coll_t &cid, const ghobject_t &oid, - StripObjectMap::StripObjectHeaderRef *strip_header, - bool create_if_missing); - int get_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header, - const string &prefix, const set &keys, - map *out); - void set_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header, - const string &prefix, map &bl); - int remove_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header, - const string &prefix, const set &keys); - void clear_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header, - const string &prefix); - int clear_buffer(StripObjectMap::StripObjectHeaderRef strip_header); - void clone_buffer(StripObjectMap::StripObjectHeaderRef old_header, - const coll_t &cid, const ghobject_t &oid); - void rename_buffer(StripObjectMap::StripObjectHeaderRef old_header, - const coll_t &cid, const ghobject_t &oid); - int submit_transaction(); - - BufferTransaction(KeyValueStore *store): store(store) { - t = store->backend->get_transaction(); - } - - struct InvalidateCacheContext : public Context { - KeyValueStore *store; - const coll_t cid; - const ghobject_t oid; - InvalidateCacheContext(KeyValueStore *s, const coll_t &c, const ghobject_t &oid): store(s), cid(c), oid(oid) {} - void finish(int r) { - if (r == 0) - store->backend->invalidate_cache(cid, oid); - } - }; - }; - - // -- op workqueue -- - struct Op { - utime_t start; - uint64_t op; - list tls; - Context *ondisk, *onreadable, *onreadable_sync; - uint64_t ops, bytes; - TrackedOpRef osd_op; - }; - class OpSequencer : public Sequencer_impl { - Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock) - list q; - Cond cond; - list > flush_commit_waiters; - uint64_t op; // used by flush() to know the sequence of op - public: - Sequencer *parent; - Mutex apply_lock; // for apply mutual exclusion - - /// get_max_uncompleted - bool _get_max_uncompleted( - uint64_t *seq ///< [out] max uncompleted seq - ) { - assert(qlock.is_locked()); - assert(seq); - *seq = 0; - if (q.empty()) { - return true; - } else { - *seq = q.back()->op; - return false; - } - } /// @returns true if the queue is empty - - /// get_min_uncompleted - bool _get_min_uncompleted( - uint64_t *seq ///< [out] min uncompleted seq - ) { - assert(qlock.is_locked()); - assert(seq); - *seq = 0; - if (q.empty()) { - return true; - } else { - *seq = q.front()->op; - return false; - } - } /// @returns true if both queues are empty - - void _wake_flush_waiters(list *to_queue) { - uint64_t seq; - if (_get_min_uncompleted(&seq)) - seq = -1; - - for (list >::iterator i = - flush_commit_waiters.begin(); - i != flush_commit_waiters.end() && i->first < seq; - flush_commit_waiters.erase(i++)) { - to_queue->push_back(i->second); - } - } - - void queue(Op *o) { - Mutex::Locker l(qlock); - q.push_back(o); - op++; - o->op = op; - } - Op *peek_queue() { - assert(apply_lock.is_locked()); - return q.front(); - } - - Op *dequeue(list *to_queue) { - assert(to_queue); - assert(apply_lock.is_locked()); - Mutex::Locker l(qlock); - Op *o = q.front(); - q.pop_front(); - cond.Signal(); - - _wake_flush_waiters(to_queue); - return o; - } - - void flush() { - Mutex::Locker l(qlock); - - // get max for journal _or_ op queues - uint64_t seq = 0; - if (!q.empty()) - seq = q.back()->op; - - if (seq) { - // everything prior to our watermark to drain through either/both - // queues - while (!q.empty() && q.front()->op <= seq) - cond.Wait(qlock); - } - } - bool flush_commit(Context *c) { - Mutex::Locker l(qlock); - uint64_t seq = 0; - if (_get_max_uncompleted(&seq)) { - return true; - } else { - flush_commit_waiters.push_back(make_pair(seq, c)); - return false; - } - } - - OpSequencer() - : qlock("KeyValueStore::OpSequencer::qlock", false, false), - op(0), parent(0), - apply_lock("KeyValueStore::OpSequencer::apply_lock", false, false) {} - ~OpSequencer() { - assert(q.empty()); - } - - const string& get_name() const { - return parent->get_name(); - } - }; - - friend ostream& operator<<(ostream& out, const OpSequencer& s); - - deque op_queue; - Throttle throttle_ops, throttle_bytes; - Finisher op_finisher; - - ThreadPool op_tp; - struct OpWQ : public ThreadPool::WorkQueue { - KeyValueStore *store; - OpWQ(KeyValueStore *fs, time_t timeout, time_t suicide_timeout, - ThreadPool *tp) : - ThreadPool::WorkQueue("KeyValueStore::OpWQ", - timeout, suicide_timeout, tp), - store(fs) {} - - bool _enqueue(OpSequencer *osr) { - store->op_queue.push_back(osr); - return true; - } - void _dequeue(OpSequencer *o) { - assert(0); - } - bool _empty() { - return store->op_queue.empty(); - } - OpSequencer *_dequeue() { - if (store->op_queue.empty()) - return NULL; - OpSequencer *osr = store->op_queue.front(); - store->op_queue.pop_front(); - return osr; - } - using ThreadPool::WorkQueue::_process; - void _process(OpSequencer *osr, ThreadPool::TPHandle &handle) { - store->_do_op(osr, handle); - } - void _process_finish(OpSequencer *osr) { - store->_finish_op(osr); - } - void _clear() { - assert(store->op_queue.empty()); - } - } op_wq; - - Op *build_op(list& tls, Context *ondisk, Context *onreadable, - Context *onreadable_sync, TrackedOpRef osd_op); - void queue_op(OpSequencer *osr, Op *o); - void op_queue_reserve_throttle(Op *o, ThreadPool::TPHandle *handle = NULL); - void _do_op(OpSequencer *osr, ThreadPool::TPHandle &handle); - void op_queue_release_throttle(Op *o); - void _finish_op(OpSequencer *osr); - - PerfCounters *perf_logger; - - public: - - KeyValueStore(const std::string &base, - const char *internal_name = "keyvaluestore", - bool update_to=false); - ~KeyValueStore(); - - bool test_mount_in_use(); - int version_stamp_is_valid(uint32_t *version); - int update_version_stamp(); - uint32_t get_target_version() { - return target_version; - } - - int write_version_stamp(); - int mount(); - int umount(); - unsigned get_max_object_name_length() { - return 4096; // no real limit for leveldb - } - unsigned get_max_attr_name_length() { - return 256; // arbitrary; there is no real limit internally - } - int mkfs(); - int mkjournal() {return 0;} - bool wants_journal() { - return false; - } - bool allows_journal() { - return false; - } - bool needs_journal() { - return false; - } - - void collect_metadata(map *pm); - - int statfs(struct statfs *buf); - - int _do_transactions( - list &tls, uint64_t op_seq, - ThreadPool::TPHandle *handle); - int do_transactions(list &tls, uint64_t op_seq) { - return _do_transactions(tls, op_seq, 0); - } - void _do_transaction(Transaction& transaction, - BufferTransaction &bt, - ThreadPool::TPHandle *handle); - - int queue_transactions(Sequencer *osr, list& tls, - TrackedOpRef op = TrackedOpRef(), - ThreadPool::TPHandle *handle = NULL); - - - // ------------------ - // objects - - int _generic_read(StripObjectMap::StripObjectHeaderRef header, - uint64_t offset, size_t len, bufferlist& bl, - bool allow_eio = false, BufferTransaction *bt = 0); - int _generic_write(StripObjectMap::StripObjectHeaderRef header, - uint64_t offset, size_t len, const bufferlist& bl, - BufferTransaction &t, uint32_t fadvise_flags = 0); - - bool exists(coll_t cid, const ghobject_t& oid); - int stat(coll_t cid, const ghobject_t& oid, struct stat *st, - bool allow_eio = false); - int read(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, - bufferlist& bl, uint32_t op_flags = 0, bool allow_eio = false); - int fiemap(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, - bufferlist& bl); - - int _touch(coll_t cid, const ghobject_t& oid, BufferTransaction &t); - int _write(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, - const bufferlist& bl, BufferTransaction &t, uint32_t fadvise_flags = 0); - int _zero(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, - BufferTransaction &t); - int _truncate(coll_t cid, const ghobject_t& oid, uint64_t size, - BufferTransaction &t); - int _clone(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid, - BufferTransaction &t); - int _clone_range(coll_t cid, const ghobject_t& oldoid, - const ghobject_t& newoid, uint64_t srcoff, - uint64_t len, uint64_t dstoff, BufferTransaction &t); - int _remove(coll_t cid, const ghobject_t& oid, BufferTransaction &t); - int _set_alloc_hint(coll_t cid, const ghobject_t& oid, - uint64_t expected_object_size, - uint64_t expected_write_size, - BufferTransaction &t); - - void start_sync() {} - - void set_fsid(uuid_d u) { fsid = u; } - uuid_d get_fsid() { return fsid; } - - // attrs - int getattr(coll_t cid, const ghobject_t& oid, const char *name, - bufferptr &bp); - int getattrs(coll_t cid, const ghobject_t& oid, map& aset); - - int _setattrs(coll_t cid, const ghobject_t& oid, - map& aset, BufferTransaction &t); - int _rmattr(coll_t cid, const ghobject_t& oid, const char *name, - BufferTransaction &t); - int _rmattrs(coll_t cid, const ghobject_t& oid, BufferTransaction &t); - - // collections - int _collection_hint_expected_num_objs(coll_t cid, uint32_t pg_num, - uint64_t num_objs) const { return 0; } - int _create_collection(coll_t c, BufferTransaction &t); - int _destroy_collection(coll_t c, BufferTransaction &t); - int _collection_add(coll_t c, coll_t ocid, const ghobject_t& oid, - BufferTransaction &t); - int _collection_move_rename(coll_t oldcid, const ghobject_t& oldoid, - coll_t c, const ghobject_t& o, - BufferTransaction &t); - int _collection_remove_recursive(const coll_t &cid, - BufferTransaction &t); - int list_collections(vector& ls); - bool collection_exists(coll_t c); - bool collection_empty(coll_t c); - int collection_list(coll_t c, ghobject_t start, ghobject_t end, - bool sort_bitwise, int max, - vector *ls, ghobject_t *next); - int collection_version_current(coll_t c, uint32_t *version); - - // omap (see ObjectStore.h for documentation) - int omap_get(coll_t c, const ghobject_t &oid, bufferlist *header, - map *out); - int omap_get_header( - coll_t c, - const ghobject_t &oid, - bufferlist *out, - bool allow_eio = false); - int omap_get_keys(coll_t c, const ghobject_t &oid, set *keys); - int omap_get_values(coll_t c, const ghobject_t &oid, const set &keys, - map *out); - int omap_check_keys(coll_t c, const ghobject_t &oid, const set &keys, - set *out); - ObjectMap::ObjectMapIterator get_omap_iterator(coll_t c, - const ghobject_t &oid); - - int check_get_rc(const coll_t cid, const ghobject_t& oid, int r, bool is_equal_size); - void dump_start(const std::string &file); - void dump_stop(); - void dump_transactions(list& ls, uint64_t seq, - OpSequencer *osr); - - private: - void _inject_failure() {} - - // omap - int _omap_clear(coll_t cid, const ghobject_t &oid, - BufferTransaction &t); - int _omap_setkeys(coll_t cid, const ghobject_t &oid, - map &aset, - BufferTransaction &t); - int _omap_rmkeys(coll_t cid, const ghobject_t &oid, const set &keys, - BufferTransaction &t); - int _omap_rmkeyrange(coll_t cid, const ghobject_t &oid, - const string& first, const string& last, - BufferTransaction &t); - int _omap_setheader(coll_t cid, const ghobject_t &oid, const bufferlist &bl, - BufferTransaction &t); - int _split_collection(coll_t cid, uint32_t bits, uint32_t rem, coll_t dest, - BufferTransaction &t); - int _split_collection_create(coll_t cid, uint32_t bits, uint32_t rem, - coll_t dest, BufferTransaction &t){ - return 0; - } - - virtual const char** get_tracked_conf_keys() const; - virtual void handle_conf_change(const struct md_config_t *conf, - const std::set &changed); - - std::string m_osd_rollback_to_cluster_snap; - int m_keyvaluestore_queue_max_ops; - int m_keyvaluestore_queue_max_bytes; - int m_keyvaluestore_strip_size; - uint64_t m_keyvaluestore_max_expected_write_size; - int do_update; - bool m_keyvaluestore_do_dump; - std::ofstream m_keyvaluestore_dump; - JSONFormatter m_keyvaluestore_dump_fmt; - - static const string OBJECT_STRIP_PREFIX; - static const string OBJECT_XATTR; - static const string OBJECT_OMAP; - static const string OBJECT_OMAP_HEADER; - static const string OBJECT_OMAP_HEADER_KEY; - static const string COLLECTION; - static const string COLLECTION_ATTR; - static const uint32_t COLLECTION_VERSION = 1; - - KVSuperblock superblock; - /** - * write_superblock() - * - * Write superblock to persisent storage - * - * return value: 0 on success, otherwise negative errno - */ - int write_superblock(); - - /** - * read_superblock() - * - * Fill in KeyValueStore::superblock by reading persistent storage - * - * return value: 0 on success, otherwise negative errno - */ - int read_superblock(); -}; - -WRITE_CLASS_ENCODER(StripObjectMap::StripObjectHeader) - -#endif diff --git a/src/test/ceph_objectstore_tool.py b/src/test/ceph_objectstore_tool.py index 060e14439249..01b0f756eea1 100755 --- a/src/test/ceph_objectstore_tool.py +++ b/src/test/ceph_objectstore_tool.py @@ -417,7 +417,7 @@ def set_osd_weight(CFSD_PREFIX, osd_ids, osd_path, weight): osdmap_file=osdmap_file.name) output = check_output(cmd, shell=True) epoch = int(re.findall('#(\d+)', output)[0]) - + new_crush_file = tempfile.NamedTemporaryFile(delete=False) old_crush_file = tempfile.NamedTemporaryFile(delete=False) ret = call("./osdmaptool --export-crush {crush_file} {osdmap_file}".format(osdmap_file=osdmap_file.name, @@ -889,7 +889,7 @@ def main(argv): # Specify a bad --type os.mkdir(OSDDIR + "/fakeosd") cmd = ("./ceph-objectstore-tool --data-path " + OSDDIR + "/{osd} --type foobar --op list --pgid {pg}").format(osd="fakeosd", pg=ONEPG) - ERRORS += test_failure(cmd, "Need a valid --type e.g. filestore, memstore, keyvaluestore") + ERRORS += test_failure(cmd, "Need a valid --type e.g. filestore, memstore") # Don't specify a data-path cmd = "./ceph-objectstore-tool --journal-path {dir}/{osd}.journal --type memstore --op list --pgid {pg}".format(dir=OSDDIR, osd=ONEOSD, pg=ONEPG) diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc index d57e82c221dd..aabaab1c3aa8 100644 --- a/src/test/objectstore/store_test.cc +++ b/src/test/objectstore/store_test.cc @@ -104,8 +104,6 @@ TEST_P(StoreTest, collect_metadata) { ASSERT_NE(pm.count("filestore_f_type"), 0u); ASSERT_NE(pm.count("backend_filestore_partition_path"), 0u); ASSERT_NE(pm.count("backend_filestore_dev_node"), 0u); - } else if (GetParam() == string("keyvaluestore")) { - ASSERT_NE(pm.count("keyvaluestore_backend"), 0u); } } @@ -3258,7 +3256,6 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values( "memstore", "filestore", - "keyvaluestore", "bluestore", "kstore")); diff --git a/src/tools/ceph_objectstore_tool.cc b/src/tools/ceph_objectstore_tool.cc index 7f39f92ca2ef..33ee6096b3f9 100644 --- a/src/tools/ceph_objectstore_tool.cc +++ b/src/tools/ceph_objectstore_tool.cc @@ -2220,7 +2220,7 @@ int main(int argc, char **argv) desc.add_options() ("help", "produce help message") ("type", po::value(&type), - "Arg is one of [filestore (default), memstore, keyvaluestore]") + "Arg is one of [filestore (default), memstore]") ("data-path", po::value(&dpath), "path to object store, mandatory") ("journal-path", po::value(&jpath), @@ -2466,12 +2466,7 @@ int main(int argc, char **argv) ObjectStore *fs = ObjectStore::create(g_ceph_context, type, dpath, jpath, flags); if (fs == NULL) { - cerr << "Need a valid --type e.g. filestore, memstore, keyvaluestore" << std::endl; - if (type == "keyvaluestore") { - cerr << "Add \"keyvaluestore\" to " - << "enable_experimental_unrecoverable_data_corrupting_features" - << std::endl; - } + cerr << "Need a valid --type e.g. filestore, memstore" << std::endl; myexit(1); } -- 2.47.3