From: Sage Weil Date: Wed, 23 Dec 2015 16:30:30 +0000 (-0500) Subject: os/keyvaluestore: move KeyValueStore into os/keyvaluestore/* X-Git-Tag: v10.0.3~154^2~38 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=98a0e107aa8154b021a45884b78092e986bd032d;p=ceph.git os/keyvaluestore: move KeyValueStore into os/keyvaluestore/* Signed-off-by: Sage Weil --- diff --git a/src/messages/#MOSDOp.h# b/src/messages/#MOSDOp.h# new file mode 100644 index 000000000000..8de809fab7c9 --- /dev/null +++ b/src/messages/#MOSDOp.h# @@ -0,0 +1,397 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_MOSDOP_H +#define CEPH_MOSDOP_H + +#include "msg/Message.h" +#include "osd/osd_types.h" +#include "include/ceph_features.h" + +/* + * OSD op + * + * oid - object id + * op - OSD_OP_DELETE, etc. + * + */ + +class OSD; + +class MOSDOp : public Message { + + static const int HEAD_VERSION = 5; + static const int COMPAT_VERSION = 3; + +private: + uint32_t client_inc; +/ __u32 osdmap_epoch; + __u32 flags; + utime_t mtime; + eversion_t reassert_version; + int32_t retry_attempt; // 0 is first attempt. -1 if we don't know. + +? object_t oid; + object_locator_t oloc; +/ pg_t pgid; +public: + vector ops; +private: + + snapid_t snapid; + snapid_t snap_seq; + vector snaps; + + uint64_t features; + +public: + friend class MOSDOpReply; + + // read + const snapid_t& get_snapid() { return snapid; } + void set_snapid(const snapid_t& s) { snapid = s; } + // writ + const snapid_t& get_snap_seq() const { return snap_seq; } + const vector &get_snaps() const { return snaps; } + void set_snaps(const vector& i) { + snaps = i; + } + void set_snap_seq(const snapid_t& s) { snap_seq = s; } + + osd_reqid_t get_reqid() const { + return osd_reqid_t(get_orig_source(), + client_inc, + header.tid); + } + int get_client_inc() { return client_inc; } + ceph_tid_t get_client_tid() { return header.tid; } + + object_t& get_oid() { return oid; } + + const pg_t& get_pg() const { return pgid; } + + const object_locator_t& get_object_locator() const { + return oloc; + } + + epoch_t get_map_epoch() { return osdmap_epoch; } + + const eversion_t& get_version() { return reassert_version; } + + utime_t get_mtime() { return mtime; } + + MOSDOp() + : Message(CEPH_MSG_OSD_OP, HEAD_VERSION, COMPAT_VERSION) { } + MOSDOp(int inc, long tid, + object_t& _oid, object_locator_t& _oloc, pg_t& _pgid, epoch_t _osdmap_epoch, + int _flags, uint64_t feat) + : Message(CEPH_MSG_OSD_OP, HEAD_VERSION, COMPAT_VERSION), + client_inc(inc), + osdmap_epoch(_osdmap_epoch), flags(_flags), retry_attempt(-1), + oid(_oid), oloc(_oloc), pgid(_pgid), + features(feat) { + set_tid(tid); + } +private: + ~MOSDOp() {} + +public: + void set_version(eversion_t v) { reassert_version = v; } + void set_mtime(utime_t mt) { mtime = mt; } + + // ops + void add_simple_op(int o, uint64_t off, uint64_t len) { + OSDOp osd_op; + osd_op.op.op = o; + osd_op.op.extent.offset = off; + osd_op.op.extent.length = len; + ops.push_back(osd_op); + } + void write(uint64_t off, uint64_t len, bufferlist& bl) { + add_simple_op(CEPH_OSD_OP_WRITE, off, len); + data.claim(bl); + header.data_off = off; + } + void writefull(bufferlist& bl) { + add_simple_op(CEPH_OSD_OP_WRITEFULL, 0, bl.length()); + data.claim(bl); + header.data_off = 0; + } + void zero(uint64_t off, uint64_t len) { + add_simple_op(CEPH_OSD_OP_ZERO, off, len); + } + void truncate(uint64_t off) { + add_simple_op(CEPH_OSD_OP_TRUNCATE, off, 0); + } + void remove() { + add_simple_op(CEPH_OSD_OP_DELETE, 0, 0); + } + + void read(uint64_t off, uint64_t len) { + add_simple_op(CEPH_OSD_OP_READ, off, len); + } + void stat() { + add_simple_op(CEPH_OSD_OP_STAT, 0, 0); + } + + uint64_t get_features() const { + if (features) + return features; + return get_connection()->get_features(); + } + + // flags + int get_flags() const { return flags; } + bool has_flag(__u32 flag) { return flags & flag; }; + + bool wants_ack() const { return flags & CEPH_OSD_FLAG_ACK; } + bool wants_ondisk() const { return flags & CEPH_OSD_FLAG_ONDISK; } + bool wants_onnvram() const { return flags & CEPH_OSD_FLAG_ONNVRAM; } + + void set_want_ack(bool b) { flags |= CEPH_OSD_FLAG_ACK; } + void set_want_onnvram(bool b) { flags |= CEPH_OSD_FLAG_ONNVRAM; } + void set_want_ondisk(bool b) { flags |= CEPH_OSD_FLAG_ONDISK; } + + bool is_retry_attempt() const { return flags & CEPH_OSD_FLAG_RETRY; } + void set_retry_attempt(unsigned a) { + if (a) + flags |= CEPH_OSD_FLAG_RETRY; + else + flags &= ~CEPH_OSD_FLAG_RETRY; + retry_attempt = a; + } + + /** + * get retry attempt + * + * 0 is the first attempt. + * + * @return retry attempt, or -1 if we don't know + */ + int get_retry_attempt() const { + return retry_attempt; + } + + // marshalling + virtual void encode_payload(uint64_t features) { + + OSDOp::merge_osd_op_vector_in_data(ops, data); + + if ((features & CEPH_FEATURE_OBJECTLOCATOR) == 0) { + // here is the old structure we are encoding to: // +#if 0 +struct ceph_osd_request_head { + __le32 client_inc; /* client incarnation */ + struct ceph_object_layout layout; /* pgid */ + __le32 osdmap_epoch; /* client's osdmap epoch */ + + __le32 flags; + + struct ceph_timespec mtime; /* for mutations only */ + struct ceph_eversion reassert_version; /* if we are replaying op */ + + __le32 object_len; /* length of object name */ + + __le64 snapid; /* snapid to read */ + __le64 snap_seq; /* writer's snap context */ + __le32 num_snaps; + + __le16 num_ops; + struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */ +} __attribute__ ((packed)); +#endif + header.version = 1; + + ::encode(client_inc, payload); + + __u32 su = 0; + ::encode(pgid, payload); + ::encode(su, payload); + + ::encode(osdmap_epoch, payload); + ::encode(flags, payload); + ::encode(mtime, payload); + ::encode(reassert_version, payload); + + __u32 oid_len = oid.name.length(); + ::encode(oid_len, payload); + ::encode(snapid, payload); + ::encode(snap_seq, payload); + __u32 num_snaps = snaps.size(); + ::encode(num_snaps, payload); + + //::encode(ops, payload); + __u16 num_ops = ops.size(); + ::encode(num_ops, payload); + for (unsigned i = 0; i < ops.size(); i++) + ::encode(ops[i].op, payload); + + ::encode_nohead(oid.name, payload); + ::encode_nohead(snaps, payload); + } else { + ::encode(client_inc, payload); + ::encode(osdmap_epoch, payload); + ::encode(flags, payload); + ::encode(mtime, payload); + ::encode(reassert_version, payload); + + ::encode(oloc, payload); + ::encode(pgid, payload); + ::encode(oid, payload); + + __u16 num_ops = ops.size(); + ::encode(num_ops, payload); + for (unsigned i = 0; i < ops.size(); i++) + ::encode(ops[i].op, payload); + + ::encode(snapid, payload); + ::encode(snap_seq, payload); + ::encode(snaps, payload); + + ::encode(retry_attempt, payload); + ::encode(features, payload); + } + } + + virtual void decode_payload() { + bufferlist::iterator p = payload.begin(); + + if (header.version < 2) { + // old decode + ::decode(client_inc, p); + + old_pg_t opgid; + ::decode_raw(opgid, p); + pgid = opgid; + + __u32 su; + ::decode(su, p); + oloc.pool = pgid.pool(); + + ::decode(osdmap_epoch, p); + ::decode(flags, p); + ::decode(mtime, p); + ::decode(reassert_version, p); + + __u32 oid_len; + ::decode(oid_len, p); + ::decode(snapid, p); + ::decode(snap_seq, p); + __u32 num_snaps; + ::decode(num_snaps, p); + + //::decode(ops, p); + __u16 num_ops; + ::decode(num_ops, p); + ops.resize(num_ops); + for (unsigned i = 0; i < num_ops; i++) + ::decode(ops[i].op, p); + + decode_nohead(oid_len, oid.name, p); + decode_nohead(num_snaps, snaps, p); + + // recalculate pgid hash value + pgid.set_ps(ceph_str_hash(CEPH_STR_HASH_RJENKINS, + oid.name.c_str(), + oid.name.length())); + + retry_attempt = -1; + features = 0; + } else { + // new decode + ::decode(client_inc, p); + ::decode(osdmap_epoch, p); + ::decode(flags, p); + ::decode(mtime, p); + ::decode(reassert_version, p); + + ::decode(oloc, p); + + if (header.version < 3) { + old_pg_t opgid; + ::decode_raw(opgid, p); + pgid = opgid; + } else { + ::decode(pgid, p); + } + + ::decode(oid, p); + + //::decode(ops, p); + __u16 num_ops; + ::decode(num_ops, p); + ops.resize(num_ops); + for (unsigned i = 0; i < num_ops; i++) + ::decode(ops[i].op, p); + + ::decode(snapid, p); + ::decode(snap_seq, p); + ::decode(snaps, p); + + if (header.version >= 4) + ::decode(retry_attempt, p); + else + retry_attempt = -1; + + if (header.version >= 5) + ::decode(features, p); + else + features = 0; + } + + OSDOp::split_osd_op_vector_in_data(ops, data); + } + + void clear_buffers() { + ops.clear(); + } + + const char *get_type_name() const { return "osd_op"; } + void print(ostream& out) const { + out << "osd_op(" << get_reqid(); + out << " "; + if (!oloc.nspace.empty()) + out << oloc.nspace << "/"; + out << oid; + +#if 0 + out << " "; + if (may_read()) + out << "r"; + if (may_write()) + out << "w"; +#endif + if (snapid != CEPH_NOSNAP) + out << "@" << snapid; + + if (oloc.key.size()) + out << " " << oloc; + + out << " " << ops; + out << " " << pgid; + if (is_retry_attempt()) + out << " RETRY=" << get_retry_attempt(); + if (reassert_version != eversion_t()) + out << " reassert_version=" << reassert_version; + if (get_snap_seq()) + out << " snapc " << get_snap_seq() << "=" << snaps; + out << " " << ceph_osd_flag_string(get_flags()); + out << " e" << osdmap_epoch; + out << ")"; + } +}; + + +#endif diff --git a/src/os/GenericObjectMap.cc b/src/os/GenericObjectMap.cc deleted file mode 100644 index 14f8cd476e3a..000000000000 --- a/src/os/GenericObjectMap.cc +++ /dev/null @@ -1,1127 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2013 UnitedStack - * - * Author: Haomai Wang - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "include/int_types.h" -#include "include/buffer.h" - -#include -#include -#include -#include -#include - -#include - -#include "GenericObjectMap.h" -#include "common/debug.h" -#include "common/config.h" -#include "include/assert.h" - -#define dout_subsys ceph_subsys_keyvaluestore - -const string GenericObjectMap::GLOBAL_STATE_KEY = "HEADER"; - -const string GenericObjectMap::USER_PREFIX = "_SEQ_"; -const string GenericObjectMap::INTERN_PREFIX = "_INTERN_"; -const string GenericObjectMap::COMPLETE_PREFIX = "_COMPLETE_"; -const string GenericObjectMap::GHOBJECT_TO_SEQ_PREFIX = "_GHOBJTOSEQ_"; -const string GenericObjectMap::PARENT_KEY = "_PARENT_HEADER_"; - -// In order to make right ordering for leveldb matching with hobject_t, -// so use "!" to separated -const string GenericObjectMap::GHOBJECT_KEY_SEP_S = "!"; -const char GenericObjectMap::GHOBJECT_KEY_SEP_C = '!'; -const char GenericObjectMap::GHOBJECT_KEY_ENDING = 0xFF; - -// ============== GenericObjectMap Key Function ================= - -static void append_escaped(const string &in, string *out) -{ - for (string::const_iterator i = in.begin(); i != in.end(); ++i) { - if (*i == '%') { - out->push_back('%'); - out->push_back('p'); - } else if (*i == '.') { - out->push_back('%'); - out->push_back('e'); - } else if (*i == GenericObjectMap::GHOBJECT_KEY_SEP_C) { - out->push_back('%'); - out->push_back('u'); - } else if (*i == '!') { - out->push_back('%'); - out->push_back('s'); - } else { - out->push_back(*i); - } - } -} - -static bool append_unescaped(string::const_iterator begin, - string::const_iterator end, - string *out) -{ - for (string::const_iterator i = begin; i != end; ++i) { - if (*i == '%') { - ++i; - if (*i == 'p') - out->push_back('%'); - else if (*i == 'e') - out->push_back('.'); - else if (*i == 'u') - out->push_back(GenericObjectMap::GHOBJECT_KEY_SEP_C); - else if (*i == 's') - out->push_back('!'); - else - return false; - } else { - out->push_back(*i); - } - } - return true; -} - -string GenericObjectMap::header_key(const coll_t &cid) -{ - string full_name; - - append_escaped(cid.to_str(), &full_name); - full_name.append(GHOBJECT_KEY_SEP_S); - return full_name; -} - -string GenericObjectMap::header_key(const coll_t &cid, const ghobject_t &oid) -{ - string full_name; - - append_escaped(cid.to_str(), &full_name); - full_name.append(GHOBJECT_KEY_SEP_S); - - char buf[PATH_MAX]; - char *t; - char *end; - - // make field ordering match with ghobject_t compare operations - t = buf; - end = t + sizeof(buf); - if (oid.shard_id == shard_id_t::NO_SHARD) { - // otherwise ff will sort *after* 0, not before. - full_name += "--"; - } else { - t += snprintf(t, end - t, "%02x", (int)oid.shard_id); - full_name += string(buf); - } - full_name.append(GHOBJECT_KEY_SEP_S); - - t = buf; - t += snprintf(t, end - t, "%016llx", - (long long)(oid.hobj.pool + 0x8000000000000000)); - full_name += string(buf); - full_name.append(GHOBJECT_KEY_SEP_S); - - t = buf; - snprintf(t, end - t, "%.*X", (int)(sizeof(oid.hobj.get_hash())*2), - (uint32_t)oid.hobj.get_bitwise_key_u32()); - full_name += string(buf); - full_name.append(GHOBJECT_KEY_SEP_S); - - append_escaped(oid.hobj.nspace, &full_name); - full_name.append(GHOBJECT_KEY_SEP_S); - - append_escaped(oid.hobj.get_key(), &full_name); - full_name.append(GHOBJECT_KEY_SEP_S); - - append_escaped(oid.hobj.oid.name, &full_name); - full_name.append(GHOBJECT_KEY_SEP_S); - - t = buf; - if (oid.hobj.snap == CEPH_NOSNAP) - t += snprintf(t, end - t, "head"); - else if (oid.hobj.snap == CEPH_SNAPDIR) - t += snprintf(t, end - t, "snapdir"); - else - // Keep length align - t += snprintf(t, end - t, "%016llx", (long long unsigned)oid.hobj.snap); - full_name += string(buf); - - if (oid.generation != ghobject_t::NO_GEN) { - full_name.append(GHOBJECT_KEY_SEP_S); - - t = buf; - end = t + sizeof(buf); - t += snprintf(t, end - t, "%016llx", (long long unsigned)oid.generation); - full_name += string(buf); - } - - full_name.append(1, GHOBJECT_KEY_ENDING); - - return full_name; -} - -bool GenericObjectMap::parse_header_key(const string &long_name, - coll_t *out_coll, ghobject_t *out) -{ - string coll; - string name; - string key; - string ns; - uint32_t hash; - snapid_t snap; - int64_t pool; - gen_t generation = ghobject_t::NO_GEN; - shard_id_t shard_id = shard_id_t::NO_SHARD; - - string::const_iterator current = long_name.begin(); - string::const_iterator end; - - for (end = current; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ; - if (!append_unescaped(current, end, &coll)) - return false; - - current = ++end; - for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ; - if (end == long_name.end()) - return false; - string shardstring = string(current, end); - if (shardstring == "--") - shard_id = shard_id_t::NO_SHARD; - else - shard_id = (shard_id_t)strtoul(shardstring.c_str(), NULL, 16); - - current = ++end; - for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ; - if (end == long_name.end()) - return false; - string pstring(current, end); - pool = strtoull(pstring.c_str(), NULL, 16); - pool -= 0x8000000000000000; - - current = ++end; - for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ; - if (end == long_name.end()) - return false; - string hash_str(current, end); - sscanf(hash_str.c_str(), "%X", &hash); - - current = ++end; - for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ; - if (end == long_name.end()) - return false; - if (!append_unescaped(current, end, &ns)) - return false; - - current = ++end; - for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ; - if (end == long_name.end()) - return false; - if (!append_unescaped(current, end, &key)) - return false; - - current = ++end; - for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ; - if (end == long_name.end()) - return false; - if (!append_unescaped(current, end, &name)) - return false; - - current = ++end; - for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C && - *end != GHOBJECT_KEY_ENDING; ++end) ; - if (end == long_name.end()) - return false; - string snap_str(current, end); - if (snap_str == "head") - snap = CEPH_NOSNAP; - else if (snap_str == "snapdir") - snap = CEPH_SNAPDIR; - else - snap = strtoull(snap_str.c_str(), NULL, 16); - - // Optional generation/shard_id - string genstring; - if (*end == GHOBJECT_KEY_SEP_C) { - current = ++end; - for ( ; end != long_name.end() && *end != GHOBJECT_KEY_ENDING; ++end) ; - if (end != long_name.end()) - return false; - genstring = string(current, end); - generation = (gen_t)strtoull(genstring.c_str(), NULL, 16); - } - - if (out) { - (*out) = ghobject_t(hobject_t(name, key, snap, - hobject_t::_reverse_bits(hash), - (int64_t)pool, ns), - generation, shard_id); - } - - if (out_coll) { - bool valid = out_coll->parse(coll); - assert(valid); - } - - return true; -} - - -// ============== GenericObjectMap Prefix ================= - -string GenericObjectMap::user_prefix(Header header, const string &prefix) -{ - return USER_PREFIX + seq_key(header->seq) + prefix; -} - -string GenericObjectMap::complete_prefix(Header header) -{ - return INTERN_PREFIX + seq_key(header->seq) + COMPLETE_PREFIX; -} - -string GenericObjectMap::parent_seq_prefix(uint64_t seq) -{ - return INTERN_PREFIX + seq_key(seq) + PARENT_KEY; -} - - -// ============== GenericObjectMapIteratorImpl ================= - -int GenericObjectMap::GenericObjectMapIteratorImpl::init() -{ - invalid = false; - if (ready) { - return 0; - } - - assert(!parent_iter); - if (header->parent) { - Header parent = map->lookup_parent(header); - if (!parent) { - assert(0); - return -EINVAL; - } - parent_iter.reset(new GenericObjectMapIteratorImpl(map, parent, prefix)); - } - - key_iter = map->db->get_iterator(map->user_prefix(header, prefix)); - assert(key_iter); - complete_iter = map->db->get_iterator(map->complete_prefix(header)); - assert(complete_iter); - cur_iter = key_iter; - assert(cur_iter); - ready = true; - return 0; -} - -ObjectMap::ObjectMapIterator GenericObjectMap::get_iterator( - const coll_t &cid, const ghobject_t &oid, const string &prefix) -{ - Header header = lookup_header(cid, oid); - if (!header) - return ObjectMap::ObjectMapIterator(new EmptyIteratorImpl()); - return _get_iterator(header, prefix); -} - -int GenericObjectMap::GenericObjectMapIteratorImpl::seek_to_first() -{ - init(); - r = 0; - if (parent_iter) { - r = parent_iter->seek_to_first(); - if (r < 0) - return r; - } - r = key_iter->seek_to_first(); - if (r < 0) - return r; - return adjust(); -} - -int GenericObjectMap::GenericObjectMapIteratorImpl::seek_to_last() -{ - init(); - r = 0; - if (parent_iter) { - r = parent_iter->seek_to_last(); - if (r < 0) - return r; - if (parent_iter->valid()) - r = parent_iter->next(); - if (r < 0) - return r; - } - r = key_iter->seek_to_last(); - if (r < 0) - return r; - if (key_iter->valid()) - r = key_iter->next(); - if (r < 0) - return r; - return adjust(); -} - -int GenericObjectMap::GenericObjectMapIteratorImpl::lower_bound(const string &to) -{ - init(); - r = 0; - if (parent_iter) { - r = parent_iter->lower_bound(to); - if (r < 0) - return r; - } - r = key_iter->lower_bound(to); - if (r < 0) - return r; - return adjust(); -} - -int GenericObjectMap::GenericObjectMapIteratorImpl::upper_bound(const string &after) -{ - init(); - r = 0; - if (parent_iter) { - r = parent_iter->upper_bound(after); - if (r < 0) - return r; - } - r = key_iter->upper_bound(after); - if (r < 0) - return r; - return adjust(); -} - -bool GenericObjectMap::GenericObjectMapIteratorImpl::valid() -{ - bool valid = !invalid && ready; - assert(!valid || cur_iter->valid()); - return valid; -} - -bool GenericObjectMap::GenericObjectMapIteratorImpl::valid_parent() -{ - if (parent_iter && parent_iter->valid() && - (!key_iter->valid() || key_iter->key() > parent_iter->key())) - return true; - return false; -} - -int GenericObjectMap::GenericObjectMapIteratorImpl::next(bool validate) -{ - assert(cur_iter->valid()); - assert(valid()); - cur_iter->next(); - return adjust(); -} - -int GenericObjectMap::GenericObjectMapIteratorImpl::next_parent() -{ - if (!parent_iter || !parent_iter->valid()) { - invalid = true; - return 0; - } - r = next(); - if (r < 0) - return r; - if (!valid() || on_parent() || !parent_iter->valid()) - return 0; - - return lower_bound(parent_iter->key()); -} - -int GenericObjectMap::GenericObjectMapIteratorImpl::in_complete_region( - const string &to_test, string *begin, string *end) -{ - complete_iter->upper_bound(to_test); - if (complete_iter->valid()) - complete_iter->prev(); - else - complete_iter->seek_to_last(); - - if (!complete_iter->valid()) - return false; - - string _end; - if (begin) - *begin = complete_iter->key(); - _end = string(complete_iter->value().c_str()); - if (end) - *end = _end; - return (to_test >= complete_iter->key()) && (!_end.size() || _end > to_test); -} - -/** - * Moves parent_iter to the next position both out of the complete_region and - * not equal to key_iter. Then, we set cur_iter to parent_iter if valid and - * less than key_iter and key_iter otherwise. - */ -int GenericObjectMap::GenericObjectMapIteratorImpl::adjust() -{ - string begin, end; - while (parent_iter && parent_iter->valid()) { - if (in_complete_region(parent_iter->key(), &begin, &end)) { - if (end.size() == 0) { - parent_iter->seek_to_last(); - if (parent_iter->valid()) - parent_iter->next(); - } else { - parent_iter->lower_bound(end); - } - } else if (key_iter->valid() && key_iter->key() == parent_iter->key()) { - parent_iter->next(); - } else { - break; - } - } - if (valid_parent()) { - cur_iter = parent_iter; - } else if (key_iter->valid()) { - cur_iter = key_iter; - } else { - invalid = true; - } - assert(invalid || cur_iter->valid()); - return 0; -} - -string GenericObjectMap::GenericObjectMapIteratorImpl::key() -{ - return cur_iter->key(); -} - -bufferlist GenericObjectMap::GenericObjectMapIteratorImpl::value() -{ - return cur_iter->value(); -} - -int GenericObjectMap::GenericObjectMapIteratorImpl::status() -{ - return r; -} - - -// ============== GenericObjectMap Public API ================= - -void GenericObjectMap::set_keys(const Header header, - const string &prefix, - const map &set, - KeyValueDB::Transaction t) -{ - t->set(user_prefix(header, prefix), set); -} - -int GenericObjectMap::clear(const Header header, - KeyValueDB::Transaction t) -{ - remove_header(header->cid, header->oid, header, t); - assert(header->num_children > 0); - header->num_children--; - int r = _clear(header, t); - if (r < 0) - return r; - return 0; -} - -int GenericObjectMap::rm_keys(const Header header, - const string &prefix, - const set &buffered_keys, - const set &to_clear, - KeyValueDB::Transaction t) -{ - t->rmkeys(user_prefix(header, prefix), to_clear); - if (!header->parent) { - return 0; - } - - // Copy up keys from parent around to_clear - int keep_parent; - { - GenericObjectMapIterator iter = _get_iterator(header, prefix); - iter->seek_to_first(); - map new_complete; - map to_write; - for(set::const_iterator i = to_clear.begin(); - i != to_clear.end(); ) { - unsigned copied = 0; - iter->lower_bound(*i); - ++i; - if (!iter->valid()) - break; - string begin = iter->key(); - if (!iter->on_parent()) - iter->next_parent(); - if (new_complete.size() && new_complete.rbegin()->second == begin) { - begin = new_complete.rbegin()->first; - } - while (iter->valid() && copied < 20) { - if (!to_clear.count(iter->key()) && !buffered_keys.count(iter->key())) - to_write[iter->key()].append(iter->value()); - if (i != to_clear.end() && *i <= iter->key()) { - ++i; - copied = 0; - } - - iter->next_parent(); - copied++; - } - if (iter->valid()) { - new_complete[begin] = iter->key(); - } else { - new_complete[begin] = ""; - break; - } - } - t->set(user_prefix(header, prefix), to_write); - merge_new_complete(header, new_complete, iter, t); - keep_parent = need_parent(iter); - if (keep_parent < 0) - return keep_parent; - } - - if (!keep_parent) { - Header parent = lookup_parent(header); - if (!parent) - return -EINVAL; - parent->num_children--; - _clear(parent, t); - header->parent = 0; - set_header(header->cid, header->oid, *header, t); - t->rmkeys_by_prefix(complete_prefix(header)); - } - - return 0; -} - -int GenericObjectMap::get(const coll_t &cid, const ghobject_t &oid, - const string &prefix, - map *out) -{ - Header header = lookup_header(cid, oid); - if (!header) - return -ENOENT; - - ObjectMap::ObjectMapIterator iter = _get_iterator(header, prefix); - for (iter->seek_to_first(); iter->valid(); iter->next()) { - if (iter->status()) - return iter->status(); - out->insert(make_pair(iter->key(), iter->value())); - } - - return 0; -} - -int GenericObjectMap::get_keys(const coll_t &cid, const ghobject_t &oid, - const string &prefix, - set *keys) -{ - Header header = lookup_header(cid, oid); - if (!header) - return -ENOENT; - - ObjectMap::ObjectMapIterator iter = _get_iterator(header, prefix); - for (iter->seek_to_first(); iter->valid(); iter->next()) { - if (iter->status()) - return iter->status(); - keys->insert(iter->key()); - } - return 0; -} - -int GenericObjectMap::get_values(const coll_t &cid, const ghobject_t &oid, - const string &prefix, - const set &keys, - map *out) -{ - Header header = lookup_header(cid, oid); - if (!header) - return -ENOENT; - return scan(header, prefix, keys, 0, out); -} - -int GenericObjectMap::check_keys(const coll_t &cid, const ghobject_t &oid, - const string &prefix, - const set &keys, - set *out) -{ - Header header = lookup_header(cid, oid); - if (!header) - return -ENOENT; - return scan(header, prefix, keys, out, 0); -} - -void GenericObjectMap::clone(const Header parent, const coll_t &cid, - const ghobject_t &target, - KeyValueDB::Transaction t, - Header *old_header, Header *new_header) -{ - { - Header destination = lookup_header(cid, target); - if (destination) { - remove_header(cid, target, destination, t); - destination->num_children--; - _clear(destination, t); - } - } - - Header source = generate_new_header(parent->cid, parent->oid, parent, t); - Header destination = generate_new_header(cid, target, parent, t); - - destination->data = parent->data; - source->data = parent->data; - - parent->num_children = 2; - set_parent_header(parent, t); - set_header(parent->cid, parent->oid, *source, t); - set_header(cid, target, *destination, t); - - if (new_header) - *old_header = source; - if (new_header) - *new_header = destination; - - // Clone will set parent header and rm_keys wll lookup_parent which will try - // to find parent header. So it will let lookup_parent fail when "clone" and - // "rm_keys" in one transaction. Here have to sync transaction to make - // visiable for lookup_parent - // FIXME: Clear transaction operations here - int r = submit_transaction_sync(t); - assert(r == 0); -} - -void GenericObjectMap::rename(const Header old_header, const coll_t &cid, - const ghobject_t &target, - KeyValueDB::Transaction t) -{ - if (old_header->oid == target && old_header->cid == cid) - return ; - - remove_header(old_header->cid, old_header->oid, old_header, t); - old_header->cid = cid; - old_header->oid = target; - set_header(cid, target, *old_header, t); -} - -int GenericObjectMap::init(bool do_upgrade) -{ - map result; - set to_get; - to_get.insert(GLOBAL_STATE_KEY); - int r = db->get(INTERN_PREFIX, to_get, &result); - if (r < 0) - return r; - if (!result.empty()) { - bufferlist::iterator bliter = result.begin()->second.begin(); - state.decode(bliter); - if (state.v < 1) { // Needs upgrade - if (!do_upgrade) { - dout(1) << "GenericObjbectMap requires an upgrade," - << " set filestore_update_to" - << dendl; - return -ENOTSUP; - } else { - r = upgrade(); - if (r < 0) - return r; - } - } - } else { - // New store - state.v = 1; - state.seq = 1; - } - dout(20) << "(init)genericobjectmap: seq is " << state.seq << dendl; - return 0; -} - -bool GenericObjectMap::check(std::ostream &out) -{ - bool retval = true; - map parent_to_num_children; - map parent_to_actual_num_children; - KeyValueDB::Iterator iter = db->get_iterator(GHOBJECT_TO_SEQ_PREFIX); - - for (iter->seek_to_first(); iter->valid(); iter->next()) { - _Header header; - assert(header.num_children == 1); - header.num_children = 0; // Hack for leaf node - bufferlist bl = iter->value(); - while (true) { - bufferlist::iterator bliter = bl.begin(); - header.decode(bliter); - if (header.seq != 0) - parent_to_actual_num_children[header.seq] = header.num_children; - if (header.parent == 0) - break; - - if (!parent_to_num_children.count(header.parent)) - parent_to_num_children[header.parent] = 0; - parent_to_num_children[header.parent]++; - if (parent_to_actual_num_children.count(header.parent)) - break; - - set to_get; - map got; - to_get.insert(PARENT_KEY); - db->get(parent_seq_prefix(header.parent), to_get, &got); - if (got.empty()) { - out << "Missing: seq " << header.parent << std::endl; - retval = false; - break; - } else { - bl = got.begin()->second; - } - } - } - - for (map::iterator i = parent_to_num_children.begin(); - i != parent_to_num_children.end(); - parent_to_num_children.erase(i++)) { - if (!parent_to_actual_num_children.count(i->first)) - continue; - if (parent_to_actual_num_children[i->first] != i->second) { - out << "Invalid: seq " << i->first << " recorded children: " - << parent_to_actual_num_children[i->first] << " found: " - << i->second << std::endl; - retval = false; - } - parent_to_actual_num_children.erase(i->first); - } - return retval; -} - - -// ============== GenericObjectMap Intern Implementation ================= - -int GenericObjectMap::scan(Header header, - const string &prefix, - const set &in_keys, - set *out_keys, - map *out_values) -{ - ObjectMap::ObjectMapIterator db_iter = _get_iterator(header, prefix); - for (set::const_iterator key_iter = in_keys.begin(); - key_iter != in_keys.end(); - ++key_iter) { - db_iter->lower_bound(*key_iter); - if (db_iter->status()) - return db_iter->status(); - - if (db_iter->valid() && db_iter->key() == *key_iter) { - if (out_keys) - out_keys->insert(*key_iter); - if (out_values) - out_values->insert(make_pair(db_iter->key(), db_iter->value())); - } - } - return 0; -} - -int GenericObjectMap::_clear(Header header, KeyValueDB::Transaction t) -{ - while (1) { - if (header->num_children) { - set_parent_header(header, t); - break; - } - - clear_header(header, t); - if (!header->parent) - break; - - Header parent = lookup_parent(header); - if (!parent) { - return -EINVAL; - } - assert(parent->num_children > 0); - parent->num_children--; - header.swap(parent); - } - return 0; -} - -int GenericObjectMap::merge_new_complete( - Header header, const map &new_complete, - GenericObjectMapIterator iter, KeyValueDB::Transaction t) -{ - KeyValueDB::Iterator complete_iter = db->get_iterator( - complete_prefix(header)); - map::const_iterator i = new_complete.begin(); - set to_remove; - map to_add; - - string begin, end; - while (i != new_complete.end()) { - string new_begin = i->first; - string new_end = i->second; - int r = iter->in_complete_region(new_begin, &begin, &end); - if (r < 0) - return r; - if (r) { - to_remove.insert(begin); - new_begin = begin; - } - ++i; - while (i != new_complete.end()) { - if (!new_end.size() || i->first <= new_end) { - if (!new_end.size() && i->second > new_end) { - new_end = i->second; - } - ++i; - continue; - } - - r = iter->in_complete_region(new_end, &begin, &end); - if (r < 0) - return r; - if (r) { - to_remove.insert(begin); - new_end = end; - continue; - } - break; - } - bufferlist bl; - bl.append(bufferptr(new_end.c_str(), new_end.size() + 1)); - to_add.insert(make_pair(new_begin, bl)); - } - t->rmkeys(complete_prefix(header), to_remove); - t->set(complete_prefix(header), to_add); - return 0; -} - -int GenericObjectMap::need_parent(GenericObjectMapIterator iter) -{ - int r = iter->seek_to_first(); - if (r < 0) - return r; - - if (!iter->valid()) - return 0; - - string begin, end; - if (iter->in_complete_region(iter->key(), &begin, &end) && end == "") { - return 0; - } - return 1; -} - -int GenericObjectMap::write_state(KeyValueDB::Transaction t) -{ - dout(20) << __func__ << " seq is " << state.seq << dendl; - bufferlist bl; - state.encode(bl); - map to_write; - to_write[GLOBAL_STATE_KEY] = bl; - t->set(INTERN_PREFIX, to_write); - return 0; -} - -// NOTE(haomai): It may occur dead lock if thread A hold header A try to header -// B and thread hold header B try to get header A -GenericObjectMap::Header GenericObjectMap::_lookup_header( - const coll_t &cid, const ghobject_t &oid) -{ - set to_get; - to_get.insert(header_key(cid, oid)); - _Header header; - - map out; - - int r = db->get(GHOBJECT_TO_SEQ_PREFIX, to_get, &out); - if (r < 0) - return Header(); - if (out.empty()) - return Header(); - - bufferlist::iterator iter = out.begin()->second.begin(); - header.decode(iter); - - Header ret = Header(new _Header(header)); - return ret; -} - -GenericObjectMap::Header GenericObjectMap::_generate_new_header( - const coll_t &cid, const ghobject_t &oid, Header parent, - KeyValueDB::Transaction t) -{ - Header header = Header(new _Header()); - header->seq = state.seq++; - if (parent) { - header->parent = parent->seq; - } - header->num_children = 1; - header->oid = oid; - header->cid = cid; - - write_state(t); - return header; -} - -GenericObjectMap::Header GenericObjectMap::lookup_parent(Header input) -{ - Mutex::Locker l(header_lock); - map out; - set keys; - keys.insert(PARENT_KEY); - - dout(20) << "lookup_parent: parent " << input->parent - << " for seq " << input->seq << dendl; - - int r = db->get(parent_seq_prefix(input->parent), keys, &out); - if (r < 0) { - assert(0); - return Header(); - } - if (out.empty()) { - assert(0); - return Header(); - } - - Header header = Header(new _Header()); - header->seq = input->parent; - bufferlist::iterator iter = out.begin()->second.begin(); - header->decode(iter); - dout(20) << "lookup_parent: parent seq is " << header->seq << " with parent " - << header->parent << dendl; - return header; -} - -GenericObjectMap::Header GenericObjectMap::lookup_create_header( - const coll_t &cid, const ghobject_t &oid, KeyValueDB::Transaction t) -{ - Mutex::Locker l(header_lock); - Header header = _lookup_header(cid, oid); - if (!header) { - header = _generate_new_header(cid, oid, Header(), t); - set_header(cid, oid, *header, t); - } - return header; -} - -void GenericObjectMap::set_parent_header(Header header, KeyValueDB::Transaction t) -{ - dout(20) << __func__ << " setting seq " << header->seq << dendl; - map to_write; - header->encode(to_write[PARENT_KEY]); - t->set(parent_seq_prefix(header->seq), to_write); -} - -void GenericObjectMap::clear_header(Header header, KeyValueDB::Transaction t) -{ - dout(20) << __func__ << " clearing seq " << header->seq << dendl; - t->rmkeys_by_prefix(user_prefix(header, string())); - t->rmkeys_by_prefix(complete_prefix(header)); - set keys; - keys.insert(PARENT_KEY); - t->rmkeys(parent_seq_prefix(header->seq), keys); -} - -// only remove GHOBJECT_TO_SEQ -void GenericObjectMap::remove_header(const coll_t &cid, - const ghobject_t &oid, Header header, - KeyValueDB::Transaction t) -{ - dout(20) << __func__ << " removing " << header->seq - << " cid " << cid << " oid " << oid << dendl; - set to_remove; - to_remove.insert(header_key(cid, oid)); - t->rmkeys(GHOBJECT_TO_SEQ_PREFIX, to_remove); -} - -void GenericObjectMap::set_header(const coll_t &cid, const ghobject_t &oid, - _Header &header, KeyValueDB::Transaction t) -{ - dout(20) << __func__ << " setting " << header.seq - << " cid " << cid << " oid " << oid << " parent seq " - << header.parent << dendl; - map to_set; - header.encode(to_set[header_key(cid, oid)]); - t->set(GHOBJECT_TO_SEQ_PREFIX, to_set); -} - -int GenericObjectMap::list_objects(const coll_t &cid, ghobject_t start, ghobject_t end, int max, - vector *out, ghobject_t *next) -{ - // FIXME - Mutex::Locker l(header_lock); - if (start.is_max()) - return 0; - - if (start.is_min()) { - vector oids; - - KeyValueDB::Iterator iter = db->get_iterator(GHOBJECT_TO_SEQ_PREFIX); - for (iter->lower_bound(header_key(cid)); iter->valid(); iter->next()) { - bufferlist bl = iter->value(); - bufferlist::iterator bliter = bl.begin(); - _Header header; - header.decode(bliter); - - if (header.cid == cid) - oids.push_back(header.oid); - - break; - } - - if (oids.empty()) { - if (next) - *next = ghobject_t::get_max(); - return 0; - } - start = oids[0]; - } - - int size = 0; - KeyValueDB::Iterator iter = db->get_iterator(GHOBJECT_TO_SEQ_PREFIX); - for (iter->lower_bound(header_key(cid, start)); iter->valid(); iter->next()) { - bufferlist bl = iter->value(); - bufferlist::iterator bliter = bl.begin(); - _Header header; - header.decode(bliter); - - if (header.cid != cid) { - if (next) - *next = ghobject_t::get_max(); - break; - } - - if (max && size >= max) { - if (next) - *next = header.oid; - break; - } - - if (cmp_bitwise(header.oid, end) >= 0) { - if (next) - *next = ghobject_t::get_max(); - break; - } - - assert(cmp_bitwise(start, header.oid) <= 0); - assert(cmp_bitwise(header.oid, end) < 0); - - - size++; - if (out) - out->push_back(header.oid); - start = header.oid; - } - - if (out->size()) - dout(20) << "objects: " << *out << dendl; - - if (!iter->valid()) - if (next) - *next = ghobject_t::get_max(); - - return 0; -} diff --git a/src/os/GenericObjectMap.h b/src/os/GenericObjectMap.h deleted file mode 100644 index 62c376c338d4..000000000000 --- a/src/os/GenericObjectMap.h +++ /dev/null @@ -1,429 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2013 UnitedStack - * - * Author: Haomai Wang - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef CEPH_GENERICOBJECTMAP_H -#define CEPH_GENERICOBJECTMAP_H - -#include "include/buffer.h" -#include -#include -#include -#include -#include - -#include "include/memory.h" -#include "ObjectMap.h" -#include "kv/KeyValueDB.h" -#include "osd/osd_types.h" -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/simple_cache.hpp" - - -/** - * Genericobjectmap: Provide with key/value associated to ghobject_t APIs to caller - * and avoid concerning too much. Wrap and combine KeyValueDB/ObjectMap APIs - * with ghobject_t and adding clone capacity. - * - * Prefix space structure: - * - * - GHOBJECT_TO_SEQ: Contains leaf mapping from ghobject_t->Header(including - * hobj.seq and related metadata) - * - INTERN_PREFIX: GLOBAL_STATE_KEY - contains the global state - * @see State - * @see write_state - * @see init - * @see generate_new_header - * - INTERN_PREFIX + header_key(header->seq) + COMPLETE_PREFIX: see below - * - INTERN_PREFIX + header_key(header->seq) + PARENT_KEY - * : used to store parent header(same as headers in GHOBJECT_TO_SEQ) - * - USER_PREFIX + header_key(header->seq) + [CUSTOM_PREFIX] - * : key->value which set by callers - * - * For each node (represented by a header), we - * store three mappings: the key mapping, the complete mapping, and the parent. - * The complete mapping (COMPLETE_PREFIX space) is key->key. Each x->y entry in - * this mapping indicates that the key mapping contains all entries on [x,y). - * Note, max string is represented by "", so ""->"" indicates that the parent - * is unnecessary (@see rm_keys). When looking up a key not contained in the - * the complete set, we have to check the parent if we don't find it in the - * key set. During rm_keys, we copy keys from the parent and update the - * complete set to reflect the change @see rm_keys. - */ - -// This class only provide basic read capacity, suggest inherit it to -// implement write transaction to use it. @see StripObjectMap -class GenericObjectMap { - public: - boost::scoped_ptr db; - - /** - * Serializes access to next_seq as well as the in_use set - */ - Mutex header_lock; - - GenericObjectMap(KeyValueDB *db) : db(db), header_lock("GenericObjectMap") {} - - int get( - const coll_t &cid, - const ghobject_t &oid, - const string &prefix, - map *out - ); - - int get_keys( - const coll_t &cid, - const ghobject_t &oid, - const string &prefix, - set *keys - ); - - int get_values( - const coll_t &cid, - const ghobject_t &oid, - const string &prefix, - const set &keys, - map *out - ); - - int check_keys( - const coll_t &cid, - const ghobject_t &oid, - const string &prefix, - const set &keys, - set *out - ); - - /// Read initial state from backing store - int init(bool upgrade = false); - - /// Upgrade store to current version - int upgrade() {return 0;} - - /// Consistency check, debug, there must be no parallel writes - bool check(std::ostream &out); - - /// Util, list all objects, there must be no other concurrent access - int list_objects(const coll_t &cid, ghobject_t start, ghobject_t end, int max, - vector *objs, ///< [out] objects - ghobject_t *next); - - ObjectMap::ObjectMapIterator get_iterator(const coll_t &cid, - const ghobject_t &oid, - const string &prefix); - - KeyValueDB::Transaction get_transaction() { return db->get_transaction(); } - int submit_transaction(KeyValueDB::Transaction t) { - return db->submit_transaction(t); - } - int submit_transaction_sync(KeyValueDB::Transaction t) { - return db->submit_transaction_sync(t); - } - - /// persistent state for store @see generate_header - struct State { - __u8 v; - uint64_t seq; - State() : v(0), seq(1) {} - State(uint64_t seq) : v(0), seq(seq) {} - - void encode(bufferlist &bl) const { - ENCODE_START(1, 1, bl); - ::encode(v, bl); - ::encode(seq, bl); - ENCODE_FINISH(bl); - } - - void decode(bufferlist::iterator &bl) { - DECODE_START(1, bl); - ::decode(v, bl); - ::decode(seq, bl); - DECODE_FINISH(bl); - } - - void dump(Formatter *f) const { - f->dump_unsigned("seq", seq); - } - - static void generate_test_instances(list &o) { - o.push_back(new State(0)); - o.push_back(new State(20)); - } - } state; - - struct _Header { - uint64_t seq; - uint64_t parent; - uint64_t num_children; - - coll_t cid; - ghobject_t oid; - - // Used by successor - bufferlist data; - - void encode(bufferlist &bl) const { - ENCODE_START(1, 1, bl); - ::encode(seq, bl); - ::encode(parent, bl); - ::encode(num_children, bl); - ::encode(cid, bl); - ::encode(oid, bl); - ::encode(data, bl); - ENCODE_FINISH(bl); - } - - void decode(bufferlist::iterator &bl) { - DECODE_START(1, bl); - ::decode(seq, bl); - ::decode(parent, bl); - ::decode(num_children, bl); - ::decode(cid, bl); - ::decode(oid, bl); - ::decode(data, bl); - DECODE_FINISH(bl); - } - - void dump(Formatter *f) const { - f->dump_unsigned("seq", seq); - f->dump_unsigned("parent", parent); - f->dump_unsigned("num_children", num_children); - f->dump_stream("coll") << cid; - f->dump_stream("oid") << oid; - } - - _Header() : seq(0), parent(0), num_children(1) {} - }; - - typedef ceph::shared_ptr<_Header> Header; - - Header lookup_header(const coll_t &cid, const ghobject_t &oid) { - Mutex::Locker l(header_lock); - return _lookup_header(cid, oid); - } - - /// Lookup or create header for c oid - Header lookup_create_header(const coll_t &cid, const ghobject_t &oid, - KeyValueDB::Transaction t); - - /// Set leaf node for c and oid to the value of header - void set_header(const coll_t &cid, const ghobject_t &oid, _Header &header, - KeyValueDB::Transaction t); - - // Move all modify member function to "protect", in order to indicate these - // should be made use of by sub-class - void set_keys( - const Header header, - const string &prefix, - const map &set, - KeyValueDB::Transaction t - ); - - int clear( - const Header header, - KeyValueDB::Transaction t - ); - - int rm_keys( - const Header header, - const string &prefix, - const set &buffered_keys, - const set &to_clear, - KeyValueDB::Transaction t - ); - - void clone( - const Header origin_header, - const coll_t &cid, - const ghobject_t &target, - KeyValueDB::Transaction t, - Header *old_header, - Header *new_header - ); - - void rename( - const Header header, - const coll_t &cid, - const ghobject_t &target, - KeyValueDB::Transaction t - ); - - static const string GLOBAL_STATE_KEY; - static const string PARENT_KEY; - - static const string USER_PREFIX; - static const string INTERN_PREFIX; - static const string PARENT_PREFIX; - static const string COMPLETE_PREFIX; - static const string GHOBJECT_TO_SEQ_PREFIX; - - static const string GHOBJECT_KEY_SEP_S; - static const char GHOBJECT_KEY_SEP_C; - static const char GHOBJECT_KEY_ENDING; - -private: - /// Implicit lock on Header->seq - - static string header_key(const coll_t &cid); - static string header_key(const coll_t &cid, const ghobject_t &oid); - static bool parse_header_key(const string &in, coll_t *c, ghobject_t *oid); - - string seq_key(uint64_t seq) { - char buf[100]; - snprintf(buf, sizeof(buf), "%.*" PRId64, (int)(2*sizeof(seq)), seq); - return string(buf); - } - - string user_prefix(Header header, const string &prefix); - string complete_prefix(Header header); - string parent_seq_prefix(uint64_t seq); - - class EmptyIteratorImpl : public ObjectMap::ObjectMapIteratorImpl { - public: - int seek_to_first() { return 0; } - int seek_to_last() { return 0; } - int upper_bound(const string &after) { return 0; } - int lower_bound(const string &to) { return 0; } - bool valid() { return false; } - int next(bool validate=true) { assert(0); return 0; } - string key() { assert(0); return ""; } - bufferlist value() { assert(0); return bufferlist(); } - int status() { return 0; } - }; - - - /// Iterator - class GenericObjectMapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl { - public: - GenericObjectMap *map; - - /// NOTE: implicit lock on header->seq AND for all ancestors - Header header; - - /// parent_iter == NULL iff no parent - ceph::shared_ptr parent_iter; - KeyValueDB::Iterator key_iter; - KeyValueDB::Iterator complete_iter; - - /// cur_iter points to currently valid iterator - ceph::shared_ptr cur_iter; - int r; - - /// init() called, key_iter, complete_iter, parent_iter filled in - bool ready; - /// past end - bool invalid; - - string prefix; - - GenericObjectMapIteratorImpl(GenericObjectMap *map, Header header, - const string &_prefix) : map(map), header(header), r(0), ready(false), - invalid(true), prefix(_prefix) { } - int seek_to_first(); - int seek_to_last(); - int upper_bound(const string &after); - int lower_bound(const string &to); - bool valid(); - int next(bool validate=true); - string key(); - bufferlist value(); - int status(); - - bool on_parent() { - return cur_iter == parent_iter; - } - - /// skips to next valid parent entry - int next_parent(); - - /// Tests whether to_test is in complete region - int in_complete_region(const string &to_test, ///[in] key to test - string *begin, ///[out] beginning of region - string *end ///[out] end of region - ); ///< @returns true if to_test is in the complete region, else false - - private: - int init(); - bool valid_parent(); - int adjust(); - }; - -protected: - typedef ceph::shared_ptr GenericObjectMapIterator; - GenericObjectMapIterator _get_iterator(Header header, string prefix) { - return GenericObjectMapIterator(new GenericObjectMapIteratorImpl(this, header, prefix)); - } - - Header generate_new_header(const coll_t &cid, const ghobject_t &oid, - Header parent, KeyValueDB::Transaction t) { - Mutex::Locker l(header_lock); - return _generate_new_header(cid, oid, parent, t); - } - - // Scan keys in header into out_keys and out_values (if nonnull) - int scan(Header header, const string &prefix, const set &in_keys, - set *out_keys, map *out_values); - - private: - - /// Removes node corresponding to header - void clear_header(Header header, KeyValueDB::Transaction t); - - /// Set node containing input to new contents - void set_parent_header(Header input, KeyValueDB::Transaction t); - - /// Remove leaf node corresponding to oid in c - void remove_header(const coll_t &cid, const ghobject_t &oid, Header header, - KeyValueDB::Transaction t); - - /** - * Generate new header for c oid with new seq number - * - * Has the side effect of syncronously saving the new GenericObjectMap state - */ - Header _generate_new_header(const coll_t &cid, const ghobject_t &oid, - Header parent, KeyValueDB::Transaction t); - - // Lookup leaf header for c oid - Header _lookup_header(const coll_t &cid, const ghobject_t &oid); - - // Lookup header node for input - Header lookup_parent(Header input); - - // Remove header and all related prefixes - int _clear(Header header, KeyValueDB::Transaction t); - - // Adds to t operations necessary to add new_complete to the complete set - int merge_new_complete(Header header, const map &new_complete, - GenericObjectMapIterator iter, KeyValueDB::Transaction t); - - // Writes out State (mainly next_seq) - int write_state(KeyValueDB::Transaction _t); - - // 0 if the complete set now contains all of key space, < 0 on error, 1 else - int need_parent(GenericObjectMapIterator iter); - - // Copies header entry from parent @see rm_keys - int copy_up_header(Header header, KeyValueDB::Transaction t); - - // Sets header @see set_header - void _set_header(Header header, const bufferlist &bl, - KeyValueDB::Transaction t); -}; -WRITE_CLASS_ENCODER(GenericObjectMap::_Header) -WRITE_CLASS_ENCODER(GenericObjectMap::State) - -#endif diff --git a/src/os/KeyValueStore.cc b/src/os/KeyValueStore.cc deleted file mode 100644 index 9ed4cc8c3bb9..000000000000 --- a/src/os/KeyValueStore.cc +++ /dev/null @@ -1,3017 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2013 UnitedStack - * - * Author: Haomai Wang - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "include/int_types.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "include/compat.h" - -#include -#include - -#include "KeyValueStore.h" -#include "common/BackTrace.h" -#include "include/types.h" - -#include "osd/osd_types.h" -#include "include/color.h" -#include "include/buffer.h" - -#include "common/debug.h" -#include "common/errno.h" -#include "common/run_cmd.h" -#include "common/safe_io.h" -#include "common/perf_counters.h" -#include "common/sync_filesystem.h" - -#include "common/ceph_crypto.h" -using ceph::crypto::SHA1; - -#include "include/assert.h" - -#include "common/config.h" - -#define dout_subsys ceph_subsys_keyvaluestore - -const string KeyValueStore::OBJECT_STRIP_PREFIX = "_STRIP_"; -const string KeyValueStore::OBJECT_XATTR = "__OBJATTR__"; -const string KeyValueStore::OBJECT_OMAP = "__OBJOMAP__"; -const string KeyValueStore::OBJECT_OMAP_HEADER = "__OBJOMAP_HEADER__"; -const string KeyValueStore::OBJECT_OMAP_HEADER_KEY = "__OBJOMAP_HEADER__KEY_"; -const string KeyValueStore::COLLECTION = "__COLLECTION__"; -const string KeyValueStore::COLLECTION_ATTR = "__COLL_ATTR__"; - - -//Initial features in new superblock. -static CompatSet get_kv_initial_compat_set() { - CompatSet::FeatureSet ceph_osd_feature_compat; - CompatSet::FeatureSet ceph_osd_feature_ro_compat; - CompatSet::FeatureSet ceph_osd_feature_incompat; - return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat, - ceph_osd_feature_incompat); -} - -//Features are added here that this KeyValueStore supports. -static CompatSet get_kv_supported_compat_set() { - CompatSet compat = get_kv_initial_compat_set(); - //Any features here can be set in code, but not in initial superblock - return compat; -} - - -// ============== StripObjectMap Implementation ================= - -int StripObjectMap::save_strip_header(StripObjectHeaderRef strip_header, - KeyValueDB::Transaction t) -{ - if (strip_header->updated) { - strip_header->header->data.clear(); - ::encode(*strip_header, strip_header->header->data); - - set_header(strip_header->cid, strip_header->oid, *(strip_header->header), t); - strip_header->updated = false; - } - return 0; -} - -int StripObjectMap::create_strip_header(const coll_t &cid, - const ghobject_t &oid, - StripObjectHeaderRef *strip_header, - KeyValueDB::Transaction t) -{ - Header header = generate_new_header(cid, oid, Header(), t); - if (!header) - return -EINVAL; - - StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader()); - tmp->oid = oid; - tmp->cid = cid; - tmp->header = header; - tmp->updated = true; - if (strip_header) - *strip_header = tmp; - - return 0; -} - -int StripObjectMap::lookup_strip_header(const coll_t &cid, - const ghobject_t &oid, - StripObjectHeaderRef *strip_header) -{ - { - Mutex::Locker l(lock); - pair p; - if (caches.lookup(oid, &p)) { - if (p.first == cid) { - *strip_header = p.second; - return 0; - } - } - } - Header header = lookup_header(cid, oid); - - if (!header) { - dout(20) << "lookup_strip_header failed to get strip_header " - << " cid " << cid <<" oid " << oid << dendl; - return -ENOENT; - } - - - StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader()); - if (header->data.length()) { - bufferlist::iterator bliter = header->data.begin(); - ::decode(*tmp, bliter); - } - - if (tmp->strip_size == 0) { - tmp->strip_size = default_strip_size; - tmp->updated = true; - } - - tmp->oid = oid; - tmp->cid = cid; - tmp->header = header; - - { - Mutex::Locker l(lock); - caches.add(oid, make_pair(cid, tmp)); - } - *strip_header = tmp; - dout(10) << "lookup_strip_header done " << " cid " << cid << " oid " - << oid << dendl; - return 0; -} - -int StripObjectMap::file_to_extents(uint64_t offset, size_t len, - uint64_t strip_size, - vector &extents) -{ - if (len == 0) - return 0; - - uint64_t start, end, strip_offset; - start = offset / strip_size; - end = (offset + len) / strip_size; - strip_offset = start * strip_size; - - // "offset" may in the middle of first strip object - if (offset > strip_offset) { - uint64_t extent_offset, extent_len; - extent_offset = offset - strip_offset; - if (extent_offset + len <= strip_size) - extent_len = len; - else - extent_len = strip_size - extent_offset; - extents.push_back(StripExtent(start, extent_offset, extent_len)); - start++; - strip_offset += strip_size; - } - - for (; start < end; ++start) { - extents.push_back(StripExtent(start, 0, strip_size)); - strip_offset += strip_size; - } - - // The end of strip object may be partial - if (offset + len > strip_offset) - extents.push_back(StripExtent(start, 0, offset+len-strip_offset)); - - assert(extents.size()); - dout(10) << "file_to_extents done " << dendl; - return 0; -} - -void StripObjectMap::clone_wrap(StripObjectHeaderRef old_header, - const coll_t &cid, const ghobject_t &oid, - KeyValueDB::Transaction t, - StripObjectHeaderRef *target_header) -{ - Header new_origin_header; - StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader()); - - clone(old_header->header, cid, oid, t, &new_origin_header, - &tmp->header); - - tmp->oid = oid; - tmp->cid = cid; - tmp->strip_size = old_header->strip_size; - tmp->max_size = old_header->max_size; - tmp->bits = old_header->bits; - tmp->updated = true; - old_header->header = new_origin_header; - old_header->updated = true; - - if (target_header) - *target_header = tmp; -} - -void StripObjectMap::rename_wrap(StripObjectHeaderRef old_header, const coll_t &cid, const ghobject_t &oid, - KeyValueDB::Transaction t, - StripObjectHeaderRef *new_header) -{ - rename(old_header->header, cid, oid, t); - - StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader()); - tmp->strip_size = old_header->strip_size; - tmp->max_size = old_header->max_size; - tmp->bits = old_header->bits; - tmp->header = old_header->header; - tmp->oid = oid; - tmp->cid = cid; - tmp->updated = true; - - if (new_header) - *new_header = tmp; - - old_header->header = Header(); - old_header->deleted = true; -} - -int StripObjectMap::get_values_with_header(const StripObjectHeaderRef header, - const string &prefix, - const set &keys, - map *out) -{ - return scan(header->header, prefix, keys, 0, out); -} - -int StripObjectMap::get_keys_with_header(const StripObjectHeaderRef header, - const string &prefix, - set *keys) -{ - ObjectMap::ObjectMapIterator iter = _get_iterator(header->header, prefix); - for (iter->seek_to_first(); iter->valid(); iter->next()) { - assert(!iter->status()); - keys->insert(iter->key()); - } - return 0; -} - -int StripObjectMap::get_with_header(const StripObjectHeaderRef header, - const string &prefix, map *out) -{ - ObjectMap::ObjectMapIterator iter = _get_iterator(header->header, prefix); - for (iter->seek_to_first(); iter->valid(); iter->next()) { - assert(!iter->status()); - out->insert(make_pair(iter->key(), iter->value())); - } - - return 0; -} - -// ========= KeyValueStore::BufferTransaction Implementation ============ - -int KeyValueStore::BufferTransaction::lookup_cached_header( - const coll_t &cid, const ghobject_t &oid, - StripObjectMap::StripObjectHeaderRef *strip_header, - bool create_if_missing) -{ - uniq_id uid = make_pair(cid, oid); - StripObjectMap::StripObjectHeaderRef header; - int r = 0; - - StripHeaderMap::iterator it = strip_headers.find(uid); - if (it != strip_headers.end()) { - - if (!it->second->deleted) { - if (strip_header) - *strip_header = it->second; - return 0; - } else if (!create_if_missing) { - return -ENOENT; - } - - // If (it->second.deleted && create_if_missing) go down - r = -ENOENT; - } else { - r = store->backend->lookup_strip_header(cid, oid, &header); - } - - if (r == -ENOENT && create_if_missing) { - r = store->backend->create_strip_header(cid, oid, &header, t); - } - - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oid << " " - << " r = " << r << dendl; - return r; - } - - strip_headers[uid] = header; - if (strip_header) - *strip_header = header; - return r; -} - -int KeyValueStore::BufferTransaction::get_buffer_keys( - StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix, - const set &keys, map *out) -{ - set need_lookup; - - uniq_id uid = make_pair(strip_header->cid, strip_header->oid); - for (set::iterator it = keys.begin(); it != keys.end(); ++it) { - map< uniq_id, map, bufferlist> >::iterator obj_it = buffers.find(uid); - if ( obj_it != buffers.end() ) { - map, bufferlist>::iterator i = - obj_it->second.find(make_pair(prefix, *it)); - if (i != obj_it->second.end()) { - (*out)[*it].swap(i->second); - } else { - need_lookup.insert(*it); - } - }else { - need_lookup.insert(*it); - } - } - - if (!need_lookup.empty()) { - int r = store->backend->get_values_with_header(strip_header, prefix, - need_lookup, out); - if (r < 0) { - dout(10) << __func__ << " " << strip_header->cid << "/" - << strip_header->oid << " " << " r = " << r << dendl; - return r; - } - } - - return 0; -} - -void KeyValueStore::BufferTransaction::set_buffer_keys( - StripObjectMap::StripObjectHeaderRef strip_header, - const string &prefix, map &values) -{ - store->backend->set_keys(strip_header->header, prefix, values, t); - - uniq_id uid = make_pair(strip_header->cid, strip_header->oid); - map, bufferlist> &uid_buffers = buffers[uid]; - for (map::iterator iter = values.begin(); - iter != values.end(); ++iter) { - uid_buffers[make_pair(prefix, iter->first)].swap(iter->second); - } -} - -int KeyValueStore::BufferTransaction::remove_buffer_keys( - StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix, - const set &keys) -{ - uniq_id uid = make_pair(strip_header->cid, strip_header->oid); - map< uniq_id, map, bufferlist> >::iterator obj_it = buffers.find(uid); - set buffered_keys; - if ( obj_it != buffers.end() ) { - // TODO: Avoid use empty bufferlist to indicate the key is removed - for (set::iterator iter = keys.begin(); iter != keys.end(); ++iter) { - obj_it->second[make_pair(prefix, *iter)] = bufferlist(); - } - // TODO: Avoid collect all buffered keys when remove keys - if (strip_header->header->parent) { - for (map, bufferlist>::iterator iter = obj_it->second.begin(); - iter != obj_it->second.end(); ++iter) { - buffered_keys.insert(iter->first.second); - } - } - } - - return store->backend->rm_keys(strip_header->header, prefix, buffered_keys, keys, t); -} - -void KeyValueStore::BufferTransaction::clear_buffer_keys( - StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix) -{ - uniq_id uid = make_pair(strip_header->cid, strip_header->oid); - map< uniq_id, map, bufferlist> >::iterator obj_it = buffers.find(uid); - if ( obj_it != buffers.end() ) { - for (map, bufferlist>::iterator iter = obj_it->second.begin(); - iter != obj_it->second.end(); ++iter) { - if (iter->first.first == prefix) - iter->second = bufferlist(); - } - } -} - -int KeyValueStore::BufferTransaction::clear_buffer( - StripObjectMap::StripObjectHeaderRef strip_header) -{ - strip_header->deleted = true; - - InvalidateCacheContext *c = new InvalidateCacheContext(store, strip_header->cid, strip_header->oid); - finishes.push_back(c); - return store->backend->clear(strip_header->header, t); -} - -void KeyValueStore::BufferTransaction::clone_buffer( - StripObjectMap::StripObjectHeaderRef old_header, - const coll_t &cid, const ghobject_t &oid) -{ - // Remove target ahead to avoid dead lock - strip_headers.erase(make_pair(cid, oid)); - - StripObjectMap::StripObjectHeaderRef new_target_header; - - store->backend->clone_wrap(old_header, cid, oid, t, &new_target_header); - - // FIXME: Lacking of lock for origin header(now become parent), it will - // cause other operation can get the origin header while submitting - // transactions - strip_headers[make_pair(cid, oid)] = new_target_header; -} - -void KeyValueStore::BufferTransaction::rename_buffer( - StripObjectMap::StripObjectHeaderRef old_header, - const coll_t &cid, const ghobject_t &oid) -{ - // FIXME: Lacking of lock for origin header, it will cause other operation - // can get the origin header while submitting transactions - StripObjectMap::StripObjectHeaderRef new_header; - store->backend->rename_wrap(old_header, cid, oid, t, &new_header); - - InvalidateCacheContext *c = new InvalidateCacheContext(store, old_header->cid, old_header->oid); - finishes.push_back(c); - strip_headers[make_pair(cid, oid)] = new_header; -} - -int KeyValueStore::BufferTransaction::submit_transaction() -{ - int r = 0; - - for (StripHeaderMap::iterator header_iter = strip_headers.begin(); - header_iter != strip_headers.end(); ++header_iter) { - StripObjectMap::StripObjectHeaderRef header = header_iter->second; - - if (header->deleted) - continue; - - if (header->updated) { - r = store->backend->save_strip_header(header, t); - - if (r < 0) { - dout(10) << __func__ << " save strip header failed " << dendl; - goto out; - } - } - } - - r = store->backend->submit_transaction_sync(t); - for (list::iterator it = finishes.begin(); it != finishes.end(); ++it) { - (*it)->complete(r); - } - -out: - dout(5) << __func__ << " r = " << r << dendl; - return r; -} - -// =========== KeyValueStore Intern Helper Implementation ============== - -ostream& operator<<(ostream& out, const KeyValueStore::OpSequencer& s) -{ - assert(&out); - return out << *s.parent; -} - -int KeyValueStore::_create_current() -{ - struct stat st; - int ret = ::stat(current_fn.c_str(), &st); - if (ret == 0) { - // current/ exists - if (!S_ISDIR(st.st_mode)) { - dout(0) << "_create_current: current/ exists but is not a directory" << dendl; - ret = -EINVAL; - } - } else { - ret = ::mkdir(current_fn.c_str(), 0755); - if (ret < 0) { - ret = -errno; - dout(0) << "_create_current: mkdir " << current_fn << " failed: "<< cpp_strerror(ret) << dendl; - } - } - - return ret; -} - - - -// =========== KeyValueStore API Implementation ============== - -KeyValueStore::KeyValueStore(const std::string &base, - const char *name, bool do_update) : - ObjectStore(base), - internal_name(name), - basedir(base), - fsid_fd(-1), current_fd(-1), - backend(NULL), - ondisk_finisher(g_ceph_context), - collections_lock("KeyValueStore::collections_lock"), - lock("KeyValueStore::lock"), - throttle_ops(g_ceph_context, "keyvaluestore_ops", g_conf->keyvaluestore_queue_max_ops), - throttle_bytes(g_ceph_context, "keyvaluestore_bytes", g_conf->keyvaluestore_queue_max_bytes), - op_finisher(g_ceph_context), - op_tp(g_ceph_context, "KeyValueStore::op_tp", - g_conf->keyvaluestore_op_threads, "keyvaluestore_op_threads"), - op_wq(this, g_conf->keyvaluestore_op_thread_timeout, - g_conf->keyvaluestore_op_thread_suicide_timeout, &op_tp), - perf_logger(NULL), - m_keyvaluestore_queue_max_ops(g_conf->keyvaluestore_queue_max_ops), - m_keyvaluestore_queue_max_bytes(g_conf->keyvaluestore_queue_max_bytes), - m_keyvaluestore_strip_size(g_conf->keyvaluestore_default_strip_size), - m_keyvaluestore_max_expected_write_size(g_conf->keyvaluestore_max_expected_write_size), - do_update(do_update), - m_keyvaluestore_do_dump(false), - m_keyvaluestore_dump_fmt(true) -{ - ostringstream oss; - oss << basedir << "/current"; - current_fn = oss.str(); - - // initialize perf_logger - PerfCountersBuilder plb(g_ceph_context, internal_name, l_os_commit_len, l_os_last); - - plb.add_u64(l_os_oq_max_ops, "op_queue_max_ops", "Max operations count in queue"); - plb.add_u64(l_os_oq_ops, "op_queue_ops", "Operations count in queue"); - plb.add_u64_counter(l_os_ops, "ops", "Operations"); - plb.add_u64(l_os_oq_max_bytes, "op_queue_max_bytes", "Max size of queue"); - plb.add_u64(l_os_oq_bytes, "op_queue_bytes", "Size of queue"); - plb.add_u64_counter(l_os_bytes, "bytes", "Data written to store"); - plb.add_time_avg(l_os_commit_lat, "commit_latency", "Commit latency"); - plb.add_time_avg(l_os_apply_lat, "apply_latency", "Apply latency"); - plb.add_time_avg(l_os_queue_lat, "queue_transaction_latency_avg", "Store operation queue latency"); - - perf_logger = plb.create_perf_counters(); - - g_ceph_context->get_perfcounters_collection()->add(perf_logger); - g_ceph_context->_conf->add_observer(this); - - superblock.compat_features = get_kv_initial_compat_set(); -} - -KeyValueStore::~KeyValueStore() -{ - g_ceph_context->_conf->remove_observer(this); - g_ceph_context->get_perfcounters_collection()->remove(perf_logger); - - delete perf_logger; - - if (m_keyvaluestore_do_dump) { - dump_stop(); - } -} - -int KeyValueStore::statfs(struct statfs *buf) -{ - int r = backend->db->get_statfs(buf); - if (r < 0) { - if (::statfs(basedir.c_str(), buf) < 0) { - int r = -errno; - return r; - } - } - return 0; -} - -void KeyValueStore::collect_metadata(map *pm) -{ - (*pm)["keyvaluestore_backend"] = superblock.backend; -} - -int KeyValueStore::mkfs() -{ - int ret = 0; - char fsid_fn[PATH_MAX]; - uuid_d old_fsid; - - dout(1) << "mkfs in " << basedir << dendl; - - // open+lock fsid - snprintf(fsid_fn, sizeof(fsid_fn), "%s/fsid", basedir.c_str()); - fsid_fd = ::open(fsid_fn, O_RDWR|O_CREAT, 0644); - if (fsid_fd < 0) { - ret = -errno; - derr << "mkfs: failed to open " << fsid_fn << ": " << cpp_strerror(ret) << dendl; - return ret; - } - - if (lock_fsid() < 0) { - ret = -EBUSY; - goto close_fsid_fd; - } - - if (read_fsid(fsid_fd, &old_fsid) < 0 || old_fsid.is_zero()) { - if (fsid.is_zero()) { - fsid.generate_random(); - dout(1) << "mkfs generated fsid " << fsid << dendl; - } else { - dout(1) << "mkfs using provided fsid " << fsid << dendl; - } - - char fsid_str[40]; - fsid.print(fsid_str); - strcat(fsid_str, "\n"); - ret = ::ftruncate(fsid_fd, 0); - if (ret < 0) { - ret = -errno; - derr << "mkfs: failed to truncate fsid: " << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } - ret = safe_write(fsid_fd, fsid_str, strlen(fsid_str)); - if (ret < 0) { - derr << "mkfs: failed to write fsid: " << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } - if (::fsync(fsid_fd) < 0) { - ret = -errno; - derr << "mkfs: close failed: can't write fsid: " - << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } - dout(10) << "mkfs fsid is " << fsid << dendl; - } else { - if (!fsid.is_zero() && fsid != old_fsid) { - derr << "mkfs on-disk fsid " << old_fsid << " != provided " << fsid << dendl; - ret = -EINVAL; - goto close_fsid_fd; - } - fsid = old_fsid; - dout(1) << "mkfs fsid is already set to " << fsid << dendl; - } - - // version stamp - ret = write_version_stamp(); - if (ret < 0) { - derr << "mkfs: write_version_stamp() failed: " - << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } - - ret = _create_current(); - if (ret < 0) { - derr << "mkfs: failed to create current/ " << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } - - // superblock - superblock.backend = g_conf->keyvaluestore_backend; - ret = write_superblock(); - if (ret < 0) { - derr << "KeyValueStore::mkfs write_superblock() failed: " - << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } - - { - KeyValueDB *store = KeyValueDB::create(g_ceph_context, - superblock.backend, - current_fn.c_str()); - if (!store) { - derr << __func__ << " failed to create backend type " - << g_conf->keyvaluestore_backend << "." << dendl; - ret = -1; - goto close_fsid_fd; - } - - ostringstream err; - if (store->create_and_open(err)) { - derr << __func__ << " failed to create/open backend type " - << g_conf->keyvaluestore_backend << "." << dendl; - ret = -1; - delete store; - goto close_fsid_fd; - } - - bufferlist bl; - ::encode(collections, bl); - KeyValueDB::Transaction t = store->get_transaction(); - t->set("meta", "collections", bl); - store->submit_transaction_sync(t); - - dout(1) << g_conf->keyvaluestore_backend << " backend exists/created" << dendl; - delete store; - } - - ret = write_meta("type", "keyvaluestore"); - if (ret < 0) - goto close_fsid_fd; - - dout(1) << "mkfs done in " << basedir << dendl; - ret = 0; - - close_fsid_fd: - VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); - fsid_fd = -1; - return ret; -} - -int KeyValueStore::read_fsid(int fd, uuid_d *uuid) -{ - char fsid_str[40]; - int ret = safe_read(fd, fsid_str, sizeof(fsid_str)); - if (ret < 0) - return ret; - if (ret == 8) { - // old 64-bit fsid... mirror it. - *(uint64_t*)&uuid->bytes()[0] = *(uint64_t*)fsid_str; - *(uint64_t*)&uuid->bytes()[8] = *(uint64_t*)fsid_str; - return 0; - } - - if (ret > 36) - fsid_str[36] = 0; - if (!uuid->parse(fsid_str)) - return -EINVAL; - return 0; -} - -int KeyValueStore::lock_fsid() -{ - struct flock l; - memset(&l, 0, sizeof(l)); - l.l_type = F_WRLCK; - l.l_whence = SEEK_SET; - l.l_start = 0; - l.l_len = 0; - int r = ::fcntl(fsid_fd, F_SETLK, &l); - if (r < 0) { - int err = errno; - dout(0) << "lock_fsid failed to lock " << basedir - << "/fsid, is another ceph-osd still running? " - << cpp_strerror(err) << dendl; - return -err; - } - return 0; -} - -bool KeyValueStore::test_mount_in_use() -{ - dout(5) << "test_mount basedir " << basedir << dendl; - char fn[PATH_MAX]; - snprintf(fn, sizeof(fn), "%s/fsid", basedir.c_str()); - - // verify fs isn't in use - - fsid_fd = ::open(fn, O_RDWR, 0644); - if (fsid_fd < 0) - return 0; // no fsid, ok. - bool inuse = lock_fsid() < 0; - VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); - fsid_fd = -1; - return inuse; -} - -int KeyValueStore::write_superblock() -{ - bufferlist bl; - ::encode(superblock, bl); - return safe_write_file(basedir.c_str(), "superblock", - bl.c_str(), bl.length()); -} - -int KeyValueStore::read_superblock() -{ - bufferptr bp(PATH_MAX); - int ret = safe_read_file(basedir.c_str(), "superblock", - bp.c_str(), bp.length()); - if (ret < 0) { - if (ret == -ENOENT) { - // If the file doesn't exist write initial CompatSet - return write_superblock(); - } - return ret; - } - - bufferlist bl; - bl.push_back(bp); - bufferlist::iterator i = bl.begin(); - ::decode(superblock, i); - return 0; -} - - - -int KeyValueStore::update_version_stamp() -{ - return write_version_stamp(); -} - -int KeyValueStore::version_stamp_is_valid(uint32_t *version) -{ - bufferptr bp(PATH_MAX); - int ret = safe_read_file(basedir.c_str(), "store_version", - bp.c_str(), bp.length()); - if (ret < 0) { - if (ret == -ENOENT) - return 0; - return ret; - } - bufferlist bl; - bl.push_back(bp); - bufferlist::iterator i = bl.begin(); - ::decode(*version, i); - if (*version == target_version) - return 1; - else - return 0; -} - -int KeyValueStore::write_version_stamp() -{ - bufferlist bl; - ::encode(target_version, bl); - - return safe_write_file(basedir.c_str(), "store_version", - bl.c_str(), bl.length()); -} - -int KeyValueStore::mount() -{ - int ret; - char buf[PATH_MAX]; - CompatSet supported_compat_set = get_kv_supported_compat_set(); - - dout(5) << "basedir " << basedir << dendl; - - // make sure global base dir exists - if (::access(basedir.c_str(), R_OK | W_OK)) { - ret = -errno; - derr << "KeyValueStore::mount: unable to access basedir '" << basedir - << "': " << cpp_strerror(ret) << dendl; - goto done; - } - - // get fsid - snprintf(buf, sizeof(buf), "%s/fsid", basedir.c_str()); - fsid_fd = ::open(buf, O_RDWR, 0644); - if (fsid_fd < 0) { - ret = -errno; - derr << "KeyValueStore::mount: error opening '" << buf << "': " - << cpp_strerror(ret) << dendl; - goto done; - } - - ret = read_fsid(fsid_fd, &fsid); - if (ret < 0) { - derr << "KeyValueStore::mount: error reading fsid_fd: " - << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } - - if (lock_fsid() < 0) { - derr << "KeyValueStore::mount: lock_fsid failed" << dendl; - ret = -EBUSY; - goto close_fsid_fd; - } - - dout(10) << "mount fsid is " << fsid << dendl; - - uint32_t version_stamp; - ret = version_stamp_is_valid(&version_stamp); - if (ret < 0) { - derr << "KeyValueStore::mount : error in version_stamp_is_valid: " - << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } else if (ret == 0) { - if (do_update) { - derr << "KeyValueStore::mount : stale version stamp detected: " - << version_stamp << ". Proceeding, do_update " - << "is set, performing disk format upgrade." << dendl; - } else { - ret = -EINVAL; - derr << "KeyValueStore::mount : stale version stamp " << version_stamp - << ". Please run the KeyValueStore update script before starting " - << "the OSD, or set keyvaluestore_update_to to " << target_version - << dendl; - goto close_fsid_fd; - } - } - - superblock.backend = g_conf->keyvaluestore_backend; - ret = read_superblock(); - if (ret < 0) { - ret = -EINVAL; - goto close_fsid_fd; - } - - // Check if this KeyValueStore supports all the necessary features to mount - if (supported_compat_set.compare(superblock.compat_features) == -1) { - derr << "KeyValueStore::mount : Incompatible features set " - << superblock.compat_features << dendl; - ret = -EINVAL; - goto close_fsid_fd; - } - - current_fd = ::open(current_fn.c_str(), O_RDONLY); - if (current_fd < 0) { - ret = -errno; - derr << "KeyValueStore::mount: error opening: " << current_fn << ": " - << cpp_strerror(ret) << dendl; - goto close_fsid_fd; - } - - assert(current_fd >= 0); - - { - if (superblock.backend.empty()) - superblock.backend = g_conf->keyvaluestore_backend; - KeyValueDB *store = KeyValueDB::create(g_ceph_context, - superblock.backend, - current_fn.c_str()); - if(!store) - { - derr << "KeyValueStore::mount backend type " - << superblock.backend << " error" << dendl; - ret = -1; - goto close_fsid_fd; - - } - - if (superblock.backend == "rocksdb") - store->init(g_conf->keyvaluestore_rocksdb_options); - else - store->init(); - stringstream err; - if (store->open(err)) { - derr << "KeyValueStore::mount Error initializing keyvaluestore backend " - << superblock.backend << ": " << err.str() << dendl; - ret = -1; - delete store; - goto close_current_fd; - } - - // get collection list - set keys; - keys.insert("collections"); - map values; - store->get("meta", keys, &values); - if (values.empty()) { - ret = -EIO; - derr << "Error no collection list; old store?" << dendl; - goto close_current_fd; - } - bufferlist::iterator p = values["collections"].begin(); - ::decode(collections, p); - dout(20) << "collections: " << collections << dendl; - - StripObjectMap *dbomap = new StripObjectMap(store); - ret = dbomap->init(do_update); - if (ret < 0) { - delete dbomap; - derr << "Error initializing StripObjectMap: " << ret << dendl; - goto close_current_fd; - } - stringstream err2; - - if (g_conf->keyvaluestore_debug_check_backend && !dbomap->check(err2)) { - derr << err2.str() << dendl; - delete dbomap; - ret = -EINVAL; - goto close_current_fd; - } - - default_strip_size = m_keyvaluestore_strip_size; - backend.reset(dbomap); - } - - op_tp.start(); - op_finisher.start(); - ondisk_finisher.start(); - - // all okay. - return 0; - -close_current_fd: - VOID_TEMP_FAILURE_RETRY(::close(current_fd)); - current_fd = -1; -close_fsid_fd: - VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); - fsid_fd = -1; -done: - return ret; -} - -int KeyValueStore::umount() -{ - dout(5) << "umount " << basedir << dendl; - - op_tp.stop(); - op_finisher.stop(); - ondisk_finisher.stop(); - - if (fsid_fd >= 0) { - VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); - fsid_fd = -1; - } - if (current_fd >= 0) { - VOID_TEMP_FAILURE_RETRY(::close(current_fd)); - current_fd = -1; - } - - backend.reset(); - - // nothing - return 0; -} - -int KeyValueStore::queue_transactions(Sequencer *posr, list &tls, - TrackedOpRef osd_op, - ThreadPool::TPHandle *handle) -{ - utime_t start = ceph_clock_now(g_ceph_context); - Context *onreadable; - Context *ondisk; - Context *onreadable_sync; - ObjectStore::Transaction::collect_contexts( - tls, &onreadable, &ondisk, &onreadable_sync); - - // set up the sequencer - OpSequencer *osr; - assert(posr); - if (posr->p) { - osr = static_cast(posr->p.get()); - dout(5) << "queue_transactions existing " << osr << " " << *osr << "/" << osr->parent - << dendl; //<< " w/ q " << osr->q << dendl; - } else { - osr = new OpSequencer; - osr->parent = posr; - posr->p = osr; - dout(5) << "queue_transactions new " << osr << " " << *osr << "/" << osr->parent << dendl; - } - - Op *o = build_op(tls, ondisk, onreadable, onreadable_sync, osd_op); - op_queue_reserve_throttle(o, handle); - if (m_keyvaluestore_do_dump) - dump_transactions(o->tls, o->op, osr); - dout(5) << "queue_transactions (trailing journal) " << " " << tls <tinc(l_os_queue_lat, end - start); - return 0; -} - - -// ============== KeyValueStore Op Handler ================= - -KeyValueStore::Op *KeyValueStore::build_op(list& tls, - Context *ondisk, Context *onreadable, Context *onreadable_sync, - TrackedOpRef osd_op) -{ - uint64_t bytes = 0, ops = 0; - for (list::iterator p = tls.begin(); - p != tls.end(); - ++p) { - bytes += (*p)->get_num_bytes(); - ops += (*p)->get_num_ops(); - } - - Op *o = new Op; - o->start = ceph_clock_now(g_ceph_context); - o->tls.swap(tls); - o->ondisk = ondisk; - o->onreadable = onreadable; - o->onreadable_sync = onreadable_sync; - o->ops = ops; - o->bytes = bytes; - o->osd_op = osd_op; - return o; -} - -void KeyValueStore::queue_op(OpSequencer *osr, Op *o) -{ - // queue op on sequencer, then queue sequencer for the threadpool, - // so that regardless of which order the threads pick up the - // sequencer, the op order will be preserved. - - osr->queue(o); - - perf_logger->inc(l_os_ops); - perf_logger->inc(l_os_bytes, o->bytes); - - dout(5) << "queue_op " << o << " seq " << o->op << " " << *osr << " " - << o->bytes << " bytes" << " (queue has " << throttle_ops.get_current() - << " ops and " << throttle_bytes.get_current() << " bytes)" << dendl; - op_wq.queue(osr); -} - -void KeyValueStore::op_queue_reserve_throttle(Op *o, ThreadPool::TPHandle *handle) -{ - uint64_t max_ops = m_keyvaluestore_queue_max_ops; - uint64_t max_bytes = m_keyvaluestore_queue_max_bytes; - - perf_logger->set(l_os_oq_max_ops, max_ops); - perf_logger->set(l_os_oq_max_bytes, max_bytes); - - if (handle) - handle->suspend_tp_timeout(); - if (throttle_ops.should_wait(1) || - (throttle_bytes.get_current() // let single large ops through! - && throttle_bytes.should_wait(o->bytes))) { - dout(2) << "waiting " << throttle_ops.get_current() + 1 << " > " << max_ops << " ops || " - << throttle_bytes.get_current() + o->bytes << " > " << max_bytes << dendl; - } - throttle_ops.get(); - throttle_bytes.get(o->bytes); - if (handle) - handle->reset_tp_timeout(); - - perf_logger->set(l_os_oq_ops, throttle_ops.get_current()); - perf_logger->set(l_os_oq_bytes, throttle_bytes.get_current()); -} - -void KeyValueStore::op_queue_release_throttle(Op *o) -{ - throttle_ops.put(); - throttle_bytes.put(o->bytes); - perf_logger->set(l_os_oq_ops, throttle_ops.get_current()); - perf_logger->set(l_os_oq_bytes, throttle_bytes.get_current()); -} - -void KeyValueStore::_do_op(OpSequencer *osr, ThreadPool::TPHandle &handle) -{ - // FIXME: Suppose the collection of transaction only affect objects in the - // one PG, so this lock will ensure no other concurrent write operation - osr->apply_lock.Lock(); - Op *o = osr->peek_queue(); - dout(5) << "_do_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << " start" << dendl; - int r = _do_transactions(o->tls, o->op, &handle); - dout(10) << "_do_op " << o << " seq " << o->op << " r = " << r - << ", finisher " << o->onreadable << " " << o->onreadable_sync << dendl; - - if (o->ondisk) { - if (r < 0) { - delete o->ondisk; - o->ondisk = 0; - } else { - ondisk_finisher.queue(o->ondisk, r); - } - } -} - -void KeyValueStore::_finish_op(OpSequencer *osr) -{ - list to_queue; - Op *o = osr->dequeue(&to_queue); - - utime_t lat = ceph_clock_now(g_ceph_context); - lat -= o->start; - - dout(10) << "_finish_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << " lat " << lat << dendl; - osr->apply_lock.Unlock(); // locked in _do_op - op_queue_release_throttle(o); - - perf_logger->tinc(l_os_commit_lat, lat); - perf_logger->tinc(l_os_apply_lat, lat); - - if (o->onreadable_sync) { - o->onreadable_sync->complete(0); - } - if (o->onreadable) - op_finisher.queue(o->onreadable); - if (!to_queue.empty()) - op_finisher.queue(to_queue); - delete o; -} - -// Combine all the ops in the same transaction using "BufferTransaction" and -// cache the middle results in order to make visible to the following ops. -// -// Lock: KeyValueStore use "in_use" in GenericObjectMap to avoid concurrent -// operation on the same object. Not sure ReadWrite lock should be applied to -// improve concurrent performance. In the future, I'd like to remove apply_lock -// on "osr" and introduce PG RWLock. -int KeyValueStore::_do_transactions(list &tls, uint64_t op_seq, - ThreadPool::TPHandle *handle) -{ - int r = 0; - int trans_num = 0; - BufferTransaction bt(this); - - for (list::iterator p = tls.begin(); - p != tls.end(); - ++p, trans_num++) { - r = _do_transaction(**p, bt, handle); - if (r < 0) - break; - if (handle) - handle->reset_tp_timeout(); - } - - r = bt.submit_transaction(); - if (r < 0) { - assert(0 == "unexpected error"); // FIXME - } - - return r; -} - -unsigned KeyValueStore::_do_transaction(Transaction& transaction, - BufferTransaction &t, - ThreadPool::TPHandle *handle) -{ - dout(10) << "_do_transaction on " << &transaction << dendl; - - Transaction::iterator i = transaction.begin(); - uint64_t op_num = 0; - bool exist_clone = false; - - while (i.have_op()) { - if (handle) - handle->reset_tp_timeout(); - - Transaction::Op *op = i.decode_op(); - int r = 0; - - switch (op->op) { - case Transaction::OP_NOP: - break; - - case Transaction::OP_TOUCH: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - r = _touch(cid, oid, t); - } - break; - - case Transaction::OP_WRITE: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - uint64_t off = op->off; - uint64_t len = op->len; - uint32_t fadvise_flags = i.get_fadvise_flags(); - bufferlist bl; - i.decode_bl(bl); - r = _write(cid, oid, off, len, bl, t, fadvise_flags); - } - break; - - case Transaction::OP_ZERO: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - uint64_t off = op->off; - uint64_t len = op->len; - r = _zero(cid, oid, off, len, t); - } - break; - - case Transaction::OP_TRIMCACHE: - { - // deprecated, no-op - } - break; - - case Transaction::OP_TRUNCATE: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - uint64_t off = op->off; - r = _truncate(cid, oid, off, t); - } - break; - - case Transaction::OP_REMOVE: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - r = _remove(cid, oid, t); - } - break; - - case Transaction::OP_SETATTR: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - string name = i.decode_string(); - bufferlist bl; - i.decode_bl(bl); - map to_set; - to_set[name] = bufferptr(bl.c_str(), bl.length()); - r = _setattrs(cid, oid, to_set, t); - if (r == -ENOSPC) - dout(0) << " ENOSPC on setxattr on " << cid << "/" << oid - << " name " << name << " size " << bl.length() << dendl; - } - break; - - case Transaction::OP_SETATTRS: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - map aset; - i.decode_attrset(aset); - r = _setattrs(cid, oid, aset, t); - if (r == -ENOSPC) - dout(0) << " ENOSPC on setxattrs on " << cid << "/" << oid << dendl; - } - break; - - case Transaction::OP_RMATTR: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - string name = i.decode_string(); - r = _rmattr(cid, oid, name.c_str(), t); - } - break; - - case Transaction::OP_RMATTRS: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - r = _rmattrs(cid, oid, t); - } - break; - - case Transaction::OP_CLONE: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - ghobject_t noid = i.get_oid(op->dest_oid); - exist_clone = true; - r = _clone(cid, oid, noid, t); - } - break; - - case Transaction::OP_CLONERANGE: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - ghobject_t noid = i.get_oid(op->dest_oid); - uint64_t off = op->off; - uint64_t len = op->len; - exist_clone = true; - r = _clone_range(cid, oid, noid, off, len, off, t); - } - break; - - case Transaction::OP_CLONERANGE2: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - ghobject_t noid = i.get_oid(op->dest_oid); - uint64_t srcoff = op->off; - uint64_t len = op->len; - uint64_t dstoff = op->dest_off; - exist_clone = true; - r = _clone_range(cid, oid, noid, srcoff, len, dstoff, t); - } - break; - - case Transaction::OP_MKCOLL: - { - coll_t cid = i.get_cid(op->cid); - r = _create_collection(cid, t); - } - break; - - case Transaction::OP_COLL_HINT: - { - coll_t cid = i.get_cid(op->cid); - uint32_t type = op->hint_type; - bufferlist hint; - i.decode_bl(hint); - bufferlist::iterator hiter = hint.begin(); - if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) { - uint32_t pg_num; - uint64_t num_objs; - ::decode(pg_num, hiter); - ::decode(num_objs, hiter); - r = _collection_hint_expected_num_objs(cid, pg_num, num_objs); - } else { - // Ignore the hint - dout(10) << "Unrecognized collection hint type: " << type << dendl; - } - } - break; - - case Transaction::OP_RMCOLL: - { - coll_t cid = i.get_cid(op->cid); - r = _destroy_collection(cid, t); - } - break; - - case Transaction::OP_COLL_ADD: - { - coll_t ocid = i.get_cid(op->cid); - coll_t ncid = i.get_cid(op->dest_cid); - ghobject_t oid = i.get_oid(op->oid); - r = _collection_add(ncid, ocid, oid, t); - } - break; - - case Transaction::OP_COLL_REMOVE: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - r = _remove(cid, oid, t); - } - break; - - case Transaction::OP_COLL_MOVE: - { - // WARNING: this is deprecated and buggy; only here to replay old journals. - coll_t ocid = i.get_cid(op->cid); - coll_t ncid = i.get_cid(op->dest_cid); - ghobject_t oid = i.get_oid(op->oid); - r = _collection_move_rename(ocid, oid, ncid, oid, t); - } - break; - - case Transaction::OP_COLL_MOVE_RENAME: - { - coll_t oldcid = i.get_cid(op->cid); - ghobject_t oldoid = i.get_oid(op->oid); - coll_t newcid = i.get_cid(op->dest_cid); - ghobject_t newoid = i.get_oid(op->dest_oid); - r = _collection_move_rename(oldcid, oldoid, newcid, newoid, t); - } - break; - - case Transaction::OP_COLL_SETATTR: - case Transaction::OP_COLL_RMATTR: - assert(0 == "coll attrs no longer supported"); - break; - - case Transaction::OP_STARTSYNC: - { - start_sync(); - break; - } - - case Transaction::OP_COLL_RENAME: - { - assert(0 == "not implemented"); - } - break; - - case Transaction::OP_OMAP_CLEAR: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - r = _omap_clear(cid, oid, t); - } - break; - case Transaction::OP_OMAP_SETKEYS: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - map aset; - i.decode_attrset(aset); - r = _omap_setkeys(cid, oid, aset, t); - } - break; - case Transaction::OP_OMAP_RMKEYS: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - set keys; - i.decode_keyset(keys); - r = _omap_rmkeys(cid, oid, keys, t); - } - break; - case Transaction::OP_OMAP_RMKEYRANGE: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - string first, last; - first = i.decode_string(); - last = i.decode_string(); - r = _omap_rmkeyrange(cid, oid, first, last, t); - } - break; - case Transaction::OP_OMAP_SETHEADER: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - bufferlist bl; - i.decode_bl(bl); - r = _omap_setheader(cid, oid, bl, t); - } - break; - case Transaction::OP_SPLIT_COLLECTION: - { - coll_t cid = i.get_cid(op->cid); - uint32_t bits = op->split_bits; - uint32_t rem = op->split_rem; - coll_t dest = i.get_cid(op->dest_cid); - r = _split_collection_create(cid, bits, rem, dest, t); - } - break; - case Transaction::OP_SPLIT_COLLECTION2: - { - coll_t cid = i.get_cid(op->cid); - uint32_t bits = op->split_bits; - uint32_t rem = op->split_rem; - coll_t dest = i.get_cid(op->dest_cid); - r = _split_collection(cid, bits, rem, dest, t); - } - break; - - case Transaction::OP_SETALLOCHINT: - { - coll_t cid = i.get_cid(op->cid); - ghobject_t oid = i.get_oid(op->oid); - uint64_t expected_object_size = op->expected_object_size; - uint64_t expected_write_size = op->expected_write_size; - r = _set_alloc_hint(cid, oid, expected_object_size, - expected_write_size, t); - } - break; - - default: - derr << "bad op " << op->op << dendl; - assert(0); - } - - if (r < 0) { - bool ok = false; - - if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE || - op->op == Transaction::OP_CLONE || - op->op == Transaction::OP_CLONERANGE2 || - op->op == Transaction::OP_COLL_ADD)) - // -ENOENT is normally okay - // ...including on a replayed OP_RMCOLL with checkpoint mode - ok = true; - if (r == -ENODATA) - ok = true; - - if (!ok) { - const char *msg = "unexpected error code"; - - if (exist_clone) { - dout(0) << "BUG: clone failed will lead to paritial transaction applied" << dendl; - } - - if (r == -ENOSPC) - // For now, if we hit _any_ ENOSPC, crash, before we do any damage - // by partially applying transactions. - msg = "ENOSPC handling not implemented"; - - if (r == -ENOTEMPTY) { - msg = "ENOTEMPTY suggests garbage data in osd data dir"; - } - - dout(0) << " error " << cpp_strerror(r) << " not handled on operation " - << op->op << " op " << op_num << ", counting from 0)" << dendl; - dout(0) << msg << dendl; - dout(0) << " transaction dump:\n"; - JSONFormatter f(true); - f.open_object_section("transaction"); - transaction.dump(&f); - f.close_section(); - f.flush(*_dout); - *_dout << dendl; - assert(0 == "unexpected error"); - - if (r == -EMFILE) { - dump_open_fds(g_ceph_context); - } - } - } - - op_num++; - } - - return 0; // FIXME count errors -} - - -// =========== KeyValueStore Op Implementation ============== -// objects - -bool KeyValueStore::exists(coll_t cid, const ghobject_t& oid) -{ - dout(10) << __func__ << "collection: " << cid << " object: " << oid - << dendl; - int r; - StripObjectMap::StripObjectHeaderRef header; - - r = backend->lookup_strip_header(cid, oid, &header); - if (r < 0) { - return false; - } - - return true; -} - -int KeyValueStore::stat(coll_t cid, const ghobject_t& oid, - struct stat *st, bool allow_eio) -{ - dout(10) << "stat " << cid << "/" << oid << dendl; - - StripObjectMap::StripObjectHeaderRef header; - - int r = backend->lookup_strip_header(cid, oid, &header); - if (r < 0) { - dout(10) << "stat " << cid << "/" << oid << "=" << r << dendl; - return -ENOENT; - } - - st->st_blocks = header->max_size / header->strip_size; - if (header->max_size % header->strip_size) - st->st_blocks++; - st->st_nlink = 1; - st->st_size = header->max_size; - st->st_blksize = header->strip_size; - - return r; -} - -int KeyValueStore::_generic_read(StripObjectMap::StripObjectHeaderRef header, - uint64_t offset, size_t len, bufferlist& bl, - bool allow_eio, BufferTransaction *bt) -{ - if (header->max_size < offset) { - dout(10) << __func__ << " " << header->cid << "/" << header->oid << ")" - << " offset exceed the length of bl"<< dendl; - return 0; - } - - if (len == 0) - len = header->max_size - offset; - - if (offset + len > header->max_size) - len = header->max_size - offset; - - vector extents; - StripObjectMap::file_to_extents(offset, len, header->strip_size, - extents); - map out; - set keys; - pair uid = make_pair(header->cid, header->oid); - - for (vector::iterator iter = extents.begin(); - iter != extents.end(); ++iter) { - bufferlist old; - string key = strip_object_key(iter->no); - - map< pair, map, bufferlist> >::iterator obj_it; - if ( bt ){ - obj_it = bt->buffers.find(uid); - } - if ( bt && obj_it != bt->buffers.end() && obj_it->second.count(make_pair(OBJECT_STRIP_PREFIX, key))) { - // use strip_header buffer - assert(header->bits[iter->no]); - out[key] = obj_it->second[make_pair(OBJECT_STRIP_PREFIX, key)]; - }else if (header->bits[iter->no]) { - keys.insert(key); - } - } - - - int r = backend->get_values_with_header(header, OBJECT_STRIP_PREFIX, keys, &out); - r = check_get_rc(header->cid, header->oid, r, out.size() == keys.size()); - if (r < 0) - return r; - - for (vector::iterator iter = extents.begin(); - iter != extents.end(); ++iter) { - string key = strip_object_key(iter->no); - - if (header->bits[iter->no]) { - if (iter->len == header->strip_size) { - bl.claim_append(out[key]); - } else { - out[key].copy(iter->offset, iter->len, bl); - } - } else { - bl.append_zero(iter->len); - } - } - - dout(10) << __func__ << " " << header->cid << "/" << header->oid << " " - << offset << "~" << bl.length() << "/" << len << " r = " << r - << dendl; - - return bl.length(); -} - - -int KeyValueStore::read(coll_t cid, const ghobject_t& oid, uint64_t offset, - size_t len, bufferlist& bl, uint32_t op_flags, - bool allow_eio) -{ - dout(15) << __func__ << " " << cid << "/" << oid << " " << offset << "~" - << len << dendl; - - StripObjectMap::StripObjectHeaderRef header; - - int r = backend->lookup_strip_header(cid, oid, &header); - - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oid << " " << offset << "~" - << len << " header isn't exist: r = " << r << dendl; - return r; - } - - return _generic_read(header, offset, len, bl, allow_eio); -} - -int KeyValueStore::fiemap(coll_t cid, const ghobject_t& oid, - uint64_t offset, size_t len, bufferlist& bl) -{ - dout(10) << __func__ << " " << cid << " " << oid << " " << offset << "~" - << len << dendl; - int r; - StripObjectMap::StripObjectHeaderRef header; - - r = backend->lookup_strip_header(cid, oid, &header); - if (r < 0) { - dout(10) << "fiemap " << cid << "/" << oid << " " << offset << "~" << len - << " failed to get header: r = " << r << dendl; - return r; - } - - vector extents; - StripObjectMap::file_to_extents(offset, len, header->strip_size, - extents); - - map m; - for (vector::iterator iter = extents.begin(); - iter != extents.end(); ++iter) { - if (header->bits[iter->no]) { - uint64_t off = iter->no * header->strip_size + iter->offset; - m[off] = iter->len; - } - } - ::encode(m, bl); - return 0; -} - -int KeyValueStore::_remove(coll_t cid, const ghobject_t& oid, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << oid << dendl; - - int r; - StripObjectMap::StripObjectHeaderRef header; - - r = t.lookup_cached_header(cid, oid, &header, false); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oid << " " - << " failed to get header: r = " << r << dendl; - return r; - } - - header->max_size = 0; - header->bits.clear(); - header->updated = true; - r = t.clear_buffer(header); - - dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl; - return r; -} - -int KeyValueStore::_truncate(coll_t cid, const ghobject_t& oid, uint64_t size, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << oid << " size " << size - << dendl; - - int r; - StripObjectMap::StripObjectHeaderRef header; - - r = t.lookup_cached_header(cid, oid, &header, false); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oid << " " << size - << " failed to get header: r = " << r << dendl; - return r; - } - - if (header->max_size == size) - return 0; - - if (header->max_size > size) { - vector extents; - StripObjectMap::file_to_extents(size, header->max_size-size, - header->strip_size, extents); - assert(extents.size()); - - vector::iterator iter = extents.begin(); - if (header->bits[iter->no] && iter->offset != 0) { - bufferlist value; - map values; - set lookup_keys; - string key = strip_object_key(iter->no); - - lookup_keys.insert(key); - r = t.get_buffer_keys(header, OBJECT_STRIP_PREFIX, - lookup_keys, &values); - r = check_get_rc(cid, oid, r, lookup_keys.size() == values.size()); - if (r < 0) - return r; - - values[key].copy(0, iter->offset, value); - value.append_zero(header->strip_size-iter->offset); - assert(value.length() == header->strip_size); - value.swap(values[key]); - - t.set_buffer_keys(header, OBJECT_STRIP_PREFIX, values); - ++iter; - } - - set keys; - for (; iter != extents.end(); ++iter) { - if (header->bits[iter->no]) { - keys.insert(strip_object_key(iter->no)); - header->bits[iter->no] = 0; - } - } - r = t.remove_buffer_keys(header, OBJECT_STRIP_PREFIX, keys); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oid << " " - << size << " = " << r << dendl; - return r; - } - } - - header->bits.resize(size/header->strip_size+1); - header->max_size = size; - header->updated = true; - - dout(10) << __func__ << " " << cid << "/" << oid << " size " << size << " = " - << r << dendl; - return r; -} - -int KeyValueStore::_touch(coll_t cid, const ghobject_t& oid, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << oid << dendl; - - int r; - StripObjectMap::StripObjectHeaderRef header; - - r = t.lookup_cached_header(cid, oid, &header, true); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oid << " " - << " failed to get header: r = " << r << dendl; - r = -EINVAL; - return r; - } - - dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl; - return r; -} - -int KeyValueStore::_generic_write(StripObjectMap::StripObjectHeaderRef header, - uint64_t offset, size_t len, - const bufferlist& bl, BufferTransaction &t, - uint32_t fadvise_flags) -{ - if (len > bl.length()) - len = bl.length(); - - if (len + offset > header->max_size) { - header->max_size = len + offset; - header->bits.resize(header->max_size/header->strip_size+1); - header->updated = true; - } - - vector extents; - StripObjectMap::file_to_extents(offset, len, header->strip_size, - extents); - - map out; - set keys; - for (vector::iterator iter = extents.begin(); - iter != extents.end(); ++iter) { - if (header->bits[iter->no] && !(iter->offset == 0 && - iter->len == header->strip_size)) - keys.insert(strip_object_key(iter->no)); - } - - int r = t.get_buffer_keys(header, OBJECT_STRIP_PREFIX, keys, &out); - r = check_get_rc(header->cid, header->oid, r, keys.size() == out.size()); - if (r < 0) - return r; - - uint64_t bl_offset = 0; - map values; - for (vector::iterator iter = extents.begin(); - iter != extents.end(); ++iter) { - bufferlist value; - string key = strip_object_key(iter->no); - if (header->bits[iter->no]) { - if (iter->offset == 0 && iter->len == header->strip_size) { - bl.copy(bl_offset, iter->len, value); - bl_offset += iter->len; - } else { - assert(out[key].length() == header->strip_size); - - out[key].copy(0, iter->offset, value); - bl.copy(bl_offset, iter->len, value); - bl_offset += iter->len; - - if (value.length() != header->strip_size) - out[key].copy(value.length(), header->strip_size-value.length(), - value); - } - } else { - if (iter->offset) - value.append_zero(iter->offset); - bl.copy(bl_offset, iter->len, value); - bl_offset += iter->len; - - if (value.length() < header->strip_size) - value.append_zero(header->strip_size-value.length()); - - header->bits[iter->no] = 1; - header->updated = true; - } - assert(value.length() == header->strip_size); - values[key].swap(value); - } - assert(bl_offset == len); - - t.set_buffer_keys(header, OBJECT_STRIP_PREFIX, values); - dout(10) << __func__ << " " << header->cid << "/" << header->oid << " " - << offset << "~" << len << " = " << r << dendl; - - return r; -} - -int KeyValueStore::_write(coll_t cid, const ghobject_t& oid, - uint64_t offset, size_t len, const bufferlist& bl, - BufferTransaction &t, uint32_t fadvise_flags) -{ - dout(15) << __func__ << " " << cid << "/" << oid << " " << offset << "~" - << len << dendl; - - int r; - StripObjectMap::StripObjectHeaderRef header; - - r = t.lookup_cached_header(cid, oid, &header, true); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oid << " " << offset - << "~" << len << " failed to get header: r = " << r << dendl; - return r; - } - - return _generic_write(header, offset, len, bl, t, fadvise_flags); -} - -int KeyValueStore::_zero(coll_t cid, const ghobject_t& oid, uint64_t offset, - size_t len, BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << oid << " " << offset << "~" << len << dendl; - int r; - StripObjectMap::StripObjectHeaderRef header; - - r = t.lookup_cached_header(cid, oid, &header, true); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oid << " " << offset - << "~" << len << " failed to get header: r = " << r << dendl; - return r; - } - - if (len + offset > header->max_size) { - header->max_size = len + offset; - header->bits.resize(header->max_size/header->strip_size+1); - header->updated = true; - } - - vector extents; - StripObjectMap::file_to_extents(offset, len, header->strip_size, - extents); - set rm_keys; - set lookup_keys; - map values; - map > off_len; - for (vector::iterator iter = extents.begin(); - iter != extents.end(); ++iter) { - string key = strip_object_key(iter->no); - if (header->bits[iter->no]) { - if (iter->offset == 0 && iter->len == header->strip_size) { - rm_keys.insert(key); - header->bits[iter->no] = 0; - header->updated = true; - } else { - lookup_keys.insert(key); - off_len[key] = make_pair(iter->offset, iter->len); - } - } - } - r = t.get_buffer_keys(header, OBJECT_STRIP_PREFIX, - lookup_keys, &values); - r = check_get_rc(header->cid, header->oid, r, lookup_keys.size() == values.size()); - if (r < 0) - return r; - for(set::iterator it = lookup_keys.begin(); it != lookup_keys.end(); ++it) - { - pair p = off_len[*it]; - values[*it].zero(p.first, p.second); - } - t.set_buffer_keys(header, OBJECT_STRIP_PREFIX, values); - t.remove_buffer_keys(header, OBJECT_STRIP_PREFIX, rm_keys); - dout(10) << __func__ << " " << cid << "/" << oid << " " << offset << "~" - << len << " = " << r << dendl; - return r; -} - -int KeyValueStore::_clone(coll_t cid, const ghobject_t& oldoid, - const ghobject_t& newoid, BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/" - << newoid << dendl; - - if (oldoid == newoid) - return 0; - - int r; - StripObjectMap::StripObjectHeaderRef old_header; - - r = t.lookup_cached_header(cid, oldoid, &old_header, false); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/" - << newoid << " = " << r << dendl; - return r; - } - - t.clone_buffer(old_header, cid, newoid); - - dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/" - << newoid << " = " << r << dendl; - return r; -} - -int KeyValueStore::_clone_range(coll_t cid, const ghobject_t& oldoid, - const ghobject_t& newoid, uint64_t srcoff, - uint64_t len, uint64_t dstoff, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/" - << newoid << " " << srcoff << "~" << len << " to " << dstoff - << dendl; - - int r; - bufferlist bl; - - StripObjectMap::StripObjectHeaderRef old_header, new_header; - - r = t.lookup_cached_header(cid, oldoid, &old_header, false); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/" - << newoid << " " << srcoff << "~" << len << " to " << dstoff - << " header isn't exist: r = " << r << dendl; - return r; - } - - r = t.lookup_cached_header(cid, newoid, &new_header, true); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/" - << newoid << " " << srcoff << "~" << len << " to " << dstoff - << " can't create header: r = " << r << dendl; - return r; - } - - r = _generic_read(old_header, srcoff, len, bl, &t); - if (r < 0) - goto out; - - r = _generic_write(new_header, dstoff, len, bl, t); - - out: - dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/" - << newoid << " " << srcoff << "~" << len << " to " << dstoff - << " = " << r << dendl; - return r; -} - -// attrs - -int KeyValueStore::getattr(coll_t cid, const ghobject_t& oid, const char *name, - bufferptr &bp) -{ - dout(15) << __func__ << " " << cid << "/" << oid << " '" << name << "'" - << dendl; - - int r; - map got; - set to_get; - StripObjectMap::StripObjectHeaderRef header; - - to_get.insert(string(name)); - - r = backend->lookup_strip_header(cid, oid, &header); - if (r < 0) { - dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl; - return r; - } - - r = backend->get_values_with_header(header, OBJECT_XATTR, to_get, &got); - if (r < 0 && r != -ENOENT) { - dout(10) << __func__ << " get_xattrs err r =" << r << dendl; - goto out; - } - if (got.empty()) { - dout(10) << __func__ << " got.size() is 0" << dendl; - return -ENODATA; - } - bp = bufferptr(got.begin()->second.c_str(), - got.begin()->second.length()); - r = 0; - - out: - dout(10) << __func__ << " " << cid << "/" << oid << " '" << name << "' = " - << r << dendl; - return r; -} - -int KeyValueStore::getattrs(coll_t cid, const ghobject_t& oid, - map& aset) -{ - map attr_aset; - int r; - StripObjectMap::StripObjectHeaderRef header; - - r = backend->lookup_strip_header(cid, oid, &header); - if (r < 0) { - dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl; - return r; - } - - r = backend->get_with_header(header, OBJECT_XATTR, &attr_aset); - if (r < 0) { - dout(10) << __func__ << " could not get attrs r = " << r << dendl; - goto out; - } - - for (map::iterator i = attr_aset.begin(); - i != attr_aset.end(); ++i) { - string key; - key = i->first; - aset.insert(make_pair(key, - bufferptr(i->second.c_str(), i->second.length()))); - } - - out: - dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl; - - return r; -} - -int KeyValueStore::_setattrs(coll_t cid, const ghobject_t& oid, - map& aset, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << oid << dendl; - - int r; - - StripObjectMap::StripObjectHeaderRef header; - map attrs; - - r = t.lookup_cached_header(cid, oid, &header, false); - if (r < 0) - goto out; - - for (map::iterator it = aset.begin(); - it != aset.end(); ++it) { - attrs[it->first].push_back(it->second); - } - - t.set_buffer_keys(header, OBJECT_XATTR, attrs); - -out: - dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl; - return r; -} - - -int KeyValueStore::_rmattr(coll_t cid, const ghobject_t& oid, const char *name, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << oid << " '" << name << "'" - << dendl; - - int r; - set to_remove; - StripObjectMap::StripObjectHeaderRef header; - - r = t.lookup_cached_header(cid, oid, &header, false); - if (r < 0) { - dout(10) << __func__ << " could not find header r = " << r - << dendl; - return r; - } - - to_remove.insert(string(name)); - r = t.remove_buffer_keys(header, OBJECT_XATTR, to_remove); - - dout(10) << __func__ << " " << cid << "/" << oid << " '" << name << "' = " - << r << dendl; - return r; -} - -int KeyValueStore::_rmattrs(coll_t cid, const ghobject_t& oid, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << oid << dendl; - - int r; - set attrs; - - StripObjectMap::StripObjectHeaderRef header; - - r = t.lookup_cached_header(cid, oid, &header, false); - if (r < 0) { - dout(10) << __func__ << " could not find header r = " << r - << dendl; - return r; - } - - r = backend->get_keys_with_header(header, OBJECT_XATTR, &attrs); - if (r < 0) { - dout(10) << __func__ << " could not get attrs r = " << r << dendl; - return r; - } - - r = t.remove_buffer_keys(header, OBJECT_XATTR, attrs); - t.clear_buffer_keys(header, OBJECT_XATTR); - - dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl; - return r; -} - - -// collections - -int KeyValueStore::_create_collection(coll_t c, BufferTransaction &t) -{ - dout(15) << __func__ << " " << c << dendl; - int r = 0; - bufferlist bl; - - RWLock::WLocker l(collections_lock); - if (collections.count(c)) { - r = -EEXIST; - goto out; - } - - collections.insert(c); - t.set_collections(collections); - - out: - dout(10) << __func__ << " cid " << c << " r = " << r << dendl; - return r; -} - -int KeyValueStore::_destroy_collection(coll_t c, BufferTransaction &t) -{ - dout(15) << __func__ << " " << c << dendl; - - int r; - uint64_t modified_object = 0; - vector oids; - bufferlist bl; - - { - RWLock::RLocker l(collections_lock); - if (!collections.count(c)) { - r = -ENOENT; - goto out; - } - } - - // All modified objects are marked deleted - for (BufferTransaction::StripHeaderMap::iterator iter = t.strip_headers.begin(); - iter != t.strip_headers.end(); ++iter) { - // sum the total modified object in this PG - if (iter->first.first != c) - continue; - - modified_object++; - if (!iter->second->deleted) { - r = -ENOTEMPTY; - goto out; - } - } - - r = backend->list_objects(c, ghobject_t(), ghobject_t::get_max(), modified_object+1, &oids, - 0); - // No other object - if (oids.size() != modified_object && oids.size() != 0) { - r = -ENOTEMPTY; - goto out; - } - - for (vector::iterator iter = oids.begin(); - iter != oids.end(); ++iter) { - if (!t.strip_headers.count(make_pair(c, *iter))) { - r = -ENOTEMPTY; - goto out; - } - } - - { - RWLock::WLocker l(collections_lock); - collections.erase(c); - t.set_collections(collections); - } - r = 0; - -out: - dout(10) << __func__ << " " << c << " = " << r << dendl; - return r; -} - - -int KeyValueStore::_collection_add(coll_t c, coll_t oldcid, - const ghobject_t& o, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" - << o << dendl; - - bufferlist bl; - StripObjectMap::StripObjectHeaderRef header, old_header; - - int r = t.lookup_cached_header(oldcid, o, &old_header, false); - if (r < 0) { - goto out; - } - - r = t.lookup_cached_header(c, o, &header, false); - if (r == 0) { - r = -EEXIST; - dout(10) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" - << o << " already exist " << dendl; - goto out; - } - - r = _generic_read(old_header, 0, old_header->max_size, bl, &t); - if (r < 0) { - r = -EINVAL; - goto out; - } - - r = _generic_write(header, 0, bl.length(), bl, t); - if (r < 0) { - r = -EINVAL; - } - -out: - dout(10) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" - << o << " = " << r << dendl; - return r; -} - -int KeyValueStore::_collection_move_rename(coll_t oldcid, - const ghobject_t& oldoid, - coll_t c, const ghobject_t& o, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" - << oldoid << dendl; - int r; - StripObjectMap::StripObjectHeaderRef header; - - r = t.lookup_cached_header(c, o, &header, false); - if (r == 0) { - dout(10) << __func__ << " " << oldcid << "/" << oldoid << " -> " << c - << "/" << o << " = " << r << dendl; - return -EEXIST; - } - - r = t.lookup_cached_header(oldcid, oldoid, &header, false); - if (r < 0) { - dout(10) << __func__ << " " << oldcid << "/" << oldoid << " -> " << c - << "/" << o << " = " << r << dendl; - return r; - } - - t.rename_buffer(header, c, o); - - dout(10) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" - << oldoid << " = " << r << dendl; - return r; -} - -int KeyValueStore::_collection_remove_recursive(const coll_t &cid, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << dendl; - int r = 0; - - { - RWLock::RLocker l(collections_lock); - if (collections.count(cid) == 0) - return -ENOENT; - } - - vector objects; - ghobject_t max; - while (!max.is_max()) { - r = collection_list(cid, max, ghobject_t::get_max(), true, 300, &objects, &max); - if (r < 0) - goto out; - - for (vector::iterator i = objects.begin(); - i != objects.end(); ++i) { - r = _remove(cid, *i, t); - if (r < 0) - goto out; - } - } - - { - RWLock::WLocker l(collections_lock); - collections.erase(cid); - t.set_collections(collections); - } - - out: - dout(10) << __func__ << " " << cid << " r = " << r << dendl; - return r; -} - -int KeyValueStore::list_collections(vector& ls) -{ - dout(10) << __func__ << " " << dendl; - RWLock::RLocker l(collections_lock); - for (set::iterator p = collections.begin(); p != collections.end(); - ++p) { - ls.push_back(*p); - } - return 0; -} - -bool KeyValueStore::collection_exists(coll_t c) -{ - dout(10) << __func__ << " " << dendl; - RWLock::RLocker l(collections_lock); - return collections.count(c); -} - -bool KeyValueStore::collection_empty(coll_t c) -{ - dout(10) << __func__ << " " << dendl; - - vector oids; - backend->list_objects(c, ghobject_t(), ghobject_t::get_max(), 1, &oids, 0); - - return oids.empty(); -} - -int KeyValueStore::collection_list(coll_t c, ghobject_t start, - ghobject_t end, bool sort_bitwise, int max, - vector *ls, ghobject_t *next) -{ - if (!sort_bitwise) - return -EOPNOTSUPP; - - if (max < 0) - return -EINVAL; - - if (start.is_max()) - return 0; - - int r = backend->list_objects(c, start, end, max, ls, next); - return r; -} - -int KeyValueStore::collection_version_current(coll_t c, uint32_t *version) -{ - *version = COLLECTION_VERSION; - if (*version == target_version) - return 1; - else - return 0; -} - -// omap - -int KeyValueStore::omap_get(coll_t c, const ghobject_t &hoid, - bufferlist *bl, map *out) -{ - dout(15) << __func__ << " " << c << "/" << hoid << dendl; - - StripObjectMap::StripObjectHeaderRef header; - - int r = backend->lookup_strip_header(c, hoid, &header); - if (r < 0) { - dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl; - return r; - } - - r = backend->get_with_header(header, OBJECT_OMAP, out); - if (r < 0) { - dout(10) << __func__ << " err r =" << r << dendl; - return r; - } - - set keys; - map got; - - keys.insert(OBJECT_OMAP_HEADER_KEY); - r = backend->get_values_with_header(header, OBJECT_OMAP_HEADER, keys, &got); - if (r < 0 && r != -ENOENT) { - dout(10) << __func__ << " err r =" << r << dendl; - return r; - } - - if (!got.empty()) { - assert(got.size() == 1); - bl->swap(got.begin()->second); - } - - return 0; -} - -int KeyValueStore::omap_get_header(coll_t c, const ghobject_t &hoid, - bufferlist *bl, bool allow_eio) -{ - dout(15) << __func__ << " " << c << "/" << hoid << dendl; - - set keys; - map got; - StripObjectMap::StripObjectHeaderRef header; - - int r = backend->lookup_strip_header(c, hoid, &header); - if (r < 0) { - dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl; - return r; - } - - keys.insert(OBJECT_OMAP_HEADER_KEY); - r = backend->get_values_with_header(header, OBJECT_OMAP_HEADER, keys, &got); - if (r < 0 && r != -ENOENT) { - dout(10) << __func__ << " err r =" << r << dendl; - return r; - } - - if (!got.empty()) { - assert(got.size() == 1); - bl->swap(got.begin()->second); - } - - return 0; -} - -int KeyValueStore::omap_get_keys(coll_t c, const ghobject_t &hoid, set *keys) -{ - dout(15) << __func__ << " " << c << "/" << hoid << dendl; - - StripObjectMap::StripObjectHeaderRef header; - int r = backend->lookup_strip_header(c, hoid, &header); - if (r < 0) { - dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl; - return r; - } - - r = backend->get_keys_with_header(header, OBJECT_OMAP, keys); - if (r < 0) { - return r; - } - return 0; -} - -int KeyValueStore::omap_get_values(coll_t c, const ghobject_t &hoid, - const set &keys, - map *out) -{ - dout(15) << __func__ << " " << c << "/" << hoid << dendl; - - StripObjectMap::StripObjectHeaderRef header; - int r = backend->lookup_strip_header(c, hoid, &header); - if (r < 0) { - dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl; - return r; - } - - r = backend->get_values_with_header(header, OBJECT_OMAP, keys, out); - if (r < 0 && r != -ENOENT) { - return r; - } - return 0; -} - -int KeyValueStore::omap_check_keys(coll_t c, const ghobject_t &hoid, - const set &keys, set *out) -{ - dout(15) << __func__ << " " << c << "/" << hoid << dendl; - - int r = backend->check_keys(c, hoid, OBJECT_OMAP, keys, out); - if (r < 0 && r != -ENOENT) { - return r; - } - return 0; -} - -ObjectMap::ObjectMapIterator KeyValueStore::get_omap_iterator( - coll_t c, const ghobject_t &hoid) -{ - dout(15) << __func__ << " " << c << "/" << hoid << dendl; - return backend->get_iterator(c, hoid, OBJECT_OMAP); -} - -int KeyValueStore::_omap_clear(coll_t cid, const ghobject_t &hoid, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << hoid << dendl; - - StripObjectMap::StripObjectHeaderRef header; - - int r = t.lookup_cached_header(cid, hoid, &header, false); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << hoid << " " - << " failed to get header: r = " << r << dendl; - return r; - } - - set keys; - r = backend->get_keys_with_header(header, OBJECT_OMAP, &keys); - if (r < 0) { - dout(10) << __func__ << " could not get omap_keys r = " << r << dendl; - return r; - } - - r = t.remove_buffer_keys(header, OBJECT_OMAP, keys); - if (r < 0) { - dout(10) << __func__ << " could not remove keys r = " << r << dendl; - return r; - } - - keys.clear(); - keys.insert(OBJECT_OMAP_HEADER_KEY); - r = t.remove_buffer_keys(header, OBJECT_OMAP_HEADER, keys); - if (r < 0) { - dout(10) << __func__ << " could not remove keys r = " << r << dendl; - return r; - } - - t.clear_buffer_keys(header, OBJECT_OMAP_HEADER); - - dout(10) << __func__ << " " << cid << "/" << hoid << " r = " << r << dendl; - return 0; -} - -int KeyValueStore::_omap_setkeys(coll_t cid, const ghobject_t &hoid, - map &aset, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << hoid << dendl; - - StripObjectMap::StripObjectHeaderRef header; - - int r = t.lookup_cached_header(cid, hoid, &header, false); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << hoid << " " - << " failed to get header: r = " << r << dendl; - return r; - } - - t.set_buffer_keys(header, OBJECT_OMAP, aset); - - return 0; -} - -int KeyValueStore::_omap_rmkeys(coll_t cid, const ghobject_t &hoid, - const set &keys, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << hoid << dendl; - - StripObjectMap::StripObjectHeaderRef header; - - int r = t.lookup_cached_header(cid, hoid, &header, false); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << hoid << " " - << " failed to get header: r = " << r << dendl; - return r; - } - - r = t.remove_buffer_keys(header, OBJECT_OMAP, keys); - - dout(10) << __func__ << " " << cid << "/" << hoid << " r = " << r << dendl; - return r; -} - -int KeyValueStore::_omap_rmkeyrange(coll_t cid, const ghobject_t &hoid, - const string& first, const string& last, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << hoid << " [" << first << "," - << last << "]" << dendl; - - set keys; - { - ObjectMap::ObjectMapIterator iter = get_omap_iterator(cid, hoid); - if (!iter) - return -ENOENT; - - for (iter->lower_bound(first); iter->valid() && iter->key() < last; - iter->next(false)) { - keys.insert(iter->key()); - } - } - return _omap_rmkeys(cid, hoid, keys, t); -} - -int KeyValueStore::_omap_setheader(coll_t cid, const ghobject_t &hoid, - const bufferlist &bl, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << hoid << dendl; - - map sets; - StripObjectMap::StripObjectHeaderRef header; - - int r = t.lookup_cached_header(cid, hoid, &header, false); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << hoid << " " - << " failed to get header: r = " << r << dendl; - return r; - } - - sets[OBJECT_OMAP_HEADER_KEY] = bl; - t.set_buffer_keys(header, OBJECT_OMAP_HEADER, sets); - return 0; -} - -int KeyValueStore::_split_collection(coll_t cid, uint32_t bits, uint32_t rem, - coll_t dest, BufferTransaction &t) -{ - { - dout(15) << __func__ << " " << cid << " bits: " << bits << dendl; - - StripObjectMap::StripObjectHeaderRef header; - - { - RWLock::RLocker l(collections_lock); - if (collections.count(cid) == 0) - return -ENOENT; - if (collections.count(dest) == 0) - return -ENOENT; - } - - vector objects; - ghobject_t next, current; - int move_size = 0; - while (1) { - collection_list(cid, current, ghobject_t::get_max(), true, - get_ideal_list_max(), &objects, &next); - - dout(20) << __func__ << cid << "objects size: " << objects.size() - << dendl; - - if (objects.empty()) - break; - - for (vector::iterator i = objects.begin(); - i != objects.end(); ++i) { - if (i->match(bits, rem)) { - if (_collection_move_rename(cid, *i, dest, *i, t) < 0) { - return -1; - } - move_size++; - } - } - - objects.clear(); - current = next; - } - - dout(20) << __func__ << "move" << move_size << " object from " << cid - << "to " << dest << dendl; - } - - if (g_conf->filestore_debug_verify_split) { - vector objects; - ghobject_t next; - while (1) { - collection_list(cid, next, ghobject_t::get_max(), true, - get_ideal_list_max(), &objects, &next); - if (objects.empty()) - break; - - for (vector::iterator i = objects.begin(); - i != objects.end(); ++i) { - dout(20) << __func__ << ": " << *i << " still in source " - << cid << dendl; - assert(!i->match(bits, rem)); - } - objects.clear(); - } - - next = ghobject_t(); - while (1) { - collection_list(dest, next, ghobject_t::get_max(), true, - get_ideal_list_max(), &objects, &next); - if (objects.empty()) - break; - - for (vector::iterator i = objects.begin(); - i != objects.end(); ++i) { - dout(20) << __func__ << ": " << *i << " now in dest " - << *i << dendl; - assert(i->match(bits, rem)); - } - objects.clear(); - } - } - return 0; -} - -int KeyValueStore::_set_alloc_hint(coll_t cid, const ghobject_t& oid, - uint64_t expected_object_size, - uint64_t expected_write_size, - BufferTransaction &t) -{ - dout(15) << __func__ << " " << cid << "/" << oid << " object_size " - << expected_object_size << " write_size " - << expected_write_size << dendl; - - int r = 0; - StripObjectMap::StripObjectHeaderRef header; - - r = t.lookup_cached_header(cid, oid, &header, false); - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oid - << " failed to get header: r = " << r << dendl; - return r; - } - - bool blank = true; - for (vector::iterator it = header->bits.begin(); - it != header->bits.end(); ++it) { - if (*it) { - blank = false; - break; - } - } - - // Now only consider to change "strip_size" when the object is blank, - // because set_alloc_hint is expected to be very lightweight - if (blank) { - // header->strip_size = MIN(expected_write_size, m_keyvaluestore_max_expected_write_size); - // dout(20) << __func__ << " hint " << header->strip_size << " success" << dendl; - } - - dout(10) << __func__ << "" << cid << "/" << oid << " object_size " - << expected_object_size << " write_size " - << expected_write_size << " = " << r << dendl; - - return r; -} - -const char** KeyValueStore::get_tracked_conf_keys() const -{ - static const char* KEYS[] = { - "keyvaluestore_queue_max_ops", - "keyvaluestore_queue_max_bytes", - "keyvaluestore_default_strip_size", - "keyvaluestore_dump_file", - NULL - }; - return KEYS; -} - -void KeyValueStore::handle_conf_change(const struct md_config_t *conf, - const std::set &changed) -{ - if (changed.count("keyvaluestore_queue_max_ops") || - changed.count("keyvaluestore_queue_max_bytes") || - changed.count("keyvaluestore_max_expected_write_size")) { - m_keyvaluestore_queue_max_ops = conf->keyvaluestore_queue_max_ops; - m_keyvaluestore_queue_max_bytes = conf->keyvaluestore_queue_max_bytes; - m_keyvaluestore_max_expected_write_size = conf->keyvaluestore_max_expected_write_size; - throttle_ops.reset_max(conf->keyvaluestore_queue_max_ops); - throttle_bytes.reset_max(conf->keyvaluestore_queue_max_bytes); - } - if (changed.count("keyvaluestore_default_strip_size")) { - m_keyvaluestore_strip_size = conf->keyvaluestore_default_strip_size; - default_strip_size = m_keyvaluestore_strip_size; - } - if (changed.count("keyvaluestore_dump_file")) { - if (conf->keyvaluestore_dump_file.length() && - conf->keyvaluestore_dump_file != "-") { - dump_start(conf->keyvaluestore_dump_file); - } else { - dump_stop(); - } - } -} - -int KeyValueStore::check_get_rc(const coll_t cid, const ghobject_t& oid, int r, bool is_equal_size) -{ - if (r < 0) { - dout(10) << __func__ << " " << cid << "/" << oid << " " - << " get rc = " << r << dendl; - } else if (!is_equal_size) { - dout(0) << __func__ << " broken header or missing data in backend " - << cid << "/" << oid << " get rc = " << r << dendl; - r = -EBADF; - } - return r; -} - -void KeyValueStore::dump_start(const std::string &file) -{ - dout(10) << "dump_start " << file << dendl; - if (m_keyvaluestore_do_dump) { - dump_stop(); - } - m_keyvaluestore_dump_fmt.reset(); - m_keyvaluestore_dump_fmt.open_array_section("dump"); - m_keyvaluestore_dump.open(file.c_str()); - m_keyvaluestore_do_dump = true; -} - -void KeyValueStore::dump_stop() -{ - dout(10) << "dump_stop" << dendl; - m_keyvaluestore_do_dump = false; - if (m_keyvaluestore_dump.is_open()) { - m_keyvaluestore_dump_fmt.close_section(); - m_keyvaluestore_dump_fmt.flush(m_keyvaluestore_dump); - m_keyvaluestore_dump.flush(); - m_keyvaluestore_dump.close(); - } -} -void KeyValueStore::dump_transactions(list& ls, uint64_t seq, OpSequencer *osr) -{ - m_keyvaluestore_dump_fmt.open_array_section("transactions"); - unsigned trans_num = 0; - for (list::iterator i = ls.begin(); i != ls.end(); ++i, ++trans_num) { - m_keyvaluestore_dump_fmt.open_object_section("transaction"); - m_keyvaluestore_dump_fmt.dump_string("osr", osr->get_name()); - m_keyvaluestore_dump_fmt.dump_unsigned("seq", seq); - m_keyvaluestore_dump_fmt.dump_unsigned("trans_num", trans_num); - (*i)->dump(&m_keyvaluestore_dump_fmt); - m_keyvaluestore_dump_fmt.close_section(); - } - m_keyvaluestore_dump_fmt.close_section(); - m_keyvaluestore_dump_fmt.flush(m_keyvaluestore_dump); - m_keyvaluestore_dump.flush(); -} - - -// -- KVSuperblock -- - -void KVSuperblock::encode(bufferlist &bl) const -{ - ENCODE_START(1, 1, bl); - compat_features.encode(bl); - ::encode(backend, bl); - ENCODE_FINISH(bl); -} - -void KVSuperblock::decode(bufferlist::iterator &bl) -{ - DECODE_START(1, bl); - compat_features.decode(bl); - ::decode(backend, bl); - DECODE_FINISH(bl); -} - -void KVSuperblock::dump(Formatter *f) const -{ - f->open_object_section("compat"); - compat_features.dump(f); - f->dump_string("backend", backend); - f->close_section(); -} - -void KVSuperblock::generate_test_instances(list& o) -{ - KVSuperblock z; - o.push_back(new KVSuperblock(z)); - CompatSet::FeatureSet feature_compat; - CompatSet::FeatureSet feature_ro_compat; - CompatSet::FeatureSet feature_incompat; - z.compat_features = CompatSet(feature_compat, feature_ro_compat, - feature_incompat); - o.push_back(new KVSuperblock(z)); - z.backend = "rocksdb"; - o.push_back(new KVSuperblock(z)); -} diff --git a/src/os/KeyValueStore.h b/src/os/KeyValueStore.h deleted file mode 100644 index 28e086ae818d..000000000000 --- a/src/os/KeyValueStore.h +++ /dev/null @@ -1,700 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2013 UnitedStack - * - * Author: Haomai Wang - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef CEPH_KEYVALUESTORE_H -#define CEPH_KEYVALUESTORE_H - -#include "include/types.h" - -#include -#include -#include -#include -using namespace std; - -#include "include/assert.h" - -#include "ObjectStore.h" - -#include "common/WorkQueue.h" -#include "common/Finisher.h" -#include "common/fd.h" - -#include "common/Mutex.h" -#include "GenericObjectMap.h" -#include "kv/KeyValueDB.h" -#include "common/random_cache.hpp" - -#include "include/uuid.h" - -static uint64_t default_strip_size = 1024; - -class StripObjectMap: public GenericObjectMap { - public: - - struct StripExtent { - uint64_t no; - uint64_t offset; // in key - uint64_t len; // in key - StripExtent(uint64_t n, uint64_t off, size_t len): - no(n), offset(off), len(len) {} - }; - - // -- strip object -- - struct StripObjectHeader { - // Persistent state - uint64_t strip_size; - uint64_t max_size; - vector bits; - - // soft state - Header header; // FIXME: Hold lock to avoid concurrent operations, it will - // also block read operation which not should be permitted. - coll_t cid; - ghobject_t oid; - bool updated; - bool deleted; - - StripObjectHeader(): strip_size(default_strip_size), max_size(0), updated(false), deleted(false) {} - - void encode(bufferlist &bl) const { - ENCODE_START(1, 1, bl); - ::encode(strip_size, bl); - ::encode(max_size, bl); - ::encode(bits, bl); - ENCODE_FINISH(bl); - } - - void decode(bufferlist::iterator &bl) { - DECODE_START(1, bl); - ::decode(strip_size, bl); - ::decode(max_size, bl); - ::decode(bits, bl); - DECODE_FINISH(bl); - } - }; - typedef ceph::shared_ptr StripObjectHeaderRef; - - static int file_to_extents(uint64_t offset, size_t len, uint64_t strip_size, - vector &extents); - int lookup_strip_header(const coll_t & cid, const ghobject_t &oid, - StripObjectHeaderRef *header); - int save_strip_header(StripObjectHeaderRef header, KeyValueDB::Transaction t); - int create_strip_header(const coll_t &cid, const ghobject_t &oid, - StripObjectHeaderRef *strip_header, - KeyValueDB::Transaction t); - void clone_wrap(StripObjectHeaderRef old_header, - const coll_t &cid, const ghobject_t &oid, - KeyValueDB::Transaction t, - StripObjectHeaderRef *target_header); - void rename_wrap(StripObjectHeaderRef old_header, const coll_t &cid, const ghobject_t &oid, - KeyValueDB::Transaction t, - StripObjectHeaderRef *new_header); - // Already hold header to avoid lock header seq again - int get_with_header( - const StripObjectHeaderRef header, - const string &prefix, - map *out - ); - - int get_values_with_header( - const StripObjectHeaderRef header, - const string &prefix, - const set &keys, - map *out - ); - int get_keys_with_header( - const StripObjectHeaderRef header, - const string &prefix, - set *keys - ); - - Mutex lock; - void invalidate_cache(const coll_t &c, const ghobject_t &oid) { - Mutex::Locker l(lock); - caches.clear(oid); - } - - RandomCache > caches; - StripObjectMap(KeyValueDB *db): GenericObjectMap(db), - lock("StripObjectMap::lock"), - caches(g_conf->keyvaluestore_header_cache_size) - {} -}; - - -class KVSuperblock { -public: - CompatSet compat_features; - string backend; - - KVSuperblock() { } - - void encode(bufferlist &bl) const; - void decode(bufferlist::iterator &bl); - void dump(Formatter *f) const; - static void generate_test_instances(list& o); -}; -WRITE_CLASS_ENCODER(KVSuperblock) - - -inline ostream& operator<<(ostream& out, const KVSuperblock& sb) -{ - return out << "sb(" << sb.compat_features << " " << sb.backend << ")"; -} - - -class KeyValueStore : public ObjectStore, - public md_config_obs_t { - public: - struct KVPerfTracker { - PerfCounters::avg_tracker os_commit_latency; - PerfCounters::avg_tracker os_apply_latency; - - objectstore_perf_stat_t get_cur_stats() const { - objectstore_perf_stat_t ret; - ret.filestore_commit_latency = os_commit_latency.avg(); - ret.filestore_apply_latency = os_apply_latency.avg(); - return ret; - } - - void update_from_perfcounters(PerfCounters &logger) { - os_commit_latency.consume_next( - logger.get_tavg_ms( - l_os_commit_lat)); - os_apply_latency.consume_next( - logger.get_tavg_ms( - l_os_apply_lat)); - } - - } perf_tracker; - - objectstore_perf_stat_t get_cur_stats() { - perf_tracker.update_from_perfcounters(*perf_logger); - return perf_tracker.get_cur_stats(); - } - - static const uint32_t target_version = 1; - - private: - string internal_name; // internal name, used to name the perfcounter instance - string basedir; - std::string current_fn; - uuid_d fsid; - - int fsid_fd, current_fd; - - deque snaps; - - // ObjectMap - boost::scoped_ptr backend; - - Finisher ondisk_finisher; - - RWLock collections_lock; - set collections; - - Mutex lock; - - int _create_current(); - - /// read a uuid from fd - int read_fsid(int fd, uuid_d *uuid); - - /// lock fsid_fd - int lock_fsid(); - - string strip_object_key(uint64_t no) { - char n[100]; - snprintf(n, 100, "%08lld", (long long)no); - return string(n); - } - - // Each transaction has side effect which may influent the following - // operations, we need to make it visible for the following within - // transaction by caching middle result. - // Side effects contains: - // 1. Creating/Deleting collection - // 2. Creating/Deleting object - // 3. Object modify(including omap, xattr) - // 4. Clone or rename - struct BufferTransaction { - typedef pair uniq_id; - - struct CollGhobjectPairBitwiseComparator { - bool operator()(const uniq_id& l, - const uniq_id& r) const { - if (l.first < r.first) - return true; - if (l.first != r.first) - return false; - if (cmp_bitwise(l.second, r.second) < 0) - return true; - return false; - } - }; - - typedef map StripHeaderMap; - - //Dirty records - StripHeaderMap strip_headers; - map< uniq_id, map, bufferlist>, - CollGhobjectPairBitwiseComparator> buffers; // pair(prefix, key),to buffer updated data in one transaction - - list finishes; - - KeyValueStore *store; - - KeyValueDB::Transaction t; - - void set_collections(const set& collections) { - bufferlist collections_bl; - ::encode(collections, collections_bl); - t->set("meta", "collections", collections_bl); - } - - int lookup_cached_header(const coll_t &cid, const ghobject_t &oid, - StripObjectMap::StripObjectHeaderRef *strip_header, - bool create_if_missing); - int get_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header, - const string &prefix, const set &keys, - map *out); - void set_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header, - const string &prefix, map &bl); - int remove_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header, - const string &prefix, const set &keys); - void clear_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header, - const string &prefix); - int clear_buffer(StripObjectMap::StripObjectHeaderRef strip_header); - void clone_buffer(StripObjectMap::StripObjectHeaderRef old_header, - const coll_t &cid, const ghobject_t &oid); - void rename_buffer(StripObjectMap::StripObjectHeaderRef old_header, - const coll_t &cid, const ghobject_t &oid); - int submit_transaction(); - - BufferTransaction(KeyValueStore *store): store(store) { - t = store->backend->get_transaction(); - } - - struct InvalidateCacheContext : public Context { - KeyValueStore *store; - const coll_t cid; - const ghobject_t oid; - InvalidateCacheContext(KeyValueStore *s, const coll_t &c, const ghobject_t &oid): store(s), cid(c), oid(oid) {} - void finish(int r) { - if (r == 0) - store->backend->invalidate_cache(cid, oid); - } - }; - }; - - // -- op workqueue -- - struct Op { - utime_t start; - uint64_t op; - list tls; - Context *ondisk, *onreadable, *onreadable_sync; - uint64_t ops, bytes; - TrackedOpRef osd_op; - }; - class OpSequencer : public Sequencer_impl { - Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock) - list q; - Cond cond; - list > flush_commit_waiters; - uint64_t op; // used by flush() to know the sequence of op - public: - Sequencer *parent; - Mutex apply_lock; // for apply mutual exclusion - - /// get_max_uncompleted - bool _get_max_uncompleted( - uint64_t *seq ///< [out] max uncompleted seq - ) { - assert(qlock.is_locked()); - assert(seq); - *seq = 0; - if (q.empty()) { - return true; - } else { - *seq = q.back()->op; - return false; - } - } /// @returns true if the queue is empty - - /// get_min_uncompleted - bool _get_min_uncompleted( - uint64_t *seq ///< [out] min uncompleted seq - ) { - assert(qlock.is_locked()); - assert(seq); - *seq = 0; - if (q.empty()) { - return true; - } else { - *seq = q.front()->op; - return false; - } - } /// @returns true if both queues are empty - - void _wake_flush_waiters(list *to_queue) { - uint64_t seq; - if (_get_min_uncompleted(&seq)) - seq = -1; - - for (list >::iterator i = - flush_commit_waiters.begin(); - i != flush_commit_waiters.end() && i->first < seq; - flush_commit_waiters.erase(i++)) { - to_queue->push_back(i->second); - } - } - - void queue(Op *o) { - Mutex::Locker l(qlock); - q.push_back(o); - op++; - o->op = op; - } - Op *peek_queue() { - assert(apply_lock.is_locked()); - return q.front(); - } - - Op *dequeue(list *to_queue) { - assert(to_queue); - assert(apply_lock.is_locked()); - Mutex::Locker l(qlock); - Op *o = q.front(); - q.pop_front(); - cond.Signal(); - - _wake_flush_waiters(to_queue); - return o; - } - - void flush() { - Mutex::Locker l(qlock); - - // get max for journal _or_ op queues - uint64_t seq = 0; - if (!q.empty()) - seq = q.back()->op; - - if (seq) { - // everything prior to our watermark to drain through either/both - // queues - while (!q.empty() && q.front()->op <= seq) - cond.Wait(qlock); - } - } - bool flush_commit(Context *c) { - Mutex::Locker l(qlock); - uint64_t seq = 0; - if (_get_max_uncompleted(&seq)) { - return true; - } else { - flush_commit_waiters.push_back(make_pair(seq, c)); - return false; - } - } - - OpSequencer() - : qlock("KeyValueStore::OpSequencer::qlock", false, false), - op(0), parent(0), - apply_lock("KeyValueStore::OpSequencer::apply_lock", false, false) {} - ~OpSequencer() { - assert(q.empty()); - } - - const string& get_name() const { - return parent->get_name(); - } - }; - - friend ostream& operator<<(ostream& out, const OpSequencer& s); - - deque op_queue; - Throttle throttle_ops, throttle_bytes; - Finisher op_finisher; - - ThreadPool op_tp; - struct OpWQ : public ThreadPool::WorkQueue { - KeyValueStore *store; - OpWQ(KeyValueStore *fs, time_t timeout, time_t suicide_timeout, - ThreadPool *tp) : - ThreadPool::WorkQueue("KeyValueStore::OpWQ", - timeout, suicide_timeout, tp), - store(fs) {} - - bool _enqueue(OpSequencer *osr) { - store->op_queue.push_back(osr); - return true; - } - void _dequeue(OpSequencer *o) { - assert(0); - } - bool _empty() { - return store->op_queue.empty(); - } - OpSequencer *_dequeue() { - if (store->op_queue.empty()) - return NULL; - OpSequencer *osr = store->op_queue.front(); - store->op_queue.pop_front(); - return osr; - } - using ThreadPool::WorkQueue::_process; - void _process(OpSequencer *osr, ThreadPool::TPHandle &handle) { - store->_do_op(osr, handle); - } - void _process_finish(OpSequencer *osr) { - store->_finish_op(osr); - } - void _clear() { - assert(store->op_queue.empty()); - } - } op_wq; - - Op *build_op(list& tls, Context *ondisk, Context *onreadable, - Context *onreadable_sync, TrackedOpRef osd_op); - void queue_op(OpSequencer *osr, Op *o); - void op_queue_reserve_throttle(Op *o, ThreadPool::TPHandle *handle = NULL); - void _do_op(OpSequencer *osr, ThreadPool::TPHandle &handle); - void op_queue_release_throttle(Op *o); - void _finish_op(OpSequencer *osr); - - PerfCounters *perf_logger; - - public: - - KeyValueStore(const std::string &base, - const char *internal_name = "keyvaluestore", - bool update_to=false); - ~KeyValueStore(); - - bool test_mount_in_use(); - int version_stamp_is_valid(uint32_t *version); - int update_version_stamp(); - uint32_t get_target_version() { - return target_version; - } - - int write_version_stamp(); - int mount(); - int umount(); - unsigned get_max_object_name_length() { - return 4096; // no real limit for leveldb - } - unsigned get_max_attr_name_length() { - return 256; // arbitrary; there is no real limit internally - } - int mkfs(); - int mkjournal() {return 0;} - bool wants_journal() { - return false; - } - bool allows_journal() { - return false; - } - bool needs_journal() { - return false; - } - - void collect_metadata(map *pm); - - int statfs(struct statfs *buf); - - int _do_transactions( - list &tls, uint64_t op_seq, - ThreadPool::TPHandle *handle); - int do_transactions(list &tls, uint64_t op_seq) { - return _do_transactions(tls, op_seq, 0); - } - unsigned _do_transaction(Transaction& transaction, - BufferTransaction &bt, - ThreadPool::TPHandle *handle); - - int queue_transactions(Sequencer *osr, list& tls, - TrackedOpRef op = TrackedOpRef(), - ThreadPool::TPHandle *handle = NULL); - - - // ------------------ - // objects - - int _generic_read(StripObjectMap::StripObjectHeaderRef header, - uint64_t offset, size_t len, bufferlist& bl, - bool allow_eio = false, BufferTransaction *bt = 0); - int _generic_write(StripObjectMap::StripObjectHeaderRef header, - uint64_t offset, size_t len, const bufferlist& bl, - BufferTransaction &t, uint32_t fadvise_flags = 0); - - bool exists(coll_t cid, const ghobject_t& oid); - int stat(coll_t cid, const ghobject_t& oid, struct stat *st, - bool allow_eio = false); - int read(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, - bufferlist& bl, uint32_t op_flags = 0, bool allow_eio = false); - int fiemap(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, - bufferlist& bl); - - int _touch(coll_t cid, const ghobject_t& oid, BufferTransaction &t); - int _write(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, - const bufferlist& bl, BufferTransaction &t, uint32_t fadvise_flags = 0); - int _zero(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, - BufferTransaction &t); - int _truncate(coll_t cid, const ghobject_t& oid, uint64_t size, - BufferTransaction &t); - int _clone(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid, - BufferTransaction &t); - int _clone_range(coll_t cid, const ghobject_t& oldoid, - const ghobject_t& newoid, uint64_t srcoff, - uint64_t len, uint64_t dstoff, BufferTransaction &t); - int _remove(coll_t cid, const ghobject_t& oid, BufferTransaction &t); - int _set_alloc_hint(coll_t cid, const ghobject_t& oid, - uint64_t expected_object_size, - uint64_t expected_write_size, - BufferTransaction &t); - - void start_sync() {} - - void set_fsid(uuid_d u) { fsid = u; } - uuid_d get_fsid() { return fsid; } - - // attrs - int getattr(coll_t cid, const ghobject_t& oid, const char *name, - bufferptr &bp); - int getattrs(coll_t cid, const ghobject_t& oid, map& aset); - - int _setattrs(coll_t cid, const ghobject_t& oid, - map& aset, BufferTransaction &t); - int _rmattr(coll_t cid, const ghobject_t& oid, const char *name, - BufferTransaction &t); - int _rmattrs(coll_t cid, const ghobject_t& oid, BufferTransaction &t); - - // collections - int _collection_hint_expected_num_objs(coll_t cid, uint32_t pg_num, - uint64_t num_objs) const { return 0; } - int _create_collection(coll_t c, BufferTransaction &t); - int _destroy_collection(coll_t c, BufferTransaction &t); - int _collection_add(coll_t c, coll_t ocid, const ghobject_t& oid, - BufferTransaction &t); - int _collection_move_rename(coll_t oldcid, const ghobject_t& oldoid, - coll_t c, const ghobject_t& o, - BufferTransaction &t); - int _collection_remove_recursive(const coll_t &cid, - BufferTransaction &t); - int list_collections(vector& ls); - bool collection_exists(coll_t c); - bool collection_empty(coll_t c); - int collection_list(coll_t c, ghobject_t start, ghobject_t end, - bool sort_bitwise, int max, - vector *ls, ghobject_t *next); - int collection_version_current(coll_t c, uint32_t *version); - - // omap (see ObjectStore.h for documentation) - int omap_get(coll_t c, const ghobject_t &oid, bufferlist *header, - map *out); - int omap_get_header( - coll_t c, - const ghobject_t &oid, - bufferlist *out, - bool allow_eio = false); - int omap_get_keys(coll_t c, const ghobject_t &oid, set *keys); - int omap_get_values(coll_t c, const ghobject_t &oid, const set &keys, - map *out); - int omap_check_keys(coll_t c, const ghobject_t &oid, const set &keys, - set *out); - ObjectMap::ObjectMapIterator get_omap_iterator(coll_t c, - const ghobject_t &oid); - - int check_get_rc(const coll_t cid, const ghobject_t& oid, int r, bool is_equal_size); - void dump_start(const std::string &file); - void dump_stop(); - void dump_transactions(list& ls, uint64_t seq, - OpSequencer *osr); - - private: - void _inject_failure() {} - - // omap - int _omap_clear(coll_t cid, const ghobject_t &oid, - BufferTransaction &t); - int _omap_setkeys(coll_t cid, const ghobject_t &oid, - map &aset, - BufferTransaction &t); - int _omap_rmkeys(coll_t cid, const ghobject_t &oid, const set &keys, - BufferTransaction &t); - int _omap_rmkeyrange(coll_t cid, const ghobject_t &oid, - const string& first, const string& last, - BufferTransaction &t); - int _omap_setheader(coll_t cid, const ghobject_t &oid, const bufferlist &bl, - BufferTransaction &t); - int _split_collection(coll_t cid, uint32_t bits, uint32_t rem, coll_t dest, - BufferTransaction &t); - int _split_collection_create(coll_t cid, uint32_t bits, uint32_t rem, - coll_t dest, BufferTransaction &t){ - return 0; - } - - virtual const char** get_tracked_conf_keys() const; - virtual void handle_conf_change(const struct md_config_t *conf, - const std::set &changed); - - std::string m_osd_rollback_to_cluster_snap; - int m_keyvaluestore_queue_max_ops; - int m_keyvaluestore_queue_max_bytes; - int m_keyvaluestore_strip_size; - uint64_t m_keyvaluestore_max_expected_write_size; - int do_update; - bool m_keyvaluestore_do_dump; - std::ofstream m_keyvaluestore_dump; - JSONFormatter m_keyvaluestore_dump_fmt; - - static const string OBJECT_STRIP_PREFIX; - static const string OBJECT_XATTR; - static const string OBJECT_OMAP; - static const string OBJECT_OMAP_HEADER; - static const string OBJECT_OMAP_HEADER_KEY; - static const string COLLECTION; - static const string COLLECTION_ATTR; - static const uint32_t COLLECTION_VERSION = 1; - - KVSuperblock superblock; - /** - * write_superblock() - * - * Write superblock to persisent storage - * - * return value: 0 on success, otherwise negative errno - */ - int write_superblock(); - - /** - * read_superblock() - * - * Fill in KeyValueStore::superblock by reading persistent storage - * - * return value: 0 on success, otherwise negative errno - */ - int read_superblock(); -}; - -WRITE_CLASS_ENCODER(StripObjectMap::StripObjectHeader) - -#endif diff --git a/src/os/Makefile.am b/src/os/Makefile.am index 7aea87ccfb82..abba3fcbd4d9 100644 --- a/src/os/Makefile.am +++ b/src/os/Makefile.am @@ -28,11 +28,11 @@ libos_a_SOURCES = \ os/filestore/LFNIndex.cc \ os/filestore/WBThrottle.cc \ os/fs/FS.cc \ + os/keyvaluestore/GenericObjectMap.cc \ + os/keyvaluestore/KeyValueStore.cc \ os/kstore/kv.cc \ os/kstore/KStore.cc \ os/memstore/MemStore.cc \ - os/GenericObjectMap.cc \ - os/KeyValueStore.cc \ os/ObjectStore.cc if LINUX @@ -88,13 +88,13 @@ noinst_HEADERS += \ os/filestore/ZFSFileStoreBackend.h os/fs/FS.h \ os/fs/XFS.h \ + os/keyvaluestore/GenericObjectMap.h \ + os/keyvaluestore/KeyValueStore.h \ os/kstore/kstore_types.h \ os/kstore/KStore.h \ os/kstore/kv.h \ os/memstore/MemStore.h \ os/memstore/PageSet.h \ - os/GenericObjectMap.h \ - os/KeyValueStore.h \ os/ObjectMap.h \ os/ObjectStore.h diff --git a/src/os/ObjectStore.cc b/src/os/ObjectStore.cc index e347eb9b51b8..5a2ae9ac2255 100644 --- a/src/os/ObjectStore.cc +++ b/src/os/ObjectStore.cc @@ -20,7 +20,7 @@ #include "filestore/FileStore.h" #include "memstore/MemStore.h" -#include "KeyValueStore.h" +#include "keyvaluestore/KeyValueStore.h" #if defined(HAVE_LIBAIO) #include "bluestore/BlueStore.h" #endif diff --git a/src/os/keyvaluestore/GenericObjectMap.cc b/src/os/keyvaluestore/GenericObjectMap.cc new file mode 100644 index 000000000000..3453bc042be0 --- /dev/null +++ b/src/os/keyvaluestore/GenericObjectMap.cc @@ -0,0 +1,1127 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 UnitedStack + * + * Author: Haomai Wang + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/int_types.h" +#include "include/buffer.h" + +#include +#include +#include +#include +#include + +#include + +#include "GenericObjectMap.h" +#include "common/debug.h" +#include "common/config.h" +#include "include/assert.h" + +#define dout_subsys ceph_subsys_keyvaluestore + +const string GenericObjectMap::GLOBAL_STATE_KEY = "HEADER"; + +const string GenericObjectMap::USER_PREFIX = "_SEQ_"; +const string GenericObjectMap::INTERN_PREFIX = "_INTERN_"; +const string GenericObjectMap::COMPLETE_PREFIX = "_COMPLETE_"; +const string GenericObjectMap::GHOBJECT_TO_SEQ_PREFIX = "_GHOBJTOSEQ_"; +const string GenericObjectMap::PARENT_KEY = "_PARENT_HEADER_"; + +// In order to make right ordering for leveldb matching with hobject_t, +// so use "!" to separated +const string GenericObjectMap::GHOBJECT_KEY_SEP_S = "!"; +const char GenericObjectMap::GHOBJECT_KEY_SEP_C = '!'; +const char GenericObjectMap::GHOBJECT_KEY_ENDING = 0xFF; + +// ============== GenericObjectMap Key Function ================= + +static void append_escaped(const string &in, string *out) +{ + for (string::const_iterator i = in.begin(); i != in.end(); ++i) { + if (*i == '%') { + out->push_back('%'); + out->push_back('p'); + } else if (*i == '.') { + out->push_back('%'); + out->push_back('e'); + } else if (*i == GenericObjectMap::GHOBJECT_KEY_SEP_C) { + out->push_back('%'); + out->push_back('u'); + } else if (*i == '!') { + out->push_back('%'); + out->push_back('s'); + } else { + out->push_back(*i); + } + } +} + +static bool append_unescaped(string::const_iterator begin, + string::const_iterator end, + string *out) +{ + for (string::const_iterator i = begin; i != end; ++i) { + if (*i == '%') { + ++i; + if (*i == 'p') + out->push_back('%'); + else if (*i == 'e') + out->push_back('.'); + else if (*i == 'u') + out->push_back(GenericObjectMap::GHOBJECT_KEY_SEP_C); + else if (*i == 's') + out->push_back('!'); + else + return false; + } else { + out->push_back(*i); + } + } + return true; +} + +string GenericObjectMap::header_key(const coll_t &cid) +{ + string full_name; + + append_escaped(cid.to_str(), &full_name); + full_name.append(GHOBJECT_KEY_SEP_S); + return full_name; +} + +string GenericObjectMap::header_key(const coll_t &cid, const ghobject_t &oid) +{ + string full_name; + + append_escaped(cid.to_str(), &full_name); + full_name.append(GHOBJECT_KEY_SEP_S); + + char buf[PATH_MAX]; + char *t; + char *end; + + // make field ordering match with ghobject_t compare operations + t = buf; + end = t + sizeof(buf); + if (oid.shard_id == shard_id_t::NO_SHARD) { + // otherwise ff will sort *after* 0, not before. + full_name += "--"; + } else { + t += snprintf(t, end - t, "%02x", (int)oid.shard_id); + full_name += string(buf); + } + full_name.append(GHOBJECT_KEY_SEP_S); + + t = buf; + t += snprintf(t, end - t, "%016llx", + (long long)(oid.hobj.pool + 0x8000000000000000)); + full_name += string(buf); + full_name.append(GHOBJECT_KEY_SEP_S); + + t = buf; + snprintf(t, end - t, "%.*X", (int)(sizeof(oid.hobj.get_hash())*2), + (uint32_t)oid.hobj.get_bitwise_key_u32()); + full_name += string(buf); + full_name.append(GHOBJECT_KEY_SEP_S); + + append_escaped(oid.hobj.nspace, &full_name); + full_name.append(GHOBJECT_KEY_SEP_S); + + append_escaped(oid.hobj.get_key(), &full_name); + full_name.append(GHOBJECT_KEY_SEP_S); + + append_escaped(oid.hobj.oid.name, &full_name); + full_name.append(GHOBJECT_KEY_SEP_S); + + t = buf; + if (oid.hobj.snap == CEPH_NOSNAP) + t += snprintf(t, end - t, "head"); + else if (oid.hobj.snap == CEPH_SNAPDIR) + t += snprintf(t, end - t, "snapdir"); + else + // Keep length align + t += snprintf(t, end - t, "%016llx", (long long unsigned)oid.hobj.snap); + full_name += string(buf); + + if (oid.generation != ghobject_t::NO_GEN) { + full_name.append(GHOBJECT_KEY_SEP_S); + + t = buf; + end = t + sizeof(buf); + t += snprintf(t, end - t, "%016llx", (long long unsigned)oid.generation); + full_name += string(buf); + } + + full_name.append(1, GHOBJECT_KEY_ENDING); + + return full_name; +} + +bool GenericObjectMap::parse_header_key(const string &long_name, + coll_t *out_coll, ghobject_t *out) +{ + string coll; + string name; + string key; + string ns; + uint32_t hash; + snapid_t snap; + int64_t pool; + gen_t generation = ghobject_t::NO_GEN; + shard_id_t shard_id = shard_id_t::NO_SHARD; + + string::const_iterator current = long_name.begin(); + string::const_iterator end; + + for (end = current; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ; + if (!append_unescaped(current, end, &coll)) + return false; + + current = ++end; + for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ; + if (end == long_name.end()) + return false; + string shardstring = string(current, end); + if (shardstring == "--") + shard_id = shard_id_t::NO_SHARD; + else + shard_id = (shard_id_t)strtoul(shardstring.c_str(), NULL, 16); + + current = ++end; + for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ; + if (end == long_name.end()) + return false; + string pstring(current, end); + pool = strtoull(pstring.c_str(), NULL, 16); + pool -= 0x8000000000000000; + + current = ++end; + for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ; + if (end == long_name.end()) + return false; + string hash_str(current, end); + sscanf(hash_str.c_str(), "%X", &hash); + + current = ++end; + for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ; + if (end == long_name.end()) + return false; + if (!append_unescaped(current, end, &ns)) + return false; + + current = ++end; + for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ; + if (end == long_name.end()) + return false; + if (!append_unescaped(current, end, &key)) + return false; + + current = ++end; + for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C; ++end) ; + if (end == long_name.end()) + return false; + if (!append_unescaped(current, end, &name)) + return false; + + current = ++end; + for ( ; end != long_name.end() && *end != GHOBJECT_KEY_SEP_C && + *end != GHOBJECT_KEY_ENDING; ++end) ; + if (end == long_name.end()) + return false; + string snap_str(current, end); + if (snap_str == "head") + snap = CEPH_NOSNAP; + else if (snap_str == "snapdir") + snap = CEPH_SNAPDIR; + else + snap = strtoull(snap_str.c_str(), NULL, 16); + + // Optional generation/shard_id + string genstring; + if (*end == GHOBJECT_KEY_SEP_C) { + current = ++end; + for ( ; end != long_name.end() && *end != GHOBJECT_KEY_ENDING; ++end) ; + if (end != long_name.end()) + return false; + genstring = string(current, end); + generation = (gen_t)strtoull(genstring.c_str(), NULL, 16); + } + + if (out) { + (*out) = ghobject_t(hobject_t(name, key, snap, + hobject_t::_reverse_bits(hash), + (int64_t)pool, ns), + generation, shard_id); + } + + if (out_coll) { + bool valid = out_coll->parse(coll); + assert(valid); + } + + return true; +} + + +// ============== GenericObjectMap Prefix ================= + +string GenericObjectMap::user_prefix(Header header, const string &prefix) +{ + return USER_PREFIX + seq_key(header->seq) + prefix; +} + +string GenericObjectMap::complete_prefix(Header header) +{ + return INTERN_PREFIX + seq_key(header->seq) + COMPLETE_PREFIX; +} + +string GenericObjectMap::parent_seq_prefix(uint64_t seq) +{ + return INTERN_PREFIX + seq_key(seq) + PARENT_KEY; +} + + +// ============== GenericObjectMapIteratorImpl ================= + +int GenericObjectMap::GenericObjectMapIteratorImpl::init() +{ + invalid = false; + if (ready) { + return 0; + } + + assert(!parent_iter); + if (header->parent) { + Header parent = map->lookup_parent(header); + if (!parent) { + assert(0); + return -EINVAL; + } + parent_iter.reset(new GenericObjectMapIteratorImpl(map, parent, prefix)); + } + + key_iter = map->db->get_iterator(map->user_prefix(header, prefix)); + assert(key_iter); + complete_iter = map->db->get_iterator(map->complete_prefix(header)); + assert(complete_iter); + cur_iter = key_iter; + assert(cur_iter); + ready = true; + return 0; +} + +ObjectMap::ObjectMapIterator GenericObjectMap::get_iterator( + const coll_t &cid, const ghobject_t &oid, const string &prefix) +{ + Header header = lookup_header(cid, oid); + if (!header) + return ObjectMap::ObjectMapIterator(new EmptyIteratorImpl()); + return _get_iterator(header, prefix); +} + +int GenericObjectMap::GenericObjectMapIteratorImpl::seek_to_first() +{ + init(); + r = 0; + if (parent_iter) { + r = parent_iter->seek_to_first(); + if (r < 0) + return r; + } + r = key_iter->seek_to_first(); + if (r < 0) + return r; + return adjust(); +} + +int GenericObjectMap::GenericObjectMapIteratorImpl::seek_to_last() +{ + init(); + r = 0; + if (parent_iter) { + r = parent_iter->seek_to_last(); + if (r < 0) + return r; + if (parent_iter->valid()) + r = parent_iter->next(); + if (r < 0) + return r; + } + r = key_iter->seek_to_last(); + if (r < 0) + return r; + if (key_iter->valid()) + r = key_iter->next(); + if (r < 0) + return r; + return adjust(); +} + +int GenericObjectMap::GenericObjectMapIteratorImpl::lower_bound(const string &to) +{ + init(); + r = 0; + if (parent_iter) { + r = parent_iter->lower_bound(to); + if (r < 0) + return r; + } + r = key_iter->lower_bound(to); + if (r < 0) + return r; + return adjust(); +} + +int GenericObjectMap::GenericObjectMapIteratorImpl::upper_bound(const string &after) +{ + init(); + r = 0; + if (parent_iter) { + r = parent_iter->upper_bound(after); + if (r < 0) + return r; + } + r = key_iter->upper_bound(after); + if (r < 0) + return r; + return adjust(); +} + +bool GenericObjectMap::GenericObjectMapIteratorImpl::valid() +{ + bool valid = !invalid && ready; + assert(!valid || cur_iter->valid()); + return valid; +} + +bool GenericObjectMap::GenericObjectMapIteratorImpl::valid_parent() +{ + if (parent_iter && parent_iter->valid() && + (!key_iter->valid() || key_iter->key() > parent_iter->key())) + return true; + return false; +} + +int GenericObjectMap::GenericObjectMapIteratorImpl::next(bool validate) +{ + assert(cur_iter->valid()); + assert(valid()); + cur_iter->next(); + return adjust(); +} + +int GenericObjectMap::GenericObjectMapIteratorImpl::next_parent() +{ + if (!parent_iter || !parent_iter->valid()) { + invalid = true; + return 0; + } + r = next(); + if (r < 0) + return r; + if (!valid() || on_parent() || !parent_iter->valid()) + return 0; + + return lower_bound(parent_iter->key()); +} + +int GenericObjectMap::GenericObjectMapIteratorImpl::in_complete_region( + const string &to_test, string *begin, string *end) +{ + complete_iter->upper_bound(to_test); + if (complete_iter->valid()) + complete_iter->prev(); + else + complete_iter->seek_to_last(); + + if (!complete_iter->valid()) + return false; + + string _end; + if (begin) + *begin = complete_iter->key(); + _end = string(complete_iter->value().c_str()); + if (end) + *end = _end; + return (to_test >= complete_iter->key()) && (!_end.size() || _end > to_test); +} + +/** + * Moves parent_iter to the next position both out of the complete_region and + * not equal to key_iter. Then, we set cur_iter to parent_iter if valid and + * less than key_iter and key_iter otherwise. + */ +int GenericObjectMap::GenericObjectMapIteratorImpl::adjust() +{ + string begin, end; + while (parent_iter && parent_iter->valid()) { + if (in_complete_region(parent_iter->key(), &begin, &end)) { + if (end.size() == 0) { + parent_iter->seek_to_last(); + if (parent_iter->valid()) + parent_iter->next(); + } else { + parent_iter->lower_bound(end); + } + } else if (key_iter->valid() && key_iter->key() == parent_iter->key()) { + parent_iter->next(); + } else { + break; + } + } + if (valid_parent()) { + cur_iter = parent_iter; + } else if (key_iter->valid()) { + cur_iter = key_iter; + } else { + invalid = true; + } + assert(invalid || cur_iter->valid()); + return 0; +} + +string GenericObjectMap::GenericObjectMapIteratorImpl::key() +{ + return cur_iter->key(); +} + +bufferlist GenericObjectMap::GenericObjectMapIteratorImpl::value() +{ + return cur_iter->value(); +} + +int GenericObjectMap::GenericObjectMapIteratorImpl::status() +{ + return r; +} + + +// ============== GenericObjectMap Public API ================= + +void GenericObjectMap::set_keys(const Header header, + const string &prefix, + const map &set, + KeyValueDB::Transaction t) +{ + t->set(user_prefix(header, prefix), set); +} + +int GenericObjectMap::clear(const Header header, + KeyValueDB::Transaction t) +{ + remove_header(header->cid, header->oid, header, t); + assert(header->num_children > 0); + header->num_children--; + int r = _clear(header, t); + if (r < 0) + return r; + return 0; +} + +int GenericObjectMap::rm_keys(const Header header, + const string &prefix, + const set &buffered_keys, + const set &to_clear, + KeyValueDB::Transaction t) +{ + t->rmkeys(user_prefix(header, prefix), to_clear); + if (!header->parent) { + return 0; + } + + // Copy up keys from parent around to_clear + int keep_parent; + { + GenericObjectMapIterator iter = _get_iterator(header, prefix); + iter->seek_to_first(); + map new_complete; + map to_write; + for(set::const_iterator i = to_clear.begin(); + i != to_clear.end(); ) { + unsigned copied = 0; + iter->lower_bound(*i); + ++i; + if (!iter->valid()) + break; + string begin = iter->key(); + if (!iter->on_parent()) + iter->next_parent(); + if (new_complete.size() && new_complete.rbegin()->second == begin) { + begin = new_complete.rbegin()->first; + } + while (iter->valid() && copied < 20) { + if (!to_clear.count(iter->key()) && !buffered_keys.count(iter->key())) + to_write[iter->key()].append(iter->value()); + if (i != to_clear.end() && *i <= iter->key()) { + ++i; + copied = 0; + } + + iter->next_parent(); + copied++; + } + if (iter->valid()) { + new_complete[begin] = iter->key(); + } else { + new_complete[begin] = ""; + break; + } + } + t->set(user_prefix(header, prefix), to_write); + merge_new_complete(header, new_complete, iter, t); + keep_parent = need_parent(iter); + if (keep_parent < 0) + return keep_parent; + } + + if (!keep_parent) { + Header parent = lookup_parent(header); + if (!parent) + return -EINVAL; + parent->num_children--; + _clear(parent, t); + header->parent = 0; + set_header(header->cid, header->oid, *header, t); + t->rmkeys_by_prefix(complete_prefix(header)); + } + + return 0; +} + +int GenericObjectMap::get(const coll_t &cid, const ghobject_t &oid, + const string &prefix, + map *out) +{ + Header header = lookup_header(cid, oid); + if (!header) + return -ENOENT; + + ObjectMap::ObjectMapIterator iter = _get_iterator(header, prefix); + for (iter->seek_to_first(); iter->valid(); iter->next()) { + if (iter->status()) + return iter->status(); + out->insert(make_pair(iter->key(), iter->value())); + } + + return 0; +} + +int GenericObjectMap::get_keys(const coll_t &cid, const ghobject_t &oid, + const string &prefix, + set *keys) +{ + Header header = lookup_header(cid, oid); + if (!header) + return -ENOENT; + + ObjectMap::ObjectMapIterator iter = _get_iterator(header, prefix); + for (iter->seek_to_first(); iter->valid(); iter->next()) { + if (iter->status()) + return iter->status(); + keys->insert(iter->key()); + } + return 0; +} + +int GenericObjectMap::get_values(const coll_t &cid, const ghobject_t &oid, + const string &prefix, + const set &keys, + map *out) +{ + Header header = lookup_header(cid, oid); + if (!header) + return -ENOENT; + return scan(header, prefix, keys, 0, out); +} + +int GenericObjectMap::check_keys(const coll_t &cid, const ghobject_t &oid, + const string &prefix, + const set &keys, + set *out) +{ + Header header = lookup_header(cid, oid); + if (!header) + return -ENOENT; + return scan(header, prefix, keys, out, 0); +} + +void GenericObjectMap::clone(const Header parent, const coll_t &cid, + const ghobject_t &target, + KeyValueDB::Transaction t, + Header *old_header, Header *new_header) +{ + { + Header destination = lookup_header(cid, target); + if (destination) { + remove_header(cid, target, destination, t); + destination->num_children--; + _clear(destination, t); + } + } + + Header source = generate_new_header(parent->cid, parent->oid, parent, t); + Header destination = generate_new_header(cid, target, parent, t); + + destination->data = parent->data; + source->data = parent->data; + + parent->num_children = 2; + set_parent_header(parent, t); + set_header(parent->cid, parent->oid, *source, t); + set_header(cid, target, *destination, t); + + if (new_header) + *old_header = source; + if (new_header) + *new_header = destination; + + // Clone will set parent header and rm_keys wll lookup_parent which will try + // to find parent header. So it will let lookup_parent fail when "clone" and + // "rm_keys" in one transaction. Here have to sync transaction to make + // visiable for lookup_parent + // FIXME: Clear transaction operations here + int r = submit_transaction_sync(t); + assert(r == 0); +} + +void GenericObjectMap::rename(const Header old_header, const coll_t &cid, + const ghobject_t &target, + KeyValueDB::Transaction t) +{ + if (old_header->oid == target && old_header->cid == cid) + return ; + + remove_header(old_header->cid, old_header->oid, old_header, t); + old_header->cid = cid; + old_header->oid = target; + set_header(cid, target, *old_header, t); +} + +int GenericObjectMap::init(bool do_upgrade) +{ + map result; + set to_get; + to_get.insert(GLOBAL_STATE_KEY); + int r = db->get(INTERN_PREFIX, to_get, &result); + if (r < 0) + return r; + if (!result.empty()) { + bufferlist::iterator bliter = result.begin()->second.begin(); + state.decode(bliter); + if (state.v < 1) { // Needs upgrade + if (!do_upgrade) { + dout(1) << "GenericObjbectMap requires an upgrade," + << " set filestore_update_to" + << dendl; + return -ENOTSUP; + } else { + r = upgrade(); + if (r < 0) + return r; + } + } + } else { + // New store + state.v = 1; + state.seq = 1; + } + dout(20) << "(init)genericobjectmap: seq is " << state.seq << dendl; + return 0; +} + +bool GenericObjectMap::check(std::ostream &out) +{ + bool retval = true; + map parent_to_num_children; + map parent_to_actual_num_children; + KeyValueDB::Iterator iter = db->get_iterator(GHOBJECT_TO_SEQ_PREFIX); + + for (iter->seek_to_first(); iter->valid(); iter->next()) { + _Header header; + assert(header.num_children == 1); + header.num_children = 0; // Hack for leaf node + bufferlist bl = iter->value(); + while (true) { + bufferlist::iterator bliter = bl.begin(); + header.decode(bliter); + if (header.seq != 0) + parent_to_actual_num_children[header.seq] = header.num_children; + if (header.parent == 0) + break; + + if (!parent_to_num_children.count(header.parent)) + parent_to_num_children[header.parent] = 0; + parent_to_num_children[header.parent]++; + if (parent_to_actual_num_children.count(header.parent)) + break; + + set to_get; + map got; + to_get.insert(PARENT_KEY); + db->get(parent_seq_prefix(header.parent), to_get, &got); + if (got.empty()) { + out << "Missing: seq " << header.parent << std::endl; + retval = false; + break; + } else { + bl = got.begin()->second; + } + } + } + + for (map::iterator i = parent_to_num_children.begin(); + i != parent_to_num_children.end(); + parent_to_num_children.erase(i++)) { + if (!parent_to_actual_num_children.count(i->first)) + continue; + if (parent_to_actual_num_children[i->first] != i->second) { + out << "Invalid: seq " << i->first << " recorded children: " + << parent_to_actual_num_children[i->first] << " found: " + << i->second << std::endl; + retval = false; + } + parent_to_actual_num_children.erase(i->first); + } + return retval; +} + + +// ============== GenericObjectMap Intern Implementation ================= + +int GenericObjectMap::scan(Header header, + const string &prefix, + const set &in_keys, + set *out_keys, + map *out_values) +{ + ObjectMap::ObjectMapIterator db_iter = _get_iterator(header, prefix); + for (set::const_iterator key_iter = in_keys.begin(); + key_iter != in_keys.end(); + ++key_iter) { + db_iter->lower_bound(*key_iter); + if (db_iter->status()) + return db_iter->status(); + + if (db_iter->valid() && db_iter->key() == *key_iter) { + if (out_keys) + out_keys->insert(*key_iter); + if (out_values) + out_values->insert(make_pair(db_iter->key(), db_iter->value())); + } + } + return 0; +} + +int GenericObjectMap::_clear(Header header, KeyValueDB::Transaction t) +{ + while (1) { + if (header->num_children) { + set_parent_header(header, t); + break; + } + + clear_header(header, t); + if (!header->parent) + break; + + Header parent = lookup_parent(header); + if (!parent) { + return -EINVAL; + } + assert(parent->num_children > 0); + parent->num_children--; + header.swap(parent); + } + return 0; +} + +int GenericObjectMap::merge_new_complete( + Header header, const map &new_complete, + GenericObjectMapIterator iter, KeyValueDB::Transaction t) +{ + KeyValueDB::Iterator complete_iter = db->get_iterator( + complete_prefix(header)); + map::const_iterator i = new_complete.begin(); + set to_remove; + map to_add; + + string begin, end; + while (i != new_complete.end()) { + string new_begin = i->first; + string new_end = i->second; + int r = iter->in_complete_region(new_begin, &begin, &end); + if (r < 0) + return r; + if (r) { + to_remove.insert(begin); + new_begin = begin; + } + ++i; + while (i != new_complete.end()) { + if (!new_end.size() || i->first <= new_end) { + if (!new_end.size() && i->second > new_end) { + new_end = i->second; + } + ++i; + continue; + } + + r = iter->in_complete_region(new_end, &begin, &end); + if (r < 0) + return r; + if (r) { + to_remove.insert(begin); + new_end = end; + continue; + } + break; + } + bufferlist bl; + bl.append(bufferptr(new_end.c_str(), new_end.size() + 1)); + to_add.insert(make_pair(new_begin, bl)); + } + t->rmkeys(complete_prefix(header), to_remove); + t->set(complete_prefix(header), to_add); + return 0; +} + +int GenericObjectMap::need_parent(GenericObjectMapIterator iter) +{ + int r = iter->seek_to_first(); + if (r < 0) + return r; + + if (!iter->valid()) + return 0; + + string begin, end; + if (iter->in_complete_region(iter->key(), &begin, &end) && end == "") { + return 0; + } + return 1; +} + +int GenericObjectMap::write_state(KeyValueDB::Transaction t) +{ + dout(20) << __func__ << " seq is " << state.seq << dendl; + bufferlist bl; + state.encode(bl); + map to_write; + to_write[GLOBAL_STATE_KEY] = bl; + t->set(INTERN_PREFIX, to_write); + return 0; +} + +// NOTE(haomai): It may occur dead lock if thread A hold header A try to header +// B and thread hold header B try to get header A +GenericObjectMap::Header GenericObjectMap::_lookup_header( + const coll_t &cid, const ghobject_t &oid) +{ + set to_get; + to_get.insert(header_key(cid, oid)); + _Header header; + + map out; + + int r = db->get(GHOBJECT_TO_SEQ_PREFIX, to_get, &out); + if (r < 0) + return Header(); + if (out.empty()) + return Header(); + + bufferlist::iterator iter = out.begin()->second.begin(); + header.decode(iter); + + Header ret = Header(new _Header(header)); + return ret; +} + +GenericObjectMap::Header GenericObjectMap::_generate_new_header( + const coll_t &cid, const ghobject_t &oid, Header parent, + KeyValueDB::Transaction t) +{ + Header header = Header(new _Header()); + header->seq = state.seq++; + if (parent) { + header->parent = parent->seq; + } + header->num_children = 1; + header->oid = oid; + header->cid = cid; + + write_state(t); + return header; +} + +GenericObjectMap::Header GenericObjectMap::lookup_parent(Header input) +{ + Mutex::Locker l(header_lock); + map out; + set keys; + keys.insert(PARENT_KEY); + + dout(20) << "lookup_parent: parent " << input->parent + << " for seq " << input->seq << dendl; + + int r = db->get(parent_seq_prefix(input->parent), keys, &out); + if (r < 0) { + assert(0); + return Header(); + } + if (out.empty()) { + assert(0); + return Header(); + } + + Header header = Header(new _Header()); + header->seq = input->parent; + bufferlist::iterator iter = out.begin()->second.begin(); + header->decode(iter); + dout(20) << "lookup_parent: parent seq is " << header->seq << " with parent " + << header->parent << dendl; + return header; +} + +GenericObjectMap::Header GenericObjectMap::lookup_create_header( + const coll_t &cid, const ghobject_t &oid, KeyValueDB::Transaction t) +{ + Mutex::Locker l(header_lock); + Header header = _lookup_header(cid, oid); + if (!header) { + header = _generate_new_header(cid, oid, Header(), t); + set_header(cid, oid, *header, t); + } + return header; +} + +void GenericObjectMap::set_parent_header(Header header, KeyValueDB::Transaction t) +{ + dout(20) << __func__ << " setting seq " << header->seq << dendl; + map to_write; + header->encode(to_write[PARENT_KEY]); + t->set(parent_seq_prefix(header->seq), to_write); +} + +void GenericObjectMap::clear_header(Header header, KeyValueDB::Transaction t) +{ + dout(20) << __func__ << " clearing seq " << header->seq << dendl; + t->rmkeys_by_prefix(user_prefix(header, string())); + t->rmkeys_by_prefix(complete_prefix(header)); + set keys; + keys.insert(PARENT_KEY); + t->rmkeys(parent_seq_prefix(header->seq), keys); +} + +// only remove GHOBJECT_TO_SEQ +void GenericObjectMap::remove_header(const coll_t &cid, + const ghobject_t &oid, Header header, + KeyValueDB::Transaction t) +{ + dout(20) << __func__ << " removing " << header->seq + << " cid " << cid << " oid " << oid << dendl; + set to_remove; + to_remove.insert(header_key(cid, oid)); + t->rmkeys(GHOBJECT_TO_SEQ_PREFIX, to_remove); +} + +void GenericObjectMap::set_header(const coll_t &cid, const ghobject_t &oid, + _Header &header, KeyValueDB::Transaction t) +{ + dout(20) << __func__ << " setting " << header.seq + << " cid " << cid << " oid " << oid << " parent seq " + << header.parent << dendl; + map to_set; + header.encode(to_set[header_key(cid, oid)]); + t->set(GHOBJECT_TO_SEQ_PREFIX, to_set); +} + +int GenericObjectMap::list_objects(const coll_t &cid, ghobject_t start, ghobject_t end, int max, + vector *out, ghobject_t *next) +{ + // FIXME + Mutex::Locker l(header_lock); + if (start.is_max()) + return 0; + + if (start.is_min()) { + vector oids; + + KeyValueDB::Iterator iter = db->get_iterator(GHOBJECT_TO_SEQ_PREFIX); + for (iter->lower_bound(header_key(cid)); iter->valid(); iter->next()) { + bufferlist bl = iter->value(); + bufferlist::iterator bliter = bl.begin(); + _Header header; + header.decode(bliter); + + if (header.cid == cid) + oids.push_back(header.oid); + + break; + } + + if (oids.empty()) { + if (next) + *next = ghobject_t::get_max(); + return 0; + } + start = oids[0]; + } + + int size = 0; + KeyValueDB::Iterator iter = db->get_iterator(GHOBJECT_TO_SEQ_PREFIX); + for (iter->lower_bound(header_key(cid, start)); iter->valid(); iter->next()) { + bufferlist bl = iter->value(); + bufferlist::iterator bliter = bl.begin(); + _Header header; + header.decode(bliter); + + if (header.cid != cid) { + if (next) + *next = ghobject_t::get_max(); + break; + } + + if (max && size >= max) { + if (next) + *next = header.oid; + break; + } + + if (cmp_bitwise(header.oid, end) >= 0) { + if (next) + *next = ghobject_t::get_max(); + break; + } + + assert(cmp_bitwise(start, header.oid) <= 0); + assert(cmp_bitwise(header.oid, end) < 0); + + + size++; + if (out) + out->push_back(header.oid); + start = header.oid; + } + + if (out->size()) + dout(20) << "objects: " << *out << dendl; + + if (!iter->valid()) + if (next) + *next = ghobject_t::get_max(); + + return 0; +} diff --git a/src/os/keyvaluestore/GenericObjectMap.h b/src/os/keyvaluestore/GenericObjectMap.h new file mode 100644 index 000000000000..9417937aea40 --- /dev/null +++ b/src/os/keyvaluestore/GenericObjectMap.h @@ -0,0 +1,429 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 UnitedStack + * + * Author: Haomai Wang + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_GENERICOBJECTMAP_H +#define CEPH_GENERICOBJECTMAP_H + +#include "include/buffer.h" +#include +#include +#include +#include +#include + +#include "include/memory.h" +#include "os/ObjectMap.h" +#include "kv/KeyValueDB.h" +#include "osd/osd_types.h" +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/simple_cache.hpp" + + +/** + * Genericobjectmap: Provide with key/value associated to ghobject_t APIs to caller + * and avoid concerning too much. Wrap and combine KeyValueDB/ObjectMap APIs + * with ghobject_t and adding clone capacity. + * + * Prefix space structure: + * + * - GHOBJECT_TO_SEQ: Contains leaf mapping from ghobject_t->Header(including + * hobj.seq and related metadata) + * - INTERN_PREFIX: GLOBAL_STATE_KEY - contains the global state + * @see State + * @see write_state + * @see init + * @see generate_new_header + * - INTERN_PREFIX + header_key(header->seq) + COMPLETE_PREFIX: see below + * - INTERN_PREFIX + header_key(header->seq) + PARENT_KEY + * : used to store parent header(same as headers in GHOBJECT_TO_SEQ) + * - USER_PREFIX + header_key(header->seq) + [CUSTOM_PREFIX] + * : key->value which set by callers + * + * For each node (represented by a header), we + * store three mappings: the key mapping, the complete mapping, and the parent. + * The complete mapping (COMPLETE_PREFIX space) is key->key. Each x->y entry in + * this mapping indicates that the key mapping contains all entries on [x,y). + * Note, max string is represented by "", so ""->"" indicates that the parent + * is unnecessary (@see rm_keys). When looking up a key not contained in the + * the complete set, we have to check the parent if we don't find it in the + * key set. During rm_keys, we copy keys from the parent and update the + * complete set to reflect the change @see rm_keys. + */ + +// This class only provide basic read capacity, suggest inherit it to +// implement write transaction to use it. @see StripObjectMap +class GenericObjectMap { + public: + boost::scoped_ptr db; + + /** + * Serializes access to next_seq as well as the in_use set + */ + Mutex header_lock; + + GenericObjectMap(KeyValueDB *db) : db(db), header_lock("GenericObjectMap") {} + + int get( + const coll_t &cid, + const ghobject_t &oid, + const string &prefix, + map *out + ); + + int get_keys( + const coll_t &cid, + const ghobject_t &oid, + const string &prefix, + set *keys + ); + + int get_values( + const coll_t &cid, + const ghobject_t &oid, + const string &prefix, + const set &keys, + map *out + ); + + int check_keys( + const coll_t &cid, + const ghobject_t &oid, + const string &prefix, + const set &keys, + set *out + ); + + /// Read initial state from backing store + int init(bool upgrade = false); + + /// Upgrade store to current version + int upgrade() {return 0;} + + /// Consistency check, debug, there must be no parallel writes + bool check(std::ostream &out); + + /// Util, list all objects, there must be no other concurrent access + int list_objects(const coll_t &cid, ghobject_t start, ghobject_t end, int max, + vector *objs, ///< [out] objects + ghobject_t *next); + + ObjectMap::ObjectMapIterator get_iterator(const coll_t &cid, + const ghobject_t &oid, + const string &prefix); + + KeyValueDB::Transaction get_transaction() { return db->get_transaction(); } + int submit_transaction(KeyValueDB::Transaction t) { + return db->submit_transaction(t); + } + int submit_transaction_sync(KeyValueDB::Transaction t) { + return db->submit_transaction_sync(t); + } + + /// persistent state for store @see generate_header + struct State { + __u8 v; + uint64_t seq; + State() : v(0), seq(1) {} + State(uint64_t seq) : v(0), seq(seq) {} + + void encode(bufferlist &bl) const { + ENCODE_START(1, 1, bl); + ::encode(v, bl); + ::encode(seq, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::iterator &bl) { + DECODE_START(1, bl); + ::decode(v, bl); + ::decode(seq, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const { + f->dump_unsigned("seq", seq); + } + + static void generate_test_instances(list &o) { + o.push_back(new State(0)); + o.push_back(new State(20)); + } + } state; + + struct _Header { + uint64_t seq; + uint64_t parent; + uint64_t num_children; + + coll_t cid; + ghobject_t oid; + + // Used by successor + bufferlist data; + + void encode(bufferlist &bl) const { + ENCODE_START(1, 1, bl); + ::encode(seq, bl); + ::encode(parent, bl); + ::encode(num_children, bl); + ::encode(cid, bl); + ::encode(oid, bl); + ::encode(data, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::iterator &bl) { + DECODE_START(1, bl); + ::decode(seq, bl); + ::decode(parent, bl); + ::decode(num_children, bl); + ::decode(cid, bl); + ::decode(oid, bl); + ::decode(data, bl); + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const { + f->dump_unsigned("seq", seq); + f->dump_unsigned("parent", parent); + f->dump_unsigned("num_children", num_children); + f->dump_stream("coll") << cid; + f->dump_stream("oid") << oid; + } + + _Header() : seq(0), parent(0), num_children(1) {} + }; + + typedef ceph::shared_ptr<_Header> Header; + + Header lookup_header(const coll_t &cid, const ghobject_t &oid) { + Mutex::Locker l(header_lock); + return _lookup_header(cid, oid); + } + + /// Lookup or create header for c oid + Header lookup_create_header(const coll_t &cid, const ghobject_t &oid, + KeyValueDB::Transaction t); + + /// Set leaf node for c and oid to the value of header + void set_header(const coll_t &cid, const ghobject_t &oid, _Header &header, + KeyValueDB::Transaction t); + + // Move all modify member function to "protect", in order to indicate these + // should be made use of by sub-class + void set_keys( + const Header header, + const string &prefix, + const map &set, + KeyValueDB::Transaction t + ); + + int clear( + const Header header, + KeyValueDB::Transaction t + ); + + int rm_keys( + const Header header, + const string &prefix, + const set &buffered_keys, + const set &to_clear, + KeyValueDB::Transaction t + ); + + void clone( + const Header origin_header, + const coll_t &cid, + const ghobject_t &target, + KeyValueDB::Transaction t, + Header *old_header, + Header *new_header + ); + + void rename( + const Header header, + const coll_t &cid, + const ghobject_t &target, + KeyValueDB::Transaction t + ); + + static const string GLOBAL_STATE_KEY; + static const string PARENT_KEY; + + static const string USER_PREFIX; + static const string INTERN_PREFIX; + static const string PARENT_PREFIX; + static const string COMPLETE_PREFIX; + static const string GHOBJECT_TO_SEQ_PREFIX; + + static const string GHOBJECT_KEY_SEP_S; + static const char GHOBJECT_KEY_SEP_C; + static const char GHOBJECT_KEY_ENDING; + +private: + /// Implicit lock on Header->seq + + static string header_key(const coll_t &cid); + static string header_key(const coll_t &cid, const ghobject_t &oid); + static bool parse_header_key(const string &in, coll_t *c, ghobject_t *oid); + + string seq_key(uint64_t seq) { + char buf[100]; + snprintf(buf, sizeof(buf), "%.*" PRId64, (int)(2*sizeof(seq)), seq); + return string(buf); + } + + string user_prefix(Header header, const string &prefix); + string complete_prefix(Header header); + string parent_seq_prefix(uint64_t seq); + + class EmptyIteratorImpl : public ObjectMap::ObjectMapIteratorImpl { + public: + int seek_to_first() { return 0; } + int seek_to_last() { return 0; } + int upper_bound(const string &after) { return 0; } + int lower_bound(const string &to) { return 0; } + bool valid() { return false; } + int next(bool validate=true) { assert(0); return 0; } + string key() { assert(0); return ""; } + bufferlist value() { assert(0); return bufferlist(); } + int status() { return 0; } + }; + + + /// Iterator + class GenericObjectMapIteratorImpl : public ObjectMap::ObjectMapIteratorImpl { + public: + GenericObjectMap *map; + + /// NOTE: implicit lock on header->seq AND for all ancestors + Header header; + + /// parent_iter == NULL iff no parent + ceph::shared_ptr parent_iter; + KeyValueDB::Iterator key_iter; + KeyValueDB::Iterator complete_iter; + + /// cur_iter points to currently valid iterator + ceph::shared_ptr cur_iter; + int r; + + /// init() called, key_iter, complete_iter, parent_iter filled in + bool ready; + /// past end + bool invalid; + + string prefix; + + GenericObjectMapIteratorImpl(GenericObjectMap *map, Header header, + const string &_prefix) : map(map), header(header), r(0), ready(false), + invalid(true), prefix(_prefix) { } + int seek_to_first(); + int seek_to_last(); + int upper_bound(const string &after); + int lower_bound(const string &to); + bool valid(); + int next(bool validate=true); + string key(); + bufferlist value(); + int status(); + + bool on_parent() { + return cur_iter == parent_iter; + } + + /// skips to next valid parent entry + int next_parent(); + + /// Tests whether to_test is in complete region + int in_complete_region(const string &to_test, ///[in] key to test + string *begin, ///[out] beginning of region + string *end ///[out] end of region + ); ///< @returns true if to_test is in the complete region, else false + + private: + int init(); + bool valid_parent(); + int adjust(); + }; + +protected: + typedef ceph::shared_ptr GenericObjectMapIterator; + GenericObjectMapIterator _get_iterator(Header header, string prefix) { + return GenericObjectMapIterator(new GenericObjectMapIteratorImpl(this, header, prefix)); + } + + Header generate_new_header(const coll_t &cid, const ghobject_t &oid, + Header parent, KeyValueDB::Transaction t) { + Mutex::Locker l(header_lock); + return _generate_new_header(cid, oid, parent, t); + } + + // Scan keys in header into out_keys and out_values (if nonnull) + int scan(Header header, const string &prefix, const set &in_keys, + set *out_keys, map *out_values); + + private: + + /// Removes node corresponding to header + void clear_header(Header header, KeyValueDB::Transaction t); + + /// Set node containing input to new contents + void set_parent_header(Header input, KeyValueDB::Transaction t); + + /// Remove leaf node corresponding to oid in c + void remove_header(const coll_t &cid, const ghobject_t &oid, Header header, + KeyValueDB::Transaction t); + + /** + * Generate new header for c oid with new seq number + * + * Has the side effect of syncronously saving the new GenericObjectMap state + */ + Header _generate_new_header(const coll_t &cid, const ghobject_t &oid, + Header parent, KeyValueDB::Transaction t); + + // Lookup leaf header for c oid + Header _lookup_header(const coll_t &cid, const ghobject_t &oid); + + // Lookup header node for input + Header lookup_parent(Header input); + + // Remove header and all related prefixes + int _clear(Header header, KeyValueDB::Transaction t); + + // Adds to t operations necessary to add new_complete to the complete set + int merge_new_complete(Header header, const map &new_complete, + GenericObjectMapIterator iter, KeyValueDB::Transaction t); + + // Writes out State (mainly next_seq) + int write_state(KeyValueDB::Transaction _t); + + // 0 if the complete set now contains all of key space, < 0 on error, 1 else + int need_parent(GenericObjectMapIterator iter); + + // Copies header entry from parent @see rm_keys + int copy_up_header(Header header, KeyValueDB::Transaction t); + + // Sets header @see set_header + void _set_header(Header header, const bufferlist &bl, + KeyValueDB::Transaction t); +}; +WRITE_CLASS_ENCODER(GenericObjectMap::_Header) +WRITE_CLASS_ENCODER(GenericObjectMap::State) + +#endif diff --git a/src/os/keyvaluestore/KeyValueStore.cc b/src/os/keyvaluestore/KeyValueStore.cc new file mode 100644 index 000000000000..35eb6f4489f6 --- /dev/null +++ b/src/os/keyvaluestore/KeyValueStore.cc @@ -0,0 +1,3017 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 UnitedStack + * + * Author: Haomai Wang + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/int_types.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "include/compat.h" + +#include +#include + +#include "KeyValueStore.h" +#include "common/BackTrace.h" +#include "include/types.h" + +#include "osd/osd_types.h" +#include "include/color.h" +#include "include/buffer.h" + +#include "common/debug.h" +#include "common/errno.h" +#include "common/run_cmd.h" +#include "common/safe_io.h" +#include "common/perf_counters.h" +#include "common/sync_filesystem.h" + +#include "common/ceph_crypto.h" +using ceph::crypto::SHA1; + +#include "include/assert.h" + +#include "common/config.h" + +#define dout_subsys ceph_subsys_keyvaluestore + +const string KeyValueStore::OBJECT_STRIP_PREFIX = "_STRIP_"; +const string KeyValueStore::OBJECT_XATTR = "__OBJATTR__"; +const string KeyValueStore::OBJECT_OMAP = "__OBJOMAP__"; +const string KeyValueStore::OBJECT_OMAP_HEADER = "__OBJOMAP_HEADER__"; +const string KeyValueStore::OBJECT_OMAP_HEADER_KEY = "__OBJOMAP_HEADER__KEY_"; +const string KeyValueStore::COLLECTION = "__COLLECTION__"; +const string KeyValueStore::COLLECTION_ATTR = "__COLL_ATTR__"; + + +//Initial features in new superblock. +static CompatSet get_kv_initial_compat_set() { + CompatSet::FeatureSet ceph_osd_feature_compat; + CompatSet::FeatureSet ceph_osd_feature_ro_compat; + CompatSet::FeatureSet ceph_osd_feature_incompat; + return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat, + ceph_osd_feature_incompat); +} + +//Features are added here that this KeyValueStore supports. +static CompatSet get_kv_supported_compat_set() { + CompatSet compat = get_kv_initial_compat_set(); + //Any features here can be set in code, but not in initial superblock + return compat; +} + + +// ============== StripObjectMap Implementation ================= + +int StripObjectMap::save_strip_header(StripObjectHeaderRef strip_header, + KeyValueDB::Transaction t) +{ + if (strip_header->updated) { + strip_header->header->data.clear(); + ::encode(*strip_header, strip_header->header->data); + + set_header(strip_header->cid, strip_header->oid, *(strip_header->header), t); + strip_header->updated = false; + } + return 0; +} + +int StripObjectMap::create_strip_header(const coll_t &cid, + const ghobject_t &oid, + StripObjectHeaderRef *strip_header, + KeyValueDB::Transaction t) +{ + Header header = generate_new_header(cid, oid, Header(), t); + if (!header) + return -EINVAL; + + StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader()); + tmp->oid = oid; + tmp->cid = cid; + tmp->header = header; + tmp->updated = true; + if (strip_header) + *strip_header = tmp; + + return 0; +} + +int StripObjectMap::lookup_strip_header(const coll_t &cid, + const ghobject_t &oid, + StripObjectHeaderRef *strip_header) +{ + { + Mutex::Locker l(lock); + pair p; + if (caches.lookup(oid, &p)) { + if (p.first == cid) { + *strip_header = p.second; + return 0; + } + } + } + Header header = lookup_header(cid, oid); + + if (!header) { + dout(20) << "lookup_strip_header failed to get strip_header " + << " cid " << cid <<" oid " << oid << dendl; + return -ENOENT; + } + + + StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader()); + if (header->data.length()) { + bufferlist::iterator bliter = header->data.begin(); + ::decode(*tmp, bliter); + } + + if (tmp->strip_size == 0) { + tmp->strip_size = default_strip_size; + tmp->updated = true; + } + + tmp->oid = oid; + tmp->cid = cid; + tmp->header = header; + + { + Mutex::Locker l(lock); + caches.add(oid, make_pair(cid, tmp)); + } + *strip_header = tmp; + dout(10) << "lookup_strip_header done " << " cid " << cid << " oid " + << oid << dendl; + return 0; +} + +int StripObjectMap::file_to_extents(uint64_t offset, size_t len, + uint64_t strip_size, + vector &extents) +{ + if (len == 0) + return 0; + + uint64_t start, end, strip_offset; + start = offset / strip_size; + end = (offset + len) / strip_size; + strip_offset = start * strip_size; + + // "offset" may in the middle of first strip object + if (offset > strip_offset) { + uint64_t extent_offset, extent_len; + extent_offset = offset - strip_offset; + if (extent_offset + len <= strip_size) + extent_len = len; + else + extent_len = strip_size - extent_offset; + extents.push_back(StripExtent(start, extent_offset, extent_len)); + start++; + strip_offset += strip_size; + } + + for (; start < end; ++start) { + extents.push_back(StripExtent(start, 0, strip_size)); + strip_offset += strip_size; + } + + // The end of strip object may be partial + if (offset + len > strip_offset) + extents.push_back(StripExtent(start, 0, offset+len-strip_offset)); + + assert(extents.size()); + dout(10) << "file_to_extents done " << dendl; + return 0; +} + +void StripObjectMap::clone_wrap(StripObjectHeaderRef old_header, + const coll_t &cid, const ghobject_t &oid, + KeyValueDB::Transaction t, + StripObjectHeaderRef *target_header) +{ + Header new_origin_header; + StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader()); + + clone(old_header->header, cid, oid, t, &new_origin_header, + &tmp->header); + + tmp->oid = oid; + tmp->cid = cid; + tmp->strip_size = old_header->strip_size; + tmp->max_size = old_header->max_size; + tmp->bits = old_header->bits; + tmp->updated = true; + old_header->header = new_origin_header; + old_header->updated = true; + + if (target_header) + *target_header = tmp; +} + +void StripObjectMap::rename_wrap(StripObjectHeaderRef old_header, const coll_t &cid, const ghobject_t &oid, + KeyValueDB::Transaction t, + StripObjectHeaderRef *new_header) +{ + rename(old_header->header, cid, oid, t); + + StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader()); + tmp->strip_size = old_header->strip_size; + tmp->max_size = old_header->max_size; + tmp->bits = old_header->bits; + tmp->header = old_header->header; + tmp->oid = oid; + tmp->cid = cid; + tmp->updated = true; + + if (new_header) + *new_header = tmp; + + old_header->header = Header(); + old_header->deleted = true; +} + +int StripObjectMap::get_values_with_header(const StripObjectHeaderRef header, + const string &prefix, + const set &keys, + map *out) +{ + return scan(header->header, prefix, keys, 0, out); +} + +int StripObjectMap::get_keys_with_header(const StripObjectHeaderRef header, + const string &prefix, + set *keys) +{ + ObjectMap::ObjectMapIterator iter = _get_iterator(header->header, prefix); + for (iter->seek_to_first(); iter->valid(); iter->next()) { + assert(!iter->status()); + keys->insert(iter->key()); + } + return 0; +} + +int StripObjectMap::get_with_header(const StripObjectHeaderRef header, + const string &prefix, map *out) +{ + ObjectMap::ObjectMapIterator iter = _get_iterator(header->header, prefix); + for (iter->seek_to_first(); iter->valid(); iter->next()) { + assert(!iter->status()); + out->insert(make_pair(iter->key(), iter->value())); + } + + return 0; +} + +// ========= KeyValueStore::BufferTransaction Implementation ============ + +int KeyValueStore::BufferTransaction::lookup_cached_header( + const coll_t &cid, const ghobject_t &oid, + StripObjectMap::StripObjectHeaderRef *strip_header, + bool create_if_missing) +{ + uniq_id uid = make_pair(cid, oid); + StripObjectMap::StripObjectHeaderRef header; + int r = 0; + + StripHeaderMap::iterator it = strip_headers.find(uid); + if (it != strip_headers.end()) { + + if (!it->second->deleted) { + if (strip_header) + *strip_header = it->second; + return 0; + } else if (!create_if_missing) { + return -ENOENT; + } + + // If (it->second.deleted && create_if_missing) go down + r = -ENOENT; + } else { + r = store->backend->lookup_strip_header(cid, oid, &header); + } + + if (r == -ENOENT && create_if_missing) { + r = store->backend->create_strip_header(cid, oid, &header, t); + } + + if (r < 0) { + dout(10) << __func__ << " " << cid << "/" << oid << " " + << " r = " << r << dendl; + return r; + } + + strip_headers[uid] = header; + if (strip_header) + *strip_header = header; + return r; +} + +int KeyValueStore::BufferTransaction::get_buffer_keys( + StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix, + const set &keys, map *out) +{ + set need_lookup; + + uniq_id uid = make_pair(strip_header->cid, strip_header->oid); + for (set::iterator it = keys.begin(); it != keys.end(); ++it) { + map< uniq_id, map, bufferlist> >::iterator obj_it = buffers.find(uid); + if ( obj_it != buffers.end() ) { + map, bufferlist>::iterator i = + obj_it->second.find(make_pair(prefix, *it)); + if (i != obj_it->second.end()) { + (*out)[*it].swap(i->second); + } else { + need_lookup.insert(*it); + } + }else { + need_lookup.insert(*it); + } + } + + if (!need_lookup.empty()) { + int r = store->backend->get_values_with_header(strip_header, prefix, + need_lookup, out); + if (r < 0) { + dout(10) << __func__ << " " << strip_header->cid << "/" + << strip_header->oid << " " << " r = " << r << dendl; + return r; + } + } + + return 0; +} + +void KeyValueStore::BufferTransaction::set_buffer_keys( + StripObjectMap::StripObjectHeaderRef strip_header, + const string &prefix, map &values) +{ + store->backend->set_keys(strip_header->header, prefix, values, t); + + uniq_id uid = make_pair(strip_header->cid, strip_header->oid); + map, bufferlist> &uid_buffers = buffers[uid]; + for (map::iterator iter = values.begin(); + iter != values.end(); ++iter) { + uid_buffers[make_pair(prefix, iter->first)].swap(iter->second); + } +} + +int KeyValueStore::BufferTransaction::remove_buffer_keys( + StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix, + const set &keys) +{ + uniq_id uid = make_pair(strip_header->cid, strip_header->oid); + map< uniq_id, map, bufferlist> >::iterator obj_it = buffers.find(uid); + set buffered_keys; + if ( obj_it != buffers.end() ) { + // TODO: Avoid use empty bufferlist to indicate the key is removed + for (set::iterator iter = keys.begin(); iter != keys.end(); ++iter) { + obj_it->second[make_pair(prefix, *iter)] = bufferlist(); + } + // TODO: Avoid collect all buffered keys when remove keys + if (strip_header->header->parent) { + for (map, bufferlist>::iterator iter = obj_it->second.begin(); + iter != obj_it->second.end(); ++iter) { + buffered_keys.insert(iter->first.second); + } + } + } + + return store->backend->rm_keys(strip_header->header, prefix, buffered_keys, keys, t); +} + +void KeyValueStore::BufferTransaction::clear_buffer_keys( + StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix) +{ + uniq_id uid = make_pair(strip_header->cid, strip_header->oid); + map< uniq_id, map, bufferlist> >::iterator obj_it = buffers.find(uid); + if ( obj_it != buffers.end() ) { + for (map, bufferlist>::iterator iter = obj_it->second.begin(); + iter != obj_it->second.end(); ++iter) { + if (iter->first.first == prefix) + iter->second = bufferlist(); + } + } +} + +int KeyValueStore::BufferTransaction::clear_buffer( + StripObjectMap::StripObjectHeaderRef strip_header) +{ + strip_header->deleted = true; + + InvalidateCacheContext *c = new InvalidateCacheContext(store, strip_header->cid, strip_header->oid); + finishes.push_back(c); + return store->backend->clear(strip_header->header, t); +} + +void KeyValueStore::BufferTransaction::clone_buffer( + StripObjectMap::StripObjectHeaderRef old_header, + const coll_t &cid, const ghobject_t &oid) +{ + // Remove target ahead to avoid dead lock + strip_headers.erase(make_pair(cid, oid)); + + StripObjectMap::StripObjectHeaderRef new_target_header; + + store->backend->clone_wrap(old_header, cid, oid, t, &new_target_header); + + // FIXME: Lacking of lock for origin header(now become parent), it will + // cause other operation can get the origin header while submitting + // transactions + strip_headers[make_pair(cid, oid)] = new_target_header; +} + +void KeyValueStore::BufferTransaction::rename_buffer( + StripObjectMap::StripObjectHeaderRef old_header, + const coll_t &cid, const ghobject_t &oid) +{ + // FIXME: Lacking of lock for origin header, it will cause other operation + // can get the origin header while submitting transactions + StripObjectMap::StripObjectHeaderRef new_header; + store->backend->rename_wrap(old_header, cid, oid, t, &new_header); + + InvalidateCacheContext *c = new InvalidateCacheContext(store, old_header->cid, old_header->oid); + finishes.push_back(c); + strip_headers[make_pair(cid, oid)] = new_header; +} + +int KeyValueStore::BufferTransaction::submit_transaction() +{ + int r = 0; + + for (StripHeaderMap::iterator header_iter = strip_headers.begin(); + header_iter != strip_headers.end(); ++header_iter) { + StripObjectMap::StripObjectHeaderRef header = header_iter->second; + + if (header->deleted) + continue; + + if (header->updated) { + r = store->backend->save_strip_header(header, t); + + if (r < 0) { + dout(10) << __func__ << " save strip header failed " << dendl; + goto out; + } + } + } + + r = store->backend->submit_transaction_sync(t); + for (list::iterator it = finishes.begin(); it != finishes.end(); ++it) { + (*it)->complete(r); + } + +out: + dout(5) << __func__ << " r = " << r << dendl; + return r; +} + +// =========== KeyValueStore Intern Helper Implementation ============== + +ostream& operator<<(ostream& out, const KeyValueStore::OpSequencer& s) +{ + assert(&out); + return out << *s.parent; +} + +int KeyValueStore::_create_current() +{ + struct stat st; + int ret = ::stat(current_fn.c_str(), &st); + if (ret == 0) { + // current/ exists + if (!S_ISDIR(st.st_mode)) { + dout(0) << "_create_current: current/ exists but is not a directory" << dendl; + ret = -EINVAL; + } + } else { + ret = ::mkdir(current_fn.c_str(), 0755); + if (ret < 0) { + ret = -errno; + dout(0) << "_create_current: mkdir " << current_fn << " failed: "<< cpp_strerror(ret) << dendl; + } + } + + return ret; +} + + + +// =========== KeyValueStore API Implementation ============== + +KeyValueStore::KeyValueStore(const std::string &base, + const char *name, bool do_update) : + ObjectStore(base), + internal_name(name), + basedir(base), + fsid_fd(-1), current_fd(-1), + backend(NULL), + ondisk_finisher(g_ceph_context), + collections_lock("KeyValueStore::collections_lock"), + lock("KeyValueStore::lock"), + throttle_ops(g_ceph_context, "keyvaluestore_ops", g_conf->keyvaluestore_queue_max_ops), + throttle_bytes(g_ceph_context, "keyvaluestore_bytes", g_conf->keyvaluestore_queue_max_bytes), + op_finisher(g_ceph_context), + op_tp(g_ceph_context, "KeyValueStore::op_tp", + g_conf->keyvaluestore_op_threads, "keyvaluestore_op_threads"), + op_wq(this, g_conf->keyvaluestore_op_thread_timeout, + g_conf->keyvaluestore_op_thread_suicide_timeout, &op_tp), + perf_logger(NULL), + m_keyvaluestore_queue_max_ops(g_conf->keyvaluestore_queue_max_ops), + m_keyvaluestore_queue_max_bytes(g_conf->keyvaluestore_queue_max_bytes), + m_keyvaluestore_strip_size(g_conf->keyvaluestore_default_strip_size), + m_keyvaluestore_max_expected_write_size(g_conf->keyvaluestore_max_expected_write_size), + do_update(do_update), + m_keyvaluestore_do_dump(false), + m_keyvaluestore_dump_fmt(true) +{ + ostringstream oss; + oss << basedir << "/current"; + current_fn = oss.str(); + + // initialize perf_logger + PerfCountersBuilder plb(g_ceph_context, internal_name, l_os_commit_len, l_os_last); + + plb.add_u64(l_os_oq_max_ops, "op_queue_max_ops", "Max operations count in queue"); + plb.add_u64(l_os_oq_ops, "op_queue_ops", "Operations count in queue"); + plb.add_u64_counter(l_os_ops, "ops", "Operations"); + plb.add_u64(l_os_oq_max_bytes, "op_queue_max_bytes", "Max size of queue"); + plb.add_u64(l_os_oq_bytes, "op_queue_bytes", "Size of queue"); + plb.add_u64_counter(l_os_bytes, "bytes", "Data written to store"); + plb.add_time_avg(l_os_commit_lat, "commit_latency", "Commit latency"); + plb.add_time_avg(l_os_apply_lat, "apply_latency", "Apply latency"); + plb.add_time_avg(l_os_queue_lat, "queue_transaction_latency_avg", "Store operation queue latency"); + + perf_logger = plb.create_perf_counters(); + + g_ceph_context->get_perfcounters_collection()->add(perf_logger); + g_ceph_context->_conf->add_observer(this); + + superblock.compat_features = get_kv_initial_compat_set(); +} + +KeyValueStore::~KeyValueStore() +{ + g_ceph_context->_conf->remove_observer(this); + g_ceph_context->get_perfcounters_collection()->remove(perf_logger); + + delete perf_logger; + + if (m_keyvaluestore_do_dump) { + dump_stop(); + } +} + +int KeyValueStore::statfs(struct statfs *buf) +{ + int r = backend->db->get_statfs(buf); + if (r < 0) { + if (::statfs(basedir.c_str(), buf) < 0) { + int r = -errno; + return r; + } + } + return 0; +} + +void KeyValueStore::collect_metadata(map *pm) +{ + (*pm)["keyvaluestore_backend"] = superblock.backend; +} + +int KeyValueStore::mkfs() +{ + int ret = 0; + char fsid_fn[PATH_MAX]; + uuid_d old_fsid; + + dout(1) << "mkfs in " << basedir << dendl; + + // open+lock fsid + snprintf(fsid_fn, sizeof(fsid_fn), "%s/fsid", basedir.c_str()); + fsid_fd = ::open(fsid_fn, O_RDWR|O_CREAT, 0644); + if (fsid_fd < 0) { + ret = -errno; + derr << "mkfs: failed to open " << fsid_fn << ": " << cpp_strerror(ret) << dendl; + return ret; + } + + if (lock_fsid() < 0) { + ret = -EBUSY; + goto close_fsid_fd; + } + + if (read_fsid(fsid_fd, &old_fsid) < 0 || old_fsid.is_zero()) { + if (fsid.is_zero()) { + fsid.generate_random(); + dout(1) << "mkfs generated fsid " << fsid << dendl; + } else { + dout(1) << "mkfs using provided fsid " << fsid << dendl; + } + + char fsid_str[40]; + fsid.print(fsid_str); + strcat(fsid_str, "\n"); + ret = ::ftruncate(fsid_fd, 0); + if (ret < 0) { + ret = -errno; + derr << "mkfs: failed to truncate fsid: " << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + ret = safe_write(fsid_fd, fsid_str, strlen(fsid_str)); + if (ret < 0) { + derr << "mkfs: failed to write fsid: " << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + if (::fsync(fsid_fd) < 0) { + ret = -errno; + derr << "mkfs: close failed: can't write fsid: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + dout(10) << "mkfs fsid is " << fsid << dendl; + } else { + if (!fsid.is_zero() && fsid != old_fsid) { + derr << "mkfs on-disk fsid " << old_fsid << " != provided " << fsid << dendl; + ret = -EINVAL; + goto close_fsid_fd; + } + fsid = old_fsid; + dout(1) << "mkfs fsid is already set to " << fsid << dendl; + } + + // version stamp + ret = write_version_stamp(); + if (ret < 0) { + derr << "mkfs: write_version_stamp() failed: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + + ret = _create_current(); + if (ret < 0) { + derr << "mkfs: failed to create current/ " << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + + // superblock + superblock.backend = g_conf->keyvaluestore_backend; + ret = write_superblock(); + if (ret < 0) { + derr << "KeyValueStore::mkfs write_superblock() failed: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + + { + KeyValueDB *store = KeyValueDB::create(g_ceph_context, + superblock.backend, + current_fn.c_str()); + if (!store) { + derr << __func__ << " failed to create backend type " + << g_conf->keyvaluestore_backend << "." << dendl; + ret = -1; + goto close_fsid_fd; + } + + ostringstream err; + if (store->create_and_open(err)) { + derr << __func__ << " failed to create/open backend type " + << g_conf->keyvaluestore_backend << "." << dendl; + ret = -1; + delete store; + goto close_fsid_fd; + } + + bufferlist bl; + ::encode(collections, bl); + KeyValueDB::Transaction t = store->get_transaction(); + t->set("meta", "collections", bl); + store->submit_transaction_sync(t); + + dout(1) << g_conf->keyvaluestore_backend << " backend exists/created" << dendl; + delete store; + } + + ret = write_meta("type", "keyvaluestore"); + if (ret < 0) + goto close_fsid_fd; + + dout(1) << "mkfs done in " << basedir << dendl; + ret = 0; + + close_fsid_fd: + VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); + fsid_fd = -1; + return ret; +} + +int KeyValueStore::read_fsid(int fd, uuid_d *uuid) +{ + char fsid_str[40]; + int ret = safe_read(fd, fsid_str, sizeof(fsid_str)); + if (ret < 0) + return ret; + if (ret == 8) { + // old 64-bit fsid... mirror it. + *(uint64_t*)&uuid->bytes()[0] = *(uint64_t*)fsid_str; + *(uint64_t*)&uuid->bytes()[8] = *(uint64_t*)fsid_str; + return 0; + } + + if (ret > 36) + fsid_str[36] = 0; + if (!uuid->parse(fsid_str)) + return -EINVAL; + return 0; +} + +int KeyValueStore::lock_fsid() +{ + struct flock l; + memset(&l, 0, sizeof(l)); + l.l_type = F_WRLCK; + l.l_whence = SEEK_SET; + l.l_start = 0; + l.l_len = 0; + int r = ::fcntl(fsid_fd, F_SETLK, &l); + if (r < 0) { + int err = errno; + dout(0) << "lock_fsid failed to lock " << basedir + << "/fsid, is another ceph-osd still running? " + << cpp_strerror(err) << dendl; + return -err; + } + return 0; +} + +bool KeyValueStore::test_mount_in_use() +{ + dout(5) << "test_mount basedir " << basedir << dendl; + char fn[PATH_MAX]; + snprintf(fn, sizeof(fn), "%s/fsid", basedir.c_str()); + + // verify fs isn't in use + + fsid_fd = ::open(fn, O_RDWR, 0644); + if (fsid_fd < 0) + return 0; // no fsid, ok. + bool inuse = lock_fsid() < 0; + VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); + fsid_fd = -1; + return inuse; +} + +int KeyValueStore::write_superblock() +{ + bufferlist bl; + ::encode(superblock, bl); + return safe_write_file(basedir.c_str(), "superblock", + bl.c_str(), bl.length()); +} + +int KeyValueStore::read_superblock() +{ + bufferptr bp(PATH_MAX); + int ret = safe_read_file(basedir.c_str(), "superblock", + bp.c_str(), bp.length()); + if (ret < 0) { + if (ret == -ENOENT) { + // If the file doesn't exist write initial CompatSet + return write_superblock(); + } + return ret; + } + + bufferlist bl; + bl.push_back(bp); + bufferlist::iterator i = bl.begin(); + ::decode(superblock, i); + return 0; +} + + + +int KeyValueStore::update_version_stamp() +{ + return write_version_stamp(); +} + +int KeyValueStore::version_stamp_is_valid(uint32_t *version) +{ + bufferptr bp(PATH_MAX); + int ret = safe_read_file(basedir.c_str(), "store_version", + bp.c_str(), bp.length()); + if (ret < 0) { + if (ret == -ENOENT) + return 0; + return ret; + } + bufferlist bl; + bl.push_back(bp); + bufferlist::iterator i = bl.begin(); + ::decode(*version, i); + if (*version == target_version) + return 1; + else + return 0; +} + +int KeyValueStore::write_version_stamp() +{ + bufferlist bl; + ::encode(target_version, bl); + + return safe_write_file(basedir.c_str(), "store_version", + bl.c_str(), bl.length()); +} + +int KeyValueStore::mount() +{ + int ret; + char buf[PATH_MAX]; + CompatSet supported_compat_set = get_kv_supported_compat_set(); + + dout(5) << "basedir " << basedir << dendl; + + // make sure global base dir exists + if (::access(basedir.c_str(), R_OK | W_OK)) { + ret = -errno; + derr << "KeyValueStore::mount: unable to access basedir '" << basedir + << "': " << cpp_strerror(ret) << dendl; + goto done; + } + + // get fsid + snprintf(buf, sizeof(buf), "%s/fsid", basedir.c_str()); + fsid_fd = ::open(buf, O_RDWR, 0644); + if (fsid_fd < 0) { + ret = -errno; + derr << "KeyValueStore::mount: error opening '" << buf << "': " + << cpp_strerror(ret) << dendl; + goto done; + } + + ret = read_fsid(fsid_fd, &fsid); + if (ret < 0) { + derr << "KeyValueStore::mount: error reading fsid_fd: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + + if (lock_fsid() < 0) { + derr << "KeyValueStore::mount: lock_fsid failed" << dendl; + ret = -EBUSY; + goto close_fsid_fd; + } + + dout(10) << "mount fsid is " << fsid << dendl; + + uint32_t version_stamp; + ret = version_stamp_is_valid(&version_stamp); + if (ret < 0) { + derr << "KeyValueStore::mount : error in version_stamp_is_valid: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } else if (ret == 0) { + if (do_update) { + derr << "KeyValueStore::mount : stale version stamp detected: " + << version_stamp << ". Proceeding, do_update " + << "is set, performing disk format upgrade." << dendl; + } else { + ret = -EINVAL; + derr << "KeyValueStore::mount : stale version stamp " << version_stamp + << ". Please run the KeyValueStore update script before starting " + << "the OSD, or set keyvaluestore_update_to to " << target_version + << dendl; + goto close_fsid_fd; + } + } + + superblock.backend = g_conf->keyvaluestore_backend; + ret = read_superblock(); + if (ret < 0) { + ret = -EINVAL; + goto close_fsid_fd; + } + + // Check if this KeyValueStore supports all the necessary features to mount + if (supported_compat_set.compare(superblock.compat_features) == -1) { + derr << "KeyValueStore::mount : Incompatible features set " + << superblock.compat_features << dendl; + ret = -EINVAL; + goto close_fsid_fd; + } + + current_fd = ::open(current_fn.c_str(), O_RDONLY); + if (current_fd < 0) { + ret = -errno; + derr << "KeyValueStore::mount: error opening: " << current_fn << ": " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + + assert(current_fd >= 0); + + { + if (superblock.backend.empty()) + superblock.backend = g_conf->keyvaluestore_backend; + KeyValueDB *store = KeyValueDB::create(g_ceph_context, + superblock.backend, + current_fn.c_str()); + if(!store) + { + derr << "KeyValueStore::mount backend type " + << superblock.backend << " error" << dendl; + ret = -1; + goto close_fsid_fd; + + } + + if (superblock.backend == "rocksdb") + store->init(g_conf->keyvaluestore_rocksdb_options); + else + store->init(); + stringstream err; + if (store->open(err)) { + derr << "KeyValueStore::mount Error initializing keyvaluestore backend " + << superblock.backend << ": " << err.str() << dendl; + ret = -1; + delete store; + goto close_current_fd; + } + + // get collection list + set keys; + keys.insert("collections"); + map values; + store->get("meta", keys, &values); + if (values.empty()) { + ret = -EIO; + derr << "Error no collection list; old store?" << dendl; + goto close_current_fd; + } + bufferlist::iterator p = values["collections"].begin(); + ::decode(collections, p); + dout(20) << "collections: " << collections << dendl; + + StripObjectMap *dbomap = new StripObjectMap(store); + ret = dbomap->init(do_update); + if (ret < 0) { + delete dbomap; + derr << "Error initializing StripObjectMap: " << ret << dendl; + goto close_current_fd; + } + stringstream err2; + + if (g_conf->keyvaluestore_debug_check_backend && !dbomap->check(err2)) { + derr << err2.str() << dendl; + delete dbomap; + ret = -EINVAL; + goto close_current_fd; + } + + default_strip_size = m_keyvaluestore_strip_size; + backend.reset(dbomap); + } + + op_tp.start(); + op_finisher.start(); + ondisk_finisher.start(); + + // all okay. + return 0; + +close_current_fd: + VOID_TEMP_FAILURE_RETRY(::close(current_fd)); + current_fd = -1; +close_fsid_fd: + VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); + fsid_fd = -1; +done: + return ret; +} + +int KeyValueStore::umount() +{ + dout(5) << "umount " << basedir << dendl; + + op_tp.stop(); + op_finisher.stop(); + ondisk_finisher.stop(); + + if (fsid_fd >= 0) { + VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); + fsid_fd = -1; + } + if (current_fd >= 0) { + VOID_TEMP_FAILURE_RETRY(::close(current_fd)); + current_fd = -1; + } + + backend.reset(); + + // nothing + return 0; +} + +int KeyValueStore::queue_transactions(Sequencer *posr, list &tls, + TrackedOpRef osd_op, + ThreadPool::TPHandle *handle) +{ + utime_t start = ceph_clock_now(g_ceph_context); + Context *onreadable; + Context *ondisk; + Context *onreadable_sync; + ObjectStore::Transaction::collect_contexts( + tls, &onreadable, &ondisk, &onreadable_sync); + + // set up the sequencer + OpSequencer *osr; + assert(posr); + if (posr->p) { + osr = static_cast(posr->p.get()); + dout(5) << "queue_transactions existing " << osr << " " << *osr << "/" << osr->parent + << dendl; //<< " w/ q " << osr->q << dendl; + } else { + osr = new OpSequencer; + osr->parent = posr; + posr->p = osr; + dout(5) << "queue_transactions new " << osr << " " << *osr << "/" << osr->parent << dendl; + } + + Op *o = build_op(tls, ondisk, onreadable, onreadable_sync, osd_op); + op_queue_reserve_throttle(o, handle); + if (m_keyvaluestore_do_dump) + dump_transactions(o->tls, o->op, osr); + dout(5) << "queue_transactions (trailing journal) " << " " << tls <tinc(l_os_queue_lat, end - start); + return 0; +} + + +// ============== KeyValueStore Op Handler ================= + +KeyValueStore::Op *KeyValueStore::build_op(list& tls, + Context *ondisk, Context *onreadable, Context *onreadable_sync, + TrackedOpRef osd_op) +{ + uint64_t bytes = 0, ops = 0; + for (list::iterator p = tls.begin(); + p != tls.end(); + ++p) { + bytes += (*p)->get_num_bytes(); + ops += (*p)->get_num_ops(); + } + + Op *o = new Op; + o->start = ceph_clock_now(g_ceph_context); + o->tls.swap(tls); + o->ondisk = ondisk; + o->onreadable = onreadable; + o->onreadable_sync = onreadable_sync; + o->ops = ops; + o->bytes = bytes; + o->osd_op = osd_op; + return o; +} + +void KeyValueStore::queue_op(OpSequencer *osr, Op *o) +{ + // queue op on sequencer, then queue sequencer for the threadpool, + // so that regardless of which order the threads pick up the + // sequencer, the op order will be preserved. + + osr->queue(o); + + perf_logger->inc(l_os_ops); + perf_logger->inc(l_os_bytes, o->bytes); + + dout(5) << "queue_op " << o << " seq " << o->op << " " << *osr << " " + << o->bytes << " bytes" << " (queue has " << throttle_ops.get_current() + << " ops and " << throttle_bytes.get_current() << " bytes)" << dendl; + op_wq.queue(osr); +} + +void KeyValueStore::op_queue_reserve_throttle(Op *o, ThreadPool::TPHandle *handle) +{ + uint64_t max_ops = m_keyvaluestore_queue_max_ops; + uint64_t max_bytes = m_keyvaluestore_queue_max_bytes; + + perf_logger->set(l_os_oq_max_ops, max_ops); + perf_logger->set(l_os_oq_max_bytes, max_bytes); + + if (handle) + handle->suspend_tp_timeout(); + if (throttle_ops.should_wait(1) || + (throttle_bytes.get_current() // let single large ops through! + && throttle_bytes.should_wait(o->bytes))) { + dout(2) << "waiting " << throttle_ops.get_current() + 1 << " > " << max_ops << " ops || " + << throttle_bytes.get_current() + o->bytes << " > " << max_bytes << dendl; + } + throttle_ops.get(); + throttle_bytes.get(o->bytes); + if (handle) + handle->reset_tp_timeout(); + + perf_logger->set(l_os_oq_ops, throttle_ops.get_current()); + perf_logger->set(l_os_oq_bytes, throttle_bytes.get_current()); +} + +void KeyValueStore::op_queue_release_throttle(Op *o) +{ + throttle_ops.put(); + throttle_bytes.put(o->bytes); + perf_logger->set(l_os_oq_ops, throttle_ops.get_current()); + perf_logger->set(l_os_oq_bytes, throttle_bytes.get_current()); +} + +void KeyValueStore::_do_op(OpSequencer *osr, ThreadPool::TPHandle &handle) +{ + // FIXME: Suppose the collection of transaction only affect objects in the + // one PG, so this lock will ensure no other concurrent write operation + osr->apply_lock.Lock(); + Op *o = osr->peek_queue(); + dout(5) << "_do_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << " start" << dendl; + int r = _do_transactions(o->tls, o->op, &handle); + dout(10) << "_do_op " << o << " seq " << o->op << " r = " << r + << ", finisher " << o->onreadable << " " << o->onreadable_sync << dendl; + + if (o->ondisk) { + if (r < 0) { + delete o->ondisk; + o->ondisk = 0; + } else { + ondisk_finisher.queue(o->ondisk, r); + } + } +} + +void KeyValueStore::_finish_op(OpSequencer *osr) +{ + list to_queue; + Op *o = osr->dequeue(&to_queue); + + utime_t lat = ceph_clock_now(g_ceph_context); + lat -= o->start; + + dout(10) << "_finish_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << " lat " << lat << dendl; + osr->apply_lock.Unlock(); // locked in _do_op + op_queue_release_throttle(o); + + perf_logger->tinc(l_os_commit_lat, lat); + perf_logger->tinc(l_os_apply_lat, lat); + + if (o->onreadable_sync) { + o->onreadable_sync->complete(0); + } + if (o->onreadable) + op_finisher.queue(o->onreadable); + if (!to_queue.empty()) + op_finisher.queue(to_queue); + delete o; +} + +// Combine all the ops in the same transaction using "BufferTransaction" and +// cache the middle results in order to make visible to the following ops. +// +// Lock: KeyValueStore use "in_use" in GenericObjectMap to avoid concurrent +// operation on the same object. Not sure ReadWrite lock should be applied to +// improve concurrent performance. In the future, I'd like to remove apply_lock +// on "osr" and introduce PG RWLock. +int KeyValueStore::_do_transactions(list &tls, uint64_t op_seq, + ThreadPool::TPHandle *handle) +{ + int r = 0; + int trans_num = 0; + BufferTransaction bt(this); + + for (list::iterator p = tls.begin(); + p != tls.end(); + ++p, trans_num++) { + r = _do_transaction(**p, bt, handle); + if (r < 0) + break; + if (handle) + handle->reset_tp_timeout(); + } + + r = bt.submit_transaction(); + if (r < 0) { + assert(0 == "unexpected error"); // FIXME + } + + return r; +} + +unsigned KeyValueStore::_do_transaction(Transaction& transaction, + BufferTransaction &t, + ThreadPool::TPHandle *handle) +{ + dout(10) << "_do_transaction on " << &transaction << dendl; + + Transaction::iterator i = transaction.begin(); + uint64_t op_num = 0; + bool exist_clone = false; + + while (i.have_op()) { + if (handle) + handle->reset_tp_timeout(); + + Transaction::Op *op = i.decode_op(); + int r = 0; + + switch (op->op) { + case Transaction::OP_NOP: + break; + + case Transaction::OP_TOUCH: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + r = _touch(cid, oid, t); + } + break; + + case Transaction::OP_WRITE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + uint64_t off = op->off; + uint64_t len = op->len; + uint32_t fadvise_flags = i.get_fadvise_flags(); + bufferlist bl; + i.decode_bl(bl); + r = _write(cid, oid, off, len, bl, t, fadvise_flags); + } + break; + + case Transaction::OP_ZERO: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + uint64_t off = op->off; + uint64_t len = op->len; + r = _zero(cid, oid, off, len, t); + } + break; + + case Transaction::OP_TRIMCACHE: + { + // deprecated, no-op + } + break; + + case Transaction::OP_TRUNCATE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + uint64_t off = op->off; + r = _truncate(cid, oid, off, t); + } + break; + + case Transaction::OP_REMOVE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + r = _remove(cid, oid, t); + } + break; + + case Transaction::OP_SETATTR: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + string name = i.decode_string(); + bufferlist bl; + i.decode_bl(bl); + map to_set; + to_set[name] = bufferptr(bl.c_str(), bl.length()); + r = _setattrs(cid, oid, to_set, t); + if (r == -ENOSPC) + dout(0) << " ENOSPC on setxattr on " << cid << "/" << oid + << " name " << name << " size " << bl.length() << dendl; + } + break; + + case Transaction::OP_SETATTRS: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + map aset; + i.decode_attrset(aset); + r = _setattrs(cid, oid, aset, t); + if (r == -ENOSPC) + dout(0) << " ENOSPC on setxattrs on " << cid << "/" << oid << dendl; + } + break; + + case Transaction::OP_RMATTR: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + string name = i.decode_string(); + r = _rmattr(cid, oid, name.c_str(), t); + } + break; + + case Transaction::OP_RMATTRS: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + r = _rmattrs(cid, oid, t); + } + break; + + case Transaction::OP_CLONE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + ghobject_t noid = i.get_oid(op->dest_oid); + exist_clone = true; + r = _clone(cid, oid, noid, t); + } + break; + + case Transaction::OP_CLONERANGE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + ghobject_t noid = i.get_oid(op->dest_oid); + uint64_t off = op->off; + uint64_t len = op->len; + exist_clone = true; + r = _clone_range(cid, oid, noid, off, len, off, t); + } + break; + + case Transaction::OP_CLONERANGE2: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + ghobject_t noid = i.get_oid(op->dest_oid); + uint64_t srcoff = op->off; + uint64_t len = op->len; + uint64_t dstoff = op->dest_off; + exist_clone = true; + r = _clone_range(cid, oid, noid, srcoff, len, dstoff, t); + } + break; + + case Transaction::OP_MKCOLL: + { + coll_t cid = i.get_cid(op->cid); + r = _create_collection(cid, t); + } + break; + + case Transaction::OP_COLL_HINT: + { + coll_t cid = i.get_cid(op->cid); + uint32_t type = op->hint_type; + bufferlist hint; + i.decode_bl(hint); + bufferlist::iterator hiter = hint.begin(); + if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) { + uint32_t pg_num; + uint64_t num_objs; + ::decode(pg_num, hiter); + ::decode(num_objs, hiter); + r = _collection_hint_expected_num_objs(cid, pg_num, num_objs); + } else { + // Ignore the hint + dout(10) << "Unrecognized collection hint type: " << type << dendl; + } + } + break; + + case Transaction::OP_RMCOLL: + { + coll_t cid = i.get_cid(op->cid); + r = _destroy_collection(cid, t); + } + break; + + case Transaction::OP_COLL_ADD: + { + coll_t ocid = i.get_cid(op->cid); + coll_t ncid = i.get_cid(op->dest_cid); + ghobject_t oid = i.get_oid(op->oid); + r = _collection_add(ncid, ocid, oid, t); + } + break; + + case Transaction::OP_COLL_REMOVE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + r = _remove(cid, oid, t); + } + break; + + case Transaction::OP_COLL_MOVE: + { + // WARNING: this is deprecated and buggy; only here to replay old journals. + coll_t ocid = i.get_cid(op->cid); + coll_t ncid = i.get_cid(op->dest_cid); + ghobject_t oid = i.get_oid(op->oid); + r = _collection_move_rename(ocid, oid, ncid, oid, t); + } + break; + + case Transaction::OP_COLL_MOVE_RENAME: + { + coll_t oldcid = i.get_cid(op->cid); + ghobject_t oldoid = i.get_oid(op->oid); + coll_t newcid = i.get_cid(op->dest_cid); + ghobject_t newoid = i.get_oid(op->dest_oid); + r = _collection_move_rename(oldcid, oldoid, newcid, newoid, t); + } + break; + + case Transaction::OP_COLL_SETATTR: + case Transaction::OP_COLL_RMATTR: + assert(0 == "coll attrs no longer supported"); + break; + + case Transaction::OP_STARTSYNC: + { + start_sync(); + break; + } + + case Transaction::OP_COLL_RENAME: + { + assert(0 == "not implemented"); + } + break; + + case Transaction::OP_OMAP_CLEAR: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + r = _omap_clear(cid, oid, t); + } + break; + case Transaction::OP_OMAP_SETKEYS: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + map aset; + i.decode_attrset(aset); + r = _omap_setkeys(cid, oid, aset, t); + } + break; + case Transaction::OP_OMAP_RMKEYS: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + set keys; + i.decode_keyset(keys); + r = _omap_rmkeys(cid, oid, keys, t); + } + break; + case Transaction::OP_OMAP_RMKEYRANGE: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + string first, last; + first = i.decode_string(); + last = i.decode_string(); + r = _omap_rmkeyrange(cid, oid, first, last, t); + } + break; + case Transaction::OP_OMAP_SETHEADER: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + bufferlist bl; + i.decode_bl(bl); + r = _omap_setheader(cid, oid, bl, t); + } + break; + case Transaction::OP_SPLIT_COLLECTION: + { + coll_t cid = i.get_cid(op->cid); + uint32_t bits = op->split_bits; + uint32_t rem = op->split_rem; + coll_t dest = i.get_cid(op->dest_cid); + r = _split_collection_create(cid, bits, rem, dest, t); + } + break; + case Transaction::OP_SPLIT_COLLECTION2: + { + coll_t cid = i.get_cid(op->cid); + uint32_t bits = op->split_bits; + uint32_t rem = op->split_rem; + coll_t dest = i.get_cid(op->dest_cid); + r = _split_collection(cid, bits, rem, dest, t); + } + break; + + case Transaction::OP_SETALLOCHINT: + { + coll_t cid = i.get_cid(op->cid); + ghobject_t oid = i.get_oid(op->oid); + uint64_t expected_object_size = op->expected_object_size; + uint64_t expected_write_size = op->expected_write_size; + r = _set_alloc_hint(cid, oid, expected_object_size, + expected_write_size, t); + } + break; + + default: + derr << "bad op " << op->op << dendl; + assert(0); + } + + if (r < 0) { + bool ok = false; + + if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE || + op->op == Transaction::OP_CLONE || + op->op == Transaction::OP_CLONERANGE2 || + op->op == Transaction::OP_COLL_ADD)) + // -ENOENT is normally okay + // ...including on a replayed OP_RMCOLL with checkpoint mode + ok = true; + if (r == -ENODATA) + ok = true; + + if (!ok) { + const char *msg = "unexpected error code"; + + if (exist_clone) { + dout(0) << "BUG: clone failed will lead to paritial transaction applied" << dendl; + } + + if (r == -ENOSPC) + // For now, if we hit _any_ ENOSPC, crash, before we do any damage + // by partially applying transactions. + msg = "ENOSPC handling not implemented"; + + if (r == -ENOTEMPTY) { + msg = "ENOTEMPTY suggests garbage data in osd data dir"; + } + + dout(0) << " error " << cpp_strerror(r) << " not handled on operation " + << op->op << " op " << op_num << ", counting from 0)" << dendl; + dout(0) << msg << dendl; + dout(0) << " transaction dump:\n"; + JSONFormatter f(true); + f.open_object_section("transaction"); + transaction.dump(&f); + f.close_section(); + f.flush(*_dout); + *_dout << dendl; + assert(0 == "unexpected error"); + + if (r == -EMFILE) { + dump_open_fds(g_ceph_context); + } + } + } + + op_num++; + } + + return 0; // FIXME count errors +} + + +// =========== KeyValueStore Op Implementation ============== +// objects + +bool KeyValueStore::exists(coll_t cid, const ghobject_t& oid) +{ + dout(10) << __func__ << "collection: " << cid << " object: " << oid + << dendl; + int r; + StripObjectMap::StripObjectHeaderRef header; + + r = backend->lookup_strip_header(cid, oid, &header); + if (r < 0) { + return false; + } + + return true; +} + +int KeyValueStore::stat(coll_t cid, const ghobject_t& oid, + struct stat *st, bool allow_eio) +{ + dout(10) << "stat " << cid << "/" << oid << dendl; + + StripObjectMap::StripObjectHeaderRef header; + + int r = backend->lookup_strip_header(cid, oid, &header); + if (r < 0) { + dout(10) << "stat " << cid << "/" << oid << "=" << r << dendl; + return -ENOENT; + } + + st->st_blocks = header->max_size / header->strip_size; + if (header->max_size % header->strip_size) + st->st_blocks++; + st->st_nlink = 1; + st->st_size = header->max_size; + st->st_blksize = header->strip_size; + + return r; +} + +int KeyValueStore::_generic_read(StripObjectMap::StripObjectHeaderRef header, + uint64_t offset, size_t len, bufferlist& bl, + bool allow_eio, BufferTransaction *bt) +{ + if (header->max_size < offset) { + dout(10) << __func__ << " " << header->cid << "/" << header->oid << ")" + << " offset exceed the length of bl"<< dendl; + return 0; + } + + if (len == 0) + len = header->max_size - offset; + + if (offset + len > header->max_size) + len = header->max_size - offset; + + vector extents; + StripObjectMap::file_to_extents(offset, len, header->strip_size, + extents); + map out; + set keys; + pair uid = make_pair(header->cid, header->oid); + + for (vector::iterator iter = extents.begin(); + iter != extents.end(); ++iter) { + bufferlist old; + string key = strip_object_key(iter->no); + + map< pair, map, bufferlist> >::iterator obj_it; + if ( bt ){ + obj_it = bt->buffers.find(uid); + } + if ( bt && obj_it != bt->buffers.end() && obj_it->second.count(make_pair(OBJECT_STRIP_PREFIX, key))) { + // use strip_header buffer + assert(header->bits[iter->no]); + out[key] = obj_it->second[make_pair(OBJECT_STRIP_PREFIX, key)]; + }else if (header->bits[iter->no]) { + keys.insert(key); + } + } + + + int r = backend->get_values_with_header(header, OBJECT_STRIP_PREFIX, keys, &out); + r = check_get_rc(header->cid, header->oid, r, out.size() == keys.size()); + if (r < 0) + return r; + + for (vector::iterator iter = extents.begin(); + iter != extents.end(); ++iter) { + string key = strip_object_key(iter->no); + + if (header->bits[iter->no]) { + if (iter->len == header->strip_size) { + bl.claim_append(out[key]); + } else { + out[key].copy(iter->offset, iter->len, bl); + } + } else { + bl.append_zero(iter->len); + } + } + + dout(10) << __func__ << " " << header->cid << "/" << header->oid << " " + << offset << "~" << bl.length() << "/" << len << " r = " << r + << dendl; + + return bl.length(); +} + + +int KeyValueStore::read(coll_t cid, const ghobject_t& oid, uint64_t offset, + size_t len, bufferlist& bl, uint32_t op_flags, + bool allow_eio) +{ + dout(15) << __func__ << " " << cid << "/" << oid << " " << offset << "~" + << len << dendl; + + StripObjectMap::StripObjectHeaderRef header; + + int r = backend->lookup_strip_header(cid, oid, &header); + + if (r < 0) { + dout(10) << __func__ << " " << cid << "/" << oid << " " << offset << "~" + << len << " header isn't exist: r = " << r << dendl; + return r; + } + + return _generic_read(header, offset, len, bl, allow_eio); +} + +int KeyValueStore::fiemap(coll_t cid, const ghobject_t& oid, + uint64_t offset, size_t len, bufferlist& bl) +{ + dout(10) << __func__ << " " << cid << " " << oid << " " << offset << "~" + << len << dendl; + int r; + StripObjectMap::StripObjectHeaderRef header; + + r = backend->lookup_strip_header(cid, oid, &header); + if (r < 0) { + dout(10) << "fiemap " << cid << "/" << oid << " " << offset << "~" << len + << " failed to get header: r = " << r << dendl; + return r; + } + + vector extents; + StripObjectMap::file_to_extents(offset, len, header->strip_size, + extents); + + map m; + for (vector::iterator iter = extents.begin(); + iter != extents.end(); ++iter) { + if (header->bits[iter->no]) { + uint64_t off = iter->no * header->strip_size + iter->offset; + m[off] = iter->len; + } + } + ::encode(m, bl); + return 0; +} + +int KeyValueStore::_remove(coll_t cid, const ghobject_t& oid, + BufferTransaction &t) +{ + dout(15) << __func__ << " " << cid << "/" << oid << dendl; + + int r; + StripObjectMap::StripObjectHeaderRef header; + + r = t.lookup_cached_header(cid, oid, &header, false); + if (r < 0) { + dout(10) << __func__ << " " << cid << "/" << oid << " " + << " failed to get header: r = " << r << dendl; + return r; + } + + header->max_size = 0; + header->bits.clear(); + header->updated = true; + r = t.clear_buffer(header); + + dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl; + return r; +} + +int KeyValueStore::_truncate(coll_t cid, const ghobject_t& oid, uint64_t size, + BufferTransaction &t) +{ + dout(15) << __func__ << " " << cid << "/" << oid << " size " << size + << dendl; + + int r; + StripObjectMap::StripObjectHeaderRef header; + + r = t.lookup_cached_header(cid, oid, &header, false); + if (r < 0) { + dout(10) << __func__ << " " << cid << "/" << oid << " " << size + << " failed to get header: r = " << r << dendl; + return r; + } + + if (header->max_size == size) + return 0; + + if (header->max_size > size) { + vector extents; + StripObjectMap::file_to_extents(size, header->max_size-size, + header->strip_size, extents); + assert(extents.size()); + + vector::iterator iter = extents.begin(); + if (header->bits[iter->no] && iter->offset != 0) { + bufferlist value; + map values; + set lookup_keys; + string key = strip_object_key(iter->no); + + lookup_keys.insert(key); + r = t.get_buffer_keys(header, OBJECT_STRIP_PREFIX, + lookup_keys, &values); + r = check_get_rc(cid, oid, r, lookup_keys.size() == values.size()); + if (r < 0) + return r; + + values[key].copy(0, iter->offset, value); + value.append_zero(header->strip_size-iter->offset); + assert(value.length() == header->strip_size); + value.swap(values[key]); + + t.set_buffer_keys(header, OBJECT_STRIP_PREFIX, values); + ++iter; + } + + set keys; + for (; iter != extents.end(); ++iter) { + if (header->bits[iter->no]) { + keys.insert(strip_object_key(iter->no)); + header->bits[iter->no] = 0; + } + } + r = t.remove_buffer_keys(header, OBJECT_STRIP_PREFIX, keys); + if (r < 0) { + dout(10) << __func__ << " " << cid << "/" << oid << " " + << size << " = " << r << dendl; + return r; + } + } + + header->bits.resize(size/header->strip_size+1); + header->max_size = size; + header->updated = true; + + dout(10) << __func__ << " " << cid << "/" << oid << " size " << size << " = " + << r << dendl; + return r; +} + +int KeyValueStore::_touch(coll_t cid, const ghobject_t& oid, + BufferTransaction &t) +{ + dout(15) << __func__ << " " << cid << "/" << oid << dendl; + + int r; + StripObjectMap::StripObjectHeaderRef header; + + r = t.lookup_cached_header(cid, oid, &header, true); + if (r < 0) { + dout(10) << __func__ << " " << cid << "/" << oid << " " + << " failed to get header: r = " << r << dendl; + r = -EINVAL; + return r; + } + + dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl; + return r; +} + +int KeyValueStore::_generic_write(StripObjectMap::StripObjectHeaderRef header, + uint64_t offset, size_t len, + const bufferlist& bl, BufferTransaction &t, + uint32_t fadvise_flags) +{ + if (len > bl.length()) + len = bl.length(); + + if (len + offset > header->max_size) { + header->max_size = len + offset; + header->bits.resize(header->max_size/header->strip_size+1); + header->updated = true; + } + + vector extents; + StripObjectMap::file_to_extents(offset, len, header->strip_size, + extents); + + map out; + set keys; + for (vector::iterator iter = extents.begin(); + iter != extents.end(); ++iter) { + if (header->bits[iter->no] && !(iter->offset == 0 && + iter->len == header->strip_size)) + keys.insert(strip_object_key(iter->no)); + } + + int r = t.get_buffer_keys(header, OBJECT_STRIP_PREFIX, keys, &out); + r = check_get_rc(header->cid, header->oid, r, keys.size() == out.size()); + if (r < 0) + return r; + + uint64_t bl_offset = 0; + map values; + for (vector::iterator iter = extents.begin(); + iter != extents.end(); ++iter) { + bufferlist value; + string key = strip_object_key(iter->no); + if (header->bits[iter->no]) { + if (iter->offset == 0 && iter->len == header->strip_size) { + bl.copy(bl_offset, iter->len, value); + bl_offset += iter->len; + } else { + assert(out[key].length() == header->strip_size); + + out[key].copy(0, iter->offset, value); + bl.copy(bl_offset, iter->len, value); + bl_offset += iter->len; + + if (value.length() != header->strip_size) + out[key].copy(value.length(), header->strip_size-value.length(), + value); + } + } else { + if (iter->offset) + value.append_zero(iter->offset); + bl.copy(bl_offset, iter->len, value); + bl_offset += iter->len; + + if (value.length() < header->strip_size) + value.append_zero(header->strip_size-value.length()); + + header->bits[iter->no] = 1; + header->updated = true; + } + assert(value.length() == header->strip_size); + values[key].swap(value); + } + assert(bl_offset == len); + + t.set_buffer_keys(header, OBJECT_STRIP_PREFIX, values); + dout(10) << __func__ << " " << header->cid << "/" << header->oid << " " + << offset << "~" << len << " = " << r << dendl; + + return r; +} + +int KeyValueStore::_write(coll_t cid, const ghobject_t& oid, + uint64_t offset, size_t len, const bufferlist& bl, + BufferTransaction &t, uint32_t fadvise_flags) +{ + dout(15) << __func__ << " " << cid << "/" << oid << " " << offset << "~" + << len << dendl; + + int r; + StripObjectMap::StripObjectHeaderRef header; + + r = t.lookup_cached_header(cid, oid, &header, true); + if (r < 0) { + dout(10) << __func__ << " " << cid << "/" << oid << " " << offset + << "~" << len << " failed to get header: r = " << r << dendl; + return r; + } + + return _generic_write(header, offset, len, bl, t, fadvise_flags); +} + +int KeyValueStore::_zero(coll_t cid, const ghobject_t& oid, uint64_t offset, + size_t len, BufferTransaction &t) +{ + dout(15) << __func__ << " " << cid << "/" << oid << " " << offset << "~" << len << dendl; + int r; + StripObjectMap::StripObjectHeaderRef header; + + r = t.lookup_cached_header(cid, oid, &header, true); + if (r < 0) { + dout(10) << __func__ << " " << cid << "/" << oid << " " << offset + << "~" << len << " failed to get header: r = " << r << dendl; + return r; + } + + if (len + offset > header->max_size) { + header->max_size = len + offset; + header->bits.resize(header->max_size/header->strip_size+1); + header->updated = true; + } + + vector extents; + StripObjectMap::file_to_extents(offset, len, header->strip_size, + extents); + set rm_keys; + set lookup_keys; + map values; + map > off_len; + for (vector::iterator iter = extents.begin(); + iter != extents.end(); ++iter) { + string key = strip_object_key(iter->no); + if (header->bits[iter->no]) { + if (iter->offset == 0 && iter->len == header->strip_size) { + rm_keys.insert(key); + header->bits[iter->no] = 0; + header->updated = true; + } else { + lookup_keys.insert(key); + off_len[key] = make_pair(iter->offset, iter->len); + } + } + } + r = t.get_buffer_keys(header, OBJECT_STRIP_PREFIX, + lookup_keys, &values); + r = check_get_rc(header->cid, header->oid, r, lookup_keys.size() == values.size()); + if (r < 0) + return r; + for(set::iterator it = lookup_keys.begin(); it != lookup_keys.end(); ++it) + { + pair p = off_len[*it]; + values[*it].zero(p.first, p.second); + } + t.set_buffer_keys(header, OBJECT_STRIP_PREFIX, values); + t.remove_buffer_keys(header, OBJECT_STRIP_PREFIX, rm_keys); + dout(10) << __func__ << " " << cid << "/" << oid << " " << offset << "~" + << len << " = " << r << dendl; + return r; +} + +int KeyValueStore::_clone(coll_t cid, const ghobject_t& oldoid, + const ghobject_t& newoid, BufferTransaction &t) +{ + dout(15) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/" + << newoid << dendl; + + if (oldoid == newoid) + return 0; + + int r; + StripObjectMap::StripObjectHeaderRef old_header; + + r = t.lookup_cached_header(cid, oldoid, &old_header, false); + if (r < 0) { + dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/" + << newoid << " = " << r << dendl; + return r; + } + + t.clone_buffer(old_header, cid, newoid); + + dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/" + << newoid << " = " << r << dendl; + return r; +} + +int KeyValueStore::_clone_range(coll_t cid, const ghobject_t& oldoid, + const ghobject_t& newoid, uint64_t srcoff, + uint64_t len, uint64_t dstoff, + BufferTransaction &t) +{ + dout(15) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/" + << newoid << " " << srcoff << "~" << len << " to " << dstoff + << dendl; + + int r; + bufferlist bl; + + StripObjectMap::StripObjectHeaderRef old_header, new_header; + + r = t.lookup_cached_header(cid, oldoid, &old_header, false); + if (r < 0) { + dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/" + << newoid << " " << srcoff << "~" << len << " to " << dstoff + << " header isn't exist: r = " << r << dendl; + return r; + } + + r = t.lookup_cached_header(cid, newoid, &new_header, true); + if (r < 0) { + dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/" + << newoid << " " << srcoff << "~" << len << " to " << dstoff + << " can't create header: r = " << r << dendl; + return r; + } + + r = _generic_read(old_header, srcoff, len, bl, &t); + if (r < 0) + goto out; + + r = _generic_write(new_header, dstoff, len, bl, t); + + out: + dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/" + << newoid << " " << srcoff << "~" << len << " to " << dstoff + << " = " << r << dendl; + return r; +} + +// attrs + +int KeyValueStore::getattr(coll_t cid, const ghobject_t& oid, const char *name, + bufferptr &bp) +{ + dout(15) << __func__ << " " << cid << "/" << oid << " '" << name << "'" + << dendl; + + int r; + map got; + set to_get; + StripObjectMap::StripObjectHeaderRef header; + + to_get.insert(string(name)); + + r = backend->lookup_strip_header(cid, oid, &header); + if (r < 0) { + dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl; + return r; + } + + r = backend->get_values_with_header(header, OBJECT_XATTR, to_get, &got); + if (r < 0 && r != -ENOENT) { + dout(10) << __func__ << " get_xattrs err r =" << r << dendl; + goto out; + } + if (got.empty()) { + dout(10) << __func__ << " got.size() is 0" << dendl; + return -ENODATA; + } + bp = bufferptr(got.begin()->second.c_str(), + got.begin()->second.length()); + r = 0; + + out: + dout(10) << __func__ << " " << cid << "/" << oid << " '" << name << "' = " + << r << dendl; + return r; +} + +int KeyValueStore::getattrs(coll_t cid, const ghobject_t& oid, + map& aset) +{ + map attr_aset; + int r; + StripObjectMap::StripObjectHeaderRef header; + + r = backend->lookup_strip_header(cid, oid, &header); + if (r < 0) { + dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl; + return r; + } + + r = backend->get_with_header(header, OBJECT_XATTR, &attr_aset); + if (r < 0) { + dout(10) << __func__ << " could not get attrs r = " << r << dendl; + goto out; + } + + for (map::iterator i = attr_aset.begin(); + i != attr_aset.end(); ++i) { + string key; + key = i->first; + aset.insert(make_pair(key, + bufferptr(i->second.c_str(), i->second.length()))); + } + + out: + dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl; + + return r; +} + +int KeyValueStore::_setattrs(coll_t cid, const ghobject_t& oid, + map& aset, + BufferTransaction &t) +{ + dout(15) << __func__ << " " << cid << "/" << oid << dendl; + + int r; + + StripObjectMap::StripObjectHeaderRef header; + map attrs; + + r = t.lookup_cached_header(cid, oid, &header, false); + if (r < 0) + goto out; + + for (map::iterator it = aset.begin(); + it != aset.end(); ++it) { + attrs[it->first].push_back(it->second); + } + + t.set_buffer_keys(header, OBJECT_XATTR, attrs); + +out: + dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl; + return r; +} + + +int KeyValueStore::_rmattr(coll_t cid, const ghobject_t& oid, const char *name, + BufferTransaction &t) +{ + dout(15) << __func__ << " " << cid << "/" << oid << " '" << name << "'" + << dendl; + + int r; + set to_remove; + StripObjectMap::StripObjectHeaderRef header; + + r = t.lookup_cached_header(cid, oid, &header, false); + if (r < 0) { + dout(10) << __func__ << " could not find header r = " << r + << dendl; + return r; + } + + to_remove.insert(string(name)); + r = t.remove_buffer_keys(header, OBJECT_XATTR, to_remove); + + dout(10) << __func__ << " " << cid << "/" << oid << " '" << name << "' = " + << r << dendl; + return r; +} + +int KeyValueStore::_rmattrs(coll_t cid, const ghobject_t& oid, + BufferTransaction &t) +{ + dout(15) << __func__ << " " << cid << "/" << oid << dendl; + + int r; + set attrs; + + StripObjectMap::StripObjectHeaderRef header; + + r = t.lookup_cached_header(cid, oid, &header, false); + if (r < 0) { + dout(10) << __func__ << " could not find header r = " << r + << dendl; + return r; + } + + r = backend->get_keys_with_header(header, OBJECT_XATTR, &attrs); + if (r < 0) { + dout(10) << __func__ << " could not get attrs r = " << r << dendl; + return r; + } + + r = t.remove_buffer_keys(header, OBJECT_XATTR, attrs); + t.clear_buffer_keys(header, OBJECT_XATTR); + + dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl; + return r; +} + + +// collections + +int KeyValueStore::_create_collection(coll_t c, BufferTransaction &t) +{ + dout(15) << __func__ << " " << c << dendl; + int r = 0; + bufferlist bl; + + RWLock::WLocker l(collections_lock); + if (collections.count(c)) { + r = -EEXIST; + goto out; + } + + collections.insert(c); + t.set_collections(collections); + + out: + dout(10) << __func__ << " cid " << c << " r = " << r << dendl; + return r; +} + +int KeyValueStore::_destroy_collection(coll_t c, BufferTransaction &t) +{ + dout(15) << __func__ << " " << c << dendl; + + int r; + uint64_t modified_object = 0; + vector oids; + bufferlist bl; + + { + RWLock::RLocker l(collections_lock); + if (!collections.count(c)) { + r = -ENOENT; + goto out; + } + } + + // All modified objects are marked deleted + for (BufferTransaction::StripHeaderMap::iterator iter = t.strip_headers.begin(); + iter != t.strip_headers.end(); ++iter) { + // sum the total modified object in this PG + if (iter->first.first != c) + continue; + + modified_object++; + if (!iter->second->deleted) { + r = -ENOTEMPTY; + goto out; + } + } + + r = backend->list_objects(c, ghobject_t(), ghobject_t::get_max(), modified_object+1, &oids, + 0); + // No other object + if (oids.size() != modified_object && oids.size() != 0) { + r = -ENOTEMPTY; + goto out; + } + + for (vector::iterator iter = oids.begin(); + iter != oids.end(); ++iter) { + if (!t.strip_headers.count(make_pair(c, *iter))) { + r = -ENOTEMPTY; + goto out; + } + } + + { + RWLock::WLocker l(collections_lock); + collections.erase(c); + t.set_collections(collections); + } + r = 0; + +out: + dout(10) << __func__ << " " << c << " = " << r << dendl; + return r; +} + + +int KeyValueStore::_collection_add(coll_t c, coll_t oldcid, + const ghobject_t& o, + BufferTransaction &t) +{ + dout(15) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" + << o << dendl; + + bufferlist bl; + StripObjectMap::StripObjectHeaderRef header, old_header; + + int r = t.lookup_cached_header(oldcid, o, &old_header, false); + if (r < 0) { + goto out; + } + + r = t.lookup_cached_header(c, o, &header, false); + if (r == 0) { + r = -EEXIST; + dout(10) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" + << o << " already exist " << dendl; + goto out; + } + + r = _generic_read(old_header, 0, old_header->max_size, bl, &t); + if (r < 0) { + r = -EINVAL; + goto out; + } + + r = _generic_write(header, 0, bl.length(), bl, t); + if (r < 0) { + r = -EINVAL; + } + +out: + dout(10) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" + << o << " = " << r << dendl; + return r; +} + +int KeyValueStore::_collection_move_rename(coll_t oldcid, + const ghobject_t& oldoid, + coll_t c, const ghobject_t& o, + BufferTransaction &t) +{ + dout(15) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" + << oldoid << dendl; + int r; + StripObjectMap::StripObjectHeaderRef header; + + r = t.lookup_cached_header(c, o, &header, false); + if (r == 0) { + dout(10) << __func__ << " " << oldcid << "/" << oldoid << " -> " << c + << "/" << o << " = " << r << dendl; + return -EEXIST; + } + + r = t.lookup_cached_header(oldcid, oldoid, &header, false); + if (r < 0) { + dout(10) << __func__ << " " << oldcid << "/" << oldoid << " -> " << c + << "/" << o << " = " << r << dendl; + return r; + } + + t.rename_buffer(header, c, o); + + dout(10) << __func__ << " " << c << "/" << o << " from " << oldcid << "/" + << oldoid << " = " << r << dendl; + return r; +} + +int KeyValueStore::_collection_remove_recursive(const coll_t &cid, + BufferTransaction &t) +{ + dout(15) << __func__ << " " << cid << dendl; + int r = 0; + + { + RWLock::RLocker l(collections_lock); + if (collections.count(cid) == 0) + return -ENOENT; + } + + vector objects; + ghobject_t max; + while (!max.is_max()) { + r = collection_list(cid, max, ghobject_t::get_max(), true, 300, &objects, &max); + if (r < 0) + goto out; + + for (vector::iterator i = objects.begin(); + i != objects.end(); ++i) { + r = _remove(cid, *i, t); + if (r < 0) + goto out; + } + } + + { + RWLock::WLocker l(collections_lock); + collections.erase(cid); + t.set_collections(collections); + } + + out: + dout(10) << __func__ << " " << cid << " r = " << r << dendl; + return r; +} + +int KeyValueStore::list_collections(vector& ls) +{ + dout(10) << __func__ << " " << dendl; + RWLock::RLocker l(collections_lock); + for (set::iterator p = collections.begin(); p != collections.end(); + ++p) { + ls.push_back(*p); + } + return 0; +} + +bool KeyValueStore::collection_exists(coll_t c) +{ + dout(10) << __func__ << " " << dendl; + RWLock::RLocker l(collections_lock); + return collections.count(c); +} + +bool KeyValueStore::collection_empty(coll_t c) +{ + dout(10) << __func__ << " " << dendl; + + vector oids; + backend->list_objects(c, ghobject_t(), ghobject_t::get_max(), 1, &oids, 0); + + return oids.empty(); +} + +int KeyValueStore::collection_list(coll_t c, ghobject_t start, + ghobject_t end, bool sort_bitwise, int max, + vector *ls, ghobject_t *next) +{ + if (!sort_bitwise) + return -EOPNOTSUPP; + + if (max < 0) + return -EINVAL; + + if (start.is_max()) + return 0; + + int r = backend->list_objects(c, start, end, max, ls, next); + return r; +} + +int KeyValueStore::collection_version_current(coll_t c, uint32_t *version) +{ + *version = COLLECTION_VERSION; + if (*version == target_version) + return 1; + else + return 0; +} + +// omap + +int KeyValueStore::omap_get(coll_t c, const ghobject_t &hoid, + bufferlist *bl, map *out) +{ + dout(15) << __func__ << " " << c << "/" << hoid << dendl; + + StripObjectMap::StripObjectHeaderRef header; + + int r = backend->lookup_strip_header(c, hoid, &header); + if (r < 0) { + dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl; + return r; + } + + r = backend->get_with_header(header, OBJECT_OMAP, out); + if (r < 0) { + dout(10) << __func__ << " err r =" << r << dendl; + return r; + } + + set keys; + map got; + + keys.insert(OBJECT_OMAP_HEADER_KEY); + r = backend->get_values_with_header(header, OBJECT_OMAP_HEADER, keys, &got); + if (r < 0 && r != -ENOENT) { + dout(10) << __func__ << " err r =" << r << dendl; + return r; + } + + if (!got.empty()) { + assert(got.size() == 1); + bl->swap(got.begin()->second); + } + + return 0; +} + +int KeyValueStore::omap_get_header(coll_t c, const ghobject_t &hoid, + bufferlist *bl, bool allow_eio) +{ + dout(15) << __func__ << " " << c << "/" << hoid << dendl; + + set keys; + map got; + StripObjectMap::StripObjectHeaderRef header; + + int r = backend->lookup_strip_header(c, hoid, &header); + if (r < 0) { + dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl; + return r; + } + + keys.insert(OBJECT_OMAP_HEADER_KEY); + r = backend->get_values_with_header(header, OBJECT_OMAP_HEADER, keys, &got); + if (r < 0 && r != -ENOENT) { + dout(10) << __func__ << " err r =" << r << dendl; + return r; + } + + if (!got.empty()) { + assert(got.size() == 1); + bl->swap(got.begin()->second); + } + + return 0; +} + +int KeyValueStore::omap_get_keys(coll_t c, const ghobject_t &hoid, set *keys) +{ + dout(15) << __func__ << " " << c << "/" << hoid << dendl; + + StripObjectMap::StripObjectHeaderRef header; + int r = backend->lookup_strip_header(c, hoid, &header); + if (r < 0) { + dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl; + return r; + } + + r = backend->get_keys_with_header(header, OBJECT_OMAP, keys); + if (r < 0) { + return r; + } + return 0; +} + +int KeyValueStore::omap_get_values(coll_t c, const ghobject_t &hoid, + const set &keys, + map *out) +{ + dout(15) << __func__ << " " << c << "/" << hoid << dendl; + + StripObjectMap::StripObjectHeaderRef header; + int r = backend->lookup_strip_header(c, hoid, &header); + if (r < 0) { + dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl; + return r; + } + + r = backend->get_values_with_header(header, OBJECT_OMAP, keys, out); + if (r < 0 && r != -ENOENT) { + return r; + } + return 0; +} + +int KeyValueStore::omap_check_keys(coll_t c, const ghobject_t &hoid, + const set &keys, set *out) +{ + dout(15) << __func__ << " " << c << "/" << hoid << dendl; + + int r = backend->check_keys(c, hoid, OBJECT_OMAP, keys, out); + if (r < 0 && r != -ENOENT) { + return r; + } + return 0; +} + +ObjectMap::ObjectMapIterator KeyValueStore::get_omap_iterator( + coll_t c, const ghobject_t &hoid) +{ + dout(15) << __func__ << " " << c << "/" << hoid << dendl; + return backend->get_iterator(c, hoid, OBJECT_OMAP); +} + +int KeyValueStore::_omap_clear(coll_t cid, const ghobject_t &hoid, + BufferTransaction &t) +{ + dout(15) << __func__ << " " << cid << "/" << hoid << dendl; + + StripObjectMap::StripObjectHeaderRef header; + + int r = t.lookup_cached_header(cid, hoid, &header, false); + if (r < 0) { + dout(10) << __func__ << " " << cid << "/" << hoid << " " + << " failed to get header: r = " << r << dendl; + return r; + } + + set keys; + r = backend->get_keys_with_header(header, OBJECT_OMAP, &keys); + if (r < 0) { + dout(10) << __func__ << " could not get omap_keys r = " << r << dendl; + return r; + } + + r = t.remove_buffer_keys(header, OBJECT_OMAP, keys); + if (r < 0) { + dout(10) << __func__ << " could not remove keys r = " << r << dendl; + return r; + } + + keys.clear(); + keys.insert(OBJECT_OMAP_HEADER_KEY); + r = t.remove_buffer_keys(header, OBJECT_OMAP_HEADER, keys); + if (r < 0) { + dout(10) << __func__ << " could not remove keys r = " << r << dendl; + return r; + } + + t.clear_buffer_keys(header, OBJECT_OMAP_HEADER); + + dout(10) << __func__ << " " << cid << "/" << hoid << " r = " << r << dendl; + return 0; +} + +int KeyValueStore::_omap_setkeys(coll_t cid, const ghobject_t &hoid, + map &aset, + BufferTransaction &t) +{ + dout(15) << __func__ << " " << cid << "/" << hoid << dendl; + + StripObjectMap::StripObjectHeaderRef header; + + int r = t.lookup_cached_header(cid, hoid, &header, false); + if (r < 0) { + dout(10) << __func__ << " " << cid << "/" << hoid << " " + << " failed to get header: r = " << r << dendl; + return r; + } + + t.set_buffer_keys(header, OBJECT_OMAP, aset); + + return 0; +} + +int KeyValueStore::_omap_rmkeys(coll_t cid, const ghobject_t &hoid, + const set &keys, + BufferTransaction &t) +{ + dout(15) << __func__ << " " << cid << "/" << hoid << dendl; + + StripObjectMap::StripObjectHeaderRef header; + + int r = t.lookup_cached_header(cid, hoid, &header, false); + if (r < 0) { + dout(10) << __func__ << " " << cid << "/" << hoid << " " + << " failed to get header: r = " << r << dendl; + return r; + } + + r = t.remove_buffer_keys(header, OBJECT_OMAP, keys); + + dout(10) << __func__ << " " << cid << "/" << hoid << " r = " << r << dendl; + return r; +} + +int KeyValueStore::_omap_rmkeyrange(coll_t cid, const ghobject_t &hoid, + const string& first, const string& last, + BufferTransaction &t) +{ + dout(15) << __func__ << " " << cid << "/" << hoid << " [" << first << "," + << last << "]" << dendl; + + set keys; + { + ObjectMap::ObjectMapIterator iter = get_omap_iterator(cid, hoid); + if (!iter) + return -ENOENT; + + for (iter->lower_bound(first); iter->valid() && iter->key() < last; + iter->next(false)) { + keys.insert(iter->key()); + } + } + return _omap_rmkeys(cid, hoid, keys, t); +} + +int KeyValueStore::_omap_setheader(coll_t cid, const ghobject_t &hoid, + const bufferlist &bl, + BufferTransaction &t) +{ + dout(15) << __func__ << " " << cid << "/" << hoid << dendl; + + map sets; + StripObjectMap::StripObjectHeaderRef header; + + int r = t.lookup_cached_header(cid, hoid, &header, false); + if (r < 0) { + dout(10) << __func__ << " " << cid << "/" << hoid << " " + << " failed to get header: r = " << r << dendl; + return r; + } + + sets[OBJECT_OMAP_HEADER_KEY] = bl; + t.set_buffer_keys(header, OBJECT_OMAP_HEADER, sets); + return 0; +} + +int KeyValueStore::_split_collection(coll_t cid, uint32_t bits, uint32_t rem, + coll_t dest, BufferTransaction &t) +{ + { + dout(15) << __func__ << " " << cid << " bits: " << bits << dendl; + + StripObjectMap::StripObjectHeaderRef header; + + { + RWLock::RLocker l(collections_lock); + if (collections.count(cid) == 0) + return -ENOENT; + if (collections.count(dest) == 0) + return -ENOENT; + } + + vector objects; + ghobject_t next, current; + int move_size = 0; + while (1) { + collection_list(cid, current, ghobject_t::get_max(), true, + get_ideal_list_max(), &objects, &next); + + dout(20) << __func__ << cid << "objects size: " << objects.size() + << dendl; + + if (objects.empty()) + break; + + for (vector::iterator i = objects.begin(); + i != objects.end(); ++i) { + if (i->match(bits, rem)) { + if (_collection_move_rename(cid, *i, dest, *i, t) < 0) { + return -1; + } + move_size++; + } + } + + objects.clear(); + current = next; + } + + dout(20) << __func__ << "move" << move_size << " object from " << cid + << "to " << dest << dendl; + } + + if (g_conf->filestore_debug_verify_split) { + vector objects; + ghobject_t next; + while (1) { + collection_list(cid, next, ghobject_t::get_max(), true, + get_ideal_list_max(), &objects, &next); + if (objects.empty()) + break; + + for (vector::iterator i = objects.begin(); + i != objects.end(); ++i) { + dout(20) << __func__ << ": " << *i << " still in source " + << cid << dendl; + assert(!i->match(bits, rem)); + } + objects.clear(); + } + + next = ghobject_t(); + while (1) { + collection_list(dest, next, ghobject_t::get_max(), true, + get_ideal_list_max(), &objects, &next); + if (objects.empty()) + break; + + for (vector::iterator i = objects.begin(); + i != objects.end(); ++i) { + dout(20) << __func__ << ": " << *i << " now in dest " + << *i << dendl; + assert(i->match(bits, rem)); + } + objects.clear(); + } + } + return 0; +} + +int KeyValueStore::_set_alloc_hint(coll_t cid, const ghobject_t& oid, + uint64_t expected_object_size, + uint64_t expected_write_size, + BufferTransaction &t) +{ + dout(15) << __func__ << " " << cid << "/" << oid << " object_size " + << expected_object_size << " write_size " + << expected_write_size << dendl; + + int r = 0; + StripObjectMap::StripObjectHeaderRef header; + + r = t.lookup_cached_header(cid, oid, &header, false); + if (r < 0) { + dout(10) << __func__ << " " << cid << "/" << oid + << " failed to get header: r = " << r << dendl; + return r; + } + + bool blank = true; + for (vector::iterator it = header->bits.begin(); + it != header->bits.end(); ++it) { + if (*it) { + blank = false; + break; + } + } + + // Now only consider to change "strip_size" when the object is blank, + // because set_alloc_hint is expected to be very lightweight + if (blank) { + // header->strip_size = MIN(expected_write_size, m_keyvaluestore_max_expected_write_size); + // dout(20) << __func__ << " hint " << header->strip_size << " success" << dendl; + } + + dout(10) << __func__ << "" << cid << "/" << oid << " object_size " + << expected_object_size << " write_size " + << expected_write_size << " = " << r << dendl; + + return r; +} + +const char** KeyValueStore::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "keyvaluestore_queue_max_ops", + "keyvaluestore_queue_max_bytes", + "keyvaluestore_default_strip_size", + "keyvaluestore_dump_file", + NULL + }; + return KEYS; +} + +void KeyValueStore::handle_conf_change(const struct md_config_t *conf, + const std::set &changed) +{ + if (changed.count("keyvaluestore_queue_max_ops") || + changed.count("keyvaluestore_queue_max_bytes") || + changed.count("keyvaluestore_max_expected_write_size")) { + m_keyvaluestore_queue_max_ops = conf->keyvaluestore_queue_max_ops; + m_keyvaluestore_queue_max_bytes = conf->keyvaluestore_queue_max_bytes; + m_keyvaluestore_max_expected_write_size = conf->keyvaluestore_max_expected_write_size; + throttle_ops.reset_max(conf->keyvaluestore_queue_max_ops); + throttle_bytes.reset_max(conf->keyvaluestore_queue_max_bytes); + } + if (changed.count("keyvaluestore_default_strip_size")) { + m_keyvaluestore_strip_size = conf->keyvaluestore_default_strip_size; + default_strip_size = m_keyvaluestore_strip_size; + } + if (changed.count("keyvaluestore_dump_file")) { + if (conf->keyvaluestore_dump_file.length() && + conf->keyvaluestore_dump_file != "-") { + dump_start(conf->keyvaluestore_dump_file); + } else { + dump_stop(); + } + } +} + +int KeyValueStore::check_get_rc(const coll_t cid, const ghobject_t& oid, int r, bool is_equal_size) +{ + if (r < 0) { + dout(10) << __func__ << " " << cid << "/" << oid << " " + << " get rc = " << r << dendl; + } else if (!is_equal_size) { + dout(0) << __func__ << " broken header or missing data in backend " + << cid << "/" << oid << " get rc = " << r << dendl; + r = -EBADF; + } + return r; +} + +void KeyValueStore::dump_start(const std::string &file) +{ + dout(10) << "dump_start " << file << dendl; + if (m_keyvaluestore_do_dump) { + dump_stop(); + } + m_keyvaluestore_dump_fmt.reset(); + m_keyvaluestore_dump_fmt.open_array_section("dump"); + m_keyvaluestore_dump.open(file.c_str()); + m_keyvaluestore_do_dump = true; +} + +void KeyValueStore::dump_stop() +{ + dout(10) << "dump_stop" << dendl; + m_keyvaluestore_do_dump = false; + if (m_keyvaluestore_dump.is_open()) { + m_keyvaluestore_dump_fmt.close_section(); + m_keyvaluestore_dump_fmt.flush(m_keyvaluestore_dump); + m_keyvaluestore_dump.flush(); + m_keyvaluestore_dump.close(); + } +} +void KeyValueStore::dump_transactions(list& ls, uint64_t seq, OpSequencer *osr) +{ + m_keyvaluestore_dump_fmt.open_array_section("transactions"); + unsigned trans_num = 0; + for (list::iterator i = ls.begin(); i != ls.end(); ++i, ++trans_num) { + m_keyvaluestore_dump_fmt.open_object_section("transaction"); + m_keyvaluestore_dump_fmt.dump_string("osr", osr->get_name()); + m_keyvaluestore_dump_fmt.dump_unsigned("seq", seq); + m_keyvaluestore_dump_fmt.dump_unsigned("trans_num", trans_num); + (*i)->dump(&m_keyvaluestore_dump_fmt); + m_keyvaluestore_dump_fmt.close_section(); + } + m_keyvaluestore_dump_fmt.close_section(); + m_keyvaluestore_dump_fmt.flush(m_keyvaluestore_dump); + m_keyvaluestore_dump.flush(); +} + + +// -- KVSuperblock -- + +void KVSuperblock::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + compat_features.encode(bl); + ::encode(backend, bl); + ENCODE_FINISH(bl); +} + +void KVSuperblock::decode(bufferlist::iterator &bl) +{ + DECODE_START(1, bl); + compat_features.decode(bl); + ::decode(backend, bl); + DECODE_FINISH(bl); +} + +void KVSuperblock::dump(Formatter *f) const +{ + f->open_object_section("compat"); + compat_features.dump(f); + f->dump_string("backend", backend); + f->close_section(); +} + +void KVSuperblock::generate_test_instances(list& o) +{ + KVSuperblock z; + o.push_back(new KVSuperblock(z)); + CompatSet::FeatureSet feature_compat; + CompatSet::FeatureSet feature_ro_compat; + CompatSet::FeatureSet feature_incompat; + z.compat_features = CompatSet(feature_compat, feature_ro_compat, + feature_incompat); + o.push_back(new KVSuperblock(z)); + z.backend = "rocksdb"; + o.push_back(new KVSuperblock(z)); +} diff --git a/src/os/keyvaluestore/KeyValueStore.h b/src/os/keyvaluestore/KeyValueStore.h new file mode 100644 index 000000000000..ff272f1c247d --- /dev/null +++ b/src/os/keyvaluestore/KeyValueStore.h @@ -0,0 +1,700 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 UnitedStack + * + * Author: Haomai Wang + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_KEYVALUESTORE_H +#define CEPH_KEYVALUESTORE_H + +#include "include/types.h" + +#include +#include +#include +#include +using namespace std; + +#include "include/assert.h" + +#include "os/ObjectStore.h" + +#include "common/WorkQueue.h" +#include "common/Finisher.h" +#include "common/fd.h" + +#include "common/Mutex.h" +#include "GenericObjectMap.h" +#include "kv/KeyValueDB.h" +#include "common/random_cache.hpp" + +#include "include/uuid.h" + +static uint64_t default_strip_size = 1024; + +class StripObjectMap: public GenericObjectMap { + public: + + struct StripExtent { + uint64_t no; + uint64_t offset; // in key + uint64_t len; // in key + StripExtent(uint64_t n, uint64_t off, size_t len): + no(n), offset(off), len(len) {} + }; + + // -- strip object -- + struct StripObjectHeader { + // Persistent state + uint64_t strip_size; + uint64_t max_size; + vector bits; + + // soft state + Header header; // FIXME: Hold lock to avoid concurrent operations, it will + // also block read operation which not should be permitted. + coll_t cid; + ghobject_t oid; + bool updated; + bool deleted; + + StripObjectHeader(): strip_size(default_strip_size), max_size(0), updated(false), deleted(false) {} + + void encode(bufferlist &bl) const { + ENCODE_START(1, 1, bl); + ::encode(strip_size, bl); + ::encode(max_size, bl); + ::encode(bits, bl); + ENCODE_FINISH(bl); + } + + void decode(bufferlist::iterator &bl) { + DECODE_START(1, bl); + ::decode(strip_size, bl); + ::decode(max_size, bl); + ::decode(bits, bl); + DECODE_FINISH(bl); + } + }; + typedef ceph::shared_ptr StripObjectHeaderRef; + + static int file_to_extents(uint64_t offset, size_t len, uint64_t strip_size, + vector &extents); + int lookup_strip_header(const coll_t & cid, const ghobject_t &oid, + StripObjectHeaderRef *header); + int save_strip_header(StripObjectHeaderRef header, KeyValueDB::Transaction t); + int create_strip_header(const coll_t &cid, const ghobject_t &oid, + StripObjectHeaderRef *strip_header, + KeyValueDB::Transaction t); + void clone_wrap(StripObjectHeaderRef old_header, + const coll_t &cid, const ghobject_t &oid, + KeyValueDB::Transaction t, + StripObjectHeaderRef *target_header); + void rename_wrap(StripObjectHeaderRef old_header, const coll_t &cid, const ghobject_t &oid, + KeyValueDB::Transaction t, + StripObjectHeaderRef *new_header); + // Already hold header to avoid lock header seq again + int get_with_header( + const StripObjectHeaderRef header, + const string &prefix, + map *out + ); + + int get_values_with_header( + const StripObjectHeaderRef header, + const string &prefix, + const set &keys, + map *out + ); + int get_keys_with_header( + const StripObjectHeaderRef header, + const string &prefix, + set *keys + ); + + Mutex lock; + void invalidate_cache(const coll_t &c, const ghobject_t &oid) { + Mutex::Locker l(lock); + caches.clear(oid); + } + + RandomCache > caches; + StripObjectMap(KeyValueDB *db): GenericObjectMap(db), + lock("StripObjectMap::lock"), + caches(g_conf->keyvaluestore_header_cache_size) + {} +}; + + +class KVSuperblock { +public: + CompatSet compat_features; + string backend; + + KVSuperblock() { } + + void encode(bufferlist &bl) const; + void decode(bufferlist::iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list& o); +}; +WRITE_CLASS_ENCODER(KVSuperblock) + + +inline ostream& operator<<(ostream& out, const KVSuperblock& sb) +{ + return out << "sb(" << sb.compat_features << " " << sb.backend << ")"; +} + + +class KeyValueStore : public ObjectStore, + public md_config_obs_t { + public: + struct KVPerfTracker { + PerfCounters::avg_tracker os_commit_latency; + PerfCounters::avg_tracker os_apply_latency; + + objectstore_perf_stat_t get_cur_stats() const { + objectstore_perf_stat_t ret; + ret.filestore_commit_latency = os_commit_latency.avg(); + ret.filestore_apply_latency = os_apply_latency.avg(); + return ret; + } + + void update_from_perfcounters(PerfCounters &logger) { + os_commit_latency.consume_next( + logger.get_tavg_ms( + l_os_commit_lat)); + os_apply_latency.consume_next( + logger.get_tavg_ms( + l_os_apply_lat)); + } + + } perf_tracker; + + objectstore_perf_stat_t get_cur_stats() { + perf_tracker.update_from_perfcounters(*perf_logger); + return perf_tracker.get_cur_stats(); + } + + static const uint32_t target_version = 1; + + private: + string internal_name; // internal name, used to name the perfcounter instance + string basedir; + std::string current_fn; + uuid_d fsid; + + int fsid_fd, current_fd; + + deque snaps; + + // ObjectMap + boost::scoped_ptr backend; + + Finisher ondisk_finisher; + + RWLock collections_lock; + set collections; + + Mutex lock; + + int _create_current(); + + /// read a uuid from fd + int read_fsid(int fd, uuid_d *uuid); + + /// lock fsid_fd + int lock_fsid(); + + string strip_object_key(uint64_t no) { + char n[100]; + snprintf(n, 100, "%08lld", (long long)no); + return string(n); + } + + // Each transaction has side effect which may influent the following + // operations, we need to make it visible for the following within + // transaction by caching middle result. + // Side effects contains: + // 1. Creating/Deleting collection + // 2. Creating/Deleting object + // 3. Object modify(including omap, xattr) + // 4. Clone or rename + struct BufferTransaction { + typedef pair uniq_id; + + struct CollGhobjectPairBitwiseComparator { + bool operator()(const uniq_id& l, + const uniq_id& r) const { + if (l.first < r.first) + return true; + if (l.first != r.first) + return false; + if (cmp_bitwise(l.second, r.second) < 0) + return true; + return false; + } + }; + + typedef map StripHeaderMap; + + //Dirty records + StripHeaderMap strip_headers; + map< uniq_id, map, bufferlist>, + CollGhobjectPairBitwiseComparator> buffers; // pair(prefix, key),to buffer updated data in one transaction + + list finishes; + + KeyValueStore *store; + + KeyValueDB::Transaction t; + + void set_collections(const set& collections) { + bufferlist collections_bl; + ::encode(collections, collections_bl); + t->set("meta", "collections", collections_bl); + } + + int lookup_cached_header(const coll_t &cid, const ghobject_t &oid, + StripObjectMap::StripObjectHeaderRef *strip_header, + bool create_if_missing); + int get_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header, + const string &prefix, const set &keys, + map *out); + void set_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header, + const string &prefix, map &bl); + int remove_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header, + const string &prefix, const set &keys); + void clear_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header, + const string &prefix); + int clear_buffer(StripObjectMap::StripObjectHeaderRef strip_header); + void clone_buffer(StripObjectMap::StripObjectHeaderRef old_header, + const coll_t &cid, const ghobject_t &oid); + void rename_buffer(StripObjectMap::StripObjectHeaderRef old_header, + const coll_t &cid, const ghobject_t &oid); + int submit_transaction(); + + BufferTransaction(KeyValueStore *store): store(store) { + t = store->backend->get_transaction(); + } + + struct InvalidateCacheContext : public Context { + KeyValueStore *store; + const coll_t cid; + const ghobject_t oid; + InvalidateCacheContext(KeyValueStore *s, const coll_t &c, const ghobject_t &oid): store(s), cid(c), oid(oid) {} + void finish(int r) { + if (r == 0) + store->backend->invalidate_cache(cid, oid); + } + }; + }; + + // -- op workqueue -- + struct Op { + utime_t start; + uint64_t op; + list tls; + Context *ondisk, *onreadable, *onreadable_sync; + uint64_t ops, bytes; + TrackedOpRef osd_op; + }; + class OpSequencer : public Sequencer_impl { + Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock) + list q; + Cond cond; + list > flush_commit_waiters; + uint64_t op; // used by flush() to know the sequence of op + public: + Sequencer *parent; + Mutex apply_lock; // for apply mutual exclusion + + /// get_max_uncompleted + bool _get_max_uncompleted( + uint64_t *seq ///< [out] max uncompleted seq + ) { + assert(qlock.is_locked()); + assert(seq); + *seq = 0; + if (q.empty()) { + return true; + } else { + *seq = q.back()->op; + return false; + } + } /// @returns true if the queue is empty + + /// get_min_uncompleted + bool _get_min_uncompleted( + uint64_t *seq ///< [out] min uncompleted seq + ) { + assert(qlock.is_locked()); + assert(seq); + *seq = 0; + if (q.empty()) { + return true; + } else { + *seq = q.front()->op; + return false; + } + } /// @returns true if both queues are empty + + void _wake_flush_waiters(list *to_queue) { + uint64_t seq; + if (_get_min_uncompleted(&seq)) + seq = -1; + + for (list >::iterator i = + flush_commit_waiters.begin(); + i != flush_commit_waiters.end() && i->first < seq; + flush_commit_waiters.erase(i++)) { + to_queue->push_back(i->second); + } + } + + void queue(Op *o) { + Mutex::Locker l(qlock); + q.push_back(o); + op++; + o->op = op; + } + Op *peek_queue() { + assert(apply_lock.is_locked()); + return q.front(); + } + + Op *dequeue(list *to_queue) { + assert(to_queue); + assert(apply_lock.is_locked()); + Mutex::Locker l(qlock); + Op *o = q.front(); + q.pop_front(); + cond.Signal(); + + _wake_flush_waiters(to_queue); + return o; + } + + void flush() { + Mutex::Locker l(qlock); + + // get max for journal _or_ op queues + uint64_t seq = 0; + if (!q.empty()) + seq = q.back()->op; + + if (seq) { + // everything prior to our watermark to drain through either/both + // queues + while (!q.empty() && q.front()->op <= seq) + cond.Wait(qlock); + } + } + bool flush_commit(Context *c) { + Mutex::Locker l(qlock); + uint64_t seq = 0; + if (_get_max_uncompleted(&seq)) { + return true; + } else { + flush_commit_waiters.push_back(make_pair(seq, c)); + return false; + } + } + + OpSequencer() + : qlock("KeyValueStore::OpSequencer::qlock", false, false), + op(0), parent(0), + apply_lock("KeyValueStore::OpSequencer::apply_lock", false, false) {} + ~OpSequencer() { + assert(q.empty()); + } + + const string& get_name() const { + return parent->get_name(); + } + }; + + friend ostream& operator<<(ostream& out, const OpSequencer& s); + + deque op_queue; + Throttle throttle_ops, throttle_bytes; + Finisher op_finisher; + + ThreadPool op_tp; + struct OpWQ : public ThreadPool::WorkQueue { + KeyValueStore *store; + OpWQ(KeyValueStore *fs, time_t timeout, time_t suicide_timeout, + ThreadPool *tp) : + ThreadPool::WorkQueue("KeyValueStore::OpWQ", + timeout, suicide_timeout, tp), + store(fs) {} + + bool _enqueue(OpSequencer *osr) { + store->op_queue.push_back(osr); + return true; + } + void _dequeue(OpSequencer *o) { + assert(0); + } + bool _empty() { + return store->op_queue.empty(); + } + OpSequencer *_dequeue() { + if (store->op_queue.empty()) + return NULL; + OpSequencer *osr = store->op_queue.front(); + store->op_queue.pop_front(); + return osr; + } + using ThreadPool::WorkQueue::_process; + void _process(OpSequencer *osr, ThreadPool::TPHandle &handle) { + store->_do_op(osr, handle); + } + void _process_finish(OpSequencer *osr) { + store->_finish_op(osr); + } + void _clear() { + assert(store->op_queue.empty()); + } + } op_wq; + + Op *build_op(list& tls, Context *ondisk, Context *onreadable, + Context *onreadable_sync, TrackedOpRef osd_op); + void queue_op(OpSequencer *osr, Op *o); + void op_queue_reserve_throttle(Op *o, ThreadPool::TPHandle *handle = NULL); + void _do_op(OpSequencer *osr, ThreadPool::TPHandle &handle); + void op_queue_release_throttle(Op *o); + void _finish_op(OpSequencer *osr); + + PerfCounters *perf_logger; + + public: + + KeyValueStore(const std::string &base, + const char *internal_name = "keyvaluestore", + bool update_to=false); + ~KeyValueStore(); + + bool test_mount_in_use(); + int version_stamp_is_valid(uint32_t *version); + int update_version_stamp(); + uint32_t get_target_version() { + return target_version; + } + + int write_version_stamp(); + int mount(); + int umount(); + unsigned get_max_object_name_length() { + return 4096; // no real limit for leveldb + } + unsigned get_max_attr_name_length() { + return 256; // arbitrary; there is no real limit internally + } + int mkfs(); + int mkjournal() {return 0;} + bool wants_journal() { + return false; + } + bool allows_journal() { + return false; + } + bool needs_journal() { + return false; + } + + void collect_metadata(map *pm); + + int statfs(struct statfs *buf); + + int _do_transactions( + list &tls, uint64_t op_seq, + ThreadPool::TPHandle *handle); + int do_transactions(list &tls, uint64_t op_seq) { + return _do_transactions(tls, op_seq, 0); + } + unsigned _do_transaction(Transaction& transaction, + BufferTransaction &bt, + ThreadPool::TPHandle *handle); + + int queue_transactions(Sequencer *osr, list& tls, + TrackedOpRef op = TrackedOpRef(), + ThreadPool::TPHandle *handle = NULL); + + + // ------------------ + // objects + + int _generic_read(StripObjectMap::StripObjectHeaderRef header, + uint64_t offset, size_t len, bufferlist& bl, + bool allow_eio = false, BufferTransaction *bt = 0); + int _generic_write(StripObjectMap::StripObjectHeaderRef header, + uint64_t offset, size_t len, const bufferlist& bl, + BufferTransaction &t, uint32_t fadvise_flags = 0); + + bool exists(coll_t cid, const ghobject_t& oid); + int stat(coll_t cid, const ghobject_t& oid, struct stat *st, + bool allow_eio = false); + int read(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, + bufferlist& bl, uint32_t op_flags = 0, bool allow_eio = false); + int fiemap(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, + bufferlist& bl); + + int _touch(coll_t cid, const ghobject_t& oid, BufferTransaction &t); + int _write(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, + const bufferlist& bl, BufferTransaction &t, uint32_t fadvise_flags = 0); + int _zero(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, + BufferTransaction &t); + int _truncate(coll_t cid, const ghobject_t& oid, uint64_t size, + BufferTransaction &t); + int _clone(coll_t cid, const ghobject_t& oldoid, const ghobject_t& newoid, + BufferTransaction &t); + int _clone_range(coll_t cid, const ghobject_t& oldoid, + const ghobject_t& newoid, uint64_t srcoff, + uint64_t len, uint64_t dstoff, BufferTransaction &t); + int _remove(coll_t cid, const ghobject_t& oid, BufferTransaction &t); + int _set_alloc_hint(coll_t cid, const ghobject_t& oid, + uint64_t expected_object_size, + uint64_t expected_write_size, + BufferTransaction &t); + + void start_sync() {} + + void set_fsid(uuid_d u) { fsid = u; } + uuid_d get_fsid() { return fsid; } + + // attrs + int getattr(coll_t cid, const ghobject_t& oid, const char *name, + bufferptr &bp); + int getattrs(coll_t cid, const ghobject_t& oid, map& aset); + + int _setattrs(coll_t cid, const ghobject_t& oid, + map& aset, BufferTransaction &t); + int _rmattr(coll_t cid, const ghobject_t& oid, const char *name, + BufferTransaction &t); + int _rmattrs(coll_t cid, const ghobject_t& oid, BufferTransaction &t); + + // collections + int _collection_hint_expected_num_objs(coll_t cid, uint32_t pg_num, + uint64_t num_objs) const { return 0; } + int _create_collection(coll_t c, BufferTransaction &t); + int _destroy_collection(coll_t c, BufferTransaction &t); + int _collection_add(coll_t c, coll_t ocid, const ghobject_t& oid, + BufferTransaction &t); + int _collection_move_rename(coll_t oldcid, const ghobject_t& oldoid, + coll_t c, const ghobject_t& o, + BufferTransaction &t); + int _collection_remove_recursive(const coll_t &cid, + BufferTransaction &t); + int list_collections(vector& ls); + bool collection_exists(coll_t c); + bool collection_empty(coll_t c); + int collection_list(coll_t c, ghobject_t start, ghobject_t end, + bool sort_bitwise, int max, + vector *ls, ghobject_t *next); + int collection_version_current(coll_t c, uint32_t *version); + + // omap (see ObjectStore.h for documentation) + int omap_get(coll_t c, const ghobject_t &oid, bufferlist *header, + map *out); + int omap_get_header( + coll_t c, + const ghobject_t &oid, + bufferlist *out, + bool allow_eio = false); + int omap_get_keys(coll_t c, const ghobject_t &oid, set *keys); + int omap_get_values(coll_t c, const ghobject_t &oid, const set &keys, + map *out); + int omap_check_keys(coll_t c, const ghobject_t &oid, const set &keys, + set *out); + ObjectMap::ObjectMapIterator get_omap_iterator(coll_t c, + const ghobject_t &oid); + + int check_get_rc(const coll_t cid, const ghobject_t& oid, int r, bool is_equal_size); + void dump_start(const std::string &file); + void dump_stop(); + void dump_transactions(list& ls, uint64_t seq, + OpSequencer *osr); + + private: + void _inject_failure() {} + + // omap + int _omap_clear(coll_t cid, const ghobject_t &oid, + BufferTransaction &t); + int _omap_setkeys(coll_t cid, const ghobject_t &oid, + map &aset, + BufferTransaction &t); + int _omap_rmkeys(coll_t cid, const ghobject_t &oid, const set &keys, + BufferTransaction &t); + int _omap_rmkeyrange(coll_t cid, const ghobject_t &oid, + const string& first, const string& last, + BufferTransaction &t); + int _omap_setheader(coll_t cid, const ghobject_t &oid, const bufferlist &bl, + BufferTransaction &t); + int _split_collection(coll_t cid, uint32_t bits, uint32_t rem, coll_t dest, + BufferTransaction &t); + int _split_collection_create(coll_t cid, uint32_t bits, uint32_t rem, + coll_t dest, BufferTransaction &t){ + return 0; + } + + virtual const char** get_tracked_conf_keys() const; + virtual void handle_conf_change(const struct md_config_t *conf, + const std::set &changed); + + std::string m_osd_rollback_to_cluster_snap; + int m_keyvaluestore_queue_max_ops; + int m_keyvaluestore_queue_max_bytes; + int m_keyvaluestore_strip_size; + uint64_t m_keyvaluestore_max_expected_write_size; + int do_update; + bool m_keyvaluestore_do_dump; + std::ofstream m_keyvaluestore_dump; + JSONFormatter m_keyvaluestore_dump_fmt; + + static const string OBJECT_STRIP_PREFIX; + static const string OBJECT_XATTR; + static const string OBJECT_OMAP; + static const string OBJECT_OMAP_HEADER; + static const string OBJECT_OMAP_HEADER_KEY; + static const string COLLECTION; + static const string COLLECTION_ATTR; + static const uint32_t COLLECTION_VERSION = 1; + + KVSuperblock superblock; + /** + * write_superblock() + * + * Write superblock to persisent storage + * + * return value: 0 on success, otherwise negative errno + */ + int write_superblock(); + + /** + * read_superblock() + * + * Fill in KeyValueStore::superblock by reading persistent storage + * + * return value: 0 on success, otherwise negative errno + */ + int read_superblock(); +}; + +WRITE_CLASS_ENCODER(StripObjectMap::StripObjectHeader) + +#endif diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc index dbc2e2cdf676..598b444e6fde 100644 --- a/src/test/objectstore/store_test.cc +++ b/src/test/objectstore/store_test.cc @@ -19,7 +19,6 @@ #include #include "os/ObjectStore.h" #include "os/filestore/FileStore.h" -#include "os/KeyValueStore.h" #include "include/Context.h" #include "common/ceph_argparse.h" #include "global/global_init.h"