From: Yehuda Sadeh Date: Tue, 19 Jan 2016 19:01:24 +0000 (-0800) Subject: Merge remote-tracking branch 'origin/master' into wip-rgw-static-website-yehuda-2 X-Git-Tag: v10.0.4~152^2^2~10 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=5f61d36d7189052bcaf22ace1c604bb45f3721ae;p=ceph.git Merge remote-tracking branch 'origin/master' into wip-rgw-static-website-yehuda-2 Signed-off-by: Yehuda Sadeh Conflicts: src/global/Makefile.am src/rgw/rgw_common.cc src/rgw/rgw_common.h src/rgw/rgw_json_enc.cc src/rgw/rgw_op.h src/rgw/rgw_rest.cc src/rgw/rgw_rest_s3.cc src/tracing/Makefile.am --- 5f61d36d7189052bcaf22ace1c604bb45f3721ae diff --cc src/CMakeLists.txt index 0bb5c8868c68,d2c4cfda6242..4500bf8c3287 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@@ -268,8 -292,8 +292,9 @@@ set(libcommon_file common/simple_spin.cc common/Thread.cc common/Formatter.cc + common/HTMLFormatter.cc common/HeartbeatMap.cc + common/PluginRegistry.cc common/ceph_fs.cc common/ceph_hash.cc common/ceph_strings.cc diff --cc src/common/Formatter.cc index 345f95df66dc,9e7fd384d874..4a165243c236 --- a/src/common/Formatter.cc +++ b/src/common/Formatter.cc @@@ -18,8 -18,10 +18,11 @@@ #include "assert.h" #include "Formatter.h" ++#include "HTMLFormatter.h" #include "common/escape.h" + #include "include/buffer.h" + #include #include #include #include @@@ -321,8 -331,8 +336,11 @@@ XMLFormatter::XMLFormatter(bool pretty void XMLFormatter::flush(std::ostream& os) { finish_pending_string(); -- os << m_ss.str(); -- if (m_pretty) ++ std::string m_ss_str = m_ss.str(); ++ os << m_ss_str; ++ /* There is a small catch here. If the rest of the formatter had NO output, ++ * we should NOT output a newline. This primarily triggers on HTTP redirects */ ++ if (m_pretty && !m_ss_str.empty()) os << "\n"; m_ss.clear(); m_ss.str(""); @@@ -336,6 -346,6 +354,24 @@@ void XMLFormatter::reset( m_pending_string.str(""); m_sections.clear(); m_pending_string_name.clear(); ++ m_header_done = false; ++} ++ ++void XMLFormatter::output_header() ++{ ++ if(!m_header_done) { ++ m_header_done = true; ++ write_raw_data(XMLFormatter::XML_1_DTD);; ++ if (m_pretty) ++ m_ss << "\n"; ++ } ++} ++ ++void XMLFormatter::output_footer() ++{ ++ while(!m_sections.empty()) { ++ close_section(); ++ } } void XMLFormatter::open_object_section(const char *name) diff --cc src/common/Formatter.h index b251aa6f4684,3784bdb4cfcd..f67aebd04dbe --- a/src/common/Formatter.h +++ b/src/common/Formatter.h @@@ -41,18 -41,9 +41,13 @@@ namespace ceph virtual ~Formatter(); virtual void flush(std::ostream& os) = 0; - void flush(bufferlist &bl) - { - std::stringstream os; - flush(os); - bl.append(os.str()); - } + void flush(bufferlist &bl); virtual void reset() = 0; - virtual void set_status(const char* status, const char* status_name) = 0; ++ virtual void set_status(int status, const char* status_name) = 0; + virtual void output_header() = 0; + virtual void output_footer() = 0; + virtual void open_array_section(const char *name) = 0; virtual void open_array_section_in_ns(const char *name, const char *ns) = 0; virtual void open_object_section(const char *name) = 0; @@@ -98,6 -89,6 +93,9 @@@ public: JSONFormatter(bool p = false); ++ virtual void set_status(int status, const char* status_name) {}; ++ virtual void output_header() {}; ++ virtual void output_footer() {}; void flush(std::ostream& os); void reset(); virtual void open_array_section(const char *name); @@@ -137,8 -128,8 +135,12 @@@ class XMLFormatter : public Formatter { public: static const char *XML_1_DTD; - XMLFormatter(bool pretty = false); + XMLFormatter(bool pretty = false, bool lowercased_underscored = false); + ++ virtual void set_status(int status, const char* status_name) {} ++ virtual void output_header(); ++ virtual void output_footer(); + void flush(std::ostream& os); void reset(); void open_array_section(const char *name); @@@ -159,7 -150,7 +161,7 @@@ void open_array_section_with_attrs(const char *name, const FormatterAttrs& attrs); void open_object_section_with_attrs(const char *name, const FormatterAttrs& attrs); void dump_string_with_attrs(const char *name, const std::string& s, const FormatterAttrs& attrs); -- private: ++ protected: void open_section_in_ns(const char *name, const char *ns, const FormatterAttrs *attrs); void finish_pending_string(); void print_spaces(); @@@ -169,13 -160,14 +171,18 @@@ std::stringstream m_ss, m_pending_string; std::deque m_sections; bool m_pretty; + bool m_lowercased_underscored; std::string m_pending_string_name; ++ bool m_header_done; }; class TableFormatter : public Formatter { public: TableFormatter(bool keyval = false); ++ virtual void set_status(int status, const char* status_name) {}; ++ virtual void output_header() {}; ++ virtual void output_footer() {}; void flush(std::ostream& os); void reset(); virtual void open_array_section(const char *name); diff --cc src/common/HTMLFormatter.cc index 35e36a234079,000000000000..b10c296d07cb mode 100644,000000..100644 --- a/src/common/HTMLFormatter.cc +++ b/src/common/HTMLFormatter.cc @@@ -1,166 -1,0 +1,158 @@@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2011 New Dream Network + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#define LARGE_SIZE 1024 + +#include "include/int_types.h" + +#include "assert.h" +#include "Formatter.h" +#include "HTMLFormatter.h" - #include "XMLFormatter.h" +#include "common/escape.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// ----------------------- +namespace ceph { + +HTMLFormatter::HTMLFormatter(bool pretty) - : XMLFormatter(pretty), m_header_done(false), m_status(NULL), m_status_name(NULL) ++: XMLFormatter(pretty), m_header_done(false), m_status(0), m_status_name(NULL) +{ +} + +HTMLFormatter::~HTMLFormatter() +{ - if (m_status) { - free((void*)m_status); - m_status = NULL; - } + if (m_status_name) { + free((void*)m_status_name); + m_status_name = NULL; + } +} + +void HTMLFormatter::reset() +{ + XMLFormatter::reset(); + m_header_done = false; - if (m_status) { - free((void*)m_status); - m_status = NULL; - } ++ m_status = 0; + if (m_status_name) { + free((void*)m_status_name); + m_status_name = NULL; + } +} + - void HTMLFormatter::set_status(const char* status, const char* status_name) ++void HTMLFormatter::set_status(int status, const char* status_name) +{ - assert(status != NULL); // new status must not be NULL - assert(m_status == NULL); // status should NOT be set multiple times - m_status = strdup(status); - if (status_name) ++ m_status = status; ++ if (status_name) { + m_status_name = strdup(status_name); ++ } +}; + +void HTMLFormatter::output_header() { + if (!m_header_done) { + m_header_done = true; - assert(m_status != NULL); // it should be set by this point - std::string status_line(m_status); ++ char buf[16]; ++ snprintf(buf, sizeof(buf), "%d", m_status); ++ std::string status_line(buf); + if (m_status_name) { + status_line += " "; + status_line += m_status_name; + } + open_object_section("html"); + print_spaces(); + m_ss << "" << status_line << ""; + if (m_pretty) + m_ss << "\n"; + open_object_section("body"); + print_spaces(); + m_ss << "

" << status_line << "

"; + if (m_pretty) + m_ss << "\n"; + open_object_section("ul"); + } +} + +template +void HTMLFormatter::dump_template(const char *name, T arg) +{ + print_spaces(); + m_ss << "
  • " << name << ": " << arg << "
  • "; + if (m_pretty) + m_ss << "\n"; +} + +void HTMLFormatter::dump_unsigned(const char *name, uint64_t u) +{ + dump_template(name, u); +} + +void HTMLFormatter::dump_int(const char *name, int64_t u) +{ + dump_template(name, u); +} + +void HTMLFormatter::dump_float(const char *name, double d) +{ + dump_template(name, d); +} + +void HTMLFormatter::dump_string(const char *name, const std::string& s) +{ + dump_template(name, escape_xml_str(s.c_str())); +} + +void HTMLFormatter::dump_string_with_attrs(const char *name, const std::string& s, const FormatterAttrs& attrs) +{ + std::string e(name); + std::string attrs_str; + get_attrs_str(&attrs, attrs_str); + print_spaces(); + m_ss << "
  • " << e << ": " << escape_xml_str(s.c_str()) << attrs_str << "
  • "; + if (m_pretty) + m_ss << "\n"; +} + +std::ostream& HTMLFormatter::dump_stream(const char *name) +{ + print_spaces(); + m_pending_string_name = "li"; + m_ss << "
  • " << name << ": "; + return m_pending_string; +} + +void HTMLFormatter::dump_format_va(const char* name, const char *ns, bool quoted, const char *fmt, va_list ap) +{ + char buf[LARGE_SIZE]; + vsnprintf(buf, LARGE_SIZE, fmt, ap); + + std::string e(name); + print_spaces(); + if (ns) { + m_ss << "
  • " << e << ": " << escape_xml_str(buf) << "
  • "; + } else { + m_ss << "
  • " << e << ": " << escape_xml_str(buf) << "
  • "; + } + + if (m_pretty) + m_ss << "\n"; +} + +} // namespace ceph diff --cc src/common/HTMLFormatter.h index a8c6b20a78b2,000000000000..50d0777973f6 mode 100644,000000..100644 --- a/src/common/HTMLFormatter.h +++ b/src/common/HTMLFormatter.h @@@ -1,50 -1,0 +1,50 @@@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_HTML_FORMATTER_H +#define CEPH_HTML_FORMATTER_H + +#include "include/int_types.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "include/buffer.h" - #include "XMLFormatter.h" ++#include "Formatter.h" + +namespace ceph { + class HTMLFormatter : public XMLFormatter { + public: + HTMLFormatter(bool pretty = false); + ~HTMLFormatter(); + void reset(); + - virtual void set_status(const char* status, const char* status_name); ++ virtual void set_status(int status, const char* status_name); + virtual void output_header(); + + void dump_unsigned(const char *name, uint64_t u); + void dump_int(const char *name, int64_t u); + void dump_float(const char *name, double d); + void dump_string(const char *name, const std::string& s); + std::ostream& dump_stream(const char *name); + void dump_format_va(const char *name, const char *ns, bool quoted, const char *fmt, va_list ap); + + /* with attrs */ + void dump_string_with_attrs(const char *name, const std::string& s, const FormatterAttrs& attrs); + private: + template void dump_template(const char *name, T arg); + + bool m_header_done; + - const char* m_status; ++ int m_status; + const char* m_status_name; + }; + +} + +#endif diff --cc src/global/Makefile.am index 00a7e373cad2,4738b37b0238..51fff4b4136a --- a/src/global/Makefile.am +++ b/src/global/Makefile.am @@@ -2,11 -2,9 +2,12 @@@ libglobal_la_SOURCES = global/global_context.cc \ global/global_init.cc \ global/pidfile.cc \ - global/signal_handler.cc + global/signal_handler.cc \ + common/TrackedOp.cc libglobal_la_LIBADD = $(LIBCOMMON) +if WITH_LTTNG +libglobal_la_LIBADD += -ldl -llttng-ust +endif noinst_LTLIBRARIES += libglobal.la noinst_HEADERS += \ diff --cc src/os/kstore/KStore.cc index 000000000000,9490751f72d9..58024493eb49 mode 000000,100644..100644 --- a/src/os/kstore/KStore.cc +++ b/src/os/kstore/KStore.cc @@@ -1,0 -1,3741 +1,3742 @@@ + // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- + // vim: ts=8 sw=2 smarttab + /* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + #include + #include + #include + #include + #include + #include + + #include "KStore.h" + #include "kv.h" + #include "include/compat.h" + #include "include/stringify.h" + #include "common/errno.h" + #include "common/safe_io.h" ++#include "common/Formatter.h" + + + #define dout_subsys ceph_subsys_kstore + + /* + + TODO: + + * superblock, features + * refcounted extents (for efficient clone) + + */ + + const string PREFIX_SUPER = "S"; // field -> value + const string PREFIX_COLL = "C"; // collection name -> (nothing) + const string PREFIX_OBJ = "O"; // object name -> onode + const string PREFIX_DATA = "D"; // nid + offset -> data + const string PREFIX_OMAP = "M"; // u64 + keyname -> value + + /* + * object name key structure + * + * 2 chars: shard (-- for none, or hex digit, so that we sort properly) + * encoded u64: poolid + 2^63 (so that it sorts properly) + * encoded u32: hash (bit reversed) + * + * 1 char: '.' + * + * escaped string: namespace + * + * 1 char: '<', '=', or '>'. if =, then object key == object name, and + * we are followed just by the key. otherwise, we are followed by + * the key and then the object name. + * escaped string: key + * escaped string: object name (unless '=' above) + * + * encoded u64: snap + * encoded u64: generation + */ + + /* + * string encoding in the key + * + * The key string needs to lexicographically sort the same way that + * ghobject_t does. We do this by escaping anything <= to '%' with % + * plus a 2 digit hex string, and anything >= '~' with ~ plus the two + * hex digits. + * + * We use ! as a terminator for strings; this works because it is < % + * and will get escaped if it is present in the string. + * + */ + + static void append_escaped(const string &in, string *out) + { + char hexbyte[8]; + for (string::const_iterator i = in.begin(); i != in.end(); ++i) { + if (*i <= '#') { + snprintf(hexbyte, sizeof(hexbyte), "#%02x", (unsigned)*i); + out->append(hexbyte); + } else if (*i >= '~') { + snprintf(hexbyte, sizeof(hexbyte), "~%02x", (unsigned)*i); + out->append(hexbyte); + } else { + out->push_back(*i); + } + } + out->push_back('!'); + } + + static int decode_escaped(const char *p, string *out) + { + const char *orig_p = p; + while (*p && *p != '!') { + if (*p == '#' || *p == '~') { + unsigned hex; + int r = sscanf(++p, "%2x", &hex); + if (r < 1) + return -EINVAL; + out->push_back((char)hex); + p += 2; + } else { + out->push_back(*p++); + } + } + return p - orig_p; + } + + // some things we encode in binary (as le32 or le64); print the + // resulting key strings nicely + static string pretty_binary_string(const string& in) + { + char buf[10]; + string out; + out.reserve(in.length() * 3); + enum { NONE, HEX, STRING } mode = NONE; + unsigned from = 0, i; + for (i=0; i < in.length(); ++i) { + if ((in[i] < 32 || (unsigned char)in[i] > 126) || + (mode == HEX && in.length() - i >= 4 && + ((in[i] < 32 || (unsigned char)in[i] > 126) || + (in[i+1] < 32 || (unsigned char)in[i+1] > 126) || + (in[i+2] < 32 || (unsigned char)in[i+2] > 126) || + (in[i+3] < 32 || (unsigned char)in[i+3] > 126)))) { + if (mode == STRING) { + out.append(in.substr(from, i - from)); + out.push_back('\''); + } + if (mode != HEX) { + out.append("0x"); + mode = HEX; + } + if (in.length() - i >= 4) { + // print a whole u32 at once + snprintf(buf, sizeof(buf), "%08x", + (uint32_t)(((unsigned char)in[i] << 24) | + ((unsigned char)in[i+1] << 16) | + ((unsigned char)in[i+2] << 8) | + ((unsigned char)in[i+3] << 0))); + i += 3; + } else { + snprintf(buf, sizeof(buf), "%02x", (int)(unsigned char)in[i]); + } + out.append(buf); + } else { + if (mode != STRING) { + out.push_back('\''); + mode = STRING; + from = i; + } + } + } + if (mode == STRING) { + out.append(in.substr(from, i - from)); + out.push_back('\''); + } + return out; + } + + static void _key_encode_shard(shard_id_t shard, string *key) + { + // make field ordering match with ghobject_t compare operations + if (shard == shard_id_t::NO_SHARD) { + // otherwise ff will sort *after* 0, not before. + key->append("--"); + } else { + char buf[32]; + snprintf(buf, sizeof(buf), "%02x", (int)shard); + key->append(buf); + } + } + static const char *_key_decode_shard(const char *key, shard_id_t *pshard) + { + if (key[0] == '-') { + *pshard = shard_id_t::NO_SHARD; + } else { + unsigned shard; + int r = sscanf(key, "%x", &shard); + if (r < 1) + return NULL; + *pshard = shard_id_t(shard); + } + return key + 2; + } + + static void get_coll_key_range(const coll_t& cid, int bits, + string *temp_start, string *temp_end, + string *start, string *end) + { + temp_start->clear(); + temp_end->clear(); + start->clear(); + end->clear(); + + spg_t pgid; + if (cid.is_pg(&pgid)) { + _key_encode_shard(pgid.shard, start); + *end = *start; + *temp_start = *start; + *temp_end = *start; + + _key_encode_u64(pgid.pool() + 0x8000000000000000ull, start); + _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_start); + _key_encode_u32(hobject_t::_reverse_bits(pgid.ps()), start); + _key_encode_u32(hobject_t::_reverse_bits(pgid.ps()), temp_start); + start->append("."); + temp_start->append("."); + + _key_encode_u64(pgid.pool() + 0x8000000000000000ull, end); + _key_encode_u64((-2ll - pgid.pool()) + 0x8000000000000000ull, temp_end); + + uint64_t end_hash = + hobject_t::_reverse_bits(pgid.ps()) + (1ull << (32-bits)); + if (end_hash <= 0xffffffffull) { + _key_encode_u32(end_hash, end); + _key_encode_u32(end_hash, temp_end); + end->append("."); + temp_end->append("."); + } else { + _key_encode_u32(0xffffffff, end); + _key_encode_u32(0xffffffff, temp_end); + end->append(":"); + temp_end->append(":"); + } + } else { + _key_encode_shard(shard_id_t::NO_SHARD, start); + _key_encode_u64(-1ull + 0x8000000000000000ull, start); + *end = *start; + _key_encode_u32(0, start); + start->append("."); + _key_encode_u32(0xffffffff, end); + end->append(":"); + + // no separate temp section + *temp_start = *end; + *temp_end = *end; + } + } + + static int get_key_object(const string& key, ghobject_t *oid); + + static void get_object_key(const ghobject_t& oid, string *key) + { + key->clear(); + + _key_encode_shard(oid.shard_id, key); + _key_encode_u64(oid.hobj.pool + 0x8000000000000000ull, key); + _key_encode_u32(oid.hobj.get_bitwise_key_u32(), key); + key->append("."); + + append_escaped(oid.hobj.nspace, key); + + if (oid.hobj.get_key().length()) { + // is a key... could be < = or >. + // (ASCII chars < = and > sort in that order, yay) + if (oid.hobj.get_key() < oid.hobj.oid.name) { + key->append("<"); + append_escaped(oid.hobj.get_key(), key); + append_escaped(oid.hobj.oid.name, key); + } else if (oid.hobj.get_key() > oid.hobj.oid.name) { + key->append(">"); + append_escaped(oid.hobj.get_key(), key); + append_escaped(oid.hobj.oid.name, key); + } else { + // same as no key + key->append("="); + append_escaped(oid.hobj.oid.name, key); + } + } else { + // no key + key->append("="); + append_escaped(oid.hobj.oid.name, key); + } + + _key_encode_u64(oid.hobj.snap, key); + _key_encode_u64(oid.generation, key); + + // sanity check + if (true) { + ghobject_t t; + int r = get_key_object(*key, &t); + if (r || t != oid) { + derr << " r " << r << dendl; + derr << "key " << pretty_binary_string(*key) << dendl; + derr << "oid " << oid << dendl; + derr << " t " << t << dendl; + assert(t == oid); + } + } + } + + static int get_key_object(const string& key, ghobject_t *oid) + { + int r; + const char *p = key.c_str(); + + p = _key_decode_shard(p, &oid->shard_id); + if (!p) + return -2; + + uint64_t pool; + p = _key_decode_u64(p, &pool); + if (!p) + return -3; + oid->hobj.pool = pool - 0x8000000000000000; + + unsigned hash; + p = _key_decode_u32(p, &hash); + if (!p) + return -4; + oid->hobj.set_bitwise_key_u32(hash); + if (*p != '.') + return -5; + ++p; + + r = decode_escaped(p, &oid->hobj.nspace); + if (r < 0) + return -6; + p += r + 1; + + if (*p == '=') { + // no key + ++p; + r = decode_escaped(p, &oid->hobj.oid.name); + if (r < 0) + return -8; + p += r + 1; + } else if (*p == '<' || *p == '>') { + // key + name + ++p; + string okey; + r = decode_escaped(p, &okey); + if (r < 0) + return -8; + p += r + 1; + r = decode_escaped(p, &oid->hobj.oid.name); + if (r < 0) + return -9; + p += r + 1; + oid->hobj.set_key(okey); + } else { + // malformed + return -7; + } + + p = _key_decode_u64(p, &oid->hobj.snap.val); + if (!p) + return -10; + p = _key_decode_u64(p, &oid->generation); + if (!p) + return -11; + return 0; + } + + + static void get_data_key(uint64_t nid, uint64_t offset, string *out) + { + _key_encode_u64(nid, out); + _key_encode_u64(offset, out); + } + + // '-' < '.' < '~' + static void get_omap_header(uint64_t id, string *out) + { + _key_encode_u64(id, out); + out->push_back('-'); + } + + // hmm, I don't think there's any need to escape the user key since we + // have a clean prefix. + static void get_omap_key(uint64_t id, const string& key, string *out) + { + _key_encode_u64(id, out); + out->push_back('.'); + out->append(key); + } + + static void rewrite_omap_key(uint64_t id, string old, string *out) + { + _key_encode_u64(id, out); + out->append(old.substr(out->length())); + } + + static void decode_omap_key(const string& key, string *user_key) + { + *user_key = key.substr(sizeof(uint64_t) + 1); + } + + static void get_omap_tail(uint64_t id, string *out) + { + _key_encode_u64(id, out); + out->push_back('~'); + } + + + + // Onode + + #undef dout_prefix + #define dout_prefix *_dout << "kstore.onode(" << this << ") " + + KStore::Onode::Onode(const ghobject_t& o, const string& k) + : nref(0), + oid(o), + key(k), + dirty(false), + exists(true), + flush_lock("KStore::Onode::flush_lock") { + } + + void KStore::Onode::flush() + { + Mutex::Locker l(flush_lock); + dout(20) << __func__ << " " << flush_txns << dendl; + while (!flush_txns.empty()) + flush_cond.Wait(flush_lock); + dout(20) << __func__ << " done" << dendl; + } + + // OnodeHashLRU + + #undef dout_prefix + #define dout_prefix *_dout << "kstore.lru(" << this << ") " + + void KStore::OnodeHashLRU::_touch(OnodeRef o) + { + lru_list_t::iterator p = lru.iterator_to(*o); + lru.erase(p); + lru.push_front(*o); + } + + void KStore::OnodeHashLRU::add(const ghobject_t& oid, OnodeRef o) + { + Mutex::Locker l(lock); + dout(30) << __func__ << " " << oid << " " << o << dendl; + assert(onode_map.count(oid) == 0); + onode_map[oid] = o; + lru.push_back(*o); + } + + KStore::OnodeRef KStore::OnodeHashLRU::lookup(const ghobject_t& oid) + { + Mutex::Locker l(lock); + dout(30) << __func__ << dendl; + ceph::unordered_map::iterator p = onode_map.find(oid); + if (p == onode_map.end()) { + dout(30) << __func__ << " " << oid << " miss" << dendl; + return OnodeRef(); + } + dout(30) << __func__ << " " << oid << " hit " << p->second << dendl; + _touch(p->second); + return p->second; + } + + void KStore::OnodeHashLRU::clear() + { + Mutex::Locker l(lock); + dout(10) << __func__ << dendl; + lru.clear(); + onode_map.clear(); + } + + void KStore::OnodeHashLRU::remove(const ghobject_t& oid) + { + Mutex::Locker l(lock); + ceph::unordered_map::iterator p = onode_map.find(oid); + if (p == onode_map.end()) { + dout(30) << __func__ << " " << oid << " miss" << dendl; + return; + } + dout(30) << __func__ << " " << oid << " hit " << p->second << dendl; + lru_list_t::iterator pi = lru.iterator_to(*p->second); + lru.erase(pi); + onode_map.erase(p); + } + + void KStore::OnodeHashLRU::rename(const ghobject_t& old_oid, + const ghobject_t& new_oid) + { + Mutex::Locker l(lock); + dout(30) << __func__ << " " << old_oid << " -> " << new_oid << dendl; + ceph::unordered_map::iterator po, pn; + po = onode_map.find(old_oid); + pn = onode_map.find(new_oid); + + assert(po != onode_map.end()); + if (pn != onode_map.end()) { + lru_list_t::iterator p = lru.iterator_to(*pn->second); + lru.erase(p); + onode_map.erase(pn); + } + OnodeRef o = po->second; + + // install a non-existent onode it its place + po->second.reset(new Onode(old_oid, o->key)); + po->second->exists = false; + lru.push_back(*po->second); + + // fix oid, key + onode_map.insert(make_pair(new_oid, o)); + _touch(o); + o->oid = new_oid; + get_object_key(new_oid, &o->key); + } + + bool KStore::OnodeHashLRU::get_next( + const ghobject_t& after, + pair *next) + { + Mutex::Locker l(lock); + dout(20) << __func__ << " after " << after << dendl; + + if (after == ghobject_t()) { + if (lru.empty()) { + return false; + } + ceph::unordered_map::iterator p = onode_map.begin(); + assert(p != onode_map.end()); + next->first = p->first; + next->second = p->second; + return true; + } + + ceph::unordered_map::iterator p = onode_map.find(after); + assert(p != onode_map.end()); // for now + lru_list_t::iterator pi = lru.iterator_to(*p->second); + ++pi; + if (pi == lru.end()) { + return false; + } + next->first = pi->oid; + next->second = onode_map[pi->oid]; + return true; + } + + int KStore::OnodeHashLRU::trim(int max) + { + Mutex::Locker l(lock); + dout(20) << __func__ << " max " << max + << " size " << onode_map.size() << dendl; + int trimmed = 0; + int num = onode_map.size() - max; + lru_list_t::iterator p = lru.end(); + if (num) + --p; + while (num > 0) { + Onode *o = &*p; + int refs = o->nref.read(); + if (refs > 1) { + dout(20) << __func__ << " " << o->oid << " has " << refs + << " refs; stopping with " << num << " left to trim" << dendl; + break; + } + dout(30) << __func__ << " trim " << o->oid << dendl; + if (p != lru.begin()) { + lru.erase(p--); + } else { + lru.erase(p); + assert(num == 1); + } + o->get(); // paranoia + onode_map.erase(o->oid); + o->put(); + --num; + ++trimmed; + } + return trimmed; + } + + // ======================================================= + + // Collection + + #undef dout_prefix + #define dout_prefix *_dout << "kstore(" << store->path << ").collection(" << cid << ") " + + KStore::Collection::Collection(KStore *ns, coll_t c) + : store(ns), + cid(c), + lock("KStore::Collection::lock"), + onode_map() + { + } + + KStore::OnodeRef KStore::Collection::get_onode( + const ghobject_t& oid, + bool create) + { + assert(create ? lock.is_wlocked() : lock.is_locked()); + + spg_t pgid; + if (cid.is_pg(&pgid)) { + if (!oid.match(cnode.bits, pgid.ps())) { + derr << __func__ << " oid " << oid << " not part of " << pgid + << " bits " << cnode.bits << dendl; + assert(0); + } + } + + OnodeRef o = onode_map.lookup(oid); + if (o) + return o; + + string key; + get_object_key(oid, &key); + + dout(20) << __func__ << " oid " << oid << " key " + << pretty_binary_string(key) << dendl; + + bufferlist v; + int r = store->db->get(PREFIX_OBJ, key, &v); + dout(20) << " r " << r << " v.len " << v.length() << dendl; + Onode *on; + if (v.length() == 0) { + assert(r == -ENOENT); + if (!create) + return OnodeRef(); + + // new + on = new Onode(oid, key); + on->dirty = true; + } else { + // loaded + assert(r >=0); + on = new Onode(oid, key); + bufferlist::iterator p = v.begin(); + ::decode(on->onode, p); + } + o.reset(on); + onode_map.add(oid, o); + return o; + } + + + + // ======================================================= + + #undef dout_prefix + #define dout_prefix *_dout << "kstore(" << path << ") " + + KStore::KStore(CephContext *cct, const string& path) + : ObjectStore(path), + cct(cct), + db(NULL), + path_fd(-1), + fsid_fd(-1), + mounted(false), + coll_lock("KStore::coll_lock"), + nid_lock("KStore::nid_lock"), + nid_max(0), + throttle_ops(cct, "kstore_max_ops", cct->_conf->kstore_max_ops), + throttle_bytes(cct, "kstore_max_bytes", cct->_conf->kstore_max_bytes), + finisher(cct), + kv_sync_thread(this), + kv_lock("KStore::kv_lock"), + kv_stop(false), + logger(NULL), + reap_lock("KStore::reap_lock") + { + _init_logger(); + } + + KStore::~KStore() + { + _shutdown_logger(); + assert(!mounted); + assert(db == NULL); + assert(fsid_fd < 0); + } + + void KStore::_init_logger() + { + // XXX + } + + void KStore::_shutdown_logger() + { + // XXX + } + + int KStore::_open_path() + { + assert(path_fd < 0); + path_fd = ::open(path.c_str(), O_DIRECTORY); + if (path_fd < 0) { + int r = -errno; + derr << __func__ << " unable to open " << path << ": " << cpp_strerror(r) + << dendl; + return r; + } + return 0; + } + + void KStore::_close_path() + { + VOID_TEMP_FAILURE_RETRY(::close(path_fd)); + path_fd = -1; + } + + int KStore::_open_fsid(bool create) + { + assert(fsid_fd < 0); + int flags = O_RDWR; + if (create) + flags |= O_CREAT; + fsid_fd = ::openat(path_fd, "fsid", flags, 0644); + if (fsid_fd < 0) { + int err = -errno; + derr << __func__ << " " << cpp_strerror(err) << dendl; + return err; + } + return 0; + } + + int KStore::_read_fsid(uuid_d *uuid) + { + char fsid_str[40]; + int ret = safe_read(fsid_fd, fsid_str, sizeof(fsid_str)); + if (ret < 0) { + derr << __func__ << " failed: " << cpp_strerror(ret) << dendl; + return ret; + } + if (ret > 36) + fsid_str[36] = 0; + else + fsid_str[ret] = 0; + if (!uuid->parse(fsid_str)) { + derr << __func__ << " unparsable uuid " << fsid_str << dendl; + return -EINVAL; + } + return 0; + } + + int KStore::_write_fsid() + { + int r = ::ftruncate(fsid_fd, 0); + if (r < 0) { + r = -errno; + derr << __func__ << " fsid truncate failed: " << cpp_strerror(r) << dendl; + return r; + } + string str = stringify(fsid) + "\n"; + r = safe_write(fsid_fd, str.c_str(), str.length()); + if (r < 0) { + derr << __func__ << " fsid write failed: " << cpp_strerror(r) << dendl; + return r; + } + r = ::fsync(fsid_fd); + if (r < 0) { + r = -errno; + derr << __func__ << " fsid fsync failed: " << cpp_strerror(r) << dendl; + return r; + } + return 0; + } + + void KStore::_close_fsid() + { + VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); + fsid_fd = -1; + } + + int KStore::_lock_fsid() + { + struct flock l; + memset(&l, 0, sizeof(l)); + l.l_type = F_WRLCK; + l.l_whence = SEEK_SET; + l.l_start = 0; + l.l_len = 0; + int r = ::fcntl(fsid_fd, F_SETLK, &l); + if (r < 0) { + int err = errno; + derr << __func__ << " failed to lock " << path << "/fsid" + << " (is another ceph-osd still running?)" + << cpp_strerror(err) << dendl; + return -err; + } + return 0; + } + + bool KStore::test_mount_in_use() + { + // most error conditions mean the mount is not in use (e.g., because + // it doesn't exist). only if we fail to lock do we conclude it is + // in use. + bool ret = false; + int r = _open_path(); + if (r < 0) + return false; + r = _open_fsid(false); + if (r < 0) + goto out_path; + r = _lock_fsid(); + if (r < 0) + ret = true; // if we can't lock, it is in use + _close_fsid(); + out_path: + _close_path(); + return ret; + } + + int KStore::_open_db(bool create) + { + int r; + assert(!db); + char fn[PATH_MAX]; + snprintf(fn, sizeof(fn), "%s/db", path.c_str()); + + string kv_backend; + if (create) { + kv_backend = g_conf->kstore_backend; + } else { + r = read_meta("kv_backend", &kv_backend); + if (r < 0) { + derr << __func__ << " uanble to read 'kv_backend' meta" << dendl; + return -EIO; + } + } + dout(10) << __func__ << " kv_backend = " << kv_backend << dendl; + + if (create) { + int r = ::mkdir(fn, 0755); + if (r < 0) + r = -errno; + if (r < 0 && r != -EEXIST) { + derr << __func__ << " failed to create " << fn << ": " << cpp_strerror(r) + << dendl; + return r; + } + + // wal_dir, too! + char walfn[PATH_MAX]; + snprintf(walfn, sizeof(walfn), "%s/db.wal", path.c_str()); + r = ::mkdir(walfn, 0755); + if (r < 0) + r = -errno; + if (r < 0 && r != -EEXIST) { + derr << __func__ << " failed to create " << walfn + << ": " << cpp_strerror(r) + << dendl; + return r; + } + } + + db = KeyValueDB::create(g_ceph_context, + kv_backend, + fn); + if (!db) { + derr << __func__ << " error creating db" << dendl; + return -EIO; + } + string options; + if (kv_backend == "rocksdb") + options = g_conf->kstore_rocksdb_options; + db->init(options); + stringstream err; + if (create) + r = db->create_and_open(err); + else + r = db->open(err); + if (r) { + derr << __func__ << " erroring opening db: " << err.str() << dendl; + delete db; + db = NULL; + return -EIO; + } + dout(1) << __func__ << " opened " << kv_backend + << " path " << fn << " options " << options << dendl; + return 0; + } + + void KStore::_close_db() + { + assert(db); + delete db; + db = NULL; + } + + int KStore::_open_collections(int *errors) + { + assert(coll_map.empty()); + KeyValueDB::Iterator it = db->get_iterator(PREFIX_COLL); + for (it->upper_bound(string()); + it->valid(); + it->next()) { + coll_t cid; + if (cid.parse(it->key())) { + CollectionRef c(new Collection(this, cid)); + bufferlist bl; + db->get(PREFIX_COLL, it->key(), &bl); + bufferlist::iterator p = bl.begin(); + ::decode(c->cnode, p); + dout(20) << __func__ << " opened " << cid << dendl; + coll_map[cid] = c; + } else { + derr << __func__ << " unrecognized collection " << it->key() << dendl; + if (errors) + (*errors)++; + } + } + return 0; + } + + int KStore::mkfs() + { + dout(1) << __func__ << " path " << path << dendl; + int r; + uuid_d old_fsid; + + r = _open_path(); + if (r < 0) + return r; + + r = _open_fsid(true); + if (r < 0) + goto out_path_fd; + + r = _lock_fsid(); + if (r < 0) + goto out_close_fsid; + + r = _read_fsid(&old_fsid); + if (r < 0 && old_fsid.is_zero()) { + if (fsid.is_zero()) { + fsid.generate_random(); + dout(1) << __func__ << " generated fsid " << fsid << dendl; + } else { + dout(1) << __func__ << " using provided fsid " << fsid << dendl; + } + // we'll write it last. + } else { + if (!fsid.is_zero() && fsid != old_fsid) { + derr << __func__ << " on-disk fsid " << old_fsid + << " != provided " << fsid << dendl; + r = -EINVAL; + goto out_close_fsid; + } + fsid = old_fsid; + dout(1) << __func__ << " already created, fsid is " << fsid << dendl; + goto out_close_fsid; + } + + r = _open_db(true); + if (r < 0) + goto out_close_fsid; + + r = write_meta("kv_backend", g_conf->kstore_backend); + if (r < 0) + goto out_close_db; + + // indicate mkfs completion/success by writing the fsid file + r = _write_fsid(); + if (r == 0) + dout(10) << __func__ << " success" << dendl; + else + derr << __func__ << " error writing fsid: " << cpp_strerror(r) << dendl; + + out_close_db: + _close_db(); + out_close_fsid: + _close_fsid(); + out_path_fd: + _close_path(); + return r; + } + + int KStore::mount() + { + dout(1) << __func__ << " path " << path << dendl; + + if (g_conf->kstore_fsck_on_mount) { + int rc = fsck(); + if (rc < 0) + return rc; + } + + int r = _open_path(); + if (r < 0) + return r; + r = _open_fsid(false); + if (r < 0) + goto out_path; + + r = _read_fsid(&fsid); + if (r < 0) + goto out_fsid; + + r = _lock_fsid(); + if (r < 0) + goto out_fsid; + + r = _open_db(false); + if (r < 0) + goto out_fsid; + + r = _open_super_meta(); + if (r < 0) + goto out_db; + + r = _open_collections(); + if (r < 0) + goto out_db; + + finisher.start(); + kv_sync_thread.create("kstore_kv_sync"); + + mounted = true; + return 0; + + _kv_stop(); + finisher.wait_for_empty(); + finisher.stop(); + out_db: + _close_db(); + out_fsid: + _close_fsid(); + out_path: + _close_path(); + return r; + } + + int KStore::umount() + { + assert(mounted); + dout(1) << __func__ << dendl; + + _sync(); + _reap_collections(); + coll_map.clear(); + + dout(20) << __func__ << " stopping kv thread" << dendl; + _kv_stop(); + dout(20) << __func__ << " draining finisher" << dendl; + finisher.wait_for_empty(); + dout(20) << __func__ << " stopping finisher" << dendl; + finisher.stop(); + dout(20) << __func__ << " closing" << dendl; + + mounted = false; + _close_db(); + _close_fsid(); + _close_path(); + return 0; + } + + int KStore::fsck() + { + dout(1) << __func__ << dendl; + int errors = 0; + #if 0 + set used_nids; + set used_omap_head; + interval_set used_blocks; + KeyValueDB::Iterator it; + + int r = _open_path(); + if (r < 0) + return r; + r = _open_fsid(false); + if (r < 0) + goto out_path; + + r = _read_fsid(&fsid); + if (r < 0) + goto out_fsid; + + r = _lock_fsid(); + if (r < 0) + goto out_fsid; + + r = _open_bdev(false); + if (r < 0) + goto out_fsid; + + r = _open_db(false); + if (r < 0) + goto out_bdev; + + r = _open_alloc(); + if (r < 0) + goto out_db; + + r = _open_super_meta(); + if (r < 0) + goto out_alloc; + + r = _open_collections(&errors); + if (r < 0) + goto out_alloc; + + if (bluefs) { + used_blocks.insert(0, BLUEFS_START); + used_blocks.insert(bluefs_extents); + r = bluefs->fsck(); + if (r < 0) + goto out_alloc; + if (r > 0) + errors += r; + } + + // walk collections, objects + for (ceph::unordered_map::iterator p = coll_map.begin(); + p != coll_map.end() && !errors; + ++p) { + dout(1) << __func__ << " collection " << p->first << dendl; + CollectionRef c = _get_collection(p->first); + RWLock::RLocker l(c->lock); + ghobject_t pos; + while (!errors) { + vector ols; + int r = collection_list(p->first, pos, ghobject_t::get_max(), true, + 100, &ols, &pos); + if (r < 0) { + ++errors; + break; + } + if (ols.empty()) { + break; + } + for (auto& oid : ols) { + dout(10) << __func__ << " " << oid << dendl; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + ++errors; + break; + } + if (o->onode.nid) { + if (used_nids.count(o->onode.nid)) { + derr << " " << oid << " nid " << o->onode.nid << " already in use" + << dendl; + ++errors; + break; + } + used_nids.insert(o->onode.nid); + } + // blocks + for (auto& b : o->onode.block_map) { + if (used_blocks.contains(b.second.offset, b.second.length)) { + derr << " " << oid << " extent " << b.first << ": " << b.second + << " already allocated" << dendl; + ++errors; + continue; + } + used_blocks.insert(b.second.offset, b.second.length); + if (b.second.end() > bdev->get_size()) { + derr << " " << oid << " extent " << b.first << ": " << b.second + << " past end of block device" << dendl; + ++errors; + } + } + // overlays + set overlay_keys; + map refs; + for (auto& v : o->onode.overlay_map) { + if (v.first + v.second.length > o->onode.size) { + derr << " " << oid << " overlay " << v.first << " " << v.second + << " extends past end of object" << dendl; + ++errors; + } + if (v.second.key > o->onode.last_overlay_key) { + derr << " " << oid << " overlay " << v.first << " " << v.second + << " is > last_overlay_key " << o->onode.last_overlay_key + << dendl; + ++errors; + } + ++refs[v.second.key]; + string key; + bufferlist val; + get_overlay_key(o->onode.nid, v.second.key, &key); + overlay_keys.insert(key); + int r = db->get(PREFIX_OVERLAY, key, &val); + if (r < 0) { + derr << " " << oid << " overlay " << v.first << " " << v.second + << " failed to fetch: " << cpp_strerror(r) << dendl; + ++errors; + } + if (val.length() < v.second.value_offset + v.second.length) { + derr << " " << oid << " overlay " << v.first << " " << v.second + << " too short, " << val.length() << dendl; + ++errors; + } + } + for (auto& vr : o->onode.overlay_refs) { + if (refs[vr.first] != vr.second) { + derr << " " << oid << " overlay key " << vr.first + << " says " << vr.second << " refs but we have " + << refs[vr.first] << dendl; + ++errors; + } + refs.erase(vr.first); + } + for (auto& p : refs) { + if (p.second > 1) { + derr << " " << oid << " overlay key " << p.first + << " has " << p.second << " refs but they are not recorded" + << dendl; + ++errors; + } + } + do { + string start; + get_overlay_key(o->onode.nid, 0, &start); + KeyValueDB::Iterator it = db->get_iterator(PREFIX_OVERLAY); + if (!it) + break; + for (it->lower_bound(start); it->valid(); it->next()) { + string k = it->key(); + const char *p = k.c_str(); + uint64_t nid; + p = _key_decode_u64(p, &nid); + if (nid != o->onode.nid) + break; + if (!overlay_keys.count(k)) { + derr << " " << oid << " has stray overlay kv pair for " + << k << dendl; + ++errors; + } + } + } while (false); + // omap + while (o->onode.omap_head) { + if (used_omap_head.count(o->onode.omap_head)) { + derr << " " << oid << " omap_head " << o->onode.omap_head + << " already in use" << dendl; + ++errors; + break; + } + used_omap_head.insert(o->onode.omap_head); + // hrm, scan actual key/value pairs? + KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP); + if (!it) + break; + string head, tail; + get_omap_header(o->onode.omap_head, &head); + get_omap_tail(o->onode.omap_head, &tail); + it->lower_bound(head); + while (it->valid()) { + if (it->key() == head) { + dout(30) << __func__ << " got header" << dendl; + } else if (it->key() >= tail) { + dout(30) << __func__ << " reached tail" << dendl; + break; + } else { + string user_key; + decode_omap_key(it->key(), &user_key); + dout(30) << __func__ + << " got " << pretty_binary_string(it->key()) + << " -> " << user_key << dendl; + assert(it->key() < tail); + } + it->next(); + } + break; + } + } + } + } + + dout(1) << __func__ << " checking for stray objects" << dendl; + it = db->get_iterator(PREFIX_OBJ); + if (it) { + CollectionRef c; + for (it->lower_bound(string()); it->valid(); it->next()) { + ghobject_t oid; + int r = get_key_object(it->key(), &oid); + if (r < 0) { + dout(30) << __func__ << " bad object key " + << pretty_binary_string(it->key()) << dendl; + ++errors; + continue; + } + if (!c || !c->contains(oid)) { + c = NULL; + for (ceph::unordered_map::iterator p = + coll_map.begin(); + p != coll_map.end() && !errors; + ++p) { + if (p->second->contains(oid)) { + c = p->second; + break; + } + } + if (!c) { + dout(30) << __func__ << " stray object " << oid + << " not owned by any collection" << dendl; + ++errors; + continue; + } + } + } + } + + dout(1) << __func__ << " checking for stray overlay data" << dendl; + it = db->get_iterator(PREFIX_OVERLAY); + if (it) { + for (it->lower_bound(string()); it->valid(); it->next()) { + string key = it->key(); + const char *p = key.c_str(); + uint64_t nid; + p = _key_decode_u64(p, &nid); + if (used_nids.count(nid) == 0) { + derr << __func__ << " found stray overlay data on nid " << nid << dendl; + ++errors; + } + } + } + + dout(1) << __func__ << " checking for stray omap data" << dendl; + it = db->get_iterator(PREFIX_OMAP); + if (it) { + for (it->lower_bound(string()); it->valid(); it->next()) { + string key = it->key(); + const char *p = key.c_str(); + uint64_t omap_head; + p = _key_decode_u64(p, &omap_head); + if (used_omap_head.count(omap_head) == 0) { + derr << __func__ << " found stray omap data on omap_head " << omap_head + << dendl; + ++errors; + } + } + } + + dout(1) << __func__ << " checking freelist vs allocated" << dendl; + { + const map& free = fm->get_freelist(); + for (map::const_iterator p = free.begin(); + p != free.end(); ++p) { + if (used_blocks.contains(p->first, p->second)) { + derr << __func__ << " free extent " << p->first << "~" << p->second + << " intersects allocated blocks" << dendl; + ++errors; + continue; + } + used_blocks.insert(p->first, p->second); + } + if (!used_blocks.contains(0, bdev->get_size())) { + derr << __func__ << " leaked some space; free+used = " + << used_blocks + << " != expected 0~" << bdev->get_size() + << dendl; + ++errors; + } + } + coll_map.clear(); + out_alloc: + _close_alloc(); + out_db: + it.reset(); // before db is closed + _close_db(); + out_bdev: + _close_bdev(); + out_fsid: + _close_fsid(); + out_path: + _close_path(); + + #endif + dout(1) << __func__ << " finish with " << errors << " errors" << dendl; + return errors; + } + + void KStore::_sync() + { + dout(10) << __func__ << dendl; + + kv_lock.Lock(); + while (!kv_committing.empty() || + !kv_queue.empty()) { + dout(20) << " waiting for kv to commit" << dendl; + kv_sync_cond.Wait(kv_lock); + } + kv_lock.Unlock(); + + dout(10) << __func__ << " done" << dendl; + } + + int KStore::statfs(struct statfs *buf) + { + return db->get_statfs(buf); + } + + // --------------- + // cache + + KStore::CollectionRef KStore::_get_collection(coll_t cid) + { + RWLock::RLocker l(coll_lock); + ceph::unordered_map::iterator cp = coll_map.find(cid); + if (cp == coll_map.end()) + return CollectionRef(); + return cp->second; + } + + void KStore::_queue_reap_collection(CollectionRef& c) + { + dout(10) << __func__ << " " << c->cid << dendl; + Mutex::Locker l(reap_lock); + removed_collections.push_back(c); + } + + void KStore::_reap_collections() + { + reap_lock.Lock(); + + list removed_colls; + removed_colls.swap(removed_collections); + reap_lock.Unlock(); + + for (list::iterator p = removed_colls.begin(); + p != removed_colls.end(); + ++p) { + CollectionRef c = *p; + dout(10) << __func__ << " " << c->cid << dendl; + { + pair next; + while (c->onode_map.get_next(next.first, &next)) { + assert(!next.second->exists); + if (!next.second->flush_txns.empty()) { + dout(10) << __func__ << " " << c->cid << " " << next.second->oid + << " flush_txns " << next.second->flush_txns << dendl; + return; + } + } + } + c->onode_map.clear(); + dout(10) << __func__ << " " << c->cid << " done" << dendl; + } + + dout(10) << __func__ << " all reaped" << dendl; + reap_cond.Signal(); + } + + // --------------- + // read operations + + bool KStore::exists(coll_t cid, const ghobject_t& oid) + { + dout(10) << __func__ << " " << cid << " " << oid << dendl; + CollectionRef c = _get_collection(cid); + if (!c) + return false; + RWLock::RLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) + return false; + return true; + } + + int KStore::stat( + coll_t cid, + const ghobject_t& oid, + struct stat *st, + bool allow_eio) + { + dout(10) << __func__ << " " << cid << " " << oid << dendl; + CollectionRef c = _get_collection(cid); + if (!c) + return -ENOENT; + RWLock::RLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) + return -ENOENT; + st->st_size = o->onode.size; + st->st_blksize = 4096; + st->st_blocks = (st->st_size + st->st_blksize - 1) / st->st_blksize; + st->st_nlink = 1; + return 0; + } + + int KStore::read( + coll_t cid, + const ghobject_t& oid, + uint64_t offset, + size_t length, + bufferlist& bl, + uint32_t op_flags, + bool allow_eio) + { + dout(15) << __func__ << " " << cid << " " << oid + << " " << offset << "~" << length + << dendl; + bl.clear(); + CollectionRef c = _get_collection(cid); + if (!c) + return -ENOENT; + RWLock::RLocker l(c->lock); + + int r; + + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + + if (offset == length && offset == 0) + length = o->onode.size; + + r = _do_read(o, offset, length, bl, op_flags); + + out: + dout(10) << __func__ << " " << cid << " " << oid + << " " << offset << "~" << length + << " = " << r << dendl; + return r; + } + + int KStore::_do_read( + OnodeRef o, + uint64_t offset, + size_t length, + bufferlist& bl, + uint32_t op_flags) + { + int r = 0; + uint64_t stripe_size = o->onode.stripe_size; + uint64_t stripe_off; + + dout(20) << __func__ << " " << offset << "~" << length << " size " + << o->onode.size << " nid " << o->onode.nid << dendl; + bl.clear(); + + if (offset > o->onode.size) { + goto out; + } + if (offset + length > o->onode.size) { + length = o->onode.size - offset; + } + if (stripe_size == 0) { + bufferptr z(length); + z.zero(); + bl.append(z); + r = length; + goto out; + } + + o->flush(); + + stripe_off = offset % stripe_size; + while (length > 0) { + bufferlist stripe; + _do_read_stripe(o, offset - stripe_off, &stripe); + dout(30) << __func__ << " stripe " << offset - stripe_off << " got " + << stripe.length() << dendl; + unsigned swant = MIN(stripe_size - stripe_off, length); + if (stripe.length()) { + if (swant == stripe.length()) { + bl.claim_append(stripe); + dout(30) << __func__ << " taking full stripe" << dendl; + } else { + unsigned l = 0; + if (stripe_off < stripe.length()) { + l = MIN(stripe.length() - stripe_off, swant); + bufferlist t; + t.substr_of(stripe, stripe_off, l); + bl.claim_append(t); + dout(30) << __func__ << " taking " << stripe_off << "~" << l << dendl; + } + if (l < swant) { + bufferptr z(swant - l); + z.zero(); + bl.append(z); + dout(30) << __func__ << " adding " << z.length() << " zeros" << dendl; + } + } + } else { + dout(30) << __func__ << " generating " << swant << " zeros" << dendl; + bufferptr z(swant); + z.zero(); + bl.append(z); + } + offset += swant; + length -= swant; + stripe_off = 0; + } + r = bl.length(); + dout(30) << " result:\n"; + bl.hexdump(*_dout); + *_dout << dendl; + + out: + return r; + } + + int KStore::fiemap( + coll_t cid, + const ghobject_t& oid, + uint64_t offset, + size_t len, + bufferlist& bl) + { + map m; + CollectionRef c = _get_collection(cid); + if (!c) + return -ENOENT; + RWLock::RLocker l(c->lock); + + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + return -ENOENT; + } + + if (offset == len && offset == 0) + len = o->onode.size; + + if (offset > o->onode.size) + goto out; + + if (offset + len > o->onode.size) { + len = o->onode.size - offset; + } + + dout(20) << __func__ << " " << offset << "~" << len << " size " + << o->onode.size << dendl; + + // FIXME: do something smarter here + m[0] = o->onode.size; + + out: + ::encode(m, bl); + dout(20) << __func__ << " " << offset << "~" << len + << " size = 0 (" << m << ")" << dendl; + return 0; + } + + int KStore::getattr( + coll_t cid, + const ghobject_t& oid, + const char *name, + bufferptr& value) + { + dout(15) << __func__ << " " << cid << " " << oid << " " << name << dendl; + CollectionRef c = _get_collection(cid); + if (!c) + return -ENOENT; + RWLock::RLocker l(c->lock); + int r; + string k(name); + + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + + if (!o->onode.attrs.count(k)) { + r = -ENODATA; + goto out; + } + value = o->onode.attrs[k]; + r = 0; + out: + dout(10) << __func__ << " " << cid << " " << oid << " " << name + << " = " << r << dendl; + return r; + } + + int KStore::getattrs( + coll_t cid, + const ghobject_t& oid, + map& aset) + { + dout(15) << __func__ << " " << cid << " " << oid << dendl; + CollectionRef c = _get_collection(cid); + if (!c) + return -ENOENT; + RWLock::RLocker l(c->lock); + int r; + + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + aset = o->onode.attrs; + r = 0; + out: + dout(10) << __func__ << " " << cid << " " << oid + << " = " << r << dendl; + return r; + } + + int KStore::list_collections(vector& ls) + { + RWLock::RLocker l(coll_lock); + for (ceph::unordered_map::iterator p = coll_map.begin(); + p != coll_map.end(); + ++p) + ls.push_back(p->first); + return 0; + } + + bool KStore::collection_exists(coll_t c) + { + RWLock::RLocker l(coll_lock); + return coll_map.count(c); + } + + bool KStore::collection_empty(coll_t cid) + { + dout(15) << __func__ << " " << cid << dendl; + vector ls; + ghobject_t next; + int r = collection_list(cid, ghobject_t(), ghobject_t::get_max(), true, 5, + &ls, &next); + if (r < 0) + return false; // fixme? + bool empty = ls.empty(); + dout(10) << __func__ << " " << cid << " = " << (int)empty << dendl; + return empty; + } + + int KStore::collection_list( + coll_t cid, ghobject_t start, ghobject_t end, + bool sort_bitwise, int max, + vector *ls, ghobject_t *pnext) + { + dout(15) << __func__ << " " << cid + << " start " << start << " end " << end << " max " << max << dendl; + if (!sort_bitwise) + return -EOPNOTSUPP; + CollectionRef c = _get_collection(cid); + if (!c) + return -ENOENT; + RWLock::RLocker l(c->lock); + int r = 0; + KeyValueDB::Iterator it; + string temp_start_key, temp_end_key; + string start_key, end_key; + bool set_next = false; + string pend; + bool temp; + + ghobject_t static_next; + if (!pnext) + pnext = &static_next; + + if (start == ghobject_t::get_max()) + goto out; + get_coll_key_range(cid, c->cnode.bits, &temp_start_key, &temp_end_key, + &start_key, &end_key); + dout(20) << __func__ + << " range " << pretty_binary_string(temp_start_key) + << " to " << pretty_binary_string(temp_end_key) + << " and " << pretty_binary_string(start_key) + << " to " << pretty_binary_string(end_key) + << " start " << start << dendl; + it = db->get_iterator(PREFIX_OBJ); + if (start == ghobject_t() || start == cid.get_min_hobj()) { + it->upper_bound(temp_start_key); + temp = true; + } else { + string k; + get_object_key(start, &k); + if (start.hobj.is_temp()) { + temp = true; + assert(k >= temp_start_key && k < temp_end_key); + } else { + temp = false; + assert(k >= start_key && k < end_key); + } + dout(20) << " start from " << pretty_binary_string(k) + << " temp=" << (int)temp << dendl; + it->lower_bound(k); + } + if (end.hobj.is_max()) { + pend = temp ? temp_end_key : end_key; + } else { + get_object_key(end, &end_key); + if (end.hobj.is_temp()) { + if (temp) + pend = end_key; + else + goto out; + } else { + pend = temp ? temp_end_key : end_key; + } + } + dout(20) << __func__ << " pend " << pretty_binary_string(pend) << dendl; + while (true) { + if (!it->valid() || it->key() > pend) { + if (!it->valid()) + dout(20) << __func__ << " iterator not valid (end of db?)" << dendl; + else + dout(20) << __func__ << " key " << pretty_binary_string(it->key()) + << " > " << end << dendl; + if (temp) { + if (end.hobj.is_temp()) { + break; + } + dout(30) << __func__ << " switch to non-temp namespace" << dendl; + temp = false; + it->upper_bound(start_key); + pend = end_key; + dout(30) << __func__ << " pend " << pretty_binary_string(pend) << dendl; + continue; + } + break; + } + dout(20) << __func__ << " key " << pretty_binary_string(it->key()) << dendl; + ghobject_t oid; + int r = get_key_object(it->key(), &oid); + assert(r == 0); + if (ls->size() >= (unsigned)max) { + dout(20) << __func__ << " reached max " << max << dendl; + *pnext = oid; + set_next = true; + break; + } + ls->push_back(oid); + it->next(); + } + if (!set_next) { + *pnext = ghobject_t::get_max(); + } + out: + dout(10) << __func__ << " " << cid + << " start " << start << " end " << end << " max " << max + << " = " << r << ", ls.size() = " << ls->size() + << ", next = " << *pnext << dendl; + return r; + } + + // omap reads + + KStore::OmapIteratorImpl::OmapIteratorImpl( + CollectionRef c, OnodeRef o, KeyValueDB::Iterator it) + : c(c), o(o), it(it) + { + RWLock::RLocker l(c->lock); + if (o->onode.omap_head) { + get_omap_key(o->onode.omap_head, string(), &head); + get_omap_tail(o->onode.omap_head, &tail); + it->lower_bound(head); + } + } + + int KStore::OmapIteratorImpl::seek_to_first() + { + RWLock::RLocker l(c->lock); + if (o->onode.omap_head) { + it->lower_bound(head); + } else { + it = KeyValueDB::Iterator(); + } + return 0; + } + + int KStore::OmapIteratorImpl::upper_bound(const string& after) + { + RWLock::RLocker l(c->lock); + if (o->onode.omap_head) { + string key; + get_omap_key(o->onode.omap_head, after, &key); + it->upper_bound(key); + } else { + it = KeyValueDB::Iterator(); + } + return 0; + } + + int KStore::OmapIteratorImpl::lower_bound(const string& to) + { + RWLock::RLocker l(c->lock); + if (o->onode.omap_head) { + string key; + get_omap_key(o->onode.omap_head, to, &key); + it->lower_bound(key); + } else { + it = KeyValueDB::Iterator(); + } + return 0; + } + + bool KStore::OmapIteratorImpl::valid() + { + RWLock::RLocker l(c->lock); + if (o->onode.omap_head && it->valid() && it->raw_key().second <= tail) { + return true; + } else { + return false; + } + } + + int KStore::OmapIteratorImpl::next(bool validate) + { + RWLock::RLocker l(c->lock); + if (o->onode.omap_head) { + it->next(); + return 0; + } else { + return -1; + } + } + + string KStore::OmapIteratorImpl::key() + { + RWLock::RLocker l(c->lock); + assert(it->valid()); + string db_key = it->raw_key().second; + string user_key; + decode_omap_key(db_key, &user_key); + return user_key; + } + + bufferlist KStore::OmapIteratorImpl::value() + { + RWLock::RLocker l(c->lock); + assert(it->valid()); + return it->value(); + } + + int KStore::omap_get( + coll_t cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + bufferlist *header, ///< [out] omap header + map *out /// < [out] Key to value map + ) + { + dout(15) << __func__ << " " << cid << " oid " << oid << dendl; + CollectionRef c = _get_collection(cid); + if (!c) + return -ENOENT; + RWLock::RLocker l(c->lock); + int r = 0; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.omap_head) + goto out; + o->flush(); + { + KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP); + string head, tail; + get_omap_header(o->onode.omap_head, &head); + get_omap_tail(o->onode.omap_head, &tail); + it->lower_bound(head); + while (it->valid()) { + if (it->key() == head) { + dout(30) << __func__ << " got header" << dendl; + *header = it->value(); + } else if (it->key() >= tail) { + dout(30) << __func__ << " reached tail" << dendl; + break; + } else { + string user_key; + decode_omap_key(it->key(), &user_key); + dout(30) << __func__ << " got " << pretty_binary_string(it->key()) + << " -> " << user_key << dendl; + assert(it->key() < tail); + (*out)[user_key] = it->value(); + } + it->next(); + } + } + out: + dout(10) << __func__ << " " << cid << " oid " << oid << " = " << r << dendl; + return r; + } + + int KStore::omap_get_header( + coll_t cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + bufferlist *header, ///< [out] omap header + bool allow_eio ///< [in] don't assert on eio + ) + { + dout(15) << __func__ << " " << cid << " oid " << oid << dendl; + CollectionRef c = _get_collection(cid); + if (!c) + return -ENOENT; + RWLock::RLocker l(c->lock); + int r = 0; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.omap_head) + goto out; + o->flush(); + { + string head; + get_omap_header(o->onode.omap_head, &head); + if (db->get(PREFIX_OMAP, head, header) >= 0) { + dout(30) << __func__ << " got header" << dendl; + } else { + dout(30) << __func__ << " no header" << dendl; + } + } + out: + dout(10) << __func__ << " " << cid << " oid " << oid << " = " << r << dendl; + return r; + } + + int KStore::omap_get_keys( + coll_t cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + set *keys ///< [out] Keys defined on oid + ) + { + dout(15) << __func__ << " " << cid << " oid " << oid << dendl; + CollectionRef c = _get_collection(cid); + if (!c) + return -ENOENT; + RWLock::RLocker l(c->lock); + int r = 0; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.omap_head) + goto out; + o->flush(); + { + KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP); + string head, tail; + get_omap_key(o->onode.omap_head, string(), &head); + get_omap_tail(o->onode.omap_head, &tail); + it->lower_bound(head); + while (it->valid()) { + if (it->key() >= tail) { + dout(30) << __func__ << " reached tail" << dendl; + break; + } + string user_key; + decode_omap_key(it->key(), &user_key); + dout(30) << __func__ << " got " << pretty_binary_string(it->key()) + << " -> " << user_key << dendl; + assert(it->key() < tail); + keys->insert(user_key); + it->next(); + } + } + out: + dout(10) << __func__ << " " << cid << " oid " << oid << " = " << r << dendl; + return r; + } + + int KStore::omap_get_values( + coll_t cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const set &keys, ///< [in] Keys to get + map *out ///< [out] Returned keys and values + ) + { + dout(15) << __func__ << " " << cid << " oid " << oid << dendl; + CollectionRef c = _get_collection(cid); + if (!c) + return -ENOENT; + RWLock::RLocker l(c->lock); + int r = 0; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.omap_head) + goto out; + o->flush(); + for (set::const_iterator p = keys.begin(); p != keys.end(); ++p) { + string key; + get_omap_key(o->onode.omap_head, *p, &key); + bufferlist val; + if (db->get(PREFIX_OMAP, key, &val) >= 0) { + dout(30) << __func__ << " got " << pretty_binary_string(key) + << " -> " << *p << dendl; + out->insert(make_pair(*p, val)); + } + } + out: + dout(10) << __func__ << " " << cid << " oid " << oid << " = " << r << dendl; + return r; + } + + int KStore::omap_check_keys( + coll_t cid, ///< [in] Collection containing oid + const ghobject_t &oid, ///< [in] Object containing omap + const set &keys, ///< [in] Keys to check + set *out ///< [out] Subset of keys defined on oid + ) + { + dout(15) << __func__ << " " << cid << " oid " << oid << dendl; + CollectionRef c = _get_collection(cid); + if (!c) + return -ENOENT; + RWLock::RLocker l(c->lock); + int r = 0; + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.omap_head) + goto out; + o->flush(); + for (set::const_iterator p = keys.begin(); p != keys.end(); ++p) { + string key; + get_omap_key(o->onode.omap_head, *p, &key); + bufferlist val; + if (db->get(PREFIX_OMAP, key, &val) >= 0) { + dout(30) << __func__ << " have " << pretty_binary_string(key) + << " -> " << *p << dendl; + out->insert(*p); + } else { + dout(30) << __func__ << " miss " << pretty_binary_string(key) + << " -> " << *p << dendl; + } + } + out: + dout(10) << __func__ << " " << cid << " oid " << oid << " = " << r << dendl; + return r; + } + + ObjectMap::ObjectMapIterator KStore::get_omap_iterator( + coll_t cid, ///< [in] collection + const ghobject_t &oid ///< [in] object + ) + { + + dout(10) << __func__ << " " << cid << " " << oid << dendl; + CollectionRef c = _get_collection(cid); + if (!c) { + dout(10) << __func__ << " " << cid << "doesn't exist" <lock); + OnodeRef o = c->get_onode(oid, false); + if (!o) { + dout(10) << __func__ << " " << oid << "doesn't exist" <flush(); + dout(10) << __func__ << " header = " << o->onode.omap_head <get_iterator(PREFIX_OMAP); + return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it)); + } + + + // ----------------- + // write helpers + + int KStore::_open_super_meta() + { + // nid + { + nid_max = 0; + bufferlist bl; + db->get(PREFIX_SUPER, "nid_max", &bl); + try { + ::decode(nid_max, bl); + } catch (buffer::error& e) { + } + dout(10) << __func__ << " old nid_max " << nid_max << dendl; + nid_last = nid_max; + } + return 0; + } + + void KStore::_assign_nid(TransContext *txc, OnodeRef o) + { + if (o->onode.nid) + return; + Mutex::Locker l(nid_lock); + o->onode.nid = ++nid_last; + dout(20) << __func__ << " " << o->oid << " nid " << o->onode.nid << dendl; + if (nid_last > nid_max) { + nid_max += g_conf->kstore_nid_prealloc; + bufferlist bl; + ::encode(nid_max, bl); + txc->t->set(PREFIX_SUPER, "nid_max", bl); + dout(10) << __func__ << " nid_max now " << nid_max << dendl; + } + } + + KStore::TransContext *KStore::_txc_create(OpSequencer *osr) + { + TransContext *txc = new TransContext(osr); + txc->t = db->get_transaction(); + osr->queue_new(txc); + dout(20) << __func__ << " osr " << osr << " = " << txc << dendl; + return txc; + } + + void KStore::_txc_state_proc(TransContext *txc) + { + while (true) { + dout(10) << __func__ << " txc " << txc + << " " << txc->get_state_name() << dendl; + switch (txc->state) { + case TransContext::STATE_PREPARE: + txc->state = TransContext::STATE_KV_QUEUED; + if (!g_conf->kstore_sync_transaction) { + Mutex::Locker l(kv_lock); + if (g_conf->kstore_sync_submit_transaction) { + db->submit_transaction(txc->t); + } + kv_queue.push_back(txc); + kv_cond.SignalOne(); + return; + } + db->submit_transaction_sync(txc->t); + break; + + case TransContext::STATE_KV_QUEUED: + txc->state = TransContext::STATE_KV_DONE; + _txc_finish_kv(txc); + // ** fall-thru ** + + case TransContext::STATE_KV_DONE: + txc->state = TransContext::STATE_FINISHING; + break; + + case TransContext::TransContext::STATE_FINISHING: + _txc_finish(txc); + return; + + default: + derr << __func__ << " unexpected txc " << txc + << " state " << txc->get_state_name() << dendl; + assert(0 == "unexpected txc state"); + return; + } + } + } + + int KStore::_txc_finalize(OpSequencer *osr, TransContext *txc) + { + dout(20) << __func__ << " osr " << osr << " txc " << txc + << " onodes " << txc->onodes << dendl; + + // finalize onodes + for (set::iterator p = txc->onodes.begin(); + p != txc->onodes.end(); + ++p) { + bufferlist bl; + ::encode((*p)->onode, bl); + dout(20) << " onode size is " << bl.length() << dendl; + txc->t->set(PREFIX_OBJ, (*p)->key, bl); + + Mutex::Locker l((*p)->flush_lock); + (*p)->flush_txns.insert(txc); + } + + return 0; + } + + void KStore::_txc_finish_kv(TransContext *txc) + { + dout(20) << __func__ << " txc " << txc << dendl; + + // warning: we're calling onreadable_sync inside the sequencer lock + if (txc->onreadable_sync) { + txc->onreadable_sync->complete(0); + txc->onreadable_sync = NULL; + } + if (txc->onreadable) { + finisher.queue(txc->onreadable); + txc->onreadable = NULL; + } + if (txc->oncommit) { + finisher.queue(txc->oncommit); + txc->oncommit = NULL; + } + while (!txc->oncommits.empty()) { + finisher.queue(txc->oncommits.front()); + txc->oncommits.pop_front(); + } + + throttle_ops.put(txc->ops); + throttle_bytes.put(txc->bytes); + } + + void KStore::_txc_finish(TransContext *txc) + { + dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl; + assert(txc->state == TransContext::STATE_FINISHING); + + for (set::iterator p = txc->onodes.begin(); + p != txc->onodes.end(); + ++p) { + Mutex::Locker l((*p)->flush_lock); + dout(20) << __func__ << " onode " << *p << " had " << (*p)->flush_txns + << dendl; + assert((*p)->flush_txns.count(txc)); + (*p)->flush_txns.erase(txc); + if ((*p)->flush_txns.empty()) { + (*p)->flush_cond.Signal(); + (*p)->clear_pending_stripes(); + } + } + + // clear out refs + txc->onodes.clear(); + + while (!txc->removed_collections.empty()) { + _queue_reap_collection(txc->removed_collections.front()); + txc->removed_collections.pop_front(); + } + + OpSequencerRef osr = txc->osr; + osr->qlock.Lock(); + txc->state = TransContext::STATE_DONE; + osr->qlock.Unlock(); + + _osr_reap_done(osr.get()); + } + + void KStore::_osr_reap_done(OpSequencer *osr) + { + Mutex::Locker l(osr->qlock); + dout(20) << __func__ << " osr " << osr << dendl; + while (!osr->q.empty()) { + TransContext *txc = &osr->q.front(); + dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name() + << dendl; + if (txc->state != TransContext::STATE_DONE) { + break; + } + + if (txc->first_collection) { + txc->first_collection->onode_map.trim(g_conf->kstore_onode_map_size); + } + + osr->q.pop_front(); + delete txc; + osr->qcond.Signal(); + if (osr->q.empty()) + dout(20) << __func__ << " osr " << osr << " q now empty" << dendl; + } + } + + void KStore::_kv_sync_thread() + { + dout(10) << __func__ << " start" << dendl; + kv_lock.Lock(); + while (true) { + assert(kv_committing.empty()); + if (kv_queue.empty()) { + if (kv_stop) + break; + dout(20) << __func__ << " sleep" << dendl; + kv_sync_cond.Signal(); + kv_cond.Wait(kv_lock); + dout(20) << __func__ << " wake" << dendl; + } else { + dout(20) << __func__ << " committing " << kv_queue.size() << dendl; + kv_committing.swap(kv_queue); + utime_t start = ceph_clock_now(NULL); + kv_lock.Unlock(); + + dout(30) << __func__ << " committing txc " << kv_committing << dendl; + + // one transaction to force a sync + KeyValueDB::Transaction t = db->get_transaction(); + if (!g_conf->kstore_sync_submit_transaction) { + for (std::deque::iterator it = kv_committing.begin(); + it != kv_committing.end(); + ++it) { + db->submit_transaction((*it)->t); + } + } + db->submit_transaction_sync(t); + utime_t finish = ceph_clock_now(NULL); + utime_t dur = finish - start; + dout(20) << __func__ << " committed " << kv_committing.size() + << " in " << dur << dendl; + while (!kv_committing.empty()) { + TransContext *txc = kv_committing.front(); + _txc_state_proc(txc); + kv_committing.pop_front(); + } + + // this is as good a place as any ... + _reap_collections(); + + kv_lock.Lock(); + } + } + kv_lock.Unlock(); + dout(10) << __func__ << " finish" << dendl; + } + + + // --------------------------- + // transactions + + int KStore::queue_transactions( + Sequencer *posr, + list& tls, + TrackedOpRef op, + ThreadPool::TPHandle *handle) + { + Context *onreadable; + Context *ondisk; + Context *onreadable_sync; + ObjectStore::Transaction::collect_contexts( + tls, &onreadable, &ondisk, &onreadable_sync); + int r; + + // set up the sequencer + OpSequencer *osr; + assert(posr); + if (posr->p) { + osr = static_cast(posr->p.get()); + dout(10) << __func__ << " existing " << osr << " " << *osr << dendl; + } else { + osr = new OpSequencer; + osr->parent = posr; + posr->p = osr; + dout(10) << __func__ << " new " << osr << " " << *osr << dendl; + } + + // prepare + TransContext *txc = _txc_create(osr); + txc->onreadable = onreadable; + txc->onreadable_sync = onreadable_sync; + txc->oncommit = ondisk; + + for (list::iterator p = tls.begin(); p != tls.end(); ++p) { + (*p)->set_osr(osr); + txc->ops += (*p)->get_num_ops(); + txc->bytes += (*p)->get_num_bytes(); + _txc_add_transaction(txc, *p); + } + + r = _txc_finalize(osr, txc); + assert(r == 0); + + throttle_ops.get(txc->ops); + throttle_bytes.get(txc->bytes); + + // execute (start) + _txc_state_proc(txc); + return 0; + } + + int KStore::_txc_add_transaction(TransContext *txc, Transaction *t) + { + Transaction::iterator i = t->begin(); + int pos = 0; + + vector cvec(i.colls.size()); + unsigned j = 0; + for (vector::iterator p = i.colls.begin(); p != i.colls.end(); + ++p, ++j) { + cvec[j] = _get_collection(*p); + + // note first collection we reference + if (!j && !txc->first_collection) + txc->first_collection = cvec[j]; + } + + while (i.have_op()) { + Transaction::Op *op = i.decode_op(); + int r = 0; + CollectionRef &c = cvec[op->cid]; + + switch (op->op) { + case Transaction::OP_NOP: + break; + case Transaction::OP_TOUCH: + { + const ghobject_t &oid = i.get_oid(op->oid); + r = _touch(txc, c, oid); + } + break; + + case Transaction::OP_WRITE: + { + const ghobject_t &oid = i.get_oid(op->oid); + uint64_t off = op->off; + uint64_t len = op->len; + uint32_t fadvise_flags = i.get_fadvise_flags(); + bufferlist bl; + i.decode_bl(bl); + r = _write(txc, c, oid, off, len, bl, fadvise_flags); + } + break; + + case Transaction::OP_ZERO: + { + const ghobject_t &oid = i.get_oid(op->oid); + uint64_t off = op->off; + uint64_t len = op->len; + r = _zero(txc, c, oid, off, len); + } + break; + + case Transaction::OP_TRIMCACHE: + { + // deprecated, no-op + } + break; + + case Transaction::OP_TRUNCATE: + { + const ghobject_t& oid = i.get_oid(op->oid); + uint64_t off = op->off; + r = _truncate(txc, c, oid, off); + } + break; + + case Transaction::OP_REMOVE: + { + const ghobject_t& oid = i.get_oid(op->oid); + r = _remove(txc, c, oid); + } + break; + + case Transaction::OP_SETATTR: + { + const ghobject_t &oid = i.get_oid(op->oid); + string name = i.decode_string(); + bufferlist bl; + i.decode_bl(bl); + map to_set; + to_set[name] = bufferptr(bl.c_str(), bl.length()); + r = _setattrs(txc, c, oid, to_set); + } + break; + + case Transaction::OP_SETATTRS: + { + const ghobject_t& oid = i.get_oid(op->oid); + map aset; + i.decode_attrset(aset); + r = _setattrs(txc, c, oid, aset); + } + break; + + case Transaction::OP_RMATTR: + { + const ghobject_t &oid = i.get_oid(op->oid); + string name = i.decode_string(); + r = _rmattr(txc, c, oid, name); + } + break; + + case Transaction::OP_RMATTRS: + { + const ghobject_t &oid = i.get_oid(op->oid); + r = _rmattrs(txc, c, oid); + } + break; + + case Transaction::OP_CLONE: + { + const ghobject_t& oid = i.get_oid(op->oid); + const ghobject_t& noid = i.get_oid(op->dest_oid); + r = _clone(txc, c, oid, noid); + } + break; + + case Transaction::OP_CLONERANGE: + assert(0 == "deprecated"); + break; + + case Transaction::OP_CLONERANGE2: + { + const ghobject_t &oid = i.get_oid(op->oid); + const ghobject_t &noid = i.get_oid(op->dest_oid); + uint64_t srcoff = op->off; + uint64_t len = op->len; + uint64_t dstoff = op->dest_off; + r = _clone_range(txc, c, oid, noid, srcoff, len, dstoff); + } + break; + + case Transaction::OP_MKCOLL: + { + assert(!c); + coll_t cid = i.get_cid(op->cid); + r = _create_collection(txc, cid, op->split_bits, &c); + } + break; + + case Transaction::OP_COLL_HINT: + { + coll_t cid = i.get_cid(op->cid); + uint32_t type = op->hint_type; + bufferlist hint; + i.decode_bl(hint); + bufferlist::iterator hiter = hint.begin(); + if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) { + uint32_t pg_num; + uint64_t num_objs; + ::decode(pg_num, hiter); + ::decode(num_objs, hiter); + dout(10) << __func__ << " collection hint objects is a no-op, " + << " pg_num " << pg_num << " num_objects " << num_objs + << dendl; + } else { + // Ignore the hint + dout(10) << __func__ << " unknown collection hint " << type << dendl; + } + } + break; + + case Transaction::OP_RMCOLL: + { + coll_t cid = i.get_cid(op->cid); + r = _remove_collection(txc, cid, &c); + } + break; + + case Transaction::OP_COLL_ADD: + assert(0 == "not implmeented"); + break; + + case Transaction::OP_COLL_REMOVE: + assert(0 == "not implmeented"); + break; + + case Transaction::OP_COLL_MOVE: + assert(0 == "deprecated"); + break; + + case Transaction::OP_COLL_MOVE_RENAME: + { + assert(op->cid == op->dest_cid); + ghobject_t oldoid = i.get_oid(op->oid); + ghobject_t newoid = i.get_oid(op->dest_oid); + r = _rename(txc, c, oldoid, newoid); + } + break; + + case Transaction::OP_COLL_SETATTR: + r = -EOPNOTSUPP; + break; + + case Transaction::OP_COLL_RMATTR: + r = -EOPNOTSUPP; + break; + + case Transaction::OP_COLL_RENAME: + assert(0 == "not implmeneted"); + break; + + case Transaction::OP_OMAP_CLEAR: + { + ghobject_t oid = i.get_oid(op->oid); + r = _omap_clear(txc, c, oid); + } + break; + case Transaction::OP_OMAP_SETKEYS: + { + ghobject_t oid = i.get_oid(op->oid); + bufferlist aset_bl; + i.decode_attrset_bl(&aset_bl); + r = _omap_setkeys(txc, c, oid, aset_bl); + } + break; + case Transaction::OP_OMAP_RMKEYS: + { + ghobject_t oid = i.get_oid(op->oid); + bufferlist keys_bl; + i.decode_keyset_bl(&keys_bl); + r = _omap_rmkeys(txc, c, oid, keys_bl); + } + break; + case Transaction::OP_OMAP_RMKEYRANGE: + { + ghobject_t oid = i.get_oid(op->oid); + string first, last; + first = i.decode_string(); + last = i.decode_string(); + r = _omap_rmkey_range(txc, c, oid, first, last); + } + break; + case Transaction::OP_OMAP_SETHEADER: + { + ghobject_t oid = i.get_oid(op->oid); + bufferlist bl; + i.decode_bl(bl); + r = _omap_setheader(txc, c, oid, bl); + } + break; + case Transaction::OP_SPLIT_COLLECTION: + assert(0 == "deprecated"); + break; + case Transaction::OP_SPLIT_COLLECTION2: + { + uint32_t bits = op->split_bits; + uint32_t rem = op->split_rem; + r = _split_collection(txc, c, cvec[op->dest_cid], bits, rem); + } + break; + + case Transaction::OP_SETALLOCHINT: + { + ghobject_t oid = i.get_oid(op->oid); + uint64_t expected_object_size = op->expected_object_size; + uint64_t expected_write_size = op->expected_write_size; + r = _setallochint(txc, c, oid, + expected_object_size, + expected_write_size); + } + break; + + default: + derr << "bad op " << op->op << dendl; + assert(0); + } + + if (r < 0) { + bool ok = false; + + if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE || + op->op == Transaction::OP_CLONE || + op->op == Transaction::OP_CLONERANGE2 || + op->op == Transaction::OP_COLL_ADD)) + // -ENOENT is usually okay + ok = true; + if (r == -ENODATA) + ok = true; + + if (!ok) { + const char *msg = "unexpected error code"; + + if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE || + op->op == Transaction::OP_CLONE || + op->op == Transaction::OP_CLONERANGE2)) + msg = "ENOENT on clone suggests osd bug"; + + if (r == -ENOSPC) + // For now, if we hit _any_ ENOSPC, crash, before we do any damage + // by partially applying transactions. + msg = "ENOSPC handling not implemented"; + + if (r == -ENOTEMPTY) { + msg = "ENOTEMPTY suggests garbage data in osd data dir"; + } + + dout(0) << " error " << cpp_strerror(r) << " not handled on operation " << op->op + << " (op " << pos << ", counting from 0)" << dendl; + dout(0) << msg << dendl; + dout(0) << " transaction dump:\n"; + JSONFormatter f(true); + f.open_object_section("transaction"); + t->dump(&f); + f.close_section(); + f.flush(*_dout); + *_dout << dendl; + assert(0 == "unexpected error"); + } + } + + ++pos; + } + + return 0; + } + + + + // ----------------- + // write operations + + int KStore::_touch(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid) + { + dout(15) << __func__ << " " << c->cid << " " << oid << dendl; + int r = 0; + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, true); + assert(o); + o->exists = true; + _assign_nid(txc, o); + txc->write_onode(o); + dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl; + return r; + } + + void KStore::_dump_onode(OnodeRef o) + { + dout(30) << __func__ << " " << o + << " nid " << o->onode.nid + << " size " << o->onode.size + << " expected_object_size " << o->onode.expected_object_size + << " expected_write_size " << o->onode.expected_write_size + << dendl; + for (map::iterator p = o->onode.attrs.begin(); + p != o->onode.attrs.end(); + ++p) { + dout(30) << __func__ << " attr " << p->first + << " len " << p->second.length() << dendl; + } + } + + void KStore::_pad_zeros( + OnodeRef o, + bufferlist *bl, uint64_t *offset, uint64_t *length, + uint64_t block_size) + { + dout(40) << "before:\n"; + bl->hexdump(*_dout); + *_dout << dendl; + // front + size_t front_pad = *offset % block_size; + size_t back_pad = 0; + if (front_pad) { + size_t front_copy = MIN(block_size - front_pad, *length); + bufferptr z = buffer::create_page_aligned(block_size); + memset(z.c_str(), 0, front_pad); + memcpy(z.c_str() + front_pad, bl->get_contiguous(0, front_copy), front_copy); + if (front_copy + front_pad < block_size) { + back_pad = block_size - (*length + front_pad); + memset(z.c_str() + front_pad + *length, 0, back_pad); + } + bufferlist old, t; + old.swap(*bl); + t.substr_of(old, front_copy, *length - front_copy); + bl->append(z); + bl->claim_append(t); + *offset -= front_pad; + *length += front_pad + back_pad; + } + + // back + uint64_t end = *offset + *length; + unsigned back_copy = end % block_size; + if (back_copy) { + assert(back_pad == 0); + back_pad = block_size - back_copy; + assert(back_copy <= *length); + bufferptr tail(block_size); + memcpy(tail.c_str(), bl->get_contiguous(*length - back_copy, back_copy), + back_copy); + memset(tail.c_str() + back_copy, 0, back_pad); + bufferlist old; + old.swap(*bl); + bl->substr_of(old, 0, *length - back_copy); + bl->append(tail); + *length += back_pad; + if (end > o->onode.size && g_conf->kstore_cache_tails) { + o->tail_bl.clear(); + o->tail_bl.append(tail, 0, back_copy); + o->tail_offset = end - back_copy; + dout(20) << __func__ << " cached "<< back_copy << " of tail block at " + << o->tail_offset << dendl; + } + } + dout(20) << __func__ << " pad " << front_pad << " + " << back_pad + << " on front/back, now " << *offset << "~" << *length << dendl; + dout(40) << "after:\n"; + bl->hexdump(*_dout); + *_dout << dendl; + } + + void KStore::_do_read_stripe(OnodeRef o, uint64_t offset, bufferlist *pbl) + { + map::iterator p = o->pending_stripes.find(offset); + if (p == o->pending_stripes.end()) { + string key; + get_data_key(o->onode.nid, offset, &key); + db->get(PREFIX_DATA, key, pbl); + o->pending_stripes[offset] = *pbl; + } else { + *pbl = p->second; + } + } + + void KStore::_do_write_stripe(TransContext *txc, OnodeRef o, + uint64_t offset, bufferlist& bl) + { + o->pending_stripes[offset] = bl; + string key; + get_data_key(o->onode.nid, offset, &key); + txc->t->set(PREFIX_DATA, key, bl); + } + + void KStore::_do_remove_stripe(TransContext *txc, OnodeRef o, uint64_t offset) + { + o->pending_stripes.erase(offset); + string key; + get_data_key(o->onode.nid, offset, &key); + txc->t->rmkey(PREFIX_DATA, key); + } + + int KStore::_do_write(TransContext *txc, + OnodeRef o, + uint64_t offset, uint64_t length, + bufferlist& orig_bl, + uint32_t fadvise_flags) + { + int r = 0; + + dout(20) << __func__ + << " " << o->oid << " " << offset << "~" << length + << " - have " << o->onode.size + << " bytes, nid " << o->onode.nid << dendl; + _dump_onode(o); + o->exists = true; + + if (length == 0) { + return 0; + } + + uint64_t stripe_size = o->onode.stripe_size; + if (!stripe_size) { + o->onode.stripe_size = g_conf->kstore_default_stripe_size; + stripe_size = o->onode.stripe_size; + } + + unsigned bl_off = 0; + while (length > 0) { + uint64_t offset_rem = offset % stripe_size; + uint64_t end_rem = (offset + length) % stripe_size; + if (offset_rem == 0 && end_rem == 0) { + bufferlist bl; + bl.substr_of(orig_bl, bl_off, stripe_size); + dout(30) << __func__ << " full stripe " << offset << dendl; + _do_write_stripe(txc, o, offset, bl); + offset += stripe_size; + length -= stripe_size; + bl_off += stripe_size; + continue; + } + uint64_t stripe_off = offset - offset_rem; + bufferlist prev; + _do_read_stripe(o, stripe_off, &prev); + dout(20) << __func__ << " read previous stripe " << stripe_off + << ", got " << prev.length() << dendl; + bufferlist bl; + if (offset_rem) { + unsigned p = MIN(prev.length(), offset_rem); + if (p) { + dout(20) << __func__ << " reuse leading " << p << " bytes" << dendl; + bl.substr_of(prev, 0, p); + } + if (p < offset_rem) { + bufferptr z(offset_rem - p); + dout(20) << __func__ << " add leading " << z.length() << " zeros" << dendl; + z.zero(); + bl.append(z); + } + } + unsigned use = stripe_size - offset_rem; + if (use > length) + use -= stripe_size - end_rem; + dout(20) << __func__ << " using " << use << " for this stripe" << dendl; + bufferlist t; + t.substr_of(orig_bl, bl_off, use); + bl.claim_append(t); + bl_off += use; + if (end_rem) { + if (end_rem < prev.length()) { + unsigned l = prev.length() - end_rem; + dout(20) << __func__ << " reuse trailing " << l << " bytes" << dendl; + bufferlist t; + t.substr_of(prev, end_rem, l); + bl.claim_append(t); + } + } + dout(30) << " writing:\n"; + bl.hexdump(*_dout); + *_dout << dendl; + _do_write_stripe(txc, o, stripe_off, bl); + offset += use; + length -= use; + } + + if (offset > o->onode.size) { + dout(20) << __func__ << " extending size to " << offset + length + << dendl; + o->onode.size = offset; + } + + return r; + } + + int KStore::_write(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + uint64_t offset, size_t length, + bufferlist& bl, + uint32_t fadvise_flags) + { + dout(15) << __func__ << " " << c->cid << " " << oid + << " " << offset << "~" << length + << dendl; + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, true); + _assign_nid(txc, o); + int r = _do_write(txc, o, offset, length, bl, fadvise_flags); + txc->write_onode(o); + + dout(10) << __func__ << " " << c->cid << " " << oid + << " " << offset << "~" << length + << " = " << r << dendl; + return r; + } + + int KStore::_zero(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + uint64_t offset, size_t length) + { + dout(15) << __func__ << " " << c->cid << " " << oid + << " " << offset << "~" << length + << dendl; + int r = 0; + + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, true); + _assign_nid(txc, o); + + uint64_t stripe_size = o->onode.stripe_size; + if (stripe_size) { + uint64_t end = offset + length; + uint64_t pos = offset; + uint64_t stripe_off = pos % stripe_size; + while (pos < offset + length) { + if (stripe_off || end - pos < stripe_size) { + bufferlist stripe; + _do_read_stripe(o, pos - stripe_off, &stripe); + dout(30) << __func__ << " stripe " << pos - stripe_off << " got " + << stripe.length() << dendl; + bufferlist bl; + bl.substr_of(stripe, 0, MIN(stripe.length(), stripe_off)); + if (end >= pos - stripe_off + stripe_size || + end >= o->onode.size) { + dout(20) << __func__ << " truncated stripe " << pos - stripe_off + << " to " << bl.length() << dendl; + } else { + bufferptr z(end - (pos - stripe_off + bl.length())); + z.zero(); + bl.append(z); + dout(20) << __func__ << " adding " << z.length() << " of zeros" << dendl; + if (stripe.length() > bl.length()) { + unsigned l = stripe.length() - bl.length(); + bufferlist t; + t.substr_of(stripe, stripe.length() - l, l); + dout(20) << __func__ << " keeping tail " << l << " of stripe" << dendl; + bl.claim_append(t); + } + } + _do_write_stripe(txc, o, pos - stripe_off, bl); + pos += stripe_size - stripe_off; + stripe_off = 0; + } else { + dout(20) << __func__ << " rm stripe " << pos << dendl; + _do_remove_stripe(txc, o, pos - stripe_off); + pos += stripe_size; + } + } + } + if (offset + length > o->onode.size) { + o->onode.size = offset + length; + dout(20) << __func__ << " extending size to " << offset + length + << dendl; + } + txc->write_onode(o); + + dout(10) << __func__ << " " << c->cid << " " << oid + << " " << offset << "~" << length + << " = " << r << dendl; + return r; + } + + int KStore::_do_truncate(TransContext *txc, OnodeRef o, uint64_t offset) + { + uint64_t stripe_size = o->onode.stripe_size; + + // trim down stripes + if (stripe_size) { + uint64_t pos = offset; + uint64_t stripe_off = pos % stripe_size; + while (pos < o->onode.size) { + if (stripe_off) { + bufferlist stripe; + _do_read_stripe(o, pos - stripe_off, &stripe); + dout(30) << __func__ << " stripe " << pos - stripe_off << " got " + << stripe.length() << dendl; + bufferlist t; + t.substr_of(stripe, 0, MIN(stripe_off, stripe.length())); + _do_write_stripe(txc, o, pos - stripe_off, t); + dout(20) << __func__ << " truncated stripe " << pos - stripe_off + << " to " << t.length() << dendl; + pos += stripe_size - stripe_off; + stripe_off = 0; + } else { + dout(20) << __func__ << " rm stripe " << pos << dendl; + _do_remove_stripe(txc, o, pos - stripe_off); + pos += stripe_size; + } + } + + // trim down cached tail + if (o->tail_bl.length()) { + if (offset / stripe_size != o->onode.size / stripe_size) { + dout(20) << __func__ << " clear cached tail" << dendl; + o->clear_tail(); + } + } + } + + o->onode.size = offset; + dout(10) << __func__ << " truncate size to " << offset << dendl; + + txc->write_onode(o); + return 0; + } + + int KStore::_truncate(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + uint64_t offset) + { + dout(15) << __func__ << " " << c->cid << " " << oid + << " " << offset + << dendl; + int r = 0; + + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + r = _do_truncate(txc, o, offset); + + out: + dout(10) << __func__ << " " << c->cid << " " << oid + << " " << offset + << " = " << r << dendl; + return r; + } + + int KStore::_do_remove(TransContext *txc, + OnodeRef o) + { + string key; + + _do_truncate(txc, o, 0); + + o->onode.size = 0; + if (o->onode.omap_head) { + _do_omap_clear(txc, o->onode.omap_head); + } + o->exists = false; + o->onode = kstore_onode_t(); + txc->onodes.erase(o); + get_object_key(o->oid, &key); + txc->t->rmkey(PREFIX_OBJ, key); + return 0; + } + + int KStore::_remove(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid) + { + dout(15) << __func__ << " " << c->cid << " " << oid << dendl; + int r; + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + r = _do_remove(txc, o); + + out: + dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl; + return r; + } + + int KStore::_setattr(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + const string& name, + bufferptr& val) + { + dout(15) << __func__ << " " << c->cid << " " << oid + << " " << name << " (" << val.length() << " bytes)" + << dendl; + int r = 0; + + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + o->onode.attrs[name] = val; + txc->write_onode(o); + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << oid + << " " << name << " (" << val.length() << " bytes)" + << " = " << r << dendl; + return r; + } + + int KStore::_setattrs(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + const map& aset) + { + dout(15) << __func__ << " " << c->cid << " " << oid + << " " << aset.size() << " keys" + << dendl; + int r = 0; + + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + for (map::const_iterator p = aset.begin(); + p != aset.end(); ++p) + o->onode.attrs[p->first] = p->second; + txc->write_onode(o); + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << oid + << " " << aset.size() << " keys" + << " = " << r << dendl; + return r; + } + + + int KStore::_rmattr(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + const string& name) + { + dout(15) << __func__ << " " << c->cid << " " << oid + << " " << name << dendl; + int r = 0; + + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + o->onode.attrs.erase(name); + txc->write_onode(o); + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << oid + << " " << name << " = " << r << dendl; + return r; + } + + int KStore::_rmattrs(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid) + { + dout(15) << __func__ << " " << c->cid << " " << oid << dendl; + int r = 0; + + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + o->onode.attrs.clear(); + txc->write_onode(o); + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl; + return r; + } + + void KStore::_do_omap_clear(TransContext *txc, uint64_t id) + { + KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP); + string prefix, tail; + get_omap_header(id, &prefix); + get_omap_tail(id, &tail); + it->lower_bound(prefix); + while (it->valid()) { + if (it->key() >= tail) { + dout(30) << __func__ << " stop at " << tail << dendl; + break; + } + txc->t->rmkey(PREFIX_OMAP, it->key()); + dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl; + it->next(); + } + } + + int KStore::_omap_clear(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid) + { + dout(15) << __func__ << " " << c->cid << " " << oid << dendl; + int r = 0; + + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (o->onode.omap_head != 0) { + _do_omap_clear(txc, o->onode.omap_head); + } + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl; + return r; + } + + int KStore::_omap_setkeys(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + bufferlist &bl) + { + dout(15) << __func__ << " " << c->cid << " " << oid << dendl; + int r = 0; + bufferlist::iterator p = bl.begin(); + __u32 num; + + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.omap_head) { + o->onode.omap_head = o->onode.nid; + txc->write_onode(o); + } + ::decode(num, p); + while (num--) { + string key; + bufferlist value; + ::decode(key, p); + ::decode(value, p); + string final_key; + get_omap_key(o->onode.omap_head, key, &final_key); + dout(30) << __func__ << " " << pretty_binary_string(final_key) + << " <- " << key << dendl; + txc->t->set(PREFIX_OMAP, final_key, value); + } + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl; + return r; + } + + int KStore::_omap_setheader(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + bufferlist& bl) + { + dout(15) << __func__ << " " << c->cid << " " << oid << dendl; + int r = 0; + + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + string key; + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.omap_head) { + o->onode.omap_head = o->onode.nid; + txc->write_onode(o); + } + get_omap_header(o->onode.omap_head, &key); + txc->t->set(PREFIX_OMAP, key, bl); + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl; + return r; + } + + int KStore::_omap_rmkeys(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + bufferlist& bl) + { + dout(15) << __func__ << " " << c->cid << " " << oid << dendl; + int r = 0; + bufferlist::iterator p = bl.begin(); + __u32 num; + + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.omap_head) { + r = 0; + goto out; + } + ::decode(num, p); + while (num--) { + string key; + ::decode(key, p); + string final_key; + get_omap_key(o->onode.omap_head, key, &final_key); + dout(30) << __func__ << " rm " << pretty_binary_string(final_key) + << " <- " << key << dendl; + txc->t->rmkey(PREFIX_OMAP, final_key); + } + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl; + return r; + } + + int KStore::_omap_rmkey_range(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + const string& first, const string& last) + { + dout(15) << __func__ << " " << c->cid << " " << oid << dendl; + int r = 0; + KeyValueDB::Iterator it; + string key_first, key_last; + + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + if (!o->onode.omap_head) { + r = 0; + goto out; + } + it = db->get_iterator(PREFIX_OMAP); + get_omap_key(o->onode.omap_head, first, &key_first); + get_omap_key(o->onode.omap_head, last, &key_last); + it->lower_bound(key_first); + while (it->valid()) { + if (it->key() >= key_last) { + dout(30) << __func__ << " stop at " << pretty_binary_string(key_last) + << dendl; + break; + } + txc->t->rmkey(PREFIX_OMAP, it->key()); + dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl; + it->next(); + } + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << oid << " = " << r << dendl; + return r; + } + + int KStore::_setallochint(TransContext *txc, + CollectionRef& c, + const ghobject_t& oid, + uint64_t expected_object_size, + uint64_t expected_write_size) + { + dout(15) << __func__ << " " << c->cid << " " << oid + << " object_size " << expected_object_size + << " write_size " << expected_write_size + << dendl; + int r = 0; + RWLock::WLocker l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + r = -ENOENT; + goto out; + } + + o->onode.expected_object_size = expected_object_size; + o->onode.expected_write_size = expected_write_size; + txc->write_onode(o); + + out: + dout(10) << __func__ << " " << c->cid << " " << oid + << " object_size " << expected_object_size + << " write_size " << expected_write_size + << " = " << r << dendl; + return r; + } + + int KStore::_clone(TransContext *txc, + CollectionRef& c, + const ghobject_t& old_oid, + const ghobject_t& new_oid) + { + dout(15) << __func__ << " " << c->cid << " " << old_oid << " -> " + << new_oid << dendl; + int r = 0; + + RWLock::WLocker l(c->lock); + bufferlist bl; + OnodeRef newo; + OnodeRef oldo = c->get_onode(old_oid, false); + if (!oldo || !oldo->exists) { + r = -ENOENT; + goto out; + } + newo = c->get_onode(new_oid, true); + assert(newo); + newo->exists = true; + _assign_nid(txc, newo); + + r = _do_read(oldo, 0, oldo->onode.size, bl, 0); + if (r < 0) + goto out; + + // truncate any old data + r = _do_truncate(txc, newo, 0); + if (r < 0) + goto out; + + r = _do_write(txc, newo, 0, oldo->onode.size, bl, 0); + + newo->onode.attrs = oldo->onode.attrs; + + // clone omap + if (newo->onode.omap_head) { + dout(20) << __func__ << " clearing old omap data" << dendl; + _do_omap_clear(txc, newo->onode.omap_head); + } + if (oldo->onode.omap_head) { + dout(20) << __func__ << " copying omap data" << dendl; + if (!newo->onode.omap_head) { + newo->onode.omap_head = newo->onode.nid; + } + KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP); + string head, tail; + get_omap_header(oldo->onode.omap_head, &head); + get_omap_tail(oldo->onode.omap_head, &tail); + it->lower_bound(head); + while (it->valid()) { + string key; + if (it->key() >= tail) { + dout(30) << __func__ << " reached tail" << dendl; + break; + } else { + dout(30) << __func__ << " got header/data " + << pretty_binary_string(it->key()) << dendl; + assert(it->key() < tail); + rewrite_omap_key(newo->onode.omap_head, it->key(), &key); + txc->t->set(PREFIX_OMAP, key, it->value()); + } + it->next(); + } + } + + txc->write_onode(newo); + + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> " + << new_oid << " = " << r << dendl; + return r; + } + + int KStore::_clone_range(TransContext *txc, + CollectionRef& c, + const ghobject_t& old_oid, + const ghobject_t& new_oid, + uint64_t srcoff, uint64_t length, uint64_t dstoff) + { + dout(15) << __func__ << " " << c->cid << " " << old_oid << " -> " + << new_oid << " from " << srcoff << "~" << length + << " to offset " << dstoff << dendl; + int r = 0; + + RWLock::WLocker l(c->lock); + bufferlist bl; + OnodeRef newo; + OnodeRef oldo = c->get_onode(old_oid, false); + if (!oldo || !oldo->exists) { + r = -ENOENT; + goto out; + } + newo = c->get_onode(new_oid, true); + assert(newo); + newo->exists = true; + + r = _do_read(oldo, srcoff, length, bl, 0); + if (r < 0) + goto out; + + r = _do_write(txc, newo, dstoff, bl.length(), bl, 0); + + txc->write_onode(newo); + + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> " + << new_oid << " from " << srcoff << "~" << length + << " to offset " << dstoff + << " = " << r << dendl; + return r; + } + + int KStore::_rename(TransContext *txc, + CollectionRef& c, + const ghobject_t& old_oid, + const ghobject_t& new_oid) + { + dout(15) << __func__ << " " << c->cid << " " << old_oid << " -> " + << new_oid << dendl; + int r; + + RWLock::WLocker l(c->lock); + bufferlist bl; + string old_key, new_key; + OnodeRef newo; + OnodeRef oldo = c->get_onode(old_oid, false); + if (!oldo || !oldo->exists) { + r = -ENOENT; + goto out; + } + newo = c->get_onode(new_oid, true); + assert(newo); + + if (newo->exists) { + r = _do_remove(txc, newo); + if (r < 0) + return r; + } + + txc->t->rmkey(PREFIX_OBJ, oldo->key); + txc->write_onode(oldo); + c->onode_map.rename(old_oid, new_oid); // this adjusts oldo->{oid,key} + r = 0; + + out: + dout(10) << __func__ << " " << c->cid << " " << old_oid << " -> " + << new_oid << " = " << r << dendl; + return r; + } + + // collections + + int KStore::_create_collection( + TransContext *txc, + coll_t cid, + unsigned bits, + CollectionRef *c) + { + dout(15) << __func__ << " " << cid << " bits " << bits << dendl; + int r; + bufferlist bl; + + { + RWLock::WLocker l(coll_lock); + if (*c) { + r = -EEXIST; + goto out; + } + c->reset(new Collection(this, cid)); + (*c)->cnode.bits = bits; + coll_map[cid] = *c; + } + ::encode((*c)->cnode, bl); + txc->t->set(PREFIX_COLL, stringify(cid), bl); + r = 0; + + out: + dout(10) << __func__ << " " << cid << " bits " << bits << " = " << r << dendl; + return r; + } + + int KStore::_remove_collection(TransContext *txc, coll_t cid, + CollectionRef *c) + { + dout(15) << __func__ << " " << cid << dendl; + int r; + + { + RWLock::WLocker l(coll_lock); + if (!*c) { + r = -ENOENT; + goto out; + } + pair next; + while ((*c)->onode_map.get_next(next.first, &next)) { + if (next.second->exists) { + r = -ENOTEMPTY; + goto out; + } + } + coll_map.erase(cid); + txc->removed_collections.push_back(*c); + c->reset(); + } + txc->t->rmkey(PREFIX_COLL, stringify(cid)); + r = 0; + + out: + dout(10) << __func__ << " " << cid << " = " << r << dendl; + return r; + } + + int KStore::_split_collection(TransContext *txc, + CollectionRef& c, + CollectionRef& d, + unsigned bits, int rem) + { + dout(15) << __func__ << " " << c->cid << " to " << d->cid << " " + << " bits " << bits << dendl; + int r; + RWLock::WLocker l(c->lock); + RWLock::WLocker l2(d->lock); + c->onode_map.clear(); + d->onode_map.clear(); + c->cnode.bits = bits; + assert(d->cnode.bits == bits); + r = 0; + + dout(10) << __func__ << " " << c->cid << " to " << d->cid << " " + << " bits " << bits << " = " << r << dendl; + return r; + } + + // =========================================== diff --cc src/rgw/rgw_common.cc index e20417444b4e,dfd31050a43c..80b6c1bd4ace --- a/src/rgw/rgw_common.cc +++ b/src/rgw/rgw_common.cc @@@ -613,7 -612,7 +612,8 @@@ int RGWHTTPArgs::parse( (name.compare("versionId") == 0) || (name.compare("versions") == 0) || (name.compare("versioning") == 0) || + (name.compare("website") == 0) || + (name.compare("requestPayment") == 0) || (name.compare("torrent") == 0)) { sub_resources[name] = val; } else if (name[0] == 'r') { // root of all evil diff --cc src/rgw/rgw_common.h index aeb6866e2df4,59893527d03b..b5a4d4867b29 --- a/src/rgw/rgw_common.h +++ b/src/rgw/rgw_common.h @@@ -74,7 -71,10 +75,11 @@@ using ceph::crypto::MD5 #define RGW_ATTR_SHADOW_OBJ RGW_ATTR_PREFIX "shadow_name" #define RGW_ATTR_MANIFEST RGW_ATTR_PREFIX "manifest" #define RGW_ATTR_USER_MANIFEST RGW_ATTR_PREFIX "user_manifest" +#define RGW_ATTR_AMZ_WEBSITE_REDIRECT_LOCATION RGW_ATTR_PREFIX RGW_AMZ_WEBSITE_REDIRECT_LOCATION + #define RGW_ATTR_SLO_MANIFEST RGW_ATTR_PREFIX "slo_manifest" + /* Information whether an object is SLO or not must be exposed to + * user through custom HTTP header named X-Static-Large-Object. */ + #define RGW_ATTR_SLO_UINDICATOR RGW_ATTR_META_PREFIX "static-large-object" #define RGW_ATTR_TEMPURL_KEY1 RGW_ATTR_META_PREFIX "temp-url-key" #define RGW_ATTR_TEMPURL_KEY2 RGW_ATTR_META_PREFIX "temp-url-key-2" @@@ -158,8 -155,12 +163,14 @@@ #define ERR_INVALID_ACCESS_KEY 2028 #define ERR_MALFORMED_XML 2029 #define ERR_USER_EXIST 2030 - #define ERR_WEBSITE_REDIRECT 2031 - #define ERR_NO_SUCH_WEBSITE_CONFIGURATION 2032 + #define ERR_NOT_SLO_MANIFEST 2031 + #define ERR_EMAIL_EXIST 2032 + #define ERR_KEY_EXIST 2033 + #define ERR_INVALID_SECRET_KEY 2034 + #define ERR_INVALID_KEY_TYPE 2035 + #define ERR_INVALID_CAP 2036 ++#define ERR_WEBSITE_REDIRECT 2037 ++#define ERR_NO_SUCH_WEBSITE_CONFIGURATION 2038 #define ERR_USER_SUSPENDED 2100 #define ERR_INTERNAL_ERROR 2200 #define ERR_NOT_IMPLEMENTED 2201 @@@ -804,13 -807,12 +818,15 @@@ struct RGWBucketInf // Represents the shard number for blind bucket. const static uint32_t NUM_SHARDS_BLIND_BUCKET; + bool requester_pays; + + bool has_website; + RGWBucketWebsiteConf website_conf; + void encode(bufferlist& bl) const { - ENCODE_START(12, 4, bl); - ENCODE_START(13, 4, bl); ++ ENCODE_START(14, 4, bl); ::encode(bucket, bl); - ::encode(owner, bl); + ::encode(owner.id, bl); ::encode(flags, bl); ::encode(region, bl); uint64_t ct = (uint64_t)creation_time; @@@ -820,17 -822,18 +836,22 @@@ ::encode(quota, bl); ::encode(num_shards, bl); ::encode(bucket_index_shard_hash_type, bl); + ::encode(requester_pays, bl); + ::encode(owner.tenant, bl); + ::encode(has_website, bl); + if (has_website) { + ::encode(website_conf, bl); + } ENCODE_FINISH(bl); } void decode(bufferlist::iterator& bl) { - DECODE_START_LEGACY_COMPAT_LEN_32(12, 4, 4, bl); - DECODE_START_LEGACY_COMPAT_LEN_32(13, 4, 4, bl); ++ DECODE_START_LEGACY_COMPAT_LEN_32(14, 4, 4, bl); ::decode(bucket, bl); - if (struct_v >= 2) - ::decode(owner, bl); + if (struct_v >= 2) { + string s; + ::decode(s, bl); + owner.from_str(s); + } if (struct_v >= 3) ::decode(flags, bl); if (struct_v >= 5) @@@ -850,14 -853,10 +871,18 @@@ ::decode(num_shards, bl); if (struct_v >= 11) ::decode(bucket_index_shard_hash_type, bl); - if (struct_v >= 12) { + if (struct_v >= 12) + ::decode(requester_pays, bl); + if (struct_v >= 13) + ::decode(owner.tenant, bl); ++ if (struct_v >= 14) { + ::decode(has_website, bl); + if (has_website) { + ::decode(website_conf, bl); + } else { + website_conf = RGWBucketWebsiteConf(); + } + } DECODE_FINISH(bl); } void dump(Formatter *f) const; @@@ -869,8 -868,7 +894,8 @@@ int versioning_status() { return flags & (BUCKET_VERSIONED | BUCKET_VERSIONS_SUSPENDED); } bool versioning_enabled() { return versioning_status() == BUCKET_VERSIONED; } - RGWBucketInfo() : flags(0), creation_time(0), has_instance_obj(false), num_shards(0), bucket_index_shard_hash_type(MOD), - RGWBucketInfo() : flags(0), creation_time(0), has_instance_obj(false), num_shards(0), bucket_index_shard_hash_type(MOD), requester_pays(false) {} ++ RGWBucketInfo() : flags(0), creation_time(0), has_instance_obj(false), num_shards(0), bucket_index_shard_hash_type(MOD), requester_pays(false), + has_website(false) {} }; WRITE_CLASS_ENCODER(RGWBucketInfo) diff --cc src/rgw/rgw_formats.h index 1a22f8093c33,43c087d440ea..9ce841a55481 --- a/src/rgw/rgw_formats.h +++ b/src/rgw/rgw_formats.h @@@ -22,12 -22,9 +22,12 @@@ struct plain_stack_entry class RGWFormatter_Plain : public Formatter { void reset_buf(); public: - RGWFormatter_Plain(); + RGWFormatter_Plain(bool use_kv = false); virtual ~RGWFormatter_Plain(); - virtual void set_status(const char* status, const char* status_name) {}; ++ virtual void set_status(int status, const char* status_name) {}; + virtual void output_header() {}; + virtual void output_footer() {}; virtual void flush(ostream& os); virtual void reset(); diff --cc src/rgw/rgw_json_enc.cc index c1e4bcf4087e,e18bbb658441..3e6a031b3ed3 --- a/src/rgw/rgw_json_enc.cc +++ b/src/rgw/rgw_json_enc.cc @@@ -623,10 -555,7 +629,11 @@@ void RGWBucketInfo::dump(Formatter *f) encode_json("quota", quota, f); encode_json("num_shards", num_shards, f); encode_json("bi_shard_hash_type", (uint32_t)bucket_index_shard_hash_type, f); + encode_json("requester_pays", requester_pays, f); + encode_json("has_website", has_website, f); + if (has_website) { + encode_json("website_conf", website_conf, f); + } } void RGWBucketInfo::decode_json(JSONObj *obj) { @@@ -642,10 -571,7 +649,11 @@@ uint32_t hash_type; JSONDecoder::decode_json("bi_shard_hash_type", hash_type, obj); bucket_index_shard_hash_type = (uint8_t)hash_type; + JSONDecoder::decode_json("requester_pays", requester_pays, obj); + JSONDecoder::decode_json("has_website", has_website, obj); + if (has_website) { + JSONDecoder::decode_json("website_conf", website_conf, obj); + } } void RGWObjEnt::dump(Formatter *f) const diff --cc src/rgw/rgw_op.h index 19acfa3fb922,880c634e016e..06d0f206c76c --- a/src/rgw/rgw_op.h +++ b/src/rgw/rgw_op.h @@@ -189,18 -197,87 +201,99 @@@ public virtual bool need_object_expiration() { return false; } }; +class RGWGetObj_CB : public RGWGetDataCB +{ + RGWGetObj *op; +public: + RGWGetObj_CB(RGWGetObj *_op) : op(_op) {} + virtual ~RGWGetObj_CB() {} + + int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) { + return op->get_data_cb(bl, bl_ofs, bl_len); + } +}; + + class RGWBulkDelete : public RGWOp { + public: + struct acct_path_t { + std::string bucket_name; + rgw_obj_key obj_key; + }; + + struct fail_desc_t { + int err; + acct_path_t path; + }; + + class Deleter { + protected: + unsigned int num_deleted; + unsigned int num_unfound; + std::list failures; + + RGWRados * const store; + req_state * const s; + + public: + Deleter(RGWRados * const str, req_state * const s) + : num_deleted(0), + num_unfound(0), + store(str), + s(s) { + } + + unsigned int get_num_deleted() const { + return num_deleted; + } + + unsigned int get_num_unfound() const { + return num_unfound; + } + + const std::list get_failures() const { + return failures; + } + + bool verify_permission(RGWBucketInfo& binfo, + map& battrs, + rgw_obj& obj, + ACLOwner& bucket_owner /* out */); + bool verify_permission(RGWBucketInfo& binfo, + map& battrs); + bool delete_single(const acct_path_t& path); + bool delete_chunk(const std::list& paths); + }; + /* End of Deleter subclass */ + + static const size_t MAX_CHUNK_ENTRIES = 1024; + + protected: + int ret; + std::unique_ptr deleter; + + public: + RGWBulkDelete() + : ret(0), + deleter(nullptr) { + } + + int verify_permission(); + void pre_exec(); + void execute(); + + virtual int get_data(std::list& items, + bool * is_truncated) = 0; + virtual void send_response() = 0; + + virtual const string name() { return "bulk_delete"; } + virtual RGWOpType get_type() { return RGW_OP_BULK_DELETE; } + virtual uint32_t op_mask() { return RGW_OP_TYPE_DELETE; } + }; + + inline ostream& operator<<(ostream& out, const RGWBulkDelete::acct_path_t &o) { + return out << o.bucket_name << "/" << o.obj_key; + } + #define RGW_LIST_BUCKETS_LIMIT_MAX 10000 class RGWListBuckets : public RGWOp { diff --cc src/rgw/rgw_rest.cc index dde67a4606d9,d7f6b6072f71..db7a056f2388 --- a/src/rgw/rgw_rest.cc +++ b/src/rgw/rgw_rest.cc @@@ -35,16 -32,13 +35,14 @@@ struct rgw_http_attr /* * mapping between rgw object attrs and output http fields */ - static struct rgw_http_attr rgw_to_http_attr_list[] = { - { RGW_ATTR_CONTENT_TYPE, "Content-Type"}, - { RGW_ATTR_CONTENT_LANG, "Content-Language"}, - { RGW_ATTR_EXPIRES, "Expires"}, - { RGW_ATTR_CACHE_CONTROL, "Cache-Control"}, - { RGW_ATTR_CONTENT_DISP, "Content-Disposition"}, - { RGW_ATTR_CONTENT_ENC, "Content-Encoding"}, - { RGW_ATTR_USER_MANIFEST, "X-Object-Manifest"}, + static const struct rgw_http_attr base_rgw_to_http_attrs[] = { + { RGW_ATTR_CONTENT_LANG, "Content-Language" }, + { RGW_ATTR_EXPIRES, "Expires" }, + { RGW_ATTR_CACHE_CONTROL, "Cache-Control" }, + { RGW_ATTR_CONTENT_DISP, "Content-Disposition" }, + { RGW_ATTR_CONTENT_ENC, "Content-Encoding" }, + { RGW_ATTR_USER_MANIFEST, "X-Object-Manifest" }, + { RGW_ATTR_AMZ_WEBSITE_REDIRECT_LOCATION, "Location"}, - { NULL, NULL}, }; @@@ -269,9 -247,8 +268,9 @@@ static bool rgw_find_host_in_domains(co return false; } - static void dump_status(struct req_state *s, const char *status, const char *status_name) + static void dump_status(struct req_state *s, int status, const char *status_name) { + s->formatter->set_status(status, status_name); int r = s->cio->send_status(status, status_name); if (r < 0) { ldout(s->cct, 0) << "ERROR: s->cio->send_status() returned err=" << r << dendl; @@@ -317,15 -294,10 +317,15 @@@ void set_req_state_err(struct rgw_err& return; } } + r = search_err(err_no, RGW_HTTP_ERRORS, ARRAY_LEN(RGW_HTTP_ERRORS)); if (r) { - if (s->prot_flags & RGW_REST_WEBSITE && err_no == ERR_WEBSITE_REDIRECT && !s->err.is_clear()) { - err.http_ret = r->http_ret; ++ if (prot_flags & RGW_REST_WEBSITE && err_no == ERR_WEBSITE_REDIRECT && err.is_clear()) { + // http_ret was custom set, so don't change it! + } else { - s->err.http_ret = r->http_ret; ++ err.http_ret = r->http_ret; + } - s->err.s3_code = r->s3_code; + err.s3_code = r->s3_code; return; } dout(0) << "WARNING: set_req_state_err err_no=" << err_no << " resorting to 500" << dendl; @@@ -586,15 -583,9 +615,15 @@@ void end_header(struct req_state *s, RG s->formatter->dump_string("Code", s->err.s3_code); if (!s->err.message.empty()) s->formatter->dump_string("Message", s->err.message); - if (!s->bucket_name_str.empty()) // TODO: connect to expose_bucket - s->formatter->dump_string("BucketName", s->bucket_name_str); - if (!s->trans_id.empty()) ++ if (!s->bucket_name.empty()) // TODO: connect to expose_bucket ++ s->formatter->dump_string("BucketName", s->bucket_name); + if (!s->trans_id.empty()) // TODO: connect to expose_bucket or another toggle s->formatter->dump_string("RequestId", s->trans_id); - s->formatter->close_section(); + s->formatter->dump_string("HostId", "FIXME-TODO-How-does-amazon-generate-HostId"); // TODO, FIXME + if (s->format != RGW_FORMAT_HTML) { + s->formatter->close_section(); + } + s->formatter->output_footer(); dump_content_length(s, s->formatter->get_len()); } else { if (proposed_content_length != NO_CONTENT_LENGTH) { diff --cc src/rgw/rgw_rest.h index 8954650066cc,4ba3c96f09ab..ca143ce4da10 --- a/src/rgw/rgw_rest.h +++ b/src/rgw/rgw_rest.h @@@ -380,11 -422,10 +428,11 @@@ extern void end_header(struct req_stat RGWOp *op = NULL, const char *content_type = NULL, const int64_t proposed_content_length = NO_CONTENT_LENGTH, - bool force_content_type = false); + bool force_content_type = false, + bool force_no_error = false); extern void dump_start(struct req_state *s); extern void list_all_buckets_start(struct req_state *s); - extern void dump_owner(struct req_state *s, string& id, string& name, const char *section = NULL); + extern void dump_owner(struct req_state *s, rgw_user& id, string& name, const char *section = NULL); extern void dump_string_header(struct req_state *s, const char *name, const char *val); extern void dump_content_length(struct req_state *s, uint64_t len); extern void dump_etag(struct req_state *s, const char *etag); diff --cc src/rgw/rgw_rest_s3.cc index 29033512a0c3,8c00e192c03f..6eaad59ed98f --- a/src/rgw/rgw_rest_s3.cc +++ b/src/rgw/rgw_rest_s3.cc @@@ -2598,179 -2681,11 +2782,179 @@@ RGWHandler *RGWRESTMgr_S3::get_handler( if (ret < 0) return NULL; - if (s->bucket_name.empty()) - return new RGWHandler_ObjStore_Service_S3; + RGWHandler* handler; + // TODO: Make this more readable + if (is_s3website) { - if (s->bucket_name_str.empty()) { ++ if (s->bucket_name.empty()) { + handler = new RGWHandler_ObjStore_Service_S3Website; + } else if (s->object.empty()) { + handler = new RGWHandler_ObjStore_Bucket_S3Website; + } else { + handler = new RGWHandler_ObjStore_Obj_S3Website; + } + } else { - if (s->bucket_name_str.empty()) { ++ if (s->bucket_name.empty()) { + handler = new RGWHandler_ObjStore_Service_S3; + } else if (s->object.empty()) { + handler = new RGWHandler_ObjStore_Bucket_S3; + } else { + handler = new RGWHandler_ObjStore_Obj_S3; + } + } + + ldout(s->cct, 20) << __func__ << " handler=" << typeid(*handler).name() << dendl; + return handler; +} + +int RGWHandler_ObjStore_S3Website::retarget(RGWOp *op, RGWOp **new_op) { + *new_op = op; + ldout(s->cct, 10) << __func__ << "Starting retarget" << dendl; - if (s->object.empty()) - return new RGWHandler_ObjStore_Bucket_S3; + if (!(s->prot_flags & RGW_REST_WEBSITE)) + return 0; + + RGWObjectCtx& obj_ctx = *static_cast(s->obj_ctx); - int ret = store->get_bucket_info(obj_ctx, s->bucket_name_str, s->bucket_info, NULL, &s->bucket_attrs); ++ int ret = store->get_bucket_info(obj_ctx, s->bucket_tenant, s->bucket_name, s->bucket_info, NULL, &s->bucket_attrs); + if (ret < 0) { + // TODO-FUTURE: if the bucket does not exist, maybe expose it here? + return -ERR_NO_SUCH_BUCKET; + } + if (!s->bucket_info.has_website) { + // TODO-FUTURE: if the bucket has no WebsiteConfig, expose it here + return -ERR_NO_SUCH_WEBSITE_CONFIGURATION; + } + + rgw_obj_key new_obj; + s->bucket_info.website_conf.get_effective_key(s->object.name, &new_obj.name); + ldout(s->cct, 10) << "retarget get_effective_key " << s->object << " -> " << new_obj << dendl; + + RGWBWRoutingRule rrule; + bool should_redirect = s->bucket_info.website_conf.should_redirect(new_obj.name, 0, &rrule); + + if (should_redirect) { + const string& hostname = s->info.env->get("HTTP_HOST", ""); + const string& protocol = (s->info.env->get("SERVER_PORT_SECURE") ? "https" : "http"); + int redirect_code = 0; + rrule.apply_rule(protocol, hostname, s->object.name, &s->redirect, &redirect_code); + // APply a custom HTTP response code + if (redirect_code > 0) + s->err.http_ret = redirect_code; // Apply a custom HTTP response code + ldout(s->cct, 10) << "retarget redirect code=" << redirect_code << " proto+host:" << protocol << "://" << hostname << " -> " << s->redirect << dendl; + return -ERR_WEBSITE_REDIRECT; + } + +#warning FIXME +#if 0 + if (s->object.empty() != new_obj.empty()) { + op->put(); + s->object = new_obj; + *new_op = get_op(); + } +#endif + + s->object = new_obj; + + return 0; +} - return new RGWHandler_ObjStore_Obj_S3; +RGWOp *RGWHandler_ObjStore_S3Website::op_get() +{ + return get_obj_op(true); +} + +RGWOp *RGWHandler_ObjStore_S3Website::op_head() +{ + return get_obj_op(false); +} + +int RGWHandler_ObjStore_S3Website::get_errordoc(const string errordoc_key, string *error_content) { + ldout(s->cct, 20) << "TODO Serve Custom error page here if bucket has " << dendl; + *error_content = errordoc_key; + // 1. Check if errordoc exists + // 2. Check if errordoc is public + // 3. Fetch errordoc content +#warning 2015119: FIXME need to clear all + RGWGetObj_ObjStore_S3Website *getop = new RGWGetObj_ObjStore_S3Website(true); + getop->set_get_data(true); + getop->init(store, s, this); + + RGWGetObj_CB cb(getop); + rgw_obj obj(s->bucket, errordoc_key); + RGWObjectCtx rctx(store); + //RGWRados::Object op_target(store, s->bucket_info, *static_cast(s->obj_ctx), obj); + RGWRados::Object op_target(store, s->bucket_info, rctx, obj); + RGWRados::Object::Read read_op(&op_target); + + int ret; + int64_t ofs = 0; + int64_t end = -1; + ret = read_op.prepare(&ofs, &end); + if (ret < 0) + return ret; + + ret = read_op.iterate(ofs, end, &cb); // FIXME: need to know the final size? + return ret; +} + +int RGWHandler_ObjStore_S3Website::error_handler(int err_no, string *error_content) { + const struct rgw_http_errors *r; + int http_error_code = -1; + r = search_err(err_no, RGW_HTTP_ERRORS, ARRAY_LEN(RGW_HTTP_ERRORS)); + if (r) { + http_error_code = r->http_ret; + } + + RGWBWRoutingRule rrule; + bool should_redirect = s->bucket_info.website_conf.should_redirect(s->object.name, http_error_code, &rrule); + + if (should_redirect) { + const string& hostname = s->info.env->get("HTTP_HOST", ""); + const string& protocol = (s->info.env->get("SERVER_PORT_SECURE") ? "https" : "http"); + int redirect_code = 0; + rrule.apply_rule(protocol, hostname, s->object.name, &s->redirect, &redirect_code); + // APply a custom HTTP response code + if (redirect_code > 0) + s->err.http_ret = redirect_code; // Apply a custom HTTP response code + ldout(s->cct, 10) << "error handler redirect code=" << redirect_code << " proto+host:" << protocol << "://" << hostname << " -> " << s->redirect << dendl; + return -ERR_WEBSITE_REDIRECT; + } else if (!s->bucket_info.website_conf.error_doc.empty()) { + RGWHandler_ObjStore_S3Website::get_errordoc(s->bucket_info.website_conf.error_doc, error_content); + } else { + ldout(s->cct, 20) << "No special error handling today!" << dendl; + } + + return err_no; +} + +RGWOp *RGWHandler_ObjStore_Obj_S3Website::get_obj_op(bool get_data) +{ + /** If we are in website mode, then it is explicitly impossible to run GET or + * HEAD on the actual directory. We must convert the request to run on the + * suffix object instead! + */ + RGWGetObj_ObjStore_S3Website *op = new RGWGetObj_ObjStore_S3Website; + op->set_get_data(get_data); + return op; +} + +RGWOp *RGWHandler_ObjStore_Bucket_S3Website::get_obj_op(bool get_data) +{ + /** If we are in website mode, then it is explicitly impossible to run GET or + * HEAD on the actual directory. We must convert the request to run on the + * suffix object instead! + */ + RGWGetObj_ObjStore_S3Website *op = new RGWGetObj_ObjStore_S3Website; + op->set_get_data(get_data); + return op; +} + +RGWOp *RGWHandler_ObjStore_Service_S3Website::get_obj_op(bool get_data) +{ + /** If we are in website mode, then it is explicitly impossible to run GET or + * HEAD on the actual directory. We must convert the request to run on the + * suffix object instead! + */ + RGWGetObj_ObjStore_S3Website *op = new RGWGetObj_ObjStore_S3Website; + op->set_get_data(get_data); + return op; } diff --cc src/tracing/Makefile.am index 30c5b92b3469,5c6a4e243287..1c2349e5d2b6 --- a/src/tracing/Makefile.am +++ b/src/tracing/Makefile.am @@@ -1,80 -1,78 +1,78 @@@ - EXTRA_DIST = tracing-common.h + EXTRA_DIST += \ + tracing/tracing-common.h if WITH_LTTNG - %.c %.h: %.tp - $(LTTNG_GEN_TP_PROG) $< -o $*.c -o $*.h + tracing/%.h: tracing/%.tp + $(LTTNG_GEN_TP_PROG) $< -o tracing/$*.h endif - dist_noinst_DATA = \ - librados.tp \ - librbd.tp \ - oprequest.tp \ - osd.tp \ - pg.tp \ - objectstore.tp + dist_noinst_DATA += \ + tracing/librados.tp \ + tracing/librbd.tp \ + tracing/oprequest.tp \ + tracing/osd.tp \ + tracing/pg.tp \ + tracing/objectstore.tp if WITH_LTTNG + libosd_tp_la_SOURCES = \ + tracing/oprequest.c \ + tracing/osd.c \ + tracing/pg.c nodist_libosd_tp_la_SOURCES = \ - oprequest.c \ - oprequest.h \ - osd.c \ - osd.h \ - pg.h \ - pg.c + tracing/oprequest.h \ + tracing/osd.h \ + tracing/pg.h endif -libosd_tp_la_LIBADD = -llttng-ust -ldl +libosd_tp_la_LIBADD = -ldl -llttng-ust - libosd_tp_la_CPPFLAGS = -DTRACEPOINT_PROBE_DYNAMIC_LINKAGE - libosd_tp_la_CFLAGS = -I$(top_srcdir)/src $(AM_CFLAGS) - libosd_tp_la_LDFLAGS = + libosd_tp_la_CFLAGS = -I$(top_srcdir)/src/tracing -I$(top_srcdir)/src $(AM_CFLAGS) -fpic + libosd_tp_la_LDFLAGS = -version-info 1:0:0 if WITH_LTTNG + librados_tp_la_SOURCES = \ + tracing/librados.c nodist_librados_tp_la_SOURCES = \ - librados.c \ - librados.h + tracing/librados.h endif -librados_tp_la_LIBADD = -llttng-ust -ldl +librados_tp_la_LIBADD = -ldl -llttng-ust - librados_tp_la_CPPFLAGS = -DTRACEPOINT_PROBE_DYNAMIC_LINKAGE - librados_tp_la_CFLAGS = -I$(top_srcdir)/src $(AM_CFLAGS) - librados_tp_la_LDFLAGS = + librados_tp_la_CFLAGS = -I$(top_srcdir)/src/tracing -I$(top_srcdir)/src $(AM_CFLAGS) -fpic + librados_tp_la_LDFLAGS = -version-info 2:0:0 if WITH_LTTNG + librbd_tp_la_SOURCES = \ + tracing/librbd.c nodist_librbd_tp_la_SOURCES = \ - librbd.c \ - librbd.h + tracing/librbd.h endif -librbd_tp_la_LIBADD = -llttng-ust -ldl +librbd_tp_la_LIBADD = -ldl -llttng-ust - librbd_tp_la_CPPFLAGS = -DTRACEPOINT_PROBE_DYNAMIC_LINKAGE - librbd_tp_la_CFLAGS = -I$(top_srcdir)/src $(AM_CFLAGS) - librbd_tp_la_LDFLAGS = + librbd_tp_la_CFLAGS = -I$(top_srcdir)/src/tracing -I$(top_srcdir)/src $(AM_CFLAGS) -fpic + librbd_tp_la_LDFLAGS = -version-info 1:0:0 if WITH_LTTNG + libos_tp_la_SOURCES = \ + tracing/objectstore.c nodist_libos_tp_la_SOURCES = \ - objectstore.c \ - objectstore.h + tracing/objectstore.h endif -libos_tp_la_LIBADD = -llttng-ust -ldl +libos_tp_la_LIBADD = -ldl -llttng-ust - libos_tp_la_CPPFLAGS = -DTRACEPOINT_PROBE_DYNAMIC_LINKAGE - libos_tp_la_CFLAGS = -I$(top_srcdir)/src $(AM_CFLAGS) - libos_tp_la_LDFLAGS = + libos_tp_la_CFLAGS = -I$(top_srcdir)/src/tracing -I$(top_srcdir)/src $(AM_CFLAGS) -fpic + libos_tp_la_LDFLAGS = -version-info 1:0:0 if WITH_LTTNG - noinst_LTLIBRARIES = \ + lib_LTLIBRARIES += \ libosd_tp.la \ + libos_tp.la \ librados_tp.la \ - librbd_tp.la \ - libos_tp.la + librbd_tp.la - BUILT_SOURCES = \ - librados.h \ - librbd.h \ - oprequest.h \ - osd.h \ - pg.h \ - objectstore.h + BUILT_SOURCES += \ + tracing/librados.h \ + tracing/librbd.h \ + tracing/objectstore.h \ + tracing/oprequest.h \ + tracing/osd.h \ + tracing/pg.h endif - CLEANFILES = \ - $(nodist_libosd_tp_la_SOURCES) \ - $(nodist_librados_tp_la_SOURCES) \ - $(nodist_librbd_tp_la_SOURCES) \ - $(nodist_libos_tp_la_SOURCES) + CLEANFILES += \ + $(BUILT_SOURCES)