-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph - scalable distributed file system
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
+ * License version 2.1, as published by the Free Software
* Foundation. See file COPYING.
- *
+ *
*/
#ifndef CEPH_OBJECTSTORE_H
#define CEPH_OBJECTSTORE_H
public:
/**
- * create - create an ObjectStore instance
+ * create - create an ObjectStore instance.
+ *
+ * This is invoked once at initialization time.
*
- * @param type type of store
+ * @param type type of store. This is a string from the configuration file.
* @param data path (or other descriptor) for data
* @param journal path (or other descriptor) for journal (optional)
*/
Logger *logger;
+ /**
+ * Fetch Object Store statistics.
+ *
+ * Currently only write and apply latencies are measured.
+ *
+ * This appears to be called with nothing locked.
+ */
virtual objectstore_perf_stat_t get_cur_stats() = 0;
/**
* Any transactions queued under a given sequencer will be applied in
* sequence. Transactions queued under different sequencers may run
* in parallel.
+ *
+ * Clients of ObjectStore create and maintain their own Sequencer objects.
+ * When a list of transactions is queued the caller specifies a Sequencer to be used.
+ *
+ */
+
+ /**
+ * Abstract base class (ABC) for the Sequencer implementation, private to
+ * the ObjectStore derived class. Created in ...::queue_transaction(s).
*/
struct Sequencer_impl {
// Pure virtual: the ObjectStore derived class supplies the actual flush.
// NOTE(review): what "flush" must guarantee is defined by the backend, not here.
virtual void flush() = 0;
// Virtual destructor so an implementation can be destroyed through this base type.
virtual ~Sequencer_impl() {}
};
+
+ /**
+ * External (opaque) sequencer implementation
+ */
struct Sequencer {
string name;
Sequencer_impl *p;
p->flush();
}
};
-
+
+ /*********************************
+ *
+ * Object Contents and semantics
+ *
+ * All ObjectStore objects are identified as a named object
+ * (ghobject_t and hobject_t) in a named collection (coll_t).
+ * ObjectStore operations support the creation, mutation, deletion
+ * and enumeration of objects within a collection. Enumeration is
+ * in sorted key order (where keys are sorted by hash). Object names
+ * are globally unique.
+ *
+ * Each object has four distinct parts: byte data, xattrs, omap_header
+ * and omap entries.
+ *
+ * The data portion of an object is conceptually equivalent to a
+ * file in a file system. Random and partial access for both read
+ * and write operations is required. The ability to have a sparse
+ * implementation of the data portion of an object is beneficial for
+ * some workloads, but not required. There is a system-wide limit on
+ * the maximum size of an object, which is typically around 100 MB.
+ *
+ * Xattrs are equivalent to the extended attributes of file
+ * systems. Xattrs are a set of key/value pairs. Sub-value access
+ * is not required. It is possible to enumerate the set of xattrs in
+ * key order. At the implementation level, xattrs are used
+ * exclusively internal to Ceph and the implementer can expect the
+ * total size of all of the xattrs on an object to be relatively
+ * small, i.e., less than 64KB. Much of Ceph assumes that accessing
+ * xattrs on temporally adjacent object accesses (recent past or
+ * near future) is inexpensive.
+ *
+ * omap_header is a single blob of data. It can be read or written
+ * in total.
+ *
+ * Omap entries are conceptually the same as xattrs
+ * but in a different address space. In other words, you can have
+ * the same key as an xattr and an omap entry and they have distinct
+ * values. Enumeration of xattrs doesn't include omap entries and
+ * vice versa. The size and access characteristics of omap entries
+ * are very different from xattrs. In particular, the value portion
+ * of an omap entry can be quite large (MBs). More importantly, the
+ * interface must support efficient range queries on omap entries even
+ * when there is a large number of entries.
+ *
+ *********************************/
+
+ /*******************************
+ *
+ * Collections
+ *
+ * A collection is simply a grouping of objects. Collections have
+ * names (coll_t) and can be enumerated in order. Like an
+ * individual object, a collection also has a set of xattrs.
+ *
+ *
+ */
+
/*********************************
* transaction
+ *
+ * A Transaction represents a sequence of primitive mutation
+ * operations.
+ *
+ * Three events in the life of a Transaction result in
+ * callbacks. Any Transaction can contain any number of callback
+ * objects (Context) for any combination of the three classes of
+ * callbacks:
+ *
+ * on_applied_sync, on_applied, and on_commit.
+ *
+ * The "on_applied" and "on_applied_sync" callbacks are invoked when
+ * the modifications requested by the Transaction are visible to
+ * subsequent ObjectStore operations, i.e., the results are
+ * readable. The only conceptual difference between on_applied and
+ * on_applied_sync is the specific thread and locking environment in
+ * which the callbacks operate. "on_applied_sync" is called
+ * directly by an ObjectStore execution thread. It is expected to
+ * execute quickly and must not acquire any locks of the calling
+ * environment. Conversely, "on_applied" is called from the separate
+ * Finisher thread, meaning that it can contend for calling
+ * environment locks. NB, on_applied and on_applied_sync are
+ * sometimes called on_readable and on_readable_sync.
+ *
+ * The "on_commit" callback is also called from the Finisher thread
+ * and indicates that all of the mutations have been durably
+ * committed to stable storage (i.e., are now software/hardware
+ * crashproof).
+ *
+ * At the implementation level, each mutation primitive (and its
+ * associated data) can be serialized to a single buffer. That
+ * serialization, however, does not copy any data, but (using the
+ * bufferlist library) will reference the original buffers. This
+ * implies that the buffer that contains the data being submitted
+ * must remain stable until the on_commit callback completes. In
+ * practice, bufferlist handles all of this for you and this
+ * subtlety is only relevant if you are referencing an existing
+ * buffer via buffer::raw_static.
+ *
+ * Some implementations of ObjectStore choose to implement their own
+ * form of journaling that uses the serialized form of a
+ * Transaction. This requires that the encode/decode logic properly
+ * version itself and handle version upgrades that might change the
+ * format of the encoded Transaction. This has already happened a
+ * couple of times and the Transaction object contains some helper
+ * variables that aid in this legacy decoding:
+ *
+ * sobject_encoding detects an older/simpler version of oid
+ * present in pre-bobtail versions of ceph. use_pool_override
+ * also detects a situation where the pool of an oid can be
+ * overridden for legacy operations/buffers. For non-legacy
+ * implementations of ObjectStore, neither of these fields is
+ * relevant.
+ *
+ *
+ * TRANSACTION ISOLATION
+ *
+ * Except as noted below, isolation is the responsibility of the
+ * caller. In other words, if any storage element (storage element
+ * == any of the four portions of an object as described above) is
+ * altered by a transaction (including deletion), the caller
+ * promises not to attempt to read that element while the
+ * transaction is pending (here pending means from the time of
+ * issuance until the "on_applied_sync" callback has been
+ * received). Violations of isolation need not be detected by
+ * ObjectStore and there is no corresponding error mechanism for
+ * reporting an isolation violation (crashing would be the
+ * appropriate way to report an isolation violation if detected).
+ *
+ * Enumeration operations may violate transaction isolation as
+ * described above when a storage element is being created or
+ * deleted as part of a transaction. In this case, ObjectStore is
+ * allowed to consider the enumeration operation to either precede
+ * or follow the violating transaction element. In other words, the
+ * presence/absence of the mutated element in the enumeration is
+ * entirely at the discretion of ObjectStore. The arbitrary ordering
+ * applies independently to each transaction element. For example,
+ * if a transaction contains two mutating elements, "create A" and
+ * "delete B", and an enumeration operation is performed while this
+ * transaction is pending, it is permissible for ObjectStore to
+ * report any of the four possible combinations of the existence of
+ * A and B.
+ *
*/
class Transaction {
public:
OP_COLL_SETATTRS = 26, // cid, attrset
OP_COLL_MOVE = 8, // newcid, oldcid, oid
- OP_STARTSYNC = 27, // start a sync
+ OP_STARTSYNC = 27, // start a sync
OP_RMATTRS = 28, // cid, oid
OP_COLL_RENAME = 29, // cid, newcid
void set_tolerate_collection_add_enoent() {
// Raises the Transaction-level flag; a constructor initializer in this file sets it to false.
// NOTE(review): presumably tells the store to ignore ENOENT from collection_add — confirm at the use site.
tolerate_collection_add_enoent = true;
}
+
+ /* Operations on callback contexts */
void register_on_applied(Context *c) {
if (!c) return;
on_applied.push_back(c);
return C_Contexts::list_to_context(on_applied_sync);
}
+ /// For legacy transactions, provide the pool to override the encoded pool with
void set_pool_override(int64_t pool) {
// Only records the value; nothing in this setter applies it.
pool_override = pool;
}
tbl.swap(other.tbl);
}
+ /// Append the operations of the parameter to this Transaction. Those operations are removed from the parameter Transaction.
void append(Transaction& other) {
ops += other.ops;
assert(pad_unused_bytes == 0);
on_applied_sync.splice(on_applied_sync.end(), other.on_applied_sync);
}
+ /** Inquiries about the Transaction as a whole. */
+
+ /// How big is the encoded Transaction buffer?
uint64_t get_encoded_bytes() {
  // Constant part of the encoding, with the individual terms kept spelled out.
  // NOTE(review): presumably the sizes of the fixed header fields written by
  // encode() — confirm there.
  const int fixed_overhead = 1 + 8 + 8 + 4 + 4 + 4 + 4;
  // Variable part: the bytes accumulated so far in the operation buffer.
  return fixed_overhead + tbl.length();
}
uint64_t get_num_bytes() {
  // The byte count is defined as the encoded size; this is an alias for it.
  const uint64_t encoded_size = get_encoded_bytes();
  return encoded_size;
}
-
+ /// Size of largest data buffer to the "write" operation encountered so far
uint32_t get_data_length() {
// largest_data_len is maintained by write(), which raises it whenever a longer buffer is queued.
return largest_data_len;
}
+ /// offset within the encoded buffer to the start of the first data buffer that's encoded
uint32_t get_data_offset() {
if (largest_data_off_in_tbl) {
return largest_data_off_in_tbl +
}
return 0; // none
}
+ /// offset of buffer as aligned to destination within object.
int get_data_alignment() {
  // No data buffer has been recorded yet: signal that with -1.
  if (largest_data_len == 0) {
    return -1;
  }
  // Difference between the largest write's offset in its object and the
  // position of its payload in the encoded buffer, masked with ~CEPH_PAGE_MASK.
  // NOTE(review): presumably this keeps only the in-page bits — confirm the
  // definition of CEPH_PAGE_MASK.
  return (largest_data_off - get_data_offset()) & ~CEPH_PAGE_MASK;
}
-
+ /// Is the Transaction empty (no operations)
bool empty() {
  // A Transaction with a zero operation count has nothing to apply.
  return ops == 0;
}
-
+ /// Number of operations in the transaction
int get_num_ops() {
// ops is incremented by the encoder helpers (e.g. start_sync(), nop()) as operations are added.
return ops;
}
- // ---- iterator ----
+ /**
+ * iterator
+ *
+ * Helper object to parse Transactions.
+ *
+ * ObjectStore instances use this object to step down the encoded
+ * buffer decoding operation codes and parameters as we go.
+ *
+ */
class iterator {
bufferlist::iterator p;
bool sobject_encoding;
bool tolerate_collection_add_enoent() const {
// Read-only accessor for the iterator's _tolerate_collection_add_enoent member.
return _tolerate_collection_add_enoent;
}
+ /// true if there are more operations left to be enumerated
bool have_op() {
  // More operations remain exactly when the buffer cursor is not at its end.
  const bool at_end = p.end();
  return !at_end;
}
+
+ /* Decode the specified type of object from the input
+ * stream. There is no checking that the encoded data is of the
+ * correct type.
+ */
int get_op() {
__u32 op;
::decode(op, p);
void get_bl(bufferlist& bl) {
// Decodes the next item at cursor p into bl; there is no check that the
// encoded item really is a bufferlist.
::decode(bl, p);
}
+ /// Get an oid, recognize various legacy forms and update them.
ghobject_t get_oid() {
ghobject_t oid;
if (sobject_encoding) {
iterator begin() {
  // Build an iterator over this Transaction and hand it back by value.
  iterator first(this);
  return first;
}
- // -----------------------------
+ /**
+ * Helper functions to encode the various mutation elements of a
+ * transaction. These are 1:1 with the operation codes (see
+ * enumeration above). These routines ensure that the
+ * encoder/creator of a transaction gets the right data in the
+ * right place. Sadly, there's no corresponding version nor any
+ * form of seat belts for the decoder.
+ */
+
+ /// Commence a global file system sync operation.
void start_sync() {
__u32 op = OP_STARTSYNC;
::encode(op, tbl);
ops++;
}
+ /// noop. 'nuf said
void nop() {
__u32 op = OP_NOP;
::encode(op, tbl);
ops++;
}
+ /**
+ * touch
+ *
+ * Ensure the existence of an object in a collection. Create an
+ * empty object if necessary.
+ */
void touch(coll_t cid, const ghobject_t& oid) {
__u32 op = OP_TOUCH;
::encode(op, tbl);
::encode(oid, tbl);
ops++;
}
- void write(coll_t cid, const ghobject_t& oid, uint64_t off, uint64_t len, const bufferlist& data) {
+ /**
+ * Write data to an offset within an object. If the object is too
+ * small, it is expanded as needed. It is possible to specify an
+ * offset beyond the current end of an object and it will be
+ * expanded as needed. Simple implementations of ObjectStore will
+ * just zero the data between the old end of the object and the
+ * newly provided data. More sophisticated implementations of
+ * ObjectStore will omit the untouched data and store it as a
+ * "hole" in the file.
+ */
+ void write(coll_t cid, const ghobject_t& oid, uint64_t off, uint64_t len,
+ const bufferlist& data) {
__u32 op = OP_WRITE;
::encode(op, tbl);
::encode(cid, tbl);
if (data.length() > largest_data_len) {
largest_data_len = data.length();
largest_data_off = off;
- largest_data_off_in_tbl = tbl.length() + sizeof(__u32); // we are about to
+ largest_data_off_in_tbl = tbl.length() + sizeof(__u32); // we are about to
}
::encode(data, tbl);
ops++;
}
+ /**
+ * zero out the indicated byte range within an object. Some
+ * ObjectStore instances may optimize this to release the
+ * underlying storage space.
+ */
void zero(coll_t cid, const ghobject_t& oid, uint64_t off, uint64_t len) {
__u32 op = OP_ZERO;
::encode(op, tbl);
::encode(len, tbl);
ops++;
}
+ /// Discard all data in the object beyond the specified size.
void truncate(coll_t cid, const ghobject_t& oid, uint64_t off) {
__u32 op = OP_TRUNCATE;
::encode(op, tbl);
::encode(off, tbl);
ops++;
}
+ /// Remove an object. All four parts of the object are removed.
void remove(coll_t cid, const ghobject_t& oid) {
__u32 op = OP_REMOVE;
::encode(op, tbl);
::encode(oid, tbl);
ops++;
}
+ /// Set an xattr of an object
void setattr(coll_t cid, const ghobject_t& oid, const char* name, bufferlist& val) {
  // Convenience overload: wrap the C string and defer to the string-keyed version.
  const string attr_name(name);
  setattr(cid, oid, attr_name, val);
}
+ /// Set an xattr of an object
void setattr(coll_t cid, const ghobject_t& oid, const string& s, bufferlist& val) {
__u32 op = OP_SETATTR;
::encode(op, tbl);
::encode(val, tbl);
ops++;
}
+ /// Set multiple xattrs of an object
void setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& attrset) {
__u32 op = OP_SETATTRS;
::encode(op, tbl);
::encode(attrset, tbl);
ops++;
}
- void setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferlist>& attrset) {
+ /// Set multiple xattrs of an object
+ void setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferlist>& attrset) {
// Takes ghobject_t, like the bufferptr overload and every other object-level
// helper here; the hobject_t spelling was a signature change in a docs-only edit.
__u32 op = OP_SETATTRS;
::encode(op, tbl);
::encode(cid, tbl);
::encode(attrset, tbl);
ops++;
}
+ /// remove an xattr from an object
void rmattr(coll_t cid, const ghobject_t& oid, const char *name) {
  // Convenience overload: wrap the C string and defer to the string-keyed version.
  const string attr_name(name);
  rmattr(cid, oid, attr_name);
}
+ /// remove an xattr from an object
void rmattr(coll_t cid, const ghobject_t& oid, const string& s) {
__u32 op = OP_RMATTR;
::encode(op, tbl);
::encode(s, tbl);
ops++;
}
+ /// remove all xattrs from an object
void rmattrs(coll_t cid, const ghobject_t& oid) {
__u32 op = OP_RMATTRS;
::encode(op, tbl);
::encode(oid, tbl);
ops++;
}
+ /**
+ * Clone an object into another object.
+ *
+ * Low-cost (e.g., O(1)) cloning (if supported) is best, but
+ * fallback to an O(n) copy is allowed. All four parts of the
+ * object are cloned (data, xattrs, omap header, omap
+ * entries).
+ *
+ * The destination named object may already exist in
+ * which case its previous contents are discarded.
+ */
void clone(coll_t cid, const ghobject_t& oid, ghobject_t noid) {
__u32 op = OP_CLONE;
::encode(op, tbl);
::encode(noid, tbl);
ops++;
}
+ /**
+ * Clone a byte range from one object to another.
+ *
+ * The data portion of the destination object receives a copy of a
+ * portion of the data from the source object. None of the other
+ * three parts of an object is copied from the source.
+ */
void clone_range(coll_t cid, const ghobject_t& oid, ghobject_t noid,
uint64_t srcoff, uint64_t srclen, uint64_t dstoff) {
__u32 op = OP_CLONERANGE2;
::encode(dstoff, tbl);
ops++;
}
+ /// Create the collection
void create_collection(coll_t cid) {
  // Encoded as the opcode followed by the collection id.
  __u32 opcode = OP_MKCOLL;
  ::encode(opcode, tbl);
  ::encode(cid, tbl);
  ++ops;
}
+ /// remove the collection, the collection must be empty
void remove_collection(coll_t cid) {
  // Encoded as the opcode followed by the collection id.
  __u32 opcode = OP_RMCOLL;
  ::encode(opcode, tbl);
  ::encode(cid, tbl);
  ++ops;
}
+ /**
+ * Add object to another collection (DEPRECATED)
+ *
+ * The Object is added to the new collection. This is a virtual
+ * add, we now have two names for the same object. This is only
+ * used for conversion of old stores to new stores and is not
+ * needed for new implementations unless they expect to make use
+ * of the conversion infrastructure.
+ */
void collection_add(coll_t cid, coll_t ocid, const ghobject_t& oid) {
__u32 op = OP_COLL_ADD;
::encode(op, tbl);
ops++;
}
+ /// Set an xattr on a collection
void collection_setattr(coll_t cid, const char* name, bufferlist& val) {
  // Convenience overload: wrap the C string and defer to the string-keyed version.
  const string attr_name(name);
  collection_setattr(cid, attr_name, val);
}
+ /// Set an xattr on a collection
void collection_setattr(coll_t cid, const string& name, bufferlist& val) {
__u32 op = OP_COLL_SETATTR;
::encode(op, tbl);
ops++;
}
+ /// Remove an xattr from a collection
void collection_rmattr(coll_t cid, const char* name) {
  // Convenience overload: wrap the C string and defer to the string-keyed version.
  const string attr_name(name);
  collection_rmattr(cid, attr_name);
}
+ /// Remove an xattr from a collection
void collection_rmattr(coll_t cid, const string& name) {
__u32 op = OP_COLL_RMATTR;
::encode(op, tbl);
::encode(name, tbl);
ops++;
}
+ /// Set multiple xattrs on a collection
void collection_setattrs(coll_t cid, map<string,bufferptr>& aset) {
__u32 op = OP_COLL_SETATTRS;
::encode(op, tbl);
::encode(aset, tbl);
ops++;
}
+ /// Set multiple xattrs on a collection
void collection_setattrs(coll_t cid, map<string,bufferlist>& aset) {
__u32 op = OP_COLL_SETATTRS;
::encode(op, tbl);
::encode(aset, tbl);
ops++;
}
+ /// Change the name of a collection
void collection_rename(coll_t cid, coll_t ncid) {
__u32 op = OP_COLL_RENAME;
::encode(op, tbl);
/// Remove key range from oid omap
void omap_rmkeyrange(
coll_t cid, ///< [in] Collection containing oid
- const ghobject_t &oid, ///< [in] Object from which to remove the omap
+ const ghobject_t &oid, ///< [in] Object from which to remove the omap keys
const string& first, ///< [in] first key in range
- const string& last ///< [in] first key past range
+ const string& last ///< [in] first key past range, range is [first,last)
) {
__u32 op = OP_OMAP_RMKEYRANGE;
::encode(op, tbl);
/// Set omap header
void omap_setheader(
coll_t cid, ///< [in] Collection containing oid
- const ghobject_t &oid, ///< [in] Object from which to remove the omap
+ const ghobject_t &oid, ///< [in] Object
const bufferlist &bl ///< [in] Header value
) {
__u32 op = OP_OMAP_SETHEADER;
ops++;
}
- /// Split collection based on given prefixes
+ /// Split collection based on given prefixes; objects matching the specified bits/rem are
+ /// moved to the new collection
void split_collection(
coll_t cid,
uint32_t bits,
replica(false),
tolerate_collection_add_enoent(false) {
bufferlist::iterator dp = nbl.begin();
- decode(dp);
+ decode(dp);
}
void encode(bufferlist& bl) const {
list<Transaction *> tls;
tls.push_back(t);
return queue_transactions(osr, tls, new C_DeleteTransaction(t),
- NULL, NULL, TrackedOpRef(), handle);
+ NULL, NULL, TrackedOpRef(), handle);
}
int queue_transaction(Sequencer *osr, Transaction *t, Context *onreadable, Context *ondisk=0,
list<Transaction*> tls;
tls.push_back(t);
return queue_transactions(osr, tls, onreadable, ondisk, onreadable_sync,
- op, handle);
+ op, handle);
}
int queue_transactions(Sequencer *osr, list<Transaction*>& tls,
*/
virtual int get_ideal_list_max() {
  // Value returned unless a derived store overrides this virtual.
  const int default_list_max = 64;
  return default_list_max;
}
- // objects
+ /**
+ * Synchronous read operations
+ */
+
+
+ /**
+ * exists -- Test for existence of object
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @returns true if object exists, false otherwise
+ */
virtual bool exists(coll_t cid, const ghobject_t& oid) = 0; // useful?
+
+ /**
+ * stat -- get information for an object
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @param st output information for the object
+ * @param allow_eio if false, assert on -EIO operation failure
+ * @returns 0 on success, negative error code on failure.
+ */
virtual int stat(
coll_t cid,
const ghobject_t& oid,
struct stat *st,
bool allow_eio = false) = 0; // struct stat?
- virtual int read(
+ /**
+ * read -- read a byte range of data from an object
+ *
+ * Note: if reading from an offset past the end of the object, we
+ * return 0 (not, say, -EINVAL).
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @param offset location offset of first byte to be read
+ * @param len number of bytes to be read
+ * @param bl output bufferlist
+ * @param allow_eio if false, assert on -EIO operation failure
+ * @returns number of bytes read on success, or negative error code on failure.
+ */
+ virtual int read(
coll_t cid,
const ghobject_t& oid,
uint64_t offset,
bufferlist& bl,
bool allow_eio = false) = 0;
+ /**
+ * fiemap -- get extent map of data of an object
+ *
+ * Returns an encoded map of the extents of an object's data portion
+ * (map<offset,size>).
+ *
+ * A non-enlightened implementation is free to return the extent (offset, len)
+ * as the sole extent.
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @param offset location offset of first byte to be mapped
+ * @param len number of bytes to be mapped
+ * @param bl output bufferlist for extent map information.
+ * @returns 0 on success, negative error code on failure.
+ */
virtual int fiemap(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl) = 0;
+ /**
+ * getattr -- get an xattr of an object
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @param name name of attr to read
+ * @param value place to put output result.
+ * @returns 0 on success, negative error code on failure.
+ */
virtual int getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferptr& value) = 0;
+
+ /**
+ * getattr -- get an xattr of an object
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @param name name of attr to read
+ * @param value place to put output result.
+ * @returns 0 on success, negative error code on failure.
+ */
int getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferlist& value) {
  // Fetch through the bufferptr-based overload, then append to the caller's list.
  bufferptr fetched;
  const int rc = getattr(cid, oid, name, fetched);
  // The pointer is appended whatever rc is; the result code is passed through untouched.
  value.push_back(fetched);
  return rc;
}
- int getattr(
- coll_t cid, const ghobject_t& oid,
- const string name, bufferlist& value) {
- bufferptr bp;
- int r = getattr(cid, oid, name.c_str(), bp);
- value.push_back(bp);
- return r;
- }
+ /**
+ * getattrs -- get all of the xattrs of an object
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @param aset place to put output result.
+ * @param user_only true -> only user attributes are returned; otherwise all attributes are returned
+ * @returns 0 on success, negative error code on failure.
+ */
virtual int getattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset, bool user_only = false) = 0;
+
+ /**
+ * getattrs -- get all of the xattrs of an object
+ *
+ * @param cid collection for object
+ * @param oid oid of object
+ * @param aset place to put output result.
+ * @param user_only true -> only user attributes are returned; otherwise all attributes are returned
+ * @returns 0 on success, negative error code on failure.
+ */
int getattrs(coll_t cid, const ghobject_t& oid, map<string,bufferlist>& aset, bool user_only = false) {
map<string,bufferptr> bmap;
int r = getattrs(cid, oid, bmap, user_only);
return r;
}
-
+
// collections
+
+ /**
+ * list_collections -- get all of the collections known to this ObjectStore
+ *
+ * @param ls list of the collections in sorted order.
+ * @returns 0 on success, negative error code on failure.
+ */
virtual int list_collections(vector<coll_t>& ls) = 0;
- virtual int collection_version_current(coll_t c, uint32_t *version) {
+
+ virtual int collection_version_current(coll_t c, uint32_t *version) {
// Base-class default: ignores c, reports version 0 through the out-parameter and returns 1.
// NOTE(review): the meaning of the return value 1 is not documented here — confirm with callers.
*version = 0;
return 1;
}
+ /**
+ * does a collection exist?
+ *
+ * @param c collection
+ * @returns true if it exists, false otherwise
+ */
virtual bool collection_exists(coll_t c) = 0;
+ /**
+ * collection_getattr - get an xattr of a collection
+ *
+ * @param cid collection name
+ * @param name xattr name
+ * @param value pointer to buffer to receive value
+ * @param size size of buffer to receive value
+ * @returns 0 on success, negative error code on failure
+ */
virtual int collection_getattr(coll_t cid, const char *name,
- void *value, size_t size) = 0;
+ void *value, size_t size) = 0;
+ /**
+ * collection_getattr - get an xattr of a collection
+ *
+ * @param cid collection name
+ * @param name xattr name
+ * @param bl buffer to receive value
+ * @returns 0 on success, negative error code on failure
+ */
virtual int collection_getattr(coll_t cid, const char *name, bufferlist& bl) = 0;
+ /**
+ * collection_getattrs - get all xattrs of a collection
+ *
+ * @param cid collection name
+ * @param aset map of keys and buffers that contain the values
+ * @returns 0 on success, negative error code on failure
+ */
virtual int collection_getattrs(coll_t cid, map<string,bufferptr> &aset) = 0;
+ /**
+ * is a collection empty?
+ *
+ * @param c collection
+ * @returns true if empty, false otherwise
+ */
virtual bool collection_empty(coll_t c) = 0;
+
+ /**
+ * collection_list - get all objects of a collection in sorted order
+ *
+ * @param c collection name
+ * @param o [out] list of objects
+ * @returns 0 on success, negative error code on failure
+ */
virtual int collection_list(coll_t c, vector<ghobject_t>& o) = 0;
/**
* @return zero on success, or negative error
*/
virtual int collection_list_partial(coll_t c, ghobject_t start,
- int min, int max, snapid_t snap,
+ int min, int max, snapid_t snap,
vector<ghobject_t> *ls, ghobject_t *next) = 0;
/**
* @return zero on success, or negative error
*/
virtual int collection_list_range(coll_t c, ghobject_t start, ghobject_t end,
- snapid_t seq, vector<ghobject_t> *ls) = 0;
+ snapid_t seq, vector<ghobject_t> *ls) = 0;
//TODO: Remove
int collection_list(coll_t c, vector<hobject_t>& o);
vector<hobject_t> *ls, hobject_t *next);
int collection_list_range(coll_t c, hobject_t start, hobject_t end,
- snapid_t seq, vector<hobject_t> *ls);
+ snapid_t seq, vector<hobject_t> *ls);
/// OMAP
/// Get omap contents
virtual int dump_journal(ostream& out) { return -EOPNOTSUPP; }
virtual int snapshot(const string& name) { return -EOPNOTSUPP; }
-
+
+ /**
+ * Set and get internal fsid for this instance. No external data is modified
+ */
virtual void set_fsid(uuid_d u) = 0;
virtual uuid_d get_fsid() = 0;