From: Allen Samuels Date: Sun, 23 Feb 2014 15:29:05 +0000 (-0800) Subject: os/ObjectStore: document interface X-Git-Tag: v0.79~200^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F1290%2Fhead;p=ceph.git os/ObjectStore: document interface Signed-off-by: Sage Weil Reviewed-by: Haomai Wang --- diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h index 1b75ecb26501..c387a984f50c 100644 --- a/src/os/ObjectStore.h +++ b/src/os/ObjectStore.h @@ -1,4 +1,4 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab /* * Ceph - scalable distributed file system @@ -7,9 +7,9 @@ * * This is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software + * License version 2.1, as published by the Free Software * Foundation. See file COPYING. - * + * */ #ifndef CEPH_OBJECTSTORE_H #define CEPH_OBJECTSTORE_H @@ -58,9 +58,11 @@ protected: public: /** - * create - create an ObjectStore instance + * create - create an ObjectStore instance. + * + * This is invoked once at initialization time. * - * @param type type of store + * @param type type of store. This is a string from the configuration file. * @param data path (or other descriptor) for data * @param journal path (or other descriptor) for journal (optional) */ @@ -71,6 +73,13 @@ public: Logger *logger; + /** + * Fetch Object Store statistics. + * + * Currently only latency of write and apply times are measured. + * + * This appears to be called with nothing locked. + */ virtual objectstore_perf_stat_t get_cur_stats() = 0; /** @@ -79,11 +88,24 @@ public: * Any transactions queued under a given sequencer will be applied in * sequence. Transactions queued under different sequencers may run * in parallel. 
+ * + * Clients of ObjectStore create and maintain their own Sequencer objects. + * When a list of transactions is queued the caller specifies a Sequencer to be used. + * + */ + + /** + * ABC for Sequencer implementation, private to the ObjectStore derived class. + * created in ...::queue_transaction(s) + */ struct Sequencer_impl { virtual void flush() = 0; virtual ~Sequencer_impl() {} }; + + /** + * External (opaque) sequencer implementation + */ struct Sequencer { string name; Sequencer_impl *p; @@ -104,10 +126,150 @@ public: p->flush(); } }; - + + /********************************* + * + * Object Contents and semantics + * + * All ObjectStore objects are identified as a named object + * (ghobject_t and hobject_t) in a named collection (coll_t). + * ObjectStore operations support the creation, mutation, deletion + * and enumeration of objects within a collection. Enumeration is + * in sorted key order (where keys are sorted by hash). Object names + * are globally unique. + * + * Each object has four distinct parts: byte data, xattrs, omap_header + * and omap entries. + * + * The data portion of an object is conceptually equivalent to a + * file in a file system. Random and Partial access for both read + * and write operations is required. The ability to have a sparse + * implementation of the data portion of an object is beneficial for + * some workloads, but not required. There is a system-wide limit on + * the maximum size of an object, which is typically around 100 MB. + * + * Xattrs are equivalent to the extended attributes of file + * systems. Xattrs are a set of key/value pairs. Sub-value access + * is not required. It is possible to enumerate the set of xattrs in + * key order. At the implementation level, xattrs are used + * exclusively internal to Ceph and the implementer can expect the + * total size of all of the xattrs on an object to be relatively + * small, i.e., less than 64KB. 
Much of Ceph assumes that accessing + * xattrs on temporally adjacent object accesses (recent past or + * near future) is inexpensive. + * + * omap_header is a single blob of data. It can be read or written + * in total. + * + * Omap entries are conceptually the same as xattrs + * but in a different address space. In other words, you can have + * the same key as an xattr and an omap entry and they have distinct + * values. Enumeration of xattrs doesn't include omap entries and + * vice versa. The size and access characteristics of omap entries + * are very different from xattrs. In particular, the value portion + * of an omap entry can be quite large (MBs). More importantly, the + * interface must support efficient range queries on omap entries even + * when there are a large numbers of entries. + * + *********************************/ + + /******************************* + * + * Collections + * + * A collection is simply a grouping of objects. Collections have + * names (coll_t) and can be enumerated in order. Like an + * individual object, a collection also has a set of xattrs. + * + * + */ + /********************************* * transaction + * + * A Transaction represents a sequence of primitive mutation + * operations. + * + * Three events in the life of a Transaction result in + * callbacks. Any Transaction can contain any number of callback + * objects (Context) for any combination of the three classes of + * callbacks: + * + * on_applied_sync, on_applied, and on_commit. + * + * The "on_applied" and "on_applied_sync" callbacks are invoked when + * the modifications requested by the Transaction are visible to + * subsequent ObjectStore operations, i.e., the results are + * readable. The only conceptual difference between on_applied and + * on_applied_sync is the specific thread and locking environment in + * which the callbacks operate. "on_applied_sync" is called + * directly by an ObjectStore execution thread. 
It is expected to + * execute quickly and must not acquire any locks of the calling + * environment. Conversely, "on_applied" is called from the separate + * Finisher thread, meaning that it can contend for calling + * environment locks. NB, on_applied and on_applied sync are + * sometimes called on_readable and on_readable_sync. + * + * The "on_commit" callback is also called from the Finisher thread + * and indicates that all of the mutations have been durably + * committed to stable storage (i.e., are now software/hardware + * crashproof). + * + * At the implementation level, each mutation primitive (and its + * associated data) can be serialized to a single buffer. That + * serialization, however, does not copy any data, but (using the + * bufferlist library) will reference the original buffers. This + * implies that the buffer that contains the data being submitted + * must remain stable until the on_commit callback completes. In + * practice, bufferlist handles all of this for you and this + * subtlety is only relevant if you are referencing an existing + * buffer via buffer::raw_static. + * + * Some implementations of ObjectStore choose to implement their own + * form of journaling that uses the serialized form of a + * Transaction. This requires that the encode/decode logic properly + * version itself and handle version upgrades that might change the + * format of the encoded Transaction. This has already happened a + * couple of times and the Transaction object contains some helper + * variables that aid in this legacy decoding: + * + * sobject_encoding detects an older/simpler version of oid + * present in pre-bobtail versions of ceph. use_pool_override + * also detects a situation where the pool of an oid can be + * override for legacy operations/buffers. For non-legacy + * implementation of ObjectStore, neither of these fields is + * relevant. 
+ * + * + * TRANSACTION ISOLATION + * + * Except as noted below, isolation is the responsibility of the + * caller. In other words, if any storage element (storage element + * == any of the four portions of an object as described above) is + * altered by a transaction (including deletion), the caller + * promises not to attempt to read that element while the + * transaction is pending (here pending means from the time of + * issuance until the "on_applied_sync" callback has been + * received). Violations of isolation need not be detected by + * ObjectStore and there is no corresponding error mechanism for + * reporting an isolation violation (crashing would be the + * appropriate way to report an isolation violation if detected). + * + * Enumeration operations may violate transaction isolation as + * described above when a storage element is being created or + * deleted as part of a transaction. In this case, ObjectStore is + * allowed to consider the enumeration operation to either precede + * or follow the violating transaction element. In other words, the + * presence/absence of the mutated element in the enumeration is + * entirely at the discretion of ObjectStore. The arbitrary ordering + * applies independently to each transaction element. For example, + * if a transaction contains two mutating elements "create A" and + * "delete B", and an enumeration operation is performed while this + * transaction is pending, it is permissible for ObjectStore to + * report any of the four possible combinations of the existence of + * A and B. 
+ * */ class Transaction { public: @@ -136,7 +298,7 @@ public: OP_COLL_SETATTRS = 26, // cid, attrset OP_COLL_MOVE = 8, // newcid, oldcid, oid - OP_STARTSYNC = 27, // start a sync + OP_STARTSYNC = 27, // start a sync OP_RMATTRS = 28, // cid, oid OP_COLL_RENAME = 29, // cid, newcid @@ -171,6 +333,8 @@ public: void set_tolerate_collection_add_enoent() { tolerate_collection_add_enoent = true; } + + /* Operations on callback contexts */ void register_on_applied(Context *c) { if (!c) return; on_applied.push_back(c); @@ -221,6 +385,7 @@ public: return C_Contexts::list_to_context(on_applied_sync); } + /// For legacy transactions, provide the pool to override the encoded pool with void set_pool_override(int64_t pool) { pool_override = pool; } @@ -240,6 +405,7 @@ public: tbl.swap(other.tbl); } + /// Append the operations of the parameter to this Transaction. Those operations are removed from the parameter Transaction void append(Transaction& other) { ops += other.ops; assert(pad_unused_bytes == 0); @@ -255,6 +421,9 @@ public: on_applied_sync.splice(on_applied_sync.end(), other.on_applied_sync); } + /** Inquires about the Transaction as a whole. */ + + /// How big is the encoded Transaction buffer? uint64_t get_encoded_bytes() { return 1 + 8 + 8 + 4 + 4 + 4 + 4 + tbl.length(); } @@ -262,10 +431,11 @@ public: uint64_t get_num_bytes() { return get_encoded_bytes(); } - + /// Size of largest data buffer to the "write" operation encountered so far uint32_t get_data_length() { return largest_data_len; } + /// offset within the encoded buffer to the start of the first data buffer that's encoded uint32_t get_data_offset() { if (largest_data_off_in_tbl) { return largest_data_off_in_tbl + @@ -281,21 +451,30 @@ public: } return 0; // none } + /// offset of buffer as aligned to destination within object. 
int get_data_alignment() { if (!largest_data_len) return -1; return (largest_data_off - get_data_offset()) & ~CEPH_PAGE_MASK; } - + /// Is the Transaction empty (no operations) bool empty() { return !ops; } - + /// Number of operations in the transation int get_num_ops() { return ops; } - // ---- iterator ---- + /** + * iterator + * + * Helper object to parse Transactions. + * + * ObjectStore instances use this object to step down the encoded + * buffer decoding operation codes and parameters as we go. + * + */ class iterator { bufferlist::iterator p; bool sobject_encoding; @@ -319,9 +498,15 @@ public: bool tolerate_collection_add_enoent() const { return _tolerate_collection_add_enoent; } + /// true if there are more operations left to be enumerated bool have_op() { return !p.end(); } + + /* Decode the specified type of object from the input + * stream. There is no checking that the encoded data is of the + * correct type. + */ int get_op() { __u32 op; ::decode(op, p); @@ -330,6 +515,7 @@ public: void get_bl(bufferlist& bl) { ::decode(bl, p); } + /// Get an oid, recognize various legacy forms and update them. ghobject_t get_oid() { ghobject_t oid; if (sobject_encoding) { @@ -388,18 +574,34 @@ public: iterator begin() { return iterator(this); } - // ----------------------------- + /** + * Helper functions to encode the various mutation elements of a + * transaction. These are 1:1 with the operation codes (see + * enumeration above). These routines ensure that the + * encoder/creator of a transaction gets the right data in the + * right place. Sadly, there's no corresponding version nor any + * form of seat belts for the decoder. + */ + + /// Commence a global file system sync operation. void start_sync() { __u32 op = OP_STARTSYNC; ::encode(op, tbl); ops++; } + /// noop. 'nuf said void nop() { __u32 op = OP_NOP; ::encode(op, tbl); ops++; } + /** + * touch + * + * Ensure the existance of an object in a collection. 
Create an + * empty object if necessary + */ void touch(coll_t cid, const ghobject_t& oid) { __u32 op = OP_TOUCH; ::encode(op, tbl); @@ -407,7 +609,18 @@ public: ::encode(oid, tbl); ops++; } - void write(coll_t cid, const ghobject_t& oid, uint64_t off, uint64_t len, const bufferlist& data) { + /** + * Write data to an offset within an object. If the object is too + * small, it is expanded as needed. It is possible to specify an + * offset beyond the current end of an object and it will be + * expanded as needed. Simple implementations of ObjectStore will + * just zero the data between the old end of the object and the + * newly provided data. More sophisticated implementations of + * ObjectStore will omit the untouched data and store it as a + * "hole" in the file. + */ + void write(coll_t cid, const ghobject_t& oid, uint64_t off, uint64_t len, + const bufferlist& data) { __u32 op = OP_WRITE; ::encode(op, tbl); ::encode(cid, tbl); @@ -418,11 +631,16 @@ public: if (data.length() > largest_data_len) { largest_data_len = data.length(); largest_data_off = off; - largest_data_off_in_tbl = tbl.length() + sizeof(__u32); // we are about to + largest_data_off_in_tbl = tbl.length() + sizeof(__u32); // we are about to } ::encode(data, tbl); ops++; } + /** + * zero out the indicated byte range within an object. Some + * ObjectStore instances may optimize this to release the + * underlying storage space. + */ void zero(coll_t cid, const ghobject_t& oid, uint64_t off, uint64_t len) { __u32 op = OP_ZERO; ::encode(op, tbl); @@ -432,6 +650,7 @@ public: ::encode(len, tbl); ops++; } + /// Discard all data in the object beyond the specified size. void truncate(coll_t cid, const ghobject_t& oid, uint64_t off) { __u32 op = OP_TRUNCATE; ::encode(op, tbl); @@ -440,6 +659,7 @@ public: ::encode(off, tbl); ops++; } + /// Remove an object. All four parts of the object are removed. 
void remove(coll_t cid, const ghobject_t& oid) { __u32 op = OP_REMOVE; ::encode(op, tbl); @@ -447,10 +667,12 @@ public: ::encode(oid, tbl); ops++; } + /// Set an xattr of an object void setattr(coll_t cid, const ghobject_t& oid, const char* name, bufferlist& val) { string n(name); setattr(cid, oid, n, val); } + /// Set an xattr of an object void setattr(coll_t cid, const ghobject_t& oid, const string& s, bufferlist& val) { __u32 op = OP_SETATTR; ::encode(op, tbl); @@ -460,6 +682,7 @@ public: ::encode(val, tbl); ops++; } + /// Set multiple xattrs of an object void setattrs(coll_t cid, const ghobject_t& oid, map& attrset) { __u32 op = OP_SETATTRS; ::encode(op, tbl); @@ -468,7 +691,8 @@ public: ::encode(attrset, tbl); ops++; } - void setattrs(coll_t cid, const ghobject_t& oid, map& attrset) { + /// Set multiple xattrs of an object + void setattrs(coll_t cid, const hobject_t& oid, map& attrset) { __u32 op = OP_SETATTRS; ::encode(op, tbl); ::encode(cid, tbl); @@ -476,10 +700,12 @@ public: ::encode(attrset, tbl); ops++; } + /// remove an xattr from an object void rmattr(coll_t cid, const ghobject_t& oid, const char *name) { string n(name); rmattr(cid, oid, n); } + /// remove an xattr from an object void rmattr(coll_t cid, const ghobject_t& oid, const string& s) { __u32 op = OP_RMATTR; ::encode(op, tbl); @@ -488,6 +714,7 @@ public: ::encode(s, tbl); ops++; } + /// remove all xattrs from an object void rmattrs(coll_t cid, const ghobject_t& oid) { __u32 op = OP_RMATTRS; ::encode(op, tbl); @@ -495,6 +722,17 @@ public: ::encode(oid, tbl); ops++; } + /** + * Clone an object into another object. + * + * Low-cost (e.g., O(1)) cloning (if supported) is best, but + * fallback to an O(n) copy is allowed. All four parts of the + * object are cloned (data, xattrs, omap header, omap + * entries). + * + * The destination named object may already exist in + * which case its previous contents are discarded. 
+ */ void clone(coll_t cid, const ghobject_t& oid, ghobject_t noid) { __u32 op = OP_CLONE; ::encode(op, tbl); @@ -503,6 +741,13 @@ public: ::encode(noid, tbl); ops++; } + /** + * Clone a byte range from one object to another. + * + * The data portion of the destination object receives a copy of a + * portion of the data from the source object. None of the other + * three parts of an object is copied from the source. + */ void clone_range(coll_t cid, const ghobject_t& oid, ghobject_t noid, uint64_t srcoff, uint64_t srclen, uint64_t dstoff) { __u32 op = OP_CLONERANGE2; @@ -515,18 +760,29 @@ public: ::encode(dstoff, tbl); ops++; } + /// Create the collection void create_collection(coll_t cid) { __u32 op = OP_MKCOLL; ::encode(op, tbl); ::encode(cid, tbl); ops++; } + /// remove the collection, the collection must be empty void remove_collection(coll_t cid) { __u32 op = OP_RMCOLL; ::encode(op, tbl); ::encode(cid, tbl); ops++; } + /** + * Add object to another collection (DEPRECATED) + * + * The Object is added to the new collection. This is a virtual + * add, we now have two names for the same object. This is only + * used for conversion of old stores to new stores and is not + * needed for new implementations unless they expect to make use + * of the conversion infrastructure. 
+ */ void collection_add(coll_t cid, coll_t ocid, const ghobject_t& oid) { __u32 op = OP_COLL_ADD; ::encode(op, tbl); @@ -558,10 +814,12 @@ public: ops++; } + /// Set an xattr on a collection void collection_setattr(coll_t cid, const char* name, bufferlist& val) { string n(name); collection_setattr(cid, n, val); } + /// Set an xattr on a collection void collection_setattr(coll_t cid, const string& name, bufferlist& val) { __u32 op = OP_COLL_SETATTR; ::encode(op, tbl); @@ -571,10 +829,12 @@ public: ops++; } + /// Remove an xattr from a collection void collection_rmattr(coll_t cid, const char* name) { string n(name); collection_rmattr(cid, n); } + /// Remove an xattr from a collection void collection_rmattr(coll_t cid, const string& name) { __u32 op = OP_COLL_RMATTR; ::encode(op, tbl); @@ -582,6 +842,7 @@ public: ::encode(name, tbl); ops++; } + /// Set multiple xattrs on a collection void collection_setattrs(coll_t cid, map& aset) { __u32 op = OP_COLL_SETATTRS; ::encode(op, tbl); @@ -589,6 +850,7 @@ public: ::encode(aset, tbl); ops++; } + /// Set multiple xattrs on a collection void collection_setattrs(coll_t cid, map& aset) { __u32 op = OP_COLL_SETATTRS; ::encode(op, tbl); @@ -596,6 +858,7 @@ public: ::encode(aset, tbl); ops++; } + /// Change the name of a collection void collection_rename(coll_t cid, coll_t ncid) { __u32 op = OP_COLL_RENAME; ::encode(op, tbl); @@ -645,9 +908,9 @@ public: /// Remove key range from oid omap void omap_rmkeyrange( coll_t cid, ///< [in] Collection containing oid - const ghobject_t &oid, ///< [in] Object from which to remove the omap + const ghobject_t &oid, ///< [in] Object from which to remove the omap keys const string& first, ///< [in] first key in range - const string& last ///< [in] first key past range + const string& last ///< [in] first key past range, range is [first,last) ) { __u32 op = OP_OMAP_RMKEYRANGE; ::encode(op, tbl); @@ -661,7 +924,7 @@ public: /// Set omap header void omap_setheader( coll_t cid, ///< [in] Collection 
containing oid - const ghobject_t &oid, ///< [in] Object from which to remove the omap + const ghobject_t &oid, ///< [in] Object const bufferlist &bl ///< [in] Header value ) { __u32 op = OP_OMAP_SETHEADER; @@ -672,7 +935,8 @@ public: ops++; } - /// Split collection based on given prefixes + /// Split collection based on given prefixes, objects matching the specified bits/rem are + /// moved to the new collection void split_collection( coll_t cid, uint32_t bits, @@ -708,7 +972,7 @@ public: replica(false), tolerate_collection_add_enoent(false) { bufferlist::iterator dp = nbl.begin(); - decode(dp); + decode(dp); } void encode(bufferlist& bl) const { @@ -789,7 +1053,7 @@ public: list tls; tls.push_back(t); return queue_transactions(osr, tls, new C_DeleteTransaction(t), - NULL, NULL, TrackedOpRef(), handle); + NULL, NULL, TrackedOpRef(), handle); } int queue_transaction(Sequencer *osr, Transaction *t, Context *onreadable, Context *ondisk=0, @@ -799,7 +1063,7 @@ public: list tls; tls.push_back(t); return queue_transactions(osr, tls, onreadable, ondisk, onreadable_sync, - op, handle); + op, handle); } int queue_transactions(Sequencer *osr, list& tls, @@ -920,15 +1184,50 @@ public: */ virtual int get_ideal_list_max() { return 64; } - // objects + /** + * Synchronous read operations + */ + + + /** + * exists -- Test for existance of object + * + * @param cid collection for object + * @param oid oid of object + * @returns true if object exists, false otherwise + */ virtual bool exists(coll_t cid, const ghobject_t& oid) = 0; // useful? + + /** + * stat -- get information for an object + * + * @param cid collection for object + * @param oid oid of object + * @param st output information for the object + * @param allow_eio if false, assert on -EIO operation failure + * @returns 0 on success, negative error code on failure. + */ virtual int stat( coll_t cid, const ghobject_t& oid, struct stat *st, bool allow_eio = false) = 0; // struct stat? 
- virtual int read( + /** + * read -- read a byte range of data from an object + * + * Note: if reading from an offset past the end of the object, we + * return 0 (not, say, -EINVAL). + * + * @param cid collection for object + * @param oid oid of object + * @param offset location offset of first byte to be read + * @param len number of bytes to be read + * @param bl output bufferlist + * @param allow_eio if false, assert on -EIO operation failure + * @returns number of bytes read on success, or negative error code on failure. + */ + virtual int read( coll_t cid, const ghobject_t& oid, uint64_t offset, @@ -936,9 +1235,44 @@ public: bufferlist& bl, bool allow_eio = false) = 0; + /** + * fiemap -- get extent map of data of an object + * + * Returns an encoded map of the extents of an object's data portion + * (map). + * + * A non-enlightend implementation is free to return the extent (offset, len) + * as the sole extent. + * + * @param cid collection for object + * @param oid oid of object + * @param offset location offset of first byte to be read + * @param len number of bytes to be read + * @param bl output bufferlist for extent map information. + * @returns 0 on success, negative error code on failure. + */ virtual int fiemap(coll_t cid, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl) = 0; + /** + * getattr -- get an xattr of an object + * + * @param cid collection for object + * @param oid oid of object + * @param name name of attr to read + * @param value place to put output result. + * @returns 0 on success, negative error code on failure. + */ virtual int getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferptr& value) = 0; + + /** + * getattr -- get an xattr of an object + * + * @param cid collection for object + * @param oid oid of object + * @param name name of attr to read + * @param value place to put output result. + * @returns 0 on success, negative error code on failure. 
+ */ int getattr(coll_t cid, const ghobject_t& oid, const char *name, bufferlist& value) { bufferptr bp; int r = getattr(cid, oid, name, bp); @@ -946,15 +1280,26 @@ public: value.push_back(bp); return r; } - int getattr( - coll_t cid, const ghobject_t& oid, - const string name, bufferlist& value) { - bufferptr bp; - int r = getattr(cid, oid, name.c_str(), bp); - value.push_back(bp); - return r; - } + /** + * getattrs -- get all of the xattrs of an object + * + * @param cid collection for object + * @param oid oid of object + * @param aset place to put output result. + * @param user_only true -> only user attributes are return else all attributes are returned + * @returns 0 on success, negative error code on failure. + */ virtual int getattrs(coll_t cid, const ghobject_t& oid, map& aset, bool user_only = false) = 0; + + /** + * getattrs -- get all of the xattrs of an object + * + * @param cid collection for object + * @param oid oid of object + * @param aset place to put output result. + * @param user_only true -> only user attributes are return else all attributes are returned + * @returns 0 on success, negative error code on failure. + */ int getattrs(coll_t cid, const ghobject_t& oid, map& aset, bool user_only = false) { map bmap; int r = getattrs(cid, oid, bmap, user_only); @@ -966,19 +1311,71 @@ public: return r; } - + // collections + + /** + * list_collections -- get all of the collections known to this ObjectStore + * + * @param ls list of the collections in sorted order. + * @returns 0 on success, negative error code on failure. + */ virtual int list_collections(vector& ls) = 0; - virtual int collection_version_current(coll_t c, uint32_t *version) { + + virtual int collection_version_current(coll_t c, uint32_t *version) { *version = 0; return 1; } + /** + * does a collection exist? 
+ * + * @param c collection + * @returns true if it exists, false otherwise + */ virtual bool collection_exists(coll_t c) = 0; + /** + * collection_getattr - get an xattr of a collection + * + * @param cid collection name + * @param name xattr name + * @param value pointer to buffer to receive value + * @param size size of buffer to receive value + * @returns 0 on success, negative error code on failure + */ virtual int collection_getattr(coll_t cid, const char *name, - void *value, size_t size) = 0; + void *value, size_t size) = 0; + /** + * collection_getattr - get an xattr of a collection + * + * @param cid collection name + * @param name xattr name + * @param bl buffer to receive value + * @returns 0 on success, negative error code on failure + */ virtual int collection_getattr(coll_t cid, const char *name, bufferlist& bl) = 0; + /** + * collection_getattrs - get all xattrs of a collection + * + * @param cid collection name + * @param aset map of keys and buffers that contain the values + * @returns 0 on success, negative error code on failure + */ virtual int collection_getattrs(coll_t cid, map &aset) = 0; + /** + * is a collection empty? 
+ * + * @param c collection + * @returns true if empty, false otherwise + */ virtual bool collection_empty(coll_t c) = 0; + + /** + * collection_list - get all objects of a collection in sorted order + * + * @param c collection name + * @param o [out] list of objects + * @returns 0 on success, negative error code on failure + */ virtual int collection_list(coll_t c, vector& o) = 0; /** @@ -994,7 +1391,7 @@ public: * @return zero on success, or negative error */ virtual int collection_list_partial(coll_t c, ghobject_t start, - int min, int max, snapid_t snap, + int min, int max, snapid_t snap, vector *ls, ghobject_t *next) = 0; /** @@ -1008,7 +1405,7 @@ public: * @return zero on success, or negative error */ virtual int collection_list_range(coll_t c, ghobject_t start, ghobject_t end, - snapid_t seq, vector *ls) = 0; + snapid_t seq, vector *ls) = 0; //TODO: Remove int collection_list(coll_t c, vector& o); @@ -1018,7 +1415,7 @@ public: vector *ls, hobject_t *next); int collection_list_range(coll_t c, hobject_t start, hobject_t end, - snapid_t seq, vector *ls); + snapid_t seq, vector *ls); /// OMAP /// Get omap contents @@ -1082,7 +1479,10 @@ public: virtual int dump_journal(ostream& out) { return -EOPNOTSUPP; } virtual int snapshot(const string& name) { return -EOPNOTSUPP; } - + + /** + * Set and get internal fsid for this instance. No external data is modified + */ virtual void set_fsid(uuid_d u) = 0; virtual uuid_d get_fsid() = 0;