1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
14 #ifndef CEPH_OBJECTSTORE_H
15 #define CEPH_OBJECTSTORE_H
17 #include "include/buffer.h"
18 #include "include/common_fwd.h"
19 #include "include/Context.h"
20 #include "include/interval_set.h"
21 #include "include/stringify.h"
22 #include "include/types.h"
24 #include "osd/osd_types.h"
25 #include "common/TrackedOp.h"
26 #include "common/WorkQueue.h"
27 #include "ObjectMap.h"
28 #include "os/Transaction.h"
35 #if defined(__APPLE__) || defined(__FreeBSD__) || defined(__sun) || defined(_WIN32)
36 #include <sys/statvfs.h>
38 #include <sys/vfs.h> /* or <sys/statfs.h> */
46 * low-level interface to the local OSD file system
52 static inline void encode(const std::map<std::string,ceph::buffer::ptr> *attrset, ceph::buffer::list &bl) {
// Flag bits. osflagbits_t is the type of the `flags` argument accepted by
// ObjectStore::create(); each constant below occupies its own bit.
// NOTE(review): the constants are declared `const int`, not osflagbits_t.
58 typedef uint32_t osflagbits_t;
59 const int SKIP_JOURNAL_REPLAY = 1 << 0; // bit 0
60 const int SKIP_MOUNT_OMAP = 1 << 1; // bit 1
67 using Transaction = ceph::os::Transaction;
71 * create - create an ObjectStore instance.
73 * This is invoked once at initialization time.
75 * @param type type of store. This is a std::string from the configuration file.
76 * @param data path (or other descriptor) for data
77 * @param journal path (or other descriptor) for journal (optional)
78 * @param flags which filestores should check if applicable
// Static factory; `flags` defaults to 0, i.e. no osflagbits_t bit is set.
// NOTE(review): returns a raw ObjectStore*; ownership presumably passes to
// the caller -- confirm against the implementation.
80 static ObjectStore *create(CephContext *cct,
81 const std::string& type,
82 const std::string& data,
83 const std::string& journal,
84 osflagbits_t flags = 0);
87 * probe a block device to learn the uuid of the owning OSD
90 * @param path path to device
91 * @param fsid [out] osd uuid
93 static int probe_block_device_fsid(
95 const std::string& path,
99 * Fetch Object Store statistics.
101 * Currently only latency of write and apply times are measured.
103 * This appears to be called with nothing locked.
105 virtual objectstore_perf_stat_t get_cur_stats() = 0;
108 * Fetch Object Store performance counters.
111 * This appears to be called with nothing locked.
113 virtual const PerfCounters* get_perf_counters() const = 0;
116 * a collection also orders transactions
118 * Any transactions queued under a given collection will be applied in
119 * sequence. Transactions queued under different collections may run
122 * ObjectStore users may get collection handles with open_collection() (or,
123 * for bootstrapping a new collection, create_new_collection()).
125 struct CollectionImpl : public RefCountedObject {
128 /// wait for any queued transactions to apply
129 // block until any previous transactions are visible. specifically,
130 // collection_list and collection_empty need to reflect prior operations.
131 virtual void flush() = 0;
136 * There are two cases:
137 * 1) collection is currently idle: the method returns true. c is
139 * 2) collection is not idle: the method returns false and c is
140 * called asynchronously with a value of 0 once all transactions
141 * queued on this collection prior to the call have been applied
144 virtual bool flush_commit(Context *c) = 0;
146 const coll_t &get_cid() {
150 CollectionImpl() = delete;
151 CollectionImpl(CephContext* cct, const coll_t& c) : RefCountedObject(cct), cid(c) {}
152 ~CollectionImpl() = default;
154 using CollectionHandle = ceph::ref_t<CollectionImpl>;
157 /*********************************
159 * Object Contents and semantics
161 * All ObjectStore objects are identified as a named object
162 * (ghobject_t and hobject_t) in a named collection (coll_t).
163 * ObjectStore operations support the creation, mutation, deletion
164 * and enumeration of objects within a collection. Enumeration is
165 * in sorted key order (where keys are sorted by hash). Object names
166 * are globally unique.
168 * Each object has four distinct parts: byte data, xattrs, omap_header
171 * The data portion of an object is conceptually equivalent to a
172 * file in a file system. Random and Partial access for both read
173 * and write operations is required. The ability to have a sparse
174 * implementation of the data portion of an object is beneficial for
175 * some workloads, but not required. There is a system-wide limit on
176 * the maximum size of an object, which is typically around 100 MB.
178 * Xattrs are equivalent to the extended attributes of file
179 * systems. Xattrs are a set of key/value pairs. Sub-value access
180 * is not required. It is possible to enumerate the set of xattrs in
181 * key order. At the implementation level, xattrs are used
182 * exclusively internal to Ceph and the implementer can expect the
183 * total size of all of the xattrs on an object to be relatively
184 * small, i.e., less than 64KB. Much of Ceph assumes that accessing
185 * xattrs on temporally adjacent object accesses (recent past or
186 * near future) is inexpensive.
188 * omap_header is a single blob of data. It can be read or written
191 * Omap entries are conceptually the same as xattrs
192 * but in a different address space. In other words, you can have
193 * the same key as an xattr and an omap entry and they have distinct
194 * values. Enumeration of xattrs doesn't include omap entries and
195 * vice versa. The size and access characteristics of omap entries
196 * are very different from xattrs. In particular, the value portion
197 * of an omap entry can be quite large (MBs). More importantly, the
198 * interface must support efficient range queries on omap entries even
199 * when there is a large number of entries.
201 *********************************/
203 /*******************************
207 * A collection is simply a grouping of objects. Collections have
208 * names (coll_t) and can be enumerated in order. Like an
209 * individual object, a collection also has a set of xattrs.
215 int queue_transaction(CollectionHandle& ch,
217 TrackedOpRef op = TrackedOpRef(),
218 ThreadPool::TPHandle *handle = NULL) {
219 std::vector<Transaction> tls;
220 tls.push_back(std::move(t));
221 return queue_transactions(ch, tls, op, handle);
/// Queue a batch of transactions on the collection referenced by @p ch.
///
/// This is the pure-virtual entry point that the single-transaction
/// queue_transaction() convenience wrapper forwards to.
///
/// @param ch     handle of the collection the transactions are queued under
/// @param tls    transactions to queue (taken by non-const reference)
/// @param op     optional tracked op; defaults to an empty TrackedOpRef
/// @param handle optional thread-pool handle; defaults to nullptr
/// @returns implementation-defined status
///          (NOTE(review): presumably 0 / negative error code -- confirm)
224 virtual int queue_transactions(
225 CollectionHandle& ch, std::vector<Transaction>& tls,
226 TrackedOpRef op = TrackedOpRef(),
227 ThreadPool::TPHandle *handle = nullptr) = 0;
/// Construct the base object, recording the store path and the CephContext.
/// The `cct(cct)` initializer sets the member `cct` from the parameter of
/// the same name.
231 ObjectStore(CephContext* cct,
232 const std::string& path_) : path(path_), cct(cct) {}
/// Virtual destructor so implementations can be destroyed through an
/// ObjectStore pointer; the base class has no cleanup of its own.
233 virtual ~ObjectStore() = default;
// Copying is disabled: both the copy constructor and the copy-assignment
// operator are deleted.
236 explicit ObjectStore(const ObjectStore& o) = delete;
237 const ObjectStore& operator=(const ObjectStore& o) = delete;
240 virtual int upgrade() {
// Optional diagnostics hooks. The defaults below do nothing, so a backend
// only overrides the ones it can actually serve.
/// Default: no-op, nothing is written to @p f.
244 virtual void get_db_statistics(ceph::Formatter *f) { }
/// Default: no-op, nothing is written to @p f.
245 virtual void generate_db_histogram(ceph::Formatter *f) { }
/// Default: returns -1 without touching @p os (which may be null).
246 virtual int flush_cache(std::ostream *os = nullptr) { return -1; }
/// Default: no-op, nothing is written to @p f.
247 virtual void dump_perf_counters(ceph::Formatter *f) {}
/// Default: no-op (Formatter overload).
248 virtual void dump_cache_stats(ceph::Formatter *f) {}
/// Default: no-op (stream overload).
249 virtual void dump_cache_stats(std::ostream& os) {}
251 virtual std::string get_type() = 0;
254 virtual bool test_mount_in_use() = 0;
255 virtual int mount() = 0;
256 virtual int umount() = 0;
257 virtual int fsck(bool deep) {
260 virtual int repair(bool deep) {
263 virtual int quick_fix() {
267 virtual void set_cache_shards(unsigned num) { }
270 * Returns 0 if the hobject is valid, -error otherwise
273 * -ENAMETOOLONG: locator/namespace/name too large
275 virtual int validate_hobject_key(const hobject_t &obj) const = 0;
277 virtual unsigned get_max_attr_name_length() = 0;
// Store creation and journal capability queries; all must be provided by
// the concrete backend.
278 virtual int mkfs() = 0; ///< wipe
279 virtual int mkjournal() = 0; ///< journal only
280 virtual bool needs_journal() = 0; ///< requires a journal
281 virtual bool wants_journal() = 0; ///< prefers a journal
282 virtual bool allows_journal() = 0; ///< allows a journal
284 // return store min allocation size, if applicable
285 virtual uint64_t get_min_alloc_size() const {
289 /// enumerate hardware devices (by 'devname', e.g., 'sda' as in /sys/block/sda)
290 virtual int get_devices(std::set<std::string> *devls) {
294 /// true if a txn is readable immediately after it is queued.
295 virtual bool is_sync_onreadable() const {
302 * Check whether store is backed by a rotational (HDD) or non-rotational
305 * This must be usable *before* the store is mounted.
307 * @return true for HDD, false for SSD
309 virtual bool is_rotational() {
314 * is_journal_rotational
316 * Check whether journal is backed by a rotational (HDD) or non-rotational
320 * @return true for HDD, false for SSD
322 virtual bool is_journal_rotational() {
326 virtual std::string get_default_device_class() {
327 return is_rotational() ? "hdd" : "ssd";
330 virtual int get_numa_node(
332 std::set<int> *nodes,
333 std::set<std::string> *failed) {
338 virtual bool can_sort_nibblewise() {
339 return false; // assume a backend cannot, unless it says otherwise
// Store-level statistics query. `alerts` is optional and defaults to nullptr.
// NOTE(review): presumably fills *buf and returns 0 / negative error code --
// confirm against an implementation.
342 virtual int statfs(struct store_statfs_t *buf,
343 osd_alert_list_t* alerts = nullptr) = 0;
// Variant of statfs() restricted to the pool identified by pool_id.
// NOTE(review): per_pool_omap looks like an [out] flag -- confirm its meaning.
344 virtual int pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
345 bool *per_pool_omap) = 0;
347 virtual void collect_metadata(std::map<std::string,std::string> *pm) { }
350 * write_meta - write a simple configuration key out-of-band
352 * Write a simple key/value pair for basic store configuration
353 * (e.g., a uuid or magic number) to an unopened/unmounted store.
354 * The default implementation writes this to a plaintext file in the
357 * A newline is appended.
359 * @param key key name (e.g., "fsid")
360 * @param value value (e.g., a uuid rendered as a std::string)
361 * @returns 0 for success, or an error code
363 virtual int write_meta(const std::string& key,
364 const std::string& value);
367 * read_meta - read a simple configuration key out-of-band
369 * Read a simple key value from an unopened/mounted store.
371 * Trailing whitespace is stripped off.
373 * @param key key name
374 * @param value pointer to value std::string
375 * @returns 0 for success, or an error code
377 virtual int read_meta(const std::string& key,
381 * get ideal max value for collection_list()
383 * default to some arbitrary values; the implementation will override.
385 virtual int get_ideal_list_max() { return 64; }
389 * get a collection handle
391 * Provide a trivial handle as a default to avoid converting legacy
394 virtual CollectionHandle open_collection(const coll_t &cid) = 0;
397 * get a collection handle for a soon-to-be-created collection
399 * This handle must be used by queue_transaction that includes a
400 * create_collection call in order to become valid. It will become the
401 * reference to the created collection.
403 virtual CollectionHandle create_new_collection(const coll_t &cid) = 0;
406 * set ContextQueue for a collection
408 * After that, oncommits of Transaction will queue into commit_queue.
409 * And osd ShardThread will call oncommits.
411 virtual void set_collection_commit_queue(const coll_t &cid, ContextQueue *commit_queue) = 0;
414 * Synchronous read operations
418 * exists -- Test for existence of object
420 * @param cid collection for object
421 * @param oid oid of object
422 * @returns true if object exists, false otherwise
424 virtual bool exists(CollectionHandle& c, const ghobject_t& oid) = 0;
426 * set_collection_opts -- set pool options for a collection
428 * @param cid collection
429 * @param opts new collection options
430 * @returns 0 on success, negative error code on failure.
432 virtual int set_collection_opts(
434 const pool_opts_t& opts) = 0;
437 * stat -- get information for an object
439 * @param cid collection for object
440 * @param oid oid of object
441 * @param st output information for the object
442 * @param allow_eio if false, assert on -EIO operation failure
443 * @returns 0 on success, negative error code on failure.
447 const ghobject_t& oid,
449 bool allow_eio = false) = 0;
451 * read -- read a byte range of data from an object
453 * Note: if reading from an offset past the end of the object, we
454 * return 0 (not, say, -EINVAL).
456 * @param cid collection for object
457 * @param oid oid of object
458 * @param offset location offset of first byte to be read
459 * @param len number of bytes to be read
460 * @param bl output ceph::buffer::list
461 * @param op_flags is CEPH_OSD_OP_FLAG_*
462 * @returns number of bytes read on success, or negative error code on failure.
466 const ghobject_t& oid,
469 ceph::buffer::list& bl,
470 uint32_t op_flags = 0) = 0;
473 * fiemap -- get extent map of data of an object
475 * Returns an encoded map of the extents of an object's data portion
476 * (std::map<offset,size>).
478 * A non-enlightened implementation is free to return the extent (offset, len)
479 * as the sole extent.
481 * @param cid collection for object
482 * @param oid oid of object
483 * @param offset location offset of first byte to be read
484 * @param len number of bytes to be read
485 * @param bl output ceph::buffer::list for extent map information.
486 * @returns 0 on success, negative error code on failure.
488 virtual int fiemap(CollectionHandle& c, const ghobject_t& oid,
489 uint64_t offset, size_t len, ceph::buffer::list& bl) = 0;
490 virtual int fiemap(CollectionHandle& c, const ghobject_t& oid,
491 uint64_t offset, size_t len, std::map<uint64_t, uint64_t>& destmap) = 0;
494 * readv -- read specific intervals from an object;
495 * caller must call fiemap to fill in the extent-map first.
497 * Note: if reading from an offset past the end of the object, we
498 * return 0 (not, say, -EINVAL). Also the default version of readv
499 * reads each extent separately synchronously, which can become horribly
500 * inefficient if the physical layout of the pushing object gets massively
501 * fragmented, and hence should be overridden by any real os that
502 * cares about performance.
504 * @param cid collection for object
505 * @param oid oid of object
506 * @param m intervals to be read
507 * @param bl output ceph::buffer::list
508 * @param op_flags is CEPH_OSD_OP_FLAG_*
509 * @returns number of bytes read on success, or negative error code on failure.
513 const ghobject_t& oid,
514 interval_set<uint64_t>& m,
515 ceph::buffer::list& bl,
516 uint32_t op_flags = 0) {
518 for (auto p = m.begin(); p != m.end(); p++) {
519 ceph::buffer::list t;
520 int r = read(c, oid, p.get_start(), p.get_len(), t, op_flags);
524 // prune fiemap, if necessary
525 if (p.get_len() != t.length()) {
527 if (t.length() == 0) {
528 m.erase(save); // Remove this empty interval
530 save.set_len(t.length()); // fix interval length
533 // Remove any other follow-up intervals present too
534 while (p != m.end()) {
546 * dump_onode -- dumps onode metadata in human readable form,
547 intended primarily for debugging
549 * @param cid collection for object
550 * @param oid oid of object
551 * @param section_name section name to create and print under
552 * @param f Formatter class instance to print to
553 * @returns 0 on success, negative error code on failure.
555 virtual int dump_onode(
557 const ghobject_t& oid,
558 const std::string& section_name,
559 ceph::Formatter *f) {
564 * getattr -- get an xattr of an object
566 * @param cid collection for object
567 * @param oid oid of object
568 * @param name name of attr to read
569 * @param value place to put output result.
570 * @returns 0 on success, negative error code on failure.
572 virtual int getattr(CollectionHandle &c, const ghobject_t& oid,
573 const char *name, ceph::buffer::ptr& value) = 0;
576 * getattr -- get an xattr of an object
578 * @param cid collection for object
579 * @param oid oid of object
580 * @param name name of attr to read
581 * @param value place to put output result.
582 * @returns 0 on success, negative error code on failure.
585 CollectionHandle &c, const ghobject_t& oid,
586 const std::string& name, ceph::buffer::list& value) {
587 ceph::buffer::ptr bp;
588 int r = getattr(c, oid, name.c_str(), bp);
594 * getattrs -- get all of the xattrs of an object
596 * @param cid collection for object
597 * @param oid oid of object
598 * @param aset place to put output result.
599 * @returns 0 on success, negative error code on failure.
601 virtual int getattrs(CollectionHandle &c, const ghobject_t& oid,
602 std::map<std::string,ceph::buffer::ptr>& aset) = 0;
605 * getattrs -- get all of the xattrs of an object
607 * @param cid collection for object
608 * @param oid oid of object
609 * @param aset place to put output result.
610 * @returns 0 on success, negative error code on failure.
612 int getattrs(CollectionHandle &c, const ghobject_t& oid,
613 std::map<std::string,ceph::buffer::list>& aset) {
614 std::map<std::string,ceph::buffer::ptr> bmap;
615 int r = getattrs(c, oid, bmap);
616 for (auto i = bmap.begin(); i != bmap.end(); ++i) {
617 aset[i->first].append(i->second);
626 * list_collections -- get all of the collections known to this ObjectStore
628 * @param ls list of the collections in sorted order.
629 * @returns 0 on success, negative error code on failure.
631 virtual int list_collections(std::vector<coll_t>& ls) = 0;
634 * does a collection exist?
636 * @param c collection
637 * @returns true if it exists, false otherwise
639 virtual bool collection_exists(const coll_t& c) = 0;
642 * is a collection empty?
644 * @param c collection
645 * @param empty true if the specified collection is empty, false otherwise
646 * @returns 0 on success, negative error code on failure.
648 virtual int collection_empty(CollectionHandle& c, bool *empty) = 0;
651 * return the number of significant bits of the coll_t::pgid.
653 * This should return what the last create_collection or split_collection
654 * set. A legacy backend may return -EAGAIN if the value is unavailable
655 * (because we upgraded from an older version, e.g., FileStore).
657 virtual int collection_bits(CollectionHandle& c) = 0;
661 * list contents of a collection that fall in the range [start, end), returning no more than a specified number of results
663 * @param c collection
664 * @param start list objects that sort >= this value
665 * @param end list objects that sort < this value
666 * @param max return no more than this many results
667 * @param seq return no objects with snap < seq
668 * @param ls [out] result
669 * @param next [out] next item sorts >= this value
670 * @return zero on success, or negative error
672 virtual int collection_list(CollectionHandle &c,
673 const ghobject_t& start, const ghobject_t& end,
675 std::vector<ghobject_t> *ls, ghobject_t *next) = 0;
677 virtual int collection_list_legacy(CollectionHandle &c,
678 const ghobject_t& start,
679 const ghobject_t& end, int max,
680 std::vector<ghobject_t> *ls,
682 return collection_list(c, start, end, max, ls, next);
686 /// Get omap contents
687 virtual int omap_get(
688 CollectionHandle &c, ///< [in] Collection containing oid
689 const ghobject_t &oid, ///< [in] Object containing omap
690 ceph::buffer::list *header, ///< [out] omap header
691 std::map<std::string, ceph::buffer::list> *out /// < [out] Key to value std::map
695 virtual int omap_get_header(
696 CollectionHandle &c, ///< [in] Collection containing oid
697 const ghobject_t &oid, ///< [in] Object containing omap
698 ceph::buffer::list *header, ///< [out] omap header
699 bool allow_eio = false ///< [in] don't assert on eio
702 /// Get keys defined on oid
703 virtual int omap_get_keys(
704 CollectionHandle &c, ///< [in] Collection containing oid
705 const ghobject_t &oid, ///< [in] Object containing omap
706 std::set<std::string> *keys ///< [out] Keys defined on oid
710 virtual int omap_get_values(
711 CollectionHandle &c, ///< [in] Collection containing oid
712 const ghobject_t &oid, ///< [in] Object containing omap
713 const std::set<std::string> &keys, ///< [in] Keys to get
714 std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
718 virtual int omap_get_values(
719 CollectionHandle &c, ///< [in] Collection containing oid
720 const ghobject_t &oid, ///< [in] Object containing omap
721 const std::optional<std::string> &start_after, ///< [in] Keys to get
722 std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
726 /// Filters keys into out which are defined on oid
727 virtual int omap_check_keys(
728 CollectionHandle &c, ///< [in] Collection containing oid
729 const ghobject_t &oid, ///< [in] Object containing omap
730 const std::set<std::string> &keys, ///< [in] Keys to check
731 std::set<std::string> *out ///< [out] Subset of keys defined on oid
735 * Returns an object map iterator
737 * Warning! The returned iterator is an implicit lock on filestore
738 * operations in c. Do not use filestore methods on c while the returned
739 * iterator is live. (Filling in a transaction is no problem).
741 * @return iterator, null on error
743 virtual ObjectMap::ObjectMapIterator get_omap_iterator(
744 CollectionHandle &c, ///< [in] collection
745 const ghobject_t &oid ///< [in] object
// Optional journal/snapshot operations. Each default reports "not supported"
// by returning -EOPNOTSUPP; a backend overrides the ones it implements.
748 virtual int flush_journal() { return -EOPNOTSUPP; }
// Default: writes nothing to `out` and returns -EOPNOTSUPP.
750 virtual int dump_journal(std::ostream& out) { return -EOPNOTSUPP; }
// Default: ignores `name` and returns -EOPNOTSUPP.
752 virtual int snapshot(const std::string& name) { return -EOPNOTSUPP; }
755 * Set and get internal fsid for this instance. No external data is modified
757 virtual void set_fsid(uuid_d u) = 0;
758 virtual uuid_d get_fsid() = 0;
761 * Estimates additional disk space used by the specified number of objects, caused by file allocation granularity and metadata store
762 * - num objects - total (including whiteouts) object count to measure used space for.
764 virtual uint64_t estimate_objects_overhead(uint64_t num_objects) = 0;
// Error-injection hooks; the defaults are no-ops that ignore `oid`.
// NOTE(review): presumably intended for testing/debugging only -- confirm.
768 virtual void inject_data_error(const ghobject_t &oid) {}
769 virtual void inject_mdata_error(const ghobject_t &oid) {}
// Default: no-op; a backend may override to perform compaction.
771 virtual void compact() {}
772 virtual bool has_builtin_csum() const {