Rename the old FreelistManager to ExtentFreelistManager, and make FreelistManager an abstract interface with a create(type) factory.
Persist the freelist_type in the super area.
Drop get_total_free() (use Allocator for that instead).
Signed-off-by: Sage Weil <sage@redhat.com>
OPTION(bluestore_onode_map_size, OPT_U32, 1024) // onodes per collection
OPTION(bluestore_cache_tails, OPT_BOOL, true) // cache tail blocks in Onode
OPTION(bluestore_kvbackend, OPT_STR, "rocksdb")
+OPTION(bluestore_freelist_type, OPT_STR, "extent")
OPTION(bluestore_rocksdb_options, OPT_STR, "compression=kNoCompression,max_write_buffer_number=16,min_write_buffer_number_to_merge=3,recycle_log_file_num=16")
OPTION(bluestore_fsck_on_mount, OPT_BOOL, false)
OPTION(bluestore_fsck_on_umount, OPT_BOOL, false)
os/bluestore/BlueFS.cc \
os/bluestore/BlueRocksEnv.cc \
os/bluestore/BlueStore.cc \
+ os/bluestore/ExtentFreelistManager.cc \
os/bluestore/FreelistManager.cc \
os/bluestore/KernelDevice.cc \
os/bluestore/StupidAllocator.cc
os/bluestore/BlueRocksEnv.h \
os/bluestore/BlueStore.h \
os/bluestore/KernelDevice.h \
+ os/bluestore/ExtentFreelistManager.h \
os/bluestore/FreelistManager.h \
os/bluestore/StupidAllocator.h
endif
{
assert(fm == NULL);
assert(alloc == NULL);
- fm = new FreelistManager();
+ fm = FreelistManager::create(freelist_type);
int r = fm->init(db, PREFIX_ALLOC);
if (r < 0) {
delete fm;
return r;
}
+ freelist_type = g_conf->bluestore_freelist_type;
+
r = _open_path();
if (r < 0)
return r;
{
dout(20) << __func__ << " initializing freespace" << dendl;
KeyValueDB::Transaction t = db->get_transaction();
+ {
+ bufferlist bl;
+ bl.append(freelist_type);
+ t->set(PREFIX_SUPER, "freelist_type", bl);
+ }
+ fm->create(t);
uint64_t reserved = 0;
if (g_conf->bluestore_bluefs) {
assert(bluefs_extents.num_intervals() == 1);
if (r < 0)
goto out_bdev;
- r = _open_alloc();
+ r = _open_super_meta();
if (r < 0)
goto out_db;
- r = _open_super_meta();
+ r = _open_alloc();
if (r < 0)
- goto out_alloc;
+ goto out_db;
r = _open_collections();
if (r < 0)
memset(buf, 0, sizeof(*buf));
buf->f_blocks = bdev->get_size() / bdev->get_block_size();
buf->f_bsize = bdev->get_block_size();
- buf->f_bfree = fm->get_total_free() / bdev->get_block_size();
+ buf->f_bfree = alloc->get_free() / bdev->get_block_size();
buf->f_bavail = buf->f_bfree;
dout(20) << __func__ << " free " << pretty_si_t(buf->f_bfree * buf->f_bsize)
<< " / " << pretty_si_t(buf->f_blocks * buf->f_bsize) << dendl;
nid_last = nid_max;
}
+ // freelist
+ {
+ bufferlist bl;
+ db->get(PREFIX_SUPER, "freelist_type", &bl);
+ freelist_type = std::string(bl.c_str(), bl.length());
+ dout(10) << __func__ << " freelist_type " << freelist_type << dendl;
+ }
+
// bluefs alloc
{
bluefs_extents.clear();
unsigned bluefs_shared_bdev; ///< which bluefs bdev we are sharing
KeyValueDB *db;
BlockDevice *bdev;
+ std::string freelist_type;
FreelistManager *fm;
Allocator *alloc;
uuid_d fsid;
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ExtentFreelistManager.h"
+#include "kv/KeyValueDB.h"
+#include "kv.h"
+
+#include "common/debug.h"
+
+#define dout_subsys ceph_subsys_bluestore
+#undef dout_prefix
+#define dout_prefix *_dout << "freelist "
+
+// Load the free extents stored under prefix `p` in `db` into kv_free and
+// accumulate their lengths into total_free.
+//
+// The loop assumes the iterator yields records in ascending offset order.
+// A record that starts inside the previous one is treated as corruption.
+// A record that starts exactly where the previous one ends is merged into
+// it, both in memory and (via one transaction submitted after the scan) in
+// the db.
+//
+// Returns 0 on success, -EIO if two records overlap, or the negative value
+// returned by submit_transaction_sync() if the repair could not be written.
+int ExtentFreelistManager::init(KeyValueDB *db, string p)
+{
+  dout(1) << __func__ << " prefix " << p << dendl;
+
+  // load state from kvstore
+  prefix = p;
+
+  // collects the repairs for contiguous records found during the scan
+  KeyValueDB::Transaction txn = db->get_transaction();
+  int fixed = 0;
+
+  KeyValueDB::Iterator it = db->get_iterator(prefix);
+  it->lower_bound(string());
+  uint64_t last_offset = 0;
+  uint64_t last_length = 0;
+  while (it->valid()) {
+    uint64_t offset, length;
+    string k = it->key();
+    const char *key_end = _key_decode_u64(k.c_str(), &offset);
+    assert(key_end);
+    bufferlist bl = it->value();
+    bufferlist::iterator bp = bl.begin();
+    ::decode(length, bp);
+
+    total_free += length;
+
+    if (offset < last_offset + last_length) {
+      derr << __func__ << " detected overlapping extent on load, had "
+	   << last_offset << "~" << last_length
+	   << " and got "
+	   << offset << "~" << length
+	   << dendl;
+      return -EIO;
+    }
+    // "offset &&" keeps a first record at offset 0 from matching the
+    // initial last_offset + last_length == 0.
+    if (offset && offset == last_offset + last_length) {
+      derr << __func__ << " detected contiguous extent on load, merging "
+	   << last_offset << "~" << last_length << " with "
+	   << offset << "~" << length
+	   << dendl;
+      kv_free.erase(last_offset);
+
+      // Remove the record being absorbed.  Leaving it behind would make the
+      // next load see it overlapping the enlarged previous record and fail
+      // with -EIO.
+      string absorbed_key;
+      _key_encode_u64(offset, &absorbed_key);
+      txn->rmkey(prefix, absorbed_key);
+
+      // The merged extent lives under the previous record's key; set()
+      // replaces its stored length.
+      offset = last_offset;
+      length += last_length;
+      string merged_key;
+      _key_encode_u64(offset, &merged_key);
+      bufferlist value;
+      ::encode(length, value);
+      txn->set(prefix, merged_key, value);
+      fixed++;
+    }
+
+    kv_free[offset] = length;
+    dout(20) << __func__ << " " << offset << "~" << length << dendl;
+
+    last_offset = offset;
+    last_length = length;
+    it->next();
+  }
+
+  if (fixed) {
+    int r = db->submit_transaction_sync(txn);
+    if (r < 0) {
+      derr << __func__ << " failed to persist " << fixed
+	   << " merged extents, error " << r << dendl;
+      return r;
+    }
+    derr << " fixed " << fixed << " extents" << dendl;
+  }
+
+  dout(10) << __func__ << " loaded " << kv_free.size() << " extents" << dendl;
+  return 0;
+}
+
+void ExtentFreelistManager::shutdown()  // FreelistManager override; releases nothing, it only logs the call
+{
+ dout(1) << __func__ << dendl;
+}
+
+// Locked entry point for logging the freelist: takes `lock`, then hands off
+// to _dump(), which does the actual logging.
+void ExtentFreelistManager::dump()
+{
+  std::lock_guard<std::mutex> guard(lock);
+  _dump();
+}
+
+// Rewind the enumeration cursor to the first cached free extent, under
+// `lock`.  Call before a sequence of enumerate_next() calls.
+void ExtentFreelistManager::enumerate_reset()
+{
+  std::lock_guard<std::mutex> guard(lock);
+  enumerate_p = kv_free.begin();
+}
+
+// Report the extent under the enumeration cursor and advance the cursor.
+//
+// On success stores the extent in *offset / *length and returns true.
+// Returns false, leaving both outputs untouched, once the cursor has reached
+// the end of kv_free.  Runs under `lock`.
+bool ExtentFreelistManager::enumerate_next(uint64_t *offset, uint64_t *length)
+{
+  std::lock_guard<std::mutex> guard(lock);
+  const bool have_extent = (enumerate_p != kv_free.end());
+  if (have_extent) {
+    *offset = enumerate_p->first;
+    *length = enumerate_p->second;
+    ++enumerate_p;
+  }
+  return have_extent;
+}
+
+// Log total_free, the number of cached extents, and then each extent as
+// "offset~length", all at debug level 30.  Does not take `lock` itself.
+void ExtentFreelistManager::_dump()
+{
+  dout(30) << __func__ << " " << total_free
+	   << " in " << kv_free.size() << " extents" << dendl;
+  for (const auto& extent : kv_free) {
+    dout(30) << __func__ << " " << extent.first << "~" << extent.second << dendl;
+  }
+}
+
+// Consistency check: the lengths cached in kv_free must add up to
+// total_free.  On a mismatch, log both values and the whole map, then
+// assert.  Does not take `lock` itself.
+void ExtentFreelistManager::_audit()
+{
+  uint64_t cached_total = 0;
+  for (auto it = kv_free.begin(); it != kv_free.end(); ++it) {
+    cached_total += it->second;
+  }
+  if (total_free == cached_total)
+    return;
+
+  derr << __func__ << " sum " << cached_total << " != total_free " << total_free
+       << dendl;
+  derr << kv_free << dendl;
+  assert(0 == "freelistmanager bug");
+}
+
+void ExtentFreelistManager::allocate(  // carve offset~length out of the single free extent that contains it
+ uint64_t offset, uint64_t length,
+ KeyValueDB::Transaction txn)  // txn receives the matching rmkey/set updates; it is not submitted here
+{
+ std::lock_guard<std::mutex> l(lock);
+ dout(10) << __func__ << " " << offset << "~" << length << dendl;
+ total_free -= length;
+ auto p = kv_free.lower_bound(offset);  // first free extent starting at or after offset
+ if ((p == kv_free.end() || p->first > offset) &&
+ p != kv_free.begin()) {
+ --p;  // no extent starts exactly at offset: look at the one starting before it
+ }
+ if (p == kv_free.end() ||  // the range must lie entirely inside extent p, otherwise log and assert
+ p->first > offset ||
+ p->first + p->second < offset + length) {
+ derr << " bad allocate " << offset << "~" << length << " - dne" << dendl;
+ if (p != kv_free.end()) {
+ derr << " existing extent " << p->first << "~" << p->second << dendl;
+ }
+ _dump();
+ assert(0 == "bad allocate");
+ }
+
+ if (p->first == offset) {  // range begins at the extent's start: its key goes away
+ string key;
+ _key_encode_u64(offset, &key);
+ txn->rmkey(prefix, key);
+ dout(20) << __func__ << " rm " << p->first << "~" << p->second << dendl;
+ if (p->second > length) {  // extent is longer than the range: re-add the remainder under a new key
+ uint64_t newoff = offset + length;
+ uint64_t newlen = p->second - length;
+ string newkey;
+ _key_encode_u64(newoff, &newkey);
+ bufferlist newvalue;
+ ::encode(newlen, newvalue);
+ txn->set(prefix, newkey, newvalue);
+ dout(20) << __func__ << " set " << newoff << "~" << newlen
+ << " (remaining tail)" << dendl;
+ kv_free.erase(p);
+ kv_free[newoff] = newlen;
+ } else {
+ kv_free.erase(p);  // exact fit: nothing remains of the extent
+ }
+ } else {  // range begins inside the extent: the head keeps the existing key with a shorter length
+ assert(p->first < offset);
+ // shorten
+ uint64_t newlen = offset - p->first;
+ string key;
+ _key_encode_u64(p->first, &key);
+ bufferlist newvalue;
+ ::encode(newlen, newvalue);
+ txn->set(prefix, key, newvalue);
+ dout(30) << __func__ << " set " << p->first << "~" << newlen
+ << " (remaining head from " << p->second << ")" << dendl;
+ if (p->first + p->second > offset + length) {
+ // new trailing piece, too
+ uint64_t tailoff = offset + length;
+ uint64_t taillen = p->first + p->second - (offset + length);
+ string tailkey;
+ _key_encode_u64(tailoff, &tailkey);
+ bufferlist tailvalue;
+ ::encode(taillen, tailvalue);
+ txn->set(prefix, tailkey, tailvalue);
+ dout(20) << __func__ << " set " << tailoff << "~" << taillen
+ << " (remaining tail from " << p->first << "~" << p->second << ")"
+ << dendl;
+ p->second = newlen;  // NOTE(review): presumably updated before the insert below because a btree_map insert may invalidate p - confirm
+ kv_free[tailoff] = taillen;
+ } else {
+ p->second = newlen;
+ }
+ }
+ if (g_conf->bluestore_debug_freelist)  // optional check that total_free still matches kv_free
+ _audit();
+}
+
+void ExtentFreelistManager::release(  // return offset~length to the free set, merging with adjacent free extents
+ uint64_t offset, uint64_t length,
+ KeyValueDB::Transaction txn)  // txn receives the matching rmkey/set updates; it is not submitted here
+{
+ std::lock_guard<std::mutex> l(lock);
+ dout(10) << __func__ << " " << offset << "~" << length << dendl;
+ total_free += length;
+ auto p = kv_free.lower_bound(offset);  // first free extent starting at or after offset
+
+ // contiguous with previous extent?
+ if (p != kv_free.begin()) {
+ --p;  // p is now the extent starting before offset
+ if (p->first + p->second == offset) {
+ string prevkey;
+ _key_encode_u64(p->first, &prevkey);
+ txn->rmkey(prefix, prevkey);
+ dout(20) << __func__ << " rm " << p->first << "~" << p->second
+ << " (merge with previous)" << dendl;
+ length += p->second;  // the released range grows backwards to cover the previous extent
+ offset = p->first;
+ if (map_t_has_stable_iterators) {  // either way p ends up on the extent after the erased one
+ kv_free.erase(p++);
+ } else {
+ p = kv_free.erase(p);
+ }
+ } else if (p->first + p->second > offset) {  // previous extent runs into the released range: already free
+ derr << __func__ << " bad release " << offset << "~" << length
+ << " overlaps with " << p->first << "~" << p->second << dendl;
+ _dump();
+ assert(0 == "bad release overlap");
+ } else {
+ dout(30) << __func__ << " previous extent " << p->first << "~" << p->second
+ << " is not contiguous" << dendl;
+ ++p;  // move back to the lower_bound position for the next-extent check
+ }
+ }
+
+ // contiguous with next extent?
+ if (p != kv_free.end()) {
+ if (p->first == offset + length) {
+ string tailkey;
+ _key_encode_u64(p->first, &tailkey);
+ txn->rmkey(prefix, tailkey);
+ dout(20) << __func__ << " rm " << p->first << "~" << p->second
+ << " (merge with next)" << dendl;
+ length += p->second;  // the released range grows forwards to cover the next extent
+ kv_free.erase(p);
+ } else if (p->first < offset + length) {  // next extent starts inside the released range: already free
+ derr << __func__ << " bad release " << offset << "~" << length
+ << " overlaps with " << p->first << "~" << p->second << dendl;
+ _dump();
+ assert(0 == "bad release overlap");
+ } else {
+ dout(30) << __func__ << " next extent " << p->first << "~" << p->second
+ << " is not contiguous" << dendl;
+ }
+ }
+
+ string key;  // write the (possibly merged) extent to both the transaction and the cache
+ _key_encode_u64(offset, &key);
+ bufferlist value;
+ ::encode(length, value);
+ txn->set(prefix, key, value);
+ dout(20) << __func__ << " set " << offset << "~" << length << dendl;
+
+ kv_free[offset] = length;
+
+ if (g_conf->bluestore_debug_freelist)  // optional check that total_free still matches kv_free
+ _audit();
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_OS_BLUESTORE_EXTENTFREELISTMANAGER_H
+#define CEPH_OS_BLUESTORE_EXTENTFREELISTMANAGER_H
+
+#include <string>
+#include <map>
+#include <mutex>
+#include <ostream>
+#include "FreelistManager.h"
+
+#include "include/cpp-btree/btree_map.h"
+
+/// FreelistManager implementation that stores free space as offset -> length
+/// extents: one kv record per extent under `prefix`, mirrored in memory by
+/// kv_free.
+class ExtentFreelistManager : public FreelistManager {
+  using map_t = btree::btree_map<uint64_t,uint64_t>;
+  /// selects which erase idiom release() uses when it merges with the
+  /// previous extent
+  static const bool map_t_has_stable_iterators = false;
+
+  std::string prefix;        ///< kv prefix handed to init()
+  std::mutex lock;           ///< taken by dump/enumerate_*/allocate/release
+  uint64_t total_free = 0;   ///< running sum of the lengths in kv_free
+
+  map_t kv_free;  ///< mirrors our kv values in the db
+
+  map_t::const_iterator enumerate_p;  ///< cursor for enumerate_next()
+
+  void _audit();  ///< assert that total_free matches kv_free
+  void _dump();   ///< log the extents; takes no lock itself
+
+public:
+  ExtentFreelistManager() {}
+
+  int init(KeyValueDB *kvdb, std::string prefix) override;
+  void shutdown() override;
+
+  void dump() override;
+
+  void enumerate_reset() override;
+  bool enumerate_next(uint64_t *offset, uint64_t *length) override;
+
+  void allocate(
+    uint64_t offset, uint64_t length,
+    KeyValueDB::Transaction txn) override;
+  void release(
+    uint64_t offset, uint64_t length,
+    KeyValueDB::Transaction txn) override;
+};
+
+
+#endif
// vim: ts=8 sw=2 smarttab
#include "FreelistManager.h"
-#include "kv/KeyValueDB.h"
-#include "kv.h"
+#include "ExtentFreelistManager.h"
-#include "common/debug.h"
-
-#define dout_subsys ceph_subsys_bluestore
-#undef dout_prefix
-#define dout_prefix *_dout << "freelist "
-
-int FreelistManager::init(KeyValueDB *db, string p)
+FreelistManager *FreelistManager::create(string type)
{
- dout(1) << __func__ << " prefix " << p << dendl;
-
- // load state from kvstore
- prefix = p;
-
- KeyValueDB::Transaction txn = db->get_transaction();
- int fixed = 0;
-
- KeyValueDB::Iterator it = db->get_iterator(prefix);
- it->lower_bound(string());
- uint64_t last_offset = 0;
- uint64_t last_length = 0;
- while (it->valid()) {
- uint64_t offset, length;
- string k = it->key();
- const char *p = _key_decode_u64(k.c_str(), &offset);
- assert(p);
- bufferlist bl = it->value();
- bufferlist::iterator bp = bl.begin();
- ::decode(length, bp);
-
- total_free += length;
-
- if (offset < last_offset + last_length) {
- derr << __func__ << " detected overlapping extent on load, had "
- << last_offset << "~" << last_length
- << " and got "
- << offset << "~" << length
- << dendl;
- return -EIO;
- }
- if (offset && offset == last_offset + last_length) {
- derr << __func__ << " detected contiguous extent on load, merging "
- << last_offset << "~" << last_length << " with "
- << offset << "~" << length
- << dendl;
- kv_free.erase(last_offset);
- string key;
- _key_encode_u64(last_offset, &key);
- txn->rmkey(prefix, key);
- offset -= last_length;
- length += last_length;
- bufferlist value;
- ::encode(length, value);
- txn->set(prefix, key, value);
- fixed++;
- }
-
- kv_free[offset] = length;
- dout(20) << __func__ << " " << offset << "~" << length << dendl;
-
- last_offset = offset;
- last_length = length;
- it->next();
- }
-
- if (fixed) {
- db->submit_transaction_sync(txn);
- derr << " fixed " << fixed << " extents" << dendl;
- }
-
- dout(10) << __func__ << " loaded " << kv_free.size() << " extents" << dendl;
- return 0;
-}
-
-void FreelistManager::shutdown()
-{
- dout(1) << __func__ << dendl;
-}
-
-void FreelistManager::dump()
-{
- std::lock_guard<std::mutex> l(lock);
- _dump();
-}
-
-void FreelistManager::enumerate_reset()
-{
- std::lock_guard<std::mutex> l(lock);
- enumerate_p = kv_free.begin();
-}
-
-bool FreelistManager::enumerate_next(uint64_t *offset, uint64_t *length)
-{
- std::lock_guard<std::mutex> l(lock);
- if (enumerate_p == kv_free.end())
- return false;
- *offset = enumerate_p->first;
- *length = enumerate_p->second;
- ++enumerate_p;
- return true;
-}
-
-void FreelistManager::_dump()
-{
- dout(30) << __func__ << " " << total_free
- << " in " << kv_free.size() << " extents" << dendl;
- for (auto p = kv_free.begin();
- p != kv_free.end();
- ++p) {
- dout(30) << __func__ << " " << p->first << "~" << p->second << dendl;
- }
-}
-
-void FreelistManager::_audit()
-{
- uint64_t sum = 0;
- for (auto& p : kv_free) {
- sum += p.second;
- }
- if (total_free != sum) {
- derr << __func__ << " sum " << sum << " != total_free " << total_free
- << dendl;
- derr << kv_free << dendl;
- assert(0 == "freelistmanager bug");
- }
-}
-
-void FreelistManager::allocate(
- uint64_t offset, uint64_t length,
- KeyValueDB::Transaction txn)
-{
- std::lock_guard<std::mutex> l(lock);
- dout(10) << __func__ << " " << offset << "~" << length << dendl;
- total_free -= length;
- auto p = kv_free.lower_bound(offset);
- if ((p == kv_free.end() || p->first > offset) &&
- p != kv_free.begin()) {
- --p;
- }
- if (p == kv_free.end() ||
- p->first > offset ||
- p->first + p->second < offset + length) {
- derr << " bad allocate " << offset << "~" << length << " - dne" << dendl;
- if (p != kv_free.end()) {
- derr << " existing extent " << p->first << "~" << p->second << dendl;
- }
- _dump();
- assert(0 == "bad allocate");
- }
-
- if (p->first == offset) {
- string key;
- _key_encode_u64(offset, &key);
- txn->rmkey(prefix, key);
- dout(20) << __func__ << " rm " << p->first << "~" << p->second << dendl;
- if (p->second > length) {
- uint64_t newoff = offset + length;
- uint64_t newlen = p->second - length;
- string newkey;
- _key_encode_u64(newoff, &newkey);
- bufferlist newvalue;
- ::encode(newlen, newvalue);
- txn->set(prefix, newkey, newvalue);
- dout(20) << __func__ << " set " << newoff << "~" << newlen
- << " (remaining tail)" << dendl;
- kv_free.erase(p);
- kv_free[newoff] = newlen;
- } else {
- kv_free.erase(p);
- }
- } else {
- assert(p->first < offset);
- // shorten
- uint64_t newlen = offset - p->first;
- string key;
- _key_encode_u64(p->first, &key);
- bufferlist newvalue;
- ::encode(newlen, newvalue);
- txn->set(prefix, key, newvalue);
- dout(30) << __func__ << " set " << p->first << "~" << newlen
- << " (remaining head from " << p->second << ")" << dendl;
- if (p->first + p->second > offset + length) {
- // new trailing piece, too
- uint64_t tailoff = offset + length;
- uint64_t taillen = p->first + p->second - (offset + length);
- string tailkey;
- _key_encode_u64(tailoff, &tailkey);
- bufferlist tailvalue;
- ::encode(taillen, tailvalue);
- txn->set(prefix, tailkey, tailvalue);
- dout(20) << __func__ << " set " << tailoff << "~" << taillen
- << " (remaining tail from " << p->first << "~" << p->second << ")"
- << dendl;
- p->second = newlen;
- kv_free[tailoff] = taillen;
- } else {
- p->second = newlen;
- }
- }
- if (g_conf->bluestore_debug_freelist)
- _audit();
-}
-
-void FreelistManager::release(
- uint64_t offset, uint64_t length,
- KeyValueDB::Transaction txn)
-{
- std::lock_guard<std::mutex> l(lock);
- dout(10) << __func__ << " " << offset << "~" << length << dendl;
- total_free += length;
- auto p = kv_free.lower_bound(offset);
-
- // contiguous with previous extent?
- if (p != kv_free.begin()) {
- --p;
- if (p->first + p->second == offset) {
- string prevkey;
- _key_encode_u64(p->first, &prevkey);
- txn->rmkey(prefix, prevkey);
- dout(20) << __func__ << " rm " << p->first << "~" << p->second
- << " (merge with previous)" << dendl;
- length += p->second;
- offset = p->first;
- if (map_t_has_stable_iterators) {
- kv_free.erase(p++);
- } else {
- p = kv_free.erase(p);
- }
- } else if (p->first + p->second > offset) {
- derr << __func__ << " bad release " << offset << "~" << length
- << " overlaps with " << p->first << "~" << p->second << dendl;
- _dump();
- assert(0 == "bad release overlap");
- } else {
- dout(30) << __func__ << " previous extent " << p->first << "~" << p->second
- << " is not contiguous" << dendl;
- ++p;
- }
- }
-
- // contiguous with next extent?
- if (p != kv_free.end()) {
- if (p->first == offset + length) {
- string tailkey;
- _key_encode_u64(p->first, &tailkey);
- txn->rmkey(prefix, tailkey);
- dout(20) << __func__ << " rm " << p->first << "~" << p->second
- << " (merge with next)" << dendl;
- length += p->second;
- kv_free.erase(p);
- } else if (p->first < offset + length) {
- derr << __func__ << " bad release " << offset << "~" << length
- << " overlaps with " << p->first << "~" << p->second << dendl;
- _dump();
- assert(0 == "bad release overlap");
- } else {
- dout(30) << __func__ << " next extent " << p->first << "~" << p->second
- << " is not contiguous" << dendl;
- }
- }
-
- string key;
- _key_encode_u64(offset, &key);
- bufferlist value;
- ::encode(length, value);
- txn->set(prefix, key, value);
- dout(20) << __func__ << " set " << offset << "~" << length << dendl;
-
- kv_free[offset] = length;
-
- if (g_conf->bluestore_debug_freelist)
- _audit();
+ if (type == "extent")
+ return new ExtentFreelistManager;
+ return NULL;
}
#include <ostream>
#include "kv/KeyValueDB.h"
-#include "include/cpp-btree/btree_map.h"
-
class FreelistManager {
- std::string prefix;
- std::mutex lock;
- uint64_t total_free;
-
- typedef btree::btree_map<uint64_t,uint64_t> map_t;
- static const bool map_t_has_stable_iterators = false;
-
- map_t kv_free; ///< mirrors our kv values in the db
-
- map_t::const_iterator enumerate_p;
+public:
+ FreelistManager() {}
+ virtual ~FreelistManager() {}
- void _audit();
- void _dump();
+ static FreelistManager *create(string type);
-public:
- FreelistManager() :
- total_free(0) {
+ virtual int create(KeyValueDB::Transaction txn) {
+ return 0;
}
- int init(KeyValueDB *kvdb, std::string prefix);
- void shutdown();
+ virtual int init(KeyValueDB *kvdb, std::string prefix) = 0;
+ virtual void shutdown() = 0;
- void dump();
-
- uint64_t get_total_free() {
- std::lock_guard<std::mutex> l(lock);
- return total_free;
- }
+ virtual void dump() = 0;
- void enumerate_reset();
- bool enumerate_next(uint64_t *offset, uint64_t *length);
+ virtual void enumerate_reset() = 0;
+ virtual bool enumerate_next(uint64_t *offset, uint64_t *length) = 0;
- void allocate(
+ virtual void allocate(
uint64_t offset, uint64_t length,
- KeyValueDB::Transaction txn);
- void release(
+ KeyValueDB::Transaction txn) = 0;
+ virtual void release(
uint64_t offset, uint64_t length,
- KeyValueDB::Transaction txn);
+ KeyValueDB::Transaction txn) = 0;
};