BlueFS::BlueFS()
: ino_last(0),
log_seq(0),
- log_writer(NULL)
+ log_writer(NULL),
+ bdev(MAX_BDEV), // one slot per device id; pointer slots start out null
+ ioc(MAX_BDEV), // indexed by the same device id as bdev
+ block_all(MAX_BDEV), // one (initially empty) extent set per slot
+ block_total(MAX_BDEV, 0) // running sum of extent lengths per slot
{
}
BlueFS::~BlueFS()
{
for (auto p : bdev) {
- p->close();
- delete p;
+ if (p) {
+ p->close();
+ delete p;
+ }
}
for (auto p : ioc) {
delete p;
int BlueFS::add_block_device(unsigned id, string path)
{
dout(10) << __func__ << " bdev " << id << " path " << path << dendl;
- assert(id == bdev.size());
+ assert(id < bdev.size());
+ assert(bdev[id] == NULL);
BlockDevice *b = BlockDevice::create(path, NULL, NULL); //aio_cb, this);
int r = b->open(path);
if (r < 0) {
}
dout(1) << __func__ << " bdev " << id << " path " << path
<< " size " << pretty_si_t(b->get_size()) << "B" << dendl;
- bdev.push_back(b);
- ioc.push_back(new IOContext(NULL));
- block_all.resize(bdev.size());
+ bdev[id] = b;
+ ioc[id] = new IOContext(NULL);
return 0;
}
uint64_t BlueFS::get_block_device_size(unsigned id)
{
- return bdev[id]->get_size();
+ if (bdev[id]) // a slot can be empty now that device ids are fixed constants
+ return bdev[id]->get_size();
+ return 0; // no device registered in this slot
}
void BlueFS::add_block_extent(unsigned id, uint64_t offset, uint64_t length)
dout(1) << __func__ << " bdev " << id << " " << offset << "~" << length
<< dendl;
assert(id < bdev.size());
+ assert(bdev[id]);
assert(bdev[id]->get_size() >= offset + length);
block_all[id].insert(offset, length);
+ block_total[id] += length;
if (alloc.size()) {
log_t.op_alloc_add(id, offset, length);
std::lock_guard<std::mutex> l(lock);
dout(1) << __func__ << " bdev " << id << " want " << want << dendl;
assert(id < alloc.size());
+ assert(alloc[id]);
int r = alloc[id]->reserve(want);
assert(r == 0); // caller shouldn't ask for more than they can get
alloc[id]->unreserve(want - *length);
block_all[id].erase(*offset, *length);
+ block_total[id] -= *length;
log_t.op_alloc_rm(id, *offset, *length);
r = _flush_log();
assert(r == 0);
{
std::lock_guard<std::mutex> l(lock);
assert(id < block_all.size());
- uint64_t r = 0;
- interval_set<uint64_t>& p = block_all[id];
- for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
- r += q.get_len();
- }
- return r;
+ return block_total[id];
}
uint64_t BlueFS::get_free(unsigned id)
std::lock_guard<std::mutex> l(lock);
usage->resize(bdev.size());
for (unsigned id = 0; id < bdev.size(); ++id) {
- uint64_t total = 0;
- interval_set<uint64_t>& p = block_all[id];
- for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
- total += q.get_len();
+ if (!bdev[id]) {
+ (*usage)[id] = make_pair(0, 0);
+ continue;
}
(*usage)[id].first = alloc[id]->get_free();
- (*usage)[id].second = total;
- uint64_t used = (total - (*usage)[id].first) * 100 / total;
+ (*usage)[id].second = block_total[id];
+ // a registered device may not own any extents yet, in which case
+ // block_total[id] is 0; report 0% used instead of dividing by zero
+ uint64_t used = 0;
+ if (block_total[id]) {
+ used = (block_total[id] - (*usage)[id].first) * 100 / block_total[id];
+ }
dout(10) << __func__ << " bdev " << id
<< " free " << (*usage)[id].first
<< " (" << pretty_si_t((*usage)[id].first) << "B)"
dout(1) << __func__
<< " osd_uuid " << osd_uuid
<< dendl;
- assert(bdev.size() >= 1);
_init_alloc();
super.version = 1;
- super.block_size = bdev[0]->get_block_size();
+ super.block_size = bdev[BDEV_DB]->get_block_size();
super.osd_uuid = osd_uuid;
super.uuid.generate_random();
dout(1) << __func__ << " uuid " << super.uuid << dendl;
// init log
FileRef log_file = new File;
log_file->fnode.ino = 1;
- log_file->fnode.prefer_bdev = bdev.size() - 1;
- int r = _allocate(log_file->fnode.prefer_bdev,
- g_conf->bluefs_max_log_runway,
- &log_file->fnode.extents);
+ log_file->fnode.prefer_bdev = BDEV_WAL;
+ int r = _allocate(
+ log_file->fnode.prefer_bdev,
+ g_conf->bluefs_max_log_runway,
+ &log_file->fnode.extents);
assert(r == 0);
- log_writer = new FileWriter(log_file, bdev.size());
+ log_writer = _create_writer(log_file);
// initial txn
log_t.op_init();
- for (unsigned bdev = 0; bdev < block_all.size(); ++bdev) {
+ for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
interval_set<uint64_t>& p = block_all[bdev];
+ if (p.empty())
+ continue;
for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
dout(20) << __func__ << " op_alloc_add " << bdev << " " << q.get_start()
<< "~" << q.get_len() << dendl;
_close_writer(log_writer);
log_writer = NULL;
block_all.clear();
+ block_total.clear();
_stop_alloc();
dout(10) << __func__ << " success" << dendl;
void BlueFS::_init_alloc()
{
dout(20) << __func__ << dendl;
- alloc.resize(bdev.size());
+ alloc.resize(MAX_BDEV);
for (unsigned id = 0; id < bdev.size(); ++id) {
+ if (!bdev[id])
+ continue;
alloc[id] = new StupidAllocator;
interval_set<uint64_t>& p = block_all[id];
for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
int BlueFS::mount()
{
dout(1) << __func__ << dendl;
- assert(!bdev.empty());
int r = _open_super();
if (r < 0) {
}
block_all.clear();
- block_all.resize(bdev.size());
+ block_all.resize(MAX_BDEV);
+ block_total.clear();
+ block_total.resize(MAX_BDEV, 0);
_init_alloc();
r = _replay();
}
// set up the log for future writes
- log_writer = new FileWriter(_get_file(1), bdev.size());
+ log_writer = _create_writer(_get_file(1));
assert(log_writer->file->fnode.ino == 1);
log_writer->pos = log_writer->file->fnode.size;
dout(10) << __func__ << " log write pos set to " << log_writer->pos << dendl;
_close_writer(log_writer);
log_writer = NULL;
- block_all.clear();
_stop_alloc();
file_map.clear();
dir_map.clear();
bl.rebuild();
IOContext ioc(NULL);
- bdev[0]->aio_write(get_super_offset(), bl, &ioc, false);
- bdev[0]->aio_submit(&ioc);
+ bdev[BDEV_DB]->aio_write(get_super_offset(), bl, &ioc, false);
+ bdev[BDEV_DB]->aio_submit(&ioc);
ioc.aio_wait();
dout(20) << __func__ << " v " << super.version << " crc " << crc
<< " offset " << get_super_offset() << dendl;
int r;
// always the second block
- r = bdev[0]->read(get_super_offset(), get_super_length(),
- &bl, ioc[0], false);
+ r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(),
+ &bl, ioc[BDEV_DB], false);
if (r < 0)
return r;
dout(20) << __func__ << " " << pos << ": op_alloc_add "
<< " " << (int)id << ":" << offset << "~" << length << dendl;
block_all[id].insert(offset, length);
+ block_total[id] += length;
alloc[id]->init_add_free(offset, length);
}
break;
dout(20) << __func__ << " " << pos << ": op_alloc_rm "
<< " " << (int)id << ":" << offset << "~" << length << dendl;
block_all[id].erase(offset, length);
+ block_total[id] -= length;
alloc[id]->init_rm_free(offset, length);
}
break;
t.uuid = super.uuid;
dout(20) << __func__ << " op_init" << dendl;
t.op_init();
- for (unsigned bdev = 0; bdev < block_all.size(); ++bdev) {
+ for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
interval_set<uint64_t>& p = block_all[bdev];
for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
dout(20) << __func__ << " op_alloc_add " << bdev << " " << q.get_start()
_close_writer(log_writer);
log_file->fnode.size = bl.length();
- log_writer = new FileWriter(log_file, bdev.size());
+ log_writer = _create_writer(log_file);
log_writer->append(bl);
int r = _flush(log_writer, true);
assert(r == 0);
length += partial;
dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
for (auto p : h->iocv) {
- p->aio_wait();
+ if (p) {
+ p->aio_wait();
+ }
}
}
if (length == partial + h->buffer.length()) {
++p;
x_off = 0;
}
- for (unsigned i = 0; i < bdev.size(); ++i) {
- if (h->iocv[i]->has_aios()) {
+ for (unsigned i = 0; i < MAX_BDEV; ++i) {
+ if (!bdev[i])
+ continue; // empty device slot: nothing to submit
+ // check the per-device IOContext *before* it is dereferenced below;
+ // asserting after has_aios() could never catch a null pointer
+ assert(h->iocv[i]);
+ if (h->iocv[i]->has_aios()) {
bdev[i]->aio_submit(h->iocv[i]);
}
}
dout(10) << __func__ << " " << h << dendl;
utime_t start = ceph_clock_now(NULL);
for (auto p : h->iocv) {
- p->aio_wait();
+ if (p) {
+ p->aio_wait();
+ }
}
utime_t end = ceph_clock_now(NULL);
utime_t dur = end - start;
{
dout(20) << __func__ << dendl;
for (auto p : bdev) {
- p->flush();
+ if (p)
+ p->flush();
}
}
assert(id < alloc.size());
uint64_t left = ROUND_UP_TO(len, g_conf->bluefs_alloc_size);
- int r = alloc[id]->reserve(left);
+ int r = -ENOSPC;
+ if (alloc[id]) {
+ r = alloc[id]->reserve(left);
+ }
if (r < 0) {
- if (id) {
- derr << __func__ << " failed to allocate " << left << " on bdev " << id
- << ", free " << alloc[id]->get_free()
- << "; fallback to bdev 0" << dendl;
- return _allocate(0, len, ev);
+ if (id != BDEV_SLOW) {
+ if (bdev[id])
+ derr << __func__ << " failed to allocate " << left << " on bdev " << id
+ << ", free " << alloc[id]->get_free()
+ << "; fallback to bdev " << id + 1 << dendl;
+ return _allocate(id + 1, len, ev);
}
- derr << __func__ << " failed to allocate " << left << " on bdev " << id
- << ", free " << alloc[id]->get_free() << dendl;
+ if (bdev[id])
+ derr << __func__ << " failed to allocate " << left << " on bdev " << id
+ << ", free " << alloc[id]->get_free() << dendl;
+ else
+ derr << __func__ << " failed to allocate " << left << " on bdev " << id
+ << ", dne" << dendl;
return r;
}
dout(10) << __func__ << dendl;
utime_t start = ceph_clock_now(NULL);
for (auto p : alloc) {
- p->commit_start();
+ if (p) {
+ p->commit_start();
+ }
}
_flush_log();
for (auto p : alloc) {
- p->commit_finish();
+ if (p) {
+ p->commit_finish();
+ }
}
_maybe_compact_log();
utime_t end = ceph_clock_now(NULL);
file->fnode.mtime = ceph_clock_now(NULL);
}
+ file->fnode.prefer_bdev = BlueFS::BDEV_DB;
if (dirname.length() > 5) {
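- // the "db.slow" and "db.wal" directory names are hard-coded at
- // match up with bluestore. the slow device is always the second
- // one (when a dedicated block.db device is present and used at
- // bdev 0). the wal device is always last.
+ // the "db.slow" and "db.wal" directory name suffixes are hard-coded
+ // to match up with bluestore. files under a ".slow" directory prefer
+ // BDEV_SLOW, files under a ".wal" directory prefer BDEV_WAL, and
+ // everything else keeps the BDEV_DB default assigned just above.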
if (strcmp(dirname.c_str() + dirname.length() - 5, ".slow") == 0) {
- assert(bdev.size() > 1);
- dout(20) << __func__ << " mapping " << dirname << "/" << filename
- << " to bdev 1" << dendl;
- file->fnode.prefer_bdev = 1;
+ file->fnode.prefer_bdev = BlueFS::BDEV_SLOW;
} else if (strcmp(dirname.c_str() + dirname.length() - 4, ".wal") == 0) {
- assert(bdev.size() > 1);
- file->fnode.prefer_bdev = bdev.size() - 1;
- dout(20) << __func__ << " mapping " << dirname << "/" << filename
- << " to bdev " << (int)file->fnode.prefer_bdev << dendl;
+ file->fnode.prefer_bdev = BlueFS::BDEV_WAL;
}
}
+ dout(20) << __func__ << " mapping " << dirname << "/" << filename
+ << " to bdev " << (int)file->fnode.prefer_bdev << dendl;
log_t.op_file_update(file->fnode);
if (create)
log_t.op_dir_link(dirname, filename, file->fnode.ino);
- *h = new FileWriter(file, bdev.size());
+ *h = _create_writer(file);
dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
return 0;
}
+BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
+{
+ // build a writer for f: each registered device slot gets its own fresh
+ // IOContext, while slots with no device are left as NULL
+ FileWriter *writer = new FileWriter(f);
+ for (unsigned dev = 0; dev < MAX_BDEV; ++dev) {
+ writer->iocv[dev] = bdev[dev] ? new IOContext(NULL) : NULL;
+ }
+ return writer;
+}
+
void BlueFS::_close_writer(FileWriter *h)
{
dout(10) << __func__ << " " << h << dendl;
- for (unsigned i=0; i<bdev.size(); ++i) {
- h->iocv[i]->aio_wait();
- bdev[i]->queue_reap_ioc(h->iocv[i]);
+ for (unsigned i=0; i<MAX_BDEV; ++i) {
+ if (bdev[i]) { // skip device slots that were never registered
+ assert(h->iocv[i]); // _create_writer allocates one per registered device
+ h->iocv[i]->aio_wait();
+ bdev[i]->queue_reap_ioc(h->iocv[i]); // NOTE(review): presumably the device frees the IOContext later; confirm
+ }
}
- h->iocv.clear();
delete h;
}
class BlueFS {
public:
+ static constexpr unsigned MAX_BDEV = 3;
+ static constexpr unsigned BDEV_WAL = 0;
+ static constexpr unsigned BDEV_DB = 1;
+ static constexpr unsigned BDEV_SLOW = 2;
+
struct File : public RefCountedObject {
bluefs_fnode_t fnode;
int refs;
bufferlist tail_block; ///< existing partial block at end of file, if any
std::mutex lock;
- vector<IOContext*> iocv; ///< one for each bdev
+ std::array<IOContext*,MAX_BDEV> iocv; ///< for each bdev
- FileWriter(FileRef f, unsigned num_bdev)
+ FileWriter(FileRef f)
: file(f),
pos(0) {
++file->num_writers;
- iocv.resize(num_bdev);
- for (unsigned i = 0; i < num_bdev; ++i) {
- iocv[i] = new IOContext(NULL);
- }
+ // a std::array of raw pointers is not zeroed by default; start every
+ // slot as null so the `if (p)` checks over iocv never see garbage,
+ // even for a writer that has not had its per-device contexts assigned
+ iocv.fill(nullptr);
}
+ // NOTE: caller must call BlueFS::close_writer()
~FileWriter() {
--file->num_writers;
- assert(iocv.empty()); // caller must call BlueFS::close_writer()
}
-
void append(const char *buf, size_t len) {
buffer.append(buf, len);
}
bluefs_transaction_t log_t; ///< pending, unwritten log transaction
/*
- * - there can be from 1 to 3 block devices.
- *
- * - the first device always has the superblock.
- *
- * - if there is a dedicated db device, it is the first device, and the
- * second device is shared with bluestore. the first device will be
- * db/, and the second device will be db.slow/.
- *
- * - if there is no dedicated db device, then the first device is shared, and
- * maps to the db/ directory.
+ * There are up to 3 block devices:
*
- * - a wal device, if present, it always the last device. it should be
- * used for any files in the db.wal/ directory.
+ * BDEV_DB db/ - the primary db device
+ * BDEV_WAL db.wal/ - a small, fast device, specifically for the WAL
+ * BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills
*/
vector<BlockDevice*> bdev; ///< block devices we can use
vector<IOContext*> ioc; ///< IOContexts for bdevs
vector<interval_set<uint64_t> > block_all; ///< extents in bdev we own
+ vector<uint64_t> block_total; ///< sum of block_all
vector<Allocator*> alloc; ///< allocators for bdevs
void _init_alloc();
int _write_super();
int _replay(); ///< replay journal
+ FileWriter *_create_writer(FileRef f);
void _close_writer(FileWriter *h);
// always put the super in the second 4k block. FIXME should this be
} else if (s == "0") {
do_bluefs = false;
} else {
- derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting" << dendl;
+ derr << __func__ << " bluefs = " << s << " : not 0 or 1, aborting"
+ << dendl;
return -EIO;
}
}
char bfn[PATH_MAX];
struct stat st;
- int id = 0;
snprintf(bfn, sizeof(bfn), "%s/block.db", path.c_str());
if (::stat(bfn, &st) == 0) {
- r = bluefs->add_block_device(id, bfn);
+ r = bluefs->add_block_device(BlueFS::BDEV_DB, bfn);
if (r < 0) {
derr << __func__ << " add block device(" << bfn << ") returned: "
<< cpp_strerror(r) << dendl;
goto free_bluefs;
}
- r = _check_or_set_bdev_label(bfn, bluefs->get_block_device_size(id),
+ r = _check_or_set_bdev_label(
+ bfn,
+ bluefs->get_block_device_size(BlueFS::BDEV_DB),
"bluefs db", create);
if (r < 0) {
- derr << __func__ << " check block device(" << bfn << ") label returned: "
+ derr << __func__
+ << " check block device(" << bfn << ") label returned: "
<< cpp_strerror(r) << dendl;
goto free_bluefs;
}
if (create) {
bluefs->add_block_extent(
- id, BLUEFS_START,
- bluefs->get_block_device_size(id) - BLUEFS_START);
+ BlueFS::BDEV_DB,
+ BLUEFS_START,
+ bluefs->get_block_device_size(BlueFS::BDEV_DB) - BLUEFS_START);
}
- ++id;
+ bluefs_shared_bdev = BlueFS::BDEV_SLOW;
+ } else {
+ bluefs_shared_bdev = BlueFS::BDEV_DB;
}
+ // shared device
snprintf(bfn, sizeof(bfn), "%s/block", path.c_str());
- r = bluefs->add_block_device(id, bfn);
+ r = bluefs->add_block_device(bluefs_shared_bdev, bfn);
if (r < 0) {
derr << __func__ << " add block device(" << bfn << ") returned: "
- << cpp_strerror(r) << dendl;
+ << cpp_strerror(r) << dendl;
goto free_bluefs;
}
if (create) {
// align to bluefs's alloc_size
initial = ROUND_UP_TO(initial, g_conf->bluefs_alloc_size);
initial += g_conf->bluefs_alloc_size - BLUEFS_START;
- bluefs->add_block_extent(id, BLUEFS_START, initial);
+ bluefs->add_block_extent(bluefs_shared_bdev, BLUEFS_START, initial);
bluefs_extents.insert(BLUEFS_START, initial);
}
- bluefs_shared_bdev = id;
- ++id;
- if (id == 2) {
+
+ if (bluefs_shared_bdev == BlueFS::BDEV_SLOW) {
// we have both block.db and block; tell rocksdb!
// note: the second (last) size value doesn't really matter
char db_paths[PATH_MAX*3];
snprintf(
db_paths, sizeof(db_paths), "%s/db,%lld %s/db.slow,%lld",
path.c_str(),
- (unsigned long long)bluefs->get_block_device_size(0) * 95 / 100,
+ (unsigned long long)bluefs->get_block_device_size(BlueFS::BDEV_DB) *
+ 95 / 100,
path.c_str(),
- (unsigned long long)bluefs->get_block_device_size(1) * 95 / 100);
+ (unsigned long long)bluefs->get_block_device_size(BlueFS::BDEV_SLOW) *
+ 95 / 100);
g_conf->set_val("rocksdb_db_paths", db_paths, false, false);
dout(10) << __func__ << " set rocksdb_db_paths to "
<< g_conf->rocksdb_db_paths << dendl;
snprintf(bfn, sizeof(bfn), "%s/block.wal", path.c_str());
if (::stat(bfn, &st) == 0) {
- r = bluefs->add_block_device(id, bfn);
+ r = bluefs->add_block_device(BlueFS::BDEV_WAL, bfn);
if (r < 0) {
derr << __func__ << " add block device(" << bfn << ") returned: "
<< cpp_strerror(r) << dendl;
goto free_bluefs;
}
- r = _check_or_set_bdev_label(bfn, bluefs->get_block_device_size(id),
+ r = _check_or_set_bdev_label(
+ bfn,
+ bluefs->get_block_device_size(BlueFS::BDEV_WAL),
"bluefs wal", create);
if (r < 0) {
- derr << __func__ << " check block device(" << bfn << ") label returned: "
+ derr << __func__ << " check block device(" << bfn << ") label returned: "
<< cpp_strerror(r) << dendl;
goto free_bluefs;
}
if (create) {
bluefs->add_block_extent(
- id, BDEV_LABEL_BLOCK_SIZE,
- bluefs->get_block_device_size(id) - BDEV_LABEL_BLOCK_SIZE);
+ BlueFS::BDEV_WAL, BDEV_LABEL_BLOCK_SIZE,
+ bluefs->get_block_device_size(BlueFS::BDEV_WAL) -
+ BDEV_LABEL_BLOCK_SIZE);
}
g_conf->set_val("rocksdb_separate_wal_dir", "true");
} else {
delete bluefs;
bluefs = NULL;
}
- // delete env manually here since we can't depend on db to do this under this case
+ // delete env manually here since we can't depend on db to do this
+ // under this case
delete env;
env = NULL;
return -EIO;
string fn = get_temp_bdev(size);
uuid_d fsid;
BlueFS fs;
- fs.add_block_device(0, fn);
- fs.add_block_extent(0, 1048576, size - 1048576);
+ fs.add_block_device(BlueFS::BDEV_DB, fn);
+ fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
fs.mkfs(fsid);
rm_temp_bdev(fn);
}
uint64_t size = 1048476 * 128;
string fn = get_temp_bdev(size);
BlueFS fs;
- ASSERT_EQ(0, fs.add_block_device(0, fn));
- fs.add_block_extent(0, 1048576, size - 1048576);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, fn));
+ fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
uuid_d fsid;
ASSERT_EQ(0, fs.mkfs(fsid));
ASSERT_EQ(0, fs.mount());
- ASSERT_EQ(fs.get_total(0), size - 1048576);
- ASSERT_LT(fs.get_free(0), size - 1048576);
+ ASSERT_EQ(fs.get_total(BlueFS::BDEV_DB), size - 1048576);
+ ASSERT_LT(fs.get_free(BlueFS::BDEV_DB), size - 1048576);
fs.umount();
rm_temp_bdev(fn);
}
uint64_t size = 1048476 * 128;
string fn = get_temp_bdev(size);
BlueFS fs;
- ASSERT_EQ(0, fs.add_block_device(0, fn));
- fs.add_block_extent(0, 1048576, size - 1048576);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, fn));
+ fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
uuid_d fsid;
ASSERT_EQ(0, fs.mkfs(fsid));
ASSERT_EQ(0, fs.mount());
uint64_t size = 1048476 * 128;
string fn = get_temp_bdev(size);
BlueFS fs;
- ASSERT_EQ(0, fs.add_block_device(0, fn));
- fs.add_block_extent(0, 1048576, size - 1048576);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, fn));
+ fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576);
uuid_d fsid;
ASSERT_EQ(0, fs.mkfs(fsid));
ASSERT_EQ(0, fs.mount());