OPTION(bluefs_replay_recovery_disable_compact, OPT_BOOL)
OPTION(bluefs_check_for_zeros, OPT_BOOL)
+OPTION(bluefs_check_volume_selector_on_umount, OPT_BOOL) // debug: check validity of volume selector on umount
+OPTION(bluefs_check_volume_selector_often, OPT_BOOL) // debug: periodically check validity of volume selector
OPTION(bluestore_bluefs, OPT_BOOL)
OPTION(bluestore_bluefs_env_mirror, OPT_BOOL) // mirror to normal Env for debug
// how often (sec) to dump allocator on allocation failure
"If this happens, we re-read data. If there is difference, we print error to log.")
.add_see_also("bluestore_retry_disk_reads"),
+ Option("bluefs_check_volume_selector_on_umount", Option::TYPE_BOOL, Option::LEVEL_DEV) // dev-level bool, off by default, FLAG_RUNTIME; when set, _check_vselector_LNF() is called after sync_metadata()
+ .set_default(false)
+ .set_flag(Option::FLAG_RUNTIME)
+ .set_description("Check validity of volume selector on umount")
+ .set_long_description("Checks if volume selector did not diverge from the state it should be in. "
+ "Reference is constructed from bluefs inode table. Asserts on inconsistency."),
+ // Dev-level debug switch, off by default, flagged FLAG_STARTUP.
+ // Read by BlueFS::_maybe_check_vselector_LNF(), which runs the full
+ // volume selector consistency check whenever this is true.
+ Option("bluefs_check_volume_selector_often", Option::TYPE_BOOL, Option::LEVEL_DEV)
+ .set_default(false)
+ .set_flag(Option::FLAG_STARTUP)
+ .set_description("Periodically check validity of volume selector")
+ .set_long_description("Periodically checks if current volume selector does not diverge from the valid state. "
+ "Reference is constructed from bluefs inode table. Asserts on inconsistency. "
+ "This is a debug feature.")
+ .add_see_also("bluefs_check_volume_selector_on_umount"),
+
Option("bluestore_bluefs", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(true)
.set_flag(Option::FLAG_CREATE)
discard_cb[BDEV_DB] = db_discard_cb;
discard_cb[BDEV_SLOW] = slow_discard_cb;
asok_hook = SocketHook::create(this);
-
}
BlueFS::~BlueFS()
dout(1) << __func__ << dendl;
sync_metadata(avoid_compact);
-
+ if (cct->_conf->bluefs_check_volume_selector_on_umount) {
+ _check_vselector_LNF();
+ }
_close_writer(log.writer);
log.writer = NULL;
log.t.clear();
void BlueFS::flush_range(FileWriter *h, uint64_t offset, uint64_t length)/*_WF*/
{
+ _maybe_check_vselector_LNF(); // debug-only vselector check (config-gated); invoked before h->lock is taken
std::unique_lock hl(h->lock);
_flush_range_F(h, offset, length);
}
}
ceph_assert(h->file->fnode.size >= offset);
_flush_bdev(h);
+
+ std::lock_guard ll(log.lock);
vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size);
h->file->fnode.size = offset;
vselector->add_usage(h->file->vselector_hint, h->file->fnode.size);
-
- std::lock_guard ll(log.lock);
log.t.op_file_update_inc(h->file->fnode);
return 0;
}
int BlueFS::fsync(FileWriter *h)/*_WF_WD_WLD_WLNF_WNF*/
{
+ _maybe_check_vselector_LNF(); // debug-only vselector check (config-gated); invoked before h->lock is taken
std::unique_lock hl(h->lock);
uint64_t old_dirty_seq = 0;
{
_flush_and_sync_log_LD(old_dirty_seq);
}
_maybe_compact_log_LNF_NF_LD_D();
+
return 0;
}
FileWriter **h,
bool overwrite)/*_N_LD*/
{
+ _maybe_check_vselector_LNF();
FileRef file;
bool create = false;
bool truncate = false;
FileReader **h,
bool random)/*_N*/
{
+ _maybe_check_vselector_LNF();
std::lock_guard nl(nodes.lock);
dout(10) << __func__ << " " << dirname << "/" << filename
<< (random ? " (random)":" (sequential)") << dendl;
return 0;
}
+// Debug sanity check of the volume selector.
+//
+// Builds a reference selector from scratch (clone_empty() followed by
+// add_usage() for every entry of nodes.file_map) and asserts that the live
+// vselector compares equal to it. On a mismatch both states are dumped to
+// the log at level 0 before the assert fires.
+//
+// Returns without checking when clone_empty() yields a null pointer.
+// Acquires log.lock, then nodes.lock, then the lock of every file.
+void BlueFS::_check_vselector_LNF() {
+  BlueFSVolumeSelector* expected = vselector->clone_empty();
+  if (expected == nullptr) {
+    return;
+  }
+  std::lock_guard log_guard(log.lock);
+  std::lock_guard nodes_guard(nodes.lock);
+  // The check runs under the log, nodes and all file locks, so any code
+  // that modifies vselector must hold at least one of those locks.
+  for (auto& entry : nodes.file_map) {
+    auto& file = entry.second;
+    file->lock.lock();
+    expected->add_usage(file->vselector_hint, file->fnode);
+  }
+  const bool res = vselector->compare(expected);
+  if (!res) {
+    dout(0) << "Current:";
+    vselector->dump(*_dout);
+    *_dout << dendl;
+    dout(0) << "Expected:";
+    expected->dump(*_dout);
+    *_dout << dendl;
+  }
+  ceph_assert(res);
+  // File locks are released only after the comparison has passed.
+  for (auto& entry : nodes.file_map) {
+    entry.second->lock.unlock();
+  }
+  delete expected;
+}
+
size_t BlueFS::probe_alloc_avail(int dev, uint64_t alloc_size)
{
size_t total = 0;
virtual uint8_t select_prefer_bdev(void* hint) = 0;
virtual void get_paths(const std::string& base, paths& res) const = 0;
virtual void dump(std::ostream& sout) = 0;
+
+ /*
+  * Hooks used for sanity checking of vselector.
+  *
+  * clone_empty(): returns a new selector to be filled in as a reference, or
+  *   nullptr when the implementation does not support checking (the default).
+  *   A non-null result is owned by the caller.
+  * compare(): returns true when `other` matches this selector; the default
+  *   implementation accepts anything.
+  */
+ virtual BlueFSVolumeSelector* clone_empty() const { return nullptr; }
+ virtual bool compare(BlueFSVolumeSelector* other) { return true; }
};
struct bluefs_shared_alloc_context_t {
unsigned get_super_length() {
return 4096;
}
-
+ // Runs the volume selector consistency check only when the
+ // bluefs_check_volume_selector_often option is enabled; otherwise a no-op.
+ void _maybe_check_vselector_LNF() {
+   if (!cct->_conf->bluefs_check_volume_selector_often) {
+     return;
+   }
+   _check_vselector_LNF();
+ }
public:
BlueFS(CephContext* cct);
~BlueFS();
size_t read_offset,
size_t read_len,
bufferlist* bl);
+ void _check_vselector_LNF(); // asserts that vselector matches a reference rebuilt from nodes.file_map
};
class OriginalVolumeSelector : public BlueFSVolumeSelector {
sout << std::endl;
}
+// Creates a fresh selector to serve as the reference in consistency checks.
+// All eight numeric constructor arguments are 0 and the trailing flag is
+// false. The object is allocated with new; the caller must delete it.
+BlueFSVolumeSelector* RocksDBBlueFSVolumeSelector::clone_empty() const {
+  return new RocksDBBlueFSVolumeSelector(
+    0, 0, 0,
+    0, 0, 0,
+    0, 0, false);
+}
+
+// Returns true when `other` carries exactly the same accounting as this
+// selector: every cell of per_level_per_dev_usage and every entry of
+// per_level_files must be equal. `other` must be a
+// RocksDBBlueFSVolumeSelector; this is asserted.
+bool RocksDBBlueFSVolumeSelector::compare(BlueFSVolumeSelector* other) {
+  auto* o = dynamic_cast<RocksDBBlueFSVolumeSelector*>(other);
+  ceph_assert(o);
+
+  const size_t rows = BlueFS::MAX_BDEV + 1;
+  const size_t cols = LEVEL_MAX - LEVEL_FIRST + 1;
+
+  // Every cell is visited even after a mismatch has been found.
+  bool equal = true;
+  for (size_t row = 0; row < rows; ++row) {
+    for (size_t col = 0; col < cols; ++col) {
+      if (!(per_level_per_dev_usage.at(row, col) == o->per_level_per_dev_usage.at(row, col))) {
+        equal = false;
+      }
+    }
+  }
+  for (size_t i = 0; i < cols; ++i) {
+    if (!(per_level_files[i] == o->per_level_files[i])) {
+      equal = false;
+    }
+  }
+  return equal;
+}
+
// =======================================================
// =======================================================
BlueFSVolumeSelector::paths& res) const override;
void dump(std::ostream& sout) override;
+ BlueFSVolumeSelector* clone_empty() const override; // returns a new selector constructed with all-zero arguments
+ bool compare(BlueFSVolumeSelector* other) override; // true iff per_level_per_dev_usage and per_level_files both match
};
#endif