ceph_assert(is_smr());
return conventional_region_size;
}
- virtual void reset_all_zones() {}
- virtual void reset_zones(const std::set<uint64_t>& zones) {
+ virtual void reset_all_zones() { // base-class stub: resets nothing, only asserts is_smr(); HMSMRDevice overrides it
+ ceph_assert(is_smr());
+ }
+ virtual void reset_zone(uint64_t zone) { // base-class stub for a single zone: only asserts is_smr(); HMSMRDevice overrides it
ceph_assert(is_smr());
}
virtual std::vector<uint64_t> get_zones() {
+ ceph_assert(is_smr()); // base-class stub: asserts is_smr(), then returns an empty vector below
return std::vector<uint64_t>();
}
zbd_reset_zones(zbd_fd, conventional_region_size, 0);
}
-void HMSMRDevice::reset_zones(const std::set<uint64_t>& zones)
+void HMSMRDevice::reset_zone(uint64_t zone) // resets the one zone numbered `zone`; a failed reset is logged and then aborts
{
- dout(10) << __func__ << " 0x" << std::hex << zones << std::dec << dendl;
- for (auto zone_num : zones) {
- if (zbd_reset_zones(zbd_fd, zone_num * zone_size, zone_size) != 0) {
- derr << __func__ << " resetting zone failed for zone 0x" << std::hex
- << zone_num << std::dec << dendl;
- }
+ dout(10) << __func__ << " zone 0x" << std::hex << zone << std::dec << dendl;
+ if (zbd_reset_zones(zbd_fd, zone * zone_size, zone_size) != 0) { // passes zone * zone_size and zone_size (presumably byte offset and length -- confirm against libzbd)
+ derr << __func__ << " resetting zone failed for zone 0x" << std::hex
+ << zone << std::dec << dendl;
+ ceph_abort("zbd_reset_zones failed"); // the removed reset_zones() only logged the failure and carried on
}
}
// smr-specific methods
bool is_smr() const final { return true; }
void reset_all_zones() override;
- void reset_zones(const std::set<uint64_t>& zones) override;
+ void reset_zone(uint64_t zone) override;
std::vector<uint64_t> get_zones() override;
};
}
}
- a->init_from_zone_pointers(zones,
- &zoned_cleaner_lock,
- &zoned_cleaner_cond);
+ a->init_from_zone_pointers(zones);
dout(1) << __func__
<< " loaded zone pointers: "
<< std::hex
}
#ifdef HAVE_LIBZBD
-void BlueStore::_zoned_cleaner_start() {
+void BlueStore::_zoned_cleaner_start() // now only logs and calls zoned_cleaner_thread.create("bstore_zcleaner"); the resume-after-unclean-shutdown pass is removed by this change
+{
dout(10) << __func__ << dendl;
-
- auto f = dynamic_cast<ZonedFreelistManager*>(fm);
- ceph_assert(f);
-
- auto zones_to_clean = f->get_cleaning_in_progress_zones(db);
- if (!zones_to_clean.empty()) {
- dout(10) << __func__ << " resuming cleaning after unclean shutdown." << dendl;
- for (auto zone_num : zones_to_clean) {
- _zoned_clean_zone(zone_num);
- }
- bdev->reset_zones(zones_to_clean);
- f->mark_zones_to_clean_free(zones_to_clean, db);
- }
-
zoned_cleaner_thread.create("bstore_zcleaner");
}
-void BlueStore::_zoned_cleaner_stop() {
+void BlueStore::_zoned_cleaner_stop()
+{
dout(10) << __func__ << dendl;
{
std::unique_lock l{zoned_cleaner_lock};
dout(10) << __func__ << " done" << dendl;
}
-void BlueStore::_zoned_cleaner_thread() {
+void BlueStore::_zoned_cleaner_thread()
+{
dout(10) << __func__ << " start" << dendl;
std::unique_lock l{zoned_cleaner_lock};
ceph_assert(!zoned_cleaner_started);
auto f = dynamic_cast<ZonedFreelistManager*>(fm);
ceph_assert(f);
while (true) {
- const auto *zones_to_clean = a->get_zones_to_clean();
- if (!zones_to_clean) {
+ auto zone_to_clean = a->pick_zone_to_clean();
+ if (zone_to_clean < 0) {
if (zoned_cleaner_stop) {
break;
}
dout(20) << __func__ << " wake" << dendl;
} else {
l.unlock();
- f->mark_zones_to_clean_in_progress(*zones_to_clean, db);
- for (auto zone_num : *zones_to_clean) {
- _zoned_clean_zone(zone_num);
- }
- bdev->reset_zones(*zones_to_clean);
- f->mark_zones_to_clean_free(*zones_to_clean, db);
- a->mark_zones_to_clean_free();
+ _zoned_clean_zone(zone_to_clean);
+ bdev->reset_zone(zone_to_clean);
+ f->mark_zone_to_clean_free(zone_to_clean, db);
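+ // FIXME(review): this change removes ZonedAllocator::mark_zones_to_clean_free() and no
+ // single-zone replacement is visible, so the allocator's in-memory write pointer and
+ // dead-byte counts for the zone appear to stay stale after the reset -- confirm, and
+ // restore an equivalent of the call below.
+ //a->mark_zone_to_clean_free();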
l.lock();
}
}
zoned_cleaner_started = false;
}
-void BlueStore::_zoned_clean_zone(uint64_t zone_num) {
+void BlueStore::_zoned_clean_zone(uint64_t zone_num)
+{
dout(10) << __func__ << " cleaning zone " << zone_num << dendl;
// TODO: (1) copy live objects from zone_num to a new zone, (2) issue a RESET
// ZONE operation to the device for the corresponding zone.
zone_size(_zone_size),
first_seq_zone_num(_first_sequential_zone),
starting_zone_num(first_seq_zone_num),
- num_zones(size / zone_size),
- num_zones_to_clean(0)
+ num_zones(size / zone_size)
{
ldout(cct, 10) << " size 0x" << std::hex << size
<< " zone size 0x" << zone_size << std::dec
<< std::hex << want_size << std::dec << dendl;
uint64_t zone_num = starting_zone_num;
- auto p = zones_to_clean.lower_bound(zone_num);
for ( ; zone_num < num_zones; ++zone_num) {
- if (p != zones_to_clean.cend() && *p == zone_num) {
- ldout(cct, 10) << " skipping zone 0x" << std::hex << zone_num
- << " because it is being cleaned" << std::dec << dendl;
- ++p;
- continue;
- }
if (fits(want_size, zone_num)) {
break;
}
<< " and zone offset 0x" << (offset % zone_size)
<< std::dec << dendl;
- find_zones_to_clean();
-
extents->emplace_back(bluestore_pextent_t(offset, want_size));
return want_size;
}
}
void ZonedAllocator::init_from_zone_pointers(
- std::vector<zone_state_t> _zone_states,
- ceph::mutex *_cleaner_lock,
- ceph::condition_variable *_cleaner_cond)
+ std::vector<zone_state_t> _zone_states)
{
// this is called once, based on the device's zone pointers
std::lock_guard l(lock);
ldout(cct, 10) << dendl;
- cleaner_lock = _cleaner_lock;
- cleaner_cond = _cleaner_cond;
zone_states = std::move(_zone_states);
num_free = 0;
for (size_t i = first_seq_zone_num; i < num_zones; ++i) {
<< dendl;
}
-const std::set<uint64_t> *ZonedAllocator::get_zones_to_clean(void)
+// Pick the sequential zone that is the best candidate for cleaning.
+//
+// Every zone in [first_seq_zone_num, num_zones) is scored as its num_dead_bytes
+// minus half of the space not yet written in it (zone_size - write_pointer), so
+// fuller zones win over emptier ones with the same amount of dead data.  Only a
+// strictly positive score qualifies, and on a tie the lowest-numbered zone is kept.
+//
+// Returns the chosen zone number, or -1 when no zone qualifies.
+int64_t ZonedAllocator::pick_zone_to_clean(void)
{
- ldout(cct, 10) << dendl;
- return num_zones_to_clean ? &zones_to_clean : nullptr;
+  // zone_states is replaced under `lock` in init_from_zone_pointers(); take the
+  // same lock so the scan and the log line below see a consistent snapshot.
+  std::lock_guard l(lock);
+  // Same width as the return type, so a zone index is never truncated.
+  int64_t best = -1;
+  int64_t best_score = 0;
+  for (size_t i = first_seq_zone_num; i < num_zones; ++i) {
+    int64_t score = zone_states[i].num_dead_bytes;
+    // discount by remaining space so we will tend to clean full zones
+    score -= (zone_size - zone_states[i].write_pointer) / 2;
+    if (score > 0 && (best < 0 || score > best_score)) {
+      best = static_cast<int64_t>(i);
+      best_score = score;
+    }
+  }
+  if (best >= 0) {
+    ldout(cct, 10) << " zone 0x" << std::hex << best << " with score 0x" << best_score
+                   << ": 0x" << zone_states[best].num_dead_bytes
+                   << " dead and 0x"
+                   << zone_states[best].write_pointer - zone_states[best].num_dead_bytes
+                   << " live bytes" << std::dec << dendl;
+  } else {
+    ldout(cct, 10) << " no zones found that are good cleaning candidates" << dendl;
+  }
+  return best;
}
bool ZonedAllocator::low_on_space(void)
{
- ceph_assert(zones_to_clean.empty());
-
+ std::lock_guard l(lock); // held while num_free is read; the function returns true when (num_free - conventional_size) / sequential_size <= 0.25
uint64_t sequential_num_free = num_free - conventional_size;
double free_ratio = static_cast<double>(sequential_num_free) / sequential_size;
return free_ratio <= 0.25;
}
-void ZonedAllocator::find_zones_to_clean(void)
-{
- ldout(cct, 40) << dendl;
-
- if (num_zones_to_clean || !low_on_space())
- return;
-
- ceph_assert(zones_to_clean.empty());
-
- // TODO: make this tunable; handle the case when there aren't this many zones
- // to clean.
- const int64_t num_zones_to_clean_at_once = 1;
-
- std::vector<uint64_t> idx(num_zones);
- std::iota(idx.begin(), idx.end(), 0);
-
- if (cct->_conf->subsys.should_gather<ceph_subsys_bluestore, 40>()) {
- for (size_t i = 0; i < zone_states.size(); ++i) {
- dout(40) << " zone 0x" << std::hex << i << std::dec << " "
- << zone_states[i] << dendl;
- }
- }
-
- std::partial_sort(idx.begin(), idx.begin() + num_zones_to_clean_at_once, idx.end(),
- [this](uint64_t i1, uint64_t i2) {
- return zone_states[i1].num_dead_bytes > zone_states[i2].num_dead_bytes;
- });
-
- ldout(cct, 10) << " the zone that needs cleaning is 0x"
- << std::hex << *idx.begin() << " num_dead_bytes = 0x"
- << zone_states[*idx.begin()].num_dead_bytes
- << std::dec
- << dendl;
-
- zones_to_clean = {idx.begin(), idx.begin() + num_zones_to_clean_at_once};
- num_zones_to_clean = num_zones_to_clean_at_once;
-
- // TODO: handle the case of disk being full.
- ceph_assert(!zones_to_clean.empty());
- ceph_assert(num_zones_to_clean != 0);
-
- cleaner_lock->lock();
- cleaner_cond->notify_one();
- cleaner_lock->unlock();
-}
-
-void ZonedAllocator::mark_zones_to_clean_free(void)
-{
- std::lock_guard l(lock);
- ldout(cct, 10) << dendl;
- for (auto zone_num : zones_to_clean) {
- ldout(cct, 10) << " zone 0x" << std::hex << zone_num
- << " is now clean" << std::dec << dendl;
- num_free += zone_states[zone_num].write_pointer;
- zone_states[zone_num].num_dead_bytes = 0;
- zone_states[zone_num].write_pointer = 0;
- }
- zones_to_clean.clear();
- num_zones_to_clean = 0;
-}
-
void ZonedAllocator::shutdown()
{
ldout(cct, 1) << dendl;
uint64_t starting_zone_num;
uint64_t num_zones;
std::vector<zone_state_t> zone_states;
- std::set<uint64_t> zones_to_clean;
- std::atomic<int64_t> num_zones_to_clean;
-
- ceph::mutex *cleaner_lock = nullptr;
- ceph::condition_variable *cleaner_cond = nullptr;
inline uint64_t get_offset(uint64_t zone_num) const {
return zone_num * zone_size + get_write_pointer(zone_num);
void dump(std::function<void(uint64_t offset,
uint64_t length)> notify) override;
- const std::set<uint64_t> *get_zones_to_clean(void);
- void mark_zones_to_clean_free(void);
+ int64_t pick_zone_to_clean(void);
void init_from_zone_pointers(
- std::vector<zone_state_t> _zone_states,
- ceph::mutex *_cleaner_lock,
- ceph::condition_variable *_cleaner_cond);
+ std::vector<zone_state_t> _zone_states);
void init_add_free(uint64_t offset, uint64_t length) override {}
void init_rm_free(uint64_t offset, uint64_t length) override {}
private:
bool low_on_space(void);
- void find_zones_to_clean(void);
};
#endif
return 0;
}
-std::set<uint64_t> ZonedFreelistManager::get_cleaning_in_progress_zones(
- KeyValueDB *kvdb) const
-{
- bufferlist bl;
- std::set<uint64_t> zones_to_clean;
- if (kvdb->get(meta_prefix, CLEANING_IN_PROGRESS_KEY, &bl) == 0) {
- decode(zones_to_clean, bl);
- }
- return zones_to_clean;
-}
-
-void ZonedFreelistManager::mark_zones_to_clean_free(
- const std::set<uint64_t>& zones_to_clean, KeyValueDB *kvdb)
-{
- dout(10) << __func__ << dendl;
-
- KeyValueDB::Transaction txn = kvdb->get_transaction();
- for (auto zone_num : zones_to_clean) {
- ldout(cct, 10) << __func__ << " zone " << zone_num << " is now clean in DB" << dendl;
-
- zone_state_t zone_state;
- write_zone_state_to_db(zone_num, zone_state, txn);
- }
- txn->rmkey(meta_prefix, CLEANING_IN_PROGRESS_KEY);
- kvdb->submit_transaction_sync(txn);
-}
-
-// Marks the zones currently being cleaned in the db. Should be called before
-// starting the cleaning. If we crash mid-cleaning, the recovery code will check
-// if there is a key CLEANING_IN_PROGRESS_KEY in the meta_prefix namespace, and
-// if so, will read the zones and resume cleaning.
-void ZonedFreelistManager::mark_zones_to_clean_in_progress(
- const std::set<uint64_t>& zones_to_clean, KeyValueDB *kvdb)
+void ZonedFreelistManager::mark_zone_to_clean_free( // stores a default-constructed zone_state_t for one zone via a synchronous DB transaction
+ uint64_t zone, // zone number whose stored state is overwritten
+ KeyValueDB *kvdb) // DB the transaction is obtained from and submitted to
{
- dout(10) << __func__ << dendl;
+ dout(10) << __func__ << " zone 0x" << std::hex << zone << std::dec << dendl;
- bufferlist bl;
- encode(zones_to_clean, bl);
-
KeyValueDB::Transaction txn = kvdb->get_transaction();
- txn->set(meta_prefix, CLEANING_IN_PROGRESS_KEY, bl);
+ zone_state_t zone_state; // default-constructed; presumably a zeroed write pointer and dead-byte count -- confirm against zone_state_t
+ write_zone_state_to_db(zone, zone_state, txn); // NOTE(review): the result of submit_transaction_sync() below is not checked
kvdb->submit_transaction_sync(txn);
}
using cfg_reader_t = std::function<int(const std::string&, std::string*)>;
-const std::string CLEANING_IN_PROGRESS_KEY = "cleaning_in_progress";
-
class ZonedFreelistManager : public FreelistManager {
std::string meta_prefix; ///< device size, zone size, etc.
std::string info_prefix; ///< per zone write pointer, dead bytes
std::vector<std::pair<std::string, std::string>>*) const override;
std::vector<zone_state_t> get_zone_states(KeyValueDB *kvdb) const;
- std::set<uint64_t> get_cleaning_in_progress_zones(KeyValueDB *kvdb) const;
- void mark_zones_to_clean_free(const std::set<uint64_t>& zones_to_clean,
- KeyValueDB *kvdb);
- void mark_zones_to_clean_in_progress(const std::set<uint64_t>& zones_to_clean,
- KeyValueDB *kvdb);
+
+ void mark_zone_to_clean_free(uint64_t zone, KeyValueDB *kvdb);
};
#endif