From: Sage Weil Date: Fri, 17 Sep 2021 19:14:51 +0000 (-0500) Subject: os/bluestore: simple cleaner X-Git-Tag: v17.1.0~535^2~19 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=6917b2f8db8a9e51f9eb075a427cb9c82512edaf;p=ceph-ci.git os/bluestore: simple cleaner Rewrite object ranges to move by reusing _do_read and _do_write. Note that this will rewrite shared/cloned extents multiple times, so it is suitable only for workloads/datasets that do not use clone (snapshots). Signed-off-by: Sage Weil --- diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index db29a98b339..c37aff0c15d 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -5668,11 +5668,12 @@ int BlueStore::_init_alloc(std::map *zone_adjustments) // for now we require a conventional zone ceph_assert(bdev->get_conventional_region_size()); ceph_assert(shared_alloc.a != alloc); // zoned allocator doesn't use conventional region - shared_alloc.a->init_add_free(reserved, - bdev->get_conventional_region_size() - reserved); + shared_alloc.a->init_add_free( + reserved, + p2align(bdev->get_conventional_region_size(), min_alloc_size) - reserved); // init sequential zone based on the device's write pointers - a->init_from_zone_pointers(zones); + a->init_from_zone_pointers(std::move(zones)); dout(1) << __func__ << " loaded zone pointers: " << std::hex @@ -8849,10 +8850,12 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair) auto num_zones = bdev->get_size() / zone_size; for (unsigned i = first_sequential_zone; i < num_zones; ++i) { uint64_t p = wp[i] == (i + 1) * zone_size ? 
zone_size : wp[i] % zone_size; - if (zones[i].write_pointer > p) { + if (zones[i].write_pointer > p && + zones[i].num_dead_bytes < zones[i].write_pointer) { derr << "fsck error: zone 0x" << std::hex << i << " bluestore write pointer 0x" << zones[i].write_pointer << " > device write pointer 0x" << p + << " (with only 0x" << zones[i].num_dead_bytes << " dead bytes)" << std::dec << dendl; ++errors; } @@ -10081,6 +10084,22 @@ BlueStore::CollectionRef BlueStore::_get_collection(const coll_t& cid) return cp->second; } +BlueStore::CollectionRef BlueStore::_get_collection_by_oid(const ghobject_t& oid) +{ + std::shared_lock l(coll_lock); + + // FIXME: we must replace this with something more efficient + + for (auto& i : coll_map) { + spg_t spgid; + if (i.first.is_pg(&spgid) && + i.second->contains(oid)) { + return i.second; + } + } + return CollectionRef(); +} + void BlueStore::_queue_reap_collection(CollectionRef& c) { dout(10) << __func__ << " " << c << " " << c->cid << dendl; @@ -13034,14 +13053,12 @@ void BlueStore::_kv_finalize_thread() void BlueStore::_zoned_cleaner_start() { dout(10) << __func__ << dendl; - return; // temporarily disable cleaner until it actually works zoned_cleaner_thread.create("bstore_zcleaner"); } void BlueStore::_zoned_cleaner_stop() { dout(10) << __func__ << dendl; - return; // temporarily disable cleaner until it actually works { std::unique_lock l{zoned_cleaner_lock}; while (!zoned_cleaner_started) { @@ -13070,7 +13087,11 @@ void BlueStore::_zoned_cleaner_thread() auto f = dynamic_cast<ZonedFreelistManager*>(fm); ceph_assert(f); while (true) { - auto zone_to_clean = a->pick_zone_to_clean(.1, zone_size / 16); // FIXME + // thresholds to trigger cleaning + // FIXME + float min_score = .05; // score: bytes saved / bytes moved + uint64_t min_saved = zone_size / 32; // min bytes saved to consider cleaning + auto zone_to_clean = a->pick_zone_to_clean(min_score, min_saved); if (zone_to_clean < 0) { if (zoned_cleaner_stop) { break; @@ -13090,11 +13111,136 @@ void 
BlueStore::_zoned_cleaner_thread() zoned_cleaner_started = false; } -void BlueStore::_zoned_clean_zone(uint64_t zone_num) +void BlueStore::_zoned_clean_zone(uint64_t zone) +{ + dout(10) << __func__ << " cleaning zone 0x" << std::hex << zone << std::dec << dendl; + auto a = dynamic_cast<ZonedAllocator*>(alloc); + auto f = dynamic_cast<ZonedFreelistManager*>(fm); + + KeyValueDB::Iterator it = db->get_iterator(PREFIX_ZONED_CL_INFO); + std::string zone_start; + get_zone_offset_object_key(zone, 0, ghobject_t(), &zone_start); + for (it->lower_bound(zone_start); it->valid(); it->next()) { + uint32_t z; + uint64_t offset; + ghobject_t oid; + string k = it->key(); + int r = get_key_zone_offset_object(k, &z, &offset, &oid); + if (r < 0) { + derr << __func__ << " failed to decode zone ref " << pretty_binary_string(k) + << dendl; + continue; + } + if (zone != z) { + dout(10) << __func__ << " reached end of zone refs" << dendl; + break; + } + dout(10) << __func__ << " zone 0x" << std::hex << zone << " offset 0x" << offset + << std::dec << " " << oid << dendl; + _clean_some(oid, zone); + } + + if (a->get_live_bytes(zone) > 0) { + derr << "zone 0x" << std::hex << zone << " still has 0x" << a->get_live_bytes(zone) + << " live bytes" << std::dec << dendl; + // should we do something else here to avoid a live-lock in the event of a problem? 
+ return; + } + + // reset the device zone + dout(10) << __func__ << " resetting zone 0x" << std::hex << zone << std::dec << dendl; + bdev->reset_zone(zone); + + // record that we can now write there + f->mark_zone_to_clean_free(zone, a->get_write_pointer(zone), + a->get_dead_bytes(zone), db); + bdev->flush(); + + // then allow ourselves to start allocating there + dout(10) << __func__ << " done cleaning zone 0x" << std::hex << zone << std::dec + << dendl; + a->reset_zone(zone); +} + +void BlueStore::_clean_some(ghobject_t oid, uint32_t zone) { - dout(10) << __func__ << " cleaning zone 0x" << std::hex << zone_num << std::dec << dendl; - // TODO: (1) copy live objects from zone_num to a new zone, (2) issue a RESET - // ZONE operation to the device for the corresponding zone. + dout(10) << __func__ << " " << oid << " from zone 0x" << std::hex << zone << std::dec + << dendl; + + CollectionRef cref = _get_collection_by_oid(oid); + Collection *c = cref.get(); + + // serialize io dispatch vs other transactions + std::lock_guard l(atomic_alloc_and_submit_lock); + std::unique_lock l2(c->lock); + + auto o = c->get_onode(oid, false); + if (!o) { + derr << __func__ << " can't find " << oid << dendl; + return; + } + + o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE); + _dump_onode<30>(cct, *o); + + // NOTE: This is a naive rewrite strategy. If any blobs are + // shared, they will be duplicated for each object that references + // them. That means any cloned/snapshotted objects will explode + // their utilization. This won't matter for RGW workloads, but + // for RBD and CephFS it is completely unacceptable, and it's + // entirely reasonable to have "archival" data workloads on SMR + // for CephFS and (possibly/probably) RBD. + // + // At some point we need to replace this with something more + // sophisticated that ensures that a shared blob gets moved once + // and all referencing objects get updated to point to the new + // location. 
+ map<uint32_t,uint32_t> to_move; + for (auto& e : o->extent_map.extent_map) { + bool touches_zone = false; + for (auto& be : e.blob->get_blob().get_extents()) { + if (be.is_valid()) { + uint32_t z = be.offset / zone_size; + if (z == zone) { + touches_zone = true; + break; + } + } + } + if (touches_zone) { + to_move[e.logical_offset] = e.length; + } + } + if (to_move.empty()) { + dout(10) << __func__ << " no references to zone 0x" << std::hex << zone + << std::dec << " from " << oid << dendl; + return; + } + + dout(10) << __func__ << " rewriting object extents 0x" << std::hex << to_move + << std::dec << dendl; + OpSequencer *osr = c->osr.get(); + TransContext *txc = _txc_create(c, osr, nullptr); + + spg_t pgid; + if (c->cid.is_pg(&pgid)) { + txc->osd_pool_id = pgid.pool(); + } + + for (auto& [offset, length] : to_move) { + bufferlist bl; + int r = _do_read(c, o, offset, length, bl, 0); + ceph_assert(r == (int)length); + + r = _do_write(txc, cref, o, offset, length, bl, 0); + ceph_assert(r >= 0); + } + txc->write_onode(o); + + _txc_write_nodes(txc, txc->t); + _txc_finalize_kv(txc, txc->t); + _txc_state_proc(txc); } #endif diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 5ba447ae13a..f9e1b31d613 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -2428,6 +2428,7 @@ private: void _dump_alloc_on_failure(); CollectionRef _get_collection(const coll_t& cid); + CollectionRef _get_collection_by_oid(const ghobject_t& oid); void _queue_reap_collection(CollectionRef& c); void _reap_collections(); void _update_cache_logger(); @@ -2479,6 +2480,7 @@ private: void _zoned_cleaner_stop(); void _zoned_cleaner_thread(); void _zoned_clean_zone(uint64_t zone_num); + void _clean_some(ghobject_t oid, uint32_t zone_num); #endif bluestore_deferred_op_t *_get_deferred_op(TransContext *txc, uint64_t len); diff --git a/src/os/bluestore/ZonedAllocator.cc b/src/os/bluestore/ZonedAllocator.cc index 7feb6f3e014..8401eab1858 100644 --- 
a/src/os/bluestore/ZonedAllocator.cc +++ b/src/os/bluestore/ZonedAllocator.cc @@ -150,7 +150,7 @@ void ZonedAllocator::dump(std::function<void(uint64_t offset, } void ZonedAllocator::init_from_zone_pointers( - std::vector<zone_state_t> _zone_states) + std::vector<zone_state_t> &&_zone_states) { // this is called once, based on the device's zone pointers std::lock_guard l(lock); @@ -215,6 +215,12 @@ int64_t ZonedAllocator::pick_zone_to_clean(float min_score, uint64_t min_saved) return best; } +void ZonedAllocator::reset_zone(uint32_t zone) +{ + num_free += zone_states[zone].write_pointer; + zone_states[zone].reset(); +} + bool ZonedAllocator::low_on_space(void) { std::lock_guard l(lock); diff --git a/src/os/bluestore/ZonedAllocator.h b/src/os/bluestore/ZonedAllocator.h index 9f5b632eb93..12fa6cddd3e 100644 --- a/src/os/bluestore/ZonedAllocator.h +++ b/src/os/bluestore/ZonedAllocator.h @@ -45,10 +45,11 @@ class ZonedAllocator : public Allocator { return zone_num * zone_size + get_write_pointer(zone_num); } +public: inline uint64_t get_write_pointer(uint64_t zone_num) const { return zone_states[zone_num].get_write_pointer(); } - +private: inline uint64_t get_remaining_space(uint64_t zone_num) const { return zone_size - get_write_pointer(zone_num); } @@ -102,9 +103,10 @@ public: void clear_cleaning_zone(uint32_t zone) { cleaning_zone = -1; } + void reset_zone(uint32_t zone); void init_from_zone_pointers( - std::vector<zone_state_t> _zone_states); + std::vector<zone_state_t> &&_zone_states); void init_add_free(uint64_t offset, uint64_t length) override {} void init_rm_free(uint64_t offset, uint64_t length) override {} diff --git a/src/os/bluestore/ZonedFreelistManager.cc b/src/os/bluestore/ZonedFreelistManager.cc index 60e72c2cd27..092e8d540aa 100644 --- a/src/os/bluestore/ZonedFreelistManager.cc +++ b/src/os/bluestore/ZonedFreelistManager.cc @@ -236,8 +236,9 @@ void ZonedFreelistManager::allocate( uint64_t length, KeyValueDB::Transaction txn) { - dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length << dendl; uint64_t zone_num = offset / zone_size; + dout(10) << __func__ << " 0x" << std::hex 
<< offset << "~" << length + << " zone 0x" << zone_num << std::dec << dendl; zone_state_t zone_state; zone_state.increment_write_pointer(length); write_zone_state_to_db(zone_num, zone_state, txn); @@ -254,8 +255,9 @@ void ZonedFreelistManager::release( uint64_t length, KeyValueDB::Transaction txn) { - dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length << dendl; uint64_t zone_num = offset / zone_size; + dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length + << " zone 0x" << zone_num << std::dec << dendl; zone_state_t zone_state; zone_state.increment_num_dead_bytes(length); write_zone_state_to_db(zone_num, zone_state, txn); @@ -333,12 +335,46 @@ int ZonedFreelistManager::_read_cfg(cfg_reader_t cfg_reader) void ZonedFreelistManager::mark_zone_to_clean_free( uint64_t zone, + uint64_t write_pointer, + uint64_t dead, KeyValueDB *kvdb) { - dout(10) << __func__ << " zone 0x" << std::hex << zone << std::dec << dendl; - + dout(10) << __func__ << " zone 0x" << std::hex << zone + << " (dead 0x" << dead << " write pointer 0x" << write_pointer + << ")" << std::dec << dendl; + + if (true) { + string key; + _key_encode_u64(zone, &key); + KeyValueDB::Iterator it = kvdb->get_iterator(info_prefix); + it->lower_bound(key); + zone_state_t zs; + load_zone_state_from_db(zone, zs, it); + dout(20) << __func__ << " before " << zs << dendl; + ceph_assert(zs.num_dead_bytes == dead); + ceph_assert(zs.write_pointer == write_pointer); + } KeyValueDB::Transaction txn = kvdb->get_transaction(); - zone_state_t zone_state; - write_zone_state_to_db(zone, zone_state, txn); + + zone_state_t neg_zone_state; + neg_zone_state.num_dead_bytes = 0ll - (int64_t)dead; + neg_zone_state.write_pointer = 0ll - (int64_t)write_pointer; + write_zone_state_to_db(zone, neg_zone_state, txn); + + // block here until this commits so that we don't end up starting to allocate and + // write to the new zone before this fully commits. 
kvdb->submit_transaction_sync(txn); + + if (true) { + // read it back to verify it is really zero! + string key; + _key_encode_u64(zone, &key); + KeyValueDB::Iterator it = kvdb->get_iterator(info_prefix); + it->lower_bound(key); + zone_state_t zs; + load_zone_state_from_db(zone, zs, it); + dout(20) << __func__ << " read back " << zs << dendl; + ceph_assert(zs.num_dead_bytes == 0); + ceph_assert(zs.write_pointer == 0); + } } diff --git a/src/os/bluestore/ZonedFreelistManager.h b/src/os/bluestore/ZonedFreelistManager.h index e60a40222eb..4e0b040c80c 100644 --- a/src/os/bluestore/ZonedFreelistManager.h +++ b/src/os/bluestore/ZonedFreelistManager.h @@ -103,7 +103,9 @@ public: std::vector<zone_state_t> get_zone_states(KeyValueDB *kvdb) const; - void mark_zone_to_clean_free(uint64_t zone, KeyValueDB *kvdb); + void mark_zone_to_clean_free(uint64_t zone, + uint64_t write_pointer, uint64_t dead, + KeyValueDB *kvdb); }; #endif diff --git a/src/os/bluestore/zoned_types.h b/src/os/bluestore/zoned_types.h index 5507e64c7c5..d8ca3a0c7c6 100644 --- a/src/os/bluestore/zoned_types.h +++ b/src/os/bluestore/zoned_types.h @@ -28,6 +28,11 @@ struct zone_state_t { decode(num_dead_bytes, p); } + void reset() { + write_pointer = 0; + num_dead_bytes = 0; + } + uint64_t get_num_dead_bytes() const { return num_dead_bytes; }