From: Abutalib Aghayev Date: Thu, 17 Jun 2021 17:16:03 +0000 (-0400) Subject: os/bluestore: More support for cleaning zones. X-Git-Tag: v17.1.0~1525^2~5 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=797e1a10f7c570a06132d640d773c9385bdaa9fe;p=ceph.git os/bluestore: More support for cleaning zones. The protocol for cleaning zones is as follows: 1. The ZonedAllocator wakes up the cleaner thread. 2. The cleaner thread acquires the list of zones to clean. 3. Cleaning multiple zones is not atomic; therefore, to support resuming the cleaning after a crash, the cleaner thread first persists the list of zones to clean as the value of a key "cleaning_in_progress_zones", by calling ZonedFreelistManager's mark_zones_to_clean_in_progress. 4. The cleaner thread then iterates over the zones and cleans them by calling _zoned_clean_zone on each zone. The latter calls an operation _do_move on each live object in the zone, which atomically moves the object from the zone being cleaned to a new zone. (_do_move is to be implemented.) 5. Once all of the zones are cleaned, the cleaner thread calls reset_zones, which resets the write pointer within the physical zoned block device. 6. Finally, it calls ZonedFreelistManager's mark_zones_to_clean_free method, which in one atomic operation resets the write pointer of the cleaned zones in the db and deletes the key "cleaning_in_progress_zones", that is, the list of zones to be cleaned recorded in step 3. A crash between or within any of these steps will leave the system in a consistent state. Specifically, each zone will either be completely cleaned, or partially cleaned, or not cleaned. The recovery code will need to check for the existence of the "cleaning_in_progress_zones" key and, if it is found, resume cleaning zones where it left off. 
It is possible that if we crash between steps 5 and 6, or within step 5, we end up resetting the write pointer within the physical zoned block device multiple times, but that's okay because the latter is an idempotent operation. Signed-off-by: Abutalib Aghayev --- diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index e93c87fc26e4..19faaa8859f9 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -12418,9 +12418,11 @@ void BlueStore::_zoned_cleaner_thread() { dout(20) << __func__ << " wake" << dendl; } else { l.unlock(); + f->mark_zones_to_clean_in_progress(zones_to_clean, db); for (auto zone_num : *zones_to_clean) { _zoned_clean_zone(zone_num); } + bdev->reset_zones(zones_to_clean); f->mark_zones_to_clean_free(zones_to_clean, db); a->mark_zones_to_clean_free(); l.lock(); diff --git a/src/os/bluestore/ZonedFreelistManager.cc b/src/os/bluestore/ZonedFreelistManager.cc index 951bd90877b0..40e4f51c1cc5 100644 --- a/src/os/bluestore/ZonedFreelistManager.cc +++ b/src/os/bluestore/ZonedFreelistManager.cc @@ -325,5 +325,27 @@ void ZonedFreelistManager::mark_zones_to_clean_free( zone_state_t zone_state; write_zone_state_to_db(zone_num, zone_state, txn); } + + txn->rmkey(meta_prefix, "cleaning_in_progress_zones"); + + kvdb->submit_transaction_sync(txn); +} + +// Marks the zones currently being cleaned in the db. Should be called before +// starting the cleaning. If we crash mid-cleaning, the recovery code will check +// if there is a key "cleaning_in_progress_zones" in the meta_prefix namespace, +// and if so, will read the zones and resume cleaning. 
+void ZonedFreelistManager::mark_zones_to_clean_in_progress( + const std::set *zones_to_clean, KeyValueDB *kvdb) { + dout(10) << __func__ << dendl; + + bufferlist bl; + uint64_t num_zones = zones_to_clean->size(); + encode(num_zones, bl); + for (auto zone_num : *zones_to_clean) + encode(zone_num, bl); + + KeyValueDB::Transaction txn = kvdb->get_transaction(); + txn->set(meta_prefix, "cleaning_in_progress_zones", bl); kvdb->submit_transaction_sync(txn); } diff --git a/src/os/bluestore/ZonedFreelistManager.h b/src/os/bluestore/ZonedFreelistManager.h index 0a389a484702..9e01dcbe487d 100644 --- a/src/os/bluestore/ZonedFreelistManager.h +++ b/src/os/bluestore/ZonedFreelistManager.h @@ -104,6 +104,8 @@ public: std::vector get_zone_states(KeyValueDB *kvdb) const; void mark_zones_to_clean_free(const std::set *zones_to_clean, KeyValueDB *kvdb); + void mark_zones_to_clean_in_progress(const std::set *zones_to_clean, + KeyValueDB *kvdb); }; #endif