From eaa1731026a5fed2b543914aed89f07258c7976a Mon Sep 17 00:00:00 2001 From: Abutalib Aghayev Date: Tue, 30 Jun 2020 16:33:15 -0400 Subject: [PATCH] os/bluestore: Track per zone metadata For every object we maintain a <zone_num+oid, offset> tuple in the key-value store. When a new object is written to a zone, we insert the corresponding tuple to the database. When an object is truncated, we remove the corresponding tuple. When an object is overwritten, we remove the old tuple and insert a new tuple corresponding to the new location of the object. The cleaner can now identify live objects within the zone by enumerating all the keys starting with the <zone_num> prefix. Signed-off-by: Abutalib Aghayev --- src/os/bluestore/BlueStore.cc | 58 +++++++++++++++++++++++++++++++++-- src/os/bluestore/BlueStore.h | 50 ++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+), 3 deletions(-) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 212f90b8a3bc6..a5f3edc55a69f 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -117,8 +117,9 @@ const string PREFIX_DEFERRED = "L"; // id -> deferred_transaction_t const string PREFIX_ALLOC = "B"; // u64 offset -> u64 length (freelist) const string PREFIX_ALLOC_BITMAP = "b";// (see BitmapFreelistManager) const string PREFIX_SHARED_BLOB = "X"; // u64 offset -> shared_blob_t -const string PREFIX_ZONED_META = "Z"; // (see ZonedFreelistManager) -const string PREFIX_ZONED_INFO = "z"; // (see ZonedFreelistManager) +const string PREFIX_ZONED_FM_META = "Z"; // (see ZonedFreelistManager) +const string PREFIX_ZONED_FM_INFO = "z"; // (see ZonedFreelistManager) +const string PREFIX_ZONED_CL_INFO = "G"; // (per-zone cleaner metadata) const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs"; @@ -11493,6 +11494,36 @@ void BlueStore::BSPerfTracker::update_from_perfcounters( l_bluestore_commit_lat)); } +// For every object we maintain a <zone_num+oid, offset> tuple in the key-value +// store. 
When a new object is written to a zone, we insert the corresponding +// tuple to the database. When an object is truncated, we remove the +// corresponding tuple. When an object is overwritten, we remove the old tuple +// and insert a new tuple corresponding to the new location of the object. The +// cleaner can now identify live objects within the zone by +// enumerating all the keys starting with the <zone_num> prefix. +void BlueStore::zoned_update_cleaning_metadata(TransContext *txc) { + for (const auto &[o, offsets] : txc->zoned_onode_to_offset_map) { + std::string key; + get_object_key(cct, o->oid, &key); + for (auto offset : offsets) { + if (offset > 0) { + bufferlist offset_bl; + encode(offset, offset_bl); + txc->t->set(zoned_get_prefix(offset), key, offset_bl); + } else { + txc->t->rmkey(zoned_get_prefix(-offset), key); + } + } + } +} + +std::string BlueStore::zoned_get_prefix(uint64_t offset) { + uint64_t zone_num = offset / bdev->get_zone_size(); + std::string zone_key; + _key_encode_u64(zone_num, &zone_key); + return PREFIX_ZONED_CL_INFO + zone_key; +} + void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t) { dout(20) << __func__ << " txc " << txc << std::hex @@ -11538,6 +11569,10 @@ void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t) fm->release(p.get_start(), p.get_len(), t); } + if (bdev->is_smr()) { + zoned_update_cleaning_metadata(txc); + } + _txc_update_store_statfs(txc); } @@ -14390,6 +14425,15 @@ int BlueStore::_do_write( min_alloc_size); } + if (bdev->is_smr()) { + if (wctx.old_extents.empty()) { + txc->zoned_note_new_object(o); + } else { + int64_t old_ondisk_offset = wctx.old_extents.begin()->r.begin()->offset; + txc->zoned_note_updated_object(o, old_ondisk_offset); + } + } + // NB: _wctx_finish() will empty old_extents // so we must do gc estimation before that _wctx_finish(txc, c, o, &wctx); @@ -14516,8 +14560,8 @@ void BlueStore::_do_truncate( if (offset == o->onode.size) return; + WriteContext wctx; if 
(offset < o->onode.size) { - WriteContext wctx; uint64_t length = o->onode.size - offset; o->extent_map.fault_range(db, offset, length); o->extent_map.punch_hole(c, offset, length, &wctx.old_extents); @@ -14538,6 +14582,14 @@ void BlueStore::_do_truncate( o->onode.size = offset; + if (bdev->is_smr()) { + // On zoned devices, we currently support only removing an object or + // truncating it to zero size, both of which fall through this code path. + ceph_assert(offset == 0 && !wctx.old_extents.empty()); + int64_t ondisk_offset = wctx.old_extents.begin()->r.begin()->offset; + txc->zoned_note_truncated_object(o, ondisk_offset); + } + txc->write_onode(o); } diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 74e9dd5176bae..270328941670f 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -1148,6 +1148,16 @@ public: void rewrite_omap_key(const std::string& old, std::string *out); void get_omap_tail(std::string *out); void decode_omap_key(const std::string& key, std::string *user_key); + + // Return the offset of an object on disk. This function is intended *only* + // for use with zoned storage devices because in these devices, the objects + // are laid out contiguously on disk, which is not the case in general. + // Also, it should always be called after calling extent_map.fault_range(), + // so that the extent map is loaded. + int64_t get_ondisk_starting_offset() const { + return extent_map.extent_map.begin()->blob-> + get_blob().calc_offset(0, nullptr); + } }; typedef boost::intrusive_ptr OnodeRef; @@ -1577,6 +1587,18 @@ public: std::set onodes; ///< these need to be updated/written std::set modified_objects; ///< objects we modified (and need a ref) + + // A map from onode to a vector of object offset. 
For new objects created + // in the transaction we append the new offset to the vector, for + // overwritten objects we append the negative of the previous ondisk offset + // followed by the new offset, and for truncated objects we append the + // negative of the previous ondisk offset. We need to maintain a vector of + // offsets because *within the same transaction* an object may be truncated + // and then written again, or an object may be overwritten multiple times to + // different zones. See the zoned_update_cleaning_metadata function for how this map + // is used. + std::map> zoned_onode_to_offset_map; + std::set shared_blobs; ///< these need to be updated/written std::set shared_blobs_written; ///< update these on io completion @@ -1649,6 +1671,30 @@ public: modified_objects.insert(o); } + void zoned_note_new_object(OnodeRef &o) { + auto [_, ok] = zoned_onode_to_offset_map.emplace( + std::pair>(o, {o->get_ondisk_starting_offset()})); + ceph_assert(ok); + } + + void zoned_note_updated_object(OnodeRef &o, int64_t prev_offset) { + int64_t new_offset = o->get_ondisk_starting_offset(); + auto [it, ok] = zoned_onode_to_offset_map.emplace( + std::pair>(o, {-prev_offset, new_offset})); + if (!ok) { + it->second.push_back(-prev_offset); + it->second.push_back(new_offset); + } + } + + void zoned_note_truncated_object(OnodeRef &o, int64_t offset) { + auto [it, ok] = zoned_onode_to_offset_map.emplace( + std::pair>(o, {-offset})); + if (!ok) { + it->second.push_back(-offset); + } + } + void aio_finish(BlueStore *store) override { store->txc_aio_finish(this); } @@ -3429,6 +3475,10 @@ private: void _fsck_check_objects(FSCKDepth depth, FSCK_ObjectCtx& ctx); + + // Zoned storage related stuff + void zoned_update_cleaning_metadata(TransContext *txc); + std::string zoned_get_prefix(uint64_t offset); }; inline std::ostream& operator<<(std::ostream& out, const BlueStore::volatile_statfs& s) { -- 2.39.5