const string PREFIX_ALLOC = "B"; // u64 offset -> u64 length (freelist)
const string PREFIX_ALLOC_BITMAP = "b";// (see BitmapFreelistManager)
const string PREFIX_SHARED_BLOB = "X"; // u64 offset -> shared_blob_t
-const string PREFIX_ZONED_META = "Z"; // (see ZonedFreelistManager)
-const string PREFIX_ZONED_INFO = "z"; // (see ZonedFreelistManager)
+const string PREFIX_ZONED_FM_META = "Z"; // (see ZonedFreelistManager)
+const string PREFIX_ZONED_FM_INFO = "z"; // (see ZonedFreelistManager)
+const string PREFIX_ZONED_CL_INFO = "G"; // (per-zone cleaner metadata)
const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";
l_bluestore_commit_lat));
}
+// For every object we maintain <zone_num+oid, offset> tuple in the key-value
+// store. When a new object written to a zone, we insert the corresponding
+// tuple to the database. When an object is truncated, we remove the
+// corresponding tuple. When an object is overwritten, we remove the old tuple
+// and insert a new tuple corresponding to the new location of the object. The
+// cleaner can now identify live objects within the zone <zone_num> by
+// enumerating all the keys starting with <zone_num> prefix.
+void BlueStore::zoned_update_cleaning_metadata(TransContext *txc) {
+ for (const auto &[o, offsets] : txc->zoned_onode_to_offset_map) {
+ std::string key;
+ get_object_key(cct, o->oid, &key);
+ for (auto offset : offsets) {
+ if (offset > 0) {
+ bufferlist offset_bl;
+ encode(offset, offset_bl);
+ txc->t->set(zoned_get_prefix(offset), key, offset_bl);
+ } else {
+ txc->t->rmkey(zoned_get_prefix(-offset), key);
+ }
+ }
+ }
+}
+
+std::string BlueStore::zoned_get_prefix(uint64_t offset) {
+ uint64_t zone_num = offset / bdev->get_zone_size();
+ std::string zone_key;
+ _key_encode_u64(zone_num, &zone_key);
+ return PREFIX_ZONED_CL_INFO + zone_key;
+}
+
void BlueStore::_txc_finalize_kv(TransContext *txc, KeyValueDB::Transaction t)
{
dout(20) << __func__ << " txc " << txc << std::hex
fm->release(p.get_start(), p.get_len(), t);
}
+ if (bdev->is_smr()) {
+ zoned_update_cleaning_metadata(txc);
+ }
+
_txc_update_store_statfs(txc);
}
min_alloc_size);
}
+ if (bdev->is_smr()) {
+ if (wctx.old_extents.empty()) {
+ txc->zoned_note_new_object(o);
+ } else {
+ int64_t old_ondisk_offset = wctx.old_extents.begin()->r.begin()->offset;
+ txc->zoned_note_updated_object(o, old_ondisk_offset);
+ }
+ }
+
// NB: _wctx_finish() will empty old_extents
// so we must do gc estimation before that
_wctx_finish(txc, c, o, &wctx);
if (offset == o->onode.size)
return;
+ WriteContext wctx;
if (offset < o->onode.size) {
- WriteContext wctx;
uint64_t length = o->onode.size - offset;
o->extent_map.fault_range(db, offset, length);
o->extent_map.punch_hole(c, offset, length, &wctx.old_extents);
o->onode.size = offset;
+ if (bdev->is_smr()) {
+ // On zoned devices, we currently support only removing an object or
+ // truncating it to zero size, both of which fall through this code path.
+ ceph_assert(offset == 0 && !wctx.old_extents.empty());
+ int64_t ondisk_offset = wctx.old_extents.begin()->r.begin()->offset;
+ txc->zoned_note_truncated_object(o, ondisk_offset);
+ }
+
txc->write_onode(o);
}
void rewrite_omap_key(const std::string& old, std::string *out);
void get_omap_tail(std::string *out);
void decode_omap_key(const std::string& key, std::string *user_key);
+
+ // Return the offset of an object on disk. This function is intended *only*
+ // for use with zoned storage devices because in these devices, the objects
+ // are laid out contiguously on disk, which is not the case in general.
+ // Also, it should always be called after calling extent_map.fault_range(),
+ // so that the extent map is loaded.
+ int64_t get_ondisk_starting_offset() const {
+ return extent_map.extent_map.begin()->blob->
+ get_blob().calc_offset(0, nullptr);
+ }
};
typedef boost::intrusive_ptr<Onode> OnodeRef;
std::set<OnodeRef> onodes; ///< these need to be updated/written
std::set<OnodeRef> modified_objects; ///< objects we modified (and need a ref)
+
+ // A map from onode to a vector of object offset. For new objects created
+ // in the transaction we append the new offset to the vector, for
+ // overwritten objects we append the negative of the previous ondisk offset
+ // followed by the new offset, and for truncated objects we append the
+ // negative of the previous ondisk offset. We need to maintain a vector of
+ // offsets because *within the same transaction* an object may be truncated
+ // and then written again, or an object may be overwritten multiple times to
+ // different zones. See update_cleaning_metadata function for how this map
+ // is used.
+ std::map<OnodeRef, std::vector<int64_t>> zoned_onode_to_offset_map;
+
std::set<SharedBlobRef> shared_blobs; ///< these need to be updated/written
std::set<SharedBlobRef> shared_blobs_written; ///< update these on io completion
modified_objects.insert(o);
}
+ void zoned_note_new_object(OnodeRef &o) {
+ auto [_, ok] = zoned_onode_to_offset_map.emplace(
+ std::pair<OnodeRef, std::vector<int64_t>>(o, {o->get_ondisk_starting_offset()}));
+ ceph_assert(ok);
+ }
+
+ void zoned_note_updated_object(OnodeRef &o, int64_t prev_offset) {
+ int64_t new_offset = o->get_ondisk_starting_offset();
+ auto [it, ok] = zoned_onode_to_offset_map.emplace(
+ std::pair<OnodeRef, std::vector<int64_t>>(o, {-prev_offset, new_offset}));
+ if (!ok) {
+ it->second.push_back(-prev_offset);
+ it->second.push_back(new_offset);
+ }
+ }
+
+ void zoned_note_truncated_object(OnodeRef &o, int64_t offset) {
+ auto [it, ok] = zoned_onode_to_offset_map.emplace(
+ std::pair<OnodeRef, std::vector<int64_t>>(o, {-offset}));
+ if (!ok) {
+ it->second.push_back(-offset);
+ }
+ }
+
void aio_finish(BlueStore *store) override {
store->txc_aio_finish(this);
}
void _fsck_check_objects(FSCKDepth depth,
FSCK_ObjectCtx& ctx);
+
+ // Zoned storage related stuff
+ void zoned_update_cleaning_metadata(TransContext *txc);
+ std::string zoned_get_prefix(uint64_t offset);
};
inline std::ostream& operator<<(std::ostream& out, const BlueStore::volatile_statfs& s) {