From: Matan Breizman Date: Mon, 21 Mar 2022 16:24:14 +0000 (+0000) Subject: crimson/osd: Snapshots: copy-on-write clones X-Git-Tag: v18.0.0~197^2~7 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=80103ca585aafdb48a06d75e367326d8248f920c;p=ceph-ci.git crimson/osd: Snapshots: copy-on-write clones Create copy-on-write (COW) clones in the backing store when a write arrives after a snapshot of an object has been taken. This applies to both the pool and the self-managed snapshotting mechanisms. Trello: https://trello.com/c/yTDCGNCf Signed-off-by: Matan Breizman --- diff --git a/src/crimson/osd/objclass.cc b/src/crimson/osd/objclass.cc index ece780bf51a..d6d2e914a7b 100644 --- a/src/crimson/osd/objclass.cc +++ b/src/crimson/osd/objclass.cc @@ -520,6 +520,14 @@ const object_info_t& cls_get_object_info(cls_method_context_t hctx) int cls_get_snapset_seq(cls_method_context_t hctx, uint64_t *snap_seq) { + auto* ox = reinterpret_cast(hctx); + auto obc = ox->get_obc(); + if (!obc->obs.exists || + (obc->obs.oi.is_whiteout() && + obc->ssc->snapset.clones.empty())) { + return -ENOENT; + } + *snap_seq = obc->ssc->snapset.seq; return 0; } diff --git a/src/crimson/osd/object_context.h b/src/crimson/osd/object_context.h index d5165dc23e8..35aae18cda4 100644 --- a/src/crimson/osd/object_context.h +++ b/src/crimson/osd/object_context.h @@ -26,6 +26,8 @@ namespace crimson::common { namespace crimson::osd { class Watch; +struct SnapSetContext; +using SnapSetContextRef = boost::intrusive_ptr; template struct obc_to_hoid { @@ -35,6 +37,26 @@ struct obc_to_hoid { } }; +struct SnapSetContext : + public boost::intrusive_ref_counter +{ + hobject_t oid; + SnapSet snapset; + bool exists = false; + /** + * exists + * + * Because ObjectContext's are cached, we need to be able to express the case + * where the object to which a cached ObjectContext refers does not exist. + * ObjectContext's for yet-to-be-created objects are initialized with exists=false. 
+ * The ObjectContext for a deleted object will have exists set to false until it falls + * out of cache (or another write recreates the object). + */ + explicit SnapSetContext(const hobject_t& o) : + oid(o), exists(false) {} +}; + class ObjectContext : public ceph::common::intrusive_lru_base< ceph::common::intrusive_lru_config< hobject_t, ObjectContext, obc_to_hoid>> @@ -42,7 +64,7 @@ class ObjectContext : public ceph::common::intrusive_lru_base< public: Ref head; // Ref defined as part of ceph::common::intrusive_lru_base ObjectState obs; - std::optional ss; + SnapSetContextRef ssc; // the watch / notify machinery rather stays away from the hot and // frequented paths. std::map is used mostly because of developer's // convenience. @@ -69,18 +91,17 @@ public: const SnapSet &get_ro_ss() const { if (is_head()) { - ceph_assert(ss); - return *ss; + ceph_assert(ssc); + return ssc->snapset; } else { - ceph_assert(head); return head->get_ro_ss(); } } - void set_head_state(ObjectState &&_obs, SnapSet &&_ss) { + void set_head_state(ObjectState &&_obs, SnapSetContextRef &&_ssc) { ceph_assert(is_head()); obs = std::move(_obs); - ss = std::move(_ss); + ssc = std::move(_ssc); } void set_clone_state(ObjectState &&_obs, Ref &&_head) { diff --git a/src/crimson/osd/ops_executer.cc b/src/crimson/osd/ops_executer.cc index ba2ece77bc5..df91f9bdfa1 100644 --- a/src/crimson/osd/ops_executer.cc +++ b/src/crimson/osd/ops_executer.cc @@ -476,6 +476,7 @@ OpsExecuter::call_errorator::future<> OpsExecuter::do_assert_ver( OpsExecuter::interruptible_errorated_future OpsExecuter::execute_op(OSDOp& osd_op) { + head_existed = obc->obs.exists; return do_execute_op(osd_op).handle_error_interruptible( osd_op_errorator::all_same_way([&osd_op](auto e, auto&& e_raw) -> OpsExecuter::osd_op_errorator::future<> { @@ -729,6 +730,108 @@ version_t OpsExecuter::get_last_user_version() const return pg->get_last_user_version(); } +void OpsExecuter::make_writeable(std::vector& log_entries) +{ + const hobject_t& 
soid = obc->obs.oi.soid; + logger().debug("{} {} snapset={} snapc={}", + __func__, soid, + obc->ssc->snapset, snapc); + + // clone? + if (head_existed && // old obs.exists + snapc.snaps.size() && // there are snaps + snapc.snaps[0] > obc->ssc->snapset.seq) { // existing obj is old + + // clone object, the snap field is set to the seq of the SnapContext + // at its creation. + hobject_t coid = soid; + coid.snap = snapc.seq; + + // existing snaps are stored in descending order in snapc, + // cloned_snaps vector will hold all the snaps stored until snapset.seq + const std::vector cloned_snaps = [&] { + auto last = std::find_if( + std::begin(snapc.snaps), std::end(snapc.snaps), + [&](snapid_t snap_id) { return snap_id <= obc->ssc->snapset.seq; }); + return std::vector{std::begin(snapc.snaps), last}; + }(); + + // version + osd_op_params->at_version = pg->next_version(); + + auto snap_oi = prepare_clone(coid); + + // make clone + do_write_op([this, &snap_oi](auto& backend, auto& os, auto& txn) { + return backend.clone(snap_oi, os, clone_obc->obs, txn); + }); + + delta_stats.num_objects++; + if (snap_oi.is_omap()) { + delta_stats.num_objects_omap++; + } + delta_stats.num_object_clones++; + // newsnapset is obc's ssc + obc->ssc->snapset.clones.push_back(coid.snap); + obc->ssc->snapset.clone_size[coid.snap] = obc->obs.oi.size; + obc->ssc->snapset.clone_snaps[coid.snap] = cloned_snaps; + + // clone_overlap should contain an entry for each clone + // (an empty interval_set if there is no overlap) + auto &overlap = obc->ssc->snapset.clone_overlap[coid.snap]; + if (obc->obs.oi.size) { + overlap.insert(0, obc->obs.oi.size); + } + + // log clone + logger().debug("cloning v {} to {} v {} snaps= {} snapset={}", + obc->obs.oi.version, coid, + osd_op_params->at_version, cloned_snaps, obc->ssc->snapset); + + log_entries.emplace_back(pg_log_entry_t::CLONE, + coid, osd_op_params->at_version, + obc->obs.oi.version, obc->obs.oi.user_version, + osd_reqid_t(), + obc->obs.oi.mtime, 0); + 
encode(cloned_snaps, log_entries.back().snaps); + osd_op_params->at_version.version++; + + // TODO: update most recent clone_overlap and usage stats + + if (snapc.seq > obc->ssc->snapset.seq) { + // update snapset with latest snap context + obc->ssc->snapset.seq = snapc.seq; + obc->ssc->snapset.snaps.clear(); + } + logger().debug("{} {} done, snapset={}", + __func__, soid, obc->ssc->snapset); + } +} + +const object_info_t OpsExecuter::prepare_clone( + const hobject_t& coid) +{ + object_info_t static_snap_oi(coid); + static_snap_oi.version = osd_op_params->at_version; + static_snap_oi.prior_version = obc->obs.oi.version; + static_snap_oi.copy_user_bits(obc->obs.oi); + + if (pg->is_primary()) { + // lookup_or_create + auto [c_obc, existed] = + pg->get_shard_services().get_cached_obc( + std::move(coid)); + assert(!existed); + c_obc->obs.oi = static_snap_oi; + c_obc->obs.exists = true; + c_obc->ssc = obc->ssc; + c_obc->head = obc->head; + logger().debug("clone_obc: {}", c_obc->obs.oi); + clone_obc = std::move(c_obc); + } + return static_snap_oi; +} + static inline std::unique_ptr get_pgls_filter( const std::string& type, bufferlist::const_iterator& iter) diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h index 34241606cac..0fdce7cefd7 100644 --- a/src/crimson/osd/ops_executer.h +++ b/src/crimson/osd/ops_executer.h @@ -167,16 +167,20 @@ private: Ref pg; // for the sake of object class ObjectContextRef obc; + ObjectContextRef clone_obc; // if we create a clone const OpInfo& op_info; ceph::static_ptr)> msg; std::optional osd_op_params; bool user_modify = false; + bool head_existed = false; ceph::os::Transaction txn; size_t num_read = 0; ///< count read ops size_t num_write = 0; ///< count update ops + SnapContext snapc; // writer snap context + // this gizmo could be wrapped in std::optional for the sake of lazy // initialization. 
we don't need it for ops that doesn't have effect // TODO: verify the init overhead of chunked_fifo @@ -252,11 +256,13 @@ public: OpsExecuter(Ref pg, ObjectContextRef obc, const OpInfo& op_info, - const MsgT& msg) + const MsgT& msg, + const SnapContext& snapc) : pg(std::move(pg)), obc(std::move(obc)), op_info(op_info), - msg(std::in_place_type_t>{}, &msg) { + msg(std::in_place_type_t>{}, &msg), + snapc(snapc) { } template @@ -279,6 +285,10 @@ public: const std::vector& ops); void fill_op_params_bump_pg_version(); + ObjectContextRef get_obc() const { + return obc; + } + const object_info_t &get_object_info() const { return obc->obs.oi; } @@ -305,6 +315,15 @@ public: } version_t get_last_user_version() const; + + const SnapContext& get_snapc() const { + return snapc; + } + + void make_writeable(std::vector& log_entries); + + const object_info_t prepare_clone( + const hobject_t& coid); }; template @@ -361,6 +380,7 @@ OpsExecuter::flush_changes_n_do_ops_effects( if (want_mutate) { fill_op_params_bump_pg_version(); auto log_entries = prepare_transaction(ops); + make_writeable(log_entries); auto [submitted, all_completed] = std::forward(mut_func)(std::move(txn), std::move(obc), std::move(*osd_op_params), diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index 961d4f9eaff..da72e3d2b56 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -772,9 +772,26 @@ PG::do_osd_ops( if (__builtin_expect(stopping, false)) { throw crimson::common::system_shutdown_exception(); } + SnapContext snapc; + if (op_info.may_write() || op_info.may_cache()) { + // snap + if (get_pgpool().info.is_pool_snaps_mode()) { + // use pool's snapc + snapc = get_pgpool().snapc; + logger().debug("{} using pool's snapc snaps={}", + __func__, snapc.snaps); + + } else { + // client specified snapc + snapc.seq = m->get_snap_seq(); + snapc.snaps = m->get_snaps(); + logger().debug("{} client specified snapc seq={} snaps={}", + __func__, snapc.seq, snapc.snaps); + } + } return 
do_osd_ops_execute>( seastar::make_lw_shared( - Ref{this}, obc, op_info, *m), + Ref{this}, obc, op_info, *m, snapc), m->ops, [this, m, obc, may_write = op_info.may_write(), may_read = op_info.may_read(), rvec = op_info.allows_returnvec()] { @@ -889,11 +906,14 @@ PG::do_osd_ops( do_osd_ops_success_func_t success_func, do_osd_ops_failure_func_t failure_func) { - return seastar::do_with(std::move(msg_params), [=, this, &ops, &op_info] - (auto &msg_params) { + // This overload is generally used for internal client requests, + // use an empty SnapContext. + return seastar::do_with( + std::move(msg_params), + [=, this, &ops, &op_info](auto &msg_params) { return do_osd_ops_execute( seastar::make_lw_shared( - Ref{this}, std::move(obc), op_info, msg_params), + Ref{this}, std::move(obc), op_info, msg_params, SnapContext{}), ops, std::move(success_func), std::move(failure_func)); @@ -950,11 +970,15 @@ std::optional PG::resolve_oid( const SnapSet &ss, const hobject_t &oid) { + logger().debug("{} oid.snap={},head snapset.seq={}", + __func__, oid.snap, ss.seq); if (oid.snap > ss.seq) { + // Because oid.snap > ss.seq, we are trying to read from a snapshot + // taken after the most recent write to this object. Read from head. return oid.get_head(); } else { // which clone would it be? 
- auto clone = std::upper_bound( + auto clone = std::lower_bound( begin(ss.clones), end(ss.clones), oid.snap); if (clone == end(ss.clones)) { @@ -1037,7 +1061,7 @@ PG::with_clone_obc(hobject_t oid, with_obc_func_t&& func) return load_obc_iertr::future<>{crimson::ct_error::object_corrupted::make()}; } auto [clone, existed] = shard_services.get_cached_obc(*coid); - return clone->template with_lock( + return clone->template with_lock( [coid=*coid, existed=existed, head=std::move(head), clone=std::move(clone), func=std::move(func), this]() -> load_obc_iertr::future<> { @@ -1046,7 +1070,7 @@ PG::with_clone_obc(hobject_t oid, with_obc_func_t&& func) logger().debug("with_clone_obc: found {} in cache", coid); } else { logger().debug("with_clone_obc: cache miss on {}", coid); - loaded = clone->template with_promoted_lock( + loaded = clone->template with_promoted_lock( [coid, clone, head, this] { return backend->load_metadata(coid).safe_then_interruptible( [coid, clone=std::move(clone), head=std::move(head)](auto md) mutable { @@ -1092,12 +1116,13 @@ PG::load_head_obc(ObjectContextRef obc) const hobject_t& oid = md->os.oi.soid; logger().debug( "load_head_obc: loaded obs {} for {}", md->os.oi, oid); - if (!md->ss) { + if (!md->ssc) { logger().error( - "load_head_obc: oid {} missing snapset", oid); + "load_head_obc: oid {} missing snapsetcontext", oid); return crimson::ct_error::object_corrupted::make(); + } - obc->set_head_state(std::move(md->os), std::move(*(md->ss))); + obc->set_head_state(std::move(md->os), std::move(md->ssc)); logger().debug( "load_head_obc: returning obc {} for {}", obc->obs.oi, obc->obs.oi.soid); @@ -1117,14 +1142,14 @@ PG::reload_obc(crimson::osd::ObjectContext& obc) const __func__, md->os.oi, obc.get_oid()); - if (!md->ss) { + if (!md->ssc) { logger().error( - "{}: oid {} missing snapset", + "{}: oid {} missing snapsetcontext", __func__, obc.get_oid()); return crimson::ct_error::object_corrupted::make(); } - obc.set_head_state(std::move(md->os), 
std::move(*(md->ss))); + obc.set_head_state(std::move(md->os), std::move(md->ssc)); return load_obc_ertr::now(); }); } diff --git a/src/crimson/osd/pg_backend.cc b/src/crimson/osd/pg_backend.cc index 5adcd112940..7f3fa19664e 100644 --- a/src/crimson/osd/pg_backend.cc +++ b/src/crimson/osd/pg_backend.cc @@ -92,19 +92,35 @@ PGBackend::load_metadata(const hobject_t& oid) oid); return crimson::ct_error::object_corrupted::make(); } - - if (oid.is_head()) { - if (auto ssiter = attrs.find(SS_ATTR); ssiter != attrs.end()) { - bufferlist bl = std::move(ssiter->second); - ret->ss = SnapSet(bl); - } else { - /* TODO: add support for writing out snapsets - logger().error( - "load_metadata: object {} present but missing snapset", - oid); - //return crimson::ct_error::object_corrupted::make(); - */ - ret->ss = SnapSet(); + + if (oid.is_head()) { + // Returning object_corrupted when the object exists and the + // Snapset is either not found or empty. + bool object_corrupted = true; + if (auto ssiter = attrs.find(SS_ATTR); ssiter != attrs.end()) { + object_corrupted = false; + logger().debug( + "load_metadata: object {} and snapset {} present", + oid, ssiter->second); + bufferlist bl = std::move(ssiter->second); + if (bl.length()) { + ret->ssc = new crimson::osd::SnapSetContext(oid.get_snapdir()); + try { + ret->ssc->snapset = SnapSet(bl); + ret->ssc->exists = true; + } catch (const buffer::error&) { + logger().warn("unable to decode SnapSet"); + throw crimson::osd::invalid_argument(); + } + } else { + object_corrupted = true; + } + } + if (object_corrupted) { + logger().error( + "load_metadata: object {} present but missing snapset", + oid); + return crimson::ct_error::object_corrupted::make(); } } @@ -119,7 +135,7 @@ PGBackend::load_metadata(const hobject_t& oid) ObjectState( object_info_t(oid), false), - oid.is_head() ? std::optional(SnapSet()) : std::nullopt + oid.is_head() ? 
(new crimson::osd::SnapSetContext(oid)) : nullptr }); })); } @@ -156,6 +172,18 @@ PGBackend::mutate_object( // TODO: get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr)); txn.setattr(coll->get_cid(), ghobject_t{obc->obs.oi.soid}, OI_ATTR, osv); } + + // snapset + if (obc->obs.oi.soid.snap == CEPH_NOSNAP) { + logger().debug("final snapset {} in {}", + obc->ssc->snapset, obc->obs.oi.soid); + ceph::bufferlist bss; + encode(obc->ssc->snapset, bss); + txn.setattr(coll->get_cid(), ghobject_t{obc->obs.oi.soid}, SS_ATTR, bss); + obc->ssc->exists = true; + } else { + logger().debug("no snapset (this is a clone)"); + } } else { // reset cached ObjectState without enforcing eviction obc->obs.oi = object_info_t(obc->obs.oi.soid); @@ -1105,6 +1133,26 @@ PGBackend::rm_xattr( return rm_xattr_iertr::now(); } +void PGBackend::clone( + object_info_t& snap_oi, + ObjectState& os, + ObjectState& d_os, + ceph::os::Transaction& txn) +{ + // Prepend the cloning operation to txn + ceph::os::Transaction c_txn; + c_txn.clone(coll->get_cid(), ghobject_t{os.oi.soid}, ghobject_t{d_os.oi.soid}); + // Operations will be removed from txn while appending + c_txn.append(txn); + txn = std::move(c_txn); + + ceph::bufferlist bv; + snap_oi.encode_no_oid(bv, CEPH_FEATURES_ALL); + + txn.setattr(coll->get_cid(), ghobject_t{d_os.oi.soid}, OI_ATTR, bv); + txn.rmattr(coll->get_cid(), ghobject_t{d_os.oi.soid}, SS_ATTR); +} + using get_omap_ertr = crimson::os::FuturizedStore::read_errorator::extend< crimson::ct_error::enodata>; diff --git a/src/crimson/osd/pg_backend.h b/src/crimson/osd/pg_backend.h index 076da7d3843..34ea12652af 100644 --- a/src/crimson/osd/pg_backend.h +++ b/src/crimson/osd/pg_backend.h @@ -260,6 +260,11 @@ public: ObjectState& os, const OSDOp& osd_op, ceph::os::Transaction& trans); + void clone( + object_info_t& snap_oi, + ObjectState& os, + ObjectState& d_os, + ceph::os::Transaction& trans); interruptible_future stat( CollectionRef c, const ghobject_t& oid) const; @@ -354,7 +359,7 @@ 
protected: public: struct loaded_object_md_t { ObjectState os; - std::optional ss; + crimson::osd::SnapSetContextRef ssc; using ref = std::unique_ptr; }; load_metadata_iertr::future