From e6498eeed798ae10711e14b70d2c67e97b5a70a0 Mon Sep 17 00:00:00 2001 From: "Adam C. Emerson" Date: Mon, 15 Aug 2022 17:19:06 -0400 Subject: [PATCH] rgw: Document `RGWObjVersionTracker` Give some guidelines as to what it's for and how to use it. Signed-off-by: Adam C. Emerson --- src/rgw/rgw_common.h | 115 ++++++++++++++++++++++++++++++++++++++----- src/rgw/rgw_rados.cc | 12 ++--- 2 files changed, 109 insertions(+), 18 deletions(-) diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h index 4bc18b2413630..59431df401359 100644 --- a/src/rgw/rgw_common.h +++ b/src/rgw/rgw_common.h @@ -996,39 +996,130 @@ struct rgw_bucket_placement { void dump(Formatter *f) const; }; +/// `RGWObjVersionTracker` +/// ====================== +/// +/// What and why is this? +/// --------------------- +/// +/// This is a wrapper around `cls_version` functionality. If two RGWs +/// (or two non-synchronized threads in the same RGW) are accessing +/// the same object, they may race and overwrite each other's work. +/// +/// This class solves this issue by tracking and recording an object's +/// version in the extended attributes. Operations fail with +/// `ECANCELED` if the version is not what we expect. +/// +/// How to Use It +/// ------------- +/// +/// When preparing a read operation, call `prepare_op_for_read`. +/// For a write, call `prepare_op_for_write` when preparing the +/// operation, and `apply_write` after it succeeds. +/// +/// Adhere to the following guidelines: +/// +/// - Each RGWObjVersionTracker should be used with only one object. +/// +/// - If you receive `ECANCELED`, throw away whatever you were doing +/// based on the content of the versioned object, re-read, and +/// restart as appropriate. +/// +/// - If one code path uses RGWObjVersionTracker, then they all +/// should. In a situation where a writer should unconditionally +/// overwrite an object, call `generate_new_write_ver` on a +/// default-constructed `RGWObjVersionTracker`. 
+/// +/// - If we have a version from a previous read, we will check against +/// it and fail the read if it doesn't match. Thus, if we want to +/// re-read a new version of the object, call `clear()` on the +/// `RGWObjVersionTracker`. +/// +/// - This type is not thread-safe. Every thread must have its own +/// instance. +/// struct RGWObjVersionTracker { - obj_version read_version; - obj_version write_version; - - obj_version *version_for_read() { + obj_version read_version; ///< The version read from an object. If + ///< set, this value is used to check the + ///< stored version. + obj_version write_version; ///< Set the object to this version on + ///< write, if set. + + /// Pointer to the read version. + obj_version* version_for_read() { return &read_version; } - obj_version *version_for_write() { + /// If we have a write version, return a pointer to it. Otherwise + /// return null. This is used in `prepare_op_for_write` to treat the + /// `write_version` as effectively an `optional` type. + obj_version* version_for_write() { if (write_version.ver == 0) - return NULL; + return nullptr; return &write_version; } - obj_version *version_for_check() { + /// If `read_version` is non-empty, return a pointer to it, otherwise + /// null. This is used internally by `prepare_op_for_read` and + /// `prepare_op_for_write` to treat the `read_version` as + /// effectively an `optional` type. + obj_version* version_for_check() { if (read_version.ver == 0) - return NULL; + return nullptr; return &read_version; } - void prepare_op_for_read(librados::ObjectReadOperation *op); - void prepare_op_for_write(librados::ObjectWriteOperation *op); - + /// This function is to be called on any read operation. If we have + /// a non-empty `read_version`, assert on the OSD that the object + /// has the same version. Also reads the version into `read_version`. + /// + /// This function is defined in `rgw_rados.cc` rather than `rgw_common.cc`. 
+ void prepare_op_for_read(librados::ObjectReadOperation* op); + + /// This function is to be called on any write operation. If we have + /// a non-empty `read_version`, assert on the OSD that the object + /// has the same version. If we have a non-empty `write_version`, + /// set the object's version to it. Otherwise increment the version on the OSD. + /// + /// This function is defined in `rgw_rados.cc` rather than + /// `rgw_common.cc`. + void prepare_op_for_write(librados::ObjectWriteOperation* op); + + /// This function is to be called after the completion of any write + /// operation on which `prepare_op_for_write` was called. If we did + /// not set the write version explicitly, it increments + /// `read_version`. If we did, it sets `read_version` to + /// `write_version`. In either case, it clears `write_version`. + /// + /// RADOS write operations, at least those not using the relatively + /// new RETURNVEC flag, cannot return more information than an error + /// code. Thus, write operations can't simply fill in the read + /// version the way read operations can, so `prepare_op_for_write` + /// instructs the OSD to increment the object's version as stored in RADOS and + /// `apply_write` increments our `read_version` in RAM. + /// + /// This function is defined in `rgw_rados.cc` rather than + /// `rgw_common.cc`. void apply_write(); + /// Clear `read_version` and `write_version`, making the instance + /// identical to a default-constructed instance. void clear() { read_version = obj_version(); write_version = obj_version(); } - void generate_new_write_ver(CephContext *cct); + /// Set `write_version` to a new, unique version. + /// + /// An `obj_version` contains an opaque, random tag and a + /// sequence. If the tags of two `obj_version`s don't match, the + /// versions are unordered and unequal. 
This function creates a + /// version with a new tag, ensuring that any other process + /// operating on the object will receive `ECANCELED` and will know + /// to re-read the object and restart whatever it was doing. + void generate_new_write_ver(CephContext* cct); }; inline std::ostream& operator<<(std::ostream& out, const obj_version &v) diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index f2a76cf3d4270..1ece7bf8bc8e7 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -167,9 +167,9 @@ rgw_raw_obj rgw_obj_select::get_raw_obj(rgw::sal::RadosStore* store) const return raw_obj; } -void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op) +void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation* op) { - obj_version *check_objv = version_for_check(); + obj_version* check_objv = version_for_check(); if (check_objv) { cls_version_check(*op, *check_objv, VER_COND_EQ); @@ -180,8 +180,8 @@ void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op) void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op) { - obj_version *check_objv = version_for_check(); - obj_version *modify_version = version_for_write(); + obj_version* check_objv = version_for_check(); + obj_version* modify_version = version_for_write(); if (check_objv) { cls_version_check(*op, *check_objv, VER_COND_EQ); @@ -263,10 +263,10 @@ void RGWObjectCtx::invalidate(const rgw_obj& obj) { } } -void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct) +void RGWObjVersionTracker::generate_new_write_ver(CephContext* cct) { + static constexpr auto TAG_LEN = 24; write_version.ver = 1; -#define TAG_LEN 24 write_version.tag.clear(); append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN); -- 2.39.5