From ef72fc736d05b1a65b2b61729eaf96eed6f03ea3 Mon Sep 17 00:00:00 2001
From: Kefu Chai <kchai@redhat.com>
Date: Mon, 14 Sep 2020 15:23:17 +0800
Subject: [PATCH] crimson: use tri_mutex for guarding ObjectContext

before this change, a seastar::shared_mutex, a RWState and a
shared_promise are used for tracking the consumers of ObjectContext.
all of the consumers are put into waiters if the predicate function
evaluates to "false", and are awoken if the predicate function
evaluates to "true" afterwards in a polling loop waiting on the
shared_promise, which is in turn fulfilled once the last consumer of
the given category relinquishes the lock.

this approach has a couple of issues:

* it is heavyweight. seastar::shared_mutex already tracks each of the
  waiters' continuations using a separate promise<>, and it does try to
  reschedule them once a given consumer releases the last lock. so it's
  like a design of a customized shared_mutex over a shared_mutex.
* it is complicated. 3 variables for tracking the different consumers
  of ObjectContext.

in this change,

* `tri_mutex` is introduced as a variant of the original
  `seastar::shared_mutex` to track two different shared users in
  addition to an exclusive user.
* replace `shared_mutex` with `tri_mutex` in `ObjectContext`, to
  simplify the design.
* move recovery_read_marker into `ObjectContext`. assuming all pending
  actions will be added as a waiter for the related object context
  before they acquire the lock.
Signed-off-by: Kefu Chai --- src/crimson/CMakeLists.txt | 3 +- src/crimson/common/tri_mutex.cc | 131 +++++++++++++++++++++++++++++++ src/crimson/common/tri_mutex.h | 57 ++++++++++++++ src/crimson/osd/object_context.h | 105 +++++++++---------------- 4 files changed, 226 insertions(+), 70 deletions(-) create mode 100644 src/crimson/common/tri_mutex.cc create mode 100644 src/crimson/common/tri_mutex.h diff --git a/src/crimson/CMakeLists.txt b/src/crimson/CMakeLists.txt index 35ff74498a5f6..d346e942f0869 100644 --- a/src/crimson/CMakeLists.txt +++ b/src/crimson/CMakeLists.txt @@ -21,7 +21,8 @@ set(crimson_common_srcs common/formatter.cc common/perf_counters_collection.cc common/log.cc - common/throttle.cc) + common/throttle.cc + common/tri_mutex.cc) # the specialized version of ceph-common, where # - the logging is sent to Seastar backend diff --git a/src/crimson/common/tri_mutex.cc b/src/crimson/common/tri_mutex.cc new file mode 100644 index 0000000000000..247f37436777b --- /dev/null +++ b/src/crimson/common/tri_mutex.cc @@ -0,0 +1,131 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*- +// vim: ts=8 sw=2 smarttab + +#include "tri_mutex.h" + +seastar::future<> tri_mutex::lock_for_read() +{ + if (try_lock_for_read()) { + return seastar::make_ready_future<>(); + } + waiters.emplace_back(seastar::promise<>(), type_t::read); + return waiters.back().pr.get_future(); +} + +bool tri_mutex::try_lock_for_read() noexcept +{ + if (!writers && !exclusively_used && waiters.empty()) { + ++readers; + return true; + } else { + return false; + } +} + +void tri_mutex::unlock_for_read() +{ + assert(readers > 0); + if (--readers == 0) { + wake(); + } +} + +seastar::future<> tri_mutex::lock_for_write(bool greedy) +{ + if (try_lock_for_write(greedy)) { + return seastar::make_ready_future<>(); + } + waiters.emplace_back(seastar::promise<>(), type_t::write); + return waiters.back().pr.get_future(); +} + +bool tri_mutex::try_lock_for_write(bool greedy) noexcept +{ + 
if (!readers && !exclusively_used) {
+    if (greedy || waiters.empty()) {
+      ++writers;
+      return true;
+    }
+  }
+  return false;
+}
+
+void tri_mutex::unlock_for_write()
+{
+  assert(writers > 0);
+  if (--writers == 0) {
+    wake();
+  }
+}
+
+// for exclusive users
+seastar::future<> tri_mutex::lock_for_excl()
+{
+  if (try_lock_for_excl()) {
+    return seastar::make_ready_future<>();
+  }
+  waiters.emplace_back(seastar::promise<>(), type_t::exclusive);
+  return waiters.back().pr.get_future();
+}
+
+bool tri_mutex::try_lock_for_excl() noexcept
+{
+  if (!readers && !writers && !exclusively_used) {
+    exclusively_used = true;
+    return true;
+  } else {
+    return false;
+  }
+}
+
+void tri_mutex::unlock_for_excl()
+{
+  assert(exclusively_used);
+  exclusively_used = false;
+  wake();
+}
+
+bool tri_mutex::is_acquired() const
+{
+  if (readers) {
+    return true;
+  } else if (writers) {
+    return true;
+  } else if (exclusively_used) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+void tri_mutex::wake()
+{
+  assert(!readers && !writers && !exclusively_used);
+  type_t type = type_t::none;
+  while (!waiters.empty()) {
+    auto& waiter = waiters.front();
+    if (type == type_t::exclusive) {
+      break;
+    } else if (type == type_t::none) {
+      type = waiter.type;
+    } else if (type != waiter.type) {
+      // to be woken in the next batch
+      break;
+    }
+    switch (type) {
+    case type_t::read:
+      ++readers;
+      break;
+    case type_t::write:
+      ++writers;
+      break;
+    case type_t::exclusive:
+      exclusively_used = true;
+      break;
+    default:
+      assert(0);
+    }
+    waiter.pr.set_value();
+    waiters.pop_front();
+  }
+}
diff --git a/src/crimson/common/tri_mutex.h b/src/crimson/common/tri_mutex.h
new file mode 100644
index 0000000000000..63ca81a154234
--- /dev/null
+++ b/src/crimson/common/tri_mutex.h
@@ -0,0 +1,57 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:nil -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/circular_buffer.hh>
+
+/// shared/exclusive mutual exclusion
+///
+/// similar to 
seastar::shared_mutex, but instead of two kinds of waiters,
+/// tri_mutex keeps track of three kinds of them:
+/// - readers
+/// - writers
+/// - exclusive users
+/// and unlike shared_mutex, tri_mutex has two kinds of shared users of the lock:
+/// - readers
+/// - writers, which are not mutually exclusive
+/// the exclusive users are like the writers in shared_mutex.
+class tri_mutex {
+public:
+  tri_mutex() = default;
+  // for shared readers
+  seastar::future<> lock_for_read();
+  bool try_lock_for_read() noexcept;
+  void unlock_for_read();
+  // for shared writers
+  seastar::future<> lock_for_write(bool greedy);
+  bool try_lock_for_write(bool greedy) noexcept;
+  void unlock_for_write();
+  // for exclusive users
+  seastar::future<> lock_for_excl();
+  bool try_lock_for_excl() noexcept;
+  void unlock_for_excl();
+
+  bool is_acquired() const;
+
+private:
+  void wake();
+  unsigned readers = 0;
+  unsigned writers = 0;
+  bool exclusively_used = false;
+  enum class type_t : uint8_t {
+    read,
+    write,
+    exclusive,
+    none,
+  };
+  struct waiter_t {
+    waiter_t(seastar::promise<>&& pr, type_t type)
+      : pr(std::move(pr)), type(type)
+    {}
+    seastar::promise<> pr;
+    type_t type;
+  };
+  seastar::circular_buffer<waiter_t> waiters;
+};
diff --git a/src/crimson/osd/object_context.h b/src/crimson/osd/object_context.h
index 80b64cd553595..3bd6073d9e624
--- a/src/crimson/osd/object_context.h
+++ b/src/crimson/osd/object_context.h
@@ -11,6 +11,7 @@
 #include "common/intrusive_lru.h"
 #include "osd/object_state.h"
+#include "crimson/common/tri_mutex.h"
 #include "crimson/osd/osd_operation.h"
 
 namespace ceph {
@@ -84,33 +85,8 @@ public:
   }
 
 private:
-  RWState rwstate;
-  seastar::shared_mutex wait_queue;
-  std::optional<seastar::shared_promise<>> wake;
-
-  template <typename F>
-  seastar::future<> with_queue(F &&f) {
-    return wait_queue.lock().then([this, f=std::move(f)] {
-      ceph_assert(!wake);
-      return seastar::repeat([this, f=std::move(f)]() {
-        if (f()) {
-          wait_queue.unlock();
-          return seastar::make_ready_future<seastar::stop_iteration>(
seastar::stop_iteration::yes);
-        } else {
-          rwstate.inc_waiters();
-          wake = seastar::shared_promise<>();
-          return wake->get_shared_future().then([this, f=std::move(f)] {
-            wake = std::nullopt;
-            rwstate.dec_waiters(1);
-            return seastar::make_ready_future<seastar::stop_iteration>(
-              seastar::stop_iteration::no);
-          });
-        }
-      });
-    });
-  }
-
+  tri_mutex rwlock;
+  bool recovery_read_marker = false;
 
   const char *get_type_name() const final {
     return "ObjectContext";
@@ -122,23 +98,18 @@ private:
     Operation *op, LockF &&lockf) {
     return op->with_blocking_future(
-      make_blocking_future(with_queue(std::forward<LockF>(lockf))));
+      make_blocking_future(std::forward<LockF>(lockf)));
   }
-  template <typename UnlockF>
-  void put_lock(
-    UnlockF &&unlockf) {
-    if (unlockf() && wake) wake->set_value();
-  }
 public:
   seastar::future<> get_lock_type(Operation *op, RWState::State type) {
     switch (type) {
     case RWState::RWWRITE:
-      return get_lock(op, [this] { return rwstate.get_write_lock(); });
+      return get_lock(op, rwlock.lock_for_write(false));
     case RWState::RWREAD:
-      return get_lock(op, [this] { return rwstate.get_read_lock(); });
+      return get_lock(op, rwlock.lock_for_read());
     case RWState::RWEXCL:
-      return get_lock(op, [this] { return rwstate.get_excl_lock(); });
+      return get_lock(op, rwlock.lock_for_excl());
     case RWState::RWNONE:
       return seastar::make_ready_future<>();
     default:
@@ -150,11 +121,11 @@ public:
   void put_lock_type(RWState::State type) {
     switch (type) {
     case RWState::RWWRITE:
-      return put_lock([this] { return rwstate.put_write(); });
+      return rwlock.unlock_for_write();
     case RWState::RWREAD:
-      return put_lock([this] { return rwstate.put_read(); });
+      return rwlock.unlock_for_read();
     case RWState::RWEXCL:
-      return put_lock([this] { return rwstate.put_excl(); });
+      return rwlock.unlock_for_excl();
     case RWState::RWNONE:
       return;
     default:
@@ -165,66 +136,62 @@ public:
   void degrade_excl_to(RWState::State type) {
     // assume we already hold an excl lock
-    bool put = rwstate.put_excl();
+    rwlock.unlock_for_excl();
     bool success = false;
     switch 
(type) {
     case RWState::RWWRITE:
-      success = rwstate.get_write_lock();
+      success = rwlock.try_lock_for_write(false);
       break;
     case RWState::RWREAD:
-      success = rwstate.get_read_lock();
+      success = rwlock.try_lock_for_read();
       break;
     case RWState::RWEXCL:
-      success = rwstate.get_excl_lock();
+      success = rwlock.try_lock_for_excl();
       break;
     case RWState::RWNONE:
       success = true;
       break;
     default:
-      ceph_abort_msg("invalid lock type");
+      assert(0 == "invalid lock type");
       break;
     }
     ceph_assert(success);
-    if (put && wake) {
-      wake->set_value();
-    }
   }
 
-  bool empty() const { return rwstate.empty(); }
+  bool empty() const {
+    return !rwlock.is_acquired();
+  }
+  bool is_request_pending() const {
+    return rwlock.is_acquired();
+  }
 
   template <typename F>
   seastar::future<> get_write_greedy(Operation *op) {
-    return get_lock(op, [this] { return rwstate.get_write_lock(true); });
+    return get_lock(op, [this] {
+      return rwlock.lock_for_write(true);
+    });
   }
-  bool try_get_read_lock() {
-    return rwstate.get_read_lock();
-  }
-  void drop_read() {
-    return put_lock_type(RWState::RWREAD);
-  }
   bool get_recovery_read() {
-    return rwstate.get_recovery_read();
+    if (rwlock.try_lock_for_read()) {
+      recovery_read_marker = true;
+      return true;
+    } else {
+      return false;
+    }
   }
   seastar::future<> wait_recovery_read() {
-    if (rwstate.get_recovery_read()) {
-      return seastar::make_ready_future<>();
-    }
-    return with_queue([this] {
-      return rwstate.get_recovery_read();
+    return rwlock.lock_for_read().then([this] {
+      recovery_read_marker = true;
     });
   }
   void drop_recovery_read() {
-    ceph_assert(rwstate.recovery_read_marker);
-    drop_read();
-    rwstate.recovery_read_marker = false;
+    assert(recovery_read_marker);
+    rwlock.unlock_for_read();
+    recovery_read_marker = false;
   }
   bool maybe_get_excl() {
-    return rwstate.get_excl_lock();
-  }
-
-  bool is_request_pending() const {
-    return !rwstate.empty();
+    return rwlock.try_lock_for_excl();
   }
 };
 using ObjectContextRef = ObjectContext::Ref;
-- 
2.39.5