From: Radoslaw Zarzynski Date: Wed, 24 Nov 2021 15:41:22 +0000 (+0000) Subject: crimson/os: fix a shutdown-related race condition in AlienStore. X-Git-Tag: v17.1.0~361^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=5a7fc07933c908d339b5f2146088c8ec7003aa41;p=ceph.git crimson/os: fix a shutdown-related race condition in AlienStore. This is supposed to tackle crashes like the following one: ``` INFO 2021-11-17 16:33:12,048 [shard 0] alienstore - stat ... DEBUG 2021-11-17 16:33:12,789 [shard 0] ms - [osd.2(hb_front) v2:0.0.0.0:6813/34383 >> osd.0 v2:127.0.0.1:6809/34293@56992] closed! DEBUG 2021-11-17 16:33:12,791 [shard 0] ms - [osd.2(hb_front) v2:0.0.0.0:6813/34383@53359 >> osd.7 v2:0.0.0.0:6815/34448] closed! INFO 2021-11-17 16:33:12,795 [shard 0] alienstore - umount INFO 2021-11-17 16:33:12,804 [shard 0] osd - osd.2: committed_osd_maps(23, 62) ceph-osd: /home/jenkins-build/build/workspace/ceph-dev-new-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.0.0-8896-gf35358f1/rpm/el8/BUILD/ceph-17.0.0-8896-gf35358f1/src/rocksdb/db/db_impl/db_impl.cc:1615: rocksdb::Status rocksdb::DBImpl::GetImpl(const rocksdb::ReadOptions&, const rocksdb::Slice&, rocksdb::DBImpl::GetImplOptions&): Assertion `get_impl_options.column_family' failed. Aborting. Backtrace: INFO 2021-11-17 16:33:13,542 [shard 0] ms - [osd.2(cluster) v2:172.21.15.17:6804/34383 >> osd.3 v2:172.21.15.17:6806/34387@50001] execute_ready(): fault at READY with nothing to send, going to STANDBY -- std::system_error (error crimson::net:4, read eof) DEBUG 2021-11-17 16:33:13,542 [shard 0] ms - [osd.2(cluster) v2:172.21.15.17:6804/34383 >> osd.3 v2:172.21.15.17:6806/34387@50001] TRIGGER STANDBY, was READY 0# gsignal in /lib64/libc.so.6 1# abort in /lib64/libc.so.6 2# 0x00007F12FA13FC89 in /lib64/libc.so.6 3# 0x00007F12FA14DA76 in /lib64/libc.so.6 4# rocksdb::DBImpl::GetImpl(rocksdb::ReadOptions const&, rocksdb::Slice const&, rocksdb::DBImpl::GetImplOptions&) in ceph-osd 5# rocksdb::DBImpl::Get(rocksdb::ReadOptions const&, rocksdb::ColumnFamilyHandle*, rocksdb::Slice const&, rocksdb::PinnableSlice*, std::__cxx11::basic_string, std::allocator >*) in ceph-osd 6# rocksdb::DBImpl::Get(rocksdb::ReadOptions const&, rocksdb::ColumnFamilyHandle*, rocksdb::Slice const&, rocksdb::PinnableSlice*) in ceph-osd 7# RocksDBStore::get(std::__cxx11::basic_string, std::allocator > const&, char const*, unsigned long, ceph::buffer::v15_2_0::list*) in ceph-osd 8# BlueStore::Collection::get_onode(ghobject_t const&, bool, bool) in ceph-osd 9# BlueStore::read(boost::intrusive_ptr&, ghobject_t const&, unsigned long, unsigned long, ceph::buffer::v15_2_0::list&, unsigned int) in ceph-osd 10# 0x00005584E516577F in ceph-osd 11# crimson::os::ThreadPool::loop(std::chrono::duration >, unsigned long) in ceph-osd 12# 0x00005584E54E71E9 in ceph-osd 13# 0x00007F12FB861BA3 in /lib64/libstdc++.so.6 14# 0x00007F12FBB3C14A in /lib64/libpthread.so.0 15# clone in /lib64/libc.so.6 Content of /proc/self/maps: 7fff7000-8fff7000 rw-p 00000000 00:00 0 ``` The problem happened in RocksDB: ```cpp Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, GetImplOptions& get_impl_options) { assert(get_impl_options.value != nullptr || get_impl_options.merge_operands != nullptr); assert(get_impl_options.column_family); // ... ``` ```cpp tatus DBImpl::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value, std::string* timestamp) { GetImplOptions get_impl_options; get_impl_options.column_family = column_family; get_impl_options.value = value; get_impl_options.timestamp = timestamp; Status s = GetImpl(read_options, key, get_impl_options); return s; } ``` ```cpp int RocksDBStore::get( const string& prefix, const char *key, size_t keylen, bufferlist *out) { ceph_assert(out && (out->length() == 0)); utime_t start = ceph_clock_now(); int r = 0; rocksdb::PinnableSlice value; rocksdb::Status s; auto cf = get_cf_handle(prefix, key, keylen); if (cf) { s = db->Get(rocksdb::ReadOptions(), cf, rocksdb::Slice(key, keylen), &value); } else { string k; combine_strings(prefix, key, keylen, &k); s = db->Get(rocksdb::ReadOptions(), default_cf, rocksdb::Slice(k), &value); } // ... ``` It may be explained by a race condition between `AlienStore::stat()` and `AlienStore::umount()`. Umounting a BlueStore means nullifying `default_cf`: ```cpp void RocksDBStore::close() { // ... default_cf = nullptr; delete db; db = nullptr; } ``` ``` INFO 2021-11-17 16:33:12,048 [shard 0] alienstore - stat ... INFO 2021-11-17 16:33:12,795 [shard 0] alienstore - umount INFO 2021-11-17 16:33:12,804 [shard 0] osd - osd.2: committed_osd_maps(23, 62) ``` Although `AlienStore` synchronizes `umount()` and `do_transaction()` with a `seastar::gate`, it lacks similar mechanism for read-like operations. Signed-off-by: Radoslaw Zarzynski --- diff --git a/src/crimson/os/alienstore/alien_store.cc b/src/crimson/os/alienstore/alien_store.cc index 09df2e6f70450..3790b7d06041d 100644 --- a/src/crimson/os/alienstore/alien_store.cc +++ b/src/crimson/os/alienstore/alien_store.cc @@ -149,7 +149,7 @@ seastar::future<> AlienStore::umount() // not really started yet return seastar::now(); } - return transaction_gate.close().then([this] { + return op_gate.close().then([this] { return tp->submit([this] { return store->umount(); }); @@ -184,8 +184,8 @@ AlienStore::list_objects(CollectionRef ch, { logger().debug("{}", __func__); assert(tp); - return seastar::do_with(std::vector(), ghobject_t(), - [=] (auto &objects, auto &next) { + return do_with_op_gate(std::vector(), ghobject_t(), + [=] (auto &objects, auto &next) { objects.reserve(limit); return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, &objects, &next] { @@ -257,7 +257,7 @@ seastar::future> AlienStore::list_collections() logger().debug("{}", __func__); assert(tp); - return seastar::do_with(std::vector{}, [=] (auto &ls) { + return do_with_op_gate(std::vector{}, [=] (auto &ls) { return tp->submit([this, &ls] { return store->list_collections(ls); }).then([&ls] (int r) { @@ -276,7 +276,7 @@ AlienStore::read(CollectionRef ch, { logger().debug("{}", __func__); assert(tp); - return seastar::do_with(ceph::bufferlist{}, [=] (auto &bl) { + return do_with_op_gate(ceph::bufferlist{}, [=] (auto &bl) { return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, &bl] { auto c = static_cast(ch.get()); return store->read(c->collection, oid, offset, len, bl, op_flags); @@ -287,7 +287,7 @@ AlienStore::read(CollectionRef ch, return crimson::ct_error::input_output_error::make(); } else { return read_errorator::make_ready_future( - std::move(bl)); + std::move(bl)); } }); }); @@ -301,7 +301,7 @@ AlienStore::readv(CollectionRef ch, { logger().debug("{}", __func__); assert(tp); - return seastar::do_with(ceph::bufferlist{}, + return do_with_op_gate(ceph::bufferlist{}, [this, ch, oid, &m, op_flags](auto& bl) { return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [this, ch, oid, &m, op_flags, &bl] { @@ -327,8 +327,8 @@ AlienStore::get_attr(CollectionRef ch, { logger().debug("{}", __func__); assert(tp); - return seastar::do_with(ceph::bufferlist{}, std::string{name}, - [=] (auto &value, const auto& name) { + return do_with_op_gate(ceph::bufferlist{}, std::string{name}, + [=] (auto &value, const auto& name) { return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, &value, &name] { // XXX: `name` isn't a `std::string_view` anymore! it had to be converted // to `std::string` for the sake of extending life-time not only of @@ -355,7 +355,7 @@ AlienStore::get_attrs(CollectionRef ch, { logger().debug("{}", __func__); assert(tp); - return seastar::do_with(attrs_t{}, [=] (auto &aset) { + return do_with_op_gate(attrs_t{}, [=] (auto &aset) { return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, &aset] { auto c = static_cast(ch.get()); const auto r = store->getattrs(c->collection, oid, aset); @@ -377,7 +377,7 @@ auto AlienStore::omap_get_values(CollectionRef ch, { logger().debug("{}", __func__); assert(tp); - return seastar::do_with(omap_values_t{}, [=] (auto &values) { + return do_with_op_gate(omap_values_t{}, [=] (auto &values) { return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, &values] { auto c = static_cast(ch.get()); return store->omap_get_values(c->collection, oid, keys, @@ -401,7 +401,7 @@ auto AlienStore::omap_get_values(CollectionRef ch, { logger().debug("{} with_start", __func__); assert(tp); - return seastar::do_with(omap_values_t{}, [=] (auto &values) { + return do_with_op_gate(omap_values_t{}, [=] (auto &values) { return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, &values] { auto c = static_cast(ch.get()); return store->omap_get_values(c->collection, oid, start, @@ -427,11 +427,10 @@ seastar::future<> AlienStore::do_transaction(CollectionRef ch, logger().debug("{}", __func__); auto id = seastar::this_shard_id(); auto done = seastar::promise<>(); - return seastar::do_with( + return do_with_op_gate( std::move(txn), std::move(done), [this, ch, id] (auto &txn, auto &done) { - return seastar::with_gate(transaction_gate, [this, ch, id, &txn, &done] { AlienCollection* alien_coll = static_cast(ch.get()); return alien_coll->with_lock([this, ch, id, &txn, &done] { Context *crimson_wrapper = @@ -448,7 +447,6 @@ seastar::future<> AlienStore::do_transaction(CollectionRef ch, assert(r == 0); return done.get_future(); }); - }); }); } @@ -456,8 +454,10 @@ seastar::future<> AlienStore::inject_data_error(const ghobject_t& o) { logger().debug("{}", __func__); assert(tp); - return tp->submit([=] { - return store->inject_data_error(o); + return seastar::with_gate(op_gate, [=] { + return tp->submit([=] { + return store->inject_data_error(o); + }); }); } @@ -465,8 +465,10 @@ seastar::future<> AlienStore::inject_mdata_error(const ghobject_t& o) { logger().debug("{}", __func__); assert(tp); - return tp->submit([=] { - return store->inject_mdata_error(o); + return seastar::with_gate(op_gate, [=] { + return tp->submit([=] { + return store->inject_mdata_error(o); + }); }); } @@ -475,11 +477,13 @@ seastar::future<> AlienStore::write_meta(const std::string& key, { logger().debug("{}", __func__); assert(tp); - return tp->submit([=] { - return store->write_meta(key, value); - }).then([] (int r) { - assert(r == 0); - return seastar::make_ready_future<>(); + return seastar::with_gate(op_gate, [=] { + return tp->submit([=] { + return store->write_meta(key, value); + }).then([] (int r) { + assert(r == 0); + return seastar::make_ready_future<>(); + }); }); } @@ -488,20 +492,22 @@ AlienStore::read_meta(const std::string& key) { logger().debug("{}", __func__); assert(tp); - return tp->submit([this, key] { - std::string value; - int r = store->read_meta(key, &value); - if (r > 0) { - value.resize(r); - boost::algorithm::trim_right_if(value, - [] (unsigned char c) {return isspace(c);}); - } else { - value.clear(); - } - return std::make_pair(r, value); - }).then([] (auto entry) { - return seastar::make_ready_future>( - std::move(entry)); + return seastar::with_gate(op_gate, [this, key] { + return tp->submit([this, key] { + std::string value; + int r = store->read_meta(key, &value); + if (r > 0) { + value.resize(r); + boost::algorithm::trim_right_if(value, + [] (unsigned char c) {return isspace(c);}); + } else { + value.clear(); + } + return std::make_pair(r, value); + }).then([] (auto entry) { + return seastar::make_ready_future>( + std::move(entry)); + }); }); } @@ -515,7 +521,7 @@ seastar::future AlienStore::stat() const { logger().info("{}", __func__); assert(tp); - return seastar::do_with(store_statfs_t{}, [this] (store_statfs_t &st) { + return do_with_op_gate(store_statfs_t{}, [this] (store_statfs_t &st) { return tp->submit([this, &st] { return store->statfs(&st, nullptr); }).then([&st] (int r) { @@ -536,7 +542,7 @@ seastar::future AlienStore::stat( const ghobject_t& oid) { assert(tp); - return seastar::do_with((struct stat){}, [this, ch, oid](auto& st) { + return do_with_op_gate((struct stat){}, [this, ch, oid](auto& st) { return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [this, ch, oid, &st] { auto c = static_cast(ch.get()); store->stat(c->collection, oid, &st); @@ -550,7 +556,7 @@ auto AlienStore::omap_get_header(CollectionRef ch, -> read_errorator::future { assert(tp); - return seastar::do_with(ceph::bufferlist(), [=](auto& bl) { + return do_with_op_gate(ceph::bufferlist(), [=](auto& bl) { return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, &bl] { auto c = static_cast(ch.get()); return store->omap_get_header(c->collection, oid, &bl); @@ -575,7 +581,7 @@ seastar::future> AlienStore::fiemap( uint64_t len) { assert(tp); - return seastar::do_with(std::map(), [=](auto& destmap) { + return do_with_op_gate(std::map(), [=](auto& destmap) { return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, &destmap] { auto c = static_cast(ch.get()); return store->fiemap(c->collection, oid, off, len, destmap); diff --git a/src/crimson/os/alienstore/alien_store.h b/src/crimson/os/alienstore/alien_store.h index 0557dae77c28d..3c4dfa154ae88 100644 --- a/src/crimson/os/alienstore/alien_store.h +++ b/src/crimson/os/alienstore/alien_store.h @@ -119,6 +119,18 @@ public: const ghobject_t& oid) final; private: + template + auto do_with_op_gate(Args&&... args) const { + return seastar::with_gate(op_gate, + // perfect forwarding in lambda's closure isn't available in C++17 + // using tuple as workaround; see: https://stackoverflow.com/a/49902823 + [args = std::make_tuple(std::forward(args)...)] () mutable { + return std::apply([] (auto&&... args) { + return seastar::do_with(std::forward(args)...); + }, std::move(args)); + }); + } + // number of cores that are PREVENTED from being scheduled // to run alien store threads. static constexpr int N_CORES_FOR_SEASTAR = 3; @@ -129,7 +141,7 @@ private: uint64_t used_bytes = 0; std::unique_ptr store; std::unique_ptr cct; - seastar::gate transaction_gate; + mutable seastar::gate op_gate; std::unordered_map coll_map; std::vector _parse_cpu_cores(); };