From: Adam Kupczyk Date: Mon, 27 Jun 2022 12:50:10 +0000 (+0000) Subject: test/objectstore: Add test for deferred writes X-Git-Tag: v16.2.11~414^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=eaa02600bb96af3f523666d3a70fd451e438416e;p=ceph.git test/objectstore: Add test for deferred writes Add test that recreates situation when BlueStore deferred writes can cause RocksDB files corruption. Signed-off-by: Adam Kupczyk (cherry picked from commit 42de47ae8468d000d5bca62bd537d7a1028bae42) Conflicts: src/test/objectstore/test_deferred.cc Non-critical code parts. --- diff --git a/src/test/objectstore/CMakeLists.txt b/src/test/objectstore/CMakeLists.txt index 340855657c4f..3294616b9b26 100644 --- a/src/test/objectstore/CMakeLists.txt +++ b/src/test/objectstore/CMakeLists.txt @@ -147,6 +147,13 @@ if(WITH_BLUESTORE) add_ceph_unittest(unittest_bdev) target_link_libraries(unittest_bdev os global) + # unittest_deferred + add_executable(unittest_deferred + test_deferred.cc + ) + add_ceph_unittest(unittest_deferred) + target_link_libraries(unittest_deferred os global) + endif(WITH_BLUESTORE) # unittest_transaction diff --git a/src/test/objectstore/run_test_deferred.sh b/src/test/objectstore/run_test_deferred.sh new file mode 100755 index 000000000000..1be4d91048e4 --- /dev/null +++ b/src/test/objectstore/run_test_deferred.sh @@ -0,0 +1,52 @@ +#!/bin/bash + + +if [[ ! (-x ./bin/unittest_deferred) || ! (-x ./bin/ceph-kvstore-tool) || ! (-x ./bin/ceph-bluestore-tool)]] +then + echo Test must be run from ceph build directory + echo with unittest_deferred, ceph-kvstore-tool and ceph-bluestore-tool compiled + exit 1 +fi + +# Create BlueStore, only main block device, 4K AU, forced deferred 4K, 64K AU for BlueFS + +# Create file zapchajdziura, that is 0xe000 in size. 
+# This adds to 0x0000 - 0x1000 of BlueStore superblock and 0x1000 - 0x2000 of BlueFS superblock, +# making 0x00000 - 0x10000 filled, nicely aligning for 64K BlueFS requirements + +# Prefill 10 objects Object-0 .. Object-9, each 64K. Sync to disk. +# Do transactions like: +# - fill Object-x+1 16 times at offsets 0x0000, 0x1000, ... 0xf000 with 8 bytes, triggering deferred writes +# - fill Object-x with 64K data +# Repeat for Object-0 to Object-8. + +# Right after getting the on_commit notification for all 9 transactions, immediately exit(0). +./bin/unittest_deferred --log-to-stderr=false + +# Now we should have a considerable amount of pending deferred writes. +# They refer to disk regions that do not belong to any object. + +# Perform compaction on RocksDB +# This initializes BlueFS, but does not replay deferred writes. +# It jiggles RocksDB files around. CURRENT and MANIFEST are recreated, with some .sst files too. +# The hope here is that newly created RocksDB files will occupy space that is free, +# but targeted by pending deferred writes. +./bin/ceph-kvstore-tool bluestore-kv bluestore.test_temp_dir/ compact --log-to-stderr=false + +# In this step we (hopefully) get RocksDB files overwritten +# We initialize BlueFS and RocksDB, there should be no problem here. +# Then we apply deferred writes. Now some of the RocksDB files might get corrupted. +# It is very likely that this will not cause any problems, since CURRENT and MANIFEST are only read at bootup. +./bin/ceph-bluestore-tool --path bluestore.test_temp_dir/ --command fsck --deep 1 --debug-bluestore=30/30 --debug-bdev=30/30 --log-file=log-bs-corrupts.txt --log-to-file --log-to-stderr=false + +# If we were lucky, this command now fails +./bin/ceph-bluestore-tool --path bluestore.test_temp_dir/ --command fsck --deep 1 --debug-bluestore=30/30 --debug-bdev=30/30 --log-file=log-bs-crash.txt --log-to-file --log-to-stderr=false +if [[ $? != 0 ]] +then + echo "Deferred writes corruption successfully created !" 
+else + echo "No deferred write problems detected." +fi + +#cleanup +rm -rf bluestore.test_temp_dir/ diff --git a/src/test/objectstore/test_deferred.cc b/src/test/objectstore/test_deferred.cc new file mode 100644 index 000000000000..f2fd6054f1d2 --- /dev/null +++ b/src/test/objectstore/test_deferred.cc @@ -0,0 +1,147 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <atomic> +#include <functional> +#include <memory> +#include <string> +#include <vector> + +#include "os/ObjectStore.h" +#include "os/bluestore/BlueStore.h" +#include "include/Context.h" +#include "common/ceph_argparse.h" +#include "global/global_init.h" +#include "common/ceph_mutex.h" +#include "common/Cond.h" +#include "common/errno.h" +#include "common/options.h" // for the size literals +#include <semaphore.h> + + + +class C_do_action : public Context { +public: + std::function<void()> action; + C_do_action(std::function<void()> action) + : action(action) {} + + void finish(int r) override { + action(); + } +}; + +void create_deferred_and_terminate() { + std::unique_ptr<ObjectStore> store; + + g_ceph_context->_conf._clear_safe_to_start_threads(); + g_ceph_context->_conf.set_val_or_die("bluestore_prefer_deferred_size", "4096"); + g_ceph_context->_conf.set_val_or_die("bluestore_allocator", "bitmap"); + g_ceph_context->_conf.set_val_or_die("bluestore_block_size", "10240000000"); + g_ceph_context->_conf.apply_changes(nullptr); + + int64_t poolid; + coll_t cid; + ghobject_t hoid; + ObjectStore::CollectionHandle ch; + ceph_assert(::mkdir("bluestore.test_temp_dir", 0777) == 0); + store.reset(ObjectStore::create(g_ceph_context, + "bluestore", + "bluestore.test_temp_dir", + "store_test_temp_journal")); + ceph_assert(store->mkfs() == 0); + ceph_assert(store->mount() == 0); + + poolid = 11; + cid = coll_t(spg_t(pg_t(1, poolid), shard_id_t::NO_SHARD)); + ch = store->create_new_collection(cid); + int r; + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = store->queue_transaction(ch, std::move(t)); + ceph_assert(r == 0); 
+ } + + { + ObjectStore::Transaction t; + std::string oid = "zapchajdziura"; + ghobject_t hoid(hobject_t(oid, "", CEPH_NOSNAP, 1, poolid, "")); + bufferlist bl; + bl.append(std::string(0xe000, '-')); + t.write(cid, hoid, 0, 0xe000, bl); + r = store->queue_transaction(ch, std::move(t)); + ceph_assert(r == 0); + } + + size_t object_count = 10; + + // initial fill + bufferlist bl_64K; + bl_64K.append(std::string(64 * 1024, '-')); + + std::atomic<size_t> prefill_counter{0}; + sem_t prefill_mutex; + sem_init(&prefill_mutex, 0, 0); + + for (size_t o = 0; o < object_count; o++) { + ObjectStore::Transaction t; + std::string oid = "object-" + std::to_string(o); + ghobject_t hoid(hobject_t(oid, "", CEPH_NOSNAP, 1, poolid, "")); + + t.write(cid, hoid, 0, bl_64K.length(), bl_64K); + t.register_on_commit(new C_do_action([&] { + if (++prefill_counter == object_count) { + sem_post(&prefill_mutex); + } + })); + + r = store->queue_transaction(ch, std::move(t)); + ceph_assert(r == 0); + } + sem_wait(&prefill_mutex); + + // small deferred writes over object + // and complete overwrite of previous one + bufferlist bl_8_bytes; + bl_8_bytes.append("abcdefgh"); + std::atomic<size_t> deferred_counter{0}; + for (size_t o = 0; o < object_count - 1; o++) { + ObjectStore::Transaction t; + + // sprinkle deferred writes + std::string oid_d = "object-" + std::to_string(o + 1); + ghobject_t hoid_d(hobject_t(oid_d, "", CEPH_NOSNAP, 1, poolid, "")); + + for(int i = 0; i < 16; i++) { + t.write(cid, hoid_d, 4096 * i, bl_8_bytes.length(), bl_8_bytes); + } + + // overwrite previous object + std::string oid_m = "object-" + std::to_string(o); + ghobject_t hoid_m(hobject_t(oid_m, "", CEPH_NOSNAP, 1, poolid, "")); + t.write(cid, hoid_m, 0, bl_64K.length(), bl_64K); + + t.register_on_commit(new C_do_action([&] { + if (++deferred_counter == object_count - 1) { + exit(0); + } + })); + r = store->queue_transaction(ch, std::move(t)); + ceph_assert(r == 0); + } + sleep(10); + ceph_assert(0 && "should not reach here"); +} + +int 
main(int argc, const char **argv) { + std::vector<const char*> args; + argv_to_vec(argc, argv, args); + auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, + CODE_ENVIRONMENT_UTILITY, + CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); + common_init_finish(g_ceph_context); + + create_deferred_and_terminate(); + return 0; +}