From 38ac89621184ab6dd3d9e04ca6edc3e6bad402f7 Mon Sep 17 00:00:00 2001 From: Adam Kupczyk Date: Thu, 14 May 2020 22:43:56 +0200 Subject: [PATCH] kv/RocksDBStore: Added resharding control Added possibility to control batch size and iterator refresh time for resharding process. Replaced getenv() with new control for resharding unittests. Signed-off-by: Adam Kupczyk --- doc/man/8/ceph-bluestore-tool.rst | 19 ++++++++++++++ src/kv/RocksDBStore.cc | 21 +++++++-------- src/kv/RocksDBStore.h | 11 +++++++- src/os/bluestore/bluestore_tool.cc | 33 +++++++++++++++++++++++- src/test/objectstore/test_kv.cc | 41 +++++++++++++++--------------- 5 files changed, 91 insertions(+), 34 deletions(-) diff --git a/doc/man/8/ceph-bluestore-tool.rst b/doc/man/8/ceph-bluestore-tool.rst index 2bc951ebd13..b18f7bc51a9 100644 --- a/doc/man/8/ceph-bluestore-tool.rst +++ b/doc/man/8/ceph-bluestore-tool.rst @@ -23,6 +23,7 @@ Synopsis | **ceph-bluestore-tool** bluefs-bdev-new-db --path *osd path* --dev-target *new-device* | **ceph-bluestore-tool** bluefs-bdev-migrate --path *osd path* --dev-target *new-device* --devs-source *device1* [--devs-source *device2*] | **ceph-bluestore-tool** free-dump|free-score --path *osd path* [ --allocator block/bluefs-wal/bluefs-db/bluefs-slow ] +| **ceph-bluestore-tool** reshard --path *osd path* --sharding *new sharding* [ --resharding-ctrl *control string* ] Description @@ -97,6 +98,17 @@ Commands Give a [0-1] number that represents quality of fragmentation in allocator. 0 represents case when all free space is in one chunk. 1 represents worst possible fragmentation. +:command:`reshard` --path *osd path* --sharding *new sharding* [ --resharding-ctrl *control string* ] + + Changes sharding of BlueStore's RocksDB. Sharding is built on top of RocksDB column families. + This option allows testing the performance of *new sharding* without the need to redeploy the OSD.
+ Resharding is usually a long process, which involves walking through the entire RocksDB key space + and moving some keys to different column families. + Option --resharding-ctrl provides performance control over resharding process. + Interrupted resharding will prevent OSD from running. + Interrupted resharding does not corrupt data. It is always possible to continue previous resharding, + or select any other sharding scheme, including reverting to original one. + Options ======= @@ -137,6 +149,13 @@ Options Useful for *free-dump* and *free-score* actions. Selects allocator(s). +.. option:: --resharding-ctrl *control string* + + Provides control over resharding process. Specifies how often to refresh the RocksDB iterator, + and how large the commit batch should be before committing to RocksDB. Option format is: + <bytes_per_iterator>/<keys_per_iterator>/<bytes_per_batch>/<keys_per_batch> + Default: 10000000/10000/1000000/1000 + Device labels ============= diff --git a/src/kv/RocksDBStore.cc b/src/kv/RocksDBStore.cc index 6279034159f..92b4ed057c1 100644 --- a/src/kv/RocksDBStore.cc +++ b/src/kv/RocksDBStore.cc @@ -2919,13 +2919,14 @@ int RocksDBStore::reshard_cleanup(const std::vector& current_column return 0; } -int RocksDBStore::reshard(const std::string& new_sharding) +int RocksDBStore::reshard(const std::string& new_sharding, const RocksDBStore::resharding_ctrl* ctrl_in) { rocksdb::Status status; int r; std::vector to_process_columns; std::vector to_process_handles; + resharding_ctrl ctrl = ctrl_in ? *ctrl_in : resharding_ctrl(); size_t bytes_in_batch = 0; size_t keys_in_batch = 0; size_t bytes_per_iterator = 0; @@ -2935,10 +2936,6 @@ int RocksDBStore::reshard(const std::string& new_sharding) rocksdb::WriteBatch* bat = nullptr; - //check for injected unittest commands - const char* unittest_str = getenv("RocksDBStore::reshard::unittest"); - size_t unittest_command = unittest_str ?
atoi(unittest_str) : 0; - auto flush_batch = [&]() { dout(10) << "flushing batch, " << keys_in_batch << " keys, for " << bytes_in_batch << " bytes" << dendl; @@ -2968,8 +2965,8 @@ int RocksDBStore::reshard(const std::string& new_sharding) rocksdb::Slice raw_key = it->key(); dout(30) << "key=" << pretty_binary_string(raw_key.ToString()) << dendl; //check if need to refresh iterator - if (bytes_per_iterator >= 10000000 || - keys_per_iterator >= 10000) { + if (bytes_per_iterator >= ctrl.bytes_per_iterator || + keys_per_iterator >= ctrl.keys_per_iterator) { dout(8) << "refreshing iterator" << dendl; bytes_per_iterator = 0; keys_per_iterator = 0; @@ -3018,10 +3015,10 @@ int RocksDBStore::reshard(const std::string& new_sharding) keys_per_iterator++; //check if need to write batch - if (bytes_in_batch >= 1000000 || - keys_in_batch >= 1000) { + if (bytes_in_batch >= ctrl.bytes_per_batch || + keys_in_batch >= ctrl.keys_per_batch) { flush_batch(); - if (unittest_command & 1) { + if (ctrl.unittest_fail_after_first_batch) { r = -1000; goto out; } @@ -3056,7 +3053,7 @@ int RocksDBStore::reshard(const std::string& new_sharding) derr << "Error processing column " << to_process_columns[idx] << dendl; goto cleanup; } - if (unittest_command & 2) { + if (ctrl.unittest_fail_after_processing_column) { r = -1001; goto cleanup; } @@ -3068,7 +3065,7 @@ int RocksDBStore::reshard(const std::string& new_sharding) goto cleanup; } - if (unittest_command & 4) { + if (ctrl.unittest_fail_after_successful_processing) { r = -1002; goto cleanup; } diff --git a/src/kv/RocksDBStore.h b/src/kv/RocksDBStore.h index c6e1261325e..068dff673be 100644 --- a/src/kv/RocksDBStore.h +++ b/src/kv/RocksDBStore.h @@ -527,7 +527,16 @@ private: int reshard_cleanup(const std::vector& current_columns, const std::vector& current_handles); public: - int reshard(const std::string& new_sharding); + struct resharding_ctrl { + size_t bytes_per_iterator = 10000000; /// amount of data to process before refreshing iterator + 
size_t keys_per_iterator = 10000; + size_t bytes_per_batch = 1000000; /// amount of data before submitting batch + size_t keys_per_batch = 1000; + bool unittest_fail_after_first_batch = false; + bool unittest_fail_after_processing_column = false; + bool unittest_fail_after_successful_processing = false; + }; + int reshard(const std::string& new_sharding, const resharding_ctrl* ctrl = nullptr); }; #endif diff --git a/src/os/bluestore/bluestore_tool.cc b/src/os/bluestore/bluestore_tool.cc index 209618aa400..58a75bc0b49 100644 --- a/src/os/bluestore/bluestore_tool.cc +++ b/src/os/bluestore/bluestore_tool.cc @@ -229,6 +229,7 @@ int main(int argc, char **argv) vector allocs_name; string empty_sharding(1, '\0'); string new_sharding = empty_sharding; + string resharding_ctrl; int log_level = 30; bool fsck_deep = false; po::options_description po_options("Options"); @@ -246,6 +247,7 @@ int main(int argc, char **argv) ("value,v", po::value(&value), "label metadata value") ("allocator", po::value>(&allocs_name), "allocator to inspect: 'block'/'bluefs-wal'/'bluefs-db'/'bluefs-slow'") ("sharding", po::value(&new_sharding), "new sharding to apply") + ("resharding-ctrl", po::value(&resharding_ctrl), "gives control over resharding procedure details") ; po::options_description po_positional("Positional options"); po_positional.add_options() @@ -896,8 +898,37 @@ int main(int argc, char **argv) cout << std::string(out.c_str(), out.length()) << std::endl; bluestore.cold_close(); } else if (action == "reshard") { + auto get_ctrl = [&](size_t& val) { + if (!resharding_ctrl.empty()) { + size_t pos; + std::string token; + pos = resharding_ctrl.find('/'); + token = resharding_ctrl.substr(0, pos); + if (pos != std::string::npos) + resharding_ctrl.erase(0, pos + 1); + else + resharding_ctrl.erase(); + char* endptr; + val = strtoll(token.c_str(), &endptr, 0); + if (*endptr != '\0') { + cerr << "invalid --resharding-ctrl. 
'" << token << "' is not a number" << std::endl; + exit(EXIT_FAILURE); + } + } + }; BlueStore bluestore(cct.get(), path); KeyValueDB *db_ptr; + RocksDBStore::resharding_ctrl ctrl; + if (!resharding_ctrl.empty()) { + get_ctrl(ctrl.bytes_per_iterator); + get_ctrl(ctrl.keys_per_iterator); + get_ctrl(ctrl.bytes_per_batch); + get_ctrl(ctrl.keys_per_batch); + if (!resharding_ctrl.empty()) { + cerr << "extra chars in --resharding-ctrl" << std::endl; + exit(EXIT_FAILURE); + } + } int r = bluestore.open_db_environment(&db_ptr); if (r < 0) { cerr << "error preparing db environment: " << cpp_strerror(r) << std::endl; @@ -910,7 +941,7 @@ int main(int argc, char **argv) RocksDBStore* rocks_db = dynamic_cast(db_ptr); ceph_assert(db_ptr); ceph_assert(rocks_db); - r = rocks_db->reshard(new_sharding); + r = rocks_db->reshard(new_sharding, &ctrl); if (r < 0) { cerr << "error resharding: " << cpp_strerror(r) << std::endl; } else { diff --git a/src/test/objectstore/test_kv.cc b/src/test/objectstore/test_kv.cc index 9e88388d731..7ea1e38d4c9 100644 --- a/src/test/objectstore/test_kv.cc +++ b/src/test/objectstore/test_kv.cc @@ -1178,9 +1178,9 @@ TEST_F(RocksDBResharding, resume_interrupted_at_batch) { data_to_db(); check_db(); db->close(); - setenv("RocksDBStore::reshard::unittest", "1", 1); - ASSERT_EQ(db->reshard("Evade(4)"), -1000); - unsetenv("RocksDBStore::reshard::unittest"); + RocksDBStore::resharding_ctrl ctrl; + ctrl.unittest_fail_after_first_batch = true; + ASSERT_EQ(db->reshard("Evade(4)", &ctrl), -1000); ASSERT_NE(db->open(cout), 0); ASSERT_EQ(db->reshard("Evade(4)"), 0); ASSERT_EQ(db->open(cout), 0); @@ -1194,9 +1194,9 @@ TEST_F(RocksDBResharding, resume_interrupted_at_column) { data_to_db(); check_db(); db->close(); - setenv("RocksDBStore::reshard::unittest", "2", 1); - ASSERT_EQ(db->reshard("Evade(4)"), -1001); - unsetenv("RocksDBStore::reshard::unittest"); + RocksDBStore::resharding_ctrl ctrl; + ctrl.unittest_fail_after_processing_column = true; + 
ASSERT_EQ(db->reshard("Evade(4)", &ctrl), -1001); ASSERT_NE(db->open(cout), 0); ASSERT_EQ(db->reshard("Evade(4)"), 0); ASSERT_EQ(db->open(cout), 0); @@ -1209,11 +1209,10 @@ TEST_F(RocksDBResharding, resume_interrupted_before_commit) { generate_data(); data_to_db(); check_db(); - // ASSERT_EQ(check_db_expect_difference(), true); db->close(); - setenv("RocksDBStore::reshard::unittest", "4", 1); - ASSERT_EQ(db->reshard("Evade(4)"), -1002); - unsetenv("RocksDBStore::reshard::unittest"); + RocksDBStore::resharding_ctrl ctrl; + ctrl.unittest_fail_after_successful_processing = true; + ASSERT_EQ(db->reshard("Evade(4)", &ctrl), -1002); ASSERT_NE(db->open(cout), 0); ASSERT_EQ(db->reshard("Evade(4)"), 0); ASSERT_EQ(db->open(cout), 0); @@ -1227,9 +1226,9 @@ TEST_F(RocksDBResharding, prevent_incomplete_hash_change) { data_to_db(); check_db(); db->close(); - setenv("RocksDBStore::reshard::unittest", "4", 1); - ASSERT_EQ(db->reshard("Evade(4,0-8)"), -1002); - unsetenv("RocksDBStore::reshard::unittest"); + RocksDBStore::resharding_ctrl ctrl; + ctrl.unittest_fail_after_successful_processing = true; + ASSERT_EQ(db->reshard("Evade(4,0-8)", &ctrl), -1002); ASSERT_NE(db->open(cout), 0); ASSERT_EQ(db->reshard("Evade(4,0-8)"), 0); ASSERT_EQ(db->open(cout), 0); @@ -1243,16 +1242,18 @@ TEST_F(RocksDBResharding, change_reshard) { data_to_db(); check_db(); db->close(); - setenv("RocksDBStore::reshard::unittest", "1", 1); - ASSERT_EQ(db->reshard("C(5) D(3)"), -1000); + RocksDBStore::resharding_ctrl ctrl; + ctrl.unittest_fail_after_first_batch = true; + ASSERT_EQ(db->reshard("C(5) D(3)", &ctrl), -1000); ASSERT_NE(db->open(cout), 0); - setenv("RocksDBStore::reshard::unittest", "2", 1); - ASSERT_EQ(db->reshard("C(5) Evade(2)"), -1001); + ctrl.unittest_fail_after_first_batch = false; + ctrl.unittest_fail_after_processing_column = true; + ASSERT_EQ(db->reshard("C(5) Evade(2)", &ctrl), -1001); ASSERT_NE(db->open(cout), 0); - setenv("RocksDBStore::reshard::unittest", "4", 1); - 
ASSERT_EQ(db->reshard("Evade(2) D(3)"), -1002); + ctrl.unittest_fail_after_processing_column = false; + ctrl.unittest_fail_after_successful_processing = true; + ASSERT_EQ(db->reshard("Evade(2) D(3)", &ctrl), -1002); ASSERT_NE(db->open(cout), 0); - unsetenv("RocksDBStore::reshard::unittest"); ASSERT_EQ(db->reshard("Ad(1) Evade(5)"), 0); ASSERT_EQ(db->open(cout), 0); check_db(); -- 2.39.5