From: Radosław Zarzyński
Date: Sat, 11 Jun 2022 19:29:29 +0000 (+0200)
Subject: tools: ceph-objectstore-tool is able to trim pg log dups' entries.
X-Git-Tag: v17.2.1~1
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=eb0eac1a195f1d8e9e3c472c7b1ca1e9add581c2;p=ceph.git

tools: ceph-objectstore-tool is able to trim pg log dups' entries.

The main assumption is that trimming only the dups does not require any
update to the corresponding pg_info_t.

Testing:

1. Cluster without the autoscaler
```
rzarz@ubulap:~/dev/ceph/build$ MON=1 MGR=1 OSD=3 MGR=1 MDS=0 ../src/vstart.sh -l -b -n -o "osd_pg_log_dups_tracked=3000000" -o "osd_pool_default_pg_autoscale_mode=off"
```

2. 8 PGs in the testing pool.
```
rzarz@ubulap:~/dev/ceph/build$ bin/ceph osd pool create test-pool 8 8
```

3. Provisioning dups with rados bench
```
bin/rados bench -p test-pool 300 write -b 4096 --no-cleanup
...
Total time run: 300.034
Total writes made: 103413
Write size: 4096
Object size: 4096
Bandwidth (MB/sec): 1.34637
Stddev Bandwidth: 0.589071
Max bandwidth (MB/sec): 2.4375
Min bandwidth (MB/sec): 0.902344
Average IOPS: 344
Stddev IOPS: 150.802
Max IOPS: 624
Min IOPS: 231
Average Latency(s): 0.0464151
Stddev Latency(s): 0.0183627
Max latency(s): 0.0928424
Min latency(s): 0.0131932
```

4. Killing osd.0
```
rzarz@ubulap:~/dev/ceph/build$ kill 2572129 # pid of osd.0
```

5. Listing PGs on osd.0 and calculating the number of pg log's entries and dups:
```
rzarz@ubulap:~/dev/ceph/build$ bin/ceph-objectstore-tool --data-path dev/osd0 --op list-pgs --pgid 2.c > osd0_pgs.txt
rzarz@ubulap:~/dev/ceph/build$ for pgid in `cat osd0_pgs.txt`; do echo $pgid; bin/ceph-objectstore-tool --data-path dev/osd0 --op log --pgid $pgid | jq '(.pg_log_t.log|length),(.pg_log_t.dups|length)'; done
2.7
10020
3100
2.6
10100
3000
2.3
10012
2800
2.1
10049
2900
2.2
10057
2700
2.0
10027
2900
2.5
10077
2700
2.4
10072
2900
1.0
97
0
```

6. 
Trimming dups
```
rzarz@ubulap:~/dev/ceph/build$ CEPH_ARGS="--osd_pg_log_dups_tracked 2500 --osd_pg_log_trim_max=100" bin/ceph-objectstore-tool --data-path dev/osd0 --op trim-pg-log-dups --pgid 2.7
max_dup_entries=2500 max_chunk_size=100
Removing keys dup_0000000020.00000000000000000001 - dup_0000000020.00000000000000000100
Removing keys dup_0000000020.00000000000000000101 - dup_0000000020.00000000000000000200
Removing keys dup_0000000020.00000000000000000201 - dup_0000000020.00000000000000000300
Removing keys dup_0000000020.00000000000000000301 - dup_0000000020.00000000000000000400
Removing keys dup_0000000020.00000000000000000401 - dup_0000000020.00000000000000000500
Removing keys dup_0000000020.00000000000000000501 - dup_0000000020.00000000000000000600
Finished trimming, now compacting...
Finished trimming pg log dups
```

7. Checking number of pg log's entries and dups
```
rzarz@ubulap:~/dev/ceph/build$ for pgid in `cat osd0_pgs.txt`; do echo $pgid; bin/ceph-objectstore-tool --data-path dev/osd0 --op log --pgid $pgid | jq '(.pg_log_t.log|length),(.pg_log_t.dups|length)'; done
2.7
10020
2500
2.6
10100
3000
2.3
10012
2800
2.1
10049
2900
2.2
10057
2700
2.0
10027
2900
2.5
10077
2700
2.4
10072
2900
1.0
97
0
```

Fixes: https://tracker.ceph.com/issues/53729
Signed-off-by: Radosław Zarzyński
(cherry picked from commit a2190f901abf2fed20c65e59f53b38c10545cb5a)
(cherry picked from commit 3d3193fc6d71e178af0a288e010c308d61767562)
---

diff --git a/src/tools/ceph_objectstore_tool.cc b/src/tools/ceph_objectstore_tool.cc
index 8690bd96a1887..f4ad1a54de9a0 100644
--- a/src/tools/ceph_objectstore_tool.cc
+++ b/src/tools/ceph_objectstore_tool.cc
@@ -736,6 +736,82 @@ int do_trim_pg_log(ObjectStore *store, const coll_t &coll,
   return 0;
 }
 
+// Trim the "dup_"-prefixed omap keys stored on the PG's pgmeta object so that
+// at most osd_pg_log_dups_tracked of them remain.
+//
+// The omap keys are scanned from the first one; every "dup_" key is collected,
+// and whenever more than the allowed number has been collected, the smallest
+// collected key is scheduled for removal. Removals are issued in chunks of at
+// most osd_pg_log_trim_max keys, one transaction per chunk. Unless dry_run is
+// set, the store is compacted at the end.
+//
+// store  - object store that holds the PG
+// coll   - the PG's collection; opened here to obtain the collection handle
+// pgid   - used only to derive the pgmeta object id
+// info, map_epoch, past_intervals - not referenced anywhere in the body
+//
+// Always returns 0. Failed preconditions trip ceph_assert rather than being
+// reported through the return value, and the result of queue_transaction()
+// is not checked.
+// NOTE(review): dry_run is not declared in this function; presumably a
+// file-level flag - confirm.
+int do_trim_pg_log_dups(ObjectStore *store, const coll_t &coll,
+                   pg_info_t &info, const spg_t &pgid,
+                   epoch_t map_epoch,
+                   PastIntervals &past_intervals)
+{
+  ghobject_t oid = pgid.make_pgmeta_oid();
+  struct stat st;
+  auto ch = store->open_collection(coll);
+  int r = store->stat(ch, oid, &st);
+  // The pgmeta object must exist and have a size of zero.
+  ceph_assert(r == 0);
+  ceph_assert(st.st_size == 0);
+
+  // Both limits come from the configuration and must be non-zero.
+  const size_t max_dup_entries = g_ceph_context->_conf->osd_pg_log_dups_tracked;
+  ceph_assert(max_dup_entries > 0);
+  const size_t max_chunk_size = g_ceph_context->_conf->osd_pg_log_trim_max;
+  ceph_assert(max_chunk_size > 0);
+
+  cout << "max_dup_entries=" << max_dup_entries
+       << " max_chunk_size=" << max_chunk_size << std::endl;
+  // Nothing is removed in a dry run, so a pass that fills a whole chunk
+  // collects the same keys again on the next pass and the loop below keeps
+  // repeating; the message warns about exactly that.
+  if (dry_run) {
+    cout << "Dry run enabled, so when many chunks are needed,"
+         << " the trimming will never stop!" << std::endl;
+  }
+
+  // NOTE(review): this declaration and keys_to_trim below carry no template
+  // arguments; presumably set<string> whose "<string>" was lost when the page
+  // was extracted - confirm against the upstream file.
+  // keys_to_keep lives across passes of the do-loop; keys_to_trim is rebuilt
+  // on every pass.
+  set keys_to_keep;
+  size_t num_removed = 0;
+  do {
+    set keys_to_trim;
+    {
+    ObjectMap::ObjectMapIterator p = store->get_omap_iterator(ch, oid);
+    if (!p)
+      break;
+    for (p->seek_to_first(); p->valid(); p->next()) {
+      // Skip every key that is not a dup entry. None of the keys named in
+      // the checks below starts with "dup_", so the final prefix test alone
+      // would reject them as well.
+      if (p->key()[0] == '_')
+        continue;
+      if (p->key() == "can_rollback_to")
+        continue;
+      if (p->key() == "divergent_priors")
+        continue;
+      if (p->key() == "rollback_info_trimmed_to")
+        continue;
+      if (p->key() == "may_include_deletes_in_missing")
+        continue;
+      if (p->key().substr(0, 7) == string("missing"))
+        continue;
+      if (p->key().substr(0, 4) != string("dup_"))
+        continue;
+      keys_to_keep.insert(p->key());
+      // Over the limit: move the first (smallest) collected key from the
+      // keep set to the trim set.
+      if (keys_to_keep.size() > max_dup_entries) {
+        auto oldest_to_keep = keys_to_keep.begin();
+        keys_to_trim.emplace(*oldest_to_keep);
+        keys_to_keep.erase(oldest_to_keep);
+      }
+      // Stop scanning once a full chunk is ready; the next pass of the
+      // do-loop starts again from the first key.
+      if (keys_to_trim.size() >= max_chunk_size) {
+        break;
+      }
+    }
+    } // deconstruct ObjectMapIterator
+    // delete the keys
+    num_removed = keys_to_trim.size();
+    if (!dry_run && !keys_to_trim.empty()) {
+      cout << "Removing keys " << *keys_to_trim.begin() << " - " << *keys_to_trim.rbegin() << std::endl;
+      ObjectStore::Transaction t;
+      t.omap_rmkeys(coll, oid, keys_to_trim);
+      store->queue_transaction(ch, std::move(t));
+      ch->flush();
+    }
+    // A full chunk means the scan stopped early, so another pass is made; a
+    // shorter one means the iterator ran out of keys.
+  } while (num_removed == max_chunk_size);
+
+  // compact the db since we just removed a bunch of data
+  cerr << "Finished trimming, now compacting..." << std::endl;
+  if (!dry_run)
+    store->compact();
+  return 0;
+}
+
 const int OMAP_BATCH_SIZE = 25;
 void get_omap_batch(ObjectMap::ObjectMapIterator &iter, map &oset)
 {
@@ -3219,12 +3295,12 @@ int main(int argc, char **argv)
     ("journal-path", po::value(&jpath),
      "path to journal, use if tool can't find it")
     ("pgid", po::value(&pgidstr),
-     "PG id, mandatory for info, log, remove, export, export-remove, mark-complete, trim-pg-log, and mandatory for apply-layout-settings if --pool is not specified")
+     "PG id, mandatory for info, log, remove, export, export-remove, mark-complete, trim-pg-log, trim-pg-log-dups and mandatory for apply-layout-settings if --pool is not specified")
     ("pool", po::value(&pool),
      "Pool name, mandatory for apply-layout-settings if --pgid is not specified")
     ("op", po::value(&op),
      "Arg is one of [info, log, remove, mkfs, fsck, repair, fuse, dup, export, export-remove, import, list, list-slow-omap, fix-lost, list-pgs, dump-journal, dump-super, meta-list, "
-     "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, apply-layout-settings, update-mon-db, dump-export, trim-pg-log, statfs]")
+     "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, apply-layout-settings, update-mon-db, dump-export, trim-pg-log, trim-pg-log-dups statfs]")
     ("epoch", po::value(&epoch),
      "epoch# for get-osdmap and get-inc-osdmap, the current epoch in use if not specified")
     ("file", po::value(&file),
@@ -3793,7 +3869,8 @@ int main(int argc, char **argv)
   if ((op == "info" || op == "log" || op == "remove" || op == "export"
       || op == "export-remove" || op == "mark-complete"
       || op == "reset-last-complete"
-      || op == "trim-pg-log") &&
+      || op == "trim-pg-log"
+      || op == "trim-pg-log-dups") &&
       pgidstr.length() == 0) {
     cerr << "Must provide pgid" << std::endl;
     usage(desc);
@@ -4020,9 +4097,9 @@ int main(int argc, char **argv)
   // If not an object command nor any of the ops handled below, then output this usage
   //
before complaining about a bad pgid - if (!vm.count("objcmd") && op != "export" && op != "export-remove" && op != "info" && op != "log" && op != "mark-complete" && op != "trim-pg-log") { + if (!vm.count("objcmd") && op != "export" && op != "export-remove" && op != "info" && op != "log" && op != "mark-complete" && op != "trim-pg-log" && op != "trim-pg-log-dups") { cerr << "Must provide --op (info, log, remove, mkfs, fsck, repair, export, export-remove, import, list, fix-lost, list-pgs, dump-journal, dump-super, meta-list, " - "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, dump-export, trim-pg-log, statfs)" + "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, dump-export, trim-pg-log, trim-pg-log-dups statfs)" << std::endl; usage(desc); ret = 1; @@ -4375,6 +4452,15 @@ int main(int argc, char **argv) } cout << "Finished trimming pg log" << std::endl; goto out; + } else if (op == "trim-pg-log-dups") { + ret = do_trim_pg_log_dups(fs.get(), coll, info, pgid, + map_epoch, past_intervals); + if (ret < 0) { + cerr << "Error trimming pg log dups: " << cpp_strerror(ret) << std::endl; + goto out; + } + cout << "Finished trimming pg log dups" << std::endl; + goto out; } else if (op == "reset-last-complete") { if (!force) { std::cerr << "WARNING: reset-last-complete is extremely dangerous and almost "