]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
tools: ceph-objectstore-tool is able to trim pg log dups' entries. 46631/head
authorRadosław Zarzyński <rzarzyns@redhat.com>
Sat, 11 Jun 2022 19:29:29 +0000 (21:29 +0200)
committerRadosław Zarzyński <rzarzyns@redhat.com>
Sun, 12 Jun 2022 00:35:24 +0000 (02:35 +0200)
The main assumption is trimming just dups doesn't need any update
to the corresponding pg_info_t.

Testing:

1. cluster without the autoscaler
```
rzarz@ubulap:~/dev/ceph/build$ MON=1 MGR=1 OSD=3 MGR=1 MDS=0 ../src/vstart.sh -l -b -n -o "osd_pg_log_dups_tracked=3000000" -o "osd_pool_default_pg_autoscale_mode=off"
```

2. 8 PGs in the testing pool.
```
rzarz@ubulap:~/dev/ceph/build$ bin/ceph osd pool create test-pool 8 8
```

3. Provisioning dups with rados bench
```
bin/rados bench -p test-pool 300 write -b 4096  --no-cleanup
...
Total time run:         300.034
Total writes made:      103413
Write size:             4096
Object size:            4096
Bandwidth (MB/sec):     1.34637
Stddev Bandwidth:       0.589071
Max bandwidth (MB/sec): 2.4375
Min bandwidth (MB/sec): 0.902344
Average IOPS:           344
Stddev IOPS:            150.802
Max IOPS:               624
Min IOPS:               231
Average Latency(s):     0.0464151
Stddev Latency(s):      0.0183627
Max latency(s):         0.0928424
Min latency(s):         0.0131932
```

4. Killing osd.0
```
rzarz@ubulap:~/dev/ceph/build$ kill 2572129 # pid of osd.0
```

5. Listing PGs on osd.0 and calculating number of pg log's entries and
dups:

```
rzarz@ubulap:~/dev/ceph/build$ bin/ceph-objectstore-tool --data-path dev/osd0 --op list-pgs --pgid 2.c > osd0_pgs.txt
rzarz@ubulap:~/dev/ceph/build$ for pgid in `cat osd0_pgs.txt`; do echo $pgid; bin/ceph-objectstore-tool --data-path dev/osd0 --op log --pgid $pgid | jq '(.pg_log_t.log|length),(.pg_log_t.dups|length)'; done
2.7
10020
3100
2.6
10100
3000
2.3
10012
2800
2.1
10049
2900
2.2
10057
2700
2.0
10027
2900
2.5
10077
2700
2.4
10072
2900
1.0
97
0
```

6. Trimming dups
```
rzarz@ubulap:~/dev/ceph/build$ CEPH_ARGS="--osd_pg_log_dups_tracked 2500 --osd_pg_log_trim_max=100" bin/ceph-objectstore-tool --data-path dev/osd0 --op trim-pg-log-dups --pgid 2.7
max_dup_entries=2500 max_chunk_size=100
Removing keys dup_0000000020.00000000000000000001 - dup_0000000020.00000000000000000100
Removing keys dup_0000000020.00000000000000000101 - dup_0000000020.00000000000000000200
Removing keys dup_0000000020.00000000000000000201 - dup_0000000020.00000000000000000300
Removing keys dup_0000000020.00000000000000000301 - dup_0000000020.00000000000000000400
Removing keys dup_0000000020.00000000000000000401 - dup_0000000020.00000000000000000500
Removing keys dup_0000000020.00000000000000000501 - dup_0000000020.00000000000000000600
Finished trimming, now compacting...
Finished trimming pg log dups
```

7. Checking number of pg log's entries and dups
```
rzarz@ubulap:~/dev/ceph/build$ for pgid in `cat osd0_pgs.txt`; do echo $pgid; bin/ceph-objectstore-tool --data-path dev/osd0 --op log --pgid $pgid | jq '(.pg_log_t.log|length),(.pg_log_t.dups|length)'; done
2.7
10020
2500
2.6
10100
3000
2.3
10012
2800
2.1
10049
2900
2.2
10057
2700
2.0
10027
2900
2.5
10077
2700
2.4
10072
2900
1.0
97
0
```

Conflicts:
        src/tools/ceph_objectstore_tool.cc -- undetected conflict
        with d5445b8f113797718a0dbb05e884a6bffbfed76a. Fixed by
        adopting the patch no not require the `unique_ptr<T>::get()`.

Fixes: https://tracker.ceph.com/issues/53729
Signed-off-by: Radosław Zarzyński <rzarzyns@redhat.com>
(cherry picked from commit a2190f901abf2fed20c65e59f53b38c10545cb5a)

src/tools/ceph_objectstore_tool.cc

index d9abe03af6edf452d689ced3ef636a5cc5fa38bc..03b27c494d6ca6d617624ecbfff492cb0226abe0 100644 (file)
@@ -728,6 +728,82 @@ int do_trim_pg_log(ObjectStore *store, const coll_t &coll,
   return 0;
 }
 
+int do_trim_pg_log_dups(ObjectStore *store, const coll_t &coll,
+                  pg_info_t &info, const spg_t &pgid,
+                  epoch_t map_epoch,
+                  PastIntervals &past_intervals)
+{
+  ghobject_t oid = pgid.make_pgmeta_oid();
+  struct stat st;
+  auto ch = store->open_collection(coll);
+  int r = store->stat(ch, oid, &st);
+  ceph_assert(r == 0);
+  ceph_assert(st.st_size == 0);
+
+  const size_t max_dup_entries = g_ceph_context->_conf->osd_pg_log_dups_tracked;
+  ceph_assert(max_dup_entries > 0);
+  const size_t max_chunk_size = g_ceph_context->_conf->osd_pg_log_trim_max;
+  ceph_assert(max_chunk_size > 0);
+
+  cout << "max_dup_entries=" << max_dup_entries
+       << " max_chunk_size=" << max_chunk_size << std::endl;
+  if (dry_run) {
+    cout << "Dry run enabled, so when many chunks are needed,"
+        << " the trimming will never stop!" << std::endl;
+  }
+
+  set<string> keys_to_keep;
+  size_t num_removed = 0;
+  do {
+    set<string> keys_to_trim;
+    {
+    ObjectMap::ObjectMapIterator p = store->get_omap_iterator(ch, oid);
+    if (!p)
+      break;
+    for (p->seek_to_first(); p->valid(); p->next()) {
+      if (p->key()[0] == '_')
+       continue;
+      if (p->key() == "can_rollback_to")
+       continue;
+      if (p->key() == "divergent_priors")
+       continue;
+      if (p->key() == "rollback_info_trimmed_to")
+       continue;
+      if (p->key() == "may_include_deletes_in_missing")
+       continue;
+      if (p->key().substr(0, 7) == string("missing"))
+       continue;
+      if (p->key().substr(0, 4) != string("dup_"))
+       continue;
+      keys_to_keep.insert(p->key());
+      if (keys_to_keep.size() > max_dup_entries) {
+       auto oldest_to_keep = keys_to_keep.begin();
+       keys_to_trim.emplace(*oldest_to_keep);
+       keys_to_keep.erase(oldest_to_keep);
+      }
+      if (keys_to_trim.size() >= max_chunk_size) {
+       break;
+      }
+    }
+    } // deconstruct ObjectMapIterator
+    // delete the keys
+    num_removed = keys_to_trim.size();
+    if (!dry_run && !keys_to_trim.empty()) {
+      cout << "Removing keys " << *keys_to_trim.begin() << " - " << *keys_to_trim.rbegin() << std::endl;
+      ObjectStore::Transaction t;
+      t.omap_rmkeys(coll, oid, keys_to_trim);
+      store->queue_transaction(ch, std::move(t));
+      ch->flush();
+    }
+  } while (num_removed == max_chunk_size);
+
+  // compact the db since we just removed a bunch of data
+  cerr << "Finished trimming, now compacting..." << std::endl;
+  if (!dry_run)
+    store->compact();
+  return 0;
+}
+
 const int OMAP_BATCH_SIZE = 25;
 void get_omap_batch(ObjectMap::ObjectMapIterator &iter, map<string, bufferlist> &oset)
 {
@@ -3211,12 +3287,12 @@ int main(int argc, char **argv)
     ("journal-path", po::value<string>(&jpath),
      "path to journal, use if tool can't find it")
     ("pgid", po::value<string>(&pgidstr),
-     "PG id, mandatory for info, log, remove, export, export-remove, mark-complete, trim-pg-log, and mandatory for apply-layout-settings if --pool is not specified")
+     "PG id, mandatory for info, log, remove, export, export-remove, mark-complete, trim-pg-log, trim-pg-log-dups and mandatory for apply-layout-settings if --pool is not specified")
     ("pool", po::value<string>(&pool),
      "Pool name, mandatory for apply-layout-settings if --pgid is not specified")
     ("op", po::value<string>(&op),
      "Arg is one of [info, log, remove, mkfs, fsck, repair, fuse, dup, export, export-remove, import, list, list-slow-omap, fix-lost, list-pgs, dump-journal, dump-super, meta-list, "
-     "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, apply-layout-settings, update-mon-db, dump-export, trim-pg-log, statfs]")
+     "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, apply-layout-settings, update-mon-db, dump-export, trim-pg-log, trim-pg-log-dups statfs]")
     ("epoch", po::value<unsigned>(&epoch),
      "epoch# for get-osdmap and get-inc-osdmap, the current epoch in use if not specified")
     ("file", po::value<string>(&file),
@@ -3782,7 +3858,8 @@ int main(int argc, char **argv)
   if ((op == "info" || op == "log" || op == "remove" || op == "export"
       || op == "export-remove" || op == "mark-complete"
       || op == "reset-last-complete"
-      || op == "trim-pg-log") &&
+      || op == "trim-pg-log"
+      || op == "trim-pg-log-dups") &&
       pgidstr.length() == 0) {
     cerr << "Must provide pgid" << std::endl;
     usage(desc);
@@ -4009,9 +4086,9 @@ int main(int argc, char **argv)
 
   // If not an object command nor any of the ops handled below, then output this usage
   // before complaining about a bad pgid
-  if (!vm.count("objcmd") && op != "export" && op != "export-remove" && op != "info" && op != "log" && op != "mark-complete" && op != "trim-pg-log") {
+  if (!vm.count("objcmd") && op != "export" && op != "export-remove" && op != "info" && op != "log" && op != "mark-complete" && op != "trim-pg-log" && op != "trim-pg-log-dups") {
     cerr << "Must provide --op (info, log, remove, mkfs, fsck, repair, export, export-remove, import, list, fix-lost, list-pgs, dump-journal, dump-super, meta-list, "
-      "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, dump-export, trim-pg-log, statfs)"
+      "get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, dump-export, trim-pg-log, trim-pg-log-dups statfs)"
         << std::endl;
     usage(desc);
     ret = 1;
@@ -4364,6 +4441,15 @@ int main(int argc, char **argv)
       }
       cout << "Finished trimming pg log" << std::endl;
       goto out;
+    } else if (op == "trim-pg-log-dups") {
+      ret = do_trim_pg_log_dups(fs, coll, info, pgid,
+                          map_epoch, past_intervals);
+      if (ret < 0) {
+       cerr << "Error trimming pg log dups: " << cpp_strerror(ret) << std::endl;
+       goto out;
+      }
+      cout << "Finished trimming pg log dups" << std::endl;
+      goto out;
     } else if (op == "reset-last-complete") {
       if (!force) {
         std::cerr << "WARNING: reset-last-complete is extremely dangerous and almost "