]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore: make shared blob fsck much less RAM-greedy. 44614/head
authorIgor Fedotov <ifed@suse.com>
Tue, 26 Oct 2021 10:35:00 +0000 (13:35 +0300)
committerIgor Fedotov <ifedotov@suse.com>
Tue, 18 Jan 2022 11:01:10 +0000 (14:01 +0300)
Fixes: https://tracker.ceph.com/issues/44924
Signed-off-by: Igor Fedotov <igor.fedotov@croit.io>
(cherry picked from commit 8fddc8464ee4dbb0ee22a10de21e8c16f38bf1ed)
(cherry picked from commit a902d22b6c785099c704a229db1dc1e6fefee3e2)

 Conflicts:
src/common/options/global.yaml.in
src/os/bluestore/BlueStore.cc
        src/os/bluestore/BlueStore.h
src/os/bluestore/bluestore_types.h
src/test/objectstore/store_test.cc
caused by lack of ZNS stuff and using options.cc for config parameter
defitions rather than yaml file(s)

src/common/options.cc
src/os/bluestore/BlueStore.cc
src/os/bluestore/BlueStore.h
src/os/bluestore/bluestore_types.cc
src/os/bluestore/bluestore_types.h
src/test/objectstore/store_test.cc
src/test/objectstore/test_bluestore_types.cc

index 4fa13e1076b3e0f1050630e4446dc6a732fcfbac..d2cd2cafc6a33df63af422455b10da13789e18c4 100644 (file)
@@ -4538,14 +4538,20 @@ std::vector<Option> get_global_options() {
     .set_default(false)
     .set_description("Try to submit metadata transaction to rocksdb in queuing thread context"),
 
+    Option("bluestore_fsck_quick_fix_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
+      .set_default(2)
+      .set_description("Number of additional threads to perform quick-fix (shallow fsck) command"),
+
     Option("bluestore_fsck_read_bytes_cap", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
     .set_default(64_M)
     .set_flag(Option::FLAG_RUNTIME)
     .set_description("Maximum bytes read at once by deep fsck"),
 
-    Option("bluestore_fsck_quick_fix_threads", Option::TYPE_INT, Option::LEVEL_ADVANCED)
-      .set_default(2)
-      .set_description("Number of additional threads to perform quick-fix (shallow fsck) command"),
+    Option("bluestore_fsck_shared_blob_tracker_size", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+    .set_default(0.03125)
+    .set_flag(Option::FLAG_RUNTIME)
+    .set_description("Size(a fraction of osd_memory_target, defaults to 128MB) of a hash table to track shared blobs ref counts. Higher the size, more precise is the tracker -> less overhead during the repair.")
+    .add_see_also("osd_memory_target"),
 
     Option("bluestore_throttle_bytes", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
     .set_default(64_M)
index 12fed048f88e00525e61c2dc6b619168ae6ad023..be19d2a520811715cf14b475ef6c6afbba5c1625 100644 (file)
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
+#include <algorithm>
 
 #include <boost/container/flat_set.hpp>
-#include "boost/algorithm/string.hpp"
+#include <boost/algorithm/string.hpp>
 
 #include "include/cpp-btree/btree_set.h"
 
@@ -90,7 +91,7 @@ const string PREFIX_PERPOOL_OMAP = "m"; // s64 + u64 + keyname -> value
 const string PREFIX_DEFERRED = "L";    // id -> deferred_transaction_t
 const string PREFIX_ALLOC = "B";       // u64 offset -> u64 length (freelist)
 const string PREFIX_ALLOC_BITMAP = "b";// (see BitmapFreelistManager)
-const string PREFIX_SHARED_BLOB = "X"; // u64 offset -> shared_blob_t
+const string PREFIX_SHARED_BLOB = "X"; // u64 SB id -> shared_blob_t
 
 const string BLUESTORE_GLOBAL_STATFS_KEY = "bluestore_statfs";
 
@@ -7518,8 +7519,7 @@ int _fsck_sum_extents(
 }
 
 int BlueStore::_fsck_check_extents(
-  const coll_t& cid,
-  const ghobject_t& oid,
+  std::string_view ctx_descr,
   const PExtentVector& extents,
   bool compressed,
   mempool_dynamic_bitset &used_blocks,
@@ -7528,7 +7528,7 @@ int BlueStore::_fsck_check_extents(
   store_statfs_t& expected_statfs,
   FSCKDepth depth)
 {
-  dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
+  dout(30) << __func__ << " " << ctx_descr << ", extents " << extents << dendl;
   int errors = 0;
   for (auto e : extents) {
     if (!e.is_valid())
@@ -7548,7 +7548,7 @@ int BlueStore::_fsck_check_extents(
                pos * min_alloc_size, min_alloc_size, !already);
            }
             if (!already) {
-              derr << "fsck error: " << oid << " extent " << e
+              derr << __func__ << "::fsck error: " << ctx_descr << ", extent " << e
                   << " or a subset is already allocated (misreferenced)" << dendl;
              ++errors;
              already = true;
@@ -7557,12 +7557,9 @@ int BlueStore::_fsck_check_extents(
          else
            bs.set(pos);
         });
-        if (repairer) {
-         repairer->set_space_used(e.offset, e.length, cid, oid);
-        }
 
       if (e.end() > bdev->get_size()) {
-        derr << "fsck error:  " << oid << " extent " << e
+        derr << "fsck error:  " << ctx_descr << ", extent " << e
             << " past end of block device" << dendl;
         ++errors;
       }
@@ -7678,6 +7675,170 @@ void BlueStore::_fsck_check_pool_statfs(
   }
 }
 
+void BlueStore::_fsck_repair_shared_blobs(
+  BlueStoreRepairer& repairer,
+  shared_blob_2hash_tracker_t& sb_ref_counts,
+  sb_info_space_efficient_map_t& sb_info)
+{
+  auto sb_ref_mismatches = sb_ref_counts.count_non_zero();
+  dout(1) << __func__ << " repairing shared_blobs, ref mismatch estimate: "
+         << sb_ref_mismatches << dendl;
+  if (!sb_ref_mismatches) // not expected to succeed, just in case
+    return;
+
+
+  auto foreach_shared_blob = [&](std::function<
+    void (coll_t,
+          ghobject_t,
+          uint64_t,
+          const bluestore_blob_t&)> cb) {
+      auto it = db->get_iterator(PREFIX_OBJ);
+      if (it) {
+        CollectionRef c;
+        spg_t pgid;
+        for (it->lower_bound(string()); it->valid(); it->next()) {
+          dout(30) << __func__ << " key "
+                  << pretty_binary_string(it->key())
+                  << dendl;
+          if (is_extent_shard_key(it->key())) {
+           continue;
+          }
+
+          ghobject_t oid;
+          int r = get_key_object(it->key(), &oid);
+          if (r < 0) {
+           continue;
+          }
+
+          if (!c ||
+           oid.shard_id != pgid.shard ||
+           oid.hobj.get_logical_pool() != (int64_t)pgid.pool() ||
+           !c->contains(oid)) {
+           c = nullptr;
+           for (auto& p : coll_map) {
+             if (p.second->contains(oid)) {
+               c = p.second;
+               break;
+             }
+           }
+           if (!c) {
+             continue;
+           }
+          }
+          dout(20) << __func__
+                   << " inspecting shared blob refs for col:" << c->cid
+                  << " obj:" << oid
+                  << dendl;
+
+          OnodeRef o;
+          o.reset(Onode::decode(c, oid, it->key(), it->value()));
+          o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+
+          _dump_onode<30>(cct, *o);
+
+          mempool::bluestore_fsck::set<BlobRef> passed_sbs;
+          for (auto& e : o->extent_map.extent_map) {
+           auto& b = e.blob->get_blob();
+           if (b.is_shared() && passed_sbs.count(e.blob) == 0) {
+             auto sbid = e.blob->shared_blob->get_sbid();
+             cb(c->cid, oid, sbid, b);
+             passed_sbs.emplace(e.blob);
+           }
+          } // for ... extent_map
+        } // for ... it->valid
+      } //if (it(PREFIX_OBJ))
+    }; //foreach_shared_blob fn declaration
+
+  mempool::bluestore_fsck::map<uint64_t, bluestore_extent_ref_map_t> refs_map;
+
+  // first iteration over objects to identify all the broken sbids
+  foreach_shared_blob( [&](coll_t cid,
+                           ghobject_t oid,
+                           uint64_t sbid,
+                           const bluestore_blob_t& b) {
+    auto it = refs_map.lower_bound(sbid);
+    if(it != refs_map.end() && it->first == sbid) {
+      return;
+    }
+    for (auto& p : b.get_extents()) {
+      if (p.is_valid() &&
+         !sb_ref_counts.test_all_zero_range(sbid,
+                                            p.offset,
+                                            p.length)) {
+       refs_map.emplace_hint(it, sbid, bluestore_extent_ref_map_t());
+        dout(20) << __func__
+                 << " broken shared blob found for col:" << cid
+                << " obj:" << oid
+                << " sbid 0x " << std::hex << sbid << std::dec
+                << dendl;
+       break;
+      }
+    }
+  });
+
+  // second iteration over objects to build new ref map for the broken sbids
+  foreach_shared_blob( [&](coll_t cid,
+                           ghobject_t oid,
+                           uint64_t sbid,
+                           const bluestore_blob_t& b) {
+    auto it = refs_map.find(sbid);
+    if(it == refs_map.end()) {
+      return;
+    }
+    for (auto& p : b.get_extents()) {
+      if (p.is_valid()) {
+       it->second.get(p.offset, p.length);
+       break;
+      }
+    }
+  });
+
+  // update shared blob records
+  auto ref_it = refs_map.begin();
+  while (ref_it != refs_map.end()) {
+    size_t cnt = 0;
+    const size_t max_transactions = 4096;
+    KeyValueDB::Transaction txn = db->get_transaction();
+    for (cnt = 0;
+      cnt < max_transactions && ref_it != refs_map.end();
+      ref_it++) {
+      auto sbid = ref_it->first;
+      dout(20) << __func__ << " repaired shared_blob 0x"
+       << std::hex << sbid << std::dec
+       << ref_it->second << dendl;
+      repairer.fix_shared_blob(txn, sbid, &ref_it->second, 0);
+      cnt++;
+    }
+    if (cnt) {
+      db->submit_transaction_sync(txn);
+      cnt = 0;
+    }
+  }
+  // remove stray shared blob records
+  size_t cnt = 0;
+  const size_t max_transactions = 4096;
+  KeyValueDB::Transaction txn = db->get_transaction();
+  sb_info.foreach_stray([&](const sb_info_t& sbi) {
+    auto sbid = sbi.get_sbid();
+    dout(20) << __func__ << " removing stray shared_blob 0x"
+      << std::hex << sbid << std::dec
+      << dendl;
+    repairer.fix_shared_blob(txn, sbid, nullptr, 0);
+    cnt++;
+    if (cnt >= max_transactions) {}
+      db->submit_transaction_sync(txn);
+      txn = db->get_transaction();
+      cnt = 0;
+    });
+  if (cnt > 0) {
+    db->submit_transaction_sync(txn);
+  }
+
+  // amount of repairs to report to be equal to previously
+  // determined error estimation, not the actual number of updated shared blobs
+  repairer.inc_repaired(sb_ref_mismatches);
+}
+
 BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
   BlueStore::FSCKDepth depth,
   int64_t pool_id,
@@ -7698,6 +7859,7 @@ BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
   auto used_blocks = ctx.used_blocks;
   auto sb_info_lock = ctx.sb_info_lock;
   auto& sb_info = ctx.sb_info;
+  auto& sb_ref_counts = ctx.sb_ref_counts;
   auto repairer = ctx.repairer;
 
   store_statfs_t* res_statfs = (per_pool_stat_collection || repairer) ?
@@ -7809,14 +7971,20 @@ BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
       res_statfs->data_compressed_original +=
         i.first->get_referenced_bytes();
     }
+    if (depth != FSCK_SHALLOW && repairer) {
+      for (auto e : blob.get_extents()) {
+       if (!e.is_valid())
+         continue;
+       repairer->set_space_used(e.offset, e.length, c->cid, oid);
+      }
+    }
     if (blob.is_shared()) {
       if (i.first->shared_blob->get_sbid() > blobid_max) {
         derr << "fsck error: " << oid << " blob " << blob
           << " sbid " << i.first->shared_blob->get_sbid() << " > blobid_max "
           << blobid_max << dendl;
         ++errors;
-      }
-      else if (i.first->shared_blob->get_sbid() == 0) {
+      } else if (i.first->shared_blob->get_sbid() == 0) {
         derr << "fsck error: " << oid << " blob " << blob
           << " marked as shared but has uninitialized sbid"
           << dendl;
@@ -7826,18 +7994,22 @@ BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
       if (sb_info_lock) {
         sb_info_lock->lock();
       }
-      sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
-      ceph_assert(sbi.cid == coll_t() || sbi.cid == c->cid);
-      ceph_assert(sbi.pool_id == INT64_MIN ||
+      auto sbid = i.first->shared_blob->get_sbid();
+      sb_info_t& sbi = sb_info.add_or_adopt(i.first->shared_blob->get_sbid());
+      ceph_assert(sbi.pool_id == sb_info_t::INVALID_POOL_ID ||
         sbi.pool_id == oid.hobj.get_logical_pool());
-      sbi.cid = c->cid;
       sbi.pool_id = oid.hobj.get_logical_pool();
-      sbi.sb = i.first->shared_blob;
-      sbi.oids.push_back(oid);
-      sbi.compressed = blob.is_compressed();
+      bool compressed = blob.is_compressed();
       for (auto e : blob.get_extents()) {
         if (e.is_valid()) {
-          sbi.ref_map.get(e.offset, e.length);
+         if (compressed) {
+           ceph_assert(sbi.allocated_chunks <= 0);
+           sbi.allocated_chunks -= (e.length >> min_alloc_size_order);
+         } else {
+           ceph_assert(sbi.allocated_chunks >= 0);
+           sbi.allocated_chunks += (e.length >> min_alloc_size_order);
+         }
+         sb_ref_counts.inc_range(sbid, e.offset, e.length, 1);
         }
       }
       if (sb_info_lock) {
@@ -7845,11 +8017,13 @@ BlueStore::OnodeRef BlueStore::fsck_check_objects_shallow(
       }
     } else if (depth != FSCK_SHALLOW) {
       ceph_assert(used_blocks);
-      errors += _fsck_check_extents(c->cid, oid, blob.get_extents(),
+      string ctx_descr = " oid " + stringify(oid);
+      errors += _fsck_check_extents(ctx_descr,
+       blob.get_extents(),
         blob.is_compressed(),
         *used_blocks,
         fm->get_alloc_size(),
-        repairer,
+       repairer,
         *res_statfs,
         depth);
     } else {
@@ -7951,7 +8125,8 @@ public:
     BlueStore* store = nullptr;
 
     ceph::mutex* sb_info_lock = nullptr;
-    BlueStore::sb_info_map_t* sb_info = nullptr;
+    sb_info_space_efficient_map_t* sb_info = nullptr;
+    shared_blob_2hash_tracker_t* sb_ref_counts = nullptr;
     BlueStoreRepairer* repairer = nullptr;
 
     Batch* batches = nullptr;
@@ -7962,13 +8137,15 @@ public:
                   size_t _batchCount,
                   BlueStore* _store,
                   ceph::mutex* _sb_info_lock,
-                  BlueStore::sb_info_map_t& _sb_info,
+                  sb_info_space_efficient_map_t& _sb_info,
+                 shared_blob_2hash_tracker_t& _sb_ref_counts,
                   BlueStoreRepairer* _repairer) :
       WorkQueue_(n, time_t(), time_t()),
       batchCount(_batchCount),
       store(_store),
       sb_info_lock(_sb_info_lock),
       sb_info(&_sb_info),
+      sb_ref_counts(&_sb_ref_counts),
       repairer(_repairer)
     {
       batches = new Batch[batchCount];
@@ -8021,6 +8198,7 @@ public:
         nullptr, //used_omap_head
         sb_info_lock,
         *sb_info,
+       *sb_ref_counts,
         batch->expected_store_statfs,
         batch->expected_pool_statfs,
         repairer);
@@ -8268,6 +8446,7 @@ void BlueStore::_fsck_check_objects(FSCKDepth depth,
   auto& errors = ctx.errors;
   auto sb_info_lock = ctx.sb_info_lock;
   auto& sb_info = ctx.sb_info;
+  auto& sb_ref_counts = ctx.sb_ref_counts;
   auto repairer = ctx.repairer;
 
   uint64_t_btree_t used_nids;
@@ -8286,6 +8465,7 @@ void BlueStore::_fsck_check_objects(FSCKDepth depth,
         this,
         sb_info_lock,
         sb_info,
+       sb_ref_counts,
         repairer));
 
     ShallowFSCKThreadPool thread_pool(cct, "ShallowFSCKThreadPool", "ShallowFSCK", thread_count);
@@ -8629,12 +8809,18 @@ out_path:
 
 int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
 {
+  uint64_t sb_hash_size = uint64_t(
+    cct->_conf.get_val<Option::size_t>("osd_memory_target") *
+    cct->_conf.get_val<double>(
+      "bluestore_fsck_shared_blob_tracker_size"));
+
   dout(1) << __func__
          << " <<<START>>>"
          << (repair ? " repair" : " check")
          << (depth == FSCK_DEEP ? " (deep)" :
                 depth == FSCK_SHALLOW ? " (shallow)" : " (regular)")
-          << " start" << dendl;
+          << " start sb_tracker_hash_size:" << sb_hash_size
+          << dendl;
   int64_t errors = 0;
   int64_t warnings = 0;
   unsigned repaired = 0;
@@ -8647,7 +8833,11 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
   store_statfs_t expected_store_statfs, actual_statfs;
   per_pool_statfs expected_pool_statfs;
 
-  sb_info_map_t sb_info;
+  sb_info_space_efficient_map_t sb_info;
+  shared_blob_2hash_tracker_t sb_ref_counts(
+    sb_hash_size,
+    min_alloc_size);
+  size_t sb_ref_mismatches = 0;
 
   uint64_t num_objects = 0;
   uint64_t num_extents = 0;
@@ -8753,6 +8943,44 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
     dout(1) << __func__ << " debug abort" << dendl;
     goto out_scan;
   }
+
+  dout(1) << __func__ << " checking shared_blobs (phase 1)" << dendl;
+  it = db->get_iterator(PREFIX_SHARED_BLOB);
+  if (it) {
+    for (it->lower_bound(string()); it->valid(); it->next()) {
+      string key = it->key();
+      uint64_t sbid;
+      if (get_key_shared_blob(key, &sbid) < 0) {
+        // Failed to parse the key.
+       // This gonna to be handled at the second stage
+       continue;
+      }
+      bluestore_shared_blob_t shared_blob(sbid);
+      bufferlist bl = it->value();
+      auto blp = bl.cbegin();
+      try {
+       decode(shared_blob, blp);
+      }
+      catch (ceph::buffer::error& e) {
+       // this gonna to be handled at the second stage
+       continue;
+      }
+      dout(20) << __func__ << "  " << shared_blob << dendl;
+      auto& sbi = sb_info.add_maybe_stray(sbid);
+
+      // primarily to silent the 'unused' warning
+      ceph_assert(sbi.pool_id == sb_info_t::INVALID_POOL_ID);
+
+      for (auto& r : shared_blob.ref_map.ref_map) {
+       sb_ref_counts.inc_range(
+         sbid,
+         r.first,
+         r.second.length,
+         -r.second.refs);
+      }
+    }
+  } // if (it) //checking shared_blobs (phase1)
+
   // walk PREFIX_OBJ
   {
     dout(1) << __func__ << " walking object keyspace" << dendl;
@@ -8771,6 +8999,7 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
       // there is no multithreading in this case
       depth == FSCK_SHALLOW ? &sb_info_lock : nullptr,
       sb_info,
+      sb_ref_counts,
       expected_store_statfs,
       expected_pool_statfs,
       repair ? &repairer : nullptr);
@@ -8778,19 +9007,28 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
     _fsck_check_objects(depth, ctx);
   }
 
-  dout(1) << __func__ << " checking shared_blobs" << dendl;
+  sb_ref_mismatches = sb_ref_counts.count_non_zero();
+  if (sb_ref_mismatches != 0) {
+    derr << "fsck error: shared blob references aren't matching, at least "
+      << sb_ref_mismatches << " found" << dendl;
+    errors += sb_ref_mismatches;
+  }
+
+  if (depth != FSCK_SHALLOW && repair) {
+    _fsck_repair_shared_blobs(repairer, sb_ref_counts, sb_info);
+  }
+  dout(1) << __func__ << " checking shared_blobs (phase 2)" << dendl;
   it = db->get_iterator(PREFIX_SHARED_BLOB);
   if (it) {
     // FIXME minor: perhaps simplify for shallow mode?
     // fill global if not overriden below
     auto expected_statfs = &expected_store_statfs;
-
     for (it->lower_bound(string()); it->valid(); it->next()) {
       string key = it->key();
       uint64_t sbid;
       if (get_key_shared_blob(key, &sbid)) {
        derr << "fsck error: bad key '" << key
-            << "' in shared blob namespace" << dendl;
+         << "' in shared blob namespace" << dendl;
        if (repair) {
          repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
        }
@@ -8799,63 +9037,62 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
       }
       auto p = sb_info.find(sbid);
       if (p == sb_info.end()) {
-       derr << "fsck error: found stray shared blob data for sbid 0x"
-            << std::hex << sbid << std::dec << dendl;
-       if (repair) {
-         repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
+        if (sb_ref_mismatches > 0) {
+         // highly likely this has been already reported before, ignoring...
+         dout(5) << __func__ << " found duplicate(?) stray shared blob data for sbid 0x"
+           << std::hex << sbid << std::dec << dendl;
+       } else {
+         derr<< "fsck error: found stray shared blob data for sbid 0x"
+           << std::hex << sbid << std::dec << dendl;
+         ++errors;
+         if (repair) {
+           repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
+         }
        }
-       ++errors;
       } else {
        ++num_shared_blobs;
-       sb_info_t& sbi = p->second;
+       sb_info_t& sbi = *p;
        bluestore_shared_blob_t shared_blob(sbid);
        bufferlist bl = it->value();
        auto blp = bl.cbegin();
        try {
-         decode(shared_blob, blp);
-       } catch (buffer::error& e) {
-          ++errors;
-          // Force update and don't report as missing
-          sbi.updated = sbi.passed = true;
-
-          derr << "fsck error: failed to decode Shared Blob"
-              << pretty_binary_string(it->key()) << dendl;
-          if (repair) {
-           dout(20) << __func__ << " undecodable Shared Blob, key:'"
-                    << pretty_binary_string(it->key())
-                    << "', removing" << dendl;
-            repairer.remove_key(db, PREFIX_SHARED_BLOB, it->key());
-          }
-          continue;
-        }      
-       dout(20) << __func__ << "  " << *sbi.sb << " " << shared_blob << dendl;
-       if (shared_blob.ref_map != sbi.ref_map) {
-         derr << "fsck error: shared blob 0x" << std::hex << sbid
-               << std::dec << " ref_map " << shared_blob.ref_map
-               << " != expected " << sbi.ref_map << dendl;
-         sbi.updated = true; // will update later in repair mode only!
+         decode(shared_blob, blp);
+       }
+       catch (ceph::buffer::error& e) {
          ++errors;
+
+         derr << "fsck error: failed to decode Shared Blob"
+           << pretty_binary_string(key) << dendl;
+         if (repair) {
+           dout(20) << __func__ << " undecodable Shared Blob, key:'"
+             << pretty_binary_string(key)
+             << "', removing" << dendl;
+           repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
+         }
+         continue;
        }
+       dout(20) << __func__ << "  " << shared_blob << dendl;
        PExtentVector extents;
-       for (auto &r : shared_blob.ref_map.ref_map) {
+       for (autor : shared_blob.ref_map.ref_map) {
          extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
        }
-       if (per_pool_stat_collection || repair) {
+       if (sbi.pool_id != sb_info_t::INVALID_POOL_ID &&
+           (per_pool_stat_collection || repair)) {
          expected_statfs = &expected_pool_statfs[sbi.pool_id];
        }
-       errors += _fsck_check_extents(sbi.cid,
-                                     sbi.oids.front(),
-                                     extents,
-                                     sbi.compressed,
-                                     used_blocks,
-                                     fm->get_alloc_size(),
-                                     repair ? &repairer : nullptr,
-                                     *expected_statfs,
-                                      depth);
-       sbi.passed = true;
+       std::stringstream ss;
+       ss << "sbid 0x" << std::hex << sbid << std::dec;
+       errors += _fsck_check_extents(ss.str(),
+         extents,
+         sbi.allocated_chunks < 0,
+         used_blocks,
+         fm->get_alloc_size(),
+         repair ? &repairer : nullptr,
+         *expected_statfs,
+         depth);
       }
     }
-  } // if (it)
+  } // if (it) /* checking shared_blobs (phase 2)*/
 
   if (repair && repairer.preprocess_misreference(db)) {
 
@@ -9005,22 +9242,24 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
          if (b->get_blob().is_shared()) {
             b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);
 
-           auto sb_it = sb_info.find(b->shared_blob->get_sbid());
+           auto sbid = b->shared_blob->get_sbid();
+           auto sb_it = sb_info.find(sbid);
            ceph_assert(sb_it != sb_info.end());
-           sb_info_t& sbi = sb_it->second;
-
-           for (auto& r : sbi.ref_map.ref_map) {
-             expected_statfs->allocated -= r.second.length;
-             if (sbi.compressed) {
-               // NB: it's crucial to use compressed flag from sb_info_t
-               // as we originally used that value while accumulating 
-               // expected_statfs
-               expected_statfs->data_compressed_allocated -= r.second.length;
-             }
+           sb_info_t& sbi = *sb_it;
+
+           if (sbi.allocated_chunks < 0) {
+             // NB: it's crucial to use compressed_allocated_chunks from sb_info_t
+             // as we originally used that value while accumulating
+             // expected_statfs
+             expected_statfs->allocated -= uint64_t(-sbi.allocated_chunks) << min_alloc_size_order;
+             expected_statfs->data_compressed_allocated -=
+               uint64_t(-sbi.allocated_chunks) << min_alloc_size_order;
+           } else {
+             expected_statfs->allocated -= uint64_t(sbi.allocated_chunks) << min_alloc_size_order;
            }
-           sbi.updated = sbi.passed = true;
-           sbi.ref_map.clear();
-           
+           sbi.allocated_chunks = 0;
+           repairer.fix_shared_blob(txn, sbid, nullptr, 0);
+
            // relying on blob's pextents to decide what to release.
            for (auto& p : pext_to_release) {
              to_release.union_insert(p.offset, p.length);
@@ -9053,57 +9292,8 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
       to_release.clear();
     } // if (it) {
   } //if (repair && repairer.preprocess_misreference()) {
-
-  if (depth != FSCK_SHALLOW) {
-    for (auto &p : sb_info) {
-      sb_info_t& sbi = p.second;
-      if (!sbi.passed) {
-        derr << "fsck error: missing " << *sbi.sb << dendl;
-        ++errors;
-      }
-      if (repair && (!sbi.passed || sbi.updated)) {
-        auto sbid = p.first;
-        if (sbi.ref_map.empty()) {
-         ceph_assert(sbi.passed);
-         dout(20) << __func__ << " " << *sbi.sb
-                  << " is empty, removing" << dendl;
-         repairer.fix_shared_blob(db, sbid, nullptr);
-        } else {
-         bufferlist bl;
-         bluestore_shared_blob_t persistent(sbid, std::move(sbi.ref_map));
-         encode(persistent, bl);
-         dout(20) << __func__ << " " << *sbi.sb
-                  << " is " << bl.length() << " bytes, updating"
-                  << dendl;
-
-         repairer.fix_shared_blob(db, sbid, &bl);
-         // we need to account for shared blob pextents at both
-         // stats and used blocks to avoid related errors.
-         PExtentVector extents;
-         for (auto& r : persistent.ref_map.ref_map) {
-           extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
-         }
-         auto* expected_statfs = &expected_pool_statfs[sbi.pool_id];
-         int errors = _fsck_check_extents(sbi.cid,
-           ghobject_t(), // doesn't matter
-           extents,
-           sbi.compressed,
-           used_blocks,
-           fm->get_alloc_size(),
-           nullptr,
-           *expected_statfs,
-           depth);
-         if (errors) {
-           derr << __func__ << " " << errors
-                << "  unexpected error(s) after missed shared blob repair,"
-                << " perhaps worth one more repair attempt"
-                << dendl;
-         }
-        }
-      }
-    }
-  }
   sb_info.clear();
+  sb_ref_counts.reset();
 
   // check global stats only if fscking (not repairing) w/o per-pool stats
   if (!per_pool_stat_collection &&
@@ -9353,6 +9543,27 @@ void BlueStore::inject_no_shared_blob_key()
   db->submit_transaction_sync(txn);
 };
 
+void BlueStore::inject_stray_shared_blob_key(uint64_t sbid)
+{
+  KeyValueDB::Transaction txn;
+  txn = db->get_transaction();
+
+  dout(5) << __func__ << " " << sbid << dendl;
+
+  string key;
+  get_shared_blob_key(sbid, &key);
+  bluestore_shared_blob_t persistent(sbid);
+  persistent.ref_map.get(0xdead0000, 0x1000);
+  bufferlist bl;
+  encode(persistent, bl);
+  dout(20) << __func__ << " sbid " << sbid
+    << " takes " << bl.length() << " bytes, updating"
+    << dendl;
+
+  txn->set(PREFIX_SHARED_BLOB, key, bl);
+  db->submit_transaction_sync(txn);
+};
+
 
 void BlueStore::inject_leaked(uint64_t len)
 {
@@ -16213,29 +16424,22 @@ void BlueStoreRepairer::fix_per_pool_omap(KeyValueDB *db)
 }
 
 bool BlueStoreRepairer::fix_shared_blob(
-  KeyValueDB *db,
+  KeyValueDB::Transaction txn,
   uint64_t sbid,
-  const bufferlist* bl)
+  bluestore_extent_ref_map_t* ref_map,
+  size_t repaired)
 {
-  std::lock_guard l(lock); // possibly redundant
-  KeyValueDB::Transaction txn;
-  if (fix_misreferences_txn) { // reuse this txn
-    txn = fix_misreferences_txn;
-  } else {
-    if (!fix_shared_blob_txn) {
-      fix_shared_blob_txn = db->get_transaction();
-    }
-    txn = fix_shared_blob_txn;
-  }
   string key;
   get_shared_blob_key(sbid, &key);
-
-  ++to_repair_cnt;
-  if (bl) {
-    txn->set(PREFIX_SHARED_BLOB, key, *bl);
+  if (ref_map) {
+    bluestore_shared_blob_t persistent(sbid, std::move(*ref_map));
+    bufferlist bl;
+    encode(persistent, bl);
+    txn->set(PREFIX_SHARED_BLOB, key, bl);
   } else {
     txn->rmkey(PREFIX_SHARED_BLOB, key);
   }
+  to_repair_cnt += repaired;
   return true;
 }
 
index 850dfdbdc0298b8a84920456f0dda904c18d22e7..a8b3614f1230c50cc240549c1532694a48e9f1e4 100644 (file)
@@ -2403,8 +2403,7 @@ public:
 
 private:
   int _fsck_check_extents(
-    const coll_t& cid,
-    const ghobject_t& oid,
+    std::string_view ctx_descr,
     const PExtentVector& extents,
     bool compressed,
     mempool_dynamic_bitset &used_blocks,
@@ -2418,6 +2417,10 @@ private:
     int64_t& errors,
     int64_t &warnings,
     BlueStoreRepairer* repairer);
+  void _fsck_repair_shared_blobs(
+    BlueStoreRepairer& repairer,
+    shared_blob_2hash_tracker_t& sb_ref_counts,
+    sb_info_space_efficient_map_t& sb_info);
 
   int _fsck(FSCKDepth depth, bool repair);
   int _fsck_on_open(BlueStore::FSCKDepth depth, bool repair);
@@ -2883,6 +2886,7 @@ public:
   void inject_broken_shared_blob_key(const string& key,
                         const bufferlist& bl);
   void inject_no_shared_blob_key();
+  void inject_stray_shared_blob_key(uint64_t sbid);
 
   void inject_leaked(uint64_t len);
   void inject_false_free(coll_t cid, ghobject_t oid);
@@ -3282,21 +3286,10 @@ private:
   inline bool _use_rotational_settings();
 
 public:
-  struct sb_info_t {
-    coll_t cid;
-    int64_t pool_id = INT64_MIN;
-    list<ghobject_t> oids;
-    BlueStore::SharedBlobRef sb;
-    bluestore_extent_ref_map_t ref_map;
-    bool compressed = false;
-    bool passed = false;
-    bool updated = false;
-  };
   typedef btree::btree_set<
     uint64_t, std::less<uint64_t>,
     mempool::bluestore_fsck::pool_allocator<uint64_t>> uint64_t_btree_t;
 
-  typedef mempool::bluestore_fsck::map<uint64_t, sb_info_t> sb_info_map_t;
   struct FSCK_ObjectCtx {
     int64_t& errors;
     int64_t& warnings;
@@ -3310,7 +3303,9 @@ public:
     uint64_t_btree_t* used_omap_head;
 
     ceph::mutex* sb_info_lock;
-    sb_info_map_t& sb_info;
+    sb_info_space_efficient_map_t& sb_info;
+    // approximate amount of references per <shared blob, chunk>
+    shared_blob_2hash_tracker_t& sb_ref_counts;
 
     store_statfs_t& expected_store_statfs;
     per_pool_statfs& expected_pool_statfs;
@@ -3325,8 +3320,10 @@ public:
                    uint64_t& _num_spanning_blobs,
                    mempool_dynamic_bitset* _ub,
                    uint64_t_btree_t* _used_omap_head,
+
                    ceph::mutex* _sb_info_lock,
-                   sb_info_map_t& _sb_info,
+                   sb_info_space_efficient_map_t& _sb_info,
+                  shared_blob_2hash_tracker_t& _sb_ref_counts,
                    store_statfs_t& _store_statfs,
                    per_pool_statfs& _pool_statfs,
                    BlueStoreRepairer* _repairer) :
@@ -3341,6 +3338,7 @@ public:
       used_omap_head(_used_omap_head),
       sb_info_lock(_sb_info_lock),
       sb_info(_sb_info),
+      sb_ref_counts(_sb_ref_counts),
       expected_store_statfs(_store_statfs),
       expected_pool_statfs(_pool_statfs),
       repairer(_repairer) {
@@ -3528,9 +3526,13 @@ public:
 public:
   void fix_per_pool_omap(KeyValueDB *db);
   bool remove_key(KeyValueDB *db, const string& prefix, const string& key);
-  bool fix_shared_blob(KeyValueDB *db,
-                        uint64_t sbid,
-                      const bufferlist* bl);
+  //bool fix_shared_blob(KeyValueDB *db,
+  //                  uint64_t sbid,
+  //                  const ceph::buffer::list* bl);
+  bool fix_shared_blob(KeyValueDB::Transaction txn,
+                       uint64_t sbid,
+                       bluestore_extent_ref_map_t* ref_map,
+                       size_t repaired = 1);
   bool fix_statfs(KeyValueDB *db, const string& key,
     const store_statfs_t& new_statfs);
 
@@ -3558,8 +3560,8 @@ public:
   }
   //////////////////////
   //In fact two methods below are the only ones in this class which are thread-safe!!
-  void inc_repaired() {
-    ++to_repair_cnt;
+  void inc_repaired(size_t n = 1) {
+    to_repair_cnt += n;
   }
   void request_compaction() {
     need_compact = true;
index d04773728c59be0cb9f5c9cef489f6d051de6d18..f85f842f6add77d8d28f26564cb8e13930cb6c6e 100644 (file)
@@ -1168,3 +1168,88 @@ void bluestore_compression_header_t::generate_test_instances(
   o.push_back(new bluestore_compression_header_t(1));
   o.back()->length = 1234;
 }
+
+// adds more salt to build a hash func input
+shared_blob_2hash_tracker_t::hash_input_t
+  shared_blob_2hash_tracker_t::build_hash_input(
+    uint64_t sbid,
+    uint64_t offset) const
+{
+  hash_input_t res = {
+    sbid,
+    offset >> au_void_bits,
+    ((sbid & 0xffffffff) << 32) + ~(uint32_t((offset >> au_void_bits) & 0xffffffff))
+  };
+  return res;
+}
+
+void shared_blob_2hash_tracker_t::inc(
+  uint64_t sbid,
+  uint64_t offset,
+  int n)
+{
+  auto hash_input = build_hash_input(sbid, offset);
+  ref_counter_2hash_tracker_t::inc(
+    (char*)hash_input.data(),
+    get_hash_input_size(),
+    n);
+}
+
+void shared_blob_2hash_tracker_t::inc_range(
+  uint64_t sbid,
+  uint64_t offset,
+  uint32_t len,
+  int n)
+{
+  uint32_t alloc_unit = 1 << au_void_bits;
+  int64_t l = len;
+  while (l > 0) {
+    // don't care about ofset alignment as inc() trims it anyway
+    inc(sbid, offset, n);
+    offset += alloc_unit;
+    l -= alloc_unit;
+  }
+}
+
+bool shared_blob_2hash_tracker_t::test_hash_conflict(
+  uint64_t sbid1,
+  uint64_t offset1,
+  uint64_t sbid2,
+  uint64_t offset2) const
+{
+  auto hash_input1 = build_hash_input(sbid1, offset1);
+  auto hash_input2 = build_hash_input(sbid2, offset2);
+  return ref_counter_2hash_tracker_t::test_hash_conflict(
+    (char*)hash_input1.data(),
+    (char*)hash_input2.data(),
+    get_hash_input_size());
+}
+
+bool shared_blob_2hash_tracker_t::test_all_zero(
+  uint64_t sbid,
+  uint64_t offset) const
+{
+  auto hash_input = build_hash_input(sbid, offset);
+  return
+    ref_counter_2hash_tracker_t::test_all_zero(
+      (char*)hash_input.data(),
+      get_hash_input_size());
+}
+
+bool shared_blob_2hash_tracker_t::test_all_zero_range(
+  uint64_t sbid,
+  uint64_t offset,
+  uint32_t len) const
+{
+  uint32_t alloc_unit = 1 << au_void_bits;
+  int64_t l = len;
+  while (l > 0) {
+    // don't care about ofset alignment as inc() trims it anyway
+    if (!test_all_zero(sbid, offset)) {
+      return false;
+    }
+    offset += alloc_unit;
+    l -= alloc_unit;
+  }
+  return true;
+}
index 09f297e291f5c90f3a3857e747b962f60521d602..3a7db755668b014497762d7fbad24d8da391950e 100644 (file)
@@ -16,7 +16,6 @@
 #define CEPH_OSD_BLUESTORE_BLUESTORE_TYPES_H
 
 #include <ostream>
-#include <bitset>
 #include <type_traits>
 #include "include/types.h"
 #include "include/interval_set.h"
@@ -25,6 +24,7 @@
 #include "compressor/Compressor.h"
 #include "common/Checksummer.h"
 #include "include/mempool.h"
+#include "include/ceph_hash.h"
 
 namespace ceph {
   class Formatter;
@@ -237,7 +237,7 @@ static inline bool operator!=(const bluestore_extent_ref_map_t& l,
   return !(l == r);
 }
 
-/// blob_use_tracker: a set of per-alloc unit ref counters to track blob usage
+/// blob_use_tracker: a set of per-alloc unit ref buckets to track blob usage
 struct bluestore_blob_use_tracker_t {
   // N.B.: There is no need to minimize au_size/num_au
   //   as much as possible (e.g. have just a single byte for au_size) since:
@@ -1062,5 +1062,250 @@ struct bluestore_compression_header_t {
 };
 WRITE_CLASS_DENC(bluestore_compression_header_t)
 
+template <template <typename> typename V, class COUNTER_TYPE = int32_t>
+class ref_counter_2hash_tracker_t {
+  size_t num_non_zero = 0;
+  size_t num_buckets = 0;
+  V<COUNTER_TYPE> buckets1;
+  V<COUNTER_TYPE> buckets2;
+
+public:
+  ref_counter_2hash_tracker_t(uint64_t mem_cap) {
+    num_buckets = mem_cap / sizeof(COUNTER_TYPE) / 2;
+    ceph_assert(num_buckets);
+    buckets1.resize(num_buckets);
+    buckets2.resize(num_buckets);
+    reset();
+  }
+
+  size_t get_num_buckets() const {
+    return num_buckets;
+  }
+
+  void inc(const char* hash_val, size_t hash_val_len, int n) {
+    auto h = ceph_str_hash_rjenkins((const char*)hash_val, hash_val_len) %
+      num_buckets;
+    if (buckets1[h] == 0 && n) {
+      ++num_non_zero;
+    } else if (buckets1[h] == -n) {
+      --num_non_zero;
+    }
+    buckets1[h] += n;
+    h = ceph_str_hash_linux((const char*)hash_val, hash_val_len) % num_buckets;
+    if (buckets2[h] == 0 && n) {
+      ++num_non_zero;
+    } else if (buckets2[h] == -n) {
+      --num_non_zero;
+    }
+    buckets2[h] += n;
+  }
+
+  bool test_hash_conflict(
+    const char* hash_val1,
+    const char* hash_val2,
+    size_t hash_val_len) const {
+
+    auto h1 = ceph_str_hash_rjenkins((const char*)hash_val1, hash_val_len);
+    auto h2 = ceph_str_hash_rjenkins((const char*)hash_val2, hash_val_len);
+    auto h3 = ceph_str_hash_linux((const char*)hash_val1, hash_val_len);
+    auto h4 = ceph_str_hash_linux((const char*)hash_val2, hash_val_len);
+    return ((h1 % num_buckets) == (h2 % num_buckets)) &&
+      ((h3 % num_buckets) == (h4 % num_buckets));
+  }
+
+  bool test_all_zero(const char* hash_val, size_t hash_val_len) const {
+    auto h = ceph_str_hash_rjenkins((const char*)hash_val, hash_val_len);
+    if (buckets1[h % num_buckets] != 0) {
+      return false;
+    }
+    h = ceph_str_hash_linux((const char*)hash_val, hash_val_len);
+    return buckets2[h % num_buckets] == 0;
+  }
+
+  // returns number of mismatching buckets
+  size_t count_non_zero() const {
+    return num_non_zero;
+  }
+  void reset() {
+    for (size_t i = 0; i < num_buckets; i++) {
+      buckets1[i] = 0;
+      buckets2[i] = 0;
+    }
+    num_non_zero = 0;
+  }
+};
+
+class shared_blob_2hash_tracker_t
+  : public ref_counter_2hash_tracker_t<mempool::bluestore_fsck::vector> {
+
+  static const size_t hash_input_len = 3;
+
+  typedef std::array<uint64_t, hash_input_len> hash_input_t;
+
+  static size_t get_hash_input_size() {
+    return hash_input_len * sizeof(hash_input_t::value_type);
+  }
+
+  inline hash_input_t build_hash_input(uint64_t sbid, uint64_t offset) const;
+
+  size_t au_void_bits = 0;
+
+
+public:
+  shared_blob_2hash_tracker_t(uint64_t mem_cap, size_t alloc_unit)
+    : ref_counter_2hash_tracker_t(mem_cap) {
+    ceph_assert(alloc_unit);
+    ceph_assert(isp2(alloc_unit));
+    au_void_bits = ctz(alloc_unit);
+  }
+  void inc(uint64_t sbid, uint64_t offset, int n);
+  void inc_range(uint64_t sbid, uint64_t offset, uint32_t len, int n);
+
+  bool test_hash_conflict(
+    uint64_t sbid,
+    uint64_t offset,
+    uint64_t sbid2,
+    uint64_t offset2) const;
+  bool test_all_zero(
+    uint64_t sbid,
+    uint64_t offset) const;
+  bool test_all_zero_range(
+    uint64_t sbid,
+    uint64_t offset,
+    uint32_t len) const;
+};
+
+class sb_info_t {
+  // subzero value indicates (potentially) stray blob,
+  // i.e. blob that has got no real references from onodes
+  int64_t sbid = 0;
+
+public:
+  enum {
+    INVALID_POOL_ID = INT64_MIN
+  };
+
+  int64_t pool_id = INVALID_POOL_ID;
+  // subzero value indicates compressed_allocated as well
+  int32_t allocated_chunks = 0;
+
+  sb_info_t(int64_t _sbid = 0) : sbid(_sbid)
+  {
+  }
+  bool operator< (const sb_info_t& other) const {
+    return std::abs(sbid) < std::abs(other.sbid);
+  }
+  bool operator< (const uint64_t& other_sbid) const {
+    return uint64_t(std::abs(sbid)) < other_sbid;
+  }
+  bool is_stray() const {
+    return sbid < 0;
+  }
+  uint64_t get_sbid() const {
+    return uint64_t(std::abs(sbid));
+  }
+  void adopt() {
+    sbid = std::abs(sbid);
+  }
+} __attribute__((packed));
+
+// Space-efficient container to keep a set of sb_info structures
+// given that the majority of entries are appended in a proper id-sorted
+// order. Hence one can keep them in a regular vector and apply binary search
+// whenever specific entry to be found.
+// For the rare occasions when out-of-order append takes place - an auxilliary
+// regular map is used.
+struct sb_info_space_efficient_map_t {
+  // large array sorted by the user
+  mempool::bluestore_fsck::vector<sb_info_t> items;
+  // small additional set of items we maintain sorting ourselves
+  // this would never keep an entry with id > items.back().id
+  mempool::bluestore_fsck::vector<sb_info_t> aux_items;
+
+  sb_info_t& add_maybe_stray(uint64_t sbid) {
+    return _add(-int64_t(sbid));
+  }
+  sb_info_t& add_or_adopt(uint64_t sbid) {
+    auto& r = _add(sbid);
+    r.adopt();
+    return r;
+  }
+  auto find(uint64_t id) {
+    if (items.size() != 0) {
+      auto it = std::lower_bound(
+       items.begin(),
+       items.end() - 1,
+       id,
+       [](const sb_info_t& a, const uint64_t& b) {
+         return a < b;
+       });
+      if (it->get_sbid() == id) {
+       return it;
+      }
+      if (aux_items.size() != 0) {
+       auto it = std::lower_bound(
+         aux_items.begin(),
+         aux_items.end(),
+         id,
+         [](const sb_info_t& a, const uint64_t& b) {
+           return a < b;
+         });
+       if (it->get_sbid() == id) {
+         return it;
+       }
+      }
+    }
+    return items.end();
+  }
+  // enumerates strays, order isn't guaranteed.
+  void foreach_stray(std::function<void(const sb_info_t&)> cb) {
+    for (auto& sbi : items) {
+      if (sbi.is_stray()) {
+       cb(sbi);
+      }
+    }
+    for (auto& sbi : aux_items) {
+      if (sbi.is_stray()) {
+       cb(sbi);
+      }
+    }
+  }
+  auto end() {
+    return items.end();
+  }
+
+  void shrink() {
+    items.shrink_to_fit();
+    aux_items.shrink_to_fit();
+  }
+  void clear() {
+    items.clear();
+    aux_items.clear();
+    shrink();
+  }
+private:
+  sb_info_t& _add(int64_t id) {
+    uint64_t n_id = uint64_t(std::abs(id));
+    if (items.size() == 0 || n_id > items.back().get_sbid()) {
+      return items.emplace_back(id);
+    }
+    auto it = find(n_id);
+    if (it != items.end()) {
+      return *it;
+    }
+    if (aux_items.size() == 0 || n_id > aux_items.back().get_sbid()) {
+      return aux_items.emplace_back(id);
+    }
+    // do sorted insertion, may be expensive!
+    it = std::upper_bound(
+      aux_items.begin(),
+      aux_items.end(),
+      n_id,
+      [](const uint64_t& a, const sb_info_t& b) {
+       return a < b.get_sbid();
+      });
+    return *aux_items.emplace(it, id);
+  }
+};
 
 #endif
index 846185a077e76d6f6c508d5e9476f603daf961ad..6bbb396442ef776b52bbb28ca8002f77d6ca19ed 100644 (file)
 #include "common/errno.h"
 #include "include/stringify.h"
 #include "include/coredumpctl.h"
-
 #include "include/unordered_map.h"
+#include "os/kv.h"
 #include "store_test_fixture.h"
 
+using namespace std;
 using namespace std::placeholders;
 
 typedef boost::mt11213b gen_type;
@@ -7392,6 +7393,7 @@ TEST_P(StoreTestSpecificAUSize, garbageCollection) {
       const PerfCounters* counters = store->get_perf_counters();
       ASSERT_EQ(counters->get(l_bluestore_gc_merged), 0x3ffffu);
     }
+
     {
       struct store_statfs_t statfs;
       WRITE_AT(0, buf_len-1);
@@ -7613,7 +7615,7 @@ TEST_P(StoreTestSpecificAUSize, BluestoreRepairTest) {
     }
 
     bstore->umount();
-    ASSERT_EQ(bstore->fsck(false), 3);
+    ASSERT_EQ(bstore->fsck(false), 4);
     ASSERT_LE(bstore->repair(false), 0);
     ASSERT_EQ(bstore->fsck(false), 0);
   }
@@ -7722,6 +7724,78 @@ TEST_P(StoreTestSpecificAUSize, BluestoreBrokenZombieRepairTest) {
   bstore->mount();
 }
 
+TEST_P(StoreTestSpecificAUSize, BluestoreRepairSharedBlobTest) {
+  if (string(GetParam()) != "bluestore")
+    return;
+
+  SetVal(g_conf(), "bluestore_fsck_on_mount", "false");
+  SetVal(g_conf(), "bluestore_fsck_on_umount", "false");
+
+  const size_t block_size = 0x1000;
+  StartDeferred(block_size);
+
+  BlueStore* bstore = dynamic_cast<BlueStore*> (store.get());
+
+  // fill the store with some data
+  const uint64_t pool = 555;
+  coll_t cid(spg_t(pg_t(0, pool), shard_id_t::NO_SHARD));
+  auto ch = store->create_new_collection(cid);
+
+  ghobject_t hoid = make_object("Object 1", pool);
+  ghobject_t hoid_cloned = hoid;
+  hoid_cloned.hobj.snap = 1;
+  ghobject_t hoid2 = make_object("Object 2", pool);
+
+  string s(block_size, 1);
+  bufferlist bl;
+  bl.append(s);
+  int r;
+  {
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 0);
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+
+  // check the scenario when shared blob contains
+  // references to extents from two objects which don't overlapp
+  // o1 -> 0x2000~1K
+  // o2 -> 0x4000~1k
+  cerr << "introduce 2 non-overlapped extents in a shared blob"
+       << std::endl;
+  {
+    ObjectStore::Transaction t;
+    t.write(cid, hoid, 0, bl.length(), bl);
+    t.write(cid, hoid2, 0, bl.length(), bl); // to make a gap in allocations
+    t.write(cid, hoid, block_size * 2 , bl.length(), bl);
+    t.clone(cid, hoid, hoid_cloned);
+    t.zero(cid, hoid, 0, bl.length());
+    t.zero(cid, hoid_cloned, block_size * 2, bl.length());
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+  bstore->umount();
+  bstore->mount();
+  {
+    string key;
+    _key_encode_u64(1, &key);
+    bluestore_shared_blob_t sb(1);
+    sb.ref_map.get(0x2000, block_size);
+    sb.ref_map.get(0x4000, block_size);
+    sb.ref_map.get(0x4000, block_size);
+    bufferlist bl;
+    encode(sb, bl);
+    bstore->inject_broken_shared_blob_key(key, bl);
+  }
+  bstore->umount();
+  ASSERT_EQ(bstore->fsck(false), 2);
+  ASSERT_EQ(bstore->repair(false), 0);
+  ASSERT_EQ(bstore->fsck(false), 0);
+
+  cerr << "Completing" << std::endl;
+  bstore->mount();
+}
+
 TEST_P(StoreTestSpecificAUSize, BluestoreBrokenNoSharedBlobRepairTest) {
   if (string(GetParam()) != "bluestore")
     return;
@@ -7746,6 +7820,7 @@ TEST_P(StoreTestSpecificAUSize, BluestoreBrokenNoSharedBlobRepairTest) {
     ghobject_t hoid_cloned = hoid;
     hoid_cloned.hobj.snap = 1;
 
+
     {
       ObjectStore::Transaction t;
       t.create_collection(cid, 0);
@@ -7772,12 +7847,22 @@ TEST_P(StoreTestSpecificAUSize, BluestoreBrokenNoSharedBlobRepairTest) {
   cerr << "injecting" << std::endl;
   sleep(3); // need some time for the previous write to land
   bstore->inject_no_shared_blob_key();
+  bstore->inject_stray_shared_blob_key(12345678);
 
   {
     cerr << "fscking/fixing" << std::endl;
     bstore->umount();
-    ASSERT_EQ(bstore->fsck(false), 3);
-    ASSERT_LE(bstore->repair(false), 3);
+    // depending on the allocation map's source we can
+    // either observe or don't observe an additional
+    // extent leak detection. Hence adjusting the expected
+    // value
+    size_t expected_error_count =
+      7; // 4 sb ref mismatch errors + 1 statfs + 1 block leak + 1 non-free
+    ASSERT_EQ(bstore->fsck(false), expected_error_count);
+    // repair might report less errors than fsck above showed
+    // as some errors, e.g. statfs mismatch, are implicitly fixed
+    // before the detection during the previous repair steps...
+    ASSERT_LE(bstore->repair(false), expected_error_count);
     ASSERT_EQ(bstore->fsck(false), 0);
   }
 
index 383fbdb9ab2cf001a001c08ca8226e80ead2cf9e..f8c57c1c35feae8c5ad7b04f413b7806a4d61547 100644 (file)
@@ -11,6 +11,7 @@
 #include "common/ceph_argparse.h"
 #include "global/global_init.h"
 #include "global/global_context.h"
+#include "perfglue/heap_profiler.h"
 
 #include <sstream>
 
@@ -31,6 +32,7 @@ TEST(bluestore, sizeof) {
   P(bluestore_onode_t);
   P(bluestore_blob_t);
   P(PExtentVector);
+  P(ghobject_t);
   P(bluestore_shared_blob_t);
   P(bluestore_extent_ref_map_t);
   P(bluestore_extent_ref_map_t::record_t);
@@ -42,10 +44,78 @@ TEST(bluestore, sizeof) {
   P(bufferlist);
   P(bufferptr);
   P(range_seg_t);
+  P(sb_info_t);
   cout << "map<uint64_t,uint64_t>\t" << sizeof(map<uint64_t,uint64_t>) << std::endl;
   cout << "map<char,char>\t" << sizeof(map<char,char>) << std::endl;
 }
 
+void dump_mempools()
+{
+  ostringstream ostr;
+  Formatter* f = Formatter::create("json-pretty", "json-pretty", "json-pretty");
+  ostr << "Mempools: ";
+  f->open_object_section("mempools");
+  mempool::dump(f);
+  f->close_section();
+  f->flush(ostr);
+  delete f;
+  cout << ostr.str() << std::endl;
+}
+
+TEST(sb_info_space_efficient_map_t, basic) {
+  sb_info_space_efficient_map_t sb_info;
+  const size_t num_shared = 1000;
+  for (size_t i = 0; i < num_shared; i += 2) {
+    auto& sbi = sb_info.add_maybe_stray(i);
+    sbi.pool_id = i;
+  }
+  ASSERT_TRUE(sb_info.find(0) != sb_info.end());
+  ASSERT_TRUE(sb_info.find(1) == sb_info.end());
+  ASSERT_TRUE(sb_info.find(2) != sb_info.end());
+  ASSERT_TRUE(sb_info.find(4)->pool_id == 4);
+  ASSERT_TRUE(sb_info.find(num_shared) == sb_info.end());
+
+  // ordered insertion
+  sb_info.add_or_adopt(num_shared).pool_id = num_shared;
+  ASSERT_TRUE(sb_info.find(num_shared) != sb_info.end());
+  ASSERT_TRUE(sb_info.find(num_shared)->pool_id == num_shared);
+
+  // out of order insertion
+  sb_info.add_or_adopt(1).pool_id = 1;
+  ASSERT_TRUE(sb_info.find(1) != sb_info.end());
+  ASSERT_TRUE(sb_info.find(1)->pool_id == 1);
+
+  // ordered insertion
+  sb_info.add_maybe_stray(num_shared + 1).pool_id = num_shared + 1;
+  ASSERT_TRUE(sb_info.find(num_shared + 1) != sb_info.end());
+  ASSERT_TRUE(sb_info.find(num_shared + 1)->pool_id == num_shared + 1);
+
+  // out of order insertion
+  sb_info.add_maybe_stray(105).pool_id = 105;
+  ASSERT_TRUE(sb_info.find(105) != sb_info.end());
+  ASSERT_TRUE(sb_info.find(105)->pool_id == 105);
+}
+
+TEST(sb_info_space_efficient_map_t, size) {
+  const size_t num_shared = 10000000;
+  sb_info_space_efficient_map_t sb_info;
+
+  BlueStore store(g_ceph_context, "", 4096);
+  BlueStore::OnodeCacheShard* oc = BlueStore::OnodeCacheShard::create(
+    g_ceph_context, "lru", NULL);
+  BlueStore::BufferCacheShard* bc = BlueStore::BufferCacheShard::create(
+    g_ceph_context, "lru", NULL);
+
+  auto coll = ceph::make_ref<BlueStore::Collection>(&store, oc, bc, coll_t());
+
+  for (size_t i = 0; i < num_shared; i++) {
+    auto& sbi = sb_info.add_or_adopt(i);
+    // primarily to silent the 'unused' warning
+    ceph_assert(sbi.pool_id == sb_info_t::INVALID_POOL_ID);
+  }
+  dump_mempools();
+}
+
 TEST(bluestore_extent_ref_map_t, add)
 {
   bluestore_extent_ref_map_t m;
@@ -1739,6 +1809,71 @@ TEST(bluestore_blob_t, wrong_map_bl_in_51682)
   }
 }
 
+TEST(shared_blob_2hash_tracker_t, basic_test)
+{
+  shared_blob_2hash_tracker_t t1(1024 * 1024, 4096);
+
+  ASSERT_TRUE(t1.count_non_zero() == 0);
+
+  t1.inc(0, 0, 1);
+  ASSERT_TRUE(t1.count_non_zero() != 0);
+  t1.inc(0, 0, -1);
+  ASSERT_TRUE(t1.count_non_zero() == 0);
+
+  t1.inc(3, 0x1000, 2);
+  ASSERT_TRUE(t1.count_non_zero() != 0);
+  t1.inc(3, 0x1000, -1);
+  ASSERT_TRUE(t1.count_non_zero() != 0);
+  t1.inc(3, 0x1000, -1);
+  ASSERT_TRUE(t1.count_non_zero() == 0);
+
+  t1.inc(2, 0x2000, 5);
+  ASSERT_TRUE(t1.count_non_zero() != 0);
+  t1.inc(18, 0x2000, -5);
+  ASSERT_TRUE(t1.count_non_zero() != 0);
+  t1.inc(18, 0x2000, 1);
+  ASSERT_TRUE(t1.count_non_zero() != 0);
+  t1.inc(2, 0x2000, -1);
+  ASSERT_TRUE(t1.count_non_zero() != 0);
+  t1.inc(18, 0x2000, 4);
+  ASSERT_TRUE(t1.count_non_zero() != 0);
+  t1.inc(2, 0x2000, -4);
+  ASSERT_TRUE(t1.count_non_zero() == 0);
+
+  t1.inc(3, 0x3000, 2);
+  ASSERT_TRUE(t1.count_non_zero() != 0);
+  t1.inc(4, 0x3000, -1);
+  ASSERT_TRUE(t1.count_non_zero() != 0);
+  t1.inc(4, 0x3000, -1);
+  ASSERT_TRUE(t1.count_non_zero() != 0);
+  t1.inc(3, 0x3000, -2);
+  ASSERT_TRUE(t1.count_non_zero() != 0);
+  t1.inc(4, 0x3000, 1);
+  ASSERT_TRUE(t1.count_non_zero() != 0);
+  t1.inc(4, 0x3000, 1);
+  ASSERT_TRUE(t1.count_non_zero() == 0);
+
+  t1.inc(5, 0x1000, 1);
+  t1.inc(5, 0x2000, 3);
+  t1.inc(5, 0x3000, 2);
+  t1.inc(5, 0x8000, 1);
+
+  ASSERT_TRUE(t1.count_non_zero() != 0);
+
+  ASSERT_TRUE(!t1.test_all_zero(5,0x1000));
+  ASSERT_TRUE(!t1.test_all_zero(5, 0x2000));
+  ASSERT_TRUE(!t1.test_all_zero(5, 0x3000));
+  ASSERT_TRUE(t1.test_all_zero(5, 0x4000));
+  ASSERT_TRUE(!t1.test_all_zero(5, 0x8000));
+
+  ASSERT_TRUE(t1.test_all_zero_range(5, 0, 0x1000));
+  ASSERT_TRUE(t1.test_all_zero_range(5, 0x500, 0x500));
+  ASSERT_TRUE(!t1.test_all_zero_range(5, 0x500, 0x1500));
+  ASSERT_TRUE(!t1.test_all_zero_range(5, 0x1500, 0x3200));
+  ASSERT_TRUE(t1.test_all_zero_range(5, 0x4500, 0x1500));
+  ASSERT_TRUE(t1.test_all_zero_range(5, 0x4500, 0x3b00));
+  ASSERT_TRUE(!t1.test_all_zero_range(5, 0, 0x9000));
+}
 int main(int argc, char **argv) {
   vector<const char*> args;
   argv_to_vec(argc, (const char **)argv, args);