git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore: implement BlueStore repair
author Igor Fedotov <ifedotov@suse.com>
Thu, 21 Dec 2017 13:46:01 +0000 (16:46 +0300)
committer Igor Fedotov <ifedotov@suse.com>
Wed, 21 Feb 2018 12:56:35 +0000 (15:56 +0300)
Signed-off-by: Igor Fedotov <ifedotov@suse.com>
src/os/bluestore/BlueStore.cc
src/os/bluestore/BlueStore.h
src/os/bluestore/bluestore_types.h
src/test/objectstore/store_test.cc
src/test/objectstore/test_bluestore_types.cc

index b2625476147be7ddf401fe8b6c17e87ba6db8b35..618ad3e45838ab680045dd2af73863e94476982c 100644 (file)
@@ -5689,11 +5689,13 @@ static void apply(uint64_t off,
 }
 
 int BlueStore::_fsck_check_extents(
+  const coll_t& cid,
   const ghobject_t& oid,
   const PExtentVector& extents,
   bool compressed,
   mempool_dynamic_bitset &used_blocks,
   uint64_t granularity,
+  BlueStoreRepairer* repairer,
   store_statfs_t& expected_statfs)
 {
   dout(30) << __func__ << " oid " << oid << " extents " << extents << dendl;
@@ -5710,18 +5712,27 @@ int BlueStore::_fsck_check_extents(
       e.offset, e.length, granularity, used_blocks,
       [&](uint64_t pos, mempool_dynamic_bitset &bs) {
        assert(pos < bs.size());
-       if (bs.test(pos))
-         already = true;
+       if (bs.test(pos)) {
+         if (repairer) {
+           repairer->note_misreference(
+             pos * min_alloc_size, min_alloc_size, !already);
+         }
+          if (!already) {
+            derr << "fsck error: " << oid << " extent " << e
+                << " or a subset is already allocated (misreferenced)" << dendl;
+           ++errors;
+           already = true;
+         }
+       }
        else
          bs.set(pos);
       });
-    if (already) {
-      derr << " " << oid << " extent " << e
-          << " or a subset is already allocated" << dendl;
-      ++errors;
-    }
+      if (repairer) {
+       repairer->get_space_usage_tracker().set_used(e.offset, e.length, cid, oid);
+      }
+
     if (e.end() > bdev->get_size()) {
-      derr << " " << oid << " extent " << e
+      derr << "fsck error:  " << oid << " extent " << e
           << " past end of block device" << dendl;
       ++errors;
     }
@@ -5729,13 +5740,50 @@ int BlueStore::_fsck_check_extents(
   return errors;
 }
 
+/**
+An overview of the currently implemented repair logic,
+performed in fsck in two stages: detection (+preparation) and commit.
+Detection stage (in processing order):
+  (Issue -> Repair action to schedule)
+  - Detect undecodable keys for Shared Blobs -> Remove
+  - Detect undecodable records for Shared Blobs -> Remove
+    (might trigger missed Shared Blob detection below)
+  - Detect stray records for Shared Blobs -> Remove
+  - Detect misreferenced pextents -> Fix
+    Prepare a Bloom-like filter to track cid/oid -> pextent
+    Prepare a list of extents that are improperly referenced
+    Enumerate Onode records that might use 'misreferenced' pextents
+    (the Bloom-like filter is applied to reduce computation)
+      For each questionable Onode enumerate all blobs and identify broken ones
+      (i.e. blobs having 'misreferences')
+      Rewrite each broken blob's data by allocating other extents and
+      copying the data there
+      If a blob is shared - unshare it and mark the corresponding Shared Blob
+      for removal
+      Release previously allocated space
+      Update the Extent Map
+  - Detect missed Shared Blobs -> Recreate
+  - Detect undecodable deferred transactions -> Remove
+  - Detect Freelist Manager's 'false free' entries -> Mark as used
+  - Detect Freelist Manager's leaked entries -> Mark as free
+  - Detect statfs inconsistency -> Update
+Commit stage (a separate DB commit per step):
+  - Apply leaked FM entries fix
+  - Apply 'false free' FM entries fix
+  - Apply 'Remove' actions
+  - Apply fix for misreferenced pextents
+  - Apply Shared Blob recreate
+    (can be merged with the step above if misreferences were detected)
+  - Apply StatFS update
+*/
 int BlueStore::_fsck(bool deep, bool repair)
 {
   dout(1) << __func__
-         << (repair ? " fsck" : " repair")
+         << " <<<START>>>"
+         << (repair ? " repair" : " check")
          << (deep ? " (deep)" : " (shallow)") << " start" << dendl;
   int errors = 0;
-  int repaired = 0;
+  unsigned repaired = 0;
 
   typedef btree::btree_set<
     uint64_t,std::less<uint64_t>,
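
The overview above follows a strict two-stage pattern: every detection step only
schedules a fix into a pending KV transaction, and the commit stage submits those
transactions in batches. A minimal self-contained C++ sketch of that shape, using
stand-in types rather than Ceph's KeyValueDB (all names here are hypothetical,
for illustration only):

    #include <functional>
    #include <iostream>
    #include <string>
    #include <vector>

    // Stand-in for KeyValueDB::Transaction: a batch of deferred operations.
    struct FakeTxn {
      std::vector<std::function<void()>> ops;
    };

    // Minimal repairer mirroring BlueStoreRepairer's shape: detection methods
    // only *schedule* fixes; apply() is the single commit point and reports
    // how many fixes were applied.
    class MiniRepairer {
      FakeTxn txn;
      unsigned to_repair_cnt = 0;
    public:
      void remove_key(const std::string& prefix, const std::string& key) {
        ++to_repair_cnt;                  // detection stage: count and defer
        txn.ops.push_back([prefix, key] {
          std::cout << "rmkey " << prefix << "/" << key << std::endl;
        });
      }
      unsigned apply() {                  // commit stage: one batch submit
        for (auto& op : txn.ops)
          op();
        txn.ops.clear();
        unsigned repaired = to_repair_cnt;
        to_repair_cnt = 0;
        return repaired;
      }
    };

    int main() {
      MiniRepairer repairer;
      repairer.remove_key("S", "undecodable-shared-blob-key");
      std::cout << "repaired " << repairer.apply() << " issue(s)" << std::endl;
    }
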
@@ -5749,10 +5797,13 @@ int BlueStore::_fsck(bool deep, bool repair)
   KeyValueDB::Iterator it;
   store_statfs_t expected_statfs, actual_statfs;
   struct sb_info_t {
+    coll_t cid;
     list<ghobject_t> oids;
     SharedBlobRef sb;
     bluestore_extent_ref_map_t ref_map;
-    bool compressed;
+    bool compressed = false;
+    bool passed = false;
+    bool updated = false;
   };
   mempool::bluestore_fsck::map<uint64_t,sb_info_t> sb_info;
 
@@ -5763,6 +5814,7 @@ int BlueStore::_fsck(bool deep, bool repair)
   uint64_t num_shared_blobs = 0;
   uint64_t num_sharded_objects = 0;
   uint64_t num_object_shards = 0;
+  BlueStoreRepairer repairer;
 
   utime_t start = ceph_clock_now();
 
@@ -5822,6 +5874,11 @@ int BlueStore::_fsck(bool deep, bool repair)
       bs.set(pos);
     }
   );
+  if (repair) {
+    repairer.get_space_usage_tracker().init(
+      bdev->get_size(),
+      min_alloc_size);
+  }
 
   if (bluefs) {
     for (auto e = bluefs_extents.begin(); e != bluefs_extents.end(); ++e) {
@@ -5914,12 +5971,9 @@ int BlueStore::_fsck(bool deep, bool repair)
          oid.hobj.pool != (int64_t)pgid.pool() ||
          !c->contains(oid)) {
        c = nullptr;
-       for (ceph::unordered_map<coll_t, CollectionRef>::iterator p =
-              coll_map.begin();
-            p != coll_map.end();
-            ++p) {
-         if (p->second->contains(oid)) {
-           c = p->second;
+       for (auto& p : coll_map) {
+         if (p.second->contains(oid)) {
+           c = p.second;
            break;
          }
        }
@@ -6103,6 +6157,8 @@ int BlueStore::_fsck(bool deep, bool repair)
             ++errors;
           }
          sb_info_t& sbi = sb_info[i.first->shared_blob->get_sbid()];
+         assert(sbi.cid == coll_t() || sbi.cid == c->cid);
+         sbi.cid = c->cid;
          sbi.sb = i.first->shared_blob;
          sbi.oids.push_back(oid);
          sbi.compressed = blob.is_compressed();
@@ -6112,10 +6168,11 @@ int BlueStore::_fsck(bool deep, bool repair)
            }
          }
        } else {
-         errors += _fsck_check_extents(oid, blob.get_extents(),
+         errors += _fsck_check_extents(c->cid, oid, blob.get_extents(),
                                        blob.is_compressed(),
                                        used_blocks,
                                        fm->get_alloc_size(),
+                                       repair ? &repairer : nullptr,
                                        expected_statfs);
         }
       }
@@ -6142,6 +6199,7 @@ int BlueStore::_fsck(bool deep, bool repair)
       }
     }
   }
+
   dout(1) << __func__ << " checking shared_blobs" << dendl;
   it = db->get_iterator(PREFIX_SHARED_BLOB);
   if (it) {
@@ -6151,6 +6209,9 @@ int BlueStore::_fsck(bool deep, bool repair)
       if (get_key_shared_blob(key, &sbid)) {
        derr << "fsck error: bad key '" << key
             << "' in shared blob namespace" << dendl;
+       if (repair) {
+         repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
+       }
        ++errors;
        continue;
       }
@@ -6158,6 +6219,9 @@ int BlueStore::_fsck(bool deep, bool repair)
       if (p == sb_info.end()) {
        derr << "fsck error: found stray shared blob data for sbid 0x"
             << std::hex << sbid << std::dec << dendl;
+       if (repair) {
+         repairer.remove_key(db, PREFIX_SHARED_BLOB, key);
+       }
        ++errors;
       } else {
        ++num_shared_blobs;
@@ -6165,36 +6229,266 @@ int BlueStore::_fsck(bool deep, bool repair)
        bluestore_shared_blob_t shared_blob(sbid);
        bufferlist bl = it->value();
        bufferlist::iterator blp = bl.begin();
-       decode(shared_blob, blp);
+       try {
+         decode(shared_blob, blp);
+       } catch (buffer::error& e) {
+          ++errors;
+          // Force update and don't report as missing
+          sbi.updated = sbi.passed = true;
+
+          derr << "fsck error: failed to decode Shared Blob"
+              << pretty_binary_string(it->key()) << dendl;
+          if (repair) {
+           dout(20) << __func__ << " undecodable Shared Blob, key:'"
+                    << pretty_binary_string(it->key())
+                    << "', removing" << dendl;
+            repairer.remove_key(db, PREFIX_DEFERRED, it->key());
+          }
+          continue;
+        }      
        dout(20) << __func__ << "  " << *sbi.sb << " " << shared_blob << dendl;
        if (shared_blob.ref_map != sbi.ref_map) {
          derr << "fsck error: shared blob 0x" << std::hex << sbid
-              << std::dec << " ref_map " << shared_blob.ref_map
-              << " != expected " << sbi.ref_map << dendl;
+               << std::dec << " ref_map " << shared_blob.ref_map
+               << " != expected " << sbi.ref_map << dendl;
+         sbi.updated = true; // will update later in repair mode only!
          ++errors;
        }
        PExtentVector extents;
        for (auto &r : shared_blob.ref_map.ref_map) {
          extents.emplace_back(bluestore_pextent_t(r.first, r.second.length));
        }
-       errors += _fsck_check_extents(p->second.oids.front(),
+       errors += _fsck_check_extents(sbi.cid,
+                                     p->second.oids.front(),
                                      extents,
                                      p->second.compressed,
                                      used_blocks,
                                      fm->get_alloc_size(),
+                                     repair ? &repairer : nullptr,
                                      expected_statfs);
-       sb_info.erase(p);
+       sbi.passed = true;
+      }
+    }
+  } // if (it)
+
+  if (repair && repairer.preprocess_misreference(db)) {
+    dout(1) << __func__ << " sorting out misreferenced extents" << dendl;
+    auto& space_tracker = repairer.get_space_usage_tracker();
+    auto& misref_extents = repairer.get_misreferences();
+    interval_set<uint64_t> to_release;
+    it = db->get_iterator(PREFIX_OBJ);
+    if (it) {
+      CollectionRef c;
+      spg_t pgid;
+      KeyValueDB::Transaction txn = repairer.get_fix_misreferences_txn();
+      bool bypass_rest = false;
+      for (it->lower_bound(string()); it->valid() && !bypass_rest;
+          it->next()) {
+       dout(30) << __func__ << " key "
+                << pretty_binary_string(it->key()) << dendl;
+       if (is_extent_shard_key(it->key())) {
+         continue;
+       }
+
+       ghobject_t oid;
+       int r = get_key_object(it->key(), &oid);
+       if (r < 0 || !space_tracker.is_used(oid)) {
+         continue;
+       }
+
+       if (!c ||
+           oid.shard_id != pgid.shard ||
+           oid.hobj.pool != (int64_t)pgid.pool() ||
+           !c->contains(oid)) {
+         c = nullptr;
+         for (auto& p : coll_map) {
+           if (p.second->contains(oid)) {
+             c = p.second;
+             break;
+           }
+         }
+         if (!c) {
+           continue;
+         }
+         c->cid.is_pg(&pgid);
+       }
+       if (!space_tracker.is_used(c->cid)) {
+         continue;
+       }
+       dout(20) << __func__ << " check misreference for col:" << c->cid
+                 << " obj:" << oid << dendl;
+
+       RWLock::RLocker l(c->lock);
+       OnodeRef o = c->get_onode(oid, false);
+       o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+       mempool::bluestore_fsck::set<BlobRef> blobs;
+
+       for (auto& e : o->extent_map.extent_map) {
+         blobs.insert(e.blob);
+       }
+       bool need_onode_update = false;
+       bool first_dump = true;
+       for(auto b : blobs) {
+         bool broken_blob = false;
+         auto& pextents = b->dirty_blob().dirty_extents();
+         for (auto& e : pextents) {
+           if (!e.is_valid()) {
+             continue;
+           }
+           // for the sake of simplicity and proper shared blob handling
+           // always rewrite the whole blob even when it's partially
+           // misreferenced.
+           if (misref_extents.intersects(e.offset, e.length)) {
+             if (first_dump) {
+               first_dump = false;
+               _dump_onode(o, 10);
+             }
+             broken_blob = true;
+             break;
+           }
+         }
+         if (!broken_blob)
+           continue;
+         bool compressed = b->get_blob().is_compressed();
+          need_onode_update = true;
+         dout(10) << __func__
+                   << " fix misreferences in oid:" << oid
+                   << " " << *b << dendl;
+         uint64_t b_off = 0;
+         PExtentVector pext_to_release;
+         pext_to_release.reserve(pextents.size());
+         // rewriting all valid pextents
+         for (auto e = pextents.begin(); e != pextents.end();
+                b_off += e->length, e++) {
+           if (!e->is_valid()) {
+             continue;
+           }
+           int r = alloc->reserve(e->length);
+           if (r != 0) {
+             derr << __func__ << " failed to reserve 0x" << std::hex << e->length
+                  << " bytes, misreferences repair's incomplete" << std::dec << dendl;
+             bypass_rest = true;
+             break;
+           }
+           PExtentVector exts;
+           int64_t alloc_len = alloc->allocate(e->length, min_alloc_size,
+                                               0, 0, &exts);
+           if (alloc_len < (int64_t)e->length) {
+             derr << __func__ << " allocate failed on 0x" << std::hex << e->length
+                   << " min_alloc_size 0x" << min_alloc_size << std::dec << dendl;
+             assert(0 == "allocate failed, wtf");
+           }
+           expected_statfs.allocated += e->length;
+           if (compressed) {
+             expected_statfs.compressed_allocated += e->length;
+           }
+           bufferlist bl;
+           IOContext ioc(cct, NULL, true); // allow EIO
+           r = bdev->read(e->offset, e->length, &bl, &ioc, false);
+           if (r < 0) {
+             derr << __func__ << " failed to read from 0x" << std::hex << e->offset
+                   <<"~" << e->length << std::dec << dendl;
+             assert(0 == "read failed, wtf");
+           }
+           pext_to_release.push_back(*e);
+           e = pextents.erase(e);
+           e = pextents.insert(e, exts.begin(), exts.end());
+           b->get_blob().map_bl(
+             b_off, bl,
+             [&](uint64_t offset, bufferlist& t) {
+               int r = bdev->write(offset, t, false);
+               assert(r == 0);
+             });
+           e += exts.size() - 1;
+            for (auto& p : exts) {
+             fm->allocate(p.offset, p.length, txn);
+           }
+         } // for (auto e = pextents.begin(); e != pextents.end(); e++) {
+
+         if (b->get_blob().is_shared()) {
+            b->dirty_blob().clear_flag(bluestore_blob_t::FLAG_SHARED);
+
+           auto sb_it = sb_info.find(b->shared_blob->get_sbid());
+           assert(sb_it != sb_info.end());
+           sb_info_t& sbi = sb_it->second;
+
+           for (auto& r : sbi.ref_map.ref_map) {
+             expected_statfs.allocated -= r.second.length;
+             if (sbi.compressed) {
+               // NB: it's crucial to use the compressed flag from sb_info_t,
+               // as that's the value we originally used while accumulating
+               // expected_statfs
+               expected_statfs.compressed_allocated -= r.second.length;
+             }
+           }
+           sbi.updated = sbi.passed = true;
+           sbi.ref_map.clear();
+           
+           // relying on blob's pextents to decide what to release.
+           for (auto& p : pext_to_release) {
+             to_release.insert(p.offset, p.length);
+           }
+         } else {
+           for (auto& p : pext_to_release) {
+             expected_statfs.allocated -= p.length;
+             if (compressed) {
+               expected_statfs.compressed_allocated -= p.length;
+             }
+             to_release.insert(p.offset, p.length);
+           }
+         }
+         if (bypass_rest) {
+           break;
+         }
+       } // for(auto b : blobs) 
+       if (need_onode_update) {
+         o->extent_map.dirty_range(0, OBJECT_MAX_SIZE);
+         _record_onode(o, txn);
+       }
+      } // for (it->lower_bound(string()); it->valid(); it->next())
+
+      for (auto it = to_release.begin(); it != to_release.end(); ++it) {
+       dout(10) << __func__ << " release 0x" << std::hex << it.get_start()
+                << "~" << it.get_len() << std::dec << dendl;
+       fm->release(it.get_start(), it.get_len(), txn);
       }
-    }
-  }
+      alloc->release(to_release);
+      to_release.clear();
+    } // if (it)
+  } // if (repair && repairer.preprocess_misreference(db))
+
   for (auto &p : sb_info) {
-    derr << "fsck error: shared_blob 0x" << p.first
-        << " key is missing (" << *p.second.sb << ")" << dendl;
-    ++errors;
+    sb_info_t& sbi = p.second;
+    if (!sbi.passed) {
+      derr << "fsck error: missing " << *sbi.sb << dendl;
+      ++errors;
+    }
+    if (repair && (!sbi.passed || sbi.updated)) {
+      auto sbid = p.first;
+      if (sbi.ref_map.empty()) {
+       assert(sbi.passed);
+       dout(20) << __func__ << " " << *sbi.sb
+                << " is empty, removing" << dendl;
+       repairer.fix_shared_blob(db, sbid, nullptr);
+      } else {
+       bufferlist bl;
+       bluestore_shared_blob_t persistent(sbid, std::move(sbi.ref_map));
+       encode(persistent, bl);
+       dout(20) << __func__ << " " << *sbi.sb
+                << " is " << bl.length() << " bytes, updating" << dendl;
+
+       repairer.fix_shared_blob(db, sbid, &bl);
+      }
+    }
   }
+  sb_info.clear();
+
   if (!(actual_statfs == expected_statfs)) {
     derr << "fsck error: actual " << actual_statfs
         << " != expected " << expected_statfs << dendl;
+    if (repair) {
+      repairer.fix_statfs(db, expected_statfs);
+    }
     ++errors;
   }
 
@@ -6236,8 +6530,13 @@ int BlueStore::_fsck(bool deep, bool repair)
       } catch (buffer::error& e) {
        derr << "fsck error: failed to decode deferred txn "
             << pretty_binary_string(it->key()) << dendl;
-       r = -EIO;
-        goto out_scan;
+       if (repair) {
+          dout(20) << __func__ << " undecodable deferred TXN record, key: '"
+                  << pretty_binary_string(it->key())
+                  << "', removing" << dendl;   
+         repairer.remove_key(db, PREFIX_DEFERRED, it->key());
+       }
+       continue;
       }
       dout(20) << __func__ << "  deferred " << wt.seq
               << " ops " << wt.ops.size()
@@ -6276,36 +6575,40 @@ int BlueStore::_fsck(bool deep, bool repair)
         [&](uint64_t pos, mempool_dynamic_bitset &bs) {
          assert(pos < bs.size());
           if (bs.test(pos)) {
-            intersects = true;
+           if (offset == SUPER_RESERVED &&
+               length == min_alloc_size - SUPER_RESERVED) {
+             // this is due to the change just after luminous to min_alloc_size
+             // granularity allocations, and our baked in assumption at the top
+             // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
+             // (vs luminous's round_up_to(SUPER_RESERVED,block_size)).  harmless,
+             // since we will never allocate this region below min_alloc_size.
+             dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
+                      << " and min_alloc_size, 0x" << std::hex << offset << "~"
+                      << length << dendl;
+           } else {
+              intersects = true;
+             if (repair) {
+               repairer.fix_false_free(db, fm,
+                                       pos * min_alloc_size,
+                                       min_alloc_size);
+             }
+           }
           } else {
            bs.set(pos);
           }
         }
       );
       if (intersects) {
-       if (offset == SUPER_RESERVED &&
-           length == min_alloc_size - SUPER_RESERVED) {
-         // this is due to the change just after luminous to min_alloc_size
-         // granularity allocations, and our baked in assumption at the top
-         // of _fsck that 0~round_up_to(SUPER_RESERVED,min_alloc_size) is used
-         // (vs luminous's round_up_to(SUPER_RESERVED,block_size)).  harmless,
-         // since we will never allocate this region below min_alloc_size.
-         dout(10) << __func__ << " ignoring free extent between SUPER_RESERVED"
-                  << " and min_alloc_size, 0x" << std::hex << offset << "~"
-                  << length << dendl;
-       } else {
-         derr << "fsck error: free extent 0x" << std::hex << offset
-              << "~" << length << std::dec
-              << " intersects allocated blocks" << dendl;
-         ++errors;
-       }
+       derr << "fsck error: free extent 0x" << std::hex << offset
+             << "~" << length << std::dec
+             << " intersects allocated blocks" << dendl;
+       ++errors;
       }
     }
     fm->enumerate_reset();
     size_t count = used_blocks.count();
     if (used_blocks.size() != count) {
       assert(used_blocks.size() > count);
-      ++errors;
       used_blocks.flip();
       size_t start = used_blocks.find_first();
       while (start != decltype(used_blocks)::npos) {
@@ -6313,10 +6616,17 @@ int BlueStore::_fsck(bool deep, bool repair)
        while (true) {
          size_t next = used_blocks.find_next(cur);
          if (next != cur + 1) {
+           ++errors;
            derr << "fsck error: leaked extent 0x" << std::hex
                 << ((uint64_t)start * fm->get_alloc_size()) << "~"
                 << ((cur + 1 - start) * fm->get_alloc_size()) << std::dec
                 << dendl;
+           if (repair) {
+             repairer.fix_leaked(db,
+                                 fm,
+                                 start * min_alloc_size,
+                                 (cur + 1 - start) * min_alloc_size);
+           }
            start = next;
            break;
          }
@@ -6326,7 +6636,11 @@ int BlueStore::_fsck(bool deep, bool repair)
       used_blocks.flip();
     }
   }
-
+  if (repair) {
+    dout(5) << __func__ << " applying repair results" << dendl;
+    repaired = repairer.apply(db);
+    dout(5) << __func__ << " repair applied" << dendl;
+  }
  out_scan:
   mempool_thread.shutdown();
   _flush_cache();
@@ -6358,10 +6672,135 @@ int BlueStore::_fsck(bool deep, bool repair)
          << dendl;
 
   utime_t duration = ceph_clock_now() - start;
-  dout(1) << __func__ << " finish with " << errors << " errors, " << repaired
-         << " repaired, " << (errors - repaired) << " remaining in "
+  dout(1) << __func__ << " <<<FINISH>>> with " << errors << " errors, " << repaired
+         << " repaired, " << (errors - (int)repaired) << " remaining in "
          << duration << " seconds" << dendl;
-  return errors - repaired;
+  return errors - (int)repaired;
+}
+
+/// methods to inject various errors fsck can repair
+void BlueStore::inject_broken_shared_blob_key(const string& key,
+                                 const bufferlist& bl)
+{
+  KeyValueDB::Transaction txn;
+  txn = db->get_transaction();
+  txn->set(PREFIX_SHARED_BLOB, key, bl);
+  db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_leaked(uint64_t len)
+{
+  KeyValueDB::Transaction txn;
+  txn = db->get_transaction();
+
+  PExtentVector exts;
+  int r = alloc->reserve(len);
+  assert(r == 0);
+  int64_t alloc_len = alloc->allocate(len, min_alloc_size,
+                                          min_alloc_size * 256, 0, &exts);
+  assert(alloc_len >= (int64_t)len);
+  for (auto& p : exts) {
+    fm->allocate(p.offset, p.length, txn);
+  }
+  db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_false_free(coll_t cid, ghobject_t oid)
+{
+  KeyValueDB::Transaction txn;
+  OnodeRef o;
+  CollectionRef c = _get_collection(cid);
+  assert(c);
+  {
+    RWLock::WLocker l(c->lock); // just to avoid internal asserts
+    o = c->get_onode(oid, false);
+    assert(o);
+    o->extent_map.fault_range(db, 0, OBJECT_MAX_SIZE);
+  }
+
+  bool injected = false;
+  txn = db->get_transaction();
+  auto& em = o->extent_map.extent_map;
+  std::vector<const PExtentVector*> v;
+  if (em.size()) {
+    v.push_back(&em.begin()->blob->get_blob().get_extents());
+  }
+  if (em.size() > 1) {
+    auto it = em.end();
+    --it;
+    v.push_back(&(it->blob->get_blob().get_extents()));
+  }
+  for (auto pext : v) {
+    if (pext->size()) {
+      auto p = pext->begin();
+      while (p != pext->end()) {
+       if (p->is_valid()) {
+         dout(20) << __func__ << " release 0x" << std::hex << p->offset
+                  << "~" << p->length << std::dec << dendl;
+         fm->release(p->offset, p->length, txn);
+         injected = true;
+         break;
+       }
+       ++p;
+      }
+    }
+  }
+  assert(injected);
+  db->submit_transaction_sync(txn);
+}
+
+void BlueStore::inject_statfs(const store_statfs_t& new_statfs)
+{
+  BlueStoreRepairer repairer;
+  repairer.fix_statfs(db, new_statfs);
+  repairer.apply(db);
+}
+
+void BlueStore::inject_misreference(coll_t cid1, ghobject_t oid1,
+                                   coll_t cid2, ghobject_t oid2,
+                                   uint64_t offset)
+{
+  OnodeRef o1;
+  CollectionRef c1 = _get_collection(cid1);
+  assert(c1);
+  {
+    RWLock::WLocker l(c1->lock); // just to avoid internal asserts
+    o1 = c1->get_onode(oid1, false);
+    assert(o1);
+    o1->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
+  }
+  OnodeRef o2;
+  CollectionRef c2 = _get_collection(cid2);
+  assert(c2);
+  {
+    RWLock::WLocker l(c2->lock); // just to avoid internal asserts
+    o2 = c2->get_onode(oid2, false);
+    assert(o2);
+    o2->extent_map.fault_range(db, offset, OBJECT_MAX_SIZE);
+  }
+  Extent& e1 = *(o1->extent_map.seek_lextent(offset));
+  Extent& e2 = *(o2->extent_map.seek_lextent(offset));
+
+  // require onode/extent layout to be the same (and simple)
+  // to make things easier
+  assert(o1->onode.extent_map_shards.empty());
+  assert(o2->onode.extent_map_shards.empty());
+  assert(o1->extent_map.spanning_blob_map.size() == 0);
+  assert(o2->extent_map.spanning_blob_map.size() == 0);
+  assert(e1.logical_offset == e2.logical_offset);
+  assert(e1.length == e2.length);
+  assert(e1.blob_offset == e2.blob_offset);
+
+  KeyValueDB::Transaction txn;
+  txn = db->get_transaction();
+
+  // along with misreference error this will create space leaks errors
+  e2.blob->dirty_blob() = e1.blob->get_blob();
+  o2->extent_map.dirty_range(offset, e2.length);
+  o2->extent_map.update(txn, false);
+
+  _record_onode(o2, txn);
+  db->submit_transaction_sync(txn);
 }
 
 void BlueStore::collect_metadata(map<string,string> *pm)
@@ -8042,48 +8481,7 @@ void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t)
 
   // finalize onodes
   for (auto o : txc->onodes) {
-    // finalize extent_map shards
-    o->extent_map.update(t, false);
-    if (o->extent_map.needs_reshard()) {
-      o->extent_map.reshard(db, t);
-      o->extent_map.update(t, true);
-      if (o->extent_map.needs_reshard()) {
-       dout(20) << __func__ << " warning: still wants reshard, check options?"
-                << dendl;
-       o->extent_map.clear_needs_reshard();
-      }
-      logger->inc(l_bluestore_onode_reshard);
-    }
-
-    // bound encode
-    size_t bound = 0;
-    denc(o->onode, bound);
-    o->extent_map.bound_encode_spanning_blobs(bound);
-    if (o->onode.extent_map_shards.empty()) {
-      denc(o->extent_map.inline_bl, bound);
-    }
-
-    // encode
-    bufferlist bl;
-    unsigned onode_part, blob_part, extent_part;
-    {
-      auto p = bl.get_contiguous_appender(bound, true);
-      denc(o->onode, p);
-      onode_part = p.get_logical_offset();
-      o->extent_map.encode_spanning_blobs(p);
-      blob_part = p.get_logical_offset() - onode_part;
-      if (o->onode.extent_map_shards.empty()) {
-       denc(o->extent_map.inline_bl, p);
-      }
-      extent_part = p.get_logical_offset() - onode_part - blob_part;
-    }
-
-    dout(20) << __func__  << " onode " << o->oid << " is " << bl.length()
-            << " (" << onode_part << " bytes onode + "
-            << blob_part << " bytes spanning blobs + "
-            << extent_part << " bytes inline extents)"
-            << dendl;
-    t->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
+    _record_onode(o, t);
     o->flushing_count++;
   }
 
@@ -11696,4 +12094,215 @@ void BlueStore::_apply_padding(uint64_t head_pad,
   }
 }
 
+void BlueStore::_record_onode(OnodeRef &o, KeyValueDB::Transaction &txn)
+{
+  // finalize extent_map shards
+  o->extent_map.update(txn, false);
+  if (o->extent_map.needs_reshard()) {
+    o->extent_map.reshard(db, txn);
+    o->extent_map.update(txn, true);
+    if (o->extent_map.needs_reshard()) {
+      dout(20) << __func__ << " warning: still wants reshard, check options?"
+               << dendl;
+      o->extent_map.clear_needs_reshard();
+    }
+    logger->inc(l_bluestore_onode_reshard);
+  }
+
+  // bound encode
+  size_t bound = 0;
+  denc(o->onode, bound);
+  o->extent_map.bound_encode_spanning_blobs(bound);
+  if (o->onode.extent_map_shards.empty()) {
+    denc(o->extent_map.inline_bl, bound);
+  }
+
+  // encode
+  bufferlist bl;
+  unsigned onode_part, blob_part, extent_part;
+  {
+    auto p = bl.get_contiguous_appender(bound, true);
+    denc(o->onode, p);
+    onode_part = p.get_logical_offset();
+    o->extent_map.encode_spanning_blobs(p);
+    blob_part = p.get_logical_offset() - onode_part;
+    if (o->onode.extent_map_shards.empty()) {
+      denc(o->extent_map.inline_bl, p);
+    }
+    extent_part = p.get_logical_offset() - onode_part - blob_part;
+  }
+
+  dout(20) << __func__  << " onode " << o->oid << " is " << bl.length()
+           << " (" << onode_part << " bytes onode + "
+           << blob_part << " bytes spanning blobs + "
+           << extent_part << " bytes inline extents)"
+           << dendl;
+
+
+  txn->set(PREFIX_OBJ, o->key.c_str(), o->key.size(), bl);
+}
+
 // ===========================================
+// BlueStoreRepairer
+
+size_t BlueStoreRepairer::StoreSpaceTracker::filter_out(
+  const interval_set<uint64_t>& extents)
+{
+  assert(granularity); // initialized
+  // can't call for the second time
+  assert(!was_filtered_out);
+  assert(collections_bfs.size() == objects_bfs.size());
+
+  size_t prev_pos = 0;
+  size_t npos = collections_bfs.size();
+
+  bloom_vector collections_reduced;
+  bloom_vector objects_reduced;
+
+  for (auto e : extents) {
+    if (e.second == 0) {
+      continue;
+    }
+    size_t pos = max(e.first / granularity, prev_pos);
+    size_t end_pos = 1 + (e.first + e.second - 1) / granularity;
+    while (pos != npos && pos < end_pos)  {
+        assert( collections_bfs[pos].element_count() ==
+          objects_bfs[pos].element_count());
+        if (collections_bfs[pos].element_count()) {
+          collections_reduced.push_back(std::move(collections_bfs[pos]));
+          objects_reduced.push_back(std::move(objects_bfs[pos]));
+        }
+        ++pos;
+    }
+    prev_pos = end_pos;
+  }
+  collections_reduced.swap(collections_bfs);
+  objects_reduced.swap(objects_bfs);
+  was_filtered_out = true;
+  return collections_bfs.size();
+}
+
+bool BlueStoreRepairer::remove_key(KeyValueDB *db,
+                                  const string& prefix,
+                                  const string& key)
+{
+  if (!remove_key_txn) {
+    remove_key_txn = db->get_transaction();
+  }
+  ++to_repair_cnt;
+  remove_key_txn->rmkey(prefix, key);
+
+  return true;
+}
+
+bool BlueStoreRepairer::fix_shared_blob(
+  KeyValueDB *db,
+  uint64_t sbid,
+  const bufferlist* bl)
+{
+  KeyValueDB::Transaction txn;
+  if (fix_misreferences_txn) { // reuse this txn
+    txn = fix_misreferences_txn;
+  } else {
+    if (!fix_shared_blob_txn) {
+      fix_shared_blob_txn = db->get_transaction();
+    }
+    txn = fix_shared_blob_txn;
+  }
+  string key;
+  get_shared_blob_key(sbid, &key);
+
+  ++to_repair_cnt;
+  if (bl) {
+    txn->set(PREFIX_SHARED_BLOB, key, *bl);
+  } else {
+    txn->rmkey(PREFIX_SHARED_BLOB, key);
+  }
+  return true;
+}
+
+bool BlueStoreRepairer::fix_statfs(KeyValueDB *db,
+                                  const store_statfs_t& new_statfs)
+{
+  if (!fix_statfs_txn) {
+    fix_statfs_txn = db->get_transaction();
+  }
+  BlueStore::volatile_statfs vstatfs;
+  vstatfs = new_statfs;
+  bufferlist bl;
+  vstatfs.encode(bl);
+  ++to_repair_cnt;
+  fix_statfs_txn->set(PREFIX_STAT, "bluestore_statfs", bl);
+  return true;
+}
+
+bool BlueStoreRepairer::fix_leaked(KeyValueDB *db,
+                                  FreelistManager* fm,
+                                  uint64_t offset, uint64_t len)
+{
+  if (!fix_fm_leaked_txn) {
+    fix_fm_leaked_txn = db->get_transaction();
+  }
+  ++to_repair_cnt;
+  fm->release(offset, len, fix_fm_leaked_txn);
+  return true;
+}
+bool BlueStoreRepairer::fix_false_free(KeyValueDB *db,
+                                      FreelistManager* fm,
+                                      uint64_t offset, uint64_t len)
+{
+  if (!fix_fm_false_free_txn) {
+    fix_fm_false_free_txn = db->get_transaction();
+  }
+  ++to_repair_cnt;
+  fm->allocate(offset, len, fix_fm_false_free_txn);
+  return true;
+}
+
+bool BlueStoreRepairer::preprocess_misreference(KeyValueDB *db)
+{
+  if (misreferenced_extents.size()) {
+    size_t n = space_usage_tracker.filter_out(misreferenced_extents);
+    assert(n > 0);
+    if (!fix_misreferences_txn) {
+      fix_misreferences_txn = db->get_transaction();
+    }
+    return true;
+  }
+  return false;
+}
+
+unsigned BlueStoreRepairer::apply(KeyValueDB* db)
+{
+  if (fix_fm_leaked_txn) {
+    db->submit_transaction_sync(fix_fm_leaked_txn);
+    fix_fm_leaked_txn = nullptr;
+  }
+  if (fix_fm_false_free_txn) {
+    db->submit_transaction_sync(fix_fm_false_free_txn);
+    fix_fm_false_free_txn = nullptr;
+  }
+  if (remove_key_txn) {
+    db->submit_transaction_sync(remove_key_txn);
+    remove_key_txn = nullptr;
+  }
+  if (fix_misreferences_txn) {
+    db->submit_transaction_sync(fix_misreferences_txn);
+    fix_misreferences_txn = nullptr;
+  }
+  if (fix_shared_blob_txn) {
+    db->submit_transaction_sync(fix_shared_blob_txn);
+    fix_shared_blob_txn = nullptr;
+  }
+
+  if (fix_statfs_txn) {
+    db->submit_transaction_sync(fix_statfs_txn);
+    fix_statfs_txn = nullptr;
+  }
+  unsigned repaired = to_repair_cnt;
+  to_repair_cnt = 0;
+  return repaired;
+}
+
+// =======================================================
+
index 746bdc97a8d16232e04500c6f5df5d8c3d4c5040..e0da63ffc2d5f78050fac1ee3316821fd528b080 100644 (file)
@@ -33,6 +33,7 @@
 #include "include/unordered_map.h"
 #include "include/memory.h"
 #include "include/mempool.h"
+#include "common/bloom_filter.hpp"
 #include "common/Finisher.h"
 #include "common/perf_counters.h"
 #include "compressor/Compressor.h"
@@ -45,6 +46,7 @@
 class Allocator;
 class FreelistManager;
 class BlueFS;
+class BlueStoreRepairer;
 
 //#define DEBUG_CACHE
 //#define DEBUG_DEFERRED
@@ -1457,6 +1459,14 @@ public:
     int64_t& compressed_allocated() {
       return values[STATFS_COMPRESSED_ALLOCATED];
     }
+    volatile_statfs& operator=(const store_statfs_t& st) {
+      values[STATFS_ALLOCATED] = st.allocated;
+      values[STATFS_STORED] = st.stored;
+      values[STATFS_COMPRESSED_ORIGINAL] = st.compressed_original;
+      values[STATFS_COMPRESSED] = st.compressed;
+      values[STATFS_COMPRESSED_ALLOCATED] = st.compressed_allocated;
+      return *this;
+    }
     bool is_empty() {
       return values[STATFS_ALLOCATED] == 0 &&
        values[STATFS_STORED] == 0 &&
@@ -2044,11 +2054,13 @@ public:
 
 private:
   int _fsck_check_extents(
+    const coll_t& cid,
     const ghobject_t& oid,
     const PExtentVector& extents,
     bool compressed,
     mempool_dynamic_bitset &used_blocks,
     uint64_t granularity,
+    BlueStoreRepairer* repairer,
     store_statfs_t& expected_statfs);
 
   void _buffer_cache_write(
@@ -2080,6 +2092,8 @@ private:
                      uint64_t tail_pad,
                      bufferlist& padded);
 
+  void _record_onode(OnodeRef &o, KeyValueDB::Transaction &txn);
+
   // -- ondisk version ---
 public:
   const int32_t latest_ondisk_format = 2;        ///< our version
@@ -2330,6 +2344,17 @@ public:
     RWLock::WLocker l(debug_read_error_lock);
     debug_mdata_error_objects.insert(o);
   }
+
+  /// methods to inject various errors fsck can repair
+  void inject_broken_shared_blob_key(const string& key,
+                        const bufferlist& bl);
+  void inject_leaked(uint64_t len);
+  void inject_false_free(coll_t cid, ghobject_t oid);
+  void inject_statfs(const store_statfs_t& new_statfs);
+  void inject_misreference(coll_t cid1, ghobject_t oid1,
+                          coll_t cid2, ghobject_t oid2,
+                          uint64_t offset);
+
   void compact() override {
     assert(db);
     db->compact();
@@ -2633,4 +2658,187 @@ static inline void intrusive_ptr_release(BlueStore::OpSequencer *o) {
   o->put();
 }
 
+class BlueStoreRepairer
+{
+public:
+  // to simplify future potential migration to mempools
+  using fsck_interval = interval_set<uint64_t>;
+
+  // Structure to track which pextents are used by a specific cid/oid.
+  // Similar to a Bloom filter, only positive and false-positive matches
+  // are possible.
+  // Maintains two lists of bloom filters, one for cids and one for oids,
+  //   where each list entry is a BF for a specific disk pextent.
+  //   The extent length covered per filter is determined on init.
+  // Allows filtering out 'uninteresting' pextents to speed up subsequent
+  //  'is_used' access.
+  struct StoreSpaceTracker {
+    const uint64_t BLOOM_FILTER_SALT_COUNT = 2;
+    const uint64_t BLOOM_FILTER_TABLE_SIZE = 32; // bytes per single filter
+    const uint64_t BLOOM_FILTER_EXPECTED_COUNT = 16; // arbitrarily selected
+    static const uint64_t DEF_MEM_CAP = 128 * 1024 * 1024;
+
+    typedef mempool::bluestore_fsck::vector<bloom_filter> bloom_vector;
+    bloom_vector collections_bfs;
+    bloom_vector objects_bfs;
+    
+    bool was_filtered_out = false; 
+    uint64_t granularity = 0; // extent length for a single filter
+
+    StoreSpaceTracker() {
+    }
+    StoreSpaceTracker(const StoreSpaceTracker& from) :
+      collections_bfs(from.collections_bfs),
+      objects_bfs(from.objects_bfs),
+      granularity(from.granularity) {
+    }
+
+    void init(uint64_t total,
+             uint64_t min_alloc_size,
+             uint64_t mem_cap = DEF_MEM_CAP) {
+      assert(!granularity); // not initialized yet
+      assert(min_alloc_size && isp2(min_alloc_size));
+      assert(mem_cap);
+      
+      total = ROUND_UP_TO(total, min_alloc_size);
+      granularity = total * BLOOM_FILTER_TABLE_SIZE * 2 / mem_cap;
+
+      if (!granularity) {
+       granularity = min_alloc_size;
+      } else {
+       granularity = ROUND_UP_TO(granularity, min_alloc_size);
+      }
+
+      uint64_t entries = P2ROUNDUP(total, granularity) / granularity;
+      collections_bfs.resize(entries,
+        bloom_filter(BLOOM_FILTER_SALT_COUNT,
+                     BLOOM_FILTER_TABLE_SIZE,
+                     0,
+                     BLOOM_FILTER_EXPECTED_COUNT));
+      objects_bfs.resize(entries, 
+        bloom_filter(BLOOM_FILTER_SALT_COUNT,
+                     BLOOM_FILTER_TABLE_SIZE,
+                     0,
+                     BLOOM_FILTER_EXPECTED_COUNT));
+    }
+    inline uint32_t get_hash(const coll_t& cid) const {
+      return cid.hash_to_shard(1);
+    }
+    inline void set_used(uint64_t offset, uint64_t len,
+                        const coll_t& cid, const ghobject_t& oid) {
+      assert(granularity); // initialized
+      
+      // can't call this func after filter_out has been applied
+      assert(!was_filtered_out);
+      if (!len) {
+       return;
+      }
+      auto pos = offset / granularity;
+      auto end_pos = (offset + len - 1) / granularity;
+      while (pos <= end_pos) {
+        collections_bfs[pos].insert(get_hash(cid));
+        objects_bfs[pos].insert(oid.hobj.get_hash());
+        ++pos;
+      }
+    }
+    // filter out entries unrelated to the specified (broken) extents;
+    // only the offset-less 'is_used' overloads are permitted after that
+    size_t filter_out(const fsck_interval& extents);
+
+    // determines if the collection is present after filtering out
+    inline bool is_used(const coll_t& cid) const {
+      assert(was_filtered_out);
+      for(auto& bf : collections_bfs) {
+        if (bf.contains(get_hash(cid))) {
+          return true;
+        }
+      }
+      return false;
+    }
+    // determines if the object is present after filtering out
+    inline bool is_used(const ghobject_t& oid) const {
+      assert(was_filtered_out);
+      for(auto& bf : objects_bfs) {
+        if (bf.contains(oid.hobj.get_hash())) {
+          return true;
+        }
+      }
+      return false;
+    }
+    // determines if the collection is present before filtering out
+    inline bool is_used(const coll_t& cid, uint64_t offs) const {
+      assert(granularity); // initialized
+      assert(!was_filtered_out);
+      auto &bf = collections_bfs[offs / granularity];
+      if (bf.contains(get_hash(cid))) {
+        return true;
+      }
+      return false;
+    }
+    // determines if the object is present before filtering out
+    inline bool is_used(const ghobject_t& oid, uint64_t offs) const {
+      assert(granularity); // initialized
+      assert(!was_filtered_out);
+      auto &bf = objects_bfs[offs / granularity];
+      if (bf.contains(oid.hobj.get_hash())) {
+        return true;
+      }
+      return false;
+    }
+  };
+public:
+
+  bool remove_key(KeyValueDB *db, const string& prefix, const string& key);
+  bool fix_shared_blob(KeyValueDB *db,
+                        uint64_t sbid,
+                      const bufferlist* bl);
+  bool fix_statfs(KeyValueDB *db, const store_statfs_t& new_statfs);
+
+  bool fix_leaked(KeyValueDB *db,
+                 FreelistManager* fm,
+                 uint64_t offset, uint64_t len);
+  bool fix_false_free(KeyValueDB *db,
+                     FreelistManager* fm,
+                     uint64_t offset, uint64_t len);
+
+  void init(uint64_t total_space, uint64_t lres_tracking_unit_size);
+
+  bool preprocess_misreference(KeyValueDB *db);
+
+  unsigned apply(KeyValueDB* db);
+
+  void note_misreference(uint64_t offs, uint64_t len, bool inc_error) {
+    misreferenced_extents.insert(offs, len);
+    if (inc_error) {
+      ++to_repair_cnt;
+    }
+  }
+
+  StoreSpaceTracker& get_space_usage_tracker() {
+    return space_usage_tracker;
+  }
+  const fsck_interval& get_misreferences() const {
+    return misreferenced_extents;
+  }
+  KeyValueDB::Transaction get_fix_misreferences_txn() {
+    return fix_misreferences_txn;
+  }
+
+private:
+  unsigned to_repair_cnt = 0;
+  KeyValueDB::Transaction fix_fm_leaked_txn;
+  KeyValueDB::Transaction fix_fm_false_free_txn;
+  KeyValueDB::Transaction remove_key_txn;
+  KeyValueDB::Transaction fix_statfs_txn;
+  KeyValueDB::Transaction fix_shared_blob_txn;
+
+  KeyValueDB::Transaction fix_misreferences_txn;
+
+  StoreSpaceTracker space_usage_tracker;
+
+  // non-shared extents with multiple references
+  fsck_interval misreferenced_extents;
+
+};
+
 #endif
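
StoreSpaceTracker above keeps one small Bloom filter per fixed-size disk region,
so fsck can cheaply ask "might this cid/oid reference a broken region?" and skip
every onode that definitely does not. A rough sketch of that idea with a trivial
stand-in filter (a plain bitset rather than Ceph's bloom_filter; names are
illustrative only):

    #include <bitset>
    #include <cassert>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    // One tiny "filter" per disk region: object hashes are inserted for every
    // region they touch. Queries answer "maybe" (false positives possible) or
    // "definitely not" (no false negatives), which is all fsck needs.
    struct MiniTracker {
      static constexpr size_t BITS = 64;    // stand-in filter width
      uint64_t granularity;                 // extent length per filter
      std::vector<std::bitset<BITS>> filters;

      MiniTracker(uint64_t total, uint64_t gran)
        : granularity(gran), filters((total + gran - 1) / gran) {}

      void set_used(uint64_t off, uint64_t len, uint32_t oid_hash) {
        if (!len) return;
        for (uint64_t pos = off / granularity;
             pos <= (off + len - 1) / granularity; ++pos)
          filters[pos].set(oid_hash % BITS);
      }
      bool is_used(uint32_t oid_hash, uint64_t off) const {
        return filters[off / granularity].test(oid_hash % BITS);
      }
    };

    int main() {
      MiniTracker t(1 << 20, 0x1000);
      t.set_used(0x1023, 0x3000, 42);       // marks regions 1..4
      assert(t.is_used(42, 0x2023));        // "maybe used" -> true here
      assert(!t.is_used(42, 0x9001));       // untouched region -> false
      std::cout << "tracker ok" << std::endl;
    }
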
index 66d68ab2fcc58d0e1e2e1bbdfe2f3daf3bdf5b17..805f10e3aa2993f92841ebf78ef7173784e1d8bd 100644 (file)
@@ -444,6 +444,9 @@ public:
   const PExtentVector& get_extents() const {
     return extents;
   }
+  PExtentVector& dirty_extents() {
+    return extents;
+  }
 
   DENC_HELPERS;
   void bound_encode(size_t& p, uint64_t struct_v) const {
@@ -852,6 +855,9 @@ struct bluestore_shared_blob_t {
   bluestore_extent_ref_map_t ref_map;  ///< shared blob extents
 
   bluestore_shared_blob_t(uint64_t _sbid) : sbid(_sbid) {}
+  bluestore_shared_blob_t(uint64_t _sbid,
+                         bluestore_extent_ref_map_t&& _ref_map)
+    : sbid(_sbid), ref_map(std::move(_ref_map)) {}
 
   DENC(bluestore_shared_blob_t, v, p) {
     DENC_START(1, 1, p);
index 0f43c6d72e55c4711674d0b0189730abda028bab..13ef8173bd7bedad440e36970d2cd0bb2333c74a 100644 (file)
@@ -6907,6 +6907,129 @@ TEST_P(StoreTestSpecificAUSize, fsckOnUnalignedDevice2) {
   g_conf->apply_changes(NULL);
 }
 
+TEST_P(StoreTest, BluestoreRepairTest) {
+  if (string(GetParam()) != "bluestore")
+    return;
+  const size_t offs_base = 65536 / 2;
+
+  g_ceph_context->_conf->set_val("bluestore_fsck_on_mount", "false");
+  g_ceph_context->_conf->set_val("bluestore_fsck_on_umount", "false");
+  g_ceph_context->_conf->set_val("bluestore_max_blob_size", stringify(2 * offs_base));
+  g_ceph_context->_conf->set_val("bluestore_extent_map_shard_max_size", "12000");
+  g_ceph_context->_conf->apply_changes(NULL);
+
+  BlueStore* bstore = dynamic_cast<BlueStore*> (store.get());
+
+  // fill the store with some data
+  coll_t cid(spg_t(pg_t(0,555), shard_id_t::NO_SHARD));
+  auto ch = store->create_new_collection(cid);
+
+  ghobject_t hoid(hobject_t(sobject_t("Object 1", CEPH_NOSNAP)));
+  ghobject_t hoid_dup(hobject_t(sobject_t("Object 1(dup)", CEPH_NOSNAP)));
+  ghobject_t hoid2(hobject_t(sobject_t("Object 2", CEPH_NOSNAP)));
+  ghobject_t hoid_cloned = hoid2;
+  hoid_cloned.hobj.snap = 1;
+  ghobject_t hoid3(hobject_t(sobject_t("Object 3", CEPH_NOSNAP)));
+  ghobject_t hoid3_cloned = hoid3;
+  hoid3_cloned.hobj.snap = 1;
+  bufferlist bl;
+  bl.append("1234512345");
+  int r;
+  const size_t repeats = 16;
+  {
+    cerr << "create collection + write" << std::endl;
+    ObjectStore::Transaction t;
+    t.create_collection(cid, 0);
+    for( auto i = 0ul; i < repeats; ++i ) {
+      t.write(cid, hoid, i * offs_base, bl.length(), bl);
+      t.write(cid, hoid_dup, i * offs_base, bl.length(), bl);
+    }
+    for( auto i = 0ul; i < repeats; ++i ) {
+      t.write(cid, hoid2, i * offs_base, bl.length(), bl);
+    }
+    t.clone(cid, hoid2, hoid_cloned);
+
+    r = queue_transaction(store, ch, std::move(t));
+    ASSERT_EQ(r, 0);
+  }
+
+  bstore->umount();
+  //////////// leaked pextent fix ////////////
+  cerr << "fix leaked pextents" << std::endl;
+  ASSERT_EQ(bstore->fsck(false), 0);
+  ASSERT_EQ(bstore->repair(false), 0);
+  bstore->mount();
+  bstore->inject_leaked(0x30000);
+  bstore->umount();
+  ASSERT_EQ(bstore->fsck(false), 1);
+  ASSERT_EQ(bstore->repair(false), 0);
+  ASSERT_EQ(bstore->fsck(false), 0);
+
+  //////////// false free fix ////////////
+  cerr << "fix false free pextents" << std::endl;
+  bstore->mount();
+  bstore->inject_false_free(cid, hoid);
+  bstore->umount();
+  ASSERT_EQ(bstore->fsck(false), 2);
+  ASSERT_EQ(bstore->repair(false), 0);
+  ASSERT_EQ(bstore->fsck(false), 0);
+
+  //////////// verify invalid statfs ///////////
+  cerr << "fix invalid statfs" << std::endl;
+  store_statfs_t statfs0, statfs;
+  bstore->mount();
+  ASSERT_EQ(bstore->statfs(&statfs0), 0);
+  statfs = statfs0;
+  statfs.allocated += 0x10000;
+  statfs.stored += 0x10000;
+  ASSERT_FALSE(statfs0 == statfs);
+  bstore->inject_statfs(statfs);
+  bstore->umount();
+
+  ASSERT_EQ(bstore->fsck(false), 1);
+  ASSERT_EQ(bstore->repair(false), 0);
+  ASSERT_EQ(bstore->fsck(false), 0);
+  ASSERT_EQ(bstore->mount(), 0);
+  ASSERT_EQ(bstore->statfs(&statfs), 0);
+  // adjust free space so the comparison succeeds
+  statfs0.available = statfs.available;
+  ASSERT_EQ(statfs0, statfs);
+
+  ///////// undecodable shared blob key / stray shared blob records ///////
+  cerr << "undecodable shared blob key" << std::endl;
+  bstore->inject_broken_shared_blob_key("undec1",
+                           bufferlist());
+  bstore->inject_broken_shared_blob_key("undecodable key 2",
+                           bufferlist());
+  bstore->inject_broken_shared_blob_key("undecodable key 3",
+                           bufferlist());
+  bstore->umount();
+  ASSERT_EQ(bstore->fsck(false), 3);
+  ASSERT_EQ(bstore->repair(false), 0);
+  ASSERT_EQ(bstore->fsck(false), 0);
+
+  cerr << "misreferencing" << std::endl;
+  bstore->mount();
+  bstore->inject_misreference(cid, hoid, cid, hoid_dup, 0);
+  bstore->inject_misreference(cid, hoid, cid, hoid_dup, (offs_base * repeats) / 2);
+  bstore->inject_misreference(cid, hoid, cid, hoid_dup, offs_base * (repeats - 1));
+  
+  bstore->umount();
+  ASSERT_EQ(bstore->fsck(false), 6);
+  ASSERT_EQ(bstore->repair(false), 0);
+
+  ASSERT_EQ(bstore->fsck(true), 0);
+
+  cerr << "Completing" << std::endl;
+  bstore->mount();
+  g_ceph_context->_conf->set_val("bluestore_fsck_on_mount", "true");
+  g_ceph_context->_conf->set_val("bluestore_fsck_on_umount", "true");
+  g_ceph_context->_conf->set_val("bluestore_max_alloc_size", "0");
+  g_ceph_context->_conf->set_val("bluestore_extent_map_shard_max_size", "1200");
+
+  g_ceph_context->_conf->apply_changes(NULL);
+}
+
 int main(int argc, char **argv) {
   vector<const char*> args;
   argv_to_vec(argc, (const char **)argv, args);
index 3aad5f68408490668b178dfacf7a4695b863876e..ab497550da0b85781279963754f85a165e1fd078 100644 (file)
@@ -1441,7 +1441,95 @@ TEST(GarbageCollector, BasicTest)
     em.clear();
     old_extents.clear();
   }
- }
+}
+
+TEST(BlueStoreRepairer, StoreSpaceTracker)
+{
+  BlueStoreRepairer::StoreSpaceTracker bmap0;
+  bmap0.init((uint64_t)4096 * 1024 * 1024 * 1024, 0x1000);
+  ASSERT_EQ(bmap0.granularity, 2 * 1024 * 1024);
+  ASSERT_EQ(bmap0.collections_bfs.size(), 2048 * 1024);
+  ASSERT_EQ(bmap0.objects_bfs.size(), 2048 * 1024);
+
+  BlueStoreRepairer::StoreSpaceTracker bmap;
+  bmap.init(0x2000 * 0x1000 - 1, 0x1000, 512 * 1024);
+  ASSERT_EQ(bmap.granularity, 0x1000);
+  ASSERT_EQ(bmap.collections_bfs.size(), 0x2000);
+  ASSERT_EQ(bmap.objects_bfs.size(), 0x2000);
+
+  coll_t cid;
+  ghobject_t hoid;
+
+  ASSERT_FALSE(bmap.is_used(cid, 0));
+  ASSERT_FALSE(bmap.is_used(hoid, 0));
+  bmap.set_used(0, 1, cid, hoid);
+  ASSERT_TRUE(bmap.is_used(cid, 0));
+  ASSERT_TRUE(bmap.is_used(hoid, 0));
+
+  ASSERT_FALSE(bmap.is_used(cid, 0x1023));
+  ASSERT_FALSE(bmap.is_used(hoid, 0x1023));
+  ASSERT_FALSE(bmap.is_used(cid, 0x2023));
+  ASSERT_FALSE(bmap.is_used(hoid, 0x2023));
+  ASSERT_FALSE(bmap.is_used(cid, 0x3023));
+  ASSERT_FALSE(bmap.is_used(hoid, 0x3023));
+  bmap.set_used(0x1023, 0x3000, cid, hoid);
+  ASSERT_TRUE(bmap.is_used(cid, 0x1023));
+  ASSERT_TRUE(bmap.is_used(hoid, 0x1023));
+  ASSERT_TRUE(bmap.is_used(cid, 0x2023));
+  ASSERT_TRUE(bmap.is_used(hoid, 0x2023));
+  ASSERT_TRUE(bmap.is_used(cid, 0x3023));
+  ASSERT_TRUE(bmap.is_used(hoid, 0x3023));
+
+  ASSERT_FALSE(bmap.is_used(cid, 0x9001));
+  ASSERT_FALSE(bmap.is_used(hoid, 0x9001));
+  ASSERT_FALSE(bmap.is_used(cid, 0xa001));
+  ASSERT_FALSE(bmap.is_used(hoid, 0xa001));
+  ASSERT_FALSE(bmap.is_used(cid, 0xb000));
+  ASSERT_FALSE(bmap.is_used(hoid, 0xb000));
+  ASSERT_FALSE(bmap.is_used(cid, 0xc000));
+  ASSERT_FALSE(bmap.is_used(hoid, 0xc000));
+  bmap.set_used(0x9001, 0x2fff, cid, hoid);
+  ASSERT_TRUE(bmap.is_used(cid, 0x9001));
+  ASSERT_TRUE(bmap.is_used(hoid, 0x9001));
+  ASSERT_TRUE(bmap.is_used(cid, 0xa001));
+  ASSERT_TRUE(bmap.is_used(hoid, 0xa001));
+  ASSERT_TRUE(bmap.is_used(cid, 0xb001));
+  ASSERT_TRUE(bmap.is_used(hoid, 0xb001));
+  ASSERT_FALSE(bmap.is_used(cid, 0xc000));
+  ASSERT_FALSE(bmap.is_used(hoid, 0xc000));
+
+  bmap.set_used(0xa001, 0x2, cid, hoid);
+  ASSERT_TRUE(bmap.is_used(cid, 0x9001));
+  ASSERT_TRUE(bmap.is_used(hoid, 0x9001));
+  ASSERT_TRUE(bmap.is_used(cid, 0xa001));
+  ASSERT_TRUE(bmap.is_used(hoid, 0xa001));
+  ASSERT_TRUE(bmap.is_used(cid, 0xb001));
+  ASSERT_TRUE(bmap.is_used(hoid, 0xb001));
+  ASSERT_FALSE(bmap.is_used(cid, 0xc000));
+  ASSERT_FALSE(bmap.is_used(hoid, 0xc000));
+
+  ASSERT_FALSE(bmap.is_used(cid, 0xc0000));
+  ASSERT_FALSE(bmap.is_used(hoid, 0xc0000));
+  ASSERT_FALSE(bmap.is_used(cid, 0xc1000));
+  ASSERT_FALSE(bmap.is_used(hoid, 0xc1000));
+
+  bmap.set_used(0xc0000, 0x2000, cid, hoid);
+  ASSERT_TRUE(bmap.is_used(cid, 0xc0000));
+  ASSERT_TRUE(bmap.is_used(hoid, 0xc0000));
+  ASSERT_TRUE(bmap.is_used(cid, 0xc1000));
+  ASSERT_TRUE(bmap.is_used(hoid, 0xc1000));
+
+  interval_set<uint64_t> extents;
+  extents.insert(0,0x500);
+  extents.insert(0x800,0x100);
+  extents.insert(0x1000,0x1000);
+  extents.insert(0xa001,1);
+  extents.insert(0xa0000,0xff8);
+
+  ASSERT_EQ(bmap.filter_out(extents), 3);
+  ASSERT_TRUE(bmap.is_used(cid));
+  ASSERT_TRUE(bmap.is_used(hoid));
+}
 
 int main(int argc, char **argv) {
   vector<const char*> args;