]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
os/bluestore: Modify _deferred_replay
authorAdam Kupczyk <akupczyk@ibm.com>
Thu, 14 Nov 2024 18:19:34 +0000 (18:19 +0000)
committerAdam Kupczyk <akupczyk@ibm.com>
Wed, 18 Dec 2024 10:07:43 +0000 (10:07 +0000)
Modify _deferred_replay to execute it directly, without involving
BlueStore state machine.
In result, kv-sync thread is not necessary.
RocksDB L entries (deferred writes) are removed directly.

Fixes: https://tracker.ceph.com/issues/68060,
  part responsible for stale L entries.

Signed-off-by: Adam Kupczyk <akupczyk@ibm.com>
src/os/bluestore/BlueStore.cc

index c521ee8fbfc20ea44f55644ef1dd85a5c655c40e..3330d8a1e7b930a24f531505762eebafb48accb3 100644 (file)
@@ -14866,7 +14866,7 @@ void BlueStore::_kv_sync_thread()
       auto sync_start = mono_clock::now();
 #endif
       // submit synct synchronously (block and wait for it to commit)
-      int r = db_was_opened_read_only || cct->_conf->bluestore_debug_omit_kv_commit ?
+      int r = cct->_conf->bluestore_debug_omit_kv_commit ?
        0 : db->submit_transaction_sync(synct);
       ceph_assert(r == 0);
 
@@ -15268,19 +15268,14 @@ int BlueStore::_deferred_replay()
       }
     );
   }
-  CollectionRef ch = _get_collection(coll_t::meta());
-  bool fake_ch = false;
-  if (!ch) {
-    // hmm, replaying initial mkfs?
-    ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
-    fake_ch = true;
-  }
-  OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
   if (tracepoint_debug_deferred_replay_start) tracepoint_debug_deferred_replay_start();
+  IOContext ioctx(cct, nullptr);
+  KeyValueDB::Transaction t = db->get_transaction();
   KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
   for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
     dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
             << dendl;
+    t->rmkey(PREFIX_DEFERRED, it->key());
     bluestore_deferred_transaction_t *deferred_txn =
       new bluestore_deferred_transaction_t;
     bufferlist bl = it->value();
@@ -15297,22 +15292,29 @@ int BlueStore::_deferred_replay()
     bool has_some = _eliminate_outdated_deferred(deferred_txn, bluefs_extents);
     if (has_some) {
       if (tracepoint_debug_deferred_replay_track) tracepoint_debug_deferred_replay_track(*deferred_txn);
-      TransContext *txc = _txc_create(ch.get(), osr,  nullptr);
-      txc->deferred_txn = deferred_txn;
-      txc->set_state(TransContext::STATE_KV_DONE);
-      _txc_state_proc(txc);
+      for (auto& op: deferred_txn->ops) {
+        for (auto& e : op.extents) {
+          bufferlist t;
+          op.data.splice(0, e.length, &t);
+          bdev->aio_write(e.offset, t, &ioctx, false);
+        }
+      }
     } else {
       delete deferred_txn;
     }
   }
  out:
-  dout(20) << __func__ << " draining osr" << dendl;
-  _osr_register_zombie(osr);
-  _osr_drain_all();
-  if (tracepoint_debug_deferred_replay_end) tracepoint_debug_deferred_replay_end();
-  if (fake_ch) {
-    new_coll_map.clear();
+  bdev->aio_submit(&ioctx);
+  dout(20) << __func__ << "waiting to complete IO" << dendl;
+  ioctx.aio_wait();
+  dout(20) << __func__ << "wait done" << dendl;
+  if (!db_was_opened_read_only) {
+    db->submit_transaction_sync(t);
+    dout(20) << __func__ << "removed L keys" << dendl;
+  } else {
+    dout(10) << __func__ << "DB read-pnly, skipped L keys removal" << dendl;
   }
+  if (tracepoint_debug_deferred_replay_end) tracepoint_debug_deferred_replay_end();
   dout(10) << __func__ << " completed " << count << " events" << dendl;
   return r;
 }