From: Sage Weil
Date: Wed, 8 Mar 2017 19:28:55 +0000 (-0500)
Subject: os/bluestore: update freelist on initial commit
X-Git-Tag: v12.0.1~12^2~41
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=bc5bfdd59200754c453532cd97bf3bc263210e9f;p=ceph-ci.git

os/bluestore: update freelist on initial commit

It does not matter whether we update the freelist in the initial commit or when cleaning up the deferred transaction; both will eventually update the persistent kv freelist. We keep one special case to ensure that legacy deferred events (from a kraken upgrade) still release their extents when they are replayed.

What matters while online is the Allocator, which has an independent in-memory copy of the freelist to make decisions. And we can delay updating that as long as we want.

To avoid any concerns about deferred writes racing against released blocks, just defer any release until the txc is fully completed (including any deferred writes). This ensures that even if we have a pattern like

  txc 1: schedule deferred write on block A
  txc 2: release block A
  txc 1+2: commit
  txc 2: done!
  txc 1: do deferred write
  txc 1: done!

then txc 2 won't do its release because it is stuck behind txc 1 in the OpSequencer queue:

  ...
  txc 1: reaped
  txc 2: reaped (and extents released to alloc)

This builds in some delay in just-released space being usable again, but it should be a very small amount of space relative to the size of the store!
Signed-off-by: Sage Weil --- diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 7dc49daf957..03bb73528aa 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -7430,6 +7430,11 @@ void BlueStore::_osr_reap_done(OpSequencer *osr) break; } + // release to allocator only after all preceding txc's have also + // finished any deferred writes that potentially land in these + // blocks + _txc_release_alloc(txc); + if (!c && txc->first_collection) { c = txc->first_collection; } @@ -7601,12 +7606,15 @@ void BlueStore::_kv_sync_thread() } // cleanup sync deferred keys - for (std::deque::iterator it = deferred_cleaning.begin(); - it != deferred_cleaning.end(); - ++it) { - bluestore_deferred_transaction_t& wt =*(*it)->deferred_txn; - // kv metadata updates - _txc_finalize_kv(*it, synct); + for (auto txc : deferred_cleaning) { + bluestore_deferred_transaction_t& wt = *txc->deferred_txn; + if (!wt.released.empty()) { + // kraken replay compat only + txc->released = wt.released; + dout(10) << __func__ << " deferred txn has released " << txc->released + << " (we just upgraded from kraken) on " << txc << dendl; + _txc_finalize_kv(txc, synct); + } // cleanup the deferred string key; get_deferred_key(wt.seq, &key); @@ -7634,13 +7642,11 @@ void BlueStore::_kv_sync_thread() while (!kv_committing.empty()) { TransContext *txc = kv_committing.front(); assert(txc->state == TransContext::STATE_KV_SUBMITTED); - _txc_release_alloc(txc); _txc_state_proc(txc); kv_committing.pop_front(); } while (!deferred_cleaning.empty()) { TransContext *txc = deferred_cleaning.front(); - _txc_release_alloc(txc); _txc_state_proc(txc); deferred_cleaning.pop_front(); } @@ -7711,11 +7717,6 @@ int BlueStore::_deferred_finish(TransContext *txc) { bluestore_deferred_transaction_t& wt = *txc->deferred_txn; dout(20) << __func__ << " txc " << " seq " << wt.seq << txc << dendl; - - // move released back to txc - txc->deferred_txn->released.swap(txc->released); - 
assert(txc->deferred_txn->released.empty()); - std::lock_guard l2(txc->osr->qlock); std::lock_guard l(kv_lock); txc->state = TransContext::STATE_DEFERRED_CLEANUP; @@ -7838,10 +7839,6 @@ int BlueStore::queue_transactions( // journal deferred items if (txc->deferred_txn) { - // move releases to after deferred - txc->deferred_txn->released.swap(txc->released); - assert(txc->released.empty()); - txc->deferred_txn->seq = ++deferred_seq; bufferlist bl; ::encode(*txc->deferred_txn, bl);