From: Sage Weil
Date: Mon, 27 Mar 2017 15:32:03 +0000 (-0400)
Subject: os/bluestore: flush osr on collection split
X-Git-Tag: v12.0.2~184^2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=2c7ee9dfeca87e1b7c25ca7d92fe4502aa82e556;p=ceph.git

os/bluestore: flush osr on collection split

We need to ensure that any preceding txcs have finished their deferred
writes before splitting, or else later txcs on the child sequencer will
not order/wait for deferred writes correctly before deallocated extents.

Fixes: http://tracker.ceph.com/issues/19379
Signed-off-by: Sage Weil
---

diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index 71b4b119c8ca..1c5db9279571 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -7492,6 +7492,11 @@ bool BlueStore::_osr_reap_done(OpSequencer *osr)
       dout(20) << __func__ << " txc " << txc
                << " " << txc->get_state_name() << dendl;
       if (txc->state != TransContext::STATE_DONE) {
+        if (txc->state == TransContext::STATE_PREPARE &&
+            deferred_aggressive) {
+          // for _osr_drain_preceding()
+          osr->qcond.notify_all();
+        }
         break;
       }
 
@@ -7522,6 +7527,28 @@ bool BlueStore::_osr_reap_done(OpSequencer *osr)
   return empty;
 }
 
+void BlueStore::_osr_drain_preceding(TransContext *txc)
+{
+  OpSequencer *osr = txc->osr.get();
+  dout(10) << __func__ << " " << txc << " osr " << osr << dendl;
+  deferred_aggressive = true; // FIXME: maybe osr-local aggressive flag?
+  {
+    // submit anything pending
+    std::lock_guard l(deferred_lock);
+    if (!osr->deferred_pending.empty()) {
+      _deferred_try_submit(osr);
+    }
+  }
+  {
+    // wake up any previously finished deferred events
+    std::lock_guard l(kv_lock);
+    kv_cond.notify_one();
+  }
+  osr->drain_preceding(txc);
+  deferred_aggressive = false;
+  dout(10) << __func__ << " " << osr << " done" << dendl;
+}
+
 void BlueStore::_osr_drain_all()
 {
   dout(10) << __func__ << dendl;
@@ -10261,6 +10288,13 @@ int BlueStore::_split_collection(TransContext *txc,
   RWLock::WLocker l2(d->lock);
   int r;
 
+  // flush all previous deferred writes on this sequencer. this is a bit
+  // heavyweight, but we need to make sure all deferred writes complete
+  // before we split as the new collection's sequencer may need to order
+  // this after those writes, and we don't bother with the complexity of
+  // moving those TransContexts over to the new osr.
+  _osr_drain_preceding(txc);
+
   // move any cached items (onodes and referenced shared blobs) that will
   // belong to the child collection post-split. leave everything else behind.
   // this may include things that don't strictly belong to the now-smaller
diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h
index 28bf60b9d546..576331920696 100644
--- a/src/os/bluestore/BlueStore.h
+++ b/src/os/bluestore/BlueStore.h
@@ -1633,6 +1633,12 @@ public:
         qcond.wait(l);
     }
 
+    void drain_preceding(TransContext *txc) {
+      std::unique_lock l(qlock);
+      while (!q.empty() && &q.front() != txc)
+        qcond.wait(l);
+    }
+
     bool _is_all_kv_submitted() {
       // caller must hold qlock
       if (q.empty()) {
@@ -1900,6 +1906,7 @@ private:
   void _txc_release_alloc(TransContext *txc);
 
   bool _osr_reap_done(OpSequencer *osr);
+  void _osr_drain_preceding(TransContext *txc);
   void _osr_drain_all();
   void _osr_unregister_all();
 