From 2c7ee9dfeca87e1b7c25ca7d92fe4502aa82e556 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 27 Mar 2017 11:32:03 -0400 Subject: [PATCH] os/bluestore: flush osr on collection split We need to ensure that any preceding txcs have finished their deferred writes before splitting, or else later txcs on the child sequencer will not order/wait for deferred writes correctly before deallocating extents. Fixes: http://tracker.ceph.com/issues/19379 Signed-off-by: Sage Weil --- src/os/bluestore/BlueStore.cc | 34 ++++++++++++++++++++++++++++++++++ src/os/bluestore/BlueStore.h | 7 +++++++ 2 files changed, 41 insertions(+) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 71b4b119c8c..1c5db927957 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -7492,6 +7492,11 @@ bool BlueStore::_osr_reap_done(OpSequencer *osr) dout(20) << __func__ << " txc " << txc << " " << txc->get_state_name() << dendl; if (txc->state != TransContext::STATE_DONE) { + if (txc->state == TransContext::STATE_PREPARE && + deferred_aggressive) { + // for _osr_drain_preceding() + osr->qcond.notify_all(); + } break; } @@ -7522,6 +7527,28 @@ bool BlueStore::_osr_reap_done(OpSequencer *osr) return empty; } +void BlueStore::_osr_drain_preceding(TransContext *txc) +{ + OpSequencer *osr = txc->osr.get(); + dout(10) << __func__ << " " << txc << " osr " << osr << dendl; + deferred_aggressive = true; // FIXME: maybe osr-local aggressive flag? 
+ { + // submit anything pending + std::lock_guard l(deferred_lock); + if (!osr->deferred_pending.empty()) { + _deferred_try_submit(osr); + } + } + { + // wake up any previously finished deferred events + std::lock_guard l(kv_lock); + kv_cond.notify_one(); + } + osr->drain_preceding(txc); + deferred_aggressive = false; + dout(10) << __func__ << " " << osr << " done" << dendl; +} + void BlueStore::_osr_drain_all() { dout(10) << __func__ << dendl; @@ -10261,6 +10288,13 @@ int BlueStore::_split_collection(TransContext *txc, RWLock::WLocker l2(d->lock); int r; + // flush all previous deferred writes on this sequencer. this is a bit + // heavyweight, but we need to make sure all deferred writes complete + // before we split as the new collection's sequencer may need to order + // this after those writes, and we don't bother with the complexity of + // moving those TransContexts over to the new osr. + _osr_drain_preceding(txc); + // move any cached items (onodes and referenced shared blobs) that will // belong to the child collection post-split. leave everything else behind. // this may include things that don't strictly belong to the now-smaller diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 28bf60b9d54..57633192069 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -1633,6 +1633,12 @@ public: qcond.wait(l); } + void drain_preceding(TransContext *txc) { + std::unique_lock l(qlock); + while (!q.empty() && &q.front() != txc) + qcond.wait(l); + } + bool _is_all_kv_submitted() { // caller must hold qlock if (q.empty()) { @@ -1900,6 +1906,7 @@ private: void _txc_release_alloc(TransContext *txc); bool _osr_reap_done(OpSequencer *osr); + void _osr_drain_preceding(TransContext *txc); void _osr_drain_all(); void _osr_unregister_all(); -- 2.47.3