From: Sage Weil Date: Mon, 19 Dec 2016 15:03:44 +0000 (-0500) Subject: os/bluestore: include modified objects in flush list even if onode unchanged X-Git-Tag: v11.1.1~16^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F12541%2Fhead;p=ceph.git os/bluestore: include modified objects in flush list even if onode unchanged We use the onode flush list so that we can ->flush() as a barrier before doing any read/modify/write. For example, omap_rmkeyrange will flush before reading to see what keys to erase in order to ensure that any previous inserts are applied to the db and we see them and remove them. However, some omap operations don't update the onode itself, which means write_onode() doesn't get called and we aren't put on this list. Add a note_modified_object() helper that can be called instead of write_onode() for those cases. That way we get on the list and flush() works as expected. We could have resolved this by just putting ourselves on the dirty onode list, but in practice every OSD op is writing omap keys to the pgmeta object and there is no need to touch the onode key in this case, so doing so would be a big regression. Signed-off-by: Sage Weil --- diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 7b6084fdd79c..514f0ad7e206 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -6412,6 +6412,19 @@ void BlueStore::_txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t) o->flush_txns.insert(txc); } + // objects we modified but didn't affect the onode + auto p = txc->modified_objects.begin(); + while (p != txc->modified_objects.end()) { + if (txc->onodes.count(*p) == 0) { + std::lock_guard l((*p)->flush_lock); + (*p)->flush_txns.insert(txc); + ++p; + } else { + // remove dups with onodes list to avoid problems in _txc_finish + p = txc->modified_objects.erase(p); + } + } + // finalize shared_blobs for (auto sb : txc->shared_blobs) { if (sb->shared_blob.empty()) { @@ -6473,21 +6486,20 @@ void BlueStore::_txc_finish(TransContext *txc) dout(20) << __func__ << " " << txc << " onodes " << txc->onodes << dendl; assert(txc->state == TransContext::STATE_FINISHING); - for (set::iterator p = txc->onodes.begin(); - p != txc->onodes.end(); - ++p) { - std::lock_guard l((*p)->flush_lock); - dout(20) << __func__ << " onode " << *p << " had " << (*p)->flush_txns - << dendl; - assert((*p)->flush_txns.count(txc)); - (*p)->flush_txns.erase(txc); - if ((*p)->flush_txns.empty()) - (*p)->flush_cond.notify_all(); + for (auto ls : { &txc->onodes, &txc->modified_objects }) { + for (auto& o : *ls) { + std::lock_guard l(o->flush_lock); + dout(20) << __func__ << " onode " << o << " had " << o->flush_txns + << dendl; + assert(o->flush_txns.count(txc)); + o->flush_txns.erase(txc); + if (o->flush_txns.empty()) { + o->flush_cond.notify_all(); + } + } + ls->clear(); // clear out refs } - // clear out refs - txc->onodes.clear(); - while (!txc->removed_collections.empty()) { _queue_reap_collection(txc->removed_collections.front()); txc->removed_collections.pop_front(); @@ -6921,6 +6933,7 @@ int BlueStore::queue_transactions( } _txc_write_nodes(txc, txc->t); + // journal wal items if (txc->wal_txn) { // move releases to after wal @@ -8271,7 +8284,7 @@ int BlueStore::_do_remove( } o->exists = false; o->onode = bluestore_onode_t(); - txc->onodes.erase(o); + txc->removed(o); for (auto &s : o->extent_map.shards) { txc->t->rmkey(PREFIX_OBJ, s.key); } @@ -8422,6 +8435,8 @@ int BlueStore::_omap_setkeys(TransContext *txc, if (!o->onode.has_omap()) { o->onode.set_omap_flag(); txc->write_onode(o); + } else { + txc->note_modified_object(o); } string final_key; _key_encode_u64(o->onode.nid, &final_key); @@ -8454,6 +8469,8 @@ int BlueStore::_omap_setheader(TransContext *txc, if (!o->onode.has_omap()) { o->onode.set_omap_flag(); txc->write_onode(o); + } else { + txc->note_modified_object(o); } get_omap_header(o->onode.nid, &key); txc->t->set(PREFIX_OMAP, key, bl); @@ -8488,6 +8505,7 @@ int BlueStore::_omap_rmkeys(TransContext *txc, << " <- " << key << dendl; txc->t->rmkey(PREFIX_OMAP, final_key); } + txc->note_modified_object(o); out: dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; @@ -8521,6 +8539,7 @@ int BlueStore::_omap_rmkey_range(TransContext *txc, dout(30) << __func__ << " rm " << pretty_binary_string(it->key()) << dendl; it->next(); } + txc->note_modified_object(o); out: dout(10) << __func__ << " " << c->cid << " " << o->oid << " = " << r << dendl; diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 421f75e989b0..79c565aaa14c 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -1215,6 +1215,7 @@ public: uint64_t ops, bytes; set onodes; ///< these need to be updated/written + set modified_objects; ///< objects we modified (and need a ref) set shared_blobs; ///< these need to be updated/written set shared_blobs_written; ///< update these on io completion @@ -1316,6 +1317,15 @@ public: void write_shared_blob(SharedBlobRef &sb) { shared_blobs.insert(sb); } + /// note we logically modified object (when onode itself is unmodified) + void note_modified_object(OnodeRef &o) { + // onode itself isn't written, though + modified_objects.insert(o); + } + void removed(OnodeRef& o) { + onodes.erase(o); + modified_objects.erase(o); + } }; class OpSequencer : public Sequencer_impl {