From: ningtao Date: Tue, 28 May 2019 15:42:47 +0000 (+0800) Subject: os/bluestore: create the tail when first set FLAG_OMAP X-Git-Tag: v14.2.3~110^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=0c4bc84077ecd5b7f101ee8f464395e433a066be;p=ceph.git os/bluestore: create the tail when first set FLAG_OMAP When listing omap, the omap iterator uses the '~' tail key as its upper bound. When the iterator has moved to the last key of the omap we wanted, we still call one extra next(); usually this lands on another object's omap header (with '-'). If there are deleted keys or tombstones in between, rocksdb will loop in FindNextUserEntryInternal until it finds a valid key, so it traverses all the dead keys in the middle and reads the sst files heavily. Signed-off-by: Tao Ning (cherry picked from commit 0d01fb0fc65a9b1fe585b33ae1ca22e4805fd7f9) --- diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index acd0b610224..4dbecb2bb35 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -3943,7 +3943,7 @@ bool BlueStore::OmapIteratorImpl::valid() { RWLock::RLocker l(c->lock); bool r = o->onode.has_omap() && it && it->valid() && - it->raw_key().second <= tail; + it->raw_key().second < tail; if (it && it->valid()) { ldout(c->store->cct,20) << __func__ << " is at " << pretty_binary_string(it->raw_key().second) @@ -12932,6 +12932,7 @@ void BlueStore::_do_omap_clear(TransContext *txc, const string& omap_prefix, get_omap_header(id, &prefix); get_omap_tail(id, &tail); txc->t->rm_range_keys(omap_prefix, prefix, tail); + txc->t->rmkey(omap_prefix, tail); dout(20) << __func__ << " remove range start: " << pretty_binary_string(prefix) << " end: " << pretty_binary_string(tail) << dendl; @@ -12970,6 +12971,13 @@ int BlueStore::_omap_setkeys(TransContext *txc, o->onode.flags |= bluestore_onode_t::FLAG_PGMETA_OMAP; } txc->write_onode(o); + + const string& prefix = + o->onode.is_pgmeta_omap() ?
PREFIX_PGMETA_OMAP : PREFIX_OMAP; + string key_tail; + bufferlist tail; + get_omap_tail(o->onode.nid, &key_tail); + txc->t->set(prefix, key_tail, tail); } else { txc->note_modified_object(o); } @@ -13009,6 +13017,13 @@ int BlueStore::_omap_setheader(TransContext *txc, o->onode.flags |= bluestore_onode_t::FLAG_PGMETA_OMAP; } txc->write_onode(o); + + const string& prefix = + o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP; + string key_tail; + bufferlist tail; + get_omap_tail(o->onode.nid, &key_tail); + txc->t->set(prefix, key_tail, tail); } else { txc->note_modified_object(o); } @@ -13155,14 +13170,13 @@ int BlueStore::_clone(TransContext *txc, newo->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP, newo->onode.nid); + newo->onode.clear_omap_flag(); } if (oldo->onode.has_omap()) { dout(20) << __func__ << " copying omap data" << dendl; - if (!newo->onode.has_omap()) { - newo->onode.set_omap_flag(); - if (newo->oid.is_pgmeta()) { - newo->onode.flags |= bluestore_onode_t::FLAG_PGMETA_OMAP; - } + newo->onode.set_omap_flag(); + if (newo->oid.is_pgmeta()) { + newo->onode.flags |= bluestore_onode_t::FLAG_PGMETA_OMAP; } const string& prefix = newo->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP; @@ -13184,8 +13198,10 @@ int BlueStore::_clone(TransContext *txc, } it->next(); } - } else { - newo->onode.clear_omap_flag(); + string new_tail; + bufferlist new_tail_value; + get_omap_tail(newo->onode.nid, &new_tail); + txc->t->set(prefix, new_tail, new_tail_value); } txc->write_onode(newo);