From: ningtao Date: Tue, 28 May 2019 15:42:47 +0000 (+0800) Subject: os/bluestore: create the tail when first set FLAG_OMAP X-Git-Tag: v15.1.0~2374^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=0d01fb0fc65a9b1fe585b33ae1ca22e4805fd7f9;p=ceph.git os/bluestore: create the tail when first set FLAG_OMAP When listing omap, the omap iterator uses the tail of '~'. When the iterator has moved to the last key of the omaps we wanted, we will try to call an extra next(); usually this will land on another object's omap header (with '-'). If there are some deleted keys or tombstones, rocksdb will stay in the loop of FindNextUserEntryInternal until it finds a valid key, so it will traverse all the dead keys in between and read the sst files heavily. Signed-off-by: Tao Ning --- diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 5db054135e16..92617df89752 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -3877,7 +3877,7 @@ bool BlueStore::OmapIteratorImpl::valid() { RWLock::RLocker l(c->lock); bool r = o->onode.has_omap() && it && it->valid() && - it->raw_key().second <= tail; + it->raw_key().second < tail; if (it && it->valid()) { ldout(c->store->cct,20) << __func__ << " is at " << pretty_binary_string(it->raw_key().second) @@ -12907,6 +12907,7 @@ void BlueStore::_do_omap_clear(TransContext *txc, const string& omap_prefix, get_omap_header(id, &prefix); get_omap_tail(id, &tail); txc->t->rm_range_keys(omap_prefix, prefix, tail); + txc->t->rmkey(omap_prefix, tail); dout(20) << __func__ << " remove range start: " << pretty_binary_string(prefix) << " end: " << pretty_binary_string(tail) << dendl; @@ -12945,6 +12946,13 @@ int BlueStore::_omap_setkeys(TransContext *txc, o->onode.flags |= bluestore_onode_t::FLAG_PGMETA_OMAP; } txc->write_onode(o); + + const string& prefix = + o->onode.is_pgmeta_omap() ? 
PREFIX_PGMETA_OMAP : PREFIX_OMAP; + string key_tail; + bufferlist tail; + get_omap_tail(o->onode.nid, &key_tail); + txc->t->set(prefix, key_tail, tail); } else { txc->note_modified_object(o); } @@ -12984,6 +12992,13 @@ int BlueStore::_omap_setheader(TransContext *txc, o->onode.flags |= bluestore_onode_t::FLAG_PGMETA_OMAP; } txc->write_onode(o); + + const string& prefix = + o->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP; + string key_tail; + bufferlist tail; + get_omap_tail(o->onode.nid, &key_tail); + txc->t->set(prefix, key_tail, tail); } else { txc->note_modified_object(o); } @@ -13130,14 +13145,13 @@ int BlueStore::_clone(TransContext *txc, newo->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP, newo->onode.nid); + newo->onode.clear_omap_flag(); } if (oldo->onode.has_omap()) { dout(20) << __func__ << " copying omap data" << dendl; - if (!newo->onode.has_omap()) { - newo->onode.set_omap_flag(); - if (newo->oid.is_pgmeta()) { - newo->onode.flags |= bluestore_onode_t::FLAG_PGMETA_OMAP; - } + newo->onode.set_omap_flag(); + if (newo->oid.is_pgmeta()) { + newo->onode.flags |= bluestore_onode_t::FLAG_PGMETA_OMAP; } const string& prefix = newo->onode.is_pgmeta_omap() ? PREFIX_PGMETA_OMAP : PREFIX_OMAP; @@ -13159,8 +13173,10 @@ int BlueStore::_clone(TransContext *txc, } it->next(); } - } else { - newo->onode.clear_omap_flag(); + string new_tail; + bufferlist new_tail_value; + get_omap_tail(newo->onode.nid, &new_tail); + txc->t->set(prefix, new_tail, new_tail_value); } txc->write_onode(newo);