From a86d5606354be41715760477a9219a14a7fb92cd Mon Sep 17 00:00:00 2001 From: Yang Honggang Date: Tue, 30 Oct 2018 06:54:39 +0000 Subject: [PATCH] os/bluestore/BlueStore.cc: merge overlapping/adjacent regions before read Fixes: http://tracker.ceph.com/issues/36625 Signed-off-by: Yang Honggang --- src/os/bluestore/BlueStore.cc | 128 +++++++++++++++++++---------- src/test/objectstore/store_test.cc | 83 +++++++++++++++++++ 2 files changed, 168 insertions(+), 43 deletions(-) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 85f9e673873..3a11dec74c4 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -7722,20 +7722,20 @@ struct region_t { uint64_t logical_offset; uint64_t blob_xoffset; //region offset within the blob uint64_t length; - bufferlist bl; // used later in read process uint64_t front = 0; - uint64_t r_off = 0; - region_t(uint64_t offset, uint64_t b_offs, uint64_t len) + region_t(uint64_t offset, uint64_t b_offs, uint64_t len, uint64_t front = 0) : logical_offset(offset), blob_xoffset(b_offs), - length(len){} + length(len), + front(front){} region_t(const region_t& from) : logical_offset(from.logical_offset), blob_xoffset(from.blob_xoffset), - length(from.length){} + length(from.length), + front(from.front){} friend ostream& operator<<(ostream& out, const region_t& r) { return out << "0x" << std::hex << r.logical_offset << ":" @@ -7743,7 +7743,24 @@ struct region_t { } }; -typedef list regions2read_t; +// merged blob read request +struct read_req_t { + uint64_t r_off = 0; + uint64_t r_len = 0; + bufferlist bl; + std::list regs; // original read regions + + read_req_t(uint64_t off, uint64_t len) : r_off(off), r_len(len) {} + + friend ostream& operator<<(ostream& out, const read_req_t& r) { + out << "{<0x" << std::hex << r.r_off << ", 0x" << r.r_len << "> : ["; + for (const auto& reg : r.regs) + out << reg; + return out << "]}" << std::dec; + } +}; + +typedef list regions2read_t; typedef map blobs2read_t; int BlueStore::_do_read( @@ -7832,6 +7849,7 @@ int BlueStore::_do_read( << std::dec << dendl; auto pc = cache_res.begin(); + uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size); while (b_len > 0) { unsigned l; if (pc != cache_res.end() && @@ -7849,7 +7867,36 @@ int BlueStore::_do_read( } dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x" << b_off << "~" << l << std::dec << dendl; - blobs2read[bptr].emplace_back(region_t(pos, b_off, l)); + // merge regions + { + uint64_t r_off = b_off; + uint64_t r_len = l; + uint64_t front = r_off % chunk_size; + if (front) { + r_off -= front; + r_len += front; + } + unsigned tail = r_len % chunk_size; + if (tail) { + r_len += chunk_size - tail; + } + bool merged = false; + regions2read_t& r2r = blobs2read[bptr]; + if (r2r.size()) { + read_req_t& pre = r2r.back(); + if (r_off <= (pre.r_off + pre.r_len)) { + front += (r_off - pre.r_off); + pre.r_len += (r_off + r_len - pre.r_off - pre.r_len); + pre.regs.emplace_back(region_t(pos, b_off, l, front)); + merged = true; + } + } + if (!merged) { + read_req_t req(r_off, r_len); + req.regs.emplace_back(region_t(pos, b_off, l, front)); + r2r.emplace_back(std::move(req)); + } + } ++num_regions; } pos += l; @@ -7868,8 +7915,9 @@ int BlueStore::_do_read( IOContext ioc(cct, NULL, true); // allow EIO for (auto& p : blobs2read) { const BlobRef& bptr = p.first; + regions2read_t& r2r = p.second; dout(20) << __func__ << " blob " << *bptr << std::hex - << " need " << p.second << std::dec << dendl; + << " need " << r2r << std::dec << dendl; if (bptr->get_blob().is_compressed()) { // read the whole thing if (compressed_blob_bls.empty()) { @@ -7883,7 +7931,7 @@ int BlueStore::_do_read( [&](uint64_t offset, uint64_t length) { int r; // use aio if there are more regions to read than those in this blob - if (num_regions > p.second.size()) { + if (num_regions > r2r.size()) { r = bdev->aio_read(offset, length, &bl, &ioc); } else { r = bdev->read(offset, length, &bl, &ioc, false); @@ -7902,36 +7950,24 @@ int BlueStore::_do_read( } } else { // read the pieces - for (auto& reg : p.second) { - // determine how much of the blob to read - uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size); - reg.r_off = reg.blob_xoffset; - uint64_t r_len = reg.length; - reg.front = reg.r_off % chunk_size; - if (reg.front) { - reg.r_off -= reg.front; - r_len += reg.front; - } - unsigned tail = r_len % chunk_size; - if (tail) { - r_len += chunk_size - tail; - } + for (auto& req : r2r) { dout(20) << __func__ << " region 0x" << std::hex - << reg.logical_offset - << ": 0x" << reg.blob_xoffset << "~" << reg.length - << " reading 0x" << reg.r_off << "~" << r_len << std::dec + << req.regs.front().logical_offset + << ": 0x" << req.regs.front().blob_xoffset + << " reading 0x" << req.r_off + << "~" << req.r_len << std::dec << dendl; // read it r = bptr->get_blob().map( - reg.r_off, r_len, + req.r_off, req.r_len, [&](uint64_t offset, uint64_t length) { int r; // use aio if there is more than one region to read if (num_regions > 1) { - r = bdev->aio_read(offset, length, ®.bl, &ioc); + r = bdev->aio_read(offset, length, &req.bl, &ioc); } else { - r = bdev->read(offset, length, ®.bl, &ioc, false); + r = bdev->read(offset, length, &req.bl, &ioc, false); } if (r < 0) return r; @@ -7946,7 +7982,7 @@ int BlueStore::_do_read( } ceph_assert(r == 0); } - ceph_assert(reg.bl.length() == r_len); + ceph_assert(req.bl.length() == req.r_len); } } } @@ -7967,16 +8003,18 @@ int BlueStore::_do_read( blobs2read_t::iterator b2r_it = blobs2read.begin(); while (b2r_it != blobs2read.end()) { const BlobRef& bptr = b2r_it->first; + regions2read_t& r2r = b2r_it->second; dout(20) << __func__ << " blob " << *bptr << std::hex - << " need 0x" << b2r_it->second << std::dec << dendl; + << " need 0x" << r2r << std::dec << dendl; if (bptr->get_blob().is_compressed()) { ceph_assert(p != compressed_blob_bls.end()); bufferlist& compressed_bl = *p++; if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl, - b2r_it->second.front().logical_offset) < 0) { + r2r.front().regs.front().logical_offset) < 0) { // Handles spurious read errors caused by a kernel bug. // We sometimes get all-zero pages as a result of the read under - // high memory pressure. Retrying the failing read succeeds in most cases. + // high memory pressure. Retrying the failing read succeeds in most + // cases. // See also: http://tracker.ceph.com/issues/22464 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) { return -EIO; @@ -7991,17 +8029,20 @@ int BlueStore::_do_read( bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0, raw_bl); } - for (auto& i : b2r_it->second) { - ready_regions[i.logical_offset].substr_of( - raw_bl, i.blob_xoffset, i.length); + for (auto& req : r2r) { + for (auto& r : req.regs) { + ready_regions[r.logical_offset].substr_of( + raw_bl, r.blob_xoffset, r.length); + } } } else { - for (auto& reg : b2r_it->second) { - if (_verify_csum(o, &bptr->get_blob(), reg.r_off, reg.bl, - reg.logical_offset) < 0) { + for (auto& req : r2r) { + if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl, + req.regs.front().logical_offset) < 0) { // Handles spurious read errors caused by a kernel bug. // We sometimes get all-zero pages as a result of the read under - // high memory pressure. Retrying the failing read succeeds in most cases. + // high memory pressure. Retrying the failing read succeeds in most + // cases. // See also: http://tracker.ceph.com/issues/22464 if (retry_count >= cct->_conf->bluestore_retry_disk_reads) { return -EIO; @@ -8010,12 +8051,13 @@ int BlueStore::_do_read( } if (buffered) { bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), - reg.r_off, reg.bl); + req.r_off, req.bl); } // prune and keep result - ready_regions[reg.logical_offset].substr_of( - reg.bl, reg.front, reg.length); + for (const auto& r : req.regs) { + ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length); + } } } ++b2r_it; diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc index 8366469b5cb..7d1756f30ae 100644 --- a/src/test/objectstore/store_test.cc +++ b/src/test/objectstore/store_test.cc @@ -7339,6 +7339,89 @@ TEST_P(StoreTest, allocateBlueFSTest) { ASSERT_EQ(r, 0); } +TEST_P(StoreTest, mergeRegionTest) { + if (string(GetParam()) != "bluestore") + return; + + SetVal(g_conf(), "bluestore_fsck_on_mount", "true"); + SetVal(g_conf(), "bluestore_fsck_on_umount", "true"); + SetVal(g_conf(), "bdev_debug_inflight_ios", "true"); + g_ceph_context->_conf.apply_changes(nullptr); + + uint32_t chunk_size = g_ceph_context->_conf->bdev_block_size; + int r = -1; + coll_t cid; + ghobject_t hoid(hobject_t(sobject_t("Object", CEPH_NOSNAP))); + auto ch = store->create_new_collection(cid); + { + ObjectStore::Transaction t; + t.create_collection(cid, 0); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { + ObjectStore::Transaction t; + t.touch(cid, hoid); + cerr << "Creating object " << hoid << std::endl; + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + bufferlist bl5; + bl5.append("abcde"); + uint64_t offset = 0; + { // 1. same region + ObjectStore::Transaction t; + t.write(cid, hoid, offset, 5, bl5); + t.write(cid, hoid, 0xa + offset, 5, bl5); + t.write(cid, hoid, 0x14 + offset, 5, bl5); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { // 2. adjacent regions + ObjectStore::Transaction t; + offset = chunk_size; + t.write(cid, hoid, offset, 5, bl5); + t.write(cid, hoid, offset + chunk_size + 3, 5, bl5); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { // 3. front merge + ObjectStore::Transaction t; + offset = chunk_size * 2; + t.write(cid, hoid, offset, 5, bl5); + t.write(cid, hoid, offset + chunk_size - 2, 5, bl5); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { // 4. back merge + ObjectStore::Transaction t; + bufferlist blc2; + blc2.append_zero(chunk_size + 2); + + offset = chunk_size * 3; + t.write(cid, hoid, offset, chunk_size + 2, blc2); + t.write(cid, hoid, offset + chunk_size + 3, 5, bl5); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + } + { // 5. overlapping + ObjectStore::Transaction t; + uint64_t final_len = 0; + offset = chunk_size * 10; + bufferlist bl2c2; + bl2c2.append_zero(chunk_size * 2); + t.write(cid, hoid, offset + chunk_size * 3 - 3, chunk_size * 2, bl2c2); + bl2c2.append_zero(2); + t.write(cid, hoid, offset + chunk_size - 2, chunk_size * 2 + 2, bl2c2); + r = queue_transaction(store, ch, std::move(t)); + ASSERT_EQ(r, 0); + + final_len = (offset + chunk_size * 3 - 3) + (chunk_size * 2); + bufferlist bl; + r = store->read(ch, hoid, 0, final_len, bl); + ASSERT_EQ(r, final_len); + } +} #endif // WITH_BLUESTORE int main(int argc, char **argv) { -- 2.39.5