From: xie xingguo
Date: Thu, 29 Aug 2019 12:26:44 +0000 (+0800)
Subject: os/BlueStore: split _do_read into small helpers
X-Git-Tag: v15.1.0~1646^2~3
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=a34f98f0f510968060429e9a4be8e5090cc83246;p=ceph.git

os/BlueStore: split _do_read into small helpers

The typical read flow consists of four steps:
- read meta, e.g., extent-map
- read data from cache
- read cache-missing data off the disk
- decompress, verify checksum, and generate the final output stream

Hence we split _do_read into corresponding small helpers, which are
easier to maintain and reuse.

Signed-off-by: xie xingguo
---
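[Editor's note: the sketch below condenses how the refactored _do_read
composes the new helpers, distilled from the new function body in the
diff that follows. Logging, latency accounting, and the FADVISE flag
handling are elided; this is an illustration, not code carried by the
commit.]

    // 1)+2) read metadata, scan the cache, and record what is missing
    ready_regions_t ready_regions;   // data already satisfied from cache
    blobs2read_t blobs2read;         // per-blob requests that must hit disk
    _read_cache(o, offset, length, read_cache_policy, ready_regions, blobs2read);

    // 3) turn the cache misses into aio reads and wait for completion
    vector<bufferlist> compressed_blob_bls;
    IOContext ioc(cct, NULL, true);  // allow EIO
    r = _prepare_read_ioc(blobs2read, &compressed_blob_bls, &ioc);
    bdev->aio_submit(&ioc);
    ioc.aio_wait();

    // 4) decompress, verify checksums, and assemble the output bufferlist;
    //    on a checksum error _do_read retries the whole read from the top
    bool csum_error = false;
    r = _generate_read_result_bl(o, offset, length, ready_regions,
                                 compressed_blob_bls, blobs2read,
                                 buffered, &csum_error, bl);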
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index a936d445be5..588d54c5553 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -8789,123 +8789,26 @@ int BlueStore::read(
   return r;
 }
 
-// --------------------------------------------------------
-// intermediate data structures used while reading
-struct region_t {
-  uint64_t logical_offset;
-  uint64_t blob_xoffset; //region offset within the blob
-  uint64_t length;
-
-  // used later in read process
-  uint64_t front = 0;
-
-  region_t(uint64_t offset, uint64_t b_offs, uint64_t len, uint64_t front = 0)
-    : logical_offset(offset),
-      blob_xoffset(b_offs),
-      length(len),
-      front(front){}
-  region_t(const region_t& from)
-    : logical_offset(from.logical_offset),
-      blob_xoffset(from.blob_xoffset),
-      length(from.length),
-      front(from.front){}
-
-  friend ostream& operator<<(ostream& out, const region_t& r) {
-    return out << "0x" << std::hex << r.logical_offset << ":"
-               << r.blob_xoffset << "~" << r.length << std::dec;
-  }
-};
-
-// merged blob read request
-struct read_req_t {
-  uint64_t r_off = 0;
-  uint64_t r_len = 0;
-  bufferlist bl;
-  std::list<region_t> regs; // original read regions
-
-  read_req_t(uint64_t off, uint64_t len) : r_off(off), r_len(len) {}
-
-  friend ostream& operator<<(ostream& out, const read_req_t& r) {
-    out << "{<0x" << std::hex << r.r_off << ", 0x" << r.r_len << "> : [";
-    for (const auto& reg : r.regs)
-      out << reg;
-    return out << "]}" << std::dec;
-  }
-};
-
-typedef list<read_req_t> regions2read_t;
-typedef map<BlueStore::BlobRef, regions2read_t> blobs2read_t;
-
-int BlueStore::_do_read(
-  Collection *c,
+void BlueStore::_read_cache(
   OnodeRef o,
   uint64_t offset,
   size_t length,
-  bufferlist& bl,
-  uint32_t op_flags,
-  uint64_t retry_count)
+  int read_cache_policy,
+  ready_regions_t& ready_regions,
+  blobs2read_t& blobs2read)
 {
-  FUNCTRACE(cct);
-  int r = 0;
-  int read_cache_policy = 0; // do not bypass clean or dirty cache
-
-  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
-           << " size 0x" << o->onode.size << " (" << std::dec
-           << o->onode.size << ")" << dendl;
-  bl.clear();
-
-  if (offset >= o->onode.size) {
-    return r;
-  }
-
-  // generally, don't buffer anything, unless the client explicitly requests
-  // it.
-  bool buffered = false;
-  if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
-    dout(20) << __func__ << " will do buffered read" << dendl;
-    buffered = true;
-  } else if (cct->_conf->bluestore_default_buffered_read &&
-             (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
-                          CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
-    dout(20) << __func__ << " defaulting to buffered read" << dendl;
-    buffered = true;
-  }
-
-  if (offset + length > o->onode.size) {
-    length = o->onode.size - offset;
-  }
-
-  auto start = mono_clock::now();
-  o->extent_map.fault_range(db, offset, length);
-  log_latency(__func__,
-    l_bluestore_read_onode_meta_lat,
-    mono_clock::now() - start,
-    cct->_conf->bluestore_log_op_age);
-  _dump_onode<30>(cct, *o);
-
-  ready_regions_t ready_regions;
-
-  // for deep-scrub, we only read dirty cache and bypass clean cache in
-  // order to read underlying block device in case there are silent disk errors.
-  if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
-    dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
-    read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
-  }
-
-  // build blob-wise list to of stuff read (that isn't cached)
-  blobs2read_t blobs2read;
   unsigned left = length;
   uint64_t pos = offset;
-  unsigned num_regions = 0;
   auto lp = o->extent_map.seek_lextent(offset);
   while (left > 0 && lp != o->extent_map.extent_map.end()) {
    if (pos < lp->logical_offset) {
      unsigned hole = lp->logical_offset - pos;
      if (hole >= left) {
-	break;
+        break;
      }
      dout(30) << __func__ << " hole 0x" << std::hex << pos << "~" << hole
-	       << std::dec << dendl;
+               << std::dec << dendl;
      pos += hole;
      left -= hole;
    }
@@ -8920,60 +8823,59 @@ int BlueStore::_do_read(
      bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
      read_cache_policy);
    dout(20) << __func__ << " blob " << *bptr << std::hex
-	     << " need 0x" << b_off << "~" << b_len
-	     << " cache has 0x" << cache_interval
-	     << std::dec << dendl;
+             << " need 0x" << b_off << "~" << b_len
+             << " cache has 0x" << cache_interval
+             << std::dec << dendl;
 
    auto pc = cache_res.begin();
    uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
    while (b_len > 0) {
      unsigned l;
      if (pc != cache_res.end() &&
-	  pc->first == b_off) {
-	l = pc->second.length();
-	ready_regions[pos].claim(pc->second);
-	dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
-		 << b_off << "~" << l << std::dec << dendl;
-	++pc;
+          pc->first == b_off) {
+        l = pc->second.length();
+        ready_regions[pos].claim(pc->second);
+        dout(30) << __func__ << " use cache 0x" << std::hex << pos << ": 0x"
+                 << b_off << "~" << l << std::dec << dendl;
+        ++pc;
      } else {
-	l = b_len;
-	if (pc != cache_res.end()) {
-	  ceph_assert(pc->first > b_off);
-	  l = pc->first - b_off;
-	}
-	dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
-		 << b_off << "~" << l << std::dec << dendl;
-	// merge regions
-	{
-	  uint64_t r_off = b_off;
-	  uint64_t r_len = l;
-	  uint64_t front = r_off % chunk_size;
-	  if (front) {
-	    r_off -= front;
-	    r_len += front;
-	  }
-	  unsigned tail = r_len % chunk_size;
-	  if (tail) {
-	    r_len += chunk_size - tail;
-	  }
-	  bool merged = false;
-	  regions2read_t& r2r = blobs2read[bptr];
-	  if (r2r.size()) {
-	    read_req_t& pre = r2r.back();
-	    if (r_off <= (pre.r_off + pre.r_len)) {
-	      front += (r_off - pre.r_off);
-	      pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
-	      pre.regs.emplace_back(region_t(pos, b_off, l, front));
-	      merged = true;
-	    }
-	  }
-	  if (!merged) {
-	    read_req_t req(r_off, r_len);
-	    req.regs.emplace_back(region_t(pos, b_off, l, front));
-	    r2r.emplace_back(std::move(req));
-	  }
-	}
-	++num_regions;
+        l = b_len;
+        if (pc != cache_res.end()) {
+          ceph_assert(pc->first > b_off);
+          l = pc->first - b_off;
+        }
+        dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
+                 << b_off << "~" << l << std::dec << dendl;
+        // merge regions
+        {
+          uint64_t r_off = b_off;
+          uint64_t r_len = l;
+          uint64_t front = r_off % chunk_size;
+          if (front) {
+            r_off -= front;
+            r_len += front;
+          }
+          unsigned tail = r_len % chunk_size;
+          if (tail) {
+            r_len += chunk_size - tail;
+          }
+          bool merged = false;
+          regions2read_t& r2r = blobs2read[bptr];
+          if (r2r.size()) {
+            read_req_t& pre = r2r.back();
+            if (r_off <= (pre.r_off + pre.r_len)) {
+              front += (r_off - pre.r_off);
+              pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
+              pre.regs.emplace_back(region_t(pos, b_off, l, front));
+              merged = true;
+            }
+          }
+          if (!merged) {
+            read_req_t req(r_off, r_len);
+            req.regs.emplace_back(region_t(pos, b_off, l, front));
+            r2r.emplace_back(std::move(req));
+          }
+        }
      }
      pos += l;
      b_off += l;
@@ -8982,40 +8884,34 @@
    }
    ++lp;
  }
+}
 
-  // read raw blob data. use aio if we have >1 blobs to read.
-  start = mono_clock::now(); // for the sake of simplicity
-                             // measure the whole block below.
-                             // The error isn't that much...
-  vector<bufferlist> compressed_blob_bls;
-  IOContext ioc(cct, NULL, true); // allow EIO
+int BlueStore::_prepare_read_ioc(
+  blobs2read_t& blobs2read,
+  vector<bufferlist>* compressed_blob_bls,
+  IOContext* ioc)
+{
  for (auto& p : blobs2read) {
    const BlobRef& bptr = p.first;
    regions2read_t& r2r = p.second;
    dout(20) << __func__ << " blob " << *bptr << std::hex
-	     << " need " << r2r << std::dec << dendl;
+             << " need " << r2r << std::dec << dendl;
    if (bptr->get_blob().is_compressed()) {
      // read the whole thing
-      if (compressed_blob_bls.empty()) {
-	// ensure we avoid any reallocation on subsequent blobs
-	compressed_blob_bls.reserve(blobs2read.size());
-      }
-      compressed_blob_bls.push_back(bufferlist());
-      bufferlist& bl = compressed_blob_bls.back();
-      r = bptr->get_blob().map(
-	0, bptr->get_blob().get_ondisk_length(),
-	[&](uint64_t offset, uint64_t length) {
-	  int r;
-	  // use aio if there are more regions to read than those in this blob
-	  if (num_regions > r2r.size()) {
-	    r = bdev->aio_read(offset, length, &bl, &ioc);
-	  } else {
-	    r = bdev->read(offset, length, &bl, &ioc, false);
-	  }
-	  if (r < 0)
+      if (compressed_blob_bls->empty()) {
+        // ensure we avoid any reallocation on subsequent blobs
+        compressed_blob_bls->reserve(blobs2read.size());
+      }
+      compressed_blob_bls->push_back(bufferlist());
+      bufferlist& bl = compressed_blob_bls->back();
+      auto r = bptr->get_blob().map(
+        0, bptr->get_blob().get_ondisk_length(),
+        [&](uint64_t offset, uint64_t length) {
+          int r = bdev->aio_read(offset, length, &bl, ioc);
+          if (r < 0)
            return r;
          return 0;
-	});
+        });
      if (r < 0) {
        derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
        if (r == -EIO) {
@@ -9027,28 +8923,22 @@
    } else {
      // read the pieces
      for (auto& req : r2r) {
-	dout(20) << __func__ << " region 0x" << std::hex
-		 << req.regs.front().logical_offset
-		 << ": 0x" << req.regs.front().blob_xoffset
-		 << " reading 0x" << req.r_off
-		 << "~" << req.r_len << std::dec
-		 << dendl;
+        dout(20) << __func__ << " region 0x" << std::hex
+                 << req.regs.front().logical_offset
+                 << ": 0x" << req.regs.front().blob_xoffset
+                 << " reading 0x" << req.r_off
+                 << "~" << req.r_len << std::dec
+                 << dendl;
 
-	// read it
-	r = bptr->get_blob().map(
-	  req.r_off, req.r_len,
-	  [&](uint64_t offset, uint64_t length) {
-	    int r;
-	    // use aio if there is more than one region to read
-	    if (num_regions > 1) {
-	      r = bdev->aio_read(offset, length, &req.bl, &ioc);
-	    } else {
-	      r = bdev->read(offset, length, &req.bl, &ioc, false);
-	    }
-	    if (r < 0)
+        // read it
+        auto r = bptr->get_blob().map(
+          req.r_off, req.r_len,
+          [&](uint64_t offset, uint64_t length) {
+            int r = bdev->aio_read(offset, length, &req.bl, ioc);
+            if (r < 0)
              return r;
            return 0;
-	  });
+          });
        if (r < 0) {
          derr << __func__ << " bdev-read failed: " << cpp_strerror(r) << dendl;
@@ -9058,60 +8948,47 @@
        }
        ceph_assert(r == 0);
      }
-      ceph_assert(req.bl.length() == req.r_len);
+        ceph_assert(req.bl.length() == req.r_len);
      }
    }
  }
+  return 0;
+}
 
-  int64_t num_ios = length;
-  if (ioc.has_pending_aios()) {
-    num_ios = -ioc.get_num_ios();
-    bdev->aio_submit(&ioc);
-    dout(20) << __func__ << " waiting for aio" << dendl;
-    ioc.aio_wait();
-    r = ioc.get_return_value();
-    if (r < 0) {
-      ceph_assert(r == -EIO); // no other errors allowed
-      return -EIO;
-    }
-  }
-  log_latency_fn(__func__,
-    l_bluestore_read_wait_aio_lat,
-    mono_clock::now() - start,
-    cct->_conf->bluestore_log_op_age,
-    [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
-  );
-
-  // enumerate and decompress desired blobs
+int BlueStore::_generate_read_result_bl(
+  OnodeRef o,
+  uint64_t offset,
+  size_t length,
+  ready_regions_t& ready_regions,
+  vector<bufferlist>& compressed_blob_bls,
+  blobs2read_t& blobs2read,
+  bool buffered,
+  bool* csum_error,
+  bufferlist& bl)
+{
+  // enumerate and decompress desired blobs
  auto p = compressed_blob_bls.begin();
  blobs2read_t::iterator b2r_it = blobs2read.begin();
  while (b2r_it != blobs2read.end()) {
    const BlobRef& bptr = b2r_it->first;
    regions2read_t& r2r = b2r_it->second;
    dout(20) << __func__ << " blob " << *bptr << std::hex
-	     << " need 0x" << r2r << std::dec << dendl;
+             << " need 0x" << r2r << std::dec << dendl;
    if (bptr->get_blob().is_compressed()) {
      ceph_assert(p != compressed_blob_bls.end());
      bufferlist& compressed_bl = *p++;
      if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
-		       r2r.front().regs.front().logical_offset) < 0) {
-	// Handles spurious read errors caused by a kernel bug.
-	// We sometimes get all-zero pages as a result of the read under
-	// high memory pressure. Retrying the failing read succeeds in most
-	// cases.
-	// See also: http://tracker.ceph.com/issues/22464
-	if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
-	  return -EIO;
-	}
-	return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
+                       r2r.front().regs.front().logical_offset) < 0) {
+        *csum_error = true;
+        return -EIO;
      }
      bufferlist raw_bl;
-      r = _decompress(compressed_bl, &raw_bl);
+      auto r = _decompress(compressed_bl, &raw_bl);
      if (r < 0)
-	return r;
+        return r;
      if (buffered) {
-	bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
-				       raw_bl);
+        bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
+                                       raw_bl);
      }
      for (auto& req : r2r) {
        for (auto& r : req.regs) {
@@ -9121,26 +8998,19 @@
      }
    } else {
      for (auto& req : r2r) {
-	if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
-			 req.regs.front().logical_offset) < 0) {
-	  // Handles spurious read errors caused by a kernel bug.
-	  // We sometimes get all-zero pages as a result of the read under
-	  // high memory pressure. Retrying the failing read succeeds in most
-	  // cases.
-	  // See also: http://tracker.ceph.com/issues/22464
-	  if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
-	    return -EIO;
-	  }
-	  return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
-	}
-	if (buffered) {
-	  bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
-					 req.r_off, req.bl);
-	}
+        if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
+                         req.regs.front().logical_offset) < 0) {
+          *csum_error = true;
+          return -EIO;
+        }
+        if (buffered) {
+          bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
+                                         req.r_off, req.bl);
+        }
 
-	// prune and keep result
-	for (const auto& r : req.regs) {
-	  ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
+        // prune and keep result
+        for (const auto& r : req.regs) {
+          ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
	}
      }
    }
@@ -9150,12 +9020,12 @@
  // generate a resulting buffer
  auto pr = ready_regions.begin();
  auto pr_end = ready_regions.end();
-  pos = 0;
+  uint64_t pos = 0;
  while (pos < length) {
    if (pr != pr_end && pr->first == pos + offset) {
      dout(30) << __func__ << " assemble 0x" << std::hex << pos
-	       << ": data from 0x" << pr->first << "~" << pr->second.length()
-	       << std::dec << dendl;
+               << ": data from 0x" << pr->first << "~" << pr->second.length()
+               << std::dec << dendl;
      pos += pr->second.length();
      bl.claim_append(pr->second);
      ++pr;
@@ -9163,11 +9033,11 @@
      uint64_t l = length - pos;
      if (pr != pr_end) {
        ceph_assert(pr->first > pos + offset);
-	l = pr->first - (pos + offset);
+        l = pr->first - (pos + offset);
      }
      dout(30) << __func__ << " assemble 0x" << std::hex << pos
-	       << ": zeros for 0x" << (pos + offset) << "~" << l
-	       << std::dec << dendl;
+               << ": zeros for 0x" << (pos + offset) << "~" << l
+               << std::dec << dendl;
      bl.append_zero(l);
      pos += l;
    }
@@ -9175,6 +9045,114 @@
  ceph_assert(bl.length() == length);
  ceph_assert(pos == length);
  ceph_assert(pr == pr_end);
+  return 0;
+}
+
+int BlueStore::_do_read(
+  Collection *c,
+  OnodeRef o,
+  uint64_t offset,
+  size_t length,
+  bufferlist& bl,
+  uint32_t op_flags,
+  uint64_t retry_count)
+{
+  FUNCTRACE(cct);
+  int r = 0;
+  int read_cache_policy = 0; // do not bypass clean or dirty cache
+
+  dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
+           << " size 0x" << o->onode.size << " (" << std::dec
+           << o->onode.size << ")" << dendl;
+  bl.clear();
+
+  if (offset >= o->onode.size) {
+    return r;
+  }
+
+  // generally, don't buffer anything, unless the client explicitly requests
+  // it.
+  bool buffered = false;
+  if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) {
+    dout(20) << __func__ << " will do buffered read" << dendl;
+    buffered = true;
+  } else if (cct->_conf->bluestore_default_buffered_read &&
+             (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
+                          CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) == 0) {
+    dout(20) << __func__ << " defaulting to buffered read" << dendl;
+    buffered = true;
+  }
+
+  if (offset + length > o->onode.size) {
+    length = o->onode.size - offset;
+  }
+
+  auto start = mono_clock::now();
+  o->extent_map.fault_range(db, offset, length);
+  log_latency(__func__,
+    l_bluestore_read_onode_meta_lat,
+    mono_clock::now() - start,
+    cct->_conf->bluestore_log_op_age);
+  _dump_onode<30>(cct, *o);
+
+  // for deep-scrub, we only read dirty cache and bypass clean cache in
+  // order to read underlying block device in case there are silent disk errors.
+  if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
+    dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
+    read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
+  }
+
+  // build blob-wise list of stuff to read (that isn't cached)
+  ready_regions_t ready_regions;
+  blobs2read_t blobs2read;
+  _read_cache(o, offset, length, read_cache_policy, ready_regions, blobs2read);
+
+
+  // read raw blob data.
+  start = mono_clock::now(); // for the sake of simplicity
+                             // measure the whole block below.
+                             // The error isn't that much...
+  vector<bufferlist> compressed_blob_bls;
+  IOContext ioc(cct, NULL, true); // allow EIO
+  r = _prepare_read_ioc(blobs2read, &compressed_blob_bls, &ioc);
+  // we always issue aio for reading, so errors other than EIO are not allowed
+  if (r < 0)
+    return r;
+
+  int64_t num_ios = length;
+  if (ioc.has_pending_aios()) {
+    num_ios = -ioc.get_num_ios();
+    bdev->aio_submit(&ioc);
+    dout(20) << __func__ << " waiting for aio" << dendl;
+    ioc.aio_wait();
+    r = ioc.get_return_value();
+    if (r < 0) {
+      ceph_assert(r == -EIO); // no other errors allowed
+      return -EIO;
+    }
+  }
+  log_latency_fn(__func__,
+    l_bluestore_read_wait_aio_lat,
+    mono_clock::now() - start,
+    cct->_conf->bluestore_log_op_age,
+    [&](auto lat) { return ", num_ios = " + stringify(num_ios); }
+  );
+
+  bool csum_error = false;
+  r = _generate_read_result_bl(o, offset, length, ready_regions,
+                               compressed_blob_bls, blobs2read,
+                               buffered, &csum_error, bl);
+  if (csum_error) {
+    // Handles spurious read errors caused by a kernel bug.
+    // We sometimes get all-zero pages as a result of the read under
+    // high memory pressure. Retrying the failing read succeeds in most
+    // cases.
+    // See also: http://tracker.ceph.com/issues/22464
+    if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
+      return -EIO;
+    }
+    return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
+  }
  r = bl.length();
  if (retry_count) {
    logger->inc(l_bluestore_reads_with_retries);
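[Editor's note: the least obvious part of _read_cache above is the
chunk-alignment and merge arithmetic. The standalone sketch below mirrors
that logic outside of BlueStore so it can be followed (and tested) in
isolation; disk_read, add_miss, and the worked values are illustrative
names and numbers, not part of the commit.]

    #include <cstdint>
    #include <vector>

    struct disk_read {
      uint64_t r_off;  // chunk-aligned offset of the merged read
      uint64_t r_len;  // chunk-aligned length of the merged read
    };

    // Round a cache miss [b_off, b_off+len) out to chunk boundaries and
    // merge it into the previous request when the rounded ranges touch,
    // the same way _read_cache builds blobs2read.
    void add_miss(std::vector<disk_read>& reqs,
                  uint64_t b_off, uint64_t len, uint64_t chunk_size)
    {
      uint64_t front = b_off % chunk_size;   // padding before the real data
      uint64_t r_off = b_off - front;        // round start down to a chunk
      uint64_t r_len = len + front;
      uint64_t tail = r_len % chunk_size;
      if (tail) {
        r_len += chunk_size - tail;          // round end up to a chunk
      }
      if (!reqs.empty() && r_off <= reqs.back().r_off + reqs.back().r_len) {
        disk_read& pre = reqs.back();
        pre.r_len = r_off + r_len - pre.r_off;  // extend the previous request
      } else {
        reqs.push_back({r_off, r_len});
      }
    }

    // With chunk_size = 0x1000, misses 0x0~0x10 and 0xff8~0x10 round to
    // 0x0~0x1000 and 0x0~0x2000 respectively; the rounded ranges overlap,
    // so a single 0x0~0x2000 disk read serves both logical regions.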
diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h
index c3fe59c5859..eec5f4f9476 100644
--- a/src/os/bluestore/BlueStore.h
+++ b/src/os/bluestore/BlueStore.h
@@ -2410,6 +2410,81 @@ public:
    size_t len,
    bufferlist& bl,
    uint32_t op_flags = 0) override;
+
+private:
+
+  // --------------------------------------------------------
+  // intermediate data structures used while reading
+  struct region_t {
+    uint64_t logical_offset;
+    uint64_t blob_xoffset; //region offset within the blob
+    uint64_t length;
+
+    // used later in read process
+    uint64_t front = 0;
+
+    region_t(uint64_t offset, uint64_t b_offs, uint64_t len, uint64_t front = 0)
+      : logical_offset(offset),
+        blob_xoffset(b_offs),
+        length(len),
+        front(front){}
+    region_t(const region_t& from)
+      : logical_offset(from.logical_offset),
+        blob_xoffset(from.blob_xoffset),
+        length(from.length),
+        front(from.front){}
+
+    friend ostream& operator<<(ostream& out, const region_t& r) {
+      return out << "0x" << std::hex << r.logical_offset << ":"
+                 << r.blob_xoffset << "~" << r.length << std::dec;
+    }
+  };
+
+  // merged blob read request
+  struct read_req_t {
+    uint64_t r_off = 0;
+    uint64_t r_len = 0;
+    bufferlist bl;
+    std::list<region_t> regs; // original read regions
+
+    read_req_t(uint64_t off, uint64_t len) : r_off(off), r_len(len) {}
+
+    friend ostream& operator<<(ostream& out, const read_req_t& r) {
+      out << "{<0x" << std::hex << r.r_off << ", 0x" << r.r_len << "> : [";
+      for (const auto& reg : r.regs)
+        out << reg;
+      return out << "]}" << std::dec;
+    }
+  };
+
+  typedef list<read_req_t> regions2read_t;
+  typedef map<BlobRef, regions2read_t> blobs2read_t;
+
+  void _read_cache(
+    OnodeRef o,
+    uint64_t offset,
+    size_t length,
+    int read_cache_policy,
+    ready_regions_t& ready_regions,
+    blobs2read_t& blobs2read);
+
+
+  int _prepare_read_ioc(
+    blobs2read_t& blobs2read,
+    vector<bufferlist>* compressed_blob_bls,
+    IOContext* ioc);
+
+  int _generate_read_result_bl(
+    OnodeRef o,
+    uint64_t offset,
+    size_t length,
+    ready_regions_t& ready_regions,
+    vector<bufferlist>& compressed_blob_bls,
+    blobs2read_t& blobs2read,
+    bool buffered,
+    bool* csum_error,
+    bufferlist& bl);
+
  int _do_read(
    Collection *c,
    OnodeRef o,
@@ -2419,7 +2494,6 @@ public:
    uint32_t op_flags = 0,
    uint64_t retry_count = 0);
 
-private:
  int _fiemap(CollectionHandle &c_, const ghobject_t& oid,
	      uint64_t offset, size_t len, interval_set<uint64_t>& destset);
 public:
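[Editor's note: for readers tracking region_t::front through the diff, it
records where a logical region's bytes land inside the merged,
chunk-aligned read buffer, which is what lets _generate_read_result_bl
carve the regions back out with substr_of. A minimal illustration with
invented values, continuing the example above:]

    // One merged request req covers blob range 0x0~0x2000 and carries two
    // original regions:
    //   region A: logical_offset=0x100000, blob_xoffset=0x0,   length=0x10, front=0x0
    //   region B: logical_offset=0x100ff8, blob_xoffset=0xff8, length=0x10, front=0xff8
    // Once req.bl holds the 0x2000 bytes returned by the aio read:
    for (const auto& r : req.regs) {
      // take r.length bytes starting r.front bytes into the merged buffer
      ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
    }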