uint64_t logical_offset;
uint64_t blob_xoffset; // region offset within the blob
uint64_t length;
- bufferlist bl;
// used later in read process
uint64_t front = 0;
- uint64_t r_off = 0;
- region_t(uint64_t offset, uint64_t b_offs, uint64_t len)
+ region_t(uint64_t offset, uint64_t b_offs, uint64_t len, uint64_t front = 0)
: logical_offset(offset),
blob_xoffset(b_offs),
- length(len){}
+ length(len),
+ front(front){}
region_t(const region_t& from)
: logical_offset(from.logical_offset),
blob_xoffset(from.blob_xoffset),
- length(from.length){}
+ length(from.length),
+ front(from.front){}
friend ostream& operator<<(ostream& out, const region_t& r) {
return out << "0x" << std::hex << r.logical_offset << ":"
}
};
-typedef list<region_t> regions2read_t;
+// merged blob read request
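+// A read_req_t represents one chunk-aligned disk read <r_off, r_len> on a
+// blob plus the original logical regions it satisfies; each region's
+// 'front' records where its bytes start within the merged buffer 'bl'.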
+struct read_req_t {
+ uint64_t r_off = 0;
+ uint64_t r_len = 0;
+ bufferlist bl;
+ std::list<region_t> regs; // original read regions
+
+ read_req_t(uint64_t off, uint64_t len) : r_off(off), r_len(len) {}
+
+ friend ostream& operator<<(ostream& out, const read_req_t& r) {
+ out << "{<0x" << std::hex << r.r_off << ", 0x" << r.r_len << "> : [";
+ for (const auto& reg : r.regs)
+ out << reg;
+ return out << "]}" << std::dec;
+ }
+};
+
+typedef list<read_req_t> regions2read_t;
typedef map<BlueStore::BlobRef, regions2read_t> blobs2read_t;
int BlueStore::_do_read(
<< std::dec << dendl;
auto pc = cache_res.begin();
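+ // blob reads must be chunk-aligned (at least the device block size,
+ // larger when checksums cover bigger chunks), so fetch the chunk size
+ // once per blob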
+ uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
while (b_len > 0) {
unsigned l;
if (pc != cache_res.end() &&
}
dout(30) << __func__ << " will read 0x" << std::hex << pos << ": 0x"
<< b_off << "~" << l << std::dec << dendl;
- blobs2read[bptr].emplace_back(region_t(pos, b_off, l));
+ // merge regions
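+ // pad the region out to chunk boundaries, then either append it to the
+ // previous read request when the padded reads touch or overlap, or
+ // start a new request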
+ {
+ uint64_t r_off = b_off;
+ uint64_t r_len = l;
+ uint64_t front = r_off % chunk_size;
+ if (front) {
+ r_off -= front;
+ r_len += front;
+ }
+ unsigned tail = r_len % chunk_size;
+ if (tail) {
+ r_len += chunk_size - tail;
+ }
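+ // e.g. chunk_size 0x1000, b_off 0x1234, l 0x100 gives front 0x234,
+ // r_off 0x1000 and r_len padded from 0x334 up to 0x1000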
+ bool merged = false;
+ regions2read_t& r2r = blobs2read[bptr];
+ if (r2r.size()) {
+ read_req_t& pre = r2r.back();
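+ // the padded read starts at or before the end of the previous request,
+ // so one contiguous read can serve both; 'front' becomes relative to
+ // the start of the merged request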
+ if (r_off <= (pre.r_off + pre.r_len)) {
+ front += (r_off - pre.r_off);
+ pre.r_len += (r_off + r_len - pre.r_off - pre.r_len);
+ pre.regs.emplace_back(region_t(pos, b_off, l, front));
+ merged = true;
+ }
+ }
+ if (!merged) {
+ read_req_t req(r_off, r_len);
+ req.regs.emplace_back(region_t(pos, b_off, l, front));
+ r2r.emplace_back(std::move(req));
+ }
+ }
++num_regions;
}
pos += l;
IOContext ioc(cct, NULL, true); // allow EIO
for (auto& p : blobs2read) {
const BlobRef& bptr = p.first;
+ regions2read_t& r2r = p.second;
dout(20) << __func__ << " blob " << *bptr << std::hex
- << " need " << p.second << std::dec << dendl;
+ << " need " << r2r << std::dec << dendl;
if (bptr->get_blob().is_compressed()) {
// read the whole thing
if (compressed_blob_bls.empty()) {
[&](uint64_t offset, uint64_t length) {
int r;
// use aio if there are more regions to read than those in this blob
- if (num_regions > p.second.size()) {
+ if (num_regions > r2r.size()) {
r = bdev->aio_read(offset, length, &bl, &ioc);
} else {
r = bdev->read(offset, length, &bl, &ioc, false);
}
} else {
// read the pieces
- for (auto& reg : p.second) {
- // determine how much of the blob to read
- uint64_t chunk_size = bptr->get_blob().get_chunk_size(block_size);
- reg.r_off = reg.blob_xoffset;
- uint64_t r_len = reg.length;
- reg.front = reg.r_off % chunk_size;
- if (reg.front) {
- reg.r_off -= reg.front;
- r_len += reg.front;
- }
- unsigned tail = r_len % chunk_size;
- if (tail) {
- r_len += chunk_size - tail;
- }
+ for (auto& req : r2r) {
dout(20) << __func__ << " region 0x" << std::hex
- << reg.logical_offset
- << ": 0x" << reg.blob_xoffset << "~" << reg.length
- << " reading 0x" << reg.r_off << "~" << r_len << std::dec
+ << req.regs.front().logical_offset
+ << ": 0x" << req.regs.front().blob_xoffset
+ << " reading 0x" << req.r_off
+ << "~" << req.r_len << std::dec
<< dendl;
// read it
r = bptr->get_blob().map(
- reg.r_off, r_len,
+ req.r_off, req.r_len,
[&](uint64_t offset, uint64_t length) {
int r;
// use aio if there is more than one region to read
if (num_regions > 1) {
- r = bdev->aio_read(offset, length, &reg.bl, &ioc);
+ r = bdev->aio_read(offset, length, &req.bl, &ioc);
} else {
- r = bdev->read(offset, length, &reg.bl, &ioc, false);
+ r = bdev->read(offset, length, &req.bl, &ioc, false);
}
if (r < 0)
return r;
}
ceph_assert(r == 0);
}
- ceph_assert(reg.bl.length() == r_len);
+ ceph_assert(req.bl.length() == req.r_len);
}
}
}
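+ // second pass: verify checksums on each merged read and slice the
+ // original regions back out of its buffer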
blobs2read_t::iterator b2r_it = blobs2read.begin();
while (b2r_it != blobs2read.end()) {
const BlobRef& bptr = b2r_it->first;
+ regions2read_t& r2r = b2r_it->second;
dout(20) << __func__ << " blob " << *bptr << std::hex
- << " need 0x" << b2r_it->second << std::dec << dendl;
+ << " need 0x" << r2r << std::dec << dendl;
if (bptr->get_blob().is_compressed()) {
ceph_assert(p != compressed_blob_bls.end());
bufferlist& compressed_bl = *p++;
if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
- b2r_it->second.front().logical_offset) < 0) {
+ r2r.front().regs.front().logical_offset) < 0) {
// Handles spurious read errors caused by a kernel bug.
// We sometimes get all-zero pages as a result of the read under
- // high memory pressure. Retrying the failing read succeeds in most cases.
+ // high memory pressure. Retrying the failing read succeeds in most
+ // cases.
// See also: http://tracker.ceph.com/issues/22464
if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
return -EIO;
bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(), 0,
raw_bl);
}
- for (auto& i : b2r_it->second) {
- ready_regions[i.logical_offset].substr_of(
- raw_bl, i.blob_xoffset, i.length);
+ for (auto& req : r2r) {
+ for (auto& r : req.regs) {
+ ready_regions[r.logical_offset].substr_of(
+ raw_bl, r.blob_xoffset, r.length);
+ }
}
} else {
- for (auto& reg : b2r_it->second) {
- if (_verify_csum(o, &bptr->get_blob(), reg.r_off, reg.bl,
- reg.logical_offset) < 0) {
+ for (auto& req : r2r) {
+ if (_verify_csum(o, &bptr->get_blob(), req.r_off, req.bl,
+ req.regs.front().logical_offset) < 0) {
// Handles spurious read errors caused by a kernel bug.
// We sometimes get all-zero pages as a result of the read under
- // high memory pressure. Retrying the failing read succeeds in most cases.
+ // high memory pressure. Retrying the failing read succeeds in most
+ // cases.
// See also: http://tracker.ceph.com/issues/22464
if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
return -EIO;
}
if (buffered) {
bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
- reg.r_off, reg.bl);
+ req.r_off, req.bl);
}
// prune and keep result
- ready_regions[reg.logical_offset].substr_of(
- reg.bl, reg.front, reg.length);
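+ // each original region copies its bytes out of the shared merged
+ // buffer, starting at the 'front' offset recorded when it was queued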
+ for (const auto& r : req.regs) {
+ ready_regions[r.logical_offset].substr_of(req.bl, r.front, r.length);
+ }
}
}
++b2r_it;
ASSERT_EQ(r, 0);
}
+TEST_P(StoreTest, mergeRegionTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ SetVal(g_conf(), "bluestore_fsck_on_mount", "true");
+ SetVal(g_conf(), "bluestore_fsck_on_umount", "true");
+ SetVal(g_conf(), "bdev_debug_inflight_ios", "true");
+ g_ceph_context->_conf.apply_changes(nullptr);
+
+ uint32_t chunk_size = g_ceph_context->_conf->bdev_block_size;
+ int r = -1;
+ coll_t cid;
+ ghobject_t hoid(hobject_t(sobject_t("Object", CEPH_NOSNAP)));
+ auto ch = store->create_new_collection(cid);
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ {
+ ObjectStore::Transaction t;
+ t.touch(cid, hoid);
+ cerr << "Creating object " << hoid << std::endl;
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ bufferlist bl5;
+ bl5.append("abcde");
+ uint64_t offset = 0;
+ { // 1. same region
+ ObjectStore::Transaction t;
+ t.write(cid, hoid, offset, 5, bl5);
+ t.write(cid, hoid, 0xa + offset, 5, bl5);
+ t.write(cid, hoid, 0x14 + offset, 5, bl5);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ { // 2. adjacent regions
+ ObjectStore::Transaction t;
+ offset = chunk_size;
+ t.write(cid, hoid, offset, 5, bl5);
+ t.write(cid, hoid, offset + chunk_size + 3, 5, bl5);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ { // 3. front merge
+ ObjectStore::Transaction t;
+ offset = chunk_size * 2;
+ t.write(cid, hoid, offset, 5, bl5);
+ t.write(cid, hoid, offset + chunk_size - 2, 5, bl5);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ { // 4. back merge
+ ObjectStore::Transaction t;
+ bufferlist blc2;
+ blc2.append_zero(chunk_size + 2);
+
+ offset = chunk_size * 3;
+ t.write(cid, hoid, offset, chunk_size + 2, blc2);
+ t.write(cid, hoid, offset + chunk_size + 3, 5, bl5);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+ }
+ { // 5. overlapping
+ ObjectStore::Transaction t;
+ uint64_t final_len = 0;
+ offset = chunk_size * 10;
+ bufferlist bl2c2;
+ bl2c2.append_zero(chunk_size * 2);
+ t.write(cid, hoid, offset + chunk_size * 3 - 3, chunk_size * 2, bl2c2);
+ bl2c2.append_zero(2);
+ t.write(cid, hoid, offset + chunk_size - 2, chunk_size * 2 + 2, bl2c2);
+ r = queue_transaction(store, ch, std::move(t));
+ ASSERT_EQ(r, 0);
+
+ final_len = (offset + chunk_size * 3 - 3) + (chunk_size * 2);
+ bufferlist bl;
+ r = store->read(ch, hoid, 0, final_len, bl);
+ ASSERT_EQ(final_len, static_cast<uint64_t>(r));
+ }
+}
#endif // WITH_BLUESTORE
int main(int argc, char **argv) {