From: Igor Fedotov Date: Thu, 25 Apr 2019 11:39:14 +0000 (+0300) Subject: os/bluestore: support RocksDB prefetching at BlueFS X-Git-Tag: v15.1.0~2694^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=f37cb01bc149d05f777ec6fad7cd0a7c81686c6e;p=ceph.git os/bluestore: support RocksDB prefetching at BlueFS Signed-off-by: Igor Fedotov --- diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index de0cc1c9ed7..35d6c3bafca 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -124,6 +124,35 @@ void BlueFS::_init_logger() "Maximum bytes allocated from DB"); b.add_u64_counter(l_bluefs_max_bytes_slow, "max_bytes_slow", "Maximum bytes allocated from SLOW"); + + b.add_u64_counter(l_bluefs_read_random_count, "read_random_count", + "random read requests processed"); + b.add_u64_counter(l_bluefs_read_random_bytes, "read_random_bytes", + "Bytes requested in random read mode", NULL, + PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + b.add_u64_counter(l_bluefs_read_random_disk_count, "read_random_disk_count", + "random reads requests going to disk"); + b.add_u64_counter(l_bluefs_read_random_disk_bytes, "read_random_disk_bytes", + "Bytes read from disk in random read mode", NULL, + PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + b.add_u64_counter(l_bluefs_read_random_buffer_count, "read_random_buffer_count", + "random read requests processed using prefetch buffer"); + b.add_u64_counter(l_bluefs_read_random_buffer_bytes, "read_random_buffer_bytes", + "Bytes read from prefetch buffer in random read mode", NULL, + PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + + b.add_u64_counter(l_bluefs_read_count, "read_count", + "buffered read requests processed"); + b.add_u64_counter(l_bluefs_read_bytes, "read_bytes", + "Bytes requested in buffered read mode", NULL, + PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + + b.add_u64_counter(l_bluefs_read_prefetch_count, "read_prefetch_count", + "prefetch read requests processed"); + b.add_u64_counter(l_bluefs_read_prefetch_bytes, "read_prefetch_bytes", + "Bytes requested in prefetch read mode", NULL, + PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES)); + logger = b.create_perf_counters(); cct->get_perfcounters_collection()->add(logger); } @@ -1328,6 +1357,9 @@ int BlueFS::_read_random( size_t len, ///< [in] this many bytes char *out) ///< [out] optional: or copy it here { + auto* buf = &h->buf; + + int ret = 0; dout(10) << __func__ << " h " << h << " 0x" << std::hex << off << "~" << len << std::dec << " from " << h->file->fnode << dendl; @@ -1343,24 +1375,57 @@ int BlueFS::_read_random( dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x" << std::hex << len << std::dec << dendl; } + logger->inc(l_bluefs_read_random_count, 1); + logger->inc(l_bluefs_read_random_bytes, len); + - int ret = 0; while (len > 0) { - uint64_t x_off = 0; - auto p = h->file->fnode.seek(off, &x_off); - uint64_t l = std::min(p->length - x_off, static_cast(len)); - dout(20) << __func__ << " read buffered 0x" - << std::hex << x_off << "~" << l << std::dec - << " of " << *p << dendl; - int r = bdev[p->bdev]->read_random(p->offset + x_off, l, out, - cct->_conf->bluefs_buffered_io); - ceph_assert(r == 0); - off += l; - len -= l; - ret += l; - out += l; - } + if (off < buf->bl_off || off >= buf->get_buf_end()) { + uint64_t x_off = 0; + auto p = h->file->fnode.seek(off, &x_off); + uint64_t l = std::min(p->length - x_off, static_cast(len)); + dout(20) << __func__ << " read random 0x" + << std::hex << x_off << "~" << l << std::dec + << " of " << *p << dendl; + int r = bdev[p->bdev]->read_random(p->offset + x_off, l, out, + cct->_conf->bluefs_buffered_io); + ceph_assert(r == 0); + off += l; + len -= l; + ret += l; + out += l; + + logger->inc(l_bluefs_read_random_disk_count, 1); + logger->inc(l_bluefs_read_random_disk_bytes, l); + } else { + auto left = buf->get_buf_remaining(off); + int r = std::min(len, left); + logger->inc(l_bluefs_read_random_buffer_count, 1); + logger->inc(l_bluefs_read_random_buffer_bytes, r); + dout(20) << __func__ << " left 0x" << std::hex << left + << " 0x" << off << "~" << len << std::dec + << dendl; + + if (out) { + // NOTE: h->bl is normally a contiguous buffer so c_str() is free. + memcpy(out, buf->bl.c_str() + off - buf->bl_off, r); + out += r; + } + + dout(30) << __func__ << " result chunk (0x" + << std::hex << r << std::dec << " bytes):\n"; + bufferlist t; + t.substr_of(buf->bl, off - buf->bl_off, r); + t.hexdump(*_dout); + *_dout << dendl; + + off += r; + len -= r; + ret += r; + buf->pos += r; + } + } dout(20) << __func__ << " got " << ret << dendl; --h->file->num_reading; return ret; @@ -1374,9 +1439,12 @@ int BlueFS::_read( bufferlist *outbl, ///< [out] optional: reference the result here char *out) ///< [out] optional: or copy it here { + bool prefetch = !outbl && !out; dout(10) << __func__ << " h " << h << " 0x" << std::hex << off << "~" << len << std::dec - << " from " << h->file->fnode << dendl; + << " from " << h->file->fnode + << (prefetch ? " prefetch" : "") + << dendl; ++h->file->num_reading; @@ -1389,6 +1457,13 @@ int BlueFS::_read( dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x" << std::hex << len << std::dec << dendl; } + logger->inc(l_bluefs_read_count, 1); + logger->inc(l_bluefs_read_bytes, len); + if (prefetch) { + logger->inc(l_bluefs_read_prefetch_count, 1); + logger->inc(l_bluefs_read_prefetch_bytes, len); + } + if (outbl) outbl->clear(); diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index 2eae38fc15f..12c25ca2387 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -39,6 +39,17 @@ enum { l_bluefs_max_bytes_wal, l_bluefs_max_bytes_db, l_bluefs_max_bytes_slow, + l_bluefs_read_random_count, + l_bluefs_read_random_bytes, + l_bluefs_read_random_disk_count, + l_bluefs_read_random_disk_bytes, + l_bluefs_read_random_buffer_count, + l_bluefs_read_random_buffer_bytes, + l_bluefs_read_count, + l_bluefs_read_bytes, + l_bluefs_read_prefetch_count, + l_bluefs_read_prefetch_bytes, + l_bluefs_last, }; diff --git a/src/os/bluestore/BlueRocksEnv.cc b/src/os/bluestore/BlueRocksEnv.cc index 6536f6ea38d..7f2f8991b9e 100644 --- a/src/os/bluestore/BlueRocksEnv.cc +++ b/src/os/bluestore/BlueRocksEnv.cc @@ -121,6 +121,12 @@ class BlueRocksRandomAccessFile : public rocksdb::RandomAccessFile { (unsigned long long)h->file->fnode.ino); }; + // Readahead the file starting from offset by n bytes for caching. + rocksdb::Status Prefetch(uint64_t offset, size_t n) override { + fs->read(h, &h->buf, offset, n, nullptr, nullptr); + return rocksdb::Status::OK(); + } + //enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED }; void Hint(AccessPattern pattern) override {