From a4713b17c0d79e43cb65cb056c8a5b4611e8376f Mon Sep 17 00:00:00 2001 From: Igor Fedotov Date: Thu, 9 Apr 2020 00:06:14 +0300 Subject: [PATCH] os/bluestore: fix huge (>4GB) bluefs reads Signed-off-by: Igor Fedotov (cherry picked from commit fe558c4dd12b25b2d523987f22fde5b5898cfb83) Conflicts: (still present buf param in BlueFS::read, lack of )https://github.com/ceph/ceph/pull/34421) src/os/bluestore/BlueFS.cc src/os/bluestore/BlueFS.h src/os/bluestore/BlueRocksEnv.cc src/test/objectstore/test_bluefs.cc --- src/os/bluestore/BlueFS.cc | 19 +++++++++++-------- src/os/bluestore/BlueFS.h | 8 ++++---- src/os/bluestore/BlueRocksEnv.cc | 4 ++-- src/test/objectstore/test_bluefs.cc | 25 +++++++++++++++++++++++++ 4 files changed, 42 insertions(+), 14 deletions(-) diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 61ece25b2a102..ec40f328ddb0a 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -1523,7 +1523,7 @@ void BlueFS::_drop_link(FileRef file) } } -int BlueFS::_read_random( +int64_t BlueFS::_read_random( FileReader *h, ///< [in] read from here uint64_t off, ///< [in] offset size_t len, ///< [in] this many bytes @@ -1531,7 +1531,7 @@ int BlueFS::_read_random( { auto* buf = &h->buf; - int ret = 0; + int64_t ret = 0; dout(10) << __func__ << " h " << h << " 0x" << std::hex << off << "~" << len << std::dec << " from " << h->file->fnode << dendl; @@ -1557,8 +1557,10 @@ int BlueFS::_read_random( s_lock.unlock(); uint64_t x_off = 0; auto p = h->file->fnode.seek(off, &x_off); - uint64_t l = std::min(p->length - x_off, static_cast(len)); ceph_assert(p != h->file->fnode.extents.end()); + uint64_t l = std::min(p->length - x_off, static_cast(len)); + //hard cap to 1GB + l = std::min(l, uint64_t(1) << 30); dout(20) << __func__ << " read random 0x" << std::hex << x_off << "~" << l << std::dec << " of " << *p << dendl; @@ -1577,7 +1579,7 @@ int BlueFS::_read_random( } } else { auto left = buf->get_buf_remaining(off); - int r = std::min(len, left); + int64_t r = std::min(len, left); logger->inc(l_bluefs_read_random_buffer_count, 1); logger->inc(l_bluefs_read_random_buffer_bytes, r); dout(20) << __func__ << " left 0x" << std::hex << left @@ -1608,7 +1610,7 @@ int BlueFS::_read_random( return ret; } -int BlueFS::_read( +int64_t BlueFS::_read( FileReader *h, ///< [in] read from here FileReaderBuffer *buf, ///< [in] reader state uint64_t off, ///< [in] offset @@ -1644,7 +1646,7 @@ int BlueFS::_read( if (outbl) outbl->clear(); - int ret = 0; + int64_t ret = 0; std::shared_lock s_lock(h->lock); while (len > 0) { size_t left; @@ -1668,6 +1670,8 @@ int BlueFS::_read( super.block_size); want = std::max(want, buf->max_prefetch); uint64_t l = std::min(p->length - x_off, want); + //hard cap to 1GB + l = std::min(l, uint64_t(1) << 30); uint64_t eof_offset = round_up_to(h->file->fnode.size, super.block_size); if (!h->ignore_eof && buf->bl_off + l > eof_offset) { @@ -1689,7 +1693,7 @@ int BlueFS::_read( dout(20) << __func__ << " left 0x" << std::hex << left << " len 0x" << len << std::dec << dendl; - int r = std::min(len, left); + int64_t r = std::min(len, left); if (outbl) { bufferlist t; t.substr_of(buf->bl, off - buf->bl_off, r); @@ -1713,7 +1717,6 @@ int BlueFS::_read( ret += r; buf->pos += r; } - dout(20) << __func__ << " got " << ret << dendl; ceph_assert(!outbl || (int)outbl->length() == ret); --h->file->num_reading; diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index 2827c98f0197d..1b43896f1e668 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -408,14 +408,14 @@ private: int _preallocate(FileRef f, uint64_t off, uint64_t len); int _truncate(FileWriter *h, uint64_t off); - int _read( + int64_t _read( FileReader *h, ///< [in] read from here FileReaderBuffer *buf, ///< [in] reader state uint64_t offset, ///< [in] offset size_t len, ///< [in] this many bytes bufferlist *outbl, ///< [out] optional: reference the result here char *out); ///< [out] optional: or copy it here - int _read_random( + int64_t _read_random( FileReader *h, ///< [in] read from here uint64_t offset, ///< [in] offset size_t len, ///< [in] this many bytes @@ -593,14 +593,14 @@ public: _maybe_compact_log(l); return r; } - int read(FileReader *h, FileReaderBuffer *buf, uint64_t offset, size_t len, + int64_t read(FileReader *h, FileReaderBuffer *buf, uint64_t offset, size_t len, bufferlist *outbl, char *out) { // no need to hold the global lock here; we only touch h and // h->file, and read vs write or delete is already protected (via // atomics and asserts). return _read(h, buf, offset, len, outbl, out); } - int read_random(FileReader *h, uint64_t offset, size_t len, + int64_t read_random(FileReader *h, uint64_t offset, size_t len, char *out) { // no need to hold the global lock here; we only touch h and // h->file, and read vs write or delete is already protected (via diff --git a/src/os/bluestore/BlueRocksEnv.cc b/src/os/bluestore/BlueRocksEnv.cc index ed80d4688dc3b..df626395ec05a 100644 --- a/src/os/bluestore/BlueRocksEnv.cc +++ b/src/os/bluestore/BlueRocksEnv.cc @@ -47,7 +47,7 @@ class BlueRocksSequentialFile : public rocksdb::SequentialFile { // // REQUIRES: External synchronization rocksdb::Status Read(size_t n, rocksdb::Slice* result, char* scratch) override { - int r = fs->read(h, &h->buf, h->buf.pos, n, NULL, scratch); + int64_t r = fs->read(h, &h->buf, h->buf.pos, n, NULL, scratch); ceph_assert(r >= 0); *result = rocksdb::Slice(scratch, r); return rocksdb::Status::OK(); @@ -95,7 +95,7 @@ class BlueRocksRandomAccessFile : public rocksdb::RandomAccessFile { // Safe for concurrent use by multiple threads. rocksdb::Status Read(uint64_t offset, size_t n, rocksdb::Slice* result, char* scratch) const override { - int r = fs->read_random(h, offset, n, scratch); + int64_t r = fs->read_random(h, offset, n, scratch); ceph_assert(r >= 0); *result = rocksdb::Slice(scratch, r); return rocksdb::Status::OK(); diff --git a/src/test/objectstore/test_bluefs.cc b/src/test/objectstore/test_bluefs.cc index 94cda07f5bda4..e33f424d37add 100644 --- a/src/test/objectstore/test_bluefs.cc +++ b/src/test/objectstore/test_bluefs.cc @@ -148,6 +148,7 @@ TEST(BlueFS, very_large_write) { bool old = g_ceph_context->_conf.get_val("bluefs_buffered_io"); g_ceph_context->_conf.set_val("bluefs_buffered_io", "false"); + uint64_t total_written = 0; ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576); @@ -164,6 +165,12 @@ TEST(BlueFS, very_large_write) { ASSERT_EQ(0, fs.open_for_write("dir", "bigfile", &h, false)); for (unsigned i = 0; i < 3*1024*1048576ull / sizeof(buf); ++i) { h->append(buf, sizeof(buf)); + total_written += sizeof(buf); + } + fs.fsync(h); + for (unsigned i = 0; i < 2*1024*1048576ull / sizeof(buf); ++i) { + h->append(buf, sizeof(buf)); + total_written += sizeof(buf); } fs.fsync(h); fs.close_writer(h); @@ -173,6 +180,7 @@ TEST(BlueFS, very_large_write) { ASSERT_EQ(0, fs.open_for_read("dir", "bigfile", &h)); bufferlist bl; BlueFS::FileReaderBuffer readbuf(10485760); + ASSERT_EQ(h->file->fnode.size, total_written); for (unsigned i = 0; i < 3*1024*1048576ull / sizeof(buf); ++i) { bl.clear(); fs.read(h, &readbuf, i * sizeof(buf), sizeof(buf), &bl, NULL); @@ -183,6 +191,23 @@ TEST(BlueFS, very_large_write) { } ASSERT_EQ(0, r); } + for (unsigned i = 0; i < 2*1024*1048576ull / sizeof(buf); ++i) { + bl.clear(); + fs.read(h, &readbuf, i * sizeof(buf), sizeof(buf), &bl, NULL); + int r = memcmp(buf, bl.c_str(), sizeof(buf)); + if (r) { + cerr << "read got mismatch at offset " << i*sizeof(buf) << " r " << r + << std::endl; + } + ASSERT_EQ(0, r); + } + delete h; + ASSERT_EQ(0, fs.open_for_read("dir", "bigfile", &h)); + ASSERT_EQ(h->file->fnode.size, total_written); + unique_ptr huge_buf(new char[h->file->fnode.size]); + auto l = h->file->fnode.size; + int64_t r = fs.read(h, &readbuf, 0, l, NULL, huge_buf.get()); + ASSERT_EQ(r, (int64_t)l); delete h; } fs.umount(); -- 2.39.5