From 400e67aa5f0632c41d93f3a860d8c5db5fdbc289 Mon Sep 17 00:00:00 2001 From: Igor Fedotov Date: Thu, 9 Apr 2020 00:06:14 +0300 Subject: [PATCH] os/bluestore: fix huge (>4GB) bluefs reads Signed-off-by: Igor Fedotov (cherry picked from commit fe558c4dd12b25b2d523987f22fde5b5898cfb83) Conflicts: (trivial) src/os/bluestore/BlueFS.h (trivial) src/os/bluestore/BlueRocksEnv.cc (trivial) src/test/objectstore/test_bluefs.cc --- src/os/bluestore/BlueFS.cc | 17 ++++++++++------- src/os/bluestore/BlueFS.h | 8 ++++---- src/os/bluestore/BlueRocksEnv.cc | 4 ++-- src/test/objectstore/test_bluefs.cc | 27 ++++++++++++++++++++++++++- 4 files changed, 42 insertions(+), 14 deletions(-) diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 2ba18f0266d9..871cd0e55e4c 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -1882,7 +1882,7 @@ void BlueFS::_drop_link(FileRef file) } } -int BlueFS::_read_random( +int64_t BlueFS::_read_random( FileReader *h, ///< [in] read from here uint64_t off, ///< [in] offset uint64_t len, ///< [in] this many bytes @@ -1890,7 +1890,7 @@ int BlueFS::_read_random( { auto* buf = &h->buf; - int ret = 0; + int64_t ret = 0; dout(10) << __func__ << " h " << h << " 0x" << std::hex << off << "~" << len << std::dec << " from " << h->file->fnode << dendl; @@ -1918,6 +1918,8 @@ int BlueFS::_read_random( auto p = h->file->fnode.seek(off, &x_off); ceph_assert(p != h->file->fnode.extents.end()); uint64_t l = std::min(p->length - x_off, len); + //hard cap to 1GB + l = std::min(l, uint64_t(1) << 30); dout(20) << __func__ << " read random 0x" << std::hex << x_off << "~" << l << std::dec << " of " << *p << dendl; @@ -1936,7 +1938,7 @@ int BlueFS::_read_random( } } else { auto left = buf->get_buf_remaining(off); - int r = std::min(len, left); + int64_t r = std::min(len, left); logger->inc(l_bluefs_read_random_buffer_count, 1); logger->inc(l_bluefs_read_random_buffer_bytes, r); dout(20) << __func__ << " left 0x" << std::hex << left @@ 
-1967,7 +1969,7 @@ int BlueFS::_read_random( return ret; } -int BlueFS::_read( +int64_t BlueFS::_read( FileReader *h, ///< [in] read from here FileReaderBuffer *buf, ///< [in] reader state uint64_t off, ///< [in] offset @@ -2003,7 +2005,7 @@ int BlueFS::_read( if (outbl) outbl->clear(); - int ret = 0; + int64_t ret = 0; std::shared_lock s_lock(h->lock); while (len > 0) { size_t left; @@ -2027,6 +2029,8 @@ int BlueFS::_read( super.block_size); want = std::max(want, buf->max_prefetch); uint64_t l = std::min(p->length - x_off, want); + //hard cap to 1GB + l = std::min(l, uint64_t(1) << 30); uint64_t eof_offset = round_up_to(h->file->fnode.size, super.block_size); if (!h->ignore_eof && buf->bl_off + l > eof_offset) { @@ -2048,7 +2052,7 @@ int BlueFS::_read( dout(20) << __func__ << " left 0x" << std::hex << left << " len 0x" << len << std::dec << dendl; - int r = std::min(len, left); + int64_t r = std::min(len, left); if (outbl) { bufferlist t; t.substr_of(buf->bl, off - buf->bl_off, r); @@ -2072,7 +2076,6 @@ int BlueFS::_read( ret += r; buf->pos += r; } - dout(20) << __func__ << " got " << ret << dendl; ceph_assert(!outbl || (int)outbl->length() == ret); --h->file->num_reading; diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index eddabf7344f2..14fca0fb5b3c 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -394,14 +394,14 @@ private: int _preallocate(FileRef f, uint64_t off, uint64_t len); int _truncate(FileWriter *h, uint64_t off); - int _read( + int64_t _read( FileReader *h, ///< [in] read from here FileReaderBuffer *buf, ///< [in] reader state uint64_t offset, ///< [in] offset size_t len, ///< [in] this many bytes bufferlist *outbl, ///< [out] optional: reference the result here char *out); ///< [out] optional: or copy it here - int _read_random( + int64_t _read_random( FileReader *h, ///< [in] read from here uint64_t offset, ///< [in] offset uint64_t len, ///< [in] this many bytes @@ -570,14 +570,14 @@ public: 
_maybe_compact_log(l); return r; } - int read(FileReader *h, FileReaderBuffer *buf, uint64_t offset, size_t len, + int64_t read(FileReader *h, FileReaderBuffer *buf, uint64_t offset, size_t len, bufferlist *outbl, char *out) { // no need to hold the global lock here; we only touch h and // h->file, and read vs write or delete is already protected (via // atomics and asserts). return _read(h, buf, offset, len, outbl, out); } - int read_random(FileReader *h, uint64_t offset, size_t len, + int64_t read_random(FileReader *h, uint64_t offset, size_t len, char *out) { // no need to hold the global lock here; we only touch h and // h->file, and read vs write or delete is already protected (via diff --git a/src/os/bluestore/BlueRocksEnv.cc b/src/os/bluestore/BlueRocksEnv.cc index aa689dbc08b9..b48e0678daa7 100644 --- a/src/os/bluestore/BlueRocksEnv.cc +++ b/src/os/bluestore/BlueRocksEnv.cc @@ -47,7 +47,7 @@ class BlueRocksSequentialFile : public rocksdb::SequentialFile { // // REQUIRES: External synchronization rocksdb::Status Read(size_t n, rocksdb::Slice* result, char* scratch) override { - int r = fs->read(h, &h->buf, h->buf.pos, n, NULL, scratch); + int64_t r = fs->read(h, &h->buf, h->buf.pos, n, NULL, scratch); ceph_assert(r >= 0); *result = rocksdb::Slice(scratch, r); return rocksdb::Status::OK(); @@ -95,7 +95,7 @@ class BlueRocksRandomAccessFile : public rocksdb::RandomAccessFile { // Safe for concurrent use by multiple threads. 
rocksdb::Status Read(uint64_t offset, size_t n, rocksdb::Slice* result, char* scratch) const override { - int r = fs->read_random(h, offset, n, scratch); + int64_t r = fs->read_random(h, offset, n, scratch); ceph_assert(r >= 0); *result = rocksdb::Slice(scratch, r); return rocksdb::Status::OK(); diff --git a/src/test/objectstore/test_bluefs.cc b/src/test/objectstore/test_bluefs.cc index f2103521e057..3484b4401e48 100644 --- a/src/test/objectstore/test_bluefs.cc +++ b/src/test/objectstore/test_bluefs.cc @@ -237,13 +237,14 @@ TEST(BlueFS, small_appends) { } TEST(BlueFS, very_large_write) { - // we'll write a ~3G file, so allocate more than that for the whole fs + // we'll write a ~5G file, so allocate more than that for the whole fs uint64_t size = 1048576 * 1024 * 8ull; TempBdev bdev{size}; BlueFS fs(g_ceph_context); bool old = g_ceph_context->_conf.get_val("bluefs_buffered_io"); g_ceph_context->_conf.set_val("bluefs_buffered_io", "false"); + uint64_t total_written = 0; ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576); @@ -261,6 +262,12 @@ TEST(BlueFS, very_large_write) { ASSERT_EQ(0, fs.open_for_write("dir", "bigfile", &h, false)); for (unsigned i = 0; i < 3*1024*1048576ull / sizeof(buf); ++i) { h->append(buf, sizeof(buf)); + total_written += sizeof(buf); + } + fs.fsync(h); + for (unsigned i = 0; i < 2*1024*1048576ull / sizeof(buf); ++i) { + h->append(buf, sizeof(buf)); + total_written += sizeof(buf); } fs.fsync(h); fs.close_writer(h); @@ -270,6 +277,7 @@ TEST(BlueFS, very_large_write) { ASSERT_EQ(0, fs.open_for_read("dir", "bigfile", &h)); bufferlist bl; BlueFS::FileReaderBuffer readbuf(10485760); + ASSERT_EQ(h->file->fnode.size, total_written); for (unsigned i = 0; i < 3*1024*1048576ull / sizeof(buf); ++i) { bl.clear(); fs.read(h, &readbuf, i * sizeof(buf), sizeof(buf), &bl, NULL); @@ -280,6 +288,23 @@ TEST(BlueFS, very_large_write) { } ASSERT_EQ(0, r); } + for (unsigned i = 0; i < 
2*1024*1048576ull / sizeof(buf); ++i) { + bl.clear(); + fs.read(h, &readbuf, i * sizeof(buf), sizeof(buf), &bl, NULL); + int r = memcmp(buf, bl.c_str(), sizeof(buf)); + if (r) { + cerr << "read got mismatch at offset " << i*sizeof(buf) << " r " << r + << std::endl; + } + ASSERT_EQ(0, r); + } + delete h; + ASSERT_EQ(0, fs.open_for_read("dir", "bigfile", &h)); + ASSERT_EQ(h->file->fnode.size, total_written); + unique_ptr<char[]> huge_buf(new char[h->file->fnode.size]); + auto l = h->file->fnode.size; + int64_t r = fs.read(h, &readbuf, 0, l, NULL, huge_buf.get()); + ASSERT_EQ(r, l); delete h; } fs.umount(); -- 2.47.3