From fe558c4dd12b25b2d523987f22fde5b5898cfb83 Mon Sep 17 00:00:00 2001 From: Igor Fedotov Date: Thu, 9 Apr 2020 00:06:14 +0300 Subject: [PATCH] os/bluestore: fix huge (>4GB) bluefs reads Signed-off-by: Igor Fedotov --- src/os/bluestore/BlueFS.cc | 17 ++++++++++------- src/os/bluestore/BlueFS.h | 8 ++++---- src/os/bluestore/BlueRocksEnv.cc | 4 ++-- src/test/objectstore/test_bluefs.cc | 27 ++++++++++++++++++++++++++- 4 files changed, 42 insertions(+), 14 deletions(-) diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index eca048ecf45e7..00a4303b3e2ac 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -1881,7 +1881,7 @@ void BlueFS::_drop_link(FileRef file) } } -int BlueFS::_read_random( +int64_t BlueFS::_read_random( FileReader *h, ///< [in] read from here uint64_t off, ///< [in] offset uint64_t len, ///< [in] this many bytes @@ -1889,7 +1889,7 @@ int BlueFS::_read_random( { auto* buf = &h->buf; - int ret = 0; + int64_t ret = 0; dout(10) << __func__ << " h " << h << " 0x" << std::hex << off << "~" << len << std::dec << " from " << h->file->fnode << dendl; @@ -1915,6 +1915,8 @@ int BlueFS::_read_random( uint64_t x_off = 0; auto p = h->file->fnode.seek(off, &x_off); uint64_t l = std::min(p->length - x_off, len); + //hard cap to 1GB + l = std::min(l, uint64_t(1) << 30); dout(20) << __func__ << " read random 0x" << std::hex << x_off << "~" << l << std::dec << " of " << *p << dendl; @@ -1933,7 +1935,7 @@ int BlueFS::_read_random( } } else { auto left = buf->get_buf_remaining(off); - int r = std::min(len, left); + int64_t r = std::min(len, left); logger->inc(l_bluefs_read_random_buffer_count, 1); logger->inc(l_bluefs_read_random_buffer_bytes, r); dout(20) << __func__ << " left 0x" << std::hex << left @@ -1962,7 +1964,7 @@ int BlueFS::_read_random( return ret; } -int BlueFS::_read( +int64_t BlueFS::_read( FileReader *h, ///< [in] read from here uint64_t off, ///< [in] offset size_t len, ///< [in] this many bytes @@ -1999,7 +2001,7 @@ int BlueFS::_read( if (outbl) outbl->clear(); - int ret = 0; + int64_t ret = 0; std::shared_lock s_lock(h->lock); while (len > 0) { size_t left; @@ -2016,6 +2018,8 @@ int BlueFS::_read( super.block_size); want = std::max(want, buf->max_prefetch); uint64_t l = std::min(p->length - x_off, want); + //hard cap to 1GB + l = std::min(l, uint64_t(1) << 30); uint64_t eof_offset = round_up_to(h->file->fnode.size, super.block_size); if (!h->ignore_eof && buf->bl_off + l > eof_offset) { @@ -2037,7 +2041,7 @@ int BlueFS::_read( dout(20) << __func__ << " left 0x" << std::hex << left << " len 0x" << len << std::dec << dendl; - int r = std::min(len, left); + int64_t r = std::min(len, left); if (outbl) { bufferlist t; t.substr_of(buf->bl, off - buf->bl_off, r); @@ -2061,7 +2065,6 @@ int BlueFS::_read( ret += r; buf->pos += r; } - dout(20) << __func__ << " got " << ret << dendl; ceph_assert(!outbl || (int)outbl->length() == ret); --h->file->num_reading; diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index 82680151e0c51..2ecc187d9574e 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -397,13 +397,13 @@ private: int _preallocate(FileRef f, uint64_t off, uint64_t len); int _truncate(FileWriter *h, uint64_t off); - int _read( + int64_t _read( FileReader *h, ///< [in] read from here uint64_t offset, ///< [in] offset size_t len, ///< [in] this many bytes ceph::buffer::list *outbl, ///< [out] optional: reference the result here char *out); ///< [out] optional: or copy it here - int _read_random( + int64_t _read_random( FileReader *h, ///< [in] read from here uint64_t offset, ///< [in] offset uint64_t len, ///< [in] this many bytes @@ -567,14 +567,14 @@ public: std::unique_lock l(lock); return _fsync(h, l); } - int read(FileReader *h, uint64_t offset, size_t len, + int64_t read(FileReader *h, uint64_t offset, size_t len, ceph::buffer::list *outbl, char *out) { // no need to hold the global lock here; we only touch h and // h->file, and read vs write or delete is already protected (via // atomics and asserts). return _read(h, offset, len, outbl, out); } - int read_random(FileReader *h, uint64_t offset, size_t len, + int64_t read_random(FileReader *h, uint64_t offset, size_t len, char *out) { // no need to hold the global lock here; we only touch h and // h->file, and read vs write or delete is already protected (via diff --git a/src/os/bluestore/BlueRocksEnv.cc b/src/os/bluestore/BlueRocksEnv.cc index 55965c3d0fca9..e77a07148ce6f 100644 --- a/src/os/bluestore/BlueRocksEnv.cc +++ b/src/os/bluestore/BlueRocksEnv.cc @@ -47,7 +47,7 @@ class BlueRocksSequentialFile : public rocksdb::SequentialFile { // // REQUIRES: External synchronization rocksdb::Status Read(size_t n, rocksdb::Slice* result, char* scratch) override { - int r = fs->read(h, h->buf.pos, n, NULL, scratch); + int64_t r = fs->read(h, h->buf.pos, n, NULL, scratch); ceph_assert(r >= 0); *result = rocksdb::Slice(scratch, r); return rocksdb::Status::OK(); @@ -96,7 +96,7 @@ class BlueRocksRandomAccessFile : public rocksdb::RandomAccessFile { // Safe for concurrent use by multiple threads. rocksdb::Status Read(uint64_t offset, size_t n, rocksdb::Slice* result, char* scratch) const override { - int r = fs->read_random(h, offset, n, scratch); + int64_t r = fs->read_random(h, offset, n, scratch); ceph_assert(r >= 0); *result = rocksdb::Slice(scratch, r); return rocksdb::Status::OK(); diff --git a/src/test/objectstore/test_bluefs.cc b/src/test/objectstore/test_bluefs.cc index 388ecc408f34e..8835c36c9880b 100644 --- a/src/test/objectstore/test_bluefs.cc +++ b/src/test/objectstore/test_bluefs.cc @@ -198,13 +198,14 @@ TEST(BlueFS, small_appends) { } TEST(BlueFS, very_large_write) { - // we'll write a ~3G file, so allocate more than that for the whole fs + // we'll write a ~5G file, so allocate more than that for the whole fs uint64_t size = 1048576 * 1024 * 8ull; TempBdev bdev{size}; BlueFS fs(g_ceph_context); bool old = g_ceph_context->_conf.get_val("bluefs_buffered_io"); g_ceph_context->_conf.set_val("bluefs_buffered_io", "false"); + uint64_t total_written = 0; ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev.path, false)); fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size - 1048576); @@ -222,6 +223,12 @@ TEST(BlueFS, very_large_write) { ASSERT_EQ(0, fs.open_for_write("dir", "bigfile", &h, false)); for (unsigned i = 0; i < 3*1024*1048576ull / sizeof(buf); ++i) { h->append(buf, sizeof(buf)); + total_written += sizeof(buf); + } + fs.fsync(h); + for (unsigned i = 0; i < 2*1024*1048576ull / sizeof(buf); ++i) { + h->append(buf, sizeof(buf)); + total_written += sizeof(buf); } fs.fsync(h); fs.close_writer(h); @@ -230,6 +237,7 @@ TEST(BlueFS, very_large_write) { BlueFS::FileReader *h; ASSERT_EQ(0, fs.open_for_read("dir", "bigfile", &h)); bufferlist bl; + ASSERT_EQ(h->file->fnode.size, total_written); for (unsigned i = 0; i < 3*1024*1048576ull / sizeof(buf); ++i) { bl.clear(); fs.read(h, i * sizeof(buf), sizeof(buf), &bl, NULL); @@ -240,6 +248,23 @@ TEST(BlueFS, very_large_write) { } ASSERT_EQ(0, r); } + for (unsigned i = 0; i < 2*1024*1048576ull / sizeof(buf); ++i) { + bl.clear(); + fs.read(h, i * sizeof(buf), sizeof(buf), &bl, NULL); + int r = memcmp(buf, bl.c_str(), sizeof(buf)); + if (r) { + cerr << "read got mismatch at offset " << i*sizeof(buf) << " r " << r + << std::endl; + } + ASSERT_EQ(0, r); + } + delete h; + ASSERT_EQ(0, fs.open_for_read("dir", "bigfile", &h)); + ASSERT_EQ(h->file->fnode.size, total_written); + unique_ptr huge_buf(new char[h->file->fnode.size]); + auto l = h->file->fnode.size; + int64_t r = fs.read(h, 0, l, NULL, huge_buf.get()); + ASSERT_EQ(r, l); delete h; } fs.umount(); -- 2.39.5