From: anand76 Date: Wed, 15 Sep 2021 19:43:35 +0000 (-0700) Subject: More robust checking of IO uring completion data (#8894) X-Git-Tag: v6.25.1~24 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=7743f033b17bf3e0ea338bc6751b28adcc8dc559;p=rocksdb.git More robust checking of IO uring completion data (#8894) Summary: Potential bugs in the IO uring implementation can cause bad data to be returned in the completion queue. Add some checks in the PosixRandomAccessFile::MultiRead completion handling code to catch such errors and fail the entire MultiRead. Also log some diagnostic messages and stack trace. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8894 Reviewed By: siying, pdillinger Differential Revision: D30826982 Pulled By: anand1976 fbshipit-source-id: af91815ac760e095d6cc0466cf8bd5c10167fd15 --- diff --git a/HISTORY.md b/HISTORY.md index 680c05088..daac871ab 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -10,6 +10,7 @@ * Fix a bug on POSIX in which failure to create a lock file (e.g. out of space) can prevent future LockFile attempts in the same process on the same file from succeeding. * Fix a bug that backup_rate_limiter and restore_rate_limiter in BackupEngine could not limit read rates. * Fix WAL log data corruption when using DBOptions.manual_wal_flush(true) and WriteOptions.sync(true) together. The sync WAL should work with locked log_write_mutex_. +* Add checks for validity of the IO uring completion queue entries, and fail the BlockBasedTableReader MultiGet sub-batch if there's an invalid completion ### New Features * RemoteCompaction's interface now includes `db_name`, `db_id`, `session_id`, which could help the user uniquely identify compaction job between db instances and sessions. diff --git a/env/io_posix.cc b/env/io_posix.cc index a041b32aa..ccdb33825 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -31,6 +31,7 @@ #endif #include "monitoring/iostats_context_imp.h" #include "port/port.h" +#include "port/stack_trace.h" #include "rocksdb/slice.h" #include "test_util/sync_point.h" #include "util/autovector.h" @@ -644,6 +645,7 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, autovector req_wraps; autovector incomplete_rq_list; + std::unordered_set wrap_cache; for (size_t i = 0; i < num_reqs; i++) { req_wraps.emplace_back(&reqs[i]); @@ -676,6 +678,7 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, sqe, fd_, &rep_to_submit->iov, 1, rep_to_submit->req->offset + rep_to_submit->finished_len); io_uring_sqe_set_data(sqe, rep_to_submit); + wrap_cache.emplace(rep_to_submit); } incomplete_rq_list.clear(); @@ -724,6 +727,22 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, } req_wrap = static_cast(io_uring_cqe_get_data(cqe)); + // Reset cqe data to catch any stray reuse of it + static_cast(cqe)->user_data = 0xd5d5d5d5d5d5d5d5; + // Check that we got a valid unique cqe data + auto wrap_check = wrap_cache.find(req_wrap); + if (wrap_check == wrap_cache.end()) { + fprintf(stderr, + "PosixRandomAccessFile::MultiRead: " + "Bad cqe data from IO uring - %p\n", + req_wrap); + port::PrintStack(); + ios = IOStatus::IOError("io_uring_cqe_get_data() returned " + + ToString((uint64_t)req_wrap)); + continue; + } + wrap_cache.erase(wrap_check); + FSReadRequest* req = req_wrap->req; if (cqe->res < 0) { req->result = Slice(req->scratch, 0); @@ -769,6 +788,7 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, } io_uring_cqe_seen(iu, cqe); } + wrap_cache.clear(); } return ios; #else diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index e51ad291a..d0d3679c8 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -1728,14 +1728,16 @@ void BlockBasedTable::RetrieveMultipleBlocks( { IOOptions opts; IOStatus s = file->PrepareIOOptions(options, opts); - if (s.IsTimedOut()) { + if (s.ok()) { + s = file->MultiRead(opts, &read_reqs[0], read_reqs.size(), + &direct_io_buf); + } + if (!s.ok()) { + // Discard all the results in this batch if there is any time out + // or overall MultiRead error for (FSReadRequest& req : read_reqs) { req.status = s; } - } else { - // How to handle this status code? - file->MultiRead(opts, &read_reqs[0], read_reqs.size(), &direct_io_buf) - .PermitUncheckedError(); } }