Some kernels (4.9+) sometime fail to return data when reading
from a block device under memory pressure. This patch retries
the read if the checksum verification fails, tests show that
the first retried read succeeds in ~99.5% of the cases, so
3 attempts are made by default before giving up on the data.
Works-around: http://tracker.ceph.com/issues/22464
Signed-off-by: Paul Emmerich <paul.emmerich@croit.io>
(cherry picked from commit
cffcbc73aaaa874829d5fc9091af3042b887f9a7)
Conflicts:
src/common/legacy_config_opts.h
- adjacent options
src/common/options.cc
- no RUNTIME flag in luminous
src/os/bluestore/BlueStore.cc
src/os/bluestore/BlueStore.h
- adjacent perfcounter
src/test/objectstore/store_test.cc
- adjacent tests, no #ifdef
- g_conf, not g_conf()
- no create_new_collection
- queue_transaction etc take osr, not ch
OPTION(bluestore_csum_type, OPT_STR) // none|xxhash32|xxhash64|crc32c|crc32c_16|crc32c_8
OPTION(bluestore_csum_min_block, OPT_U32)
OPTION(bluestore_csum_max_block, OPT_U32)
+OPTION(bluestore_retry_disk_reads, OPT_U64)
OPTION(bluestore_min_alloc_size, OPT_U32)
OPTION(bluestore_min_alloc_size_hdd, OPT_U32)
OPTION(bluestore_min_alloc_size_ssd, OPT_U32)
OPTION(bluestore_debug_permit_any_bdev_label, OPT_BOOL)
OPTION(bluestore_shard_finishers, OPT_BOOL)
OPTION(bluestore_debug_random_read_err, OPT_DOUBLE)
+OPTION(bluestore_debug_inject_csum_err_probability, OPT_FLOAT)
OPTION(kstore_max_ops, OPT_U64)
OPTION(kstore_max_bytes, OPT_U64)
.set_description("Maximum block size to checksum")
.add_see_also("bluestore_csum_min_block"),
+ Option("bluestore_retry_disk_reads", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+ .set_default(3)
+ .set_min_max(0, 255)
+ .set_description("Number of read retries on checksum validation error")
+ .set_long_description("Retries to read data from the disk this many times when checksum validation fails to handle spurious read errors gracefully."),
+
Option("bluestore_min_alloc_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(0)
.add_tag("mkfs")
.set_default(0)
.set_description(""),
+ Option("bluestore_debug_inject_csum_err_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
+ .set_default(0.0)
+ .set_description("inject crc verification errors into bluestore device reads"),
+
// -----------------------------------------
// kstore
"collection");
b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
"Read EIO errors propagated to high level callers");
+ b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries",
+ "Read operations that required at least one retry due to failed checksum validation");
logger = b.create_perf_counters();
cct->get_perfcounters_collection()->add(logger);
}
uint64_t offset,
size_t length,
bufferlist& bl,
- uint32_t op_flags)
+ uint32_t op_flags,
+ uint64_t retry_count)
{
FUNCTRACE();
int r = 0;
bufferlist& compressed_bl = *p++;
if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
b2r_it->second.front().logical_offset) < 0) {
- return -EIO;
+ // Handles spurious read errors caused by a kernel bug.
+ // We sometimes get all-zero pages as a result of the read under
+ // high memory pressure. Retrying the failing read succeeds in most cases.
+ // See also: http://tracker.ceph.com/issues/22464
+ if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
+ return -EIO;
+ }
+ return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
}
bufferlist raw_bl;
r = _decompress(compressed_bl, &raw_bl);
for (auto& reg : b2r_it->second) {
if (_verify_csum(o, &bptr->get_blob(), reg.r_off, reg.bl,
reg.logical_offset) < 0) {
- return -EIO;
+ // Handles spurious read errors caused by a kernel bug.
+ // We sometimes get all-zero pages as a result of the read under
+ // high memory pressure. Retrying the failing read succeeds in most cases.
+ // See also: http://tracker.ceph.com/issues/22464
+ if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
+ return -EIO;
+ }
+ return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
}
if (buffered) {
bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
assert(pos == length);
assert(pr == pr_end);
r = bl.length();
+ if (retry_count) {
+ logger->inc(l_bluestore_reads_with_retries);
+ dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
+ << " failed " << std::dec << retry_count << " times before succeeding" << dendl;
+ }
return r;
}
uint64_t bad_csum;
utime_t start = ceph_clock_now();
int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
+ if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
+ (rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
+ derr << __func__ << " injecting bluestore checksum verifcation error" << dendl;
+ bad = blob_xoffset;
+ r = -1;
+ bad_csum = 0xDEADBEEF;
+ }
if (r < 0) {
if (r == -1) {
PExtentVector pex;
l_bluestore_extent_compress,
l_bluestore_gc_merged,
l_bluestore_read_eio,
+ l_bluestore_reads_with_retries,
l_bluestore_last
};
uint64_t offset,
size_t len,
bufferlist& bl,
- uint32_t op_flags = 0);
+ uint32_t op_flags = 0,
+ uint64_t retry_count = 0);
private:
int _fiemap(CollectionHandle &c_, const ghobject_t& oid,
g_conf->apply_changes(NULL);
}
+TEST_P(StoreTest, SpuriousReadErrorTest) {
+ if (string(GetParam()) != "bluestore")
+ return;
+
+ ObjectStore::Sequencer osr("test");
+ int r;
+ auto logger = store->get_perf_counters();
+ coll_t cid;
+ ghobject_t hoid(hobject_t(sobject_t("foo", CEPH_NOSNAP)));
+ {
+ ObjectStore::Transaction t;
+ t.create_collection(cid, 0);
+ cerr << "Creating collection " << cid << std::endl;
+ r = store->queue_transaction(&osr, std::move(t), nullptr);
+ ASSERT_EQ(r, 0);
+ }
+ bufferlist test_data;
+ bufferptr ap(0x2000);
+ memset(ap.c_str(), 'a', 0x2000);
+ test_data.append(ap);
+ {
+ ObjectStore::Transaction t;
+ t.write(cid, hoid, 0, 0x2000, test_data);
+ r = store->queue_transaction(&osr, std::move(t), nullptr);
+ ASSERT_EQ(r, 0);
+ // force cache clear
+ EXPECT_EQ(store->umount(), 0);
+ EXPECT_EQ(store->mount(), 0);
+ }
+
+ cerr << "Injecting CRC error with no retry, expecting EIO" << std::endl;
+ g_conf->set_val("bluestore_retry_disk_reads", "0");
+ g_conf->set_val("bluestore_debug_inject_csum_err_probability", "1");
+ g_ceph_context->_conf->apply_changes(nullptr);
+ {
+ bufferlist in;
+ r = store->read(cid, hoid, 0, 0x2000, in, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ ASSERT_EQ(-EIO, r);
+ ASSERT_EQ(logger->get(l_bluestore_read_eio), 1u);
+ ASSERT_EQ(logger->get(l_bluestore_reads_with_retries), 0u);
+ }
+
+ cerr << "Injecting CRC error with retries, expecting success after several retries" << std::endl;
+ g_conf->set_val("bluestore_retry_disk_reads", "255");
+ g_conf->set_val("bluestore_debug_inject_csum_err_probability", "0.8");
+ /**
+ * Probabilistic test: 25 reads, each has a 80% chance of failing with 255 retries
+ * Probability of at least one retried read: 1 - (0.2 ** 25) = 100% - 3e-18
+ * Probability of a random test failure: 1 - ((1 - (0.8 ** 255)) ** 25) ~= 5e-24
+ */
+ g_ceph_context->_conf->apply_changes(nullptr);
+ {
+ for (int i = 0; i < 25; ++i) {
+ bufferlist in;
+ r = store->read(cid, hoid, 0, 0x2000, in, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
+ ASSERT_EQ(0x2000, r);
+ ASSERT_TRUE(bl_eq(test_data, in));
+ }
+ ASSERT_GE(logger->get(l_bluestore_reads_with_retries), 1u);
+ }
+
+ // revert
+ g_conf->set_val("bluestore_retry_disk_reads", "3");
+ g_conf->set_val("bluestore_debug_inject_csum_err_probability", "0");
+ g_ceph_context->_conf->apply_changes(nullptr);
+}
+
int main(int argc, char **argv) {
vector<const char*> args;
argv_to_vec(argc, (const char **)argv, args);