From 4315221179a557f9acd7527cc8be92716b15b332 Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Thu, 6 Apr 2017 11:40:41 -0700 Subject: [PATCH] osd,OSDMonitor: try to protect against ec overwrites with filestore This isn't perfect, but it's better than nothing. Prevent enabling the allow_ec_overwrites flag if any of a sample of pgs in the pool map to osds using filestore. This mainly protects filestore-only clusters from enabling it. If a filestore osd is started later, warn in the cluster log when it gets a pg with ec overwrites enabled. Signed-off-by: Josh Durgin --- qa/workunits/cephtool/test.sh | 12 +++++++++ src/mon/OSDMonitor.cc | 47 +++++++++++++++++++++++++++++++++++ src/mon/OSDMonitor.h | 3 +++ src/osd/OSD.cc | 7 ++++++ 4 files changed, 69 insertions(+) diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh index 94c11aeebf24b..4398ab6114604 100755 --- a/qa/workunits/cephtool/test.sh +++ b/qa/workunits/cephtool/test.sh @@ -1320,7 +1320,19 @@ function test_mon_osd_pool() # should fail because the type is not the same expect_false ceph osd pool create replicated 12 12 erasure ceph osd lspools | grep replicated + ceph osd pool create ec_test 1 1 erasure + set +e + ceph osd metadata | grep osd_objectstore_type | grep -qc bluestore + if [ $? -eq 0 ]; then + ceph osd pool set ec_test allow_ec_overwrites true >& $TMPFILE + check_response $? 22 "pool must only be stored on bluestore for scrubbing to work" + else + ceph osd pool set ec_test allow_ec_overwrites true || return 1 + expect_false ceph osd pool set ec_test allow_ec_overwrites false + fi + set -e ceph osd pool delete replicated replicated --yes-i-really-really-mean-it + ceph osd pool delete ec_test ec_test --yes-i-really-really-mean-it } function test_mon_osd_pool_quota() diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 5cdbac9866ccb..1b48c8edccb63 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -1232,6 +1232,48 @@ int OSDMonitor::load_metadata(int osd, map& m, ostream *err) return 0; } +int OSDMonitor::get_osd_objectstore_type(int osd, string *type) +{ + map metadata; + int r = load_metadata(osd, metadata, nullptr); + if (r < 0) + return r; + + auto it = metadata.find("osd_objectstore"); + if (it == metadata.end()) + return -ENOENT; + *type = it->second; + return 0; +} + +bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id, + const pg_pool_t &pool, + ostream *err) +{ + // just check a few pgs for efficiency - this can't give a guarantee anyway, + // since filestore osds could always join the pool later + set checked_osds; + for (unsigned ps = 0; ps < MIN(8, pool.get_pg_num()); ++ps) { + vector up, acting; + pg_t pgid(ps, pool_id, -1); + osdmap.pg_to_up_acting_osds(pgid, up, acting); + for (int osd : up) { + if (checked_osds.find(osd) != checked_osds.end()) + continue; + string objectstore_type; + int r = get_osd_objectstore_type(osd, &objectstore_type); + // allow with missing metadata, e.g. due to an osd never booting yet + if (r < 0 || objectstore_type == "bluestore") { + checked_osds.insert(osd); + continue; + } + *err << "osd." << osd << " uses " << objectstore_type; + return false; + } + } + return true; +} + int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err) { map m; @@ -5782,6 +5824,11 @@ int OSDMonitor::prepare_command_pool_set(map &cmdmap, ss << "expecting value 'true', 'false', '0', or '1'"; return -EINVAL; } + stringstream err; + if (!is_pool_currently_all_bluestore(pool, p, &err)) { + ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str(); + return -EINVAL; + } } else if (var == "target_max_objects") { if (interr.length()) { ss << "error parsing int '" << val << "': " << interr; diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index 3ccbeecec76d7..5144e89b6d1bd 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -411,6 +411,9 @@ private: OpTracker op_tracker; int load_metadata(int osd, map& m, ostream *err); + int get_osd_objectstore_type(int osd, std::string *type); + bool is_pool_currently_all_bluestore(int64_t pool_id, const pg_pool_t &pool, + ostream *err); // when we last received PG stats from each osd map last_osd_report; diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 1082c66e5dbcf..809e8f21e5d4c 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -3894,6 +3894,13 @@ int OSD::handle_pg_peering_evt( switch (result) { case RES_NONE: { const pg_pool_t* pp = osdmap->get_pg_pool(pgid.pool()); + if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) && + store->get_type() != "bluestore") { + clog->warn() << "pg " << pgid + << " is at risk of silent data corruption: " + << "the pool allows ec overwrites but is not stored in " + << "bluestore, so deep scrubbing will not detect bitrot"; + } PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num())); PG::_init(*rctx.transaction, pgid, pp); -- 2.39.5