From 5a8e700eeec253a9a64b07167be3666a31c2a70b Mon Sep 17 00:00:00 2001 From: David Zafman Date: Wed, 28 Mar 2018 10:21:39 -0700 Subject: [PATCH] osd: Fix stale scrub stats when a primary takes over Fixes: http://tracker.ceph.com/issues/23267 Signed-off-by: David Zafman (cherry picked from commit 5cfb8241f482ed53c63bb97262425b2acb733d7d) Conflicts: src/osd/PG.cc (trivial) test/Makefile.am (Add to make check for Jewel) test/osd/CMakeLists.txt (Add to make check for Jewel) src/test/osd/osd-scrub-test.sh (from qa/standalone/scrub/osd-scrub-test.sh) Different location of ceph-helpers.sh No manager so remove run_mgr Remove default rbd pool Use 4 OSDs because remapped PG doesn't go clean --- src/osd/PG.cc | 6 ++ src/test/Makefile.am | 1 + src/test/osd/CMakeLists.txt | 1 + src/test/osd/osd-scrub-test.sh | 125 +++++++++++++++++++++++++++++++++ 4 files changed, 133 insertions(+) create mode 100755 src/test/osd/osd-scrub-test.sh diff --git a/src/osd/PG.cc b/src/osd/PG.cc index d05dc6bc7f170..4314b9b0b7fd0 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -5410,6 +5410,12 @@ void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo) if (info.history.merge(oinfo.history)) dirty_info = true; reg_next_scrub(); + if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) { + info.stats.stats.sum.num_scrub_errors = 0; + info.stats.stats.sum.num_shallow_scrub_errors = 0; + info.stats.stats.sum.num_deep_scrub_errors = 0; + dirty_info = true; + } if (last_complete_ondisk.epoch >= info.history.last_epoch_started) { // DEBUG: verify that the snaps are empty in snap_mapper diff --git a/src/test/Makefile.am b/src/test/Makefile.am index 324a27e8c823f..07d0fadb7e979 100644 --- a/src/test/Makefile.am +++ b/src/test/Makefile.am @@ -121,6 +121,7 @@ check_SCRIPTS += \ test/mon/test_pool_quota.sh \ test/osd/osd-scrub-repair.sh \ test/osd/osd-scrub-snaps.sh \ + test/osd/osd-scrub-test.sh \ test/osd/osd-recovery-scrub.sh \ test/osd/osd-config.sh \ 
test/osd/osd-reuse-id.sh \ diff --git a/src/test/osd/CMakeLists.txt b/src/test/osd/CMakeLists.txt index 4dcc80ccfa046..e32fe819323ec 100644 --- a/src/test/osd/CMakeLists.txt +++ b/src/test/osd/CMakeLists.txt @@ -22,6 +22,7 @@ add_ceph_test(osd-reactivate.sh ${CMAKE_CURRENT_SOURCE_DIR}/osd-reactivate.sh) add_ceph_test(osd-reuse-id.sh ${CMAKE_CURRENT_SOURCE_DIR}/osd-reuse-id.sh) add_ceph_test(osd-scrub-repair.sh ${CMAKE_CURRENT_SOURCE_DIR}/osd-scrub-repair.sh) add_ceph_test(osd-scrub-snaps.sh ${CMAKE_CURRENT_SOURCE_DIR}/osd-scrub-snaps.sh) +add_ceph_test(osd-scrub-test.sh ${CMAKE_CURRENT_SOURCE_DIR}/osd-scrub-test.sh) add_ceph_test(osd-recovery-scrub.sh ${CMAKE_CURRENT_SOURCE_DIR}/osd-recovery-scrub.sh) #osd-copy-from.sh needs to be run out of ${CMAKE_RUNTIME_OUTPUT_DIRECTORY} diff --git a/src/test/osd/osd-scrub-test.sh b/src/test/osd/osd-scrub-test.sh new file mode 100755 index 0000000000000..a5392b8c6bb45 --- /dev/null +++ b/src/test/osd/osd-scrub-test.sh @@ -0,0 +1,125 @@ +#!/usr/bin/env bash +# +# Copyright (C) 2018 Red Hat +# +# Author: David Zafman +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. 
+# +source $CEPH_ROOT/qa/workunits/ceph-helpers.sh +# run DIR [FUNC...]: export CEPH_MON and CEPH_ARGS, then call each FUNC with DIR (default: every TEST_* name that sed extracts from the output of set); returns 1 as soon as one of them fails +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7138" # git grep '\<7138\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--mon-host=$CEPH_MON " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + $func $dir || return 1 + done +} + +# In Jewel the pool id and the resulting osdmap are different. +# Also, remapped PGs aren't clean, so this test needs a fourth OSD. +# 3,1,0 -> 1,0,2 (NOTE(review): presumably the PG's OSD set before and after the primary is marked out; confirm) + +function TEST_scrub_test() { + local dir=$1 + local poolname=test + local OSDS=4 + local objects=15 + + TESTDATA="testdata.$$" + + setup $dir || return 1 + run_mon $dir a --osd_pool_default_size=3 || return 1 + for osd in $(seq 0 $(expr $OSDS - 1)) + do + run_osd $dir $osd || return 1 + done + + ceph osd pool delete rbd rbd --yes-i-really-really-mean-it + + # Create a pool with a single PG (pg_num and pgp_num are both 1) + ceph osd pool create $poolname 1 1 + wait_for_clean || return 1 + poolid=$(ceph osd dump | grep "^pool.*[']${poolname}[']" | awk '{ print $2 }') + + dd if=/dev/urandom of=$TESTDATA bs=1032 count=1 + for i in `seq 1 $objects` + do + rados -p $poolname put obj${i} $TESTDATA + done + rm -f $TESTDATA + + local primary=$(ceph --format json osd map test obj1 | jq '.up[0]') + local otherosd=$(ceph --format json osd map test obj1 | jq '.up[1]') + local anotherosd=$(ceph --format json osd map test obj1 | jq '.up[2]') + + ceph pg dump pgs + objectstore_tool $dir $anotherosd obj1 set-bytes /etc/fstab # overwrite obj1's data on the OSD at up[2] only; the deep scrub below is expected to mark the PG inconsistent + + local pgid="${poolid}.0" + pg_deep_scrub "$pgid" || return 1 + sleep 5 + + ceph pg $pgid query | grep num_scrub_errors + ceph pg dump pgs | grep ^${pgid} | grep -q -- +inconsistent || return 1 + test "$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_scrub_errors')" = "2" || return 1 + + ceph osd out $primary # take the original primary (up[0]) out; the checks after the next deep scrub still expect 2 scrub errors + sleep 3 + wait_for_clean || return 1 + + ceph pg dump pgs + pg_deep_scrub "$pgid" || return 1 + sleep 5 + + ceph
pg dump pgs + ceph pg $pgid query | grep num_scrub_errors + test "$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_scrub_errors')" = "2" || return 1 + #test "$(ceph pg $pgid query | jq '.peer_info[0].stats.stat_sum.num_scrub_errors')" = "2" || return 1 + ceph pg dump pgs | grep ^${pgid} | grep -q -- +inconsistent || return 1 + + ceph osd in $primary + sleep 4 + wait_for_clean || return 1 + + repair "$pgid" || return 1 + wait_for_clean || return 1 + + ceph pg $pgid query | grep num_scrub_errors + # Precondition for the real test: after the repair, the previous primary (queried here as peer_info[1]) still holds the old error count + test "$(ceph pg $pgid query | jq '.peer_info[1].stats.stat_sum.num_scrub_errors')" = "2" || return 1 + ceph pg dump pgs | grep ^${pgid} | grep -vq -- +inconsistent || return 1 + + ceph osd out $primary # take the previous primary out again; the checks below expect 0 scrub errors in info and in both queried peers + sleep 4 + wait_for_clean || return 1 + + ceph pg $pgid query | grep num_scrub_errors + test "$(ceph pg $pgid query | jq '.info.stats.stat_sum.num_scrub_errors')" = "0" || return 1 + test "$(ceph pg $pgid query | jq '.peer_info[0].stats.stat_sum.num_scrub_errors')" = "0" || return 1 + test "$(ceph pg $pgid query | jq '.peer_info[1].stats.stat_sum.num_scrub_errors')" = "0" || return 1 + ceph pg dump pgs | grep ^${pgid} | grep -vq -- +inconsistent || return 1 + + teardown $dir || return 1 +} + +main osd-scrub-test "$@" + +# Local Variables: +# compile-command: "cd ../.. ; make -j4 && \ +# test/osd/osd-scrub-test.sh" -- 2.39.5