From f417eda9a448082d50225674e4c24dccff8b4f12 Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Fri, 22 May 2015 15:54:22 +0800 Subject: [PATCH] tests/test-erasure-code: spin off eio tests into another testsuite * since the eio tests crash some of the OSD nodes, before the change, the tests try to undo the crash before moving on, so it won't interfere with following tests. a more robust/clean way to do this is to isolate individual tests in a sandbox, so each eio test will have its own: setup + inject + verify crash + teardown cycle. this change helps to remove the cleanup/undo steps in individual tests. * update the disabled tests accordingly. * use a minimum set of OSDs and R-S(2,1) for the testing to speed up the test. * add the new testsuite to check_SCRIPTS Fixes: #11693 Signed-off-by: Kefu Chai --- src/test/erasure-code/Makefile.am | 3 +- src/test/erasure-code/test-erasure-code.sh | 144 -------------- src/test/erasure-code/test-erasure-eio.sh | 217 +++++++++++++++++++++ 3 files changed, 219 insertions(+), 145 deletions(-) create mode 100755 src/test/erasure-code/test-erasure-eio.sh diff --git a/src/test/erasure-code/Makefile.am b/src/test/erasure-code/Makefile.am index ad1780c042b6..327660b200ee 100644 --- a/src/test/erasure-code/Makefile.am +++ b/src/test/erasure-code/Makefile.am @@ -2,7 +2,8 @@ if ENABLE_SERVER if WITH_OSD check_SCRIPTS += \ - test/erasure-code/test-erasure-code.sh + test/erasure-code/test-erasure-code.sh \ + test/erasure-code/test-erasure-eio.sh noinst_HEADERS += \ test/erasure-code/ceph_erasure_code_benchmark.h diff --git a/src/test/erasure-code/test-erasure-code.sh b/src/test/erasure-code/test-erasure-code.sh index 76e98ae5a187..f84ac71e2f5c 100755 --- a/src/test/erasure-code/test-erasure-code.sh +++ b/src/test/erasure-code/test-erasure-code.sh @@ -295,150 +295,6 @@ function TEST_chunk_mapping() { ./ceph osd erasure-code-profile rm remap-profile } -# -# This test case tries to validate the following behavior: -# For object on EC pool, if 
there is one shard having read error ( -# either primary or replica), it will trigger OSD crash. -# -function TEST_rados_get_dataeio_no_subreadall_jerasure() { - local dir=$1 - - # check if osd_pool_erasure_code_subread_all is enabled or not - # turn it off if it is enabled - # - local subread=1 - CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-osd.0.asok config get \ - "osd_pool_erasure_code_subread_all" | grep "true" - if (( $? == 0 )); then - subread=0 - for id in $(seq 0 10) ; do - kill_osd_daemon $dir $id || return 1 - run_osd $dir $id "--osd_pool_erasure_code_subread_all=false" || return 1 - done - fi - - local poolname=pool-jerasure - local profile=profile-jerasure - ./ceph osd erasure-code-profile set $profile \ - plugin=jerasure \ - k=4 m=2 \ - ruleset-failure-domain=osd || return 1 - ./ceph osd pool create $poolname 12 12 erasure $profile \ - || return 1 - - # inject eio on primary OSD (0), then peer OSD (1) - # OSD with eio injection will crash at reading object - # - for shardid in 0 1; do - local objname=obj-eio-$$-$shardid - local -a initial_osds=($(get_osds $poolname $objname)) - local last=$((${#initial_osds[@]} - 1)) - - CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-osd.${initial_osds[$shardid]}.asok config set \ - filestore_debug_inject_read_err true || return 1 - CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-osd.${initial_osds[$shardid]}.asok injectdataerr \ - $poolname $objname $shardid || return 1 - CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-osd.${initial_osds[$shardid]}.asok config set \ - filestore_fail_eio false || return 1 - rados_put_get $dir $poolname $objname || return 1 - wait_for_osd down ${initial_osds[$shardid]} || return 1 - - # restart the crashed OSDs, note it could crash multiple OSDs, - # since after the primary's crash, the second replica could be - # promoted as primary and crash again due to read error - if (( $subread == 0 )); then - for s in "${initial_osds[@]}"; do - activate_osd $dir $s 
--osd_pool_erasure_code_subread_all=false || return 1 - done - else - for s in "${initial_osds[@]}"; do - activate_osd $dir $s || return 1 - done - fi - wait_for_clean - done - - delete_pool $poolname - ./ceph osd erasure-code-profile rm $profile -} - -# this test case is aimed to test the fix of https://github.com/ceph/ceph/pull/2952 -# this test case can test both client read and recovery read on EIO -# but at this moment, above pull request ONLY resolves client read on EIO -# so this case will fail at function *rados_put_get* when one OSD out -# so disable this case for now until both crashes of client read and recovery read -# on EIO to be fixed -# -#function TEST_rados_get_dataeio_subreadall_jerasure() { -# local dir=$1 -# -# # check if osd_pool_erasure_code_subread_all is enabled or not -# # turn it on if it is disabled -# # skip this case if osd_pool_erasure_code_subread_all is not supported -# # -# CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-osd.0.asok config get \ -# "osd_pool_erasure_code_subread_all" | grep "error" -# if (( $? == 0 )); then -# echo "Skip this case because osd_pool_erasure_code_subread_all is not supported" -# return 0 -# fi -# -# # make sure osd_pool_erasure_code_subread_all is true on every OSD -# # -# for id in $(seq 0 10) ; do -# CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-osd.$id.asok config get \ -# "osd_pool_erasure_code_subread_all" | grep "true" -# if (( $? 
!= 0 )); then -# kill_osd_daemon $dir $id || return 1 -# run_osd $dir $id "--osd_pool_erasure_code_subread_all=true" || return 1 -# fi -# done -# -# local poolname=pool-jerasure -# local profile=profile-jerasure -# ./ceph osd erasure-code-profile set $profile \ -# plugin=jerasure \ -# k=4 m=2 \ -# ruleset-failure-domain=osd || return 1 -# ./ceph osd pool create $poolname 12 12 erasure $profile \ -# || return 1 -# -# # inject eio on primary OSD (0), then peer OSD (1) -# # primary OSD will not crash at reading object but pg will be marked as inconsistent -# # -# for shardid in 0 1; do -# local objname=obj-eio-$$-$shardid -# local -a initial_osds=($(get_osds $poolname $objname)) -# local last=$((${#initial_osds[@]} - 1)) -# local pg=$(get_pg $poolname $objname) -# -# CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-osd.${initial_osds[$shardid]}.asok config set \ -# filestore_debug_inject_read_err true || return 1 -# CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-osd.${initial_osds[$shardid]}.asok injectdataerr \ -# $poolname $objname $shardid || return 1 -# rados_put_get $dir $poolname $objname || return 1 -# check_osd_status ${initial_osds[0]} "up" || return 1 -# -# # the reason to skip this check when current shardid != 0 is that the first k chunks returned is not -# # always containing current shardid, so this pg may not be marked as inconsistent -# # However, primary OSD (when shardid == 0) is always the faster one normally, so we can check pg status -# if (( $shardid == 0 )); then -# check_pg_status $pg "inconsistent" || return 1 -# fi -# -# # recreate crashed OSD with the same id since I don't know how to restart it :( -# if (( $shardid != 0 )); then -# kill_osd_daemon $dir ${initial_osds[0]} || return 1 -# run_osd $dir ${initial_osds[0]} "--osd_pool_erasure_code_subread_all=true" || return 1 -# fi -# kill_osd_daemon $dir ${initial_osds[$shardid]} || return 1 -# run_osd $dir ${initial_osds[$shardid]} "--osd_pool_erasure_code_subread_all=true" || return 1 -# done -# -# 
delete_pool $poolname -# ./ceph osd erasure-code-profile rm $profile -#} - main test-erasure-code "$@" # Local Variables: diff --git a/src/test/erasure-code/test-erasure-eio.sh b/src/test/erasure-code/test-erasure-eio.sh new file mode 100755 index 000000000000..9a116fbe3455 --- /dev/null +++ b/src/test/erasure-code/test-erasure-eio.sh @@ -0,0 +1,217 @@ +#!/bin/bash +# +# Copyright (C) 2015 Red Hat +# +# +# Author: Kefu Chai +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# + +source test/ceph-helpers.sh + +function run() { + local dir=$1 + shift + + export CEPH_MON="127.0.0.1:7112" + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + CEPH_ARGS+="--enable-experimental-unrecoverable-data-corrupting-features=shec " + CEPH_ARGS+="--mon-host=$CEPH_MON " + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + run_mon $dir a || return 1 + # check that erasure code plugins are preloaded + CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-mon.a.asok log flush || return 1 + grep 'load: jerasure.*lrc' $dir/mon.a.log || return 1 + $func $dir || return 1 + teardown $dir || return 1 + done +} + +function setup_osds() { + local subread=$1 + + for id in $(seq 0 3) ; do + # TODO: the feature of "osd-pool-erasure-code-subread-all" is not yet supported. 
+ if -n osd_pool_erasure_code_subread_all__is_supported; then + run_osd $dir $id "--osd-pool-erasure-code-subread-all=$subread" || return 1 + else + run_osd $dir $id || return 1 + fi + done + wait_for_clean || return 1 + + # check that erasure code plugins are preloaded + CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-osd.0.asok log flush || return 1 + grep 'load: jerasure.*lrc' $dir/osd.0.log || return 1 +} + +function create_erasure_coded_pool() { + local poolname=$1 + + ./ceph osd erasure-code-profile set myprofile \ + plugin=jerasure \ + k=2 m=1 \ + ruleset-failure-domain=osd || return 1 + ./ceph osd pool create $poolname 1 1 erasure myprofile \ + || return 1 + wait_for_clean || return 1 +} + +function delete_pool() { + local poolname=$1 + + ./ceph osd pool delete $poolname $poolname --yes-i-really-really-mean-it + ./ceph osd erasure-code-profile rm myprofile +} + +function rados_put_get() { + local dir=$1 + local poolname=$2 + local objname=${3:-SOMETHING} + + for marker in AAA BBB CCCC DDDD ; do + printf "%*s" 1024 $marker + done > $dir/ORIGINAL + # + # get and put an object, compare they are equal + # + ./rados --pool $poolname put $objname $dir/ORIGINAL || return 1 + ./rados --pool $poolname get $objname $dir/COPY || return 1 + diff $dir/ORIGINAL $dir/COPY || return 1 + rm $dir/COPY + # + # take out the first OSD used to store the object and + # check the object can still be retrieved, which implies + # recovery + # + local -a initial_osds=($(get_osds $poolname $objname)) + local last=$((${#initial_osds[@]} - 1)) + ./ceph osd out ${initial_osds[$last]} || return 1 + ! 
get_osds $poolname $objname | grep '\<'${initial_osds[$last]}'\>' || return 1 + ./rados --pool $poolname get $objname $dir/COPY || return 1 + diff $dir/ORIGINAL $dir/COPY || return 1 + ./ceph osd in ${initial_osds[$last]} || return 1 + + rm $dir/ORIGINAL +} + +function rados_get_data_eio() { + local dir=$1 + shift + local shard_id=$1 + shift + local osd_state=$1 + shift + + # inject eio into the specified shard + # OSD with eio injection will crash at reading object + # + local poolname=pool-jerasure + local objname=obj-eio-$$-$shard_id + local -a initial_osds=($(get_osds $poolname $objname)) + local osd_id=${initial_osds[$shard_id]} + local last=$((${#initial_osds[@]} - 1)) + # set_config osd $osd_id filestore_debug_inject_read_err true || return 1 + set_config osd $osd_id filestore_debug_inject_read_err true || return 1 + CEPH_ARGS='' ./ceph --admin-daemon $dir/ceph-osd.$osd_id.asok \ + injectdataerr $poolname $objname $shard_id || return 1 + set_config osd $osd_id filestore_fail_eio false || return 1 + + rados_put_get $dir $poolname $objname || return 1 + TIMEOUT=1 wait_for_osd $osd_state $osd_id || return 1 +} + +# +# These two test cases try to validate the following behavior: +# For object on EC pool, if there is one shard having read error ( +# either primary or replica), it will trigger OSD crash. 
+# +function TEST_rados_get_without_subreadall_eio_shard_0() { + local dir=$1 + setup_osds false || return 1 + + local poolname=pool-jerasure + create_erasure_coded_pool $poolname || return 1 + # inject eio on primary OSD (0) + local shard_id=0 + rados_get_data_eio $dir $shard_id down || return 1 + delete_pool $poolname +} + +function TEST_rados_get_without_subreadall_eio_shard_1() { + local dir=$1 + setup_osds false || return 1 + + local poolname=pool-jerasure + create_erasure_coded_pool $poolname || return 1 + # inject eio into replica OSD (1) + local shard_id=1 + rados_get_data_eio $dir $shard_id down || return 1 + delete_pool $poolname +} + + +: <<'DISABLED_TESTS' +# this test case is aimed to test the fix of https://github.com/ceph/ceph/pull/2952 +# this test case can test both client read and recovery read on EIO +# but at this moment, above pull request ONLY resolves client read on EIO +# so this case will fail at function *rados_put_get* when one OSD out +# so disable this case for now until both crashes of client read and recovery read +# on EIO to be fixed +# + +function TEST_rados_get_with_subreadall_eio_shard_0() { + local dir=$1 + local shard_id=0 + + setup_osds true || return 1 + + local poolname=pool-jerasure + create_erasure_coded_pool $poolname || return 1 + # inject eio on primary OSD (0) + local shard_id=0 + rados_get_data_eio $dir $shard_id up || return 1 + + check_pg_status $pg "inconsistent" || return 1 + delete_pool $poolname +} + +function TEST_rados_get_with_subreadall_eio_shard_1() { + local dir=$1 + local shard_id=0 + + setup_osds true || return 1 + + local poolname=pool-jerasure + create_erasure_coded_pool $poolname || return 1 + # inject eio on replica OSD (1) + local shard_id=1 + rados_get_data_eio $dir $shard_id up || return 1 + + # the reason to skip this check when current shardid != 0 is that the first + # k chunks returned is not always containing current shardid, so this pg may + # not be marked as inconsistent. 
However, primary OSD (when shard_id == 0) is + # always the faster one normally, so we can check pg status. + ## check_pg_status $pg "inconsistent" || return 1 + delete_pool $poolname +} +DISABLED_TESTS + +main test-erasure-eio "$@" + +# Local Variables: +# compile-command: "cd ../.. ; make -j4 && test/erasure-code/test-erasure-eio.sh" +# End: -- 2.47.3