From 2feb1127c9d9b8175a79f80e00b85c2e8460c862 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 4 Jun 2018 12:51:11 -0500 Subject: [PATCH] osd/PrimaryLogPG: fix on_local_recover crash on stray clone If there is a stray clone (one that does not appear in the SnapSet) and we do any sort of recovery on it the OSD will crash. Log an error instead but continue. This addresses a problem where a cluster has both (1) an unexpected clone and (2) the clone is not present on all replicas. Doing repair on that PG will both not fix the unexpected clone and also cause the remaining OSDs to crash trying to recover it. Include a test. Fixes: https://tracker.ceph.com/issues/24396 Signed-off-by: Sage Weil (cherry picked from commit 154330fd68d952a4e1b972891f02b6dc9c355424) --- qa/standalone/scrub/osd-unexpected-clone.sh | 88 +++++++++++++++++++++ src/osd/PrimaryLogPG.cc | 17 ++-- 2 files changed, 98 insertions(+), 7 deletions(-) create mode 100755 qa/standalone/scrub/osd-unexpected-clone.sh diff --git a/qa/standalone/scrub/osd-unexpected-clone.sh b/qa/standalone/scrub/osd-unexpected-clone.sh new file mode 100755 index 0000000000000..b969405f82bba --- /dev/null +++ b/qa/standalone/scrub/osd-unexpected-clone.sh @@ -0,0 +1,88 @@ +#!/usr/bin/env bash +# +# Copyright (C) 2015 Intel +# Copyright (C) 2014, 2015 Red Hat +# +# Author: Xiaoxi Chen +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. 
+#
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+# run DIR [FUNC...]: run each listed test function (default: all TEST_*) between setup and teardown.
+function run() {
+    local dir=$1
+    shift
+    # Export the monitor address and CEPH_ARGS (fresh fsid, auth disabled, mon host).
+    export CEPH_MON="127.0.0.1:7144" # git grep '\<7144\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+    CEPH_ARGS+="--mon-host=$CEPH_MON "
+    # Remaining arguments name the tests; with none, collect every TEST_* function from `set`.
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        setup $dir || return 1
+        $func $dir || return 1
+        teardown $dir || return 1
+    done
+}
+# TEST_recover_unexpected DIR: re-inject a removed clone on one OSD, repair, and check no OSD died.
+function TEST_recover_unexpected() {
+    local dir=$1
+    # Start one mon (a), one mgr (x) and three OSDs (0-2).
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+    run_osd $dir 0 || return 1
+    run_osd $dir 1 || return 1
+    run_osd $dir 2 || return 1
+    # Write object foo, snapshot the pool, overwrite foo. NOTE(review): presumably yields the snapid 1 clone - confirm.
+    ceph osd pool create foo 1
+    rados -p foo put foo /etc/passwd
+    rados -p foo mksnap snap
+    rados -p foo put foo /etc/motd
+
+    wait_for_clean || return 1
+
+    local osd=$(get_primary foo foo)
+    # Pick the snapid 1 listing entry for foo and save its "_" attr and data to ./_ and ./data.
+    JSON=`objectstore_tool $dir $osd --op list foo | grep snapid.:1`
+    echo "JSON is $JSON"
+    rm -f _ data
+    objectstore_tool $dir $osd "$JSON" get-attr _ > _
+    objectstore_tool $dir $osd "$JSON" get-bytes data
+    # Remove the snapshot. NOTE(review): the sleep presumably lets the clone be trimmed - confirm.
+    rados -p foo rmsnap snap
+
+    sleep 5
+    # Write the saved data and attr back on that one OSD only (the stray-clone case this test targets).
+    objectstore_tool $dir $osd "$JSON" set-bytes data
+    objectstore_tool $dir $osd "$JSON" set-attr _ _
+
+    sleep 5
+    # Repair the PG; "1.0" assumes pool foo received pool id 1 - TODO confirm.
+    ceph pg repair 1.0
+
+    sleep 10
+
+    ceph log last
+
+    # make sure osds are still up
+    timeout 60 ceph tell osd.0 version || return 1
+    timeout 60 ceph tell osd.1 version || return 1
+    timeout 60 ceph tell osd.2 version || return 1
+}
+
+
+main osd-unexpected-clone "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. 
; make -j4 && test/osd/osd-bench.sh" +# End: diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 848b2dd12b1a4..ffadcde38c8d9 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -361,13 +361,16 @@ void PrimaryLogPG::on_local_recover( set snaps; dout(20) << " snapset " << recovery_info.ss << dendl; auto p = recovery_info.ss.clone_snaps.find(hoid.snap); - assert(p != recovery_info.ss.clone_snaps.end()); // hmm, should we warn? - snaps.insert(p->second.begin(), p->second.end()); - dout(20) << " snaps " << snaps << dendl; - snap_mapper.add_oid( - recovery_info.soid, - snaps, - &_t); + if (p != recovery_info.ss.clone_snaps.end()) { + snaps.insert(p->second.begin(), p->second.end()); + dout(20) << " snaps " << snaps << dendl; + snap_mapper.add_oid( + recovery_info.soid, + snaps, + &_t); + } else { + derr << __func__ << " " << hoid << " had no clone_snaps" << dendl; + } } if (!is_delete && pg_log.get_missing().is_missing(recovery_info.soid) && pg_log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) { -- 2.39.5