From: Sage Weil Date: Fri, 26 May 2017 20:07:20 +0000 (-0400) Subject: mon/OSDMonitor: change info in 'osd failed' messages X-Git-Tag: v12.1.0~342^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=4cb882e026355b6f064f2bde2ef19f9d2f04259a;p=ceph-ci.git mon/OSDMonitor: change info in 'osd failed' messages - put 'osd.NNN failed' at front - drop the ip address - include the crush location - be consistent: "osd.$id failed ($crush_location) ($reason)" 2017-05-26 16:05:21.573460 mon.0 [INF] osd.0 failed (root=default,host=gnit) (connection refused reported by osd.1) 2017-05-26 16:06:17.198225 mon.0 [INF] osd.1 failed (root=default,host=gnit) (2 reporters from different osd after 20.098778 >= grace 20.000000) Signed-off-by: Sage Weil --- diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index e28621cffaf..9650e8098c0 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -1964,8 +1964,12 @@ bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi) << " down" << dendl; pending_inc.new_state[target_osd] = CEPH_OSD_UP; - mon->clog->info() << osdmap.get_inst(target_osd) << " failed (" - << (int)reporters_by_subtree.size() << " reporters from different " + mon->clog->info() << "osd." << target_osd << " failed (" + << osdmap.crush->get_full_location_ordered_string( + target_osd) + << ") (" + << (int)reporters_by_subtree.size() + << " reporters from different " << reporter_subtree_level << " after " << failed_for << " >= grace " << grace << ")"; return true; @@ -1973,7 +1977,7 @@ bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi) return false; } -void OSDMonitor::force_failure(utime_t now, int target_osd) +void OSDMonitor::force_failure(utime_t now, int target_osd, int by) { // already pending failure? if (pending_inc.new_state.count(target_osd) && @@ -1985,7 +1989,9 @@ void OSDMonitor::force_failure(utime_t now, int target_osd) dout(1) << " we're forcing failure of osd." 
<< target_osd << dendl; pending_inc.new_state[target_osd] = CEPH_OSD_UP; - mon->clog->info() << osdmap.get_inst(target_osd) << " failed (forced)"; + mon->clog->info() << "osd." << target_osd << " failed (" + << osdmap.crush->get_full_location_ordered_string(target_osd) + << ") (connection refused reported by osd." << by << ")"; return; } @@ -2012,7 +2018,7 @@ bool OSDMonitor::prepare_failure(MonOpRequestRef op) if (m->is_immediate()) { mon->clog->debug() << m->get_target() << " reported immediately failed by " << m->get_orig_source_inst(); - force_failure(now, target_osd); + force_failure(now, target_osd, reporter); return true; } mon->clog->debug() << m->get_target() << " reported failed by " @@ -3458,7 +3464,9 @@ bool OSDMonitor::handle_osd_timeouts(const utime_t &now, } else if (can_mark_down(i)) { utime_t diff = now - t->second; if (diff > timeo) { - mon->clog->info() << "osd." << i << " marked down after no pg stats for " << diff << "seconds"; + mon->clog->info() << "osd." << i << " failed (" + << osdmap.crush->get_full_location_ordered_string(i) + << ") (pg stats for " << diff << "seconds)"; derr << "no osd or pg stats from osd." << i << " since " << t->second << ", " << diff << " seconds ago. marking down" << dendl; pending_inc.new_state[i] = CEPH_OSD_UP; diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index 44f013f27f7..1a9f74e3f31 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -147,7 +147,7 @@ public: bool check_failures(utime_t now); bool check_failure(utime_t now, int target_osd, failure_info_t& fi); - void force_failure(utime_t now, int target_osd); + void force_failure(utime_t now, int target_osd, int by); // the time of last msg(MSG_ALIVE and MSG_PGTEMP) proposed without delay utime_t last_attempted_minwait_time;