git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mon/OSDMonitor: change info in 'osd failed' messages 15321/head
author: Sage Weil <sage@redhat.com>
Fri, 26 May 2017 20:07:20 +0000 (16:07 -0400)
committer: Sage Weil <sage@redhat.com>
Fri, 26 May 2017 20:07:20 +0000 (16:07 -0400)
- put 'osd.NNN failed' at front
- drop the ip address
- include the crush location
- be consistent:
  "osd.$id failed ($crush_location) ($reason)"

2017-05-26 16:05:21.573460 mon.0 [INF] osd.0 failed (root=default,host=gnit) (connection refused reported by osd.1)
2017-05-26 16:06:17.198225 mon.0 [INF] osd.1 failed (root=default,host=gnit) (2 reporters from different osd after 20.098778 >= grace 20.000000)

Signed-off-by: Sage Weil <sage@redhat.com>
src/mon/OSDMonitor.cc
src/mon/OSDMonitor.h

index e28621cffafda6770fdd6600f5232e85c7307fa7..9650e8098c0d4dcc4dbfe4ed4da7267b02379ee6 100644 (file)
@@ -1964,8 +1964,12 @@ bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
            << " down" << dendl;
     pending_inc.new_state[target_osd] = CEPH_OSD_UP;
 
-    mon->clog->info() << osdmap.get_inst(target_osd) << " failed ("
-                     << (int)reporters_by_subtree.size() << " reporters from different "
+    mon->clog->info() << "osd." << target_osd << " failed ("
+                     << osdmap.crush->get_full_location_ordered_string(
+                       target_osd)
+                     << ") ("
+                     << (int)reporters_by_subtree.size()
+                     << " reporters from different "
                      << reporter_subtree_level << " after "
                      << failed_for << " >= grace " << grace << ")";
     return true;
@@ -1973,7 +1977,7 @@ bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
   return false;
 }
 
-void OSDMonitor::force_failure(utime_t now, int target_osd)
+void OSDMonitor::force_failure(utime_t now, int target_osd, int by)
 {
   // already pending failure?
   if (pending_inc.new_state.count(target_osd) &&
@@ -1985,7 +1989,9 @@ void OSDMonitor::force_failure(utime_t now, int target_osd)
   dout(1) << " we're forcing failure of osd." << target_osd << dendl;
   pending_inc.new_state[target_osd] = CEPH_OSD_UP;
 
-  mon->clog->info() << osdmap.get_inst(target_osd) << " failed (forced)";
+  mon->clog->info() << "osd." << target_osd << " failed ("
+                   << osdmap.crush->get_full_location_ordered_string(target_osd)
+                   << ") (connection refused reported by osd." << by << ")";
   return;
 }
 
@@ -2012,7 +2018,7 @@ bool OSDMonitor::prepare_failure(MonOpRequestRef op)
     if (m->is_immediate()) {
       mon->clog->debug() << m->get_target() << " reported immediately failed by "
             << m->get_orig_source_inst();
-      force_failure(now, target_osd);
+      force_failure(now, target_osd, reporter);
       return true;
     }
     mon->clog->debug() << m->get_target() << " reported failed by "
@@ -3458,7 +3464,9 @@ bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
     } else if (can_mark_down(i)) {
       utime_t diff = now - t->second;
       if (diff > timeo) {
-       mon->clog->info() << "osd." << i << " marked down after no pg stats for " << diff << "seconds";
+       mon->clog->info() << "osd." << i << " failed ("
+                         << osdmap.crush->get_full_location_ordered_string(i)
+                         << ") (pg stats for " << diff << "seconds)";
        derr << "no osd or pg stats from osd." << i << " since " << t->second << ", " << diff
             << " seconds ago.  marking down" << dendl;
        pending_inc.new_state[i] = CEPH_OSD_UP;
index 44f013f27f756c15221ffb47c2c570ca9a592900..1a9f74e3f31414b70f124a35d181833dfdcd87a2 100644 (file)
@@ -147,7 +147,7 @@ public:
 
   bool check_failures(utime_t now);
   bool check_failure(utime_t now, int target_osd, failure_info_t& fi);
-  void force_failure(utime_t now, int target_osd);
+  void force_failure(utime_t now, int target_osd, int by);
 
   // the time of last msg(MSG_ALIVE and MSG_PGTEMP) proposed without delay
   utime_t last_attempted_minwait_time;