mds: handle read/replay errors in MDLog with damaged()

author John Spray <john.spray@redhat.com>

Thu, 19 Mar 2015 20:42:59 +0000 (20:42 +0000)

committer John Spray <john.spray@redhat.com>

Mon, 23 Mar 2015 18:20:11 +0000 (18:20 +0000)
author John Spray <john.spray@redhat.com>
Thu, 19 Mar 2015 20:42:59 +0000 (20:42 +0000)
committer John Spray <john.spray@redhat.com>
Mon, 23 Mar 2015 18:20:11 +0000 (18:20 +0000)
diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc

index ebc474a8081b74eb5e876c5a639e320fc8042d46..52b58acccc6afbcda820a532f137b3829bca4aae 100644 (file)
--- a/src/mds/MDLog.cc
+++ b/src/mds/MDLog.cc
@@ -819,9 +819,10 @@ void MDLog::_recovery_thread(MDSInternalContextBase *completion)
      // Nothing graceful we can do for this
      assert(write_result >= 0);
    } else if (read_result != 0) {
-    // No graceful way of handling this: give up and leave it for support
-    // to work out why RADOS preventing access.
-    assert(0);
+    mds->clog->error() << "failed to read JournalPointer: " << read_result
+                       << " (" << cpp_strerror(read_result) << ")";
+    mds->damaged();
+    assert(0);  // Should be unreachable because damaged() calls respawn()
    }
  
    // If the back pointer is non-null, that means that a journal
@@ -1108,15 +1109,25 @@ void MDLog::_replay_thread()
        r = journaler->get_error();
        dout(0) << "_replay journaler got error " << r << ", aborting" << dendl;
        if (r == -ENOENT) {
-       // journal has been trimmed by somebody else?
-       assert(journaler->is_readonly());
-       r = -EAGAIN;
+        if (journaler->is_readonly()) {
+          // journal has been trimmed by somebody else
+          r = -EAGAIN;
+        } else {
+          mds->clog->error() << "missing journal object";
+          mds->damaged();
+          assert(0);  // Should be unreachable because damaged() calls respawn()
+        }
        } else if (r == -EINVAL) {
          if (journaler->get_read_pos() < journaler->get_expire_pos()) {
            // this should only happen if you're following somebody else
-          assert(journaler->is_readonly());
-          dout(0) << "expire_pos is higher than read_pos, returning EAGAIN" << dendl;
-          r = -EAGAIN;
+          if(journaler->is_readonly()) {
+            dout(0) << "expire_pos is higher than read_pos, returning EAGAIN" << dendl;
+            r = -EAGAIN;
+          } else {
+            mds->clog->error() << "invalid journaler offsets";
+            mds->damaged();
+            assert(0);  // Should be unreachable because damaged() calls respawn()
+          }
          } else {
            /* re-read head and check it
             * Given that replay happens in a separate thread and
@@ -1135,7 +1146,11 @@ void MDLog::_replay_thread()
              } else {
                  dout(0) << "got error while reading head: " << cpp_strerror(err)
                          << dendl;
-                mds->suicide();
+
+                mds->clog->error() << "error reading journal header";
+                mds->damaged();
+                assert(0);  // Should be unreachable because damaged() calls
+                            // respawn()
              }
            }
           standby_trim_segments();
@@ -1171,8 +1186,17 @@ void MDLog::_replay_thread()
        bl.hexdump(*_dout);
        *_dout << dendl;
  
-      assert(!!"corrupt log event" == g_conf->mds_log_skip_corrupt_events);
-      continue;
+      mds->clog->error() << "corrupt journal event at " << pos << "~"
+                         << bl.length() << " / "
+                         << journaler->get_write_pos();
+      if (g_conf->mds_log_skip_corrupt_events) {
+        continue;
+      } else {
+        mds->damaged();
+        assert(0);  // Should be unreachable because damaged() calls
+                    // respawn()
+      }
+
      }
      le->set_start_off(pos);
author	John Spray <john.spray@redhat.com>
	Thu, 19 Mar 2015 20:42:59 +0000 (20:42 +0000)
committer	John Spray <john.spray@redhat.com>
	Mon, 23 Mar 2015 18:20:11 +0000 (18:20 +0000)