From d3de69f8a531bd1c2e95596b3520ab1d4f2a04b6 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 17 Jul 2014 13:15:45 +0100 Subject: [PATCH] mds: fix journal reformat failure in standbyreplay In the 0.82 release, standbyreplay MDS daemons would try to reformat the journal if they saw an older version on disk, where this should have only been done by the active MDS for the rank. Depending on timing, this could cause fatal corruption of the journal. This change handles the following cases: * only do reformat if not in standbyreplay (else raise EAGAIN to keep trying until an active mds reformats it) * if journal header goes away while in standbyreplay then raise EAGAIN (handle rewrite happening in background) * if journal version is greater than the max supported, suicide Fixes: #8811 Signed-off-by: John Spray (cherry picked from commit 5438500af8979fda32e61714ae40b71c7ffdfd15) --- src/mds/MDLog.cc | 65 ++++++++++++++++++++++++++++++-------------- src/osdc/Journaler.h | 2 ++ 2 files changed, 47 insertions(+), 20 deletions(-) diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc index d9dfd4ad2d6d4..81101a8ae0667 100644 --- a/src/mds/MDLog.cc +++ b/src/mds/MDLog.cc @@ -542,6 +542,14 @@ void MDLog::_recovery_thread(Context *completion) // rewrite failed part way through. Erase the back journal // to clean up. 
if (jp.back) { + if (mds->is_standby_replay()) { + dout(1) << "Journal " << jp.front << " is being rewritten, " + << "cannot replay in standby until an active MDS completes rewrite" << dendl; + mds->mds_lock.Lock(); + completion->complete(-EAGAIN); + mds->mds_lock.Unlock(); + return; + } dout(1) << "Erasing journal " << jp.back << dendl; C_SaferCond erase_waiter; Journaler back(jp.back, mds->mdsmap->get_metadata_pool(), CEPH_FS_ONDISK_MAGIC, @@ -594,7 +602,13 @@ void MDLog::_recovery_thread(Context *completion) } /* Check whether the front journal format is acceptable or needs re-write */ - if (front_journal->get_stream_format() >= g_conf->mds_journal_format) { + if (front_journal->get_stream_format() > JOURNAL_FORMAT_MAX) { + dout(0) << "Journal " << jp.front << " is in unknown format " << front_journal->get_stream_format() + << ", does this MDS daemon require upgrade?" << dendl; + mds->mds_lock.Lock(); + completion->complete(-EINVAL); + mds->mds_lock.Unlock(); + } else if (front_journal->get_stream_format() >= g_conf->mds_journal_format) { /* Great, the journal is of current format and ready to rock, hook * it into this->journaler and complete */ journaler = front_journal; @@ -603,12 +617,22 @@ void MDLog::_recovery_thread(Context *completion) completion->complete(0); mds->mds_lock.Unlock(); } else { - /* Hand off to reformat routine, which will ultimately set the - * completion when it has done its thing */ - dout(1) << "Journal " << jp.front << " has old format " - << front_journal->get_stream_format() << ", it will now be updated" << dendl; + if (mds->is_standby_replay()) { + /* We must not try to rewrite in standby replay mode, because + * we do not have exclusive access to the log */ + dout(1) << "Journal " << jp.front << " has old format, " + << "cannot replay in standby until an active MDS rewrites it" << dendl; + mds->mds_lock.Lock(); + completion->complete(-EAGAIN); + mds->mds_lock.Unlock(); + } else { + /* Hand off to reformat routine, which will 
ultimately set the + * completion when it has done its thing */ + dout(1) << "Journal " << jp.front << " has old format " + << front_journal->get_stream_format() << ", it will now be updated" << dendl; - _reformat_journal(jp, front_journal, completion); + _reformat_journal(jp, front_journal, completion); + } } } @@ -782,22 +806,23 @@ void MDLog::_replay_thread() * the MDS is going to either shut down or restart when * we return this error, doing it synchronously is fine * -- as long as we drop the main mds lock--. */ - Mutex mylock("MDLog::_replay_thread lock"); - Cond cond; - bool done = false; - int err = 0; - journaler->reread_head(new C_SafeCond(&mylock, &cond, &done, &err)); + C_SaferCond reread_fin; + journaler->reread_head(&reread_fin); mds->mds_lock.Unlock(); - mylock.Lock(); - while (!done) - cond.Wait(mylock); - mylock.Unlock(); - if (err) { // well, crap - dout(0) << "got error while reading head: " << cpp_strerror(err) - << dendl; - mds->suicide(); - } + int err = reread_fin.wait(); mds->mds_lock.Lock(); + if (err) { + if (err == -ENOENT && mds->is_standby_replay()) { + r = -EAGAIN; + dout(1) << "Journal header went away while in standby replay, journal rewritten?" + << dendl; + break; + } else { + dout(0) << "got error while reading head: " << cpp_strerror(err) + << dendl; + mds->suicide(); + } + } standby_trim_segments(); if (journaler->get_read_pos() < journaler->get_expire_pos()) { dout(0) << "expire_pos is higher than read_pos, returning EAGAIN" << dendl; diff --git a/src/osdc/Journaler.h b/src/osdc/Journaler.h index cce8db31f1755..0a0c868f82390 100644 --- a/src/osdc/Journaler.h +++ b/src/osdc/Journaler.h @@ -70,6 +70,8 @@ typedef __u8 stream_format_t; #define JOURNAL_FORMAT_RESILIENT 1 #define JOURNAL_ENVELOPE_RESILIENT (sizeof(uint32_t) + sizeof(uint64_t) + sizeof(uint64_t)) +// Most recent format which we may try to read +#define JOURNAL_FORMAT_MAX 1 /** * Represents a collection of entries serialized in a byte stream. -- 2.39.5