git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: fix last_complete adjustment after recovering an object
author Sage Weil <sage@newdream.net>
Mon, 24 Oct 2011 20:55:29 +0000 (13:55 -0700)
committer Sage Weil <sage@newdream.net>
Tue, 25 Oct 2011 05:50:43 +0000 (22:50 -0700)
After we recover each object, we try to raise the last_complete value
(and matching complete_to iterator).  If our log was purely a backlog, this
won't necessarily end up bringing last_complete all the way up to the
last_update value, and we'll fail an assert later.

With this change, if complete_to does reach the end of the log, we
fast-forward last_complete to last_update.

The crash we were hitting was in finish_recovery(), and looked something
like:

osd/PG.cc: In function 'void PG::finish_recovery(ObjectStore::Transaction&, std::list<Context*, std::allocator<Context*> >&)', in thread '0x7f4573df7700'
osd/PG.cc: 1800: FAILED assert(info.last_complete == info.last_update)
 ceph version 0.36-251-g6e29c28 (commit:6e29c2826066a7723ed05b60b8ac0433a04c3c13)
 1: (PG::finish_recovery(ObjectStore::Transaction&, std::list<Context*, std::allocator<Context*> >&)+0x8d) [0x6ff0ed]
 2: (PG::RecoveryState::Active::react(PG::RecoveryState::ActMap const&)+0x316) [0x729196]
 3: (boost::statechart::simple_state<PG::RecoveryState::Active, PG::RecoveryState::Primary, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base const&, void const*)+0x21b) [0x759c0b]
 4: (boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Initial, std::allocator<void>, boost::statechart::null_exception_translator>::process_event(boost::statechart::event_base const&)+0x8d) [0x7423dd]
 5: (PG::RecoveryState::handle_activate_map(PG::RecoveryCtx*)+0x183) [0x711f43]
 6: (OSD::activate_map(ObjectStore::Transaction&, std::list<Context*, std::allocator<Context*> >&)+0x674) [0x579884]
 7: (OSD::handle_osd_map(MOSDMap*)+0x2270) [0x57bd50]
 8: (OSD::_dispatch(Message*)+0x4d0) [0x596bb0]
 9: (OSD::ms_dispatch(Message*)+0x17b) [0x59803b]
 10: (SimpleMessenger::dispatch_entry()+0x9c2) [0x617562]
 11: (SimpleMessenger::DispatchThread::entry()+0x2c) [0x4a3dec]
 12: (Thread::_entry_func(void*)+0x12) [0x611a92]
 13: (()+0x7971) [0x7f457f87b971]
 14: (clone()+0x6d) [0x7f457e10b92d]

Fixes: #1609
Signed-off-by: Sage Weil <sage@newdream.net>
src/osd/ReplicatedPG.cc

index 267c3b20f4aa93093138dbdb6c10c9980fac80e7..7a73ed13b85f895033e608801e841ab9e0bd4680 100644 (file)
@@ -4099,9 +4099,21 @@ void ReplicatedPG::recover_primary_got(hobject_t oid, eversion_t v)
        info.last_complete = log.complete_to->version;
       log.complete_to++;
     }
-    dout(10) << "last_complete now " << info.last_complete << dendl;
-    if (log.complete_to != log.log.end())
-      dout(10) << " log.complete_to = " << log.complete_to->version << dendl;
+    if (log.complete_to != log.log.end()) {
+      dout(10) << "last_complete now " << info.last_complete
+              << " log.complete_to " << log.complete_to->version
+              << dendl;
+    } else {
+      dout(10) << "last_complete now " << info.last_complete
+              << " log.complete_to at end" << dendl;
+      assert(missing.num_missing() == 0);  // otherwise, complete_to was wrong.
+      if (info.last_complete != info.last_update) {
+       // this happens if the log we are recovering from was a
+       // backlog, and the most recent entry wasn't last_update.
+       info.last_complete = info.last_update;
+       dout(10) << "setting last_complete to last_update " << info.last_complete << dendl;
+      }
+    }
   }
 }