]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: do delayed activation after replay via a queue, not timer event
authorSage Weil <sage@newdream.net>
Tue, 16 Dec 2008 21:28:10 +0000 (13:28 -0800)
committerSage Weil <sage@newdream.net>
Tue, 16 Dec 2008 21:37:32 +0000 (13:37 -0800)
This avoids osd_lock dependency by using osd->timer.

src/osd/OSD.cc
src/osd/OSD.h
src/osd/PG.cc
src/osd/PG.h

index 523c94bfe21cf92ac7a0def1717f196100ef5a83..49fc8c849b1a0b06eb691d58ce2a0298f8b01dbf 100644 (file)
@@ -287,6 +287,7 @@ OSD::OSD(int id, Messenger *m, Messenger *hbm, MonMap *mm, const char *dev) :
   recovery_ops_active(0),
   recovery_wq(this, &recovery_tp),
   remove_list_lock("OSD::remove_list_lock"),
+  replay_queue_lock("OSD::replay_queue_lock"),
   snap_trim_wq(this, &disk_tp),
   scrub_wq(this, &disk_tp)
 {
@@ -1157,6 +1158,8 @@ void OSD::tick()
   
   map_lock.get_read();
 
+  check_replay_queue();
+
   // mon report?
   utime_t now = g_clock.now();
   if (now - last_mon_report > g_conf.osd_mon_report_interval)
@@ -1181,6 +1184,27 @@ void OSD::tick()
   map_lock.put_read();
 
   timer.add_event_after(1.0, new C_Tick(this));
+
+
+  // finishers?
+  finished_lock.Lock();
+  if (finished.empty()) {
+    finished_lock.Unlock();
+  } else {
+    list<Message*> waiting;
+    waiting.splice(waiting.begin(), finished);
+
+    finished_lock.Unlock();
+    osd_lock.Unlock();
+    
+    for (list<Message*>::iterator it = waiting.begin();
+         it != waiting.end();
+         it++) {
+      dispatch(*it);
+    }
+
+    osd_lock.Lock();
+  }
 }
 
 // =========================================
@@ -3135,7 +3159,6 @@ void OSD::generate_backlog(PG *pg)
   
   // take osd_lock, map_log (read)
   pg->unlock();
-  osd_lock.Lock();
   map_lock.get_read();
   pg->lock();
 
@@ -3168,7 +3191,6 @@ void OSD::generate_backlog(PG *pg)
 
  out2:
   map_lock.put_read();
-  osd_lock.Unlock();
 
  out:
   pg->unlock();
@@ -3177,10 +3199,26 @@ void OSD::generate_backlog(PG *pg)
 
 
 
+void OSD::check_replay_queue()
+{
+  utime_t now = g_clock.now();
+  list< pair<pg_t,utime_t> > pgids;
+  replay_queue_lock.Lock();
+  while (!replay_queue.empty() &&
+        replay_queue.front().second <= now) {
+    pgids.push_back(replay_queue.front());
+    replay_queue.pop_front();
+  }
+  replay_queue_lock.Unlock();
+
+  for (list< pair<pg_t,utime_t> >::iterator p = pgids.begin(); p != pgids.end(); p++)
+    activate_pg(p->first, p->second);
+}
+
 /*
  * NOTE: this is called from SafeTimer, so caller holds osd_lock
  */
-void OSD::activate_pg(pg_t pgid, epoch_t epoch)
+void OSD::activate_pg(pg_t pgid, utime_t activate_at)
 {
   assert(osd_lock.is_locked());
 
@@ -3189,7 +3227,7 @@ void OSD::activate_pg(pg_t pgid, epoch_t epoch)
     if (pg->is_crashed() &&
        pg->is_replay() &&
        pg->get_role() == 0 &&
-       pg->info.history.same_primary_since <= epoch) {
+       pg->replay_until == activate_at) {
       ObjectStore::Transaction t;
       pg->activate(t);
       store->apply_transaction(t);
@@ -3199,26 +3237,6 @@ void OSD::activate_pg(pg_t pgid, epoch_t epoch)
 
   // wake up _all_ pg waiters; raw pg -> actual pg mapping may have shifted
   wake_all_pg_waiters();
-
-  // finishers?
-  finished_lock.Lock();
-  if (finished.empty()) {
-    finished_lock.Unlock();
-  } else {
-    list<Message*> waiting;
-    waiting.splice(waiting.begin(), finished);
-
-    finished_lock.Unlock();
-    osd_lock.Unlock();
-    
-    for (list<Message*>::iterator it = waiting.begin();
-         it != waiting.end();
-         it++) {
-      dispatch(*it);
-    }
-
-    osd_lock.Lock();
-  }
 }
 
 
index e84f3a80eee8e3923546e7bebb24b54e5cdde67c..4d61aa33b65cf1a389ee93ca9d7a4660c9b03012 100644 (file)
@@ -606,18 +606,12 @@ private:
     remove_list_lock.Unlock();
   }
 
-  void activate_pg(pg_t pgid, epoch_t epoch);
-
-  class C_Activate : public Context {
-    OSD *osd;
-    pg_t pgid;
-    epoch_t epoch;
-  public:
-    C_Activate(OSD *o, pg_t p, epoch_t e) : osd(o), pgid(p), epoch(e) {}
-    void finish(int r) {
-      osd->activate_pg(pgid, epoch);
-    }
-  };
+  // replay / delayed pg activation
+  Mutex replay_queue_lock;
+  list< pair<pg_t, utime_t > > replay_queue;
+  
+  void check_replay_queue();
+  void activate_pg(pg_t pgid, utime_t activate_at);
 
 
   // -- snap trimming --
index 926b12baa0ac899bb120936b53981381dc787938..95d506ffd4ba64aed55fad025f9f153be3f62cfb 100644 (file)
@@ -1226,10 +1226,14 @@ void PG::peer(ObjectStore::Transaction& t,
 
   // -- crash recovery?
   if (is_crashed()) {
-    dout(10) << "crashed, allowing op replay for " << g_conf.osd_replay_window << dendl;
+    replay_until = g_clock.now();
+    replay_until += g_conf.osd_replay_window;
+    dout(10) << "crashed, allowing op replay for " << g_conf.osd_replay_window
+            << " until " << replay_until << dendl;
     state_set(PG_STATE_REPLAY);
-    osd->timer.add_event_after(g_conf.osd_replay_window,
-                              new OSD::C_Activate(osd, info.pgid, osd->osdmap->get_epoch()));
+    osd->replay_queue_lock.Lock();
+    osd->replay_queue.push_back(pair<pg_t,utime_t>(info.pgid, replay_until));
+    osd->replay_queue_lock.Unlock();
   } 
   else if (!is_active()) {
     // -- ok, activate!
@@ -1551,6 +1555,7 @@ void PG::update_stats()
 
 void PG::clear_stats()
 {
+  dout(15) << "clear_stats" << dendl;
   pg_stats_lock.Lock();
   pg_stats_valid = false;
   pg_stats_lock.Unlock();
index d4ef083cbd8c1471bce232aac1fdd22218b5aa4d..d986375da57ba4e1afdfa0876c5a6ee26ba86368 100644 (file)
@@ -608,6 +608,7 @@ public:
   int recovery_ops_active;
 
   epoch_t generate_backlog_epoch;  // epoch we decided to build a backlog.
+  utime_t replay_until;
 
 protected:
   int         role;    // 0 = primary, 1 = replica, -1=none.