From dd091b38d56ab12b2fd9ca19523e55f936815465 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 16 Dec 2008 13:28:10 -0800 Subject: [PATCH] osd: do delayed activation after replay via a queue, not timer event This avoids osd_lock dependency by using osd->timer. --- src/osd/OSD.cc | 66 ++++++++++++++++++++++++++++++++------------------ src/osd/OSD.h | 18 +++++--------- src/osd/PG.cc | 11 ++++++--- src/osd/PG.h | 1 + 4 files changed, 57 insertions(+), 39 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 523c94bfe21cf..49fc8c849b1a0 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -287,6 +287,7 @@ OSD::OSD(int id, Messenger *m, Messenger *hbm, MonMap *mm, const char *dev) : recovery_ops_active(0), recovery_wq(this, &recovery_tp), remove_list_lock("OSD::remove_list_lock"), + replay_queue_lock("OSD::replay_queue_lock"), snap_trim_wq(this, &disk_tp), scrub_wq(this, &disk_tp) { @@ -1157,6 +1158,8 @@ void OSD::tick() map_lock.get_read(); + check_replay_queue(); + // mon report? utime_t now = g_clock.now(); if (now - last_mon_report > g_conf.osd_mon_report_interval) @@ -1181,6 +1184,27 @@ void OSD::tick() map_lock.put_read(); timer.add_event_after(1.0, new C_Tick(this)); + + + // finishers? + finished_lock.Lock(); + if (finished.empty()) { + finished_lock.Unlock(); + } else { + list waiting; + waiting.splice(waiting.begin(), finished); + + finished_lock.Unlock(); + osd_lock.Unlock(); + + for (list::iterator it = waiting.begin(); + it != waiting.end(); + it++) { + dispatch(*it); + } + + osd_lock.Lock(); + } } // ========================================= @@ -3135,7 +3159,6 @@ void OSD::generate_backlog(PG *pg) // take osd_lock, map_log (read) pg->unlock(); - osd_lock.Lock(); map_lock.get_read(); pg->lock(); @@ -3168,7 +3191,6 @@ void OSD::generate_backlog(PG *pg) out2: map_lock.put_read(); - osd_lock.Unlock(); out: pg->unlock(); @@ -3177,10 +3199,26 @@ void OSD::generate_backlog(PG *pg) +void OSD::check_replay_queue() +{ + utime_t now = g_clock.now(); + list< pair > pgids; + replay_queue_lock.Lock(); + while (!replay_queue.empty() && + replay_queue.front().second <= now) { + pgids.push_back(replay_queue.front()); + replay_queue.pop_front(); + } + replay_queue_lock.Unlock(); + + for (list< pair >::iterator p = pgids.begin(); p != pgids.end(); p++) + activate_pg(p->first, p->second); +} + /* * NOTE: this is called from SafeTimer, so caller holds osd_lock */ -void OSD::activate_pg(pg_t pgid, epoch_t epoch) +void OSD::activate_pg(pg_t pgid, utime_t activate_at) { assert(osd_lock.is_locked()); @@ -3189,7 +3227,7 @@ void OSD::activate_pg(pg_t pgid, epoch_t epoch) if (pg->is_crashed() && pg->is_replay() && pg->get_role() == 0 && - pg->info.history.same_primary_since <= epoch) { + pg->replay_until == activate_at) { ObjectStore::Transaction t; pg->activate(t); store->apply_transaction(t); @@ -3199,26 +3237,6 @@ void OSD::activate_pg(pg_t pgid, epoch_t epoch) // wake up _all_ pg waiters; raw pg -> actual pg mapping may have shifted wake_all_pg_waiters(); - - // finishers? - finished_lock.Lock(); - if (finished.empty()) { - finished_lock.Unlock(); - } else { - list waiting; - waiting.splice(waiting.begin(), finished); - - finished_lock.Unlock(); - osd_lock.Unlock(); - - for (list::iterator it = waiting.begin(); - it != waiting.end(); - it++) { - dispatch(*it); - } - - osd_lock.Lock(); - } } diff --git a/src/osd/OSD.h b/src/osd/OSD.h index e84f3a80eee8e..4d61aa33b65cf 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -606,18 +606,12 @@ private: remove_list_lock.Unlock(); } - void activate_pg(pg_t pgid, epoch_t epoch); - - class C_Activate : public Context { - OSD *osd; - pg_t pgid; - epoch_t epoch; - public: - C_Activate(OSD *o, pg_t p, epoch_t e) : osd(o), pgid(p), epoch(e) {} - void finish(int r) { - osd->activate_pg(pgid, epoch); - } - }; + // replay / delayed pg activation + Mutex replay_queue_lock; + list< pair > replay_queue; + + void check_replay_queue(); + void activate_pg(pg_t pgid, utime_t activate_at); // -- snap trimming -- diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 926b12baa0ac8..95d506ffd4ba6 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -1226,10 +1226,14 @@ void PG::peer(ObjectStore::Transaction& t, // -- crash recovery? if (is_crashed()) { - dout(10) << "crashed, allowing op replay for " << g_conf.osd_replay_window << dendl; + replay_until = g_clock.now(); + replay_until += g_conf.osd_replay_window; + dout(10) << "crashed, allowing op replay for " << g_conf.osd_replay_window + << " until " << replay_until << dendl; state_set(PG_STATE_REPLAY); - osd->timer.add_event_after(g_conf.osd_replay_window, - new OSD::C_Activate(osd, info.pgid, osd->osdmap->get_epoch())); + osd->replay_queue_lock.Lock(); + osd->replay_queue.push_back(pair(info.pgid, replay_until)); + osd->replay_queue_lock.Unlock(); } else if (!is_active()) { // -- ok, activate! @@ -1551,6 +1555,7 @@ void PG::update_stats() void PG::clear_stats() { + dout(15) << "clear_stats" << dendl; pg_stats_lock.Lock(); pg_stats_valid = false; pg_stats_lock.Unlock(); diff --git a/src/osd/PG.h b/src/osd/PG.h index d4ef083cbd8c1..d986375da57ba 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -608,6 +608,7 @@ public: int recovery_ops_active; epoch_t generate_backlog_epoch; // epoch we decided to build a backlog. + utime_t replay_until; protected: int role; // 0 = primary, 1 = replica, -1=none. -- 2.39.5