From: Kefu Chai Date: Mon, 20 Jul 2015 16:24:52 +0000 (+0800) Subject: mon: track osd_epoch in MonSession X-Git-Tag: v9.1.0~330^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=c05753eacc26e90b2e3b56e641a71bffd5b39bd0;p=ceph.git mon: track osd_epoch in MonSession * remove osd_epoch from OSDMonitor * add osd_epoch to MonSession to track the latest osdmap epoch OSDMonitor sends to a mon client * do not remove osd_epoch entries if an OSD is down, or max_osd > osd_id Fixes: #10930 Signed-off-by: Kefu Chai --- diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 0e2068b9890b..4722a67ef49e 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -4178,6 +4178,10 @@ void Monitor::handle_subscribe(MonOpRequestRef op) } } else if (p->first == "osdmap") { if ((int)s->is_capable("osd", MON_CAP_R)) { + if (s->osd_epoch > p->second.start) { + // client needs earlier osdmaps on purpose, so reset the sent epoch + s->osd_epoch = 0; + } osdmon()->check_sub(s->sub_map["osdmap"]); } } else if (p->first == "osd_pg_creates") { diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 586597721f98..813d6c3dbcf9 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -269,9 +269,6 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap) for (int o = 0; o < osdmap.get_max_osd(); o++) { if (osdmap.is_down(o)) { - // invalidate osd_epoch cache - osd_epoch.erase(o); - // populate down -> out map if (osdmap.is_in(o) && down_pending_out.count(o) == 0) { @@ -280,11 +277,7 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap) } } } - // blow away any osd_epoch items beyond max_osd - map<int,epoch_t>::iterator p = osd_epoch.upper_bound(osdmap.get_max_osd()); - while (p != osd_epoch.end()) { - osd_epoch.erase(p++); - } + // XXX: need to trim MonSession connected with a osd whose id > max_osd?
/** we don't have any of the feature bit infrastructure in place for * supporting primary_temp mappings without breaking old clients/OSDs.*/ @@ -2350,19 +2343,17 @@ void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first) << " to " << op->get_req()->get_orig_source_inst() << dendl; - int osd = -1; + MonSession *s = NULL; if (op->get_req()->get_source().is_osd()) { - osd = op->get_req()->get_source().num(); - map<int,epoch_t>::iterator p = osd_epoch.find(osd); - if (p != osd_epoch.end()) { - if (first <= p->second) { - dout(10) << __func__ << " osd." << osd << " should already have epoch " - << p->second << dendl; - first = p->second + 1; - if (first > osdmap.get_epoch()) - return; - } - } + s = op->get_session(); + } + + if (s && first <= s->osd_epoch) { + dout(10) << __func__ << s->inst << " should already have epoch " + << s->osd_epoch << dendl; + first = s->osd_epoch + 1; + if (first > osdmap.get_epoch()) + return; } if (first < get_first_committed()) { @@ -2381,8 +2372,8 @@ void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first) m->maps[first] = bl; mon->send_reply(op, m); - if (osd >= 0) - note_osd_has_epoch(osd, osdmap.get_epoch()); + if (s) + s->osd_epoch = osdmap.get_epoch(); return; } @@ -2394,28 +2385,8 @@ void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first) m->newest_map = osdmap.get_epoch(); mon->send_reply(op, m); - if (osd >= 0) - note_osd_has_epoch(osd, last); -} - -// FIXME: we assume the OSD actually receives this. if the mon -// session drops and they reconnect we may not share the same maps -// with them again, which could cause a strange hang (perhaps stuck -// 'waiting for osdmap' requests?). this information should go in the -// MonSession, but I think these functions need to be refactored in -// terms of MonSession first for that to work. -void OSDMonitor::note_osd_has_epoch(int osd, epoch_t epoch) -{ - dout(20) << __func__ << " osd."
<< osd << " epoch " << epoch << dendl; - map<int,epoch_t>::iterator p = osd_epoch.find(osd); - if (p != osd_epoch.end()) { - dout(20) << __func__ << " osd." << osd << " epoch " << epoch - << " (was " << p->second << ")" << dendl; - p->second = epoch; - } else { - dout(20) << __func__ << " osd." << osd << " epoch " << epoch << dendl; - osd_epoch[osd] = epoch; - } + if (s) + s->osd_epoch = last; } void OSDMonitor::send_incremental(epoch_t first, MonSession *session, @@ -2439,6 +2410,9 @@ void OSDMonitor::send_incremental(epoch_t first, MonSession *session, m->newest_map = osdmap.get_epoch(); m->maps[first] = bl; session->con->send_message(m); + if (session->inst.name.is_osd()) { + session->osd_epoch = first; + } first++; } @@ -2448,8 +2422,9 @@ void OSDMonitor::send_incremental(epoch_t first, MonSession *session, session->con->send_message(m); first = last + 1; - if (session->inst.name.is_osd()) - note_osd_has_epoch(session->inst.name.num(), last); + if (session->inst.name.is_osd()) { + session->osd_epoch = last; + } if (onetime) break; diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index cede74c6e152..f4dc8c6d5adf 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -136,14 +136,6 @@ private: map osd_weight; - /* - * cache what epochs we think osds have. this is purely - * optimization to try to avoid sending the same inc maps twice.
- */ - map<int,epoch_t> osd_epoch; - - void note_osd_has_epoch(int osd, epoch_t epoch); - void check_failures(utime_t now); bool check_failure(utime_t now, int target_osd, failure_info_t& fi); diff --git a/src/mon/Session.h b/src/mon/Session.h index e4cfed093250..98b4a8f2da8f 100644 --- a/src/mon/Session.h +++ b/src/mon/Session.h @@ -50,6 +50,7 @@ struct MonSession : public RefCountedObject { uint64_t global_id; map sub_map; + epoch_t osd_epoch; // the osdmap epoch sent to the mon client AuthServiceHandler *auth_handler; EntityName entity_name; @@ -60,7 +61,9 @@ struct MonSession : public RefCountedObject { MonSession(const entity_inst_t& i, Connection *c) : con(c), inst(i), closed(false), item(this), auid(0), - global_id(0), auth_handler(NULL), + global_id(0), + osd_epoch(0), + auth_handler(NULL), proxy_con(NULL), proxy_tid(0) { time_established = ceph_clock_now(g_ceph_context); }