From b5dd112ce220e97ec9406779ab8c20841580e5f8 Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Thu, 13 Apr 2017 19:08:45 +0800 Subject: [PATCH] mon/OSDMonitor: use lec in OSDBeacon to trim osdmap Signed-off-by: Kefu Chai --- src/mon/OSDMonitor.cc | 98 ++++++++++++++++++++++++++++++++++++++----- src/mon/OSDMonitor.h | 23 +++++++++- 2 files changed, 110 insertions(+), 11 deletions(-) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index baeeaa1c8a8..f1fe17572a0 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -75,6 +75,59 @@ #define dout_subsys ceph_subsys_mon #define OSD_PG_CREATING_PREFIX "osd_pg_creating" +void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean) +{ + if (epoch_by_pg.size() <= ps) { + epoch_by_pg.resize(ps + 1, 0); + } + if (epoch_by_pg[ps] >= last_epoch_clean) { + // stale lec + return; + } + epoch_by_pg[ps] = last_epoch_clean; + if (last_epoch_clean < floor) { + floor = last_epoch_clean; + } + if (ps != next_missing) { + return; + } + for (; next_missing < epoch_by_pg.size(); next_missing++) { + if (epoch_by_pg[next_missing] == 0) { + break; + } + } +} + +void LastEpochClean::remove_pool(uint64_t pool) +{ + report_by_pool.erase(pool); +} + +void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean) +{ + auto& lec = report_by_pool[pg.pool()]; + return lec.report(pg.ps(), last_epoch_clean); +} + +epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const +{ + auto floor = latest.get_epoch(); + for (auto& pool : latest.get_pools()) { + auto reported = report_by_pool.find(pool.first); + if (reported == report_by_pool.end()) { + return 0; + } + if (reported->second.next_missing < pool.second.get_pg_num()) { + return 0; + } + if (reported->second.floor < floor) { + floor = reported->second.floor; + } + } + return floor; +} + + struct C_UpdateCreatingPGs : public Context { OSDMonitor *osdmon; utime_t start; @@ -326,12 +379,16 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap) } if (mon->monmap->get_required_features().contains_all( ceph::features::mon::FEATURE_LUMINOUS)) { - creating_pgs = update_pending_creatings(inc); + creating_pgs = update_pending_pgs(inc); for (const auto &osd_state : inc.new_state) { if (osd_state.second & CEPH_OSD_UP) { // could be marked up *or* down, but we're too lazy to check which last_osd_report.erase(osd_state.first); } + if (osd_state.second & CEPH_OSD_EXISTS) { + // could be created *or* destroyed, but we can safely drop it + osd_epochs.erase(osd_state.first); + } } } } @@ -841,7 +898,7 @@ void OSDMonitor::create_pending() } creating_pgs_t -OSDMonitor::update_pending_creatings(const OSDMap::Incremental& inc) +OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc) { creating_pgs_t pending_creatings; { @@ -876,6 +933,7 @@ OSDMonitor::update_pending_creatings(const OSDMap::Incremental& inc) auto last = pending_creatings.pgs.lower_bound(pg_t{0, removed_pool + 1}); pending_creatings.pgs.erase(first, last); + last_epoch_clean.remove_pool(removed_pool); } scan_for_creating_pgs(osdmap.get_pools(), inc.old_pools, @@ -1195,7 +1253,7 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t) // and pg creating, also! if (mon->monmap->get_required_features().contains_all( ceph::features::mon::FEATURE_LUMINOUS)) { - auto pending_creatings = update_pending_creatings(pending_inc); + auto pending_creatings = update_pending_pgs(pending_inc); if (!osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) { dout(7) << __func__ << " in the middle of upgrading, " << " trimming pending creating_pgs using pgmap" << dendl; @@ -1296,24 +1354,26 @@ void OSDMonitor::share_map_with_random_osd() version_t OSDMonitor::get_trim_to() { + epoch_t floor; + if (mon->monmap->get_required_features().contains_all( ceph::features::mon::FEATURE_LUMINOUS)) { - std::lock_guard l(creating_pgs_lock); - if (!creating_pgs.pgs.empty()) { - return 0; - } - if (!mon->pgmon()->is_readable()) { - return 0; + { + std::lock_guard l(creating_pgs_lock); + if (!creating_pgs.pgs.empty()) { + return 0; + } } + floor = get_min_last_epoch_clean(); } else { if (!mon->pgmon()->is_readable()) return 0; if (mon->pgmon()->pg_map.creating_pgs.empty()) { return 0; } + floor = mon->pgmon()->pg_map.get_min_last_epoch_clean(); } { - epoch_t floor = mon->pgmon()->pg_map.get_min_last_epoch_clean(); dout(10) << " min_last_epoch_clean " << floor << dendl; if (g_conf->mon_osd_force_trim_to > 0 && g_conf->mon_osd_force_trim_to < (int)get_last_committed()) { @@ -1333,6 +1393,19 @@ version_t OSDMonitor::get_trim_to() return 0; } +epoch_t OSDMonitor::get_min_last_epoch_clean() const +{ + auto floor = last_epoch_clean.get_lower_bound(osdmap); + // also scan osd epochs + // don't trim past the oldest reported osd epoch + for (auto& osd_epoch : osd_epochs) { + if (osd_epoch.second < floor) { + floor = osd_epoch.second; + } + } + return floor; +} + void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx, version_t first) { @@ -2758,6 +2831,11 @@ bool OSDMonitor::prepare_beacon(MonOpRequestRef op) } last_osd_report[from] = ceph_clock_now(); + osd_epochs[from] = beacon->version; + + for (const auto& pg : beacon->pgs) { + last_epoch_clean.report(pg, beacon->min_last_epoch_clean); + } return false; } diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index 4852571047a..3c2521a072e 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -111,6 +111,22 @@ struct failure_info_t { } }; + +class LastEpochClean { + struct Lec { + vector epoch_by_pg; + ps_t next_missing = 0; + epoch_t floor = std::numeric_limits::max(); + void report(ps_t pg, epoch_t last_epoch_clean); + }; + std::map report_by_pool; +public: + void report(const pg_t& pg, epoch_t last_epoch_clean); + void remove_pool(uint64_t pool); + epoch_t get_lower_bound(const OSDMap& latest) const; +}; + + class OSDMonitor : public PaxosService { CephContext *cct; @@ -414,8 +430,13 @@ private: // when we last received PG stats from each osd map last_osd_report; + // TODO: use last_osd_report to store the osd report epochs, once we don't + // need to upgrade from pre-luminous releases. + map osd_epochs; + LastEpochClean last_epoch_clean; bool preprocess_beacon(MonOpRequestRef op); bool prepare_beacon(MonOpRequestRef op); + epoch_t get_min_last_epoch_clean() const; friend class C_UpdateCreatingPGs; std::map>> creating_pgs_by_osd_epoch; @@ -425,7 +446,7 @@ private: creating_pgs_t creating_pgs; std::mutex creating_pgs_lock; - creating_pgs_t update_pending_creatings(const OSDMap::Incremental& inc); + creating_pgs_t update_pending_pgs(const OSDMap::Incremental& inc); void trim_creating_pgs(creating_pgs_t *creating_pgs, const PGMap& pgm); void scan_for_creating_pgs(const std::map& pools, const std::set& removed_pools, -- 2.39.5