From cf25bdf6b0090379903981fe8cee5ea75efd7ba0 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 1 May 2014 17:24:48 -0700 Subject: [PATCH] osd: prevent pgs from getting too far ahead of the min pg epoch Bound the range of PG epochs between the slowest and fastest pg (epoch-wise) with 'osd map max advance'. This value should be set to something less than 'osd map cache size' so that the maps we are processing will be in memory as many PGs advance forward in time in loose synchrony. This is part of the solution to #7576. Signed-off-by: Sage Weil --- src/common/config_opts.h | 1 + src/osd/OSD.cc | 26 +++++++++++++++++++++----- src/osd/OSD.h | 2 +- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 2d2f360a16029..3ef364c6d0ad8 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -448,6 +448,7 @@ OPTION(osd_tier_default_cache_hit_set_period, OPT_INT, 1200) OPTION(osd_tier_default_cache_hit_set_type, OPT_STR, "bloom") OPTION(osd_map_dedup, OPT_BOOL, true) +OPTION(osd_map_max_advance, OPT_INT, 200) // make this < cache_size! OPTION(osd_map_cache_size, OPT_INT, 500) OPTION(osd_map_message_max, OPT_INT, 100) // max maps per MOSDMap message OPTION(osd_map_share_max_epochs, OPT_INT, 100) // cap on # of inc maps we send to peers, clients diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 5f9c258e4048d..685e6081aa0d7 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -5697,7 +5697,7 @@ void OSD::check_osdmap_features(ObjectStore *fs) } } -void OSD::advance_pg( +bool OSD::advance_pg( epoch_t osd_epoch, PG *pg, ThreadPool::TPHandle &handle, PG::RecoveryCtx *rctx, @@ -5708,11 +5708,19 @@ void OSD::advance_pg( OSDMapRef lastmap = pg->get_osdmap(); if (lastmap->get_epoch() == osd_epoch) - return; + return true; assert(lastmap->get_epoch() < osd_epoch); + epoch_t min_epoch = service.get_min_pg_epoch(); + epoch_t max; + if (min_epoch) { + max = min_epoch + g_conf->osd_map_max_advance; + } else { + max = next_epoch + g_conf->osd_map_max_advance; + } + for (; - next_epoch <= osd_epoch; + next_epoch <= osd_epoch && next_epoch <= max; ++next_epoch) { OSDMapRef nextmap = service.try_get_map(next_epoch); if (!nextmap) @@ -5746,6 +5754,13 @@ void OSD::advance_pg( } service.pg_update_epoch(pg->info.pgid, lastmap->get_epoch()); pg->handle_activate_map(rctx); + if (next_epoch <= osd_epoch) { + dout(10) << __func__ << " advanced by max " << g_conf->osd_map_max_advance + << " past min epoch " << min_epoch + << " ... will requeue " << *pg << dendl; + return false; + } + return true; } /** @@ -7722,8 +7737,9 @@ void OSD::process_peering_events( pg->unlock(); continue; } - advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs); - if (!pg->peering_queue.empty()) { + if (!advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs)) { + pg->queue_null(curmap->get_epoch(), curmap->get_epoch()); + } else if (!pg->peering_queue.empty()) { PG::CephPeeringEvtRef evt = pg->peering_queue.front(); pg->peering_queue.pop_front(); pg->handle_peering_event(evt, &rctx); diff --git a/src/osd/OSD.h b/src/osd/OSD.h index db433517af840..9ff5f27dd6920 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1287,7 +1287,7 @@ private: void note_down_osd(int osd); void note_up_osd(int osd); - void advance_pg( + bool advance_pg( epoch_t advance_to, PG *pg, ThreadPool::TPHandle &handle, PG::RecoveryCtx *rctx, -- 2.39.5