From: Sage Weil Date: Thu, 24 Nov 2016 23:08:49 +0000 (-0500) Subject: osd: add osd_hack_prune_past_intervals X-Git-Tag: v10.2.10~71^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=d0492ea07abcc8652f9c713deaae792ef68dc491;p=ceph.git osd: add osd_hack_prune_past_intervals Last ditch (but dangerous) method of reducing memory usage for past_intervals, which can help very very unhappy clusters recover. A properly implemented version of this is in luminous. This hacky version was used successfully to recover multiple jewel-based clusters, but is still only recommended for use when the OSD is otherwise unable to recover. This change is not cherry-picked from master because luminous implements a more sophisticated version of this that changes the past intervals representation entirely, but it is too invasive to backport. This workaround to prune just on startup should be sufficient for emergencies. Signed-off-by: Sage Weil --- diff --git a/src/common/config_opts.h b/src/common/config_opts.h index a941edbf9eb7..e47c78567064 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -827,6 +827,8 @@ OPTION(osd_failsafe_nearfull_ratio, OPT_FLOAT, .90) // what % full makes an OSD OPTION(osd_pg_object_context_cache_count, OPT_INT, 64) OPTION(osd_tracing, OPT_BOOL, false) // true if LTTng-UST tracepoints should be enabled +OPTION(osd_hack_prune_past_intervals, OPT_BOOL, false) // simplify past intervals on startup -- last ditch method of reducing memory usage for very unhappy clusters + // determines whether PGLog::check() compares written out log to stored log OPTION(osd_debug_pg_log_writeout, OPT_BOOL, false) OPTION(osd_loop_before_reset_tphandle, OPT_U32, 64) // Max number of loop before we reset thread-pool's handle diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 1cf86a6299b0..eea401ea7f7f 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -2844,6 +2844,62 @@ void PG::upgrade(ObjectStore *store) #pragma GCC diagnostic pop 
#pragma GCC diagnostic warning "-Wpragmas" +static void _simplify_past_intervals(map<epoch_t,pg_interval_t> &pi) +{ + generic_dout(0) << __func__ << " on " << pi.size() << " intervals" << dendl; + unsigned was = pi.size(); + if (pi.size() <= 2) + return; + + // go backwards. this is because, although usually we only care about + // up/acting, in the prior set we also look at first when doing the lost_at + // comparison. we want to always keep the *most recent* interval when + // dropping redundant intervals so that we don't break the comparison + // } else if (pinfo->lost_at > interval.first) { + // + // also note: + // we will never have last_epoch_clean pointing to an interval + // that has maybe_went_rw=false, since it only advances during an + // active, healthy interval. thus, no changes needed to prune (we won't + // prune *into* an interval that no longer exists). + auto p = pi.end(); + --p; + generic_dout(0) << __func__ << " keep tail " << *p << dendl; + --p; + auto last = pi.begin(); + generic_dout(0) << __func__ << " keep head " << *last << dendl; + set<string> seen; + while (p != last) { + epoch_t start = p->first; + if (!p->second.maybe_went_rw) { + generic_dout(0) << __func__ << " dropping maybe_went_rw=0 interval " + << *p << dendl; + --p; + pi.erase(start); + continue; + } + ostringstream ss; + ss << p->second.up << " " << p->second.acting << " " << p->second.primary + << " " << p->second.up_primary; + string s = ss.str(); + if (seen.count(s)) { + generic_dout(0) << __func__ << " dropping dup interval " << *p << dendl; + --p; + pi.erase(start); + continue; + } + seen.insert(s); + // keep it + generic_dout(0) << __func__ << " keep " << *p << dendl; + --p; + } + if (pi.size() < was) { + generic_dout(0) << __func__ << " finish with " << pi.size() + << " (from " << was << ")" + << dendl; + } +} + int PG::_prepare_write_info(map<string,bufferlist> *km, epoch_t epoch, pg_info_t &info, coll_t coll, @@ -3212,6 +3268,10 @@ void PG::read_state(ObjectStore *store, bufferlist &bl) info_struct_v); assert(r >= 
0); + if (g_conf->osd_hack_prune_past_intervals) { + _simplify_past_intervals(past_intervals); + } + ostringstream oss; pg_log.read_log(store, coll,