From 371d9baa120dc0302e9e61d3bc0e309dfaa773a0 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Wed, 20 May 2015 12:08:15 -0700 Subject: [PATCH] PG::find_best_info: ignore info.les for incomplete peer See included update to doc/dev/osd_internals/last_epoch_started.rst Fixes: 11687 Signed-off-by: Samuel Just --- doc/dev/osd_internals/last_epoch_started.rst | 33 ++++++++++++++++---- src/osd/PG.cc | 12 +++++-- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/doc/dev/osd_internals/last_epoch_started.rst b/doc/dev/osd_internals/last_epoch_started.rst index fcb930f48b62..fa86b4a3c812 100644 --- a/doc/dev/osd_internals/last_epoch_started.rst +++ b/doc/dev/osd_internals/last_epoch_started.rst @@ -31,9 +31,30 @@ but we only update history.last_epoch_started after the new info.last_epoch_started is persisted (possibly along with the first write). This ensures that we do not require an osd with the most recent info.last_epoch_started until all acting set osds have recorded -it. In find_best_info, we do include info.last_epoch_started values -when calculating the max_last_epoch_started_found because we want to -avoid designating a log entry divergent which in a prior interval -would have been non-divergent. In activate(), we use the peer's -last_epoch_started value as a bound on how far back divergent log -entries can be found. +it. + +In find_best_info, we do include info.last_epoch_started values when +calculating the max_last_epoch_started_found because we want to avoid +designating a log entry divergent which in a prior interval would have +been non-divergent since it might have been used to serve a read. In +activate(), we use the peer's last_epoch_started value as a bound on +how far back divergent log entries can be found. + +However, in a case like + +.. 
code:: none + + calc_acting osd.0 1.4e( v 473'302 (292'200,473'302] local-les=473 n=4 ec=5 les/c 473/473 556/556/556 + calc_acting osd.1 1.4e( v 473'302 (293'202,473'302] lb 0//0//-1 local-les=477 n=0 ec=5 les/c 473/473 556/556/556 + calc_acting osd.4 1.4e( v 473'302 (120'121,473'302] local-les=473 n=4 ec=5 les/c 473/473 556/556/556 + calc_acting osd.5 1.4e( empty local-les=0 n=0 ec=5 les/c 473/473 556/556/556 + +since osd.1 is the only one which recorded info.les=477 while 4,0 +which were the acting set in that interval did not (4 restarted and 0 +did not get the message in time) the pg is marked incomplete when +either 4 or 0 would have been valid choices. To avoid this, we do not +consider info.les for incomplete peers when calculating +max_last_epoch_started_found. It would not have been in the acting +set, so we must have another osd from that interval anyway (if +maybe_went_rw). If that osd does not remember that info.les, then we +cannot have served reads. diff --git a/src/osd/PG.cc b/src/osd/PG.cc index a771bd7ff595..b341a3d6275f 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -900,6 +900,9 @@ void PG::clear_primary_state() map::const_iterator PG::find_best_info( const map &infos) const { + /* See doc/dev/osd_internals/last_epoch_started.rst before attempting + * to make changes to this process. Also, make sure to update it + * when you find bugs! 
*/ eversion_t min_last_update_acceptable = eversion_t::max(); epoch_t max_last_epoch_started_found = 0; for (map::const_iterator i = infos.begin(); @@ -910,11 +913,16 @@ map::const_iterator PG::find_best_info( min_last_update_acceptable = eversion_t::max(); max_last_epoch_started_found = i->second.history.last_epoch_started; } - if (max_last_epoch_started_found < i->second.last_epoch_started) { + if (!i->second.is_incomplete() && + max_last_epoch_started_found < i->second.last_epoch_started) { min_last_update_acceptable = eversion_t::max(); max_last_epoch_started_found = i->second.last_epoch_started; } - if (max_last_epoch_started_found == i->second.last_epoch_started) { + } + for (map::const_iterator i = infos.begin(); + i != infos.end(); + ++i) { + if (max_last_epoch_started_found <= i->second.last_epoch_started) { if (min_last_update_acceptable > i->second.last_update) min_last_update_acceptable = i->second.last_update; } -- 2.47.3