From 897588003345cb553216351813ae17aa1048f055 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 9 Aug 2018 08:33:42 -0500 Subject: [PATCH] osd: vary tick interval +/- 5% to avoid scrub livelocks If you have two pgs that need to scrub on two OSDs, each the primary for one pg and the replica for the other, you can end up in a livelock: - both osds locally reserve a scrub slot - both osds send a scrub schedule request - both scrub requests are rejected - both osds wait exactly 1 second - repeat Seems a bit unlikely, but I've seen test cases where it goes on for more than an hour. Fixes: http://tracker.ceph.com/issues/26890 Signed-off-by: Sage Weil (cherry picked from commit 2011377c379c9d53a3a0a693a7874fc330278898) Conflicts: src/osd/OSD.cc - luminous does not have src/include/random.h; use #include <random> instead, seeding with whoami so each OSD gets a different series of pseudo-random numbers --- src/osd/OSD.cc | 24 ++++++++++++++++-------- src/osd/OSD.h | 3 ++- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index a3ec4b6eb2f10..042097bb35e22 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -21,6 +21,7 @@ #include #include #include +#include <random> #ifdef HAVE_SYS_PARAM_H #include @@ -163,8 +164,6 @@ #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch()) -const double OSD::OSD_TICK_INTERVAL = 1.0; - static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) { return *_dout << "osd."
<< whoami << " " << epoch << " "; } @@ -746,8 +745,8 @@ void OSDService::promote_throttle_recalibrate() promote_probability_millis = prob; // set hard limits for this interval to mitigate stampedes - promote_max_objects = target_obj_sec * OSD::OSD_TICK_INTERVAL * 2; - promote_max_bytes = target_bytes_sec * OSD::OSD_TICK_INTERVAL * 2; + promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2; + promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2; } // ------------------------------------- @@ -2052,6 +2051,15 @@ OSD::~OSD() delete store; } +double OSD::get_tick_interval() const +{ + /* Vary the tick interval by +/- 5% to avoid scrub scheduling livelocks. The engine is thread_local and seeded only once per thread, so successive calls walk a pseudo-random series instead of recomputing the same first value on every call. The seed is whoami + 1 because an LCG-based default engine maps seeds 0 and 1 to the same state, which would leave osd.0 and osd.1 ticking in lockstep. */ + constexpr auto delta = 0.05; + thread_local std::default_random_engine rng{static_cast<unsigned>(whoami) + 1u}; + return (OSD_TICK_INTERVAL * + std::uniform_real_distribution<>{1.0 - delta, 1.0 + delta}(rng)); +} + void cls_initialize(ClassHandler *ch); void OSD::handle_signal(int signum) @@ -2680,11 +2688,11 @@ int OSD::init() heartbeat_thread.create("osd_srv_heartbt"); // tick - tick_timer.add_event_after(OSD_TICK_INTERVAL, + tick_timer.add_event_after(get_tick_interval(), new C_Tick(this)); { Mutex::Locker l(tick_timer_lock); - tick_timer_without_osd_lock.add_event_after(OSD_TICK_INTERVAL, + tick_timer_without_osd_lock.add_event_after(get_tick_interval(), new C_Tick_WithoutOSDLock(this)); } @@ -5316,7 +5324,7 @@ void OSD::tick() do_waiters(); - tick_timer.add_event_after(OSD_TICK_INTERVAL, new C_Tick(this)); + tick_timer.add_event_after(get_tick_interval(), new C_Tick(this)); } void OSD::tick_without_osd_lock() @@ -5425,7 +5433,7 @@ void OSD::tick_without_osd_lock() mgrc.update_osd_health(get_health_metrics()); service.kick_recovery_queue(); - tick_timer_without_osd_lock.add_event_after(OSD_TICK_INTERVAL, + tick_timer_without_osd_lock.add_event_after(get_tick_interval(), new C_Tick_WithoutOSDLock(this)); } diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 4523fb2807f2e..b6d2482bcedb6 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1214,7 +1214,8 @@
public: protected: - static const double OSD_TICK_INTERVAL; // tick interval for tick_timer and tick_timer_without_osd_lock + const double OSD_TICK_INTERVAL = { 1.0 }; /* nominal tick interval, in seconds, for tick_timer and tick_timer_without_osd_lock */ + double get_tick_interval() const; /* OSD_TICK_INTERVAL scaled by a pseudo-random factor within +/- 5%; used when (re)arming both tick timers */ AuthAuthorizeHandlerRegistry *authorize_handler_cluster_registry; AuthAuthorizeHandlerRegistry *authorize_handler_service_registry; -- 2.39.5