]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: vary tick interval +/- 5% to avoid scrub livelocks 23512/head
author Sage Weil <sage@redhat.com>
Thu, 9 Aug 2018 13:33:42 +0000 (08:33 -0500)
committer Sage Weil <sage@redhat.com>
Sun, 12 Aug 2018 20:43:40 +0000 (15:43 -0500)
If you have two pgs that need to scrub on two OSDs, each the primary
for one pg and the replica for the other, you can end up in a livelock:

- both osds locally reserve a scrub slot
- both osds send a scrub schedule request
- both scrub requests are rejected
- both osds wait exactly 1 second
- repeat

Seems a bit unlikely, but I've seen test cases where it goes on for more than
an hour.

Fixes: http://tracker.ceph.com/issues/26890
Signed-off-by: Sage Weil <sage@redhat.com>
src/osd/OSD.cc
src/osd/OSD.h

index db51ad153a32bf15aa65bd19c99c51fd0dda2a4c..6831fc7027112baa054ae3e63633284636101f99 100644 (file)
@@ -36,6 +36,7 @@
 
 #include "include/types.h"
 #include "include/compat.h"
+#include "include/random.h"
 
 #include "OSD.h"
 #include "OSDMap.h"
 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
 
 
-const double OSD::OSD_TICK_INTERVAL = 1.0;
-
 static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
   return *_dout << "osd." << whoami << " " << epoch << " ";
 }
@@ -603,8 +602,8 @@ void OSDService::promote_throttle_recalibrate()
   promote_probability_millis = prob;
 
   // set hard limits for this interval to mitigate stampedes
-  promote_max_objects = target_obj_sec * OSD::OSD_TICK_INTERVAL * 2;
-  promote_max_bytes = target_bytes_sec * OSD::OSD_TICK_INTERVAL * 2;
+  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
+  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
 }
 
 // -------------------------------------
@@ -1978,6 +1977,14 @@ OSD::~OSD()
   delete store;
 }
 
+double OSD::get_tick_interval() const
+{
+  // vary +/- 5% so that OSDs whose scrub requests were both rejected do not all retry after exactly the same wait (scrub scheduling livelock)
+  constexpr auto delta = 0.05;
+  return (OSD_TICK_INTERVAL *
+         ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
+}
+
 void cls_initialize(ClassHandler *ch);
 
 void OSD::handle_signal(int signum)
@@ -2606,11 +2613,11 @@ int OSD::init()
   heartbeat_thread.create("osd_srv_heartbt");
 
   // tick
-  tick_timer.add_event_after(OSD_TICK_INTERVAL,
+  tick_timer.add_event_after(get_tick_interval(),
                             new C_Tick(this));
   {
     Mutex::Locker l(tick_timer_lock);
-    tick_timer_without_osd_lock.add_event_after(OSD_TICK_INTERVAL,
+    tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
                                                new C_Tick_WithoutOSDLock(this));
   }
 
@@ -4806,7 +4813,7 @@ void OSD::tick()
 
   do_waiters();
 
-  tick_timer.add_event_after(OSD_TICK_INTERVAL, new C_Tick(this));
+  tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
 }
 
 void OSD::tick_without_osd_lock()
@@ -4877,7 +4884,7 @@ void OSD::tick_without_osd_lock()
 
   mgrc.update_daemon_health(get_health_metrics());
   service.kick_recovery_queue();
-  tick_timer_without_osd_lock.add_event_after(OSD_TICK_INTERVAL,
+  tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
                                              new C_Tick_WithoutOSDLock(this));
 }
 
index e435b99000a715df4aa0235d3795b75c6fc4e313..18739f52dfa89d3eae3bda9c8e1785ef04f822ca 100644 (file)
@@ -1237,7 +1237,8 @@ public:
 
 protected:
 
-  static const double OSD_TICK_INTERVAL; // tick interval for tick_timer and tick_timer_without_osd_lock
+  const double OSD_TICK_INTERVAL = { 1.0 };
+  double get_tick_interval() const;
 
   AuthAuthorizeHandlerRegistry *authorize_handler_cluster_registry;
   AuthAuthorizeHandlerRegistry *authorize_handler_service_registry;