]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: randomize scrub times to avoid scrub wave
author Kefu Chai <kchai@redhat.com>
Mon, 9 Mar 2015 08:42:34 +0000 (16:42 +0800)
committer Sage Weil <sage@redhat.com>
Thu, 8 Oct 2015 03:07:13 +0000 (23:07 -0400)
- to avoid a scrub wave when osd_scrub_max_interval is reached on a
  high-load OSD, the scrub time is randomized.
- extract scrub_load_below_threshold() out of scrub_should_schedule()
- schedule an automatic scrub job at a time which is uniformly distributed
  over [now+osd_scrub_min_interval,
        now+osd_scrub_min_interval*(1+osd_scrub_interval_limit)). before
  this change, scrubs of this sort were performed once the hard interval
  had elapsed or the system load was below the threshold; with this change,
  the jobs are performed as long as the load is low or the interval since
  the scrub was scheduled is longer than conf.osd_scrub_max_interval. all
  automatic jobs should be performed in the configured time period; otherwise
  they are postponed.
- a requested scrub job is now scheduled right away; before this change
  it was queued with the timestamp of `now` and postponed until
  osd_scrub_min_interval had passed.

Fixes: #10973
Signed-off-by: Kefu Chai <kchai@redhat.com>
(cherry picked from commit 5e44040e8528bff06cc0a5a3f3293ab146e0e4e1)

Conflicts:
src/osd/OSD.cc

src/common/config_opts.h
src/osd/OSD.cc
src/osd/OSD.h
src/osd/PG.cc

index f2c34fe8a4a140578888ae6190847c4c52eeeb98..cfdd6a8740f43222ded89a787359986af0d82845 100644 (file)
@@ -620,6 +620,7 @@ OPTION(osd_scrub_end_hour, OPT_INT, 24)
 OPTION(osd_scrub_load_threshold, OPT_FLOAT, 0.5)
 OPTION(osd_scrub_min_interval, OPT_FLOAT, 60*60*24)    // if load is low
 OPTION(osd_scrub_max_interval, OPT_FLOAT, 7*60*60*24)  // regardless of load
+OPTION(osd_scrub_interval_limit, OPT_FLOAT, 0.5) // randomize the scheduled scrub in the span of [min,min*(1+interval_limit))
 OPTION(osd_scrub_chunk_min, OPT_INT, 5)
 OPTION(osd_scrub_chunk_max, OPT_INT, 25)
 OPTION(osd_scrub_sleep, OPT_FLOAT, 0)   // sleep between [deep]scrub ops
index 4c7120b05a6a1d887e1a5d27c068a8e81ea7cbda..7a7fe43754523903eebbb94ef2f12c9ca651a083 100644 (file)
@@ -5910,6 +5910,30 @@ bool OSD::scrub_random_backoff()
   return false;
 }
 
+OSDService::ScrubJob::ScrubJob(const spg_t& pg, const utime_t& timestamp, bool must)
+  : pgid(pg),
+    sched_time(timestamp),
+    deadline(timestamp)
+{
+  // if not explicitly requested, postpone the scrub with a random delay
+  if (!must) {
+    sched_time += g_conf->osd_scrub_min_interval;
+    if (g_conf->osd_scrub_interval_limit > 0) {
+      sched_time += rand() % (int)(g_conf->osd_scrub_min_interval *
+                                  g_conf->osd_scrub_interval_limit);
+    }
+    deadline += g_conf->osd_scrub_max_interval;
+  }
+}
+
+bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
+  if (sched_time < rhs.sched_time)
+    return true;
+  if (sched_time > rhs.sched_time)
+    return false;
+  return pgid < rhs.pgid;
+}
+
 bool OSD::scrub_time_permit(utime_t now)
 {
   struct tm bdt; 
@@ -5937,11 +5961,8 @@ bool OSD::scrub_time_permit(utime_t now)
   return time_permit;
 }
 
-bool OSD::scrub_should_schedule()
+bool OSD::scrub_load_below_threshold()
 {
-  if (!scrub_time_permit(ceph_clock_now(cct))) {
-    return false;
-  }
   double loadavgs[1];
   if (getloadavg(loadavgs, 1) != 1) {
     dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
@@ -5963,54 +5984,39 @@ bool OSD::scrub_should_schedule()
 
 void OSD::sched_scrub()
 {
-  assert(osd_lock.is_locked());
-
-  bool load_is_low = scrub_should_schedule();
-
-  dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
-
   utime_t now = ceph_clock_now(cct);
-  
-  //dout(20) << " " << last_scrub_pg << dendl;
+  bool time_permit = scrub_time_permit(now);
+  bool load_is_low = scrub_load_below_threshold();
+  dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
 
-  pair<utime_t, spg_t> pos;
-  if (service.first_scrub_stamp(&pos)) {
+  OSDService::ScrubJob scrub;
+  if (service.first_scrub_stamp(&scrub)) {
     do {
-      utime_t t = pos.first;
-      spg_t pgid = pos.second;
-      dout(30) << "sched_scrub examine " << pgid << " at " << t << dendl;
-
-      utime_t diff = now - t;
-      if ((double)diff < cct->_conf->osd_scrub_min_interval) {
-       dout(10) << "sched_scrub " << pgid << " at " << t
-                << ": " << (double)diff << " < min (" << cct->_conf->osd_scrub_min_interval << " seconds)" << dendl;
-       break;
-      }
-      if ((double)diff < cct->_conf->osd_scrub_max_interval && !load_is_low) {
+      dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;
+
+      if (scrub.sched_time > now) {
        // save ourselves some effort
-       dout(10) << "sched_scrub " << pgid << " high load at " << t
-                << ": " << (double)diff << " < max (" << cct->_conf->osd_scrub_max_interval << " seconds)" << dendl;
+       dout(10) << "sched_scrub " << scrub.pgid << " schedued at " << scrub.sched_time
+                << " > " << now << dendl;
        break;
       }
 
-      PG *pg = _lookup_lock_pg(pgid);
-      if (pg) {
-       if (pg->get_pgbackend()->scrub_supported() && pg->is_active() &&
-           (load_is_low ||
-            (double)diff >= cct->_conf->osd_scrub_max_interval ||
-            pg->scrubber.must_scrub)) {
-         dout(10) << "sched_scrub scrubbing " << pgid << " at " << t
-                  << (pg->scrubber.must_scrub ? ", explicitly requested" :
-                  ( (double)diff >= cct->_conf->osd_scrub_max_interval ? ", diff >= max" : ""))
-                  << dendl;
-         if (pg->sched_scrub()) {
-           pg->unlock();
-           break;
-         }
+      PG *pg = _lookup_lock_pg(scrub.pgid);
+      if (!pg)
+       continue;
+      if (pg->get_pgbackend()->scrub_supported() && pg->is_active() &&
+         (scrub.deadline < now || (time_permit && load_is_low))) {
+       dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
+                << (pg->scrubber.must_scrub ? ", explicitly requested" :
+                    (load_is_low ? ", load_is_low" : " deadline < now"))
+                << dendl;
+       if (pg->sched_scrub()) {
+         pg->unlock();
+         break;
        }
-       pg->unlock();
       }
-    } while  (service.next_scrub_stamp(pos, &pos));
+      pg->unlock();
+    } while (service.next_scrub_stamp(scrub, &scrub));
   }    
   dout(20) << "sched_scrub done" << dendl;
 }
index f5021ef159bb4ee355fbf8fc230f443ae95446dc..d09ae74afecbd2fd370d8860d3f6a580abccd916 100644 (file)
@@ -505,37 +505,51 @@ public:
   Mutex sched_scrub_lock;
   int scrubs_pending;
   int scrubs_active;
-  set< pair<utime_t,spg_t> > last_scrub_pg;
+  struct ScrubJob {
+    /// pg to be scrubbed
+    spg_t pgid;
+    /// a time scheduled for scrub. but the scrub could be delayed if system
+    /// load is too high or it fails to fall in the scrub hours
+    utime_t sched_time;
+    /// the hard upper bound of scrub time
+    utime_t deadline;
+    ScrubJob() {}
+    explicit ScrubJob(const spg_t& pg, const utime_t& timestamp, bool must = true);
+    /// order the jobs by sched_time
+    bool operator<(const ScrubJob& rhs) const;
+  };
+  set<ScrubJob> sched_scrub_pg;
 
-  void reg_last_pg_scrub(spg_t pgid, utime_t t) {
+  /// @returns the scrub_reg_stamp used to unregister the scrub job
+  utime_t reg_pg_scrub(spg_t pgid, utime_t t, bool must) {
+    ScrubJob scrub(pgid, t, must);
     Mutex::Locker l(sched_scrub_lock);
-    last_scrub_pg.insert(pair<utime_t,spg_t>(t, pgid));
+    sched_scrub_pg.insert(scrub);
+    return scrub.sched_time;
   }
-  void unreg_last_pg_scrub(spg_t pgid, utime_t t) {
+  void unreg_pg_scrub(spg_t pgid, utime_t t) {
     Mutex::Locker l(sched_scrub_lock);
-    pair<utime_t,spg_t> p(t, pgid);
-    set<pair<utime_t,spg_t> >::iterator it = last_scrub_pg.find(p);
-    assert(it != last_scrub_pg.end());
-    last_scrub_pg.erase(it);
+    size_t removed = sched_scrub_pg.erase(ScrubJob(pgid, t));
+    assert(removed);
   }
-  bool first_scrub_stamp(pair<utime_t, spg_t> *out) {
+  bool first_scrub_stamp(ScrubJob *out) {
     Mutex::Locker l(sched_scrub_lock);
-    if (last_scrub_pg.empty())
+    if (sched_scrub_pg.empty())
       return false;
-    set< pair<utime_t, spg_t> >::iterator iter = last_scrub_pg.begin();
+    set<ScrubJob>::iterator iter = sched_scrub_pg.begin();
     *out = *iter;
     return true;
   }
-  bool next_scrub_stamp(pair<utime_t, spg_t> next,
-                       pair<utime_t, spg_t> *out) {
+  bool next_scrub_stamp(const ScrubJob& next,
+                       ScrubJob *out) {
     Mutex::Locker l(sched_scrub_lock);
-    if (last_scrub_pg.empty())
+    if (sched_scrub_pg.empty())
       return false;
-    set< pair<utime_t, spg_t> >::iterator iter = last_scrub_pg.lower_bound(next);
-    if (iter == last_scrub_pg.end())
+    set<ScrubJob>::iterator iter = sched_scrub_pg.lower_bound(next);
+    if (iter == sched_scrub_pg.end())
       return false;
     ++iter;
-    if (iter == last_scrub_pg.end())
+    if (iter == sched_scrub_pg.end())
       return false;
     *out = *iter;
     return true;
@@ -2095,7 +2109,7 @@ protected:
   // -- scrubbing --
   void sched_scrub();
   bool scrub_random_backoff();
-  bool scrub_should_schedule();
+  bool scrub_load_below_threshold();
   bool scrub_time_permit(utime_t now);
 
   xlist<PG*> scrub_queue;
index bfe59b79c821626d9ceb0be98c697abf9348e5f7..918938c352d2f9547b418298ae768df6137aec16 100644 (file)
@@ -3307,20 +3307,27 @@ bool PG::sched_scrub()
 
 void PG::reg_next_scrub()
 {
+  if (!is_primary())
+    return;
+
+  utime_t reg_stamp;
   if (scrubber.must_scrub ||
       (info.stats.stats_invalid && g_conf->osd_scrub_invalid_stats)) {
-    scrubber.scrub_reg_stamp = utime_t();
+    reg_stamp = ceph_clock_now(cct);
   } else {
-    scrubber.scrub_reg_stamp = info.history.last_scrub_stamp;
+    reg_stamp = info.history.last_scrub_stamp;
   }
-  if (is_primary())
-    osd->reg_last_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
+  // note down the sched_time, so we can locate this scrub, and remove it
+  // later on.
+  scrubber.scrub_reg_stamp = osd->reg_pg_scrub(info.pgid,
+                                              reg_stamp,
+                                              scrubber.must_scrub);
 }
 
 void PG::unreg_next_scrub()
 {
   if (is_primary())
-    osd->unreg_last_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
+    osd->unreg_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
 }
 
 void PG::sub_op_scrub_map(OpRequestRef op)