]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: Increase priority for inactive PGs backfill 13232/head
authorBartłomiej Święcki <bartlomiej.swiecki@corp.ovh.com>
Fri, 2 Dec 2016 15:54:46 +0000 (16:54 +0100)
committerBartłomiej Święcki <bartlomiej.swiecki@corp.ovh.com>
Fri, 3 Mar 2017 17:03:00 +0000 (18:03 +0100)
This change does prioritize backfill of PGs which don't
have min_size active copies. Such PGs would cause IO stalls
for clients and would increase throttlers usage.

This change also fixes few subtlle out-of-bounds bugs.

Signed-off-by: Bartłomiej Święcki <bartlomiej.swiecki@corp.ovh.com>
(cherry picked from commit 6a76adcdb1f92c136841d960aa7cd4e5b94addec)

Conflicts:
PendingReleaseNotes (removed version number, merged conflicts)

PendingReleaseNotes
src/osd/PG.cc
src/osd/osd_types.h

index 6743d6fed455d7c6a28970eebe8b71d5f3dd2606..0e272deb3e7ec79e601d36a126de709ea903990d 100644 (file)
@@ -5,3 +5,10 @@
   is enabled (it's off by default).  This means that a VM using librbd that
   previously would have gotten an EIO and gone read-only will now see a
   blocked/hung IO instead.
+
+* Calculation of recovery priorities has been updated.
+  This could lead to unintuitive recovery prioritization
+  during cluster upgrade. In case of such recovery, OSDs
+  in old version would operate on different priority ranges
+  than new ones. Once upgraded, cluster will operate on
+  consistent values.
index 27a77e3f72708074098aafa5a84417d7ca0b0496..e4df1fd5de3fac54b2ecd848d2c73cbf2b46e86a 100644 (file)
@@ -2126,18 +2126,31 @@ unsigned PG::get_recovery_priority()
   int pool_recovery_priority = 0;
   pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
 
-  unsigned ret = OSD_RECOVERY_PRIORITY_BASE + pool_recovery_priority;
-  if (ret > OSD_RECOVERY_PRIORITY_MAX)
+  int ret = OSD_RECOVERY_PRIORITY_BASE + pool_recovery_priority;
+
+  // Clamp to valid range
+  if (ret > OSD_RECOVERY_PRIORITY_MAX) {
     ret = OSD_RECOVERY_PRIORITY_MAX;
-  return ret;
+  } else if (ret < OSD_RECOVERY_PRIORITY_MIN) {
+    ret = OSD_RECOVERY_PRIORITY_MIN;
+  }
+
+  static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
+  static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
+
+  return static_cast<unsigned>(ret);
 }
 
 unsigned PG::get_backfill_priority()
 {
   // a higher value -> a higher priority
 
-  unsigned ret = OSD_BACKFILL_PRIORITY_BASE;
-  if (is_undersized()) {
+  int ret = OSD_BACKFILL_PRIORITY_BASE;
+  if (acting.size() < pool.info.min_size) {
+    // inactive: no. of replicas < min_size, highest priority since it blocks IO
+    ret = OSD_BACKFILL_INACTIVE_PRIORITY_BASE + (pool.info.min_size - acting.size());
+
+  } else if (is_undersized()) {
     // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
     assert(pool.info.size > actingset.size());
     ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE + (pool.info.size - actingset.size());
@@ -2146,9 +2159,20 @@ unsigned PG::get_backfill_priority()
     // degraded: baseline degraded
     ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
   }
-  assert (ret < OSD_RECOVERY_PRIORITY_MAX);
 
-  return ret;
+  // Adjust with pool's recovery priority
+  int pool_recovery_priority = 0;
+  pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
+  ret += pool_recovery_priority;
+
+  // Clamp to valid range
+  if (ret > OSD_RECOVERY_PRIORITY_MAX) {
+    ret = OSD_RECOVERY_PRIORITY_MAX;
+  } else if (ret < OSD_RECOVERY_PRIORITY_MIN) {
+    ret = OSD_RECOVERY_PRIORITY_MIN;
+  }
+
+  return static_cast<unsigned>(ret);
 }
 
 void PG::finish_recovery(list<Context*>& tfin)
index 1c5d71db308bea26439589c9aaaae99cf2644b7a..872bc26aa46d2cfb510f8f81a957126777b484f4 100644 (file)
 #define CEPH_OSD_FEATURE_INCOMPAT_PGMETA CompatSet::Feature(13, "pg meta object")
 
 
-/// max recovery priority for MBackfillReserve
-#define OSD_RECOVERY_PRIORITY_MAX 255u
+/// min recovery priority for MBackfillReserve
+#define OSD_RECOVERY_PRIORITY_MIN 0
 
-/// base recovery priority for MBackfillReserve
-#define OSD_RECOVERY_PRIORITY_BASE 230u
+/// base backfill priority for MBackfillReserve
+#define OSD_BACKFILL_PRIORITY_BASE 100
 
 /// base backfill priority for MBackfillReserve (degraded PG)
-#define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 200u
+#define OSD_BACKFILL_DEGRADED_PRIORITY_BASE 140
+
+/// base recovery priority for MBackfillReserve
+#define OSD_RECOVERY_PRIORITY_BASE 180
+
+/// base backfill priority for MBackfillReserve (inactive PG)
+#define OSD_BACKFILL_INACTIVE_PRIORITY_BASE 220
+
+/// max recovery priority for MBackfillReserve
+#define OSD_RECOVERY_PRIORITY_MAX 255
 
-/// base backfill priority for MBackfillReserve
-#define OSD_BACKFILL_PRIORITY_BASE 1u
 
 typedef hobject_t collection_list_handle_t;