git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osd: trim pg logs based on a per-osd budget
author: Sage Weil <sage@redhat.com>
Thu, 16 Jan 2020 17:22:34 +0000 (11:22 -0600)
committer: Kefu Chai <kchai@redhat.com>
Wed, 22 Jan 2020 01:13:00 +0000 (09:13 +0800)
Set the default budget based on the current defaults: 3000 per osd, and a
rule of thumb target of 100 PGs per OSD.  Set the per-PG trim target
by dividing the overall value by the number of PGs on the OSD.

Increase the max pg log length alone, so if the OSD has <100 PGs,
those PGs will get more entries.  Reduce the minimum to be smaller than
the max.  Use the min/max config options to bracket what is allocated to
a single PG.

Signed-off-by: Sage Weil <sage@redhat.com>
src/common/legacy_config_opts.h
src/common/options.cc
src/crimson/osd/pg.cc
src/crimson/osd/pg.h
src/osd/OSD.cc
src/osd/OSD.h
src/osd/PG.cc
src/osd/PG.h
src/osd/PeeringState.cc
src/osd/PeeringState.h

index 875f64dc92a09211425b27d488c9925ba27e24d6..7aff6bedcade74abf9c761c554460fe8632682fb 100644 (file)
@@ -721,6 +721,7 @@ OPTION(osd_kill_backfill_at, OPT_INT)
 // Bounds how infrequently a new map epoch will be persisted for a pg
 OPTION(osd_pg_epoch_persisted_max_stale, OPT_U32) // make this < map_cache_size!
 
+OPTION(osd_target_pg_log_entries_per_osd, OPT_U32)
 OPTION(osd_min_pg_log_entries, OPT_U32)  // number of entries to keep in the pg log when trimming it
 OPTION(osd_max_pg_log_entries, OPT_U32) // max entries, say when degraded, before we trim
 OPTION(osd_pg_log_dups_tracked, OPT_U32) // how many versions back to track combined in both pglog's regular + dup logs
index 51d5d48080f10058b335cd3db486e6e0ed3ede6e..1810dea73702b83929fdc3c655f2d019f976d981 100644 (file)
@@ -3302,15 +3302,21 @@ std::vector<Option> get_global_options() {
     .set_default(40)
     .set_description(""),
 
+    Option("osd_target_pg_log_entries_per_osd", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+    .set_default(3000 * 100)
+    .set_description("target number of PG entries total on an OSD")
+    .add_see_also("osd_max_pg_log_entries")
+    .add_see_also("osd_min_pg_log_entries"),
+
     Option("osd_min_pg_log_entries", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
-    .set_default(3000)
+    .set_default(250)
     .set_description("minimum number of entries to maintain in the PG log")
     .add_service("osd")
     .add_see_also("osd_max_pg_log_entries")
     .add_see_also("osd_pg_log_dups_tracked"),
 
     Option("osd_max_pg_log_entries", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
-    .set_default(3000)
+    .set_default(10000)
     .set_description("maximum number of entries to maintain in the PG log when degraded before we trim")
     .add_service("osd")
     .add_see_also("osd_min_pg_log_entries")
index 9d95ed41b34d862f9292677f642775f58946589a..afacf0163cf46a7d318cf328f10ba87166cf5efa 100644 (file)
@@ -194,6 +194,30 @@ void PG::recheck_readable()
   }
 }
 
+unsigned PG::get_target_pg_log_entries() const
+{
+  const unsigned num_pgs = shard_services.get_pg_num();
+  const unsigned target =
+    local_conf().get_val<uint64_t>("osd_target_pg_log_entries_per_osd");
+  const unsigned min_pg_log_entries =
+    local_conf().get_val<uint64_t>("osd_min_pg_log_entries");
+  if (num_pgs > 0 && target > 0) {
+    // target an even spread of our budgeted log entries across all
+    // PGs.  note that while we only get to control the entry count
+    // for primary PGs, we'll normally be responsible for a mix of
+    // primary and replica PGs (for the same pool(s) even), so this
+    // will work out.
+    const unsigned max_pg_log_entries =
+      local_conf().get_val<uint64_t>("osd_max_pg_log_entries");
+    return std::clamp(target / num_pgs,
+                     min_pg_log_entries,
+                     max_pg_log_entries);
+  } else {
+    // fall back to a per-pg value.
+    return min_pg_log_entries;
+  }
+}
+
 void PG::on_activate(interval_set<snapid_t>)
 {
   projected_last_update = peering_state.get_info().last_update;
index 14787032502eb482c5148807fd4a8c7488d7b578..d303745c544c8ede8ad9e71e3e12512aba0fa8ee 100644 (file)
@@ -240,6 +240,8 @@ public:
                            ceph::timespan delay) final;
   void recheck_readable() final;
 
+  unsigned get_target_pg_log_entries() const final;
+
   void on_pool_change() final {
     // Not needed yet
   }
index 9b1e86e820fbb5362b7659acf6954ccb2ee123af..c7eab55878968c7e6632878568ae1def690024af 100644 (file)
@@ -9400,6 +9400,26 @@ bool OSDService::_recover_now(uint64_t *available_pushes)
   return true;
 }
 
+// Per-PG log-entry budget for the classic OSD: spread the OSD-wide
+// target (osd_target_pg_log_entries_per_osd) evenly across all PGs on
+// this OSD, bracketed by osd_min_pg_log_entries and
+// osd_max_pg_log_entries.  Consumed (via PG::get_target_pg_log_entries)
+// by PeeringState::calc_trim_to() / calc_trim_to_aggressive() to pick
+// the trim target.
+unsigned OSDService::get_target_pg_log_entries() const
+{
+  auto num_pgs = osd->get_num_pgs();
+  auto target = cct->_conf->osd_target_pg_log_entries_per_osd;
+  if (num_pgs > 0 && target > 0) {
+    // target an even spread of our budgeted log entries across all
+    // PGs.  note that while we only get to control the entry count
+    // for primary PGs, we'll normally be responsible for a mix of
+    // primary and replica PGs (for the same pool(s) even), so this
+    // will work out.
+    // max(min(...)) rather than std::clamp: safe (min wins) even if
+    // the configured min exceeds the max.
+    return std::max<unsigned>(
+      std::min<unsigned>(target / num_pgs,
+                        cct->_conf->osd_max_pg_log_entries),
+      cct->_conf->osd_min_pg_log_entries);
+  } else {
+    // fall back to a per-pg value.
+    return cct->_conf->osd_min_pg_log_entries;
+  }
+}
+
 void OSD::do_recovery(
   PG *pg, epoch_t queued, uint64_t reserved_pushes,
   ThreadPool::TPHandle &handle)
index 89e034fb2ac1025d84a33cf3083ddef3a76d9b5c..d0d96f52e37bdf51072144a918c7accdcbd9242a 100644 (file)
@@ -662,6 +662,9 @@ public:
        return awaiting.second.get() == pg;
       });
   }
+
+  unsigned get_target_pg_log_entries() const;
+  
   // delayed pg activation
   void queue_for_recovery(PG *pg) {
     std::lock_guard l(recovery_lock);
index 2019ebe1bd4f038f060490c52aa59e717a7f4dcd..6f5dd858f7250ccb5b33acfe4b42bfde2ff6ba85 100644 (file)
@@ -849,6 +849,11 @@ void PG::publish_stats_to_osd()
   }
 }
 
+// Delegate to OSDService, which divides the OSD-wide log-entry budget
+// evenly across all PGs on this OSD.
+unsigned PG::get_target_pg_log_entries() const
+{
+  return osd->get_target_pg_log_entries();
+}
+
 void PG::clear_publish_stats()
 {
   dout(15) << "clear_stats" << dendl;
index d1d4b124144b759d9d12400f57b6a7dcaf48fc49..fd8905cf8e8424cc5a1b509bffaed3dceccf5c75 100644 (file)
@@ -400,6 +400,7 @@ public:
   uint64_t get_snap_trimq_size() const override {
     return snap_trimq.size();
   }
+  unsigned get_target_pg_log_entries() const override;
 
   void clear_publish_stats() override;
   void clear_primary_state() override;
index 8714a0a64f0719c2bf4711d12f01d27539e0dc13..41c38359b850e09f7bd020cef83fcf97ae8ed9ef 100644 (file)
@@ -4050,7 +4050,7 @@ void PeeringState::calc_trim_to()
                  PG_STATE_BACKFILLING |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILL_TOOFULL)) {
-    target = cct->_conf->osd_max_pg_log_entries;
+    target = pl->get_target_pg_log_entries();
   }
 
   eversion_t limit = std::min(
@@ -4092,7 +4092,7 @@ void PeeringState::calc_trim_to_aggressive()
                 PG_STATE_BACKFILLING |
                 PG_STATE_BACKFILL_WAIT |
                 PG_STATE_BACKFILL_TOOFULL)) {
-    target = cct->_conf->osd_max_pg_log_entries;
+    target = pl->get_target_pg_log_entries();
   }
   // limit pg log trimming up to the can_rollback_to value
   eversion_t limit = std::min(
index be6501786fcb136f3226c2d927666b35b3aa4d23..a9bccea5d4489f2ee45909a963e3d4b2e695ab07 100644 (file)
@@ -281,6 +281,8 @@ public:
     virtual void queue_check_readable(epoch_t lpr, ceph::timespan delay) = 0;
     virtual void recheck_readable() = 0;
 
+    virtual unsigned get_target_pg_log_entries() const = 0;
+
     // ============ Flush state ==================
     /**
      * try_flush_or_schedule_async()