git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
pg: add a configurable lower bound on log size
author     Josh Durgin <josh.durgin@dreamhost.com>
           Tue, 10 Jan 2012 22:16:41 +0000 (14:16 -0800)
committer  Josh Durgin <josh.durgin@dreamhost.com>
           Wed, 11 Jan 2012 19:13:36 +0000 (11:13 -0800)
This helps prevent retried requests from being incorrectly detected as
duplicates. The problem occurs when the log is trimmed too aggressively:
an earlier tid is removed from the log while a later one is not. The
later request is then detected as a duplicate and responded to
immediately, possibly violating the ordering of the requests.
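
As a rough sketch of the failure mode, consider a dup check against a
trimmed log. This is a standalone toy, not the OSD code: the tids are
invented and a std::set stands in for the reqids recorded in pg log
entries.

    #include <cstdint>
    #include <iostream>
    #include <set>

    int main() {
      // Toy stand-in for the reqids the pg log remembers: tids 1..5 completed.
      std::set<uint64_t> log = {1, 2, 3, 4, 5};

      // An over-aggressive trim drops the oldest entries.
      log.erase(log.begin());   // tid 1 forgotten
      log.erase(log.begin());   // tid 2 forgotten

      // The client reconnects and resends tids 2 and 3, in order.
      // tid 2 misses the log, so it is treated as new and queued.
      std::cout << "tid 2 dup? " << (log.count(2) ? "yes" : "no") << "\n";  // no
      // tid 3 still hits the log, so it is answered immediately as a
      // duplicate; its reply can overtake tid 2's, inverting the order.
      std::cout << "tid 3 dup? " << (log.count(3) ? "yes" : "no") << "\n";  // yes
      return 0;
    }

Keeping a floor of osd_min_pg_log_entries entries in the log makes this
window much smaller.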

Partially fixes #1490.
Signed-off-by: Josh Durgin <josh.durgin@dreamhost.com>
src/common/config_opts.h
src/osd/PG.h
src/osd/ReplicatedPG.cc

index c70854a0fb843a87a5dfb9b0c565895e6e637dfe..4ea9207f1c1db6202795cfbc82794f3b5a4215f9 100644 (file)
@@ -294,6 +294,7 @@ OPTION(osd_use_stale_snap, OPT_BOOL, false)
 OPTION(osd_rollback_to_cluster_snap, OPT_STR, "")
 OPTION(osd_max_notify_timeout, OPT_U32, 30) // max notify timeout in seconds
 OPTION(osd_kill_backfill_at, OPT_INT, 0)
+OPTION(osd_min_pg_log_entries, OPT_U32, 1000) // minimum number of entries to keep in the pg log when trimming it
 OPTION(filestore, OPT_BOOL, false)
 OPTION(filestore_max_sync_interval, OPT_DOUBLE, 5)    // seconds
 OPTION(filestore_min_sync_interval, OPT_DOUBLE, .01)  // seconds
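
If the default floor of 1000 entries proves too small for a workload,
the new option can be raised in ceph.conf like any other OSD option;
the value below is only an example:

    [osd]
        osd min pg log entries = 2000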
index 761ecd4c6df2518fbce77fc5a78b322c58218eba..2c2bf912b757ecf57d59f98abbca0cc3824e4c4f 100644 (file)
@@ -495,6 +495,10 @@ public:
       return head.version == 0 && head.epoch == 0;
     }
 
+    size_t approx_size() const {
+      return head.version - tail.version;
+    }
+
     list<Entry>::iterator find_entry(eversion_t v) {
       int fromhead = head.version - v.version;
       int fromtail = v.version - tail.version;
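
The helper relies on the version counter advancing once per log entry,
so head minus tail approximates the number of entries in the log. A
standalone illustration, where the struct is a simplified stand-in for
eversion_t rather than the real type:

    #include <cstddef>
    #include <cstdint>
    #include <iostream>

    // Simplified stand-in for eversion_t: just the fields approx_size() reads.
    struct ev { uint64_t epoch, version; };

    // Mirrors the new PG::Log::approx_size(): since the version counter
    // advances by one per log entry, head - tail approximates the count.
    std::size_t approx_size(const ev& head, const ev& tail) {
      return head.version - tail.version;
    }

    int main() {
      ev tail = {4, 100}, head = {6, 1600};
      std::cout << approx_size(head, tail) << "\n";  // prints 1500
      return 0;
    }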
index 13106e410cf8791724eaf4294c9551211f7326d6..502ec33bc580207876de2b8803014b104deda935 100644 (file)
@@ -405,10 +405,24 @@ void ReplicatedPG::calc_trim_to()
       (is_clean() ||
        log.head.version - log.tail.version > (unsigned)info.stats.stats.sum.num_objects)) {
     if (min_last_complete_ondisk != eversion_t() &&
-       min_last_complete_ondisk != pg_trim_to) {
-      dout(10) << "calc_trim_to " << pg_trim_to << " -> " << min_last_complete_ondisk << dendl;
-      pg_trim_to = min_last_complete_ondisk;
+       min_last_complete_ondisk != pg_trim_to &&
+       log.approx_size() > g_conf->osd_min_pg_log_entries) {
+      size_t num_to_trim = log.approx_size() - g_conf->osd_min_pg_log_entries;
+      list<Log::Entry>::const_iterator it = log.log.begin();
+      eversion_t new_trim_to;
+      for (size_t i = 0; i < num_to_trim; ++i) {
+       new_trim_to = it->version;
+       ++it;
+       if (new_trim_to > min_last_complete_ondisk) {
+         new_trim_to = min_last_complete_ondisk;
+         dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
+         break;
+       }
+      }
+      dout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
+      pg_trim_to = new_trim_to;
       assert(pg_trim_to <= log.head);
+      assert(pg_trim_to <= min_last_complete_ondisk);
     }
   } else {
     // don't trim
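
To make the new loop concrete: with osd_min_pg_log_entries = 1000, a log
of roughly 1500 entries, and min_last_complete_ondisk at version 400,
num_to_trim is 500, but the walk stops at 400 because trimming past what
every replica has on disk is never safe. A standalone mock of that
computation (plain integers stand in for Log::Entry versions; the names
are only illustrative):

    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <list>

    int main() {
      const std::size_t min_entries = 1000;   // osd_min_pg_log_entries
      const uint64_t mlcod = 400;             // min_last_complete_ondisk

      std::list<uint64_t> log;                // versions 1..1500
      for (uint64_t v = 1; v <= 1500; ++v)
        log.push_back(v);

      std::size_t num_to_trim = log.size() - min_entries;  // 500
      uint64_t new_trim_to = 0;
      std::list<uint64_t>::const_iterator it = log.begin();
      for (std::size_t i = 0; i < num_to_trim; ++i) {
        new_trim_to = *it;
        ++it;
        if (new_trim_to > mlcod) {   // cap: never trim entries that some
          new_trim_to = mlcod;       // replica has not yet written to disk
          break;
        }
      }
      std::cout << "trim to " << new_trim_to << "\n";  // trim to 400
      return 0;
    }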