PGLog: store extra duplicate ops beyond the normal log entries
author Josh Durgin <jdurgin@redhat.com>
Tue, 18 Apr 2017 22:49:21 +0000 (15:49 -0700)
committer J. Eric Ivancich <ivancich@redhat.com>
Wed, 26 Jul 2017 15:04:44 +0000 (11:04 -0400)
This helps us avoid replaying non-idempotent client operations when
the pg log is very short, e.g. in an effort to force OSDs to use
backfill rather than regular recovery. This can be advantageous to
avoid blocking i/o to objects, at the cost of longer total time to
become clean (since backfill requires scanning the objects to see what
is missing).

Signed-off-by: Josh Durgin <jdurgin@redhat.com>
src/osd/PGLog.cc
src/osd/PGLog.h
src/osd/osd_types.cc
src/osd/osd_types.h
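
For context, the mechanism added here can be pictured outside of Ceph as a small side index of recently trimmed requests: when a log entry is trimmed, its (reqid, version, user_version, return_code) tuple is kept so that a resent, non-idempotent client op can be recognized and answered from the recorded result instead of being re-executed. The sketch below is illustrative only; ToyReqId, ToyDup, and ToyDupTracker are invented names standing in for osd_reqid_t, pg_log_dup_t, and the dups/dup_index members introduced by this commit, not Ceph's API.

// Illustrative sketch of dup-op tracking across log trimming; not Ceph code.
#include <cstdint>
#include <deque>
#include <functional>
#include <optional>
#include <string>
#include <unordered_map>
#include <utility>

struct ToyReqId {
  std::string client;
  uint64_t tid;
  bool operator==(const ToyReqId& o) const {
    return client == o.client && tid == o.tid;
  }
};

struct ToyReqIdHash {
  size_t operator()(const ToyReqId& r) const {
    return std::hash<std::string>()(r.client) ^ (std::hash<uint64_t>()(r.tid) << 1);
  }
};

struct ToyDup {
  uint64_t version;      // position in the log when the op was applied
  uint64_t user_version;
  int return_code;       // the result the client was (or would be) given
};

class ToyDupTracker {
  std::deque<std::pair<ToyReqId, ToyDup>> dups;  // oldest first
  std::unordered_map<ToyReqId, const ToyDup*, ToyReqIdHash> index;
public:
  // Called when a log entry is trimmed: keep just enough to detect a replay.
  void remember(const ToyReqId& r, const ToyDup& d) {
    dups.emplace_back(r, d);
    index[r] = &dups.back().second;  // deque keeps element addresses stable
  }
  // Drop dups older than some horizon, as IndexedLog::trim does for dups.
  void trim_older_than(uint64_t horizon) {
    while (!dups.empty() && dups.front().second.version < horizon) {
      index.erase(dups.front().first);
      dups.pop_front();
    }
  }
  // The dup check: a hit means "already applied, reply with this result".
  std::optional<ToyDup> lookup(const ToyReqId& r) const {
    auto it = index.find(r);
    if (it == index.end()) return std::nullopt;
    return *it->second;
  }
};

In the actual patch the same roles are played by the mempool::osd_pglog::list<pg_log_dup_t> dups list and the dup_index unordered_map keyed by osd_reqid_t, with IndexedLog::trim() bounding how many dups are retained.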

src/osd/PGLog.cc
index a54faafbe2edb816b57127369350fdf3bc2c6df3..cf50db16e6aca1d39e285dde06e230e44e75f4c7 100644
@@ -46,7 +46,8 @@ void PGLog::IndexedLog::split_out_child(
 void PGLog::IndexedLog::trim(
   CephContext* cct,
   eversion_t s,
-  set<eversion_t> *trimmed)
+  set<eversion_t> *trimmed,
+  set<string> *trimmed_dups)
 {
   if (complete_to != log.end() &&
       complete_to->version <= s) {
@@ -67,6 +68,18 @@ void PGLog::IndexedLog::trim(
 
     unindex(e);         // remove from index,
 
+    // add to dup list
+    if (e.version.version + 1000 > s.version) {
+      dirty_dups = true;
+      dups.push_back(pg_log_dup_t(e));
+      dup_index[e.reqid] = &(dups.back());
+      for (const auto& extra : e.extra_reqids) {
+       dups.push_back(pg_log_dup_t(e.version, extra.second,
+                                   extra.first, e.return_code));
+       dup_index[extra.first] = &(dups.back());
+      }
+    }
+
     if (rollback_info_trimmed_to_riter == log.rend() ||
        e.version == rollback_info_trimmed_to_riter->version) {
       log.pop_front();
@@ -76,6 +89,17 @@ void PGLog::IndexedLog::trim(
     }
   }
 
+  while (!dups.empty()) {
+    auto &e = *dups.begin();
+    if (e.version.version + 1000 > s.version)
+      break;
+    generic_dout(20) << "trim dup " << e << dendl;
+    if (trimmed_dups)
+      trimmed_dups->insert(e.get_key_name());
+    dup_index.erase(e.reqid);
+    dups.pop_front();
+  }
+
   // raise tail?
   if (tail < s)
     tail = s;
@@ -124,7 +148,7 @@ void PGLog::trim(
     assert(trim_to <= info.last_complete);
 
     dout(10) << "trim " << log << " to " << trim_to << dendl;
-    log.trim(cct, trim_to, &trimmed);
+    log.trim(cct, trim_to, &trimmed, &trimmed_dups);
     info.log_tail = log.tail;
   }
 }
@@ -446,6 +470,7 @@ void PGLog::write_log_and_missing(
             << ", dirty_from: " << dirty_from
             << ", writeout_from: " << writeout_from
             << ", trimmed: " << trimmed
+            << ", trimmed_dups: " << trimmed_dups
             << ", clear_divergent_priors: " << clear_divergent_priors
             << dendl;
     _write_log_and_missing(
@@ -454,6 +479,7 @@ void PGLog::write_log_and_missing(
       dirty_from,
       writeout_from,
       trimmed,
+      trimmed_dups,
       missing,
       !touched_log,
       require_rollback,
@@ -511,13 +537,14 @@ void PGLog::_write_log_and_missing_wo_missing(
   eversion_t dirty_from,
   eversion_t writeout_from,
   const set<eversion_t> &trimmed,
+  const set<string> &trimmed_dups,
   bool dirty_divergent_priors,
   bool touch_log,
   bool require_rollback,
   set<string> *log_keys_debug
   )
 {
-  set<string> to_remove;
+  set<string> to_remove(trimmed_dups);
   for (set<eversion_t>::const_iterator i = trimmed.begin();
        i != trimmed.end();
        ++i) {
@@ -563,6 +590,18 @@ void PGLog::_write_log_and_missing_wo_missing(
     (*km)[p->get_key_name()].claim(bl);
   }
 
+  if (dirty_dups) {
+    pg_log_dup_t min;
+    t.omap_rmkeyrange(
+      coll, log_oid,
+      min.get_key_name(), log.dups.begin()->get_key_name());
+    for (const auto& entry : log.dups) {
+      bufferlist bl;
+      ::encode(entry, bl);
+      (*km)[entry.get_key_name()].claim(bl);
+    }
+  }
+
   if (log_keys_debug) {
     for (map<string, bufferlist>::iterator i = (*km).begin();
         i != (*km).end();
@@ -600,6 +639,7 @@ void PGLog::_write_log_and_missing(
   eversion_t dirty_from,
   eversion_t writeout_from,
   const set<eversion_t> &trimmed,
+  const set<string> &trimmed_dups,
   const pg_missing_tracker_t &missing,
   bool touch_log,
   bool require_rollback,
@@ -607,7 +647,7 @@ void PGLog::_write_log_and_missing(
   bool *rebuilt_missing_with_deletes, // in/out param
   set<string> *log_keys_debug
   ) {
-  set<string> to_remove;
+  set<string> to_remove(trimmed_dups);
   for (set<eversion_t>::const_iterator i = trimmed.begin();
        i != trimmed.end();
        ++i) {
@@ -652,6 +692,18 @@ void PGLog::_write_log_and_missing(
     (*km)[p->get_key_name()].claim(bl);
   }
 
+  if (dirty_dups) {
+    pg_log_dup_t min;
+    t.omap_rmkeyrange(
+      coll, log_oid,
+      min.get_key_name(), log.dups.begin()->get_key_name());
+    for (const auto& entry : log.dups) {
+      bufferlist bl;
+      ::encode(entry, bl);
+      (*km)[entry.get_key_name()].claim(bl);
+    }
+  }
+
   if (log_keys_debug) {
     for (map<string, bufferlist>::iterator i = (*km).begin();
         i != (*km).end();
src/osd/PGLog.h
index 19aeadfbd06ec4e3cafb7d12c7ea4df480c10132..d8bf60ee9972d2569210af7097541e431e741432 100644
@@ -82,6 +82,7 @@ public:
     mutable ceph::unordered_map<hobject_t,pg_log_entry_t*> objects;  // ptrs into log.  be careful!
     mutable ceph::unordered_map<osd_reqid_t,pg_log_entry_t*> caller_ops;
     mutable ceph::unordered_multimap<osd_reqid_t,pg_log_entry_t*> extra_caller_ops;
+    mutable ceph::unordered_map<osd_reqid_t, pg_log_dup_t*> dup_index;
 
     // recovery pointers
     list<pg_log_entry_t>::iterator complete_to; // not inclusive of referenced item
@@ -398,6 +399,7 @@ public:
       objects.clear();
       caller_ops.clear();
       extra_caller_ops.clear();
+      dup_index.clear();
       indexed_data = 0;
     }
     void unindex(pg_log_entry_t& e) {
@@ -476,7 +478,8 @@ public:
     void trim(
       CephContext* cct,
       eversion_t s,
-      set<eversion_t> *trimmed);
+      set<eversion_t> *trimmed,
+      set<string> *trimmed_dups);
 
     ostream& print(ostream& out) const;
   };
@@ -492,11 +495,13 @@ protected:
   eversion_t dirty_from;       ///< must clear/writeout all keys >= dirty_from
   eversion_t writeout_from;    ///< must writout keys >= writeout_from
   set<eversion_t> trimmed;     ///< must clear keys in trimmed
+  set<string> trimmed_dups; ///< must clear keys in trimmed_dups
   CephContext *cct;
   bool pg_log_debug;
   /// Log is clean on [dirty_to, dirty_from)
   bool touched_log;
   bool clear_divergent_priors;
+  bool dirty_dups; /// log.dups is updated
   bool rebuilt_missing_with_deletes = false;
 
   void mark_dirty_to(eversion_t to) {
@@ -519,6 +524,7 @@ public:
       (writeout_from != eversion_t::max()) ||
       !(trimmed.empty()) ||
       !missing.is_clean() ||
+      !(trimmed_dups.empty()) ||
       rebuilt_missing_with_deletes;
   }
   void mark_log_for_rewrite() {
@@ -554,9 +560,11 @@ protected:
     dirty_from = eversion_t::max();
     touched_log = true;
     trimmed.clear();
+    trimmed_dups.clear();
     writeout_from = eversion_t::max();
     check();
     missing.flush();
+    dirty_dups = false;
   }
 public:
   // cppcheck-suppress noExplicitConstructor
@@ -1111,6 +1119,7 @@ public:
     eversion_t dirty_from,
     eversion_t writeout_from,
     const set<eversion_t> &trimmed,
+    const set<string> &trimmed_dups,
     bool dirty_divergent_priors,
     bool touch_log,
     bool require_rollback,
@@ -1126,6 +1135,7 @@ public:
     eversion_t dirty_from,
     eversion_t writeout_from,
     const set<eversion_t> &trimmed,
+    const set<string> &trimmed_dups,
     const pg_missing_tracker_t &missing,
     bool touch_log,
     bool require_rollback,
@@ -1181,6 +1191,7 @@ public:
     bool has_divergent_priors = false;
     missing.may_include_deletes = false;
     list<pg_log_entry_t> entries;
+    list<pg_log_dup_t> dups;
     if (p) {
       for (p->seek_to_first(); p->valid() ; p->next(false)) {
        // non-log pgmeta_oid keys are prefixed with _; skip those
@@ -1209,6 +1220,13 @@ public:
            assert(missing.may_include_deletes);
          }
          missing.add(oid, item.need, item.have, item.is_delete());
+       } else if (p->key().substr(0, 4) == string("dup_")) {
+         pg_log_dup_t dup;
+         ::decode(dup, bp);
+         if (!dups.empty()) {
+           assert(dups.back().version < dup.version);
+         }
+         dups.push_back(dup);
        } else {
          pg_log_entry_t e;
          e.decode_with_checksum(bp);
@@ -1229,7 +1247,8 @@ public:
       info.log_tail,
       on_disk_can_rollback_to,
       on_disk_rollback_info_trimmed_to,
-      std::move(entries));
+      std::move(entries),
+      std::move(dups));
 
     if (has_divergent_priors || debug_verify_stored_missing) {
       // build missing
src/osd/osd_types.cc
index 645229cd1d57a8f08177fcead29876aa15a75d28..49e6713e8602b5286787f48d9ca06fb11b68b995 100644
@@ -4118,6 +4118,57 @@ ostream& operator<<(ostream& out, const pg_log_entry_t& e)
   return out;
 }
 
+// -- pg_log_dup_t --
+
+string pg_log_dup_t::get_key_name() const
+{
+  return "dup_" + version.get_key_name();
+}
+
+void pg_log_dup_t::encode(bufferlist &bl) const
+{
+  ENCODE_START(1, 1, bl);
+  ::encode(reqid, bl);
+  ::encode(version, bl);
+  ::encode(user_version, bl);
+  ::encode(return_code, bl);
+  ENCODE_FINISH(bl);
+}
+
+void pg_log_dup_t::decode(bufferlist::iterator &bl)
+{
+  DECODE_START(1, bl);
+  ::decode(reqid, bl);
+  ::decode(version, bl);
+  ::decode(user_version, bl);
+  ::decode(return_code, bl);
+  DECODE_FINISH(bl);
+}
+
+void pg_log_dup_t::dump(Formatter *f) const
+{
+  f->dump_stream("reqid") << reqid;
+  f->dump_stream("version") << version;
+  f->dump_stream("user_version") << user_version;
+  f->dump_stream("return_code") << return_code;
+}
+
+void pg_log_dup_t::generate_test_instances(list<pg_log_dup_t*>& o)
+{
+  o.push_back(new pg_log_dup_t());
+  o.push_back(new pg_log_dup_t(eversion_t(1, 2), 1,
+                              osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
+                              0));
+  o.push_back(new pg_log_dup_t(eversion_t(1, 2), 2,
+                              osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
+                              -ENOENT));
+}
+
+ostream& operator<<(ostream& out, const pg_log_dup_t& e)
+{
+  out << e.reqid << " v" << e.version << " uv" << e.user_version
+      << " rc=" << e.return_code;
+  return out;
+}
+
 
 // -- pg_log_t --
 
@@ -4159,18 +4210,19 @@ void pg_log_t::filter_log(spg_t import_pgid, const OSDMap &curmap,
 
 void pg_log_t::encode(bufferlist& bl) const
 {
-  ENCODE_START(6, 3, bl);
+  ENCODE_START(7, 3, bl);
   ::encode(head, bl);
   ::encode(tail, bl);
   ::encode(log, bl);
   ::encode(can_rollback_to, bl);
   ::encode(rollback_info_trimmed_to, bl);
+  ::encode(dups, bl);
   ENCODE_FINISH(bl);
 }
  
 void pg_log_t::decode(bufferlist::iterator &bl, int64_t pool)
 {
-  DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl);
+  DECODE_START_LEGACY_COMPAT_LEN(7, 3, 3, bl);
   ::decode(head, bl);
   ::decode(tail, bl);
   if (struct_v < 2) {
@@ -4185,6 +4237,10 @@ void pg_log_t::decode(bufferlist::iterator &bl, int64_t pool)
     ::decode(rollback_info_trimmed_to, bl);
   else
     rollback_info_trimmed_to = tail;
+
+  if (struct_v >= 7)
+    ::decode(dups, bl);
+
   DECODE_FINISH(bl);
 
   // handle hobject_t format change
@@ -4209,6 +4265,13 @@ void pg_log_t::dump(Formatter *f) const
     f->close_section();
   }
   f->close_section();
+  f->open_array_section("dups");
+  for (const auto& entry : dups) {
+    f->open_object_section("entry");
+    entry.dump(f);
+    f->close_section();
+  }
+  f->close_section();
 }
 
 void pg_log_t::generate_test_instances(list<pg_log_t*>& o)
@@ -4280,13 +4343,16 @@ void pg_log_t::copy_up_to(const pg_log_t &other, int max)
   }
 }
 
-ostream& pg_log_t::print(ostream& out) const 
+ostream& pg_log_t::print(ostream& out) const
 {
   out << *this << std::endl;
   for (list<pg_log_entry_t>::const_iterator p = log.begin();
        p != log.end();
-       ++p) 
+       ++p)
     out << *p << std::endl;
+  for (const auto& entry : dups) {
+    out << " dup entry: " << entry << std::endl;
+  }
   return out;
 }
 
src/osd/osd_types.h
index 62448bbe368db5640777a04500a1f231714f865a..5fdd784fe502f92847eab4306b909095479cbceb 100644
@@ -3405,7 +3405,30 @@ WRITE_CLASS_ENCODER(pg_log_entry_t)
 
 ostream& operator<<(ostream& out, const pg_log_entry_t& e);
 
+struct pg_log_dup_t {
+  osd_reqid_t reqid;  // caller+tid to uniquely identify request
+  eversion_t version;
+  version_t user_version; // the user version for this entry
+  int32_t return_code; // only stored for ERRORs for dup detection
 
+  pg_log_dup_t()
+   : user_version(0), return_code(0) {}
+  explicit pg_log_dup_t(const pg_log_entry_t &entry)
+    : reqid(entry.reqid), version(entry.version),
+      user_version(entry.user_version), return_code(entry.return_code)
+  {}
+  pg_log_dup_t(const eversion_t& v, version_t uv,
+              const osd_reqid_t& rid, int return_code)
+    : reqid(rid), version(v), user_version(uv),
+      return_code(return_code)
+  {}
+  string get_key_name() const;
+  void encode(bufferlist &bl) const;
+  void decode(bufferlist::iterator &bl);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<pg_log_dup_t*>& o);
+};
+WRITE_CLASS_ENCODER(pg_log_dup_t)
 
 /**
  * pg_log_t - incremental log of recent pg changes.
@@ -3432,32 +3455,39 @@ protected:
 
 public:
   mempool::osd_pglog::list<pg_log_entry_t> log;  // the actual log.
+  mempool::osd_pglog::list<pg_log_dup_t> dups;  // entries just for dup op detection
   
   pg_log_t() = default;
   pg_log_t(const eversion_t &last_update,
           const eversion_t &log_tail,
           const eversion_t &can_rollback_to,
           const eversion_t &rollback_info_trimmed_to,
-          mempool::osd_pglog::list<pg_log_entry_t> &&entries)
+          mempool::osd_pglog::list<pg_log_entry_t> &&entries,
+          mempool::osd_pglog::list<pg_log_dup_t> &&dup_entries)
     : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
       rollback_info_trimmed_to(rollback_info_trimmed_to),
-      log(std::move(entries)) {}
+      log(std::move(entries)), dups(std::move(dup_entries)) {}
   pg_log_t(const eversion_t &last_update,
           const eversion_t &log_tail,
           const eversion_t &can_rollback_to,
           const eversion_t &rollback_info_trimmed_to,
-          const std::list<pg_log_entry_t> &entries)
+          const std::list<pg_log_entry_t> &entries,
+          const std::list<pg_log_dup_t> &dup_entries)
     : head(last_update), tail(log_tail), can_rollback_to(can_rollback_to),
       rollback_info_trimmed_to(rollback_info_trimmed_to) {
     for (auto &&entry: entries) {
       log.push_back(entry);
     }
+    for (auto &&entry: dup_entries) {
+      dups.push_back(entry);
+    }
   }
 
   void clear() {
     eversion_t z;
     rollback_info_trimmed_to = can_rollback_to = head = tail = z;
     log.clear();
+    dups.clear();
   }
 
   eversion_t get_rollback_info_trimmed_to() const {
@@ -3485,12 +3515,18 @@ public:
       oldlog.erase(i++);
     }
 
+    // osd_reqid is unique, so it doesn't matter if there are extra
+    // dup entries in each pg. To avoid storing oid with the dup
+    // entries, just copy the whole list.
+    auto childdups(dups);
+
     return pg_log_t(
       head,
       tail,
       can_rollback_to,
       rollback_info_trimmed_to,
-      std::move(childlog));
+      std::move(childlog),
+      std::move(childdups));
   }
 
   mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
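
A note on the on-disk side of this change: dup entries live in the same omap as the regular log-entry keys, distinguished by the "dup_" prefix that pg_log_dup_t::get_key_name() prepends, which is why read_log_and_missing() above can sort keys with a simple prefix check. The sketch below is a simplified, illustrative classifier only; the exact zero-padded eversion_t key format is an assumption here, and the real read path also handles pgmeta, missing, and other special keys not visible in the hunks shown above.

// Simplified illustration of pg log omap key classification; not the real
// read path, which handles additional key types outside these hunks.
#include <string>

enum class PgLogKeyKind { PgMeta, Dup, LogEntry };

inline PgLogKeyKind classify_pg_log_key(const std::string& key) {
  if (!key.empty() && key[0] == '_')
    return PgLogKeyKind::PgMeta;   // non-log pgmeta keys are prefixed with '_'
  if (key.compare(0, 4, "dup_") == 0)
    return PgLogKeyKind::Dup;      // pg_log_dup_t, keyed as "dup_" + eversion key
  return PgLogKeyKind::LogEntry;   // regular pg_log_entry_t keyed by eversion
}

Because all dup keys share this prefix and sort by version within it, the write path above can remove stale dup keys with a single omap_rmkeyrange from the minimum possible dup key up to the oldest retained one.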