]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
PGLog: continuation, store extra duplicate ops beyond the normal log entries 16172/head
authorJ. Eric Ivancich <ivancich@redhat.com>
Mon, 8 May 2017 20:48:18 +0000 (16:48 -0400)
committerJ. Eric Ivancich <ivancich@redhat.com>
Thu, 27 Jul 2017 15:09:04 +0000 (11:09 -0400)
This helps us avoid replaying non-idempotent client operations when
the pg log is very short, e.g. in an effort to force OSDs to use
backfill rather than regular recovery. This can be advantageous to
avoid blocking i/o to objects, at the cost of longer total time to
become clean (since backfill requires scanning the objects to see what
is missing).

Signed-off-by: J. Eric Ivancich <ivancich@redhat.com>
src/common/legacy_config_opts.h
src/common/options.cc
src/osd/PG.cc
src/osd/PGLog.cc
src/osd/PGLog.h
src/osd/osd_types.cc
src/osd/osd_types.h
src/test/osd/TestPGLog.cc
src/tools/ceph_objectstore_tool.cc

index c0b0192d98a5b8313917dc2993d9922c76cbd327..c70eb83c73d3ede48a16d3d4b800f61a10b48d37 100644 (file)
@@ -815,6 +815,7 @@ OPTION(osd_pg_epoch_persisted_max_stale, OPT_U32) // make this < map_cache_size!
 
 OPTION(osd_min_pg_log_entries, OPT_U32)  // number of entries to keep in the pg log when trimming it
 OPTION(osd_max_pg_log_entries, OPT_U32) // max entries, say when degraded, before we trim
+OPTION(osd_pg_log_dups_tracked, OPT_U32) // how many versions back to track combined in both pglog's regular + dup logs
 OPTION(osd_force_recovery_pg_log_entries_factor, OPT_FLOAT) // max entries factor before force recovery
 OPTION(osd_pg_log_trim_min, OPT_U32)
 OPTION(osd_op_complaint_time, OPT_FLOAT) // how many seconds old makes an op complaint-worthy
index ed7dc0c9666cf3d44c5d23cc5aa24e6abdd61375..b4229d6306b8cbade7a8d16a8b4deea425dc3002 100644 (file)
@@ -2248,12 +2248,25 @@ std::vector<Option> global_options = {
   .set_description(""),
 
   Option("osd_min_pg_log_entries", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
-  .set_default(3000)
-  .set_description(""),
+  .set_default(1500)
+  .set_description("minimum number of entries to maintain in the PG log")
+  .add_service("osd")
+  .add_see_also("osd_max_pg_log_entries")
+  .add_see_also("osd_pg_log_dups_tracked"),
 
   Option("osd_max_pg_log_entries", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
   .set_default(10000)
-  .set_description(""),
+  .set_description("maximum number of entries to maintain in the PG log when degraded before we trim")
+  .add_service("osd")
+  .add_see_also("osd_min_pg_log_entries")
+  .add_see_also("osd_pg_log_dups_tracked"),
+
+  Option("osd_pg_log_dups_tracked", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
+  .set_default(3000)
+  .set_description("how many versions back to track in order to detect duplicate ops; this is combined with both the regular pg log entries and additional minimal dup detection entries")
+  .add_service("osd")
+  .add_see_also("osd_min_pg_log_entries")
+  .add_see_also("osd_max_pg_log_entries"),
 
   Option("osd_force_recovery_pg_log_entries_factor", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
   .set_default(1.3)
index 5153d5d8f32d45fb9bb09287d31132d9500b0803..f85574d2ee9fd6ac894e9960484cbcc5cd49dc32 100644 (file)
@@ -225,6 +225,7 @@ void PG::dump_live_ids()
 }
 #endif
 
+
 void PGPool::update(OSDMapRef map)
 {
   const pg_pool_t *pi = map->get_pg_pool(id);
@@ -296,7 +297,8 @@ PG::PG(OSDService *o, OSDMapRef curmap,
   dirty_info(false), dirty_big_info(false),
   info(p),
   info_struct_v(0),
-  coll(p), pg_log(cct),
+  coll(p),
+  pg_log(cct),
   pgmeta_oid(p.make_pgmeta_oid()),
   missing_loc(this),
   past_intervals(
@@ -3186,7 +3188,7 @@ void PG::append_log(
   auto last = logv.rbegin();
   if (is_primary() && last != logv.rend()) {
     projected_log.skip_can_rollback_to_to_head();
-    projected_log.trim(cct, last->version, nullptr);
+    projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
   }
 
   if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
index cf50db16e6aca1d39e285dde06e230e44e75f4c7..7b086eb30a08999f2aae2ed47e0058c414abdafe 100644 (file)
@@ -1,4 +1,4 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
 /*
  * Ceph - scalable distributed file system
@@ -10,9 +10,9 @@
  *
  * This is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software 
+ * License version 2.1, as published by the Free Software
  * Foundation.  See file COPYING.
- * 
+ *
  */
 
 #include "PGLog.h"
@@ -37,7 +37,7 @@ void PGLog::IndexedLog::split_out_child(
   PGLog::IndexedLog *target)
 {
   unindex();
-  *target = pg_log_t::split_out_child(child_pgid, split_bits);
+  *target = IndexedLog(pg_log_t::split_out_child(child_pgid, split_bits));
   index();
   target->index();
   reset_rollback_info_trimmed_to_riter();
@@ -47,7 +47,8 @@ void PGLog::IndexedLog::trim(
   CephContext* cct,
   eversion_t s,
   set<eversion_t> *trimmed,
-  set<string> *trimmed_dups)
+  set<string>* trimmed_dups,
+  bool* dirty_dups)
 {
   if (complete_to != log.end() &&
       complete_to->version <= s) {
@@ -58,8 +59,13 @@ void PGLog::IndexedLog::trim(
 
   assert(s <= can_rollback_to);
 
+  auto earliest_dup_version =
+    log.rbegin()->version.version < cct->_conf->osd_pg_log_dups_tracked
+    ? 0u
+    : log.rbegin()->version.version - cct->_conf->osd_pg_log_dups_tracked;
+
   while (!log.empty()) {
-    pg_log_entry_t &e = *log.begin();
+    const pg_log_entry_t &e = *log.begin();
     if (e.version > s)
       break;
     generic_dout(20) << "trim " << e << dendl;
@@ -69,14 +75,15 @@ void PGLog::IndexedLog::trim(
     unindex(e);         // remove from index,
 
     // add to dup list
-    if (e.version.version + 1000 > s.version) {
-      dirty_dups = true;
+    if (e.version.version >= earliest_dup_version) {
+      if (dirty_dups) *dirty_dups = true;
       dups.push_back(pg_log_dup_t(e));
-      dup_index[e.reqid] = &(dups.back());
+      index(dups.back());
       for (const auto& extra : e.extra_reqids) {
+       // note: extras have the same version as outer op
        dups.push_back(pg_log_dup_t(e.version, extra.second,
                                    extra.first, e.return_code));
-       dup_index[extra->first] = &(dups.back());
+       index(dups.back());
       }
     }
 
@@ -90,13 +97,15 @@ void PGLog::IndexedLog::trim(
   }
 
   while (!dups.empty()) {
-    auto &e = *dups.begin();
-    if (e.version.version + 1000 > s.version)
+    const auto& e = *dups.begin();
+    if (e.version.version >= earliest_dup_version)
       break;
     generic_dout(20) << "trim dup " << e << dendl;
     if (trimmed_dups)
       trimmed_dups->insert(e.get_key_name());
-    dup_index.erase(e.reqid);
+    if (indexed_data & PGLOG_INDEXED_DUPS) {
+      dup_index.erase(e.reqid);
+    }
     dups.pop_front();
   }
 
@@ -111,9 +120,18 @@ ostream& PGLog::IndexedLog::print(ostream& out) const
   for (list<pg_log_entry_t>::const_iterator p = log.begin();
        p != log.end();
        ++p) {
-    out << *p << " " << (logged_object(p->soid) ? "indexed":"NOT INDEXED") << std::endl;
+    out << *p << " " <<
+      (logged_object(p->soid) ? "indexed" : "NOT INDEXED") <<
+      std::endl;
     assert(!p->reqid_is_indexed() || logged_req(p->reqid));
   }
+
+  for (list<pg_log_dup_t>::const_iterator p = dups.begin();
+       p != dups.end();
+       ++p) {
+    out << *p << std::endl;
+  }
+
   return out;
 }
 
@@ -148,7 +166,7 @@ void PGLog::trim(
     assert(trim_to <= info.last_complete);
 
     dout(10) << "trim " << log << " to " << trim_to << dendl;
-    log.trim(cct, trim_to, &trimmed, &trimmed_dups);
+    log.trim(cct, trim_to, &trimmed, &trimmed_dups, &dirty_dups);
     info.log_tail = log.tail;
   }
 }
@@ -254,7 +272,7 @@ void PGLog::proc_replica_log(
   } else {
     oinfo.last_complete = oinfo.last_update;
   }
-}
+} // proc_replica_log
 
 /**
  * rewind divergent entries at the head of the log
@@ -268,7 +286,8 @@ void PGLog::rewind_divergent_log(eversion_t newhead,
                                 pg_info_t &info, LogEntryHandler *rollbacker,
                                 bool &dirty_info, bool &dirty_big_info)
 {
-  dout(10) << "rewind_divergent_log truncate divergent future " << newhead << dendl;
+  dout(10) << "rewind_divergent_log truncate divergent future " <<
+    newhead << dendl;
 
 
   if (info.last_complete > newhead)
@@ -342,7 +361,7 @@ void PGLog::merge_log(pg_info_t &oinfo, pg_log_t &olog, pg_shard_t fromosd,
     // splice into our log.
     log.log.splice(log.log.begin(),
                   olog.log, from, to);
-      
+
     info.log_tail = log.tail = olog.tail;
     changed = true;
   }
@@ -365,7 +384,7 @@ void PGLog::merge_log(pg_info_t &oinfo, pg_log_t &olog, pg_shard_t fromosd,
   // extend on head?
   if (olog.head > log.head) {
     dout(10) << "merge_log extending head to " << olog.head << dendl;
-      
+
     // find start point in olog
     list<pg_log_entry_t>::iterator to = olog.log.end();
     list<pg_log_entry_t>::iterator from = olog.log.end();
@@ -423,8 +442,15 @@ void PGLog::merge_log(pg_info_t &oinfo, pg_log_t &olog, pg_shard_t fromosd,
 
     changed = true;
   }
-  
-  dout(10) << "merge_log result " << log << " " << missing << " changed=" << changed << dendl;
+
+  // now handle dups
+  if (merge_log_dups(olog)) {
+    dirty_dups = true;
+    changed = true;
+  }
+
+  dout(10) << "merge_log result " << log << " " << missing <<
+    " changed=" << changed << dendl;
 
   if (changed) {
     dirty_info = true;
@@ -432,6 +458,81 @@ void PGLog::merge_log(pg_info_t &oinfo, pg_log_t &olog, pg_shard_t fromosd,
   }
 }
 
+
+// returns true if any changes were made to log.dups
+bool PGLog::merge_log_dups(const pg_log_t& olog) {
+  bool changed = false;
+
+  if (!olog.dups.empty()) {
+    if (log.dups.empty()) {
+      dout(10) << "merge_log copying olog dups to log " <<
+       olog.dups.front().version << " to " <<
+       olog.dups.back().version << dendl;
+      changed = true;
+      // since our log.dups is empty just copy them
+      for (const auto& i : olog.dups) {
+       log.dups.push_back(i);
+       log.index(log.dups.back());
+      }
+    } else {
+      // since our log.dups is not empty try to extend on each end
+
+      if (olog.dups.back().version > log.dups.back().version) {
+       // extend the dups's tail (i.e., newer dups)
+       dout(10) << "merge_log extending dups tail to " <<
+         olog.dups.back().version << dendl;
+       changed = true;
+
+       auto log_tail_version = log.dups.back().version;
+
+       auto insert_cursor = log.dups.end();
+       for (auto i = olog.dups.crbegin(); i != olog.dups.crend(); ++i) {
+         if (i->version <= log_tail_version) break;
+         log.dups.insert(insert_cursor, *i);
+
+         auto prev = insert_cursor;
+         --prev;
+         // be sure to pass reference of copy in log.dups
+         log.index(*prev);
+
+         --insert_cursor; // make sure we insert in reverse order
+       }
+      }
+
+      if (olog.dups.front().version < log.dups.front().version) {
+       // extend the dups's head (i.e., older dups)
+       dout(10) << "merge_log extending dups head to " <<
+         olog.dups.front().version << dendl;
+       changed = true;
+
+       auto insert_cursor = log.dups.begin();
+       for (auto i = olog.dups.cbegin(); i != olog.dups.cend(); ++i) {
+         if (i->version >= insert_cursor->version) break;
+         log.dups.insert(insert_cursor, *i);
+         auto prev = insert_cursor;
+         --prev;
+         // be sure to pass address of copy in log.dups
+         log.index(*prev);
+       }
+      }
+    }
+  }
+
+  // remove any dup entries that overlap with pglog
+  if (!log.dups.empty() && log.dups.back().version >= log.tail) {
+    dout(10) << "merge_log removed dups overlapping log entries [" <<
+      log.tail << "," << log.dups.back().version << "]" << dendl;
+    changed = true;
+
+    while (!log.dups.empty() && log.dups.back().version >= log.tail) {
+      log.unindex(log.dups.back());
+      log.dups.pop_back();
+    }
+  }
+
+  return changed;
+}
+
 void PGLog::check() {
   if (!pg_log_debug)
     return;
@@ -458,10 +559,12 @@ void PGLog::check() {
   }
 }
 
+// non-static
 void PGLog::write_log_and_missing(
   ObjectStore::Transaction& t,
   map<string,bufferlist> *km,
-  const coll_t& coll, const ghobject_t &log_oid,
+  const coll_t& coll,
+  const ghobject_t &log_oid,
   bool require_rollback)
 {
   if (is_dirty()) {
@@ -484,29 +587,34 @@ void PGLog::write_log_and_missing(
       !touched_log,
       require_rollback,
       clear_divergent_priors,
+      dirty_dups,
       &rebuilt_missing_with_deletes,
-      (pg_log_debug ? &log_keys_debug : 0));
+      (pg_log_debug ? &log_keys_debug : nullptr));
     undirty();
   } else {
     dout(10) << "log is not dirty" << dendl;
   }
 }
 
+// static
 void PGLog::write_log_and_missing_wo_missing(
     ObjectStore::Transaction& t,
     map<string,bufferlist> *km,
     pg_log_t &log,
     const coll_t& coll, const ghobject_t &log_oid,
     map<eversion_t, hobject_t> &divergent_priors,
-    bool require_rollback)
+    bool require_rollback,
+    bool dirty_dups)
 {
   _write_log_and_missing_wo_missing(
     t, km, log, coll, log_oid,
     divergent_priors, eversion_t::max(), eversion_t(), eversion_t(),
     set<eversion_t>(),
-    true, true, require_rollback, 0);
+    set<string>(),
+    true, true, require_rollback, dirty_dups, nullptr);
 }
 
+// static
 void PGLog::write_log_and_missing(
     ObjectStore::Transaction& t,
     map<string,bufferlist> *km,
@@ -515,6 +623,7 @@ void PGLog::write_log_and_missing(
     const ghobject_t &log_oid,
     const pg_missing_tracker_t &missing,
     bool require_rollback,
+    bool dirty_dups,
     bool *rebuilt_missing_with_deletes)
 {
   _write_log_and_missing(
@@ -523,10 +632,12 @@ void PGLog::write_log_and_missing(
     eversion_t(),
     eversion_t(),
     set<eversion_t>(),
+    set<string>(),
     missing,
-    true, require_rollback, false, rebuilt_missing_with_deletes, 0);
+    true, require_rollback, false, dirty_dups, rebuilt_missing_with_deletes, nullptr);
 }
 
+// static
 void PGLog::_write_log_and_missing_wo_missing(
   ObjectStore::Transaction& t,
   map<string,bufferlist> *km,
@@ -541,6 +652,7 @@ void PGLog::_write_log_and_missing_wo_missing(
   bool dirty_divergent_priors,
   bool touch_log,
   bool require_rollback,
+  bool dirty_dups,
   set<string> *log_keys_debug
   )
 {
@@ -555,7 +667,7 @@ void PGLog::_write_log_and_missing_wo_missing(
     }
   }
 
-//dout(10) << "write_log_and_missing, clearing up to " << dirty_to << dendl;
+  // dout(10) << "write_log_and_missing, clearing up to " << dirty_to << dendl;
   if (touch_log)
     t.touch(coll, log_oid);
   if (dirty_to != eversion_t()) {
@@ -565,7 +677,7 @@ void PGLog::_write_log_and_missing_wo_missing(
     clear_up_to(log_keys_debug, dirty_to.get_key_name());
   }
   if (dirty_to != eversion_t::max() && dirty_from != eversion_t::max()) {
-    //   dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl;
+    // dout(10) << "write_log_and_missing, clearing from " << dirty_from << dendl;
     t.omap_rmkeyrange(
       coll, log_oid,
       dirty_from.get_key_name(), eversion_t::max().get_key_name());
@@ -590,6 +702,19 @@ void PGLog::_write_log_and_missing_wo_missing(
     (*km)[p->get_key_name()].claim(bl);
   }
 
+  if (log_keys_debug) {
+    for (map<string, bufferlist>::iterator i = (*km).begin();
+        i != (*km).end();
+        ++i) {
+      if (i->first[0] == '_')
+       continue;
+      assert(!log_keys_debug->count(i->first));
+      log_keys_debug->insert(i->first);
+    }
+  }
+
+  // process dirty_dups after log_keys_debug is filled, so dups do not
+  // end up in that set
   if (dirty_dups) {
     pg_log_dup_t min;
     t.omap_rmkeyrange(
@@ -602,17 +727,6 @@ void PGLog::_write_log_and_missing_wo_missing(
     }
   }
 
-  if (log_keys_debug) {
-    for (map<string, bufferlist>::iterator i = (*km).begin();
-        i != (*km).end();
-        ++i) {
-      if (i->first[0] == '_')
-       continue;
-      assert(!log_keys_debug->count(i->first));
-      log_keys_debug->insert(i->first);
-    }
-  }
-
   if (dirty_divergent_priors) {
     //dout(10) << "write_log_and_missing: writing divergent_priors" << dendl;
     ::encode(divergent_priors, (*km)["divergent_priors"]);
@@ -630,6 +744,7 @@ void PGLog::_write_log_and_missing_wo_missing(
     t.omap_rmkeys(coll, log_oid, to_remove);
 }
 
+// static
 void PGLog::_write_log_and_missing(
   ObjectStore::Transaction& t,
   map<string,bufferlist>* km,
@@ -644,6 +759,7 @@ void PGLog::_write_log_and_missing(
   bool touch_log,
   bool require_rollback,
   bool clear_divergent_priors,
+  bool dirty_dups,
   bool *rebuilt_missing_with_deletes, // in/out param
   set<string> *log_keys_debug
   ) {
@@ -692,6 +808,19 @@ void PGLog::_write_log_and_missing(
     (*km)[p->get_key_name()].claim(bl);
   }
 
+  if (log_keys_debug) {
+    for (map<string, bufferlist>::iterator i = (*km).begin();
+        i != (*km).end();
+        ++i) {
+      if (i->first[0] == '_')
+       continue;
+      assert(!log_keys_debug->count(i->first));
+      log_keys_debug->insert(i->first);
+    }
+  }
+
+  // process dirty_dups after log_keys_debug is filled, so dups do not
+  // end up in that set
   if (dirty_dups) {
     pg_log_dup_t min;
     t.omap_rmkeyrange(
@@ -704,17 +833,6 @@ void PGLog::_write_log_and_missing(
     }
   }
 
-  if (log_keys_debug) {
-    for (map<string, bufferlist>::iterator i = (*km).begin();
-        i != (*km).end();
-        ++i) {
-      if (i->first[0] == '_')
-       continue;
-      assert(!log_keys_debug->count(i->first));
-      log_keys_debug->insert(i->first);
-    }
-  }
-
   if (clear_divergent_priors) {
     //dout(10) << "write_log_and_missing: writing divergent_priors" << dendl;
     to_remove.insert("divergent_priors");
index d8bf60ee9972d2569210af7097541e431e741432..19405de25be67aac67edb054f9a7c6d9ecf94f2e 100644 (file)
@@ -14,8 +14,7 @@
  * Foundation.  See file COPYING.
  * 
  */
-#ifndef CEPH_PG_LOG_H
-#define CEPH_PG_LOG_H
+#pragma once
 
 // re-include our assert to clobber boost's
 #include "include/assert.h"
@@ -27,7 +26,11 @@ using namespace std;
 #define PGLOG_INDEXED_OBJECTS          (1 << 0)
 #define PGLOG_INDEXED_CALLER_OPS       (1 << 1)
 #define PGLOG_INDEXED_EXTRA_CALLER_OPS (1 << 2)
-#define PGLOG_INDEXED_ALL              (PGLOG_INDEXED_OBJECTS | PGLOG_INDEXED_CALLER_OPS | PGLOG_INDEXED_EXTRA_CALLER_OPS)
+#define PGLOG_INDEXED_DUPS             (1 << 3)
+#define PGLOG_INDEXED_ALL              (PGLOG_INDEXED_OBJECTS | \
+                                       PGLOG_INDEXED_CALLER_OPS | \
+                                       PGLOG_INDEXED_EXTRA_CALLER_OPS | \
+                                       PGLOG_INDEXED_DUPS)
 
 class CephContext;
 
@@ -82,7 +85,7 @@ public:
     mutable ceph::unordered_map<hobject_t,pg_log_entry_t*> objects;  // ptrs into log.  be careful!
     mutable ceph::unordered_map<osd_reqid_t,pg_log_entry_t*> caller_ops;
     mutable ceph::unordered_multimap<osd_reqid_t,pg_log_entry_t*> extra_caller_ops;
-    mutable ceph::unordered_map<osd_reqid_t, pg_log_dup_t*> dup_index;
+    mutable ceph::unordered_map<osd_reqid_t,pg_log_dup_t*> dup_index;
 
     // recovery pointers
     list<pg_log_entry_t>::iterator complete_to; // not inclusive of referenced item
@@ -133,7 +136,7 @@ public:
       last_requested(0),
       indexed_data(0),
       rollback_info_trimmed_to_riter(log.rbegin())
-      {}
+    }
 
     template <typename... Args>
     IndexedLog(Args&&... args) :
@@ -141,7 +144,8 @@ public:
       complete_to(log.end()),
       last_requested(0),
       indexed_data(0),
-      rollback_info_trimmed_to_riter(log.rbegin()) {
+      rollback_info_trimmed_to_riter(log.rbegin())
+    {
       reset_rollback_info_trimmed_to_riter();
       index();
     }
@@ -151,10 +155,12 @@ public:
       complete_to(log.end()),
       last_requested(rhs.last_requested),
       indexed_data(0),
-      rollback_info_trimmed_to_riter(log.rbegin()) {
+      rollback_info_trimmed_to_riter(log.rbegin())
+    {
       reset_rollback_info_trimmed_to_riter();
       index(rhs.indexed_data);
     }
+
     IndexedLog &operator=(const IndexedLog &rhs) {
       this->~IndexedLog();
       new (this) IndexedLog(rhs);
@@ -262,7 +268,8 @@ public:
       const osd_reqid_t &r,
       eversion_t *version,
       version_t *user_version,
-      int *return_code) const {
+      int *return_code) const
+    {
       assert(version);
       assert(user_version);
       assert(return_code);
@@ -297,6 +304,18 @@ public:
        }
        assert(0 == "in extra_caller_ops but not extra_reqids");
       }
+
+      if (!(indexed_data & PGLOG_INDEXED_DUPS)) {
+        index_dups();
+      }
+      auto q = dup_index.find(r);
+      if (q != dup_index.end()) {
+       *version = q->second->version;
+       *user_version = q->second->user_version;
+       *return_code = q->second->return_code;
+       return true;
+      }
+
       return false;
     }
 
@@ -326,40 +345,58 @@ public:
        }
       }
     }
-    
+
     void index(__u16 to_index = PGLOG_INDEXED_ALL) const {
+      // if to_index is 0, no need to run any of this code, especially
+      // loop below; this can happen with copy constructor for
+      // IndexedLog (and indirectly through assignment operator)
+      if (!to_index) return;
+
       if (to_index & PGLOG_INDEXED_OBJECTS)
        objects.clear();
       if (to_index & PGLOG_INDEXED_CALLER_OPS)
        caller_ops.clear();
       if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS)
        extra_caller_ops.clear();
+      if (to_index & PGLOG_INDEXED_DUPS) {
+       dup_index.clear();
+       for (auto& i : dups) {
+         dup_index[i.reqid] = const_cast<pg_log_dup_t*>(&i);
+       }
+      }
 
-      for (list<pg_log_entry_t>::const_iterator i = log.begin();
-          i != log.end();
-          ++i) {
-       if (to_index & PGLOG_INDEXED_OBJECTS) {
-         if (i->object_is_indexed()) {
-           objects[i->soid] = const_cast<pg_log_entry_t*>(&(*i));
+      constexpr __u16 any_log_entry_index =
+       PGLOG_INDEXED_OBJECTS |
+       PGLOG_INDEXED_CALLER_OPS |
+       PGLOG_INDEXED_EXTRA_CALLER_OPS;
+
+      if (to_index & any_log_entry_index) {
+       for (list<pg_log_entry_t>::const_iterator i = log.begin();
+            i != log.end();
+            ++i) {
+         if (to_index & PGLOG_INDEXED_OBJECTS) {
+           if (i->object_is_indexed()) {
+             objects[i->soid] = const_cast<pg_log_entry_t*>(&(*i));
+           }
          }
-       }
 
-       if (to_index & PGLOG_INDEXED_CALLER_OPS) {
-         if (i->reqid_is_indexed()) {
-           caller_ops[i->reqid] = const_cast<pg_log_entry_t*>(&(*i));
+         if (to_index & PGLOG_INDEXED_CALLER_OPS) {
+           if (i->reqid_is_indexed()) {
+             caller_ops[i->reqid] = const_cast<pg_log_entry_t*>(&(*i));
+           }
          }
-       }
-        
-       if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
-         for (auto j = i->extra_reqids.begin();
-              j != i->extra_reqids.end();
-              ++j) {
-            extra_caller_ops.insert(
-             make_pair(j->first, const_cast<pg_log_entry_t*>(&(*i))));
+
+         if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
+           for (auto j = i->extra_reqids.begin();
+                j != i->extra_reqids.end();
+                ++j) {
+             extra_caller_ops.insert(
+               make_pair(j->first, const_cast<pg_log_entry_t*>(&(*i))));
+           }
          }
        }
       }
-        
+
       indexed_data |= to_index;
     }
 
@@ -375,6 +412,10 @@ public:
       index(PGLOG_INDEXED_EXTRA_CALLER_OPS);
     }
 
+    void index_dups() const {
+      index(PGLOG_INDEXED_DUPS);
+    }
+
     void index(pg_log_entry_t& e) {
       if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) {
         if (objects.count(e.soid) == 0 ||
@@ -395,6 +436,7 @@ public:
         }
       }
     }
+
     void unindex() {
       objects.clear();
       caller_ops.clear();
@@ -402,7 +444,8 @@ public:
       dup_index.clear();
       indexed_data = 0;
     }
-    void unindex(pg_log_entry_t& e) {
+
+    void unindex(const pg_log_entry_t& e) {
       // NOTE: this only works if we remove from the _tail_ of the log!
       if (indexed_data & PGLOG_INDEXED_OBJECTS) {
         if (objects.count(e.soid) && objects[e.soid]->version == e.version)
@@ -412,7 +455,7 @@ public:
         if (indexed_data & PGLOG_INDEXED_CALLER_OPS) {
          // divergent merge_log indexes new before unindexing old
           if (caller_ops.count(e.reqid) && caller_ops[e.reqid] == &e)
-            caller_ops.erase(e.reqid);    
+            caller_ops.erase(e.reqid);
         }
       }
       if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
@@ -432,6 +475,21 @@ public:
       }
     }
 
+    void index(pg_log_dup_t& e) {
+      if (PGLOG_INDEXED_DUPS) {
+       dup_index[e.reqid] = &e;
+      }
+    }
+
+    void unindex(const pg_log_dup_t& e) {
+      if (PGLOG_INDEXED_DUPS) {
+       auto i = dup_index.find(e.reqid);
+       if (i != dup_index.end()) {
+         dup_index.erase(i);
+       }
+      }
+    }
+
     // actors
     void add(const pg_log_entry_t& e, bool applied = true) {
       if (!applied) {
@@ -461,7 +519,7 @@ public:
          caller_ops[e.reqid] = &(log.back());
         }
       }
-      
+
       if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) {
         for (auto j = e.extra_reqids.begin();
             j != e.extra_reqids.end();
@@ -473,16 +531,17 @@ public:
       if (!applied) {
        skip_can_rollback_to_to_head();
       }
-    }
+    } // add
 
     void trim(
       CephContext* cct,
       eversion_t s,
       set<eversion_t> *trimmed,
-      set<string> *trimmed_dups);
+      set<string>* trimmed_dups,
+      bool* dirty_dups);
 
     ostream& print(ostream& out) const;
-  };
+  }; // IndexedLog
 
 
 protected:
@@ -495,7 +554,7 @@ protected:
   eversion_t dirty_from;       ///< must clear/writeout all keys >= dirty_from
   eversion_t writeout_from;    ///< must writout keys >= writeout_from
   set<eversion_t> trimmed;     ///< must clear keys in trimmed
-  set<string> trimmed_dups; ///< must clear keys in trimmed_dups
+  set<string> trimmed_dups;    ///< must clear keys in trimmed_dups
   CephContext *cct;
   bool pg_log_debug;
   /// Log is clean on [dirty_to, dirty_from)
@@ -525,6 +584,7 @@ public:
       !(trimmed.empty()) ||
       !missing.is_clean() ||
       !(trimmed_dups.empty()) ||
+      dirty_dups ||
       rebuilt_missing_with_deletes;
   }
   void mark_log_for_rewrite() {
@@ -567,16 +627,18 @@ protected:
     dirty_dups = false;
   }
 public:
+
   // cppcheck-suppress noExplicitConstructor
-  PGLog(CephContext *cct, DoutPrefixProvider *dpp = 0) :
+  PGLog(CephContext *cct, DoutPrefixProvider *dpp = nullptr) :
     prefix_provider(dpp),
     dirty_from(eversion_t::max()),
     writeout_from(eversion_t::max()),
     cct(cct),
     pg_log_debug(!(cct && !(cct->_conf->osd_debug_pg_log_writeout))),
     touched_log(false),
-    clear_divergent_priors(false) {}
-
+    clear_divergent_priors(false),
+    dirty_dups(false)
+  { }
 
   void reset_backfill();
 
@@ -658,7 +720,7 @@ public:
   void split_into(
       pg_t child_pgid,
       unsigned split_bits,
-      PGLog *opg_log) { 
+      PGLog *opg_log) {
     log.split_out_child(child_pgid, split_bits, &opg_log->log);
     missing.split_into(child_pgid, split_bits, &(opg_log->missing));
     opg_log->mark_dirty_to(eversion_t::max());
@@ -670,7 +732,7 @@ public:
   void recover_got(hobject_t oid, eversion_t v, pg_info_t &info) {
     if (missing.is_missing(oid, v)) {
       missing.got(oid, v);
-      
+
       // raise last_complete?
       if (missing.get_items().empty()) {
        log.complete_to = log.log.end();
@@ -695,8 +757,9 @@ public:
     while (!missing.get_items().empty() && log.complete_to->version <
           missing.get_items().at(
             missing.get_rmissing().begin()->second
-            ).need)
+            ).need) {
       ++log.complete_to;
+    }
     assert(log.complete_to != log.log.end());
     if (log.complete_to == log.log.begin()) {
       if (info)
@@ -759,7 +822,7 @@ protected:
     const mempool::osd_pglog::list<pg_log_entry_t> &orig_entries, ///< [in] entries for hoid to merge
     const pg_info_t &info,              ///< [in] info for merging entries
     eversion_t olog_can_rollback_to,     ///< [in] rollback boundary
-    missing_type &missing,              ///< [in,out] missing to adjust, use
+    missing_type &missing,               ///< [in,out] missing to adjust, use
     LogEntryHandler *rollbacker,         ///< [in] optional rollbacker object
     const DoutPrefixProvider *dpp        ///< [in] logging provider
     ) {
@@ -996,7 +1059,11 @@ protected:
       rollbacker,
       this);
   }
+
+  bool merge_log_dups(const pg_log_t& olog);
+
 public:
+
   void rewind_divergent_log(eversion_t newhead,
                             pg_info_t &info,
                             LogEntryHandler *rollbacker,
@@ -1085,11 +1152,12 @@ public:
     return invalidate_stats;
   }
 
-  void write_log_and_missing(ObjectStore::Transaction& t,
-                map<string,bufferlist> *km,
-                const coll_t& coll,
-                const ghobject_t &log_oid,
-                bool require_rollback);
+  void write_log_and_missing(
+    ObjectStore::Transaction& t,
+    map<string,bufferlist> *km,
+    const coll_t& coll,
+    const ghobject_t &log_oid,
+    bool require_rollback);
 
   static void write_log_and_missing_wo_missing(
     ObjectStore::Transaction& t,
@@ -1097,7 +1165,8 @@ public:
     pg_log_t &log,
     const coll_t& coll,
     const ghobject_t &log_oid, map<eversion_t, hobject_t> &divergent_priors,
-    bool require_rollback);
+    bool require_rollback,
+    bool dirty_dups);
 
   static void write_log_and_missing(
     ObjectStore::Transaction& t,
@@ -1107,6 +1176,7 @@ public:
     const ghobject_t &log_oid,
     const pg_missing_tracker_t &missing,
     bool require_rollback,
+    bool dirty_dups,
     bool *rebuilt_missing_set_with_deletes);
 
   static void _write_log_and_missing_wo_missing(
@@ -1123,6 +1193,7 @@ public:
     bool dirty_divergent_priors,
     bool touch_log,
     bool require_rollback,
+    bool dirty_dups,
     set<string> *log_keys_debug
     );
 
@@ -1140,13 +1211,16 @@ public:
     bool touch_log,
     bool require_rollback,
     bool clear_divergent_priors,
+    bool dirty_dups,
     bool *rebuilt_missing_with_deletes,
     set<string> *log_keys_debug
     );
 
   void read_log_and_missing(
-    ObjectStore *store, coll_t pg_coll,
-    coll_t log_coll, ghobject_t log_oid,
+    ObjectStore *store,
+    coll_t pg_coll,
+    coll_t log_coll,
+    ghobject_t log_oid,
     const pg_info_t &info,
     ostringstream &oss,
     bool tolerate_divergent_missing_log,
@@ -1158,20 +1232,24 @@ public:
       tolerate_divergent_missing_log,
       &clear_divergent_priors,
       this,
-      (pg_log_debug ? &log_keys_debug : 0),
+      (pg_log_debug ? &log_keys_debug : nullptr),
       debug_verify_stored_missing);
   }
 
   template <typename missing_type>
-  static void read_log_and_missing(ObjectStore *store, coll_t pg_coll,
-    coll_t log_coll, ghobject_t log_oid,
+  static void read_log_and_missing(
+    ObjectStore *store,
+    coll_t pg_coll,
+    coll_t log_coll,
+    ghobject_t log_oid,
     const pg_info_t &info,
     IndexedLog &log,
-    missing_type &missing, ostringstream &oss,
+    missing_type &missing,
+    ostringstream &oss,
     bool tolerate_divergent_missing_log,
-    bool *clear_divergent_priors = NULL,
-    const DoutPrefixProvider *dpp = NULL,
-    set<string> *log_keys_debug = 0,
+    bool *clear_divergent_priors = nullptr,
+    const DoutPrefixProvider *dpp = nullptr,
+    set<string> *log_keys_debug = nullptr,
     bool debug_verify_stored_missing = false
     ) {
     ldpp_dout(dpp, 20) << "read_log_and_missing coll " << pg_coll
@@ -1253,9 +1331,10 @@ public:
     if (has_divergent_priors || debug_verify_stored_missing) {
       // build missing
       if (debug_verify_stored_missing || info.last_complete < info.last_update) {
-       ldpp_dout(dpp, 10) << "read_log_and_missing checking for missing items over interval ("
-                          << info.last_complete
-                          << "," << info.last_update << "]" << dendl;
+       ldpp_dout(dpp, 10)
+         << "read_log_and_missing checking for missing items over interval ("
+         << info.last_complete
+         << "," << info.last_update << "]" << dendl;
 
        set<hobject_t> did;
        set<hobject_t> checked;
@@ -1402,7 +1481,5 @@ public:
       missing.flush();
     }
     ldpp_dout(dpp, 10) << "read_log_and_missing done" << dendl;
-  }
-};
-
-#endif // CEPH_PG_LOG_H
+  } // static read_log_and_missing
+}; // struct PGLog
index 49e6713e8602b5286787f48d9ca06fb11b68b995..56ff2b42f7d06981ed9ae33a299c8ea7c1548355 100644 (file)
@@ -4156,17 +4156,21 @@ void pg_log_dup_t::dump(Formatter *f) const
 void pg_log_dup_t::generate_test_instances(list<pg_log_dup_t*>& o)
 {
   o.push_back(new pg_log_dup_t());
-  o.push_back(new pg_log_dup_t(osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
-                              eversion_t(1,2), 1, 0);
-  o.push_back(new pg_log_dup_t(osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
-                              eversion_t(1,2), 2, -ENOENT);
+  o.push_back(new pg_log_dup_t(eversion_t(1,2),
+                              1,
+                              osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
+                              0));
+  o.push_back(new pg_log_dup_t(eversion_t(1,2),
+                              2,
+                              osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
+                              -ENOENT));
 }
 
-ostream& operator<<(ostream& out, const pg_log_dup_t& e)
-{
-  out << e.reqid << " v" << e.version << " uv" << e.user_version
-      << " rc=" << e.return_code;
-  return out;
+
+std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e) {
+  return out << "log_dup(reqid=" << e.reqid <<
+    " v=" << e.version << " uv=" << e.user_version <<
+    " rc=" << e.return_code << ")";
 }
 
 
index 5fdd784fe502f92847eab4306b909095479cbceb..2e7b3de4ba1cabc5e851b7730d483d21feab829f 100644 (file)
@@ -110,13 +110,18 @@ string ceph_osd_alloc_hint_flag_string(unsigned flags);
  */
 struct osd_reqid_t {
   entity_name_t name; // who
-  ceph_tid_t         tid;
+  ceph_tid_t    tid;
   int32_t       inc;  // incarnation
 
   osd_reqid_t()
-    : tid(0), inc(0) {}
+    : tid(0), inc(0)
+  {}
+  osd_reqid_t(const osd_reqid_t& other)
+    : name(other.name), tid(other.tid), inc(other.inc)
+  {}
   osd_reqid_t(const entity_name_t& a, int i, ceph_tid_t t)
-    : name(a), tid(t), inc(i) {}
+    : name(a), tid(t), inc(i)
+  {}
 
   DENC(osd_reqid_t, v, p) {
     DENC_START(2, 2, p);
@@ -783,7 +788,7 @@ public:
   eversion_t(epoch_t e, version_t v) : version(v), epoch(e), __pad(0) {}
 
   // cppcheck-suppress noExplicitConstructor
-  eversion_t(const ceph_eversion& ce) : 
+  eversion_t(const ceph_eversion& ce) :
     version(ce.version),
     epoch(ce.epoch),
     __pad(0) { }
@@ -3412,8 +3417,9 @@ struct pg_log_dup_t {
   int32_t return_code; // only stored for ERRORs for dup detection
 
   pg_log_dup_t()
-   : user_version(0), return_code(0) {}
-  pg_log_dup_t(const pg_log_entry_t &entry) explicit
+    : user_version(0), return_code(0)
+  {}
+  explicit pg_log_dup_t(const pg_log_entry_t& entry)
     : reqid(entry.reqid), version(entry.version),
       user_version(entry.user_version), return_code(entry.return_code)
   {}
@@ -3422,14 +3428,19 @@ struct pg_log_dup_t {
     : reqid(rid), version(v), user_version(uv),
       return_code(return_code)
   {}
+
   string get_key_name() const;
   void encode(bufferlist &bl) const;
   void decode(bufferlist::iterator &bl);
   void dump(Formatter *f) const;
   static void generate_test_instances(list<pg_log_dup_t*>& o);
+
+  friend std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
 };
 WRITE_CLASS_ENCODER(pg_log_dup_t)
 
+std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e);
+
 /**
  * pg_log_t - incremental log of recent pg changes.
  *
@@ -3454,9 +3465,12 @@ protected:
   eversion_t rollback_info_trimmed_to;
 
 public:
-  mempool::osd_pglog::list<pg_log_entry_t> log;  // the actual log.
-  mempool::osd_pglog::list<pg_log_dup_t> dups;  // entries just for dup op detection
-  
+  // the actual log
+  mempool::osd_pglog::list<pg_log_entry_t> log;
+
+  // entries just for dup op detection ordered oldest to newest
+  mempool::osd_pglog::list<pg_log_dup_t> dups;
+
   pg_log_t() = default;
   pg_log_t(const eversion_t &last_update,
           const eversion_t &log_tail,
@@ -3527,7 +3541,7 @@ public:
       rollback_info_trimmed_to,
       std::move(childlog),
       std::move(childdups));
-  }
+    }
 
   mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
     assert(newhead >= tail);
@@ -3618,7 +3632,7 @@ public:
 };
 WRITE_CLASS_ENCODER(pg_log_t)
 
-inline ostream& operator<<(ostream& out, const pg_log_t& log) 
+inline ostream& operator<<(ostream& out, const pg_log_t& log)
 {
   out << "log((" << log.tail << "," << log.head << "], crt="
       << log.get_can_rollback_to() << ")";
@@ -4381,10 +4395,6 @@ inline ostream& operator<<(ostream& out, const ObjectExtent &ex)
 }
 
 
-
-
-
-
 // ---------------------------------------
 
 class OSDSuperblock {
index 182eb425492b7462c869f23d3bf5010a34740216..c4837858acba8344f954fabeadbb2a50040eab5b 100644 (file)
 #include "include/coredumpctl.h"
 #include "../objectstore/store_test_fixture.h"
 
-class PGLogTest : virtual public ::testing::Test, protected PGLog {
-public:
-  PGLogTest() : PGLog(g_ceph_context) {}
-  void SetUp() override {
-    missing.may_include_deletes = true;
-  }
-
-  void TearDown() override {
-    clear();
-  }
 
+struct PGLogTestBase {
   static hobject_t mk_obj(unsigned id) {
     hobject_t hoid;
     stringstream ss;
@@ -51,23 +42,25 @@ public:
     return eversion_t(ep, v);
   }
   static pg_log_entry_t mk_ple_mod(
-    const hobject_t &hoid, eversion_t v, eversion_t pv) {
+    const hobject_t &hoid, eversion_t v, eversion_t pv, osd_reqid_t reqid) {
     pg_log_entry_t e;
     e.mark_unrollbackable();
     e.op = pg_log_entry_t::MODIFY;
     e.soid = hoid;
     e.version = v;
     e.prior_version = pv;
+    e.reqid = reqid;
     return e;
   }
   static pg_log_entry_t mk_ple_dt(
-    const hobject_t &hoid, eversion_t v, eversion_t pv) {
+    const hobject_t &hoid, eversion_t v, eversion_t pv, osd_reqid_t reqid) {
     pg_log_entry_t e;
     e.mark_unrollbackable();
     e.op = pg_log_entry_t::DELETE;
     e.soid = hoid;
     e.version = v;
     e.prior_version = pv;
+    e.reqid = reqid;
     return e;
   }
   static pg_log_entry_t mk_ple_ldt(
@@ -81,23 +74,58 @@ public:
     return e;
   }
   static pg_log_entry_t mk_ple_mod_rb(
-    const hobject_t &hoid, eversion_t v, eversion_t pv) {
+    const hobject_t &hoid, eversion_t v, eversion_t pv, osd_reqid_t reqid) {
     pg_log_entry_t e;
     e.op = pg_log_entry_t::MODIFY;
     e.soid = hoid;
     e.version = v;
     e.prior_version = pv;
+    e.reqid = reqid;
     return e;
   }
   static pg_log_entry_t mk_ple_dt_rb(
-    const hobject_t &hoid, eversion_t v, eversion_t pv) {
+    const hobject_t &hoid, eversion_t v, eversion_t pv, osd_reqid_t reqid) {
     pg_log_entry_t e;
     e.op = pg_log_entry_t::DELETE;
     e.soid = hoid;
     e.version = v;
     e.prior_version = pv;
+    e.reqid = reqid;
     return e;
   }
+  static pg_log_entry_t mk_ple_mod(
+    const hobject_t &hoid, eversion_t v, eversion_t pv) {
+    return mk_ple_mod(hoid, v, pv, osd_reqid_t());
+  }
+  static pg_log_entry_t mk_ple_dt(
+    const hobject_t &hoid, eversion_t v, eversion_t pv) {
+    return mk_ple_dt(hoid, v, pv, osd_reqid_t());
+  }
+  static pg_log_entry_t mk_ple_mod_rb(
+    const hobject_t &hoid, eversion_t v, eversion_t pv) {
+    return mk_ple_mod_rb(hoid, v, pv, osd_reqid_t());
+  }
+  static pg_log_entry_t mk_ple_dt_rb(
+    const hobject_t &hoid, eversion_t v, eversion_t pv) {
+    return mk_ple_dt_rb(hoid, v, pv, osd_reqid_t());
+  }
+}; // PGLogTestBase
+
+
+class PGLogTest : virtual public ::testing::Test, protected PGLog, public PGLogTestBase  {
+public:
+  PGLogTest() : PGLog(g_ceph_context) {}
+  void SetUp() override {
+    missing.may_include_deletes = true;
+  }
+
+#include "common/ceph_context.h"
+#include "common/config.h"
+
+  void TearDown() override {
+    clear();
+  }
+
 
   struct TestCase {
     list<pg_log_entry_t> base;
@@ -169,12 +197,12 @@ public:
     const IndexedLog &get_fulldiv() const { return fulldiv; }
     const pg_info_t &get_authinfo() const { return authinfo; }
     const pg_info_t &get_divinfo() const { return divinfo; }
-  };
+  }; // struct TestCase
 
   struct LogHandler : public PGLog::LogEntryHandler {
     set<hobject_t> removed;
     list<pg_log_entry_t> rolledback;
-    
+
     void rollback(
       const pg_log_entry_t &entry) override {
       rolledback.push_back(entry);
@@ -253,7 +281,8 @@ public:
     ASSERT_EQ(info.last_update, oinfo.last_update);
     verify_missing(tcase, missing);
     verify_sideeffects(tcase, h);
-  };
+  }
+
   void test_proc_replica_log(const TestCase &tcase) {
     clear();
     log = tcase.get_fullauth();
@@ -286,12 +315,13 @@ public:
       }
     }
     verify_missing(tcase, omissing);
-  }
+  } // test_proc_replica_log
+
   void run_test_case(const TestCase &tcase) {
     test_merge_log(tcase);
     test_proc_replica_log(tcase);
   }
-};
+}; // class PGLogTest
 
 struct TestHandler : public PGLog::LogEntryHandler {
   list<hobject_t> &removed;
@@ -699,7 +729,7 @@ TEST_F(PGLogTest, merge_old_entry) {
   // the old entry (from the log entry given in argument) is not a CLONE and
   // the old entry (from the log entry given in argument) is not a DELETE and
   // the old entry prior_version is lower than the tail of the log :
-  //   add the old object to the remove_snap list and 
+  //   add the old object to the remove_snap list and
   //   add the old object to divergent priors and
   //   add or update the prior_version of the object to missing and
   //   return false
@@ -1413,11 +1443,11 @@ TEST_F(PGLogTest, proc_replica_log) {
             |        |       |  DELETE |
             |        |       |         |
             +--------+-------+---------+
-           
+
       The log entry (1,3) deletes the object x9 and the olog entry
       (2,3) also deletes it : do nothing. The olog tail is ignored
       because it is before the log tail.
-      
+
   */
   {
     clear();
@@ -2347,6 +2377,539 @@ TEST_F(PGLogTestRebuildMissing, MissingNotInLog) {
   run_rebuild_missing_test(expected);
 }
 
+
+class PGLogMergeDupsTest : public ::testing::Test, protected PGLog {
+
+public:
+
+  PGLogMergeDupsTest() : PGLog(g_ceph_context) { }
+
+  void SetUp() override { }
+
+  void TearDown() override {
+    clear();
+  }
+
+  static pg_log_dup_t create_dup_entry(uint a, uint b) {
+    // make each dup_entry unique by using different client id's
+    static uint client_id = 777;
+    return pg_log_dup_t(eversion_t(a, b),
+                       a,
+                       osd_reqid_t(entity_name_t::CLIENT(client_id++), 8, 1),
+                       0);
+  }
+
+  static std::vector<pg_log_dup_t> example_dups_1() {
+    std::vector<pg_log_dup_t> result = {
+      create_dup_entry(10, 11),
+      create_dup_entry(10, 12),
+      create_dup_entry(11, 1),
+      create_dup_entry(12, 3),
+      create_dup_entry(13, 99)
+    };
+    return result;
+  }
+
+  static std::vector<pg_log_dup_t> example_dups_2() {
+    std::vector<pg_log_dup_t> result = {
+      create_dup_entry(12, 3),
+      create_dup_entry(13, 99),
+      create_dup_entry(15, 11),
+      create_dup_entry(16, 14),
+      create_dup_entry(16, 32)
+    };
+    return result;
+  }
+
+  void add_dups(uint a, uint b) {
+    log.dups.push_back(create_dup_entry(a, b));
+  }
+
+  void add_dups(const std::vector<pg_log_dup_t>& l) {
+    for (auto& i : l) {
+      log.dups.push_back(i);
+    }
+  }
+
+  static void add_dups(IndexedLog& log, const std::vector<pg_log_dup_t>& dups) {
+    for (auto& i : dups) {
+      log.dups.push_back(i);
+    }
+  }
+
+  void check_order() {
+    eversion_t prev(0, 0);
+
+    for (auto& i : log.dups) {
+      EXPECT_LT(prev, i.version) << "verify versions monotonically increase";
+      prev = i.version;
+    }
+  }
+
+  void check_index() {
+    EXPECT_EQ(log.dups.size(), log.dup_index.size());
+    for (auto& i : log.dups) {
+      EXPECT_EQ(1u, log.dup_index.count(i.reqid));
+    }
+  }
+};
+
+TEST_F(PGLogMergeDupsTest, OtherEmpty) {
+  log.tail = eversion_t(14, 5);
+
+  IndexedLog olog;
+
+  add_dups(example_dups_1());
+  index();
+
+  bool changed = merge_log_dups(olog);
+
+  EXPECT_FALSE(changed);
+  EXPECT_EQ(5u, log.dups.size());
+
+  if (5 == log.dups.size()) {
+    EXPECT_EQ(10u, log.dups.front().version.epoch);
+    EXPECT_EQ(11u, log.dups.front().version.version);
+    EXPECT_EQ(13u, log.dups.back().version.epoch);
+    EXPECT_EQ(99u, log.dups.back().version.version);
+  }
+
+  check_order();
+  check_index();
+}
+
+TEST_F(PGLogMergeDupsTest, AmEmpty) {
+  log.tail = eversion_t(14, 5);
+  index();
+
+  IndexedLog olog;
+
+  add_dups(olog, example_dups_1());
+
+  bool changed = merge_log_dups(olog);
+
+  EXPECT_TRUE(changed);
+  EXPECT_EQ(5u, log.dups.size());
+
+  if (5 == log.dups.size()) {
+    EXPECT_EQ(10u, log.dups.front().version.epoch);
+    EXPECT_EQ(11u, log.dups.front().version.version);
+
+    EXPECT_EQ(13u, log.dups.back().version.epoch);
+    EXPECT_EQ(99u, log.dups.back().version.version);
+  }
+
+  check_order();
+  check_index();
+}
+
+TEST_F(PGLogMergeDupsTest, AmEmptyOverlap) {
+  log.tail = eversion_t(12, 3);
+  index();
+
+  IndexedLog olog;
+
+  add_dups(olog, example_dups_1());
+
+  bool changed = merge_log_dups(olog);
+
+  EXPECT_TRUE(changed);
+  EXPECT_EQ(3u, log.dups.size());
+
+  if (3 == log.dups.size()) {
+    EXPECT_EQ(10u, log.dups.front().version.epoch);
+    EXPECT_EQ(11u, log.dups.front().version.version);
+
+    EXPECT_EQ(11u, log.dups.back().version.epoch);
+    EXPECT_EQ(1u, log.dups.back().version.version);
+  }
+
+  check_order();
+  check_index();
+}
+
+TEST_F(PGLogMergeDupsTest, Same) {
+  log.tail = eversion_t(14, 1);
+
+  IndexedLog olog;
+
+  add_dups(example_dups_1());
+  index();
+  add_dups(olog, example_dups_1());
+
+  bool changed = merge_log_dups(olog);
+
+  EXPECT_FALSE(changed);
+  EXPECT_EQ(5u, log.dups.size());
+
+  if (5 == log.dups.size()) {
+    EXPECT_EQ(10u, log.dups.front().version.epoch);
+    EXPECT_EQ(11u, log.dups.front().version.version);
+
+    EXPECT_EQ(13u, log.dups.back().version.epoch);
+    EXPECT_EQ(99u, log.dups.back().version.version);
+  }
+
+  check_order();
+  check_index();
+}
+
+
+TEST_F(PGLogMergeDupsTest, Later) {
+  log.tail = eversion_t(16, 14);
+
+  IndexedLog olog;
+
+  add_dups(example_dups_1());
+  index();
+  add_dups(olog, example_dups_2());
+
+  bool changed = merge_log_dups(olog);
+
+  EXPECT_TRUE(changed);
+  EXPECT_EQ(6u, log.dups.size());
+
+  if (6 == log.dups.size()) {
+    EXPECT_EQ(10u, log.dups.front().version.epoch);
+    EXPECT_EQ(11u, log.dups.front().version.version);
+
+    EXPECT_EQ(15u, log.dups.back().version.epoch);
+    EXPECT_EQ(11u, log.dups.back().version.version);
+  }
+
+  check_order();
+  check_index();
+}
+
+
+TEST_F(PGLogMergeDupsTest, Earlier) {
+  log.tail = eversion_t(17, 2);
+
+  IndexedLog olog;
+
+  add_dups(example_dups_2());
+  index();
+  add_dups(olog, example_dups_1());
+
+  bool changed = merge_log_dups(olog);
+
+  EXPECT_TRUE(changed);
+  EXPECT_EQ(8u, log.dups.size());
+
+  if (6 == log.dups.size()) {
+    EXPECT_EQ(10u, log.dups.front().version.epoch);
+    EXPECT_EQ(11u, log.dups.front().version.version);
+
+    EXPECT_EQ(16u, log.dups.back().version.epoch);
+    EXPECT_EQ(32u, log.dups.back().version.version);
+  }
+
+  check_order();
+  check_index();
+}
+
+
+TEST_F(PGLogMergeDupsTest, Superset) {
+  log.tail = eversion_t(17, 2);
+
+  IndexedLog olog;
+
+  add_dups(example_dups_1());
+  index();
+
+  olog.dups.push_back(create_dup_entry(9, 5));
+  olog.dups.push_back(create_dup_entry(15, 11));
+
+  bool changed = merge_log_dups(olog);
+
+  EXPECT_TRUE(changed);
+  EXPECT_EQ(7u, log.dups.size());
+
+  if (7 == log.dups.size()) {
+    EXPECT_EQ(9u, log.dups.front().version.epoch);
+    EXPECT_EQ(5u, log.dups.front().version.version);
+
+    EXPECT_EQ(15u, log.dups.back().version.epoch);
+    EXPECT_EQ(11u, log.dups.back().version.version);
+  }
+
+  check_order();
+  check_index();
+}
+
+
+struct PGLogTrimTest :
+  public ::testing::Test,
+  public PGLogTestBase,
+  public PGLog::IndexedLog
+{
+  std::list<hobject_t*> test_hobjects;
+  CephContext *cct;
+
+  void SetUp() override {
+    cct = (new CephContext(CEPH_ENTITY_TYPE_OSD))->get();
+
+    hobject_t::generate_test_instances(test_hobjects);
+  }
+
+  void SetUp(unsigned min_entries, unsigned max_entries, unsigned dup_track) {
+    constexpr size_t size = 10;
+
+    char min_entries_s[size];
+    char max_entries_s[size];
+    char dup_track_s[size];
+
+    snprintf(min_entries_s, size, "%u", min_entries);
+    snprintf(max_entries_s, size, "%u", max_entries);
+    snprintf(dup_track_s, size, "%u", dup_track);
+
+    cct->_conf->set_val_or_die("osd_min_pg_log_entries", min_entries_s);
+    cct->_conf->set_val_or_die("osd_max_pg_log_entries", max_entries_s);
+    cct->_conf->set_val_or_die("osd_pg_log_dups_tracked", dup_track_s);
+}
+
+  void TearDown() override {
+    while (!test_hobjects.empty()) {
+      delete test_hobjects.front();
+      test_hobjects.pop_front();
+    }
+
+    cct->put();
+  }
+}; // struct PGLogTrimTest
+
+
+# if 0
+TEST_F(PGLogTest, Trim1) {
+  TestCase t;
+
+  t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(10, 100), mk_evt(8, 70)));
+  t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(15, 150), mk_evt(10, 100)));
+  t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(15, 155), mk_evt(15, 150)));
+  t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(20, 160), mk_evt(25, 152)));
+  t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(21, 165), mk_evt(26, 160)));
+  t.auth.push_back(mk_ple_mod(mk_obj(1), mk_evt(21, 165), mk_evt(31, 171)));
+
+  t.setup();
+}
+#endif
+
+
+TEST_F(PGLogTrimTest, TestMakingCephContext)
+{
+  SetUp(1, 2, 5);
+
+  EXPECT_EQ(1u, cct->_conf->osd_min_pg_log_entries);
+  EXPECT_EQ(2u, cct->_conf->osd_max_pg_log_entries);
+  EXPECT_EQ(5u, cct->_conf->osd_pg_log_dups_tracked);
+}
+
+
+TEST_F(PGLogTrimTest, TestPartialTrim)
+{
+  SetUp(1, 2, 20);
+  PGLog::IndexedLog log;
+  log.head = mk_evt(24, 0);
+  log.skip_can_rollback_to_to_head();
+  log.head = mk_evt(9, 0);
+
+  log.add(mk_ple_mod(mk_obj(1), mk_evt(10, 100), mk_evt(8, 70)));
+  log.add(mk_ple_dt(mk_obj(2), mk_evt(15, 150), mk_evt(10, 100)));
+  log.add(mk_ple_mod_rb(mk_obj(3), mk_evt(15, 155), mk_evt(15, 150)));
+  log.add(mk_ple_mod(mk_obj(1), mk_evt(19, 160), mk_evt(25, 152)));
+  log.add(mk_ple_mod(mk_obj(4), mk_evt(21, 165), mk_evt(26, 160)));
+  log.add(mk_ple_dt_rb(mk_obj(5), mk_evt(21, 167), mk_evt(31, 166)));
+
+  std::set<eversion_t> trimmed;
+  std::set<std::string> trimmed_dups;
+  bool dirty_dups = false;
+
+  log.trim(cct, mk_evt(19, 157), &trimmed, &trimmed_dups, &dirty_dups);
+
+  EXPECT_EQ(true, dirty_dups);
+  EXPECT_EQ(3u, log.log.size());
+  EXPECT_EQ(3u, trimmed.size());
+  EXPECT_EQ(2u, log.dups.size());
+  EXPECT_EQ(0u, trimmed_dups.size());
+
+  SetUp(1, 2, 15);
+
+  std::set<eversion_t> trimmed2;
+  std::set<std::string> trimmed_dups2;
+  bool dirty_dups2 = false;
+  
+  log.trim(cct, mk_evt(20, 164), &trimmed2, &trimmed_dups2, &dirty_dups2);
+
+  EXPECT_EQ(true, dirty_dups2);
+  EXPECT_EQ(2u, log.log.size());
+  EXPECT_EQ(1u, trimmed2.size());
+  EXPECT_EQ(2u, log.dups.size());
+  EXPECT_EQ(1u, trimmed_dups2.size());
+}
+
+
+TEST_F(PGLogTrimTest, TestTrimNoTrimmed) {
+  SetUp(1, 2, 20);
+  PGLog::IndexedLog log;
+  log.head = mk_evt(20, 0);
+  log.skip_can_rollback_to_to_head();
+  log.head = mk_evt(9, 0);
+
+  log.add(mk_ple_mod(mk_obj(1), mk_evt(10, 100), mk_evt(8, 70)));
+  log.add(mk_ple_dt(mk_obj(2), mk_evt(15, 150), mk_evt(10, 100)));
+  log.add(mk_ple_mod_rb(mk_obj(3), mk_evt(15, 155), mk_evt(15, 150)));
+  log.add(mk_ple_mod(mk_obj(1), mk_evt(20, 160), mk_evt(25, 152)));
+  log.add(mk_ple_mod(mk_obj(4), mk_evt(21, 165), mk_evt(26, 160)));
+  log.add(mk_ple_dt_rb(mk_obj(5), mk_evt(21, 167), mk_evt(31, 166)));
+
+  bool dirty_dups = false;
+
+  log.trim(cct, mk_evt(19, 157), nullptr, nullptr, &dirty_dups);
+
+  EXPECT_EQ(true, dirty_dups);
+  EXPECT_EQ(3u, log.log.size());
+  EXPECT_EQ(2u, log.dups.size());
+}
+
+
+TEST_F(PGLogTrimTest, TestTrimNoDups)
+{
+  SetUp(1, 2, 10);
+  PGLog::IndexedLog log;
+  log.head = mk_evt(20, 0);
+  log.skip_can_rollback_to_to_head();
+  log.head = mk_evt(9, 0);
+
+  log.add(mk_ple_mod(mk_obj(1), mk_evt(10, 100), mk_evt(8, 70)));
+  log.add(mk_ple_dt(mk_obj(2), mk_evt(15, 150), mk_evt(10, 100)));
+  log.add(mk_ple_mod_rb(mk_obj(3), mk_evt(15, 155), mk_evt(15, 150)));
+  log.add(mk_ple_mod(mk_obj(1), mk_evt(20, 160), mk_evt(25, 152)));
+  log.add(mk_ple_mod(mk_obj(4), mk_evt(21, 165), mk_evt(26, 160)));
+  log.add(mk_ple_dt_rb(mk_obj(5), mk_evt(21, 167), mk_evt(31, 166)));
+
+  std::set<eversion_t> trimmed;
+  std::set<std::string> trimmed_dups;
+  bool dirty_dups = false;
+
+  log.trim(cct, mk_evt(19, 157), &trimmed, &trimmed_dups, &dirty_dups);
+
+  EXPECT_EQ(false, dirty_dups);
+  EXPECT_EQ(3u, log.log.size());
+  EXPECT_EQ(3u, trimmed.size());
+  EXPECT_EQ(0u, log.dups.size());
+  EXPECT_EQ(0u, trimmed_dups.size());
+}
+
+TEST_F(PGLogTrimTest, TestNoTrim)
+{
+  SetUp(1, 2, 20);
+  PGLog::IndexedLog log;
+  log.head = mk_evt(24, 0);
+  log.skip_can_rollback_to_to_head();
+  log.head = mk_evt(9, 0);
+
+  log.add(mk_ple_mod(mk_obj(1), mk_evt(10, 100), mk_evt(8, 70)));
+  log.add(mk_ple_dt(mk_obj(2), mk_evt(15, 150), mk_evt(10, 100)));
+  log.add(mk_ple_mod_rb(mk_obj(3), mk_evt(15, 155), mk_evt(15, 150)));
+  log.add(mk_ple_mod(mk_obj(1), mk_evt(19, 160), mk_evt(25, 152)));
+  log.add(mk_ple_mod(mk_obj(4), mk_evt(21, 165), mk_evt(26, 160)));
+  log.add(mk_ple_dt_rb(mk_obj(5), mk_evt(21, 167), mk_evt(31, 166)));
+
+  std::set<eversion_t> trimmed;
+  std::set<std::string> trimmed_dups;
+  bool dirty_dups = false;
+
+  log.trim(cct, mk_evt(9, 99), &trimmed, &trimmed_dups, &dirty_dups);
+
+  EXPECT_EQ(false, dirty_dups);
+  EXPECT_EQ(6u, log.log.size());
+  EXPECT_EQ(0u, trimmed.size());
+  EXPECT_EQ(0u, log.dups.size());
+  EXPECT_EQ(0u, trimmed_dups.size());
+}
+
+TEST_F(PGLogTrimTest, TestTrimAll)
+{
+  SetUp(1, 2, 20);
+  PGLog::IndexedLog log;
+  log.head = mk_evt(24, 0);
+  log.skip_can_rollback_to_to_head();
+  log.head = mk_evt(9, 0);
+
+  log.add(mk_ple_mod(mk_obj(1), mk_evt(10, 100), mk_evt(8, 70)));
+  log.add(mk_ple_dt(mk_obj(2), mk_evt(15, 150), mk_evt(10, 100)));
+  log.add(mk_ple_mod_rb(mk_obj(3), mk_evt(15, 155), mk_evt(15, 150)));
+  log.add(mk_ple_mod(mk_obj(1), mk_evt(19, 160), mk_evt(25, 152)));
+  log.add(mk_ple_mod(mk_obj(4), mk_evt(21, 165), mk_evt(26, 160)));
+  log.add(mk_ple_dt_rb(mk_obj(5), mk_evt(21, 167), mk_evt(31, 166)));
+
+  std::set<eversion_t> trimmed;
+  std::set<std::string> trimmed_dups;
+  bool dirty_dups = false;
+
+  log.trim(cct, mk_evt(22, 180), &trimmed, &trimmed_dups, &dirty_dups);
+
+  EXPECT_EQ(true, dirty_dups);
+  EXPECT_EQ(0u, log.log.size());
+  EXPECT_EQ(6u, trimmed.size());
+  EXPECT_EQ(5u, log.dups.size());
+  EXPECT_EQ(0u, trimmed_dups.size());
+}
+
+
+TEST_F(PGLogTrimTest, TestGetRequest) {
+  SetUp(1, 2, 20);
+  PGLog::IndexedLog log;
+  log.head = mk_evt(20, 0);
+  log.skip_can_rollback_to_to_head();
+  log.head = mk_evt(9, 0);
+
+  entity_name_t client = entity_name_t::CLIENT(777);
+
+  log.add(mk_ple_mod(mk_obj(1), mk_evt(10, 100), mk_evt(8, 70),
+                    osd_reqid_t(client, 8, 1)));
+  log.add(mk_ple_dt(mk_obj(2), mk_evt(15, 150), mk_evt(10, 100),
+                   osd_reqid_t(client, 8, 2)));
+  log.add(mk_ple_mod_rb(mk_obj(3), mk_evt(15, 155), mk_evt(15, 150),
+                       osd_reqid_t(client, 8, 3)));
+  log.add(mk_ple_mod(mk_obj(1), mk_evt(20, 160), mk_evt(25, 152),
+                    osd_reqid_t(client, 8, 4)));
+  log.add(mk_ple_mod(mk_obj(4), mk_evt(21, 165), mk_evt(26, 160),
+                    osd_reqid_t(client, 8, 5)));
+  log.add(mk_ple_dt_rb(mk_obj(5), mk_evt(21, 167), mk_evt(31, 166),
+                      osd_reqid_t(client, 8, 6)));
+
+  bool dirty_dups = false;
+
+  log.trim(cct, mk_evt(19, 157), nullptr, nullptr, &dirty_dups);
+
+  EXPECT_EQ(true, dirty_dups);
+  EXPECT_EQ(3u, log.log.size());
+  EXPECT_EQ(2u, log.dups.size());
+
+  eversion_t version;
+  version_t user_version;
+  int return_code;
+
+  osd_reqid_t log_reqid = osd_reqid_t(client, 8, 5);
+  osd_reqid_t dup_reqid = osd_reqid_t(client, 8, 3);
+  osd_reqid_t bad_reqid = osd_reqid_t(client, 8, 1);
+
+  bool result;
+
+  result = log.get_request(log_reqid, &version, &user_version, &return_code);
+  EXPECT_EQ(true, result);
+  EXPECT_EQ(mk_evt(21, 165), version);
+
+  result = log.get_request(dup_reqid, &version, &user_version, &return_code);
+  EXPECT_EQ(true, result);
+  EXPECT_EQ(mk_evt(15, 155), version);
+
+  result = log.get_request(bad_reqid, &version, &user_version, &return_code);
+  EXPECT_EQ(false, result);
+}
+
+
 // Local Variables:
 // compile-command: "cd ../.. ; make unittest_pglog ; ./unittest_pglog --log-to-stderr=true  --debug-osd=20 # --gtest_filter=*.* "
 // End:
index 9525223d6fa86661347dbab4fa323a1f83160d37..d17a74dbcbb0ec6c18864361365a84d75b5099b9 100644 (file)
@@ -477,12 +477,12 @@ int write_pg(ObjectStore::Transaction &t, epoch_t epoch, pg_info_t &info,
   if (!divergent.empty()) {
     assert(missing.get_items().empty());
     PGLog::write_log_and_missing_wo_missing(
-      t, &km, log, coll, info.pgid.make_pgmeta_oid(), divergent, true);
+      t, &km, log, coll, info.pgid.make_pgmeta_oid(), divergent, true, true);
   } else {
     pg_missing_tracker_t tmissing(missing);
     bool rebuilt_missing_set_with_deletes = missing.may_include_deletes;
     PGLog::write_log_and_missing(
-      t, &km, log, coll, info.pgid.make_pgmeta_oid(), tmissing, true,
+      t, &km, log, coll, info.pgid.make_pgmeta_oid(), tmissing, true, true,
       &rebuilt_missing_set_with_deletes);
   }
   t.omap_setkeys(coll, info.pgid.make_pgmeta_oid(), km);