if (m && pi.last_backfill != hobject_t()) {
for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
p != m->log.log.end();
- ++p)
+ ++p) {
if (p->soid <= pi.last_backfill &&
- !p->is_error())
- pm.add_next_event(*p);
+ !p->is_error()) {
+ if (perform_deletes_during_peering() && p->is_delete()) {
+ pm.rm(p->soid, p->version);
+ } else {
+ pm.add_next_event(*p);
+ }
+ }
+ }
}
-
+
if (m) {
dout(10) << "activate peer osd." << peer << " sending " << m->log << dendl;
//m->log.print(cout);
!touched_log,
require_rollback,
clear_divergent_priors,
+ &rebuilt_missing_with_deletes,
(pg_log_debug ? &log_keys_debug : 0));
undirty();
} else {
const coll_t& coll,
const ghobject_t &log_oid,
const pg_missing_tracker_t &missing,
- bool require_rollback)
+ bool require_rollback,
+ bool *rebuilt_missing_with_deletes)
{
_write_log_and_missing(
t, km, log, coll, log_oid,
eversion_t(),
set<eversion_t>(),
missing,
- true, require_rollback, false, 0);
+ true, require_rollback, false, rebuilt_missing_with_deletes, 0);
}
void PGLog::_write_log_and_missing_wo_missing(
bool touch_log,
bool require_rollback,
bool clear_divergent_priors,
+ bool *rebuilt_missing_with_deletes, // in/out param
set<string> *log_keys_debug
) {
set<string> to_remove;
//dout(10) << "write_log_and_missing: writing divergent_priors" << dendl;
to_remove.insert("divergent_priors");
}
+ // since we encode individual missing items instead of a whole
+ // missing set, we need another key to store this bit of state
+ if (*rebuilt_missing_with_deletes) {
+ (*km)["may_include_deletes_in_missing"] = bufferlist();
+ *rebuilt_missing_with_deletes = false;
+ }
missing.get_changed(
[&](const hobject_t &obj) {
string key = string("missing/") + obj.to_str();
if (!missing.is_missing(obj, &item)) {
to_remove.insert(key);
} else {
- ::encode(obj, (*km)[key]);
- item.encode_with_flags((*km)[key]);
+ uint64_t features = missing.may_include_deletes ? CEPH_FEATURE_OSD_RECOVERY_DELETES : 0;
+ ::encode(make_pair(obj, item), (*km)[key], features);
}
});
if (require_rollback) {
/// Log is clean on [dirty_to, dirty_from)
bool touched_log;
bool clear_divergent_priors;
+ bool rebuilt_missing_with_deletes = false;
void mark_dirty_to(eversion_t to) {
if (to > dirty_to)
(dirty_from != eversion_t::max()) ||
(writeout_from != eversion_t::max()) ||
!(trimmed.empty()) ||
- !missing.is_clean();
+ !missing.is_clean() ||
+ rebuilt_missing_with_deletes;
}
void mark_log_for_rewrite() {
mark_dirty_to(eversion_t::max());
assert(objiter->second->version > last_divergent_update);
// ensure missing has been updated appropriately
- if (objiter->second->is_update() || objiter->second->is_delete()) {
+ if (objiter->second->is_update() ||
+ (missing.may_include_deletes && objiter->second->is_delete())) {
assert(missing.is_missing(hoid) &&
missing.get_items().at(hoid).need == objiter->second->version);
} else {
}
if (p->soid <= last_backfill &&
!p->is_error()) {
- missing.add_next_event(*p);
+ if (missing.may_include_deletes) {
+ missing.add_next_event(*p);
+ } else {
+ if (p->is_delete()) {
+ missing.rm(p->soid, p->version);
+ } else {
+ missing.add_next_event(*p);
+ }
+ if (rollbacker) {
+ // hack to match PG::mark_all_unfound_lost
+ if (maintain_rollback && p->is_lost_delete() && p->can_rollback()) {
+ rollbacker->try_stash(p->soid, p->version.version);
+ } else if (p->is_delete()) {
+ rollbacker->remove(p->soid);
+ }
+ }
+ }
}
}
return invalidate_stats;
const coll_t& coll,
const ghobject_t &log_oid,
const pg_missing_tracker_t &missing,
- bool require_rollback);
+ bool require_rollback,
+ bool *rebuilt_missing_set_with_deletes);
static void _write_log_and_missing_wo_missing(
ObjectStore::Transaction& t,
bool touch_log,
bool require_rollback,
bool clear_divergent_priors,
+ bool *rebuilt_missing_with_deletes,
set<string> *log_keys_debug
);
ObjectMap::ObjectMapIterator p = store->get_omap_iterator(log_coll, log_oid);
map<eversion_t, hobject_t> divergent_priors;
bool has_divergent_priors = false;
+ missing.may_include_deletes = false;
list<pg_log_entry_t> entries;
if (p) {
for (p->seek_to_first(); p->valid() ; p->next(false)) {
::decode(on_disk_can_rollback_to, bp);
} else if (p->key() == "rollback_info_trimmed_to") {
::decode(on_disk_rollback_info_trimmed_to, bp);
+ } else if (p->key() == "may_include_deletes_in_missing") {
+ missing.may_include_deletes = true;
} else if (p->key().substr(0, 7) == string("missing")) {
hobject_t oid;
pg_missing_item item;
::decode(oid, bp);
- item.decode_with_flags(bp);
+ ::decode(item, bp);
+ if (item.is_delete()) {
+ assert(missing.may_include_deletes);
+ }
missing.add(oid, item.need, item.have, item.is_delete());
} else {
pg_log_entry_t e;
if (did.count(i->soid)) continue;
did.insert(i->soid);
+ if (!missing.may_include_deletes && i->is_delete())
+ continue;
+
bufferlist bv;
int r = store->getattr(
pg_coll,
out << i.need;
if (i.have != eversion_t())
out << "(" << i.have << ")";
+ out << " flags = " << i.flag_str();
return out;
}
set_delete(is_delete);
}
- void encode(bufferlist& bl) const {
- ::encode(need, bl);
- ::encode(have, bl);
+ void encode(bufferlist& bl, uint64_t features) const { // features picks the wire layout: flagged when OSD_RECOVERY_DELETES is set, legacy otherwise
+ if (HAVE_FEATURE(features, OSD_RECOVERY_DELETES)) {
+ // encoding a zeroed eversion_t to differentiate between this and
+ // legacy unversioned encoding - a need value of 0'0 is not
+ // possible. This can be replaced with the legacy encoding
+ // macros post-luminous.
+ eversion_t e;
+ ::encode(e, bl); // zeroed marker that decode() tests to recognise this layout
+ ::encode(need, bl);
+ ::encode(have, bl);
+ ::encode(static_cast<uint8_t>(flags), bl); // flags travel as a single byte after need/have
+ } else {
+ // legacy unversioned encoding: need and have only, flags are not written
+ ::encode(need, bl);
+ ::encode(have, bl);
+ }
}
void decode(bufferlist::iterator& bl) {
- ::decode(need, bl);
- ::decode(have, bl);
+ eversion_t e;
+ ::decode(e, bl); // the first eversion_t read tells the two layouts apart
+ if (e != eversion_t()) {
+ // legacy encoding, this is the need value
+ need = e;
+ ::decode(have, bl); // flags is not assigned on this path and keeps its prior value
+ } else { // zeroed marker: need, have and a one-byte flags field follow
+ ::decode(need, bl);
+ ::decode(have, bl);
+ uint8_t f;
+ ::decode(f, bl);
+ flags = static_cast<missing_flags_t>(f);
+ }
}
void set_delete(bool is_delete) {
return (flags & FLAG_DELETE) == FLAG_DELETE;
}
- void encode_with_flags(bufferlist& bl) const {
- encode(bl);
- ::encode(static_cast<uint8_t>(flags), bl);
- }
-
- void decode_with_flags(bufferlist::iterator& bl) {
- decode(bl);
- // no versioning on this, but it's stored in a single omap value,
- // so just check for the end of the bufferlist
- if (!bl.end()) {
- uint8_t f;
- ::decode(f, bl);
- flags = static_cast<missing_flags_t>(f);
+ string flag_str() const { // printable name of the flags value, used by dump() and operator<<
+ if (flags == FLAG_NONE) {
+ return "none";
+ } else { // every value other than FLAG_NONE is reported as "delete"
+ return "delete";
}
}
void dump(Formatter *f) const {
f->dump_stream("need") << need;
f->dump_stream("have") << have;
- f->dump_stream("flags") << (flags == FLAG_NONE ? "none" : "delete");
+ f->dump_stream("flags") << flag_str(); // same "none"/"delete" text as before, now produced by flag_str()
}
static void generate_test_instances(list<pg_missing_item*>& o) {
o.push_back(new pg_missing_item);
return !(*this == rhs);
}
};
-WRITE_CLASS_ENCODER(pg_missing_item)
+WRITE_CLASS_ENCODER_FEATURES(pg_missing_item)
ostream& operator<<(ostream& out, const pg_missing_item &item);
class pg_missing_const_i {
virtual const map<hobject_t, pg_missing_item> &
get_items() const = 0;
virtual const map<version_t, hobject_t> &get_rmissing() const = 0;
+ virtual bool get_may_include_deletes() const = 0;
virtual unsigned int num_missing() const = 0;
virtual bool have_missing() const = 0;
virtual bool is_missing(const hobject_t& oid, pg_missing_item *out = nullptr) const = 0;
pg_missing_set(const missing_type &m) {
missing = m.get_items();
rmissing = m.get_rmissing();
+ may_include_deletes = m.get_may_include_deletes(); // carry the source set's may_include_deletes setting over with its items
for (auto &&i: missing)
tracker.changed(i.first);
}
+ bool may_include_deletes = false;
+
const map<hobject_t, item> &get_items() const override {
return missing;
}
const map<version_t, hobject_t> &get_rmissing() const override {
return rmissing;
}
+ bool get_may_include_deletes() const override { // overrides the interface accessor; reports the may_include_deletes member
+ return may_include_deletes;
+ }
unsigned int num_missing() const override {
return missing.size();
}
pg_t child_pgid,
unsigned split_bits,
pg_missing_set *omissing) {
+ omissing->may_include_deletes = may_include_deletes;
unsigned mask = ~((~0)<<split_bits);
for (map<hobject_t, item>::iterator i = missing.begin();
i != missing.end();
void encode(bufferlist &bl) const {
ENCODE_START(4, 2, bl);
- ::encode(missing, bl);
- // since pg_missing_item was not versioned, we encode the new flags
- // field here explicitly
- map<hobject_t, uint8_t> missing_flags;
- for (const auto &p : missing) {
- if (p.second.flags != pg_missing_item::FLAG_NONE) {
- missing_flags.insert(make_pair(p.first,
- static_cast<uint8_t>(p.second.flags)));
- }
- }
- ::encode(missing_flags, bl);
+ ::encode(missing, bl, may_include_deletes ? CEPH_FEATURE_OSD_RECOVERY_DELETES : 0); // items use the flagged layout only when this set may hold deletes
+ ::encode(may_include_deletes, bl); // the setting itself follows the map; decode() reads it when struct_v >= 4
ENCODE_FINISH(bl);
}
void decode(bufferlist::iterator &bl, int64_t pool = -1) {
DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
::decode(missing, bl);
if (struct_v >= 4) {
- map<hobject_t, uint8_t> missing_flags;
- ::decode(missing_flags, bl);
- for (const auto &p : missing_flags) {
- assert(missing.find(p.first) != missing.end());
- missing[p.first].flags = static_cast<pg_missing_item::missing_flags_t>(p.second);
- }
+ ::decode(may_include_deletes, bl);
}
DECODE_FINISH(bl);
f->close_section();
}
f->close_section();
+ f->dump_bool("may_include_deletes", may_include_deletes);
}
template <typename F>
void filter_objects(F &&f) {
static void generate_test_instances(list<pg_missing_set*>& o) {
o.push_back(new pg_missing_set);
o.push_back(new pg_missing_set);
+ o.back()->add(
+ hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
+ eversion_t(5, 6), eversion_t(5, 1), false); // second instance: one item, added with the final (delete) argument false
+ o.push_back(new pg_missing_set); // third instance: the same item added with the final argument true
o.back()->add(
hobject_t(object_t("foo"), "foo", 123, 456, 0, ""),
eversion_t(5, 6), eversion_t(5, 1), true);
+ o.back()->may_include_deletes = true; // set on the instance whose item was added as a delete
}
template <typename F>
void get_changed(F &&f) const {
template <bool TrackChanges>
ostream& operator<<(ostream& out, const pg_missing_set<TrackChanges> &missing)
{
- out << "missing(" << missing.num_missing();
+ out << "missing(" << missing.num_missing()
+ << " may_include_deletes = " << missing.may_include_deletes;
//if (missing.num_lost()) out << ", " << missing.num_lost() << " lost";
out << ")";
return out;
TYPE_FEATUREFUL(pg_query_t)
TYPE(pg_log_entry_t)
TYPE(pg_log_t)
-TYPE(pg_missing_item)
+TYPE_FEATUREFUL(pg_missing_item)
TYPE(pg_missing_t)
TYPE(pg_ls_response_t)
TYPE(pg_nls_response_t)
class PGLogTest : public ::testing::Test, protected PGLog {
public:
PGLogTest() : PGLog(g_ceph_context) {}
- void SetUp() override { }
+ void SetUp() override {
+ missing.may_include_deletes = true; // fixture default; tests that need the other mode assign false before merging
+ }
void TearDown() override {
clear();
set<hobject_t> toremove;
list<pg_log_entry_t> torollback;
+ bool deletes_during_peering;
private:
IndexedLog fullauth;
pg_info_t authinfo;
pg_info_t divinfo;
public:
+ TestCase() : deletes_during_peering(false) {}
void setup() {
+ init.may_include_deletes = !deletes_during_peering;
+ final.may_include_deletes = !deletes_during_peering;
fullauth.log.insert(fullauth.log.end(), base.begin(), base.end());
fullauth.log.insert(fullauth.log.end(), auth.begin(), auth.end());
fulldiv.log.insert(fulldiv.log.end(), base.begin(), base.end());
pg_info_t oinfo = tcase.get_divinfo();
proc_replica_log(
- oinfo, olog, omissing, pg_shard_t(1, shard_id_t(0)));
+ oinfo, olog, omissing, pg_shard_t(1, shard_id_t(0)));
assert(oinfo.last_update >= log.tail);
for (list<pg_log_entry_t>::const_iterator i = tcase.auth.begin();
i != tcase.auth.end();
++i) {
- if (i->version > oinfo.last_update)
- omissing.add_next_event(*i);
+ if (i->version > oinfo.last_update) {
+ if (i->is_delete() && tcase.deletes_during_peering) {
+ omissing.rm(i->soid, i->version);
+ } else {
+ omissing.add_next_event(*i);
+ }
+ }
}
verify_missing(tcase, omissing);
}
oinfo.stats.reported_epoch = 1;
log.tail = olog.tail = eversion_t(1, 1);
log.head = olog.head = eversion_t(2, 1);
+ missing.may_include_deletes = false;
EXPECT_FALSE(missing.have_missing());
EXPECT_EQ(0U, log.log.size());
list<hobject_t> remove_snap;
bool dirty_info = false;
bool dirty_big_info = false;
+ missing.may_include_deletes = false;
{
pg_log_entry_t e;
bool dirty_big_info = false;
hobject_t divergent_object;
+ missing.may_include_deletes = true;
{
pg_log_entry_t e;
EXPECT_TRUE(dirty_big_info);
}
+ /* +--------------------------+
+ | log olog |
+ +--------+-------+---------+
+ | |object | |
+ |version | hash | version |
+ | | | |
+ tail > (1,1) | x5 | (1,1) < tail
+ | | | |
+ | | | |
+ | (1,2) | x3 | (1,2) < lower_bound
+ | | | |
+ | | | |
+ head > (1,3) | x9 | |
+ | DELETE | | |
+ | | | |
+ | | x9 | (2,3) |
+ | | | MODIFY |
+ | | | |
+ | | x7 | (2,4) < head
+ | | | DELETE |
+ +--------+-------+---------+
+
+ The log entry (1,3) deletes the object x9 but the olog entry (2,3) modifies
+ it and is authoritative : the log entry (1,3) is divergent.
+
+ */
+ {
+ clear();
+
+ pg_log_t olog;
+ pg_info_t oinfo;
+ pg_shard_t fromosd;
+ pg_info_t info;
+ list<hobject_t> remove_snap;
+ bool dirty_info = false;
+ bool dirty_big_info = false;
+
+ hobject_t divergent_object;
+
+ {
+ pg_log_entry_t e;
+ e.mark_unrollbackable();
+
+ e.version = eversion_t(1, 1);
+ e.soid.set_hash(0x5);
+ log.tail = e.version;
+ log.log.push_back(e);
+ e.version = eversion_t(1, 2);
+ e.soid.set_hash(0x3);
+ log.log.push_back(e);
+ e.version = eversion_t(1,3);
+ e.soid.set_hash(0x9);
+ divergent_object = e.soid;
+ e.op = pg_log_entry_t::DELETE;
+ log.log.push_back(e);
+ log.head = e.version;
+ log.index();
+
+ info.last_update = log.head;
+
+ e.version = eversion_t(1, 1);
+ e.soid.set_hash(0x5);
+ olog.tail = e.version;
+ olog.log.push_back(e);
+ e.version = eversion_t(1, 2);
+ e.soid.set_hash(0x3);
+ olog.log.push_back(e);
+ e.version = eversion_t(2, 3);
+ e.soid.set_hash(0x9);
+ e.op = pg_log_entry_t::MODIFY;
+ olog.log.push_back(e);
+ e.version = eversion_t(2, 4);
+ e.soid.set_hash(0x7);
+ e.op = pg_log_entry_t::DELETE;
+ olog.log.push_back(e);
+ olog.head = e.version;
+ }
+
+ snapid_t purged_snap(1);
+ {
+ oinfo.last_update = olog.head;
+ oinfo.purged_snaps.insert(purged_snap);
+ }
+
+ EXPECT_FALSE(missing.have_missing());
+ EXPECT_EQ(1U, log.objects.count(divergent_object));
+ EXPECT_EQ(3U, log.log.size());
+ EXPECT_TRUE(remove_snap.empty());
+ EXPECT_EQ(log.head, info.last_update);
+ EXPECT_TRUE(info.purged_snaps.empty());
+ EXPECT_FALSE(is_dirty());
+ EXPECT_FALSE(dirty_info);
+ EXPECT_FALSE(dirty_big_info);
+
+ TestHandler h(remove_snap);
+ missing.may_include_deletes = false;
+ merge_log(oinfo, olog, fromosd, info, &h,
+ dirty_info, dirty_big_info);
+
+ /* When the divergent entry is a DELETE and the authoritative
+ entry is a MODIFY, the object will be added to missing : it is
+ a verifiable side effect proving the entry was identified
+ to be divergent.
+ */
+ EXPECT_TRUE(missing.is_missing(divergent_object));
+ EXPECT_EQ(1U, log.objects.count(divergent_object));
+ EXPECT_EQ(4U, log.log.size());
+ /* DELETE entries from olog are appended to the head of the
+ log, and the divergent version of the object is removed (added
+ to remove_snap). When peering handles deletes, it is the earlier
+ version that is in the removed list.
+ */
+ EXPECT_EQ(0x7U, remove_snap.front().get_hash());
+ EXPECT_EQ(log.head, info.last_update);
+ EXPECT_TRUE(info.purged_snaps.contains(purged_snap));
+ EXPECT_TRUE(is_dirty());
+ EXPECT_TRUE(dirty_info);
+ EXPECT_TRUE(dirty_big_info);
+ }
+
/* +--------------------------+
| log olog |
+--------+-------+---------+
EXPECT_FALSE(dirty_big_info);
TestHandler h(remove_snap);
+ missing.may_include_deletes = false;
merge_log(oinfo, olog, fromosd, info, &h,
dirty_info, dirty_big_info);
EXPECT_EQ(last_update, oinfo.last_update);
EXPECT_EQ(last_complete, oinfo.last_complete);
+ missing.may_include_deletes = false;
proc_replica_log(oinfo, olog, omissing, from);
EXPECT_FALSE(omissing.have_missing());
EXPECT_EQ(olog.head, oinfo.last_update);
EXPECT_EQ(olog.head, oinfo.last_complete);
+ missing.may_include_deletes = false;
proc_replica_log(oinfo, olog, omissing, from);
EXPECT_FALSE(omissing.have_missing());
EXPECT_EQ(olog.head, oinfo.last_update);
EXPECT_EQ(olog.head, oinfo.last_complete);
+ missing.may_include_deletes = false;
proc_replica_log(oinfo, olog, omissing, from);
EXPECT_TRUE(omissing.have_missing());
EXPECT_EQ(olog.head, oinfo.last_update);
EXPECT_EQ(olog.head, oinfo.last_complete);
+ missing.may_include_deletes = false;
proc_replica_log(oinfo, olog, omissing, from);
EXPECT_TRUE(omissing.have_missing());
EXPECT_EQ(olog.head, oinfo.last_update);
EXPECT_EQ(olog.head, oinfo.last_complete);
+ missing.may_include_deletes = false;
proc_replica_log(oinfo, olog, omissing, from);
EXPECT_TRUE(omissing.have_missing());
EXPECT_EQ(olog.head, oinfo.last_update);
EXPECT_EQ(olog.head, oinfo.last_complete);
+ missing.may_include_deletes = false;
proc_replica_log(oinfo, olog, omissing, from);
EXPECT_TRUE(omissing.have_missing());
run_test_case(t);
}
+TEST_F(PGLogTest, merge_log_9) { // merge case run with deletes_during_peering enabled
+ TestCase t;
+ t.base.push_back(mk_ple_mod_rb(mk_obj(1), mk_evt(10, 100), mk_evt(8, 80))); // shared history: one entry for object 1
+
+ t.auth.push_back(mk_ple_dt(mk_obj(1), mk_evt(11, 101), mk_evt(10, 100))); // authoritative log adds a later entry for the same object (mk_ple_dt: presumably a delete - helper not shown here)
+
+ t.init.add(mk_obj(1), mk_evt(10, 100), mk_evt(8, 80), false); // initial missing set holds object 1, added with the delete argument false
+ t.toremove.insert(mk_obj(1)); // presumably the removals the test expects - confirm against run_test_case
+ t.deletes_during_peering = true; // setup() turns this into may_include_deletes = false on init and final
+
+ t.setup();
+ run_test_case(t);
+}
+
TEST_F(PGLogTest, merge_log_prior_version_have) {
TestCase t;
t.base.push_back(mk_ple_mod_rb(mk_obj(1), mk_evt(10, 100), mk_evt(8, 80)));
t, &km, log, coll, info.pgid.make_pgmeta_oid(), divergent, true);
} else {
pg_missing_tracker_t tmissing(missing);
+ bool rebuilt_missing_set_with_deletes = false;
PGLog::write_log_and_missing(
- t, &km, log, coll, info.pgid.make_pgmeta_oid(), tmissing, true);
+ t, &km, log, coll, info.pgid.make_pgmeta_oid(), tmissing, true,
+ &rebuilt_missing_set_with_deletes);
}
t.omap_setkeys(coll, info.pgid.make_pgmeta_oid(), km);
return 0;