We fix a number of issues with partial_write here.
Fix an issue where it is unclear whether the empty PWLC state is
newer or older than a populated PWLC on another shard by always
updating the pwlc with an empty range, rather than blank.
This is an unfortunate small increase in metadata, so we should
come back to this in a later commit (or possibly later PR).
Normally a PG log consists of a set of log entries with each
log entry have a version number one greater than the previous
entry. When a PG splits the PG log is split so that each of the
new PGs only has log entries for objects in that PG, which
means there can be gaps between version numbers.
PGBackend::partial_write is trying to keep track of adjacent
log updates than do not update a particular shard storing
these as a range in partial_writes_last_complete. To do this
it must compare with the version number of the previous log
entry rather than testing for a version number increment of one.
Also simplify partial_writes to make it more readable.
Signed-off-by: Bill Scales <bill_scales@uk.ibm.com>
Signed-off-by: Alex Ainscow <aainscow@uk.ibm.com>
(cherry picked from commit
2467406f3e22c0746ce20cd04b838dccedadf055)
void trim(const pg_log_entry_t &entry) override {
// TODO
}
- void partial_write(pg_info_t *info, const pg_log_entry_t &entry) override {
+ void partial_write(pg_info_t *info,
+ eversion_t previous_version,
+ const pg_log_entry_t &entry
+ ) override {
// TODO
- ceph_assert(entry.written_shards.empty() && info->partial_writes_last_complete.empty());
+ ceph_assert(entry.written_shards.empty() &&
+ info->partial_writes_last_complete.empty());
}
};
PGLog::LogEntryHandlerRef get_log_handler(
void trim(const pg_log_entry_t &entry) override {
pg->get_pgbackend()->trim(entry, t);
}
- void partial_write(pg_info_t *info, const pg_log_entry_t &entry) override {
- pg->get_pgbackend()->partial_write(info, entry);
+ void partial_write(pg_info_t *info, eversion_t previous_version,
+ const pg_log_entry_t &entry
+ ) override {
+ pg->get_pgbackend()->partial_write(info, previous_version, entry);
}
};
void PGBackend::partial_write(
pg_info_t *info,
+ eversion_t previous_version,
const pg_log_entry_t &entry)
{
ceph_assert(info != nullptr);
+ if (entry.written_shards.empty() && info->partial_writes_last_complete.empty()) {
+ return;
+ }
auto dpp = get_parent()->get_dpp();
- if (!entry.written_shards.empty()) {
- ldpp_dout(dpp, 20) << __func__ << " version=" << entry.version
- << " written_shards=" << entry.written_shards
- << " present_shards=" << entry.present_shards
- << " pwlc=" << info->partial_writes_last_complete
- << dendl;
- const pg_pool_t &pool = get_parent()->get_pool();
- for (unsigned int shard = 0;
- shard < get_parent()->get_pool().size;
- shard++) {
- if (pool.is_nonprimary_shard(shard_id_t(shard))) {
- if (!entry.is_written_shard(shard_id_t(shard))) {
- if (!info->partial_writes_last_complete.contains(shard_id_t(shard))) {
- // 1st partial write since all logs were updated
- info->partial_writes_last_complete[shard_id_t(shard)] =
- std::pair(entry.prior_version, entry.version);
- } else if (info->partial_writes_last_complete[shard_id_t(shard)]
- .second.version + 1 == entry.version.version) {
- // Subsequent partial write, version is sequential
- info->partial_writes_last_complete[shard_id_t(shard)].second =
- entry.version;
- } else {
- // Subsequent partial write, discontiguous versions
- ldpp_dout(dpp, 20) << __func__ << " cannot update shard " << shard
- << dendl;
- }
- } else {
- // Log updated or shard absent, partial write entry not required
- info->partial_writes_last_complete.erase(shard_id_t(shard));
- }
+ ldpp_dout(dpp, 20) << __func__ << " version=" << entry.version
+ << " written_shards=" << entry.written_shards
+ << " present_shards=" << entry.present_shards
+ << " pwlc=" << info->partial_writes_last_complete
+ << " previous_version=" << previous_version
+ << dendl;
+ const pg_pool_t &pool = get_parent()->get_pool();
+ for (shard_id_t shard : pool.nonprimary_shards) {
+ auto pwlc_iter = info->partial_writes_last_complete.find(shard);
+ if (!entry.is_written_shard(shard)) {
+ if (pwlc_iter == info->partial_writes_last_complete.end()) {
+ // 1st partial write since all logs were updated
+ info->partial_writes_last_complete[shard] =
+ std::pair(previous_version, entry.version);
+
+ continue;
}
+ auto &&[old_v, new_v] = pwlc_iter->second;
+ if (old_v == new_v) {
+ old_v = previous_version;
+ new_v = entry.version;
+ } else if (new_v == previous_version) {
+ // Subsequent partial write, contiguous versions
+ new_v = entry.version;
+ } else {
+ // Subsequent partial write, discontiguous versions
+ ldpp_dout(dpp, 20) << __func__ << " cannot update shard " << shard
+ << dendl;
+ }
+ } else if (pwlc_iter != info->partial_writes_last_complete.end()) {
+ auto &&[old_v, new_v] = pwlc_iter->second;
+ // Log updated or shard absent, partial write entry is a no-op
+ // FIXME: In a later commit (or later PR) we should look at other ways of
+ // actually clearing the PWLC once all shards have seen the update.
+ old_v = new_v = entry.version;
}
- ldpp_dout(dpp, 20) << __func__ << " after pwlc="
- << info->partial_writes_last_complete << dendl;
- } else {
- // All shard updated - clear partial write data
- if (!info->partial_writes_last_complete.empty()) {
- ldpp_dout(dpp, 20) << __func__ << " clear pwlc" << dendl;
- }
- info->partial_writes_last_complete.clear();
}
+ ldpp_dout(dpp, 20) << __func__ << " after pwlc="
+ << info->partial_writes_last_complete << dendl;
}
void PGBackend::remove(
void partial_write(
pg_info_t *info,
+ eversion_t previous_version,
const pg_log_entry_t &entry);
void remove(
const hobject_t &hoid,
version_t v) = 0;
virtual void partial_write(
- pg_info_t *info,
- const pg_log_entry_t &entry) = 0;
+ pg_info_t *info,
+ eversion_t previous_version,
+ const pg_log_entry_t &entry
+ ) = 0;
virtual ~LogEntryHandler() {}
};
using LogEntryHandlerRef = std::unique_ptr<LogEntryHandler>;
rollback_info_trimmed_to = to;
}
+ eversion_t previous_version;
+ if (rollback_info_trimmed_to_riter == log.rend()) {
+ previous_version = tail;
+ } else {
+ previous_version = rollback_info_trimmed_to_riter->version;
+ }
while (rollback_info_trimmed_to_riter != log.rbegin()) {
--rollback_info_trimmed_to_riter;
if (rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) {
++rollback_info_trimmed_to_riter;
break;
}
- f(*rollback_info_trimmed_to_riter);
+ f(*rollback_info_trimmed_to_riter, previous_version);
+ previous_version = rollback_info_trimmed_to_riter->version;
}
return dirty_log;
void trim_rollback_info_to(eversion_t to, pg_info_t *info, LogEntryHandler *h) {
advance_can_rollback_to(
to,
- [&](pg_log_entry_t &entry) {
+ [&](pg_log_entry_t &entry, eversion_t previous_version) {
h->trim(entry);
- h->partial_write(info, entry);
+ h->partial_write(info, previous_version, entry);
});
}
bool roll_forward_to(eversion_t to, pg_info_t *info, LogEntryHandler *h) {
return advance_can_rollback_to(
to,
- [&](pg_log_entry_t &entry) {
+ [&](pg_log_entry_t &entry, eversion_t previous_version) {
h->rollforward(entry);
- h->partial_write(info, entry);
+ h->partial_write(info, previous_version, entry);
});
}
void skip_can_rollback_to_to_head(pg_info_t *info, LogEntryHandler *h) {
advance_can_rollback_to(
head,
- [&](pg_log_entry_t &entry) {
- h->partial_write(info, entry);
+ [&](pg_log_entry_t &entry, eversion_t previous_version) {
+ h->partial_write(info, previous_version, entry);
});
}
void skip_can_rollback_to_to_head() {
- advance_can_rollback_to(head, [&](const pg_log_entry_t &entry) {});
+ advance_can_rollback_to(head, [&](const pg_log_entry_t &entry,
+ eversion_t previous_version) {});
}
mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) {
void trim(
const pg_log_entry_t &entry) override {}
void partial_write(
- pg_info_t *info,
- const pg_log_entry_t &entry) override {}
+ pg_info_t *info,
+ eversion_t previous_version,
+ const pg_log_entry_t &entry
+ ) override {}
};
template <typename missing_t>
void trim(
const pg_log_entry_t &entry) override {}
void partial_write(
- pg_info_t *info,
- const pg_log_entry_t &entry) override {}
+ pg_info_t *info,
+ eversion_t previous_version,
+ const pg_log_entry_t &entry
+ ) override {}
};
TEST_F(PGLogTest, rewind_divergent_log) {