// Push-reply handling: ignore replies for objects that are no longer being
// recovered (the RecoveryOp may already have completed or been cancelled).
if (!recovery_ops.count(op.soid))
return;
RecoveryOp &rop = recovery_ops[op.soid];
// NOTE(review): count() -> contains() is behaviour-preserving; contains()
// states the membership-test intent more directly (C++20 associative
// container API). The replying shard must be one we are waiting on.
- ceph_assert(rop.waiting_on_pushes.count(from));
+ ceph_assert(rop.waiting_on_pushes.contains(from));
// Drop this shard from the outstanding set and let the op advance.
rop.waiting_on_pushes.erase(from);
continue_recovery_op(rop, m);
}
// Reconstruct the wanted shards from the data that was read. A non-zero
// return would mean the read plan was not actually decodable, which is a
// logic error rather than a runtime condition, hence the assert.
int r = op.returned_data->decode(ec_impl, shard_want_to_read, aligned_size);
ceph_assert(r == 0);
// NOTE(review): the unconditional parity re-encode after decode is removed
// here. Presumably decode() (or a later step) now regenerates any missing
// parity shards itself — confirm that recovering a parity shard still
// produces correct parity without this encode() call.
- // We are never appending here, so we never need hinfo.
- op.returned_data->insert_parity_buffers();
- r = op.returned_data->encode(ec_impl, NULL, 0);
- ceph_assert(r==0);
// Finally, we don't want to write any padding, so truncate the buffer
// to remove it.
op.state = RecoveryOp::READING;
- // We always read the recovery chunk size (default 8MiB + parity). If that
- // amount of data is not available, then the backend will truncate the
- // response.
+ /* When beginning recovery, the OI may not be known. As such the object
+ * size is not known. For the first read, attempt to read the default
+ * size. If this is larger than the object size, then the OSD will
+ * return truncated reads. If the object size is known, then attempt
+ * correctly sized reads.
+ */
+ uint64_t read_size = get_recovery_chunk_size();
+ if (op.obc) {
+ // Object size is known: clamp the read to the (aligned) remaining bytes
+ // so we do not request past end-of-object.
+ uint64_t read_to_end = ECUtil::align_next(op.obc->obs.oi.size) -
+ op.recovery_progress.data_recovered_to;
+
+ if (read_to_end < read_size) {
+ read_size = read_to_end;
+ }
+ }
sinfo.ro_range_to_shard_extent_set_with_parity(
- op.recovery_progress.data_recovered_to,
- get_recovery_chunk_size(), want);
+ op.recovery_progress.data_recovered_to, read_size, want);
+
+ // NOTE(review): progress is now advanced when the read is ISSUED rather
+ // than when it completes (the get_ro_end()-based assignment is dropped
+ // further down). If the OSD truncates the first, unsized read, this may
+ // overstate data_recovered_to — confirm a restarted/failed read resets it.
+ op.recovery_progress.data_recovered_to += read_size;
+
+ // We only need to recover shards that are missing: drop every healthy
+ // shard (all shards minus the missing set) from the wanted extent set.
+ for (auto shard : shard_id_set::difference(sinfo.get_all_shards(), op.missing_on_shards)) {
+ want.erase(shard);
+ }
// On the first pass (and only when the object context is available) capture
// the xattrs from the attr cache for the recovery target.
if (op.recovery_progress.first && op.obc) {
op.xattrs = op.obc->attr_cache;
}
// No shard reads were generated for this pass.
if (read_request.shard_reads.empty()) {
ceph_assert(op.obc);
- ceph_assert(0 == op.obc->obs.oi.size);
- dout(10) << __func__ << "Zero size object recovery, skipping reads."
- << op << dendl;
+ /* This can happen for several reasons
+ * - A zero-sized object.
+ * - The missing shards have no data.
+ * - The previous recovery did not need the last data shard. In this
+ * case, data_recovered_to may indicate that the last shard still
+ * needs recovery, when it does not.
+ * We can just skip the read and fall through below.
+ */
+ dout(10) << __func__ << " No reads required " << op << dendl;
// Create an empty read result and fall through.
op.returned_data.emplace(&sinfo);
} else {
dout(20) << __func__ << ": returned_data=" << op.returned_data << dendl;
op.state = RecoveryOp::WRITING;
ObjectRecoveryProgress after_progress = op.recovery_progress;
// NOTE(review): after_progress now inherits data_recovered_to from the
// value advanced when the read was issued, instead of deriving it from
// the returned data's ro_end — confirm the two agree when the backend
// returns a truncated (short) read.
- after_progress.data_recovered_to = op.returned_data->get_ro_end();
after_progress.first = false;
// Recovery of this object is complete once progress reaches (or, for
// padded/truncated reads, passes) the object's logical size.
if (after_progress.data_recovered_to >= op.obc->obs.oi.size) {
after_progress.data_complete = true;
const std::vector<raw_shard_id_t> chunk_mapping_reverse;
const shard_id_set data_shards;
const shard_id_set parity_shards;
+ // Precomputed set of every shard id (data + parity); see calc_all_shards().
+ // Declared after parity_shards so the initializer-list order below matches
+ // declaration order (members initialize in declaration order).
+ const shard_id_set all_shards;
private:
void ro_range_to_shards(
return data_shards;
}
+ // Build the full shard id set [0, k+m). Used once at construction to
+ // initialise the all_shards member.
+ static shard_id_set calc_all_shards(int k_plus_m) {
+ shard_id_set all_shards;
+ all_shards.insert_range(shard_id_t(), k_plus_m);
+ return all_shards;
+ }
+
+
public:
stripe_info_t(const ErasureCodeInterfaceRef &ec_impl, const pg_pool_t *pool,
uint64_t stripe_width
complete_chunk_mapping(ec_impl->get_chunk_mapping(), k + m)),
chunk_mapping_reverse(reverse_chunk_mapping(chunk_mapping)),
data_shards(calc_shards(raw_shard_id_t(), k, chunk_mapping)),
- parity_shards(calc_shards(raw_shard_id_t(k), m, chunk_mapping)) {
+ parity_shards(calc_shards(raw_shard_id_t(k), m, chunk_mapping)),
+ all_shards(calc_all_shards(k + m)) {
ceph_assert(stripe_width != 0);
ceph_assert(stripe_width % k == 0);
}
return parity_shards;
}
+ // Accessor for the precomputed full shard set.
+ // NOTE(review): auto deduces shard_id_set by value, so each call copies
+ // the set — consider returning const auto& if shard_id_set is not a cheap
+ // bitset-like type; confirm before changing.
+ auto get_all_shards() const {
+ return all_shards;
+ }
+
+
// Round a rados-object offset down to its stripe boundary and map it to
// the corresponding per-shard chunk offset.
uint64_t ro_offset_to_prev_chunk_offset(uint64_t offset) const {
return (offset / stripe_width) * chunk_size;
}