start_read_op(
priority,
m.reads,
- OpRequestRef());
+ OpRequestRef(),
+ false);
}
void ECBackend::continue_recovery_op(
set<pg_shard_t> to_read;
uint64_t recovery_max_chunk = get_recovery_chunk_size();
int r = get_min_avail_to_read_shards(
- op.hoid, want, true, &to_read);
+ op.hoid, want, true, false, &to_read);
if (r != 0) {
// we must have lost a recovery source
assert(!op.recovery_progress.first);
assert(rop.in_progress.count(from));
rop.in_progress.erase(from);
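+ // With redundant reads the op may be decodable before every shard
+ // has replied, so check for completion as each reply arrives.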
+ bool is_complete = true;
if (!rop.in_progress.empty()) {
- dout(10) << __func__ << " readop not complete: " << rop << dendl;
- } else {
- dout(10) << __func__ << " readop complete: " << rop << dendl;
+ if (rop.do_redundant_reads) {
+ for (map<hobject_t, read_result_t, hobject_t::BitwiseComparator>::const_iterator iter =
+ rop.complete.begin();
+ iter != rop.complete.end();
+ ++iter) {
+ set<int> have;
+ for (map<pg_shard_t, bufferlist>::const_iterator j =
+ iter->second.returned.front().get<2>().begin();
+ j != iter->second.returned.front().get<2>().end();
+ ++j) {
+ have.insert(j->first.shard);
+ }
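+ // Ask the plugin whether the shards that have returned so far
+ // are sufficient to decode the chunks the client wants.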
+ set<int> want_to_read, dummy_minimum;
+ get_want_to_read_shards(&want_to_read);
+ if (ec_impl->minimum_to_decode(want_to_read, have, &dummy_minimum) < 0) {
+ is_complete = false;
+ break;
+ }
+ }
+ } else {
+ is_complete = false;
+ }
+ }
+ if (is_complete) {
complete_read_op(rop, m);
+ } else {
+ dout(10) << __func__ << " readop not complete: " << rop << dendl;
}
}
const hobject_t &hoid,
const set<int> &want,
bool for_recovery,
+ bool do_redundant_reads,
set<pg_shard_t> *to_read)
{
+ // Make sure we don't do redundant reads for recovery
+ assert(!for_recovery || !do_redundant_reads);
+
map<hobject_t, set<pg_shard_t>, hobject_t::BitwiseComparator>::const_iterator miter =
get_parent()->get_missing_loc_shards().find(hoid);
if (r < 0)
return r;
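+ // Fast read: request every available shard rather than the
+ // minimal set, so the read can complete as soon as enough
+ // replies have arrived.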
+ if (do_redundant_reads) {
+ need.swap(have);
+ }
+
if (!to_read)
return 0;
void ECBackend::start_read_op(
int priority,
map<hobject_t, read_request_t, hobject_t::BitwiseComparator> &to_read,
- OpRequestRef _op)
+ OpRequestRef _op,
+ bool do_redundant_reads)
{
ceph_tid_t tid = get_parent()->get_tid();
assert(!tid_to_read_map.count(tid));
op.tid = tid;
op.to_read.swap(to_read);
op.op = _op;
+ op.do_redundant_reads = do_redundant_reads;
dout(10) << __func__ << ": starting " << op << dendl;
map<pg_shard_t, ECSubRead> messages;
const hobject_t &hoid,
const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
pair<bufferlist*, Context*> > > &to_read,
- Context *on_complete)
+ Context *on_complete,
+ bool fast_read)
{
in_progress_client_reads.push_back(ClientAsyncReadStatus(on_complete));
CallClientContexts *c = new CallClientContexts(
offsets.push_back(boost::make_tuple(tmp.first, tmp.second, i->first.get<2>()));
}
- const vector<int> &chunk_mapping = ec_impl->get_chunk_mapping();
set<int> want_to_read;
- for (int i = 0; i < (int)ec_impl->get_data_chunk_count(); ++i) {
- int chunk = (int)chunk_mapping.size() > i ? chunk_mapping[i] : i;
- want_to_read.insert(chunk);
- }
+ get_want_to_read_shards(&want_to_read);
+
set<pg_shard_t> shards;
int r = get_min_avail_to_read_shards(
hoid,
want_to_read,
false,
+ fast_read,
&shards);
assert(r == 0);
start_read_op(
cct->_conf->osd_client_op_priority,
for_read_op,
- OpRequestRef());
+ OpRequestRef(),
+ fast_read);
return;
}
const hobject_t &hoid,
const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
pair<bufferlist*, Context*> > > &to_read,
- Context *on_complete);
+ Context *on_complete,
+ bool fast_read = false);
private:
friend struct ECRecoveryHandle;
sinfo.get_stripe_width());
}
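+ // Shards wanted for a client read: one per data chunk, translated
+ // through the plugin's chunk mapping when one is provided.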
+ void get_want_to_read_shards(set<int> *want_to_read) const {
+ const vector<int> &chunk_mapping = ec_impl->get_chunk_mapping();
+ for (int i = 0; i < (int)ec_impl->get_data_chunk_count(); ++i) {
+ int chunk = (int)chunk_mapping.size() > i ? chunk_mapping[i] : i;
+ want_to_read->insert(chunk);
+ }
+ }
+
/**
* Recovery
*
int priority;
ceph_tid_t tid;
OpRequestRef op; // may be null if not on behalf of a client
+ // True if redundant reads are issued, false otherwise;
+ // useful for trading extra resources (redundant ops) for
+ // lower read latency, especially on a relatively idle cluster
+ bool do_redundant_reads;
map<hobject_t, read_request_t, hobject_t::BitwiseComparator> to_read;
map<hobject_t, read_result_t, hobject_t::BitwiseComparator> complete;
void start_read_op(
int priority,
map<hobject_t, read_request_t, hobject_t::BitwiseComparator> &to_read,
- OpRequestRef op);
+ OpRequestRef op,
+ bool do_redundant_reads);
/**
const hobject_t &hoid, ///< [in] object
const set<int> &want, ///< [in] desired shards
bool for_recovery, ///< [in] true if we may use non-acting replicas
+ bool do_redundant_reads, ///< [in] true if we want to issue redundant reads to reduce latency
set<pg_shard_t> *to_read ///< [out] shards to read
); ///< @return error code, 0 on success