From: Sage Weil Date: Mon, 19 Jul 2010 21:44:09 +0000 (-0700) Subject: osd: recover degraded objects _before_ modifying it X-Git-Tag: v0.21~48^2~9^2~3 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=1d2018320d1e31d2d7c1b2f49e6cdf4890e6ed8f;p=ceph.git osd: recover degraded objects _before_ modifying it This will slow down writes to degraded objects because we will wait for it to recover before applying the write. OTOH it will be robust in the case of large objects. We can optimize the small object update (and overwrite) cases later. Signed-off-by: Sage Weil --- diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index d61f7a74bfa8e..3494c79afcfde 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -4411,14 +4411,20 @@ void OSD::handle_op(MOSDOp *op) } } - // missing object? if ((op->get_flags() & CEPH_OSD_FLAG_PGOP) == 0) { + // missing object? sobject_t head(op->get_oid(), CEPH_NOSNAP); if (pg->is_missing_object(head)) { pg->wait_for_missing_object(head, op); pg->unlock(); return; } + + if (op->may_write() && pg->is_degraded_object(head)) { + pg->wait_for_degraded_object(head, op); + pg->unlock(); + return; + } } diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 0032c41afe160..f4932758519a1 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -2408,6 +2408,14 @@ bool PG::block_if_wrlocked(MOSDOp* op, object_info_t& oi) return false; //the object wasn't locked, so the operation can be handled right away } +void PG::take_object_waiters(hash_map >& m) +{ + for (hash_map >::iterator it = m.begin(); + it != m.end(); + it++) + osd->take_waiters(it->second); + m.clear(); +} // ========================================================================================== diff --git a/src/osd/PG.h b/src/osd/PG.h index 6157aee0f43d0..c0948c0d3ee00 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -764,8 +764,10 @@ public: // pg waiters list waiting_for_active; hash_map > waiting_for_missing_object; + list > waiting_for_missing_object, waiting_for_degraded_object; map replay_queue; + + void take_object_waiters(hash_map >& m); hash_map > waiting_for_wr_unlock; @@ -981,6 +983,9 @@ public: virtual bool is_missing_object(const sobject_t& oid) = 0; virtual void wait_for_missing_object(const sobject_t& oid, Message *op) = 0; + virtual bool is_degraded_object(const sobject_t& oid) = 0; + virtual void wait_for_degraded_object(const sobject_t& oid, Message *op) = 0; + virtual void on_osd_failure(int osd) = 0; virtual void on_role_change() = 0; virtual void on_change() = 0; diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 2d155342f29dd..43bccbb2729b7 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -78,7 +78,6 @@ bool ReplicatedPG::is_missing_object(const sobject_t& soid) { return missing.missing.count(soid); } - void ReplicatedPG::wait_for_missing_object(const sobject_t& soid, Message *m) { @@ -103,6 +102,39 @@ void ReplicatedPG::wait_for_missing_object(const sobject_t& soid, Message *m) waiting_for_missing_object[soid].push_back(m); } +bool ReplicatedPG::is_degraded_object(const sobject_t& soid) +{ + if (missing.missing.count(soid)) + return true; + for (unsigned i = 1; i < acting.size(); i++) { + int peer = acting[i]; + if (peer_missing.count(peer) && + peer_missing[peer].missing.count(soid)) + return true; + } + return false; +} + +void ReplicatedPG::wait_for_degraded_object(const sobject_t& soid, Message *m) +{ + assert(is_degraded_object(soid)); + + // we don't have it (yet). + if (pushing.count(soid)) { + dout(7) << "degraded " + << soid + << ", already pushing" + << dendl; + } else { + dout(7) << "degraded " + << soid + << ", pushing" + << dendl; + recover_object_replicas(soid); + } + waiting_for_degraded_object[soid].push_back(m); +} + // ========================================================== @@ -2259,16 +2291,6 @@ void ReplicatedPG::issue_repop(RepGather *repop, utime_t now, for (unsigned i=1; iobc->ondisk_read_lock(); - push_to_replica(soid, peer); - start_recovery_op(soid); - repop->obc->ondisk_read_unlock(); - } // forward the write/update/whatever MOSDSubOp *wr = new MOSDSubOp(repop->ctx->reqid, info.pgid, soid, @@ -3278,6 +3300,10 @@ void ReplicatedPG::sub_op_push_reply(MOSDSubOpReply *reply) if (pushing[soid].empty()) { dout(10) << "pushed " << soid << " to all replicas" << dendl; finish_recovery_op(soid); + if (waiting_for_degraded_object.count(soid)) { + osd->take_waiters(waiting_for_degraded_object[soid]); + waiting_for_degraded_object.erase(soid); + } } else { dout(10) << "pushed " << soid << ", still waiting for push ack from " << pushing[soid] << dendl; @@ -3702,14 +3728,12 @@ void ReplicatedPG::on_role_change() dout(10) << "on_role_change" << dendl; // take object waiters - for (hash_map >::iterator it = waiting_for_missing_object.begin(); - it != waiting_for_missing_object.end(); - it++) - osd->take_waiters(it->second); - waiting_for_missing_object.clear(); + take_object_waiters(waiting_for_missing_object); + take_object_waiters(waiting_for_degraded_object); } + // clear state. called on recovery completion AND cancellation. void ReplicatedPG::_clear_recovery_state() { @@ -3871,6 +3895,38 @@ int ReplicatedPG::recover_primary(int max) return started; } +int ReplicatedPG::recover_object_replicas(const sobject_t& soid) +{ + int started = 0; + + dout(10) << "recover_object_replicas " << soid << dendl; + + ObjectContext *obc = lookup_object_context(soid); + if (obc) { + dout(10) << " ondisk_read_lock for " << soid << dendl; + obc->ondisk_read_lock(); + } + + start_recovery_op(soid); + started++; + + // who needs it? + for (unsigned i=1; iondisk_read_unlock(); + put_object_context(obc); + } + + return started; +} + int ReplicatedPG::recover_replicas(int max) { int started = 0; @@ -3891,30 +3947,7 @@ int ReplicatedPG::recover_replicas(int max) sobject_t soid = peer_missing[peer].rmissing.begin()->second; eversion_t v = peer_missing[peer].rmissing.begin()->first; - ObjectContext *obc = lookup_object_context(soid); - if (obc) { - dout(10) << " ondisk_read_lock for " << soid << dendl; - obc->ondisk_read_lock(); - } - - start_recovery_op(soid); - started++; - - push_to_replica(soid, peer); - - // do other peers need it too? - for (i++; iondisk_read_unlock(); - put_object_context(obc); - } + started += recover_object_replicas(soid); if (started >= max) return started; diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index fda5c31b48735..29eda27215641 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -458,6 +458,7 @@ protected: // push map > pushing; + int recover_object_replicas(const sobject_t& soid); void calc_head_subsets(SnapSet& snapset, const sobject_t& head, Missing& missing, interval_set& data_subset, @@ -594,6 +595,9 @@ public: bool is_missing_object(const sobject_t& oid); void wait_for_missing_object(const sobject_t& oid, Message *op); + bool is_degraded_object(const sobject_t& oid); + void wait_for_degraded_object(const sobject_t& oid, Message *op); + void on_osd_failure(int o); void on_acker_change(); void on_role_change();