From: Sage Weil Date: Thu, 4 Dec 2008 19:17:58 +0000 (-0800) Subject: osd: drop lock during most of scrub; only disallow concurrent writes X-Git-Tag: v0.6~187 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=4cfb679bce99b33e1344ad31d2a557e18d393af5;p=ceph.git osd: drop lock during most of scrub; only disallow concurrent writes Make the PG go read-only during a scrub. Only take the pg lock when absolutely necessary. Wait for any pending writes to complete before starting the scrub. --- diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 7813d644413..585a7011783 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -3226,12 +3226,28 @@ void OSD::handle_op(MOSDOp *op) op_queue_cond.Wait(osd_lock); } + // require same or newer map + if (!require_same_or_newer_map(op, op->get_map_epoch())) + return; + + // blacklisted? + if (osdmap->is_blacklisted(op->get_source_addr())) { + dout(4) << "handle_op " << op->get_source_addr() << " is blacklisted" << dendl; + reply_op_error(op, -EBLACKLISTED); + return; + } + + // share our map with sender, if they're old + _share_map_incoming(op->get_source_inst(), op->get_map_epoch()); + + // calc actual pgid pg_t pgid = osdmap->raw_pg_to_pg(op->get_pg()); // get and lock *pg. PG *pg = _have_pg(pgid) ? _lookup_lock_pg(pgid):0; + logger->set("buf", buffer_total_alloc.test()); utime_t now = g_clock.now(); @@ -3296,7 +3312,7 @@ void OSD::handle_op(MOSDOp *op) // modify if ((pg->get_primary() != whoami || !pg->same_for_modify_since(op->get_map_epoch()))) { - dout(7) << "handle_rep_op pg changed " << pg->info.history + dout(7) << "handle_op pg changed " << pg->info.history << " after " << op->get_map_epoch() << ", dropping" << dendl; assert(op->get_map_epoch() < osdmap->get_epoch()); @@ -3304,6 +3320,14 @@ void OSD::handle_op(MOSDOp *op) delete op; return; } + + // scrubbing? 
+ if (pg->state_test(PG_STATE_SCRUBBING)) { + dout(10) << *pg << " is scrubbing, deferring op " << *op << dendl; + pg->waiting_for_active.push_back(op); + pg->unlock(); + return; + } } // pg must be active. diff --git a/src/osd/PG.cc b/src/osd/PG.cc index aecc1d30ad3..cd6f57e7718 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -1790,6 +1790,9 @@ void PG::scrub() osd->map_lock.get_read(); lock(); + + epoch_t epoch = info.history.same_since; + if (!is_primary()) { dout(10) << "scrub -- not primary" << dendl; unlock(); @@ -1810,23 +1813,46 @@ void PG::scrub() // request maps from replicas for (unsigned i=1; i<acting.size(); i++) { osd->messenger->send_message(new MOSDPGScrub(info.pgid, osd->osdmap->get_epoch()), osd->osdmap->get_inst(acting[i])); } osd->map_lock.put_read(); - dout(10) << " building my scrub map" << dendl; + // wait for any ops in progress + while (is_write_in_progress()) { + dout(10) << "scrub write(s) in progress, waiting" << dendl; + wait(); + } + + unlock(); + + dout(10) << "scrub building my map" << dendl; ScrubMap scrubmap; build_scrub_map(scrubmap); + lock(); + if (epoch != info.history.same_since) { + dout(10) << "scrub pg changed, aborting" << dendl; + unlock(); + return; + } + while (peer_scrub_map.size() < acting.size() - 1) { dout(10) << " have " << (peer_scrub_map.size()+1) << " / " << acting.size() << " scrub maps, waiting" << dendl; wait(); + + if (epoch != info.history.same_since) { + dout(10) << "scrub pg changed, aborting" << dendl; + unlock(); + return; + } } + unlock(); + // first, compare scrub maps vector<ScrubMap*> m(acting.size()); m[0] = &scrubmap; @@ -1898,7 +1924,6 @@ void PG::scrub() } } - if (ok) dout(10) << "scrub " << po->poid << " size " << po->size << " ok" << dendl; @@ -1916,12 +1941,29 @@ void PG::scrub() osd->get_logclient()->log(LOG_ERROR, s); } + lock(); + if (epoch != info.history.same_since) { + dout(10) << "scrub pg changed, aborting" << dendl; + unlock(); + return; + } + // discard peer scrub info. 
peer_scrub_map.clear(); + unlock(); + // ok, do the pg-type specific scrubbing _scrub(scrubmap); - + + lock(); + if (epoch != info.history.same_since) { + dout(10) << "scrub pg changed, aborting" << dendl; + unlock(); + return; + } + + // finish up info.stats.last_scrub = info.last_update; info.stats.last_scrub_stamp = g_clock.now(); state_clear(PG_STATE_SCRUBBING); diff --git a/src/osd/PG.h b/src/osd/PG.h index 00752789371..37cd5f82796 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -790,6 +790,7 @@ public: virtual bool same_for_modify_since(epoch_t e) = 0; virtual bool same_for_rep_modify_since(epoch_t e) = 0; + virtual bool is_write_in_progress() = 0; virtual bool is_missing_object(object_t oid) = 0; virtual void wait_for_missing_object(object_t oid, Message *op) = 0; diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 32379826c42..488745b010f 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -1502,8 +1502,12 @@ void ReplicatedPG::put_projected_object(ProjectedObjectInfo *pinfo) << pinfo->ref << " -> " << (pinfo->ref-1) << dendl; --pinfo->ref; - if (pinfo->ref == 0) + if (pinfo->ref == 0) { projected_objects.erase(pinfo->poid); + + if (projected_objects.empty()) + kick(); + } } diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 3ed3234f570..f5e808a5f96 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -143,6 +143,9 @@ protected: ProjectedObjectInfo *get_projected_object(pobject_t poid); void put_projected_object(ProjectedObjectInfo *pinfo); + bool is_write_in_progress() { + return !projected_objects.empty(); + } // load balancing set balancing_reads;