From f51348dc8bdd5071b7baaf3f0e4d2e0496618f08 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 3 Dec 2008 12:19:59 -0800 Subject: [PATCH] osd: distributed scrub compares primary vs replica contents The checks are still pretty trivial at this point. --- src/messages/MOSDPGScrub.h | 52 +++++++++++ src/messages/MOSDScrub.h | 2 +- src/msg/Message.cc | 4 + src/msg/Message.h | 1 + src/osd/OSD.cc | 42 ++++++++- src/osd/OSD.h | 1 + src/osd/PG.cc | 184 ++++++++++++++++++++++++++++++++++++- src/osd/PG.h | 19 +++- src/osd/ReplicatedPG.cc | 10 +- src/osd/ReplicatedPG.h | 8 +- src/osd/osd_types.h | 43 ++++++++- 11 files changed, 349 insertions(+), 17 deletions(-) create mode 100644 src/messages/MOSDPGScrub.h diff --git a/src/messages/MOSDPGScrub.h b/src/messages/MOSDPGScrub.h new file mode 100644 index 0000000000000..4be436883dd24 --- /dev/null +++ b/src/messages/MOSDPGScrub.h @@ -0,0 +1,52 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __MOSDPGSCRUB_H +#define __MOSDPGSCRUB_H + +#include "msg/Message.h" + +struct MOSDPGScrub : public Message { + pg_t pgid; + epoch_t epoch; + bufferlist map; + + MOSDPGScrub() {} + MOSDPGScrub(pg_t p, epoch_t e) : + Message(MSG_OSD_PG_SCRUB), + pgid(p), epoch(e) {} + + const char *get_type_name() { return "pg_scrub"; } + void print(ostream& out) { + out << "pg_scrub(" << pgid << " e" << epoch; + if (map.length()) + out << " " << map.length() << " bytes"; + out << ")"; + } + + void encode_payload() { + ::encode(pgid, payload); + ::encode(epoch, payload); + ::encode(map, payload); + } + void decode_payload() { + bufferlist::iterator p = payload.begin(); + ::decode(pgid, p); + ::decode(epoch, p); + ::decode(map, p); + } +}; + +#endif diff --git a/src/messages/MOSDScrub.h b/src/messages/MOSDScrub.h index 8d9c3f0ba1dec..8768be423b5b9 100644 --- a/src/messages/MOSDScrub.h +++ b/src/messages/MOSDScrub.h @@ -19,7 +19,7 @@ #include "msg/Message.h" /* - * Scru - instruct an OSD to create a pg, if it doesn't already exist + * instruct an OSD to scrub some or all pg(s) */ struct MOSDScrub : public Message { diff --git a/src/msg/Message.cc b/src/msg/Message.cc index 3e322a1b40bda..5b7a3e88ef9a7 100644 --- a/src/msg/Message.cc +++ b/src/msg/Message.cc @@ -44,6 +44,7 @@ using namespace std; #include "messages/MOSDPGRemove.h" #include "messages/MOSDPGInfo.h" #include "messages/MOSDPGCreate.h" +#include "messages/MOSDPGScrub.h" #include "messages/MOSDScrub.h" #include "messages/MRemoveSnaps.h" @@ -236,6 +237,9 @@ Message *decode_message(ceph_msg_header& header, ceph_msg_footer& footer, case MSG_OSD_PG_CREATE: m = new MOSDPGCreate; break; + case MSG_OSD_PG_SCRUB: + m = new MOSDPGScrub; + break; case MSG_OSD_SCRUB: m = new MOSDScrub; diff --git a/src/msg/Message.h b/src/msg/Message.h index bf6eca6d497b5..bc94c08334143 100644 --- a/src/msg/Message.h +++ b/src/msg/Message.h @@ -53,6 +53,7 @@ #define MSG_REMOVE_SNAPS 89 #define MSG_OSD_SCRUB 90 +#define MSG_OSD_PG_SCRUB 91 diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index af1c0b50f0a25..984bd32f5f983 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -58,6 +58,7 @@ #include "messages/MOSDPGRemove.h" #include "messages/MOSDPGInfo.h" #include "messages/MOSDPGCreate.h" +#include "messages/MOSDPGScrub.h" #include "messages/MOSDAlive.h" @@ -696,7 +697,7 @@ void OSD::_remove_unlock_pg(PG *pg) } // log - t.remove(0, pgid.to_pobject()); + t.remove(0, pgid.to_log_pobject()); // main collection store->collection_list(pgid.to_coll(), olist); @@ -1564,6 +1565,9 @@ void OSD::dispatch(Message *m) case MSG_OSD_PG_INFO: handle_pg_info((MOSDPGInfo*)m); break; + case MSG_OSD_PG_SCRUB: + handle_pg_scrub((MOSDPGScrub*)m); + break; // client ops case CEPH_MSG_OSD_OP: @@ -2887,6 +2891,42 @@ void OSD::handle_pg_info(MOSDPGInfo *m) delete m; } +void OSD::handle_pg_scrub(MOSDPGScrub *m) +{ + dout(7) << "handle_pg_scrub " << *m << " from " << m->get_source() << dendl; + int from = m->get_source().num(); + if (!require_same_or_newer_map(m, m->epoch)) return; + + PG *pg = _lookup_lock_pg(m->pgid); + if (pg) { + if (m->epoch < pg->info.history.same_since) { + dout(10) << *pg << " has changed since " << m->epoch << dendl; + } else { + if (pg->is_primary()) { + dout(10) << "handle_pg_scrub got peer osd" << from << " scrub map" << dendl; + bufferlist::iterator p = m->map.begin(); + pg->peer_scrub_map[from].decode(p); + pg->kick(); + } else { + // replica, reply + dout(10) << "handle_pg_scrub generating scrub map for primary" << dendl; + + // do this is a separate thread.. FIXME + ScrubMap map; + pg->build_scrub_map(map); + + MOSDPGScrub *reply = new MOSDPGScrub(pg->info.pgid, osdmap->get_epoch()); + ::encode(map, reply->map); + messenger->send_message(reply, m->get_source_inst()); + } + } + pg->unlock(); + } else { + dout(10) << " pg " << m->pgid << " not found" << dendl; + } + delete m; +} + /** PGQuery * from primary to replica | stray diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 8bd47429c47ca..152caaa4d8472 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -439,6 +439,7 @@ private: void handle_pg_log(class MOSDPGLog *m); void handle_pg_info(class MOSDPGInfo *m); void handle_pg_remove(class MOSDPGRemove *m); + void handle_pg_scrub(class MOSDPGScrub *m); // helper for handle_pg_log and handle_pg_info void _process_pg_info(epoch_t epoch, int from, diff --git a/src/osd/PG.cc b/src/osd/PG.cc index d2df2cbfb142d..db5c1b3d46d68 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -25,6 +25,7 @@ #include "messages/MOSDPGLog.h" #include "messages/MOSDPGRemove.h" #include "messages/MOSDPGInfo.h" +#include "messages/MOSDPGScrub.h" #define DOUT_SUBSYS osd #undef dout_prefix @@ -891,6 +892,8 @@ void PG::clear_primary_state() log.reset_recovery_pointers(); stat_object_temp_rd.clear(); + + peer_scrub_map.clear(); } void PG::peer(ObjectStore::Transaction& t, @@ -1477,8 +1480,8 @@ void PG::write_log(ObjectStore::Transaction& t) ondisklog.top = bl.length(); // write it - t.remove(0, info.pgid.to_pobject() ); - t.write(0, info.pgid.to_pobject() , 0, bl.length(), bl); + t.remove(0, info.pgid.to_log_pobject() ); + t.write(0, info.pgid.to_log_pobject() , 0, bl.length(), bl); t.collection_setattr(info.pgid.to_coll(), "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom)); t.collection_setattr(info.pgid.to_coll(), "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); @@ -1519,7 +1522,7 @@ void PG::trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v) t.collection_setattr(info.pgid.to_coll(), "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); if (!g_conf.osd_preserve_trimmed_log) - t.zero(0, info.pgid.to_pobject(), 0, ondisklog.bottom); + t.zero(0, info.pgid.to_log_pobject(), 0, ondisklog.bottom); } @@ -1549,7 +1552,7 @@ void PG::append_log(ObjectStore::Transaction &t, bufferlist& bl, if (ondisklog.top % 4096 == 0) ondisklog.block_map[ondisklog.top] = logversion; - t.write(0, info.pgid.to_pobject(), ondisklog.top, bl.length(), bl ); + t.write(0, info.pgid.to_log_pobject(), ondisklog.top, bl.length(), bl ); ondisklog.top += bl.length(); t.collection_setattr(info.pgid.to_coll(), "ondisklog_top", @@ -1585,7 +1588,7 @@ void PG::read_log(ObjectStore *store) if (ondisklog.top > 0) { // read bufferlist bl; - store->read(0, info.pgid.to_pobject(), ondisklog.bottom, ondisklog.top-ondisklog.bottom, bl); + store->read(0, info.pgid.to_log_pobject(), ondisklog.bottom, ondisklog.top-ondisklog.bottom, bl); if (bl.length() < ondisklog.top-ondisklog.bottom) { dout(0) << "read_log got " << bl.length() << " bytes, expected " << ondisklog.top << "-" << ondisklog.bottom << "=" @@ -1706,3 +1709,174 @@ bool PG::block_if_wrlocked(MOSDOp* op) +// ========================================================================================== +// SCRUB + + +/* + * build a (sorted) summary of pg content for purposes of scrubbing + */ +void PG::build_scrub_map(ScrubMap &map) +{ + dout(10) << "build_scrub_map" << dendl; + coll_t c = info.pgid.to_coll(); + + // objects + vector ls; + osd->store->collection_list(c, ls); + + // sort + dout(10) << "sorting" << dendl; + vector< pair > tab(ls.size()); + vector< pair >::iterator q = tab.begin(); + int i = 0; + for (vector::iterator p = ls.begin(); + p != ls.end(); + p++, i++, q++) { + q->first = *p; + q->second = i; + } + sort(tab.begin(), tab.end()); + // tab is now sorted, with ->second indicating object's original position + vector pos(ls.size()); + i = 0; + for (vector< pair >::iterator p = tab.begin(); + p != tab.end(); + p++, i++) + pos[p->second] = i; + // now, pos[orig pos] = sorted pos + + dout(10) << " " << ls.size() << " objects" << dendl; + map.objects.resize(ls.size()); + i = 0; + for (vector::iterator p = ls.begin(); + p != ls.end(); + p++, i++) { + pobject_t poid = *p; + + ScrubMap::object& o = map.objects[pos[i]]; + o.poid = *p; + + struct stat st; + int r = osd->store->stat(c, poid, &st); + assert(r == 0); + o.size = st.st_size; + + osd->store->getattrs(c, poid, o.attrs); + + dout(15) << " " << poid << dendl; + } + + // pg attrs + osd->store->collection_getattrs(c, map.attrs); + + // log + osd->store->read(coll_t(), info.pgid.to_log_pobject(), 0, 0, map.logbl); + dout(10) << " log " << map.logbl.length() << " bytes" << dendl; +} + + + +void PG::scrub() +{ + osd->map_lock.get_read(); + + lock(); + if (!is_primary()) { + dout(10) << "scrub -- not primary" << dendl; + unlock(); + osd->map_lock.put_read(); + return; + } + + if (!is_active() || !is_clean()) { + dout(10) << "scrub -- not active or not clean" << dendl; + unlock(); + osd->map_lock.put_read(); + return; + } + + dout(10) << "scrub start" << dendl; + + // request maps from replicas + for (unsigned i=1; imessenger->send_message(new MOSDPGScrub(info.pgid, osd->osdmap->get_epoch()), + osd->osdmap->get_inst(acting[i])); + } + + osd->map_lock.put_read(); + + dout(10) << " building my scrub map" << dendl; + ScrubMap map; + build_scrub_map(map); + + while (peer_scrub_map.size() < acting.size() - 1) { + dout(10) << " have " << peer_scrub_map.size() << " / " << (acting.size()-1) << " scrub maps, waiting" << dendl; + wait(); + } + + // first, compare scrub maps + vector m(acting.size()); + m[0] = ↦ + for (unsigned i=1; i::iterator p[acting.size()]; + for (unsigned i=0; iobjects.begin(); + + while (1) { + ScrubMap::object *po = 0; + bool missing = false; + for (unsigned i=0; iobjects.end()) + continue; + if (!po) + po = &(*p[i]); + else if (po->poid != p[i]->poid) { + missing = true; + if (po->poid > p[i]->poid) + po = &(*p[i]); + } + } + if (!po) + break; + if (missing) { + for (unsigned i=0; ipoid != p[i]->poid) + dout(0) << " osd" << acting[i] << " missing " << po->poid << dendl; + else + p[i]++; + } + continue; + } + + // compare + bool ok = true; + for (unsigned i=1; isize != p[i]->size) { + dout(0) << " osd" << acting[i] << " " << po->poid + << " size " << p[i]->size << " != " << po->size << dendl; + ok = false; + } + // fixme: check attrs + } + + + if (ok) + dout(10) << "scrub " << po->poid << " size " << po->size << " ok" << dendl; + + // next + for (unsigned i=0; i peer_scrub_map; + + void scrub(); + void build_scrub_map(ScrubMap &map); + virtual void _scrub(ScrubMap &map) {} + + public: PG(OSD *o, pg_t p) : osd(o), @@ -764,7 +782,6 @@ public: virtual void do_sub_op(MOSDSubOp *op) = 0; virtual void do_sub_op_reply(MOSDSubOpReply *op) = 0; virtual bool snap_trimmer() = 0; - virtual void scrub() { }; virtual bool same_for_read_since(epoch_t e) = 0; virtual bool same_for_modify_since(epoch_t e) = 0; diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 50aaba1304f27..91807bf9c1977 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -2821,11 +2821,12 @@ void ReplicatedPG::clean_up_local(ObjectStore::Transaction& t) -void ReplicatedPG::scrub() -{ - lock(); - dout(10) << "scrub start" << dendl; +// ========================================================================================== +// SCRUB + +void ReplicatedPG::_scrub() +{ coll_t c = info.pgid.to_coll(); vector ls; osd->store->collection_list(c, ls); @@ -2966,5 +2967,4 @@ void ReplicatedPG::scrub() } dout(10) << "scrub finish" << dendl; - unlock(); } diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 738fe91eda0f6..418c405c7c867 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -159,7 +159,6 @@ protected: friend class C_OSD_ModifyCommit; friend class C_OSD_RepModifyCommit; - // pg on-disk content void clean_up_local(ObjectStore::Transaction& t); @@ -182,6 +181,11 @@ protected: void sub_op_pull(MOSDSubOp *op); + // -- scrub -- + void _scrub(); + + + public: ReplicatedPG(OSD *o, pg_t p) : PG(o,p) @@ -194,8 +198,6 @@ public: void do_sub_op_reply(MOSDSubOpReply *op); bool snap_trimmer(); - void scrub(); - bool same_for_read_since(epoch_t e); bool same_for_modify_since(epoch_t e); bool same_for_rep_modify_since(epoch_t e); diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index ef376803c7dc7..47c93c5181e77 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -122,7 +122,7 @@ public: operator uint64_t() const { return u.pg64; } - pobject_t to_pobject() const { + pobject_t to_log_pobject() const { return pobject_t(OSD_METADATA_PG_POOL, // osd metadata 0, object_t(u.pg64, 0)); @@ -663,4 +663,45 @@ inline ostream& operator<<(ostream& out, const SnapSet& cs) { << (cs.head_exists ? "+head":""); } + +/* + * summarize pg contents for purposes of a scrub + */ +struct ScrubMap { + struct object { + pobject_t poid; + __u64 size; + map attrs; + + void encode(bufferlist& bl) const { + ::encode(poid, bl); + ::encode(size, bl); + ::encode(attrs, bl); + } + void decode(bufferlist::iterator& bl) { + ::decode(poid, bl); + ::decode(size, bl); + ::decode(attrs, bl); + } + }; + WRITE_CLASS_ENCODER(object) + + vector objects; + map attrs; + bufferlist logbl; + + void encode(bufferlist& bl) const { + ::encode(objects, bl); + ::encode(attrs, bl); + ::encode(logbl, bl); + } + void decode(bufferlist::iterator& bl) { + ::decode(objects, bl); + ::decode(attrs, bl); + ::decode(logbl, bl); + } +}; +WRITE_CLASS_ENCODER(ScrubMap::object) +WRITE_CLASS_ENCODER(ScrubMap) + #endif -- 2.39.5