From a8f8019a0cd67869092fd538af0b23f526ffa856 Mon Sep 17 00:00:00 2001 From: sageweil Date: Fri, 9 Feb 2007 18:42:36 +0000 Subject: [PATCH] osd ops tagged with reqid_t, which includes osd client (client|mds) incarnation. types.h files restructured/cleaned up somewhat. git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1091 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/cephmds2/TODO | 78 ++---- branches/sage/cephmds2/client/Client.cc | 3 + branches/sage/cephmds2/include/reqid.h | 64 +++++ branches/sage/cephmds2/include/types.h | 224 +----------------- branches/sage/cephmds2/mds/MDS.cc | 3 + branches/sage/cephmds2/mds/MDS.h | 2 + branches/sage/cephmds2/mds/MDSMap.h | 11 +- branches/sage/cephmds2/mds/mdstypes.h | 29 +++ .../sage/cephmds2/messages/MClientRequest.h | 1 + branches/sage/cephmds2/messages/MOSDBoot.h | 1 + branches/sage/cephmds2/messages/MOSDOp.h | 92 +++---- branches/sage/cephmds2/messages/MOSDOpReply.h | 70 +++--- branches/sage/cephmds2/mon/MDSMonitor.cc | 3 +- branches/sage/cephmds2/msg/FakeMessenger.cc | 11 +- branches/sage/cephmds2/msg/Message.h | 170 +------------ branches/sage/cephmds2/msg/msg_types.h | 185 +++++++++++++++ branches/sage/cephmds2/osd/OSD.cc | 18 +- branches/sage/cephmds2/osd/OSDMap.h | 1 + branches/sage/cephmds2/osd/ObjectStore.h | 1 + branches/sage/cephmds2/osd/PG.h | 38 +-- branches/sage/cephmds2/osd/osd_types.h | 167 +++++++++++++ branches/sage/cephmds2/osdc/Objecter.cc | 8 +- branches/sage/cephmds2/osdc/Objecter.h | 7 +- 23 files changed, 610 insertions(+), 577 deletions(-) create mode 100644 branches/sage/cephmds2/include/reqid.h create mode 100644 branches/sage/cephmds2/msg/msg_types.h create mode 100644 branches/sage/cephmds2/osd/osd_types.h diff --git a/branches/sage/cephmds2/TODO b/branches/sage/cephmds2/TODO index 648bc14205c1f..8441672e770ea 100644 --- a/branches/sage/cephmds2/TODO +++ b/branches/sage/cephmds2/TODO @@ -1,8 +1,4 @@ -huh: -- how to keep mds osd op ids unique after a failover? - - (and, how to flush out failed mds) - - exporter recovery if importer fails during EXPORT_EXPORTING stage - importer recovery if exporter fails @@ -12,6 +8,8 @@ huh: /- how to effectively trim cache after resolve but before rejoin / - we need to eliminate unneed non-auth metadata, without hosing potentially useful auth metadata +- osd needs a set_floor_and_read op for safe failover/STOGITH-like semantics. + - falures during recovery stages... rejoin - fix mds initial osdmap weirdness (which will currently screw up on standby -> almost anything) @@ -38,57 +36,35 @@ blech: higher versions (and are thus described by a newer journal entry) mds -- journal+recovery - - EImportMap - - EMetaBlob replay - - import/export - - how to keep other MDS nodes from goofing up the import/export notify stuff - - recovery vs import/export +- mds falure vs clients + - clean up client op redirection - idempotent ops + +- journal+recovery - unlink - open+create - file capabilities i/o - link - rename -- mds failure - - mdsmon map updates, mds states - - active, down, recovering, stopping - - + - should auth_pins really go to the root? - FIXME: auth_pins on importer versus import beneath an authpinned region? + journaler - fix up for large events (e.g. imports) +- use set_floor_and_read for safe takeover from possibly-not-quite-dead otherguy. +- should we pad with zeros to avoid splitting individual entries? + - make it a g_conf flag? + - have to fix reader to skip over zeros (either <4 bytes for size, or zeroed sizes) +- need to truncate at detected (valid) write_pos to clear out any other partial trailing writes + -- paxos for monitor - lnet? - crush - xml import/export? - crush tools -== todo - -1- pipelining writes? -2- intervening reads? - -inode ops - utime -- no concurrency issues - chown/chmod -- should lock - truncate -- should lock - 1-> no. multiple process concurrency on a single inode is not important. - 2-> maybe... intervening stats? probably not important. - -directory ops. parent inode mtime, + dirent xlocks? - mknod - open+create - symlink - unlink - rmdir - rename - 1-> yes. but mtime updates are independent (mtime monotonically increasing), so it's easy. - 2-> yes. - ---> so, make let's make file/hard wrlock exclusive. locks namespace @@ -99,11 +75,6 @@ locks hard/file wr start/stop -- write lock - - -- integrate revisions into ObjectCacher -- clean up oid.rev vs op.rev in osd+osdc - rados+ebofs - purge replicated writes from cache. (with exception of partial tail blocks.) @@ -117,6 +88,9 @@ rados paper todo - snapshots rados snapshots +- integrate revisions into ObjectCacher +- clean up oid.rev vs op.rev in osd+osdc + - attr.crev is rev we were created in. - oid.rev=0 is "live". defined for attr.crev <= rev. - otherwise, defined for attr.crev <= rev < oid.rev (i.e. oid.rev is upper bound, non-inclusive.) @@ -140,24 +114,6 @@ rados snapshots - clean up messenger failure modes. - add connection retry. -mds recovery -- multiple passes? - 1- establish import/export map - ?- - 2- replay inode, dir, dentry updates -- single pass - - each event needs to embed inode for trace up to the import - - second stage will reconcile cached items with other active mds nodes - - cached items will be shared with the primary to repopulate it's non-dirty cache - - query clients for their state too? - - mds must journal list of clients with whom we share state? - - -journaler -- should we pad with zeros to avoid splitting individual entries? - - make it a g_conf flag? - - have to fix reader to skip over zeros (either <4 bytes for size, or zeroed sizes) -- need to truncate at detected (valid) write_pos to clear out any other partial trailing writes monitor diff --git a/branches/sage/cephmds2/client/Client.cc b/branches/sage/cephmds2/client/Client.cc index 5817a99370b42..688ffe6742be3 100644 --- a/branches/sage/cephmds2/client/Client.cc +++ b/branches/sage/cephmds2/client/Client.cc @@ -701,6 +701,9 @@ void Client::handle_mds_map(MMDSMap* m) delete m; + // note our inc # + objecter->set_client_incarnation(0); // fixme + mount_cond.Signal(); // mount might be waiting for this. } diff --git a/branches/sage/cephmds2/include/reqid.h b/branches/sage/cephmds2/include/reqid.h new file mode 100644 index 0000000000000..3c71fbae69ab6 --- /dev/null +++ b/branches/sage/cephmds2/include/reqid.h @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __REQID_H +#define __REQID_H + + +#include "include/types.h" +#include "msg/msg_types.h" + +/* reqid_t - caller name + incarnation# + tid to unique identify this request + * use for metadata and osd ops. + */ +class reqid_t { +public: + entity_name_t name; // who + int inc; // incarnation + tid_t tid; + reqid_t() : inc(0), tid(0) {} + reqid_t(const entity_name_t& a, int i, tid_t t) : name(a), inc(i), tid(t) {} +}; + +inline ostream& operator<<(ostream& out, const reqid_t& r) { + return out << r.name << "." << r.inc << ":" << r.tid; +} + +inline bool operator==(const reqid_t& l, const reqid_t& r) { + return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid); +} +inline bool operator!=(const reqid_t& l, const reqid_t& r) { + return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid); +} +inline bool operator<(const reqid_t& l, const reqid_t& r) { + return (l.name < r.name) || (l.inc < r.inc) || + (l.name == r.name && l.inc == r.inc && l.tid < r.tid); +} +inline bool operator<=(const reqid_t& l, const reqid_t& r) { + return (l.name < r.name) || (l.inc < r.inc) || + (l.name == r.name && l.inc == r.inc && l.tid <= r.tid); +} +inline bool operator>(const reqid_t& l, const reqid_t& r) { return !(l <= r); } +inline bool operator>=(const reqid_t& l, const reqid_t& r) { return !(l < r); } + +namespace __gnu_cxx { + template<> struct hash { + size_t operator()(const reqid_t &r) const { + static blobhash H; + return H((const char*)&r, sizeof(r)); + } + }; +} + + +#endif diff --git a/branches/sage/cephmds2/include/types.h b/branches/sage/cephmds2/include/types.h index d27b0b26b8878..b09ee2d4726be 100644 --- a/branches/sage/cephmds2/include/types.h +++ b/branches/sage/cephmds2/include/types.h @@ -32,10 +32,8 @@ using namespace std; #include using namespace __gnu_cxx; - #include "object.h" - #ifndef MIN # define MIN(a,b) ((a) < (b) ? (a):(b)) #endif @@ -44,34 +42,6 @@ using namespace __gnu_cxx; #endif -// md ops -#define MDS_OP_STATFS 1 - -#define MDS_OP_STAT 100 -#define MDS_OP_LSTAT 101 -#define MDS_OP_UTIME 102 -#define MDS_OP_CHMOD 103 -#define MDS_OP_CHOWN 104 - - -#define MDS_OP_READDIR 200 -#define MDS_OP_MKNOD 201 -#define MDS_OP_LINK 202 -#define MDS_OP_UNLINK 203 -#define MDS_OP_RENAME 204 - -#define MDS_OP_MKDIR 220 -#define MDS_OP_RMDIR 221 -#define MDS_OP_SYMLINK 222 - -#define MDS_OP_OPEN 301 -#define MDS_OP_TRUNCATE 306 -#define MDS_OP_FSYNC 307 -//#define MDS_OP_CLOSE 310 -#define MDS_OP_RELEASE 308 - - - // -- stl crap -- /* @@ -146,6 +116,17 @@ struct ltstr +// ---------------------- +// some basic types + +typedef __uint64_t tid_t; // transaction id +typedef __uint64_t version_t; +typedef __uint32_t epoch_t; // map epoch (32bits -> 13 epochs/second for 10 years) + + + + + /** object layout * how objects are mapped into PGs */ @@ -200,8 +181,6 @@ struct FileLayout { // -- inode -- -//typedef __uint64_t inodeno_t; - struct inodeno_t { __uint64_t val; inodeno_t() : val() {} @@ -225,9 +204,6 @@ namespace __gnu_cxx { }; } -typedef __uint64_t version_t; - - #define INODE_MODE_FILE 0100000 // S_IFREG #define INODE_MODE_SYMLINK 0120000 // S_IFLNK @@ -281,182 +257,6 @@ struct inode_t { -// lame 128-bit value class. -class lame128_t { -public: - __uint64_t hi, lo; - lame128_t(__uint64_t h=0, __uint64_t l=0) : hi(h), lo(l) {} -}; - -inline ostream& operator<<(ostream& out, lame128_t& oid) { - return out << oid.hi << "." << oid.lo; -} - - -// osd types -//typedef __uint32_t ps_t; // placement seed -//typedef __uint32_t pg_t; // placement group -typedef __uint64_t coll_t; // collection id -typedef __uint64_t tid_t; // transaction id - -typedef __uint32_t epoch_t; // map epoch (32bits -> 13 epochs/second for 10 years) - -// pg stuff -typedef __uint16_t ps_t; -typedef __uint8_t pruleset_t; - -// placement group id -struct pg_t { - union { - struct { - int preferred; - ps_t ps; - __uint8_t nrep; - pruleset_t ruleset; - } fields; - __uint64_t val; - } u; - pg_t() { u.val = 0; } - pg_t(const pg_t& o) { u.val = o.u.val; } - pg_t(ps_t s, int p, unsigned char n, pruleset_t r=0) { - u.fields.ps = s; - u.fields.preferred = p; - u.fields.nrep = n; - u.fields.ruleset = r; - } - pg_t(__uint64_t v) { u.val = v; } - /* - pg_t operator=(__uint64_t v) { u.val = v; return *this; } - pg_t operator&=(__uint64_t v) { u.val &= v; return *this; } - pg_t operator+=(pg_t o) { u.val += o.val; return *this; } - pg_t operator-=(pg_t o) { u.val -= o.val; return *this; } - pg_t operator++() { ++u.val; return *this; } - */ - operator __uint64_t() const { return u.val; } -}; - -inline ostream& operator<<(ostream& out, pg_t pg) { - //return out << hex << pg.val << dec; - if (pg.u.fields.ruleset) - out << (int)pg.u.fields.ruleset << '.'; - out << (int)pg.u.fields.nrep << '.'; - if (pg.u.fields.preferred) - out << pg.u.fields.preferred << '.'; - out << hex << pg.u.fields.ps << dec; - return out; -} - -namespace __gnu_cxx { - template<> struct hash< pg_t > - { - size_t operator()( const pg_t& x ) const - { - static hash<__uint64_t> H; - return H(x); - } - }; -} - - - -// compound rados version type -class eversion_t { -public: - epoch_t epoch; - version_t version; - eversion_t(epoch_t e=0, version_t v=0) : epoch(e), version(v) {} -}; - -inline bool operator==(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) && (l.version == r.version); -} -inline bool operator!=(const eversion_t& l, const eversion_t& r) { - return (l.epoch != r.epoch) || (l.version != r.version); -} -inline bool operator<(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch); -} -inline bool operator<=(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch); -} -inline bool operator>(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch); -} -inline bool operator>=(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version >= r.version):(l.epoch >= r.epoch); -} -inline ostream& operator<<(ostream& out, const eversion_t e) { - return out << e.epoch << "'" << e.version; -} - - - -#define PG_NONE 0xffffffffL - - -typedef __uint16_t snapv_t; // snapshot version - - -class OSDSuperblock { -public: - const static __uint64_t MAGIC = 0xeb0f505dULL; - __uint64_t magic; - __uint64_t fsid; // unique fs id (random number) - int whoami; // my role in this fs. - epoch_t current_epoch; // most recent epoch - epoch_t oldest_map, newest_map; // oldest/newest maps we have. - OSDSuperblock(__uint64_t f=0, int w=0) : - magic(MAGIC), fsid(f), whoami(w), - current_epoch(0), oldest_map(0), newest_map(0) {} -}; - -inline ostream& operator<<(ostream& out, OSDSuperblock& sb) -{ - return out << "sb(fsid " << sb.fsid - << " osd" << sb.whoami - << " e" << sb.current_epoch - << " [" << sb.oldest_map << "," << sb.newest_map - << "])"; -} - -class MonSuperblock { -public: - const static __uint64_t MAGIC = 0x00eb0f5000ULL; - __uint64_t magic; - __uint64_t fsid; - int whoami; // mon # - epoch_t current_epoch; - MonSuperblock(__uint64_t f=0, int w=0) : - magic(MAGIC), fsid(f), whoami(w), current_epoch(0) {} -}; - - -// new types - -class ObjectExtent { - public: - object_t oid; // object id - off_t start; // in object - size_t length; // in object - - objectrev_t rev; // which revision? - pg_t pgid; // where to find the object - - map buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!) - - ObjectExtent() : start(0), length(0), rev(0), pgid(0) {} - ObjectExtent(object_t o, off_t s=0, size_t l=0) : oid(o), start(s), length(l), rev(0), pgid(0) { } -}; - -inline ostream& operator<<(ostream& out, ObjectExtent &ex) -{ - return out << "extent(" - << ex.oid << " in " << hex << ex.pgid << dec - << " " << ex.start << "~" << ex.length - << ")"; -} - - // client types typedef int fh_t; // file handle @@ -468,8 +268,6 @@ typedef int fh_t; // file handle - - // -- io helpers -- template diff --git a/branches/sage/cephmds2/mds/MDS.cc b/branches/sage/cephmds2/mds/MDS.cc index 82567f168ee0b..3fa5f1535734b 100644 --- a/branches/sage/cephmds2/mds/MDS.cc +++ b/branches/sage/cephmds2/mds/MDS.cc @@ -446,6 +446,9 @@ void MDS::handle_mds_map(MMDSMap *m) // update messenger. messenger->reset_myname(MSG_ADDR_MDS(whoami)); + // tell objecter my incarnation + objecter->set_client_incarnation(mdsmap->get_inc(whoami)); + reopen_logger(); dout(1) << "handle_mds_map i am now mds" << whoami << endl; diff --git a/branches/sage/cephmds2/mds/MDS.h b/branches/sage/cephmds2/mds/MDS.h index aed6f9b1dd3b2..8b3ff1e4aa430 100644 --- a/branches/sage/cephmds2/mds/MDS.h +++ b/branches/sage/cephmds2/mds/MDS.h @@ -26,6 +26,8 @@ using namespace std; #include using namespace __gnu_cxx; +#include "mdstypes.h" + #include "msg/Dispatcher.h" #include "include/types.h" #include "include/Context.h" diff --git a/branches/sage/cephmds2/mds/MDSMap.h b/branches/sage/cephmds2/mds/MDSMap.h index fb1826900d89f..677589fb33de6 100644 --- a/branches/sage/cephmds2/mds/MDSMap.h +++ b/branches/sage/cephmds2/mds/MDSMap.h @@ -72,7 +72,8 @@ class MDSMap { set mds_created; // which mds ids have initialized journals and id tables. map mds_state; // MDS state map mds_state_seq; - map mds_inst; // up instances + map mds_inst; // up instances + map mds_inc; // incarnation count (monotonically increases) friend class MDSMonitor; @@ -234,6 +235,12 @@ class MDSMap { return -1; } + int get_inc(int m) { + assert(mds_inc.count(m)); + return mds_inc[m]; + } + + void remove_mds(int m) { mds_inst.erase(m); mds_state.erase(m); @@ -251,6 +258,7 @@ class MDSMap { ::_encode(mds_state, blist); ::_encode(mds_state_seq, blist); ::_encode(mds_inst, blist); + ::_encode(mds_inc, blist); } void decode(bufferlist& blist) { @@ -267,6 +275,7 @@ class MDSMap { ::_decode(mds_state, blist, off); ::_decode(mds_state_seq, blist, off); ::_decode(mds_inst, blist, off); + ::_decode(mds_inc, blist, off); } diff --git a/branches/sage/cephmds2/mds/mdstypes.h b/branches/sage/cephmds2/mds/mdstypes.h index 92aa8069e2e2f..1ac4525e76559 100644 --- a/branches/sage/cephmds2/mds/mdstypes.h +++ b/branches/sage/cephmds2/mds/mdstypes.h @@ -14,6 +14,35 @@ using namespace std; #include + +// md ops +#define MDS_OP_STATFS 1 + +#define MDS_OP_STAT 100 +#define MDS_OP_LSTAT 101 +#define MDS_OP_UTIME 102 +#define MDS_OP_CHMOD 103 +#define MDS_OP_CHOWN 104 + + +#define MDS_OP_READDIR 200 +#define MDS_OP_MKNOD 201 +#define MDS_OP_LINK 202 +#define MDS_OP_UNLINK 203 +#define MDS_OP_RENAME 204 + +#define MDS_OP_MKDIR 220 +#define MDS_OP_RMDIR 221 +#define MDS_OP_SYMLINK 222 + +#define MDS_OP_OPEN 301 +#define MDS_OP_TRUNCATE 306 +#define MDS_OP_FSYNC 307 +//#define MDS_OP_CLOSE 310 +#define MDS_OP_RELEASE 308 + + + // ================================================================ /* meta_load_t diff --git a/branches/sage/cephmds2/messages/MClientRequest.h b/branches/sage/cephmds2/messages/MClientRequest.h index dff2af23deb5f..9b9ac4e115cac 100644 --- a/branches/sage/cephmds2/messages/MClientRequest.h +++ b/branches/sage/cephmds2/messages/MClientRequest.h @@ -19,6 +19,7 @@ #include "msg/Message.h" #include "include/filepath.h" +#include "mds/mdstypes.h" #include "mds/MDS.h" /** diff --git a/branches/sage/cephmds2/messages/MOSDBoot.h b/branches/sage/cephmds2/messages/MOSDBoot.h index 17604282b0635..cfff1869fbe51 100644 --- a/branches/sage/cephmds2/messages/MOSDBoot.h +++ b/branches/sage/cephmds2/messages/MOSDBoot.h @@ -17,6 +17,7 @@ #include "msg/Message.h" #include "include/types.h" +#include "osd/osd_types.h" class MOSDBoot : public Message { public: diff --git a/branches/sage/cephmds2/messages/MOSDOp.h b/branches/sage/cephmds2/messages/MOSDOp.h index 146ea23e0da5a..6139df56d833e 100644 --- a/branches/sage/cephmds2/messages/MOSDOp.h +++ b/branches/sage/cephmds2/messages/MOSDOp.h @@ -16,6 +16,7 @@ #define __MOSDOP_H #include "msg/Message.h" +#include "osd/osd_types.h" /* * OSD op @@ -25,9 +26,7 @@ * */ -//#define OSD_OP_MKFS 20 - -// client ops +// osd client ops #define OSD_OP_READ 1 #define OSD_OP_STAT 2 @@ -48,33 +47,6 @@ #define OSD_OP_PUSH 31 -typedef struct { - long pcid; - - // who's asking? - tid_t tid; - entity_inst_t client; - - // for replication - tid_t rep_tid; - - object_t oid; - objectrev_t rev; - pg_t pg; - - epoch_t map_epoch; - - eversion_t pg_trim_to; // primary->replica: trim to here - - int op; - size_t length, offset; - eversion_t version; - eversion_t old_version; - - bool want_ack; - bool want_commit; -} MOSDOp_st; - class MOSDOp : public Message { public: static const char* get_opname(int op) { @@ -102,14 +74,43 @@ public: } private: - MOSDOp_st st; + struct { + long pcid; + + // who's asking? + entity_inst_t client; + reqid_t reqid; // minor weirdness: entity_name_t is in reqid_t too. + + // for replication + tid_t rep_tid; + + object_t oid; + objectrev_t rev; + pg_t pg; + + epoch_t map_epoch; + + eversion_t pg_trim_to; // primary->replica: trim to here + + int op; + size_t length, offset; + eversion_t version; + eversion_t old_version; + + bool want_ack; + bool want_commit; + } st; + bufferlist data; map attrset; friend class MOSDOpReply; public: - const tid_t get_tid() { return st.tid; } + const reqid_t& get_reqid() { return st.reqid; } + const tid_t get_client_tid() { return st.reqid.tid; } + int get_client_inc() { return st.reqid.inc; } + const entity_name_t& get_client() { return st.client.name; } const entity_inst_t& get_client_inst() { return st.client; } void set_client_inst(const entity_inst_t& i) { st.client = i; } @@ -117,8 +118,8 @@ private: const tid_t get_rep_tid() { return st.rep_tid; } void set_rep_tid(tid_t t) { st.rep_tid = t; } - const object_t get_oid() { return st.oid; } - const pg_t get_pg() { return st.pg; } + const object_t get_oid() { return st.oid; } + const pg_t get_pg() { return st.pg; } const epoch_t get_map_epoch() { return st.map_epoch; } //const int get_pg_role() { return st.pg_role; } // who am i asking for? @@ -157,19 +158,22 @@ private: void set_pcid(long pcid) { this->st.pcid = pcid; } long get_pcid() { return st.pcid; } - MOSDOp(long tid, entity_inst_t asker, + MOSDOp(entity_inst_t asker, int inc, long tid, object_t oid, pg_t pg, epoch_t mapepoch, int op) : Message(MSG_OSD_OP) { memset(&st, 0, sizeof(st)); this->st.client = asker; - this->st.tid = tid; - this->st.rep_tid = 0; + this->st.reqid.name = asker.name; + this->st.reqid.inc = inc; + this->st.reqid.tid = tid; this->st.oid = oid; this->st.pg = pg; this->st.map_epoch = mapepoch; this->st.op = op; + this->st.rep_tid = 0; + this->st.want_ack = true; this->st.want_commit = true; } @@ -201,13 +205,15 @@ private: } virtual char *get_type_name() { return "oop"; } + + void print(ostream& out) { + out << "osd_op(" << st.reqid + << " " << get_opname(st.op) + << " " << st.oid + //<< " " << this + << ")"; + } }; -inline ostream& operator<<(ostream& out, MOSDOp& op) -{ - return out << "MOSDOp(" << op.get_client() << "." << op.get_tid() - << " op " << MOSDOp::get_opname(op.get_op()) - << " oid " << hex << op.get_oid() << dec << " " << &op << ")"; -} #endif diff --git a/branches/sage/cephmds2/messages/MOSDOpReply.h b/branches/sage/cephmds2/messages/MOSDOpReply.h index 35c6ad5898b0b..05106e096d176 100644 --- a/branches/sage/cephmds2/messages/MOSDOpReply.h +++ b/branches/sage/cephmds2/messages/MOSDOpReply.h @@ -28,38 +28,36 @@ * */ - -typedef struct { - // req - long pcid; - tid_t tid; - tid_t rep_tid; - - object_t oid; - pg_t pg; - - int op; - - // reply - int result; - bool commit; - size_t length, offset; - size_t object_size; - eversion_t version; - - eversion_t pg_complete_thru; - - epoch_t map_epoch; -} MOSDOpReply_st; - - class MOSDOpReply : public Message { - MOSDOpReply_st st; + struct { + // req + reqid_t reqid; + + tid_t rep_tid; + + object_t oid; + pg_t pg; + + int op; + + // reply + int result; + bool commit; + size_t length, offset; + size_t object_size; + eversion_t version; + + eversion_t pg_complete_thru; + + epoch_t map_epoch; + } st; + bufferlist data; map attrset; public: - long get_tid() { return st.tid; } + const reqid_t& get_reqid() { return st.reqid; } + long get_tid() { return st.reqid.tid; } long get_rep_tid() { return st.rep_tid; } object_t get_oid() { return st.oid; } pg_t get_pg() { return st.pg; } @@ -84,7 +82,6 @@ class MOSDOpReply : public Message { void set_attrset(map &as) { attrset = as; } void set_op(int op) { st.op = op; } - void set_tid(tid_t t) { st.tid = t; } void set_rep_tid(tid_t t) { st.rep_tid = t; } // data payload @@ -98,18 +95,13 @@ class MOSDOpReply : public Message { // osdmap epoch_t get_map_epoch() { return st.map_epoch; } - // keep a pcid (procedure call id) to match up request+reply - void set_pcid(long pcid) { this->st.pcid = pcid; } - long get_pcid() { return st.pcid; } public: MOSDOpReply(MOSDOp *req, int result, epoch_t e, bool commit) : Message(MSG_OSD_OPREPLY) { memset(&st, 0, sizeof(st)); - this->st.pcid = req->st.pcid; - + this->st.reqid = req->st.reqid; this->st.op = req->st.op; - this->st.tid = req->st.tid; this->st.rep_tid = req->st.rep_tid; this->st.oid = req->st.oid; @@ -141,6 +133,16 @@ public: } virtual char *get_type_name() { return "oopr"; } + + void print(ostream& out) { + out << "osd_op_reply(" << st.reqid + << " " << MOSDOp::get_opname(st.op) + << " " << st.oid << " = " << st.result + //<< " " << this + << ")"; + } + }; + #endif diff --git a/branches/sage/cephmds2/mon/MDSMonitor.cc b/branches/sage/cephmds2/mon/MDSMonitor.cc index 4f4b7e08c9ec9..3ea7ad235ada0 100644 --- a/branches/sage/cephmds2/mon/MDSMonitor.cc +++ b/branches/sage/cephmds2/mon/MDSMonitor.cc @@ -64,7 +64,7 @@ void MDSMonitor::print_map() for (set::iterator p = all.begin(); p != all.end(); ++p) { - dout(7) << " mds" << *p + dout(7) << " mds" << *p << "." << mdsmap.mds_inc[*p] << " : " << MDSMap::get_state_name(mdsmap.get_state(*p)) << " : " << (mdsmap.have_inst(*p) ? mdsmap.get_inst(*p) : blank) << endl; @@ -140,6 +140,7 @@ void MDSMonitor::handle_mds_beacon(MMDSBeacon *m) if (booted) { mdsmap.mds_inst[from].addr = m->get_source_addr(); mdsmap.mds_inst[from].name = MSG_ADDR_MDS(from); + mdsmap.mds_inc[from]++; // starting -> creating|starting|replay if (mdsmap.is_degraded() && diff --git a/branches/sage/cephmds2/msg/FakeMessenger.cc b/branches/sage/cephmds2/msg/FakeMessenger.cc index a9deb24d1036e..26a9eff1d1171 100644 --- a/branches/sage/cephmds2/msg/FakeMessenger.cc +++ b/branches/sage/cephmds2/msg/FakeMessenger.cc @@ -155,10 +155,9 @@ int fakemessenger_do_loop_2() if (m) { //dout(18) << "got " << m << endl; - dout(1) << "---- '" << m->get_type_name() - << "' from " << m->get_source() // << ':' << m->get_source_port() - << " to " << m->get_dest() //<< ':' << m->get_dest_port() - << " ---- " << m + dout(1) << "---- " << m->get_dest() + << " <- " << m->get_source() + << " ---- " << *m << endl; if (g_conf.fakemessenger_serialize) { @@ -328,9 +327,7 @@ int FakeMessenger::send_message(Message *m, entity_inst_t inst, int port, int fr } dm->queue_incoming(m); - dout(1) << "--> " << get_myname() << " sending " << m << " '" << m->get_type_name() << "'" - << " to " << inst - << endl;//" m " << dm << " has " << dm->num_incoming() << " queued" << endl; + dout(1) << "--> " << get_myname() << " -> " << inst.name << " " << *m << endl; } catch (...) { diff --git a/branches/sage/cephmds2/msg/Message.h b/branches/sage/cephmds2/msg/Message.h index e83663a04475a..adae304b58870 100644 --- a/branches/sage/cephmds2/msg/Message.h +++ b/branches/sage/cephmds2/msg/Message.h @@ -11,8 +11,6 @@ * */ - - #ifndef __MESSAGE_H #define __MESSAGE_H @@ -181,173 +179,7 @@ using __gnu_cxx::crope; #include "include/types.h" #include "include/buffer.h" - -#include "tcp.h" - - - - - -// new typed msg_addr_t way! -class entity_name_t { -public: - static const int TYPE_MON = 1; - static const int TYPE_MDS = 2; - static const int TYPE_OSD = 3; - static const int TYPE_CLIENT = 4; - - static const int NEW = -1; - - int _type; - int _num; - - // cons - entity_name_t() : _type(0), _num(0) {} - entity_name_t(int t, int n) : _type(t), _num(n) {} - - int num() const { return _num; } - int type() const { return _type; } - const char *type_str() const { - switch (type()) { - case TYPE_MDS: return "mds"; - case TYPE_OSD: return "osd"; - case TYPE_MON: return "mon"; - case TYPE_CLIENT: return "client"; - } - return "unknown"; - } - - bool is_new() const { return num() == NEW; } - - bool is_client() const { return type() == TYPE_CLIENT; } - bool is_mds() const { return type() == TYPE_MDS; } - bool is_osd() const { return type() == TYPE_OSD; } - bool is_mon() const { return type() == TYPE_MON; } -}; - -inline bool operator== (const entity_name_t& l, const entity_name_t& r) { return (l._type == r._type) && (l._num == r._num); } -inline bool operator!= (const entity_name_t& l, const entity_name_t& r) { return (l._type != r._type) || (l._num != r._num); } -inline bool operator< (const entity_name_t& l, const entity_name_t& r) { return (l._type < r._type) || (l._type == r._type && l._num < r._num); } - -inline std::ostream& operator<<(std::ostream& out, const entity_name_t& addr) { - //if (addr.is_namer()) return out << "namer"; - if (addr.is_new() || addr.num() < 0) - return out << addr.type_str() << "?"; - else - return out << addr.type_str() << addr.num(); -} - -namespace __gnu_cxx { - template<> struct hash< entity_name_t > - { - size_t operator()( const entity_name_t m ) const - { - static blobhash H; - return H((const char*)&m, sizeof(m)); - } - }; -} - -// get rid of these -#define MSG_ADDR_MDS(x) entity_name_t(entity_name_t::TYPE_MDS,x) -#define MSG_ADDR_OSD(x) entity_name_t(entity_name_t::TYPE_OSD,x) -#define MSG_ADDR_MON(x) entity_name_t(entity_name_t::TYPE_MON,x) -#define MSG_ADDR_CLIENT(x) entity_name_t(entity_name_t::TYPE_CLIENT,x) - -#define MSG_ADDR_RANK_NEW MSG_ADDR_RANK(entity_name_t::NEW) -#define MSG_ADDR_MDS_NEW MSG_ADDR_MDS(entity_name_t::NEW) -#define MSG_ADDR_OSD_NEW MSG_ADDR_OSD(entity_name_t::NEW) -#define MSG_ADDR_CLIENT_NEW MSG_ADDR_CLIENT(entity_name_t::NEW) - - -/* - * an entity's network address. - * includes a random value that prevents it from being reused. - * thus identifies a particular process instance. - * ipv4 for now. - */ -struct entity_addr_t { - __uint8_t ipq[4]; - __uint32_t port; - __uint32_t nonce; // bind time, or pid, or something unique! - - entity_addr_t() : port(0), nonce(0) { - ipq[0] = ipq[1] = ipq[2] = ipq[3] = 0; - } - - void set_addr(tcpaddr_t a) { - memcpy((char*)ipq, (char*)&a.sin_addr.s_addr, 4); - port = a.sin_port; - } - void make_addr(tcpaddr_t& a) const { - a.sin_family = AF_INET; - memcpy((char*)&a.sin_addr.s_addr, (char*)ipq, 4); - a.sin_port = port; - } -}; - -inline ostream& operator<<(ostream& out, const entity_addr_t &addr) -{ - return out << (int)addr.ipq[0] - << '.' << (int)addr.ipq[1] - << '.' << (int)addr.ipq[2] - << '.' << (int)addr.ipq[3] - << ':' << addr.port - << '.' << addr.nonce; -} - -inline bool operator==(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) == 0; } -inline bool operator!=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) != 0; } -inline bool operator<(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) < 0; } -inline bool operator<=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) <= 0; } -inline bool operator>(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) > 0; } -inline bool operator>=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) >= 0; } - -namespace __gnu_cxx { - template<> struct hash< entity_addr_t > - { - size_t operator()( const entity_addr_t& x ) const - { - static blobhash H; - return H((const char*)&x, sizeof(x)); - } - }; -} - - -/* - * a particular entity instance - */ -struct entity_inst_t { - entity_name_t name; - entity_addr_t addr; - entity_inst_t() {} - entity_inst_t(entity_name_t n, const entity_addr_t& a) : name(n), addr(a) {} -}; - - -inline bool operator==(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) == 0; } -inline bool operator!=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) != 0; } -inline bool operator<(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) < 0; } -inline bool operator<=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) <= 0; } -inline bool operator>(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) > 0; } -inline bool operator>=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) >= 0; } - -namespace __gnu_cxx { - template<> struct hash< entity_inst_t > - { - size_t operator()( const entity_inst_t& x ) const - { - static blobhash H; - return H((const char*)&x, sizeof(x)); - } - }; -} - -inline ostream& operator<<(ostream& out, const entity_inst_t &i) -{ - return out << i.name << " " << i.addr; -} +#include "msg_types.h" diff --git a/branches/sage/cephmds2/msg/msg_types.h b/branches/sage/cephmds2/msg/msg_types.h new file mode 100644 index 0000000000000..aca159a9b9f59 --- /dev/null +++ b/branches/sage/cephmds2/msg/msg_types.h @@ -0,0 +1,185 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __MSG_TYPES_H +#define __MSG_TYPES_H + +#include "include/types.h" +#include "tcp.h" + +// new typed msg_addr_t way! +class entity_name_t { + int _type:3; + int _num:29; + +public: + static const int TYPE_MON = 1; + static const int TYPE_MDS = 2; + static const int TYPE_OSD = 3; + static const int TYPE_CLIENT = 4; + + static const int NEW = -1; + + // cons + entity_name_t() : _type(0), _num(0) {} + entity_name_t(int t, int n) : _type(t), _num(n) {} + + int num() const { return _num; } + int type() const { return _type; } + const char *type_str() const { + switch (type()) { + case TYPE_MDS: return "mds"; + case TYPE_OSD: return "osd"; + case TYPE_MON: return "mon"; + case TYPE_CLIENT: return "client"; + } + return "unknown"; + } + + bool is_new() const { return num() == NEW; } + + bool is_client() const { return type() == TYPE_CLIENT; } + bool is_mds() const { return type() == TYPE_MDS; } + bool is_osd() const { return type() == TYPE_OSD; } + bool is_mon() const { return type() == TYPE_MON; } +}; + +inline bool operator== (const entity_name_t& l, const entity_name_t& r) { + return (l.type() == r.type()) && (l.num() == r.num()); } +inline bool operator!= (const entity_name_t& l, const entity_name_t& r) { + return (l.type() != r.type()) || (l.num() != r.num()); } +inline bool operator< (const entity_name_t& l, const entity_name_t& r) { + return (l.type() < r.type()) || (l.type() == r.type() && l.num() < r.num()); } + +inline std::ostream& operator<<(std::ostream& out, const entity_name_t& addr) { + //if (addr.is_namer()) return out << "namer"; + if (addr.is_new() || addr.num() < 0) + return out << addr.type_str() << "?"; + else + return out << addr.type_str() << addr.num(); +} + +namespace __gnu_cxx { + template<> struct hash< entity_name_t > + { + size_t operator()( const entity_name_t m ) const + { + static blobhash H; + return H((const char*)&m, sizeof(m)); + } + }; +} + +// get rid of these +#define MSG_ADDR_MDS(x) entity_name_t(entity_name_t::TYPE_MDS,x) +#define MSG_ADDR_OSD(x) entity_name_t(entity_name_t::TYPE_OSD,x) +#define MSG_ADDR_MON(x) entity_name_t(entity_name_t::TYPE_MON,x) +#define MSG_ADDR_CLIENT(x) entity_name_t(entity_name_t::TYPE_CLIENT,x) + +#define MSG_ADDR_RANK_NEW MSG_ADDR_RANK(entity_name_t::NEW) +#define MSG_ADDR_MDS_NEW MSG_ADDR_MDS(entity_name_t::NEW) +#define MSG_ADDR_OSD_NEW MSG_ADDR_OSD(entity_name_t::NEW) +#define MSG_ADDR_CLIENT_NEW MSG_ADDR_CLIENT(entity_name_t::NEW) + + +/* + * an entity's network address. + * includes a random value that prevents it from being reused. + * thus identifies a particular process instance. + * ipv4 for now. + */ +struct entity_addr_t { + __uint8_t ipq[4]; + __uint32_t port; + __uint32_t nonce; // bind time, or pid, or something unique! + + entity_addr_t() : port(0), nonce(0) { + ipq[0] = ipq[1] = ipq[2] = ipq[3] = 0; + } + + void set_addr(tcpaddr_t a) { + memcpy((char*)ipq, (char*)&a.sin_addr.s_addr, 4); + port = a.sin_port; + } + void make_addr(tcpaddr_t& a) const { + a.sin_family = AF_INET; + memcpy((char*)&a.sin_addr.s_addr, (char*)ipq, 4); + a.sin_port = port; + } +}; + +inline ostream& operator<<(ostream& out, const entity_addr_t &addr) +{ + return out << (int)addr.ipq[0] + << '.' << (int)addr.ipq[1] + << '.' << (int)addr.ipq[2] + << '.' << (int)addr.ipq[3] + << ':' << addr.port + << '.' << addr.nonce; +} + +inline bool operator==(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) == 0; } +inline bool operator!=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) != 0; } +inline bool operator<(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) < 0; } +inline bool operator<=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) <= 0; } +inline bool operator>(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) > 0; } +inline bool operator>=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) >= 0; } + +namespace __gnu_cxx { + template<> struct hash< entity_addr_t > + { + size_t operator()( const entity_addr_t& x ) const + { + static blobhash H; + return H((const char*)&x, sizeof(x)); + } + }; +} + + +/* + * a particular entity instance + */ +struct entity_inst_t { + entity_name_t name; + entity_addr_t addr; + entity_inst_t() {} + entity_inst_t(entity_name_t n, const entity_addr_t& a) : name(n), addr(a) {} +}; + + +inline bool operator==(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) == 0; } +inline bool operator!=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) != 0; } +inline bool operator<(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) < 0; } +inline bool operator<=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) <= 0; } +inline bool operator>(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) > 0; } +inline bool operator>=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) >= 0; } + +namespace __gnu_cxx { + template<> struct hash< entity_inst_t > + { + size_t operator()( const entity_inst_t& x ) const + { + static blobhash H; + return H((const char*)&x, sizeof(x)); + } + }; +} + +inline ostream& operator<<(ostream& out, const entity_inst_t &i) +{ + return out << i.name << " " << i.addr; +} + + +#endif diff --git a/branches/sage/cephmds2/osd/OSD.cc b/branches/sage/cephmds2/osd/OSD.cc index 0e58990fdb470..e2b1c1e4fdbd0 100644 --- a/branches/sage/cephmds2/osd/OSD.cc +++ b/branches/sage/cephmds2/osd/OSD.cc @@ -1912,7 +1912,7 @@ void OSD::pull(PG *pg, object_t oid) // send op tid_t tid = ++last_tid; - MOSDOp *op = new MOSDOp(tid, messenger->get_myinst(), + MOSDOp *op = new MOSDOp(messenger->get_myinst(), 0, tid, oid, pg->get_pgid(), osdmap->get_epoch(), OSD_OP_PULL); @@ -1954,7 +1954,7 @@ void OSD::push(PG *pg, object_t oid, int dest) logger->inc("r_pushb", bl.length()); // send - MOSDOp *op = new MOSDOp(++last_tid, messenger->get_myinst(), + MOSDOp *op = new MOSDOp(messenger->get_myinst(), 0, ++last_tid, oid, pg->info.pgid, osdmap->get_epoch(), OSD_OP_PUSH); op->set_offset(0); @@ -3004,8 +3004,7 @@ void OSD::issue_repop(PG *pg, MOSDOp *op, int osd) << endl; // forward the write/update/whatever - MOSDOp *wr = new MOSDOp(op->get_tid(), - op->get_client_inst(), + MOSDOp *wr = new MOSDOp(op->get_client_inst(), op->get_client_inc(), op->get_reqid().tid, oid, pg->get_pgid(), osdmap->get_epoch(), @@ -3159,9 +3158,8 @@ void OSD::op_modify(MOSDOp *op, PG *pg) } // dup op? - reqid_t reqid(op->get_client(), op->get_tid()); - if (pg->log.logged_req(reqid)) { - dout(-3) << "op_modify " << opname << " dup op " << reqid + if (pg->log.logged_req(op->get_reqid())) { + dout(-3) << "op_modify " << opname << " dup op " << op->get_reqid() << ", doing WRNOOP" << endl; op->set_op(OSD_OP_WRNOOP); opname = MOSDOp::get_opname(op->get_op()); @@ -3312,8 +3310,7 @@ void OSD::prepare_log_transaction(ObjectStore::Transaction& t, if (crev && rev && rev > crev) { eversion_t cv = version; cv.version--; - PG::Log::Entry cloneentry(PG::Log::Entry::CLONE, oid, cv, - op->get_client(), op->get_tid()); + PG::Log::Entry cloneentry(PG::Log::Entry::CLONE, oid, cv, op->get_reqid()); pg->log.add(cloneentry); dout(10) << "prepare_log_transaction " << op->get_op() @@ -3324,8 +3321,7 @@ void OSD::prepare_log_transaction(ObjectStore::Transaction& t, // actual op int opcode = PG::Log::Entry::MODIFY; if (op->get_op() == OSD_OP_DELETE) opcode = PG::Log::Entry::DELETE; - PG::Log::Entry logentry(opcode, oid, version, - op->get_client(), op->get_tid()); + PG::Log::Entry logentry(opcode, oid, version, op->get_reqid()); dout(10) << "prepare_log_transaction " << op->get_op() << " " << logentry diff --git a/branches/sage/cephmds2/osd/OSDMap.h b/branches/sage/cephmds2/osd/OSDMap.h index 2c00eea9cdbdc..48e080eeeccc2 100644 --- a/branches/sage/cephmds2/osd/OSDMap.h +++ b/branches/sage/cephmds2/osd/OSDMap.h @@ -22,6 +22,7 @@ */ #include "config.h" #include "include/types.h" +#include "osd_types.h" #include "msg/Message.h" #include "common/Mutex.h" #include "common/Clock.h" diff --git a/branches/sage/cephmds2/osd/ObjectStore.h b/branches/sage/cephmds2/osd/ObjectStore.h index 21fbd867974ed..e7b992d081b0b 100644 --- a/branches/sage/cephmds2/osd/ObjectStore.h +++ b/branches/sage/cephmds2/osd/ObjectStore.h @@ -16,6 +16,7 @@ #define __OBJECTSTORE_H #include "include/types.h" +#include "osd_types.h" #include "include/Context.h" #include "include/buffer.h" diff --git a/branches/sage/cephmds2/osd/PG.h b/branches/sage/cephmds2/osd/PG.h index 3d87eec2216e2..6d6de985eaf8e 100644 --- a/branches/sage/cephmds2/osd/PG.h +++ b/branches/sage/cephmds2/osd/PG.h @@ -34,35 +34,7 @@ using namespace __gnu_cxx; class OSD; -/* reqid_t - caller + tid to unique identify this request - */ -class reqid_t { -public: - entity_name_t addr; - tid_t tid; - reqid_t() : tid(0) {} - reqid_t(const entity_name_t& a, tid_t t) : addr(a), tid(t) {} -}; - -inline ostream& operator<<(ostream& out, const reqid_t& r) { - return out << r.addr << "." << r.tid; -} -inline bool operator==(const reqid_t& l, const reqid_t& r) { - return (l.addr == r.addr) && (l.tid == r.tid); -} -inline bool operator!=(const reqid_t& l, const reqid_t& r) { - return (l.addr != r.addr) || (l.tid != r.tid); -} -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const reqid_t &r) const { - static hash H; - static hash<__uint64_t> I; - return H(r.addr.type() ^ r.addr.num()) ^ I(r.tid); - } - }; -} /** PG - Replica Placement Group * @@ -241,12 +213,12 @@ public: eversion_t version; objectrev_t rev; - reqid_t reqid; // caller+tid to uniquely identify request + reqid_t reqid; // caller+tid to uniquely identify request Entry() : op(0) {} Entry(int _op, object_t _oid, const eversion_t& v, - const entity_name_t& a, tid_t t) : - op(_op), oid(_oid), version(v), reqid(a,t) {} + const reqid_t& rid) : + op(_op), oid(_oid), version(v), reqid(rid) {} bool is_delete() const { return op == DELETE; } bool is_clone() const { return op == CLONE; } @@ -298,7 +270,7 @@ public: class IndexedLog : public Log { public: hash_map objects; // ptrs into log. be careful! - hash_set caller_ops; + hash_set caller_ops; // recovery pointers list::iterator requested_to; // not inclusive of referenced item @@ -316,7 +288,7 @@ public: bool logged_object(object_t oid) { return objects.count(oid); } - bool logged_req(reqid_t &r) { + bool logged_req(const reqid_t &r) { return caller_ops.count(r); } diff --git a/branches/sage/cephmds2/osd/osd_types.h b/branches/sage/cephmds2/osd/osd_types.h new file mode 100644 index 0000000000000..e86c074fa1b15 --- /dev/null +++ b/branches/sage/cephmds2/osd/osd_types.h @@ -0,0 +1,167 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __OSD_TYPES_H +#define __OSD_TYPES_H + +#include "include/reqid.h" + +// osd types +typedef __uint64_t coll_t; // collection id + + +// pg stuff +typedef __uint16_t ps_t; +typedef __uint8_t pruleset_t; + +// placement group id +struct pg_t { + union { + struct { + int preferred; + ps_t ps; + __uint8_t nrep; + pruleset_t ruleset; + } fields; + __uint64_t val; + } u; + pg_t() { u.val = 0; } + pg_t(const pg_t& o) { u.val = o.u.val; } + pg_t(ps_t s, int p, unsigned char n, pruleset_t r=0) { + u.fields.ps = s; + u.fields.preferred = p; + u.fields.nrep = n; + u.fields.ruleset = r; + } + pg_t(__uint64_t v) { u.val = v; } + /* + pg_t operator=(__uint64_t v) { u.val = v; return *this; } + pg_t operator&=(__uint64_t v) { u.val &= v; return *this; } + pg_t operator+=(pg_t o) { u.val += o.val; return *this; } + pg_t operator-=(pg_t o) { u.val -= o.val; return *this; } + pg_t operator++() { ++u.val; return *this; } + */ + operator __uint64_t() const { return u.val; } +}; + +inline ostream& operator<<(ostream& out, pg_t pg) { + //return out << hex << pg.val << dec; + if (pg.u.fields.ruleset) + out << (int)pg.u.fields.ruleset << '.'; + out << (int)pg.u.fields.nrep << '.'; + if (pg.u.fields.preferred) + out << pg.u.fields.preferred << '.'; + out << hex << pg.u.fields.ps << dec; + return out; +} + +namespace __gnu_cxx { + template<> struct hash< pg_t > + { + size_t operator()( const pg_t& x ) const + { + static hash<__uint64_t> H; + return H(x); + } + }; +} + + + +// compound rados version type +class eversion_t { +public: + epoch_t epoch; + version_t version; + eversion_t(epoch_t e=0, version_t v=0) : epoch(e), version(v) {} +}; + +inline bool operator==(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) && (l.version == r.version); +} +inline bool operator!=(const eversion_t& l, const eversion_t& r) { + return (l.epoch != r.epoch) || (l.version != r.version); +} +inline bool operator<(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch); +} +inline bool operator<=(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch); +} +inline bool operator>(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch); +} +inline bool operator>=(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) ? (l.version >= r.version):(l.epoch >= r.epoch); +} +inline ostream& operator<<(ostream& out, const eversion_t e) { + return out << e.epoch << "'" << e.version; +} + + + + + +// ----------------------------------------- + +class ObjectExtent { + public: + object_t oid; // object id + off_t start; // in object + size_t length; // in object + + objectrev_t rev; // which revision? + pg_t pgid; // where to find the object + + map buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!) + + ObjectExtent() : start(0), length(0), rev(0), pgid(0) {} + ObjectExtent(object_t o, off_t s=0, size_t l=0) : oid(o), start(s), length(l), rev(0), pgid(0) { } +}; + +inline ostream& operator<<(ostream& out, ObjectExtent &ex) +{ + return out << "extent(" + << ex.oid << " in " << hex << ex.pgid << dec + << " " << ex.start << "~" << ex.length + << ")"; +} + + + +// --------------------------------------- + +class OSDSuperblock { +public: + const static __uint64_t MAGIC = 0xeb0f505dULL; + __uint64_t magic; + __uint64_t fsid; // unique fs id (random number) + int whoami; // my role in this fs. + epoch_t current_epoch; // most recent epoch + epoch_t oldest_map, newest_map; // oldest/newest maps we have. + OSDSuperblock(__uint64_t f=0, int w=0) : + magic(MAGIC), fsid(f), whoami(w), + current_epoch(0), oldest_map(0), newest_map(0) {} +}; + +inline ostream& operator<<(ostream& out, OSDSuperblock& sb) +{ + return out << "sb(fsid " << sb.fsid + << " osd" << sb.whoami + << " e" << sb.current_epoch + << " [" << sb.oldest_map << "," << sb.newest_map + << "])"; +} + + +#endif diff --git a/branches/sage/cephmds2/osdc/Objecter.cc b/branches/sage/cephmds2/osdc/Objecter.cc index 08fe981da703d..c531a840803b1 100644 --- a/branches/sage/cephmds2/osdc/Objecter.cc +++ b/branches/sage/cephmds2/osdc/Objecter.cc @@ -271,7 +271,8 @@ tid_t Objecter::stat_submit(OSDStat *st) // send last_tid++; - MOSDOp *m = new MOSDOp(last_tid, messenger->get_myinst(), + assert(client_inc >= 0); + MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, last_tid, ex.oid, ex.pgid, osdmap->get_epoch(), OSD_OP_STAT); dout(10) << "stat_submit " << st << " tid " << last_tid @@ -382,7 +383,8 @@ tid_t Objecter::readx_submit(OSDRead *rd, ObjectExtent &ex) // send last_tid++; - MOSDOp *m = new MOSDOp(last_tid, messenger->get_myinst(), + assert(client_inc >= 0); + MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, last_tid, ex.oid, ex.pgid, osdmap->get_epoch(), OSD_OP_READ); m->set_length(ex.length); @@ -646,7 +648,7 @@ tid_t Objecter::modifyx_submit(OSDModify *wr, ObjectExtent &ex, tid_t usetid) else tid = ++last_tid; - MOSDOp *m = new MOSDOp(tid, messenger->get_myinst(), + MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, tid, ex.oid, ex.pgid, osdmap->get_epoch(), wr->op); m->set_length(ex.length); diff --git a/branches/sage/cephmds2/osdc/Objecter.h b/branches/sage/cephmds2/osdc/Objecter.h index 30312dd9307c1..f74081eafc312 100644 --- a/branches/sage/cephmds2/osdc/Objecter.h +++ b/branches/sage/cephmds2/osdc/Objecter.h @@ -27,6 +27,7 @@ class Objecter { private: tid_t last_tid; + int client_inc; int num_unacked; int num_uncommitted; @@ -134,7 +135,7 @@ class Objecter { public: Objecter(Messenger *m, MonMap *mm, OSDMap *om) : messenger(m), monmap(mm), osdmap(om), - last_tid(0), + last_tid(0), client_inc(-1), num_unacked(0), num_uncommitted(0) {} ~Objecter() { @@ -163,6 +164,10 @@ class Objecter { return !(op_read.empty() && op_modify.empty()); } + void set_client_incarnation(int inc) { + client_inc = inc; + } + // med level tid_t readx(OSDRead *read, Context *onfinish); tid_t modifyx(OSDModify *wr, Context *onack, Context *oncommit); -- 2.39.5