From f7948403f9a190689a895df041d8e0ab5a86b2bc Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 24 Nov 2008 09:57:11 -0800 Subject: [PATCH] osd: track last_clean_interval in osdmap; simplify encoding/decoding a bit Break osdmap into "base" and "extended" portions, so that clients can ignore the extended portions completely. Track last_clean_interval in the osdmap so we know when the osd last cleanly shut down. Disk and wire format changes. --- src/include/ceph_fs.h | 7 ++--- src/kernel/osdmap.c | 47 ++++------------------------- src/mon/OSDMonitor.cc | 4 +++ src/osd/OSDMap.cc | 2 +- src/osd/OSDMap.h | 70 ++++++++++++++++++++++++++++++++++--------- 5 files changed, 69 insertions(+), 61 deletions(-) diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index fcc246600bdaa..b1a2809a3e6d9 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -22,7 +22,7 @@ * whenever the wire protocol changes. try to keep this string length * constant. */ -#define CEPH_BANNER "ceph 008\n" +#define CEPH_BANNER "ceph 009\n" #define CEPH_BANNER_MAX_LEN 30 /* @@ -34,8 +34,8 @@ #define CEPH_MON_PROTOCOL 2 #define CEPH_CLIENT_PROTOCOL 1 -#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v1" -#define CEPH_MON_ONDISK_MAGIC "ceph monitor volume v1" +#define CEPH_OSD_ONDISK_MAGIC "ceph osd volume v2" +#define CEPH_MON_ONDISK_MAGIC "ceph monitor volume v2" /* * types in this file are defined as little-endian, and are @@ -328,7 +328,6 @@ struct ceph_eversion { /* status bits */ #define CEPH_OSD_EXISTS 1 #define CEPH_OSD_UP 2 -#define CEPH_OSD_CLEAN 4 /* as in, clean shutdown */ /* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */ #define CEPH_OSD_IN 0x10000 diff --git a/src/kernel/osdmap.c b/src/kernel/osdmap.c index eae740633b55a..5e3b84591069d 100644 --- a/src/kernel/osdmap.c +++ b/src/kernel/osdmap.c @@ -389,21 +389,6 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) *p += 4; /* skip length field (should match max) */ ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr)); - *p += sizeof(u32) + map->max_osd * sizeof(u32); /* osd_up_from */ - *p += sizeof(u32) + map->max_osd * sizeof(u32); /* osd_up_thru */ - - /* ignore pg primary swapping */ - ceph_decode_32_safe(p, end, len, bad); - p += len * (sizeof(u64) + sizeof(u32)); - if (len) - derr(0, "WARNING: pg primary swaps in osdmap e%d unsupported\n", - map->epoch); - - /* ignore max_snap, removed_snaps */ - *p += sizeof(u64); - ceph_decode_32_safe(p, end, len, bad); - *p += len * 2 * sizeof(u64); - /* crush */ ceph_decode_32_safe(p, end, len, bad); dout(30, "osdmap_decode crush len %d from off 0x%x\n", len, @@ -417,9 +402,10 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) goto bad; } + /* ignore the rest of the map */ + *p = end; + dout(30, "osdmap_decode done %p %p\n", *p, end); - if (*p != end) - goto bad; return map; bad: @@ -550,31 +536,8 @@ struct ceph_osdmap *apply_incremental(void **p, void *end, map->osd_weight[osd] = off; } - /* skip new_up_thru */ - ceph_decode_32_safe(p, end, len, bad); - *p += len * 2 * sizeof(u32); - - /* skip old/new pg_swap stuff */ - ceph_decode_32_safe(p, end, len, bad); - *p += len * (sizeof(u64) + sizeof(u32)); - if (len) - derr(0, "WARNING: pg primary swaps in osdmap e%d unsupported\n", - epoch); - ceph_decode_32_safe(p, end, len, bad); - *p += len * sizeof(u64); - if (len) - derr(0, "WARNING: pg primary swaps in osdmap e%d unsupported\n", - epoch); - - /* skip new_max_snap, removed_snaps */ - *p += sizeof(u64); - ceph_decode_32_safe(p, end, len, bad); - *p += len * 2 * sizeof(u64); - - if (*p != end) { - derr(10, "osdmap incremental has trailing gunk?\n"); - goto bad; - } + /* ignore the rest */ + *p = end; return map; bad: diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index e3b903df3a09b..0c8b8dce45a99 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -530,6 +530,10 @@ bool OSDMonitor::prepare_boot(MOSDBoot *m) if (m->sb.weight) osd_weight[from] = m->sb.weight; + // note last clean unmount epoch + pending_inc.new_last_clean_interval[from] = + pair(m->sb.epoch_mounted, m->sb.epoch_unmounted); + // wait paxos->wait_for_commit(new C_Booted(this, m)); } diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 93b6b482cb5df..00bd31346d4bd 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -36,7 +36,7 @@ void OSDMap::build_simple(epoch_t e, ceph_fsid &fsid, build_simple_crush_map(crush, num_osd, num_dom); for (int i=0; i new_down; map new_weight; map new_up_thru; + map > new_last_clean_interval; map new_pg_swap_primary; list old_pg_swap_primary; @@ -99,12 +100,14 @@ public: interval_set removed_snaps; void encode(bufferlist& bl) { + // base ::encode(fsid, bl); ::encode(epoch, bl); ::encode(ctime, bl); ::encode(new_flags, bl); ::encode(fullmap, bl); ::encode(crush, bl); + ::encode(new_max_osd, bl); ::encode(new_pg_num, bl); ::encode(new_pgp_num, bl); @@ -113,19 +116,24 @@ public: ::encode(new_up, bl); ::encode(new_down, bl); ::encode(new_weight, bl); + + // extended ::encode(new_up_thru, bl); + ::encode(new_last_clean_interval, bl); ::encode(new_pg_swap_primary, bl); ::encode(old_pg_swap_primary, bl); ::encode(new_max_snap, bl); ::encode(removed_snaps.m, bl); } void decode(bufferlist::iterator &p) { + // base ::decode(fsid, p); ::decode(epoch, p); ::decode(ctime, p); ::decode(new_flags, p); ::decode(fullmap, p); ::decode(crush, p); + ::decode(new_max_osd, p); ::decode(new_pg_num, p); ::decode(new_pgp_num, p); @@ -134,7 +142,10 @@ public: ::decode(new_up, p); ::decode(new_down, p); ::decode(new_weight, p); + + // extended ::decode(new_up_thru, p); + ::decode(new_last_clean_interval, p); ::decode(new_pg_swap_primary, p); ::decode(old_pg_swap_primary, p); ::decode(new_max_snap, p); @@ -190,8 +201,21 @@ private: vector osd_state; vector osd_addr; vector<__u32> osd_weight; // 16.16 fixed point, 0x10000 = "in", 0 = "out" + + /* + * we track up to two intervals during which the osd was alive and + * healthy. the most recent is [up_from,up_thru), where up_thru is the + * last epoch the osd is known to have _started_. i.e., a lower bound on the + * actual osd death. + * + * the second is last_clean_interval [first,last]. in that case, the last + * interval is the last epoch known to have been either _finished_, or during + * which the osd cleanly shut down. + */ vector osd_up_from; // when it went up - vector osd_up_thru; // lower bound on _actual_ osd death. bumped by osd before activating pgs with no replicas. + vector osd_up_thru; // lower bound on _actual_ osd death. + vector > osd_last_clean_interval; + map pg_swap_primary; // force new osd to be pg primary (if already a member) snapid_t max_snap; interval_set removed_snaps; @@ -263,11 +287,14 @@ private: osd_state.resize(m); osd_up_from.resize(m); osd_up_thru.resize(m); + osd_last_clean_interval.resize(m); osd_weight.resize(m); for (; o get_last_clean_interval(int osd) { + assert(exists(osd)); + return osd_last_clean_interval[osd]; + } int get_any_up_osd() { for (int i=0; ifirst] = i->second; + for (map >::iterator i = inc.new_last_clean_interval.begin(); + i != inc.new_last_clean_interval.end(); + i++) + osd_last_clean_interval[i->first] = i->second; + for (map::iterator i = inc.new_up.begin(); i != inc.new_up.end(); i++) { @@ -473,6 +505,7 @@ private: // serialize, unserialize void encode(bufferlist& blist) { + // base ::encode(fsid, blist); ::encode(epoch, blist); ::encode(ctime, blist); @@ -488,20 +521,25 @@ private: ::encode(osd_state, blist); ::encode(osd_weight, blist); ::encode(osd_addr, blist); + + // crush + bufferlist cbl; + crush.encode(cbl); + ::encode(cbl, blist); + + // extended ::encode(osd_up_from, blist); ::encode(osd_up_thru, blist); + ::encode(osd_last_clean_interval, blist); ::encode(pg_swap_primary, blist); ::encode(max_snap, blist); ::encode(removed_snaps.m, blist); - - bufferlist cbl; - crush.encode(cbl); - ::encode(cbl, blist); } void decode(bufferlist& blist) { bufferlist::iterator p = blist.begin(); + // base ::decode(fsid, p); ::decode(epoch, p); ::decode(ctime, p); @@ -518,17 +556,21 @@ private: ::decode(osd_state, p); ::decode(osd_weight, p); ::decode(osd_addr, p); + + // crush + bufferlist cbl; + ::decode(cbl, p); + bufferlist::iterator cblp = cbl.begin(); + crush.decode(cblp); + + // extended ::decode(osd_up_from, p); ::decode(osd_up_thru, p); + ::decode(osd_last_clean_interval, p); ::decode(pg_swap_primary, p); ::decode(max_snap, p); ::decode(removed_snaps.m, p); - - bufferlist cbl; - ::decode(cbl, p); - bufferlist::iterator cblp = cbl.begin(); - crush.decode(cblp); } -- 2.39.5