From c60834d598da92b1dc0162a8ce1a2faf84318dd0 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 10 Mar 2008 17:16:08 -0700 Subject: [PATCH] osd: reworked pg creation a bit --- src/TODO | 4 ++++ src/config.cc | 5 ---- src/config.h | 2 -- src/kernel/osdmap.c | 5 ++-- src/mon/OSDMonitor.cc | 24 ++++++++++++++++--- src/osd/OSD.cc | 38 ++++++++++++++--------------- src/osd/OSDMap.cc | 2 +- src/osd/OSDMap.h | 56 +++++++++++++++++++++++++++++-------------- src/osd/PG.cc | 5 ++-- 9 files changed, 86 insertions(+), 55 deletions(-) diff --git a/src/TODO b/src/TODO index 3db36af3488da..b59532d085ed9 100644 --- a/src/TODO +++ b/src/TODO @@ -167,6 +167,10 @@ objecter - read+floor_lockout osd/rados +- pg splits. + - during mkps, iterate over affected old pgs and move objects + - set prior set on new pg to include primaries+replicas of any possible old pgs from whom objects may have been inherited. + - fix build_prior_set behavior. needs to not always exclude currently down nodes. e.g., 1: A B diff --git a/src/config.cc b/src/config.cc index 780061179614b..23638040864e6 100644 --- a/src/config.cc +++ b/src/config.cc @@ -333,8 +333,6 @@ md_config_t g_conf = { osd_auto_weight: false, - osd_hack_fast_startup: false, // this breaks localized pgs. 
- // --- fakestore --- fakestore_sync_interval: 2, // seconds @@ -910,9 +908,6 @@ void parse_config_options(std::vector& args) else if (strcmp(args[i], "--osd_auto_weight") == 0) g_conf.osd_auto_weight = atoi(args[++i]); - else if (strcmp(args[i], "--osd_hack_fast_startup") == 0) - g_conf.osd_hack_fast_startup = atoi(args[++i]); - else if (strcmp(args[i], "--bdev_lock") == 0) g_conf.bdev_lock = atoi(args[++i]); else if (strcmp(args[i], "--bdev_el_bidir") == 0) diff --git a/src/config.h b/src/config.h index 6f508178e3b0f..ffbe8fc4ceaa5 100644 --- a/src/config.h +++ b/src/config.h @@ -270,8 +270,6 @@ struct md_config_t { bool osd_auto_weight; - bool osd_hack_fast_startup; - // fakestore double fakestore_sync_interval; bool fakestore_fake_attrs; diff --git a/src/kernel/osdmap.c b/src/kernel/osdmap.c index f35ce914c510f..8d6d8ea7fd1b7 100644 --- a/src/kernel/osdmap.c +++ b/src/kernel/osdmap.c @@ -305,8 +305,6 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) goto bad; if ((err = ceph_decode_32(p, end, &map->epoch)) < 0) goto bad; - if ((err = ceph_decode_32(p, end, &map->mkfs_epoch)) < 0) - goto bad; if ((err = ceph_decode_32(p, end, &map->ctime.tv_sec)) < 0) goto bad; if ((err = ceph_decode_32(p, end, &map->ctime.tv_usec)) < 0) @@ -320,6 +318,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) goto bad; if ((err = ceph_decode_32(p, end, &map->localized_pg_num)) < 0) goto bad; + *p += 2*sizeof(__u32); /* skip prior_*pg_num fields */ calc_pg_masks(map); @@ -397,7 +396,6 @@ struct ceph_osdmap *apply_incremental(void **p, void *end, struct ceph_osdmap *m goto bad; if ((err = ceph_decode_32(p, end, &epoch)) < 0) goto bad; - (*p)++; /* skip mkfs u8 */ BUG_ON(epoch != map->epoch+1); if ((err = ceph_decode_32(p, end, &ctime.tv_sec)) < 0) goto bad; @@ -426,6 +424,7 @@ struct ceph_osdmap *apply_incremental(void **p, void *end, struct ceph_osdmap *m /* new max? 
*/ if ((err = ceph_decode_32(p, end, &max)) < 0) goto bad; + *p += 2*sizeof(__u32); /* skip new_pg_num, for now. FIXME. */ if (max > 0) { if ((err = osdmap_set_max_osd(map, max)) < 0) goto bad; diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 23976f1400d55..f05ea4a4bfc00 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -473,9 +473,13 @@ bool OSDMonitor::prepare_boot(MOSDBoot *m) pending_inc.new_offload[from] = CEPH_OSD_IN; osd_weight[from] = m->sb.weight; - - if (!osdmap.post_mkfs() && !osdmap.is_mkfs()) - pending_inc.mkfs = 1; // first set of up osds, do the mkfs! + + if (osdmap.get_pg_num() == 0) { + // set a conservative initial pg_num + pending_inc.new_pg_num = osdmap.get_max_osd() << g_conf.osd_pg_bits; + pending_inc.new_localized_pg_num = 4; // per osd + dout(1) << "prepare_boot setting initial pg_num to " << pending_inc.new_pg_num << dendl; + } // wait paxos->wait_for_commit(new C_Booted(this, m)); @@ -763,6 +767,20 @@ bool OSDMonitor::prepare_command(MMonCommand *m) paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs)); return true; } + else if (m->cmd[1] == "setpgnum" && m->cmd.size() > 2) { + int n = atoi(m->cmd[2].c_str()); + if (n > osdmap.get_pg_num()) { + ss << "set new pg_num = " << n; + pending_inc.new_pg_num = n; + getline(ss, rs); + paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs)); + return true; + } else { + ss << "specified pg_num " << n << " <= current " << osdmap.get_pg_num(); // also reached when n equals the current pg_num + getline(ss, rs); + mon->reply_command(m, -EINVAL, rs); + } + } else if (m->cmd[1] == "down" && m->cmd.size() > 2) { errno = 0; long osd = strtol(m->cmd[2].c_str(), 0, 10); diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 564dda6d1d066..f3e12b80540ba 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -506,8 +506,6 @@ void OSD::try_create_pg(pg_t pgid, ObjectStore::Transaction& t) pg->info.history.same_primary_since = pg->info.history.same_acker_since = osdmap->get_epoch(); pg->write_log(t); - if 
(g_conf.osd_hack_fast_startup) - pg->activate(t); dout(7) << "created " << *pg << dendl; pg->unlock(); @@ -1325,33 +1323,37 @@ void OSD::advance_map(ObjectStore::Transaction& t) { dout(7) << "advance_map epoch " << osdmap->get_epoch() << " " << pg_map.size() << " pgs" - << " mkfs_peoch " << osdmap->get_mkfs_epoch() << dendl; - if (osdmap->is_mkfs()) { + if (osdmap->is_mkpg()) { + // FIXME: move this bit elsewhere.... // is this okay? + /* ceph_fsid nullfsid; memset(&nullfsid, 0, sizeof(nullfsid)); if (memcmp(&nullfsid, &superblock.fsid, sizeof(nullfsid)) != 0) { - derr(0) << "will not mkfs, my superblock fsid is not zeroed" << dendl; + derr(0) << "will not mkps, my superblock fsid is not zeroed" << dendl; assert(0); } - superblock.fsid = osdmap->get_fsid(); - assert(g_conf.osd_mkfs); // make sure we did a mkfs! + */ + superblock.fsid = osdmap->get_fsid(); // FIXME + //assert(g_conf.osd_mkfs); // make sure we did a mkfs! // ok! ps_t numps = osdmap->get_pg_num(); + ps_t fromps = osdmap->get_prior_pg_num(); ps_t numlps = osdmap->get_localized_pg_num(); - dout(1) << "mkfs " << osdmap->get_fsid() << " on " - << numps << " normal, " - << numlps << " localized pg sets" << dendl; + ps_t fromlps = osdmap->get_prior_localized_pg_num(); // localized loops start at the prior *localized* count, not the normal one + dout(1) << "mkpg " << osdmap->get_fsid() << " on " + << fromps << "-" << numps << " normal, " + << fromlps << "-" << numlps << " localized pg sets" << dendl; int minrep = 1; int maxrep = MIN(g_conf.num_osd, g_conf.osd_max_rep); int minraid = g_conf.osd_min_raid_width; int maxraid = g_conf.osd_max_raid_width; int numpool = 1; // FIXME - dout(1) << "mkfs " << minrep << ".." << maxrep << " replicas, " + dout(1) << "mkpg " << minrep << ".." << maxrep << " replicas, " << minraid << ".." 
<< maxraid << " osd raid groups" << dendl; //derr(0) << "osdmap " << osdmap->get_ctime() << " logger start " << logger->get_start() << dendl; @@ -1361,22 +1363,22 @@ void OSD::advance_map(ObjectStore::Transaction& t) // replicated for (int pool = 0; pool < numpool; pool++) for (int nrep = 1; nrep <= maxrep; nrep++) { - for (ps_t ps = 0; ps < numps; ++ps) + for (ps_t ps = fromps; ps < numps; ++ps) try_create_pg(pg_t(pg_t::TYPE_REP, nrep, ps, pool, -1), t); - for (ps_t ps = 0; ps < numlps; ++ps) + for (ps_t ps = fromlps; ps < numlps; ++ps) try_create_pg(pg_t(pg_t::TYPE_REP, nrep, ps, pool, whoami), t); } // raided for (int pool = 0; pool < numpool; pool++) for (int size = minraid; size <= maxraid; size++) { - for (ps_t ps = 0; ps < numps; ++ps) + for (ps_t ps = fromps; ps < numps; ++ps) try_create_pg(pg_t(pg_t::TYPE_RAID4, size, ps, pool, -1), t); - for (ps_t ps = 0; ps < numlps; ++ps) + for (ps_t ps = fromlps; ps < numlps; ++ps) try_create_pg(pg_t(pg_t::TYPE_RAID4, size, ps, pool, whoami), t); } - dout(1) << "mkfs done, created " << pg_map.size() << " pgs" << dendl; + dout(1) << "mkpg done, now i have " << pg_map.size() << " pgs" << dendl; } else { // scan existing pg's @@ -1548,10 +1550,6 @@ void OSD::activate_map(ObjectStore::Transaction& t) pg->unlock(); } - if (g_conf.osd_hack_fast_startup && - osdmap->is_mkfs()) // hack: skip the queries/summaries if it's a mkfs - return; - do_notifies(notify_list); // notify? (residual|replica) do_queries(query_map); do_activators(activator_map); diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 223f2cd6db6d2..1b0d36566940e 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -29,7 +29,7 @@ void OSDMap::build_simple(epoch_t e, ceph_fsid &fsid, ctime = g_clock.now(); set_max_osd(num_osd); - set_pg_num(num_osd << pg_bits); + //pg_num = num_osd << pg_bits; // not here.. must be >0 'up' osds! 
// crush map map weights; diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index 6de80fd9fa148..5d779c386d943 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -51,7 +51,7 @@ using namespace std; inline int calc_bits_of(int t) { int b = 0; - while (t) { + while (t > 0) { t = t >> 1; b++; } @@ -77,7 +77,8 @@ public: // incremental int32_t new_max_osd; - __u8 mkfs; + int32_t new_pg_num; + int32_t new_localized_pg_num; map new_up; map new_down; map new_offload; @@ -87,11 +88,12 @@ public: void encode(bufferlist& bl) { ::_encode(fsid, bl); ::_encode(epoch, bl); - ::_encode(mkfs, bl); ctime._encode(bl); ::_encode(fullmap, bl); ::_encode(crush, bl); ::_encode(new_max_osd, bl); + ::_encode(new_pg_num, bl); + ::_encode(new_localized_pg_num, bl); ::_encode(new_up, bl); ::_encode(new_down, bl); ::_encode(new_offload, bl); @@ -101,11 +103,12 @@ public: void decode(bufferlist& bl, int& off) { ::_decode(fsid, bl, off); ::_decode(epoch, bl, off); - ::_decode(mkfs, bl, off); ctime._decode(bl, off); ::_decode(fullmap, bl, off); ::_decode(crush, bl, off); ::_decode(new_max_osd, bl, off); + ::_decode(new_pg_num, bl, off); + ::_decode(new_localized_pg_num, bl, off); ::_decode(new_up, bl, off); ::_decode(new_down, bl, off); ::_decode(new_offload, bl, off); @@ -113,20 +116,24 @@ public: ::_decode(old_pg_swap_primary, bl, off); } - Incremental(epoch_t e=0) : epoch(e), new_max_osd(-1), mkfs(0) { + Incremental(epoch_t e=0) : epoch(e), new_max_osd(-1), new_pg_num(0), new_localized_pg_num(0) { fsid.major = fsid.minor = cpu_to_le64(0); } }; private: ceph_fsid fsid; - epoch_t epoch, mkfs_epoch; // what epoch of the osd cluster descriptor is this - utime_t ctime, mtime; // epoch start time + epoch_t epoch; // what epoch of the osd cluster descriptor is this + utime_t ctime, mtime; // epoch start time int32_t pg_num; // placement group count int32_t pg_num_mask; // bitmask for above int32_t localized_pg_num; // localized place group count int32_t localized_pg_num_mask; // ditto + // 
values from prior epoch, so we can create them + int32_t prior_pg_num; + int32_t prior_localized_pg_num; + int32_t max_osd; vector osd_state; vector osd_addr; @@ -139,9 +146,9 @@ private: friend class MDS; public: - OSDMap() : epoch(0), mkfs_epoch((epoch_t)-1), - pg_num(1<<5), - localized_pg_num(1<<3), + OSDMap() : epoch(0), + pg_num(0), localized_pg_num(0), + prior_pg_num(0), prior_localized_pg_num(0), max_osd(0) { fsid.major = fsid.minor = cpu_to_le64(0); calc_pg_masks(); @@ -162,16 +169,18 @@ private: } int get_pg_num() const { return pg_num; } - void set_pg_num(int m) { pg_num = m; calc_pg_masks(); } + int get_prior_pg_num() const { return prior_pg_num; } + //void set_pg_num(int m) { pg_num = m; calc_pg_masks(); } int get_localized_pg_num() const { return localized_pg_num; } + int get_prior_localized_pg_num() const { return prior_localized_pg_num; } /* stamps etc */ const utime_t& get_ctime() const { return ctime; } const utime_t& get_mtime() const { return mtime; } - bool is_mkfs() const { return epoch == mkfs_epoch; } - bool post_mkfs() const { return epoch > mkfs_epoch; } - epoch_t get_mkfs_epoch() const { return mkfs_epoch; } + bool is_mkpg() const { + return (pg_num > prior_pg_num) || (localized_pg_num > prior_localized_pg_num); + } /***** cluster state *****/ /* osds */ @@ -276,6 +285,9 @@ private: epoch++; ctime = inc.ctime; + prior_pg_num = pg_num; + prior_localized_pg_num = localized_pg_num; // snapshot the current localized count (was a self-assignment) + // full map? if (inc.fullmap.length()) { decode(inc.fullmap); @@ -287,8 +299,14 @@ private: } // nope, incremental. 
- if (inc.mkfs) - mkfs_epoch = epoch; + if (inc.new_pg_num) { + pg_num = inc.new_pg_num; + assert(pg_num >= prior_pg_num); + } + if (inc.new_localized_pg_num) { + localized_pg_num = inc.new_localized_pg_num; + assert(localized_pg_num >= prior_localized_pg_num); + } if (inc.new_max_osd >= 0) set_max_osd(inc.new_max_osd); @@ -328,11 +346,12 @@ private: void encode(bufferlist& blist) { ::_encode(fsid, blist); ::_encode(epoch, blist); - ::_encode(mkfs_epoch, blist); ::_encode(ctime, blist); ::_encode(mtime, blist); ::_encode(pg_num, blist); ::_encode(localized_pg_num, blist); + ::_encode(prior_pg_num, blist); + ::_encode(prior_localized_pg_num, blist); ::_encode(max_osd, blist); ::_encode(osd_state, blist); @@ -348,11 +367,12 @@ private: int off = 0; ::_decode(fsid, blist, off); ::_decode(epoch, blist, off); - ::_decode(mkfs_epoch, blist, off); ::_decode(ctime, blist, off); ::_decode(mtime, blist, off); ::_decode(pg_num, blist, off); ::_decode(localized_pg_num, blist, off); + ::_decode(prior_pg_num, blist, off); + ::_decode(prior_localized_pg_num, blist, off); calc_pg_masks(); ::_decode(max_osd, blist, off); diff --git a/src/osd/PG.cc b/src/osd/PG.cc index d1fdcd8f0e825..2966c7acb532e 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -661,7 +661,7 @@ void PG::peer(ObjectStore::Transaction& t, } else { dout(10) << " still active from last started: " << last_started << dendl; } - } else if (osd->osdmap->post_mkfs()) { + } else if (info.pgid.u.pg.ps < osd->osdmap->get_prior_pg_num()) { dout(10) << " crashed since epoch " << last_epoch_started_any << dendl; state_set(STATE_CRASHED); } @@ -888,8 +888,7 @@ void PG::activate(ObjectStore::Transaction& t, } // if primary.. - if (role == 0 && - (!g_conf.osd_hack_fast_startup || osd->osdmap->post_mkfs())) { + if (role == 0) { // who is clean? uptodate_set.clear(); if (info.is_uptodate()) -- 2.39.5