From 02e887312b265d1a68e9efe765bb27bca91a90b3 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 10 Mar 2008 16:23:41 -0700 Subject: [PATCH] revamped mkfs procedures, mon startup, and more --- src/Makefile.am | 16 +++- src/TODO | 5 + src/cmon.cc | 113 +++++++++++----------- src/fakefuse.cc | 32 +++++-- src/fakesyn.cc | 29 +++++- src/kernel/osdmap.c | 8 +- src/kernel/osdmap.h | 2 +- src/mds/MDS.cc | 2 +- src/mds/MDSMap.h | 6 +- src/mkmonfs.cc | 98 +++++++++++++++++++ src/mon/ClientMonitor.cc | 4 +- src/mon/MDSMonitor.cc | 2 +- src/mon/MonMap.cc | 2 +- src/mon/Monitor.cc | 67 ++++++++++--- src/mon/Monitor.h | 9 +- src/mon/MonitorStore.cc | 10 +- src/mon/MonitorStore.h | 11 +-- src/mon/OSDMonitor.cc | 197 ++++----------------------------------- src/mon/OSDMonitor.h | 5 +- src/mon/PGMonitor.cc | 3 +- src/mon/Paxos.cc | 2 +- src/mon/Paxos.h | 4 + src/mon/PaxosService.cc | 11 +-- src/mon/PaxosService.h | 5 +- src/monmaptool.cc | 20 ++-- src/osd/OSD.cc | 3 +- src/osd/OSDMap.cc | 162 ++++++++++++++++++++++++++++++++ src/osd/OSDMap.h | 34 ++++--- src/osdmaptool.cc | 161 ++++++++++++++++++++++++++++++++ src/start.sh | 19 +++- 30 files changed, 713 insertions(+), 329 deletions(-) create mode 100644 src/mkmonfs.cc create mode 100644 src/osd/OSDMap.cc create mode 100644 src/osdmaptool.cc diff --git a/src/Makefile.am b/src/Makefile.am index 3a89eb739c97b..b36bf59c90ff3 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -7,14 +7,18 @@ AUTOMAKE_OPTIONS = gnu # monitor cmon_SOURCES = cmon.cc msg/SimpleMessenger.cc cmon_LDADD = libmon.a libcrush.a libcommon.a -mkmonmap_SOURCES = mkmonmap.cc -mkmonmap_LDADD = libcommon.a -monmaptool_SOURCES = monmaptool.cc -monmaptool_LDADD = libcommon.a cmonctl_SOURCES = cmonctl.cc msg/SimpleMessenger.cc cmonctl_LDADD = libcommon.a + +# tools +mkmonfs_SOURCES = mkmonfs.cc +mkmonfs_LDADD = libmon.a libcommon.a libcrush.a +monmaptool_SOURCES = monmaptool.cc +monmaptool_LDADD = libcommon.a crushtool_SOURCES = crushtool.cc crushtool_LDADD = libcommon.a libcrush.a +osdmaptool_SOURCES = osdmaptool.cc +osdmaptool_LDADD = libmon.a libcommon.a libcrush.a # mds cmds_SOURCES = cmds.cc msg/SimpleMessenger.cc @@ -64,7 +68,8 @@ AM_LDFLAGS = bin_PROGRAMS = \ cmon cmds cosd csyn \ - mkmonmap monmaptool crushtool cmonctl \ + cmonctl \ + mkmonfs monmaptool osdmaptool crushtool \ fakesyn \ streamtest \ $(FUSEBIN) $(NEWSYN) @@ -84,6 +89,7 @@ libcommon_a_SOURCES = \ common/Finisher.cc \ mon/MonMap.cc \ mon/MonClient.cc \ + osd/OSDMap.cc \ config.cc libcrush_a_SOURCES = \ diff --git a/src/TODO b/src/TODO index 59eda6e0951dd..3db36af3488da 100644 --- a/src/TODO +++ b/src/TODO @@ -3,6 +3,11 @@ gsoc? - libbtrfs objectstore - osdmap/crush gui + +- online changes to pg_num +- initialization of pg_num during non-simple mkfs... +- review/remove post_mkfs hacks in PG.cc?? + code cleanup - userspace encoding/decoding needs major cleanup - use le32 etc annotation diff --git a/src/cmon.cc b/src/cmon.cc index 57509d4becaf9..35bffd0322457 100644 --- a/src/cmon.cc +++ b/src/cmon.cc @@ -25,92 +25,93 @@ using namespace std; #include "mon/MonMap.h" #include "mon/Monitor.h" +#include "mon/MonitorStore.h" #include "msg/SimpleMessenger.h" #include "common/Timer.h" +void usage() +{ + cerr << "usage: ./cmon [flags] " << std::endl; + cerr << " -d daemonize" << std::endl; + cerr << " -o log output to dir/mon#" << std::endl; + cerr << " --debug_mon n debug monitor level (e.g. 10)" << std::endl; + cerr << " --debug_ms n debug messaging level (e.g. 1)" << std::endl; + exit(1); +} int main(int argc, const char **argv) { + int err; + vector args; argv_to_vec(argc, argv, args); parse_config_options(args); // args - int whoami = -1; - const char *monmap_fn = ".ceph_monmap"; + const char *fsdir = 0; for (unsigned i=0; i= monmap.size() || whoami < 0) { + cerr << "mon" << whoami << " does not exist in monmap" << std::endl; + exit(1); } + // bind + cout << "starting mon" << whoami + << " at " << monmap.get_inst(whoami).addr + << " from " << fsdir << std::endl; + g_my_addr = monmap.get_inst(whoami).addr; + err = rank.bind(); + if (err < 0) + return 1; + create_courtesy_output_symlink("mon", whoami); - rank.start(); - // start monitor Messenger *m = rank.register_entity(entity_name_t::MON(whoami)); - Monitor *mon = new Monitor(whoami, m, &monmap); - mon->init(); + Monitor *mon = new Monitor(whoami, &store, m, &monmap); + rank.start(); // may daemonize + mon->init(); rank.wait(); - // done + store.umount(); delete mon; - return 0; } diff --git a/src/fakefuse.cc b/src/fakefuse.cc index 87292701fdfed..9861ff86c4e81 100644 --- a/src/fakefuse.cc +++ b/src/fakefuse.cc @@ -22,7 +22,7 @@ using namespace std; #include "config.h" #include "mon/Monitor.h" - +#include "mon/MonitorStore.h" #include "mds/MDS.h" #include "osd/OSD.h" #include "client/Client.h" @@ -32,6 +32,7 @@ using namespace std; #include "common/Timer.h" #include "msg/FakeMessenger.h" +#include "messages/MMonCommand.h" @@ -96,7 +97,11 @@ int main(int argc, const char **argv) { Monitor *mon[g_conf.num_mon]; for (int i=0; imkfs(); } // create osd @@ -112,13 +117,28 @@ int main(int argc, const char **argv) { } // init - for (int i=0; iinit(); - for (int i=0; iinit(); + + // build initial osd map + { + OSDMap map; + map.build_simple(0, monmap->fsid, g_conf.num_osd, g_conf.osd_pg_bits, 0); + bufferlist bl; + map.encode(bl); + Messenger *messenger = new FakeMessenger(entity_name_t::ADMIN(-1)); + MMonCommand *m = new MMonCommand(messenger->get_myinst()); + m->set_data(bl); + m->cmd.push_back("osd"); + m->cmd.push_back("setmap"); + messenger->send_message(m, monmap->get_inst(0)); + messenger->shutdown(); + } + for (int i=0; iinit(); - + for (int i=0; iinit(); // create client Client *client[NUMCLIENT]; diff --git a/src/fakesyn.cc b/src/fakesyn.cc index 48dba2d7ec4c5..c2c7c7a3dfc97 100644 --- a/src/fakesyn.cc +++ b/src/fakesyn.cc @@ -24,11 +24,13 @@ using namespace std; #include "mds/MDS.h" #include "osd/OSD.h" #include "mon/Monitor.h" +#include "mon/MonitorStore.h" #include "client/Client.h" #include "client/SyntheticClient.h" #include "msg/FakeMessenger.h" +#include "messages/MMonCommand.h" #include "common/Timer.h" @@ -98,8 +100,13 @@ int main(int argc, const char **argv) // create mon Monitor *mon[g_conf.num_mon]; - for (int i=0; imkfs(); + } // create mds MDS *mds[g_conf.num_mds]; @@ -112,14 +119,28 @@ int main(int argc, const char **argv) start++; } + // build initial osd map + { + OSDMap map; + map.build_simple(0, monmap->fsid, g_conf.num_osd, g_conf.osd_pg_bits, 0); + bufferlist bl; + map.encode(bl); + Messenger *messenger = new FakeMessenger(entity_name_t::ADMIN(-1)); + MMonCommand *m = new MMonCommand(messenger->get_myinst()); + m->set_data(bl); + m->cmd.push_back("osd"); + m->cmd.push_back("setmap"); + messenger->send_message(m, monmap->get_inst(0)); + messenger->shutdown(); + } + // create osd OSD *osd[g_conf.num_osd]; for (int i=0; iepoch)) < 0) goto bad; - if ((err = ceph_decode_32(p, end, &map->mon_epoch)) < 0) + if ((err = ceph_decode_32(p, end, &map->mkfs_epoch)) < 0) goto bad; if ((err = ceph_decode_32(p, end, &map->ctime.tv_sec)) < 0) goto bad; @@ -385,7 +385,7 @@ struct ceph_osdmap *apply_incremental(void **p, void *end, struct ceph_osdmap *m struct ceph_osdmap *newmap = map; struct crush_map *newcrush = 0; struct ceph_fsid fsid; - __u32 epoch, mon_epoch; + __u32 epoch; struct ceph_timeval ctime; __u32 len; __u32 max; @@ -397,9 +397,8 @@ struct ceph_osdmap *apply_incremental(void **p, void *end, struct ceph_osdmap *m goto bad; if ((err = ceph_decode_32(p, end, &epoch)) < 0) goto bad; + (*p)++; /* skip mkfs u8 */ BUG_ON(epoch != map->epoch+1); - if ((err = ceph_decode_32(p, end, &mon_epoch)) < 0) - goto bad; if ((err = ceph_decode_32(p, end, &ctime.tv_sec)) < 0) goto bad; if ((err = ceph_decode_32(p, end, &ctime.tv_usec)) < 0) @@ -433,7 +432,6 @@ struct ceph_osdmap *apply_incremental(void **p, void *end, struct ceph_osdmap *m } map->epoch++; - map->mon_epoch = mon_epoch; map->ctime = map->ctime; if (newcrush) { if (map->crush) diff --git a/src/kernel/osdmap.h b/src/kernel/osdmap.h index 54d801cd84210..1d9c0ab081f28 100644 --- a/src/kernel/osdmap.h +++ b/src/kernel/osdmap.h @@ -8,7 +8,7 @@ struct ceph_osdmap { struct ceph_fsid fsid; ceph_epoch_t epoch; - ceph_epoch_t mon_epoch; + ceph_epoch_t mkfs_epoch; struct ceph_timeval ctime, mtime; __u32 pg_num, pg_num_mask; diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc index 418d520d16f59..4e31ab080d3d2 100644 --- a/src/mds/MDS.cc +++ b/src/mds/MDS.cc @@ -495,7 +495,7 @@ void MDS::handle_mds_map(MMDSMap *m) mdsmap->is_up(whoami) && (oldwhoami != whoami || !logger)) { create_courtesy_output_symlink("mds", whoami); - reopen_logger(mdsmap->get_create()); // adopt mds cluster timeline + reopen_logger(mdsmap->get_created()); // adopt mds cluster timeline } if (oldwhoami != whoami) { diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h index be88d7c3f4b54..bcb8c6c8d8b02 100644 --- a/src/mds/MDSMap.h +++ b/src/mds/MDSMap.h @@ -131,7 +131,11 @@ class MDSMap { epoch_t get_epoch() const { return epoch; } void inc_epoch() { epoch++; } - const utime_t& get_create() const { return created; } + const utime_t& get_created() const { return created; } + void set_created(utime_t ct) { created = ct; } + + int get_max_mds() const { return max_mds; } + void set_max_mds(int m) { max_mds = m; } int get_anchortable() const { return anchortable; } int get_root() const { return root; } diff --git a/src/mkmonfs.cc b/src/mkmonfs.cc new file mode 100644 index 0000000000000..3c64a223466b4 --- /dev/null +++ b/src/mkmonfs.cc @@ -0,0 +1,98 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "mon/MonitorStore.cc" +#include "config.h" + +#include "mon/Monitor.h" +#include "mon/MonMap.h" +#include "mds/MDSMap.h" +#include "osd/OSDMap.h" +#include "mon/PGMap.h" + +void usage() +{ + cerr << "usage: ./mkmonfs [--clobber] --mon --monmap --osdmap " << std::endl; + cerr << " --maxmds N set max mds (default is 1)" << std::endl; + exit(1); +} + +int read_file(const char *fn, bufferlist &bl) +{ + struct stat st; + int fd = ::open(fn, O_RDONLY); + if (fd < 0) { + cerr << "can't open " << fn << ": " << strerror(errno) << std::endl; + return -errno; + } + ::fstat(fd, &st); + bufferptr bp(st.st_size); + bl.append(bp); + ::read(fd, (void*)bl.c_str(), bl.length()); + ::close(fd); + return 0; +} + + +int main(int argc, const char **argv) +{ + vector args; + argv_to_vec(argc, argv, args); + + bool clobber = false; + const char *fsdir = 0; + int whoami = -1; + const char *monmapfn = 0; + for (unsigned i = 0; i < args.size(); i++) { + if (strcmp(args[i], "--clobber") == 0) + clobber = true; + else if (strcmp(args[i], "--mon") == 0) + whoami = atoi(args[++i]); + else if (strcmp(args[i], "--monmap") == 0) + monmapfn = args[++i]; + else if (!fsdir) + fsdir = args[i]; + else + usage(); + } + if (!fsdir || !monmapfn || + whoami < 0) + usage(); + + if (!clobber) { + // make sure it doesn't exist + struct stat st; + if (::lstat(fsdir, &st) == 0) { + cerr << "monfs dir " << fsdir << " already exists; remove it first" << std::endl; + usage(); + } + } + + // load monmap + bufferlist monmapbl; + int err = read_file(monmapfn, monmapbl); + if (err < 0) + exit(1); + MonMap monmap; + monmap.decode(monmapbl); + + // go + MonitorStore store(fsdir); + Monitor mon(whoami, &store, 0, &monmap); + mon.mkfs(); + cout << argv[0] << ": created monfs at " << fsdir + << " for mon" << whoami + << std::endl; + return 0; +} diff --git a/src/mon/ClientMonitor.cc b/src/mon/ClientMonitor.cc index 80d0c24cec2dc..d715c1d871f1f 100644 --- a/src/mon/ClientMonitor.cc +++ b/src/mon/ClientMonitor.cc @@ -85,7 +85,6 @@ bool ClientMonitor::update_from_paxos() void ClientMonitor::create_pending() { - assert(mon->is_leader()); pending_inc = Incremental(); pending_inc.version = client_map.version + 1; pending_inc.next_client = client_map.next_client; @@ -96,7 +95,7 @@ void ClientMonitor::create_pending() void ClientMonitor::create_initial() { - dout(1) << "create_initial -- creating initial map" << dendl; + dout(10) << "create_initial -- creating initial map" << dendl; } void ClientMonitor::committed() @@ -107,7 +106,6 @@ void ClientMonitor::committed() void ClientMonitor::encode_pending(bufferlist &bl) { - assert(mon->is_leader()); dout(10) << "encode_pending v " << pending_inc.version << ", next is " << pending_inc.next_client << dendl; diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index 313ecb8fd6652..f5678d9e7e3ae 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -75,7 +75,7 @@ void MDSMonitor::print_map(MDSMap &m, int dbl) void MDSMonitor::create_initial() { dout(10) << "create_initial" << dendl; - pending_mdsmap.max_mds = g_conf.num_mds; + pending_mdsmap.max_mds = 1; pending_mdsmap.created = g_clock.now(); print_map(pending_mdsmap); } diff --git a/src/mon/MonMap.cc b/src/mon/MonMap.cc index c17d183848442..10d4e302726be 100644 --- a/src/mon/MonMap.cc +++ b/src/mon/MonMap.cc @@ -27,7 +27,7 @@ int MonMap::read(const char *fn) // read bufferlist bl; int fd = ::open(fn, O_RDONLY); - if (fd < 0) return fd; + if (fd < 0) return -errno; struct stat st; ::fstat(fd, &st); bufferptr bp(st.st_size); diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 432a6d10fbf65..dab077c98564f 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -47,6 +47,13 @@ #define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << whoami << (is_starting() ? (const char*)"(starting)":(is_leader() ? (const char*)"(leader)":(is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << " " +void Monitor::preinit() +{ + osdmon = new OSDMonitor(this, &paxos_osdmap); + mdsmon = new MDSMonitor(this, &paxos_mdsmap); + clientmon = new ClientMonitor(this, &paxos_clientmap); + pgmon = new PGMonitor(this, &paxos_pgmap); +} void Monitor::init() { @@ -54,21 +61,7 @@ void Monitor::init() dout(1) << "init" << dendl; - // store - char s[80]; - sprintf(s, "mondata/mon%d", whoami); - store = new MonitorStore(s); - - if (g_conf.mkfs) - store->mkfs(); - - store->mount(); - - // create - osdmon = new OSDMonitor(this, &paxos_osdmap); - mdsmon = new MDSMonitor(this, &paxos_mdsmap); - clientmon = new ClientMonitor(this, &paxos_clientmap); - pgmon = new PGMonitor(this, &paxos_pgmap); + preinit(); // init paxos paxos_osdmap.init(); @@ -418,5 +411,49 @@ void Monitor::tick() +/* + * this is the closest thing to a traditional 'mkfs' for ceph. + * initialize the monitor state machines to their initial values. + */ +int Monitor::mkfs() +{ + preinit(); + + // create it + int err = store->mkfs(); + if (err < 0) { + cerr << "error " << err << " " << strerror(err) << std::endl; + exit(1); + } + + store->put_int(whoami, "whoami", 0); + + bufferlist monmapbl; + monmap->encode(monmapbl); + store->put_bl_ss(monmapbl, "monmap", 0); + + list services; + services.push_back(osdmon); + services.push_back(mdsmon); + services.push_back(clientmon); + services.push_back(pgmon); + for (list::iterator p = services.begin(); + p != services.end(); + p++) { + PaxosService *svc = *p; + dout(10) << "initializing " << svc->get_machine_name() << dendl; + svc->paxos->init(); + svc->create_pending(); + svc->create_initial(); + + // commit to paxos + bufferlist bl; + svc->encode_pending(bl); + store->put_bl_sn(bl, svc->get_machine_name(), 1); + store->put_int(1, svc->get_machine_name(), "last_committed"); + } + + return 0; +} diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h index 432c639ea4153..0d67ae2929ab6 100644 --- a/src/mon/Monitor.h +++ b/src/mon/Monitor.h @@ -138,12 +138,12 @@ public: }; public: - Monitor(int w, Messenger *m, MonMap *mm) : + Monitor(int w, MonitorStore *s, Messenger *m, MonMap *map) : whoami(w), messenger(m), - monmap(mm), + monmap(map), timer(lock), tick_timer(0), - store(0), + store(s), state(STATE_STARTING), stopping(false), @@ -163,6 +163,7 @@ public: delete messenger; } + void preinit(); void init(); void shutdown(); void dispatch(Message *m); @@ -170,6 +171,8 @@ public: void do_stop(); + int mkfs(); + }; #endif diff --git a/src/mon/MonitorStore.cc b/src/mon/MonitorStore.cc index 92691e0e01591..2f2e3ec8caa03 100644 --- a/src/mon/MonitorStore.cc +++ b/src/mon/MonitorStore.cc @@ -27,14 +27,14 @@ #include #include -void MonitorStore::mount() +int MonitorStore::mount() { dout(1) << "mount" << dendl; // verify dir exists DIR *d = ::opendir(dir.c_str()); if (!d) { derr(1) << "basedir " << dir << " dne" << dendl; - assert(0); + return -ENOENT; } ::closedir(d); @@ -47,17 +47,19 @@ void MonitorStore::mount() dir += "/"; dir += old; } + return 0; } -void MonitorStore::mkfs() +int MonitorStore::mkfs() { dout(1) << "mkfs" << dendl; char cmd[200]; sprintf(cmd, "test -d %s && /bin/rm -r %s ; mkdir -p %s", dir.c_str(), dir.c_str(), dir.c_str()); dout(1) << cmd << dendl; - system(cmd); + int r = system(cmd); + return r; } diff --git a/src/mon/MonitorStore.h b/src/mon/MonitorStore.h index 485bf972551c4..0167127998de8 100644 --- a/src/mon/MonitorStore.h +++ b/src/mon/MonitorStore.h @@ -24,13 +24,12 @@ class MonitorStore { string dir; public: - MonitorStore(char *d) : dir(d) { - } - ~MonitorStore() { - } + MonitorStore(const char *d) : dir(d) { } + ~MonitorStore() { } - void mkfs(); // wipe - void mount(); + int mkfs(); // wipe + int mount(); + int umount() { return 0; } // ints (stored as ascii) version_t get_int(const char *a, const char *b=0); diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 124f04182b15d..23976f1400d55 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -99,193 +99,23 @@ void OSDMonitor::fake_reorg() /************ MAPS ****************/ + void OSDMonitor::create_initial() { - assert(mon->is_leader()); - assert(paxos->get_version() == 0); - - dout(1) << "create_initial for " << mon->monmap->fsid << " from g_conf" << dendl; + dout(10) << "create_initial for " << mon->monmap->fsid << " from g_conf" << dendl; - // OSDMap newmap; + newmap.epoch = 1; newmap.set_fsid(mon->monmap->fsid); - newmap.mon_epoch = mon->mon_epoch; newmap.ctime = g_clock.now(); - - newmap.set_pg_num(g_conf.num_osd << g_conf.osd_pg_bits); - - // start at epoch 1 until all osds boot - newmap.inc_epoch(); // = 1 - assert(newmap.get_epoch() == 1); - - map weights; - build_crush_map(newmap.crush, weights); - - // -- test -- -#if 0 - { - //vector t; - //crush.do_rule(2, 132, t, 4, -1); - - // 3x5p2 - int n = 4; - int x = 0; - int p = 2; - vector r(n); - newmap.mark_down(0, false); - newmap.mark_out(0); - newmap.crush.do_rule(CRUSH_REP_RULE(n), x, r, n, p); - dout(0) << "test out " << r << dendl; - } -#endif - - newmap.set_max_osd(g_conf.num_osd); - for (int i=0; i - - // fake osd failures - for (map::iterator i = g_fake_osd_down.begin(); - i != g_fake_osd_down.end(); - i++) { - dout(0) << "will fake osd" << i->first << " DOWN after " << i->second << dendl; - mon->timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 1)); - } - for (map::iterator i = g_fake_osd_out.begin(); - i != g_fake_osd_out.end(); - i++) { - dout(0) << "will fake osd" << i->first << " OUT after " << i->second << dendl; - mon->timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 0)); - } + newmap.crush.create(); // empty crush map // encode into pending incremental newmap.encode(pending_inc.fullmap); } -void OSDMonitor::build_crush_map(CrushWrapper& crush, - map& weights) -{ - // new - crush.create(); - - if (g_conf.num_osd >= 12) { - int ndom = g_conf.osd_max_rep; - int ritems[ndom]; - int rweights[ndom]; - - int nper = ((g_conf.num_osd - 1) / ndom) + 1; - derr(0) << ndom << " failure domains, " << nper << " osds each" << dendl; - - int o = 0; - for (int i=0; i= i) { - crush_rule *rule = crush_make_rule(4); - crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0); - crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_INDEP, i, 1); - crush_rule_set_step(rule, 2, CRUSH_RULE_CHOOSE_INDEP, 1, 0); - crush_rule_set_step(rule, 3, CRUSH_RULE_EMIT, 0, 0); - crush_add_rule(crush.crush, CRUSH_RAID_RULE(i, pool), rule); - } else { - crush_rule *rule = crush_make_rule(3); - crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0); - crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_INDEP, i, 0); - crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0); - crush_add_rule(crush.crush, CRUSH_RAID_RULE(i, pool), rule); - } - } - - } else { - // one bucket - - int items[g_conf.num_osd]; - for (int i=0; imax_devices << dendl; -} bool OSDMonitor::update_from_paxos() @@ -321,7 +151,7 @@ bool OSDMonitor::update_from_paxos() int off = 0; inc.decode(bl, off); osdmap.apply_incremental(inc); - + // write out the full map, too. bl.clear(); osdmap.encode(bl); @@ -358,7 +188,6 @@ void OSDMonitor::encode_pending(bufferlist &bl) // finish up pending_inc pending_inc.ctime = g_clock.now(); - pending_inc.mon_epoch = mon->mon_epoch; // tell me about it for (map::iterator i = pending_inc.new_down.begin(); @@ -472,11 +301,11 @@ bool OSDMonitor::should_propose(double& delay) { dout(10) << "should_propose" << dendl; if (osdmap.epoch == 1) { - if (pending_inc.new_up.size() == (unsigned)g_conf.num_osd) { + if (pending_inc.new_up.size() == (unsigned)osdmap.get_max_osd()) { delay = 0.0; if (g_conf.osd_auto_weight) { CrushWrapper crush; - build_crush_map(crush, osd_weight); + OSDMap::build_simple_crush_map(crush, osdmap.get_max_osd(), osd_weight); crush._encode(pending_inc.crush); } return true; @@ -645,6 +474,9 @@ bool OSDMonitor::prepare_boot(MOSDBoot *m) osd_weight[from] = m->sb.weight; + if (!osdmap.post_mkfs() && !osdmap.is_mkfs()) + pending_inc.mkfs = 1; // first set of up osds, do the mkfs! + // wait paxos->wait_for_commit(new C_Booted(this, m)); } @@ -915,6 +747,15 @@ bool OSDMonitor::prepare_command(MMonCommand *m) paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs)); return true; } + else if (m->cmd[1] == "setmap") { + OSDMap map; + map.decode(m->get_data()); + map.epoch = pending_inc.epoch; // make sure epoch is correct + map.encode(pending_inc.fullmap); + string rs = "set osd map"; + paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs)); + return true; + } else if (m->cmd[1] == "setmaxosd" && m->cmd.size() > 2) { pending_inc.new_max_osd = atoi(m->cmd[2].c_str()); ss << "set new max_osd = " << pending_inc.new_max_osd; diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index cee68642e7ad2..49fcbf767e4a8 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -46,11 +46,10 @@ private: map osd_weight; - void build_crush_map(CrushWrapper& crush, - map& weights); - // svc +public: void create_initial(); +private: bool update_from_paxos(); void create_pending(); // prepare a new pending void encode_pending(bufferlist &bl); diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index 18f7c4c008917..4a223696bfa0b 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -63,7 +63,7 @@ void PGMonitor::tick() { */ void PGMonitor::create_initial() { - dout(1) << "create_initial -- creating initial map" << dendl; + dout(10) << "create_initial -- creating initial map" << dendl; } bool PGMonitor::update_from_paxos() @@ -127,7 +127,6 @@ void PGMonitor::create_pending() void PGMonitor::encode_pending(bufferlist &bl) { - assert(mon->is_leader()); dout(10) << "encode_pending v " << pending_inc.version << dendl; assert(paxos->get_version() + 1 == pending_inc.version); pending_inc._encode(bl); diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc index ce15e5d5b4813..b22f223aaf527 100644 --- a/src/mon/Paxos.cc +++ b/src/mon/Paxos.cc @@ -711,7 +711,7 @@ void Paxos::dispatch(Message *m) bool Paxos::is_readable() { - //dout(15) << "is_readable now=" << g_clock.now() << " lease_expire=" << lease_expire << dendl; + dout(1) << "is_readable now=" << g_clock.now() << " lease_expire=" << lease_expire << dendl; return (mon->is_peon() || mon->is_leader()) && is_active() && diff --git a/src/mon/Paxos.h b/src/mon/Paxos.h index 57c0e001495e8..b2e9ce900207c 100644 --- a/src/mon/Paxos.h +++ b/src/mon/Paxos.h @@ -203,6 +203,10 @@ public: lease_timeout_event(0), accept_timeout_event(0) { } + const char *get_machine_name() const { + return machine_name; + } + void dispatch(Message *m); void init(); diff --git a/src/mon/PaxosService.cc b/src/mon/PaxosService.cc index 7b0eed20972a0..96c0482910d2c 100644 --- a/src/mon/PaxosService.cc +++ b/src/mon/PaxosService.cc @@ -23,6 +23,10 @@ #define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_paxos) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".paxosservice(" << get_paxos_name(paxos->machine_id) << ") " +const char *PaxosService::get_machine_name() +{ + return paxos->get_machine_name(); +} void PaxosService::dispatch(Message *m) @@ -110,6 +114,7 @@ void PaxosService::propose_pending() { dout(10) << "propose_pending" << dendl; assert(have_pending); + assert(mon->is_leader()); if (proposal_timer) { mon->timer.cancel_event(proposal_timer); @@ -160,12 +165,6 @@ void PaxosService::_active() create_pending(); have_pending = true; } - - if (g_conf.mkfs && - paxos->get_version() == 0) { - create_initial(); - propose_pending(); - } } } diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h index 234f0bf3c4c4f..1e97e421b2b3b 100644 --- a/src/mon/PaxosService.h +++ b/src/mon/PaxosService.h @@ -25,6 +25,7 @@ class PaxosService : public Dispatcher { protected: Monitor *mon; Paxos *paxos; + friend class Monitor; class C_RetryMessage : public Context { PaxosService *svc; @@ -75,6 +76,8 @@ public: PaxosService(Monitor *mn, Paxos *p) : mon(mn), paxos(p), proposal_timer(0), have_pending(false) { } + + const char *get_machine_name(); // i implement and you ignore void dispatch(Message *m); @@ -89,9 +92,9 @@ public: void propose_pending(); // propose current pending as new paxos state // you implement + virtual void create_initial() = 0; virtual bool update_from_paxos() = 0; // assimilate latest state from paxos virtual void create_pending() = 0; // [leader] create new pending structures - virtual void create_initial() = 0; // [leader] populate pending with initial state (1) virtual void encode_pending(bufferlist& bl) = 0; // [leader] finish and encode pending for next paxos state virtual void discard_pending() { } // [leader] discard pending diff --git a/src/monmaptool.cc b/src/monmaptool.cc index b22a881ce9473..70ff5facd4c8b 100644 --- a/src/monmaptool.cc +++ b/src/monmaptool.cc @@ -33,12 +33,11 @@ using namespace std; ./monmaptool -f .ceph_monmap --add 1.2.3.4:12345 ./monmaptool -f .ceph_monmap --rm 1.2.3.4:12345 - */ void usage(const char *me) { - cout << me << " usage: [--print] [--create [--clobber]] [--add 1.2.3.4:567] [--rm 1.2.3.4:567]" << std::endl; + cout << me << " usage: [--print] [--create [--clobber]] [--add 1.2.3.4:567] [--rm 1.2.3.4:567] " << std::endl; exit(1); } @@ -58,7 +57,7 @@ int main(int argc, const char **argv) const char *me = argv[0]; - const char *fn = ".ceph_monmap"; + const char *fn = 0; bool print = false; bool create = false; bool clobber = false; @@ -66,9 +65,7 @@ int main(int argc, const char **argv) list add, rm; for (unsigned i=0; i acting; int nrep = osdmap->pg_to_acting_osds(pgid, acting); + dout(20) << "pgid " << pgid << " -> " << acting << dendl; int role = osdmap->calc_pg_role(whoami, acting, nrep); if (role < 0) return; @@ -1324,12 +1325,12 @@ void OSD::advance_map(ObjectStore::Transaction& t) { dout(7) << "advance_map epoch " << osdmap->get_epoch() << " " << pg_map.size() << " pgs" + << " mkfs_peoch " << osdmap->get_mkfs_epoch() << dendl; if (osdmap->is_mkfs()) { // is this okay? - assert(superblock.current_epoch == 2); ceph_fsid nullfsid; memset(&nullfsid, 0, sizeof(nullfsid)); if (memcmp(&nullfsid, &superblock.fsid, sizeof(nullfsid)) != 0) { diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc new file mode 100644 index 0000000000000..223f2cd6db6d2 --- /dev/null +++ b/src/osd/OSDMap.cc @@ -0,0 +1,162 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "OSDMap.h" + +#include "config.h" +#define dout generic_dout +#define derr generic_derr + + +void OSDMap::build_simple(epoch_t e, ceph_fsid &fsid, + int num_osd, int pg_bits, int mds_local_osd) +{ + dout(10) << "build_simple on " << num_osd + << " osds with " << pg_bits << " pg bits per osd" << dendl; + epoch = e; + set_fsid(fsid); + ctime = g_clock.now(); + + set_max_osd(num_osd); + set_pg_num(num_osd << pg_bits); + + // crush map + map weights; + build_simple_crush_map(crush, num_osd, weights); + + for (int i=0; i& weights) +{ + // new + crush.create(); + + if (num_osd >= 12) { + int ndom = g_conf.osd_max_rep; + int ritems[ndom]; + int rweights[ndom]; + + int nper = ((num_osd - 1) / ndom) + 1; + derr(0) << ndom << " failure domains, " << nper << " osds each" << dendl; + + int o = 0; + for (int i=0; i= i) { + crush_rule *rule = crush_make_rule(4); + crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0); + crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_INDEP, i, 1); + crush_rule_set_step(rule, 2, CRUSH_RULE_CHOOSE_INDEP, 1, 0); + crush_rule_set_step(rule, 3, CRUSH_RULE_EMIT, 0, 0); + crush_add_rule(crush.crush, CRUSH_RAID_RULE(i, pool), rule); + } else { + crush_rule *rule = crush_make_rule(3); + crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0); + crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_INDEP, i, 0); + crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0); + crush_add_rule(crush.crush, CRUSH_RAID_RULE(i, pool), rule); + } + } + + } else { + // one bucket + + int items[num_osd]; + for (int i=0; imax_devices << dendl; +} diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index e9d7cf467c9c3..6de80fd9fa148 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -69,7 +69,6 @@ public: public: ceph_fsid fsid; epoch_t epoch; // new epoch; we are a diff from epoch-1 to epoch - epoch_t mon_epoch; // monitor epoch (election iteration) utime_t ctime; // full (rare) @@ -78,6 +77,7 @@ public: // incremental int32_t new_max_osd; + __u8 mkfs; map new_up; map new_down; map new_offload; @@ -87,7 +87,7 @@ public: void encode(bufferlist& bl) { ::_encode(fsid, bl); ::_encode(epoch, bl); - ::_encode(mon_epoch, bl); + ::_encode(mkfs, bl); ctime._encode(bl); ::_encode(fullmap, bl); ::_encode(crush, bl); @@ -101,7 +101,7 @@ public: void decode(bufferlist& bl, int& off) { ::_decode(fsid, bl, off); ::_decode(epoch, bl, off); - ::_decode(mon_epoch, bl, off); + ::_decode(mkfs, bl, off); ctime._decode(bl, off); ::_decode(fullmap, bl, off); ::_decode(crush, bl, off); @@ -113,15 +113,14 @@ public: ::_decode(old_pg_swap_primary, bl, off); } - Incremental(epoch_t e=0) : epoch(e), mon_epoch(0), new_max_osd(-1) { + Incremental(epoch_t e=0) : epoch(e), new_max_osd(-1), mkfs(0) { fsid.major = fsid.minor = cpu_to_le64(0); } }; private: ceph_fsid fsid; - epoch_t epoch; // what epoch of the osd cluster descriptor is this - epoch_t mon_epoch; // monitor epoch (election iteration) + epoch_t epoch, mkfs_epoch; // what epoch of the osd cluster descriptor is this utime_t ctime, mtime; // epoch start time int32_t pg_num; // placement group count int32_t pg_num_mask; // bitmask for above @@ -140,7 +139,7 @@ private: friend class MDS; public: - OSDMap() : epoch(0), mon_epoch(0), + OSDMap() : epoch(0), mkfs_epoch((epoch_t)-1), pg_num(1<<5), localized_pg_num(1<<3), max_osd(0) { @@ -154,6 +153,7 @@ private: epoch_t get_epoch() const { return epoch; } void inc_epoch() { epoch++; } + void set_epoch(epoch_t e) { epoch = e; } /* pg num / masks */ void calc_pg_masks() { @@ -169,8 +169,9 @@ private: const utime_t& get_ctime() const { return ctime; } const utime_t& get_mtime() const { return mtime; } - bool is_mkfs() const { return epoch == 2; } - bool post_mkfs() const { return epoch > 2; } + bool is_mkfs() const { return epoch == mkfs_epoch; } + bool post_mkfs() const { return epoch > mkfs_epoch; } + epoch_t get_mkfs_epoch() const { return mkfs_epoch; } /***** cluster state *****/ /* osds */ @@ -273,7 +274,6 @@ private: assert(ceph_fsid_equal(&inc.fsid, &fsid) || inc.epoch == 1); assert(inc.epoch == epoch+1); epoch++; - mon_epoch = inc.mon_epoch; ctime = inc.ctime; // full map? @@ -287,6 +287,9 @@ private: } // nope, incremental. + if (inc.mkfs) + mkfs_epoch = epoch; + if (inc.new_max_osd >= 0) set_max_osd(inc.new_max_osd); @@ -325,7 +328,7 @@ private: void encode(bufferlist& blist) { ::_encode(fsid, blist); ::_encode(epoch, blist); - ::_encode(mon_epoch, blist); + ::_encode(mkfs_epoch, blist); ::_encode(ctime, blist); ::_encode(mtime, blist); ::_encode(pg_num, blist); @@ -345,7 +348,7 @@ private: int off = 0; ::_decode(fsid, blist, off); ::_decode(epoch, blist, off); - ::_decode(mon_epoch, blist, off); + ::_decode(mkfs_epoch, blist, off); ::_decode(ctime, blist, off); ::_decode(mtime, blist, off); ::_decode(pg_num, blist, off); @@ -585,7 +588,12 @@ private: } - + /* + * handy helpers to build simple maps... + */ + void build_simple(epoch_t e, ceph_fsid &fsid, + int num_osd, int pg_bits, int mds_local_osd); + static void build_simple_crush_map(CrushWrapper& crush, int num_osd, map& weights); }; diff --git a/src/osdmaptool.cc b/src/osdmaptool.cc new file mode 100644 index 0000000000000..48ea54fd42cfb --- /dev/null +++ b/src/osdmaptool.cc @@ -0,0 +1,161 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include +#include +#include +#include + +#include +#include +#include +using namespace std; + +#include "config.h" + +#include "osd/OSDMap.h" +#include "mon/MonMap.h" + +void usage(const char *me) +{ + cout << me << " usage: [--print] [--createsimple [--clobber] [--pgbits ]] " << std::endl; + exit(1); +} + +void printmap(const char *me, OSDMap *m) +{ + cout << me << ": osdmap: epoch " << m->get_epoch() << std::endl + << me << ": osdmap: fsid " << m->get_fsid() << std::endl; + /*for (unsigned i=0; imon_inst.size(); i++) + cout << me << ": osdmap: " //<< "mon" << i << " " + << m->mon_inst[i] << std::endl; + */ +} + +int read_file(const char *fn, bufferlist &bl) +{ + struct stat st; + int fd = ::open(fn, O_RDONLY); + if (fd < 0) { + cerr << "can't open " << fn << ": " << strerror(errno) << std::endl; + return -errno; + } + ::fstat(fd, &st); + bufferptr bp(st.st_size); + bl.append(bp); + ::read(fd, (void*)bl.c_str(), bl.length()); + ::close(fd); + return 0; +} + +int write_file(const char *fn, bufferlist &bl) +{ + int fd = ::open(fn, O_WRONLY|O_CREAT|O_TRUNC, 0644); + if (fd < 0) { + cerr << "can't write " << fn << ": " << strerror(errno) << std::endl; + return -errno; + } + ::write(fd, (void*)bl.c_str(), bl.length()); + ::close(fd); + return 0; +} + +int main(int argc, const char **argv) +{ + vector args; + argv_to_vec(argc, argv, args); + parse_config_options(args); + + const char *me = argv[0]; + + const char *fn = 0; + bool print = false; + bool createsimple = false; + const char *monmapfn; + int num_osd; + int pg_bits = g_conf.osd_pg_bits; + bool clobber = false; + bool modified = false; + list add, rm; + + for (unsigned i=0; i= 0); + } + + + return 0; +} diff --git a/src/start.sh b/src/start.sh index 4bd143130b5b2..765e80dcb7111 100755 --- a/src/start.sh +++ b/src/start.sh @@ -21,14 +21,27 @@ then echo fi -$CEPH_BIN/monmaptool --create --clobber --add $IP:12345 --print # your IP here +# build a fresh fs monmap, mon fs +$CEPH_BIN/monmaptool --create --clobber --add $IP:12345 --print .ceph_monmap +$CEPH_BIN/mkmonfs --clobber mondata/mon0 --mon 0 --monmap .ceph_monmap -ARGS="-d --bind $IP --doutdir out --debug_ms 1" -$CEPH_BIN/cmon $ARGS --mkfs --mon 0 +# shared args +ARGS="-d --bind $IP -o out --debug_ms 1" + +# start monitor +$CEPH_BIN/cmon $ARGS mondata/mon0 --debug_mon 10 --debug_ms 1 + +# build and inject an initial osd map +$CEPH_BIN/osdmaptool --clobber --createsimple .ceph_monmap 4 --print .ceph_osdmap +$CEPH_BIN/cmonctl osd setmap -i .ceph_osdmap + +# osds $CEPH_BIN/cosd $ARGS --mkfs --osd 0 $CEPH_BIN/cosd $ARGS --mkfs --osd 1 $CEPH_BIN/cosd $ARGS --mkfs --osd 2 $CEPH_BIN/cosd $ARGS --mkfs --osd 3 + +# mds $CEPH_BIN/cmds $ARGS --debug_mds 10 echo "started. stop.sh to stop. see out/* (e.g. 'tail -f out/????') for debug output." -- 2.39.5