# monitor
cmon_SOURCES = cmon.cc msg/SimpleMessenger.cc
cmon_LDADD = libmon.a libcrush.a libcommon.a
-mkmonmap_SOURCES = mkmonmap.cc
-mkmonmap_LDADD = libcommon.a
-monmaptool_SOURCES = monmaptool.cc
-monmaptool_LDADD = libcommon.a
cmonctl_SOURCES = cmonctl.cc msg/SimpleMessenger.cc
cmonctl_LDADD = libcommon.a
+
+# tools
+mkmonfs_SOURCES = mkmonfs.cc
+mkmonfs_LDADD = libmon.a libcommon.a libcrush.a
+monmaptool_SOURCES = monmaptool.cc
+monmaptool_LDADD = libcommon.a
crushtool_SOURCES = crushtool.cc
crushtool_LDADD = libcommon.a libcrush.a
+osdmaptool_SOURCES = osdmaptool.cc
+osdmaptool_LDADD = libmon.a libcommon.a libcrush.a
# mds
cmds_SOURCES = cmds.cc msg/SimpleMessenger.cc
bin_PROGRAMS = \
cmon cmds cosd csyn \
- mkmonmap monmaptool crushtool cmonctl \
+ cmonctl \
+ mkmonfs monmaptool osdmaptool crushtool \
fakesyn \
streamtest \
$(FUSEBIN) $(NEWSYN)
common/Finisher.cc \
mon/MonMap.cc \
mon/MonClient.cc \
+ osd/OSDMap.cc \
config.cc
libcrush_a_SOURCES = \
- libbtrfs objectstore
- osdmap/crush gui
+
+- online changes to pg_num
+- initialization of pg_num during non-simple mkfs...
+- review/remove post_mkfs hacks in PG.cc??
+
code cleanup
- userspace encoding/decoding needs major cleanup
- use le32 etc annotation
#include "mon/MonMap.h"
#include "mon/Monitor.h"
+#include "mon/MonitorStore.h"
#include "msg/SimpleMessenger.h"
#include "common/Timer.h"
+void usage()
+{
+ cerr << "usage: ./cmon [flags] <monfsdir>" << std::endl;
+ cerr << " -d daemonize" << std::endl;
+ cerr << " -o <dir> log output to dir/mon#" << std::endl;
+ cerr << " --debug_mon n debug monitor level (e.g. 10)" << std::endl;
+ cerr << " --debug_ms n debug messaging level (e.g. 1)" << std::endl;
+ exit(1);
+}
int main(int argc, const char **argv)
{
+ int err;
+
vector<const char*> args;
argv_to_vec(argc, argv, args);
parse_config_options(args);
// args
- int whoami = -1;
- const char *monmap_fn = ".ceph_monmap";
+ const char *fsdir = 0;
for (unsigned i=0; i<args.size(); i++) {
- if (strcmp(args[i], "--mon") == 0)
- whoami = atoi(args[++i]);
- else if (strcmp(args[i], "--monmap") == 0)
- monmap_fn = args[++i];
- else {
- cerr << "unrecognized arg " << args[i] << std::endl;
- return -1;
- }
+ if (!fsdir)
+ fsdir = args[i];
+ else
+ usage();
}
+ if (!fsdir)
+ usage();
if (g_conf.clock_tare) g_clock.tare();
+ MonitorStore store(fsdir);
+ err = store.mount();
+ if (err < 0) {
+ cerr << "problem opening monitor store in " << fsdir << ": " << strerror(err) << std::endl;
+ exit(1);
+ }
+
+ // whoami?
+ if (!store.exists_bl_ss("whoami")) {
+ cerr << "mon fs missing 'whoami'" << std::endl;
+ exit(1);
+ }
+ int whoami = store.get_int("whoami");
+
+ // monmap?
+ bufferlist mapbl;
+ store.get_bl_ss(mapbl, "monmap", 0);
+ if (mapbl.length() == 0) {
+ cerr << "mon fs missing 'monmap'" << std::endl;
+ exit(1);
+ }
MonMap monmap;
+ monmap.decode(mapbl);
- if (whoami < 0) {
- // let's assume a standalone monitor
- whoami = 0;
-
- // start messenger
- rank.bind(0);
- cout << "starting standalone mon0, bound to " << rank.get_rank_addr() << std::endl;
-
- // add single mon0
- entity_inst_t inst;
- inst.name = entity_name_t::MON(0);
- inst.addr = rank.rank_addr;
- monmap.add_mon(inst);
-
- // write monmap
- cout << "writing monmap to " << monmap_fn << std::endl;;
- int r = monmap.write(monmap_fn);
- if (r < 0) {
- cerr << "couldn't write monmap to " << monmap_fn
- << ": " << strerror(errno) << std::endl;
- return -1;
- }
- } else {
- // i am specific monitor.
-
- // read monmap
- //cout << "reading monmap from " << monmap_fn << std::endl;
- int r = monmap.read(monmap_fn);
- if (r < 0) {
- cerr << "couldn't read monmap from " << monmap_fn
- << ": " << strerror(errno) << std::endl;
- return -1;
- }
-
- // bind to a specific port
- cout << "starting mon" << whoami << " at " << monmap.get_inst(whoami).addr
- << " from " << monmap_fn
- << std::endl;
- g_my_addr = monmap.get_inst(whoami).addr;
- rank.bind();
+ if ((unsigned)whoami >= monmap.size() || whoami < 0) {
+ cerr << "mon" << whoami << " does not exist in monmap" << std::endl;
+ exit(1);
}
+ // bind
+ cout << "starting mon" << whoami
+ << " at " << monmap.get_inst(whoami).addr
+ << " from " << fsdir << std::endl;
+ g_my_addr = monmap.get_inst(whoami).addr;
+ err = rank.bind();
+ if (err < 0)
+ return 1;
+
create_courtesy_output_symlink("mon", whoami);
- rank.start();
-
// start monitor
Messenger *m = rank.register_entity(entity_name_t::MON(whoami));
- Monitor *mon = new Monitor(whoami, m, &monmap);
- mon->init();
+ Monitor *mon = new Monitor(whoami, &store, m, &monmap);
+ rank.start(); // may daemonize
+ mon->init();
rank.wait();
- // done
+ store.umount();
delete mon;
-
return 0;
}
#include "config.h"
#include "mon/Monitor.h"
-
+#include "mon/MonitorStore.h"
#include "mds/MDS.h"
#include "osd/OSD.h"
#include "client/Client.h"
#include "common/Timer.h"
#include "msg/FakeMessenger.h"
+#include "messages/MMonCommand.h"
Monitor *mon[g_conf.num_mon];
for (int i=0; i<g_conf.num_mon; i++) {
- mon[i] = new Monitor(i, new FakeMessenger(entity_name_t::MON(i)), monmap);
+ char fn[100];
+ sprintf(fn, "mondata/mon%d", i);
+ MonitorStore *store = new MonitorStore(fn);
+ mon[i] = new Monitor(i, store, new FakeMessenger(entity_name_t::MON(i)), monmap);
+ mon[i]->mkfs();
}
// create osd
}
// init
- for (int i=0; i<g_conf.num_mon; i++)
+ for (int i=0; i<g_conf.num_mon; i++)
mon[i]->init();
- for (int i=0; i<NUMMDS; i++)
- mds[i]->init();
+
+ // build initial osd map
+ {
+ OSDMap map;
+ map.build_simple(0, monmap->fsid, g_conf.num_osd, g_conf.osd_pg_bits, 0);
+ bufferlist bl;
+ map.encode(bl);
+ Messenger *messenger = new FakeMessenger(entity_name_t::ADMIN(-1));
+ MMonCommand *m = new MMonCommand(messenger->get_myinst());
+ m->set_data(bl);
+ m->cmd.push_back("osd");
+ m->cmd.push_back("setmap");
+ messenger->send_message(m, monmap->get_inst(0));
+ messenger->shutdown();
+ }
+
for (int i=0; i<NUMOSD; i++)
osd[i]->init();
-
+ for (int i=0; i<NUMMDS; i++)
+ mds[i]->init();
// create client
Client *client[NUMCLIENT];
#include "mds/MDS.h"
#include "osd/OSD.h"
#include "mon/Monitor.h"
+#include "mon/MonitorStore.h"
#include "client/Client.h"
#include "client/SyntheticClient.h"
#include "msg/FakeMessenger.h"
+#include "messages/MMonCommand.h"
#include "common/Timer.h"
// create mon
Monitor *mon[g_conf.num_mon];
- for (int i=0; i<g_conf.num_mon; i++)
- mon[i] = new Monitor(i, new FakeMessenger(entity_name_t::MON(i)), monmap);
+ for (int i=0; i<g_conf.num_mon; i++) {
+ char fn[100];
+ sprintf(fn, "mondata/mon%d", i);
+ MonitorStore *store = new MonitorStore(fn);
+ mon[i] = new Monitor(i, store, new FakeMessenger(entity_name_t::MON(i)), monmap);
+ mon[i]->mkfs();
+ }
// create mds
MDS *mds[g_conf.num_mds];
start++;
}
+ // build initial osd map
+ {
+ OSDMap map;
+ map.build_simple(0, monmap->fsid, g_conf.num_osd, g_conf.osd_pg_bits, 0);
+ bufferlist bl;
+ map.encode(bl);
+ Messenger *messenger = new FakeMessenger(entity_name_t::ADMIN(-1));
+ MMonCommand *m = new MMonCommand(messenger->get_myinst());
+ m->set_data(bl);
+ m->cmd.push_back("osd");
+ m->cmd.push_back("setmap");
+ messenger->send_message(m, monmap->get_inst(0));
+ messenger->shutdown();
+ }
+
// create osd
OSD *osd[g_conf.num_osd];
for (int i=0; i<g_conf.num_osd; i++) {
//cerr << "osd" << i << " on rank " << myrank << " " << hostname << "." << pid << std::endl;
osd[i] = new OSD(i, new FakeMessenger(entity_name_t::OSD(i)), monmap);
start++;
- }
-
+ }
// start message loop
goto bad;
if ((err = ceph_decode_32(p, end, &map->epoch)) < 0)
goto bad;
- if ((err = ceph_decode_32(p, end, &map->mon_epoch)) < 0)
+ if ((err = ceph_decode_32(p, end, &map->mkfs_epoch)) < 0)
goto bad;
if ((err = ceph_decode_32(p, end, &map->ctime.tv_sec)) < 0)
goto bad;
struct ceph_osdmap *newmap = map;
struct crush_map *newcrush = 0;
struct ceph_fsid fsid;
- __u32 epoch, mon_epoch;
+ __u32 epoch;
struct ceph_timeval ctime;
__u32 len;
__u32 max;
goto bad;
if ((err = ceph_decode_32(p, end, &epoch)) < 0)
goto bad;
+ (*p)++; /* skip mkfs u8 */
BUG_ON(epoch != map->epoch+1);
- if ((err = ceph_decode_32(p, end, &mon_epoch)) < 0)
- goto bad;
if ((err = ceph_decode_32(p, end, &ctime.tv_sec)) < 0)
goto bad;
if ((err = ceph_decode_32(p, end, &ctime.tv_usec)) < 0)
}
map->epoch++;
- map->mon_epoch = mon_epoch;
map->ctime = map->ctime;
if (newcrush) {
if (map->crush)
struct ceph_osdmap {
struct ceph_fsid fsid;
ceph_epoch_t epoch;
- ceph_epoch_t mon_epoch;
+ ceph_epoch_t mkfs_epoch;
struct ceph_timeval ctime, mtime;
__u32 pg_num, pg_num_mask;
mdsmap->is_up(whoami) &&
(oldwhoami != whoami || !logger)) {
create_courtesy_output_symlink("mds", whoami);
- reopen_logger(mdsmap->get_create()); // adopt mds cluster timeline
+ reopen_logger(mdsmap->get_created()); // adopt mds cluster timeline
}
if (oldwhoami != whoami) {
epoch_t get_epoch() const { return epoch; }
void inc_epoch() { epoch++; }
- const utime_t& get_create() const { return created; }
+ const utime_t& get_created() const { return created; }
+ void set_created(utime_t ct) { created = ct; }
+
+ int get_max_mds() const { return max_mds; }
+ void set_max_mds(int m) { max_mds = m; }
int get_anchortable() const { return anchortable; }
int get_root() const { return root; }
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "mon/MonitorStore.cc"
+#include "config.h"
+
+#include "mon/Monitor.h"
+#include "mon/MonMap.h"
+#include "mds/MDSMap.h"
+#include "osd/OSDMap.h"
+#include "mon/PGMap.h"
+
+void usage()
+{
+ cerr << "usage: ./mkmonfs [--clobber] <monfs dir> --mon <monid> --monmap <file> --osdmap <file>" << std::endl;
+ cerr << " --maxmds N set max mds (default is 1)" << std::endl;
+ exit(1);
+}
+
+int read_file(const char *fn, bufferlist &bl)
+{
+ struct stat st;
+ int fd = ::open(fn, O_RDONLY);
+ if (fd < 0) {
+ cerr << "can't open " << fn << ": " << strerror(errno) << std::endl;
+ return -errno;
+ }
+ ::fstat(fd, &st);
+ bufferptr bp(st.st_size);
+ bl.append(bp);
+ ::read(fd, (void*)bl.c_str(), bl.length());
+ ::close(fd);
+ return 0;
+}
+
+
+int main(int argc, const char **argv)
+{
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+
+ bool clobber = false;
+ const char *fsdir = 0;
+ int whoami = -1;
+ const char *monmapfn = 0;
+ for (unsigned i = 0; i < args.size(); i++) {
+ if (strcmp(args[i], "--clobber") == 0)
+ clobber = true;
+ else if (strcmp(args[i], "--mon") == 0)
+ whoami = atoi(args[++i]);
+ else if (strcmp(args[i], "--monmap") == 0)
+ monmapfn = args[++i];
+ else if (!fsdir)
+ fsdir = args[i];
+ else
+ usage();
+ }
+ if (!fsdir || !monmapfn ||
+ whoami < 0)
+ usage();
+
+ if (!clobber) {
+ // make sure it doesn't exist
+ struct stat st;
+ if (::lstat(fsdir, &st) == 0) {
+ cerr << "monfs dir " << fsdir << " already exists; remove it first" << std::endl;
+ usage();
+ }
+ }
+
+ // load monmap
+ bufferlist monmapbl;
+ int err = read_file(monmapfn, monmapbl);
+ if (err < 0)
+ exit(1);
+ MonMap monmap;
+ monmap.decode(monmapbl);
+
+ // go
+ MonitorStore store(fsdir);
+ Monitor mon(whoami, &store, 0, &monmap);
+ mon.mkfs();
+ cout << argv[0] << ": created monfs at " << fsdir
+ << " for mon" << whoami
+ << std::endl;
+ return 0;
+}
void ClientMonitor::create_pending()
{
- assert(mon->is_leader());
pending_inc = Incremental();
pending_inc.version = client_map.version + 1;
pending_inc.next_client = client_map.next_client;
void ClientMonitor::create_initial()
{
- dout(1) << "create_initial -- creating initial map" << dendl;
+ dout(10) << "create_initial -- creating initial map" << dendl;
}
void ClientMonitor::committed()
void ClientMonitor::encode_pending(bufferlist &bl)
{
- assert(mon->is_leader());
dout(10) << "encode_pending v " << pending_inc.version
<< ", next is " << pending_inc.next_client
<< dendl;
void MDSMonitor::create_initial()
{
dout(10) << "create_initial" << dendl;
- pending_mdsmap.max_mds = g_conf.num_mds;
+ pending_mdsmap.max_mds = 1;
pending_mdsmap.created = g_clock.now();
print_map(pending_mdsmap);
}
// read
bufferlist bl;
int fd = ::open(fn, O_RDONLY);
- if (fd < 0) return fd;
+ if (fd < 0) return -errno;
struct stat st;
::fstat(fd, &st);
bufferptr bp(st.st_size);
#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << whoami << (is_starting() ? (const char*)"(starting)":(is_leader() ? (const char*)"(leader)":(is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << " "
+void Monitor::preinit()
+{
+ osdmon = new OSDMonitor(this, &paxos_osdmap);
+ mdsmon = new MDSMonitor(this, &paxos_mdsmap);
+ clientmon = new ClientMonitor(this, &paxos_clientmap);
+ pgmon = new PGMonitor(this, &paxos_pgmap);
+}
void Monitor::init()
{
dout(1) << "init" << dendl;
- // store
- char s[80];
- sprintf(s, "mondata/mon%d", whoami);
- store = new MonitorStore(s);
-
- if (g_conf.mkfs)
- store->mkfs();
-
- store->mount();
-
- // create
- osdmon = new OSDMonitor(this, &paxos_osdmap);
- mdsmon = new MDSMonitor(this, &paxos_mdsmap);
- clientmon = new ClientMonitor(this, &paxos_clientmap);
- pgmon = new PGMonitor(this, &paxos_pgmap);
+ preinit();
// init paxos
paxos_osdmap.init();
+/*
+ * this is the closest thing to a traditional 'mkfs' for ceph.
+ * initialize the monitor state machines to their initial values.
+ */
+int Monitor::mkfs()
+{
+ preinit();
+
+ // create it
+ int err = store->mkfs();
+ if (err < 0) {
+ cerr << "error " << err << " " << strerror(err) << std::endl;
+ exit(1);
+ }
+
+ store->put_int(whoami, "whoami", 0);
+
+ bufferlist monmapbl;
+ monmap->encode(monmapbl);
+ store->put_bl_ss(monmapbl, "monmap", 0);
+
+ list<PaxosService*> services;
+ services.push_back(osdmon);
+ services.push_back(mdsmon);
+ services.push_back(clientmon);
+ services.push_back(pgmon);
+ for (list<PaxosService*>::iterator p = services.begin();
+ p != services.end();
+ p++) {
+ PaxosService *svc = *p;
+ dout(10) << "initializing " << svc->get_machine_name() << dendl;
+ svc->paxos->init();
+ svc->create_pending();
+ svc->create_initial();
+
+ // commit to paxos
+ bufferlist bl;
+ svc->encode_pending(bl);
+ store->put_bl_sn(bl, svc->get_machine_name(), 1);
+ store->put_int(1, svc->get_machine_name(), "last_committed");
+ }
+
+ return 0;
+}
};
public:
- Monitor(int w, Messenger *m, MonMap *mm) :
+ Monitor(int w, MonitorStore *s, Messenger *m, MonMap *map) :
whoami(w),
messenger(m),
- monmap(mm),
+ monmap(map),
timer(lock), tick_timer(0),
- store(0),
+ store(s),
state(STATE_STARTING), stopping(false),
delete messenger;
}
+ void preinit();
void init();
void shutdown();
void dispatch(Message *m);
void do_stop();
+ int mkfs();
+
};
#endif
#include <errno.h>
#include <unistd.h>
-void MonitorStore::mount()
+int MonitorStore::mount()
{
dout(1) << "mount" << dendl;
// verify dir exists
DIR *d = ::opendir(dir.c_str());
if (!d) {
derr(1) << "basedir " << dir << " dne" << dendl;
- assert(0);
+ return -ENOENT;
}
::closedir(d);
dir += "/";
dir += old;
}
+ return 0;
}
-void MonitorStore::mkfs()
+int MonitorStore::mkfs()
{
dout(1) << "mkfs" << dendl;
char cmd[200];
sprintf(cmd, "test -d %s && /bin/rm -r %s ; mkdir -p %s", dir.c_str(), dir.c_str(), dir.c_str());
dout(1) << cmd << dendl;
- system(cmd);
+ int r = system(cmd);
+ return r;
}
string dir;
public:
- MonitorStore(char *d) : dir(d) {
- }
- ~MonitorStore() {
- }
+ MonitorStore(const char *d) : dir(d) { }
+ ~MonitorStore() { }
- void mkfs(); // wipe
- void mount();
+ int mkfs(); // wipe
+ int mount();
+ int umount() { return 0; }
// ints (stored as ascii)
version_t get_int(const char *a, const char *b=0);
/************ MAPS ****************/
+
void OSDMonitor::create_initial()
{
- assert(mon->is_leader());
- assert(paxos->get_version() == 0);
-
- dout(1) << "create_initial for " << mon->monmap->fsid << " from g_conf" << dendl;
+ dout(10) << "create_initial for " << mon->monmap->fsid << " from g_conf" << dendl;
- // <HACK set up OSDMap from g_conf>
OSDMap newmap;
+ newmap.epoch = 1;
newmap.set_fsid(mon->monmap->fsid);
- newmap.mon_epoch = mon->mon_epoch;
newmap.ctime = g_clock.now();
-
- newmap.set_pg_num(g_conf.num_osd << g_conf.osd_pg_bits);
-
- // start at epoch 1 until all osds boot
- newmap.inc_epoch(); // = 1
- assert(newmap.get_epoch() == 1);
-
- map<int,double> weights;
- build_crush_map(newmap.crush, weights);
-
- // -- test --
-#if 0
- {
- //vector<int> t;
- //crush.do_rule(2, 132, t, 4, -1);
-
- // 3x5p2
- int n = 4;
- int x = 0;
- int p = 2;
- vector<int> r(n);
- newmap.mark_down(0, false);
- newmap.mark_out(0);
- newmap.crush.do_rule(CRUSH_REP_RULE(n), x, r, n, p);
- dout(0) << "test out " << r << dendl;
- }
-#endif
-
- newmap.set_max_osd(g_conf.num_osd);
- for (int i=0; i<g_conf.num_osd; i++) {
- newmap.set_state(i, CEPH_OSD_EXISTS|CEPH_OSD_CLEAN);
- newmap.set_offload(i, CEPH_OSD_IN);
- }
-
- if (g_conf.mds_local_osd) {
- newmap.set_max_osd(g_conf.num_mds+g_conf.num_osd);
-
- // add mds local osds, but don't put them in the crush mapping func
- for (int i=0; i<g_conf.num_mds; i++) {
- newmap.set_max_osd(i+g_conf.num_osd);
- newmap.set_state(i, CEPH_OSD_EXISTS);
- newmap.set_offload(i, CEPH_OSD_IN);
- }
- }
-
- // </HACK>
-
- // fake osd failures
- for (map<int,float>::iterator i = g_fake_osd_down.begin();
- i != g_fake_osd_down.end();
- i++) {
- dout(0) << "will fake osd" << i->first << " DOWN after " << i->second << dendl;
- mon->timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 1));
- }
- for (map<int,float>::iterator i = g_fake_osd_out.begin();
- i != g_fake_osd_out.end();
- i++) {
- dout(0) << "will fake osd" << i->first << " OUT after " << i->second << dendl;
- mon->timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 0));
- }
+ newmap.crush.create(); // empty crush map
// encode into pending incremental
newmap.encode(pending_inc.fullmap);
}
-void OSDMonitor::build_crush_map(CrushWrapper& crush,
- map<int,double>& weights)
-{
- // new
- crush.create();
-
- if (g_conf.num_osd >= 12) {
- int ndom = g_conf.osd_max_rep;
- int ritems[ndom];
- int rweights[ndom];
-
- int nper = ((g_conf.num_osd - 1) / ndom) + 1;
- derr(0) << ndom << " failure domains, " << nper << " osds each" << dendl;
-
- int o = 0;
- for (int i=0; i<ndom; i++) {
- int items[nper];
- //int w[nper];
- int j;
- rweights[i] = 0;
- for (j=0; j<nper; j++, o++) {
- if (o == g_conf.num_osd) break;
- dout(20) << "added osd" << o << dendl;
- items[j] = o;
- //w[j] = weights[o] ? (0x10000 - (int)(weights[o] * 0x10000)):0x10000;
- //rweights[i] += w[j];
- rweights[i] += 0x10000;
- }
- crush_bucket_uniform *domain = crush_make_uniform_bucket(1, j, items, 0x10000);
- ritems[i] = crush_add_bucket(crush.crush, 0, (crush_bucket*)domain);
- dout(20) << "added domain bucket i " << ritems[i] << " of size " << j << dendl;
- }
-
- // root
- crush_bucket_list *root = crush_make_list_bucket(2, ndom, ritems, rweights);
- int rootid = crush_add_bucket(crush.crush, 0, (crush_bucket*)root);
-
- // rules
- // replication
- for (int pool=0; pool<1; pool++)
- for (int i=1; i<=ndom; i++) {
- crush_rule *rule = crush_make_rule(4);
- crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
- crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_FIRSTN, i, 1);
- crush_rule_set_step(rule, 2, CRUSH_RULE_CHOOSE_FIRSTN, 1, 0);
- crush_rule_set_step(rule, 3, CRUSH_RULE_EMIT, 0, 0);
- crush_add_rule(crush.crush, CRUSH_REP_RULE(i, pool), rule);
- }
-
- // raid
- for (int pool=0; pool<1; pool++)
- for (int i=g_conf.osd_min_raid_width; i <= g_conf.osd_max_raid_width; i++) {
- if (ndom >= i) {
- crush_rule *rule = crush_make_rule(4);
- crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
- crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_INDEP, i, 1);
- crush_rule_set_step(rule, 2, CRUSH_RULE_CHOOSE_INDEP, 1, 0);
- crush_rule_set_step(rule, 3, CRUSH_RULE_EMIT, 0, 0);
- crush_add_rule(crush.crush, CRUSH_RAID_RULE(i, pool), rule);
- } else {
- crush_rule *rule = crush_make_rule(3);
- crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
- crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_INDEP, i, 0);
- crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
- crush_add_rule(crush.crush, CRUSH_RAID_RULE(i, pool), rule);
- }
- }
-
- } else {
- // one bucket
-
- int items[g_conf.num_osd];
- for (int i=0; i<g_conf.num_osd; i++)
- items[i] = i;
-
- crush_bucket_uniform *b = crush_make_uniform_bucket(1, g_conf.num_osd, items, 0x10000);
- int root = crush_add_bucket(crush.crush, 0, (crush_bucket*)b);
-
- // rules
- // replication
- for (int pool=0; pool<1; pool++)
- for (int i=1; i<=g_conf.osd_max_rep; i++) {
- crush_rule *rule = crush_make_rule(3);
- crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, root, 0);
- crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_FIRSTN, i, 0);
- crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
- crush_add_rule(crush.crush, CRUSH_REP_RULE(i, pool), rule);
- }
-
- // raid4
- for (int pool=0; pool<1; pool++)
- for (int i=g_conf.osd_min_raid_width; i <= g_conf.osd_max_raid_width; i++) {
- crush_rule *rule = crush_make_rule(3);
- crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, root, 0);
- crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_INDEP, i, 0);
- crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
- crush_add_rule(crush.crush, CRUSH_RAID_RULE(i, pool), rule);
- }
- }
-
- crush.finalize();
-
- // mark all in
- for (int i=0; i<g_conf.num_osd; i++)
- crush.set_offload(i, CEPH_OSD_IN);
-
- dout(20) << "crush max_devices " << crush.crush->max_devices << dendl;
-}
bool OSDMonitor::update_from_paxos()
int off = 0;
inc.decode(bl, off);
osdmap.apply_incremental(inc);
-
+
// write out the full map, too.
bl.clear();
osdmap.encode(bl);
// finish up pending_inc
pending_inc.ctime = g_clock.now();
- pending_inc.mon_epoch = mon->mon_epoch;
// tell me about it
for (map<int32_t,uint8_t>::iterator i = pending_inc.new_down.begin();
{
dout(10) << "should_propose" << dendl;
if (osdmap.epoch == 1) {
- if (pending_inc.new_up.size() == (unsigned)g_conf.num_osd) {
+ if (pending_inc.new_up.size() == (unsigned)osdmap.get_max_osd()) {
delay = 0.0;
if (g_conf.osd_auto_weight) {
CrushWrapper crush;
- build_crush_map(crush, osd_weight);
+ OSDMap::build_simple_crush_map(crush, osdmap.get_max_osd(), osd_weight);
crush._encode(pending_inc.crush);
}
return true;
osd_weight[from] = m->sb.weight;
+ if (!osdmap.post_mkfs() && !osdmap.is_mkfs())
+ pending_inc.mkfs = 1; // first set of up osds, do the mkfs!
+
// wait
paxos->wait_for_commit(new C_Booted(this, m));
}
paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs));
return true;
}
+ else if (m->cmd[1] == "setmap") {
+ OSDMap map;
+ map.decode(m->get_data());
+ map.epoch = pending_inc.epoch; // make sure epoch is correct
+ map.encode(pending_inc.fullmap);
+ string rs = "set osd map";
+ paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs));
+ return true;
+ }
else if (m->cmd[1] == "setmaxosd" && m->cmd.size() > 2) {
pending_inc.new_max_osd = atoi(m->cmd[2].c_str());
ss << "set new max_osd = " << pending_inc.new_max_osd;
map<int,double> osd_weight;
- void build_crush_map(CrushWrapper& crush,
- map<int,double>& weights);
-
// svc
+public:
void create_initial();
+private:
bool update_from_paxos();
void create_pending(); // prepare a new pending
void encode_pending(bufferlist &bl);
*/
void PGMonitor::create_initial()
{
- dout(1) << "create_initial -- creating initial map" << dendl;
+ dout(10) << "create_initial -- creating initial map" << dendl;
}
bool PGMonitor::update_from_paxos()
void PGMonitor::encode_pending(bufferlist &bl)
{
- assert(mon->is_leader());
dout(10) << "encode_pending v " << pending_inc.version << dendl;
assert(paxos->get_version() + 1 == pending_inc.version);
pending_inc._encode(bl);
bool Paxos::is_readable()
{
- //dout(15) << "is_readable now=" << g_clock.now() << " lease_expire=" << lease_expire << dendl;
+ dout(1) << "is_readable now=" << g_clock.now() << " lease_expire=" << lease_expire << dendl;
return
(mon->is_peon() || mon->is_leader()) &&
is_active() &&
lease_timeout_event(0),
accept_timeout_event(0) { }
+ const char *get_machine_name() const {
+ return machine_name;
+ }
+
void dispatch(Message *m);
void init();
#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_paxos) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".paxosservice(" << get_paxos_name(paxos->machine_id) << ") "
+const char *PaxosService::get_machine_name()
+{
+ return paxos->get_machine_name();
+}
void PaxosService::dispatch(Message *m)
{
dout(10) << "propose_pending" << dendl;
assert(have_pending);
+ assert(mon->is_leader());
if (proposal_timer) {
mon->timer.cancel_event(proposal_timer);
create_pending();
have_pending = true;
}
-
- if (g_conf.mkfs &&
- paxos->get_version() == 0) {
- create_initial();
- propose_pending();
- }
}
}
protected:
Monitor *mon;
Paxos *paxos;
+ friend class Monitor;
class C_RetryMessage : public Context {
PaxosService *svc;
PaxosService(Monitor *mn, Paxos *p) : mon(mn), paxos(p),
proposal_timer(0),
have_pending(false) { }
+
+ const char *get_machine_name();
// i implement and you ignore
void dispatch(Message *m);
void propose_pending(); // propose current pending as new paxos state
// you implement
+ virtual void create_initial() = 0;
virtual bool update_from_paxos() = 0; // assimilate latest state from paxos
virtual void create_pending() = 0; // [leader] create new pending structures
- virtual void create_initial() = 0; // [leader] populate pending with initial state (1)
virtual void encode_pending(bufferlist& bl) = 0; // [leader] finish and encode pending for next paxos state
virtual void discard_pending() { } // [leader] discard pending
./monmaptool -f .ceph_monmap --add 1.2.3.4:12345
./monmaptool -f .ceph_monmap --rm 1.2.3.4:12345
-
*/
void usage(const char *me)
{
- cout << me << " usage: [--print] [--create [--clobber]] [--add 1.2.3.4:567] [--rm 1.2.3.4:567]" << std::endl;
+ cout << me << " usage: [--print] [--create [--clobber]] [--add 1.2.3.4:567] [--rm 1.2.3.4:567] <mapfilename>" << std::endl;
exit(1);
}
const char *me = argv[0];
- const char *fn = ".ceph_monmap";
+ const char *fn = 0;
bool print = false;
bool create = false;
bool clobber = false;
list<entity_addr_t> add, rm;
for (unsigned i=0; i<args.size(); i++) {
- if (strcmp(args[i], "--fn") == 0)
- fn = args[++i];
- else if (strcmp(args[i], "--print") == 0)
+ if (strcmp(args[i], "--print") == 0)
print = true;
else if (strcmp(args[i], "--create") == 0)
create = true;
else
rm.push_back(addr);
modified = true;
- } else
+ } else if (!fn)
+ fn = args[i];
+ else
usage(me);
}
+ if (!fn)
+ usage(me);
MonMap monmap;
if (!print && !modified)
usage(me);
+ if (modified)
+ monmap.epoch++;
+
if (print)
printmap(me, &monmap);
if (modified) {
- monmap.epoch++;
-
// write it out
cout << me << ": writing epoch " << monmap.epoch
<< " to " << fn
{
vector<int> acting;
int nrep = osdmap->pg_to_acting_osds(pgid, acting);
+ dout(20) << "pgid " << pgid << " -> " << acting << dendl;
int role = osdmap->calc_pg_role(whoami, acting, nrep);
if (role < 0) return;
{
dout(7) << "advance_map epoch " << osdmap->get_epoch()
<< " " << pg_map.size() << " pgs"
+ << " mkfs_peoch " << osdmap->get_mkfs_epoch()
<< dendl;
if (osdmap->is_mkfs()) {
// is this okay?
- assert(superblock.current_epoch == 2);
ceph_fsid nullfsid;
memset(&nullfsid, 0, sizeof(nullfsid));
if (memcmp(&nullfsid, &superblock.fsid, sizeof(nullfsid)) != 0) {
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "OSDMap.h"
+
+#include "config.h"
+#define dout generic_dout
+#define derr generic_derr
+
+
+void OSDMap::build_simple(epoch_t e, ceph_fsid &fsid,
+ int num_osd, int pg_bits, int mds_local_osd)
+{
+ dout(10) << "build_simple on " << num_osd
+ << " osds with " << pg_bits << " pg bits per osd" << dendl;
+ epoch = e;
+ set_fsid(fsid);
+ ctime = g_clock.now();
+
+ set_max_osd(num_osd);
+ set_pg_num(num_osd << pg_bits);
+
+ // crush map
+ map<int,double> weights;
+ build_simple_crush_map(crush, num_osd, weights);
+
+ for (int i=0; i<num_osd; i++) {
+ set_state(i, CEPH_OSD_EXISTS|CEPH_OSD_CLEAN);
+ set_offload(i, CEPH_OSD_IN);
+ }
+
+ if (mds_local_osd) {
+ set_max_osd(mds_local_osd+num_osd);
+
+ // add mds local osds, but don't put them in the crush mapping func
+ for (int i=0; i<mds_local_osd; i++) {
+ set_max_osd(i+num_osd);
+ set_state(i, CEPH_OSD_EXISTS);
+ set_offload(i, CEPH_OSD_IN);
+ }
+ }
+}
+
+void OSDMap::build_simple_crush_map(CrushWrapper& crush, int num_osd, map<int,double>& weights)
+{
+ // new
+ crush.create();
+
+ if (num_osd >= 12) {
+ int ndom = g_conf.osd_max_rep;
+ int ritems[ndom];
+ int rweights[ndom];
+
+ int nper = ((num_osd - 1) / ndom) + 1;
+ derr(0) << ndom << " failure domains, " << nper << " osds each" << dendl;
+
+ int o = 0;
+ for (int i=0; i<ndom; i++) {
+ int items[nper];
+ //int w[nper];
+ int j;
+ rweights[i] = 0;
+ for (j=0; j<nper; j++, o++) {
+ if (o == num_osd) break;
+ dout(20) << "added osd" << o << dendl;
+ items[j] = o;
+ //w[j] = weights[o] ? (0x10000 - (int)(weights[o] * 0x10000)):0x10000;
+ //rweights[i] += w[j];
+ rweights[i] += 0x10000;
+ }
+
+ crush_bucket_uniform *domain = crush_make_uniform_bucket(1, j, items, 0x10000);
+ ritems[i] = crush_add_bucket(crush.crush, 0, (crush_bucket*)domain);
+ dout(20) << "added domain bucket i " << ritems[i] << " of size " << j << dendl;
+ }
+
+ // root
+ crush_bucket_list *root = crush_make_list_bucket(2, ndom, ritems, rweights);
+ int rootid = crush_add_bucket(crush.crush, 0, (crush_bucket*)root);
+
+ // rules
+ // replication
+ for (int pool=0; pool<1; pool++)
+ for (int i=1; i<=ndom; i++) {
+ crush_rule *rule = crush_make_rule(4);
+ crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
+ crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_FIRSTN, i, 1);
+ crush_rule_set_step(rule, 2, CRUSH_RULE_CHOOSE_FIRSTN, 1, 0);
+ crush_rule_set_step(rule, 3, CRUSH_RULE_EMIT, 0, 0);
+ crush_add_rule(crush.crush, CRUSH_REP_RULE(i, pool), rule);
+ }
+
+ // raid
+ for (int pool=0; pool<1; pool++)
+ for (int i=g_conf.osd_min_raid_width; i <= g_conf.osd_max_raid_width; i++) {
+ if (ndom >= i) {
+ crush_rule *rule = crush_make_rule(4);
+ crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
+ crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_INDEP, i, 1);
+ crush_rule_set_step(rule, 2, CRUSH_RULE_CHOOSE_INDEP, 1, 0);
+ crush_rule_set_step(rule, 3, CRUSH_RULE_EMIT, 0, 0);
+ crush_add_rule(crush.crush, CRUSH_RAID_RULE(i, pool), rule);
+ } else {
+ crush_rule *rule = crush_make_rule(3);
+ crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, rootid, 0);
+ crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_INDEP, i, 0);
+ crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
+ crush_add_rule(crush.crush, CRUSH_RAID_RULE(i, pool), rule);
+ }
+ }
+
+ } else {
+ // one bucket
+
+ int items[num_osd];
+ for (int i=0; i<num_osd; i++)
+ items[i] = i;
+
+ crush_bucket_uniform *b = crush_make_uniform_bucket(1, num_osd, items, 0x10000);
+ int root = crush_add_bucket(crush.crush, 0, (crush_bucket*)b);
+
+ // rules
+ // replication
+ for (int pool=0; pool<1; pool++)
+ for (int i=1; i<=g_conf.osd_max_rep; i++) {
+ crush_rule *rule = crush_make_rule(3);
+ crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, root, 0);
+ crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_FIRSTN, i, 0);
+ crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
+ crush_add_rule(crush.crush, CRUSH_REP_RULE(i, pool), rule);
+ }
+
+ // raid4
+ for (int pool=0; pool<1; pool++)
+ for (int i=g_conf.osd_min_raid_width; i <= g_conf.osd_max_raid_width; i++) {
+ crush_rule *rule = crush_make_rule(3);
+ crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, root, 0);
+ crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSE_INDEP, i, 0);
+ crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
+ crush_add_rule(crush.crush, CRUSH_RAID_RULE(i, pool), rule);
+ }
+ }
+
+ crush.finalize();
+
+ // mark all in
+ for (int i=0; i<num_osd; i++)
+ crush.set_offload(i, CEPH_OSD_IN);
+
+ dout(20) << "crush max_devices " << crush.crush->max_devices << dendl;
+}
public:
ceph_fsid fsid;
epoch_t epoch; // new epoch; we are a diff from epoch-1 to epoch
- epoch_t mon_epoch; // monitor epoch (election iteration)
utime_t ctime;
// full (rare)
// incremental
int32_t new_max_osd;
+ __u8 mkfs;
map<int32_t,entity_addr_t> new_up;
map<int32_t,uint8_t> new_down;
map<int32_t,uint32_t> new_offload;
void encode(bufferlist& bl) {
::_encode(fsid, bl);
::_encode(epoch, bl);
- ::_encode(mon_epoch, bl);
+ ::_encode(mkfs, bl);
ctime._encode(bl);
::_encode(fullmap, bl);
::_encode(crush, bl);
void decode(bufferlist& bl, int& off) {
::_decode(fsid, bl, off);
::_decode(epoch, bl, off);
- ::_decode(mon_epoch, bl, off);
+ ::_decode(mkfs, bl, off);
ctime._decode(bl, off);
::_decode(fullmap, bl, off);
::_decode(crush, bl, off);
::_decode(old_pg_swap_primary, bl, off);
}
- Incremental(epoch_t e=0) : epoch(e), mon_epoch(0), new_max_osd(-1) {
+ Incremental(epoch_t e=0) : epoch(e), new_max_osd(-1), mkfs(0) {
fsid.major = fsid.minor = cpu_to_le64(0);
}
};
private:
ceph_fsid fsid;
- epoch_t epoch; // what epoch of the osd cluster descriptor is this
- epoch_t mon_epoch; // monitor epoch (election iteration)
+ epoch_t epoch, mkfs_epoch; // what epoch of the osd cluster descriptor is this
utime_t ctime, mtime; // epoch start time
int32_t pg_num; // placement group count
int32_t pg_num_mask; // bitmask for above
friend class MDS;
public:
- OSDMap() : epoch(0), mon_epoch(0),
+ OSDMap() : epoch(0), mkfs_epoch((epoch_t)-1),
pg_num(1<<5),
localized_pg_num(1<<3),
max_osd(0) {
epoch_t get_epoch() const { return epoch; }
void inc_epoch() { epoch++; }
+ void set_epoch(epoch_t e) { epoch = e; }
/* pg num / masks */
void calc_pg_masks() {
const utime_t& get_ctime() const { return ctime; }
const utime_t& get_mtime() const { return mtime; }
- bool is_mkfs() const { return epoch == 2; }
- bool post_mkfs() const { return epoch > 2; }
+ bool is_mkfs() const { return epoch == mkfs_epoch; }
+ bool post_mkfs() const { return epoch > mkfs_epoch; }
+ epoch_t get_mkfs_epoch() const { return mkfs_epoch; }
/***** cluster state *****/
/* osds */
assert(ceph_fsid_equal(&inc.fsid, &fsid) || inc.epoch == 1);
assert(inc.epoch == epoch+1);
epoch++;
- mon_epoch = inc.mon_epoch;
ctime = inc.ctime;
// full map?
}
// nope, incremental.
+ if (inc.mkfs)
+ mkfs_epoch = epoch;
+
if (inc.new_max_osd >= 0)
set_max_osd(inc.new_max_osd);
void encode(bufferlist& blist) {
::_encode(fsid, blist);
::_encode(epoch, blist);
- ::_encode(mon_epoch, blist);
+ ::_encode(mkfs_epoch, blist);
::_encode(ctime, blist);
::_encode(mtime, blist);
::_encode(pg_num, blist);
int off = 0;
::_decode(fsid, blist, off);
::_decode(epoch, blist, off);
- ::_decode(mon_epoch, blist, off);
+ ::_decode(mkfs_epoch, blist, off);
::_decode(ctime, blist, off);
::_decode(mtime, blist, off);
::_decode(pg_num, blist, off);
}
-
+ /*
+ * handy helpers to build simple maps...
+ */
+ void build_simple(epoch_t e, ceph_fsid &fsid,
+ int num_osd, int pg_bits, int mds_local_osd);
+ static void build_simple_crush_map(CrushWrapper& crush, int num_osd, map<int,double>& weights);
};
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+
+#include <sys/stat.h>
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include "config.h"
+
+#include "osd/OSDMap.h"
+#include "mon/MonMap.h"
+
+void usage(const char *me)
+{
+ cout << me << " usage: [--print] [--createsimple <monmapfile> <numosd> [--clobber] [--pgbits <bitsperosd>]] <mapfilename>" << std::endl;
+ exit(1);
+}
+
+void printmap(const char *me, OSDMap *m)
+{
+ cout << me << ": osdmap: epoch " << m->get_epoch() << std::endl
+ << me << ": osdmap: fsid " << m->get_fsid() << std::endl;
+ /*for (unsigned i=0; i<m->mon_inst.size(); i++)
+ cout << me << ": osdmap: " //<< "mon" << i << " "
+ << m->mon_inst[i] << std::endl;
+ */
+}
+
+int read_file(const char *fn, bufferlist &bl)
+{
+ struct stat st;
+ int fd = ::open(fn, O_RDONLY);
+ if (fd < 0) {
+ cerr << "can't open " << fn << ": " << strerror(errno) << std::endl;
+ return -errno;
+ }
+ ::fstat(fd, &st);
+ bufferptr bp(st.st_size);
+ bl.append(bp);
+ ::read(fd, (void*)bl.c_str(), bl.length());
+ ::close(fd);
+ return 0;
+}
+
+int write_file(const char *fn, bufferlist &bl)
+{
+ int fd = ::open(fn, O_WRONLY|O_CREAT|O_TRUNC, 0644);
+ if (fd < 0) {
+ cerr << "can't write " << fn << ": " << strerror(errno) << std::endl;
+ return -errno;
+ }
+ ::write(fd, (void*)bl.c_str(), bl.length());
+ ::close(fd);
+ return 0;
+}
+
+int main(int argc, const char **argv)
+{
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+ parse_config_options(args);
+
+ const char *me = argv[0];
+
+ const char *fn = 0;
+ bool print = false;
+ bool createsimple = false;
+ const char *monmapfn;
+ int num_osd;
+ int pg_bits = g_conf.osd_pg_bits;
+ bool clobber = false;
+ bool modified = false;
+ list<entity_addr_t> add, rm;
+
+ for (unsigned i=0; i<args.size(); i++) {
+ if (strcmp(args[i], "--print") == 0)
+ print = true;
+ else if (strcmp(args[i], "--createsimple") == 0) {
+ createsimple = true;
+ monmapfn = args[++i];
+ num_osd = atoi(args[++i]);
+ } else if (strcmp(args[i], "--clobber") == 0)
+ clobber = true;
+ else if (strcmp(args[i], "--pgbits") == 0)
+ pg_bits = atoi(args[++i]);
+ else if (!fn)
+ fn = args[i];
+ else
+ usage(me);
+ }
+ if (!fn)
+ usage(me);
+
+ OSDMap osdmap;
+ bufferlist bl;
+
+ cout << me << ": osdmap file '" << fn << "'" << std::endl;
+
+ int r = 0;
+ if (!(createsimple && clobber))
+ r = read_file(fn, bl);
+ if (!createsimple && r < 0) {
+ cerr << me << ": couldn't open " << fn << ": " << strerror(errno) << std::endl;
+ return -1;
+ }
+ else if (createsimple && !clobber && r == 0) {
+ cerr << me << ": " << fn << " exists, --clobber to overwrite" << std::endl;
+ return -1;
+ }
+
+ if (!print && !modified)
+ usage(me);
+
+ if (createsimple) {
+ MonMap monmap;
+ int r = monmap.read(monmapfn);
+ if (r < 0) {
+ cerr << me << ": can't read monmap from " << monmapfn << ": " << strerror(r) << std::endl;
+ exit(1);
+ }
+ osdmap.build_simple(0, monmap.fsid, num_osd, pg_bits, 0);
+ modified = true;
+ }
+
+ if (modified)
+ osdmap.inc_epoch();
+
+ if (print)
+ printmap(me, &osdmap);
+
+ if (modified) {
+ osdmap.encode(bl);
+
+ // write it out
+ cout << me << ": writing epoch " << osdmap.get_epoch()
+ << " to " << fn
+ << std::endl;
+ int r = write_file(fn, bl);
+ assert(r >= 0);
+ }
+
+
+ return 0;
+}
echo
fi
-$CEPH_BIN/monmaptool --create --clobber --add $IP:12345 --print # your IP here
+# build a fresh fs monmap, mon fs
+$CEPH_BIN/monmaptool --create --clobber --add $IP:12345 --print .ceph_monmap
+$CEPH_BIN/mkmonfs --clobber mondata/mon0 --mon 0 --monmap .ceph_monmap
-ARGS="-d --bind $IP --doutdir out --debug_ms 1"
-$CEPH_BIN/cmon $ARGS --mkfs --mon 0
+# shared args
+ARGS="-d --bind $IP -o out --debug_ms 1"
+
+# start monitor
+$CEPH_BIN/cmon $ARGS mondata/mon0 --debug_mon 10 --debug_ms 1
+
+# build and inject an initial osd map
+$CEPH_BIN/osdmaptool --clobber --createsimple .ceph_monmap 4 --print .ceph_osdmap
+$CEPH_BIN/cmonctl osd setmap -i .ceph_osdmap
+
+# osds
$CEPH_BIN/cosd $ARGS --mkfs --osd 0
$CEPH_BIN/cosd $ARGS --mkfs --osd 1
$CEPH_BIN/cosd $ARGS --mkfs --osd 2
$CEPH_BIN/cosd $ARGS --mkfs --osd 3
+
+# mds
$CEPH_BIN/cmds $ARGS --debug_mds 10
echo "started. stop.sh to stop. see out/* (e.g. 'tail -f out/????') for debug output."