From 462b6a908c4c15d0f8e3a8a8e1d921b50871e694 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 18 Mar 2008 15:21:57 -0700 Subject: [PATCH] account for new pg parent split levels to allow pg_num increases; mon command handling cleanups --- src/messages/MOSDPGCreate.h | 1 + src/mon/MDSMonitor.cc | 9 ++-- src/mon/Monitor.cc | 6 ++- src/mon/OSDMonitor.cc | 54 +++++++++++----------- src/mon/OSDMonitor.h | 1 - src/mon/PGMonitor.cc | 89 ++++++++++++++++++++++++++++++------- src/mon/PGMonitor.h | 6 +++ src/osd/OSD.cc | 79 +++++--------------------------- src/osd/OSD.h | 3 +- src/osd/osd_types.h | 3 +- 10 files changed, 133 insertions(+), 118 deletions(-) diff --git a/src/messages/MOSDPGCreate.h b/src/messages/MOSDPGCreate.h index 70a3c1a028b0c..ba1ee668b8c08 100644 --- a/src/messages/MOSDPGCreate.h +++ b/src/messages/MOSDPGCreate.h @@ -27,6 +27,7 @@ struct MOSDPGCreate : public Message { struct create_rec { epoch_t created; // epoch pg created pg_t parent; // split from parent (if != pg_t()) + int split_bits; }; map mkpg; diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index f5678d9e7e3ae..6de2d0dc68de5 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -22,7 +22,6 @@ #include "messages/MMDSGetMap.h" #include "messages/MMDSBeacon.h" #include "messages/MMonCommand.h" -#include "messages/MMonCommandAck.h" #include "messages/MGenericMessage.h" @@ -256,7 +255,10 @@ bool MDSMonitor::prepare_update(Message *m) case MSG_MDS_BEACON: return handle_beacon((MMDSBeacon*)m); - + + case MSG_MON_COMMAND: + return prepare_command((MMonCommand*)m); + default: assert(0); delete m; @@ -509,7 +511,8 @@ bool MDSMonitor::prepare_command(MMonCommand *m) pending_mdsmap.mds_state[who] = MDSMap::STATE_STOPPING; } else { r = -EEXIST; - ss << "mds" << who << " not active (" << mdsmap.get_state_name(mdsmap.get_state(who)) << ")"; + ss << "mds" << who << " not active (" + << mdsmap.get_state_name(mdsmap.get_state(who)) << ")"; } } else if (m->cmd[1] == "set_max_mds" && m->cmd.size() > 2) { diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index d3a7642cd769f..9b9f7a94dace2 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -165,10 +165,10 @@ void Monitor::win_election(epoch_t epoch, set& active) paxos_pgmap.leader_init(); // init + pgmon->election_finished(); // hack: before osdmon, for osd->pg kick works ok osdmon->election_finished(); mdsmon->election_finished(); clientmon->election_finished(); - pgmon->election_finished(); } void Monitor::lose_election(epoch_t epoch, int l) @@ -204,6 +204,10 @@ void Monitor::handle_command(MMonCommand *m) osdmon->dispatch(m); return; } + if (m->cmd[0] == "pg") { + pgmon->dispatch(m); + return; + } if (m->cmd[0] == "stop") { do_stop(); return; diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 17d4ff3b81105..95ad63b144905 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -28,7 +28,6 @@ #include "messages/MOSDIn.h" #include "messages/MOSDOut.h" #include "messages/MMonCommand.h" -#include "messages/MMonCommandAck.h" #include "common/Timer.h" @@ -174,12 +173,17 @@ bool OSDMonitor::update_from_paxos() return true; } +ostream& operator<<(ostream& out, OSDMonitor& om) +{ + return out << "e" << om.osdmap.get_epoch() << ": " + << om.osdmap.get_num_osds() << " osds: " + << om.osdmap.get_num_up_osds() << " up, " + << om.osdmap.get_num_in_osds() << " in"; +} + void OSDMonitor::print_summary_stats(int dbl) { - dout(dbl) << osdmap.get_num_osds() << " osds: " - << osdmap.get_num_up_osds() << " up, " - << osdmap.get_num_in_osds() << " in" - << dendl; + dout(dbl) << *this << dendl; } void OSDMonitor::create_pending() @@ -728,7 +732,11 @@ bool OSDMonitor::preprocess_command(MMonCommand *m) stringstream ss; if (m->cmd.size() > 1) { - if (m->cmd[1] == "getmap") { + if (m->cmd[1] == "stat") { + ss << *this; + r = 0; + } + else if (m->cmd[1] == "getmap") { osdmap.encode(rdata); ss << "got osdmap epoch " << osdmap.get_epoch(); r = 0; @@ -746,10 +754,7 @@ bool OSDMonitor::preprocess_command(MMonCommand *m) if (r != -1) { string rs; getline(ss, rs); - MMonCommandAck *reply = new MMonCommandAck(r, rs); - reply->set_data(rdata); - mon->messenger->send_message(reply, m->inst); - delete m; + mon->reply_command(m, r, rs, rdata); return true; } else return false; @@ -759,6 +764,7 @@ bool OSDMonitor::prepare_command(MMonCommand *m) { stringstream ss; string rs; + int err = -EINVAL; if (m->cmd.size() > 1) { if (m->cmd[1] == "setcrushmap") { dout(10) << "prepare_command setting new crush map" << dendl; @@ -785,13 +791,11 @@ bool OSDMonitor::prepare_command(MMonCommand *m) } else if (m->cmd[1] == "setpgnum" && m->cmd.size() > 2) { int n = atoi(m->cmd[2].c_str()); - if (n < osdmap.get_pg_num()) { - ss << "specified pg_num " << n << " < current " << osdmap.get_pg_num(); - } else if (osdmap.get_pg_num() != osdmap.get_pgp_num()) { - ss << "current pg_num " << osdmap.get_pg_num() << " > " << osdmap.get_pgp_num() - << ", increase pgp_num first"; + if (n <= osdmap.get_pg_num()) { + ss << "specified pg_num " << n << " <= current " << osdmap.get_pg_num(); } else if (!mon->pgmon->pg_map.creating_pgs.empty()) { ss << "currently creating pgs, wait"; + err = -EAGAIN; } else { ss << "set new pg_num = " << n; pending_inc.new_pg_num = n; @@ -799,8 +803,6 @@ bool OSDMonitor::prepare_command(MMonCommand *m) paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs)); return true; } - getline(ss, rs); - mon->reply_command(m, -EINVAL, rs); } else if (m->cmd[1] == "setpgpnum" && m->cmd.size() > 2) { int n = atoi(m->cmd[2].c_str()); @@ -810,6 +812,7 @@ bool OSDMonitor::prepare_command(MMonCommand *m) ss << "specified pgp_num " << n << " > pg_num " << osdmap.get_pg_num(); } else if (!mon->pgmon->pg_map.creating_pgs.empty()) { ss << "still creating pgs, wait"; + err = -EAGAIN; } else { ss << "set new pgp_num = " << n; pending_inc.new_pgp_num = n; @@ -817,11 +820,8 @@ bool OSDMonitor::prepare_command(MMonCommand *m) paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs)); return true; } - getline(ss, rs); - mon->reply_command(m, -EINVAL, rs); } else if (m->cmd[1] == "down" && m->cmd.size() > 2) { - errno = 0; long osd = strtol(m->cmd[2].c_str(), 0, 10); if (osdmap.is_down(osd)) { ss << "osd" << osd << " is already down"; @@ -832,11 +832,8 @@ bool OSDMonitor::prepare_command(MMonCommand *m) paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs)); return true; } - getline(ss, rs); - mon->reply_command(m, -EINVAL, rs); } else if (m->cmd[1] == "out" && m->cmd.size() > 2) { - errno = 0; long osd = strtol(m->cmd[2].c_str(), 0, 10); if (osdmap.is_out(osd)) { ss << "osd" << osd << " is already out"; @@ -847,11 +844,8 @@ bool OSDMonitor::prepare_command(MMonCommand *m) paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs)); return true; } - getline(ss, rs); - mon->reply_command(m, -EINVAL, rs); } else if (m->cmd[1] == "in" && m->cmd.size() > 2) { - errno = 0; long osd = strtol(m->cmd[2].c_str(), 0, 10); if (osdmap.is_in(osd)) { ss << "osd" << osd << " is already in"; @@ -862,9 +856,13 @@ bool OSDMonitor::prepare_command(MMonCommand *m) paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs)); return true; } - getline(ss, rs); - mon->reply_command(m, -EINVAL, rs); + } else { + ss << "unknown command " << m->cmd[1]; } + } else { + ss << "no command?"; } + getline(ss, rs); + mon->reply_command(m, err, rs); return false; } diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index df591002f112c..2f67971250f64 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -125,7 +125,6 @@ private: bool preprocess_command(MMonCommand *m); bool prepare_command(MMonCommand *m); - void finish_command(MMonCommand *m, int rc, const string &rs); void mark_all_down(); diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index bcd6ac1089614..b038c315e1ad0 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -24,6 +24,7 @@ #include "messages/MStatfs.h" #include "messages/MStatfsReply.h" #include "messages/MOSDPGCreate.h" +#include "messages/MMonCommand.h" #include "common/Timer.h" @@ -34,8 +35,8 @@ #include -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".pg " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".pg " +#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".pg v" << pg_map.version << " " +#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".pg v" << pg_map.version << " " /* @@ -112,20 +113,25 @@ bool PGMonitor::update_from_paxos() return true; } -void PGMonitor::print_summary_stats(int dbl) +ostream& operator<<(ostream& out, PGMonitor& pm) { std::stringstream ss; - for (hash_map::iterator p = pg_map.num_pg_by_state.begin(); - p != pg_map.num_pg_by_state.end(); + for (hash_map::iterator p = pm.pg_map.num_pg_by_state.begin(); + p != pm.pg_map.num_pg_by_state.end(); ++p) { - if (p != pg_map.num_pg_by_state.begin()) + if (p != pm.pg_map.num_pg_by_state.begin()) ss << ", "; ss << p->second << " " << pg_state_string(p->first);// << "(" << p->first << ")"; } string states = ss.str(); - dout(dbl) << "v" << pg_map.version << " " - << pg_map.pg_stat.size() << " pgs: " - << states << dendl; + return out << "v" << pm.pg_map.version << ": " + << pm.pg_map.pg_stat.size() << " pgs: " + << states; +} + +void PGMonitor::print_summary_stats(int dbl) +{ + dout(dbl) << *this << dendl; if (!pg_map.creating_pgs.empty()) dout(20) << " creating_pgs = " << pg_map.creating_pgs << dendl; } @@ -170,6 +176,9 @@ bool PGMonitor::preprocess_query(Message *m) return true; } + case MSG_MON_COMMAND: + return preprocess_command((MMonCommand*)m); + default: assert(0); delete m; @@ -336,6 +345,7 @@ void PGMonitor::register_new_pgs() } pg_t parent; + int split_bits = 0; if (!first) { parent = pgid; while (1) { @@ -343,10 +353,11 @@ void PGMonitor::register_new_pgs() int msb = calc_bits_of(parent.u.pg.ps); if (!msb) break; parent.u.pg.ps &= ~(1<<(msb-1)); + split_bits++; dout(10) << " is " << pgid << " parent " << parent << " ?" << dendl; - if (parent.u.pg.ps < mon->osdmon->osdmap.get_pgp_num()) { - //if (pg_map.pg_stat.count(parent) && - //pg_map.pg_stat[parent].state != PG_STATE_CREATING) { + //if (parent.u.pg.ps < mon->osdmon->osdmap.get_pgp_num()) { + if (pg_map.pg_stat.count(parent) && + pg_map.pg_stat[parent].state != PG_STATE_CREATING) { dout(10) << " parent is " << parent << dendl; break; } @@ -356,12 +367,16 @@ void PGMonitor::register_new_pgs() pending_inc.pg_stat_updates[pgid].state = PG_STATE_CREATING; pending_inc.pg_stat_updates[pgid].created = epoch; pending_inc.pg_stat_updates[pgid].parent = parent; + pending_inc.pg_stat_updates[pgid].parent_split_bits = split_bits; created++; - if (parent == pg_t()) { + if (split_bits == 0) { dout(10) << "register_new_pgs will create " << pgid << dendl; } else { - dout(10) << "register_new_pgs will create " << pgid << " parent " << parent << dendl; + dout(10) << "register_new_pgs will create " << pgid + << " parent " << parent + << " by " << split_bits << " bits" + << dendl; } } @@ -387,7 +402,7 @@ void PGMonitor::send_pg_creates() p++) { pg_t pgid = *p; pg_t on = pgid; - if (pg_map.pg_stat[pgid].parent != pg_t()) + if (pg_map.pg_stat[pgid].parent_split_bits) on = pg_map.pg_stat[pgid].parent; vector acting; int nrep = mon->osdmon->osdmap.pg_to_acting_osds(on, acting); @@ -406,6 +421,7 @@ void PGMonitor::send_pg_creates() msg[osd] = new MOSDPGCreate(mon->osdmon->osdmap.get_epoch()); msg[osd]->mkpg[pgid].created = pg_map.pg_stat[pgid].created; msg[osd]->mkpg[pgid].parent = pg_map.pg_stat[pgid].parent; + msg[osd]->mkpg[pgid].split_bits = pg_map.pg_stat[pgid].parent_split_bits; } for (map::iterator p = msg.begin(); @@ -416,3 +432,46 @@ void PGMonitor::send_pg_creates() last_sent_pg_create[p->first] = g_clock.now(); } } + + +bool PGMonitor::preprocess_command(MMonCommand *m) +{ + int r = -1; + bufferlist rdata; + stringstream ss; + + if (m->cmd.size() > 1) { + if (m->cmd[1] == "stat") { + ss << *this; + r = 0; + } + else if (m->cmd[1] == "getmap") { + pg_map._encode(rdata); + ss << "got pgmap version " << pg_map.version; + r = 0; + } + } + + if (r != -1) { + string rs; + getline(ss, rs); + mon->reply_command(m, r, rs, rdata); + return true; + } else + return false; +} + + +bool PGMonitor::prepare_command(MMonCommand *m) +{ + stringstream ss; + string rs; + int err = -EINVAL; + + // nothing here yet + ss << "unrecognized command"; + + getline(ss, rs); + mon->reply_command(m, err, rs); + return false; +} diff --git a/src/mon/PGMonitor.h b/src/mon/PGMonitor.h index cb8c6caa45fe0..c6a58ec162ab1 100644 --- a/src/mon/PGMonitor.h +++ b/src/mon/PGMonitor.h @@ -32,6 +32,7 @@ using namespace std; class MPGStats; class MStatfs; +class MMonCommand; class PGMonitor : public PaxosService { public: @@ -56,6 +57,9 @@ private: void print_summary_stats(int dbl=5); + bool preprocess_command(MMonCommand *m); + bool prepare_command(MMonCommand *m); + map last_sent_pg_create; // per osd throttle public: @@ -67,6 +71,8 @@ private: void register_new_pgs(); void send_pg_creates(); + + }; #endif diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 701319b723faa..985fc3bebb452 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1784,8 +1784,8 @@ PG *OSD::try_create_pg(pg_t pgid, ObjectStore::Transaction& t) return 0; } - if (creating_pgs[pgid].parent != pg_t()) { - dout(10) << "try_create_pg " << pgid << " - queuing for split" << dendl; + if (creating_pgs[pgid].split_bits) { + dout(10) << "try_create_pg " << pgid << " - queueing for split" << dendl; pg_split_ready[creating_pgs[pgid].parent].insert(pgid); return 0; } @@ -1796,63 +1796,6 @@ PG *OSD::try_create_pg(pg_t pgid, ObjectStore::Transaction& t) } -int OSD::num_expected_children_of(pg_t pgid) -{ - int n = osdmap->get_pg_num(); - int o = osdmap->get_pgp_num(); - assert(n > o); - - /* - bits -o pgp_num = 7 2 -o pgp_num = 8 3 - -n pg_num = 8 3 -n pg_num = 9 4 - -0 000 -1 001 -2 010 -3 011 -4 100 - 5 101 - 6 110 - 7 111 - 8 1000 - 9 1001 - -max = 2 - - */ - - assert(pgid.u.pg.ps < o); - int obits = calc_bits_of(o)-1; // lower bound - int nbits = calc_bits_of(n-1); // upper bound - assert(nbits > obits); - - int max = 0xffffffff >> (32 - (nbits-obits)); // == -> 1 - int num = 0; - dout(10) << "num_expected_children_of " << pgid - << " o/n " << o << "/" << n - << " bits " << obits << "/" << nbits - << " max " << max - << dendl; - - for (int i=1; i<=max; i++) { - int ps = (i << obits) | pgid.u.pg.ps; - dout(10) << "num_expected_children_of " << pgid.u.pg.ps << " -> " << ps << dendl; - if (ps < o || ps >= n) - continue; - num++; - } - - dout(10) << "num_expected_children_of " << pgid - << " num " << num - << dendl; - - return num; -} - void OSD::kick_pg_split_queue() { map< int, map > query_map; @@ -1865,10 +1808,10 @@ void OSD::kick_pg_split_queue() while (n != pg_split_ready.end()) { map >::iterator p = n++; // how many children should this parent have? - unsigned nchildren = num_expected_children_of(p->first); + unsigned nchildren = (1 << (creating_pgs[*p->second.begin()].split_bits - 1)) - 1; if (p->second.size() < nchildren) { dout(15) << " parent " << p->first << " children " << p->second - << " ... waiting for more children" << dendl; + << " ... waiting for " << nchildren << " children" << dendl; continue; } @@ -1985,12 +1928,13 @@ void OSD::handle_pg_create(MOSDPGCreate *m) pg_t pgid = p->first; epoch_t created = p->second.created; pg_t parent = p->second.parent; + int split_bits = p->second.split_bits; pg_t on = pgid; - if (parent != pg_t()) - on = parent; - if (parent != pg_t()) { - dout(20) << "mkpg " << pgid << " e" << created << " from parent " << parent << dendl; + if (split_bits) { + on = parent; + dout(20) << "mkpg " << pgid << " e" << created << " from parent " << parent + << " split by " << split_bits << " bits" << dendl; } else { dout(20) << "mkpg " << pgid << " e" << created << dendl; } @@ -2005,14 +1949,14 @@ void OSD::handle_pg_create(MOSDPGCreate *m) continue; } - // does it exist? + // does it already exist? if (_have_pg(pgid)) { dout(10) << "mkpg " << pgid << " already exists, skipping" << dendl; continue; } // does parent exist? - if (parent != pg_t() && !_have_pg(parent)) { + if (split_bits && !_have_pg(parent)) { dout(10) << "mkpg " << pgid << " missing parent " << parent << ", skipping" << dendl; continue; } @@ -2024,6 +1968,7 @@ void OSD::handle_pg_create(MOSDPGCreate *m) // register. creating_pgs[pgid].created = created; creating_pgs[pgid].parent = parent; + creating_pgs[pgid].split_bits = split_bits; creating_pgs[pgid].acting.swap(acting); calc_priors_during(pgid, created, history.same_primary_since, creating_pgs[pgid].prior); diff --git a/src/osd/OSD.h b/src/osd/OSD.h index b8d39e8aacf30..b4f0f51e51a56 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -303,7 +303,7 @@ private: vector acting; set prior; pg_t parent; - bool has_parent() { return parent != pg_t(); } + int split_bits; }; hash_map creating_pgs; map > pg_split_ready; // children ready to be split to, by parent @@ -311,7 +311,6 @@ private: PG *try_create_pg(pg_t pgid, ObjectStore::Transaction& t); void handle_pg_create(class MOSDPGCreate *m); - int num_expected_children_of(pg_t pgid); void kick_pg_split_queue(); void split_pg(PG *parent, map& children, ObjectStore::Transaction &t); diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 497d36f8e25ab..31e0153b792a5 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -257,12 +257,13 @@ struct pg_stat_t { eversion_t reported; epoch_t created; pg_t parent; + int32_t parent_split_bits; int32_t state; int64_t num_bytes; // in bytes int64_t num_blocks; // in 4k blocks int64_t num_objects; - pg_stat_t() : state(0), num_bytes(0), num_blocks(0), num_objects(0) {} + pg_stat_t() : parent_split_bits(0), state(0), num_bytes(0), num_blocks(0), num_objects(0) {} } __attribute__ ((packed)); typedef struct ceph_osd_peer_stat osd_peer_stat_t; -- 2.39.5