From 78ddc62b356cd0f26a9ac666b0f58ac0d0da8798 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 18 Mar 2008 14:13:34 -0700 Subject: [PATCH] some pg split fixes. seems to work --- src/TODO | 3 +- src/kernel/osd_client.c | 16 +++++++--- src/mon/OSDMonitor.cc | 65 +++++++++++++++++++++++++++-------------- src/mon/PGMonitor.cc | 2 ++ src/mon/PGMonitor.h | 2 +- src/osd/OSD.cc | 14 +++++---- src/osd/OSDMap.h | 16 +++++----- 7 files changed, 76 insertions(+), 42 deletions(-) diff --git a/src/TODO b/src/TODO index 95d8abad62f30..263afbcbea70c 100644 --- a/src/TODO +++ b/src/TODO @@ -1,6 +1,5 @@ mkfs -- pg splitting! -- ratchet up pgp_num as pg creation succeeds +- increase pg_num when pg_num > pgp_num code cleanup - userspace encoding/decoding needs major cleanup diff --git a/src/kernel/osd_client.c b/src/kernel/osd_client.c index 4ddcf1c5a07bd..ac3eea05afa38 100644 --- a/src/kernel/osd_client.c +++ b/src/kernel/osd_client.c @@ -197,16 +197,24 @@ static void send_request(struct ceph_osd_client *osdc, struct ceph_osd_request * int osds[10]; int nr_osds; int i; - + int pps; /* placement ps */ + dout(30, "send_request %p\n", req); ruleno = crush_find_rule(osdc->osdmap->crush, req->r_pgid.pg.pool, req->r_pgid.pg.type, req->r_pgid.pg.size); BUG_ON(ruleno < 0); /* fixme, need some proper error handling here */ dout(30, "using crush rule %d\n", ruleno); - nr_osds = crush_do_rule(osdc->osdmap->crush, ruleno, - req->r_pgid.pg.ps, osds, req->r_pgid.pg.size, - req->r_pgid.pg.preferred); + if (req->r_pgid.pg.preferred >= 0) + pps = ceph_stable_mod(req->r_pgid.pg.ps, + osdc->osdmap->lpgp_num, + osdc->osdmap->lpgp_num_mask); + else + pps = ceph_stable_mod(req->r_pgid.pg.ps, + osdc->osdmap->pgp_num, + osdc->osdmap->pgp_num_mask); + nr_osds = crush_do_rule(osdc->osdmap->crush, ruleno, pps, osds, + req->r_pgid.pg.size, req->r_pgid.pg.preferred); for (i=0; iosdmap, osds[i])) break; diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 7688a5ba6f2d2..17d4ff3b81105 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -785,64 +785,85 @@ bool OSDMonitor::prepare_command(MMonCommand *m) } else if (m->cmd[1] == "setpgnum" && m->cmd.size() > 2) { int n = atoi(m->cmd[2].c_str()); - if (n > osdmap.get_pg_num() && - osdmap.get_pg_num() == osdmap.get_pgp_num()) { + if (n < osdmap.get_pg_num()) { + ss << "specified pg_num " << n << " < current " << osdmap.get_pg_num(); + } else if (osdmap.get_pg_num() != osdmap.get_pgp_num()) { + ss << "current pg_num " << osdmap.get_pg_num() << " > " << osdmap.get_pgp_num() + << ", increase pgp_num first"; + } else if (!mon->pgmon->pg_map.creating_pgs.empty()) { + ss << "currently creating pgs, wait"; + } else { ss << "set new pg_num = " << n; pending_inc.new_pg_num = n; getline(ss, rs); paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs)); return true; + } + getline(ss, rs); + mon->reply_command(m, -EINVAL, rs); + } + else if (m->cmd[1] == "setpgpnum" && m->cmd.size() > 2) { + int n = atoi(m->cmd[2].c_str()); + if (n <= osdmap.get_pgp_num()) { + ss << "specified pgp_num " << n << " <= current " << osdmap.get_pgp_num(); + } else if (n > osdmap.get_pg_num()) { + ss << "specified pgp_num " << n << " > pg_num " << osdmap.get_pg_num(); + } else if (!mon->pgmon->pg_map.creating_pgs.empty()) { + ss << "still creating pgs, wait"; } else { - ss << "specified pg_num " << n << " < current " << osdmap.get_pg_num(); - ss << " or pg_num " << osdmap.get_pg_num() << " > pgp_num " << osdmap.get_pgp_num(); + ss << "set new pgp_num = " << n; + pending_inc.new_pgp_num = n; getline(ss, rs); - mon->reply_command(m, -EINVAL, rs); + paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs)); + return true; } + getline(ss, rs); + mon->reply_command(m, -EINVAL, rs); } else if (m->cmd[1] == "down" && m->cmd.size() > 2) { errno = 0; long osd = strtol(m->cmd[2].c_str(), 0, 10); - if (osdmap.is_up(osd)) { + if (osdmap.is_down(osd)) { + ss << "osd" << osd << " is already down"; + } else { pending_inc.new_down[osd] = false; ss << "marked down osd" << osd; getline(ss, rs); paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs)); return true; - } else { - ss << "osd" << osd << " is already down"; - getline(ss, rs); - mon->reply_command(m, -EINVAL, rs); } + getline(ss, rs); + mon->reply_command(m, -EINVAL, rs); } else if (m->cmd[1] == "out" && m->cmd.size() > 2) { errno = 0; long osd = strtol(m->cmd[2].c_str(), 0, 10); - if (osdmap.is_in(osd)) { + if (osdmap.is_out(osd)) { + ss << "osd" << osd << " is already out"; + } else { pending_inc.new_offload[osd] = CEPH_OSD_OUT; ss << "marked out osd" << osd; getline(ss, rs); paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs)); return true; - } else { - ss << "osd" << osd << " is already out"; - getline(ss, rs); - mon->reply_command(m, -EINVAL, rs); - } + } + getline(ss, rs); + mon->reply_command(m, -EINVAL, rs); } else if (m->cmd[1] == "in" && m->cmd.size() > 2) { errno = 0; long osd = strtol(m->cmd[2].c_str(), 0, 10); - if (osdmap.is_out(osd)) { + if (osdmap.is_in(osd)) { + ss << "osd" << osd << " is already in"; + } else { pending_inc.new_offload[osd] = CEPH_OSD_IN; ss << "marked in osd" << osd; getline(ss, rs); paxos->wait_for_commit(new Monitor::C_Command(mon, m, 0, rs)); return true; - } else { - ss << "osd" << osd << " is already in"; - getline(ss, rs); - mon->reply_command(m, -EINVAL, rs); - } + } + getline(ss, rs); + mon->reply_command(m, -EINVAL, rs); } } return false; diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index 0dfbf9cf7d00e..bcd6ac1089614 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -345,6 +345,8 @@ void PGMonitor::register_new_pgs() parent.u.pg.ps &= ~(1<<(msb-1)); dout(10) << " is " << pgid << " parent " << parent << " ?" << dendl; if (parent.u.pg.ps < mon->osdmon->osdmap.get_pgp_num()) { + //if (pg_map.pg_stat.count(parent) && + //pg_map.pg_stat[parent].state != PG_STATE_CREATING) { dout(10) << " parent is " << parent << dendl; break; } diff --git a/src/mon/PGMonitor.h b/src/mon/PGMonitor.h index 69593b455dc29..cb8c6caa45fe0 100644 --- a/src/mon/PGMonitor.h +++ b/src/mon/PGMonitor.h @@ -35,9 +35,9 @@ class MStatfs; class PGMonitor : public PaxosService { public: + PGMap pg_map; private: - PGMap pg_map; PGMap::Incremental pending_inc; void create_initial(); diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 09c8fb6253aab..701319b723faa 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1440,12 +1440,13 @@ void OSD::advance_map(ObjectStore::Transaction& t) if (role != 0) { dout(10) << " no longer primary for " << pgid << ", stopping creation" << dendl; creating_pgs.erase(p); + } else { + /* + * adding new ppl to our pg has no effect, since we're still primary, + * and obviously haven't given the new nodes any data. + */ + p->second.acting.swap(acting); // keep the latest } - p->second.acting.swap(acting); // keep the latest - /* - * adding new ppl to our pg has no effect, since we're still primary, - * and obviously haven't given the new nodes any data. - */ } // scan existing pg's @@ -1916,6 +1917,9 @@ void OSD::kick_pg_split_queue() created++; } store->apply_transaction(t); + + // remove from queue + pg_split_ready.erase(p); } do_queries(query_map); diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index 2291a74936bf9..5cbea9cdbb414 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -486,14 +486,14 @@ private: // pg -> (osd list) - int pg_to_osds(pg_t actual_pg, vector& osds) { + int pg_to_osds(pg_t pg, vector& osds) { // map to osds[] - pg_t pg = actual_pg; // effective pgid for placement calc + ps_t pps; // placement ps if (pg.preferred() >= 0) - pg.u.pg.ps = ceph_stable_mod(pg.u.pg.ps, pgp_num, pgp_num_mask); + pps = ceph_stable_mod(pg.ps(), lpgp_num, lpgp_num_mask); else - pg.u.pg.ps = ceph_stable_mod(pg.u.pg.ps, lpgp_num, lpgp_num_mask); + pps = ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask); switch (g_conf.osd_pg_layout) { case CEPH_PG_LAYOUT_CRUSH: @@ -501,18 +501,18 @@ private: // what crush rule? int ruleno = crush.find_rule(pg.pool(), pg.type(), pg.size()); if (ruleno >= 0) - crush.do_rule(ruleno, pg.ps(), osds, pg.size(), pg.preferred()); + crush.do_rule(ruleno, pps, osds, pg.size(), pg.preferred()); } break; case CEPH_PG_LAYOUT_LINEAR: for (int i=0; i