From 1892b13f059be2684e5fde4202087a1908997b08 Mon Sep 17 00:00:00 2001 From: sageweil Date: Sun, 19 Aug 2007 20:19:31 +0000 Subject: [PATCH] fixed up mds_local_osd, crush, osd mkfs for preferred pgs, etc. git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1645 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/mds/TODO | 8 ++++ branches/sage/mds/config.cc | 1 + branches/sage/mds/config.h | 1 + branches/sage/mds/crush/crush.h | 25 ++++++---- branches/sage/mds/fakesyn.cc | 2 +- branches/sage/mds/mds/MDLog.cc | 5 +- branches/sage/mds/mon/OSDMonitor.cc | 7 +-- branches/sage/mds/newsyn.cc | 3 +- branches/sage/mds/osd/OSD.cc | 27 ++++++----- branches/sage/mds/osd/OSDMap.h | 13 +++-- branches/sage/mds/osd/PG.cc | 4 +- branches/sage/mds/osd/osd_types.h | 5 +- branches/sage/mds/osdc/Objecter.cc | 60 ++++++++++++------------ branches/sage/mds/script/study_static.pl | 10 ++++ 14 files changed, 102 insertions(+), 69 deletions(-) diff --git a/branches/sage/mds/TODO b/branches/sage/mds/TODO index c93cd3f962b89..d52d60d3acfee 100644 --- a/branches/sage/mds/TODO +++ b/branches/sage/mds/TODO @@ -53,6 +53,12 @@ sage mds - fix server unlink .. needs to use slave_requests to clean up any failures during the resolve stage +/- .ceph_hosts file, so we can use the infiniband addresses + +- look at mds osds + + + - the split/merge plan: - hmm, should we move ESubtreeMap out of the journal? @@ -87,6 +93,8 @@ sage mds - EMetablob should return 'expired' if they have higher versions (and are thus described by a newer journal entry) +- could mark dir complete in EMetaBlob by counting how many dentries are dirtied in the current log epoch in CDir... + - fix rmdir empty exported dirfrag race - export all frags <= 1 item? then we ensure freezing before empty, avoiding any last unlink + export vs rmdir race. - how to know full dir size (when trimming)? 
diff --git a/branches/sage/mds/config.cc b/branches/sage/mds/config.cc index 0c7e311b1f47b..caa879c0a589a 100644 --- a/branches/sage/mds/config.cc +++ b/branches/sage/mds/config.cc @@ -220,6 +220,7 @@ md_config_t g_conf = { mds_verify_export_dirauth: true, mds_local_osd: false, + mds_local_osd_offset: 1000, mds_thrash_exports: 0, mds_thrash_fragments: 0, diff --git a/branches/sage/mds/config.h b/branches/sage/mds/config.h index 7babeb3340b8d..378e25b7aadfd 100644 --- a/branches/sage/mds/config.h +++ b/branches/sage/mds/config.h @@ -219,6 +219,7 @@ struct md_config_t { bool mds_verify_export_dirauth; // debug flag bool mds_local_osd; + int mds_local_osd_offset; int mds_thrash_exports; int mds_thrash_fragments; diff --git a/branches/sage/mds/crush/crush.h b/branches/sage/mds/crush/crush.h index 374c190ed77ce..9432f901ceb4e 100644 --- a/branches/sage/mds/crush/crush.h +++ b/branches/sage/mds/crush/crush.h @@ -425,16 +425,17 @@ namespace crush { void do_rule(Rule& rule, int x, vector& result, set& outset, map& overloadmap, - int forcefeed=-1) { + int forcefeed=-1) { //int numresult = 0; result.clear(); - // determine hierarchical context for first. + // determine hierarchical context for forcefeed (if any) list force_stack; - if (forcefeed >= 0) { + if (forcefeed >= 0 && parent_map.count(forcefeed)) { int t = forcefeed; while (1) { force_stack.push_front(t); + //cout << "push " << t << " onto force_stack" << endl; if (parent_map.count(t) == 0) break; // reached root, presumably. 
//cout << " " << t << " parent is " << parent_map[t] << endl; t = parent_map[t]; @@ -453,18 +454,17 @@ namespace crush { // do it switch (pc->cmd) { case CRUSH_RULE_TAKE: - { - const int arg = pc->args[0]; - //cout << "take " << arg << endl; + { + const int arg = pc->args[0]; + //cout << "take " << arg << endl; if (!force_stack.empty()) { - int forceval = force_stack.front(); + assert(force_stack.front() == arg); force_stack.pop_front(); - assert(arg == forceval); } - w.clear(); - w.push_back(arg); + w.clear(); + w.push_back(arg); } break; @@ -490,6 +490,11 @@ namespace crush { force_stack.pop_front(); //cout << "priming out with " << forceval << endl; forcing = true; + } else if (forcefeed >= 0 && type == 0) { + //cout << "forcing context-less " << forcefeed << endl; + forceval = forcefeed; + forcefeed = -1; + forcing = true; } // do each row independently diff --git a/branches/sage/mds/fakesyn.cc b/branches/sage/mds/fakesyn.cc index 84f06dde83e23..35bf22bf1c953 100644 --- a/branches/sage/mds/fakesyn.cc +++ b/branches/sage/mds/fakesyn.cc @@ -108,7 +108,7 @@ int main(int argc, char **argv) //cerr << "mds" << i << " on rank " << myrank << " " << hostname << "." 
<< pid << endl; mds[i] = new MDS(-1, new FakeMessenger(MSG_ADDR_MDS(i)), monmap); if (g_conf.mds_local_osd) - mdsosd[i] = new OSD(i+10000, new FakeMessenger(MSG_ADDR_OSD(i+10000)), monmap); + mdsosd[i] = new OSD(i+g_conf.mds_local_osd_offset, new FakeMessenger(MSG_ADDR_OSD(i+g_conf.mds_local_osd_offset)), monmap); start++; } diff --git a/branches/sage/mds/mds/MDLog.cc b/branches/sage/mds/mds/MDLog.cc index 129aa1e9fe2f9..f1641aeb78b8d 100644 --- a/branches/sage/mds/mds/MDLog.cc +++ b/branches/sage/mds/mds/MDLog.cc @@ -65,9 +65,8 @@ void MDLog::init_journaler() log_inode.ino = MDS_INO_LOG_OFFSET + mds->get_nodeid(); log_inode.layout = g_OSD_MDLogLayout; - if (g_conf.mds_local_osd) { - log_inode.layout.preferred = mds->get_nodeid() + 10000; // hack - } + if (g_conf.mds_local_osd) + log_inode.layout.preferred = mds->get_nodeid() + g_conf.mds_local_osd_offset; // hack // log streamer if (journaler) delete journaler; diff --git a/branches/sage/mds/mon/OSDMonitor.cc b/branches/sage/mds/mon/OSDMonitor.cc index 8c642514c9de1..d96fa178bb10b 100644 --- a/branches/sage/mds/mon/OSDMonitor.cc +++ b/branches/sage/mds/mon/OSDMonitor.cc @@ -212,10 +212,11 @@ void OSDMonitor::create_initial() } if (g_conf.mds_local_osd) { - // add mds osds, but don't put them in the crush mapping func + // add mds local osds, but don't put them in the crush mapping func for (int i=0; iinit(); } } diff --git a/branches/sage/mds/osd/OSD.cc b/branches/sage/mds/osd/OSD.cc index 8fe67d180bea1..803c6e39d03d6 100644 --- a/branches/sage/mds/osd/OSD.cc +++ b/branches/sage/mds/osd/OSD.cc @@ -188,7 +188,7 @@ int OSD::init() { // mkfs? if (g_conf.osd_mkfs) { - dout(2) << "mkfs" << dendl; + dout(2) << "mkfs on local store" << dendl; store->mkfs(); // make up a superblock @@ -714,6 +714,7 @@ void OSD::dispatch(Message *m) { // lock! 
osd_lock.Lock(); + dout(20) << "dispatch " << m << dendl; switch (m->get_type()) { @@ -809,10 +810,10 @@ void OSD::dispatch(Message *m) finished_lock.Unlock(); osd_lock.Unlock(); - for (list::iterator it = waiting.begin(); - it != waiting.end(); - it++) { - dispatch(*it); + while (!waiting.empty()) { + dout(20) << "doing finished " << waiting.front() << dendl; + dispatch(waiting.front()); + waiting.pop_front(); } return; } @@ -1131,7 +1132,7 @@ void OSD::advance_map(ObjectStore::Transaction& t) pg->info.history.same_primary_since = pg->info.history.same_acker_since = osdmap->get_epoch(); pg->write_log(t); - pg->activate(t); + //pg->activate(t); dout(7) << "created " << *pg << dendl; pg->unlock(); @@ -1153,7 +1154,7 @@ void OSD::advance_map(ObjectStore::Transaction& t) pg->info.history.same_acker_since = pg->info.history.same_since = osdmap->get_epoch(); pg->write_log(t); - pg->activate(t); + //pg->activate(t); dout(7) << "created " << *pg << dendl; pg->unlock(); @@ -1180,7 +1181,7 @@ void OSD::advance_map(ObjectStore::Transaction& t) pg->info.history.same_primary_since = pg->info.history.same_acker_since = osdmap->get_epoch(); pg->write_log(t); - pg->activate(t); + //pg->activate(t); dout(7) << "created " << *pg << dendl; pg->unlock(); @@ -1202,7 +1203,7 @@ void OSD::advance_map(ObjectStore::Transaction& t) pg->info.history.same_acker_since = pg->info.history.same_since = osdmap->get_epoch(); pg->write_log(t); - pg->activate(t); + //pg->activate(t); dout(7) << "created " << *pg << dendl; pg->unlock(); @@ -1374,8 +1375,8 @@ void OSD::activate_map(ObjectStore::Transaction& t) } } - if (osdmap->is_mkfs()) // hack: skip the queries/summaries if it's a mkfs - return; + //if (osdmap->is_mkfs()) // hack: skip the queries/summaries if it's a mkfs + //return; do_notifies(notify_list); // notify? 
(residual|replica) do_queries(query_map); @@ -1490,11 +1491,11 @@ bool OSD::require_current_map(Message *m, epoch_t ep) */ bool OSD::require_same_or_newer_map(Message *m, epoch_t epoch) { - dout(15) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ")" << dendl; + dout(15) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ") " << m << dendl; // newer map? if (epoch > osdmap->get_epoch()) { - dout(7) << "waiting for newer map epoch " << epoch << " > my " << osdmap->get_epoch() << dendl; + dout(7) << "waiting for newer map epoch " << epoch << " > my " << osdmap->get_epoch() << " with " << m << dendl; wait_for_new_map(m); return false; } diff --git a/branches/sage/mds/osd/OSDMap.h b/branches/sage/mds/osd/OSDMap.h index 83795a2be5740..25a5eef3d10e6 100644 --- a/branches/sage/mds/osd/OSDMap.h +++ b/branches/sage/mds/osd/OSDMap.h @@ -316,25 +316,30 @@ private: ObjectLayout make_object_layout(object_t oid, int pg_type, int pg_size, int preferred=-1, int object_stripe_unit = 0) { static crush::Hash H(777); + int num = preferred >= 0 ? localized_pg_num:pg_num; + int num_mask = preferred >= 0 ? 
localized_pg_num_mask:pg_num_mask; + // calculate ps (placement seed) ps_t ps; switch (g_conf.osd_object_layout) { case OBJECT_LAYOUT_LINEAR: - ps = stable_mod(oid.bno + oid.ino, pg_num, pg_num_mask); + ps = stable_mod(oid.bno + oid.ino, num, num_mask); break; case OBJECT_LAYOUT_HASHINO: - ps = stable_mod(oid.bno + H(oid.ino), pg_num, pg_num_mask); + ps = stable_mod(oid.bno + H(oid.ino), num, num_mask); break; case OBJECT_LAYOUT_HASH: - ps = stable_mod(H( (oid.bno & oid.ino) ^ ((oid.bno^oid.ino) >> 32) ), pg_num, pg_num_mask); + ps = stable_mod(H( (oid.bno & oid.ino) ^ ((oid.bno^oid.ino) >> 32) ), num, num_mask); break; default: assert(0); } + //cout << "preferred " << preferred << " num " << num << " mask " << num_mask << " ps " << ps << endl; + // construct object layout return ObjectLayout(pg_t(pg_type, pg_size, ps, preferred), object_stripe_unit); @@ -422,7 +427,7 @@ private: } if (is_out(osd)) - osds.erase(osds.begin()); // oops, but it's down! + osds.erase(osds.begin()); // oops, but it's out } return osds.size(); diff --git a/branches/sage/mds/osd/PG.cc b/branches/sage/mds/osd/PG.cc index 568021d875dbb..0d9b82c745611 100644 --- a/branches/sage/mds/osd/PG.cc +++ b/branches/sage/mds/osd/PG.cc @@ -884,8 +884,8 @@ void PG::activate(ObjectStore::Transaction& t, } // if primary.. - if (role == 0 && - osd->osdmap->post_mkfs()) { + if (role == 0) { + //&& osd->osdmap->post_mkfs()) { // who is clean? 
clean_set.clear(); if (info.is_clean()) diff --git a/branches/sage/mds/osd/osd_types.h b/branches/sage/mds/osd/osd_types.h index b761ae36d3d0e..a1bf9dea3d0bb 100644 --- a/branches/sage/mds/osd/osd_types.h +++ b/branches/sage/mds/osd/osd_types.h @@ -142,10 +142,11 @@ inline ostream& operator<<(ostream& out, pg_t pg) //if (pg.ruleset()) //out << (int)pg.ruleset() << 's'; - if (pg.preferred() >= 0) - out << pg.preferred() << 'p'; out << hex << pg.ps() << dec; + if (pg.preferred() >= 0) + out << 'p' << pg.preferred(); + //out << "=" << hex << (__uint64_t)pg << dec; return out; } diff --git a/branches/sage/mds/osdc/Objecter.cc b/branches/sage/mds/osdc/Objecter.cc index 8eb02e1a10329..14ba733a06183 100644 --- a/branches/sage/mds/osdc/Objecter.cc +++ b/branches/sage/mds/osdc/Objecter.cc @@ -685,38 +685,38 @@ tid_t Objecter::modifyx_submit(OSDModify *wr, ObjectExtent &ex, tid_t usetid) << " osd" << pg.primary() << endl; if (pg.primary() >= 0) { - MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, tid, - ex.oid, ex.layout, osdmap->get_epoch(), - wr->op); - m->set_length(ex.length); - m->set_offset(ex.start); - m->set_rev(ex.rev); - if (usetid > 0) - m->set_retry_attempt(true); - - if (wr->tid_version.count(tid)) - m->set_version(wr->tid_version[tid]); // we're replaying this op! + MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, tid, + ex.oid, ex.layout, osdmap->get_epoch(), + wr->op); + m->set_length(ex.length); + m->set_offset(ex.start); + m->set_rev(ex.rev); + if (usetid > 0) + m->set_retry_attempt(true); - // what type of op? 
- switch (wr->op) { - case OSD_OP_WRITE: - { - // map buffer segments into this extent - // (may be fragmented bc of striping) - bufferlist cur; - for (map::iterator bit = ex.buffer_extents.begin(); - bit != ex.buffer_extents.end(); - bit++) { - bufferlist thisbit; - thisbit.substr_of(((OSDWrite*)wr)->bl, bit->first, bit->second); - cur.claim_append(thisbit); - } - assert(cur.length() == ex.length); - m->set_data(cur);//.claim(cur); - } - break; + if (wr->tid_version.count(tid)) + m->set_version(wr->tid_version[tid]); // we're replaying this op! + + // what type of op? + switch (wr->op) { + case OSD_OP_WRITE: + { + // map buffer segments into this extent + // (may be fragmented bc of striping) + bufferlist cur; + for (map::iterator bit = ex.buffer_extents.begin(); + bit != ex.buffer_extents.end(); + bit++) { + bufferlist thisbit; + thisbit.substr_of(((OSDWrite*)wr)->bl, bit->first, bit->second); + cur.claim_append(thisbit); } - + assert(cur.length() == ex.length); + m->set_data(cur);//.claim(cur); + } + break; + } + messenger->send_message(m, osdmap->get_inst(pg.primary())); } diff --git a/branches/sage/mds/script/study_static.pl b/branches/sage/mds/script/study_static.pl index 9cbeb01fb4320..2350dd727bcf2 100644 --- a/branches/sage/mds/script/study_static.pl +++ b/branches/sage/mds/script/study_static.pl @@ -16,6 +16,8 @@ my $nhardlinks = 0; my %nlinks; my %names; my %dirsize; +my %fnlen; +my $fnchars; my $mask = 00170000; my $ifdir = 0040000; @@ -40,6 +42,10 @@ while (@q) { $nfiles++; my ($ino, $mode, $nlink) = (lstat($file))[1, 2,3]; + my $fnlen = length($f); + $fnlen{$fnlen}++; + $fnchars += $fnlen; + if (($mode & $mask) == $ifdir) { $ndirs++; push(@q, $file); @@ -90,6 +96,10 @@ for my $ds (sort {$a <=> $b} keys %dirsize) { } close DSLOG; +# avg, median file name len +my $avgfnlen = sprintf("%.2f",$nfiles/$nfnchars); + + # stat fs my $df = `df $base`; my $line = (split(/\n/,$df))[1]; # second line -- 2.39.5