- fix server unlink .. needs to use slave_requests to clean up any failures during the resolve stage
+/- .ceph_hosts file, so we can use the infiniband addresses
+
+- look at mds osds
+
+
+
- the split/merge plan:
- hmm, should we move ESubtreeMap out of the journal?
- EMetablob should return 'expired' if they have higher versions (and are thus described by a newer journal entry)
+- could mark dir complete in EMetaBlob by counting how many dentries are dirtied in the current log epoch in CDir...
+
- fix rmdir empty exported dirfrag race
- export all frags <= 1 item? then we ensure freezing before empty, avoiding any last unlink + export vs rmdir race.
- how to know full dir size (when trimming)?
mds_verify_export_dirauth: true,
mds_local_osd: false,
+ mds_local_osd_offset: 1000,
mds_thrash_exports: 0,
mds_thrash_fragments: 0,
bool mds_verify_export_dirauth; // debug flag
bool mds_local_osd;
+ int mds_local_osd_offset;
int mds_thrash_exports;
int mds_thrash_fragments;
void do_rule(Rule& rule, int x, vector<int>& result,
set<int>& outset, map<int,float>& overloadmap,
- int forcefeed=-1) {
+ int forcefeed=-1) {
//int numresult = 0;
result.clear();
- // determine hierarchical context for first.
+ // determine hierarchical context for forcefeed (if any)
list<int> force_stack;
- if (forcefeed >= 0) {
+ if (forcefeed >= 0 && parent_map.count(forcefeed)) {
int t = forcefeed;
while (1) {
force_stack.push_front(t);
+ //cout << "push " << t << " onto force_stack" << endl;
if (parent_map.count(t) == 0) break; // reached root, presumably.
//cout << " " << t << " parent is " << parent_map[t] << endl;
t = parent_map[t];
// do it
switch (pc->cmd) {
case CRUSH_RULE_TAKE:
- {
- const int arg = pc->args[0];
- //cout << "take " << arg << endl;
+ {
+ const int arg = pc->args[0];
+ //cout << "take " << arg << endl;
if (!force_stack.empty()) {
- int forceval = force_stack.front();
+ assert(force_stack.front() == arg);
force_stack.pop_front();
- assert(arg == forceval);
}
- w.clear();
- w.push_back(arg);
+ w.clear();
+ w.push_back(arg);
}
break;
force_stack.pop_front();
//cout << "priming out with " << forceval << endl;
forcing = true;
+ } else if (forcefeed >= 0 && type == 0) {
+ //cout << "forcing context-less " << forcefeed << endl;
+ forceval = forcefeed;
+ forcefeed = -1;
+ forcing = true;
}
// do each row independently
//cerr << "mds" << i << " on rank " << myrank << " " << hostname << "." << pid << endl;
mds[i] = new MDS(-1, new FakeMessenger(MSG_ADDR_MDS(i)), monmap);
if (g_conf.mds_local_osd)
- mdsosd[i] = new OSD(i+10000, new FakeMessenger(MSG_ADDR_OSD(i+10000)), monmap);
+ mdsosd[i] = new OSD(i+g_conf.mds_local_osd_offset, new FakeMessenger(MSG_ADDR_OSD(i+g_conf.mds_local_osd_offset)), monmap);
start++;
}
log_inode.ino = MDS_INO_LOG_OFFSET + mds->get_nodeid();
log_inode.layout = g_OSD_MDLogLayout;
- if (g_conf.mds_local_osd) {
- log_inode.layout.preferred = mds->get_nodeid() + 10000; // hack
- }
+ if (g_conf.mds_local_osd)
+ log_inode.layout.preferred = mds->get_nodeid() + g_conf.mds_local_osd_offset; // hack
// log streamer
if (journaler) delete journaler;
}
if (g_conf.mds_local_osd) {
- // add mds osds, but don't put them in the crush mapping func
+ // add mds local osds, but don't put them in the crush mapping func
for (int i=0; i<g_conf.num_mds; i++) {
- newmap.osds.insert(i+10000);
- newmap.down_osds[i+10000] = true;
+ int o = i+g_conf.mds_local_osd_offset;
+ newmap.osds.insert(o);
+ newmap.down_osds[o] = true;
}
}
started++;
if (g_conf.mds_local_osd) {
- mdsosd[i] = new OSD(i+10000, rank.register_entity(MSG_ADDR_OSD(i+10000)), monmap);
+ int n = i+g_conf.mds_local_osd_offset;
+ mdsosd[i] = new OSD(n, rank.register_entity(MSG_ADDR_OSD(n)), monmap);
mdsosd[i]->init();
}
}
{
// mkfs?
if (g_conf.osd_mkfs) {
- dout(2) << "mkfs" << dendl;
+ dout(2) << "mkfs on local store" << dendl;
store->mkfs();
// make up a superblock
{
// lock!
osd_lock.Lock();
+ dout(20) << "dispatch " << m << dendl;
switch (m->get_type()) {
finished_lock.Unlock();
osd_lock.Unlock();
- for (list<Message*>::iterator it = waiting.begin();
- it != waiting.end();
- it++) {
- dispatch(*it);
+ while (!waiting.empty()) {
+ dout(20) << "doing finished " << waiting.front() << dendl;
+ dispatch(waiting.front());
+ waiting.pop_front();
}
return;
}
pg->info.history.same_primary_since =
pg->info.history.same_acker_since = osdmap->get_epoch();
pg->write_log(t);
- pg->activate(t);
+ //pg->activate(t);
dout(7) << "created " << *pg << dendl;
pg->unlock();
pg->info.history.same_acker_since =
pg->info.history.same_since = osdmap->get_epoch();
pg->write_log(t);
- pg->activate(t);
+ //pg->activate(t);
dout(7) << "created " << *pg << dendl;
pg->unlock();
pg->info.history.same_primary_since =
pg->info.history.same_acker_since = osdmap->get_epoch();
pg->write_log(t);
- pg->activate(t);
+ //pg->activate(t);
dout(7) << "created " << *pg << dendl;
pg->unlock();
pg->info.history.same_acker_since =
pg->info.history.same_since = osdmap->get_epoch();
pg->write_log(t);
- pg->activate(t);
+ //pg->activate(t);
dout(7) << "created " << *pg << dendl;
pg->unlock();
}
}
- if (osdmap->is_mkfs()) // hack: skip the queries/summaries if it's a mkfs
- return;
+ //if (osdmap->is_mkfs()) // hack: skip the queries/summaries if it's a mkfs
+ //return;
do_notifies(notify_list); // notify? (residual|replica)
do_queries(query_map);
*/
bool OSD::require_same_or_newer_map(Message *m, epoch_t epoch)
{
- dout(15) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ")" << dendl;
+ dout(15) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
// newer map?
if (epoch > osdmap->get_epoch()) {
- dout(7) << "waiting for newer map epoch " << epoch << " > my " << osdmap->get_epoch() << dendl;
+ dout(7) << "waiting for newer map epoch " << epoch << " > my " << osdmap->get_epoch() << " with " << m << dendl;
wait_for_new_map(m);
return false;
}
ObjectLayout make_object_layout(object_t oid, int pg_type, int pg_size, int preferred=-1, int object_stripe_unit = 0) {
static crush::Hash H(777);
+ int num = preferred >= 0 ? localized_pg_num:pg_num;
+ int num_mask = preferred >= 0 ? localized_pg_num_mask:pg_num_mask;
+
// calculate ps (placement seed)
ps_t ps;
switch (g_conf.osd_object_layout) {
case OBJECT_LAYOUT_LINEAR:
- ps = stable_mod(oid.bno + oid.ino, pg_num, pg_num_mask);
+ ps = stable_mod(oid.bno + oid.ino, num, num_mask);
break;
case OBJECT_LAYOUT_HASHINO:
- ps = stable_mod(oid.bno + H(oid.ino), pg_num, pg_num_mask);
+ ps = stable_mod(oid.bno + H(oid.ino), num, num_mask);
break;
case OBJECT_LAYOUT_HASH:
- ps = stable_mod(H( (oid.bno & oid.ino) ^ ((oid.bno^oid.ino) >> 32) ), pg_num, pg_num_mask);
+ ps = stable_mod(H( (oid.bno & oid.ino) ^ ((oid.bno^oid.ino) >> 32) ), num, num_mask);
break;
default:
assert(0);
}
+ //cout << "preferred " << preferred << " num " << num << " mask " << num_mask << " ps " << ps << endl;
+
// construct object layout
return ObjectLayout(pg_t(pg_type, pg_size, ps, preferred),
object_stripe_unit);
}
if (is_out(osd))
- osds.erase(osds.begin()); // oops, but it's down!
+ osds.erase(osds.begin()); // oops, but it's out
}
return osds.size();
}
// if primary..
- if (role == 0 &&
- osd->osdmap->post_mkfs()) {
+ if (role == 0) {
+ //&& osd->osdmap->post_mkfs()) {
// who is clean?
clean_set.clear();
if (info.is_clean())
//if (pg.ruleset())
//out << (int)pg.ruleset() << 's';
- if (pg.preferred() >= 0)
- out << pg.preferred() << 'p';
out << hex << pg.ps() << dec;
+ if (pg.preferred() >= 0)
+ out << 'p' << pg.preferred();
+
//out << "=" << hex << (__uint64_t)pg << dec;
return out;
}
<< " osd" << pg.primary()
<< endl;
if (pg.primary() >= 0) {
- MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, tid,
- ex.oid, ex.layout, osdmap->get_epoch(),
- wr->op);
- m->set_length(ex.length);
- m->set_offset(ex.start);
- m->set_rev(ex.rev);
- if (usetid > 0)
- m->set_retry_attempt(true);
-
- if (wr->tid_version.count(tid))
- m->set_version(wr->tid_version[tid]); // we're replaying this op!
+ MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, tid,
+ ex.oid, ex.layout, osdmap->get_epoch(),
+ wr->op);
+ m->set_length(ex.length);
+ m->set_offset(ex.start);
+ m->set_rev(ex.rev);
+ if (usetid > 0)
+ m->set_retry_attempt(true);
- // what type of op?
- switch (wr->op) {
- case OSD_OP_WRITE:
- {
- // map buffer segments into this extent
- // (may be fragmented bc of striping)
- bufferlist cur;
- for (map<size_t,size_t>::iterator bit = ex.buffer_extents.begin();
- bit != ex.buffer_extents.end();
- bit++) {
- bufferlist thisbit;
- thisbit.substr_of(((OSDWrite*)wr)->bl, bit->first, bit->second);
- cur.claim_append(thisbit);
- }
- assert(cur.length() == ex.length);
- m->set_data(cur);//.claim(cur);
- }
- break;
+ if (wr->tid_version.count(tid))
+ m->set_version(wr->tid_version[tid]); // we're replaying this op!
+
+ // what type of op?
+ switch (wr->op) {
+ case OSD_OP_WRITE:
+ {
+ // map buffer segments into this extent
+ // (may be fragmented bc of striping)
+ bufferlist cur;
+ for (map<size_t,size_t>::iterator bit = ex.buffer_extents.begin();
+ bit != ex.buffer_extents.end();
+ bit++) {
+ bufferlist thisbit;
+ thisbit.substr_of(((OSDWrite*)wr)->bl, bit->first, bit->second);
+ cur.claim_append(thisbit);
}
-
+ assert(cur.length() == ex.length);
+ m->set_data(cur);//.claim(cur);
+ }
+ break;
+ }
+
messenger->send_message(m, osdmap->get_inst(pg.primary()));
}
my %nlinks;
my %names;
my %dirsize;
+my %fnlen;
+my $fnchars;
my $mask = 00170000;
my $ifdir = 0040000;
$nfiles++;
my ($ino, $mode, $nlink) = (lstat($file))[1, 2,3];
+ my $fnlen = length($f);
+ $fnlen{$fnlen}++;
+ $fnchars += $fnlen;
+
if (($mode & $mask) == $ifdir) {
$ndirs++;
push(@q, $file);
}
close DSLOG;
+# avg, median file name len
+# average = total file name characters / number of files scanned;
+# report 0 when nothing was scanned rather than dividing by zero.
+my $avgfnlen = sprintf("%.2f", $nfiles ? $fnchars/$nfiles : 0);
+
+
# stat fs
my $df = `df $base`;
my $line = (split(/\n/,$df))[1]; # second line