int iarg1 = iargs.front();
iargs.pop_front();
if (run_me()) {
- dout(3) << "sleep " << iarg1 << endl;
+ dout(2) << "sleep " << iarg1 << endl;
sleep(iarg1);
}
}
public:
DecayCounter() : val(0) {
- set_halflife( g_conf.mds_bal_interval );
+ set_halflife( g_conf.mds_decay_halflife );
reset();
}
+ /*
DecayCounter(double hl) : val(0) {
set_halflife(hl);
reset();
}
+ */
void adjust(double a) {
decay();
mds_cache_size: MDS_CACHE_SIZE,
mds_cache_mid: .7,
+ mds_decay_halflife: 30,
+
mds_log: true,
mds_log_max_len: MDS_CACHE_SIZE / 3,
mds_log_max_trimming: 10000,
mds_log_before_reply: true,
mds_log_flush_on_shutdown: true,
- mds_bal_replicate_threshold: 4000,
- mds_bal_unreplicate_threshold: 500,
- mds_bal_hash_rd: 5000,
- mds_bal_unhash_rd: 500,
- mds_bal_hash_wr: 2000,
- mds_bal_unhash_wr: 250,
+ mds_bal_replicate_threshold: 2000,
+ mds_bal_unreplicate_threshold: 0,//500,
+ mds_bal_hash_rd: 10000,
+ mds_bal_unhash_rd: 1000,
+ mds_bal_hash_wr: 10000,
+ mds_bal_unhash_wr: 1000,
mds_bal_interval: 30, // seconds
mds_bal_hash_interval: 5, // seconds
mds_bal_idle_threshold: .1,
else if (strcmp(args[i], "--mds_log_flush_on_shutdown") == 0)
g_conf.mds_log_flush_on_shutdown = atoi(args[++i]);
+ else if (strcmp(args[i], "--mds_decay_halflife") == 0)
+ g_conf.mds_decay_halflife = atof(args[++i]); // mds_decay_halflife is declared float; atoi would truncate fractional half-lives
+
else if (strcmp(args[i], "--mds_bal_interval") == 0)
g_conf.mds_bal_interval = atoi(args[++i]);
else if (strcmp(args[i], "--mds_bal_rep") == 0)
else if (strcmp(args[i], "--mds_bal_max_until") == 0)
g_conf.mds_bal_max_until = atoi(args[++i]);
+ else if (strcmp(args[i], "--mds_bal_hash_rd") == 0)
+ g_conf.mds_bal_hash_rd = atoi(args[++i]);
+ else if (strcmp(args[i], "--mds_bal_hash_wr") == 0)
+ g_conf.mds_bal_hash_wr = atoi(args[++i]);
+ else if (strcmp(args[i], "--mds_bal_unhash_rd") == 0)
+ g_conf.mds_bal_unhash_rd = atoi(args[++i]);
+ else if (strcmp(args[i], "--mds_bal_unhash_wr") == 0)
+ g_conf.mds_bal_unhash_wr = atoi(args[++i]);
+
else if (strcmp(args[i], "--mds_bal_mode") == 0)
g_conf.mds_bal_mode = atoi(args[++i]);
else if (strcmp(args[i], "--mds_bal_min_start") == 0)
// mds
int mds_cache_size;
float mds_cache_mid;
+
+ float mds_decay_halflife;
bool mds_log;
int mds_log_max_len;
#'trace' => ['make.lib', 'make.include'],
- 'mds_bal_interval' => 90,
+ 'mds_bal_interval' => 45,
+ 'mds_bal_max' => 4,
'mds_decay_halflife' => 30,
'mds_bal_rep' => 1700,
+
'cper' => 100,# [ 50, 100 ],
'_dep' => [ 'cnode' => '$nummds',
'numclient' => '$cnode * $cper',
'numosd' => '$nummds * 2',
'n' => '1 + $cnode + $nummds + $numosd',
- 'custom' => '"--tcp_skip_rank0 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.include 1 --syn sleep 30 --syn trace traces/openssh/make.include 1000"' ],
+ 'custom' => '"--tcp_skip_rank0 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.include 1 --syn sleep 60 --syn trace traces/openssh/make.include 1000"' ],
# parameters
'fs' => ['fakestore'],
{
'sleep' => 3,
- 'nummds' => [1, 2, 4, 7], # googoo
+ 'nummds' => [1, 2, 4, 7], # googoo
#'nummds' => [1, 2, 4, 6, 8, 12, 16, 24, 32, 40, 48, 64], # alc
- 'cper' => 200,#[50, 100, 200],
- '_dep' => [ 'cnode' => '$nummds',
- 'numclient' => '$cnode * $cper',
- 'numosd' => '$nummds * 2',
- 'n' => '1 + $cnode + $nummds + $numosd' ],
# parameters
- #'fs' => 'ebofs',
- 'fs' => 'fakestore',
+ 'fs' => 'ebofs',
+ #'fs' => 'fakestore',
- 'until' => 400, # --syn until $n ... when to stop clients
- 'kill_after' => 500,
- 'start' => 200,
- 'end' => 400,
+ 'until' => 300, # --syn until $n ... when to stop clients
+ 'kill_after' => 400,
+ 'start' => 150,
+ 'end' => 300,
- 'mds_bal_interval' => [30, 60],
- 'mds_bal_max' => [1,2,3],
+ 'mds_bal_interval' => 90,#[60, 90],
+ #'mds_bal_max' => [3,4,5],
+ 'mds_bal_max' => 4,
+ 'mds_decay_halflife' => 30,#[15, 25, 30, 45, 60],
+ 'mds_bal_rep' => 1500,#[1000, 1500, 2000],
+
+ 'decay_hl' => 100,#[ 25, 50, 100, 150 ],
- # 'makedirs' => 1,
- #'makedirs_dirs' => 10,
- #'makedirs_files' => 10,
- #'makedirs_depth' => 3,
+ 'cper' => 100, #[50, 75, 100, 125, 150, 200],
+ '_dep' => [ 'cnode' => '$nummds',
+ 'numclient' => '$cnode * $cper',
+ 'numosd' => '$nummds * 2',
+ 'n' => '1 + $cnode + $nummds + $numosd',
+ 'mds_bal_rep' => '$mds_decay_halflife * $decay_hl'],
- 'custom' => '--tcp_skip_rank0 --syn only 0 --syn trace traces/trace.lib 1 --syn sleep 10 --syn randomsleep 30 --syn trace traces/trace.openssh.lib 100 --debug_mds_balancer 10 --mds_bal_max 2',
+ 'custom' => '--tcp_skip_rank0 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn randomsleep 30 --syn trace traces/openssh/make.lib 100 --debug_mds_balancer 1 --mds_shutdown_check 60',
#'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5',
# for final summation (script/sum.pl)
'comb' => {
- 'x' => 'nummds',
+ 'x' => 'nummds',#decay_hl',#'nummds',
'vars' => [ 'mds.req' ]
}
};
+#!/usr/bin/perl
# hi there
{
# startup
# ],
'writefile_mb' => 1000,
- 'osd_pg_bits' => [6, 8, 10, 12, 14],
+ 'osd_pg_bits' => 12,#[6, 8, 10, 12, 14],
# 'osd_object_layout' => [ 'hash', 'hashino', 'linear' ],
'osd_pg_layout' => [ 'crush', 'linear', 'hash' ],
'end' => 90,
'comb' => {
- 'x' => 'osd_pg_bits',
+ 'x' => 'numosd',#'osd_pg_bits',
'vars' => [ 'osd.c_wrb', 'osd.c_wr' ],
# 'maptitle' => { 'osd_object_layout=' => '',
# ',osd_pg_layout=' => ' + '}
if (dir.is_dirty()) out << " dirty";
if (dir.is_import()) out << " import";
if (dir.is_export()) out << " export";
+ if (dir.is_rep()) out << " repl";
if (dir.is_hashed()) out << " hashed"; //=" << (int)dir.get_inode()->inode.hash_seed;
if (dir.is_auth()) {
out << " auth";
put(CDIR_PIN_OPENED);
}
void open_by_clear() {
- if (open_by.size())
+ if (!open_by.empty())
put(CDIR_PIN_OPENED);
open_by.clear();
open_by_nonce.clear();
dir->replica_nonce = 0; // no longer defined
+ if (!dir->open_by.empty())
+ dout(0) << "open_by not empty non import, " << *dir << ", " << dir->open_by << endl;
+
dir->dir_rep_by = rep_by;
dir->open_by = open_by;
dout(12) << "open_by in export is " << open_by << ", dir now " << dir->open_by << endl;
multimap<double,int> load_map;
for (int i=0; i<cluster_size; i++) {
double l = mds_load[i].mds_load();
- dout(5) << " mds" << i << " load " << mds_load[i] << " -> " << l << endl;
+ if (whoami == 0)
+ dout(-5) << " mds" << i << " load " << mds_load[i] << " -> " << l << endl;
total_load += l;
if (whoami == i) my_load = l;
CInode *in = it->second->get_inode();
if (!in) continue;
if (!in->is_dir()) continue;
- if (!in->dir) continue; // clearly not popular
+ if (!in->dir) continue; // clearly not popular
if (in->dir->is_export()) continue;
if (in->dir->is_hashed()) continue;
// how popular?
double pop = in->dir->popularity[MDS_POP_CURDOM].meta_load();
- //cout << " in " << in->inode.ino << " " << pop << endl;
+ dout(20) << " pop " << pop << " " << *in->dir << endl;
if (pop < minchunk) continue;
return;
}
- // apprently not enough; drill deeper into the hierarchy
+ // apparently not enough; drill deeper into the hierarchy (if non-replicated)
for (list<CDir*>::iterator it = bigger.begin();
it != bigger.end();
it++) {
+ if ((*it)->is_rep()) continue;
dout(7) << " descending into " << **it << endl;
find_exports(*it, amount, exports, have, already_exporting);
if (have > needmin)
return;
}
+ // ok fine, drill into replicated dirs
+ for (list<CDir*>::iterator it = bigger.begin();
+ it != bigger.end();
+ it++) {
+ if (!(*it)->is_rep()) continue;
+ dout(7) << " descending into replicated " << **it << endl;
+ find_exports(*it, amount, exports, have, already_exporting);
+ if (have > needmin)
+ return;
+ }
}
float v = dir->popularity[MDS_POP_JUSTME].pop[type].hit();
// hit modify counter, if this was a modify
- if (g_conf.num_mds > 1 &&
+ if (g_conf.num_mds > 2 && // FIXME >2 thing
dir->is_auth() &&
!dir->inode->is_root()) { // not root (for now at least)
// hash this dir? (later?)
bool anydom = dir->is_auth();
bool curdom = dir->is_auth();
+ float rd_adj = 0.0;
// replicate?
float dir_pop = dir->popularity[MDS_POP_CURDOM].pop[type].get(); // hmm??
if (!dir->is_rep() &&
dir_pop >= g_conf.mds_bal_replicate_threshold) {
// replicate
- dout(1) << "replicating dir " << *dir << " pop " << dir_pop << endl;
+ float rdp = dir->popularity[MDS_POP_JUSTME].pop[META_POP_RD].get();
+ rd_adj = rdp / mds->get_cluster()->get_num_mds() - rdp;
+ rd_adj /= 2.0; // temper somewhat
+
+ dout(1) << "replicating dir " << *dir << " pop " << dir_pop << " .. rdp " << rdp << " adj " << rd_adj << endl;
dir->dir_rep = CDIR_REP_ALL;
mds->mdcache->send_dir_updates(dir, true);
+
+ dir->popularity[MDS_POP_JUSTME].pop[META_POP_RD].adjust(rd_adj);
+ dir->popularity[MDS_POP_CURDOM].pop[META_POP_RD].adjust(rd_adj);
}
if (dir->is_rep() &&
dir->popularity[MDS_POP_NESTED].pop[type].hit();
in->popularity[MDS_POP_NESTED].pop[type].hit();
+ if (rd_adj != 0.0) dir->popularity[MDS_POP_NESTED].pop[META_POP_RD].adjust(rd_adj);
+
if (anydom) {
dir->popularity[MDS_POP_ANYDOM].pop[type].hit();
in->popularity[MDS_POP_ANYDOM].pop[type].hit();
public:
C_MDC_ShutdownCheck(MDCache *m) : mdc(m) {}
void finish(int) {
- cout << "shutdown_check at " << g_clock.now() << endl;
- int o = g_conf.debug_mds;
- g_conf.debug_mds = 10;
- mdc->show_cache();
- g_conf.debug_mds = o;
- g_timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(mdc));
+ mdc->shutdown_check();
}
};
+void MDCache::shutdown_check() // shutdown diagnostic: logs cache/log/export/filer state and re-arms itself on g_timer
+{
+ dout(0) << "shutdown_check at " << g_clock.now() << endl;
+
+ // cache
+ int o = g_conf.debug_mds; // remember the current MDS debug level
+ g_conf.debug_mds = 10; // temporarily raise it -- presumably so show_cache() output is emitted; confirm against dout's level check
+ show_cache();
+ g_conf.debug_mds = o; // restore the saved level
+ g_timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this)); // schedule the next check; C_MDC_ShutdownCheck::finish() calls shutdown_check() again
+
+ // other state: lru size, log length, remaining exports, filer activity
+ dout(0) << "lru size now " << lru.lru_get_size() << endl;
+ dout(0) << "log len " << mds->mdlog->get_num_events() << endl;
+
+
+ if (exports.size()) // only reported when some exports remain
+ dout(0) << "still have " << exports.size() << " exports" << endl;
+
+ if (mds->filer->is_active())
+ dout(0) << "filer still active" << endl;
+}
+
void MDCache::shutdown_start()
{
dout(1) << "shutdown_start" << endl;
dout(7) << "traverse: not auth for " << path << " at " << path[depth] << ", fwd to mds" << dauth << endl;
if (is_client_req && cur->dir->is_rep()) {
- dout(15) << "traverse: REP fw to mds" << dauth << ", requesting rep under " << *cur->dir << endl;
+ dout(15) << "traverse: REP fw to mds" << dauth << ", requesting rep under " << *cur->dir << " req " << *(MClientRequest*)req << endl;
((MClientRequest*)req)->set_mds_wants_replica_in_dirino(cur->dir->ino());
req->clear_payload(); // reencode!
}
int nonce = it->second;
if (!in) {
- dout(7) << "inode_expire on " << it->first << " from " << from << ", don't have it" << endl;
- assert(in); // OOPS i should be authority, or recent authority (and thus frozen).
+ dout(0) << "inode_expire on " << hex << it->first << dec << " from " << from << ", don't have it" << endl;
+ assert(in); // i should be authority, or proxy .. and pinned
}
if (!in->is_auth()) {
int newauth = in->authority();
dout(7) << "proxy inode expire on " << *in << " to " << newauth << endl;
assert(newauth >= 0);
+ if (!in->state_test(CINODE_STATE_PROXY)) dout(0) << "missing proxy bit on " << *in << endl;
assert(in->state_test(CINODE_STATE_PROXY));
if (proxymap.count(newauth) == 0) proxymap[newauth] = new MCacheExpire(from);
proxymap[newauth]->add_inode(it->first, it->second);
int nonce = it->second;
if (!dir) {
- dout(7) << "dir_expire on " << it->first << " from " << from << ", don't have it" << endl;
- assert(dir); // OOPS i should be authority, or recent authority (and thus frozen).
+ dout(0) << "dir_expire on " << it->first << " from " << from << ", don't have it" << endl;
+ assert(dir); // i should be authority, or proxy ... and pinned
}
if (!dir->is_auth()) {
int newauth = dir->authority();
}
// update
- dout(1) << "dir_update on " << *in->dir << endl;
+ dout(5) << "dir_update on " << *in->dir << endl;
in->dir->dir_rep = m->get_dir_rep();
in->dir->dir_rep_by = m->get_dir_rep_by();
*/
void MDCache::hash_dir(CDir *dir)
{
- dout(7) << "hash_dir " << *dir << endl;
+ dout(-7) << "hash_dir " << *dir << endl;
assert(!dir->is_hashed());
assert(dir->is_auth());
void MDCache::unhash_dir(CDir *dir)
{
- dout(7) << "unhash_dir " << *dir << endl;
+ dout(-7) << "unhash_dir " << *dir << endl;
assert(dir->is_hashed());
assert(!dir->is_unhashing());
// shutdown
void shutdown_start();
+ void shutdown_check();
bool shutdown_pass();
bool shutdown(); // clear cache (ie at shutodwn)
case 0:
return root.pop[META_POP_RD].get()
+ 2.0*root.pop[META_POP_WR].get()
+ + req_rate
+ 10.0*queue_len;
case 1:
system "mkdir -p $out" unless -d "$out";
+sub reset { # recovery hook run after a failed job; invoked as &reset, which calls this sub rather than Perl's builtin reset
+ print "reset: restarting mpd in 3 seconds\n";
+ system "sleep 3 && (mpiexec -l -n 32 killall tcpsyn ; restartmpd.sh)"; # wait 3s, run killall tcpsyn through mpiexec with -n 32, then run restartmpd.sh; exit status is not checked
+ print "reset: done\n";
+}
+
if (`hostname` =~ /alc/) {
print "# this looks like alc\n";
$sim->{'_psub'} = 'jobs/alc.tp';
for my $k ('nummds', 'numclient', 'numosd', 'kill_after',
'osd_maxthreads', 'osd_object_layout', 'osd_pg_layout','osd_pg_bits',
+ 'mds_bal_rep', 'mds_bal_interval', 'mds_bal_max','mds_decay_halflife',
+ 'mds_bal_hash_rd','mds_bal_hash_wr','mds_bal_unhash_rd','mds_bal_unhash_wr',
'bdev_el_bidir', 'ebofs_idle_commit_ms', 'ebofs_commit_ms',
'ebofs_oc_size','ebofs_cc_size','ebofs_bc_size','ebofs_bc_max_dirty','ebofs_abp_max_alloc',
'file_layout_ssize','file_layout_scount','file_layout_osize','file_layout_num_rep',
$r = system "$l $c > $fn/o";
if ($r) {
print "r = $r\n";
+ &reset;
} else {
system "touch $fn/.done";
}
args = nargs;
if (!args.empty()) {
for (unsigned i=0; i<args.size(); i++)
- cout << "stray arg " << args[i] << endl;
+ cerr << "stray arg " << args[i] << endl;
}
assert(args.empty());