# This makes it less annoying to build on non-mpi hosts for dev work, and seems to
# behave just fine... change ${CC} back to mpicxx if you get paranoid.
CC = g++
-CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE -DUSE_EBOFS
+CFLAGS = -pg -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE -DUSE_EBOFS
LIBS = -lpthread -lrt -ldb
#for normal mpich2 machines
private:
struct timeval tv;
+ // Carry whole seconds out of tv_usec into tv_sec.
+ // Uses >= so that a value of exactly 1,000,000 usec is also folded into
+ // tv_sec (with > it was left as-is). Negative tv_usec is not handled here.
+ void normalize() {
+   if (tv.tv_usec >= 1000*1000) {
+     tv.tv_sec += tv.tv_usec / (1000*1000);
+     tv.tv_usec %= 1000*1000;
+   }
+ }
+
public:
// cons
- utime_t() { tv.tv_sec = 0; tv.tv_usec = 0; }
- utime_t(time_t s, int u) { tv.tv_sec = s; tv.tv_usec = u; }
+ utime_t() { tv.tv_sec = 0; tv.tv_usec = 0; normalize(); }
+ utime_t(time_t s, int u) { tv.tv_sec = s; tv.tv_usec = u; normalize(); }
// accessors
time_t sec() const { return tv.tv_sec; }
// --- ebofs ---
ebofs: 0,
- ebofs_commit_interval: 2, // seconds. 0 = no timeout (for debugging/tracing)
- ebofs_oc_size: 1000,
- ebofs_cc_size: 1000,
- ebofs_bc_size: (150 *256), // 4k blocks, *256 for MB
- ebofs_bc_max_dirty: (100 *256), // before write() will block
+ ebofs_commit_ms: 10000, // 0 = no forced commit timeout (for debugging/tracing)
+ ebofs_idle_commit_ms: 100, // 0 = no idle detection. use this -or- bdev_idle_kick_after_ms
+ ebofs_oc_size: 10000, // onode cache
+ ebofs_cc_size: 10000, // cnode cache
+ ebofs_bc_size: (150 *256), // 4k blocks, *256 for MB
+ ebofs_bc_max_dirty: (100 *256), // before write() will block
ebofs_abp_zero: false, // zero newly allocated buffers (may shut up valgrind)
ebofs_abp_max_alloc: 4096*16, // max size of new buffers (larger -> more memory fragmentation)
// --- block device ---
bdev_iothreads: 1, // number of ios to queue with kernel
- bdev_idle_kick_after_ms: 100, // ms
+ bdev_idle_kick_after_ms: 0,//100, // ms ** FIXME ** this seems to break things, not sure why yet **
bdev_el_fw_max_ms: 1000, // restart elevator at least once every 1000 ms
bdev_el_bw_max_ms: 300, // restart elevator at least once every 300 ms
bdev_el_bidir: true, // bidirectional elevator?
else if (strcmp(args[i], "--ebofs") == 0)
g_conf.ebofs = 1;
- else if (strcmp(args[i], "--ebofs_commit_interval") == 0)
- g_conf.ebofs_commit_interval = atoi(args[++i]);
+ else if (strcmp(args[i], "--ebofs_commit_ms") == 0)
+ g_conf.ebofs_commit_ms = atoi(args[++i]);
else if (strcmp(args[i], "--fakestore") == 0) {
else if (strcmp(args[i], "--bdev_iothreads") == 0)
g_conf.bdev_iothreads = atoi(args[++i]);
+ else if (strcmp(args[i], "--bdev_idle_kick_after_ms") == 0)
+ g_conf.bdev_idle_kick_after_ms = atoi(args[++i]);
else {
// ebofs
int ebofs;
- int ebofs_commit_interval;
+ int ebofs_commit_ms;
+ int ebofs_idle_commit_ms;
int ebofs_oc_size;
int ebofs_cc_size;
off_t ebofs_bc_size;
io_wakeup.WaitInterval(lock, utime_t(0, g_conf.bdev_idle_kick_after_ms*1000));
// should we still be sleeping? (did we get woken up, or did timer expire?
- if (io_queue.empty()) {
+ if (io_queue.empty() && io_threads_running == 0) {
idle_kicker->kick(); // kick
io_wakeup.Wait(lock); // and wait
}
return 0;
}
+bool BlockDevice::is_idle()  // true when io_threads_running is 0 and io_queue is empty
+{
+
+ lock.Lock();  // both fields are read while holding the device lock
+ bool idle = (io_threads_running == 0) && io_queue.empty();
+ lock.Unlock();
+ return idle;  // snapshot only: the lock has already been released here
+}
+
void BlockDevice::do_io(int fd, list<biovec*>& biols)
{
int r;
//
int count_io(block_t start, block_t len);
+ bool is_idle();
+
// ** blocking interface **
while (mounted) {
// wait for kick, or timeout
- if (g_conf.ebofs_commit_interval) {
- dout(10) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_interval << " seconds" << endl;
- commit_cond.WaitInterval(ebofs_lock, utime_t(g_conf.ebofs_commit_interval,0));
+ if (g_conf.ebofs_commit_ms) {
+ if (g_conf.ebofs_idle_commit_ms > 0) {
+ // periodically check for idle block device
+ dout(10) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms,"
+ << g_conf.ebofs_idle_commit_ms << " ms if idle" << endl;
+ // Sleep for up to ebofs_commit_ms in total, but in slices of at most
+ // ebofs_idle_commit_ms so the block device can be polled for idleness.
+ long left = g_conf.ebofs_commit_ms*1000;   // remaining time, in usec
+ while (left > 0) {
+   long next = MIN(left, g_conf.ebofs_idle_commit_ms*1000);
+   // Wait for one slice only. Waiting for 'left' here would sleep for the
+   // whole remaining commit interval before the first idle check.
+   if (commit_cond.WaitInterval(ebofs_lock, utime_t(0, next)) != ETIMEDOUT)
+     break; // we got kicked
+   if (dev.is_idle()) {
+     dout(10) << "commit_thread bdev is idle, early commit" << endl;
+     break; // dev is idle
+   }
+   left -= next;
+ }
+ } else {
+ // normal wait+timeout
+ dout(10) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms" << endl;
+ commit_cond.WaitInterval(ebofs_lock, utime_t(0, g_conf.ebofs_commit_ms*1000));
+ }
+
} else {
// DEBUG.. wait until kicked
- dout(10) << "commit_thread no commit_interval, waiting until kicked" << endl;
+ dout(10) << "commit_thread no commit_ms, waiting until kicked" << endl;
commit_cond.Wait(ebofs_lock);
}
// drop lock while we deliver
incoming_lock.Unlock();
- while (in.size()) {
+ while (!in.empty()) {
Message *m = in.front();
in.pop_front();
# hi there
{
- 'n' => 30,
+ # startup
+ 'n' => 30, # mpi nodes
+ 'sleep' => 10, # seconds between runs
'nummds' => 1,
'numosd' => 8,
'numclient' => 400,#[10, 50, 100, 200, 400],
+
+ # parameters
'fs' => [ 'ebofs', 'fakestore' ],
- 'until' => 150,
+ 'until' => 150, # --syn until $n ... when to stop clients
'writefile' => 1,
'writefile_size' => [ 4096, 65526, 256000, 1024000, 2560000 ],
'writefile_mb' => 1000,
+
+ 'custom' => '--tcp_skip_rank0 --osd_maxthreads 0',
+
+ # for final summation (script/sum.pl)
'start' => 30,
'end' => 120
};
=cut
-
my $in = shift || die;
my $out = shift || die;
+$out = $in . "." . $out;
+my $fake = shift;
+
+print "in $in
+out $out/
+";
-die "$out exists" if -d $out;
-my $raw = `cat $in`;
+# get input
+my $raw = `cat log/$in`;
my $sim = eval $raw;
-die "bash input" unless ref $sim;
+unless (ref $sim) {
+ print "bad input: log/$in\n";
+ system "perl -c log/$in";
+ exit 1;
+}
+
+# prep output: the directory has to exist before anything is written into it
+system "mkdir log/$out" unless -d "log/$out";
+
+# keep a copy of the input spec next to the results.
+# Opened explicitly for writing; without a mode the handle is read-only and
+# the print below silently does nothing.
+open(my $in_copy, '>', "log/$out/in") or die "can't write log/$out/in: $!";
+print $in_copy $raw;
+close $in_copy or die "can't close log/$out/in: $!";
+
sub iterate {
my $sim = shift @_;
sub run {
my $h = shift @_;
- my $fn = join(",", map {"$_=$h->{$_}"} sort keys %$h);
+ # Build the per-run log directory name from the parameters that hold a
+ # reference in the input spec (those are the only ones included).
+ # Keys are sorted so the name is the same on every invocation; the
+ # ".done" check below depends on finding the same directory again, and
+ # Perl does not guarantee a stable hash order.
+ my @fn;
+ for my $k (sort keys %$sim) {
+   next unless ref $sim->{$k};
+   push(@fn, "$k=$h->{$k}");
+ }
+ my $fn = join(",", @fn);
+ $fn =~ s/ /_/g;   # no spaces in directory names
$fn = $out . '/' . $fn if $out;
+ if (-e "log/$fn/.done") {
+ print "already done.\n";
+ return 1;
+ }
+ system "rm -r log/$fn" if -d "log/$fn";
+ system "mkdir log/$fn" unless -d "log/$fn";
+
my $c = "mpiexec -l -n $h->{'n'} ./tcpsyn --mkfs --nummds $h->{'nummds'} --numclient $h->{'numclient'} --numosd $h->{'numosd'}";
$c .= " --$h->{'fs'}";
-
$c .= " --syn until $h->{'until'}" if $h->{'until'};
$c .= " --syn writefile $h->{'writefile_mb'} $h->{'writefile_size'}" if $h->{'writefile'};
- $c .= " --log $fn";
+ $c .= ' ' . $h->{'custom'} if $h->{'custom'};
+ $c .= " --log_name $fn";
print "-> $c\n";
- #system "$c > o";
-
+ my $r;
+ unless ($fake) {
+ $r = system "$c > log/$fn/o";
+ system "script/sum.pl -start $h->{'start'} -end $h->{'end'} log/$fn/osd* > log/$fn/sum.osd";
+ system "script/sum.pl -start $h->{'start'} -end $h->{'end'} log/$fn/mds* > log/$fn/sum.mds"
+ if -e "log/$fn/mds1";
+ system "script/sum.pl -start $h->{'start'} -end $h->{'end'} log/$fn/clnode* > log/$fn/sum.cl"
+ if -e "log/$fn/clnode.1";
+ if ($r) {
+ print "r = $r\n";
+ } else {
+ system "touch log/$fn/.done";
+ }
+ }
+ return $r;
}
my @r = &iterate($sim);
my $n = scalar(@r);
my $c = 1;
+my %r;
+my $nfailed = 0;
for my $h (@r) {
- print "$c/$n: ";
- &run($h);
+ print "$c/$n";
+ print " ($nfailed failed)" if $nfailed;
+ print ": ";
+ my $r = &run($h);
+
+ if ($r == 1) {
+   # already done: run() returned early, nothing was launched, no pause needed
+ } else {
+   # any other true value is a nonzero status from the run itself.
+   # (Previously this count sat in an unreachable elsif and stayed at 0.)
+   $nfailed++ if $r;
+   print "sleep $h->{'sleep'}\n";
+   sleep $h->{'sleep'};
+ }
+
$c++;
}
+print "$nfailed failed\n";