# on issdm, it's /usr/local/mpich2/bin.
# Hook for extra -I options, etc.
-EXTRA_CFLAGS = #-I${HOME}/include -L${HOME}/lib
+EXTRA_CFLAGS = -pg -g #-I${HOME}/include -L${HOME}/lib
# base
-CFLAGS = -O3 -pg -g -Wall -I. -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE ${EXTRA_CFLAGS}
+CFLAGS = -Wall -I. -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE ${EXTRA_CFLAGS}
LDINC = ld -i -o
CC = g++
LIBS = -pthread
# fake*
fakefuse: fakefuse.cc mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o client/fuse_ll.o msg/FakeMessenger.o common.o
- ${CC} -pg ${CFLAGS} ${LIBS} -lfuse $^ -o $@
+ ${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@
fakesyn: fakesyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/FakeMessenger.o common.o
- ${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@
+ ${CC} ${CFLAGS} ${LIBS} $^ -o $@
# mpi startup
newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/SimpleMessenger.o common.o
- ${MPICC} -pg ${MPICFLAGS} ${MPILIBS} $^ -o $@
+ ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@
# ebofs
mkfs.ebofs: ebofs/mkfs.ebofs.cc config.cc common/Clock.o ebofs.o
- ${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@
+ ${CC} ${CFLAGS} ${LIBS} $^ -o $@
test.ebofs: ebofs/test.ebofs.cc config.cc common/Clock.o ebofs.o
- ${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@
+ ${CC} ${CFLAGS} ${LIBS} $^ -o $@
dupstore: dupstore.cc config.cc ebofs.o common/Clock.o common/Timer.o osd/FakeStore.o
- ${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@
+ ${CC} ${CFLAGS} ${LIBS} $^ -o $@
# hadoop
iterator& operator++() {
assert(p != ls.end());
advance(1);
+ return *this;
}
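The added `return *this;` is the substantive fix in this hunk: flowing off the end of a value-returning function is undefined behavior, so any caller that used the result of `++it` was reading garbage. A minimal standalone sketch of the pre-increment contract (simplified types; the real iterator wraps the class's `p` and `ls` members):

    #include <cassert>
    #include <list>

    struct iter {
      std::list<int>::iterator p, end;
      void advance(int n) { while (n--) ++p; }
      iter& operator++() {
        assert(p != end);   // same guard as the patch's assert(p != ls.end())
        advance(1);
        return *this;       // without this line, ++i is undefined behavior
      }
    };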
// copy data out.
dout(-5) << " exporting idle (" << pop << ") import " << *im
<< " back to mds" << im->inode->authority().first
<< dendl;
- mds->mdcache->migrator->export_dir(im, im->inode->authority().first);
+ mds->mdcache->migrator->export_dir_nicely(im, im->inode->authority().first);
continue;
}
dout(-5) << "reexporting " << *dir
<< " pop " << pop
<< " back to mds" << target << dendl;
- mds->mdcache->migrator->export_dir(dir, target);
+ mds->mdcache->migrator->export_dir_nicely(dir, target);
have += pop;
import_from_map.erase(plast);
import_pop_map.erase(pop);
<< " back to mds" << imp->inode->authority()
<< dendl;
have += pop;
- mds->mdcache->migrator->export_dir(imp, imp->inode->authority().first);
+ mds->mdcache->migrator->export_dir_nicely(imp, imp->inode->authority().first);
}
if (amount-have < MIN_OFFLOAD) break;
}
<< " to mds" << target
<< " " << **it
<< dendl;
- mds->mdcache->migrator->export_dir(*it, target);
+ mds->mdcache->migrator->export_dir_nicely(*it, target);
}
}
void MDCache::_logged_subtree_map(off_t off)
{
dout(10) << "_logged_subtree_map at " << off << dendl;
- mds->mdlog->last_subtree_map = off;
+ mds->mdlog->subtree_maps.insert(off);
mds->mdlog->writing_subtree_map = false;
- list<Context*> ls;
- mds->mdlog->take_subtree_map_expire_waiters(ls);
- mds->queue_waiters(ls);
+ mds->mdlog->kick_subtree_map(); // just in case the last segment was empty.
}
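_logged_subtree_map now records every committed map's offset instead of overwriting a single `last_subtree_map`, because each subtree map heads a log segment and segments must expire oldest-first; `kick_subtree_map()` (defined later in MDLog.h) then completes at most the oldest segment's waiters. A sketch of the bookkeeping this sets up (names from this patch, simplified standalone types):

    #include <sys/types.h>
    #include <set>

    // Each committed subtree map heads a log segment; keeping every live
    // head in an ordered set makes both ends cheap to query. Callers must
    // check for emptiness first, as the patch's asserts do.
    struct SegmentHeads {
      std::set<off_t> subtree_maps;   // ascending start offsets
      off_t oldest() const { return *subtree_maps.begin(); }
      off_t newest() const { return *subtree_maps.rbegin(); }  // cf. get_last_subtree_map_offset()
    };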
// should we log a new import_map?
// FIXME: should this go elsewhere?
- if (last_subtree_map && !writing_subtree_map &&
+ if (!writing_subtree_map &&
(journaler->get_write_pos() / log_inode.layout.period()) !=
- (last_subtree_map / log_inode.layout.period()) &&
- (journaler->get_write_pos() - last_subtree_map > log_inode.layout.period()/2)) {
+ (get_last_subtree_map_offset() / log_inode.layout.period()) &&
+ (journaler->get_write_pos() - get_last_subtree_map_offset() > log_inode.layout.period()/2)) {
// log import map
- dout(10) << "submit_entry also logging subtree map: last = " << last_subtree_map
+ dout(10) << "submit_entry also logging subtree map: last = " << get_last_subtree_map_offset()
<< ", cur pos = " << journaler->get_write_pos() << dendl;
mds->mdcache->log_subtree_map();
}
trim(NULL);
}
-
+void MDLog::cap()
+{
+ dout(5) << "cap" << dendl;
+ capped = true;
+ kick_subtree_map();
+}
// trim
dout(7) << "trimmed : " << le->get_start_off() << " : " << *le << dendl;
- if (trimming.begin()->first == le->_end_off) {
- // we trimmed off the front!
+ bool kick = false;
+
+ map<off_t,LogEvent*>::iterator p = trimming.begin();
+ if (p->first == le->_start_off) {
+ // we trimmed off the front! it must have been a segment head.
+ assert(!subtree_maps.empty());
+ assert(p->first == *subtree_maps.begin());
+ subtree_maps.erase(subtree_maps.begin());
+
// we can expire the log a bit.
- journaler->set_expire_pos(le->_end_off);
+ off_t to = get_trimmed_to();
+ journaler->set_expire_pos(to);
journaler->trim();
+
+ kick = true;
+ } else {
+ p++;
+
+ // is the next one us?
+ if (le->_start_off == p->first) {
+ p++;
+
+ // did we empty a segment?
+ if (subtree_maps.size() >= 2) {
+ set<off_t>::iterator segp = subtree_maps.begin();
+ assert(*segp < le->_end_off);
+ segp++;
+ dout(20) << "i ended at " << le->get_end_off()
+ << ", next seg starts at " << *segp
+ << ", next trimming is " << (p == trimming.end() ? 0:p->first)
+ << dendl;
+ if (*segp >= le->_end_off &&
+ (p == trimming.end() ||
+ p->first >= *segp)) {
+ dout(10) << "_trimmed segment looks empty" << dendl;
+ kick = true;
+ }
+ } else if (capped && trimming.size() < 3) {
+ kick = true; // blech, imprecise
+ }
+ }
}
- trimming.erase(le->_end_off);
+ trimming.erase(le->_start_off);
delete le;
-
+
+ if (kick)
+ kick_subtree_map();
+
logger->inc("trimf");
logger->set("trimng", trimming.size());
logger->set("rdpos", journaler->get_read_pos());
// trim!
dout(7) << "trim expiring : " << le->get_start_off() << " : " << *le << dendl;
- trimming[le->_end_off] = le;
+ trimming[le->_start_off] = le;
le->expire(mds, new C_MDL_Trimmed(this, le));
logger->inc("trims");
logger->set("trimng", trimming.size());
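Keying `trimming` by `_start_off` rather than `_end_off` is what makes the segment logic above line up: the map's first key is now directly comparable with the segment head offsets in `subtree_maps`, and `get_trimmed_to()` below can return it as the expire boundary. The "did we empty a segment?" test then condenses to: a segment `[seg, next_seg)` is empty when no still-trimming event starts inside it. A standalone sketch under that keying (`void*` stands in for `LogEvent*`):

    #include <sys/types.h>
    #include <map>

    // A segment [seg, next_seg) is empty once no still-trimming event
    // starts inside it.
    bool segment_empty(off_t seg, off_t next_seg,
                       const std::map<off_t, void*>& trimming) {
      std::map<off_t, void*>::const_iterator p = trimming.lower_bound(seg);
      return p == trimming.end() || p->first >= next_seg;
    }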
bool waiting_for_read;
friend class C_MDL_Reading;
+
+ off_t get_trimmed_to() {
+ if (trimming.empty())
+ return get_read_pos();
+ else
+ return trimming.begin()->first;
+ }
// -- replay --
// -- subtreemaps --
- off_t last_subtree_map; // offsets of last committed subtreemap. constrains trimming.
- list<Context*> subtree_map_expire_waiters;
+ set<off_t> subtree_maps;
+ map<off_t,list<Context*> > subtree_map_expire_waiters;
bool writing_subtree_map; // one is being written now
bool seen_subtree_map; // for recovery
+ friend class ESubtreeMap;
friend class C_MDS_WroteImportMap;
friend class MDCache;
- void init_journaler();
-
- public:
- void reopen_logger(utime_t start);
-
- off_t get_last_subtree_map_offset() { return last_subtree_map; }
- void add_subtree_map_expire_waiter(Context *c) {
- subtree_map_expire_waiters.push_back(c);
- }
- void take_subtree_map_expire_waiters(list<Context*>& ls) {
- ls.splice(ls.end(), subtree_map_expire_waiters);
+ void kick_subtree_map() {
+ if (subtree_map_expire_waiters.empty()) return;
+ list<Context*> ls;
+ ls.swap(subtree_map_expire_waiters.begin()->second);
+ subtree_map_expire_waiters.erase(subtree_map_expire_waiters.begin());
+ finish_contexts(ls);
}
+public:
+ off_t get_last_subtree_map_offset() {
+ assert(!subtree_maps.empty());
+ return *subtree_maps.rbegin();
+ }
+private:
+ void init_journaler();
+
+public:
+ void reopen_logger(utime_t start);
+
// replay state
map<inodeno_t, set<inodeno_t> > pending_exports;
- public:
+public:
MDLog(MDS *m) : mds(m),
num_events(0), max_events(g_conf.mds_log_max_len),
unflushed(0),
logger(0),
trim_reading(false), waiting_for_read(false),
replay_thread(this),
- last_subtree_map(0),
writing_subtree_map(false), seen_subtree_map(false) {
}
~MDLog();
}
bool is_capped() { return capped; }
- void cap() {
- capped = true;
- list<Context*> ls;
- ls.swap(subtree_map_expire_waiters);
- finish_contexts(ls);
- }
+ void cap();
void submit_entry( LogEvent *e, Context *c = 0 );
void wait_for_sync( Context *c );
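Bucketing `subtree_map_expire_waiters` by segment start offset (instead of one flat list) lets `kick_subtree_map()` fire contexts strictly in segment order: each kick completes only the oldest bucket, even if several maps become expirable at once. A standalone sketch of the pattern, with `std::function` standing in for `Context`:

    #include <sys/types.h>
    #include <functional>
    #include <list>
    #include <map>

    struct Waiters {
      std::map<off_t, std::list<std::function<void()> > > by_segment;
      void add(off_t seg, std::function<void()> c) {
        by_segment[seg].push_back(c);
      }
      void kick_oldest() {              // mirrors kick_subtree_map()
        if (by_segment.empty()) return;
        std::list<std::function<void()> > ls;
        ls.swap(by_segment.begin()->second);
        by_segment.erase(by_segment.begin());
        for (std::list<std::function<void()> >::iterator i = ls.begin();
             i != ls.end(); ++i)
          (*i)();
      }
    };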
// send pending import_maps? (these need to go out when all exports have finished.)
cache->maybe_send_pending_resolves();
-
+
cache->show_subtrees();
+
+ maybe_do_queued_export();
}
} else {
// bystander failed.
// ==========================================================
// EXPORT
+void Migrator::export_dir_nicely(CDir *dir, int dest)
+{
+ // enqueue
+ dout(7) << "export_dir_nicely " << *dir << " to " << dest << dendl;
+ export_queue.push_back(pair<dirfrag_t,int>(dir->dirfrag(), dest));
+
+ maybe_do_queued_export();
+}
+
+void Migrator::maybe_do_queued_export()
+{
+ while (!export_queue.empty() &&
+ export_state.size() <= 2) {
+ dirfrag_t df = export_queue.front().first;
+ int dest = export_queue.front().second;
+ export_queue.pop_front();
+
+ CDir *dir = mds->mdcache->get_dirfrag(df);
+ if (!dir) continue;
+ if (!dir->is_auth()) continue;
+
+ dout(-7) << "nicely exporting " << *dir << " to " << dest << dendl;
+
+ export_dir(dir, dest);
+ }
+}
+
+
+
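`export_dir_nicely()` plus `maybe_do_queued_export()` turn balancer-driven exports into a throttled pipeline: enqueue the dirfrag, then drain while at most a couple of exports are in flight (the queue is re-drained from `handle_mds_failure` above and `export_finish` below), re-resolving each dirfrag at dequeue time and dropping entries that disappeared or lost authority while queued. The general shape as a standalone sketch, where `in_flight`, `still_valid`, and `start_export` are hypothetical stand-ins for `export_state.size()`, the dirfrag/auth checks, and `export_dir()`:

    #include <deque>

    // Throttle pattern: queue work, drain while the in-flight count is low,
    // revalidate each item when it is finally dequeued.
    template <typename Item, typename State>
    void drain_queue(std::deque<Item>& q, State& st) {
      while (!q.empty() && st.in_flight() <= 2) {
        Item it = q.front();
        q.pop_front();
        if (!st.still_valid(it)) continue;  // dirfrag gone / not auth anymore
        st.start_export(it);                // may grow the in-flight set
      }
    }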
class C_MDC_ExportFreeze : public Context {
Migrator *mig;
// send pending import_maps?
mds->mdcache->maybe_send_pending_resolves();
+
+ maybe_do_queued_export();
}
map<CDir*,list<Context*> > export_finish_waiters;
+ list< pair<dirfrag_t,int> > export_queue;
// -- imports --
public:
void export_dir(CDir *dir, int dest);
void export_empty_import(CDir *dir);
+ void export_dir_nicely(CDir *dir, int dest);
+ void maybe_do_queued_export();
+
void encode_export_inode(CInode *in, bufferlist& enc_state, int newauth,
map<int,entity_inst_t>& exported_client_map,
utime_t now);
bool ESubtreeMap::has_expired(MDS *mds)
{
- if (mds->mdlog->get_last_subtree_map_offset() > get_start_off()) {
- dout(10) << "ESubtreeMap.has_expired -- there's a newer map" << dendl;
- return true;
- } else if (mds->mdlog->is_capped()) {
- dout(10) << "ESubtreeMap.has_expired -- log is capped, allowing map to expire" << dendl;
+ assert(!mds->mdlog->subtree_maps.empty());
+ set<off_t>::iterator p = mds->mdlog->subtree_maps.begin();
+ off_t first = *p;
+ if (get_start_off() != first) {
+ dout(10) << "ESubtreeMap.has_expired -- not the oldest segment" << dendl;
+ return false;
+ }
+
+ // i am the oldest.
+
+ // capped and last event?
+ if (mds->mdlog->is_capped() &&
+ mds->mdlog->subtree_maps.size() == 1 &&
+ (mds->mdlog->trimming.empty() ||
+ (mds->mdlog->trimming.size() == 1 &&
+ mds->mdlog->trimming.begin()->second == this))) {
+ dout(10) << "ESubtreeMap.has_expired -- capped and last one" << dendl;
return true;
- } else {
- dout(10) << "ESubtreeMap.has_expired -- not until there's a newer map written"
- << " (" << get_start_off() << " >= " << mds->mdlog->get_last_subtree_map_offset() << ")"
- << dendl;
+ }
+
+ p++;
+ if (p == mds->mdlog->subtree_maps.end()) {
+ dout(10) << "ESubtreeMap.has_expired -- only segment" << dendl;
return false;
}
+ off_t next = *p;
+
+ if (mds->mdlog->get_read_pos() < next) {
+ dout(10) << "ESubtreeMap.has_expired -- haven't read this segment, read pos "
+ << mds->mdlog->get_read_pos() << " < next map at " << next
+ << dendl;
+ return false;
+ }
+
+ map<off_t,LogEvent*>::iterator trimp = mds->mdlog->trimming.begin();
+ assert(trimp->first == get_start_off());
+ trimp++;
+ if (trimp != mds->mdlog->trimming.end() &&
+ trimp->first < next) {
+ dout(10) << "ESubtreeMap.has_expired -- segment still trimming at " << trimp->first << dendl;
+ return false;
+ }
+
+ dout(10) << "ESubtreeMap.has_expired -- segment is empty" << dendl;
+ return true;
}
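The rewritten `has_expired()` is a per-segment decision. Condensed: if this map is not the oldest segment head, not yet; if the log is capped and this is essentially the last live event, yes (shutdown path); if there is no newer map, keep the only segment; if the journal read position has not passed the next map, the segment is not yet replay-safe; if another event of this segment is still trimming, wait; otherwise the segment is empty and the map can expire. A standalone condensation under the patch's own assumptions (`trimming` keyed by start offsets, `maps` non-empty, this event at the front of `trimming` as asserted above):

    #include <sys/types.h>
    #include <map>
    #include <set>

    bool subtree_map_expired(off_t start, bool capped, off_t read_pos,
                             const std::set<off_t>& maps,
                             const std::map<off_t, void*>& trimming) {
      if (start != *maps.begin())
        return false;                           // not the oldest segment
      if (capped && maps.size() == 1 && trimming.size() <= 1)
        return true;                            // shutdown: last event(s);
                                                // patch also checks the lone
                                                // trimming entry is this map
      std::set<off_t>::const_iterator p = maps.begin();
      ++p;
      if (p == maps.end())
        return false;                           // only segment; keep it
      if (read_pos < *p)
        return false;                           // segment not fully read
      if (!trimming.empty()) {
        std::map<off_t, void*>::const_iterator t = trimming.begin();
        ++t;                                    // skip this event itself
        if (t != trimming.end() && t->first < *p)
          return false;                         // segment still trimming
      }
      return true;                              // segment is empty
    }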
void ESubtreeMap::expire(MDS *mds, Context *c)
{
dout(10) << "ESubtreeMap.has_expire -- waiting for a newer map to be written (or for shutdown)" << dendl;
- mds->mdlog->add_subtree_map_expire_waiter(c);
+ mds->mdlog->subtree_map_expire_waiters[get_start_off()].push_back(c);
}
void ESubtreeMap::replay(MDS *mds)
{
+ // note location
+ mds->mdlog->subtree_maps.insert(get_start_off());
+
if (mds->mdcache->is_subtrees()) {
dout(10) << "ESubtreeMap.replay -- ignoring, already have import map" << dendl;
} else {
int num_osds() { return osds.size(); }
void get_all_osds(set<int>& ls) { ls = osds; }
- const set<int>& get_osds() { return osds; }
- const map<int,bool>& get_down_osds() { return down_osds; }
- const set<int>& get_out_osds() { return out_osds; }
- const map<int,float>& get_overload_osds() { return overload_osds; }
+ const set<int32_t>& get_osds() { return osds; }
+ const map<int32_t,bool>& get_down_osds() { return down_osds; }
+ const set<int32_t>& get_out_osds() { return out_osds; }
+ const map<int32_t,float>& get_overload_osds() { return overload_osds; }
bool exists(int osd) { return osds.count(osd); }
bool is_down(int osd) { return down_osds.count(osd); }