# This makes it less annoying to build on non-mpi hosts for dev work, and seems to
# behave just fine... change ${CC} back to mpicxx if you get paranoid.
CC = g++
-CFLAGS = -pg -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE -DUSE_EBOFS
+CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE -DUSE_EBOFS
LIBS = -lpthread -lrt -ldb
#for normal mpich2 machines
${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@
# + obfs
+fakesynobfs: fakesyn.cc mds.o client/Client.o client/Buffercache.o osd/OSD.cc osd/OBFSStore.o msg/FakeMessenger.o ${COMMON_OBJS} ${SYN_OBJS}
+ ${CC} -DUSE_OBFS ${CFLAGS} ${LIBS} $^ -o $@ ../uofs/uofs.a
+
tcpsynobfs: tcpsyn.cc mds.o client/Client.o client/Buffercache.o osd/OSD.cc osd/OBFSStore.o ${TCP_OBJS} ${COMMON_OBJS} ${SYN_OBJS}
${MPICC} -DUSE_OBFS ${MPICFLAGS} ${MPILIBS} $^ -o $@ ../uofs/uofs.a
# ebofs
-ebofs: mkfs.ebofs test.ebofs
mkfs.ebofs: ebofs/mkfs.ebofs.cc config.cc common/Clock.o ebofs.o
${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@
test.ebofs: ebofs/test.ebofs.cc config.cc common/Clock.o ebofs.o
${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@
+ebofs: mkfs.ebofs test.ebofs
void Allocator::dump_freelist()
{
- if (1) {
+ if (0) {
interval_set<block_t> free; // validate too
block_t n = 0;
assert(tab->find(0, cursor) >= 0);
while (1) {
dout(30) << "dump ex " << cursor.current().key << "~" << cursor.current().value << endl;
+ assert(cursor.current().value > 0);
if (b < EBOFS_NUM_FREE_BUCKETS)
n += cursor.current().value;
+ if (free.contains( cursor.current().key, cursor.current().value ))
+ dout(0) << "dump bad " << cursor.current().key << "~" << cursor.current().value << endl;
assert(!free.contains( cursor.current().key, cursor.current().value ));
free.insert( cursor.current().key, cursor.current().value );
if (cursor.move_right() <= 0) break;
}
}
- dout(10) << "allocate " << ex << " near " << near << endl;
+ dout(20) << "allocate " << ex << " near " << near << endl;
dump_freelist();
return num;
}
fs->free_tab[bucket]->remove(ex.start);
fs->free_blocks -= ex.length;
- dout(10) << "allocate partial " << ex << " near " << near << endl;
+ dout(20) << "allocate partial " << ex << " near " << near << endl;
dump_freelist();
return ex.length;
}
int Allocator::release(Extent& ex)
{
- dout(10) << "release " << ex << " (into limbo)" << endl;
+ dout(20) << "release " << ex << " (into limbo)" << endl;
+ assert(ex.length > 0);
limbo.insert(ex.start, ex.length);
fs->limbo_blocks += ex.length;
return 0;
int Allocator::_release(Extent& orig)
{
dout(15) << "_release " << orig << endl;
+ assert(orig.length > 0);
fs->free_blocks += orig.length;
Extent newex = orig;
}
// ok, insert newex
+ assert(newex.length > 0);
int b = pick_bucket(newex.length);
fs->free_tab[b]->insert(newex.start, newex.length);
return 0;
if (g_conf.ebofs_commit_ms) {
if (g_conf.ebofs_idle_commit_ms > 0) {
// periodically check for idle block device
- dout(10) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms,"
+ dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms, "
<< g_conf.ebofs_idle_commit_ms << " ms if idle" << endl;
- long left = g_conf.ebofs_commit_ms*1000;
+ long left = g_conf.ebofs_commit_ms;
while (left > 0) {
- long next = MIN(left, g_conf.ebofs_idle_commit_ms*1000);
- if (commit_cond.WaitInterval(ebofs_lock, utime_t(0, left)) != ETIMEDOUT)
+ long next = MIN(left, g_conf.ebofs_idle_commit_ms);
+ if (commit_cond.WaitInterval(ebofs_lock, utime_t(0, next*1000)) != ETIMEDOUT)
break; // we got kicked
if (dev.is_idle()) {
- dout(10) << "commit_thread bdev is idle, early commit" << endl;
+ dout(20) << "commit_thread bdev is idle, early commit" << endl;
break; // dev is idle
}
left -= next;
+ dout(20) << "commit_thread " << left << " ms left" << endl;
}
} else {
// normal wait+timeout
- dout(10) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms" << endl;
+ dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms" << endl;
commit_cond.WaitInterval(ebofs_lock, utime_t(0, g_conf.ebofs_commit_ms*1000));
}
}
// parse extents
- on->extents.clear();
+ on->extent_map.clear();
block_t n = 0;
for (int i=0; i<eo->num_extents; i++) {
Extent ex = *((Extent*)p);
- on->extents.push_back(ex);
+ on->extent_map[n] = ex;
dout(15) << "get_onode " << *on << " ex " << i << ": " << ex << endl;
n += ex.length;
p += sizeof(Extent);
eo.object_size = on->object_size;
eo.object_blocks = on->object_blocks;
eo.num_attr = on->attr.size();
- eo.num_extents = on->extents.size();
+ eo.num_extents = on->extent_map.size();
bl.copy_in(off, sizeof(eo), (char*)&eo);
off += sizeof(eo);
}
// extents
- for (unsigned i=0; i<on->extents.size(); i++) {
- bl.copy_in(off, sizeof(Extent), (char*)&on->extents[i]);
+ for (map<block_t,Extent>::iterator i = on->extent_map.begin();
+ i != on->extent_map.end();
+ i++) {
+ bl.copy_in(off, sizeof(Extent), (char*)&(i->second));
off += sizeof(Extent);
- dout(15) << "write_onode " << *on << " ex " << i << ": " << on->extents[i] << endl;
+ dout(15) << "write_onode " << *on << " ex " << i->first << ": " << i->second << endl;
}
}
allocator.release(on->onode_loc);
block_t first = 0;
- if (on->extents.size())
- first = on->extents[0].start;
+ if (on->extent_map.size())
+ first = on->extent_map.begin()->second.start;
allocator.allocate(on->onode_loc, blocks, first);
object_tab->remove( on->object_id );
allocator.release(on->onode_loc);
// free data space
- for (unsigned i=0; i<on->extents.size(); i++)
- allocator.release(on->extents[i]);
+ for (map<block_t,Extent>::iterator i = on->extent_map.begin();
+ i != on->extent_map.end();
+ i++)
+ allocator.release(i->second);
+ on->extent_map.clear();
// remove from collections
Table<idpair_t, bool>::Cursor cursor(oc_tab);
// onode
map<string, AttrVal> attr;
- vector<Extent> extents;
+ //vector<Extent> extents;
+ map<block_t, Extent> extent_map;
interval_set<block_t> uncommitted;
// allocation
- void _append_extent(Extent& ex) {
- if (extents.size() &&
- extents[extents.size()-1].end() == ex.start)
- extents[extents.size()-1].length += ex.length;
- else
- extents.push_back(ex);
- }
void verify_extents() {
block_t count = 0;
interval_set<block_t> is;
+ // NOTE: everything inside the if (0) below is compiled in but never executed
if (0) { // do crazy stupid sanity checking
set<block_t> s;
cout << "verifying" << endl;
- for (unsigned i=0; i<extents.size(); i++) {
- //cout << "verify_extents " << i << " off " << count << " " << extents[i] << endl;
- count += extents[i].length;
-
- //assert(!is.contains(extents[i].start, extents[i].length));
- //is.insert(extents[i].start, extents[i].length);
-
- for (unsigned j=0;j<extents[i].length;j++) {
- assert(s.count(extents[i].start+j) == 0);
- s.insert(extents[i].start+j);
+
+ for (map<block_t,Extent>::iterator p = extent_map.begin(); // std::map iterates in ascending key order
+ p != extent_map.end();
+ p++) {
+ cout << " " << p->first << ": " << p->second << endl;
+ assert(count == p->first); // each key must equal the summed length of all earlier extents (no gaps, no overlap)
+ count += p->second.length;
+ for (unsigned j=0;j<p->second.length;j++) {
+ assert(s.count(p->second.start+j) == 0); // the same start+j block must not be referenced twice
+ s.insert(p->second.start+j);
}
}
- cout << "verified " << extents.size() << " extents" << endl;
+
assert(s.size() == count);
assert(count == object_blocks);
}
}
void set_extent(block_t offset, Extent ex) {
- //cout << "set_extent " << offset << " " << ex << " ... " << object_blocks << endl;
+ //cout << "set_extent " << offset << " -> " << ex << " ... " << object_blocks << endl;
assert(offset <= object_blocks);
verify_extents();
// at the end?
if (offset == object_blocks) {
- _append_extent(ex);
+ //cout << " appending " << ex << endl;
+ if (!extent_map.empty() && extent_map.end()->second.end() == ex.start)
+ extent_map.end()->second.length += ex.length;
+ else
+ extent_map[object_blocks] = ex;
object_blocks += ex.length;
return;
}
- // nope. ok, rebuild the extent list.
- vector<Extent> old;
- old.swap(extents);
- assert(extents.empty());
-
- unsigned oldex = 0;
- block_t oldoff = 0;
- block_t cur = 0;
-
- // copy up to offset
- while (cur < offset) {
- Extent t;
- t.start = old[oldex].start+oldoff;
- t.length = MIN(offset-cur, old[oldex].length-oldoff);
- _append_extent(t);
-
- cur += t.length;
- oldoff += t.length;
- if (oldoff == old[oldex].length) {
- oldex++;
- oldoff = 0;
+ // removing any extent bits we overwrite
+ if (!extent_map.empty()) {
+ // preceeding extent?
+ map<block_t,Extent>::iterator p = extent_map.lower_bound(offset);
+ if (p != extent_map.begin()) {
+ p--;
+ if (p->first + p->second.length > offset) {
+ //cout << " preceeding was " << p->second << endl;
+ if (p->first + p->second.length > offset+ex.length) {
+ // cutting chunk out of middle, add last bit
+ Extent &n = extent_map[offset+ex.length] = p->second;
+ n.start += offset+ex.length - p->first;
+ n.length -= offset+ex.length - p->first;
+ //cout << " tail frag is " << n << endl;
+ }
+ p->second.length = offset - p->first; // cut tail off preceeding extent
+ //cout << " preceeding now " << p->second << endl;
+ }
+ p++;
}
- }
+
+ // overlapping extents
+ while (p != extent_map.end() &&
+ p->first < offset + ex.length) {
+ map<block_t,Extent>::iterator next = p;
+ next++;
+
+ // completely subsumed?
+ if (p->first + p->second.length <= offset+ex.length) {
+ //cout << " erasing " << p->second << endl;
+ extent_map.erase(p);
+ p = next;
+ continue;
+ }
- // add our new extent
- _append_extent(ex);
- if (offset + ex.length > object_blocks)
- object_blocks = offset + ex.length;
-
- // skip past it in the old stuff
- block_t sleft = ex.length;
- while (sleft > 0) {
- block_t skip = MIN(sleft, old[oldex].length-oldoff);
- sleft -= skip;
- oldoff += skip;
- if (oldoff == old[oldex].length) {
- oldex++;
- oldoff = 0;
- if (oldex == old.size()) break;
+ // spans new extent, cut off head
+ Extent &n = extent_map[ offset+ex.length ] = p->second;
+ //cout << " cut head off " << p->second;
+ n.start += offset+ex.length - p->first;
+ n.length -= offset+ex.length - p->first;
+ extent_map.erase(p);
+ //cout << ", now " << n << endl;
+ break;
}
}
- // copy anything left?
- while (oldex < old.size()) {
- if (oldoff) {
- Extent t;
- t.start = old[oldex].start+oldoff;
- t.length = old[oldex].length-oldoff;
- _append_extent(t);
- oldoff = 0;
- oldex++;
- } else {
- _append_extent(old[oldex++]);
- }
- }
+ extent_map[ offset ] = ex;
+ // extend object?
+ if (offset + ex.length > object_blocks)
+ object_blocks = offset+ex.length;
+
verify_extents();
}
* map teh given page range into extents on disk.
*/
int map_extents(block_t start, block_t len, vector<Extent>& ls) {
+ //cout << "map_extents " << start << " " << len << endl;
verify_extents();
- block_t cur = 0;
- for (unsigned i=0; i<extents.size(); i++) {
- if (cur >= start+len) break;
- if (cur + extents[i].length > start) {
+
+ //assert(start+len <= object_blocks);
+
+ // lower_bound gives the first entry whose key is >= start; when that key is
+ // past start (or there is no such entry) the range may begin inside the
+ // previous entry, so step back and take that entry's tail first.
+ map<block_t,Extent>::iterator p = extent_map.lower_bound(start);
+ if (p != extent_map.begin() &&
+ (p == extent_map.end() || p->first > start)) {
+ p--;
+ if (p->second.length > start - p->first) {
Extent ex;
- block_t headskip = start-cur;
- ex.start = extents[i].start + headskip;
- ex.length = MIN(len, extents[i].length - headskip);
+ ex.start = p->second.start + (start - p->first);
+ ex.length = MIN(len, p->second.length - (start - p->first));
ls.push_back(ex);
+
+ //cout << " got (tail of?) " << p->second << " : " << ex << endl;
+
start += ex.length;
len -= ex.length;
- if (len == 0) break;
}
- cur += extents[i].length;
+ p++;
}
+
+ // remaining pieces: each entry must begin exactly where the previous piece ended
+ while (len > 0 &&
+ p != extent_map.end()) {
+ assert(p->first == start);
+ Extent ex = p->second;
+ ex.length = MIN(len, ex.length); // clip the final piece to the requested length
+ ls.push_back(ex);
+ //cout << " got (head of?) " << p->second << " : " << ex << endl;
+ start += ex.length;
+ len -= ex.length;
+ p++;
+ }
+
return 0;
}
return s;
}
int get_extent_bytes() {
- return sizeof(Extent) * extents.size();
+ return sizeof(Extent) * extent_map.size(); // one Extent record per map entry; the map keys are not counted
}
};
void insert(T start, T len) {
//cout << "insert " << start << "~" << len << endl;
+ assert(len > 0);
typename map<T,T>::iterator p = find_adj_m(start);
if (p == m.end()) {
m[start] = len; // new interval
assert(p->pulling_version(o) == v);
// write it and add it to the PG
- store->write(o, op->get_length(), 0, op->get_data());
+ store->write(o, op->get_length(), 0, op->get_data(), true);
p->pg->add_object(store, o);
store->setattr(o, "version", &v, sizeof(v));
}
} else {
// normal business
+ assert(0); // no more!
r = store->write(op->get_oid(),
op->get_length(),
op->get_offset(),