From 24a7b51159fca7645a5e424e0689b61924d75fea Mon Sep 17 00:00:00 2001 From: sageweil Date: Mon, 27 Aug 2007 21:13:40 +0000 Subject: [PATCH] rewrote ebofs node management to eliminate crap performance with large object counts git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1707 29311d96-e01e-0410-9327-a35deaab8ce9 --- trunk/ceph/Makefile | 2 +- trunk/ceph/ebofs/BlockDevice.cc | 92 ++++---- trunk/ceph/ebofs/BufferCache.cc | 41 +--- trunk/ceph/ebofs/BufferCache.h | 6 +- trunk/ceph/ebofs/Ebofs.cc | 49 ++--- trunk/ceph/ebofs/Table.h | 56 +++-- trunk/ceph/ebofs/mkfs.ebofs.cc | 62 ++++-- trunk/ceph/ebofs/nodes.h | 379 ++++++++++++++------------------ trunk/ceph/ebofs/test.ebofs.cc | 32 +-- trunk/ceph/ebofs/types.h | 9 +- trunk/ceph/include/bitmapper.h | 15 +- trunk/ceph/include/buffer.h | 11 + trunk/ceph/include/xlist.h | 59 +---- 13 files changed, 394 insertions(+), 419 deletions(-) diff --git a/trunk/ceph/Makefile b/trunk/ceph/Makefile index f6c7a64a0b155..21763fd8726ab 100644 --- a/trunk/ceph/Makefile +++ b/trunk/ceph/Makefile @@ -16,7 +16,7 @@ EXTRA_CFLAGS = -I${HOME}/include -L${HOME}/lib # base -CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE ${EXTRA_CFLAGS} +CFLAGS = -pg -g -Wall -I. -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE ${EXTRA_CFLAGS} LDINC = ld -i -o CC = g++ LIBS = -pthread diff --git a/trunk/ceph/ebofs/BlockDevice.cc b/trunk/ceph/ebofs/BlockDevice.cc index 99d1f3ef719d0..355ae39daa5ac 100644 --- a/trunk/ceph/ebofs/BlockDevice.cc +++ b/trunk/ceph/ebofs/BlockDevice.cc @@ -162,7 +162,7 @@ int BlockDevice::ElevatorQueue::dequeue_io(list& biols, // add to biols int nv = bio->bl.buffers().size(); // how many iov's in this bio's bufferlist? - if (num_iovs + nv >= g_conf.bdev_iov_max) break; // too many! + if (num_iovs + nv >= IOV_MAX) break; // to many //g_conf.bdev_iov_max) break; // too many! num_iovs += nv; start = MIN(start, bio->start); @@ -663,46 +663,60 @@ int BlockDevice::_write(int fd, unsigned bno, unsigned num, bufferlist& bl) assert(fd > 0); - off_t offset = (off_t)bno << EBOFS_BLOCK_BITS; - assert((off_t)bno * (off_t)EBOFS_BLOCK_SIZE == offset); - off_t actual = lseek(fd, offset, SEEK_SET); - assert(actual == offset); - - // write buffers - size_t len = num*EBOFS_BLOCK_SIZE; - - struct iovec iov[ bl.buffers().size() ]; - - int n = 0; - size_t left = len; - for (list::const_iterator i = bl.buffers().begin(); - i != bl.buffers().end(); - i++) { - assert(i->length() % EBOFS_BLOCK_SIZE == 0); - - iov[n].iov_base = (void*)i->c_str(); - iov[n].iov_len = MIN(left, i->length()); - - assert((((intptr_t)iov[n].iov_base) & ((intptr_t)4095ULL)) == 0); - assert((iov[n].iov_len & 4095) == 0); + while (1) { + off_t offset = (off_t)bno << EBOFS_BLOCK_BITS; + assert((off_t)bno * (off_t)EBOFS_BLOCK_SIZE == offset); + off_t actual = lseek(fd, offset, SEEK_SET); + assert(actual == offset); - left -= iov[n].iov_len; - n++; - if (left == 0) break; - } - - int r = ::writev(fd, iov, n); - - if (r < 0) { - dout(1) << "couldn't write bno " << bno << " num " << num - << " (" << len << " bytes) in " << n << " iovs, r=" << r - << " errno " << errno << " " << strerror(errno) << dendl; - dout(1) << "bl is " << bl << dendl; - assert(0); - } else { - assert(r == (int)len); + // write buffers + size_t len = num*EBOFS_BLOCK_SIZE; + + struct iovec iov[ bl.buffers().size() ]; + + int n = 0; + size_t left = len; + for (list::const_iterator i = bl.buffers().begin(); + i != bl.buffers().end(); + i++) { + assert(i->length() % EBOFS_BLOCK_SIZE == 0); + + iov[n].iov_base = (void*)i->c_str(); + iov[n].iov_len = MIN(left, i->length()); + + assert((((intptr_t)iov[n].iov_base) & ((intptr_t)4095ULL)) == 0); + assert((iov[n].iov_len & 4095) == 0); + + left -= iov[n].iov_len; + n++; + if (left == 0) break; + } + + int r = ::writev(fd, iov, n); + + if (r < 0) { + dout(1) << "couldn't write bno " << bno << " num " << num + << " (" << len << " bytes) in " << n << " iovs, r=" << r + << " errno " << errno << " " << strerror(errno) << dendl; + dout(1) << "bl is " << bl << dendl; + assert(0); + } else if (r < (int)len) { + // hrm, we didn't write _all_ of our data. WTF kind of FS is this? + dout(-1) << "bloody hell, writev only wrote " << r << " of " << len << " bytes, looping" << dendl; + assert(r % 4096 == 0); + int wrote = r / 4096; + bno += wrote; + num -= wrote; + bufferlist tail; + tail.substr_of(bl, r, len-r); + bl.claim(tail); + continue; + } else { + // yay + assert(r == (int)len); + break; + } } - return 0; } diff --git a/trunk/ceph/ebofs/BufferCache.cc b/trunk/ceph/ebofs/BufferCache.cc index b6317d5206d2b..18912aa867e6f 100644 --- a/trunk/ceph/ebofs/BufferCache.cc +++ b/trunk/ceph/ebofs/BufferCache.cc @@ -421,13 +421,12 @@ int ObjectCache::map_read(block_t start, block_t len, * - don't worry about disk extent boundaries (yet) */ int ObjectCache::map_write(block_t start, block_t len, - interval_set& alloc, map& hits, version_t super_epoch) { map::iterator p = data.lower_bound(start); - dout(10) << "map_write " << *on << " " << start << "~" << len << " ... alloc " << alloc << dendl; + dout(10) << "map_write " << *on << " " << start << "~" << len << dendl; // p->first >= start block_t cur = start; @@ -445,21 +444,6 @@ int ObjectCache::map_write(block_t start, block_t len, while (left > 0) { // max for this bh (bc of (re)alloc on disk) block_t max = left; - bool newalloc = false; - - // based on alloc/no-alloc boundary ... - if (alloc.contains(cur, left)) { - if (alloc.contains(cur)) { - block_t ends = alloc.end_after(cur); - max = MIN(left, ends-cur); - newalloc = true; - } else { - if (alloc.starts_after(cur)) { - block_t st = alloc.start_after(cur); - max = MIN(left, st-cur); - } - } - } // based on disk extent boundary ... vector exv; @@ -467,11 +451,7 @@ int ObjectCache::map_write(block_t start, block_t len, if (exv.size() > 1) max = exv[0].length; - if (newalloc) { - dout(10) << "map_write " << cur << "~" << max << " is new alloc on disk" << dendl; - } else { - dout(10) << "map_write " << cur << "~" << max << " keeps old alloc on disk" << dendl; - } + dout(10) << "map_write " << cur << "~" << max << dendl; // at end? if (p == data.end()) { @@ -499,7 +479,7 @@ int ObjectCache::map_write(block_t start, block_t len, BufferHead *right = bc->split(bh, cur); bc->bh_read(on, bh); // reread left bit bh = right; - } else if (bh->is_tx() && !newalloc && bc->bh_cancel_write(bh, super_epoch)) { + } else if (bh->is_tx() && bh->epoch_modified == super_epoch && bc->bh_cancel_write(bh, super_epoch)) { BufferHead *right = bc->split(bh, cur); bc->bh_write(on, bh); // rewrite left bit bh = right; @@ -518,7 +498,7 @@ int ObjectCache::map_write(block_t start, block_t len, BufferHead *right = bc->split(middle, cur+max); bc->bh_read(on, right); // reread right bh = middle; - } else if (bh->is_tx() && !newalloc && bc->bh_cancel_write(bh, super_epoch)) { + } else if (bh->is_tx() && bh->epoch_modified == super_epoch && bc->bh_cancel_write(bh, super_epoch)) { BufferHead *middle = bc->split(bh, cur); bc->bh_write(on, bh); // redo left p++; @@ -542,7 +522,7 @@ int ObjectCache::map_write(block_t start, block_t len, if (bh->is_rx() && bc->bh_cancel_read(bh)) { BufferHead *right = bc->split(bh, cur+max); bc->bh_read(on, right); // re-rx the right bit - } else if (bh->is_tx() && !newalloc && bc->bh_cancel_write(bh, super_epoch)) { + } else if (bh->is_tx() && bh->epoch_modified == super_epoch && bc->bh_cancel_write(bh, super_epoch)) { BufferHead *right = bc->split(bh, cur+max); bc->bh_write(on, right); // re-tx the right bit } else { @@ -552,7 +532,7 @@ int ObjectCache::map_write(block_t start, block_t len, } // try to cancel tx? - if (bh->is_tx() && !newalloc) bc->bh_cancel_write(bh, super_epoch); + if (bh->is_tx() && bh->epoch_modified == super_epoch) bc->bh_cancel_write(bh, super_epoch); // put in our map hits[cur] = bh; @@ -656,7 +636,7 @@ void ObjectCache::truncate(block_t blocks, version_t super_epoch) BufferHead *right = bc->split(bh, blocks); bc->bh_read(on, bh); // reread left bit bh = right; - } else if (bh->is_tx() && uncom && bc->bh_cancel_write(bh, super_epoch)) { + } else if (bh->is_tx() && uncom && bh->epoch_modified == super_epoch && bc->bh_cancel_write(bh, super_epoch)) { BufferHead *right = bc->split(bh, blocks); bc->bh_write(on, bh); // rewrite left bit bh = right; @@ -669,7 +649,7 @@ void ObjectCache::truncate(block_t blocks, version_t super_epoch) // cancel any pending/queued io, if possible. if (bh->is_rx()) bc->bh_cancel_read(bh); - if (bh->is_tx() && uncom) + if (bh->is_tx() && uncom && bh->epoch_modified == super_epoch) bc->bh_cancel_write(bh, super_epoch); if (bh->shadow_of) { dout(10) << "truncate " << *bh << " unshadowing " << *bh->shadow_of << dendl; @@ -909,13 +889,14 @@ void BufferCache::bh_write(Onode *on, BufferHead *bh, block_t shouldbe) bool BufferCache::bh_cancel_write(BufferHead *bh, version_t cur_epoch) { + assert(bh->is_tx()); + assert(bh->epoch_modified == cur_epoch); + assert(bh->epoch_modified > 0); if (bh->tx_ioh && dev.cancel_io(bh->tx_ioh) >= 0) { dout(10) << "bh_cancel_write on " << *bh << dendl; bh->tx_ioh = 0; mark_dirty(bh); - assert(bh->epoch_modified == cur_epoch); - assert(bh->epoch_modified > 0); dec_unflushed( EBOFS_BC_FLUSH_BHWRITE, bh->epoch_modified ); // assert.. this should be the same epoch! int l = bh->oc->put(); diff --git a/trunk/ceph/ebofs/BufferCache.h b/trunk/ceph/ebofs/BufferCache.h index d712dd9c5a681..24aeccc4a200c 100644 --- a/trunk/ceph/ebofs/BufferCache.h +++ b/trunk/ceph/ebofs/BufferCache.h @@ -335,6 +335,11 @@ inline ostream& operator<<(ostream& out, BufferHead& bh) if (bh.is_rx()) out << " rx"; if (bh.is_tx()) out << " tx"; if (bh.is_partial()) out << " partial"; + + // include epoch modified? + if (bh.is_dirty() || bh.is_tx() || bh.is_partial()) + out << "(e" << bh.epoch_modified << ")"; + //out << " " << bh.data.length(); out << " " << &bh; out << ")"; @@ -420,7 +425,6 @@ class ObjectCache { int map_write(block_t start, block_t len, - interval_set& alloc, map& hits, version_t super_epoch); // can write to these. void touch_bottom(block_t bstart, block_t blast); diff --git a/trunk/ceph/ebofs/Ebofs.cc b/trunk/ceph/ebofs/Ebofs.cc index 92d845dcff834..326d5871bfad7 100644 --- a/trunk/ceph/ebofs/Ebofs.cc +++ b/trunk/ceph/ebofs/Ebofs.cc @@ -102,8 +102,7 @@ int Ebofs::mount() // init node pools dout(3) << "mount nodepool" << dendl; nodepool.init( &sb->nodepool ); - nodepool.read_usemap( dev, super_epoch ); - nodepool.read_clean_nodes( dev ); + nodepool.read_usemap_and_clean_nodes( dev, super_epoch ); // open tables dout(3) << "mount opening tables" << dendl; @@ -213,6 +212,7 @@ int Ebofs::mkfs() nodepool.usemap_odd.length = usemap_len; dout(10) << "mkfs: even usemap at " << nodepool.usemap_even << dendl; dout(10) << "mkfs: odd usemap at " << nodepool.usemap_odd << dendl; + nodepool.init_usemap(); // init tables struct ebofs_table empty; @@ -247,18 +247,14 @@ int Ebofs::mkfs() // write nodes, super, 2x dout(10) << "mkfs: flushing nodepool and superblocks (2x)" << dendl; - nodepool.commit_start( dev, 0 ); - nodepool.commit_wait(); - bufferptr superbp0; - prepare_super(0, superbp0); - write_super(0, superbp0); - - nodepool.commit_start( dev, 1 ); - nodepool.commit_wait(); - bufferptr superbp1; - prepare_super(1, superbp1); - write_super(1, superbp1); - + for (epoch_t e=0; e<2; e++) { + nodepool.commit_start(dev, e); + nodepool.commit_wait(); + bufferptr superbp; + prepare_super(e, superbp); + write_super(e, superbp); + } + // free memory dout(10) << "mkfs: cleaning up" << dendl; close_tables(); @@ -487,10 +483,10 @@ int Ebofs::commit_thread_entry() << "%) limbo in " << get_limbo_extents() << dendl; dout(2) << "commit_thread nodes: " - << 100*nodepool.num_used()/nodepool.num_total() << "% used, " - << nodepool.num_free() << " (" << 100*nodepool.num_free()/nodepool.num_total() << "%) free, " - << nodepool.num_limbo() << " (" << 100*nodepool.num_limbo()/nodepool.num_total() << "%) limbo, " - << nodepool.num_total() << " total." << dendl; + << 100*nodepool.get_num_used()/nodepool.get_num_total() << "% used, " + << nodepool.get_num_free() << " (" << 100*nodepool.get_num_free()/nodepool.get_num_total() << "%) free, " + << nodepool.get_num_limbo() << " (" << 100*nodepool.get_num_limbo()/nodepool.get_num_total() << "%) limbo, " + << nodepool.get_num_total() << " total." << dendl; dout(2) << "commit_thread bc: " << "size " << bc.get_size() << ", trimmable " << bc.get_trimmable() @@ -540,7 +536,7 @@ int Ebofs::commit_thread_entry() allocator.release_limbo(); // limbo_tab -> free_tabs // do we need more node space? - if (nodepool.num_free() < nodepool.num_total() / 3) { + if (nodepool.get_num_free() < nodepool.get_num_total() / 3) { dout(2) << "commit_thread running low on node space, allocating more." << dendl; alloc_more_node_space(); } @@ -574,17 +570,17 @@ int Ebofs::commit_thread_entry() void Ebofs::alloc_more_node_space() { - dout(1) << "alloc_more_node_space free " << nodepool.num_free() << "/" << nodepool.num_total() << dendl; + dout(1) << "alloc_more_node_space free " << nodepool.get_num_free() << "/" << nodepool.get_num_total() << dendl; if (nodepool.num_regions() < EBOFS_MAX_NODE_REGIONS) { - int want = nodepool.num_total(); + int want = nodepool.get_num_total(); Extent ex; allocator.allocate(ex, want, 2); dout(1) << "alloc_more_node_space wants " << want << " more, got " << ex << dendl; Extent even, odd; - unsigned ulen = nodepool.get_usemap_len(nodepool.num_total() + ex.length); + unsigned ulen = nodepool.get_usemap_len(nodepool.get_num_total() + ex.length); allocator.allocate(even, ulen, 2); allocator.allocate(odd, ulen, 2); dout(1) << "alloc_more_node_space maps need " << ulen << " x2, got " << even << " " << odd << dendl; @@ -594,8 +590,11 @@ void Ebofs::alloc_more_node_space() allocator.release(nodepool.usemap_even); allocator.release(nodepool.usemap_odd); nodepool.add_region(ex); + + // expand usemap? nodepool.usemap_even = even; nodepool.usemap_odd = odd; + nodepool.expand_usemap(); } else { dout (1) << "alloc_more_node_space failed to get space for new usemaps" << dendl; allocator.release(ex); @@ -1384,8 +1383,8 @@ int Ebofs::statfs(struct statfs *buf) buf->f_bfree = get_free_blocks() + get_limbo_blocks(); /* free blocks in fs */ buf->f_bavail = get_free_blocks(); /* free blocks avail to non-superuser -- actually, for writing. */ - buf->f_files = nodepool.num_total(); /* total file nodes in file system */ - buf->f_ffree = nodepool.num_free(); /* free file nodes in fs */ + buf->f_files = nodepool.get_num_total(); /* total file nodes in file system */ + buf->f_ffree = nodepool.get_num_free(); /* free file nodes in fs */ //buf->f_fsid = 0; /* file system id */ #ifndef DARWIN buf->f_namelen = 8; /* maximum length of filenames */ @@ -1589,7 +1588,7 @@ void Ebofs::apply_write(Onode *on, off_t off, size_t len, const bufferlist& bl) // map b range onto buffer_heads map hits; - oc->map_write(bstart, blen, alloc, hits, super_epoch); + oc->map_write(bstart, blen, hits, super_epoch); // get current versions //version_t lowv, highv; diff --git a/trunk/ceph/ebofs/Table.h b/trunk/ceph/ebofs/Table.h index 02e74b94f7693..7c5f29ab59604 100644 --- a/trunk/ceph/ebofs/Table.h +++ b/trunk/ceph/ebofs/Table.h @@ -77,10 +77,18 @@ class Table { Nodeptr() : node(0) {} Nodeptr(Node *n) : node(n) {} + Nodeptr(NodePool& pool, nodeid_t nid) { + open(pool, nid); + } Nodeptr& operator=(Node *n) { node = n; return *this; } + + void open(NodePool& pool, nodeid_t nid) { + node = pool.get_node(nid); + if (is_index() && node->children.empty()) init_index(pool); + } LeafItem& leaf_item(int i) { return (( LeafItem*)(node->item_ptr()))[i]; } IndexItem& index_item(int i) { return ((IndexItem*)(node->item_ptr()))[i]; } @@ -90,7 +98,7 @@ class Table { else return leaf_item(i).key; } - + bool is_leaf() { return node->is_leaf(); } bool is_index() { return node->is_index(); } void set_type(int t) { node->set_type(t); } @@ -108,10 +116,24 @@ class Table { int size() { return node->size(); } void set_size(int s) { node->set_size(s); } + void init_index(NodePool& nodepool) { + /* + node->children = vector(max_items()); + for (int i=0; ichildren[i] = nodepool.get_node(index_item(i).node); + else + node->children[i] = 0; + */ + } + + void remove_at_pos(int p) { if (node->is_index()) { - for (int i=p; ichildren[i] = node->children[i+1]; + } } else { for (int i=p; ip; i--) + for (int i=size(); i>p; i--) { index_item(i) = index_item(i-1); + //node->children[i] = node->children[i-1]; + } index_item(p).key = key; - index_item(p).node = node; + index_item(p).node = nid; set_size(size() + 1); } @@ -213,7 +237,7 @@ class Table { // work back down right side for (; lpool.get_node( open[l].index_item(pos[l]).node ); + open[l+1].open(table->pool, open[l].index_item(pos[l]).node); pos[l+1] = open[l+1].size() - 1; } return 1; @@ -240,7 +264,7 @@ class Table { /* work back down */ for (; lpool.get_node( open[l].index_item(pos[l]).node ); + open[l+1].open(table->pool, open[l].index_item(pos[l]).node ); pos[l+1] = 0; // furthest left } return 1; @@ -303,7 +327,7 @@ class Table { Nodeptr here = open[level]; Nodeptr parent = open[level-1]; - Nodeptr left = table->pool.get_node( parent.index_item(pos[level-1] - 1).node ); + Nodeptr left(table->pool, parent.index_item(pos[level-1] - 1).node ); if (left.size() == left.max_items()) return -1; // it's full // make both dirty @@ -342,7 +366,7 @@ class Table { Nodeptr here = open[level]; Nodeptr parent = open[level-1]; - Nodeptr right = table->pool.get_node( parent.index_item( pos[level-1] + 1 ).node ); + Nodeptr right(table->pool, parent.index_item( pos[level-1] + 1 ).node ); if (right.size() == right.max_items()) return -1; // it's full // make both dirty @@ -389,7 +413,7 @@ class Table { public: bool almost_full() { - if (2*(depth+1) > pool.num_free()) // worst case, plus some. + if (2*(depth+1) > pool.get_num_free()) // worst case, plus some. return true; return false; } @@ -404,7 +428,7 @@ class Table { cursor.level = 0; // start at root - Nodeptr curnode( pool.get_node(root) ); + Nodeptr curnode(pool, root); cursor.open[0] = curnode; if (curnode.size() == 0) return -1; // empty! @@ -443,7 +467,7 @@ class Table { cursor.pos[cursor.level] = i; /* get child node */ - curnode = pool.get_node( cursor.open[cursor.level].index_item(i).node ); + curnode.open(pool, cursor.open[cursor.level].index_item(i).node ); cursor.open[cursor.level+1] = curnode; } @@ -687,7 +711,7 @@ class Table { // left? if (cursor.pos[cursor.level-1] > 0) { int left_loc = cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1] - 1).node; - left = pool.get_node( left_loc ); + left.open(pool, left_loc); if (left.size() > left.min_items()) { /* move cursor left, shift right */ @@ -705,7 +729,7 @@ class Table { else { assert(cursor.pos[cursor.level-1] < cursor.open[cursor.level-1].size() - 1); int right_loc = cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1] + 1 ).node; - right = pool.get_node( right_loc ); + right.open(pool, right_loc ); if (right.size() > right.min_items()) { /* move cursor right, shift an item left */ @@ -749,7 +773,7 @@ class Table { void clear(Cursor& cursor, int node_loc, int level) { dbtout << "clear" << std::endl; - Nodeptr node = pool.get_node( node_loc ); + Nodeptr node(pool, node_loc); cursor.open[level] = node; // hose children? @@ -778,7 +802,7 @@ class Table { int verify_sub(Cursor& cursor, int node_loc, int level, int& count, K& last, const char *on) { int err = 0; - Nodeptr node = pool.get_node( node_loc ); + Nodeptr node(pool, node_loc); cursor.open[level] = node; // identify max, min, and validate key range diff --git a/trunk/ceph/ebofs/mkfs.ebofs.cc b/trunk/ceph/ebofs/mkfs.ebofs.cc index 5fbddba89b82c..d1d5975e7fd65 100644 --- a/trunk/ceph/ebofs/mkfs.ebofs.cc +++ b/trunk/ceph/ebofs/mkfs.ebofs.cc @@ -26,7 +26,7 @@ int main(int argc, char **argv) parse_config_options(args); if (args.size() < 1) { - cerr << "usage: mkfs.ebofs [options] " << endl; + cerr << "usage: mkfs.ebofs [options] " << std::endl; return -1; } char *filename = args[0]; @@ -40,10 +40,34 @@ int main(int argc, char **argv) // test-o-rama! Ebofs fs(filename); fs.mount(); - + + // zillion objects if (1) { char crap[1024*1024]; memset(crap, 0, 1024*1024); + bufferlist bl; + int sz = 10000; + bl.append(crap, sz); + + int n = 100000; + utime_t start = g_clock.now(); + for (int i=0; i nsec while (1) { - cout << g_clock.now() << " writing " << pos << "~" << sz << endl; + cout << g_clock.now() << " writing " << pos << "~" << sz << std::endl; fs.write(oid, pos, sz, bl, (Context*)0); pos += sz; nanosleep(&ts, 0); @@ -79,32 +103,32 @@ int main(int argc, char **argv) bufferlist big; big.append(crap, 1024*1024); - cout << "0" << endl; + cout << "0" << std::endl; fs.write(10, 0, 1024*1024, big, (Context*)0); fs.sync(); fs.trim_buffer_cache(); - cout << "1" << endl; + cout << "1" << std::endl; fs.write(10, 10, 10, small, 0); fs.write(10, 1, 1000, med, 0); fs.sync(); fs.trim_buffer_cache(); - cout << "2" << endl; + cout << "2" << std::endl; fs.write(10, 10, 10, small, 0); //fs.sync(); fs.write(10, 1, 1000, med, 0); fs.sync(); fs.trim_buffer_cache(); - cout << "3" << endl; + cout << "3" << std::endl; fs.write(10, 1, 1000, med, 0); fs.write(10, 10000, 10, small, 0); fs.truncate(10, 100, 0); fs.sync(); fs.trim_buffer_cache(); - cout << "4" << endl; + cout << "4" << std::endl; fs.remove(10); fs.sync(); fs.write(10, 10, 10, small, 0); @@ -151,7 +175,7 @@ int main(int argc, char **argv) char *p = bl.c_str(); off_t o = 0; for (int i=0; i free can alloc free used -> dirty can modify - free used used -> tx + free used used -> clean free used free -> limbo used used -> clean @@ -53,7 +53,6 @@ class Node { // bit fields static const int STATE_CLEAN = 1; static const int STATE_DIRTY = 2; - static const int STATE_TX = 3; static const int ITEM_LEN = EBOFS_NODE_BYTES - sizeof(int) - sizeof(int) - sizeof(int); @@ -62,27 +61,37 @@ class Node { protected: nodeid_t id; + int pos_in_bitmap; // position in bitmap int state; // use bit fields above! bufferptr bptr; - bufferptr shadow_bptr; // in disk buffer int *type; int *nrecs; public: - xlist::item xlist; + xlist::item xlist; // dirty - Node(nodeid_t i, bufferptr& b, int s) : id(i), state(s), bptr(b), xlist(this) { + vector children; + + Node(nodeid_t i, int pib, bufferptr& b, int s) : + id(i), pos_in_bitmap(pib), + state(s), bptr(b), xlist(this) { nrecs = (int*)(bptr.c_str()); type = (int*)(bptr.c_str() + sizeof(*nrecs)); } - + void do_cow() { + bptr.do_cow(); + } + + // id nodeid_t get_id() const { return id; } void set_id(nodeid_t n) { id = n; } + int get_pos_in_bitmap() const { return pos_in_bitmap; } + void set_pos_in_bitmap(int i) { pos_in_bitmap = i; } // buffer bufferptr& get_buffer() { return bptr; } @@ -102,25 +111,10 @@ class Node { // state bool is_dirty() { return state == STATE_DIRTY; } - bool is_tx() { return state == STATE_TX; } bool is_clean() { return state == STATE_CLEAN; } void set_state(int s) { state = s; } - - void make_shadow() { - assert(is_tx()); - - shadow_bptr = bptr; - - // new buffer - bptr = buffer::create_page_aligned(EBOFS_NODE_BYTES); - nrecs = (int*)(bptr.c_str()); - type = (int*)(bptr.c_str() + sizeof(*nrecs)); - - // copy contents! - memcpy(bptr.c_str(), shadow_bptr.c_str(), EBOFS_NODE_BYTES); - } - + }; @@ -135,34 +129,48 @@ class NodePool { vector region_loc; // region locations Extent usemap_even; Extent usemap_odd; + + buffer::ptr usemap_data; + bitmapper usemap_bits; protected: // on-disk block states int num_nodes; - set free; - set clean; - set limbo; - set dirty; - set tx; + int num_dirty; + int num_clean; + int num_free; + int num_limbo; + + xlist dirty_ls; + interval_set free; + interval_set limbo; Mutex &ebofs_lock; Cond commit_cond; int flushing; - static int make_nodeid(int region, int offset) { - return (region << 24) | offset; + nodeid_t make_nodeid(int region, int offset) { + return region_loc[region].start + (block_t)offset; } - static int nodeid_region(nodeid_t nid) { - return nid >> 24; - } - static int nodeid_offset(nodeid_t nid) { - return nid & ((1 << 24) - 1); + int nodeid_pos_in_bitmap(nodeid_t nid) { + unsigned region; + int num = 0; + for (region = 0; + (block_t)nid < region_loc[region].start || (block_t)nid > region_loc[region].end(); + region++) { + //generic_dout(-20) << "node " << nid << " not in " << region << " " << region_loc[region] << dendl; + num += region_loc[region].length; + } + num += nid - region_loc[region].start; + //generic_dout(-20) << "node " << nid << " is in " << region << ", overall bitmap pos is " << num << dendl; + return num; } public: NodePool(Mutex &el) : - num_nodes(0), + num_nodes(0), + num_dirty(0), num_clean(0), num_free(0), num_limbo(0), ebofs_lock(el), flushing(0) {} ~NodePool() { @@ -170,32 +178,52 @@ class NodePool { release_all(); } - int num_free() { return free.size(); } - int num_dirty() { return dirty.size(); } - int num_limbo() { return limbo.size(); } - int num_tx() { return tx.size(); } - int num_clean() { return clean.size(); } - int num_total() { return num_nodes; } - int num_used() { return num_clean() + num_dirty() + num_tx(); } + int get_num_free() { return num_free; } + int get_num_dirty() { return num_dirty; } + int get_num_limbo() { return num_limbo; } + int get_num_clean() { return num_clean; } + int get_num_total() { return num_nodes; } + int get_num_used() { return num_clean + num_dirty; } int get_usemap_len(int n=0) { if (n == 0) n = num_nodes; return ((n-1) / 8 / EBOFS_BLOCK_SIZE) + 1; } - int num_regions() { return region_loc.size(); } + unsigned num_regions() { return region_loc.size(); } // the caller had better adjust usemap locations... void add_region(Extent ex) { - int region = region_loc.size(); - assert(ex.length <= (1 << 24)); + assert(region_loc.size() < EBOFS_MAX_NODE_REGIONS); region_loc.push_back(ex); - for (unsigned o = 0; o < ex.length; o++) { - free.insert( make_nodeid(region, o) ); - } + free.insert(ex.start, ex.length); + num_free += ex.length; num_nodes += ex.length; } + void init_usemap() { + usemap_data = buffer::create_page_aligned(EBOFS_BLOCK_SIZE*usemap_even.length); + usemap_data.zero(); + usemap_bits.set_data(usemap_data.c_str(), usemap_data.length()); + } + + void expand_usemap() { + block_t have = usemap_data.length() / EBOFS_BLOCK_SIZE; + if (have < usemap_even.length) { + // use bufferlist to copy/merge two chunks + bufferlist bl; + bl.push_back(usemap_data); + bufferptr newbit = buffer::create_page_aligned(EBOFS_BLOCK_SIZE*(usemap_even.length - have)); + newbit.zero(); + bl.push_back(newbit); + assert(bl.buffers().size() == 1); + usemap_data = bl.buffers().front(); + usemap_bits.set_data(usemap_data.c_str(), usemap_data.length()); + } + } + + + int init(struct ebofs_nodepool *np) { // regions assert(region_loc.empty()); @@ -212,6 +240,7 @@ class NodePool { debofs(3) << "init even map at " << usemap_even << std::endl; debofs(3) << "init odd map at " << usemap_odd << std::endl; + init_usemap(); return 0; } @@ -219,11 +248,16 @@ class NodePool { release_all(); region_loc.clear(); + + num_free = 0; + num_dirty = 0; + num_clean = 0; + num_limbo = 0; + dirty_ls.clear(); + free.clear(); - dirty.clear(); - tx.clear(); - clean.clear(); limbo.clear(); + flushing = 0; node_map.clear(); } @@ -231,7 +265,7 @@ class NodePool { // *** blocking i/o routines *** - int read_usemap(BlockDevice& dev, version_t epoch) { + int read_usemap_and_clean_nodes(BlockDevice& dev, version_t epoch) { // read map Extent loc; if (epoch & 1) @@ -239,66 +273,44 @@ class NodePool { else loc = usemap_even; - bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE*loc.length); - dev.read(loc.start, loc.length, bp); + // usemap + dev.read(loc.start, loc.length, usemap_data); - // parse - unsigned region = 0; // current region - unsigned roff = 0; // offset in region - for (unsigned byte = 0; byte> 1; // move one bit right. - roff++; - if (roff == region_loc[region].length) { - // next region! - roff = 0; - region++; - break; - } - } - if (region == region_loc.size()) break; - } - return 0; - } - - int read_clean_nodes(BlockDevice& dev) { - /* - this relies on the clean set begin defined so that we know which nodes - to read. so it only really works when called from mount()! - */ - for (unsigned r=0; r> 1; - if (roff == region_loc[region].length) { - // next region! - roff = 0; - region++; - break; - } - } - - *(unsigned char*)(bp.c_str() + byte) = x; - if (region == region_loc.size()) break; - } - - // write bufferlist bl; + bufferptr bp = usemap_data.clone(); bl.append(bp); dev.write(loc.start, loc.length, bl, new C_NP_FlushUsemap(this), "usemap"); @@ -385,19 +368,6 @@ class NodePool { void flushed_node(nodeid_t nid) { ebofs_lock.Lock(); - - // mark nid clean|limbo - if (tx.count(nid)) { // tx -> clean - tx.erase(nid); - clean.insert(nid); - - // make node itself clean - node_map[nid]->set_state(Node::STATE_CLEAN); - } - else { // already limbo (was dirtied, or released) - assert(limbo.count(nid)); - } - flushing--; if (flushing == 0) commit_cond.Signal(); @@ -406,7 +376,7 @@ class NodePool { public: void commit_start(BlockDevice& dev, version_t version) { - generic_dout(20) << "ebofs.nodepool.commit_start start" << dendl; + debofs(20) << "ebofs.nodepool.commit_start start" << std::endl; assert(flushing == 0); /*if (0) @@ -421,42 +391,33 @@ class NodePool { flushing++; write_usemap(dev, version & 1); - // dirty -> tx (write to disk) - assert(tx.empty()); - set didb; - for (set::iterator i = dirty.begin(); - i != dirty.end(); - i++) { - Node *n = get_node(*i); + // dirty -> clean (write to disk) + while (!dirty_ls.empty()) { + Node *n = dirty_ls.front(); assert(n); assert(n->is_dirty()); - n->set_state(Node::STATE_TX); - - unsigned region = nodeid_region(*i); - block_t off = nodeid_offset(*i); - block_t b = region_loc[region].start + off; - - if (0) { // sanity check debug FIXME - assert(didb.count(b) == 0); - didb.insert(b); - } + n->set_state(Node::STATE_CLEAN); + dirty_ls.remove(&n->xlist); + num_dirty--; + num_clean++; + debofs(20) << "ebofs.nodepool.commit_start writing node " << n->get_id() << std::endl; + bufferlist bl; bl.append(n->get_buffer()); - dev.write(b, EBOFS_NODE_BLOCKS, + dev.write(n->get_id(), EBOFS_NODE_BLOCKS, bl, - new C_NP_FlushNode(this, *i), "node"); + new C_NP_FlushNode(this, n->get_id()), "node"); flushing++; - - tx.insert(*i); } - dirty.clear(); // limbo -> free - for (set::iterator i = limbo.begin(); - i != limbo.end(); + for (map::iterator i = limbo.m.begin(); + i != limbo.m.end(); i++) { - free.insert(*i); + num_free += i->second; + num_limbo -= i->second; + free.insert(i->first, i->second); } limbo.clear(); @@ -485,23 +446,13 @@ class NodePool { return node_map[nid]; } - // unopened node - /* not implemented yet!! - Node* open_node(nodeid_t nid) { - Node *n = node_regions[ NodeRegion::nodeid_region(nid) ]->open_node(nid); - dbtout << "pool.open_node " << n->get_id() << std::endl; - node_map[n->get_id()] = n; - return n; - } - */ - // allocate id/block on disk. always free -> dirty. nodeid_t alloc_id() { // pick node id assert(!free.empty()); - nodeid_t nid = *(free.begin()); + nodeid_t nid = free.start(); free.erase(nid); - dirty.insert(nid); + num_free--; return nid; } @@ -512,12 +463,20 @@ class NodePool { // alloc node bufferptr bp = buffer::create_page_aligned(EBOFS_NODE_BYTES); - Node *n = new Node(nid, bp, Node::STATE_DIRTY); + bp.zero(); + Node *n = new Node(nid, nodeid_pos_in_bitmap(nid), bp, Node::STATE_DIRTY); n->set_type(type); n->set_size(0); + usemap_bits.set(n->get_pos_in_bitmap()); + + n->set_state(Node::STATE_DIRTY); + dirty_ls.push_back(&n->xlist); + num_dirty++; + assert(node_map.count(nid) == 0); node_map[nid] = n; + return n; } @@ -527,20 +486,20 @@ class NodePool { node_map.erase(nid); if (n->is_dirty()) { - assert(dirty.count(nid)); - dirty.erase(nid); + dirty_ls.remove(&n->xlist); + num_dirty--; free.insert(nid); + num_free++; + usemap_bits.clear(n->get_pos_in_bitmap()); } else if (n->is_clean()) { - assert(clean.count(nid)); - clean.erase(nid); - limbo.insert(nid); - } else if (n->is_tx()) { - assert(tx.count(nid)); // i guess htis happens? -sage - tx.erase(nid); limbo.insert(nid); + num_limbo++; + num_clean--; + usemap_bits.clear(n->get_pos_in_bitmap()); } delete n; + assert(num_clean + num_dirty + num_limbo + num_free == num_nodes); } void release_all() { @@ -557,29 +516,33 @@ class NodePool { nodeid_t oldid = n->get_id(); nodeid_t newid = alloc_id(); debofs(15) << "ebofs.nodepool.dirty_node on " << oldid << " now " << newid << std::endl; + + // dup data? + // this only does a memcpy if there are multiple references.. + // i.e. if we are still writing the old data + n->do_cow(); // release old block - if (n->is_clean()) { - assert(clean.count(oldid)); - clean.erase(oldid); - } else { - assert(n->is_tx()); - assert(tx.count(oldid)); - tx.erase(oldid); - - // move/copy current -> shadow buffer as necessary - n->make_shadow(); - } + assert(n->is_clean()); + num_clean--; limbo.insert(oldid); + num_limbo++; + usemap_bits.clear(n->get_pos_in_bitmap()); + + // rename node node_map.erase(oldid); - - n->set_state(Node::STATE_DIRTY); - - // move to new one! n->set_id(newid); + n->set_pos_in_bitmap(nodeid_pos_in_bitmap(newid)); node_map[newid] = n; - } - + + // new block + n->set_state(Node::STATE_DIRTY); + dirty_ls.push_back(&n->xlist); + num_dirty++; + usemap_bits.set(n->get_pos_in_bitmap()); + + assert(num_clean + num_dirty + num_limbo + num_free == num_nodes); + } }; diff --git a/trunk/ceph/ebofs/test.ebofs.cc b/trunk/ceph/ebofs/test.ebofs.cc index 345f49b7a68ca..9a8913a52d80d 100644 --- a/trunk/ceph/ebofs/test.ebofs.cc +++ b/trunk/ceph/ebofs/test.ebofs.cc @@ -45,12 +45,12 @@ public: case 0: { oid.rev = rand() % 10; - cout << t << " read " << hex << oid << dec << " at " << off << " len " << len << endl; + cout << t << " read " << hex << oid << dec << " at " << off << " len " << len << std::endl; bufferlist bl; fs.read(oid, off, len, bl); int l = MIN(len,bl.length()); if (l) { - cout << t << " got " << l << endl; + cout << t << " got " << l << std::endl; bl.copy(0, l, b); char *p = b; while (l--) { @@ -65,7 +65,7 @@ public: case 1: { - cout << t << " write " << hex << oid << dec << " at " << off << " len " << len << endl; + cout << t << " write " << hex << oid << dec << " at " << off << " len " << len << std::endl; for (int j=0;j> 3] & (1 << (b&7)); } void set(int b) { diff --git a/trunk/ceph/include/buffer.h b/trunk/ceph/include/buffer.h index 0e279e09dad85..bd10b1d977ced 100644 --- a/trunk/ceph/include/buffer.h +++ b/trunk/ceph/include/buffer.h @@ -282,6 +282,17 @@ public: ~ptr() { release(); } + + raw *clone() { + return _raw->clone(); + } + + void do_cow() { + if (_raw->nref != 1) { + std::cout << "doing cow on " << _raw << " len " << _len << std::endl; + _raw = _raw->clone(); + } + } void swap(ptr& other) { raw *r = _raw; diff --git a/trunk/ceph/include/xlist.h b/trunk/ceph/include/xlist.h index 91e5ebbdd68ef..b6dc0aec9db25 100644 --- a/trunk/ceph/include/xlist.h +++ b/trunk/ceph/include/xlist.h @@ -15,63 +15,6 @@ #ifndef __XLIST_H #define __XLIST_H -/* -class xlist_head; - -class xlist_item { - private: - xlist_item *_prev, *_next; - xlist_head *_head; - friend class xlist_head; - - public: - xlist_item() : _prev(0), _next(0), _head(0) {} - xlist_head* _get_containing_xlist() { return _head; } -}; - -class xlist_head { - private: - xlist_item *_front, *_back; - int _size; - - friend class xlist_item; - - public: - int size() { return _size; } - bool empty() { return _front == 0; } - - void push_back(xlist_item *item) { - if (item->_head) item->_head->remove(item); - - item->_head = this; - item->_next = 0; - item->_prev = _back; - if (_back) _back->_next = item; - _back = item; - _size++; - } - void remove(xlist_item *item) { - assert(item->_head == this); - - if (item->_prev) - item->_prev->_next = item->_next; - else - _front = item->_next; - if (item->_next) - item->_next->_prev = item->_prev; - else - _back = item->_prev; - _size--; - - item->_head = 0; - item->_next = item->_prev = 0; - } - -}; -*/ - - - template class xlist { public: @@ -95,6 +38,8 @@ private: int _size; public: + xlist() : _front(0), _back(0), _size(0) {} + int size() { return _size; } bool empty() { return _front == 0; } -- 2.39.5