From 18d9132a053846d7fd24ec512a83d4da106d6ed7 Mon Sep 17 00:00:00 2001 From: sage Date: Mon, 28 Nov 2005 19:37:51 +0000 Subject: [PATCH] *** empty log message *** git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@512 29311d96-e01e-0410-9327-a35deaab8ce9 --- ceph/TODO | 9 + ceph/ebofs/AlignedBufferPool.h | 116 +++++ ceph/ebofs/Allocator.cc | 155 +++++++ ceph/ebofs/Allocator.h | 32 ++ ceph/ebofs/BlockDevice.h | 187 ++++++++ ceph/ebofs/BufferCache.cc | 223 ++++++++++ ceph/ebofs/BufferCache.h | 191 ++++++++ ceph/ebofs/Ebofs.cc | 326 ++++++++++++++ ceph/ebofs/Ebofs.h | 99 +++++ ceph/ebofs/Onode.h | 82 ++++ ceph/ebofs/Table.h | 783 +++++++++++++++++++++++++++++++++ ceph/ebofs/nodes.h | 387 ++++++++++++++++ ceph/ebofs/types.h | 138 ++++++ 13 files changed, 2728 insertions(+) create mode 100644 ceph/ebofs/AlignedBufferPool.h create mode 100644 ceph/ebofs/Allocator.cc create mode 100644 ceph/ebofs/Allocator.h create mode 100644 ceph/ebofs/BlockDevice.h create mode 100644 ceph/ebofs/BufferCache.cc create mode 100644 ceph/ebofs/BufferCache.h create mode 100644 ceph/ebofs/Ebofs.cc create mode 100644 ceph/ebofs/Ebofs.h create mode 100644 ceph/ebofs/Onode.h create mode 100644 ceph/ebofs/Table.h create mode 100644 ceph/ebofs/nodes.h create mode 100644 ceph/ebofs/types.h diff --git a/ceph/TODO b/ceph/TODO index 94c26e9fde224..15c7e775c6586 100644 --- a/ceph/TODO +++ b/ceph/TODO @@ -3,6 +3,15 @@ client - client will re-tx anything it needed to say upon rx of new mds notification +ofs +- object map : object_t -> block_t +- col map : coll_t -> block_t +- freelists : block_t -> block_t + +- objects +- collections set + + cluster issues diff --git a/ceph/ebofs/AlignedBufferPool.h b/ceph/ebofs/AlignedBufferPool.h new file mode 100644 index 0000000000000..99f44974bbee4 --- /dev/null +++ b/ceph/ebofs/AlignedBufferPool.h @@ -0,0 +1,116 @@ +#ifndef __EBOFS_BUFFERPOOL_H +#define __EBOFS_BUFFERPOOL_H + + +#include +#include +using namespace std; + +#include "include/buffer.h" +#include "include/bufferlist.h" + +//#undef dout +//#define dout(x) if (x <= g_conf.debug) cout << "bufferpool " + + + +class AlignedBufferPool { + int buffer_size; + int num_buffers; + int num_free; + + static const int MIN_ALLOC = 1024*1024; + + list allocated; // my pools + list freelist; // my freelist + + + + public: + AlignedBufferPool(int size) : buffer_size(size), num_buffers(0), num_free(0) {} + ~AlignedBufferPool() { + if (num_free != num_buffers) { + dout(1) << "WARNING: " << num_buffers-num_free << "/" << num_buffers << " buffers still allocated" << endl; + } + for (list::iterator i = allocated.begin(); + i != allocated.end(); + i++) + delete[] *i; + } + + + // individual buffers + void free(bufferptr& bp) { + free(bp.c_str()); + } + void free(char *p) { + dout(10) << "bufferpool.free " << (void*)p << endl; + freelist.push_back(p); + num_free++; + } + + static void aligned_buffer_free_func(void *arg, char *ptr) { + AlignedBufferPool *pool = (AlignedBufferPool*)arg; + pool->free(ptr); + } + + + buffer* alloc() { + // get more memory? + if (freelist.empty()) { + int get = (num_buffers ? num_buffers:1) * buffer_size; + if (get < MIN_ALLOC) get = MIN_ALLOC; + + char *n = new char[get]; + assert(n); + allocated.push_back(n); + + // add items to freelist + int num = get / buffer_size; + char *p = n; + int misalign = (unsigned)p % buffer_size; + if (misalign) { + p += buffer_size - misalign; + num--; + } + dout(2) << "bufferpool.alloc allocated " << get << " bytes, got " << num << " aligned buffers" << endl; + while (num--) { + freelist.push_back( p ); + num_free++; + num_buffers++; + p += buffer_size; + } + } + + // allocate one. + assert(!freelist.empty()); + char *p = freelist.front(); + dout(10) << "bufferpool.alloc " << (void*)p << endl; + freelist.pop_front(); + num_free--; + return new buffer(p, buffer_size, BUFFER_MODE_NOCOPY|BUFFER_MODE_NOFREE|BUFFER_MODE_CUSTOMFREE, + buffer_size, + aligned_buffer_free_func, this); + } + + + + // bufferlists + void alloc_list(int n, bufferlist& bl) { + while (n--) { + bl.push_back(alloc()); + } + } + void free_list(bufferlist& bl) { + for (list::iterator i = bl.buffers().begin(); + i != bl.buffers().end(); + i++) + free(*i); + bl.clear(); + } + + +}; + + +#endif diff --git a/ceph/ebofs/Allocator.cc b/ceph/ebofs/Allocator.cc new file mode 100644 index 0000000000000..4ab72714f5256 --- /dev/null +++ b/ceph/ebofs/Allocator.cc @@ -0,0 +1,155 @@ + +#include "Allocator.h" +#include "Ebofs.h" + + +int Allocator::find(Extent& ex, int bucket, block_t num, block_t near) +{ + Table::Cursor cursor(fs->free_tab[bucket]); + bool found = false; + + + if (fs->free_tab[bucket]->find( near, cursor ) >= 0) { + // look to the right + do { + if (cursor.current().value >= num) + found = true; + } while (!found && cursor.move_right() > 0); + } + + if (!found) { + // look to the left + fs->free_tab[bucket]->find( near, cursor ); + + while (!found && cursor.move_left()) + if (cursor.current().value >= num) + found = true; + } + + if (found) { + ex.start = cursor.current().key; + ex.length = cursor.current().value; + return 0; + } + + return -1; +} + +int Allocator::allocate(Extent& ex, block_t num, block_t near) +{ + if (!near) { + near = num/2; // this is totally wrong and stupid. + } + + int bucket; + + // look for contiguous extent + for (bucket = pick_bucket(num); bucket < EBOFS_NUM_FREE_BUCKETS; bucket++) { + if (find(ex, bucket, num, near) >= 0) { + // yay! + + // remove original + fs->free_tab[bucket]->remove( ex.start ); + fs->free_blocks += ex.length; + + if (ex.length > num) { + if (ex.start < near) { + // to the left + if (ex.start + ex.length - num <= near) { + // by a lot. take right-most portion. + Extent left; + left.start = ex.start; + left.length = ex.length - num; + ex.start += left.length; + ex.length -= left.length; + assert(ex.length == num); + release(left); + } else { + // take middle part. + Extent left,right; + left.start = ex.start; + left.length = near - ex.start; + ex.start = near; + right.start = ex.start + num; + right.length = ex.length - left.length - num; + ex.length = num; + release(left); + release(right); + } + } + else { + // to the right. take left-most part. + Extent right; + right.start = ex.start + num; + right.length = ex.length - num; + ex.length = num; + release(right); + } + } + + dout(1) << "allocator.alloc " << ex << " near " << near << endl; + return num; + } + } + + // ok, find partial extent instead. + for (block_t trysize = num/2; trysize >= 1; trysize /= 2) { + int bucket = pick_bucket(trysize); + if (find(ex, bucket, trysize, near) >= 0) { + // yay! + assert(ex.length < num); + + fs->free_tab[bucket]->remove(ex.start); + fs->free_blocks -= ex.length; + return ex.length; + } + } + + dout(1) << "allocate failed, fs full! " << fs->free_blocks << endl; + return -1; +} + +int Allocator::release(Extent& ex) +{ + Extent newex = ex; + + // one after us? + for (int b=0; b::Cursor cursor(fs->free_tab[b]); + + if (fs->free_tab[b]->find( newex.start+newex.length, cursor ) + == Table::Cursor::MATCH) { + // add following extent to ours + newex.length += cursor.current().value; + + // remove it + fs->free_tab[b]->remove( cursor.current().key ); + break; + } + } + + // one before us? + for (int b=0; b::Cursor cursor(fs->free_tab[b]); + fs->free_tab[b]->find( newex.start+newex.length, cursor ); + if (cursor.move_left() > 0 && + (cursor.current().key + cursor.current().value == newex.start)) { + // merge + newex.start = cursor.current().key; + newex.length += cursor.current().value; + + // remove it + fs->free_tab[b]->remove( cursor.current().key ); + break; + } + } + + fs->free_blocks += ex.length; + + // ok, insert newex + int b = pick_bucket(ex.length); + fs->free_tab[b]->insert(ex.start, ex.length); + return 0; +} + + diff --git a/ceph/ebofs/Allocator.h b/ceph/ebofs/Allocator.h new file mode 100644 index 0000000000000..a143e0f760ce9 --- /dev/null +++ b/ceph/ebofs/Allocator.h @@ -0,0 +1,32 @@ +#ifndef __EBOFS_ALLOCATOR_H +#define __EBOFS_ALLOCATOR_H + +#include "types.h" + +class Ebofs; + +class Allocator { + protected: + Ebofs *fs; + + static int pick_bucket(block_t num) { + int b = 0; + while (num > 1) { + b++; + num = num >> 1; + } + if (b >= EBOFS_NUM_FREE_BUCKETS) + b = EBOFS_NUM_FREE_BUCKETS-1; + return b; + } + + int find(Extent& ex, int bucket, block_t num, block_t near); + + public: + Allocator(Ebofs *f) : fs(f) {} + + int allocate(Extent& ex, block_t num, block_t near=0); + int release(Extent& ex); +}; + +#endif diff --git a/ceph/ebofs/BlockDevice.h b/ceph/ebofs/BlockDevice.h new file mode 100644 index 0000000000000..5a3b0678bc896 --- /dev/null +++ b/ceph/ebofs/BlockDevice.h @@ -0,0 +1,187 @@ +#ifndef __EBOFS_BLOCKDEVICE_H +#define __EBOFS_BLOCKDEVICE_H + +#include "include/bufferlist.h" +#include "common/Mutex.h" +#include "common/Cond.h" + +#include "types.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +class BlockDevice { + char *dev; + int fd; + block_t num_blocks; + + Mutex lock; + + public: + BlockDevice(char *d) : dev(d), fd(0), num_blocks(0) { + }; + + // get size in blocks + block_t get_num_blocks() { + if (!num_blocks) { + // stat + struct stat st; + assert(fd > 0); + int r = ::fstat(fd, &st); + assert(r == 0); + num_blocks = st.st_size / (block_t)EBOFS_BLOCK_SIZE; + } + return num_blocks; + } + + int open() { + fd = ::open(dev, O_CREAT|O_RDWR|O_SYNC|O_DIRECT); + if (fd < 0) return fd; + //dout(1) << "blockdevice.open " << dev << endl; + + // figure size + __uint64_t bsize = 0; + //int r = + ioctl(fd, BLKGETSIZE64, &bsize); + + if (bsize == 0) { + // try stat! + struct stat st; + fstat(fd, &st); + bsize = st.st_size; + } + num_blocks = bsize / (__uint64_t)EBOFS_BLOCK_SIZE; + + dout(1) << "blockdevice.open " << dev << " " << bsize << " bytes, " << num_blocks << " blocks" << endl; + return fd; + } + int close() { + return ::close(fd); + } + + + // single blocks + int read(block_t bno, unsigned num, + bufferptr& bptr) { + lock.Lock(); + assert(fd > 0); + + off_t offset = bno * EBOFS_BLOCK_SIZE; + off_t actual = lseek(fd, offset, SEEK_SET); + assert(actual == offset); + + int len = num*EBOFS_BLOCK_SIZE; + assert((int)bptr.length() >= len); + int got = ::read(fd, bptr.c_str(), len); + assert(got <= len); + + lock.Unlock(); + return 0; + } + + int write(unsigned bno, unsigned num, + bufferptr& bptr) { + lock.Lock(); + assert(fd > 0); + + off_t offset = bno * EBOFS_BLOCK_SIZE; + off_t actual = lseek(fd, offset, SEEK_SET); + assert(actual == offset); + + // write buffers + off_t len = num*EBOFS_BLOCK_SIZE; + off_t left = bptr.length(); + if (left > len) left = len; + int r = ::write(fd, bptr.c_str(), left); + //dout(1) << "write " << fd << " " << (void*)bptr.c_str() << " " << left << endl; + if (r < 0) { + dout(1) << "couldn't write bno " << bno << " num " << num << " (" << left << " bytes) r=" << r << " errno " << errno << " " << strerror(errno) << endl; + } + + lock.Unlock(); + return 0; + } + + + // lists + int read(block_t bno, unsigned num, + bufferlist& bl) { + lock.Lock(); + assert(fd > 0); + + off_t offset = bno * EBOFS_BLOCK_SIZE; + off_t actual = lseek(fd, offset, SEEK_SET); + assert(actual == offset); + + off_t len = num*EBOFS_BLOCK_SIZE; + assert((int)bl.length() >= len); + + for (list::iterator i = bl.buffers().begin(); + i != bl.buffers().end(); + i++) { + assert(i->length() % EBOFS_BLOCK_SIZE == 0); + + int blen = i->length(); + if (blen > len) blen = len; + + int got = ::read(fd, i->c_str(), blen); + assert(got <= blen); + + len -= blen; + if (len == 0) break; + } + + lock.Unlock(); + return 0; + } + + int write(unsigned bno, unsigned num, + bufferlist& bl) { + lock.Lock(); + assert(fd > 0); + + off_t offset = bno * EBOFS_BLOCK_SIZE; + off_t actual = lseek(fd, offset, SEEK_SET); + assert(actual == offset); + + // write buffers + off_t len = num*EBOFS_BLOCK_SIZE; + + for (list::iterator i = bl.buffers().begin(); + i != bl.buffers().end(); + i++) { + assert(i->length() % EBOFS_BLOCK_SIZE == 0); + + off_t left = i->length(); + if (left > len) left = len; + int r = ::write(fd, i->c_str(), left); + //dout(1) << "write " << fd << " " << (void*)bptr.c_str() << " " << left << endl; + if (r < 0) { + dout(1) << "couldn't write bno " << bno << " num " << num << " (" << left << " bytes) r=" << r << " errno " << errno << " " << strerror(errno) << endl; + } else { + assert(r == left); + } + + len -= left; + if (len == 0) break; + } + + lock.Unlock(); + return 0; + } + + + +}; + +#endif diff --git a/ceph/ebofs/BufferCache.cc b/ceph/ebofs/BufferCache.cc new file mode 100644 index 0000000000000..840f7446a1411 --- /dev/null +++ b/ceph/ebofs/BufferCache.cc @@ -0,0 +1,223 @@ + +#include "BufferCache.h" + + +/*********** BufferHead **************/ + + + + + + + + + + + +/************ ObjectCache **************/ + + +int ObjectCache::map_read(block_t start, block_t len, + map& hits, + map& missing, + map& rx) { + + map::iterator p = data.lower_bound(start); + // p->first >= start + + block_t cur = start; + block_t left = len; + + if (p != data.begin() && p->first < cur) { + p--; // might overlap! + if (p->first + p->second->object_loc.length <= cur) + p++; // doesn't overlap. + } + + for (; left > 0; p++) { + // at end? + if (p == data.end()) { + // rest is a miss. + BufferHead *n = new BufferHead(); + bc->add_bh(n); + n->object_loc.start = cur; + n->object_loc.length = left; + data[cur] = n; + bc->mark_rx(n); + missing[cur] = n; + break; + } + + if (p->first <= cur) { + // have it (or part of it) + BufferHead *e = p->second; + + if (e->get_state() == BufferHead::STATE_CLEAN || + e->get_state() == BufferHead::STATE_DIRTY || + e->get_state() == BufferHead::STATE_TX) { + hits[cur] = e; // readable! + } + else if (e->get_status() == BufferHead::STATE_RX) { + rx[cur] = e; // missing, not readable. + } + + block_t lenfromcur = e->object_loc.length; + if (e->object_loc.start < cur) + lenfromcur -= cur - e->object_loc.start; + + if (lenfromcur < left) { + cur += lenfromcur; + left -= lenfromcur; + continue; // more! + } else { + break; // done. + } + } else if (p->first > cur) { + // gap.. miss + block_t next = p->first; + BufferHead *n = new BufferHead(); + bc->add_bh(n); + n->object_loc.start = cur; + if (next - cur < left) + n->object_loc.length = next - cur; + else + n->object_loc.length = left; + data[cur] = n; + bc->mark_rx(n); + missing[cur] = n; + + cur += object_loc.length; + left -= object_loc.length; + continue; // more? + } + else + assert(0); + } + + return 0; +} + +int ObjectCache::map_write(block_t start, block_t len, + map& hits) +{ + map::iterator p = data.lower_bound(start); + // p->first >= start + + block_t cur = start; + block_t left = len; + + if (p != data.begin() && p->first < cur) { + p--; // might overlap! + if (p->first + p->second->object_loc.length <= cur) + p++; // doesn't overlap. + } + + for (; left > 0; p++) { + // at end? + if (p == data.end()) { + BufferHead *n = new BufferHead(); + bc->add_bh(n); + n->object_loc.start = cur; + n->object_loc.length = left; + data[cur] = n; + bc->mark_dirty(n); + hits[cur] = n; + break; + } + + if (p->first <= cur) { + // have it (or part of it) + BufferHead *e = p->second; + + if (p->first == cur && p->second->object_loc.length <= left) { + // whole bufferhead, piece of cake. + } else { + if (e->get_status() == BufferHead::STATUS_CLEAN) { + // we'll need to cut the buffer! :( + if (p->first == cur && p->second->object_loc.length > left) { + // we want left bit (one splice) + data[cur+left] = bc->split(e, cur + left); + } + else if (p->first < cur && cur+left >= p->first+p->second->object_loc.length) { + // we want right bit (one splice) + e = bc->split(e, cur); + data[cur] = e; + p++; + assert(p->second == e); + } else { + // we want middle bit (two splices) + e = bc->split(e, cur); + data[cur] = e; + p++; + assert(p->second == e); + data[cur+left] = bc->split(e, cur+left); + } + } + } + + // FIXME + e->set_status(BufferHead::STATUS_DIRTY); + hits[cur] = e; + + // keep going. + block_t lenfromcur = e->object_loc.length; + if (e->object_loc.start < cur) + lenfromcur -= cur - e->object_loc.start; + + if (lenfromcur < left) { + cur += lenfromcur; + left -= lenfromcur; + continue; // more! + } else { + break; // done. + } + } else { + // gap! + block_t next = p->first; + BufferHead *n = new BufferHead(); + bc->add_bh(n); + n->object_loc.start = cur; + if (next - cur < left) + n->object_loc.length = next - cur; + else + n->object_loc.length = left; + data[cur] = n; + bc->mark_dirty(n); + hits[cur] = n; + + cur += object_loc.length; + left -= object_loc.length; + continue; // more? + } + } + + return 0; +} + + + +/************** BufferCache ***************/ + + +BufferHead *BufferCache::split(BufferHead *orig, block_t after) +{ + BufferHead *right = new BufferHead; + right->version(orig->get_version()); + right->set_state(orig->get_state()); + + block_t mynewlen = after - orig->object_loc.start; + right->object_loc.start = after; + right->object_loc.length = orig->object_loc.length - mynewlen; + + add_bh(right); + + stat_sub(orig); + orig->object_loc.length = mynewlen; + stat_add(orig); + + + // FIXME: waiters? + + return right; +} + diff --git a/ceph/ebofs/BufferCache.h b/ceph/ebofs/BufferCache.h new file mode 100644 index 0000000000000..fee575eefc740 --- /dev/null +++ b/ceph/ebofs/BufferCache.h @@ -0,0 +1,191 @@ +#ifndef __EBOFS_BUFFERCACHE_H +#define __EBOFS_BUFFERCACHE_H + +#include "include/lru.h" + +#include "types.h" +#include "AlignedBufferPool.h" + + +#define BH_STATE_DIRTY 1 + +class BufferHead : public LRUObject { + public: + //const static int STATUS_DNE = 0; // not in cache; undefined + const static int STATE_CLEAN = 1; // Rw clean + const static int STATE_DIRTY = 2; // RW dirty + const static int STATE_TX = 3; // Rw flushing to disk + const static int STATE_RX = 4; // w reading from disk + + private: + bufferlist bl; + + int ref; + int state; + version_t version; + + Extent object_loc; // block position _in_object_ + + list waitfor_read; + list waitfor_flush; + + public: + BufferHead() : + ref(0), state(STATE_CLEAN), version(0) {} + + int get() { + assert(ref >= 0); + if (ref == 0) lru_pin(); + return ++ref; + } + int put() { + assert(ref > 0); + if (ref == 1) lru_unpin(); + return --ref; + } + + block_t length() { return object_loc.length; } + + void set_state(int s) { + if (state == STATE_CLEAN && s != STATE_CLEAN) get(); + if (state != STATE_CLEAN && s == STATE_CLEAN) put(); + state = s; + } + int get_state() { return state; } + + bool is_dirty() { return state == STATE_DIRTY; } + bool is_clean() { return state == STATE_CLEAN; } + bool is_rx() { return state == STATE_RX; } + bool is_tx() { return state == STATE_TX; } + + + void substr(block_t off, block_t num, bufferlist& sub) { + // determine offset in bufferlist + block_t start = object_loc - off; + block_t len = num - start; + if (start+len > object_loc.length) + len = object_loc.length - start; + + sub.substr_of(bl, start*EBOFS_BLOCK_SIZE, len*EBOFS_BLOCK_SIZE); + } + +}; + + +class ObjectCache { + private: + object_t oid; + class BufferCache *bc; + + map data; + + public: + ObjectCache(object_t o, class BufferCache *b) : oid(o), bc(b) {} + + int map_read(block_t start, block_t len, + map& hits, // hits + map& missing, // read these from disk + map& rx); // wait for these to finish reading from disk + + int map_write(block_t start, block_t len, + map& hits); // write to these. +}; + + +class BufferCache { + private: + AlignedBufferPool& bufferpool; + + LRU lru_dirty, lru_rest; + off_t stat_clean; + off_t stat_dirty; + off_t stat_rx; + off_t stat_tx; + + public: + BufferCache(AlignedBufferPool& bp) : bufferpool(bp), + stat_clean(0), stat_dirty(0), + stat_rx(0), stat_tx(0) {} + + // bh's in cache + void add_bh(BufferHead *bh) { + if (bh->is_dirty()) + lru_dirty.lru_insert_mid(bh); + else + lru_rest.lru_insert_mid(bh); + stat_add(bh); + } + void touch(BufferHead *bh) { + if (bh->is_dirty()) + lru_dirty.lru_touch(bh); + else + lru_rest.lru_touch(bh); + } + void remove_bh(BufferHead *bh) { + stat_sub(bh); + if (bh->is_dirty()) + lru_dirty.lru_remove(bh); + else + lru_rest.lru_remove(bh); + } + + // stats + void stat_add(BufferHead *bh) { + switch (bh->get_state()) { + case BufferHead::STATE_CLEAN: stat_clean += bh->length(); break; + case BufferHead::STATE_DIRTY: stat_dirty += bh->length(); break; + case BufferHead::STATE_RX: stat_rx += bh->length(); break; + case BufferHead::STATE_TX: stat_tx += bh->length(); break; + } + } + void stat_sub(BufferHead *bh) { + switch (bh->get_state()) { + case BufferHead::STATE_CLEAN: stat_clean -= bh->length(); break; + case BufferHead::STATE_DIRTY: stat_dirty -= bh->length(); break; + case BufferHead::STATE_RX: stat_rx -= bh->length(); break; + case BufferHead::STATE_TX: stat_tx -= bh->length(); break; + } + } + + // bh state + void set_state(BufferHead *bh, int s) { + // move between lru lists? + if (s == BufferHead::STATE_DIRTY && bh->get_state() != BufferHead::STATE_DIRTY) { + lru_rest.lru_remove(bh); + lru_dirty.lru_insert_top(bh); + } + if (s != BufferHead::STATE_DIRTY && bh->get_state() == BufferHead::STATE_DIRTY) { + lru_dirty.lru_remove(bh); + lru_rest.lru_insert_mid(bh); + } + + // set state + stat_sub(bh); + bh->set_state(s); + stat_add(bh); + } + + void copy_state(BufferHead *bh1, BufferHead *bh2) { + set_state(bh2, bh1->get_state()); + } + + void mark_clean(BufferHead *bh) { set_state(bh, BufferHead::STATE_CLEAN); }; + void mark_dirty(BufferHead *bh) { set_state(bh, BufferHead::STATE_DIRTY); }; + void mark_rx(BufferHead *bh) { set_state(bh, BufferHead::STATE_RX); }; + void mark_tx(BufferHead *bh) { set_state(bh, BufferHead::STATE_TX); }; + + + BufferHead *split(BufferHead *orig, block_t after); + +}; + + + +class FlushWaiter { + object_t oid; + map version_map; // versions i need flushed + Context *onflush; +}; + + +#endif diff --git a/ceph/ebofs/Ebofs.cc b/ceph/ebofs/Ebofs.cc new file mode 100644 index 0000000000000..a1efa41b59e83 --- /dev/null +++ b/ceph/ebofs/Ebofs.cc @@ -0,0 +1,326 @@ + +#include "Ebofs.h" + +// ******************* + +#undef dout +#define dout(x) if (x <= g_conf.debug) cout << "ebofs." + +int Ebofs::mount() +{ + assert(!mounted); + + // read super + bufferptr bp1, bp2; + dev.read(0, 1, bp1); + dev.read(1, 1, bp2); + + struct ebofs_super *sb1 = (struct ebofs_super*)bp1.c_str(); + struct ebofs_super *sb2 = (struct ebofs_super*)bp2.c_str(); + struct ebofs_super *sb = 0; + + // pick newest super + dout(2) << "mount super @0 v " << sb1->version << endl; + dout(2) << "mount super @1 v " << sb2->version << endl; + if (sb1->version > sb2->version) + sb = sb1; + else + sb = sb2; + + super_version = sb->version; + + // init node pools + dout(2) << "mount table nodepool" << endl; + table_nodepool.read( dev, &sb->table_nodepool ); + + // open tables + dout(2) << "mount opening tables" << endl; + object_tab = new Table( table_nodepool, sb->object_tab ); + for (int i=0; i( table_nodepool, sb->free_tab[i] ); + + dout(2) << "mount mounted" << endl; + mounted = true; + return 0; +} + + +int Ebofs::mkfs() +{ + block_t num_blocks = dev.get_num_blocks(); + + // create first noderegion + Extent nr; + nr.start = 2; + nr.length = num_blocks / 10000; + if (nr.length < 10) nr.length = 10; + NodeRegion *r = new NodeRegion(0,nr); + table_nodepool.add_region(r); + table_nodepool.init_all_free(); + dout(1) << "mkfs: first node region at " << nr << endl; + + // init tables + struct ebofs_table empty; + empty.num_keys = 0; + empty.root = -1; + empty.depth = 0; + + object_tab = new Table( table_nodepool, empty ); + collection_tab = new Table( table_nodepool, empty ); + + for (int i=0; i( table_nodepool, empty ); + + + // add free space + Extent left; + left.start = nr.start + nr.length; + left.length = num_blocks - left.start; + dout(1) << "mkfs: free blocks at " << left << endl; + allocator.release( left ); + + // write nodes + dout(1) << "mkfs: flushing nodepool" << endl; + table_nodepool.induce_full_flush(); + table_nodepool.flush( dev ); + + // write super (2x) + dout(1) << "mkfs: writing superblocks" << endl; + super_version = 0; + write_super(); + write_super(); + + mounted = true; + return 0; +} + +int Ebofs::umount() +{ + mounted = false; + + // wait + + // flush + dout(1) << "umount: flushing nodepool" << endl; + table_nodepool.flush( dev ); + + // super + dout(1) << "umount: writing superblocks" << endl; + write_super(); + + // close tables + delete object_tab; + delete collection_tab; + for (int i=0; iget_num_keys(); + sb.object_tab.root = object_tab->get_root(); + sb.object_tab.depth = object_tab->get_depth(); + + for (int i=0; iget_num_keys(); + sb.free_tab[i].root = free_tab[i]->get_root(); + sb.free_tab[i].depth = free_tab[i]->get_depth(); + } + + // pools + sb.table_nodepool.num_regions = table_nodepool.get_num_regions(); + for (int i=0; iinsert( oid, on->onode_loc ); // even tho i'm not placed yet + + on->get(); + return on; +} + + +Onode* Ebofs::get_onode(object_t oid) +{ + // in cache? + if (onode_map.count(oid)) { + // yay + Onode *on = onode_map[oid]; + on->get(); + return on; + } + + // on disk? + Extent onode_loc; + if (object_tab->lookup(oid, onode_loc) != Table::Cursor::MATCH) { + // object dne. + return 0; + } + + // read it! + bufferlist bl; + bufferpool.alloc_list( onode_loc.length, bl ); + dev.read( onode_loc.start, onode_loc.length, bl ); + + // parse data block + Onode *on = new Onode(oid); + + struct ebofs_onode *eo = (struct ebofs_onode*)bl.c_str(); + on->onode_loc = eo->onode_loc; + on->object_size = eo->object_size; + on->object_blocks = eo->object_blocks; + + // parse attributes + char *p = bl.c_str() + sizeof(*eo); + for (int i=0; inum_attr; i++) { + string key = p; + p += key.length() + 1; + int len = *(int*)(p); + p += sizeof(len); + void *val = new char[len]; + memcpy(val, p, len); + p += len; + on->attr[key] = pair(len, val); + } + + // parse extents + on->extents.clear(); + for (int i=0; inum_extents; i++) { + on->extents.push_back( *(Extent*)(p) ); + p += sizeof(Extent); + } + + return on; +} + + +void Ebofs::write_onode(Onode *on) +{ + // allocate + int bytes = sizeof(ebofs_onode) + on->get_attr_bytes() + on->get_extent_bytes(); + unsigned blocks = (bytes-1)/EBOFS_BLOCK_SIZE + 1; + + bufferlist bl; + bufferpool.alloc_list( blocks, bl ); + + // place on disk + if (on->onode_loc.length < blocks) { + // relocate onode! + if (on->onode_loc.length) + allocator.release(on->onode_loc); + + block_t first = 0; + if (on->extents.size()) + first = on->extents[0].start; + + allocator.allocate(on->onode_loc, blocks, first); + object_tab->remove( on->object_id ); + object_tab->insert( on->object_id, on->onode_loc ); + } + + struct ebofs_onode *eo = (struct ebofs_onode*)bl.c_str(); + eo->onode_loc = on->onode_loc; + eo->object_id = on->object_id; + eo->object_size = on->object_size; + eo->object_blocks = on->object_blocks; + eo->num_attr = on->attr.size(); + eo->num_extents = on->extents.size(); + + // attr + char *p = bl.c_str() + sizeof(*eo); + for (map >::iterator i = on->attr.begin(); + i != on->attr.end(); + i++) { + memcpy(p, i->first.c_str(), i->first.length()+1); + p += i->first.length()+1; + *(int*)(p) = i->second.first; + p += sizeof(int); + memcpy(p, i->second.second, i->second.first); + p += i->second.first; + } + + // extents + for (unsigned i=0; iextents.size(); i++) { + *(Extent*)(p) = on->extents[i]; + p += sizeof(Extent); + } + + // write + dev.write( on->onode_loc.start, on->onode_loc.length, bl ); +} + +void Ebofs::remove_onode(Onode *on) +{ + // remove from table + object_tab->remove(on->object_id); + + // free onode space + if (on->onode_loc.length) + allocator.release(on->onode_loc); + + // free data space + for (unsigned i=0; iextents.size(); i++) + allocator.release(on->extents[i]); + + delete on; +} + +void Ebofs::put_onode(Onode *on) +{ + on->put(); +} + + + + + +// *** file i/o *** + +int Ebofs::read(object_t oid, size_t len, off_t off, bufferlist& bl) +{ + + +} diff --git a/ceph/ebofs/Ebofs.h b/ceph/ebofs/Ebofs.h new file mode 100644 index 0000000000000..befc5fbd214fb --- /dev/null +++ b/ceph/ebofs/Ebofs.h @@ -0,0 +1,99 @@ + +#include +using namespace __gnu_cxx; + +#include "include/Context.h" +#include "include/bufferlist.h" + +#include "types.h" +#include "Onode.h" +#include "BlockDevice.h" +#include "nodes.h" +#include "Allocator.h" +#include "Table.h" +#include "AlignedBufferPool.h" + +#include "common/Mutex.h" +#include "common/Cond.h" + + + + +class Ebofs { + protected: + // ** super ** + bool mounted; + BlockDevice &dev; + version_t super_version; + + int write_super(); + + // ** allocator ** + block_t free_blocks; + Allocator allocator; + friend class Allocator; + + // ** tables and sets ** + NodePool table_nodepool; // for primary tables. + //NodePool set_nodepool; // for collections, etc. + + AlignedBufferPool bufferpool; + + // object map: object -> onode_loc + Table *object_tab; + + // collection map: id -> cnode_loc + Table *collection_tab; + + // free list + Table *free_tab[EBOFS_NUM_FREE_BUCKETS]; + + + + // ** cache ** + hash_map onode_map; // onode cache + LRU onode_lru; + + Onode* new_onode(object_t oid); // make new onode. ref++. + Onode* get_onode(object_t oid); // get cached onode, or read from disk. ref++. + void write_onode(Onode *on); + void remove_onode(Onode *on); + void put_onode(Onode* o); // put it back down. ref--. + + public: + Ebofs(BlockDevice& d) : + dev(d), + free_blocks(0), allocator(this), + bufferpool(EBOFS_BLOCK_SIZE), + object_tab(0), collection_tab(0) { + for (int i=0; i > attr; + vector extents; + + ObjectCache *oc; + + + public: + Onode(object_t oid) : ref(0), object_id(oid), + object_size(0), object_blocks(0), oc(0) { + onode_loc.length = 0; + } + + block_t get_onode_id() { return onode_loc.start; } + int get_onode_len() { return onode_loc.length; } + + void get() { + if (ref == 0) lru_pin(); + ref++; + } + void put() { + ref--; + if (ref == 0) lru_unpin(); + } + + // attr + int getxattr(char *name, void *value, int size) { + return 0; + } + int setxattr(char *name, void *value, int size) { + return 0; + } + int removexattr(char *name) { + return 0; + } + + + // pack/unpack + int get_attr_bytes() { + int s = 0; + for (map >::iterator i = attr.begin(); + i != attr.end(); + i++) { + s += i->first.length() + 1; + s += i->second.first + sizeof(int); + } + return s; + } + int get_extent_bytes() { + return sizeof(Extent) * extents.size(); + } + + + + +}; + + + +#endif diff --git a/ceph/ebofs/Table.h b/ceph/ebofs/Table.h new file mode 100644 index 0000000000000..e661f49a95840 --- /dev/null +++ b/ceph/ebofs/Table.h @@ -0,0 +1,783 @@ +#ifndef __EBOFS_TABLE_H +#define __EBOFS_TABLE_H + +#include "types.h" +#include "nodes.h" + +/** table **/ + +class _Table { + int asdfasdfasdf; +}; + + +template +class Table : public _Table { + private: + NodePool &pool; + + nodeid_t root; + int nkeys; + int depth; + + public: + Table(NodePool &p, + struct ebofs_table& bts) : + pool(p), + root(bts.root), nkeys(bts.num_keys), depth(bts.depth) {} + + nodeid_t get_root() { return root; } + int get_num_keys() { return nkeys; } + int get_depth() { return depth; } + + + /* + */ + class IndexItem { + public: + K key; + nodeid_t node; + static const int MAX = Node::ITEM_LEN / (sizeof(K) + sizeof(nodeid_t)); + static const int MIN = MAX/2; + }; + class LeafItem { + public: + K key; + V value; + static const int MAX = Node::ITEM_LEN / (sizeof(K) + sizeof(V)); + static const int MIN = MAX/2; + }; + + class Nodeptr { + public: + Node *node; + + Nodeptr() : node(0) {} + Nodeptr(Node *n) : node(n) {} + Nodeptr& operator=(Node *n) { + node = n; + return *this; + } + + LeafItem& leaf_item(int i) { return (( LeafItem*)(node->item_ptr()))[i]; } + IndexItem& index_item(int i) { return ((IndexItem*)(node->item_ptr()))[i]; } + K key(int i) { + if (node->is_index()) + return index_item(i).key; + else + return leaf_item(i).key; + } + + bool is_leaf() { return node->is_leaf(); } + bool is_index() { return node->is_index(); } + void set_type(int t) { node->set_type(t); } + + int max_items() const { + if (node->is_leaf()) + return LeafItem::MAX; + else + return IndexItem::MAX; + } + int min_items() const { return max_items() / 2; } + + nodeid_t get_id() { return node->get_id(); } + + int size() { return node->size(); } + void set_size(int s) { node->set_size(s); } + + void remove_at_pos(int p) { + if (node->is_index()) { + for (int i=p; ip; i--) + leaf_item(i) = leaf_item(i-1); + leaf_item(p).key = key; + leaf_item(p).value = value; + set_size(size() + 1); + } + void insert_at_index_pos(int p, K key, nodeid_t node) { + assert(is_index()); + for (int i=size(); i>p; i--) + index_item(i) = index_item(i-1); + index_item(p).key = key; + index_item(p).node = node; + set_size(size() + 1); + } + + void append_item(LeafItem& i) { + leaf_item(size()) = i; + set_size(size() + 1); + } + void append_item(IndexItem& i) { + index_item(size()) = i; + set_size(size() + 1); + } + + void split(Nodeptr& right) { + if (node->is_index()) { + for (int i=min_items(); iis_index()) + for (int i=0; i open; // open nodes + vector pos; // position within the node + int level; + + Cursor(Table *t) : table(t), open(t->depth), pos(t->depth), level(0) {} + + public: + + LeafItem& current() { + assert(open[level].is_leaf()); + return open[level].leaf_item(pos[level]); + } + + // ** read-only bits ** + int move_left() { + if (table->depth == 0) return OOB; + + // work up around branch + int l; + for (l = level; l >= 0; l--) + if (pos[l] > 0) break; + if (l < 0) + return OOB; // we are the first item in the btree + + // move left one + pos[l]--; + + // work back down right side + for (; lpool.get_node( open[l].index_item(pos[l]).node ); + pos[l+1] = open[l+1].size() - 1; + } + return 0; + } + int move_right() { + if (table->depth == 0) return OOB; + + // work up branch + int l; + for (l=level; l>=0; l--) + if (pos[l] < open[l].size() - 1) break; + if (l < 0) { + /* we are at last item in btree. */ + if (pos[level] < open[level].size()) { + pos[level]++; /* move into add position! */ + return 0; + } + return -1; + } + + /* move left one */ + assert( pos[l] < open[l].size() ); + pos[l]++; + + /* work back down */ + for (; lpool.get_node( open[l].index_item(pos[l]).node ); + pos[l+1] = 0; // furthest left + } + return 0; + } + + // ** modifications ** + void dirty() { + for (int l=level; l>=0; l--) { + if (open[l].node->is_dirty()) break; // already dirty! (and thus parents are too) + + table->pool.dirty_node(open[l].node); + if (l > 0) + open[l-1].index_item( pos[l-1] ).node = open[l].get_id(); + else + table->root = open[0].get_id(); + } + } + private: + void repair_parents() { + // did i make a change at the start of a node? + if (pos[level] == 0) { + K key = open[level].key(0); // new key parents should have + for (int j=level-1; j>=0; j--) { + if (open[j].index_item(pos[j]).key == key) + break; /* it's the same key, we can stop fixing */ + open[j].index_item(pos[j]).key = key; + if (pos[j] > 0) break; /* last in position 0.. */ + } + } + } + + public: + void remove() { + dirty(); + + // remove from node + open[level].remove_at_pos( pos[level] ); + repair_parents(); + + // was it a key? + if (level == table->depth-1) + table->nkeys--; + } + + void insert(K key, V value) { + dirty(); + + // insert + open[level].insert_at_leaf_pos(pos[level], key, value); + repair_parents(); + + // was it a key? + if (level == table->depth-1) + table->nkeys++; + } + + int rotate_left() { + if (level == 0) return -1; // i am root + if (pos[level-1] == 0) return -1; // nothing to left + + Nodeptr here = open[level]; + Nodeptr parent = open[level-1]; + Nodeptr left = table->pool.get_node( parent.index_item(pos[level-1] - 1).node ); + if (left.size() == left.max_items()) return -1; // it's full + + // make both dirty + dirty(); + if (!left.node->is_dirty()) { + table->pool.dirty_node(left.node); + parent.index_item(pos[level-1]-1).node = left.get_id(); + } + + dbtout << "rotating item " << here.key(0) << " left from " << here.get_id() << " to " << left.get_id() << endl; + + /* add */ + if (here.node->is_leaf()) + left.append_item(here.leaf_item(0)); + else + left.append_item(here.index_item(0)); + + /* remove */ + here.remove_at_pos(0); + + /* fix parent index for me */ + parent.index_item( pos[level-1] ).key = here.key(0); + // we never have to update past immediate parent, since we're not at pos 0 + + /* adjust cursor */ + if (pos[level] > 0) + pos[level]--; + //else + //assert(1); /* if we were positioned here, we're equal */ + /* if it was 0, then the shifted item == our key, and we can stay here safely. */ + return 0; + } + int rotate_right() { + if (level == 0) return -1; // i am root + if (pos[level-1] + 1 >= open[level-1].size()) return -1; // nothing to right + + Nodeptr here = open[level]; + Nodeptr parent = open[level-1]; + Nodeptr right = table->pool.get_node( parent.index_item( pos[level-1] + 1 ).node ); + if (right.size() == right.max_items()) return -1; // it's full + + // make both dirty + dirty(); + if (!right.node->is_dirty()) { + table->pool.dirty_node(right.node); + parent.index_item( pos[level-1]+1 ).node = right.get_id(); + } + + if (pos[level] == here.size()) { + /* let's just move the cursor over! */ + dbtout << "shifting cursor right from " << here.get_id() << " to less-full node " << right.get_id() << endl; + open[level] = right; + pos[level] = 0; + pos[level-1]++; + return 0; + } + + dbtout << "rotating item " << here.key(here.size()-1) << " right from " + << here.get_id() << " to " << right.get_id() << endl; + + /* add */ + if (here.is_index()) + right.insert_at_index_pos(0, + here.index_item( here.size()-1 ).key, + here.index_item( here.size()-1 ).node); + else + right.insert_at_leaf_pos(0, + here.leaf_item( here.size()-1 ).key, + here.leaf_item( here.size()-1 ).value); + + /* remove */ + here.set_size(here.size() - 1); + + /* fix parent index for right */ + parent.index_item( pos[level-1] + 1 ).key = right.key(0); + + return 0; + } + }; + + + public: + bool almost_full() { + if (2*(depth+1) > pool.num_free()) // worst case, plus some. + return true; + return false; + } + + int find(K key, Cursor& cursor) { + //dbtout << "find " << key << endl; + + if (depth == 0) + return Cursor::OOB; + + // init + cursor.level = 0; + + // start at root + Nodeptr curnode( pool.get_node(root) ); + cursor.open[0] = curnode; + + if (curnode.size() == 0) return -1; // empty! + + // find leaf + for (cursor.level = 0; cursor.level < depth-1; cursor.level++) { + /* if key=5, we want 2 3 [4] 6 7, or 3 4 [5] 5 6 (err to the left) */ + int left = 0; /* i >= left */ + int right = curnode.size()-1; /* i < right */ + while (left < right) { + int i = left + (right - left) / 2; + if (curnode.index_item(i).key < key) { + left = i + 1; + } else if (i && curnode.index_item(i-1).key >= key) { + right = i; + } else { + left = right = i; + break; + } + } + int i = left; + if (i && curnode.index_item(i).key > key) i--; + +#ifdef EBOFS_DEBUG_BTREE + int j; + for (j=0; j key) break; + } + if (i != j) { + dbtout << "btree binary search failed" << endl; + i = j; + } +#endif + + cursor.pos[cursor.level] = i; + + /* get child node */ + curnode = pool.get_node( cursor.open[cursor.level].index_item(i).node ); + cursor.open[cursor.level+1] = curnode; + } + + /* search leaf */ + /* if key=5, we want 2 3 4 [6] 7, or 3 4 [5] 5 6 (err to the right) */ + int left = 0; /* i >= left */ + int right = curnode.size(); /* i < right */ + while (left < right) { + int i = left + (right - left) / 2; + if (curnode.leaf_item(i).key < key) { + left = i + 1; + } else if (i && curnode.leaf_item(i-1).key >= key) { + right = i; + } else { + left = right = i; + break; + } + } + int i = left; + +#ifdef EBOFS_DEBUG_BTREE + int j; + for (j=0; j= key) break; + } + if (i != j) { + dbtout << "btree binary search failed" << endl; + i = j; + } +#endif + + cursor.pos[cursor.level] = i; /* first key in this node, or key insertion point */ + + if (curnode.size() >= i+1) { + if (curnode.leaf_item(i).key == key) { + return Cursor::MATCH; /* it's the actual key */ + } else { + return Cursor::INSERT; /* it's an insertion point */ + } + } + return Cursor::OOB; /* it's the end of the btree (also a valid insertion point) */ + } + + int lookup(K key, V& value) { + Cursor cursor(this); + if (find(key, cursor) == Cursor::MATCH) { + value = cursor.current().value; + return 0; + } + return -1; + } + + int insert(K key, V value) { + dbtout << "insert " << key << " -> " << value << endl; + if (almost_full()) return -1; + + // empty? + if (nkeys == 0) { + // create a root node (leaf!) + assert(root == -1); + assert(depth == 0); + Nodeptr newroot( pool.new_node() ); + newroot.set_type(Node::TYPE_LEAF); + root = newroot.get_id(); + depth++; + } + + // start at/near key + Cursor cursor(this); + find(key, cursor); + + // insert loop + nodeid_t nodevalue = 0; + while (1) { + + /* room in this node? */ + if (cursor.open[cursor.level].size() < cursor.open[cursor.level].max_items()) { + if (cursor.open[cursor.level].is_leaf()) + cursor.insert( key, value ); // will dirty, etc. + else { + // indices are already dirty + cursor.open[cursor.level].insert_at_index_pos(cursor.pos[cursor.level], key, nodevalue); + } + return 0; + } + + /* this node is full. */ + assert( cursor.open[cursor.level].size() == cursor.open[cursor.level].max_items() ); + + /* can we rotate? */ + if (cursor.level > 0) { + if ((cursor.pos[cursor.level-1] > 0 + && cursor.rotate_left() >= 0) || + (cursor.pos[cursor.level-1] + 1 < cursor.open[cursor.level-1].size() + && cursor.rotate_right() >= 0)) { + + if (cursor.open[cursor.level].is_leaf()) + cursor.insert( key, value ); // will dirty, etc. + else { + // indices are already dirty + cursor.open[cursor.level].insert_at_index_pos(cursor.pos[cursor.level], key, nodevalue); + } + return 0; + } + } + + /** split node **/ + + if (cursor.level == depth-1) { + dbtout << "splitting leaf " << cursor.open[cursor.level].get_id() << endl; + } else { + dbtout << "splitting index " << cursor.open[cursor.level].get_id() << endl; + } + + cursor.dirty(); + + // split + Nodeptr newnode( pool.new_node() ); + Nodeptr leftnode = cursor.open[cursor.level]; + newnode.set_type( leftnode.node->get_type() ); + leftnode.split( newnode ); + + /* insert our item */ + if (cursor.pos[cursor.level] > leftnode.size()) { + // not with cursor, since this node isn't added yet! + if (newnode.is_leaf()) { + newnode.insert_at_leaf_pos( cursor.pos[cursor.level] - leftnode.size(), + key, value ); + nkeys++; + } else { + newnode.insert_at_index_pos( cursor.pos[cursor.level] - leftnode.size(), + key, nodevalue ); + } + } else { + // with cursor (if leaf) + if (leftnode.is_leaf()) + cursor.insert( key, value ); + else + leftnode.insert_at_index_pos( cursor.pos[cursor.level], + key, nodevalue ); + } + + /* are we at the root? */ + if (cursor.level == 0) { + /* split root. */ + dbtout << "that split was the root " << root << endl; + Nodeptr newroot( pool.new_node() ); + + /* new root node */ + newroot.set_type(Node::TYPE_INDEX); + newroot.set_size(2); + newroot.index_item(0).key = leftnode.key(0); + newroot.index_item(0).node = root; + newroot.index_item(1).key = newnode.key(0); + newroot.index_item(1).node = newnode.get_id(); + + /* heighten tree */ + depth++; + root = newroot.get_id(); + return 0; + } + + /* now insert newindex in level-1 */ + nodevalue = newnode.get_id(); + key = newnode.key(0); + cursor.level--; + cursor.pos[cursor.level]++; // ...to the right of leftnode! + } + } + + + int remove(K key) { + if (almost_full()) return -1; + + Cursor cursor(this); + if (find(key, cursor) <= 0) + return -1; // key dne + + + while (1) { + cursor.remove(); + + // balance + adjust + + if (cursor.level == 0) { + // useless root index? + if (cursor.open[0].size() == 1 && + depth > 1) { + depth--; + root = cursor.open[0].index_item(0).node; + pool.release( cursor.open[0].node ); + } + + // note: root can be small, but not empty + else if (nkeys == 0) { + assert(cursor.open[cursor.level].size() == 0); + assert(depth == 1); + root = -1; + depth = 0; + pool.release(cursor.open[0].node); + } + return 0; + } + + if (cursor.open[cursor.level].size() > cursor.open[cursor.level].min_items()) + return 0; + + // borrow from siblings? + Nodeptr left; + Nodeptr right; + + // left? + if (cursor.pos[cursor.level-1] > 0) { + int left_loc = cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1] - 1).node; + left = pool.get_node( left_loc ); + + if (left.size() > left.min_items()) { + /* move cursor left, shift right */ + cursor.pos[cursor.level] = 0; + cursor.open[cursor.level] = left; + cursor.pos[cursor.level-1]--; + cursor.rotate_right(); + return 0; + } + + /* combine to left */ + right = cursor.open[cursor.level]; + } + else { + assert(cursor.pos[cursor.level-1] < cursor.open[cursor.level-1].size() - 1); + int right_loc = cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1] + 1 ).node; + right = pool.get_node( right_loc ); + + if (right.size() > right.min_items()) { + /* move cursor right, shift an item left */ + cursor.pos[cursor.level] = 1; + cursor.open[cursor.level] = right; + cursor.pos[cursor.level-1]++; + cursor.rotate_left(); + return 0; + } + + /* combine to left */ + left = cursor.open[cursor.level]; + cursor.pos[cursor.level-1]++; /* move cursor to (soon-to-be-empty) right side item */ + } + + // note: cursor now points to _right_ node. + + /* combine (towards left) + * (this makes it so our next delete will be in the index + * interior, which is less scary.) + */ + dbtout << "combining nodes " << left.get_id() << " and " << right.get_id() << endl; + + left.merge(right); + + // dirty left + right + cursor.dirty(); // right + if (!left.node->is_dirty()) { + pool.dirty_node(left.node); + cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1]-1 ).node = left.get_id(); + } + + pool.release(right.node); + + cursor.level--; // now point to the link to the obsolete (right-side) sib */ + } + + } + + int verify(Cursor& cursor, int node_loc, int level, int& count) { + int err = 0; + + Nodeptr node = pool.get_node( node_loc ); + cursor.open[level] = node; + + // identify max, min, and validate key range + int min = node.key(0); + int max = min; + int j = min; + for (int i=0; i max) + max = node.key(i); + + if (level < depth-1) { + // index + cursor.pos[level] = i; + err += verify( cursor, cursor.open[level].index_item(i).node, level+1, count ); + } else { + // leaf + count++; + } + } + + if (level) { + // verify that parent's keys are appropriate + if (min != cursor.open[level-1].index_item(cursor.pos[level-1]).key) { + dbtout << ":: key in index node " << cursor.open[level-1].get_id() + << " != min in child " << node_loc + << "(key is " << cursor.open[level-1].index_item(cursor.pos[level-1]).key + << ", min is " << min << ")" << endl; + err++; + } + if (cursor.pos[level-1] < cursor.open[level-1].size()-1) { + if (max > cursor.open[level-1].index_item(1+cursor.pos[level-1]).key) { + dbtout << ":: next key in index node " << cursor.open[level-1].get_id() + << " != max in child " << node_loc + << "(key is " << cursor.open[level-1].index_item(1+cursor.pos[level-1]).key + << ", max is " << max << ")" << endl; + err++; + } + } + } + + //return err; + + // print it + char s[1000]; + strcpy(s," "); + s[level+1] = 0; + if (1) { + if (root == node_loc) { + dbtout << s << "root " << node_loc << ": " + << node.size() << " / " << node.max_items() << " keys, " << min << "-" << max << endl; + } else if (level == depth-1) { + dbtout << s << "leaf " << node_loc << ": " + << node.size() << " / " << node.max_items() << " keys, " << min << "-" << max << endl; + } else { + dbtout << s << "indx " << node_loc << ": " + << node.size() << " / " << node.max_items() << " keys, " << min << "-" << max << endl; + } + + if (0) { + for (int i=0; i " << node.leaf_item(i).value << endl; + } + } + } + } + + return err; + } + + void verify() { + int count = 0; + Cursor cursor(this); + if (root == -1 && depth == 0) { + return; // empty! + } + int err = verify(cursor, root, 0, count); + assert(err == 0); + if (count != nkeys) { + dbtout << "** count " << count << " != nkeys " << nkeys << endl; + assert(nkeys == count); + } + } + +}; + + +#endif diff --git a/ceph/ebofs/nodes.h b/ceph/ebofs/nodes.h new file mode 100644 index 0000000000000..93d02c738767d --- /dev/null +++ b/ceph/ebofs/nodes.h @@ -0,0 +1,387 @@ +#ifndef __EBOFS_NODES_H +#define __EBOFS_NODES_H + +/** nodes, node regions **/ + +#include "types.h" +#include "BlockDevice.h" +#include "AlignedBufferPool.h" + + +/* node status on disk, in memory + * + * DISK MEMORY ON LISTS EVENT + * + * claim cycle: + * free free free - + * free dirty dirty mark_dirty() + * free dirty committing start commit + * inuse dirty committing (write happens) + * inuse clean - finish commit + * + * release cycle: + * inuse clean - - + * inuse free limbo release + * inuse free committing start_write + * free free committing (write happens) + * free free free finish_write + */ + + + +class Node { + public: + static const int STATUS_FREE = 0; + static const int STATUS_DIRTY = 1; + static const int STATUS_CLEAN = 2; + + static const int ITEM_LEN = EBOFS_NODE_BYTES - sizeof(int) - sizeof(int) - sizeof(int); + + static const int TYPE_INDEX = 1; + static const int TYPE_LEAF = 2; + + protected: + nodeid_t id; + bufferptr bptr; + int *nrecs; + int *status; + int *type; + + public: + Node(nodeid_t i) : id(i) { + bptr = new buffer(EBOFS_NODE_BYTES); + nrecs = (int*)(bptr.c_str()); + status = (int*)(bptr.c_str() + sizeof(*nrecs)); + type = (int*)(bptr.c_str() + sizeof(*status) + sizeof(*nrecs)); + clear(); + } + Node(nodeid_t i, bufferptr& b) : id(i), bptr(b) { + nrecs = (int*)(bptr.c_str()); + status = (int*)(bptr.c_str() + sizeof(*nrecs)); + type = (int*)(bptr.c_str() + sizeof(*status) + sizeof(*nrecs)); + } + + void clear() { + *nrecs = 0; + *status = STATUS_FREE; + *type = 0; + } + + nodeid_t get_id() const { return id; } + void set_id(nodeid_t n) { id = n; } + + bufferptr& get_buffer() { return bptr; } + + int size() { return *nrecs; } + void set_size(int s) { *nrecs = s; } + + char *item_ptr() { return bptr.c_str() + sizeof(*status) + sizeof(*nrecs) + sizeof(*type); } + + int& get_status() { return *status; } + bool is_dirty() const { return *status == STATUS_DIRTY; } + void set_status(int s) { *status = s; } + + int& get_type() { return *type; } + void set_type(int t) { *type = t; } + bool is_index() { return *type == TYPE_INDEX; } + bool is_leaf() { return *type == TYPE_LEAF; } +}; + + + +class NodeRegion { + public: + + static int make_nodeid(int region, int offset) { + return (region << 24) | offset; + } + static int nodeid_region(nodeid_t nid) { + return nid >> 24; + } + static int nodeid_offset(nodeid_t nid) { + return nid & (0xffffffffUL >> 24); + } + + protected: + int region_id; + int num_nodes; + + public: + Extent location; + + // free -> dirty -> committing -> clean + // dirty -> free + // or !dirty -> limbo -> free + set free; + set dirty; + set committing; + set limbo; + + public: + NodeRegion(int id, Extent& loc) : region_id(id) { + location = loc; + num_nodes = location.length / EBOFS_NODE_BLOCKS; + } + + int get_region_id() const { return region_id; } + + void init_all_free() { + for (unsigned i=0; i limbo : so they get written out as such + for (set::iterator i = free.begin(); + i != free.end(); + i++) + limbo.insert(*i); + free.clear(); + } + + int size() const { + return num_nodes; + //return (location.length / EBOFS_NODE_BLOCKS); // FIXME THIS IS WRONG + } + int num_free() const { + return free.size(); + } + + // new/open node + nodeid_t new_nodeid() { + assert(num_free()); + + nodeid_t nid = *(free.begin()); + free.erase(nid); + dirty.insert(nid); + + return nid; + } + + void release(nodeid_t nid) { + if (dirty.count(nid)) { + dirty.erase(nid); + free.insert(nid); + } + else { + if (committing.count(nid)) + committing.erase(nid); + limbo.insert(nid); + } + } +}; + + + +class NodePool { + protected: + int num_regions; + map node_regions; // regions + map node_map; // open node map + AlignedBufferPool bufferpool; // memory pool + + public: + NodePool() : num_regions(0), bufferpool(EBOFS_NODE_BYTES) {} + ~NodePool() { + // nodes + set left; + for (map::iterator i = node_map.begin(); + i != node_map.end(); + i++) + left.insert(i->second); + for (set::iterator i = left.begin(); + i != left.end(); + i++) + release( *i ); + assert(node_map.empty()); + + // regions + for (map::iterator i = node_regions.begin(); + i != node_regions.end(); + i++) + delete i->second; + node_regions.clear(); + } + + void add_region(NodeRegion *r) { + node_regions[r->get_region_id()] = r; + num_regions++; + } + + int get_num_regions() { + return num_regions; + } + Extent& get_region_loc(int r) { + return node_regions[r]->location; + } + + + int num_free() { + int f = 0; + for (map::iterator i = node_regions.begin(); + i != node_regions.end(); + i++) + f += i->second->num_free(); + return f; + } + + // *** + int read(BlockDevice& dev, struct ebofs_nodepool *np) { + for (int i=0; inum_regions; i++) { + dout(3) << " region " << i << " at " << np->region_loc[i] << endl; + NodeRegion *r = new NodeRegion(i, np->region_loc[i]); + add_region(r); + + for (block_t boff = 0; boff < r->location.length; boff += EBOFS_NODE_BLOCKS) { + nodeid_t nid = NodeRegion::make_nodeid(r->get_region_id(), boff); + + bufferptr bp = bufferpool.alloc(); + dev.read(r->location.start + (block_t)boff, EBOFS_NODE_BLOCKS, + bp); + + Node *n = new Node(nid, bp); + if (n->get_status() == Node::STATUS_FREE) { + dout(5) << " node " << nid << " free" << endl; + r->free.insert(nid); + delete n; + } else { + dout(5) << " node " << nid << " in use" << endl; + node_map[nid] = n; + } + } + } + return 0; + } + void init_all_free() { + for (map::iterator i = node_regions.begin(); + i != node_regions.end(); + i++) { + i->second->init_all_free(); + } + } + + void induce_full_flush() { + for (map::iterator i = node_regions.begin(); + i != node_regions.end(); + i++) { + i->second->induce_full_flush(); + } + } + + void flush(BlockDevice& dev) { + // flush dirty items, change them to limbo status + for (map::iterator i = node_regions.begin(); + i != node_regions.end(); + i++) { + NodeRegion *r = i->second; + + // dirty -> clean + r->committing = r->dirty; + r->dirty.clear(); + + for (set::iterator i = r->committing.begin(); + i != r->committing.end(); + i++) { + Node *n = get_node(*i); + assert(n); // it's dirty, we better have it + n->set_status(Node::STATUS_CLEAN); + block_t off = NodeRegion::nodeid_offset(*i); + dev.write(r->location.start + off, EBOFS_NODE_BLOCKS, n->get_buffer()); + } + r->committing.clear(); + + // limbo -> free + r->committing = r->limbo; + r->limbo.clear(); + + bufferptr freebuffer = bufferpool.alloc(); + Node freenode(1, freebuffer); + freenode.set_status(Node::STATUS_FREE); + for (set::iterator i = r->committing.begin(); + i != r->committing.end(); + i++) { + freenode.set_id( *i ); + block_t off = NodeRegion::nodeid_offset(*i); + dev.write(r->location.start + off, EBOFS_NODE_BLOCKS, freenode.get_buffer()); + } + + for (set::iterator i = r->committing.begin(); + i != r->committing.end(); + i++) + r->free.insert(*i); + r->committing.clear(); + } + } + + + + // *** nodes *** + // opened node + Node* get_node(nodeid_t nid) { + //dbtout << "pool.get " << nid << endl; + assert(node_map.count(nid)); + return node_map[nid]; + } + + // unopened node + /* + Node* open_node(nodeid_t nid) { + Node *n = node_regions[ NodeRegion::nodeid_region(nid) ]->open_node(nid); + dbtout << "pool.open_node " << n->get_id() << endl; + node_map[n->get_id()] = n; + return n; + } + */ + + // new node + nodeid_t new_nodeid() { + for (map::iterator i = node_regions.begin(); + i != node_regions.end(); + i++) { + if (i->second->num_free()) + return i->second->new_nodeid(); + } + assert(0); // full! + return -1; + } + Node* new_node() { + bufferptr bp = bufferpool.alloc(); + Node *n = new Node(new_nodeid(), bp); + n->clear(); + dbtout << "pool.new_node " << n->get_id() << endl; + assert(node_map.count(n->get_id()) == 0); + node_map[n->get_id()] = n; + return n; + } + + void release_nodeid(nodeid_t nid) { + dbtout << "pool.release_nodeid on " << nid << endl; + assert(node_map.count(nid) == 0); + node_regions[ NodeRegion::nodeid_region(nid) ]->release(nid); + return; + } + void release(Node *n) { + dbtout << "pool.release on " << n->get_id() << endl; + node_map.erase(n->get_id()); + release_nodeid(n->get_id()); + delete n; + } + + void dirty_node(Node *n) { + assert(!n->is_dirty()); + n->set_status(Node::STATUS_DIRTY); + nodeid_t newid = new_nodeid(); + dbtout << "pool.dirty_node on " << n->get_id() << " now " << newid << endl; + node_map.erase(n->get_id()); + release_nodeid(n->get_id()); + n->set_id(newid); + node_map[newid] = n; + } + + + +}; + +#endif diff --git a/ceph/ebofs/types.h b/ceph/ebofs/types.h new file mode 100644 index 0000000000000..c29429faad26d --- /dev/null +++ b/ceph/ebofs/types.h @@ -0,0 +1,138 @@ +#ifndef __EBOFS_TYPES_H +#define __EBOFS_TYPES_H + +#include +#include "include/buffer.h" + +#include +#include +#include +#include +using namespace std; +using namespace __gnu_cxx; + +#define dbtout cout + + + +/* +- this is to make some of the STL types work with 64 bit values, string hash keys, etc. +- added when i was using an old STL.. maybe try taking these out and see if things + compile now? +*/ + +namespace __gnu_cxx { + template<> struct hash { + size_t operator()(unsigned long long __x) const { + static hash H; + return H((__x >> 32) ^ (__x & 0xffffffff)); + } + }; + + template<> struct hash< std::string > + { + size_t operator()( const std::string& x ) const + { + static hash H; + return H(x.c_str()); + } + }; +} + + +// disk +typedef __uint64_t block_t; // disk location/sector/block + +static const int EBOFS_BLOCK_SIZE = 4096; + +class Extent { + public: + block_t start, length; + + Extent() : start(0), length(0) {} + Extent(block_t s, block_t l) : start(s), length(l) {} + + block_t last() const { return start + length - 1; } +}; + +inline ostream& operator<<(ostream& out, Extent& ex) +{ + return out << ex.start << "+" << ex.length; +} + + +// tree/set nodes +typedef int nodeid_t; + +static const int EBOFS_NODE_BLOCKS = 1; +static const int EBOFS_NODE_BYTES = EBOFS_NODE_BLOCKS * EBOFS_BLOCK_SIZE; +static const int EBOFS_MAX_NODE_REGIONS = 10; // pick a better value! + +struct ebofs_nodepool { + int num_regions; + Extent region_loc[EBOFS_MAX_NODE_REGIONS]; +}; + + +// objects +typedef __uint64_t object_t; +typedef __uint64_t coll_t; + +struct ebofs_onode { + Extent onode_loc; /* this is actually the block we live in */ + + object_t object_id; /* for kicks */ + off_t object_size; /* file size in bytes. should this be 64-bit? */ + unsigned object_blocks; + + int num_attr; // num attr in onode + int num_extents; /* number of extents used. if 0, data is in the onode */ +}; + +//static const int EBOFS_MAX_DATA_IN_ONODE = (EBOFS_BLOCK_SIZE - sizeof(struct ebofs_onode)); +//static const int EBOFS_MAX_EXTENTS_IN_ONODE = (EBOFS_MAX_DATA_IN_ONODE / sizeof(Extent)); + + +// table +struct ebofs_table { + nodeid_t root; /* root node of btree */ + int num_keys; + int depth; +}; + + +// super +typedef __uint64_t version_t; + +static const unsigned EBOFS_MAGIC = 0x000EB0F5; +static const unsigned EBOFS_ROOT_INODE = 1; +static const unsigned EBOFS_ROOT_BLOCK = 0; + +static const int EBOFS_NUM_FREE_BUCKETS = 16; /* see alloc.h for bucket constraints */ + + +struct ebofs_super { + unsigned s_magic; + + unsigned version; // version of this superblock. + + unsigned num_blocks; /* # blocks in filesystem */ + unsigned free_blocks; /* unused blocks */ + //unsigned num_btree_blocks; /* blocks devoted to btrees (sum of chunks) */ + unsigned num_objects; + + /* stupid stuff */ + unsigned num_fragmented; + + struct ebofs_table object_tab; // object directory + //struct ebofs_table cdir; // collection directory + struct ebofs_table free_tab[EBOFS_NUM_FREE_BUCKETS]; + + struct ebofs_nodepool table_nodepool; +}; + + + + + +#endif -- 2.39.5