debug_filer: 0,
debug_client: 0,
debug_osd: 0,
+ debug_ebofs: 1,
debug_bdev: 1, // block device
// --- client ---
osd_fakestore_syncthreads: 4,
- ebofs_bc_size: 100, // measured in 4k blocks
+ ebofs_bc_size: (50 *256), // measured in 4k blocks, or *256 for MB
// --- fakeclient (mds regression testing) ---
g_conf.debug_client = atoi(args[++i]);
else if (strcmp(args[i], "--debug_osd") == 0)
g_conf.debug_osd = atoi(args[++i]);
+ else if (strcmp(args[i], "--debug_ebofs") == 0)
+ g_conf.debug_ebofs = atoi(args[++i]);
else if (strcmp(args[i], "--debug_bdev") == 0)
g_conf.debug_bdev = atoi(args[++i]);
int debug_filer;
int debug_client;
int debug_osd;
+ int debug_ebofs;
int debug_bdev;
// client
--- /dev/null
+
+intro
+
+osd cluster map
+ requirements
+ desirable properties
+ (c)rush
+
+failure detection
+ distributed ping or heartbeat
+ central filter, notifier
+
+design
+ placement seed, class/superset, groups
+
+normal operation
+ reads
+ writes
+
+recovery
+ triggers: failed disk, or total cluster reorganization
+
+ notify
+ peering
+ pull
+ push
+ clean
+
+writes during recovery
+
+graceful data loss + recovery?
+
+
+
+
+
+
--- /dev/null
+
+// Numeric lock-state codes.  The trailing comment on each line sketches the
+// state under the "auth" and "replica" column headings given on the
+// "stable states" line below.
+// NOTE(review): the column letters (R, W, RC, WB) are not defined in this
+// file -- presumably read / write / read-cache / write-buffer; confirm.
+// NOTE(review): several distinct names share one numeric value:
+//   9  (GSYNCMW, GSYNCMW2)        10 (GRDONLYM, GRDONLYM2)
+//   11 (GRDONLYW, GRDONLYW2)      16 (GMIXEDW, GMIXEDW2, GMIXEDS, GMIXEDS2)
+//   18 (GWRONLYR, GWRONLYR2)      19 (GWRONLYM, GWRONLYM2)
+//   20 (GWRONLYS, GWRONLYS2)
+// Most collisions are an X / X2 pair, but GMIXEDR / GMIXEDR2 use different
+// values (14 / 15) and GMIXEDW / GMIXEDS both use 16.  Confirm which of
+// these are intentional before any code switches on these values.
+// stable states // ------auth----- -----replica-----
+#define LOCK_SYNC 0 // R . / . . . WB same ... for stat()
+#define LOCK_LOCK 1 // R W / RC . . . . . / RC . . . ... for truncate(), fsync()
+#define LOCK_RDONLY 2 // R . / RC R . . same
+#define LOCK_MIXED 3 // . . / . R W . same
+#define LOCK_WRONLY 4 // . . / . . W WB same
+
+// transition states
+// (values below are not in ascending order, and 6 is unused)
+#define LOCK_GSYNCR 8 // R . / RC . . . same
+#define LOCK_GSYNCMW 9 // . . / RC . . WB same
+#define LOCK_GSYNCMW2 9 // . . / RC . . WB same
+
+#define LOCK_GLOCKSR 5 // R . / RC . . . . . / RC . . .
+#define LOCK_GLOCKMW 7 // . . / RC . . . same
+
+#define LOCK_GRDONLYM 10 // . . / . R . . same
+#define LOCK_GRDONLYM2 10 // --- . . / . R . .
+#define LOCK_GRDONLYW 11 // . . / . . . . same
+#define LOCK_GRDONLYW2 11 // --- . . / . . . .
+#define LOCK_GRDONLYS 12 // R . / RC . . . same
+#define LOCK_GRDONLYL 13 // R . / RC . . . ---
+
+// NOTE(review): GMIXEDW* and GMIXEDS* all map to 16 in this group.
+#define LOCK_GMIXEDR 14 // R . / . R . . . . / . R . .
+#define LOCK_GMIXEDR2 15 // --- . . / . R . .
+#define LOCK_GMIXEDW 16 // . . / . . W . same
+#define LOCK_GMIXEDW2 16 // --- . . / . . W .
+#define LOCK_GMIXEDS 16 // R . / . . . . . . / . . . .
+#define LOCK_GMIXEDS2 16 // --- . . / . . . .
+#define LOCK_GMIXEDL 17 // R . / . . . . ---
+
+#define LOCK_GWRONLYR 18 // R . / . . . . same
+#define LOCK_GWRONLYR2 18 // --- . . / . . . .
+#define LOCK_GWRONLYM 19 // . . / . . . . same
+#define LOCK_GWRONLYM2 19 // --- . . / . . . .
+#define LOCK_GWRONLYS 20 // R . / . . . WB same
+#define LOCK_GWRONLYS2 20 // --- . . / . . . .
+#define LOCK_GWRONLYL 21
+
#undef dout
-#define dout(x) if (x <= g_conf.debug) cout << "ebofs.allocator."
+#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.allocator."
void Allocator::dump_freelist()
if (1) {
interval_set<block_t> free; // validate too
+ block_t n = 0;
for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) {
Table<block_t,block_t> *tab;
if (b < EBOFS_NUM_FREE_BUCKETS) {
dout(30) << "dump limbo" << endl;
tab = fs->limbo_tab;
}
-
+
if (tab->get_num_keys() > 0) {
Table<block_t,block_t>::Cursor cursor(tab);
tab->find(0, cursor);
while (1) {
dout(30) << "dump ex " << cursor.current().key << "~" << cursor.current().value << endl;
+ n += cursor.current().value;
assert(!free.contains( cursor.current().key, cursor.current().value ));
free.insert( cursor.current().key, cursor.current().value );
if (cursor.move_right() <= 0) break;
}
}
+ assert(n == fs->free_blocks);
dout(31) << "dump combined freelist is " << free << endl;
}
}
int Allocator::release(Extent& ex)
{
dout(10) << "release " << ex << " (into limbo)" << endl;
- fs->limbo_tab->insert(ex.start, ex.length);
+ limbo.insert(ex.start, ex.length);
fs->limbo_blocks += ex.length;
return 0;
}
+// Move every extent held in the in-memory `limbo` set into fs->limbo_tab.
+//
+// Each extent's length is also added to fs->free_blocks.  release_limbo()
+// subtracts the same length again just before _release() re-adds it
+// (presumably so the total stays equal to what dump_freelist() sums over the
+// free and limbo tables -- confirm).  Afterwards the in-memory set is
+// emptied, fs->limbo_blocks is reset to 0 and the freelist is dumped.
+//
+// Always returns 0.
+int Allocator::commit_limbo()
+{
+  dout(20) << "commit_limbo" << endl;
+
+  typedef map<block_t,block_t>::iterator extent_iter;
+  for (extent_iter ex = limbo.m.begin(); ex != limbo.m.end(); ++ex) {
+    const block_t start = ex->first;
+    const block_t length = ex->second;
+    fs->limbo_tab->insert(start, length);
+    fs->free_blocks += length;
+  }
+
+  limbo.clear();
+  fs->limbo_blocks = 0;
+
+  dump_freelist();
+  return 0;
+}
+
int Allocator::release_limbo()
{
+ dump_freelist();
if (fs->limbo_tab->get_num_keys() > 0) {
Table<block_t,block_t>::Cursor cursor(fs->limbo_tab);
fs->limbo_tab->find(0, cursor);
while (1) {
Extent ex(cursor.current().key, cursor.current().value);
dout(20) << "release_limbo ex " << ex << endl;
+
+ fs->free_blocks -= ex.length;
_release(ex);
+
if (cursor.move_right() <= 0) break;
}
}
fs->limbo_tab->clear();
- fs->limbo_blocks = 0;
+ dump_freelist();
return 0;
}
-int Allocator::_release(Extent& ex)
+int Allocator::_release(Extent& orig)
{
- Extent newex = ex;
-
- dout(15) << "_release " << ex << endl;
-
- fs->free_blocks += ex.length;
+ dout(15) << "_release " << orig << endl;
+ fs->free_blocks += orig.length;
+ Extent newex = orig;
+
// one after us?
for (int b=0; b<EBOFS_NUM_FREE_BUCKETS; b++) {
Table<block_t,block_t>::Cursor cursor(fs->free_tab[b]);
}
// ok, insert newex
- int b = pick_bucket(ex.length);
- fs->free_tab[b]->insert(ex.start, ex.length);
+ int b = pick_bucket(newex.length);
+ fs->free_tab[b]->insert(newex.start, newex.length);
return 0;
}
protected:
Ebofs *fs;
- //interval_set<block_t> limbo;
+ interval_set<block_t> limbo;
static int pick_bucket(block_t num) {
int b = 0;
int allocate(Extent& ex, block_t num, block_t near=0);
int release(Extent& ex);
- int release_limbo();
+
+ int commit_limbo(); // limbo -> fs->limbo_tab
+ int release_limbo(); // fs->limbo_tab -> free_tabs
+
+
};
#endif
#undef dout
-#define dout(x) if (x <= g_conf.debug) cout << "ebofs.bh."
-
+#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.bh."
void BufferHead::finish_partials()
partial_write.clear();
}
+// Abandon all queued partial writes on this buffer head.
+//
+// For every entry in partial_write, the unflushed counter of that entry's
+// epoch is decremented through oc->bc->dec_unflushed().  The map is then
+// emptied, as finish_partials() does, so that is_partial_writes() reports
+// false afterwards and a repeated cancel/finish cannot decrement the same
+// epoch counters a second time.
+void BufferHead::cancel_partials()
+{
+  dout(10) << "cancel_partials on " << *this << endl;
+  for (map<block_t, PartialWrite>::iterator p = partial_write.begin();
+       p != partial_write.end();
+       ++p) {
+    oc->bc->dec_unflushed( p->second.epoch );
+  }
+  partial_write.clear();
+}
+
void BufferHead::queue_partial_write(block_t b)
{
if (partial_write.count(b)) {
#undef dout
-#define dout(x) if (x <= g_conf.debug) cout << "ebofs.oc."
+#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.oc."
void ObjectCache::rx_finish(ioh_t ioh, block_t start, block_t length)
it++) {
BufferHead *bh = it->second;
+ // cancel any pending/queued io, if possible.
if (bh->is_tx())
bc->bh_cancel_write(bh);
if (bh->is_rx())
bc->bh_cancel_read(bh);
+ if (bh->is_partial_writes())
+ bh->cancel_partials();
for (map<block_t,list<Context*> >::iterator p = bh->waitfor_read.begin();
p != bh->waitfor_read.end();
}
//finish_contexts(bh->waitfor_flush, -1);
+ bc->remove_bh(bh);
delete bh;
}
data.clear();
/************** BufferCache ***************/
#undef dout
-#define dout(x) if (x <= g_conf.debug) cout << "ebofs.bc."
+#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.bc."
bool is_partial_writes() { return !partial_write.empty(); }
void finish_partials();
+ void cancel_partials();
void queue_partial_write(block_t b);
}
void inc_unflushed(version_t epoch) {
epoch_unflushed[epoch]++;
+ cout << "inc_unflushed " << epoch << " now " << epoch_unflushed[epoch] << endl;
}
void dec_unflushed(version_t epoch) {
epoch_unflushed[epoch]--;
+ cout << "dec_unflushed " << epoch << " now " << epoch_unflushed[epoch] << endl;
if (stat_waiter &&
epoch_unflushed[epoch] == 0)
stat_cond.Signal();
--- /dev/null
+#ifndef __EBOFS_CNODE_H
+#define __EBOFS_CNODE_H
+
+#include "Onode.h"
+
+/*
+ * collection node
+ *
+ * holds attribute metadata for collections.
+ * collection membership is stored in b+tree tables, independent of the cnode.
+ */
+
+class Cnode : public LRUObject
+{
+ private:
+  int ref;      // reference count; the node is LRU-pinned while it is nonzero
+  bool dirty;   // set by mark_dirty(), cleared by mark_clean()
+
+ public:
+  coll_t coll_id;     // collection id, fixed at construction
+  Extent cnode_loc;   // extent for this cnode (presumably its on-disk
+                      // location -- confirm); length starts out as 0
+
+  map<string,AttrVal> attr;   // collection attributes, keyed by name
+
+ public:
+  // Create an unreferenced, clean cnode for collection `cid`.
+  Cnode(coll_t cid) : ref(0), dirty(false), coll_id(cid) {
+    cnode_loc.length = 0;
+  }
+  ~Cnode() {
+  }
+
+  // Read-only accessors; const so they are usable through a const Cnode.
+  block_t get_cnode_id() const { return cnode_loc.start; }
+  int get_cnode_len() const { return cnode_loc.length; }
+
+  // Take a reference.  The first reference pins the node in the LRU.
+  void get() {
+    if (ref == 0) lru_pin();
+    ++ref;
+  }
+  // Drop a reference.  Dropping the last one unpins the node.
+  // Callers must pair every put() with an earlier get(); this is not checked.
+  void put() {
+    --ref;
+    if (ref == 0) lru_unpin();
+  }
+
+  // A dirty node holds one reference on itself, so it stays pinned until
+  // mark_clean() releases that reference.  Both calls are idempotent.
+  void mark_dirty() {
+    if (!dirty) {
+      dirty = true;
+      get();
+    }
+  }
+  void mark_clean() {
+    if (dirty) {
+      dirty = false;
+      put();
+    }
+  }
+  bool is_dirty() const { return dirty; }
+
+
+  // Sum over all attributes of (name length + 1) + (value len + sizeof(int)).
+  // Presumably the encoded size: a terminated name followed by an
+  // int-length-prefixed value -- confirm against the encoder.
+  int get_attr_bytes() const {
+    int s = 0;
+    for (map<string, AttrVal >::const_iterator i = attr.begin();
+         i != attr.end();
+         ++i) {
+      s += i->first.length() + 1;
+      s += i->second.len + sizeof(int);
+    }
+    return s;
+  }
+
+  //
+  //???void clear();
+
+
+};
+
+#endif
// *******************
#undef dout
-#define dout(x) if (x <= g_conf.debug) cout << "ebofs."
+#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs."
int Ebofs::mount()
{
left.length = num_blocks - left.start;
dout(1) << "mkfs: free data blocks at " << left << endl;
allocator.release( left );
- allocator.release_limbo();
+ allocator.commit_limbo(); // -> limbo_tab
+ allocator.release_limbo(); // -> free_tab
// write nodes, super, 2x
dout(1) << "mkfs: flushing nodepool and superblocks (2x)" << endl;
}
super_epoch++;
- dout(10) << "commit_thread commit start, new epoch " << super_epoch
- << ". " << get_free_blocks() << " free in " << get_free_extents() << ", "
- << get_limbo_blocks() << " limbo in " << get_limbo_extents() << endl;
+ dout(10) << "commit_thread commit start, new epoch " << super_epoch << endl;
+ dout(10) << "commit_thread data: "
+ << get_free_blocks() << " free in " << get_free_extents()
+ << ", " << get_limbo_blocks() << " limbo in " << get_limbo_extents() << endl;
+ dout(10) << "commit_thread nodes: "
+ << nodepool.num_free() << " free, "
+ << nodepool.num_limbo() << " limbo, "
+ << nodepool.num_total() << " total." << endl;
// (async) write onodes+condes (do this first; it currently involves inode reallocation)
commit_inodes_start();
+ allocator.commit_limbo(); // limbo -> limbo_tab
+
// (async) write btree nodes
nodepool.commit_start( dev, super_epoch );
write_super(super_epoch, superbp);
ebofs_lock.Lock();
- // free limbo space now (since we're done allocating things, AND we've flushed all previous epoch data)
- allocator.release_limbo();
+ // free limbo space now
+ // (since we're done allocating things,
+ // AND we've flushed all previous epoch data)
+ allocator.release_limbo(); // limbo_tab -> free_tabs
+
+ // do we need more node space?
+ if (nodepool.num_free() < nodepool.num_total() / 3) {
+ dout(1) << "commit_thread running low on node space, allocating more." << endl;
+ assert(0);
+ //alloc_more_node_space();
+ }
// kick waiters
dout(10) << "commit_thread kicking commit+sync waiters" << endl;
Onode *on = get_onode(oid);
if (!on) {
ebofs_lock.Unlock();
- return -1; // object dne?
+ return -ENOENT; // object dne?
}
// read data into bl. block as necessary.
ebofs_lock.Lock();
dout(7) << "write " << hex << oid << dec << " len " << len << " off " << off << endl;
assert(len > 0);
+
+ // out of space?
+ if (len / EBOFS_BLOCK_SIZE + 10 >= free_blocks) {
+ dout(1) << "write failing, only " << free_blocks << " blocks free" << endl;
+ if (onsafe) delete onsafe;
+ ebofs_lock.Unlock();
+ return -ENOSPC;
+ }
- // get inode
+ // get|create inode
Onode *on = get_onode(oid);
- if (!on)
- on = new_onode(oid); // new inode!
+ if (!on) on = new_onode(oid); // new inode!
// apply write to buffer cache
apply_write(on, len, off, bl);
Onode *on = get_onode(oid);
if (!on) {
ebofs_lock.Unlock();
- return -1;
+ return -ENOENT;
}
// ok remove it!
Onode *on = get_onode(oid);
if (!on) {
ebofs_lock.Unlock();
- return -1;
+ return -ENOENT;
}
// ??
int Ebofs::setattr(object_t oid, const char *name, void *value, size_t size)
{
Onode *on = get_onode(oid);
- if (!on) return -1;
+ if (!on) return -ENOENT;
string n(name);
AttrVal val((char*)value, size);
int Ebofs::getattr(object_t oid, const char *name, void *value, size_t size)
{
Onode *on = get_onode(oid);
- if (!on) return -1;
+ if (!on) return -ENOENT;
string n(name);
if (on->attr.count(n) == 0) return -1;
int Ebofs::rmattr(object_t oid, const char *name)
{
Onode *on = get_onode(oid);
- if (!on) return -1;
+ if (!on) return -ENOENT;
string n(name);
on->attr.erase(n);
int Ebofs::listattr(object_t oid, vector<string>& attrs)
{
Onode *on = get_onode(oid);
- if (!on) return -1;
+ if (!on) return -ENOENT;
attrs.clear();
for (map<string,AttrVal>::iterator i = on->attr.begin();
int Ebofs::destroy_collection(coll_t cid)
{
- if (!collection_exists(cid)) return -1;
+ if (!collection_exists(cid)) return -ENOENT;
Cnode *cn = new_cnode(cid);
// hose mappings
int Ebofs::collection_add(coll_t cid, object_t oid)
{
- if (!collection_exists(cid)) return -1;
+ if (!collection_exists(cid)) return -ENOENT;
oc_tab->insert(idpair_t(oid,cid), true);
co_tab->insert(idpair_t(cid,oid), true);
return 0;
int Ebofs::collection_remove(coll_t cid, object_t oid)
{
- if (!collection_exists(cid)) return -1;
+ if (!collection_exists(cid)) return -ENOENT;
oc_tab->remove(idpair_t(oid,cid));
co_tab->remove(idpair_t(cid,oid));
return 0;
int Ebofs::collection_list(coll_t cid, list<object_t>& ls)
{
- if (!collection_exists(cid)) return -1;
+ if (!collection_exists(cid)) return -ENOENT;
Table<idpair_t, bool>::Cursor cursor(co_tab);
int Ebofs::collection_setattr(coll_t cid, const char *name, void *value, size_t size)
{
Cnode *cn = get_cnode(cid);
- if (!cn) return -1;
+ if (!cn) return -ENOENT;
string n(name);
AttrVal val((char*)value, size);
int Ebofs::collection_getattr(coll_t cid, const char *name, void *value, size_t size)
{
Cnode *cn = get_cnode(cid);
- if (!cn) return -1;
+ if (!cn) return -ENOENT;
string n(name);
if (cn->attr.count(n) == 0) return -1;
int Ebofs::collection_rmattr(coll_t cid, const char *name)
{
Cnode *cn = get_cnode(cid);
- if (!cn) return -1;
+ if (!cn) return -ENOENT;
string n(name);
cn->attr.erase(n);
int Ebofs::collection_listattr(coll_t cid, vector<string>& attrs)
{
Cnode *cn = get_cnode(cid);
- if (!cn) return -1;
+ if (!cn) return -ENOENT;
attrs.clear();
for (map<string,AttrVal>::iterator i = cn->attr.begin();
mounted(false), unmounting(false), readonly(false),
super_epoch(0), commit_thread_started(false), mid_commit(false),
commit_thread(this),
- free_blocks(0), allocator(this),
+ free_blocks(0), limbo_blocks(0),
+ allocator(this),
bufferpool(EBOFS_BLOCK_SIZE),
nodepool(ebofs_lock),
object_tab(0), limbo_tab(0), collection_tab(0), oc_tab(0), co_tab(0),
/** table **/
-#define dbtout dout(10)
+#define dbtout if (10 < g_conf.debug_ebofs) cout
template<class K, class V>
}
int insert(K key, V value) {
- dbtout << "insert " << key << " -> " << value << endl;
+ //dbtout << "insert " << key << " -> " << value << endl;
if (almost_full()) return -1;
// empty?
// test small writes
if (1) {
- char crap[10000];
- memset(crap, 0, 10000);
+ char crap[1024*1024];
+ memset(crap, 0, 1024*1024);
bufferlist bl;
- bl.append(crap, 10000);
+ bl.append(crap, 1024*1024);
// reandom write
- if (0) {
+ if (1) {
srand(0);
for (int i=0; i<10000; i++) {
off_t off = rand() % 1000000;
//fs.sync();
//fs.trim_buffer_cache();
}
+ fs.remove(10);
+ for (int i=0; i<100; i++) {
+ off_t off = rand() % 1000000;
+ size_t len = 1+rand() % 10000;
+ cout << endl << i << " writing bit at " << off << " len " << len << endl;
+ fs.write(10, len, off, bl, (Context*)0);
+ //fs.sync();
+ //fs.trim_buffer_cache();
+ }
}
- if (1) {
+ if (0) {
// sequential write
srand(0);
off_t off = 0;
for (int i=0; i<10000; i++) {
- size_t len = 1+rand() % 10000;
+ size_t len = 1024*1024;//1+rand() % 10000;
cout << endl << i << " writing bit at " << off << " len " << len << endl;
fs.write(10, len, off, bl, (Context*)0);
off += len;
*/
+// Debug output for the node pool code: streams to cout when message level x
+// is at or below g_conf.debug_ebofs.  Uses the same "<=" test as the ebofs
+// dout() macros, so a message keeps the level it had when it was emitted
+// through dout().  The argument is parenthesized so an expression can be
+// passed safely.
+#undef debofs
+#define debofs(x) if ((x) <= g_conf.debug_ebofs) cout
+
class Node {
public:
protected:
// on-disk block states
+ int num_nodes;
set<nodeid_t> free;
set<nodeid_t> dirty;
set<nodeid_t> tx;
public:
NodePool(Mutex &el) :
bufferpool(EBOFS_NODE_BYTES),
+ num_nodes(0),
ebofs_lock(el),
flushing(0) {}
~NodePool() {
release_all();
}
- int num_free() {
- return free.size();
- }
+ int num_free() { return free.size(); }
+ int num_dirty() { return dirty.size(); }
+ int num_limbo() { return limbo.size(); }
+ int num_tx() { return tx.size(); }
+ int num_clean() { return clean.size(); }
+ int num_total() { return num_nodes; }
// the caller had better adjust usemap locations...
void add_region(Extent ex) {
for (unsigned o = 0; o < ex.length; o++) {
free.insert( make_nodeid(region, o) );
}
+ num_nodes += ex.length;
}
int init(struct ebofs_nodepool *np) {
// regions
for (int i=0; i<np->num_regions; i++) {
- dout(3) << "init region " << i << " at " << np->region_loc[i] << endl;
+ debofs(3) << "init region " << i << " at " << np->region_loc[i] << endl;
region_loc.push_back( np->region_loc[i] );
+ num_nodes += np->region_loc[i].length;
}
// usemap
usemap_even = np->node_usemap_even;
usemap_odd = np->node_usemap_odd;
- dout(3) << "init even map at " << usemap_even << endl;
- dout(3) << "init odd map at " << usemap_odd << endl;
+ debofs(3) << "init even map at " << usemap_even << endl;
+ debofs(3) << "init odd map at " << usemap_odd << endl;
return 0;
}
to read. so it only really works when called from mount()!
*/
for (unsigned r=0; r<region_loc.size(); r++) {
- dout(3) << "ebofs.nodepool.read region " << r << " at " << region_loc[r] << endl;
+ debofs(3) << "ebofs.nodepool.read region " << r << " at " << region_loc[r] << endl;
for (block_t boff = 0; boff < region_loc[r].length; boff++) {
nodeid_t nid = make_nodeid(r, boff);
if (!clean.count(nid)) continue;
- dout(20) << "ebofs.nodepool.read node " << nid << endl;
+ debofs(20) << "ebofs.nodepool.read node " << nid << endl;
bufferptr bp = bufferpool.alloc(EBOFS_NODE_BYTES);
dev.read(region_loc[r].start + (block_t)boff, EBOFS_NODE_BLOCKS,
Node *n = new Node(nid, bp, Node::STATE_CLEAN);
node_map[nid] = n;
- dout(10) << "ebofs.nodepool.read node " << n << " at " << (void*)n << endl;
+ debofs(10) << "ebofs.nodepool.read node " << n << " at " << (void*)n << endl;
}
}
return 0;
// new node
Node* new_node(int type) {
nodeid_t nid = alloc_id();
- dout(15) << "ebofs.nodepool.new_node " << nid << endl;
+ debofs(15) << "ebofs.nodepool.new_node " << nid << endl;
// alloc node
bufferptr bp = bufferpool.alloc(EBOFS_NODE_BYTES);
void release(Node *n) {
const nodeid_t nid = n->get_id();
- dout(15) << "ebofs.nodepool.release on " << nid << endl;
+ debofs(15) << "ebofs.nodepool.release on " << nid << endl;
node_map.erase(nid);
if (n->is_dirty()) {
void release_all() {
while (!node_map.empty()) {
map<nodeid_t,Node*>::iterator i = node_map.begin();
- dout(2) << "ebofs.nodepool.release_all leftover " << i->first << " " << i->second << endl;
+ debofs(2) << "ebofs.nodepool.release_all leftover " << i->first << " " << i->second << endl;
release( i->second );
}
assert(node_map.empty());
// get new node id?
nodeid_t oldid = n->get_id();
nodeid_t newid = alloc_id();
- dout(2) << "ebofs.nodepool.dirty_node on " << oldid << " now " << newid << endl;
+ debofs(2) << "ebofs.nodepool.dirty_node on " << oldid << " now " << newid << endl;
// release old block
if (n->is_clean()) {