From d44b0f3b5d16acbf9eaec4c01c7d2b2193b4c82a Mon Sep 17 00:00:00 2001 From: sage Date: Sun, 18 Dec 2005 08:27:55 +0000 Subject: [PATCH] *** empty log message *** git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@534 29311d96-e01e-0410-9327-a35deaab8ce9 --- ceph/TODO | 9 +- ceph/config.cc | 15 ++-- ceph/config.h | 4 +- ceph/ebofs/Allocator.cc | 6 +- ceph/ebofs/BlockDevice.cc | 2 +- ceph/ebofs/BufferCache.cc | 129 ++++++++++++++++----------- ceph/ebofs/BufferCache.h | 113 +++++++++++++----------- ceph/ebofs/Cnode.h | 8 ++ ceph/ebofs/Ebofs.cc | 171 ++++++++++++++++++++++++++---------- ceph/ebofs/Ebofs.h | 8 +- ceph/ebofs/Onode.h | 33 +++++-- ceph/ebofs/Table.h | 2 +- ceph/ebofs/mkfs.ebofs.cc | 7 +- ceph/ebofs/types.h | 1 + ceph/include/buffer.h | 26 ++++-- ceph/include/interval_set.h | 56 +----------- ceph/include/lru.h | 6 +- ceph/osd/OSD.cc | 2 +- 18 files changed, 349 insertions(+), 249 deletions(-) diff --git a/ceph/TODO b/ceph/TODO index b54fa33c83e70..d1d704fcb5a37 100644 --- a/ceph/TODO +++ b/ceph/TODO @@ -12,9 +12,9 @@ client ebofs -- non-blocking get_onode and get_cnode +/- non-blocking get_onode and get_cnode - Ebofs.readonly - +- trim onode cache? @@ -44,7 +44,8 @@ cluster issues - +fakestore +- sync notification is totally broken, use '--fake_osd_sync 1' for now osd foo @@ -56,7 +57,7 @@ osd foo - thread (and unordered!) ops - rep ops tricky -- 'dirty' log on primary +- 'dirty' log on primary? - fast recovery from degraded mode diff --git a/ceph/config.cc b/ceph/config.cc index e76259493b3ac..6af8a68ba1924 100644 --- a/ceph/config.cc +++ b/ceph/config.cc @@ -116,13 +116,15 @@ md_config_t g_conf = { osd_mkfs: false, osd_fakestore_syncthreads: 4, - osd_ebofs: 0, // --- ebofs --- + ebofs: 0, ebofs_commit_interval: 2, // seconds. 0 = no timeout (for debugging/tracing) - ebofs_bc_size: (50 *256), // measured in 4k blocks, or *256 for MB - ebofs_bc_max_dirty: (10 *256), // before write() will wait for data to flush - + ebofs_oc_size: 100, + ebofs_cc_size: 100, + ebofs_bc_size: (5 *256), // measured in 4k blocks, or *256 for MB + ebofs_bc_max_dirty: (1 *256), // before write() will wait for data to flush + // --- block device --- bdev_el_fw_max_ms: 1000, // restart elevator at least once every 1000 ms bdev_el_bw_max_ms: 300, // restart elevator at least once every 1000 ms @@ -282,8 +284,9 @@ void parse_config_options(vector& args) g_conf.client_bcache_ttl = atoi(args[++i]); - else if (strcmp(args[i], "--osd_ebofs") == 0) - g_conf.osd_ebofs = atoi(args[++i]); + else if (strcmp(args[i], "--ebofs") == 0) + g_conf.ebofs = atoi(args[++i]); + else if (strcmp(args[i], "--osd_mkfs") == 0) g_conf.osd_mkfs = atoi(args[++i]); else if (strcmp(args[i], "--osd_pg_bits") == 0) diff --git a/ceph/config.h b/ceph/config.h index ab23a21dfd63d..b3ce92e26009a 100644 --- a/ceph/config.h +++ b/ceph/config.h @@ -90,10 +90,12 @@ struct md_config_t { bool osd_mkfs; int osd_fakestore_syncthreads; // such crap - int osd_ebofs; // ebofs + int ebofs; int ebofs_commit_interval; + int ebofs_oc_size; + int ebofs_cc_size; off_t ebofs_bc_size; off_t ebofs_bc_max_dirty; diff --git a/ceph/ebofs/Allocator.cc b/ceph/ebofs/Allocator.cc index e3d31c68d2e7b..7a3657510d56f 100644 --- a/ceph/ebofs/Allocator.cc +++ b/ceph/ebofs/Allocator.cc @@ -16,16 +16,16 @@ void Allocator::dump_freelist() for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) { Table *tab; if (b < EBOFS_NUM_FREE_BUCKETS) { - dout(30) << "dump bucket " << b << endl; tab = fs->free_tab[b]; + dout(30) << "dump bucket " << b << " " << tab->get_num_keys() << endl; } else { - dout(30) << "dump limbo" << endl; tab = fs->limbo_tab; + dout(30) << "dump limbo " << tab->get_num_keys() << endl;; } if (tab->get_num_keys() > 0) { Table::Cursor cursor(tab); - tab->find(0, cursor); + assert(tab->find(0, cursor) >= 0); while (1) { dout(30) << "dump ex " << cursor.current().key << "~" << cursor.current().value << endl; n += cursor.current().value; diff --git a/ceph/ebofs/BlockDevice.cc b/ceph/ebofs/BlockDevice.cc index 9c579808a74fc..578d615583715 100644 --- a/ceph/ebofs/BlockDevice.cc +++ b/ceph/ebofs/BlockDevice.cc @@ -369,7 +369,7 @@ int BlockDevice::open() { assert(fd == 0); - fd = ::open(dev, O_CREAT|O_RDWR|O_SYNC|O_DIRECT); + fd = ::open(dev, O_CREAT|O_RDWR|O_SYNC|O_DIRECT, 0); if (fd < 0) { dout(1) << "open " << dev << " failed, r = " << fd << " " << strerror(errno) << endl; fd = 0; diff --git a/ceph/ebofs/BufferCache.cc b/ceph/ebofs/BufferCache.cc index 2227c988898c2..761f71757a6bd 100644 --- a/ceph/ebofs/BufferCache.cc +++ b/ceph/ebofs/BufferCache.cc @@ -27,7 +27,8 @@ void BufferHead::finish_partials() apply_partial( bl, p->second.partial ); oc->bc->dev.write( p->second.block, 1, bl, - new C_OC_PartialTxFinish( oc, p->second.epoch )); + new C_OC_PartialTxFinish( oc->bc, p->second.epoch )); + //oc->get(); // don't need OC for completion func! } partial_write.clear(); } @@ -69,12 +70,11 @@ void BufferHead::queue_partial_write(block_t b) #define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.oc." + void ObjectCache::rx_finish(ioh_t ioh, block_t start, block_t length) { list waiters; - bc->ebofs_lock.Lock(); - dout(10) << "rx_finish " << start << "~" << length << endl; for (map::iterator p = data.lower_bound(start); p != data.end(); @@ -127,8 +127,6 @@ void ObjectCache::rx_finish(ioh_t ioh, block_t start, block_t length) } finish_contexts(waiters); - - bc->ebofs_lock.Unlock(); } @@ -137,8 +135,6 @@ void ObjectCache::tx_finish(ioh_t ioh, block_t start, block_t length, { //list waiters; - bc->ebofs_lock.Lock(); - dout(10) << "tx_finish " << start << "~" << length << " v" << version << endl; for (map::iterator p = data.lower_bound(start); p != data.end(); @@ -175,25 +171,6 @@ void ObjectCache::tx_finish(ioh_t ioh, block_t start, block_t length, } //finish_contexts(waiters); - - // update unflushed counter - assert(bc->get_unflushed(epoch) > 0); - bc->dec_unflushed(epoch); - - bc->ebofs_lock.Unlock(); -} - -void ObjectCache::partial_tx_finish(version_t epoch) -{ - bc->ebofs_lock.Lock(); - - dout(10) << "partial_tx_finish in epoch " << epoch << endl; - - // update unflushed counter - assert(bc->get_unflushed(epoch) > 0); - bc->dec_unflushed(epoch); - - bc->ebofs_lock.Unlock(); } @@ -216,13 +193,13 @@ int ObjectCache::map_read(Onode *on, block_t left = len; if (p != data.begin() && - (p->first > cur || p == data.end())) { + (p == data.end() || p->first > cur)) { p--; // might overlap! if (p->first + p->second->length() <= cur) p++; // doesn't overlap. } - for (; left > 0; p++) { + while (left > 0) { // at end? if (p == data.end()) { // rest is a miss. @@ -235,7 +212,10 @@ int ObjectCache::map_read(Onode *on, bc->add_bh(n); missing[cur] = n; cur += exv[i].length; + left -= exv[i].length; } + assert(left == 0); + assert(cur == start+len); break; } @@ -256,17 +236,11 @@ int ObjectCache::map_read(Onode *on, } else assert(0); - block_t lenfromcur = e->length(); - if (e->start() < cur) - lenfromcur -= cur - e->start(); - - if (lenfromcur < left) { - cur += lenfromcur; - left -= lenfromcur; - continue; // more! - } else { - break; // done. - } + block_t lenfromcur = MIN(e->end() - cur, left); + cur += lenfromcur; + left -= lenfromcur; + p++; + continue; // more? } else if (p->first > cur) { // gap.. miss block_t next = p->first; @@ -288,6 +262,8 @@ int ObjectCache::map_read(Onode *on, assert(0); } + assert(left == 0); + assert(cur == start+len); return 0; } @@ -316,7 +292,7 @@ int ObjectCache::map_write(Onode *on, block_t left = len; if (p != data.begin() && - (p->first > cur || p == data.end())) { + (p == data.end() || p->first > cur)) { p--; // might overlap! if (p->first + p->second->length() <= cur) p++; // doesn't overlap. @@ -437,10 +413,7 @@ int ObjectCache::map_write(Onode *on, hits[cur] = bh; // keep going. - block_t lenfromcur = bh->length(); - if (bh->start() < cur) - lenfromcur -= cur - bh->start(); - + block_t lenfromcur = bh->end() - cur; cur += lenfromcur; left -= lenfromcur; p++; @@ -609,8 +582,10 @@ void BufferCache::bh_read(Onode *on, BufferHead *bh) dout(20) << "bh_read " << *bh << " from " << ex << endl; bh->rx_ioh = dev.read(ex.start, ex.length, bh->data, - new C_OC_RxFinish(on->oc, + new C_OC_RxFinish(ebofs_lock, on->oc, bh->start(), bh->length())); + on->oc->get(); + } bool BufferCache::bh_cancel_read(BufferHead *bh) @@ -619,6 +594,8 @@ bool BufferCache::bh_cancel_read(BufferHead *bh) dout(10) << "bh_cancel_read on " << *bh << endl; bh->rx_ioh = 0; mark_missing(bh); + int l = bh->oc->put(); + assert(l); return true; } return false; @@ -643,12 +620,12 @@ void BufferCache::bh_write(Onode *on, BufferHead *bh) //assert(bh->tx_ioh == 0); bh->tx_ioh = dev.write(ex.start, ex.length, bh->data, - new C_OC_TxFinish(on->oc, + new C_OC_TxFinish(ebofs_lock, on->oc, bh->start(), bh->length(), bh->get_version(), bh->epoch_modified)); - - epoch_unflushed[ bh->epoch_modified ]++; + on->oc->get(); + inc_unflushed( bh->epoch_modified ); } @@ -658,12 +635,66 @@ bool BufferCache::bh_cancel_write(BufferHead *bh) dout(10) << "bh_cancel_write on " << *bh << endl; bh->tx_ioh = 0; mark_dirty(bh); - epoch_unflushed[ bh->epoch_modified ]--; // assert.. this should be the same epoch! + dec_unflushed( bh->epoch_modified ); // assert.. this should be the same epoch! + int l = bh->oc->put(); + assert(l); return true; } return false; } +void BufferCache::tx_finish(ObjectCache *oc, + ioh_t ioh, block_t start, block_t length, + version_t version, version_t epoch) +{ + ebofs_lock.Lock(); + + // finish oc + if (oc->put() == 0) { + delete oc; + } else + oc->tx_finish(ioh, start, length, version, epoch); + + // update unflushed counter + assert(get_unflushed(epoch) > 0); + dec_unflushed(epoch); + + ebofs_lock.Unlock(); +} + +void BufferCache::rx_finish(ObjectCache *oc, + ioh_t ioh, block_t start, block_t length) +{ + ebofs_lock.Lock(); + + // oc + if (oc->put() == 0) { + delete oc; + } else + oc->rx_finish(ioh, start, length); + + // + ebofs_lock.Unlock(); +} + +void BufferCache::partial_tx_finish(version_t epoch) +{ + ebofs_lock.Lock(); + + /* don't need oc! + // oc? + if (oc->put() == 0) + delete oc; + */ + + // update unflushed counter + assert(get_unflushed(epoch) > 0); + dec_unflushed(epoch); + + ebofs_lock.Unlock(); +} + + void BufferCache::bh_queue_partial_write(Onode *on, BufferHead *bh) diff --git a/ceph/ebofs/BufferCache.h b/ceph/ebofs/BufferCache.h index d83644fd2eaf9..2f8059fac299f 100644 --- a/ceph/ebofs/BufferCache.h +++ b/ceph/ebofs/BufferCache.h @@ -82,7 +82,8 @@ class BufferHead : public LRUObject { int put() { assert(ref > 0); if (ref == 1) lru_unpin(); - return --ref; + --ref; + return ref; } block_t start() { return object_loc.start; } @@ -191,21 +192,6 @@ class BufferHead : public LRUObject { bool partial_is_complete(off_t size) { return have_partial_range( (off_t)(start()*EBOFS_BLOCK_SIZE), MIN( size, (off_t)(end()*EBOFS_BLOCK_SIZE) ) ); - /* - map::iterator i = partial.begin(); - if (i == partial.end()) return false; - if (i->first != (off_t)(object_loc.start * EBOFS_BLOCK_SIZE)) return false; - off_t pos = i->first + i->second.length(); - for (i++; i != partial.end(); i++) { - assert(pos <= i->first); - if (pos < i->first) return false; - assert(pos == i->first); - pos = i->first + i->second.length(); - } - off_t upto = MIN( size, (off_t)(end()*EBOFS_BLOCK_SIZE) ); - if (pos == upto) return true; - return false; - */ } void apply_partial() { apply_partial(data, partial); @@ -294,9 +280,26 @@ class ObjectCache { private: map data; + int ref; public: - ObjectCache(object_t o, BufferCache *b) : object_id(o), bc(b) {} + ObjectCache(object_t o, BufferCache *b) : object_id(o), bc(b), ref(0) {} + ~ObjectCache() { + assert(data.empty()); + assert(ref == 0); + } + + int get() { + ++ref; + //cout << "oc.get " << object_id << " " << ref << endl; + return ref; + } + int put() { + assert(ref > 0); + --ref; + //cout << "oc.put " << object_id << " " << ref << endl; + return ref; + } object_t get_object_id() { return object_id; } @@ -348,7 +351,6 @@ class ObjectCache { void rx_finish(ioh_t ioh, block_t start, block_t length); void tx_finish(ioh_t ioh, block_t start, block_t length, version_t v, version_t epoch); - void partial_tx_finish(version_t epoch); void dump() { for (map::iterator i = data.begin(); @@ -360,40 +362,6 @@ class ObjectCache { void tear_down(); }; -class C_OC_RxFinish : public BlockDevice::callback { - ObjectCache *oc; - block_t start, length; -public: - C_OC_RxFinish(ObjectCache *o, block_t s, block_t l) : - oc(o), start(s), length(l) {} - void finish(ioh_t ioh, int r) { - oc->rx_finish(ioh, start, length); - } -}; - -class C_OC_TxFinish : public BlockDevice::callback { - ObjectCache *oc; - block_t start, length; - version_t version; - version_t epoch; -public: - C_OC_TxFinish(ObjectCache *o, block_t s, block_t l, version_t v, version_t e) : - oc(o), start(s), length(l), version(v), epoch(e) {} - void finish(ioh_t ioh, int r) { - oc->tx_finish(ioh, start, length, version, epoch); - } -}; - -class C_OC_PartialTxFinish : public BlockDevice::callback { - ObjectCache *oc; - version_t epoch; -public: - C_OC_PartialTxFinish(ObjectCache *o, version_t e) : - oc(o), epoch(e) {} - void finish(ioh_t ioh, int r) { - oc->partial_tx_finish(epoch); - } -}; class BufferCache { @@ -554,6 +522,10 @@ class BufferCache { bool bh_cancel_read(BufferHead *bh); bool bh_cancel_write(BufferHead *bh); + void rx_finish(ObjectCache *oc, ioh_t ioh, block_t start, block_t len); + void tx_finish(ObjectCache *oc, ioh_t ioh, block_t start, block_t len, version_t v, version_t e); + void partial_tx_finish(version_t epoch); + friend class C_E_FlushPartial; // bh fun @@ -561,5 +533,42 @@ class BufferCache { }; +class C_OC_RxFinish : public BlockDevice::callback { + Mutex &lock; + ObjectCache *oc; + block_t start, length; +public: + C_OC_RxFinish(Mutex &m, ObjectCache *o, block_t s, block_t l) : + lock(m), oc(o), start(s), length(l) {} + void finish(ioh_t ioh, int r) { + oc->bc->rx_finish(oc, ioh, start, length); + } +}; + +class C_OC_TxFinish : public BlockDevice::callback { + Mutex &lock; + ObjectCache *oc; + block_t start, length; + version_t version; + version_t epoch; + public: + C_OC_TxFinish(Mutex &m, ObjectCache *o, block_t s, block_t l, version_t v, version_t e) : + lock(m), oc(o), start(s), length(l), version(v), epoch(e) {} + void finish(ioh_t ioh, int r) { + oc->bc->tx_finish(oc, ioh, start, length, version, epoch); + } +}; + +class C_OC_PartialTxFinish : public BlockDevice::callback { + BufferCache *bc; + version_t epoch; +public: + C_OC_PartialTxFinish(BufferCache *b, version_t e) : + bc(b), epoch(e) {} + void finish(ioh_t ioh, int r) { + bc->partial_tx_finish(epoch); + } +}; + #endif diff --git a/ceph/ebofs/Cnode.h b/ceph/ebofs/Cnode.h index 300ae8d045298..8e54c4518890c 100644 --- a/ceph/ebofs/Cnode.h +++ b/ceph/ebofs/Cnode.h @@ -73,4 +73,12 @@ class Cnode : public LRUObject }; +inline ostream& operator<<(ostream& out, Cnode& cn) +{ + out << "cnode(" << hex << cn.coll_id << dec; + if (cn.is_dirty()) out << " dirty"; + out << ")"; + return out; +} + #endif diff --git a/ceph/ebofs/Ebofs.cc b/ceph/ebofs/Ebofs.cc index 78ed139c1621c..5d47d80c751e2 100644 --- a/ceph/ebofs/Ebofs.cc +++ b/ceph/ebofs/Ebofs.cc @@ -200,6 +200,15 @@ int Ebofs::umount() finisher_lock.Unlock(); finisher_thread.join(); + trim_bc(0); + trim_inodes(0); + + for (hash_map::iterator i = onode_map.begin(); + i != onode_map.end(); + i++) { + dout(1) << "leftover: " << i->first << " " << *(i->second) << endl; + } + // free memory dout(2) << "umount cleaning up" << endl; close_tables(); @@ -308,10 +317,13 @@ int Ebofs::commit_thread_entry() dout(10) << "commit_thread commit start, new epoch " << super_epoch << endl; dout(10) << "commit_thread data: " << get_free_blocks() << " free in " << get_free_extents() - << ", " << get_limbo_blocks() << " limbo in " << get_limbo_extents() << endl; + << " (" << 100*get_free_blocks()/dev.get_num_blocks() << "%)" + << ", " << get_limbo_blocks() << " limbo in " << get_limbo_extents() + << " (" << 100*get_limbo_blocks()/dev.get_num_blocks() << "%)" + << endl; dout(10) << "commit_thread nodes: " - << nodepool.num_free() << " free, " - << nodepool.num_limbo() << " limbo, " + << nodepool.num_free() << " (" << 100*nodepool.num_free()/nodepool.num_total() << "%) free, " + << nodepool.num_limbo() << " (" << 100*nodepool.num_limbo()/nodepool.num_total() << "%) free, " << nodepool.num_total() << " total." << endl; // (async) write onodes+condes (do this first; it currently involves inode reallocation) @@ -362,6 +374,7 @@ int Ebofs::commit_thread_entry() // trim bc? trim_bc(); + trim_inodes(); dout(10) << "commit_thread commit finish" << endl; } @@ -423,6 +436,7 @@ Onode* Ebofs::get_onode(object_t oid) // yay Onode *on = onode_map[oid]; on->get(); + //cout << "get_onode " << *on << endl; return on; } @@ -456,6 +470,7 @@ Onode* Ebofs::get_onode(object_t oid) // parse data block Onode *on = new Onode(oid); + onode_map[oid] = on; struct ebofs_onode *eo = (struct ebofs_onode*)bl.c_str(); on->onode_loc = eo->onode_loc; @@ -487,6 +502,7 @@ Onode* Ebofs::get_onode(object_t oid) waitfor_onode.erase(oid); // remove Cond list on->get(); + //cout << "get_onode " << *on << " (loaded)" << endl; return on; } } @@ -559,21 +575,30 @@ void Ebofs::write_onode(Onode *on) void Ebofs::remove_onode(Onode *on) { + //cout << "remove_onode " << *on << endl; + + assert(on->get_ref_count() >= 1); // caller + // tear down buffer cache - ObjectCache *oc = on->get_oc(&bc); - oc->tear_down(); // this will kick readers, flush waiters along the way. + if (on->oc) { + on->oc->tear_down(); // this will kick readers, flush waiters along the way. + on->close_oc(); + } // cancel commit waiters while (!on->commit_waiters.empty()) { Context *c = on->commit_waiters.front(); on->commit_waiters.pop_front(); - commit_waiters[super_epoch].remove(c); // FIXME slow, O(n) + commit_waiters[super_epoch].remove(c); // FIXME slow, O(n), though rare... c->finish(-ENOENT); delete c; } - // mark onode deleted + // remove from onode map, mark dangling/deleted + onode_map.erase(on->object_id); + onode_lru.lru_remove(on); on->deleted = true; + on->dangling = true; // remove from object table object_tab->remove(on->object_id); @@ -586,20 +611,24 @@ void Ebofs::remove_onode(Onode *on) for (unsigned i=0; iextents.size(); i++) allocator.release(on->extents[i]); - // remove from onode map - onode_map.erase(on->object_id); - if (on->is_dirty()) + + if (on->is_dirty()) { + on->mark_clean(); // this unpins *on dirty_onodes.erase(on); + } + //if (on->get_ref_count() > 1) cout << "remove_onode **** will survive " << *on << endl; put_onode(on); } void Ebofs::put_onode(Onode *on) { on->put(); + //cout << "put_onode " << *on << endl; - if (on->get_ref_count() == 0 && - on->deleted) { + if (on->get_ref_count() == 0 && on->dangling) { + //cout << " *** hosing on " << *on << endl; + on->close_oc(); delete on; } } @@ -612,20 +641,52 @@ void Ebofs::dirty_onode(Onode *on) } } -void Ebofs::trim_onode_cache() +void Ebofs::trim_inodes(int max) { - while (onode_lru.lru_get_size() > onode_lru.lru_get_max()) { + unsigned omax = onode_lru.lru_get_max(); + unsigned cmax = cnode_lru.lru_get_max(); + if (max >= 0) omax = cmax = max; + dout(10) << "trim_inodes start " + << onode_lru.lru_get_size() << " / " << omax << " onodes, " + << cnode_lru.lru_get_size() << " / " << cmax << " cnodes" << endl; + + // onodes + while (onode_lru.lru_get_size() > omax) { // expire an item Onode *on = (Onode*)onode_lru.lru_expire(); if (on == 0) break; // nothing to expire - + // expire - dout(12) << "trim_onode_cache removing " << on->object_id << endl; + dout(20) << "trim_inodes removing onode " << *on << endl; onode_map.erase(on->object_id); - delete on; + on->dangling = true; + + assert(on->oc == 0); // an open oc pins the onode! + + if (on->get_ref_count() == 0) { + delete on; + } else { + dout(20) << "trim_inodes still active: " << *on << endl; + } + } + + + // cnodes + while (cnode_lru.lru_get_size() > cmax) { + // expire an item + Cnode *cn = (Cnode*)cnode_lru.lru_expire(); + if (cn == 0) break; // nothing to expire + + // expire + dout(20) << "trim_inodes removing cnode " << *cn << endl; + cnode_map.erase(cn->coll_id); + + delete cn; } - dout(10) << "trim_onode_cache " << onode_lru.lru_get_size() << " left" << endl; + dout(10) << "trim_inodes finish " + << onode_lru.lru_get_size() << " / " << omax << " onodes, " + << cnode_lru.lru_get_size() << " / " << cmax << " cnodes" << endl; } @@ -812,7 +873,8 @@ void Ebofs::commit_inodes_start() inodes_flushing++; write_onode(on); on->mark_clean(); - on->uncommitted.clear(); // commit allocated blocks + on->uncommitted.clear(); // commit allocated blocks + on->commit_waiters.clear(); // these guys are gonna get taken care of, bc we committed. } dirty_onodes.clear(); @@ -855,9 +917,10 @@ void Ebofs::trim_buffer_cache() ebofs_lock.Unlock(); } -void Ebofs::trim_bc() +void Ebofs::trim_bc(off_t max) { - off_t max = g_conf.ebofs_bc_size; + if (max < 0) + max = g_conf.ebofs_bc_size; dout(10) << "trim_bc start: size " << bc.get_size() << ", trimmable " << bc.get_trimmable() << ", max " << max << endl; while (bc.get_size() > max && @@ -945,10 +1008,10 @@ void Ebofs::alloc_write(Onode *on, dout(10) << "alloc_write need to (re)alloc " << alloc << " on " << *on << endl; - if (alloc.empty()) return; + if (alloc.empty()) return; // no need to dirty the onode below! // merge alloc into onode uncommitted map - dout(10) << " union of " << on->uncommitted << " and " << alloc << endl; + //dout(10) << " union of " << on->uncommitted << " and " << alloc << endl; interval_set old = on->uncommitted; on->uncommitted.union_of(alloc); @@ -1060,23 +1123,23 @@ void Ebofs::apply_write(Onode *on, size_t len, off_t off, bufferlist& bl) bh->data.copy_in(0, bh->length()*EBOFS_BLOCK_SIZE, temp); } - // need to split off partial? - if (bh->is_missing() && bh->length() > 1 && - (bh->start() == bstart && off % EBOFS_BLOCK_SIZE != 0)) { - BufferHead *right = bc.split(bh, bh->start()+1); - hits[right->start()] = right; - dout(10) << "apply_write split off left block for partial write; rest is " << *right << endl; - } - if (bh->is_missing() && bh->length() > 1 && - (bh->last() == blast && len+off % EBOFS_BLOCK_SIZE != 0) && - (len+off < on->object_size)) { - BufferHead *right = bc.split(bh, bh->last()); - hits[right->start()] = right; - dout(10) << "apply_write split off right block for upcoming partial write; rest is " << *right << endl; + // need to split off partial? (partials can only be ONE block) + if ((bh->is_missing() || bh->is_rx()) && bh->length() > 1) { + if ((bh->start() == bstart && opos % EBOFS_BLOCK_SIZE != 0)) { + BufferHead *right = bc.split(bh, bh->start()+1); + hits[right->start()] = right; + dout(10) << "apply_write split off left block for partial write; rest is " << *right << endl; + } + if ((bh->last() == blast && (len+off) % EBOFS_BLOCK_SIZE != 0) && + (len+off < on->object_size)) { + BufferHead *right = bc.split(bh, bh->last()); + hits[right->start()] = right; + dout(10) << "apply_write split off right block for upcoming partial write; rest is " << *right << endl; + } } // partial at head or tail? - if ((bh->start() == bstart && off % EBOFS_BLOCK_SIZE != 0) || + if ((bh->start() == bstart && opos % EBOFS_BLOCK_SIZE != 0) || // opos, not off, in case we're zeroing... (bh->last() == blast && (len+off) % EBOFS_BLOCK_SIZE != 0)) { // locate ourselves in bh unsigned off_in_bh = opos - bh->start()*EBOFS_BLOCK_SIZE; @@ -1404,12 +1467,17 @@ int Ebofs::write(object_t oid, if (fsync) { // wait for flush. Cond cond; - int flush = 0; - int r = write(oid, len, off, bl, new C_Cond(&cond)); + int flush = 1; // write never returns positive + Context *c = new C_Cond(&cond, &flush); + int r = write(oid, len, off, bl, c); if (r < 0) return r; - cond.Wait(ebofs_lock); - + ebofs_lock.Lock(); + if (flush == 1) { // write never returns positive + cond.Wait(ebofs_lock); + assert(flush <= 0); + } + ebofs_lock.Unlock(); if (flush < 0) return flush; return r; } else { @@ -1426,6 +1494,12 @@ int Ebofs::write(object_t oid, dout(7) << "write " << hex << oid << dec << " len " << len << " off " << off << endl; assert(len > 0); + // too much unflushed dirty data? (if so, block!) + while (_write_will_block()) { + dout(10) << "write blocking on write" << endl; + bc.waitfor_stat(); // waits on ebofs_lock + } + // out of space? if (len / EBOFS_BLOCK_SIZE + 10 >= free_blocks) { dout(1) << "write failing, only " << free_blocks << " blocks free" << endl; @@ -1433,12 +1507,6 @@ int Ebofs::write(object_t oid, ebofs_lock.Unlock(); return -ENOSPC; } - - // too much unflushed dirty data? (if so, block!) - while (_write_will_block()) { - dout(10) << "write blocking on write" << endl; - bc.waitfor_stat(); - } // get|create inode Onode *on = get_onode(oid); @@ -1702,6 +1770,15 @@ int Ebofs::collection_remove(coll_t cid, object_t oid) } oc_tab->remove(idpair_t(oid,cid)); co_tab->remove(idpair_t(cid,oid)); + + // hose cnode? + if (cnode_map.count(cid)) { + Cnode *cn = cnode_map[cid]; + cnode_map.erase(cid); + cnode_lru.lru_remove(cn); + delete cn; + } + ebofs_lock.Unlock(); return 0; } diff --git a/ceph/ebofs/Ebofs.h b/ceph/ebofs/Ebofs.h index 4029407289af2..6b6df832e0a19 100644 --- a/ceph/ebofs/Ebofs.h +++ b/ceph/ebofs/Ebofs.h @@ -129,9 +129,7 @@ class Ebofs : public ObjectStore { void commit_inodes_wait(); friend class C_E_InodeFlush; - public: - void trim_onode_cache(); - protected: + void trim_inodes(int max = -1); // ** buffer cache ** BufferCache bc; @@ -139,7 +137,7 @@ class Ebofs : public ObjectStore { version_t trigger_commit(); void commit_bc_wait(version_t epoch); - void trim_bc(); + void trim_bc(off_t max = -1); public: void sync(); @@ -181,6 +179,8 @@ class Ebofs : public ObjectStore { bufferpool(EBOFS_BLOCK_SIZE), nodepool(ebofs_lock), object_tab(0), limbo_tab(0), collection_tab(0), oc_tab(0), co_tab(0), + onode_lru(g_conf.ebofs_oc_size), + cnode_lru(g_conf.ebofs_cc_size), inodes_flushing(0), bc(dev, bufferpool, ebofs_lock), finisher_stop(false), finisher_thread(this) { diff --git a/ceph/ebofs/Onode.h b/ceph/ebofs/Onode.h index 1e9021a0342a4..fb6d239c170e4 100644 --- a/ceph/ebofs/Onode.h +++ b/ceph/ebofs/Onode.h @@ -41,14 +41,15 @@ public: ObjectCache *oc; bool dirty; - bool deleted; + bool dangling; // not in onode_map + bool deleted; // deleted list commit_waiters; public: Onode(object_t oid) : ref(0), object_id(oid), version(0), object_size(0), object_blocks(0), oc(0), - dirty(false), deleted(false) { + dirty(false), dangling(false), deleted(false) { onode_loc.length = 0; } ~Onode() { @@ -62,12 +63,12 @@ public: void get() { if (ref == 0) lru_pin(); ref++; - //cout << "ebofs.onode.get " << ref << endl; + //cout << "ebofs.onode.get " << hex << object_id << dec << " " << ref << endl; } void put() { ref--; if (ref == 0) lru_unpin(); - //cout << "ebofs.onode.put " << ref << endl; + //cout << "ebofs.onode.put " << hex << object_id << dec << " " << ref << endl; } void mark_dirty() { @@ -83,20 +84,29 @@ public: } } bool is_dirty() { return dirty; } + bool is_deleted() { return deleted; } + bool is_dangling() { return dangling; } ObjectCache *get_oc(BufferCache *bc) { if (!oc) { oc = new ObjectCache(object_id, bc); + oc->get(); get(); } return oc; } void close_oc() { - assert(oc); - delete oc; - oc = 0; - put(); + if (oc) { + //cout << "close_oc on " << object_id << endl; + assert(oc->is_empty()); + if (oc->put() == 0){ + //cout << "************************* hosing oc" << endl; + delete oc; + } + oc = 0; + put(); + } } @@ -264,7 +274,12 @@ public: inline ostream& operator<<(ostream& out, Onode& on) { - out << "onode(" << hex << on.object_id << dec << " len=" << on.object_size << ")"; + out << "onode(" << hex << on.object_id << dec << " len=" << on.object_size; + out << " ref=" << on.get_ref_count(); + if (on.is_dirty()) out << " dirty"; + if (on.is_dangling()) out << " dangling"; + if (on.is_deleted()) out << " deleted"; + out << ")"; return out; } diff --git a/ceph/ebofs/Table.h b/ceph/ebofs/Table.h index 9c55c28121f45..4ddf8947344c1 100644 --- a/ceph/ebofs/Table.h +++ b/ceph/ebofs/Table.h @@ -6,7 +6,7 @@ /** table **/ -#define dbtout if (10 <= g_conf.debug_ebofs) cout << "table(" << this << ")." +#define dbtout if (25 <= g_conf.debug_ebofs) cout << "ebofs.table(" << this << ")." template diff --git a/ceph/ebofs/mkfs.ebofs.cc b/ceph/ebofs/mkfs.ebofs.cc index 748f9a68675d9..884630ad42934 100644 --- a/ceph/ebofs/mkfs.ebofs.cc +++ b/ceph/ebofs/mkfs.ebofs.cc @@ -32,12 +32,12 @@ int main(int argc, char **argv) memset(crap, 0, 1024*1024); bl.append(crap, 1024*1024); - int megs = 10000; + int megs = 1000; utime_t start = g_clock.now(); for (off_t m=0; mSignal(); } }; diff --git a/ceph/include/buffer.h b/ceph/include/buffer.h index c1ffd605d3490..68781fe6e3d13 100644 --- a/ceph/include/buffer.h +++ b/ceph/include/buffer.h @@ -203,24 +203,29 @@ class bufferptr { _buffer(b), _len(b->_len), _off(0) { - _buffer->_get(); + assert(_buffer->_ref == 0); + _buffer->_get(); // this is always the first one. } // subset cons - a subset of another bufferptr (subset) - bufferptr(const bufferptr& bp, unsigned len, unsigned off) : - _buffer(bp._buffer), - _len(len) { + bufferptr(const bufferptr& bp, unsigned len, unsigned off) { + bufferlock.Lock(); + _buffer = bp._buffer; + _len = len; _off = bp._off + off; _buffer->_get(); assert(_off < _buffer->_len); // sanity checks assert(_off + _len <= _buffer->_len); + bufferlock.Unlock(); } // copy cons - bufferptr(const bufferptr &other) : - _buffer(other._buffer), - _len(other._len), - _off(other._off) { + bufferptr(const bufferptr &other) { + bufferlock.Lock(); + _buffer = other._buffer; + _len = other._len; + _off = other._off; if (_buffer) _buffer->_get(); + bufferlock.Unlock(); } // assignment operator @@ -229,11 +234,13 @@ class bufferptr { // discard old discard_buffer(); - // new + // point to other + bufferlock.Lock(); _buffer = other._buffer; _len = other._len; _off = other._off; if (_buffer) _buffer->_get(); + bufferlock.Unlock(); return *this; } @@ -246,6 +253,7 @@ class bufferptr { bufferlock.Lock(); if (_buffer->_put() == 0) delete _buffer; + _buffer = 0; bufferlock.Unlock(); } } diff --git a/ceph/include/interval_set.h b/ceph/include/interval_set.h index d2b9345c00504..f837f52a1d80a 100644 --- a/ceph/include/interval_set.h +++ b/ceph/include/interval_set.h @@ -139,7 +139,8 @@ class interval_set { typename map::iterator n = p; n++; - if (start+len == n->first) { // combine with next, too! + if (n != m.end() && + start+len == n->first) { // combine with next, too! p->second += n->second; m.erase(n); } @@ -232,59 +233,6 @@ class interval_set { // + b insert(b); return; - - - /* i think this is correct? - typename map::const_iterator pa = a.m.begin(); - typename map::const_iterator pb = b.m.begin(); - - T upto = 0; - while (pa != a.m.end() || pb != b.m.end()) { - // passed? - if (pa != a.m.end() && pa->first + pa->second <= upto) - { pa++; continue; } - if (pb != b.m.end() && pb->first + pb->second <= upto) - { pb++; continue; } - - // extending? - if (pa != a.m.end() && pa->first <= upto && pa->first+pa->second > upto) { - insert( upto, pa->first+pa->second - upto ); - upto = pa->first+pa->second; - pa++; - continue; - } - if (pb != b.m.end() && pb->first <= upto && pb->first+pb->second > upto) { - insert( upto, pb->first+pb->second - upto ); - upto = pb->first+pb->second; - pb++; - continue; - } - - // gap! - - // non-overlapping? - if (pb == b.m.end() || - (pa != a.m.end() && pa->first + pa->second <= pb->first)) { - insert(pa->first, pa->second); - upto = pa->first + pa->second; - pa++; continue; - } - if (pa == a.m.end() || - (pb != b.m.end() && pb->first + pb->second <= pa->first)) { - insert(pb->first, pb->second); - upto = pb->first + pb->second; - pb++; continue; - } - - // overlapping! - const T start = MIN(pa->first, pb->first); - upto = MAX(pa->first+pa->second, pb->first+pb->second); - insert(start, upto-start); - pa++; - pb++; - } - */ - } void union_of(const interval_set &b) { interval_set a; diff --git a/ceph/include/lru.h b/ceph/include/lru.h index a6e91e6d91674..59e7b4d7ea245 100644 --- a/ceph/include/lru.h +++ b/ceph/include/lru.h @@ -110,14 +110,10 @@ class LRU { //friend class MDCache; // hack public: - LRU() { + LRU(int max = 0) { lru_num = 0; lru_num_pinned = 0; lru_midpoint = .9; - lru_max = 0; - } - LRU(int max) { - LRU(); lru_max = max; } diff --git a/ceph/osd/OSD.cc b/ceph/osd/OSD.cc index 01d1bb3ee48ae..23b83efb177c5 100644 --- a/ceph/osd/OSD.cc +++ b/ceph/osd/OSD.cc @@ -83,7 +83,7 @@ OSD::OSD(int id, Messenger *m) store = new OBFSStore(whoami, NULL, "/dev/sdb3"); #else # ifdef USE_EBOFS - if (g_conf.osd_ebofs) { + if (g_conf.ebofs) { char hostname[100]; hostname[0] = 0; gethostname(hostname,100); -- 2.39.5