From a45bebb5c98e92a946eb5908926d32acc22ac1be Mon Sep 17 00:00:00 2001 From: sageweil Date: Mon, 17 Dec 2007 20:37:16 +0000 Subject: [PATCH] ability to poison commits; untested git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@2214 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/ebofs/ebofs/BufferCache.cc | 19 +++++++- branches/ebofs/ebofs/BufferCache.h | 2 + branches/ebofs/ebofs/Cnode.h | 3 +- branches/ebofs/ebofs/Ebofs.cc | 68 +++++++++++++++-------------- branches/ebofs/ebofs/Ebofs.h | 4 +- branches/ebofs/ebofs/FileJournal.h | 17 +++++--- branches/ebofs/ebofs/Onode.h | 6 ++- branches/ebofs/ebofs/nodes.h | 18 ++++---- branches/ebofs/ebofs/types.h | 7 ++- 9 files changed, 88 insertions(+), 56 deletions(-) diff --git a/branches/ebofs/ebofs/BufferCache.cc b/branches/ebofs/ebofs/BufferCache.cc index 84c82aeb48c9a..ed3cffd60cbe7 100644 --- a/branches/ebofs/ebofs/BufferCache.cc +++ b/branches/ebofs/ebofs/BufferCache.cc @@ -1274,7 +1274,24 @@ void BufferCache::rx_finish(ObjectCache *oc, csum_t actual = calc_csum(bl.c_str(), bl.length()); if (actual != sp->second.csum) { dout(0) << "rx_finish bad csum on partial block " << pblock << dendl; - derr(0) << "rx_finish bad csum on partial block " << pblock << dendl; + derr(0) << "rx_finish bad csum on partial block " << pblock << " ****************" << dendl; + poison_commit = true; + interval_set overwritten; + for (map::iterator p = sp->second.writes.begin(); + p != sp->second.writes.end(); + p++) { + interval_set o; + for (map::iterator q = p->second.partial.begin(); + q != p->second.partial.end(); + q++) + o.insert(q->first, q->second.length()); + overwritten.union_of(o); + } + interval_set new_over; + new_over.intersection_of(sp->second.on->bad_byte_extents, overwritten); + sp->second.on->bad_byte_extents.subtract(new_over); + dout(10) << "rx_finish overwrote " << overwritten << ", newly " << new_over + << ", now " << sp->second.on->bad_byte_extents.m << dendl; } for (map::iterator p = sp->second.writes.begin(); diff --git a/branches/ebofs/ebofs/BufferCache.h b/branches/ebofs/ebofs/BufferCache.h index 12f3cde28704c..aa86a02f31255 100644 --- a/branches/ebofs/ebofs/BufferCache.h +++ b/branches/ebofs/ebofs/BufferCache.h @@ -398,6 +398,8 @@ class BufferCache { LRU lru_dirty, lru_rest; + bool poison_commit; + private: Cond stat_cond; Cond flush_cond; diff --git a/branches/ebofs/ebofs/Cnode.h b/branches/ebofs/ebofs/Cnode.h index 8415978893fb5..77c392cee6b04 100644 --- a/branches/ebofs/ebofs/Cnode.h +++ b/branches/ebofs/ebofs/Cnode.h @@ -34,11 +34,12 @@ class Cnode : public LRUObject public: coll_t coll_id; Extent cnode_loc; + epoch_t last_alloc_epoch; map attr; public: - Cnode(coll_t cid) : ref(0), dirty(false), coll_id(cid) { + Cnode(coll_t cid) : ref(0), dirty(false), coll_id(cid), last_alloc_epoch(0) { cnode_loc.length = 0; } ~Cnode() { diff --git a/branches/ebofs/ebofs/Ebofs.cc b/branches/ebofs/ebofs/Ebofs.cc index deae741ec865c..2a2666e3222fb 100644 --- a/branches/ebofs/ebofs/Ebofs.cc +++ b/branches/ebofs/ebofs/Ebofs.cc @@ -95,7 +95,6 @@ int Ebofs::mount() sb = sb2; super_epoch = sb->epoch; dout(3) << "mount epoch " << super_epoch << dendl; - assert(super_epoch == sb->epoch); super_fsid = sb->fsid; @@ -464,7 +463,7 @@ int Ebofs::commit_thread_entry() dout(10) << "commit_thread not dirty" << dendl; } else { - // --- this all happens in one go, from here --- + // --- get ready for a new epoch --- super_epoch++; dirty = false; @@ -491,34 +490,39 @@ int Ebofs::commit_thread_entry() << ", max dirty " << g_conf.ebofs_bc_max_dirty << dendl; - if (journal) journal->commit_epoch_start(); - - // (async) write onodes+condes (do this first; it currently involves inode reallocation) - commit_inodes_start(); - - allocator.commit_limbo(); // limbo -> limbo_tab - - // (async) write btree nodes - nodepool.commit_start( dev, super_epoch ); - - // prepare super (before any changes get made!) bufferptr superbp; - prepare_super(super_epoch, superbp); - - // --- to here. --- - // now wait. + int attempt = 1; + while (1) { + // --- queue up commit writes --- + bc.poison_commit = false; + if (journal) + journal->commit_epoch_start(); // FIXME: make loopable + commit_inodes_start(); // do this first; it currently involves inode reallocation + allocator.commit_limbo(); // limbo -> limbo_tab + nodepool.commit_start(dev, super_epoch); + prepare_super(super_epoch, superbp); // prepare super (before any new changes get made!) - // blockdev barrier (prioritize our writes!) - dout(30) << "commit_thread barrier. flushing inodes " << inodes_flushing << dendl; - dev.barrier(); - - // wait for it all to flush (drops global lock) - commit_bc_wait(super_epoch-1); - dout(30) << "commit_thread bc flushed" << dendl; - commit_inodes_wait(); - dout(30) << "commit_thread inodes flushed" << dendl; - nodepool.commit_wait(); - dout(30) << "commit_thread btree nodes flushed" << dendl; + // --- now (try to) flush everything --- + // (partial writes may fail if read block has a bad csum) + + // blockdev barrier (prioritize our writes!) + dout(30) << "commit_thread barrier. flushing inodes " << inodes_flushing << dendl; + dev.barrier(); + + // wait for it all to flush (drops global lock) + commit_bc_wait(super_epoch-1); + dout(30) << "commit_thread bc flushed" << dendl; + commit_inodes_wait(); + dout(30) << "commit_thread inodes flushed" << dendl; + nodepool.commit_wait(); + dout(30) << "commit_thread btree nodes flushed" << dendl; + + if (!bc.poison_commit) + break; // ok! + + ++attempt; + dout(1) << "commit_thread commit poisoned, retrying, attempt " << attempt << dendl; + } // ok, now (synchronously) write the prior super! dout(10) << "commit_thread commit flushed, writing super for prior epoch" << dendl; @@ -532,6 +536,7 @@ int Ebofs::commit_thread_entry() // (since we're done allocating things, // AND we've flushed all previous epoch data) allocator.release_limbo(); // limbo_tab -> free_tabs + nodepool.commit_finish(); // do we need more node space? if (nodepool.get_num_free() < nodepool.get_num_total() / 3) { @@ -867,7 +872,7 @@ void Ebofs::write_onode(Onode *on) bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks) ); // (always) relocate onode - if (1) { + if (super_epoch > on->last_alloc_epoch) { if (on->onode_loc.length) allocator.release(on->onode_loc); @@ -1149,7 +1154,7 @@ void Ebofs::write_cnode(Cnode *cn) bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks) ); // (always) relocate cnode! - if (1) { + if (super_epoch > cn->last_alloc_epoch) { if (cn->cnode_loc.length) allocator.release(cn->cnode_loc); @@ -1244,8 +1249,7 @@ void Ebofs::commit_inodes_start() inodes_flushing++; write_onode(on); on->mark_clean(); - on->uncommitted.clear(); // commit allocated blocks - on->commit_waiters.clear(); // these guys are gonna get taken care of, bc we committed. + on->uncommitted.clear(); // commit any newly allocated blocks } dirty_onodes.clear(); diff --git a/branches/ebofs/ebofs/Ebofs.h b/branches/ebofs/ebofs/Ebofs.h index bbd618d8f9d61..5b3320f483153 100644 --- a/branches/ebofs/ebofs/Ebofs.h +++ b/branches/ebofs/ebofs/Ebofs.h @@ -55,7 +55,7 @@ protected: bool mounted, unmounting, dirty; bool readonly; version_t super_epoch; - bool commit_thread_started, mid_commit; + bool commit_thread_started; Cond commit_cond; // to wake up the commit thread Cond sync_cond; uint64_t super_fsid; @@ -237,7 +237,7 @@ protected: fake_writes(false), dev(devfn), mounted(false), unmounting(false), dirty(false), readonly(false), - super_epoch(0), commit_thread_started(false), mid_commit(false), + super_epoch(0), commit_thread_started(false), commit_thread(this), journalfn(jfn), journal(0), free_blocks(0), limbo_blocks(0), diff --git a/branches/ebofs/ebofs/FileJournal.h b/branches/ebofs/ebofs/FileJournal.h index 7c9a67ccbd25f..446adeb826c71 100644 --- a/branches/ebofs/ebofs/FileJournal.h +++ b/branches/ebofs/ebofs/FileJournal.h @@ -25,10 +25,11 @@ class FileJournal : public Journal { public: /** log header - * we allow 3 pointers: + * we allow 4 pointers: * top/initial, - * one for an epoch boundary, - * and one for a wrap in the ring buffer/journal file. + * one for an epoch boundary (if any), + * one for a wrap in the ring buffer/journal file, + * one for a second epoch boundary (if any). * the epoch boundary one is useful only for speedier recovery in certain cases * (i.e. when ebofs committed, but the journal didn't rollover ... very small window!) */ @@ -37,8 +38,8 @@ public: int num; off_t wrap; off_t max_size; - epoch_t epoch[3]; - off_t offset[3]; + epoch_t epoch[4]; + off_t offset[4]; header_t() : fsid(0), num(0), wrap(0), max_size(0) {} @@ -56,7 +57,11 @@ public: } } void push(epoch_t e, off_t o) { - assert(num < 3); + assert(num < 4); + if (num > 2 && + epoch[num-1] == e && + epoch[num-2] == (e-1)) + num--; // tail was an epoch boundary; replace it. epoch[num] = e; offset[num] = o; num++; diff --git a/branches/ebofs/ebofs/Onode.h b/branches/ebofs/ebofs/Onode.h index a0acdbad1aa27..f2e6cab4dead3 100644 --- a/branches/ebofs/ebofs/Onode.h +++ b/branches/ebofs/ebofs/Onode.h @@ -74,6 +74,8 @@ public: // data Extent onode_loc; + epoch_t last_alloc_epoch; // epoch i last allocated for + __s64 object_size; __u64 alloc_blocks, last_block; csum_t data_csum; @@ -94,10 +96,10 @@ public: bool dangling; // not in onode_map bool deleted; // deleted - list commit_waiters; + //list commit_waiters; public: - Onode(pobject_t oid) : ref(0), object_id(oid), version(0), + Onode(pobject_t oid) : ref(0), object_id(oid), version(0), last_alloc_epoch(0), object_size(0), alloc_blocks(0), last_block(0), data_csum(0), readonly(0), oc(0), diff --git a/branches/ebofs/ebofs/nodes.h b/branches/ebofs/ebofs/nodes.h index 86ed013fc2efa..abc4d493ed710 100644 --- a/branches/ebofs/ebofs/nodes.h +++ b/branches/ebofs/ebofs/nodes.h @@ -424,6 +424,16 @@ class NodePool { flushing++; } + debofs(20) << "ebofs.nodepool.commit_start finish" << std::endl; + } + + void commit_wait() { + while (flushing > 0) + commit_cond.Wait(ebofs_lock); + debofs(20) << "ebofs.nodepool.commit_wait finish" << std::endl; + } + + void commit_finish() { // limbo -> free for (map::iterator i = limbo.m.begin(); i != limbo.m.end(); @@ -433,14 +443,6 @@ class NodePool { free.insert(i->first, i->second); } limbo.clear(); - - debofs(20) << "ebofs.nodepool.commit_start finish" << std::endl; - } - - void commit_wait() { - while (flushing > 0) - commit_cond.Wait(ebofs_lock); - debofs(20) << "ebofs.nodepool.commit_wait finish" << std::endl; } diff --git a/branches/ebofs/ebofs/types.h b/branches/ebofs/ebofs/types.h index c853d0605cdb6..01c54b5bf416b 100644 --- a/branches/ebofs/ebofs/types.h +++ b/branches/ebofs/ebofs/types.h @@ -169,11 +169,10 @@ struct ebofs_super { } bool is_corrupt() { csum_t actual = calc_csum(); - if (actual != super_csum) { - cout << "actual " << actual << " expected " << super_csum << std::endl; + if (actual != super_csum) return true; - } - return false; + else + return false; } bool is_valid_magic() { return s_magic == EBOFS_MAGIC; } bool is_valid() { return is_valid_magic() && !is_corrupt(); } -- 2.39.5