From b24257c507da8367f06018dd9fe77542a88be912 Mon Sep 17 00:00:00 2001 From: sageweil Date: Tue, 18 Dec 2007 01:49:10 +0000 Subject: [PATCH] more partial csum handling.. leading to disaster git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@2219 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/ebofs/TODO | 8 +++-- branches/ebofs/ebofs/BufferCache.cc | 54 ++++++++++++++++++++++++----- branches/ebofs/ebofs/BufferCache.h | 2 +- branches/ebofs/ebofs/Ebofs.cc | 22 ++++++++++-- branches/ebofs/ebofs/Onode.h | 6 ++-- 5 files changed, 74 insertions(+), 18 deletions(-) diff --git a/branches/ebofs/TODO b/branches/ebofs/TODO index b5859a5192e76..8ea68c8c43f39 100644 --- a/branches/ebofs/TODO +++ b/branches/ebofs/TODO @@ -189,9 +189,13 @@ reliability - osdmonitor, filter ebofs -- allow holes +/- allow holes - checksums - - validate checksums on read +/ - validate checksums on read +/ - track partial zero regions +/ - fix partial handling: barrier on commit so that any errors propagate into onode. + - fix onode csum: should be effective csum during commit? can we delay apply_partial csum update in onode? HMM!!! + - allow btree sets - optionally scrub deallocated extents diff --git a/branches/ebofs/ebofs/BufferCache.cc b/branches/ebofs/ebofs/BufferCache.cc index db826694e9d2b..c8bfba9902493 100644 --- a/branches/ebofs/ebofs/BufferCache.cc +++ b/branches/ebofs/ebofs/BufferCache.cc @@ -119,7 +119,8 @@ void BufferHead::apply_partial() do_apply_partial(data, partial); csum_t newc = calc_csum(data.c_str(), EBOFS_BLOCK_SIZE); csum_t *p = oc->on->get_extent_csum_ptr(start(), 1); - dout(10) << "apply_partial was " << hex << oldc + dout(10) << "apply_partial onode had " << hex << *p + << " bl was " << oldc << " now " << newc << dec << dendl; assert(*p == oldc); *p = newc; @@ -186,7 +187,7 @@ void ObjectCache::rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist csum_t got[bh->length()]; for (unsigned i=0; ilength(); i++) { got[i] = calc_csum(&bh->data[i*EBOFS_BLOCK_SIZE], EBOFS_BLOCK_SIZE); - if (rand() % 10 == 0) { + if (false && rand() % 10 == 0) { dout(0) << "rx_finish HACK INJECTING bad csum" << dendl; derr(0) << "rx_finish HACK INJECTING bad csum" << dendl; got[i] = 0; @@ -244,6 +245,32 @@ void ObjectCache::rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist assert(cur_block == bh->partial_tx_to); } + // verify csum + assert(bl.length() == (unsigned)EBOFS_BLOCK_SIZE); + csum_t want = *bh->oc->on->get_extent_csum_ptr(bh->start(), 1); + csum_t got = calc_csum(bl.c_str(), bl.length()); + if (want != got) { + derr(0) << "rx_finish bad csum on partial readback, want " << hex << want + << " got " << got << dec << dendl; + dout(0) << "rx_finish bad csum on partial readback, want " << hex << want + << " got " << got << dec << dendl; + *bh->oc->on->get_extent_csum_ptr(bh->start(), 1) = got; + bh->oc->on->data_csum += got - want; + + interval_set bad; + bad.insert(bh->start()*EBOFS_BLOCK_SIZE, EBOFS_BLOCK_SIZE); + bh->oc->on->bad_byte_extents.union_of(bad); + + interval_set over; + for (map::iterator q = bh->partial.begin(); + q != bh->partial.end(); + q++) + over.insert(bh->start()*EBOFS_BLOCK_SIZE+q->first, q->second.length()); + interval_set new_over; + new_over.intersection_of(over, bh->oc->on->bad_byte_extents); + bh->oc->on->bad_byte_extents.subtract(new_over); + } + // ok, cancel my low-level partial (since we're still here, and can bh_write ourselves) bc->cancel_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial_tx_epoch ); @@ -1260,6 +1287,14 @@ void BufferCache::rx_finish(ObjectCache *oc, dout(0) << "rx_finish bad csum on partial block " << pblock << dendl; derr(0) << "rx_finish bad csum on partial block " << pblock << " ****************" << dendl; poison_commit = true; + *sp->second.on->get_extent_csum_ptr(sp->second.oblock, 1) = actual; + sp->second.on->data_csum += actual - want; + + + interval_set bad; + bad.insert(sp->second.oblock*EBOFS_BLOCK_SIZE, EBOFS_BLOCK_SIZE); + sp->second.on->bad_byte_extents.union_of(bad); + interval_set overwritten; for (map::iterator p = sp->second.writes.begin(); p != sp->second.writes.end(); @@ -1268,12 +1303,13 @@ void BufferCache::rx_finish(ObjectCache *oc, for (map::iterator q = p->second.partial.begin(); q != p->second.partial.end(); q++) - o.insert(q->first, q->second.length()); + o.insert(sp->second.oblock*EBOFS_BLOCK_SIZE+q->first, q->second.length()); overwritten.union_of(o); } interval_set new_over; new_over.intersection_of(sp->second.on->bad_byte_extents, overwritten); sp->second.on->bad_byte_extents.subtract(new_over); + dout(10) << "rx_finish overwrote " << overwritten << ", newly " << new_over << ", now " << sp->second.on->bad_byte_extents.m << dendl; } @@ -1292,9 +1328,9 @@ void BufferCache::rx_finish(ObjectCache *oc, do_apply_partial( combined, p->second.partial ); // write it! - dev.write( pblock, 1, combined, - new C_OC_PartialTxFinish( this, p->second.epoch ), - "finish_partials"); + dev.write(pblock, 1, combined, + new C_OC_PartialTxFinish( this, p->second.epoch ), + "finish_partials"); } partial_write.erase(sp); } @@ -1402,12 +1438,12 @@ void BufferCache::bh_cancel_partial_write(BufferHead *bh) } -void BufferCache::queue_partial(Onode *on, block_t opos, csum_t csum, +void BufferCache::queue_partial(Onode *on, block_t oblock, csum_t csum, block_t from, block_t to, map& partial, version_t epoch) { - dout(10) << "queue_partial " << on->object_id << " at " << opos + dout(10) << "queue_partial " << on->object_id << " at " << oblock << " from disk " << from << " -> " << to << " in epoch " << epoch << dendl; @@ -1423,7 +1459,7 @@ void BufferCache::queue_partial(Onode *on, block_t opos, csum_t csum, on->get(); // one ref for each pair. partial_write[from].on = on; partial_write[from].csum = csum; - partial_write[from].opos = opos; + partial_write[from].oblock = oblock; partial_write[from].writes[to].partial = partial; partial_write[from].writes[to].epoch = epoch; } diff --git a/branches/ebofs/ebofs/BufferCache.h b/branches/ebofs/ebofs/BufferCache.h index 8fae665ffe9d0..4be4e7e2eff72 100644 --- a/branches/ebofs/ebofs/BufferCache.h +++ b/branches/ebofs/ebofs/BufferCache.h @@ -436,7 +436,7 @@ class BufferCache { }; struct PartialWriteSet { Onode *on; // object - block_t opos; // block in object + block_t oblock; // block in object csum_t csum; // expected csum map writes; }; diff --git a/branches/ebofs/ebofs/Ebofs.cc b/branches/ebofs/ebofs/Ebofs.cc index 5a59745b4bc62..14ff96e7879f9 100644 --- a/branches/ebofs/ebofs/Ebofs.cc +++ b/branches/ebofs/ebofs/Ebofs.cc @@ -877,6 +877,7 @@ void Ebofs::encode_onode(Onode *on, bufferlist& bl, unsigned& off) p++) { Extent o(p->first, p->second); bl.copy_in(off, sizeof(o), (char*)&o); + off += sizeof(o); dout(15) << "encode_onode " << *on << " bad byte ex " << o << dendl; } @@ -2033,6 +2034,21 @@ int Ebofs::attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, dout(10) << "attempt_read " << *on << " " << off << "~" << len << dendl; ObjectCache *oc = on->get_oc(&bc); + // overlapping bad byte extents? + if (!on->bad_byte_extents.empty()) { + if (on->bad_byte_extents.contains(off)) { + dout(10) << "attempt_read corrupt (bad byte extent) at off " << off << ", returning -EIO" << dendl; + return -EIO; + } + if (on->bad_byte_extents.end() > off) { + off_t bad = on->bad_byte_extents.start_after(off); + if (bad < off+(off_t)len) { + len = bad-off; + dout(10) << "attempt_read corrupt (bad byte extent) at " << bad << ", shortening read to " << len << dendl; + } + } + } + // map block_t bstart = off / EBOFS_BLOCK_SIZE; block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; @@ -2103,13 +2119,13 @@ int Ebofs::attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, block_t curblock = bstart; while (curblock <= blast) { BufferHead *bh = 0; - if (h->first == curblock) { + if (h != hits.end() && h->first == curblock) { bh = h->second; h++; - } else if (p->first == curblock) { + } else if (p != partials.end() && p->first == curblock) { bh = p->second; p++; - } else if (c->first == curblock) { + } else if (c != corrupt.end() && c->first == curblock) { bh = c->second; c++; } else assert(0); diff --git a/branches/ebofs/ebofs/Onode.h b/branches/ebofs/ebofs/Onode.h index 402d20cfadc7d..85bd658d05ba1 100644 --- a/branches/ebofs/ebofs/Onode.h +++ b/branches/ebofs/ebofs/Onode.h @@ -180,7 +180,7 @@ public: for (map::iterator p = extent_map.begin(); p != extent_map.end(); p++) { - //cout << " verify_extents " << p->first << ": " << p->second << std::endl; + cout << " verify_extents " << p->first << ": " << p->second << std::endl; assert(pos == p->first); pos += p->second.ex.length; if (p->second.ex.start) { @@ -192,7 +192,7 @@ public: } } } - //cout << " verify_extents got csum " << hex << csum << " want " << data_csum << dec << std::endl; + cout << " verify_extents got csum " << hex << csum << " want " << data_csum << dec << std::endl; assert(s.size() == count); assert(count == alloc_blocks); @@ -518,7 +518,7 @@ public: return sizeof(Extent) * extent_map.size() + sizeof(csum_t)*alloc_blocks; } int get_bad_byte_bytes() { - return 2 * sizeof(off_t) * bad_byte_extents.m.size(); + return sizeof(Extent) * bad_byte_extents.m.size(); } }; -- 2.39.5