From e7493e1193d766506f11b32d398b60cbbfa8c059 Mon Sep 17 00:00:00 2001 From: sageweil Date: Tue, 27 Nov 2007 00:26:21 +0000 Subject: [PATCH] validate checksums on reads; still need to deal with partial writes git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@2117 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/ebofs/TODO | 3 + branches/ebofs/config.cc | 8 +- branches/ebofs/config.h | 4 +- branches/ebofs/ebofs/BufferCache.cc | 129 +++++++++++++++++++--------- branches/ebofs/ebofs/BufferCache.h | 12 ++- branches/ebofs/ebofs/Ebofs.cc | 88 +++++++++++++------ branches/ebofs/ebofs/Ebofs.h | 7 +- branches/ebofs/ebofs/Onode.h | 4 +- branches/ebofs/ebofs/test.ebofs.cc | 28 +++--- 9 files changed, 187 insertions(+), 96 deletions(-) diff --git a/branches/ebofs/TODO b/branches/ebofs/TODO index 6cc4b221019d1..b5859a5192e76 100644 --- a/branches/ebofs/TODO +++ b/branches/ebofs/TODO @@ -190,6 +190,9 @@ reliability ebofs - allow holes +- checksums + - validate checksums on read + - allow btree sets - optionally scrub deallocated extents - clone() diff --git a/branches/ebofs/config.cc b/branches/ebofs/config.cc index bad3c05db8155..3dd7dfb79f2a9 100644 --- a/branches/ebofs/config.cc +++ b/branches/ebofs/config.cc @@ -335,9 +335,7 @@ md_config_t g_conf = { ebofs_bc_max_dirty: (30 *256), // before write() will block ebofs_max_prefetch: 1000, // 4k blocks ebofs_realloc: false, // hrm, this can cause bad fragmentation, don't use! - - ebofs_abp_zero: false, // zero newly allocated buffers (may shut up valgrind) - ebofs_abp_max_alloc: 4096*16, // max size of new buffers (larger -> more memory fragmentation) + ebofs_verify_csum_on_read: true, // --- block device --- bdev_lock: true, @@ -822,8 +820,8 @@ void parse_config_options(std::vector& args) g_conf.ebofs_bc_size = atoi(args[++i]); else if (strcmp(args[i], "--ebofs_bc_max_dirty") == 0) g_conf.ebofs_bc_max_dirty = atoi(args[++i]); - else if (strcmp(args[i], "--ebofs_abp_max_alloc") == 0) - g_conf.ebofs_abp_max_alloc = atoi(args[++i]); + else if (strcmp(args[i], "--ebofs_verify_csum_on_read") == 0) + g_conf.ebofs_verify_csum_on_read = atoi(args[++i]); else if (strcmp(args[i], "--ebofs_max_prefetch") == 0) g_conf.ebofs_max_prefetch = atoi(args[++i]); else if (strcmp(args[i], "--ebofs_realloc") == 0) diff --git a/branches/ebofs/config.h b/branches/ebofs/config.h index 442d7f9a69b93..13a49b1d39237 100644 --- a/branches/ebofs/config.h +++ b/branches/ebofs/config.h @@ -298,9 +298,7 @@ struct md_config_t { off_t ebofs_bc_max_dirty; unsigned ebofs_max_prefetch; bool ebofs_realloc; - - bool ebofs_abp_zero; - size_t ebofs_abp_max_alloc; + bool ebofs_verify_csum_on_read; // block device bool bdev_lock; diff --git a/branches/ebofs/ebofs/BufferCache.cc b/branches/ebofs/ebofs/BufferCache.cc index 57c02a667c295..172efe7df8c54 100644 --- a/branches/ebofs/ebofs/BufferCache.cc +++ b/branches/ebofs/ebofs/BufferCache.cc @@ -26,9 +26,8 @@ void do_apply_partial(bufferlist& bl, map& pm) for (map::iterator i = pm.begin(); i != pm.end(); i++) { - int pos = i->first; - //cout << " frag at opos " << i->first << " bhpos " << pos << " len " << i->second.length() << std::endl; - bl.copy_in(pos, i->second.length(), i->second); + cout << "do_apply_partial at " << i->first << "~" << i->second.length() << std::endl; + bl.copy_in(i->first, i->second.length(), i->second); } pm.clear(); } @@ -49,8 +48,6 @@ void BufferHead::add_partial(off_t off, bufferlist& p) assert(off >= 0); assert(off + len <= EBOFS_BLOCK_SIZE); - csum_t csum_diff = calc_csum_realign(p.c_str(), p.length(), off); - // trim any existing that overlaps map::iterator i = partial.begin(); while (i != partial.end()) { @@ -67,7 +64,6 @@ void BufferHead::add_partial(off_t off, bufferlist& p) // overlap all of i? if (off <= i->first && off+len >= i->first + i->second.length()) { // erase it and move on. - csum_diff -= calc_csum_realign(i->second.c_str(), i->second.length(), i->first); partial.erase(i++); continue; } @@ -75,7 +71,6 @@ void BufferHead::add_partial(off_t off, bufferlist& p) if (off > i->first && off+len >= i->first + i->second.length()) { // shorten i. unsigned taillen = off - i->first; - csum_diff -= calc_csum_realign(i->second.c_str()+taillen, taillen, off); bufferlist o; o.claim( i->second ); i->second.substr_of(o, 0, taillen); @@ -87,7 +82,6 @@ void BufferHead::add_partial(off_t off, bufferlist& p) // move i (make new tail). off_t tailoff = off+len; unsigned trim = tailoff - i->first; - csum_diff -= calc_csum_realign(i->second.c_str(), trim, i->first); partial[tailoff].substr_of(i->second, trim, i->second.length()-trim); partial.erase(i++); // should now be at tailoff i++; @@ -104,7 +98,6 @@ void BufferHead::add_partial(off_t off, bufferlist& p) unsigned tailoff = off+len - i->first; unsigned taillen = o.length() - len - headlen; partial[off+len].substr_of(o, tailoff, taillen); - csum_diff -= calc_csum_realign(o.c_str()+headlen, taillen, off); break; } assert(0); @@ -112,25 +105,23 @@ void BufferHead::add_partial(off_t off, bufferlist& p) // insert and adjust csum partial[off] = p; - csum_t *csum = oc->on->get_extent_csum_ptr(start()); - csum[0] += csum_diff; - oc->on->data_csum += csum_diff; - dout(10) << "add_partial off " << off << "~" << p.length() - << " csum_diff " << hex << csum_diff << " now " - << csum[0] << dec << dendl; + dout(10) << "add_partial off " << off << "~" << p.length() << dendl; } void BufferHead::apply_partial() { + assert(!partial.empty()); + dout(10) << "apply_partial on " << partial.size() << " substrings" << dendl; + csum_t oldc = calc_csum(data.c_str(), EBOFS_BLOCK_SIZE); do_apply_partial(data, partial); - csum_t new_csum = calc_csum(data.c_str(), EBOFS_BLOCK_SIZE); - csum_t *oldp = oc->on->get_extent_csum_ptr(start()); - if (new_csum != *oldp) { - dout(10) << "apply_partial old_csum " << hex << *oldp << " calced_csum " << new_csum << dec << dendl; - assert(*oldp == new_csum); - } - partial.clear(); + csum_t newc = calc_csum(data.c_str(), EBOFS_BLOCK_SIZE); + csum_t *p = oc->on->get_extent_csum_ptr(start(), 1); + dout(10) << "apply_partial was " << hex << oldc + << " now " << newc << dec << dendl; + assert(*p == oldc); + *p = newc; + oc->on->data_csum += newc - oldc; } @@ -147,9 +138,11 @@ void ObjectCache::rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist list waiters; dout(10) << "rx_finish " << start << "~" << length << dendl; - for (map::iterator p = data.lower_bound(start); - p != data.end(); - p++) { + map::iterator p, next; + for (p = data.lower_bound(start); p != data.end(); p = next) { + next = p; + next++; + BufferHead *bh = p->second; dout(10) << "rx_finish ?" << *bh << dendl; assert(p->first == bh->start()); @@ -166,13 +159,71 @@ void ObjectCache::rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist if (bh->rx_ioh == ioh) bh->rx_ioh = 0; + // trigger waiters + for (map >::iterator p = bh->waitfor_read.begin(); + p != bh->waitfor_read.end(); + p++) { + assert(p->first >= bh->start() && p->first < bh->end()); + waiters.splice(waiters.begin(), p->second); + } + bh->waitfor_read.clear(); + if (bh->is_rx()) { assert(bh->get_version() == 0); assert(bh->end() <= start+length); assert(bh->start() >= start); - dout(10) << "rx_finish rx -> clean on " << *bh << dendl; + bh->data.substr_of(bl, (bh->start()-start)*EBOFS_BLOCK_SIZE, bh->length()*EBOFS_BLOCK_SIZE); - bc->mark_clean(bh); + + // verify checksum + int bad = 0; + if (g_conf.ebofs_verify_csum_on_read) { + csum_t *want = bh->oc->on->get_extent_csum_ptr(bh->start(), bh->length()); + csum_t got[bh->length()]; + for (unsigned i=0; ilength(); i++) { + got[i] = calc_csum(&bh->data[i*EBOFS_BLOCK_SIZE], EBOFS_BLOCK_SIZE); + if (false && rand() % 10 == 0) { + dout(0) << "rx_finish HACK INJECTING bad csum" << dendl; + got[i] = 0; + } + if (got[i] != want[i]) { + dout(0) << "rx_finish bad csum wanted " << hex << want[i] << " got " << got[i] << dec + << " for object block " << (i+bh->start()) + << dendl; + bad++; + } + } + if (bad) { + block_t ostart = bh->start(); + block_t olen = bh->length(); + for (unsigned s=0; ssplit(bh, ostart+s); + dout(0) << "rx_finish rx -> clean on " << *bh << dendl; + bc->mark_clean(bh); + bh = middle; + } + BufferHead *right = bh; + if (e < olen) + right = bc->split(bh, ostart+e); + dout(0) << "rx_finish rx -> corrupt on " << *bh <mark_corrupt(bh); + bh = right; + s = e; + } + } + } + } + if (bh) { + dout(10) << "rx_finish rx -> clean on " << *bh << dendl; + bc->mark_clean(bh); + } } else if (bh->is_partial()) { dout(10) << "rx_finish partial -> tx on " << *bh << dendl; @@ -213,14 +264,6 @@ void ObjectCache::rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist bh->is_clean()); // was overwritten, queued, _and_ flushed to disk } - // trigger waiters - for (map >::iterator p = bh->waitfor_read.begin(); - p != bh->waitfor_read.end(); - p++) { - assert(p->first >= bh->start() && p->first < bh->end()); - waiters.splice(waiters.begin(), p->second); - } - bh->waitfor_read.clear(); } finish_contexts(waiters); @@ -364,6 +407,9 @@ int ObjectCache::try_map_read(block_t start, block_t len) e->is_tx()) { dout(20) << "try_map_read hit " << *e << dendl; } + else if (e->is_corrupt()) { + dout(20) << "try_map_read corrupt " << *e << dendl; + } else if (e->is_rx()) { dout(20) << "try_map_read rx " << *e << dendl; num_missing++; @@ -420,7 +466,8 @@ int ObjectCache::map_read(block_t start, block_t len, map& hits, map& missing, map& rx, - map& partial) { + map& partial, + map& corrupt) { map::iterator p = data.lower_bound(start); @@ -476,6 +523,10 @@ int ObjectCache::map_read(block_t start, block_t len, dout(20) << "map_read hit " << *e << dendl; bc->touch(e); } + else if (e->is_corrupt()) { + corrupt[cur] = e; + dout(20) << "map_read corrupt " << *e << dendl; + } else if (e->is_rx()) { rx[cur] = e; // missing, not readable. dout(20) << "map_read rx " << *e << dendl; @@ -953,7 +1004,7 @@ void ObjectCache::scrub_csums() on->map_extents(bh->start()+i, 1, exv, 0); assert(exv.size() == 1); if (exv[0].start == 0) continue; // hole. - csum_t want = *on->get_extent_csum_ptr(bh->start()+i); + csum_t want = *on->get_extent_csum_ptr(bh->start()+i, 1); csum_t b = calc_csum(&bh->data[i*EBOFS_BLOCK_SIZE], EBOFS_BLOCK_SIZE); if (b != want) { dout(0) << "scrub_csums bad data at " << (bh->start()+i) << " have " @@ -1220,11 +1271,7 @@ void BufferCache::rx_finish(ObjectCache *oc, p++) { dout(10) << "rx_finish partial from " << pblock << " -> " << p->first << " for epoch " << p->second.epoch - //<< " (bh.epoch_modified is now " << bh->epoch_modified << ")" << dendl; - // this had better be a past epoch - //assert(p->epoch == epoch_modified - 1); // ?? - // make the combined block bufferlist combined; bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); diff --git a/branches/ebofs/ebofs/BufferCache.h b/branches/ebofs/ebofs/BufferCache.h index fcddefbdad886..3d6e4a3622232 100644 --- a/branches/ebofs/ebofs/BufferCache.h +++ b/branches/ebofs/ebofs/BufferCache.h @@ -43,6 +43,7 @@ class BufferHead : public LRUObject { const static int STATE_TX = 3; // Rw flushing to disk const static int STATE_RX = 4; // w reading from disk const static int STATE_PARTIAL = 5; // reading from disk, + partial content map. always 1 block. + const static int STATE_CORRUPT = 6; // data on disk doesn't match onode checksum public: ObjectCache *oc; @@ -146,6 +147,7 @@ class BufferHead : public LRUObject { bool is_tx() { return state == STATE_TX; } bool is_rx() { return state == STATE_RX; } bool is_partial() { return state == STATE_PARTIAL; } + bool is_corrupt() { return state == STATE_CORRUPT; } void add_shadow(BufferHead *dup) { shadows.insert(dup); @@ -346,7 +348,8 @@ class ObjectCache { map& hits, // hits map& missing, // read these from disk map& rx, // wait for these to finish reading from disk - map& partial); // (maybe) wait for these to read from disk + map& partial, // (maybe) wait for these to read from disk + map& corrupt); // bad checksums int try_map_read(block_t start, block_t len); // just tell us how many extents we're missing. @@ -397,7 +400,7 @@ class BufferCache { Cond flush_cond; int stat_waiter; - off_t stat_clean; + off_t stat_clean, stat_corrupt; off_t stat_dirty; off_t stat_rx; off_t stat_tx; @@ -433,7 +436,7 @@ class BufferCache { BufferCache(BlockDevice& d, Mutex& el) : ebofs_lock(el), dev(d), stat_waiter(0), - stat_clean(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_partial(0), stat_missing(0) + stat_clean(0), stat_corrupt(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_partial(0), stat_missing(0) {} @@ -483,6 +486,7 @@ class BufferCache { switch (bh->get_state()) { case BufferHead::STATE_MISSING: stat_missing += bh->length(); break; case BufferHead::STATE_CLEAN: stat_clean += bh->length(); break; + case BufferHead::STATE_CORRUPT: stat_corrupt += bh->length(); break; case BufferHead::STATE_DIRTY: stat_dirty += bh->length(); break; case BufferHead::STATE_TX: stat_tx += bh->length(); break; case BufferHead::STATE_RX: stat_rx += bh->length(); break; @@ -494,6 +498,7 @@ class BufferCache { switch (bh->get_state()) { case BufferHead::STATE_MISSING: stat_missing -= bh->length(); break; case BufferHead::STATE_CLEAN: stat_clean -= bh->length(); break; + case BufferHead::STATE_CORRUPT: stat_corrupt -= bh->length(); break; case BufferHead::STATE_DIRTY: stat_dirty -= bh->length(); break; case BufferHead::STATE_TX: stat_tx -= bh->length(); break; case BufferHead::STATE_RX: stat_rx -= bh->length(); break; @@ -564,6 +569,7 @@ class BufferCache { void mark_missing(BufferHead *bh) { set_state(bh, BufferHead::STATE_MISSING); }; void mark_clean(BufferHead *bh) { set_state(bh, BufferHead::STATE_CLEAN); }; + void mark_corrupt(BufferHead *bh) { set_state(bh, BufferHead::STATE_CORRUPT); }; void mark_rx(BufferHead *bh) { set_state(bh, BufferHead::STATE_RX); }; void mark_partial(BufferHead *bh) { set_state(bh, BufferHead::STATE_PARTIAL); }; void mark_tx(BufferHead *bh) { set_state(bh, BufferHead::STATE_TX); }; diff --git a/branches/ebofs/ebofs/Ebofs.cc b/branches/ebofs/ebofs/Ebofs.cc index 10aeb286ef80f..dda4aefa4ce52 100644 --- a/branches/ebofs/ebofs/Ebofs.cc +++ b/branches/ebofs/ebofs/Ebofs.cc @@ -1304,7 +1304,7 @@ void Ebofs::trim_bc(off_t max) if (!bh) break; dout(25) << "trim_bc trimming " << *bh << dendl; - assert(bh->is_clean()); + assert(bh->is_clean() || bh->is_corrupt()); ObjectCache *oc = bh->oc; bc.remove_bh(bh); @@ -1434,7 +1434,8 @@ int Ebofs::statfs(struct statfs *buf) void Ebofs::alloc_write(Onode *on, block_t start, block_t len, interval_set& alloc, - block_t& old_bfirst, block_t& old_blast) + block_t& old_bfirst, block_t& old_blast, + csum_t& old_csum_first, csum_t& old_csum_last) { // first decide what pages to (re)allocate alloc.insert(start, len); // start with whole range @@ -1461,14 +1462,21 @@ void Ebofs::alloc_write(Onode *on, // take note if first/last blocks in write range are remapped.. in case we need to do a partial read/write thing // these are for partial, so we don't care about TX bh's, so don't worry about bits canceling stuff below. - if (!old.empty() && old[0].start) { // ..if not a hole.. - if (i->first == start) { + if (!old.empty()) { + if (old[0].start && + i->first == start) { // ..if not a hole.. old_bfirst = old[0].start; - dout(20) << "alloc_write old_bfirst " << old_bfirst << " of " << old[0] << dendl; + old_csum_first = *on->get_extent_csum_ptr(start, 1); + dout(20) << "alloc_write old_bfirst " << old_bfirst << " of " << old[0] + << " csum " << old_csum_first << dendl; } - if (i->first+i->second == start+len) { + if (old[old.size()-1].start && + i->first+i->second == start+len && + start+len <= on->last_block) { old_blast = old[old.size()-1].last(); - dout(20) << "alloc_write old_blast " << old_blast << " of " << old[old.size()-1] << dendl; + old_csum_last = *on->get_extent_csum_ptr(start+len-1, 1); + dout(20) << "alloc_write old_blast " << old_blast << " of " << old[old.size()-1] + << " csum " << old_csum_last << dendl; } } } @@ -1621,7 +1629,8 @@ void Ebofs::apply_write(Onode *on, off_t off, size_t len, const bufferlist& bl) interval_set alloc; block_t old_bfirst = 0; // zero means not defined here (since we ultimately pass to bh_read) block_t old_blast = 0; - alloc_write(on, bstart, blen, alloc, old_bfirst, old_blast); + csum_t old_csum_first, old_csum_last; + alloc_write(on, bstart, blen, alloc, old_bfirst, old_blast, old_csum_first, old_csum_last); dout(20) << "apply_write old_bfirst " << old_bfirst << ", old_blast " << old_blast << dendl; if (fake_writes) { @@ -1697,6 +1706,21 @@ void Ebofs::apply_write(Onode *on, off_t off, size_t len, const bufferlist& bl) assert(bh->is_partial() || bh->is_rx() || bh->is_missing()); assert(bh->length() == 1); + if (bh->is_missing()) { + // newly realloc; carry old checksum over since we're only partially overwriting + if (bh->start() == bstart) { + dout(10) << "apply_write carrying over starting csum " << hex << old_csum_first << dec + << " for partial " << *bh << dendl; + *on->get_extent_csum_ptr(bh->start(), 1) = old_csum_first; + on->data_csum += old_csum_first; + } else if (bh->end()-1 == blast) { + dout(10) << "apply_write carrying over ending csum " << hex << old_csum_last << dec + << " for partial " << *bh << dendl; + *on->get_extent_csum_ptr(bh->end()-1, 1) = old_csum_last; + on->data_csum += old_csum_last; + } else assert(0); + } + // add frag to partial dout(10) << "apply_write writing into partial " << *bh << ":" << " off_in_bh " << off_in_bh @@ -1787,15 +1811,15 @@ void Ebofs::apply_write(Onode *on, off_t off, size_t len, const bufferlist& bl) block_t rbfirst = off_in_bh/EBOFS_BLOCK_SIZE; block_t rblast = (off_in_bh+len_in_bh+4095)/EBOFS_BLOCK_SIZE; block_t bnum = rblast-rbfirst; - csum_t *csum = on->get_extent_csum_ptr(bh->start()+rbfirst); - dout(10) << "calc csum for " << rbfirst << "~" << bnum << dendl; + csum_t *csum = on->get_extent_csum_ptr(bh->start()+rbfirst, bnum); + dout(20) << "calc csum for " << rbfirst << "~" << bnum << dendl; for (unsigned i=0; idata_csum -= csum[i]; - dout(10) << "old csum for " << (i+rbfirst) << " is " << hex << csum[i] << dec << dendl; + dout(30) << "old csum for " << (i+rbfirst) << " is " << hex << csum[i] << dec << dendl; csum[i] = calc_csum(&bh->data[i*EBOFS_BLOCK_SIZE], EBOFS_BLOCK_SIZE); - dout(10) << "new csum for " << (i+rbfirst) << " is " << hex << csum[i] << dec << dendl; + dout(30) << "new csum for " << (i+rbfirst) << " is " << hex << csum[i] << dec << dendl; on->data_csum += csum[i]; - dout(10) << "new data_csum is " << hex << on->data_csum << dec << dendl; + dout(30) << "new data_csum is " << hex << on->data_csum << dec << dendl; } blpos += len_in_bh; @@ -1849,8 +1873,8 @@ void Ebofs::apply_write(Onode *on, off_t off, size_t len, const bufferlist& bl) } // fill in csums - csum_t *csum = on->get_extent_csum_ptr(bh->start()); unsigned blocks = (len_in_bh + 4095)/ EBOFS_BLOCK_SIZE; + csum_t *csum = on->get_extent_csum_ptr(bh->start(), blocks); for (unsigned i=0; idata_csum -= csum[i]; csum[i] = calc_csum(bh->data.c_str() + i*EBOFS_BLOCK_SIZE, EBOFS_BLOCK_SIZE); @@ -1963,8 +1987,8 @@ void Ebofs::apply_zero(Onode *on, off_t off, size_t len) // *** file i/o *** -bool Ebofs::attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, - Cond *will_wait_on, bool *will_wait_on_bool) +int Ebofs::attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, + Cond *will_wait_on, bool *will_wait_on_bool) { dout(10) << "attempt_read " << *on << " " << off << "~" << len << dendl; ObjectCache *oc = on->get_oc(&bc); @@ -1978,7 +2002,8 @@ bool Ebofs::attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, map missing; // read these map rx; // wait for these map partials; // ?? - oc->map_read(bstart, blen, hits, missing, rx, partials); + map corrupt; + oc->map_read(bstart, blen, hits, missing, rx, partials, corrupt); // missing buffers? if (!missing.empty()) { @@ -1991,7 +2016,7 @@ bool Ebofs::attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, BufferHead *wait_on = missing.begin()->second; block_t b = MAX(wait_on->start(), bstart); wait_on->waitfor_read[b].push_back(new C_Cond(will_wait_on, will_wait_on_bool)); - return false; + return 0; } // are partials sufficient? @@ -2015,7 +2040,7 @@ bool Ebofs::attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, partials_ok = false; } } - if (!partials_ok) return false; + if (!partials_ok) return 0; // wait on rx? if (!rx.empty()) { @@ -2024,13 +2049,14 @@ bool Ebofs::attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, dout(20) << "attempt_read waiting for read to finish on " << *wait_on << " c " << c << dendl; block_t b = MAX(wait_on->start(), bstart); wait_on->waitfor_read[b].push_back(c); - return false; + return 0; } // yay, we have it all! // concurrently walk thru hits, partials. map::iterator h = hits.begin(); map::iterator p = partials.begin(); + map::iterator c = corrupt.begin(); bl.clear(); off_t pos = off; @@ -2043,6 +2069,9 @@ bool Ebofs::attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, } else if (p->first == curblock) { bh = p->second; p++; + } else if (c->first == curblock) { + bh = c->second; + c++; } else assert(0); off_t bhstart = (off_t)(bh->start()*EBOFS_BLOCK_SIZE); @@ -2050,7 +2079,15 @@ bool Ebofs::attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, off_t start = MAX( pos, bhstart ); off_t end = MIN( off+(off_t)len, bhend ); - if (bh->is_partial()) { + if (bh->is_corrupt()) { + if (bl.length()) { + dout(10) << "attempt_read corrupt at " << *bh << ", returning short result" << dendl; + return 1; + } else { + dout(10) << "attempt_read corrupt at " << *bh << ", returning -EIO" << dendl; + return -EIO; + } + } else if (bh->is_partial()) { // copy from a partial block. yuck! bufferlist frag; bh->copy_partial_substr( start-bhstart, end-bhstart, frag ); @@ -2087,7 +2124,7 @@ bool Ebofs::attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, } assert(bl.length() == len); - return true; + return 1; } @@ -2207,7 +2244,7 @@ int Ebofs::_read(object_t oid, off_t off, size_t len, bufferlist& bl) // check size bound if (off >= on->object_size) { dout(7) << "_read " << oid << " " << off << "~" << len << " ... off past eof " << on->object_size << dendl; - r = -ESPIPE; // FIXME better errno? + r = 0; break; } @@ -2215,8 +2252,9 @@ int Ebofs::_read(object_t oid, off_t off, size_t len, bufferlist& bl) size_t will_read = MIN(off+(off_t)try_len, on->object_size) - off; bool done; - if (attempt_read(on, off, will_read, bl, &cond, &done)) - break; // yay + r = attempt_read(on, off, will_read, bl, &cond, &done); + if (r != 0) + break; // wait while (!done) diff --git a/branches/ebofs/ebofs/Ebofs.h b/branches/ebofs/ebofs/Ebofs.h index c9e3d9b8c5ab1..b8bdb9b45b528 100644 --- a/branches/ebofs/ebofs/Ebofs.h +++ b/branches/ebofs/ebofs/Ebofs.h @@ -188,11 +188,12 @@ protected: void alloc_write(Onode *on, block_t start, block_t len, interval_set& alloc, - block_t& old_bfirst, block_t& old_blast); + block_t& old_bfirst, block_t& old_blast, + csum_t& old_csum_first, csum_t& old_csum_last); void apply_write(Onode *on, off_t off, size_t len, const bufferlist& bl); void apply_zero(Onode *on, off_t off, size_t len); - bool attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, - Cond *will_wait_on, bool *will_wait_on_bool); + int attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, + Cond *will_wait_on, bool *will_wait_on_bool); // ** finisher ** // async write notification to users diff --git a/branches/ebofs/ebofs/Onode.h b/branches/ebofs/ebofs/Onode.h index 6bc074c4dbbda..ec0f9bdd9b9a7 100644 --- a/branches/ebofs/ebofs/Onode.h +++ b/branches/ebofs/ebofs/Onode.h @@ -199,13 +199,13 @@ public: } } - csum_t *get_extent_csum_ptr(block_t offset) { + csum_t *get_extent_csum_ptr(block_t offset, block_t len) { map::iterator p = extent_map.lower_bound(offset); if (p == extent_map.end() || p->first > offset) p--; assert(p->first <= offset); assert(p->second.ex.start != 0); - assert(offset < p->first + p->second.ex.length); + assert(offset+len <= p->first + p->second.ex.length); return &p->second.csum[offset-p->first]; } diff --git a/branches/ebofs/ebofs/test.ebofs.cc b/branches/ebofs/ebofs/test.ebofs.cc index 2b96168b51007..8dcc4961e2666 100644 --- a/branches/ebofs/ebofs/test.ebofs.cc +++ b/branches/ebofs/ebofs/test.ebofs.cc @@ -40,7 +40,7 @@ public: while (!stop) { object_t oid; - oid.ino = (rand() % 10) + 0x10000000; + oid.ino = (rand() % 1000) + 0x10000000; coll_t cid = rand() % 50; off_t off = rand() % 10000;//0;//rand() % 1000000; off_t len = 1+rand() % 100000; @@ -48,7 +48,7 @@ public: if (rand() % 2) a = "two"; int l = 3;//rand() % 10; - switch (rand() % 4) {//10) { + switch (rand() % 5) {//10) { case 0: { oid.rev = rand() % 10; @@ -92,31 +92,38 @@ public: break; case 3: + { + cout << t << " truncate " << hex << oid << dec << " " << off << std::endl; + fs.truncate(oid, 0); + } + break; + + case 4: cout << t << " remove " << hex << oid << dec << std::endl; fs.remove(oid); break; - case 4: + case 5: cout << t << " collection_add " << hex << oid << dec << " to " << cid << std::endl; fs.collection_add(cid, oid, 0); break; - case 5: + case 6: cout << t << " collection_remove " << hex << oid << dec << " from " << cid << std::endl; fs.collection_remove(cid, oid, 0); break; - case 6: + case 7: cout << t << " setattr " << hex << oid << dec << " " << a << " len " << l << std::endl; fs.setattr(oid, a, (void*)a, l, 0); break; - case 7: + case 8: cout << t << " rmattr " << hex << oid << dec << " " << a << std::endl; fs.rmattr(oid,a); break; - case 8: + case 9: { char v[4]; cout << t << " getattr " << hex << oid << dec << " " << a << std::endl; @@ -127,13 +134,6 @@ public: } break; - case 9: - { - cout << t << " truncate " << hex << oid << dec << " " << off << std::endl; - fs.truncate(oid, 0); - } - break; - case 10: { object_t newoid = oid; -- 2.39.5