From b108f5ea3f7514242a0e5fc173522e6023d069d9 Mon Sep 17 00:00:00 2001 From: sageweil Date: Tue, 18 Dec 2007 00:34:38 +0000 Subject: [PATCH] cleanup; onode checksums; fixed stat_ bug git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@2217 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/ebofs/ebofs/BufferCache.cc | 52 ++---- branches/ebofs/ebofs/BufferCache.h | 44 +++-- branches/ebofs/ebofs/Ebofs.cc | 265 +++++++++++++++------------- branches/ebofs/ebofs/Ebofs.h | 1 + branches/ebofs/ebofs/Onode.h | 33 ++-- branches/ebofs/ebofs/types.h | 30 ++-- 6 files changed, 230 insertions(+), 195 deletions(-) diff --git a/branches/ebofs/ebofs/BufferCache.cc b/branches/ebofs/ebofs/BufferCache.cc index ed3cffd60cbe7..db826694e9d2b 100644 --- a/branches/ebofs/ebofs/BufferCache.cc +++ b/branches/ebofs/ebofs/BufferCache.cc @@ -26,7 +26,7 @@ void do_apply_partial(bufferlist& bl, map& pm) for (map::iterator i = pm.begin(); i != pm.end(); i++) { - cout << "do_apply_partial at " << i->first << "~" << i->second.length() << std::endl; + //cout << "do_apply_partial at " << i->first << "~" << i->second.length() << std::endl; bl.copy_in(i->first, i->second.length(), i->second); } pm.clear(); @@ -498,9 +498,7 @@ int ObjectCache::map_read(block_t start, block_t len, left, // no prefetch exv, 0); for (unsigned i=0; i 0; i++) { - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( exv[i].length ); + BufferHead *n = new BufferHead(this, cur, exv[i].length); if (exv[i].start) { missing[cur] = n; dout(20) << "map_read miss " << left << " left, " << *n << dendl; @@ -562,9 +560,7 @@ int ObjectCache::map_read(block_t start, block_t len, exv, 0); for (unsigned i=0; i0; i++) { - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( exv[i].length ); + BufferHead *n = new BufferHead(this, cur, exv[i].length); if (exv[i].start) { missing[cur] = n; dout(20) << "map_read gap " << *n << dendl; @@ -648,12 +644,10 @@ int ObjectCache::map_write(block_t start, block_t len, // at end? if (p == data.end()) { - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( max ); - bc->add_bh(n); + BufferHead *n = new BufferHead(this, cur, max); if (hole) n->set_state(BufferHead::STATE_CLEAN); // hole + bc->add_bh(n); hits[cur] = n; left -= max; cur += max; @@ -744,9 +738,7 @@ int ObjectCache::map_write(block_t start, block_t len, block_t next = p->first; block_t glen = MIN(next-cur, max); dout(10) << "map_write gap " << cur << "~" << glen << dendl; - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( glen ); + BufferHead *n = new BufferHead(this, cur, glen); if (hole) n->set_state(BufferHead::STATE_CLEAN); // hole bc->add_bh(n); @@ -841,7 +833,6 @@ void ObjectCache::discard_bh(BufferHead *bh, version_t super_epoch) finish_contexts(p->second, -1); bc->remove_bh(bh); - delete bh; } void ObjectCache::truncate(block_t blocks, version_t super_epoch) @@ -889,9 +880,7 @@ void ObjectCache::clone_to(Onode *other) // dup dirty or tx bh's if (!ton) ton = other->get_oc(bc); - BufferHead *nbh = new BufferHead(ton); - nbh->set_start( bh->start() ); - nbh->set_length( bh->length() ); + BufferHead *nbh = new BufferHead(ton, bh->start(), bh->length()); nbh->data = bh->data; // just copy refs to underlying buffers. bc->add_bh(nbh); @@ -928,13 +917,13 @@ BufferHead *ObjectCache::merge_bh_left(BufferHead *left, BufferHead *right) if (right->version > left->version) left->version = right->version; if (right->last_flushed > left->last_flushed) left->last_flushed = right->last_flushed; - left->set_length(left->length() + right->length()); + bc->stat_sub(left); + left->reset_length(left->length() + right->length()); + bc->stat_add(left); left->data.claim_append(right->data); // remove right - remove_bh(right); - bc->lru_rest.lru_remove(right); - delete right; + bc->remove_bh(right); dout(10) << "merge_bh_left result " << *left << dendl; return left; } @@ -1044,24 +1033,19 @@ BufferHead *BufferCache::split(BufferHead *orig, block_t after) dout(20) << "split " << *orig << " at " << after << dendl; // split off right - BufferHead *right = new BufferHead(orig->get_oc()); + block_t newleftlen = after - orig->start(); + BufferHead *right = new BufferHead(orig->get_oc(), after, orig->length() - newleftlen); right->set_version(orig->get_version()); right->epoch_modified = orig->epoch_modified; right->last_flushed = orig->last_flushed; right->set_state(orig->get_state()); - - block_t newleftlen = after - orig->start(); - right->set_start( after ); - right->set_length( orig->length() - newleftlen ); + add_bh(right); // shorten left stat_sub(orig); - orig->set_length( newleftlen ); + orig->reset_length( newleftlen ); stat_add(orig); - // add right - add_bh(right); - // adjust rx_from if (orig->is_rx()) { right->rx_from = orig->rx_from; @@ -1272,7 +1256,7 @@ void BufferCache::rx_finish(ObjectCache *oc, // verify csum csum_t actual = calc_csum(bl.c_str(), bl.length()); - if (actual != sp->second.csum) { + if (actual != sp->second.csum || rand() % 5 == 0) { dout(0) << "rx_finish bad csum on partial block " << pblock << dendl; derr(0) << "rx_finish bad csum on partial block " << pblock << " ****************" << dendl; poison_commit = true; @@ -1342,10 +1326,10 @@ void BufferCache::rx_finish(ObjectCache *oc, bh->data.clear(); bh->data.push_back( bp ); bh->data.copy_in((pblock-diskstart)*EBOFS_BLOCK_SIZE, - (pblock-diskstart+1)*EBOFS_BLOCK_SIZE, + EBOFS_BLOCK_SIZE, bl); bh->apply_partial(); - bh->set_state(BufferHead::STATE_CLEAN); + mark_clean(bh); // trigger waiters for (map >::iterator p = bh->waitfor_read.begin(); diff --git a/branches/ebofs/ebofs/BufferCache.h b/branches/ebofs/ebofs/BufferCache.h index aa86a02f31255..8fae665ffe9d0 100644 --- a/branches/ebofs/ebofs/BufferCache.h +++ b/branches/ebofs/ebofs/BufferCache.h @@ -83,11 +83,12 @@ class BufferHead : public LRUObject { bool want_to_expire; // wants to be at bottom of lru public: - BufferHead(ObjectCache *o) : + BufferHead(ObjectCache *o, block_t start, block_t len) : oc(o), //cancellable_ioh(0), tx_epoch(0), rx_ioh(0), tx_ioh(0), tx_block(0), partial_tx_to(0), partial_tx_epoch(0), shadow_of(0), ref(0), state(STATE_MISSING), epoch_modified(0), version(0), last_flushed(0), + object_loc(start, len), //xlist_dirty(this), want_to_expire(false) {} @@ -111,9 +112,9 @@ class BufferHead : public LRUObject { int get_num_ref() { return ref; } block_t start() { return object_loc.start; } - void set_start(block_t s) { object_loc.start = s; } + //void set_start(block_t s) { object_loc.start = s; } block_t length() { return object_loc.length; } - void set_length(block_t l) { object_loc.length = l; } + void reset_length(block_t l) { object_loc.length = l; } block_t end() { return start() + length(); } block_t last() { return end()-1; } @@ -310,7 +311,7 @@ class ObjectCache { pobject_t get_object_id() { return object_id; } - void add_bh(BufferHead *bh) { + void add_oc_bh(BufferHead *bh) { // add to my map assert(data.count(bh->start()) == 0); @@ -333,7 +334,7 @@ class ObjectCache { data[bh->start()] = bh; } - void remove_bh(BufferHead *bh) { + void remove_oc_bh(BufferHead *bh) { assert(data.count(bh->start())); data.erase(bh->start()); } @@ -405,6 +406,7 @@ class BufferCache { Cond flush_cond; int stat_waiter; + off_t stat_all; off_t stat_clean, stat_corrupt; off_t stat_dirty; off_t stat_rx; @@ -446,21 +448,22 @@ class BufferCache { BufferCache(BlockDevice& d, Mutex& el) : ebofs_lock(el), dev(d), stat_waiter(0), - stat_clean(0), stat_corrupt(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_partial(0), stat_missing(0) + stat_all(0), stat_clean(0), stat_corrupt(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_partial(0), stat_missing(0) {} off_t get_size() { - return stat_clean+stat_dirty+stat_rx+stat_tx+stat_partial; + assert(stat_clean+stat_dirty+stat_rx+stat_tx+stat_partial+stat_corrupt+stat_missing == stat_all); + return stat_all; } off_t get_trimmable() { - return stat_clean; + return stat_clean+stat_corrupt; } // bh's in cache void add_bh(BufferHead *bh) { - bh->get_oc()->add_bh(bh); + bh->get_oc()->add_oc_bh(bh); if (bh->is_dirty()) { lru_dirty.lru_insert_mid(bh); //dirty_bh.push_back(&bh->xlist_dirty); @@ -482,17 +485,19 @@ class BufferCache { lru_rest.lru_bottouch(bh); } void remove_bh(BufferHead *bh) { - bh->get_oc()->remove_bh(bh); + bh->get_oc()->remove_oc_bh(bh); stat_sub(bh); if (bh->is_dirty()) { lru_dirty.lru_remove(bh); //dirty_bh.push_back(&bh->xlist_dirty); } else lru_rest.lru_remove(bh); + delete bh; } // stats void stat_add(BufferHead *bh) { + assert(stat_clean+stat_dirty+stat_rx+stat_tx+stat_partial+stat_corrupt+stat_missing == stat_all); switch (bh->get_state()) { case BufferHead::STATE_MISSING: stat_missing += bh->length(); break; case BufferHead::STATE_CLEAN: stat_clean += bh->length(); break; @@ -501,19 +506,24 @@ class BufferCache { case BufferHead::STATE_TX: stat_tx += bh->length(); break; case BufferHead::STATE_RX: stat_rx += bh->length(); break; case BufferHead::STATE_PARTIAL: stat_partial += bh->length(); break; + default: assert(0); } + stat_all += bh->length(); if (stat_waiter) stat_cond.Signal(); } void stat_sub(BufferHead *bh) { + assert(stat_clean+stat_dirty+stat_rx+stat_tx+stat_partial+stat_corrupt+stat_missing == stat_all); switch (bh->get_state()) { - case BufferHead::STATE_MISSING: stat_missing -= bh->length(); break; - case BufferHead::STATE_CLEAN: stat_clean -= bh->length(); break; - case BufferHead::STATE_CORRUPT: stat_corrupt -= bh->length(); break; - case BufferHead::STATE_DIRTY: stat_dirty -= bh->length(); break; - case BufferHead::STATE_TX: stat_tx -= bh->length(); break; - case BufferHead::STATE_RX: stat_rx -= bh->length(); break; - case BufferHead::STATE_PARTIAL: stat_partial -= bh->length(); break; + case BufferHead::STATE_MISSING: stat_missing -= bh->length(); assert(stat_missing >= 0); break; + case BufferHead::STATE_CLEAN: stat_clean -= bh->length(); assert(stat_clean >= 0); break; + case BufferHead::STATE_CORRUPT: stat_corrupt -= bh->length(); assert(stat_corrupt >= 0); break; + case BufferHead::STATE_DIRTY: stat_dirty -= bh->length(); assert(stat_dirty >= 0); break; + case BufferHead::STATE_TX: stat_tx -= bh->length(); assert(stat_tx >= 0); break; + case BufferHead::STATE_RX: stat_rx -= bh->length(); assert(stat_rx >= 0); break; + case BufferHead::STATE_PARTIAL: stat_partial -= bh->length(); assert(stat_partial >= 0); break; + default: assert(0); } + stat_all -= bh->length(); } off_t get_stat_tx() { return stat_tx; } off_t get_stat_rx() { return stat_rx; } diff --git a/branches/ebofs/ebofs/Ebofs.cc b/branches/ebofs/ebofs/Ebofs.cc index 2a2666e3222fb..5a59745b4bc62 100644 --- a/branches/ebofs/ebofs/Ebofs.cc +++ b/branches/ebofs/ebofs/Ebofs.cc @@ -663,6 +663,81 @@ Onode* Ebofs::new_onode(pobject_t oid) return on; } +Onode* Ebofs::decode_onode(bufferlist& bl, unsigned& off) +{ + // verify csum + struct ebofs_onode *eo = (struct ebofs_onode*)(bl.c_str() + off); + if (eo->onode_bytes > bl.length() - off) { + derr(0) << "obviously corrupt onode (bad onode_bytes)" << dendl; + return 0; + } + csum_t actual = calc_csum(bl.c_str() + off + sizeof(csum_t), + eo->onode_bytes - sizeof(csum_t)); + if (actual != eo->onode_csum) { + derr(0) << "corrupt onode (bad csum actual " << actual << " != " << eo->onode_csum << ")" << dendl; + return 0; + } + + // add onode + Onode *on = new Onode(eo->object_id); + + // parse data block + on->readonly = eo->readonly; + on->onode_loc = eo->onode_loc; + on->object_size = eo->object_size; + on->alloc_blocks = eo->alloc_blocks; + on->data_csum = eo->data_csum; + + // parse + char *p = (char*)(eo + 1); + + // parse collection list + for (int i=0; inum_collections; i++) { + coll_t c = *((coll_t*)p); + p += sizeof(c); + on->collections.insert(c); + } + + // parse attributes + for (unsigned i=0; inum_attr; i++) { + string key = p; + p += key.length() + 1; + int len = *(int*)(p); + p += sizeof(len); + on->attr[key] = buffer::copy(p, len); + p += len; + dout(15) << "get_onode " << *on << " attr " << key << " len " << len << dendl; + } + + // parse extents + on->extent_map.clear(); + block_t n = 0; + for (unsigned i=0; inum_extents; i++) { + Extent ex = *((Extent*)p); + p += sizeof(Extent); + on->extent_map[n].ex = ex; + if (ex.start) { + on->extent_map[n].csum.resize(ex.length); + memcpy(&on->extent_map[n].csum[0], p, sizeof(csum_t)*ex.length); + p += sizeof(csum_t)*ex.length; + } + dout(15) << "get_onode " << *on << " ex " << i << ": " << ex << dendl; + n += ex.length; + } + on->last_block = n; + + // parse bad byte extents + for (unsigned i=0; inum_bad_byte_extents; i++) { + Extent ex = *((Extent*)p); + p += sizeof(ex); + on->bad_byte_extents.insert(ex.start, ex.length); + dout(15) << "get_onode " << *on << " bad byte ex " << ex << dendl; + } + + unsigned len = p - (char*)eo; + assert(len == eo->onode_bytes); + return on; +} Onode* Ebofs::get_onode(pobject_t oid) { @@ -706,74 +781,15 @@ Onode* Ebofs::get_onode(pobject_t oid) ebofs_lock.Unlock(); dev.read( onode_loc.start, onode_loc.length, bl ); ebofs_lock.Lock(); - - // add onode - Onode *on = new Onode(oid); - onode_map[oid] = on; - onode_lru.lru_insert_top(on); - - // parse data block - struct ebofs_onode *eo = (struct ebofs_onode*)bl.c_str(); - if (eo->object_id != oid) { - dout(0) << " wrong oid in onode block: " << eo->object_id << " != " << oid << dendl; - dout(0) << " onode_loc is " << eo->onode_loc << dendl; - dout(0) << " object_size " << eo->object_size << dendl; - dout(0) << " alloc_blocks " << eo->alloc_blocks << dendl; - dout(0) << " " << eo->num_collections << " coll + " - << eo->num_attr << " attr + " - << eo->num_extents << " extents" << dendl; - assert(eo->object_id == oid); - } - on->readonly = eo->readonly; - on->onode_loc = eo->onode_loc; - on->object_size = eo->object_size; - on->alloc_blocks = eo->alloc_blocks; - - // parse - char *p = bl.c_str() + sizeof(*eo); - - // parse collection list - for (int i=0; inum_collections; i++) { - coll_t c = *((coll_t*)p); - p += sizeof(c); - on->collections.insert(c); - } - - // parse attributes - for (unsigned i=0; inum_attr; i++) { - string key = p; - p += key.length() + 1; - int len = *(int*)(p); - p += sizeof(len); - on->attr[key] = buffer::copy(p, len); - p += len; - dout(15) << "get_onode " << *on << " attr " << key << " len " << len << dendl; - } - - // parse extents - on->extent_map.clear(); - block_t n = 0; - for (unsigned i=0; inum_extents; i++) { - Extent ex = *((Extent*)p); - p += sizeof(Extent); - on->extent_map[n].ex = ex; - if (ex.start) { - on->extent_map[n].csum.resize(ex.length); - memcpy(&on->extent_map[n].csum[0], p, sizeof(csum_t)*ex.length); - p += sizeof(csum_t)*ex.length; - } - dout(15) << "get_onode " << *on << " ex " << i << ": " << ex << dendl; - n += ex.length; - } - on->last_block = n; - // parse bad byte extents - for (unsigned i=0; inum_bad_byte_extents; i++) { - Extent ex = *((Extent*)p); - p += sizeof(ex); - on->bad_byte_extents.insert(ex.start, ex.length); - dout(15) << "get_onode " << *on << " bad byte ex " << ex << dendl; + unsigned off = 0; + Onode *on = decode_onode(bl, off); + if (!on) { + assert(0); // corrupt! } + assert(on->object_id == oid); + onode_map[oid] = on; + onode_lru.lru_insert_top(on); // wake up other waiters for (list::iterator i = waitfor_onode[oid].begin(); @@ -801,6 +817,8 @@ public: void Ebofs::encode_onode(Onode *on, bufferlist& bl, unsigned& off) { + unsigned start_off = off; + // onode struct ebofs_onode eo; eo.readonly = on->readonly; @@ -808,6 +826,7 @@ void Ebofs::encode_onode(Onode *on, bufferlist& bl, unsigned& off) eo.object_id = on->object_id; eo.object_size = on->object_size; eo.alloc_blocks = on->alloc_blocks; + eo.data_csum = on->data_csum; eo.inline_bytes = 0; /* write me */ eo.num_collections = on->collections.size(); eo.num_attr = on->attr.size(); @@ -835,7 +854,7 @@ void Ebofs::encode_onode(Onode *on, bufferlist& bl, unsigned& off) off += sizeof(int); bl.copy_in(off, l, i->second.c_str()); off += l; - dout(15) << "write_onode " << *on << " attr " << i->first << " len " << l << dendl; + dout(15) << "encode_onode " << *on << " attr " << i->first << " len " << l << dendl; } // extents @@ -849,7 +868,7 @@ void Ebofs::encode_onode(Onode *on, bufferlist& bl, unsigned& off) bl.copy_in(off, sizeof(csum_t)*o.ex.length, (char*)&o.csum[0]); off += sizeof(csum_t)*o.ex.length; } - dout(15) << "write_onode " << *on << " ex " << i->first << ": " << o.ex << dendl; + dout(15) << "encode_onode " << *on << " ex " << i->first << ": " << o.ex << dendl; } // bad byte extents @@ -858,18 +877,25 @@ void Ebofs::encode_onode(Onode *on, bufferlist& bl, unsigned& off) p++) { Extent o(p->first, p->second); bl.copy_in(off, sizeof(o), (char*)&o); - dout(15) << "write_onode " << *on << " bad byte ex " << o << dendl; + dout(15) << "encode_onode " << *on << " bad byte ex " << o << dendl; } + + eo.onode_bytes = off - start_off; + bl.copy_in(start_off + sizeof(csum_t), sizeof(__u32), (char*)&eo.onode_bytes); + eo.onode_csum = calc_csum(bl.c_str() + start_off + sizeof(csum_t), + eo.onode_bytes - sizeof(csum_t)); + bl.copy_in(start_off, sizeof(csum_t), (char*)&eo); + dout(15) << "encode_onode len " << eo.onode_bytes << " csum " << eo.onode_csum << dendl; } void Ebofs::write_onode(Onode *on) { // buffer - unsigned bytes = sizeof(ebofs_onode) + on->get_collection_bytes() + on->get_attr_bytes() + on->get_extent_bytes(); - unsigned blocks = (bytes-1)/EBOFS_BLOCK_SIZE + 1; + unsigned bytes = on->get_ondisk_bytes(); + unsigned blocks = DIV_ROUND_UP(bytes, EBOFS_BLOCK_SIZE); bufferlist bl; - bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks) ); + bl.push_back(buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks)); // (always) relocate onode if (super_epoch > on->last_alloc_epoch) { @@ -881,8 +907,8 @@ void Ebofs::write_onode(Onode *on) first = on->get_first_block(); allocator.allocate(on->onode_loc, blocks, first); - object_tab->remove( on->object_id ); - object_tab->insert( on->object_id, on->onode_loc ); + object_tab->remove(on->object_id); + object_tab->insert(on->object_id, on->onode_loc); //object_tab->verify(); } @@ -891,6 +917,8 @@ void Ebofs::write_onode(Onode *on) unsigned off = 0; encode_onode(on, bl, off); assert(off == bytes); + if (off < bl.length()) + bl.zero(off, bl.length()-off); // write dev.write( on->onode_loc.start, on->onode_loc.length, bl, @@ -1147,7 +1175,7 @@ void Ebofs::write_cnode(Cnode *cn) { // allocate buffer unsigned bytes = sizeof(ebofs_cnode) + cn->get_attr_bytes(); - unsigned blocks = (bytes-1)/EBOFS_BLOCK_SIZE + 1; + unsigned blocks = DIV_ROUND_UP(bytes, EBOFS_BLOCK_SIZE); bufferlist bl; //bufferpool.alloc( EBOFS_BLOCK_SIZE*blocks, bl ); @@ -1308,7 +1336,6 @@ void Ebofs::trim_bc(off_t max) ObjectCache *oc = bh->oc; bc.remove_bh(bh); - delete bh; if (oc->is_empty()) { Onode *on = oc->on; @@ -1706,20 +1733,6 @@ void Ebofs::apply_write(Onode *on, off_t off, size_t len, const bufferlist& bl) if (bh->is_partial() || bh->is_rx() || bh->is_missing() || bh->is_corrupt()) { assert(bh->length() == 1); - if (bh->is_missing()) { - // newly realloc? carry old checksum over since we're only partially overwriting - if (bh->start() == bstart && alloc.contains(bstart)) { - dout(10) << "apply_write carrying over starting csum " << hex << old_csum_first << dec - << " for partial " << *bh << dendl; - *on->get_extent_csum_ptr(bh->start(), 1) = old_csum_first; - on->data_csum += old_csum_first; - } else if (bh->end()-1 == blast && alloc.contains(blast)) { - dout(10) << "apply_write carrying over ending csum " << hex << old_csum_last << dec - << " for partial " << *bh << dendl; - *on->get_extent_csum_ptr(bh->end()-1, 1) = old_csum_last; - on->data_csum += old_csum_last; - } - } if (bh->is_corrupt()) { dout(10) << "apply_write marking non-overwritten bytes bad on corrupt " << *bh << dendl; interval_set bad; @@ -1734,7 +1747,20 @@ void Ebofs::apply_write(Onode *on, off_t off, size_t len, const bufferlist& bl) *on->get_extent_csum_ptr(bh->start(), 1) = csum; on->data_csum += csum; bc.mark_clean(bh); - } + } else { + // newly realloc? carry old checksum over since we're only partially overwriting + if (bh->start() == bstart && alloc.contains(bstart)) { + dout(10) << "apply_write carrying over starting csum " << hex << old_csum_first << dec + << " for partial " << *bh << dendl; + *on->get_extent_csum_ptr(bh->start(), 1) = old_csum_first; + on->data_csum += old_csum_first; + } else if (bh->end()-1 == blast && alloc.contains(blast)) { + dout(10) << "apply_write carrying over ending csum " << hex << old_csum_last << dec + << " for partial " << *bh << dendl; + *on->get_extent_csum_ptr(bh->end()-1, 1) = old_csum_last; + on->data_csum += old_csum_last; + } + } // add frag to partial dout(10) << "apply_write writing into partial " << *bh << ":" @@ -1824,7 +1850,7 @@ void Ebofs::apply_write(Onode *on, off_t off, size_t len, const bufferlist& bl) // update csum block_t rbfirst = off_in_bh/EBOFS_BLOCK_SIZE; - block_t rblast = (off_in_bh+len_in_bh+4095)/EBOFS_BLOCK_SIZE; + block_t rblast = DIV_ROUND_UP(off_in_bh+len_in_bh, EBOFS_BLOCK_SIZE); block_t bnum = rblast-rbfirst; csum_t *csum = on->get_extent_csum_ptr(bh->start()+rbfirst, bnum); dout(20) << "calc csum for " << rbfirst << "~" << bnum << dendl; @@ -1888,7 +1914,7 @@ void Ebofs::apply_write(Onode *on, off_t off, size_t len, const bufferlist& bl) } // fill in csums - unsigned blocks = (len_in_bh + 4095)/ EBOFS_BLOCK_SIZE; + unsigned blocks = DIV_ROUND_UP(len_in_bh, EBOFS_BLOCK_SIZE); csum_t *csum = on->get_extent_csum_ptr(bh->start(), blocks); for (unsigned i=0; idata_csum -= csum[i]; @@ -1929,8 +1955,8 @@ void Ebofs::apply_zero(Onode *on, off_t off, size_t len) // zero edges // head? - if (off & (EBOFS_BLOCK_SIZE-1)) { - size_t l = EBOFS_BLOCK_SIZE - (off & (EBOFS_BLOCK_SIZE-1)); + if (off & EBOFS_BLOCK_MASK) { + size_t l = EBOFS_BLOCK_SIZE - (off & EBOFS_BLOCK_MASK); if (l > len) l = len; bufferptr bp(l); bp.zero(); @@ -1943,8 +1969,8 @@ void Ebofs::apply_zero(Onode *on, off_t off, size_t len) if (len == 0) return; // done! // tail? - if ((off+len) & (EBOFS_BLOCK_SIZE-1)) { - int l = (off+len) & (EBOFS_BLOCK_SIZE-1); + if ((off+len) & EBOFS_BLOCK_MASK) { + int l = (off+len) & EBOFS_BLOCK_MASK; bufferptr bp(l); bp.zero(); bufferlist bl; @@ -1956,9 +1982,10 @@ void Ebofs::apply_zero(Onode *on, off_t off, size_t len) // map middle onto buffers assert(len > 0); - block_t bstart = 0; - if (off) bstart = 1 + (off-1) / EBOFS_BLOCK_SIZE; - block_t blen = (off+len) / EBOFS_BLOCK_SIZE; + assert((off & EBOFS_BLOCK_MASK) == 0); + assert((len & EBOFS_BLOCK_MASK) == 0); + block_t bstart = off / EBOFS_BLOCK_SIZE; + block_t blen = len / EBOFS_BLOCK_SIZE; assert(blen > 0); map hits; @@ -1974,26 +2001,24 @@ void Ebofs::apply_zero(Onode *on, off_t off, size_t len) p = next; } - if (blen) { - // free old blocks - vector old; - on->map_extents(bstart, blen, old, 0); - for (unsigned i=0; iset_extent(bstart, hole); - - // adjust uncom - interval_set zeroed; - zeroed.insert(bstart, blen); - interval_set olduncom; - olduncom.intersection_of(zeroed, on->uncommitted); - dout(10) << "_zeroed old uncom " << on->uncommitted << " zeroed " << zeroed - << " subtracting " << olduncom << dendl; - on->uncommitted.subtract(olduncom); - dout(10) << "_zeroed new uncom " << on->uncommitted << dendl; - } + // free old blocks + vector old; + on->map_extents(bstart, blen, old, 0); + for (unsigned i=0; iset_extent(bstart, hole); + + // adjust uncom + interval_set zeroed; + zeroed.insert(bstart, blen); + interval_set olduncom; + olduncom.intersection_of(zeroed, on->uncommitted); + dout(10) << "_zeroed old uncom " << on->uncommitted << " zeroed " << zeroed + << " subtracting " << olduncom << dendl; + on->uncommitted.subtract(olduncom); + dout(10) << "_zeroed new uncom " << on->uncommitted << dendl; finish_contexts(finished, -1); } diff --git a/branches/ebofs/ebofs/Ebofs.h b/branches/ebofs/ebofs/Ebofs.h index 5b3320f483153..ac96333916f02 100644 --- a/branches/ebofs/ebofs/Ebofs.h +++ b/branches/ebofs/ebofs/Ebofs.h @@ -130,6 +130,7 @@ protected: bool have_onode(pobject_t oid) { return onode_map.count(oid); } + Onode* decode_onode(bufferlist& bl, unsigned& off); Onode* get_onode(pobject_t oid); // get cached onode, or read from disk. ref++. void remove_onode(Onode *on); void put_onode(Onode* o); // put it back down. ref--. diff --git a/branches/ebofs/ebofs/Onode.h b/branches/ebofs/ebofs/Onode.h index f2e6cab4dead3..402d20cfadc7d 100644 --- a/branches/ebofs/ebofs/Onode.h +++ b/branches/ebofs/ebofs/Onode.h @@ -180,7 +180,7 @@ public: for (map::iterator p = extent_map.begin(); p != extent_map.end(); p++) { - cout << " verify_extents " << p->first << ": " << p->second << std::endl; + //cout << " verify_extents " << p->first << ": " << p->second << std::endl; assert(pos == p->first); pos += p->second.ex.length; if (p->second.ex.start) { @@ -192,7 +192,7 @@ public: } } } - cout << " verify_extents got csum " << hex << csum << " want " << data_csum << dec << std::endl; + //cout << " verify_extents got csum " << hex << csum << " want " << data_csum << dec << std::endl; assert(s.size() == count); assert(count == alloc_blocks); @@ -217,7 +217,7 @@ public: * factor clobbered extents out of csums. */ void set_extent(block_t offset, Extent ex) { - cout << "set_extent " << offset << " -> " << ex << " ... " << last_block << std::endl; + //cout << "set_extent " << offset << " -> " << ex << " ... " << last_block << std::endl; verify_extents(); @@ -267,7 +267,7 @@ public: p--; ExtentCsum &left = p->second; if (p->first + left.ex.length > offset) { - cout << " preceeding left was " << left << std::endl; + //cout << " preceeding left was " << left << std::endl; block_t newlen = offset - p->first; if (p->first + left.ex.length > offset+ex.length) { // cutting chunk out of middle, add trailing bit @@ -280,7 +280,7 @@ public: for (unsigned j=0; jsecond; if (p->first + o.ex.length <= offset+ex.length) { - cout << " erasing " << o << std::endl; + //cout << " erasing " << o << std::endl; if (o.ex.start) { alloc_blocks -= o.ex.length; for (unsigned i=0; ifirst; n.ex.length -= overlap; if (n.ex.start) { @@ -328,7 +328,7 @@ public: n.resize_head(); } extent_map.erase(p); - cout << ", now " << n << std::endl; + //cout << ", now " << n << std::endl; break; } } @@ -349,7 +349,7 @@ public: } int truncate_extents(block_t len, vector& extra) { - cout << " truncate to " << len << " .. last_block " << last_block << std::endl; + //cout << " truncate to " << len << " .. last_block " << last_block << std::endl; verify_extents(); @@ -364,7 +364,7 @@ public: Extent ex; ex.start = o.ex.start + newlen; ex.length = o.ex.length - newlen; - cout << " truncating ex " << p->second.ex << " to " << newlen << ", releasing " << ex << std::endl; + //cout << " truncating ex " << p->second.ex << " to " << newlen << ", releasing " << ex << std::endl; for (unsigned i=newlen; i=(b) ? (a):(b)) #endif +#ifndef DIV_ROUND_UP +# define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#endif // disk typedef uint64_t block_t; // disk location/sector/block static const int EBOFS_BLOCK_SIZE = 4096; +static const int EBOFS_BLOCK_MASK = 4095; static const int EBOFS_BLOCK_BITS = 12; // 1<<12 == 4096 struct Extent { @@ -56,7 +60,7 @@ struct Extent { block_t last() const { return start + length - 1; } block_t end() const { return start + length; } -}; +} __attribute__ ((packed)); inline ostream& operator<<(ostream& out, const Extent& ex) { @@ -69,32 +73,34 @@ inline ostream& operator<<(ostream& out, const Extent& ex) typedef uint64_t coll_t; struct ebofs_onode { - csum_t data_csum; + csum_t onode_csum; // from after onode_csum to base + onode_bytes + __u32 onode_bytes; Extent onode_loc; /* this is actually the block we live in */ - pobject_t object_id; /* for kicks */ - __u8 readonly; + pobject_t object_id; /* for kicks */ + __u64 readonly; __s64 object_size; /* file size in bytes. should this be 64-bit? */ __u32 alloc_blocks; // allocated + csum_t data_csum; __u16 inline_bytes; __u16 num_collections; __u32 num_attr; // num attr in onode __u32 num_extents; /* number of extents used. if 0, data is in the onode */ - __u32 num_bad_byte_extents; // undefined partial byte extents over partial blocks; block checksums reflect zeroed data beneath these. -}; + __u32 num_bad_byte_extents; // corrupt partial byte extents +} __attribute__ ((packed)); struct ebofs_cnode { Extent cnode_loc; /* this is actually the block we live in */ coll_t coll_id; __u32 num_attr; // num attr in cnode -}; +} __attribute__ ((packed)); struct ebofs_onode_ptr { Extent loc; csum_t csum; -}; +} __attribute__ ((packed)); // tree/set nodes @@ -112,7 +118,7 @@ struct ebofs_nodepool { __u32 num_regions; Extent region_loc[EBOFS_MAX_NODE_REGIONS]; -}; +} __attribute__ ((packed)); // table @@ -121,13 +127,13 @@ struct ebofs_node_ptr { //__u64 start[EBOFS_NODE_DUP]; //__u64 length; csum_t csum; -}; +} __attribute__ ((packed)); struct ebofs_table { ebofs_node_ptr root; __u32 num_keys; __u32 depth; -}; +} __attribute__ ((packed)); // super @@ -176,7 +182,7 @@ struct ebofs_super { } bool is_valid_magic() { return s_magic == EBOFS_MAGIC; } bool is_valid() { return is_valid_magic() && !is_corrupt(); } -}; +} __attribute__ ((packed)); #endif -- 2.39.5