git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
fixed zero; some checksum fixes
author sageweil <sageweil@29311d96-e01e-0410-9327-a35deaab8ce9>
Thu, 13 Dec 2007 20:50:16 +0000 (20:50 +0000)
committer sageweil <sageweil@29311d96-e01e-0410-9327-a35deaab8ce9>
Thu, 13 Dec 2007 20:50:16 +0000 (20:50 +0000)
git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@2207 29311d96-e01e-0410-9327-a35deaab8ce9

branches/ebofs/config.cc
branches/ebofs/config.h
branches/ebofs/ebofs/BufferCache.cc
branches/ebofs/ebofs/BufferCache.h
branches/ebofs/ebofs/Ebofs.cc
branches/ebofs/ebofs/Onode.h

index b4c108d0b53ebf6984991dd0061802fd7b47c086..bb86b35b3c11639c6952ef304bb14f5fd6b2809e 100644 (file)
@@ -328,7 +328,6 @@ md_config_t g_conf = {
   ebofs_cloneable: false,
   ebofs_verify: false,
   ebofs_commit_ms:      1000,       // 0 = no forced commit timeout (for debugging/tracing)
-  ebofs_idle_commit_ms: 0,         // 0 = no idle detection.  UGLY HACK.  use bdev_idle_kick_after_ms instead.
   ebofs_oc_size:        10000,      // onode cache
   ebofs_cc_size:        10000,      // cnode cache
   ebofs_bc_size:        (5 *256), // 4k blocks, *256 for MB
@@ -810,8 +809,6 @@ void parse_config_options(std::vector<char*>& args)
       g_conf.ebofs_verify = atoi(args[++i]);
     else if (strcmp(args[i], "--ebofs_commit_ms") == 0)
       g_conf.ebofs_commit_ms = atoi(args[++i]);
-    else if (strcmp(args[i], "--ebofs_idle_commit_ms") == 0)
-      g_conf.ebofs_idle_commit_ms = atoi(args[++i]);
     else if (strcmp(args[i], "--ebofs_oc_size") == 0)
       g_conf.ebofs_oc_size = atoi(args[++i]);
     else if (strcmp(args[i], "--ebofs_cc_size") == 0)
index 13a49b1d39237dc35858b4f7240255b5668ef3c7..6ce585a84437a194a34f092d82aa054cff5bdfec 100644 (file)
@@ -291,7 +291,6 @@ struct md_config_t {
   bool  ebofs_cloneable;
   bool  ebofs_verify;
   int   ebofs_commit_ms;
-  int   ebofs_idle_commit_ms;
   int   ebofs_oc_size;
   int   ebofs_cc_size;
   off_t ebofs_bc_size;
index ad5380977dccc971e764078196ea7ebbda81145a..c9c8dafca164b2427a99fbbcf2c5706ac43b8ce4 100644 (file)
@@ -38,7 +38,9 @@ void do_apply_partial(bufferlist& bl, map<off_t, bufferlist>& pm)
 
 
 #undef dout
+#undef derr
 #define dout(x)  if (x <= g_conf.debug_ebofs) *_dout << dbeginl << g_clock.now() << " ebofs." << *this << "."
+#define derr(x)  if (x <= g_conf.debug_ebofs) *_derr << dbeginl << g_clock.now() << " ebofs." << *this << "."
 
 
 void BufferHead::add_partial(off_t off, bufferlist& p) 
@@ -129,7 +131,9 @@ void BufferHead::apply_partial()
 
 
 #undef dout
+#undef derr
 #define dout(x)  if (x <= g_conf.debug_ebofs) *_dout << dbeginl << g_clock.now() << " ebofs.oc."
+#define derr(x)  if (x <= g_conf.debug_ebofs) *_derr << dbeginl << g_clock.now() << " ebofs.oc."
 
 
 
@@ -182,8 +186,9 @@ void ObjectCache::rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist
        csum_t got[bh->length()];
        for (unsigned i=0; i<bh->length(); i++) {
          got[i] = calc_csum(&bh->data[i*EBOFS_BLOCK_SIZE], EBOFS_BLOCK_SIZE);
-         if (false && rand() % 10 == 0) {
+         if (rand() % 10 == 0) {
            dout(0) << "rx_finish HACK INJECTING bad csum" << dendl;
+           derr(0) << "rx_finish HACK INJECTING bad csum" << dendl;
            got[i] = 0;
          }
          if (got[i] != want[i]) {
@@ -201,7 +206,8 @@ void ObjectCache::rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist
              unsigned e;
              for (e=s; e<olen; e++)
                if (got[e] == want[e]) break;
-             dout(0) << "rx_finish  bad csum over " << s << "~" << (e-s) << dendl;
+             dout(0) << "rx_finish  bad csum in " << bh->oc->on->object_id << " over " << s << "~" << (e-s) << dendl;
+             derr(0) << "rx_finish  bad csum in " << bh->oc->on->object_id << " over " << s << "~" << (e-s) << dendl;
              
              if (s) {
                BufferHead *middle = bc->split(bh, ostart+s);
@@ -1259,36 +1265,37 @@ void BufferCache::rx_finish(ObjectCache *oc,
   // finish any partials?
   //  note: these are partials that were re-written after a commit,
   //        or for whom the OC was destroyed (eg truncated after a commit)
-  map<block_t, map<block_t, PartialWrite> >::iterator sp = partial_write.lower_bound(diskstart);
-  while (sp != partial_write.end()) {
-    if (sp->first >= diskstart+length) break;
-    assert(sp->first >= diskstart);
-
-    block_t pblock = sp->first;
-    map<block_t, PartialWrite> writes;
-    writes.swap( sp->second );
-
-    map<block_t, map<block_t, PartialWrite> >::iterator t = sp;
-    sp++;
-    partial_write.erase(t);
-
-    for (map<block_t, PartialWrite>::iterator p = writes.begin();
-         p != writes.end();
-         p++) {
-      dout(10) << "rx_finish partial from " << pblock << " -> " << p->first
-                << " for epoch " << p->second.epoch
-                << dendl;
-      // make the combined block
-      bufferlist combined;
-      bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
-      combined.push_back( bp );
-      combined.copy_in((pblock-diskstart)*EBOFS_BLOCK_SIZE, (pblock-diskstart+1)*EBOFS_BLOCK_SIZE, bl);
-      do_apply_partial( combined, p->second.partial );
-
-      // write it!
-      dev.write( pblock, 1, combined,
-                 new C_OC_PartialTxFinish( this, p->second.epoch ),
-                 "finish_partials");
+  if (length == 1) {
+    map<block_t,PartialWriteSet>::iterator sp = partial_write.find(diskstart);
+    if (sp != partial_write.end()) {
+      block_t pblock = diskstart;
+
+      // verify csum
+      csum_t actual = calc_csum(bl.c_str(), bl.length());
+      if (actual != sp->second.csum) {
+       dout(0) << "rx_finish bad csum on partial block " << pblock << dendl;
+       derr(0) << "rx_finish bad csum on partial block " << pblock << dendl;
+      } 
+      
+      for (map<block_t, PartialWrite>::iterator p = sp->second.writes.begin();
+          p != sp->second.writes.end();
+          p++) {
+       dout(10) << "rx_finish partial from " << pblock << " -> " << p->first
+                << " for epoch " << p->second.epoch
+                << dendl;
+       // make the combined block
+       bufferlist combined;
+       bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
+       combined.push_back( bp );
+       combined.copy_in((pblock-diskstart)*EBOFS_BLOCK_SIZE, (pblock-diskstart+1)*EBOFS_BLOCK_SIZE, bl);
+       do_apply_partial( combined, p->second.partial );
+       
+       // write it!
+       dev.write( pblock, 1, combined,
+                  new C_OC_PartialTxFinish( this, p->second.epoch ),
+                  "finish_partials");
+      }
+      partial_write.erase(sp);
     }
   }
 
@@ -1399,30 +1406,30 @@ void BufferCache::queue_partial(block_t from, block_t to,
            << " in epoch " << epoch 
            << dendl;
   
-  if (partial_write[from].count(to)) {
+  if (partial_write[from].writes.count(to)) {
     // this should be in the same epoch.
-    assert( partial_write[from][to].epoch == epoch);
+    assert( partial_write[from].writes[to].epoch == epoch);
     assert(0); // actually.. no!
   } else {
     inc_unflushed( EBOFS_BC_FLUSH_PARTIAL, epoch );
   }
   
-  partial_write[from][to].partial = partial;
-  partial_write[from][to].epoch = epoch;
+  partial_write[from].writes[to].partial = partial;
+  partial_write[from].writes[to].epoch = epoch;
 }
 
 void BufferCache::cancel_partial(block_t from, block_t to, version_t epoch)
 {
   assert(partial_write.count(from));
-  assert(partial_write[from].count(to));
-  assert(partial_write[from][to].epoch == epoch);
+  assert(partial_write[from].writes.count(to));
+  assert(partial_write[from].writes[to].epoch == epoch);
 
   dout(10) << "cancel_partial " << from << " -> " << to 
-           << "  (was epoch " << partial_write[from][to].epoch << ")"
+           << "  (was epoch " << partial_write[from].writes[to].epoch << ")"
            << dendl;
 
-  partial_write[from].erase(to);
-  if (partial_write[from].empty())
+  partial_write[from].writes.erase(to);
+  if (partial_write[from].writes.empty())
     partial_write.erase(from);
 
   dec_unflushed( EBOFS_BC_FLUSH_PARTIAL, epoch );
index fc8424057f1a0893ad697fe18843e1460fab8d14..3959a7e644090f353c9128ebf1eb958543a00de3 100644 (file)
@@ -260,6 +260,7 @@ inline ostream& operator<<(ostream& out, BufferHead& bh)
   if (bh.is_rx()) out << " rx";
   if (bh.is_tx()) out << " tx";
   if (bh.is_partial()) out << " partial";
+  if (bh.is_corrupt()) out << " corrupt";
 
   // include epoch modified?
   if (bh.is_dirty() || bh.is_tx() || bh.is_partial()) 
@@ -425,17 +426,16 @@ class BufferCache {
    *
    * really, at most there will only ever be two of these, for current+previous epochs.
    */
-  class PartialWrite {
-  public:
+  struct PartialWrite {
     map<off_t, bufferlist> partial;   // partial dirty content overlayed onto incoming data
     version_t              epoch;
   };
-  class WriteSet {
+  struct PartialWriteSet {
     csum_t csum;                       // expected csum
     map<block_t, PartialWrite> writes;
   };
 
-  map<block_t, map<block_t, PartialWrite> > partial_write;  // queued writes w/ partial content
+  map<block_t, PartialWriteSet> partial_write;  // queued writes w/ partial content
   map<block_t, set<BufferHead*> >           shadow_partials;
 
  public:
index 504d8df749d8f30e69b99c5ff5a9eb0880343952..e8bacbe6989bb08aedd1abf81625a51e2a8e5bbd 100644 (file)
@@ -437,41 +437,9 @@ int Ebofs::commit_thread_entry()
     
     // wait for kick, or timeout
     if (g_conf.ebofs_commit_ms) {
-      if (g_conf.ebofs_idle_commit_ms > 0) {
-       // *** this is an ugly ugly hack ****
-       //     do not use
-        // periodically check for idle block device
-       utime_t idle_wait(0, g_conf.ebofs_idle_commit_ms*1000);
-        dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms, " 
-                 << idle_wait << " ms if idle" << dendl;
-       utime_t now = g_clock.now();
-       utime_t stop = now;
-       stop += (double)g_conf.ebofs_commit_ms / 1000.0;
-        do {
-         utime_t wait = MIN(stop - now, idle_wait);
-          if (commit_cond.WaitInterval(ebofs_lock, wait) != ETIMEDOUT) {
-            dout(20) << "commit_thread i got kicked" << dendl;
-            break;   // we got kicked
-         }
-          if (dev.is_idle()) {
-            dout(20) << "commit_thread bdev is idle, early commit" << dendl;
-            break;  // dev is idle
-          }
-         now = g_clock.now();
-          dout(20) << "commit_thread now=" << now << ", stop at " << stop << dendl;
-
-          // hack hack
-          //if (!left) g_conf.debug_ebofs = 10;
-          // /hack hack
-       } while (now < stop);
-       dout(20) << "commit_thread done with idle loop" << dendl;
-
-      } else {
-        // normal wait+timeout
-        dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms" << dendl;
-        commit_cond.WaitInterval(ebofs_lock, utime_t(0, g_conf.ebofs_commit_ms*1000));   
-      }
-
+      // normal wait+timeout
+      dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms" << dendl;
+      commit_cond.WaitInterval(ebofs_lock, utime_t(0, g_conf.ebofs_commit_ms*1000));   
     } else {
       // DEBUG.. wait until kicked
       dout(10) << "commit_thread no commit_ms, waiting until kicked" << dendl;
@@ -490,6 +458,7 @@ int Ebofs::commit_thread_entry()
       dout(10) << "commit_thread not dirty" << dendl;
     }
     else {
+      // --- this all happens in one go, from here ---
       super_epoch++;
       dirty = false;
 
@@ -525,15 +494,18 @@ int Ebofs::commit_thread_entry()
       
       // (async) write btree nodes
       nodepool.commit_start( dev, super_epoch );
-      
-      // blockdev barrier (prioritize our writes!)
-      dout(30) << "commit_thread barrier.  flushing inodes " << inodes_flushing << dendl;
-      dev.barrier();
 
       // prepare super (before any changes get made!)
       bufferptr superbp;
       prepare_super(super_epoch, superbp);
       
+      // --- to here. ---
+      // now wait.
+      
+      // blockdev barrier (prioritize our writes!)
+      dout(30) << "commit_thread barrier.  flushing inodes " << inodes_flushing << dendl;
+      dev.barrier();
+
       // wait for it all to flush (drops global lock)
       commit_bc_wait(super_epoch-1);  
       dout(30) << "commit_thread bc flushed" << dendl;
@@ -566,10 +538,8 @@ int Ebofs::commit_thread_entry()
 
       // kick waiters
       dout(10) << "commit_thread queueing commit + kicking sync waiters" << dendl;
-      
       queue_finishers(commit_waiters[super_epoch-1]);
       commit_waiters.erase(super_epoch-1);
-
       sync_cond.Signal();
 
       dout(10) << "commit_thread commit finish" << dendl;
@@ -1723,23 +1693,37 @@ void Ebofs::apply_write(Onode *on, off_t off, size_t len, const bufferlist& bl)
       unsigned len_in_bh = MIN( (off_t)(left),
                                 (off_t)(bh->end()*EBOFS_BLOCK_SIZE)-opos );
       
-      if (bh->is_partial() || bh->is_rx() || bh->is_missing()) {
-        assert(bh->is_partial() || bh->is_rx() || bh->is_missing());
+      if (bh->is_partial() || bh->is_rx() || bh->is_missing() || bh->is_corrupt()) {
         assert(bh->length() == 1);
 
        if (bh->is_missing()) {
-         // newly realloc; carry old checksum over since we're only partially overwriting
-         if (bh->start() == bstart) {
+         // newly realloc? carry old checksum over since we're only partially overwriting
+         if (bh->start() == bstart && alloc.contains(bstart)) {
            dout(10) << "apply_write  carrying over starting csum " << hex << old_csum_first << dec
                     << " for partial " << *bh << dendl;
            *on->get_extent_csum_ptr(bh->start(), 1) = old_csum_first;
            on->data_csum += old_csum_first;
-         } else if (bh->end()-1 == blast) {
+         } else if (bh->end()-1 == blast && alloc.contains(blast)) {
            dout(10) << "apply_write  carrying over ending csum " << hex << old_csum_last << dec
                     << " for partial " << *bh << dendl;
            *on->get_extent_csum_ptr(bh->end()-1, 1) = old_csum_last;
            on->data_csum += old_csum_last;
-         } else assert(0);
+         } 
+       }
+       if (bh->is_corrupt()) {
+         dout(10) << "apply_write  marking non-overwritten bytes bad on corrupt " << *bh << dendl;
+         interval_set<off_t> bad;
+         off_t bs = bh->start() * EBOFS_BLOCK_SIZE;
+         if (off_in_bh) bad.insert(bs, bs+off_in_bh);
+         if (off_in_bh+len_in_bh < (unsigned)EBOFS_BLOCK_SIZE)
+           bad.insert(bs+off_in_bh+len_in_bh, bs+EBOFS_BLOCK_SIZE-off_in_bh-len_in_bh);
+         dout(10) << "apply_write  marking non-overwritten bytes " << bad << " bad on corrupt " << *bh << dendl;
+         bh->oc->on->bad_byte_extents.union_of(bad);
+         csum_t csum = calc_csum(bh->data.c_str(), bh->data.length());
+         dout(10) << "apply_write  marking corrupt bh csum " << hex << csum << dec << " clean " << *bh << dendl;
+         *on->get_extent_csum_ptr(bh->start(), 1) = csum;
+         on->data_csum += csum;
+         bc.mark_clean(bh);
        }
 
         // add frag to partial
@@ -1772,7 +1756,7 @@ void Ebofs::apply_write(Onode *on, off_t off, size_t len, const bufferlist& bl)
           bc.mark_partial(bh);
           bc.bh_queue_partial_write(on, bh);          // queue the eventual write
         }
-        else if (bh->is_missing()) {
+        else if (bh->is_missing() || bh->is_corrupt()) {
           dout(10) << "apply_write  missing -> partial " << *bh << dendl;
           assert(bh->length() == 1);
           bc.mark_partial(bh);
@@ -2420,6 +2404,20 @@ unsigned Ebofs::_apply_transaction(Transaction& t)
       }
       break;
 
+    case Transaction::OP_ZERO:
+      {
+        pobject_t oid;
+       t.get_oid(oid);
+        off_t offset, len;
+       t.get_length(offset);
+       t.get_length(len);
+        if (_zero(oid, offset, len) < 0) {
+          dout(7) << "apply_transaction fail on _zero" << dendl;
+          r &= bit;
+        }
+      }
+      break;
+
     case Transaction::OP_TRIMCACHE:
       {
         pobject_t oid;
index fd78b4f788a858e60c42e6b23423ffc19b1e0ebd..a0acdbad1aa279c5638ec66a70bb4b85d9421e4e 100644 (file)
@@ -190,8 +190,7 @@ public:
          }
        }
       }
-      cout << " verify_extents got csum " 
-          << hex << csum << " want " << data_csum << dec << std::endl;
+      cout << " verify_extents got csum " << hex << csum << " want " << data_csum << dec << std::endl;
 
       assert(s.size() == count);
       assert(count == alloc_blocks);