ebofs_cloneable: false,
ebofs_verify: false,
ebofs_commit_ms: 1000, // 0 = no forced commit timeout (for debugging/tracing)
- ebofs_idle_commit_ms: 0, // 0 = no idle detection. UGLY HACK. use bdev_idle_kick_after_ms instead.
ebofs_oc_size: 10000, // onode cache
ebofs_cc_size: 10000, // cnode cache
ebofs_bc_size: (5 * 256), // in 4k blocks; 256 blocks = 1 MB, so 5 MB
g_conf.ebofs_verify = atoi(args[++i]);
else if (strcmp(args[i], "--ebofs_commit_ms") == 0)
g_conf.ebofs_commit_ms = atoi(args[++i]);
- else if (strcmp(args[i], "--ebofs_idle_commit_ms") == 0)
- g_conf.ebofs_idle_commit_ms = atoi(args[++i]);
else if (strcmp(args[i], "--ebofs_oc_size") == 0)
g_conf.ebofs_oc_size = atoi(args[++i]);
else if (strcmp(args[i], "--ebofs_cc_size") == 0)
bool ebofs_cloneable;
bool ebofs_verify;
int ebofs_commit_ms;
- int ebofs_idle_commit_ms;
int ebofs_oc_size;
int ebofs_cc_size;
off_t ebofs_bc_size;
#undef dout
+#undef derr
#define dout(x) if (x <= g_conf.debug_ebofs) *_dout << dbeginl << g_clock.now() << " ebofs." << *this << "."
+#define derr(x) if (x <= g_conf.debug_ebofs) *_derr << dbeginl << g_clock.now() << " ebofs." << *this << "."
void BufferHead::add_partial(off_t off, bufferlist& p)
#undef dout
+#undef derr
#define dout(x) if (x <= g_conf.debug_ebofs) *_dout << dbeginl << g_clock.now() << " ebofs.oc."
+#define derr(x) if (x <= g_conf.debug_ebofs) *_derr << dbeginl << g_clock.now() << " ebofs.oc."
csum_t got[bh->length()];
for (unsigned i=0; i<bh->length(); i++) {
got[i] = calc_csum(&bh->data[i*EBOFS_BLOCK_SIZE], EBOFS_BLOCK_SIZE);
- if (false && rand() % 10 == 0) {
+ if (rand() % 10 == 0) {
dout(0) << "rx_finish HACK INJECTING bad csum" << dendl;
+ derr(0) << "rx_finish HACK INJECTING bad csum" << dendl;
got[i] = 0;
}
if (got[i] != want[i]) {
unsigned e;
for (e=s; e<olen; e++)
if (got[e] == want[e]) break;
- dout(0) << "rx_finish bad csum over " << s << "~" << (e-s) << dendl;
+ dout(0) << "rx_finish bad csum in " << bh->oc->on->object_id << " over " << s << "~" << (e-s) << dendl;
+ derr(0) << "rx_finish bad csum in " << bh->oc->on->object_id << " over " << s << "~" << (e-s) << dendl;
if (s) {
BufferHead *middle = bc->split(bh, ostart+s);
// finish any partials?
// note: these are partials that were re-written after a commit,
// or for which the OC was destroyed (e.g. truncated after a commit)
- map<block_t, map<block_t, PartialWrite> >::iterator sp = partial_write.lower_bound(diskstart);
- while (sp != partial_write.end()) {
- if (sp->first >= diskstart+length) break;
- assert(sp->first >= diskstart);
-
- block_t pblock = sp->first;
- map<block_t, PartialWrite> writes;
- writes.swap( sp->second );
-
- map<block_t, map<block_t, PartialWrite> >::iterator t = sp;
- sp++;
- partial_write.erase(t);
-
- for (map<block_t, PartialWrite>::iterator p = writes.begin();
- p != writes.end();
- p++) {
- dout(10) << "rx_finish partial from " << pblock << " -> " << p->first
- << " for epoch " << p->second.epoch
- << dendl;
- // make the combined block
- bufferlist combined;
- bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
- combined.push_back( bp );
- combined.copy_in((pblock-diskstart)*EBOFS_BLOCK_SIZE, (pblock-diskstart+1)*EBOFS_BLOCK_SIZE, bl);
- do_apply_partial( combined, p->second.partial );
-
- // write it!
- dev.write( pblock, 1, combined,
- new C_OC_PartialTxFinish( this, p->second.epoch ),
- "finish_partials");
+ if (length == 1) {
+ map<block_t,PartialWriteSet>::iterator sp = partial_write.find(diskstart);
+ if (sp != partial_write.end()) {
+ block_t pblock = diskstart;
+
+ // verify csum
+ csum_t actual = calc_csum(bl.c_str(), bl.length());
+ if (actual != sp->second.csum) {
+ dout(0) << "rx_finish bad csum on partial block " << pblock << dendl;
+ derr(0) << "rx_finish bad csum on partial block " << pblock << dendl;
+ }
+
+ for (map<block_t, PartialWrite>::iterator p = sp->second.writes.begin();
+ p != sp->second.writes.end();
+ p++) {
+ dout(10) << "rx_finish partial from " << pblock << " -> " << p->first
+ << " for epoch " << p->second.epoch
+ << dendl;
+ // make the combined block
+ bufferlist combined;
+ bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
+ combined.push_back( bp );
+ combined.copy_in((pblock-diskstart)*EBOFS_BLOCK_SIZE, (pblock-diskstart+1)*EBOFS_BLOCK_SIZE, bl);
+ do_apply_partial( combined, p->second.partial );
+
+ // write it!
+ dev.write( pblock, 1, combined,
+ new C_OC_PartialTxFinish( this, p->second.epoch ),
+ "finish_partials");
+ }
+ partial_write.erase(sp);
}
}
<< " in epoch " << epoch
<< dendl;
- if (partial_write[from].count(to)) {
+ if (partial_write[from].writes.count(to)) {
// this should be in the same epoch.
- assert( partial_write[from][to].epoch == epoch);
+ assert( partial_write[from].writes[to].epoch == epoch);
assert(0); // actually.. no!
} else {
inc_unflushed( EBOFS_BC_FLUSH_PARTIAL, epoch );
}
- partial_write[from][to].partial = partial;
- partial_write[from][to].epoch = epoch;
+ partial_write[from].writes[to].partial = partial;
+ partial_write[from].writes[to].epoch = epoch;
}
void BufferCache::cancel_partial(block_t from, block_t to, version_t epoch)
{
assert(partial_write.count(from));
- assert(partial_write[from].count(to));
- assert(partial_write[from][to].epoch == epoch);
+ assert(partial_write[from].writes.count(to));
+ assert(partial_write[from].writes[to].epoch == epoch);
dout(10) << "cancel_partial " << from << " -> " << to
- << " (was epoch " << partial_write[from][to].epoch << ")"
+ << " (was epoch " << partial_write[from].writes[to].epoch << ")"
<< dendl;
- partial_write[from].erase(to);
- if (partial_write[from].empty())
+ partial_write[from].writes.erase(to);
+ if (partial_write[from].writes.empty())
partial_write.erase(from);
dec_unflushed( EBOFS_BC_FLUSH_PARTIAL, epoch );
if (bh.is_rx()) out << " rx";
if (bh.is_tx()) out << " tx";
if (bh.is_partial()) out << " partial";
+ if (bh.is_corrupt()) out << " corrupt";
// include epoch modified?
if (bh.is_dirty() || bh.is_tx() || bh.is_partial())
*
* really, at most there will only ever be two of these, for the current and previous epochs.
*/
- class PartialWrite {
- public:
+ struct PartialWrite {
map<off_t, bufferlist> partial; // partial dirty content overlayed onto incoming data
version_t epoch;
};
- class WriteSet {
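+ // everything queued against a single source block: the csum that block is
+ // expected to have on disk, plus the partial overlays to apply once it arrives.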
+ struct PartialWriteSet {
csum_t csum; // expected csum
map<block_t, PartialWrite> writes;
};
- map<block_t, map<block_t, PartialWrite> > partial_write; // queued writes w/ partial content
+ map<block_t, PartialWriteSet> partial_write; // queued writes w/ partial content
map<block_t, set<BufferHead*> > shadow_partials;
public:
// wait for kick, or timeout
if (g_conf.ebofs_commit_ms) {
- if (g_conf.ebofs_idle_commit_ms > 0) {
- // *** this is an ugly ugly hack ****
- // do not use
- // periodically check for idle block device
- utime_t idle_wait(0, g_conf.ebofs_idle_commit_ms*1000);
- dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms, "
- << idle_wait << " ms if idle" << dendl;
- utime_t now = g_clock.now();
- utime_t stop = now;
- stop += (double)g_conf.ebofs_commit_ms / 1000.0;
- do {
- utime_t wait = MIN(stop - now, idle_wait);
- if (commit_cond.WaitInterval(ebofs_lock, wait) != ETIMEDOUT) {
- dout(20) << "commit_thread i got kicked" << dendl;
- break; // we got kicked
- }
- if (dev.is_idle()) {
- dout(20) << "commit_thread bdev is idle, early commit" << dendl;
- break; // dev is idle
- }
- now = g_clock.now();
- dout(20) << "commit_thread now=" << now << ", stop at " << stop << dendl;
-
- // hack hack
- //if (!left) g_conf.debug_ebofs = 10;
- // /hack hack
- } while (now < stop);
- dout(20) << "commit_thread done with idle loop" << dendl;
-
- } else {
- // normal wait+timeout
- dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms" << dendl;
- commit_cond.WaitInterval(ebofs_lock, utime_t(0, g_conf.ebofs_commit_ms*1000));
- }
-
+ // normal wait+timeout
+ dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms" << dendl;
+ commit_cond.WaitInterval(ebofs_lock, utime_t(0, g_conf.ebofs_commit_ms*1000));
} else {
// DEBUG.. wait until kicked
dout(10) << "commit_thread no commit_ms, waiting until kicked" << dendl;
dout(10) << "commit_thread not dirty" << dendl;
}
else {
+ // --- this all happens in one go, from here ---
super_epoch++;
dirty = false;
// (async) write btree nodes
nodepool.commit_start( dev, super_epoch );
-
- // blockdev barrier (prioritize our writes!)
- dout(30) << "commit_thread barrier. flushing inodes " << inodes_flushing << dendl;
- dev.barrier();
// prepare super (before any changes get made!)
bufferptr superbp;
prepare_super(super_epoch, superbp);
+ // --- to here. ---
+ // now wait.
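+ // the epoch bump, btree commit_start, and super prep above all happen in one
+ // pass under ebofs_lock; only below do we block waiting for everything to flush.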
+
+ // blockdev barrier (prioritize our writes!)
+ dout(30) << "commit_thread barrier. flushing inodes " << inodes_flushing << dendl;
+ dev.barrier();
+
// wait for it all to flush (drops global lock)
commit_bc_wait(super_epoch-1);
dout(30) << "commit_thread bc flushed" << dendl;
// kick waiters
dout(10) << "commit_thread queueing commit + kicking sync waiters" << dendl;
-
queue_finishers(commit_waiters[super_epoch-1]);
commit_waiters.erase(super_epoch-1);
-
sync_cond.Signal();
dout(10) << "commit_thread commit finish" << dendl;
unsigned len_in_bh = MIN( (off_t)(left),
(off_t)(bh->end()*EBOFS_BLOCK_SIZE)-opos );
- if (bh->is_partial() || bh->is_rx() || bh->is_missing()) {
- assert(bh->is_partial() || bh->is_rx() || bh->is_missing());
+ if (bh->is_partial() || bh->is_rx() || bh->is_missing() || bh->is_corrupt()) {
assert(bh->length() == 1);
if (bh->is_missing()) {
- // newly realloc; carry old checksum over since we're only partially overwriting
- if (bh->start() == bstart) {
+ // newly realloc? carry old checksum over since we're only partially overwriting
+ if (bh->start() == bstart && alloc.contains(bstart)) {
dout(10) << "apply_write carrying over starting csum " << hex << old_csum_first << dec
<< " for partial " << *bh << dendl;
*on->get_extent_csum_ptr(bh->start(), 1) = old_csum_first;
on->data_csum += old_csum_first;
- } else if (bh->end()-1 == blast) {
+ } else if (bh->end()-1 == blast && alloc.contains(blast)) {
dout(10) << "apply_write carrying over ending csum " << hex << old_csum_last << dec
<< " for partial " << *bh << dendl;
*on->get_extent_csum_ptr(bh->end()-1, 1) = old_csum_last;
on->data_csum += old_csum_last;
- } else assert(0);
+ }
+ }
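+ // partially overwriting a corrupt block: record the bytes we did NOT overwrite
+ // as bad (their on-disk content failed its csum), store a fresh csum for the
+ // block's current in-memory contents, and mark the bh clean before the new
+ // fragment is applied as a partial below.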
+ if (bh->is_corrupt()) {
+ dout(10) << "apply_write marking non-overwritten bytes bad on corrupt " << *bh << dendl;
+ interval_set<off_t> bad;
+ off_t bs = bh->start() * EBOFS_BLOCK_SIZE;
+ if (off_in_bh) bad.insert(bs, off_in_bh); // insert(start, len)
+ if (off_in_bh+len_in_bh < (unsigned)EBOFS_BLOCK_SIZE)
+ bad.insert(bs+off_in_bh+len_in_bh, EBOFS_BLOCK_SIZE-off_in_bh-len_in_bh);
+ dout(10) << "apply_write marking non-overwritten bytes " << bad << " bad on corrupt " << *bh << dendl;
+ bh->oc->on->bad_byte_extents.union_of(bad);
+ csum_t csum = calc_csum(bh->data.c_str(), bh->data.length());
+ dout(10) << "apply_write marking corrupt bh csum " << hex << csum << dec << " clean " << *bh << dendl;
+ *on->get_extent_csum_ptr(bh->start(), 1) = csum;
+ on->data_csum += csum;
+ bc.mark_clean(bh);
}
// add frag to partial
bc.mark_partial(bh);
bc.bh_queue_partial_write(on, bh); // queue the eventual write
}
- else if (bh->is_missing()) {
+ else if (bh->is_missing() || bh->is_corrupt()) {
dout(10) << "apply_write missing -> partial " << *bh << dendl;
assert(bh->length() == 1);
bc.mark_partial(bh);
}
break;
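+ // new OP_ZERO: decode order (oid, then offset, then length) must match the
+ // order the client encoded them into the Transaction.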
+ case Transaction::OP_ZERO:
+ {
+ pobject_t oid;
+ t.get_oid(oid);
+ off_t offset, len;
+ t.get_length(offset);
+ t.get_length(len);
+ if (_zero(oid, offset, len) < 0) {
+ dout(7) << "apply_transaction fail on _zero" << dendl;
+ r &= bit;
+ }
+ }
+ break;
+
case Transaction::OP_TRIMCACHE:
{
pobject_t oid;
}
}
}
- cout << " verify_extents got csum "
- << hex << csum << " want " << data_csum << dec << std::endl;
+ cout << " verify_extents got csum " << hex << csum << " want " << data_csum << dec << std::endl;
assert(s.size() == count);
assert(count == alloc_blocks);