From: Jason Dillaman Date: Fri, 12 May 2017 18:37:39 +0000 (-0400) Subject: blkin: improve trace visibility into object cacher X-Git-Tag: v12.1.0~10^2~67^2~5 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=b33eb6452b37f5164c51817a6c80e976a3e65dfa;p=ceph.git blkin: improve trace visibility into object cacher Signed-off-by: Jason Dillaman --- diff --git a/src/client/ObjecterWriteback.h b/src/client/ObjecterWriteback.h index ca76472adda..8928437646c 100644 --- a/src/client/ObjecterWriteback.h +++ b/src/client/ObjecterWriteback.h @@ -17,8 +17,9 @@ class ObjecterWriteback : public WritebackHandler { void read(const object_t& oid, uint64_t object_no, const object_locator_t& oloc, uint64_t off, uint64_t len, snapid_t snapid, bufferlist *pbl, uint64_t trunc_size, - __u32 trunc_seq, int op_flags, Context *onfinish, - ZTracer::Trace *trace) override { + __u32 trunc_seq, int op_flags, + const ZTracer::Trace &parent_trace, + Context *onfinish) override { m_objecter->read_trunc(oid, oloc, off, len, snapid, pbl, 0, trunc_size, trunc_seq, new C_OnFinisher(new C_Lock(m_lock, onfinish), @@ -35,7 +36,8 @@ class ObjecterWriteback : public WritebackHandler { const SnapContext& snapc, const bufferlist &bl, ceph::real_time mtime, uint64_t trunc_size, __u32 trunc_seq, ceph_tid_t journal_tid, - Context *oncommit, ZTracer::Trace *trace) override { + const ZTracer::Trace &parent_trace, + Context *oncommit) override { return m_objecter->write_trunc(oid, oloc, off, len, snapc, bl, mtime, 0, trunc_size, trunc_seq, new C_OnFinisher(new C_Lock(m_lock, diff --git a/src/librbd/LibrbdWriteback.cc b/src/librbd/LibrbdWriteback.cc index a30bc9adf72..ed5e9d10701 100644 --- a/src/librbd/LibrbdWriteback.cc +++ b/src/librbd/LibrbdWriteback.cc @@ -70,8 +70,8 @@ namespace librbd { class C_OrderedWrite : public Context { public: C_OrderedWrite(CephContext *cct, LibrbdWriteback::write_result_d *result, - LibrbdWriteback *wb) - : m_cct(cct), m_result(result), m_wb_handler(wb) {} + const ZTracer::Trace &trace, LibrbdWriteback *wb) + : m_cct(cct), m_result(result), m_trace(trace), m_wb_handler(wb) {} ~C_OrderedWrite() override {} void finish(int r) override { ldout(m_cct, 20) << "C_OrderedWrite completing " << m_result << dendl; @@ -83,10 +83,12 @@ namespace librbd { m_wb_handler->complete_writes(m_result->oid); } ldout(m_cct, 20) << "C_OrderedWrite finished " << m_result << dendl; + m_trace.event("finish"); } private: CephContext *m_cct; LibrbdWriteback::write_result_d *m_result; + ZTracer::Trace m_trace; LibrbdWriteback *m_wb_handler; }; @@ -196,8 +198,9 @@ namespace librbd { const object_locator_t& oloc, uint64_t off, uint64_t len, snapid_t snapid, bufferlist *pbl, uint64_t trunc_size, - __u32 trunc_seq, int op_flags, Context *onfinish, - ZTracer::Trace *trace) + __u32 trunc_seq, int op_flags, + const ZTracer::Trace &parent_trace, + Context *onfinish) { // on completion, take the mutex and then call onfinish. Context *req = new C_ReadRequest(m_ictx->cct, onfinish, &m_lock); @@ -220,7 +223,7 @@ namespace librbd { util::create_rados_callback(req); int r = m_ictx->data_ctx.aio_operate( oid.name, rados_completion, &op, flags, nullptr, - (trace ? trace->get_info() : nullptr)); + (parent_trace.valid() ? 
parent_trace.get_info() : nullptr)); rados_completion->release(); assert(r >= 0); } @@ -256,15 +259,23 @@ namespace librbd { const bufferlist &bl, ceph::real_time mtime, uint64_t trunc_size, __u32 trunc_seq, ceph_tid_t journal_tid, - Context *oncommit, - ZTracer::Trace *trace) + const ZTracer::Trace &parent_trace, + Context *oncommit) { + ZTracer::Trace trace; + if (parent_trace.valid()) { + trace.init("", nullptr, &parent_trace); + trace.copy_name("writeback " + oid.name); + trace.event("start"); + } + uint64_t object_no = oid_to_object_no(oid.name, m_ictx->object_prefix); write_result_d *result = new write_result_d(oid.name, oncommit); m_writes[oid.name].push(result); ldout(m_ictx->cct, 20) << "write will wait for result " << result << dendl; - C_OrderedWrite *req_comp = new C_OrderedWrite(m_ictx->cct, result, this); + C_OrderedWrite *req_comp = new C_OrderedWrite(m_ictx->cct, result, trace, + this); // all IO operations are flushed prior to closing the journal assert(journal_tid == 0 || m_ictx->journal != NULL); diff --git a/src/librbd/LibrbdWriteback.h b/src/librbd/LibrbdWriteback.h index 72db4dfa61b..6ffba511979 100644 --- a/src/librbd/LibrbdWriteback.h +++ b/src/librbd/LibrbdWriteback.h @@ -24,8 +24,8 @@ namespace librbd { void read(const object_t& oid, uint64_t object_no, const object_locator_t& oloc, uint64_t off, uint64_t len, snapid_t snapid, bufferlist *pbl, uint64_t trunc_size, - __u32 trunc_seq, int op_flags, Context *onfinish, - ZTracer::Trace *trace) override; + __u32 trunc_seq, int op_flags, + const ZTracer::Trace &parent_trace, Context *onfinish) override; // Determine whether a read to this extent could be affected by a // write-triggered copy-on-write @@ -38,7 +38,8 @@ namespace librbd { const SnapContext& snapc, const bufferlist &bl, ceph::real_time mtime, uint64_t trunc_size, __u32 trunc_seq, ceph_tid_t journal_tid, - Context *oncommit, ZTracer::Trace *trace) override; + const ZTracer::Trace &parent_trace, + Context *oncommit) override; using WritebackHandler::write; void overwrite_extent(const object_t& oid, uint64_t off, diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc index 36633b0c5b4..a77d6b31883 100644 --- a/src/osdc/ObjectCacher.cc +++ b/src/osdc/ObjectCacher.cc @@ -36,19 +36,21 @@ class ObjectCacher::C_ReadFinish : public Context { xlist::item set_item; bool trust_enoent; ceph_tid_t tid; + ZTracer::Trace trace; public: bufferlist bl; C_ReadFinish(ObjectCacher *c, Object *ob, ceph_tid_t t, loff_t s, - uint64_t l) : + uint64_t l, const ZTracer::Trace &trace) : oc(c), poolid(ob->oloc.pool), oid(ob->get_soid()), start(s), length(l), set_item(this), trust_enoent(true), - tid(t) { + tid(t), trace(trace) { ob->reads.push_back(&set_item); } void finish(int r) override { oc->bh_read_finish(poolid, oid, tid, start, length, bl, r, trust_enoent); + trace.event("finish"); // object destructor clears the list if (set_item.is_on_list()) @@ -65,24 +67,25 @@ class ObjectCacher::C_RetryRead : public Context { OSDRead *rd; ObjectSet *oset; Context *onfinish; - ZTracer::Trace r_trace; + ZTracer::Trace trace; public: C_RetryRead(ObjectCacher *_oc, OSDRead *r, ObjectSet *os, Context *c, - ZTracer::Trace *trace) - : oc(_oc), rd(r), oset(os), onfinish(c) { - if (trace) { - r_trace = *trace; - } - } + const ZTracer::Trace &trace) + : oc(_oc), rd(r), oset(os), onfinish(c), trace(trace) { + } void finish(int r) override { - if (r < 0) { - if (onfinish) - onfinish->complete(r); + if (r >= 0) { + r = oc->_readx(rd, oset, onfinish, false, &trace); + } + + if (r == 0) { + // 
read is still in-progress return; } - int ret = oc->_readx(rd, oset, onfinish, false, &r_trace); - if (ret != 0 && onfinish) { - onfinish->complete(ret); + + trace.event("finish"); + if (onfinish) { + onfinish->complete(r); } } }; @@ -94,7 +97,7 @@ ObjectCacher::BufferHead *ObjectCacher::Object::split(BufferHead *left, ldout(oc->cct, 20) << "split " << *left << " at " << off << dendl; // split off right - ObjectCacher::BufferHead *right = new BufferHead(this, &left->b_trace); + ObjectCacher::BufferHead *right = new BufferHead(this); //inherit and if later access, this auto clean. right->set_dontneed(left->get_dontneed()); @@ -277,14 +280,12 @@ int ObjectCacher::Object::map_read(ObjectExtent &ex, map& hits, map& missing, map& rx, - map& errors, - ZTracer::Trace *trace) + map& errors) { assert(oc->lock.is_locked()); - ldout(oc->cct, 10) << "map_read " << ex.oid - << " " << ex.offset << "~" << ex.length - << dendl; - + ldout(oc->cct, 10) << "map_read " << ex.oid << " " + << ex.offset << "~" << ex.length << dendl; + loff_t cur = ex.offset; loff_t left = ex.length; @@ -293,7 +294,7 @@ int ObjectCacher::Object::map_read(ObjectExtent &ex, // at end? if (p == data.end()) { // rest is a miss. - BufferHead *n = new BufferHead(this, trace); + BufferHead *n = new BufferHead(this); n->set_start(cur); n->set_length(left); oc->bh_add(this, n); @@ -309,7 +310,7 @@ int ObjectCacher::Object::map_read(ObjectExtent &ex, assert(cur == (loff_t)ex.offset + (loff_t)ex.length); break; // no more. } - + if (p->first <= cur) { // have it (or part of it) BufferHead *e = p->second; @@ -329,17 +330,17 @@ int ObjectCacher::Object::map_read(ObjectExtent &ex, } else { ceph_abort(); } - + loff_t lenfromcur = MIN(e->end() - cur, left); cur += lenfromcur; left -= lenfromcur; ++p; continue; // more? - + } else if (p->first > cur) { // gap.. miss loff_t next = p->first; - BufferHead *n = new BufferHead(this, trace); + BufferHead *n = new BufferHead(this); loff_t len = MIN(next - cur, left); n->set_start(cur); n->set_length(len); @@ -403,8 +404,7 @@ void ObjectCacher::Object::audit_buffers() * other dirty data to left and/or right. */ ObjectCacher::BufferHead *ObjectCacher::Object::map_write(ObjectExtent &ex, - ceph_tid_t tid, - ZTracer::Trace *trace) + ceph_tid_t tid) { assert(oc->lock.is_locked()); BufferHead *final = 0; @@ -422,7 +422,7 @@ ObjectCacher::BufferHead *ObjectCacher::Object::map_write(ObjectExtent &ex, // at end ? 
if (p == data.end()) { if (final == NULL) { - final = new BufferHead(this, trace); + final = new BufferHead(this); replace_journal_tid(final, tid); final->set_start( cur ); final->set_length( max ); @@ -498,7 +498,7 @@ ObjectCacher::BufferHead *ObjectCacher::Object::map_write(ObjectExtent &ex, final->set_length(final->length() + glen); oc->bh_stat_add(final); } else { - final = new BufferHead(this, trace); + final = new BufferHead(this); replace_journal_tid(final, tid); final->set_start( cur ); final->set_length( glen ); @@ -620,6 +620,7 @@ ObjectCacher::ObjectCacher(CephContext *cct_, string name, max_size(max_bytes), max_objects(max_objects), max_dirty_age(ceph::make_timespan(max_dirty_age)), block_writes_upfront(block_writes_upfront), + trace_endpoint("ObjectCacher"), flush_set_callback(flush_callback), flush_set_callback_arg(flush_callback_arg), last_read_tid(0), flusher_stop(false), flusher_thread(this),finisher(cct), @@ -732,24 +733,32 @@ void ObjectCacher::close_object(Object *ob) delete ob; } -void ObjectCacher::bh_read(BufferHead *bh, int op_flags) +void ObjectCacher::bh_read(BufferHead *bh, int op_flags, + const ZTracer::Trace &parent_trace) { assert(lock.is_locked()); ldout(cct, 7) << "bh_read on " << *bh << " outstanding reads " << reads_outstanding << dendl; + ZTracer::Trace trace; + if (parent_trace.valid()) { + trace.init("", &trace_endpoint, &parent_trace); + trace.copy_name("bh_read " + bh->ob->get_oid().name); + trace.event("start"); + } + mark_rx(bh); bh->last_read_tid = ++last_read_tid; // finisher C_ReadFinish *onfinish = new C_ReadFinish(this, bh->ob, bh->last_read_tid, - bh->start(), bh->length()); + bh->start(), bh->length(), trace); // go writeback_handler.read(bh->ob->get_oid(), bh->ob->get_object_number(), bh->ob->get_oloc(), bh->start(), bh->length(), bh->ob->get_snap(), &onfinish->bl, bh->ob->truncate_size, bh->ob->truncate_seq, - op_flags, onfinish, &bh->b_trace); + op_flags, trace, onfinish); ++reads_outstanding; } @@ -987,11 +996,12 @@ class ObjectCacher::C_WriteCommit : public Context { int64_t poolid; sobject_t oid; vector > ranges; + ZTracer::Trace trace; public: - ceph_tid_t tid; + ceph_tid_t tid = 0; C_WriteCommit(ObjectCacher *c, int64_t _poolid, sobject_t o, loff_t s, - uint64_t l) : - oc(c), poolid(_poolid), oid(o), tid(0) { + uint64_t l, const ZTracer::Trace &trace) : + oc(c), poolid(_poolid), oid(o), trace(trace) { ranges.push_back(make_pair(s, l)); } C_WriteCommit(ObjectCacher *c, int64_t _poolid, sobject_t o, @@ -1001,6 +1011,7 @@ public: } void finish(int r) override { oc->bh_write_commit(poolid, oid, ranges, tid, r); + trace.event("finish"); } }; void ObjectCacher::bh_write_scattered(list& blist) @@ -1056,17 +1067,24 @@ void ObjectCacher::bh_write_scattered(list& blist) perfcounter->inc(l_objectcacher_data_flushed, total_len); } -void ObjectCacher::bh_write(BufferHead *bh) +void ObjectCacher::bh_write(BufferHead *bh, const ZTracer::Trace &parent_trace) { assert(lock.is_locked()); ldout(cct, 7) << "bh_write " << *bh << dendl; bh->ob->get(); + ZTracer::Trace trace; + if (parent_trace.valid()) { + trace.init("", &trace_endpoint, &parent_trace); + trace.copy_name("bh_write " + bh->ob->get_oid().name); + trace.event("start"); + } + // finishers C_WriteCommit *oncommit = new C_WriteCommit(this, bh->ob->oloc.pool, bh->ob->get_soid(), bh->start(), - bh->length()); + bh->length(), trace); // go ceph_tid_t tid = writeback_handler.write(bh->ob->get_oid(), bh->ob->get_oloc(), @@ -1074,8 +1092,7 @@ void ObjectCacher::bh_write(BufferHead *bh) bh->snapc, 
bh->bl, bh->last_write, bh->ob->truncate_size, bh->ob->truncate_seq, - bh->journal_tid, oncommit, - &bh->b_trace); + bh->journal_tid, trace, oncommit); ldout(cct, 20) << " tid " << tid << " on " << bh->ob->get_oid() << dendl; // set bh last_write_tid @@ -1200,8 +1217,9 @@ void ObjectCacher::bh_write_commit(int64_t poolid, sobject_t oid, finish_contexts(cct, ls, r); } -void ObjectCacher::flush(loff_t amount) +void ObjectCacher::flush(ZTracer::Trace *trace, loff_t amount) { + assert(trace != nullptr); assert(lock.is_locked()); ceph::real_time cutoff = ceph::real_clock::now(); @@ -1224,9 +1242,9 @@ void ObjectCacher::flush(loff_t amount) bh_write_adjacencies(bh, cutoff, amount > 0 ? &left : NULL, NULL); } else { left -= bh->length(); - bh_write(bh); + bh_write(bh, *trace); } - } + } } @@ -1300,14 +1318,25 @@ bool ObjectCacher::is_cached(ObjectSet *oset, vector& extents, * returns 0 if doing async read */ int ObjectCacher::readx(OSDRead *rd, ObjectSet *oset, Context *onfinish, - ZTracer::Trace *trace) + ZTracer::Trace *parent_trace) { - return _readx(rd, oset, onfinish, true, trace); + ZTracer::Trace trace; + if (parent_trace != nullptr) { + trace.init("read", &trace_endpoint, parent_trace); + trace.event("start"); + } + + int r =_readx(rd, oset, onfinish, true, &trace); + if (r < 0) { + trace.event("finish"); + } + return r; } int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish, bool external_call, ZTracer::Trace *trace) { + assert(trace != nullptr); assert(lock.is_locked()); bool success = true; int error = 0; @@ -1360,7 +1389,7 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish, if (scattered_write) blist.push_back(bh); else - bh_write(bh); + bh_write(bh, *trace); } } } @@ -1370,7 +1399,7 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish, ldout(cct, 10) << "readx waiting on tid " << o->last_write_tid << " on " << *o << dendl; o->waitfor_commit[o->last_write_tid].push_back( - new C_RetryRead(this,rd, oset, onfinish, trace)); + new C_RetryRead(this,rd, oset, onfinish, *trace)); // FIXME: perfcounter! 
return 0; } @@ -1399,7 +1428,7 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish, // map extent into bufferheads map hits, missing, rx, errors; - o->map_read(*ex_it, hits, missing, rx, errors, trace); + o->map_read(*ex_it, hits, missing, rx, errors); if (external_call) { // retry reading error buffers missing.insert(errors.begin(), errors.end()); @@ -1428,14 +1457,14 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish, << (MAX(rx_bytes, max_size) - max_size) << " read bytes" << dendl; waitfor_read.push_back(new C_RetryRead(this, rd, oset, onfinish, - trace)); + *trace)); } bh_remove(o, bh_it->second); delete bh_it->second; } else { bh_it->second->set_nocache(nocache); - bh_read(bh_it->second, rd->fadvise_flags); + bh_read(bh_it->second, rd->fadvise_flags, *trace); if ((success && onfinish) || last != missing.end()) last = bh_it; } @@ -1447,7 +1476,7 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish, ldout(cct, 10) << "readx missed, waiting on " << *last->second << " off " << last->first << dendl; last->second->waitfor_read[last->first].push_back( - new C_RetryRead(this, rd, oset, onfinish, trace) ); + new C_RetryRead(this, rd, oset, onfinish, *trace) ); } @@ -1460,7 +1489,7 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish, ldout(cct, 10) << "readx missed, waiting on " << *bh_it->second << " off " << bh_it->first << dendl; bh_it->second->waitfor_read[bh_it->first].push_back( - new C_RetryRead(this, rd, oset, onfinish, trace) ); + new C_RetryRead(this, rd, oset, onfinish, *trace) ); } bytes_not_in_cache += bh_it->second->length(); success = false; @@ -1628,7 +1657,7 @@ void ObjectCacher::retry_waiting_reads() } int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace, - ZTracer::Trace *trace) + ZTracer::Trace *parent_trace) { assert(lock.is_locked()); ceph::real_time now = ceph::real_clock::now(); @@ -1637,6 +1666,12 @@ int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace, bool dontneed = wr->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_DONTNEED; bool nocache = wr->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_NOCACHE; + ZTracer::Trace trace; + if (parent_trace != nullptr) { + trace.init("write", &trace_endpoint, parent_trace); + trace.event("start"); + } + for (vector::iterator ex_it = wr->extents.begin(); ex_it != wr->extents.end(); ++ex_it) { @@ -1646,10 +1681,10 @@ int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace, ex_it->truncate_size, oset->truncate_seq); // map it all into a single bufferhead. 
- BufferHead *bh = o->map_write(*ex_it, wr->journal_tid, trace); + BufferHead *bh = o->map_write(*ex_it, wr->journal_tid); bool missing = bh->is_missing(); bh->snapc = wr->snapc; - + bytes_written += ex_it->length; if (bh->is_tx()) { bytes_written_in_flush += ex_it->length; @@ -1708,7 +1743,7 @@ int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace, } } - int r = _wait_for_write(wr, bytes_written, oset, onfreespace); + int r = _wait_for_write(wr, bytes_written, oset, &trace, onfreespace); delete wr; //verify_stats(); @@ -1718,23 +1753,26 @@ int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace, class ObjectCacher::C_WaitForWrite : public Context { public: - C_WaitForWrite(ObjectCacher *oc, uint64_t len, Context *onfinish) : - m_oc(oc), m_len(len), m_onfinish(onfinish) {} + C_WaitForWrite(ObjectCacher *oc, uint64_t len, + const ZTracer::Trace &trace, Context *onfinish) : + m_oc(oc), m_len(len), m_trace(trace), m_onfinish(onfinish) {} void finish(int r) override; private: ObjectCacher *m_oc; uint64_t m_len; + ZTracer::Trace m_trace; Context *m_onfinish; }; void ObjectCacher::C_WaitForWrite::finish(int r) { Mutex::Locker l(m_oc->lock); - m_oc->maybe_wait_for_writeback(m_len); + m_oc->maybe_wait_for_writeback(m_len, &m_trace); m_onfinish->complete(r); } -void ObjectCacher::maybe_wait_for_writeback(uint64_t len) +void ObjectCacher::maybe_wait_for_writeback(uint64_t len, + ZTracer::Trace *trace) { assert(lock.is_locked()); ceph::mono_time start = ceph::mono_clock::now(); @@ -1747,6 +1785,9 @@ void ObjectCacher::maybe_wait_for_writeback(uint64_t len) while (get_stat_dirty() + get_stat_tx() > 0 && (uint64_t) (get_stat_dirty() + get_stat_tx()) >= max_dirty + get_stat_dirty_waiting()) { + if (blocked == 0) { + trace->event("start wait for writeback"); + } ldout(cct, 10) << __func__ << " waiting for dirty|tx " << (get_stat_dirty() + get_stat_tx()) << " >= max " << max_dirty << " + dirty_waiting " @@ -1758,6 +1799,9 @@ void ObjectCacher::maybe_wait_for_writeback(uint64_t len) ++blocked; ldout(cct, 10) << __func__ << " woke up" << dendl; } + if (blocked > 0) { + trace->event("finish wait for writeback"); + } if (blocked && perfcounter) { perfcounter->inc(l_objectcacher_write_ops_blocked); perfcounter->inc(l_objectcacher_write_bytes_blocked, len); @@ -1768,19 +1812,20 @@ void ObjectCacher::maybe_wait_for_writeback(uint64_t len) // blocking wait for write. int ObjectCacher::_wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset, - Context *onfreespace) + ZTracer::Trace *trace, Context *onfreespace) { assert(lock.is_locked()); + assert(trace != nullptr); int ret = 0; if (max_dirty > 0) { if (block_writes_upfront) { - maybe_wait_for_writeback(len); + maybe_wait_for_writeback(len, trace); if (onfreespace) onfreespace->complete(0); } else { assert(onfreespace); - finisher.queue(new C_WaitForWrite(this, len, onfreespace)); + finisher.queue(new C_WaitForWrite(this, len, *trace, onfreespace)); } } else { // write-thru! flush what we just wrote. @@ -1789,7 +1834,7 @@ int ObjectCacher::_wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset, Context *fin = block_writes_upfront ? new C_Cond(&cond, &done, &ret) : onfreespace; assert(fin); - bool flushed = flush_set(oset, wr->extents, fin); + bool flushed = flush_set(oset, wr->extents, trace, fin); assert(!flushed); // we just dirtied it, and didn't drop our lock! 
ldout(cct, 10) << "wait_for_write waiting on write-thru of " << len << " bytes" << dendl; @@ -1828,12 +1873,19 @@ void ObjectCacher::flusher_entry() << max_dirty << " max)" << dendl; loff_t actual = get_stat_dirty() + get_stat_dirty_waiting(); + + ZTracer::Trace trace; + if (cct->_conf->osdc_blkin_trace_all) { + trace.init("flusher", &trace_endpoint); + trace.event("start"); + } + if (actual > 0 && (uint64_t) actual > target_dirty) { // flush some dirty pages ldout(cct, 10) << "flusher " << get_stat_dirty() << " dirty + " << get_stat_dirty_waiting() << " dirty_waiting > target " << target_dirty << ", flushing some dirty bhs" << dendl; - flush(actual - target_dirty); + flush(&trace, actual - target_dirty); } else { // check tail of lru for old dirty items ceph::real_time cutoff = ceph::real_clock::now(); @@ -1848,17 +1900,20 @@ void ObjectCacher::flusher_entry() if (scattered_write) { bh_write_adjacencies(bh, cutoff, NULL, &max); } else { - bh_write(bh); + bh_write(bh, trace); --max; } } if (!max) { // back off the lock to avoid starving other threads + trace.event("backoff"); lock.Unlock(); lock.Lock(); continue; } } + + trace.event("finish"); if (flusher_stop) break; @@ -1955,8 +2010,10 @@ void ObjectCacher::purge(Object *ob) // true if clean, already flushed. // false if we wrote something. // be sloppy about the ranges and flush any buffer it touches -bool ObjectCacher::flush(Object *ob, loff_t offset, loff_t length) +bool ObjectCacher::flush(Object *ob, loff_t offset, loff_t length, + ZTracer::Trace *trace) { + assert(trace != nullptr); assert(lock.is_locked()); list blist; bool clean = true; @@ -1980,7 +2037,7 @@ bool ObjectCacher::flush(Object *ob, loff_t offset, loff_t length) if (scattered_write) blist.push_back(bh); else - bh_write(bh); + bh_write(bh, *trace); clean = false; } if (scattered_write && !blist.empty()) @@ -2029,7 +2086,7 @@ bool ObjectCacher::flush_set(ObjectSet *oset, Context *onfinish) // Buffer heads in dirty_or_tx_bh are sorted in ObjectSet/Object/offset // order. But items in oset->objects are not sorted. So the iterator can // point to any buffer head in the ObjectSet - BufferHead key(*oset->objects.begin(), nullptr); + BufferHead key(*oset->objects.begin()); it = dirty_or_tx_bh.lower_bound(&key); p = q = it; @@ -2056,7 +2113,7 @@ bool ObjectCacher::flush_set(ObjectSet *oset, Context *onfinish) } blist.push_back(bh); } else { - bh_write(bh); + bh_write(bh, {}); } } } @@ -2082,7 +2139,7 @@ bool ObjectCacher::flush_set(ObjectSet *oset, Context *onfinish) } blist.push_front(bh); } else { - bh_write(bh); + bh_write(bh, {}); } } if (!backwards) @@ -2109,9 +2166,10 @@ bool ObjectCacher::flush_set(ObjectSet *oset, Context *onfinish) // flush. non-blocking, takes callback. // returns true if already flushed bool ObjectCacher::flush_set(ObjectSet *oset, vector& exv, - Context *onfinish) + ZTracer::Trace *trace, Context *onfinish) { assert(lock.is_locked()); + assert(trace != nullptr); assert(onfinish != NULL); if (oset->objects.empty()) { ldout(cct, 10) << "flush_set on " << oset << " dne" << dendl; @@ -2137,7 +2195,7 @@ bool ObjectCacher::flush_set(ObjectSet *oset, vector& exv, ldout(cct, 20) << "flush_set " << oset << " ex " << ex << " ob " << soid << " " << ob << dendl; - if (!flush(ob, ex.offset, ex.length)) { + if (!flush(ob, ex.offset, ex.length, trace)) { // we'll need to gather... 
ldout(cct, 10) << "flush_set " << oset << " will wait for ack tid " << ob->last_write_tid << " on " << *ob << dendl; @@ -2181,7 +2239,7 @@ bool ObjectCacher::flush_all(Context *onfinish) } blist.push_back(bh); } else { - bh_write(bh); + bh_write(bh, {}); } } diff --git a/src/osdc/ObjectCacher.h b/src/osdc/ObjectCacher.h index c6c49deb2b5..31201a72354 100644 --- a/src/osdc/ObjectCacher.h +++ b/src/osdc/ObjectCacher.h @@ -131,10 +131,9 @@ class ObjectCacher { int error; // holds return value for failed reads map > waitfor_read; - ZTracer::Trace b_trace; // cons - explicit BufferHead(Object *o, ZTracer::Trace *trace) : + explicit BufferHead(Object *o) : state(STATE_MISSING), ref(0), dontneed(false), @@ -145,9 +144,6 @@ class ObjectCacher { journal_tid(0), error(0) { ex.start = ex.length = 0; - if (trace) { - b_trace = *trace; - } } // extent @@ -353,11 +349,9 @@ class ObjectCacher { map& hits, map& missing, map& rx, - map& errors, - ZTracer::Trace *trace); - BufferHead *map_write(ObjectExtent &ex, ceph_tid_t tid, - ZTracer::Trace *trace); - + map& errors); + BufferHead *map_write(ObjectExtent &ex, ceph_tid_t tid); + void replace_journal_tid(BufferHead *bh, ceph_tid_t tid); void truncate(loff_t s); void discard(loff_t off, loff_t len); @@ -410,6 +404,8 @@ class ObjectCacher { ceph::timespan max_dirty_age; bool block_writes_upfront; + ZTracer::Endpoint trace_endpoint; + flush_set_callback_t flush_set_callback; void *flush_set_callback_arg; @@ -526,14 +522,15 @@ class ObjectCacher { void bh_remove(Object *ob, BufferHead *bh); // io - void bh_read(BufferHead *bh, int op_flags); - void bh_write(BufferHead *bh); + void bh_read(BufferHead *bh, int op_flags, + const ZTracer::Trace &parent_trace); + void bh_write(BufferHead *bh, const ZTracer::Trace &parent_trace); void bh_write_scattered(list& blist); void bh_write_adjacencies(BufferHead *bh, ceph::real_time cutoff, int64_t *amount, int *max_count); void trim(); - void flush(loff_t amount=0); + void flush(ZTracer::Trace *trace, loff_t amount=0); /** * flush a range of buffers @@ -546,7 +543,8 @@ class ObjectCacher { * @param len extent length, or 0 for entire object * @return true if object was already clean/flushed. 
*/ - bool flush(Object *o, loff_t off, loff_t len); + bool flush(Object *o, loff_t off, loff_t len, + ZTracer::Trace *trace); loff_t release(Object *o); void purge(Object *o); @@ -605,17 +603,17 @@ class ObjectCacher { * the return value is total bytes read */ int readx(OSDRead *rd, ObjectSet *oset, Context *onfinish, - ZTracer::Trace *trace = nullptr); + ZTracer::Trace *parent_trace = nullptr); int writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace, - ZTracer::Trace *trace = nullptr); + ZTracer::Trace *parent_trace = nullptr); bool is_cached(ObjectSet *oset, vector& extents, snapid_t snapid); private: // write blocking int _wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset, - Context *onfreespace); - void maybe_wait_for_writeback(uint64_t len); + ZTracer::Trace *trace, Context *onfreespace); + void maybe_wait_for_writeback(uint64_t len, ZTracer::Trace *trace); bool _flush_set_finish(C_GatherBuilder *gather, Context *onfinish); public: @@ -625,7 +623,7 @@ public: bool flush_set(ObjectSet *oset, Context *onfinish=0); bool flush_set(ObjectSet *oset, vector& ex, - Context *onfinish = 0); + ZTracer::Trace *trace, Context *onfinish = 0); bool flush_all(Context *onfinish = 0); void purge_set(ObjectSet *oset); @@ -698,7 +696,8 @@ public: vector extents; Striper::file_to_extents(cct, oset->ino, layout, offset, len, oset->truncate_size, extents); - return flush_set(oset, extents, onfinish); + ZTracer::Trace trace; + return flush_set(oset, extents, &trace, onfinish); } }; diff --git a/src/osdc/WritebackHandler.h b/src/osdc/WritebackHandler.h index 4485f7e4605..bf90fdd0ac0 100644 --- a/src/osdc/WritebackHandler.h +++ b/src/osdc/WritebackHandler.h @@ -15,8 +15,8 @@ class WritebackHandler { virtual void read(const object_t& oid, uint64_t object_no, const object_locator_t& oloc, uint64_t off, uint64_t len, snapid_t snapid, bufferlist *pbl, uint64_t trunc_size, - __u32 trunc_seq, int op_flags, Context *onfinish, - ZTracer::Trace *trace = nullptr) = 0; + __u32 trunc_seq, int op_flags, + const ZTracer::Trace &parent_trace, Context *onfinish) = 0; /** * check if a given extent read result may change due to a write * @@ -35,8 +35,9 @@ class WritebackHandler { const SnapContext& snapc, const bufferlist &bl, ceph::real_time mtime, uint64_t trunc_size, __u32 trunc_seq, - ceph_tid_t journal_tid, Context *oncommit, - ZTracer::Trace *trace = nullptr) = 0; + ceph_tid_t journal_tid, + const ZTracer::Trace &parent_trace, + Context *oncommit) = 0; virtual void overwrite_extent(const object_t& oid, uint64_t off, uint64_t len, ceph_tid_t original_journal_tid, diff --git a/src/test/osdc/FakeWriteback.cc b/src/test/osdc/FakeWriteback.cc index 29d94fc25e1..72e80e433f6 100644 --- a/src/test/osdc/FakeWriteback.cc +++ b/src/test/osdc/FakeWriteback.cc @@ -62,8 +62,9 @@ void FakeWriteback::read(const object_t& oid, uint64_t object_no, const object_locator_t& oloc, uint64_t off, uint64_t len, snapid_t snapid, bufferlist *pbl, uint64_t trunc_size, - __u32 trunc_seq, int op_flags, Context *onfinish, - ZTracer::Trace *trace) + __u32 trunc_seq, int op_flags, + const ZTracer::Trace &parent_trace, + Context *onfinish) { C_Delay *wrapper = new C_Delay(m_cct, onfinish, m_lock, off, pbl, m_delay_ns); @@ -76,8 +77,9 @@ ceph_tid_t FakeWriteback::write(const object_t& oid, const SnapContext& snapc, const bufferlist &bl, ceph::real_time mtime, uint64_t trunc_size, __u32 trunc_seq, - ceph_tid_t journal_tid, Context *oncommit, - ZTracer::Trace *trace) + ceph_tid_t journal_tid, + const ZTracer::Trace &parent_trace, + Context 
*oncommit) { C_Delay *wrapper = new C_Delay(m_cct, oncommit, m_lock, off, NULL, m_delay_ns); diff --git a/src/test/osdc/FakeWriteback.h b/src/test/osdc/FakeWriteback.h index d0c5449a828..6112eb72082 100644 --- a/src/test/osdc/FakeWriteback.h +++ b/src/test/osdc/FakeWriteback.h @@ -20,15 +20,17 @@ public: void read(const object_t& oid, uint64_t object_no, const object_locator_t& oloc, uint64_t off, uint64_t len, snapid_t snapid, bufferlist *pbl, uint64_t trunc_size, - __u32 trunc_seq, int op_flags, Context *onfinish, - ZTracer::Trace *trace) override; + __u32 trunc_seq, int op_flags, + const ZTracer::Trace &parent_trace, + Context *onfinish) override; ceph_tid_t write(const object_t& oid, const object_locator_t& oloc, uint64_t off, uint64_t len, const SnapContext& snapc, const bufferlist &bl, ceph::real_time mtime, uint64_t trunc_size, __u32 trunc_seq, ceph_tid_t journal_tid, - Context *oncommit, ZTracer::Trace *trace) override; + const ZTracer::Trace &parent_trace, + Context *oncommit) override; using WritebackHandler::write; diff --git a/src/test/osdc/MemWriteback.cc b/src/test/osdc/MemWriteback.cc index cc9bda9fe98..e9e1f9fe345 100644 --- a/src/test/osdc/MemWriteback.cc +++ b/src/test/osdc/MemWriteback.cc @@ -92,8 +92,9 @@ void MemWriteback::read(const object_t& oid, uint64_t object_no, const object_locator_t& oloc, uint64_t off, uint64_t len, snapid_t snapid, bufferlist *pbl, uint64_t trunc_size, - __u32 trunc_seq, int op_flags, Context *onfinish, - ZTracer::Trace *trace) + __u32 trunc_seq, int op_flags, + const ZTracer::Trace &parent_trace, + Context *onfinish) { assert(snapid == CEPH_NOSNAP); C_DelayRead *wrapper = new C_DelayRead(this, m_cct, onfinish, m_lock, oid, @@ -107,8 +108,9 @@ ceph_tid_t MemWriteback::write(const object_t& oid, const SnapContext& snapc, const bufferlist &bl, ceph::real_time mtime, uint64_t trunc_size, __u32 trunc_seq, - ceph_tid_t journal_tid, Context *oncommit, - ZTracer::Trace *trace) + ceph_tid_t journal_tid, + const ZTracer::Trace &parent_trace, + Context *oncommit) { assert(snapc.seq == 0); C_DelayWrite *wrapper = new C_DelayWrite(this, m_cct, oncommit, m_lock, oid, diff --git a/src/test/osdc/MemWriteback.h b/src/test/osdc/MemWriteback.h index c4daae937b2..a073cbf1760 100644 --- a/src/test/osdc/MemWriteback.h +++ b/src/test/osdc/MemWriteback.h @@ -20,15 +20,17 @@ public: void read(const object_t& oid, uint64_t object_no, const object_locator_t& oloc, uint64_t off, uint64_t len, snapid_t snapid, bufferlist *pbl, uint64_t trunc_size, - __u32 trunc_seq, int op_flags, Context *onfinish, - ZTracer::Trace *trace) override; + __u32 trunc_seq, int op_flags, + const ZTracer::Trace &parent_trace, + Context *onfinish) override; ceph_tid_t write(const object_t& oid, const object_locator_t& oloc, uint64_t off, uint64_t len, const SnapContext& snapc, const bufferlist &bl, ceph::real_time mtime, uint64_t trunc_size, __u32 trunc_seq, ceph_tid_t journal_tid, - Context *oncommit, ZTracer::Trace *trace) override; + const ZTracer::Trace &parent_trace, + Context *oncommit) override; using WritebackHandler::write;
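
For reference, a minimal sketch of the parent/child span pattern this change introduces: WritebackHandler::read()/write() now take the caller's trace by const reference, and bh_read()/bh_write()/LibrbdWriteback::write() open a named child span only when that parent trace is valid. The sketch below is illustrative and not part of the commit; the function, endpoint, and span names are hypothetical, the include path is assumed from the Ceph tree, and only ZTracer calls that already appear in this diff are used.

#include "common/zipkin_trace.h"   // assumed header for the ZTracer wrappers

static ZTracer::Endpoint example_endpoint("example");

void traced_helper(const ZTracer::Trace &parent_trace)
{
  ZTracer::Trace trace;   // default-constructed trace is invalid; calls on it are no-ops
  if (parent_trace.valid()) {
    trace.init("", &example_endpoint, &parent_trace);  // child span of the caller's operation
    trace.copy_name("example helper");                 // readable span name, as bh_read/bh_write do
    trace.event("start");
  }

  // ... perform the actual I/O here; trace.get_info() (or nullptr when the
  // trace is invalid) can be forwarded with the backend request, as
  // LibrbdWriteback::read() does for aio_operate() ...

  trace.event("finish");  // harmless no-op if the trace was never initialized
}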