]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
blkin: improve trace visibility into object cacher
authorJason Dillaman <dillaman@redhat.com>
Fri, 12 May 2017 18:37:39 +0000 (14:37 -0400)
committerJason Dillaman <dillaman@redhat.com>
Thu, 18 May 2017 22:13:27 +0000 (18:13 -0400)
Signed-off-by: Jason Dillaman <dillaman@redhat.com>
src/client/ObjecterWriteback.h
src/librbd/LibrbdWriteback.cc
src/librbd/LibrbdWriteback.h
src/osdc/ObjectCacher.cc
src/osdc/ObjectCacher.h
src/osdc/WritebackHandler.h
src/test/osdc/FakeWriteback.cc
src/test/osdc/FakeWriteback.h
src/test/osdc/MemWriteback.cc
src/test/osdc/MemWriteback.h

index ca76472adda395b37e7b3b734c2f2ae9123eae48..8928437646ce2ee8d6a605f43e4be095811b058d 100644 (file)
@@ -17,8 +17,9 @@ class ObjecterWriteback : public WritebackHandler {
   void read(const object_t& oid, uint64_t object_no,
                    const object_locator_t& oloc, uint64_t off, uint64_t len,
                    snapid_t snapid, bufferlist *pbl, uint64_t trunc_size,
-                   __u32 trunc_seq, int op_flags, Context *onfinish,
-                   ZTracer::Trace *trace) override {
+                   __u32 trunc_seq, int op_flags,
+                    const ZTracer::Trace &parent_trace,
+                    Context *onfinish) override {
     m_objecter->read_trunc(oid, oloc, off, len, snapid, pbl, 0,
                           trunc_size, trunc_seq,
                           new C_OnFinisher(new C_Lock(m_lock, onfinish),
@@ -35,7 +36,8 @@ class ObjecterWriteback : public WritebackHandler {
                           const SnapContext& snapc, const bufferlist &bl,
                           ceph::real_time mtime, uint64_t trunc_size,
                           __u32 trunc_seq, ceph_tid_t journal_tid,
-                          Context *oncommit, ZTracer::Trace *trace) override {
+                           const ZTracer::Trace &parent_trace,
+                          Context *oncommit) override {
     return m_objecter->write_trunc(oid, oloc, off, len, snapc, bl, mtime, 0,
                                   trunc_size, trunc_seq,
                                   new C_OnFinisher(new C_Lock(m_lock,
index a30bc9adf72fe2fb3ed448b9ea2538c009687db8..ed5e9d107012d9598b5d25c66ee3929f94c96cfe 100644 (file)
@@ -70,8 +70,8 @@ namespace librbd {
   class C_OrderedWrite : public Context {
   public:
     C_OrderedWrite(CephContext *cct, LibrbdWriteback::write_result_d *result,
-                  LibrbdWriteback *wb)
-      : m_cct(cct), m_result(result), m_wb_handler(wb) {}
+                  const ZTracer::Trace &trace, LibrbdWriteback *wb)
+      : m_cct(cct), m_result(result), m_trace(trace), m_wb_handler(wb) {}
     ~C_OrderedWrite() override {}
     void finish(int r) override {
       ldout(m_cct, 20) << "C_OrderedWrite completing " << m_result << dendl;
@@ -83,10 +83,12 @@ namespace librbd {
        m_wb_handler->complete_writes(m_result->oid);
       }
       ldout(m_cct, 20) << "C_OrderedWrite finished " << m_result << dendl;
+      m_trace.event("finish");
     }
   private:
     CephContext *m_cct;
     LibrbdWriteback::write_result_d *m_result;
+    ZTracer::Trace m_trace;
     LibrbdWriteback *m_wb_handler;
   };
 
@@ -196,8 +198,9 @@ namespace librbd {
                             const object_locator_t& oloc,
                             uint64_t off, uint64_t len, snapid_t snapid,
                             bufferlist *pbl, uint64_t trunc_size,
-                            __u32 trunc_seq, int op_flags, Context *onfinish,
-                            ZTracer::Trace *trace)
+                            __u32 trunc_seq, int op_flags,
+                             const ZTracer::Trace &parent_trace,
+                             Context *onfinish)
   {
     // on completion, take the mutex and then call onfinish.
     Context *req = new C_ReadRequest(m_ictx->cct, onfinish, &m_lock);
@@ -220,7 +223,7 @@ namespace librbd {
       util::create_rados_callback(req);
     int r = m_ictx->data_ctx.aio_operate(
       oid.name, rados_completion, &op, flags, nullptr,
-      (trace ? trace->get_info() : nullptr));
+      (parent_trace.valid() ? parent_trace.get_info() : nullptr));
     rados_completion->release();
     assert(r >= 0);
   }
@@ -256,15 +259,23 @@ namespace librbd {
                                    const bufferlist &bl,
                                    ceph::real_time mtime, uint64_t trunc_size,
                                    __u32 trunc_seq, ceph_tid_t journal_tid,
-                                   Context *oncommit,
-                                   ZTracer::Trace *trace)
+                                    const ZTracer::Trace &parent_trace,
+                                   Context *oncommit)
   {
+    ZTracer::Trace trace;
+    if (parent_trace.valid()) {
+      trace.init("", nullptr, &parent_trace);
+      trace.copy_name("writeback " + oid.name);
+      trace.event("start");
+    }
+
     uint64_t object_no = oid_to_object_no(oid.name, m_ictx->object_prefix);
 
     write_result_d *result = new write_result_d(oid.name, oncommit);
     m_writes[oid.name].push(result);
     ldout(m_ictx->cct, 20) << "write will wait for result " << result << dendl;
-    C_OrderedWrite *req_comp = new C_OrderedWrite(m_ictx->cct, result, this);
+    C_OrderedWrite *req_comp = new C_OrderedWrite(m_ictx->cct, result, trace,
+                                                  this);
 
     // all IO operations are flushed prior to closing the journal
     assert(journal_tid == 0 || m_ictx->journal != NULL);
index 72db4dfa61ba889b9cde0a2e40f4f69eebe6417e..6ffba51197974d22e4ecbaeaa8b12a558b16b7a1 100644 (file)
@@ -24,8 +24,8 @@ namespace librbd {
     void read(const object_t& oid, uint64_t object_no,
               const object_locator_t& oloc, uint64_t off, uint64_t len,
               snapid_t snapid, bufferlist *pbl, uint64_t trunc_size,
-              __u32 trunc_seq, int op_flags, Context *onfinish,
-             ZTracer::Trace *trace) override;
+              __u32 trunc_seq, int op_flags,
+             const ZTracer::Trace &parent_trace, Context *onfinish) override;
 
     // Determine whether a read to this extent could be affected by a
     // write-triggered copy-on-write
@@ -38,7 +38,8 @@ namespace librbd {
                      const SnapContext& snapc, const bufferlist &bl,
                      ceph::real_time mtime, uint64_t trunc_size,
                      __u32 trunc_seq, ceph_tid_t journal_tid,
-                     Context *oncommit, ZTracer::Trace *trace) override;
+                     const ZTracer::Trace &parent_trace,
+                     Context *oncommit) override;
     using WritebackHandler::write;
 
     void overwrite_extent(const object_t& oid, uint64_t off,
index 36633b0c5b435254ce5c3eb16bd46fe49a5910b6..a77d6b318834796c945fbed137d7e262f0f8f5f0 100644 (file)
@@ -36,19 +36,21 @@ class ObjectCacher::C_ReadFinish : public Context {
   xlist<C_ReadFinish*>::item set_item;
   bool trust_enoent;
   ceph_tid_t tid;
+  ZTracer::Trace trace;
 
 public:
   bufferlist bl;
   C_ReadFinish(ObjectCacher *c, Object *ob, ceph_tid_t t, loff_t s,
-              uint64_t l) :
+              uint64_t l, const ZTracer::Trace &trace) :
     oc(c), poolid(ob->oloc.pool), oid(ob->get_soid()), start(s), length(l),
     set_item(this), trust_enoent(true),
-    tid(t) {
+    tid(t), trace(trace) {
     ob->reads.push_back(&set_item);
   }
 
   void finish(int r) override {
     oc->bh_read_finish(poolid, oid, tid, start, length, bl, r, trust_enoent);
+    trace.event("finish");
 
     // object destructor clears the list
     if (set_item.is_on_list())
@@ -65,24 +67,25 @@ class ObjectCacher::C_RetryRead : public Context {
   OSDRead *rd;
   ObjectSet *oset;
   Context *onfinish;
-  ZTracer::Trace r_trace;
+  ZTracer::Trace trace;
 public:
   C_RetryRead(ObjectCacher *_oc, OSDRead *r, ObjectSet *os, Context *c,
-             ZTracer::Trace *trace)
-    : oc(_oc), rd(r), oset(os), onfinish(c) {
-      if (trace) {
-        r_trace = *trace;
-      }
-    }
+             const ZTracer::Trace &trace)
+    : oc(_oc), rd(r), oset(os), onfinish(c), trace(trace) {
+  }
   void finish(int r) override {
-    if (r < 0) {
-      if (onfinish)
-        onfinish->complete(r);
+    if (r >= 0) {
+      r = oc->_readx(rd, oset, onfinish, false, &trace);
+    }
+
+    if (r == 0) {
+      // read is still in-progress
       return;
     }
-    int ret = oc->_readx(rd, oset, onfinish, false, &r_trace);
-    if (ret != 0 && onfinish) {
-      onfinish->complete(ret);
+
+    trace.event("finish");
+    if (onfinish) {
+      onfinish->complete(r);
     }
   }
 };
@@ -94,7 +97,7 @@ ObjectCacher::BufferHead *ObjectCacher::Object::split(BufferHead *left,
   ldout(oc->cct, 20) << "split " << *left << " at " << off << dendl;
 
   // split off right
-  ObjectCacher::BufferHead *right = new BufferHead(this, &left->b_trace);
+  ObjectCacher::BufferHead *right = new BufferHead(this);
 
   //inherit and if later access, this auto clean.
   right->set_dontneed(left->get_dontneed());
@@ -277,14 +280,12 @@ int ObjectCacher::Object::map_read(ObjectExtent &ex,
                                    map<loff_t, BufferHead*>& hits,
                                    map<loff_t, BufferHead*>& missing,
                                    map<loff_t, BufferHead*>& rx,
-                                  map<loff_t, BufferHead*>& errors,
-                                  ZTracer::Trace *trace)
+                                  map<loff_t, BufferHead*>& errors)
 {
   assert(oc->lock.is_locked());
-  ldout(oc->cct, 10) << "map_read " << ex.oid 
-              << " " << ex.offset << "~" << ex.length
-              << dendl;
-  
+  ldout(oc->cct, 10) << "map_read " << ex.oid << " "
+                     << ex.offset << "~" << ex.length << dendl;
+
   loff_t cur = ex.offset;
   loff_t left = ex.length;
 
@@ -293,7 +294,7 @@ int ObjectCacher::Object::map_read(ObjectExtent &ex,
     // at end?
     if (p == data.end()) {
       // rest is a miss.
-      BufferHead *n = new BufferHead(this, trace);
+      BufferHead *n = new BufferHead(this);
       n->set_start(cur);
       n->set_length(left);
       oc->bh_add(this, n);
@@ -309,7 +310,7 @@ int ObjectCacher::Object::map_read(ObjectExtent &ex,
       assert(cur == (loff_t)ex.offset + (loff_t)ex.length);
       break;  // no more.
     }
-    
+
     if (p->first <= cur) {
       // have it (or part of it)
       BufferHead *e = p->second;
@@ -329,17 +330,17 @@ int ObjectCacher::Object::map_read(ObjectExtent &ex,
       } else {
         ceph_abort();
       }
-      
+
       loff_t lenfromcur = MIN(e->end() - cur, left);
       cur += lenfromcur;
       left -= lenfromcur;
       ++p;
       continue;  // more?
-      
+
     } else if (p->first > cur) {
       // gap.. miss
       loff_t next = p->first;
-      BufferHead *n = new BufferHead(this, trace);
+      BufferHead *n = new BufferHead(this);
       loff_t len = MIN(next - cur, left);
       n->set_start(cur);
       n->set_length(len);
@@ -403,8 +404,7 @@ void ObjectCacher::Object::audit_buffers()
  * other dirty data to left and/or right.
  */
 ObjectCacher::BufferHead *ObjectCacher::Object::map_write(ObjectExtent &ex,
-                                                         ceph_tid_t tid,
-                                                         ZTracer::Trace *trace)
+                                                         ceph_tid_t tid)
 {
   assert(oc->lock.is_locked());
   BufferHead *final = 0;
@@ -422,7 +422,7 @@ ObjectCacher::BufferHead *ObjectCacher::Object::map_write(ObjectExtent &ex,
     // at end ?
     if (p == data.end()) {
       if (final == NULL) {
-        final = new BufferHead(this, trace);
+        final = new BufferHead(this);
         replace_journal_tid(final, tid);
         final->set_start( cur );
         final->set_length( max );
@@ -498,7 +498,7 @@ ObjectCacher::BufferHead *ObjectCacher::Object::map_write(ObjectExtent &ex,
         final->set_length(final->length() + glen);
         oc->bh_stat_add(final);
       } else {
-        final = new BufferHead(this, trace);
+        final = new BufferHead(this);
        replace_journal_tid(final, tid);
         final->set_start( cur );
         final->set_length( glen );
@@ -620,6 +620,7 @@ ObjectCacher::ObjectCacher(CephContext *cct_, string name,
     max_size(max_bytes), max_objects(max_objects),
     max_dirty_age(ceph::make_timespan(max_dirty_age)),
     block_writes_upfront(block_writes_upfront),
+    trace_endpoint("ObjectCacher"),
     flush_set_callback(flush_callback),
     flush_set_callback_arg(flush_callback_arg),
     last_read_tid(0), flusher_stop(false), flusher_thread(this),finisher(cct),
@@ -732,24 +733,32 @@ void ObjectCacher::close_object(Object *ob)
   delete ob;
 }
 
-void ObjectCacher::bh_read(BufferHead *bh, int op_flags)
+void ObjectCacher::bh_read(BufferHead *bh, int op_flags,
+                           const ZTracer::Trace &parent_trace)
 {
   assert(lock.is_locked());
   ldout(cct, 7) << "bh_read on " << *bh << " outstanding reads "
                << reads_outstanding << dendl;
 
+  ZTracer::Trace trace;
+  if (parent_trace.valid()) {
+    trace.init("", &trace_endpoint, &parent_trace);
+    trace.copy_name("bh_read " + bh->ob->get_oid().name);
+    trace.event("start");
+  }
+
   mark_rx(bh);
   bh->last_read_tid = ++last_read_tid;
 
   // finisher
   C_ReadFinish *onfinish = new C_ReadFinish(this, bh->ob, bh->last_read_tid,
-                                           bh->start(), bh->length());
+                                           bh->start(), bh->length(), trace);
   // go
   writeback_handler.read(bh->ob->get_oid(), bh->ob->get_object_number(),
                         bh->ob->get_oloc(), bh->start(), bh->length(),
                         bh->ob->get_snap(), &onfinish->bl,
                         bh->ob->truncate_size, bh->ob->truncate_seq,
-                        op_flags, onfinish, &bh->b_trace);
+                        op_flags, trace, onfinish);
 
   ++reads_outstanding;
 }
@@ -987,11 +996,12 @@ class ObjectCacher::C_WriteCommit : public Context {
   int64_t poolid;
   sobject_t oid;
   vector<pair<loff_t, uint64_t> > ranges;
+  ZTracer::Trace trace;
 public:
-  ceph_tid_t tid;
+  ceph_tid_t tid = 0;
   C_WriteCommit(ObjectCacher *c, int64_t _poolid, sobject_t o, loff_t s,
-               uint64_t l) :
-    oc(c), poolid(_poolid), oid(o), tid(0) {
+               uint64_t l, const ZTracer::Trace &trace) :
+    oc(c), poolid(_poolid), oid(o), trace(trace) {
       ranges.push_back(make_pair(s, l));
     }
   C_WriteCommit(ObjectCacher *c, int64_t _poolid, sobject_t o,
@@ -1001,6 +1011,7 @@ public:
     }
   void finish(int r) override {
     oc->bh_write_commit(poolid, oid, ranges, tid, r);
+    trace.event("finish");
   }
 };
 void ObjectCacher::bh_write_scattered(list<BufferHead*>& blist)
@@ -1056,17 +1067,24 @@ void ObjectCacher::bh_write_scattered(list<BufferHead*>& blist)
     perfcounter->inc(l_objectcacher_data_flushed, total_len);
 }
 
-void ObjectCacher::bh_write(BufferHead *bh)
+void ObjectCacher::bh_write(BufferHead *bh, const ZTracer::Trace &parent_trace)
 {
   assert(lock.is_locked());
   ldout(cct, 7) << "bh_write " << *bh << dendl;
 
   bh->ob->get();
 
+  ZTracer::Trace trace;
+  if (parent_trace.valid()) {
+    trace.init("", &trace_endpoint, &parent_trace);
+    trace.copy_name("bh_write " + bh->ob->get_oid().name);
+    trace.event("start");
+  }
+
   // finishers
   C_WriteCommit *oncommit = new C_WriteCommit(this, bh->ob->oloc.pool,
                                              bh->ob->get_soid(), bh->start(),
-                                             bh->length());
+                                             bh->length(), trace);
   // go
   ceph_tid_t tid = writeback_handler.write(bh->ob->get_oid(),
                                           bh->ob->get_oloc(),
@@ -1074,8 +1092,7 @@ void ObjectCacher::bh_write(BufferHead *bh)
                                           bh->snapc, bh->bl, bh->last_write,
                                           bh->ob->truncate_size,
                                           bh->ob->truncate_seq,
-                                          bh->journal_tid, oncommit,
-                                          &bh->b_trace);
+                                          bh->journal_tid, trace, oncommit);
   ldout(cct, 20) << " tid " << tid << " on " << bh->ob->get_oid() << dendl;
 
   // set bh last_write_tid
@@ -1200,8 +1217,9 @@ void ObjectCacher::bh_write_commit(int64_t poolid, sobject_t oid,
     finish_contexts(cct, ls, r);
 }
 
-void ObjectCacher::flush(loff_t amount)
+void ObjectCacher::flush(ZTracer::Trace *trace, loff_t amount)
 {
+  assert(trace != nullptr);
   assert(lock.is_locked());
   ceph::real_time cutoff = ceph::real_clock::now();
 
@@ -1224,9 +1242,9 @@ void ObjectCacher::flush(loff_t amount)
       bh_write_adjacencies(bh, cutoff, amount > 0 ? &left : NULL, NULL);
     } else {
       left -= bh->length();
-      bh_write(bh);
+      bh_write(bh, *trace);
     }
-  }    
+  }
 }
 
 
@@ -1300,14 +1318,25 @@ bool ObjectCacher::is_cached(ObjectSet *oset, vector<ObjectExtent>& extents,
  * returns 0 if doing async read
  */
 int ObjectCacher::readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
-                       ZTracer::Trace *trace)
+                       ZTracer::Trace *parent_trace)
 {
-  return _readx(rd, oset, onfinish, true, trace);
+  ZTracer::Trace trace;
+  if (parent_trace != nullptr) {
+    trace.init("read", &trace_endpoint, parent_trace);
+    trace.event("start");
+  }
+
+  int r =_readx(rd, oset, onfinish, true, &trace);
+  if (r < 0) {
+    trace.event("finish");
+  }
+  return r;
 }
 
 int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
                         bool external_call, ZTracer::Trace *trace)
 {
+  assert(trace != nullptr);
   assert(lock.is_locked());
   bool success = true;
   int error = 0;
@@ -1360,7 +1389,7 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
              if (scattered_write)
                blist.push_back(bh);
              else
-               bh_write(bh);
+               bh_write(bh, *trace);
            }
          }
        }
@@ -1370,7 +1399,7 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
          ldout(cct, 10) << "readx  waiting on tid " << o->last_write_tid
                         << " on " << *o << dendl;
          o->waitfor_commit[o->last_write_tid].push_back(
-           new C_RetryRead(this,rd, oset, onfinish, trace));
+           new C_RetryRead(this,rd, oset, onfinish, *trace));
          // FIXME: perfcounter!
          return 0;
        }
@@ -1399,7 +1428,7 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
 
     // map extent into bufferheads
     map<loff_t, BufferHead*> hits, missing, rx, errors;
-    o->map_read(*ex_it, hits, missing, rx, errors, trace);
+    o->map_read(*ex_it, hits, missing, rx, errors);
     if (external_call) {
       // retry reading error buffers
       missing.insert(errors.begin(), errors.end());
@@ -1428,14 +1457,14 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
                           << (MAX(rx_bytes, max_size) - max_size)
                           << " read bytes" << dendl;
            waitfor_read.push_back(new C_RetryRead(this, rd, oset, onfinish,
-                                                  trace));
+                                                  *trace));
          }
 
          bh_remove(o, bh_it->second);
          delete bh_it->second;
        } else {
          bh_it->second->set_nocache(nocache);
-         bh_read(bh_it->second, rd->fadvise_flags);
+         bh_read(bh_it->second, rd->fadvise_flags, *trace);
          if ((success && onfinish) || last != missing.end())
            last = bh_it;
        }
@@ -1447,7 +1476,7 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
        ldout(cct, 10) << "readx missed, waiting on " << *last->second
          << " off " << last->first << dendl;
        last->second->waitfor_read[last->first].push_back(
-         new C_RetryRead(this, rd, oset, onfinish, trace) );
+         new C_RetryRead(this, rd, oset, onfinish, *trace) );
 
       }
 
@@ -1460,7 +1489,7 @@ int ObjectCacher::_readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
          ldout(cct, 10) << "readx missed, waiting on " << *bh_it->second
                         << " off " << bh_it->first << dendl;
          bh_it->second->waitfor_read[bh_it->first].push_back(
-           new C_RetryRead(this, rd, oset, onfinish, trace) );
+           new C_RetryRead(this, rd, oset, onfinish, *trace) );
        }
        bytes_not_in_cache += bh_it->second->length();
        success = false;
@@ -1628,7 +1657,7 @@ void ObjectCacher::retry_waiting_reads()
 }
 
 int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace,
-                        ZTracer::Trace *trace)
+                        ZTracer::Trace *parent_trace)
 {
   assert(lock.is_locked());
   ceph::real_time now = ceph::real_clock::now();
@@ -1637,6 +1666,12 @@ int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace,
   bool dontneed = wr->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
   bool nocache = wr->fadvise_flags & LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
 
+  ZTracer::Trace trace;
+  if (parent_trace != nullptr) {
+    trace.init("write", &trace_endpoint, parent_trace);
+    trace.event("start");
+  }
+
   for (vector<ObjectExtent>::iterator ex_it = wr->extents.begin();
        ex_it != wr->extents.end();
        ++ex_it) {
@@ -1646,10 +1681,10 @@ int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace,
                           ex_it->truncate_size, oset->truncate_seq);
 
     // map it all into a single bufferhead.
-    BufferHead *bh = o->map_write(*ex_it, wr->journal_tid, trace);
+    BufferHead *bh = o->map_write(*ex_it, wr->journal_tid);
     bool missing = bh->is_missing();
     bh->snapc = wr->snapc;
-    
+
     bytes_written += ex_it->length;
     if (bh->is_tx()) {
       bytes_written_in_flush += ex_it->length;
@@ -1708,7 +1743,7 @@ int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace,
     }
   }
 
-  int r = _wait_for_write(wr, bytes_written, oset, onfreespace);
+  int r = _wait_for_write(wr, bytes_written, oset, &trace, onfreespace);
   delete wr;
 
   //verify_stats();
@@ -1718,23 +1753,26 @@ int ObjectCacher::writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace,
 
 class ObjectCacher::C_WaitForWrite : public Context {
 public:
-  C_WaitForWrite(ObjectCacher *oc, uint64_t len, Context *onfinish) :
-    m_oc(oc), m_len(len), m_onfinish(onfinish) {}
+  C_WaitForWrite(ObjectCacher *oc, uint64_t len,
+                 const ZTracer::Trace &trace, Context *onfinish) :
+    m_oc(oc), m_len(len), m_trace(trace), m_onfinish(onfinish) {}
   void finish(int r) override;
 private:
   ObjectCacher *m_oc;
   uint64_t m_len;
+  ZTracer::Trace m_trace;
   Context *m_onfinish;
 };
 
 void ObjectCacher::C_WaitForWrite::finish(int r)
 {
   Mutex::Locker l(m_oc->lock);
-  m_oc->maybe_wait_for_writeback(m_len);
+  m_oc->maybe_wait_for_writeback(m_len, &m_trace);
   m_onfinish->complete(r);
 }
 
-void ObjectCacher::maybe_wait_for_writeback(uint64_t len)
+void ObjectCacher::maybe_wait_for_writeback(uint64_t len,
+                                            ZTracer::Trace *trace)
 {
   assert(lock.is_locked());
   ceph::mono_time start = ceph::mono_clock::now();
@@ -1747,6 +1785,9 @@ void ObjectCacher::maybe_wait_for_writeback(uint64_t len)
   while (get_stat_dirty() + get_stat_tx() > 0 &&
         (uint64_t) (get_stat_dirty() + get_stat_tx()) >=
         max_dirty + get_stat_dirty_waiting()) {
+    if (blocked == 0) {
+      trace->event("start wait for writeback");
+    }
     ldout(cct, 10) << __func__ << " waiting for dirty|tx "
                   << (get_stat_dirty() + get_stat_tx()) << " >= max "
                   << max_dirty << " + dirty_waiting "
@@ -1758,6 +1799,9 @@ void ObjectCacher::maybe_wait_for_writeback(uint64_t len)
     ++blocked;
     ldout(cct, 10) << __func__ << " woke up" << dendl;
   }
+  if (blocked > 0) {
+    trace->event("finish wait for writeback");
+  }
   if (blocked && perfcounter) {
     perfcounter->inc(l_objectcacher_write_ops_blocked);
     perfcounter->inc(l_objectcacher_write_bytes_blocked, len);
@@ -1768,19 +1812,20 @@ void ObjectCacher::maybe_wait_for_writeback(uint64_t len)
 
 // blocking wait for write.
 int ObjectCacher::_wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset,
-                                 Context *onfreespace)
+                                 ZTracer::Trace *trace, Context *onfreespace)
 {
   assert(lock.is_locked());
+  assert(trace != nullptr);
   int ret = 0;
 
   if (max_dirty > 0) {
     if (block_writes_upfront) {
-      maybe_wait_for_writeback(len);
+      maybe_wait_for_writeback(len, trace);
       if (onfreespace)
        onfreespace->complete(0);
     } else {
       assert(onfreespace);
-      finisher.queue(new C_WaitForWrite(this, len, onfreespace));
+      finisher.queue(new C_WaitForWrite(this, len, *trace, onfreespace));
     }
   } else {
     // write-thru!  flush what we just wrote.
@@ -1789,7 +1834,7 @@ int ObjectCacher::_wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset,
     Context *fin = block_writes_upfront ?
       new C_Cond(&cond, &done, &ret) : onfreespace;
     assert(fin);
-    bool flushed = flush_set(oset, wr->extents, fin);
+    bool flushed = flush_set(oset, wr->extents, trace, fin);
     assert(!flushed);   // we just dirtied it, and didn't drop our lock!
     ldout(cct, 10) << "wait_for_write waiting on write-thru of " << len
                   << " bytes" << dendl;
@@ -1828,12 +1873,19 @@ void ObjectCacher::flusher_entry()
                   << max_dirty << " max)"
                   << dendl;
     loff_t actual = get_stat_dirty() + get_stat_dirty_waiting();
+
+    ZTracer::Trace trace;
+    if (cct->_conf->osdc_blkin_trace_all) {
+      trace.init("flusher", &trace_endpoint);
+      trace.event("start");
+    }
+
     if (actual > 0 && (uint64_t) actual > target_dirty) {
       // flush some dirty pages
       ldout(cct, 10) << "flusher " << get_stat_dirty() << " dirty + "
                     << get_stat_dirty_waiting() << " dirty_waiting > target "
                     << target_dirty << ", flushing some dirty bhs" << dendl;
-      flush(actual - target_dirty);
+      flush(&trace, actual - target_dirty);
     } else {
       // check tail of lru for old dirty items
       ceph::real_time cutoff = ceph::real_clock::now();
@@ -1848,17 +1900,20 @@ void ObjectCacher::flusher_entry()
        if (scattered_write) {
          bh_write_adjacencies(bh, cutoff, NULL, &max);
         } else {
-         bh_write(bh);
+         bh_write(bh, trace);
          --max;
        }
       }
       if (!max) {
        // back off the lock to avoid starving other threads
+        trace.event("backoff");
        lock.Unlock();
        lock.Lock();
        continue;
       }
     }
+
+    trace.event("finish");
     if (flusher_stop)
       break;
 
@@ -1955,8 +2010,10 @@ void ObjectCacher::purge(Object *ob)
 // true if clean, already flushed.
 // false if we wrote something.
 // be sloppy about the ranges and flush any buffer it touches
-bool ObjectCacher::flush(Object *ob, loff_t offset, loff_t length)
+bool ObjectCacher::flush(Object *ob, loff_t offset, loff_t length,
+                         ZTracer::Trace *trace)
 {
+  assert(trace != nullptr);
   assert(lock.is_locked());
   list<BufferHead*> blist;
   bool clean = true;
@@ -1980,7 +2037,7 @@ bool ObjectCacher::flush(Object *ob, loff_t offset, loff_t length)
     if (scattered_write)
       blist.push_back(bh);
     else
-      bh_write(bh);
+      bh_write(bh, *trace);
     clean = false;
   }
   if (scattered_write && !blist.empty())
@@ -2029,7 +2086,7 @@ bool ObjectCacher::flush_set(ObjectSet *oset, Context *onfinish)
   // Buffer heads in dirty_or_tx_bh are sorted in ObjectSet/Object/offset
   // order. But items in oset->objects are not sorted. So the iterator can
   // point to any buffer head in the ObjectSet
-  BufferHead key(*oset->objects.begin(), nullptr);
+  BufferHead key(*oset->objects.begin());
   it = dirty_or_tx_bh.lower_bound(&key);
   p = q = it;
 
@@ -2056,7 +2113,7 @@ bool ObjectCacher::flush_set(ObjectSet *oset, Context *onfinish)
        }
        blist.push_back(bh);
       } else {
-       bh_write(bh);
+       bh_write(bh, {});
       }
     }
   }
@@ -2082,7 +2139,7 @@ bool ObjectCacher::flush_set(ObjectSet *oset, Context *onfinish)
          }
          blist.push_front(bh);
        } else {
-         bh_write(bh);
+         bh_write(bh, {});
        }
       }
       if (!backwards)
@@ -2109,9 +2166,10 @@ bool ObjectCacher::flush_set(ObjectSet *oset, Context *onfinish)
 // flush.  non-blocking, takes callback.
 // returns true if already flushed
 bool ObjectCacher::flush_set(ObjectSet *oset, vector<ObjectExtent>& exv,
-                            Context *onfinish)
+                            ZTracer::Trace *trace, Context *onfinish)
 {
   assert(lock.is_locked());
+  assert(trace != nullptr);
   assert(onfinish != NULL);
   if (oset->objects.empty()) {
     ldout(cct, 10) << "flush_set on " << oset << " dne" << dendl;
@@ -2137,7 +2195,7 @@ bool ObjectCacher::flush_set(ObjectSet *oset, vector<ObjectExtent>& exv,
     ldout(cct, 20) << "flush_set " << oset << " ex " << ex << " ob " << soid
                   << " " << ob << dendl;
 
-    if (!flush(ob, ex.offset, ex.length)) {
+    if (!flush(ob, ex.offset, ex.length, trace)) {
       // we'll need to gather...
       ldout(cct, 10) << "flush_set " << oset << " will wait for ack tid "
                     << ob->last_write_tid << " on " << *ob << dendl;
@@ -2181,7 +2239,7 @@ bool ObjectCacher::flush_all(Context *onfinish)
        }
        blist.push_back(bh);
       } else {
-       bh_write(bh);
+       bh_write(bh, {});
       }
     }
 
index c6c49deb2b5f92ec0f7e0d0bb4ae7fc88cd55933..31201a7235422df27ed375b6c26776721e43d310 100644 (file)
@@ -131,10 +131,9 @@ class ObjectCacher {
     int error; // holds return value for failed reads
 
     map<loff_t, list<Context*> > waitfor_read;
-    ZTracer::Trace b_trace;
 
     // cons
-    explicit BufferHead(Object *o, ZTracer::Trace *trace) :
+    explicit BufferHead(Object *o) :
       state(STATE_MISSING),
       ref(0),
       dontneed(false),
@@ -145,9 +144,6 @@ class ObjectCacher {
       journal_tid(0),
       error(0) {
       ex.start = ex.length = 0;
-      if (trace) {
-        b_trace = *trace;
-      }
     }
 
     // extent
@@ -353,11 +349,9 @@ class ObjectCacher {
                  map<loff_t, BufferHead*>& hits,
                  map<loff_t, BufferHead*>& missing,
                  map<loff_t, BufferHead*>& rx,
-                map<loff_t, BufferHead*>& errors,
-                ZTracer::Trace *trace);
-    BufferHead *map_write(ObjectExtent &ex, ceph_tid_t tid,
-                         ZTracer::Trace *trace);
-    
+                map<loff_t, BufferHead*>& errors);
+    BufferHead *map_write(ObjectExtent &ex, ceph_tid_t tid);
+
     void replace_journal_tid(BufferHead *bh, ceph_tid_t tid);
     void truncate(loff_t s);
     void discard(loff_t off, loff_t len);
@@ -410,6 +404,8 @@ class ObjectCacher {
   ceph::timespan max_dirty_age;
   bool block_writes_upfront;
 
+  ZTracer::Endpoint trace_endpoint;
+
   flush_set_callback_t flush_set_callback;
   void *flush_set_callback_arg;
 
@@ -526,14 +522,15 @@ class ObjectCacher {
   void bh_remove(Object *ob, BufferHead *bh);
 
   // io
-  void bh_read(BufferHead *bh, int op_flags);
-  void bh_write(BufferHead *bh);
+  void bh_read(BufferHead *bh, int op_flags,
+               const ZTracer::Trace &parent_trace);
+  void bh_write(BufferHead *bh, const ZTracer::Trace &parent_trace);
   void bh_write_scattered(list<BufferHead*>& blist);
   void bh_write_adjacencies(BufferHead *bh, ceph::real_time cutoff,
                            int64_t *amount, int *max_count);
 
   void trim();
-  void flush(loff_t amount=0);
+  void flush(ZTracer::Trace *trace, loff_t amount=0);
 
   /**
    * flush a range of buffers
@@ -546,7 +543,8 @@ class ObjectCacher {
    * @param len extent length, or 0 for entire object
    * @return true if object was already clean/flushed.
    */
-  bool flush(Object *o, loff_t off, loff_t len);
+  bool flush(Object *o, loff_t off, loff_t len,
+             ZTracer::Trace *trace);
   loff_t release(Object *o);
   void purge(Object *o);
 
@@ -605,17 +603,17 @@ class ObjectCacher {
    * the return value is total bytes read
    */
   int readx(OSDRead *rd, ObjectSet *oset, Context *onfinish,
-           ZTracer::Trace *trace = nullptr);
+           ZTracer::Trace *parent_trace = nullptr);
   int writex(OSDWrite *wr, ObjectSet *oset, Context *onfreespace,
-            ZTracer::Trace *trace = nullptr);
+            ZTracer::Trace *parent_trace = nullptr);
   bool is_cached(ObjectSet *oset, vector<ObjectExtent>& extents,
                 snapid_t snapid);
 
 private:
   // write blocking
   int _wait_for_write(OSDWrite *wr, uint64_t len, ObjectSet *oset,
-                     Context *onfreespace);
-  void maybe_wait_for_writeback(uint64_t len);
+                      ZTracer::Trace *trace, Context *onfreespace);
+  void maybe_wait_for_writeback(uint64_t len, ZTracer::Trace *trace);
   bool _flush_set_finish(C_GatherBuilder *gather, Context *onfinish);
 
 public:
@@ -625,7 +623,7 @@ public:
 
   bool flush_set(ObjectSet *oset, Context *onfinish=0);
   bool flush_set(ObjectSet *oset, vector<ObjectExtent>& ex,
-                Context *onfinish = 0);
+                 ZTracer::Trace *trace, Context *onfinish = 0);
   bool flush_all(Context *onfinish = 0);
 
   void purge_set(ObjectSet *oset);
@@ -698,7 +696,8 @@ public:
     vector<ObjectExtent> extents;
     Striper::file_to_extents(cct, oset->ino, layout, offset, len,
                             oset->truncate_size, extents);
-    return flush_set(oset, extents, onfinish);
+    ZTracer::Trace trace;
+    return flush_set(oset, extents, &trace, onfinish);
   }
 };
 
index 4485f7e460593b328b964f33dbd540fb69a0910c..bf90fdd0ac000f9011a80a6943e65dc057002ace 100644 (file)
@@ -15,8 +15,8 @@ class WritebackHandler {
   virtual void read(const object_t& oid, uint64_t object_no,
                    const object_locator_t& oloc, uint64_t off, uint64_t len,
                    snapid_t snapid, bufferlist *pbl, uint64_t trunc_size,
-                   __u32 trunc_seq, int op_flags, Context *onfinish,
-                   ZTracer::Trace *trace = nullptr) = 0;
+                   __u32 trunc_seq, int op_flags,
+                    const ZTracer::Trace &parent_trace, Context *onfinish) = 0;
   /**
    * check if a given extent read result may change due to a write
    *
@@ -35,8 +35,9 @@ class WritebackHandler {
                           const SnapContext& snapc,
                           const bufferlist &bl, ceph::real_time mtime,
                           uint64_t trunc_size, __u32 trunc_seq,
-                           ceph_tid_t journal_tid, Context *oncommit,
-                           ZTracer::Trace *trace = nullptr) = 0;
+                           ceph_tid_t journal_tid,
+                           const ZTracer::Trace &parent_trace,
+                           Context *oncommit) = 0;
 
   virtual void overwrite_extent(const object_t& oid, uint64_t off, uint64_t len,
                                 ceph_tid_t original_journal_tid,
index 29d94fc25e1d07abf0a4a344db796f3e6f1ce8da..72e80e433f6d79cf00daaadd260b132f29942c14 100644 (file)
@@ -62,8 +62,9 @@ void FakeWriteback::read(const object_t& oid, uint64_t object_no,
                         const object_locator_t& oloc,
                         uint64_t off, uint64_t len, snapid_t snapid,
                         bufferlist *pbl, uint64_t trunc_size,
-                        __u32 trunc_seq, int op_flags, Context *onfinish,
-                        ZTracer::Trace *trace)
+                        __u32 trunc_seq, int op_flags,
+                         const ZTracer::Trace &parent_trace,
+                         Context *onfinish)
 {
   C_Delay *wrapper = new C_Delay(m_cct, onfinish, m_lock, off, pbl,
                                 m_delay_ns);
@@ -76,8 +77,9 @@ ceph_tid_t FakeWriteback::write(const object_t& oid,
                                const SnapContext& snapc,
                                const bufferlist &bl, ceph::real_time mtime,
                                uint64_t trunc_size, __u32 trunc_seq,
-                               ceph_tid_t journal_tid, Context *oncommit,
-                               ZTracer::Trace *trace)
+                               ceph_tid_t journal_tid,
+                                const ZTracer::Trace &parent_trace,
+                                Context *oncommit)
 {
   C_Delay *wrapper = new C_Delay(m_cct, oncommit, m_lock, off, NULL,
                                 m_delay_ns);
index d0c5449a8280a537bad995aeb146789c87459c37..6112eb7208262c28c32ef0cfee9fea87d4b1733b 100644 (file)
@@ -20,15 +20,17 @@ public:
   void read(const object_t& oid, uint64_t object_no,
                    const object_locator_t& oloc, uint64_t off, uint64_t len,
                    snapid_t snapid, bufferlist *pbl, uint64_t trunc_size,
-                   __u32 trunc_seq, int op_flags, Context *onfinish,
-                   ZTracer::Trace *trace) override;
+                   __u32 trunc_seq, int op_flags,
+                   const ZTracer::Trace &parent_trace,
+                    Context *onfinish) override;
 
   ceph_tid_t write(const object_t& oid, const object_locator_t& oloc,
                           uint64_t off, uint64_t len,
                           const SnapContext& snapc, const bufferlist &bl,
                           ceph::real_time mtime, uint64_t trunc_size,
                           __u32 trunc_seq, ceph_tid_t journal_tid,
-                          Context *oncommit, ZTracer::Trace *trace) override;
+                           const ZTracer::Trace &parent_trace,
+                          Context *oncommit) override;
 
   using WritebackHandler::write;
 
index cc9bda9fe98d59942a41171adfbd58f27d7ac239..e9e1f9fe345baaaa4f3761bd244aea1a98969e27 100644 (file)
@@ -92,8 +92,9 @@ void MemWriteback::read(const object_t& oid, uint64_t object_no,
                         const object_locator_t& oloc,
                         uint64_t off, uint64_t len, snapid_t snapid,
                         bufferlist *pbl, uint64_t trunc_size,
-                        __u32 trunc_seq, int op_flags, Context *onfinish,
-                        ZTracer::Trace *trace)
+                        __u32 trunc_seq, int op_flags,
+                         const ZTracer::Trace &parent_trace,
+                         Context *onfinish)
 {
   assert(snapid == CEPH_NOSNAP);
   C_DelayRead *wrapper = new C_DelayRead(this, m_cct, onfinish, m_lock, oid,
@@ -107,8 +108,9 @@ ceph_tid_t MemWriteback::write(const object_t& oid,
                                const SnapContext& snapc,
                                const bufferlist &bl, ceph::real_time mtime,
                                uint64_t trunc_size, __u32 trunc_seq,
-                               ceph_tid_t journal_tid, Context *oncommit,
-                               ZTracer::Trace *trace)
+                               ceph_tid_t journal_tid,
+                                const ZTracer::Trace &parent_trace,
+                                Context *oncommit)
 {
   assert(snapc.seq == 0);
   C_DelayWrite *wrapper = new C_DelayWrite(this, m_cct, oncommit, m_lock, oid,
index c4daae937b2383de35f8381a92562520003095df..a073cbf1760dfdbc6e70cd63ba085e8378f31171 100644 (file)
@@ -20,15 +20,17 @@ public:
   void read(const object_t& oid, uint64_t object_no,
                    const object_locator_t& oloc, uint64_t off, uint64_t len,
                    snapid_t snapid, bufferlist *pbl, uint64_t trunc_size,
-                   __u32 trunc_seq, int op_flags, Context *onfinish,
-                   ZTracer::Trace *trace) override;
+                   __u32 trunc_seq, int op_flags,
+                    const ZTracer::Trace &parent_trace,
+                    Context *onfinish) override;
 
   ceph_tid_t write(const object_t& oid, const object_locator_t& oloc,
                           uint64_t off, uint64_t len,
                           const SnapContext& snapc, const bufferlist &bl,
                           ceph::real_time mtime, uint64_t trunc_size,
                           __u32 trunc_seq, ceph_tid_t journal_tid,
-                          Context *oncommit, ZTracer::Trace *trace) override;
+                           const ZTracer::Trace &parent_trace,
+                          Context *oncommit) override;
 
   using WritebackHandler::write;