git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
osdc/Objecter: pg-mapping cache 28487/head
authorxie xingguo <xie.xingguo@zte.com.cn>
Thu, 6 Jun 2019 08:12:27 +0000 (16:12 +0800)
committerxie xingguo <xie.xingguo@zte.com.cn>
Fri, 14 Jun 2019 06:42:31 +0000 (14:42 +0800)
The CRUSH-based addressing is a CPU-intensive task
and hence should be avoided whenever possible.

The patch introduces a per-objecter pg-mapping cache, which
as a result can save us 10%+ CPU (fio, 4k random read, 24k IOPS):

Was:
$ top -Hp 415864
top - 14:45:39 up 6 days,  5:45,  3 users,  load average: 9.67, 8.66, 8.45
Threads:  23 total,   3 running,  20 sleeping,   0 stopped,   0 zombie
%Cpu(s): 21.6 us, 14.5 sy,  0.0 ni, 59.6 id,  1.4 wa,  0.0 hi,  3.0 si,  0.0 st
KiB Mem : 19616576+total, 12773412+free, 50964508 used, 17467128 buff/cache
KiB Swap:  4194300 total,  4194300 free,        0 used. 14352892+avail Mem

    PID USER      PR  NI    VIRT    RES    SHR S %CPU %MEM     TIME+ COMMAND
 415942 root      20   0 1957800 492820 123880 R 80.0  0.3   0:13.75 tp_librbd

Now:
$ top -Hp 475779
top - 10:22:05 up  1:18,  4 users,  load average: 2.65, 1.44, 1.60
Threads:  23 total,   2 running,  21 sleeping,   0 stopped,   0 zombie
%Cpu(s):  6.1 us,  2.2 sy,  0.0 ni, 91.4 id,  0.0 wa,  0.0 hi,  0.2 si,  0.0 st
KiB Mem : 19616576+total, 13919555+free, 27280820 used, 29689392 buff/cache
KiB Swap:  4194300 total,  4194300 free,        0 used. 16798102+avail Mem

    PID USER      PR  NI    VIRT    RES    SHR S %CPU %MEM     TIME+ COMMAND
 476231 root      20   0 1957808 491712 123836 S 71.1  0.3   0:41.47 tp_librbd

And below is a more detailed report from the perf tool:

Was:
$ perf report -g graph
  Children      Self  Command    Shared Object        Symbol
+   29.33%     0.03%  tp_librbd  librados.so.2.0.0    [.] Objecter::op_submit
+   28.83%     0.09%  tp_librbd  librados.so.2.0.0    [.] Objecter::_op_submit_with_budget
+   27.12%     0.17%  tp_librbd  librados.so.2.0.0    [.] Objecter::_op_submit
+   16.47%     0.26%  tp_librbd  librados.so.2.0.0    [.] Objecter::_calc_target
+   15.04%     0.10%  tp_librbd  libceph-common.so.0  [.] OSDMap::_pg_to_up_acting_osds
+   13.52%     0.16%  tp_librbd  libceph-common.so.0  [.] OSDMap::_pg_to_raw_osds

Now:
$ perf report -g graph
  Children      Self  Command    Shared Object        Symbol
+   17.84%     0.04%  tp_librbd  librados.so.2.0.0    [.] Objecter::op_submit
+   17.34%     0.06%  tp_librbd  librados.so.2.0.0    [.] Objecter::_op_submit_with_budget
+   15.80%     0.17%  tp_librbd  librados.so.2.0.0    [.] Objecter::_op_submit
+    6.11%     2.02%  tp_librbd  librados.so.2.0.0    [.] Objecter::_calc_target

Signed-off-by: xie xingguo <xie.xingguo@zte.com.cn>
src/osdc/Objecter.cc
src/osdc/Objecter.h

index 7140522237ba895467e246f09d68bf0b521abfe0..7aa5161e338cd4fb6c930ab277293073dfb8afd3 100644 (file)
@@ -412,6 +412,7 @@ void Objecter::start(const OSDMap* o)
   start_tick();
   if (o) {
     osdmap->deepish_copy_from(*o);
+    prune_pg_mapping(osdmap->get_pools());
   } else if (osdmap->get_epoch() == 0) {
     _maybe_request_map();
   }
@@ -1233,6 +1234,7 @@ void Objecter::handle_osd_map(MOSDMap *m)
        }
        logger->set(l_osdc_map_epoch, osdmap->get_epoch());
 
+        prune_pg_mapping(osdmap->get_pools());
        cluster_full = cluster_full || _osdmap_full_flag();
        update_pool_full_map(pool_full_map);
 
@@ -1279,6 +1281,7 @@ void Objecter::handle_osd_map(MOSDMap *m)
        ldout(cct, 3) << "handle_osd_map decoding full epoch "
                      << m->get_last() << dendl;
        osdmap->decode(m->maps[m->get_last()]);
+        prune_pg_mapping(osdmap->get_pools());
 
        _scan_requests(homeless_session, false, false, NULL,
                       need_resend, need_resend_linger,
@@ -2861,11 +2864,26 @@ int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change)
   int size = pi->size;
   int min_size = pi->min_size;
   unsigned pg_num = pi->get_pg_num();
+  unsigned pg_num_mask = pi->get_pg_num_mask();
   unsigned pg_num_pending = pi->get_pg_num_pending();
   int up_primary, acting_primary;
   vector<int> up, acting;
-  osdmap->pg_to_up_acting_osds(pgid, &up, &up_primary,
-                              &acting, &acting_primary);
+  ps_t actual_ps = ceph_stable_mod(pgid.ps(), pg_num, pg_num_mask);
+  pg_t actual_pgid(actual_ps, pgid.pool());
+  pg_mapping_t pg_mapping;
+  pg_mapping.epoch = osdmap->get_epoch();
+  if (lookup_pg_mapping(actual_pgid, &pg_mapping)) {
+    up = pg_mapping.up;
+    up_primary = pg_mapping.up_primary;
+    acting = pg_mapping.acting;
+    acting_primary = pg_mapping.acting_primary;
+  } else {
+    osdmap->pg_to_up_acting_osds(actual_pgid, &up, &up_primary,
+                                 &acting, &acting_primary);
+    pg_mapping_t pg_mapping(osdmap->get_epoch(),
+                            up, up_primary, acting, acting_primary);
+    update_pg_mapping(actual_pgid, std::move(pg_mapping));
+  }
   bool sort_bitwise = osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE);
   bool recovery_deletes = osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES);
   unsigned prev_seed = ceph_stable_mod(pgid.ps(), t->pg_num, t->pg_num_mask);
@@ -2924,7 +2942,7 @@ int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change)
     t->size = size;
     t->min_size = min_size;
     t->pg_num = pg_num;
-    t->pg_num_mask = pi->get_pg_num_mask();
+    t->pg_num_mask = pg_num_mask;
     t->pg_num_pending = pg_num_pending;
     osdmap->get_primary_shard(
       pg_t(ceph_stable_mod(pgid.ps(), t->pg_num, t->pg_num_mask), pgid.pool()),
index 31c9b3a77629e31a6dec5334714e79f06cca5096..2ebd3a5ccc0537fd8007b433f907097afdd98582 100644 (file)
@@ -1250,6 +1250,62 @@ private:
   // to be drained by consume_blacklist_events.
   bool blacklist_events_enabled;
   std::set<entity_addr_t> blacklist_events;
+  // Cached result of one CRUSH pg -> OSD-set calculation, tagged with the
+  // osdmap epoch it was computed against so stale entries can be detected.
+  struct pg_mapping_t {
+    epoch_t epoch = 0;        // osdmap epoch the fields below were computed at
+    std::vector<int> up;      // up set
+    int up_primary = -1;      // primary of the up set (-1 = none)
+    std::vector<int> acting;  // acting set
+    int acting_primary = -1;  // primary of the acting set (-1 = none)
+
+    pg_mapping_t() {}
+    pg_mapping_t(epoch_t epoch, std::vector<int> up, int up_primary,
+                 std::vector<int> acting, int acting_primary)
+               : epoch(epoch), up(up), up_primary(up_primary),
+                 acting(acting), acting_primary(acting_primary) {}
+  };
+  // guards pg_mappings below: shared (read) lock for lookups,
+  // exclusive lock for updates and pruning
+  std::shared_mutex pg_mapping_lock;
+  // pool -> pg mapping
+  // Each pool's vector is indexed by the pg's (masked) placement seed and is
+  // sized to the pool's pg_num by prune_pg_mapping().
+  std::map<int64_t, std::vector<pg_mapping_t>> pg_mappings;
+
+  // convenient accessors
+  // Look up the cached up/acting mapping for the given (already masked) pg.
+  // Contract: on entry the caller must have set pg_mapping->epoch to the
+  // current osdmap epoch; a cached entry computed at any other epoch is
+  // treated as stale and rejected.  Returns true and fills *pg_mapping on a
+  // hit, false on a miss.
+  bool lookup_pg_mapping(const pg_t& pg, pg_mapping_t* pg_mapping) {
+    std::shared_lock l{pg_mapping_lock};
+    auto it = pg_mappings.find(pg.pool());
+    if (it == pg_mappings.end())
+      return false;  // pool not cached (yet)
+    auto& mapping_array = it->second;
+    if (pg.ps() >= mapping_array.size())
+      return false;  // seed out of range, e.g. cache not yet resized for pool
+    if (mapping_array[pg.ps()].epoch != pg_mapping->epoch) // stale
+      return false;
+    *pg_mapping = mapping_array[pg.ps()];
+    return true;
+  }
+  // Store (insert or overwrite) the cached mapping for the given pg.
+  // Assumes prune_pg_mapping() has already sized this pool's slot vector to
+  // its pg_num; an unknown pool would get an empty vector from operator[]
+  // and trip the assert below.
+  void update_pg_mapping(const pg_t& pg, pg_mapping_t&& pg_mapping) {
+    std::lock_guard l{pg_mapping_lock};
+    auto& mapping_array = pg_mappings[pg.pool()];
+    ceph_assert(pg.ps() < mapping_array.size());
+    mapping_array[pg.ps()] = std::move(pg_mapping);
+  }
+  // Reconcile the cache with the pool set of a newly installed/advanced
+  // osdmap: size each pool's slot vector to its current pg_num and erase
+  // cached state for pools that no longer exist.  Entries surviving a
+  // resize are harmless — their epoch tag makes lookup reject them once
+  // the map moves on.
+  void prune_pg_mapping(const mempool::osdmap::map<int64_t,pg_pool_t>& pools) {
+    std::lock_guard l{pg_mapping_lock};
+    for (auto& pool : pools) {
+      auto& mapping_array = pg_mappings[pool.first];  // creates slot for new pools
+      size_t pg_num = pool.second.get_pg_num();
+      if (mapping_array.size() != pg_num) {
+        // catch both pg_num increasing & decreasing
+        mapping_array.resize(pg_num);
+      }
+    }
+    // second pass: drop cache entries whose pool has been deleted
+    for (auto it = pg_mappings.begin(); it != pg_mappings.end(); ) {
+      if (!pools.count(it->first)) {
+        // pool is gone
+        pg_mappings.erase(it++);  // post-increment keeps a valid iterator
+        continue;
+      }
+      it++;
+    }
+  }
 
 public:
   void maybe_request_map();