From 8697db15c4ff59cd0c4dc021652576e72351dbaf Mon Sep 17 00:00:00 2001
From: xie xingguo
Date: Thu, 6 Jun 2019 16:12:27 +0800
Subject: [PATCH] osdc/Objecter: pg-mapping cache

CRUSH-based addressing is a CPU-intensive task and hence should be
avoided whenever possible.

The patch introduces a per-objecter pg-mapping cache, which as a
result can save us 10%+ CPU (fio, 4k random read, 24k IOPS):

Was:
$ top -Hp 415864
top - 14:45:39 up 6 days, 5:45, 3 users, load average: 9.67, 8.66, 8.45
Threads: 23 total, 3 running, 20 sleeping, 0 stopped, 0 zombie
%Cpu(s): 21.6 us, 14.5 sy, 0.0 ni, 59.6 id, 1.4 wa, 0.0 hi, 3.0 si, 0.0 st
KiB Mem : 19616576+total, 12773412+free, 50964508 used, 17467128 buff/cache
KiB Swap: 4194300 total, 4194300 free, 0 used. 14352892+avail Mem
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
415942 root 20 0 1957800 492820 123880 R 80.0 0.3 0:13.75 tp_librbd

Now:
$ top -Hp 475779
top - 10:22:05 up 1:18, 4 users, load average: 2.65, 1.44, 1.60
Threads: 23 total, 2 running, 21 sleeping, 0 stopped, 0 zombie
%Cpu(s): 6.1 us, 2.2 sy, 0.0 ni, 91.4 id, 0.0 wa, 0.0 hi, 0.2 si, 0.0 st
KiB Mem : 19616576+total, 13919555+free, 27280820 used, 29689392 buff/cache
KiB Swap: 4194300 total, 4194300 free, 0 used. 16798102+avail Mem
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
476231 root 20 0 1957808 491712 123836 S 71.1 0.3 0:41.47 tp_librbd

Below is a more detailed report from the perf tool:

Was:
$ perf report -g graph
Children Self Command Shared Object Symbol
+ 29.33% 0.03% tp_librbd librados.so.2.0.0 [.] Objecter::op_submit
+ 28.83% 0.09% tp_librbd librados.so.2.0.0 [.] Objecter::_op_submit_with_budget
+ 27.12% 0.17% tp_librbd librados.so.2.0.0 [.] Objecter::_op_submit
+ 16.47% 0.26% tp_librbd librados.so.2.0.0 [.] Objecter::_calc_target
+ 15.04% 0.10% tp_librbd libceph-common.so.0 [.] OSDMap::_pg_to_up_acting_osds
+ 13.52% 0.16% tp_librbd libceph-common.so.0 [.]
OSDMap::_pg_to_raw_osds Now: $ perf report -g graph Children Self Command Shared Object Symbol + 17.84% 0.04% tp_librbd librados.so.2.0.0 [.] Objecter::op_submit + 17.34% 0.06% tp_librbd librados.so.2.0.0 [.] Objecter::_op_submit_with_budget + 15.80% 0.17% tp_librbd librados.so.2.0.0 [.] Objecter::_op_submit + 6.11% 2.02% tp_librbd librados.so.2.0.0 [.] Objecter::_calc_target Signed-off-by: xie xingguo --- src/osdc/Objecter.cc | 24 ++++++++++++++++--- src/osdc/Objecter.h | 56 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 3 deletions(-) diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index 7140522237ba8..7aa5161e338cd 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -412,6 +412,7 @@ void Objecter::start(const OSDMap* o) start_tick(); if (o) { osdmap->deepish_copy_from(*o); + prune_pg_mapping(osdmap->get_pools()); } else if (osdmap->get_epoch() == 0) { _maybe_request_map(); } @@ -1233,6 +1234,7 @@ void Objecter::handle_osd_map(MOSDMap *m) } logger->set(l_osdc_map_epoch, osdmap->get_epoch()); + prune_pg_mapping(osdmap->get_pools()); cluster_full = cluster_full || _osdmap_full_flag(); update_pool_full_map(pool_full_map); @@ -1279,6 +1281,7 @@ void Objecter::handle_osd_map(MOSDMap *m) ldout(cct, 3) << "handle_osd_map decoding full epoch " << m->get_last() << dendl; osdmap->decode(m->maps[m->get_last()]); + prune_pg_mapping(osdmap->get_pools()); _scan_requests(homeless_session, false, false, NULL, need_resend, need_resend_linger, @@ -2861,11 +2864,26 @@ int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change) int size = pi->size; int min_size = pi->min_size; unsigned pg_num = pi->get_pg_num(); + unsigned pg_num_mask = pi->get_pg_num_mask(); unsigned pg_num_pending = pi->get_pg_num_pending(); int up_primary, acting_primary; vector up, acting; - osdmap->pg_to_up_acting_osds(pgid, &up, &up_primary, - &acting, &acting_primary); + ps_t actual_ps = ceph_stable_mod(pgid.ps(), pg_num, pg_num_mask); + 
pg_t actual_pgid(actual_ps, pgid.pool()); + pg_mapping_t pg_mapping; + pg_mapping.epoch = osdmap->get_epoch(); + if (lookup_pg_mapping(actual_pgid, &pg_mapping)) { + up = pg_mapping.up; + up_primary = pg_mapping.up_primary; + acting = pg_mapping.acting; + acting_primary = pg_mapping.acting_primary; + } else { + osdmap->pg_to_up_acting_osds(actual_pgid, &up, &up_primary, + &acting, &acting_primary); + pg_mapping_t pg_mapping(osdmap->get_epoch(), + up, up_primary, acting, acting_primary); + update_pg_mapping(actual_pgid, std::move(pg_mapping)); + } bool sort_bitwise = osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE); bool recovery_deletes = osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES); unsigned prev_seed = ceph_stable_mod(pgid.ps(), t->pg_num, t->pg_num_mask); @@ -2924,7 +2942,7 @@ int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change) t->size = size; t->min_size = min_size; t->pg_num = pg_num; - t->pg_num_mask = pi->get_pg_num_mask(); + t->pg_num_mask = pg_num_mask; t->pg_num_pending = pg_num_pending; osdmap->get_primary_shard( pg_t(ceph_stable_mod(pgid.ps(), t->pg_num, t->pg_num_mask), pgid.pool()), diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index 31c9b3a77629e..2ebd3a5ccc053 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -1250,6 +1250,62 @@ private: // to be drained by consume_blacklist_events. 
bool blacklist_events_enabled;
std::set<entity_addr_t> blacklist_events;
+  // Cached up/acting sets and primaries of one PG, tagged with the osdmap
+  // epoch they were computed for.
+  struct pg_mapping_t {
+    epoch_t epoch = 0;
+    std::vector<int> up;
+    int up_primary = -1;
+    std::vector<int> acting;
+    int acting_primary = -1;
+
+    pg_mapping_t() = default;
+    // Sink constructor: by-value vectors are moved, not copied a second time.
+    pg_mapping_t(epoch_t epoch, std::vector<int> up, int up_primary,
+                 std::vector<int> acting, int acting_primary)
+      : epoch(epoch), up(std::move(up)), up_primary(up_primary),
+        acting(std::move(acting)), acting_primary(acting_primary) {}
+  };
+  // Guards pg_mappings: taken shared by lookup, exclusive by update/prune.
+  std::shared_mutex pg_mapping_lock;
+  // pool id -> per-PG cache slots, indexed by pg.ps()
+  std::map<int64_t, std::vector<pg_mapping_t>> pg_mappings;
+
+  // Copy the cached mapping of `pg` into *pg_mapping. On entry
+  // pg_mapping->epoch must hold the epoch the caller wants; returns false
+  // (leaving *pg_mapping untouched) if the pool or slot is unknown, or if
+  // the cached entry was stored for a different epoch.
+  bool lookup_pg_mapping(const pg_t& pg, pg_mapping_t* pg_mapping) {
+    std::shared_lock l{pg_mapping_lock};
+    auto it = pg_mappings.find(pg.pool());
+    if (it == pg_mappings.end() || pg.ps() >= it->second.size())
+      return false;
+    const auto& cached = it->second[pg.ps()];
+    if (cached.epoch != pg_mapping->epoch) // stale
+      return false;
+    *pg_mapping = cached;
+    return true;
+  }
+  // Store `pg_mapping` in the slot of `pg`; the slot must exist (asserted).
+  void update_pg_mapping(const pg_t& pg, pg_mapping_t&& pg_mapping) {
+    std::lock_guard l{pg_mapping_lock};
+    auto& mapping_array = pg_mappings[pg.pool()];
+    ceph_assert(pg.ps() < mapping_array.size());
+    mapping_array[pg.ps()] = std::move(pg_mapping);
+  }
+  // Resync the cache with `pools`: size every pool's slot array to its
+  // pg_num (growing or shrinking) and drop pools that are not in `pools`.
+  void prune_pg_mapping(const mempool::osdmap::map<int64_t,pg_pool_t>& pools) {
+    std::lock_guard l{pg_mapping_lock};
+    for (const auto& [pool_id, pool] : pools)
+      pg_mappings[pool_id].resize(pool.get_pg_num());
+    for (auto it = pg_mappings.begin(); it != pg_mappings.end(); ) {
+      if (pools.count(it->first))
+        ++it;
+      else
+        it = pg_mappings.erase(it);  // pool is gone
+    }
+  }

public:
void maybe_request_map();
-- 
2.39.5