From 7fe0ac7c1159d3ee81de27365dcf76ba6f4a2fb5 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Lo=C3=AFc=20Dachary?=
Date: Fri, 19 Mar 2021 08:29:08 +0100
Subject: [PATCH] qa: verify the benefits of mempool cacheline optimization
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

There already is a test to verify that mempool sharding works, in the
sense that it uses at least half of the shard variables available to
count the number of allocated objects and their total size. This new
test verifies that, with sharding, object counting is at least twice
as fast as without sharding. It also collects cacheline contention
data with the perf c2c tool. Manual analysis of this data shows that
the optimization gain is indeed related to cacheline contention.

Fixes: https://tracker.ceph.com/issues/49896

Signed-off-by: Loïc Dachary
---
 qa/standalone/c2c/c2c.sh                      | 84 +++++++++++++++++
 qa/suites/rados/standalone/workloads/c2c.yaml | 18 ++++
 src/include/mempool.h                         | 11 ++-
 src/test/CMakeLists.txt                       | 14 +++
 src/test/test_c2c.cc                          | 89 +++++++++++++++++++
 5 files changed, 215 insertions(+), 1 deletion(-)
 create mode 100755 qa/standalone/c2c/c2c.sh
 create mode 100644 qa/suites/rados/standalone/workloads/c2c.yaml
 create mode 100644 src/test/test_c2c.cc

diff --git a/qa/standalone/c2c/c2c.sh b/qa/standalone/c2c/c2c.sh
new file mode 100755
index 0000000000000..a6969d555d829
--- /dev/null
+++ b/qa/standalone/c2c/c2c.sh
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+
+set -ex
+
+function run_perf_c2c() {
+    # First get some background system info
+    uname -a > uname.out
+    lscpu > lscpu.out
+    cat /proc/cmdline > cmdline.out
+    timeout -s INT 10 vmstat -w 1 > vmstat.out || true
+    sudo dmesg >& dmesg.out
+    cat /proc/cpuinfo > cpuinfo.out
+    ps axo psr,time,stat,ppid,pid,pcpu,comm > ps.1.out
+    ps -eafT > ps.2.out
+    sudo sysctl -a > sysctl.out
+
+    nodecnt=`lscpu|grep "NUMA node(" |awk '{print $3}'`
+    for ((i=0; i<$nodecnt; i++))
+    do
+        sudo cat /sys/devices/system/node/node${i}/meminfo > meminfo.$i.out
+    done
+    sudo more `sudo find /proc -name status` > proc_parent_child_status.out
+    sudo more /proc/*/numa_maps > numa_maps.out
+
+    #
+    # Get separate kernel and user perf-c2c stats
+    #
+    sudo perf c2c record -a --ldlat=70 --all-user -o perf_c2c_a_all_user.data sleep 5
+    sudo perf c2c report --stdio -i perf_c2c_a_all_user.data > perf_c2c_a_all_user.out 2>&1
+    sudo perf c2c report --full-symbols --stdio -i perf_c2c_a_all_user.data > perf_c2c_full-sym_a_all_user.out 2>&1
+
+    sudo perf c2c record --call-graph dwarf -a --ldlat=70 --all-user -o perf_c2c_g_a_all_user.data sleep 5
+    sudo perf c2c report -g --stdio -i perf_c2c_g_a_all_user.data > perf_c2c_g_a_all_user.out 2>&1
+
+    sudo perf c2c record -a --ldlat=70 --all-kernel -o perf_c2c_a_all_kernel.data sleep 4
+    sudo perf c2c report --stdio -i perf_c2c_a_all_kernel.data > perf_c2c_a_all_kernel.out 2>&1
+
+    sudo perf c2c record --call-graph dwarf --ldlat=70 -a --all-kernel -o perf_c2c_g_a_all_kernel.data sleep 4
+
+    sudo perf c2c report -g --stdio -i perf_c2c_g_a_all_kernel.data > perf_c2c_g_a_all_kernel.out 2>&1
+
+    #
+    # Get combined kernel and user perf-c2c stats
+    #
+    sudo perf c2c record -a --ldlat=70 -o perf_c2c_a_both.data sleep 4
+    sudo perf c2c report --stdio -i perf_c2c_a_both.data > perf_c2c_a_both.out 2>&1
+
+    sudo perf c2c record --call-graph dwarf --ldlat=70 -a --all-kernel -o perf_c2c_g_a_both.data sleep 4
+    sudo perf c2c report -g --stdio -i perf_c2c_g_a_both.data > perf_c2c_g_a_both.out 2>&1
+
+    #
+    # Get all-user physical addr stats, in case multiple threads or processes are
+    # accessing shared memory with different vaddrs.
+    #
+    sudo perf c2c record --phys-data -a --ldlat=70 --all-user -o perf_c2c_a_all_user_phys_data.data sleep 5
+    sudo perf c2c report --stdio -i perf_c2c_a_all_user_phys_data.data > perf_c2c_a_all_user_phys_data.out 2>&1
+}
+
+function run() {
+    local dir=$1
+    shift
+    (
+        rm -fr $dir
+        mkdir $dir
+        cd $dir
+        ceph_test_c2c --threads $(($(nproc) * 2)) "$@" &
+        sleep 30 # let it warm up
+        run_perf_c2c
+        kill $! || { echo "ceph_test_c2c WAS NOT RUNNING" ; exit 1 ; }
+    ) || exit 1
+}
+
+function bench() {
+    optimized=$(timeout 30 ceph_test_c2c --threads $(($(nproc) * 2)) --sharding 2> /dev/null || true)
+    not_optimized=$(timeout 30 ceph_test_c2c --threads $(($(nproc) * 2)) 2> /dev/null || true)
+    if ! (( $optimized > ( $not_optimized * 2 ) )) ; then
+        echo "the optimization is expected to be at least x2 faster"
+        exit 1
+    fi
+}
+
+run with-sharding --sharding
+run without-sharding
+bench
diff --git a/qa/suites/rados/standalone/workloads/c2c.yaml b/qa/suites/rados/standalone/workloads/c2c.yaml
new file mode 100644
index 0000000000000..9a0dfce944d7d
--- /dev/null
+++ b/qa/suites/rados/standalone/workloads/c2c.yaml
@@ -0,0 +1,18 @@
+arch: x86_64
+roles:
+- - mon.a
+  - mgr.x
+  - osd.0
+  - client.0
+tasks:
+- install:
+    extra_system_packages:
+      rpm:
+        - perf
+      deb:
+        - linux-tools-generic
+- workunit:
+    basedir: qa/standalone
+    clients:
+      all:
+        - c2c
diff --git a/src/include/mempool.h b/src/include/mempool.h
index fe84f3b8f0970..08d8282fb2f0e 100644
--- a/src/include/mempool.h
+++ b/src/include/mempool.h
@@ -201,7 +201,16 @@ enum {
   num_shards = 1 << num_shard_bits
 };
 
-// align shard to a cacheline
+//
+// Align shard to a cacheline.
+//
+// It would be possible to retrieve the value at runtime (for instance
+// with getconf LEVEL1_DCACHE_LINESIZE or grep -m1 cache_alignment
+// /proc/cpuinfo). It is easier to hard code the largest cache
+// linesize for all known processors (128 bytes). If the actual cache
+// linesize is smaller on a given processor, it will just waste a few
+// bytes.
+//
 struct shard_t {
   ceph::atomic<size_t> bytes = {0};
   ceph::atomic<size_t> items = {0};
diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt
index be6673393b20d..1cc93df1dd551 100644
--- a/src/test/CMakeLists.txt
+++ b/src/test/CMakeLists.txt
@@ -478,6 +478,20 @@ install(TARGETS ceph_test_stress_watch
   DESTINATION ${CMAKE_INSTALL_BINDIR})
 
+add_executable(ceph_test_c2c
+  test_c2c.cc
+  )
+target_link_libraries(ceph_test_c2c
+  global
+  ceph-common
+  pthread
+  ${EXTRALIBS}
+  ${CMAKE_DL_LIBS}
+  )
+install(TARGETS
+  ceph_test_c2c
+  DESTINATION ${CMAKE_INSTALL_BINDIR})
+
 if(WITH_FUSE)
   add_executable(ceph_test_cfuse_cache_invalidate
     test_cfuse_cache_invalidate.cc
diff --git a/src/test/test_c2c.cc b/src/test/test_c2c.cc
new file mode 100644
index 0000000000000..07f6752d777bb
--- /dev/null
+++ b/src/test/test_c2c.cc
@@ -0,0 +1,89 @@
+#include "common/ceph_argparse.h"
+#include "common/debug.h"
+#include "common/config.h"
+#include "global/global_init.h"
+#include "global/signal_handler.h"
+
+#include "include/mempool.h"
+
+#include <iostream>
+#include <string>
+
+using std::cerr;
+using std::string;
+
+static void usage(void)
+{
+  cerr << "--threads    number of threads (default 1)" << std::endl;
+  cerr << "--sharding   activate sharding optimization" << std::endl;
+}
+
+
+mempool::shard_t shards[mempool::num_shards] = {0};
+
+void sigterm_handler(int signum)
+{
+  size_t total = 0;
+  for (auto& shard : shards) {
+    total += shard.bytes;
+  }
+  std::cout << total << std::endl;
+  exit(0);
+}
+
+int main(int argc, const char **argv)
+{
+  int ret = 0;
+  std::vector<const char*> args;
+  argv_to_vec(argc, argv, args);
+  auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+                         CODE_ENVIRONMENT_UTILITY,
+                         CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+  common_init_finish(g_ceph_context);
+
+  int threads = 1;
+  bool sharding = false;
+  for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) {
+    if (ceph_argparse_double_dash(args, i)) {
+      break;
+    }
+    else if (ceph_argparse_witharg(args, i, &threads, cerr, "--threads", "-t", (char*)NULL)) {
+    }
+    else if (ceph_argparse_flag(args, i, "--sharding", "-s", (char*)NULL)) {
+      sharding = true;
+    }
+    else {
+      cerr << "unknown command line option: " << *i << std::endl;
+      cerr << std::endl;
+      usage();
+      return 2;
+    }
+  }
+
+  init_async_signal_handler();
+  register_async_signal_handler(SIGTERM, sigterm_handler);
+
+
+  std::vector<std::thread> workers;
+  for (int i = 0; i < threads; i++) {
+    workers.push_back(
+      std::thread([&](){
+        while(1) {
+          size_t i;
+          if (sharding) {
+            i = mempool::pool_t::pick_a_shard_int();
+          } else {
+            i = 0;
+          }
+          shards[i].bytes++;
+        }
+      }));
+  }
+
+  for (auto& t:workers) {
+    t.join();
+  }
+  workers.clear();
+
+  return ret;
+}
-- 
2.39.5
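
Note (not part of the patch): the standalone sketch below illustrates the cacheline-sharding pattern that ceph_test_c2c benchmarks. The shard count, the hard-coded 128-byte alignment constant, and the hash-based shard picker are simplifications assumed for this example only; the actual implementation is shard_t, num_shards and pool_t::pick_a_shard_int() in src/include/mempool.h.

// Assumed illustration, not Ceph code: per-thread, cacheline-aligned
// counter shards instead of a single shared atomic counter.
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <thread>
#include <vector>

static constexpr std::size_t num_shards = 32;   // illustrative shard count
static constexpr std::size_t cacheline  = 128;  // largest known linesize, as in mempool.h

// Each shard gets its own cacheline, so increments from different threads
// do not keep invalidating each other's cachelines.
struct alignas(cacheline) counter_shard_t {
  std::atomic<uint64_t> items{0};
};

static counter_shard_t shards[num_shards];

// Simplified stand-in for mempool::pool_t::pick_a_shard_int(): hash the
// thread id onto a shard so concurrent threads usually hit distinct shards.
static std::size_t pick_a_shard() {
  return std::hash<std::thread::id>{}(std::this_thread::get_id()) % num_shards;
}

int main() {
  std::vector<std::thread> workers;
  for (unsigned t = 0; t < std::thread::hardware_concurrency(); ++t) {
    workers.emplace_back([] {
      const std::size_t shard = pick_a_shard();  // use 0 here to mimic the unsharded case
      for (int n = 0; n < 10000000; ++n) {
        shards[shard].items++;                   // mostly uncontended increment
      }
    });
  }
  for (auto& w : workers) {
    w.join();
  }
  uint64_t total = 0;
  for (auto& s : shards) {
    total += s.items;                            // aggregate only when reporting
  }
  std::cout << total << std::endl;
  return 0;
}

Running this sketch once as-is and once with pick_a_shard() forced to return 0, under perf c2c record, should show the same kind of difference in load-HITM counts and in increments per second that the qa script and bench() above look for.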