git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
qa: verify the benefits of mempool cacheline optimization 41014/head
author Loïc Dachary <loic@dachary.org>
Fri, 19 Mar 2021 07:29:08 +0000 (08:29 +0100)
committer Kefu Chai <kchai@redhat.com>
Fri, 30 Apr 2021 04:11:13 +0000 (12:11 +0800)
There already is a test to verify that mempool sharding works, in the sense
that it uses at least half of the available counters to track the number of
allocated objects and their total size. This new test verifies that, with
sharding, object counting is at least twice as fast as without sharding. It
also collects cacheline contention data with the perf c2c tool. Manual
analysis of this data shows that the performance gain is indeed due to
reduced cacheline contention.

Fixes: https://tracker.ceph.com/issues/49896
Signed-off-by: Loïc Dachary <loic@dachary.org>
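
As context for the numbers above, the effect can be reproduced outside of Ceph
with a minimal standalone sketch (hypothetical, not part of this commit): N
threads increment either a single shared counter, so every increment bounces
the same cacheline between cores, or one cacheline-aligned counter per thread,
which is essentially what ceph_test_c2c does without and with --sharding.

// Minimal sketch (hypothetical, not part of this commit; requires C++17 for
// over-aligned allocation in std::vector): N threads increment either a
// single shared counter or one cacheline-aligned counter per thread, and the
// totals reached in a fixed amount of time are compared.
#include <atomic>
#include <chrono>
#include <cstddef>
#include <cstdio>
#include <thread>
#include <vector>

struct alignas(128) padded_counter {   // 128 = largest known cacheline size
  std::atomic<std::size_t> value{0};
};

static std::size_t run(bool sharded, unsigned nthreads) {
  std::vector<padded_counter> counters(nthreads);
  std::atomic<bool> stop{false};
  std::vector<std::thread> workers;
  for (unsigned t = 0; t < nthreads; t++) {
    workers.emplace_back([&, t] {
      // Without sharding every thread hammers slot 0 (one contended
      // cacheline); with sharding each thread has its own cacheline.
      auto& counter = counters[sharded ? t : 0];
      while (!stop.load(std::memory_order_relaxed)) {
        counter.value.fetch_add(1, std::memory_order_relaxed);
      }
    });
  }
  std::this_thread::sleep_for(std::chrono::seconds(3));
  stop = true;
  for (auto& w : workers) {
    w.join();
  }
  std::size_t total = 0;
  for (auto& c : counters) {
    total += c.value;
  }
  return total;
}

int main() {
  unsigned nthreads = std::thread::hardware_concurrency();
  if (nthreads == 0) {
    nthreads = 4;
  }
  std::size_t without_sharding = run(false, nthreads);
  std::size_t with_sharding = run(true, nthreads);
  std::printf("without sharding: %zu  with sharding: %zu\n",
              without_sharding, with_sharding);
  return with_sharding > without_sharding * 2 ? 0 : 1;  // expect at least 2x
}

On a multi-core machine the sharded run is expected to reach a much larger
total in the same amount of time, which is the >2x ratio the bench() check in
qa/standalone/c2c/c2c.sh below relies on.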
qa/standalone/c2c/c2c.sh [new file with mode: 0755]
qa/suites/rados/standalone/workloads/c2c.yaml [new file with mode: 0644]
src/include/mempool.h
src/test/CMakeLists.txt
src/test/test_c2c.cc [new file with mode: 0644]

diff --git a/qa/standalone/c2c/c2c.sh b/qa/standalone/c2c/c2c.sh
new file mode 100755 (executable)
index 0000000..a6969d5
--- /dev/null
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+
+set -ex
+
+function run_perf_c2c() {
+    # First get some background system info
+    uname -a > uname.out
+    lscpu > lscpu.out
+    cat /proc/cmdline > cmdline.out
+    timeout -s INT 10 vmstat -w 1 > vmstat.out || true
+    sudo dmesg >& dmesg.out
+    cat /proc/cpuinfo > cpuinfo.out
+    ps axo psr,time,stat,ppid,pid,pcpu,comm > ps.1.out
+    ps -eafT > ps.2.out
+    sudo sysctl -a > sysctl.out
+
+    nodecnt=`lscpu|grep "NUMA node(" |awk '{print $3}'`
+    for ((i=0; i<$nodecnt; i++))
+    do
+       sudo cat /sys/devices/system/node/node${i}/meminfo > meminfo.$i.out
+    done
+    sudo more `sudo find /proc -name status` > proc_parent_child_status.out
+    sudo more /proc/*/numa_maps > numa_maps.out
+
+    #
+    # Get separate kernel and user perf-c2c stats
+    #
+    sudo perf c2c record -a --ldlat=70 --all-user -o perf_c2c_a_all_user.data sleep 5
+    sudo perf c2c report --stdio -i perf_c2c_a_all_user.data > perf_c2c_a_all_user.out 2>&1
+    sudo perf c2c report --full-symbols --stdio -i perf_c2c_a_all_user.data > perf_c2c_full-sym_a_all_user.out 2>&1
+
+    sudo perf c2c record --call-graph dwarf -a --ldlat=70 --all-user -o perf_c2c_g_a_all_user.data sleep 5
+    sudo perf c2c report -g --stdio -i perf_c2c_g_a_all_user.data > perf_c2c_g_a_all_user.out 2>&1
+
+    sudo perf c2c record -a --ldlat=70 --all-kernel -o perf_c2c_a_all_kernel.data sleep 4
+    sudo perf c2c report --stdio -i perf_c2c_a_all_kernel.data > perf_c2c_a_all_kernel.out 2>&1
+
+    sudo perf c2c record --call-graph dwarf --ldlat=70 -a --all-kernel -o perf_c2c_g_a_all_kernel.data sleep 4
+
+    sudo perf c2c report -g --stdio -i perf_c2c_g_a_all_kernel.data > perf_c2c_g_a_all_kernel.out 2>&1
+
+    #
+    # Get combined kernel and user perf-c2c stats
+    #
+    sudo perf c2c record -a --ldlat=70 -o perf_c2c_a_both.data sleep 4
+    sudo perf c2c report --stdio -i perf_c2c_a_both.data > perf_c2c_a_both.out 2>&1
+
+    sudo perf c2c record --call-graph dwarf --ldlat=70 -a -o perf_c2c_g_a_both.data sleep 4
+    sudo perf c2c report -g --stdio -i perf_c2c_g_a_both.data > perf_c2c_g_a_both.out 2>&1
+
+    #
+    # Get all-user physical addr stats, in case multiple threads or processes are
+    # accessing shared memory with different vaddrs.
+    #
+    sudo perf c2c record --phys-data -a --ldlat=70 --all-user -o perf_c2c_a_all_user_phys_data.data sleep 5
+    sudo perf c2c report --stdio -i perf_c2c_a_all_user_phys_data.data > perf_c2c_a_all_user_phys_data.out 2>&1
+}
+
+function run() {
+    local dir=$1
+    shift
+    (
+       rm -fr $dir
+       mkdir $dir
+       cd $dir
+       ceph_test_c2c --threads $(($(nproc) * 2)) "$@" &
+       sleep 30 # let it warm up
+       run_perf_c2c
+       kill $! || { echo "ceph_test_c2c WAS NOT RUNNING" ; exit 1 ; }
+    ) || exit 1
+}
+
+function bench() {
+    optimized=$(timeout 30 ceph_test_c2c --threads $(($(nproc) * 2)) --sharding 2> /dev/null || true)
+    not_optimized=$(timeout 30 ceph_test_c2c --threads $(($(nproc) * 2)) 2> /dev/null || true)
+    if ! (( $optimized > ( $not_optimized * 2 ) )) ; then
+       echo "the optimization is expected to be at least x2 faster"
+       exit 1
+    fi
+}
+
+run with-sharding --sharding
+run without-sharding
+bench
diff --git a/qa/suites/rados/standalone/workloads/c2c.yaml b/qa/suites/rados/standalone/workloads/c2c.yaml
new file mode 100644 (file)
index 0000000..9a0dfce
--- /dev/null
@@ -0,0 +1,18 @@
+arch: x86_64
+roles:
+- - mon.a
+  - mgr.x
+  - osd.0
+  - client.0
+tasks:
+- install:
+    extra_system_packages:
+      rpm:
+        - perf
+      deb:
+        - linux-tools-generic
+- workunit:
+    basedir: qa/standalone
+    clients:
+      all:
+        - c2c
diff --git a/src/include/mempool.h b/src/include/mempool.h
index fe84f3b8f09704b7f8f3f0834251f436a5634786..08d8282fb2f0e53c8d5e033df71aecf6ae8a3ed1 100644 (file)
@@ -201,7 +201,16 @@ enum {
   num_shards = 1 << num_shard_bits
 };
 
-// align shard to a cacheline
+//
+// Align shard to a cacheline.
+//
+// It would be possible to retrieve the value at runtime (for instance
+// with getconf LEVEL1_DCACHE_LINESIZE or grep -m1 cache_alignment
+// /proc/cpuinfo). It is easier to hard code the largest cache
+// linesize for all known processors (128 bytes). If the actual cache
+// linesize is smaller on a given processor, it will just waste a few
+// bytes.
+//
 struct shard_t {
   ceph::atomic<size_t> bytes = {0};
   ceph::atomic<size_t> items = {0};
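
As a side note on the comment in this hunk: the runtime lookup it mentions
could look like the sketch below (hypothetical, not part of this commit).
sysconf() is POSIX, but _SC_LEVEL1_DCACHE_LINESIZE is a glibc extension and
may report 0 or -1 when the value is not exposed, which is part of why
hard-coding 128 bytes is the simpler, always-safe choice.

// Hypothetical sketch (not part of this commit): runtime detection of the
// L1 data cacheline size, falling back to the hard-coded maximum.
#include <cstdio>
#include <unistd.h>

int main() {
  long linesize = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
  if (linesize <= 0) {
    linesize = 128;  // value unknown: fall back to the largest known size
  }
  std::printf("L1 dcache line size: %ld bytes\n", linesize);
  return 0;
}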
diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt
index be6673393b20dd71c7e0a3e1a4fcda5a0af33024..1cc93df1dd5512faa8dae4f4da7e86f4500e98c0 100644 (file)
@@ -478,6 +478,20 @@ install(TARGETS
   ceph_test_stress_watch
   DESTINATION ${CMAKE_INSTALL_BINDIR})
 
+add_executable(ceph_test_c2c
+  test_c2c.cc
+  )
+target_link_libraries(ceph_test_c2c
+  global
+  ceph-common
+  pthread
+  ${EXTRALIBS}
+  ${CMAKE_DL_LIBS}
+  )
+install(TARGETS
+  ceph_test_c2c
+  DESTINATION ${CMAKE_INSTALL_BINDIR})
+
 if(WITH_FUSE)
   add_executable(ceph_test_cfuse_cache_invalidate
     test_cfuse_cache_invalidate.cc
diff --git a/src/test/test_c2c.cc b/src/test/test_c2c.cc
new file mode 100644 (file)
index 0000000..07f6752
--- /dev/null
@@ -0,0 +1,93 @@
+#include "common/ceph_argparse.h"
+#include "common/debug.h"
+#include "common/config.h"
+#include "global/global_init.h"
+#include "global/signal_handler.h"
+
+#include "include/mempool.h"
+
+#include <cstdlib>
+#include <iostream>
+#include <string>
+#include <thread>
+#include <vector>
+
+using std::cerr;
+using std::cout;
+using std::string;
+
+static void usage(void)
+{
+  cerr << "--threads       number of threads (default 1)" << std::endl;
+  cerr << "--sharding      activate sharding optimization" << std::endl;
+}
+
+
+mempool::shard_t shards[mempool::num_shards] = {0};
+
+void sigterm_handler(int signum)
+{
+  size_t total = 0;
+  for (auto& shard : shards) {
+    total += shard.bytes;
+  }
+  cout << total << std::endl;
+  exit(0);
+}
+
+int main(int argc, const char **argv)
+{
+  int ret = 0;
+  std::vector<const char*> args;
+  argv_to_vec(argc, argv, args);
+  auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+                        CODE_ENVIRONMENT_UTILITY,
+                        CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+  common_init_finish(g_ceph_context);
+
+  int threads = 1;
+  bool sharding = false;
+  for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) {
+    if (ceph_argparse_double_dash(args, i)) {
+      break;
+    }
+    else if (ceph_argparse_witharg(args, i, &threads, cerr, "--threads", "-t", (char*)NULL)) {
+    }
+    else if (ceph_argparse_flag(args, i, "--sharding", "-s", (char*)NULL)) {
+      sharding = true;
+    }
+    else {
+      cerr << "unknown command line option: " << *i << std::endl;
+      cerr << std::endl;
+      usage();
+      return 2;
+    }
+  }
+
+  init_async_signal_handler();
+  register_async_signal_handler(SIGTERM, sigterm_handler);
+
+
+  std::vector<std::thread> workers;
+  for (int i = 0; i < threads; i++) {
+    workers.push_back(
+      std::thread([&](){
+         while(1) {
+           size_t i;
+           if (sharding) {
+             i = mempool::pool_t::pick_a_shard_int();
+           } else {
+             i = 0;
+           }
+           shards[i].bytes++;
+         }
+       }));
+  }
+
+  for (auto& t:workers) {
+    t.join();
+  }
+  workers.clear();
+
+  return ret;
+}