git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
qa: verify the benefits of mempool cacheline optimization 41014/head
author Loïc Dachary <loic@dachary.org>
Fri, 19 Mar 2021 07:29:08 +0000 (08:29 +0100)
committer Kefu Chai <kchai@redhat.com>
Fri, 30 Apr 2021 04:11:13 +0000 (12:11 +0800)
There already is a test to verify that mempool sharding works, in the sense
that it uses at least half of the available counters to track the number of
allocated objects and their total size. This new test verifies that, with
sharding, object counting is at least twice as fast as without sharding. It
also collects cacheline contention data with the perf c2c tool. Manual
analysis of this data shows that the performance gain is indeed due to
reduced cacheline contention.

Fixes: https://tracker.ceph.com/issues/49896
Signed-off-by: Loïc Dachary <loic@dachary.org>
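
As context for the numbers above, the effect can be reproduced outside of Ceph
with a minimal standalone sketch (hypothetical, not part of this commit): N
threads increment either a single shared counter, so every increment bounces
the same cacheline between cores, or one cacheline-aligned counter per thread,
which is essentially what ceph_test_c2c does without and with --sharding.

// Minimal sketch (hypothetical, not part of this commit; requires C++17 for
// over-aligned allocation in std::vector): N threads increment either a
// single shared counter or one cacheline-aligned counter per thread, and the
// totals reached in a fixed amount of time are compared.
#include <atomic>
#include <chrono>
#include <cstddef>
#include <cstdio>
#include <thread>
#include <vector>

struct alignas(128) padded_counter {   // 128 = largest known cacheline size
  std::atomic<std::size_t> value{0};
};

static std::size_t run(bool sharded, unsigned nthreads) {
  std::vector<padded_counter> counters(nthreads);
  std::atomic<bool> stop{false};
  std::vector<std::thread> workers;
  for (unsigned t = 0; t < nthreads; t++) {
    workers.emplace_back([&, t] {
      // Without sharding every thread hammers slot 0 (one contended
      // cacheline); with sharding each thread has its own cacheline.
      auto& counter = counters[sharded ? t : 0];
      while (!stop.load(std::memory_order_relaxed)) {
        counter.value.fetch_add(1, std::memory_order_relaxed);
      }
    });
  }
  std::this_thread::sleep_for(std::chrono::seconds(3));
  stop = true;
  for (auto& w : workers) {
    w.join();
  }
  std::size_t total = 0;
  for (auto& c : counters) {
    total += c.value;
  }
  return total;
}

int main() {
  unsigned nthreads = std::thread::hardware_concurrency();
  if (nthreads == 0) {
    nthreads = 4;
  }
  std::size_t without_sharding = run(false, nthreads);
  std::size_t with_sharding = run(true, nthreads);
  std::printf("without sharding: %zu  with sharding: %zu\n",
              without_sharding, with_sharding);
  return with_sharding > without_sharding * 2 ? 0 : 1;  // expect at least 2x
}

On a multi-core machine the sharded run is expected to reach a much larger
total in the same amount of time, which is the >2x ratio the bench() check in
qa/standalone/c2c/c2c.sh below relies on.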
qa/standalone/c2c/c2c.sh [new file with mode: 0755]
qa/suites/rados/standalone/workloads/c2c.yaml [new file with mode: 0644]
src/include/mempool.h
src/test/CMakeLists.txt
src/test/test_c2c.cc [new file with mode: 0644]

diff --git a/qa/standalone/c2c/c2c.sh b/qa/standalone/c2c/c2c.sh
new file mode 100755 (executable)
index 0000000..a6969d5
--- /dev/null
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+
+set -ex
+
+function run_perf_c2c() {
+    # First get some background system info
+    uname -a > uname.out
+    lscpu > lscpu.out
+    cat /proc/cmdline > cmdline.out
+    timeout -s INT 10 vmstat -w 1 > vmstat.out || true
+    sudo dmesg >& dmesg.out
+    cat /proc/cpuinfo > cpuinfo.out
+    ps axo psr,time,stat,ppid,pid,pcpu,comm > ps.1.out
+    ps -eafT > ps.2.out
+    sudo sysctl -a > sysctl.out
+
+    nodecnt=`lscpu|grep "NUMA node(" |awk '{print $3}'`
+    for ((i=0; i<$nodecnt; i++))
+    do
+       sudo cat /sys/devices/system/node/node${i}/meminfo > meminfo.$i.out
+    done
+    sudo more `sudo find /proc -name status` > proc_parent_child_status.out
+    sudo more /proc/*/numa_maps > numa_maps.out
+
+    #
+    # Get separate kernel and user perf-c2c stats
+    #
+    sudo perf c2c record -a --ldlat=70 --all-user -o perf_c2c_a_all_user.data sleep 5
+    sudo perf c2c report --stdio -i perf_c2c_a_all_user.data > perf_c2c_a_all_user.out 2>&1
+    sudo perf c2c report --full-symbols --stdio -i perf_c2c_a_all_user.data > perf_c2c_full-sym_a_all_user.out 2>&1
+
+    sudo perf c2c record --call-graph dwarf -a --ldlat=70 --all-user -o perf_c2c_g_a_all_user.data sleep 5
+    sudo perf c2c report -g --stdio -i perf_c2c_g_a_all_user.data > perf_c2c_g_a_all_user.out 2>&1
+
+    sudo perf c2c record -a --ldlat=70 --all-kernel -o perf_c2c_a_all_kernel.data sleep 4
+    sudo perf c2c report --stdio -i perf_c2c_a_all_kernel.data > perf_c2c_a_all_kernel.out 2>&1
+
+    sudo perf c2c record --call-graph dwarf --ldlat=70 -a --all-kernel -o perf_c2c_g_a_all_kernel.data sleep 4
+
+    sudo perf c2c report -g --stdio -i perf_c2c_g_a_all_kernel.data > perf_c2c_g_a_all_kernel.out 2>&1
+
+    #
+    # Get combined kernel and user perf-c2c stats
+    #
+    sudo perf c2c record -a --ldlat=70 -o perf_c2c_a_both.data sleep 4
+    sudo perf c2c report --stdio -i perf_c2c_a_both.data > perf_c2c_a_both.out 2>&1
+
+    sudo perf c2c record --call-graph dwarf --ldlat=70 -a -o perf_c2c_g_a_both.data sleep 4
+    sudo perf c2c report -g --stdio -i perf_c2c_g_a_both.data > perf_c2c_g_a_both.out 2>&1
+
+    #
+    # Get all-user physical addr stats, in case multiple threads or processes are
+    # accessing shared memory with different vaddrs.
+    #
+    sudo perf c2c record --phys-data -a --ldlat=70 --all-user -o perf_c2c_a_all_user_phys_data.data sleep 5
+    sudo perf c2c report --stdio -i perf_c2c_a_all_user_phys_data.data > perf_c2c_a_all_user_phys_data.out 2>&1
+}
+
+function run() {
+    local dir=$1
+    shift
+    (
+       rm -fr $dir
+       mkdir $dir
+       cd $dir
+       ceph_test_c2c --threads $(($(nproc) * 2)) "$@" &
+       sleep 30 # let it warm up
+       run_perf_c2c
+       kill $! || { echo "ceph_test_c2c WAS NOT RUNNING" ; exit 1 ; }
+    ) || exit 1
+}
+
+function bench() {
+    optimized=$(timeout 30 ceph_test_c2c --threads $(($(nproc) * 2)) --sharding 2> /dev/null || true)
+    not_optimized=$(timeout 30 ceph_test_c2c --threads $(($(nproc) * 2)) 2> /dev/null || true)
+    if ! (( $optimized > ( $not_optimized * 2 ) )) ; then
+       echo "the optimization is expected to be at least x2 faster"
+       exit 1
+    fi
+}
+
+run with-sharding --sharding
+run without-sharding
+bench
diff --git a/qa/suites/rados/standalone/workloads/c2c.yaml b/qa/suites/rados/standalone/workloads/c2c.yaml
new file mode 100644 (file)
index 0000000..9a0dfce
--- /dev/null
@@ -0,0 +1,18 @@
+arch: x86_64
+roles:
+- - mon.a
+  - mgr.x
+  - osd.0
+  - client.0
+tasks:
+- install:
+    extra_system_packages:
+      rpm:
+        - perf
+      deb:
+        - linux-tools-generic
+- workunit:
+    basedir: qa/standalone
+    clients:
+      all:
+        - c2c
diff --git a/src/include/mempool.h b/src/include/mempool.h
index fe84f3b8f09704b7f8f3f0834251f436a5634786..08d8282fb2f0e53c8d5e033df71aecf6ae8a3ed1 100644 (file)
@@ -201,7 +201,16 @@ enum {
   num_shards = 1 << num_shard_bits
 };
 
-// align shard to a cacheline
+//
+// Align shard to a cacheline.
+//
+// It would be possible to retrieve the value at runtime (for instance
+// with getconf LEVEL1_DCACHE_LINESIZE or grep -m1 cache_alignment
+// /proc/cpuinfo). It is easier to hard code the largest cache
+// linesize for all known processors (128 bytes). If the actual cache
+// linesize is smaller on a given processor, it will just waste a few
+// bytes.
+//
 struct shard_t {
   ceph::atomic<size_t> bytes = {0};
   ceph::atomic<size_t> items = {0};
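
As a side note on the comment in this hunk: the runtime lookup it mentions
could look like the sketch below (hypothetical, not part of this commit).
sysconf() is POSIX, but _SC_LEVEL1_DCACHE_LINESIZE is a glibc extension and
may report 0 or -1 when the value is not exposed, which is part of why
hard-coding 128 bytes is the simpler, always-safe choice.

// Hypothetical sketch (not part of this commit): runtime detection of the
// L1 data cacheline size, falling back to the hard-coded maximum.
#include <cstdio>
#include <unistd.h>

int main() {
  long linesize = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
  if (linesize <= 0) {
    linesize = 128;  // value unknown: fall back to the largest known size
  }
  std::printf("L1 dcache line size: %ld bytes\n", linesize);
  return 0;
}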
diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt
index be6673393b20dd71c7e0a3e1a4fcda5a0af33024..1cc93df1dd5512faa8dae4f4da7e86f4500e98c0 100644 (file)
@@ -478,6 +478,20 @@ install(TARGETS
   ceph_test_stress_watch
   DESTINATION ${CMAKE_INSTALL_BINDIR})
 
+add_executable(ceph_test_c2c
+  test_c2c.cc
+  )
+target_link_libraries(ceph_test_c2c
+  global
+  ceph-common
+  pthread
+  ${EXTRALIBS}
+  ${CMAKE_DL_LIBS}
+  )
+install(TARGETS
+  ceph_test_c2c
+  DESTINATION ${CMAKE_INSTALL_BINDIR})
+
 if(WITH_FUSE)
   add_executable(ceph_test_cfuse_cache_invalidate
     test_cfuse_cache_invalidate.cc
diff --git a/src/test/test_c2c.cc b/src/test/test_c2c.cc
new file mode 100644 (file)
index 0000000..07f6752
--- /dev/null
@@ -0,0 +1,93 @@
+#include "common/ceph_argparse.h"
+#include "common/debug.h"
+#include "common/config.h"
+#include "global/global_init.h"
+#include "global/signal_handler.h"
+
+#include "include/mempool.h"
+
+#include <cstdlib>
+#include <iostream>
+#include <string>
+#include <thread>
+#include <vector>
+
+using std::cerr;
+using std::cout;
+using std::string;
+
+static void usage(void)
+{
+  cerr << "--threads       number of threads (default 1)" << std::endl;
+  cerr << "--sharding      activate sharding optimization" << std::endl;
+}
+
+
+mempool::shard_t shards[mempool::num_shards] = {0};
+
+void sigterm_handler(int signum)
+{
+  size_t total = 0;
+  for (auto& shard : shards) {
+    total += shard.bytes;
+  }
+  cout << total << std::endl;
+  exit(0);
+}
+
+int main(int argc, const char **argv)
+{
+  int ret = 0;
+  std::vector<const char*> args;
+  argv_to_vec(argc, argv, args);
+  auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+                        CODE_ENVIRONMENT_UTILITY,
+                        CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+  common_init_finish(g_ceph_context);
+
+  int threads = 1;
+  bool sharding = false;
+  for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) {
+    if (ceph_argparse_double_dash(args, i)) {
+      break;
+    }
+    else if (ceph_argparse_witharg(args, i, &threads, cerr, "--threads", "-t", (char*)NULL)) {
+    }
+    else if (ceph_argparse_flag(args, i, "--sharding", "-s", (char*)NULL)) {
+      sharding = true;
+    }
+    else {
+      cerr << "unknown command line option: " << *i << std::endl;
+      cerr << std::endl;
+      usage();
+      return 2;
+    }
+  }
+
+  init_async_signal_handler();
+  register_async_signal_handler(SIGTERM, sigterm_handler);
+
+
+  std::vector<std::thread> workers;
+  for (int i = 0; i < threads; i++) {
+    workers.push_back(
+      std::thread([&](){
+         while(1) {
+           size_t i;
+           if (sharding) {
+             i = mempool::pool_t::pick_a_shard_int();
+           } else {
+             i = 0;
+           }
+           shards[i].bytes++;
+         }
+       }));
+  }
+
+  for (auto& t:workers) {
+    t.join();
+  }
+  workers.clear();
+
+  return ret;
+}