From 7fe0ac7c1159d3ee81de27365dcf76ba6f4a2fb5 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Lo=C3=AFc=20Dachary?=
Date: Fri, 19 Mar 2021 08:29:08 +0100
Subject: [PATCH] qa: verify the benefits of mempool cacheline optimization
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

There already is a test to verify that mempool sharding works, in the
sense that it uses at least half of the shard variables available to
count the number of allocated objects and their total size. This new
test verifies that, with sharding, object counting is at least twice
as fast as without sharding. It also collects cacheline contention
data with the perf c2c tool. Manual analysis of this data shows that
the optimization gain is indeed related to cacheline contention.

Fixes: https://tracker.ceph.com/issues/49896

Signed-off-by: Loïc Dachary
---
 qa/standalone/c2c/c2c.sh                      | 84 +++++++++++++++++
 qa/suites/rados/standalone/workloads/c2c.yaml | 18 ++++
 src/include/mempool.h                         | 11 ++-
 src/test/CMakeLists.txt                       | 14 +++
 src/test/test_c2c.cc                          | 89 +++++++++++++++++++
 5 files changed, 215 insertions(+), 1 deletion(-)
 create mode 100755 qa/standalone/c2c/c2c.sh
 create mode 100644 qa/suites/rados/standalone/workloads/c2c.yaml
 create mode 100644 src/test/test_c2c.cc

diff --git a/qa/standalone/c2c/c2c.sh b/qa/standalone/c2c/c2c.sh
new file mode 100755
index 0000000000000..a6969d555d829
--- /dev/null
+++ b/qa/standalone/c2c/c2c.sh
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+
+set -ex
+
+function run_perf_c2c() {
+    # First get some background system info
+    uname -a > uname.out
+    lscpu > lscpu.out
+    cat /proc/cmdline > cmdline.out
+    timeout -s INT 10 vmstat -w 1 > vmstat.out || true
+    sudo dmesg >& dmesg.out
+    cat /proc/cpuinfo > cpuinfo.out
+    ps axo psr,time,stat,ppid,pid,pcpu,comm > ps.1.out
+    ps -eafT > ps.2.out
+    sudo sysctl -a > sysctl.out
+
+    nodecnt=`lscpu|grep "NUMA node(" |awk '{print $3}'`
+    for ((i=0; i<$nodecnt; i++))
+    do
+        sudo cat /sys/devices/system/node/node${i}/meminfo > meminfo.$i.out
+    done
+    sudo more `sudo find /proc -name status` > proc_parent_child_status.out
+    sudo more /proc/*/numa_maps > numa_maps.out
+
+    #
+    # Get separate kernel and user perf-c2c stats
+    #
+    sudo perf c2c record -a --ldlat=70 --all-user -o perf_c2c_a_all_user.data sleep 5
+    sudo perf c2c report --stdio -i perf_c2c_a_all_user.data > perf_c2c_a_all_user.out 2>&1
+    sudo perf c2c report --full-symbols --stdio -i perf_c2c_a_all_user.data > perf_c2c_full-sym_a_all_user.out 2>&1
+
+    sudo perf c2c record --call-graph dwarf -a --ldlat=70 --all-user -o perf_c2c_g_a_all_user.data sleep 5
+    sudo perf c2c report -g --stdio -i perf_c2c_g_a_all_user.data > perf_c2c_g_a_all_user.out 2>&1
+
+    sudo perf c2c record -a --ldlat=70 --all-kernel -o perf_c2c_a_all_kernel.data sleep 4
+    sudo perf c2c report --stdio -i perf_c2c_a_all_kernel.data > perf_c2c_a_all_kernel.out 2>&1
+
+    sudo perf c2c record --call-graph dwarf --ldlat=70 -a --all-kernel -o perf_c2c_g_a_all_kernel.data sleep 4
+
+    sudo perf c2c report -g --stdio -i perf_c2c_g_a_all_kernel.data > perf_c2c_g_a_all_kernel.out 2>&1
+
+    #
+    # Get combined kernel and user perf-c2c stats
+    #
+    sudo perf c2c record -a --ldlat=70 -o perf_c2c_a_both.data sleep 4
+    sudo perf c2c report --stdio -i perf_c2c_a_both.data > perf_c2c_a_both.out 2>&1
+
+    sudo perf c2c record --call-graph dwarf --ldlat=70 -a --all-kernel -o perf_c2c_g_a_both.data sleep 4
+    sudo perf c2c report -g --stdio -i perf_c2c_g_a_both.data > perf_c2c_g_a_both.out 2>&1
+
+    #
+    # Get all-user physical addr stats, in case multiple threads or processes are
+    # accessing shared memory with different vaddrs.
+    #
+    sudo perf c2c record --phys-data -a --ldlat=70 --all-user -o perf_c2c_a_all_user_phys_data.data sleep 5
+    sudo perf c2c report --stdio -i perf_c2c_a_all_user_phys_data.data > perf_c2c_a_all_user_phys_data.out 2>&1
+}
+
+function run() {
+    local dir=$1
+    shift
+    (
+        rm -fr $dir
+        mkdir $dir
+        cd $dir
+        ceph_test_c2c --threads $(($(nproc) * 2)) "$@" &
+        sleep 30 # let it warm up
+        run_perf_c2c
+        kill $! || { echo "ceph_test_c2c WAS NOT RUNNING" ; exit 1 ; }
+    ) || exit 1
+}
+
+function bench() {
+    optimized=$(timeout 30 ceph_test_c2c --threads $(($(nproc) * 2)) --sharding 2> /dev/null || true)
+    not_optimized=$(timeout 30 ceph_test_c2c --threads $(($(nproc) * 2)) 2> /dev/null || true)
+    if ! (( $optimized > ( $not_optimized * 2 ) )) ; then
+        echo "the optimization is expected to be at least x2 faster"
+        exit 1
+    fi
+}
+
+run with-sharding --sharding
+run without-sharding
+bench
diff --git a/qa/suites/rados/standalone/workloads/c2c.yaml b/qa/suites/rados/standalone/workloads/c2c.yaml
new file mode 100644
index 0000000000000..9a0dfce944d7d
--- /dev/null
+++ b/qa/suites/rados/standalone/workloads/c2c.yaml
@@ -0,0 +1,18 @@
+arch: x86_64
+roles:
+- - mon.a
+  - mgr.x
+  - osd.0
+  - client.0
+tasks:
+- install:
+    extra_system_packages:
+      rpm:
+        - perf
+      deb:
+        - linux-tools-generic
+- workunit:
+    basedir: qa/standalone
+    clients:
+      all:
+        - c2c
diff --git a/src/include/mempool.h b/src/include/mempool.h
index fe84f3b8f0970..08d8282fb2f0e 100644
--- a/src/include/mempool.h
+++ b/src/include/mempool.h
@@ -201,7 +201,16 @@ enum {
   num_shards = 1 << num_shard_bits
 };
 
-// align shard to a cacheline
+//
+// Align shard to a cacheline.
+//
+// It would be possible to retrieve the value at runtime (for instance
+// with getconf LEVEL1_DCACHE_LINESIZE or grep -m1 cache_alignment
+// /proc/cpuinfo). It is easier to hard code the largest cache
+// linesize for all known processors (128 bytes). If the actual cache
+// linesize is smaller on a given processor, it will just waste a few
+// bytes.
+//
 struct shard_t {
   ceph::atomic<size_t> bytes = {0};
   ceph::atomic<size_t> items = {0};
diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt
index be6673393b20d..1cc93df1dd551 100644
--- a/src/test/CMakeLists.txt
+++ b/src/test/CMakeLists.txt
@@ -478,6 +478,20 @@ install(TARGETS ceph_test_stress_watch
   DESTINATION ${CMAKE_INSTALL_BINDIR})
 
+add_executable(ceph_test_c2c
+  test_c2c.cc
+  )
+target_link_libraries(ceph_test_c2c
+  global
+  ceph-common
+  pthread
+  ${EXTRALIBS}
+  ${CMAKE_DL_LIBS}
+  )
+install(TARGETS
+  ceph_test_c2c
+  DESTINATION ${CMAKE_INSTALL_BINDIR})
+
 if(WITH_FUSE)
   add_executable(ceph_test_cfuse_cache_invalidate
     test_cfuse_cache_invalidate.cc
diff --git a/src/test/test_c2c.cc b/src/test/test_c2c.cc
new file mode 100644
index 0000000000000..07f6752d777bb
--- /dev/null
+++ b/src/test/test_c2c.cc
@@ -0,0 +1,89 @@
+#include "common/ceph_argparse.h"
+#include "common/debug.h"
+#include "common/config.h"
+#include "global/global_init.h"
+#include "global/signal_handler.h"
+
+#include "include/mempool.h"
+
+#include <iostream>
+#include <string>
+
+using std::cerr;
+using std::string;
+
+static void usage(void)
+{
+  cerr << "--threads    number of threads (default 1)" << std::endl;
+  cerr << "--sharding   activate sharding optimization" << std::endl;
+}
+
+
+mempool::shard_t shards[mempool::num_shards] = {0};
+
+void sigterm_handler(int signum)
+{
+  size_t total = 0;
+  for (auto& shard : shards) {
+    total += shard.bytes;
+  }
+  std::cout << total << std::endl;
+  exit(0);
+}
+
+int main(int argc, const char **argv)
+{
+  int ret = 0;
+  std::vector<const char*> args;
+  argv_to_vec(argc, argv, args);
+  auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
+                         CODE_ENVIRONMENT_UTILITY,
+                         CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
+  common_init_finish(g_ceph_context);
+
+  int threads = 1;
+  bool sharding = false;
+  for (std::vector<const char*>::iterator i = args.begin(); i != args.end(); ) {
+    if (ceph_argparse_double_dash(args, i)) {
+      break;
+    }
+    else if (ceph_argparse_witharg(args, i, &threads, cerr, "--threads", "-t", (char*)NULL)) {
+    }
+    else if (ceph_argparse_flag(args, i, "--sharding", "-s", (char*)NULL)) {
+      sharding = true;
+    }
+    else {
+      cerr << "unknown command line option: " << *i << std::endl;
+      cerr << std::endl;
+      usage();
+      return 2;
+    }
+  }
+
+  init_async_signal_handler();
+  register_async_signal_handler(SIGTERM, sigterm_handler);
+
+
+  std::vector<std::thread> workers;
+  for (int i = 0; i < threads; i++) {
+    workers.push_back(
+      std::thread([&](){
+        while(1) {
+          size_t i;
+          if (sharding) {
+            i = mempool::pool_t::pick_a_shard_int();
+          } else {
+            i = 0;
+          }
+          shards[i].bytes++;
+        }
+      }));
+  }
+
+  for (auto& t:workers) {
+    t.join();
+  }
+  workers.clear();
+
+  return ret;
+}
-- 
2.39.5
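
Note (not part of the patch): the standalone sketch below illustrates the cacheline-sharding pattern that ceph_test_c2c benchmarks. The shard count, the hard-coded 128-byte alignment constant, and the hash-based shard picker are simplifications assumed for this example only; the actual implementation is shard_t, num_shards and pool_t::pick_a_shard_int() in src/include/mempool.h.

// Assumed illustration, not Ceph code: per-thread, cacheline-aligned
// counter shards instead of a single shared atomic counter.
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <thread>
#include <vector>

static constexpr std::size_t num_shards = 32;   // illustrative shard count
static constexpr std::size_t cacheline  = 128;  // largest known linesize, as in mempool.h

// Each shard gets its own cacheline, so increments from different threads
// do not keep invalidating each other's cachelines.
struct alignas(cacheline) counter_shard_t {
  std::atomic<uint64_t> items{0};
};

static counter_shard_t shards[num_shards];

// Simplified stand-in for mempool::pool_t::pick_a_shard_int(): hash the
// thread id onto a shard so concurrent threads usually hit distinct shards.
static std::size_t pick_a_shard() {
  return std::hash<std::thread::id>{}(std::this_thread::get_id()) % num_shards;
}

int main() {
  std::vector<std::thread> workers;
  for (unsigned t = 0; t < std::thread::hardware_concurrency(); ++t) {
    workers.emplace_back([] {
      const std::size_t shard = pick_a_shard();  // use 0 here to mimic the unsharded case
      for (int n = 0; n < 10000000; ++n) {
        shards[shard].items++;                   // mostly uncontended increment
      }
    });
  }
  for (auto& w : workers) {
    w.join();
  }
  uint64_t total = 0;
  for (auto& s : shards) {
    total += s.items;                            // aggregate only when reporting
  }
  std::cout << total << std::endl;
  return 0;
}

Running this sketch once as-is and once with pick_a_shard() forced to return 0, under perf c2c record, should show the same kind of difference in load-HITM counts and in increments per second that the qa script and bench() above look for.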