From abaed06ffe4d3ea2f052192fad87242880188f3c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Oct 2019 10:04:19 -0500 Subject: [PATCH] osd: set affinity for *all* threads It turns out sched_setaffinity(2) only sets the current *thread*'s affinity, not the entire process. Look in /proc to identify all of our children and map those, too. Note that this doesn't seem to affect the numa_preferred_nid value in /proc/$pid/[task/$tid/]sched... at least not immediately. That value does seem to change for some threads after some load is applied. Fixes: https://tracker.ceph.com/issues/42054 Signed-off-by: Sage Weil --- src/common/numa.cc | 56 ++++++++++++++++++++++++++++++++++++++++++++++ src/common/numa.h | 3 +++ src/osd/OSD.cc | 2 +- 3 files changed, 60 insertions(+), 1 deletion(-) diff --git a/src/common/numa.cc b/src/common/numa.cc index c75f50088eb..dc80d0f33bd 100644 --- a/src/common/numa.cc +++ b/src/common/numa.cc @@ -134,6 +134,56 @@ int get_numa_node_cpu_set( return r; } +static int easy_readdir(const std::string& dir, std::set *out) +{ + DIR *h = ::opendir(dir.c_str()); + if (!h) { + return -errno; + } + struct dirent *de = nullptr; + while ((de = ::readdir(h))) { + if (strcmp(de->d_name, ".") == 0 || + strcmp(de->d_name, "..") == 0) { + continue; + } + out->insert(de->d_name); + } + closedir(h); + return 0; +} + +int set_cpu_affinity_all_threads(size_t cpu_set_size, cpu_set_t *cpu_set) +{ + // first set my affinity + int r = sched_setaffinity(getpid(), cpu_set_size, cpu_set); + if (r < 0) { + return -errno; + } + + // make 2 passes here so that we (hopefully) catch racing threads creating + // threads. 
+ for (unsigned pass = 0; pass < 2; ++pass) { + // enumerate all child threads from /proc + std::set ls; + std::string path = "/proc/"s + stringify(getpid()) + "/task"; + r = easy_readdir(path, &ls); + if (r < 0) { + return r; + } + for (auto& i : ls) { + pid_t tid = atoll(i.c_str()); + if (!tid) { + continue; // wtf + } + r = sched_setaffinity(tid, cpu_set_size, cpu_set); + if (r < 0) { + return -errno; + } + } + } + return 0; +} + #elif defined(__FreeBSD__) int parse_cpu_set_list(const char *s, @@ -162,4 +212,10 @@ int get_numa_node_cpu_set(int node, return -ENOTSUP; } +int set_cpu_affinity_all_threads(size_t cpu_set_size, + cpu_set_t *cpu_set) +{ + return -ENOTSUP; +} + #endif diff --git a/src/common/numa.h b/src/common/numa.h index 4acb8280c8f..78851deef51 100644 --- a/src/common/numa.h +++ b/src/common/numa.h @@ -19,3 +19,6 @@ std::set cpu_set_to_set(size_t cpu_set_size, int get_numa_node_cpu_set(int node, size_t *cpu_set_size, cpu_set_t *cpu_set); + +int set_cpu_affinity_all_threads(size_t cpu_set_size, + cpu_set_t *cpu_set); diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index fc7f5654e52..d461f9f24ee 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -2313,7 +2313,7 @@ int OSD::set_numa_affinity() << " cpus " << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set) << dendl; - r = sched_setaffinity(getpid(), numa_cpu_set_size, &numa_cpu_set); + r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set); if (r < 0) { r = -errno; derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r) -- 2.39.5