#include "common/Timer.h"
#include "common/TracepointProvider.h"
#include "common/ceph_argparse.h"
+#include "common/numa.h"
#include "global/global_init.h"
#include "global/signal_handler.h"
forker.exit(0);
}
+ // consider objectstore numa node
+ int os_numa_node = -1;
+ r = store->get_numa_node(&os_numa_node, nullptr, nullptr);
+ if (r >= 0 && os_numa_node >= 0) {
+ dout(1) << " objectstore numa_node " << os_numa_node << dendl;
+ }
+ int iface_preferred_numa_node = -1;
+ if (g_conf().get_val<bool>("osd_numa_prefer_iface")) {
+ iface_preferred_numa_node = os_numa_node;
+ }
+
+ // messengers
std::string msg_type = g_conf().get_val<std::string>("ms_type");
std::string public_msg_type =
g_conf().get_val<std::string>("ms_public_type");
ms_objecter->set_default_policy(Messenger::Policy::lossy_client(CEPH_FEATURE_OSDREPLYMUX));
entity_addrvec_t public_addrs, cluster_addrs;
- r = pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC, &public_addrs);
+ r = pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC, &public_addrs,
+ iface_preferred_numa_node);
if (r < 0) {
derr << "Failed to pick public address." << dendl;
forker.exit(1);
}
- r = pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_CLUSTER, &cluster_addrs);
+ r = pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_CLUSTER, &cluster_addrs,
+ iface_preferred_numa_node);
if (r < 0) {
derr << "Failed to pick cluster address." << dendl;
forker.exit(1);
.set_description("Number of striping periods to zero head of MDS journal write position"),
// -- OSD --
+ Option("osd_numa_prefer_iface", Option::TYPE_BOOL, Option::LEVEL_ADVANCED) // read before pick_addresses() is called for the public and cluster networks
+ .set_default(true) // when true, the objectstore's numa node is passed to pick_addresses() as the preferred node
+ .set_flag(Option::FLAG_STARTUP)
+ .set_description("prefer IP on network interface on same numa node as storage")
+ .add_see_also("osd_numa_auto_affinity"),
+
+ Option("osd_numa_auto_affinity", Option::TYPE_BOOL, Option::LEVEL_ADVANCED) // consulted by OSD::set_numa_affinity()
+ .set_default(true) // when true, a node shared by the objectstore and both network interfaces becomes the OSD's numa node
+ .set_flag(Option::FLAG_STARTUP)
+ .set_description("automatically set affinity to numa node when storage and network match"),
+
+ Option("osd_numa_node", Option::TYPE_INT, Option::LEVEL_ADVANCED) // read by OSD::set_numa_affinity() via get_val<int64_t>()
+ .set_default(-1) // a value >= 0 takes precedence over the automatically detected node
+ .set_flag(Option::FLAG_STARTUP)
+ .set_description("set affinity to a numa node (-1 for none)")
+ .add_see_also("osd_numa_auto_affinity"),
+
Option("osd_smart_report_timeout", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(5)  // seconds, per the description below
.set_description("Timeout (in seconds) for smartctl to run, default is set to 5"),
return 0;
}
+// Choose a numa node for this OSD and, if one is chosen, bind the process
+// to that node's CPUs.
+//
+// Selection:
+//  - if the objectstore, the public (front) interface and the cluster
+//    (back) interface all report the same numa node, and
+//    osd_numa_auto_affinity is true, that node is used;
+//  - osd_numa_node, when >= 0, overrides the automatic choice.
+//
+// On any failure (CPU set lookup or sched_setaffinity) numa_node is reset
+// to -1 and the problem is logged.
+//
+// Returns 0 in all cases; failing to bind is not treated as an error.
+int OSD::set_numa_affinity()
+{
+  // storage numa node
+  int store_node = -1;
+  int r = store->get_numa_node(&store_node, nullptr, nullptr);
+  if (r < 0) {
+    // do not trust whatever the store may have written on failure
+    store_node = -1;
+  } else if (store_node >= 0) {
+    dout(1) << __func__ << " storage numa node " << store_node << dendl;
+  }
+
+  // check network numa node(s)
+  int front_node = -1, back_node = -1;
+  string front_iface = pick_iface(
+    cct,
+    client_messenger->get_myaddrs().front().get_sockaddr_storage());
+  string back_iface = pick_iface(
+    cct,
+    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
+  r = get_iface_numa_node(front_iface, &front_node);
+  if (r >= 0) {
+    dout(1) << __func__ << " public network " << front_iface << " numa node "
+            << front_node << dendl;
+    r = get_iface_numa_node(back_iface, &back_node);
+    if (r >= 0) {
+      dout(1) << __func__ << " cluster network " << back_iface << " numa node "
+              << back_node << dendl;
+      if (front_node == back_node &&
+          front_node == store_node) {
+        dout(1) << " objectstore and network numa nodes all match" << dendl;
+        if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
+          numa_node = front_node;
+        }
+      } else {
+        derr << __func__ << " objectstore and network numa nodes do not match"
+             << dendl;
+      }
+    } else {
+      // report the cluster side the same way as the public side below,
+      // so a missing back interface is not silently ignored
+      derr << __func__ << " unable to identify cluster interface '"
+           << back_iface << "' numa node: " << cpp_strerror(r) << dendl;
+    }
+  } else {
+    derr << __func__ << " unable to identify public interface '" << front_iface
+         << "' numa node: " << cpp_strerror(r) << dendl;
+  }
+  if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
+    // this takes precedence over the automagic logic above
+    numa_node = node;
+  }
+  if (numa_node >= 0) {
+    r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
+    if (r < 0) {
+      dout(1) << __func__ << " unable to determine numa node " << numa_node
+              << " CPUs" << dendl;
+      numa_node = -1;
+    } else {
+      dout(1) << __func__ << " setting numa affinity to node " << numa_node
+              << " cpus "
+              << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
+              << dendl;
+      // NOTE(review): per sched_setaffinity(2), passing getpid() targets the
+      // main thread of the thread group; threads that already exist keep
+      // their current affinity -- confirm that is sufficient at this point.
+      r = sched_setaffinity(getpid(), numa_cpu_set_size, &numa_cpu_set);
+      if (r < 0) {
+        r = -errno;
+        derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
+             << dendl;
+        numa_node = -1;
+      }
+    }
+  } else {
+    dout(1) << __func__ << " not setting numa affinity" << dendl;
+  }
+  return 0;
+}
+
// asok
class OSDSocketHook : public AdminSocketHook {
hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
}
+ // we now know what our front and back addrs will be, and we are
+ // about to tell the mon what our metadata (including numa bindings)
+ // are, so now is a good time!
+ set_numa_affinity();
+
MOSDBoot *mboot = new MOSDBoot(
superblock, get_osdmap_epoch(), service.get_boot_epoch(),
hb_back_addrs, hb_front_addrs, cluster_addrs,
{
int node = -1;
set<int> nodes;
- string cpu_list;
set<string> unknown;
for (auto nm : { "front_iface", "back_iface" }) {
if (!(*pm)[nm].size()) {
unknown.insert(nm);
continue;
}
- cpu_set_t cpu_set;
- size_t cpu_set_size;
int n = -1;
- int r = get_iface_numa_node((*pm)[nm], &n, &cpu_set_size, &cpu_set);
+ int r = get_iface_numa_node((*pm)[nm], &n);
if (r < 0) {
unknown.insert((*pm)[nm]);
continue;
nodes.insert(n);
if (node < 0) {
node = n;
- cpu_list = cpu_set_to_str_list(cpu_set_size, &cpu_set);
}
}
if (unknown.size()) {
}
if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
(*pm)["network_numa_node"] = stringify(node);
- (*pm)["network_numa_node_cpus"] = cpu_list;
}
}
-
+
+ if (numa_node >= 0) {
+ (*pm)["numa_node"] = stringify(numa_node);
+ (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
+ &numa_cpu_set);
+ }
+
set<string> devnames;
store->get_devices(&devnames);
(*pm)["devices"] = stringify(devnames);