From 25f28e610fe674f24eaa0a66c9ccc91b03fdcc92 Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Tue, 6 Aug 2019 11:12:43 -0500
Subject: [PATCH] common/config: respect POD_MEMORY_REQUEST *and*
 POD_MEMORY_LIMIT env vars

If a kubernetes pod spec specifies a limit of X, then the pod gets both
the limits.memory and requests.memory resource fields set, and rook
passes those as the POD_MEMORY_LIMIT and POD_MEMORY_REQUEST environment
variables.  This is a problem if only the limit is set, because we will
end up setting our osd_memory_target (and, in the future, other
*_memory_targets) to the hard limit, and the daemon will inevitably
reach that threshold and get killed.

Fix this by also looking at the POD_MEMORY_LIMIT value, applying the
ratio (default: .8) to it, and setting our actual target to the min of
that and POD_MEMORY_REQUEST.  Also, set the "default" target to
ratio*limit, so that it will apply in general when no request is
specified.  (A standalone sketch of this selection logic follows the
patch below.)

When both request and limit are 1T, we then see

    "osd_memory_target": {
        "default": "800000000000",
        "env": "800000000000",
        "final": "800000000000"
    },

In a more "normal" situation where limit is 1T and request is 500G, we
get

    "osd_memory_target": {
        "default": "800000000000",
        "env": "500000000000",
        "final": "500000000000"
    },

If only limit is specified (to 1T), we get

    "osd_memory_target": {
        "default": "800000000000",
        "final": "800000000000"
    },

Fixes: https://tracker.ceph.com/issues/41037
Signed-off-by: Sage Weil
---
 src/common/config.cc | 63 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 53 insertions(+), 10 deletions(-)

diff --git a/src/common/config.cc b/src/common/config.cc
index 82d6a37efb20..d5c9e1c6f87f 100644
--- a/src/common/config.cc
+++ b/src/common/config.cc
@@ -485,6 +485,8 @@ void md_config_t::parse_env(unsigned entity_type,
   // CRD) and is the target memory utilization we try to maintain for daemons
   // that respect it.
   //
+  // If POD_MEMORY_REQUEST is present, we use it as the target.
+  //
   // - Limits: At runtime, the container runtime (and Linux) will use the
   //   limits to see if the pod is using too many resources. In that case, the
   //   pod will be killed/restarted automatically if the pod goes over the limit.
@@ -492,30 +494,71 @@ void md_config_t::parse_env(unsigned entity_type,
   //   much higher). This corresponds to the cgroup memory limit that will
   //   trigger the Linux OOM killer.
   //
-  // Here are the documented best practices: https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/#motivation-for-cpu-requests-and-limits
+  // If POD_MEMORY_LIMIT is present, we use it as the /default/ value for
+  // the target, which means it will only apply if the *_memory_target option
+  // isn't set via some other path (e.g., POD_MEMORY_REQUEST, or the cluster
+  // config, or whatever.)
+  //
+  // Here are the documented best practices:
+  // https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/#motivation-for-cpu-requests-and-limits
   //
   // When the operator creates the CephCluster CR, it will need to generate the
   // desired requests and limits. As long as we are conservative in our choice
   // for requests and generous with the limits we should be in a good place to
   // get started.
   //
-  // The support in Rook is already there for applying the limits as seen in these links.
+  // The support in Rook is already there for applying the limits as seen in
+  // these links.
   //
-  // Rook docs on the resource requests and limits: https://rook.io/docs/rook/v1.0/ceph-cluster-crd.html#cluster-wide-resources-configuration-settings
-  // Example CR settings: https://github.com/rook/rook/blob/6d2ef936698593036185aabcb00d1d74f9c7bfc1/cluster/examples/kubernetes/ceph/cluster.yaml#L90
-  if (auto pod_req = getenv("POD_MEMORY_REQUEST"); pod_req) {
+  // Rook docs on the resource requests and limits:
+  // https://rook.io/docs/rook/v1.0/ceph-cluster-crd.html#cluster-wide-resources-configuration-settings
+  // Example CR settings:
+  // https://github.com/rook/rook/blob/6d2ef936698593036185aabcb00d1d74f9c7bfc1/cluster/examples/kubernetes/ceph/cluster.yaml#L90
+  //
+  uint64_t pod_limit = 0, pod_request = 0;
+  if (auto pod_lim = getenv("POD_MEMORY_LIMIT"); pod_lim) {
     string err;
-    uint64_t v = atoll(pod_req);
+    uint64_t v = atoll(pod_lim);
     if (v) {
       switch (entity_type) {
       case CEPH_ENTITY_TYPE_OSD:
-        _set_val(values, tracker, stringify(v),
-                 *find_option("osd_memory_target"),
-                 CONF_ENV, &err);
-        break;
+        {
+          double cgroup_ratio = get_val<double>(
+            values, "osd_memory_target_cgroup_limit_ratio");
+          if (cgroup_ratio > 0.0) {
+            pod_limit = v * cgroup_ratio;
+            // set osd_memory_target *default* based on cgroup limit, so that
+            // it can be overridden by any explicit settings elsewhere.
+            set_val_default(values, tracker,
+                            "osd_memory_target", stringify(pod_limit));
+          }
+        }
       }
     }
   }
+  if (auto pod_req = getenv("POD_MEMORY_REQUEST"); pod_req) {
+    if (uint64_t v = atoll(pod_req); v) {
+      pod_request = v;
+    }
+  }
+  if (pod_request && pod_limit) {
+    // If both LIMIT and REQUEST are set, ensure that we use the
+    // min of request and limit*ratio.  This is important
+    // because k8s sets LIMIT == REQUEST if only LIMIT is
+    // specified, and we want to apply the ratio in that case,
+    // even though REQUEST is present.
+    pod_request = std::min(pod_request, pod_limit);
+  }
+  if (pod_request) {
+    string err;
+    switch (entity_type) {
+    case CEPH_ENTITY_TYPE_OSD:
+      _set_val(values, tracker, stringify(pod_request),
+               *find_option("osd_memory_target"),
+               CONF_ENV, &err);
+      break;
+    }
+  }
 
   if (getenv(args_var)) {
     vector<const char*> env_args;
-- 
2.47.3
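Below is a minimal, standalone sketch (not part of the patch) of the
target-selection logic described above. It assumes the default
osd_memory_target_cgroup_limit_ratio of 0.8 and uses plain C++ with
illustrative names; it does not go through Ceph's config machinery
(set_val_default/_set_val), so it only mirrors the arithmetic, not the
precedence handling of the config subsystem.

  #include <algorithm>
  #include <cstdint>
  #include <cstdio>
  #include <cstdlib>

  int main() {
    const double ratio = 0.8;  // osd_memory_target_cgroup_limit_ratio default
    uint64_t pod_limit = 0, pod_request = 0;

    // POD_MEMORY_LIMIT scaled by the ratio becomes the *default* target.
    if (const char *lim = std::getenv("POD_MEMORY_LIMIT"); lim) {
      if (uint64_t v = std::strtoull(lim, nullptr, 10); v)
        pod_limit = static_cast<uint64_t>(v * ratio);
    }
    // POD_MEMORY_REQUEST, if present, is the explicitly requested target.
    if (const char *req = std::getenv("POD_MEMORY_REQUEST"); req)
      pod_request = std::strtoull(req, nullptr, 10);
    // If both are present, take the min: k8s sets REQUEST == LIMIT when only
    // a limit is given, and we still want the ratio applied in that case.
    if (pod_request && pod_limit)
      pod_request = std::min(pod_request, pod_limit);

    if (pod_limit)
      std::printf("default osd_memory_target: %llu\n",
                  static_cast<unsigned long long>(pod_limit));
    if (pod_request)
      std::printf("env osd_memory_target:     %llu\n",
                  static_cast<unsigned long long>(pod_request));
    return 0;
  }

With POD_MEMORY_LIMIT=1000000000000 and POD_MEMORY_REQUEST=500000000000 this
prints a default of 800000000000 and an env value of 500000000000, matching
the second example in the commit message.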