From 25f28e610fe674f24eaa0a66c9ccc91b03fdcc92 Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Tue, 6 Aug 2019 11:12:43 -0500
Subject: [PATCH] common/config: respect POD_MEMORY_REQUEST *and*
 POD_MEMORY_LIMIT env vars

If a kubernetes pod spec specifies a limit of X, then the pod gets both
the limits.memory and requests.memory resource fields set, and rook
passes those as the POD_MEMORY_LIMIT and POD_MEMORY_REQUEST environment
variables.  This is a problem if only the limit is set, because we will
end up setting our osd_memory_target (and, in the future, other
*_memory_targets) to the hard limit, and the daemon will inevitably
reach that threshold and get killed.

Fix this by also looking at the POD_MEMORY_LIMIT value, applying the
ratio (default: .8) to it, and setting our actual target to the min of
that and POD_MEMORY_REQUEST.  Also, set the "default" target to
ratio*limit, so that it will apply in general when no request is
specified.  (A standalone sketch of this selection logic follows the
patch below.)

When both request and limit are 1T, we then see

    "osd_memory_target": {
        "default": "800000000000",
        "env": "800000000000",
        "final": "800000000000"
    },

In a more "normal" situation where limit is 1T and request is 500G, we
get

    "osd_memory_target": {
        "default": "800000000000",
        "env": "500000000000",
        "final": "500000000000"
    },

If only limit is specified (to 1T), we get

    "osd_memory_target": {
        "default": "800000000000",
        "final": "800000000000"
    },

Fixes: https://tracker.ceph.com/issues/41037
Signed-off-by: Sage Weil
---
 src/common/config.cc | 63 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 53 insertions(+), 10 deletions(-)

diff --git a/src/common/config.cc b/src/common/config.cc
index 82d6a37efb20..d5c9e1c6f87f 100644
--- a/src/common/config.cc
+++ b/src/common/config.cc
@@ -485,6 +485,8 @@ void md_config_t::parse_env(unsigned entity_type,
   // CRD) and is the target memory utilization we try to maintain for daemons
   // that respect it.
   //
+  // If POD_MEMORY_REQUEST is present, we use it as the target.
+  //
   // - Limits: At runtime, the container runtime (and Linux) will use the
   //   limits to see if the pod is using too many resources. In that case, the
   //   pod will be killed/restarted automatically if the pod goes over the limit.
@@ -492,30 +494,71 @@ void md_config_t::parse_env(unsigned entity_type,
   //   much higher). This corresponds to the cgroup memory limit that will
   //   trigger the Linux OOM killer.
   //
-  // Here are the documented best practices: https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/#motivation-for-cpu-requests-and-limits
+  // If POD_MEMORY_LIMIT is present, we use it as the /default/ value for
+  // the target, which means it will only apply if the *_memory_target option
+  // isn't set via some other path (e.g., POD_MEMORY_REQUEST, or the cluster
+  // config, or whatever.)
+  //
+  // Here are the documented best practices:
+  // https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/#motivation-for-cpu-requests-and-limits
   //
   // When the operator creates the CephCluster CR, it will need to generate the
   // desired requests and limits. As long as we are conservative in our choice
   // for requests and generous with the limits we should be in a good place to
   // get started.
   //
-  // The support in Rook is already there for applying the limits as seen in these links.
+  // The support in Rook is already there for applying the limits as seen in
+  // these links.
   //
-  // Rook docs on the resource requests and limits: https://rook.io/docs/rook/v1.0/ceph-cluster-crd.html#cluster-wide-resources-configuration-settings
-  // Example CR settings: https://github.com/rook/rook/blob/6d2ef936698593036185aabcb00d1d74f9c7bfc1/cluster/examples/kubernetes/ceph/cluster.yaml#L90
-  if (auto pod_req = getenv("POD_MEMORY_REQUEST"); pod_req) {
+  // Rook docs on the resource requests and limits:
+  // https://rook.io/docs/rook/v1.0/ceph-cluster-crd.html#cluster-wide-resources-configuration-settings
+  // Example CR settings:
+  // https://github.com/rook/rook/blob/6d2ef936698593036185aabcb00d1d74f9c7bfc1/cluster/examples/kubernetes/ceph/cluster.yaml#L90
+  //
+  uint64_t pod_limit = 0, pod_request = 0;
+  if (auto pod_lim = getenv("POD_MEMORY_LIMIT"); pod_lim) {
     string err;
-    uint64_t v = atoll(pod_req);
+    uint64_t v = atoll(pod_lim);
     if (v) {
       switch (entity_type) {
       case CEPH_ENTITY_TYPE_OSD:
-        _set_val(values, tracker, stringify(v),
-                 *find_option("osd_memory_target"),
-                 CONF_ENV, &err);
-        break;
+        {
+          double cgroup_ratio = get_val<double>(
+            values, "osd_memory_target_cgroup_limit_ratio");
+          if (cgroup_ratio > 0.0) {
+            pod_limit = v * cgroup_ratio;
+            // set osd_memory_target *default* based on cgroup limit, so that
+            // it can be overridden by any explicit settings elsewhere.
+            set_val_default(values, tracker,
+                            "osd_memory_target", stringify(pod_limit));
+          }
+        }
       }
     }
   }
+  if (auto pod_req = getenv("POD_MEMORY_REQUEST"); pod_req) {
+    if (uint64_t v = atoll(pod_req); v) {
+      pod_request = v;
+    }
+  }
+  if (pod_request && pod_limit) {
+    // If both LIMIT and REQUEST are set, ensure that we use the
+    // min of request and limit*ratio.  This is important
+    // because k8s sets LIMIT == REQUEST if only LIMIT is
+    // specified, and we want to apply the ratio in that case,
+    // even though REQUEST is present.
+    pod_request = std::min(pod_request, pod_limit);
+  }
+  if (pod_request) {
+    string err;
+    switch (entity_type) {
+    case CEPH_ENTITY_TYPE_OSD:
+      _set_val(values, tracker, stringify(pod_request),
+               *find_option("osd_memory_target"),
+               CONF_ENV, &err);
+      break;
+    }
+  }
 
   if (getenv(args_var)) {
     vector<const char*> env_args;
-- 
2.47.3
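Below is a minimal, standalone sketch (not part of the patch) of the
target-selection logic described above. It assumes the default
osd_memory_target_cgroup_limit_ratio of 0.8 and uses plain C++ with
illustrative names; it does not go through Ceph's config machinery
(set_val_default/_set_val), so it only mirrors the arithmetic, not the
precedence handling of the config subsystem.

  #include <algorithm>
  #include <cstdint>
  #include <cstdio>
  #include <cstdlib>

  int main() {
    const double ratio = 0.8;  // osd_memory_target_cgroup_limit_ratio default
    uint64_t pod_limit = 0, pod_request = 0;

    // POD_MEMORY_LIMIT scaled by the ratio becomes the *default* target.
    if (const char *lim = std::getenv("POD_MEMORY_LIMIT"); lim) {
      if (uint64_t v = std::strtoull(lim, nullptr, 10); v)
        pod_limit = static_cast<uint64_t>(v * ratio);
    }
    // POD_MEMORY_REQUEST, if present, is the explicitly requested target.
    if (const char *req = std::getenv("POD_MEMORY_REQUEST"); req)
      pod_request = std::strtoull(req, nullptr, 10);
    // If both are present, take the min: k8s sets REQUEST == LIMIT when only
    // a limit is given, and we still want the ratio applied in that case.
    if (pod_request && pod_limit)
      pod_request = std::min(pod_request, pod_limit);

    if (pod_limit)
      std::printf("default osd_memory_target: %llu\n",
                  static_cast<unsigned long long>(pod_limit));
    if (pod_request)
      std::printf("env osd_memory_target:     %llu\n",
                  static_cast<unsigned long long>(pod_request));
    return 0;
  }

With POD_MEMORY_LIMIT=1000000000000 and POD_MEMORY_REQUEST=500000000000 this
prints a default of 800000000000 and an env value of 500000000000, matching
the second example in the commit message.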