]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
mon/OSDMonitor: test crush with fork but not crushtool spawn
authorSage Weil <sage@redhat.com>
Thu, 29 Jun 2017 19:14:40 +0000 (15:14 -0400)
committerSage Weil <sage@redhat.com>
Fri, 7 Jul 2017 15:11:24 +0000 (11:11 -0400)
We see timeouts here, but I very much suspect they are due to the overhead
of launching the crushtool process and not the test itself.  We have
perfectly code already in our process, though; we just want to isolate
failure and time out reliably.  So, fork and timeout, without executing
a new binary.

Hopefully-fixes: http://tracker.ceph.com/issues/19964
Signed-off-by: Sage Weil <sage@redhat.com>
src/crush/CrushTester.cc
src/crush/CrushTester.h
src/mon/OSDMonitor.cc

index ef83e7311d7bfe99f9bb0271e7e834e5d031f506..c24448a1d359f78bb80cc18440c85be18cbba012 100644 (file)
@@ -19,6 +19,7 @@
 #include <boost/icl/interval_map.hpp>
 #include <boost/algorithm/string/join.hpp>
 #include "common/SubProcess.h"
+#include "common/fork_function.h"
 
 void CrushTester::set_device_weight(int dev, float f)
 {
@@ -359,6 +360,18 @@ void CrushTester::write_integer_indexed_scalar_data_string(vector<string> &dst,
   dst.push_back( data_buffer.str() );
 }
 
+int CrushTester::test_with_fork(int timeout)
+{
+  ostringstream sink;
+  int r = fork_function(timeout, sink, [&]() {
+      return test();
+    });
+  if (r == -ETIMEDOUT) {
+    err << "timed out during smoke test (" << timeout << " seconds)";
+  }
+  return r;
+}
+
 int CrushTester::test_with_crushtool(const char *crushtool_cmd,
                                     int max_id, int timeout,
                                     int ruleset)
index b3c3b70c35f0c6991dd7ab0a2b65326f1174c267..0982f342a3a24085830a31590eff2c383ce5997f 100644 (file)
@@ -358,6 +358,7 @@ public:
    */
   void check_overlapped_rules() const;
   int test();
+  int test_with_fork(int timeout);
   int test_with_crushtool(const char *crushtool_cmd = "crushtool",
                          int max_id = -1,
                          int timeout = 0,
index 405c05a8ee06a5bbe8b6eb8d66c2c2d42921ea09..de0cb2f7d76749d34c4287b493bccd42cadd625b 100644 (file)
@@ -5624,19 +5624,13 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
   _get_pending_crush(newcrush);
   ostringstream err;
   CrushTester tester(newcrush, err);
-  // use the internal crush tester if crushtool config is empty
-  if (g_conf->crushtool.empty()) {
-    r = tester.test();
-  } else {
-    r = tester.test_with_crushtool(g_conf->crushtool.c_str(),
-                                  osdmap.get_max_osd(),
-                                  g_conf->mon_lease,
-                                  crush_rule);
-  }
-  if (r) {
-    dout(10) << " tester.test_with_crushtool returns " << r
+  tester.set_max_x(50);
+  tester.set_rule(crush_rule);
+  r = tester.test_with_fork(g_conf->mon_lease);
+  if (r < 0) {
+    dout(10) << " tester.test_with_fork returns " << r
             << ": " << err.str() << dendl;
-    *ss << "crushtool check failed with " << r << ": " << err.str();
+    *ss << "crush test failed with " << r << ": " << err.str();
     return r;
   }
   unsigned size, min_size;
@@ -7039,16 +7033,12 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
     dout(10) << " testing map" << dendl;
     stringstream ess;
     CrushTester tester(crush, ess);
-    // XXX: Use mon_lease as a timeout value for crushtool.
-    // If the crushtool consistently takes longer than 'mon_lease' seconds,
-    // then we would consistently trigger an election before the command
-    // finishes, having a flapping monitor unable to hold quorum.
-    int r = tester.test_with_crushtool(g_conf->crushtool.c_str(),
-                                      osdmap.get_max_osd(),
-                                      g_conf->mon_lease);
+    tester.set_max_x(50);
+    int r = tester.test_with_fork(g_conf->mon_lease);
     if (r < 0) {
-      derr << "error on crush map: " << ess.str() << dendl;
-      ss << "Failed crushmap test: " << ess.str();
+      dout(10) << " tester.test_with_fork returns " << r
+              << ": " << ess.str() << dendl;
+      ss << "crush smoke test failed with " << r << ": " << ess.str();
       err = r;
       goto reply;
     }