From: Sage Weil Date: Thu, 29 Jun 2017 19:14:40 +0000 (-0400) Subject: mon/OSDMonitor: test crush with fork but not crushtool spawn X-Git-Tag: v12.1.1~110^2~2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=e189f11fcde6829cc7f86894b913bc1a3f81ecfe;p=ceph.git mon/OSDMonitor: test crush with fork but not crushtool spawn We see timeouts here, but I very much suspect they are due to the overhead of launching the crushtool process and not the test itself. We have perfectly code already in our process, though; we just want to isolate failure and time out reliably. So, fork and timeout, without executing a new binary. Hopefully-fixes: http://tracker.ceph.com/issues/19964 Signed-off-by: Sage Weil --- diff --git a/src/crush/CrushTester.cc b/src/crush/CrushTester.cc index ef83e7311d7b..c24448a1d359 100644 --- a/src/crush/CrushTester.cc +++ b/src/crush/CrushTester.cc @@ -19,6 +19,7 @@ #include #include #include "common/SubProcess.h" +#include "common/fork_function.h" void CrushTester::set_device_weight(int dev, float f) { @@ -359,6 +360,18 @@ void CrushTester::write_integer_indexed_scalar_data_string(vector &dst, dst.push_back( data_buffer.str() ); } +int CrushTester::test_with_fork(int timeout) +{ + ostringstream sink; + int r = fork_function(timeout, sink, [&]() { + return test(); + }); + if (r == -ETIMEDOUT) { + err << "timed out during smoke test (" << timeout << " seconds)"; + } + return r; +} + int CrushTester::test_with_crushtool(const char *crushtool_cmd, int max_id, int timeout, int ruleset) diff --git a/src/crush/CrushTester.h b/src/crush/CrushTester.h index b3c3b70c35f0..0982f342a3a2 100644 --- a/src/crush/CrushTester.h +++ b/src/crush/CrushTester.h @@ -358,6 +358,7 @@ public: */ void check_overlapped_rules() const; int test(); + int test_with_fork(int timeout); int test_with_crushtool(const char *crushtool_cmd = "crushtool", int max_id = -1, int timeout = 0, diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 405c05a8ee06..de0cb2f7d767 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -5624,19 +5624,13 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid, _get_pending_crush(newcrush); ostringstream err; CrushTester tester(newcrush, err); - // use the internal crush tester if crushtool config is empty - if (g_conf->crushtool.empty()) { - r = tester.test(); - } else { - r = tester.test_with_crushtool(g_conf->crushtool.c_str(), - osdmap.get_max_osd(), - g_conf->mon_lease, - crush_rule); - } - if (r) { - dout(10) << " tester.test_with_crushtool returns " << r + tester.set_max_x(50); + tester.set_rule(crush_rule); + r = tester.test_with_fork(g_conf->mon_lease); + if (r < 0) { + dout(10) << " tester.test_with_fork returns " << r << ": " << err.str() << dendl; - *ss << "crushtool check failed with " << r << ": " << err.str(); + *ss << "crush test failed with " << r << ": " << err.str(); return r; } unsigned size, min_size; @@ -7039,16 +7033,12 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op, dout(10) << " testing map" << dendl; stringstream ess; CrushTester tester(crush, ess); - // XXX: Use mon_lease as a timeout value for crushtool. - // If the crushtool consistently takes longer than 'mon_lease' seconds, - // then we would consistently trigger an election before the command - // finishes, having a flapping monitor unable to hold quorum. - int r = tester.test_with_crushtool(g_conf->crushtool.c_str(), - osdmap.get_max_osd(), - g_conf->mon_lease); + tester.set_max_x(50); + int r = tester.test_with_fork(g_conf->mon_lease); if (r < 0) { - derr << "error on crush map: " << ess.str() << dendl; - ss << "Failed crushmap test: " << ess.str(); + dout(10) << " tester.test_with_fork returns " << r + << ": " << ess.str() << dendl; + ss << "crush smoke test failed with " << r << ": " << ess.str(); err = r; goto reply; }