From 1d9101cf31064774a7cbe740ce27f1dfee4e3cd7 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Mon, 15 Sep 2014 12:50:55 +0100
Subject: [PATCH] tasks: fix race in test_stale_caps

Signed-off-by: John Spray
---
 tasks/cephfs/mount.py        |  22 +++++++
 tasks/mds_client_recovery.py | 119 ++++++++++++++++++++---------------
 2 files changed, 90 insertions(+), 51 deletions(-)

diff --git a/tasks/cephfs/mount.py b/tasks/cephfs/mount.py
index 42b943db37627..1d34079ed9bf3 100644
--- a/tasks/cephfs/mount.py
+++ b/tasks/cephfs/mount.py
@@ -1,6 +1,7 @@
 from contextlib import contextmanager
 import logging
 import datetime
+import time
 from textwrap import dedent
 import os
 from teuthology.orchestra import run
@@ -101,6 +102,10 @@ class CephFSMount(object):
             'sudo', 'daemon-helper', 'kill', 'python', '-c', pyscript
         ], wait=False, stdin=run.PIPE)
 
+    def run_shell(self, args):
+        args = ["cd", self.mountpoint, run.Raw('&&')] + args
+        return self.client_remote.run(args=args)
+
     def open_background(self, basename="background_file"):
         """
         Open a file for writing, then block such that the client
@@ -125,6 +130,23 @@ class CephFSMount(object):
         self.background_procs.append(rproc)
         return rproc
 
+    def wait_for_visible(self, basename="background_file", timeout=30):
+        i = 0
+        while i < timeout:
+            r = self.client_remote.run(args=[
+                'sudo', 'ls', os.path.join(self.mountpoint, basename)
+            ], check_status=False)
+            if r.exitstatus == 0:
+                log.debug("File {0} became visible from {1} after {2}s".format(
+                    basename, self.client_id, i))
+                return
+            else:
+                time.sleep(1)
+                i += 1
+
+        raise RuntimeError("Timed out after {0}s waiting for {1} to become visible from {2}".format(
+            i, basename, self.client_id))
+
     def write_background(self, basename="background_file"):
         """
         Open a file for writing, complete as soon as you can
diff --git a/tasks/mds_client_recovery.py b/tasks/mds_client_recovery.py
index 560a2f846a78d..12de23113a64b 100644
--- a/tasks/mds_client_recovery.py
+++ b/tasks/mds_client_recovery.py
@@ -40,6 +40,8 @@ class TestClientRecovery(unittest.TestCase):
         self.mount_a.wait_until_mounted()
         self.mount_b.wait_until_mounted()
 
+        self.mount_a.run_shell(["rm", "-rf", "*"])
+
     def tearDown(self):
         self.fs.clear_firewall()
         self.mount_a.teardown()
@@ -180,34 +182,42 @@ class TestClientRecovery(unittest.TestCase):
         # Capability release from stale session
         # =====================================
         cap_holder = self.mount_a.open_background()
-        self.mount_a.kill()
-
-        # Now, after mds_session_timeout seconds, the waiter should
-        # complete their operation when the MDS marks the holder's
-        # session stale.
-        cap_waiter = self.mount_b.write_background()
-        a = time.time()
-        cap_waiter.wait()
-        b = time.time()
 
-        # Should have succeeded
-        self.assertEqual(cap_waiter.exitstatus, 0)
+        # Wait for the file to be visible from another client, indicating
+        # that mount_a has completed its network ops
+        self.mount_b.wait_for_visible()
 
-        cap_waited = b - a
-        log.info("cap_waiter waited {0}s".format(cap_waited))
-        self.assertTrue(self.mds_session_timeout / 2.0 <= cap_waited <= self.mds_session_timeout * 2.0,
-                        "Capability handover took {0}, expected approx {1}".format(
-                            cap_waited, self.mds_session_timeout
-                        ))
+        # Simulate client death
+        self.mount_a.kill()
 
-        cap_holder.stdin.close()
         try:
-            cap_holder.wait()
-        except CommandFailedError:
-            # We killed it, so it raises an error
-            pass
-
-        self.mount_a.kill_cleanup()
+            # Now, after mds_session_timeout seconds, the waiter should
+            # complete their operation when the MDS marks the holder's
+            # session stale.
+            cap_waiter = self.mount_b.write_background()
+            a = time.time()
+            cap_waiter.wait()
+            b = time.time()
+
+            # Should have succeeded
+            self.assertEqual(cap_waiter.exitstatus, 0)
+
+            cap_waited = b - a
+            log.info("cap_waiter waited {0}s".format(cap_waited))
+            self.assertTrue(self.mds_session_timeout / 2.0 <= cap_waited <= self.mds_session_timeout * 2.0,
+                            "Capability handover took {0}, expected approx {1}".format(
+                                cap_waited, self.mds_session_timeout
+                            ))
+
+            cap_holder.stdin.close()
+            try:
+                cap_holder.wait()
+            except CommandFailedError:
+                # We killed it, so it raises an error
+                pass
+        finally:
+            # teardown() doesn't quite handle this case cleanly, so help it out
+            self.mount_a.kill_cleanup()
 
         self.mount_a.mount()
         self.mount_a.wait_until_mounted()
@@ -220,37 +230,44 @@ class TestClientRecovery(unittest.TestCase):
         # and then immediately kill it.
         cap_holder = self.mount_a.open_background()
         mount_a_client_id = self.mount_a.get_client_id()
-        self.mount_a.kill()
 
-        # The waiter should get stuck waiting for the capability
-        # held on the MDS by the now-dead client A
-        cap_waiter = self.mount_b.write_background()
-        time.sleep(5)
-        self.assertFalse(cap_waiter.finished)
+        # Wait for the file to be visible from another client, indicating
+        # that mount_a has completed its network ops
+        self.mount_b.wait_for_visible()
 
-        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
-        # Now, because I evicted the old holder of the capability, it should
-        # immediately get handed over to the waiter
-        a = time.time()
-        cap_waiter.wait()
-        b = time.time()
-        cap_waited = b - a
-        log.info("cap_waiter waited {0}s".format(cap_waited))
-        # This is the check that it happened 'now' rather than waiting
-        # for the session timeout
-        self.assertLess(cap_waited, self.mds_session_timeout / 2.0,
-                        "Capability handover took {0}, expected less than {1}".format(
-                            cap_waited, self.mds_session_timeout / 2.0
-                        ))
+        # Simulate client death
+        self.mount_a.kill()
 
-        cap_holder.stdin.close()
         try:
-            cap_holder.wait()
-        except CommandFailedError:
-            # We killed it, so it raises an error
-            pass
-
-        self.mount_a.kill_cleanup()
+            # The waiter should get stuck waiting for the capability
+            # held on the MDS by the now-dead client A
+            cap_waiter = self.mount_b.write_background()
+            time.sleep(5)
+            self.assertFalse(cap_waiter.finished)
+
+            self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+            # Now, because I evicted the old holder of the capability, it should
+            # immediately get handed over to the waiter
+            a = time.time()
+            cap_waiter.wait()
+            b = time.time()
+            cap_waited = b - a
+            log.info("cap_waiter waited {0}s".format(cap_waited))
+            # This is the check that it happened 'now' rather than waiting
+            # for the session timeout
+            self.assertLess(cap_waited, self.mds_session_timeout / 2.0,
+                            "Capability handover took {0}, expected less than {1}".format(
+                                cap_waited, self.mds_session_timeout / 2.0
+                            ))
+
+            cap_holder.stdin.close()
+            try:
+                cap_holder.wait()
+            except CommandFailedError:
+                # We killed it, so it raises an error
+                pass
+        finally:
+            self.mount_a.kill_cleanup()
 
         self.mount_a.mount()
         self.mount_a.wait_until_mounted()
-- 
2.39.5