git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
tasks: fix race in test_stale_caps
author     John Spray <jspray@redhat.com>
           Mon, 15 Sep 2014 11:50:55 +0000 (12:50 +0100)
committer  John Spray <jspray@redhat.com>
           Mon, 15 Sep 2014 13:32:20 +0000 (14:32 +0100)
Signed-off-by: John Spray <john.spray@redhat.com>
tasks/cephfs/mount.py
tasks/mds_client_recovery.py

tasks/cephfs/mount.py
index 42b943db37627c7fd964b12cd743e6222c62a428..1d34079ed9bf318aab722fa343f38db3c639214f 100644 (file)
@@ -1,6 +1,7 @@
 from contextlib import contextmanager
 import logging
 import datetime
+import time
 from textwrap import dedent
 import os
 from teuthology.orchestra import run
@@ -101,6 +102,10 @@ class CephFSMount(object):
             'sudo', 'daemon-helper', 'kill', 'python', '-c', pyscript
         ], wait=False, stdin=run.PIPE)
 
+    def run_shell(self, args):
+        args = ["cd", self.mountpoint, run.Raw('&&')] + args
+        return self.client_remote.run(args=args)
+
     def open_background(self, basename="background_file"):
         """
         Open a file for writing, then block such that the client
@@ -125,6 +130,23 @@ class CephFSMount(object):
         self.background_procs.append(rproc)
         return rproc
 
+    def wait_for_visible(self, basename="background_file", timeout=30):
+        i = 0
+        while i < timeout:
+            r = self.client_remote.run(args=[
+                'sudo', 'ls', os.path.join(self.mountpoint, basename)
+            ], check_status=False)
+            if r.exitstatus == 0:
+                log.debug("File {0} became visible from {1} after {2}s".format(
+                    basename, self.client_id, i))
+                return
+            else:
+                time.sleep(1)
+                i += 1
+
+        raise RuntimeError("Timed out after {0}s waiting for {1} to become visible from {2}".format(
+            i, basename, self.client_id))
+
     def write_background(self, basename="background_file"):
         """
         Open a file for writing, complete as soon as you can
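For orientation, the polling idiom introduced by wait_for_visible() above can be sketched in isolation. The helper below is a hypothetical, standalone equivalent of that loop (the name wait_for and the interval parameter are illustrative, not part of this commit):

    import time

    def wait_for(predicate, timeout=30, interval=1):
        """Poll predicate() once per interval until it returns True, or raise on timeout."""
        waited = 0
        while waited < timeout:
            if predicate():
                return waited          # seconds we had to wait
            time.sleep(interval)
            waited += interval
        raise RuntimeError("Timed out after {0}s".format(timeout))

In the commit itself the predicate is a remote "sudo ls <mountpoint>/<basename>" whose exit status tells us whether the other client can already see the file.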
tasks/mds_client_recovery.py
index 560a2f846a78dbe89c88a8aecac0b8f9c60f897c..12de23113a64be689de1cd09ac0386ef380049a3 100644 (file)
@@ -40,6 +40,8 @@ class TestClientRecovery(unittest.TestCase):
         self.mount_a.wait_until_mounted()
         self.mount_b.wait_until_mounted()
 
+        self.mount_a.run_shell(["rm", "-rf", "*"])
+
     def tearDown(self):
         self.fs.clear_firewall()
         self.mount_a.teardown()
@@ -180,34 +182,42 @@ class TestClientRecovery(unittest.TestCase):
         # Capability release from stale session
         # =====================================
         cap_holder = self.mount_a.open_background()
-        self.mount_a.kill()
-
-        # Now, after mds_session_timeout seconds, the waiter should
-        # complete their operation when the MDS marks the holder's
-        # session stale.
-        cap_waiter = self.mount_b.write_background()
-        a = time.time()
-        cap_waiter.wait()
-        b = time.time()
 
-        # Should have succeeded
-        self.assertEqual(cap_waiter.exitstatus, 0)
+        # Wait for the file to be visible from another client, indicating
+        # that mount_a has completed its network ops
+        self.mount_b.wait_for_visible()
 
-        cap_waited = b - a
-        log.info("cap_waiter waited {0}s".format(cap_waited))
-        self.assertTrue(self.mds_session_timeout / 2.0 <= cap_waited <= self.mds_session_timeout * 2.0,
-                        "Capability handover took {0}, expected approx {1}".format(
-                            cap_waited, self.mds_session_timeout
-                        ))
+        # Simulate client death
+        self.mount_a.kill()
 
-        cap_holder.stdin.close()
         try:
-            cap_holder.wait()
-        except CommandFailedError:
-            # We killed it, so it raises an error
-            pass
-
-        self.mount_a.kill_cleanup()
+            # Now, after mds_session_timeout seconds, the waiter should
+            # complete their operation when the MDS marks the holder's
+            # session stale.
+            cap_waiter = self.mount_b.write_background()
+            a = time.time()
+            cap_waiter.wait()
+            b = time.time()
+
+            # Should have succeeded
+            self.assertEqual(cap_waiter.exitstatus, 0)
+
+            cap_waited = b - a
+            log.info("cap_waiter waited {0}s".format(cap_waited))
+            self.assertTrue(self.mds_session_timeout / 2.0 <= cap_waited <= self.mds_session_timeout * 2.0,
+                            "Capability handover took {0}, expected approx {1}".format(
+                                cap_waited, self.mds_session_timeout
+                            ))
+
+            cap_holder.stdin.close()
+            try:
+                cap_holder.wait()
+            except CommandFailedError:
+                # We killed it, so it raises an error
+                pass
+        finally:
+            # teardown() doesn't quite handle this case cleanly, so help it out
+            self.mount_a.kill_cleanup()
 
         self.mount_a.mount()
         self.mount_a.wait_until_mounted()
@@ -220,37 +230,44 @@ class TestClientRecovery(unittest.TestCase):
         # and then immediately kill it.
         cap_holder = self.mount_a.open_background()
         mount_a_client_id = self.mount_a.get_client_id()
-        self.mount_a.kill()
 
-        # The waiter should get stuck waiting for the capability
-        # held on the MDS by the now-dead client A
-        cap_waiter = self.mount_b.write_background()
-        time.sleep(5)
-        self.assertFalse(cap_waiter.finished)
+        # Wait for the file to be visible from another client, indicating
+        # that mount_a has completed its network ops
+        self.mount_b.wait_for_visible()
 
-        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
-        # Now, because I evicted the old holder of the capability, it should
-        # immediately get handed over to the waiter
-        a = time.time()
-        cap_waiter.wait()
-        b = time.time()
-        cap_waited = b - a
-        log.info("cap_waiter waited {0}s".format(cap_waited))
-        # This is the check that it happened 'now' rather than waiting
-        # for the session timeout
-        self.assertLess(cap_waited, self.mds_session_timeout / 2.0,
-                        "Capability handover took {0}, expected less than {1}".format(
-                            cap_waited, self.mds_session_timeout / 2.0
-                        ))
+        # Simulate client death
+        self.mount_a.kill()
 
-        cap_holder.stdin.close()
         try:
-            cap_holder.wait()
-        except CommandFailedError:
-            # We killed it, so it raises an error
-            pass
-
-        self.mount_a.kill_cleanup()
+            # The waiter should get stuck waiting for the capability
+            # held on the MDS by the now-dead client A
+            cap_waiter = self.mount_b.write_background()
+            time.sleep(5)
+            self.assertFalse(cap_waiter.finished)
+
+            self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+            # Now, because I evicted the old holder of the capability, it should
+            # immediately get handed over to the waiter
+            a = time.time()
+            cap_waiter.wait()
+            b = time.time()
+            cap_waited = b - a
+            log.info("cap_waiter waited {0}s".format(cap_waited))
+            # This is the check that it happened 'now' rather than waiting
+            # for the session timeout
+            self.assertLess(cap_waited, self.mds_session_timeout / 2.0,
+                            "Capability handover took {0}, expected less than {1}".format(
+                                cap_waited, self.mds_session_timeout / 2.0
+                            ))
+
+            cap_holder.stdin.close()
+            try:
+                cap_holder.wait()
+            except CommandFailedError:
+                # We killed it, so it raises an error
+                pass
+        finally:
+            self.mount_a.kill_cleanup()
 
         self.mount_a.mount()
         self.mount_a.wait_until_mounted()
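Put together, the ordering the fix enforces in both scenarios is roughly the following (a condensed, hypothetical sketch of the test flow, not the actual test code; stale_caps_scenario is an illustrative name):

    def stale_caps_scenario(mount_a, mount_b):
        # Illustrative condensation of the fixed ordering; not the actual test code.
        cap_holder = mount_a.open_background()   # async open on client A, which holds the caps
        mount_b.wait_for_visible()               # NEW: proves client A's ops reached the cluster
        mount_a.kill()                           # only then simulate client death
        try:
            cap_waiter = mount_b.write_background()
            cap_waiter.wait()                    # completes once A's stale caps are released
            cap_holder.stdin.close()
        finally:
            mount_a.kill_cleanup()               # teardown() alone doesn't handle a killed mount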