From 1d9101cf31064774a7cbe740ce27f1dfee4e3cd7 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Mon, 15 Sep 2014 12:50:55 +0100
Subject: [PATCH] tasks: fix race in test_stale_caps

Signed-off-by: John Spray
---
 tasks/cephfs/mount.py        |  22 +++++++
 tasks/mds_client_recovery.py | 119 ++++++++++++++++++++---------------
 2 files changed, 90 insertions(+), 51 deletions(-)

diff --git a/tasks/cephfs/mount.py b/tasks/cephfs/mount.py
index 42b943db37627..1d34079ed9bf3 100644
--- a/tasks/cephfs/mount.py
+++ b/tasks/cephfs/mount.py
@@ -1,6 +1,7 @@
 from contextlib import contextmanager
 import logging
 import datetime
+import time
 from textwrap import dedent
 import os
 from teuthology.orchestra import run
@@ -101,6 +102,10 @@ class CephFSMount(object):
             'sudo', 'daemon-helper', 'kill', 'python', '-c', pyscript
         ], wait=False, stdin=run.PIPE)
 
+    def run_shell(self, args):
+        args = ["cd", self.mountpoint, run.Raw('&&')] + args
+        return self.client_remote.run(args=args)
+
     def open_background(self, basename="background_file"):
         """
         Open a file for writing, then block such that the client
@@ -125,6 +130,23 @@ class CephFSMount(object):
         self.background_procs.append(rproc)
         return rproc
 
+    def wait_for_visible(self, basename="background_file", timeout=30):
+        i = 0
+        while i < timeout:
+            r = self.client_remote.run(args=[
+                'sudo', 'ls', os.path.join(self.mountpoint, basename)
+            ], check_status=False)
+            if r.exitstatus == 0:
+                log.debug("File {0} became visible from {1} after {2}s".format(
+                    basename, self.client_id, i))
+                return
+            else:
+                time.sleep(1)
+                i += 1
+
+        raise RuntimeError("Timed out after {0}s waiting for {1} to become visible from {2}".format(
+            i, basename, self.client_id))
+
     def write_background(self, basename="background_file"):
         """
         Open a file for writing, complete as soon as you can
diff --git a/tasks/mds_client_recovery.py b/tasks/mds_client_recovery.py
index 560a2f846a78d..12de23113a64b 100644
--- a/tasks/mds_client_recovery.py
+++ b/tasks/mds_client_recovery.py
@@ -40,6 +40,8 @@ class TestClientRecovery(unittest.TestCase):
         self.mount_a.wait_until_mounted()
         self.mount_b.wait_until_mounted()
 
+        self.mount_a.run_shell(["rm", "-rf", "*"])
+
     def tearDown(self):
         self.fs.clear_firewall()
         self.mount_a.teardown()
@@ -180,34 +182,42 @@ class TestClientRecovery(unittest.TestCase):
         # Capability release from stale session
         # =====================================
         cap_holder = self.mount_a.open_background()
-        self.mount_a.kill()
-
-        # Now, after mds_session_timeout seconds, the waiter should
-        # complete their operation when the MDS marks the holder's
-        # session stale.
-        cap_waiter = self.mount_b.write_background()
-        a = time.time()
-        cap_waiter.wait()
-        b = time.time()
 
-        # Should have succeeded
-        self.assertEqual(cap_waiter.exitstatus, 0)
+        # Wait for the file to be visible from another client, indicating
+        # that mount_a has completed its network ops
+        self.mount_b.wait_for_visible()
 
-        cap_waited = b - a
-        log.info("cap_waiter waited {0}s".format(cap_waited))
-        self.assertTrue(self.mds_session_timeout / 2.0 <= cap_waited <= self.mds_session_timeout * 2.0,
-                        "Capability handover took {0}, expected approx {1}".format(
-                            cap_waited, self.mds_session_timeout
-                        ))
+        # Simulate client death
+        self.mount_a.kill()
 
-        cap_holder.stdin.close()
         try:
-            cap_holder.wait()
-        except CommandFailedError:
-            # We killed it, so it raises an error
-            pass
-
-        self.mount_a.kill_cleanup()
+            # Now, after mds_session_timeout seconds, the waiter should
+            # complete their operation when the MDS marks the holder's
+            # session stale.
+            cap_waiter = self.mount_b.write_background()
+            a = time.time()
+            cap_waiter.wait()
+            b = time.time()
+
+            # Should have succeeded
+            self.assertEqual(cap_waiter.exitstatus, 0)
+
+            cap_waited = b - a
+            log.info("cap_waiter waited {0}s".format(cap_waited))
+            self.assertTrue(self.mds_session_timeout / 2.0 <= cap_waited <= self.mds_session_timeout * 2.0,
+                            "Capability handover took {0}, expected approx {1}".format(
+                                cap_waited, self.mds_session_timeout
+                            ))
+
+            cap_holder.stdin.close()
+            try:
+                cap_holder.wait()
+            except CommandFailedError:
+                # We killed it, so it raises an error
+                pass
+        finally:
+            # teardown() doesn't quite handle this case cleanly, so help it out
+            self.mount_a.kill_cleanup()
 
         self.mount_a.mount()
         self.mount_a.wait_until_mounted()
@@ -220,37 +230,44 @@ class TestClientRecovery(unittest.TestCase):
         # and then immediately kill it.
         cap_holder = self.mount_a.open_background()
         mount_a_client_id = self.mount_a.get_client_id()
-        self.mount_a.kill()
 
-        # The waiter should get stuck waiting for the capability
-        # held on the MDS by the now-dead client A
-        cap_waiter = self.mount_b.write_background()
-        time.sleep(5)
-        self.assertFalse(cap_waiter.finished)
+        # Wait for the file to be visible from another client, indicating
+        # that mount_a has completed its network ops
+        self.mount_b.wait_for_visible()
 
-        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
-        # Now, because I evicted the old holder of the capability, it should
-        # immediately get handed over to the waiter
-        a = time.time()
-        cap_waiter.wait()
-        b = time.time()
-        cap_waited = b - a
-        log.info("cap_waiter waited {0}s".format(cap_waited))
-        # This is the check that it happened 'now' rather than waiting
-        # for the session timeout
-        self.assertLess(cap_waited, self.mds_session_timeout / 2.0,
-                        "Capability handover took {0}, expected less than {1}".format(
-                            cap_waited, self.mds_session_timeout / 2.0
-                        ))
+        # Simulate client death
+        self.mount_a.kill()
 
-        cap_holder.stdin.close()
         try:
-            cap_holder.wait()
-        except CommandFailedError:
-            # We killed it, so it raises an error
-            pass
-
-        self.mount_a.kill_cleanup()
+            # The waiter should get stuck waiting for the capability
+            # held on the MDS by the now-dead client A
+            cap_waiter = self.mount_b.write_background()
+            time.sleep(5)
+            self.assertFalse(cap_waiter.finished)
+
+            self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+            # Now, because I evicted the old holder of the capability, it should
+            # immediately get handed over to the waiter
+            a = time.time()
+            cap_waiter.wait()
+            b = time.time()
+            cap_waited = b - a
+            log.info("cap_waiter waited {0}s".format(cap_waited))
+            # This is the check that it happened 'now' rather than waiting
+            # for the session timeout
+            self.assertLess(cap_waited, self.mds_session_timeout / 2.0,
+                            "Capability handover took {0}, expected less than {1}".format(
+                                cap_waited, self.mds_session_timeout / 2.0
+                            ))
+
+            cap_holder.stdin.close()
+            try:
+                cap_holder.wait()
+            except CommandFailedError:
+                # We killed it, so it raises an error
+                pass
+        finally:
+            self.mount_a.kill_cleanup()
 
         self.mount_a.mount()
         self.mount_a.wait_until_mounted()
-- 
2.39.5