git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
qa/cephadm: add test for cephadm asyncio based timeout
author Adam King <adking@redhat.com>
Sat, 4 Nov 2023 22:45:17 +0000 (18:45 -0400)
committer Adam King <adking@redhat.com>
Wed, 28 Feb 2024 20:04:48 +0000 (15:04 -0500)
Adds a test that will set the default cephadm command
timeout and then force a timeout to occur by holding
the cephadm lock and triggering a device refresh.
This works because cephadm ceph-volume commands
require the cephadm lock to run, so the command will
time out waiting for the lock to become available.

Signed-off-by: Adam King <adking@redhat.com>
(cherry picked from commit efb69ee350d1c20bdbfc88db8a01ec88761bd99f)

qa/suites/orch/cephadm/workunits/task/test_cephadm_timeout.yaml [new file with mode: 0644]
qa/workunits/cephadm/test_cephadm_timeout.py [new file with mode: 0755]

diff --git a/qa/suites/orch/cephadm/workunits/task/test_cephadm_timeout.yaml b/qa/suites/orch/cephadm/workunits/task/test_cephadm_timeout.yaml
new file mode 100644 (file)
index 0000000..24b53d0
--- /dev/null
@@ -0,0 +1,13 @@
+# All roles (mon, mgr, osd, client) are listed together under host.a.
+roles:
+- - host.a
+  - mon.a
+  - mgr.a
+  - osd.0
+  - client.0
+# Run the install and cephadm tasks, then the timeout workunit on client.0.
+tasks:
+- install:
+- cephadm:
+- workunit:
+    clients:
+      client.0:
+        - cephadm/test_cephadm_timeout.py
\ No newline at end of file
diff --git a/qa/workunits/cephadm/test_cephadm_timeout.py b/qa/workunits/cephadm/test_cephadm_timeout.py
new file mode 100755 (executable)
index 0000000..67b43a2
--- /dev/null
@@ -0,0 +1,179 @@
+#!/usr/bin/python3 -s
+
+import time
+import os
+import fcntl
+import subprocess
+import uuid
+import sys
+
+from typing import Optional, Any
+
+LOCK_DIR = '/run/cephadm'
+DATA_DIR = '/var/lib/ceph'
+
+class _Acquire_ReturnProxy(object):
+    """Context-manager wrapper around an already acquired lock.
+
+    Entering the ``with`` block yields the wrapped lock object; leaving
+    the block calls that lock's ``release()``.
+    """
+
+    def __init__(self, lock: 'FileLock') -> None:
+        self.lock = lock
+
+    def __enter__(self) -> 'FileLock':
+        held = self.lock
+        return held
+
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
+        # Implicitly returns None, so an exception raised inside the
+        # ``with`` body is not suppressed.
+        self.lock.release()
+
+
+class FileLock(object):
+    """Exclusive ``flock``-based lock on the file ``LOCK_DIR/<name>.lock``.
+
+    ``acquire()`` calls on the same object nest: a counter tracks them and
+    the file is only unlocked once ``release()`` has been called as many
+    times (or once with ``force=True``). The object can also be used
+    directly as a context manager.
+    """
+
+    def __init__(self, name: str, timeout: int = -1) -> None:
+        """Prepare (but do not take) the lock called *name*.
+
+        *timeout* is the default number of seconds ``acquire()`` waits;
+        a negative value means wait indefinitely.
+        """
+        # Initialise state before touching the filesystem so that
+        # is_locked and __del__ still work if creating LOCK_DIR raises.
+        self._lock_file_fd: Optional[int] = None
+        self._lock_counter = 0
+        self.timeout = timeout
+
+        # exist_ok avoids the exists()-then-mkdir() race when another
+        # process creates LOCK_DIR at the same moment.
+        os.makedirs(LOCK_DIR, mode=0o700, exist_ok=True)
+        self._lock_file = os.path.join(LOCK_DIR, name + '.lock')
+
+    @property
+    def is_locked(self) -> bool:
+        """True while this object holds an open, locked file descriptor."""
+        return self._lock_file_fd is not None
+
+    def acquire(self, timeout: Optional[int] = None, poll_intervall: float = 0.05) -> _Acquire_ReturnProxy:
+        """Take the lock, polling every *poll_intervall* seconds.
+
+        *timeout* defaults to the value given to the constructor; a
+        negative value waits forever. Returns a proxy usable in a
+        ``with`` statement that releases the lock on exit.
+
+        Raises ``Exception`` (message: the lock file path) when the lock
+        could not be taken within *timeout* seconds.
+        """
+        # Use the default timeout, if no timeout is provided.
+        if timeout is None:
+            timeout = self.timeout
+
+        # Increment the number right at the beginning.
+        # We can still undo it, if something fails.
+        self._lock_counter += 1
+
+        start_time = time.time()
+        try:
+            while True:
+                if not self.is_locked:
+                    self._acquire()
+
+                if self.is_locked:
+                    break
+                elif timeout >= 0 and time.time() - start_time > timeout:
+                    raise Exception(self._lock_file)
+                else:
+                    time.sleep(poll_intervall)
+        except Exception:
+            # Something did go wrong, so decrement the counter.
+            self._lock_counter = max(0, self._lock_counter - 1)
+
+            raise
+        return _Acquire_ReturnProxy(lock=self)
+
+    def release(self, force: bool = False) -> None:
+        """Undo one ``acquire()``; unlock the file when the count hits zero.
+
+        With *force* the file is unlocked regardless of the nesting count.
+        Does nothing when the lock is not currently held.
+        """
+        if self.is_locked:
+            self._lock_counter -= 1
+
+            if self._lock_counter == 0 or force:
+                self._release()
+                self._lock_counter = 0
+
+    def __enter__(self) -> 'FileLock':
+        self.acquire()
+        return self
+
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
+        self.release()
+
+    def __del__(self) -> None:
+        # Drop the lock when the object is garbage collected, however
+        # many nested acquire() calls are still outstanding.
+        self.release(force=True)
+
+    def _acquire(self) -> None:
+        """Make one non-blocking attempt to lock the file.
+
+        On success the descriptor is stored in ``_lock_file_fd``; if the
+        ``flock`` call fails the descriptor is closed and the object
+        stays unlocked so the caller can retry.
+        """
+        open_mode = os.O_RDWR | os.O_CREAT | os.O_TRUNC
+        fd = os.open(self._lock_file, open_mode)
+
+        try:
+            fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
+        except (IOError, OSError):
+            os.close(fd)
+        else:
+            self._lock_file_fd = fd
+
+    def _release(self) -> None:
+        """Unlock and close the held descriptor, if there is one."""
+        fd = self._lock_file_fd
+        self._lock_file_fd = None
+        if fd is None:
+            return
+        try:
+            fcntl.flock(fd, fcntl.LOCK_UN)
+        finally:
+            # Always close, so a failing unlock cannot leak the descriptor.
+            os.close(fd)
+
+
+def _is_fsid(s):
+    """Return True when *s* is accepted by ``uuid.UUID``, else False."""
+    parses = True
+    try:
+        uuid.UUID(s)
+    except ValueError:
+        parses = False
+    return parses
+
+def find_fsid():
+    """Return the first entry in DATA_DIR whose name is a valid fsid.
+
+    Raises ``Exception`` when DATA_DIR does not exist or contains no
+    entry whose name passes ``_is_fsid``.
+    """
+    if not os.path.exists(DATA_DIR):
+        raise Exception(f'{DATA_DIR} does not exist. Aborting...')
+
+    # assume the first thing we find that is an fsid
+    # is what we want. Not expecting multiple clusters
+    # to have been installed here.
+    fsid = next((d for d in os.listdir(DATA_DIR) if _is_fsid(d)), None)
+    if fsid is None:
+        raise Exception(f'No fsid dir found in {DATA_DIR}. Aborting...')
+    return fsid
+
+def main():
+    """Force a cephadm command timeout and verify the health warning.
+
+    Steps: find the cluster fsid, set the mgr/cephadm command timeout to
+    120 seconds, hold the FileLock named after the fsid, trigger a device
+    refresh, wait 150 seconds and then check that ``ceph health detail``
+    reports CEPHADM_REFRESH_FAILED with the expected timeout message.
+
+    Returns 0 on success. Raises ``Exception`` when an expected warning
+    is missing, and ``subprocess.CalledProcessError`` when one of the
+    ``cephadm shell`` commands exits non-zero.
+    """
+    print('Looking for cluster fsid...')
+    fsid = find_fsid()
+    print(f'Found fsid {fsid}')
+
+    print('Setting cephadm command timeout to 120...')
+    subprocess.run(
+        ['cephadm', 'shell', '--', 'ceph', 'config', 'set',
+         'mgr', 'mgr/cephadm/default_cephadm_command_timeout', '120'],
+        check=True)
+
+    # NOTE: 300 is the acquire timeout handed to FileLock, not a hold
+    # duration; the lock stays held until the ``with`` block is left.
+    print('Taking hold of cephadm lock for 300 seconds...')
+    with FileLock(fsid, 300):
+        print('Triggering cephadm device refresh...')
+        subprocess.run(
+            ['cephadm', 'shell', '--', 'ceph', 'orch', 'device', 'ls', '--refresh'],
+            check=True)
+
+        print('Sleeping 150 seconds to allow for timeout to occur...')
+        time.sleep(150)
+
+        print('Checking ceph health detail...')
+        # directing stdout to res.stdout via the "capture_output" option
+        # (and same for stderr) was only added in python 3.7.
+        # Using files so this works with 3.6 as well
+        with open('/tmp/ceph-health-detail-stdout', 'w') as f_stdout:
+            with open('/tmp/ceph-health-detail-stderr', 'w') as f_stderr:
+                subprocess.run(['cephadm', 'shell', '--', 'ceph', 'health', 'detail'],
+                               check=True, stdout=f_stdout, stderr=f_stderr)
+
+    # Read the captured output back, closing the handles deterministically.
+    with open('/tmp/ceph-health-detail-stdout', 'r') as f_stdout:
+        res_stdout = f_stdout.read()
+    with open('/tmp/ceph-health-detail-stderr', 'r') as f_stderr:
+        res_stderr = f_stderr.read()
+    print(f'"cephadm shell -- ceph health detail" stdout:\n{res_stdout}')
+    print(f'"cephadm shell -- ceph health detail" stderr:\n{res_stderr}')
+
+    print('Checking for correct health warning in health detail...')
+    if 'CEPHADM_REFRESH_FAILED' not in res_stdout:
+        raise Exception('No health warning caused by timeout was raised')
+    if 'Command "cephadm ceph-volume -- inventory" timed out' not in res_stdout:
+        raise Exception('Health warnings did not contain message about time out')
+
+    print('Health warnings found succesfully. Exiting.')
+    return 0
+
+    
+if __name__ == '__main__':
+    # Everything main() does needs root, so re-run under sudo when
+    # started as any other user.
+    if os.getuid() != 0:
+        print('Trying to run myself with sudo...')
+        # exec replaces the process without flushing stdio buffers.
+        sys.stdout.flush()
+        # The first list element becomes argv[0] of the new process, so
+        # it must be 'sudo' itself; otherwise sudo is handed the script
+        # path as the command to run instead of the interpreter.
+        os.execvp('sudo', ['sudo', sys.executable] + list(sys.argv))
+    # Propagate main()'s return value as the process exit status.
+    sys.exit(main())