From: sunilkumarn417
Date: Sun, 23 Feb 2020 12:30:39 +0000 (+0530)
Subject: Updated system tests with specific validations
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=c2917794f76bf369bb3f460ee48ad3fa21f676ea;p=ceph.git

Updated system tests with specific validations

Signed-off-by: sunilkumarn417
---

diff --git a/qa/suites/mixed-system-tests/single/tasks/basic.yaml b/qa/suites/mixed-system-tests/single/tasks/basic.yaml
index 06b6d288a260d..777dfe6b6e2d3 100644
--- a/qa/suites/mixed-system-tests/single/tasks/basic.yaml
+++ b/qa/suites/mixed-system-tests/single/tasks/basic.yaml
@@ -7,7 +7,7 @@ tasks:
     config:
       user_count: 5
       bucket_count: 5
-      objects_count: 20
+      objects_count: 10
      objects_size_range:
         min: 100
         max: 200
@@ -89,6 +89,6 @@ tasks:
      io-engine: rbd
      test-clone-io: 1
      rw: randrw
-      runtime: 600
+      runtime: 900
 - mixed_system_test.restart_tests:
-    daemons: ["mgr", "mon", "osd"]
+    daemons: ["mgr", "mon", "mds", "osd"]
diff --git a/qa/tasks/mixed_system_test.py b/qa/tasks/mixed_system_test.py
index 83ce2750c0d64..04f402de4ee77 100644
--- a/qa/tasks/mixed_system_test.py
+++ b/qa/tasks/mixed_system_test.py
@@ -3,6 +3,7 @@ import logging
 
 from tasks.mixed_system_tests import system
 from tasks.mixed_system_tests import ios
+from time import sleep
 
 log = logging.getLogger(__name__)
 
@@ -60,10 +61,14 @@ def restart_tests(ctx, config):
     daemons = config.get('daemons')
     try:
         for daemon in daemons:
+            stats = system.get_daemon_info(daemon, ctx)
+            assert stats and stats.get("active_count") > 0,\
+                "{} not found in cluster".format(daemon)
             assert system.ceph_daemon_system_test(ctx, daemon)
             log.info("{} completed".format(daemon))
+            sleep(60)
         yield
     except Exception as err:
         assert False, err
     finally:
-        log.info("Daemon(s) Service system tests completed")
+        log.info("Daemon restart system tests completed")
diff --git a/qa/tasks/mixed_system_tests/constants.py b/qa/tasks/mixed_system_tests/constants.py
deleted file mode 100644
index 1533d68561120..0000000000000
--- a/qa/tasks/mixed_system_tests/constants.py
+++ /dev/null
@@ -1,19 +0,0 @@
-"""
-Add your constants here
-
-"""
-
-cephqe_scripts = {
-    "DIR": {"v1": {"script": "rgw/v1/tests/s3/", "config": "rgw/v1/tests/s3/yamls"},
-            "v2": {"script": "rgw/v2/tests/s3_swift/", "config": "rgw/v2/tests/s3_swift/configs"}
-            },
-    "MASTER_BRANCH": "master",
-    "REPO_NAME": "ceph-qe-scripts",
-    "WIP_BRANCH": None
-}
-
-HEALTH = {
-    "error": "HEALTH_ERR",
-    "warn": "HEALTH_WARN",
-    "good": "HEALTH_OK"
-}
diff --git a/qa/tasks/mixed_system_tests/ios.py b/qa/tasks/mixed_system_tests/ios.py
index d55b70c438c0c..673418279438c 100644
--- a/qa/tasks/mixed_system_tests/ios.py
+++ b/qa/tasks/mixed_system_tests/ios.py
@@ -9,11 +9,24 @@ import pwd
 import time
 
 from teuthology.orchestra import run
-from constants import cephqe_scripts
 
 log = logging.getLogger(__name__)
 
 
+cephqe_scripts = {
+    "DIR": {"v1":
+            {"script": "rgw/v1/tests/s3/",
+             "config": "rgw/v1/tests/s3/yamls"},
+            "v2":
+            {"script": "rgw/v2/tests/s3_swift/",
+             "config": "rgw/v2/tests/s3_swift/configs"}
+            },
+    "MASTER_BRANCH": "master",
+    "REPO_NAME": "ceph-qe-scripts",
+    "WIP_BRANCH": None
+}
+
+
 class rgw_ios:
     """
     RGW IOS using ceph-qe-scripts
diff --git a/qa/tasks/mixed_system_tests/system.py b/qa/tasks/mixed_system_tests/system.py
index a1a7b6543f727..8a5e92c623130 100644
--- a/qa/tasks/mixed_system_tests/system.py
+++ b/qa/tasks/mixed_system_tests/system.py
@@ -1,15 +1,156 @@
 """
-System tests
+System Tests
 """
 
 import logging
+from cStringIO import StringIO
 from time import sleep
-from constants import HEALTH
 from teuthology.orchestra import run
+from teuthology.exceptions import CommandFailedError
 
 log = logging.getLogger(__name__)
 
 
+_MGR = "mgr"
+_OSD = "osd"
+_MON = "mon"
+_MDS = "mds"
+
+_CEPH_HEALTH = {
+    "error": "HEALTH_ERR",
+    "warn": "HEALTH_WARN",
+    "good": "HEALTH_OK"
+}
+
+_systemd_cmd = 'sudo systemctl {action} {daemon}@{id_}'
+
+
+def __mark(dstate):
+    """
+    Logging marker
+    Args:
+        dstate: daemon state
+    """
+    return " [ {}:{} ]".format(dstate.type_.upper(), dstate.id_.upper())
+
+
+def __wait(seconds=60, msg=None):
+    """
+    Sleep for the given number of seconds and log the reason
+    """
+    log.info("Wait for {} seconds......".format(seconds))
+    log.info("Reason to wait : {}".format(msg))
+    sleep(seconds)
+    log.info("wait completed......")
+
+
+def daemon_service(dstate, action, retries=10):
+    """
+    Perform the systemctl command with the action provided
+    Args:
+        dstate: Daemon state
+        action: action to be performed
+        retries: number of retries
+    """
+    mark = __mark(dstate)
+    daemon = "{cluster}-{role}".format(cluster=dstate.cluster, role=dstate.type_)
+    daemon_id = "{id}".format(id=dstate.id_)
+
+    log.info("{} {} daemon".format(mark, action.upper()))
+    while retries:
+        retries -= 1
+        try:
+            getattr(dstate, action)()
+            __wait(60, msg="systemctl command executed")
+            res = dstate.remote.run(args=[run.Raw(dstate.show_cmd)], stdout=StringIO())
+            res = res.stdout.read().lower()
+            if "ActiveState=failed".lower() in res:
+                assert False, res
+            log.info("{} {} daemon - Successful".format(mark, action))
+            return
+        except (AssertionError, CommandFailedError) as err:
+            log.error("{} Command execution failed - {}".format(mark, err))
+            log.warn("{} Trying to {}, Retries left: {}".format(mark, action,
+                                                                retries))
+            cmd = "sudo systemctl reset-failed"
+            log.warn("{} Running '{}'".format(mark, cmd))
+            dstate.remote.run(args=[run.Raw(cmd)])
+            __wait(10, msg="Reset failed daemons")
+            cmd = "sudo systemctl daemon-reload"
+            log.warn("{} Running '{}'".format(mark, cmd))
+            dstate.remote.run(args=[run.Raw(cmd)])
+            __wait(10, msg="Daemon reloaded")
+            log.warn("{} Restarting daemon".format(mark))
+            dstate.restart()
+            __wait(30, msg="Daemon Restarted")
+    else:
+        assert False, "{} Unable to complete {} action".format(mark, action)
+
+
+def get_daemon_info(daemon, ctx):
+    """
+    Get the number of daemons and their running states
+    Args:
+        daemon: daemon name
+        ctx: ceph context object
+    Return:
+        daemon_stat: daemon details
+    """
+    daemon = "ceph.%s" % daemon.lower() \
+        if not daemon.lower().startswith("ceph.") else daemon
+
+    log.info(" [ {} ] Get daemon information".format(daemon.upper()))
+
+    daemon_stat = {
+        "count": None,
+        "daemons": None,
+        "active": dict(),
+        "active_count": None,
+        "inactive": dict(),
+        "inactive_count": None
+    }
+
+    try:
+        assert daemon in ctx.daemons.daemons,\
+            " {} Daemons Not Found".format(daemon)
+
+        daemons = ctx.daemons.daemons[daemon]
+        daemon_stat['daemons'] = daemons
+        daemon_stat['count'] = len(daemons)
+        for name, dstate in daemons.items():
+            if dstate.running():
+                daemon_stat["active"].update({name: dstate})
+            else:
+                daemon_stat["inactive"].update({name: dstate})
+        daemon_stat['active_count'] = len(daemon_stat["active"])
+        daemon_stat['inactive_count'] = len(daemon_stat["inactive"])
+        log.info(" [ {} ] Ceph daemon Stats : {}".format(daemon.upper(), daemon_stat))
+        return daemon_stat
+    except AssertionError as err:
+        log.warn(" {}".format(err))
+        log.warn(" [ {} ] Daemons not available in cluster".format(daemon.upper()))
+        return False
+
+
+def check_ceph_cli_availability(ctx):
+    """
+    Check whether the ceph CLI is usable.
+    The ceph command line can fail when
+    too few MON daemons are available.
+    Args:
+        ctx: ceph context object
+    """
+    __warn_status = "Make sure the required number (2) of MONs are running"
+    mons = ctx.daemons.daemons['ceph.mon']
+    res = [daemon.running() for node, daemon in mons.items() if daemon.running()]
+    if len(res) >= 2:
+        log.info(" CEPH CLI is available")
+        return True
+    log.warn(" {} CEPH CLI not available {}".format("*"*10, "*"*10))
+    log.warn(" Cannot run CEPH CLI commands: {}".format(__warn_status))
+    return False
+
+
 def check_service_status(ctx, dstate, **args):
     """
     check service status and cluster health_status
@@ -22,68 +163,119 @@ def check_service_status(ctx, dstate, **args):
         exit_status: exit status check: true)
     """
+    timeout = 120
+    interval = 5
+    mark = __mark(dstate)
+
     try:
         # Check daemon restart/start status
-        timeout = 120
-        interval = 5
-
         if args.get('timeout'):
             timeout = args['timeout']
         iterations = timeout / interval
 
         exit_status = args.get('exit_status')
+        action = args.get('action', "start")
         while iterations:
-            log.info("Check {} {} daemon status".format(dstate.role,
-                                                        dstate.id_))
-            if dstate.check_status() is not exit_status:
-                log.warn("{} {} is still not {}".format(dstate.role,
-                                                        dstate.id_, exit_status))
-                sleep(interval)
-                iterations -= 1
+            log.info("{} Check daemon status".format(mark))
+            sleep(interval)
+            iterations -= 1
+            try:
+                if dstate.check_status() is not exit_status:
+                    log.warn("{} is still not {}".format(mark, exit_status))
+                    continue
+            except CommandFailedError:
+                daemon_service(dstate, action)
                 continue
            break
         else:
             assert False
 
-        # check cluster health
+        # Check cluster health
+        # Check ceph cli availability, skip if ceph cli not available
         cluster = ctx.managers.keys()[0]
         check_status = args.get('check_status', False)
         check_key = args.get('check_keys')
         health_state = args.get('state')
+        try:
+            if dstate.type_.lower() in [_MON]:
+                assert check_ceph_cli_availability(ctx)
+            elif dstate.type_.lower() in [_MGR, _MDS]:
+                res = get_daemon_info(dstate.type_, ctx)
+                if res['active_count'] > 0:
+                    health_state.append(_CEPH_HEALTH['good'])
+        except AssertionError:
+            return True
+
         while timeout:
             sleep(interval)
             timeout -= interval
-            cluster_status = ctx.managers[cluster].raw_cluster_status()
-            health = cluster_status.get('health')
-            status = health['status']
-            checks = health['checks']
             try:
+                cluster_status = ctx.managers[cluster].raw_cluster_status()
+                health = cluster_status.get('health')
+                status = health['status']
+                checks = health['checks']
+
                 if check_status:
+                    log.info("{} Validate CEPH Health Status".format(mark))
                     assert status in health_state, \
-                        "[ {} ] not found in health status {}".format(health_state, status)
+                        " [ {} ] not found in health status as expected," \
+                        " current status : {}".format(health_state, status)
                     log.info(" Cluster health status : {} as expected".format(status))
+
                 if check_key:
+                    log.info("{} Validate CEPH Health Checks".format(mark))
                     check_key = [check_key] if not isinstance(check_key, list) else check_key
-
                     for chk in check_key:
                         assert chk.upper() in checks, \
-                            "[ {} ] not found in health checks {}".format(chk, checks)
-                        log.info("[ {} ] found in cluster health checks as expected".format(chk))
-                    log.info(" Cluster health status : {}".format(checks))
+                            " [ {} ] not found in health checks {}".format(chk, checks)
+                        log.info(" [ {} ] found in cluster health checks as expected".format(chk))
+                    log.info(" Cluster health checks as expected : {}".format(checks))
                 return health
             except AssertionError as err:
                 log.warn(err)
-                log.warn("Retrying with {} seconds left".format(timeout))
+                log.warn("{} Retrying with {} seconds left".format(mark, timeout))
                 continue
         else:
-            assert False, "[ {} ] not found in health checks".format(health_state)
+            assert False, " [ {} ] not found in health checks".format(health_state)
 
     except AssertionError:
         assert False
 
 
+def wait_for_daemon_healthy(dstate, ctx, timeout=1200):
+    """
+    Wait for the cluster to become healthy
+    1. Check the daemons are available with a healthy status
+    2. Wait for PGs to reach a clean state
+    Args:
+        dstate: Daemon state
+        ctx: ceph context object
+        timeout: timeout in seconds
+    """
+    _type = dstate.type_
+    daemon_count = len(ctx.daemons.iter_daemons_of_role(_MON))
+    mark = __mark(dstate)
+    cluster = ctx.managers.keys()[0]
+
+    try:
+        ctx.managers[cluster].wait_for_mgr_available(timeout=timeout)
+        log.info("{} MGRs are available now".format(mark))
+        ctx.managers[cluster].wait_for_mon_quorum_size(daemon_count,
+                                                       timeout=timeout)
+        log.info("{} MONs have correct quorum size : {}".format(mark, daemon_count))
+        ctx.managers[cluster].wait_for_all_osds_up(timeout=timeout)
+        log.info("{} OSDs are all up".format(mark))
+        ctx.managers[cluster].wait_for_clean(timeout=timeout)
+        log.info("{} CEPH cluster has all Active+Clean PGs".format(mark))
+        ctx.managers[cluster].wait_till_active(timeout=timeout)
+        log.info("{} CEPH cluster is Active".format(mark))
+    except Exception as err:
+        log.error(err)
+        assert False, err
+
+
 def reboot_node(dstate, **args):
     """
     Reboot daemon node
@@ -100,15 +292,17 @@ def reboot_node(dstate, **args):
         interval = args['interval']
 
     try:
-        # reboot node
+        # Reboot node
+        log.info(" [ {} ] Reboot {} daemon node".format(dstate.id_.upper(),
+                                                        dstate.type_.upper()))
         dstate.remote.run(args=["sudo", "shutdown", "-r", "now", run.Raw("&")])
 
-        # wait for ssh reconnection
+        # Wait for ssh reconnection
         assert dstate.remote.reconnect(timeout=timeout, sleep_time=interval),\
             " [ {} ] Reboot failed".format(dstate.id_)
         log.info(" [ {} ] Reboot successful".format(dstate.id_))
         return True
-    except AssertionError as err:
+    except (AssertionError, CommandFailedError) as err:
         assert False, err
 
 
@@ -131,38 +325,60 @@ def ceph_daemon_system_test(ctx, daemon):
         "state": None,
         "check_status": True,
         "verify_status": None,
-        "check_keys": None
+        "check_keys": None,
+        "action": None
     }
 
     try:
         # Get daemon nodes with SystemDState obj from ctx
         daemons = ctx.daemons.daemons.get(daemon)
         for name, dstate in daemons.items():
-            # stop and verify the cluster status
-            dstate.stop()
-            kwargs['exit_status'] = 0
-            kwargs['state'] = [HEALTH['warn']]
-            kwargs['check_keys'] = "{}_down".format(dstate.daemon_type)
+            mark = __mark(dstate)
+            # Stop and verify the cluster status
+            # Wait for 60 secs for clear status
+            log.info("{} System tests - STARTED".format(mark))
+            kwargs['exit_status'] = 0
+            kwargs['state'] = [_CEPH_HEALTH['warn']]
+            kwargs['action'] = "stop"
+            daemon_service(dstate, kwargs['action'])
             check_service_status(ctx, dstate, **kwargs)
+            __wait(60, msg="wait for daemon to be in expected state")
+            log.info("{} STOP daemon system test - Done".format(mark))
 
-            # start and verify the cluster status
-            dstate.restart()
+            # Start and verify the cluster status
+            # Wait for 60 secs for clear status
             kwargs['exit_status'] = None
-            kwargs['state'] = [HEALTH['warn'], HEALTH['good']]
+            kwargs['state'] = [_CEPH_HEALTH['warn'], _CEPH_HEALTH['good']]
             kwargs['check_keys'] = None
+            kwargs['action'] = "start"
+            daemon_service(dstate, kwargs['action'])
             check_service_status(ctx, dstate, **kwargs)
+            __wait(60, msg="wait for daemon to be in expected state")
+            wait_for_daemon_healthy(dstate, ctx, timeout=3600)
+            log.info("{} START daemon system test - Done".format(mark))
 
-            # restart daemon and verify cluster status
-            dstate.restart()
+            # Restart daemon and verify cluster status
+            # Wait for 60 secs for clear status
+            kwargs['action'] = "restart"
+            daemon_service(dstate, kwargs['action'])
             check_service_status(ctx, dstate, **kwargs)
+            __wait(60, msg="wait for daemon to be in expected state")
+            wait_for_daemon_healthy(dstate, ctx, timeout=3600)
+            log.info("{} RESTART daemon system test - Done".format(mark))
 
-            # reboot daemon node and verify cluster status
-            reboot_node(dstate, timeout=600, interval=30)
-            log.info("[ ({}, {}) ] daemon system tests Completed".format(daemon, dstate.id_))
-        return True
+            # Reboot daemon node and verify cluster status
+            assert reboot_node(dstate, timeout=1200, interval=30)
+            __wait(60, msg="wait for node to be in expected state")
+            wait_for_daemon_healthy(dstate, ctx, timeout=3600)
+            log.info("{} REBOOT daemon system test - Done".format(mark))
 
+        log.info("[ {} ] Daemon system tests - PASSED".format(daemon.upper()))
+        return True
     except KeyError as err:
         log.error("No {}(s) found".format(daemon))
         assert False, err
+    except Exception as err:
+        log.error("[ {} ] : System tests - FAILED".format(daemon.upper()))
+        assert False, err
     finally:
-        log.info("Daemon service system tests Completed")
+        log.info("[ {} ] : System tests - COMPLETED".format(daemon.upper()))