qa/suites/rbd/nvmeof: add multi-subsystem setup and thrash test

author Vallari Agrawal <val.agl002@gmail.com>

Thu, 27 Jun 2024 11:18:47 +0000 (16:48 +0530)

committer Alexander Indenbaum <aindenba@redhat.com>

Wed, 19 Nov 2025 18:41:52 +0000 (20:41 +0200)
author Vallari Agrawal <val.agl002@gmail.com>
Thu, 27 Jun 2024 11:18:47 +0000 (16:48 +0530)
committer Alexander Indenbaum <aindenba@redhat.com>
Wed, 19 Nov 2025 18:41:52 +0000 (20:41 +0200)
diff --git a/qa/suites/nvmeof/.qa b/qa/suites/nvmeof/.qa

new file mode 120000 (symlink)

index 0000000..a602a03
--- /dev/null
+++ b/qa/suites/nvmeof/.qa
@@ -0,0 +1 @@
+../.qa/
+\ No newline at end of file
diff --git a/qa/suites/nvmeof/basic/% b/qa/suites/nvmeof/basic/%

new file mode 100644 (file)

index 0000000..e69de29
diff --git a/qa/suites/nvmeof/basic/.qa b/qa/suites/nvmeof/basic/.qa

new file mode 120000 (symlink)

index 0000000..a602a03
--- /dev/null
+++ b/qa/suites/nvmeof/basic/.qa
@@ -0,0 +1 @@
+../.qa/
+\ No newline at end of file
diff --git a/qa/suites/nvmeof/basic/base/.qa b/qa/suites/nvmeof/basic/base/.qa

new file mode 120000 (symlink)

index 0000000..a602a03
--- /dev/null
+++ b/qa/suites/nvmeof/basic/base/.qa
@@ -0,0 +1 @@
+../.qa/
+\ No newline at end of file
diff --git a/qa/suites/nvmeof/basic/base/install.yaml b/qa/suites/nvmeof/basic/base/install.yaml

new file mode 100644 (file)

index 0000000..64b754e
--- /dev/null
+++ b/qa/suites/nvmeof/basic/base/install.yaml
@@ -0,0 +1,15 @@
+use_shaman: True
+tasks:
+- install:
+    extra_packages:
+        - nvme-cli
+- cephadm: 
+    watchdog_setup:
+- cephadm.shell:
+    host.a:
+    # get state before nvmeof deployment
+    - ceph orch status
+    - ceph orch ps
+    - ceph orch host ls
+    - ceph orch device ls
+    - ceph osd lspools
diff --git a/qa/suites/nvmeof/basic/centos_latest.yaml b/qa/suites/nvmeof/basic/centos_latest.yaml

new file mode 120000 (symlink)

index 0000000..bd9854e
--- /dev/null
+++ b/qa/suites/nvmeof/basic/centos_latest.yaml
@@ -0,0 +1 @@
+.qa/distros/supported/centos_latest.yaml
+\ No newline at end of file
diff --git a/qa/suites/nvmeof/basic/clusters/.qa b/qa/suites/nvmeof/basic/clusters/.qa

new file mode 120000 (symlink)

index 0000000..a602a03
--- /dev/null
+++ b/qa/suites/nvmeof/basic/clusters/.qa
@@ -0,0 +1 @@
+../.qa/
+\ No newline at end of file
diff --git a/qa/suites/nvmeof/basic/clusters/2-gateways-2-initiator.yaml b/qa/suites/nvmeof/basic/clusters/2-gateways-2-initiator.yaml

new file mode 100644 (file)

index 0000000..4029491
--- /dev/null
+++ b/qa/suites/nvmeof/basic/clusters/2-gateways-2-initiator.yaml
@@ -0,0 +1,24 @@
+roles:
+- - host.a
+  - mon.a
+  - mgr.x
+  - osd.0
+  - osd.1
+  - client.0
+  - ceph.nvmeof.nvmeof.a
+- - host.b
+  - mon.b
+  - mon.c
+  - osd.2
+  - osd.3
+  - osd.4
+  - client.1
+- - client.2
+- - client.3
+
+overrides:
+  ceph:
+    conf:
+      mon:
+        # cephadm can take up to 5 minutes to bring up remaining mons
+        mon down mkfs grace: 300
diff --git a/qa/suites/nvmeof/basic/conf b/qa/suites/nvmeof/basic/conf

new file mode 120000 (symlink)

index 0000000..4bc0fe8
--- /dev/null
+++ b/qa/suites/nvmeof/basic/conf
@@ -0,0 +1 @@
+.qa/rbd/conf
+\ No newline at end of file
diff --git a/qa/suites/nvmeof/basic/workloads/.qa b/qa/suites/nvmeof/basic/workloads/.qa

new file mode 120000 (symlink)

index 0000000..a602a03
--- /dev/null
+++ b/qa/suites/nvmeof/basic/workloads/.qa
@@ -0,0 +1 @@
+../.qa/
+\ No newline at end of file
diff --git a/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml b/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml

new file mode 100644 (file)

index 0000000..0382cfc
--- /dev/null
+++ b/qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml
@@ -0,0 +1,38 @@
+tasks:
+- nvmeof:
+    client: client.0
+    gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images (full image reference), example "quay.io/ceph/nvmeof:latest"
+    rbd:
+        pool_name: mypool
+        image_name: myimage
+    gateway_config:
+      subsystems_count: 3
+      namespaces_count: 20
+      cli_image: quay.io/ceph/nvmeof-cli:1.2
+
+- cephadm.wait_for_service:
+    service: nvmeof.mypool
+
+- workunit:
+    no_coverage_and_limits: true
+    clients:
+      client.2:
+        - rbd/nvmeof_setup_subsystem.sh
+    env:
+      RBD_POOL: mypool
+      RBD_IMAGE_PREFIX: myimage
+
+- workunit:
+    no_coverage_and_limits: true
+    timeout: 30m
+    clients:
+      client.2:
+        - rbd/nvmeof_basic_tests.sh
+        - rbd/nvmeof_fio_test.sh --start_ns 1 --end_ns 30 --rbd_iostat
+      client.3:
+        - rbd/nvmeof_basic_tests.sh
+        - rbd/nvmeof_fio_test.sh --start_ns 31 --end_ns 60
+    env:
+      RBD_POOL: mypool
+      RBD_IMAGE: myimage
+      IOSTAT_INTERVAL: '10'
diff --git a/qa/suites/nvmeof/thrash/% b/qa/suites/nvmeof/thrash/%

new file mode 100644 (file)

index 0000000..e69de29
diff --git a/qa/suites/nvmeof/thrash/.qa b/qa/suites/nvmeof/thrash/.qa

new file mode 120000 (symlink)

index 0000000..a602a03
--- /dev/null
+++ b/qa/suites/nvmeof/thrash/.qa
@@ -0,0 +1 @@
+../.qa/
+\ No newline at end of file
diff --git a/qa/suites/nvmeof/thrash/base/.qa b/qa/suites/nvmeof/thrash/base/.qa

new file mode 120000 (symlink)

index 0000000..a602a03
--- /dev/null
+++ b/qa/suites/nvmeof/thrash/base/.qa
@@ -0,0 +1 @@
+../.qa/
+\ No newline at end of file
diff --git a/qa/suites/nvmeof/thrash/base/install.yaml b/qa/suites/nvmeof/thrash/base/install.yaml

new file mode 100644 (file)

index 0000000..4b5cea9
--- /dev/null
+++ b/qa/suites/nvmeof/thrash/base/install.yaml
@@ -0,0 +1,15 @@
+use_shaman: True
+tasks:
+- install:
+    extra_packages:
+        - nvme-cli
+- cephadm:
+    watchdog_setup:
+- cephadm.shell:
+    host.a:
+    # get state before nvmeof deployment
+    - ceph orch status
+    - ceph orch ps
+    - ceph orch host ls
+    - ceph orch device ls
+    - ceph osd lspools
diff --git a/qa/suites/nvmeof/thrash/centos_latest.yaml b/qa/suites/nvmeof/thrash/centos_latest.yaml

new file mode 120000 (symlink)

index 0000000..bd9854e
--- /dev/null
+++ b/qa/suites/nvmeof/thrash/centos_latest.yaml
@@ -0,0 +1 @@
+.qa/distros/supported/centos_latest.yaml
+\ No newline at end of file
diff --git a/qa/suites/nvmeof/thrash/clusters/.qa b/qa/suites/nvmeof/thrash/clusters/.qa

new file mode 120000 (symlink)

index 0000000..a602a03
--- /dev/null
+++ b/qa/suites/nvmeof/thrash/clusters/.qa
@@ -0,0 +1 @@
+../.qa/
+\ No newline at end of file
diff --git a/qa/suites/nvmeof/thrash/clusters/3-gateways-1-initiator.yaml b/qa/suites/nvmeof/thrash/clusters/3-gateways-1-initiator.yaml

new file mode 100644 (file)

index 0000000..afe0ed7
--- /dev/null
+++ b/qa/suites/nvmeof/thrash/clusters/3-gateways-1-initiator.yaml
@@ -0,0 +1,30 @@
+roles:
+- - host.a
+  - mon.a
+  - mgr.x
+  - osd.0
+  - osd.1
+  - client.0
+  - ceph.nvmeof.nvmeof.a
+- - host.b
+  - mon.b
+  - osd.2
+  - osd.3
+  - osd.4
+  - client.1
+  - ceph.nvmeof.nvmeof.b
+- - host.c
+  - mon.c
+  - osd.5
+  - osd.6
+  - osd.7
+  - client.2
+  - ceph.nvmeof.nvmeof.c
+- - client.3 # initiator
+
+overrides:
+  ceph:
+    conf:
+      mon:
+        # cephadm can take up to 5 minutes to bring up remaining mons
+        mon down mkfs grace: 300
diff --git a/qa/suites/nvmeof/thrash/conf b/qa/suites/nvmeof/thrash/conf

new file mode 120000 (symlink)

index 0000000..4bc0fe8
--- /dev/null
+++ b/qa/suites/nvmeof/thrash/conf
@@ -0,0 +1 @@
+.qa/rbd/conf
+\ No newline at end of file
diff --git a/qa/suites/nvmeof/thrash/gateway-initiator-setup/.qa b/qa/suites/nvmeof/thrash/gateway-initiator-setup/.qa

new file mode 120000 (symlink)

index 0000000..a602a03
--- /dev/null
+++ b/qa/suites/nvmeof/thrash/gateway-initiator-setup/.qa
@@ -0,0 +1 @@
+../.qa/
+\ No newline at end of file
diff --git a/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml b/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml

new file mode 100644 (file)

index 0000000..3e5262f
--- /dev/null
+++ b/qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml
@@ -0,0 +1,24 @@
+tasks:
+- nvmeof:
+    client: client.0
+    gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images (full image reference), example "quay.io/ceph/nvmeof:latest"
+    rbd:
+      pool_name: mypool
+      image_name_prefix: myimage
+    gateway_config:
+      subsystems_count: 3
+      namespaces_count: 20 # each subsystem
+      cli_image: quay.io/ceph/nvmeof-cli:1.2
+
+- cephadm.wait_for_service:
+    service: nvmeof.mypool
+
+- workunit:
+    no_coverage_and_limits: true
+    clients:
+      client.3:
+        - rbd/nvmeof_setup_subsystem.sh
+        - rbd/nvmeof_basic_tests.sh
+    env:
+      RBD_POOL: mypool
+      RBD_IMAGE_PREFIX: myimage
diff --git a/qa/suites/nvmeof/thrash/thrashers/.qa b/qa/suites/nvmeof/thrash/thrashers/.qa

new file mode 120000 (symlink)

index 0000000..a602a03
--- /dev/null
+++ b/qa/suites/nvmeof/thrash/thrashers/.qa
@@ -0,0 +1 @@
+../.qa/
+\ No newline at end of file
diff --git a/qa/suites/nvmeof/thrash/thrashers/nvmeof_mon_thrash.yaml b/qa/suites/nvmeof/thrash/thrashers/nvmeof_mon_thrash.yaml

new file mode 100644 (file)

index 0000000..4306de9
--- /dev/null
+++ b/qa/suites/nvmeof/thrash/thrashers/nvmeof_mon_thrash.yaml
@@ -0,0 +1,24 @@
+overrides:
+  ceph:
+    log-ignorelist: 
+      # mon thrashing
+      - MON_DOWN
+      - mons down
+      - mon down
+      - out of quorum
+      # nvmeof daemon thrashing
+      - CEPHADM_FAILED_DAEMON
+      - is in error state
+      - failed cephadm daemon
+
+tasks:
+- nvmeof.thrash:
+    checker_host: 'client.3'
+    switch_thrashers: True
+
+- mon_thrash:
+    revive_delay: 60
+    thrash_delay: 60
+    thrash_many: true
+    switch_thrashers: True
+    logger: '[nvmeof.thrasher.mon_thrasher]'
diff --git a/qa/suites/nvmeof/thrash/thrashers/nvmeof_thrash.yaml b/qa/suites/nvmeof/thrash/thrashers/nvmeof_thrash.yaml

new file mode 100644 (file)

index 0000000..0271e41
--- /dev/null
+++ b/qa/suites/nvmeof/thrash/thrashers/nvmeof_thrash.yaml
@@ -0,0 +1,11 @@
+overrides:
+  ceph:
+    log-ignorelist:  
+      # nvmeof daemon thrashing
+      - CEPHADM_FAILED_DAEMON
+      - is in error state
+      - failed cephadm daemon
+
+tasks:
+- nvmeof.thrash:
+    checker_host: 'client.3'
diff --git a/qa/suites/nvmeof/thrash/workloads/.qa b/qa/suites/nvmeof/thrash/workloads/.qa

new file mode 120000 (symlink)

index 0000000..a602a03
--- /dev/null
+++ b/qa/suites/nvmeof/thrash/workloads/.qa
@@ -0,0 +1 @@
+../.qa/
+\ No newline at end of file
diff --git a/qa/suites/nvmeof/thrash/workloads/fio.yaml b/qa/suites/nvmeof/thrash/workloads/fio.yaml

new file mode 100644 (file)

index 0000000..fa7153d
--- /dev/null
+++ b/qa/suites/nvmeof/thrash/workloads/fio.yaml
@@ -0,0 +1,11 @@
+tasks:
+- workunit:
+    no_coverage_and_limits: true
+    timeout: 30m
+    clients:
+      client.3:
+        - rbd/nvmeof_fio_test.sh --rbd_iostat
+    env:
+      RBD_POOL: mypool
+      IOSTAT_INTERVAL: '10'
+      RUNTIME: '600'
diff --git a/qa/suites/rbd/nvmeof/% b/qa/suites/rbd/nvmeof/%

deleted file mode 100644 (file)

index e69de29..0000000
diff --git a/qa/suites/rbd/nvmeof/.qa b/qa/suites/rbd/nvmeof/.qa

deleted file mode 120000 (symlink)

index a602a03..0000000
--- a/qa/suites/rbd/nvmeof/.qa
+++ /dev/null
@@ -1 +0,0 @@
-../.qa/
-\ No newline at end of file
diff --git a/qa/suites/rbd/nvmeof/base/.qa b/qa/suites/rbd/nvmeof/base/.qa

deleted file mode 120000 (symlink)

index a602a03..0000000
--- a/qa/suites/rbd/nvmeof/base/.qa
+++ /dev/null
@@ -1 +0,0 @@
-../.qa/
-\ No newline at end of file
diff --git a/qa/suites/rbd/nvmeof/base/install.yaml b/qa/suites/rbd/nvmeof/base/install.yaml

deleted file mode 100644 (file)

index 6fc91d8..0000000
--- a/qa/suites/rbd/nvmeof/base/install.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-use_shaman: True
-tasks:
-- install:
-- cephadm:
-- cephadm.shell:
-    host.a:
-    # get state before nvmeof deployment
-    - ceph orch status
-    - ceph orch ps
-    - ceph orch host ls
-    - ceph orch device ls
-    - ceph osd lspools
-
diff --git a/qa/suites/rbd/nvmeof/centos_latest.yaml b/qa/suites/rbd/nvmeof/centos_latest.yaml

deleted file mode 120000 (symlink)

index bd9854e..0000000
--- a/qa/suites/rbd/nvmeof/centos_latest.yaml
+++ /dev/null
@@ -1 +0,0 @@
-.qa/distros/supported/centos_latest.yaml
-\ No newline at end of file
diff --git a/qa/suites/rbd/nvmeof/cluster/+ b/qa/suites/rbd/nvmeof/cluster/+

deleted file mode 100644 (file)

index e69de29..0000000
diff --git a/qa/suites/rbd/nvmeof/cluster/.qa b/qa/suites/rbd/nvmeof/cluster/.qa

deleted file mode 120000 (symlink)

index a602a03..0000000
--- a/qa/suites/rbd/nvmeof/cluster/.qa
+++ /dev/null
@@ -1 +0,0 @@
-../.qa/
-\ No newline at end of file
diff --git a/qa/suites/rbd/nvmeof/cluster/fixed-3.yaml b/qa/suites/rbd/nvmeof/cluster/fixed-3.yaml

deleted file mode 100644 (file)

index f417079..0000000
--- a/qa/suites/rbd/nvmeof/cluster/fixed-3.yaml
+++ /dev/null
@@ -1,15 +0,0 @@
-roles:
-- - host.a
-  - mon.a
-  - mgr.x
-  - osd.0
-  - osd.1
-  - client.0
-  - ceph.nvmeof.nvmeof.a
-- - host.b
-  - mon.b
-  - osd.2
-  - osd.3
-  - osd.4
-  - client.1
-- - client.2
diff --git a/qa/suites/rbd/nvmeof/cluster/openstack.yaml b/qa/suites/rbd/nvmeof/cluster/openstack.yaml

deleted file mode 100644 (file)

index 40fef47..0000000
--- a/qa/suites/rbd/nvmeof/cluster/openstack.yaml
+++ /dev/null
@@ -1,8 +0,0 @@
-openstack:
-  - machine:
-      disk: 40 # GB
-      ram: 8000 # MB
-      cpus: 1
-    volumes: # attached to each instance
-      count: 4
-      size: 30 # GB
diff --git a/qa/suites/rbd/nvmeof/workloads/.qa b/qa/suites/rbd/nvmeof/workloads/.qa

deleted file mode 120000 (symlink)

index a602a03..0000000
--- a/qa/suites/rbd/nvmeof/workloads/.qa
+++ /dev/null
@@ -1 +0,0 @@
-../.qa/
-\ No newline at end of file
diff --git a/qa/suites/rbd/nvmeof/workloads/nvmeof_initiator.yaml b/qa/suites/rbd/nvmeof/workloads/nvmeof_initiator.yaml

deleted file mode 100644 (file)

index bbb9b0a..0000000
--- a/qa/suites/rbd/nvmeof/workloads/nvmeof_initiator.yaml
+++ /dev/null
@@ -1,27 +0,0 @@
-tasks:
-- nvmeof:
-    client: client.0
-    version: latest  # "default" uses packaged version; change to test specific nvmeof images, example "latest"
-    rbd:
-        pool_name: mypool
-        image_name: myimage
-    gateway_config:
-        source: host.a 
-        target: client.2
-        vars:
-            cli_version: latest
-
-- cephadm.wait_for_service:
-    service: nvmeof.mypool
-
-- workunit:
-    no_coverage_and_limits: true
-    clients:
-      client.2:
-        - rbd/nvmeof_initiator.sh
-        - rbd/nvmeof_basic_tests.sh
-        - rbd/nvmeof_fio_test.sh
-    env:
-      RBD_POOL: mypool
-      RBD_IMAGE: myimage
-      IOSTAT_INTERVAL: '10'
diff --git a/qa/tasks/cephadm.py b/qa/tasks/cephadm.py

index 1f885faf53dd76aa618710d8d216b5c02b5e6a76..2f5f5c3fec331197b811bbadf217e06993471577 100644 (file)
--- a/qa/tasks/cephadm.py
+++ b/qa/tasks/cephadm.py
@@ -28,6 +28,7 @@ from teuthology.config import config as teuth_config
  from teuthology.exceptions import ConfigError, CommandFailedError
  from textwrap import dedent
  from tasks.cephfs.filesystem import MDSCluster, Filesystem
+from tasks.daemonwatchdog import DaemonWatchdog
  from tasks.util import chacra
  
  # these items we use from ceph.py should probably eventually move elsewhere
@@ -1405,6 +1406,15 @@ def ceph_clients(ctx, config):
              remote.sudo_write_file(client_keyring, keyring, mode='0644')
      yield
  
+@contextlib.contextmanager
+def watchdog_setup(ctx, config):  # optionally starts a DaemonWatchdog for config['cluster']
+    if 'watchdog_setup' in config:  # presence of the key enables it, whatever its value
+        ctx.ceph[config['cluster']].thrashers = []  # this same list object is handed to the watchdog
+        ctx.ceph[config['cluster']].watchdog = DaemonWatchdog(ctx, config, ctx.ceph[config['cluster']].thrashers)
+        ctx.ceph[config['cluster']].watchdog.start()
+    else:
+        ctx.ceph[config['cluster']].watchdog = None  # stop() tests this attribute for truthiness
+    yield
  
  @contextlib.contextmanager
  def ceph_initial():
@@ -1445,10 +1455,11 @@ def stop(ctx, config):
          cluster, type_, id_ = teuthology.split_role(role)
          ctx.daemons.get_daemon(type_, id_, cluster).stop()
          clusters.add(cluster)
-
-#    for cluster in clusters:
-#        ctx.ceph[cluster].watchdog.stop()
-#        ctx.ceph[cluster].watchdog.join()
+    for cluster in clusters:  # check per cluster; `cluster` is unbound here if no daemon matched
+        watchdog = getattr(ctx.ceph[cluster], 'watchdog', None)
+        if watchdog:  # None/absent when watchdog_setup was not requested
+            watchdog.stop()
+            watchdog.join()
  
      yield
  
@@ -2157,6 +2168,7 @@ def task(ctx, config):
  
      :param ctx: the argparse.Namespace object
      :param config: the config dict
+    :param watchdog_setup: start DaemonWatchdog to watch daemons for failures
      """
      if config is None:
          config = {}
@@ -2243,6 +2255,8 @@ def task(ctx, config):
              lambda: ceph_monitoring('grafana', ctx=ctx, config=config),
              lambda: ceph_clients(ctx=ctx, config=config),
              lambda: create_rbd_pool(ctx=ctx, config=config),
+            lambda: conf_epoch(ctx=ctx, config=config),
+            lambda: watchdog_setup(ctx=ctx, config=config),
      ):
          try:
              if config.get('wait-for-healthy', True):
diff --git a/qa/tasks/daemonwatchdog.py b/qa/tasks/daemonwatchdog.py

index ceffd56ebb1ad7e745b9c692981bfa079b28cff5..234a26e10ea24df5cb7476c810d09866626174af 100644 (file)
--- a/qa/tasks/daemonwatchdog.py
+++ b/qa/tasks/daemonwatchdog.py
@@ -62,11 +62,11 @@ class DaemonWatchdog(Greenlet):
                  except:
                      self.logger.exception("ignoring exception:")
          daemons = []
-        daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('osd', cluster=self.cluster)))
-        daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('mds', cluster=self.cluster)))
-        daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('mon', cluster=self.cluster)))
-        daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('rgw', cluster=self.cluster)))
-        daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('mgr', cluster=self.cluster)))
+        daemons.extend(filter(lambda daemon: not daemon.finished(), self.ctx.daemons.iter_daemons_of_role('osd', cluster=self.cluster)))
+        daemons.extend(filter(lambda daemon: not daemon.finished(), self.ctx.daemons.iter_daemons_of_role('mds', cluster=self.cluster)))
+        daemons.extend(filter(lambda daemon: not daemon.finished(), self.ctx.daemons.iter_daemons_of_role('mon', cluster=self.cluster)))
+        daemons.extend(filter(lambda daemon: not daemon.finished(), self.ctx.daemons.iter_daemons_of_role('rgw', cluster=self.cluster)))
+        daemons.extend(filter(lambda daemon: not daemon.finished(), self.ctx.daemons.iter_daemons_of_role('mgr', cluster=self.cluster)))
  
          for daemon in daemons:
              try:
@@ -90,11 +90,11 @@ class DaemonWatchdog(Greenlet):
              mgrs = self.ctx.daemons.iter_daemons_of_role('mgr', cluster=self.cluster)
  
              daemon_failures = []
-            daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, osds))
-            daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, mons))
-            daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, mdss))
-            daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, rgws))
-            daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, mgrs))
+            daemon_failures.extend(filter(lambda daemon: daemon.finished(), osds))
+            daemon_failures.extend(filter(lambda daemon: daemon.finished(), mons))
+            daemon_failures.extend(filter(lambda daemon: daemon.finished(), mdss))
+            daemon_failures.extend(filter(lambda daemon: daemon.finished(), rgws))
+            daemon_failures.extend(filter(lambda daemon: daemon.finished(), mgrs))
  
              for daemon in daemon_failures:
                  name = daemon.role + '.' + daemon.id_
diff --git a/qa/tasks/mon_thrash.py b/qa/tasks/mon_thrash.py

index df446d06a3a7407b64f9af0d958462e3c9b252c0..34aa1f9cc9e2495408fbe46a029238236d483820 100644 (file)
--- a/qa/tasks/mon_thrash.py
+++ b/qa/tasks/mon_thrash.py
@@ -8,6 +8,7 @@ import time
  import gevent
  import json
  import math
+from gevent.event import Event
  from teuthology import misc as teuthology
  from teuthology.contextutil import safe_while
  from tasks import ceph_manager
@@ -43,6 +44,10 @@ class MonitorThrasher(Thrasher):
                          the monitor (default: 10)
      thrash_delay        Number of seconds to wait in-between
                          test iterations (default: 0)
+    switch_thrashers:   Toggle this to switch between thrashers so it waits until all
+                        thrashers are done thrashing before proceeding. And then
+                        wait until all thrashers are done reviving before proceeding.
+                        (default: false) 
      store_thrash        Thrash monitor store before killing the monitor being thrashed (default: False)
      store_thrash_probability  Probability of thrashing a monitor's store
                                (default: 50)
@@ -93,7 +98,7 @@ class MonitorThrasher(Thrasher):
          self.manager = manager
          self.manager.wait_for_clean()
  
-        self.stopping = False
+        self.stopping = Event()
          self.logger = logger
          self.config = config
          self.name = name
@@ -101,6 +106,9 @@ class MonitorThrasher(Thrasher):
          if self.config is None:
              self.config = dict()
  
+        if self.config.get("switch_thrashers"): 
+            self.switch_thrasher = Event()
+
          """ Test reproducibility """
          self.random_seed = self.config.get('seed', None)
  
@@ -159,7 +167,7 @@ class MonitorThrasher(Thrasher):
          """
          Break out of this processes thrashing loop.
          """
-        self.stopping = True
+        self.stopping.set()
          self.thread.get()
  
      def stop_and_join(self):
@@ -224,7 +232,6 @@ class MonitorThrasher(Thrasher):
          """
          Revive the monitor specified
          """
-        self.log('killing mon.{id}'.format(id=mon))
          self.log('reviving mon.{id}'.format(id=mon))
          self.manager.revive_mon(mon)
  
@@ -270,6 +277,28 @@ class MonitorThrasher(Thrasher):
              # Allow successful completion so gevent doesn't see an exception.
              # The DaemonWatchdog will observe the error and tear down the test.
  
+    def switch_task(self):
+        """
+        Pause the mon thrasher until the other thrashers are done with their iteration.
+        This keeps multiple thrashers in step, like:
+        1. thrasher-1 and thrasher-2: thrash daemons in parallel
+        2. thrasher-1 and thrasher-2: revive daemons in parallel
+        This allows checks after each thrashing and reviving iteration; no-op unless 'switch_thrashers' is set.
+        """
+        if not hasattr(self, 'switch_thrasher'):
+            return  # the event only exists when 'switch_thrashers' was enabled in the config
+        self.switch_thrasher.set()  # signal that this thrasher has reached the sync point
+        thrashers = self.ctx.ceph[self.config.get('cluster')].thrashers
+        for t in thrashers:
+            if not isinstance(t, MonitorThrasher) and hasattr(t, 'switch_thrasher') and (
+                isinstance(t.stopping, Event) and not t.stopping.is_set()
+            ):  # only wait on non-mon thrashers that opted in and have not been stopped
+                other_thrasher = t
+                self.log('switch_task: waiting for others thrashers')
+                other_thrasher.switch_thrasher.wait(300)  # result unused: a timeout is not distinguished from a signal
+                self.log('switch_task: done waiting for the other thrasher')
+                other_thrasher.switch_thrasher.clear()  # presumably re-arms the event for the next phase
+
      def _do_thrash(self):
          """
          Continuously loop and thrash the monitors.
@@ -289,7 +318,7 @@ class MonitorThrasher(Thrasher):
                  fp=self.freeze_mon_probability,fd=self.freeze_mon_duration,
                  ))
  
-        while not self.stopping:
+        while not self.stopping.is_set():
              mons = _get_mons(self.ctx)
              self.manager.wait_for_mon_quorum_size(len(mons))
              self.log('making sure all monitors are in the quorum')
@@ -350,6 +379,8 @@ class MonitorThrasher(Thrasher):
                  delay=self.revive_delay))
              time.sleep(self.revive_delay)
  
+            self.switch_task()
+
              for mon in mons_to_kill:
                  self.revive_mon(mon)
              # do more freezes
@@ -385,6 +416,8 @@ class MonitorThrasher(Thrasher):
                      delay=self.thrash_delay))
                  time.sleep(self.thrash_delay)
  
+            self.switch_task()
+
          #status after thrashing
          if self.mds_failover:
              status = self.mds_cluster.status()
@@ -411,6 +444,8 @@ def task(ctx, config):
      if 'cluster' not in config:
          config['cluster'] = 'ceph'
  
+    logger = config.get('logger', 'mon_thrasher')
+
      log.info('Beginning mon_thrash...')
      first_mon = teuthology.get_first_mon(ctx, config)
      (mon,) = ctx.cluster.only(first_mon).remotes.keys()
@@ -421,7 +456,7 @@ def task(ctx, config):
          )
      thrash_proc = MonitorThrasher(ctx,
          manager, config, "MonitorThrasher",
-        logger=log.getChild('mon_thrasher'))
+        logger=log.getChild(logger))
      ctx.ceph[config['cluster']].thrashers.append(thrash_proc)
      try:
          log.debug('Yielding')
diff --git a/qa/tasks/nvmeof.py b/qa/tasks/nvmeof.py

index b75d00d93ae80e38620e4164a62e17a2b41c2c8c..687fa41fe0d8a008f3c462ff2e93a8ddcce4ce3b 100644 (file)
--- a/qa/tasks/nvmeof.py
+++ b/qa/tasks/nvmeof.py
@@ -1,10 +1,18 @@
  import logging
+import random
+import time
+from collections import defaultdict
+from datetime import datetime
  from textwrap import dedent
+from gevent.event import Event
+from gevent.greenlet import Greenlet
  from teuthology.task import Task
  from teuthology import misc
  from teuthology.exceptions import ConfigError
+from teuthology.orchestra import run
  from tasks.util import get_remote_for_role
  from tasks.cephadm import _shell
+from tasks.thrasher import Thrasher
  
  log = logging.getLogger(__name__)
  
@@ -50,7 +58,7 @@ class Nvmeof(Task):
          self.set_gateway_cfg()
  
      def _set_defaults(self):
-        self.gateway_image = self.config.get('version', 'default')
+        self.gateway_image = self.config.get('gw_image', 'default')
  
          rbd_config = self.config.get('rbd', {})
          self.poolname = rbd_config.get('pool_name', 'mypool')
@@ -58,13 +66,14 @@ class Nvmeof(Task):
          self.rbd_size = rbd_config.get('rbd_size', 1024*8)
  
          gateway_config = self.config.get('gateway_config', {})
-        conf_vars = gateway_config.get('vars', {})
-        self.cli_image = conf_vars.get('cli_version', 'latest')
-        self.bdev = conf_vars.get('bdev', 'mybdev')
-        self.serial = conf_vars.get('serial', 'SPDK00000000000001')
-        self.nqn = conf_vars.get('nqn', 'nqn.2016-06.io.spdk:cnode1')
-        self.port = conf_vars.get('port', '4420')
-        self.srport = conf_vars.get('srport', '5500')
+        self.cli_image = gateway_config.get('cli_image', 'quay.io/ceph/nvmeof-cli:latest')
+        self.nqn_prefix = gateway_config.get('subsystem_nqn_prefix', 'nqn.2016-06.io.spdk:cnode')
+        self.subsystems_count = gateway_config.get('subsystems_count', 1) 
+        self.namespaces_count = gateway_config.get('namespaces_count', 1) # namespaces per subsystem
+        self.bdev = gateway_config.get('bdev', 'mybdev')
+        self.serial = gateway_config.get('serial', 'SPDK00000000000001')
+        self.port = gateway_config.get('port', '4420')
+        self.srport = gateway_config.get('srport', '5500')
  
      def deploy_nvmeof(self):
          """
@@ -87,13 +96,13 @@ class Nvmeof(Task):
                  daemons[role] = (remote, id_)
  
          if nodes:
-            image = self.gateway_image
-            if (image != "default"):
-                log.info(f'[nvmeof]: ceph config set mgr mgr/cephadm/container_image_nvmeof quay.io/ceph/nvmeof:{image}')
+            gw_image = self.gateway_image
+            if (gw_image != "default"):
+                log.info(f'[nvmeof]: ceph config set mgr mgr/cephadm/container_image_nvmeof {gw_image}')
                  _shell(self.ctx, self.cluster_name, self.remote, [
                      'ceph', 'config', 'set', 'mgr', 
                      'mgr/cephadm/container_image_nvmeof',
-                    f'quay.io/ceph/nvmeof:{image}'
+                    gw_image
                  ])
  
              poolname = self.poolname
@@ -115,10 +124,14 @@ class Nvmeof(Task):
                  '--placement', str(len(nodes)) + ';' + ';'.join(nodes)
              ])
  
-            log.info(f'[nvmeof]: rbd create {poolname}/{imagename} --size {self.rbd_size}')
-            _shell(self.ctx, self.cluster_name, self.remote, [
-                'rbd', 'create', f'{poolname}/{imagename}', '--size', f'{self.rbd_size}'
-            ])
+            total_images = int(self.namespaces_count) * int(self.subsystems_count)
+            log.info(f'[nvmeof]: creating {total_images} images')
+            for i in range(1, total_images + 1):
+                imagename = self.image_name_prefix + str(i)
+                log.info(f'[nvmeof]: rbd create {poolname}/{imagename} --size {self.rbd_size}')
+                _shell(self.ctx, self.cluster_name, self.remote, [
+                    'rbd', 'create', f'{poolname}/{imagename}', '--size', f'{self.rbd_size}'
+                ])
  
          for role, i in daemons.items():
              remote, id_ = i
@@ -144,15 +157,16 @@ class Nvmeof(Task):
          gateway_name = ""
          nvmeof_daemons = self.ctx.daemons.iter_daemons_of_role('nvmeof', cluster=self.cluster_name)
          for daemon in nvmeof_daemons:
-            if ip_address == daemon.remote.ip_address:
-                gateway_name = daemon.name()
+            gateway_names += [daemon.remote.shortname]
+            gateway_ips += [daemon.remote.ip_address]
          conf_data = dedent(f"""
-            NVMEOF_GATEWAY_IP_ADDRESS={ip_address}
-            NVMEOF_GATEWAY_NAME={gateway_name}
-            NVMEOF_CLI_IMAGE="quay.io/ceph/nvmeof-cli:{self.cli_image}"
-            NVMEOF_BDEV={self.bdev}
-            NVMEOF_SERIAL={self.serial}
-            NVMEOF_NQN={self.nqn}
+            NVMEOF_GATEWAY_IP_ADDRESSES={",".join(gateway_ips)}
+            NVMEOF_GATEWAY_NAMES={",".join(gateway_names)}
+            NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS={ip_address}
+            NVMEOF_CLI_IMAGE="{self.cli_image}"
+            NVMEOF_SUBSYSTEMS_PREFIX={self.nqn_prefix}
+            NVMEOF_SUBSYSTEMS_COUNT={self.subsystems_count}
+            NVMEOF_NAMESPACES_COUNT={self.namespaces_count}
              NVMEOF_PORT={self.port}
              NVMEOF_SRPORT={self.srport}
              """)
@@ -165,4 +179,280 @@ class Nvmeof(Task):
          log.info("[nvmeof]: executed set_gateway_cfg successfully!")
  
  
+class NvmeofThrasher(Thrasher, Greenlet):
+    """
+    How it works::
+
+    - pick a nvmeof daemon
+    - kill it
+    - wait for other thrashers to finish thrashing (if switch_thrashers True) 
+    - sleep for 'revive_delay' seconds
+    - do some checks after thrashing ('do_checks' method) 
+    - revive daemons
+    - wait for other thrashers to finish reviving (if switch_thrashers True)
+    - sleep for 'thrash_delay' seconds
+    - do some checks after reviving ('do_checks' method) 
+
+    
+    Options::
+
+    seed:               Seed to use on the RNG to reproduce a previous
+                        behavior (default: None; i.e., not set)
+    checker_host:       Initiator client on which verification tests would 
+                        run during thrashing (mandatory option)
+    switch_thrashers:   Toggle this to switch between thrashers so it waits until all
+                        thrashers are done thrashing before proceeding. And then
+                        wait until all thrashers are done reviving before proceeding.
+                        (default: false)          
+    randomize:          Enables randomization and use the max/min values. (default: true)
+    max_thrash:         Maximum number of daemons that can be thrashed at a time. 
+                        (default: num_of_daemons-1, minimum of 1 daemon should be up)
+    min_thrash_delay:   Minimum number of seconds to delay before thrashing again. 
+                        (default: 60)
+    max_thrash_delay:   Maximum number of seconds to delay before thrashing again. 
+                        (default: min_thrash_delay + 30)
+    min_revive_delay:   Minimum number of seconds to delay before bringing back a 
+                        thrashed daemon. (default: 100)
+    max_revive_delay:   Maximum number of seconds to delay before bringing back a 
+                        thrashed daemon. (default: min_revive_delay + 30)
+
+    daemon_max_thrash_times:
+                        For now, NVMeoF daemons have a limitation that each daemon can
+                        be thrashed only 3 times in a span of 30 mins. This option
+                        sets the number of times a daemon can be thrashed in a period
+                        of time. (default: 3)
+    daemon_max_thrash_period:
+                        This option goes with the above option. It sets the period of time
+                        over which each daemon can be thrashed daemon_max_thrash_times
+                        times. Time period in seconds. (default: 1800, i.e. 30 mins)
+    
+
+    For example::
+    tasks:
+    - nvmeof.thrash:
+        checker_host: 'client.3'
+        switch_thrashers: True
+
+    - mon_thrash:
+        switch_thrashers: True
+
+    - workunit:
+        clients:
+            client.3:
+            - rbd/nvmeof_fio_test.sh --rbd_iostat
+        env:
+            RBD_POOL: mypool
+            IOSTAT_INTERVAL: '10'
+    
+    """
+    def __init__(self, ctx, config, daemons) -> None:
+        super(NvmeofThrasher, self).__init__()
+
+        if config is None:
+            config = dict()  # normalise first: the assignment below must not undo the default
+        self.config = config
+        self.ctx = ctx
+        self.daemons = daemons
+        self.logger = log.getChild('[nvmeof.thrasher]')
+        self.stopping = Event()
+        if self.config.get("switch_thrashers"):
+            self.switch_thrasher = Event()  # only present when switching is enabled; see switch_task()
+        self.checker_host = get_remote_for_role(self.ctx, self.config.get('checker_host'))
+        self.devices = self._get_devices(self.checker_host)
+
+        # Random seed: fall back to the current time when none is configured
+        self.random_seed = self.config.get('seed', None)
+        if self.random_seed is None:
+            self.random_seed = int(time.time())
+
+        self.rng = random.Random()
+        self.rng.seed(int(self.random_seed))
+
+        # Thrashing params
+        self.randomize = bool(self.config.get('randomize', True))
+        self.max_thrash_daemons = int(self.config.get('max_thrash', len(self.daemons) - 1))
+
+        # Limits on thrashing each daemon
+        self.daemon_max_thrash_times = int(self.config.get('daemon_max_thrash_times', 3))
+        self.daemon_max_thrash_period = int(self.config.get('daemon_max_thrash_period', 30 * 60)) # seconds
+
+        self.min_thrash_delay = int(self.config.get('min_thrash_delay', 60))
+        self.max_thrash_delay = int(self.config.get('max_thrash_delay', self.min_thrash_delay + 30))
+        self.min_revive_delay = int(self.config.get('min_revive_delay', 100))
+        self.max_revive_delay = int(self.config.get('max_revive_delay', self.min_revive_delay + 30))
+
+    def _get_devices(self, remote):
+        GET_DEVICE_CMD = "sudo nvme list --output-format=json | " \
+            "jq -r '.Devices | sort_by(.NameSpace) | .[] | select(.ModelNumber == \"Ceph bdev Controller\") | .DevicePath'"
+        devices = remote.sh(GET_DEVICE_CMD).split()
+        return devices
+    
+    def log(self, x):
+        self.logger.info(x)
+
+    def _run(self): # overriding 
+        try:
+            self.do_thrash()
+        except Exception as e:
+            self.set_thrasher_exception(e)
+            self.logger.exception("exception:")
+            # allow successful completion so gevent doesn't see an exception...
+            # The DaemonWatchdog will observe the error and tear down the test.
+    
+    def stop(self):
+        self.stopping.set()
+
+    def do_checks(self):
+        """
+        Run some checks to see if everything is running well during thrashing.
+        """
+        self.log('display and verify stats:')
+        for d in self.daemons:
+            d.remote.sh(d.status_cmd, check_status=False)
+        check_cmd = [
+            'ceph', 'orch', 'ls',
+            run.Raw('&&'), 'ceph', 'orch', 'ps', '--daemon-type', 'nvmeof',
+            run.Raw('&&'), 'ceph', 'health', 'detail',
+            run.Raw('&&'), 'ceph', '-s',
+        ]
+        for dev in self.devices:
+            check_cmd += [
+                run.Raw('&&'), 'sudo', 'nvme', 'list-subsys', dev,
+                run.Raw('|'), 'grep', 'live optimized'
+            ] 
+        self.checker_host.run(args=check_cmd).wait()        
+
+    def switch_task(self):
+        """
+        Pause nvmeof thrasher till other thrashers are done with their iteration.
+        This method would help to sync between multiple thrashers, like:
+        1. thrasher-1 and thrasher-2: thrash daemons in parallel
+        2. thrasher-1 and thrasher-2: revive daemons in parallel 
+        This allows us to run some checks after each thrashing and reviving iteration.
+        """
+        if not hasattr(self, 'switch_thrasher'):
+            return
+        self.switch_thrasher.set()
+        thrashers = self.ctx.ceph[self.config.get('cluster')].thrashers
+        for t in thrashers:
+            if not isinstance(t, NvmeofThrasher) and hasattr(t, 'switch_thrasher') and ( 
+                isinstance(t.stopping, Event) and not t.stopping.is_set()
+            ):
+                other_thrasher = t
+                self.log('switch_task: waiting for other thrasher')
+                other_thrasher.switch_thrasher.wait(300)
+                self.log('switch_task: done waiting for the other thrasher')
+                other_thrasher.switch_thrasher.clear()
+
+    def do_thrash(self):
+        self.log('start thrashing')
+        self.log(f'seed: {self.random_seed}, '\
+                 f'max thrash delay: {self.max_thrash_delay}, min thrash delay: {self.min_thrash_delay} '\
+                 f'max revive delay: {self.max_revive_delay}, min revive delay: {self.min_revive_delay} '\
+                 f'daemons: {len(self.daemons)} '\
+                )
+        daemons_thrash_history = defaultdict(list)
+        summary = []
+
+        while not self.stopping.is_set():
+            killed_daemons = []
+
+            weight = 1.0 / len(self.daemons)
+            count = 0
+            for daemon in self.daemons:
+                skip = self.rng.uniform(0.0, 1.0)
+                if weight <= skip:
+                    self.log('skipping daemon {label} with skip ({skip}) > weight ({weight})'.format(
+                        label=daemon.id_, skip=skip, weight=weight))
+                    continue
+
+                # For now, nvmeof daemons can only be thrashed 3 times in last 30mins.
+                # Skip thrashing if daemon was thrashed <daemon_max_thrash_times>
+                # times in last <daemon_max_thrash_period> seconds.
+                thrashed_history = daemons_thrash_history.get(daemon.id_, [])
+                history_ptr = len(thrashed_history) - self.daemon_max_thrash_times
+                if history_ptr >= 0:
+                    ptr_timestamp = thrashed_history[history_ptr]
+                    current_timestamp = datetime.now()
+                    if (current_timestamp - ptr_timestamp).total_seconds() < self.daemon_max_thrash_period:
+                        self.log(f'skipping daemon {daemon.id_}: thrashed total {len(thrashed_history)} times, '\
+                                 f'can only thrash {self.daemon_max_thrash_times} times '\
+                                 f'in {self.daemon_max_thrash_period} seconds.')
+                        continue
+
+                self.log('kill {label}'.format(label=daemon.id_))
+                daemon.stop()
+
+                killed_daemons.append(daemon)
+                daemons_thrash_history[daemon.id_] += [datetime.now()]
+
+                # only thrash max_thrash_daemons amount of daemons
+                count += 1
+                if count >= self.max_thrash_daemons:
+                    break
+
+            if killed_daemons:
+                summary += ["killed: " + ", ".join([d.id_ for d in killed_daemons])]
+                # delay before reviving
+                revive_delay = self.min_revive_delay
+                if self.randomize:  # seeded RNG keeps 'seed' reproducible; randint tolerates min == max
+                    revive_delay = self.rng.randint(self.min_revive_delay, self.max_revive_delay)
+
+                self.log(f'waiting for {revive_delay} secs before reviving')
+                time.sleep(revive_delay) # blocking wait
+                self.log('done waiting before reviving')
+
+                self.do_checks()
+                self.switch_task()
+
+                # revive after thrashing
+                for daemon in killed_daemons:
+                    self.log('reviving {label}'.format(label=daemon.id_))
+                    daemon.restart()
+
+                # delay before thrashing
+                thrash_delay = self.min_thrash_delay
+                if self.randomize:  # seeded RNG keeps 'seed' reproducible; randint tolerates min == max
+                    thrash_delay = self.rng.randint(self.min_thrash_delay, self.max_thrash_delay)
+                if thrash_delay > 0.0:
+                    self.log(f'waiting for {thrash_delay} secs before thrashing')
+                    time.sleep(thrash_delay) # blocking
+                    self.log('done waiting before thrashing')
+
+                self.do_checks()
+                self.switch_task()
+        self.log("Thrasher summary: ")
+        for daemon in daemons_thrash_history:
+            self.log(f'{daemon} was thrashed {len(daemons_thrash_history[daemon])} times')
+        for index, string in enumerate(summary):
+            self.log(f"Iteration {index}: {string}")
+
+class ThrashTest(Nvmeof):
+    name = 'nvmeof.thrash'
+    def setup(self):
+        if self.config is None:
+            self.config = {}
+        assert isinstance(self.config, dict), \
+            'nvmeof.thrash task only accepts a dict for configuration'
+
+        self.cluster = self.config['cluster'] = self.config.get('cluster', 'ceph')
+        daemons = list(self.ctx.daemons.iter_daemons_of_role('nvmeof', self.cluster))
+        assert len(daemons) > 1, \
+            'nvmeof.thrash task requires at least 2 nvmeof daemon'
+        self.thrasher = NvmeofThrasher(self.ctx, self.config, daemons)
+
+    def begin(self):
+        self.thrasher.start()
+        self.ctx.ceph[self.cluster].thrashers.append(self.thrasher) 
+
+    def end(self):
+        log.info('joining nvmeof.thrash')
+        self.thrasher.stop()
+        if self.thrasher.exception is not None:
+            raise RuntimeError('error during thrashing')
+        self.thrasher.join()
+        log.info('done joining')
+
+
  task = Nvmeof
+thrash = ThrashTest
diff --git a/qa/workunits/rbd/nvmeof_basic_tests.sh b/qa/workunits/rbd/nvmeof_basic_tests.sh

index 878e043fbeb56ed2af80c54f64032406331daa29..1c8d24affa3768c7bf040a81a8a89098cd57066d 100755 (executable)
--- a/qa/workunits/rbd/nvmeof_basic_tests.sh
+++ b/qa/workunits/rbd/nvmeof_basic_tests.sh
@@ -1,7 +1,13 @@
  #!/bin/bash -x
  
+sudo modprobe nvme-fabrics
+sudo modprobe nvme-tcp
+sudo dnf reinstall nvme-cli -y
+sudo lsmod | grep nvme
+nvme version
+
  source /etc/ceph/nvmeof.env
-SPDK_CONTROLLER="SPDK bdev Controller"
+SPDK_CONTROLLER="Ceph bdev Controller"
  DISCOVERY_PORT="8009"
  
  discovery() {
@@ -13,8 +19,9 @@ discovery() {
  }
  
  connect() {
-    sudo nvme connect -t tcp --traddr $NVMEOF_GATEWAY_IP_ADDRESS -s $NVMEOF_PORT -n $NVMEOF_NQN
-    output=$(sudo nvme list)
+    sudo nvme connect -t tcp --traddr $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS -s $NVMEOF_PORT -n "${NVMEOF_SUBSYSTEMS_PREFIX}1"
+    sleep 5
+    output=$(sudo nvme list --output-format=json)
      if ! echo "$output" | grep -q "$SPDK_CONTROLLER"; then
          return 1
      fi
@@ -29,8 +36,9 @@ disconnect_all() {
  }
  
  connect_all() {
-    sudo nvme connect-all --traddr=$NVMEOF_GATEWAY_IP_ADDRESS --transport=tcp
-    output=$(sudo nvme list)
+    sudo nvme connect-all --traddr=$NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --transport=tcp -l 3600
+    sleep 5
+    output=$(sudo nvme list --output-format=json)
      if ! echo "$output" | grep -q "$SPDK_CONTROLLER"; then
          return 1
      fi
@@ -65,7 +73,9 @@ test_run list_subsys 1
  test_run disconnect_all
  test_run list_subsys 0
  test_run connect_all
-test_run list_subsys 1
+gateways_count=$(( $(echo "$NVMEOF_GATEWAY_IP_ADDRESSES" | tr -cd ',' | wc -c) + 1 ))
+multipath_count=$(( $gateways_count * $NVMEOF_SUBSYSTEMS_COUNT)) 
+test_run list_subsys $multipath_count
  
  
  echo "-------------Test Summary-------------"
diff --git a/qa/workunits/rbd/nvmeof_fio_test.sh b/qa/workunits/rbd/nvmeof_fio_test.sh

index bacc15e83eb7ab3071bd0a908e1e00fca6c174d2..8eef2d66e6621d83e2dca6ff1137868160f125e7 100755 (executable)
--- a/qa/workunits/rbd/nvmeof_fio_test.sh
+++ b/qa/workunits/rbd/nvmeof_fio_test.sh
@@ -4,7 +4,18 @@ sudo yum -y install fio
  sudo yum -y install sysstat
  
  fio_file=$(mktemp -t nvmeof-fio-XXXX)
-drives_list=$(sudo nvme list --output-format=json | jq -r '.Devices | .[] | select(.ModelNumber == "SPDK bdev Controller") | .DevicePath')
+all_drives_list=$(sudo nvme list --output-format=json | 
+    jq -r '.Devices | sort_by(.NameSpace) | .[] | select(.ModelNumber == "Ceph bdev Controller") | .DevicePath')
+
+# When the script is passed --start_ns and --end_ns (example: `nvmeof_fio_test.sh --start_ns 1 --end_ns 3`), 
+# then fio runs on namespaces only in the defined range (which is 1 to 3 here). 
+# So if `nvme list` has 5 namespaces with "Ceph bdev Controller", then fio will
+# run on the first 3 namespaces here.
+if [ "$namespace_range_start" ] || [ "$namespace_range_end" ]; then  # either bound may be given alone
+    selected_drives=$(echo "${all_drives_list[@]}" | sed -n "${namespace_range_start:-1},${namespace_range_end:-\$}p")  # missing bound -> first/last line
+else
+    selected_drives="${all_drives_list[@]}"
+fi
  
  RUNTIME=${RUNTIME:-600}
  # IOSTAT_INTERVAL=10
@@ -24,13 +35,18 @@ verify=md5
  verify_fatal=1
  EOF
  
-fio --showcmd $fio_file
-sudo fio $fio_file &
+echo "[nvmeof.fio] starting fio test..."
  
  if [ -n "$IOSTAT_INTERVAL" ]; then
      iostat_count=$(( RUNTIME / IOSTAT_INTERVAL ))
      iostat -d $IOSTAT_INTERVAL $iostat_count -h 
  fi
+if [ "$rbd_iostat" = true  ]; then
+    iterations=$(( RUNTIME / 5 ))
+    timeout 20 rbd perf image iostat $RBD_POOL --iterations $iterations &
+fi
+fio --showcmd $fio_file
+sudo fio $fio_file 
  wait
  
-echo "[nvmeof] fio test successful!"
+echo "[nvmeof.fio] fio test successful!"
author	Vallari Agrawal <val.agl002@gmail.com>
	Thu, 27 Jun 2024 11:18:47 +0000 (16:48 +0530)
committer	Alexander Indenbaum <aindenba@redhat.com>
	Wed, 19 Nov 2025 18:41:52 +0000 (20:41 +0200)
qa/suites/nvmeof/.qa	[new symlink]	patch \| blob
qa/suites/nvmeof/basic/%	[new file with mode: 0644]	patch \| blob
qa/suites/nvmeof/basic/.qa	[new symlink]	patch \| blob
qa/suites/nvmeof/basic/base/.qa	[new symlink]	patch \| blob
qa/suites/nvmeof/basic/base/install.yaml	[new file with mode: 0644]	patch \| blob
qa/suites/nvmeof/basic/centos_latest.yaml	[new symlink]	patch \| blob
qa/suites/nvmeof/basic/clusters/.qa	[new symlink]	patch \| blob
qa/suites/nvmeof/basic/clusters/2-gateways-2-initiator.yaml	[new file with mode: 0644]	patch \| blob
qa/suites/nvmeof/basic/conf	[new symlink]	patch \| blob
qa/suites/nvmeof/basic/workloads/.qa	[new symlink]	patch \| blob
qa/suites/nvmeof/basic/workloads/nvmeof_initiator.yaml	[new file with mode: 0644]	patch \| blob
qa/suites/nvmeof/thrash/%	[new file with mode: 0644]	patch \| blob
qa/suites/nvmeof/thrash/.qa	[new symlink]	patch \| blob
qa/suites/nvmeof/thrash/base/.qa	[new symlink]	patch \| blob
qa/suites/nvmeof/thrash/base/install.yaml	[new file with mode: 0644]	patch \| blob
qa/suites/nvmeof/thrash/centos_latest.yaml	[new symlink]	patch \| blob
qa/suites/nvmeof/thrash/clusters/.qa	[new symlink]	patch \| blob
qa/suites/nvmeof/thrash/clusters/3-gateways-1-initiator.yaml	[new file with mode: 0644]	patch \| blob
qa/suites/nvmeof/thrash/conf	[new symlink]	patch \| blob
qa/suites/nvmeof/thrash/gateway-initiator-setup/.qa	[new symlink]	patch \| blob
qa/suites/nvmeof/thrash/gateway-initiator-setup/3-subsys-60-namespace.yaml	[new file with mode: 0644]	patch \| blob
qa/suites/nvmeof/thrash/thrashers/.qa	[new symlink]	patch \| blob
qa/suites/nvmeof/thrash/thrashers/nvmeof_mon_thrash.yaml	[new file with mode: 0644]	patch \| blob
qa/suites/nvmeof/thrash/thrashers/nvmeof_thrash.yaml	[new file with mode: 0644]	patch \| blob
qa/suites/nvmeof/thrash/workloads/.qa	[new symlink]	patch \| blob
qa/suites/nvmeof/thrash/workloads/fio.yaml	[new file with mode: 0644]	patch \| blob
qa/suites/rbd/nvmeof/%	[deleted file]	patch \| blob \| history
qa/suites/rbd/nvmeof/.qa	[deleted symlink]	patch \| blob \| history
qa/suites/rbd/nvmeof/base/.qa	[deleted symlink]	patch \| blob \| history
qa/suites/rbd/nvmeof/base/install.yaml	[deleted file]	patch \| blob \| history
qa/suites/rbd/nvmeof/centos_latest.yaml	[deleted symlink]	patch \| blob \| history
qa/suites/rbd/nvmeof/cluster/+	[deleted file]	patch \| blob \| history
qa/suites/rbd/nvmeof/cluster/.qa	[deleted symlink]	patch \| blob \| history
qa/suites/rbd/nvmeof/cluster/fixed-3.yaml	[deleted file]	patch \| blob \| history
qa/suites/rbd/nvmeof/cluster/openstack.yaml	[deleted file]	patch \| blob \| history
qa/suites/rbd/nvmeof/workloads/.qa	[deleted symlink]	patch \| blob \| history
qa/suites/rbd/nvmeof/workloads/nvmeof_initiator.yaml	[deleted file]	patch \| blob \| history
qa/tasks/cephadm.py		patch \| blob \| history
qa/tasks/daemonwatchdog.py		patch \| blob \| history
qa/tasks/mon_thrash.py		patch \| blob \| history
qa/tasks/nvmeof.py		patch \| blob \| history
qa/workunits/rbd/nvmeof_basic_tests.sh		patch \| blob \| history
qa/workunits/rbd/nvmeof_fio_test.sh		patch \| blob \| history