git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
qa: enable libcephfs debug logs for fio workload with nfs 61547/head
author Dhairya Parmar <dparmar@redhat.com>
Tue, 11 Feb 2025 21:08:56 +0000 (02:38 +0530)
committer Dhairya Parmar <dparmar@redhat.com>
Thu, 8 May 2025 07:38:05 +0000 (13:08 +0530)
This acts as a stopgap to get libcephfs logs in teuthology runs
while https://tracker.ceph.com/issues/69895 is being discussed and implemented.

Signed-off-by: Dhairya Parmar <dparmar@redhat.com>
qa/tasks/cephfs/test_nfs.py

index 0a1c07dce04bb36355926f3aff0caafd4b24793d..857d3296fa651126ad243fe24e36a672e45dcf36 100644 (file)
@@ -4,6 +4,7 @@ import json
 import time
 import logging
 from io import BytesIO, StringIO
+import yaml
 
 from tasks.mgr.mgr_test_case import MgrTestCase
 from teuthology import contextutil
@@ -487,6 +488,121 @@ class TestNFS(MgrTestCase):
                                  }
                              }))
 
+    def apply_ganesha_spec(self, spec):
+        """
+        Apply a ganesha service spec and wait for the daemon to be redeployed;
+        conf changes made before the redeploy completes would be reset by it.
+        :param spec: ganesha daemon spec (YAML), passed on stdin to `ceph orch apply`
+        """
+        def ganesha_pid():
+            procs = self.ctx.cluster.run(args=["sudo", "pgrep", "ganesha.nfsd"],
+                                         stdout=StringIO(), stderr=StringIO())
+            return procs[0].stdout.getvalue().strip()
+
+        pid_before = ganesha_pid()
+        self.ctx.cluster.run(args=['ceph', 'orch', 'apply', '-i', '-'], stdin=spec)
+        # poll up to 15 times, 4 seconds apart, until pgrep reports a different pid
+        with contextutil.safe_while(sleep=4, tries=15) as proceed:
+            while proceed():
+                try:
+                    if ganesha_pid() != pid_before:
+                        # a different pid means the redeployed daemon is up
+                        break
+                except CommandFailedError:
+                    # pgrep finds no pid while the redeployment is in progress
+                    log.info('waiting for ganesha daemon redeployment')
+
+    def enable_libcephfs_logging(self, cluster_name):
+        """
+        Enable ceph client (libcephfs) debug logs for a ganesha daemon.
+
+        Adds a volume mount to ganesha daemon's unit.run using
+        `ceph orch apply -i <spec>`, appends a [client] section (debug level
+        and log file path) to /var/lib/ceph/{fsid}/{ganesha_daemon}/config,
+        and waits until the client log file has content.
+        :param cluster_name: nfs cluster name
+        :return: tuple of (original spec as YAML, ganesha conf file path,
+                 original ganesha conf content), i.e. the arguments that
+                 disable_libcephfs_logging() needs to revert the changes
+        """
+        def read_conf(path):
+            procs = self.ctx.cluster.run(args=["sudo", "cat", path],
+                                         stdout=StringIO(), stderr=StringIO())
+            return procs[0].stdout.getvalue().strip()
+
+        fsid = self._cmd("fsid").strip()
+        log_mount = f"/var/log/ceph/{fsid}:/var/log/ceph:z"
+        client_log = f"/var/log/ceph/ceph-client.nfs.{cluster_name}.log"
+
+        # add volume mount for ceph client logging from /var/log/ceph/$fsid:/var/log/ceph:z
+        ganesha_spec = self._cmd("orch", "ls", "--service-name",
+                                 f"nfs.{cluster_name}", "--export").strip()
+        parsed_ganesha_spec = yaml.safe_load(ganesha_spec)
+        original_ganesha_spec = yaml.dump(parsed_ganesha_spec)
+        parsed_ganesha_spec["extra_container_args"] = ["-v", log_mount]
+        # wrap both container args in double quotes in the dumped YAML
+        debug_enabled_ganesha_spec = yaml.dump(parsed_ganesha_spec).replace(
+            "- -v", '- "-v"').replace(f"- {log_mount}", f'- "{log_mount}"')
+        log.debug(f"debug enabled ganesha spec: {debug_enabled_ganesha_spec}")
+
+        self.apply_ganesha_spec(debug_enabled_ganesha_spec)
+
+        # add client debug to /var/lib/ceph/$fsid/$ganesha_daemon/config
+        # daemon name: first space-separated field of the second output line
+        ganesha_daemon = ((self._orch_cmd("ps", "--daemon-type", "nfs")).split("\n")[1].split(' ')[0]).strip()
+        GANESHA_CONF_FILE_PATH = f"/var/lib/ceph/{fsid}/{ganesha_daemon}/config"
+
+        original_ganesha_conf = read_conf(GANESHA_CONF_FILE_PATH)
+        if "[client]" not in original_ganesha_conf:
+            s = f"[client]\n\tdebug client = 20\n\tlog file = {client_log}"
+            self._sys_cmd(["echo", Raw(f'"{s}"'), Raw("|"), "sudo", "tee", Raw("-a"), GANESHA_CONF_FILE_PATH])
+            # restart ganesha daemon for the changes to take effect
+            self._orch_cmd("restart", f"nfs.{cluster_name}")
+
+        # ensure log level and file path exists
+        ganesha_conf_debug_enabled = read_conf(GANESHA_CONF_FILE_PATH)
+        self.assertIn("[client]", ganesha_conf_debug_enabled)
+        self.assertIn("debug client = 20", ganesha_conf_debug_enabled)
+        self.assertIn(f"log file = {client_log}", ganesha_conf_debug_enabled)
+
+        def check_libcephfs_log():
+            LIBCEPHFS_LOG_FILE_PATH = f"/var/log/ceph/{fsid}/ceph-client.nfs.{cluster_name}.log"
+            # tail the file directly (no `cat |` pipe) so that a missing or
+            # unreadable file shows up as a non-zero returncode
+            libcephfs_log = self.ctx.cluster.run(args=["sudo", "tail", "-n", "2",
+                                                       LIBCEPHFS_LOG_FILE_PATH],
+                                                 check_status=False,
+                                                 stdout=StringIO(),
+                                                 stderr=StringIO())
+            if libcephfs_log[0].returncode != 0:
+                log.debug(f"failed to read {LIBCEPHFS_LOG_FILE_PATH}, retrying")
+                return False
+            if not libcephfs_log[0].stdout.getvalue().strip():
+                log.debug(f"log file {LIBCEPHFS_LOG_FILE_PATH} empty, retrying")
+                return False
+            return True
+
+        # usually appears in no time, sometimes might take a second or two for the log file to appear
+        self.wait_until_true(check_libcephfs_log, timeout=60)
+
+        return original_ganesha_spec, GANESHA_CONF_FILE_PATH, original_ganesha_conf
+
+    def disable_libcephfs_logging(self, cluster_name, ganesha_spec, conf_path, ganesha_conf):
+        """
+        Turn ceph client logging back off: re-apply the primary ganesha spec and, if
+        the conf file still has a [client] section, rewrite it with the primary conf.
+        :param cluster_name: nfs cluster name
+        :param ganesha_spec: primary spec (spec prior to adding debug volume mount)
+        :param conf_path: ganesha conf file path
+        :param ganesha_conf: primary ganesha conf (conf prior to adding debug level and path)
+        """
+        def read_conf():
+            procs = self.ctx.cluster.run(args=["sudo", "cat", conf_path],
+                                         stdout=StringIO(), stderr=StringIO())
+            return procs[0].stdout.getvalue().strip()
+
+        self.apply_ganesha_spec(ganesha_spec)
+
+        if "[client]" not in read_conf():
+            # no client debug section in the conf, so nothing to revert
+            return
+
+        # empty the conf file, then write the primary content back
+        self.ctx.cluster.run(args=['sudo', 'truncate', Raw("-s"), "0", conf_path])
+        self._sys_cmd(["echo", Raw(f'"{ganesha_conf}"'), Raw("|"), "sudo", "tee", conf_path])
+        restored_conf = read_conf()
+        for debug_entry in ("[client]", "debug client = 20",
+                            f"log file = /var/log/ceph/ceph-client.nfs.{cluster_name}.log"):
+            self.assertNotIn(debug_entry, restored_conf)
+        # restart ganesha daemon for the changes to take effect
+        self._orch_cmd("restart", f"nfs.{cluster_name}")
+
     def test_create_and_delete_cluster(self):
         '''
         Test successful creation and deletion of the nfs cluster.
@@ -671,11 +787,13 @@ class TestNFS(MgrTestCase):
         Test async io using fio. Expect completion without hang or crash
         '''
         self._test_create_cluster()
+        ganesha_spec, conf_path, conf = self.enable_libcephfs_logging(self.cluster_id)
         self._create_export(export_id='1', create_fs=True,
                             extra_cmd=['--pseudo-path', self.pseudo_path])
         port, ip = self._get_port_ip_info()
         self._check_nfs_cluster_status('running', 'NFS Ganesha cluster restart failed')
         self._test_fio(self.pseudo_path, port, ip)
+        self.disable_libcephfs_logging(self.cluster_id, ganesha_spec, conf_path, conf)
         self._test_delete_cluster()
 
     def test_cluster_info(self):