return True
+ def get_active_names(self):
+ """
+ Return MDS daemon names of those daemons holding ranks
+ in state up:active
+
+ :return: list of strings like ['a', 'b'], sorted by rank
+ """
+ status = self.mon_manager.get_mds_status_all()
+ result = []
+ for mds_status in sorted(status['info'].values(), lambda a, b: cmp(a['rank'], b['rank'])):
+ if mds_status['state'] == 'up:active':
+ result.append(mds_status['name'])
+
+ return result
+
def wait_for_daemons(self, timeout=None):
"""
Wait until all daemons are healthy
elif timeout is not None and elapsed > timeout:
raise RuntimeError(
"Reached timeout after {0} seconds waiting for state {1}, while in state {2}".format(
- elapsed, goal_state, current_state
- ))
+ elapsed, goal_state, current_state
+ ))
else:
time.sleep(1)
elapsed += 1
return json.loads(p.stdout.getvalue().strip())
- def list_dirfrag(self, dir_ino):
+ def rados(self, args, pool=None):
"""
- Read the named object and return the list of omap keys
-
- :return a list of 0 or more strings
+ Call into the `rados` CLI from an MDS
"""
- dirfrag_obj_name = "{0:x}.00000000".format(dir_ino)
+ if pool is None:
+ pool = self.get_metadata_pool_name()
# Doesn't matter which MDS we use to run rados commands, they all
# have access to the pools
# NB we could alternatively use librados pybindings for this, but it's a one-liner
# using the `rados` CLI
- args = ["rados", "-p", self.get_metadata_pool_name(), "listomapkeys", dirfrag_obj_name]
+ args = ["rados", "-p", pool] + args
+ p = remote.run(
+ args=args,
+ stdout=StringIO())
+ return p.stdout.getvalue().strip()
+
+ def list_dirfrag(self, dir_ino):
+ """
+ Read the named object and return the list of omap keys
+
+ :return a list of 0 or more strings
+ """
+
+ dirfrag_obj_name = "{0:x}.00000000".format(dir_ino)
+
try:
- p = remote.run(
- args=args,
- stdout=StringIO())
+ key_list_str = self.rados(["listomapkeys", dirfrag_obj_name])
except CommandFailedError as e:
log.error(e.__str__())
raise ObjectNotFound(dirfrag_obj_name)
- key_list_str = p.stdout.getvalue().strip()
return key_list_str.split("\n") if key_list_str else []
- def journal_tool(self, args):
+ def erase_metadata_objects(self, prefix):
"""
- Invoke cephfs-journal-tool with the passed arguments, and return its stdout
+ For all objects in the metadata pool matching the prefix,
+ erase them.
+
+ This O(N) with the number of objects in the pool, so only suitable
+ for use on toy test filesystems.
"""
+ all_objects = self.rados(["ls"]).split("\n")
+ matching_objects = [o for o in all_objects if o.startswith(prefix)]
+ for o in matching_objects:
+ self.rados(["rm", o])
+
+ def erase_mds_objects(self, rank):
+ """
+ Erase all the per-MDS objects for a particular rank. This includes
+ inotable, sessiontable, journal
+ """
+
+ def obj_prefix(multiplier):
+ """
+ MDS object naming conventions like rank 1's
+ journal is at 201.***
+ """
+ return "%x." % (multiplier * 0x100 + rank)
+
+ # MDS_INO_LOG_OFFSET
+ self.erase_metadata_objects(obj_prefix(2))
+ # MDS_INO_LOG_BACKUP_OFFSET
+ self.erase_metadata_objects(obj_prefix(3))
+ # MDS_INO_LOG_POINTER_OFFSET
+ self.erase_metadata_objects(obj_prefix(4))
+ # MDSTables & SessionMap
+ self.erase_metadata_objects("mds{rank:d}_".format(rank=rank))
+
+ def _run_tool(self, tool, args, rank=None, quiet=False):
mds_id = self.mds_ids[0]
remote = self.mds_daemons[mds_id].remote
- return remote.run(
- args=['cephfs-journal-tool'] + args,
+ # Tests frequently have [client] configuration that jacks up
+ # the objecter log level (unlikely to be interesting here)
+ # and does not set the mds log level (very interesting here)
+ if quiet:
+ base_args = [tool, '--debug-mds=1', '--debug-objecter=1']
+ else:
+ base_args = [tool, '--debug-mds=4', '--debug-objecter=1']
+
+ if rank is not None:
+ base_args.extend(["--rank", "%d" % rank])
+
+ t1 = datetime.datetime.now()
+ r = remote.run(
+ args=base_args + args,
stdout=StringIO()).stdout.getvalue().strip()
+ duration = datetime.datetime.now() - t1
+ log.info("Ran {0} in time {1}, result:\n{2}".format(
+ base_args + args, duration, r
+ ))
+ return r
+
+ def journal_tool(self, args, rank=None, quiet=False):
+ """
+ Invoke cephfs-journal-tool with the passed arguments, and return its stdout
+ """
+ return self._run_tool("cephfs-journal-tool", args, rank, quiet)
+
+ def table_tool(self, args, quiet=False):
+ """
+ Invoke cephfs-table-tool with the passed arguments, and return its stdout
+ """
+ return self._run_tool("cephfs-table-tool", args, None, quiet)
--- /dev/null
+
+"""
+Test our tools for recovering the content of damaged journals
+"""
+
+import contextlib
+import logging
+from textwrap import dedent
+import time
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.filesystem import Filesystem, ObjectNotFound, ROOT_INO
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, run_tests
+
+
+log = logging.getLogger(__name__)
+
+
+class TestJournalRepair(CephFSTestCase):
+ def test_inject_to_empty(self):
+ """
+ That when some dentries in the journal but nothing is in
+ the backing store, we correctly populate the backing store
+ from the journalled dentries.
+ """
+
+ # Inject metadata operations
+ self.mount_a.run_shell(["touch", "rootfile"])
+ self.mount_a.run_shell(["mkdir", "subdir"])
+ self.mount_a.run_shell(["touch", "subdir/subdirfile"])
+ # There are several different paths for handling hardlinks, depending
+ # on whether an existing dentry (being overwritten) is also a hardlink
+ self.mount_a.run_shell(["mkdir", "linkdir"])
+
+ # Test inode -> remote transition for a dentry
+ self.mount_a.run_shell(["touch", "linkdir/link0"])
+ self.mount_a.run_shell(["rm", "-f", "linkdir/link0"])
+ self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link0"])
+
+ # Test nothing -> remote transition
+ self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link1"])
+
+ # Test remote -> inode transition
+ self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link2"])
+ self.mount_a.run_shell(["rm", "-f", "linkdir/link2"])
+ self.mount_a.run_shell(["touch", "linkdir/link2"])
+
+ # Test remote -> diff remote transition
+ self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link3"])
+ self.mount_a.run_shell(["rm", "-f", "linkdir/link3"])
+ self.mount_a.run_shell(["ln", "rootfile", "linkdir/link3"])
+
+ # Before we unmount, make a note of the inode numbers, later we will
+ # check that they match what we recover from the journal
+ rootfile_ino = self.mount_a.path_to_ino("rootfile")
+ subdir_ino = self.mount_a.path_to_ino("subdir")
+ linkdir_ino = self.mount_a.path_to_ino("linkdir")
+ subdirfile_ino = self.mount_a.path_to_ino("subdir/subdirfile")
+
+ self.mount_a.umount_wait()
+
+ # Stop the MDS
+ self.fs.mds_stop()
+ self.fs.mds_fail()
+
+ # Now, the journal should contain the operations, but the backing
+ # store shouldn't
+ with self.assertRaises(ObjectNotFound):
+ self.fs.list_dirfrag(subdir_ino)
+ self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])
+
+ # Execute the dentry recovery, this should populate the backing store
+ self.fs.journal_tool(['event', 'recover_dentries', 'list'])
+
+ # Dentries in ROOT_INO are present
+ self.assertEqual(sorted(self.fs.list_dirfrag(ROOT_INO)), sorted(['rootfile_head', 'subdir_head', 'linkdir_head']))
+ self.assertEqual(self.fs.list_dirfrag(subdir_ino), ['subdirfile_head'])
+ self.assertEqual(sorted(self.fs.list_dirfrag(linkdir_ino)),
+ sorted(['link0_head', 'link1_head', 'link2_head', 'link3_head']))
+
+ # Now check the MDS can read what we wrote: truncate the journal
+ # and start the mds.
+ self.fs.journal_tool(['journal', 'reset'])
+ self.fs.mds_restart()
+ self.fs.wait_for_daemons()
+
+ # List files
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+
+ # First ls -R to populate MDCache, such that hardlinks will
+ # resolve properly (recover_dentries does not create backtraces,
+ # so ordinarily hardlinks to inodes that happen not to have backtraces
+ # will be invisible in readdir).
+ # FIXME: hook in forward scrub here to regenerate backtraces
+ proc = self.mount_a.run_shell(['ls', '-R'])
+
+ proc = self.mount_a.run_shell(['ls', '-R'])
+ self.assertEqual(proc.stdout.getvalue().strip(),
+ dedent("""
+ .:
+ linkdir
+ rootfile
+ subdir
+
+ ./linkdir:
+ link0
+ link1
+ link2
+ link3
+
+ ./subdir:
+ subdirfile
+ """).strip())
+
+ # Check the correct inos were preserved by path
+ self.assertEqual(rootfile_ino, self.mount_a.path_to_ino("rootfile"))
+ self.assertEqual(subdir_ino, self.mount_a.path_to_ino("subdir"))
+ self.assertEqual(subdirfile_ino, self.mount_a.path_to_ino("subdir/subdirfile"))
+
+ # Check that the hard link handling came out correctly
+ self.assertEqual(self.mount_a.path_to_ino("linkdir/link0"), subdirfile_ino)
+ self.assertEqual(self.mount_a.path_to_ino("linkdir/link1"), subdirfile_ino)
+ self.assertNotEqual(self.mount_a.path_to_ino("linkdir/link2"), subdirfile_ino)
+ self.assertEqual(self.mount_a.path_to_ino("linkdir/link3"), rootfile_ino)
+
+ # Create a new file, ensure it is not issued the same ino as one of the
+ # recovered ones
+ self.mount_a.run_shell(["touch", "afterwards"])
+ new_ino = self.mount_a.path_to_ino("afterwards")
+ self.assertNotIn(new_ino, [rootfile_ino, subdir_ino, subdirfile_ino])
+
+ def test_reset(self):
+ """
+ That after forcibly modifying the backing store, we can get back into
+ a good state by resetting the MDSMap.
+
+ The scenario is that we have two active MDSs, and we lose the journals. Once
+ we have completely lost confidence in the integrity of the metadata, we want to
+ return the system to a single-MDS state to go into a scrub to recover what we
+ can.
+ """
+
+ # Set max_mds to 2
+ self.fs.mon_manager.raw_cluster_cmd_result('mds', 'set', "max_mds", "2")
+
+ # See that we have two active MDSs
+ self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
+ reject_fn=lambda v: v > 2 or v < 1)
+ active_mds_names = self.fs.get_active_names()
+
+ # Do a bunch of I/O such that at least some will hit the second MDS: create
+ # lots of directories so that the balancer should find it easy to make a decision
+ # to allocate some of them to the second mds.
+ spammers = []
+ for n in range(0, 16):
+ dir_name = "spam_{0}".format(n)
+ spammers.append(self.mount_a.spam_dir_background(dir_name))
+
+ def subtrees_assigned():
+ got_subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=active_mds_names[0])
+ rank_1_count = len([s for s in got_subtrees if s['auth_first'] == 1])
+
+ # Greater than 1, because there is typically 1 for ~mds1, and once it
+ # has been assigned something in addition to that it means it has been
+ # assigned a "real" subtree.
+ return rank_1_count > 1
+
+ # We are waiting for the MDS to respond to hot directories, which
+ # is not guaranteed to happen at a particular time, so a lengthy timeout here.
+ self.wait_until_true(subtrees_assigned, 600)
+
+ # Flush the journals so that we have some backing store data
+ # belonging to one MDS, and some to the other MDS.
+ for mds_name in active_mds_names:
+ self.fs.mds_asok(["flush", "journal"], mds_name)
+
+ # Stop (hard) the second MDS daemon
+ self.fs.mds_stop(active_mds_names[1])
+
+ # Wipe out the tables for MDS rank 1 so that it is broken and can't start
+ # (this is the simulated failure that we will demonstrate that the disaster
+ # recovery tools can get us back from)
+ self.fs.erase_metadata_objects(prefix="mds1_")
+
+ # Try to access files from the client
+ blocked_ls = self.mount_a.run_shell(["ls", "-R"], wait=False)
+
+ # Check that this "ls -R" blocked rather than completing: indicates
+ # it got stuck trying to access subtrees which were on the now-dead MDS.
+ log.info("Sleeping to check ls is blocked...")
+ time.sleep(60)
+ self.assertFalse(blocked_ls.finished)
+
+ # This mount is now useless because it will depend on MDS rank 1, and MDS rank 1
+ # is not coming back. Kill it.
+ log.info("Killing mount, it's blocked on the MDS we killed")
+ self.mount_a.kill()
+ self.mount_a.kill_cleanup()
+ try:
+ # Now that the mount is dead, the ls -R should error out.
+ blocked_ls.wait()
+ except CommandFailedError:
+ pass
+
+ log.info("Terminating spammer processes...")
+ for spammer_proc in spammers:
+ spammer_proc.stdin.close()
+ try:
+ spammer_proc.wait()
+ except CommandFailedError:
+ pass
+
+ # See that the second MDS will crash when it starts and tries to
+ # acquire rank 1
+ self.fs.mds_restart(active_mds_names[1])
+ crasher = self.fs.mds_daemons[active_mds_names[1]].proc
+
+ try:
+ crasher.wait()
+ except CommandFailedError as e:
+ log.info("MDS '{0}' crashed with status {1} as expected".format(active_mds_names[1], e.exitstatus))
+ self.fs.mds_daemons[active_mds_names[1]].proc = None
+ else:
+ raise RuntimeError("MDS daemon '{0}' did not crash as expected".format(active_mds_names[1]))
+
+ # Now it's crashed, let the MDSMonitor know that it's not coming back
+ self.fs.mds_fail(active_mds_names[1])
+
+ # Now give up and go through a disaster recovery procedure
+ self.fs.mds_stop(active_mds_names[0])
+ self.fs.mds_fail(active_mds_names[0])
+ # Invoke recover_dentries quietly, because otherwise log spews millions of lines
+ self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=0, quiet=True)
+ self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=1, quiet=True)
+ self.fs.table_tool(["0", "reset", "session"])
+ self.fs.journal_tool(["journal", "reset"], rank=0)
+ self.fs.erase_mds_objects(1)
+ self.fs.mon_remote.run(args=['sudo', 'ceph', 'fs', 'reset', 'default', '--yes-i-really-mean-it'])
+
+ # Bring an MDS back online, mount a client, and see that we can walk the full
+ # filesystem tree again
+ self.fs.mds_restart(active_mds_names[0])
+ self.wait_until_equal(lambda: self.fs.get_active_names(), [active_mds_names[0]], 30,
+ reject_fn=lambda v: len(v) > 1)
+ self.mount_a.mount()
+ self.mount_a.run_shell(["ls", "-R"], wait=True)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ fs = Filesystem(ctx)
+
+ # Pick out the clients we will use from the configuration
+ # =======================================================
+ if len(ctx.mounts) < 1:
+ raise RuntimeError("Need at least one clients")
+ mount_a = ctx.mounts.values()[0]
+
+ # Stash references on ctx so that we can easily debug in interactive mode
+ # =======================================================================
+ ctx.filesystem = fs
+ ctx.mount_a = mount_a
+
+ run_tests(ctx, config, TestJournalRepair, {
+ 'fs': fs,
+ 'mount_a': mount_a
+ })
+
+ # Continue to any downstream tasks
+ # ================================
+ yield