--- /dev/null
+# Ceph - scalable distributed file system
+#
+# Copyright (C) 2022 Red Hat, Inc.
+#
+# This is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License version 2.1, as published by the Free Software
+# Foundation. See file COPYING.
+
+# Suggested recovery sequence (for single MDS cluster):
+#
+# 1) Unmount all clients.
+#
+# 2) Flush the journal (if possible):
+#
+# ceph tell mds.<fs_name>:0 flush journal
+#
+# 3) Fail the file system:
+#
+# ceph fs fail <fs_name>
+#
+# 4a) Recover dentries from the journal. This will be a no-op if the MDS flushed the journal successfully:
+#
+# cephfs-journal-tool --rank=<fs_name>:0 event recover_dentries summary
+#
+# 4b) If all good so far, reset the journal:
+#
+# cephfs-journal-tool --rank=<fs_name>:0 journal reset
+#
+# 5) Run this tool to find damaged dentries. Findings are written to the
+# --debug log file (default: ~/first-damage.py.log), not to the terminal:
+#
+# python3 first-damage.py --memo run.1 <pool>
+#
+# 6) Optionally, remove them:
+#
+# python3 first-damage.py --memo run.2 --remove <pool>
+#
+# Note: use --memo to specify a different file to save objects that have
+# already been traversed, for independent runs.
+#
+# This has the effect of removing that dentry from the snapshot or HEAD
+# (current hierarchy). Note: the inode's linkage will be lost. The inode may
+# be recoverable in lost+found during a future data scan recovery.
+
+import argparse
+import logging
+import os
+import rados
+import re
+import sys
+import struct
+
+# Logger for all output of this tool; the script entry point routes it to
+# the --debug file.
+log = logging.getLogger("first-damage-traverse")
+
+# Run-time settings. These are placeholders; the script entry point
+# overwrites them from the parsed command line before any work is done.
+# Path of the memo file (--memo).
+MEMO = None
+# When true (--remove), offending dentries are removed, not just reported.
+REMOVE = False
+# Name of the metadata pool to scan (positional argument).
+POOL = None
+# Threshold for a dentry's "first" value; forced with --next-snap or
+# otherwise derived from the mds_snaptable object.
+NEXT_SNAP = None
+# Path of the Ceph configuration file (--conf).
+CONF = None
+
+# Only objects whose whole name is "<8 or more hex digits>.<hex digits>" are
+# examined by traverse(); presumably these are the directory objects
+# (<inode>.<frag>) - verify against the MDS object naming.
+DIR_PATTERN = re.compile(r'[0-9a-fA-F]{8,}\.[0-9a-fA-F]+')
+
+# Object names read back from the memo file; traverse() skips these.
+CACHE = set()
+
+def traverse(MEMO, ioctx):
+ for o in ioctx.list_objects():
+ if not DIR_PATTERN.fullmatch(o.key):
+ log.debug("skipping %s", o.key)
+ continue
+ elif o.key in CACHE:
+ log.debug("skipping previously examined object %s", o.key)
+ continue
+ log.info("examining: %s", o.key)
+
+ with rados.ReadOpCtx() as rctx:
+ it = ioctx.get_omap_vals(rctx, None, None, 100000)[0]
+ ioctx.operate_read_op(rctx, o.key)
+ for (dnk, val) in it:
+ log.debug('\t%s', dnk)
+ (first,) = struct.unpack('<I', val[:4])
+ if first > NEXT_SNAP:
+ log.warning(f"found {o.key}:{dnk} first (0x{first:x}) > NEXT_SNAP (0x{NEXT_SNAP:x})")
+ if REMOVE:
+ log.warning(f"removing {o.key}:{dnk}")
+ with rados.WriteOpCtx() as wctx:
+ ioctx.remove_omap_keys(wctx, [dnk])
+ ioctx.operate_write_op(wctx, o.key)
+ MEMO.write(f"{o.key}\n")
+
+if __name__ == '__main__':
+ outpath = os.path.join(os.path.expanduser('~'), os.path.basename(sys.argv[0]))
+ P = argparse.ArgumentParser(description="remove CephFS metadata dentries with invalid first snapshot")
+ P.add_argument('--conf', action='store', help='Ceph conf file', type=str)
+ P.add_argument('--debug', action='store', help='debug file', type=str, default=outpath+'.log')
+ P.add_argument('--memo', action='store', help='db for traversed dirs', default=outpath+'.memo')
+ P.add_argument('--next-snap', action='store', help='force next-snap (dev)', type=int)
+ P.add_argument('--remove', action='store_true', help='remove bad dentries', default=False)
+ P.add_argument('pool', action='store', help='metadata pool', type=str)
+ NS = P.parse_args()
+
+ logging.basicConfig(filename=NS.debug, level=logging.DEBUG)
+
+ MEMO = NS.memo
+ REMOVE = NS.remove
+ POOL = NS.pool
+ NEXT_SNAP = NS.next_snap
+ CONF = NS.conf
+
+ log.info("running as pid %d", os.getpid())
+
+ try:
+ with open(MEMO) as f:
+ for line in f.readlines():
+ CACHE.add(line.rstrip())
+ except FileNotFoundError:
+ pass
+
+ R = rados.Rados(conffile=CONF)
+ R.connect()
+ ioctx = R.open_ioctx(POOL)
+
+ if NEXT_SNAP is None:
+ data = ioctx.read("mds_snaptable")
+ # skip "version" of MDSTable payload
+ # V=$(dd if="$SNAPTABLE" bs=1 count=1 skip=8 | od --endian=little -An -t u1)
+ V = struct.unpack('<b', data[8:9])[0]
+ log.debug("version is %d", V)
+ if V != 5:
+ raise RuntimeError("incompatible snaptable")
+ # skip version,struct_v,compat_v,length
+ # NEXT_SNAP=$((1 + $(dd if="$SNAPTABLE" bs=1 count=8 skip=14 | od --endian=little -An -t u8)))
+ NEXT_SNAP = 1 + struct.unpack('<Q', data[14:22])[0]
+ log.debug("NEXT_SNAP = %d", NEXT_SNAP)
+
+ with open(MEMO, 'a') as f:
+ log.info("saving traversed keys to %s to allow resuming", MEMO)
+ traverse(f, ioctx)
+++ /dev/null
-#!/bin/bash
-
-# Ceph - scalable distributed file system
-#
-# Copyright (C) 2022 Red Hat, Inc.
-#
-# This is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public
-# License version 2.1, as published by the Free Software
-# Foundation. See file COPYING.
-
-# Suggested recovery sequence (for single MDS cluster):
-#
-# 1) Unmount all clients.
-#
-# 2) Flush the journal (if possible):
-#
-# ceph tell mds.<fs_name>:0 flush journal
-#
-# 3) Fail the file system:
-#
-# ceph fs fail <fs_name>
-#
-# 4a) Recover dentries from the journal. This will be a no-op if the MDS flushed the journal successfully:
-#
-# cephfs-journal-tool --rank=<fs_name>:0 event recover_dentries summary
-#
-# 4b) If all good so far, reset the journal:
-#
-# cephfs-journal-tool --rank=<fs_name>:0 journal reset
-#
-# 5) Run this tool to see list of damaged dentries:
-#
-# first-damage.sh <pool>
-#
-# 6) Optionally, remove them:
-#
-# first-damage.sh --remove <pool>
-#
-# This has the effect of removing that dentry from the snapshot or HEAD
-# (current hierarchy). Note: the inode's linkage will be lost. The inode may
-# be recoverable in lost+found during a future data scan recovery.
-
-set -ex
-
-function usage {
- printf '%s: [--remove] <metadata pool> [newest snapid]\n' "$0"
- printf ' remove CephFS metadata dentries with invalid first snapshot'
- exit 1
-}
-
-function mrados {
- rados --pool="$METADATA_POOL" "$@"
-}
-
-function traverse {
- local T=$(mktemp -p /tmp MDS_TRAVERSAL.XXXXXX)
- mrados ls | grep -E '[[:xdigit:]]{8,}\.[[:xdigit:]]+' > "$T"
- while read obj; do
- local O=$(mktemp -p /tmp "$obj".XXXXXX)
- local KEYS=$(mktemp -p /tmp "$obj"-keys.XXXXXX)
- mrados listomapkeys "$obj" > "$KEYS"
- while read dnk; do
- mrados getomapval "$obj" "$dnk" "$O"
- local first=$(dd if="$O" bs=1 count=4 | od --endian=little -An -t u8)
- if [ "$first" -gt "$NEXT_SNAP" ]; then
- printf 'found "%s:%s" first (0x%x) > NEXT_SNAP (0x%x)\n' "$obj" "$dnk" "$first" "$NEXT_SNAP"
- if [ "$REMOVE" -ne 0 ]; then
- printf 'removing "%s:%s"\n' "$obj" "$dnk"
- mrados rmomapkey "$obj" "$dnk"
- fi
- fi
- done < "$KEYS"
- rm "$O"
- done < "$T"
-}
-
-function main {
- eval set -- $(getopt --name "$0" --options 'r' --longoptions 'help,remove' -- "$@")
-
- while [ "$#" -gt 0 ]; do
- case "$1" in
- -h|--help)
- usage
- ;;
- --remove)
- REMOVE=1
- shift
- ;;
- --)
- shift
- break
- ;;
- esac
- done
-
- if [ -z "$1" ]; then
- usage
- fi
- METADATA_POOL="$1"
- NEXT_SNAP="$2"
-
- if [ -z "$NEXT_SNAP" ]; then
- SNAPTABLE=$(mktemp -p /tmp MDS_SNAPTABLE.XXXXXX)
- rados --pool="$METADATA_POOL" get mds_snaptable "$SNAPTABLE"
- # skip "version" of MDSTable payload
- V=$(dd if="$SNAPTABLE" bs=1 count=1 skip=8 | od --endian=little -An -t u1)
- if [ "$V" -ne 5 ]; then
- printf 'incompatible snaptable\n'
- exit 2
- fi
- # skip version,struct_v,compat_v,length
- NEXT_SNAP=$((1 + $(dd if="$SNAPTABLE" bs=1 count=8 skip=14 | od --endian=little -An -t u8)))
- printf 'found latest snap: %d\n' "$NEXT_SNAP"
- fi
-
- traverse
-}
-
-main "$@"