From eee6fb0f12b55973a93558dbdb9118732cc762d0 Mon Sep 17 00:00:00 2001
From: ethanwu
Date: Sun, 24 Mar 2024 17:33:42 +0800
Subject: [PATCH] mds: fix root rank not inserting root ino into its subtree
 map when starting

The root ino belongs to the subtree of the root rank and should be
inserted when the subtree map log event is created. However, this is
missing when the MDS runs at STATE_STARTING. During replay, every
inode under this subtree is then trimmed by trim_non_auth_subtree,
which causes replay to fail.

Quick way to reproduce this:

After creating the filesystem, mount it and create some directories:

  mkdir -p ${cephfs_root}/dir1/dir11/foo
  mkdir -p ${cephfs_root}/dir1/dir11/bar

Unmount cephfs, then:

  ./bin/ceph fs set a down true
  ./bin/ceph fs set a down false
  ./bin/cephfs-journal-tool --rank=a:0 event get json --path output
  # shows that the ESubtreeMap only contains 0x100 but not 0x1

Mount cephfs again:

  rmdir ${cephfs_root}/dir1/dir11/foo
  rmdir ${cephfs_root}/dir1/dir11/bar

Unmount cephfs and trigger an mds rank 0 failover; rank 0 then fails
during replay and is marked damaged. The mds log contains the
following related messages:

   -49> 2024-03-24T18:06:19.461+0800 7f1542cbf700 10 mds.0.cache trim_non_auth_subtree(0x560372b2df80) [dir 0x1 / [2,head] auth v=12 cv=0/0 dir_auth=-2 state=1073741824 f(v0 m2024-03-24T18:03:30.350260+0800 1=0+1) n(v3 rc2024-03-24T18:03:30.401819+0800 4=0+4) hs=1+0,ss=0+0 | child=1 subtree=1 0x560372b2df80]
   -27> 2024-03-24T18:06:19.461+0800 7f1542cbf700 14 mds.0.cache remove_inode [inode 0x10000000000 [...2,head] #10000000000/ auth v10 f(v0 m2024-03-24T18:03:30.378677+0800 1=0+1) n(v1 rc2024-03-24T18:03:30.401819+0800 4=0+4) (iversion lock) 0x560372c52100]
   -21> 2024-03-24T18:06:19.461+0800 7f1542cbf700 10 mds.0.log _replay 4216759~3161 / 4226491 2024-03-24T18:05:16.515314+0800: EUpdate unlink_local [metablob 0x10000000000, 4 dirs]
   -20> 2024-03-24T18:06:19.461+0800 7f1542cbf700 10 mds.0.journal EUpdate::replay
   -19> 2024-03-24T18:06:19.461+0800 7f1542cbf700 10 mds.0.journal EMetaBlob.replay 4 dirlumps by unknown.0
   -18> 2024-03-24T18:06:19.461+0800 7f1542cbf700 10 mds.0.journal EMetaBlob.replay don't have renamed ino 0x10000000003
   -17> 2024-03-24T18:06:19.461+0800 7f1542cbf700 10 mds.0.journal EMetaBlob.replay found null dentry in dir 0x10000000001
   -16> 2024-03-24T18:06:19.461+0800 7f1542cbf700 10 mds.0.journal EMetaBlob.replay dir 0x10000000000
   -15> 2024-03-24T18:06:19.461+0800 7f1542cbf700  0 mds.0.journal EMetaBlob.replay missing dir ino 0x10000000000
   -14> 2024-03-24T18:06:19.461+0800 7f1542cbf700 -1 log_channel(cluster) log [ERR] : failure replaying journal (EMetaBlob)
   -13> 2024-03-24T18:06:19.461+0800 7f1542cbf700  5 mds.beacon.c set_want_state: up:replay -> down:damaged

The fix follows how the mdsdir inode is handled when the MDS enters
STARTING (a sketch of that mdsdir path follows the --- separator
below).

Fixes: https://tracker.ceph.com/issues/65094
Signed-off-by: ethanwu
(cherry picked from commit 463c3b7d3e725667fa4b5d30b9d5441ed327b388)
---
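Notes: the mdsdir path this fix mirrors is MDCache::open_mydir_frag(),
which already opens the base dirfrag and claims subtree auth before
fetching it, so the next ESubtreeMap written by the rank records that
subtree. The snippet below is a simplified sketch of that function as
it appears in src/mds/MDCache.cc on recent main, not an exact copy;
details may differ between releases:

  void MDCache::open_mydir_frag(MDSContext *c)
  {
    open_mydir_inode(
      new MDSInternalContextWrapper(mds,
        new LambdaContext([this, c](int r) {
          if (r < 0) {
            c->complete(r);
            return;
          }
          // Open the mdsdir dirfrag and mark it as an auth subtree
          // before fetching, so the subtree map includes it.
          CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
          ceph_assert(mydir);
          adjust_subtree_auth(mydir, mds->get_nodeid());
          mydir->fetch(c);
        })
      )
    );
  }

The patch applies the same open-dirfrag-then-claim-auth step to the
root inode when the rank is STARTING.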
 src/mds/MDCache.cc | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 0aa1ec766b992..0f17a29ab2c9e 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -633,7 +633,24 @@ void MDCache::open_root_inode(MDSContext *c)
   if (mds->get_nodeid() == mds->mdsmap->get_root()) {
     CInode *in;
     in = create_system_inode(CEPH_INO_ROOT, S_IFDIR|0755);  // initially inaccurate!
-    in->fetch(c);
+    if (mds->is_starting()) {
+      in->fetch(
+        new MDSInternalContextWrapper(mds,
+          new LambdaContext([this, c](int r) {
+            if (r < 0) {
+              c->complete(r);
+              return;
+            }
+            CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
+            ceph_assert(rootdir);
+            adjust_subtree_auth(rootdir, mds->get_nodeid());
+            rootdir->fetch(c);
+          })
+        )
+      );
+    } else {
+      in->fetch(c);
+    }
   } else {
     discover_base_ino(CEPH_INO_ROOT, c, mds->mdsmap->get_root());
   }
-- 
2.39.5