From 21d099239e8bebf104e93b82171abad21643dadb Mon Sep 17 00:00:00 2001
From: Patrick Donnelly
Date: Tue, 28 Jan 2025 17:30:19 -0500
Subject: [PATCH] mds: skip scrubbing damaged dirfrag

This only happens when the omap fetch fails or the fnode is corrupt. MDS
can't presently repair that damage. Without this change, the MDS enters an
infinite loop of repair:

2025-01-28T19:25:46.153+0000 7f9626cc5640 10 MDSContext::complete: 12C_RetryScrub
2025-01-28T19:25:46.153+0000 7f9626cc5640 20 mds.0.scrubstack kick_off_scrubs: state=RUNNING
2025-01-28T19:25:46.153+0000 7f9626cc5640 20 mds.0.scrubstack kick_off_scrubs entering with 0 in progress and 1 in the stack
2025-01-28T19:25:46.153+0000 7f9626cc5640 10 mds.0.scrubstack scrub_dirfrag [dir 0x10000000000 /dir_x/ [2,head] auth v=8 cv=7/7 ap=1+0 state=1610612737|complete f(v0 m2025-01-28T19:25:31.191802+0000 1=0+1) n(v0 rc2025-01-28T19:25:31.306508+0000 b1 3=1+2) hs=1+0,ss=0+0 | child=1 dirty=1 waiter=0 authpin=1 scrubqueue=1 0x55b1a50fa880]
2025-01-28T19:25:46.153+0000 7f9626cc5640 20 mds.0.cache.den(0x10000000000 dir_xx) scrubbing [dentry #0x1/dir_x/dir_xx [2,head] auth (dversion lock) pv=0 v=8 ino=0x10000000001 state=1073741824 0x55b1a50eaf00] next_seq = 2
2025-01-28T19:25:46.153+0000 7f9626cc5640 10 mds.0.cache.snaprealm(0x1 seq 1 0x55b1a50da240) get_snaps (seq 1 cached_seq 1)
2025-01-28T19:25:46.153+0000 7f9626cc5640 10 mds.0.scrubstack _enqueue with {[inode 0x10000000001 [...2,head] /dir_x/dir_xx/ auth v6 f(v0 m2025-01-28T19:25:31.193448+0000 1=0+1) n(v0 rc2025-01-28T19:25:31.306508+0000 b1 3=1+2) 0x55b1a4fac680]}, top=0
2025-01-28T19:25:46.153+0000 7f9626cc5640 20 mds.0.cache.ino(0x10000000001) scrub_initialize with scrub_version 6
2025-01-28T19:25:46.153+0000 7f9626cc5640 20 mds.0.cache.ino(0x10000000001) uninline_initialize with scrub_version 6
2025-01-28T19:25:46.153+0000 7f9626cc5640 20 mds.0.scrubstack enqueue [inode 0x10000000001 [...2,head] /dir_x/dir_xx/ auth v6 f(v0 m2025-01-28T19:25:31.193448+0000 1=0+1) n(v0 rc2025-01-28T19:25:31.306508+0000 b1 3=1+2) 0x55b1a4fac680] to bottom of ScrubStack
2025-01-28T19:25:46.153+0000 7f9626cc5640 20 mds.0.cache.dir(0x10000000000) get_num_head_items() = 1; fnode.fragstat.nfiles=0 fnode.fragstat.nsubdirs=1
2025-01-28T19:25:46.153+0000 7f9626cc5640 20 mds.0.cache.dir(0x10000000000) total of child dentries: n(v0 rc2025-01-28T19:25:31.306508+0000 b1 3=1+2)
2025-01-28T19:25:46.153+0000 7f9626cc5640 20 mds.0.cache.dir(0x10000000000) my rstats: n(v0 rc2025-01-28T19:25:31.306508+0000 b1 3=1+2)
2025-01-28T19:25:46.153+0000 7f9626cc5640 10 mds.0.cache.dir(0x10000000000) check_rstats complete on 0x55b1a50fa880
2025-01-28T19:25:46.153+0000 7f9626cc5640 20 mds.0.cache.dir(0x10000000000) scrub_finished
2025-01-28T19:25:46.153+0000 7f9626cc5640 10 mds.0.cache.dir(0x10000000000) auth_unpin by 0x55b1a4f7b600 on [dir 0x10000000000 /dir_x/ [2,head] auth v=8 cv=7/7 state=1610612737|complete f(v0 m2025-01-28T19:25:31.191802+0000 1=0+1) n(v0 rc2025-01-28T19:25:31.306508+0000 b1 3=1+2) hs=1+0,ss=0+0 | child=1 dirty=1 waiter=0 authpin=0 scrubqueue=1 0x55b1a50fa880] count now 0
2025-01-28T19:25:46.153+0000 7f9626cc5640 10 mds.0.scrubstack scrub_dirfrag done
2025-01-28T19:25:46.153+0000 7f9626cc5640 20 mds.0.scrubstack kick_off_scrubs dirfrag, done
2025-01-28T19:25:46.153+0000 7f9626cc5640 20 mds.0.scrubstack dequeue [dir 0x10000000000 /dir_x/ [2,head] auth v=8 cv=7/7 state=1610612737|complete f(v0 m2025-01-28T19:25:31.191802+0000 1=0+1) n(v0 rc2025-01-28T19:25:31.306508+0000 b1 3=1+2) hs=1+0,ss=0+0 | child=1 dirty=1 waiter=0 authpin=0 scrubqueue=1 0x55b1a50fa880] from ScrubStack
2025-01-28T19:25:46.153+0000 7f9626cc5640 20 mds.0.scrubstack kick_off_scrubs examining [inode 0x10000000001 [...2,head] /dir_x/dir_xx/ auth v6 f(v0 m2025-01-28T19:25:31.193448+0000 1=0+1) n(v0 rc2025-01-28T19:25:31.306508+0000 b1 3=1+2) | scrubqueue=1 0x55b1a4fac680]
2025-01-28T19:25:46.153+0000 7f9626cc5640 20 mds.0.cache.dir(0x10000000000) can_auth_pin: auth!
2025-01-28T19:25:46.153+0000 7f9626cc5640 10 mds.0.scrubstack scrub_dir_inode [inode 0x10000000001 [...2,head] /dir_x/dir_xx/ auth v6 f(v0 m2025-01-28T19:25:31.193448+0000 1=0+1) n(v0 rc2025-01-28T19:25:31.306508+0000 b1 3=1+2) | scrubqueue=1 0x55b1a4fac680]
2025-01-28T19:25:46.153+0000 7f9626cc5640 20 mds.0.scrubstack scrub_dir_inode recursive mode, frags [*]
2025-01-28T19:25:46.153+0000 7f9626cc5640 15 mds.0.cache.ino(0x10000000001) maybe_export_pin update=0 [inode 0x10000000001 [...2,head] /dir_x/dir_xx/ auth v6 f(v0 m2025-01-28T19:25:31.193448+0000 1=0+1) n(v0 rc2025-01-28T19:25:31.306508+0000 b1 3=1+2) | scrubqueue=1 0x55b1a4fac680]
2025-01-28T19:25:46.153+0000 7f9626cc5640 20 mds.0.cache.dir(0x10000000001) can_auth_pin: auth!
2025-01-28T19:25:46.153+0000 7f9626cc5640 20 mds.0.scrubstack scrub_dir_inode barebones [dir 0x10000000001 /dir_x/dir_xx/ [2,head] auth v=0 cv=0/0 state=1073741824 f() n() hs=0+0,ss=0+0 0x55b1a50fb180]
2025-01-28T19:25:46.153+0000 7f9626cc5640 10 mds.0.cache.dir(0x10000000001) fetch_keys 0 keys on [dir 0x10000000001 /dir_x/dir_xx/ [2,head] auth v=0 cv=0/0 state=1073741824 f() n() hs=0+0,ss=0+0 0x55b1a50fb180]
2025-01-28T19:25:46.153+0000 7f9626cc5640 10 mds.0.cache.dir(0x10000000001) auth_pin by 0x55b1a50fb180 on [dir 0x10000000001 /dir_x/dir_xx/ [2,head] auth v=0 cv=0/0 ap=1+0 state=1073741824 f() n() hs=0+0,ss=0+0 | authpin=1 0x55b1a50fb180] count now 1
2025-01-28T19:25:46.153+0000 7f9626cc5640  1 -- [v2:172.21.10.4:6867/526112796,v1:172.21.10.4:6872/526112796] --> [v2:172.21.10.4:6802/3852331191,v1:172.21.10.4:6803/3852331191] -- osd_op(unknown.0.340:50 42.7 42:e2e07930:::10000000001.00000000:head [omap-get-header,omap-get-vals-by-keys in=4b,getxattr parent in=6b] snapc 0=[] ondisk+read+known_if_redirected+full_force+supports_pool_eio e564) -- 0x55b1a50d8c00 con 0x55b1a50d9000
2025-01-28T19:25:46.153+0000 7f9626cc5640 20 mds.0.bal hit_dir 3 pop is 1, frag * size 0 [pop IRD:[C 0.00e+00] IWR:[C 0.00e+00] RDR:[C 0.00e+00] FET:[C 1.00e+00] STR:[C 0.00e+00] *LOAD:2.0]
2025-01-28T19:25:46.153+0000 7f962ecd5640  1 -- [v2:172.21.10.4:6867/526112796,v1:172.21.10.4:6872/526112796] <== osd.0 v2:172.21.10.4:6802/3852331191 3 ==== osd_op_reply(50 10000000001.00000000 [omap-get-header,omap-get-vals-by-keys,getxattr] v0'0 uv0 ondisk = -2 ((2) No such file or directory)) ==== 248+0+0 (crc 0 0 0) 0x55b1a4444280 con 0x55b1a50d9000
2025-01-28T19:25:46.153+0000 7f96254c2640 10 MDSIOContextBase::complete: 21C_IO_Dir_OMAP_Fetched
2025-01-28T19:25:46.153+0000 7f96254c2640 10 MDSContext::complete: 21C_IO_Dir_OMAP_Fetched
2025-01-28T19:25:46.153+0000 7f96254c2640 10 mds.0.cache.dir(0x10000000001) _fetched header 0 bytes 0 keys for [dir 0x10000000001 /dir_x/dir_xx/ [2,head] auth v=0 cv=0/0 ap=1+0 state=1073741824 f() n() hs=0+0,ss=0+0 | authpin=1 0x55b1a50fb180]
2025-01-28T19:25:46.153+0000 7f96254c2640  0 mds.0.cache.dir(0x10000000001) _fetched missing object for [dir 0x10000000001 /dir_x/dir_xx/ [2,head] auth v=0 cv=0/0 ap=1+0 state=1073741824 f() n() hs=0+0,ss=0+0 | authpin=1 0x55b1a50fb180]
2025-01-28T19:25:46.153+0000 7f96254c2640 -1 log_channel(cluster) log [ERR] : dir 0x10000000001 object missing on disk; some files may be lost (/dir_x/dir_xx)
2025-01-28T19:25:46.153+0000 7f96254c2640 10 mds.0.cache.dir(0x10000000001) go_bad *
2025-01-28T19:25:46.153+0000 7f96254c2640 10 mds.0.cache.dir(0x10000000001) auth_unpin by 0x55b1a50fb180 on [dir 0x10000000001 /dir_x/dir_xx/ [2,head] auth v=0 cv=0/0 state=1073741824 f() n() hs=0+0,ss=0+0 0x55b1a50fb180] count now 0
2025-01-28T19:25:46.153+0000 7f96254c2640 11 mds.0.cache.dir(0x10000000001) finish_waiting mask 2 result -5 on [dir 0x10000000001 /dir_x/dir_xx/ [2,head] auth v=0 cv=0/0 state=1073741824 f() n() hs=0+0,ss=0+0 0x55b1a50fb180]
2025-01-28T19:25:46.153+0000 7f96254c2640 10 MDSContext::complete: 12C_RetryScrub

Note that this partially reverts 5b56098f17. That commit incorrectly marked a
dirfrag as repaired when it may not even exist in the metadata pool.

Fixes: 5b56098f17dd9abe4c15cbc7f487c0e94841beaf
Signed-off-by: Patrick Donnelly
---
 src/mds/CInode.cc     | 15 +++++++++------
 src/mds/MDCache.cc    |  4 ++++
 src/mds/ScrubStack.cc | 10 +++++++++-
 3 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index dfad411d323d8..15ddd0f1e2d55 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -4999,9 +4999,15 @@ next:
   // check each dirfrag...
   for (const auto &p : in->dirfrags) {
     CDir *dir = p.second;
-    ceph_assert(dir->get_version() > 0);
-    nest_info.add(dir->get_fnode()->accounted_rstat);
-    dir_info.add(dir->get_fnode()->accounted_fragstat);
+    /* If the dirfrag is damaged, we can not do any checks on the fragment. */
+    if (in->mdcache->mds->damage_table.is_dirfrag_damaged(dir)) {
+      results->raw_stats.error_str << "one or more dirfrags are damaged";
+      goto next;
+    } else {
+      ceph_assert(dir->get_version() > 0);
+      nest_info.add(dir->get_fnode()->accounted_rstat);
+      dir_info.add(dir->get_fnode()->accounted_fragstat);
+    }
   }
   nest_info.rsubdirs++; // it gets one to account for self
   if (const sr_t *srnode = in->get_projected_srnode(); srnode)
@@ -5015,9 +5021,6 @@ next:
         << "freshly-calculated rstats don't match existing ones (will be fixed)";
       in->mdcache->repair_inode_stats(in);
       results->raw_stats.repaired = true;
-      for (const auto &p : in->dirfrags){
-        in->mdcache->mds->damage_table.remove_dirfrag_damage_entry(p.second);
-      }
     } else {
       results->raw_stats.error_str
         << "freshly-calculated rstats don't match existing ones";
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 04e1fd7111952..06b4c2a2af0eb 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -13534,6 +13534,10 @@ void MDCache::repair_inode_stats_work(const MDRequestRef& mdr)
     ceph_assert(mdr->is_auth_pinned(diri));
     dir = diri->get_or_open_dirfrag(this, leaf);
   }
+  if (mds->damage_table.is_dirfrag_damaged(dir)) {
+    mds->server->respond_to_request(mdr, -CEPHFS_EIO);
+    return;
+  }
   if (dir->get_version() == 0) {
     ceph_assert(dir->is_auth());
     dir->fetch_keys({}, new C_MDS_RetryRequest(this, mdr));
diff --git a/src/mds/ScrubStack.cc b/src/mds/ScrubStack.cc
index 7ec77a31de3dd..974e0f1ba1422 100644
--- a/src/mds/ScrubStack.cc
+++ b/src/mds/ScrubStack.cc
@@ -382,7 +382,15 @@ void ScrubStack::scrub_dir_inode(CInode *in, bool *added_children, bool *done)
     if (queued.contains(fg))
       continue;
     CDir *dir = in->get_or_open_dirfrag(mdcache, fg);
-    if (!dir->is_auth()) {
+    if (mds->damage_table.is_dirfrag_damaged(dir)) {
+      /* N.B.: we are cowardly (and ironically) not looking at dirfrags we've
+       * noted as damaged already. The state of the dirfrag will be missing an
+       * omap (or object) or the fnode is corrupt. Neither situation the MDS
+       * presently knows how to recover from. So skip it for now.
+       */
+      dout(5) << __func__ << ": not scrubbing damaged dirfrag: " << *dir << dendl;
+      continue;
+    } else if (!dir->is_auth()) {
       if (dir->is_ambiguous_auth()) {
         dout(20) << __func__ << " ambiguous auth " << *dir << dendl;
         dir->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, gather.new_sub());
-- 
2.39.5
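
Editor's note (not part of the patch): below is a minimal, self-contained C++ sketch of the guard pattern the diff applies in ScrubStack and MDCache, namely consulting the damage table before trusting a dirfrag's on-disk state and skipping the fragment instead of retrying. The Dirfrag and DamageTable types here are simplified stand-ins for illustration only, not the real Ceph MDS classes.

/* Standalone illustration of "check the damage table, then skip" */
#include <set>
#include <cstdio>
#include <cstdint>

struct Dirfrag {
  uint64_t ino;
  unsigned version = 0;   // 0 means the on-disk fnode was never fetched
};

struct DamageTable {
  std::set<uint64_t> damaged;   // inodes whose dirfrag object is missing or corrupt
  bool is_dirfrag_damaged(const Dirfrag& d) const {
    return damaged.count(d.ino) > 0;
  }
};

// Returns true if the fragment was scrubbed, false if it was skipped.
bool scrub_dirfrag(const DamageTable& dt, const Dirfrag& d) {
  if (dt.is_dirfrag_damaged(d)) {
    // Damage the MDS cannot yet repair (missing omap/object or corrupt
    // fnode): skip it rather than looping on fetch -> fail -> retry.
    std::printf("skipping damaged dirfrag 0x%llx\n", (unsigned long long)d.ino);
    return false;
  }
  // Only here is it safe to rely on the loaded fnode.
  return d.version > 0;
}

int main() {
  DamageTable dt;
  dt.damaged.insert(0x10000000001ULL);
  Dirfrag good{0x10000000000ULL, 8};
  Dirfrag bad{0x10000000001ULL, 0};
  scrub_dirfrag(dt, good);   // scrubbed normally
  scrub_dirfrag(dt, bad);    // skipped; no retry loop
  return 0;
}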