# particular operation causing this is journal flush which causes the
# MDS to wait for cap revoke.
self.mount_a.resume_netns()
+
+class TestSkipReplayInoTable(CephFSTestCase):
+ MDSS_REQUIRED = 1
+ CLIENTS_REQUIRED = 1
+
+ def test_alloc_cinode_assert(self):
+ """
+ Test alloc CInode assert.
+
+ See: https://tracker.ceph.com/issues/52280
+ """
+
+ # Create a directory; the MDS will journal this and then crash
+ self.mount_a.run_shell(["rm", "-rf", "test_alloc_ino"])
+ self.mount_a.run_shell(["mkdir", "test_alloc_ino"])
+
+ status = self.fs.status()
+ rank0 = self.fs.get_rank(rank=0, status=status)
+
+ self.fs.mds_asok(['config', 'set', 'mds_kill_skip_replaying_inotable', "true"])
+ # This will make the MDS crash. Since we only have one MDS in the
+ # cluster, without "wait=False" this call would get stuck here forever.
+ self.mount_a.run_shell(["mkdir", "test_alloc_ino/dir1"], wait=False)
+ self.fs.mds_asok(['flush', 'journal'])
+
+ # Now set the mds config to skip replaying the inotable
+ self.fs.set_ceph_conf('mds', 'mds_inject_skip_replaying_inotable', True)
+ self.fs.set_ceph_conf('mds', 'mds_wipe_sessions', True)
+
+ # sleep 5 seconds to make sure the journal log is flushed and applied
+ time.sleep(5)
+ self.fs.mds_restart()
+ # sleep 5 seconds to make sure the mds tell command won't get stuck
+ time.sleep(5)
+ self.fs.wait_for_daemons()
+
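+ # The MDS crash above is expected; clean up its core dump so the run
+ # isn't flagged as a failure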
+ self.delete_mds_coredump(rank0['name'])
+
+ self.mount_a.run_shell(["mkdir", "test_alloc_ino/dir2"])
+
+ ls_out = set(self.mount_a.ls("test_alloc_ino/"))
+ self.assertEqual(ls_out, {"dir1", "dir2"})
void finish(int r) override {
ceph_assert(r == 0);
+ // crash the current MDS; the replacing MDS will then exercise journal replay
+ ceph_assert(!g_conf()->mds_kill_skip_replaying_inotable);
+
dn->pop_projected_linkage();
// dirty inode, dn, dir
void finish(int r) override {
ceph_assert(r == 0);
+ // crash the current MDS; the replacing MDS will then exercise journal replay
+ ceph_assert(!g_conf()->mds_kill_skip_replaying_inotable);
+
// link the inode
dn->pop_projected_linkage();
journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
mds->balancer->maybe_fragment(dir, false);
+
+ // flush the journal as soon as possible
+ if (g_conf()->mds_kill_skip_replaying_inotable) {
+ mdlog->flush();
+ }
}
ls.push_back(new EMetaBlob());
}
-void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, MDPeerUpdate *peerup)
+void EMetaBlob::replay(MDSRank *mds, LogSegment *logseg, int type, MDPeerUpdate *peerup)
{
dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps by " << client_name << dendl;
logseg->open_files.push_back(&in->item_open_file);
}
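+ // QA-only injection: when mds_inject_skip_replaying_inotable is set, the
+ // EUpdate replay below skips applying the inotable and sessionmap updates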
+ bool skip_replaying_inotable = g_conf()->mds_inject_skip_replaying_inotable;
+
// allocated_inos
if (inotablev) {
- if (mds->inotable->get_version() >= inotablev) {
+ if (mds->inotable->get_version() >= inotablev ||
+ unlikely(type == EVENT_UPDATE && skip_replaying_inotable)) {
dout(10) << "EMetaBlob.replay inotable tablev " << inotablev
<< " <= table " << mds->inotable->get_version() << dendl;
if (allocated_ino)
}
}
if (sessionmapv) {
- if (mds->sessionmap.get_version() >= sessionmapv) {
+ if (mds->sessionmap.get_version() >= sessionmapv ||
+ unlikely(type == EVENT_UPDATE && skip_replaying_inotable)) {
dout(10) << "EMetaBlob.replay sessionmap v " << sessionmapv
<< " <= table " << mds->sessionmap.get_version() << dendl;
if (used_preallocated_ino)
void EUpdate::replay(MDSRank *mds)
{
auto&& segment = get_segment();
- metablob.replay(mds, segment);
+ dout(10) << "EUpdate::replay" << dendl;
+ metablob.replay(mds, segment, EVENT_UPDATE);
if (had_peers) {
dout(10) << "EUpdate.replay " << reqid << " had peers, expecting a matching ECommitted" << dendl;
{
dout(10) << "EOpen.replay " << dendl;
auto&& segment = get_segment();
- metablob.replay(mds, segment);
+ metablob.replay(mds, segment, EVENT_OPEN);
// note which segments inodes belong to, so we don't have to start rejournaling them
for (const auto &ino : inos) {
dout(10) << "EPeerUpdate.replay prepare " << reqid << " for mds." << leader
<< ": applying commit, saving rollback info" << dendl;
su = new MDPeerUpdate(origop, rollback);
- commit.replay(mds, segment, su);
+ commit.replay(mds, segment, EVENT_PEERUPDATE, su);
mds->mdcache->add_uncommitted_peer(reqid, segment, leader, su);
break;
case EPeerUpdate::OP_ROLLBACK:
dout(10) << "EPeerUpdate.replay abort " << reqid << " for mds." << leader
<< ": applying rollback commit blob" << dendl;
- commit.replay(mds, segment);
+ commit.replay(mds, segment, EVENT_PEERUPDATE);
mds->mdcache->finish_uncommitted_peer(reqid, false);
break;
// first, stick the spanning tree in my cache
//metablob.print(*_dout);
- metablob.replay(mds, get_segment());
+ metablob.replay(mds, get_segment(), EVENT_SUBTREEMAP);
// restore import/export maps
for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = subtrees.begin();
ceph_abort();
}
- metablob.replay(mds, segment);
+ metablob.replay(mds, segment, EVENT_FRAGMENT);
if (in && g_conf()->mds_debug_frag)
in->verify_dirfrags();
}
{
dout(10) << "EExport.replay " << base << dendl;
auto&& segment = get_segment();
- metablob.replay(mds, segment);
+ metablob.replay(mds, segment, EVENT_EXPORT);
CDir *dir = mds->mdcache->get_dirfrag(base);
ceph_assert(dir);
dout(10) << "EImportStart.replay " << base << " bounds " << bounds << dendl;
//metablob.print(*_dout);
auto&& segment = get_segment();
- metablob.replay(mds, segment);
+ metablob.replay(mds, segment, EVENT_IMPORTSTART);
// put in ambiguous import list
mds->mdcache->add_ambiguous_import(base, bounds);