This change uses an unordered_map to memoize results of CInode::is_ancestor_of
so that subsequent invocations can skip directory inodes which are already
known not to be a descendant of the target directory.
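To illustrate the idea outside the MDS, here is a minimal standalone sketch of
the memoized ancestor walk. It is not part of the patch: Node and its parent
pointer are stand-ins for the CInode/CDentry parent chain.

    #include <unordered_map>
    #include <vector>

    struct Node {
      const Node* parent = nullptr;  // stand-in for the CDentry/CDir parent chain
      bool is_dir = true;
    };

    // Walk up from `other`; cache the verdict for every directory seen so later
    // queries against the same `self` can stop as soon as they hit a cached node.
    bool is_ancestor_of(const Node* self, const Node* other,
                        std::unordered_map<const Node*, bool>* visited)
    {
      std::vector<const Node*> my_visited;
      while (other) {
        if (visited && other->is_dir) {
          if (auto it = visited->find(other); it != visited->end()) {
            for (auto* n : my_visited)
              (*visited)[n] = it->second;  // propagate the cached answer
            return it->second;
          }
          my_visited.push_back(other);
        }
        if (other == self) {
          for (auto* n : my_visited)
            (*visited)[n] = true;
          return true;
        }
        other = other->parent;
      }
      for (auto* n : my_visited)
        (*visited)[n] = false;
      return false;
    }

    int main() {
      // root -> a -> b, with several leaves under b; one shared cache for all queries
      Node root, a, b;
      a.parent = &root;
      b.parent = &a;
      Node leaves[4];
      for (auto& l : leaves)
        l.parent = &b;
      std::unordered_map<const Node*, bool> visited;
      for (auto& l : leaves)
        (void)is_ancestor_of(&a, &l, &visited);  // after the first call the walk stops at b
      return 0;
    }

The split_at hunk below uses the real two-argument CInode::is_ancestor_of in
exactly this way: one visited map shared across all inodes_with_caps.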
In the worst case, this unordered_map can grow to the number of inodes in
memory, when all inodes are directories and at least one client has a cap for
each inode. In general, however, this will not be the case. Each entry in the
map is a 64-bit pointer and a bool. The total size will vary across platforms,
but with a conservative estimate of 192 bits / entry of overhead (including the
entry's linked-list pointer in the bucket), the map will grow to ~24MB per 1M
inodes.
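As a sanity check of that arithmetic (illustrative only, not part of the patch;
the 24-byte figure is the conservative per-entry estimate above):

    #include <cstddef>
    #include <cstdio>

    int main() {
      constexpr std::size_t entries = 1000000;     // ~1M directory inodes with caps
      constexpr std::size_t bytes_per_entry = 24;  // 192 bits: 8-byte key, padded bool, bucket next pointer
      std::printf("~%zu MB\n", entries * bytes_per_entry / 1000000);  // prints "~24 MB"
      return 0;
    }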
The result of this change is not eye-popping, but it does provide a significant performance advantage.
For an unpatched MDS with 1M inodes with caps in the global snaprealm (with debugging commits preceding this one):
2024-02-27T01:08:53.247+0000 7f4be40ec700 2 mds.0.cache Memory usage: total 6037860, rss 5710800, heap 215392, baseline 199008, 1000251 / 1000323 inodes have caps, 1000251 caps, 0.999928 caps per inode
...
2024-02-27T01:08:54.000+0000 7f4be18e7700 10 mds.0.cache.snaprealm(0x1 seq 3 0x55feaf85ad80) split_at: snaprealm(0x1000000043b seq 0 lc 0 cr 0 cps 1 snaps={} last_modified 0.000000 change_attr 0 0x55feb986b200) on [inode 0x1000000043b [...4,head] ~mds0/stray8/1000000043b/ auth v152 pv153 ap=3 snaprealm=0x55feb986b200 f() n(v0 1=0+1) old_inodes=1 (ilink xlockdone x=1) (isnap xlockdone x=1) (ifile excl) (iversion lock w=1 last_client=4361) caps={4361=pAsXsFs/-@6},l=4361 | request=1 lock=3 caps=1 authpin=1 0x56000423d180]
2024-02-27T01:08:54.649+0000 7f4be18e7700 10 mds.0.cache.ino(0x1000000043b) move_to_realm joining realm snaprealm(0x1000000043b seq 0 lc 0 cr 0 cps 1 snaps={} last_modified 0.000000 change_attr 0 0x55feb986b200), leaving realm snaprealm(0x1 seq 3 lc 3 cr 3 cps 1 snaps={2=snap(2 0x1 'one' 2024-02-27T01:06:29.440802+0000),3=snap(3 0x1 'two' 2024-02-27T01:06:43.209349+0000)} last_modified 2024-02-27T01:06:43.209349+0000 change_attr 2 0x55feaf85ad80)
2024-02-27T01:08:54.750+0000 7f4be18e7700 10 mds.0.cache.snaprealm(0x1 seq 3 0x55feaf85ad80) split_at: split 1 inodes
So, around 750ms to check all inodes_with_caps (1M) in the global snaprealm. This result was fairly consistent across multiple tries.
For a 100k split:
2024-02-27T04:12:27.548+0000 7f2da9dbe700 10 mds.0.cache.ino(0x1000000000f) open_snaprealm snaprealm(0x1000000000f seq 0 lc 0 cr 0 cps 1 snaps={} last_modified 0.000000 change_attr 0 0x563553c92900) parent is snaprealm(0x1 seq 2 lc 2 cr 2 cps 1 snaps={2=snap(2 0x1 '1' 2024-02-27T04:12:13.803030+0000)} last_modified 2024-02-27T04:12:13.803030+0000 change_attr 1 0x563553abed80)
2024-02-27T04:12:27.548+0000 7f2da9dbe700 10 mds.0.cache.snaprealm(0x1 seq 2 0x563553abed80) split_at: snaprealm(0x1000000000f seq 0 lc 0 cr 0 cps 1 snaps={} last_modified 0.000000 change_attr 0 0x563553c92900) on [inode 0x1000000000f [...3,head] /tmp.K9bdjohIVa/ auth v10972 ap=2 snaprealm=0x563553c92900 f(v0 m2024-02-27T04:03:37.953918+0000 1=0+1) n(v106 rc2024-02-27T04:12:27.544141+0000 rs1 99755=0+99755) old_inodes=1 (isnap xlock x=1 by 0x5636a6372900) (inest lock dirty) (ifile excl) (iversion lock w=1 last_client=20707) caps={20707=pAsLsXsFsx/AsLsXsFsx@8},l=20707 | dirtyscattered=1 request=1 lock=2 dirfrag=1 caps=1 dirtyrstat=0 dirtyparent=0 dirty=1 waiter=0 authpin=1 0x563553cfd180]
2024-02-27T04:12:28.886+0000 7f2da9dbe700 10 mds.0.cache.snaprealm(0x1 seq 2 0x563553abed80) split_at: split 100031 inodes
or about 1,338ms for a split of 100k inodes. This takes longer because the
directories are actually moved to the new snaprealm, with a lot of list
twiddling for the caps.
With this patch, we bring that down. For a split of 1 inode:
2024-02-27T02:09:48.549+0000 7ff854ad4700 2 mds.0.cache Memory usage: total 5859852, rss 4290012, heap 231776, baseline 190816, 1000312 / 1000327 inodes have caps, 1000312 caps, 0.999985 caps per inode
...
2024-02-27T02:09:48.550+0000 7ff8522cf700 10 mds.0.cache.ino(0x100000f456f) open_snaprealm snaprealm(0x100000f456f seq 0 lc 0 cr 0 cps 1 snaps={} last_modified 0.000000 change_attr 0 0x559e2b4fab40) parent is snaprealm(0x1 seq 9 lc 9 cr 9 cps 1 snaps={2=snap(2 0x1 'one' 2024-02-27T01:34:36.001053+0000),3=snap(3 0x1 'two' 2024-02-27T01:34:48.623349+0000),6=snap(6 0x1 'six' 2024-02-27T02:03:51.619896+0000),7=snap(7 0x1 'seven' 2024-02-27T02:04:28.375336+0000),8=snap(8 0x1 '1' 2024-02-27T02:06:14.170884+0000),9=snap(9 0x1 '2' 2024-02-27T02:09:47.158624+0000)} last_modified 2024-02-27T02:09:47.158624+0000 change_attr 6 0x559dfd4ad8c0)
2024-02-27T02:09:48.550+0000 7ff8522cf700 10 mds.0.cache.snaprealm(0x1 seq 9 0x559dfd4ad8c0) split_at: snaprealm(0x100000f456f seq 0 lc 0 cr 0 cps 1 snaps={} last_modified 0.000000 change_attr 0 0x559e2b4fab40) on [inode 0x100000f456f [...a,head] ~mds0/stray2/100000f456f/ auth v1164 pv1165 ap=3 snaprealm=0x559e2b4fab40 DIRTYPARENT f() n(v0 1=0+1) old_inodes=1 (ilink xlockdone x=1) (isnap xlockdone x=1) (inest lock) (ifile excl) (iversion lock w=1 last_client=4365) caps={4365=pAsLsXsFsx/AsLsXsFsx@6},l=4365 | request=1 lock=3 dirfrag=1 caps=1 dirtyparent=1 dirty=1 waiter=0 authpin=1 0x559e8a8bd600]
2024-02-27T02:09:48.550+0000 7ff8522cf700 10 mds.0.cache.snaprealm(0x1 seq 9 0x559dfd4ad8c0) open_children are 0x559dfd4add40,0x559e1cca1d40
2024-02-27T02:09:48.919+0000 7ff8522cf700 10 mds.0.cache.snaprealm(0x1 seq 9 0x559dfd4ad8c0) split_at: split 1 inodes
or about 370ms. This was also fairly consistent across multiple tries.
For a 100k split:
2024-02-27T01:52:24.500+0000 7ff8522cf700 10 mds.0.cache.snaprealm(0x1 seq 3 0x559dfd4ad8c0) split_at: snaprealm(0x10000000013 seq 0 lc 0 cr 0 cps 1 snaps={} last_modified 0.000000 change_attr 0 0x559e1cca1d40) on [inode 0x10000000013 [...5,head] /tmp.RIUAaU5wuE/ auth v10499 ap=2 snaprealm=0x559e1cca1d40 f(v0 m2024-02-27T01:16:04.611198+0000 1=0+1) n(v122 rc2024-02-27T01:52:24.495465+0000 rs1 100031=0+100031) old_inodes=1 (isnap xlock x=1 by 0x559ef038a880) (inest lock) (ifile excl) (iversion lock w=1 last_client=4365) caps={4365=pAsLsXsFsx/-@11},l=4365 | dirtyscattered=0 request=1 lock=2 dirfrag=1 caps=1 dirty=1 waiter=0 authpin=1 0x559e0238c580]
2024-02-27T01:52:24.500+0000 7ff8522cf700 10 mds.0.cache.snaprealm(0x1 seq 3 0x559dfd4ad8c0) open_children are 0x559dfd4add40
2024-02-27T01:52:25.338+0000 7ff8522cf700 10 mds.0.cache.snaprealm(0x1 seq 3 0x559dfd4ad8c0) split_at: split 100031 inodes
or about 840ms. Triggering such a split is easily done by making a directory in
one of the trees created by the reproducer (see below).
Reproducing can be done with:
for ((i =0; i < 10; i++)); do (pushd $(mktemp -d -p . ); for ((j = 0; j < 30; ++j)); do mkdir "$j"; pushd "$j"; done; for ((j = 0; j < 10; ++j)); do for ((k = 0; k < 10000; ++k)); do mkdir $j.$k; done & done) & done
to make 1M directories. We put the majority of the directories in a 30-deep
nesting to exercise CInode::is_ancestor_of with a near-worst-case scenario.
Make sure all debugging configs are disabled for the MDS/clients. Make sure the
client has a cache size large enough to accommodate 1M caps. Make at least one snapshot:
mkdir .snap/one
Then reproduction can be done with:
$ mkdir tmp.qQNsTpxpvh/dir; mkdir .snap/$((++i)); rmdir tmp.qQNsTpxpvh/dir
It is not necessary to delete any snapshots to reproduce this behavior. It's
only necessary to have a lot of inodes_with_caps in a snaprealm and effect a
split.
Fixes: https://tracker.ceph.com/issues/53192
Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
(cherry picked from commit a0ccb79fa0806792c7ee666c667328a8aeb09e97)
return NULL;
}
-bool CInode::is_ancestor_of(const CInode *other) const
+bool CInode::is_ancestor_of(const CInode *other, std::unordered_map<CInode const*,bool>* visited) const
{
+ std::vector<CInode const*> my_visited = {};
while (other) {
- if (other == this)
+ if (visited && other->is_dir()) {
+ if (auto it = visited->find(other); it != visited->end()) {
+ for (auto& in : my_visited) {
+ (*visited)[in] = it->second;
+ }
+ return it->second;
+ }
+ my_visited.push_back(other); /* N.B.: this being non-empty means visited is assumed non-null */
+ }
+ if (other == this) {
+ for (auto& in : my_visited) {
+ (*visited)[in] = true;
+ }
return true;
+ }
const CDentry *pdn = other->get_oldest_parent_dn();
if (!pdn) {
ceph_assert(other->is_base());
break;
}
other = pdn->get_dir()->get_inode();
}
+ for (auto& in : my_visited) {
+ (*visited)[in] = false;
+ }
return false;
}
}
// -- misc --
- bool is_ancestor_of(const CInode *other) const;
+ bool is_ancestor_of(const CInode *other, std::unordered_map<CInode const*,bool>* visited=nullptr) const;
bool is_projected_ancestor_of(const CInode *other) const;
void make_path_string(std::string& s, bool projected=false, const CDentry *use_parent=NULL) const;
}
// split inodes_with_caps
+ std::unordered_map<CInode const*,bool> visited;
uint64_t count = 0;
+ dout(20) << " reserving space for " << CDir::count() << " dirs" << dendl;
+ visited.reserve(CDir::count()); /* a reasonable starting point; keep in mind there may be CInode directories without fragments in cache */
for (auto p = inodes_with_caps.begin(); !p.end(); ) {
CInode *in = *p;
++p;
// does inode fall within the child realm?
- if (child->inode->is_ancestor_of(in)) {
+ if (child->inode->is_ancestor_of(in, &visited)) {
dout(25) << " child gets " << *in << dendl;
in->move_to_realm(child);
++count;
} else {
dout(25) << " keeping " << *in << dendl;
}
}
+ dout(20) << " visited " << visited.size() << " directories" << dendl;
dout(10) << __func__ << ": split " << count << " inodes" << dendl;
}