git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-client.git/commitdiff
ceph: parse subvolume_id from InodeStat v9 and store in inode
author Alex Markuze <amarkuze@redhat.com>
Tue, 10 Feb 2026 09:06:25 +0000 (09:06 +0000)
committer Ilya Dryomov <idryomov@gmail.com>
Mon, 20 Apr 2026 14:44:00 +0000 (16:44 +0200)
Add support for parsing the subvolume_id field from InodeStat v9 and
storing it in the inode for later use by subvolume metrics tracking.

The subvolume_id identifies which CephFS subvolume an inode belongs to,
enabling per-subvolume I/O metrics collection and reporting.

This patch:
- Adds subvolume_id field to struct ceph_mds_reply_info_in
- Adds i_subvolume_id field to struct ceph_inode_info
- Parses subvolume_id from v9 InodeStat in parse_reply_info_in()
- Adds ceph_inode_set_subvolume() helper to propagate the ID to inodes
- Initializes i_subvolume_id at inode allocation and resets it at inode eviction

Signed-off-by: Alex Markuze <amarkuze@redhat.com>
Reviewed-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
fs/ceph/inode.c
fs/ceph/mds_client.c
fs/ceph/mds_client.h
fs/ceph/super.h

index 108492a8779dcae7075f421f0006d9ba0c6c50bc..1f7dc35425eb7287bd841a15b0edc60746d2c4d2 100644 (file)
@@ -639,6 +639,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 
        ci->i_max_bytes = 0;
        ci->i_max_files = 0;
+       ci->i_subvolume_id = CEPH_SUBVOLUME_ID_NONE;
 
        memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
        memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
@@ -744,6 +745,8 @@ void ceph_evict_inode(struct inode *inode)
 
        percpu_counter_dec(&mdsc->metric.total_inodes);
 
+       ci->i_subvolume_id = CEPH_SUBVOLUME_ID_NONE;
+
        netfs_wait_for_outstanding_io(inode);
        truncate_inode_pages_final(&inode->i_data);
        if (inode_state_read_once(inode) & I_PINNING_NETFS_WB)
@@ -885,6 +888,40 @@ int ceph_fill_file_size(struct inode *inode, int issued,
        return queue_trunc;
 }
 
+/*
+ * Record which CephFS subvolume an inode belongs to, based on the
+ * subvolume_id carried in an MDS reply.
+ *
+ * CEPH_SUBVOLUME_ID_NONE (0) means unknown/unset, so a call carrying that
+ * value (or a NULL inode) leaves the inode untouched.
+ *
+ * Subvolume membership is treated as immutable: the ID is stored only
+ * while the inode still holds CEPH_SUBVOLUME_ID_NONE.  Being handed the
+ * ID that is already stored is a no-op, which lets callers pass the value
+ * from every reply they process.  Being handed a different valid ID
+ * indicates a bug: warn once and keep the stored value.
+ */
+void ceph_inode_set_subvolume(struct inode *inode, u64 subvolume_id)
+{
+       struct ceph_inode_info *ci;
+       u64 cur;
+
+       if (!inode || subvolume_id == CEPH_SUBVOLUME_ID_NONE)
+               return;
+
+       ci = ceph_inode(inode);
+       cur = READ_ONCE(ci->i_subvolume_id);
+
+       if (cur == CEPH_SUBVOLUME_ID_NONE) {
+               /* first valid ID seen for this inode */
+               WRITE_ONCE(ci->i_subvolume_id, subvolume_id);
+               return;
+       }
+
+       /* an already-set ID must never be replaced by a different one */
+       WARN_ON_ONCE(cur != subvolume_id);
+}
+
 void ceph_fill_file_time(struct inode *inode, int issued,
                         u64 time_warp_seq, struct timespec64 *ctime,
                         struct timespec64 *mtime, struct timespec64 *atime)
@@ -1088,6 +1125,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
        new_issued = ~issued & info_caps;
 
        __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files);
+       ceph_inode_set_subvolume(inode, iinfo->subvolume_id);
 
 #ifdef CONFIG_FS_ENCRYPTION
        if (iinfo->fscrypt_auth_len &&
@@ -1598,6 +1636,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
                        goto done;
                }
                if (parent_dir) {
+                       ceph_inode_set_subvolume(parent_dir,
+                                                rinfo->diri.subvolume_id);
                        err = ceph_fill_inode(parent_dir, NULL, &rinfo->diri,
                                              rinfo->dirfrag, session, -1,
                                              &req->r_caps_reservation);
@@ -1686,6 +1726,7 @@ retry_lookup:
                BUG_ON(!req->r_target_inode);
 
                in = req->r_target_inode;
+               ceph_inode_set_subvolume(in, rinfo->targeti.subvolume_id);
                err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti,
                                NULL, session,
                                (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
index b2a07185cecd739d8894b36a233f68d3f440b421..dc363b0173c122eee3d74c9cf3981a564072a646 100644 (file)
@@ -96,19 +96,19 @@ bad:
        return -EIO;
 }
 
-/*
- * parse individual inode info
- */
 static int parse_reply_info_in(void **p, void *end,
                               struct ceph_mds_reply_info_in *info,
-                              u64 features)
+                              u64 features,
+                              struct ceph_mds_client *mdsc)
 {
        int err = 0;
        u8 struct_v = 0;
+       u8 struct_compat = 0;
+       u32 struct_len = 0;
+
+       info->subvolume_id = CEPH_SUBVOLUME_ID_NONE;
 
        if (features == (u64)-1) {
-               u32 struct_len;
-               u8 struct_compat;
                ceph_decode_8_safe(p, end, struct_v, bad);
                ceph_decode_8_safe(p, end, struct_compat, bad);
                /* struct_v is expected to be >= 1. we only understand
@@ -252,6 +252,10 @@ static int parse_reply_info_in(void **p, void *end,
                        ceph_decode_skip_n(p, end, v8_struct_len, bad);
                }
 
+               /* struct_v 9 added subvolume_id */
+               if (struct_v >= 9)
+                       ceph_decode_64_safe(p, end, info->subvolume_id, bad);
+
                *p = end;
        } else {
                /* legacy (unversioned) struct */
@@ -394,12 +398,13 @@ bad:
  */
 static int parse_reply_info_trace(void **p, void *end,
                                  struct ceph_mds_reply_info_parsed *info,
-                                 u64 features)
+                                 u64 features,
+                                 struct ceph_mds_client *mdsc)
 {
        int err;
 
        if (info->head->is_dentry) {
-               err = parse_reply_info_in(p, end, &info->diri, features);
+               err = parse_reply_info_in(p, end, &info->diri, features, mdsc);
                if (err < 0)
                        goto out_bad;
 
@@ -419,7 +424,8 @@ static int parse_reply_info_trace(void **p, void *end,
        }
 
        if (info->head->is_target) {
-               err = parse_reply_info_in(p, end, &info->targeti, features);
+               err = parse_reply_info_in(p, end, &info->targeti, features,
+                                         mdsc);
                if (err < 0)
                        goto out_bad;
        }
@@ -440,7 +446,8 @@ out_bad:
  */
 static int parse_reply_info_readdir(void **p, void *end,
                                    struct ceph_mds_request *req,
-                                   u64 features)
+                                   u64 features,
+                                   struct ceph_mds_client *mdsc)
 {
        struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;
        struct ceph_client *cl = req->r_mdsc->fsc->client;
@@ -555,7 +562,7 @@ static int parse_reply_info_readdir(void **p, void *end,
                rde->name_len = oname.len;
 
                /* inode */
-               err = parse_reply_info_in(p, end, &rde->inode, features);
+               err = parse_reply_info_in(p, end, &rde->inode, features, mdsc);
                if (err < 0)
                        goto out_bad;
                /* ceph_readdir_prepopulate() will update it */
@@ -763,7 +770,8 @@ static int parse_reply_info_extra(void **p, void *end,
        if (op == CEPH_MDS_OP_GETFILELOCK)
                return parse_reply_info_filelock(p, end, info, features);
        else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
-               return parse_reply_info_readdir(p, end, req, features);
+               return parse_reply_info_readdir(p, end, req, features,
+                                               req->r_mdsc);
        else if (op == CEPH_MDS_OP_CREATE)
                return parse_reply_info_create(p, end, info, features, s);
        else if (op == CEPH_MDS_OP_GETVXATTR)
@@ -792,7 +800,8 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
        ceph_decode_32_safe(&p, end, len, bad);
        if (len > 0) {
                ceph_decode_need(&p, end, len, bad);
-               err = parse_reply_info_trace(&p, p+len, info, features);
+               err = parse_reply_info_trace(&p, p + len, info, features,
+                                            s->s_mdsc);
                if (err < 0)
                        goto out_bad;
        }
@@ -801,7 +810,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
        ceph_decode_32_safe(&p, end, len, bad);
        if (len > 0) {
                ceph_decode_need(&p, end, len, bad);
-               err = parse_reply_info_extra(&p, p+len, req, features, s);
+               err = parse_reply_info_extra(&p, p + len, req, features, s);
                if (err < 0)
                        goto out_bad;
        }
@@ -4033,6 +4042,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
                        goto out_err;
                }
                req->r_target_inode = in;
+               ceph_inode_set_subvolume(in, rinfo->targeti.subvolume_id);
        }
 
        mutex_lock(&session->s_mutex);
index 0428a5eaf28c658eca3f572243818c4e8c50bae4..bd3690baa65c0203ec793690c7d295fc02ead139 100644 (file)
@@ -118,6 +118,7 @@ struct ceph_mds_reply_info_in {
        u32 fscrypt_file_len;
        u64 rsnaps;
        u64 change_attr;
+       u64 subvolume_id;
 };
 
 struct ceph_mds_reply_dir_entry {
index 2ade9feca410e6930af07729b4c43c14908a0533..5f96620967721a0e020c9af7bd240cbc1f23830d 100644 (file)
@@ -399,6 +399,15 @@ struct ceph_inode_info {
        /* quotas */
        u64 i_max_bytes, i_max_files;
 
+       /*
+        * Subvolume ID this inode belongs to. CEPH_SUBVOLUME_ID_NONE (0)
+        * means unknown/unset, matching the FUSE client convention.
+        * Once set to a valid (non-zero) value, it should not change
+        * during the inode's lifetime.
+        */
+#define CEPH_SUBVOLUME_ID_NONE 0
+       u64 i_subvolume_id;
+
        s32 i_dir_pin;
 
        struct rb_root i_fragtree;
@@ -1092,6 +1101,7 @@ extern struct inode *ceph_get_snapdir(struct inode *parent);
 extern int ceph_fill_file_size(struct inode *inode, int issued,
                               u32 truncate_seq, u64 truncate_size,
                               u64 size, int newcaps);
+extern void ceph_inode_set_subvolume(struct inode *inode, u64 subvolume_id);
 extern void ceph_fill_file_time(struct inode *inode, int issued,
                                u64 time_warp_seq, struct timespec64 *ctime,
                                struct timespec64 *mtime,