]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-client.git/commitdiff
ceph: supply snapshot context in ceph_uninline_data()
author ethanwu <ethanwu@synology.com>
Thu, 25 Sep 2025 10:42:06 +0000 (18:42 +0800)
committer Ilya Dryomov <idryomov@gmail.com>
Mon, 9 Feb 2026 12:23:40 +0000 (13:23 +0100)
The ceph_uninline_data function was missing proper snapshot context
handling for its OSD write operations. Both CEPH_OSD_OP_CREATE and
CEPH_OSD_OP_WRITE requests were passing NULL instead of the appropriate
snapshot context, which could lead to an unnecessary object clone.

Reproducer:
../src/vstart.sh --new -x --localhost --bluestore
// turn on cephfs inline data
./bin/ceph fs set a inline_data true --yes-i-really-really-mean-it
// allow fs_a client to take snapshot
./bin/ceph auth caps client.fs_a mds 'allow rwps fsname=a' mon 'allow r fsname=a' osd 'allow rw tag cephfs data=a'
// mount cephfs with fuse, since kernel cephfs doesn't support inline write
ceph-fuse --id fs_a -m 127.0.0.1:40318 --conf ceph.conf -d /mnt/mycephfs/
// bump snapshot seq
mkdir /mnt/mycephfs/.snap/snap1
echo "foo" > /mnt/mycephfs/test
// umount and mount it again using kernel cephfs client
umount /mnt/mycephfs
mount -t ceph fs_a@.a=/ /mnt/mycephfs/ -o conf=./ceph.conf
echo "bar" >> /mnt/mycephfs/test
./bin/rados listsnaps -p cephfs.a.data $(printf "%x\n" $(stat -c %i /mnt/mycephfs/test)).00000000

will see that this object does an unnecessary clone
1000000000a.00000000 (seq:2):
cloneid snaps   size    overlap
2       2       4       []
head    -       8

but it's expected to see
10000000000.00000000 (seq:2):
cloneid snaps   size    overlap
head    -       8

since there's no snapshot between these 2 writes

The clone happened because the first OSD request (CEPH_OSD_OP_CREATE) doesn't
pass a snap context, so the object is created with snap seq 0, but the later
data writeback is equipped with a snapshot context. Since
snapc->seq (1) > object snap seq (0), the OSD performs an object clone.

This fix properly acquires the snapshot context before performing the
write operations.

Signed-off-by: ethanwu <ethanwu@synology.com>
Reviewed-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
Tested-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
fs/ceph/addr.c

index 63b75d2142102e9e217aae49689642c69e6b6e89..faecd9025ee9cf342fca5c15791a44b2bad19cbf 100644 (file)
@@ -2199,6 +2199,7 @@ int ceph_uninline_data(struct file *file)
        struct ceph_osd_request *req = NULL;
        struct ceph_cap_flush *prealloc_cf = NULL;
        struct folio *folio = NULL;
+       struct ceph_snap_context *snapc = NULL;
        u64 inline_version = CEPH_INLINE_NONE;
        struct page *pages[1];
        int err = 0;
@@ -2226,6 +2227,24 @@ int ceph_uninline_data(struct file *file)
        if (inline_version == 1) /* initial version, no data */
                goto out_uninline;
 
+       down_read(&fsc->mdsc->snap_rwsem);
+       spin_lock(&ci->i_ceph_lock);
+       if (__ceph_have_pending_cap_snap(ci)) {
+               struct ceph_cap_snap *capsnap =
+                               list_last_entry(&ci->i_cap_snaps,
+                                               struct ceph_cap_snap,
+                                               ci_item);
+               snapc = ceph_get_snap_context(capsnap->context);
+       } else {
+               if (!ci->i_head_snapc) {
+                       ci->i_head_snapc = ceph_get_snap_context(
+                               ci->i_snap_realm->cached_context);
+               }
+               snapc = ceph_get_snap_context(ci->i_head_snapc);
+       }
+       spin_unlock(&ci->i_ceph_lock);
+       up_read(&fsc->mdsc->snap_rwsem);
+
        folio = read_mapping_folio(inode->i_mapping, 0, file);
        if (IS_ERR(folio)) {
                err = PTR_ERR(folio);
@@ -2241,7 +2260,7 @@ int ceph_uninline_data(struct file *file)
        req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
                                    ceph_vino(inode), 0, &len, 0, 1,
                                    CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE,
-                                   NULL, 0, 0, false);
+                                   snapc, 0, 0, false);
        if (IS_ERR(req)) {
                err = PTR_ERR(req);
                goto out_unlock;
@@ -2257,7 +2276,7 @@ int ceph_uninline_data(struct file *file)
        req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
                                    ceph_vino(inode), 0, &len, 1, 3,
                                    CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
-                                   NULL, ci->i_truncate_seq,
+                                   snapc, ci->i_truncate_seq,
                                    ci->i_truncate_size, false);
        if (IS_ERR(req)) {
                err = PTR_ERR(req);
@@ -2320,6 +2339,7 @@ out_unlock:
                folio_put(folio);
        }
 out:
+       ceph_put_snap_context(snapc);
        ceph_free_cap_flush(prealloc_cf);
        doutc(cl, "%llx.%llx inline_version %llu = %d\n",
              ceph_vinop(inode), inline_version, err);