From: Yan, Zheng Date: Tue, 15 Nov 2016 12:59:54 +0000 (+0800) Subject: mds: force client flush snap data before truncating objects X-Git-Tag: v11.1.0~169^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fpull%2F11994%2Fhead;p=ceph.git mds: force client flush snap data before truncating objects Snapshot data gets lost if the following sequence of events happens: - client writes data to a file - make a snapshot - truncate the file - MDS truncates file objects using the newest snap context - client flushes snap data using the old snap context The OSD handles the MDS's truncate request first and updates the object's snap context. When it then handles the client's write request, the OSD finds that the object's snap context is newer than the request's snap context, so it uses the newer one and treats the data as if it were written after the snapshot. The fix is to avoid touching file objects while clients may have unflushed snap data. Before truncating file objects, the MDS checks whether clients may have unflushed snap data. If they do, the MDS sets the filelock to a special unstable state that revokes the Fb capability. The MDS starts truncating file objects only after the Fb capability has been revoked. 
Fixes: http://tracker.ceph.com/issues/17193 Signed-off-by: Yan, Zheng --- diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 3eff5ebd3e66..fd794ce9205c 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -1592,6 +1592,7 @@ void Locker::_finish_xlock(SimpleLock *lock, client_t xlocker, bool *pneed_issue if (lock->get_num_rdlocks() == 0 && lock->get_num_wrlocks() == 0 && lock->get_num_client_lease() == 0 && + lock->get_state() != LOCK_XLOCKSNAP && lock->get_type() != CEPH_LOCK_DN) { CInode *in = static_cast(lock->get_parent()); client_t loner = in->get_target_loner(); @@ -1608,7 +1609,7 @@ void Locker::_finish_xlock(SimpleLock *lock, client_t xlocker, bool *pneed_issue } } // the xlocker may have CEPH_CAP_GSHARED, need to revoke it if next state is LOCK_LOCK - eval_gather(lock, true, pneed_issue); + eval_gather(lock, lock->get_state() != LOCK_XLOCKSNAP, pneed_issue); } void Locker::xlock_finish(SimpleLock *lock, MutationImpl *mut, bool *pneed_issue) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 06e7520b865b..0307f01a1ae7 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -6146,7 +6146,11 @@ void MDCache::identify_files_to_recover() } if (recover) { - in->auth_pin(&in->filelock); + if (in->filelock.is_stable()) { + in->auth_pin(&in->filelock); + } else { + assert(in->filelock.get_state() == LOCK_XLOCKSNAP); + } in->filelock.set_state(LOCK_PRE_SCAN); rejoin_recover_q.push_back(in); } else { @@ -6158,6 +6162,8 @@ void MDCache::identify_files_to_recover() void MDCache::start_files_to_recover() { for (CInode *in : rejoin_check_q) { + if (in->filelock.get_state() == LOCK_XLOCKSNAP) + mds->locker->issue_caps(in); mds->locker->check_inode_max_size(in); } rejoin_check_q.clear(); @@ -6181,6 +6187,17 @@ void MDCache::do_file_recover() // ---------------------------- // truncate +class C_MDC_RetryTruncate : public MDCacheContext { + CInode *in; + LogSegment *ls; +public: + C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) : + 
MDCacheContext(c), in(i), ls(l) {} + void finish(int r) { + mdcache->_truncate_inode(in, ls); + } +}; + void MDCache::truncate_inode(CInode *in, LogSegment *ls) { inode_t *pi = in->get_projected_inode(); @@ -6191,6 +6208,15 @@ void MDCache::truncate_inode(CInode *in, LogSegment *ls) ls->truncating_inodes.insert(in); in->get(CInode::PIN_TRUNCATING); + in->auth_pin(this); + + if (!in->client_need_snapflush.empty() && + (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) { + assert(in->filelock.is_xlocked()); + in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls)); + mds->locker->issue_caps(in); + return; + } _truncate_inode(in, ls); } @@ -6218,7 +6244,6 @@ void MDCache::_truncate_inode(CInode *in, LogSegment *ls) assert(pi->truncate_from < (1ULL << 63)); assert(pi->truncate_size < pi->truncate_from); - in->auth_pin(this); SnapRealm *realm = in->find_snaprealm(); SnapContext nullsnap; @@ -6328,8 +6353,21 @@ void MDCache::start_recovered_truncates() LogSegment *ls = p->second; for (set::iterator q = ls->truncating_inodes.begin(); q != ls->truncating_inodes.end(); - ++q) - _truncate_inode(*q, ls); + ++q) { + CInode *in = *q; + in->auth_pin(this); + + if (!in->client_need_snapflush.empty() && + (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) { + assert(in->filelock.is_stable()); + in->filelock.set_state(LOCK_XLOCKDONE); + in->auth_pin(&in->filelock); + in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls)); + // start_files_to_recover will revoke caps + continue; + } + _truncate_inode(in, ls); + } } } diff --git a/src/mds/ScatterLock.h b/src/mds/ScatterLock.h index 78bd474a2c82..210557837e11 100644 --- a/src/mds/ScatterLock.h +++ b/src/mds/ScatterLock.h @@ -96,6 +96,14 @@ public: get_state() == LOCK_MIX; } + void set_xlock_snap_sync(MDSInternalContextBase *c) + { + assert(get_type() == CEPH_LOCK_IFILE); + assert(state == LOCK_XLOCK || state == LOCK_XLOCKDONE); + state = LOCK_XLOCKSNAP; + add_waiter(WAIT_STABLE, c); + } + xlist::item 
*get_updated_item() { return &more()->item_updated; } utime_t get_update_stamp() { diff --git a/src/mds/SimpleLock.h b/src/mds/SimpleLock.h index 736fab5cb4c9..455875c536d5 100644 --- a/src/mds/SimpleLock.h +++ b/src/mds/SimpleLock.h @@ -101,6 +101,7 @@ public: case LOCK_PREXLOCK: return "prexlock"; case LOCK_XLOCK: return "xlock"; case LOCK_XLOCKDONE: return "xlockdone"; + case LOCK_XLOCKSNAP: return "xlocksnap"; case LOCK_LOCK_XLOCK: return "lock->xlock"; case LOCK_SYNC_LOCK: return "sync->lock"; @@ -498,7 +499,8 @@ public: more()->xlock_by.reset(); } void put_xlock() { - assert(state == LOCK_XLOCK || state == LOCK_XLOCKDONE || is_locallock() || + assert(state == LOCK_XLOCK || state == LOCK_XLOCKDONE || + state == LOCK_XLOCKSNAP || is_locallock() || state == LOCK_LOCK /* if we are a master of a slave */); --more()->num_xlock; parent->put(MDSCacheObject::PIN_LOCK); diff --git a/src/mds/locks.c b/src/mds/locks.c index aa612367c8a1..2fc0a5fc0419 100644 --- a/src/mds/locks.c +++ b/src/mds/locks.c @@ -99,6 +99,7 @@ const struct sm_state_t filelock[LOCK_MAX] = { [LOCK_PREXLOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0, XCL, 0, 0, 0, 0, ANY, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 }, [LOCK_XLOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0, XCL, 0, 0, 0, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 }, [LOCK_XLOCKDONE] = { LOCK_LOCK, false, LOCK_LOCK, XCL, XCL, XCL, 0, 0, XCL, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,CEPH_CAP_GSHARED,0 }, + [LOCK_XLOCKSNAP] = { LOCK_LOCK, false, LOCK_LOCK, 0, XCL, 0, 0, 0, 0, 0, CEPH_CAP_GCACHE,0,0,0 }, [LOCK_LOCK_XLOCK]= { LOCK_PREXLOCK,false,LOCK_LOCK,0, XCL, 0, 0, 0, 0, XCL, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 }, [LOCK_MIX] = { 0, false, LOCK_MIX, 0, 0, REQ, ANY, 0, 0, 0, CEPH_CAP_GRD|CEPH_CAP_GWR|CEPH_CAP_GLAZYIO,0,0,CEPH_CAP_GRD }, diff --git a/src/mds/locks.h b/src/mds/locks.h index d1585cec5760..9f4ea566b4a1 100644 --- a/src/mds/locks.h +++ b/src/mds/locks.h @@ -52,6 +52,7 @@ enum { LOCK_PREXLOCK, // A . . .. . . / . . (lock) LOCK_XLOCK, // A . . 
.. . . / . . (lock) LOCK_XLOCKDONE, // A r p rd l x / . . (lock) <-- by same client only!! + LOCK_XLOCKSNAP, // also revoke Fb LOCK_LOCK_XLOCK, LOCK_SYNC_LOCK, // AR R . .. . . / . . R .. . . / . .