From: Yan, Zheng
Date: Tue, 15 Nov 2016 12:59:54 +0000 (+0800)
Subject: mds: force client flush snap data before truncating objects
X-Git-Tag: v10.2.6~112^2
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=e1af490200fcdf6b3f8fb48bc96278f739c2438b;p=ceph.git

mds: force client flush snap data before truncating objects

Snapshot data can get lost if the following sequence of events happens:

- a client writes data to a file
- a snapshot is taken
- the file is truncated
- the MDS truncates the file objects using the newest snap context
- the client flushes its snap data using the old snap context

The OSD handles the MDS's truncate request first and updates the
object's snap context. When it later handles the client's write
request, it finds that the object's snap context is newer than the
request's snap context, so it uses the newer one and treats the data
as if it had been written after the snapshot.

The fix is to avoid touching file objects while clients may still have
unflushed snap data. Before truncating file objects, the MDS checks
whether any client may have unflushed snap data. If so, the MDS puts
the filelock into a special unstable state that revokes the Fb
capability, and only starts truncating the file objects once the Fb
capability has been revoked.

[An illustrative sketch of the enforced ordering follows the diff below.]

Fixes: http://tracker.ceph.com/issues/17193
Signed-off-by: Yan, Zheng
(cherry picked from commit 9c65920e7f6febe294e25a67473693ce6f9adfa7)
---
diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 10fff9848c9..aef41dea976 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -1578,6 +1578,7 @@ void Locker::_finish_xlock(SimpleLock *lock, client_t xlocker, bool *pneed_issue
   if (lock->get_num_rdlocks() == 0 &&
       lock->get_num_wrlocks() == 0 &&
       lock->get_num_client_lease() == 0 &&
+      lock->get_state() != LOCK_XLOCKSNAP &&
       lock->get_type() != CEPH_LOCK_DN) {
     CInode *in = static_cast<CInode *>(lock->get_parent());
     client_t loner = in->get_target_loner();
@@ -1594,7 +1595,7 @@
     }
   }
   // the xlocker may have CEPH_CAP_GSHARED, need to revoke it if next state is LOCK_LOCK
-  eval_gather(lock, true, pneed_issue);
+  eval_gather(lock, lock->get_state() != LOCK_XLOCKSNAP, pneed_issue);
 }
 
 void Locker::xlock_finish(SimpleLock *lock, MutationImpl *mut, bool *pneed_issue)
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 447d2795df8..8b6bee8fa45 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -6133,7 +6133,11 @@ void MDCache::identify_files_to_recover()
     }
 
     if (recover) {
-      in->auth_pin(&in->filelock);
+      if (in->filelock.is_stable()) {
+        in->auth_pin(&in->filelock);
+      } else {
+        assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
+      }
       in->filelock.set_state(LOCK_PRE_SCAN);
       rejoin_recover_q.push_back(in);
     } else {
@@ -6145,6 +6149,8 @@
 void MDCache::start_files_to_recover()
 {
   for (CInode *in : rejoin_check_q) {
+    if (in->filelock.get_state() == LOCK_XLOCKSNAP)
+      mds->locker->issue_caps(in);
     mds->locker->check_inode_max_size(in);
   }
   rejoin_check_q.clear();
@@ -6168,6 +6174,17 @@ void MDCache::do_file_recover()
 // ----------------------------
 // truncate
 
+class C_MDC_RetryTruncate : public MDCacheContext {
+  CInode *in;
+  LogSegment *ls;
+public:
+  C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) :
+    MDCacheContext(c), in(i), ls(l) {}
+  void finish(int r) {
+    mdcache->_truncate_inode(in, ls);
+  }
+};
+
 void MDCache::truncate_inode(CInode *in, LogSegment *ls)
 {
   inode_t *pi = in->get_projected_inode();
@@ -6178,6 +6195,15 @@
   ls->truncating_inodes.insert(in);
   in->get(CInode::PIN_TRUNCATING);
+  in->auth_pin(this);
+
+  if (!in->client_need_snapflush.empty() &&
+      (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
+    assert(in->filelock.is_xlocked());
+    in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
+    mds->locker->issue_caps(in);
+    return;
+  }
 
   _truncate_inode(in, ls);
 }
 
@@ -6205,7 +6231,6 @@ void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
   assert(pi->truncate_from < (1ULL << 63));
   assert(pi->truncate_size < pi->truncate_from);
 
-  in->auth_pin(this);
 
   SnapRealm *realm = in->find_snaprealm();
   SnapContext nullsnap;
@@ -6314,8 +6339,21 @@ void MDCache::start_recovered_truncates()
     LogSegment *ls = p->second;
     for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
          q != ls->truncating_inodes.end();
-         ++q)
-      _truncate_inode(*q, ls);
+         ++q) {
+      CInode *in = *q;
+      in->auth_pin(this);
+
+      if (!in->client_need_snapflush.empty() &&
+          (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
+        assert(in->filelock.is_stable());
+        in->filelock.set_state(LOCK_XLOCKDONE);
+        in->auth_pin(&in->filelock);
+        in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
+        // start_files_to_recover will revoke caps
+        continue;
+      }
+      _truncate_inode(in, ls);
+    }
   }
 }
 
diff --git a/src/mds/ScatterLock.h b/src/mds/ScatterLock.h
index 78bd474a2c8..210557837e1 100644
--- a/src/mds/ScatterLock.h
+++ b/src/mds/ScatterLock.h
@@ -96,6 +96,14 @@ public:
       get_state() == LOCK_MIX;
   }
 
+  void set_xlock_snap_sync(MDSInternalContextBase *c)
+  {
+    assert(get_type() == CEPH_LOCK_IFILE);
+    assert(state == LOCK_XLOCK || state == LOCK_XLOCKDONE);
+    state = LOCK_XLOCKSNAP;
+    add_waiter(WAIT_STABLE, c);
+  }
+
   xlist<ScatterLock*>::item *get_updated_item() { return &more()->item_updated; }
 
   utime_t get_update_stamp() {
diff --git a/src/mds/SimpleLock.h b/src/mds/SimpleLock.h
index 6d1d7fab9d5..10a3bd0a1d9 100644
--- a/src/mds/SimpleLock.h
+++ b/src/mds/SimpleLock.h
@@ -101,6 +101,7 @@ public:
     case LOCK_PREXLOCK: return "prexlock";
     case LOCK_XLOCK: return "xlock";
     case LOCK_XLOCKDONE: return "xlockdone";
+    case LOCK_XLOCKSNAP: return "xlocksnap";
     case LOCK_LOCK_XLOCK: return "lock->xlock";
 
     case LOCK_SYNC_LOCK: return "sync->lock";
@@ -493,7 +494,8 @@ public:
     more()->xlock_by.reset();
   }
   void put_xlock() {
-    assert(state == LOCK_XLOCK || state == LOCK_XLOCKDONE || is_locallock() ||
+    assert(state == LOCK_XLOCK || state == LOCK_XLOCKDONE ||
+           state == LOCK_XLOCKSNAP || is_locallock() ||
            state == LOCK_LOCK /* if we are a master of a slave */);
     --more()->num_xlock;
     parent->put(MDSCacheObject::PIN_LOCK);
diff --git a/src/mds/locks.c b/src/mds/locks.c
index aa612367c8a..2fc0a5fc041 100644
--- a/src/mds/locks.c
+++ b/src/mds/locks.c
@@ -99,6 +99,7 @@ const struct sm_state_t filelock[LOCK_MAX] = {
     [LOCK_PREXLOCK]  = { LOCK_LOCK, false, LOCK_LOCK, 0, XCL, 0, 0, 0, 0, ANY, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 },
     [LOCK_XLOCK]     = { LOCK_LOCK, false, LOCK_LOCK, 0, XCL, 0, 0, 0, 0, 0,   CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 },
     [LOCK_XLOCKDONE] = { LOCK_LOCK, false, LOCK_LOCK, XCL, XCL, XCL, 0, 0, XCL, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,CEPH_CAP_GSHARED,0 },
+    [LOCK_XLOCKSNAP] = { LOCK_LOCK, false, LOCK_LOCK, 0, XCL, 0, 0, 0, 0, 0,   CEPH_CAP_GCACHE,0,0,0 },
     [LOCK_LOCK_XLOCK]= { LOCK_PREXLOCK, false, LOCK_LOCK, 0, XCL, 0, 0, 0, 0, XCL, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 },
 
     [LOCK_MIX]       = { 0, false, LOCK_MIX, 0, 0, REQ, ANY, 0, 0, 0, CEPH_CAP_GRD|CEPH_CAP_GWR|CEPH_CAP_GLAZYIO,0,0,CEPH_CAP_GRD },
diff --git a/src/mds/locks.h b/src/mds/locks.h
index d1585cec576..9f4ea566b4a 100644
--- a/src/mds/locks.h
+++ b/src/mds/locks.h
@@ -52,6 +52,7 @@ enum {
   LOCK_PREXLOCK,   // A . . .. . . / . .   (lock)
   LOCK_XLOCK,      // A . . .. . . / . .   (lock)
   LOCK_XLOCKDONE,  // A r p rd l x / . .   (lock)  <-- by same client only!!
+  LOCK_XLOCKSNAP,  // also revoke Fb
   LOCK_LOCK_XLOCK,
 
   LOCK_SYNC_LOCK,  // AR R . .. . . / . .   R .. . . / . .
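
[Illustrative sketch, not part of the patch.] A minimal, self-contained C++
sketch of the ordering the fix enforces in MDCache::truncate_inode(): defer
the truncate while a client may still hold buffered (Fb) snap data, and run
it only after the capability has been revoked and the snap data flushed. The
names below (PlainInode, revoke_fb_then, do_truncate) are hypothetical
stand-ins for illustration only; they are not Ceph's real MDS classes or APIs.

// Hypothetical stand-ins -- the real logic lives in MDCache::truncate_inode()
// and MDCache::_truncate_inode() in the diff above.
#include <functional>
#include <iostream>

struct PlainInode {
  bool unflushed_snap_data = false; // ~ !client_need_snapflush.empty()
  bool fb_issued = false;           // ~ get_caps_issued() & CEPH_CAP_FILE_BUFFER
};

// Model of "revoke Fb, wait for the client to flush its snap data, then call
// back". In the real MDS this is the LOCK_XLOCKSNAP state plus a WAIT_STABLE
// waiter (set_xlock_snap_sync + C_MDC_RetryTruncate).
void revoke_fb_then(PlainInode &in, const std::function<void()> &on_flushed) {
  in.fb_issued = false;
  in.unflushed_snap_data = false; // client flushed snaps with the old snap context
  on_flushed();
}

void do_truncate(PlainInode &) {
  std::cout << "truncating file objects (snap data already flushed)\n";
}

// Ordering the patch enforces: never touch the file objects while a client
// may still hold buffered snap data written under the old snap context.
void truncate_inode(PlainInode &in) {
  if (in.unflushed_snap_data && in.fb_issued) {
    std::cout << "deferring truncate until Fb is revoked\n";
    revoke_fb_then(in, [&in] { do_truncate(in); });
    return;
  }
  do_truncate(in);
}

int main() {
  PlainInode in;
  in.unflushed_snap_data = true;
  in.fb_issued = true;
  truncate_inode(in); // prints the deferral, then truncates after the "revoke"
}

The sketch is plain C++11; the only point is the control flow: the truncate
runs strictly after the Fb revoke completes, mirroring what the diff does
with LOCK_XLOCKSNAP and C_MDC_RetryTruncate.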