Snapshot data gets lost if the following sequence of events happens:
- a client writes data to a file
- a snapshot is made
- the file is truncated
- the MDS truncates the file objects using the newest snap context
- the client flushes its snap data using the old snap context
The OSD handles the MDS's truncate request first and updates the
object's snap context. When it later handles the client's write
request, the OSD finds that the object's snap context is newer than
the request's snap context, so it uses the newer one and treats the
data as if it had been written after the snapshot.
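
A minimal sketch of the OSD-side decision that loses the data
(hypothetical names and types, not the actual OSD code path):

    #include <cstdint>
    #include <vector>

    // Simplified snap context: seq is the newest snapshot known to the writer.
    struct SnapCtx {
      uint64_t seq = 0;
      std::vector<uint64_t> snaps;   // existing snap ids, newest first
    };

    // The OSD effectively keeps whichever snap context is newer.  The MDS
    // truncate already installed the post-snapshot context on the object,
    // so the client's later snap flush (still carrying the old seq) is
    // applied under the newer context and treated as post-snapshot data:
    // the snapshot's content is lost.
    const SnapCtx& effective_snapc(const SnapCtx& on_object,
                                   const SnapCtx& in_request) {
      return on_object.seq >= in_request.seq ? on_object : in_request;
    }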
The fix is to avoid touching file objects while clients may still
have unflushed snap data. Before truncating file objects, the MDS
checks whether any client may have unflushed snap data. If so, the
MDS puts the filelock into a special unstable state that revokes the
Fb capability, and only starts truncating the file objects once the
Fb capability has been revoked.
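
A toy model of the deferral pattern used here (hypothetical names; the
real code registers C_MDC_RetryTruncate as a filelock waiter, see the
hunks below):

    #include <functional>
    #include <utility>
    #include <vector>

    struct ToyFileLock {
      bool stable = true;
      std::vector<std::function<void()>> waiters;
      void add_stable_waiter(std::function<void()> c) { waiters.push_back(std::move(c)); }
      void become_stable() {          // e.g. once Fb is revoked and snap data flushed
        stable = true;
        auto w = std::move(waiters);
        waiters.clear();
        for (auto &f : w) f();
      }
    };

    struct ToyInode {
      ToyFileLock filelock;
      bool unflushed_snap_data = false;
      bool fb_issued = false;
    };

    void touch_file_objects(ToyInode &) { /* would send the truncate to the OSDs */ }

    void truncate_inode(ToyInode &in) {
      if (in.unflushed_snap_data && in.fb_issued) {
        // Defer: put the lock in an unstable state, revoke Fb, and retry
        // the truncate once the client has flushed its snap data.
        in.filelock.stable = false;
        in.filelock.add_stable_waiter([&in] { truncate_inode(in); });
        in.fb_issued = false;         // stands in for issuing caps without Fb
        return;
      }
      touch_file_objects(in);         // safe: no unflushed snap data remains
    }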
Fixes: http://tracker.ceph.com/issues/17193
Signed-off-by: Yan, Zheng <zyan@redhat.com>
(cherry picked from commit 9c65920e7f6febe294e25a67473693ce6f9adfa7)
if (lock->get_num_rdlocks() == 0 &&
lock->get_num_wrlocks() == 0 &&
lock->get_num_client_lease() == 0 &&
+ lock->get_state() != LOCK_XLOCKSNAP &&
lock->get_type() != CEPH_LOCK_DN) {
CInode *in = static_cast<CInode*>(lock->get_parent());
client_t loner = in->get_target_loner();
}
}
// the xlocker may have CEPH_CAP_GSHARED, need to revoke it if next state is LOCK_LOCK
- eval_gather(lock, true, pneed_issue);
+ eval_gather(lock, lock->get_state() != LOCK_XLOCKSNAP, pneed_issue);
}
void Locker::xlock_finish(SimpleLock *lock, MutationImpl *mut, bool *pneed_issue)
}
if (recover) {
- in->auth_pin(&in->filelock);
+ if (in->filelock.is_stable()) {
+ in->auth_pin(&in->filelock);
+ } else {
+ assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
+ }
in->filelock.set_state(LOCK_PRE_SCAN);
rejoin_recover_q.push_back(in);
} else {
void MDCache::start_files_to_recover()
{
for (CInode *in : rejoin_check_q) {
+ if (in->filelock.get_state() == LOCK_XLOCKSNAP)
+ mds->locker->issue_caps(in);
mds->locker->check_inode_max_size(in);
}
rejoin_check_q.clear();
// ----------------------------
// truncate
+class C_MDC_RetryTruncate : public MDCacheContext {
+ CInode *in;
+ LogSegment *ls;
+public:
+ C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) :
+ MDCacheContext(c), in(i), ls(l) {}
+ void finish(int r) {
+ mdcache->_truncate_inode(in, ls);
+ }
+};
+
void MDCache::truncate_inode(CInode *in, LogSegment *ls)
{
inode_t *pi = in->get_projected_inode();
ls->truncating_inodes.insert(in);
in->get(CInode::PIN_TRUNCATING);
+ in->auth_pin(this);
+
+ if (!in->client_need_snapflush.empty() &&
+ (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
+ assert(in->filelock.is_xlocked());
+ in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
+ mds->locker->issue_caps(in);
+ return;
+ }
_truncate_inode(in, ls);
}
assert(pi->truncate_from < (1ULL << 63));
assert(pi->truncate_size < pi->truncate_from);
- in->auth_pin(this);
SnapRealm *realm = in->find_snaprealm();
SnapContext nullsnap;
LogSegment *ls = p->second;
for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
q != ls->truncating_inodes.end();
- ++q)
- _truncate_inode(*q, ls);
+ ++q) {
+ CInode *in = *q;
+ in->auth_pin(this);
+
+ if (!in->client_need_snapflush.empty() &&
+ (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
+ assert(in->filelock.is_stable());
+ in->filelock.set_state(LOCK_XLOCKDONE);
+ in->auth_pin(&in->filelock);
+ in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
+ // start_files_to_recover will revoke caps
+ continue;
+ }
+ _truncate_inode(in, ls);
+ }
}
}
get_state() == LOCK_MIX;
}
+ void set_xlock_snap_sync(MDSInternalContextBase *c)
+ {
+ assert(get_type() == CEPH_LOCK_IFILE);
+ assert(state == LOCK_XLOCK || state == LOCK_XLOCKDONE);
+ state = LOCK_XLOCKSNAP;
+ add_waiter(WAIT_STABLE, c);
+ }
+
xlist<ScatterLock*>::item *get_updated_item() { return &more()->item_updated; }
utime_t get_update_stamp() {
case LOCK_PREXLOCK: return "prexlock";
case LOCK_XLOCK: return "xlock";
case LOCK_XLOCKDONE: return "xlockdone";
+ case LOCK_XLOCKSNAP: return "xlocksnap";
case LOCK_LOCK_XLOCK: return "lock->xlock";
case LOCK_SYNC_LOCK: return "sync->lock";
more()->xlock_by.reset();
}
void put_xlock() {
- assert(state == LOCK_XLOCK || state == LOCK_XLOCKDONE || is_locallock() ||
+ assert(state == LOCK_XLOCK || state == LOCK_XLOCKDONE ||
+ state == LOCK_XLOCKSNAP || is_locallock() ||
state == LOCK_LOCK /* if we are a master of a slave */);
--more()->num_xlock;
parent->put(MDSCacheObject::PIN_LOCK);
[LOCK_PREXLOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0, XCL, 0, 0, 0, 0, ANY, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 },
[LOCK_XLOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0, XCL, 0, 0, 0, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 },
[LOCK_XLOCKDONE] = { LOCK_LOCK, false, LOCK_LOCK, XCL, XCL, XCL, 0, 0, XCL, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,CEPH_CAP_GSHARED,0 },
+ [LOCK_XLOCKSNAP] = { LOCK_LOCK, false, LOCK_LOCK, 0, XCL, 0, 0, 0, 0, 0, CEPH_CAP_GCACHE,0,0,0 },
[LOCK_LOCK_XLOCK]= { LOCK_PREXLOCK,false,LOCK_LOCK,0, XCL, 0, 0, 0, 0, XCL, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 },
[LOCK_MIX] = { 0, false, LOCK_MIX, 0, 0, REQ, ANY, 0, 0, 0, CEPH_CAP_GRD|CEPH_CAP_GWR|CEPH_CAP_GLAZYIO,0,0,CEPH_CAP_GRD },
LOCK_PREXLOCK, // A . . .. . . / . . (lock)
LOCK_XLOCK, // A . . .. . . / . . (lock)
LOCK_XLOCKDONE, // A r p rd l x / . . (lock) <-- by same client only!!
+ LOCK_XLOCKSNAP, // also revoke Fb
LOCK_LOCK_XLOCK,
LOCK_SYNC_LOCK, // AR R . .. . . / . . R .. . . / . .