inodeno_t rino;
unsigned char rdtype;
- __s32 ls;
::decode(rino, p);
::decode(rdtype, p);
- ::decode(ls, p);
+ lock.decode_state(p, is_new);
+
+ bool need_recover;
+ ::decode(need_recover, p);
if (is_new) {
if (rino)
dir->link_remote_inode(this, rino, rdtype);
- lock.set_state(ls);
+ if (need_recover)
+ lock.mark_need_recover();
}
}
// Clear the STATE_NEW flag.
void clear_new() { state_clear(STATE_NEW); }
// -- replication
- void encode_replica(mds_rank_t mds, bufferlist& bl) {
+ void encode_replica(mds_rank_t mds, bufferlist& bl, bool need_recover) {
if (!is_replicated())
lock.replicate_relax();
::encode(first, bl);
::encode(linkage.remote_ino, bl);
::encode(linkage.remote_d_type, bl);
- __s32 ls = lock.get_replica_state();
- ::encode(ls, bl);
+ lock.encode_state_for_replica(bl);
+ ::encode(need_recover, bl);
}
void decode_replica(bufferlist::iterator& p, bool is_new);
want_loner_cap = loner_cap; // for now, we'll eval() shortly.
}
-void CInode::_encode_locks_state_for_replica(bufferlist& bl)
+void CInode::_encode_locks_state_for_replica(bufferlist& bl, bool need_recover)
{
authlock.encode_state_for_replica(bl);
linklock.encode_state_for_replica(bl);
snaplock.encode_state_for_replica(bl);
flocklock.encode_state_for_replica(bl);
policylock.encode_state_for_replica(bl);
+ ::encode(need_recover, bl);
}
+
// Encode lock states exchanged during cache rejoin.
// NOTE(review): 'rep' is unused in the visible body — confirm against the
// full file.
void CInode::_encode_locks_state_for_rejoin(bufferlist& bl, int rep)
{
  authlock.encode_state_for_replica(bl);
  flocklock.encode_state_for_replica(bl);
  policylock.encode_state_for_replica(bl);
}

// Decode lock states received with a replica of this inode.  If the auth
// mds flagged 'need_recover', mark every lock so later lock attempts know
// the auth mds may still change them.
void CInode::_decode_locks_state(bufferlist::iterator& p, bool is_new)
{
  authlock.decode_state(p, is_new);
  snaplock.decode_state(p, is_new);
  flocklock.decode_state(p, is_new);
  policylock.decode_state(p, is_new);

  bool need_recover;
  ::decode(need_recover, p);
  if (need_recover && is_new) {
    // Auth mds replicated this inode while it's recovering.  The auth mds
    // may take xlock on the locks and change the object when replaying
    // unsafe requests.
    authlock.mark_need_recover();
    linklock.mark_need_recover();
    dirfragtreelock.mark_need_recover();
    filelock.mark_need_recover();
    nestlock.mark_need_recover();
    xattrlock.mark_need_recover();
    snaplock.mark_need_recover();
    flocklock.mark_need_recover();
    policylock.mark_need_recover();
  }
}
void CInode::_decode_locks_rejoin(bufferlist::iterator& p, list<MDSInternalContextBase*>& waiters,
- list<SimpleLock*>& eval_locks)
-{
- authlock.decode_state_rejoin(p, waiters);
- linklock.decode_state_rejoin(p, waiters);
- dirfragtreelock.decode_state_rejoin(p, waiters);
- filelock.decode_state_rejoin(p, waiters);
- nestlock.decode_state_rejoin(p, waiters);
- xattrlock.decode_state_rejoin(p, waiters);
- snaplock.decode_state_rejoin(p, waiters);
- flocklock.decode_state_rejoin(p, waiters);
- policylock.decode_state_rejoin(p, waiters);
+ list<SimpleLock*>& eval_locks, bool survivor)
+{
+ authlock.decode_state_rejoin(p, waiters, survivor);
+ linklock.decode_state_rejoin(p, waiters, survivor);
+ dirfragtreelock.decode_state_rejoin(p, waiters, survivor);
+ filelock.decode_state_rejoin(p, waiters, survivor);
+ nestlock.decode_state_rejoin(p, waiters, survivor);
+ xattrlock.decode_state_rejoin(p, waiters, survivor);
+ snaplock.decode_state_rejoin(p, waiters, survivor);
+ flocklock.decode_state_rejoin(p, waiters, survivor);
+ policylock.decode_state_rejoin(p, waiters, survivor);
if (!dirfragtreelock.is_stable() && !dirfragtreelock.is_wrlocked())
eval_locks.push_back(&dirfragtreelock);
void encode_store(bufferlist& bl, uint64_t features);
void decode_store(bufferlist::iterator& bl);
- void encode_replica(mds_rank_t rep, bufferlist& bl, uint64_t features) {
+ void encode_replica(mds_rank_t rep, bufferlist& bl, uint64_t features, bool need_recover) {
assert(is_auth());
// relax locks?
::encode(nonce, bl);
_encode_base(bl, features);
- _encode_locks_state_for_replica(bl);
+ _encode_locks_state_for_replica(bl, need_recover);
}
void decode_replica(bufferlist::iterator& p, bool is_new) {
__u32 nonce;
void _decode_base(bufferlist::iterator& p);
void _encode_locks_full(bufferlist& bl);
void _decode_locks_full(bufferlist::iterator& p);
- void _encode_locks_state_for_replica(bufferlist& bl);
+ void _encode_locks_state_for_replica(bufferlist& bl, bool need_recover);
void _encode_locks_state_for_rejoin(bufferlist& bl, int rep);
void _decode_locks_state(bufferlist::iterator& p, bool is_new);
void _decode_locks_rejoin(bufferlist::iterator& p, std::list<MDSInternalContextBase*>& waiters,
- std::list<SimpleLock*>& eval_locks);
+ std::list<SimpleLock*>& eval_locks, bool survivor);
// -- import/export --
void encode_export(bufferlist& bl);
}
} else {
  assert(mdr->is_master());
  if ((*p)->needs_recover()) {
    if (mds->is_cluster_degraded()) {
      if (!mdr->is_replay()) {
        // see comments in SimpleLock::set_state_rejoin() and
        // ScatterLock::encode_state_for_rejoin()
        drop_locks(mdr.get());
        mds->wait_for_cluster_recovered(new C_MDS_RetryRequest(mdcache, mdr));
        dout(10) << " rejoin recovering " << **p << " " << *(*p)->get_parent()
                 << ", waiting for cluster recovered" << dendl;
        marker.message = "rejoin recovering lock, waiting for cluster recovered";
        return false;
      }
    } else {
      // cluster is healthy again; the recovering auth mds has finished
      // replaying, so the flag is stale.
      (*p)->clear_need_recover();
    }
  }
dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
mds_rank_t from = mds_rank_t(ack->get_source().num());
+ assert(mds->get_state() >= MDSMap::STATE_REJOIN);
+ bool survivor = !mds->is_rejoin();
+
// for sending cache expire message
set<CInode*> isolated_inodes;
set<CInode*> refragged_inodes;
}
dn->set_replica_nonce(q->second.nonce);
dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters, survivor);
dn->state_clear(CDentry::STATE_REJOINING);
dout(10) << " got " << *dn << dendl;
}
assert(in);
in->set_replica_nonce(nonce);
bufferlist::iterator q = lockbl.begin();
in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks, survivor);
in->state_clear(CInode::STATE_REJOINING);
dout(10) << " got inode locks " << *in << dendl;
}
// done?
assert(rejoin_ack_gather.count(from));
rejoin_ack_gather.erase(from);
- if (mds->is_rejoin()) {
+ if (!survivor) {
if (rejoin_gather.empty()) {
// eval unstable scatter locks after all wrlocks are rejoined.
// ----------------------------
// REPLICAS
+
+void MDCache::replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl)
+{
+ dirfrag_t df = dir->dirfrag();
+ ::encode(df, bl);
+ dir->encode_replica(to, bl);
+}
+
+void MDCache::replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl)
+{
+ ::encode(dn->name, bl);
+ ::encode(dn->last, bl);
+ dn->encode_replica(to, bl, mds->get_state() < MDSMap::STATE_ACTIVE);
+}
+
+void MDCache::replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl,
+ uint64_t features)
+{
+ ::encode(in->inode.ino, bl); // bleh, minor assymetry here
+ ::encode(in->last, bl);
+ in->encode_replica(to, bl, features, mds->get_state() < MDSMap::STATE_ACTIVE);
+}
+
CDir *MDCache::add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from,
list<MDSInternalContextBase*>& finished)
{
friend class C_MDC_Join;
public:
- void replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl) {
- dirfrag_t df = dir->dirfrag();
- ::encode(df, bl);
- dir->encode_replica(to, bl);
- }
- void replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl) {
- ::encode(dn->name, bl);
- ::encode(dn->last, bl);
- dn->encode_replica(to, bl);
- }
+ void replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl);
+ void replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl);
void replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl,
- uint64_t features) {
- ::encode(in->inode.ino, bl); // bleh, minor assymetry here
- ::encode(in->last, bl);
- in->encode_replica(to, bl, features);
- }
+ uint64_t features);
CDir* add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from, list<MDSInternalContextBase*>& finished);
CDentry *add_replica_dentry(bufferlist::iterator& p, CDir *dir, list<MDSInternalContextBase*>& finished);
DIRTY    = 1 << 10,
FLUSHING = 1 << 11,
FLUSHED  = 1 << 12,
};

public:
bool is_dirty_or_flushing() const {
  return is_dirty() || is_flushing();
}
void mark_dirty() {
if (!is_dirty()) {
// Drop the FLUSHED flag once the flush has been acknowledged.
void clear_flushed() override {
  state_flags &= ~FLUSHED;
}
void infer_state_from_strong_rejoin(int rstate, bool locktoo) {
if (rstate == LOCK_MIX ||
s = LOCK_MIX_LOCK;
}
+ // If there is a recovering mds who replcated an object when it failed
+ // and scatterlock in the object was in MIX state, It's possible that
+ // the recovering mds needs to take wrlock on the scatterlock when it
+ // replays unsafe requests. So this mds should delay taking rdlock on
+ // the scatterlock until the recovering mds finishes replaying unsafe.
+ // Otherwise unsafe requests may get replayed after current request.
+ //
+ // For example:
+ // The recovering mds is auth mds of a dirfrag, this mds is auth mds
+ // of correspinding inode. when 'rm -rf' the direcotry, this mds should
+ // delay the rmdir request until the recovering mds has replayed unlink
+ // requests.
if (s == LOCK_MIX || s == LOCK_MIX_LOCK || s == LOCK_MIX_SYNC)
- state_flags |= REJOIN_MIX;
+ mark_need_recover();
::encode(s, bl);
}
- void decode_state_rejoin(bufferlist::iterator& p, list<MDSInternalContextBase*>& waiters) {
- SimpleLock::decode_state_rejoin(p, waiters);
+ void decode_state_rejoin(bufferlist::iterator& p, list<MDSInternalContextBase*>& waiters, bool survivor) {
+ SimpleLock::decode_state_rejoin(p, waiters, survivor);
if (is_flushing()) {
set_dirty();
clear_flushing();
__s16 state_flags;

enum {
  LEASED       = 1 << 0,
  NEED_RECOVER = 1 << 1,  // recovering auth mds may still mutate this lock
};
private:
//assert(!is_stable() || gather_set.size() == 0); // gather should be empty in stable states.
return s;
}
- void set_state_rejoin(int s, list<MDSInternalContextBase*>& waiters) {
- if (!is_stable() && get_parent()->is_auth()) {
- state = s;
- get_parent()->auth_unpin(this);
- } else {
- state = s;
- }
+ void set_state_rejoin(int s, list<MDSInternalContextBase*>& waiters, bool survivor) {
+ assert(!get_parent()->is_auth());
+
+ // If lock in the replica object was not in SYNC state when auth mds of the object failed.
+ // Auth mds of the object may take xlock on the lock and change the object when replaying
+ // unsafe requests.
+ if (!survivor || state != LOCK_SYNC)
+ mark_need_recover();
+
+ state = s;
+
if (is_stable())
take_waiting(SimpleLock::WAIT_ALL, waiters);
}
return is_xlocked() || is_rdlocked() || is_wrlocked() || is_leased();
}
+ bool needs_recover() const {
+ return state_flags & NEED_RECOVER;
+ }
+ void mark_need_recover() {
+ state_flags |= NEED_RECOVER;
+ }
+ void clear_need_recover() {
+ state_flags &= ~NEED_RECOVER;
+ }
+
// encode/decode
void encode(bufferlist& bl) const {
ENCODE_START(2, 2, bl);
if (is_new)
state = s;
}
- void decode_state_rejoin(bufferlist::iterator& p, list<MDSInternalContextBase*>& waiters) {
+ void decode_state_rejoin(bufferlist::iterator& p, list<MDSInternalContextBase*>& waiters, bool survivor) {
__s16 s;
::decode(s, p);
- set_state_rejoin(s, waiters);
+ set_state_rejoin(s, waiters, survivor);
}