if (!object->is_auth()) {
if (!mdr->locks.empty())
- mds->locker->drop_locks(mdr.get());
+ drop_locks(mdr.get());
if (object->is_ambiguous_auth()) {
// wait
dout(10) << " ambiguous auth, waiting to authpin " << *object << dendl;
}
if (!object->can_auth_pin()) {
// wait
- mds->locker->drop_locks(mdr.get());
+ drop_locks(mdr.get());
mdr->drop_local_auth_pins();
if (auth_pin_nonblock) {
dout(10) << " can't auth_pin (freezing?) " << *object << ", nonblocking" << dendl;
dout(10) << " got wrlock on " << **p << " " << *(*p)->get_parent() << dendl;
}
} else {
+ assert(mdr->is_master());
+ if ((*p)->is_scatterlock()) {
+ ScatterLock *slock = static_cast<ScatterLock *>(*p);
+ if (slock->is_rejoin_mix()) {
+ // If a recovering mds had replicated an object when it failed and the
+ // scatterlock in that object was in MIX state, it's possible that
+ // the recovering mds needs to take wrlock on the scatterlock when it
+ // replays unsafe requests. So this mds should delay taking rdlock on
+ // the scatterlock until the recovering mds finishes replaying unsafe
+ // requests. Otherwise unsafe requests may get replayed after the current
+ // request.
+ //
+ // For example:
+ // The recovering mds is auth mds of a dirfrag, this mds is auth mds
+ // of the corresponding inode. When 'rm -rf'ing the directory, this mds
+ // should delay the rmdir request until the recovering mds has replayed
+ // the unlink requests.
+ if (mds->is_cluster_degraded()) {
+ if (!mdr->is_replay()) {
+ drop_locks(mdr.get());
+ mds->wait_for_cluster_recovered(new C_MDS_RetryRequest(mdcache, mdr));
+ dout(10) << " rejoin mix scatterlock " << *slock << " " << *(*p)->get_parent()
+ << ", waiting for cluster recovered" << dendl;
+ marker.message = "rejoin mix scatterlock, waiting for cluster recovered";
+ return false;
+ }
+ } else {
+ slock->clear_rejoin_mix();
+ }
+ }
+ }
+
marker.message = "failed to rdlock, waiting";
if (!rdlock_start(*p, mdr))
goto out;
cluster_degraded = mdsmap->is_degraded();
if (oldmap->is_degraded() && !cluster_degraded && state >= MDSMap::STATE_ACTIVE) {
dout(1) << "cluster recovered." << dendl;
+ auto it = waiting_for_active_peer.find(MDS_RANK_NONE);
+ if (it != waiting_for_active_peer.end()) {
+ queue_waiters(it->second);
+ waiting_for_active_peer.erase(it);
+ }
}
// did someone go active?
send_message(m, c.get());
}
- void wait_for_active(MDSInternalContextBase *c) {
- waiting_for_active.push_back(c);
- }
// Append context c to the waiting_for_active_peer list keyed by rank `who`.
// Presumably that list is run once rank `who` becomes active -- confirm
// against the mdsmap-handling code, which is not fully visible here.
void wait_for_active_peer(mds_rank_t who, MDSInternalContextBase *c) {
waiting_for_active_peer[who].push_back(c);
}
+ void wait_for_cluster_recovered(MDSInternalContextBase *c) { // park c until the "cluster recovered" transition queues the MDS_RANK_NONE waiters
+ assert(cluster_degraded); // callers must only wait while the cluster_degraded flag is set
+ waiting_for_active_peer[MDS_RANK_NONE].push_back(c); // reuses the per-rank waiter map, with MDS_RANK_NONE as the key for cluster-wide waiters
+ }
+
+ void wait_for_active(MDSInternalContextBase *c) { // append c to the waiting_for_active list
+ waiting_for_active.push_back(c); // presumably run when this rank becomes active -- TODO confirm, the consumer is not visible here
+ }
// Append context c to the waiting_for_replay list.
// NOTE(review): the code that drains this list is not visible here; the name
// suggests it is tied to the replay stage -- confirm before relying on it.
void wait_for_replay(MDSInternalContextBase *c) {
waiting_for_replay.push_back(c);
}
assert(!client_request);
more()->filepath1 = fp;
}
+
// Store fp as the second filepath in this request's more() state.
// Asserts that the request carries no client_request.
void MDRequestImpl::set_filepath2(const filepath& fp)
{
assert(!client_request);
more()->filepath2 = fp;
}
+bool MDRequestImpl::is_replay() const // true only if a client_request is attached and it reports is_replay()
+{
+ return client_request ? client_request->is_replay() : false; // requests without a client_request are never treated as replays
+}
+
void MDRequestImpl::print(ostream &out) const
{
out << "request(" << reqid;
const filepath& get_filepath2();
void set_filepath(const filepath& fp);
void set_filepath2(const filepath& fp);
+ bool is_replay() const;
void print(ostream &out) const override;
void dump(Formatter *f) const override;
DIRTY = 1 << 2,
FLUSHING = 1 << 3,
FLUSHED = 1 << 4,
+ REJOIN_MIX = 1 << 5, // no rdlock until the recovering mds become active
};
public:
// Returns true if the lock is dirty or flushing. When have_more() is false
// the answer is false without consulting is_dirty()/is_flushing().
bool is_dirty_or_flushing() const {
return have_more() ? (is_dirty() || is_flushing()) : false;
}
+ bool is_rejoin_mix() const { // whether the REJOIN_MIX bit is set in _more->state_flags
+ return have_more() ? _more->state_flags & REJOIN_MIX : false; // false when have_more() is false; the masked value converts to bool
+ }
void mark_dirty() {
if (!is_dirty()) {
}
}
+ void clear_rejoin_mix() { // drop the REJOIN_MIX flag; a no-op when have_more() is false
+ if (have_more()) {
+ _more->state_flags &= ~REJOIN_MIX; // clear only this bit, leaving the other state flags untouched
+ try_clear_more(); // presumably releases _more if nothing else needs it -- TODO confirm, helper not visible here
+ }
+ }
+
// Record t as last_scatter in the more() state.
void set_last_scatter(utime_t t) { more()->last_scatter = t; }
utime_t get_last_scatter() {
return more()->last_scatter;
// Set `state` from rstate (the name suggests rstate is the lock state a
// replica reported in a strong rejoin -- confirm against the caller).
//   - rstate in the MIX family listed below      -> state = LOCK_MIX
//   - rstate == LOCK_LOCK and locktoo is true    -> state = LOCK_LOCK
//   - anything else                              -> state is left unchanged
// The '-'/'+' lines are patch markers: the change removes LOCK_MIX_TSYN from
// the set of replica states that map to LOCK_MIX.
void infer_state_from_strong_rejoin(int rstate, bool locktoo) {
if (rstate == LOCK_MIX ||
rstate == LOCK_MIX_LOCK || // replica still has wrlocks?
- rstate == LOCK_MIX_SYNC || // "
- rstate == LOCK_MIX_TSYN) // "
+ rstate == LOCK_MIX_SYNC)
state = LOCK_MIX;
else if (locktoo && rstate == LOCK_LOCK)
state = LOCK_LOCK;
}
- void encode_state_for_rejoin(bufferlist& bl, int rep) const {
+ void encode_state_for_rejoin(bufferlist& bl, int rep) {
__s16 s = get_replica_state();
if (is_gathering(rep)) {
// the recovering mds may hold rejoined wrlocks
else
s = LOCK_MIX_LOCK;
}
+
+ if (s == LOCK_MIX || s == LOCK_MIX_LOCK || s == LOCK_MIX_SYNC)
+ more()->state_flags |= REJOIN_MIX;
+
::encode(s, bl);
}
CDir *dir = *p;
const fnode_t *pf = dir->get_projected_fnode();
if (pf->fragstat.size()) {
- dout(10) << "dir_is_nonempty_unlocked dirstat has "
+ dout(10) << "dir_is_nonempty dirstat has "
<< pf->fragstat.size() << " items " << *dir << dendl;
return true;
}