mds: Add posix file lock deadlock detection
author    Yan, Zheng <zyan@redhat.com>
          Fri, 19 Feb 2016 10:39:49 +0000 (18:39 +0800)
committer Yan, Zheng <zyan@redhat.com>
          Fri, 19 Feb 2016 10:48:14 +0000 (18:48 +0800)
Signed-off-by: Yan, Zheng <zyan@redhat.com>
src/client/Client.cc
src/mds/CInode.h
src/mds/Server.cc
src/mds/flock.cc
src/mds/flock.h

diff --git a/src/client/Client.cc b/src/client/Client.cc
index c506c4550b6357fb8487bf180657371c668c8a96..a998be32cb2c81c82d73c603e92cdaef7486f45d 100644
@@ -8766,11 +8766,11 @@ int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
       ceph_lock_state_t *lock_state;
       if (lock_type == CEPH_LOCK_FCNTL) {
        if (!in->fcntl_locks)
-         in->fcntl_locks = new ceph_lock_state_t(cct);
+         in->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
        lock_state = in->fcntl_locks;
       } else if (lock_type == CEPH_LOCK_FLOCK) {
        if (!in->flock_locks)
-         in->flock_locks = new ceph_lock_state_t(cct);
+         in->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
        lock_state = in->flock_locks;
       } else {
        assert(0);
@@ -8781,11 +8781,11 @@ int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
       if (fh) {
        if (lock_type == CEPH_LOCK_FCNTL) {
          if (!fh->fcntl_locks)
-           fh->fcntl_locks = new ceph_lock_state_t(cct);
+           fh->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
          lock_state = fh->fcntl_locks;
        } else {
          if (!fh->flock_locks)
-           fh->flock_locks = new ceph_lock_state_t(cct);
+           fh->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
          lock_state = fh->flock_locks;
        }
        _update_lock_state(fl, owner, lock_state);
@@ -8920,7 +8920,7 @@ void Client::_update_lock_state(struct flock *fl, uint64_t owner,
     list<ceph_filelock> activated_locks;
     lock_state->remove_lock(filelock, activated_locks);
   } else {
-    bool r = lock_state->add_lock(filelock, false, false);
+    bool r = lock_state->add_lock(filelock, false, false, NULL);
     assert(r);
   }
 }
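Reviewer note: the client-side hunks are mechanical. ceph_lock_state_t now records which lock family it tracks (CEPH_LOCK_FCNTL vs CEPH_LOCK_FLOCK), and add_lock() gains a deadlock out-parameter. Passing NULL for it here is safe only because the client never asks the lock state to wait: add_lock() writes through the pointer solely on the wait_on_fail && !replay path. A hypothetical, simplified sketch of that contract (add_lock_sketch is illustrative, not the real function):

    // Illustrative only: the shape of the NULL-safety guarantee relied on
    // above. add_lock() dereferences 'deadlock' only when the caller asked
    // to wait, so a non-waiting caller may pass NULL.
    bool add_lock_sketch(bool blocked, bool wait_on_fail, bool replay,
                         bool *deadlock) {
      if (!blocked)
        return true;                  // lock granted immediately
      if (wait_on_fail && !replay) {  // only reachable for waiting callers
        if (/* is_deadlock(...) */ false)
          *deadlock = true;           // the sole write through the pointer
      }
      return false;                   // not granted
    }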
diff --git a/src/mds/CInode.h b/src/mds/CInode.h
index 15fef42e99bbe2b6e07e6c3da93cf78e8deff47e..8f0b0a12449bdfb15e7d4bedf0dca9e82a0cdea9 100644
@@ -542,7 +542,7 @@ protected:
 
   ceph_lock_state_t *get_fcntl_lock_state() {
     if (!fcntl_locks)
-      fcntl_locks = new ceph_lock_state_t(g_ceph_context);
+      fcntl_locks = new ceph_lock_state_t(g_ceph_context, CEPH_LOCK_FCNTL);
     return fcntl_locks;
   }
   void clear_fcntl_lock_state() {
@@ -551,7 +551,7 @@ protected:
   }
   ceph_lock_state_t *get_flock_lock_state() {
     if (!flock_locks)
-      flock_locks = new ceph_lock_state_t(g_ceph_context);
+      flock_locks = new ceph_lock_state_t(g_ceph_context, CEPH_LOCK_FLOCK);
     return flock_locks;
   }
   void clear_flock_lock_state() {
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index b9f5b0c361a05a5a5ce529e69d7c8a32f3270379..596de2bc4cc002fae31aa74bffa739f61e7dda7e 100644
@@ -3531,14 +3531,17 @@ void Server::handle_client_file_setlock(MDRequestRef& mdr)
     respond_to_request(mdr, 0);
   } else {
     dout(10) << " lock attempt on " << set_lock << dendl;
+    bool deadlock = false;
     if (mdr->more()->flock_was_waiting &&
        !lock_state->is_waiting(set_lock)) {
       dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
       respond_to_request(mdr, -EINTR);
-    } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting)) {
+    } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
       dout(10) << " it failed on this attempt" << dendl;
       // couldn't set lock right now
-      if (!will_wait) {
+      if (deadlock) {
+       respond_to_request(mdr, -EDEADLK);
+      } else if (!will_wait) {
        respond_to_request(mdr, -EWOULDBLOCK);
       } else {
        dout(10) << " added to waiting list" << dendl;
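Reviewer note: with this hunk a blocking setlock that would close a wait cycle now fails with -EDEADLK instead of being parked forever, matching local POSIX fcntl(2) semantics. A hedged illustration of how that surfaces to an application on a CephFS mount (plain POSIX code, nothing Ceph-specific; the function name is made up):

    #include <cerrno>
    #include <cstdio>
    #include <fcntl.h>

    // Try to take an exclusive lock over the whole file, blocking if
    // necessary. If a deadlock is detected, F_SETLKW fails with
    // errno == EDEADLK rather than hanging.
    int lock_whole_file(int fd) {
      struct flock fl{};
      fl.l_type = F_WRLCK;    // exclusive (write) lock
      fl.l_whence = SEEK_SET;
      fl.l_start = 0;
      fl.l_len = 0;           // length 0 means "to end of file"
      if (fcntl(fd, F_SETLKW, &fl) == -1) {
        if (errno == EDEADLK)
          std::fprintf(stderr, "lock would deadlock, giving up\n");
        return -1;
      }
      return 0;
    }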
diff --git a/src/mds/flock.cc b/src/mds/flock.cc
index e99435e5d19d7580b9736c405dcea56d4298b6d2..09557032e07fd1f6f1daa2df3552f0ac0dd3f7b8 100644
@@ -8,7 +8,27 @@
 
 #define dout_subsys ceph_subsys_mds
 
-bool ceph_lock_state_t::is_waiting(ceph_filelock &fl)
+static multimap<ceph_filelock, ceph_lock_state_t*> global_waiting_locks;
+
+ceph_lock_state_t::~ceph_lock_state_t()
+{
+  if (type == CEPH_LOCK_FCNTL) {
+    for (auto p = waiting_locks.begin(); p != waiting_locks.end(); ++p) {
+      for (auto q = global_waiting_locks.find(p->second);
+          q != global_waiting_locks.end(); ) {
+       if (q->first != p->second)
+         break;
+       if (q->second == this) {
+         global_waiting_locks.erase(q);
+         break;
+       }
+       ++q;
+      }
+    }
+  }
+}
+
+bool ceph_lock_state_t::is_waiting(const ceph_filelock &fl)
 {
   multimap<uint64_t, ceph_filelock>::iterator p = waiting_locks.find(fl.start);
   while (p != waiting_locks.end()) {
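Reviewer note: global_waiting_locks is a new file-scope registry mapping each waiting fcntl lock to the ceph_lock_state_t it waits in; it is what lets is_deadlock() (below) chase wait-for edges across inodes. The destructor above scrubs this state's entries so the registry never holds a dangling pointer. Its erase loop walks forward from find() by hand; a sketch of the same idiom using equal_range, assuming only standard multimap semantics (erase_pairs is an illustrative name):

    #include <map>

    // Remove every (key, self) pair from a multimap. Equivalent in effect
    // to the hand-rolled loops in this patch, which additionally break
    // early because at most one entry exists per (key, state) pair.
    template <typename K, typename V>
    void erase_pairs(std::multimap<K, V*>& mm, const K& key, V* self) {
      auto range = mm.equal_range(key);  // all entries comparing equal to key
      for (auto it = range.first; it != range.second; ) {
        if (it->second == self)
          it = mm.erase(it);             // erase() returns the next iterator
        else
          ++it;
      }
    }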
@@ -22,12 +42,12 @@ bool ceph_lock_state_t::is_waiting(ceph_filelock &fl)
   return false;
 }
 
-void ceph_lock_state_t::remove_waiting(ceph_filelock& fl)
+void ceph_lock_state_t::remove_waiting(const ceph_filelock& fl)
 {
-  multimap<uint64_t, ceph_filelock>::iterator p = waiting_locks.find(fl.start);
-  while (p != waiting_locks.end()) {
+  for (auto p = waiting_locks.find(fl.start);
+       p != waiting_locks.end(); ) {
     if (p->second.start > fl.start)
-      return;
+      break;
     if (p->second.length == fl.length &&
        ceph_filelock_owner_equal(p->second, fl)) {
       waiting_locks.erase(p);
@@ -35,14 +55,100 @@ void ceph_lock_state_t::remove_waiting(ceph_filelock& fl)
       if (!client_waiting_lock_counts[(client_t)fl.client]) {
         client_waiting_lock_counts.erase((client_t)fl.client);
       }
-      return;
+      break;
     }
     ++p;
   }
+
+  if (type == CEPH_LOCK_FCNTL) {
+    for (auto q = global_waiting_locks.find(fl);
+        q != global_waiting_locks.end(); ) {
+      if (q->first != fl)
+       break;
+      if (q->second == this) {
+       global_waiting_locks.erase(q);
+       break;
+      }
+      ++q;
+    }
+  }
+}
+
+bool ceph_lock_state_t::is_deadlock(const ceph_filelock& fl,
+                                   list<multimap<uint64_t, ceph_filelock>::iterator>&
+                                     overlapping_locks,
+                                   const ceph_filelock *first_fl, unsigned depth)
+{
+  ldout(cct,15) << "is_deadlock " << fl << dendl;
+
+  // deadlock detection only applies to posix (fcntl) locks
+  if (type != CEPH_LOCK_FCNTL)
+    return false;
+
+  // collect the owners of the conflicting locks
+  set<ceph_filelock> lock_owners;
+  for (auto p = overlapping_locks.begin();
+       p != overlapping_locks.end();
+       ++p) {
+
+    if (fl.type == CEPH_LOCK_SHARED &&
+       (*p)->second.type == CEPH_LOCK_SHARED)
+      continue;
+
+    // cycle detected
+    if (first_fl && ceph_filelock_owner_equal(*first_fl, (*p)->second)) {
+      ldout(cct,15) << " detect deadlock" << dendl;
+      return true;
+    }
+
+    ceph_filelock tmp = (*p)->second;
+    tmp.start = 0;
+    tmp.length = 0;
+    tmp.type = 0;
+    lock_owners.insert(tmp);
+  }
+
+  if (depth >= MAX_DEADLK_DEPTH)
+    return false;
+
+  first_fl = first_fl ? first_fl : &fl;
+  for (auto p = lock_owners.begin();
+       p != lock_owners.end();
+       ++p) {
+    ldout(cct,15) << " conflict lock owner " << *p << dendl;
+    // is the conflicting lock's owner itself waiting for another lock?
+    for (auto q = global_waiting_locks.lower_bound(*p);
+        q != global_waiting_locks.end();
+        ++q) {
+      if (!ceph_filelock_owner_equal(q->first, *p))
+       break;
+
+      list<multimap<uint64_t, ceph_filelock>::iterator>
+       _overlapping_locks, _self_overlapping_locks;
+      ceph_lock_state_t& state = *(q->second);
+      if (state.get_overlapping_locks(q->first, _overlapping_locks)) {
+       state.split_by_owner(q->first, _overlapping_locks, _self_overlapping_locks);
+      }
+      if (!_overlapping_locks.empty()) {
+       if (is_deadlock(q->first, _overlapping_locks, first_fl, depth + 1))
+         return true;
+      }
+    }
+  }
+  return false;
+}
+
+void ceph_lock_state_t::add_waiting(const ceph_filelock& fl)
+{
+  waiting_locks.insert(pair<uint64_t, ceph_filelock>(fl.start, fl));
+  if (type == CEPH_LOCK_FCNTL) {
+    global_waiting_locks.insert(pair<ceph_filelock,ceph_lock_state_t*>(fl, this));
+  }
 }
 
 bool ceph_lock_state_t::add_lock(ceph_filelock& new_lock,
-                                 bool wait_on_fail, bool replay)
+                                 bool wait_on_fail, bool replay,
+                                bool *deadlock)
 {
   ldout(cct,15) << "add_lock " << new_lock << dendl;
   bool ret = false;
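Reviewer note: is_deadlock() above is a bounded depth-first search over the wait-for graph. Nodes are lock owners (client, owner, pid, with range and type zeroed out), an edge runs from a waiter to each owner of a conflicting lock, and reaching the original requester again means a cycle, hence EDEADLK. The search gives up past MAX_DEADLK_DEPTH (5), so very long chains are conservatively reported as non-deadlocks. A standalone sketch of the same idea with owners reduced to plain ints (hypothetical, not the MDS data model):

    #include <map>

    const unsigned MAX_DEPTH = 5;  // mirrors MAX_DEADLK_DEPTH in this patch

    // 'waits_for' maps an owner to the owners of locks it is blocked on.
    // Returns true if 'blocker' (transitively) waits on 'requester'.
    bool would_deadlock(const std::multimap<int, int>& waits_for,
                        int requester, int blocker, unsigned depth = 0) {
      if (blocker == requester)
        return true;               // cycle back to the original requester
      if (depth >= MAX_DEPTH)
        return false;              // depth cap: assume no deadlock
      auto range = waits_for.equal_range(blocker);
      for (auto it = range.first; it != range.second; ++it)
        if (would_deadlock(waits_for, requester, it->second, depth + 1))
          return true;
      return false;
    }

For instance, with waits_for = {B -> A} and A requesting a lock held by B, would_deadlock(waits_for, A, B) follows the edge B -> A and returns true at depth 1, the classic two-client lock inversion the MDS now refuses.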
@@ -60,14 +166,20 @@ bool ceph_lock_state_t::add_lock(ceph_filelock& new_lock,
       ldout(cct,15) << "overlapping lock, and this lock is exclusive, can't set"
               << dendl;
       if (wait_on_fail && !replay) {
-        waiting_locks.insert(pair<uint64_t, ceph_filelock>(new_lock.start, new_lock));
+       if (is_deadlock(new_lock, overlapping_locks))
+         *deadlock = true;
+       else
+         add_waiting(new_lock);
       }
     } else { //shared lock, check for any exclusive locks blocking us
       if (contains_exclusive_lock(overlapping_locks)) { //blocked :(
         ldout(cct,15) << " blocked by exclusive lock in overlapping_locks" << dendl;
-        if (wait_on_fail && !replay) {
-          waiting_locks.insert(pair<uint64_t, ceph_filelock>(new_lock.start, new_lock));
-        }
+       if (wait_on_fail && !replay) {
+         if (is_deadlock(new_lock, overlapping_locks))
+           *deadlock = true;
+         else
+           add_waiting(new_lock);
+       }
       } else {
         //yay, we can insert a shared lock
         ldout(cct,15) << "inserting shared lock" << dendl;
@@ -190,12 +302,37 @@ bool ceph_lock_state_t::remove_all_from (client_t client)
 {
   bool cleared_any = false;
   if (client_held_lock_counts.count(client)) {
-    remove_all_from(client, held_locks);
+    multimap<uint64_t, ceph_filelock>::iterator iter = held_locks.begin();
+    while (iter != held_locks.end()) {
+      if ((client_t)iter->second.client == client) {
+       held_locks.erase(iter++);
+      } else
+       ++iter;
+    }
     client_held_lock_counts.erase(client);
     cleared_any = true;
   }
+
   if (client_waiting_lock_counts.count(client)) {
-    remove_all_from(client, waiting_locks);
+    multimap<uint64_t, ceph_filelock>::iterator iter = waiting_locks.begin();
+    while (iter != waiting_locks.end()) {
+      if ((client_t)iter->second.client != client) {
+       ++iter;
+       continue;
+      }
+
+      for (auto p = global_waiting_locks.find(iter->second);
+          p != global_waiting_locks.end(); ) {
+       if (p->first != iter->second)
+         break;
+       if (p->second == this) {
+         global_waiting_locks.erase(p);
+         break;
+       }
+       ++p;
+      }
+      waiting_locks.erase(iter++);
+    }
     client_waiting_lock_counts.erase(client);
   }
   return cleared_any;
@@ -328,18 +465,6 @@ void ceph_lock_state_t::adjust_locks(list<multimap<uint64_t, ceph_filelock>::ite
   }
 }
 
-void ceph_lock_state_t::remove_all_from(client_t client,
-                                        multimap<uint64_t,
-                                          ceph_filelock>& locks)
-{
-  multimap<uint64_t, ceph_filelock>::iterator iter = locks.begin();
-  while (iter != locks.end()) {
-    if ((client_t)iter->second.client == client) {
-      locks.erase(iter++);
-    } else ++iter;
-  }
-}
-
 multimap<uint64_t, ceph_filelock>::iterator
 ceph_lock_state_t::get_lower_bound(uint64_t start,
                                    multimap<uint64_t, ceph_filelock>& lock_map)
@@ -383,7 +508,7 @@ bool ceph_lock_state_t::share_space(
   return ret;
 }
 
-bool ceph_lock_state_t::get_overlapping_locks(ceph_filelock& lock,
+bool ceph_lock_state_t::get_overlapping_locks(const ceph_filelock& lock,
                            list<multimap<uint64_t,
                                ceph_filelock>::iterator> & overlaps,
                            list<multimap<uint64_t,
@@ -428,7 +553,7 @@ bool ceph_lock_state_t::get_overlapping_locks(ceph_filelock& lock,
   return !overlaps.empty();
 }
 
-bool ceph_lock_state_t::get_waiting_overlaps(ceph_filelock& lock,
+bool ceph_lock_state_t::get_waiting_overlaps(const ceph_filelock& lock,
                                              list<multimap<uint64_t,
                                                ceph_filelock>::iterator>&
                                                overlaps)
@@ -445,7 +570,7 @@ bool ceph_lock_state_t::get_waiting_overlaps(ceph_filelock& lock,
   return !overlaps.empty();
 }
 
-void ceph_lock_state_t::split_by_owner(ceph_filelock& owner,
+void ceph_lock_state_t::split_by_owner(const ceph_filelock& owner,
                                        list<multimap<uint64_t,
                                            ceph_filelock>::iterator>& locks,
                                        list<multimap<uint64_t,
diff --git a/src/mds/flock.h b/src/mds/flock.h
index 55ba2b8367fe66b617b8b524d66fce77a55842bd..fade1a88e3229432e1556b010fe8e83be34b6eb1 100644
@@ -9,7 +9,7 @@
 #include "mdstypes.h"
 
 
-inline ostream& operator<<(ostream& out, ceph_filelock& l) {
+inline ostream& operator<<(ostream& out, const ceph_filelock& l) {
   out << "start: " << l.start << ", length: " << l.length
       << ", client: " << l.client << ", owner: " << l.owner
       << ", pid: " << l.pid << ", type: " << (int)l.type
@@ -17,7 +17,7 @@ inline ostream& operator<<(ostream& out, ceph_filelock& l) {
   return out;
 }
 
-inline bool ceph_filelock_owner_equal(ceph_filelock& l, ceph_filelock& r)
+inline bool ceph_filelock_owner_equal(const ceph_filelock& l, const ceph_filelock& r)
 {
   if (l.client != r.client || l.owner != r.owner)
     return false;
@@ -29,17 +29,52 @@ inline bool ceph_filelock_owner_equal(ceph_filelock& l, ceph_filelock& r)
   return l.pid == r.pid;
 }
 
-inline bool operator==(ceph_filelock& l, ceph_filelock& r) {
-  return
-    l.length == r.length &&
-    l.type == r.type &&
-    ceph_filelock_owner_equal(l, r);
+inline int ceph_filelock_owner_compare(const ceph_filelock& l, const ceph_filelock& r)
+{
+  if (l.client != r.client)
+    return l.client > r.client ? 1 : -1;
+  if (l.owner != r.owner)
+    return l.owner > r.owner ? 1 : -1;
+  if (l.owner & (1ULL << 63))
+    return 0;
+  if (l.pid != r.pid)
+    return l.pid > r.pid ? 1 : -1;
+  return 0;
+}
+
+inline int ceph_filelock_compare(const ceph_filelock& l, const ceph_filelock& r)
+{
+  int ret = ceph_filelock_owner_compare(l, r);
+  if (ret)
+    return ret;
+  if (l.start != r.start)
+    return l.start > r.start ? 1 : -1;
+  if (l.length != r.length)
+    return l.length > r.length ? 1 : -1;
+  if (l.type != r.type)
+    return l.type > r.type ? 1 : -1;
+  return 0;
+}
+
+inline bool operator<(const ceph_filelock& l, const ceph_filelock& r)
+{
+  return ceph_filelock_compare(l, r) < 0;
+}
+
+inline bool operator==(const ceph_filelock& l, const ceph_filelock& r) {
+  return ceph_filelock_compare(l, r) == 0;
+}
+
+inline bool operator!=(const ceph_filelock& l, const ceph_filelock& r) {
+  return ceph_filelock_compare(l, r) != 0;
 }
 
 class ceph_lock_state_t {
   CephContext *cct;
+  int type;
 public:
-  explicit ceph_lock_state_t(CephContext *cct_) : cct(cct_) {}
+  explicit ceph_lock_state_t(CephContext *cct_, int type_) : cct(cct_), type(type_) {}
+  ~ceph_lock_state_t();
   multimap<uint64_t, ceph_filelock> held_locks;    // current locks
   multimap<uint64_t, ceph_filelock> waiting_locks; // locks waiting for other locks
   // both of the above are keyed by starting offset
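Reviewer note: the new owner-first total order on ceph_filelock is what allows global_waiting_locks to use the lock itself as a multimap key. is_deadlock() zeroes start/length/type of a conflicting lock and calls lower_bound() on the result, landing on the first waiting lock of that owner, then walks forward while ceph_filelock_owner_equal() holds. (The bit-63 test on owner skips the pid comparison when the owner id alone identifies the locker, mirroring ceph_filelock_owner_equal().) A sketch of that owner-prefix scan with simplified types and the bit-63 subtlety omitted; Lock, owner_equal, and for_each_of_owner are illustrative names:

    #include <cstdint>
    #include <map>

    struct Lock {
      uint64_t client, owner, pid;
      uint64_t start, length;
      uint8_t type;
    };

    // Owner-first comparator, same shape as ceph_filelock_compare().
    bool operator<(const Lock& l, const Lock& r) {
      if (l.client != r.client) return l.client < r.client;
      if (l.owner != r.owner) return l.owner < r.owner;
      if (l.pid != r.pid) return l.pid < r.pid;  // bit-63 case omitted
      if (l.start != r.start) return l.start < r.start;
      if (l.length != r.length) return l.length < r.length;
      return l.type < r.type;
    }

    bool owner_equal(const Lock& l, const Lock& r) {
      return l.client == r.client && l.owner == r.owner && l.pid == r.pid;
    }

    template <typename V>
    void for_each_of_owner(std::multimap<Lock, V>& mm, const Lock& owner) {
      // 'owner' has start/length/type zeroed, so lower_bound() yields the
      // first entry of that owner; walk until the owner changes.
      for (auto it = mm.lower_bound(owner);
           it != mm.end() && owner_equal(it->first, owner); ++it) {
        // ... inspect it->second ...
      }
    }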
@@ -52,14 +87,13 @@ public:
    * @param fl The filelock to check for
    * @returns True if the lock is waiting, false otherwise
    */
-  bool is_waiting(ceph_filelock &fl);
+  bool is_waiting(const ceph_filelock &fl);
   /**
    * Remove a lock from the waiting_locks list
    *
    * @param fl The filelock to remove
    */
-  void remove_waiting(ceph_filelock& fl);
-
+  void remove_waiting(const ceph_filelock& fl);
   /*
    * Try to set a new lock. If it's blocked and wait_on_fail is true,
    * add the lock to waiting_locks.
@@ -73,7 +107,8 @@ public:
    *
    * @returns true if set, false if not set.
    */
-  bool add_lock(ceph_filelock& new_lock, bool wait_on_fail, bool replay);
+  bool add_lock(ceph_filelock& new_lock, bool wait_on_fail, bool replay,
+               bool *deadlock);
   /**
    * See if a lock is blocked by existing locks. If the lock is blocked,
    * it will be set to the value of the first blocking lock. Otherwise,
@@ -91,11 +126,33 @@ public:
    * @param removal_lock The lock to remove
    * @param activated_locks A return parameter, holding activated wait locks.
    */
-  void remove_lock(ceph_filelock removal_lock,
+  void remove_lock(const ceph_filelock removal_lock,
                    list<ceph_filelock>& activated_locks);
 
   bool remove_all_from(client_t client);
 private:
+  static const unsigned MAX_DEADLK_DEPTH = 5;
+
+  /**
+   * Check whether waiting on the lock would create a deadlock
+   *
+   * @param fl The blocked filelock
+   * @param overlapping_locks List of all locks that overlap fl
+   * @param first_fl The lock that started the deadlock search (NULL initially)
+   * @param depth Current recursion depth, capped at MAX_DEADLK_DEPTH
+   */
+  bool is_deadlock(const ceph_filelock& fl,
+                  list<multimap<uint64_t, ceph_filelock>::iterator>&
+                     overlapping_locks,
+                  const ceph_filelock *first_fl=NULL, unsigned depth=0);
+
+  /**
+   * Add a lock to the waiting_locks list
+   *
+   * @param fl The filelock to add
+   */
+  void add_waiting(const ceph_filelock& fl);
+
   /**
    * Adjust old locks owned by a single process so that process can set
    * a new lock of different type. Handle any changes needed to the old locks
@@ -120,10 +177,6 @@ private:
                     list<multimap<uint64_t, ceph_filelock>::iterator>
                       neighbor_locks);
 
-  //this won't reset the counter map value, do that yourself
-  void remove_all_from(client_t client,
-                       multimap<uint64_t, ceph_filelock>& locks);
-
   //get last lock prior to start position
   multimap<uint64_t, ceph_filelock>::iterator
   get_lower_bound(uint64_t start,
@@ -143,7 +196,7 @@ private:
                   uint64_t start, uint64_t end);
   
   bool share_space(multimap<uint64_t, ceph_filelock>::iterator& iter,
-                   ceph_filelock &lock) {
+                   const ceph_filelock &lock) {
     uint64_t end = lock.start;
     if (lock.length) {
       end += lock.length - 1;
@@ -158,14 +211,14 @@ private:
    * overlaps: an empty list, to be filled.
    * Returns: true if at least one lock overlaps.
    */
-  bool get_overlapping_locks(ceph_filelock& lock,
+  bool get_overlapping_locks(const ceph_filelock& lock,
                              list<multimap<uint64_t,
                                  ceph_filelock>::iterator> & overlaps,
                              list<multimap<uint64_t,
                                  ceph_filelock>::iterator> *self_neighbors);
 
   
-  bool get_overlapping_locks(ceph_filelock& lock,
+  bool get_overlapping_locks(const ceph_filelock& lock,
                             list<multimap<uint64_t, ceph_filelock>::iterator>& overlaps) {
     return get_overlapping_locks(lock, overlaps, NULL);
   }
@@ -176,7 +229,7 @@ private:
    * overlaps: an empty list, to be filled
    * Returns: true if at least one waiting_lock overlaps
    */
-  bool get_waiting_overlaps(ceph_filelock& lock,
+  bool get_waiting_overlaps(const ceph_filelock& lock,
                             list<multimap<uint64_t,
                                 ceph_filelock>::iterator>& overlaps);
   /*
@@ -187,7 +240,7 @@ private:
    *        Will have all locks owned by owner removed
    * owned_locks: an empty list, to be filled with the locks owned by owner
    */
-  void split_by_owner(ceph_filelock& owner,
+  void split_by_owner(const ceph_filelock& owner,
                      list<multimap<uint64_t,
                          ceph_filelock>::iterator> & locks,
                      list<multimap<uint64_t,