]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
cohort_lru: crash fix and reduce lock contention
authorMatt Benjamin <mbenjamin@redhat.com>
Tue, 25 Nov 2025 17:41:37 +0000 (12:41 -0500)
committerDaniel Gryniewicz <dang@fprintf.net>
Fri, 29 May 2026 16:05:12 +0000 (12:05 -0400)
Fixes crash induced by taking the address of the last element
of an empty intrusive list (!).

Also, introduces active queue, reducing potential for lock
contention in evict_block():

* entries are tracked on lane::active_queue when lru_refcnt > 1
** on some lane::q otherwise

Object transition between queues when lru_refcnt changes value--
a value of 0 triggers deletion, as before.

Fixes: https://tracker.ceph.com/issues/73992
Signed-off-by: Matt Benjamin <mbenjamin@redhat.com>
src/common/cohort_lru.h
src/rgw/rgw_file.cc
src/rgw/rgw_file_int.h

index 6924918ce784b8cc92263723c832930e72741926..5ceb99094640f07ed98c1b18f62a5150bc0170cc 100644 (file)
@@ -66,10 +66,10 @@ namespace cohort {
     class Object
     {
     private:
-      uint32_t lru_flags;
+      bi::list_member_hook<link_mode> lru_hook;
       std::atomic<uint32_t> lru_refcnt;
-      std::atomic<uint32_t> lru_adj;
-      bi::list_member_hook< link_mode > lru_hook;
+      std::atomic_flag active;
+      std::atomic_flag evicting;
 
       typedef bi::list<Object,
                       bi::member_hook<
@@ -87,7 +87,7 @@ namespace cohort {
 
     public:
 
-      Object() : lru_flags(FLAG_NONE), lru_refcnt(0), lru_adj(0) {}
+      Object() : lru_refcnt(0) {}
 
       uint32_t get_refcnt() const { return lru_refcnt; }
 
@@ -120,8 +120,8 @@ namespace cohort {
 
       struct Lane {
        LK lock;
-       Object::Queue q;
-       // Object::Queue pinned; /* placeholder for possible expansion */
+        Object::Queue q;
+        Object::Queue active; /* objects with an initial ref cannot be evicted */
        CACHE_PAD(0);
        Lane() {}
       };
@@ -131,15 +131,8 @@ namespace cohort {
       std::atomic<uint32_t> evict_lane;
       const uint32_t lane_hiwat;
 
-      static constexpr uint32_t lru_adj_modulus = 5;
-
       static constexpr uint32_t SENTINEL_REFCNT = 1;
 
-      /* internal flag values */
-      static constexpr uint32_t FLAG_INLRU = 0x0001;
-      static constexpr uint32_t FLAG_PINNED  = 0x0002; // possible future use
-      static constexpr uint32_t FLAG_EVICTING = 0x0004;
-
       Lane& lane_of(void* addr) {
        return qlane[(uint64_t)(addr) % n_lanes];
       }
@@ -148,9 +141,11 @@ namespace cohort {
        return (evict_lane++ % n_lanes);
       }
 
-      bool can_reclaim(Object* o) {
-       return ((o->lru_refcnt == SENTINEL_REFCNT) &&
-               (!(o->lru_flags & FLAG_EVICTING)));
+      bool
+      can_reclaim(Object* o) { /* XXX called with lane lock held */
+        auto evicting = o->evicting.test();
+        return ((o->lru_refcnt == SENTINEL_REFCNT) &&
+                (! evicting));
       }
 
       Object* evict_block(const ObjectFactory* newobj_fac) {
@@ -158,12 +153,16 @@ namespace cohort {
        for (int ix = 0; ix < n_lanes; ++ix,
               lane_ix = next_evict_lane()) {
          Lane& lane = qlane[lane_ix];
-         std::unique_lock lane_lock{lane.lock};
+          std::unique_lock lane_lock{lane.lock};
+          /* back() on empty intrusive list is undefined */
+          if (lane.q.empty()) {
+            continue;
+          }
+          Object* o = &(lane.q.back());
          /* if object at LRU has refcnt==1, it may be reclaimable */
-         Object* o = &(lane.q.back());
          if (can_reclaim(o)) {
            ++(o->lru_refcnt);
-           o->lru_flags |= FLAG_EVICTING;
+           (void) o->evicting.test_and_set();
            lane_lock.unlock();
            if (o->reclaim(newobj_fac)) {
              lane_lock.lock();
@@ -171,14 +170,13 @@ namespace cohort {
              /* assertions that o state has not changed across
               * relock */
              ceph_assert(o->lru_refcnt == SENTINEL_REFCNT);
-             ceph_assert(o->lru_flags & FLAG_INLRU);
              Object::Queue::iterator it =
                Object::Queue::s_iterator_to(*o);
              lane.q.erase(it);
              return o;
            } else {
-             --(o->lru_refcnt);
-             o->lru_flags &= ~FLAG_EVICTING;
+              --(o->lru_refcnt);
+              o->evicting.clear();
              /* unlock in next block */
            }
          } /* can_reclaim(o) */
@@ -199,25 +197,26 @@ namespace cohort {
 
       bool ref(Object* o, uint32_t flags) {
        ++(o->lru_refcnt);
-       if (flags & FLAG_INITIAL) {
-         if ((++(o->lru_adj) % lru_adj_modulus) == 0) {
-           Lane& lane = lane_of(o);
-           lane.lock.lock();
-           /* move to MRU */
+        if (flags & FLAG_INITIAL) {
+          Lane& lane = lane_of(o);
+          lane.lock.lock();
+         if (! o->active.test()) {
            Object::Queue::iterator it =
              Object::Queue::s_iterator_to(*o);
            lane.q.erase(it);
-           lane.q.push_front(*o);
-           lane.lock.unlock();
-         } /* adj */
+            lane.active.push_front(*o);
+            (void) o->active.test_and_set();
+          } /* adj */
+          lane.lock.unlock();
        } /* initial ref */
        return true;
       } /* ref */
 
-      void unref(Object* o, uint32_t flags) {
+      void
+      unref(Object* o, uint32_t flags) {
        uint32_t refcnt = --(o->lru_refcnt);
        Object* tdo = nullptr;
-       if (unlikely(refcnt == 0)) {
+       if (unlikely(refcnt == 0 /* last ref */)) {
          Lane& lane = lane_of(o);
          lane.lock.lock();
          refcnt = o->lru_refcnt.load();
@@ -233,15 +232,19 @@ namespace cohort {
          lane.lock.lock();
          refcnt = o->lru_refcnt.load();
          if (likely(refcnt == SENTINEL_REFCNT)) {
-           /* move to LRU */
-           Object::Queue::iterator it =
-             Object::Queue::s_iterator_to(*o);
-           lane.q.erase(it);
+           /* move to MRU */
+            Object::Queue::iterator it = Object::Queue::s_iterator_to(*o);
+            auto active = o->active.test();
+            if (active) {
+              lane.active.erase(it);
+            } else {
+              lane.q.erase(it);
+            }
            /* hiwat check */
            if (lane.q.size() > lane_hiwat) {
              tdo = o;
            } else {
-             lane.q.push_back(*o);
+             lane.q.push_front(*o);
            }
          }
          lane.lock.unlock();
@@ -262,26 +265,19 @@ namespace cohort {
        else
          o = fac->alloc(); /* get a new one */
 
-       o->lru_flags = FLAG_INLRU;
-
        Lane& lane = lane_of(o);
-       lane.lock.lock();
-       switch (edge) {
-       case Edge::MRU:
-         lane.q.push_front(*o);
-         break;
-       case Edge::LRU:
-         lane.q.push_back(*o);
-         break;
-       default:
-         ceph_abort();
-         break;
-       }
-       if (flags & FLAG_INITIAL)
+        lane.lock.lock();
+
+        /* XXX always insert onto active */
+        lane.active.push_front(*o);
+        (void) o->active.test_and_set();
+
+        if (flags & FLAG_INITIAL)
          o->lru_refcnt += 2; /* sentinel ref + initial */
        else
          ++(o->lru_refcnt); /* sentinel */
-       lane.lock.unlock();
+
+        lane.lock.unlock();
        return o;
       } /* insert */
 
index 5032b5105eb4c411915c4a607a58b7a004f3a557..f1585d532df5cdc29c7e7450eb5b24dc63d3f4ca 100644 (file)
@@ -1506,11 +1506,8 @@ namespace rgw {
     if (factory == nullptr) {
       return false;
     }
-    /* make sure the reclaiming object is the same partition with newobject factory,
-     * then we can recycle the object, and replace with newobject */
-    if (!fs->fh_cache.is_same_partition(factory->fhk.fh_hk.object, fh.fh_hk.object)) {
-      return false;
-    }
+    /* XXXX seems like we should use a flag rather than is_linked(),
+     * as we now depend on safe_link mode */
     /* in the non-delete case, handle may still be in handle table */
     if (fh_hook.is_linked()) {
       /* in this case, we are being called from a context which holds
index fe8d5da5ac1ddab9697c91dbf491ad23be27a638..8a2adb032c88f9034f3e69b1929e42802f838277 100644 (file)
@@ -1070,7 +1070,7 @@ namespace rgw {
        /* LATCHED, LOCKED */
        if (! (flags & RGWFileHandle::FLAG_LOCK))
          fh->mtx.unlock(); /* ! LOCKED */
-      }
+      } /* fh */
       lat.lock->unlock(); /* !LATCHED */
       get<0>(fhr) = fh;
       if (fh) {