git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
Objecter: don't attempt to read from non-primary on EC pools 35326/head
author Ilya Dryomov <idryomov@gmail.com>
Thu, 28 May 2020 10:24:20 +0000 (12:24 +0200)
committer Ilya Dryomov <idryomov@gmail.com>
Sun, 31 May 2020 12:26:28 +0000 (14:26 +0200)
With BALANCE_READS or LOCALIZE_READS set on an EC pool, the client
will hang if a non-primary OSD is picked, because that OSD will most
likely drop the op (or start waiting for peering that won't actually
happen).

Refactor the code so that the replica read conditions don't need to
be repeated.  Apart from the missing replicated-pool check, the acting
set size was previously checked only in the LOCALIZE_READS case, not
in the BALANCE_READS case.

Fixes: https://tracker.ceph.com/issues/45793
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
src/osdc/Objecter.cc

index c59807ee4e3896c522ca0be79e1ba0fc60518b79..6c260d57cecc2efef1c024d474c8a512d2329e11 100644 (file)
@@ -2853,20 +2853,19 @@ int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change)
                   << " acting " << acting
                   << " primary " << acting_primary << dendl;
     t->used_replica = false;
-    if (acting_primary == -1) {
-      t->osd = -1;
-    } else {
+    if ((t->flags & (CEPH_OSD_FLAG_BALANCE_READS |
+                     CEPH_OSD_FLAG_LOCALIZE_READS)) &&
+        !is_write && pi->is_replicated() && acting.size() > 1) {
       int osd;
-      bool read = is_read && !is_write;
-      if (read && (t->flags & CEPH_OSD_FLAG_BALANCE_READS)) {
+      ceph_assert(is_read && acting[0] == acting_primary);
+      if (t->flags & CEPH_OSD_FLAG_BALANCE_READS) {
        int p = rand() % acting.size();
        if (p)
          t->used_replica = true;
        osd = acting[p];
        ldout(cct, 10) << " chose random osd." << osd << " of " << acting
                       << dendl;
-      } else if (read && (t->flags & CEPH_OSD_FLAG_LOCALIZE_READS) &&
-                acting.size() > 1) {
+      } else {
        // look for a local replica.  prefer the primary if the
        // distance is the same.
        int best = -1;
@@ -2889,10 +2888,10 @@ int Objecter::_calc_target(op_target_t *t, Connection *con, bool any_change)
        }
        ceph_assert(best >= 0);
        osd = acting[best];
-      } else {
-       osd = acting_primary;
       }
       t->osd = osd;
+    } else {
+      t->osd = acting_primary;
     }
   }
   if (legacy_change || unpaused || force_resend) {