msg/async/rdma: deal with all RDMA device async events
author     Changcheng Liu <changcheng.liu@aliyun.com>
Thu, 20 Jun 2019 03:20:27 +0000 (11:20 +0800)
committer  Changcheng Liu <changcheng.liu@aliyun.com>
Fri, 23 Aug 2019 02:45:22 +0000 (10:45 +0800)
1. List all asynchronous events of the RDMA device
2. Log the fatal error events so the RDMA device status can be checked (see the event-loop sketch below)

Signed-off-by: Changcheng Liu <changcheng.liu@aliyun.com>
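
For context, the sketch below (not part of the patch) shows the bare libibverbs async-event pattern that handle_async_event() in this commit is built around: fetch each event with ibv_get_async_event(), branch on event_type, then acknowledge it with ibv_ack_async_event(). The drain_async_events() helper and its stderr logging are illustrative assumptions; only the verbs calls themselves come from the real API.

    // Minimal sketch, assuming a plain C++ helper outside Ceph: fetch, classify
    // and acknowledge RDMA device async events.
    #include <infiniband/verbs.h>
    #include <cstdio>

    void drain_async_events(struct ibv_context *ctxt)  // ctxt: an opened device context
    {
      struct ibv_async_event ev;
      // ibv_get_async_event() blocks unless ctxt->async_fd has been switched
      // to non-blocking mode beforehand.
      while (ibv_get_async_event(ctxt, &ev) == 0) {
        switch (ev.event_type) {
        case IBV_EVENT_DEVICE_FATAL:   // CA is in FATAL state
        case IBV_EVENT_CQ_ERR:         // CQ overrun
        case IBV_EVENT_QP_FATAL:       // QP transitioned to the error state
          std::fprintf(stderr, "fatal RDMA event: %s\n", ibv_event_type_str(ev.event_type));
          break;
        default:                       // port/SRQ/path-migration and other informational events
          std::fprintf(stderr, "RDMA event: %s\n", ibv_event_type_str(ev.event_type));
          break;
        }
        // Every fetched event must be acknowledged; destroying the associated
        // QP/CQ/SRQ waits until all of its outstanding events have been acked.
        ibv_ack_async_event(&ev);
      }
    }

The patch below applies the same fetch/switch/ack structure inside RDMADispatcher::handle_async_event(), with Ceph's perf counters and log streams in place of the stderr lines.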
src/msg/async/rdma/RDMAStack.cc

index 5c11a8c8afc9b17e12f50f7107be9217852c51c4..eb8db72b134d7748c654979b2873f0d914014d02 100644 (file)
@@ -132,27 +132,102 @@ void RDMADispatcher::handle_async_event()
       return;
     }
     perf_logger->inc(l_msgr_rdma_total_async_events);
-    // FIXME: Currently we must ensure no other factor make QP in ERROR state,
-    // otherwise this qp can't be deleted in current cleanup flow.
-    if (async_event.event_type == IBV_EVENT_QP_LAST_WQE_REACHED) {
-      perf_logger->inc(l_msgr_rdma_async_last_wqe_events);
-      uint64_t qpn = async_event.element.qp->qp_num;
-      ldout(cct, 10) << __func__ << " event associated qp=" << async_event.element.qp
+    switch (async_event.event_type) {
+      /***********************CQ events********************/
+      case IBV_EVENT_CQ_ERR:
+        lderr(cct) << __func__ << " CQ Overflow, dev = " << get_stack()->get_infiniband().get_device()->ctxt
+                   << " Need to destroy and recreate the resource" << dendl;
+        break;
+      /***********************QP events********************/
+      case IBV_EVENT_QP_FATAL:
+        /* Error occurred on a QP and it transitioned to the error state */
+        lderr(cct) << __func__ << " Error occurred on a QP and it transitioned to the error state, dev = "
+                   << get_stack()->get_infiniband().get_device()->ctxt << " Need to destroy and recreate the resource" << dendl;
+        break;
+      case IBV_EVENT_QP_LAST_WQE_REACHED:
+        /* Last WQE reached on a QP associated with an SRQ */
+        {
+          // FIXME: Currently we must ensure no other factor make QP in ERROR state,
+          // otherwise this qp can't be deleted in current cleanup flow.
+          perf_logger->inc(l_msgr_rdma_async_last_wqe_events);
+          uint64_t qpn = async_event.element.qp->qp_num;
+          lderr(cct) << __func__ << " event associated qp=" << async_event.element.qp
                      << " evt: " << ibv_event_type_str(async_event.event_type) << dendl;
-      std::lock_guard l{lock};
-      RDMAConnectedSocketImpl *conn = get_conn_lockless(qpn);
-      if (!conn) {
-        ldout(cct, 1) << __func__ << " missing qp_num=" << qpn << " discard event" << dendl;
-      } else {
-        ldout(cct, 1) << __func__ << " it's not forwardly stopped by us, reenable=" << conn << dendl;
-        conn->fault();
-        if (!cct->_conf->ms_async_rdma_cm)
-          erase_qpn_lockless(qpn);
-      }
-    } else {
-      ldout(cct, 1) << __func__ << " ibv_get_async_event: dev=" << get_stack()->get_infiniband().get_device()->ctxt
-                    << " evt: " << ibv_event_type_str(async_event.event_type)
-                    << dendl;
+          std::lock_guard l{lock};
+          RDMAConnectedSocketImpl *conn = get_conn_lockless(qpn);
+          if (!conn) {
+            ldout(cct, 1) << __func__ << " missing qp_num=" << qpn << " discard event" << dendl;
+          } else {
+            ldout(cct, 1) << __func__ << " it's not forwardly stopped by us, reenable=" << conn << dendl;
+            conn->fault();
+            if (!cct->_conf->ms_async_rdma_cm)
+              erase_qpn_lockless(qpn);
+          }
+        }
+        break;
+      case IBV_EVENT_QP_REQ_ERR:
+        /* Invalid Request Local Work Queue Error */
+        [[fallthrough]];
+      case IBV_EVENT_QP_ACCESS_ERR:
+        /* Local access violation error */
+        [[fallthrough]];
+      case IBV_EVENT_COMM_EST:
+        /* Communication was established on a QP */
+        [[fallthrough]];
+      case IBV_EVENT_SQ_DRAINED:
+        /* Send Queue was drained of outstanding messages in progress */
+        [[fallthrough]];
+      case IBV_EVENT_PATH_MIG:
+        /* A connection has migrated to the alternate path */
+        [[fallthrough]];
+      case IBV_EVENT_PATH_MIG_ERR:
+        /* A connection failed to migrate to the alternate path */
+        [[fallthrough]];
+
+      /***********************SRQ events*******************/
+      case IBV_EVENT_SRQ_ERR:
+        /* Error occurred on an SRQ */
+        // TODO: add dedicated handling
+        [[fallthrough]];
+      case IBV_EVENT_SRQ_LIMIT_REACHED:
+        /* SRQ limit was reached */
+        // TODO: add dedicated handling
+        [[fallthrough]];
+
+      /***********************Port events******************/
+      case IBV_EVENT_PORT_ACTIVE:
+        /* Link became active on a port */
+        // TODO: add dedicated handling
+        [[fallthrough]];
+      case IBV_EVENT_PORT_ERR:
+        /* Link became unavailable on a port */
+        [[fallthrough]];
+      case IBV_EVENT_LID_CHANGE:
+        /* LID was changed on a port */
+        [[fallthrough]];
+      case IBV_EVENT_PKEY_CHANGE:
+        /* P_Key table was changed on a port */
+        [[fallthrough]];
+      case IBV_EVENT_SM_CHANGE:
+        /* SM was changed on a port */
+        [[fallthrough]];
+      case IBV_EVENT_CLIENT_REREGISTER:
+        /* SM sent a CLIENT_REREGISTER request to a port */
+        [[fallthrough]];
+      case IBV_EVENT_GID_CHANGE:
+        /* GID table was changed on a port */
+        [[fallthrough]];
+
+      /***********************CA events********************/
+      case IBV_EVENT_DEVICE_FATAL:
+        /* CA is in FATAL state */
+        ldout(cct, 1) << __func__ << " ibv_get_async_event: dev = " << get_stack()->get_infiniband().get_device()->ctxt
+                      << " evt: " << ibv_event_type_str(async_event.event_type) << dendl;
+        break;
+      default:
+        lderr(cct) << __func__ << " ibv_get_async_event: dev = " << get_stack()->get_infiniband().get_device()->ctxt
+                   << " unknown event: " << async_event.event_type << dendl;
     }
     ibv_ack_async_event(&async_event);
   }
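
As a side note on how a handler like this gets driven, the sketch below (an assumption about the caller's event loop, not something this patch adds) shows the pattern documented for libibverbs: put the device's async_fd into non-blocking mode and watch it for readability, so the fetch/switch/ack loop only runs when events are pending. The names setup_async_fd() and epfd are illustrative.

    // Sketch, assuming the caller owns an epoll instance (epfd): register the
    // device's async fd so async events can be drained without blocking in
    // ibv_get_async_event().
    #include <infiniband/verbs.h>
    #include <sys/epoll.h>
    #include <fcntl.h>

    int setup_async_fd(struct ibv_context *ctxt, int epfd)
    {
      // Non-blocking mode lets a drain loop stop cleanly once
      // ibv_get_async_event() returns -1 with errno == EAGAIN.
      int flags = fcntl(ctxt->async_fd, F_GETFL);
      if (flags < 0 || fcntl(ctxt->async_fd, F_SETFL, flags | O_NONBLOCK) < 0)
        return -1;

      struct epoll_event ev = {};
      ev.events = EPOLLIN;
      ev.data.fd = ctxt->async_fd;
      return epoll_ctl(epfd, EPOLL_CTL_ADD, ctxt->async_fd, &ev);
    }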