]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
AsyncMessenger: move Async* related file to msg/async
authorHaomai Wang <haomaiwang@gmail.com>
Wed, 8 Oct 2014 06:08:55 +0000 (14:08 +0800)
committerHaomai Wang <haomaiwang@gmail.com>
Wed, 8 Oct 2014 07:53:20 +0000 (15:53 +0800)
Signed-off-by: Haomai Wang <haomaiwang@gmail.com>
20 files changed:
src/msg/AsyncConnection.cc [deleted file]
src/msg/AsyncConnection.h [deleted file]
src/msg/AsyncMessenger.cc [deleted file]
src/msg/AsyncMessenger.h [deleted file]
src/msg/Event.cc [deleted file]
src/msg/Event.h [deleted file]
src/msg/EventEpoll.cc [deleted file]
src/msg/EventEpoll.h [deleted file]
src/msg/async/AsyncConnection.cc [new file with mode: 0644]
src/msg/async/AsyncConnection.h [new file with mode: 0644]
src/msg/async/AsyncMessenger.cc [new file with mode: 0644]
src/msg/async/AsyncMessenger.h [new file with mode: 0644]
src/msg/async/Event.cc [new file with mode: 0644]
src/msg/async/Event.h [new file with mode: 0644]
src/msg/async/EventEpoll.cc [new file with mode: 0644]
src/msg/async/EventEpoll.h [new file with mode: 0644]
src/msg/async/net_handler.cc [new file with mode: 0644]
src/msg/async/net_handler.h [new file with mode: 0644]
src/msg/net_handler.cc [deleted file]
src/msg/net_handler.h [deleted file]

diff --git a/src/msg/AsyncConnection.cc b/src/msg/AsyncConnection.cc
deleted file mode 100644 (file)
index 31a9948..0000000
+++ /dev/null
@@ -1,2026 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <unistd.h>
-
-#include "include/Context.h"
-#include "common/errno.h"
-#include "AsyncMessenger.h"
-#include "AsyncConnection.h"
-
-// Constant to limit starting sequence number to 2^31.  Nothing special about it, just a big number.  PLR
-#define SEQ_MASK  0x7fffffff 
-
-#define dout_subsys ceph_subsys_ms
-#undef dout_prefix
-#define dout_prefix _conn_prefix(_dout)
-ostream& AsyncConnection::_conn_prefix(std::ostream *_dout) {
-  return *_dout << "-- " << async_msgr->get_myinst().addr << " >> " << peer_addr << " conn(" << this
-        << " sd=" << sd << " :" << port
-        << " s=" << get_state_name(state)
-        << " pgs=" << peer_global_seq
-        << " cs=" << connect_seq
-        << " l=" << policy.lossy
-        << ").";
-}
-
-class C_handle_read : public EventCallback {
-  AsyncConnectionRef conn;
-
- public:
-  C_handle_read(AsyncConnectionRef c): conn(c) {}
-  void do_request(int fd) {
-    conn->process();
-  }
-};
-
-class C_handle_write : public EventCallback {
-  AsyncConnectionRef conn;
-
- public:
-  C_handle_write(AsyncConnectionRef c): conn(c) {}
-  void do_request(int fd) {
-    conn->handle_write();
-  }
-};
-
-class C_handle_reset : public EventCallback {
-  AsyncMessenger *msgr;
-  AsyncConnectionRef conn;
-
- public:
-  C_handle_reset(AsyncMessenger *m, AsyncConnectionRef c): msgr(m), conn(c) {}
-  void do_request(int id) {
-    msgr->ms_deliver_handle_reset(conn.get());
-  }
-};
-
-class C_handle_remote_reset : public EventCallback {
-  AsyncMessenger *msgr;
-  AsyncConnectionRef conn;
-
- public:
-  C_handle_remote_reset(AsyncMessenger *m, AsyncConnectionRef c): msgr(m), conn(c) {}
-  void do_request(int id) {
-    msgr->ms_deliver_handle_remote_reset(conn.get());
-  }
-};
-
-class C_handle_dispatch : public EventCallback {
-  AsyncMessenger *msgr;
-  Message *m;
-
- public:
-  C_handle_dispatch(AsyncMessenger *msgr, Message *m): msgr(msgr), m(m) {}
-  void do_request(int id) {
-    //msgr->ms_fast_preprocess(m);
-    //if (msgr->ms_can_fast_dispatch(m)) {
-    //  msgr->ms_fast_dispatch(m);
-    //} else {
-      msgr->ms_deliver_dispatch(m);
-    //}
-  }
-};
-
-
-static void alloc_aligned_buffer(bufferlist& data, unsigned len, unsigned off)
-{
-  // create a buffer to read into that matches the data alignment
-  unsigned left = len;
-  if (off & ~CEPH_PAGE_MASK) {
-    // head
-    unsigned head = 0;
-    head = MIN(CEPH_PAGE_SIZE - (off & ~CEPH_PAGE_MASK), left);
-    bufferptr bp = buffer::create(head);
-    data.push_back(bp);
-    left -= head;
-  }
-  unsigned middle = left & CEPH_PAGE_MASK;
-  if (middle > 0) {
-    bufferptr bp = buffer::create_page_aligned(middle);
-    data.push_back(bp);
-    left -= middle;
-  }
-  if (left) {
-    bufferptr bp = buffer::create(left);
-    data.push_back(bp);
-  }
-}
-
-AsyncConnection::AsyncConnection(CephContext *cct, AsyncMessenger *m, EventCenter *c)
-  : Connection(cct, m), async_msgr(m), global_seq(0), connect_seq(0), out_seq(0), in_seq(0), in_seq_acked(0),
-    state(STATE_NONE), state_after_send(0), sd(-1),
-    lock("AsyncConnection::lock"), open_write(false), keepalive(false),
-    got_bad_auth(false), authorizer(NULL),
-    state_buffer(4096), state_offset(0), net(cct), center(c)
-{
-  read_handler.reset(new C_handle_read(this));
-  write_handler.reset(new C_handle_write(this));
-  reset_handler.reset(new C_handle_reset(async_msgr, this));
-  remote_reset_handler.reset(new C_handle_remote_reset(async_msgr, this));
-  memset(msgvec, 0, sizeof(msgvec));
-}
-
-AsyncConnection::~AsyncConnection()
-{
-  assert(!authorizer);
-}
-
-/* return -1 means `fd` occurs error or closed, it should be closed
- * return 0 means EAGAIN or EINTR */
-int AsyncConnection::read_bulk(int fd, char *buf, int len)
-{
-  int nread = ::read(fd, buf, len);
-  if (nread == -1) {
-    if (errno == EAGAIN || errno == EINTR) {
-      nread = 0;
-    } else {
-      ldout(async_msgr->cct, 1) << __func__ << " Reading from fd=" << fd
-                          << " : "<< strerror(errno) << dendl;
-      return -1;
-    }
-  } else if (nread == 0) {
-    ldout(async_msgr->cct, 1) << __func__ << " Peer close file descriptor "
-                              << fd << dendl;
-    return -1;
-  }
-  return nread;
-}
-
-// return the length of msg needed to be sent,
-// < 0 means error occured
-int AsyncConnection::do_sendmsg(struct msghdr &msg, int len, bool more)
-{
-  while (len > 0) {
-    int r = ::sendmsg(sd, &msg, MSG_NOSIGNAL | (more ? MSG_MORE : 0));
-
-    if (r == 0) {
-      ldout(async_msgr->cct, 10) << __func__ << " sendmsg got r==0!" << dendl;
-    } else if (r < 0) {
-      if (errno == EAGAIN || errno == EINTR) {
-        r = len;
-      } else {
-        ldout(async_msgr->cct, 1) << __func__ << " sendmsg error: " << cpp_strerror(errno) << dendl;
-      }
-
-      return r;
-    }
-
-    len -= r;
-    if (len == 0) break;
-
-    // hrmph.  trim r bytes off the front of our message.
-    ldout(async_msgr->cct, 20) << __func__ << " short write did " << r << ", still have " << len << dendl;
-    while (r > 0) {
-      if (msg.msg_iov[0].iov_len <= (size_t)r) {
-        // lose this whole item
-        r -= msg.msg_iov[0].iov_len;
-        msg.msg_iov++;
-        msg.msg_iovlen--;
-      } else {
-        msg.msg_iov[0].iov_base = (char *)msg.msg_iov[0].iov_base + r;
-        msg.msg_iov[0].iov_len -= r;
-        break;
-      }
-    }
-  }
-  return 0;
-}
-
-// return the remaining bytes, it may larger than the length of ptr
-// else return < 0 means error
-int AsyncConnection::_try_send(bufferlist send_bl, bool send)
-{
-  if (send_bl.length()) {
-    if (outcoming_bl.length())
-      outcoming_bl.claim_append(send_bl);
-    else
-      outcoming_bl.swap(send_bl);
-  }
-
-  if (!send)
-    return 0;
-
-  // standby?
-  if (is_queued() && state == STATE_STANDBY && !policy.server) {
-    assert(!outcoming_bl.length());
-    connect_seq++;
-    state = STATE_CONNECTING;
-    center->create_time_event(0, read_handler);
-    return 0;
-  }
-
-  if (state == STATE_STANDBY) {
-    ldout(async_msgr->cct, 1) << __func__ << " connection is standby" << dendl;
-    return 0;
-  }
-  if (state == STATE_CLOSED) {
-    ldout(async_msgr->cct, 1) << __func__ << " connection is closed" << dendl;
-    return -EINTR;
-  }
-
-  int r = 0;
-  uint64_t sended = 0;
-  list<bufferptr>::const_iterator pb = outcoming_bl.buffers().begin();
-  while (outcoming_bl.length() > sended) {
-    struct msghdr msg;
-    int size = MIN(outcoming_bl.buffers().size(), IOV_LEN);
-    memset(&msg, 0, sizeof(msg));
-    msg.msg_iovlen = 0;
-    msg.msg_iov = msgvec;
-    int msglen = 0;
-    while (size > 0) {
-      msgvec[msg.msg_iovlen].iov_base = (void*)(pb->c_str());
-      msgvec[msg.msg_iovlen].iov_len = pb->length();
-      msg.msg_iovlen++;
-      msglen += pb->length();
-      pb++;
-      size--;
-    }
-
-    r = do_sendmsg(msg, msglen, false);
-    if (r < 0)
-      return r;
-
-    // "r" is the remaining length
-    sended += msglen - r;
-    if (r > 0) {
-      ldout(async_msgr->cct, 5) << __func__ << " remaining " << r
-                          << " needed to be sent, creating event for writing"
-                          << dendl;
-      break;
-    }
-    // only "r" == 0 continue
-  }
-
-  // trim already sent for outcoming_bl
-  if (sended) {
-    bufferlist bl;
-    if (sended < outcoming_bl.length())
-      outcoming_bl.splice(sended, outcoming_bl.length()-sended, &bl);
-    bl.swap(outcoming_bl);
-  }
-
-  ldout(async_msgr->cct, 20) << __func__ << " send bytes " << sended
-                             << " remaining bytes " << outcoming_bl.length() << dendl;
-
-  if (!open_write && is_queued()) {
-    center->create_file_event(sd, EVENT_WRITABLE, write_handler);
-    open_write = true;
-  }
-
-  if (open_write && !is_queued()) {
-    center->delete_file_event(sd, EVENT_WRITABLE);
-    open_write = false;
-  }
-
-  return outcoming_bl.length();
-}
-
-// Because this func will be called multi times to populate
-// the needed buffer, so the passed in bufferptr must be the same.
-// Normally, only "read_message" will pass existing bufferptr in
-//
-// return the remaining bytes, 0 means this buffer is finished
-// else return < 0 means error
-int AsyncConnection::read_until(uint64_t needed, bufferptr &p)
-{
-  assert(needed);
-  int offset = state_offset;
-  int left = needed - offset;
-  int r;
-  do {
-    r = read_bulk(sd, p.c_str()+offset, left);
-    if (r < 0) {
-      ldout(async_msgr->cct, 1) << __func__ << " read failed, state is " << get_state_name(state) << dendl;
-      return -1;
-    } else if (r == left) {
-      state_offset = 0;
-      return 0;
-    }
-    left -= r;
-    offset += r;
-  } while (r > 0);
-
-  state_offset = offset;
-  ldout(async_msgr->cct, 20) << __func__ << " read " << r << " bytes, state is "
-                      << get_state_name(state) << dendl;
-  return needed - offset;
-}
-
-void AsyncConnection::process()
-{
-  int r = 0;
-  int prev_state = state;
-  Mutex::Locker l(lock);
-  do {
-    ldout(async_msgr->cct, 20) << __func__ << " state is " << get_state_name(state)
-                               << ", prev state is " << get_state_name(prev_state) << dendl;
-    prev_state = state;
-    switch (state) {
-      case STATE_OPEN:
-        {
-          char tag = -1;
-          r = read_bulk(sd, &tag, sizeof(tag));
-          if (r < 0) {
-            ldout(async_msgr->cct, 1) << __func__ << " read tag failed, state is "
-                                      << get_state_name(state) << dendl;
-            goto fail;
-          } else if (r == 0) {
-            break;
-          }
-          assert(r == 1);
-
-          if (tag == CEPH_MSGR_TAG_KEEPALIVE) {
-            ldout(async_msgr->cct, 20) << __func__ << " got KEEPALIVE" << dendl;
-          } else if (tag == CEPH_MSGR_TAG_KEEPALIVE2) {
-            state = STATE_OPEN_KEEPALIVE2;
-          } else if (tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
-            state = STATE_OPEN_KEEPALIVE2_ACK;
-          } else if (tag == CEPH_MSGR_TAG_ACK) {
-            state = STATE_OPEN_TAG_ACK;
-          } else if (tag == CEPH_MSGR_TAG_MSG) {
-            state = STATE_OPEN_MESSAGE_HEADER;
-          } else if (tag == CEPH_MSGR_TAG_CLOSE) {
-            state = STATE_OPEN_TAG_CLOSE;
-          } else {
-            ldout(async_msgr->cct, 0) << __func__ << " bad tag " << (int)tag << dendl;
-            goto fail;
-          }
-
-          break;
-        }
-
-      case STATE_OPEN_KEEPALIVE2:
-        {
-          ceph_timespec *t;
-          r = read_until(sizeof(*t), state_buffer);
-          if (r < 0) {
-            ldout(async_msgr->cct, 1) << __func__ << " read keeplive timespec failed" << dendl;
-            goto fail;
-          } else if (r > 0) {
-            break;
-          }
-
-          ldout(async_msgr->cct, 30) << __func__ << " got KEEPALIVE2 tag ..." << dendl;
-          t = (ceph_timespec*)(state_buffer.c_str());
-          utime_t kp_t = utime_t(*t);
-          _send_keepalive_or_ack(true, &kp_t);
-          ldout(async_msgr->cct, 20) << __func__ << " got KEEPALIVE2 " << kp_t << dendl;
-          state = STATE_OPEN;
-          break;
-        }
-
-      case STATE_OPEN_KEEPALIVE2_ACK:
-        {
-          ceph_timespec *t;
-          r = read_until(sizeof(*t), state_buffer);
-          if (r < 0) {
-            ldout(async_msgr->cct, 1) << __func__ << " read keeplive timespec failed" << dendl;
-            goto fail;
-          } else if (r > 0) {
-            break;
-          }
-
-          t = (ceph_timespec*)(state_buffer.c_str());
-          last_keepalive_ack = utime_t(*t);
-          ldout(async_msgr->cct, 20) << __func__ << " got KEEPALIVE_ACK" << dendl;
-          state = STATE_OPEN;
-          break;
-        }
-
-      case STATE_OPEN_TAG_ACK:
-        {
-          ceph_le64 *seq;
-          r = read_until(sizeof(seq), state_buffer);
-          if (r < 0) {
-            ldout(async_msgr->cct, 1) << __func__ << " read ack seq failed" << dendl;
-            goto fail;
-          } else if (r > 0) {
-            break;
-          }
-
-          seq = (ceph_le64*)(state_buffer.c_str());
-          ldout(async_msgr->cct, 20) << __func__ << " got ACK" << dendl;
-          handle_ack(*seq);
-          state = STATE_OPEN;
-          break;
-        }
-
-      case STATE_OPEN_MESSAGE_HEADER:
-        {
-          ldout(async_msgr->cct, 20) << __func__ << " begin MSG" << dendl;
-          ceph_msg_header header;
-          ceph_msg_header_old oldheader;
-          __u32 header_crc;
-          int len;
-          if (has_feature(CEPH_FEATURE_NOSRCADDR))
-            len = sizeof(header);
-          else
-            len = sizeof(oldheader);
-
-          r = read_until(len, state_buffer);
-          if (r < 0) {
-            ldout(async_msgr->cct, 1) << __func__ << " read message header failed" << dendl;
-            goto fail;
-          } else if (r > 0) {
-            break;
-          }
-
-          ldout(async_msgr->cct, 20) << __func__ << " got MSG header" << dendl;
-
-          if (has_feature(CEPH_FEATURE_NOSRCADDR)) {
-            header = *((ceph_msg_header*)state_buffer.c_str());
-            header_crc = ceph_crc32c(0, (unsigned char *)&header,
-                                    sizeof(header) - sizeof(header.crc));
-          } else {
-            oldheader = *((ceph_msg_header_old*)state_buffer.c_str());
-            // this is fugly
-            memcpy(&header, &oldheader, sizeof(header));
-            header.src = oldheader.src.name;
-            header.reserved = oldheader.reserved;
-            header.crc = oldheader.crc;
-            header_crc = ceph_crc32c(0, (unsigned char *)&oldheader, sizeof(oldheader) - sizeof(oldheader.crc));
-          }
-
-          ldout(async_msgr->cct, 20) << __func__ << " got envelope type=" << header.type
-                              << " src " << entity_name_t(header.src)
-                              << " front=" << header.front_len
-                              << " data=" << header.data_len
-                              << " off " << header.data_off << dendl;
-
-          // verify header crc
-          if (header_crc != header.crc) {
-            ldout(async_msgr->cct,0) << __func__ << "reader got bad header crc "
-                              << header_crc << " != " << header.crc << dendl;
-            goto fail;
-          }
-
-          // Reset state
-          data_buf.clear();
-          front.clear();
-          middle.clear();
-          data.clear();
-          recv_stamp = ceph_clock_now(async_msgr->cct);
-          current_header = header;
-          state = STATE_OPEN_MESSAGE_THROTTLE_MESSAGE;
-          break;
-        }
-
-      case STATE_OPEN_MESSAGE_THROTTLE_MESSAGE:
-        {
-          if (policy.throttler_messages) {
-            ldout(async_msgr->cct,10) << __func__ << " wants " << 1 << " message from policy throttler "
-                                << policy.throttler_messages->get_current() << "/"
-                                << policy.throttler_messages->get_max() << dendl;
-            // FIXME: may block
-            policy.throttler_messages->get();
-          }
-
-          state = STATE_OPEN_MESSAGE_THROTTLE_BYTES;
-          break;
-        }
-
-      case STATE_OPEN_MESSAGE_THROTTLE_BYTES:
-        {
-          uint64_t message_size = current_header.front_len + current_header.middle_len + current_header.data_len;
-          if (message_size) {
-            if (policy.throttler_bytes) {
-              ldout(async_msgr->cct,10) << __func__ << " wants " << message_size << " bytes from policy throttler "
-                  << policy.throttler_bytes->get_current() << "/"
-                  << policy.throttler_bytes->get_max() << dendl;
-              // FIXME: may block
-              policy.throttler_bytes->get(message_size);
-            }
-          }
-
-          throttle_stamp = ceph_clock_now(msgr->cct);
-          state = STATE_OPEN_MESSAGE_READ_FRONT;
-          break;
-        }
-
-      case STATE_OPEN_MESSAGE_READ_FRONT:
-        {
-          // read front
-          int front_len = current_header.front_len;
-          if (front_len) {
-            bufferptr ptr = buffer::create(front_len);
-            r = read_until(front_len, ptr);
-            if (r < 0) {
-              ldout(async_msgr->cct, 1) << __func__ << " read message front failed" << dendl;
-              goto fail;
-            } else if (r > 0) {
-              break;
-            }
-
-            front.push_back(ptr);
-            ldout(async_msgr->cct, 20) << __func__ << " got front " << front.length() << dendl;
-          }
-          state = STATE_OPEN_MESSAGE_READ_MIDDLE;
-          break;
-        }
-
-      case STATE_OPEN_MESSAGE_READ_MIDDLE:
-        {
-          // read middle
-          int middle_len = current_header.middle_len;
-          if (middle_len) {
-            bufferptr ptr = buffer::create(middle_len);
-            r = read_until(middle_len, ptr);
-            if (r < 0) {
-              ldout(async_msgr->cct, 1) << __func__ << " read message middle failed" << dendl;
-              goto fail;
-            } else if (r > 0) {
-              break;
-            }
-            middle.push_back(ptr);
-            ldout(async_msgr->cct, 20) << __func__ << " got middle " << middle.length() << dendl;
-          }
-
-          state = STATE_OPEN_MESSAGE_READ_DATA_PREPARE;
-          break;
-        }
-
-      case STATE_OPEN_MESSAGE_READ_DATA_PREPARE:
-        {
-          // read data
-          uint64_t data_len = le32_to_cpu(current_header.data_len);
-          int data_off = le32_to_cpu(current_header.data_off);
-          if (data_len) {
-            // get a buffer
-            map<ceph_tid_t,pair<bufferlist,int> >::iterator p = rx_buffers.find(current_header.tid);
-            if (p != rx_buffers.end()) {
-              ldout(async_msgr->cct,10) << __func__ << " seleting rx buffer v " << p->second.second
-                                  << " at offset " << data_off
-                                  << " len " << p->second.first.length() << dendl;
-              data_buf = p->second.first;
-              // make sure it's big enough
-              if (data_buf.length() < data_len)
-                data_buf.push_back(buffer::create(data_len - data_buf.length()));
-              data_blp = data_buf.begin();
-            } else {
-              ldout(async_msgr->cct,20) << __func__ << " allocating new rx buffer at offset " << data_off << dendl;
-              alloc_aligned_buffer(data_buf, data_len, data_off);
-              data_blp = data_buf.begin();
-            }
-          }
-
-          msg_left = data_len;
-          state = STATE_OPEN_MESSAGE_READ_DATA;
-          break;
-        }
-
-      case STATE_OPEN_MESSAGE_READ_DATA:
-        {
-          while (msg_left > 0) {
-            bufferptr bp = data_blp.get_current_ptr();
-            uint64_t read = MIN(bp.length(), msg_left);
-            r = read_until(read, bp);
-            if (r < 0) {
-              ldout(async_msgr->cct, 1) << __func__ << " read data error " << dendl;
-              goto fail;
-            } else if (r > 0) {
-              break;
-            }
-
-            data_blp.advance(read);
-            data.append(bp, 0, read);
-            msg_left -= read;
-          }
-
-          if (msg_left == 0)
-            state = STATE_OPEN_MESSAGE_READ_FOOTER_AND_DISPATCH;
-
-          break;
-        }
-
-      case STATE_OPEN_MESSAGE_READ_FOOTER_AND_DISPATCH:
-        {
-          ceph_msg_footer footer;
-          ceph_msg_footer_old old_footer;
-          int len;
-          // footer
-          if (has_feature(CEPH_FEATURE_MSG_AUTH))
-            len = sizeof(footer);
-          else
-            len = sizeof(old_footer);
-
-          r = read_until(len, state_buffer);
-          if (r < 0) {
-            ldout(async_msgr->cct, 1) << __func__ << " read footer data error " << dendl;
-            goto fail;
-          } else if (r > 0) {
-            break;
-          }
-
-          if (has_feature(CEPH_FEATURE_MSG_AUTH)) {
-            footer = *((ceph_msg_footer*)state_buffer.c_str());
-          } else {
-            old_footer = *((ceph_msg_footer_old*)state_buffer.c_str());
-            footer.front_crc = old_footer.front_crc;
-            footer.middle_crc = old_footer.middle_crc;
-            footer.data_crc = old_footer.data_crc;
-            footer.sig = 0;
-            footer.flags = old_footer.flags;
-          }
-          int aborted = (footer.flags & CEPH_MSG_FOOTER_COMPLETE) == 0;
-          ldout(async_msgr->cct, 10) << __func__ << " aborted = " << aborted << dendl;
-          if (aborted) {
-            ldout(async_msgr->cct, 0) << __func__ << " got " << front.length() << " + " << middle.length() << " + " << data.length()
-                                << " byte message.. ABORTED" << dendl;
-            goto fail;
-          }
-
-          ldout(async_msgr->cct, 20) << __func__ << " got " << front.length() << " + " << middle.length()
-                              << " + " << data.length() << " byte message" << dendl;
-          Message *message = decode_message(async_msgr->cct, current_header, footer, front, middle, data);
-          if (!message) {
-            ldout(async_msgr->cct, 1) << __func__ << " decode message failed " << dendl;
-            goto fail;
-          }
-
-          //
-          //  Check the signature if one should be present.  A zero return indicates success. PLR
-          //
-
-          if (session_security.get() == NULL) {
-            ldout(async_msgr->cct, 10) << __func__ << " No session security set" << dendl;
-          } else {
-            if (session_security->check_message_signature(message)) {
-              ldout(async_msgr->cct, 0) << __func__ << "Signature check failed" << dendl;
-              goto fail;
-            }
-          }
-          message->set_byte_throttler(policy.throttler_bytes);
-          message->set_message_throttler(policy.throttler_messages);
-
-          // store reservation size in message, so we don't get confused
-          // by messages entering the dispatch queue through other paths.
-          uint64_t message_size = current_header.front_len + current_header.middle_len + current_header.data_len;
-          message->set_dispatch_throttle_size(message_size);
-
-          message->set_recv_stamp(recv_stamp);
-          message->set_throttle_stamp(throttle_stamp);
-          message->set_recv_complete_stamp(ceph_clock_now(async_msgr->cct));
-
-          // check received seq#.  if it is old, drop the message.  
-          // note that incoming messages may skip ahead.  this is convenient for the client
-          // side queueing because messages can't be renumbered, but the (kernel) client will
-          // occasionally pull a message out of the sent queue to send elsewhere.  in that case
-          // it doesn't matter if we "got" it or not.
-          if (message->get_seq() <= in_seq) {
-            ldout(async_msgr->cct,0) << __func__ << " got old message "
-                    << message->get_seq() << " <= " << in_seq << " " << message << " " << *message
-                    << ", discarding" << dendl;
-            message->put();
-            if (has_feature(CEPH_FEATURE_RECONNECT_SEQ) && async_msgr->cct->_conf->ms_die_on_old_message)
-              assert(0 == "old msgs despite reconnect_seq feature");
-            goto fail;
-          }
-          message->set_connection(this);
-
-          // note last received message.
-          in_seq = message->get_seq();
-          ldout(async_msgr->cct, 10) << __func__ << " got message " << message->get_seq()
-                               << " " << message << " " << *message << dendl;
-
-          // if send_message always successfully send, it may have no
-          // opportunity to send seq ack. 10 is a experience value.
-          if (in_seq > in_seq_acked + 10) {
-            center->create_time_event(2, write_handler);
-          }
-
-          state = STATE_OPEN;
-
-          async_msgr->ms_fast_preprocess(message);
-          if (async_msgr->ms_can_fast_dispatch(message)) {
-            lock.Unlock();
-            async_msgr->ms_fast_dispatch(message);
-            lock.Lock();
-          } else {
-            center->create_time_event(1, EventCallbackRef(new C_handle_dispatch(async_msgr, message)));
-          }
-
-          break;
-        }
-
-      case STATE_OPEN_TAG_CLOSE:
-        {
-          ldout(async_msgr->cct,20) << __func__ << " got CLOSE" << dendl;
-          _stop();
-          break;
-        }
-
-      case STATE_STANDBY:
-        {
-          ldout(async_msgr->cct,20) << __func__ << " enter STANDY" << dendl;
-
-          break;
-        }
-
-      case STATE_CLOSED:
-        {
-          center->delete_file_event(sd, EVENT_READABLE);
-          ldout(async_msgr->cct, 20) << __func__ << " socket closed" << dendl;
-          break;
-        }
-
-      default:
-        {
-          if (_process_connection() < 0)
-            goto fail;
-          break;
-        }
-    }
-
-    continue;
-
-fail:
-    // clean up state internal variables and states
-    if (state >= STATE_CONNECTING_SEND_CONNECT_MSG &&
-        state <= STATE_CONNECTING_READY) {
-      delete authorizer;
-      authorizer = NULL;
-      got_bad_auth = false;
-    }
-
-    if (state > STATE_OPEN_MESSAGE_THROTTLE_MESSAGE &&
-        state <= STATE_OPEN_MESSAGE_READ_FOOTER_AND_DISPATCH
-        && policy.throttler_messages) {
-      ldout(async_msgr->cct,10) << __func__ << " releasing " << 1
-                          << " message to policy throttler "
-                          << policy.throttler_messages->get_current() << "/"
-                          << policy.throttler_messages->get_max() << dendl;
-      policy.throttler_messages->put();
-    }
-    if (state > STATE_OPEN_MESSAGE_THROTTLE_BYTES &&
-        state <= STATE_OPEN_MESSAGE_READ_FOOTER_AND_DISPATCH) {
-      uint64_t message_size = current_header.front_len + current_header.middle_len + current_header.data_len;
-      if (policy.throttler_bytes) {
-        ldout(async_msgr->cct,10) << __func__ << " releasing " << message_size
-                            << " bytes to policy throttler "
-                            << policy.throttler_bytes->get_current() << "/"
-                            << policy.throttler_bytes->get_max() << dendl;
-        policy.throttler_bytes->put(message_size);
-      }
-    }
-    fault();
-  } while (prev_state != state);
-}
-
-int AsyncConnection::_process_connection()
-{
-  int r = 0;
-
-  switch(state) {
-    case STATE_WAIT_SEND:
-      {
-        if (!outcoming_bl.length()) {
-          assert(state_after_send);
-          state = state_after_send;
-          state_after_send = 0;
-        }
-        break;
-      }
-
-    case STATE_CONNECTING:
-      {
-        assert(!policy.server);
-
-        // reset connect state variables
-        got_bad_auth = false;
-        delete authorizer;
-        authorizer = NULL;
-        memset(&connect_msg, 0, sizeof(connect_msg));
-        memset(&connect_reply, 0, sizeof(connect_reply));
-
-        global_seq = async_msgr->get_global_seq();
-        // close old socket.  this is safe because we stopped the reader thread above.
-        if (sd >= 0) {
-          center->delete_file_event(sd, EVENT_READABLE|EVENT_WRITABLE);
-          ::close(sd);
-        }
-
-        sd = net.connect(get_peer_addr());
-        if (sd < 0) {
-          goto fail;
-        }
-        r = net.set_nonblock(sd);
-        if (r < 0) {
-          goto fail;
-        }
-        net.set_socket_options(sd);
-
-        center->create_file_event(sd, EVENT_READABLE, read_handler);
-        state = STATE_CONNECTING_WAIT_BANNER;
-        break;
-      }
-
-    case STATE_CONNECTING_WAIT_BANNER:
-      {
-        r = read_until(strlen(CEPH_BANNER), state_buffer);
-        if (r < 0) {
-          ldout(async_msgr->cct, 1) << __func__ << " read banner failed" << dendl;
-          goto fail;
-        } else if (r > 0) {
-          break;
-        }
-
-        if (memcmp(state_buffer.c_str(), CEPH_BANNER, strlen(CEPH_BANNER))) {
-          ldout(async_msgr->cct, 0) << __func__ << " connect protocol error (bad banner) on peer "
-                              << get_peer_addr() << dendl;
-          goto fail;
-        }
-
-        ldout(async_msgr->cct, 10) << __func__ << " get banner, ready to send banner" << dendl;
-
-        bufferlist bl;
-        bl.append(state_buffer.c_str(), strlen(CEPH_BANNER));
-        r = _try_send(bl);
-        if (r == 0) {
-          state = STATE_CONNECTING_WAIT_IDENTIFY_PEER;
-          ldout(async_msgr->cct, 10) << __func__ << " connect write banner done: "
-                               << get_peer_addr() << dendl;
-        } else if (r > 0) {
-          state = STATE_WAIT_SEND;
-          state_after_send = STATE_CONNECTING_WAIT_IDENTIFY_PEER;
-          ldout(async_msgr->cct, 10) << __func__ << " connect wait for write banner: "
-                               << get_peer_addr() << dendl;
-        } else {
-          goto fail;
-        }
-        break;
-      }
-
-    case STATE_CONNECTING_WAIT_IDENTIFY_PEER:
-      {
-        entity_addr_t paddr, peer_addr_for_me;
-        int port;
-        bufferlist myaddrbl;
-
-        r = read_until(sizeof(paddr)*2, state_buffer);
-        if (r < 0) {
-          ldout(async_msgr->cct, 1) << __func__ << " read identify peeraddr failed" << dendl;
-          goto fail;
-        } else if (r > 0) {
-          break;
-        }
-
-        bufferlist bl;
-        bl.append(state_buffer);
-        bufferlist::iterator p = bl.begin();
-        try {
-          ::decode(paddr, p);
-          ::decode(peer_addr_for_me, p);
-        } catch (const buffer::error& e) {
-          lderr(async_msgr->cct) << __func__ <<  " decode peer addr failed " << dendl;
-          goto fail;
-        }
-        port = peer_addr_for_me.get_port();
-        ldout(async_msgr->cct, 20) << __func__ <<  " connect read peer addr "
-                             << paddr << " on socket " << sd << dendl;
-        if (peer_addr != paddr) {
-          if (paddr.is_blank_ip() && peer_addr.get_port() == paddr.get_port() &&
-              peer_addr.get_nonce() == paddr.get_nonce()) {
-            ldout(async_msgr->cct, 0) << __func__ <<  " connect claims to be " << paddr
-                                << " not " << peer_addr
-                                << " - presumably this is the same node!" << dendl;
-          } else {
-            ldout(async_msgr->cct, 0) << __func__ << " connect claims to be "
-                                << paddr << " not " << peer_addr << " - wrong node!" << dendl;
-            goto fail;
-          }
-        }
-
-        ldout(async_msgr->cct, 20) << __func__ << " connect peer addr for me is " << peer_addr_for_me << dendl;
-        async_msgr->learned_addr(peer_addr_for_me);
-        ::encode(async_msgr->get_myaddr(), myaddrbl);
-        r = _try_send(myaddrbl);
-        if (r == 0) {
-          state = STATE_CONNECTING_SEND_CONNECT_MSG;
-          ldout(async_msgr->cct, 10) << __func__ << " connect sent my addr "
-              << async_msgr->get_myaddr() << dendl;
-        } else if (r > 0) {
-          state = STATE_WAIT_SEND;
-          state_after_send = STATE_CONNECTING_SEND_CONNECT_MSG;
-          ldout(async_msgr->cct, 10) << __func__ << " connect send my addr done: "
-              << async_msgr->get_myaddr() << dendl;
-        } else {
-          ldout(async_msgr->cct, 2) << __func__ << " connect couldn't write my addr, "
-              << cpp_strerror(errno) << dendl;
-          goto fail;
-        }
-
-        break;
-      }
-
-    case STATE_CONNECTING_SEND_CONNECT_MSG:
-      {
-        if (!got_bad_auth) {
-          delete authorizer;
-          authorizer = async_msgr->get_authorizer(peer_type, false);
-        }
-        bufferlist bl;
-
-        connect_msg.features = policy.features_supported;
-        connect_msg.host_type = async_msgr->get_myinst().name.type();
-        connect_msg.global_seq = global_seq;
-        connect_msg.connect_seq = connect_seq;
-        connect_msg.protocol_version = async_msgr->get_proto_version(peer_type, true);
-        connect_msg.authorizer_protocol = authorizer ? authorizer->protocol : 0;
-        connect_msg.authorizer_len = authorizer ? authorizer->bl.length() : 0;
-        if (authorizer)
-          ldout(async_msgr->cct, 10) << __func__ <<  "connect_msg.authorizer_len="
-              << connect_msg.authorizer_len << " protocol="
-              << connect_msg.authorizer_protocol << dendl;
-        connect_msg.flags = 0;
-        if (policy.lossy)
-          connect_msg.flags |= CEPH_MSG_CONNECT_LOSSY;  // this is fyi, actually, server decides!
-        bl.append((char*)&connect_msg, sizeof(connect_msg));
-        if (authorizer) {
-          bl.append(authorizer->bl.c_str(), authorizer->bl.length());
-        }
-        ldout(async_msgr->cct, 10) << __func__ << " connect sending gseq=" << global_seq << " cseq="
-            << connect_seq << " proto=" << connect_msg.protocol_version << dendl;
-
-        r = _try_send(bl);
-        if (r == 0) {
-          state = STATE_CONNECTING_WAIT_CONNECT_REPLY;
-          ldout(async_msgr->cct,20) << __func__ << "connect wrote (self +) cseq, waiting for reply" << dendl;
-        } else if (r > 0) {
-          state = STATE_WAIT_SEND;
-          state_after_send = STATE_CONNECTING_WAIT_CONNECT_REPLY;
-          ldout(async_msgr->cct, 10) << __func__ << " continue send reply " << dendl;
-        } else {
-          ldout(async_msgr->cct, 2) << __func__ << " connect couldn't send reply "
-              << cpp_strerror(errno) << dendl;
-          goto fail;
-        }
-
-        break;
-      }
-
-    case STATE_CONNECTING_WAIT_CONNECT_REPLY:
-      {
-        r = read_until(sizeof(connect_reply), state_buffer);
-        if (r < 0) {
-          ldout(async_msgr->cct, 1) << __func__ << " read connect reply failed" << dendl;
-          goto fail;
-        } else if (r > 0) {
-          break;
-        }
-
-        connect_reply = *((ceph_msg_connect_reply*)state_buffer.c_str());
-        connect_reply.features = ceph_sanitize_features(connect_reply.features);
-
-        ldout(async_msgr->cct, 20) << __func__ << " connect got reply tag " << (int)connect_reply.tag
-                             << " connect_seq " << connect_reply.connect_seq << " global_seq "
-                             << connect_reply.global_seq << " proto " << connect_reply.protocol_version
-                             << " flags " << (int)connect_reply.flags << " features "
-                             << connect_reply.features << dendl;
-        state = STATE_CONNECTING_WAIT_CONNECT_REPLY_AUTH;
-
-        break;
-      }
-
-    case STATE_CONNECTING_WAIT_CONNECT_REPLY_AUTH:
-      {
-        bufferlist authorizer_reply;
-        if (connect_reply.authorizer_len) {
-          ldout(async_msgr->cct, 10) << __func__ << " reply.authorizer_len=" << connect_reply.authorizer_len << dendl;
-          r = read_until(connect_reply.authorizer_len, state_buffer);
-          if (r < 0) {
-            ldout(async_msgr->cct, 1) << __func__ << " read connect reply authorizer failed" << dendl;
-            goto fail;
-          } else if (r > 0) {
-            break;
-          }
-
-          authorizer_reply.push_back(state_buffer);
-          bufferlist::iterator iter = authorizer_reply.begin();
-          if (authorizer && !authorizer->verify_reply(iter)) {
-            ldout(async_msgr->cct, 0) << __func__ << " failed verifying authorize reply" << dendl;
-            goto fail;
-          }
-        }
-        r = handle_connect_reply(connect_msg, connect_reply);
-        if (r < 0)
-          goto fail;
-
-        // state must be changed!
-        assert(state != STATE_CONNECTING_WAIT_CONNECT_REPLY_AUTH);
-        break;
-      }
-
-    case STATE_CONNECTING_WAIT_ACK_SEQ:
-      {
-        uint64_t newly_acked_seq = 0;
-        bufferlist bl;
-
-        r = read_until(sizeof(newly_acked_seq), state_buffer);
-        if (r < 0) {
-          ldout(async_msgr->cct, 1) << __func__ << " read connect ack seq failed" << dendl;
-          goto fail;
-        } else if (r > 0) {
-          break;
-        }
-
-        newly_acked_seq = *((uint64_t*)state_buffer.c_str());
-        ldout(async_msgr->cct, 2) << __func__ << " got newly_acked_seq " << newly_acked_seq
-                            << " vs out_seq " << out_seq << dendl;
-        while (newly_acked_seq > out_seq) {
-          Message *m = _get_next_outgoing();
-          assert(m);
-          ldout(async_msgr->cct, 2) << __func__ << " discarding previously sent " << m->get_seq()
-                              << " " << *m << dendl;
-          assert(m->get_seq() <= newly_acked_seq);
-          m->put();
-          ++out_seq;
-        }
-
-        bl.append((char*)&in_seq, sizeof(in_seq));
-        r = _try_send(bl);
-        if (r == 0) {
-          state = STATE_CONNECTING_READY;
-          ldout(async_msgr->cct, 10) << __func__ << " send in_seq done " << dendl;
-        } else if (r > 0) {
-          state_after_send = STATE_CONNECTING_READY;
-          state = STATE_WAIT_SEND;
-          ldout(async_msgr->cct, 10) << __func__ << " continue send in_seq " << dendl;
-        } else {
-          goto fail;
-        }
-        break;
-      }
-
-    case STATE_CONNECTING_READY:
-      {
-        // hooray!
-        peer_global_seq = connect_reply.global_seq;
-        policy.lossy = connect_reply.flags & CEPH_MSG_CONNECT_LOSSY;
-        state = STATE_OPEN;
-        connect_seq += 1;
-        assert(connect_seq == connect_reply.connect_seq);
-        backoff = utime_t();
-        set_features((uint64_t)connect_reply.features & (uint64_t)connect_msg.features);
-        ldout(async_msgr->cct, 10) << __func__ << "connect success " << connect_seq
-                             << ", lossy = " << policy.lossy << ", features "
-                             << get_features() << dendl;
-
-        // If we have an authorizer, get a new AuthSessionHandler to deal with ongoing security of the
-        // connection.  PLR
-        if (authorizer != NULL) {
-          session_security.reset(
-              get_auth_session_handler(async_msgr->cct,
-                                       authorizer->protocol,
-                                       authorizer->session_key,
-                                       get_features()));
-        } else {
-          // We have no authorizer, so we shouldn't be applying security to messages in this AsyncConnection.  PLR
-          session_security.reset();
-        }
-
-        async_msgr->ms_deliver_handle_connect(this);
-        async_msgr->ms_deliver_handle_fast_connect(this);
-
-        // message may in queue between last _try_send and connection ready
-        // write event may already notify and we need to force scheduler again
-        if (is_queued())
-          center->create_time_event(1, write_handler);
-
-        break;
-      }
-
-    case STATE_ACCEPTING:
-      {
-        bufferlist bl;
-
-        if (net.set_nonblock(sd) < 0)
-          goto fail;
-
-        net.set_socket_options(sd);
-
-        bl.append(CEPH_BANNER, strlen(CEPH_BANNER));
-
-        ::encode(async_msgr->get_myaddr(), bl);
-        port = async_msgr->get_myaddr().get_port();
-        // and peer's socket addr (they might not know their ip)
-        socklen_t len = sizeof(socket_addr.ss_addr());
-        r = ::getpeername(sd, (sockaddr*)&socket_addr.ss_addr(), &len);
-        if (r < 0) {
-          ldout(async_msgr->cct, 0) << __func__ << " failed to getpeername "
-                              << cpp_strerror(errno) << dendl;
-          goto fail;
-        }
-        ::encode(socket_addr, bl);
-        ldout(async_msgr->cct, 1) << __func__ << " sd=" << sd << " " << socket_addr << dendl;
-
-        r = _try_send(bl);
-        if (r == 0) {
-          state = STATE_ACCEPTING_WAIT_BANNER_ADDR;
-          ldout(async_msgr->cct, 10) << __func__ << " write banner and addr done: "
-            << get_peer_addr() << dendl;
-        } else if (r > 0) {
-          state = STATE_WAIT_SEND;
-          state_after_send = STATE_ACCEPTING_WAIT_BANNER_ADDR;
-          ldout(async_msgr->cct, 10) << __func__ << " wait for write banner and addr: "
-                              << get_peer_addr() << dendl;
-        } else {
-          goto fail;
-        }
-
-        break;
-      }
-    case STATE_ACCEPTING_WAIT_BANNER_ADDR:
-      {
-        bufferlist addr_bl;
-        entity_addr_t peer_addr;
-
-        r = read_until(strlen(CEPH_BANNER) + sizeof(peer_addr), state_buffer);
-        if (r < 0) {
-          ldout(async_msgr->cct, 1) << __func__ << " read peer banner and addr failed" << dendl;
-          goto fail;
-        } else if (r > 0) {
-          break;
-        }
-
-        if (memcmp(state_buffer.c_str(), CEPH_BANNER, strlen(CEPH_BANNER))) {
-          ldout(async_msgr->cct, 1) << __func__ << " accept peer sent bad banner '" << state_buffer.c_str()
-                                    << "' (should be '" << CEPH_BANNER << "')" << dendl;
-          goto fail;
-        }
-
-        addr_bl.append(state_buffer, strlen(CEPH_BANNER), sizeof(peer_addr));
-        {
-          bufferlist::iterator ti = addr_bl.begin();
-          ::decode(peer_addr, ti);
-        }
-
-        ldout(async_msgr->cct, 10) << __func__ << " accept peer addr is " << peer_addr << dendl;
-        if (peer_addr.is_blank_ip()) {
-          // peer apparently doesn't know what ip they have; figure it out for them.
-          int port = peer_addr.get_port();
-          peer_addr.addr = socket_addr.addr;
-          peer_addr.set_port(port);
-          ldout(async_msgr->cct, 0) << __func__ << " accept peer addr is really " << peer_addr
-                             << " (socket is " << socket_addr << ")" << dendl;
-        }
-        set_peer_addr(peer_addr);  // so that connection_state gets set up
-        state = STATE_ACCEPTING_WAIT_CONNECT_MSG;
-        break;
-      }
-
-    case STATE_ACCEPTING_WAIT_CONNECT_MSG:
-      {
-        r = read_until(sizeof(connect_msg), state_buffer);
-        if (r < 0) {
-          ldout(async_msgr->cct, 1) << __func__ << " read connect msg failed" << dendl;
-          goto fail;
-        } else if (r > 0) {
-          break;
-        }
-
-        connect_msg = *((ceph_msg_connect*)state_buffer.c_str());
-        // sanitize features
-        connect_msg.features = ceph_sanitize_features(connect_msg.features);
-        state = STATE_ACCEPTING_WAIT_CONNECT_MSG_AUTH;
-        break;
-      }
-
-    case STATE_ACCEPTING_WAIT_CONNECT_MSG_AUTH:
-      {
-        bufferlist authorizer_bl, authorizer_reply;
-
-        if (connect_msg.authorizer_len) {
-          r = read_until(connect_msg.authorizer_len, state_buffer);
-          if (r < 0) {
-            ldout(async_msgr->cct, 1) << __func__ << " read connect msg failed" << dendl;
-            goto fail;
-          } else if (r > 0) {
-            break;
-          }
-          authorizer_bl.push_back(state_buffer);
-        }
-
-        ldout(async_msgr->cct, 20) << __func__ << " accept got peer connect_seq "
-                             << connect_msg.connect_seq << " global_seq "
-                             << connect_msg.global_seq << dendl;
-        set_peer_type(connect_msg.host_type);
-        policy = async_msgr->get_policy(connect_msg.host_type);
-        ldout(async_msgr->cct, 10) << __func__ << " accept of host_type " << connect_msg.host_type
-                                   << ", policy.lossy=" << policy.lossy << " policy.server="
-                                   << policy.server << " policy.standby=" << policy.standby
-                                   << " policy.resetcheck=" << policy.resetcheck << dendl;
-
-        r = handle_connect_msg(connect_msg, authorizer_bl, authorizer_reply);
-        if (r < 0)
-          goto fail;
-
-        // state is changed by "handle_connect_msg"
-        assert(state != STATE_ACCEPTING_WAIT_CONNECT_MSG_AUTH);
-        break;
-      }
-
-    case STATE_ACCEPTING_WAIT_SEQ:
-      {
-        uint64_t newly_acked_seq;
-        r = read_until(sizeof(newly_acked_seq), state_buffer);
-        if (r < 0) {
-          ldout(async_msgr->cct, 1) << __func__ << " read ack seq failed" << dendl;
-          goto fail;
-        } else if (r > 0) {
-          break;
-        }
-
-        newly_acked_seq = *((uint64_t*)state_buffer.c_str());
-        ldout(async_msgr->cct, 2) << __func__ << " accept get newly_acked_seq " << newly_acked_seq << dendl;
-        discard_requeued_up_to(newly_acked_seq);
-        state = STATE_ACCEPTING_READY;
-        break;
-      }
-
-    case STATE_ACCEPTING_READY:
-      {
-        ldout(async_msgr->cct, 20) << __func__ << " accept done" << dendl;
-        state = STATE_OPEN;
-        memset(&connect_msg, 0, sizeof(connect_msg));
-        break;
-      }
-
-    default:
-      {
-        lderr(async_msgr->cct) << __func__ << " bad state" << get_state_name(state) << dendl;
-        assert(0);
-      }
-  }
-
-  return 0;
-
-fail:
-  return -1;
-}
-
-int AsyncConnection::handle_connect_reply(ceph_msg_connect &connect, ceph_msg_connect_reply &reply)
-{
-  uint64_t feat_missing;
-  if (reply.tag == CEPH_MSGR_TAG_FEATURES) {
-    ldout(async_msgr->cct, 0) << __func__ << " connect protocol feature mismatch, my "
-                        << std::hex << connect.features << " < peer "
-                        << reply.features << " missing "
-                        << (reply.features & ~policy.features_supported)
-                        << std::dec << dendl;
-    goto fail;
-  }
-
-  if (reply.tag == CEPH_MSGR_TAG_BADPROTOVER) {
-    ldout(async_msgr->cct, 0) << __func__ << " connect protocol version mismatch, my "
-                        << connect.protocol_version << " != " << reply.protocol_version
-                        << dendl;
-    goto fail;
-  }
-
-  if (reply.tag == CEPH_MSGR_TAG_BADAUTHORIZER) {
-    ldout(async_msgr->cct,0) << __func__ << " connect got BADAUTHORIZER" << dendl;
-    if (got_bad_auth)
-      goto fail;
-    got_bad_auth = true;
-    delete authorizer;
-    authorizer = async_msgr->get_authorizer(peer_type, true);  // try harder
-    state = STATE_CONNECTING_SEND_CONNECT_MSG;
-  }
-  if (reply.tag == CEPH_MSGR_TAG_RESETSESSION) {
-    ldout(async_msgr->cct, 0) << __func__ << "connect got RESETSESSION" << dendl;
-    was_session_reset();
-    state = STATE_CONNECTING_SEND_CONNECT_MSG;
-  }
-  if (reply.tag == CEPH_MSGR_TAG_RETRY_GLOBAL) {
-    global_seq = async_msgr->get_global_seq(reply.global_seq);
-    ldout(async_msgr->cct, 10) << __func__ << " connect got RETRY_GLOBAL "
-                         << reply.global_seq << " chose new "
-                         << global_seq << dendl;
-    state = STATE_CONNECTING_SEND_CONNECT_MSG;
-  }
-  if (reply.tag == CEPH_MSGR_TAG_RETRY_SESSION) {
-    assert(reply.connect_seq > connect_seq);
-    connect_seq = reply.connect_seq;
-    ldout(async_msgr->cct, 10) << __func__ << " connect got RETRY_SESSION "
-                         << connect_seq << " -> "
-                         << reply.connect_seq << dendl;
-    state = STATE_CONNECTING_SEND_CONNECT_MSG;
-  }
-  if (reply.tag == CEPH_MSGR_TAG_WAIT) {
-    ldout(async_msgr->cct, 3) << __func__ << " connect got WAIT (connection race)" << dendl;
-    state = STATE_WAIT;
-  }
-
-  feat_missing = policy.features_required & ~(uint64_t)connect_reply.features;
-  if (feat_missing) {
-    ldout(async_msgr->cct, 1) << __func__ << " missing required features " << std::hex
-                              << feat_missing << std::dec << dendl;
-    goto fail;
-  }
-
-  if (reply.tag == CEPH_MSGR_TAG_SEQ) {
-    ldout(async_msgr->cct, 10) << __func__ << "got CEPH_MSGR_TAG_SEQ, reading acked_seq and writing in_seq" << dendl;
-    state = STATE_CONNECTING_WAIT_ACK_SEQ;
-  }
-  if (reply.tag == CEPH_MSGR_TAG_READY) {
-    ldout(async_msgr->cct, 10) << __func__ << "got CEPH_MSGR_TAG_READY " << dendl;
-    state = STATE_CONNECTING_READY;
-  }
-
-  return 0;
-
- fail:
-  return -1;
-}
-
-int AsyncConnection::handle_connect_msg(ceph_msg_connect &connect, bufferlist &authorizer_bl,
-                                        bufferlist &authorizer_reply)
-{
-  int r;
-  ceph_msg_connect_reply reply;
-  bufferlist reply_bl;
-  uint64_t existing_seq = -1;
-  bool is_reset_from_peer = false;
-  char reply_tag;
-
-  memset(&reply, 0, sizeof(reply));
-  reply.protocol_version = async_msgr->get_proto_version(peer_type, false);
-
-  // mismatch?
-  ldout(async_msgr->cct,10) << __func__ << "accept my proto " << reply.protocol_version
-                      << ", their proto " << connect.protocol_version << dendl;
-  if (connect.protocol_version != reply.protocol_version) {
-    return _reply_accept(CEPH_MSGR_TAG_BADPROTOVER, connect, reply, authorizer_reply);
-  }
-  // require signatures for cephx?
-  if (connect.authorizer_protocol == CEPH_AUTH_CEPHX) {
-    if (peer_type == CEPH_ENTITY_TYPE_OSD ||
-        peer_type == CEPH_ENTITY_TYPE_MDS) {
-      if (async_msgr->cct->_conf->cephx_require_signatures ||
-          async_msgr->cct->_conf->cephx_cluster_require_signatures) {
-        ldout(async_msgr->cct, 10) << __func__ << " using cephx, requiring MSG_AUTH feature bit for cluster" << dendl;
-        policy.features_required |= CEPH_FEATURE_MSG_AUTH;
-      }
-    } else {
-      if (async_msgr->cct->_conf->cephx_require_signatures ||
-          async_msgr->cct->_conf->cephx_service_require_signatures) {
-        ldout(async_msgr->cct, 10) << __func__ << " using cephx, requiring MSG_AUTH feature bit for service" << dendl;
-        policy.features_required |= CEPH_FEATURE_MSG_AUTH;
-      }
-    }
-  }
-  uint64_t feat_missing = policy.features_required & ~(uint64_t)connect.features;
-  if (feat_missing) {
-    ldout(async_msgr->cct, 1) << __func__ << "peer missing required features "
-                        << std::hex << feat_missing << std::dec << dendl;
-    return _reply_accept(CEPH_MSGR_TAG_FEATURES, connect, reply, authorizer_reply);
-  }
-
-  bool authorizer_valid;
-  if (!async_msgr->verify_authorizer(this, peer_type, connect.authorizer_protocol, authorizer_bl,
-                               authorizer_reply, authorizer_valid, session_key) || !authorizer_valid) {
-    ldout(async_msgr->cct,0) << __func__ << ": got bad authorizer" << dendl;
-    session_security.reset();
-    return _reply_accept(CEPH_MSGR_TAG_BADAUTHORIZER, connect, reply, authorizer_reply);
-  }
-
-  // We've verified the authorizer for this AsyncConnection, so set up the session security structure.  PLR
-  ldout(async_msgr->cct, 10) << __func__ << " accept:  setting up session_security." << dendl;
-
-  // existing?
-  AsyncConnectionRef existing = async_msgr->lookup_conn(peer_addr);
-  if (existing) {
-    if (connect.global_seq < existing->peer_global_seq) {
-      ldout(async_msgr->cct, 10) << __func__ << " accept existing " << existing
-                           << ".gseq " << existing->peer_global_seq << " > "
-                           << connect.global_seq << ", RETRY_GLOBAL" << dendl;
-      reply.global_seq = existing->peer_global_seq;  // so we can send it below..
-      return _reply_accept(CEPH_MSGR_TAG_RETRY_GLOBAL, connect, reply, authorizer_reply);
-    } else {
-      ldout(async_msgr->cct, 10) << __func__ << " accept existing " << existing
-                           << ".gseq " << existing->peer_global_seq
-                           << " <= " << connect.global_seq << ", looks ok" << dendl;
-    }
-
-    if (existing->policy.lossy) {
-      ldout(async_msgr->cct, 0) << __func__ << " accept replacing existing (lossy) channel (new one lossy="
-                          << policy.lossy << ")" << dendl;
-      existing->was_session_reset();
-      goto replace;
-    }
-
-    ldout(async_msgr->cct, 0) << __func__ << "accept connect_seq " << connect.connect_seq
-                        << " vs existing " << existing->connect_seq
-                        << " state " << existing->state << dendl;
-
-    if (connect.connect_seq == 0 && existing->connect_seq > 0) {
-      ldout(async_msgr->cct,0) << __func__ << " accept peer reset, then tried to connect to us, replacing" << dendl;
-      // this is a hard reset from peer
-      is_reset_from_peer = true;
-      if (policy.resetcheck)
-        existing->was_session_reset(); // this resets out_queue, msg_ and connect_seq #'s
-      goto replace;
-    }
-
-    if (connect.connect_seq < existing->connect_seq) {
-      // old attempt, or we sent READY but they didn't get it.
-      ldout(async_msgr->cct, 10) << __func__ << "accept existing " << existing << ".cseq "
-                           << existing->connect_seq << " > " << connect.connect_seq
-                           << ", RETRY_SESSION" << dendl;
-      reply.connect_seq = existing->connect_seq + 1;
-      return _reply_accept(CEPH_MSGR_TAG_RETRY_SESSION, connect, reply, authorizer_reply);
-    }
-
-    if (connect.connect_seq == existing->connect_seq) {
-      // if the existing connection successfully opened, and/or
-      // subsequently went to standby, then the peer should bump
-      // their connect_seq and retry: this is not a connection race
-      // we need to resolve here.
-      if (existing->state == STATE_OPEN ||
-          existing->state == STATE_STANDBY) {
-        ldout(async_msgr->cct, 10) << __func__ << " accept connection race, existing " << existing
-                             << ".cseq " << existing->connect_seq << " == "
-                             << connect.connect_seq << ", OPEN|STANDBY, RETRY_SESSION" << dendl;
-        reply.connect_seq = existing->connect_seq + 1;
-        return _reply_accept(CEPH_MSGR_TAG_RETRY_SESSION, connect, reply, authorizer_reply);
-      }
-
-      // connection race?
-      if (peer_addr < async_msgr->get_myaddr() || existing->policy.server) {
-        // incoming wins
-        ldout(async_msgr->cct, 10) << __func__ << " accept connection race, existing " << existing
-                             << ".cseq " << existing->connect_seq << " == " << connect.connect_seq
-                             << ", or we are server, replacing my attempt" << dendl;
-        goto replace;
-      } else {
-        // our existing outgoing wins
-        ldout(async_msgr->cct,10) << __func__ << "accept connection race, existing "
-                            << existing << ".cseq " << existing->connect_seq
-                            << " == " << connect.connect_seq << ", sending WAIT" << dendl;
-        assert(peer_addr > async_msgr->get_myaddr());
-        // make sure our outgoing connection will follow through
-        existing->_send_keepalive_or_ack();
-        return _reply_accept(CEPH_MSGR_TAG_WAIT, connect, reply, authorizer_reply);
-      }
-    }
-
-    assert(connect.connect_seq > existing->connect_seq);
-    assert(connect.global_seq >= existing->peer_global_seq);
-    if (policy.resetcheck &&   // RESETSESSION only used by servers; peers do not reset each other
-        existing->connect_seq == 0) {
-      ldout(async_msgr->cct, 0) << __func__ << "accept we reset (peer sent cseq "
-                          << connect.connect_seq << ", " << existing << ".cseq = "
-                          << existing->connect_seq << "), sending RESETSESSION" << dendl;
-      return _reply_accept(CEPH_MSGR_TAG_RESETSESSION, connect, reply, authorizer_reply);
-    }
-
-    // reconnect
-    ldout(async_msgr->cct, 10) << __func__ << " accept peer sent cseq " << connect.connect_seq
-                         << " > " << existing->connect_seq << dendl;
-    goto replace;
-  } // existing
-  else if (policy.resetcheck && connect.connect_seq > 0) {
-    // we reset, and they are opening a new session
-    ldout(async_msgr->cct, 0) << __func__ << "accept we reset (peer sent cseq "
-                        << connect.connect_seq << "), sending RESETSESSION" << dendl;
-    return _reply_accept(CEPH_MSGR_TAG_RESETSESSION, connect, reply, authorizer_reply);
-  } else {
-    // new session
-    ldout(async_msgr->cct,10) << __func__ << "accept new session" << dendl;
-    existing = NULL;
-    goto open;
-  }
-  assert(0);
-
- replace:
-  // if it is a hard reset from peer, we don't need a round-trip to negotiate in/out sequence
-  if ((connect.features & CEPH_FEATURE_RECONNECT_SEQ) && !is_reset_from_peer) {
-    reply_tag = CEPH_MSGR_TAG_SEQ;
-    existing_seq = existing->in_seq;
-  }
-  ldout(async_msgr->cct, 10) << __func__ << " accept replacing " << existing << dendl;
-  existing->mark_down();
-
-  // In order to avoid dead lock, here need to lock in ordering.
-  // It may be another thread access this connection between unlock and lock
-  // call, this is rely to EventCenter to guarantee only one thread can access
-  // one connection.
-  lock.Unlock();
-  if (existing->sd > sd) {
-    existing->lock.Lock();
-    lock.Lock();
-  } else {
-    lock.Lock();
-    existing->lock.Lock();
-  }
-  if (existing->policy.lossy) {
-    // disconnect from the Connection
-    async_msgr->ms_deliver_handle_reset(existing.get());
-  } else {
-    // queue a reset on the new connection, which we're dumping for the old
-    async_msgr->ms_deliver_handle_reset(this);
-
-    // reset the in_seq if this is a hard reset from peer,
-    // otherwise we respect our original connection's value
-    if (is_reset_from_peer)
-      existing->in_seq = 0;
-
-    // Clean up output buffer
-    existing->outcoming_bl.clear();
-    existing->requeue_sent();
-    reply.connect_seq = existing->connect_seq + 1;
-    if (_reply_accept(CEPH_MSGR_TAG_RETRY_SESSION, connect, reply, authorizer_reply) < 0)
-      goto fail;
-
-    uint64_t s = existing->sd;
-    existing->sd = sd;
-    sd = s;
-    existing->state = STATE_ACCEPTING_WAIT_CONNECT_MSG;
-    _stop();
-    existing->lock.Unlock();
-    return 0;
-  }
-  existing->lock.Unlock();
-
- open:
-  connect_seq = connect.connect_seq + 1;
-  peer_global_seq = connect.global_seq;
-  ldout(async_msgr->cct, 10) << __func__ << " accept success, connect_seq = "
-                       << connect_seq << ", sending READY" << dendl;
-
-  // send READY reply
-  reply.tag = (reply_tag ? reply_tag : CEPH_MSGR_TAG_READY);
-  reply.features = policy.features_supported;
-  reply.global_seq = async_msgr->get_global_seq();
-  reply.connect_seq = connect_seq;
-  reply.flags = 0;
-  reply.authorizer_len = authorizer_reply.length();
-  if (policy.lossy)
-    reply.flags = reply.flags | CEPH_MSG_CONNECT_LOSSY;
-
-  set_features((uint64_t)reply.features & (uint64_t)connect.features);
-  ldout(async_msgr->cct, 10) << __func__ << " accept features " << get_features() << dendl;
-
-  session_security.reset(
-      get_auth_session_handler(async_msgr->cct, connect.authorizer_protocol,
-                               session_key, get_features()));
-
-  // notify
-  async_msgr->ms_deliver_handle_accept(this);
-  async_msgr->ms_deliver_handle_fast_accept(this);
-
-  // ok!
-  async_msgr->accept_conn(this);
-
-  reply_bl.append((char*)&reply, sizeof(reply));
-
-  if (reply.authorizer_len)
-    reply_bl.append(authorizer_reply.c_str(), authorizer_reply.length());
-
-  int next_state;
-
-  if (reply_tag == CEPH_MSGR_TAG_SEQ) {
-    reply_bl.append((char*)&existing_seq, sizeof(existing_seq));
-    next_state = STATE_ACCEPTING_WAIT_SEQ;
-  } else {
-    next_state = STATE_ACCEPTING_READY;
-    discard_requeued_up_to(0);
-  }
-
-  r = _try_send(reply_bl);
-  if (r < 0) {
-    goto fail;
-  }
-
-  if (r == 0) {
-    state = next_state;
-    ldout(async_msgr->cct, 2) << __func__ << " accept write reply msg done" << dendl;
-  } else {
-    state = STATE_WAIT_SEND;
-    state_after_send = next_state;
-  }
-
-  return 0;
-
- fail:
-  return -1;
-}
-
-void AsyncConnection::_connect()
-{
-  ldout(async_msgr->cct, 10) << __func__ << " " << connect_seq << dendl;
-
-  state = STATE_CONNECTING;
-  // rescheduler connection in order to avoid lock dep
-  // may called by external thread(send_message)
-  center->dispatch_event_external(read_handler);
-}
-
-void AsyncConnection::accept(int incoming)
-{
-  ldout(async_msgr->cct, 10) << __func__ << " " << incoming << dendl;
-  assert(sd < 0);
-
-  sd = incoming;
-  state = STATE_ACCEPTING;
-  center->create_file_event(sd, EVENT_READABLE, read_handler);
-  // rescheduler connection in order to avoid lock dep
-  process();
-}
-
-int AsyncConnection::send_message(Message *m)
-{
-  ldout(async_msgr->cct, 10) << __func__ << dendl;
-  m->get_header().src = async_msgr->get_myname();
-  if (!m->get_priority())
-    m->set_priority(async_msgr->get_default_send_priority());
-
-  Mutex::Locker l(lock);
-  if (!is_queued() && state >= STATE_OPEN && state <= STATE_OPEN_TAG_CLOSE) {
-    ldout(async_msgr->cct, 10) << __func__ << " try send msg " << m << dendl;
-    int r = _send(m);
-    if (r < 0) {
-      ldout(async_msgr->cct, 1) << __func__ << " send msg failed" << dendl;
-      // we want to handle fault within internal thread
-      center->dispatch_event_external(write_handler);
-    }
-  } else {
-    out_q[m->get_priority()].push_back(m);
-    if ((state == STATE_STANDBY || state == STATE_CLOSED) && !policy.server) {
-      ldout(async_msgr->cct, 10) << __func__ << " state is " << get_state_name(state)
-                                 << " policy.server is false" << dendl;
-      _connect();
-    } else if (sd > 0 && !open_write) {
-      center->dispatch_event_external(write_handler);
-    }
-  }
-  return 0;
-}
-
-void AsyncConnection::requeue_sent()
-{
-  if (sent.empty())
-    return;
-
-  list<Message*>& rq = out_q[CEPH_MSG_PRIO_HIGHEST];
-  while (!sent.empty()) {
-    Message *m = sent.back();
-    sent.pop_back();
-    ldout(async_msgr->cct, 10) << __func__ << " " << *m << " for resend seq " << out_seq
-                         << " (" << m->get_seq() << ")" << dendl;
-    rq.push_front(m);
-    out_seq--;
-  }
-}
-
-void AsyncConnection::discard_requeued_up_to(uint64_t seq)
-{
-  ldout(async_msgr->cct, 10) << __func__ << " " << seq << dendl;
-  if (out_q.count(CEPH_MSG_PRIO_HIGHEST) == 0)
-    return;
-  list<Message*>& rq = out_q[CEPH_MSG_PRIO_HIGHEST];
-  while (!rq.empty()) {
-    Message *m = rq.front();
-    if (m->get_seq() == 0 || m->get_seq() > seq)
-      break;
-    ldout(async_msgr->cct, 10) << __func__ << " " << *m << " for resend seq " << out_seq
-                         << " <= " << seq << ", discarding" << dendl;
-    m->put();
-    rq.pop_front();
-    out_seq++;
-  }
-  if (rq.empty())
-    out_q.erase(CEPH_MSG_PRIO_HIGHEST);
-}
-
-/*
- * Tears down the AsyncConnection's message queues, and removes them from the DispatchQueue
- * Must hold pipe_lock prior to calling.
- */
-void AsyncConnection::discard_out_queue()
-{
-  ldout(async_msgr->cct, 10) << __func__ << " " << dendl;
-
-  for (list<Message*>::iterator p = sent.begin(); p != sent.end(); ++p) {
-    ldout(async_msgr->cct, 20) << __func__ << " discard " << *p << dendl;
-    (*p)->put();
-  }
-  sent.clear();
-  for (map<int,list<Message*> >::iterator p = out_q.begin(); p != out_q.end(); ++p)
-    for (list<Message*>::iterator r = p->second.begin(); r != p->second.end(); ++r) {
-      ldout(async_msgr->cct, 20) << __func__ << " discard " << *r << dendl;
-      (*r)->put();
-    }
-  out_q.clear();
-}
-
-int AsyncConnection::randomize_out_seq()
-{
-  if (get_features() & CEPH_FEATURE_MSG_AUTH) {
-    // Set out_seq to a random value, so CRC won't be predictable.   Don't bother checking seq_error
-    // here.  We'll check it on the call.  PLR
-    int seq_error = get_random_bytes((char *)&out_seq, sizeof(out_seq));
-    out_seq &= SEQ_MASK;
-    lsubdout(async_msgr->cct, ms, 10) << __func__ << "randomize_out_seq " << out_seq << dendl;
-    return seq_error;
-  } else {
-    // previously, seq #'s always started at 0.
-    out_seq = 0;
-    return 0;
-  }
-}
-
-void AsyncConnection::fault()
-{
-  if (state == STATE_CLOSED) {
-    ldout(async_msgr->cct, 10) << __func__ << " state is already STATE_CLOSED" << dendl;
-    return ;
-  }
-
-  if (policy.lossy && state != STATE_CONNECTING) {
-    ldout(async_msgr->cct, 10) << __func__ << " on lossy channel, failing" << dendl;
-    _stop();
-    return ;
-  }
-
-  if (sd >= 0) {
-    shutdown_socket();
-    center->delete_file_event(sd, EVENT_READABLE|EVENT_WRITABLE);
-  }
-  open_write = false;
-
-  // requeue sent items
-  requeue_sent();
-  outcoming_bl.clear();
-  if (policy.standby && !is_queued()) {
-    ldout(async_msgr->cct,0) << __func__ << " with nothing to send, going to standby" << dendl;
-    state = STATE_STANDBY;
-    return;
-  }
-
-  if (state != STATE_CONNECTING) {
-    // policy maybe empty when state is in accept
-    if (policy.server || (state >= STATE_ACCEPTING && state < STATE_ACCEPTING_WAIT_SEQ)) {
-      ldout(async_msgr->cct, 0) << __func__ << " server, going to standby" << dendl;
-      state = STATE_STANDBY;
-    } else {
-      ldout(async_msgr->cct, 0) << __func__ << " initiating reconnect" << dendl;
-      connect_seq++;
-      state = STATE_CONNECTING;
-    }
-    backoff = utime_t();
-  } else {
-    if (backoff == utime_t()) {
-      backoff.set_from_double(async_msgr->cct->_conf->ms_initial_backoff);
-    } else {
-      backoff += backoff;
-      if (backoff > async_msgr->cct->_conf->ms_max_backoff)
-        backoff.set_from_double(async_msgr->cct->_conf->ms_max_backoff);
-    }
-    ldout(async_msgr->cct, 10) << __func__ << " waiting " << backoff << dendl;
-  }
-
-  // woke up again;
-  center->create_time_event(backoff, read_handler);
-}
-
-void AsyncConnection::was_session_reset()
-{
-  ldout(async_msgr->cct,10) << __func__ << "was_session_reset" << dendl;
-  discard_out_queue();
-  outcoming_bl.clear();
-
-  center->dispatch_event_external(remote_reset_handler);
-
-  if (randomize_out_seq()) {
-    lsubdout(async_msgr->cct,ms,15) << __func__ << " Could not get random bytes to set seq number for session reset; set seq number to " << out_seq << dendl;
-  }
-
-  in_seq = 0;
-  connect_seq = 0;
-  in_seq_acked = 0;
-}
-
-void AsyncConnection::_stop()
-{
-  ldout(async_msgr->cct, 10) << __func__ << dendl;
-  center->dispatch_event_external(reset_handler);
-  shutdown_socket();
-  discard_out_queue();
-  outcoming_bl.clear();
-  if (policy.lossy)
-    was_session_reset();
-  open_write = false;
-  state = STATE_CLOSED;
-}
-
-int AsyncConnection::_send(Message *m)
-{
-  m->set_seq(++out_seq);
-  if (!policy.lossy) {
-    // put on sent list
-    sent.push_back(m); 
-    m->get();
-  }
-
-  // associate message with Connection (for benefit of encode_payload)
-  m->set_connection(this);
-
-  uint64_t features = get_features();
-  if (m->empty_payload())
-    ldout(async_msgr->cct, 20) << __func__ << " encoding " << m->get_seq() << " features " << features
-                         << " " << m << " " << *m << dendl;
-  else
-    ldout(async_msgr->cct, 20) << __func__ << " half-reencoding " << m->get_seq() << " features "
-                         << features << " " << m << " " << *m << dendl;
-
-  // encode and copy out of *m
-  m->encode(features, !async_msgr->cct->_conf->ms_nocrc);
-
-  // prepare everything
-  ceph_msg_header& header = m->get_header();
-  ceph_msg_footer& footer = m->get_footer();
-
-  // Now that we have all the crcs calculated, handle the
-  // digital signature for the message, if the AsyncConnection has session
-  // security set up.  Some session security options do not
-  // actually calculate and check the signature, but they should
-  // handle the calls to sign_message and check_signature.  PLR
-  if (session_security.get() == NULL) {
-    ldout(async_msgr->cct, 20) << __func__ << " no session security" << dendl;
-  } else {
-    if (session_security->sign_message(m)) {
-      ldout(async_msgr->cct, 20) << __func__ << " failed to sign seq # "
-                           << header.seq << "): sig = " << footer.sig << dendl;
-    } else {
-      ldout(async_msgr->cct, 20) << __func__ << " signed seq # " << header.seq
-                           << "): sig = " << footer.sig << dendl;
-    }
-  }
-
-  bufferlist blist = m->get_payload();
-  blist.append(m->get_middle());
-  blist.append(m->get_data());
-
-  ldout(async_msgr->cct, 20) << __func__ << " sending " << m->get_seq()
-                       << " " << m << dendl;
-  int rc = write_message(header, footer, blist);
-
-  if (rc < 0) {
-    ldout(async_msgr->cct, 1) << __func__ << " error sending " << m << ", "
-                        << cpp_strerror(errno) << dendl;
-  } else if (rc == 0) {
-    ldout(async_msgr->cct, 10) << __func__ << " sending " << m << " done." << dendl;
-  } else {
-    ldout(async_msgr->cct, 10) << __func__ << " sending " << m << " continuely." << dendl;
-  }
-  m->put();
-
-  return rc;
-}
-
-int AsyncConnection::write_message(ceph_msg_header& header, ceph_msg_footer& footer,
-                                  bufferlist& blist)
-{
-  bufferlist bl;
-  int ret;
-
-  // send tag
-  char tag = CEPH_MSGR_TAG_MSG;
-  bl.append(&tag, sizeof(tag));
-
-  // send envelope
-  ceph_msg_header_old oldheader;
-  if (has_feature(CEPH_FEATURE_NOSRCADDR)) {
-    bl.append((char*)&header, sizeof(header));
-  } else {
-    memcpy(&oldheader, &header, sizeof(header));
-    oldheader.src.name = header.src;
-    oldheader.src.addr = get_peer_addr();
-    oldheader.orig_src = oldheader.src;
-    oldheader.reserved = header.reserved;
-    oldheader.crc = ceph_crc32c(0, (unsigned char*)&oldheader,
-                                sizeof(oldheader) - sizeof(oldheader.crc));
-    bl.append((char*)&oldheader, sizeof(oldheader));
-  }
-
-  bl.claim_append(blist);
-
-  // send footer; if receiver doesn't support signatures, use the old footer format
-  ceph_msg_footer_old old_footer;
-  if (has_feature(CEPH_FEATURE_MSG_AUTH)) {
-    bl.append((char*)&footer, sizeof(footer));
-  } else {
-    old_footer.front_crc = footer.front_crc;
-    old_footer.middle_crc = footer.middle_crc;
-    old_footer.data_crc = footer.data_crc;
-    old_footer.flags = footer.flags;
-    bl.append((char*)&old_footer, sizeof(old_footer));
-  }
-
-  // send
-  ret = _try_send(bl);
-  if (ret < 0)
-    return ret;
-
-  return ret;
-}
-
-void AsyncConnection::handle_ack(uint64_t seq)
-{
-  lsubdout(async_msgr->cct, ms, 15) << __func__ << " got ack seq " << seq << dendl;
-  // trim sent list
-  while (!sent.empty() && sent.front()->get_seq() <= seq) {
-    Message *m = sent.front();
-    sent.pop_front();
-    lsubdout(async_msgr->cct, ms, 10) << __func__ << "reader got ack seq "
-                                << seq << " >= " << m->get_seq() << " on "
-                                << m << " " << *m << dendl;
-    m->put();
-  }
-}
-
-void AsyncConnection::send_keepalive()
-{
-  Mutex::Locker l(lock);
-  keepalive = true;
-  center->dispatch_event_external(write_handler);
-}
-
-void AsyncConnection::_send_keepalive_or_ack(bool ack, utime_t *tp)
-{
-  assert(lock.is_locked());
-  bufferlist bl;
-
-  utime_t t = ceph_clock_now(async_msgr->cct);
-  struct ceph_timespec ts;
-  t.encode_timeval(&ts);
-  if (ack) {
-    assert(tp);
-    tp->encode_timeval(&ts);
-    bl.append(CEPH_MSGR_TAG_KEEPALIVE2_ACK);
-    bl.append((char*)&ts, sizeof(ts));
-  } else if (has_feature(CEPH_FEATURE_MSGR_KEEPALIVE2)) {
-    struct ceph_timespec ts;
-    t.encode_timeval(&ts);
-    bl.append(CEPH_MSGR_TAG_KEEPALIVE2);
-    bl.append((char*)&ts, sizeof(ts));
-  } else {
-    bl.append(CEPH_MSGR_TAG_KEEPALIVE);
-  }
-
-  ldout(async_msgr->cct, 10) << __func__ << " try send keepalive or ack" << dendl;
-  _try_send(bl, false);
-}
-
-void AsyncConnection::handle_write()
-{
-  ldout(async_msgr->cct, 10) << __func__ << " started." << dendl;
-  Mutex::Locker l(lock);
-  bufferlist bl;
-  int r;
-  if (state >= STATE_OPEN && state <= STATE_OPEN_TAG_CLOSE) {
-    if (keepalive) {
-      _send_keepalive_or_ack();
-      keepalive = false;
-    }
-
-    while (1) {
-      Message *m = _get_next_outgoing();
-      if (!m)
-        break;
-
-      ldout(async_msgr->cct, 10) << __func__ << " try send msg " << m << dendl;
-      r = _send(m);
-      if (r < 0) {
-        ldout(async_msgr->cct, 1) << __func__ << " send msg failed" << dendl;
-        goto fail;
-      } else if (r > 0) {
-        break;
-      }
-    }
-
-    if (in_seq > in_seq_acked) {
-      ceph_le64 s;
-      s = in_seq;
-      bl.append(CEPH_MSGR_TAG_ACK);
-      bl.append((char*)&s, sizeof(s));
-      ldout(async_msgr->cct, 10) << __func__ << " try send msg ack" << dendl;
-      in_seq_acked = s;
-      _try_send(bl);
-    }
-  } else if (state != STATE_CONNECTING) {
-    r = _try_send(bl);
-    if (r < 0) {
-      ldout(async_msgr->cct, 1) << __func__ << " send outcoming bl failed" << dendl;
-      goto fail;
-    }
-  }
-
-  return ;
- fail:
-  fault();
-}
diff --git a/src/msg/AsyncConnection.h b/src/msg/AsyncConnection.h
deleted file mode 100644 (file)
index 3c025ff..0000000
+++ /dev/null
@@ -1,249 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab
-#ifndef CEPH_MSG_ASYNCCONNECTION_H
-#define CEPH_MSG_ASYNCCONNECTION_H
-
-#include <list>
-#include <map>
-using namespace std;
-
-#include "common/Mutex.h"
-#include "include/buffer.h"
-
-#include "auth/AuthSessionHandler.h"
-#include "include/buffer.h"
-#include "Connection.h"
-#include "net_handler.h"
-#include "Event.h"
-#include "Messenger.h"
-
-class AsyncMessenger;
-
-class AsyncConnection : public Connection {
-  const static uint64_t IOV_LEN = 1024;
-
-  int read_bulk(int fd, char *buf, int len);
-  int do_sendmsg(struct msghdr &msg, int len, bool more);
-  // if "send" is false, it will only append bl to send buffer
-  // the main usage is avoid error happen outside messenger threads
-  int _try_send(bufferlist bl, bool send=true);
-  int _send(Message *m);
-  int read_until(uint64_t needed, bufferptr &p);
-  int _process_connection();
-  void _connect();
-  void _stop();
-  int handle_connect_reply(ceph_msg_connect &connect, ceph_msg_connect_reply &r);
-  int handle_connect_msg(ceph_msg_connect &m, bufferlist &aubl, bufferlist &bl);
-  void was_session_reset();
-  void fault();
-  void discard_out_queue();
-  void discard_requeued_up_to(uint64_t seq);
-  void requeue_sent();
-  int randomize_out_seq();
-  void handle_ack(uint64_t seq);
-  void _send_keepalive_or_ack(bool ack=false, utime_t *t=NULL);
-  int write_message(ceph_msg_header& header, ceph_msg_footer& footer, bufferlist& blist);
-  int _reply_accept(char tag, ceph_msg_connect &connect, ceph_msg_connect_reply &reply,
-                    bufferlist authorizer_reply) {
-    bufferlist reply_bl;
-    reply.tag = tag;
-    reply.features = ((uint64_t)connect.features & policy.features_supported) | policy.features_required;
-    reply.authorizer_len = authorizer_reply.length();
-    reply_bl.append((char*)&reply, sizeof(reply));
-    if (reply.authorizer_len) {
-      reply_bl.append(authorizer_reply.c_str(), authorizer_reply.length());
-    }
-    int r = _try_send(reply_bl);
-    if (r < 0)
-      return -1;
-
-    state = STATE_ACCEPTING_WAIT_CONNECT_MSG;
-    return 0;
-  }
-  bool is_queued() {
-    return !out_q.empty() || outcoming_bl.length();
-  }
-  void shutdown_socket() {
-    if (sd >= 0)
-      ::shutdown(sd, SHUT_RDWR);
-  }
-  Message *_get_next_outgoing() {
-    Message *m = 0;
-    while (!m && !out_q.empty()) {
-      map<int, list<Message*> >::reverse_iterator p = out_q.rbegin();
-      if (!p->second.empty()) {
-        m = p->second.front();
-        p->second.pop_front();
-      }
-      if (p->second.empty())
-        out_q.erase(p->first);
-    }
-    return m;
-  }
- public:
-  AsyncConnection(CephContext *cct, AsyncMessenger *m, EventCenter *c);
-  ~AsyncConnection();
-
-  ostream& _conn_prefix(std::ostream *_dout);
-
-  bool is_connected() {
-    // FIXME?
-    return true;
-  }
-
-  // Only call when AsyncConnection first construct
-  void connect(const entity_addr_t& addr, int type) {
-    set_peer_type(type);
-    set_peer_addr(addr);
-    policy = msgr->get_policy(type);
-    _connect();
-  }
-  // Only call when AsyncConnection first construct
-  void accept(int sd);
-  int send_message(Message *m);
-
-  void send_keepalive();
-  void mark_down() {
-    Mutex::Locker l(lock);
-    _stop();
-  }
-  void mark_disposable() {
-    Mutex::Locker l(lock);
-    policy.lossy = true;
-  }
-
- private:
-  enum {
-    STATE_NONE,
-    STATE_OPEN,
-    STATE_OPEN_KEEPALIVE2,
-    STATE_OPEN_KEEPALIVE2_ACK,
-    STATE_OPEN_TAG_ACK,
-    STATE_OPEN_MESSAGE_HEADER,
-    STATE_OPEN_MESSAGE_THROTTLE_MESSAGE,
-    STATE_OPEN_MESSAGE_THROTTLE_BYTES,
-    STATE_OPEN_MESSAGE_READ_FRONT,
-    STATE_OPEN_MESSAGE_READ_MIDDLE,
-    STATE_OPEN_MESSAGE_READ_DATA_PREPARE,
-    STATE_OPEN_MESSAGE_READ_DATA,
-    STATE_OPEN_MESSAGE_READ_FOOTER_AND_DISPATCH,
-    STATE_OPEN_TAG_CLOSE,
-    STATE_WAIT_SEND,
-    STATE_CONNECTING,
-    STATE_CONNECTING_WAIT_BANNER,
-    STATE_CONNECTING_WAIT_IDENTIFY_PEER,
-    STATE_CONNECTING_SEND_CONNECT_MSG,
-    STATE_CONNECTING_WAIT_CONNECT_REPLY,
-    STATE_CONNECTING_WAIT_CONNECT_REPLY_AUTH,
-    STATE_CONNECTING_WAIT_ACK_SEQ,
-    STATE_CONNECTING_READY,
-    STATE_ACCEPTING,
-    STATE_ACCEPTING_HANDLE_CONNECT,
-    STATE_ACCEPTING_WAIT_BANNER_ADDR,
-    STATE_ACCEPTING_WAIT_CONNECT_MSG,
-    STATE_ACCEPTING_WAIT_CONNECT_MSG_AUTH,
-    STATE_ACCEPTING_WAIT_SEQ,
-    STATE_ACCEPTING_READY,
-    STATE_STANDBY,
-    STATE_CLOSED,
-    STATE_WAIT,       // just wait for racing connection
-  };
-
-  static const char *get_state_name(int state) {
-      const char* const statenames[] = {"STATE_NONE",
-                                        "STATE_OPEN",
-                                        "STATE_OPEN_KEEPALIVE2",
-                                        "STATE_OPEN_KEEPALIVE2_ACK",
-                                        "STATE_OPEN_TAG_ACK",
-                                        "STATE_OPEN_MESSAGE_HEADER",
-                                        "STATE_OPEN_MESSAGE_THROTTLE_MESSAGE",
-                                        "STATE_OPEN_MESSAGE_THROTTLE_BYTES",
-                                        "STATE_OPEN_MESSAGE_READ_FRONT",
-                                        "STATE_OPEN_MESSAGE_READ_MIDDLE",
-                                        "STATE_OPEN_MESSAGE_READ_DATA_PREPARE",
-                                        "STATE_OPEN_MESSAGE_READ_DATA",
-                                        "STATE_OPEN_MESSAGE_READ_FOOTER_AND_DISPATCH",
-                                        "STATE_OPEN_TAG_CLOSE",
-                                        "STATE_WAIT_SEND",
-                                        "STATE_CONNECTING",
-                                        "STATE_CONNECTING_WAIT_BANNER",
-                                        "STATE_CONNECTING_WAIT_IDENTIFY_PEER",
-                                        "STATE_CONNECTING_SEND_CONNECT_MSG",
-                                        "STATE_CONNECTING_WAIT_CONNECT_REPLY",
-                                        "STATE_CONNECTING_WAIT_CONNECT_REPLY_AUTH",
-                                        "STATE_CONNECTING_WAIT_ACK_SEQ",
-                                        "STATE_CONNECTING_READY",
-                                        "STATE_ACCEPTING",
-                                        "STATE_ACCEPTING_HANDLE_CONNECT",
-                                        "STATE_ACCEPTING_WAIT_BANNER_ADDR",
-                                        "STATE_ACCEPTING_WAIT_CONNECT_MSG",
-                                        "STATE_ACCEPTING_WAIT_CONNECT_MSG_AUTH",
-                                        "STATE_ACCEPTING_WAIT_SEQ",
-                                        "STATE_ACCEPTING_READY",
-                                        "STATE_STANDBY",
-                                        "STATE_CLOSED",
-                                        "STATE_WAIT",
-                                        "STATE_FAULT"};
-      return statenames[state];
-  }
-
-  CephContext *cc;
-  AsyncMessenger *async_msgr;
-  int global_seq;
-  __u32 connect_seq, peer_global_seq;
-  uint64_t out_seq;
-  uint64_t in_seq, in_seq_acked;
-  int state;
-  int state_after_send;
-  int sd;
-  int port;
-  Messenger::Policy policy;
-  map<int, list<Message*> > out_q;  // priority queue for outbound msgs
-  list<Message*> sent;
-  Mutex lock;
-  utime_t backoff;         // backoff time
-  bool open_write;
-  EventCallbackRef read_handler;
-  EventCallbackRef write_handler;
-  EventCallbackRef reset_handler;
-  EventCallbackRef remote_reset_handler;
-  bool keepalive;
-  struct iovec msgvec[IOV_LEN];
-
-  // Tis section are temp variables used by state transition
-
-  // Open state
-  utime_t recv_stamp;
-  utime_t throttle_stamp;
-  uint64_t msg_left;
-  ceph_msg_header current_header;
-  bufferlist data_buf;
-  bufferlist::iterator data_blp;
-  bufferlist front, middle, data;
-  ceph_msg_connect connect_msg;
-  // Connecting state
-  bool got_bad_auth;
-  AuthAuthorizer *authorizer;
-  ceph_msg_connect_reply connect_reply;
-  // Accepting state
-  entity_addr_t socket_addr;
-  CryptoKey session_key;
-
-  // used only for local state, it will be overwrite when state transition
-  bufferptr state_buffer;
-  // used only by "read_until"
-  uint64_t state_offset;
-  bufferlist outcoming_bl;
-  NetHandler net;
-  EventCenter *center;
-  ceph::shared_ptr<AuthSessionHandler> session_security;
-
- public:
-  // used by eventcallback
-  void handle_write();
-  void process();
-}; /* AsyncConnection */
-
-typedef boost::intrusive_ptr<AsyncConnection> AsyncConnectionRef;
-
-#endif
diff --git a/src/msg/AsyncMessenger.cc b/src/msg/AsyncMessenger.cc
deleted file mode 100644 (file)
index ed8f04f..0000000
+++ /dev/null
@@ -1,678 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab
-
-#include <errno.h>
-#include <iostream>
-#include <fstream>
-#include <poll.h>
-
-#include "AsyncMessenger.h"
-
-#include "common/config.h"
-#include "common/Timer.h"
-#include "common/errno.h"
-#include "auth/Crypto.h"
-#include "include/Spinlock.h"
-
-#define dout_subsys ceph_subsys_ms
-#undef dout_prefix
-#define dout_prefix _prefix(_dout, this)
-static ostream& _prefix(std::ostream *_dout, AsyncMessenger *m) {
-  return *_dout << "-- " << m->get_myaddr() << " ";
-}
-
-static ostream& _prefix(std::ostream *_dout, Processor *p) {
-  return *_dout << " Processor -- ";
-}
-
-static ostream& _prefix(std::ostream *_dout, Worker *w) {
-  return *_dout << "--";
-}
-
-class C_handle_accept : public EventCallback {
-  AsyncConnectionRef conn;
-  int fd;
-
- public:
-  C_handle_accept(AsyncConnectionRef c, int s): conn(c), fd(s) {}
-  void do_request(int id) {
-    conn->accept(fd);
-  }
-};
-
-class C_handle_connect : public EventCallback {
-  AsyncConnectionRef conn;
-  const entity_addr_t addr;
-  int type;
-
- public:
-  C_handle_connect(AsyncConnectionRef c, const entity_addr_t &d, int t)
-      :conn(c), addr(d), type(t) {}
-  void do_request(int id) {
-    conn->connect(addr, type);
-  }
-};
-
-
-/*******************
- * Processor
- */
-
-int Processor::bind(const entity_addr_t &bind_addr, const set<int>& avoid_ports)
-{
-  const md_config_t *conf = msgr->cct->_conf;
-  // bind to a socket
-  ldout(msgr->cct, 10) << __func__ << dendl;
-
-  int family;
-  switch (bind_addr.get_family()) {
-  case AF_INET:
-  case AF_INET6:
-    family = bind_addr.get_family();
-    break;
-
-  default:
-    // bind_addr is empty
-    family = conf->ms_bind_ipv6 ? AF_INET6 : AF_INET;
-  }
-
-  /* socket creation */
-  listen_sd = ::socket(family, SOCK_STREAM, 0);
-  if (listen_sd < 0) {
-    lderr(msgr->cct) << __func__ << " unable to create socket: "
-                     << cpp_strerror(errno) << dendl;
-    return -errno;
-  }
-
-  // use whatever user specified (if anything)
-  entity_addr_t listen_addr = bind_addr;
-  listen_addr.set_family(family);
-
-  /* bind to port */
-  int rc = -1;
-  if (listen_addr.get_port()) {
-    // specific port
-
-    // reuse addr+port when possible
-    int on = 1;
-    rc = ::setsockopt(listen_sd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
-    if (rc < 0) {
-      lderr(msgr->cct) << __func__ << " unable to setsockopt: "
-                       << cpp_strerror(errno) << dendl;
-      return -errno;
-    }
-
-    rc = ::bind(listen_sd, (struct sockaddr *) &listen_addr.ss_addr(), listen_addr.addr_size());
-    if (rc < 0) {
-      lderr(msgr->cct) << __func__ << " unable to bind to " << listen_addr.ss_addr()
-                       << ": " << cpp_strerror(errno) << dendl;
-      return -errno;
-    }
-  } else {
-    // try a range of ports
-    for (int port = msgr->cct->_conf->ms_bind_port_min; port <= msgr->cct->_conf->ms_bind_port_max; port++) {
-      if (avoid_ports.count(port))
-        continue;
-      listen_addr.set_port(port);
-      rc = ::bind(listen_sd, (struct sockaddr *) &listen_addr.ss_addr(), listen_addr.addr_size());
-      if (rc == 0)
-        break;
-    }
-    if (rc < 0) {
-      lderr(msgr->cct) << __func__ << " unable to bind to " << listen_addr.ss_addr()
-                       << " on any port in range " << msgr->cct->_conf->ms_bind_port_min
-                       << "-" << msgr->cct->_conf->ms_bind_port_max
-                       << ": " << cpp_strerror(errno) << dendl;
-      return -errno;
-    }
-    ldout(msgr->cct,10) << __func__ << " bound on random port " << listen_addr << dendl;
-  }
-
-  // what port did we get?
-  socklen_t llen = sizeof(listen_addr.ss_addr());
-  rc = getsockname(listen_sd, (sockaddr*)&listen_addr.ss_addr(), &llen);
-  if (rc < 0) {
-    rc = -errno;
-    lderr(msgr->cct) << __func__ << " failed getsockname: " << cpp_strerror(rc) << dendl;
-    return rc;
-  }
-
-  ldout(msgr->cct, 10) << __func__ << " bound to " << listen_addr << dendl;
-
-  // listen!
-  rc = ::listen(listen_sd, 128);
-  if (rc < 0) {
-    rc = -errno;
-    lderr(msgr->cct) << __func__ << " unable to listen on " << listen_addr
-                     << ": " << cpp_strerror(rc) << dendl;
-    return rc;
-  }
-
-  msgr->set_myaddr(bind_addr);
-  if (bind_addr != entity_addr_t())
-    msgr->learned_addr(bind_addr);
-
-  if (msgr->get_myaddr().get_port() == 0) {
-    msgr->set_myaddr(listen_addr);
-  }
-  entity_addr_t addr = msgr->get_myaddr();
-  addr.nonce = nonce;
-  msgr->set_myaddr(addr);
-
-  msgr->init_local_connection();
-
-  ldout(msgr->cct,1) << __func__ << " bind my_inst.addr is " << msgr->get_myaddr() << dendl;
-  return 0;
-}
-
-int Processor::rebind(const set<int>& avoid_ports)
-{
-  ldout(msgr->cct, 1) << __func__ << " rebind avoid " << avoid_ports << dendl;
-
-  entity_addr_t addr = msgr->get_myaddr();
-  set<int> new_avoid = avoid_ports;
-  new_avoid.insert(addr.get_port());
-  addr.set_port(0);
-
-  // adjust the nonce; we want our entity_addr_t to be truly unique.
-  nonce += 1000000;
-  msgr->my_inst.addr.nonce = nonce;
-  ldout(msgr->cct, 10) << __func__ << " new nonce " << nonce << " and inst " << msgr->my_inst << dendl;
-
-  ldout(msgr->cct, 10) << __func__ << " will try " << addr << " and avoid ports " << new_avoid << dendl;
-  int r = bind(addr, new_avoid);
-  if (r == 0)
-    start();
-  return r;
-}
-
-int Processor::start()
-{
-  ldout(msgr->cct, 1) << __func__ << " start" << dendl;
-
-  // start thread
-  if (listen_sd > 0)
-    create();
-
-  return 0;
-}
-
-void *Processor::entry()
-{
-  ldout(msgr->cct, 10) << __func__ << " starting" << dendl;
-  int errors = 0;
-
-  struct pollfd pfd;
-  pfd.fd = listen_sd;
-  pfd.events = POLLIN | POLLERR | POLLNVAL | POLLHUP;
-  while (!done) {
-    ldout(msgr->cct, 20) << __func__ << " calling poll" << dendl;
-    int r = poll(&pfd, 1, -1);
-    if (r < 0)
-      break;
-    ldout(msgr->cct,20) << __func__ << " poll got " << r << dendl;
-
-    if (pfd.revents & (POLLERR | POLLNVAL | POLLHUP))
-      break;
-
-    ldout(msgr->cct,10) << __func__ << " pfd.revents=" << pfd.revents << dendl;
-    if (done) break;
-
-    // accept
-    entity_addr_t addr;
-    socklen_t slen = sizeof(addr.ss_addr());
-    int sd = ::accept(listen_sd, (sockaddr*)&addr.ss_addr(), &slen);
-    if (sd >= 0) {
-      errors = 0;
-      ldout(msgr->cct,10) << __func__ << "accepted incoming on sd " << sd << dendl;
-
-      msgr->add_accept(sd);
-    } else {
-      ldout(msgr->cct,0) << __func__ << " no incoming connection?  sd = " << sd
-                         << " errno " << errno << " " << cpp_strerror(errno) << dendl;
-      if (++errors > 4)
-        break;
-    }
-  }
-
-  ldout(msgr->cct,20) << __func__ << " closing" << dendl;
-  // don't close socket, in case we start up again?  blech.
-  if (listen_sd >= 0) {
-    ::close(listen_sd);
-    listen_sd = -1;
-  }
-  ldout(msgr->cct,10) << __func__ << " stopping" << dendl;
-  return 0;
-}
-
-void Processor::stop()
-{
-  done = true;
-  ldout(msgr->cct,10) << __func__ << dendl;
-
-  if (listen_sd >= 0) {
-    ::shutdown(listen_sd, SHUT_RDWR);
-  }
-
-  // wait for thread to stop before closing the socket, to avoid
-  // racing against fd re-use.
-  if (is_started()) {
-    join();
-  }
-
-  if (listen_sd >= 0) {
-    ::close(listen_sd);
-    listen_sd = -1;
-  }
-  done = false;
-}
-
-void Worker::stop()
-{
-  ldout(msgr->cct, 10) << __func__ << dendl;
-  done = true;
-  center.wakeup();
-}
-
-void *Worker::entry()
-{
-  ldout(msgr->cct, 10) << __func__ << " starting" << dendl;
-  int r;
-
-  while (!done) {
-    ldout(msgr->cct, 20) << __func__ << " calling event process" << dendl;
-
-    r = center.process_events(30000000);
-    if (r < 0) {
-      ldout(msgr->cct,20) << __func__ << " process events failed: "
-                          << cpp_strerror(errno) << dendl;
-      // TODO do something?
-    }
-  }
-
-  return 0;
-}
-
-/*******************
- * AsyncMessenger
- */
-
-AsyncMessenger::AsyncMessenger(CephContext *cct, entity_name_t name,
-                               string mname, uint64_t _nonce)
-  : SimplePolicyMessenger(cct, name,mname, _nonce),
-    conn_id(0),
-    processor(this, _nonce),
-    lock("AsyncMessenger::lock"),
-    nonce(_nonce), did_bind(false),
-    global_seq(0),
-    cluster_protocol(0), stopped(true)
-{
-  ceph_spin_init(&global_seq_lock);
-  for (int i = 0; i < cct->_conf->ms_event_op_threads; ++i) {
-    Worker *w = new Worker(this, cct);
-    workers.push_back(w);
-  }
-  local_connection = new AsyncConnection(cct, this, &workers[0]->center);
-  init_local_connection();
-}
-
-/**
- * Destroy the AsyncMessenger. Pretty simple since all the work is done
- * elsewhere.
- */
-AsyncMessenger::~AsyncMessenger()
-{
-  assert(!did_bind); // either we didn't bind or we shut down the Processor
-}
-
-void AsyncMessenger::ready()
-{
-  ldout(cct,10) << __func__ << " " << get_myaddr() << dendl;
-
-  lock.Lock();
-  processor.start();
-  lock.Unlock();
-}
-
-int AsyncMessenger::shutdown()
-{
-  ldout(cct,10) << __func__ << "shutdown " << get_myaddr() << dendl;
-  for (vector<Worker*>::iterator it = workers.begin(); it != workers.end(); ++it)
-    (*it)->stop();
-  mark_down_all();
-
-  // break ref cycles on the loopback connection
-  processor.stop();
-  local_connection->set_priv(NULL);
-  stop_cond.Signal();
-  stopped = true;
-  return 0;
-}
-
-
-int AsyncMessenger::bind(const entity_addr_t &bind_addr)
-{
-  lock.Lock();
-  if (started) {
-    ldout(cct,10) << __func__ << " already started" << dendl;
-    lock.Unlock();
-    return -1;
-  }
-  ldout(cct,10) << __func__ << " bind " << bind_addr << dendl;
-  lock.Unlock();
-
-  // bind to a socket
-  set<int> avoid_ports;
-  int r = processor.bind(bind_addr, avoid_ports);
-  if (r >= 0)
-    did_bind = true;
-  return r;
-}
-
-int AsyncMessenger::rebind(const set<int>& avoid_ports)
-{
-  ldout(cct,1) << __func__ << " rebind avoid " << avoid_ports << dendl;
-  assert(did_bind);
-  for (vector<Worker*>::iterator it = workers.begin(); it != workers.end(); ++it) {
-    (*it)->stop();
-    if ((*it)->is_started())
-      (*it)->join();
-  }
-
-  processor.stop();
-  mark_down_all();
-  return processor.rebind(avoid_ports);
-}
-
-int AsyncMessenger::start()
-{
-  lock.Lock();
-  ldout(cct,1) << __func__ << " start" << dendl;
-
-  // register at least one entity, first!
-  assert(my_inst.name.type() >= 0);
-
-  assert(!started);
-  started = true;
-  stopped = false;
-
-  if (!did_bind) {
-    my_inst.addr.nonce = nonce;
-    _init_local_connection();
-  }
-
-  for (vector<Worker*>::iterator it = workers.begin(); it != workers.end(); ++it)
-    (*it)->create();
-
-  lock.Unlock();
-  return 0;
-}
-
-void AsyncMessenger::wait()
-{
-  lock.Lock();
-  if (!started) {
-    lock.Unlock();
-    return;
-  }
-  if (!stopped)
-    stop_cond.Wait(lock);
-
-  for (vector<Worker*>::iterator it = workers.begin(); it != workers.end(); ++it)
-    (*it)->join();
-  lock.Unlock();
-
-  // done!  clean up.
-  ldout(cct,20) << __func__ << ": stopping processor thread" << dendl;
-  processor.stop();
-  did_bind = false;
-  ldout(cct,20) << __func__ << ": stopped processor thread" << dendl;
-
-  // close all pipes
-  lock.Lock();
-  {
-    ldout(cct, 10) << __func__ << ": closing pipes" << dendl;
-
-    while (!conns.empty()) {
-      AsyncConnectionRef p = conns.begin()->second;
-      _stop_conn(p);
-    }
-  }
-  lock.Unlock();
-
-  ldout(cct, 10) << __func__ << ": done." << dendl;
-  ldout(cct, 1) << __func__ << " complete." << dendl;
-  started = false;
-}
-
-AsyncConnectionRef AsyncMessenger::add_accept(int sd)
-{
-  lock.Lock();
-  Worker *w = workers[conn_id % workers.size()];
-  AsyncConnectionRef conn = new AsyncConnection(cct, this, &w->center);
-  w->center.dispatch_event_external(EventCallbackRef(new C_handle_accept(conn, sd)));
-  accepting_conns.insert(conn);
-  conn_id++;
-  lock.Unlock();
-  return conn;
-}
-
-AsyncConnectionRef AsyncMessenger::create_connect(const entity_addr_t& addr, int type)
-{
-  assert(lock.is_locked());
-  assert(addr != my_inst.addr);
-
-  ldout(cct, 10) << __func__ << " " << addr
-                 << ", creating connection and registering" << dendl;
-
-  // create connection
-  Worker *w = workers[conn_id % workers.size()];
-  AsyncConnectionRef conn = new AsyncConnection(cct, this, &w->center);
-  conn->connect(addr, type);
-  assert(!conns.count(addr));
-  conns[addr] = conn;
-  conn_id++;
-
-  return conn;
-}
-
-ConnectionRef AsyncMessenger::get_connection(const entity_inst_t& dest)
-{
-  Mutex::Locker l(lock);
-  if (my_inst.addr == dest.addr) {
-    // local
-    return local_connection;
-  }
-
-  AsyncConnectionRef conn = _lookup_conn(dest.addr);
-  if (conn) {
-    ldout(cct, 10) << __func__ << " " << dest << " existing " << conn << dendl;
-  } else {
-    conn = create_connect(dest.addr, dest.name.type());
-    ldout(cct, 10) << __func__ << " " << dest << " new " << conn << dendl;
-  }
-
-  return conn;
-}
-
-ConnectionRef AsyncMessenger::get_loopback_connection()
-{
-  return local_connection;
-}
-
-int AsyncMessenger::_send_message(Message *m, const entity_inst_t& dest)
-{
-  ldout(cct, 1) << __func__ << "--> " << dest.name << " "
-                << dest.addr << " -- " << *m << " -- ?+"
-                << m->get_data().length() << " " << m << dendl;
-
-  if (dest.addr == entity_addr_t()) {
-    ldout(cct,0) << __func__ <<  " message " << *m
-                 << " with empty dest " << dest.addr << dendl;
-    m->put();
-    return -EINVAL;
-  }
-
-  AsyncConnectionRef conn = _lookup_conn(dest.addr);
-  submit_message(m, conn, dest.addr, dest.name.type());
-  return 0;
-}
-
-void AsyncMessenger::submit_message(Message *m, AsyncConnectionRef con,
-                                    const entity_addr_t& dest_addr, int dest_type)
-{
-  if (cct->_conf->ms_dump_on_send) {
-    m->encode(-1, true);
-    ldout(cct, 0) << __func__ << "submit_message " << *m << "\n";
-    m->get_payload().hexdump(*_dout);
-    if (m->get_data().length() > 0) {
-      *_dout << " data:\n";
-      m->get_data().hexdump(*_dout);
-    }
-    *_dout << dendl;
-    m->clear_payload();
-  }
-
-  // existing connection?
-  if (con) {
-    con->send_message(m);
-    return ;
-  }
-
-  // local?
-  if (my_inst.addr == dest_addr) {
-    // local
-    ldout(cct, 20) << __func__ << " " << *m << " local" << dendl;
-    m->set_connection(local_connection.get());
-    m->set_recv_stamp(ceph_clock_now(cct));
-    ms_fast_preprocess(m);
-    if (ms_can_fast_dispatch(m)) {
-      ms_fast_dispatch(m);
-    } else {
-      if (m->get_priority() >= CEPH_MSG_PRIO_LOW) {
-        ms_fast_dispatch(m);
-      } else {
-        ms_deliver_dispatch(m);
-      }
-    }
-
-    return;
-  }
-
-  // remote, no existing pipe.
-  const Policy& policy = get_policy(dest_type);
-  if (policy.server) {
-    ldout(cct, 20) << __func__ << " " << *m << " remote, " << dest_addr
-                   << ", lossy server for target type "
-                   << ceph_entity_type_name(dest_type) << ", no session, dropping." << dendl;
-    m->put();
-  } else {
-    ldout(cct,20) << __func__ << " " << *m << " remote, " << dest_addr << ", new pipe." << dendl;
-  }
-}
-
-/**
- * If my_inst.addr doesn't have an IP set, this function
- * will fill it in from the passed addr. Otherwise it does nothing and returns.
- */
-void AsyncMessenger::set_addr_unknowns(entity_addr_t &addr)
-{
-  Mutex::Locker l(lock);
-  if (my_inst.addr.is_blank_ip()) {
-    int port = my_inst.addr.get_port();
-    my_inst.addr.addr = addr.addr;
-    my_inst.addr.set_port(port);
-    _init_local_connection();
-  }
-}
-
-int AsyncMessenger::send_keepalive(Connection *con)
-{
-  con->send_keepalive();
-  return 0;
-}
-
-void AsyncMessenger::mark_down_all()
-{
-  ldout(cct,1) << __func__ << " " << dendl;
-  lock.Lock();
-  for (set<AsyncConnectionRef>::iterator q = accepting_conns.begin();
-       q != accepting_conns.end(); ++q) {
-    AsyncConnectionRef p = *q;
-    ldout(cct, 5) << __func__ << " accepting_conn " << p << dendl;
-    p->mark_down();
-    p->get();
-    ms_deliver_handle_reset(p.get());
-  }
-  accepting_conns.clear();
-
-  while (!conns.empty()) {
-    ceph::unordered_map<entity_addr_t, AsyncConnectionRef>::iterator it = conns.begin();
-    AsyncConnectionRef p = it->second;
-    ldout(cct, 5) << __func__ << " " << it->first << " " << p << dendl;
-    conns.erase(it);
-    p->mark_down();
-    p->get();
-    ms_deliver_handle_reset(p.get());
-  }
-  lock.Unlock();
-}
-
-void AsyncMessenger::mark_down(const entity_addr_t& addr)
-{
-  lock.Lock();
-  AsyncConnectionRef p = _lookup_conn(addr);
-  if (p) {
-    ldout(cct, 1) << __func__ << " " << addr << " -- " << p << dendl;
-    _stop_conn(p);
-    p->get();
-    ms_deliver_handle_reset(p.get());
-  } else {
-    ldout(cct, 1) << __func__ << " " << addr << " -- pipe dne" << dendl;
-  }
-  lock.Unlock();
-}
-
-int AsyncMessenger::get_proto_version(int peer_type, bool connect)
-{
-  int my_type = my_inst.name.type();
-
-  // set reply protocol version
-  if (peer_type == my_type) {
-    // internal
-    return cluster_protocol;
-  } else {
-    // public
-    if (connect) {
-      switch (peer_type) {
-      case CEPH_ENTITY_TYPE_OSD: return CEPH_OSDC_PROTOCOL;
-      case CEPH_ENTITY_TYPE_MDS: return CEPH_MDSC_PROTOCOL;
-      case CEPH_ENTITY_TYPE_MON: return CEPH_MONC_PROTOCOL;
-      }
-    } else {
-      switch (my_type) {
-      case CEPH_ENTITY_TYPE_OSD: return CEPH_OSDC_PROTOCOL;
-      case CEPH_ENTITY_TYPE_MDS: return CEPH_MDSC_PROTOCOL;
-      case CEPH_ENTITY_TYPE_MON: return CEPH_MONC_PROTOCOL;
-      }
-    }
-  }
-  return 0;
-}
-
-void AsyncMessenger::learned_addr(const entity_addr_t &peer_addr_for_me)
-{
-  // be careful here: multiple threads may block here, and readers of
-  // my_inst.addr do NOT hold any lock.
-
-  // this always goes from true -> false under the protection of the
-  // mutex.  if it is already false, we need not retake the mutex at
-  // all.
-  lock.Lock();
-  entity_addr_t t = peer_addr_for_me;
-  t.set_port(my_inst.addr.get_port());
-  my_inst.addr.addr = t.addr;
-  ldout(cct, 1) << __func__ << " learned my addr " << my_inst.addr << dendl;
-  _init_local_connection();
-  lock.Unlock();
-}
diff --git a/src/msg/AsyncMessenger.h b/src/msg/AsyncMessenger.h
deleted file mode 100644 (file)
index 087f557..0000000
+++ /dev/null
@@ -1,395 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab
-#ifndef CEPH_ASYNCMESSENGER_H
-#define CEPH_ASYNCMESSENGER_H
-
-#include "include/types.h"
-#include "include/xlist.h"
-
-#include <list>
-#include <map>
-using namespace std;
-#include "include/unordered_map.h"
-#include "include/unordered_set.h"
-
-#include "common/Mutex.h"
-#include "include/atomic.h"
-#include "common/Cond.h"
-#include "common/Thread.h"
-#include "common/Throttle.h"
-
-#include "SimplePolicyMessenger.h"
-#include "include/assert.h"
-#include "DispatchQueue.h"
-#include "AsyncConnection.h"
-#include "Event.h"
-
-
-class AsyncMessenger;
-
-/**
- * If the Messenger binds to a specific address, the Processor runs
- * and listens for incoming connections.
- */
-class Processor : public Thread {
-  AsyncMessenger *msgr;
-  bool done;
-  int listen_sd;
-  uint64_t nonce;
-
-  public:
-  Processor(AsyncMessenger *r, uint64_t n) : msgr(r), done(false), listen_sd(-1), nonce(n) {}
-
-  void *entry();
-  void stop();
-  int bind(const entity_addr_t &bind_addr, const set<int>& avoid_ports);
-  int rebind(const set<int>& avoid_port);
-  int start();
-  void accept();
-};
-
-class Worker : public Thread {
-  AsyncMessenger *msgr;
-  bool done;
-
- public:
-  EventCenter center;
-  Worker(AsyncMessenger *m, CephContext *c): msgr(m), done(false), center(c) {
-    center.init(5000);
-  }
-  void *entry();
-  void stop();
-};
-
-
-/*
- * This class handles transmission and reception of messages. Generally
- * speaking, there are several major components:
- *
- * - Connection
- *    Each logical session is associated with a Connection.
- * - AsyncConnection
- *    Each network connection is handled through a AsyncConnection, which handles
- *    the input and output of each message.  There is normally a 1:1
- *    relationship between AsyncConnection and Connection, but logical sessions may
- *    get handed off between AsyncConnection when sockets reconnect or during
- *    connection races.
- * - IncomingQueue
- *    Incoming messages are associated with an IncomingQueue, and there
- *    is one such queue associated with each AsyncConnection.
- * - DispatchQueue
- *    IncomingQueues get queued in the DispatchQueue, which is responsible
- *    for doing a round-robin sweep and processing them via a worker thread.
- * - AsyncMessenger
- *    It's the exterior class passed to the external message handler and
- *    most of the API details.
- *
- * Lock ordering:
- *
- *   AsyncMessenger::lock
- *       Pipe::pipe_lock
- *           DispatchQueue::lock
- *               IncomingQueue::lock
- */
-
-class AsyncMessenger : public SimplePolicyMessenger {
-  // First we have the public Messenger interface implementation...
-public:
-  /**
-   * Initialize the AsyncMessenger!
-   *
-   * @param cct The CephContext to use
-   * @param name The name to assign ourselves
-   * _nonce A unique ID to use for this AsyncMessenger. It should not
-   * be a value that will be repeated if the daemon restarts.
-   */
-  AsyncMessenger(CephContext *cct, entity_name_t name,
-                 string mname, uint64_t _nonce);
-
-  /**
-   * Destroy the AsyncMessenger. Pretty simple since all the work is done
-   * elsewhere.
-   */
-  virtual ~AsyncMessenger();
-
-  /** @defgroup Accessors
-   * @{
-   */
-  void set_addr_unknowns(entity_addr_t& addr);
-
-  int get_dispatch_queue_len() {
-    return 0;
-  }
-
-  double get_dispatch_queue_max_age(utime_t now) {
-    return 0;
-  }
-  /** @} Accessors */
-
-  /**
-   * @defgroup Configuration functions
-   * @{
-   */
-  void set_cluster_protocol(int p) {
-    assert(!started && !did_bind);
-    cluster_protocol = p;
-  }
-
-  int bind(const entity_addr_t& bind_addr);
-  int rebind(const set<int>& avoid_ports);
-
-  /** @} Configuration functions */
-
-  /**
-   * @defgroup Startup/Shutdown
-   * @{
-   */
-  virtual int start();
-  virtual void wait();
-  virtual int shutdown();
-
-  /** @} // Startup/Shutdown */
-
-  /**
-   * @defgroup Messaging
-   * @{
-   */
-  virtual int send_message(Message *m, const entity_inst_t& dest) {
-          Mutex::Locker l(lock);
-
-    return _send_message(m, dest);
-  }
-
-  /** @} // Messaging */
-
-  /**
-   * @defgroup Connection Management
-   * @{
-   */
-  virtual ConnectionRef get_connection(const entity_inst_t& dest);
-  virtual ConnectionRef get_loopback_connection();
-  int send_keepalive(Connection *con);
-  virtual void mark_down(const entity_addr_t& addr);
-  virtual void mark_down_all();
-  /** @} // Connection Management */
-
-  /**
-   * @defgroup Inner classes
-   * @{
-   */
-
-  Connection *create_anon_connection() {
-    Mutex::Locker l(lock);
-    Worker *w = workers[conn_id % workers.size()];
-    conn_id++;
-    return new AsyncConnection(cct, this, &w->center);
-  }
-
-  /**
-   * @} // Inner classes
-   */
-
-protected:
-  /**
-   * @defgroup Messenger Interfaces
-   * @{
-   */
-  /**
-   * Start up the DispatchQueue thread once we have somebody to dispatch to.
-   */
-  virtual void ready();
-  /** @} // Messenger Interfaces */
-
-private:
-
-  /**
-   * @defgroup Utility functions
-   * @{
-   */
-
-  /**
-   * Create a connection associated with the given entity (of the given type).
-   * Initiate the connection. (This function returning does not guarantee
-   * connection success.)
-   *
-   * @param addr The address of the entity to connect to.
-   * @param type The peer type of the entity at the address.
-   * @param con An existing Connection to associate with the new connection. If
-   * NULL, it creates a new Connection.
-   * @param msg an initial message to queue on the new connection
-   *
-   * @return a pointer to the newly-created connection. Caller does not own a
-   * reference; take one if you need it.
-   */
-  AsyncConnectionRef create_connect(const entity_addr_t& addr, int type);
-
-  /**
-   * Queue up a Message for delivery to the entity specified
-   * by addr and dest_type.
-   * submit_message() is responsible for creating
-   * new AsyncConnection (and closing old ones) as necessary.
-   *
-   * @param m The Message to queue up. This function eats a reference.
-   * @param con The existing Connection to use, or NULL if you don't know of one.
-   * @param addr The address to send the Message to.
-   * @param dest_type The peer type of the address we're sending to
-   * just drop silently under failure.
-   */
-  void submit_message(Message *m, AsyncConnectionRef con,
-                      const entity_addr_t& dest_addr, int dest_type);
-
-  int _send_message(Message *m, const entity_inst_t& dest);
-
- private:
-  vector<Worker*> workers;
-  int conn_id;
-
-  Processor processor;
-  friend class Processor;
-
-  /// overall lock used for AsyncMessenger data structures
-  Mutex lock;
-  // AsyncMessenger stuff
-  /// approximately unique ID set by the Constructor for use in entity_addr_t
-  uint64_t nonce;
-
-  /**
-   *  The following aren't lock-protected since you shouldn't be able to race
-   *  the only writers.
-   */
-
-  int listen_sd;
-  /**
-   *  false; set to true if the AsyncMessenger bound to a specific address;
-   *  and set false again by Accepter::stop().
-   */
-  bool did_bind;
-  /// counter for the global seq our connection protocol uses
-  __u32 global_seq;
-  /// lock to protect the global_seq
-  ceph_spinlock_t global_seq_lock;
-
-  /**
-   * hash map of addresses to Asyncconnection
-   *
-   * NOTE: a Asyncconnection* with state CLOSED may still be in the map but is considered
-   * invalid and can be replaced by anyone holding the msgr lock
-   */
-  ceph::unordered_map<entity_addr_t, AsyncConnectionRef> conns;
-
-  /**
-   * list of connection are in teh process of accepting
-   *
-   * These are not yet in the conns map.
-   */
-  // FIXME clear up
-  set<AsyncConnectionRef> accepting_conns;
-
-  /// internal cluster protocol version, if any, for talking to entities of the same type.
-  int cluster_protocol;
-
-  Cond  stop_cond;
-  bool stopped;
-
-  AsyncConnectionRef _lookup_conn(const entity_addr_t& k) {
-    assert(lock.is_locked());
-    ceph::unordered_map<entity_addr_t, AsyncConnectionRef>::iterator p = conns.find(k);
-    if (p == conns.end())
-      return NULL;
-    return p->second;
-  }
-
-  void _stop_conn(AsyncConnectionRef c) {
-    assert(lock.is_locked());
-    if (c) {
-      c->mark_down();
-      conns.erase(c->peer_addr);
-    }
-  }
-
-  void _init_local_connection() {
-    assert(lock.is_locked());
-    local_connection->peer_addr = my_inst.addr;
-    local_connection->peer_type = my_inst.name.type();
-    ms_deliver_handle_fast_connect(local_connection.get());
-  }
-
-
-public:
-
-  /// con used for sending messages to ourselves
-  ConnectionRef local_connection;
-
-  /**
-   * @defgroup AsyncMessenger internals
-   * @{
-   */
-  /**
-   * This wraps _lookup_conn.
-   */
-  AsyncConnectionRef lookup_conn(const entity_addr_t& k) {
-    Mutex::Locker l(lock);
-    return _lookup_conn(k);
-  }
-
-  void accept_conn(AsyncConnectionRef conn) {
-    Mutex::Locker l(lock);
-    conns[conn->peer_addr] = conn;
-    accepting_conns.erase(conn);
-  }
-
-  void learned_addr(const entity_addr_t &peer_addr_for_me);
-  AsyncConnectionRef add_accept(int sd);
-
-  /**
-   * This wraps ms_deliver_get_authorizer. We use it for AsyncConnection.
-   */
-  AuthAuthorizer *get_authorizer(int peer_type, bool force_new) {
-    return ms_deliver_get_authorizer(peer_type, force_new);
-  }
-
-  /**
-   * This wraps ms_deliver_verify_authorizer; we use it for AsyncConnection.
-   */
-  bool verify_authorizer(Connection *con, int peer_type, int protocol, bufferlist& auth, bufferlist& auth_reply,
-                         bool& isvalid, CryptoKey& session_key) {
-    return ms_deliver_verify_authorizer(con, peer_type, protocol, auth,
-                                        auth_reply, isvalid, session_key);
-  }
-  /**
-   * Increment the global sequence for this AsyncMessenger and return it.
-   * This is for the connect protocol, although it doesn't hurt if somebody
-   * else calls it.
-   *
-   * @return a global sequence ID that nobody else has seen.
-   */
-  __u32 get_global_seq(__u32 old=0) {
-    ceph_spin_lock(&global_seq_lock);
-    if (old > global_seq)
-      global_seq = old;
-    __u32 ret = ++global_seq;
-    ceph_spin_unlock(&global_seq_lock);
-    return ret;
-  }
-  /**
-   * Get the protocol version we support for the given peer type: either
-   * a peer protocol (if it matches our own), the protocol version for the
-   * peer (if we're connecting), or our protocol version (if we're accepting).
-   */
-  int get_proto_version(int peer_type, bool connect);
-
-  /**
-   * Fill in the address and peer type for the local connection, which
-   * is used for delivering messages back to ourself.
-   */
-  void init_local_connection() {
-    Mutex::Locker l(lock);
-    _init_local_connection();
-  }
-
-  /**
-   * @} // AsyncMessenger Internals
-   */
-} ;
-
-#endif /* CEPH_SIMPLEMESSENGER_H */
diff --git a/src/msg/Event.cc b/src/msg/Event.cc
deleted file mode 100644 (file)
index 2aa99bc..0000000
+++ /dev/null
@@ -1,311 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab
-#include <time.h>
-
-#include "common/errno.h"
-#include "Event.h"
-
-#ifdef HAVE_EPOLL
-#include "EventEpoll.h"
-#else
-#ifdef HAVE_KQUEUE
-#include "EventKqueue.h"
-#else
-#include "EventSelect.h"
-#endif
-#endif
-
-#define dout_subsys ceph_subsys_ms
-
-#undef dout_prefix
-#define dout_prefix *_dout << "Event "
-
-class C_handle_notify : public EventCallback {
- public:
-  C_handle_notify() {}
-  void do_request(int fd_or_id) {
-  }
-};
-
-int EventCenter::init(int n)
-{
-  // can't init multi times
-  assert(nevent == 0);
-#ifdef HAVE_EPOLL
-  driver = new EpollDriver(cct);
-#else
-#ifdef HAVE_KQUEUE
-  driver = new KqueueDriver(cct);
-#else
-  driver = new SelectDriver(cct);
-#endif
-#endif
-
-  if (!driver) {
-    lderr(cct) << __func__ << " failed to create event driver " << dendl;
-    return -1;
-  }
-
-  int r = driver->init(n);
-  if (r < 0) {
-    lderr(cct) << __func__ << " failed to init event driver." << dendl;
-    return r;
-  }
-
-  int fds[2];
-  if (pipe(fds) < 0) {
-    lderr(cct) << __func__ << " can't create notify pipe" << dendl;
-    return -1;
-  }
-
-  notify_receive_fd = fds[0];
-  notify_send_fd = fds[1];
-  file_events = (FileEvent *)malloc(sizeof(FileEvent)*n);
-  memset(file_events, 0, sizeof(FileEvent)*n);
-
-  nevent = n;
-  create_file_event(notify_receive_fd, EVENT_READABLE, EventCallbackRef(new C_handle_notify()));
-  return 0;
-}
-
-EventCenter::~EventCenter()
-{
-  if (driver)
-    delete driver;
-
-  if (notify_receive_fd > 0)
-    ::close(notify_receive_fd);
-  if (notify_send_fd > 0)
-    ::close(notify_send_fd);
-}
-
-int EventCenter::create_file_event(int fd, int mask, EventCallbackRef ctxt)
-{
-  int r;
-  if (fd > nevent) {
-    int new_size = nevent << 2;
-    while (fd > new_size)
-      new_size <<= 2;
-    ldout(cct, 10) << __func__ << " event count exceed " << nevent << ", expand to " << new_size << dendl;
-    r = driver->resize_events(new_size);
-    if (r < 0) {
-      lderr(cct) << __func__ << " event count is exceed." << dendl;
-      return -ERANGE;
-    }
-    FileEvent *new_events = (FileEvent *)realloc(file_events, sizeof(FileEvent)*new_size);
-    if (!new_events) {
-      lderr(cct) << __func__ << " failed to realloc file_events" << cpp_strerror(errno) << dendl;
-      return -errno;
-    }
-    file_events = new_events;
-    nevent = new_size;
-  }
-
-  EventCenter::FileEvent *event = _get_file_event(fd);
-
-  r = driver->add_event(fd, event->mask, mask);
-  if (r < 0)
-    return r;
-
-  event->mask |= mask;
-  if (mask & EVENT_READABLE) {
-    event->read_cb = ctxt;
-  }
-  if (mask & EVENT_WRITABLE) {
-    event->write_cb = ctxt;
-  }
-  ldout(cct, 10) << __func__ << " create event fd=" << fd << " mask=" << mask
-                 << " now mask is " << event->mask << dendl;
-  return 0;
-}
-
-void EventCenter::delete_file_event(int fd, int mask)
-{
-  EventCenter::FileEvent *event = _get_file_event(fd);
-  if (!event->mask)
-    return ;
-
-  driver->del_event(fd, event->mask, mask);
-
-  if (mask & EVENT_READABLE && event->read_cb) {
-    event->read_cb.reset();
-  }
-  if (mask & EVENT_WRITABLE && event->write_cb) {
-    event->write_cb.reset();
-  }
-
-  event->mask = event->mask & (~mask);
-  ldout(cct, 10) << __func__ << " delete fd=" << fd << " mask=" << mask
-                 << " now mask is " << event->mask << dendl;
-}
-
-uint64_t EventCenter::create_time_event(uint64_t microseconds, EventCallbackRef ctxt)
-{
-  uint64_t id = time_event_next_id++;
-
-  ldout(cct, 10) << __func__ << " id=" << id << " trigger after " << microseconds << "us"<< dendl;
-  EventCenter::TimeEvent event;
-  utime_t expire;
-  struct timeval tv;
-
-  if (microseconds < 5) {
-    tv.tv_sec = 0;
-    tv.tv_usec = microseconds;
-  } else {
-    expire = ceph_clock_now(cct);
-    expire.copy_to_timeval(&tv);
-    tv.tv_sec += microseconds / 1000000;
-    tv.tv_usec += microseconds % 1000000;
-  }
-  expire.set_from_timeval(&tv);
-
-  event.id = id;
-  event.time_cb = ctxt;
-  time_events[expire].push_back(event);
-
-  return id;
-}
-
-void EventCenter::wakeup()
-{
-  ldout(cct, 1) << __func__ << dendl;
-  char buf[1];
-  buf[0] = 'c';
-  // wake up "event_wait"
-  int n = write(notify_send_fd, buf, 1);
-  // FIXME ?
-  assert(n == 1);
-}
-
-int EventCenter::process_time_events()
-{
-  int processed = 0;
-  time_t now = time(NULL);
-  utime_t cur = ceph_clock_now(cct);
-  ldout(cct, 10) << __func__ << " cur time is " << cur << dendl;
-
-  /* If the system clock is moved to the future, and then set back to the
-   * right value, time events may be delayed in a random way. Often this
-   * means that scheduled operations will not be performed soon enough.
-   *
-   * Here we try to detect system clock skews, and force all the time
-   * events to be processed ASAP when this happens: the idea is that
-   * processing events earlier is less dangerous than delaying them
-   * indefinitely, and practice suggests it is. */
-  if (now < last_time) {
-    map<utime_t, list<TimeEvent> > changed;
-    for (map<utime_t, list<TimeEvent> >::iterator it = time_events.begin();
-         it != time_events.end(); ++it) {
-      changed[utime_t()].swap(it->second);
-    }
-    time_events.swap(changed);
-  }
-  last_time = now;
-
-  map<utime_t, list<TimeEvent> >::iterator prev;
-  for (map<utime_t, list<TimeEvent> >::iterator it = time_events.begin();
-       it != time_events.end(); ) {
-    prev = it;
-    if (cur >= it->first) {
-      for (list<TimeEvent>::iterator j = it->second.begin();
-           j != it->second.end(); ++j) {
-        ldout(cct, 10) << __func__ << " process time event: id=" << j->id << " time is "
-                      << it->first << dendl;
-        j->time_cb->do_request(j->id);
-      }
-      processed++;
-      ++it;
-      time_events.erase(prev);
-    } else {
-      break;
-    }
-  }
-
-  return processed;
-}
-
-int EventCenter::process_events(int timeout_microseconds)
-{
-  struct timeval tv;
-  int numevents;
-  bool trigger_time = false;
-
-  utime_t period, shortest, now = ceph_clock_now(cct);
-  now.copy_to_timeval(&tv);
-  if (timeout_microseconds > 0) {
-    tv.tv_sec += timeout_microseconds / 1000000;
-    tv.tv_usec += timeout_microseconds % 1000000;
-  }
-  shortest.set_from_timeval(&tv);
-
-  {
-    map<utime_t, list<TimeEvent> >::iterator it = time_events.begin();
-    if (it != time_events.end() && shortest >= it->first) {
-      ldout(cct, 10) << __func__ << " shortest is " << shortest << " it->first is " << it->first << dendl;
-      shortest = it->first;
-      trigger_time = true;
-      if (shortest > now) {
-        period = now - shortest;
-        period.copy_to_timeval(&tv);
-      } else {
-        tv.tv_sec = 0;
-        tv.tv_usec = 0;
-      }
-    } else {
-      tv.tv_sec = timeout_microseconds / 1000000;
-      tv.tv_usec = timeout_microseconds % 1000000;
-    }
-  }
-
-  ldout(cct, 10) << __func__ << " wait second " << tv.tv_sec << " usec " << tv.tv_usec << dendl;
-  vector<FiredFileEvent> fired_events;
-  numevents = driver->event_wait(fired_events, &tv);
-  for (int j = 0; j < numevents; j++) {
-    int rfired = 0;
-    FileEvent *event = _get_file_event(fired_events[j].fd);
-    if (!event)
-      continue;
-
-    /* note the event->mask & mask & ... code: maybe an already processed
-    * event removed an element that fired and we still didn't
-    * processed, so we check if the event is still valid. */
-    if (event->mask & fired_events[j].mask & EVENT_READABLE) {
-      rfired = 1;
-      event->read_cb->do_request(fired_events[j].fd);
-    }
-    event = _get_file_event(fired_events[j].fd);
-    if (!event)
-      continue;
-
-    if (event->mask & fired_events[j].mask & EVENT_WRITABLE) {
-      if (!rfired || event->read_cb != event->write_cb)
-        event->write_cb->do_request(fired_events[j].fd);
-    }
-
-    ldout(cct, 20) << __func__ << " event_wq process is " << fired_events[j].fd << " mask is " << fired_events[j].mask << dendl;
-  }
-
-  if (trigger_time)
-    numevents += process_time_events();
-
-  {
-    lock.Lock();
-    while (!external_events.empty()) {
-      EventCallbackRef e = external_events.front();
-      external_events.pop_front();
-      lock.Unlock();
-      e->do_request(0);
-      lock.Lock();
-    }
-    lock.Unlock();
-  }
-  return numevents;
-}
-
-void EventCenter::dispatch_event_external(EventCallbackRef e)
-{
-  lock.Lock();
-  external_events.push_back(e);
-  lock.Unlock();
-  wakeup();
-}
diff --git a/src/msg/Event.h b/src/msg/Event.h
deleted file mode 100644 (file)
index 3b3e66b..0000000
+++ /dev/null
@@ -1,117 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab
-#ifndef CEPH_MSG_EVENT_H
-#define CEPH_MSG_EVENT_H
-
-#ifdef __APPLE__
-#include <AvailabilityMacros.h>
-#endif
-
-// We use epoll, kqueue, evport, select in descending order by performance.
-#if defined(__linux__)
-#define HAVE_EPOLL 1
-#endif
-
-#if (defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined (__NetBSD__)
-#define HAVE_KQUEUE 1
-#endif
-
-#ifdef __sun
-#include <sys/feature_tests.h>
-#ifdef _DTRACE_VERSION
-#define HAVE_EVPORT 1
-#endif
-#endif
-
-#include "include/Context.h"
-#include "include/unordered_map.h"
-#include "common/WorkQueue.h"
-
-#define EVENT_NONE 0
-#define EVENT_READABLE 1
-#define EVENT_WRITABLE 2
-
-class EventCenter;
-
-class EventCallback {
-
- public:
-  virtual void do_request(int fd_or_id) = 0;
-  virtual ~EventCallback() {}       // we want a virtual destructor!!!
-};
-
-typedef ceph::shared_ptr<EventCallback> EventCallbackRef;
-
-struct FiredFileEvent {
-  int fd;
-  int mask;
-};
-
-class EventDriver {
- public:
-  virtual ~EventDriver() {}       // we want a virtual destructor!!!
-  virtual int init(int nevent) = 0;
-  virtual int add_event(int fd, int cur_mask, int mask) = 0;
-  virtual void del_event(int fd, int cur_mask, int del_mask) = 0;
-  virtual int event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tp) = 0;
-  virtual int resize_events(int newsize) = 0;
-};
-
-class EventCenter {
-  struct FileEvent {
-    int mask;
-    EventCallbackRef read_cb;
-    EventCallbackRef write_cb;
-    FileEvent(): mask(0) {}
-  };
-
-  struct TimeEvent {
-    uint64_t id;
-    EventCallbackRef time_cb;
-
-    TimeEvent(): id(0) {}
-  };
-
-  CephContext *cct;
-  int nevent;
-  // Used only to external event
-  Mutex lock;
-  deque<EventCallbackRef> external_events;
-  FileEvent *file_events;
-  EventDriver *driver;
-  map<utime_t, list<TimeEvent> > time_events;
-  uint64_t time_event_next_id;
-  time_t last_time; // last time process time event
-  int notify_receive_fd;
-  int notify_send_fd;
-
-  int process_time_events();
-  FileEvent *_get_file_event(int fd) {
-    FileEvent *p = &file_events[fd];
-    if (!p->mask)
-      new(p) FileEvent();
-    return p;
-  }
-
- public:
-  EventCenter(CephContext *c):
-    cct(c), nevent(0),
-    lock("AsyncMessenger::lock"),
-    driver(NULL), time_event_next_id(0),
-    notify_receive_fd(-1), notify_send_fd(-1) {
-    last_time = time(NULL);
-  }
-  ~EventCenter();
-  int init(int nevent);
-  // Used by internal thread
-  int create_file_event(int fd, int mask, EventCallbackRef ctxt);
-  uint64_t create_time_event(uint64_t milliseconds, EventCallbackRef ctxt);
-  void delete_file_event(int fd, int mask);
-  int process_events(int timeout_microseconds);
-  void wakeup();
-
-  // Used by external thread
-  void dispatch_event_external(EventCallbackRef e);
-};
-
-#endif
diff --git a/src/msg/EventEpoll.cc b/src/msg/EventEpoll.cc
deleted file mode 100644 (file)
index 1b7aa18..0000000
+++ /dev/null
@@ -1,114 +0,0 @@
-#include "common/errno.h"
-#include "EventEpoll.h"
-
-#define dout_subsys ceph_subsys_ms
-
-#undef dout_prefix
-#define dout_prefix *_dout << "EpollDriver."
-
-int EpollDriver::init(int nevent)
-{
-  events = (struct epoll_event*)malloc(sizeof(struct epoll_event)*nevent);
-  if (!events) {
-    lderr(cct) << __func__ << " unable to malloc memory: "
-                           << cpp_strerror(errno) << dendl;
-    return -errno;
-  }
-  memset(events, 0, sizeof(struct epoll_event)*nevent);
-
-  epfd = epoll_create(1024); /* 1024 is just an hint for the kernel */
-  if (epfd == -1) {
-    lderr(cct) << __func__ << " unable to do epoll_create: "
-                       << cpp_strerror(errno) << dendl;
-    return -errno;
-  }
-
-  size = nevent;
-
-  return 0;
-}
-
-int EpollDriver::add_event(int fd, int cur_mask, int add_mask)
-{
-  struct epoll_event ee;
-  /* If the fd was already monitored for some event, we need a MOD
-   * operation. Otherwise we need an ADD operation. */
-  int op;
-  op = cur_mask == EVENT_NONE ? EPOLL_CTL_ADD: EPOLL_CTL_MOD;
-
-  ee.events = EPOLLET;
-  add_mask |= cur_mask; /* Merge old events */
-  if (add_mask & EVENT_READABLE)
-    ee.events |= EPOLLIN;
-  if (add_mask & EVENT_WRITABLE)
-    ee.events |= EPOLLOUT;
-  ee.data.u64 = 0; /* avoid valgrind warning */
-  ee.data.fd = fd;
-  if (epoll_ctl(epfd, op, fd, &ee) == -1) {
-    lderr(cct) << __func__ << " unable to add event: "
-                       << cpp_strerror(errno) << dendl;
-    return -errno;
-  }
-
-  ldout(cct, 10) << __func__ << " add event to fd=" << fd << " mask=" << add_mask
-                 << dendl;
-  return 0;
-}
-
-void EpollDriver::del_event(int fd, int cur_mask, int delmask)
-{
-  struct epoll_event ee;
-  int mask = cur_mask & (~delmask);
-
-  ee.events = 0;
-  if (mask & EVENT_READABLE) ee.events |= EPOLLIN;
-  if (mask & EVENT_WRITABLE) ee.events |= EPOLLOUT;
-  ee.data.u64 = 0; /* avoid valgrind warning */
-  ee.data.fd = fd;
-  if (mask != EVENT_NONE) {
-    if (epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ee) < 0) {
-      lderr(cct) << __func__ << " epoll_ctl: modify fd=" << fd << " mask=" << mask
-                 << " failed." << cpp_strerror(errno) << dendl;
-    }
-  } else {
-    /* Note, Kernel < 2.6.9 requires a non null event pointer even for
-     * EPOLL_CTL_DEL. */
-    if (epoll_ctl(epfd, EPOLL_CTL_DEL, fd, &ee) < 0) {
-      lderr(cct) << __func__ << " epoll_ctl: delete fd=" << fd
-                 << " failed." << cpp_strerror(errno) << dendl;
-    }
-  }
-  ldout(cct, 10) << __func__ << " del event fd=" << fd << " cur mask=" << mask
-                 << dendl;
-}
-
-int EpollDriver::resize_events(int newsize)
-{
-  return 0;
-}
-
-int EpollDriver::event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tvp)
-{
-  int retval, numevents = 0;
-
-  retval = epoll_wait(epfd, events, size,
-                      tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1);
-  if (retval > 0) {
-    int j;
-
-    numevents = retval;
-    fired_events.resize(numevents);
-    for (j = 0; j < numevents; j++) {
-      int mask = 0;
-      struct epoll_event *e = events + j;
-
-      if (e->events & EPOLLIN) mask |= EVENT_READABLE;
-      if (e->events & EPOLLOUT) mask |= EVENT_WRITABLE;
-      if (e->events & EPOLLERR) mask |= EVENT_WRITABLE;
-      if (e->events & EPOLLHUP) mask |= EVENT_WRITABLE;
-      fired_events[j].fd = e->data.fd;
-      fired_events[j].mask = mask;
-    }
-  }
-  return numevents;
-}
diff --git a/src/msg/EventEpoll.h b/src/msg/EventEpoll.h
deleted file mode 100644 (file)
index 735acca..0000000
+++ /dev/null
@@ -1,34 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab
-#ifndef CEPH_MSG_EVENTEPOLL_H
-#define CEPH_MSG_EVENTEPOLL_H
-
-#include <unistd.h>
-#include <sys/epoll.h>
-
-#include "Event.h"
-
-class EpollDriver : public EventDriver {
-  int epfd;
-  struct epoll_event *events;
-  CephContext *cct;
-  int size;
-
- public:
-  EpollDriver(CephContext *c): epfd(-1), events(NULL), cct(c) {}
-  virtual ~EpollDriver() {
-    if (epfd != -1)
-      close(epfd);
-
-    if (events)
-      free(events);
-  }
-
-  int init(int nevent);
-  int add_event(int fd, int cur_mask, int add_mask);
-  void del_event(int fd, int cur_mask, int del_mask);
-  int resize_events(int newsize);
-  int event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tp);
-};
-
-#endif
diff --git a/src/msg/async/AsyncConnection.cc b/src/msg/async/AsyncConnection.cc
new file mode 100644 (file)
index 0000000..31a9948
--- /dev/null
@@ -0,0 +1,2026 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#include "include/Context.h"
+#include "common/errno.h"
+#include "AsyncMessenger.h"
+#include "AsyncConnection.h"
+
+// Constant to limit starting sequence number to 2^31.  Nothing special about it, just a big number.  PLR
+#define SEQ_MASK  0x7fffffff 
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix _conn_prefix(_dout)
+ostream& AsyncConnection::_conn_prefix(std::ostream *_dout) {
+  return *_dout << "-- " << async_msgr->get_myinst().addr << " >> " << peer_addr << " conn(" << this
+        << " sd=" << sd << " :" << port
+        << " s=" << get_state_name(state)
+        << " pgs=" << peer_global_seq
+        << " cs=" << connect_seq
+        << " l=" << policy.lossy
+        << ").";
+}
+
+class C_handle_read : public EventCallback {
+  AsyncConnectionRef conn;
+
+ public:
+  C_handle_read(AsyncConnectionRef c): conn(c) {}
+  void do_request(int fd) {
+    conn->process();
+  }
+};
+
+class C_handle_write : public EventCallback {
+  AsyncConnectionRef conn;
+
+ public:
+  C_handle_write(AsyncConnectionRef c): conn(c) {}
+  void do_request(int fd) {
+    conn->handle_write();
+  }
+};
+
+class C_handle_reset : public EventCallback {
+  AsyncMessenger *msgr;
+  AsyncConnectionRef conn;
+
+ public:
+  C_handle_reset(AsyncMessenger *m, AsyncConnectionRef c): msgr(m), conn(c) {}
+  void do_request(int id) {
+    msgr->ms_deliver_handle_reset(conn.get());
+  }
+};
+
+class C_handle_remote_reset : public EventCallback {
+  AsyncMessenger *msgr;
+  AsyncConnectionRef conn;
+
+ public:
+  C_handle_remote_reset(AsyncMessenger *m, AsyncConnectionRef c): msgr(m), conn(c) {}
+  void do_request(int id) {
+    msgr->ms_deliver_handle_remote_reset(conn.get());
+  }
+};
+
+class C_handle_dispatch : public EventCallback {
+  AsyncMessenger *msgr;
+  Message *m;
+
+ public:
+  C_handle_dispatch(AsyncMessenger *msgr, Message *m): msgr(msgr), m(m) {}
+  void do_request(int id) {
+    //msgr->ms_fast_preprocess(m);
+    //if (msgr->ms_can_fast_dispatch(m)) {
+    //  msgr->ms_fast_dispatch(m);
+    //} else {
+      msgr->ms_deliver_dispatch(m);
+    //}
+  }
+};
+
+
+static void alloc_aligned_buffer(bufferlist& data, unsigned len, unsigned off)
+{
+  // create a buffer to read into that matches the data alignment
+  unsigned left = len;
+  if (off & ~CEPH_PAGE_MASK) {
+    // head
+    unsigned head = 0;
+    head = MIN(CEPH_PAGE_SIZE - (off & ~CEPH_PAGE_MASK), left);
+    bufferptr bp = buffer::create(head);
+    data.push_back(bp);
+    left -= head;
+  }
+  unsigned middle = left & CEPH_PAGE_MASK;
+  if (middle > 0) {
+    bufferptr bp = buffer::create_page_aligned(middle);
+    data.push_back(bp);
+    left -= middle;
+  }
+  if (left) {
+    bufferptr bp = buffer::create(left);
+    data.push_back(bp);
+  }
+}
+
+AsyncConnection::AsyncConnection(CephContext *cct, AsyncMessenger *m, EventCenter *c)
+  : Connection(cct, m), async_msgr(m), global_seq(0), connect_seq(0), out_seq(0), in_seq(0), in_seq_acked(0),
+    state(STATE_NONE), state_after_send(0), sd(-1),
+    lock("AsyncConnection::lock"), open_write(false), keepalive(false),
+    got_bad_auth(false), authorizer(NULL),
+    state_buffer(4096), state_offset(0), net(cct), center(c)
+{
+  read_handler.reset(new C_handle_read(this));
+  write_handler.reset(new C_handle_write(this));
+  reset_handler.reset(new C_handle_reset(async_msgr, this));
+  remote_reset_handler.reset(new C_handle_remote_reset(async_msgr, this));
+  memset(msgvec, 0, sizeof(msgvec));
+}
+
+AsyncConnection::~AsyncConnection()
+{
+  assert(!authorizer);
+}
+
+/* return -1 means `fd` occurs error or closed, it should be closed
+ * return 0 means EAGAIN or EINTR */
+int AsyncConnection::read_bulk(int fd, char *buf, int len)
+{
+  int nread = ::read(fd, buf, len);
+  if (nread == -1) {
+    if (errno == EAGAIN || errno == EINTR) {
+      nread = 0;
+    } else {
+      ldout(async_msgr->cct, 1) << __func__ << " Reading from fd=" << fd
+                          << " : "<< strerror(errno) << dendl;
+      return -1;
+    }
+  } else if (nread == 0) {
+    ldout(async_msgr->cct, 1) << __func__ << " Peer close file descriptor "
+                              << fd << dendl;
+    return -1;
+  }
+  return nread;
+}
+
+// return the length of msg needed to be sent,
+// < 0 means error occured
+int AsyncConnection::do_sendmsg(struct msghdr &msg, int len, bool more)
+{
+  while (len > 0) {
+    int r = ::sendmsg(sd, &msg, MSG_NOSIGNAL | (more ? MSG_MORE : 0));
+
+    if (r == 0) {
+      ldout(async_msgr->cct, 10) << __func__ << " sendmsg got r==0!" << dendl;
+    } else if (r < 0) {
+      if (errno == EAGAIN || errno == EINTR) {
+        r = len;
+      } else {
+        ldout(async_msgr->cct, 1) << __func__ << " sendmsg error: " << cpp_strerror(errno) << dendl;
+      }
+
+      return r;
+    }
+
+    len -= r;
+    if (len == 0) break;
+
+    // hrmph.  trim r bytes off the front of our message.
+    ldout(async_msgr->cct, 20) << __func__ << " short write did " << r << ", still have " << len << dendl;
+    while (r > 0) {
+      if (msg.msg_iov[0].iov_len <= (size_t)r) {
+        // lose this whole item
+        r -= msg.msg_iov[0].iov_len;
+        msg.msg_iov++;
+        msg.msg_iovlen--;
+      } else {
+        msg.msg_iov[0].iov_base = (char *)msg.msg_iov[0].iov_base + r;
+        msg.msg_iov[0].iov_len -= r;
+        break;
+      }
+    }
+  }
+  return 0;
+}
+
+// return the remaining bytes, it may larger than the length of ptr
+// else return < 0 means error
+int AsyncConnection::_try_send(bufferlist send_bl, bool send)
+{
+  if (send_bl.length()) {
+    if (outcoming_bl.length())
+      outcoming_bl.claim_append(send_bl);
+    else
+      outcoming_bl.swap(send_bl);
+  }
+
+  if (!send)
+    return 0;
+
+  // standby?
+  if (is_queued() && state == STATE_STANDBY && !policy.server) {
+    assert(!outcoming_bl.length());
+    connect_seq++;
+    state = STATE_CONNECTING;
+    center->create_time_event(0, read_handler);
+    return 0;
+  }
+
+  if (state == STATE_STANDBY) {
+    ldout(async_msgr->cct, 1) << __func__ << " connection is standby" << dendl;
+    return 0;
+  }
+  if (state == STATE_CLOSED) {
+    ldout(async_msgr->cct, 1) << __func__ << " connection is closed" << dendl;
+    return -EINTR;
+  }
+
+  int r = 0;
+  uint64_t sended = 0;
+  list<bufferptr>::const_iterator pb = outcoming_bl.buffers().begin();
+  while (outcoming_bl.length() > sended) {
+    struct msghdr msg;
+    int size = MIN(outcoming_bl.buffers().size(), IOV_LEN);
+    memset(&msg, 0, sizeof(msg));
+    msg.msg_iovlen = 0;
+    msg.msg_iov = msgvec;
+    int msglen = 0;
+    while (size > 0) {
+      msgvec[msg.msg_iovlen].iov_base = (void*)(pb->c_str());
+      msgvec[msg.msg_iovlen].iov_len = pb->length();
+      msg.msg_iovlen++;
+      msglen += pb->length();
+      pb++;
+      size--;
+    }
+
+    r = do_sendmsg(msg, msglen, false);
+    if (r < 0)
+      return r;
+
+    // "r" is the remaining length
+    sended += msglen - r;
+    if (r > 0) {
+      ldout(async_msgr->cct, 5) << __func__ << " remaining " << r
+                          << " needed to be sent, creating event for writing"
+                          << dendl;
+      break;
+    }
+    // only "r" == 0 continue
+  }
+
+  // trim already sent for outcoming_bl
+  if (sended) {
+    bufferlist bl;
+    if (sended < outcoming_bl.length())
+      outcoming_bl.splice(sended, outcoming_bl.length()-sended, &bl);
+    bl.swap(outcoming_bl);
+  }
+
+  ldout(async_msgr->cct, 20) << __func__ << " send bytes " << sended
+                             << " remaining bytes " << outcoming_bl.length() << dendl;
+
+  if (!open_write && is_queued()) {
+    center->create_file_event(sd, EVENT_WRITABLE, write_handler);
+    open_write = true;
+  }
+
+  if (open_write && !is_queued()) {
+    center->delete_file_event(sd, EVENT_WRITABLE);
+    open_write = false;
+  }
+
+  return outcoming_bl.length();
+}
+
+// Because this func will be called multi times to populate
+// the needed buffer, so the passed in bufferptr must be the same.
+// Normally, only "read_message" will pass existing bufferptr in
+//
+// return the remaining bytes, 0 means this buffer is finished
+// else return < 0 means error
+int AsyncConnection::read_until(uint64_t needed, bufferptr &p)
+{
+  assert(needed);
+  int offset = state_offset;
+  int left = needed - offset;
+  int r;
+  do {
+    r = read_bulk(sd, p.c_str()+offset, left);
+    if (r < 0) {
+      ldout(async_msgr->cct, 1) << __func__ << " read failed, state is " << get_state_name(state) << dendl;
+      return -1;
+    } else if (r == left) {
+      state_offset = 0;
+      return 0;
+    }
+    left -= r;
+    offset += r;
+  } while (r > 0);
+
+  state_offset = offset;
+  ldout(async_msgr->cct, 20) << __func__ << " read " << r << " bytes, state is "
+                      << get_state_name(state) << dendl;
+  return needed - offset;
+}
+
+void AsyncConnection::process()
+{
+  int r = 0;
+  int prev_state = state;
+  Mutex::Locker l(lock);
+  do {
+    ldout(async_msgr->cct, 20) << __func__ << " state is " << get_state_name(state)
+                               << ", prev state is " << get_state_name(prev_state) << dendl;
+    prev_state = state;
+    switch (state) {
+      case STATE_OPEN:
+        {
+          char tag = -1;
+          r = read_bulk(sd, &tag, sizeof(tag));
+          if (r < 0) {
+            ldout(async_msgr->cct, 1) << __func__ << " read tag failed, state is "
+                                      << get_state_name(state) << dendl;
+            goto fail;
+          } else if (r == 0) {
+            break;
+          }
+          assert(r == 1);
+
+          if (tag == CEPH_MSGR_TAG_KEEPALIVE) {
+            ldout(async_msgr->cct, 20) << __func__ << " got KEEPALIVE" << dendl;
+          } else if (tag == CEPH_MSGR_TAG_KEEPALIVE2) {
+            state = STATE_OPEN_KEEPALIVE2;
+          } else if (tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
+            state = STATE_OPEN_KEEPALIVE2_ACK;
+          } else if (tag == CEPH_MSGR_TAG_ACK) {
+            state = STATE_OPEN_TAG_ACK;
+          } else if (tag == CEPH_MSGR_TAG_MSG) {
+            state = STATE_OPEN_MESSAGE_HEADER;
+          } else if (tag == CEPH_MSGR_TAG_CLOSE) {
+            state = STATE_OPEN_TAG_CLOSE;
+          } else {
+            ldout(async_msgr->cct, 0) << __func__ << " bad tag " << (int)tag << dendl;
+            goto fail;
+          }
+
+          break;
+        }
+
+      case STATE_OPEN_KEEPALIVE2:
+        {
+          ceph_timespec *t;
+          r = read_until(sizeof(*t), state_buffer);
+          if (r < 0) {
+            ldout(async_msgr->cct, 1) << __func__ << " read keeplive timespec failed" << dendl;
+            goto fail;
+          } else if (r > 0) {
+            break;
+          }
+
+          ldout(async_msgr->cct, 30) << __func__ << " got KEEPALIVE2 tag ..." << dendl;
+          t = (ceph_timespec*)(state_buffer.c_str());
+          utime_t kp_t = utime_t(*t);
+          _send_keepalive_or_ack(true, &kp_t);
+          ldout(async_msgr->cct, 20) << __func__ << " got KEEPALIVE2 " << kp_t << dendl;
+          state = STATE_OPEN;
+          break;
+        }
+
+      case STATE_OPEN_KEEPALIVE2_ACK:
+        {
+          ceph_timespec *t;
+          r = read_until(sizeof(*t), state_buffer);
+          if (r < 0) {
+            ldout(async_msgr->cct, 1) << __func__ << " read keeplive timespec failed" << dendl;
+            goto fail;
+          } else if (r > 0) {
+            break;
+          }
+
+          t = (ceph_timespec*)(state_buffer.c_str());
+          last_keepalive_ack = utime_t(*t);
+          ldout(async_msgr->cct, 20) << __func__ << " got KEEPALIVE_ACK" << dendl;
+          state = STATE_OPEN;
+          break;
+        }
+
+      case STATE_OPEN_TAG_ACK:
+        {
+          ceph_le64 *seq;
+          r = read_until(sizeof(seq), state_buffer);
+          if (r < 0) {
+            ldout(async_msgr->cct, 1) << __func__ << " read ack seq failed" << dendl;
+            goto fail;
+          } else if (r > 0) {
+            break;
+          }
+
+          seq = (ceph_le64*)(state_buffer.c_str());
+          ldout(async_msgr->cct, 20) << __func__ << " got ACK" << dendl;
+          handle_ack(*seq);
+          state = STATE_OPEN;
+          break;
+        }
+
+      case STATE_OPEN_MESSAGE_HEADER:
+        {
+          ldout(async_msgr->cct, 20) << __func__ << " begin MSG" << dendl;
+          ceph_msg_header header;
+          ceph_msg_header_old oldheader;
+          __u32 header_crc;
+          int len;
+          if (has_feature(CEPH_FEATURE_NOSRCADDR))
+            len = sizeof(header);
+          else
+            len = sizeof(oldheader);
+
+          r = read_until(len, state_buffer);
+          if (r < 0) {
+            ldout(async_msgr->cct, 1) << __func__ << " read message header failed" << dendl;
+            goto fail;
+          } else if (r > 0) {
+            break;
+          }
+
+          ldout(async_msgr->cct, 20) << __func__ << " got MSG header" << dendl;
+
+          if (has_feature(CEPH_FEATURE_NOSRCADDR)) {
+            header = *((ceph_msg_header*)state_buffer.c_str());
+            header_crc = ceph_crc32c(0, (unsigned char *)&header,
+                                    sizeof(header) - sizeof(header.crc));
+          } else {
+            oldheader = *((ceph_msg_header_old*)state_buffer.c_str());
+            // this is fugly
+            memcpy(&header, &oldheader, sizeof(header));
+            header.src = oldheader.src.name;
+            header.reserved = oldheader.reserved;
+            header.crc = oldheader.crc;
+            header_crc = ceph_crc32c(0, (unsigned char *)&oldheader, sizeof(oldheader) - sizeof(oldheader.crc));
+          }
+
+          ldout(async_msgr->cct, 20) << __func__ << " got envelope type=" << header.type
+                              << " src " << entity_name_t(header.src)
+                              << " front=" << header.front_len
+                              << " data=" << header.data_len
+                              << " off " << header.data_off << dendl;
+
+          // verify header crc
+          if (header_crc != header.crc) {
+            ldout(async_msgr->cct,0) << __func__ << "reader got bad header crc "
+                              << header_crc << " != " << header.crc << dendl;
+            goto fail;
+          }
+
+          // Reset state
+          data_buf.clear();
+          front.clear();
+          middle.clear();
+          data.clear();
+          recv_stamp = ceph_clock_now(async_msgr->cct);
+          current_header = header;
+          state = STATE_OPEN_MESSAGE_THROTTLE_MESSAGE;
+          break;
+        }
+
+      case STATE_OPEN_MESSAGE_THROTTLE_MESSAGE:
+        {
+          if (policy.throttler_messages) {
+            ldout(async_msgr->cct,10) << __func__ << " wants " << 1 << " message from policy throttler "
+                                << policy.throttler_messages->get_current() << "/"
+                                << policy.throttler_messages->get_max() << dendl;
+            // FIXME: may block
+            policy.throttler_messages->get();
+          }
+
+          state = STATE_OPEN_MESSAGE_THROTTLE_BYTES;
+          break;
+        }
+
+      case STATE_OPEN_MESSAGE_THROTTLE_BYTES:
+        {
+          uint64_t message_size = current_header.front_len + current_header.middle_len + current_header.data_len;
+          if (message_size) {
+            if (policy.throttler_bytes) {
+              ldout(async_msgr->cct,10) << __func__ << " wants " << message_size << " bytes from policy throttler "
+                  << policy.throttler_bytes->get_current() << "/"
+                  << policy.throttler_bytes->get_max() << dendl;
+              // FIXME: may block
+              policy.throttler_bytes->get(message_size);
+            }
+          }
+
+          throttle_stamp = ceph_clock_now(msgr->cct);
+          state = STATE_OPEN_MESSAGE_READ_FRONT;
+          break;
+        }
+
+      case STATE_OPEN_MESSAGE_READ_FRONT:
+        {
+          // read front
+          int front_len = current_header.front_len;
+          if (front_len) {
+            bufferptr ptr = buffer::create(front_len);
+            r = read_until(front_len, ptr);
+            if (r < 0) {
+              ldout(async_msgr->cct, 1) << __func__ << " read message front failed" << dendl;
+              goto fail;
+            } else if (r > 0) {
+              break;
+            }
+
+            front.push_back(ptr);
+            ldout(async_msgr->cct, 20) << __func__ << " got front " << front.length() << dendl;
+          }
+          state = STATE_OPEN_MESSAGE_READ_MIDDLE;
+          break;
+        }
+
+      case STATE_OPEN_MESSAGE_READ_MIDDLE:
+        {
+          // read middle
+          int middle_len = current_header.middle_len;
+          if (middle_len) {
+            bufferptr ptr = buffer::create(middle_len);
+            r = read_until(middle_len, ptr);
+            if (r < 0) {
+              ldout(async_msgr->cct, 1) << __func__ << " read message middle failed" << dendl;
+              goto fail;
+            } else if (r > 0) {
+              break;
+            }
+            middle.push_back(ptr);
+            ldout(async_msgr->cct, 20) << __func__ << " got middle " << middle.length() << dendl;
+          }
+
+          state = STATE_OPEN_MESSAGE_READ_DATA_PREPARE;
+          break;
+        }
+
+      case STATE_OPEN_MESSAGE_READ_DATA_PREPARE:
+        {
+          // read data
+          uint64_t data_len = le32_to_cpu(current_header.data_len);
+          int data_off = le32_to_cpu(current_header.data_off);
+          if (data_len) {
+            // get a buffer
+            map<ceph_tid_t,pair<bufferlist,int> >::iterator p = rx_buffers.find(current_header.tid);
+            if (p != rx_buffers.end()) {
+              ldout(async_msgr->cct,10) << __func__ << " seleting rx buffer v " << p->second.second
+                                  << " at offset " << data_off
+                                  << " len " << p->second.first.length() << dendl;
+              data_buf = p->second.first;
+              // make sure it's big enough
+              if (data_buf.length() < data_len)
+                data_buf.push_back(buffer::create(data_len - data_buf.length()));
+              data_blp = data_buf.begin();
+            } else {
+              ldout(async_msgr->cct,20) << __func__ << " allocating new rx buffer at offset " << data_off << dendl;
+              alloc_aligned_buffer(data_buf, data_len, data_off);
+              data_blp = data_buf.begin();
+            }
+          }
+
+          msg_left = data_len;
+          state = STATE_OPEN_MESSAGE_READ_DATA;
+          break;
+        }
+
+      case STATE_OPEN_MESSAGE_READ_DATA:
+        {
+          while (msg_left > 0) {
+            bufferptr bp = data_blp.get_current_ptr();
+            uint64_t read = MIN(bp.length(), msg_left);
+            r = read_until(read, bp);
+            if (r < 0) {
+              ldout(async_msgr->cct, 1) << __func__ << " read data error " << dendl;
+              goto fail;
+            } else if (r > 0) {
+              break;
+            }
+
+            data_blp.advance(read);
+            data.append(bp, 0, read);
+            msg_left -= read;
+          }
+
+          if (msg_left == 0)
+            state = STATE_OPEN_MESSAGE_READ_FOOTER_AND_DISPATCH;
+
+          break;
+        }
+
+      case STATE_OPEN_MESSAGE_READ_FOOTER_AND_DISPATCH:
+        {
+          ceph_msg_footer footer;
+          ceph_msg_footer_old old_footer;
+          int len;
+          // footer
+          if (has_feature(CEPH_FEATURE_MSG_AUTH))
+            len = sizeof(footer);
+          else
+            len = sizeof(old_footer);
+
+          r = read_until(len, state_buffer);
+          if (r < 0) {
+            ldout(async_msgr->cct, 1) << __func__ << " read footer data error " << dendl;
+            goto fail;
+          } else if (r > 0) {
+            break;
+          }
+
+          if (has_feature(CEPH_FEATURE_MSG_AUTH)) {
+            footer = *((ceph_msg_footer*)state_buffer.c_str());
+          } else {
+            old_footer = *((ceph_msg_footer_old*)state_buffer.c_str());
+            footer.front_crc = old_footer.front_crc;
+            footer.middle_crc = old_footer.middle_crc;
+            footer.data_crc = old_footer.data_crc;
+            footer.sig = 0;
+            footer.flags = old_footer.flags;
+          }
+          int aborted = (footer.flags & CEPH_MSG_FOOTER_COMPLETE) == 0;
+          ldout(async_msgr->cct, 10) << __func__ << " aborted = " << aborted << dendl;
+          if (aborted) {
+            ldout(async_msgr->cct, 0) << __func__ << " got " << front.length() << " + " << middle.length() << " + " << data.length()
+                                << " byte message.. ABORTED" << dendl;
+            goto fail;
+          }
+
+          ldout(async_msgr->cct, 20) << __func__ << " got " << front.length() << " + " << middle.length()
+                              << " + " << data.length() << " byte message" << dendl;
+          Message *message = decode_message(async_msgr->cct, current_header, footer, front, middle, data);
+          if (!message) {
+            ldout(async_msgr->cct, 1) << __func__ << " decode message failed " << dendl;
+            goto fail;
+          }
+
+          //
+          //  Check the signature if one should be present.  A zero return indicates success. PLR
+          //
+
+          if (session_security.get() == NULL) {
+            ldout(async_msgr->cct, 10) << __func__ << " No session security set" << dendl;
+          } else {
+            if (session_security->check_message_signature(message)) {
+              ldout(async_msgr->cct, 0) << __func__ << "Signature check failed" << dendl;
+              goto fail;
+            }
+          }
+          message->set_byte_throttler(policy.throttler_bytes);
+          message->set_message_throttler(policy.throttler_messages);
+
+          // store reservation size in message, so we don't get confused
+          // by messages entering the dispatch queue through other paths.
+          uint64_t message_size = current_header.front_len + current_header.middle_len + current_header.data_len;
+          message->set_dispatch_throttle_size(message_size);
+
+          message->set_recv_stamp(recv_stamp);
+          message->set_throttle_stamp(throttle_stamp);
+          message->set_recv_complete_stamp(ceph_clock_now(async_msgr->cct));
+
+          // check received seq#.  if it is old, drop the message.  
+          // note that incoming messages may skip ahead.  this is convenient for the client
+          // side queueing because messages can't be renumbered, but the (kernel) client will
+          // occasionally pull a message out of the sent queue to send elsewhere.  in that case
+          // it doesn't matter if we "got" it or not.
+          if (message->get_seq() <= in_seq) {
+            ldout(async_msgr->cct,0) << __func__ << " got old message "
+                    << message->get_seq() << " <= " << in_seq << " " << message << " " << *message
+                    << ", discarding" << dendl;
+            message->put();
+            if (has_feature(CEPH_FEATURE_RECONNECT_SEQ) && async_msgr->cct->_conf->ms_die_on_old_message)
+              assert(0 == "old msgs despite reconnect_seq feature");
+            goto fail;
+          }
+          message->set_connection(this);
+
+          // note last received message.
+          in_seq = message->get_seq();
+          ldout(async_msgr->cct, 10) << __func__ << " got message " << message->get_seq()
+                               << " " << message << " " << *message << dendl;
+
+          // if send_message always successfully send, it may have no
+          // opportunity to send seq ack. 10 is a experience value.
+          if (in_seq > in_seq_acked + 10) {
+            center->create_time_event(2, write_handler);
+          }
+
+          state = STATE_OPEN;
+
+          async_msgr->ms_fast_preprocess(message);
+          if (async_msgr->ms_can_fast_dispatch(message)) {
+            lock.Unlock();
+            async_msgr->ms_fast_dispatch(message);
+            lock.Lock();
+          } else {
+            center->create_time_event(1, EventCallbackRef(new C_handle_dispatch(async_msgr, message)));
+          }
+
+          break;
+        }
+
+      case STATE_OPEN_TAG_CLOSE:
+        {
+          ldout(async_msgr->cct,20) << __func__ << " got CLOSE" << dendl;
+          _stop();
+          break;
+        }
+
+      case STATE_STANDBY:
+        {
+          ldout(async_msgr->cct,20) << __func__ << " enter STANDY" << dendl;
+
+          break;
+        }
+
+      case STATE_CLOSED:
+        {
+          center->delete_file_event(sd, EVENT_READABLE);
+          ldout(async_msgr->cct, 20) << __func__ << " socket closed" << dendl;
+          break;
+        }
+
+      default:
+        {
+          if (_process_connection() < 0)
+            goto fail;
+          break;
+        }
+    }
+
+    continue;
+
+fail:
+    // clean up state internal variables and states
+    if (state >= STATE_CONNECTING_SEND_CONNECT_MSG &&
+        state <= STATE_CONNECTING_READY) {
+      delete authorizer;
+      authorizer = NULL;
+      got_bad_auth = false;
+    }
+
+    if (state > STATE_OPEN_MESSAGE_THROTTLE_MESSAGE &&
+        state <= STATE_OPEN_MESSAGE_READ_FOOTER_AND_DISPATCH
+        && policy.throttler_messages) {
+      ldout(async_msgr->cct,10) << __func__ << " releasing " << 1
+                          << " message to policy throttler "
+                          << policy.throttler_messages->get_current() << "/"
+                          << policy.throttler_messages->get_max() << dendl;
+      policy.throttler_messages->put();
+    }
+    if (state > STATE_OPEN_MESSAGE_THROTTLE_BYTES &&
+        state <= STATE_OPEN_MESSAGE_READ_FOOTER_AND_DISPATCH) {
+      uint64_t message_size = current_header.front_len + current_header.middle_len + current_header.data_len;
+      if (policy.throttler_bytes) {
+        ldout(async_msgr->cct,10) << __func__ << " releasing " << message_size
+                            << " bytes to policy throttler "
+                            << policy.throttler_bytes->get_current() << "/"
+                            << policy.throttler_bytes->get_max() << dendl;
+        policy.throttler_bytes->put(message_size);
+      }
+    }
+    fault();
+  } while (prev_state != state);
+}
+
+int AsyncConnection::_process_connection()
+{
+  int r = 0;
+
+  switch(state) {
+    case STATE_WAIT_SEND:
+      {
+        if (!outcoming_bl.length()) {
+          assert(state_after_send);
+          state = state_after_send;
+          state_after_send = 0;
+        }
+        break;
+      }
+
+    case STATE_CONNECTING:
+      {
+        assert(!policy.server);
+
+        // reset connect state variables
+        got_bad_auth = false;
+        delete authorizer;
+        authorizer = NULL;
+        memset(&connect_msg, 0, sizeof(connect_msg));
+        memset(&connect_reply, 0, sizeof(connect_reply));
+
+        global_seq = async_msgr->get_global_seq();
+        // close old socket.  this is safe because we stopped the reader thread above.
+        if (sd >= 0) {
+          center->delete_file_event(sd, EVENT_READABLE|EVENT_WRITABLE);
+          ::close(sd);
+        }
+
+        sd = net.connect(get_peer_addr());
+        if (sd < 0) {
+          goto fail;
+        }
+        r = net.set_nonblock(sd);
+        if (r < 0) {
+          goto fail;
+        }
+        net.set_socket_options(sd);
+
+        center->create_file_event(sd, EVENT_READABLE, read_handler);
+        state = STATE_CONNECTING_WAIT_BANNER;
+        break;
+      }
+
+    case STATE_CONNECTING_WAIT_BANNER:
+      {
+        r = read_until(strlen(CEPH_BANNER), state_buffer);
+        if (r < 0) {
+          ldout(async_msgr->cct, 1) << __func__ << " read banner failed" << dendl;
+          goto fail;
+        } else if (r > 0) {
+          break;
+        }
+
+        if (memcmp(state_buffer.c_str(), CEPH_BANNER, strlen(CEPH_BANNER))) {
+          ldout(async_msgr->cct, 0) << __func__ << " connect protocol error (bad banner) on peer "
+                              << get_peer_addr() << dendl;
+          goto fail;
+        }
+
+        ldout(async_msgr->cct, 10) << __func__ << " get banner, ready to send banner" << dendl;
+
+        bufferlist bl;
+        bl.append(state_buffer.c_str(), strlen(CEPH_BANNER));
+        r = _try_send(bl);
+        if (r == 0) {
+          state = STATE_CONNECTING_WAIT_IDENTIFY_PEER;
+          ldout(async_msgr->cct, 10) << __func__ << " connect write banner done: "
+                               << get_peer_addr() << dendl;
+        } else if (r > 0) {
+          state = STATE_WAIT_SEND;
+          state_after_send = STATE_CONNECTING_WAIT_IDENTIFY_PEER;
+          ldout(async_msgr->cct, 10) << __func__ << " connect wait for write banner: "
+                               << get_peer_addr() << dendl;
+        } else {
+          goto fail;
+        }
+        break;
+      }
+
+    case STATE_CONNECTING_WAIT_IDENTIFY_PEER:
+      {
+        entity_addr_t paddr, peer_addr_for_me;
+        int port;
+        bufferlist myaddrbl;
+
+        r = read_until(sizeof(paddr)*2, state_buffer);
+        if (r < 0) {
+          ldout(async_msgr->cct, 1) << __func__ << " read identify peeraddr failed" << dendl;
+          goto fail;
+        } else if (r > 0) {
+          break;
+        }
+
+        bufferlist bl;
+        bl.append(state_buffer);
+        bufferlist::iterator p = bl.begin();
+        try {
+          ::decode(paddr, p);
+          ::decode(peer_addr_for_me, p);
+        } catch (const buffer::error& e) {
+          lderr(async_msgr->cct) << __func__ <<  " decode peer addr failed " << dendl;
+          goto fail;
+        }
+        port = peer_addr_for_me.get_port();
+        ldout(async_msgr->cct, 20) << __func__ <<  " connect read peer addr "
+                             << paddr << " on socket " << sd << dendl;
+        if (peer_addr != paddr) {
+          if (paddr.is_blank_ip() && peer_addr.get_port() == paddr.get_port() &&
+              peer_addr.get_nonce() == paddr.get_nonce()) {
+            ldout(async_msgr->cct, 0) << __func__ <<  " connect claims to be " << paddr
+                                << " not " << peer_addr
+                                << " - presumably this is the same node!" << dendl;
+          } else {
+            ldout(async_msgr->cct, 0) << __func__ << " connect claims to be "
+                                << paddr << " not " << peer_addr << " - wrong node!" << dendl;
+            goto fail;
+          }
+        }
+
+        ldout(async_msgr->cct, 20) << __func__ << " connect peer addr for me is " << peer_addr_for_me << dendl;
+        async_msgr->learned_addr(peer_addr_for_me);
+        ::encode(async_msgr->get_myaddr(), myaddrbl);
+        r = _try_send(myaddrbl);
+        if (r == 0) {
+          state = STATE_CONNECTING_SEND_CONNECT_MSG;
+          ldout(async_msgr->cct, 10) << __func__ << " connect sent my addr "
+              << async_msgr->get_myaddr() << dendl;
+        } else if (r > 0) {
+          state = STATE_WAIT_SEND;
+          state_after_send = STATE_CONNECTING_SEND_CONNECT_MSG;
+          ldout(async_msgr->cct, 10) << __func__ << " connect send my addr done: "
+              << async_msgr->get_myaddr() << dendl;
+        } else {
+          ldout(async_msgr->cct, 2) << __func__ << " connect couldn't write my addr, "
+              << cpp_strerror(errno) << dendl;
+          goto fail;
+        }
+
+        break;
+      }
+
+    case STATE_CONNECTING_SEND_CONNECT_MSG:
+      {
+        if (!got_bad_auth) {
+          delete authorizer;
+          authorizer = async_msgr->get_authorizer(peer_type, false);
+        }
+        bufferlist bl;
+
+        connect_msg.features = policy.features_supported;
+        connect_msg.host_type = async_msgr->get_myinst().name.type();
+        connect_msg.global_seq = global_seq;
+        connect_msg.connect_seq = connect_seq;
+        connect_msg.protocol_version = async_msgr->get_proto_version(peer_type, true);
+        connect_msg.authorizer_protocol = authorizer ? authorizer->protocol : 0;
+        connect_msg.authorizer_len = authorizer ? authorizer->bl.length() : 0;
+        if (authorizer)
+          ldout(async_msgr->cct, 10) << __func__ <<  "connect_msg.authorizer_len="
+              << connect_msg.authorizer_len << " protocol="
+              << connect_msg.authorizer_protocol << dendl;
+        connect_msg.flags = 0;
+        if (policy.lossy)
+          connect_msg.flags |= CEPH_MSG_CONNECT_LOSSY;  // this is fyi, actually, server decides!
+        bl.append((char*)&connect_msg, sizeof(connect_msg));
+        if (authorizer) {
+          bl.append(authorizer->bl.c_str(), authorizer->bl.length());
+        }
+        ldout(async_msgr->cct, 10) << __func__ << " connect sending gseq=" << global_seq << " cseq="
+            << connect_seq << " proto=" << connect_msg.protocol_version << dendl;
+
+        r = _try_send(bl);
+        if (r == 0) {
+          state = STATE_CONNECTING_WAIT_CONNECT_REPLY;
+          ldout(async_msgr->cct,20) << __func__ << "connect wrote (self +) cseq, waiting for reply" << dendl;
+        } else if (r > 0) {
+          state = STATE_WAIT_SEND;
+          state_after_send = STATE_CONNECTING_WAIT_CONNECT_REPLY;
+          ldout(async_msgr->cct, 10) << __func__ << " continue send reply " << dendl;
+        } else {
+          ldout(async_msgr->cct, 2) << __func__ << " connect couldn't send reply "
+              << cpp_strerror(errno) << dendl;
+          goto fail;
+        }
+
+        break;
+      }
+
+    case STATE_CONNECTING_WAIT_CONNECT_REPLY:
+      {
+        r = read_until(sizeof(connect_reply), state_buffer);
+        if (r < 0) {
+          ldout(async_msgr->cct, 1) << __func__ << " read connect reply failed" << dendl;
+          goto fail;
+        } else if (r > 0) {
+          break;
+        }
+
+        connect_reply = *((ceph_msg_connect_reply*)state_buffer.c_str());
+        connect_reply.features = ceph_sanitize_features(connect_reply.features);
+
+        ldout(async_msgr->cct, 20) << __func__ << " connect got reply tag " << (int)connect_reply.tag
+                             << " connect_seq " << connect_reply.connect_seq << " global_seq "
+                             << connect_reply.global_seq << " proto " << connect_reply.protocol_version
+                             << " flags " << (int)connect_reply.flags << " features "
+                             << connect_reply.features << dendl;
+        state = STATE_CONNECTING_WAIT_CONNECT_REPLY_AUTH;
+
+        break;
+      }
+
+    case STATE_CONNECTING_WAIT_CONNECT_REPLY_AUTH:
+      {
+        bufferlist authorizer_reply;
+        if (connect_reply.authorizer_len) {
+          ldout(async_msgr->cct, 10) << __func__ << " reply.authorizer_len=" << connect_reply.authorizer_len << dendl;
+          r = read_until(connect_reply.authorizer_len, state_buffer);
+          if (r < 0) {
+            ldout(async_msgr->cct, 1) << __func__ << " read connect reply authorizer failed" << dendl;
+            goto fail;
+          } else if (r > 0) {
+            break;
+          }
+
+          authorizer_reply.push_back(state_buffer);
+          bufferlist::iterator iter = authorizer_reply.begin();
+          if (authorizer && !authorizer->verify_reply(iter)) {
+            ldout(async_msgr->cct, 0) << __func__ << " failed verifying authorize reply" << dendl;
+            goto fail;
+          }
+        }
+        r = handle_connect_reply(connect_msg, connect_reply);
+        if (r < 0)
+          goto fail;
+
+        // state must be changed!
+        assert(state != STATE_CONNECTING_WAIT_CONNECT_REPLY_AUTH);
+        break;
+      }
+
+    case STATE_CONNECTING_WAIT_ACK_SEQ:
+      {
+        uint64_t newly_acked_seq = 0;
+        bufferlist bl;
+
+        r = read_until(sizeof(newly_acked_seq), state_buffer);
+        if (r < 0) {
+          ldout(async_msgr->cct, 1) << __func__ << " read connect ack seq failed" << dendl;
+          goto fail;
+        } else if (r > 0) {
+          break;
+        }
+
+        newly_acked_seq = *((uint64_t*)state_buffer.c_str());
+        ldout(async_msgr->cct, 2) << __func__ << " got newly_acked_seq " << newly_acked_seq
+                            << " vs out_seq " << out_seq << dendl;
+        while (newly_acked_seq > out_seq) {
+          Message *m = _get_next_outgoing();
+          assert(m);
+          ldout(async_msgr->cct, 2) << __func__ << " discarding previously sent " << m->get_seq()
+                              << " " << *m << dendl;
+          assert(m->get_seq() <= newly_acked_seq);
+          m->put();
+          ++out_seq;
+        }
+
+        bl.append((char*)&in_seq, sizeof(in_seq));
+        r = _try_send(bl);
+        if (r == 0) {
+          state = STATE_CONNECTING_READY;
+          ldout(async_msgr->cct, 10) << __func__ << " send in_seq done " << dendl;
+        } else if (r > 0) {
+          state_after_send = STATE_CONNECTING_READY;
+          state = STATE_WAIT_SEND;
+          ldout(async_msgr->cct, 10) << __func__ << " continue send in_seq " << dendl;
+        } else {
+          goto fail;
+        }
+        break;
+      }
+
+    case STATE_CONNECTING_READY:
+      {
+        // hooray!
+        peer_global_seq = connect_reply.global_seq;
+        policy.lossy = connect_reply.flags & CEPH_MSG_CONNECT_LOSSY;
+        state = STATE_OPEN;
+        connect_seq += 1;
+        assert(connect_seq == connect_reply.connect_seq);
+        backoff = utime_t();
+        set_features((uint64_t)connect_reply.features & (uint64_t)connect_msg.features);
+        ldout(async_msgr->cct, 10) << __func__ << "connect success " << connect_seq
+                             << ", lossy = " << policy.lossy << ", features "
+                             << get_features() << dendl;
+
+        // If we have an authorizer, get a new AuthSessionHandler to deal with ongoing security of the
+        // connection.  PLR
+        if (authorizer != NULL) {
+          session_security.reset(
+              get_auth_session_handler(async_msgr->cct,
+                                       authorizer->protocol,
+                                       authorizer->session_key,
+                                       get_features()));
+        } else {
+          // We have no authorizer, so we shouldn't be applying security to messages in this AsyncConnection.  PLR
+          session_security.reset();
+        }
+
+        async_msgr->ms_deliver_handle_connect(this);
+        async_msgr->ms_deliver_handle_fast_connect(this);
+
+        // message may in queue between last _try_send and connection ready
+        // write event may already notify and we need to force scheduler again
+        if (is_queued())
+          center->create_time_event(1, write_handler);
+
+        break;
+      }
+
+    case STATE_ACCEPTING:
+      {
+        bufferlist bl;
+
+        if (net.set_nonblock(sd) < 0)
+          goto fail;
+
+        net.set_socket_options(sd);
+
+        bl.append(CEPH_BANNER, strlen(CEPH_BANNER));
+
+        ::encode(async_msgr->get_myaddr(), bl);
+        port = async_msgr->get_myaddr().get_port();
+        // and peer's socket addr (they might not know their ip)
+        socklen_t len = sizeof(socket_addr.ss_addr());
+        r = ::getpeername(sd, (sockaddr*)&socket_addr.ss_addr(), &len);
+        if (r < 0) {
+          ldout(async_msgr->cct, 0) << __func__ << " failed to getpeername "
+                              << cpp_strerror(errno) << dendl;
+          goto fail;
+        }
+        ::encode(socket_addr, bl);
+        ldout(async_msgr->cct, 1) << __func__ << " sd=" << sd << " " << socket_addr << dendl;
+
+        r = _try_send(bl);
+        if (r == 0) {
+          state = STATE_ACCEPTING_WAIT_BANNER_ADDR;
+          ldout(async_msgr->cct, 10) << __func__ << " write banner and addr done: "
+            << get_peer_addr() << dendl;
+        } else if (r > 0) {
+          state = STATE_WAIT_SEND;
+          state_after_send = STATE_ACCEPTING_WAIT_BANNER_ADDR;
+          ldout(async_msgr->cct, 10) << __func__ << " wait for write banner and addr: "
+                              << get_peer_addr() << dendl;
+        } else {
+          goto fail;
+        }
+
+        break;
+      }
+    case STATE_ACCEPTING_WAIT_BANNER_ADDR:
+      {
+        bufferlist addr_bl;
+        entity_addr_t peer_addr;
+
+        r = read_until(strlen(CEPH_BANNER) + sizeof(peer_addr), state_buffer);
+        if (r < 0) {
+          ldout(async_msgr->cct, 1) << __func__ << " read peer banner and addr failed" << dendl;
+          goto fail;
+        } else if (r > 0) {
+          break;
+        }
+
+        if (memcmp(state_buffer.c_str(), CEPH_BANNER, strlen(CEPH_BANNER))) {
+          ldout(async_msgr->cct, 1) << __func__ << " accept peer sent bad banner '" << state_buffer.c_str()
+                                    << "' (should be '" << CEPH_BANNER << "')" << dendl;
+          goto fail;
+        }
+
+        addr_bl.append(state_buffer, strlen(CEPH_BANNER), sizeof(peer_addr));
+        {
+          bufferlist::iterator ti = addr_bl.begin();
+          ::decode(peer_addr, ti);
+        }
+
+        ldout(async_msgr->cct, 10) << __func__ << " accept peer addr is " << peer_addr << dendl;
+        if (peer_addr.is_blank_ip()) {
+          // peer apparently doesn't know what ip they have; figure it out for them.
+          int port = peer_addr.get_port();
+          peer_addr.addr = socket_addr.addr;
+          peer_addr.set_port(port);
+          ldout(async_msgr->cct, 0) << __func__ << " accept peer addr is really " << peer_addr
+                             << " (socket is " << socket_addr << ")" << dendl;
+        }
+        set_peer_addr(peer_addr);  // so that connection_state gets set up
+        state = STATE_ACCEPTING_WAIT_CONNECT_MSG;
+        break;
+      }
+
+    case STATE_ACCEPTING_WAIT_CONNECT_MSG:
+      {
+        r = read_until(sizeof(connect_msg), state_buffer);
+        if (r < 0) {
+          ldout(async_msgr->cct, 1) << __func__ << " read connect msg failed" << dendl;
+          goto fail;
+        } else if (r > 0) {
+          break;
+        }
+
+        connect_msg = *((ceph_msg_connect*)state_buffer.c_str());
+        // sanitize features
+        connect_msg.features = ceph_sanitize_features(connect_msg.features);
+        state = STATE_ACCEPTING_WAIT_CONNECT_MSG_AUTH;
+        break;
+      }
+
+    case STATE_ACCEPTING_WAIT_CONNECT_MSG_AUTH:
+      {
+        bufferlist authorizer_bl, authorizer_reply;
+
+        if (connect_msg.authorizer_len) {
+          r = read_until(connect_msg.authorizer_len, state_buffer);
+          if (r < 0) {
+            ldout(async_msgr->cct, 1) << __func__ << " read connect msg failed" << dendl;
+            goto fail;
+          } else if (r > 0) {
+            break;
+          }
+          authorizer_bl.push_back(state_buffer);
+        }
+
+        ldout(async_msgr->cct, 20) << __func__ << " accept got peer connect_seq "
+                             << connect_msg.connect_seq << " global_seq "
+                             << connect_msg.global_seq << dendl;
+        set_peer_type(connect_msg.host_type);
+        policy = async_msgr->get_policy(connect_msg.host_type);
+        ldout(async_msgr->cct, 10) << __func__ << " accept of host_type " << connect_msg.host_type
+                                   << ", policy.lossy=" << policy.lossy << " policy.server="
+                                   << policy.server << " policy.standby=" << policy.standby
+                                   << " policy.resetcheck=" << policy.resetcheck << dendl;
+
+        r = handle_connect_msg(connect_msg, authorizer_bl, authorizer_reply);
+        if (r < 0)
+          goto fail;
+
+        // state is changed by "handle_connect_msg"
+        assert(state != STATE_ACCEPTING_WAIT_CONNECT_MSG_AUTH);
+        break;
+      }
+
+    case STATE_ACCEPTING_WAIT_SEQ:
+      {
+        uint64_t newly_acked_seq;
+        r = read_until(sizeof(newly_acked_seq), state_buffer);
+        if (r < 0) {
+          ldout(async_msgr->cct, 1) << __func__ << " read ack seq failed" << dendl;
+          goto fail;
+        } else if (r > 0) {
+          break;
+        }
+
+        newly_acked_seq = *((uint64_t*)state_buffer.c_str());
+        ldout(async_msgr->cct, 2) << __func__ << " accept get newly_acked_seq " << newly_acked_seq << dendl;
+        discard_requeued_up_to(newly_acked_seq);
+        state = STATE_ACCEPTING_READY;
+        break;
+      }
+
+    case STATE_ACCEPTING_READY:
+      {
+        ldout(async_msgr->cct, 20) << __func__ << " accept done" << dendl;
+        state = STATE_OPEN;
+        memset(&connect_msg, 0, sizeof(connect_msg));
+        break;
+      }
+
+    default:
+      {
+        lderr(async_msgr->cct) << __func__ << " bad state" << get_state_name(state) << dendl;
+        assert(0);
+      }
+  }
+
+  return 0;
+
+fail:
+  return -1;
+}
+
+int AsyncConnection::handle_connect_reply(ceph_msg_connect &connect, ceph_msg_connect_reply &reply)
+{
+  uint64_t feat_missing;
+  if (reply.tag == CEPH_MSGR_TAG_FEATURES) {
+    ldout(async_msgr->cct, 0) << __func__ << " connect protocol feature mismatch, my "
+                        << std::hex << connect.features << " < peer "
+                        << reply.features << " missing "
+                        << (reply.features & ~policy.features_supported)
+                        << std::dec << dendl;
+    goto fail;
+  }
+
+  if (reply.tag == CEPH_MSGR_TAG_BADPROTOVER) {
+    ldout(async_msgr->cct, 0) << __func__ << " connect protocol version mismatch, my "
+                        << connect.protocol_version << " != " << reply.protocol_version
+                        << dendl;
+    goto fail;
+  }
+
+  if (reply.tag == CEPH_MSGR_TAG_BADAUTHORIZER) {
+    ldout(async_msgr->cct,0) << __func__ << " connect got BADAUTHORIZER" << dendl;
+    if (got_bad_auth)
+      goto fail;
+    got_bad_auth = true;
+    delete authorizer;
+    authorizer = async_msgr->get_authorizer(peer_type, true);  // try harder
+    state = STATE_CONNECTING_SEND_CONNECT_MSG;
+  }
+  if (reply.tag == CEPH_MSGR_TAG_RESETSESSION) {
+    ldout(async_msgr->cct, 0) << __func__ << "connect got RESETSESSION" << dendl;
+    was_session_reset();
+    state = STATE_CONNECTING_SEND_CONNECT_MSG;
+  }
+  if (reply.tag == CEPH_MSGR_TAG_RETRY_GLOBAL) {
+    global_seq = async_msgr->get_global_seq(reply.global_seq);
+    ldout(async_msgr->cct, 10) << __func__ << " connect got RETRY_GLOBAL "
+                         << reply.global_seq << " chose new "
+                         << global_seq << dendl;
+    state = STATE_CONNECTING_SEND_CONNECT_MSG;
+  }
+  if (reply.tag == CEPH_MSGR_TAG_RETRY_SESSION) {
+    assert(reply.connect_seq > connect_seq);
+    connect_seq = reply.connect_seq;
+    ldout(async_msgr->cct, 10) << __func__ << " connect got RETRY_SESSION "
+                         << connect_seq << " -> "
+                         << reply.connect_seq << dendl;
+    state = STATE_CONNECTING_SEND_CONNECT_MSG;
+  }
+  if (reply.tag == CEPH_MSGR_TAG_WAIT) {
+    ldout(async_msgr->cct, 3) << __func__ << " connect got WAIT (connection race)" << dendl;
+    state = STATE_WAIT;
+  }
+
+  feat_missing = policy.features_required & ~(uint64_t)connect_reply.features;
+  if (feat_missing) {
+    ldout(async_msgr->cct, 1) << __func__ << " missing required features " << std::hex
+                              << feat_missing << std::dec << dendl;
+    goto fail;
+  }
+
+  if (reply.tag == CEPH_MSGR_TAG_SEQ) {
+    ldout(async_msgr->cct, 10) << __func__ << "got CEPH_MSGR_TAG_SEQ, reading acked_seq and writing in_seq" << dendl;
+    state = STATE_CONNECTING_WAIT_ACK_SEQ;
+  }
+  if (reply.tag == CEPH_MSGR_TAG_READY) {
+    ldout(async_msgr->cct, 10) << __func__ << "got CEPH_MSGR_TAG_READY " << dendl;
+    state = STATE_CONNECTING_READY;
+  }
+
+  return 0;
+
+ fail:
+  return -1;
+}
+
+int AsyncConnection::handle_connect_msg(ceph_msg_connect &connect, bufferlist &authorizer_bl,
+                                        bufferlist &authorizer_reply)
+{
+  int r;
+  ceph_msg_connect_reply reply;
+  bufferlist reply_bl;
+  uint64_t existing_seq = -1;
+  bool is_reset_from_peer = false;
+  char reply_tag;
+
+  memset(&reply, 0, sizeof(reply));
+  reply.protocol_version = async_msgr->get_proto_version(peer_type, false);
+
+  // mismatch?
+  ldout(async_msgr->cct,10) << __func__ << "accept my proto " << reply.protocol_version
+                      << ", their proto " << connect.protocol_version << dendl;
+  if (connect.protocol_version != reply.protocol_version) {
+    return _reply_accept(CEPH_MSGR_TAG_BADPROTOVER, connect, reply, authorizer_reply);
+  }
+  // require signatures for cephx?
+  if (connect.authorizer_protocol == CEPH_AUTH_CEPHX) {
+    if (peer_type == CEPH_ENTITY_TYPE_OSD ||
+        peer_type == CEPH_ENTITY_TYPE_MDS) {
+      if (async_msgr->cct->_conf->cephx_require_signatures ||
+          async_msgr->cct->_conf->cephx_cluster_require_signatures) {
+        ldout(async_msgr->cct, 10) << __func__ << " using cephx, requiring MSG_AUTH feature bit for cluster" << dendl;
+        policy.features_required |= CEPH_FEATURE_MSG_AUTH;
+      }
+    } else {
+      if (async_msgr->cct->_conf->cephx_require_signatures ||
+          async_msgr->cct->_conf->cephx_service_require_signatures) {
+        ldout(async_msgr->cct, 10) << __func__ << " using cephx, requiring MSG_AUTH feature bit for service" << dendl;
+        policy.features_required |= CEPH_FEATURE_MSG_AUTH;
+      }
+    }
+  }
+  uint64_t feat_missing = policy.features_required & ~(uint64_t)connect.features;
+  if (feat_missing) {
+    ldout(async_msgr->cct, 1) << __func__ << "peer missing required features "
+                        << std::hex << feat_missing << std::dec << dendl;
+    return _reply_accept(CEPH_MSGR_TAG_FEATURES, connect, reply, authorizer_reply);
+  }
+
+  bool authorizer_valid;
+  if (!async_msgr->verify_authorizer(this, peer_type, connect.authorizer_protocol, authorizer_bl,
+                               authorizer_reply, authorizer_valid, session_key) || !authorizer_valid) {
+    ldout(async_msgr->cct,0) << __func__ << ": got bad authorizer" << dendl;
+    session_security.reset();
+    return _reply_accept(CEPH_MSGR_TAG_BADAUTHORIZER, connect, reply, authorizer_reply);
+  }
+
+  // We've verified the authorizer for this AsyncConnection, so set up the session security structure.  PLR
+  ldout(async_msgr->cct, 10) << __func__ << " accept:  setting up session_security." << dendl;
+
+  // existing?
+  AsyncConnectionRef existing = async_msgr->lookup_conn(peer_addr);
+  if (existing) {
+    if (connect.global_seq < existing->peer_global_seq) {
+      ldout(async_msgr->cct, 10) << __func__ << " accept existing " << existing
+                           << ".gseq " << existing->peer_global_seq << " > "
+                           << connect.global_seq << ", RETRY_GLOBAL" << dendl;
+      reply.global_seq = existing->peer_global_seq;  // so we can send it below..
+      return _reply_accept(CEPH_MSGR_TAG_RETRY_GLOBAL, connect, reply, authorizer_reply);
+    } else {
+      ldout(async_msgr->cct, 10) << __func__ << " accept existing " << existing
+                           << ".gseq " << existing->peer_global_seq
+                           << " <= " << connect.global_seq << ", looks ok" << dendl;
+    }
+
+    if (existing->policy.lossy) {
+      ldout(async_msgr->cct, 0) << __func__ << " accept replacing existing (lossy) channel (new one lossy="
+                          << policy.lossy << ")" << dendl;
+      existing->was_session_reset();
+      goto replace;
+    }
+
+    ldout(async_msgr->cct, 0) << __func__ << "accept connect_seq " << connect.connect_seq
+                        << " vs existing " << existing->connect_seq
+                        << " state " << existing->state << dendl;
+
+    if (connect.connect_seq == 0 && existing->connect_seq > 0) {
+      ldout(async_msgr->cct,0) << __func__ << " accept peer reset, then tried to connect to us, replacing" << dendl;
+      // this is a hard reset from peer
+      is_reset_from_peer = true;
+      if (policy.resetcheck)
+        existing->was_session_reset(); // this resets out_queue, msg_ and connect_seq #'s
+      goto replace;
+    }
+
+    if (connect.connect_seq < existing->connect_seq) {
+      // old attempt, or we sent READY but they didn't get it.
+      ldout(async_msgr->cct, 10) << __func__ << "accept existing " << existing << ".cseq "
+                           << existing->connect_seq << " > " << connect.connect_seq
+                           << ", RETRY_SESSION" << dendl;
+      reply.connect_seq = existing->connect_seq + 1;
+      return _reply_accept(CEPH_MSGR_TAG_RETRY_SESSION, connect, reply, authorizer_reply);
+    }
+
+    if (connect.connect_seq == existing->connect_seq) {
+      // if the existing connection successfully opened, and/or
+      // subsequently went to standby, then the peer should bump
+      // their connect_seq and retry: this is not a connection race
+      // we need to resolve here.
+      if (existing->state == STATE_OPEN ||
+          existing->state == STATE_STANDBY) {
+        ldout(async_msgr->cct, 10) << __func__ << " accept connection race, existing " << existing
+                             << ".cseq " << existing->connect_seq << " == "
+                             << connect.connect_seq << ", OPEN|STANDBY, RETRY_SESSION" << dendl;
+        reply.connect_seq = existing->connect_seq + 1;
+        return _reply_accept(CEPH_MSGR_TAG_RETRY_SESSION, connect, reply, authorizer_reply);
+      }
+
+      // connection race?
+      if (peer_addr < async_msgr->get_myaddr() || existing->policy.server) {
+        // incoming wins
+        ldout(async_msgr->cct, 10) << __func__ << " accept connection race, existing " << existing
+                             << ".cseq " << existing->connect_seq << " == " << connect.connect_seq
+                             << ", or we are server, replacing my attempt" << dendl;
+        goto replace;
+      } else {
+        // our existing outgoing wins
+        ldout(async_msgr->cct,10) << __func__ << "accept connection race, existing "
+                            << existing << ".cseq " << existing->connect_seq
+                            << " == " << connect.connect_seq << ", sending WAIT" << dendl;
+        assert(peer_addr > async_msgr->get_myaddr());
+        // make sure our outgoing connection will follow through
+        existing->_send_keepalive_or_ack();
+        return _reply_accept(CEPH_MSGR_TAG_WAIT, connect, reply, authorizer_reply);
+      }
+    }
+
+    assert(connect.connect_seq > existing->connect_seq);
+    assert(connect.global_seq >= existing->peer_global_seq);
+    if (policy.resetcheck &&   // RESETSESSION only used by servers; peers do not reset each other
+        existing->connect_seq == 0) {
+      ldout(async_msgr->cct, 0) << __func__ << "accept we reset (peer sent cseq "
+                          << connect.connect_seq << ", " << existing << ".cseq = "
+                          << existing->connect_seq << "), sending RESETSESSION" << dendl;
+      return _reply_accept(CEPH_MSGR_TAG_RESETSESSION, connect, reply, authorizer_reply);
+    }
+
+    // reconnect
+    ldout(async_msgr->cct, 10) << __func__ << " accept peer sent cseq " << connect.connect_seq
+                         << " > " << existing->connect_seq << dendl;
+    goto replace;
+  } // existing
+  else if (policy.resetcheck && connect.connect_seq > 0) {
+    // we reset, and they are opening a new session
+    ldout(async_msgr->cct, 0) << __func__ << "accept we reset (peer sent cseq "
+                        << connect.connect_seq << "), sending RESETSESSION" << dendl;
+    return _reply_accept(CEPH_MSGR_TAG_RESETSESSION, connect, reply, authorizer_reply);
+  } else {
+    // new session
+    ldout(async_msgr->cct,10) << __func__ << "accept new session" << dendl;
+    existing = NULL;
+    goto open;
+  }
+  assert(0);
+
+ replace:
+  // if it is a hard reset from peer, we don't need a round-trip to negotiate in/out sequence
+  if ((connect.features & CEPH_FEATURE_RECONNECT_SEQ) && !is_reset_from_peer) {
+    reply_tag = CEPH_MSGR_TAG_SEQ;
+    existing_seq = existing->in_seq;
+  }
+  ldout(async_msgr->cct, 10) << __func__ << " accept replacing " << existing << dendl;
+  existing->mark_down();
+
+  // In order to avoid dead lock, here need to lock in ordering.
+  // It may be another thread access this connection between unlock and lock
+  // call, this is rely to EventCenter to guarantee only one thread can access
+  // one connection.
+  lock.Unlock();
+  if (existing->sd > sd) {
+    existing->lock.Lock();
+    lock.Lock();
+  } else {
+    lock.Lock();
+    existing->lock.Lock();
+  }
+  if (existing->policy.lossy) {
+    // disconnect from the Connection
+    async_msgr->ms_deliver_handle_reset(existing.get());
+  } else {
+    // queue a reset on the new connection, which we're dumping for the old
+    async_msgr->ms_deliver_handle_reset(this);
+
+    // reset the in_seq if this is a hard reset from peer,
+    // otherwise we respect our original connection's value
+    if (is_reset_from_peer)
+      existing->in_seq = 0;
+
+    // Clean up output buffer
+    existing->outcoming_bl.clear();
+    existing->requeue_sent();
+    reply.connect_seq = existing->connect_seq + 1;
+    if (_reply_accept(CEPH_MSGR_TAG_RETRY_SESSION, connect, reply, authorizer_reply) < 0)
+      goto fail;
+
+    uint64_t s = existing->sd;
+    existing->sd = sd;
+    sd = s;
+    existing->state = STATE_ACCEPTING_WAIT_CONNECT_MSG;
+    _stop();
+    existing->lock.Unlock();
+    return 0;
+  }
+  existing->lock.Unlock();
+
+ open:
+  connect_seq = connect.connect_seq + 1;
+  peer_global_seq = connect.global_seq;
+  ldout(async_msgr->cct, 10) << __func__ << " accept success, connect_seq = "
+                       << connect_seq << ", sending READY" << dendl;
+
+  // send READY reply
+  reply.tag = (reply_tag ? reply_tag : CEPH_MSGR_TAG_READY);
+  reply.features = policy.features_supported;
+  reply.global_seq = async_msgr->get_global_seq();
+  reply.connect_seq = connect_seq;
+  reply.flags = 0;
+  reply.authorizer_len = authorizer_reply.length();
+  if (policy.lossy)
+    reply.flags = reply.flags | CEPH_MSG_CONNECT_LOSSY;
+
+  set_features((uint64_t)reply.features & (uint64_t)connect.features);
+  ldout(async_msgr->cct, 10) << __func__ << " accept features " << get_features() << dendl;
+
+  session_security.reset(
+      get_auth_session_handler(async_msgr->cct, connect.authorizer_protocol,
+                               session_key, get_features()));
+
+  // notify
+  async_msgr->ms_deliver_handle_accept(this);
+  async_msgr->ms_deliver_handle_fast_accept(this);
+
+  // ok!
+  async_msgr->accept_conn(this);
+
+  reply_bl.append((char*)&reply, sizeof(reply));
+
+  if (reply.authorizer_len)
+    reply_bl.append(authorizer_reply.c_str(), authorizer_reply.length());
+
+  int next_state;
+
+  if (reply_tag == CEPH_MSGR_TAG_SEQ) {
+    reply_bl.append((char*)&existing_seq, sizeof(existing_seq));
+    next_state = STATE_ACCEPTING_WAIT_SEQ;
+  } else {
+    next_state = STATE_ACCEPTING_READY;
+    discard_requeued_up_to(0);
+  }
+
+  r = _try_send(reply_bl);
+  if (r < 0) {
+    goto fail;
+  }
+
+  if (r == 0) {
+    state = next_state;
+    ldout(async_msgr->cct, 2) << __func__ << " accept write reply msg done" << dendl;
+  } else {
+    state = STATE_WAIT_SEND;
+    state_after_send = next_state;
+  }
+
+  return 0;
+
+ fail:
+  return -1;
+}
+
// Kick off (or restart) an outgoing connection attempt: flip the state
// machine to STATE_CONNECTING and schedule the read handler.  The actual
// socket creation and banner exchange happen asynchronously in the event
// thread.
void AsyncConnection::_connect()
{
  ldout(async_msgr->cct, 10) << __func__ << " " << connect_seq << dendl;

  state = STATE_CONNECTING;
  // Dispatch through the event center instead of calling process() directly:
  // _connect() may be invoked from an external thread (e.g. send_message),
  // and processing inline there could create lock dependencies.
  center->dispatch_event_external(read_handler);
}
+
// Adopt an already-established incoming socket and begin the accept-side
// handshake.  `incoming` is the connected fd handed over by the listening
// messenger.
void AsyncConnection::accept(int incoming)
{
  ldout(async_msgr->cct, 10) << __func__ << " " << incoming << dendl;
  assert(sd < 0);  // must not already own a socket

  sd = incoming;
  state = STATE_ACCEPTING;
  center->create_file_event(sd, EVENT_READABLE, read_handler);
  // NOTE(review): the original comment here said "reschedule to avoid lock
  // dep", but process() is called directly — presumably accept() is only
  // ever invoked from the event-center thread; confirm before relying on it.
  process();
}
+
+int AsyncConnection::send_message(Message *m)
+{
+  ldout(async_msgr->cct, 10) << __func__ << dendl;
+  m->get_header().src = async_msgr->get_myname();
+  if (!m->get_priority())
+    m->set_priority(async_msgr->get_default_send_priority());
+
+  Mutex::Locker l(lock);
+  if (!is_queued() && state >= STATE_OPEN && state <= STATE_OPEN_TAG_CLOSE) {
+    ldout(async_msgr->cct, 10) << __func__ << " try send msg " << m << dendl;
+    int r = _send(m);
+    if (r < 0) {
+      ldout(async_msgr->cct, 1) << __func__ << " send msg failed" << dendl;
+      // we want to handle fault within internal thread
+      center->dispatch_event_external(write_handler);
+    }
+  } else {
+    out_q[m->get_priority()].push_back(m);
+    if ((state == STATE_STANDBY || state == STATE_CLOSED) && !policy.server) {
+      ldout(async_msgr->cct, 10) << __func__ << " state is " << get_state_name(state)
+                                 << " policy.server is false" << dendl;
+      _connect();
+    } else if (sd > 0 && !open_write) {
+      center->dispatch_event_external(write_handler);
+    }
+  }
+  return 0;
+}
+
+void AsyncConnection::requeue_sent()
+{
+  if (sent.empty())
+    return;
+
+  list<Message*>& rq = out_q[CEPH_MSG_PRIO_HIGHEST];
+  while (!sent.empty()) {
+    Message *m = sent.back();
+    sent.pop_back();
+    ldout(async_msgr->cct, 10) << __func__ << " " << *m << " for resend seq " << out_seq
+                         << " (" << m->get_seq() << ")" << dendl;
+    rq.push_front(m);
+    out_seq--;
+  }
+}
+
+void AsyncConnection::discard_requeued_up_to(uint64_t seq)
+{
+  ldout(async_msgr->cct, 10) << __func__ << " " << seq << dendl;
+  if (out_q.count(CEPH_MSG_PRIO_HIGHEST) == 0)
+    return;
+  list<Message*>& rq = out_q[CEPH_MSG_PRIO_HIGHEST];
+  while (!rq.empty()) {
+    Message *m = rq.front();
+    if (m->get_seq() == 0 || m->get_seq() > seq)
+      break;
+    ldout(async_msgr->cct, 10) << __func__ << " " << *m << " for resend seq " << out_seq
+                         << " <= " << seq << ", discarding" << dendl;
+    m->put();
+    rq.pop_front();
+    out_seq++;
+  }
+  if (rq.empty())
+    out_q.erase(CEPH_MSG_PRIO_HIGHEST);
+}
+
+/*
+ * Tears down the AsyncConnection's outgoing message queues: drops one
+ * reference on every message in 'sent' and 'out_q', then clears both.
+ * Caller must hold this connection's 'lock'.  (The old wording about
+ * pipe_lock/DispatchQueue came from SimpleMessenger's Pipe.)
+ */
+void AsyncConnection::discard_out_queue()
+{
+  ldout(async_msgr->cct, 10) << __func__ << " " << dendl;
+
+  for (list<Message*>::iterator p = sent.begin(); p != sent.end(); ++p) {
+    ldout(async_msgr->cct, 20) << __func__ << " discard " << *p << dendl;
+    (*p)->put();
+  }
+  sent.clear();
+  for (map<int,list<Message*> >::iterator p = out_q.begin(); p != out_q.end(); ++p)
+    for (list<Message*>::iterator r = p->second.begin(); r != p->second.end(); ++r) {
+      ldout(async_msgr->cct, 20) << __func__ << " discard " << *r << dendl;
+      (*r)->put();
+    }
+  out_q.clear();
+}
+
+// Choose the starting outbound sequence number.  When MSG_AUTH is negotiated,
+// randomize it (masked to 31 bits by SEQ_MASK) so message CRCs are not
+// predictable; otherwise start at 0 as older peers expect.
+// Returns get_random_bytes()'s error code (0 on success).
+int AsyncConnection::randomize_out_seq()
+{
+  if (get_features() & CEPH_FEATURE_MSG_AUTH) {
+    // Set out_seq to a random value, so CRC won't be predictable.   Don't bother checking seq_error
+    // here.  We'll check it on the call.  PLR
+    int seq_error = get_random_bytes((char *)&out_seq, sizeof(out_seq));
+    out_seq &= SEQ_MASK;
+    lsubdout(async_msgr->cct, ms, 10) << __func__ << "randomize_out_seq " << out_seq << dendl;
+    return seq_error;
+  } else {
+    // previously, seq #'s always started at 0.
+    out_seq = 0;
+    return 0;
+  }
+}
+
+// Common failure path.  Tears down the socket, requeues written-but-unacked
+// messages, then either stops the connection (lossy policy), parks it in
+// STANDBY, or schedules a reconnect via a time event on the owning
+// EventCenter, using exponential backoff while in STATE_CONNECTING.
+void AsyncConnection::fault()
+{
+  if (state == STATE_CLOSED) {
+    ldout(async_msgr->cct, 10) << __func__ << " state is already STATE_CLOSED" << dendl;
+    return ;
+  }
+
+  if (policy.lossy && state != STATE_CONNECTING) {
+    ldout(async_msgr->cct, 10) << __func__ << " on lossy channel, failing" << dendl;
+    _stop();
+    return ;
+  }
+
+  if (sd >= 0) {
+    shutdown_socket();
+    center->delete_file_event(sd, EVENT_READABLE|EVENT_WRITABLE);
+  }
+  open_write = false;
+
+  // requeue sent items
+  requeue_sent();
+  outcoming_bl.clear();
+  if (policy.standby && !is_queued()) {
+    ldout(async_msgr->cct,0) << __func__ << " with nothing to send, going to standby" << dendl;
+    state = STATE_STANDBY;
+    return;
+  }
+
+  if (state != STATE_CONNECTING) {
+    // policy maybe empty when state is in accept
+    if (policy.server || (state >= STATE_ACCEPTING && state < STATE_ACCEPTING_WAIT_SEQ)) {
+      ldout(async_msgr->cct, 0) << __func__ << " server, going to standby" << dendl;
+      state = STATE_STANDBY;
+    } else {
+      ldout(async_msgr->cct, 0) << __func__ << " initiating reconnect" << dendl;
+      connect_seq++;
+      state = STATE_CONNECTING;
+    }
+    backoff = utime_t();
+  } else {
+    if (backoff == utime_t()) {
+      backoff.set_from_double(async_msgr->cct->_conf->ms_initial_backoff);
+    } else {
+      // exponential backoff: double the delay, capped at ms_max_backoff
+      backoff += backoff;
+      if (backoff > async_msgr->cct->_conf->ms_max_backoff)
+        backoff.set_from_double(async_msgr->cct->_conf->ms_max_backoff);
+    }
+    ldout(async_msgr->cct, 10) << __func__ << " waiting " << backoff << dendl;
+  }
+
+  // woke up again;
+  center->create_time_event(backoff, read_handler);
+}
+
+// Reset per-session state after the peer reports a session reset: drop all
+// queued outgoing data, notify dispatchers via remote_reset_handler, and
+// re-randomize out_seq.  NOTE(review): the log line below repeats the
+// function name and lacks a space after __func__ (cosmetic; left as-is).
+void AsyncConnection::was_session_reset()
+{
+  ldout(async_msgr->cct,10) << __func__ << "was_session_reset" << dendl;
+  discard_out_queue();
+  outcoming_bl.clear();
+
+  center->dispatch_event_external(remote_reset_handler);
+
+  if (randomize_out_seq()) {
+    lsubdout(async_msgr->cct,ms,15) << __func__ << " Could not get random bytes to set seq number for session reset; set seq number to " << out_seq << dendl;
+  }
+
+  in_seq = 0;
+  connect_seq = 0;
+  in_seq_acked = 0;
+}
+
+// Final teardown: notify dispatchers (reset_handler), shut the socket, drop
+// all queued data, and move to STATE_CLOSED.  For lossy connections the
+// session state is also reset via was_session_reset().
+void AsyncConnection::_stop()
+{
+  ldout(async_msgr->cct, 10) << __func__ << dendl;
+  center->dispatch_event_external(reset_handler);
+  shutdown_socket();
+  discard_out_queue();
+  outcoming_bl.clear();
+  if (policy.lossy)
+    was_session_reset();
+  open_write = false;
+  state = STATE_CLOSED;
+}
+
+// Encode and (attempt to) write one message.  Assigns the next out_seq,
+// keeps an extra ref on the 'sent' list for non-lossy policies (so it can be
+// resent after a fault), signs the message when session security is set up,
+// then hands the wire bytes to write_message().  Returns <0 on error, 0 when
+// fully written, >0 when the remainder stays buffered in outcoming_bl.
+int AsyncConnection::_send(Message *m)
+{
+  m->set_seq(++out_seq);
+  if (!policy.lossy) {
+    // put on sent list
+    sent.push_back(m); 
+    m->get();
+  }
+
+  // associate message with Connection (for benefit of encode_payload)
+  m->set_connection(this);
+
+  uint64_t features = get_features();
+  if (m->empty_payload())
+    ldout(async_msgr->cct, 20) << __func__ << " encoding " << m->get_seq() << " features " << features
+                         << " " << m << " " << *m << dendl;
+  else
+    ldout(async_msgr->cct, 20) << __func__ << " half-reencoding " << m->get_seq() << " features "
+                         << features << " " << m << " " << *m << dendl;
+
+  // encode and copy out of *m
+  m->encode(features, !async_msgr->cct->_conf->ms_nocrc);
+
+  // prepare everything
+  ceph_msg_header& header = m->get_header();
+  ceph_msg_footer& footer = m->get_footer();
+
+  // Now that we have all the crcs calculated, handle the
+  // digital signature for the message, if the AsyncConnection has session
+  // security set up.  Some session security options do not
+  // actually calculate and check the signature, but they should
+  // handle the calls to sign_message and check_signature.  PLR
+  if (session_security.get() == NULL) {
+    ldout(async_msgr->cct, 20) << __func__ << " no session security" << dendl;
+  } else {
+    if (session_security->sign_message(m)) {
+      ldout(async_msgr->cct, 20) << __func__ << " failed to sign seq # "
+                           << header.seq << "): sig = " << footer.sig << dendl;
+    } else {
+      ldout(async_msgr->cct, 20) << __func__ << " signed seq # " << header.seq
+                           << "): sig = " << footer.sig << dendl;
+    }
+  }
+
+  bufferlist blist = m->get_payload();
+  blist.append(m->get_middle());
+  blist.append(m->get_data());
+
+  ldout(async_msgr->cct, 20) << __func__ << " sending " << m->get_seq()
+                       << " " << m << dendl;
+  int rc = write_message(header, footer, blist);
+
+  if (rc < 0) {
+    ldout(async_msgr->cct, 1) << __func__ << " error sending " << m << ", "
+                        << cpp_strerror(errno) << dendl;
+  } else if (rc == 0) {
+    ldout(async_msgr->cct, 10) << __func__ << " sending " << m << " done." << dendl;
+  } else {
+    ldout(async_msgr->cct, 10) << __func__ << " sending " << m << " continuely." << dendl;
+  }
+  // drop the reference taken from the out_q; for non-lossy connections the
+  // 'sent' list still holds its own ref until the peer acks this seq
+  m->put();
+
+  return rc;
+}
+
+// Frame one message for the wire: tag byte, header (converted to the legacy
+// ceph_msg_header_old for peers without NOSRCADDR), payload/middle/data, and
+// footer (legacy old-format footer for peers without MSG_AUTH).  Returns
+// whatever _try_send() returns (<0 error, 0 done, >0 partially buffered).
+int AsyncConnection::write_message(ceph_msg_header& header, ceph_msg_footer& footer,
+                                  bufferlist& blist)
+{
+  bufferlist bl;
+  int ret;
+
+  // send tag
+  char tag = CEPH_MSGR_TAG_MSG;
+  bl.append(&tag, sizeof(tag));
+
+  // send envelope
+  ceph_msg_header_old oldheader;
+  if (has_feature(CEPH_FEATURE_NOSRCADDR)) {
+    bl.append((char*)&header, sizeof(header));
+  } else {
+    memcpy(&oldheader, &header, sizeof(header));
+    oldheader.src.name = header.src;
+    oldheader.src.addr = get_peer_addr();
+    oldheader.orig_src = oldheader.src;
+    oldheader.reserved = header.reserved;
+    oldheader.crc = ceph_crc32c(0, (unsigned char*)&oldheader,
+                                sizeof(oldheader) - sizeof(oldheader.crc));
+    bl.append((char*)&oldheader, sizeof(oldheader));
+  }
+
+  bl.claim_append(blist);
+
+  // send footer; if receiver doesn't support signatures, use the old footer format
+  ceph_msg_footer_old old_footer;
+  if (has_feature(CEPH_FEATURE_MSG_AUTH)) {
+    bl.append((char*)&footer, sizeof(footer));
+  } else {
+    old_footer.front_crc = footer.front_crc;
+    old_footer.middle_crc = footer.middle_crc;
+    old_footer.data_crc = footer.data_crc;
+    old_footer.flags = footer.flags;
+    bl.append((char*)&old_footer, sizeof(old_footer));
+  }
+
+  // send
+  // NOTE(review): the branch below is redundant — both paths return 'ret'.
+  ret = _try_send(bl);
+  if (ret < 0)
+    return ret;
+
+  return ret;
+}
+
+// Process an incoming ack: release our references to every message on the
+// 'sent' list whose sequence number the peer has now acknowledged.
+void AsyncConnection::handle_ack(uint64_t seq)
+{
+  lsubdout(async_msgr->cct, ms, 15) << __func__ << " got ack seq " << seq << dendl;
+  // trim sent list
+  while (!sent.empty() && sent.front()->get_seq() <= seq) {
+    Message *m = sent.front();
+    sent.pop_front();
+    lsubdout(async_msgr->cct, ms, 10) << __func__ << "reader got ack seq "
+                                << seq << " >= " << m->get_seq() << " on "
+                                << m << " " << *m << dendl;
+    m->put();
+  }
+}
+
+// Public entry: flag that a keepalive should go out and wake the event
+// thread; the actual bytes are written from handle_write().
+void AsyncConnection::send_keepalive()
+{
+  Mutex::Locker l(lock);
+  keepalive = true;
+  center->dispatch_event_external(write_handler);
+}
+
+// Append a keepalive (or, when 'ack' is set, a KEEPALIVE2_ACK carrying *tp)
+// to the send buffer.  Falls back to the legacy TAG_KEEPALIVE when the peer
+// lacks MSGR_KEEPALIVE2.  Caller must hold 'lock'.
+void AsyncConnection::_send_keepalive_or_ack(bool ack, utime_t *tp)
+{
+  assert(lock.is_locked());
+  bufferlist bl;
+
+  utime_t t = ceph_clock_now(async_msgr->cct);
+  struct ceph_timespec ts;
+  t.encode_timeval(&ts);
+  if (ack) {
+    assert(tp);
+    tp->encode_timeval(&ts);
+    bl.append(CEPH_MSGR_TAG_KEEPALIVE2_ACK);
+    bl.append((char*)&ts, sizeof(ts));
+  } else if (has_feature(CEPH_FEATURE_MSGR_KEEPALIVE2)) {
+    // NOTE(review): this 'ts' shadows the outer one that was already encoded
+    // above; harmless (same value is re-encoded) but worth cleaning up.
+    struct ceph_timespec ts;
+    t.encode_timeval(&ts);
+    bl.append(CEPH_MSGR_TAG_KEEPALIVE2);
+    bl.append((char*)&ts, sizeof(ts));
+  } else {
+    bl.append(CEPH_MSGR_TAG_KEEPALIVE);
+  }
+
+  ldout(async_msgr->cct, 10) << __func__ << " try send keepalive or ack" << dendl;
+  _try_send(bl, false);
+}
+
+// Event-thread write path.  While in the OPEN states: flush a pending
+// keepalive, drain the outgoing priority queues via _send(), and ack
+// received messages (TAG_ACK + in_seq).  Outside OPEN (and not CONNECTING)
+// it just flushes the buffered outcoming_bl.  Any send failure goes to
+// fault() for recovery.
+void AsyncConnection::handle_write()
+{
+  ldout(async_msgr->cct, 10) << __func__ << " started." << dendl;
+  Mutex::Locker l(lock);
+  bufferlist bl;
+  int r;
+  if (state >= STATE_OPEN && state <= STATE_OPEN_TAG_CLOSE) {
+    if (keepalive) {
+      _send_keepalive_or_ack();
+      keepalive = false;
+    }
+
+    while (1) {
+      Message *m = _get_next_outgoing();
+      if (!m)
+        break;
+
+      ldout(async_msgr->cct, 10) << __func__ << " try send msg " << m << dendl;
+      r = _send(m);
+      if (r < 0) {
+        ldout(async_msgr->cct, 1) << __func__ << " send msg failed" << dendl;
+        goto fail;
+      } else if (r > 0) {
+        // partial write: leave the rest in outcoming_bl and wait for the
+        // next writable event
+        break;
+      }
+    }
+
+    if (in_seq > in_seq_acked) {
+      ceph_le64 s;
+      s = in_seq;
+      bl.append(CEPH_MSGR_TAG_ACK);
+      bl.append((char*)&s, sizeof(s));
+      ldout(async_msgr->cct, 10) << __func__ << " try send msg ack" << dendl;
+      in_seq_acked = s;
+      _try_send(bl);
+    }
+  } else if (state != STATE_CONNECTING) {
+    // bl is empty here: this call just flushes any buffered outcoming_bl
+    r = _try_send(bl);
+    if (r < 0) {
+      ldout(async_msgr->cct, 1) << __func__ << " send outcoming bl failed" << dendl;
+      goto fail;
+    }
+  }
+
+  return ;
+ fail:
+  fault();
+}
diff --git a/src/msg/async/AsyncConnection.h b/src/msg/async/AsyncConnection.h
new file mode 100644 (file)
index 0000000..3c025ff
--- /dev/null
@@ -0,0 +1,249 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_MSG_ASYNCCONNECTION_H
+#define CEPH_MSG_ASYNCCONNECTION_H
+
+#include <list>
+#include <map>
+using namespace std;
+
+#include "common/Mutex.h"
+#include "include/buffer.h"
+
+#include "auth/AuthSessionHandler.h"
+#include "include/buffer.h"
+#include "Connection.h"
+#include "net_handler.h"
+#include "Event.h"
+#include "Messenger.h"
+
+class AsyncMessenger;
+
+/*
+ * AsyncConnection: one event-driven messenger connection.  Socket I/O runs
+ * on the owning EventCenter's thread; other threads hand work to it via
+ * dispatch_event_external.  Progress is tracked by the STATE_* machine
+ * declared below.
+ */
+class AsyncConnection : public Connection {
+  // Size of the msgvec scatter/gather array used when writing.
+  const static uint64_t IOV_LEN = 1024;
+
+  int read_bulk(int fd, char *buf, int len);
+  int do_sendmsg(struct msghdr &msg, int len, bool more);
+  // if "send" is false, it will only append bl to send buffer
+  // the main usage is avoid error happen outside messenger threads
+  int _try_send(bufferlist bl, bool send=true);
+  int _send(Message *m);
+  int read_until(uint64_t needed, bufferptr &p);
+  int _process_connection();
+  void _connect();
+  void _stop();
+  int handle_connect_reply(ceph_msg_connect &connect, ceph_msg_connect_reply &r);
+  int handle_connect_msg(ceph_msg_connect &m, bufferlist &aubl, bufferlist &bl);
+  void was_session_reset();
+  void fault();
+  void discard_out_queue();
+  void discard_requeued_up_to(uint64_t seq);
+  void requeue_sent();
+  int randomize_out_seq();
+  void handle_ack(uint64_t seq);
+  void _send_keepalive_or_ack(bool ack=false, utime_t *t=NULL);
+  int write_message(ceph_msg_header& header, ceph_msg_footer& footer, bufferlist& blist);
+  // Send a connect reply carrying 'tag' and go back to waiting for the next
+  // connect message.  Returns 0 on success, -1 if the write failed.
+  int _reply_accept(char tag, ceph_msg_connect &connect, ceph_msg_connect_reply &reply,
+                    bufferlist authorizer_reply) {
+    bufferlist reply_bl;
+    reply.tag = tag;
+    reply.features = ((uint64_t)connect.features & policy.features_supported) | policy.features_required;
+    reply.authorizer_len = authorizer_reply.length();
+    reply_bl.append((char*)&reply, sizeof(reply));
+    if (reply.authorizer_len) {
+      reply_bl.append(authorizer_reply.c_str(), authorizer_reply.length());
+    }
+    int r = _try_send(reply_bl);
+    if (r < 0)
+      return -1;
+
+    state = STATE_ACCEPTING_WAIT_CONNECT_MSG;
+    return 0;
+  }
+  // True when anything is still waiting to be written (queued messages or
+  // partially-written bytes in outcoming_bl).
+  bool is_queued() {
+    return !out_q.empty() || outcoming_bl.length();
+  }
+  void shutdown_socket() {
+    if (sd >= 0)
+      ::shutdown(sd, SHUT_RDWR);
+  }
+  // Pop the next message to send, highest priority first; drops empty
+  // priority buckets from out_q as it goes.  Returns NULL when nothing is
+  // queued.
+  Message *_get_next_outgoing() {
+    Message *m = 0;
+    while (!m && !out_q.empty()) {
+      map<int, list<Message*> >::reverse_iterator p = out_q.rbegin();
+      if (!p->second.empty()) {
+        m = p->second.front();
+        p->second.pop_front();
+      }
+      if (p->second.empty())
+        out_q.erase(p->first);
+    }
+    return m;
+  }
+ public:
+  AsyncConnection(CephContext *cct, AsyncMessenger *m, EventCenter *c);
+  ~AsyncConnection();
+
+  ostream& _conn_prefix(std::ostream *_dout);
+
+  bool is_connected() {
+    // FIXME?
+    return true;
+  }
+
+  // Only call when AsyncConnection first construct
+  void connect(const entity_addr_t& addr, int type) {
+    set_peer_type(type);
+    set_peer_addr(addr);
+    policy = msgr->get_policy(type);
+    _connect();
+  }
+  // Only call when AsyncConnection first construct
+  void accept(int sd);
+  int send_message(Message *m);
+
+  void send_keepalive();
+  void mark_down() {
+    Mutex::Locker l(lock);
+    _stop();
+  }
+  void mark_disposable() {
+    Mutex::Locker l(lock);
+    policy.lossy = true;
+  }
+
+ private:
+  enum {
+    STATE_NONE,
+    STATE_OPEN,
+    STATE_OPEN_KEEPALIVE2,
+    STATE_OPEN_KEEPALIVE2_ACK,
+    STATE_OPEN_TAG_ACK,
+    STATE_OPEN_MESSAGE_HEADER,
+    STATE_OPEN_MESSAGE_THROTTLE_MESSAGE,
+    STATE_OPEN_MESSAGE_THROTTLE_BYTES,
+    STATE_OPEN_MESSAGE_READ_FRONT,
+    STATE_OPEN_MESSAGE_READ_MIDDLE,
+    STATE_OPEN_MESSAGE_READ_DATA_PREPARE,
+    STATE_OPEN_MESSAGE_READ_DATA,
+    STATE_OPEN_MESSAGE_READ_FOOTER_AND_DISPATCH,
+    STATE_OPEN_TAG_CLOSE,
+    STATE_WAIT_SEND,
+    STATE_CONNECTING,
+    STATE_CONNECTING_WAIT_BANNER,
+    STATE_CONNECTING_WAIT_IDENTIFY_PEER,
+    STATE_CONNECTING_SEND_CONNECT_MSG,
+    STATE_CONNECTING_WAIT_CONNECT_REPLY,
+    STATE_CONNECTING_WAIT_CONNECT_REPLY_AUTH,
+    STATE_CONNECTING_WAIT_ACK_SEQ,
+    STATE_CONNECTING_READY,
+    STATE_ACCEPTING,
+    STATE_ACCEPTING_HANDLE_CONNECT,
+    STATE_ACCEPTING_WAIT_BANNER_ADDR,
+    STATE_ACCEPTING_WAIT_CONNECT_MSG,
+    STATE_ACCEPTING_WAIT_CONNECT_MSG_AUTH,
+    STATE_ACCEPTING_WAIT_SEQ,
+    STATE_ACCEPTING_READY,
+    STATE_STANDBY,
+    STATE_CLOSED,
+    STATE_WAIT,       // just wait for racing connection
+  };
+
+  // Map a STATE_* value to its printable name.
+  // NOTE(review): the table has a trailing "STATE_FAULT" entry with no
+  // matching enumerator — harmless, but keep the two lists in sync.
+  static const char *get_state_name(int state) {
+      const char* const statenames[] = {"STATE_NONE",
+                                        "STATE_OPEN",
+                                        "STATE_OPEN_KEEPALIVE2",
+                                        "STATE_OPEN_KEEPALIVE2_ACK",
+                                        "STATE_OPEN_TAG_ACK",
+                                        "STATE_OPEN_MESSAGE_HEADER",
+                                        "STATE_OPEN_MESSAGE_THROTTLE_MESSAGE",
+                                        "STATE_OPEN_MESSAGE_THROTTLE_BYTES",
+                                        "STATE_OPEN_MESSAGE_READ_FRONT",
+                                        "STATE_OPEN_MESSAGE_READ_MIDDLE",
+                                        "STATE_OPEN_MESSAGE_READ_DATA_PREPARE",
+                                        "STATE_OPEN_MESSAGE_READ_DATA",
+                                        "STATE_OPEN_MESSAGE_READ_FOOTER_AND_DISPATCH",
+                                        "STATE_OPEN_TAG_CLOSE",
+                                        "STATE_WAIT_SEND",
+                                        "STATE_CONNECTING",
+                                        "STATE_CONNECTING_WAIT_BANNER",
+                                        "STATE_CONNECTING_WAIT_IDENTIFY_PEER",
+                                        "STATE_CONNECTING_SEND_CONNECT_MSG",
+                                        "STATE_CONNECTING_WAIT_CONNECT_REPLY",
+                                        "STATE_CONNECTING_WAIT_CONNECT_REPLY_AUTH",
+                                        "STATE_CONNECTING_WAIT_ACK_SEQ",
+                                        "STATE_CONNECTING_READY",
+                                        "STATE_ACCEPTING",
+                                        "STATE_ACCEPTING_HANDLE_CONNECT",
+                                        "STATE_ACCEPTING_WAIT_BANNER_ADDR",
+                                        "STATE_ACCEPTING_WAIT_CONNECT_MSG",
+                                        "STATE_ACCEPTING_WAIT_CONNECT_MSG_AUTH",
+                                        "STATE_ACCEPTING_WAIT_SEQ",
+                                        "STATE_ACCEPTING_READY",
+                                        "STATE_STANDBY",
+                                        "STATE_CLOSED",
+                                        "STATE_WAIT",
+                                        "STATE_FAULT"};
+      return statenames[state];
+  }
+
+  CephContext *cc;
+  AsyncMessenger *async_msgr;
+  int global_seq;
+  __u32 connect_seq, peer_global_seq;
+  uint64_t out_seq;
+  uint64_t in_seq, in_seq_acked;
+  int state;
+  int state_after_send;
+  int sd;
+  int port;
+  Messenger::Policy policy;
+  map<int, list<Message*> > out_q;  // priority queue for outbound msgs
+  list<Message*> sent;
+  Mutex lock;
+  utime_t backoff;         // backoff time
+  bool open_write;
+  EventCallbackRef read_handler;
+  EventCallbackRef write_handler;
+  EventCallbackRef reset_handler;
+  EventCallbackRef remote_reset_handler;
+  bool keepalive;
+  struct iovec msgvec[IOV_LEN];
+
+  // This section holds temporary variables used during state transitions
+
+  // Open state
+  utime_t recv_stamp;
+  utime_t throttle_stamp;
+  uint64_t msg_left;
+  ceph_msg_header current_header;
+  bufferlist data_buf;
+  bufferlist::iterator data_blp;
+  bufferlist front, middle, data;
+  ceph_msg_connect connect_msg;
+  // Connecting state
+  bool got_bad_auth;
+  AuthAuthorizer *authorizer;
+  ceph_msg_connect_reply connect_reply;
+  // Accepting state
+  entity_addr_t socket_addr;
+  CryptoKey session_key;
+
+  // used only for local state, it will be overwritten on state transition
+  bufferptr state_buffer;
+  // used only by "read_until"
+  uint64_t state_offset;
+  bufferlist outcoming_bl;
+  NetHandler net;
+  EventCenter *center;
+  ceph::shared_ptr<AuthSessionHandler> session_security;
+
+ public:
+  // used by eventcallback
+  void handle_write();
+  void process();
+}; /* AsyncConnection */
+
+typedef boost::intrusive_ptr<AsyncConnection> AsyncConnectionRef;
+
+#endif
diff --git a/src/msg/async/AsyncMessenger.cc b/src/msg/async/AsyncMessenger.cc
new file mode 100644 (file)
index 0000000..ed8f04f
--- /dev/null
@@ -0,0 +1,678 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+
+#include <errno.h>
+#include <iostream>
+#include <fstream>
+#include <poll.h>
+
+#include "AsyncMessenger.h"
+
+#include "common/config.h"
+#include "common/Timer.h"
+#include "common/errno.h"
+#include "auth/Crypto.h"
+#include "include/Spinlock.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+// Log-prefix helpers selected by the dout_prefix macro above; the Processor
+// and Worker overloads ignore their pointer argument (it only selects the
+// overload).
+static ostream& _prefix(std::ostream *_dout, AsyncMessenger *m) {
+  return *_dout << "-- " << m->get_myaddr() << " ";
+}
+
+static ostream& _prefix(std::ostream *_dout, Processor *p) {
+  return *_dout << " Processor -- ";
+}
+
+static ostream& _prefix(std::ostream *_dout, Worker *w) {
+  return *_dout << "--";
+}
+
+// Deferred accept: queued on a Worker's EventCenter so that
+// conn->accept(fd) runs on the connection's own event thread.
+class C_handle_accept : public EventCallback {
+  AsyncConnectionRef conn;
+  int fd;
+
+ public:
+  C_handle_accept(AsyncConnectionRef c, int s): conn(c), fd(s) {}
+  void do_request(int id) {
+    conn->accept(fd);
+  }
+};
+
+// Deferred connect: queued on a Worker's EventCenter so that
+// conn->connect(addr, type) runs on the connection's own event thread.
+class C_handle_connect : public EventCallback {
+  AsyncConnectionRef conn;
+  const entity_addr_t addr;
+  int type;
+
+ public:
+  C_handle_connect(AsyncConnectionRef c, const entity_addr_t &d, int t)
+      :conn(c), addr(d), type(t) {}
+  void do_request(int id) {
+    conn->connect(addr, type);
+  }
+};
+
+
+/*******************
+ * Processor
+ */
+
+// Create, bind and listen on the server socket.  Binds the exact port from
+// bind_addr when given, otherwise scans [ms_bind_port_min, ms_bind_port_max]
+// skipping avoid_ports.  On success publishes the learned address (stamped
+// with our nonce) back to the messenger.  Returns 0 or -errno.
+// NOTE(review): the error paths after socket() return without closing
+// listen_sd, leaking the fd.
+int Processor::bind(const entity_addr_t &bind_addr, const set<int>& avoid_ports)
+{
+  const md_config_t *conf = msgr->cct->_conf;
+  // bind to a socket
+  ldout(msgr->cct, 10) << __func__ << dendl;
+
+  int family;
+  switch (bind_addr.get_family()) {
+  case AF_INET:
+  case AF_INET6:
+    family = bind_addr.get_family();
+    break;
+
+  default:
+    // bind_addr is empty
+    family = conf->ms_bind_ipv6 ? AF_INET6 : AF_INET;
+  }
+
+  /* socket creation */
+  listen_sd = ::socket(family, SOCK_STREAM, 0);
+  if (listen_sd < 0) {
+    lderr(msgr->cct) << __func__ << " unable to create socket: "
+                     << cpp_strerror(errno) << dendl;
+    return -errno;
+  }
+
+  // use whatever user specified (if anything)
+  entity_addr_t listen_addr = bind_addr;
+  listen_addr.set_family(family);
+
+  /* bind to port */
+  int rc = -1;
+  if (listen_addr.get_port()) {
+    // specific port
+
+    // reuse addr+port when possible
+    int on = 1;
+    rc = ::setsockopt(listen_sd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
+    if (rc < 0) {
+      lderr(msgr->cct) << __func__ << " unable to setsockopt: "
+                       << cpp_strerror(errno) << dendl;
+      return -errno;
+    }
+
+    rc = ::bind(listen_sd, (struct sockaddr *) &listen_addr.ss_addr(), listen_addr.addr_size());
+    if (rc < 0) {
+      lderr(msgr->cct) << __func__ << " unable to bind to " << listen_addr.ss_addr()
+                       << ": " << cpp_strerror(errno) << dendl;
+      return -errno;
+    }
+  } else {
+    // try a range of ports
+    for (int port = msgr->cct->_conf->ms_bind_port_min; port <= msgr->cct->_conf->ms_bind_port_max; port++) {
+      if (avoid_ports.count(port))
+        continue;
+      listen_addr.set_port(port);
+      rc = ::bind(listen_sd, (struct sockaddr *) &listen_addr.ss_addr(), listen_addr.addr_size());
+      if (rc == 0)
+        break;
+    }
+    if (rc < 0) {
+      lderr(msgr->cct) << __func__ << " unable to bind to " << listen_addr.ss_addr()
+                       << " on any port in range " << msgr->cct->_conf->ms_bind_port_min
+                       << "-" << msgr->cct->_conf->ms_bind_port_max
+                       << ": " << cpp_strerror(errno) << dendl;
+      return -errno;
+    }
+    ldout(msgr->cct,10) << __func__ << " bound on random port " << listen_addr << dendl;
+  }
+
+  // what port did we get?
+  socklen_t llen = sizeof(listen_addr.ss_addr());
+  rc = getsockname(listen_sd, (sockaddr*)&listen_addr.ss_addr(), &llen);
+  if (rc < 0) {
+    rc = -errno;
+    lderr(msgr->cct) << __func__ << " failed getsockname: " << cpp_strerror(rc) << dendl;
+    return rc;
+  }
+
+  ldout(msgr->cct, 10) << __func__ << " bound to " << listen_addr << dendl;
+
+  // listen!
+  rc = ::listen(listen_sd, 128);
+  if (rc < 0) {
+    rc = -errno;
+    lderr(msgr->cct) << __func__ << " unable to listen on " << listen_addr
+                     << ": " << cpp_strerror(rc) << dendl;
+    return rc;
+  }
+
+  msgr->set_myaddr(bind_addr);
+  if (bind_addr != entity_addr_t())
+    msgr->learned_addr(bind_addr);
+
+  if (msgr->get_myaddr().get_port() == 0) {
+    msgr->set_myaddr(listen_addr);
+  }
+  entity_addr_t addr = msgr->get_myaddr();
+  addr.nonce = nonce;
+  msgr->set_myaddr(addr);
+
+  msgr->init_local_connection();
+
+  ldout(msgr->cct,1) << __func__ << " bind my_inst.addr is " << msgr->get_myaddr() << dendl;
+  return 0;
+}
+
+// Rebind on a fresh port (current port added to the avoid set), bumping the
+// nonce so the new entity_addr_t is globally unique; restarts the accept
+// thread on success.  Returns bind()'s result.
+int Processor::rebind(const set<int>& avoid_ports)
+{
+  ldout(msgr->cct, 1) << __func__ << " rebind avoid " << avoid_ports << dendl;
+
+  entity_addr_t addr = msgr->get_myaddr();
+  set<int> new_avoid = avoid_ports;
+  new_avoid.insert(addr.get_port());
+  addr.set_port(0);
+
+  // adjust the nonce; we want our entity_addr_t to be truly unique.
+  nonce += 1000000;
+  msgr->my_inst.addr.nonce = nonce;
+  ldout(msgr->cct, 10) << __func__ << " new nonce " << nonce << " and inst " << msgr->my_inst << dendl;
+
+  ldout(msgr->cct, 10) << __func__ << " will try " << addr << " and avoid ports " << new_avoid << dendl;
+  int r = bind(addr, new_avoid);
+  if (r == 0)
+    start();
+  return r;
+}
+
+// Start the accept thread if we have a listening socket.
+// NOTE(review): the test uses 'listen_sd > 0' while the rest of the file
+// treats fd 0 as valid ('>= 0') — an fd of 0 would not start the thread.
+int Processor::start()
+{
+  ldout(msgr->cct, 1) << __func__ << " start" << dendl;
+
+  // start thread
+  if (listen_sd > 0)
+    create();
+
+  return 0;
+}
+
+// Accept-thread body: poll() the listening socket, accept each incoming
+// connection and hand the fd to the messenger (add_accept).  Exits on
+// poll error, socket error/hangup, 'done' being set, or 5 consecutive
+// accept failures.  NOTE(review): the "accepted incoming" log string is
+// missing a space after __func__.
+void *Processor::entry()
+{
+  ldout(msgr->cct, 10) << __func__ << " starting" << dendl;
+  int errors = 0;
+
+  struct pollfd pfd;
+  pfd.fd = listen_sd;
+  pfd.events = POLLIN | POLLERR | POLLNVAL | POLLHUP;
+  while (!done) {
+    ldout(msgr->cct, 20) << __func__ << " calling poll" << dendl;
+    int r = poll(&pfd, 1, -1);
+    if (r < 0)
+      break;
+    ldout(msgr->cct,20) << __func__ << " poll got " << r << dendl;
+
+    if (pfd.revents & (POLLERR | POLLNVAL | POLLHUP))
+      break;
+
+    ldout(msgr->cct,10) << __func__ << " pfd.revents=" << pfd.revents << dendl;
+    if (done) break;
+
+    // accept
+    entity_addr_t addr;
+    socklen_t slen = sizeof(addr.ss_addr());
+    int sd = ::accept(listen_sd, (sockaddr*)&addr.ss_addr(), &slen);
+    if (sd >= 0) {
+      errors = 0;
+      ldout(msgr->cct,10) << __func__ << "accepted incoming on sd " << sd << dendl;
+
+      msgr->add_accept(sd);
+    } else {
+      ldout(msgr->cct,0) << __func__ << " no incoming connection?  sd = " << sd
+                         << " errno " << errno << " " << cpp_strerror(errno) << dendl;
+      if (++errors > 4)
+        break;
+    }
+  }
+
+  ldout(msgr->cct,20) << __func__ << " closing" << dendl;
+  // don't close socket, in case we start up again?  blech.
+  if (listen_sd >= 0) {
+    ::close(listen_sd);
+    listen_sd = -1;
+  }
+  ldout(msgr->cct,10) << __func__ << " stopping" << dendl;
+  return 0;
+}
+
+// Stop the accept thread: shutdown() the listening socket to unblock
+// poll(), join the thread, then close the fd.  Resets 'done' so the
+// Processor can be started again (used by rebind()).
+void Processor::stop()
+{
+  done = true;
+  ldout(msgr->cct,10) << __func__ << dendl;
+
+  if (listen_sd >= 0) {
+    ::shutdown(listen_sd, SHUT_RDWR);
+  }
+
+  // wait for thread to stop before closing the socket, to avoid
+  // racing against fd re-use.
+  if (is_started()) {
+    join();
+  }
+
+  if (listen_sd >= 0) {
+    ::close(listen_sd);
+    listen_sd = -1;
+  }
+  done = false;
+}
+
+// Ask the worker's event loop to exit: set the flag and wake the
+// EventCenter so the loop notices it.
+void Worker::stop()
+{
+  ldout(msgr->cct, 10) << __func__ << dendl;
+  done = true;
+  center.wakeup();
+}
+
+// Worker-thread body: run the EventCenter loop until stop() sets 'done'.
+// The 30000000 argument is the process_events timeout (microseconds, i.e.
+// 30s per iteration).
+void *Worker::entry()
+{
+  ldout(msgr->cct, 10) << __func__ << " starting" << dendl;
+  int r;
+
+  while (!done) {
+    ldout(msgr->cct, 20) << __func__ << " calling event process" << dendl;
+
+    r = center.process_events(30000000);
+    if (r < 0) {
+      ldout(msgr->cct,20) << __func__ << " process events failed: "
+                          << cpp_strerror(errno) << dendl;
+      // TODO do something?
+    }
+  }
+
+  return 0;
+}
+
+/*******************
+ * AsyncMessenger
+ */
+
+// Construct the messenger: create ms_event_op_threads Worker event loops
+// (not yet running; start() launches them) and a loopback AsyncConnection
+// bound to the first worker.
+// NOTE(review): workers[0] assumes ms_event_op_threads >= 1 — confirm the
+// config option cannot be set to 0.
+AsyncMessenger::AsyncMessenger(CephContext *cct, entity_name_t name,
+                               string mname, uint64_t _nonce)
+  : SimplePolicyMessenger(cct, name,mname, _nonce),
+    conn_id(0),
+    processor(this, _nonce),
+    lock("AsyncMessenger::lock"),
+    nonce(_nonce), did_bind(false),
+    global_seq(0),
+    cluster_protocol(0), stopped(true)
+{
+  ceph_spin_init(&global_seq_lock);
+  for (int i = 0; i < cct->_conf->ms_event_op_threads; ++i) {
+    Worker *w = new Worker(this, cct);
+    workers.push_back(w);
+  }
+  local_connection = new AsyncConnection(cct, this, &workers[0]->center);
+  init_local_connection();
+}
+
+/**
+ * Destroy the AsyncMessenger. Pretty simple since all the work is done
+ * elsewhere.
+ * NOTE(review): the Worker objects allocated in the constructor are not
+ * deleted here — presumably freed elsewhere or leaked at shutdown; verify.
+ */
+AsyncMessenger::~AsyncMessenger()
+{
+  assert(!did_bind); // either we didn't bind or we shut down the Processor
+}
+
+// Called once dispatchers are registered: start accepting incoming
+// connections.
+void AsyncMessenger::ready()
+{
+  ldout(cct,10) << __func__ << " " << get_myaddr() << dendl;
+
+  lock.Lock();
+  processor.start();
+  lock.Unlock();
+}
+
+// Shut everything down: stop worker event loops, tear down all connections,
+// stop the accept thread, break the loopback ref cycle, and release wait().
+// NOTE(review): stop_cond.Signal() fires before 'stopped' is set and
+// without holding 'lock', so a concurrent wait() could miss the wakeup —
+// verify the intended ordering.  Also the log string lacks a space after
+// __func__.
+int AsyncMessenger::shutdown()
+{
+  ldout(cct,10) << __func__ << "shutdown " << get_myaddr() << dendl;
+  for (vector<Worker*>::iterator it = workers.begin(); it != workers.end(); ++it)
+    (*it)->stop();
+  mark_down_all();
+
+  // break ref cycles on the loopback connection
+  processor.stop();
+  local_connection->set_priv(NULL);
+  stop_cond.Signal();
+  stopped = true;
+  return 0;
+}
+
+
+// Bind the server socket to bind_addr (must be called before start()).
+// Returns the Processor's bind result; -1 if already started.
+int AsyncMessenger::bind(const entity_addr_t &bind_addr)
+{
+  lock.Lock();
+  if (started) {
+    ldout(cct,10) << __func__ << " already started" << dendl;
+    lock.Unlock();
+    return -1;
+  }
+  ldout(cct,10) << __func__ << " bind " << bind_addr << dendl;
+  lock.Unlock();
+
+  // bind to a socket
+  set<int> avoid_ports;
+  int r = processor.bind(bind_addr, avoid_ports);
+  if (r >= 0)
+    did_bind = true;
+  return r;
+}
+
+// Rebind on a new port after a failure: stop and join all workers, stop the
+// processor, drop every connection, then delegate to Processor::rebind.
+// NOTE(review): the workers are joined here but not visibly restarted in
+// this function — confirm they are re-created/re-started elsewhere.
+int AsyncMessenger::rebind(const set<int>& avoid_ports)
+{
+  ldout(cct,1) << __func__ << " rebind avoid " << avoid_ports << dendl;
+  assert(did_bind);
+  for (vector<Worker*>::iterator it = workers.begin(); it != workers.end(); ++it) {
+    (*it)->stop();
+    if ((*it)->is_started())
+      (*it)->join();
+  }
+
+  processor.stop();
+  mark_down_all();
+  return processor.rebind(avoid_ports);
+}
+
+// Start the messenger: mark it started, stamp the local address with our
+// nonce if we never bound, and launch every worker thread.
+int AsyncMessenger::start()
+{
+  lock.Lock();
+  ldout(cct,1) << __func__ << " start" << dendl;
+
+  // register at least one entity, first!
+  assert(my_inst.name.type() >= 0);
+
+  assert(!started);
+  started = true;
+  stopped = false;
+
+  if (!did_bind) {
+    my_inst.addr.nonce = nonce;
+    _init_local_connection();
+  }
+
+  for (vector<Worker*>::iterator it = workers.begin(); it != workers.end(); ++it)
+    (*it)->create();
+
+  lock.Unlock();
+  return 0;
+}
+
+// Block until shutdown() is called, then join the worker threads, stop the
+// accept thread, and close every remaining connection.
+void AsyncMessenger::wait()
+{
+  lock.Lock();
+  if (!started) {
+    lock.Unlock();
+    return;
+  }
+  if (!stopped)
+    stop_cond.Wait(lock);
+
+  for (vector<Worker*>::iterator it = workers.begin(); it != workers.end(); ++it)
+    (*it)->join();
+  lock.Unlock();
+
+  // done!  clean up.
+  ldout(cct,20) << __func__ << ": stopping processor thread" << dendl;
+  processor.stop();
+  did_bind = false;
+  ldout(cct,20) << __func__ << ": stopped processor thread" << dendl;
+
+  // close all pipes
+  lock.Lock();
+  {
+    ldout(cct, 10) << __func__ << ": closing pipes" << dendl;
+
+    while (!conns.empty()) {
+      AsyncConnectionRef p = conns.begin()->second;
+      _stop_conn(p);
+    }
+  }
+  lock.Unlock();
+
+  ldout(cct, 10) << __func__ << ": done." << dendl;
+  ldout(cct, 1) << __func__ << " complete." << dendl;
+  started = false;
+}
+
+// Wrap a freshly accepted fd in a new AsyncConnection, assigned to a worker
+// round-robin by conn_id; the accept handshake itself runs on that worker's
+// event thread via C_handle_accept.
+AsyncConnectionRef AsyncMessenger::add_accept(int sd)
+{
+  lock.Lock();
+  Worker *w = workers[conn_id % workers.size()];
+  AsyncConnectionRef conn = new AsyncConnection(cct, this, &w->center);
+  w->center.dispatch_event_external(EventCallbackRef(new C_handle_accept(conn, sd)));
+  accepting_conns.insert(conn);
+  conn_id++;
+  lock.Unlock();
+  return conn;
+}
+
+// Create and register an outgoing connection to 'addr' (caller must hold
+// 'lock' and must have checked no connection to addr exists).  Worker is
+// chosen round-robin by conn_id.
+AsyncConnectionRef AsyncMessenger::create_connect(const entity_addr_t& addr, int type)
+{
+  assert(lock.is_locked());
+  assert(addr != my_inst.addr);
+
+  ldout(cct, 10) << __func__ << " " << addr
+                 << ", creating connection and registering" << dendl;
+
+  // create connection
+  Worker *w = workers[conn_id % workers.size()];
+  AsyncConnectionRef conn = new AsyncConnection(cct, this, &w->center);
+  conn->connect(addr, type);
+  assert(!conns.count(addr));
+  conns[addr] = conn;
+  conn_id++;
+
+  return conn;
+}
+
+// Return the connection for 'dest': the loopback connection for ourselves,
+// an existing registered connection if present, otherwise a new outgoing
+// one via create_connect().
+ConnectionRef AsyncMessenger::get_connection(const entity_inst_t& dest)
+{
+  Mutex::Locker l(lock);
+  if (my_inst.addr == dest.addr) {
+    // local
+    return local_connection;
+  }
+
+  AsyncConnectionRef conn = _lookup_conn(dest.addr);
+  if (conn) {
+    ldout(cct, 10) << __func__ << " " << dest << " existing " << conn << dendl;
+  } else {
+    conn = create_connect(dest.addr, dest.name.type());
+    ldout(cct, 10) << __func__ << " " << dest << " new " << conn << dendl;
+  }
+
+  return conn;
+}
+
+ConnectionRef AsyncMessenger::get_loopback_connection()
+{
+  return local_connection;
+}
+
+int AsyncMessenger::_send_message(Message *m, const entity_inst_t& dest)
+{
+  ldout(cct, 1) << __func__ << "--> " << dest.name << " "
+                << dest.addr << " -- " << *m << " -- ?+"
+                << m->get_data().length() << " " << m << dendl;
+
+  if (dest.addr == entity_addr_t()) {
+    ldout(cct,0) << __func__ <<  " message " << *m
+                 << " with empty dest " << dest.addr << dendl;
+    m->put();
+    return -EINVAL;
+  }
+
+  AsyncConnectionRef conn = _lookup_conn(dest.addr);
+  submit_message(m, conn, dest.addr, dest.name.type());
+  return 0;
+}
+
+void AsyncMessenger::submit_message(Message *m, AsyncConnectionRef con,
+                                    const entity_addr_t& dest_addr, int dest_type)
+{
+  if (cct->_conf->ms_dump_on_send) {
+    m->encode(-1, true);
+    ldout(cct, 0) << __func__ << "submit_message " << *m << "\n";
+    m->get_payload().hexdump(*_dout);
+    if (m->get_data().length() > 0) {
+      *_dout << " data:\n";
+      m->get_data().hexdump(*_dout);
+    }
+    *_dout << dendl;
+    m->clear_payload();
+  }
+
+  // existing connection?
+  if (con) {
+    con->send_message(m);
+    return ;
+  }
+
+  // local?
+  if (my_inst.addr == dest_addr) {
+    // local
+    ldout(cct, 20) << __func__ << " " << *m << " local" << dendl;
+    m->set_connection(local_connection.get());
+    m->set_recv_stamp(ceph_clock_now(cct));
+    ms_fast_preprocess(m);
+    if (ms_can_fast_dispatch(m)) {
+      ms_fast_dispatch(m);
+    } else {
+      if (m->get_priority() >= CEPH_MSG_PRIO_LOW) {
+        ms_fast_dispatch(m);
+      } else {
+        ms_deliver_dispatch(m);
+      }
+    }
+
+    return;
+  }
+
+  // remote, no existing pipe.
+  const Policy& policy = get_policy(dest_type);
+  if (policy.server) {
+    ldout(cct, 20) << __func__ << " " << *m << " remote, " << dest_addr
+                   << ", lossy server for target type "
+                   << ceph_entity_type_name(dest_type) << ", no session, dropping." << dendl;
+    m->put();
+  } else {
+    ldout(cct,20) << __func__ << " " << *m << " remote, " << dest_addr << ", new pipe." << dendl;
+  }
+}
+
+/**
+ * If my_inst.addr doesn't have an IP set, this function
+ * will fill it in from the passed addr. Otherwise it does nothing and returns.
+ */
+void AsyncMessenger::set_addr_unknowns(entity_addr_t &addr)
+{
+  Mutex::Locker l(lock);
+  if (my_inst.addr.is_blank_ip()) {
+    int port = my_inst.addr.get_port();
+    my_inst.addr.addr = addr.addr;
+    my_inst.addr.set_port(port);
+    _init_local_connection();
+  }
+}
+
+int AsyncMessenger::send_keepalive(Connection *con)
+{
+  con->send_keepalive();
+  return 0;
+}
+
+void AsyncMessenger::mark_down_all()
+{
+  ldout(cct,1) << __func__ << " " << dendl;
+  lock.Lock();
+  for (set<AsyncConnectionRef>::iterator q = accepting_conns.begin();
+       q != accepting_conns.end(); ++q) {
+    AsyncConnectionRef p = *q;
+    ldout(cct, 5) << __func__ << " accepting_conn " << p << dendl;
+    p->mark_down();
+    p->get();
+    ms_deliver_handle_reset(p.get());
+  }
+  accepting_conns.clear();
+
+  while (!conns.empty()) {
+    ceph::unordered_map<entity_addr_t, AsyncConnectionRef>::iterator it = conns.begin();
+    AsyncConnectionRef p = it->second;
+    ldout(cct, 5) << __func__ << " " << it->first << " " << p << dendl;
+    conns.erase(it);
+    p->mark_down();
+    p->get();
+    ms_deliver_handle_reset(p.get());
+  }
+  lock.Unlock();
+}
+
+void AsyncMessenger::mark_down(const entity_addr_t& addr)
+{
+  lock.Lock();
+  AsyncConnectionRef p = _lookup_conn(addr);
+  if (p) {
+    ldout(cct, 1) << __func__ << " " << addr << " -- " << p << dendl;
+    _stop_conn(p);
+    p->get();
+    ms_deliver_handle_reset(p.get());
+  } else {
+    ldout(cct, 1) << __func__ << " " << addr << " -- pipe dne" << dendl;
+  }
+  lock.Unlock();
+}
+
+int AsyncMessenger::get_proto_version(int peer_type, bool connect)
+{
+  int my_type = my_inst.name.type();
+
+  // set reply protocol version
+  if (peer_type == my_type) {
+    // internal
+    return cluster_protocol;
+  } else {
+    // public
+    if (connect) {
+      switch (peer_type) {
+      case CEPH_ENTITY_TYPE_OSD: return CEPH_OSDC_PROTOCOL;
+      case CEPH_ENTITY_TYPE_MDS: return CEPH_MDSC_PROTOCOL;
+      case CEPH_ENTITY_TYPE_MON: return CEPH_MONC_PROTOCOL;
+      }
+    } else {
+      switch (my_type) {
+      case CEPH_ENTITY_TYPE_OSD: return CEPH_OSDC_PROTOCOL;
+      case CEPH_ENTITY_TYPE_MDS: return CEPH_MDSC_PROTOCOL;
+      case CEPH_ENTITY_TYPE_MON: return CEPH_MONC_PROTOCOL;
+      }
+    }
+  }
+  return 0;
+}
+
+void AsyncMessenger::learned_addr(const entity_addr_t &peer_addr_for_me)
+{
+  // be careful here: multiple threads may block here, and readers of
+  // my_inst.addr do NOT hold any lock.
+
+  // this always goes from true -> false under the protection of the
+  // mutex.  if it is already false, we need not retake the mutex at
+  // all.
+  lock.Lock();
+  entity_addr_t t = peer_addr_for_me;
+  t.set_port(my_inst.addr.get_port());
+  my_inst.addr.addr = t.addr;
+  ldout(cct, 1) << __func__ << " learned my addr " << my_inst.addr << dendl;
+  _init_local_connection();
+  lock.Unlock();
+}
diff --git a/src/msg/async/AsyncMessenger.h b/src/msg/async/AsyncMessenger.h
new file mode 100644 (file)
index 0000000..087f557
--- /dev/null
@@ -0,0 +1,395 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_ASYNCMESSENGER_H
+#define CEPH_ASYNCMESSENGER_H
+
+#include "include/types.h"
+#include "include/xlist.h"
+
+#include <list>
+#include <map>
+using namespace std;
+#include "include/unordered_map.h"
+#include "include/unordered_set.h"
+
+#include "common/Mutex.h"
+#include "include/atomic.h"
+#include "common/Cond.h"
+#include "common/Thread.h"
+#include "common/Throttle.h"
+
+#include "SimplePolicyMessenger.h"
+#include "include/assert.h"
+#include "DispatchQueue.h"
+#include "AsyncConnection.h"
+#include "Event.h"
+
+
+class AsyncMessenger;
+
+/**
+ * If the Messenger binds to a specific address, the Processor runs
+ * and listens for incoming connections.
+ */
+class Processor : public Thread {
+  AsyncMessenger *msgr;
+  bool done;
+  int listen_sd;
+  uint64_t nonce;
+
+  public:
+  Processor(AsyncMessenger *r, uint64_t n) : msgr(r), done(false), listen_sd(-1), nonce(n) {}
+
+  void *entry();
+  void stop();
+  int bind(const entity_addr_t &bind_addr, const set<int>& avoid_ports);
+  int rebind(const set<int>& avoid_port);
+  int start();
+  void accept();
+};
+
+class Worker : public Thread {
+  AsyncMessenger *msgr;
+  bool done;
+
+ public:
+  EventCenter center;
+  Worker(AsyncMessenger *m, CephContext *c): msgr(m), done(false), center(c) {
+    center.init(5000);
+  }
+  void *entry();
+  void stop();
+};
+
+
+/*
+ * This class handles transmission and reception of messages. Generally
+ * speaking, there are several major components:
+ *
+ * - Connection
+ *    Each logical session is associated with a Connection.
+ * - AsyncConnection
+ *    Each network connection is handled through a AsyncConnection, which handles
+ *    the input and output of each message.  There is normally a 1:1
+ *    relationship between AsyncConnection and Connection, but logical sessions may
+ *    get handed off between AsyncConnection when sockets reconnect or during
+ *    connection races.
+ * - IncomingQueue
+ *    Incoming messages are associated with an IncomingQueue, and there
+ *    is one such queue associated with each AsyncConnection.
+ * - DispatchQueue
+ *    IncomingQueues get queued in the DispatchQueue, which is responsible
+ *    for doing a round-robin sweep and processing them via a worker thread.
+ * - AsyncMessenger
+ *    It's the exterior class passed to the external message handler and
+ *    most of the API details.
+ *
+ * Lock ordering:
+ *
+ *   AsyncMessenger::lock
+ *       Pipe::pipe_lock
+ *           DispatchQueue::lock
+ *               IncomingQueue::lock
+ */
+
+class AsyncMessenger : public SimplePolicyMessenger {
+  // First we have the public Messenger interface implementation...
+public:
+  /**
+   * Initialize the AsyncMessenger!
+   *
+   * @param cct The CephContext to use
+   * @param name The name to assign ourselves
+   * _nonce A unique ID to use for this AsyncMessenger. It should not
+   * be a value that will be repeated if the daemon restarts.
+   */
+  AsyncMessenger(CephContext *cct, entity_name_t name,
+                 string mname, uint64_t _nonce);
+
+  /**
+   * Destroy the AsyncMessenger. Pretty simple since all the work is done
+   * elsewhere.
+   */
+  virtual ~AsyncMessenger();
+
+  /** @defgroup Accessors
+   * @{
+   */
+  void set_addr_unknowns(entity_addr_t& addr);
+
+  int get_dispatch_queue_len() {
+    return 0;
+  }
+
+  double get_dispatch_queue_max_age(utime_t now) {
+    return 0;
+  }
+  /** @} Accessors */
+
+  /**
+   * @defgroup Configuration functions
+   * @{
+   */
+  void set_cluster_protocol(int p) {
+    assert(!started && !did_bind);
+    cluster_protocol = p;
+  }
+
+  int bind(const entity_addr_t& bind_addr);
+  int rebind(const set<int>& avoid_ports);
+
+  /** @} Configuration functions */
+
+  /**
+   * @defgroup Startup/Shutdown
+   * @{
+   */
+  virtual int start();
+  virtual void wait();
+  virtual int shutdown();
+
+  /** @} // Startup/Shutdown */
+
+  /**
+   * @defgroup Messaging
+   * @{
+   */
+  virtual int send_message(Message *m, const entity_inst_t& dest) {
+          Mutex::Locker l(lock);
+
+    return _send_message(m, dest);
+  }
+
+  /** @} // Messaging */
+
+  /**
+   * @defgroup Connection Management
+   * @{
+   */
+  virtual ConnectionRef get_connection(const entity_inst_t& dest);
+  virtual ConnectionRef get_loopback_connection();
+  int send_keepalive(Connection *con);
+  virtual void mark_down(const entity_addr_t& addr);
+  virtual void mark_down_all();
+  /** @} // Connection Management */
+
+  /**
+   * @defgroup Inner classes
+   * @{
+   */
+
+  Connection *create_anon_connection() {
+    Mutex::Locker l(lock);
+    Worker *w = workers[conn_id % workers.size()];
+    conn_id++;
+    return new AsyncConnection(cct, this, &w->center);
+  }
+
+  /**
+   * @} // Inner classes
+   */
+
+protected:
+  /**
+   * @defgroup Messenger Interfaces
+   * @{
+   */
+  /**
+   * Start up the DispatchQueue thread once we have somebody to dispatch to.
+   */
+  virtual void ready();
+  /** @} // Messenger Interfaces */
+
+private:
+
+  /**
+   * @defgroup Utility functions
+   * @{
+   */
+
+  /**
+   * Create a connection associated with the given entity (of the given type).
+   * Initiate the connection. (This function returning does not guarantee
+   * connection success.)
+   *
+   * @param addr The address of the entity to connect to.
+   * @param type The peer type of the entity at the address.
+   * @param con An existing Connection to associate with the new connection. If
+   * NULL, it creates a new Connection.
+   * @param msg an initial message to queue on the new connection
+   *
+   * @return a pointer to the newly-created connection. Caller does not own a
+   * reference; take one if you need it.
+   */
+  AsyncConnectionRef create_connect(const entity_addr_t& addr, int type);
+
+  /**
+   * Queue up a Message for delivery to the entity specified
+   * by addr and dest_type.
+   * submit_message() is responsible for creating
+   * new AsyncConnection (and closing old ones) as necessary.
+   *
+   * @param m The Message to queue up. This function eats a reference.
+   * @param con The existing Connection to use, or NULL if you don't know of one.
+   * @param addr The address to send the Message to.
+   * @param dest_type The peer type of the address we're sending to
+   * just drop silently under failure.
+   */
+  void submit_message(Message *m, AsyncConnectionRef con,
+                      const entity_addr_t& dest_addr, int dest_type);
+
+  int _send_message(Message *m, const entity_inst_t& dest);
+
+ private:
+  vector<Worker*> workers;
+  int conn_id;
+
+  Processor processor;
+  friend class Processor;
+
+  /// overall lock used for AsyncMessenger data structures
+  Mutex lock;
+  // AsyncMessenger stuff
+  /// approximately unique ID set by the Constructor for use in entity_addr_t
+  uint64_t nonce;
+
+  /**
+   *  The following aren't lock-protected since you shouldn't be able to race
+   *  the only writers.
+   */
+
+  int listen_sd;
+  /**
+   *  false; set to true if the AsyncMessenger bound to a specific address;
+   *  and set false again by Accepter::stop().
+   */
+  bool did_bind;
+  /// counter for the global seq our connection protocol uses
+  __u32 global_seq;
+  /// lock to protect the global_seq
+  ceph_spinlock_t global_seq_lock;
+
+  /**
+   * hash map of addresses to Asyncconnection
+   *
+   * NOTE: a Asyncconnection* with state CLOSED may still be in the map but is considered
+   * invalid and can be replaced by anyone holding the msgr lock
+   */
+  ceph::unordered_map<entity_addr_t, AsyncConnectionRef> conns;
+
+  /**
+   * list of connection are in teh process of accepting
+   *
+   * These are not yet in the conns map.
+   */
+  // FIXME clear up
+  set<AsyncConnectionRef> accepting_conns;
+
+  /// internal cluster protocol version, if any, for talking to entities of the same type.
+  int cluster_protocol;
+
+  Cond  stop_cond;
+  bool stopped;
+
+  AsyncConnectionRef _lookup_conn(const entity_addr_t& k) {
+    assert(lock.is_locked());
+    ceph::unordered_map<entity_addr_t, AsyncConnectionRef>::iterator p = conns.find(k);
+    if (p == conns.end())
+      return NULL;
+    return p->second;
+  }
+
+  void _stop_conn(AsyncConnectionRef c) {
+    assert(lock.is_locked());
+    if (c) {
+      c->mark_down();
+      conns.erase(c->peer_addr);
+    }
+  }
+
+  void _init_local_connection() {
+    assert(lock.is_locked());
+    local_connection->peer_addr = my_inst.addr;
+    local_connection->peer_type = my_inst.name.type();
+    ms_deliver_handle_fast_connect(local_connection.get());
+  }
+
+
+public:
+
+  /// con used for sending messages to ourselves
+  ConnectionRef local_connection;
+
+  /**
+   * @defgroup AsyncMessenger internals
+   * @{
+   */
+  /**
+   * This wraps _lookup_conn.
+   */
+  AsyncConnectionRef lookup_conn(const entity_addr_t& k) {
+    Mutex::Locker l(lock);
+    return _lookup_conn(k);
+  }
+
+  void accept_conn(AsyncConnectionRef conn) {
+    Mutex::Locker l(lock);
+    conns[conn->peer_addr] = conn;
+    accepting_conns.erase(conn);
+  }
+
+  void learned_addr(const entity_addr_t &peer_addr_for_me);
+  AsyncConnectionRef add_accept(int sd);
+
+  /**
+   * This wraps ms_deliver_get_authorizer. We use it for AsyncConnection.
+   */
+  AuthAuthorizer *get_authorizer(int peer_type, bool force_new) {
+    return ms_deliver_get_authorizer(peer_type, force_new);
+  }
+
+  /**
+   * This wraps ms_deliver_verify_authorizer; we use it for AsyncConnection.
+   */
+  bool verify_authorizer(Connection *con, int peer_type, int protocol, bufferlist& auth, bufferlist& auth_reply,
+                         bool& isvalid, CryptoKey& session_key) {
+    return ms_deliver_verify_authorizer(con, peer_type, protocol, auth,
+                                        auth_reply, isvalid, session_key);
+  }
+  /**
+   * Increment the global sequence for this AsyncMessenger and return it.
+   * This is for the connect protocol, although it doesn't hurt if somebody
+   * else calls it.
+   *
+   * @return a global sequence ID that nobody else has seen.
+   */
+  __u32 get_global_seq(__u32 old=0) {
+    ceph_spin_lock(&global_seq_lock);
+    if (old > global_seq)
+      global_seq = old;
+    __u32 ret = ++global_seq;
+    ceph_spin_unlock(&global_seq_lock);
+    return ret;
+  }
+  /**
+   * Get the protocol version we support for the given peer type: either
+   * a peer protocol (if it matches our own), the protocol version for the
+   * peer (if we're connecting), or our protocol version (if we're accepting).
+   */
+  int get_proto_version(int peer_type, bool connect);
+
+  /**
+   * Fill in the address and peer type for the local connection, which
+   * is used for delivering messages back to ourself.
+   */
+  void init_local_connection() {
+    Mutex::Locker l(lock);
+    _init_local_connection();
+  }
+
+  /**
+   * @} // AsyncMessenger Internals
+   */
+} ;
+
+#endif /* CEPH_SIMPLEMESSENGER_H */
diff --git a/src/msg/async/Event.cc b/src/msg/async/Event.cc
new file mode 100644 (file)
index 0000000..2aa99bc
--- /dev/null
@@ -0,0 +1,311 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+#include <time.h>
+
+#include "common/errno.h"
+#include "Event.h"
+
+#ifdef HAVE_EPOLL
+#include "EventEpoll.h"
+#else
+#ifdef HAVE_KQUEUE
+#include "EventKqueue.h"
+#else
+#include "EventSelect.h"
+#endif
+#endif
+
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << "Event "
+
+class C_handle_notify : public EventCallback {
+ public:
+  C_handle_notify() {}
+  void do_request(int fd_or_id) {
+  }
+};
+
+int EventCenter::init(int n)
+{
+  // can't init multi times
+  assert(nevent == 0);
+#ifdef HAVE_EPOLL
+  driver = new EpollDriver(cct);
+#else
+#ifdef HAVE_KQUEUE
+  driver = new KqueueDriver(cct);
+#else
+  driver = new SelectDriver(cct);
+#endif
+#endif
+
+  if (!driver) {
+    lderr(cct) << __func__ << " failed to create event driver " << dendl;
+    return -1;
+  }
+
+  int r = driver->init(n);
+  if (r < 0) {
+    lderr(cct) << __func__ << " failed to init event driver." << dendl;
+    return r;
+  }
+
+  int fds[2];
+  if (pipe(fds) < 0) {
+    lderr(cct) << __func__ << " can't create notify pipe" << dendl;
+    return -1;
+  }
+
+  notify_receive_fd = fds[0];
+  notify_send_fd = fds[1];
+  file_events = (FileEvent *)malloc(sizeof(FileEvent)*n);
+  memset(file_events, 0, sizeof(FileEvent)*n);
+
+  nevent = n;
+  create_file_event(notify_receive_fd, EVENT_READABLE, EventCallbackRef(new C_handle_notify()));
+  return 0;
+}
+
+EventCenter::~EventCenter()
+{
+  if (driver)
+    delete driver;
+
+  if (notify_receive_fd > 0)
+    ::close(notify_receive_fd);
+  if (notify_send_fd > 0)
+    ::close(notify_send_fd);
+}
+
+int EventCenter::create_file_event(int fd, int mask, EventCallbackRef ctxt)
+{
+  int r;
+  if (fd > nevent) {
+    int new_size = nevent << 2;
+    while (fd > new_size)
+      new_size <<= 2;
+    ldout(cct, 10) << __func__ << " event count exceed " << nevent << ", expand to " << new_size << dendl;
+    r = driver->resize_events(new_size);
+    if (r < 0) {
+      lderr(cct) << __func__ << " event count is exceed." << dendl;
+      return -ERANGE;
+    }
+    FileEvent *new_events = (FileEvent *)realloc(file_events, sizeof(FileEvent)*new_size);
+    if (!new_events) {
+      lderr(cct) << __func__ << " failed to realloc file_events" << cpp_strerror(errno) << dendl;
+      return -errno;
+    }
+    file_events = new_events;
+    nevent = new_size;
+  }
+
+  EventCenter::FileEvent *event = _get_file_event(fd);
+
+  r = driver->add_event(fd, event->mask, mask);
+  if (r < 0)
+    return r;
+
+  event->mask |= mask;
+  if (mask & EVENT_READABLE) {
+    event->read_cb = ctxt;
+  }
+  if (mask & EVENT_WRITABLE) {
+    event->write_cb = ctxt;
+  }
+  ldout(cct, 10) << __func__ << " create event fd=" << fd << " mask=" << mask
+                 << " now mask is " << event->mask << dendl;
+  return 0;
+}
+
+void EventCenter::delete_file_event(int fd, int mask)
+{
+  EventCenter::FileEvent *event = _get_file_event(fd);
+  if (!event->mask)
+    return ;
+
+  driver->del_event(fd, event->mask, mask);
+
+  if (mask & EVENT_READABLE && event->read_cb) {
+    event->read_cb.reset();
+  }
+  if (mask & EVENT_WRITABLE && event->write_cb) {
+    event->write_cb.reset();
+  }
+
+  event->mask = event->mask & (~mask);
+  ldout(cct, 10) << __func__ << " delete fd=" << fd << " mask=" << mask
+                 << " now mask is " << event->mask << dendl;
+}
+
+uint64_t EventCenter::create_time_event(uint64_t microseconds, EventCallbackRef ctxt)
+{
+  uint64_t id = time_event_next_id++;
+
+  ldout(cct, 10) << __func__ << " id=" << id << " trigger after " << microseconds << "us"<< dendl;
+  EventCenter::TimeEvent event;
+  utime_t expire;
+  struct timeval tv;
+
+  if (microseconds < 5) {
+    tv.tv_sec = 0;
+    tv.tv_usec = microseconds;
+  } else {
+    expire = ceph_clock_now(cct);
+    expire.copy_to_timeval(&tv);
+    tv.tv_sec += microseconds / 1000000;
+    tv.tv_usec += microseconds % 1000000;
+  }
+  expire.set_from_timeval(&tv);
+
+  event.id = id;
+  event.time_cb = ctxt;
+  time_events[expire].push_back(event);
+
+  return id;
+}
+
+void EventCenter::wakeup()
+{
+  ldout(cct, 1) << __func__ << dendl;
+  char buf[1];
+  buf[0] = 'c';
+  // wake up "event_wait"
+  int n = write(notify_send_fd, buf, 1);
+  // FIXME ?
+  assert(n == 1);
+}
+
+int EventCenter::process_time_events()
+{
+  int processed = 0;
+  time_t now = time(NULL);
+  utime_t cur = ceph_clock_now(cct);
+  ldout(cct, 10) << __func__ << " cur time is " << cur << dendl;
+
+  /* If the system clock is moved to the future, and then set back to the
+   * right value, time events may be delayed in a random way. Often this
+   * means that scheduled operations will not be performed soon enough.
+   *
+   * Here we try to detect system clock skews, and force all the time
+   * events to be processed ASAP when this happens: the idea is that
+   * processing events earlier is less dangerous than delaying them
+   * indefinitely, and practice suggests it is. */
+  if (now < last_time) {
+    map<utime_t, list<TimeEvent> > changed;
+    for (map<utime_t, list<TimeEvent> >::iterator it = time_events.begin();
+         it != time_events.end(); ++it) {
+      changed[utime_t()].swap(it->second);
+    }
+    time_events.swap(changed);
+  }
+  last_time = now;
+
+  map<utime_t, list<TimeEvent> >::iterator prev;
+  for (map<utime_t, list<TimeEvent> >::iterator it = time_events.begin();
+       it != time_events.end(); ) {
+    prev = it;
+    if (cur >= it->first) {
+      for (list<TimeEvent>::iterator j = it->second.begin();
+           j != it->second.end(); ++j) {
+        ldout(cct, 10) << __func__ << " process time event: id=" << j->id << " time is "
+                      << it->first << dendl;
+        j->time_cb->do_request(j->id);
+      }
+      processed++;
+      ++it;
+      time_events.erase(prev);
+    } else {
+      break;
+    }
+  }
+
+  return processed;
+}
+
+int EventCenter::process_events(int timeout_microseconds)
+{
+  struct timeval tv;
+  int numevents;
+  bool trigger_time = false;
+
+  utime_t period, shortest, now = ceph_clock_now(cct);
+  now.copy_to_timeval(&tv);
+  if (timeout_microseconds > 0) {
+    tv.tv_sec += timeout_microseconds / 1000000;
+    tv.tv_usec += timeout_microseconds % 1000000;
+  }
+  shortest.set_from_timeval(&tv);
+
+  {
+    map<utime_t, list<TimeEvent> >::iterator it = time_events.begin();
+    if (it != time_events.end() && shortest >= it->first) {
+      ldout(cct, 10) << __func__ << " shortest is " << shortest << " it->first is " << it->first << dendl;
+      shortest = it->first;
+      trigger_time = true;
+      if (shortest > now) {
+        period = now - shortest;
+        period.copy_to_timeval(&tv);
+      } else {
+        tv.tv_sec = 0;
+        tv.tv_usec = 0;
+      }
+    } else {
+      tv.tv_sec = timeout_microseconds / 1000000;
+      tv.tv_usec = timeout_microseconds % 1000000;
+    }
+  }
+
+  ldout(cct, 10) << __func__ << " wait second " << tv.tv_sec << " usec " << tv.tv_usec << dendl;
+  vector<FiredFileEvent> fired_events;
+  numevents = driver->event_wait(fired_events, &tv);
+  for (int j = 0; j < numevents; j++) {
+    int rfired = 0;
+    FileEvent *event = _get_file_event(fired_events[j].fd);
+    if (!event)
+      continue;
+
+    /* note the event->mask & mask & ... code: maybe an already processed
+    * event removed an element that fired and we still didn't
+    * processed, so we check if the event is still valid. */
+    if (event->mask & fired_events[j].mask & EVENT_READABLE) {
+      rfired = 1;
+      event->read_cb->do_request(fired_events[j].fd);
+    }
+    event = _get_file_event(fired_events[j].fd);
+    if (!event)
+      continue;
+
+    if (event->mask & fired_events[j].mask & EVENT_WRITABLE) {
+      if (!rfired || event->read_cb != event->write_cb)
+        event->write_cb->do_request(fired_events[j].fd);
+    }
+
+    ldout(cct, 20) << __func__ << " event_wq process is " << fired_events[j].fd << " mask is " << fired_events[j].mask << dendl;
+  }
+
+  if (trigger_time)
+    numevents += process_time_events();
+
+  {
+    lock.Lock();
+    while (!external_events.empty()) {
+      EventCallbackRef e = external_events.front();
+      external_events.pop_front();
+      lock.Unlock();
+      e->do_request(0);
+      lock.Lock();
+    }
+    lock.Unlock();
+  }
+  return numevents;
+}
+
+void EventCenter::dispatch_event_external(EventCallbackRef e)
+{
+  lock.Lock();
+  external_events.push_back(e);
+  lock.Unlock();
+  wakeup();
+}
diff --git a/src/msg/async/Event.h b/src/msg/async/Event.h
new file mode 100644 (file)
index 0000000..3b3e66b
--- /dev/null
@@ -0,0 +1,117 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_MSG_EVENT_H
+#define CEPH_MSG_EVENT_H
+
+#ifdef __APPLE__
+#include <AvailabilityMacros.h>
+#endif
+
+// We use epoll, kqueue, evport, select in descending order by performance.
+#if defined(__linux__)
+#define HAVE_EPOLL 1
+#endif
+
+#if (defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined (__NetBSD__)
+#define HAVE_KQUEUE 1
+#endif
+
+#ifdef __sun
+#include <sys/feature_tests.h>
+#ifdef _DTRACE_VERSION
+#define HAVE_EVPORT 1
+#endif
+#endif
+
+#include "include/Context.h"
+#include "include/unordered_map.h"
+#include "common/WorkQueue.h"
+
+#define EVENT_NONE 0
+#define EVENT_READABLE 1
+#define EVENT_WRITABLE 2
+
+class EventCenter;
+
+class EventCallback {
+
+ public:
+  virtual void do_request(int fd_or_id) = 0;
+  virtual ~EventCallback() {}       // we want a virtual destructor!!!
+};
+
+typedef ceph::shared_ptr<EventCallback> EventCallbackRef;
+
+struct FiredFileEvent {
+  int fd;
+  int mask;
+};
+
+class EventDriver {
+ public:
+  virtual ~EventDriver() {}       // we want a virtual destructor!!!
+  virtual int init(int nevent) = 0;
+  virtual int add_event(int fd, int cur_mask, int mask) = 0;
+  virtual void del_event(int fd, int cur_mask, int del_mask) = 0;
+  virtual int event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tp) = 0;
+  virtual int resize_events(int newsize) = 0;
+};
+
+class EventCenter {
+  struct FileEvent {
+    int mask;
+    EventCallbackRef read_cb;
+    EventCallbackRef write_cb;
+    FileEvent(): mask(0) {}
+  };
+
+  struct TimeEvent {
+    uint64_t id;
+    EventCallbackRef time_cb;
+
+    TimeEvent(): id(0) {}
+  };
+
+  CephContext *cct;
+  int nevent;
+  // Used only to external event
+  Mutex lock;
+  deque<EventCallbackRef> external_events;
+  FileEvent *file_events;
+  EventDriver *driver;
+  map<utime_t, list<TimeEvent> > time_events;
+  uint64_t time_event_next_id;
+  time_t last_time; // last time process time event
+  int notify_receive_fd;
+  int notify_send_fd;
+
+  int process_time_events();
+  FileEvent *_get_file_event(int fd) {
+    FileEvent *p = &file_events[fd];
+    if (!p->mask)
+      new(p) FileEvent();
+    return p;
+  }
+
+ public:
+  EventCenter(CephContext *c):
+    cct(c), nevent(0),
+    lock("AsyncMessenger::lock"),
+    driver(NULL), time_event_next_id(0),
+    notify_receive_fd(-1), notify_send_fd(-1) {
+    last_time = time(NULL);
+  }
+  ~EventCenter();
+  int init(int nevent);
+  // Used by internal thread
+  int create_file_event(int fd, int mask, EventCallbackRef ctxt);
+  uint64_t create_time_event(uint64_t milliseconds, EventCallbackRef ctxt);
+  void delete_file_event(int fd, int mask);
+  int process_events(int timeout_microseconds);
+  void wakeup();
+
+  // Used by external thread
+  void dispatch_event_external(EventCallbackRef e);
+};
+
+#endif
diff --git a/src/msg/async/EventEpoll.cc b/src/msg/async/EventEpoll.cc
new file mode 100644 (file)
index 0000000..1b7aa18
--- /dev/null
@@ -0,0 +1,114 @@
+#include "common/errno.h"
+#include "EventEpoll.h"
+
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << "EpollDriver."
+
+int EpollDriver::init(int nevent)
+{
+  events = (struct epoll_event*)malloc(sizeof(struct epoll_event)*nevent);
+  if (!events) {
+    lderr(cct) << __func__ << " unable to malloc memory: "
+                           << cpp_strerror(errno) << dendl;
+    return -errno;
+  }
+  memset(events, 0, sizeof(struct epoll_event)*nevent);
+
+  epfd = epoll_create(1024); /* 1024 is just an hint for the kernel */
+  if (epfd == -1) {
+    lderr(cct) << __func__ << " unable to do epoll_create: "
+                       << cpp_strerror(errno) << dendl;
+    return -errno;
+  }
+
+  size = nevent;
+
+  return 0;
+}
+
+int EpollDriver::add_event(int fd, int cur_mask, int add_mask)
+{
+  struct epoll_event ee;
+  /* If the fd was already monitored for some event, we need a MOD
+   * operation. Otherwise we need an ADD operation. */
+  int op;
+  op = cur_mask == EVENT_NONE ? EPOLL_CTL_ADD: EPOLL_CTL_MOD;
+
+  ee.events = EPOLLET;
+  add_mask |= cur_mask; /* Merge old events */
+  if (add_mask & EVENT_READABLE)
+    ee.events |= EPOLLIN;
+  if (add_mask & EVENT_WRITABLE)
+    ee.events |= EPOLLOUT;
+  ee.data.u64 = 0; /* avoid valgrind warning */
+  ee.data.fd = fd;
+  if (epoll_ctl(epfd, op, fd, &ee) == -1) {
+    lderr(cct) << __func__ << " unable to add event: "
+                       << cpp_strerror(errno) << dendl;
+    return -errno;
+  }
+
+  ldout(cct, 10) << __func__ << " add event to fd=" << fd << " mask=" << add_mask
+                 << dendl;
+  return 0;
+}
+
+void EpollDriver::del_event(int fd, int cur_mask, int delmask)
+{
+  struct epoll_event ee;
+  int mask = cur_mask & (~delmask);
+
+  ee.events = 0;
+  if (mask & EVENT_READABLE) ee.events |= EPOLLIN;
+  if (mask & EVENT_WRITABLE) ee.events |= EPOLLOUT;
+  ee.data.u64 = 0; /* avoid valgrind warning */
+  ee.data.fd = fd;
+  if (mask != EVENT_NONE) {
+    if (epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ee) < 0) {
+      lderr(cct) << __func__ << " epoll_ctl: modify fd=" << fd << " mask=" << mask
+                 << " failed." << cpp_strerror(errno) << dendl;
+    }
+  } else {
+    /* Note, Kernel < 2.6.9 requires a non null event pointer even for
+     * EPOLL_CTL_DEL. */
+    if (epoll_ctl(epfd, EPOLL_CTL_DEL, fd, &ee) < 0) {
+      lderr(cct) << __func__ << " epoll_ctl: delete fd=" << fd
+                 << " failed." << cpp_strerror(errno) << dendl;
+    }
+  }
+  ldout(cct, 10) << __func__ << " del event fd=" << fd << " cur mask=" << mask
+                 << dendl;
+}
+
+int EpollDriver::resize_events(int newsize)
+{
+  return 0;
+}
+
+int EpollDriver::event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tvp)
+{
+  int retval, numevents = 0;
+
+  retval = epoll_wait(epfd, events, size,
+                      tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1);
+  if (retval > 0) {
+    int j;
+
+    numevents = retval;
+    fired_events.resize(numevents);
+    for (j = 0; j < numevents; j++) {
+      int mask = 0;
+      struct epoll_event *e = events + j;
+
+      if (e->events & EPOLLIN) mask |= EVENT_READABLE;
+      if (e->events & EPOLLOUT) mask |= EVENT_WRITABLE;
+      if (e->events & EPOLLERR) mask |= EVENT_WRITABLE;
+      if (e->events & EPOLLHUP) mask |= EVENT_WRITABLE;
+      fired_events[j].fd = e->data.fd;
+      fired_events[j].mask = mask;
+    }
+  }
+  return numevents;
+}
diff --git a/src/msg/async/EventEpoll.h b/src/msg/async/EventEpoll.h
new file mode 100644 (file)
index 0000000..735acca
--- /dev/null
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_MSG_EVENTEPOLL_H
+#define CEPH_MSG_EVENTEPOLL_H
+
+#include <unistd.h>
+#include <sys/epoll.h>
+
+#include "Event.h"
+
+// epoll(7)-backed EventDriver implementation (Linux only).
+class EpollDriver : public EventDriver {
+  int epfd;                    // epoll instance fd; -1 until init() succeeds
+  struct epoll_event *events;  // buffer passed to epoll_wait(); freed with
+                               // free(), so presumably malloc'd in init() —
+                               // confirm in EventEpoll.cc
+  CephContext *cct;
+  int size;                    // capacity (in events) of the events buffer
+
+ public:
+  EpollDriver(CephContext *c): epfd(-1), events(NULL), cct(c) {}
+  virtual ~EpollDriver() {
+    if (epfd != -1)
+      close(epfd);
+
+    if (events)
+      free(events);
+  }
+
+  int init(int nevent);
+  int add_event(int fd, int cur_mask, int add_mask);
+  void del_event(int fd, int cur_mask, int del_mask);
+  int resize_events(int newsize);
+  int event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tp);
+};
+
+#endif
diff --git a/src/msg/async/net_handler.cc b/src/msg/async/net_handler.cc
new file mode 100644 (file)
index 0000000..e51b19e
--- /dev/null
@@ -0,0 +1,128 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+
+#include "net_handler.h"
+#include "common/errno.h"
+#include "common/debug.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "net_handler: "
+
+namespace ceph{
+
+// Create a TCP socket in the given address family, optionally with
+// SO_REUSEADDR set.  Returns the fd on success, -errno on failure.
+int NetHandler::create_socket(int domain, bool reuse_addr)
+{
+  int s, on = 1;
+
+  if ((s = ::socket(domain, SOCK_STREAM, 0)) == -1) {
+    int err = errno;  // save before logging, which may clobber errno
+    lderr(cct) << __func__ << " couldn't create socket " << cpp_strerror(err) << dendl;
+    return -err;
+  }
+
+  /* Make sure connection-intensive things like the benchmark
+   * will be able to close/open sockets a zillion of times */
+  if (reuse_addr) {
+    if (::setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) {
+      int err = errno;
+      lderr(cct) << __func__ << " setsockopt SO_REUSEADDR failed: "
+                 << cpp_strerror(err) << dendl;
+      ::close(s);  // original leaked the fd on this path
+      return -err;
+    }
+  }
+
+  return s;
+}
+
+// Put sd into non-blocking mode via fcntl F_GETFL/F_SETFL, preserving
+// any other flags already set.  Returns 0 on success, -errno on failure.
+int NetHandler::set_nonblock(int sd)
+{
+  int flags;
+
+  /* Set the socket nonblocking.
+   * Note that fcntl(2) for F_GETFL and F_SETFL can't be
+   * interrupted by a signal. */
+  if ((flags = fcntl(sd, F_GETFL)) < 0 ) {
+    int err = errno;  // save before logging, which may clobber errno
+    // original message carried a stray printf-style "%s" in the stream
+    lderr(cct) << __func__ << " fcntl(F_GETFL) failed: " << cpp_strerror(err) << dendl;
+    return -err;
+  }
+  if (fcntl(sd, F_SETFL, flags | O_NONBLOCK) < 0) {
+    int err = errno;
+    lderr(cct) << __func__ << " fcntl(F_SETFL,O_NONBLOCK) failed: " << cpp_strerror(err) << dendl;
+    return -err;
+  }
+
+  return 0;
+}
+
+// Best-effort per-socket tuning driven by ms_* config options.  Each
+// failure is logged at level 0 but deliberately not propagated — a
+// connection is still usable without these options.
+void NetHandler::set_socket_options(int sd)
+{
+  // disable Nagle algorithm?
+  if (cct->_conf->ms_tcp_nodelay) {
+    int flag = 1;
+    int r = ::setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, (char*)&flag, sizeof(flag));
+    if (r < 0) {
+      r = -errno;
+      ldout(cct, 0) << "couldn't set TCP_NODELAY: " << cpp_strerror(r) << dendl;
+    }
+  }
+  // cap the kernel receive buffer if the admin configured a size
+  if (cct->_conf->ms_tcp_rcvbuf) {
+    int size = cct->_conf->ms_tcp_rcvbuf;
+    int r = ::setsockopt(sd, SOL_SOCKET, SO_RCVBUF, (void*)&size, sizeof(size));
+    if (r < 0)  {
+      r = -errno;
+      ldout(cct, 0) << "couldn't set SO_RCVBUF to " << size << ": " << cpp_strerror(r) << dendl;
+    }
+  }
+
+  // block ESIGPIPE
+#ifdef CEPH_USE_SO_NOSIGPIPE
+  // platforms without MSG_NOSIGNAL (e.g. OS X) suppress SIGPIPE per-socket
+  int val = 1;
+  int r = ::setsockopt(sd, SOL_SOCKET, SO_NOSIGPIPE, (void*)&val, sizeof(val));
+  if (r) {
+    r = -errno;
+    ldout(cct,0) << "couldn't set SO_NOSIGPIPE: " << cpp_strerror(r) << dendl;
+  }
+#endif
+}
+
+// Create a socket and connect it to addr.  In nonblock mode an
+// in-progress connect (EINPROGRESS) is a success and the fd is returned
+// immediately; the caller must later check completion.  Returns the
+// connected fd, or -errno on failure (the fd is closed on all failure
+// paths).
+int NetHandler::generic_connect(const entity_addr_t& addr, bool nonblock)
+{
+  int ret;
+  int s = create_socket(addr.get_family());
+  if (s < 0)
+    return s;
+
+  if (nonblock) {
+    ret = set_nonblock(s);
+    if (ret < 0) {
+      ::close(s);  // original leaked the socket on this path
+      return ret;
+    }
+  }
+  ret = ::connect(s, (sockaddr*)&addr.addr, addr.addr_size());
+  if (ret < 0) {
+    // a nonblocking connect that is merely pending is not an error
+    if (errno == EINPROGRESS && nonblock)
+      return s;
+
+    int err = errno;  // save before logging, which may clobber errno
+    lderr(cct) << __func__ << " connect: " << cpp_strerror(err) << dendl;
+    ::close(s);
+    return -err;
+  }
+
+  set_socket_options(s);
+
+  return s;
+}
+
+// Blocking connect: returns a connected fd, or -errno.
+int NetHandler::connect(const entity_addr_t &addr)
+{
+  return generic_connect(addr, false);
+}
+
+// Non-blocking connect: returns the fd even while the connect is still
+// in progress (EINPROGRESS); the caller must check completion later.
+int NetHandler::nonblock_connect(const entity_addr_t &addr)
+{
+  return generic_connect(addr, true);
+}
+
+
+}
diff --git a/src/msg/async/net_handler.h b/src/msg/async/net_handler.h
new file mode 100644 (file)
index 0000000..bc8487a
--- /dev/null
@@ -0,0 +1,23 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_COMMON_NET_UTILS_H
+#define CEPH_COMMON_NET_UTILS_H
+#include "common/config.h"
+
+namespace ceph {
+  // Thin wrapper over raw BSD-socket calls (create / connect / option
+  // tuning) adding Ceph-style logging and -errno error returns.  Holds
+  // no state beyond a borrowed CephContext pointer.
+  class NetHandler {
+   private:
+    int create_socket(int domain, bool reuse_addr=false);
+    int generic_connect(const entity_addr_t& addr, bool nonblock);
+
+    CephContext *cct;
+   public:
+    NetHandler(CephContext *c): cct(c) {}
+    int set_nonblock(int sd);
+    void set_socket_options(int sd);
+    int connect(const entity_addr_t &addr);
+    int nonblock_connect(const entity_addr_t &addr);
+  };
+}
+
+#endif
diff --git a/src/msg/net_handler.cc b/src/msg/net_handler.cc
deleted file mode 100644 (file)
index e51b19e..0000000
+++ /dev/null
@@ -1,128 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab
-#include <sys/socket.h>
-#include <netinet/in.h>
-#include <netinet/tcp.h>
-#include <arpa/inet.h>
-
-#include "net_handler.h"
-#include "common/errno.h"
-#include "common/debug.h"
-
-#define dout_subsys ceph_subsys_ms
-#undef dout_prefix
-#define dout_prefix *_dout << "net_handler: "
-
-namespace ceph{
-
-int NetHandler::create_socket(int domain, bool reuse_addr)
-{
-  int s, on = 1;
-
-  if ((s = ::socket(domain, SOCK_STREAM, 0)) == -1) {
-    lderr(cct) << __func__ << " couldn't created socket " << cpp_strerror(errno) << dendl;
-    return -errno;
-  }
-
-  /* Make sure connection-intensive things like the benckmark
-   * will be able to close/open sockets a zillion of times */
-  if (reuse_addr) {
-    if (::setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) {
-      lderr(cct) << __func__ << " setsockopt SO_REUSEADDR failed: %s"
-                 << strerror(errno) << dendl;
-      return -errno;
-    }
-  }
-
-  return s;
-}
-
-int NetHandler::set_nonblock(int sd)
-{
-  int flags;
-
-  /* Set the socket nonblocking.
-   * Note that fcntl(2) for F_GETFL and F_SETFL can't be
-   * interrupted by a signal. */
-  if ((flags = fcntl(sd, F_GETFL)) < 0 ) {
-    lderr(cct) << __func__ << " fcntl(F_GETFL) failed: %s" << strerror(errno) << dendl;
-    return -errno;
-  }
-  if (fcntl(sd, F_SETFL, flags | O_NONBLOCK) < 0) {
-    lderr(cct) << __func__ << " fcntl(F_SETFL,O_NONBLOCK): %s" << strerror(errno) << dendl;
-    return -errno;
-  }
-
-  return 0;
-}
-
-void NetHandler::set_socket_options(int sd)
-{
-  // disable Nagle algorithm?
-  if (cct->_conf->ms_tcp_nodelay) {
-    int flag = 1;
-    int r = ::setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, (char*)&flag, sizeof(flag));
-    if (r < 0) {
-      r = -errno;
-      ldout(cct, 0) << "couldn't set TCP_NODELAY: " << cpp_strerror(r) << dendl;
-    }
-  }
-  if (cct->_conf->ms_tcp_rcvbuf) {
-    int size = cct->_conf->ms_tcp_rcvbuf;
-    int r = ::setsockopt(sd, SOL_SOCKET, SO_RCVBUF, (void*)&size, sizeof(size));
-    if (r < 0)  {
-      r = -errno;
-      ldout(cct, 0) << "couldn't set SO_RCVBUF to " << size << ": " << cpp_strerror(r) << dendl;
-    }
-  }
-
-  // block ESIGPIPE
-#ifdef CEPH_USE_SO_NOSIGPIPE
-  int val = 1;
-  int r = ::setsockopt(sd, SOL_SOCKET, SO_NOSIGPIPE, (void*)&val, sizeof(val));
-  if (r) {
-    r = -errno;
-    ldout(cct,0) << "couldn't set SO_NOSIGPIPE: " << cpp_strerror(r) << dendl;
-  }
-#endif
-}
-
-int NetHandler::generic_connect(const entity_addr_t& addr, bool nonblock)
-{
-  int ret;
-  int s = create_socket(addr.get_family());
-  if (s < 0)
-    return s;
-
-  if (nonblock) {
-    ret = set_nonblock(s);
-    if (ret < 0)
-      return ret;
-  }
-  ret = ::connect(s, (sockaddr*)&addr.addr, addr.addr_size());
-  if (ret < 0) {
-    if (errno == EINPROGRESS && nonblock)
-      return s;
-
-    lderr(cct) << __func__ << " connect: %s " << strerror(errno) << dendl;
-    close(s);
-    return -errno;
-  }
-
-  set_socket_options(s);
-
-  return s;
-}
-
-int NetHandler::connect(const entity_addr_t &addr)
-{
-  return generic_connect(addr, false);
-}
-
-int NetHandler::nonblock_connect(const entity_addr_t &addr)
-{
-  return generic_connect(addr, true);
-}
-
-
-}
diff --git a/src/msg/net_handler.h b/src/msg/net_handler.h
deleted file mode 100644 (file)
index bc8487a..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab
-#ifndef CEPH_COMMON_NET_UTILS_H
-#define CEPH_COMMON_NET_UTILS_H
-#include "common/config.h"
-
-namespace ceph {
-  class NetHandler {
-   private:
-    int create_socket(int domain, bool reuse_addr=false);
-    int generic_connect(const entity_addr_t& addr, bool nonblock);
-
-    CephContext *cct;
-   public:
-    NetHandler(CephContext *c): cct(c) {}
-    int set_nonblock(int sd);
-    void set_socket_options(int sd);
-    int connect(const entity_addr_t &addr);
-    int nonblock_connect(const entity_addr_t &addr);
-  };
-}
-
-#endif