]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
crimson/monc: fix send_message() racing with reopen_session(). 41364/head
authorRadoslaw Zarzynski <rzarzyns@redhat.com>
Mon, 17 May 2021 13:50:55 +0000 (13:50 +0000)
committerRadoslaw Zarzynski <rzarzyns@redhat.com>
Mon, 17 May 2021 14:44:36 +0000 (14:44 +0000)
The `send_message()` method is a high-level facility for
communicating with a monitor. If there is an active conn
available, it sends the message immediately; otherwise
the message is queued. This method assumes the queue is
already drained if the connection is available.

`active_con` is managed by `reopen_session()` where it's
first cleared and then reset after finding new alive mon.
This is followed by draining the `pending_messages` queue
which happens in `on_session_opened()` after the `MAuth`
exchange is finished.

Unfortunately, the path from the `active_con` clearing
to draining the queue is long and divided into multiple
continuations which results in lack of atomicity. When
e.g. `run_command()` interleaves the stages, following
crash happens:

```
INFO  2021-05-07 08:13:43,914 [shard 0] monc - do_auth_single: mon v2:172.21.15.82:6805/34166 => v2:172.21.15.82:3300/0 returns auth_reply(proto 2 0 (0) Success) v1: 0
ceph-osd: /home/jenkins-build/build/workspace/ceph-dev-new-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.0.0-3910-g1b18e076/rpm/el8/BUILD/ceph-17.0.0-3910-g1b18e076/src/crimson/mon/MonClient.cc:1034: seastar::future<> crimson::mon::Client::send_message(MessageRef): Assertion `pending_messages.empty()' failed.
Aborting on shard 0.
Backtrace:
 0# 0x000055CDE6DB532F in ceph-osd
 1# FatalSignal::signaled(int, siginfo_t const*) in ceph-osd
 2# FatalSignal::install_oneshot_signal_handler<6>()::{lambda(int, siginfo_t*, void*)#1}::_FUN(int, siginfo_t*, void*) in ceph-osd
 3# 0x00007FC1BF20BB20 in /lib64/libpthread.so.0
 4# gsignal in /lib64/libc.so.6
 5# abort in /lib64/libc.so.6
 6# 0x00007FC1BD806B09 in /lib64/libc.so.6
 7# 0x00007FC1BD814DE6 in /lib64/libc.so.6
 8# crimson::mon::Client::send_message(boost::intrusive_ptr<Message>) in ceph-osd
 9# crimson::mon::Client::renew_subs() in ceph-osd
10# 0x000055CDE764FB0B in ceph-osd
11# 0x000055CDE10457F0 in ceph-osd
12# 0x000055CDEA0AB88F in ceph-osd
13# 0x000055CDEA0B0DD0 in ceph-osd
14# 0x000055CDEA2689FB in ceph-osd
15# 0x000055CDE9DC0EDA in ceph-osd
16# main in ceph-osd
17# __libc_start_main in /lib64/libc.so.6
18# _start in ceph-osd
```

The problem caused following failure at Sepia:
http://pulpito.front.sepia.ceph.com/rzarzynski-2021-05-07_07:41:02-rados-master-distro-basic-smithi/6104549

Signed-off-by: Radoslaw Zarzynski <rzarzyns@redhat.com>
src/crimson/mon/MonClient.cc
src/crimson/mon/MonClient.h

index e5a50918c58b0207187747b7e5550152d2a1f66c..6234f7555bbaf09992f993cde01d5216c2794e92 100644 (file)
@@ -901,6 +901,7 @@ seastar::future<> Client::stop()
   logger().info("{}", __func__);
   auto fut = gate.close();
   timer.cancel();
+  ready_to_send = false;
   for (auto& pending_con : pending_conns) {
     pending_con->close();
   }
@@ -929,6 +930,7 @@ static entity_addr_t choose_client_addr(
 seastar::future<bool> Client::reopen_session(int rank)
 {
   logger().info("{} to mon.{}", __func__, rank);
+  ready_to_send = false;
   if (active_con) {
     active_con->close();
     active_con = nullptr;
@@ -1001,6 +1003,9 @@ void Client::_finish_auth(const entity_addr_t& peer)
   }
 
   ceph_assert(!active_con && !pending_conns.empty());
+  // It's too early to toggle the `ready_to_send` flag. It will
+  // be set atfer finishing the MAuth exchange and draining out
+  // the `pending_messages` queue.
   active_con = std::move(*found);
   *found = nullptr;
   for (auto& conn : pending_conns) {
@@ -1028,7 +1033,7 @@ Client::run_command(std::string&& cmd,
 
 seastar::future<> Client::send_message(MessageRef m)
 {
-  if (active_con) {
+  if (active_con && ready_to_send) {
     assert(pending_messages.empty());
     return active_con->get_conn()->send(m);
   } else {
@@ -1047,6 +1052,7 @@ seastar::future<> Client::on_session_opened()
       m.pr.set_value();
     }
     pending_messages.clear();
+    ready_to_send = true;
     return seastar::now();
   }).then([this] {
     return seastar::parallel_for_each(mon_commands,
index cd15b2cd22d42cc0bc6b0cf1df7de4aae8830924..a0dfa45481d34f111190ef73c3f702f6e8e1f584 100644 (file)
@@ -55,6 +55,7 @@ class Client : public crimson::net::Dispatcher,
   const uint32_t want_keys;
 
   MonMap monmap;
+  bool ready_to_send = false;
   seastar::shared_ptr<Connection> active_con;
   std::vector<seastar::shared_ptr<Connection>> pending_conns;
   seastar::timer<seastar::lowres_clock> timer;