]> git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
crimson/osd: fix Watch::connect() behaviour on reconnect.
author Radoslaw Zarzynski <rzarzyns@redhat.com>
Thu, 2 Sep 2021 14:41:38 +0000 (14:41 +0000)
committer Radoslaw Zarzynski <rzarzyns@redhat.com>
Thu, 2 Sep 2021 15:20:37 +0000 (15:20 +0000)
It's perfectly legal for a client to reconnect to a particular `Watch`
using a different socket / `Connection` than the original one. This shall
include proper handling of the watch timer, which is currently broken
because, when reconnecting, we don't cancel the timer. This led to the
following crash at Sepia:

```
rzarzynski@teuthology:/home/teuthworker/archive/rzarzynski-2021-09-02_07:44:51-rados-master-distro-basic-smithi/6372357$ less ./remote/smithi183/log/ceph-osd.4.log.gz
...
DEBUG 2021-09-02 08:10:45,462 [shard 0] osd - client_request(id=12, detail=m=[osd_op(client.5087.0:93 7.1e 7:7c7084bd:::repobj:head {watch reconnect cookie 94478891024832 gen 1} snapc 0={} ondisk+write+know
n_if_redirected e40) v8]): got obc lock
...
DEBUG 2021-09-02 08:10:45,462 [shard 0] osd - do_op_watch
INFO  2021-09-02 08:10:45,462 [shard 0] osd - found existing watch by client.5087
DEBUG 2021-09-02 08:10:45,462 [shard 0] osd - do_op_watch_subop_watch
INFO  2021-09-02 08:10:45,462 [shard 0] osd - found existing watch watch(cookie 94478891024832 30s 172.21.15.150:0/3544196211) by client.5087
...
INFO  2021-09-02 08:10:45,462 [shard 0] osd - op_effect: found existing watcher: 94478891024832,client.5087
ceph-osd: /home/jenkins-build/build/workspace/ceph-dev-new-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/17.0.0-7406-g9d30203c/rpm/el8/BUILD/ceph-
17.0.0-7406-g9d30203c/src/seastar/include/seastar/core/timer.hh:95: void seastar::timer<Clock>::arm_state(seastar::timer<Clock>::time_point, std::optional<typename Clock::duration>) [with Clock = seastar::l
owres_clock; seastar::timer<Clock>::time_point = std::chrono::time_point<seastar::lowres_clock, std::chrono::duration<long int, std::ratio<1, 1000> > >; typename Clock::duration = std::chrono::duration<long
 int, std::ratio<1, 1000> >]: Assertion `!_armed' failed.
Aborting on shard 0.
Backtrace:
 0# 0x000055CC052CF0B6 in ceph-osd
 1# FatalSignal::signaled(int, siginfo_t const&) in ceph-osd
 2# FatalSignal::install_oneshot_signal_handler<6>()::{lambda(int, siginfo_t*, void*)#1}::_FUN(int, siginfo_t*, void*) in ceph-osd
 3# 0x00007FA58349FB20 in /lib64/libpthread.so.0
 4# gsignal in /lib64/libc.so.6
 5# abort in /lib64/libc.so.6
 6# 0x00007FA581A98C89 in /lib64/libc.so.6
 7# 0x00007FA581AA6A76 in /lib64/libc.so.6
 8# 0x000055CC0BEEE9DD in ceph-osd
 9# crimson::osd::Watch::connect(seastar::shared_ptr<crimson::net::Connection>, bool) in ceph-osd
10# 0x000055CC00B1D246 in ceph-osd
11# 0x000055CBFFEF01AE in ceph-osd
...
```

Signed-off-by: Radoslaw Zarzynski <rzarzyns@redhat.com>
src/crimson/osd/watch.cc

index 1848869d19b74de42d6ffedaba4fd15cc5171a57..54112963287e8ad68ce6cc30af25c031d828cea2 100644 (file)
@@ -81,9 +81,9 @@ seastar::future<> Watch::connect(crimson::net::ConnectionRef conn, bool)
 {
   if (this->conn == conn) {
     logger().debug("conn={} already connected", conn);
-    timeout_timer.cancel();
+    return seastar::now();
   }
-
+  timeout_timer.cancel();
   timeout_timer.arm(std::chrono::seconds{winfo.timeout_seconds});
   this->conn = std::move(conn);
   return seastar::now();