]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-build.git/commitdiff
ceph-pull-requests: kill orphan processes after job is aborted 1764/head
author Kefu Chai <kchai@redhat.com>
Wed, 17 Mar 2021 11:22:36 +0000 (19:22 +0800)
committer Kefu Chai <kchai@redhat.com>
Wed, 17 Mar 2021 11:24:28 +0000 (19:24 +0800)
Fixes: https://tracker.ceph.com/issues/47830
Signed-off-by: Kefu Chai <kchai@redhat.com>
ceph-pull-requests/build/kill-tests [new file with mode: 0644]
ceph-pull-requests/config/definitions/ceph-pull-requests.yml

diff --git a/ceph-pull-requests/build/kill-tests b/ceph-pull-requests/build/kill-tests
new file mode 100644 (file)
index 0000000..8ecc125
--- /dev/null
@@ -0,0 +1,45 @@
+#!/bin/bash -ex
+
+# kill all descendant processes of ctest
+
+# ceph-pull-requests/build/build is killed by jenkins when the ceph-pull-requests job is aborted or
+# canceled, see https://www.jenkins.io/doc/book/using/aborting-a-build/ . but build/build does not
+# wait until all its child processes quit. after ctest is killed by SIGTERM, there is a chance
+# that some tests are still running, as ctest does not get a chance to kill them before it
+# terminates. if these tests had timed out, ctest would have killed them using SIGKILL. so we
+# need to kill them manually after the job is aborted.
+#
+# every step below races against processes that are exiting on their own, and this script runs
+# with "-e", so a process or group that vanishes under us must not be treated as a failure.
+
+# if ctest is still running, get its pid(s), otherwise we are done.
+ctest_pids=$(pgrep ctest) || exit 0
+# the parent process of ctest should have been terminated, but this might not be true when
+# it comes to some of its descendant processes, for instance, unittest-seastar-messenger.
+# pgrep prints one pid per line and may match more than one ctest, so hand ps a comma-separated
+# pid list and collapse the answer into the set of distinct process groups.
+mapfile -t ctest_pgids < <(
+    ps --no-headers --format 'pgid:1' --pid "${ctest_pids//$'\n'/,}" | sort -u
+)
+# ps prints nothing if every ctest exited after pgrep saw it: nothing left to kill.
+[[ ${#ctest_pgids[@]} -gt 0 ]] || exit 0
+# "kill" wants one negated pgid per group, "pgrep --pgroup" wants them comma-separated
+kill_targets=("${ctest_pgids[@]/#/-}")
+pgid_list=$(IFS=,; echo "${ctest_pgids[*]}")
+# kill fails if a group is already empty, which is the outcome we want anyway
+kill -SIGTERM -- "${kill_targets[@]}" || true
+# try harder
+for seconds in 0 1 1 2 3; do
+    sleep "$seconds"
+    # pgrep exits non-zero once no process is left in any of the groups
+    if ! survivors=$(pgrep --pgroup "$pgid_list"); then
+        echo 'killed'
+        break
+    fi
+    # kill only if we've waited for a while
+    if [[ $seconds != 0 ]]; then
+        printf '%s\n' "$survivors"
+        echo 'try harder'
+        kill -SIGKILL -- "${kill_targets[@]}" || true
+    fi
+done
index 14791cd07285bb16c7f5e55fd62555d306d4767f..460cca12c204497af7e68fb706d2d1f613343e0d 100644 (file)
                 healthy: 10
                 unhealthy: 20
                 failing: 30
+      - postbuildscript:
+          builders:
+            - role: SLAVE
+              build-on:
+                - ABORTED
+              build-steps:
+                - shell:
+                    !include-raw:
+                      - ../../build/kill-tests
 
     wrappers:
       - ansicolor