From 124860a579210cb37c80320f22dff0f5dc9c5a51 Mon Sep 17 00:00:00 2001 From: David Galloway Date: Fri, 16 Mar 2018 11:42:29 -0400 Subject: [PATCH] sepia-fog-images: Pause the queue if there are active Deploy tasks I've observed deployment failures while teuthology jobs are running if the OS image is replaced/removed while it's in use. Pausing the queue will allow Deploy tasks to finish so we can capture a new OS image without interrupting running jobs. Signed-off-by: David Galloway --- sepia-fog-images/README.rst | 2 ++ sepia-fog-images/build/build | 30 +++++++++++++++++++++++++++--- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/sepia-fog-images/README.rst b/sepia-fog-images/README.rst index e1f11456..e22a7f9c 100644 --- a/sepia-fog-images/README.rst +++ b/sepia-fog-images/README.rst @@ -42,6 +42,8 @@ This job: #. Configures the DHCP server so the testnodes PXE boot back to the FOG server. +#. Pauses the teuthology queue (if needed) so active FOG deployments aren't interrupted. + #. Reboots all the testnodes so FOG captures the assigned images. #. Updates the teuthology lock DB with the new host keys and OS info. diff --git a/sepia-fog-images/build/build b/sepia-fog-images/build/build index 1e61e937..1d8f194b 100755 --- a/sepia-fog-images/build/build +++ b/sepia-fog-images/build/build @@ -153,13 +153,32 @@ set -ex # Restart dhcpd so servers PXE boot to FOG server ssh ubuntu@store01.front.sepia.ceph.com "sudo service dhcpd restart" +# Get FOG 'Deploy' TaskID +fogdeployid=$(curl -s -k -H "fog-api-token: ${FOG_API_TOKEN}" -H "fog-user-token: ${FOG_USER_TOKEN}" http://fog.front.sepia.ceph.com/fog/tasktype -d '{"name": "Deploy"}' -X GET | jq -r '.tasktypes[0].id') + +# Check for scheduled deploy tasks +deploytasks=$(curl -s -k -H "fog-api-token: ${FOG_API_TOKEN}" -H "fog-user-token: ${FOG_USER_TOKEN}" http://fog.front.sepia.ceph.com/fog/task/active -d '{"typeID": "'${fogdeployid}'"}' -X GET | jq -r '.count') + +# If there are scheduled or active deploy tasks, pause the queue and let them finish. +# Capturing a new OS image can interrupt active OS deployments. +if [ $deploytasks -gt 0 ]; then + for type in $MACHINETYPES; do + # Only pause the queue for 20 minutes just in case anything goes wrong with the Jenkins job. + teuthology-queue --pause 1200 --machine_type $type + done + pausedqueue=true + while [ $deploytasks -gt 0 ]; do + echo "$(date) -- $deploytasks FOG deploy tasks still queued. Sleeping 10sec" + sleep 10 + deploytasks=$(curl -s -k -H "fog-api-token: ${FOG_API_TOKEN}" -H "fog-user-token: ${FOG_USER_TOKEN}" http://fog.front.sepia.ceph.com/fog/task/active -d '{"typeID": "'${fogdeployid}'"}' -X GET | jq -r '.count') + done +fi + # Reboot all hosts so FOG can capture their OSes for host in $allhosts; do funPowerCycle $host done -set +x - # Wait for Capture tasks to finish capturetasks=$(curl -s -k -H "fog-api-token: ${FOG_API_TOKEN}" -H "fog-user-token: ${FOG_USER_TOKEN}" http://fog.front.sepia.ceph.com/fog/task/active -d '{"typeID": "'${fogcaptureid}'"}' -X GET | jq -r '.count') while [ $capturetasks -gt 0 ]; do @@ -168,7 +187,12 @@ while [ $capturetasks -gt 0 ]; do capturetasks=$(curl -s -k -H "fog-api-token: ${FOG_API_TOKEN}" -H "fog-user-token: ${FOG_USER_TOKEN}" http://fog.front.sepia.ceph.com/fog/task/active -d '{"typeID": "'${fogcaptureid}'"}' -X GET | jq -r '.count') done -set -ex +# Unpause the queue if we paused it earlier +if [ "$pausedqueue" = true ]; then + for type in $MACHINETYPES; do + teuthology-queue --pause 0 --machine_type $type + done +fi # Unlock all machines after all capture images are finished for host in $allhosts; do -- 2.39.5