From ff02927921305f122ba8763ebb1addfb2bbd9d73 Mon Sep 17 00:00:00 2001
From: David Galloway
Date: Tue, 21 Nov 2017 14:14:06 -0500
Subject: [PATCH] sepia-fog-images: Job creation

Signed-off-by: David Galloway
---
 sepia-fog-images/README.rst                   |  62 +++++
 sepia-fog-images/build/build                  | 233 ++++++++++++++++++
 sepia-fog-images/build/failure                |  49 ++++
 .../config/definitions/sepia-fog-images.yml   |  45 ++++
 4 files changed, 389 insertions(+)
 create mode 100644 sepia-fog-images/README.rst
 create mode 100755 sepia-fog-images/build/build
 create mode 100755 sepia-fog-images/build/failure
 create mode 100644 sepia-fog-images/config/definitions/sepia-fog-images.yml

diff --git a/sepia-fog-images/README.rst b/sepia-fog-images/README.rst
new file mode 100644
index 00000000..b40c6685
--- /dev/null
+++ b/sepia-fog-images/README.rst
@@ -0,0 +1,62 @@
+sepia-fog-images
+================
+
+This job automates the creation/capturing of FOG_ images.
+
+Prerequisites
+-------------
+
+These steps should only have to be performed when a new teuthology host is being set up but it's good to have documented.
+
+#. Run the ``ansible/examples/slave_teuthology.yml`` playbook against the teuthology host.
+
+#. As the ``jenkins-build`` user on the teuthology host, generate a new RSA SSH key (``ssh-keygen -t rsa``).
+
+#. Copy the public key to jenkins-build.pub_ in the keys repo. (This is so the jenkins-build user can ssh to testnodes and VPSHOSTs)
+
+#. Run the ceph-cm-ansible_ ``users`` playbook against the Cobbler host and the DHCP server. (This lets the jenkins-build user set Cobbler settings and update DHCP entries)
+
+#. Define ``FOG_API_TOKEN`` and ``FOG_USER_TOKEN`` as **Global name/password pairs** in Jenkins.
+
+**NOTE:** This job also relies on:
+
+- teuthology.yaml_ -- If the job is being run on the teuthology host, this should already be in place at ``/etc/teuthology.yaml``.
+- ceph-sepia-secrets_ -- If the job is being run on a teuthology host, ``/etc/ansible`` should already be symlinked to a ceph-sepia-secrets checkout.
+- ceph-cm-ansible/tools_ -- There's a playbook that preps a host for capturing after Cobbler reimage along with a script to update DHCP entries.
+
+How it works
+------------
+
+This job:
+
+#. Locks a number of testnodes via ``teuthology-lock`` depending on the number of machine types and distros you specify.
+
+#. SSHes and configures the DHCP server to make the testnodes boot to the Cobbler PXE server (instead of the default FOG).
+
+#. SSHes and sets the appropriate profile for each machine in Cobbler.
+
+#. Reboots the testnodes so they get reimaged via Cobbler. The ceph-cm-ansible_ testnodes role gets run as a post-install task_.
+
+#. Runs the ``prep-fog-capture.yml`` playbook against the testnodes to wipe out network settings and mounts. (This is because biosdevname/systemd/udev rules need to be overridden/rewritten by rc.local)
+
+#. Configures the DHCP server so the testnodes PXE boot back to the FOG server.
+
+#. Reboots all the testnodes so FOG captures the assigned images.
+
+#. Updates the teuthology lock DB with the new host keys and OS info.
+
+#. Unlocks/releases the testnodes.
+
+Usage
+-----
+
+See https://wiki.sepia.ceph.com/doku.php?id=services:fog
+
+.. _FOG: https://fogproject.org/
+.. _jenkins-build.pub: https://github.com/ceph/keys/blob/master/ssh/jenkins-build.pub
+.. _teuthology.yaml: http://docs.ceph.com/teuthology/docs/siteconfig.html
+.. _ceph-sepia-secrets: https://github.com/ceph/ceph-sepia-secrets/
+.. _tools: https://github.com/ceph/ceph-cm-ansible/tree/master/tools
+.. _Jenkins: https://jenkins.ceph.com/job/sepia-fog-images
+.. _task: https://github.com/ceph/ceph-cm-ansible/blob/master/roles/cobbler/templates/snippets/cephlab_rc_local
+.. _ceph-cm-ansible: https://github.com/ceph/ceph-cm-ansible
diff --git a/sepia-fog-images/build/build b/sepia-fog-images/build/build
new file mode 100755
index 00000000..09c4d94c
--- /dev/null
+++ b/sepia-fog-images/build/build
@@ -0,0 +1,233 @@
+#!/bin/bash
+# This job:
+# - Reimages testnodes using Cobbler (which runs ceph-cm-ansible)
+# - Preps the testnodes to have a FOG image captured (ceph-cm-ansible/tools/prep-fog-capture.yml)
+# - Captures FOG images
+#
+# CAPITAL vars are provided by Jenkins. lowercase are just in this script
+
+set -ex
+
+# Description stamped on every machine this build locks. Carrying BUILD_NUMBER
+# lets each Jenkins build pick out exactly its own machines again later on.
+lockdesc="Locked to capture FOG image for Jenkins build $BUILD_NUMBER"
+
+# Converts distro friendly names into Cobbler/FOG image names
+#   $1 - distro friendly name: trusty, xenial or centos
+# Sets the globals cobblerprofile and fogprofile. Exits 1 on any other name.
+funSetProfiles () {
+  case "$1" in
+    trusty)
+      cobblerprofile="Ubuntu-14.04-server-x86_64"
+      fogprofile="ubuntu_14.04"
+      ;;
+    xenial)
+      cobblerprofile="Ubuntu-16.04-server-x86_64"
+      fogprofile="ubuntu_16.04"
+      ;;
+    centos)
+      cobblerprofile="CentOS-7.4-x86_64"
+      fogprofile="centos_7.4"
+      ;;
+    *)
+      echo "Unknown profile $1" >&2
+      exit 1
+      ;;
+  esac
+}
+
+# Powers a machine on over IPMI if it reports "off", otherwise power cycles it
+#   $1 - hostname; anything from the first '.' onwards is dropped
+funPowerCycle () {
+  # local, so the caller's own $host loop variable is never overwritten
+  local host powerstatus
+  host=${1%%.*}
+  powerstatus=$(ipmitool -I lanplus -U inktank -P "$SEPIA_IPMI_PASS" -H "${host}.ipmi.sepia.ceph.com" chassis power status | cut -d ' ' -f4-)
+  if [ "$powerstatus" == "off" ]; then
+    ipmitool -I lanplus -U inktank -P "$SEPIA_IPMI_PASS" -H "${host}.ipmi.sepia.ceph.com" chassis power on
+  else
+    ipmitool -I lanplus -U inktank -P "$SEPIA_IPMI_PASS" -H "${host}.ipmi.sepia.ceph.com" chassis power cycle
+  fi
+}
+
+# Prints the space separated list $1 without the entries equal to $2
+#   $1 - space separated list of hostnames
+#   $2 - hostname to drop
+# Entries are compared as whole words, so dropping a host from the middle of
+# the list cannot glue its two neighbours together into one bogus hostname.
+funRemoveHost () {
+  local entry kept=""
+  for entry in $1; do
+    if [ "$entry" != "$2" ]; then
+      kept="$kept $entry"
+    fi
+  done
+  echo "${kept# }"
+}
+
+# Prints the number of active FOG 'Capture' tasks, or "unknown" when the FOG
+# API did not answer with a number (e.g. a transient network error)
+funActiveCaptures () {
+  local count
+  count=$(curl -s -k -H "fog-api-token: ${FOG_API_TOKEN}" -H "fog-user-token: ${FOG_USER_TOKEN}" http://fog.front.sepia.ceph.com/fog/task/active -d '{"typeID": "'"${fogcaptureid}"'"}' -X GET | jq -r '.count') || true
+  if [[ "$count" =~ ^[0-9]+$ ]]; then
+    echo "$count"
+  else
+    echo "unknown"
+  fi
+}
+
+# Clone or update teuthology
+if [ ! -d teuthology ]; then
+  git clone https://github.com/ceph/teuthology
+  cd teuthology
+else
+  cd teuthology
+  git pull
+fi
+
+# Bootstrap teuthology
+./bootstrap
+
+cd "$WORKSPACE"
+
+source "$WORKSPACE/teuthology/virtualenv/bin/activate"
+
+# Clone or update ceph-cm-ansible
+if [ ! -d ceph-cm-ansible ]; then
+  git clone https://github.com/ceph/ceph-cm-ansible
+else
+  cd ceph-cm-ansible
+  git pull
+fi
+
+cd "$WORKSPACE"
+
+# Don't bail if we fail to lock machines
+set +e
+
+numdistros=$(echo $DISTROS | wc -w)
+# Keep trying to lock machines
+for type in $MACHINETYPES; do
+  # -F -w: match the description literally and as whole words, so build 1 does not also match build 12
+  numlocked=$(teuthology-lock --brief --machine-type $type | grep -c -F -w -- "$lockdesc")
+  while [ $numlocked -lt $numdistros ]; do
+    # Lock one at a time since we have a better shot of getting one instead of all at once.
+    # Setting the BUILD_NUMBER in the description makes sure each Jenkins job uses the right machines.
+    # This is useful for when a job is aborted and another is started while the previous job's machines are debugged/cleaned up.
+    teuthology-lock --lock-many 1 --machine-type $type --desc "$lockdesc"
+    # Sleep for a bit so we don't hammer the lock server
+    if [ $? -ne 0 ]; then
+      sleep 5
+    fi
+    numlocked=$(teuthology-lock --brief --machine-type $type | grep -c -F -w -- "$lockdesc")
+  done
+done
+
+set -e
+
+allhosts=$(teuthology-lock --brief | grep -F -w -- "$lockdesc" | cut -d '.' -f1 | tr "\n" " ")
+# Configure DHCP to use cobbler as the PXE server for each machine to reimage and ansiblize
+for machine in $allhosts; do
+  ssh ubuntu@store01.front.sepia.ceph.com "sudo /usr/local/sbin/set-next-server.sh $machine cobbler"
+done
+
+# Restart dhcpd (for some reason doing this every time we set the next-server in the for loop above, dhcpd would fail to start)
+ssh ubuntu@store01.front.sepia.ceph.com "sudo service dhcpd restart"
+
+# Get FOG 'Capture' TaskID
+fogcaptureid=$(curl -s -k -H "fog-api-token: ${FOG_API_TOKEN}" -H "fog-user-token: ${FOG_USER_TOKEN}" http://fog.front.sepia.ceph.com/fog/tasktype -d '{"name": "Capture"}' -X GET | jq -r '.tasktypes[0].id')
+
+# Set cobbler profile and FOG image ID for each locked machine
+for type in $MACHINETYPES; do
+  lockedhosts=$(teuthology-lock --brief --machine-type $type | grep -F -w -- "$lockdesc" | cut -d '.' -f1 | sort)
+  # Create arrays using our lists so we can iterate through them
+  hostlist=($lockedhosts)
+  distrolist=($DISTROS)
+  for ((i = 0; i < numdistros; i++)); do
+    funSetProfiles "${distrolist[$i]}"
+    ssh ubuntu@cobbler.front.sepia.ceph.com "sudo cobbler system edit --name ${hostlist[$i]} --profile $cobblerprofile --netboot-enabled=1"
+    funPowerCycle "${hostlist[$i]}"
+    # Get FOG host ID
+    foghostid=$(curl -s -k -H "fog-api-token: ${FOG_API_TOKEN}" -H "fog-user-token: ${FOG_USER_TOKEN}" http://fog.front.sepia.ceph.com/fog/host -d '{"name": "'"${hostlist[$i]}"'"}' -X GET | jq -r '.hosts[0].id')
+    # Get FOG image ID
+    fogimageid=$(curl -s -k -H "fog-api-token: ${FOG_API_TOKEN}" -H "fog-user-token: ${FOG_USER_TOKEN}" http://fog.front.sepia.ceph.com/fog/image -d '{"name": "'"${type}_${fogprofile}"'"}' -X GET | jq -r '.images[0].id')
+    # Set foghostid (target host) to capture fogimageid
+    curl -s -k -H "fog-api-token: ${FOG_API_TOKEN}" -H "fog-user-token: ${FOG_USER_TOKEN}" "http://fog.front.sepia.ceph.com/fog/host/${foghostid}" -d '{"imageID": "'"${fogimageid}"'"}' -X PUT
+    # Create 'Capture' task for each machine
+    curl -s -k -H "fog-api-token: ${FOG_API_TOKEN}" -H "fog-user-token: ${FOG_USER_TOKEN}" "http://fog.front.sepia.ceph.com/fog/host/${foghostid}/task" -d '{"taskTypeID": "'"${fogcaptureid}"'"}' -X POST
+  done
+done
+
+# Sleep for 10sec to allow the hosts to reboot (Makes sure we don't `stat` existing/old /ceph-qa-ready)
+sleep 10
+
+# Don't bail if machines aren't ready yet
+set +e
+
+# Set DHCP next-server back to FOG and prep each machine for FOG capturing
+remaininghosts=$allhosts
+while [ "$remaininghosts" != "" ]; do
+  for host in $remaininghosts; do
+    if ssh -q ubuntu@${host}.front.sepia.ceph.com stat /ceph-qa-ready \> /dev/null 2\>\&1; then
+      # Bail if anything fails
+      set -ex
+      # Set DHCP back
+      ssh ubuntu@store01.front.sepia.ceph.com "sudo /usr/local/sbin/set-next-server.sh $host fog"
+      # Prep the host for FOG image capture
+      ansible-playbook "$WORKSPACE/ceph-cm-ansible/tools/prep-fog-capture.yml" -e ansible_ssh_user=ubuntu --limit="$host*"
+      remaininghosts=$(funRemoveHost "$remaininghosts" "$host")
+    else
+      # This gets noisy
+      set +ex
+      echo "$(date) -- $host is not ready. Sleeping for 2min"
+      sleep 120
+    fi
+  done
+done
+
+set -ex
+
+# Restart dhcpd so servers PXE boot to FOG server
+ssh ubuntu@store01.front.sepia.ceph.com "sudo service dhcpd restart"
+
+# Reboot all hosts so FOG can capture their OSes
+for host in $allhosts; do
+  funPowerCycle $host
+done
+
+set +x
+
+# Wait for Capture tasks to finish
+# An "unknown" count is retried instead of being mistaken for "all tasks finished"
+capturetasks=$(funActiveCaptures)
+while [ "$capturetasks" != "0" ]; do
+  echo "$(date) -- $capturetasks FOG capture tasks still queued. Sleeping 10sec"
+  sleep 10
+  capturetasks=$(funActiveCaptures)
+done
+
+# Don't bail if machines aren't ready yet
+set +e
+
+# Wait for hosts to come back up after getting images captured
+remaininghosts=$allhosts
+while [ "$remaininghosts" != "" ]; do
+  for host in $remaininghosts; do
+    if ssh -q ubuntu@${host}.front.sepia.ceph.com stat /ceph-qa-ready \> /dev/null 2\>\&1; then
+      remaininghosts=$(funRemoveHost "$remaininghosts" "$host")
+    else
+      echo "$(date) -- $host is not ready. Sleeping for 10sec"
+      sleep 10
+    fi
+  done
+done
+
+set -ex
+
+# Update lock db and unlock machines!
+for host in $allhosts; do
+  teuthology-updatekeys ubuntu@${host}
+  teuthology-update-inventory ubuntu@${host}
+  teuthology-lock --unlock $host
+done
diff --git a/sepia-fog-images/build/failure b/sepia-fog-images/build/failure
new file mode 100755
index 00000000..6d79a209
--- /dev/null
+++ b/sepia-fog-images/build/failure
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+set -ex
+
+# Description the build script stamped on every machine it locked for this build
+lockdesc="Locked to capture FOG image for Jenkins build $BUILD_NUMBER"
+
+# Powers a machine on over IPMI if it reports "off", otherwise power cycles it
+#   $1 - hostname; anything from the first '.' onwards is dropped
+funPowerCycle () {
+  local host powerstatus
+  host=${1%%.*}
+  powerstatus=$(ipmitool -I lanplus -U inktank -P "$SEPIA_IPMI_PASS" -H "${host}.ipmi.sepia.ceph.com" chassis power status | cut -d ' ' -f4-)
+  if [ "$powerstatus" == "off" ]; then
+    ipmitool -I lanplus -U inktank -P "$SEPIA_IPMI_PASS" -H "${host}.ipmi.sepia.ceph.com" chassis power on
+  else
+    ipmitool -I lanplus -U inktank -P "$SEPIA_IPMI_PASS" -H "${host}.ipmi.sepia.ceph.com" chassis power cycle
+  fi
+}
+
+# -F -w: match the description literally and as whole words, so build 1 does not also match build 12
+allhosts=$(teuthology-lock --brief | grep -F -w -- "$lockdesc" | cut -d '.' -f1 | tr "\n" " ")
+# Set DHCP server back to FOG
+for machine in $allhosts; do
+  ssh ubuntu@store01.front.sepia.ceph.com "sudo /usr/local/sbin/set-next-server.sh $machine fog"
+  # Every --update call has to name the machine it applies to
+  teuthology-lock --update --status down $machine
+  teuthology-lock --update --desc "Failed to capture FOG image: ${BUILD_URL}" $machine
+done
+
+# Restart dhcpd (for some reason doing this every time we set the next-server in the for loop above, dhcpd would fail to start)
+ssh ubuntu@store01.front.sepia.ceph.com "sudo service dhcpd restart"
+
+# Get FOG 'Capture' TaskID
+fogcaptureid=$(curl -s -k -H "fog-api-token: ${FOG_API_TOKEN}" -H "fog-user-token: ${FOG_USER_TOKEN}" http://fog.front.sepia.ceph.com/fog/tasktype -d '{"name": "Capture"}' -X GET | jq -r '.tasktypes[0].id')
+
+# Delete all active Capture tasks
+for task in $(curl -s -k -H "fog-api-token: ${FOG_API_TOKEN}" -H "fog-user-token: ${FOG_USER_TOKEN}" http://fog.front.sepia.ceph.com/fog/task/active -d '{"typeID": "'"${fogcaptureid}"'"}' -X GET | jq -r '.tasks[].id'); do
+  curl -s -k -H "fog-api-token: ${FOG_API_TOKEN}" -H "fog-user-token: ${FOG_USER_TOKEN}" "http://fog.front.sepia.ceph.com/fog/task/${task}" -X DELETE
+done
+
+set +e
+
+# Try to update lock DB (in case whomever debugs Jenkins job failure forgets)
+for host in $allhosts; do
+  teuthology-updatekeys ubuntu@${host}
+  teuthology-update-inventory ubuntu@${host}
+  teuthology-lock --unlock $host
+done
diff --git a/sepia-fog-images/config/definitions/sepia-fog-images.yml b/sepia-fog-images/config/definitions/sepia-fog-images.yml
new file mode 100644
index 00000000..51bb69f8
--- /dev/null
+++ b/sepia-fog-images/config/definitions/sepia-fog-images.yml
@@ -0,0 +1,45 @@
+- job:
+    name: sepia-fog-images
+    project-type: freestyle
+    defaults: global
+    concurrent: false
+    display-name: 'Sepia FOG Image Creator'
+    node: teuthology
+    quiet-period: 0
+    block-downstream: false
+    block-upstream: false
+    logrotate:
+      daysToKeep: 15
+      numToKeep: 30
+      artifactDaysToKeep: -1
+      artifactNumToKeep: -1
+
+    parameters:
+      - string:
+          name: DISTROS
+          default: "trusty xenial centos"
+          description: "Distro to capture images for: (e.g., 'trusty', 'xenial', 'centos' or 'trusty xenial' for multiple distros)"
+      - string:
+          name: MACHINETYPES
+          default: "smithi mira"
+          description: "Machine types to capture images for. (e.g., 'smithi' or 'smithi mira' for multiple machine types)"
+
+    builders:
+      - shell:
+          !include-raw:
+            - ../../build/build
+
+    publishers:
+      - postbuildscript:
+          script-only-if-failed: True
+          script-only-if-succeeded: False
+          builders:
+            - shell:
+                !include-raw:
+                  - ../../build/failure
+
+    wrappers:
+      - mask-passwords:
+      - inject-passwords:
+          global: true
+          mask-password-params: true
-- 
2.39.5