From: Zack Cerza Date: Wed, 19 Mar 2025 18:35:11 +0000 (-0600) Subject: lock.ops.unlock_one_safe: Invert run-match logic X-Git-Tag: 1.2.3~3^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=104ebb3e290c8a1de2d9b637ad327e8e640f54be;p=teuthology.git lock.ops.unlock_one_safe: Invert run-match logic When unlock_one_safe is called with run_name, the caller means to express "unlock this node if it belongs to this run". When it is called with run_name and job_id, it means "unlock this node if it belongs to this job in this run". We had inverted the logic, causing leaks on reimage failures. Signed-off-by: Zack Cerza --- 104ebb3e290c8a1de2d9b637ad327e8e640f54be diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 000000000..560e80017 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,2 @@ +[run] +omit = */test/* diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..d2f4d315d --- /dev/null +++ b/.dockerignore @@ -0,0 +1,3 @@ +venv +virtualenv +.tox diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 000000000..92f385bf5 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1 @@ +* @ceph/teuthology diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 000000000..4fc1b2c3c --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,35 @@ +name: CI + +on: + pull_request: + branches: + - main + workflow_dispatch: + +jobs: + test: + name: CI on python${{ matrix.python }} via ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + include: + - os: ubuntu-22.04 + python: "3.10" + - os: ubuntu-22.04 + python: "3.11" + - os: ubuntu-24.04 + python: "3.12" + steps: + - uses: actions/checkout@v4 + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python }} + - name: Install tox + run: pip install tox + - name: Run flake8 + run: tox -e flake8 + - name: Run unit tests + run: tox -e py3 + - name: Run docs build + run: 
tox -e docs diff --git a/.github/workflows/dependencies.yml b/.github/workflows/dependencies.yml new file mode 100644 index 000000000..15395e136 --- /dev/null +++ b/.github/workflows/dependencies.yml @@ -0,0 +1,54 @@ +name: dependencies + +on: + pull_request: + branches: + - main + workflow_dispatch: + +jobs: + upgrade: + name: Test dependencies + runs-on: ${{ matrix.os }} + strategy: + matrix: + include: + - os: ubuntu-22.04 + python: "3.10" + - os: ubuntu-22.04 + python: "3.11" + steps: + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python }} + - name: Checkout default branch + uses: actions/checkout@v4 + with: + ref: main + path: teuthology + - name: virtualenv + run: | + pip install --user virtualenv + virtualenv ./virtualenv + cd ./virtualenv/lib/python* + touch no-global-site-packages.txt + working-directory: ./teuthology + - name: Refresh system repos + run: | + sudo apt update -y + sudo apt upgrade -y + - name: Initial bootstrap + run: ./bootstrap install + working-directory: ./teuthology + - name: Move initial repository + run: mv teuthology teuthology.orig + - name: Checkout desired ref + uses: actions/checkout@v4 + with: + path: teuthology + - name: Move virtualenv to new checkout + run: mv ./teuthology.orig/virtualenv ./teuthology/ + - name: Re-run bootstrap + run: ./bootstrap install + working-directory: ./teuthology diff --git a/.github/workflows/dev_container.yml b/.github/workflows/dev_container.yml new file mode 100644 index 000000000..8540a2e5b --- /dev/null +++ b/.github/workflows/dev_container.yml @@ -0,0 +1,42 @@ +--- +name: dev_container +on: + push: + branches: + - "main" + pull_request: + branches: + - main + workflow_dispatch: + +jobs: + docker: + runs-on: ${{ matrix.os }} + strategy: + matrix: + include: + - os: ubuntu-24.04 + python: "3.12" + - os: ubuntu-24.04-arm + python: "3.12" + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Login to Quay.io + if: github.event_name == 
'push' && github.ref_name == 'main' + uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 + with: + registry: quay.io + username: ${{ secrets.QUAY_USERNAME }} + password: ${{ secrets.QUAY_ROBOT_TOKEN }} + - name: Build and push + uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 + env: + QUAY_URI: quay.io/ceph-infra/teuthology-dev + QUAY_TAG: ${{ github.event_name == 'pull_request' && github.head_ref || github.ref_name }} + with: + context: . + file: containers/teuthology-dev/Dockerfile + push: ${{ github.event_name == 'push' && github.ref_name == 'main' }} + tags: ${{ env.QUAY_URI }}:${{ env.QUAY_TAG }} + outputs: type=image,name=target diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml new file mode 100644 index 000000000..e599289b3 --- /dev/null +++ b/.github/workflows/integration.yml @@ -0,0 +1,33 @@ +name: integration +on: + pull_request: + workflow_dispatch: +jobs: + test: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + - name: Make archive directory + run: mkdir /tmp/archive_dir + - name: Test using docker-compose + run: ./start.sh + working-directory: ./docs/docker-compose + - name: Rename Directory + # Replace ":" with "_" everywhere in directory path. + # This needs to be done because GA does not support ":" colon character in artifacts (like in /root-2025-03-06_18:47:26-teuthology:no-ceph-main-distro-default-testnode). 
+ # Invalid characters include: Double quote ", Colon :, Less than <, Greater than >, Vertical bar |, Asterisk *, Question mark ?, Carriage return \r, Line feed \n + if: always() + run: | + for DIR in /tmp/archive_dir/root-*; do + SAFE_DIR="${DIR//:/_}" # Replace in '/tmp/archive_dir/root-2025-03-06_18:47:26-teuthology:no-ceph-main-distro-default-testnode' + if [ "$DIR" != "$SAFE_DIR" ]; then + mv "$DIR" "$SAFE_DIR" + fi + done + - name: Upload teuthology archive logs + uses: actions/upload-artifact@v4 + if: always() + with: + name: teuthology-logs + path: | + /tmp/archive_dir/* diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..68a366c73 --- /dev/null +++ b/.gitignore @@ -0,0 +1,32 @@ +*~ +.#* +## the next line needs to start with a backslash to avoid looking like +## a comment +\#*# +.*.swp + +*.pyc +*.pyo +.tox + +/*.egg-info +/virtualenv +/build +/*.yaml +docs/build + +.ropeproject +.coverage + +# autogenerated docs from sphinx-apidoc +docs/modules.rst +docs/teuthology.rst +docs/teuthology.*.rst + +# PyCharm +.idea + +# vscode +.vscode/ + +.ansible diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 000000000..159539123 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,3 @@ +teuthology: + tags: [ ceph-workbench ] + script: "git clean -ffqdx ; ./bootstrap install ; unset OS_AUTH_URL ; source virtualenv/bin/activate ; pip install tox ; tox" diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 000000000..56ef6eb48 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,20 @@ +--- +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +version: 2 +formats: [] +build: + os: ubuntu-22.04 + tools: + python: "3.10" +python: + install: + - method: pip + path: . 
+ extra_requirements: + - orchestra + - requirements: docs/requirements.txt +sphinx: + builder: html + configuration: docs/conf.py diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 000000000..f1bc1c881 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,17 @@ +env: HOME=/home/travis + +sudo: required +dist: trusty + +before_install: + - sudo apt-get -qq update + - ./bootstrap install + +language: python +python: + - 2.7 + +install: + - pip install tox + +script: tox -rv diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..02914d4f9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2014 Red Hat, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 000000000..2683cd654 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,4 @@ +include *.rst +include requirements.txt +include tox.ini +include pytest.ini diff --git a/README.rst b/README.rst new file mode 100644 index 000000000..f83be7992 --- /dev/null +++ b/README.rst @@ -0,0 +1,11 @@ +=================================================== +`Teuthology` -- The Ceph integration test framework +=================================================== + + +Welcome! Teuthology's documentation is primarily hosted at `docs.ceph.com +`__. + +You can also look at docs `inside this repository `__, but note that +GitHub's `RST `__ rendering is quite +limited. Mainly that means that links between documents will be broken. diff --git a/ansible.cfg b/ansible.cfg new file mode 100644 index 000000000..c7bd5e20d --- /dev/null +++ b/ansible.cfg @@ -0,0 +1,4 @@ +[defaults] +# Store collections in this directory. This is to avoid potential compatibility +# issues between differently-versioned ansible processes. +collections_path = .ansible diff --git a/beanstalk/alpine/Dockerfile b/beanstalk/alpine/Dockerfile new file mode 100644 index 000000000..7afb0005b --- /dev/null +++ b/beanstalk/alpine/Dockerfile @@ -0,0 +1,13 @@ +# For beanstalkd 1.12 use edge branch +#FROM alpine:edge + +FROM alpine:3.12.3 + +MAINTAINER Kyrylo Shatskyy + +RUN apk update && apk add beanstalkd beanstalkd-doc + +ENV BEANSTALK_ADDR "0.0.0.0" +ENV BEANSTALK_PORT "11300" + +CMD /usr/bin/beanstalkd -V -l $BEANSTALK_ADDR -p $BEANSTALK_PORT diff --git a/bootstrap b/bootstrap new file mode 100755 index 000000000..8990d8781 --- /dev/null +++ b/bootstrap @@ -0,0 +1,159 @@ +#!/bin/bash +set -e +if [ $# -eq 0 ]; then + install=false +else + if [ "$1" = "install" ]; then + install=true + else + echo "Invalid command, supported commands are: 'install'" + exit 1 + fi +fi + +if [[ "$PYTHON" =~ "python2" ]]; then + echo "python2 is not supported." 
>&2 + exit 1 +fi + +# Use the newest version we find +if [ -z "$PYTHON" ]; then + for i in 12 11 10; do + command -v "python3.$i" && PYTHON="python3.$i" &>/dev/null && break + done +fi +if [ -z "$PYTHON" ]; then + # This would be bizarre, but I suppose possible + PYTHON=${PYTHON:-"python3"} +fi +echo "Using python: $PYTHON" + +case "$(uname -s)" in +Linux) + if command -v lsb_release; then + OS=$(lsb_release --id --short) + else + . /etc/os-release + OS=$(echo $NAME | tr -d ' ') + fi + # rpm/dnf is the default, to reduce repetition in the case statement + has_pkg="rpm -q --whatprovides" + install_pkg="sudo dnf install -y" + case "$OS" in + Ubuntu|Debian|LinuxMint) + deps=(qemu-utils python3-dev libssl-dev python3-pip python3-wheel python3-venv libev-dev libvirt-dev libffi-dev libyaml-dev build-essential jq curl) + has_pkg="dpkg -s" + install_pkg="sudo apt install -y" + ;; + RedHatEnterpriseWorkstation|RedHatEnterpriseServer|RedHatEnterprise|CentOS) + deps=(python39-pip python39-devel mariadb-devel libev-devel libvirt-devel libffi-devel) + ;; + CentOSStream) + PYTHON=python3.12 + deps=($PYTHON-pip $PYTHON-devel) + ;; + AlmaLinux|RockyLinux) + PYTHON=python3.12 + deps=($PYTHON-pip $PYTHON-devel libev-devel libvirt-devel libffi-devel) + ;; + Fedora|FedoraLinux) + deps=($PYTHON-pip $PYTHON-devel libev-devel libvirt-devel libffi-devel) + ;; + "openSUSE project"|"SUSE LINUX"|"openSUSE"|"openSUSELeap"|"openSUSETumbleweed") + PYTHON=python3.12 + deps=(python312-pip python312-devel python312 libev-devel libvirt-devel libffi-devel) + install_pkg="sudo zypper install" + ;; + esac + ;; + +Darwin) + deps="python libvirt libev libffi" + has_pkg="brew list" + install_pkg="brew install" + ;; +esac +for package in ${deps[@]}; do + if ! 
$has_pkg $package &>/dev/null; then + # add a space after old values + missing="${missing:+$missing }$package" + echo missing=${missing} + fi +done +if [ -n "$missing" ]; then + echo "$0: missing required packages:" 1>&2 + echo "$missing" + if [ "$install" = true ]; then + echo "Installing missing packages..." + $install_pkg $missing + else + echo "Please install missing packages or run './bootstrap install'" + echo "$install_pkg $missing" + exit 1 + fi + fi + +PYTHON_BIN=$(command -v $PYTHON) +if [ -z $PYTHON_BIN -o ! -e $PYTHON_BIN -o ! -x $PYTHON_BIN ]; then + echo "Cannot find $PYTHON!" + exit 1 +fi +PYTHON_VER_OUT=$($PYTHON_BIN --version) + +VENV=${VENV:-"./virtualenv"} +# If the venv was set to use system site-packages, fix that +if [ -f "$VENV/pyvenv.cfg" ]; then + sed -i'' -e 's/\(include-system-site-packages\s*=\s*\)true/\1false/g' $VENV/pyvenv.cfg +fi + +# Attempt to force a UTF-8 locale without being specific to English +export LANG=${LANG:-C.UTF-8} +(echo $LANG | grep -qi utf-8) || export LC_ALL=$LANG.UTF-8 + +if [ -z "$NO_CLOBBER" ] && \ + [ ! -e "$VENV/bin/pip" -o ! -e "$VENV/bin/$PYTHON" ] || \ + [ "${PYTHON_VER_OUT}" != "$($VENV/bin/$PYTHON --version)" ] \ + ; then + echo "Deleting existing virtual environment" + rm -rf virtualenv +fi + +if [ -z "$NO_CLOBBER" ] || [ ! 
-e $VENV ]; then + echo "Creating new venv at $VENV" + $PYTHON_BIN -m venv $VENV +fi + +PY_MAJOR=$($VENV/bin/python -c "import sys; print(sys.version_info[0])") +PY_MINOR=$($VENV/bin/python -c "import sys; print(sys.version_info[1])") + +# Python version check +if [[ "$PY_MAJOR" -ne 3 || "$PY_MINOR" -lt 10 ]]; then + echo "Python version should be 3.10 or higher, found $PY_MAJOR.$PY_MINOR" + exit 1 +fi + +$VENV/bin/pip install packaging + +if [ -f "$VENV/bin/ansible" ]; then + uninstall_ansible=$($VENV/bin/python3 -c "import ansible; from packaging.version import parse; print(parse(ansible.__version__) < parse('2.10.0'))") + if [ "$uninstall_ansible" = "True" ]; then + $VENV/bin/pip uninstall -y ansible + fi +fi + +# First, upgrade pip +$VENV/bin/pip install --upgrade pip + +# See https://github.com/pypa/pip/issues/8559 +$VENV/bin/pip install -r requirements.txt --use-pep517 + +# By default, install teuthology in editable mode +$VENV/bin/pip install ${PIP_INSTALL_FLAGS:---editable '.[test]'} + +# Check to make sure requirements are met +$VENV/bin/pip check + +# Install ansible collections +$VENV/bin/ansible-galaxy install -r requirements.yml + +echo "Bootstrap completed successfully!!!" diff --git a/build_qemu_image.sh b/build_qemu_image.sh new file mode 100755 index 000000000..614f519aa --- /dev/null +++ b/build_qemu_image.sh @@ -0,0 +1,61 @@ +#!/bin/sh -x +set -e + +IMAGE_URL=http://cloud-images.ubuntu.com/releases/precise/release/ubuntu-12.04-server-cloudimg-amd64-disk1.img + +wget -O base.qcow2 $IMAGE_URL + +image=base.raw +qemu-img convert -O raw base.qcow2 $image +rm -f base.qcow2 + +# Note: this assumes that sector size is 512, and that there's only one +# partition. very brittle. 
+START_SECT=$(fdisk -lu $image | grep ^$image | awk '{print $3}') +START_BYTE=$(echo "$START_SECT * 512" | bc) + +root=/tmp/$$ + +cleanup() { + sudo chroot $root rm -f /etc/resolv.conf || true + sudo chroot $root ln -s ../run/resolvconf/resolv.conf /etc/resolv.conf || true + sudo umount $root/proc || true + sudo umount $root/sys || true + sudo umount $root/dev/pts || true + sudo umount $root + sudo rmdir $root +} +trap cleanup INT TERM EXIT + +sudo mkdir $root +sudo mount -o loop,offset=$START_BYTE $image $root + +# set up chroot +sudo mount -t proc proc $root/proc +sudo mount -t sysfs sysfs $root/sys +sudo mount -t devpts devptr $root/dev/pts + +# set up network access +sudo chroot $root rm /etc/resolv.conf +sudo cp /etc/resolv.conf $root/etc/resolv.conf + +# packages +# These should be kept in sync with ceph-qa-chef.git/cookbooks/ceph-qa/default.rb +sudo chroot $root apt-get -y --force-yes install iozone3 bonnie++ dbench \ + tiobench build-essential attr libtool automake gettext uuid-dev \ + libacl1-dev bc xfsdump dmapi xfslibs-dev + +# install ltp without ltp-network-test, so we don't pull in xinetd and +# a bunch of other unnecessary stuff +sudo chroot $root apt-get -y --force-yes --no-install-recommends install ltp-kernel-test + +# add 9p fs support +sudo chroot $root apt-get -y --force-yes install linux-image-extra-virtual + +cleanup +trap - INT TERM EXIT + +qemu-img convert -O qcow2 $image output.qcow2 +rm -f $image + +exit 0 diff --git a/containers/teuthology-dev/.teuthology.yaml b/containers/teuthology-dev/.teuthology.yaml new file mode 100644 index 000000000..43037bb7e --- /dev/null +++ b/containers/teuthology-dev/.teuthology.yaml @@ -0,0 +1,14 @@ +queue_host: beanstalk +queue_port: 11300 +lock_server: http://paddles:8080 +results_server: http://paddles:8080 +results_ui_server: http://pulpito:8081/ +teuthology_path: /teuthology +archive_base: /archive_dir +reserve_machines: 0 +lab_domain: '' + +defaults: + cephadm: + containers: + image: 
'quay.ceph.io/ceph-ci/ceph' diff --git a/containers/teuthology-dev/Dockerfile b/containers/teuthology-dev/Dockerfile new file mode 100644 index 000000000..074ec3482 --- /dev/null +++ b/containers/teuthology-dev/Dockerfile @@ -0,0 +1,45 @@ +FROM ubuntu:noble +ENV DEBIAN_FRONTEND=noninteractive +ENV LANG=C.UTF-8 +RUN apt-get update && \ + apt-get install -y \ + git \ + qemu-utils \ + python3-dev \ + libssl-dev \ + ipmitool \ + python3-pip \ + python3-venv \ + vim \ + jq \ + curl \ + libev-dev \ + libvirt-dev \ + libffi-dev \ + libyaml-dev \ + locales \ + lsb-release && \ + apt-get clean all && \ + locale-gen $LC_ALL +WORKDIR /teuthology +COPY requirements.txt requirements.yml ansible.cfg bootstrap /teuthology/ +RUN \ + cd /teuthology && \ + mkdir ../archive_dir && \ + mkdir log && \ + chmod +x /teuthology/bootstrap && \ + PIP_INSTALL_FLAGS="-r requirements.txt" ./bootstrap +COPY . /teuthology +RUN \ + (git config -f ./.git/config --unset 'http.https://github.com/.extraheader' || true ) && \ + ./bootstrap +COPY containers/teuthology-dev/containerized_node.yaml /teuthology +COPY containers/teuthology-dev/.teuthology.yaml /root +COPY containers/teuthology-dev/teuthology.sh / +RUN \ + mkdir $HOME/.ssh && \ + touch $HOME/.ssh/id_rsa && \ + chmod 600 $HOME/.ssh/id_rsa && \ + echo "StrictHostKeyChecking=no" > $HOME/.ssh/config && \ + echo "UserKnownHostsFile=/dev/null" >> $HOME/.ssh/config +ENTRYPOINT /teuthology.sh diff --git a/containers/teuthology-dev/containerized_node.yaml b/containers/teuthology-dev/containerized_node.yaml new file mode 100644 index 000000000..3e5345a72 --- /dev/null +++ b/containers/teuthology-dev/containerized_node.yaml @@ -0,0 +1,11 @@ +overrides: + ansible.cephlab: + skip_tags: "timezone,nagios,monitoring-scripts,ssh,hostname,pubkeys,zap,sudoers,kerberos,selinux,lvm,ntp-client,resolvconf,packages,cpan,nfs" + vars: + containerized_node: true + ansible_user: root + cm_user: root + start_rpcbind: false + cephadm: + osd_method: raw + no_cgroups_split: 
true diff --git a/containers/teuthology-dev/teuthology.sh b/containers/teuthology-dev/teuthology.sh new file mode 100755 index 000000000..373f6efb8 --- /dev/null +++ b/containers/teuthology-dev/teuthology.sh @@ -0,0 +1,40 @@ +#!/usr/bin/bash +set -e +source /teuthology/virtualenv/bin/activate +set -x +cat /run/secrets/id_rsa > $HOME/.ssh/id_rsa +if [ -n "$TEUTHOLOGY_TESTNODES" ]; then + for node in $(echo $TEUTHOLOGY_TESTNODES | tr , ' '); do + teuthology-update-inventory -m "$TEUTHOLOGY_MACHINE_TYPE" "$node" + done + TEUTHOLOGY_CONF=${TEUTHOLOGY_CONF:-} +else + TEUTHOLOGY_CONF=/teuthology/containerized_node.yaml +fi +export TEUTHOLOGY_MACHINE_TYPE=${TEUTHOLOGY_MACHINE_TYPE:-testnode} +if [ "$TEUTHOLOGY_SUITE" != "none" ]; then + if [ -n "$TEUTHOLOGY_BRANCH" ]; then + TEUTH_BRANCH_FLAG="--teuthology-branch $TEUTHOLOGY_BRANCH" + fi + teuthology-suite -v \ + $TEUTH_BRANCH_FLAG \ + -m "$TEUTHOLOGY_MACHINE_TYPE" \ + --newest 100 \ + --ceph "${TEUTHOLOGY_CEPH_BRANCH:-main}" \ + --ceph-repo "${TEUTHOLOGY_CEPH_REPO:-https://github.com/ceph/ceph.git}" \ + --suite "${TEUTHOLOGY_SUITE:-teuthology:no-ceph}" \ + --suite-branch "${TEUTHOLOGY_SUITE_BRANCH:-main}" \ + --suite-repo "${TEUTHOLOGY_SUITE_REPO:-https://github.com/ceph/ceph.git}" \ + --filter-out "libcephfs,kclient" \ + --force-priority \ + --seed 349 \ + ${TEUTHOLOGY_SUITE_EXTRA_ARGS} \ + $TEUTHOLOGY_CONF + DISPATCHER_EXIT_FLAG='--exit-on-empty-queue' + teuthology-queue -m $TEUTHOLOGY_MACHINE_TYPE -s | \ + python3 -c "import sys, json; assert json.loads(sys.stdin.read())['count'] > 0, 'queue is empty!'" +fi +teuthology-dispatcher -v \ + --log-dir /teuthology/log \ + --tube "$TEUTHOLOGY_MACHINE_TYPE" \ + $DISPATCHER_EXIT_FLAG diff --git a/docs/COMPONENTS.rst b/docs/COMPONENTS.rst new file mode 100644 index 000000000..f4fe7fd8b --- /dev/null +++ b/docs/COMPONENTS.rst @@ -0,0 +1,71 @@ +.. 
_components: + +=================== +Ceph Lab Components +=================== + +The distinct hardware/software components that a lab is composed of and the way +they interact between them is described here. In general, a lab is composed of +a scheduler, worker(s), package builder (`gitbuilder +`__), job database (`paddles +`__), job archive, a web UI (`pulpito +`__) and test nodes. + +| +| + +.. image:: cephlab.png + :align: center + :alt: Components of a Ceph Lab. + +| +| + +In the figure above, every service appears on a separate machine but this is +not a requirement (see :ref:`lab_setup` for an alternative setup). Jobs are +submitted to the scheduler, which are then picked up by dispatcher and +processesed by job supervisors. One supervisor processes and keeps track of a +job (one at a time). The output of the job (logs and files associated to an +execution) is stored in the archive, which is a path in the file system where +the dispatcher is running. The job database contains information about the status +of jobs and test nodes, as well as results of executions (# of tests passed, +failed, etc.). All this information can be visualized in ``pulpito``, the web +UI. For an example, see Ceph community's Lab `here `__. + +Test nodes can be grouped in classes (referred to as ``machine-type``), +allowing teuthology schedule jobs across multiple hardware setups. + +Life of a Teuthology Job +======================== + +The teuthology scheduler exposes a work queue (using `beanstalkd +`__) where jobs are submitted. The life of a +job begins when ``teuthology-suite`` is executed, which is when a job is +prepared and queued (``teuthology-schedule`` is implicitly invoked). When a job +is created (or whenever the status of a job is changed, e.g. from queued to +started), information about the job is recorded in ``paddles``'s internal +database. Depending on the priority of the job, the scheduler eventually +determines when a job can get executed. 
At this point, +``teuthology-dispatcher`` checks the lock status of the requested +machines by querying ``paddles``, acquires locks of the +nodes if they are available, and invokes ``teuthology-dispatcher`` in +``supervisor`` mode. ``supervisor`` reimages the target machines and invokes +``teuthology`` (the command). ``teuthology`` proceeds to execute the job +(execute every task in the YAML job description). After the execution is +completed (ie ``teuthology`` process exits), ``supervisor`` unlocks the +target machines. If the requested machines are not available, the ``dispatcher`` +waits for the machines to be available before running anymore jobs. Results from +the job are stored in the archive directory of the worker for forensic analysis. + +Since `QA suites `__ usually +specify ``install`` and ``ceph`` tasks, we briefly describe what they do. When +a suite is scheduled (via ``teuthology-suite``), the branch that is being +worked against has to be specified (e.g. a git ``SHA`` or ``ref``). Packages +for the given branch and distro are probed on gitbuilder to see if they exist. +Once this and other sanity checks pass, the job is created and scheduled. Once +the job initializes, the ``install`` task pulls and installs Ceph packages from +``gitbuilder``. The installation task might also be preceded by a ``kernel`` +task which first reboots testnodes (and optionally installs) into a specified +kernel. The ``ceph`` task subsequently configures and launches the cluster. At +this point, Ceph is ready to receive requests from other tasks (such as +``rados``). diff --git a/docs/ChangeLog.rst b/docs/ChangeLog.rst new file mode 100644 index 000000000..218a0baa2 --- /dev/null +++ b/docs/ChangeLog.rst @@ -0,0 +1,6 @@ +Changelog +========= + +0.1.0 +----- +* (Actual changelog coming soon) diff --git a/docs/INSTALL.rst b/docs/INSTALL.rst new file mode 100644 index 000000000..28a96c9cd --- /dev/null +++ b/docs/INSTALL.rst @@ -0,0 +1,119 @@ +.. 
_installation_and_setup: + +Installation and setup +====================== + +Ubuntu, Fedora & SUSE/openSUSE +------------------------------ +First, clone the `git repository `__:: + + git clone https://github.com/ceph/teuthology.git + +Next, run the bootstrap script, which will do everything for you assuming +you have ``sudo``:: + + cd teuthology + ./bootstrap + +Finally, activate the ``virtualenv``:: + + source virtualenv/bin/activate + +Run a teuthology command to confirm that everything's working. For instance:: + + teuthology --help + +MacOS X +------- + +The ``bootstrap`` script was recently updated to support MacOS X using `homebrew `_:: + + ./bootstrap + +**Note**: Certain features might not work properly on MacOS X. Patches are +encouraged, but it has never been a goal of ours to run a full ``teuthology`` +setup on a Mac. + +Other operating systems +----------------------- + +Patches are welcomed to add ``bootstrap`` support for other operating systems. Until then, manual installs are possible + +First install the non-PyPI dependencies:: + + python-dev python-pip python-virtualenv libevent-dev python-libvirt + +Next, clone its `git repository `__, +create a `virtualenv `__, and +install dependencies. The instructions are given below:: + + git clone https://github.com/ceph/teuthology/ + cd teuthology + virtualenv --python python3 ./virtualenv + source virtualenv/bin/activate + pip install --upgrade pip + pip install -r requirements.txt + python setup.py develop + + +Teuthology in PyPI +------------------ + +However if you prefer, you may install ``teuthology`` from `PyPI `__:: + + pip install teuthology + + +**Note**: The version in PyPI can be (*far*) behind the development version. + +Or from GitHub:: + + pip install git+https://github.com/ceph/teuthology#egg=teuthology[orchestra] + +where the dependencies for orchestrating are installed. They are used for +interacting with the services to schedule tests and to report the test results. 
+ + +Update Dependencies +------------------- + +We track the dependencies using ``requirements.txt``. These packages are +tested, and should work with teuthology. But if you want to bump up the +versions of them, please use the following command to update these files:: + + ./update-requirements.sh -P + +Please upgrade pip-tool using following command :: + + pip install pip-tools --upgrade + +if the command above fails like:: + + Traceback (most recent call last): + File "/home/kchai/teuthology/virtualenv/bin/pip-compile", line 5, in + from piptools.scripts.compile import cli + File "/home/kchai/teuthology/virtualenv/local/lib/python2.7/site-packages/piptools/scripts/compile.py", line 11, in + from pip.req import InstallRequirement, parse_requirements + ImportError: No module named req + +Add Dependencies +---------------- + +td,dr: please add the new dependencies in both ``setup.py`` and +``requirements.in``. + +We also use ``pip install `` to install teuthology in some Ceph's unit +tests. To cater their needs, some requirements are listed in ``setup.py`` as +well, so that ``pip install`` can pick them up. We could just avoid duplicating +the packages specifications in two places by putting:: + + -e .[orchestra,test] + +in ``requirements.in``. But dependabot includes:: + + -e file:///home/dependabot/dependabot-updater/tmp/dependabot_20200617-72-1n8af4b # via -r requirements.in + +in the generated ``requirements.txt``. This renders the created pull request +useless without human intervention. To appease dependabot, a full-blown +``requirements.in`` collecting all direct dependencies listed by ``setup.py`` +is used instead. diff --git a/docs/LAB_SETUP.rst b/docs/LAB_SETUP.rst new file mode 100644 index 000000000..b967400c5 --- /dev/null +++ b/docs/LAB_SETUP.rst @@ -0,0 +1,142 @@ +.. 
_lab_setup: + +========================== +Teuthology Lab Setup Notes +========================== + +Introduction +============ + +We recently set up a new lab for Ceph testing and decided to document the parts of the process that are most relevant to teuthology. This is the result. + +We started by setting aside two of the test machines: one as the 'teuthology node', and another as the 'paddles/pulpito node'. These would be used to orchestrate automated testing and to store and serve the results on our intranet. + +paddles/pulpito node +==================== + +We're currently running both paddles and pulpito on the same node. We have a proxy server up front listening on port 80 that forwards to the proper service based on which hostname is used. Feel free to modify our `paddles <_static/nginx_paddles>`_ and `pulpito <_static/nginx_pulpito>`_ configurations for your use. + +Do the following as root or as another user with sudo access:: + + sudo apt-get install git python3-dev python3-virtualenv postgresql postgresql-contrib postgresql-server-dev-all supervisor + sudo -u postgres createuser paddles -P + sudo -u postgres createdb paddles + +Create a separate user for paddles and puplito. We used 'paddles' and 'pulpito'. + + +paddles +------- +Follow instructions at https://github.com/ceph/paddles/blob/main/README.rst + + +pulpito +------- +Follow instructions at https://github.com/ceph/pulpito/blob/main/README.rst + + +Starting up +----------- + +Back as the 'root or sudo' user:: + + sudo cp ~paddles/paddles/supervisord_paddles.conf /etc/supervisor/conf.d/paddles.conf + sudo supervisorctl reread && sudo supervisorctl update paddles && sudo supervisorctl start paddles + sudo cp ~pulpito/pulpito/supervisord_pulpito.conf /etc/supervisor/conf.d/pulpito.conf + sudo supervisorctl reread && sudo supervisorctl update pulpito && sudo supervisorctl start pulpito + + +Test Nodes +========== + +Each node needs to have a user named 'ubuntu' with passwordless sudo access. 
+ +It's also necessary to generate an ssh key pair that will be used to provide +passwordless authentication to all the test nodes, and put the public key in +``~/.ssh/authorized_keys`` on all the test nodes. + + +Teuthology Node +=============== + +Create an ``/etc/teuthology.yaml`` that looks like:: + + lab_domain: example.com + lock_server: http://paddles.example.com:8080 + results_server: http://paddles.example.com:8080 + queue_host: localhost + queue_port: 11300 + results_email: you@example.com + archive_base: /home/teuthworker/archive + +Do the following as root or as another user with sudo access: + +Create two additional users: one that simply submits jobs to the queue, and +another that picks them up from the queue and executes them. We use +'teuthology' and 'teuthworker', respectively. + +Give both users passwordless sudo access. + +Copy the ssh key pair that you created to access the test nodes into each of +these users' ``~/.ssh`` directory. + +Install these packages:: + + sudo apt-get -y install git python-dev python-pip python-virtualenv libevent-dev python-libvirt beanstalkd + +Now, set up the two users you just created: + + +Scheduler +--------- +As 'teuthology', do the following:: + + mkdir ~/src + git clone https://github.com/ceph/teuthology.git src/teuthology_main + pushd src/teuthology_main/ + ./bootstrap + popd + + +Worker +------ +As 'teuthworker', do the following:: + + mkdir ~/src + git clone https://github.com/ceph/teuthology.git src/teuthology_main + pushd src/teuthology_main/ + ./bootstrap + popd + mkdir ~/bin + wget -O ~/bin/worker_start https://raw.githubusercontent.com/ceph/teuthology/main/docs/_static/worker_start.sh + echo 'PATH="$HOME/src/teuthology_main/virtualenv/bin:$PATH"' >> ~/.profile + source ~/.profile + mkdir -p ~/archive/worker_logs + worker_start magna 1 + + +Submitting Nodes +================ + +First:: + + wget https://raw.githubusercontent.com/ceph/teuthology/main/docs/_static/create_nodes.py + +Edit 
``create_nodes.py`` to generate the hostnames of the machines you want to submit to paddles. + +Now to do the work:: + + python create_nodes.py + teuthology-lock --owner initial@setup --list-targets > /tmp/targets + teuthology --owner initial@setup /tmp/targets + teuthology-lock --owner initial@setup --unlock -t /tmp/targets + + +Serving Test Logs +================= + +pulpito tries to provide links to test logs. Out-of-the-box, those links will be broken, but are easy to fix. + +First, install your favorite web server on the teuthology node. If you use nginx, you may use `our configuration <_static/nginx_test_logs>`_ as a template. + +Once you've got log files being served, edit paddles' ``config.py`` and update the ``job_log_href_templ`` value. Restart paddles when you're done. diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 000000000..6fd9f9965 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,180 @@ +# Makefile for Sphinx documentation +# + +GENERATED_API_DOCS = {modules,teuthology{,.openstack,.openstack.test,.orchestra,.task,.task.tests}}.rst + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +SOURCEDIR = . +BUILDDIR = build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. 
+PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) $(SOURCEDIR) +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) $(SOURCEDIR) + +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext + +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + rm -rf $(BUILDDIR)/* $(GENERATED_API_DOCS) + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 
+ +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/teuthology.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/teuthology.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/teuthology" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/teuthology" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." 
+ +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." 
+ +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/docs/README.rst b/docs/README.rst new file mode 100644 index 000000000..62f5ce007 --- /dev/null +++ b/docs/README.rst @@ -0,0 +1,149 @@ +=================================================== +`Teuthology` -- The Ceph integration test framework +=================================================== + +``teuthology`` is an automation framework for `Ceph +`__, written in `Python +`__. It is used to run the vast majority of its tests +and was developed because the unique requirements of testing such a highly +distributed system with active kernel development meant that no other framework +existed that could do its job. + +The name '`teuthology `__' refers to the +study of cephalopods. + + +Overview +======== + +The general mode of operation of ``teuthology`` is to remotely orchestrate +operations on remote hosts over SSH, as implemented by `Paramiko +`__. A typical `job` consists of multiple nested +`tasks`, each of which perform operations on a remote host over the network. + +When testing, it is common to group many `jobs` together to form a `test run`. 
+ +If you are new to teuthology and simply want to run existing tests, check out +:ref:`intro_testers` + + +Provided Utilities +================== +* :ref:`teuthology` - Run individual jobs +* :ref:`teuthology-kill` - Kill running jobs or entire runs +* :ref:`teuthology-lock` - Lock, unlock, and update status of machines +* :ref:`teuthology-ls` - List job results by examining an archive directory +* :ref:`teuthology-openstack` - Use OpenStack backend (wrapper around ``teuthology-suite``) +* :ref:`teuthology-queue` - List, or delete, jobs in the queue +* :ref:`teuthology-report` - Submit test results to a web service (we use `paddles `__) +* :ref:`teuthology-results` - Examine a finished run and email results +* :ref:`teuthology-schedule` - Schedule a single job +* :ref:`teuthology-suite` - Schedule a full run based on a suite (see `suites` in `ceph-qa-suite `__) +* :ref:`teuthology-updatekeys` - Update SSH host keys for a machine +* :ref:`teuthology-worker` - Worker daemon to monitor the queue and execute jobs + +For a description of the distinct services that utilities interact with see +:ref:`components`. + +Installation +============ + +See :ref:`installation_and_setup`. + + +Infrastructure +============== + +The examples in this document are based on the lab machine configuration used +by the Red Hat Ceph development and quality assurance teams +(see :ref:`lab_setup`). Other instances of a Ceph Lab being used in a +development or testing environment may differ from these examples. + + +Detailed test configuration +=========================== + +See :ref:`detailed_test_config`. + + +Virtual Machine Support +======================= + +For OpenStack support, see :ref:`openstack-backend` + +For 'vps' support using `downburst `__, see +:ref:`downburst_vms` + + +Test Suites +=========== + +Each suite name is determined by the name of the directory in ``ceph-qa-suite`` +that contains that suite. 
The directory contains subdirectories and yaml files, +which, when assembled, produce valid tests that can be run. The test suite +application generates combinations of these files and thus ends up running a +set of tests based off the data in the directory for the suite. + +To run a suite, enter:: + + teuthology-suite -s [-c ] [-k ] [-e email] [-f flavor] [-t ] [-m ] + +where: + +* ``suite``: the name of the suite (the directory in ceph-qa-suite). +* ``ceph``: ceph branch to be used. +* ``kernel``: version of the kernel to be used. +* ``email``: email address to send the results to. +* ``flavor``: the ceph packages shaman flavor to run against +* ``teuth``: version of teuthology to run +* ``mtype``: machine type of the run +* ``templates``: template file used for further modifying the suite (optional) + +For example, consider:: + + teuthology-suite -s rbd -c wip-fix -k distro -e bob.smith@foo.com -f default -t jewel -m mira + +The above command runs the rbd suite using the wip-fix branch of ceph, the +jewel kernel, with a 'default' ceph shaman build packages flavor, and the teuthology jewel branch +will be used. It will run on mira machines and send an email to +bob.smith@foo.com when it's completed. For more details on +``teuthology-suite``, please consult the output of ``teuthology-suite --help``. +Read more about running integration tests using teuthology at `docs.ceph.com +`__. + +In order for a queued task to be run, a teuthworker thread on +``teuthology.front.sepia.ceph.com`` needs to remove the task from the queue. +On ``teuthology.front.sepia.ceph.com``, run ``ps aux | grep teuthology-worker`` +to view currently running tasks. If no processes are reading from the test +version that you are running, additional teuthworker tasks need to be started. +To start these tasks: + +* copy your build tree to ``/home/teuthworker`` on ``teuthology.front.sepia.ceph.com``. 
+* Give it a unique name (in this example, xxx) +* start up some number of worker threads (as many as machines you are testing with, there are 60 running for the default queue):: + + /home/virtualenv/bin/python + /var/lib/teuthworker/xxx/virtualenv/bin/teuthworker + /var/lib/teuthworker/archive --tube xxx + --log-dir /var/lib/teuthworker/archive/worker_logs + + Note: The threads on teuthology.front.sepia.ceph.com are started via + ~/teuthworker/start.sh. You can use that file as a model for your + own threads, or add to this file if you want your threads to be + more permanent. + +Once the suite completes, an email message is sent to the users specified, and +a large amount of information is left on ``teuthology.front.sepia.ceph.com`` in +``/var/lib/teuthworker/archive``. + +This is symbolically linked to /a for convenience. A new directory is created +whose name consists of a concatenation of the date and time that the suite was +started, the name of the suite, the ceph branch tested, the kernel used, and +the flavor. For every test run there is a directory whose name is the pid +of that test. Each of these directories contains a copy of the +``teuthology.log`` for that process. Other information from the suite is +stored in files in the directory, and task-specific yaml files and other logs +are saved in the subdirectories. + +These logs are also publicly available at +``http://qa-proxy.ceph.com/teuthology/``. diff --git a/docs/_static/create_nodes.py b/docs/_static/create_nodes.py new file mode 100755 index 000000000..3645b613c --- /dev/null +++ b/docs/_static/create_nodes.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python +# A sample script that can be used while setting up a new teuthology lab +# This script will connect to the machines in your lab, and populate a +# paddles instance with their information. +# +# You WILL need to modify it. 
+ +import logging +import sys +from teuthology.orchestra.remote import Remote +from teuthology.lock.ops import update_inventory + +paddles_url = 'http://paddles.example.com/nodes/' + +machine_type = 'typica' +lab_domain = 'example.com' +# Don't change the user. It won't work at this time. +user = 'ubuntu' +# We are populating 'typica003' -> 'typica192' +machine_index_range = range(3, 192) + +log = logging.getLogger(sys.argv[0]) +logging.getLogger("requests.packages.urllib3.connectionpool").setLevel( + logging.WARNING) + + +def get_shortname(machine_type, index): + """ + Given a number, return a hostname. Example: + get_shortname('magna', 3) = 'magna003' + + Modify to suit your needs. + """ + return machine_type + str(index).rjust(3, '0') + + +def get_info(user, fqdn): + remote = Remote('@'.join((user, fqdn))) + return remote.inventory_info + + +def main(): + shortnames = [get_shortname(machine_type, i) for i in machine_index_range] + fqdns = ['.'.join((name, lab_domain)) for name in shortnames] + for fqdn in fqdns: + log.info("Creating %s", fqdn) + base_info = dict( + name=fqdn, + locked=True, + locked_by='initial@setup', + machine_type=machine_type, + description="Initial node creation", + ) + try: + info = get_info(user, fqdn) + base_info.update(info) + base_info['up'] = True + except Exception as exc: + log.error("{fqdn} is down".format(fqdn=fqdn)) + base_info['up'] = False + base_info['description'] = repr(exc) + update_inventory(base_info) + +if __name__ == '__main__': + main() diff --git a/docs/_static/nginx_paddles b/docs/_static/nginx_paddles new file mode 100644 index 000000000..c1e0896f2 --- /dev/null +++ b/docs/_static/nginx_paddles @@ -0,0 +1,11 @@ +server { + server_name paddles.example.com; + proxy_send_timeout 600; + proxy_connect_timeout 240; + location / { + proxy_pass http://paddles.example.com:8080/; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + } + +} diff --git a/docs/_static/nginx_pulpito 
b/docs/_static/nginx_pulpito new file mode 100644 index 000000000..de9147ca8 --- /dev/null +++ b/docs/_static/nginx_pulpito @@ -0,0 +1,11 @@ +server { + server_name pulpito.example.com; + proxy_send_timeout 600; + proxy_connect_timeout 240; + location / { + proxy_pass http://pulpito.example.com:8081/; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + } + +} diff --git a/docs/_static/nginx_test_logs b/docs/_static/nginx_test_logs new file mode 100644 index 000000000..139a0a197 --- /dev/null +++ b/docs/_static/nginx_test_logs @@ -0,0 +1,7 @@ +server { + allow all; + autoindex on; + server_name test_logs.example.com; + root /home/teuthworker/archive; + default_type text/plain; +} diff --git a/docs/_static/worker_start.sh b/docs/_static/worker_start.sh new file mode 100644 index 000000000..e2b4424ae --- /dev/null +++ b/docs/_static/worker_start.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# A simple script used by Red Hat to start teuthology-worker processes. + +ARCHIVE=${ARCHIVE:-"$HOME/archive"} +WORKER_LOGS=$ARCHIVE/worker_logs + +function start_workers_for_tube { + echo "Starting $2 workers for $1" + for i in `seq 1 $2` + do + teuthology-worker -v --archive-dir $ARCHIVE --tube $1 --log-dir $WORKER_LOGS & + done +} + +function start_all { + start_workers_for_tube plana 50 + start_workers_for_tube mira 50 + start_workers_for_tube vps 80 + start_workers_for_tube burnupi 10 + start_workers_for_tube tala 5 + start_workers_for_tube saya 10 + start_workers_for_tube multi 100 +} + +function main { + printf '%s\n' "$*" + if [[ -z "$*" ]] + then + start_all + elif [ ! 
-z "$2" ] && [ "$2" -gt "0" ] + then + start_workers_for_tube $1 $2 + else + echo "usage: $0 [tube_name number_of_workers]" >&2 + exit 1 + fi +} + +main "$@" diff --git a/docs/_themes/ceph/static/font/ApexSans-Book.eot b/docs/_themes/ceph/static/font/ApexSans-Book.eot new file mode 100644 index 000000000..332c8cbe3 Binary files /dev/null and b/docs/_themes/ceph/static/font/ApexSans-Book.eot differ diff --git a/docs/_themes/ceph/static/font/ApexSans-Book.svg b/docs/_themes/ceph/static/font/ApexSans-Book.svg new file mode 100644 index 000000000..8af9af2bb --- /dev/null +++ b/docs/_themes/ceph/static/font/ApexSans-Book.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/_themes/ceph/static/font/ApexSans-Book.ttf b/docs/_themes/ceph/static/font/ApexSans-Book.ttf new file mode 100644 index 000000000..42a008463 Binary files /dev/null and b/docs/_themes/ceph/static/font/ApexSans-Book.ttf differ diff --git a/docs/_themes/ceph/static/font/ApexSans-Book.woff b/docs/_themes/ceph/static/font/ApexSans-Book.woff new file mode 100644 index 000000000..681a70ee9 Binary files /dev/null and b/docs/_themes/ceph/static/font/ApexSans-Book.woff differ diff --git a/docs/_themes/ceph/static/font/ApexSans-Medium.eot b/docs/_themes/ceph/static/font/ApexSans-Medium.eot new file mode 100644 index 000000000..e06fd2154 Binary files /dev/null and b/docs/_themes/ceph/static/font/ApexSans-Medium.eot differ diff --git a/docs/_themes/ceph/static/font/ApexSans-Medium.svg b/docs/_themes/ceph/static/font/ApexSans-Medium.svg new file mode 100644 index 000000000..6c624ec97 --- /dev/null +++ b/docs/_themes/ceph/static/font/ApexSans-Medium.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/_themes/ceph/static/font/ApexSans-Medium.ttf b/docs/_themes/ceph/static/font/ApexSans-Medium.ttf new file mode 100644 index 000000000..44c281e33 Binary files /dev/null and b/docs/_themes/ceph/static/font/ApexSans-Medium.ttf differ diff --git a/docs/_themes/ceph/static/font/ApexSans-Medium.woff 
b/docs/_themes/ceph/static/font/ApexSans-Medium.woff new file mode 100644 index 000000000..b7c88194b Binary files /dev/null and b/docs/_themes/ceph/static/font/ApexSans-Medium.woff differ diff --git a/docs/_themes/ceph/static/nature.css_t b/docs/_themes/ceph/static/nature.css_t new file mode 100644 index 000000000..394a6339f --- /dev/null +++ b/docs/_themes/ceph/static/nature.css_t @@ -0,0 +1,325 @@ +/* + * nature.css_t + * ~~~~~~~~~~~~ + * + * Sphinx stylesheet -- nature theme. + * + * :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. + * + */ + +@import url("basic.css"); + +/* -- page layout ----------------------------------------------------------- */ + +@font-face { + font-family: 'ApexSansMedium'; + src: url('font/ApexSans-Medium.eot'); + src: url('font/ApexSans-Medium.eot?#iefix') format('embedded-opentype'), + url('font/ApexSans-Medium.woff') format('woff'), + url('font/ApexSans-Medium.ttf') format('truetype'), + url('font/ApexSans-Medium.svg#FontAwesome') format('svg'); + font-weight: normal; + font-style: normal; +} + +@font-face { + font-family: 'ApexSansBook'; + src: url('font/ApexSans-Book.eot'); + src: url('font/ApexSans-Book.eot?#iefix') format('embedded-opentype'), + url('font/ApexSans-Book.woff') format('woff'), + url('font/ApexSans-Book.ttf') format('truetype'), + url('font/ApexSans-Book.svg#FontAwesome') format('svg'); + font-weight: normal; + font-style: normal; +} + +body { + font: 14px/1.4 Helvetica, Arial, sans-serif; + background-color: #E6E8E8; + color: #37424A; + margin: 0; + padding: 0; + border-top: 5px solid #F05C56; +} + +div.documentwrapper { + float: left; + width: 100%; +} + +div.bodywrapper { + margin: 0 0 0 330px; +} + +hr { + border: 1px solid #B1B4B6; +} + +div.document { + background-color: #ffffff; +} + +div.body { + background-color: #ffffff; + color: #3E4349; + padding: 0 30px 30px 30px; +} + +div.footer { + color: #222B31; + width: 100%; + padding: 13px 0; + 
text-align: center; + font-size: 75%; +} + +div.footer a { + color: #444; + text-decoration: underline; +} + +div.related { + background-color: #80D2DC; + line-height: 32px; + color: #37424A; + // text-shadow: 0px 1px 0 #444; + font-size: 100%; + border-top: #9C4850 5px solid; +} + +div.related a { + color: #37424A; + text-decoration: none; +} + +div.related a:hover { + color: #fff; + // text-decoration: underline; +} + +div.sphinxsidebar { + // font-size: 100%; + line-height: 1.5em; + width: 330px; +} + +div.sphinxsidebarwrapper{ + padding: 20px 0; + background-color: #efefef; +} + +div.sphinxsidebar h3, +div.sphinxsidebar h4 { + font-family: ApexSansMedium; + color: #e6e8e8; + font-size: 1.2em; + font-weight: normal; + margin: 0; + padding: 5px 10px; + background-color: #5e6a71; + // text-shadow: 1px 1px 0 white; + text-transform: uppercase; +} + +div.sphinxsidebar h4{ + font-size: 1.1em; +} + +div.sphinxsidebar h3 a { + color: #e6e8e8; +} + + +div.sphinxsidebar p { + color: #888; + padding: 5px 20px; +} + +div.sphinxsidebar p.topless { +} + +div.sphinxsidebar ul { + margin: 10px 5px 10px 20px; + padding: 0; + color: #000; +} + +div.sphinxsidebar a { + color: #444; +} + +div.sphinxsidebar input { + border: 1px solid #ccc; + font-family: sans-serif; + font-size: 1em; +} + +div.sphinxsidebar input[type=text]{ + margin-left: 20px; +} + +/* -- body styles ----------------------------------------------------------- */ + +a { + color: #F05C56; + text-decoration: none; +} + +a:hover { + color: #F05C56; + text-decoration: underline; +} + +div.body h1, +div.body h2, +div.body h3, +div.body h4, +div.body h5, +div.body h6 { + // font-family: ApexSansMedium; + // background-color: #80D2DC; + // font-weight: normal; + // color: #37424a; + margin: 30px 0px 10px 0px; + padding: 5px 0 5px 0px; + // text-shadow: 0px 1px 0 white; + text-transform: uppercase; +} + +div.body h1 { font: 20px/2.0 ApexSansBook; color: #37424A; border-top: 20px solid white; margin-top: 0; } +div.body h2 
{ font: 18px/1.8 ApexSansMedium; background-color: #5E6A71; color: #E6E8E8; padding: 5px 10px; } +div.body h3 { font: 16px/1.6 ApexSansMedium; color: #37424A; } +div.body h4 { font: 14px/1.4 Helvetica, Arial, sans-serif; color: #37424A; } +div.body h5 { font: 12px/1.2 Helvetica, Arial, sans-serif; color: #37424A; } +div.body h6 { font-size: 100%; color: #37424A; } + +// div.body h2 { font-size: 150%; background-color: #E6E8E8; color: #37424A; } +// div.body h3 { font-size: 120%; background-color: #E6E8E8; color: #37424A; } +// div.body h4 { font-size: 110%; background-color: #E6E8E8; color: #37424A; } +// div.body h5 { font-size: 100%; background-color: #E6E8E8; color: #37424A; } +// div.body h6 { font-size: 100%; background-color: #E6E8E8; color: #37424A; } + +a.headerlink { + color: #c60f0f; + font-size: 0.8em; + padding: 0 4px 0 4px; + text-decoration: none; +} + +a.headerlink:hover { + background-color: #c60f0f; + color: white; +} + +div.body p, div.body dd, div.body li { + line-height: 1.5em; +} + +div.admonition p.admonition-title + p { + display: inline; +} + +div.highlight{ + background-color: white; +} + +div.note { + background-color: #e6e8e8; + border: 1px solid #ccc; +} + +div.seealso { + background-color: #ffc; + border: 1px solid #ff6; +} + +div.topic { + background-color: #efefef; +} + +div.warning { + background-color: #F05C56; + border: 1px solid #9C4850; + color: #fff; +} + +p.admonition-title { + display: inline; +} + +p.admonition-title:after { + content: ":"; +} + +pre { + padding: 10px; + background-color: White; + color: #222; + line-height: 1.2em; + border: 1px solid #5e6a71; + font-size: 1.1em; + margin: 1.5em; + -webkit-box-shadow: 1px 1px 1px #e6e8e8; + -moz-box-shadow: 1px 1px 1px #e6e8e8; +} + +tt { + background-color: #ecf0f3; + color: #222; + /* padding: 1px 2px; */ + font-size: 15px; + font-family: monospace; +} + +.viewcode-back { + font-family: Arial, sans-serif; +} + +div.viewcode-block:target { + background-color: #f4debf; + 
border-top: 1px solid #ac9; + border-bottom: 1px solid #ac9; +} + +table.docutils { + margin: 1.5em; +} + +div.sidebar { + border: 1px solid #5E6A71; + background-color: #E6E8E8; +} + +div.admonition.tip { + background-color: #80D2DC; + border: 1px solid #55AEBA; +} + +div.admonition.important { + background-color: #F05C56; + border: 1px solid #9C4850; + color: #fff; +} + +div.tip tt.literal { + background-color: #55aeba; + color: #fff; +} + +div.important tt.literal { + background-color: #9C4850; + color: #fff; +} + +h2 .literal { + color: #fff; + background-color: #37424a; +} + +dl.glossary dt { + font-size: 1.0em; + padding-top:20px; + +} \ No newline at end of file diff --git a/docs/_themes/ceph/theme.conf b/docs/_themes/ceph/theme.conf new file mode 100644 index 000000000..1cc400446 --- /dev/null +++ b/docs/_themes/ceph/theme.conf @@ -0,0 +1,4 @@ +[theme] +inherit = basic +stylesheet = nature.css +pygments_style = tango diff --git a/docs/cephlab.png b/docs/cephlab.png new file mode 100644 index 000000000..4cdcea286 Binary files /dev/null and b/docs/cephlab.png differ diff --git a/docs/cephlab.svg b/docs/cephlab.svg new file mode 100644 index 000000000..d1b446d49 --- /dev/null +++ b/docs/cephlab.svg @@ -0,0 +1,3 @@ + + +
laptop or workstation
laptop or workstation
paddles
paddles
pulpito
pulpito
teuthology dispatcher
teuthology disp...
teuthology scheduler
teuthology sche...
package builder
package builder
















.
.
test nodes

....


                       . . . .

smithi
. . . ....


                       . . . .

mira
. . . ....
smithi1
smithi1
smithi2
smithi2
smithiN
smithiN
mira1
mira1
mira2
mira2
miraN
miraN
job supervisors
job superviso...
beanstalkd work queue
beanstalkd work...
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/docs/commands/list.rst b/docs/commands/list.rst new file mode 100644 index 000000000..0ac9437ea --- /dev/null +++ b/docs/commands/list.rst @@ -0,0 +1,9 @@ +Command line interface (CLI) +============================ + +Help output of the available command line tools for teuthology. + +.. toctree:: + :glob: + + * diff --git a/docs/commands/teuthology-describe.rst b/docs/commands/teuthology-describe.rst new file mode 100644 index 000000000..fb1f95c3a --- /dev/null +++ b/docs/commands/teuthology-describe.rst @@ -0,0 +1,4 @@ +teuthology-describe +=================== + +.. program-output:: teuthology-describe --help diff --git a/docs/commands/teuthology-dispatcher.rst b/docs/commands/teuthology-dispatcher.rst new file mode 100644 index 000000000..3fa5166a1 --- /dev/null +++ b/docs/commands/teuthology-dispatcher.rst @@ -0,0 +1,9 @@ +teuthology-dispatcher +===================== + +.. program-output:: teuthology-dispatcher --help + +trouble-shooting notes: +======================= + +- Github unreachable kills dispatcher - The dispatcher might be killed when github becomes unreachable, e.g., https://tracker.ceph.com/issues/54366 \ No newline at end of file diff --git a/docs/commands/teuthology-kill.rst b/docs/commands/teuthology-kill.rst new file mode 100644 index 000000000..6bc084165 --- /dev/null +++ b/docs/commands/teuthology-kill.rst @@ -0,0 +1,4 @@ +teuthology-kill +=============== + +.. program-output:: teuthology-kill --help diff --git a/docs/commands/teuthology-lock.rst b/docs/commands/teuthology-lock.rst new file mode 100644 index 000000000..5123175f0 --- /dev/null +++ b/docs/commands/teuthology-lock.rst @@ -0,0 +1,4 @@ +teuthology-lock +=============== + +.. program-output:: teuthology-lock --help diff --git a/docs/commands/teuthology-ls.rst b/docs/commands/teuthology-ls.rst new file mode 100644 index 000000000..856f561f7 --- /dev/null +++ b/docs/commands/teuthology-ls.rst @@ -0,0 +1,4 @@ +teuthology-ls +============= + +.. 
program-output:: teuthology-ls --help diff --git a/docs/commands/teuthology-openstack.rst b/docs/commands/teuthology-openstack.rst new file mode 100644 index 000000000..501fbfe80 --- /dev/null +++ b/docs/commands/teuthology-openstack.rst @@ -0,0 +1,4 @@ +teuthology-openstack +==================== + +.. program-output:: teuthology-openstack --help diff --git a/docs/commands/teuthology-prune-logs.rst b/docs/commands/teuthology-prune-logs.rst new file mode 100644 index 000000000..c534d5739 --- /dev/null +++ b/docs/commands/teuthology-prune-logs.rst @@ -0,0 +1,4 @@ +teuthology-prune-logs +===================== + +.. program-output:: teuthology-prune-logs --help diff --git a/docs/commands/teuthology-queue.rst b/docs/commands/teuthology-queue.rst new file mode 100644 index 000000000..3f8c22283 --- /dev/null +++ b/docs/commands/teuthology-queue.rst @@ -0,0 +1,4 @@ +teuthology-queue +================ + +.. program-output:: teuthology-queue --help diff --git a/docs/commands/teuthology-reimage.rst b/docs/commands/teuthology-reimage.rst new file mode 100644 index 000000000..eb085af79 --- /dev/null +++ b/docs/commands/teuthology-reimage.rst @@ -0,0 +1,4 @@ +teuthology-reimage +================== + +.. program-output:: teuthology-reimage --help diff --git a/docs/commands/teuthology-report.rst b/docs/commands/teuthology-report.rst new file mode 100644 index 000000000..bdd3c49cc --- /dev/null +++ b/docs/commands/teuthology-report.rst @@ -0,0 +1,4 @@ +teuthology-report +================= + +.. program-output:: teuthology-report --help diff --git a/docs/commands/teuthology-results.rst b/docs/commands/teuthology-results.rst new file mode 100644 index 000000000..22c3eee8f --- /dev/null +++ b/docs/commands/teuthology-results.rst @@ -0,0 +1,4 @@ +teuthology-results +================== + +.. 
program-output:: teuthology-results --help diff --git a/docs/commands/teuthology-schedule.rst b/docs/commands/teuthology-schedule.rst new file mode 100644 index 000000000..3c03c3f57 --- /dev/null +++ b/docs/commands/teuthology-schedule.rst @@ -0,0 +1,4 @@ +teuthology-schedule +=================== + +.. program-output:: teuthology-schedule --help diff --git a/docs/commands/teuthology-suite.rst b/docs/commands/teuthology-suite.rst new file mode 100644 index 000000000..85c63eed7 --- /dev/null +++ b/docs/commands/teuthology-suite.rst @@ -0,0 +1,4 @@ +teuthology-suite +================ + +.. program-output:: teuthology-suite --help diff --git a/docs/commands/teuthology-update-inventory.rst b/docs/commands/teuthology-update-inventory.rst new file mode 100644 index 000000000..dc4e216ba --- /dev/null +++ b/docs/commands/teuthology-update-inventory.rst @@ -0,0 +1,4 @@ +teuthology-update-inventory +=========================== + +.. program-output:: teuthology-update-inventory --help diff --git a/docs/commands/teuthology-updatekeys.rst b/docs/commands/teuthology-updatekeys.rst new file mode 100644 index 000000000..c0525aecc --- /dev/null +++ b/docs/commands/teuthology-updatekeys.rst @@ -0,0 +1,4 @@ +teuthology-updatekeys +===================== + +.. program-output:: teuthology-updatekeys --help diff --git a/docs/commands/teuthology-wait.rst b/docs/commands/teuthology-wait.rst new file mode 100644 index 000000000..072b87e70 --- /dev/null +++ b/docs/commands/teuthology-wait.rst @@ -0,0 +1,4 @@ +teuthology-wait +===================== + +.. program-output:: teuthology-wait --help diff --git a/docs/commands/teuthology-worker.rst b/docs/commands/teuthology-worker.rst new file mode 100644 index 000000000..c0096092c --- /dev/null +++ b/docs/commands/teuthology-worker.rst @@ -0,0 +1,4 @@ +teuthology-worker +================= + +.. 
program-output:: teuthology-worker --help diff --git a/docs/commands/teuthology.rst b/docs/commands/teuthology.rst new file mode 100644 index 000000000..0b7fdf2ab --- /dev/null +++ b/docs/commands/teuthology.rst @@ -0,0 +1,4 @@ +teuthology +========== + +.. program-output:: teuthology --help diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 000000000..bce967b1e --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,262 @@ +# -*- coding: utf-8 -*- +# +# teuthology documentation build configuration file, created by +# sphinx-quickstart on Thu Aug 7 12:30:36 2014. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosectionlabel', + 'sphinxcontrib.programoutput', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The root toctree document. +root_doc = 'index' + +# General information about the project. 
+project = u'teuthology' +copyright = u'2014, Inktank Storage, Inc.' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0.1.0' +# The full version, including alpha/beta/rc tags. +release = '0.1.0' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = [] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'ceph' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. 
For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +html_theme_path = ['_themes'] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +#html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. 
+#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'teuthologydoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + ('index', 'teuthology.tex', u'teuthology Documentation', + u'Inktank Storage, Inc.', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. 
+#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'teuthology', u'teuthology Documentation', + [u'Inktank Storage, Inc.'], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'teuthology', u'teuthology Documentation', + u'Inktank Storage, Inc.', 'teuthology', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False diff --git a/docs/detailed_test_config.rst b/docs/detailed_test_config.rst new file mode 100644 index 000000000..2a46c4d77 --- /dev/null +++ b/docs/detailed_test_config.rst @@ -0,0 +1,309 @@ +.. _detailed_test_config: + +=========================== +Detailed Test Configuration +=========================== + +Test configuration +================== + +An integration test run takes three items of configuration: + +- ``targets``: what hosts to run on; this is a dictionary mapping + hosts to ssh host keys, like: + "username@hostname.example.com: ssh-rsa long_hostkey_here" +- ``roles``: how to use the hosts; this is a list of lists, where each + entry lists all the roles to be run on a single host. For example, a + single entry might say ``[mon.1, osd.1]``. 
+- ``tasks``: how to set up the cluster and what tests to run on it; + see below for examples + +The format for this configuration is `YAML `__, a +structured data format that is still human-readable and editable. + +For example, a full config for a test run that sets up a three-machine +cluster, mounts Ceph via ``ceph-fuse``, and leaves you at an interactive +Python prompt for manual exploration (and enabling you to SSH in to +the nodes & use the live cluster ad hoc), might look like this:: + + roles: + - [mon.0, mds.0, osd.0] + - [mon.1, osd.1] + - [mon.2, client.0] + targets: + ubuntu@host07.example.com: ssh-rsa host07_ssh_key + ubuntu@host08.example.com: ssh-rsa host08_ssh_key + ubuntu@host09.example.com: ssh-rsa host09_ssh_key + tasks: + - install: + - ceph: + - ceph-fuse: [client.0] + - interactive: + repo: git://git.ceph.com/ceph.git + +The number of entries under ``roles`` and ``targets`` must match. + +Note the colon after every task name in the ``tasks`` section. Also note the +dashes before each task. This is the YAML syntax for an ordered list and +specifies the order in which tasks are executed. + +The ``install`` task needs to precede all other tasks. + +The listed targets need resolvable hostnames. If you do not have a DNS server +running, you can add entries to ``/etc/hosts``. You also need to be able to SSH +in to the listed targets without passphrases, and the remote user needs to have +passwordless `sudo` access. Note that the ssh keys at the end of the +``targets`` entries are the public ssh keys for the hosts. These are +located in /etc/ssh/ssh_host_rsa_key.pub + +If you saved the above file as ``example.yaml``, you could run +teuthology on it like this:: + + ./virtualenv/bin/teuthology example.yaml + +It is possible to configure installation so that specifying targets and host +keys can be omitted. Teuthology is run with the ``--lock`` option which locks +the targets based on ``roles`` in YAML. 
Teuthology grabs machines from a pool of +available test machines; but since most times machines are busy, you might have +to wait until they are free or else command fails due to lack of available +machines. To avoid this you can specify ``--block`` with ``--lock`` which will +make teuthology retry until it finds and locks required machines. + +You could also pass the ``-v`` option for more verbose execution. See +``teuthology --help`` for more options. + + +Multiple config files +--------------------- + +You can pass multiple files as arguments to teuthology. Each one +will be read as a config file, and their contents will be merged. This +allows you to share definitions of what a "simple 3 node cluster" +is. The source tree comes with ``roles/3-simple.yaml``, so we could +skip the ``roles`` section in the above ``example.yaml`` and then +run:: + + ./virtualenv/bin/teuthology roles/3-simple.yaml example.yaml + + +Reserving target machines +------------------------- + +Teuthology automatically locks nodes for you if you specify the +``--lock`` option. Without this option, you must specify machines to +run on in a ``targets.yaml`` file, and lock them using +teuthology-lock. + +Note that the default owner of a machine is of the form: USER@HOST where USER +is the user who issued the lock command and host is the machine on which the +lock command was run. + +You can override this with the ``--owner`` option when running +teuthology or teuthology-lock. + +With ``teuthology-lock`` you can also add a description, so you can +remember which tests you were running. This can be done when +locking or unlocking machines, or as a separate action with the +``--update`` option. 
To lock 3 machines and set a description, run:: + + ./virtualenv/bin/teuthology-lock --lock-many 3 --desc 'test foo' + +If machines become unusable for some reason, you can mark them down:: + + ./virtualenv/bin/teuthology-lock --update --status down machine1 machine2 + +To see the status of all machines, use the ``--list`` option. This can +be restricted to particular machines as well:: + + ./virtualenv/bin/teuthology-lock --list machine1 machine2 + + +Choosing machines for a job +--------------------------- + +It is possible to run jobs against machines of one or more ``machine_type`` +values. It is also possible to tell ``teuthology`` to only select those +machines which match the following criteria specified in the job's YAML: + +* ``os_type`` (e.g. 'rhel', 'ubuntu') +* ``os_version`` (e.g. '7.0', '14.04') +* ``arch`` (e.g. 'x86_64') + + +Tasks +===== + +A task is a Python module in the ``teuthology.task`` package, with a +callable named ``task``. It gets the following arguments: + +- ``ctx``: a context that is available through the lifetime of the + test run, and has useful attributes such as ``cluster``, letting the + task access the remote hosts. Tasks can also store their internal + state here. (TODO beware of namespace collisions.) +- ``config``: the data structure after the colon in the config file, + e.g. for the above ``ceph-fuse`` example, it would be a list like + ``["client.0"]``. + +Tasks can be simple functions, called once in the order they are +listed in ``tasks``. But sometimes it makes sense for a task to be +able to clean up after itself: for example, unmounting the filesystem +after a test run. A task callable that returns a Python `context +manager +`__ +will have the manager added to a stack, and the stack will be unwound +at the end of the run. This means the cleanup actions are run in +reverse order, both on success and failure. 
A nice way of writing +context managers is the ``contextlib.contextmanager`` decorator; look +for that string in the existing tasks to see examples, and note where +they use ``yield``. + +Further details on some of the more complex tasks such as install or workunit +can be obtained via python help. For example:: + + >>> import teuthology.task.workunit + >>> help(teuthology.task.workunit) + +displays a page of more documentation and more concrete examples. + +Some of the more important / commonly used tasks include: + +* ``ansible``: Run the ansible task. +* ``install``: by default, the install task goes to gitbuilder and installs the + results of the latest build. You can, however, add additional parameters to + the test configuration to cause it to install any branch, SHA, archive or + URL. The following are valid parameters. + + - ``branch``: specify a branch (firefly, giant...) + + - ``flavor``: specify a flavor (next, unstable...). Flavors can be thought of + as subsets of branches. Sometimes (unstable, for example) they may have a + predefined meaning. + + - ``project``: specify a project (ceph, samba...) + + - ``sha1``: install the build with this sha1 value. + + - ``tag``: specify a tag/identifying text for this build (v47.2, v48.1...) + +* ``ceph``: Bring up Ceph + +* ``overrides``: override behavior. Typically, this includes sub-tasks being + overridden. Overrides technically is not a task (there is no 'def task' in + an overrides.py file), but from a user's standpoint can be described as + behaving like one. + Sub-tasks can nest further information. For example, overrides + of install tasks are project specific, so the following section of a yaml + file would cause all ceph installations to default to using the jewel + branch:: + + overrides: + install: + ceph: + branch: jewel + +* ``workunit``: workunits are a way of grouping tasks and behavior on targets. 
+* ``sequential``: group the sub-tasks into a unit where the sub-tasks run + sequentially as listed. +* ``parallel``: group the sub-tasks into a unit where the sub-tasks all run in + parallel. + +Sequential and parallel tasks can be nested. Tasks run sequentially unless +specified otherwise. + +The above list is a very incomplete description of the tasks available on +teuthology. The teuthology/task subdirectory contains the teuthology-specific +python files that implement tasks. + +Extra tasks used by teuthology can be found in ceph-qa-suite/tasks. These +tasks are not needed for teuthology to run, but do test specific independent +features. A user who wants to define a test for a new feature can implement +new tasks in this directory. + +Many of these tasks are used to run python scripts that are defined in the +ceph/ceph-qa-suite. + +Troubleshooting +=============== + +Postmortem Debugging +-------------------- + +After completion of a test, the ``archive`` subdirectory is archived under +the corresponding ``remote`` subdirectory. We can disable this behavior +using the top-level configuration, like:: + + archive-on-error: true + +If ``archive-on-error`` is ``true``, the ``archive`` subdirectory is +archived only for failed tests. + +If the size of the archived file exceeds 128MB, the file will be compressed +using GZip. This threshold can be configured using the top-level option +named ``log-compress-min-size``, like:: + + log-compress-min-size: 256GB + +Other size unit postfixes are also supported, +see `humanfriendly document `__ +for more details. + +Situ Debugging +-------------- +Sometimes when a bug triggers, instead of automatic cleanup, you want +to explore the system as is. Adding a top-level:: + + interactive-on-error: true + +as a config file for teuthology will make that possible. With that +option, any *task* that fails will have the ``interactive`` task +called after it. 
This means that before any cleanup happens, you get a +chance to inspect the system -- both through Teuthology and via extra +SSH connections -- and the cleanup completes only when you choose. +Just exit the interactive Python session to continue the cleanup. + +You can enable interactive-on-error with the ``teuthology`` command option +``--interactive-on-error`` + +Interactive task facilities +=========================== + +The ``interactive`` task presents a prompt for you to interact with the +teuthology configuration. The ``ctx`` variable is available to explore, +and a ``pprint.PrettyPrinter().pprint`` object is added for convenience as +``pp``, so you can do things like pp(dict-of-interest) to see a formatted +view of the dict. + +This is also useful to pause the execution of the test between two tasks, +either to perform ad hoc operations, or to examine the state of the cluster. +Hit ``control-D`` to continue when done. + +You need to nest ``interactive`` underneath ``tasks`` in your config. You +can have has many ``interactive`` tasks as needed in your task list. + +An example:: + + tasks: + - ceph: + - interactive: + +Test Sandbox Directory +====================== + +Teuthology currently places most test files and mount points in a +sandbox directory, defaulting to ``/home/$USER/cephtest``. To change +the location of the sandbox directory, the following option can be +specified in ``$HOME/.teuthology.yaml``:: + + test_path: + +Shaman options +============== + +Shaman is a helper class which could be used to build the uri for specified +packages based the 'shaman_host': 'shaman.ceph.com'. 
+ +Options:: + + use_shaman: True # Enable to use Shaman, False as default + shaman: + force_noarch: True # Force to use "noarch" to build the uri diff --git a/docs/docker-compose/README.md b/docs/docker-compose/README.md new file mode 100644 index 000000000..af346f9db --- /dev/null +++ b/docs/docker-compose/README.md @@ -0,0 +1,93 @@ +# Teuthology Development Environment Instruction + +The purpose of this guide is to help developers set +up a development environment for Teuthology. We will be using +Docker to set up all the containers for +Postgres, Paddles, Pulpito, Beanstalk, and Teuthology. + +Currently, it's possible to execute against two classes of test nodes: + +* Using containerized test nodes + * Advantage: No need for a lab at all! + * Disadvantage: Cannot run all Ceph tests; best for exercising the framework itself +* Using nodes from an existing lab (e.g. the Sepia lab) + * Advantage: Can run all Ceph tests + * Disadvantage: Requires lab access + + +Additionally, there are two modes of execution: +* One-shot (the default): Containers start up, schedule and run the `teuthology:no-ceph` suite, and shut down. Success or failure is indicated by the `start.sh` exit code. +* Wait: Containers start up, and `teuthology-dispatcher` is started, but no jobs are scheduled. Runs until the user presses Ctrl-C or `docker compose down` is run. + +The teuthology container will be built with code from the repository clone that's currently in use. + +## Prerequisites + +### Installing and Running Docker + +For Docker installation see: +https://docs.docker.com/get-docker/ + +### Using Containerized Nodes + +There's nothing special to do; see the Running Tests section below. + +### Using an Existing Lab + +This document assumes you have access to the lab that you intend to use, and that you're already familiar with its VPN and SSH infrastructure. + +Depending on your local operating system, it may be necessary to connect to the VPN before starting Docker. 
+ +#### Using your SSH private key + +In your local shell, simply: +```bash +export SSH_PRIVKEY_PATH=$HOME/.ssh/id_rsa +``` +The teuthology container will write it to a file at runtime. + +#### Reserving Machines in the Lab + +Taking the Sepia lab as an example once again, most users will want to do something like: + +```bash +ssh teuthology.front.sepia.ceph.com +~/teuthology/virtualenv/bin/teuthology-lock \ + --lock-many 1 \ + --machine-type smithi \ + --desc "teuthology dev testing" +``` + +When you are done, don't forget to unlock! + +#### Using Lab Machines + +Once you have your machines locked, you need to provide a list of their hostnames and their machine type: + +```bash +export TESTNODES="smithi999.front.sepia.ceph.com,smithi123.front.sepia.ceph.com" +export MACHINE_TYPE="smithi" +``` + +If the lab uses a "secrets" or "inventory" repository for [ceph-cm-ansible](https://github.com/ceph/ceph-cm-ansible), you'll need to provide a URL for that. In Sepia: +```bash +export ANSIBLE_INVENTORY_REPO="https://github.com/ceph/ceph-sepia-secrets" +``` +This repo will be cloned locally, using your existing `git` configuration, and copied into the teuthology container at build time. + +## Running Tests + +To run the default `teuthology:no-ceph` suite in one-shot mode: +```bash +./start.sh +``` + +To run in wait mode: +```bash +TEUTHOLOGY_WAIT=1 ./start.sh +``` + +To schedule tests in wait mode: +```bash +docker exec docker-compose_teuthology_1 /venv/bin/teuthology-suite ... 
+``` diff --git a/docs/docker-compose/db/01-init.sh b/docs/docker-compose/db/01-init.sh new file mode 100755 index 000000000..b9e5adc2f --- /dev/null +++ b/docs/docker-compose/db/01-init.sh @@ -0,0 +1,8 @@ +set -e +export PGPASSWORD=$POSTGRES_PASSWORD; +psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" --dbname "$POSTGRES_DB" <<-EOSQL + CREATE USER $APP_DB_USER WITH PASSWORD '$APP_DB_PASS'; + CREATE DATABASE $APP_DB_NAME; + GRANT ALL PRIVILEGES ON DATABASE $APP_DB_NAME TO $APP_DB_USER; + \connect $APP_DB_NAME $APP_DB_USER +EOSQL \ No newline at end of file diff --git a/docs/docker-compose/docker-compose.yml b/docs/docker-compose/docker-compose.yml new file mode 100644 index 000000000..8c97cd252 --- /dev/null +++ b/docs/docker-compose/docker-compose.yml @@ -0,0 +1,94 @@ +version: '3.8' + +services: + postgres: + image: postgres:14 + healthcheck: + test: [ "CMD", "pg_isready", "-q", "-d", "paddles", "-U", "admin" ] + timeout: 5s + interval: 10s + retries: 2 + environment: + - POSTGRES_USER=root + - POSTGRES_PASSWORD=password + - APP_DB_USER=admin + - APP_DB_PASS=password + - APP_DB_NAME=paddles + volumes: + - ./db:/docker-entrypoint-initdb.d/ + ports: + - 5432:5432 + paddles: + image: quay.io/ceph-infra/paddles + environment: + PADDLES_SERVER_HOST: 0.0.0.0 + PADDLES_SQLALCHEMY_URL: postgresql+psycopg2://admin:password@postgres:5432/paddles + depends_on: + postgres: + condition: service_healthy + links: + - postgres + healthcheck: + test: ["CMD", "curl", "-f", "http://0.0.0.0:8080"] + timeout: 5s + interval: 30s + retries: 2 + ports: + - 8080:8080 + pulpito: + image: quay.io/ceph-infra/pulpito + environment: + PULPITO_PADDLES_ADDRESS: http://paddles:8080 + depends_on: + paddles: + condition: service_healthy + links: + - paddles + healthcheck: + test: ["CMD", "curl", "-f", "http://0.0.0.0:8081"] + timeout: 5s + interval: 10s + retries: 2 + ports: + - 8081:8081 + beanstalk: + build: ../../beanstalk/alpine + ports: + - "11300:11300" + teuthology: + build: + context: 
../../ + dockerfile: ./docs/docker-compose/teuthology/Dockerfile + args: + SSH_PRIVKEY_FILE: $SSH_PRIVKEY_FILE + depends_on: + paddles: + condition: service_healthy + links: + - paddles + - beanstalk + environment: + SSH_PRIVKEY: + SSH_PRIVKEY_FILE: + MACHINE_TYPE: + TESTNODES: + TEUTHOLOGY_WAIT: + TEUTH_BRANCH: + volumes: + - /tmp/archive_dir:/archive_dir:rw + testnode: + build: + context: ./testnode + dockerfile: ./Dockerfile + deploy: + replicas: 3 + depends_on: + paddles: + condition: service_healthy + links: + - paddles + ports: + - "22" + environment: + SSH_PUBKEY: + platform: linux/amd64 diff --git a/docs/docker-compose/start.sh b/docs/docker-compose/start.sh new file mode 100755 index 000000000..b4132ecb8 --- /dev/null +++ b/docs/docker-compose/start.sh @@ -0,0 +1,48 @@ +#!/bin/bash +set -e +export TEUTHOLOGY_BRANCH=${TEUTHOLOGY_BRANCH:-$(git branch --show-current)} +export TEUTH_BRANCH=${TEUTHOLOGY_BRANCH} +if [ -n "$ANSIBLE_INVENTORY_REPO" ]; then + basename=$(basename $ANSIBLE_INVENTORY_REPO | cut -d. -f1) + if [ ! -d "$basename" ]; then + git clone \ + --depth 1 \ + $ANSIBLE_INVENTORY_REPO + fi + mkdir -p teuthology/ansible_inventory + cp -rf $basename/ansible/ teuthology/ansible_inventory + if [ ! -d teuthology/ansible_inventory/hosts ]; then + mv -f teuthology/ansible_inventory/inventory teuthology/ansible_inventory/hosts + fi +fi +# Make the hosts and secrets directories, so that the COPY instruction in the +# Dockerfile does not cause a build failure when not using this feature. 
+mkdir -p teuthology/ansible_inventory/hosts teuthology/ansible_inventory/secrets + +if [ -n "$CUSTOM_CONF" ]; then + cp "$CUSTOM_CONF" teuthology/ +fi + +# Generate an SSH keypair to use if necessary +if [ -z "$SSH_PRIVKEY_PATH" ]; then + SSH_PRIVKEY_PATH=$(mktemp -u /tmp/teuthology-ssh-key-XXXXXX) + ssh-keygen -t rsa -N '' -f $SSH_PRIVKEY_PATH + export SSH_PRIVKEY=$(cat $SSH_PRIVKEY_PATH) + export SSH_PUBKEY=$(cat $SSH_PRIVKEY_PATH.pub) + export SSH_PRIVKEY_FILE=id_rsa +else + export SSH_PRIVKEY=$(cat $SSH_PRIVKEY_PATH) + export SSH_PRIVKEY_FILE=$(basename $SSH_PRIVKEY_PATH | cut -d. -f1) +fi + +if [ -z "$TEUTHOLOGY_WAIT" ]; then + DC_EXIT_FLAG='--abort-on-container-exit --exit-code-from teuthology' + DC_AUTO_DOWN_CMD='docker compose down' +fi +export TEUTHOLOGY_WAIT + +trap "docker compose down" SIGINT +docker compose up \ + --build \ + $DC_EXIT_FLAG +$DC_AUTO_DOWN_CMD diff --git a/docs/docker-compose/testnode/Dockerfile b/docs/docker-compose/testnode/Dockerfile new file mode 100644 index 000000000..15fd23504 --- /dev/null +++ b/docs/docker-compose/testnode/Dockerfile @@ -0,0 +1,26 @@ +FROM ubuntu:22.04 +ENV DEBIAN_FRONTEND=noninteractive +RUN apt update && \ + apt -y install \ + sudo \ + openssh-server \ + hostname \ + curl \ + python3-pip \ + apache2 \ + nfs-kernel-server && \ + apt clean all +COPY testnode_start.sh / +COPY testnode_stop.sh / +COPY testnode_sudoers /etc/sudoers.d/teuthology +RUN \ + ssh-keygen -t dsa -f /etc/ssh/ssh_host_dsa_key -N '' && \ + sed -i 's/#PermitRootLogin yes/PermitRootLogin yes/' /etc/ssh/sshd_config && \ + mkdir -p /root/.ssh && \ + chmod 700 /root/.ssh && \ + useradd -g sudo ubuntu && \ + mkdir -p /home/ubuntu/.ssh && \ + chmod 700 /home/ubuntu/.ssh && \ + chown -R ubuntu /home/ubuntu +EXPOSE 22 +ENTRYPOINT /testnode_start.sh diff --git a/docs/docker-compose/testnode/testnode_start.sh b/docs/docker-compose/testnode/testnode_start.sh new file mode 100755 index 000000000..6da13a7d0 --- /dev/null +++ 
b/docs/docker-compose/testnode/testnode_start.sh @@ -0,0 +1,17 @@ +#!/usr/bin/bash +set -x +echo "$SSH_PUBKEY" > /root/.ssh/authorized_keys +echo "$SSH_PUBKEY" > /home/ubuntu/.ssh/authorized_keys +chown ubuntu /home/ubuntu/.ssh/authorized_keys +. /etc/os-release +if [ $ID = 'centos' ]; then + VERSION_ID=${VERSION_ID}.stream +fi +payload="{\"name\": \"$(hostname)\", \"machine_type\": \"testnode\", \"up\": true, \"locked\": false, \"os_type\": \"${ID}\", \"os_version\": \"${VERSION_ID}\"}" +for i in $(seq 1 5); do + echo "attempt $i" + curl -v -f -d "$payload" http://paddles:8080/nodes/ && break + sleep 1 +done +mkdir -p /run/sshd +exec /usr/sbin/sshd -D diff --git a/docs/docker-compose/testnode/testnode_stop.sh b/docs/docker-compose/testnode/testnode_stop.sh new file mode 100755 index 000000000..8bb74bbf8 --- /dev/null +++ b/docs/docker-compose/testnode/testnode_stop.sh @@ -0,0 +1,9 @@ +#!/usr/bin/bash +set -x +hostname=$(hostname) +payload="{\"name\": \"$hostname\", \"machine_type\": \"testnode\", \"up\": false}" +for i in $(seq 1 5); do + echo "attempt $i" + curl -s -f -X PUT -d "$payload" http://paddles:8080/nodes/$hostname/ && break + sleep 1 +done diff --git a/docs/docker-compose/testnode/testnode_sudoers b/docs/docker-compose/testnode/testnode_sudoers new file mode 100644 index 000000000..35828ad9b --- /dev/null +++ b/docs/docker-compose/testnode/testnode_sudoers @@ -0,0 +1,4 @@ +%sudo ALL=(ALL) NOPASSWD: ALL +# For ansible pipelining +Defaults !requiretty +Defaults visiblepw diff --git a/docs/docker-compose/teuthology/.teuthology.yaml b/docs/docker-compose/teuthology/.teuthology.yaml new file mode 100644 index 000000000..bac8ec1aa --- /dev/null +++ b/docs/docker-compose/teuthology/.teuthology.yaml @@ -0,0 +1,9 @@ +queue_host: beanstalk +queue_port: 11300 +lock_server: http://paddles:8080 +results_server: http://paddles:8080 +results_ui_server: http://pulpito:8081/ +teuthology_path: /teuthology +archive_base: /archive_dir +reserve_machines: 0 +lab_domain: '' \ 
No newline at end of file diff --git a/docs/docker-compose/teuthology/Dockerfile b/docs/docker-compose/teuthology/Dockerfile new file mode 100644 index 000000000..bfd3882ab --- /dev/null +++ b/docs/docker-compose/teuthology/Dockerfile @@ -0,0 +1,49 @@ +FROM ubuntu:22.04 +ARG SSH_PRIVKEY_FILE=id_ed25519 +ENV DEBIAN_FRONTEND=noninteractive +ENV LANG=C.UTF-8 +RUN apt-get update && \ + apt-get install -y \ + git \ + jq \ + curl \ + qemu-utils \ + python3-dev \ + libssl-dev \ + ipmitool \ + python3-pip \ + python3-venv \ + vim \ + locales-all \ + libev-dev \ + libvirt-dev \ + libffi-dev \ + libyaml-dev \ + locales \ + lsb-release && \ + apt-get clean all && \ + locale-gen $LC_ALL +WORKDIR /teuthology +COPY requirements.txt requirements.yml ansible.cfg bootstrap /teuthology/ +RUN \ + cd /teuthology && \ + mkdir ../archive_dir && \ + mkdir log && \ + chmod +x /teuthology/bootstrap && \ + PIP_INSTALL_FLAGS="-r requirements.txt" ./bootstrap +COPY . /teuthology +RUN \ + ./bootstrap +COPY docs/docker-compose/teuthology/containerized_node.yaml /teuthology +COPY docs/docker-compose/teuthology/.teuthology.yaml /root +COPY docs/docker-compose/teuthology/teuthology.sh / +RUN mkdir -p /etc/ansible +COPY docs/docker-compose/teuthology/ansible_inventory/hosts /etc/ansible/ +COPY docs/docker-compose/teuthology/ansible_inventory/secrets /etc/ansible/ +RUN \ + mkdir $HOME/.ssh && \ + touch $HOME/.ssh/${SSH_PRIVKEY_FILE} && \ + chmod 600 $HOME/.ssh/${SSH_PRIVKEY_FILE} && \ + echo "StrictHostKeyChecking=no" > $HOME/.ssh/config && \ + echo "UserKnownHostsFile=/dev/null" >> $HOME/.ssh/config +ENTRYPOINT /teuthology.sh diff --git a/docs/docker-compose/teuthology/containerized_node.yaml b/docs/docker-compose/teuthology/containerized_node.yaml new file mode 100644 index 000000000..02304886c --- /dev/null +++ b/docs/docker-compose/teuthology/containerized_node.yaml @@ -0,0 +1,8 @@ +overrides: + ansible.cephlab: + skip_tags: 
"timezone,nagios,monitoring-scripts,ssh,hostname,pubkeys,zap,sudoers,kerberos,selinux,lvm,ntp-client,resolvconf,packages,cpan,nfs" + vars: + containerized_node: true + ansible_user: root + cm_user: root + start_rpcbind: false diff --git a/docs/docker-compose/teuthology/teuthology.sh b/docs/docker-compose/teuthology/teuthology.sh new file mode 100755 index 000000000..78770e323 --- /dev/null +++ b/docs/docker-compose/teuthology/teuthology.sh @@ -0,0 +1,46 @@ +#!/usr/bin/bash +set -e +# We don't want -x yet, in case the private key is sensitive +if [ -n "$SSH_PRIVKEY_FILE" ]; then + echo "$SSH_PRIVKEY" > $HOME/.ssh/$SSH_PRIVKEY_FILE +fi +source /teuthology/virtualenv/bin/activate +set -x +if [ -n "$TESTNODES" ]; then + for node in $(echo $TESTNODES | tr , ' '); do + teuthology-update-inventory -m $MACHINE_TYPE $node + done + CUSTOM_CONF=${CUSTOM_CONF:-} +else + CUSTOM_CONF=/teuthology/containerized_node.yaml +fi +export MACHINE_TYPE=${MACHINE_TYPE:-testnode} +if [ -z "$TEUTHOLOGY_WAIT" ]; then + if [ -n "$TEUTH_BRANCH" ]; then + TEUTH_BRANCH_FLAG="--teuthology-branch $TEUTH_BRANCH" + fi + teuthology-suite -v \ + $TEUTH_BRANCH_FLAG \ + --ceph-repo https://github.com/ceph/ceph.git \ + --suite-repo https://github.com/ceph/ceph.git \ + -c main \ + -m $MACHINE_TYPE \ + --limit 1 \ + -n 100 \ + --suite teuthology:no-ceph \ + --filter-out "libcephfs,kclient,stream,centos,rhel" \ + -d ubuntu -D 22.04 \ + --suite-branch main \ + --subset 9000/100000 \ + -p 75 \ + --seed 349 \ + --force-priority \ + $CUSTOM_CONF + DISPATCHER_EXIT_FLAG='--exit-on-empty-queue' + teuthology-queue -m $MACHINE_TYPE -s | \ + python3 -c "import sys, json; assert json.loads(sys.stdin.read())['count'] > 0, 'queue is empty!'" +fi +teuthology-dispatcher -v \ + --log-dir /teuthology/log \ + --tube $MACHINE_TYPE \ + $DISPATCHER_EXIT_FLAG diff --git a/docs/downburst_vms.rst b/docs/downburst_vms.rst new file mode 100644 index 000000000..d649be66b --- /dev/null +++ b/docs/downburst_vms.rst @@ -0,0 +1,89 @@ +.. 
_downburst_vms: + +============= +Downburst VMs +============= + +Teuthology also supports virtual machines via `downburst +`__, which can function like physical +machines but differ in the following ways: + +VPS Hosts: +-------- +The following description is based on the Red Hat lab used by the upstream Ceph +development and quality assurance teams. + +The teuthology database of available machines contains a vpshost field. +For physical machines, this value is null. For virtual machines, this entry +is the name of the physical machine that that virtual machine resides on. + +There are fixed "slots" for virtual machines that appear in the teuthology +database. These slots have a machine type of vps and can be locked like +any other machine. The existence of a vpshost field is how teuthology +knows whether or not a database entry represents a physical or a virtual +machine. + +In order to get the right virtual machine associations, the following needs +to be set in ~/.config/libvirt/libvirt.conf or for some older versions +of libvirt (like ubuntu precise) in ~/.libvirt/libvirt.conf:: + + uri_aliases = [ + 'mira001=qemu+ssh://ubuntu@mira001.front.sepia.ceph.com/system?no_tty=1', + 'mira003=qemu+ssh://ubuntu@mira003.front.sepia.ceph.com/system?no_tty=1', + 'mira004=qemu+ssh://ubuntu@mira004.front.sepia.ceph.com/system?no_tty=1', + 'mira005=qemu+ssh://ubuntu@mira005.front.sepia.ceph.com/system?no_tty=1', + 'mira006=qemu+ssh://ubuntu@mira006.front.sepia.ceph.com/system?no_tty=1', + 'mira007=qemu+ssh://ubuntu@mira007.front.sepia.ceph.com/system?no_tty=1', + 'mira008=qemu+ssh://ubuntu@mira008.front.sepia.ceph.com/system?no_tty=1', + 'mira009=qemu+ssh://ubuntu@mira009.front.sepia.ceph.com/system?no_tty=1', + 'mira010=qemu+ssh://ubuntu@mira010.front.sepia.ceph.com/system?no_tty=1', + 'mira011=qemu+ssh://ubuntu@mira011.front.sepia.ceph.com/system?no_tty=1', + 'mira013=qemu+ssh://ubuntu@mira013.front.sepia.ceph.com/system?no_tty=1', + 
'mira014=qemu+ssh://ubuntu@mira014.front.sepia.ceph.com/system?no_tty=1', + 'mira015=qemu+ssh://ubuntu@mira015.front.sepia.ceph.com/system?no_tty=1', + 'mira017=qemu+ssh://ubuntu@mira017.front.sepia.ceph.com/system?no_tty=1', + 'mira018=qemu+ssh://ubuntu@mira018.front.sepia.ceph.com/system?no_tty=1', + 'mira020=qemu+ssh://ubuntu@mira020.front.sepia.ceph.com/system?no_tty=1', + 'mira024=qemu+ssh://ubuntu@mira024.front.sepia.ceph.com/system?no_tty=1', + 'mira029=qemu+ssh://ubuntu@mira029.front.sepia.ceph.com/system?no_tty=1', + 'mira036=qemu+ssh://ubuntu@mira036.front.sepia.ceph.com/system?no_tty=1', + 'mira043=qemu+ssh://ubuntu@mira043.front.sepia.ceph.com/system?no_tty=1', + 'mira044=qemu+ssh://ubuntu@mira044.front.sepia.ceph.com/system?no_tty=1', + 'mira074=qemu+ssh://ubuntu@mira074.front.sepia.ceph.com/system?no_tty=1', + 'mira079=qemu+ssh://ubuntu@mira079.front.sepia.ceph.com/system?no_tty=1', + 'mira081=qemu+ssh://ubuntu@mira081.front.sepia.ceph.com/system?no_tty=1', + 'mira098=qemu+ssh://ubuntu@mira098.front.sepia.ceph.com/system?no_tty=1', + ] + +Downburst: +---------- + +When a virtual machine is locked, downburst is run on that machine to install a +new image. This allows the user to set different virtual OSes to be installed +on the newly created virtual machine. Currently the default virtual machine is +ubuntu (precise). A different vm installation can be set using the +``--os-type`` and ``--os-version`` options in ``teuthology.lock``. + +When a virtual machine is unlocked, downburst destroys the image on the +machine. + +To find the downburst executable, teuthology first checks the PATH environment +variable. If not defined, teuthology next checks for +src/downburst/virtualenv/bin/downburst executables in the user's home +directory, /home/ubuntu, and /home/teuthology. This can all be overridden if +the user specifies a downburst field in the user's .teuthology.yaml file. 
+ +Host Keys: +---------- + +Because teuthology reinstalls a new machine, a new hostkey is generated. After +locking, once a connection is established to the new machine, +``teuthology-lock`` with the ``--list`` or ``--list-targets`` options will +display the new keys. When vps machines are locked using the ``--lock-many`` +option, a message is displayed indicating that ``--list-targets`` should be run +later. + +Assumptions: +------------ + +It is assumed that downburst is on the user's ``$PATH``. diff --git a/docs/exporter.rst b/docs/exporter.rst new file mode 100644 index 000000000..fb729a82c --- /dev/null +++ b/docs/exporter.rst @@ -0,0 +1,67 @@ +.. _exporter: + +================================== +The Teuthology Prometheus Exporter +================================== + +To help make it easier to determine the status of the lab, we've created a +`Prometheus `__ exporter (helpfully named +`teuthology-exporter`). We use `Grafana `__ to visualize +the data we collect. + +It listens on port 61764, and scrapes every 60 seconds by default. + + +Exposed Metrics +=============== + +.. 
list-table:: + + * - Name + - Type + - Description + - Labels + * - beanstalk_queue_length + - Gauge + - The number of jobs in the beanstalkd queue + - machine type + * - beanstalk_queue_paused + - Gauge + - Whether or not the beanstalkd queue is paused + - machine type + * - teuthology_dispatchers + - Gauge + - The number of running teuthology-dispatcher instances + - machine type + * - teuthology_job_processes + - Gauge + - The number of running job *processes* + - + * - teuthology_job_results_total + - Gauge + - The number of completed jobs + - status (pass/fail/dead) + * - teuthology_nodes + - Gauge + - The number of test nodes + - up, locked + * - teuthology_job_duration_seconds + - Summary + - The time it took to run a job + - suite + * - teuthology_task_duration_seconds + - Summary + - The time it took for each phase of each task to run + - name, phase (enter/exit) + * - teuthology_bootstrap_duration_seconds + - Summary + - The time it took to run teuthology's bootstrap script + - + * - teuthology_node_locking_duration_seconds + - Summary + - The time it took to lock nodes + - machine type, count + * - teuthology_node_reimaging_duration_seconds + - Summary + - The time it took to reimage nodes + - machine type, count diff --git a/docs/fragment_merging.rst b/docs/fragment_merging.rst new file mode 100644 index 000000000..9652a5cd5 --- /dev/null +++ b/docs/fragment_merging.rst @@ -0,0 +1,318 @@ +.. _fragment_merging: + +================ +Fragment Merging +================ + +Once the matrix of YAML fragments is constructed by teuthology, the fragments +must be merged together and processed. Up until 2022, this merging process was +static: all of the YAML fragments were joined together in lexicographical order +with duplicate fragment members *deep merged* together (e.g. the "tasks" +array). Now, fragments and entire job specifications can be dynamically changed +or dropped according to Lua scripts embedded in the fragment. 
+ +premerge Scripts +================ + +The first phase of script execution takes place in the *premerge* step. Each +fragment may have its own premerge script which is run before the fragment is +merged. The script is defined as follows:: + + teuthology: + premerge: | + if yaml.os_type == 'ubuntu' then reject() end + +Again, this script will run prior to the YAML fragment merging into the +complete YAML specification for a job. The script has access to the YAML job +description (the ``yaml`` variable) generated so far from the fragments merged +prior to this one (remember: fragments are ordered lexicographically). In the +above case, the ``os_type`` is checked such that the fragment is dropped +(rejected) if the job is configured to run on Ubuntu. Note: this does not +account for a jobs' default os_type which is not yet known; only the +``os_type`` specified by the YAML fragments is usable in these scripts. + +When run in the premerge step, the ``reject`` function causes the fragment to be +dropped from the job: none of its YAML will be merged into the job. The +``accept`` function causes the fragment to be merged. The default action is to +accept the fragment. + +postmerge Scripts +================= + +The second phase of script execution is the *postmerge* step run after all +fragments have been merged. At this point, the YAML specification for the job +is all but complete. Scripts can now make final modifications to the YAML or +reject the job completely causing it to be removed from the list of jobs to be +scheduled. An example postmerge script:: + + teuthology: + postmerge: + - if yaml.os_type == "ubuntu" then reject() end + +This script is the same but has a different effect: after combining all the +YAML fragments for a job, if the os_type is "ubuntu" then the entire job is +dropped (filtered out / rejected). postmerge scripts are also specified as a +list of strings in the ``teuthology.postmerge`` array which may span multiple +fragments. 
During the postmerge step, all of these strings are concatenated and +then executed as a single script. You may use this to define variables, +functions, or anything else you need. + +Scripts have access to the entire yaml object and may use it to do advanced +checks. It is also possible to programatically change the YAML definition:: + + teuthology: + postmerge: + - | + -- use the lupa "attrgetter" to fetch attrs not items via Lua's indexing + local attr = py_attrgetter + local tasks = py_list() + for i = 1, 3 do + local task = py_dict() + task.exec = py_dict() + task.exec["mon.a"] = py_list() + attr(task.exec["mon.a"]).append("echo "..i) + attr(tasks).append(task) + end + deep_merge(yaml.tasks, tasks) + + +This will be as if the YAML fragment contained:: + + tasks: + - exec: + mon.a: + - echo 1 + - exec: + mon.a: + - echo 2 + - exec: + mon.a: + - echo 3 + +Except the tasks are appended to the end after all fragments have been loaded. +This is opposed to the normal mode of the tasks appending when the fragment is +merged (in lexicographic order). + +API +=== + +Scripts are well sandboxed with access to a small selection of the Lua builtin +libraries. There is also access to some Python/Lupa specific functions which +are prefixed with ``py_``. No I/O or other system functions permitted. + +The Lua builtins available include:: + + assert + error + ipairs + pairs + tonumber + tostring + +Additionally, the Python functions exposed via Lupa include:: + + py_attrgetter = python.as_attrgetter + py_dict = python.builtins.dict + py_list = python.builtins.list + py_tuple = python.builtins.tuple + py_enumerate = python.enumerate + py_iterex = python.iterex + py_itemgetter = python.as_itemgetter + +These are all prefixed with ``py_``. See the `Lupa documentation +`__ for more information. 
+ +Finally, teuthology exposes the following functions for scripts: + +:: + + accept() + +The ``accept`` function stops script execution and causes the fragment to be +merged (premerge script) or the job to be accepted for scheduling (postmerge +script). The default action of a script is to accept. + +:: + + reject() + +The ``reject`` function stops script execution and causes the fragment to be +dropped (premerge script) or the job to be rejected for scheduling (postmerge +script). + + +:: + + deep_merge(a, b) + +The ``deep_merge`` function comes from the teuthology code base. It's used to +merge YAML structures. It's provided for convenience to ease a common operation +on Python (yaml) objects. The function merges ``b`` into ``a``. + + +:: + + log + +The ``log`` Python class (object) allows Lua to leave debugging in the +``teuthology-suite`` log. + +:: + + yaml_load(str) + +This function loads the YAML string and returns it as a Python structure (of +dicts, lists, etc.). + + +Concrete Example +================ + +The +`fs:upgrade:mds_upgrade_sequence `__ +sub-suite tests that the `upgrade sequence for CephFS `__ +is followed when the cluster is managed by cephadm. The most interesting set of YAML in this suite is in ``tasks/``:: + + % + 0-from/ + pacific.yaml + v16.2.4.yaml + 1-volume/ + 0-create.yaml + 1-ranks/ + 1.yaml + 2.yaml + 2-allow_standby_replay/ + yes.yaml + no.yaml + 3-inline + yes.yaml + no.yaml + 4-verify.yaml + 2-client.yaml + 3-upgrade-with-workload.yaml + 4-verify.yaml + +Basically: upgrade the cluster from one of two versions of pacific, create a +volume (fs), possibly turn some knobs in the MDSMap, and verify the upgrade +completes correctly. This works well and is an excellent example of effective +matrix construction for testing. + +The feature we want to test is a `new upgrade procedure +`__ for the MDS. 
It only requires +"failing" the file systems which removes all running MDS from the MDSMap and +prevents any MDS from "joining" the file system (becoming active). The upgrade +procedure then upgrades the packages, restarts the MDS, then sets the file +system to allow MDS to join (become active). Ideally, we could modify the +matrix this way:: + + % + fail_fs/ + yes.yaml + no.yaml + tasks/ + % + 0-from/ + pacific.yaml + v16.2.4.yaml + 1-volume/ + 0-create.yaml + 1-ranks/ + 1.yaml + 2.yaml + 2-allow_standby_replay/ + yes.yaml + no.yaml + 3-inline + yes.yaml + no.yaml + 4-verify.yaml + 2-client.yaml + 3-upgrade-with-workload.yaml + 4-verify.yaml + +So we just change (or don't change) a single config option in ``fail_fs`` +which turns on that upgrade path:: + + overrides: + ceph: + conf: + mgr: + mgr/orchestrator/fail_fs: true + +The complication however is that this new ``fail_fs`` config option is only +understood by the newest mgr (the ``main`` branch or possibly the latest +pacific or quincy)... and the mons won't let you set a config unknown to exist. +So, we must do a staggered upgrade to test this new upgrade path: the mgr must +be upgraded, a config option set to change how MDS upgrades are performed, and +then the cluster may continue upgrading. + +**Here's the problem**: the mgr only knows how to do a staggered upgrade +beginning with v16.2.10. So, we can't even upgrade from v16.2.4 to test this +new upgrade path. + +(One might be tempted to remove v16.2.4 as an upgrade path in +QA but we must continue testing this due to major (breaking) changes in the +MDSMap across v16.2.4 and v16.2.5. It would not be acceptable to remove it.) + +To get around this awkward problem, we can use the new scripting of fragment +merging to control whether this ``mgr/orchestrator/fail_fs`` config option is +set. If we are upgrading from v16.2.4, then drop any jobs in the matrix that +also want to test this new MDS upgrade procedure. 
So we modify the yaml +fragments as:: + + fail_fs/no.yaml: + teuthology: + variables: + fail_fs: false + overrides: + ceph: + conf: + mgr: + mgr/orchestrator/fail_fs: false + + fail_fs/yes.yaml: + teuthology: + variables: + fail_fs: true + overrides: + ceph: + conf: + mgr: + mgr/orchestrator/fail_fs: true + + tasks/0-from/v16.2.4.yaml: + teuthology: + postmerge: + - if yaml.teuthology.variables.fail_fs then reject() end + ... + + +We have set a variable (for ease of programming) in a +``teuthology['variables']`` dictionary which indicates whether the merged YAML +includes the ``fail_fs`` feature or not. Then, if we're upgrading from v16.2.4 +and that variable is true, drop that set of jobs in the matrix. This +effectively prevents any testing of this upgrade procedure when the cluster is +upgraded from v16.2.4. + +Note: the final merged QA code also includes a YAML fragment to perform a +staggered upgrade of the ``ceph-mgr``. This YAML fragment is dropped using a +premerge script if we're not testing ``fail_fs``; there is no reason to do a +staggered upgrade if we don't need to. See the code if you'd like to see how +that works! + + +Why Lua +======= + +Lua is a small, extensible, and easily sandboxed scripting environment. Python +is difficult to sandbox correctly and its restrictions make it difficult to +embed in YAML (like indentation for code blocks). + + +Python-Lua +========== + +`Lupa `__ is the most recent derivative of the +"lunatic" python project. It allows for trivial cross-talk between Python and +Lua worlds. diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 000000000..82db430f5 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,26 @@ +Content Index +============= + +.. 
toctree:: + :maxdepth: 2 + + README.rst + intro_testers.rst + fragment_merging.rst + siteconfig.rst + detailed_test_config.rst + openstack_backend.rst + libcloud_backend.rst + downburst_vms.rst + INSTALL.rst + LAB_SETUP.rst + exporter.rst + commands/list.rst + ChangeLog.rst + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/intro_testers.rst b/docs/intro_testers.rst new file mode 100644 index 000000000..2abf97474 --- /dev/null +++ b/docs/intro_testers.rst @@ -0,0 +1,81 @@ +.. _intro_testers: + +======================== +Introduction for Testers +======================== + +This document is aimed at providing an introduction to running existing test suites. + +We assume here that you have access to an operational test lab; if not, ask +your local admin for access! + +If you're here to test upstream Ceph, start `here +`__. + + +Terminology +=========== + +In the abstract, each set of tests is defined by a `suite`. All of our suites +live in the `ceph` git repository in the `qa/suites/ directory +`__ . +Each subdirectory in `suites` is a suite; they may also have "sub-suites" which +may aid in scheduling, for example, tests for a specific feature. + +In concrete terms, a `run` is what is created by assembling the contents of a +`suite` into a number of `jobs`. A `job` is created by assembling a number of +`fragments` (also known as `facets`) together. Each `fragment` is in `YAML +`__ format. + +Each `job` definition contains a list of `tasks` to execute, along with +`roles`. `Roles` tell `teuthology` how many nodes to use for each `job` along +with what functions each node will perform. + +To go into more depth regarding suite design, see the `README +`__. + +One example of this is the `smoke +`__ suite. + + +Scheduling +========== +Most testing happens by scheduling `runs`. The way we do that is using the +`teuthology-suite` command. 
+ +To get a preview of what `teuthology-suite` might do, try:: + + teuthology-suite -v -m mira --ceph-repo http://github.com/ceph/ceph.git -c main --suite-repo http://github.com/ceph/ceph.git -s smoke --dry-run + +The `-m mira` specifies `mira` as the machine type. Machine types are dependent +on the specific lab in use. The `--ceph-repo http://github.com/ceph/ceph.git` +specifies from which git repository to pull `-c main`. Similarly, +`--suite-repo` is specifying where to find the QA branch. The default for +`--ceph-repo` and `--suite-repo` is `http://github.com/ceph/ceph-ci.git` which +is usually what you will want. For `main`, you must always use +`http://github.com/ceph/ceph.git` as it does not exist on the ceph-ci +repository. + +Assuming a build is available, that should pretend to schedule several jobs. If +it complains about missing packages, try swapping `main` with `jewel` or one +of the other Ceph stable branches. + +To see even more detail, swap `-v` with `-vv`. It will print out each job +definition in full. To limit the number of jobs scheduled, you may want to use +the `--limit`, `--filter`, or `--filter-out` flags. + +To actually schedule, drop `--dry-run` and optionally use the `--email` flag to +get an email when the test run completes. + +`teuthology-suite` also prints out a link to the run in `pulpito +`__ that will display the current status of +each job. The Sepia lab's pulpito instance is `here +`__. + +There may be times when, after scheduling a run containing a large number of +jobs, that you want to reschedule only those jobs which have failed or died for +some other reason. For that use-case, `teuthology-suite` has a `--rerun`/`-r` +flag, and an optional `--rerun-statuses`/`-R` flag. 
An example of its usage +is:: + + teuthology-suite -m smithi -c wip-pdonnell-testing-20170718 --rerun pdonnell-2017-07-19_19:04:52-multimds-wip-pdonnell-testing-20170718-testing-basic-smithi -R dead --dry-run diff --git a/docs/laptop/README.md b/docs/laptop/README.md new file mode 100644 index 000000000..c44a62b7b --- /dev/null +++ b/docs/laptop/README.md @@ -0,0 +1,455 @@ +# Teuthology Development Environment Guide + +This is a brief guide on how to set up a teuthology development environment +on your laptop (desktop if you wish). Though everything in this guide +can be implemented as one handy script, some more details on how things +work can be helpful to document. + +## Introduction + +Teuthology consists of the following components: + +teuthology - the core framework which can run a job, +the config file which describes the test environment +and task list to execute. + +- paddles - a database and the api +- pulpito - web gui for paddles +- beanstalkd - the job queue + +The teuthology core includes the following main tools: +- teuthology-suite +- teuthology-schedule +- teuthology-worker +- teuthology (formerly teuthology-run). +- teuthology-lock - allows locking and provisioning nodes + separately from a run. + +## Docker + +Though paddles and pulpito can be run as services using supervisord +it is often useful to have them isolated in a container. +Any of the available tools can be used, but here is an example for +bare docker. 
+ +### Start docker and add shared network + +Add your user to docker group and start the service: + +```bash +sudo usermod -aG docker $USER +sudo service docker start +``` + +Create paddles network for container interaction: + +```bash +docker network create paddles +``` + +### Run postgres + +Start postgres containers in order to use paddles: + +```bash +mkdir -p $HOME/.teuthology/postgres +docker run -d -p 5432:5432 --network paddles --name paddles-postgres \ + -e POSTGRES_PASSWORD=secret \ + -e POSTGRES_USER=paddles \ + -e POSTGRES_DB=paddles \ + -e PGDATA=/var/lib/postgresql/data/pgdata \ + -v $HOME/.teuthology/postgres:/var/lib/postgresql/data postgres +``` + +NOTE. When running container on MacOS X using podman postgres may experience +troubles with volume directory binds because of podman machine, thus use regular +volumes like `-v paddlesdb:/var/lib/postgresql/data`. + +### Run paddles + +Checkout paddles and build the image: + +```bash +cd ~/paddles && docker build . --file Dockerfile --tag paddles +``` + +Run the container with previously created network: + +```bash +docker run -d --network paddles --name api -p 8080:8080 \ + -e PADDLES_SERVER_HOST=0.0.0.0 \ + -e PADDLES_SQLALCHEMY_URL=postgresql+psycopg2://paddles:secret@paddles-postgres/paddles \ + -e PADDLES_JOB_LOG_HREF_TEMPL='http://localhost:8888/{run_name}/{job_id}/teuthology.log' \ + paddles +``` + +Note: we provide job log href template here, so the logs can be referenced logs in archive share +correctly, for details see below in `Run dispatcher` section. + +### Run pulpito + +Checkout pulpito and build the image: + +```bash +cd ~/pulpito && docker build . --file Dockerfile --tag pulpito +``` + +Run the container: + +```bash +docker run -d --network paddles --name web -p 8081:8081 -e PULPITO_PADDLES_ADDRESS=http://api:8080 pulpito +``` + +NOTE. Restart pulpito container: + +```bash +docker kill web ; docker container rm web +``` + +NOTE. 
You can check all listening ports by: + +```bash +sudo lsof -i -P -n | grep LISTEN +``` + +NOTE. You can check database connection using: + +```bash +psql -h localhost -U paddles -l +``` + +## Setup Libvirt for Downburst + +Add libvirt host nodes: + +```sql +insert into nodes (name, machine_type, is_vm, locked, up) values ('localhost', 'libvirt', false, true, true); +insert into nodes (name, machine_type, is_vm, locked, up, mac_address, vm_host_id) values ('target-00.local', 'vps', true, false, false, '52:54:00:00:00:00', (select id from nodes where name='localhost')); +insert into nodes (name, machine_type, is_vm, locked, up, mac_address, vm_host_id) values ('target-01.local', 'vps', true, false, false, '52:54:00:00:00:01', (select id from nodes where name='localhost')); +insert into nodes (name, machine_type, is_vm, locked, up, mac_address, vm_host_id) values ('target-02.local', 'vps', true, false, false, '52:54:00:00:00:02', (select id from nodes where name='localhost')); +insert into nodes (name, machine_type, is_vm, locked, up, mac_address, vm_host_id) values ('target-03.local', 'vps', true, false, false, '52:54:00:00:00:03', (select id from nodes where name='localhost')); +``` +or just use the following command: + +```bash +psql -h localhost -U paddles -d paddles < docs/laptop/targets.sql +``` + +Add libvirt config file so downburst able to use 'localhost' node to connect to: + +```bash +cat > ~/.config/libvirt/libvirt.conf << END +uri_aliases = [ + 'localhost=qemu:///system?no_tty=1', +] + +END +``` + +Add your user to wheel group and allow to wheel group to passwordless access libvirt: + +```bash + +sudo usermod -a -G wheel $USER + +``` + +Allow users in wheel group to manage the libvirt daemon without authentication: + +```bash + +sudo tee /etc/polkit-1/rules.d/50-libvirt.rules << END +polkit.addRule(function(action, subject) { + if (action.id == "org.libvirt.unix.manage" && + subject.isInGroup("wheel")) { + return polkit.Result.YES; + } +}); + +END + +``` 
+ +(Taken from: https://octetz.com/docs/2020/2020-05-06-linux-hypervisor-setup/) + +Make sure libvirtd is running: + +```bash +sudo service libvirtd start +``` + +NOTE. You can check you are able to access libvirt without password: + +```bash + +virsh -c qemu:///system list + +``` + +Make sure libvirt front network exists, it can be defined as NAT and +include dhcp records for the target nodes: + +```xml + + front + + + + + + + + + + + + + + +``` +for example: + +```bash +virsh -c qemu:///system net-define docs/laptop/front.xml + +``` + +(for details, look https://jamielinux.com/docs/libvirt-networking-handbook/appendix/dhcp-host-entries.html) + +Add corresponding records to your /etc/hosts: + +```txt +192.168.123.100 target-00 target-00.local +192.168.123.101 target-01 target-01.local +192.168.123.102 target-02 target-02.local +192.168.123.103 target-03 target-03.local +``` +you can take it from corresponding file: +``` +sudo tee -a /etc/hosts < docs/laptop/hosts +``` + +Make sure the front network is up: + +```bash +sudo virsh net-start front +``` + +NOTE. The 'default' volume pool should be up and running before trying downburst or teuthology-lock. + +```bash +> sudo virsh pool-list --all + Name State Autostart +------------------------------- + default active no +``` + + +## Setup teuthology virtual environment + + +Checkout the teuthology core repo and run the bootstrap script: +```bash +git clone https://github.com/ceph/teuthology ~/teuthology +cd ~/teuthology && ./bootstrap +. virtualenv/bin/activate +``` + +By default the `./bootstrap` script is installing teuthology in development mode +to the `virtualenv` directory. 
+ +Create teuthology config file `~/.teuthology.yaml`: + +```bash +cat > ~/.teuthology.yaml << END +# replace $HOME with whatever appropriate to your needs +# teuthology-lock +lab_domain: local +lock_server: http://localhost:80 +default_machine_type: vps +# teuthology-run +results_server: http://localhost:80 +# we do not need reserve_machines on localhost +reserve_machines: 0 +# point to your teuthology +teuthology_path: $HOME/teuthology +# beanstalkd +queue_host: localhost +queue_port: 11300 +# if you want make and test patches to ceph-cm-ansible +# ceph_cm_ansible_git_url: $HOME/ceph-cm-ansible +# customize kvm guests parameter +downburst: + path: $HOME/downburst/virtualenv/bin/downburst + # define discover_url if you need your custom downburst image server + # discover_url: http://localhost:8181/images/ibs/ + machine: + cpus: 2 + disk: 12G + ram: 2G + volumes: + size: 8G + count: 4 +# add the next two if you do not use shaman +check_package_signatures: false +suite_verify_ceph_hash: false +END + +``` + +List locks: + +```bash +> teuthology-lock --brief --all +localhost up locked None "None" +target-00.local up unlocked None "None" +target-01.local up unlocked None "None" +target-02.local up unlocked None "None" +target-03.local up unlocked None "None" + +``` +Where the `localhost` is special purpose node where libvirt instance is running +and where the target nodes will be created. + +Export the downburst discover url environment variable for your own image storage if required: + +```bash +# cloud image location +export DOWNBURST_DISCOVER_URL=http://localhost:8181/images +``` + +NOTE. The step above is optional and is required if you are going to use custom image +location for the downburst, which is useful though when you want minimize traffic to +you computer. Refer [Create own discovery location](#create-own-discovery-location) +to know more how to create your private image storage. 
+ +Try to lock nodes now: + +```bash +teuthology-lock -v --lock target-00 -m vps --os-type opensuse --os-version 15.2 +teuthology-lock -v --lock-many 1 -m vps --os-type ubuntu --os-version 16.04 +``` + +To initialize all targets you need to use `--lock` instead `--lock-many` +for the first time for each target. + +(Note. It can be probably changed, but this is how it is recommended +in teuthology adding nodes guide for the lab setup) + +For further usage nodes should be unlocked with `--unlock` option. + +### Run beanstalkd + +Build and run beanstalkd container. +```bash +cd teuthology/beanstalk/alpine && + podman build . --file Dockerfile --tag beanstalkd + +podman run -d --network paddles --name queue -p 11300:11300 beanstalkd +``` + + + +Alternatively, beanstalkd can be installed as a service. + +Note: +For openSUSE there is no beanstalkd package as for Ubuntu, so it is needed +to add corresponding repo: + +```bash +zypper addrepo https://download.opensuse.org/repositories/filesystems:/ceph:/teuthology/openSUSE_Leap_15.2/x86_64/ teuthology && zypper ref +``` + +Install beanstalkd package and run the service: + +```bash +sudo zypper in beanstalkd +sudo service beanstalkd start +``` + +### Run dispatcher + +Create and share archive directory. + +```bash +mkdir -p ~/.teuthology/archive + +podman run --name archive -v $HOME/.teuthology/archive:/usr/local/apache2/htdocs/ -d -p 8888:80 httpd:2.4 +``` + +Run teuthology dispatcher against 'vps' machine type. 
+ +```bash +mkdir -p ~/.teuthology/dispatcher +teuthology-dispatcher -v -a ~/.teuthology/archive -t vps -l ~/.teuthology/dispatcher +``` + +Schedule a dummy job: +```bash +teuthology-suite -v --ceph-repo https://github.com/ceph/ceph --suite-repo https://github.com/ceph/ceph --ceph main --suite dummy -d ubuntu --sha1 35adebe94e8b0a17e7b56379a8bf24e5f7b8ced4 --limit 1 -m vps -t refs/pull/2023/merge +``` + +## Downburst + +Checkout downburst to your home, bootstrap virtualenv and enable it: +```bash +git clone https://github.com/ceph/downburst ~/downburst +pushd ~/downburst && ./bootstrap +``` + +### Create own discovery location + +(This step is optional, use it if you want to use private image location.) + +Create images directory, and download some images: + +```bash +DATE=$(date +%Y%m%d) +mkdir -p $HOME/.teuthology/www/images +wget http://download.opensuse.org/distribution/leap/15.2/appliances/openSUSE-Leap-15.2-JeOS.x86_64-OpenStack-Cloud.qcow2 -O $HOME/.teuthology/www/images/opensuse-15.2-$DATE-cloudimg-amd64.img +wget http://download.opensuse.org/distribution/leap/15.1/jeos/openSUSE-Leap-15.1-JeOS.x86_64-OpenStack-Cloud.qcow2 -O $HOME/.teuthology/www/images/opensuse-15.1-$DATE-cloudimg-amd64.img +wget http://download.opensuse.org/tumbleweed/appliances/openSUSE-Tumbleweed-JeOS.x86_64-OpenStack-Cloud.qcow2 -O $HOME/.teuthology/www/images/opensuse-tumbleweed-20200810-cloudimg-amd64.img +```` + +Create sha512 for the image: + +```bash +cd $HOME/.teuthology/www/images +sha512sum opensuse-15.2-$DATE-cloudimg-amd64.img | cut -d' ' -f1 > opensuse-15.2-$DATE-cloudimg-amd64.img.sha512 +sha512sum opensuse-15.1-$DATE-cloudimg-amd64.img | cut -d' ' -f1 > opensuse-15.1-$DATE-cloudimg-amd64.img.sha512 +sha512sum opensuse-tumbleweed-20200810-cloudimg-amd64.img | cut -d' ' -f1 > opensuse-tumbleweed-20200810-cloudimg-amd64.img.sha512 +``` + +run webserver localy: + +```bash +podman run --name downburst-discovery -v $HOME/.teuthology/www:/usr/local/apache2/htdocs/ -d -p 8181:80 
httpd:2.4 +``` + +```bash +export DOWNBURST_DISCOVER_URL=http://localhost:8181/images/ +``` + +Make sure libvirtd is running and default network is up: + +```bash +sudo service libvirtd start +sudo virsh net-start default +``` + +### Try out node creation + + +List available distro/version and available images. + +```bash +downburst list +``` + +Start a VM for example: + +```bash +downburst -v create --distro opensuse --user-data doc/examples/no-password.opensuse.user.yaml opensuse +sudo virsh net-dhcp-leases default | grep opensuse + +``` diff --git a/docs/laptop/default-pool.xml b/docs/laptop/default-pool.xml new file mode 100644 index 000000000..106740d72 --- /dev/null +++ b/docs/laptop/default-pool.xml @@ -0,0 +1,7 @@ + + default + + /var/lib/libvirt/images/default + + + diff --git a/docs/laptop/front.xml b/docs/laptop/front.xml new file mode 100644 index 000000000..67887a0d6 --- /dev/null +++ b/docs/laptop/front.xml @@ -0,0 +1,15 @@ + + front + + + + + + + + + + + + + diff --git a/docs/laptop/hosts b/docs/laptop/hosts new file mode 100644 index 000000000..d15cad80f --- /dev/null +++ b/docs/laptop/hosts @@ -0,0 +1,7 @@ + +# teuthology hosts used as downburst vps targets +192.168.123.100 target-00 target-00.local +192.168.123.101 target-01 target-01.local +192.168.123.102 target-02 target-02.local +192.168.123.103 target-03 target-03.local + diff --git a/docs/laptop/ssh_config b/docs/laptop/ssh_config new file mode 100644 index 000000000..9b847b924 --- /dev/null +++ b/docs/laptop/ssh_config @@ -0,0 +1,6 @@ +Host target-* + User ubuntu + StrictHostKeyChecking no + UserKnownHostsFile /dev/null + LogLevel ERROR + diff --git a/docs/laptop/targets.sql b/docs/laptop/targets.sql new file mode 100644 index 000000000..2165c1922 --- /dev/null +++ b/docs/laptop/targets.sql @@ -0,0 +1,8 @@ +begin; +insert into nodes (name, machine_type, is_vm, locked, up) values ('localhost', 'libvirt', false, true, true); +insert into nodes (name, machine_type, is_vm, locked, up, 
mac_address, vm_host_id) values +('target-00.local', 'vps', true, false, false, '52:54:00:00:00:00', (select id from nodes where name='localhost')), +('target-01.local', 'vps', true, false, false, '52:54:00:00:00:01', (select id from nodes where name='localhost')), +('target-02.local', 'vps', true, false, false, '52:54:00:00:00:02', (select id from nodes where name='localhost')), +('target-03.local', 'vps', true, false, false, '52:54:00:00:00:03', (select id from nodes where name='localhost')); +commit; diff --git a/docs/laptop/teuthology.yaml b/docs/laptop/teuthology.yaml new file mode 100644 index 000000000..f29e0a78d --- /dev/null +++ b/docs/laptop/teuthology.yaml @@ -0,0 +1,30 @@ +# replace $HOME with whatever appropriate to your needs +# teuthology-lock +lab_domain: local +lock_server: http://localhost:80 +default_machine_type: vps +# teuthology-run +results_server: http://localhost:80 +# we do not need reserve_machines on localhost +reserve_machines: 0 +# point to your teuthology +teuthology_path: $HOME/teuthology +# beanstalkd +queue_host: localhost +queue_port: 11300 +# if you want make and test patches to ceph-cm-ansible +# ceph_cm_ansible_git_url: $HOME/ceph-cm-ansible +# customize kvm guests parameter +downburst: + path: $HOME/downburst/virtualenv/bin/downburst + discover_url: http://localhost:8181/images/ibs/ + machine: + cpus: 2 + disk: 12G + ram: 2G + volumes: + size: 8G + count: 4 +check_package_signatures: false +suite_verify_ceph_hash: false + diff --git a/docs/libcloud_backend.rst b/docs/libcloud_backend.rst new file mode 100644 index 000000000..84bdf7d9a --- /dev/null +++ b/docs/libcloud_backend.rst @@ -0,0 +1,43 @@ +.. _libcloud-backend: + +LibCloud backend +================ +This is an *experimental* provisioning backend that eventually intends to support several libcloud drivers. At this time only the OpenStack driver is supported. 
+ +Prerequisites +------------- +* An account with an OpenStack provider that supports Nova and Cinder +* A DNS server supporting `RFC 2136 `_. We use `bind `_ and `this ansible role `_ to help configure ours. +* An `nsupdate-web `_ instance configured to update DNS records. We use `an ansible role `_ for this as well. +* Configuration in `teuthology.yaml` for this backend itself (see :ref:`libcloud_config`) and `nsupdate-web` +* You will also need to choose a maximum number of nodes to be running at once, and create records in your paddles database for each one - making sure to set `is_vm` to `True` for each. + +.. _libcloud_config: + +Configuration +------------- +An example configuration using OVH as an OpenStack provider:: + + libcloud: + providers: + ovh: # This string is the 'machine type' value you will use when locking these nodes + driver: openstack + driver_args: # driver args are passed directly to the libcloud driver + username: 'my_ovh_username' + password: 'my_ovh_password' + ex_force_auth_url: 'https://auth.cloud.ovh.net/v2.0/tokens' + ex_force_auth_version: '2.0_password' + ex_tenant_name: 'my_tenant_name' + ex_force_service_region: 'my_region' + +Why nsupdate-web? +----------------- +While we could have supported directly calling `nsupdate `_, we chose not to. There are a few reasons for this: + +* To avoid piling on yet another feature of teuthology that could be left up to a separate service +* To avoid teuthology users having to request, obtain and safeguard the private key that nsupdate requires to function +* Because we use one subdomain for all of Sepia's test nodes, we had to enable dynamic DNS for that whole zone (this is a limitation of bind). However, we do not want users to be able to push DNS updates for the entire zone. Instead, we gave nsupdate-web the ability to accept or reject requests based on whether the hostname matches a configurable regular expression. The private key itself is not shared with non-admin users. 
+ +Bugs +---- +At this time, only OVH has been tested as a provider. PRs are welcome to support more! diff --git a/docs/openstack_backend.rst b/docs/openstack_backend.rst new file mode 100644 index 000000000..36f8fdf2b --- /dev/null +++ b/docs/openstack_backend.rst @@ -0,0 +1,214 @@ +.. _openstack-backend: + +OpenStack backend +================= + +The ``teuthology-openstack`` command is a wrapper around +``teuthology-suite`` that transparently creates the teuthology cluster +using OpenStack virtual machines. + +Prerequisites +------------- + +An OpenStack tenant with access to the nova and cinder API. If the +cinder API is not available, some jobs won't run because they expect +volumes attached to each instance. + +Setup OpenStack at OVH +---------------------- + +Each instance has a public IP by default. + +* `create an account `_ +* get $HOME/openrc.sh from `the horizon dashboard `_ + +Setup +----- + +* Get and configure teuthology:: + + $ git clone http://github.com/ceph/teuthology + $ cd teuthology ; ./bootstrap install + $ source virtualenv/bin/activate + +* Setup the teuthology node:: + + $ teuthology-openstack --key-filename myself.pem --key-name myself --setup + +Get OpenStack credentials and test it +------------------------------------- + +* follow the `OpenStack API Quick Start `_ +* source $HOME/openrc.sh +* verify the OpenStack client works:: + + $ nova list + +----+------------+--------+------------+-------------+-------------------------+ + | ID | Name | Status | Task State | Power State | Networks | + +----+------------+--------+------------+-------------+-------------------------+ + +----+------------+--------+------------+-------------+-------------------------+ +* create a passwordless ssh public key with:: + + $ openstack keypair create myself > myself.pem + +-------------+-------------------------------------------------+ + | Field | Value | + +-------------+-------------------------------------------------+ + | fingerprint | 
e0:a3:ab:5f:01:54:5c:1d:19:40:d9:62:b4:b3:a1:0b | + | name | myself | + | user_id | 5cf9fa21b2e9406b9c4108c42aec6262 | + +-------------+-------------------------------------------------+ + $ chmod 600 myself.pem + +Usage +----- + +* Run the dummy suite. It does nothing useful but shows all works as + expected. Note that the first time it is run, it can take a long + time (from a few minutes to half an hour or so) because it downloads + and uploads a cloud image to the OpenStack provider. :: + + $ teuthology-openstack --key-filename myself.pem --key-name myself --suite dummy + Job scheduled with name ubuntu-2015-07-24_09:03:29-dummy-main---basic-openstack and ID 1 + 2015-07-24 09:03:30,520.520 INFO:teuthology.suite:ceph sha1: dedda6245ce8db8828fdf2d1a2bfe6163f1216a1 + 2015-07-24 09:03:31,620.620 INFO:teuthology.suite:ceph version: v9.0.2-829.gdedda62 + 2015-07-24 09:03:31,620.620 INFO:teuthology.suite:teuthology branch: main + 2015-07-24 09:03:32,196.196 INFO:teuthology.suite:ceph-qa-suite branch: main + 2015-07-24 09:03:32,197.197 INFO:teuthology.repo_utils:Fetching from upstream into /home/ubuntu/src/ceph-qa-suite_main + 2015-07-24 09:03:33,096.096 INFO:teuthology.repo_utils:Resetting repo at /home/ubuntu/src/ceph-qa-suite_main to branch main + 2015-07-24 09:03:33,157.157 INFO:teuthology.suite:Suite dummy in /home/ubuntu/src/ceph-qa-suite_main/suites/dummy generated 1 jobs (not yet filtered) + 2015-07-24 09:03:33,158.158 INFO:teuthology.suite:Scheduling dummy/{all/nop.yaml} + 2015-07-24 09:03:34,045.045 INFO:teuthology.suite:Suite dummy in /home/ubuntu/src/ceph-qa-suite_main/suites/dummy scheduled 1 jobs. + 2015-07-24 09:03:34,046.046 INFO:teuthology.suite:Suite dummy in /home/ubuntu/src/ceph-qa-suite_main/suites/dummy -- 0 jobs were filtered out. 
+ + 2015-07-24 11:03:34,104.104 INFO:teuthology.openstack: + web interface: http://167.114.242.13:8081/ + ssh access : ssh ubuntu@167.114.242.13 # logs in /usr/share/nginx/html + +* Visit the web interface (the URL is displayed at the end of the + teuthology-openstack output) to monitor the progress of the suite. + +* The virtual machine running the suite will persist for forensic + analysis purposes. To destroy it run:: + + $ teuthology-openstack --key-filename myself.pem --key-name myself --teardown + +* The test results can be uploaded to a publicly accessible location + with the ``--upload`` flag:: + + $ teuthology-openstack --key-filename myself.pem --key-name myself \ + --suite dummy --upload + + +Troubleshooting +--------------- + +Debian Jessie users may face the following error:: + + NameError: name 'PROTOCOL_SSLv3' is not defined + +The `workaround +`_ +suggesting to replace ``PROTOCOL_SSLv3`` with ``PROTOCOL_SSLv23`` in +the ssl.py has been reported to work. + +Running the OpenStack backend integration tests +----------------------------------------------- + +The easiest way to run the integration tests is to first run a dummy suite:: + + $ teuthology-openstack --key-name myself --suite dummy + ... + ssh access : ssh ubuntu@167.114.242.13 + +This will create a virtual machine suitable for the integration +test. Login wih the ssh access displayed at the end of the +``teuthology-openstack`` command and run the following:: + + $ pkill -f teuthology-worker + $ cd teuthology ; pip install "tox>=1.9" + $ tox -v -e openstack-integration + integration/openstack-integration.py::TestSuite::test_suite_noop PASSED + ... + ========= 9 passed in 2545.51 seconds ======== + $ tox -v -e openstack + integration/test_openstack.py::TestTeuthologyOpenStack::test_create PASSED + ... + ========= 1 passed in 204.35 seconds ========= + +Defining instances flavor and volumes +------------------------------------- + +Each target (i.e. 
a virtual machine or instance in the OpenStack +parlance) created by the OpenStack backend are exactly the same. By +default they have at least 8GB RAM, 20GB disk, 1 cpus and no disk +attached. It is equivalent to having the following in the +`~/.teuthology.yaml `_ file:: + + openstack: + ... + machine: + disk: 20 # GB + ram: 8000 # MB + cpus: 1 + volumes: + count: 0 + size: 1 # GB + +If a job needs more RAM or disk etc. the following can be included in +an existing facet (yaml file in the teuthology parlance):: + + openstack: + - machine: + disk: 100 # GB + volumes: + count: 4 + size: 10 # GB + +Teuthology interprets this as the minimimum requirements, on top of +the defaults found in the ``~/.teuthology.yaml`` file and the job will +be given instances with at least 100GB root disk, 8GB RAM, 1 cpus and +four 10GB volumes attached. The highest value wins: if the job claims +to need 4GB RAM and the defaults are 8GB RAM, the targets will all +have 8GB RAM. + +Note the dash before the ``machine`` key: the ``openstack`` element is +an array with one value. If the dash is missing, it is a dictionary instead. +It matters because there can be multiple entries per job such as:: + + openstack: + - machine: + disk: 40 # GB + ram: 8000 # MB + + openstack: + - machine: + ram: 32000 # MB + + openstack: + - volumes: # attached to each instance + count: 3 + size: 200 # GB + +When a job is composed with these, teuthology aggregates them as:: + + openstack: + - machine: + disk: 40 # GB + ram: 8000 # MB + - machine: + ram: 32000 # MB + - volumes: # attached to each instance + count: 3 + size: 200 # GB + +i.e. all entries are grouped in a list in the same fashion ``tasks`` are. +The resource requirement is the maximum of the resources found in each +element (including the default values). 
In the example above it is equivalent to:: + + openstack: + machine: + disk: 40 # GB + ram: 32000 # MB + volumes: # attached to each instance + count: 3 + size: 200 # GB diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 000000000..a948430da --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,5 @@ +sphinx >= 5.0.0 # for python 3.10 +sphinxcontrib-programoutput +mock == 2.0.0 +openstacksdk == 4.5.0 +python-openstackclient >= 6.0.0 diff --git a/docs/siteconfig.rst b/docs/siteconfig.rst new file mode 100644 index 000000000..5de2bda53 --- /dev/null +++ b/docs/siteconfig.rst @@ -0,0 +1,263 @@ +.. _site_config: + +Site and Client Configuration +============================= + +Teuthology requires several configuration options to be set, and provides many other optional ones. They are looked for in ``~/.teuthology.yaml`` if it exists, or ``/etc/teuthology.yaml`` if it doesn't. + +Here is a sample configuration with many of the options set and documented:: + + # lab_domain: the domain name to append to all short hostnames + lab_domain: example.com + + # The root directory to use for storage of all scheduled job logs and + # other data. + archive_base: /home/teuthworker/archive + + # The default machine_type value to use when not specified. Currently + # only used by teuthology-suite. + default_machine_type: awesomebox + + # Control how many machines need to be free in the cluster. 0 means + # Teuthology can use the entire cluster. + reserve_machines: 5 + + # The machine types currently in active use; currently only used by + # teuthology-exporter + active_machine_types: ['smithi'] + + # The host and port to use for the beanstalkd queue. This is required + # for scheduled jobs. + queue_host: localhost + queue_port: 11300 + + # The URL of the lock server (paddles). This is required for scheduled + # jobs. + lock_server: http://paddles.example.com:8080/ + + # The URL of the results server (paddles). 
+ results_server: http://paddles.example.com:8080/ + + # This URL of the results UI server (pulpito). You must of course use + # paddles for pulpito to be useful. + results_ui_server: http://pulpito.example.com/ + + # Email address that will receive job results summaries. + results_email: ceph-qa@example.com + + # Email address that job results summaries originate from + results_sending_email: teuthology@example.com + + # How long (in seconds) teuthology-results should wait for jobs to finish + # before considering them 'hung' + results_timeout: 43200 + + # Gitbuilder archive that stores e.g. ceph packages + gitbuilder_host: gitbuilder.example.com + + # URL for 'gitserver' helper web application + # see http://github.com/ceph/gitserver + githelper_base_url: http://git.ceph.com:8080 + + # Verify the packages signatures + check_package_signatures: true + + # Where all git repos are considered to reside. + ceph_git_base_url: https://github.com/ceph/ + + # Where the ceph git repo is considered to reside. + ceph_git_url: https://github.com/ceph/ceph.git + + # Where the ceph-qa-suite git repo is considered to reside. + ceph_qa_suite_git_url: https://github.com/ceph/ceph-qa-suite.git + + # Where teuthology and ceph-qa-suite repos should be stored locally + src_base_path: /home/foo/src + + # Where the teuthology git repo is considered to reside. + teuthology_git_url: https://github.com/ceph/teuthology.git + + # Where teuthology path is located: do not clone if present + #teuthology_path: . + + # Whether or not teuthology-suite, when scheduling, should update + # itself from git. This is disabled by default. + automated_scheduling: false + + # How often, in seconds, teuthology-supervisor should poll its child job + # processes + watchdog_interval: 120 + + # How old a scheduled job can be, in seconds, before the dispatcher + # considers it 'expired', skipping it. 
+ max_job_age: 1209600 + + # How long a scheduled job should be allowed to run, in seconds, before + # it is killed by the supervisor process. + max_job_time: 259200 + + # The template from which the URL of the repository containing packages + # is built. + # + # {host} is 'gitbuilder_host' from .teuthology.yaml + # {proj} is the value of 'project' from the job yaml file or 'ceph' + # {flavor} is the value of 'flavor' from the job yaml file or 'default' + # {uri} is ref/tag if 'tag' is set in the job yaml file + # or ref/branch if 'branch' is set in the job yaml file + # or sha1/sha1 if 'sha1' is set in the job yaml file + # or ref/main + # {pkg_type} is either 'deb' or 'rpm' depending on the host on which the + # packages are to be installed + # {dist} If lsb_release -si is Fedora the value is: + # Fedora 20 => fc20 + # Fedora 21 => fc21 + # etc. + # If lsb_release -si is CentOS or RedHatEnterpriseServer it is + # CentOS 6.5 => centos6 + # CentOS 7.0 => centos7 + # CentOS 7.1 => centos7 + # RedHatEnterpriseServer 6.4 => centos6 + # RedHatEnterpriseServer 7.0 => centos7 + # RedHatEnterpriseServer 7.1 => centos7 + # etc. + # Everything else is whatever lsb_release -sc returns + # Ubuntu 12.04 => precise + # Ubuntu 14.04 => trusty + # Debian GNU/Linux 7.0 => wheezy + # Debian GNU/Linux 8.0 => jessie + # etc. + # {arch} is the output of the 'arch' command on the host on which + # the packages are to be installed + # i386 + # x86_64 + # armv7l + # etc. + baseurl_template: http://{host}/{proj}-{pkg_type}-{dist}-{arch}-{flavor}/{uri} + + # If True, teuthology-suite verifies that a package matching the + # desired ceph branch exists in the gitbuilder. If False, no + # verification is done and teuthology-suite assumes the packages + # are either not necessary to run the task or they are created on + # demand. + suite_verify_ceph_hash: True + + # If true, teuthology-suite will schedule jobs even if the required + # packages are not built. 
+ suite_allow_missing_packages: False + + # The rsync destination to upload the job results, when --upload is + # is provided to teuthology-suite. + # + archive_upload: ubuntu@teuthology-logs.public.ceph.com:./ + + # The path to the SSH private key for rsync to upload to archive_upload + # + archive_upload_key: None + + # The public facing URL of the archive_upload location + # + archive_upload_url: http://teuthology-logs.public.ceph.com/ + + # The OpenStack backend configuration, a dictionary interpreted as follows + # + openstack: + + # The teuthology-openstack command will clone teuthology with + # this command for the purpose of deploying teuthology from + # scratch and run workers listening on the openstack tube + # + clone: git clone http://github.com/ceph/teuthology + + # The path to the user-data file used when creating a target. It can have + # the {os_type} and {os_version} placeholders which are replaced with + # the value of --os-type and --os-version. No instance of a give {os_type} + # and {os_version} combination can be created unless such a file exists. + # + user-data: teuthology/openstack/openstack-{os_type}-{os_version}-user-data.txt + + # The IP number of the instance running the teuthology cluster. It will + # be used to build user facing URLs and should usually be the floating IP + # associated with the instance running the pulpito server. + # + ip: 8.4.8.4 + + # OpenStack has predefined machine sizes (called flavors) + # For a given job requiring N machines, the following example select + # the smallest flavor that satisfies these requirements. For instance + # If there are three flavors + # + # F1 (10GB disk, 2000MB RAM, 1CPU) + # F2 (100GB disk, 7000MB RAM, 1CPU) + # F3 (50GB disk, 7000MB RAM, 1CPU) + # + # and machine: { disk: 40, ram: 7000, cpus: 1 }, F3 will be chosen. 
+ # F1 does not have enough RAM (2000 instead of the 7000 minimum) and + # although F2 satisfies all the requirements, it is larger than F3 + # (100GB instead of 50GB) and presumably more expensive. + # + # This configuration applies to all instances created for teuthology jobs + # that do not redefine these values. + # + machine: + + # The minimum root disk size of the flavor, in GB + # + disk: 20 # GB + + # The minimum RAM size of the flavor, in MB + # + ram: 8000 # MB + + # The minimum number of vCPUS of the flavor + # + cpus: 1 + + # The volumes attached to each instance. In the following example, + # three volumes of 10 GB will be created for each instance and + # will show as /dev/vdb, /dev/vdc and /dev/vdd + # + # This configuration applies to all instances created for teuthology jobs + # that do not redefine these values. + # + volumes: + + # The number of volumes + # + count: 3 + + # The size of each volume, in GB + # + size: 10 # GB + + # The host running a [PCP](http://pcp.io/) manager + pcp_host: http://pcp.front.sepia.ceph.com:44323/ + + # Settings for http://www.conserver.com/ + use_conserver: true + conserver_master: conserver.front.sepia.ceph.com + conserver_port: 3109 + + # Optionally use a specific SSH private key to connect to test nodes. + # Takes precedence over any entries in ~/.ssh/config. + ssh_key: ~/.ssh/my_key.rsa + + # Settings for [nsupdate-web](https://github.com/zmc/nsupdate-web) + # Used by the [libcloud](https://libcloud.apache.org/) backend + nsupdate_url: http://nsupdate.front.sepia.ceph.com/update + + # Settings for https://fogproject.org/ + fog: + endpoint: http://fog.example.com/fog + api_token: your_api_token + user_token: your_user_token + machine_types: ['mira', 'smithi'] + + # FOG provisioner is default and switching to Pelgas + # should be made explicitly + pelagos: + endpoint: http://head.ses.suse.de:5000/ + machine_types: ['type1', 'type2', 'type3'] + + # Do not allow more than that many jobs in a single run by default. 
+ # To disable this check use 0. + job_threshold: 500 diff --git a/examples/3node_ceph.yaml b/examples/3node_ceph.yaml new file mode 100644 index 000000000..16544f341 --- /dev/null +++ b/examples/3node_ceph.yaml @@ -0,0 +1,15 @@ +roles: +- [mon.0, mds.0, osd.0] +- [mon.1, osd.1] +- [mon.2, client.0] + +tasks: +- install: +- ceph: +- kclient: [client.0] +- interactive: + +targets: + ubuntu@: ssh-rsa + ubuntu@: ssh-rsa + ubuntu@: ssh-rsa diff --git a/examples/3node_rgw.yaml b/examples/3node_rgw.yaml new file mode 100644 index 000000000..e0a42e2ff --- /dev/null +++ b/examples/3node_rgw.yaml @@ -0,0 +1,24 @@ +interactive-on-error: true +overrides: + ceph: + branch: main + fs: xfs +roles: +- - mon.a + - mon.c + - osd.0 +- - mon.b + - mds.a + - osd.1 +- - client.0 +tasks: +- install: +- ceph: null +- rgw: + - client.0 +- interactive: + +targets: + ubuntu@: ssh-rsa + ubuntu@: ssh-rsa + ubuntu@: ssh-rsa diff --git a/examples/parallel_example.yaml b/examples/parallel_example.yaml new file mode 100644 index 000000000..d1491358b --- /dev/null +++ b/examples/parallel_example.yaml @@ -0,0 +1,20 @@ +interactive-on-error: true +overrides: +roles: +- - test0 + - test1 +- - test0 + - test1 +- - test0 +tasks: +- install: +- parallel_example: + - test0 + - test1 + +targets: + ubuntu@: ssh-rsa + ubuntu@: ssh-rsa + ubuntu@: ssh-rsa + + diff --git a/hammer.sh b/hammer.sh new file mode 100755 index 000000000..9f206f2e9 --- /dev/null +++ b/hammer.sh @@ -0,0 +1,32 @@ +#!/bin/sh -ex +# +# simple script to repeat a test until it fails +# + +if [ $1 = "-a" ]; then + shift + job=$1 + log="--archive $job.out" +else + job=$1 + log="" +fi + +test -e $1 + +title() { + echo '\[\033]0;hammer '$job' '$N' passes\007\]' +} + +N=0 +title +[ -n "$log" ] && [ -d $job.out ] && rm -rf $job.out +while teuthology $log $job $2 $3 $4 +do + date + N=$(($N+1)) + echo "$job: $N passes" + [ -n "$log" ] && rm -rf $job.out + title +done +echo "$job: $N passes, then failure." 
diff --git a/openstack-delegate.sh b/openstack-delegate.sh new file mode 100755 index 000000000..01b7e63a2 --- /dev/null +++ b/openstack-delegate.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +trap "rm -f teuthology-integration.pem ; openstack keypair delete teuthology-integration ; openstack server delete teuthology-integration" EXIT + +openstack keypair create teuthology-integration > teuthology-integration.pem +chmod 600 teuthology-integration.pem +teuthology-openstack --name teuthology-integration --key-filename teuthology-integration.pem --key-name teuthology-integration --suite teuthology/integration --wait --teardown --upload diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..ece6fe5f5 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,10 @@ +[build-system] +build-backend = "setuptools.build_meta" +requires = [ + "setuptools>=45", + "wheel", + "setuptools_scm>=6.2", +] + +[tool.setuptools_scm] +version_scheme = "python-simplified-semver" \ No newline at end of file diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 000000000..60d435c8b --- /dev/null +++ b/pytest.ini @@ -0,0 +1,5 @@ +[pytest] +norecursedirs = .git build virtualenv teuthology.egg-info .tox */integration task/tests +log_cli=true +log_level=NOTSET +addopts = -p no:cacheprovider diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000..f5c8f60de --- /dev/null +++ b/requirements.txt @@ -0,0 +1,208 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile --extra=test pyproject.toml +# +ansible-core==2.17.7 + # via teuthology (pyproject.toml) +apache-libcloud==3.8.0 + # via teuthology (pyproject.toml) +backports-ssl-match-hostname==3.7.0.1 + # via teuthology (pyproject.toml) +bcrypt==4.1.2 + # via paramiko +beanstalkc3==0.4.0 + # via teuthology (pyproject.toml) +boto==2.49.0 + # via teuthology (pyproject.toml) +boto3==1.34.8 + # via teuthology (pyproject.toml) +botocore==1.34.8 + # via 
+ # boto3 + # s3transfer +build==1.0.3 + # via pip-tools +cachetools==5.3.2 + # via tox +certifi==2023.11.17 + # via + # requests + # sentry-sdk +cffi==1.16.0 + # via + # cryptography + # pynacl +chardet==5.2.0 + # via tox +charset-normalizer==3.3.2 + # via requests +click==8.1.7 + # via pip-tools +colorama==0.4.6 + # via tox +configobj==5.0.9 + # via teuthology (pyproject.toml) +configparser==6.0.0 + # via teuthology (pyproject.toml) +coverage[toml]==7.4.0 + # via + # pytest-cov + # teuthology (pyproject.toml) +cryptography==41.0.7 + # via + # ansible-core + # paramiko + # pyopenssl +distlib==0.3.8 + # via virtualenv +docopt==0.6.2 + # via teuthology (pyproject.toml) +filelock==3.13.1 + # via + # tox + # virtualenv +gevent==24.2.1 + # via teuthology (pyproject.toml) +greenlet==3.0.3 + # via gevent +httplib2==0.22.0 + # via teuthology (pyproject.toml) +humanfriendly==10.0 + # via teuthology (pyproject.toml) +idna==3.6 + # via requests +iniconfig==2.0.0 + # via pytest +ipy==1.1 + # via teuthology (pyproject.toml) +jinja2==3.1.6 + # via ansible-core +jmespath==1.0.1 + # via + # boto3 + # botocore +lupa==2.2 + # via teuthology (pyproject.toml) +lxml==4.9.4 + # via teuthology (pyproject.toml) +markupsafe==2.1.3 + # via jinja2 +mock==5.1.0 + # via teuthology (pyproject.toml) +ndg-httpsclient==0.5.1 + # via teuthology (pyproject.toml) +netaddr==0.9.0 + # via teuthology (pyproject.toml) +packaging==23.2 + # via + # ansible-core + # build + # pyproject-api + # pytest + # tox +paramiko==3.4.0 + # via teuthology (pyproject.toml) +pexpect==4.9.0 + # via teuthology (pyproject.toml) +pip-tools==7.3.0 + # via teuthology (pyproject.toml) +platformdirs==4.1.0 + # via + # tox + # virtualenv +pluggy==1.3.0 + # via + # pytest + # tox +prettytable==3.9.0 + # via teuthology (pyproject.toml) +prometheus-client==0.19.0 + # via teuthology (pyproject.toml) +psutil==5.9.7 + # via teuthology (pyproject.toml) +ptyprocess==0.7.0 + # via pexpect +pyasn1==0.5.1 + # via + # ndg-httpsclient + # 
teuthology (pyproject.toml) +pycparser==2.21 + # via cffi +pyjwt==2.8.0 + # via teuthology (pyproject.toml) +pynacl==1.5.0 + # via + # paramiko + # teuthology (pyproject.toml) +pynose==1.5.1 + # via teuthology (pyproject.toml) +pyopenssl==23.3.0 + # via + # ndg-httpsclient + # teuthology (pyproject.toml) +pyparsing==3.1.1 + # via httplib2 +pyproject-api==1.6.1 + # via tox +pyproject-hooks==1.0.0 + # via build +pytest==7.4.3 + # via + # pytest-cov + # teuthology (pyproject.toml) +pytest-cov==4.1.0 + # via teuthology (pyproject.toml) +python-dateutil==2.8.2 + # via + # botocore + # teuthology (pyproject.toml) +pyyaml==6.0.1 + # via + # ansible-core + # teuthology (pyproject.toml) +requests==2.31.0 + # via + # apache-libcloud + # teuthology (pyproject.toml) +resolvelib==0.8.1 + # via ansible-core +s3transfer==0.10.0 + # via boto3 +sentry-sdk==1.39.1 + # via teuthology (pyproject.toml) +six==1.16.0 + # via + # configobj + # python-dateutil +toml==0.10.2 + # via teuthology (pyproject.toml) +tox==4.11.4 + # via teuthology (pyproject.toml) +types-psutil==6.0.0.20240621 + # via teuthology (pyproject.toml) +urllib3==1.26.18 + # via + # botocore + # requests + # sentry-sdk + # teuthology (pyproject.toml) +virtualenv==20.26.6 + # via tox +wcwidth==0.2.12 + # via prettytable +wheel==0.42.0 + # via pip-tools +xmltodict==0.13.0 + # via teuthology (pyproject.toml) +zope-event==5.0 + # via gevent +zope-interface==6.1 + # via gevent +openstacksdk==4.5.0 + # via teuthology (pyproject.toml) +python-openstackclient>=6.0.0 +# The following packages are considered to be unsafe in a requirements file: +# pip +# setuptools diff --git a/requirements.yml b/requirements.yml new file mode 100644 index 000000000..ec47f6713 --- /dev/null +++ b/requirements.yml @@ -0,0 +1,12 @@ +--- +collections: + - amazon.aws + - name: ansible.netcommon + version: "<6.0.0" # 6.0 requires ansible-core >= 2.14 + - ansible.posix + - name: ansible.utils + version: "<3.0.0" # 3.0 requires ansible-core >= 2.14 + - 
community.docker + - community.general + - community.postgresql + diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/scripts/describe.py b/scripts/describe.py new file mode 100644 index 000000000..0764ecf6b --- /dev/null +++ b/scripts/describe.py @@ -0,0 +1,79 @@ +import docopt + +import teuthology.config +import teuthology.describe_tests + +doc = """ +usage: + teuthology-describe-tests -h + teuthology-describe-tests [options] [--] + +Describe the contents of a qa suite by reading 'meta' elements from +yaml files in the suite. + +The 'meta' element should contain a list with a dictionary +of key/value pairs for entries, i.e.: + +meta: +- field1: value1 + field2: value2 + field3: value3 + desc: short human-friendly description + +Fields are user-defined, and are not required to be in all yaml files. + +positional arguments: + path of qa suite + +optional arguments: + -h, --help Show this help message and exit + -f , --fields Comma-separated list of fields to + include [default: desc] + --show-facet [yes|no] List the facet of each file + [default: yes] + --format [plain|json|csv] Output format (written to stdout) + [default: plain] + +options only for describing combinations represented by a suite: + -c, --combinations Describe test combinations rather than + individual yaml fragments + -s, --summary Print summary + --filter Only list tests whose description contains + at least one of the keywords in the comma + separated keyword string specified + --filter-out Do not list tests whose description contains + any of the keywords in the comma separated + keyword string specified + --filter-all Only list tests whose description contains + each of the keywords in the comma separated + keyword string specified + -F, --filter-fragments Check fragments additionaly to descriptions + using keywords specified with 'filter', + 'filter-out' and 'filter-all' options. 
+ -p, --print-description Print job descriptions for the suite, + used only in combination with 'summary' + -P, --print-fragments Print file list inovolved for each facet, + used only in combination with 'summary' + -l , --limit List at most this many jobs + [default: 0] + --subset Instead of listing the entire + suite, break the set of jobs into + pieces (each of which + will contain each facet at least + once) and list piece . + Listing 0/, 1/, + 2/ ... -1/ + will list all jobs in the + suite (many more than once). + -S , --seed Used for pseudo-random tests generation + involving facet whose path ends with '$' + operator, where negative value used for + a random seed + [default: -1] + --no-nested-subset Disable nested subsets +""" + + +def main(): + args = docopt.docopt(doc) + teuthology.describe_tests.main(args) diff --git a/scripts/dispatcher.py b/scripts/dispatcher.py new file mode 100644 index 000000000..45dd61b26 --- /dev/null +++ b/scripts/dispatcher.py @@ -0,0 +1,62 @@ +import argparse +import sys + +import teuthology.dispatcher.supervisor + +from .supervisor import parse_args as parse_supervisor_args + + +def parse_args(argv): + parser = argparse.ArgumentParser( + description="Start a dispatcher for the specified tube. Grab jobs from a beanstalk queue and run the teuthology tests they describe as subprocesses. The subprocess invoked is teuthology-supervisor." 
+ ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="be more verbose", + ) + parser.add_argument( + "-a", + "--archive-dir", + type=str, + help="path to archive results in", + ) + parser.add_argument( + "-t", + "--tube", + type=str, + help="which beanstalk tube to read jobs from", + required=True, + ) + parser.add_argument( + "-l", + "--log-dir", + type=str, + help="path in which to store the dispatcher log", + required=True, + ) + parser.add_argument( + "--exit-on-empty-queue", + action="store_true", + help="if the queue is empty, exit", + ) + return parser.parse_args(argv) + + +def main(): + if "--supervisor" in sys.argv: + # This is for transitional compatibility, so the old dispatcher can + # invoke the new supervisor. Once old dispatchers are phased out, + # this block can be as well. + sys.argv.remove("--supervisor") + sys.argv[0] = "teuthology-supervisor" + sys.exit(teuthology.dispatcher.supervisor.main( + parse_supervisor_args(sys.argv[1:]) + )) + else: + sys.exit(teuthology.dispatcher.main(parse_args(sys.argv[1:]))) + + +if __name__ == "__main__": + main() diff --git a/scripts/exporter.py b/scripts/exporter.py new file mode 100644 index 000000000..438d5d3f3 --- /dev/null +++ b/scripts/exporter.py @@ -0,0 +1,18 @@ +import docopt + +import teuthology.exporter + +doc = """ +usage: teuthology-exporter --help + teuthology-exporter [--interval INTERVAL] + +optional arguments: + -h, --help show this help message and exit + --interval INTERVAL update metrics this often, in seconds + [default: 60] +""" + + +def main(): + args = docopt.docopt(doc) + teuthology.exporter.main(args) diff --git a/scripts/kill.py b/scripts/kill.py new file mode 100644 index 000000000..31acc8b1a --- /dev/null +++ b/scripts/kill.py @@ -0,0 +1,44 @@ +import docopt + +import teuthology.config +import teuthology.kill + +doc = """ +usage: teuthology-kill -h + teuthology-kill [-a ARCHIVE] [-p] -r RUN + teuthology-kill [-a ARCHIVE] [-p] -m MACHINE_TYPE -r RUN + 
teuthology-kill [-a ARCHIVE] [-o OWNER] -r RUN -j JOB ... + teuthology-kill [-a ARCHIVE] [-o OWNER] -J JOBSPEC + teuthology-kill [-p] -o OWNER -m MACHINE_TYPE -r RUN + +Kill running teuthology jobs: +1. Removes any queued jobs from the beanstalk queue +2. Kills any running jobs +3. Nukes any machines involved + +NOTE: Must be run on the same machine that is executing the teuthology job +processes. + +optional arguments: + -h, --help show this help message and exit + -a ARCHIVE, --archive ARCHIVE + The base archive directory + [default: {archive_base}] + -p, --preserve-queue Preserve the queue - do not delete queued jobs + -r, --run RUN The name(s) of the run(s) to kill + -j, --job JOB The job_id of the job to kill + -J, --jobspec JOBSPEC + The 'jobspec' of the job to kill. A jobspec consists of + both the name of the run and the job_id, separated by a + '/'. e.g. 'my-test-run/1234' + -o, --owner OWNER The owner of the job(s) + -m, --machine-type MACHINE_TYPE + The type of machine the job(s) are running on. + This is required if killing a job that is still + entirely in the queue. 
+""".format(archive_base=teuthology.config.config.archive_base) + + +def main(): + args = docopt.docopt(doc) + teuthology.kill.main(args) diff --git a/scripts/lock.py b/scripts/lock.py new file mode 100644 index 000000000..69e50ccaf --- /dev/null +++ b/scripts/lock.py @@ -0,0 +1,181 @@ +import argparse +import textwrap +import sys + +import teuthology.lock +import teuthology.lock.cli + + +def _positive_int(string): + value = int(string) + if value < 1: + raise argparse.ArgumentTypeError( + '{string} is not positive'.format(string=string)) + return value + + +def main(): + sys.exit(teuthology.lock.cli.main(parse_args(sys.argv[1:]))) + + +def parse_args(argv): + parser = argparse.ArgumentParser( + description='Lock, unlock, or query lock status of machines', + epilog=textwrap.dedent(''' + Examples: + teuthology-lock --summary + teuthology-lock --lock-many 1 --machine-type vps + teuthology-lock --lock -t target.yaml + teuthology-lock --list-targets plana01 + teuthology-lock --brief + teuthology-lock --brief --owner user@host + teuthology-lock --update --status down --desc testing plana01 + '''), + formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument( + '-v', '--verbose', + action='store_true', + default=False, + help='be more verbose', + ) + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument( + '--list', + action='store_true', + default=False, + help='Show lock info for machines owned by you, or only machines ' + + 'specified. Can be restricted by --owner, --status, and --locked.', + ) + group.add_argument( + '--brief', + action='store_true', + default=False, + help='Like --list, but with summary instead of detail', + ) + group.add_argument( + '--list-targets', + action='store_true', + default=False, + help='Show lock info for all machines, or only machines specified, ' + + 'in targets: yaml format. 
Can be restricted by --owner, --status, ' + + 'and --locked.', + ) + group.add_argument( + '--lock', + action='store_true', + default=False, + help='lock particular machines', + ) + group.add_argument( + '--unlock', + action='store_true', + default=False, + help='unlock particular machines', + ) + group.add_argument( + '--lock-many', + dest='num_to_lock', + type=_positive_int, + help='lock this many machines', + ) + group.add_argument( + '--update', + action='store_true', + default=False, + help='update the description or status of some machines', + ) + group.add_argument( + '--summary', + action='store_true', + default=False, + help='summarize locked-machine counts by owner', + ) + parser.add_argument( + '-a', '--all', + action='store_true', + default=False, + help='list all machines, not just those owned by you', + ) + parser.add_argument( + '--owner', + default=None, + help='owner of the lock(s) (must match to unlock a machine)', + ) + parser.add_argument( + '-f', + action='store_true', + default=False, + help="don't exit after the first error, continue locking or " + + "unlocking other machines", + ) + parser.add_argument( + '--desc', + default=None, + help='lock description', + ) + parser.add_argument( + '--desc-pattern', + default=None, + help='lock description', + ) + parser.add_argument( + '-m', '--machine-type', + default=None, + help='Type of machine to lock, valid choices: mira | plana | ' + + 'burnupi | vps | saya | tala', + ) + parser.add_argument( + '--status', + default=None, + choices=['up', 'down'], + help='whether a machine is usable for testing', + ) + parser.add_argument( + '--locked', + default=None, + choices=['true', 'false'], + help='whether a machine is locked', + ) + parser.add_argument( + '-t', '--targets', + dest='targets', + default=None, + help='input yaml containing targets', + ) + parser.add_argument( + 'machines', + metavar='MACHINE', + default=[], + nargs='*', + help='machines to operate on', + ) + parser.add_argument( + 
'--os-type', + default=None, + help='OS type (distro)', + ) + parser.add_argument( + '--os-version', + default=None, + help='OS (distro) version such as "12.10"', + ) + parser.add_argument( + '--arch', + default=None, + help='architecture (x86_64, i386, armv7, aarch64)', + ) + parser.add_argument( + '--json-query', + default=None, + help=textwrap.dedent('''\ + JSON fragment, explicitly given, or a file containing + JSON, containing a query for --list or --brief. + Example: teuthology-lock --list --all --json-query + '{"vm_host":{"name":"mira003.front.sepia.ceph.com"}}' + will list all machines who have a vm_host entry + with a dictionary that contains at least the name key + with value mira003.front.sepia.ceph.com. + Note: be careful about quoting and the shell.'''), + ) + + return parser.parse_args(argv) diff --git a/scripts/ls.py b/scripts/ls.py new file mode 100644 index 000000000..5c9b33be3 --- /dev/null +++ b/scripts/ls.py @@ -0,0 +1,19 @@ +""" +usage: teuthology-ls [-h] [-v] + +List teuthology job results + +positional arguments: + path under which to archive results + +optional arguments: + -h, --help show this help message and exit + -v, --verbose show reasons tests failed +""" +import docopt +import teuthology.ls + + +def main(): + args = docopt.docopt(__doc__) + teuthology.ls.main(args) diff --git a/scripts/node_cleanup.py b/scripts/node_cleanup.py new file mode 100755 index 000000000..5d474b7c8 --- /dev/null +++ b/scripts/node_cleanup.py @@ -0,0 +1,74 @@ +import argparse +import logging +import sys + +import teuthology +from teuthology.config import config +from teuthology.lock import query, ops + + +def main(): + args = parse_args(sys.argv[1:]) + if args.verbose: + teuthology.log.setLevel(logging.DEBUG) + else: + teuthology.log.setLevel(100) + log = logging.getLogger(__name__) + logger = logging.getLogger() + for handler in logger.handlers: + handler.setFormatter( + logging.Formatter('%(message)s') + ) + try: + stale = query.find_stale_locks(args.owner) 
+ except Exception: + log.exception(f"Error while check for stale locks held by {args.owner}") + return + if not stale: + return + by_owner = {} + for node in stale: + if args.owner and node['locked_by'] != args.owner: + log.warning( + f"Node {node['name']} expected to be locked by {args.owner} " + f"but found {node['locked_by']} instead" + ) + continue + by_owner.setdefault(node['locked_by'], []).append(node) + if args.dry_run: + log.info("Would attempt to unlock:") + for owner, nodes in by_owner.items(): + for node in nodes: + node_job = node['description'].replace( + config.archive_base, config.results_ui_server) + log.info(f"{node['name']}\t{node_job}") + else: + for owner, nodes in by_owner.items(): + ops.unlock_safe([node["name"] for node in nodes], owner) + log.info(f"unlocked {len(stale)} nodes") + +def parse_args(argv): + parser = argparse.ArgumentParser( + description="Find and unlock nodes that are still locked by jobs that are no " + "longer active", + ) + parser.add_argument( + '-v', '--verbose', + action='store_true', + default=False, + help='Be more verbose', + ) + parser.add_argument( + '--dry-run', + action='store_true', + default=False, + help="List nodes that would be unlocked if the flag were omitted", + ) + parser.add_argument( + '--owner', + help='Optionally, find nodes locked by a specific user', + ) + return parser.parse_args(argv) + +if __name__ == "__main__": + main() diff --git a/scripts/openstack.py b/scripts/openstack.py new file mode 100644 index 000000000..a9f09332e --- /dev/null +++ b/scripts/openstack.py @@ -0,0 +1,409 @@ +import argparse +import sys +import os + +import teuthology.openstack + +def main(argv=sys.argv[1:]): + sys.exit(teuthology.openstack.main(parse_args(argv), argv)) + +def get_key_parser(): + parser = argparse.ArgumentParser() + parser.add_argument( + '--key-name', + help='OpenStack keypair name', + ) + parser.add_argument( + '--key-filename', + help='path to the ssh private key. 
Default: %(default)s', + default=[ + os.environ['HOME'] + '/.ssh/id_rsa', + os.environ['HOME'] + '/.ssh/id_dsa', + os.environ['HOME'] + '/.ssh/id_ecdsa' + ] + ) + return parser + +def get_suite_parser(): + parser = argparse.ArgumentParser() + # copy/pasted from scripts/suite.py + parser.add_argument( + 'config_yaml', + nargs='*', + help='Optional extra job yaml to include', + ) + parser.add_argument( + '-v', '--verbose', + action='store_true', default=None, + help='be more verbose', + ) + parser.add_argument( + '--dry-run', + action='store_true', default=None, + help='Do a dry run; do not schedule anything', + ) + parser.add_argument( + '-s', '--suite', + help='The suite to schedule', + ) + parser.add_argument( + '-c', '--ceph', + help='The ceph branch to run against', + default=os.getenv('TEUTH_CEPH_BRANCH', 'main'), + ) + parser.add_argument( + '-k', '--kernel', + help=('The kernel branch to run against; if not ' + 'supplied, the installed kernel is unchanged'), + ) + parser.add_argument( + '-f', '--flavor', + help=("The ceph packages shaman flavor to run with:" + "('default', 'crimson', 'notcmalloc', 'jaeger')"), + default='default', + ) + parser.add_argument( + '-d', '--distro', + help='Distribution to run against', + ) + parser.add_argument( + '--suite-branch', + help='Use this suite branch instead of the ceph branch', + default=os.getenv('TEUTH_SUITE_BRANCH', 'main'), + ) + parser.add_argument( + '-e', '--email', + help='When tests finish or time out, send an email here', + ) + parser.add_argument( + '-N', '--num', + help='Number of times to run/queue the job', + type=int, + default=1, + ) + parser.add_argument( + '-l', '--limit', + metavar='JOBS', + help='Queue at most this many jobs', + type=int, + ) + parser.add_argument( + '--subset', + help=('Instead of scheduling the entire suite, break the ' + 'set of jobs into pieces (each of which will ' + 'contain each facet at least once) and schedule ' + 'piece . Scheduling 0/, 1/, ' + '2/ ... 
-1/ will schedule all ' + 'jobs in the suite (many more than once).') + ) + parser.add_argument( + '-p', '--priority', + help='Job priority (lower is sooner)', + type=int, + default=1000, + ) + parser.add_argument( + '--timeout', + help=('How long, in seconds, to wait for jobs to finish ' + 'before sending email. This does not kill jobs.'), + type=int, + default=43200, + ) + parser.add_argument( + '--filter', + help=('Only run jobs whose description contains at least one ' + 'of the keywords in the comma separated keyword ' + 'string specified. ') + ) + parser.add_argument( + '--filter-out', + help=('Do not run jobs whose description contains any of ' + 'the keywords in the comma separated keyword ' + 'string specified. ') + ) + parser.add_argument( + '--throttle', + help=('When scheduling, wait SLEEP seconds between jobs. ' + 'Useful to avoid bursts that may be too hard on ' + 'the underlying infrastructure or exceed OpenStack API ' + 'limits (server creation per minute for instance).'), + type=int, + default=15, + ) + parser.add_argument( + '--suite-relpath', + help=('Look for tasks and suite definitions in this' + 'subdirectory of the suite repo.'), + ) + parser.add_argument( + '-r', '--rerun', + help=('Attempt to reschedule a run, selecting only those' + 'jobs whose status are mentioned by' + '--rerun-status.' + 'Note that this is implemented by scheduling an' + 'entirely new suite and including only jobs whose' + 'descriptions match the selected ones. It does so' + 'using the same logic as --filter.' + 'Of all the flags that were passed when scheduling' + 'the original run, the resulting one will only' + 'inherit the suite value. Any others must be' + 'passed as normal while scheduling with this' + 'feature.'), + ) + parser.add_argument( + '-R', '--rerun-statuses', + help=("A comma-separated list of statuses to be used" + "with --rerun. 
Supported statuses are: 'dead'," + "'fail', 'pass', 'queued', 'running', 'waiting'"), + default='fail,dead', + ) + parser.add_argument( + '-D', '--distroversion', '--distro-version', + help='Distro version to run against', + ) + parser.add_argument( + '-n', '--newest', + help=('Search for the newest revision built on all' + 'required distro/versions, starting from' + 'either --ceph or --sha1, backtracking' + 'up to commits'), + type=int, + default=0, + ) + parser.add_argument( + '-S', '--sha1', + help=('The ceph sha1 to run against (overrides -c)' + 'If both -S and -c are supplied, -S wins, and' + 'there is no validation that sha1 is contained' + 'in branch') + ) + parser.add_argument( + '--ceph-repo', + help=("Query this repository for Ceph branch and SHA1"), + default=os.getenv('TEUTH_CEPH_REPO', 'https://github.com/ceph/ceph'), + ) + parser.add_argument( + '--suite-repo', + help=("Use tasks and suite definition in this repository"), + default=os.getenv('TEUTH_SUITE_REPO', 'https://github.com/ceph/ceph'), + ) + parser.add_argument( + '--sleep-before-teardown', + help='Number of seconds to sleep before the teardown', + default=0 + ) + return parser + +def get_openstack_parser(): + parser = argparse.ArgumentParser() + parser.add_argument( + '--wait', + action='store_true', default=None, + help='block until the suite is finished', + ) + parser.add_argument( + '--name', + help='OpenStack primary instance name', + default='teuthology', + ) + parser.add_argument( + '--nameserver', + help='nameserver ip address (optional)', + ) + parser.add_argument( + '--simultaneous-jobs', + help='maximum number of jobs running in parallel', + type=int, + default=1, + ) + parser.add_argument( + '--controller-cpus', + help='override default minimum vCPUs when selecting flavor for teuthology VM', + type=int, + default=0, + ) + parser.add_argument( + '--controller-ram', + help='override default minimum RAM (in megabytes) when selecting flavor for teuthology VM', + type=int, + default=0, 
+ ) + parser.add_argument( + '--controller-disk', + help='override default minimum disk size (in gigabytes) when selecting flavor for teuthology VM', + type=int, + default=0, + ) + parser.add_argument( + '--setup', + action='store_true', default=False, + help='deploy the cluster, if it does not exist', + ) + parser.add_argument( + '--teardown', + action='store_true', default=None, + help='destroy the cluster, if it exists', + ) + parser.add_argument( + '--teuthology-git-url', + help="git clone url for teuthology", + default=os.getenv('TEUTH_REPO', 'https://github.com/ceph/teuthology'), + ) + parser.add_argument( + '--teuthology-branch', + help="use this teuthology branch instead of main", + default=os.getenv('TEUTH_BRANCH', 'main'), + ) + parser.add_argument( + '--ceph-workbench-git-url', + help="git clone url for ceph-workbench", + ) + parser.add_argument( + '--ceph-workbench-branch', + help="use this ceph-workbench branch instead of main", + default='main', + ) + parser.add_argument( + '--upload', + action='store_true', default=False, + help='upload archives to an rsync server', + ) + parser.add_argument( + '--archive-upload', + help='rsync destination to upload archives', + default='ubuntu@teuthology-logs.public.ceph.com:./', + ) + parser.add_argument( + '--archive-upload-url', + help='Public facing URL where archives are uploaded', + default='http://teuthology-logs.public.ceph.com', + ) + parser.add_argument( + '--test-repo', + action='append', + help=('Package repository to be added on test nodes, which are specified ' + 'as NAME:URL, NAME!PRIORITY:URL or @FILENAME, for details see below.'), + default=None, + ) + parser.add_argument( + '--no-canonical-tags', + action='store_true', default=False, + help='configure remote teuthology to not fetch tags from http://github.com/ceph/ceph.git in buildpackages task', + ) + return parser + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + parents=[ + 
get_suite_parser(), + get_key_parser(), + get_openstack_parser(), + ], + conflict_handler='resolve', + add_help=False, + epilog="""test repos: + +Test repository can be specified using --test-repo optional argument +with value in the following formats: NAME:URL, NAME!PRIORITY:URL +or @FILENAME. See examples: + +1) Essential usage requires to provide repo name and url: + + --test-repo foo:http://example.com/repo/foo + +2) Repo can be prioritized by adding a number after '!' symbol + in the name: + + --test-repo 'bar!10:http://example.com/repo/bar' + +3) Repo data can be taken from a file by simply adding '@' symbol + at the beginning argument value, for example from yaml: + + --test-repo @path/to/foo.yaml + + where `foo.yaml` contains one or more records like: + + - name: foo + priority: 1 + url: http://example.com/repo/foo + +4) Or from json file: + + --test-repo @path/to/foo.json + + where `foo.json` content is: + + [{"name":"foo","priority":1,"url":"http://example.com/repo/foo"}] + + +Several repos can be provided with multiple usage of --test-repo and/or +you can provide several repos within one yaml or json file. +The repositories are added in the order they appear in the command line or +in the file. 
Example: + + --- + # The foo0 repo will be included first, after all that have any priority, + # in particular after foo1 because it has lowest priority + - name: foo0 + url: http://example.com/repo/foo0 + # The foo1 will go after foo2 because it has lower priority then foo2 + - name: foo1 + url: http://example.com/repo/foo1 + priority: 2 + # The foo2 will go first because it has highest priority + - name: foo2 + url: http://example.com/repo/foo2 + priority: 1 + # The foo3 will go after foo0 because it appears after it in this file + - name: foo3 + url: http://example.com/repo/foo3 + +Equivalent json file content below: + + [ + { + "name": "foo0", + "url": "http://example.com/repo/foo0" + }, + { + "name": "foo1", + "url": "http://example.com/repo/foo1", + "priority": 2 + }, + { + "name": "foo2", + "url": "http://example.com/repo/foo2", + "priority": 1 + }, + { + "name": "foo3", + "url": "http://example.com/repo/foo3" + } + ] + +At the moment supported only files with extensions: .yaml, .yml, .json, .jsn. + +teuthology-openstack %s +""" % teuthology.__version__, + description=""" +Run a suite of ceph integration tests. A suite is a directory containing +facets. A facet is a directory containing config snippets. Running a suite +means running teuthology for every configuration combination generated by +taking one config snippet from each facet. Any config files passed on the +command line will be used for every combination, and will override anything in +the suite. By specifying a subdirectory in the suite argument, it is possible +to limit the run to a specific facet. For instance -s upgrade/dumpling-x only +runs the dumpling-x facet of the upgrade suite. + +Display the http and ssh access to follow the progress of the suite +and analyze results. 
+ + firefox http://183.84.234.3:8081/ + ssh -i teuthology-admin.pem ubuntu@183.84.234.3 + +""") + return parser + +def parse_args(argv): + return get_parser().parse_args(argv) diff --git a/scripts/prune_logs.py b/scripts/prune_logs.py new file mode 100644 index 000000000..424b4b7b7 --- /dev/null +++ b/scripts/prune_logs.py @@ -0,0 +1,38 @@ +import docopt + +import teuthology.config +import teuthology.prune + +doc = """ +usage: + teuthology-prune-logs -h + teuthology-prune-logs [-v] [options] + +Prune old logfiles from the archive + +optional arguments: + -h, --help Show this help message and exit + -v, --verbose Be more verbose + -a ARCHIVE, --archive ARCHIVE + The base archive directory + [default: {archive_base}] + --dry-run Don't actually delete anything; just log what would be + deleted + -p DAYS, --pass DAYS Remove all logs for jobs which passed and are older + than DAYS. Negative values will skip this operation. + [default: 14] + -f DAYS, --fail DAYS Like --pass, but for failed jobs. [default: -1] + -r DAYS, --remotes DAYS + Remove the 'remote' subdir of jobs older than DAYS. + Negative values will skip this operation. + [default: 60] + -z DAYS, --compress DAYS + Compress (using gzip) any teuthology.log files older + than DAYS. Negative values will skip this operation. + [default: 30] +""".format(archive_base=teuthology.config.config.archive_base) + + +def main(): + args = docopt.docopt(doc) + teuthology.prune.main(args) diff --git a/scripts/queue.py b/scripts/queue.py new file mode 100644 index 000000000..8ea5ca5c2 --- /dev/null +++ b/scripts/queue.py @@ -0,0 +1,36 @@ +import docopt + +import teuthology.config +import teuthology.beanstalk + +doc = """ +usage: teuthology-queue -h + teuthology-queue [-s|-d|-f] -m MACHINE_TYPE + teuthology-queue [-r] -m MACHINE_TYPE + teuthology-queue -m MACHINE_TYPE -D PATTERN + teuthology-queue -p SECONDS [-m MACHINE_TYPE] + +List Jobs in queue. 
+If -D is passed, then jobs with PATTERN in the job name are deleted from the +queue. + +Arguments: + -m, --machine_type MACHINE_TYPE [default: multi] + Which machine type queue to work on. + +optional arguments: + -h, --help Show this help message and exit + -D, --delete PATTERN Delete Jobs with PATTERN in their name + -d, --description Show job descriptions + -r, --runs Only show run names + -f, --full Print the entire job config. Use with caution. + -s, --status Prints the status of the queue + -p, --pause SECONDS Pause queues for a number of seconds. A value of 0 + will unpause. If -m is passed, pause that queue, + otherwise pause all queues. +""" + + +def main(): + args = docopt.docopt(doc) + teuthology.beanstalk.main(args) diff --git a/scripts/reimage.py b/scripts/reimage.py new file mode 100644 index 000000000..42ec6e8ff --- /dev/null +++ b/scripts/reimage.py @@ -0,0 +1,25 @@ +import docopt +import sys + +import teuthology.reimage + +doc = """ +usage: teuthology-reimage --help + teuthology-reimage --os-type distro --os-version version [options] ... + +Reimage nodes without locking using specified distro type and version. +The nodes must be locked by the current user, otherwise an error occurs. +Custom owner can be specified in order to provision someone else nodes. +Reimaging unlocked nodes cannot be provided. + +Standard arguments: + -h, --help Show this help message and exit + -v, --verbose Be more verbose + --os-type Distro type like: rhel, ubuntu, etc. + --os-version Distro version like: 7.6, 16.04, etc. 
+ --owner user@host Owner of the locked machines +""" + +def main(argv=sys.argv[1:]): + args = docopt.docopt(doc, argv=argv) + return teuthology.reimage.main(args) diff --git a/scripts/report.py b/scripts/report.py new file mode 100644 index 000000000..d2b39d3c5 --- /dev/null +++ b/scripts/report.py @@ -0,0 +1,42 @@ +import docopt + +import teuthology.report + +doc = """ +usage: + teuthology-report -h + teuthology-report [-v] [-R] [-n] [-s SERVER] [-a ARCHIVE] [-D] -r RUN ... + teuthology-report [-v] [-s SERVER] [-a ARCHIVE] [-D] -r RUN -j JOB ... + teuthology-report [-v] [-R] [-n] [-s SERVER] [-a ARCHIVE] --all-runs + +Submit test results to a web service + +optional arguments: + -h, --help show this help message and exit + -a ARCHIVE, --archive ARCHIVE + The base archive directory + [default: {archive_base}] + -r [RUN ...], --run [RUN ...] + A run (or list of runs) to submit + -j [JOB ...], --job [JOB ...] + A job (or list of jobs) to submit + --all-runs Submit all runs in the archive + -R, --refresh Re-push any runs already stored on the server. Note + that this may be slow. + -s SERVER, --server SERVER + "The server to post results to, e.g. + http://localhost:8080/ . May also be specified in + ~/.teuthology.yaml as 'results_server' + -n, --no-save By default, when submitting all runs, we remember the + last successful submission in a file called + 'last_successful_run'. Pass this flag to disable that + behavior. + -D, --dead Mark all given jobs (or entire runs) with status + 'dead'. Implies --refresh. 
+ -v, --verbose be more verbose +""".format(archive_base=teuthology.config.config.archive_base) + + +def main(): + args = docopt.docopt(doc) + teuthology.report.main(args) diff --git a/scripts/results.py b/scripts/results.py new file mode 100644 index 000000000..99e70a3fd --- /dev/null +++ b/scripts/results.py @@ -0,0 +1,25 @@ +""" +usage: teuthology-results [-h] [-v] [--dry-run] [--email EMAIL] [--timeout TIMEOUT] --archive-dir DIR --name NAME [--subset SUBSET] [--seed SEED] [--no-nested-subset] + +Email teuthology suite results + +optional arguments: + -h, --help show this help message and exit + -v, --verbose be more verbose + --dry-run Instead of sending the email, just print it + --email EMAIL address to email test failures to + --timeout TIMEOUT how many seconds to wait for all tests to finish + [default: 0] + --archive-dir DIR path under which results for the suite are stored + --name NAME name of the suite + --subset SUBSET subset passed to teuthology-suite + --seed SEED random seed used in teuthology-suite + --no-nested-subset disable nested subsets used in teuthology-suite +""" +import docopt +import teuthology.results + + +def main(): + args = docopt.docopt(__doc__) + teuthology.results.main(args) diff --git a/scripts/run.py b/scripts/run.py new file mode 100644 index 000000000..20ee6ef3b --- /dev/null +++ b/scripts/run.py @@ -0,0 +1,38 @@ +""" +usage: teuthology --help + teuthology --version + teuthology [options] [--] ... + +Run ceph integration tests + +positional arguments: + one or more config files to read + +optional arguments: + -h, --help show this help message and exit + -v, --verbose be more verbose + --version the current installed version of teuthology + -a DIR, --archive DIR path to archive results in + --description DESCRIPTION job description + --owner OWNER job owner + --lock lock machines for the duration of the run + --machine-type MACHINE_TYPE Type of machine to lock/run tests on. 
+ --os-type OS_TYPE Distro/OS of machine to run test on. + --os-version OS_VERSION Distro/OS version of machine to run test on. + --block block until locking machines succeeds (use with --lock) + --name NAME name for this teuthology run + --suite-path SUITE_PATH Location of ceph-qa-suite on disk. If not specified, + it will be fetched + --interactive-on-error drop to a python shell on failure, which will + halt the job; developer can then ssh to targets + and examine cluster state. + +""" +import docopt + +import teuthology.run + + +def main(): + args = docopt.docopt(__doc__, version=teuthology.__version__) + teuthology.run.main(args) diff --git a/scripts/schedule.py b/scripts/schedule.py new file mode 100644 index 000000000..58f7a4624 --- /dev/null +++ b/scripts/schedule.py @@ -0,0 +1,61 @@ +import docopt + +import teuthology.misc +import teuthology.schedule +import sys + +doc = """ +usage: teuthology-schedule -h + teuthology-schedule [options] --name [--] [ ...] + +Schedule ceph integration tests + +positional arguments: + Config file to read + "-" indicates read stdin. + +optional arguments: + -h, --help Show this help message and exit + -v, --verbose Be more verbose + -b , --queue-backend + Queue backend name, use prefix '@' + to append job config to the given + file path as yaml. + [default: beanstalk] + -n , --name Name of suite run the job is part of + -d , --description Job description + -o , --owner Job owner + -w , --worker Which worker to use (type of machine) + [default: plana] + -p , --priority Job priority (lower is sooner) + [default: 1000] + -N , --num Number of times to run/queue the job + [default: 1] + + --first-in-suite Mark the first job in a suite so suite + can note down the rerun-related info + [default: False] + --last-in-suite Mark the last job in a suite so suite + post-processing can be run + [default: False] + --email Where to send the results of a suite. + Only applies to the last job in a suite. 
+ --timeout How many seconds to wait for jobs to + finish before emailing results. Only + applies to the last job in a suite. + --seed The random seed for rerunning the suite. + Only applies to the last job in a suite. + --subset The subset option passed to teuthology-suite. + Only applies to the last job in a suite. + --no-nested-subset The no-nested-subset option passed to + teuthology-suite. + Only applies to the last job in a suite. + --dry-run Instead of scheduling, just output the + job config. + +""" + + +def main(argv=sys.argv[1:]): + args = docopt.docopt(doc, argv=argv) + teuthology.schedule.main(args) diff --git a/scripts/suite.py b/scripts/suite.py new file mode 100644 index 000000000..3cf858db6 --- /dev/null +++ b/scripts/suite.py @@ -0,0 +1,232 @@ +import docopt +import sys + +import teuthology.suite +from teuthology.suite import override_arg_defaults as defaults +from teuthology.config import config + +doc = """ +usage: teuthology-suite --help + teuthology-suite [-v | -vv ] --suite [options] [...] + teuthology-suite [-v | -vv ] --rerun [options] [...] + +Run a suite of ceph integration tests. A suite is a directory containing +facets. A facet is a directory containing config snippets. Running a suite +means running teuthology for every configuration combination generated by +taking one config snippet from each facet. Any config files passed on the +command line will be used for every combination, and will override anything in +the suite. By specifying a subdirectory in the suite argument, it is possible +to limit the run to a specific facet. For instance -s upgrade/dumpling-x only +runs the dumpling-x facet of the upgrade suite. + +Miscellaneous arguments: + -h, --help Show this help message and exit + -v, --verbose Be more verbose + --dry-run Do a dry run; do not schedule anything. In + combination with -vv, also call + teuthology-schedule with --dry-run. + -y, --non-interactive Do not ask question and say yes when + it is possible. 
+ +Standard arguments: + Optional extra job yaml to include + -s , --suite + The suite to schedule + --wait Block until the suite is finished + -c , --ceph The ceph branch to run against + [default: {default_ceph_branch}] + -S , --sha1 The ceph sha1 to run against (overrides -c) + If both -S and -c are supplied, -S wins, and + there is no validation that sha1 is contained + in branch + -n , --newest + Search for the newest revision built on all + required distro/versions, starting from + either --ceph or --sha1, backtracking + up to commits [default: 0] + -k , --kernel + The kernel branch to run against, + use 'none' to bypass kernel task. + [default: distro] + -f , --flavor + The ceph packages shaman flavor to run with: + ('default', 'crimson', 'notcmalloc', 'jaeger') + [default: default] + -t , --teuthology-branch + The teuthology branch to run against. + Default value is determined in the next order. + There is TEUTH_BRANCH environment variable set. + There is `qa/.teuthology_branch` present in + the suite repo and contains non-empty string. + There is `teuthology_branch` present in one of + the user or system `teuthology.yaml` configuration + files respectively, otherwise use `main`. + -m , --machine-type + Machine type [default: {default_machine_type}] + -d , --distro + Distribution to run against + -D , --distro-version + Distro version to run against + --ceph-repo Query this repository for Ceph branch and SHA1 + values [default: {default_ceph_repo}] + --suite-repo Use tasks and suite definition in this repository + [default: {default_suite_repo}] + --suite-relpath + Look for tasks and suite definitions in this + subdirectory of the suite repo. + [default: qa] + --suite-branch + Use this suite branch instead of the ceph branch + --suite-sha1 The suite sha1 to use for the tests (overrides + --suite-branch) + --suite-dir Use this alternative directory as-is when + assembling jobs from yaml fragments. 
This causes + to be ignored for scheduling + purposes, but it will still be used for test + running. The must have `qa/suite` + sub-directory. + --validate-sha1 + Validate that git SHA1s passed to -S exist. + [default: true] + --kdb + Enable/disable kdb in kernel + [default: true] + --sleep-before-teardown + Number of seconds to sleep before teardown. + Use with care, as this applies to all jobs in the + run. This option is used along with --limit one. + If the --limit ommitted then it's forced to 1. + If the --limit is greater than 4, then user must + confirm it interactively to avoid massive lock + of resources, however --non-interactive option + can be used to skip user input. + [default: 0] + --arch Override architecture defaults, for example, + aarch64, armv7l, x86_64. Normally this + argument should not be provided and the arch + is determined from --machine-type. + +Scheduler arguments: + --owner Job owner + -b , --queue-backend + Scheduler queue backend name + -e , --email + When tests finish or time out, send an email + here. May also be specified in ~/.teuthology.yaml + as 'results_email' + --expire Do not execute jobs in the run if they have not + completed by this time. Valid formats include + ISO 8601, and relative offsets like '90s', '30m', + '1h', '3d', or '1w' + --rocketchat Comma separated list of Rocket.Chat channels where + to send a message when tests finished or time out. + To be used with --sleep-before-teardown option. + -N , --num Number of times to run/queue the job + [default: 1] + -l , --limit Queue at most this many jobs + [default: 0] + --subset Instead of scheduling the entire suite, break the + set of jobs into pieces (each of which will + contain each facet at least once) and schedule + piece . Scheduling 0/, 1/, + 2/ ... -1/ will schedule all + jobs in the suite (many more than once). If specified, + this value can be found in results.log. 
+ -p , --priority + Job priority (lower is sooner) + [default: 1000] + --timeout How long, in seconds, to wait for jobs to finish + before sending email. This does not kill jobs. + [default: {default_results_timeout}] + --filter KEYWORDS Only run jobs whose description contains at least one + of the keywords in the comma separated keyword + string specified. + --filter-out KEYWORDS Do not run jobs whose description contains any of + the keywords in the comma separated keyword + string specified. + --filter-all KEYWORDS Only run jobs whose description contains each one + of the keywords in the comma separated keyword + string specified. + -F, --filter-fragments + Check yaml fragments too if job description + does not match the filters provided with + options --filter, --filter-out, and --filter-all. + [default: false] + --archive-upload RSYNC_DEST Rsync destination to upload archives. + --archive-upload-url URL Public facing URL where archives are uploaded. + --throttle SLEEP When scheduling, wait SLEEP seconds between jobs. + Useful to avoid bursts that may be too hard on + the underlying infrastructure or exceed OpenStack API + limits (server creation per minute for instance). + -r, --rerun Attempt to reschedule a run, selecting only those + jobs whose status are mentioned by + --rerun-status. + Note that this is implemented by scheduling an + entirely new suite and including only jobs whose + descriptions match the selected ones. It does so + using the same logic as --filter. + Of all the flags that were passed when scheduling + the original run, the resulting one will only + inherit the --suite value. Any other arguments + must be passed again while scheduling. By default, + 'seed' and 'subset' will be taken from results.log, + but can be overide if passed again. + This is important for tests involving random facet + (path ends with '$' operator). + -R, --rerun-statuses + A comma-separated list of statuses to be used + with --rerun. 
Supported statuses are: 'dead', + 'fail', 'pass', 'queued', 'running', 'waiting' + [default: fail,dead] + --seed SEED An random number mostly useful when used along + with --rerun argument to rerun the exact + same jobs that can only be picked at random. + This number can be found in the output of + teuthology-suite command or in results.log. + Pass -1 for a random seed [default: -1]. + --force-priority Skip the priority check. + --job-threshold Do not allow to schedule the run if the number + of jobs exceeds . Use 0 to allow + any number [default: {default_job_threshold}]. + --no-nested-subset Do not perform nested suite subsets [default: false]. + ++=================+=================================================================+ +| Priority | Explanation | ++=================+=================================================================+ +| N < 10 | Use this if the sky is falling and some group of tests | +| | must be run ASAP. | ++-----------------+-----------------------------------------------------------------+ +| 10 <= N < 50 | Use this if your tests are urgent and blocking other | +| | important development. | ++-----------------+-----------------------------------------------------------------+ +| 50 <= N < 75 | Use this if you are testing a particular feature/fix | +| | and running fewer than about 25 jobs. This range is also | +| | used for urgent release testing. | ++-----------------+-----------------------------------------------------------------+ +| 75 <= N < 100 | Tech Leads regularly schedule integration tests with this | +| | priority to verify pull requests against main. | ++-----------------+-----------------------------------------------------------------+ +| 100 <= N < 150 | This priority is used for QE validation of point releases. | ++-----------------+-----------------------------------------------------------------+ +| 150 <= N < 200 | Use this priority for 100 jobs or fewer that test a particular | +| | feature or fix. 
Results are available in about 24 hours. | ++-----------------+-----------------------------------------------------------------+ +| 200 <= N < 1000 | Use this priority for large test runs. Results are available | +| | in about a week. | ++-----------------+-----------------------------------------------------------------+ + +""".format( + default_machine_type=config.default_machine_type, + default_results_timeout=config.results_timeout, + default_ceph_repo=defaults('--ceph-repo', + config.get_ceph_git_url()), + default_suite_repo=defaults('--suite-repo', + config.get_ceph_qa_suite_git_url()), + default_ceph_branch=defaults('--ceph-branch', 'main'), + default_job_threshold=config.job_threshold, +) + + +def main(argv=sys.argv[1:]): + args = docopt.docopt(doc, argv=argv) + return teuthology.suite.main(args) diff --git a/scripts/supervisor.py b/scripts/supervisor.py new file mode 100644 index 000000000..7450473eb --- /dev/null +++ b/scripts/supervisor.py @@ -0,0 +1,44 @@ +import argparse +import sys + +import teuthology.dispatcher.supervisor + + +def parse_args(argv): + parser = argparse.ArgumentParser( + description="Supervise and run a teuthology job; normally only run by the dispatcher", + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="be more verbose", + ) + parser.add_argument( + "-a", + "--archive-dir", + type=str, + help="path in which to store the job's logfiles", + required=True, + ) + parser.add_argument( + "--bin-path", + type=str, + help="teuthology bin path", + required=True, + ) + parser.add_argument( + "--job-config", + type=str, + help="file descriptor of job's config file", + required=True, + ) + return parser.parse_args(argv) + + +def main(): + sys.exit(teuthology.dispatcher.supervisor.main(parse_args(sys.argv[1:]))) + + +if __name__ == "__main__": + main() diff --git a/scripts/test/script.py b/scripts/test/script.py new file mode 100644 index 000000000..fdabd1b55 --- /dev/null +++ b/scripts/test/script.py @@ -0,0 
+1,16 @@ +import subprocess +from pytest import raises + + +class Script(object): + script_name = 'teuthology' + + def test_help(self): + args = (self.script_name, '--help') + out = subprocess.check_output(args).decode() + assert out.startswith('usage') + + def test_invalid(self): + args = (self.script_name, '--invalid-option') + with raises(subprocess.CalledProcessError): + subprocess.check_call(args) diff --git a/scripts/test/test_dispatcher_.py b/scripts/test/test_dispatcher_.py new file mode 100644 index 000000000..4d201aae5 --- /dev/null +++ b/scripts/test/test_dispatcher_.py @@ -0,0 +1,5 @@ +from script import Script + + +class TestDispatcher(Script): + script_name = 'teuthology-dispatcher' diff --git a/scripts/test/test_exporter_.py b/scripts/test/test_exporter_.py new file mode 100644 index 000000000..b0611a337 --- /dev/null +++ b/scripts/test/test_exporter_.py @@ -0,0 +1,5 @@ +from script import Script + + +class TestExporter(Script): + script_name = 'teuthology-exporter' diff --git a/scripts/test/test_lock.py b/scripts/test/test_lock.py new file mode 100644 index 000000000..3fc803aae --- /dev/null +++ b/scripts/test/test_lock.py @@ -0,0 +1,5 @@ +from script import Script + + +class TestLock(Script): + script_name = 'teuthology-lock' diff --git a/scripts/test/test_ls.py b/scripts/test/test_ls.py new file mode 100644 index 000000000..d0e4d8145 --- /dev/null +++ b/scripts/test/test_ls.py @@ -0,0 +1,15 @@ +import docopt + +from script import Script +from scripts import ls + +doc = ls.__doc__ + + +class TestLs(Script): + script_name = 'teuthology-ls' + + def test_args(self): + args = docopt.docopt(doc, ["--verbose", "some/archive/dir"]) + assert args["--verbose"] + assert args[""] == "some/archive/dir" diff --git a/scripts/test/test_prune_logs.py b/scripts/test/test_prune_logs.py new file mode 100644 index 000000000..8e967522f --- /dev/null +++ b/scripts/test/test_prune_logs.py @@ -0,0 +1,5 @@ +from script import Script + + +class TestPruneLogs(Script): + 
script_name = 'teuthology-prune-logs' diff --git a/scripts/test/test_report.py b/scripts/test/test_report.py new file mode 100644 index 000000000..c8065fd1f --- /dev/null +++ b/scripts/test/test_report.py @@ -0,0 +1,5 @@ +from script import Script + + +class TestReport(Script): + script_name = 'teuthology-report' diff --git a/scripts/test/test_results.py b/scripts/test/test_results.py new file mode 100644 index 000000000..a97981cb6 --- /dev/null +++ b/scripts/test/test_results.py @@ -0,0 +1,5 @@ +from script import Script + + +class TestResults(Script): + script_name = 'teuthology-results' diff --git a/scripts/test/test_run.py b/scripts/test/test_run.py new file mode 100644 index 000000000..74fa1b926 --- /dev/null +++ b/scripts/test/test_run.py @@ -0,0 +1,45 @@ +import docopt + +from script import Script +from scripts import run + +doc = run.__doc__ + + +class TestRun(Script): + script_name = 'teuthology' + + def test_all_args(self): + args = docopt.docopt(doc, [ + "--verbose", + "--archive", "some/archive/dir", + "--description", "the_description", + "--owner", "the_owner", + "--lock", + "--machine-type", "machine_type", + "--os-type", "os_type", + "--os-version", "os_version", + "--block", + "--name", "the_name", + "--suite-path", "some/suite/dir", + "path/to/config.yml", + ]) + assert args["--verbose"] + assert args["--archive"] == "some/archive/dir" + assert args["--description"] == "the_description" + assert args["--owner"] == "the_owner" + assert args["--lock"] + assert args["--machine-type"] == "machine_type" + assert args["--os-type"] == "os_type" + assert args["--os-version"] == "os_version" + assert args["--block"] + assert args["--name"] == "the_name" + assert args["--suite-path"] == "some/suite/dir" + assert args[""] == ["path/to/config.yml"] + + def test_multiple_configs(self): + args = docopt.docopt(doc, [ + "config1.yml", + "config2.yml", + ]) + assert args[""] == ["config1.yml", "config2.yml"] diff --git a/scripts/test/test_schedule.py 
b/scripts/test/test_schedule.py new file mode 100644 index 000000000..e89f983a7 --- /dev/null +++ b/scripts/test/test_schedule.py @@ -0,0 +1,5 @@ +from script import Script + + +class TestSchedule(Script): + script_name = 'teuthology-schedule' diff --git a/scripts/test/test_suite.py b/scripts/test/test_suite.py new file mode 100644 index 000000000..062aba470 --- /dev/null +++ b/scripts/test/test_suite.py @@ -0,0 +1,5 @@ +from script import Script + + +class TestSuite(Script): + script_name = 'teuthology-suite' diff --git a/scripts/test/test_supervisor_.py b/scripts/test/test_supervisor_.py new file mode 100644 index 000000000..81298995c --- /dev/null +++ b/scripts/test/test_supervisor_.py @@ -0,0 +1,5 @@ +from script import Script + + +class TestSupervisor(Script): + script_name = 'teuthology-supervisor' diff --git a/scripts/test/test_updatekeys.py b/scripts/test/test_updatekeys.py new file mode 100644 index 000000000..c4122b0f7 --- /dev/null +++ b/scripts/test/test_updatekeys.py @@ -0,0 +1,21 @@ +from script import Script +import subprocess +from pytest import raises +from pytest import skip + + +class TestUpdatekeys(Script): + script_name = 'teuthology-updatekeys' + + def test_invalid(self): + skip("teuthology.lock needs to be partially refactored to allow" + + "teuthology-updatekeys to return nonzero in all erorr cases") + + def test_all_and_targets(self): + args = (self.script_name, '-a', '-t', 'foo') + with raises(subprocess.CalledProcessError): + subprocess.check_call(args) + + def test_no_args(self): + with raises(subprocess.CalledProcessError): + subprocess.check_call(self.script_name) diff --git a/scripts/update_inventory.py b/scripts/update_inventory.py new file mode 100644 index 000000000..014e3ccf5 --- /dev/null +++ b/scripts/update_inventory.py @@ -0,0 +1,41 @@ +import docopt + +import teuthology +import teuthology.lock +import teuthology.lock.ops +import teuthology.misc +import teuthology.orchestra.remote + +import logging + +doc = """ +usage: 
teuthology-update-inventory -h + teuthology-update-inventory [-v] [-m type] REMOTE [REMOTE ...] + +Update the given nodes' inventory information on the lock server + + + -h, --help show this help message and exit + -v, --verbose be more verbose + -m , --machine-type optionally specify a machine type when + submitting nodes for the first time + REMOTE hostnames of machines whose information to update + +""" + + +def main(): + args = docopt.docopt(doc) + if args['--verbose']: + teuthology.log.setLevel(logging.DEBUG) + + machine_type = args.get('--machine-type') + remotes = args.get('REMOTE') + for rem_name in remotes: + rem_name = teuthology.misc.canonicalize_hostname(rem_name) + remote = teuthology.orchestra.remote.Remote(rem_name) + remote.connect() + inventory_info = remote.inventory_info + if machine_type: + inventory_info['machine_type'] = machine_type + teuthology.lock.ops.update_inventory(inventory_info) diff --git a/scripts/updatekeys.py b/scripts/updatekeys.py new file mode 100644 index 000000000..394ae32bb --- /dev/null +++ b/scripts/updatekeys.py @@ -0,0 +1,31 @@ +import docopt +import sys + +import teuthology.lock +import teuthology.lock.cli + +doc = """ +usage: teuthology-updatekeys -h + teuthology-updatekeys [-v] -t + teuthology-updatekeys [-v] ... + teuthology-updatekeys [-v] -a + +Update any hostkeys that have changed. You can list specific machines to run +on, or use -a to check all of them automatically. 
+ +positional arguments: + MACHINES hosts to check for updated keys + +optional arguments: + -h, --help Show this help message and exit + -v, --verbose Be more verbose + -t , --targets + Input yaml containing targets to check + -a, --all Update hostkeys of all machines in the db +""" + + +def main(): + args = docopt.docopt(doc) + status = teuthology.lock.cli.updatekeys(args) + sys.exit(status) diff --git a/scripts/wait.py b/scripts/wait.py new file mode 100644 index 000000000..6b2ff34a9 --- /dev/null +++ b/scripts/wait.py @@ -0,0 +1,31 @@ +import docopt +import sys + +import logging + +import teuthology +import teuthology.suite +from teuthology.config import config + +doc = """ +usage: teuthology-wait --help + teuthology-wait [-v] --run + +Wait until run is finished. Returns exit code 0 on success, otherwise 1. + +Miscellaneous arguments: + -h, --help Show this help message and exit + -v, --verbose Be more verbose + +Standard arguments: + -r, --run Run name to watch. +""" + + +def main(argv=sys.argv[1:]): + args = docopt.docopt(doc, argv=argv) + if args.get('--verbose'): + teuthology.log.setLevel(logging.DEBUG) + name = args.get('--run') + return teuthology.suite.wait(name, config.max_job_time, None) + diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 000000000..463ebf6eb --- /dev/null +++ b/setup.cfg @@ -0,0 +1,153 @@ +[metadata] +name = teuthology +long_description = file: README.rst +long_description_content_type = text/x-rst +url = https://github.com/ceph/teuthology +author = Red Hat, Inc. 
+license = MIT +classifiers = + Intended Audience :: Developers + License :: OSI Approved :: MIT License + Natural Language :: English + Operating System :: POSIX :: Linux + Programming Language :: Python :: 3 + Programming Language :: Python :: 3 :: Only + Programming Language :: Python :: 3.10 + Programming Language :: Python :: 3.11 + Programming Language :: Python :: 3.12 + Programming Language :: Python :: Implementation :: CPython + Topic :: Software Development :: Quality Assurance + Topic :: Software Development :: Testing + Topic :: System :: Distributed Computing + Topic :: System :: Filesystems +description_content_type = text/x-rst; charset=UTF-8 +description_file = README.rst +keywords = teuthology, test, ceph, cluster +summary = Ceph test framework + +[options] +python_requires = >=3.10 +packages = find: +install_requires = + PyYAML + ansible-core==2.17.7 + apache-libcloud + backports.ssl-match-hostname + beanstalkc3>=0.4.0 + configobj + configparser + docopt + gevent<25 + httplib2 + humanfriendly + lupa + lxml + ndg-httpsclient + netaddr + openstacksdk # teuthology-openstack dependencies + paramiko + pexpect + pip-tools + prettytable + prometheus_client>=0.16.0 + psutil>=2.1.0 + pyasn1 + pynacl>=1.5.0 + pyopenssl>=0.13 + python-dateutil + requests>2.13.0 + sentry-sdk + types-psutil + urllib3>=1.25.4,<1.27 # For botocore +scripts = + teuthology/task/install/bin/adjust-ulimits + teuthology/task/install/bin/daemon-helper + teuthology/task/install/bin/stdin-killer + +[options.entry_points] +console_scripts = + teuthology = scripts.run:main + teuthology-openstack = scripts.openstack:main + teuthology-suite = scripts.suite:main + teuthology-ls = scripts.ls:main + teuthology-worker = scripts.worker:main + teuthology-lock = scripts.lock:main + teuthology-schedule = scripts.schedule:main + teuthology-updatekeys = scripts.updatekeys:main + teuthology-update-inventory = scripts.update_inventory:main + teuthology-results = scripts.results:main + 
teuthology-report = scripts.report:main + teuthology-kill = scripts.kill:main + teuthology-queue = scripts.queue:main + teuthology-prune-logs = scripts.prune_logs:main + teuthology-describe = scripts.describe:main + teuthology-reimage = scripts.reimage:main + teuthology-dispatcher = scripts.dispatcher:main + teuthology-wait = scripts.wait:main + teuthology-exporter = scripts.exporter:main + teuthology-node-cleanup = scripts.node_cleanup:main + teuthology-supervisor = scripts.supervisor:main + +[options.extras_require] +manhole = + manhole +rocketchat = + rocket-python>=1.2.15 +sentry = + sentry-sdk +test = + PyJWT + boto>=2.0b4 + boto3 + coverage + ipy + mock + pynose + pytest + pytest-cov + toml + tox + xmltodict +# libcloud openstack dependencies +openstack = + python-openstackclient + python-novaclient + +[options.package_data] +teuthology.openstack = + archive-key + archive-key.pub + openstack-centos-6.5-user-data.txt + openstack-centos-7.0-user-data.txt + openstack-centos-7.1-user-data.txt + openstack-centos-7.2-user-data.txt + openstack-debian-8.0-user-data.txt + openstack-opensuse-42.1-user-data.txt + openstack-teuthology.cron + openstack-teuthology.init + openstack-ubuntu-12.04-user-data.txt + openstack-ubuntu-14.04-user-data.txt + openstack-user-data.txt + openstack.yaml + setup-openstack.sh +teuthology.suite = + fragment-merge.lua +teuthology.task.install = + bin/adjust-ulimits + bin/daemon-helper + bin/stdin-killer +teuthology.task.internal = + edit_sudoers.sh + +[options.packages.find] +exclude = + teuthology.test + teuthology.test.* + teuthology.lock.test + teuthology.task.tests + teuthology.openstack.test + teuthology.orchestra.test + teuthology.orchestra.test.* + +[flake8] +max-line-length = 100 diff --git a/systemd/teuthology-dispatcher@.service b/systemd/teuthology-dispatcher@.service new file mode 100644 index 000000000..43a1ec1db --- /dev/null +++ b/systemd/teuthology-dispatcher@.service @@ -0,0 +1,18 @@ +[Unit] +Description=Teuthology Dispatcher 
+ +Wants=ceph.target +After=ceph.target + +[Service] +Type=simple +User=teuthworker +ExecStart=/home/teuthworker/src/git.ceph.com_git_teuthology_main/virtualenv/bin/python3 \ + /home/teuthworker/src/git.ceph.com_git_teuthology_main/virtualenv/bin/teuthology-dispatcher \ + -v \ + --archive-dir /home/teuthworker/archive \ + --tube %i \ + --log-dir /home/teuthworker/archive/worker_logs +ExecStop=touch /tmp/teuthology-stop-dispatcher +Restart=on-failure +TimeoutStopSec=infinity diff --git a/systemd/teuthology-exporter.service b/systemd/teuthology-exporter.service new file mode 100644 index 000000000..15b951aaf --- /dev/null +++ b/systemd/teuthology-exporter.service @@ -0,0 +1,12 @@ +[Unit] +Description=Teuthology Exporter + +Wants=ceph.target +After=ceph.target + +[Service] +Type=simple +User=teuthworker +ExecStart=/home/teuthworker/src/git.ceph.com_git_teuthology_main/virtualenv/bin/teuthology-exporter +Restart=on-failure +TimeoutStopSec=60 diff --git a/teuthology/__init__.py b/teuthology/__init__.py new file mode 100644 index 000000000..0142d44cb --- /dev/null +++ b/teuthology/__init__.py @@ -0,0 +1,116 @@ +import os, sys +try: + import importlib.metadata as importlib_metadata +except ImportError: + import importlib_metadata + +__version__ = importlib_metadata.version("teuthology") + +# Tell gevent not to patch os.waitpid() since it is susceptible to race +# conditions. 
See: +# http://www.gevent.org/gevent.monkey.html#gevent.monkey.patch_os +os.environ['GEVENT_NOWAITPID'] = 'true' + +# Use manhole to give us a way to debug hung processes +# https://pypi.python.org/pypi/manhole +try: + import manhole + manhole.install( + verbose=False, + # Listen for SIGUSR1 + oneshot_on="USR1" + ) +except ImportError: + pass +from gevent import monkey +patch_threads=True +for arg in sys.argv: + if "teuthology_api" in arg: + patch_threads=False +monkey.patch_all( + dns=False, + # Don't patch subprocess to avoid http://tracker.ceph.com/issues/14990 + subprocess=False, + thread=patch_threads, +) +import sys +from gevent.hub import Hub + +# Don't write pyc files +sys.dont_write_bytecode = True + +from teuthology.orchestra import monkey +monkey.patch_all() + +import logging + +# If we are running inside a virtualenv, ensure we have its 'bin' directory in +# our PATH. This doesn't happen automatically if scripts are called without +# first activating the virtualenv. +exec_dir = os.path.abspath(os.path.dirname(sys.argv[0])) +if os.path.split(exec_dir)[-1] == 'bin' and exec_dir not in os.environ['PATH']: + os.environ['PATH'] = ':'.join((exec_dir, os.environ['PATH'])) + +# We don't need to see log entries for each connection opened +logging.getLogger('requests.packages.urllib3.connectionpool').setLevel( + logging.WARN) +# if requests doesn't bundle it, shut it up anyway +logging.getLogger('urllib3.connectionpool').setLevel( + logging.WARN) +# We also don't need the "Converted retries value" messages +logging.getLogger('urllib3.util.retry').setLevel( + logging.WARN) + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s.%(msecs)03d %(levelname)s:%(name)s:%(message)s') +log = logging.getLogger(__name__) + +log.debug('teuthology version: %s', __version__) + + +def setup_log_file(log_path): + root_logger = logging.getLogger() + handlers = root_logger.handlers + for handler in handlers: + if isinstance(handler, logging.FileHandler) and \ + 
handler.stream.name == log_path: + log.debug("Already logging to %s; not adding new handler", + log_path) + return + formatter = logging.Formatter( + fmt=u'%(asctime)s.%(msecs)03d %(levelname)s:%(name)s:%(message)s', + datefmt='%Y-%m-%dT%H:%M:%S') + handler = logging.FileHandler(filename=log_path) + handler.setFormatter(formatter) + root_logger.addHandler(handler) + root_logger.info('teuthology version: %s', __version__) + + +def install_except_hook(): + """ + Install an exception hook that first logs any uncaught exception, then + raises it. + """ + def log_exception(exc_type, exc_value, exc_traceback): + if not issubclass(exc_type, KeyboardInterrupt): + log.critical("Uncaught exception", exc_info=(exc_type, exc_value, + exc_traceback)) + sys.__excepthook__(exc_type, exc_value, exc_traceback) + sys.excepthook = log_exception + + +def patch_gevent_hub_error_handler(): + Hub._origin_handle_error = Hub.handle_error + + def custom_handle_error(self, context, type, value, tb): + if context is None or issubclass(type, Hub.SYSTEM_ERROR): + self.handle_system_error(type, value) + elif issubclass(type, Hub.NOT_ERROR): + pass + else: + log.error("Uncaught exception (Hub)", exc_info=(type, value, tb)) + + Hub.handle_error = custom_handle_error + +patch_gevent_hub_error_handler() diff --git a/teuthology/beanstalk.py b/teuthology/beanstalk.py new file mode 100644 index 000000000..76bc2c97a --- /dev/null +++ b/teuthology/beanstalk.py @@ -0,0 +1,215 @@ +import beanstalkc +import json +import yaml +import logging +import pprint +import sys +from collections import OrderedDict + +from teuthology import report +from teuthology.config import config + +log = logging.getLogger(__name__) + + +def connect(): + host = config.queue_host + port = config.queue_port + if host is None or port is None: + raise RuntimeError( + 'Beanstalk queue information not found in {conf_path}'.format( + conf_path=config.teuthology_yaml)) + return beanstalkc.Connection(host=host, port=port, 
parse_yaml=yaml.safe_load) + + +def watch_tube(connection, tube_name): + """ + Watch a given tube, potentially correcting to 'multi' if necessary. Returns + the tube_name that was actually used. + """ + if ',' in tube_name: + log.debug("Correcting tube name to 'multi'") + tube_name = 'multi' + connection.watch(tube_name) + connection.ignore('default') + return tube_name + + +def walk_jobs(connection, tube_name, processor, pattern=None): + """ + def callback(jobs_dict) + """ + log.info("Checking Beanstalk Queue...") + job_count = connection.stats_tube(tube_name)['current-jobs-ready'] + if job_count == 0: + log.info('No jobs in Beanstalk Queue') + return + + # Try to figure out a sane timeout based on how many jobs are in the queue + timeout = job_count / 2000.0 * 60 + for i in range(1, job_count + 1): + print_progress(i, job_count, "Loading") + job = connection.reserve(timeout=timeout) + if job is None or job.body is None: + continue + job_config = yaml.safe_load(job.body) + job_name = job_config['name'] + job_id = job.stats()['id'] + if pattern is not None and pattern not in job_name: + continue + processor.add_job(job_id, job_config, job) + end_progress() + processor.complete() + + +def print_progress(index, total, message=None): + msg = "{m} ".format(m=message) if message else '' + sys.stderr.write("{msg}{i}/{total}\r".format( + msg=msg, i=index, total=total)) + sys.stderr.flush() + + +def end_progress(): + sys.stderr.write('\n') + sys.stderr.flush() + + +class JobProcessor(object): + def __init__(self): + self.jobs = OrderedDict() + + def add_job(self, job_id, job_config, job_obj=None): + job_id = str(job_id) + + job_dict = dict( + index=(len(self.jobs) + 1), + job_config=job_config, + ) + if job_obj: + job_dict['job_obj'] = job_obj + self.jobs[job_id] = job_dict + + self.process_job(job_id) + + def process_job(self, job_id): + pass + + def complete(self): + pass + + +class JobPrinter(JobProcessor): + def __init__(self, show_desc=False, full=False): + 
super(JobPrinter, self).__init__() + self.show_desc = show_desc + self.full = full + + def process_job(self, job_id): + job_config = self.jobs[job_id]['job_config'] + job_index = self.jobs[job_id]['index'] + job_priority = job_config['priority'] + job_name = job_config['name'] + job_desc = job_config['description'] + print('Job: {i:>4} priority: {pri:>4} {job_name}/{job_id}'.format( + i=job_index, + pri=job_priority, + job_id=job_id, + job_name=job_name, + )) + if self.full: + pprint.pprint(job_config) + elif job_desc and self.show_desc: + for desc in job_desc.split(): + print('\t {}'.format(desc)) + + +class RunPrinter(JobProcessor): + def __init__(self): + super(RunPrinter, self).__init__() + self.runs = list() + + def process_job(self, job_id): + run = self.jobs[job_id]['job_config']['name'] + if run not in self.runs: + self.runs.append(run) + print(run) + + +class JobDeleter(JobProcessor): + def __init__(self, pattern): + self.pattern = pattern + super(JobDeleter, self).__init__() + + def add_job(self, job_id, job_config, job_obj=None): + job_name = job_config['name'] + if self.pattern in job_name: + super(JobDeleter, self).add_job(job_id, job_config, job_obj) + + def process_job(self, job_id): + job_config = self.jobs[job_id]['job_config'] + job_name = job_config['name'] + print('Deleting {job_name}/{job_id}'.format( + job_id=job_id, + job_name=job_name, + )) + job_obj = self.jobs[job_id].get('job_obj') + if job_obj: + job_obj.delete() + report.try_delete_jobs(job_name, job_id) + + +def pause_tube(connection, tube, duration): + duration = int(duration) + if not tube: + tubes = sorted(connection.tubes()) + else: + tubes = [tube] + + prefix = 'Unpausing' if duration == 0 else "Pausing for {dur}s" + templ = prefix + ": {tubes}" + log.info(templ.format(dur=duration, tubes=tubes)) + for tube in tubes: + connection.pause_tube(tube, duration) + + +def stats_tube(connection, tube): + stats = connection.stats_tube(tube) + result = dict( + name=tube, + 
count=stats['current-jobs-ready'], + paused=(stats['pause'] != 0), + ) + return result + + +def main(args): + machine_type = args['--machine_type'] + status = args['--status'] + delete = args['--delete'] + runs = args['--runs'] + show_desc = args['--description'] + full = args['--full'] + pause_duration = args['--pause'] + try: + connection = connect() + if machine_type and not pause_duration: + # watch_tube needs to be run before we inspect individual jobs; + # it is not needed for pausing tubes + watch_tube(connection, machine_type) + if status: + print(json.dumps(stats_tube(connection, machine_type))) + elif pause_duration: + pause_tube(connection, machine_type, pause_duration) + elif delete: + walk_jobs(connection, machine_type, + JobDeleter(delete)) + elif runs: + walk_jobs(connection, machine_type, + RunPrinter()) + else: + walk_jobs(connection, machine_type, + JobPrinter(show_desc=show_desc, full=full)) + except KeyboardInterrupt: + log.info("Interrupted.") + finally: + connection.close() diff --git a/teuthology/ceph.conf.template b/teuthology/ceph.conf.template new file mode 100644 index 000000000..bdf92863d --- /dev/null +++ b/teuthology/ceph.conf.template @@ -0,0 +1,101 @@ +# XXX +# +# DO NOT MODIFY THIS FILE +# +# This file is a legacy ceph.conf template used only when testing older +# releases of Ceph (pre-Nautilus). 
The new template exists in ceph.git at +# qa/tasks/ceph.conf.template +# +# XXX + +[global] + chdir = "" + pid file = /var/run/ceph/$cluster-$name.pid + auth supported = cephx + + filestore xattr use omap = true + + mon clock drift allowed = 1.000 + + osd crush chooseleaf type = 0 + auth debug = true + + ms die on old message = true + + mon pg warn min per osd = 1 + mon pg warn max per osd = 10000 # <= luminous + mon max pg per osd = 10000 # >= luminous + mon pg warn max object skew = 0 + + osd pool default size = 2 + + mon osd allow primary affinity = true + mon osd allow pg remap = true + mon warn on legacy crush tunables = false + mon warn on crush straw calc version zero = false + mon warn on no sortbitwise = false + mon warn on osd down out interval zero = false + + osd pool default erasure code profile = "plugin=jerasure technique=reed_sol_van k=2 m=1 ruleset-failure-domain=osd crush-failure-domain=osd" + + osd default data pool replay window = 5 + + mon allow pool delete = true + + mon cluster log file level = debug + debug asserts on shutdown = true + +[osd] + osd journal size = 100 + + osd scrub load threshold = 5.0 + osd scrub max interval = 600 + + osd recover clone overlap = true + osd recovery max chunk = 1048576 + + osd debug shutdown = true + osd debug op order = true + osd debug verify stray on activate = true + + osd open classes on start = true + osd debug pg log writeout = true + + osd deep scrub update digest min age = 30 + + osd map max advance = 10 + + journal zero on create = true + + filestore ondisk finisher threads = 3 + filestore apply finisher threads = 3 + + bdev debug aio = true + osd debug misdirected ops = true + +[mgr] + debug ms = 1 + debug mgr = 20 + debug mon = 20 + debug auth = 20 + mon reweight min pgs per osd = 4 + mon reweight min bytes per osd = 10 + +[mon] + debug ms = 1 + debug mon = 20 + debug paxos = 20 + debug auth = 20 + mon data avail warn = 5 + mon mgr mkfs grace = 120 + mon reweight min pgs per osd = 4 + mon osd 
reporter subtree level = osd + mon osd prime pg temp = true + mon reweight min bytes per osd = 10 + +[client] + rgw cache enabled = true + rgw enable ops log = true + rgw enable usage log = true + log file = /var/log/ceph/$cluster-$name.$pid.log + admin socket = /var/run/ceph/$cluster-$name.$pid.asok diff --git a/teuthology/config.py b/teuthology/config.py new file mode 100644 index 000000000..241b10213 --- /dev/null +++ b/teuthology/config.py @@ -0,0 +1,315 @@ +import os +import yaml +import logging +try: + from collections.abc import MutableMapping +except ImportError: + from collections import MutableMapping + + +# Configuration constants +SYSTEM_CONFIG_PATH = '/etc/teuthology.yaml' +USER_CONFIG_PATH = '~/.teuthology.yaml' +CONFIG_PATH_VAR_NAME = 'TEUTHOLOGY_CONFIG' # name of env var to check + + +def init_logging(): + log = logging.getLogger(__name__) + return log + +log = init_logging() + + +class YamlConfig(MutableMapping): + """ + A configuration object populated by parsing a yaml file, with optional + default values. + + Note that modifying the _defaults attribute of an instance can potentially + yield confusing results; if you need to do modify defaults, use the class + variable or create a subclass. 
+ """ + _defaults = dict() + + def __init__(self, yaml_path=None): + self.yaml_path = yaml_path + if self.yaml_path: + self.load() + else: + self._conf = dict() + + def load(self, conf=None): + if conf is not None: + if isinstance(conf, dict): + self._conf = conf + return + elif conf: + self._conf = yaml.safe_load(conf) + return + if os.path.exists(self.yaml_path): + with open(self.yaml_path) as f: + self._conf = yaml.safe_load(f) + else: + log.debug("%s not found", self.yaml_path) + self._conf = dict() + + def update(self, in_dict): + """ + Update an existing configuration using dict.update() + + :param in_dict: The dict to use to update + """ + self._conf.update(in_dict) + + @classmethod + def from_dict(cls, in_dict): + """ + Build a config object from a dict. + + :param in_dict: The dict to use + :returns: The config object + """ + conf_obj = cls() + conf_obj._conf = in_dict + return conf_obj + + def to_dict(self): + """ + :returns: A shallow copy of the configuration as a dict + """ + return dict(self._conf) + + @classmethod + def from_str(cls, in_str): + """ + Build a config object from a string or yaml stream. 
+ + :param in_str: The stream or string + :returns: The config object + """ + conf_obj = cls() + conf_obj._conf = yaml.safe_load(in_str) + return conf_obj + + def to_str(self): + """ + :returns: str(self) + """ + return str(self) + + def get(self, key, default=None): + return self._conf.get(key, default) + + def __str__(self): + return yaml.safe_dump(self._conf, default_flow_style=False).strip() + + def __repr__(self): + return self.__str__() + + def __getitem__(self, name): + return self.__getattr__(name) + + def __getattr__(self, name): + return self._conf.get(name, self._defaults.get(name)) + + def __contains__(self, name): + return self._conf.__contains__(name) + + def __setattr__(self, name, value): + if name.endswith('_conf') or name in ('yaml_path'): + object.__setattr__(self, name, value) + else: + self._conf[name] = value + + def __delattr__(self, name): + del self._conf[name] + + def __len__(self): + return self._conf.__len__() + + def __iter__(self): + return self._conf.__iter__() + + def __setitem__(self, name, value): + self._conf.__setitem__(name, value) + + def __delitem__(self, name): + self._conf.__delitem__(name) + + +class TeuthologyConfig(YamlConfig): + """ + This class is intended to unify teuthology's many configuration files and + objects. Currently it serves as a convenient interface to + ~/.teuthology.yaml or equivalent. 
+ """ + yaml_path = USER_CONFIG_PATH # yaml_path is updated in _get_config_path + _defaults = { + 'archive_base': '/home/teuthworker/archive', + 'archive_upload': None, + 'archive_upload_key': None, + 'archive_upload_url': None, + 'automated_scheduling': False, + 'reserve_machines': 5, + 'ceph_git_base_url': 'https://github.com/ceph/', + 'ceph_git_url': None, + 'ceph_qa_suite_git_url': None, + 'ceph_cm_ansible_git_url': None, + 'teuthology_git_url': None, + 'use_conserver': False, + 'conserver_master': 'conserver.front.sepia.ceph.com', + 'conserver_port': 3109, + 'gitbuilder_host': 'gitbuilder.ceph.com', + 'githelper_base_url': 'http://githelper.ceph.com', + 'check_package_signatures': True, + 'job_threshold': 500, + 'lab_domain': 'front.sepia.ceph.com', + 'lock_server': 'http://paddles.front.sepia.ceph.com/', + 'max_job_age': 1209600, # 2 weeks + 'max_job_time': 259200, # 3 days + 'nsupdate_url': 'http://nsupdate.front.sepia.ceph.com/update', + 'results_server': 'http://paddles.front.sepia.ceph.com/', + 'results_ui_server': 'http://pulpito.ceph.com/', + 'results_sending_email': 'teuthology', + 'results_timeout': 43200, + 'src_base_path': os.path.expanduser('~/src'), + 'verify_host_keys': True, + 'watchdog_interval': 120, + 'fog_reimage_timeout': 1800, + 'fog_wait_for_ssh_timeout': 600, + 'kojihub_url': 'http://koji.fedoraproject.org/kojihub', + 'kojiroot_url': 'http://kojipkgs.fedoraproject.org/packages', + 'koji_task_url': 'https://kojipkgs.fedoraproject.org/work/', + 'baseurl_template': 'http://{host}/{proj}-{pkg_type}-{dist}-{arch}-{flavor}/{uri}', + 'use_shaman': True, + 'shaman_host': 'shaman.ceph.com', + 'teuthology_path': None, + 'suite_verify_ceph_hash': True, + 'suite_allow_missing_packages': False, + 'openstack': { + 'clone': 'git clone http://github.com/ceph/teuthology', + 'user-data': 'teuthology/openstack/openstack-{os_type}-{os_version}-user-data.txt', + 'ip': '1.1.1.1', + 'machine': { + 'disk': 20, + 'ram': 8000, + 'cpus': 1, + }, + 'volumes': { + 
'count': 0, + 'size': 1, + }, + }, + 'rocketchat': None, + 'sleep_before_teardown': 0, + 'ssh_key': None, + 'active_machine_types': [], + } + + def __init__(self, yaml_path=None): + super(TeuthologyConfig, self).__init__(yaml_path or self.yaml_path) + + def get_ceph_cm_ansible_git_url(self): + return (self.ceph_cm_ansible_git_url or + self.ceph_git_base_url + 'ceph-cm-ansible.git') + + def get_ceph_qa_suite_git_url(self): + return (self.ceph_qa_suite_git_url or + self.get_ceph_git_url()) + + def get_ceph_git_url(self): + return (self.ceph_git_url or + self.ceph_git_base_url + 'ceph-ci.git') + + def get_teuthology_git_url(self): + return (self.teuthology_git_url or + self.ceph_git_base_url + 'teuthology.git') + + +class JobConfig(YamlConfig): + pass + + +class FakeNamespace(YamlConfig): + """ + This class is meant to behave like a argparse Namespace + + We'll use this as a stop-gap as we refactor commands but allow the tasks + to still be passed a single namespace object for the time being. + """ + def __init__(self, config_dict=None): + if not config_dict: + config_dict = dict() + self._conf = self._clean_config(config_dict) + set_config_attr(self) + + def _clean_config(self, config_dict): + """ + Makes sure that the keys of config_dict are able to be used. For + example the "--" prefix of a docopt dict isn't valid and won't populate + correctly. + """ + result = dict() + for key, value in config_dict.items(): + new_key = key + if new_key.startswith("--"): + new_key = new_key[2:] + elif new_key.startswith("<") and new_key.endswith(">"): + new_key = new_key[1:-1] + + if "-" in new_key: + new_key = new_key.replace("-", "_") + + result[new_key] = value + + return result + + def __getattr__(self, name): + """ + We need to modify this for FakeNamespace so that getattr() will + work correctly on a FakeNamespace instance. 
+ """ + if name in self._conf: + return self._conf[name] + elif name in self._defaults: + return self._defaults[name] + raise AttributeError(name) + + def __setattr__(self, name, value): + if name == 'teuthology_config': + object.__setattr__(self, name, value) + else: + super(FakeNamespace, self).__setattr__(name, value) + + def __repr__(self): + return repr(self._conf) + + def __str__(self): + return str(self._conf) + + +def set_config_attr(obj): + """ + Set obj.teuthology_config, mimicking the old behavior of misc.read_config + """ + obj.teuthology_config = config + + +def _get_config_path(): + """Look for a teuthology config yaml and return it's path. + Raises ValueError if no config yaml can be found. + """ + paths = [ + os.path.join(os.path.expanduser(USER_CONFIG_PATH)), + SYSTEM_CONFIG_PATH, + ] + if CONFIG_PATH_VAR_NAME in os.environ: + paths.insert(0, os.path.expanduser(os.environ[CONFIG_PATH_VAR_NAME])) + for path in paths: + if os.path.exists(path): + return path + log.warning(f"no teuthology config found, looked for: {paths}") + return None + + +config = TeuthologyConfig(yaml_path=_get_config_path()) diff --git a/teuthology/contextutil.py b/teuthology/contextutil.py new file mode 100644 index 000000000..8e53e5439 --- /dev/null +++ b/teuthology/contextutil.py @@ -0,0 +1,149 @@ +import contextlib +import sys +import logging +import time + +from teuthology.config import config +from teuthology.exceptions import MaxWhileTries + + +log = logging.getLogger(__name__) + +@contextlib.contextmanager +def nested(*managers): + """ + Like contextlib.nested but takes callables returning context + managers, to avoid the major reason why contextlib.nested was + deprecated. + + This version also logs any exceptions early, much like run_tasks, + to ease debugging. TODO combine nested and run_tasks. 
+ """ + exits = [] + vars = [] + exc = (None, None, None) + try: + for mgr_fn in managers: + mgr = mgr_fn() + exit = mgr.__exit__ + enter = mgr.__enter__ + vars.append(enter()) + exits.append(exit) + yield vars + except Exception: + log.exception('Saw exception from nested tasks') + exc = sys.exc_info() + # FIXME this needs to be more generic + if config.ctx and config.ctx.config.get('interactive-on-error'): + config.ctx.config['interactive-on-error'] = False + from teuthology.task import interactive + log.warning('Saw failure, going into interactive mode...') + interactive.task(ctx=config.ctx, config=None) + finally: + while exits: + exit = exits.pop() + try: + if exit(*exc): + exc = (None, None, None) + except Exception: + exc = sys.exc_info() + if exc != (None, None, None): + # Don't rely on sys.exc_info() still containing + # the right information. Another exception may + # have been raised and caught by an exit method + raise exc[1] + + +class safe_while(object): + """ + A context manager to remove boiler plate code that deals with `while` loops + that need a given number of tries or total timeout and some seconds to sleep + between each one of those tries. + + The most simple example possible will try 10 times sleeping for 6 seconds: + + >>> from teuthology.contexutil import safe_while + >>> with safe_while() as proceed: + ... while proceed(): + ... # repetitive code here + ... print("hello world") + ... + Traceback (most recent call last): + ... + MaxWhileTries: reached maximum tries (5) after waiting for 75 seconds + + Yes, this adds yet another level of indentation but it allows you to + implement while loops exactly the same as before with just 1 more + indentation level and one extra call. Everything else stays the same, + code-wise. So adding this helper to existing code is simpler. + + :param sleep: The amount of time to sleep between tries. Default 6 + :param increment: The amount to add to the sleep value on each try. + Default 0. 
+ :param tries: The amount of tries before giving up. Default 10. + :param timeout: Total seconds to try for, overrides the tries parameter + if specified. Default 0. + :param action: The name of the action being attempted. Default none. + :param _raise: Whether to raise an exception (or log a warning). + Default True. + :param _sleeper: The function to use to sleep. Only used for testing. + Default time.sleep + """ + + def __init__(self, sleep=6, increment=0, tries=10, timeout=0, action=None, + _raise=True, _sleeper=None): + self.sleep = sleep + self.increment = increment + self.tries = tries + self.timeout = timeout + self.counter = 0 + self.sleep_current = sleep + self.action = action + self._raise = _raise + self.sleeper = _sleeper or time.sleep + self.total_seconds = sleep + + def _make_error_msg(self): + """ + Sum the total number of seconds we waited while providing the number + of tries we attempted + """ + msg = 'reached maximum tries ({tries})' + \ + ' after waiting for {total} seconds' + if self.action: + msg = "'{action}' " + msg + + msg = msg.format( + action=self.action, + tries=self.counter - 1, + total=self.total_seconds, + ) + return msg + + def __call__(self): + self.counter += 1 + if self.counter == 1: + return True + def must_stop(): + return self.tries > 0 and self.counter > self.tries + if ((self.timeout > 0 and + self.total_seconds >= self.timeout) or + (self.timeout == 0 and must_stop())): + error_msg = self._make_error_msg() + if self._raise: + raise MaxWhileTries(error_msg) + else: + log.warning(error_msg) + return False + self.sleep_current += self.increment + if self.timeout > 0: + self.sleep_current = min(self.timeout - self.total_seconds, self.sleep_current) + self.total_seconds += self.sleep_current + self.sleeper(self.sleep_current) + return True + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + return False diff --git a/teuthology/describe_tests.py b/teuthology/describe_tests.py new file 
mode 100644 index 000000000..c0054adc1 --- /dev/null +++ b/teuthology/describe_tests.py @@ -0,0 +1,360 @@ +# -*- coding: utf-8 -*- + +import csv +import json +from prettytable import PrettyTable, FRAME, ALL +import os +import sys +import yaml + +import random +from distutils.util import strtobool + +from teuthology.exceptions import ParseError +from teuthology.suite.build_matrix import \ + build_matrix, generate_combinations, _get_matrix +from teuthology.suite import util, merge + +def main(args): + try: + describe_tests(args) + except ParseError: + sys.exit(1) + + +def describe_tests(args): + suite_dir = os.path.abspath(args[""]) + output_format = args['--format'] + + conf=dict() + rename_args = { + 'filter': 'filter_in', + } + for (key, value) in args.items(): + key = key.lstrip('--').replace('-', '_') + key = rename_args.get(key) or key + if key in ('filter_all', 'filter_in', 'filter_out', 'fields'): + if not value: + value = [] + else: + value = [_ for _ in + (x.strip() for x in value.split(',')) if _] + elif key in ('limit'): + value = int(value) + elif key in ('seed'): + value = int(value) + if value < 0: + value = None + elif key == 'subset' and value is not None: + # take input string '2/3' and turn into (2, 3) + value = tuple(map(int, value.split('/'))) + elif key in ('show_facet'): + value = strtobool(value) + conf[key] = value + + if args['--combinations']: + headers, rows = get_combinations(suite_dir, + limit=conf['limit'], + seed=conf['seed'], + subset=conf['subset'], + no_nested_subset=conf['no_nested_subset'], + fields=conf['fields'], + filter_in=conf['filter_in'], + filter_out=conf['filter_out'], + filter_all=conf['filter_all'], + filter_fragments=conf['filter_fragments'], + include_facet=conf['show_facet']) + hrule = ALL + elif args['--summary']: + output_summary(suite_dir, + limit=conf['limit'], + seed=conf['seed'], + subset=conf['subset'], + no_nested_subset=conf['no_nested_subset'], + show_desc=conf['print_description'], + 
show_frag=conf['print_fragments'], + filter_in=conf['filter_in'], + filter_out=conf['filter_out'], + filter_all=conf['filter_all'], + filter_fragments=conf['filter_fragments']) + exit(0) + else: + headers, rows = describe_suite(suite_dir, conf['fields'], conf['show_facet'], + output_format) + hrule = FRAME + + output_results(headers, rows, output_format, hrule) + + +def output_results(headers, rows, output_format, hrule): + """ + Write the headers and rows given in the specified output format to + stdout. + """ + if output_format == 'json': + objects = [{k: v for k, v in zip(headers, row) if v} + for row in rows] + print(json.dumps(dict(headers=headers, data=objects))) + elif output_format == 'csv': + writer = csv.writer(sys.stdout) + writer.writerows([headers] + rows) + else: + table = PrettyTable(headers) + table.align = 'l' + table.vrules = ALL + table.hrules = hrule + for row in rows: + table.add_row(row) + print(table) + + +def output_summary(path, limit=0, + seed=None, + subset=None, + no_nested_subset=None, + show_desc=True, + show_frag=False, + show_matrix=False, + filter_in=None, + filter_out=None, + filter_all=None, + filter_fragments=True): + """ + Prints number of all facets for a given suite for inspection, + taking into accout such options like --subset, --filter, + --filter-out and --filter-all. Optionally dumps matrix objects, + yaml files which is used for generating combinations. 
+ """ + + random.seed(seed) + mat, first, matlimit = _get_matrix(path, subset=subset, no_nested_subset=no_nested_subset) + configs = generate_combinations(path, mat, first, matlimit) + count = 0 + total = len(configs) + suite = os.path.basename(path) + configs = merge.config_merge(configs, + suite_name=suite, + filter_in=filter_in, + filter_out=filter_out, + filter_all=filter_all, + filter_fragments=filter_fragments, + seed=seed) + for c in configs: + if limit and count >= limit: + break + count += 1 + if show_desc or show_frag: + print("{}".format(c[0])) + if show_frag: + for path in c[1]: + print(" {}".format(util.strip_fragment_path(path))) + if show_matrix: + print(mat.tostr(1)) + print("# {}/{} {}".format(count, total, path)) + +def get_combinations(suite_dir, + limit=0, + seed=None, + subset=None, + no_nested_subset=False, + fields=[], + filter_in=None, + filter_out=None, + filter_all=None, + filter_fragments=False, + include_facet=True): + """ + Describes the combinations of a suite, optionally limiting + or filtering output based on the given parameters. Includes + columns for the subsuite and facets when include_facet is True. + + Returns a tuple of (headers, rows) where both elements are lists + of strings. 
+ """ + suite = os.path.basename(suite_dir) + configs = build_matrix(suite_dir, subset=subset, no_nested_subset=no_nested_subset, seed=seed) + + num_listed = 0 + rows = [] + + facet_headers = set() + dirs = {} + max_dir_depth = 0 + + configs = merge.config_merge(configs, + suite_name=suite, + filter_in=filter_in, + filter_out=filter_out, + filter_all=filter_all, + filter_fragments=filter_fragments, + seed=seed) + for _, fragment_paths, __ in configs: + if limit > 0 and num_listed >= limit: + break + + fragment_fields = [extract_info(path, fields) + for path in fragment_paths] + + # merge fields from multiple fragments by joining their values with \n + metadata = {} + for fragment_meta in fragment_fields: + for field, value in fragment_meta.items(): + if value == '': + continue + if field in metadata: + metadata[field] += '\n' + str(value) + else: + metadata[field] = str(value) + + if include_facet: + # map final dir (facet) -> filename without the .yaml suffix + for path in fragment_paths: + facet_dir = os.path.dirname(path) + facet = os.path.basename(facet_dir) + metadata[facet] = os.path.basename(path)[:-5] + facet_headers.add(facet) + facet_dirs = facet_dir.split('/')[:-1] + for i, dir_ in enumerate(facet_dirs): + if i not in dirs: + dirs[i] = set() + dirs[i].add(dir_) + metadata['_dir_' + str(i)] = os.path.basename(dir_) + max_dir_depth = max(max_dir_depth, i) + + rows.append(metadata) + num_listed += 1 + + subsuite_headers = [] + if include_facet: + first_subsuite_depth = max_dir_depth + for i in range(max_dir_depth): + if len(dirs[i]) > 1: + first_subsuite_depth = i + break + + subsuite_headers = ['subsuite depth ' + str(i) + for i in + range(0, max_dir_depth - first_subsuite_depth + 1)] + + for row in rows: + for i in range(first_subsuite_depth, max_dir_depth + 1): + row[subsuite_headers[i - first_subsuite_depth]] = \ + row.get('_dir_' + str(i), '') + + headers = subsuite_headers + sorted(facet_headers) + fields + return headers, sorted([[row.get(field, '') 
for field in headers] + for row in rows]) + + +def describe_suite(suite_dir, fields, include_facet, output_format): + """ + Describe a suite listing each subdirectory and file once as a + separate row. + + Returns a tuple of (headers, rows) where both elements are lists + of strings. + + """ + rows = tree_with_info(suite_dir, fields, include_facet, '', [], + output_format=output_format) + + headers = ['path'] + if include_facet: + headers.append('facet') + return headers + fields, rows + + +def extract_info(file_name, fields): + """ + Read a yaml file and return a dictionary mapping the fields to the + values of those fields in the file. + + The returned dictionary will always contain all the provided + fields, mapping any non-existent ones to ''. + + Assumes fields are set in a format of: + + {'meta': [{'field' : value, 'field2' : value2}] + + or in yaml: + + meta: + - field: value + field2: value2 + + If 'meta' is present but not in this format, prints an error + message and raises ParseError. + """ + empty_result = {f: '' for f in fields} + if os.path.isdir(file_name) or not file_name.endswith('.yaml'): + return empty_result + + with open(file_name, 'r') as f: + parsed = yaml.safe_load(f) + + if not isinstance(parsed, dict): + return empty_result + + meta = parsed.get('meta', [{}]) + if not (isinstance(meta, list) and + len(meta) == 1 and + isinstance(meta[0], dict)): + print('Error in meta format in %s' % file_name) + print('Meta must be a list containing exactly one dict.') + print('Meta is: %s' % meta) + raise ParseError() + + return {field: meta[0].get(field, '') for field in fields} + + +def path_relative_to_suites(path): + """ + Attempt to trim the ceph-qa-suite root directory from the beginning + of a path. 
+ """ + try: + root = os.path.join('ceph-qa-suite', 'suites') + return path[path.index(root) + len(root):] + except ValueError: + return path + + +def tree_with_info(cur_dir, fields, include_facet, prefix, rows, + output_format='plain'): + """ + Gather fields from all files and directories in cur_dir. + Returns a list of strings for each path containing: + + 1) the path relative to ceph-qa-suite/suites (or the basename with + a /usr/bin/tree-like prefix if output_format is plain) + 2) the facet containing the path (if include_facet is True) + 3) the values of the provided fields in the path ('' is used for + missing values) in the same order as the provided fields + """ + files = sorted(os.listdir(cur_dir)) + has_yamls = any([x.endswith('.yaml') for x in files]) + facet = os.path.basename(cur_dir) if has_yamls else '' + for i, f in enumerate(files): + # skip any hidden files + if f.startswith('.'): + continue + path = os.path.join(cur_dir, f) + if i == len(files) - 1: + file_pad = '└── ' + dir_pad = ' ' + else: + file_pad = '├── ' + dir_pad = '│ ' + info = extract_info(path, fields) + tree_node = prefix + file_pad + f + if output_format != 'plain': + tree_node = path_relative_to_suites(path) + meta = [info[f] for f in fields] + row = [tree_node] + if include_facet: + row.append(facet) + rows.append(row + meta) + if os.path.isdir(path): + tree_with_info(path, fields, include_facet, + prefix + dir_pad, rows, output_format) + return rows diff --git a/teuthology/dispatcher/__init__.py b/teuthology/dispatcher/__init__.py new file mode 100644 index 000000000..59f8ae327 --- /dev/null +++ b/teuthology/dispatcher/__init__.py @@ -0,0 +1,365 @@ +import datetime +import logging +import os +import psutil +import subprocess +import sys +import yaml + +from typing import Dict, List + +from teuthology import ( + # non-modules + setup_log_file, + install_except_hook, + # modules + beanstalk, + exporter, + report, + repo_utils, +) +from teuthology.config import config as 
teuth_config +from teuthology.dispatcher import supervisor +from teuthology.exceptions import BranchNotFoundError, CommitNotFoundError, SkipJob, MaxWhileTries +from teuthology.lock import ops as lock_ops +from teuthology.util.time import parse_timestamp +from teuthology import safepath + +log = logging.getLogger(__name__) +start_time = datetime.datetime.now(datetime.timezone.utc) +restart_file_path = '/tmp/teuthology-restart-dispatcher' +stop_file_path = '/tmp/teuthology-stop-dispatcher' + + +def sentinel(path): + if not os.path.exists(path): + return False + file_mtime = datetime.datetime.fromtimestamp( + os.path.getmtime(path), + datetime.timezone.utc, + ) + return file_mtime > start_time + + +def restart(log=log): + log.info('Restarting...') + args = sys.argv[:] + args.insert(0, sys.executable) + os.execv(sys.executable, args) + + +def stop(): + log.info('Stopping...') + sys.exit(0) + + +def load_config(archive_dir=None): + teuth_config.load() + if archive_dir is not None: + if not os.path.isdir(archive_dir): + sys.exit("{prog}: archive directory must exist: {path}".format( + prog=os.path.basename(sys.argv[0]), + path=archive_dir, + )) + else: + teuth_config.archive_base = archive_dir + + +def main(args): + archive_dir = args.archive_dir or teuth_config.archive_base + + # Refuse to start more than one dispatcher per machine type + procs = find_dispatcher_processes().get(args.tube) + if procs: + raise RuntimeError( + "There is already a teuthology-dispatcher process running:" + f" {procs}" + ) + + # setup logging for disoatcher in {log_dir} + loglevel = logging.INFO + if args.verbose: + loglevel = logging.DEBUG + logging.getLogger().setLevel(loglevel) + log.setLevel(loglevel) + log_file_path = os.path.join(args.log_dir, f"dispatcher.{args.tube}.{os.getpid()}") + setup_log_file(log_file_path) + install_except_hook() + + load_config(archive_dir=archive_dir) + + connection = beanstalk.connect() + beanstalk.watch_tube(connection, args.tube) + result_proc = None + + 
if teuth_config.teuthology_path is None: + repo_utils.fetch_teuthology('main') + repo_utils.fetch_qa_suite('main') + + keep_running = True + job_procs = set() + worst_returncode = 0 + while keep_running: + # Check to see if we have a teuthology-results process hanging around + # and if so, read its return code so that it can exit. + if result_proc is not None and result_proc.poll() is not None: + log.debug("teuthology-results exited with code: %s", + result_proc.returncode) + result_proc = None + + if sentinel(restart_file_path): + restart() + elif sentinel(stop_file_path): + stop() + + load_config() + for proc in list(job_procs): + rc = proc.poll() + if rc is not None: + worst_returncode = max([worst_returncode, rc]) + job_procs.remove(proc) + job = connection.reserve(timeout=60) + if job is None: + if args.exit_on_empty_queue and not job_procs: + log.info("Queue is empty and no supervisor processes running; exiting!") + break + continue + + # bury the job so it won't be re-run if it fails + job.bury() + job_id = job.jid + log.info('Reserved job %d', job_id) + log.info('Config is: %s', job.body) + job_config = yaml.safe_load(job.body) + job_config['job_id'] = str(job_id) + + if job_config.get('stop_worker'): + keep_running = False + + try: + job_config, teuth_bin_path = prep_job( + job_config, + log_file_path, + archive_dir, + ) + except SkipJob: + continue + + # lock machines but do not reimage them + if 'roles' in job_config: + job_config = lock_machines(job_config) + + run_args = [ + os.path.join(teuth_bin_path, 'teuthology-supervisor'), + '-v', + '--bin-path', teuth_bin_path, + '--archive-dir', archive_dir, + ] + + # Create run archive directory if not already created and + # job's archive directory + create_job_archive(job_config['name'], + job_config['archive_path'], + archive_dir) + job_config_path = os.path.join(job_config['archive_path'], 'orig.config.yaml') + + # Write initial job config in job archive dir + with open(job_config_path, 'w') as f: + 
yaml.safe_dump(job_config, f, default_flow_style=False) + + run_args.extend(["--job-config", job_config_path]) + + try: + job_proc = subprocess.Popen( + run_args, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + job_procs.add(job_proc) + log.info('Job supervisor PID: %s', job_proc.pid) + except Exception: + error_message = "Saw error while trying to spawn supervisor." + log.exception(error_message) + if 'targets' in job_config: + node_names = job_config["targets"].keys() + lock_ops.unlock_safe( + node_names, + job_config["owner"], + job_config["name"], + job_config["job_id"] + ) + report.try_push_job_info(job_config, dict( + status='fail', + failure_reason=error_message)) + + # This try/except block is to keep the worker from dying when + # beanstalkc throws a SocketError + try: + job.delete() + except Exception: + log.exception("Saw exception while trying to delete job") + + return worst_returncode + + +def find_dispatcher_processes() -> Dict[str, List[psutil.Process]]: + def match(proc): + try: + cmdline = proc.cmdline() + except psutil.AccessDenied: + return False + except psutil.ZombieProcess: + return False + if len(cmdline) < 3: + return False + if not cmdline[1].endswith("/teuthology-dispatcher"): + return False + if cmdline[2] == "--supervisor": + return False + if "--tube" not in cmdline: + return False + if proc.pid == os.getpid(): + return False + return True + + procs = {} + attrs = ["pid", "cmdline"] + for proc in psutil.process_iter(attrs=attrs): + if not match(proc): + continue + cmdline = proc.cmdline() + machine_type = cmdline[cmdline.index("--tube") + 1] + procs.setdefault(machine_type, []).append(proc) + return procs + + +def prep_job(job_config, log_file_path, archive_dir): + job_id = job_config['job_id'] + check_job_expiration(job_config) + + safe_archive = safepath.munge(job_config['name']) + job_config['worker_log'] = log_file_path + archive_path_full = os.path.join( + archive_dir, safe_archive, str(job_id)) + 
job_config['archive_path'] = archive_path_full + + # If the teuthology branch was not specified, default to main and + # store that value. + teuthology_branch = job_config.get('teuthology_branch', 'main') + job_config['teuthology_branch'] = teuthology_branch + teuthology_sha1 = job_config.get('teuthology_sha1') + if not teuthology_sha1: + repo_url = repo_utils.build_git_url('teuthology', 'ceph') + try: + teuthology_sha1 = repo_utils.ls_remote(repo_url, teuthology_branch) + except Exception as exc: + log.exception(f"Could not get teuthology sha1 for branch {teuthology_branch}") + report.try_push_job_info( + job_config, + dict(status='dead', failure_reason=str(exc)) + ) + raise SkipJob() + if not teuthology_sha1: + reason = "Teuthology branch {} not found; marking job as dead".format(teuthology_branch) + log.error(reason) + report.try_push_job_info( + job_config, + dict(status='dead', failure_reason=reason) + ) + raise SkipJob() + if teuth_config.teuthology_path is None: + log.info('Using teuthology sha1 %s', teuthology_sha1) + + try: + if teuth_config.teuthology_path is not None: + teuth_path = teuth_config.teuthology_path + else: + teuth_path = repo_utils.fetch_teuthology(branch=teuthology_branch, + commit=teuthology_sha1) + # For the teuthology tasks, we look for suite_branch, and if we + # don't get that, we look for branch, and fall back to 'main'. + # last-in-suite jobs don't have suite_branch or branch set. 
+ ceph_branch = job_config.get('branch', 'main') + suite_branch = job_config.get('suite_branch', ceph_branch) + suite_sha1 = job_config.get('suite_sha1') + suite_repo = job_config.get('suite_repo') + if suite_repo: + teuth_config.ceph_qa_suite_git_url = suite_repo + job_config['suite_path'] = os.path.normpath(os.path.join( + repo_utils.fetch_qa_suite(suite_branch, suite_sha1), + job_config.get('suite_relpath', ''), + )) + except (BranchNotFoundError, CommitNotFoundError) as exc: + log.exception("Requested version not found; marking job as dead") + report.try_push_job_info( + job_config, + dict(status='dead', failure_reason=str(exc)) + ) + raise SkipJob() + except MaxWhileTries as exc: + log.exception("Failed to fetch or bootstrap; marking job as dead") + report.try_push_job_info( + job_config, + dict(status='dead', failure_reason=str(exc)) + ) + raise SkipJob() + + teuth_bin_path = os.path.join(teuth_path, 'virtualenv', 'bin') + if not os.path.isdir(teuth_bin_path): + raise RuntimeError("teuthology branch %s at %s not bootstrapped!" % + (teuthology_branch, teuth_bin_path)) + return job_config, teuth_bin_path + + +def check_job_expiration(job_config): + job_id = job_config['job_id'] + expired = False + now = datetime.datetime.now(datetime.timezone.utc) + if expire_str := job_config.get('timestamp'): + expire = parse_timestamp(expire_str) + \ + datetime.timedelta(seconds=teuth_config.max_job_age) + expired = expire < now + if not expired and (expire_str := job_config.get('expire')): + try: + expire = parse_timestamp(expire_str) + expired = expired or expire < now + except ValueError: + log.warning(f"Failed to parse job expiration: {expire_str=}") + pass + if expired: + log.info(f"Skipping job {job_id} because it is expired: {expire_str} is in the past") + report.try_push_job_info( + job_config, + # TODO: Add a 'canceled' status to paddles, and use that. 
+ dict(status='dead'), + ) + raise SkipJob() + + +def lock_machines(job_config): + report.try_push_job_info(job_config, dict(status='running')) + fake_ctx = supervisor.create_fake_context(job_config, block=True) + machine_type = job_config["machine_type"] + count = len(job_config['roles']) + with exporter.NodeLockingTime().time( + machine_type=machine_type, + count=count, + ): + lock_ops.block_and_lock_machines( + fake_ctx, + count, + machine_type, + tries=-1, + reimage=False, + ) + job_config = fake_ctx.config + return job_config + + +def create_job_archive(job_name, job_archive_path, archive_dir): + log.info('Creating job\'s archive dir %s', job_archive_path) + safe_archive = safepath.munge(job_name) + run_archive = os.path.join(archive_dir, safe_archive) + if not os.path.exists(run_archive): + safepath.makedirs('/', run_archive) + safepath.makedirs('/', job_archive_path) diff --git a/teuthology/dispatcher/supervisor.py b/teuthology/dispatcher/supervisor.py new file mode 100644 index 000000000..b89c39ac5 --- /dev/null +++ b/teuthology/dispatcher/supervisor.py @@ -0,0 +1,377 @@ +import datetime +import logging +import os +import subprocess +import time +import yaml +import requests + +from urllib.parse import urljoin + +from teuthology import exporter, dispatcher, kill, report, safepath +from teuthology.config import config as teuth_config +from teuthology.exceptions import SkipJob, MaxWhileTries +from teuthology import setup_log_file, install_except_hook +from teuthology.misc import get_user, archive_logs, compress_logs +from teuthology.config import FakeNamespace +from teuthology.lock import ops as lock_ops +from teuthology.task import internal +from teuthology.misc import decanonicalize_hostname as shortname +from teuthology.lock import query +from teuthology.util import sentry + +log = logging.getLogger(__name__) + + +def main(args): + with open(args.job_config, 'r') as config_file: + job_config = yaml.safe_load(config_file) + + loglevel = logging.INFO + if 
args.verbose: + loglevel = logging.DEBUG + logging.getLogger().setLevel(loglevel) + log.setLevel(loglevel) + + log_file_path = os.path.join(job_config['archive_path'], + f"supervisor.{job_config['job_id']}.log") + setup_log_file(log_file_path) + install_except_hook() + try: + dispatcher.check_job_expiration(job_config) + except SkipJob: + return 0 + + # reimage target machines before running the job + if 'targets' in job_config: + node_count = len(job_config["targets"]) + # If a job (e.g. from the nop suite) doesn't need nodes, avoid + # submitting a zero here. + if node_count: + with exporter.NodeReimagingTime().time( + machine_type=job_config["machine_type"], + node_count=node_count, + ): + reimage(job_config) + else: + reimage(job_config) + with open(args.job_config, 'w') as f: + yaml.safe_dump(job_config, f, default_flow_style=False) + + suite = job_config.get("suite") + if suite: + with exporter.JobTime().time(suite=suite): + return run_job( + job_config, + args.bin_path, + args.archive_dir, + args.verbose + ) + else: + return run_job( + job_config, + args.bin_path, + args.archive_dir, + args.verbose + ) + + +def run_job(job_config, teuth_bin_path, archive_dir, verbose): + safe_archive = safepath.munge(job_config['name']) + if job_config.get('first_in_suite') or job_config.get('last_in_suite'): + job_archive = os.path.join(archive_dir, safe_archive) + args = [ + os.path.join(teuth_bin_path, 'teuthology-results'), + '--archive-dir', job_archive, + '--name', job_config['name'], + ] + if job_config.get('first_in_suite'): + log.info('Generating memo for %s', job_config['name']) + if job_config.get('seed'): + args.extend(['--seed', job_config['seed']]) + if job_config.get('subset'): + args.extend(['--subset', job_config['subset']]) + if job_config.get('no_nested_subset'): + args.extend(['--no-nested-subset']) + else: + log.info('Generating results for %s', job_config['name']) + timeout = job_config.get('results_timeout', + teuth_config.results_timeout) + 
args.extend(['--timeout', str(timeout)]) + if job_config.get('email'): + args.extend(['--email', job_config['email']]) + # Execute teuthology-results, passing 'preexec_fn=os.setpgrp' to + # make sure that it will continue to run if this worker process + # dies (e.g. because of a restart) + result_proc = subprocess.Popen(args=args, preexec_fn=os.setpgrp) + log.info("teuthology-results PID: %s", result_proc.pid) + # Remove unnecessary logs for first and last jobs in run + log.info('Deleting job\'s archive dir %s', job_config['archive_path']) + for f in os.listdir(job_config['archive_path']): + os.remove(os.path.join(job_config['archive_path'], f)) + os.rmdir(job_config['archive_path']) + return + + log.info('Running job %s', job_config['job_id']) + + arg = [ + os.path.join(teuth_bin_path, 'teuthology'), + ] + # The following is for compatibility with older schedulers, from before we + # started merging the contents of job_config['config'] into job_config + # itself. + if 'config' in job_config: + inner_config = job_config.pop('config') + if not isinstance(inner_config, dict): + log.warning("run_job: job_config['config'] isn't a dict, it's a %s", + str(type(inner_config))) + else: + job_config.update(inner_config) + + if verbose or job_config['verbose']: + arg.append('-v') + + arg.extend([ + '--owner', job_config['owner'], + '--archive', job_config['archive_path'], + '--name', job_config['name'], + ]) + if job_config['description'] is not None: + arg.extend(['--description', job_config['description']]) + job_archive = os.path.join(job_config['archive_path'], 'orig.config.yaml') + arg.extend(['--', job_archive]) + + log.debug("Running: %s" % ' '.join(arg)) + p = subprocess.Popen( + args=arg, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + log.info("Job archive: %s", job_config['archive_path']) + log.info("Job PID: %s", str(p.pid)) + + if teuth_config.results_server: + log.info("Running with watchdog") + try: + run_with_watchdog(p, job_config) + except 
Exception: + log.exception("run_with_watchdog had an unhandled exception") + raise + else: + log.info("Running without watchdog") + # This sleep() is to give the child time to start up and create the + # archive dir. + time.sleep(5) + p.wait() + + if p.returncode != 0: + log.error('Child exited with code %d', p.returncode) + else: + log.info('Success!') + if 'targets' in job_config: + unlock_targets(job_config) + return p.returncode + +def failure_is_reimage(failure_reason): + if not failure_reason: + return False + reimage_failure = "Error reimaging machines:" + if reimage_failure in failure_reason: + return True + else: + return False + + +def check_for_reimage_failures_and_mark_down(targets, count=10): + # Grab paddles history of jobs in the machine + # and count the number of reimaging errors + # if it fails N times then mark the machine down + base_url = teuth_config.results_server + for k, _ in targets.items(): + machine = k.split('@')[-1] + url = urljoin( + base_url, + '/nodes/{0}/jobs/?count={1}'.format(machine, count) + ) + resp = requests.get(url) + jobs = resp.json() + if len(jobs) < count: + continue + reimage_failures = list(filter( + lambda j: failure_is_reimage(j['failure_reason']), + jobs + )) + if len(reimage_failures) < count: + continue + # Mark machine down + machine_name = shortname(k) + lock_ops.update_lock( + machine_name, + description='reimage failed {0} times'.format(count), + status='down', + ) + log.error( + 'Reimage failed {0} times ... marking machine down'.format(count) + ) + + +def reimage(job_config): + # Reimage the targets specified in job config + # and update their keys in config after reimaging + ctx = create_fake_context(job_config) + # change the status during the reimaging process + report.try_push_job_info(ctx.config, dict(status='waiting')) + targets = job_config['targets'] + try: + reimaged = lock_ops.reimage_machines(ctx, targets, job_config['machine_type']) + except Exception as e: + log.exception('Reimaging error. 
Nuking machines...') + unlock_targets(job_config) + # Reimage failures should map to the 'dead' status instead of 'fail' + report.try_push_job_info( + ctx.config, + dict(status='dead', failure_reason='Error reimaging machines: ' + str(e)) + ) + # There isn't an actual task called "reimage", but it doesn't seem + # necessary to create a whole new Sentry tag for this. + ctx.summary = { + 'sentry_event': sentry.report_error(job_config, e, task_name="reimage") + } + # Machine that fails to reimage after 10 times will be marked down + check_for_reimage_failures_and_mark_down(targets) + raise + ctx.config['targets'] = reimaged + # change the status to running after the reimaging process + report.try_push_job_info(ctx.config, dict(status='running')) + + +def unlock_targets(job_config): + """ + Unlock machines only if locked and description matches. + + :param job_config: dict, job config data + """ + machine_statuses = query.get_statuses(job_config['targets'].keys()) + locked = [] + for status in machine_statuses: + name = shortname(status['name']) + description = status['description'] + if not status['locked']: + continue + if description != job_config['archive_path']: + log.warning( + "Was going to unlock %s but it was locked by another job: %s", + name, description + ) + continue + locked.append(name) + if not locked: + return + if job_config.get("unlock_on_failure", True): + log.info('Unlocking machines...') + lock_ops.unlock_safe(locked, job_config["owner"], job_config["name"], job_config["job_id"]) + + +def run_with_watchdog(process, job_config): + job_start_time = datetime.datetime.now(datetime.timezone.utc) + + # Only push the information that's relevant to the watchdog, to save db + # load + job_info = dict( + name=job_config['name'], + job_id=job_config['job_id'], + ) + + # Sleep once outside of the loop to avoid double-posting jobs + time.sleep(teuth_config.watchdog_interval) + hit_max_timeout = False + while process.poll() is None: + # Kill jobs that have been 
running longer than the global max + run_time = datetime.datetime.now(datetime.timezone.utc) - job_start_time + total_seconds = run_time.days * 60 * 60 * 24 + run_time.seconds + if total_seconds > teuth_config.max_job_time: + hit_max_timeout = True + log.warning("Job ran longer than {max}s. Killing...".format( + max=teuth_config.max_job_time)) + try: + # kill processes but do not unlock yet so we can save + # the logs, coredumps, etc. + kill.kill_job( + job_info['name'], job_info['job_id'], + teuth_config.archive_base, job_config['owner'], + skip_unlock=True + ) + except Exception: + log.exception('Failed to kill job') + + try: + transfer_archives(job_info['name'], job_info['job_id'], + teuth_config.archive_base, job_config) + except Exception: + log.exception('Could not save logs') + + try: + # this time remove everything and unlock the machines + kill.kill_job( + job_info['name'], job_info['job_id'], + teuth_config.archive_base, job_config['owner'] + ) + except Exception: + log.exception('Failed to kill job and unlock machines') + + # calling this without a status just updates the jobs updated time + try: + report.try_push_job_info(job_info) + except MaxWhileTries: + log.exception("Failed to report job status; ignoring") + time.sleep(teuth_config.watchdog_interval) + + # we no longer support testing theses old branches + assert(job_config.get('teuthology_branch') not in ('argonaut', 'bobtail', + 'cuttlefish', 'dumpling')) + + # Let's make sure that paddles knows the job is finished. We don't know + # the status, but if it was a pass or fail it will have already been + # reported to paddles. In that case paddles ignores the 'dead' status. + # If the job was killed, paddles will use the 'dead' status. 
+ extra_info = dict(status='dead') + if hit_max_timeout: + extra_info['failure_reason'] = 'hit max job timeout' + if not (job_config.get('first_in_suite') or job_config.get('last_in_suite')): + report.try_push_job_info(job_info, extra_info) + + +def create_fake_context(job_config, block=False): + owner = job_config.get('owner', get_user()) + os_version = job_config.get('os_version', None) + + ctx_args = { + 'config': job_config, + 'block': block, + 'owner': owner, + 'archive': job_config['archive_path'], + 'machine_type': job_config['machine_type'], + 'os_type': job_config.get('os_type', 'ubuntu'), + 'os_version': os_version, + 'name': job_config['name'], + 'job_id': job_config['job_id'], + } + + return FakeNamespace(ctx_args) + + +def transfer_archives(run_name, job_id, archive_base, job_config): + serializer = report.ResultsSerializer(archive_base) + job_info = serializer.job_info(run_name, job_id, simple=True) + + if 'archive' in job_info: + ctx = create_fake_context(job_config) + internal.add_remotes(ctx, job_config) + + for log_type, log_path in job_info['archive'].items(): + if log_type == 'init': + log_type = '' + compress_logs(ctx, log_path) + archive_logs(ctx, log_path, log_type) + else: + log.info('No archives to transfer.') diff --git a/teuthology/dispatcher/test/test_dispatcher.py b/teuthology/dispatcher/test/test_dispatcher.py new file mode 100644 index 000000000..58f58cf9c --- /dev/null +++ b/teuthology/dispatcher/test/test_dispatcher.py @@ -0,0 +1,203 @@ +import datetime +import os +import pytest + +from unittest.mock import patch, Mock, MagicMock + +from teuthology import dispatcher +from teuthology.config import FakeNamespace +from teuthology.contextutil import MaxWhileTries +from teuthology.util.time import TIMESTAMP_FMT + + +class TestDispatcher(object): + @pytest.fixture(autouse=True) + def setup_method(self, tmp_path): + self.ctx = FakeNamespace() + self.ctx.verbose = True + self.ctx.archive_dir = str(tmp_path / "archive/dir") + 
self.ctx.log_dir = str(tmp_path / "log/dir") + self.ctx.tube = 'tube' + + @patch("os.path.exists") + def test_restart_file_path_doesnt_exist(self, m_exists): + m_exists.return_value = False + result = dispatcher.sentinel(dispatcher.restart_file_path) + assert not result + + @patch("os.path.getmtime") + @patch("os.path.exists") + def test_needs_restart(self, m_exists, m_getmtime): + m_exists.return_value = True + now = datetime.datetime.now(datetime.timezone.utc) + m_getmtime.return_value = (now + datetime.timedelta(days=1)).timestamp() + assert dispatcher.sentinel(dispatcher.restart_file_path) + + @patch("os.path.getmtime") + @patch("os.path.exists") + def test_does_not_need_restart(self, m_exists, m_getmtime): + m_exists.return_value = True + now = datetime.datetime.now(datetime.timezone.utc) + m_getmtime.return_value = (now - datetime.timedelta(days=1)).timestamp() + assert not dispatcher.sentinel(dispatcher.restart_file_path) + + @patch("teuthology.repo_utils.ls_remote") + @patch("os.path.isdir") + @patch("teuthology.repo_utils.fetch_teuthology") + @patch("teuthology.dispatcher.teuth_config") + @patch("teuthology.repo_utils.fetch_qa_suite") + def test_prep_job(self, m_fetch_qa_suite, m_teuth_config, + m_fetch_teuthology, m_isdir, m_ls_remote): + config = dict( + name="the_name", + job_id="1", + suite_sha1="suite_hash", + ) + m_fetch_teuthology.return_value = '/teuth/path' + m_fetch_qa_suite.return_value = '/suite/path' + m_ls_remote.return_value = 'teuth_hash' + m_isdir.return_value = True + m_teuth_config.teuthology_path = None + got_config, teuth_bin_path = dispatcher.prep_job( + config, + self.ctx.log_dir, + self.ctx.archive_dir, + ) + assert got_config['worker_log'] == self.ctx.log_dir + assert got_config['archive_path'] == os.path.join( + self.ctx.archive_dir, + config['name'], + config['job_id'], + ) + assert got_config['teuthology_branch'] == 'main' + m_fetch_teuthology.assert_called_once_with(branch='main', commit='teuth_hash') + assert teuth_bin_path == 
'/teuth/path/virtualenv/bin' + m_fetch_qa_suite.assert_called_once_with('main', 'suite_hash') + assert got_config['suite_path'] == '/suite/path' + + def build_fake_jobs(self, m_connection, m_job, job_bodies): + """ + Given patched copies of: + beanstalkc.Connection + beanstalkc.Job + And a list of basic job bodies, return a list of mocked Job objects + """ + # Make sure instantiating m_job returns a new object each time + jobs = [] + job_id = 0 + for job_body in job_bodies: + job_id += 1 + job = MagicMock(conn=m_connection, jid=job_id, body=job_body) + job.jid = job_id + job.body = job_body + jobs.append(job) + return jobs + + @patch("teuthology.dispatcher.find_dispatcher_processes") + @patch("teuthology.repo_utils.ls_remote") + @patch("teuthology.dispatcher.report.try_push_job_info") + @patch("teuthology.dispatcher.supervisor.run_job") + @patch("beanstalkc.Job", autospec=True) + @patch("teuthology.repo_utils.fetch_qa_suite") + @patch("teuthology.repo_utils.fetch_teuthology") + @patch("teuthology.dispatcher.beanstalk.watch_tube") + @patch("teuthology.dispatcher.beanstalk.connect") + @patch("os.path.isdir", return_value=True) + @patch("teuthology.dispatcher.setup_log_file") + def test_main_loop( + self, m_setup_log_file, m_isdir, m_connect, m_watch_tube, + m_fetch_teuthology, m_fetch_qa_suite, m_job, m_run_job, + m_try_push_job_info, m_ls_remote, m_find_dispatcher_processes, + ): + m_find_dispatcher_processes.return_value = {} + m_connection = Mock() + jobs = self.build_fake_jobs( + m_connection, + m_job, + [ + 'name: name\nfoo: bar', + 'name: name\nstop_worker: true', + ], + ) + m_connection.reserve.side_effect = jobs + m_connect.return_value = m_connection + dispatcher.main(self.ctx) + # There should be one reserve call per item in the jobs list + expected_reserve_calls = [ + dict(timeout=60) for i in range(len(jobs)) + ] + got_reserve_calls = [ + call[1] for call in m_connection.reserve.call_args_list + ] + assert got_reserve_calls == expected_reserve_calls + for 
job in jobs: + job.bury.assert_called_once_with() + job.delete.assert_called_once_with() + + @patch("teuthology.dispatcher.find_dispatcher_processes") + @patch("teuthology.repo_utils.ls_remote") + @patch("teuthology.dispatcher.report.try_push_job_info") + @patch("teuthology.dispatcher.supervisor.run_job") + @patch("beanstalkc.Job", autospec=True) + @patch("teuthology.repo_utils.fetch_qa_suite") + @patch("teuthology.repo_utils.fetch_teuthology") + @patch("teuthology.dispatcher.beanstalk.watch_tube") + @patch("teuthology.dispatcher.beanstalk.connect") + @patch("os.path.isdir", return_value=True) + @patch("teuthology.dispatcher.setup_log_file") + def test_main_loop_13925( + self, m_setup_log_file, m_isdir, m_connect, m_watch_tube, + m_fetch_teuthology, m_fetch_qa_suite, m_job, m_run_job, + m_try_push_job_info, m_ls_remote, m_find_dispatcher_processes, + ): + m_find_dispatcher_processes.return_value = {} + m_connection = Mock() + jobs = self.build_fake_jobs( + m_connection, + m_job, + [ + 'name: name', + 'name: name\nstop_worker: true', + ], + ) + m_connection.reserve.side_effect = jobs + m_connect.return_value = m_connection + m_fetch_qa_suite.side_effect = [ + '/suite/path', + MaxWhileTries(), + MaxWhileTries(), + ] + dispatcher.main(self.ctx) + assert len(m_run_job.call_args_list) == 0 + assert len(m_try_push_job_info.call_args_list) == len(jobs) + for i in range(len(jobs)): + push_call = m_try_push_job_info.call_args_list[i] + assert push_call[0][1]['status'] == 'dead' + + @pytest.mark.parametrize( + ["timestamp", "expire", "skip"], + [ + [datetime.timedelta(days=-1), None, False], + [datetime.timedelta(days=-30), None, True], + [None, datetime.timedelta(days=1), False], + [None, datetime.timedelta(days=-1), True], + [datetime.timedelta(days=-1), datetime.timedelta(days=1), False], + [datetime.timedelta(days=1), datetime.timedelta(days=-1), True], + ] + ) + @patch("teuthology.dispatcher.report.try_push_job_info") + def test_check_job_expiration(self, _, timestamp, 
expire, skip): + now = datetime.datetime.now(datetime.timezone.utc) + job_config = dict( + job_id="1", + name="job_name", + ) + if timestamp: + job_config["timestamp"] = (now + timestamp).strftime(TIMESTAMP_FMT) + if expire: + job_config["expire"] = (now + expire).strftime(TIMESTAMP_FMT) + if skip: + with pytest.raises(dispatcher.SkipJob): + dispatcher.check_job_expiration(job_config) + else: + dispatcher.check_job_expiration(job_config) diff --git a/teuthology/dispatcher/test/test_reimage_error_mark_machine_down.py b/teuthology/dispatcher/test/test_reimage_error_mark_machine_down.py new file mode 100644 index 000000000..63ec8ebbe --- /dev/null +++ b/teuthology/dispatcher/test/test_reimage_error_mark_machine_down.py @@ -0,0 +1,104 @@ +from teuthology.dispatcher import supervisor +from unittest.mock import patch + +class TestCheckReImageFailureMarkDown(object): + def setup_method(self): + self.the_function = supervisor.check_for_reimage_failures_and_mark_down + + def create_n_out_of_10_reimage_failed_jobs(self, n): + ret_list = [] + for i in range(n): + obj1 = { + "failure_reason":"Error reimaging machines: Manually raised error" + } + ret_list.append(obj1) + for j in range(10-n): + obj2 = {"failure_reason":"Error something else: dummy"} + ret_list.append(obj2) + return ret_list + + @patch('teuthology.dispatcher.supervisor.shortname') + @patch('teuthology.lock.ops.update_lock') + @patch('teuthology.dispatcher.supervisor.requests') + @patch('teuthology.dispatcher.supervisor.urljoin') + @patch('teuthology.dispatcher.supervisor.teuth_config') + def test_one_machine_ten_reimage_failed_jobs( + self, + m_t_config, + m_urljoin, + m_requests, + mark_down, + shortname + ): + targets = {'fakeos@rmachine061.front.sepia.ceph.com': 'ssh-ed25519'} + m_requests.get.return_value.json.return_value = \ + self.create_n_out_of_10_reimage_failed_jobs(10) + shortname.return_value = 'rmachine061' + self.the_function(targets) + assert mark_down.called + + 
@patch('teuthology.dispatcher.supervisor.shortname') + @patch('teuthology.lock.ops.update_lock') + @patch('teuthology.dispatcher.supervisor.requests') + @patch('teuthology.dispatcher.supervisor.urljoin') + @patch('teuthology.dispatcher.supervisor.teuth_config') + def test_one_machine_seven_reimage_failed_jobs( + self, + m_t_config, + m_urljoin, + m_requests, + mark_down, + shortname, + ): + targets = {'fakeos@rmachine061.front.sepia.ceph.com': 'ssh-ed25519'} + m_requests.get.return_value.json.return_value = \ + self.create_n_out_of_10_reimage_failed_jobs(7) + shortname.return_value = 'rmachine061' + self.the_function(targets) + assert mark_down.called is False + + @patch('teuthology.dispatcher.supervisor.shortname') + @patch('teuthology.lock.ops.update_lock') + @patch('teuthology.dispatcher.supervisor.requests') + @patch('teuthology.dispatcher.supervisor.urljoin') + @patch('teuthology.dispatcher.supervisor.teuth_config') + def test_two_machine_all_reimage_failed_jobs( + self, + m_t_config, + m_urljoin, + m_requests, + mark_down, + shortname, + ): + targets = {'fakeos@rmachine061.front.sepia.ceph.com': 'ssh-ed25519', + 'fakeos@rmachine179.back.sepia.ceph.com': 'ssh-ed45333'} + m_requests.get.return_value.json.side_effect = \ + [self.create_n_out_of_10_reimage_failed_jobs(10), + self.create_n_out_of_10_reimage_failed_jobs(10)] + shortname.return_value.side_effect = ['rmachine061', 'rmachine179'] + self.the_function(targets) + assert mark_down.call_count == 2 + + @patch('teuthology.dispatcher.supervisor.shortname') + @patch('teuthology.lock.ops.update_lock') + @patch('teuthology.dispatcher.supervisor.requests') + @patch('teuthology.dispatcher.supervisor.urljoin') + @patch('teuthology.dispatcher.supervisor.teuth_config') + def test_two_machine_one_healthy_one_reimage_failure( + self, + m_t_config, + m_urljoin, + m_requests, + mark_down, + shortname, + ): + targets = {'fakeos@rmachine061.front.sepia.ceph.com': 'ssh-ed25519', + 'fakeos@rmachine179.back.sepia.ceph.com': 
'ssh-ed45333'} + m_requests.get.return_value.json.side_effect = \ + [self.create_n_out_of_10_reimage_failed_jobs(0), + self.create_n_out_of_10_reimage_failed_jobs(10)] + shortname.return_value.side_effect = ['rmachine061', 'rmachine179'] + self.the_function(targets) + assert mark_down.call_count == 1 + assert mark_down.call_args_list[0][0][0].startswith('rmachine179') + diff --git a/teuthology/dispatcher/test/test_supervisor.py b/teuthology/dispatcher/test/test_supervisor.py new file mode 100644 index 000000000..2b422c07b --- /dev/null +++ b/teuthology/dispatcher/test/test_supervisor.py @@ -0,0 +1,117 @@ +from subprocess import DEVNULL +from unittest.mock import patch, Mock, MagicMock + +from teuthology.dispatcher import supervisor + + +class TestSuperviser(object): + @patch("teuthology.dispatcher.supervisor.run_with_watchdog") + @patch("teuthology.dispatcher.supervisor.teuth_config") + @patch("subprocess.Popen") + @patch("os.environ") + @patch("os.mkdir") + @patch("yaml.safe_dump") + @patch("tempfile.NamedTemporaryFile") + def test_run_job_with_watchdog(self, m_tempfile, m_safe_dump, m_mkdir, + m_environ, m_popen, m_t_config, + m_run_watchdog): + config = { + "suite_path": "suite/path", + "config": {"foo": "bar"}, + "verbose": True, + "owner": "the_owner", + "archive_path": "archive/path", + "name": "the_name", + "description": "the_description", + "job_id": "1", + } + m_tmp = MagicMock() + temp_file = Mock() + temp_file.name = "the_name" + m_tmp.__enter__.return_value = temp_file + m_tempfile.return_value = m_tmp + m_p = Mock() + m_p.returncode = 0 + m_popen.return_value = m_p + m_t_config.results_server = True + supervisor.run_job(config, "teuth/bin/path", "archive/dir", verbose=False) + m_run_watchdog.assert_called_with(m_p, config) + expected_args = [ + 'teuth/bin/path/teuthology', + '-v', + '--owner', 'the_owner', + '--archive', 'archive/path', + '--name', 'the_name', + '--description', + 'the_description', + '--', + "archive/path/orig.config.yaml", + ] + 
m_popen.assert_called_with(args=expected_args, stderr=DEVNULL, stdout=DEVNULL) + + @patch("time.sleep") + @patch("teuthology.dispatcher.supervisor.teuth_config") + @patch("subprocess.Popen") + @patch("os.environ") + @patch("os.mkdir") + @patch("yaml.safe_dump") + @patch("tempfile.NamedTemporaryFile") + def test_run_job_no_watchdog(self, m_tempfile, m_safe_dump, m_mkdir, + m_environ, m_popen, m_t_config, + m_sleep): + config = { + "suite_path": "suite/path", + "config": {"foo": "bar"}, + "verbose": True, + "owner": "the_owner", + "archive_path": "archive/path", + "name": "the_name", + "description": "the_description", + "job_id": "1", + } + m_tmp = MagicMock() + temp_file = Mock() + temp_file.name = "the_name" + m_tmp.__enter__.return_value = temp_file + m_tempfile.return_value = m_tmp + env = dict(PYTHONPATH="python/path") + m_environ.copy.return_value = env + m_p = Mock() + m_p.returncode = 1 + m_popen.return_value = m_p + m_t_config.results_server = False + supervisor.run_job(config, "teuth/bin/path", "archive/dir", verbose=False) + + @patch("teuthology.dispatcher.supervisor.report.try_push_job_info") + @patch("time.sleep") + def test_run_with_watchdog_no_reporting(self, m_sleep, m_try_push): + config = { + "name": "the_name", + "job_id": "1", + "archive_path": "archive/path", + "teuthology_branch": "main" + } + process = Mock() + process.poll.return_value = "not None" + supervisor.run_with_watchdog(process, config) + m_try_push.assert_called_with( + dict(name=config["name"], job_id=config["job_id"]), + dict(status='dead') + ) + + @patch("subprocess.Popen") + @patch("time.sleep") + @patch("teuthology.dispatcher.supervisor.report.try_push_job_info") + def test_run_with_watchdog_with_reporting(self, m_tpji, m_sleep, m_popen): + config = { + "name": "the_name", + "job_id": "1", + "archive_path": "archive/path", + "teuthology_branch": "jewel" + } + process = Mock() + process.poll.return_value = "not None" + m_proc = Mock() + m_proc.poll.return_value = "not None" + 
m_popen.return_value = m_proc + supervisor.run_with_watchdog(process, config) diff --git a/teuthology/exceptions.py b/teuthology/exceptions.py new file mode 100644 index 000000000..da3834354 --- /dev/null +++ b/teuthology/exceptions.py @@ -0,0 +1,238 @@ +class BranchNotFoundError(ValueError): + def __init__(self, branch, repo=None): + self.branch = branch + self.repo = repo + + def __str__(self): + if self.repo: + repo_str = " in repo: %s" % self.repo + else: + repo_str = "" + return "Branch '{branch}' not found{repo_str}!".format( + branch=self.branch, repo_str=repo_str) + + +class BranchMismatchError(ValueError): + def __init__(self, branch, repo, reason=None): + self.branch = branch + self.repo = repo + self.reason = reason + + def __str__(self): + msg = f"Cannot use branch {self.branch} with repo {self.repo}" + if self.reason: + msg = f"{msg} because {self.reason}" + return msg + +class CommitNotFoundError(ValueError): + def __init__(self, commit, repo=None): + self.commit = commit + self.repo = repo + + def __str__(self): + if self.repo: + repo_str = " in repo: %s" % self.repo + else: + repo_str = "" + return "'{commit}' not found{repo_str}!".format( + commit=self.commit, repo_str=repo_str) + + +class GitError(RuntimeError): + pass + + +class BootstrapError(RuntimeError): + pass + + +class ConfigError(RuntimeError): + """ + Meant to be used when an invalid config entry is found. 
+ """ + pass + + +class ParseError(Exception): + pass + + +class CommandFailedError(Exception): + + """ + Exception thrown on command failure + """ + def __init__(self, command, exitstatus, node=None, label=None): + self.command = command + self.exitstatus = exitstatus + self.node = node + self.label = label + + def __str__(self): + prefix = "Command failed" + if self.label: + prefix += " ({label})".format(label=self.label) + if self.node: + prefix += " on {node}".format(node=self.node) + return "{prefix} with status {status}: {cmd!r}".format( + status=self.exitstatus, + cmd=self.command, + prefix=prefix, + ) + + def fingerprint(self): + """ + Returns a list of strings to group failures with. + Used by sentry instead of grouping by backtrace. + """ + return [ + self.label or self.command, + 'exit status {}'.format(self.exitstatus), + '{{ type }}', + ] + + +class AnsibleFailedError(Exception): + + """ + Exception thrown when an ansible playbook fails + """ + def __init__(self, failures): + self.failures = failures + + def __str__(self): + return "{failures}".format( + failures=self.failures, + ) + + def fingerprint(self): + """ + Sentry will use this to group events by their failure reasons, rather + than lumping all AnsibleFailedErrors together + """ + return self.failures + + +class CommandCrashedError(Exception): + + """ + Exception thrown on crash + """ + def __init__(self, command): + self.command = command + + def __str__(self): + return "Command crashed: {command!r}".format( + command=self.command, + ) + + +class ConnectionLostError(Exception): + + """ + Exception thrown when the connection is lost + """ + def __init__(self, command, node=None): + self.command = command + self.node = node + + def __str__(self): + node_str = 'to %s ' % self.node if self.node else '' + return "SSH connection {node_str}was lost: {command!r}".format( + node_str=node_str, + command=self.command, + ) + + +class ScheduleFailError(RuntimeError): + def __init__(self, message, 
name=None): + self.message = message + self.name = name + + def __str__(self): + return "Scheduling {name} failed: {msg}".format( + name=self.name, + msg=self.message, + ).replace(' ', ' ') + + +class VersionNotFoundError(Exception): + def __init__(self, url): + self.url = url + + def __str__(self): + return "Failed to fetch package version from %s" % self.url + + +class UnsupportedPackageTypeError(Exception): + def __init__(self, node): + self.node = node + + def __str__(self): + return "os.package_type {pkg_type!r} on {node}".format( + node=self.node, pkg_type=self.node.os.package_type) + + +class SELinuxError(Exception): + def __init__(self, node, denials): + self.node = node + self.denials = denials + + def __str__(self): + return "SELinux denials found on {node}: {denials}".format( + node=self.node, denials=self.denials) + + +class QuotaExceededError(Exception): + def __init__(self, message): + self.message = message + + def __str__(self): + return self.message + + +class SkipJob(Exception): + """ + Used by teuthology.worker when it notices that a job is broken and should + be skipped. 
+ """ + pass + + +class MaxWhileTries(Exception): + pass + + +class ConsoleError(Exception): + pass + + +class NoRemoteError(Exception): + message = "This operation requires a remote" + + def __str__(self): + return self.message + + +class UnitTestError(Exception): + """ + Exception thrown on unit test failure + """ + def __init__(self, exitstatus=None, node=None, label=None, message=None): + self.exitstatus = exitstatus + self.node = node + self.label = label + self.message = message + + def __str__(self): + prefix = "Unit test failed" + if self.label: + prefix += " ({label})".format(label=self.label) + if self.node: + prefix += " on {node}".format(node=self.node) + if self.exitstatus: + prefix += " with status {status}".format(status=self.exitstatus) + return "{prefix}: '{message}'".format( + prefix=prefix, + message=self.message, + ) diff --git a/teuthology/exit.py b/teuthology/exit.py new file mode 100644 index 000000000..266e988eb --- /dev/null +++ b/teuthology/exit.py @@ -0,0 +1,78 @@ +import logging +import os +import signal + + +log = logging.getLogger(__name__) + + +class Exiter(object): + """ + A helper to manage any signal handlers we need to call upon receiving a + given signal + """ + def __init__(self): + self.handlers = list() + + def add_handler(self, signals, func): + """ + Adds a handler function to be called when any of the given signals are + received. 
+ + The handler function should have a signature like:: + + my_handler(signal, frame) + """ + if isinstance(signals, int): + signals = [signals] + + for signal_ in signals: + signal.signal(signal_, self.default_handler) + + handler = Handler(self, func, signals) + log.debug( + "Installing handler: %s", + repr(handler), + ) + self.handlers.append(handler) + return handler + + def default_handler(self, signal_, frame): + log.debug( + "Got signal %s; running %s handler%s...", + signal_, + len(self.handlers), + '' if len(self.handlers) == 1 else 's', + ) + for handler in self.handlers: + handler.func(signal_, frame) + log.debug("Finished running handlers") + # Restore the default handler + signal.signal(signal_, 0) + # Re-send the signal to our main process + os.kill(os.getpid(), signal_) + + +class Handler(object): + def __init__(self, exiter, func, signals): + self.exiter = exiter + self.func = func + self.signals = signals + + def remove(self): + try: + log.debug("Removing handler: %s", self) + self.exiter.handlers.remove(self) + except ValueError: + pass + + def __repr__(self): + return "{c}(exiter={e}, func={f}, signals={s})".format( + c=self.__class__.__name__, + e=self.exiter, + f=self.func, + s=self.signals, + ) + + +exiter = Exiter() diff --git a/teuthology/exporter.py b/teuthology/exporter.py new file mode 100644 index 000000000..30aead875 --- /dev/null +++ b/teuthology/exporter.py @@ -0,0 +1,347 @@ +import contextlib +import itertools +import logging +import os +import psutil +import time + +from pathlib import Path + +import teuthology.beanstalk as beanstalk +import teuthology.dispatcher +from teuthology.config import config +from teuthology.lock.query import list_locks + +log = logging.getLogger(__name__) + + +PROMETHEUS_MULTIPROC_DIR = Path("~/.cache/teuthology-exporter").expanduser() +os.environ["PROMETHEUS_MULTIPROC_DIR"] = str(PROMETHEUS_MULTIPROC_DIR) + +# We can't import prometheus_client until after we set PROMETHEUS_MULTIPROC_DIR +from 
prometheus_client import ( # noqa: E402 + start_http_server, + Gauge, + Counter, + Summary, + multiprocess, + CollectorRegistry, +) + +MACHINE_TYPES = list(config.active_machine_types) +REGISTRY = None + + +class TeuthologyExporter: + port = 61764 # int(''.join([str((ord(c) - 100) % 10) for c in "teuth"])) + + def __init__(self, interval=60): + if REGISTRY: + for file in PROMETHEUS_MULTIPROC_DIR.iterdir(): + file.unlink() + self.interval = interval + self.metrics = [ + Dispatchers(), + BeanstalkQueue(), + JobProcesses(), + Nodes(), + ] + self._created_time = time.perf_counter() + + def start(self): + if REGISTRY: + start_http_server(self.port, registry=REGISTRY) + self.loop() + + def update(self): + log.info("Updating...") + for metric in self.metrics: + metric.update() + log.info("Update finished.") + + def loop(self): + log.info("Starting teuthology-exporter...") + while True: + try: + before = time.perf_counter() + if before - self._created_time > 24 * 60 * 60: + self.restart() + try: + self.update() + except Exception: + log.exception("Failed to update metrics") + interval = self.interval + # try to deliver metrics _at_ $interval, as opposed to sleeping + # for $interval between updates + elapsed: float = time.perf_counter() - before + if elapsed < 0: + interval *= 2 + interval -= elapsed + time.sleep(interval) + except KeyboardInterrupt: + log.info("Stopping.") + raise SystemExit + + def restart(self): + # Use the dispatcher's restart function - note that by using this here, + # it restarts the exporter, *not* the dispatcher. 
+ if REGISTRY: + return teuthology.dispatcher.restart(log=log) + + +class SingletonMeta(type): + _instances = {} + def __call__(cls, *args, **kwargs): + if cls not in cls._instances: + instance = super().__call__(*args, **kwargs) + cls._instances[cls] = instance + return cls._instances[cls] + + +class TeuthologyMetric(metaclass=SingletonMeta): + def __init__(self): + if REGISTRY: + self._init() + + def _init(self): + raise NotImplementedError + + def update(self): + if REGISTRY: + self._update() + + def _update(self): + raise NotImplementedError + + def record(self, **kwargs): + if REGISTRY: + self._record(**kwargs) + + def _record(self, **_): + raise NotImplementedError + + @contextlib.contextmanager + def time(self, **labels): + if REGISTRY: + yield self._time(**labels) + else: + yield + + def _time(self): + raise NotImplementedError + + +class Dispatchers(TeuthologyMetric): + def _init(self): + self.metric = Gauge( + "teuthology_dispatchers", + "Teuthology Dispatchers", + ["machine_type"], + ) + + def _update(self): + dispatcher_procs = teuthology.dispatcher.find_dispatcher_processes() + for machine_type in MACHINE_TYPES: + self.metric.labels(machine_type).set( + len(dispatcher_procs.get(machine_type, [])) + ) + + +class BeanstalkQueue(TeuthologyMetric): + def _init(self): + self.length = Gauge( + "beanstalk_queue_length", + "Beanstalk Queue Length", + ["machine_type"], + ) + self.paused = Gauge( + "beanstalk_queue_paused", "Beanstalk Queue is Paused", ["machine_type"] + ) + + def _update(self): + for machine_type in MACHINE_TYPES: + queue_stats = beanstalk.stats_tube(beanstalk.connect(), machine_type) + self.length.labels(machine_type).set(queue_stats["count"]) + self.paused.labels(machine_type).set(1 if queue_stats["paused"] else 0) + + +class JobProcesses(TeuthologyMetric): + def _init(self): + self.metric = Gauge( + "teuthology_job_processes", + "Teuthology Job Processes", + ) + + def _update(self): + attrs = ["pid", "cmdline"] + total = 0 + for proc in 
psutil.process_iter(attrs=attrs): + if self._match(proc): + total += 1 + self.metric.set(total) + + @staticmethod + def _match(proc): + try: + cmdline = proc.cmdline() + except psutil.ZombieProcess: + return False + except psutil.AccessDenied: + return False + if not len(cmdline) > 1: + return False + if not cmdline[1].endswith("teuthology"): + return False + if "--archive" not in cmdline: + return False + if "--name" not in cmdline: + return False + try: + owner_index = cmdline.index("--owner") + 1 + if not cmdline[owner_index].startswith("scheduled_"): + return False + except ValueError: + return False + return True + + +class Nodes(TeuthologyMetric): + def _init(self): + self.metric = Gauge( + "teuthology_nodes", + "Teuthology Nodes", + ["machine_type", "locked", "up"], + ) + + def _update(self): + for machine_type in MACHINE_TYPES: + nodes = list_locks(machine_type=machine_type) + for up, locked in itertools.product([True, False], [True, False]): + self.metric.labels(machine_type=machine_type, up=up, locked=locked).set( + len([n for n in nodes if n["up"] is up and n["locked"] is locked]) + ) + + +class JobResults(TeuthologyMetric): + def _init(self): + self.metric = Counter( + "teuthology_job_results", + "Teuthology Job Results", + ["machine_type", "status"], + ) + + # As this is to be used within job processes, we implement record() rather than update() + def _record(self, **labels): + self.metric.labels(**labels).inc() + + +class NodeReimagingResults(TeuthologyMetric): + def _init(self): + self.metric = Counter( + "teuthology_reimaging_results", + "Teuthology Reimaging Results", + ["machine_type", "status"], + ) + + # As this is to be used within job processes, we implement record() rather than update() + def _record(self, **labels): + if REGISTRY: + self.metric.labels(**labels).inc() + + +class NodeLockingTime(TeuthologyMetric): + def _init(self): + self.metric = Summary( + "teuthology_node_locking_duration_seconds", + "Time spent waiting to lock nodes", + 
["machine_type", "count"], + ) + + def _time(self, **labels): + yield self.metric.labels(**labels).time() + + +class NodeReimagingTime(TeuthologyMetric): + def _init(self): + self.metric = Summary( + "teuthology_node_reimaging_duration_seconds", + "Time spent reimaging nodes", + ["machine_type", "count"], + ) + + def _time(self, **labels): + yield self.metric.labels(**labels).time() + + +class JobTime(TeuthologyMetric): + def _init(self): + self.metric = Summary( + "teuthology_job_duration_seconds", + "Time spent executing a job", + ["suite"], + ) + + def _time(self, **labels): + yield self.metric.labels(**labels).time() + + +class TaskTime(TeuthologyMetric): + def _init(self): + self.metric = Summary( + "teuthology_task_duration_seconds", + "Time spent executing a task", + ["name", "phase"], + ) + + def _time(self, **labels): + yield self.metric.labels(**labels).time() + + +class BootstrapTime(TeuthologyMetric): + def _init(self): + self.metric = Summary( + "teuthology_bootstrap_duration_seconds", + "Time spent running teuthology's bootstrap script", + ) + + def _time(self, **labels): + yield self.metric.labels(**labels).time() + + +def find_exporter_process() -> int | None: + attrs = ['pid', 'uids', 'cmdline'] + for proc in psutil.process_iter(attrs=attrs): + try: + cmdline = proc.info['cmdline'] + except psutil.AccessDenied: + continue + pid = proc.info['pid'] + if not cmdline: + continue + if not [i for i in cmdline if i.split('/')[-1] == 'teuthology-exporter']: + continue + if os.getuid() not in proc.info['uids']: + continue + return pid + + +def main(args) -> int: + if pid := find_exporter_process(): + if os.getpid() != pid: + log.error(f"teuthology-exporter is already running as PID {pid}") + return 2 + exporter = TeuthologyExporter(interval=int(args["--interval"])) + try: + exporter.start() + except Exception: + log.exception("Exporter failed") + return 1 + else: + return 0 + + +pid = find_exporter_process() +if pid: + 
PROMETHEUS_MULTIPROC_DIR.mkdir(parents=True, exist_ok=True) + REGISTRY = CollectorRegistry() + multiprocess.MultiProcessCollector(REGISTRY) diff --git a/teuthology/job_status.py b/teuthology/job_status.py new file mode 100644 index 000000000..05ff80d71 --- /dev/null +++ b/teuthology/job_status.py @@ -0,0 +1,38 @@ +def get_status(summary): + """ + :param summary: The job summary dict. Normally ctx.summary + :returns: A status string like 'pass', 'fail', or 'dead' + """ + status = summary.get('status') + if status is not None: + return status + + success = summary.get('success') + if success is True: + status = 'pass' + elif success is False: + status = 'fail' + else: + status = None + return status + + +def set_status(summary, status): + """ + Sets summary['status'] to status, and summary['success'] to True if status + is 'pass'. If status is not 'pass', then 'success' is False. + + If status is None, do nothing. + + :param summary: The job summary dict. Normally ctx.summary + :param status: The job status, e.g. 
'pass', 'fail', 'dead' + """ + if status is None: + return + + summary['status'] = status + if status == 'pass': + summary['success'] = True + else: + summary['success'] = False + diff --git a/teuthology/kill.py b/teuthology/kill.py new file mode 100755 index 000000000..137e49080 --- /dev/null +++ b/teuthology/kill.py @@ -0,0 +1,248 @@ +#!/usr/bin/python +import os +import sys +import yaml +import psutil +import subprocess +import logging +import getpass + +from typing import Union + +import teuthology.exporter + +from teuthology import beanstalk +from teuthology import report +from teuthology.config import config +from teuthology.lock import ops as lock_ops + +log = logging.getLogger(__name__) + + +def main(args): + run_name = args['--run'] + job = args['--job'] + jobspec = args['--jobspec'] + archive_base = args['--archive'] + owner = args['--owner'] + machine_type = args['--machine-type'] + preserve_queue = args['--preserve-queue'] + + if jobspec: + split_spec = jobspec.split('/') + run_name = split_spec[0] + job = [split_spec[1]] + + if job: + for job_id in job: + kill_job(run_name, job_id, archive_base, owner) + else: + kill_run(run_name, archive_base, owner, machine_type, + preserve_queue=preserve_queue) + + +def kill_run(run_name, archive_base=None, owner=None, machine_type=None, + preserve_queue=False): + run_info = {} + serializer = report.ResultsSerializer(archive_base) + if archive_base: + run_archive_dir = os.path.join(archive_base, run_name) + if os.path.isdir(run_archive_dir): + run_info = find_run_info(serializer, run_name) + if 'machine_type' in run_info: + machine_type = run_info['machine_type'] + owner = run_info['owner'] + else: + log.warning("The run info does not have machine type: %s" % run_info) + log.warning("Run archive used: %s" % run_archive_dir) + log.info("Using machine type '%s' and owner '%s'" % (machine_type, owner)) + elif machine_type is None: + # no jobs found in archive and no machine type specified, + # so we try paddles to see 
if there is anything scheduled + run_info = report.ResultsReporter().get_run(run_name) + machine_type = run_info.get('machine_type', None) + if machine_type: + log.info(f"Using machine type '{machine_type}' received from paddles.") + else: + raise RuntimeError(f"Cannot find machine type for the run {run_name}; " + + "you must also pass --machine-type") + + if not preserve_queue: + remove_beanstalk_jobs(run_name, machine_type) + remove_paddles_jobs(run_name) + if kill_processes(run_name, run_info.get('pids')): + return + if owner is not None: + targets = find_targets(run_name) + names = list(targets.keys()) + lock_ops.unlock_safe(names, owner, run_name) + report.try_mark_run_dead(run_name) + + +def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False): + serializer = report.ResultsSerializer(archive_base) + job_info = serializer.job_info(run_name, job_id) + # If we can't read the filesystem, job_info will be nearly empty. Ask paddles: + if 'name' not in job_info: + job_info = report.ResultsReporter().get_jobs(run_name, job_id) + if not owner: + if 'owner' not in job_info: + raise RuntimeError( + "I could not figure out the owner of the requested job. 
" + "Please pass --owner .") + owner = job_info['owner'] + if kill_processes(run_name, [job_info.get('pid')]): + return + report.try_push_job_info(job_info, dict(status="dead")) + if 'machine_type' in job_info: + teuthology.exporter.JobResults().record( + machine_type=job_info["machine_type"], + status=job_info.get("status", "dead") + ) + else: + log.warn(f"Job {job_id} has no machine_type; cannot report via Prometheus") + if not skip_unlock: + targets = find_targets(run_name, job_id) + lock_ops.unlock_safe(list(targets.keys()), owner, run_name, job_id) + + +def find_run_info(serializer, run_name): + log.info("Assembling run information...") + run_info_fields = [ + 'machine_type', + 'owner', + ] + + pids = [] + run_info = {} + job_info = {} + job_num = 0 + jobs = serializer.jobs_for_run(run_name) + job_total = len(jobs) + for (job_id, job_dir) in jobs.items(): + if not os.path.isdir(job_dir): + continue + job_num += 1 + beanstalk.print_progress(job_num, job_total, 'Reading Job: ') + job_info = serializer.job_info(run_name, job_id, simple=True) + for key in job_info.keys(): + if key in run_info_fields and key not in run_info: + run_info[key] = job_info[key] + if 'pid' in job_info: + pids.append(job_info['pid']) + run_info['pids'] = pids + return run_info + + +def remove_paddles_jobs(run_name): + jobs = report.ResultsReporter().get_jobs(run_name, fields=['status']) + job_ids = [job['job_id'] for job in jobs if job['status'] == 'queued'] + if job_ids: + log.info("Deleting jobs from paddles: %s", str(job_ids)) + report.try_delete_jobs(run_name, job_ids) + + +def remove_beanstalk_jobs(run_name, tube_name): + qhost = config.queue_host + qport = config.queue_port + if qhost is None or qport is None: + raise RuntimeError( + 'Beanstalk queue information not found in {conf_path}'.format( + conf_path=config.yaml_path)) + log.info("Checking Beanstalk Queue...") + beanstalk_conn = beanstalk.connect() + real_tube_name = beanstalk.watch_tube(beanstalk_conn, tube_name) + + curjobs 
= beanstalk_conn.stats_tube(real_tube_name)['current-jobs-ready'] + if curjobs != 0: + x = 1 + while x != curjobs: + x += 1 + job = beanstalk_conn.reserve(timeout=20) + if job is None: + continue + job_config = yaml.safe_load(job.body) + if run_name == job_config['name']: + job_id = job.stats()['id'] + msg = "Deleting job from queue. ID: " + \ + "{id} Name: {name} Desc: {desc}".format( + id=str(job_id), + name=job_config['name'], + desc=job_config['description'], + ) + log.info(msg) + job.delete() + else: + print("No jobs in Beanstalk Queue") + beanstalk_conn.close() + + +def kill_processes(run_name, pids=None): + if pids: + to_kill = set(pids).intersection(psutil.pids()) + else: + to_kill = find_pids(run_name) + + pids_need_sudo = set() + for pid in set(to_kill): + if not process_matches_run(pid, run_name): + to_kill.remove(pid) + elif psutil.Process(int(pid)).username() != getpass.getuser(): + pids_need_sudo.add(pid) + + survivors = [] + if len(to_kill) == 0: + log.info("No teuthology processes running") + else: + log.info("Killing Pids: " + str(to_kill)) + sudo_works = False + if pids_need_sudo: + sudo_works = subprocess.Popen(['sudo', '-n', '-l']).wait() == 0 + if not sudo_works: + log.debug("Passwordless sudo not configured; not using sudo") + for pid in to_kill: + use_sudo = pid in pids_need_sudo and sudo_works + args = ['kill', str(pid)] + # Don't attempt to use sudo if it's not necessary + if use_sudo: + args = ['sudo', '-n'] + args + try: + subprocess.check_call(args) + except subprocess.CalledProcessError: + survivors.append(pid) + if survivors: + log.error(f"Failed to kill PIDs: {survivors}") + return survivors + + +def process_matches_run(pid, run_name): + try: + p = psutil.Process(pid) + cmd = p.cmdline() + if run_name in cmd and sys.argv[0] not in cmd: + return True + except psutil.NoSuchProcess: + pass + except psutil.AccessDenied: + pass + return False + + +def find_pids(run_name): + run_pids = [] + for pid in psutil.pids(): + if 
process_matches_run(pid, run_name): + run_pids.append(pid) + return run_pids + +def find_targets(run_name: str, job_id: Union[str, int, None] = None) -> dict: + if job_id is not None: + job_info = report.ResultsReporter().get_jobs(run_name, str(job_id)) + return job_info.get("targets") or dict() + result = dict() + run_info = report.ResultsReporter().get_jobs(run_name) + for job_info in run_info: + if job_info.get("status") not in ("running", "waiting"): + continue + result.update(job_info.get("targets") or dict()) + return result diff --git a/teuthology/lock/__init__.py b/teuthology/lock/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/teuthology/lock/cli.py b/teuthology/lock/cli.py new file mode 100644 index 000000000..ed4f28bd5 --- /dev/null +++ b/teuthology/lock/cli.py @@ -0,0 +1,302 @@ +import argparse +import collections +import json +import logging +import re + +import yaml + +import teuthology +import teuthology.parallel +import teuthology.provision +from teuthology import misc +from teuthology.config import set_config_attr + +from teuthology.lock import ( + ops, + util, + query, +) + + +log = logging.getLogger(__name__) + + +def main(ctx): + if ctx.verbose: + teuthology.log.setLevel(logging.DEBUG) + + set_config_attr(ctx) + + ret = 0 + user = ctx.owner + machines = [misc.canonicalize_hostname(m, user=False) + for m in ctx.machines] + machines_to_update = [] + + if ctx.targets: + try: + with open(ctx.targets) as f: + g = yaml.safe_load_all(f) + for new in g: + if 'targets' in new: + for t in new['targets'].keys(): + machines.append(t) + except IOError as e: + raise argparse.ArgumentTypeError(str(e)) + + if ctx.f: + assert ctx.lock or ctx.unlock, \ + '-f is only supported by --lock and --unlock' + if machines: + assert ctx.lock or ctx.unlock or ctx.list or ctx.list_targets \ + or ctx.update or ctx.brief, \ + 'machines cannot be specified with that operation' + else: + if ctx.lock: + log.error("--lock requires specific machines passed 
as arguments") + else: + # This condition might never be hit, but it's not clear. + assert ctx.num_to_lock or ctx.list or ctx.list_targets or \ + ctx.summary or ctx.brief, \ + 'machines must be specified for that operation' + if ctx.all: + assert ctx.list or ctx.list_targets or ctx.brief, \ + '--all can only be used with --list, --list-targets, and --brief' + assert ctx.owner is None, \ + '--all and --owner are mutually exclusive' + assert not machines, \ + '--all and listing specific machines are incompatible' + if ctx.num_to_lock: + assert ctx.machine_type, \ + 'must specify machine type to lock' + + if ctx.brief or ctx.list or ctx.list_targets: + assert ctx.desc is None, '--desc does nothing with --list/--brief' + + # we may need to update host keys for vms. Don't do it for + # every vm; however, update any vms included in the list given + # to the CLI (machines), or any owned by the specified owner or + # invoking user if no machines are specified. + vmachines = [] + statuses = query.get_statuses(machines) + owner = ctx.owner or misc.get_user() + for machine in statuses: + if query.is_vm(status=machine) and machine['locked'] and \ + (machines or machine['locked_by'] == owner): + vmachines.append(machine['name']) + if vmachines: + log.info("updating host keys for %s", ' '.join(sorted(vmachines))) + ops.do_update_keys(vmachines, _raise=False) + # get statuses again to refresh any updated keys + statuses = query.get_statuses(machines) + if statuses: + statuses = util.winnow(statuses, ctx.machine_type, 'machine_type') + if not machines and ctx.owner is None and not ctx.all: + ctx.owner = misc.get_user() + statuses = util.winnow(statuses, ctx.owner, 'locked_by') + statuses = util.winnow(statuses, ctx.status, 'up', + lambda s: s['up'] == (ctx.status == 'up')) + statuses = util.winnow(statuses, ctx.locked, 'locked', + lambda s: s['locked'] == (ctx.locked == 'true')) + statuses = util.winnow(statuses, ctx.desc, 'description') + statuses = util.winnow(statuses, 
ctx.desc_pattern, 'description', + lambda s: s['description'] and \ + ctx.desc_pattern in s['description']) + if ctx.json_query: + statuses = util.json_matching_statuses(ctx.json_query, statuses) + statuses = util.winnow(statuses, ctx.os_type, 'os_type') + statuses = util.winnow(statuses, ctx.os_version, 'os_version') + + # When listing, only show the vm_host's name, not every detail + for s in statuses: + if not query.is_vm(status=s): + continue + # with an OpenStack API, there is no host for a VM + if s['vm_host'] is None: + continue + vm_host_name = s.get('vm_host', dict())['name'] + if vm_host_name: + s['vm_host'] = vm_host_name + if ctx.list: + print(json.dumps(statuses, indent=4)) + + elif ctx.brief: + maxname = max((len(_['name'] or '') + for _ in statuses), default=0) + maxuser = max((len(_['locked_by'] or 'None') + for _ in statuses), default=0) + node_status_template = ( + '{{host:<{name}}} {{up:<4}} {{locked:<8}} ' + '{{owner:<{user}}} "{{desc}}"' + ).format(name=maxname, user=maxuser) + for s in sorted(statuses, key=lambda s: s.get('name')): + locked = 'unlocked' if s['locked'] == 0 else 'locked' + up = 'up' if s['up'] else 'down' + mo = re.match(r'\w+@(\w+?)\..*', s['name']) + host = mo.group(1) if mo else s['name'] + print(node_status_template.format( + up=up, locked=locked, host=host, + owner=s['locked_by'] or 'None', desc=s['description'])) + + else: + frag = {'targets': {}} + for f in statuses: + frag['targets'][f['name']] = f['ssh_pub_key'] + print(yaml.safe_dump(frag, default_flow_style=False)) + else: + log.error('error retrieving lock statuses') + ret = 1 + + elif ctx.summary: + do_summary(ctx) + return 0 + + elif ctx.lock: + if not util.vps_version_or_type_valid( + ctx.machine_type, ctx.os_type, ctx.os_version): + log.error('Invalid os-type or version detected -- lock failed') + return 1 + reimage_types = teuthology.provision.get_reimage_types() + reimage_machines = list() + updatekeys_machines = list() + machine_types = dict() + for machine 
in machines: + resp = ops.lock_one(machine, user, ctx.desc) + if resp.ok: + machine_status = resp.json() + machine_type = machine_status['machine_type'] + machine_types[machine] = machine_type + if not resp.ok: + ret = 1 + if not ctx.f: + return ret + elif not query.is_vm(machine, machine_status): + if machine_type in reimage_types: + # Reimage in parallel just below here + reimage_machines.append(machine) + # Update keys last + updatekeys_machines = list() + else: + ops.update_nodes([machine], True) + created = teuthology.provision.create_if_vm( + ctx, + misc.canonicalize_hostname(machine), + ) + # do not try to update inventory if failed to create vm + if created: + machines_to_update.append(machine) + with teuthology.parallel.parallel() as p: + ops.update_nodes(reimage_machines, True) + for machine in reimage_machines: + p.spawn(teuthology.provision.reimage, ctx, machine, machine_types[machine]) + for machine in updatekeys_machines: + ops.do_update_keys([machine]) + ops.update_nodes(reimage_machines + machines_to_update) + + elif ctx.unlock: + if ctx.owner is None and user is None: + user = misc.get_user() + # If none of them are vpm, do them all in one shot + if not filter(query.is_vm, machines): + res = ops.unlock_many(machines, user) + return 0 if res else 1 + for machine in machines: + if not ops.unlock_one(machine, user): + ret = 1 + if not ctx.f: + return ret + else: + machines_to_update.append(machine) + elif ctx.num_to_lock: + result = ops.lock_many(ctx, ctx.num_to_lock, ctx.machine_type, user, + ctx.desc, ctx.os_type, ctx.os_version, ctx.arch) + if not result: + ret = 1 + else: + machines_to_update = result.keys() + if ctx.machine_type == 'vps': + shortnames = ' '.join( + [misc.decanonicalize_hostname(name) for name in + result.keys()] + ) + if len(result) < ctx.num_to_lock: + log.error("Locking failed.") + for machine in result: + ops.unlock_one(machine, user) + ret = 1 + else: + log.info("Successfully Locked:\n%s\n" % shortnames) + log.info( + "Unable 
to display keys at this time (virtual " + + "machines are booting).") + log.info( + "Please run teuthology-lock --list-targets %s once " + + "these machines come up.", + shortnames) + else: + print(yaml.safe_dump( + dict(targets=result), + default_flow_style=False)) + elif ctx.update: + assert ctx.desc is not None or ctx.status is not None, \ + 'you must specify description or status to update' + assert ctx.owner is None, 'only description and status may be updated' + machines_to_update = machines + + if ctx.desc is not None or ctx.status is not None: + for machine in machines_to_update: + ops.update_lock(machine, ctx.desc, ctx.status) + + return ret + + +def do_summary(ctx): + lockd = collections.defaultdict(lambda: [0, 0, 'unknown']) + if ctx.machine_type: + locks = query.list_locks(machine_type=ctx.machine_type) + else: + locks = query.list_locks() + for l in locks: + who = l['locked_by'] if l['locked'] == 1 \ + else '(free)', l['machine_type'] + lockd[who][0] += 1 + lockd[who][1] += 1 if l['up'] else 0 + lockd[who][2] = l['machine_type'] + + # sort locks by machine type and count + locks = sorted([p for p in lockd.items() + ], key=lambda sort: (sort[1][2] or '', sort[1][0])) + total_count, total_up = 0, 0 + print("TYPE COUNT UP OWNER") + + for (owner, (count, upcount, machinetype)) in locks: + # if machinetype == spectype: + print("{machinetype:8s} {count:3d} {up:3d} {owner}".format( + count=count, up=upcount, owner=owner[0], + machinetype=machinetype or '(none)')) + total_count += count + total_up += upcount + + print(" --- ---") + print("{cnt:12d} {up:3d}".format(cnt=total_count, up=total_up)) + + +def updatekeys(args): + loglevel = logging.DEBUG if args['--verbose'] else logging.INFO + logging.basicConfig( + level=loglevel, + ) + all_ = args['--all'] + machines = [] + if args['']: + machines = [misc.canonicalize_hostname(m, user=None) + for m in args['']] + elif args['--targets']: + targets = args['--targets'] + with open(targets) as f: + docs = 
yaml.safe_load_all(f) + for doc in docs: + machines = [n for n in doc.get('targets', dict()).keys()] + + return ops.do_update_keys(machines, all_)[0] diff --git a/teuthology/lock/ops.py b/teuthology/lock/ops.py new file mode 100644 index 000000000..968ab8641 --- /dev/null +++ b/teuthology/lock/ops.py @@ -0,0 +1,513 @@ +import logging +import json +import os +import random +import time +import yaml +import requests + +from typing import List, Union + +import teuthology.orchestra.remote +import teuthology.parallel +import teuthology.provision + +from teuthology import misc, report, provision +from teuthology.config import config +from teuthology.contextutil import safe_while +from teuthology.task import console_log +from teuthology.misc import canonicalize_hostname +from teuthology.job_status import set_status + +from teuthology.lock import util, query +from teuthology.orchestra import remote + +log = logging.getLogger(__name__) + + +def update_nodes(nodes, reset_os=False): + for node in nodes: + remote = teuthology.orchestra.remote.Remote( + canonicalize_hostname(node)) + if reset_os: + log.info("Updating [%s]: reset os type and version on server", node) + inventory_info = dict() + inventory_info['os_type'] = '' + inventory_info['os_version'] = '' + inventory_info['name'] = remote.hostname + else: + log.info("Updating [%s]: set os type and version on server", node) + inventory_info = remote.inventory_info + update_inventory(inventory_info) + + +def lock_many_openstack(ctx, num, machine_type, user=None, description=None, + arch=None): + os_type = teuthology.provision.get_distro(ctx) + os_version = teuthology.provision.get_distro_version(ctx) + if hasattr(ctx, 'config'): + resources_hint = ctx.config.get('openstack') + else: + resources_hint = None + machines = teuthology.provision.openstack.ProvisionOpenStack().create( + num, os_type, os_version, arch, resources_hint) + result = {} + for machine in machines: + lock_one(machine, user, description) + result[machine] = 
None # we do not collect ssh host keys yet + return result + + +def lock_many(ctx, num, machine_type, user=None, description=None, + os_type=None, os_version=None, arch=None, reimage=True): + if user is None: + user = misc.get_user() + + if not util.vps_version_or_type_valid( + ctx.machine_type, + os_type, + os_version + ): + log.error('Invalid os-type or version detected -- lock failed') + return + + # In the for loop below we can safely query for all bare-metal machine_type + # values at once. So, if we're being asked for 'plana,mira,burnupi', do it + # all in one shot. If we are passed 'plana,mira,burnupi,vps', do one query + # for 'plana,mira,burnupi' and one for 'vps' + machine_types_list = misc.get_multi_machine_types(machine_type) + downburst_types = teuthology.provision.downburst.get_types() + if all(t in downburst_types for t in machine_types_list): + machine_types = machine_types_list + elif machine_types_list == ['openstack']: + return lock_many_openstack(ctx, num, machine_type, + user=user, + description=description, + arch=arch) + elif any(t in downburst_types for t in machine_types_list): + the_vps = list(t for t in machine_types_list + if t in downburst_types) + non_vps = list(t for t in machine_types_list + if not t in downburst_types) + machine_types = ['|'.join(non_vps), '|'.join(the_vps)] + else: + machine_types_str = '|'.join(machine_types_list) + machine_types = [machine_types_str, ] + + for machine_type in machine_types: + uri = os.path.join(config.lock_server, 'nodes', 'lock_many', '') + data = dict( + locked_by=user, + count=num, + machine_type=machine_type, + description=description, + ) + # Only query for os_type/os_version if non-vps and non-libcloud, since + # in that case we just create them. 
+ vm_types = downburst_types + teuthology.provision.cloud.get_types() + reimage_types = teuthology.provision.get_reimage_types() + if machine_type not in (vm_types + reimage_types): + if os_type: + data['os_type'] = os_type + if os_version: + data['os_version'] = os_version + if arch: + data['arch'] = arch + log.debug("lock_many request: %s", repr(data)) + response = requests.post( + uri, + data=json.dumps(data), + headers={'content-type': 'application/json'}, + ) + if response.ok: + machines = dict() + for machine in response.json(): + key = misc.canonicalize_hostname( + machine['name'], + user=machine.get('user'), + ) + machines[key] = machine['ssh_pub_key'] + log.debug('locked {machines}'.format( + machines=', '.join(machines.keys()))) + if machine_type in vm_types: + ok_machs = {} + update_nodes(machines, True) + for machine in machines: + if teuthology.provision.create_if_vm(ctx, machine): + ok_machs[machine] = machines[machine] + else: + log.error('Unable to create virtual machine: %s', + machine) + unlock_one(machine, user) + ok_machs = do_update_keys(list(ok_machs.keys()))[1] + update_nodes(ok_machs) + return ok_machs + elif reimage and machine_type in reimage_types: + return reimage_machines(ctx, machines, machine_type) + return machines + elif response.status_code == 503: + log.error('Insufficient nodes available to lock %d %s nodes.', + num, machine_type) + log.error(response.text) + else: + log.error('Could not lock %d %s nodes, reason: unknown.', + num, machine_type) + return [] + + +def lock_one(name, user=None, description=None): + name = misc.canonicalize_hostname(name, user=None) + if user is None: + user = misc.get_user() + request = dict(name=name, locked=True, locked_by=user, + description=description) + uri = os.path.join(config.lock_server, 'nodes', name, 'lock', '') + response = requests.put(uri, json.dumps(request)) + success = response.ok + if success: + log.debug('locked %s as %s', name, user) + else: + try: + reason = 
response.json().get('message') + except ValueError: + reason = str(response.status_code) + log.error('failed to lock {node}. reason: {reason}'.format( + node=name, reason=reason)) + return response + + +def unlock_safe(names: List[str], owner: str, run_name: str = "", job_id: str = ""): + with teuthology.parallel.parallel() as p: + for name in names: + p.spawn(unlock_one_safe, name, owner, run_name, job_id) + return all(p) + + +def unlock_one_safe(name: str, owner: str, run_name: str = "", job_id: str = "") -> bool: + node_status = query.get_status(name) + if node_status.get("locked", False) is False: + log.info(f"Refusing to unlock {name} since it is already unlocked") + return False + maybe_job = query.node_active_job(name, node_status) + if not maybe_job: + return unlock_one(name, owner, node_status["description"], node_status) + if run_name: + if job_id and not maybe_job.endswith(f"{run_name}/{job_id}"): + log.info("Not unlocking {name} since it is running {maybe_job}, not {run_name}/{job_id}") + return False + elif not job_id and not maybe_job.endswith(run_name): + log.info(f"Not unlocking {name} since it is running {maybe_job}, not {run_name}") + return False + else: + return unlock_one(name, owner, node_status["description"], node_status) + else: + log.info(f"Refusing to unlock {name} since it has an active job: {maybe_job}") + return False + + +def unlock_many(names, user): + fixed_names = [misc.canonicalize_hostname(name, user=None) for name in + names] + names = fixed_names + uri = os.path.join(config.lock_server, 'nodes', 'unlock_many', '') + data = dict( + locked_by=user, + names=names, + ) + with safe_while( + sleep=1, increment=0.5, action=f'unlock_many {names}') as proceed: + while proceed(): + response = requests.post( + uri, + data=json.dumps(data), + headers={'content-type': 'application/json'}, + ) + if response.ok: + log.debug("Unlocked: %s", ', '.join(names)) + return True + log.error("Failed to unlock: %s", ', '.join(names)) + return False + + 
+def unlock_one(name, user, description=None, status: Union[dict, None] = None) -> bool: + name = misc.canonicalize_hostname(name, user=None) + if not description and status: + description = status["description"] + if not teuthology.provision.destroy_if_vm(name, user, description or ""): + log.error('destroy failed for %s', name) + return False + # we're trying to stop node before actual unlocking + status_info = teuthology.lock.query.get_status(name) + try: + if not teuthology.lock.query.is_vm(status=status_info): + stop_node(name, status) + except Exception: + log.exception(f"Failed to stop {name}!") + request = dict(name=name, locked=False, locked_by=user, + description=description) + uri = os.path.join(config.lock_server, 'nodes', name, 'lock', '') + with safe_while( + sleep=1, increment=0.5, action="unlock %s" % name) as proceed: + while proceed(): + try: + response = requests.put(uri, json.dumps(request)) + if response.ok: + log.info('unlocked: %s', name) + return response.ok + if response.status_code == 403: + break + # Work around https://github.com/kennethreitz/requests/issues/2364 + except requests.ConnectionError as e: + log.warning("Saw %s while unlocking; retrying...", str(e)) + try: + reason = response.json().get('message') + except ValueError: + reason = str(response.status_code) + log.error('failed to unlock {node}. 
reason: {reason}'.format( + node=name, reason=reason)) + return False + + +def update_lock(name, description=None, status=None, ssh_pub_key=None): + name = misc.canonicalize_hostname(name, user=None) + updated = {} + if description is not None: + updated['description'] = description + if status is not None: + updated['up'] = (status == 'up') + if ssh_pub_key is not None: + updated['ssh_pub_key'] = ssh_pub_key + + if updated: + uri = os.path.join(config.lock_server, 'nodes', name, '') + inc = random.uniform(0, 1) + with safe_while( + sleep=1, increment=inc, action=f'update lock {name}') as proceed: + while proceed(): + response = requests.put( + uri, + json.dumps(updated)) + if response.ok: + return True + return response.ok + return True + + +def update_inventory(node_dict): + """ + Like update_lock(), but takes a dict and doesn't try to do anything smart + by itself + """ + name = node_dict.get('name') + if not name: + raise ValueError("must specify name") + if not config.lock_server: + return + uri = os.path.join(config.lock_server, 'nodes', name, '') + log.info("Updating %s on lock server", name) + inc = random.uniform(0, 1) + with safe_while( + sleep=1, increment=inc, action=f'update inventory {name}') as proceed: + while proceed(): + response = requests.put( + uri, + json.dumps(node_dict), + headers={'content-type': 'application/json'}, + ) + if response.status_code == 404: + log.info("Creating new node %s on lock server", name) + uri = os.path.join(config.lock_server, 'nodes', '') + response = requests.post( + uri, + json.dumps(node_dict), + headers={'content-type': 'application/json'}, + ) + if response.ok: + return + +def do_update_keys(machines, all_=False, _raise=True): + reference = query.list_locks(keyed_by_name=True) + if all_: + machines = reference.keys() + keys_dict = misc.ssh_keyscan(machines, _raise=_raise) + return push_new_keys(keys_dict, reference), keys_dict + + +def push_new_keys(keys_dict, reference): + ret = 0 + for hostname, pubkey in 
keys_dict.items(): + log.info('Checking %s', hostname) + if reference[hostname]['ssh_pub_key'] != pubkey: + log.info('New key found. Updating...') + if not update_lock(hostname, ssh_pub_key=pubkey): + log.error('failed to update %s!', hostname) + ret = 1 + return ret + + +def reimage_machines(ctx, machines, machine_type): + reimage_types = teuthology.provision.get_reimage_types() + if machine_type not in reimage_types: + log.info(f"Skipping reimage of {machines.keys()} because {machine_type} is not in {reimage_types}") + return machines + # Setup log file, reimage machines and update their keys + reimaged = dict() + console_log_conf = dict( + logfile_name='{shortname}_reimage.log', + remotes=[teuthology.orchestra.remote.Remote(machine) + for machine in machines], + ) + with console_log.task(ctx, console_log_conf): + with teuthology.parallel.parallel() as p: + for machine in machines: + log.info("Start node '%s' reimaging", machine) + update_nodes([machine], True) + p.spawn(teuthology.provision.reimage, ctx, + machine, machine_type) + reimaged[machine] = machines[machine] + reimaged = do_update_keys(list(reimaged.keys()))[1] + update_nodes(reimaged) + return reimaged + + +def block_and_lock_machines(ctx, total_requested, machine_type, reimage=True, tries=10): + # It's OK for os_type and os_version to be None here. If we're trying + # to lock a bare metal machine, we'll take whatever is available. 
If + # we want a vps, defaults will be provided by misc.get_distro and + # misc.get_distro_version in provision.create_if_vm + os_type = ctx.config.get("os_type") + os_version = ctx.config.get("os_version") + arch = ctx.config.get('arch') + reserved = config.reserve_machines + assert isinstance(reserved, int), 'reserve_machines must be integer' + assert (reserved >= 0), 'reserve_machines should >= 0' + + log.info('Locking machines...') + # change the status during the locking process + report.try_push_job_info(ctx.config, dict(status='waiting')) + + all_locked = dict() + requested = total_requested + while True: + # get a candidate list of machines + machines = query.list_locks( + machine_type=machine_type, + up=True, + locked=False, + count=requested + reserved, + tries=tries, + ) + if machines is None: + if ctx.block: + log.error('Error listing machines, trying again') + time.sleep(20) + continue + else: + raise RuntimeError('Error listing machines') + + # make sure there are machines for non-automated jobs to run + if len(machines) < reserved + requested \ + and ctx.owner.startswith('scheduled'): + if ctx.block: + log.info( + 'waiting for more %s machines to be free (need %s + %s, have %s)...', + machine_type, + reserved, + requested, + len(machines), + ) + time.sleep(10) + continue + else: + assert 0, ('not enough machines free; need %s + %s, have %s' % + (reserved, requested, len(machines))) + + try: + newly_locked = lock_many(ctx, requested, machine_type, + ctx.owner, ctx.archive, os_type, + os_version, arch, reimage=reimage) + except Exception: + # Lock failures should map to the 'dead' status instead of 'fail' + if 'summary' in ctx: + set_status(ctx.summary, 'dead') + raise + all_locked.update(newly_locked) + log.info( + '{newly_locked} {mtype} machines locked this try, ' + '{total_locked}/{total_requested} locked so far'.format( + newly_locked=len(newly_locked), + mtype=machine_type, + total_locked=len(all_locked), + total_requested=total_requested, + ) + 
) + if len(all_locked) == total_requested: + vmlist = [] + for lmach in all_locked: + if query.is_vm(lmach): + vmlist.append(lmach) + if vmlist: + log.info('Waiting for virtual machines to come up') + keys_dict = dict() + loopcount = 0 + while len(keys_dict) != len(vmlist): + loopcount += 1 + time.sleep(10) + keys_dict = misc.ssh_keyscan(vmlist) + if loopcount == 40: + loopcount = 0 + log.info('virtual machine(s) still not up, ' + + 'recreating unresponsive ones.') + for guest in vmlist: + if guest not in keys_dict.keys(): + log.info('recreating: ' + guest) + full_name = misc.canonicalize_hostname(guest) + teuthology.provision.destroy_if_vm(full_name) + teuthology.provision.create_if_vm(ctx, full_name) + if do_update_keys(keys_dict)[0]: + log.info("Error in virtual machine keys") + newscandict = {} + for dkey in all_locked.keys(): + stats = query.get_status(dkey) + newscandict[dkey] = stats['ssh_pub_key'] + ctx.config['targets'] = newscandict + else: + ctx.config['targets'] = all_locked + locked_targets = yaml.safe_dump( + ctx.config['targets'], + default_flow_style=False + ).splitlines() + log.info('\n '.join(['Locked targets:', ] + locked_targets)) + # successfully locked machines, change status back to running + report.try_push_job_info(ctx.config, dict(status='running')) + break + elif not ctx.block: + assert 0, 'not enough machines are available' + else: + requested = requested - len(newly_locked) + assert requested > 0, "lock_machines: requested counter went" \ + "negative, this shouldn't happen" + + log.info( + "{total} machines locked ({new} new); need {more} more".format( + total=len(all_locked), new=len(newly_locked), more=requested) + ) + log.warning('Could not lock enough machines, waiting...') + time.sleep(10) + + +def stop_node(name: str, status: Union[dict, None]): + status = status or query.get_status(name) + remote_ = remote.Remote(name) + if status['machine_type'] in provision.fog.get_types(): + remote_.console.power_off() + return + elif 
status['machine_type'] in provision.pelagos.get_types(): + provision.pelagos.park_node(name) + return + elif remote_.is_container: + remote_.run( + args=['sudo', '/testnode_stop.sh'], + check_status=False, + ) + return diff --git a/teuthology/lock/query.py b/teuthology/lock/query.py new file mode 100644 index 000000000..ac4cede07 --- /dev/null +++ b/teuthology/lock/query.py @@ -0,0 +1,184 @@ +import datetime +import logging +import os +import requests + +from typing import Dict, List, Union + +from teuthology import misc +from teuthology.config import config +from teuthology.contextutil import safe_while +from teuthology.util.compat import urlencode +from teuthology.util.time import parse_timestamp + + +log = logging.getLogger(__name__) + + +def get_status(name) -> dict: + name = misc.canonicalize_hostname(name, user=None) + uri = os.path.join(config.lock_server, 'nodes', name, '') + with safe_while( + sleep=1, increment=0.5, action=f'get_status {name}') as proceed: + while proceed(): + response = requests.get(uri) + if response.ok: + return response.json() + elif response.status_code == 404: + return dict() + log.warning( + "Failed to query lock server for status of {name}".format(name=name)) + return dict() + + +def get_statuses(machines): + if machines: + statuses = [] + for machine in machines: + machine = misc.canonicalize_hostname(machine) + status = get_status(machine) + if status: + statuses.append(status) + else: + log.error("Lockserver doesn't know about machine: %s" % + machine) + else: + statuses = list_locks() + return statuses + + +def is_vm(name=None, status=None): + if status is None: + if name is None: + raise ValueError("Must provide either name or status, or both") + name = misc.canonicalize_hostname(name) + status = get_status(name) + return status.get('is_vm', False) + + +def list_locks(keyed_by_name=False, tries=10, **kwargs): + uri = os.path.join(config.lock_server, 'nodes', '') + for key, value in kwargs.items(): + if kwargs[key] is False: + 
kwargs[key] = '0' + if kwargs[key] is True: + kwargs[key] = '1' + if kwargs: + if 'machine_type' in kwargs: + kwargs['machine_type'] = kwargs['machine_type'].replace(',','|') + uri += '?' + urlencode(kwargs) + with safe_while( + sleep=1, + increment=0.5, + tries=tries, + action='list_locks' + ) as proceed: + while proceed(): + try: + response = requests.get(uri) + if response.ok: + break + except requests.ConnectionError: + log.exception("Could not contact lock server: %s, retrying...", config.lock_server) + if response.ok: + if not keyed_by_name: + return response.json() + else: + return {node['name']: node + for node in response.json()} + return dict() + + +def find_stale_locks(owner=None) -> List[Dict]: + """ + Return a list of node dicts corresponding to nodes that were locked to run + a job, but the job is no longer running. The purpose of this is to enable + us to find nodes that were left locked due to e.g. infrastructure failures + and return them to the pool. + + :param owner: If non-None, return nodes locked by owner. Default is None. + """ + def might_be_stale(node_dict): + """ + Answer the question: "might this be a stale lock?" + + The answer is yes if: + It is locked + It has a non-null description containing multiple '/' characters + + ... because we really want "nodes that were locked for a particular job + and are still locked" and the above is currently the best way to guess. + """ + desc = node_dict['description'] + if (node_dict['locked'] is True and + desc is not None and desc.startswith('/') and + desc.count('/') > 1): + return True + return False + + # Which nodes are locked for jobs? 
+ nodes = list_locks(locked=True) + if owner is not None: + nodes = [node for node in nodes if node['locked_by'] == owner] + nodes = filter(might_be_stale, nodes) + + # Here we build the list of of nodes that are locked, for a job (as opposed + # to being locked manually for random monkeying), where the job is not + # running + result = list() + for node in nodes: + if node_active_job(node["name"], grace_time=5): + continue + result.append(node) + return result + +def node_active_job(name: str, status: Union[dict, None] = None, grace_time: int = 0) -> Union[str, None]: + """ + Is this node's job active (e.g. running or waiting)? + + :param node: The node dict as returned from the lock server + :param cache: A set() used for caching results + :param grace: A period of time (in mins) after job finishes before we consider the node inactive + :returns: A string if the node has an active job, or None if not + """ + status = status or get_status(name) + if not status: + # This should never happen with a normal node + return "node had no status" + description = status['description'] + if '/' not in description: + # technically not an "active job", but someone locked the node + # for a different purpose and is likely still using it. + return description + (run_name, job_id) = description.split('/')[-2:] + if not run_name or job_id == '': + # We thought this node might have a stale job, but no. 
+ return "node description does not contained scheduled job info" + url = f"{config.results_server}/runs/{run_name}/jobs/{job_id}/" + job_status = "" + active = True + with safe_while( + sleep=1, increment=0.5, action='node_is_active') as proceed: + while proceed(): + resp = requests.get(url) + if resp.ok: + job_obj = resp.json() + job_status = job_obj["status"] + active = job_status and job_status not in ('pass', 'fail', 'dead') + if active: + break + job_updated = job_obj["updated"] + if not grace_time: + break + try: + delta = datetime.datetime.now(datetime.timezone.utc) - parse_timestamp(job_updated) + active = active or delta < datetime.timedelta(minutes=grace_time) + except Exception: + log.exception(f"{run_name}/{job_id} updated={job_updated}") + break + elif resp.status_code == 404: + break + else: + log.debug(f"Error {resp.status_code} listing job {run_name}/{job_id} for {name}: {resp.text}") + if active: + return description diff --git a/teuthology/lock/test/__init__.py b/teuthology/lock/test/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/teuthology/lock/test/test_lock.py b/teuthology/lock/test/test_lock.py new file mode 100644 index 000000000..5f1679afc --- /dev/null +++ b/teuthology/lock/test/test_lock.py @@ -0,0 +1,7 @@ +import teuthology.lock.util + +class TestLock(object): + + def test_locked_since_seconds(self): + node = { "locked_since": "2013-02-07 19:33:55.000000" } + assert teuthology.lock.util.locked_since_seconds(node) > 3600 diff --git a/teuthology/lock/util.py b/teuthology/lock/util.py new file mode 100644 index 000000000..955b97e43 --- /dev/null +++ b/teuthology/lock/util.py @@ -0,0 +1,121 @@ +import datetime +import json +import logging + +from teuthology import misc +import teuthology.provision.downburst + +log = logging.getLogger(__name__) + + +def vps_version_or_type_valid(machine_type, os_type, os_version): + """ + Check os-type and os-version parameters when locking a vps. 
+ Os-type will always be set (defaults to ubuntu). + + In the case where downburst does not handle list-json (an older version + of downburst, for instance), a message is printed and this checking + is skipped (so that this code should behave as it did before this + check was added). + """ + if not (machine_type in teuthology.provision.downburst.get_types()): + return True + if os_type is None or os_version is None: + # we'll use the defaults provided by provision.create_if_vm + # later on during provisioning + return True + valid_os_and_version = \ + teuthology.provision.downburst.get_distro_from_downburst() + if os_type not in valid_os_and_version: + log.error("os-type '%s' is invalid. Try one of: %s", + os_type, + ', '.join(valid_os_and_version.keys())) + return False + if not validate_distro_version(os_version, + valid_os_and_version[os_type]): + log.error( + "os-version '%s' is invalid for os-type '%s'. Try one of: %s", + os_version, + os_type, + ', '.join(valid_os_and_version[os_type])) + return False + return True + + +def validate_distro_version(version, supported_versions): + """ + Return True if the version is valid. For Ubuntu, possible + supported version values are of the form '12.04 (precise)' where + either the number of the version name is acceptable. + """ + if version in supported_versions: + return True + for parts in supported_versions: + part = parts.split('(') + if len(part) == 2: + if version == part[0]: + return True + if version == part[1][0:len(part[1])-1]: + return True + + +def json_matching_statuses(json_file_or_str, statuses): + """ + Filter statuses by json dict in file or fragment; return list of + matching statuses. json_file_or_str must be a file containing + json or json in a string. 
+ """ + try: + open(json_file_or_str, 'r') + except IOError: + query = json.loads(json_file_or_str) + else: + query = json.load(json_file_or_str) + + if not isinstance(query, dict): + raise RuntimeError('--json-query must be a dict') + + return_statuses = list() + for status in statuses: + for k, v in query.items(): + if not misc.is_in_dict(k, v, status): + break + else: + return_statuses.append(status) + + return return_statuses + + +def winnow(statuses, arg, status_key, func=None): + """ + Call with a list of statuses, and the ctx. + 'arg' that you may want to filter by. + If arg is not None, filter statuses by either: + + 1) func=None: filter by status[status_key] == arg + remove any status that fails + + 2) func=: remove any + status for which func returns False + + Return the possibly-smaller set of statuses. + """ + + if arg is not None: + if func: + statuses = [_status for _status in statuses + if func(_status)] + else: + statuses = [_status for _status in statuses + if _status[status_key] == arg] + + return statuses + + +def locked_since_seconds(node): + now = datetime.datetime.now() + since = datetime.datetime.strptime( + node['locked_since'], '%Y-%m-%d %H:%M:%S.%f') + return (now - since).total_seconds() + + diff --git a/teuthology/ls.py b/teuthology/ls.py new file mode 100644 index 000000000..de8e6d4bc --- /dev/null +++ b/teuthology/ls.py @@ -0,0 +1,69 @@ +from __future__ import print_function + +import os +import yaml +import errno +import re + +from teuthology.job_status import get_status + + +def main(args): + return ls(args[""], args["--verbose"]) + + +def ls(archive_dir, verbose): + for j in get_jobs(archive_dir): + job_dir = os.path.join(archive_dir, j) + summary = {} + try: + with open(os.path.join(job_dir, 'summary.yaml')) as f: + g = yaml.safe_load_all(f) + for new in g: + summary.update(new) + except IOError as e: + if e.errno == errno.ENOENT: + print_debug_info(j, job_dir, archive_dir) + continue + else: + raise + + print("{job} {status} 
{owner} {desc} {duration}s".format( + job=j, + owner=summary.get('owner', '-'), + desc=summary.get('description', '-'), + status=get_status(summary), + duration=int(summary.get('duration', 0)), + )) + if verbose and 'failure_reason' in summary: + print(' {reason}'.format(reason=summary['failure_reason'])) + + +def get_jobs(archive_dir): + dir_contents = os.listdir(archive_dir) + + def is_job_dir(parent, subdir): + if (os.path.isdir(os.path.join(parent, subdir)) and re.match(r'\d+$', + subdir)): + return True + return False + + jobs = [job for job in dir_contents if is_job_dir(archive_dir, job)] + return sorted(jobs) + + +def print_debug_info(job, job_dir, archive_dir): + print('%s ' % job, end='') + + try: + log_path = os.path.join(archive_dir, job, 'teuthology.log') + if os.path.exists(log_path): + tail = os.popen( + 'tail -1 %s' % log_path + ).read().rstrip() + print(tail, end='') + else: + print('', end='') + except IOError: + pass + print('') diff --git a/teuthology/misc.py b/teuthology/misc.py new file mode 100644 index 000000000..d4c619cc5 --- /dev/null +++ b/teuthology/misc.py @@ -0,0 +1,1317 @@ +""" +Miscellaneous teuthology functions. +Used by other modules, but mostly called from tasks. 
+""" +import argparse +import os +import logging +import configobj +import getpass +import shutil +import socket +import subprocess +import tarfile +import time +import yaml +import json +import re +from sys import stdin +import pprint +import datetime + +from tarfile import ReadError + +from typing import Optional, TypeVar + +from teuthology.util.compat import urljoin, urlopen, HTTPError + +from netaddr.strategy.ipv4 import valid_str as _is_ipv4 +from netaddr.strategy.ipv6 import valid_str as _is_ipv6 +from teuthology import safepath +from teuthology.exceptions import (CommandCrashedError, CommandFailedError, + ConnectionLostError) +from teuthology.orchestra import run +from teuthology.config import config +from teuthology.contextutil import safe_while +from teuthology.orchestra.opsys import DEFAULT_OS_VERSION + + +log = logging.getLogger(__name__) + +stamp = datetime.datetime.now().strftime("%y%m%d%H%M") + +is_arm = lambda x: x.startswith('tala') or x.startswith( + 'ubuntu@tala') or x.startswith('saya') or x.startswith('ubuntu@saya') + +hostname_expr_templ = '(?P.*@)?(?P.*){lab_domain}' + +def host_shortname(hostname): + if _is_ipv4(hostname) or _is_ipv6(hostname): + return hostname + else: + return hostname.split('.', 1)[0] + +def canonicalize_hostname(hostname, user: Optional[str] ='ubuntu'): + hostname_expr = hostname_expr_templ.format( + lab_domain=config.lab_domain.replace('.', r'\.')) + match = re.match(hostname_expr, hostname) + if _is_ipv4(hostname) or _is_ipv6(hostname): + return "%s@%s" % (user, hostname) + if match: + match_d = match.groupdict() + shortname = match_d['shortname'] + if user is None: + user_ = user + else: + user_ = match_d.get('user') or user + else: + shortname = host_shortname(hostname) + user_ = user + + user_at = user_.strip('@') + '@' if user_ else '' + domain = config.lab_domain + if domain and not shortname.endswith('.'): + domain = '.' 
+ domain + ret = '{user_at}{short}{domain}'.format( + user_at=user_at, + short=shortname, + domain=domain, + ) + return ret + + +def decanonicalize_hostname(hostname): + lab_domain = '' + if config.lab_domain: + lab_domain=r'\.' + config.lab_domain.replace('.', r'\.') + hostname_expr = hostname_expr_templ.format(lab_domain=lab_domain) + match = re.match(hostname_expr, hostname) + if match: + hostname = match.groupdict()['shortname'] + return hostname + + +def config_file(string): + """ + Create a config file + + :param string: name of yaml file used for config. + :returns: Dictionary of configuration information. + """ + config_dict = {} + try: + with open(string) as f: + g = yaml.safe_load_all(f) + for new in g: + config_dict.update(new) + except IOError as e: + raise argparse.ArgumentTypeError(str(e)) + return config_dict + + +def merge_configs(config_paths) -> dict: + """ Takes one or many paths to yaml config files and merges them + together, returning the result. + """ + conf_dict = dict() + for conf_path in config_paths: + if conf_path == "-": + partial_dict = yaml.safe_load(stdin) + elif not os.path.exists(conf_path): + log.debug("The config path {0} does not exist, skipping.".format(conf_path)) + continue + else: + with open(conf_path) as partial_file: + partial_dict: dict = yaml.safe_load(partial_file) + try: + conf_dict = deep_merge(conf_dict, partial_dict) + except Exception: + # TODO: Should this log as well? 
+ pprint.pprint("failed to merge {0} into {1}".format(conf_dict, partial_dict)) + raise + + return conf_dict + + +def get_testdir(ctx=None): + """ + :param ctx: Unused; accepted for compatibility + :returns: A test directory + """ + if 'test_path' in config: + return config['test_path'] + return config.get( + 'test_path', + '/home/%s/cephtest' % get_test_user() + ) + + +def get_test_user(ctx=None): + """ + :param ctx: Unused; accepted for compatibility + :returns: str -- the user to run tests as on remote hosts + """ + return config.get('test_user', 'ubuntu') + + +def get_archive_dir(ctx): + """ + :returns: archive directory (a subdirectory of the test directory) + """ + test_dir = get_testdir(ctx) + return os.path.normpath(os.path.join(test_dir, 'archive')) + + +def get_http_log_path(archive_dir, job_id=None): + """ + :param archive_dir: directory to be searched + :param job_id: id of job that terminates the name of the log path + :returns: http log path + """ + http_base = config.archive_server + if not http_base: + return None + + sep = os.path.sep + archive_dir = archive_dir.rstrip(sep) + archive_subdir = archive_dir.split(sep)[-1] + if archive_subdir.endswith(str(job_id)): + archive_subdir = archive_dir.split(sep)[-2] + + if job_id is None: + return os.path.join(http_base, archive_subdir, '') + return os.path.join(http_base, archive_subdir, str(job_id), '') + + +def get_results_url(run_name, job_id=None): + """ + :param run_name: The name of the test run + :param job_id: The job_id of the job. Optional. + :returns: URL to the run (or job, if job_id is passed) in the results web + UI. For example, Inktank uses Pulpito. 
+ """ + if not config.results_ui_server: + return None + base_url = config.results_ui_server + + if job_id is None: + return os.path.join(base_url, run_name, '') + return os.path.join(base_url, run_name, str(job_id), '') + + +def get_ceph_binary_url(package=None, + branch=None, tag=None, sha1=None, dist=None, + flavor=None, format=None, arch=None): + """ + return the url of the ceph binary found on gitbuildder. + """ + BASE = 'http://{host}/{package}-{format}-{dist}-{arch}-{flavor}/'.format( + host=config.gitbuilder_host, + package=package, + flavor=flavor, + arch=arch, + format=format, + dist=dist + ) + + if sha1 is not None: + assert branch is None, "cannot set both sha1 and branch" + assert tag is None, "cannot set both sha1 and tag" + else: + # gitbuilder uses remote-style ref names for branches, mangled to + # have underscores instead of slashes; e.g. origin_main + if tag is not None: + ref = tag + assert branch is None, "cannot set both branch and tag" + else: + if branch is None: + branch = 'main' + ref = branch + + sha1_url = urljoin(BASE, 'ref/{ref}/sha1'.format(ref=ref)) + log.debug('Translating ref to sha1 using url %s', sha1_url) + + try: + sha1_fp = urlopen(sha1_url) + sha1 = sha1_fp.read().rstrip('\n') + sha1_fp.close() + except HTTPError as e: + log.error('Failed to get url %s', sha1_url) + raise e + + log.debug('Using %s %s sha1 %s', package, format, sha1) + bindir_url = urljoin(BASE, 'sha1/{sha1}/'.format(sha1=sha1)) + return (sha1, bindir_url) + + +def feed_many_stdins(fp, processes): + """ + :param fp: input file + :param processes: list of processes to be written to. + """ + while True: + data = fp.read(8192) + if not data: + break + for proc in processes: + proc.stdin.write(data) + + +def feed_many_stdins_and_close(fp, processes): + """ + Feed many and then close processes. + + :param fp: input file + :param processes: list of processes to be written to. 
+ """ + feed_many_stdins(fp, processes) + for proc in processes: + proc.stdin.close() + + +def get_mons(roles, ips, + mon_bind_msgr2=False, + mon_bind_addrvec=False): + """ + Get monitors and their associated addresses + """ + mons = {} + mon_ports = {} + mon_id = 0 + is_mon = is_type('mon') + for idx, roles in enumerate(roles): + for role in roles: + if not is_mon(role): + continue + if ips[idx] not in mon_ports: + mon_ports[ips[idx]] = 6789 + else: + mon_ports[ips[idx]] += 1 + if mon_bind_msgr2: + assert mon_bind_addrvec + addr = 'v2:{ip}:{port},v1:{ip}:{port2}'.format( + ip=ips[idx], + port=mon_ports[ips[idx]], + port2=mon_ports[ips[idx]] + 1, + ) + mon_ports[ips[idx]] += 1 + elif mon_bind_addrvec: + addr = 'v1:{ip}:{port}'.format( + ip=ips[idx], + port=mon_ports[ips[idx]], + ) + else: + addr = '{ip}:{port}'.format( + ip=ips[idx], + port=mon_ports[ips[idx]], + ) + mon_id += 1 + mons[role] = addr + assert mons + return mons + + +def skeleton_config(ctx, roles, ips, cluster='ceph', + mon_bind_msgr2=False, + mon_bind_addrvec=False): + """ + Returns a ConfigObj that is prefilled with a skeleton config. + + Use conf[section][key]=value or conf.merge to change it. + + Use conf.write to write it out, override .filename first if you want. 
+ """ + path = os.path.join(os.path.dirname(__file__), 'ceph.conf.template') + conf = configobj.ConfigObj(path, file_error=True) + mons = get_mons(roles=roles, ips=ips, + mon_bind_msgr2=mon_bind_msgr2, + mon_bind_addrvec=mon_bind_addrvec) + for role, addr in mons.items(): + mon_cluster, _, _ = split_role(role) + if mon_cluster != cluster: + continue + name = ceph_role(role) + conf.setdefault(name, {}) + conf[name]['mon addr'] = addr + # set up standby mds's + is_mds = is_type('mds', cluster) + for roles_subset in roles: + for role in roles_subset: + if is_mds(role): + name = ceph_role(role) + conf.setdefault(name, {}) + if '-s-' in name: + standby_mds = name[name.find('-s-') + 3:] + conf[name]['mds standby for name'] = standby_mds + return conf + + +def ceph_role(role): + """ + Return the ceph name for the role, without any cluster prefix, e.g. osd.0. + """ + _, type_, id_ = split_role(role) + return type_ + '.' + id_ + + +def split_role(role): + """ + Return a tuple of cluster, type, and id + If no cluster is included in the role, the default cluster, 'ceph', is used + """ + cluster = 'ceph' + if role.count('.') > 1: + cluster, role = role.split('.', 1) + type_, id_ = role.split('.', 1) + return cluster, type_, id_ + + +def roles_of_type(roles_for_host, type_): + """ + Generator of ids. + + Each call returns the next possible role of the type specified. + :param roles_for_host: list of roles possible + :param type_: type of role + """ + for role in cluster_roles_of_type(roles_for_host, type_, None): + _, _, id_ = split_role(role) + yield id_ + + +def cluster_roles_of_type(roles_for_host, type_, cluster): + """ + Generator of roles. + + Each call returns the next possible role of the type specified. 
+ :param roles_for_host: list of roles possible + :param type_: type of role + :param cluster: cluster name + """ + is_type_in_cluster = is_type(type_, cluster) + for role in roles_for_host: + if not is_type_in_cluster(role): + continue + yield role + + +def all_roles(cluster): + """ + Generator of role values. Each call returns another role. + + :param cluster: Cluster extracted from the ctx. + """ + for _, roles_for_host in cluster.remotes.items(): + for name in roles_for_host: + yield name + + +def all_roles_of_type(cluster, type_): + """ + Generator of role values. Each call returns another role of the + type specified. + + :param cluster: Cluster extracted from the ctx. + :param type_: role type + """ + for _, roles_for_host in cluster.remotes.items(): + for id_ in roles_of_type(roles_for_host, type_): + yield id_ + + +def is_type(type_, cluster=None): + """ + Returns a matcher function for whether role is of type given. + + :param cluster: cluster name to check in matcher (default to no check for cluster) + """ + def _is_type(role): + """ + Return type based on the starting role name. + + If there is more than one period, strip the first part + (ostensibly a cluster name) and check the remainder for the prefix. + """ + role_cluster, role_type, _ = split_role(role) + if cluster is not None and role_cluster != cluster: + return False + return role_type == type_ + return _is_type + + +def num_instances_of_type(cluster, type_, ceph_cluster='ceph'): + """ + Total the number of instances of the role type specified in all remotes. + + :param cluster: Cluster extracted from ctx. 
+ :param type_: role + :param ceph_cluster: filter for ceph cluster name + """ + remotes_and_roles = cluster.remotes.items() + roles = [roles for (remote, roles) in remotes_and_roles] + is_ceph_type = is_type(type_, ceph_cluster) + num = sum(sum(1 for role in hostroles if is_ceph_type(role)) + for hostroles in roles) + return num + + +def create_simple_monmap(ctx, remote, conf, path=None, + mon_bind_addrvec=False): + """ + Writes a simple monmap based on current ceph.conf into path, or + /monmap by default. + + Assumes ceph_conf is up to date. + + Assumes mon sections are named "mon.*", with the dot. + + :return the FSID (as a string) of the newly created monmap + """ + def gen_addresses(): + """ + Monitor address generator. + + Each invocation returns the next monitor address + """ + for section, data in conf.items(): + PREFIX = 'mon.' + if not section.startswith(PREFIX): + continue + name = section[len(PREFIX):] + addr = data['mon addr'] + yield (name, addr) + + addresses = list(gen_addresses()) + assert addresses, "There are no monitors in config!" + log.debug('Ceph mon addresses: %s', addresses) + + testdir = get_testdir(ctx) + args = [ + 'adjust-ulimits', + 'ceph-coverage', + '{tdir}/archive/coverage'.format(tdir=testdir), + 'monmaptool', + '--create', + '--clobber', + ] + for (name, addr) in addresses: + if mon_bind_addrvec: + args.extend(('--addv', name, addr)) + else: + args.extend(('--add', name, addr)) + if not path: + path = '{tdir}/monmap'.format(tdir=testdir) + args.extend([ + '--print', + path + ]) + + monmap_output = remote.sh(args) + fsid = re.search("generated fsid (.+)$", + monmap_output, re.MULTILINE).group(1) + return fsid + + +def write_file(remote, path, data): + """ + Write data to a remote file + + :param remote: Remote site. + :param path: Path on the remote being written to. + :param data: Data to be written. 
+ """ + remote.write_file(path, data) + + +def sudo_write_file(remote, path, data, perms=None, owner=None): + """ + Write data to a remote file as super user + + :param remote: Remote site. + :param path: Path on the remote being written to. + :param data: Data to be written. + :param perms: Permissions on the file being written + :param owner: Owner for the file being written + + Both perms and owner are passed directly to chmod. + """ + remote.sudo_write_file(path, data, mode=perms, owner=owner) + + +def copy_file(from_remote, from_path, to_remote, to_path=None): + """ + Copies a file from one remote to another. + """ + if to_path is None: + to_path = from_path + from_remote.run(args=[ + 'sudo', 'scp', '-v', from_path, "{host}:{file}".format( + host=to_remote.name, file=to_path) + ]) + + +def move_file(remote, from_path, to_path, sudo=False, preserve_perms=True): + """ + Move a file from one path to another on a remote site + + If preserve_perms is true, the contents of the destination file (to_path, + which must already exist in this case) are replaced with the contents of the + source file (from_path) and the permissions of to_path are preserved. If + preserve_perms is false, to_path does not need to exist, and is simply + clobbered if it does. + """ + if preserve_perms: + args = [] + if sudo: + args.append('sudo') + args.extend([ + 'stat', + '-c', + '\"%a\"', + to_path + ]) + perms = remote.sh(args).rstrip().strip('\"') + + args = [] + if sudo: + args.append('sudo') + args.extend([ + 'mv', + '--', + from_path, + to_path, + ]) + remote.sh(args) + + if preserve_perms: + # reset the file back to the original permissions + args = [] + if sudo: + args.append('sudo') + args.extend([ + 'chmod', + perms, + to_path, + ]) + remote.sh(args) + + +def delete_file(remote, path, sudo=False, force=False, check=True): + """ + rm a file on a remote site. Use force=True if the call should succeed even + if the file is absent or rm path would otherwise fail. 
+ """ + args = [] + if sudo: + args.append('sudo') + args.extend(['rm']) + if force: + args.extend(['-f']) + args.extend([ + '--', + path, + ]) + remote.sh(args, check_status=check) + + +def remove_lines_from_file(remote, path, line_is_valid_test, + string_to_test_for): + """ + Remove lines from a file. This involves reading the file in, removing + the appropriate lines, saving the file, and then replacing the original + file with the new file. Intermediate files are used to prevent data loss + on when the main site goes up and down. + """ + # read in the specified file + in_data = remote.read_file(path, False).decode() + out_data = "" + + first_line = True + # use the 'line_is_valid_test' function to remove unwanted lines + for line in in_data.split('\n'): + if line_is_valid_test(line, string_to_test_for): + if not first_line: + out_data += '\n' + else: + first_line = False + + out_data += '{line}'.format(line=line) + + else: + log.info('removing line: {bad_line}'.format(bad_line=line)) + + # get a temp file path on the remote host to write to, + # we don't want to blow away the remote file and then have the + # network drop out + temp_file_path = remote.mktemp() + + # write out the data to a temp file + write_file(remote, temp_file_path, out_data) + + # then do a 'mv' to the actual file location + move_file(remote, temp_file_path, path) + + +def append_lines_to_file(remote, path, lines, sudo=False): + """ + Append lines to a file. + """ + remote.write_file(path, lines, append=True, sudo=sudo) + +def prepend_lines_to_file(remote, path, lines, sudo=False): + """ + Prepend lines to a file. + An intermediate file is used in the same manner as in + Remove_lines_from_list. 
+ """ + + temp_file_path = remote.mktemp() + remote.write_file(temp_file_path, lines) + remote.copy_file(path, temp_file_path, append=True, sudo=sudo) + remote.move_file(temp_file_path, path, sudo=sudo) + + +def create_file(remote, path, data="", permissions=str(644), sudo=False): + """ + Create a file on the remote host. + """ + args = [] + if sudo: + args.append('sudo') + args.extend([ + 'touch', + path, + run.Raw('&&') + ]) + if sudo: + args.append('sudo') + args.extend([ + 'chmod', + permissions, + '--', + path + ]) + remote.sh(args) + # now write out the data if any was passed in + if "" != data: + append_lines_to_file(remote, path, data, sudo) + + +def get_file(remote, path, sudo=False, dest_dir='/tmp'): + """ + Get the contents of a remote file. Do not use for large files; use + Remote.get_file() instead. + """ + local_path = remote.get_file(path, sudo=sudo, dest_dir=dest_dir) + with open(local_path, 'rb') as file_obj: + file_data = file_obj.read() + os.remove(local_path) + return file_data + + +def copy_fileobj(src, tarinfo, local_path): + with open(local_path, 'wb') as dest: + shutil.copyfileobj(src, dest) + + +def pull_directory(remote, remotedir, localdir, write_to=copy_fileobj): + """ + Copy a remote directory to a local directory. + + :param remote: the remote object representing the remote host from where + the specified directory is pulled + :param remotedir: the source directory on remote host + :param localdir: the destination directory on localhost + :param write_to: optional function to write the file to localdir. 
+ its signature should be: + func(src: fileobj, + tarinfo: tarfile.TarInfo, + local_path: str) + """ + log.debug('Transferring archived files from %s:%s to %s', + remote.shortname, remotedir, localdir) + if not os.path.exists(localdir): + os.mkdir(localdir) + r = remote.get_tar_stream(remotedir, sudo=True, compress=False) + tar = tarfile.open(mode='r|', fileobj=r.stdout) + while True: + ti = tar.next() + if ti is None: + break + + if ti.isdir(): + # ignore silently; easier to just create leading dirs below + # XXX this mean empty dirs are not transferred + pass + elif ti.isfile(): + sub = safepath.munge(ti.name) + safepath.makedirs(root=localdir, path=os.path.dirname(sub)) + with tar.extractfile(ti) as src: + write_to(src, ti, os.path.join(localdir, sub)) + else: + if ti.isdev(): + type_ = 'device' + elif ti.issym(): + type_ = 'symlink' + elif ti.islnk(): + type_ = 'hard link' + else: + type_ = 'unknown' + log.info('Ignoring tar entry: %r type %r', ti.name, type_) + + +def pull_directory_tarball(remote, remotedir, localfile): + """ + Copy a remote directory to a local tarball. 
+ """ + log.debug('Transferring archived files from %s:%s to %s', + remote.shortname, remotedir, localfile) + remote.get_tar(remotedir, localfile, sudo=True) + + +def get_wwn_id_map(remote, devs): + log.warning("Entering get_wwn_id_map, a deprecated function that will be removed") + return dict((d, d) for d in devs) + + +def get_scratch_devices(remote): + """ + Read the scratch disk list from remote host + """ + devs = [] + try: + file_data = remote.read_file("/scratch_devs").decode() + devs = file_data.split() + except Exception: + devs = remote.sh('ls /dev/[sv]d?').strip().split('\n') + + # Remove root device (vm guests) from the disk list + for dev in devs: + if 'vda' in dev: + devs.remove(dev) + log.warning("Removing root device: %s from device list" % dev) + + log.debug('devs={d}'.format(d=devs)) + + retval = [] + for dev in devs: + dev_checks = [ + [['stat', dev], "does not exist"], + [['sudo', 'dd', 'if=%s' % dev, 'of=/dev/null', 'count=1'], "is not readable"], + [ + [run.Raw('!'), 'mount', run.Raw('|'), 'grep', '-v', 'devtmpfs', run.Raw('|'), + 'grep', '-q', dev], + "is in use" + ], + ] + for args, msg in dev_checks: + try: + remote.run(args=args) + except CommandFailedError: + log.debug(f"get_scratch_devices: {dev} {msg}") + break + else: + retval.append(dev) + continue + break + return retval + + +def wait_until_healthy(ctx, remote, ceph_cluster='ceph', use_sudo=False): + """ + Wait until a Ceph cluster is healthy. Give up after 15min. 
+ """ + testdir = get_testdir(ctx) + # when cluster is setup using ceph-deploy or ansible + # access to admin key is readonly for ceph user + cmd = ['ceph', '--cluster', ceph_cluster, 'health'] + if use_sudo: + cmd.insert(0, 'sudo') + args = ['adjust-ulimits', + 'ceph-coverage', + '{tdir}/archive/coverage'.format(tdir=testdir)] + args.extend(cmd) + with safe_while(tries=(900 // 6), action="wait_until_healthy") as proceed: + while proceed(): + out = remote.sh(args, logger=log.getChild('health')) + log.debug('Ceph health: %s', out.rstrip('\n')) + if out.split(None, 1)[0] == 'HEALTH_OK': + break + time.sleep(1) + + +def wait_until_osds_up(ctx, cluster, remote, ceph_cluster='ceph'): + """Wait until all Ceph OSDs are booted.""" + num_osds = num_instances_of_type(cluster, 'osd', ceph_cluster) + testdir = get_testdir(ctx) + with safe_while(sleep=6, tries=90) as proceed: + while proceed(): + daemons = ctx.daemons.iter_daemons_of_role('osd', ceph_cluster) + for daemon in daemons: + daemon.check_status() + out = remote.sh( + [ + 'adjust-ulimits', + 'ceph-coverage', + '{tdir}/archive/coverage'.format(tdir=testdir), + 'ceph', + '--cluster', ceph_cluster, + 'osd', 'dump', '--format=json' + ], + logger=log.getChild('health'), + ) + j = json.loads('\n'.join(out.split('\n')[1:])) + up = sum(1 for o in j['osds'] if 'up' in o['state']) + log.debug('%d of %d OSDs are up' % (up, num_osds)) + if up == num_osds: + break + + +def reboot(node, timeout=300, interval=30): + """ + Reboots a given system, then waits for it to come back up and + re-establishes the ssh connection. + + :param node: The teuthology.orchestra.remote.Remote object of the node + :param timeout: The amount of time, in seconds, after which to give up + waiting for the node to return + :param interval: The amount of time, in seconds, to wait between attempts + to re-establish with the node. This should not be set to + less than maybe 10, to make sure the node actually goes + down first. 
+ """ + log.info("Rebooting {host}...".format(host=node.hostname)) + node.run(args=['sudo', 'shutdown', '-r', 'now']) + reboot_start_time = time.time() + while time.time() - reboot_start_time < timeout: + time.sleep(interval) + if node.is_online or node.reconnect(): + return + raise RuntimeError( + "{host} did not come up after reboot within {time}s".format( + host=node.hostname, time=timeout)) + + +def reconnect(ctx, timeout, remotes=None): + """ + Connect to all the machines in ctx.cluster. + + Presumably, some of them won't be up. Handle this + by waiting for them, unless the wait time exceeds + the specified timeout. + + ctx needs to contain the cluster of machines you + wish it to try and connect to, as well as a config + holding the ssh keys for each of them. As long as it + contains this data, you can construct a context + that is a subset of your full cluster. + """ + log.info('Re-opening connections...') + starttime = time.time() + + if remotes: + need_reconnect = remotes + else: + need_reconnect = list(ctx.cluster.remotes.keys()) + + while need_reconnect: + for remote in need_reconnect: + log.info('trying to connect to %s', remote.name) + success = remote.reconnect() + if not success: + if time.time() - starttime > timeout: + raise RuntimeError("Could not reconnect to %s" % + remote.name) + else: + need_reconnect.remove(remote) + + log.debug('waited {elapsed}'.format( + elapsed=str(time.time() - starttime))) + time.sleep(1) + + +def get_clients(ctx, roles): + """ + return all remote roles that are clients. + """ + for role in roles: + assert isinstance(role, str) + assert 'client.' in role + _, _, id_ = split_role(role) + (remote,) = ctx.cluster.only(role).remotes.keys() + yield (id_, remote) + + +def get_user(): + """ + Return the username in the format user@host. 
+ """ + return getpass.getuser() + '@' + socket.gethostname() + + +def get_mon_names(ctx, cluster='ceph'): + """ + :returns: a list of monitor names + """ + is_mon = is_type('mon', cluster) + host_mons = [[role for role in roles if is_mon(role)] + for roles in ctx.cluster.remotes.values()] + return [mon for mons in host_mons for mon in mons] + + +def get_first_mon(ctx, config, cluster='ceph'): + """ + return the "first" mon role (alphanumerically, for lack of anything better) + """ + mons = get_mon_names(ctx, cluster) + if mons: + return sorted(mons)[0] + assert False, 'no mon for cluster found' + + +def replace_all_with_clients(cluster, config): + """ + Converts a dict containing a key all to one + mapping all clients to the value of config['all'] + """ + assert isinstance(config, dict), 'config must be a dict' + if 'all' not in config: + return config + norm_config = {} + assert len(config) == 1, \ + "config cannot have 'all' and specific clients listed" + for client in all_roles_of_type(cluster, 'client'): + norm_config['client.{id}'.format(id=client)] = config['all'] + return norm_config + + +DeepMerge = TypeVar('DeepMerge') +def deep_merge(a: DeepMerge, b: DeepMerge) -> DeepMerge: + """ + Deep Merge. If a and b are both lists, all elements in b are + added into a. If a and b are both dictionaries, elements in b are + recursively added to a. + :param a: object items will be merged into + :param b: object items will be merged from + """ + if b is None: + return a + if a is None: + return deep_merge(b.__class__(), b) + if isinstance(a, list): + assert isinstance(b, list) + a.extend(b) + return a + if isinstance(a, dict): + assert isinstance(b, dict) + for (k, v) in b.items(): + a[k] = deep_merge(a.get(k), v) + return a + return b + +def update_key(key_to_update, a: dict, b: dict): + """ + Update key (`key_to_update`) of dict `a` on all levels + to the values of same key in `b` dict. 
+ """ + for key, value in b.items(): + if key == key_to_update: + a[key] = value + elif isinstance(value, dict): + if key in a and isinstance(a[key], dict): + update_key(key_to_update, a[key], value) + +def ssh_keyscan(hostnames, _raise=True): + """ + Fetch the SSH public key of one or more hosts + + :param hostnames: A list of hostnames, or a dict keyed by hostname + :param _raise: Whether to raise an exception if not all keys are retrieved + :returns: A dict keyed by hostname, with the host keys as values + """ + if not isinstance(hostnames, list) and not isinstance(hostnames, dict): + raise TypeError("'hostnames' must be a list") + hostnames = [canonicalize_hostname(name, user=None) for name in + hostnames] + keys_dict = dict() + for hostname in hostnames: + with safe_while( + sleep=1, + tries=15 if _raise else 1, + increment=1, + _raise=_raise, + action="ssh_keyscan " + hostname, + ) as proceed: + while proceed(): + key = _ssh_keyscan(hostname) + if key: + keys_dict[hostname] = key + break + if len(keys_dict) != len(hostnames): + missing = set(hostnames) - set(keys_dict.keys()) + msg = "Unable to scan these host keys: %s" % ' '.join(missing) + if not _raise: + log.warning(msg) + else: + raise RuntimeError(msg) + return keys_dict + + +def _ssh_keyscan(hostname): + """ + Fetch the SSH public key of one or more hosts + + :param hostname: The hostname + :returns: The host key + """ + args = ['ssh-keyscan', '-T', '1', hostname] + p = subprocess.Popen( + args=args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + p.wait() + for line in p.stderr: + line = line.decode() + line = line.strip() + if line and not line.startswith('#'): + log.error(line) + keys = list() + for line in p.stdout: + host, key = line.strip().decode().split(' ', 1) + keys.append(key) + if len(keys) > 0: + return sorted(keys)[0] + + +def ssh_keyscan_wait(hostname): + """ + Run ssh-keyscan against a host, return True if it succeeds, + False otherwise. Try again if ssh-keyscan timesout. 
+ :param hostname: on which ssh-keyscan is run + """ + with safe_while(sleep=6, tries=100, _raise=False, + action="ssh_keyscan_wait " + hostname) as proceed: + success = False + while proceed(): + key = _ssh_keyscan(hostname) + if key: + success = True + break + log.info("try ssh_keyscan again for " + str(hostname)) + return success + +def stop_daemons_of_type(ctx, type_, cluster='ceph', timeout=300): + """ + :param type_: type of daemons to be stopped. + :param cluster: Cluster name, default is 'ceph'. + :param timeout: Timeout in seconds for stopping each daemon. + """ + log.info('Shutting down %s daemons...' % type_) + exc = None + for daemon in ctx.daemons.iter_daemons_of_role(type_, cluster): + try: + daemon.stop(timeout) + except (CommandFailedError, + CommandCrashedError, + ConnectionLostError) as e: + exc = e + log.exception('Saw exception from %s.%s', daemon.role, daemon.id_) + if exc is not None: + raise exc + + +def get_system_type(remote, distro=False, version=False): + """ + If distro, return distro. + If version, return version + If both, return both. + If neither, return 'deb' or 'rpm' if distro is known to be one of those + """ + if version: + version = remote.os.version + if distro and version: + return remote.os.name, version + if distro: + return remote.os.name + if version: + return version + return remote.os.package_type + +def get_pkg_type(os_type): + if os_type in ('centos', 'fedora', 'opensuse', 'rhel', 'sle'): + return 'rpm' + else: + return 'deb' + +def get_distro(ctx): + """ + Get the name of the distro that we are using (usually the os_type). + """ + os_type = None + if ctx.os_type: + return ctx.os_type + + try: + os_type = ctx.config.get('os_type', None) + except AttributeError: + pass + + # if os_type is None, return the default of ubuntu + return os_type or "ubuntu" + + +def get_distro_version(ctx): + """ + Get the verstion of the distro that we are using (release number). 
+ """ + distro = get_distro(ctx) + if ctx.os_version is not None: + return str(ctx.os_version) + try: + os_version = ctx.config.get('os_version', DEFAULT_OS_VERSION[distro]) + except AttributeError: + os_version = DEFAULT_OS_VERSION[distro] + return str(os_version) + + +def get_multi_machine_types(machinetype): + """ + Converts machine type string to list based on common deliminators + """ + machinetypes = [] + machine_type_deliminator = [',', ' ', '\t'] + for deliminator in machine_type_deliminator: + if deliminator in machinetype: + machinetypes = machinetype.split(deliminator) + break + if not machinetypes: + machinetypes.append(machinetype) + return machinetypes + + +def is_in_dict(searchkey, searchval, d): + """ + Test if searchkey/searchval are in dictionary. searchval may + itself be a dict, in which case, recurse. searchval may be + a subset at any nesting level (that is, all subkeys in searchval + must be found in d at the same level/nest position, but searchval + is not required to fully comprise d[searchkey]). + + >>> is_in_dict('a', 'foo', {'a':'foo', 'b':'bar'}) + True + + >>> is_in_dict( + ... 'a', + ... {'sub1':'key1', 'sub2':'key2'}, + ... {'a':{'sub1':'key1', 'sub2':'key2', 'sub3':'key3'}} + ... ) + True + + >>> is_in_dict('a', 'foo', {'a':'bar', 'b':'foo'}) + False + + >>> is_in_dict('a', 'foo', {'a':{'a': 'foo'}}) + False + """ + val = d.get(searchkey, None) + if isinstance(val, dict) and isinstance(searchval, dict): + for foundkey, foundval in searchval.items(): + if not is_in_dict(foundkey, foundval, val): + return False + return True + else: + return searchval == val + + +def sh(command, log_limit=1024, cwd=None, env=None): + """ + Run the shell command and return the output in ascii (stderr and + stdout). If the command fails, raise an exception. The command + and its output are logged, on success and on error. 
+ """ + log.debug(":sh: " + command) + proc = subprocess.Popen( + args=command, + cwd=cwd, + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + shell=True, + bufsize=1) + lines = [] + truncated = False + with proc.stdout: + for line in proc.stdout: + line = line.decode() + lines.append(line) + line = line.rstrip() + if len(line) > log_limit: + truncated = True + log.debug(line[:log_limit] + + "... (truncated to the first " + str(log_limit) + + " characters)") + else: + log.debug(line) + output = "".join(lines) + if proc.wait() != 0: + if truncated: + log.error(command + " replay full stdout/stderr" + " because an error occurred and some of" + " it was truncated") + log.error(output) + raise subprocess.CalledProcessError( + returncode=proc.returncode, + cmd=command, + output=output + ) + return output + + +def add_remote_path(ctx, local_dir, remote_dir): + """ + Add key/value pair (local_dir: remote_dir) to job's info.yaml. + These key/value pairs are read to archive them in case of job timeout. + """ + if ctx.archive is None: + return + with open(os.path.join(ctx.archive, 'info.yaml'), 'r+') as info_file: + info_yaml = yaml.safe_load(info_file) + info_file.seek(0) + if 'archive' in info_yaml: + info_yaml['archive'][local_dir] = remote_dir + else: + info_yaml['archive'] = {local_dir: remote_dir} + yaml.safe_dump(info_yaml, info_file, default_flow_style=False) + + +def archive_logs(ctx, remote_path, log_path): + """ + Archive directories from all nodes in a cliuster. It pulls all files in + remote_path dir to job's archive dir under log_path dir. 
+ """ + if ctx.archive is None: + return + path = os.path.join(ctx.archive, 'remote') + os.makedirs(path, exist_ok=True) + for remote in ctx.cluster.remotes.keys(): + sub = os.path.join(path, remote.shortname) + os.makedirs(sub, exist_ok=True) + try: + pull_directory(remote, remote_path, os.path.join(sub, log_path)) + except ReadError: + pass + + +def compress_logs(ctx, remote_dir): + """ + Compress all files in remote_dir from all nodes in a cluster. + """ + log.info('Compressing logs...') + run.wait( + ctx.cluster.run( + args=(f"sudo find {remote_dir} -name *.log -print0 | " + f"sudo xargs --max-args=1 --max-procs=0 --verbose -0 --no-run-if-empty -- gzip -5 --verbose --"), + wait=False, + ), + ) diff --git a/teuthology/nuke/__init__.py b/teuthology/nuke/__init__.py new file mode 100644 index 000000000..9c6eefe18 --- /dev/null +++ b/teuthology/nuke/__init__.py @@ -0,0 +1,20 @@ +import logging + +log = logging.getLogger(__name__) + + +# This is being kept because ceph.git/qa/tasks/cephfs/filesystem.py references it. +def clear_firewall(ctx): + """ + Remove any iptables rules created by teuthology. These rules are + identified by containing a comment with 'teuthology' in it. Non-teuthology + firewall rules are unaffected. + """ + log.info("Clearing teuthology firewall rules...") + ctx.cluster.run( + args=[ + "sudo", "sh", "-c", + "iptables-save | grep -v teuthology | iptables-restore" + ], + ) + log.info("Cleared teuthology firewall rules.") diff --git a/teuthology/openstack/__init__.py b/teuthology/openstack/__init__.py new file mode 100644 index 000000000..54973cfd1 --- /dev/null +++ b/teuthology/openstack/__init__.py @@ -0,0 +1,1400 @@ +# +# Copyright (c) 2015 Red Hat, Inc. 
+# +# Author: Loic Dachary +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+# +import copy +import datetime +import functools +import json +import logging +import operator +import os +import paramiko +import re +import socket +import sys +import subprocess +import tempfile +import teuthology +import time +import yaml +import base64 + +from subprocess import CalledProcessError + +from teuthology.contextutil import safe_while +from teuthology.config import config as teuth_config +from teuthology.config import set_config_attr +from teuthology.orchestra import connection +from teuthology import misc + +from yaml.representer import SafeRepresenter + +try: + from openstack import connection as openstack_connection +except ImportError: + openstack_connection = None + +class cmd_str(str): pass + +def cmd_repr(dumper, data): + scalar = SafeRepresenter.represent_str(dumper, data) + scalar.style ='|' + return scalar + +yaml.add_representer(cmd_str, cmd_repr) + +log = logging.getLogger(__name__) + +class NoFlavorException(Exception): + pass + +def enforce_json_dictionary(something): + if not isinstance(something, dict): + raise Exception( + 'Please pip uninstall --yes cliff-tablib and try again.' + ' Details about this error can be found at' + ' https://bugs.launchpad.net/python-openstackclient/+bug/1510546' + ' you are encouraged to add a comment if you want it to be' + ' fixed.') + +def create_connection(): + if openstack_connection is None: + raise RuntimeError( + "Did not find required openstack dependencies. 
" + f"Try: {sys.executable} -m pip install -e .[openstack]" + ) + return openstack_connection.from_config(cloud=None) + + +class OpenStackInstance(object): + def __init__(self, name_or_id, info=None): + self.name_or_id = name_or_id + self.private_or_floating_ip = None + self.private_ip = None + self.info = info + self.conn = create_connection() + if info is None: + self.set_info() + else: + self.info = {k.lower(): v for k, v in info.items()} + if isinstance(self.info, dict) and self.info.get('status', '') == 'ERROR': + errmsg = 'VM creation failed' + if 'message' in self.info: + errmsg = '{}: {}'.format(errmsg, self.info['message']) + raise Exception(errmsg) + + def set_info(self): + try: + server = self.conn.compute.find_server(self.name_or_id) + if server: + self.info = {k.lower(): v for k, v in server.to_dict().items()} + except CalledProcessError: + self.info = None + + def __getitem__(self, name): + return self.info[name.lower()] + + def get_created(self): + now = datetime.datetime.now() + created = datetime.datetime.strptime( + self['created'], '%Y-%m-%dT%H:%M:%SZ') + return (now - created).total_seconds() + + def exists(self): + return self.info is not None + + def get_volumes(self): + """ + Return the uuid of the volumes attached to the name_or_id + OpenStack instance. + """ + volumes = self['os-extended-volumes:volumes_attached'] + return [volume['id'] for volume in volumes ] + + def get_addresses(self): + """ + Return the list of IPs associated with instance_id in OpenStack. 
+ """ + with safe_while(sleep=2, tries=30, + action="get ip " + self['id']) as proceed: + while proceed(): + found = re.match(r'.*\d+', self['addresses']) + if found: + return self['addresses'] + self.set_info() + + def get_ip_neutron(self): + conn = OpenStack().conn + subnets = [subnet.id for subnet in conn.network.subnets() if subnet.ip_version == 4] + if not subnets: + raise Exception("No subnet with ip_version == 4 found") + ports = conn.network.ports(device_id=self['id']) + for port in ports: + for fixed_ip in port.fixed_ips: + if fixed_ip.get('subnet_id') in subnets: + return fixed_ip['ip_address'] + + raise Exception("No IP found for instance") + + def get_ip(self, network): + """ + Return the private IP of the OpenStack instance_id. + """ + if self.private_ip is None: + try: + self.private_ip = self.get_ip_neutron() + except Exception as e: + log.debug("ignoring get_ip_neutron exception " + str(e)) + self.private_ip = re.findall(network + r'=([\d.]+)', + self.get_addresses())[0] + return self.private_ip + + def get_floating_ip(self): + ips = TeuthologyOpenStack.get_os_floating_ips() + for ip in ips: + if ip['Fixed IP Address'] == self.get_ip(''): + return ip['Floating IP Address'] + return None + + def get_floating_ip_or_ip(self): + if not self.private_or_floating_ip: + self.private_or_floating_ip = self.get_floating_ip() + if not self.private_or_floating_ip: + self.private_or_floating_ip = self.get_ip('') + return self.private_or_floating_ip + + def destroy(self): + """ + Delete the name_or_id OpenStack instance. 
+ """ + if not self.exists(): + return True + volumes = self.get_volumes() + OpenStack().run("server set --name REMOVE-ME-" + self.name_or_id + + " " + self['id']) + OpenStack().run("server delete --wait " + self['id'] + + " || true") + for volume in volumes: + OpenStack().volume_delete(volume) + return True + + +class OpenStack(object): + + # http://cdimage.debian.org/cdimage/openstack/current/ + # https://cloud-images.ubuntu.com/precise/current/precise-server-cloudimg-amd64-disk1.img etc. + # http://download.opensuse.org/repositories/Cloud:/Images:/openSUSE_13.2/images/openSUSE-13.2-OpenStack-Guest.x86_64.qcow2 + # http://cloud.centos.org/centos/7/images/CentOS-7-x86_64-GenericCloud.qcow2 etc. + # http://cloud.centos.org/centos/6/images/CentOS-6-x86_64-GenericCloud.qcow2 etc. + # https://download.fedoraproject.org/pub/fedora/linux/releases/22/Cloud/x86_64/Images/Fedora-Cloud-Base-22-20150521.x86_64.qcow2 + # http://fedora.mirrors.ovh.net/linux/releases/21/Cloud/Images/x86_64/Fedora-Cloud-Base-20141203-21.x86_64.qcow2 + # http://fedora.mirrors.ovh.net/linux/releases/20/Images/x86_64/Fedora-x86_64-20-20131211.1-sda.qcow2 + image2url = { + 'centos-7.2-x86_64': 'http://cloud.centos.org/centos/7/images/CentOS-7-x86_64-GenericCloud-1511.qcow2', + 'centos-7.3-x86_64': 'http://cloud.centos.org/centos/7/images/CentOS-7-x86_64-GenericCloud-1701.qcow2', + 'centos-9.stream-x86_64': 'https://cloud.centos.org/centos/9-stream/x86_64/images/CentOS-Stream-GenericCloud-9-20240703.1.x86_64.qcow2', + 'opensuse-42.1-x86_64': 'http://download.opensuse.org/repositories/Cloud:/Images:/Leap_42.1/images/openSUSE-Leap-42.1-OpenStack.x86_64.qcow2', + 'opensuse-42.2-x86_64': 'http://download.opensuse.org/repositories/Cloud:/Images:/Leap_42.2/images/openSUSE-Leap-42.2-OpenStack.x86_64.qcow2', + 'opensuse-42.3-x86_64': 'http://download.opensuse.org/repositories/Cloud:/Images:/Leap_42.3/images/openSUSE-Leap-42.3-OpenStack.x86_64.qcow2', + 'ubuntu-14.04-x86_64': 
'https://cloud-images.ubuntu.com/trusty/current/trusty-server-cloudimg-amd64-disk1.img', + 'ubuntu-14.04-aarch64': 'https://cloud-images.ubuntu.com/trusty/current/trusty-server-cloudimg-arm64-disk1.img', + 'ubuntu-14.04-i686': 'https://cloud-images.ubuntu.com/trusty/current/trusty-server-cloudimg-i386-disk1.img', + 'ubuntu-16.04-x86_64': 'https://cloud-images.ubuntu.com/xenial/current/xenial-server-cloudimg-amd64-disk1.img', + 'ubuntu-16.04-aarch64': 'https://cloud-images.ubuntu.com/xenial/current/xenial-server-cloudimg-arm64-disk1.img', + 'ubuntu-16.04-i686': 'https://cloud-images.ubuntu.com/xenial/current/xenial-server-cloudimg-i386-disk1.img', + 'ubuntu-18.04-x86_64': 'https://cloud-images.ubuntu.com/bionic/current/bionic-server-cloudimg-amd64.img', + 'ubuntu-18.04-aarch64': 'https://cloud-images.ubuntu.com/bionic/current/bionic-server-cloudimg-arm64.img', + 'ubuntu-18.04-i686': 'https://cloud-images.ubuntu.com/bionic/current/bionic-server-cloudimg-i386.img', + 'ubuntu-20.04-x86_64': 'https://cloud-images.ubuntu.com/focal/current/focal-server-cloudimg-amd64.img', + 'ubuntu-20.04-aarch64': 'https://cloud-images.ubuntu.com/focal/current/focal-server-cloudimg-arm64.img', + 'ubuntu-22.04-x86_64': 'https://cloud-images.ubuntu.com/jammy/current/jammy-server-cloudimg-amd64.img', + 'ubuntu-22.04-aarch64': 'https://cloud-images.ubuntu.com/jammy/current/jammy-server-cloudimg-arm64.img', + 'ubuntu-24.04-x86_64': 'https://cloud-images.ubuntu.com/noble/current/noble-server-cloudimg-amd64.img', + 'debian-8.0-x86_64': 'http://cdimage.debian.org/cdimage/openstack/current/debian-8.7.1-20170215-openstack-amd64.qcow2', + } + + def __init__(self): + self.provider = None + self.key_filename = None + self.username = 'ubuntu' + self.up_string = "UNKNOWN" + self.teuthology_suite = 'teuthology-suite' + self.conn = create_connection() + + token = None + token_expires = None + token_cache_duration = 3600 + + def cache_token(self): + if self.provider != 'ovh': + return False + if 
(OpenStack.token is None and + 'OS_TOKEN_VALUE' in os.environ and + 'OS_TOKEN_EXPIRES' in os.environ): + log.debug("get token from the environment of the parent process") + OpenStack.token = os.environ['OS_TOKEN_VALUE'] + OpenStack.token_expires = int(os.environ['OS_TOKEN_EXPIRES']) + if (OpenStack.token_expires is not None and + OpenStack.token_expires < time.time()): + log.debug("token discarded because it has expired") + OpenStack.token = None + if OpenStack.token is None: + if 'OS_TOKEN_VALUE' in os.environ: + del os.environ['OS_TOKEN_VALUE'] + OpenStack.token = misc.sh("openstack -q token issue -c id -f value").strip() + os.environ['OS_TOKEN_VALUE'] = OpenStack.token + OpenStack.token_expires = int(time.time() + OpenStack.token_cache_duration) + os.environ['OS_TOKEN_EXPIRES'] = str(OpenStack.token_expires) + log.debug("caching OS_TOKEN_VALUE " + "during %s seconds" % OpenStack.token_cache_duration) + return True + + def get_os_url(self, cmd, type=None): + if self.provider != 'ovh': + return "" + url = "" + if (type == 'compute' or + cmd.startswith("server ") or + cmd.startswith("flavor ")): + url = "https://compute.{reg}.cloud.ovh.net/v2/{tenant}" + elif (type == 'network' or + cmd.startswith("ip ") or + cmd.startswith("security ") or + cmd.startswith("network ")): + url = "https://network.compute.{reg}.cloud.ovh.net/" + elif (type == 'image' or + cmd.startswith("image ")): + url = "https://image.compute.{reg}.cloud.ovh.net/" + elif (type == 'volume' or + cmd.startswith("volume ")): + url = "https://volume.compute.{reg}.cloud.ovh.net/v2/{tenant}" + if url != "": + url = url.format(reg=os.environ['OS_REGION_NAME'], + tenant=os.environ['OS_TENANT_ID']) + return url + + def run(self, cmd, *args, **kwargs): + url = self.get_os_url(cmd, kwargs.get('type')) + if url != "": + if self.cache_token(): + os.environ['OS_TOKEN'] = os.environ['OS_TOKEN_VALUE'] + os.environ['OS_URL'] = url + if re.match('(server|flavor|ip|security|network|image|volume)', cmd): + cmd = 
"openstack --quiet " + cmd + try: + status = misc.sh(cmd) + finally: + if 'OS_TOKEN' in os.environ: + del os.environ['OS_TOKEN'] + if 'OS_URL' in os.environ: + del os.environ['OS_URL'] + return status + + def set_provider(self): + if 'OS_AUTH_URL' not in os.environ: + raise Exception('no OS_AUTH_URL environment variable') + providers = (('runabove.io', 'runabove'), + ('cloud.ovh.net', 'ovh'), + ('engcloud.prv.suse.net', 'ecp'), + ('cloudlab.us', 'cloudlab'), + ('entercloudsuite.com', 'entercloudsuite'), + ('rackspacecloud.com', 'rackspace'), + ('dream.io', 'dreamhost')) + self.provider = 'any' + for (pattern, provider) in providers: + if pattern in os.environ['OS_AUTH_URL']: + self.provider = provider + break + return self.provider + + def get_provider(self): + if self.provider is None: + self.set_provider() + return self.provider + + @staticmethod + def get_value(result, field): + """ + Get the value of a field from a result returned by the openstack + command in json format. + + :param result: A dictionary similar to the output of + 'openstack server show' + :param field: The name of the field whose value to retrieve. Case is + ignored. + """ + enforce_json_dictionary(result) + return result[field.lower()] + + def image_exists(self, image): + """ + Return true if the image exists in OpenStack. + """ + found = self.run("image list -f json --limit 2000 --private --property name='" + + self.image_name(image) + "'") + return len(json.loads(found)) > 0 + + def net_id(self, network): + """ + Return the uuid of the network in OpenStack. + """ + conn = self.conn + network = conn.network.find_network(network) + if network: + return network.id + + def type_version_arch(self, os_type, os_version, arch): + """ + Return the string used to differentiate os_type and os_version in names. + """ + return os_type + '-' + os_version + '-' + arch + + def image_name(self, name): + """ + Return the image name used by teuthology in OpenStack to avoid + conflicts with existing names. 
+ """ + return "teuthology-" + name + + def image_create(self, name, arch): + """ + Upload an image into OpenStack + """ + misc.sh("wget -c -O " + name + ".qcow2 " + self.image2url[name]) + if self.get_provider() == 'dreamhost': + image = name + ".raw" + disk_format = 'raw' + misc.sh("qemu-img convert " + name + ".qcow2 " + image) + else: + image = name + ".qcow2" + disk_format = 'qcow2' + if self.get_provider() == 'runabove': + properties = [ + "--property architecture_restrict=" + arch, + "--property architecture=" + arch + ] + elif self.get_provider() == 'cloudlab': + # if not, nova-compute fails on the compute node with + # Error: Cirrus VGA not available + properties = [ + "--property hw_video_model=vga", + ] + else: + properties = [] + + misc.sh("openstack image create --property ownedby=teuthology " + + " ".join(properties) + + " --disk-format=" + disk_format + " --container-format=bare " + + " --private" + + " --file " + image + " " + self.image_name(name)) + + def image(self, os_type, os_version, arch): + """ + Return the image name for the given os_type and os_version. If the image + does not exist it will be created. 
+ """ + name = self.type_version_arch(os_type, os_version, arch) + if not self.image_exists(name): + self.image_create(name, arch) + return self.image_name(name) + + @staticmethod + def sort_flavors(flavors): + def sort_key(flavor): + # Create a tuple for sorting: (VCPUs, RAM, Disk) + return (flavor['VCPUs'], flavor['RAM'], flavor['Disk']) + return sorted(flavors, key=sort_key) + + def get_os_flavors(self): + flavors = json.loads(self.run("flavor list -f json")) + return flavors + + def get_sorted_flavors(self, arch, select, flavor_list = None): + log.debug("flavor selection regex: " + select) + flavors = flavor_list or self.get_os_flavors() + found = [] + for flavor in flavors: + if select and not re.match(select, flavor['Name']): + continue + found.append(flavor) + sorted_flavors = OpenStack.sort_flavors(found) + log.debug("sorted flavors = " + str(sorted_flavors)) + return sorted_flavors + + def __flavor(self, hint, flavors): + """ + Return the smallest flavor that satisfies the desired size. + """ + flavors = OpenStack.sort_flavors(flavors) + for flavor in flavors: + if (flavor['RAM'] >= hint['ram'] and + flavor['VCPUs'] >= hint['cpus'] and + flavor['Disk'] >= hint['disk']): + return flavor['Name'] + raise NoFlavorException("openstack flavor list: " + str(flavors) + + " does not contain a flavor in which" + + " the desired " + str(hint) + " can fit") + + def __flavor_range(self, min, good, flavors): + """ + Return the smallest flavor that satisfies the good hint. + If no such flavor, get the largest flavor smaller than good + and larger than min. 
+ """ + flavors = OpenStack.sort_flavors(flavors) + low_range = [] + for flavor in flavors: + if (flavor['RAM'] >= good['ram'] and + flavor['VCPUs'] >= good['cpus'] and + flavor['Disk'] >= good['disk']): + return flavor['Name'] + else: + low_range.append(flavor) + low_range.reverse() + for flavor in low_range: + if (flavor['RAM'] >= min['ram'] and + flavor['VCPUs'] >= min['cpus'] and + flavor['Disk'] >= min['disk']): + return flavor['Name'] + raise NoFlavorException("openstack flavor list: " + str(flavors) + + " does not contain a flavor which" + + " is larger than " + str(min)) + + def __flavor_wrapper(self, min, good, hint, arch): + """ + Wrapper for __flavor_range() and __flavor(), to hide the messiness of + the real world. + + This is the one, single place for coding OpenStack-provider-specific + heuristics for selecting flavors. + """ + select_dict = { + #'ovh': ['^(s1|vps-ssd)-', '^(c2-[0-9]+|(hg|sg)-.*ssd)$', '^(hg|sg|c2)-.*ssd'], + 'ovh': [ + '^s1-', '^c2-[0-9]+$', # new ovh flavors at first + '^vps-ssd-', '^(hg|sg)-.*ssd$' # old ovh flavors + ], + 'ecp': ['^(m1|m2).'], + } + if 'flavor' in teuth_config.openstack: + flavor_select = teuth_config.openstack['flavor'] or [None] + else: + flavor_select = select_dict[self.get_provider()] \ + if self.get_provider() in select_dict else [None] + all_flavors = self.get_os_flavors() + for select in flavor_select: + try: + flavors = self.get_sorted_flavors(arch, select, all_flavors) + if hint: + flavor = self.__flavor(hint, flavors) + else: + flavor = self.__flavor_range(min, good, flavors) + if flavor: + return flavor + except NoFlavorException: + log.debug('No flavor found for select [%s]' % select) + pass + raise NoFlavorException('No flavors found for filters: %s' % flavor_select) + + def flavor(self, hint, arch): + return self.__flavor_wrapper(None, None, hint, arch) + + def flavor_range(self, min, good, arch): + return self.__flavor_wrapper(min, good, None, arch) + + def interpret_hints(self, defaults, hints): + 
""" + Return a hint hash which is the interpretation of a list of hints + """ + result = copy.deepcopy(defaults) + if not hints: + return result + if isinstance(hints, dict): + raise TypeError("openstack: " + str(hints) + + " must be an array, not a dict") + for hint in hints: + for resource in ('machine', 'volumes'): + if resource in hint: + new = hint[resource] + current = result[resource] + for key, value in hint[resource].items(): + current[key] = max(current[key], new[key]) + return result + + @staticmethod + def list_instances(): + conn = OpenStack().conn + ownedby = "ownedby='" + teuth_config.openstack['ip'] + "'" + instances = conn.compute.servers(all_projects=True) + return [inst for inst in instances if ownedby in (getattr(inst, 'metadata', {}) or {}).get('Properties', '')] + + @staticmethod + def list_volumes(): + conn = OpenStack().conn + ownedby = "ownedby='" + teuth_config.openstack['ip'] + "'" + volumes = conn.block_storage.volumes() + def select(volume): + props = volume.metadata or {} + return (ownedby in props.get('Properties', '') and + props.get('display_name', '').startswith('target')) + return filter(select, volumes) + + def cloud_init_wait(self, instance): + """ + Wait for cloud-init to complete on the name_or_ip OpenStack instance. 
+ """ + ip = instance.get_floating_ip_or_ip() + log.debug('cloud_init_wait ' + ip) + client_args = { + 'user_at_host': '@'.join((self.username, ip)), + 'timeout': 240, + 'retry': False, + } + if self.key_filename: + log.debug("using key " + self.key_filename) + client_args['key_filename'] = self.key_filename + with safe_while(sleep=30, tries=30, + action="cloud_init_wait " + ip) as proceed: + success = False + tail = ("tail --follow=name --retry" + " /var/log/cloud-init*.log /tmp/init.out") + while proceed(): + try: + log.debug("Attempting to connect to instance at IP: " + ip) + client = connection.connect(**client_args) + except paramiko.PasswordRequiredException: + raise Exception( + "The private key requires a passphrase.\n" + "Create a new key with:" + " openstack keypair create myself > myself.pem\n" + " chmod 600 myself.pem\n" + "and call teuthology-openstack with the options\n" + " --key-name myself --key-filename myself.pem\n") + except paramiko.AuthenticationException as e: + log.debug('cloud_init_wait AuthenticationException ' + str(e)) + continue + except socket.timeout as e: + log.debug('cloud_init_wait connect socket.timeout ' + str(e)) + continue + except socket.error as e: + log.debug('cloud_init_wait connect socket.error ' + str(e)) + continue + except Exception as e: + transients = ('Incompatible ssh peer', 'Unknown server') + for transient in transients: + if transient in str(e): + continue + log.exception('cloud_init_wait ' + ip) + raise + log.debug('cloud_init_wait ' + tail) + try: + # get the I/O channel to iterate line by line + transport = client.get_transport() + channel = transport.open_session() + channel.get_pty() + channel.settimeout(240) + output = channel.makefile('r', 1) + channel.exec_command(tail) + for line in iter(output.readline, b''): + log.info(line.strip()) + if self.up_string in line: + success = True + break + except socket.timeout: + client.close() + continue + except socket.error: + client.close() + continue + finally: + 
client.close() + if success: + log.debug('Cloud-init completed successfully for IP: ' + ip) + break + if not success: + log.debug('Cloud-init did not complete successfully within the given retries.') + return success + + def get_ip(self, instance_id, network): + return OpenStackInstance(instance_id).get_ip(network) + + def get_network(self): + nets = { + 'entercloudsuite' : 'default', + 'cloudlab' : 'flat-lan-1-net', + 'ecp' : 'sesci', + } + if 'network' in teuth_config.openstack: + return teuth_config.openstack['network'] + elif self.get_provider() in nets: + return nets[self.get_provider()] + else: + return None + + def net(self): + """ + Return the network to be used when creating an OpenStack instance. + By default it should not be set. But some providers such as + entercloudsuite require it is. + """ + log.debug('Using config: %s', teuth_config) + network = self.get_network() + return "--nic net-id=" + network if network else "" + + def get_available_archs(self): + if (self.get_provider() == 'cloudlab' or + (self.get_provider() == 'runabove' and + 'HZ1' in os.environ.get('OS_REGION_NAME', ''))): + return ('aarch64',) + else: + return ('x86_64', 'i686') + + def get_default_arch(self): + return self.get_available_archs()[0] + + def volume_delete(self, name_or_id): + self.run("volume set --name REMOVE-ME " + name_or_id + " || true") + self.run("volume delete " + name_or_id + " || true") + + +class TeuthologyOpenStack(OpenStack): + + def __init__(self, args, config, argv): + """ + args is of type argparse.Namespace as returned + when parsing argv and config is the job + configuration. The argv argument can be re-used + to build the arguments list of teuthology-suite. 
+ """ + super(TeuthologyOpenStack, self).__init__() + self.argv = argv + self.args = args + self.config = config + self.up_string = 'teuthology is up and running' + self.user_data = 'teuthology/openstack/openstack-user-data.txt' + + def get_instance(self): + if not hasattr(self, 'instance'): + self.instance = OpenStackInstance(self.server_name()) + return self.instance + + def main(self): + """ + Entry point implementing the teuthology-openstack command. + """ + self.setup_logs() + set_config_attr(self.args) + log.debug('Teuthology config: %s' % self.config.openstack) + key_filenames = (lambda x: x if isinstance(x, list) else [x]) \ + (self.args.key_filename) + for keyfile in key_filenames: + if os.path.isfile(keyfile): + self.key_filename = keyfile + break + if not self.key_filename: + raise Exception('No key file provided, please, use --key-filename option') + self.verify_openstack() + if self.args.teardown: + self.teardown() + return 0 + if self.args.setup: + self.setup() + exit_code = 0 + if self.args.suite: + self.get_instance() + if self.args.wait: + self.reminders() + exit_code = self.run_suite() + self.reminders() + if self.args.teardown: + if self.args.suite and not self.args.wait: + log.error("it does not make sense to teardown a cluster" + " right after a suite is scheduled") + else: + self.teardown() + return exit_code + + def _upload_yaml_file(self, fp): + """ + Given an absolute path fp, assume it is a YAML file existing + on the local machine and upload it to the remote teuthology machine + (see https://github.com/SUSE/teuthology/issues/56 for details) + """ + f = open(fp, 'r') # will throw exception on failure + f.close() + log.info("Detected local YAML file {}".format(fp)) + machine = self.username + "@" + self.instance.get_floating_ip_or_ip() + + sshopts=('-o ConnectTimeout=3 -o UserKnownHostsFile=/dev/null ' + '-o StrictHostKeyChecking=no') + + def ssh_command(s): + return "ssh {o} -i {k} {m} sh -c \\\"{s}\\\"".format( + o=sshopts, + 
k=self.key_filename, + m=machine, + s=s, + ) + + log.info("Uploading local file {} to teuthology machine".format(fp)) + remote_fp=os.path.normpath( + '/home/{un}/yaml/{fp}'.format( + un=self.username, + fp=fp, + ) + ) + command = ssh_command("stat {aug_fp}".format( + aug_fp=remote_fp, + )) + try: + misc.sh(command) + except: + pass + else: + log.warning( + ('{fp} probably already exists remotely as {aug_fp}; ' + 'the remote one will be clobbered').format( + fp=fp, + aug_fp=remote_fp, + )) + remote_dn=os.path.dirname(remote_fp) + command = ssh_command("mkdir -p {aug_dn}".format( + aug_dn=remote_dn, + )) + misc.sh(command) # will throw exception on failure + command = "scp {o} -i {k} {yamlfile} {m}:{dn}".format( + o=sshopts, + k=self.key_filename, + yamlfile=fp, + m=machine, + dn=remote_dn, + ) + misc.sh(command) # will throw exception on failure + return remote_fp + + def _repos_from_file(self, path): + def __check_repo_dict(obj): + if not isinstance(obj, dict): + raise Exception( + 'repo item must be a dict, %s instead' % type(obj)) + required = ['name', 'url'] + if not all(x in obj.keys() for x in required): + raise Exception( + 'repo spec must have at least %s elements' % required) + + def __check_repo_list(obj): + if not isinstance(obj, list): + raise Exception( + 'repo data must be a list, %s instead' % type(obj)) + for i in obj: + __check_repo_dict(i) + + with open(path) as f: + if path.endswith('.yaml') or path.endswith('.yml'): + data = yaml.safe_load(f) + elif path.endswith('.json') or path.endswith('.jsn'): + data = json.load(f) + else: + raise Exception( + 'Cannot detect file type from name {name}. ' + 'Supported: .yaml, .yml, .json, .jsn' + .format(name=f.name)) + __check_repo_list(data) + return data + + def _repo_from_arg(self, value): + (name, url) = value.split(':', 1) + if '!' 
in name: + n, p = name.split('!', 1) + return {'name': n, 'priority': int(p), 'url': url} + else: + return {'name': name, 'url': url} + + def run_suite(self): + """ + Delegate running teuthology-suite to the OpenStack instance + running the teuthology cluster. + """ + original_argv = self.argv[:] + argv = ['--ceph', self.args.ceph, + '--ceph-repo', self.args.ceph_repo, + '--suite-repo', self.args.suite_repo, + '--suite-branch', self.args.suite_branch, + ] + while len(original_argv) > 0: + if original_argv[0] in ('--name', + '--nameserver', + '--conf', + '--teuthology-branch', + '--teuthology-git-url', + '--test-repo', + '--suite-repo', + '--suite-branch', + '--ceph-repo', + '--ceph', + '--ceph-workbench-branch', + '--ceph-workbench-git-url', + '--archive-upload', + '--archive-upload-url', + '--key-name', + '--key-filename', + '--simultaneous-jobs', + '--controller-cpus', + '--controller-ram', + '--controller-disk'): + del original_argv[0:2] + elif original_argv[0] in ('--teardown', + '--setup', + '--upload', + '--no-canonical-tags'): + del original_argv[0] + elif os.path.isabs(original_argv[0]): + remote_path = self._upload_yaml_file(original_argv[0]) + argv.append(remote_path) + original_argv.pop(0) + else: + argv.append(original_argv.pop(0)) + if self.args.test_repo: + log.info("Using repos: %s" % self.args.test_repo) + repos = functools.reduce(operator.concat, ( + self._repos_from_file(it.lstrip('@')) + if it.startswith('@') else + [self._repo_from_arg(it)] + for it in self.args.test_repo)) + + overrides = { + 'overrides': { + 'install': { + 'repos' : repos + } + } + } + with tempfile.NamedTemporaryFile(mode='w+b', + suffix='-artifact.yaml', + delete=False) as f: + yaml_file = f.name + log.debug("Using file " + yaml_file) + yaml.safe_dump(overrides, stream=f, default_flow_style=False) + + path = self._upload_yaml_file(yaml_file) + argv.append(path) + + # + # If --upload, provide --archive-upload{,-url} regardless of + # what was originally provided on the 
command line because the + # teuthology-openstack defaults are different from the + # teuthology-suite defaults. + # + if self.args.upload: + argv.extend(['--archive-upload', self.args.archive_upload, + '--archive-upload-url', self.args.archive_upload_url]) + ceph_repo = getattr(self.args, 'ceph_repo') + if ceph_repo: + command = ( + "perl -pi -e 's|.*{opt}.*|{opt}: {value}|'" + " ~/.teuthology.yaml || true" + ).format(opt='ceph_git_url', value=ceph_repo) + self.ssh(command) + user_home = '/home/' + self.username + openstack_home = user_home + '/teuthology/teuthology/openstack' + if self.args.test_repo: + argv.append(openstack_home + '/openstack-basic.yaml') + else: + argv.append(openstack_home + '/openstack-basic.yaml') + argv.append(openstack_home + '/openstack-buildpackages.yaml') + command = ( + "source ~/.bashrc_teuthology ; " + self.teuthology_suite + " " + + " --machine-type openstack " + + " ".join(map(lambda x: "'" + x + "'", argv)) + ) + log.info("Running teuthology-suite: " + command) + return self.ssh(command) + + def reminders(self): + if self.key_filename: + identity = '-i ' + self.key_filename + ' ' + else: + identity = '' + if self.args.upload: + upload = 'upload to : ' + self.args.archive_upload + else: + upload = '' + log.info(""" +pulpito web interface: http://{ip}:8081/ +ssh access : ssh {identity}{username}@{ip} # logs in /usr/share/nginx/html +{upload}""".format(ip=self.instance.get_floating_ip_or_ip(), + username=self.username, + identity=identity, + upload=upload)) + + def setup(self): + instance = self.get_instance() + if not instance.exists(): + if self.get_provider() != 'rackspace': + self.create_security_group() + self.create_cluster() + self.reminders() + + def setup_logs(self): + """ + Setup the log level according to --verbose + """ + loglevel = logging.INFO + if self.args.verbose: + loglevel = logging.DEBUG + logging.getLogger("paramiko.transport").setLevel(logging.DEBUG) + teuthology.log.setLevel(loglevel) + + def ssh(self, command, 
timeout=300): + """ + Run a command in the OpenStack instance of the teuthology cluster. + Return the stdout / stderr of the command. + """ + ip = self.instance.get_floating_ip_or_ip() + client_args = { + 'user_at_host': '@'.join((self.username, ip)), + 'retry': False, + 'timeout': 240, + } + if self.key_filename: + log.debug("ssh overriding key with " + self.key_filename) + client_args['key_filename'] = self.key_filename + client = connection.connect(**client_args) + # get the I/O channel to iterate line by line + transport = client.get_transport() + channel = transport.open_session() + channel.settimeout(timeout) + log.debug(f"ssh {self.instance.get_floating_ip_or_ip()}: {command}") + channel.exec_command(command) + stdout, stderr = [], [] + start_time = time.time() + while True: + if channel.recv_ready(): + stdout.append(channel.recv(4096).decode()) + if channel.recv_stderr_ready(): + stderr.append(channel.recv_stderr(4096).decode()) + if channel.exit_status_ready(): + break + if time.time() - start_time > timeout: + raise TimeoutError("SSH command timed out!") + time.sleep(0.1) # Small sleep to avoid busy waiting + exit_status = channel.recv_exit_status() + stdout_txt, stderr_txt = ''.join(stdout), ''.join(stderr) + if exit_status != 0: + log.warning(f"SSH command failed with exit status {exit_status}") + return exit_status, stdout_txt, stderr_txt + + + def verify_openstack(self): + """ + Check there is a working connection to an OpenStack cluster + and set the provider data member if it is among those we + know already. + """ + try: + self.run("flavor list | tail -2") + except subprocess.CalledProcessError: + log.exception("flavor list") + raise Exception("verify openrc.sh has been sourced") + + def teuthology_openstack_flavor(self, arch): + """ + Return an OpenStack flavor fit to run the teuthology cluster. + The RAM size depends on the maximum number of workers that + will run simultaneously. 
+ """ + hint = { + 'disk': 10, # GB + 'ram': 1024, # MB + 'cpus': 1, + } + if self.args.simultaneous_jobs >= 100: + hint['ram'] = 60000 # MB + elif self.args.simultaneous_jobs >= 50: + hint['ram'] = 30000 # MB + elif self.args.simultaneous_jobs >= 25: + hint['ram'] = 15000 # MB + elif self.args.simultaneous_jobs >= 10: + hint['ram'] = 8000 # MB + elif self.args.simultaneous_jobs >= 2: + hint['ram'] = 4000 # MB + if self.args.controller_cpus > 0: + hint['cpus'] = self.args.controller_cpus + if self.args.controller_ram > 0: + hint['ram'] = self.args.controller_ram + if self.args.controller_disk > 0: + hint['disk'] = self.args.controller_disk + + return self.flavor(hint, arch) + + def get_user_data(self): + """ + Create a user-data.txt file to be used to spawn the teuthology + cluster, based on a template where the OpenStack credentials + and a few other values are substituted. + """ + fd, path = tempfile.mkstemp() + os.close(fd) + + bootstrap_path = os.getcwd() + "/teuthology/openstack" + '/bootstrap-teuthology.sh' + with open(bootstrap_path, 'rb') as f: + b64_bootstrap = base64.b64encode(f.read()) + bootstrap_content = str(b64_bootstrap.decode()) + + openrc_sh = '' + cacert_cmd = None + clouds_yaml_path = os.path.expanduser('~/.config/openstack/clouds.yaml') + if os.path.exists(clouds_yaml_path): + log.debug(f"clouds.yaml found at {clouds_yaml_path}, processing for openrc.sh") + with open(clouds_yaml_path, 'r') as f: + clouds_data = yaml.safe_load(f) + cloud_name = os.environ.get('OS_CLOUD', 'default') + cloud_config = clouds_data.get('clouds', {}).get(cloud_name, {}) + if not cloud_config: + raise Exception(f"Cloud '{cloud_name}' not found in clouds.yaml") + auth = cloud_config.get('auth', {}) + for key, value in {**auth, **cloud_config}.items(): + if isinstance(value, str): + openrc_sh += f"export OS_{key.upper()}={value}\n" + else: + for (var, value) in os.environ.items(): + if var in ('OS_TOKEN_VALUE', 'OS_TOKEN_EXPIRES'): + continue + if var == 'OS_CACERT': + 
cacert_path = '/home/%s/.openstack.crt' % self.username + cacert_file = value + openrc_sh += 'export %s=%s\n' % (var, cacert_path) + cacert_cmd = ( + "su - -c 'cat > {path}' {user} <> " + "/tmp/init.out 2>&1".format( + url=self.args.teuthology_git_url, + branch=self.args.teuthology_branch, + user=self.username)), + cmd_str( + "su - -c 'cp /tmp/openrc.sh $HOME/openrc.sh' {user}" + .format(user=self.username)), + cmd_str( + "su - -c '(set +x ; source openrc.sh ; set -x ; cd teuthology ; " + "source virtualenv/bin/activate ; " + "teuthology/openstack/setup-openstack.sh {opts})' " + "{user} >> /tmp/init.out " + "2>&1".format(user=self.username, + opts=' '.join(setup_options + all_options))), + "pkill -f 'pecan serve'", + "pkill -f 'python run.py'", + "systemctl enable teuthology", + "systemctl start teuthology", + ] + if cacert_cmd: + cmds.insert(0,cmd_str(cacert_cmd)) + #cloud-config + cloud_config = { + 'bootcmd': [ + 'touch /tmp/init.out', + 'echo nameserver 8.8.8.8 | tee -a /etc/resolv.conf', + ], + 'manage_etc_hosts': True, + 'system_info': { + 'default_user': { + 'name': self.username + } + }, + 'write_files': [ + { + 'path': '/tmp/bootstrap-teuthology.sh', + 'content': cmd_str(bootstrap_content), + 'encoding': 'b64', + 'permissions': '0755', + }, + { + 'path': '/tmp/openrc.sh', + 'owner': self.username, + 'content': cmd_str(openrc_sh_content), + 'encoding': 'b64', + 'permissions': '0644', + } + ], + 'runcmd': [ + 'apt-get update && apt-get install -y python3-virtualenv git rsync >> /tmp/init.out 2>&1' + ] + cmds, + 'final_message': 'teuthology is up and running after $UPTIME seconds' + } + user_data = "#cloud-config\n%s" % \ + yaml.dump(cloud_config, default_flow_style = False) + open(path, 'w').write(user_data) + log.debug("user_data: %s" % user_data) + + return path + + def key_pair(self): + return "teuth-%s" % self.args.name + + def server_name(self): + return "teuth-%s" % self.args.name + + def server_group(self): + return "teuth-%s" % self.args.name + + def 
worker_group(self): + return "teuth-%s-worker" % self.args.name + + def create_security_group(self): + conn = OpenStack().conn + server_sg = conn.network.find_security_group(self.server_group()) + worker_sg = conn.network.find_security_group(self.worker_group()) + if not server_sg: + server_sg = conn.network.create_security_group(name=self.server_group()) + if not worker_sg: + worker_sg = conn.network.create_security_group(name=self.worker_group()) + def add_rule(sg_id, protocol, port=None, remote_group_id=None): + rule_args = { + 'security_group_id': sg_id, + 'direction': 'ingress', + 'protocol': protocol, + 'ethertype': 'IPv4', + } + if port is not None: + rule_args['port_range_min'] = rule_args['port_range_max'] = port + if remote_group_id: + rule_args['remote_group_id'] = remote_group_id + else: + rule_args['remote_ip_prefix'] = '0.0.0.0/0' + try: + conn.network.create_security_group_rule(**rule_args) + except Exception as e: + log.warning(f"Security group rule creation skipped or failed: {e}") + # tcp access to enable reliable inter-node communication + for sg in (server_sg, worker_sg): + add_rule(sg.id, 'tcp') + # access between teuthology and workers + for port in (65535,): + add_rule(worker_sg.id, 'udp', port=port, remote_group_id=server_sg.id) + add_rule(server_sg.id, 'udp', port=port, remote_group_id=worker_sg.id) + # access between members of one group + add_rule(server_sg.id, 'udp', port=65535, remote_group_id=server_sg.id) + # access within worker group + add_rule(worker_sg.id, 'udp', port=65535, remote_group_id=worker_sg.id) + + @staticmethod + def get_unassociated_floating_ip(): + """ + Return a floating IP address not associated with an instance or None. 
+ """ + ips = TeuthologyOpenStack.get_os_floating_ips() + for ip in ips: + if not ip['Port']: + return ip['Floating IP Address'] + return None + + @staticmethod + def create_floating_ip(): + conn = OpenStack().conn + network_name = 'floating' + network = conn.network.find_network(network_name) + if not network: + log.debug(f"Floating network {network_name} not found.") + return None + floating_ip = conn.network.create_ip(floating_network_id=network.id) + return floating_ip.floating_ip_address + + @staticmethod + def associate_floating_ip(name_or_id): + """ + Associate a floating IP to the OpenStack instance + or do nothing if no floating ip can be created. + """ + conn = OpenStack().conn + server = conn.compute.find_server(name_or_id) + ip_address = TeuthologyOpenStack.get_unassociated_floating_ip() + if not ip_address: + ip_address = TeuthologyOpenStack.create_floating_ip() + if ip_address: + conn.compute.add_floating_ip_to_server(server, ip_address) + + @staticmethod + def get_os_floating_ips(): + conn = OpenStack().conn + return list(conn.network.ips()) + + @staticmethod + def get_floating_ip_id(ip): + """ + Return the id of a floating IP + """ + results = TeuthologyOpenStack.get_os_floating_ips() + for result in results: + for k in ['IP', 'Floating IP Address']: + if k in result: + if result[k] == ip: + return str(result['ID']) + + return None + + def get_instance_id(self): + instance = self.get_instance() + if instance.info: + return instance['id'] + else: + return None + + @staticmethod + def delete_floating_ip(instance_id): + """ + Remove the floating ip from instance_id and delete it. 
+ """ + conn = OpenStack().conn + server = conn.compute.find_server(instance_id) + if not server: + return + ip_address = OpenStackInstance(instance_id).get_floating_ip() + if not ip_address: + return + conn.compute.remove_floating_ip_from_server(server, ip_address) + floating_ip_obj = conn.network.find_ip(ip_address) + if floating_ip_obj: + conn.network.delete_ip(floating_ip_obj) + + def create_cluster(self): + user_data = self.get_user_data() + security_group = \ + " --security-group {teuthology}".format(teuthology=self.server_group()) + if self.get_provider() == 'rackspace': + security_group = '' + arch = self.get_default_arch() + flavor = self.teuthology_openstack_flavor(arch) + log.debug('Create server: %s' % self.server_name()) + log.debug('Using config: %s' % self.config.openstack) + log.debug('Using flavor: %s' % flavor) + key_name = self.args.key_name + if not key_name: + raise Exception('No key name provided, use --key-name option') + log.debug('Using key name: %s' % self.args.key_name) + image_name = self.image('ubuntu', '22.04', arch) + log.debug("Using image: %s" % image_name) + net_config = self.net() + try: + self.run( + "server create " + + " --image '" + image_name + "' " + + " --flavor '" + flavor + "' " + + " " + net_config + + " --key-name " + key_name + + " --user-data " + user_data + + security_group + + " --wait " + self.server_name() + + " -f json") + except Exception as e: + log.error("Error during server creation: %s" % str(e)) + raise + finally: + os.unlink(user_data) + self.instance = OpenStackInstance(self.server_name()) + log.debug("OpenStackInstance created for server: %s" % self.server_name()) + self.associate_floating_ip(self.instance['id']) + log.debug("Floating IP associated for instance ID: %s" % self.instance.get('id')) + return self.cloud_init_wait(self.instance) + + def packages_repository(self): + return 'teuth-%s-repo' % self.args.name #packages-repository + + def teardown(self): + """ + Delete all instances run by the 
teuthology cluster and delete the + instance running the teuthology cluster. + """ + instance_id = self.get_instance_id() + + if instance_id: + self.ssh("sudo /etc/init.d/teuthology stop || true") + self.delete_floating_ip(instance_id) + self.run("server delete %s || true" % self.packages_repository()) + self.run("server delete --wait %s || true" % self.server_name()) + self.run("keypair delete %s || true" % self.key_pair()) + self.run("security group delete %s || true" % self.worker_group()) + self.run("security group delete %s || true" % self.server_group()) + +def main(ctx, argv): + return TeuthologyOpenStack(ctx, teuth_config, argv).main() diff --git a/teuthology/openstack/archive-key b/teuthology/openstack/archive-key new file mode 100644 index 000000000..a8861441d --- /dev/null +++ b/teuthology/openstack/archive-key @@ -0,0 +1,27 @@ +-----BEGIN RSA PRIVATE KEY----- +MIIEowIBAAKCAQEAvLz+sao32JL/yMgwTFDTnQVZK3jyXlhQJpHLsgwgHWHQ/27L +fwEbGFVYsJNBGntZwCZvH/K4c0IevbnX/Y69qgmAc9ZpZQLIcIF0A8hmwVYRU+Ap +TAK2qAvadThWfiRBA6+SGoRy6VV5MWeq+hqlGf9axRKqhECNhHuGBuBeosUOZOOH +NVzvFIbp/4842yYrZUDnDzW7JX2kYGi6kaEAYeR8qYJgT/95Pm4Bgu1V7MI36rx1 +O/5BSPF3LvDSnnaZyHCDZtwzC50lBnS2nx8kKPmmdKBSEJoTdNRPIXZ/lMq5pzIW +QPDjI8O5pbX1BJcxfFlZ/h+bI6u8IX3vfTGHWwIDAQABAoIBAG5yLp0rHfkXtKT7 +OQA/wEW/znmZEkPRbD3VzZyIafanuhTv8heFPyTTNM5Hra5ghpniI99PO07/X1vp +OBMCB81MOCYRT6WzpjXoG0rnZ/I1enhZ0fDQGbFnFlTIPh0c/Aq7IEVyQoh24y/d +GXm4Q+tdufFfRfeUivv/CORXQin/Iugbklj8erjx+fdVKPUXilmDIEVleUncer5/ +K5Fxy0lWbm6ZX1fE+rfJvCwNjAaIJgrN8TWUTE8G72F9Y0YU9hRtqOZe6MMbSufy +5+/yj2Vgp+B8Id7Ass2ylDQKsjBett/M2bNKt/DUVIiaxKi0usNSerLvtbkWEw9s +tgUI6ukCgYEA6qqnZwkbgV0lpj1MrQ3BRnFxNR42z2MyEY5xRGaYp22ByxS207z8 +mM3EuLH8k2u6jzsGoPpBWhBbs97MuGDHwsMEO5rBpytnTE4Hxrgec/13Arzk4Bme +eqg1Ji+lNkoLzEHkuihskcZwnQ8uaOdqrnH/NRGuUhA9hjeh+lQzBy8CgYEAzeV1 +zYsw8xIBFtbmFhBQ8imHr0SQalTiQU2Qn46LORK0worsf4sZV5ZF3VBRdnCUwwbm +0XaMb3kE2UBlU8qPqLgxXPNjcEKuqtVlp76dT/lrXIhYUq+Famrf20Lm01kC5itz 
+QF247hnUfo2uzxpatuEr2ggs2NjuODn57tVw95UCgYEAv0s+C5AxC9OSzWFLEAcW +dwYi8toedBC4z/b9/nRkHJf4JkRMhW6ZuzaCFs2Ax+wZuIi1bqSSgYi0OHx3BhZe +wTWYTb5p/owzONCjJisRKByG14SETuqTdgmIyggs9YSG+Yr9mYM6fdr2EhI+EuYS +4QGsuOYg5GS4wqC3OglJT6ECgYA8y28QRPQsIXnO259OjnzINDkLKGyX6P5xl8yH +QFidfod/FfQk6NaPxSBV67xSA4X5XBVVbfKji5FB8MC6kAoBIHn63ybSY+4dJSuB +70eV8KihxuSFbawwMuRsYoGzkAnKGrRKIiJTs67Ju14NatO0QiJnm5haYxtb4MqK +md1kTQKBgDmTxtSBVOV8eMhl076OoOvdnpb3sy/obI/XUvurS0CaAcqmkVSNJ6c+ +g1O041ocTbuW5d3fbzo9Jyle6qsvUQd7fuoUfAMrd0inKsuYPPM0IZOExbt8QqLI +KFJ+r/nQYoJkmiNO8PssxcP3CMFB6TpUx0BgFcrhH//TtKKNrGTl +-----END RSA PRIVATE KEY----- diff --git a/teuthology/openstack/archive-key.pub b/teuthology/openstack/archive-key.pub new file mode 100644 index 000000000..57513806d --- /dev/null +++ b/teuthology/openstack/archive-key.pub @@ -0,0 +1 @@ +ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC8vP6xqjfYkv/IyDBMUNOdBVkrePJeWFAmkcuyDCAdYdD/bst/ARsYVViwk0Eae1nAJm8f8rhzQh69udf9jr2qCYBz1mllAshwgXQDyGbBVhFT4ClMAraoC9p1OFZ+JEEDr5IahHLpVXkxZ6r6GqUZ/1rFEqqEQI2Ee4YG4F6ixQ5k44c1XO8Uhun/jzjbJitlQOcPNbslfaRgaLqRoQBh5HypgmBP/3k+bgGC7VXswjfqvHU7/kFI8Xcu8NKedpnIcINm3DMLnSUGdLafHyQo+aZ0oFIQmhN01E8hdn+UyrmnMhZA8OMjw7mltfUElzF8WVn+H5sjq7whfe99MYdb loic@fold diff --git a/teuthology/openstack/bootstrap-teuthology.sh b/teuthology/openstack/bootstrap-teuthology.sh new file mode 100644 index 000000000..df433315e --- /dev/null +++ b/teuthology/openstack/bootstrap-teuthology.sh @@ -0,0 +1,33 @@ +#!/bin/bash -ex +TEUTH_PATH=${1:-"teuthology"} +TEUTH_GIT=${2:-"https://github.com/ceph/teuthology"} +TEUTH_BRANCH=${3:-"main"} + +mkdir -p $TEUTH_PATH +git init $TEUTH_PATH + +pushd $TEUTH_PATH + +echo Fetch upstream changes from $TEUTH_GIT +git fetch --tags --progress $TEUTH_GIT +refs/heads/*:refs/remotes/origin/* +git config remote.origin.url $TEUTH_GIT +git config --add remote.origin.fetch +refs/heads/*:refs/remotes/origin/* +git config remote.origin.url $TEUTH_GIT + +# Check if branch has form origin/pr/*/merge +isPR="^origin\/pr\/" +if [[ 
"$TEUTH_BRANCH" =~ $isPR ]] ; then + +git fetch --tags --progress https://github.com/suse/teuthology +refs/pull/*:refs/remotes/origin/pr/* +rev=$(git rev-parse refs/remotes/$TEUTH_BRANCH^{commit}) + +git config core.sparsecheckout +git checkout -f $rev +else +git checkout $TEUTH_BRANCH +fi + +./bootstrap install + +popd + diff --git a/teuthology/openstack/openstack-basic.yaml b/teuthology/openstack/openstack-basic.yaml new file mode 100644 index 000000000..db443f4df --- /dev/null +++ b/teuthology/openstack/openstack-basic.yaml @@ -0,0 +1,15 @@ +overrides: + ceph: + conf: + global: + osd heartbeat grace: 100 + # this line to address issue #1017 + mon lease: 15 + mon lease ack timeout: 25 + s3tests: + idle_timeout: 1200 + ceph-fuse: + client.0: + mount_wait: 60 + mount_timeout: 120 +archive-on-error: true diff --git a/teuthology/openstack/openstack-buildpackages.yaml b/teuthology/openstack/openstack-buildpackages.yaml new file mode 100644 index 000000000..1e404b48c --- /dev/null +++ b/teuthology/openstack/openstack-buildpackages.yaml @@ -0,0 +1,10 @@ +tasks: + - buildpackages: + good_machine: + disk: 100 # GB + ram: 15000 # MB + cpus: 16 + min_machine: + disk: 100 # GB + ram: 8000 # MB + cpus: 1 diff --git a/teuthology/openstack/openstack-centos-6.5-user-data.txt b/teuthology/openstack/openstack-centos-6.5-user-data.txt new file mode 100644 index 000000000..27e705df0 --- /dev/null +++ b/teuthology/openstack/openstack-centos-6.5-user-data.txt @@ -0,0 +1,24 @@ +#cloud-config +bootcmd: + - echo nameserver {nameserver} | tee /etc/resolv.conf + - echo search {lab_domain} | tee -a /etc/resolv.conf + - sed -ie 's/PEERDNS="yes"/PEERDNS="no"/' /etc/sysconfig/network-scripts/ifcfg-* + - ( curl --silent http://169.254.169.254/2009-04-04/meta-data/hostname | sed -e 's/[\.-].*//' ; eval printf "%03d%03d%03d%03d.{lab_domain}" $(curl --silent http://169.254.169.254/2009-04-04/meta-data/local-ipv4 | tr . 
' ' ) ) | tee /etc/hostname + - hostname $(cat /etc/hostname) + - yum install -y yum-utils && yum-config-manager --add-repo https://dl.fedoraproject.org/pub/epel/6/x86_64/ && yum install --nogpgcheck -y epel-release && rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-6 && rm /etc/yum.repos.d/dl.fedoraproject.org* + - ( echo ; echo "MaxSessions 1000" ) >> /etc/ssh/sshd_config + - ( echo 'Defaults !requiretty' ; echo 'Defaults visiblepw' ) | tee /etc/sudoers.d/cephlab_sudo +preserve_hostname: true +system_info: + default_user: + name: {username} +packages: + - python + - wget + - git + - ntp + - dracut-modules-growroot +runcmd: + - mkinitrd --force /boot/initramfs-2.6.32-573.3.1.el6.x86_64.img 2.6.32-573.3.1.el6.x86_64 + - reboot +final_message: "{up}, after $UPTIME seconds" diff --git a/teuthology/openstack/openstack-centos-7.0-user-data.txt b/teuthology/openstack/openstack-centos-7.0-user-data.txt new file mode 100644 index 000000000..475aaaa37 --- /dev/null +++ b/teuthology/openstack/openstack-centos-7.0-user-data.txt @@ -0,0 +1,21 @@ +#cloud-config +bootcmd: + - echo nameserver {nameserver} | tee /etc/resolv.conf + - echo search {lab_domain} | tee -a /etc/resolv.conf + - sed -ie 's/PEERDNS="yes"/PEERDNS="no"/' /etc/sysconfig/network-scripts/ifcfg-* + - ( curl --silent http://169.254.169.254/2009-04-04/meta-data/hostname | sed -e 's/[\.-].*//' ; eval printf "%03d%03d%03d%03d.{lab_domain}" $(curl --silent http://169.254.169.254/2009-04-04/meta-data/local-ipv4 | tr . 
' ' ) ) | tee /etc/hostname + - hostname $(cat /etc/hostname) + - ( echo ; echo "MaxSessions 1000" ) >> /etc/ssh/sshd_config +# See https://github.com/ceph/ceph-cm-ansible/blob/main/roles/cobbler/templates/snippets/cephlab_user + - ( echo 'Defaults !requiretty' ; echo 'Defaults visiblepw' ) | tee /etc/sudoers.d/cephlab_sudo ; chmod 0440 /etc/sudoers.d/cephlab_sudo +preserve_hostname: true +system_info: + default_user: + name: {username} +packages: + - python + - wget + - git + - ntp + - redhat-lsb-core +final_message: "{up}, after $UPTIME seconds" diff --git a/teuthology/openstack/openstack-centos-7.1-user-data.txt b/teuthology/openstack/openstack-centos-7.1-user-data.txt new file mode 100644 index 000000000..475aaaa37 --- /dev/null +++ b/teuthology/openstack/openstack-centos-7.1-user-data.txt @@ -0,0 +1,21 @@ +#cloud-config +bootcmd: + - echo nameserver {nameserver} | tee /etc/resolv.conf + - echo search {lab_domain} | tee -a /etc/resolv.conf + - sed -ie 's/PEERDNS="yes"/PEERDNS="no"/' /etc/sysconfig/network-scripts/ifcfg-* + - ( curl --silent http://169.254.169.254/2009-04-04/meta-data/hostname | sed -e 's/[\.-].*//' ; eval printf "%03d%03d%03d%03d.{lab_domain}" $(curl --silent http://169.254.169.254/2009-04-04/meta-data/local-ipv4 | tr . 
' ' ) ) | tee /etc/hostname + - hostname $(cat /etc/hostname) + - ( echo ; echo "MaxSessions 1000" ) >> /etc/ssh/sshd_config +# See https://github.com/ceph/ceph-cm-ansible/blob/main/roles/cobbler/templates/snippets/cephlab_user + - ( echo 'Defaults !requiretty' ; echo 'Defaults visiblepw' ) | tee /etc/sudoers.d/cephlab_sudo ; chmod 0440 /etc/sudoers.d/cephlab_sudo +preserve_hostname: true +system_info: + default_user: + name: {username} +packages: + - python + - wget + - git + - ntp + - redhat-lsb-core +final_message: "{up}, after $UPTIME seconds" diff --git a/teuthology/openstack/openstack-centos-7.2-user-data.txt b/teuthology/openstack/openstack-centos-7.2-user-data.txt new file mode 100644 index 000000000..475aaaa37 --- /dev/null +++ b/teuthology/openstack/openstack-centos-7.2-user-data.txt @@ -0,0 +1,21 @@ +#cloud-config +bootcmd: + - echo nameserver {nameserver} | tee /etc/resolv.conf + - echo search {lab_domain} | tee -a /etc/resolv.conf + - sed -ie 's/PEERDNS="yes"/PEERDNS="no"/' /etc/sysconfig/network-scripts/ifcfg-* + - ( curl --silent http://169.254.169.254/2009-04-04/meta-data/hostname | sed -e 's/[\.-].*//' ; eval printf "%03d%03d%03d%03d.{lab_domain}" $(curl --silent http://169.254.169.254/2009-04-04/meta-data/local-ipv4 | tr . 
' ' ) ) | tee /etc/hostname + - hostname $(cat /etc/hostname) + - ( echo ; echo "MaxSessions 1000" ) >> /etc/ssh/sshd_config +# See https://github.com/ceph/ceph-cm-ansible/blob/main/roles/cobbler/templates/snippets/cephlab_user + - ( echo 'Defaults !requiretty' ; echo 'Defaults visiblepw' ) | tee /etc/sudoers.d/cephlab_sudo ; chmod 0440 /etc/sudoers.d/cephlab_sudo +preserve_hostname: true +system_info: + default_user: + name: {username} +packages: + - python + - wget + - git + - ntp + - redhat-lsb-core +final_message: "{up}, after $UPTIME seconds" diff --git a/teuthology/openstack/openstack-centos-7.3-user-data.txt b/teuthology/openstack/openstack-centos-7.3-user-data.txt new file mode 120000 index 000000000..123a8b40a --- /dev/null +++ b/teuthology/openstack/openstack-centos-7.3-user-data.txt @@ -0,0 +1 @@ +openstack-centos-7.2-user-data.txt \ No newline at end of file diff --git a/teuthology/openstack/openstack-centos-9.stream-user-data.txt b/teuthology/openstack/openstack-centos-9.stream-user-data.txt new file mode 100644 index 000000000..9b6d6753c --- /dev/null +++ b/teuthology/openstack/openstack-centos-9.stream-user-data.txt @@ -0,0 +1,16 @@ +#cloud-config +bootcmd: + - hostnamectl set-hostname $(cat /etc/hostname) + - ( echo ; echo "MaxSessions 1000" ) >> /etc/ssh/sshd_config +# See https://github.com/ceph/ceph-cm-ansible/blob/main/roles/cobbler/templates/snippets/cephlab_user + - ( echo 'Defaults !requiretty' ; echo 'Defaults visiblepw' ) | tee /etc/sudoers.d/cephlab_sudo ; chmod 0440 /etc/sudoers.d/cephlab_sudo +preserve_hostname: true +system_info: + default_user: + name: {username} +packages: + - python3 + - wget + - git + - ntp +final_message: "{up}, after $UPTIME seconds" \ No newline at end of file diff --git a/teuthology/openstack/openstack-debian-7.0-user-data.txt b/teuthology/openstack/openstack-debian-7.0-user-data.txt new file mode 120000 index 000000000..1c0d25675 --- /dev/null +++ b/teuthology/openstack/openstack-debian-7.0-user-data.txt @@ 
-0,0 +1 @@ +openstack-ubuntu-14.04-user-data.txt \ No newline at end of file diff --git a/teuthology/openstack/openstack-debian-8.0-user-data.txt b/teuthology/openstack/openstack-debian-8.0-user-data.txt new file mode 100644 index 000000000..61180663c --- /dev/null +++ b/teuthology/openstack/openstack-debian-8.0-user-data.txt @@ -0,0 +1,24 @@ +#cloud-config +bootcmd: + - apt-get remove --purge -y resolvconf || true + - echo 'prepend domain-name-servers {nameserver};' | tee -a /etc/dhcp/dhclient.conf + - echo 'supersede domain-name "{lab_domain}";' | tee -a /etc/dhcp/dhclient.conf + - ifdown -a ; ifup -a + - grep --quiet {nameserver} /etc/resolv.conf || ( echo 'nameserver {nameserver}' ; echo 'search {lab_domain}' ) | tee /etc/resolv.conf + - ( curl --silent http://169.254.169.254/2009-04-04/meta-data/hostname | sed -e 's/[\.-].*//' ; eval printf "%03d%03d%03d%03d.{lab_domain}" $(curl --silent http://169.254.169.254/2009-04-04/meta-data/local-ipv4 | tr . ' ' ) ) | tee /etc/hostname + - hostname $(cat /etc/hostname) + - echo "MaxSessions 1000" >> /etc/ssh/sshd_config +preserve_hostname: true +system_info: + default_user: + name: {username} +packages: + - python + - wget + - git + - ntp +runcmd: +# See https://github.com/ceph/ceph-cm-ansible/blob/main/roles/cobbler/templates/snippets/cephlab_user + - ( echo 'Defaults !requiretty' ; echo 'Defaults visiblepw' ) | tee /etc/sudoers.d/cephlab_sudo ; chmod 0440 /etc/sudoers.d/cephlab_sudo + - echo '{username} ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers +final_message: "{up}, after $UPTIME seconds" diff --git a/teuthology/openstack/openstack-opensuse-15.0-user-data.txt b/teuthology/openstack/openstack-opensuse-15.0-user-data.txt new file mode 100644 index 000000000..7cbbc852f --- /dev/null +++ b/teuthology/openstack/openstack-opensuse-15.0-user-data.txt @@ -0,0 +1,26 @@ +#cloud-config +bootcmd: + - echo nameserver {nameserver} | tee /etc/resolv.conf + - echo search {lab_domain} | tee -a /etc/resolv.conf + - sed -i -e 
's/PEERDNS="yes"/PEERDNS="no"/' /etc/sysconfig/network/ifcfg-eth0 + - ( curl --silent http://169.254.169.254/2009-04-04/meta-data/hostname | sed -e 's/[\.-].*//' ; eval printf "%03d%03d%03d%03d.{lab_domain}" $(curl --silent http://169.254.169.254/2009-04-04/meta-data/local-ipv4 | tr . ' ' ) ) | tee /etc/hostname + - hostname $(cat /etc/hostname) + - ( echo ; echo "MaxSessions 1000" ) >> /etc/ssh/sshd_config +# See https://github.com/ceph/ceph-cm-ansible/blob/main/roles/cobbler/templates/snippets/cephlab_user + - ( echo 'Defaults !requiretty' ; echo 'Defaults visiblepw' ) | tee /etc/sudoers.d/cephlab_sudo ; chmod 0440 /etc/sudoers.d/cephlab_sudo +preserve_hostname: true +users: + - name: {username} + gecos: User + sudo: ["ALL=(ALL) NOPASSWD:ALL"] + groups: users +runcmd: + - ( MYHOME=/home/{username} ; mkdir $MYHOME/.ssh ; chmod 700 $MYHOME/.ssh ; cp /root/.ssh/authorized_keys $MYHOME/.ssh ; chown -R {username}.users $MYHOME/.ssh ) + - zypper --non-interactive --gpg-auto-import-keys refresh + - zypper --non-interactive remove --force librados2 librbd1 multipath-tools-rbd qemu-block-rbd ntp + - zypper --non-interactive install --no-recommends --force wget git-core rsyslog lsb-release make gcc gcc-c++ salt-master salt-minion salt-api chrony + - systemctl enable chronyd.service + - systemctl start chronyd.service + - sed -i -e "s/^#master:.*$/master:\ $(curl --silent http://169.254.169.254/2009-04-04/meta-data/hostname | sed -e 's/[\.-].*//')$(eval printf "%03d%03d%03d%03d.{lab_domain}" $(echo "{nameserver}" | tr . 
' '))/" /etc/salt/minion + - sleep 30 +final_message: "{up}, after $UPTIME seconds" diff --git a/teuthology/openstack/openstack-opensuse-15.1-user-data.txt b/teuthology/openstack/openstack-opensuse-15.1-user-data.txt new file mode 120000 index 000000000..17a81c070 --- /dev/null +++ b/teuthology/openstack/openstack-opensuse-15.1-user-data.txt @@ -0,0 +1 @@ +openstack-opensuse-15.0-user-data.txt \ No newline at end of file diff --git a/teuthology/openstack/openstack-opensuse-42.1-user-data.txt b/teuthology/openstack/openstack-opensuse-42.1-user-data.txt new file mode 100644 index 000000000..1860ef140 --- /dev/null +++ b/teuthology/openstack/openstack-opensuse-42.1-user-data.txt @@ -0,0 +1,27 @@ +#cloud-config +bootcmd: + - echo nameserver {nameserver} | tee /etc/resolv.conf + - echo search {lab_domain} | tee -a /etc/resolv.conf + - sed -i -e 's/PEERDNS="yes"/PEERDNS="no"/' /etc/sysconfig/network/ifcfg-eth0 + - ( curl --silent http://169.254.169.254/2009-04-04/meta-data/hostname | sed -e 's/[\.-].*//' ; eval printf "%03d%03d%03d%03d.{lab_domain}" $(curl --silent http://169.254.169.254/2009-04-04/meta-data/local-ipv4 | tr . 
' ' ) ) | tee /etc/hostname + - hostname $(cat /etc/hostname) + - ( echo ; echo "MaxSessions 1000" ) >> /etc/ssh/sshd_config +# See https://github.com/ceph/ceph-cm-ansible/blob/main/roles/cobbler/templates/snippets/cephlab_user + - ( echo 'Defaults !requiretty' ; echo 'Defaults visiblepw' ) | tee /etc/sudoers.d/cephlab_sudo ; chmod 0440 /etc/sudoers.d/cephlab_sudo +preserve_hostname: true +users: + - name: {username} + gecos: User + sudo: ["ALL=(ALL) NOPASSWD:ALL"] + groups: users +runcmd: + - ( MYHOME=/home/{username} ; mkdir $MYHOME/.ssh ; chmod 700 $MYHOME/.ssh ; cp /root/.ssh/authorized_keys $MYHOME/.ssh ; chown -R {username}.users $MYHOME/.ssh ) + - zypper --non-interactive --no-gpg-checks refresh + - zypper --non-interactive remove systemd-logger + - zypper --non-interactive install --no-recommends python wget git ntp rsyslog + lsb-release salt-minion salt-master make + - sed -i -e "s/^#master:.*$/master:\ $(curl --silent http://169.254.169.254/2009-04-04/meta-data/hostname | sed -e 's/[\.-].*//')$(eval printf "%03d%03d%03d%03d.{lab_domain}" $(echo "{nameserver}" | tr . ' '))/" /etc/salt/minion + - ( if ! 
grep '^server' /etc/ntp.conf ; then for i in 0 1 2 3 ; do echo "server $i.opensuse.pool.ntp.org iburst" >> /etc/ntp.conf ; done ; fi ) + - systemctl enable salt-minion.service ntpd.service + - systemctl restart ntpd.service +final_message: "{up}, after $UPTIME seconds" diff --git a/teuthology/openstack/openstack-opensuse-42.2-user-data.txt b/teuthology/openstack/openstack-opensuse-42.2-user-data.txt new file mode 100644 index 000000000..c8ca72c62 --- /dev/null +++ b/teuthology/openstack/openstack-opensuse-42.2-user-data.txt @@ -0,0 +1,28 @@ +#cloud-config +bootcmd: + - echo nameserver {nameserver} | tee /etc/resolv.conf + - echo search {lab_domain} | tee -a /etc/resolv.conf + - sed -i -e 's/PEERDNS="yes"/PEERDNS="no"/' /etc/sysconfig/network/ifcfg-eth0 + - ( curl --silent http://169.254.169.254/2009-04-04/meta-data/hostname | sed -e 's/[\.-].*//' ; eval printf "%03d%03d%03d%03d.{lab_domain}" $(curl --silent http://169.254.169.254/2009-04-04/meta-data/local-ipv4 | tr . ' ' ) ) | tee /etc/hostname + - hostname $(cat /etc/hostname) + - ( echo ; echo "MaxSessions 1000" ) >> /etc/ssh/sshd_config +# See https://github.com/ceph/ceph-cm-ansible/blob/main/roles/cobbler/templates/snippets/cephlab_user + - ( echo 'Defaults !requiretty' ; echo 'Defaults visiblepw' ) | tee /etc/sudoers.d/cephlab_sudo ; chmod 0440 /etc/sudoers.d/cephlab_sudo +preserve_hostname: true +users: + - name: {username} + gecos: User + sudo: ["ALL=(ALL) NOPASSWD:ALL"] + groups: users +runcmd: + - ( MYHOME=/home/{username} ; mkdir $MYHOME/.ssh ; chmod 700 $MYHOME/.ssh ; cp /root/.ssh/authorized_keys $MYHOME/.ssh ; chown -R {username}.users $MYHOME/.ssh ) + - 'zypper rr openSUSE-Leap-Cloud-Tools || :' + - zypper --non-interactive --no-gpg-checks refresh + - zypper --non-interactive remove systemd-logger + - zypper --non-interactive install --no-recommends python wget git ntp rsyslog + lsb-release salt-minion salt-master make gcc gcc-c++ + - sed -i -e "s/^#master:.*$/master:\ $(curl --silent 
http://169.254.169.254/2009-04-04/meta-data/hostname | sed -e 's/[\.-].*//')$(eval printf "%03d%03d%03d%03d.{lab_domain}" $(echo "{nameserver}" | tr . ' '))/" /etc/salt/minion + - ( if ! grep '^server' /etc/ntp.conf ; then for i in 0 1 2 3 ; do echo "server $i.opensuse.pool.ntp.org iburst" >> /etc/ntp.conf ; done ; fi ) + - systemctl enable salt-minion.service ntpd.service + - systemctl restart ntpd.service +final_message: "{up}, after $UPTIME seconds" diff --git a/teuthology/openstack/openstack-opensuse-42.3-user-data.txt b/teuthology/openstack/openstack-opensuse-42.3-user-data.txt new file mode 100644 index 000000000..ee7d4fd7b --- /dev/null +++ b/teuthology/openstack/openstack-opensuse-42.3-user-data.txt @@ -0,0 +1,27 @@ +#cloud-config +bootcmd: + - echo nameserver {nameserver} | tee /etc/resolv.conf + - echo search {lab_domain} | tee -a /etc/resolv.conf + - sed -i -e 's/PEERDNS="yes"/PEERDNS="no"/' /etc/sysconfig/network/ifcfg-eth0 + - ( curl --silent http://169.254.169.254/2009-04-04/meta-data/hostname | sed -e 's/[\.-].*//' ; eval printf "%03d%03d%03d%03d.{lab_domain}" $(curl --silent http://169.254.169.254/2009-04-04/meta-data/local-ipv4 | tr . 
' ' ) ) | tee /etc/hostname + - hostname $(cat /etc/hostname) + - ( echo ; echo "MaxSessions 1000" ) >> /etc/ssh/sshd_config +# See https://github.com/ceph/ceph-cm-ansible/blob/main/roles/cobbler/templates/snippets/cephlab_user + - ( echo 'Defaults !requiretty' ; echo 'Defaults visiblepw' ) | tee /etc/sudoers.d/cephlab_sudo ; chmod 0440 /etc/sudoers.d/cephlab_sudo +preserve_hostname: true +users: + - name: {username} + gecos: User + sudo: ["ALL=(ALL) NOPASSWD:ALL"] + groups: users +runcmd: + - ( MYHOME=/home/{username} ; mkdir $MYHOME/.ssh ; chmod 700 $MYHOME/.ssh ; cp /root/.ssh/authorized_keys $MYHOME/.ssh ; chown -R {username}.users $MYHOME/.ssh ) + - 'zypper rr openSUSE-Leap-Cloud-Tools || :' + - zypper --non-interactive --no-gpg-checks refresh + - zypper --non-interactive remove systemd-logger + - zypper --non-interactive install --no-recommends python wget git ntp rsyslog lsb-release make gcc gcc-c++ + - sed -i -e "s/^#master:.*$/master:\ $(curl --silent http://169.254.169.254/2009-04-04/meta-data/hostname | sed -e 's/[\.-].*//')$(eval printf "%03d%03d%03d%03d.{lab_domain}" $(echo "{nameserver}" | tr . ' '))/" /etc/salt/minion + - ( if ! 
grep '^server' /etc/ntp.conf ; then for i in 0 1 2 3 ; do echo "server $i.opensuse.pool.ntp.org iburst" >> /etc/ntp.conf ; done ; fi ) + - systemctl enable ntpd.service + - systemctl restart ntpd.service +final_message: "{up}, after $UPTIME seconds" diff --git a/teuthology/openstack/openstack-sle-12.1-user-data.txt b/teuthology/openstack/openstack-sle-12.1-user-data.txt new file mode 100644 index 000000000..820cd9c26 --- /dev/null +++ b/teuthology/openstack/openstack-sle-12.1-user-data.txt @@ -0,0 +1,25 @@ +#cloud-config +bootcmd: + - echo nameserver {nameserver} | tee /etc/resolv.conf + - echo search {lab_domain} | tee -a /etc/resolv.conf + - sed -i -e 's/PEERDNS="yes"/PEERDNS="no"/' /etc/sysconfig/network/ifcfg-eth0 + - ( curl --silent http://169.254.169.254/2009-04-04/meta-data/hostname | sed -e 's/[\.-].*//' ; eval printf "%03d%03d%03d%03d.{lab_domain}" $(curl --silent http://169.254.169.254/2009-04-04/meta-data/local-ipv4 | tr . ' ' ) ) | tee /etc/hostname + - hostname $(cat /etc/hostname) + - ( echo ; echo "MaxSessions 1000" ) >> /etc/ssh/sshd_config +# See https://github.com/ceph/ceph-cm-ansible/blob/main/roles/cobbler/templates/snippets/cephlab_user + - ( echo 'Defaults !requiretty' ; echo 'Defaults visiblepw' ) | tee /etc/sudoers.d/cephlab_sudo ; chmod 0440 /etc/sudoers.d/cephlab_sudo + - SuSEfirewall2 stop +preserve_hostname: true +users: + - name: {username} + gecos: User + sudo: ["ALL=(ALL) NOPASSWD:ALL"] + groups: users +runcmd: + - ( MYHOME=/home/{username} ; mkdir $MYHOME/.ssh ; chmod 700 $MYHOME/.ssh ; cp /root/.ssh/authorized_keys $MYHOME/.ssh ; chown -R {username}.users $MYHOME/.ssh ) + - zypper --non-interactive --no-gpg-checks refresh + - zypper --non-interactive install --no-recommends python wget git ntp rsyslog + lsb-release make + - ( if ! 
grep '^server' /etc/ntp.conf ; then for i in 0 1 2 3 ; do echo "server $i.opensuse.pool.ntp.org iburst" >> /etc/ntp.conf ; done ; fi ) + - systemctl restart ntpd.service +final_message: "{up}, after $UPTIME seconds" diff --git a/teuthology/openstack/openstack-sle-12.2-user-data.txt b/teuthology/openstack/openstack-sle-12.2-user-data.txt new file mode 100644 index 000000000..6977f381e --- /dev/null +++ b/teuthology/openstack/openstack-sle-12.2-user-data.txt @@ -0,0 +1,27 @@ +#cloud-config +bootcmd: + - echo nameserver {nameserver} | tee /etc/resolv.conf + - echo search {lab_domain} | tee -a /etc/resolv.conf + - sed -i -e 's/PEERDNS="yes"/PEERDNS="no"/' /etc/sysconfig/network/ifcfg-eth0 + - ( curl --silent http://169.254.169.254/2009-04-04/meta-data/hostname | sed -e 's/[\.-].*//' ; eval printf "%03d%03d%03d%03d.{lab_domain}" $(curl --silent http://169.254.169.254/2009-04-04/meta-data/local-ipv4 | tr . ' ' ) ) | tee /etc/hostname + - hostname $(cat /etc/hostname) + - ( echo ; echo "MaxSessions 1000" ) >> /etc/ssh/sshd_config +# See https://github.com/ceph/ceph-cm-ansible/blob/main/roles/cobbler/templates/snippets/cephlab_user + - ( echo 'Defaults !requiretty' ; echo 'Defaults visiblepw' ) | tee /etc/sudoers.d/cephlab_sudo ; chmod 0440 /etc/sudoers.d/cephlab_sudo + - SuSEfirewall2 stop +preserve_hostname: true +users: + - name: {username} + gecos: User + sudo: ["ALL=(ALL) NOPASSWD:ALL"] + groups: users +runcmd: + - ( MYHOME=/home/{username} ; mkdir $MYHOME/.ssh ; chmod 700 $MYHOME/.ssh ; cp /root/.ssh/authorized_keys $MYHOME/.ssh ; chown -R {username}.users $MYHOME/.ssh ) + - zypper --non-interactive --no-gpg-checks refresh + - zypper --non-interactive install --no-recommends python wget git ntp rsyslog + lsb-release salt-minion salt-master make gcc gcc-c++ + - sed -i -e "s/^#master:.*$/master:\ $(curl --silent http://169.254.169.254/2009-04-04/meta-data/hostname | sed -e 's/[\.-].*//')$(eval printf "%03d%03d%03d%03d.{lab_domain}" $(echo "{nameserver}" | tr . 
' '))/" /etc/salt/minion + - ( if ! grep '^server' /etc/ntp.conf ; then for i in 0 1 2 3 ; do echo "server $i.opensuse.pool.ntp.org iburst" >> /etc/ntp.conf ; done ; fi ) + - systemctl enable salt-minion.service ntpd.service + - systemctl restart ntpd.service +final_message: "{up}, after $UPTIME seconds" diff --git a/teuthology/openstack/openstack-sle-12.3-user-data.txt b/teuthology/openstack/openstack-sle-12.3-user-data.txt new file mode 100644 index 000000000..fa1d2267c --- /dev/null +++ b/teuthology/openstack/openstack-sle-12.3-user-data.txt @@ -0,0 +1,24 @@ +#cloud-config +bootcmd: + - echo nameserver {nameserver} | tee /etc/resolv.conf + - echo search {lab_domain} | tee -a /etc/resolv.conf + - sed -i -e 's/PEERDNS="yes"/PEERDNS="no"/' /etc/sysconfig/network/ifcfg-eth0 + - ( curl --silent http://169.254.169.254/2009-04-04/meta-data/hostname | sed -e 's/[\.-].*//' ; eval printf "%03d%03d%03d%03d.{lab_domain}" $(curl --silent http://169.254.169.254/2009-04-04/meta-data/local-ipv4 | tr . ' ' ) ) | tee /etc/hostname + - hostname $(cat /etc/hostname) + - ( echo ; echo "MaxSessions 1000" ) >> /etc/ssh/sshd_config +# See https://github.com/ceph/ceph-cm-ansible/blob/main/roles/cobbler/templates/snippets/cephlab_user + - ( echo 'Defaults !requiretty' ; echo 'Defaults visiblepw' ) | tee /etc/sudoers.d/cephlab_sudo ; chmod 0440 /etc/sudoers.d/cephlab_sudo + - SuSEfirewall2 stop +preserve_hostname: true +users: + - name: {username} + gecos: User + sudo: ["ALL=(ALL) NOPASSWD:ALL"] + groups: users +runcmd: + - ( MYHOME=/home/{username} ; mkdir $MYHOME/.ssh ; chmod 700 $MYHOME/.ssh ; cp /root/.ssh/authorized_keys $MYHOME/.ssh ; chown -R {username}.users $MYHOME/.ssh ) + - sed -i -e "s/^#master:.*$/master:\ $(curl --silent http://169.254.169.254/2009-04-04/meta-data/hostname | sed -e 's/[\.-].*//')$(eval printf "%03d%03d%03d%03d.{lab_domain}" $(echo "{nameserver}" | tr . ' '))/" /etc/salt/minion + - ( if ! 
grep '^server' /etc/ntp.conf ; then for i in 0 1 2 3 ; do echo "server $i.opensuse.pool.ntp.org iburst" >> /etc/ntp.conf ; done ; fi ) + - systemctl enable salt-minion.service ntpd.service + - systemctl restart ntpd.service +final_message: "{up}, after $UPTIME seconds" diff --git a/teuthology/openstack/openstack-sle-15.0-user-data.txt b/teuthology/openstack/openstack-sle-15.0-user-data.txt new file mode 100644 index 000000000..0fb900879 --- /dev/null +++ b/teuthology/openstack/openstack-sle-15.0-user-data.txt @@ -0,0 +1,25 @@ +#cloud-config +bootcmd: + - echo nameserver {nameserver} | tee /etc/resolv.conf + - echo search {lab_domain} | tee -a /etc/resolv.conf + - sed -i -e 's/PEERDNS="yes"/PEERDNS="no"/' /etc/sysconfig/network/ifcfg-eth0 + - ( curl --silent http://169.254.169.254/2009-04-04/meta-data/hostname | sed -e 's/[\.-].*//' ; eval printf "%03d%03d%03d%03d.{lab_domain}" $(curl --silent http://169.254.169.254/2009-04-04/meta-data/local-ipv4 | tr . ' ' ) ) | tee /etc/hostname + - hostname $(cat /etc/hostname) + - ( echo ; echo "MaxSessions 1000" ) >> /etc/ssh/sshd_config +# See https://github.com/ceph/ceph-cm-ansible/blob/main/roles/cobbler/templates/snippets/cephlab_user + - ( echo 'Defaults !requiretty' ; echo 'Defaults visiblepw' ) | tee /etc/sudoers.d/cephlab_sudo ; chmod 0440 /etc/sudoers.d/cephlab_sudo +preserve_hostname: true +users: + - name: {username} + gecos: User + sudo: ["ALL=(ALL) NOPASSWD:ALL"] + groups: users +runcmd: + - ( MYHOME=/home/{username} ; mkdir $MYHOME/.ssh ; chmod 700 $MYHOME/.ssh ; cp /root/.ssh/authorized_keys $MYHOME/.ssh ; chown -R {username}.users $MYHOME/.ssh ) + - zypper --non-interactive --no-gpg-checks refresh + - zypper --non-interactive install --no-recommends wget rsyslog lsb-release make gcc gcc-c++ chrony + - sed -i -e 's/^! 
pool/pool/' /etc/chrony.conf + - systemctl enable chronyd.service + - systemctl start chronyd.service + - sed -i -e "s/^#master:.*$/master:\ $(curl --silent http://169.254.169.254/2009-04-04/meta-data/hostname | sed -e 's/[\.-].*//')$(eval printf "%03d%03d%03d%03d.{lab_domain}" $(echo "{nameserver}" | tr . ' '))/" /etc/salt/minion +final_message: "{up}, after $UPTIME seconds" diff --git a/teuthology/openstack/openstack-sle-15.1-user-data.txt b/teuthology/openstack/openstack-sle-15.1-user-data.txt new file mode 100644 index 000000000..2bdd6eea1 --- /dev/null +++ b/teuthology/openstack/openstack-sle-15.1-user-data.txt @@ -0,0 +1,37 @@ +#cloud-config +bootcmd: + - echo nameserver {nameserver} | tee /etc/resolv.conf + - echo search {lab_domain} | tee -a /etc/resolv.conf + - ( echo ; echo "MaxSessions 1000" ) >> /etc/ssh/sshd_config +# See https://github.com/ceph/ceph-cm-ansible/blob/main/roles/cobbler/templates/snippets/cephlab_user + - ( echo 'Defaults !requiretty' ; echo 'Defaults visiblepw' ) | tee /etc/sudoers.d/cephlab_sudo ; chmod 0440 /etc/sudoers.d/cephlab_sudo +preserve_hostname: true +users: + - name: {username} + gecos: User + sudo: ["ALL=(ALL) NOPASSWD:ALL"] + groups: users +runcmd: + - | + for i in $(seq 1 30) ; do + ping -q -c 1 8.8.8.8 && break + sleep 10 + done + ETH=$(ip route list | grep "scope link" | cut -f 3 -d ' ') + sed -i -e 's/PEERDNS="yes"/PEERDNS="no"/' /etc/sysconfig/network/ifcfg-$ETH + ( + curl --silent http://169.254.169.254/2009-04-04/meta-data/hostname | + sed -e 's/[\.-].*//' + eval printf "%03d%03d%03d%03d.{lab_domain}" $( + curl --silent http://169.254.169.254/2009-04-04/meta-data/local-ipv4 | + tr . 
' ' ) + ) | tee /etc/hostname + hostname $(cat /etc/hostname) + - ( MYHOME=/home/{username} ; mkdir $MYHOME/.ssh ; chmod 700 $MYHOME/.ssh ; cp /root/.ssh/authorized_keys $MYHOME/.ssh ; chown -R {username}.users $MYHOME/.ssh ) + - zypper --non-interactive --no-gpg-checks refresh + - zypper --non-interactive install --no-recommends wget rsyslog lsb-release make gcc gcc-c++ chrony + - sed -i -e 's/^! pool/pool/' /etc/chrony.conf + - systemctl enable chronyd.service + - systemctl start chronyd.service + - sed -i -e "s/^#master:.*$/master:\ $(curl --silent http://169.254.169.254/2009-04-04/meta-data/hostname | sed -e 's/[\.-].*//')$(eval printf "%03d%03d%03d%03d.{lab_domain}" $(echo "{nameserver}" | tr . ' '))/" /etc/salt/minion +final_message: "{up}, after $UPTIME seconds" diff --git a/teuthology/openstack/openstack-teuthology.cron b/teuthology/openstack/openstack-teuthology.cron new file mode 100644 index 000000000..835e1dcb4 --- /dev/null +++ b/teuthology/openstack/openstack-teuthology.cron @@ -0,0 +1 @@ +SHELL=/bin/bash diff --git a/teuthology/openstack/openstack-teuthology.init b/teuthology/openstack/openstack-teuthology.init new file mode 100755 index 000000000..87bf35be4 --- /dev/null +++ b/teuthology/openstack/openstack-teuthology.init @@ -0,0 +1,225 @@ +#!/bin/bash +# +# Copyright (c) 2015 Red Hat, Inc. +# +# Author: Loic Dachary +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +### BEGIN INIT INFO +# Provides: teuthology +# Required-Start: $network $remote_fs $syslog beanstalkd nginx +# Required-Stop: $network $remote_fs $syslog +# Default-Start: 2 3 4 5 +# Default-Stop: +# Short-Description: Start teuthology +### END INIT INFO + +cd /home/ubuntu + +source /etc/default/teuthology + +user=${TEUTHOLOGY_USERNAME:-ubuntu} + +export HOME=/home/$user + +function worker_pidfile() { + echo /var/run/teuthology-worker.$1.pid +} +function worker_logfile() { + echo /var/log/teuthology.${1}.log +} + +function stop_worker() { + wnum=$1 + wpidfile=$(worker_pidfile $wnum) + if [[ -f $wpidfile ]] ; then + wpid=$(cat $wpidfile) + echo Killing worker $wnum with pid=$wpid... + pkill -P $wpid + pkill $wpid + rm -f $wpidfile + fi +} + +function stop_workers() { + for i in $(seq 1 $NWORKERS) ; do + stop_worker $i + done +} + +function start_worker() { + local wlogfile=$1 + local wpidfile=$2 + mkdir -p /tmp/log && chown $user /tmp/log + su - -c " +cd /home/$user +source openrc.sh +cd teuthology +export LC_ALL=C +virtualenv/bin/teuthology-worker --tube openstack -l /tmp/log --archive-dir /usr/share/nginx/html +" $user > $wlogfile 2>&1 & { + echo $! > $wpidfile + echo "Started worker with pid=$! 
see log $wlogfile" + } +} + +function rkill() { + local pid=$1 + for i in $(pgrep -P $pid) ; do + rkill $i + done + echo Killing process $pid + kill -9 $pid +} +function status_process() { + local name=$1 + local pidf=$2 + [[ -f $pidf ]] && { + PID=$(cat $pidf) + STATUS=$(ps aux --no-headers -q $PID 2>&1 > /dev/null && echo running || echo dead) + echo $name PID:$PID STATUS:$STATUS + } +} + +function stop_process() { + local pidfile=$1 + [[ -f $pidfile ]] && { + local pid=$(cat $pidfile) + rkill $pid + ps aux --no-headers -q $pid 2>&1 > /dev/null || rm $pidfile + } +} + +function start_workers() { + for i in $(seq 1 $NWORKERS) ; do + local wpidfile=$(worker_pidfile $i) + local wlogfile=$(worker_logfile $i) + [[ -f $wpidfile ]] && { + local wpid=$(cat $wpidfile) + ps aux --no-headers -q $wpid 2>&1 > /dev/null && { + echo Worker $i is already running with process $wpid + continue + } + } + start_worker $wlogfile $wpidfile + done +} + +case $1 in + start-workers) + start_workers + ;; + status-workers|list-workers) + for i in $(ls /var/run | grep teuthology-worker | sort) ; do + WORKER=${i##teuthology-worker.} + WORKER=${WORKER%%.pid} + status_process "worker $WORKER" /var/run/$i + done + ;; + stop-workers) + echo Stopping workers + stop_workers + ;; + stop-worker) + stop_worker $2 + ;; + restart-workers) + $0 stop-workers + $1 start-workers + ;; + status-pulpito) + status_process pulpito /var/run/pulpito.pid + ;; + start-pulpito) + su - -c "cd /home/$user/pulpito ; virtualenv/bin/python run.py" $user > /var/log/pulpito.log 2>&1 & \ + echo $! > /var/run/pulpito.pid + ;; + stop-pulpito) + echo Stopping pulpito + stop_process /var/run/pulpito.pid + ;; + status-paddles) + status_process paddles /var/run/paddles.pid + ;; + start-paddles) + su - -c "cd /home/$user/paddles ; virtualenv/bin/pecan serve config.py" $user > /var/log/paddles.log 2>&1 & + echo $! 
> /var/run/paddles.pid + ;; + stop-paddles) + echo Stopping paddles + stop_process /var/run/paddles.pid + ;; + start) + /etc/init.d/beanstalkd start + $0 start-paddles + $0 start-pulpito + sleep 3 + ( + cd /home/$user + source openrc.sh + cd teuthology + . virtualenv/bin/activate + teuthology-lock --list-targets --owner scheduled_$user@teuthology > /tmp/t + if test -s /tmp/t && ! grep -qq 'targets: {}' /tmp/t ; then + teuthology-lock --unlock -t /tmp/t --owner scheduled_$user@teuthology + fi + start_workers + ) + ;; + stop) + #pkill -f 'pecan serve' + #pkill -f 'python run.py' + #pkill -f 'teuthology-worker' + $0 stop-pulpito + $0 stop-paddles + $0 stop-workers + pkill -f 'ansible' + /etc/init.d/beanstalkd stop + source /home/$user/teuthology/virtualenv/bin/activate + source /home/$user/openrc.sh + for dev in eth0 ens3 ; do + ip=$(ip a show dev $dev 2>/dev/null | sed -n "s:.*inet \(.*\)/.*:\1:p") + test "$ip" && break + done + openstack server list --long -f json --name target | \ + jq ".[] | select(.Properties | contains(\"ownedby='$ip'\")) | .ID" | \ + xargs --no-run-if-empty --max-args 1 -P20 openstack server delete --wait + openstack server list --long -f json --name ceph- | \ + jq ".[] | select(.Properties | contains(\"ownedby='$ip'\")) | .ID" | \ + xargs --no-run-if-empty --max-args 1 -P20 openstack server delete --wait + openstack volume list --long -f json | \ + jq ".[] | select(.Properties | contains(\"ownedby='$ip'\")) | .ID" | \ + xargs --no-run-if-empty --max-args 1 -P20 openstack volume delete + perl -pi -e 's/.*gitbuilder_host.*/gitbuilder_host: gitbuilder.ceph.com/' /home/$user/.teuthology.yaml + rm -fr /home/$user/src/* + mv /tmp/stampsdir /tmp/stampsdir.old + mkdir /tmp/stampsdir + chown $user /tmp/stampsdir + if test -f /tmp/stampsdir.old/packages-repository ; then + mv /tmp/stampsdir.old/*packages-repository* /tmp/stampsdir + fi + rm -fr /tmp/stampsdir.old + ;; + restart) + $0 stop + $0 start + ;; + *) +esac diff --git 
a/teuthology/openstack/openstack-ubuntu-12.04-user-data.txt b/teuthology/openstack/openstack-ubuntu-12.04-user-data.txt new file mode 100644 index 000000000..0b104f5fd --- /dev/null +++ b/teuthology/openstack/openstack-ubuntu-12.04-user-data.txt @@ -0,0 +1,23 @@ +#cloud-config +bootcmd: + - apt-get remove --purge -y resolvconf || true + - echo 'prepend domain-name-servers {nameserver};' | tee -a /etc/dhcp/dhclient.conf + - echo 'supersede domain-name "{lab_domain}";' | tee -a /etc/dhcp/dhclient.conf + - ifdown -a ; ifup -a + - grep --quiet {nameserver} /etc/resolv.conf || ( echo 'nameserver {nameserver}' ; echo 'search {lab_domain}' ) | tee /etc/resolv.conf + - ( curl --silent http://169.254.169.254/2009-04-04/meta-data/hostname | sed -e 's/[\.-].*//' ; eval printf "%03d%03d%03d%03d.{lab_domain}" $(curl --silent http://169.254.169.254/2009-04-04/meta-data/local-ipv4 | tr . ' ' ) ) | tee /etc/hostname + - hostname $(cat /etc/hostname) + - echo "MaxSessions 1000" >> /etc/ssh/sshd_config +preserve_hostname: true +manage_etc_hosts: true +system_info: + default_user: + name: {username} +packages: + - python + - wget + - git + - ntp +runcmd: + - dpkg -l python wget git ntp >> /var/log/cloud-init-output.log + - echo "{up}" >> /var/log/cloud-init-output.log diff --git a/teuthology/openstack/openstack-ubuntu-14.04-user-data.txt b/teuthology/openstack/openstack-ubuntu-14.04-user-data.txt new file mode 100644 index 000000000..5a6ea6a3d --- /dev/null +++ b/teuthology/openstack/openstack-ubuntu-14.04-user-data.txt @@ -0,0 +1,21 @@ +#cloud-config +bootcmd: + - apt-get remove --purge -y resolvconf || true + - echo 'prepend domain-name-servers {nameserver};' | tee -a /etc/dhcp/dhclient.conf + - echo 'supersede domain-name "{lab_domain}";' | tee -a /etc/dhcp/dhclient.conf + - ifdown -a ; ifup -a + - grep --quiet {nameserver} /etc/resolv.conf || ( echo 'nameserver {nameserver}' ; echo 'search {lab_domain}' ) | tee /etc/resolv.conf + - ( wget -qO - 
http://169.254.169.254/2009-04-04/meta-data/hostname | sed -e 's/[\.-].*//' ; eval printf "%03d%03d%03d%03d.{lab_domain}" $(wget -qO - http://169.254.169.254/2009-04-04/meta-data/local-ipv4 | tr . ' ' ) ) | tee /etc/hostname + - hostname $(cat /etc/hostname) + - echo "MaxSessions 1000" >> /etc/ssh/sshd_config +manage_etc_hosts: true +preserve_hostname: true +system_info: + default_user: + name: {username} +packages: + - python + - wget + - git + - ntp +final_message: "{up}, after $UPTIME seconds" diff --git a/teuthology/openstack/openstack-ubuntu-16.04-user-data.txt b/teuthology/openstack/openstack-ubuntu-16.04-user-data.txt new file mode 100644 index 000000000..5a6ea6a3d --- /dev/null +++ b/teuthology/openstack/openstack-ubuntu-16.04-user-data.txt @@ -0,0 +1,21 @@ +#cloud-config +bootcmd: + - apt-get remove --purge -y resolvconf || true + - echo 'prepend domain-name-servers {nameserver};' | tee -a /etc/dhcp/dhclient.conf + - echo 'supersede domain-name "{lab_domain}";' | tee -a /etc/dhcp/dhclient.conf + - ifdown -a ; ifup -a + - grep --quiet {nameserver} /etc/resolv.conf || ( echo 'nameserver {nameserver}' ; echo 'search {lab_domain}' ) | tee /etc/resolv.conf + - ( wget -qO - http://169.254.169.254/2009-04-04/meta-data/hostname | sed -e 's/[\.-].*//' ; eval printf "%03d%03d%03d%03d.{lab_domain}" $(wget -qO - http://169.254.169.254/2009-04-04/meta-data/local-ipv4 | tr . 
' ' ) ) | tee /etc/hostname + - hostname $(cat /etc/hostname) + - echo "MaxSessions 1000" >> /etc/ssh/sshd_config +manage_etc_hosts: true +preserve_hostname: true +system_info: + default_user: + name: {username} +packages: + - python + - wget + - git + - ntp +final_message: "{up}, after $UPTIME seconds" diff --git a/teuthology/openstack/openstack-user-data.txt b/teuthology/openstack/openstack-user-data.txt new file mode 100644 index 000000000..8b2ba9b85 --- /dev/null +++ b/teuthology/openstack/openstack-user-data.txt @@ -0,0 +1,22 @@ +#cloud-config +bootcmd: + - touch /tmp/init.out + - echo nameserver 8.8.8.8 | tee -a /etc/resolv.conf # last resort, in case the DHCP server does not provide a resolver +manage_etc_hosts: true +system_info: + default_user: + name: TEUTHOLOGY_USERNAME +packages: + - python-virtualenv + - git + - rsync +runcmd: + - su - -c '(set -x ; CLONE_OPENSTACK && cd teuthology && ./bootstrap install)' TEUTHOLOGY_USERNAME >> /tmp/init.out 2>&1 + - echo 'export OPENRC' | tee /home/TEUTHOLOGY_USERNAME/openrc.sh + - su - -c '(set -x ; source openrc.sh ; cd teuthology ; source virtualenv/bin/activate ; teuthology/openstack/setup-openstack.sh --nworkers NWORKERS UPLOAD CEPH_WORKBENCH CANONICAL_TAGS SETUP_OPTIONS)' TEUTHOLOGY_USERNAME >> /tmp/init.out 2>&1 + # wa: we want to stop paddles and pulpito started by setup-openstack, before start teuthology service + - pkill -f 'pecan serve' + - pkill -f 'python run.py' + - systemctl enable teuthology + - systemctl start teuthology +final_message: "teuthology is up and running after $UPTIME seconds" diff --git a/teuthology/openstack/setup-openstack.sh b/teuthology/openstack/setup-openstack.sh new file mode 100755 index 000000000..526ba9813 --- /dev/null +++ b/teuthology/openstack/setup-openstack.sh @@ -0,0 +1,818 @@ +#!/bin/bash +# +# Copyright (c) 2015 Red Hat, Inc. 
+# +# Author: Loic Dachary +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# + +# +# Most of this file is intended to be obsoleted by the ansible equivalent +# when they are available (setting up paddles, pulpito, etc.). 
+# +function create_config() { + local network="$1" + local subnet="$2" + local nameserver="$3" + local labdomain="$4" + local ip="$5" + local archive_upload="$6" + local canonical_tags="$7" + local selfname="$8" + local keypair="$9" + local server_name="${10}" + local server_group="${11}" + local worker_group="${12}" + local package_repo="${13}" + local teuthology_branch="${14}" + local teuthology_git_url="${15}" + + if test "$network" ; then + network="network: $network" + fi + + if test "$archive_upload" ; then + archive_upload="archive_upload: $archive_upload" + fi + + cat > ~/.teuthology.yaml < ~/.vault_pass.txt + echo "OVERRIDE ~/.vault_pass.txt" + return 0 +} + +function apt_get_update() { + sudo apt-get update +} + +function setup_docker() { + source /etc/os-release + if ! $VERSION_CODENAME; then + echo "ERROR: VERSION_CODENAME is not set. Cannot proceed with Docker installation." + return + fi + if !command -v docker &> /dev/null; then + curl -fsSL https://download.docker.com/linux/ubuntu/gpg | \ + sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg + echo \ + "deb [arch=amd64 signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] \ + https://download.docker.com/linux/ubuntu $VERSION_CODENAME stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null + sudo apt-get update + sudo apt-get install -y docker-ce docker-ce-cli containerd.io + echo "INSTALLED docker" + else + echo "OK docker is installed" + fi +} + +function setup_fail2ban() { + if test -f /usr/bin/fail2ban-server; then + echo "OK fail2ban is installed" + else + sudo apt-get -qq install -y fail2ban + echo "INSTALLED fail2ban" + fi + sudo systemctl restart fail2ban + sudo systemctl enable fail2ban + echo "STARTED fail2ban" +} + +function setup_salt_master() { + if test -f /etc/salt/master ; then + echo "OK salt-master is installed" + else + sudo apt-get -qq install -y salt-master + fi +} + +function teardown_paddles() { + if pkill -f 'pecan' ; then + echo "SHUTDOWN 
the paddles server" + fi +} + +function setup_paddles() { + local ip=$1 + + local public_ip=$(curl --silent http://169.254.169.254/2009-04-04/meta-data/public-ipv4/) + if test -z "$public_ip" ; then + public_ip=$ip + fi + + local paddles_dir=$(dirname $0)/../../../paddles + + if ! test -d $paddles_dir ; then + git clone https://github.com/ceph/paddles.git $paddles_dir || return 1 + fi + + sudo apt-get -qq install -y beanstalkd postgresql postgresql-contrib postgresql-server-dev-all supervisor + + if ! sudo /etc/init.d/postgresql status ; then + sudo mkdir -p /etc/postgresql + sudo chown postgres /etc/postgresql + sudo -u postgres pg_createcluster 9.3 paddles + sudo /etc/init.d/postgresql start || return 1 + fi + if ! psql --command 'select 1' 'postgresql://paddles:paddles@localhost/paddles' > /dev/null 2>&1 ; then + sudo -u postgres psql -c "CREATE USER paddles with PASSWORD 'paddles';" || return 1 + sudo -u postgres createdb -O paddles paddles || return 1 + fi + ( + cd $paddles_dir || return 1 + git pull --rebase + git clean -ffqdx + sed -e "/^address = os.environ.get(/,/^)/c\address = os.environ.get('PADDLES_ADDRESS', 'http://localhost')" \ + -e "s|^job_log_href_templ = os.environ.get(.*)|job_log_href_templ = os.environ.get('PADDLES_JOB_LOG_HREF_TEMPL', 'http://$public_ip')|" \ + -e "/sqlite/d" \ + -e "s|^ *'url':.*|'url': 'postgresql://paddles:paddles@localhost/paddles',|" \ + -e "s/'host': '127.0.0.1'/'host': '0.0.0.0'/" \ + < config.py.in > config.py + virtualenv ./virtualenv + source ./virtualenv/bin/activate + pip install -r requirements.txt + pip install sqlalchemy tzlocal requests netaddr + python setup.py develop + ) + + echo "CONFIGURED the paddles server" +} + +function populate_paddles() { + local subnets="$1" + local labdomain=$2 + + local paddles_dir=$(dirname $0)/../../../paddles + + local url='postgresql://paddles:paddles@localhost/paddles' + + pkill -f 'pecan serve' + + sudo -u postgres dropdb paddles + sudo -u postgres createdb -O paddles paddles 
+ + ( + cd $paddles_dir || return 1 + source virtualenv/bin/activate + pecan populate config.py + + ( + echo "begin transaction;" + for subnet in $subnets ; do + subnet_names_and_ips $subnet | while read name ip ; do + echo "insert into nodes (name,machine_type,is_vm,locked,up) values ('${name}.${labdomain}', 'openstack', TRUE, FALSE, TRUE);" + done + done + echo "commit transaction;" + ) | psql --quiet $url + + setsid pecan serve config.py < /dev/null > /dev/null 2>&1 & + for i in $(seq 1 20) ; do + if curl --silent http://localhost:8080/ > /dev/null 2>&1 ; then + break + else + echo -n . + sleep 5 + fi + done + echo -n ' ' + ) + + echo "RESET the paddles server" +} + +function teardown_pulpito() { + if pkill -f 'python run.py' ; then + echo "SHUTDOWN the pulpito server" + fi +} + +function setup_pulpito() { + local pulpito=http://localhost:8081/ + + local pulpito_dir=$(dirname $0)/../../../pulpito + + if curl --silent $pulpito | grep -q pulpito ; then + echo "OK pulpito is running" + return 0 + fi + + if ! test -d $pulpito_dir ; then + git clone https://github.com/ceph/pulpito.git $pulpito_dir || return 1 + fi + + sudo apt-get -qq install -y nginx + local nginx_conf=/etc/nginx/sites-available/default + sudo sed -i '/text\/plain/a\ text\/plain log;' \ + /etc/nginx/mime.types + sudo perl -pi -e 's|root /var/www/html|root /usr/share/nginx/html|' $nginx_conf + if ! 
grep -qq 'autoindex on' $nginx_conf ; then + sudo perl -pi -e 's|location / {|location / { autoindex on;|' $nginx_conf + sudo /etc/init.d/nginx restart + sudo rm -f /usr/share/nginx/html/*.html + echo "ADDED autoindex on to nginx configuration" + fi + sudo chown $USER /usr/share/nginx/html + ( + cd $pulpito_dir || return 1 + git pull --rebase + git clean -ffqdx + sed -e "s|paddles_address.*|paddles_address = 'http://localhost:8080'|" < config.py.in > prod.py + virtualenv ./virtualenv + source ./virtualenv/bin/activate + pip install --upgrade pip + pip install 'setuptools>=58.0.0' + pip install -r requirements.txt + python run.py & + ) + + echo "LAUNCHED the pulpito server" +} + +function setup_bashrc() { + if test -f ~/.bashrc && grep -qq '.bashrc_teuthology' ~/.bashrc ; then + echo "OK .bashrc_teuthology found in ~/.bashrc" + else + cat > ~/.bashrc_teuthology <<'EOF' +source $HOME/openrc.sh +source $HOME/teuthology/virtualenv/bin/activate +export HISTSIZE=500000 +export PROMPT_COMMAND='history -a' +EOF + echo 'source $HOME/.bashrc_teuthology' >> ~/.bashrc + echo "ADDED .bashrc_teuthology to ~/.bashrc" + fi +} + +function setup_ssh_config() { + if test -f ~/.ssh/config && grep -qq 'StrictHostKeyChecking no' ~/.ssh/config ; then + echo "OK ~/.ssh/config" + else + cat >> ~/.ssh/config <> ~/.ssh/authorized_keys + chmod 600 teuthology/openstack/archive-key + echo "APPEND to ~/.ssh/authorized_keys" +} + +function setup_bootscript() { + local nworkers=$1 + + local where=$(dirname $0) + + sudo cp -a $where/openstack-teuthology.init /etc/init.d/teuthology + echo NWORKERS=$1 | sudo tee /etc/default/teuthology > /dev/null + echo "CREATED init script /etc/init.d/teuthology" +} + +function setup_crontab() { + local where=$(dirname $0) + crontab $where/openstack-teuthology.cron +} + +function remove_crontab() { + crontab -r +} + +function setup_ceph_workbench() { + local url=$1 + local branch=$2 + + ( + cd $HOME + source teuthology/virtualenv/bin/activate + if test "$url" ; 
then + git clone -b $branch $url + cd ceph-workbench + pip install -e . + echo "INSTALLED ceph-workbench from $url" + else + pip install ceph-workbench + echo "INSTALLED ceph-workbench from pypi" + fi + mkdir -p ~/.ceph-workbench + chmod 700 ~/.ceph-workbench + cp -a $HOME/openrc.sh ~/.ceph-workbench + cp -a $HOME/.ssh/id_rsa ~/.ceph-workbench/teuthology.pem + echo "RESET ceph-workbench credentials (key & OpenStack)" + ) +} + +function get_or_create_keypair() { + local keypair=$1 + + ( + cd $HOME/.ssh + if ! test -f $keypair.pem ; then + openstack keypair delete $keypair || true + openstack keypair create $keypair > $keypair.pem || return 1 + chmod 600 $keypair.pem + fi + if ! test -f $keypair.pub ; then + if ! ssh-keygen -y -f $keypair.pem > $keypair.pub ; then + cat $keypair.pub + return 1 + fi + fi + if ! openstack keypair show $keypair > $keypair.keypair 2>&1 ; then + openstack keypair create --public-key $keypair.pub $keypair || return 1 # noqa + else + fingerprint=$(ssh-keygen -l -f $keypair.pub | cut -d' ' -f2) + if ! grep --quiet $fingerprint $keypair.keypair ; then + openstack keypair delete $keypair || return 1 + openstack keypair create --public-key $keypair.pub $keypair || return 1 # noqa + fi + fi + ln -f $keypair.pem id_rsa + cat $keypair.pub >> authorized_keys + ) +} + +function delete_keypair() { + local keypair=$1 + + if openstack keypair show $keypair > /dev/null 2>&1 ; then + openstack keypair delete $keypair || return 1 + echo "REMOVED keypair $keypair" + fi +} + +function setup_dnsmasq() { + local provider=$1 + local dev=$2 + + if ! 
test -f /etc/dnsmasq.d/resolv ; then + resolver=$(grep nameserver /etc/resolv.conf | head -1 | perl -ne 'print $1 if(/\s*nameserver\s+([\d\.]+)/)') + sudo apt-get -qq install -y dnsmasq resolvconf + # FIXME: this opens up dnsmasq to DNS reflection/amplification attacks, and can be reverted + # FIXME: once we figure out how to configure dnsmasq to accept DNS queries from all subnets + sudo perl -pi -e 's/--local-service//' /etc/init.d/dnsmasq + echo resolv-file=/etc/dnsmasq-resolv.conf | sudo tee /etc/dnsmasq.d/resolv + echo nameserver $resolver | sudo tee /etc/dnsmasq-resolv.conf + # restart is not always picking up changes + sudo /etc/init.d/dnsmasq stop || true + sudo /etc/init.d/dnsmasq start + sudo sed -ie 's/^#IGNORE_RESOLVCONF=yes/IGNORE_RESOLVCONF=yes/' /etc/default/dnsmasq + echo nameserver 127.0.0.1 | sudo tee /etc/resolvconf/resolv.conf.d/head + sudo resolvconf -u + if test $provider = cloudlab ; then + sudo perl -pi -e 's/.*(prepend domain-name-servers 127.0.0.1;)/\1/' /etc/dhcp/dhclient.conf + sudo bash -c "ifdown $dev ; ifup $dev" + fi + echo "INSTALLED dnsmasq and configured to be a resolver" + else + echo "OK dnsmasq installed" + fi +} + +function subnet_names_and_ips() { + local subnet=$1 + python -c 'import netaddr; print("\n".join([str(i) for i in netaddr.IPNetwork("'$subnet'")]))' | + sed -e 's/\./ /g' | while read a b c d ; do + printf "target%03d%03d%03d%03d " $a $b $c $d + echo $a.$b.$c.$d + done +} + +function define_dnsmasq() { + local subnets="$1" + local labdomain=$2 + local host_records=/etc/dnsmasq.d/teuthology + if ! test -f $host_records ; then + for subnet in $subnets ; do + subnet_names_and_ips $subnet | while read name ip ; do + echo host-record=$name.$labdomain,$ip + done + done | sudo tee $host_records > /tmp/dnsmasq + head -2 /tmp/dnsmasq + echo 'etc.' 
+ # restart is not always picking up changes + sudo /etc/init.d/dnsmasq stop || true + sudo /etc/init.d/dnsmasq start + echo "CREATED $host_records" + else + echo "OK $host_records exists" + fi +} + +function undefine_dnsmasq() { + local host_records=/etc/dnsmasq.d/teuthology + + sudo rm -f $host_records + echo "REMOVED $host_records" +} + +function setup_ansible() { + local subnets="$1" + local labdomain=$2 + local dir=/etc/ansible/hosts + if ! test -f $dir/teuthology ; then + sudo mkdir -p $dir/group_vars + echo '[testnodes]' | sudo tee $dir/teuthology + for subnet in $subnets ; do + subnet_names_and_ips $subnet | while read name ip ; do + echo $name.$labdomain + done + done | sudo tee -a $dir/teuthology > /tmp/ansible + head -2 /tmp/ansible + echo 'etc.' + echo 'modify_fstab: false' | sudo tee $dir/group_vars/all.yml + echo "CREATED $dir/teuthology" + else + echo "OK $dir/teuthology exists" + fi +} + +function teardown_ansible() { + sudo rm -fr /etc/ansible/hosts/teuthology +} + +function remove_images() { + glance image-list --property-filter ownedby=teuthology | grep -v -e ---- -e 'Disk Format' | cut -f4 -d ' ' | while read image ; do + echo "DELETED image $image" + glance image-delete $image + done +} + +function install_packages() { + source /etc/os-release + if ! $VERSION_CODENAME; then + echo "ERROR: VERSION_CODENAME is not set. Cannot proceed with Docker installation." + return + fi + local codename=$VERSION_CODENAME + local backports_file="/etc/apt/sources.list.d/${codename}-backports.list" + if [ ! -f "$backports_file" ]; then + echo "Adding backports repo for $codename..." + echo "deb http://archive.ubuntu.com/ubuntu ${codename}-backports main universe" | sudo tee "$backports_file" + sudo apt-get update + fi + local packages="jq curl" + sudo apt-get -qq install -y $packages + echo "INSTALL required packages $packages" +} + +CAT=${CAT:-cat} + +function verify_openstack() { + if ! 
openstack server list --format json > /dev/null ; then + echo ERROR: the credentials from ~/openrc.sh are not working >&2 + return 1 + fi + echo "OK $OS_TENANT_NAME can use $OS_AUTH_URL" >&2 + local provider + if echo $OS_AUTH_URL | grep -qq cloud.ovh.net ; then + provider=ovh + elif echo $OS_AUTH_URL | grep -qq entercloudsuite.com ; then + provider=entercloudsuite + elif echo $OS_AUTH_URL | grep -qq cloudlab.us ; then + provider=cloudlab + else + provider=any + fi + echo "OPENSTACK PROVIDER $provider" >&2 + echo $provider +} + +function main() { + local network + local subnets + local nameserver + local labdomain=teuthology + local nworkers=2 + local keypair=teuthology + local selfname=teuthology + local server_name=teuthology + local server_group=teuthology + local worker_group=teuthology + local package_repo=packages-repository + local archive_upload + local ceph_workbench_git_url + local ceph_workbench_branch + + local do_setup_keypair=false + local do_apt_get_update=false + local do_setup_docker=false + local do_setup_salt_master=false + local do_ceph_workbench=false + local do_create_config=false + local do_setup_dnsmasq=false + local do_install_packages=false + local do_setup_paddles=false + local do_populate_paddles=false + local do_setup_pulpito=false + local do_clobber=false + local canonical_tags=true + + export LC_ALL=C + + while [ $# -ge 1 ]; do + case $1 in + --verbose) + set -x + PS4='${FUNCNAME[0]}: $LINENO: ' + ;; + --nameserver) + shift + nameserver=$1 + ;; + --subnets) + shift + subnets=$1 + ;; + --labdomain) + shift + labdomain=$1 + ;; + --network) + shift + network=$1 + ;; + --nworkers) + shift + nworkers=$1 + ;; + --archive-upload) + shift + archive_upload=$1 + ;; + --ceph-workbench-git-url) + shift + ceph_workbench_git_url=$1 + ;; + --ceph-workbench-branch) + shift + ceph_workbench_branch=$1 + ;; + --install) + do_install_packages=true + ;; + --config) + do_create_config=true + ;; + --setup-docker) + do_apt_get_update=true + 
do_setup_docker=true + ;; + --setup-salt-master) + do_apt_get_update=true + do_setup_salt_master=true + ;; + --server-name) + shift + server_name=$1 + ;; + --server-group) + shift + server_group=$1 + ;; + --worker-group) + shift + worker_group=$1 + ;; + --package-repo) + shift + package_repo=$1 + ;; + --selfname) + shift + selfname=$1 + ;; + --keypair) + shift + keypair=$1 + ;; + --setup-keypair) + do_setup_keypair=true + ;; + --setup-ceph-workbench) + do_ceph_workbench=true + ;; + --setup-dnsmasq) + do_setup_dnsmasq=true + ;; + --setup-fail2ban) + do_setup_fail2ban=true + ;; + --setup-paddles) + do_setup_paddles=true + ;; + --setup-pulpito) + do_setup_pulpito=true + ;; + --populate-paddles) + do_populate_paddles=true + ;; + --setup-all) + do_install_packages=true + do_ceph_workbench=true + do_create_config=true + do_setup_keypair=true + do_apt_get_update=true + do_setup_docker=true + do_setup_salt_master=true + do_setup_dnsmasq=true + do_setup_fail2ban=true + do_setup_paddles=true + do_setup_pulpito=true + do_populate_paddles=true + ;; + --clobber) + do_clobber=true + ;; + --no-canonical-tags) + canonical_tags=false + ;; + *) + echo $1 is not a known option + return 1 + ;; + esac + shift + done + + if $do_install_packages ; then + install_packages || return 1 + fi + + local provider=$(verify_openstack) + + # + # assume the first available IPv4 subnet is going to be used to assign IP to the instance + # + [ -z "$network" ] && { + local default_subnets=$(openstack subnet list --ip-version 4 -f json | jq -r '.[] | select(.Name != null) | .Subnet' | sort | uniq) + } || { + local network_id=$(openstack network list -f json | jq -r ".[] | select(.Name == \"$network\") | .ID") + local default_subnets=$(openstack subnet list --ip-version 4 -f json \ + | jq -r ".[] | select(.Network == \"$network_id\") | .Subnet" | sort | uniq) + } + subnets=$(echo $subnets $default_subnets) + echo "subnets: $subnets" + + case $provider in + entercloudsuite) + eval network=$(neutron 
net-list -f json | jq '.[] | select(.subnets | contains("'$subnet'")) | .name') + ;; + cloudlab) + network='flat-lan-1-net' + subnet='10.11.10.0/24' + ;; + esac + + local ip + for dev in $(ip -o link show | awk -F': ' '{print $2}' | grep -v '^lo$'); do + ip=$(ip -4 addr show dev "$dev" | awk '/inet / {print $2}' | cut -d/ -f1) + if [ -n "$ip" ]; then + nameserver="$ip" + break + fi + done + + local teuthology_branch="$(git -C $(dirname $0)/../../../teuthology rev-parse --abbrev-ref HEAD)" + local teuthology_git_url="$(git -C $(dirname $0)/../../../teuthology config --get remote.origin.url)" + + if $do_create_config ; then + create_config "$network" "$subnets" "$nameserver" "$labdomain" "$ip" \ + "$archive_upload" "$canonical_tags" "$selfname" "$keypair" \ + "$server_name" "$server_group" "$worker_group" "$package_repo" "$teuthology_branch" "$teuthology_git_url" || return 1 + setup_ansible "$subnets" $labdomain || return 1 + setup_ssh_config || return 1 + setup_authorized_keys || return 1 + setup_bashrc || return 1 + setup_bootscript $nworkers || return 1 + setup_crontab || return 1 + fi + + if $do_setup_keypair ; then + get_or_create_keypair $keypair || return 1 + fi + + if $do_ceph_workbench ; then + setup_ceph_workbench $ceph_workbench_git_url $ceph_workbench_branch || return 1 + fi + + if $do_apt_get_update ; then + apt_get_update || return 1 + fi + + if test $provider != "cloudlab" && $do_setup_docker ; then + setup_docker || return 1 + fi + + if $do_setup_salt_master ; then + setup_salt_master || return 1 + fi + + if $do_setup_fail2ban ; then + setup_fail2ban || return 1 + fi + + if $do_setup_dnsmasq ; then + setup_dnsmasq $provider $dev || return 1 + define_dnsmasq "$subnets" $labdomain || return 1 + fi + + if $do_setup_paddles ; then + setup_paddles $ip || return 1 + fi + + if $do_populate_paddles ; then + populate_paddles "$subnets" $labdomain || return 1 + fi + + if $do_setup_pulpito ; then + setup_pulpito || return 1 + fi + + if $do_clobber ; then + 
undefine_dnsmasq || return 1 + delete_keypair $keypair || return 1 + teardown_paddles || return 1 + teardown_pulpito || return 1 + teardown_ansible || return 1 + remove_images || return 1 + remove_crontab || return 1 + fi +} + +main "$@" diff --git a/teuthology/openstack/test/__init__.py b/teuthology/openstack/test/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/teuthology/openstack/test/archive-on-error.yaml b/teuthology/openstack/test/archive-on-error.yaml new file mode 100644 index 000000000..f9f524792 --- /dev/null +++ b/teuthology/openstack/test/archive-on-error.yaml @@ -0,0 +1 @@ +archive-on-error: true diff --git a/teuthology/openstack/test/noop.yaml b/teuthology/openstack/test/noop.yaml new file mode 100644 index 000000000..6aae7ec90 --- /dev/null +++ b/teuthology/openstack/test/noop.yaml @@ -0,0 +1,12 @@ +stop_worker: true +machine_type: openstack +os_type: ubuntu +os_version: "14.04" +roles: +- - mon.a + - osd.0 +tasks: +- exec: + mon.a: + - echo "Well done !" + diff --git a/teuthology/openstack/test/openstack-integration.py b/teuthology/openstack/test/openstack-integration.py new file mode 100644 index 000000000..5d1a266f6 --- /dev/null +++ b/teuthology/openstack/test/openstack-integration.py @@ -0,0 +1,243 @@ +# +# Copyright (c) 2015, 2016 Red Hat, Inc. +# +# Author: Loic Dachary +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +import argparse +import logging +import json +import os +import subprocess +import tempfile +import shutil + +import teuthology.lock +import teuthology.lock.cli +import teuthology.lock.query +import teuthology.lock.util +import teuthology.misc +import teuthology.schedule +import teuthology.suite +import teuthology.openstack +import scripts.schedule +import scripts.lock +import scripts.suite +from teuthology.config import config as teuth_config +from teuthology.config import set_config_attr + + +class Integration(object): + + @classmethod + def setup_class(self): + teuthology.log.setLevel(logging.DEBUG) + set_config_attr(argparse.Namespace()) + self.teardown_class() + + @classmethod + def teardown_class(self): + os.system("sudo /etc/init.d/beanstalkd restart") + # if this fails it will not show the error but some weird + # INTERNALERROR> IndexError: list index out of range + # move that to def tearDown for debug and when it works move it + # back in tearDownClass so it is not called on every test + ownedby = "ownedby='" + teuth_config.openstack['ip'] + all_instances = teuthology.openstack.OpenStack().run( + "server list -f json --long") + for instance in json.loads(all_instances): + if ownedby in instance['Properties']: + teuthology.openstack.OpenStack().run( + "server delete --wait " + instance['ID']) + + def setup_worker(self): + self.logs = self.d + "/log" + os.mkdir(self.logs, 0o755) + self.archive = self.d + "/archive" + os.mkdir(self.archive, 0o755) + self.worker_cmd = 
("teuthology-worker --tube openstack " + + "-l " + self.logs + " " + "--archive-dir " + self.archive + " ") + logging.info(self.worker_cmd) + self.worker = subprocess.Popen(self.worker_cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True) + + def wait_worker(self): + if not self.worker: + return + + (stdoutdata, stderrdata) = self.worker.communicate() + stdoutdata = stdoutdata.decode('utf-8') + stderrdata = stderrdata.decode('utf-8') + logging.info(self.worker_cmd + ":" + + " stdout " + stdoutdata + + " stderr " + stderrdata + " end ") + assert self.worker.returncode == 0 + self.worker = None + + def get_teuthology_log(self): + # the archive is removed before each test, there must + # be only one run and one job + run = os.listdir(self.archive)[0] + job = os.listdir(os.path.join(self.archive, run))[0] + path = os.path.join(self.archive, run, job, 'teuthology.log') + return open(path, 'r').read() + +class TestSuite(Integration): + + def setup_method(self): + self.d = tempfile.mkdtemp() + self.setup_worker() + logging.info("TestSuite: done worker") + + def teardown(self): + self.wait_worker() + shutil.rmtree(self.d) + + def test_suite_noop(self): + cwd = os.getcwd() + os.mkdir(self.d + '/upload', 0o755) + upload = 'localhost:' + self.d + '/upload' + args = ['--suite', 'noop', + '--suite-dir', cwd + '/teuthology/openstack/test', + '--machine-type', 'openstack', + '--archive-upload', upload, + '--verbose'] + logging.info("TestSuite:test_suite_noop") + scripts.suite.main(args) + self.wait_worker() + log = self.get_teuthology_log() + assert "teuthology.run:pass" in log + assert "Well done" in log + upload_key = teuth_config.archive_upload_key + if upload_key: + ssh = "RSYNC_RSH='ssh -i " + upload_key + "'" + else: + ssh = '' + assert 'teuthology.log' in teuthology.misc.sh(ssh + " rsync -av " + upload) + +class TestSchedule(Integration): + + def setup_method(self): + self.d = tempfile.mkdtemp() + self.setup_worker() + + def teardown(self): + 
self.wait_worker() + shutil.rmtree(self.d) + + def test_schedule_stop_worker(self): + job = 'teuthology/openstack/test/stop_worker.yaml' + args = ['--name', 'fake', + '--verbose', + '--owner', 'test@test.com', + '--worker', 'openstack', + job] + scripts.schedule.main(args) + self.wait_worker() + + def test_schedule_noop(self): + job = 'teuthology/openstack/test/noop.yaml' + args = ['--name', 'fake', + '--verbose', + '--owner', 'test@test.com', + '--worker', 'openstack', + job] + scripts.schedule.main(args) + self.wait_worker() + log = self.get_teuthology_log() + assert "teuthology.run:pass" in log + assert "Well done" in log + + def test_schedule_resources_hint(self): + """It is tricky to test resources hint in a provider agnostic way. The + best way seems to ask for at least 1GB of RAM and 10GB + disk. Some providers do not offer a 1GB RAM flavor (OVH for + instance) and the 2GB RAM will be chosen instead. It however + seems unlikely that a 4GB RAM will be chosen because it would + mean such a provider has nothing under that limit and it's a + little too high. + + Since the default when installing is to ask for 7000 MB, we + can reasonably assume that the hint has been taken into + account if the instance has less than 4GB RAM. 
+ """ + try: + teuthology.openstack.OpenStack().run("volume list") + job = 'teuthology/openstack/test/resources_hint.yaml' + has_cinder = True + except subprocess.CalledProcessError: + job = 'teuthology/openstack/test/resources_hint_no_cinder.yaml' + has_cinder = False + args = ['--name', 'fake', + '--verbose', + '--owner', 'test@test.com', + '--worker', 'openstack', + job] + scripts.schedule.main(args) + self.wait_worker() + log = self.get_teuthology_log() + assert "teuthology.run:pass" in log + assert "RAM size ok" in log + if has_cinder: + assert "Disk size ok" in log + +class TestLock(Integration): + + def setup_method(self): + self.options = ['--verbose', + '--machine-type', 'openstack' ] + + def test_main(self): + args = scripts.lock.parse_args(self.options + ['--lock']) + assert teuthology.lock.cli.main(args) == 0 + + def test_lock_unlock(self): + default_archs = teuthology.openstack.OpenStack().get_available_archs() + if 'TEST_IMAGES' in os.environ: + images = os.environ['TEST_IMAGES'].split() + else: + images = teuthology.openstack.OpenStack.image2url.keys() + for image in images: + (os_type, os_version, arch) = image.split('-') + if arch not in default_archs: + logging.info("skipping " + image + " because arch " + + " is not supported (" + str(default_archs) + ")") + continue + args = scripts.lock.parse_args(self.options + + ['--lock-many', '1', + '--os-type', os_type, + '--os-version', os_version, + '--arch', arch]) + assert teuthology.lock.cli.main(args) == 0 + locks = teuthology.lock.query.list_locks(locked=True) + assert len(locks) == 1 + args = scripts.lock.parse_args(self.options + + ['--unlock', locks[0]['name']]) + assert teuthology.lock.cli.main(args) == 0 + + def test_list(self, capsys): + args = scripts.lock.parse_args(self.options + ['--list', '--all']) + teuthology.lock.cli.main(args) + out, err = capsys.readouterr() + assert 'machine_type' in out + assert 'openstack' in out diff --git a/teuthology/openstack/test/resources_hint.yaml 
b/teuthology/openstack/test/resources_hint.yaml new file mode 100644 index 000000000..b8f595964 --- /dev/null +++ b/teuthology/openstack/test/resources_hint.yaml @@ -0,0 +1,25 @@ +stop_worker: true +machine_type: openstack +openstack: + - machine: + disk: 10 # GB + ram: 10000 # MB + cpus: 1 + volumes: + count: 1 + size: 2 # GB +os_type: ubuntu +os_version: "14.04" +roles: +- - mon.a + - osd.0 +tasks: +- exec: + mon.a: + - test $(sed -n -e 's/MemTotal.* \([0-9][0-9]*\).*/\1/p' < /proc/meminfo) -ge 10000000 && echo "RAM" "size" "ok" + - cat /proc/meminfo +# wait for the attached volume to show up + - for delay in 1 2 4 8 16 32 64 128 256 512 ; do if test -e /sys/block/vdb/size ; then break ; else sleep $delay ; fi ; done +# 4000000 because 512 bytes sectors + - test $(cat /sys/block/vdb/size) -gt 4000000 && echo "Disk" "size" "ok" + - cat /sys/block/vdb/size diff --git a/teuthology/openstack/test/resources_hint_no_cinder.yaml b/teuthology/openstack/test/resources_hint_no_cinder.yaml new file mode 100644 index 000000000..c603804a5 --- /dev/null +++ b/teuthology/openstack/test/resources_hint_no_cinder.yaml @@ -0,0 +1,20 @@ +stop_worker: true +machine_type: openstack +openstack: + - machine: + disk: 10 # GB + ram: 10000 # MB + cpus: 1 + volumes: + count: 0 + size: 2 # GB +os_type: ubuntu +os_version: "14.04" +roles: +- - mon.a + - osd.0 +tasks: +- exec: + mon.a: + - cat /proc/meminfo + - test $(sed -n -e 's/MemTotal.* \([0-9][0-9]*\).*/\1/p' < /proc/meminfo) -ge 10000000 && echo "RAM" "size" "ok" diff --git a/teuthology/openstack/test/stop_worker.yaml b/teuthology/openstack/test/stop_worker.yaml new file mode 100644 index 000000000..45133bb00 --- /dev/null +++ b/teuthology/openstack/test/stop_worker.yaml @@ -0,0 +1 @@ +stop_worker: true diff --git a/teuthology/openstack/test/suites/noop/+ b/teuthology/openstack/test/suites/noop/+ new file mode 100644 index 000000000..e69de29bb diff --git a/teuthology/openstack/test/suites/noop/noop.yaml 
b/teuthology/openstack/test/suites/noop/noop.yaml new file mode 100644 index 000000000..49497c228 --- /dev/null +++ b/teuthology/openstack/test/suites/noop/noop.yaml @@ -0,0 +1,9 @@ +stop_worker: true +roles: +- - mon.a + - osd.0 +tasks: +- exec: + mon.a: + - echo "Well done !" + diff --git a/teuthology/openstack/test/suites/nuke/+ b/teuthology/openstack/test/suites/nuke/+ new file mode 100644 index 000000000..e69de29bb diff --git a/teuthology/openstack/test/test_config.py b/teuthology/openstack/test/test_config.py new file mode 100644 index 000000000..530aea927 --- /dev/null +++ b/teuthology/openstack/test/test_config.py @@ -0,0 +1,35 @@ +from teuthology.config import config + + +class TestOpenStack(object): + + def setup_method(self): + self.openstack_config = config['openstack'] + + def test_config_clone(self): + assert 'clone' in self.openstack_config + + def test_config_user_data(self): + os_type = 'rhel' + os_version = '7.0' + template_path = self.openstack_config['user-data'].format( + os_type=os_type, + os_version=os_version) + assert os_type in template_path + assert os_version in template_path + + def test_config_ip(self): + assert 'ip' in self.openstack_config + + def test_config_machine(self): + assert 'machine' in self.openstack_config + machine_config = self.openstack_config['machine'] + assert 'disk' in machine_config + assert 'ram' in machine_config + assert 'cpus' in machine_config + + def test_config_volumes(self): + assert 'volumes' in self.openstack_config + volumes_config = self.openstack_config['volumes'] + assert 'count' in volumes_config + assert 'size' in volumes_config diff --git a/teuthology/openstack/test/test_openstack.py b/teuthology/openstack/test/test_openstack.py new file mode 100644 index 000000000..1f34d82a0 --- /dev/null +++ b/teuthology/openstack/test/test_openstack.py @@ -0,0 +1,1695 @@ +# +# Copyright (c) 2015,2016 Red Hat, Inc. 
+# +# Author: Loic Dachary +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+# +import argparse +import logging +import os +import pytest +import subprocess +import tempfile +import time +from mock import patch + +import teuthology +from teuthology import misc +from teuthology.config import set_config_attr +from teuthology.openstack import TeuthologyOpenStack, OpenStack, OpenStackInstance +from teuthology.openstack import NoFlavorException +import scripts.openstack + + +class TestOpenStackBase(object): + + def setup_method(self): + OpenStack.token = None + OpenStack.token_expires = None + self.environ = {} + for k in os.environ.keys(): + if k.startswith('OS_'): + self.environ[k] = os.environ[k] + + def teardown_method(self): + OpenStack.token = None + OpenStack.token_expires = None + for k in os.environ.keys(): + if k.startswith('OS_'): + if k in self.environ: + os.environ[k] = self.environ[k] + else: + del os.environ[k] + +class TestOpenStackInstance(TestOpenStackBase): + + teuthology_instance = """ +{ + "OS-EXT-STS:task_state": null, + "addresses": "Ext-Net=167.114.233.32", + "image": "Ubuntu 14.04 (0d315a8d-75e3-418a-80e4-48e62d599627)", + "OS-EXT-STS:vm_state": "active", + "OS-SRV-USG:launched_at": "2015-08-17T12:22:13.000000", + "flavor": "vps-ssd-1 (164fcc7e-7771-414f-a607-b388cb7b7aa0)", + "id": "f3ca32d7-212b-458b-a0d4-57d1085af953", + "security_groups": [ + { + "name": "default" + } + ], + "user_id": "3a075820e5d24fda96cd340b87fd94e9", + "OS-DCF:diskConfig": "AUTO", + "accessIPv4": "", + "accessIPv6": "", + "progress": 0, + "OS-EXT-STS:power_state": 1, + "project_id": "62cf1be03cec403c8ed8e64df55732ea", + "config_drive": "", + "status": "ACTIVE", + "updated": "2015-11-03T13:48:53Z", + "hostId": "bcdf964b6f724e573c07156ff85b4db1707f6f0969f571cf33e0468d", + "OS-SRV-USG:terminated_at": null, + "key_name": "loic", + "properties": "", + "OS-EXT-AZ:availability_zone": "nova", + "name": "mrdarkdragon", + "created": "2015-08-17T12:21:31Z", + "os-extended-volumes:volumes_attached": [{"id": "627e2631-fbb3-48cd-b801-d29cd2a76f74"}, {"id": 
"09837649-0881-4ee2-a560-adabefc28764"}, {"id": "44e5175b-6044-40be-885a-c9ddfb6f75bb"}] +} + """ + + teuthology_instance_no_addresses = """ +{ + "OS-EXT-STS:task_state": null, + "addresses": "", + "image": "Ubuntu 14.04 (0d315a8d-75e3-418a-80e4-48e62d599627)", + "OS-EXT-STS:vm_state": "active", + "OS-SRV-USG:launched_at": "2015-08-17T12:22:13.000000", + "flavor": "vps-ssd-1 (164fcc7e-7771-414f-a607-b388cb7b7aa0)", + "id": "f3ca32d7-212b-458b-a0d4-57d1085af953", + "security_groups": [ + { + "name": "default" + } + ], + "user_id": "3a075820e5d24fda96cd340b87fd94e9", + "OS-DCF:diskConfig": "AUTO", + "accessIPv4": "", + "accessIPv6": "", + "progress": 0, + "OS-EXT-STS:power_state": 1, + "project_id": "62cf1be03cec403c8ed8e64df55732ea", + "config_drive": "", + "status": "ACTIVE", + "updated": "2015-11-03T13:48:53Z", + "hostId": "bcdf964b6f724e573c07156ff85b4db1707f6f0969f571cf33e0468d", + "OS-SRV-USG:terminated_at": null, + "key_name": "loic", + "properties": "", + "OS-EXT-AZ:availability_zone": "nova", + "name": "mrdarkdragon", + "created": "2015-08-17T12:21:31Z", + "os-extended-volumes:volumes_attached": [] +} + """ + + @classmethod + def setup_class(self): + if 'OS_AUTH_URL' not in os.environ: + pytest.skip('no OS_AUTH_URL environment variable') + + def test_init(self): + with patch.multiple( + misc, + sh=lambda cmd: self.teuthology_instance, + ): + o = OpenStackInstance('NAME') + assert o['id'] == 'f3ca32d7-212b-458b-a0d4-57d1085af953' + o = OpenStackInstance('NAME', {"id": "OTHER"}) + assert o['id'] == "OTHER" + + def test_get_created(self): + with patch.multiple( + misc, + sh=lambda cmd: self.teuthology_instance, + ): + o = OpenStackInstance('NAME') + assert o.get_created() > 0 + + def test_exists(self): + with patch.multiple( + misc, + sh=lambda cmd: self.teuthology_instance, + ): + o = OpenStackInstance('NAME') + assert o.exists() + def sh_raises(cmd): + raise subprocess.CalledProcessError('FAIL', 'BAD') + with patch.multiple( + misc, + sh=sh_raises, + ): + o = 
OpenStackInstance('NAME') + assert not o.exists() + + def test_volumes(self): + with patch.multiple( + misc, + sh=lambda cmd: self.teuthology_instance, + ): + o = OpenStackInstance('NAME') + assert len(o.get_volumes()) == 3 + + def test_get_addresses(self): + answers = [ + self.teuthology_instance_no_addresses, + self.teuthology_instance, + ] + def sh(self): + return answers.pop(0) + with patch.multiple( + misc, + sh=sh, + ): + o = OpenStackInstance('NAME') + assert o.get_addresses() == 'Ext-Net=167.114.233.32' + + def test_get_ip_neutron(self): + instance_id = '8e1fd70a-3065-46f8-9c30-84dc028c1834' + ip = '10.10.10.4' + def sh(cmd): + if 'neutron subnet-list' in cmd: + return """ +[ + { + "ip_version": 6, + "id": "c45b9661-b2ba-4817-9e3a-f8f63bf32989" + }, + { + "ip_version": 4, + "id": "e03a3dbc-afc8-4b52-952e-7bf755397b50" + } +] + """ + elif 'neutron port-list' in cmd: + return (""" +[ + { + "device_id": "915504ad-368b-4cce-be7c-4f8a83902e28", + "fixed_ips": "{\\"subnet_id\\": \\"e03a3dbc-afc8-4b52-952e-7bf755397b50\\", \\"ip_address\\": \\"10.10.10.1\\"}\\n{\\"subnet_id\\": \\"c45b9661-b2ba-4817-9e3a-f8f63bf32989\\", \\"ip_address\\": \\"2607:f298:6050:9afc::1\\"}" + }, + { + "device_id": "{instance_id}", + "fixed_ips": "{\\"subnet_id\\": \\"e03a3dbc-afc8-4b52-952e-7bf755397b50\\", \\"ip_address\\": \\"{ip}\\"}\\n{\\"subnet_id\\": \\"c45b9661-b2ba-4817-9e3a-f8f63bf32989\\", \\"ip_address\\": \\"2607:f298:6050:9afc:f816:3eff:fe07:76c1\\"}" + }, + { + "device_id": "17e4a968-4caa-4cee-8e4b-f950683a02bd", + "fixed_ips": "{\\"subnet_id\\": \\"e03a3dbc-afc8-4b52-952e-7bf755397b50\\", \\"ip_address\\": \\"10.10.10.5\\"}\\n{\\"subnet_id\\": \\"c45b9661-b2ba-4817-9e3a-f8f63bf32989\\", \\"ip_address\\": \\"2607:f298:6050:9afc:f816:3eff:fe9c:37f0\\"}" + } +] + """.replace('{instance_id}', instance_id). 
+ replace('{ip}', ip)) + else: + raise Exception("unexpected " + cmd) + with patch.multiple( + misc, + sh=sh, + ): + assert ip == OpenStackInstance( + instance_id, + { 'id': instance_id }, + ).get_ip_neutron() + +class TestOpenStack(TestOpenStackBase): + + flavors = """[ + { + "Name": "eg-120-ssd", + "RAM": 120000, + "Ephemeral": 0, + "VCPUs": 32, + "Is Public": true, + "Disk": 800, + "ID": "008f75de-c467-4d15-8f70-79c8fbe19538" + }, + { + "Name": "hg-60", + "RAM": 60000, + "Ephemeral": 0, + "VCPUs": 16, + "Is Public": true, + "Disk": 1600, + "ID": "0297d7ac-fe6f-4ff1-b6e7-0b8b0908c94f" + }, + { + "Name": "win-sp-120-ssd-flex", + "RAM": 120000, + "Ephemeral": 0, + "VCPUs": 8, + "Is Public": true, + "Disk": 50, + "ID": "039e31f2-6541-46c8-85cf-7f47fab7ad78" + }, + { + "Name": "win-sp-60", + "RAM": 60000, + "Ephemeral": 0, + "VCPUs": 4, + "Is Public": true, + "Disk": 400, + "ID": "0417a0e6-f68a-4b8b-a642-ca5ecb9652f7" + }, + { + "Name": "hg-120-ssd", + "RAM": 120000, + "Ephemeral": 0, + "VCPUs": 32, + "Is Public": true, + "Disk": 800, + "ID": "042aefc6-b713-4a7e-ada5-3ff81daa1960" + }, + { + "Name": "win-sp-60-flex", + "RAM": 60000, + "Ephemeral": 0, + "VCPUs": 4, + "Is Public": true, + "Disk": 50, + "ID": "0609290c-ad2a-40f0-8c66-c755dd38fe3f" + }, + { + "Name": "win-eg-120", + "RAM": 120000, + "Ephemeral": 0, + "VCPUs": 32, + "Is Public": true, + "Disk": 800, + "ID": "0651080f-5d07-44b1-a759-7ea4594b669e" + }, + { + "Name": "win-sp-240", + "RAM": 240000, + "Ephemeral": 0, + "VCPUs": 16, + "Is Public": true, + "Disk": 1600, + "ID": "07885848-8831-486d-8525-91484c09cc7e" + }, + { + "Name": "win-hg-60-ssd", + "RAM": 60000, + "Ephemeral": 0, + "VCPUs": 16, + "Is Public": true, + "Disk": 800, + "ID": "079aa0a2-5e48-4e58-8205-719bc962736e" + }, + { + "Name": "eg-120", + "RAM": 120000, + "Ephemeral": 0, + "VCPUs": 32, + "Is Public": true, + "Disk": 1600, + "ID": "090f8b8c-673c-4ab8-9a07-6e54a8776e7b" + }, + { + "Name": "win-hg-15-ssd-flex", + "RAM": 15000, + "Ephemeral": 
0, + "VCPUs": 4, + "Is Public": true, + "Disk": 50, + "ID": "10e10c58-d29f-4ff6-a1fd-085c35a3bd9b" + }, + { + "Name": "eg-15-ssd", + "RAM": 15000, + "Ephemeral": 0, + "VCPUs": 4, + "Is Public": true, + "Disk": 200, + "ID": "1340a920-0f2f-4c1b-8d74-e2502258da73" + }, + { + "Name": "win-eg-30-ssd-flex", + "RAM": 30000, + "Ephemeral": 0, + "VCPUs": 8, + "Is Public": true, + "Disk": 50, + "ID": "13e54752-fbd0-47a6-aa93-e5a67dfbc743" + }, + { + "Name": "eg-120-ssd-flex", + "RAM": 120000, + "Ephemeral": 0, + "VCPUs": 32, + "Is Public": true, + "Disk": 50, + "ID": "15c07a54-2dfb-41d9-aa73-6989fd8cafc2" + }, + { + "Name": "win-eg-120-ssd-flex", + "RAM": 120000, + "Ephemeral": 0, + "VCPUs": 32, + "Is Public": true, + "Disk": 50, + "ID": "15e0dfcc-10f4-4e70-8ac1-30bc323273e2" + }, + { + "Name": "vps-ssd-1", + "RAM": 2000, + "Ephemeral": 0, + "VCPUs": 1, + "Is Public": true, + "Disk": 10, + "ID": "164fcc7e-7771-414f-a607-b388cb7b7aa0" + }, + { + "Name": "win-sp-120-flex", + "RAM": 120000, + "Ephemeral": 0, + "VCPUs": 8, + "Is Public": true, + "Disk": 50, + "ID": "169415e1-0979-4527-94fb-638c885bbd8c" + }, + { + "Name": "win-hg-60-flex", + "RAM": 60000, + "Ephemeral": 0, + "VCPUs": 16, + "Is Public": true, + "Disk": 50, + "ID": "16f13d5b-be27-4b8b-88da-959d3904d3ba" + }, + { + "Name": "win-sp-30-ssd", + "RAM": 30000, + "Ephemeral": 0, + "VCPUs": 2, + "Is Public": true, + "Disk": 100, + "ID": "1788102b-ab80-4a0c-b819-541deaca7515" + }, + { + "Name": "win-sp-240-flex", + "RAM": 240000, + "Ephemeral": 0, + "VCPUs": 16, + "Is Public": true, + "Disk": 50, + "ID": "17bcfa14-135f-442f-9397-a4dc25265560" + }, + { + "Name": "win-eg-60-ssd-flex", + "RAM": 60000, + "Ephemeral": 0, + "VCPUs": 16, + "Is Public": true, + "Disk": 50, + "ID": "194ca9ba-04af-4d86-ba37-d7da883a7eab" + }, + { + "Name": "win-eg-60-flex", + "RAM": 60000, + "Ephemeral": 0, + "VCPUs": 16, + "Is Public": true, + "Disk": 50, + "ID": "19ff8837-4751-4f6c-a82b-290bc53c83c1" + }, + { + "Name": "win-eg-30-flex", + "RAM": 
30000, + "Ephemeral": 0, + "VCPUs": 8, + "Is Public": true, + "Disk": 50, + "ID": "1aaef5e5-4df9-4462-80d3-701683ab9ff0" + }, + { + "Name": "eg-15", + "RAM": 15000, + "Ephemeral": 0, + "VCPUs": 4, + "Is Public": true, + "Disk": 400, + "ID": "1cd85b81-5e4d-477a-a127-eb496b1d75de" + }, + { + "Name": "hg-120", + "RAM": 120000, + "Ephemeral": 0, + "VCPUs": 32, + "Is Public": true, + "Disk": 1600, + "ID": "1f1efedf-ec91-4a42-acd7-f5cf64b02d3c" + }, + { + "Name": "hg-15-ssd-flex", + "RAM": 15000, + "Ephemeral": 0, + "VCPUs": 4, + "Is Public": true, + "Disk": 50, + "ID": "20347a07-a289-4c07-a645-93cb5e8e2d30" + }, + { + "Name": "win-eg-7-ssd", + "RAM": 7000, + "Ephemeral": 0, + "VCPUs": 2, + "Is Public": true, + "Disk": 100, + "ID": "20689394-bd77-4f4d-900e-52cc8a86aeb4" + }, + { + "Name": "win-sp-60-ssd-flex", + "RAM": 60000, + "Ephemeral": 0, + "VCPUs": 4, + "Is Public": true, + "Disk": 50, + "ID": "21104d99-ba7b-47a0-9133-7e884710089b" + }, + { + "Name": "win-sp-120-ssd", + "RAM": 120000, + "Ephemeral": 0, + "VCPUs": 8, + "Is Public": true, + "Disk": 400, + "ID": "23c21ecc-9ee8-4ad3-bd9f-aa17a3faf84e" + }, + { + "Name": "win-hg-15-ssd", + "RAM": 15000, + "Ephemeral": 0, + "VCPUs": 4, + "Is Public": true, + "Disk": 200, + "ID": "24e293ed-bc54-4f26-8fb7-7b9457d08e66" + }, + { + "Name": "eg-15-ssd-flex", + "RAM": 15000, + "Ephemeral": 0, + "VCPUs": 4, + "Is Public": true, + "Disk": 50, + "ID": "25f3534a-89e5-489d-aa8b-63f62e76875b" + }, + { + "Name": "win-eg-60", + "RAM": 60000, + "Ephemeral": 0, + "VCPUs": 16, + "Is Public": true, + "Disk": 800, + "ID": "291173f1-ea1d-410b-8045-667361a4addb" + }, + { + "Name": "sp-30-ssd-flex", + "RAM": 30000, + "Ephemeral": 0, + "VCPUs": 2, + "Is Public": true, + "Disk": 50, + "ID": "2b646463-2efa-428b-94ed-4059923c3636" + }, + { + "Name": "win-eg-120-flex", + "RAM": 120000, + "Ephemeral": 0, + "VCPUs": 32, + "Is Public": true, + "Disk": 50, + "ID": "2c74df82-29d2-4b1a-a32c-d5633e7359b4" + }, + { + "Name": "win-eg-15-ssd", + "RAM": 
15000, + "Ephemeral": 0, + "VCPUs": 4, + "Is Public": true, + "Disk": 200, + "ID": "2fe4344f-d701-4bc4-8dcd-6d0b5d83fa13" + }, + { + "Name": "sp-30-flex", + "RAM": 30000, + "Ephemeral": 0, + "VCPUs": 2, + "Is Public": true, + "Disk": 50, + "ID": "31487b30-eeb6-472f-a9b6-38ace6587ebc" + }, + { + "Name": "win-sp-240-ssd", + "RAM": 240000, + "Ephemeral": 0, + "VCPUs": 16, + "Is Public": true, + "Disk": 800, + "ID": "325b602f-ecc4-4444-90bd-5a2cf4e0da53" + }, + { + "Name": "win-hg-7", + "RAM": 7000, + "Ephemeral": 0, + "VCPUs": 2, + "Is Public": true, + "Disk": 200, + "ID": "377ded36-491f-4ad7-9eb4-876798b2aea9" + }, + { + "Name": "sp-30-ssd", + "RAM": 30000, + "Ephemeral": 0, + "VCPUs": 2, + "Is Public": true, + "Disk": 100, + "ID": "382f2831-4dba-40c4-bb7a-6fadff71c4db" + }, + { + "Name": "hg-30", + "RAM": 30000, + "Ephemeral": 0, + "VCPUs": 8, + "Is Public": true, + "Disk": 800, + "ID": "3c1d6170-0097-4b5c-a3b3-adff1b7a86e0" + }, + { + "Name": "hg-60-flex", + "RAM": 60000, + "Ephemeral": 0, + "VCPUs": 16, + "Is Public": true, + "Disk": 50, + "ID": "3c669730-b5cd-4e44-8bd2-bc8d9f984ab2" + }, + { + "Name": "sp-240-ssd-flex", + "RAM": 240000, + "Ephemeral": 0, + "VCPUs": 16, + "Is Public": true, + "Disk": 50, + "ID": "3d66fea3-26f2-4195-97ab-fdea3b836099" + }, + { + "Name": "sp-240-flex", + "RAM": 240000, + "Ephemeral": 0, + "VCPUs": 16, + "Is Public": true, + "Disk": 50, + "ID": "40c781f7-d7a7-4b0d-bcca-5304aeabbcd9" + }, + { + "Name": "hg-7-flex", + "RAM": 7000, + "Ephemeral": 0, + "VCPUs": 2, + "Is Public": true, + "Disk": 50, + "ID": "42730e52-147d-46b8-9546-18e31e5ac8a9" + }, + { + "Name": "eg-30-ssd", + "RAM": 30000, + "Ephemeral": 0, + "VCPUs": 8, + "Is Public": true, + "Disk": 400, + "ID": "463f30e9-7d7a-4693-944f-142067cf553b" + }, + { + "Name": "hg-15-flex", + "RAM": 15000, + "Ephemeral": 0, + "VCPUs": 4, + "Is Public": true, + "Disk": 50, + "ID": "534f07c6-91af-44c8-9e62-156360fe8359" + }, + { + "Name": "win-sp-30-flex", + "RAM": 30000, + "Ephemeral": 0, + 
"VCPUs": 2, + "Is Public": true, + "Disk": 50, + "ID": "55533fdf-ad57-4aa7-a2c6-ee31bb94e77b" + }, + { + "Name": "win-hg-60-ssd-flex", + "RAM": 60000, + "Ephemeral": 0, + "VCPUs": 16, + "Is Public": true, + "Disk": 50, + "ID": "58b24234-3804-4c4f-9eb6-5406a3a13758" + }, + { + "Name": "hg-7-ssd-flex", + "RAM": 7000, + "Ephemeral": 0, + "VCPUs": 2, + "Is Public": true, + "Disk": 50, + "ID": "596c1276-8e53-40a0-b183-cdd9e9b1907d" + }, + { + "Name": "win-hg-30-ssd", + "RAM": 30000, + "Ephemeral": 0, + "VCPUs": 8, + "Is Public": true, + "Disk": 400, + "ID": "5c54dc08-28b9-4860-9f24-a2451b2a28ec" + }, + { + "Name": "eg-7", + "RAM": 7000, + "Ephemeral": 0, + "VCPUs": 2, + "Is Public": true, + "Disk": 200, + "ID": "5e409dbc-3f4b-46e8-a629-a418c8497922" + }, + { + "Name": "hg-30-flex", + "RAM": 30000, + "Ephemeral": 0, + "VCPUs": 8, + "Is Public": true, + "Disk": 50, + "ID": "656423ea-0551-48c6-9e0f-ec6e15952029" + }, + { + "Name": "hg-15", + "RAM": 15000, + "Ephemeral": 0, + "VCPUs": 4, + "Is Public": true, + "Disk": 400, + "ID": "675558ea-04fe-47a2-83de-40be9b2eacd4" + }, + { + "Name": "eg-60-flex", + "RAM": 60000, + "Ephemeral": 0, + "VCPUs": 16, + "Is Public": true, + "Disk": 50, + "ID": "68a8e4e1-d291-46e8-a724-fbb1c4b9b051" + }, + { + "Name": "hg-30-ssd", + "RAM": 30000, + "Ephemeral": 0, + "VCPUs": 8, + "Is Public": true, + "Disk": 400, + "ID": "6ab72807-e0a5-4e9f-bbb9-7cbbf0038b26" + }, + { + "Name": "win-hg-30", + "RAM": 30000, + "Ephemeral": 0, + "VCPUs": 8, + "Is Public": true, + "Disk": 800, + "ID": "6e12cae3-0492-483c-aa39-54a0dcaf86dd" + }, + { + "Name": "win-hg-7-ssd", + "RAM": 7000, + "Ephemeral": 0, + "VCPUs": 2, + "Is Public": true, + "Disk": 100, + "ID": "6ead771c-e8b9-424c-afa0-671280416422" + }, + { + "Name": "win-hg-30-flex", + "RAM": 30000, + "Ephemeral": 0, + "VCPUs": 8, + "Is Public": true, + "Disk": 50, + "ID": "70ded741-8f58-4bb9-8cfd-5e838b66b5f3" + }, + { + "Name": "win-sp-30-ssd-flex", + "RAM": 30000, + "Ephemeral": 0, + "VCPUs": 2, + "Is 
Public": true, + "Disk": 50, + "ID": "7284d104-a260-421d-8cee-6dc905107b25" + }, + { + "Name": "win-eg-120-ssd", + "RAM": 120000, + "Ephemeral": 0, + "VCPUs": 32, + "Is Public": true, + "Disk": 800, + "ID": "72c0b262-855d-40bb-a3e9-fd989a1bc466" + }, + { + "Name": "win-hg-7-flex", + "RAM": 7000, + "Ephemeral": 0, + "VCPUs": 2, + "Is Public": true, + "Disk": 50, + "ID": "73961591-c5f1-436f-b641-1a506eddaef4" + }, + { + "Name": "sp-240-ssd", + "RAM": 240000, + "Ephemeral": 0, + "VCPUs": 16, + "Is Public": true, + "Disk": 800, + "ID": "7568d834-3b16-42ce-a2c1-0654e0781160" + }, + { + "Name": "win-eg-60-ssd", + "RAM": 60000, + "Ephemeral": 0, + "VCPUs": 16, + "Is Public": true, + "Disk": 800, + "ID": "75f7fe5c-f87a-41d8-a961-a0169d02c268" + }, + { + "Name": "eg-7-ssd-flex", + "RAM": 7000, + "Ephemeral": 0, + "VCPUs": 2, + "Is Public": true, + "Disk": 50, + "ID": "77e1db73-0b36-4e37-8e47-32c2d2437ca9" + }, + { + "Name": "eg-60-ssd-flex", + "RAM": 60000, + "Ephemeral": 0, + "VCPUs": 16, + "Is Public": true, + "Disk": 50, + "ID": "78df4e30-98ca-4362-af68-037d958edaf0" + }, + { + "Name": "vps-ssd-2", + "RAM": 4000, + "Ephemeral": 0, + "VCPUs": 1, + "Is Public": true, + "Disk": 20, + "ID": "7939cc5c-79b1-45c0-be2d-aa935d92faa1" + }, + { + "Name": "sp-60", + "RAM": 60000, + "Ephemeral": 0, + "VCPUs": 4, + "Is Public": true, + "Disk": 400, + "ID": "80d8510a-79cc-4307-8db7-d1965c9e8ddb" + }, + { + "Name": "win-hg-120-ssd-flex", + "RAM": 120000, + "Ephemeral": 0, + "VCPUs": 32, + "Is Public": true, + "Disk": 50, + "ID": "835e734a-46b6-4cb2-be68-e8678fd71059" + }, + { + "Name": "win-eg-7", + "RAM": 7000, + "Ephemeral": 0, + "VCPUs": 2, + "Is Public": true, + "Disk": 200, + "ID": "84869b00-b43a-4523-babd-d47d206694e9" + }, + { + "Name": "win-eg-7-ssd-flex", + "RAM": 7000, + "Ephemeral": 0, + "VCPUs": 2, + "Is Public": true, + "Disk": 50, + "ID": "852308f8-b8bf-44a4-af41-cbc27437b275" + }, + { + "Name": "win-sp-30", + "RAM": 30000, + "Ephemeral": 0, + "VCPUs": 2, + "Is Public": 
true, + "Disk": 200, + "ID": "8be9dc29-3eca-499b-ae2d-e3c99699131a" + }, + { + "Name": "win-hg-7-ssd-flex", + "RAM": 7000, + "Ephemeral": 0, + "VCPUs": 2, + "Is Public": true, + "Disk": 50, + "ID": "8d704cfd-05b2-4d4a-add2-e2868bcc081f" + }, + { + "Name": "eg-30", + "RAM": 30000, + "Ephemeral": 0, + "VCPUs": 8, + "Is Public": true, + "Disk": 800, + "ID": "901f77c2-73f6-4fae-b28a-18b829b55a17" + }, + { + "Name": "sp-60-ssd-flex", + "RAM": 60000, + "Ephemeral": 0, + "VCPUs": 4, + "Is Public": true, + "Disk": 50, + "ID": "944b92fb-9a0c-406d-bb9f-a1d93cda9f01" + }, + { + "Name": "eg-30-flex", + "RAM": 30000, + "Ephemeral": 0, + "VCPUs": 8, + "Is Public": true, + "Disk": 50, + "ID": "965472c7-eb54-4d4d-bd6e-01ebb694a631" + }, + { + "Name": "sp-120-ssd", + "RAM": 120000, + "Ephemeral": 0, + "VCPUs": 8, + "Is Public": true, + "Disk": 400, + "ID": "97824a8c-e683-49a8-a70a-ead64240395c" + }, + { + "Name": "hg-60-ssd", + "RAM": 60000, + "Ephemeral": 0, + "VCPUs": 16, + "Is Public": true, + "Disk": 800, + "ID": "9831d7f1-3e79-483d-8958-88e3952c7ea2" + }, + { + "Name": "eg-60", + "RAM": 60000, + "Ephemeral": 0, + "VCPUs": 16, + "Is Public": true, + "Disk": 1600, + "ID": "9e1f13d0-4fcc-4abc-a9e6-9c76d662c92d" + }, + { + "Name": "win-eg-30-ssd", + "RAM": 30000, + "Ephemeral": 0, + "VCPUs": 8, + "Is Public": true, + "Disk": 400, + "ID": "9e6b85fa-6f37-45ce-a3d6-11ab40a28fad" + }, + { + "Name": "hg-120-flex", + "RAM": 120000, + "Ephemeral": 0, + "VCPUs": 32, + "Is Public": true, + "Disk": 50, + "ID": "9ed787cc-a0db-400b-8cc1-49b6384a1000" + }, + { + "Name": "sp-120-flex", + "RAM": 120000, + "Ephemeral": 0, + "VCPUs": 8, + "Is Public": true, + "Disk": 50, + "ID": "9f3cfdf7-b850-47cc-92be-33aefbfd2b05" + }, + { + "Name": "hg-60-ssd-flex", + "RAM": 60000, + "Ephemeral": 0, + "VCPUs": 16, + "Is Public": true, + "Disk": 50, + "ID": "a37bdf17-e1b1-41cc-a67f-ed665a120446" + }, + { + "Name": "win-hg-120-ssd", + "RAM": 120000, + "Ephemeral": 0, + "VCPUs": 32, + "Is Public": true, + "Disk": 
800, + "ID": "aa753e73-dadb-4528-9c4a-24e36fc41bf4" + }, + { + "Name": "win-sp-240-ssd-flex", + "RAM": 240000, + "Ephemeral": 0, + "VCPUs": 16, + "Is Public": true, + "Disk": 50, + "ID": "abc007b8-cc44-4b6b-9606-fd647b03e101" + }, + { + "Name": "sp-120", + "RAM": 120000, + "Ephemeral": 0, + "VCPUs": 8, + "Is Public": true, + "Disk": 800, + "ID": "ac74cb45-d895-47dd-b9cf-c17778033d83" + }, + { + "Name": "win-hg-15", + "RAM": 15000, + "Ephemeral": 0, + "VCPUs": 4, + "Is Public": true, + "Disk": 400, + "ID": "ae900175-72bd-4fbc-8ab2-4673b468aa5b" + }, + { + "Name": "win-eg-15-ssd-flex", + "RAM": 15000, + "Ephemeral": 0, + "VCPUs": 4, + "Is Public": true, + "Disk": 50, + "ID": "aeb37dbf-d7c9-4fd7-93f1-f3818e488ede" + }, + { + "Name": "hg-7-ssd", + "RAM": 7000, + "Ephemeral": 0, + "VCPUs": 2, + "Is Public": true, + "Disk": 100, + "ID": "b1dc776c-b6e3-4a96-b230-850f570db3d5" + }, + { + "Name": "sp-60-ssd", + "RAM": 60000, + "Ephemeral": 0, + "VCPUs": 4, + "Is Public": true, + "Disk": 200, + "ID": "b24df495-10f3-466e-95ab-26f0f6839a2f" + }, + { + "Name": "win-hg-120", + "RAM": 120000, + "Ephemeral": 0, + "VCPUs": 32, + "Is Public": true, + "Disk": 1600, + "ID": "b798e44e-bf71-488c-9335-f20bf5976547" + }, + { + "Name": "eg-7-ssd", + "RAM": 7000, + "Ephemeral": 0, + "VCPUs": 2, + "Is Public": true, + "Disk": 100, + "ID": "b94e6623-913d-4147-b2a3-34ccf6fe7a5e" + }, + { + "Name": "eg-15-flex", + "RAM": 15000, + "Ephemeral": 0, + "VCPUs": 4, + "Is Public": true, + "Disk": 50, + "ID": "bb5fdda8-34ec-40c8-a4e3-308b9e2c9ee2" + }, + { + "Name": "win-eg-7-flex", + "RAM": 7000, + "Ephemeral": 0, + "VCPUs": 2, + "Is Public": true, + "Disk": 50, + "ID": "c65384f6-4665-461a-a292-2f3f5a016244" + }, + { + "Name": "eg-60-ssd", + "RAM": 60000, + "Ephemeral": 0, + "VCPUs": 16, + "Is Public": true, + "Disk": 800, + "ID": "c678f1a8-6542-4f9d-89af-ffc98715d674" + }, + { + "Name": "hg-30-ssd-flex", + "RAM": 30000, + "Ephemeral": 0, + "VCPUs": 8, + "Is Public": true, + "Disk": 50, + "ID": 
"d147a094-b653-41e7-9250-8d4da3044334" + }, + { + "Name": "sp-30", + "RAM": 30000, + "Ephemeral": 0, + "VCPUs": 2, + "Is Public": true, + "Disk": 200, + "ID": "d1acf88d-6f55-4c5c-a914-4ecbdbd50d6b" + }, + { + "Name": "sp-120-ssd-flex", + "RAM": 120000, + "Ephemeral": 0, + "VCPUs": 8, + "Is Public": true, + "Disk": 50, + "ID": "d2d33e8e-58b1-4661-8141-826c47f82166" + }, + { + "Name": "hg-120-ssd-flex", + "RAM": 120000, + "Ephemeral": 0, + "VCPUs": 32, + "Is Public": true, + "Disk": 50, + "ID": "d7322c37-9881-4a57-9b40-2499fe2e8f42" + }, + { + "Name": "win-hg-15-flex", + "RAM": 15000, + "Ephemeral": 0, + "VCPUs": 4, + "Is Public": true, + "Disk": 50, + "ID": "daf597ea-fbbc-4c71-a35e-5b41d33ccc6c" + }, + { + "Name": "win-hg-30-ssd-flex", + "RAM": 30000, + "Ephemeral": 0, + "VCPUs": 8, + "Is Public": true, + "Disk": 50, + "ID": "dcfd834c-3932-47a3-8b4b-cdfeecdfde2c" + }, + { + "Name": "win-hg-60", + "RAM": 60000, + "Ephemeral": 0, + "VCPUs": 16, + "Is Public": true, + "Disk": 1600, + "ID": "def75cbd-a4b1-4f82-9152-90c65df9587b" + }, + { + "Name": "eg-30-ssd-flex", + "RAM": 30000, + "Ephemeral": 0, + "VCPUs": 8, + "Is Public": true, + "Disk": 50, + "ID": "e04c7ad6-a5de-45f5-93c9-f3343bdfe8d1" + }, + { + "Name": "vps-ssd-3", + "RAM": 8000, + "Ephemeral": 0, + "VCPUs": 2, + "Is Public": true, + "Disk": 40, + "ID": "e43d7458-6b82-4a78-a712-3a4dc6748cf4" + }, + { + "Name": "win-eg-15-flex", + "RAM": 15000, + "Ephemeral": 0, + "VCPUs": 4, + "Is Public": true, + "Disk": 50, + "ID": "e8bd3402-7310-4a0f-8b99-d9212359c957" + }, + { + "Name": "win-eg-30", + "RAM": 30000, + "Ephemeral": 0, + "VCPUs": 8, + "Is Public": true, + "Disk": 800, + "ID": "ebf7a997-e2f8-42f4-84f7-33a3d53d1af9" + }, + { + "Name": "eg-120-flex", + "RAM": 120000, + "Ephemeral": 0, + "VCPUs": 32, + "Is Public": true, + "Disk": 50, + "ID": "ec852ed3-1e42-4c59-abc3-12bcd26abec8" + }, + { + "Name": "sp-240", + "RAM": 240000, + "Ephemeral": 0, + "VCPUs": 16, + "Is Public": true, + "Disk": 1600, + "ID": 
"ed286e2c-769f-4c47-ac52-b8de7a4891f6" + }, + { + "Name": "win-sp-60-ssd", + "RAM": 60000, + "Ephemeral": 0, + "VCPUs": 4, + "Is Public": true, + "Disk": 200, + "ID": "ed835a73-d9a0-43ee-bd89-999c51d8426d" + }, + { + "Name": "win-eg-15", + "RAM": 15000, + "Ephemeral": 0, + "VCPUs": 4, + "Is Public": true, + "Disk": 400, + "ID": "f06056c1-a2d4-40e7-a7d8-e5bfabada72e" + }, + { + "Name": "win-sp-120", + "RAM": 120000, + "Ephemeral": 0, + "VCPUs": 8, + "Is Public": true, + "Disk": 800, + "ID": "f247dc56-395b-49de-9a62-93ccc4fff4ed" + }, + { + "Name": "eg-7-flex", + "RAM": 7000, + "Ephemeral": 0, + "VCPUs": 2, + "Is Public": true, + "Disk": 50, + "ID": "f476f959-ffa6-46f2-94d8-72293570604d" + }, + { + "Name": "sp-60-flex", + "RAM": 60000, + "Ephemeral": 0, + "VCPUs": 4, + "Is Public": true, + "Disk": 50, + "ID": "f52db47a-315f-49d4-bc5c-67dd118e7ac0" + }, + { + "Name": "win-hg-120-flex", + "RAM": 120000, + "Ephemeral": 0, + "VCPUs": 32, + "Is Public": true, + "Disk": 50, + "ID": "f6cb8144-5d98-4057-b44f-46da342fb571" + }, + { + "Name": "hg-7", + "RAM": 7000, + "Ephemeral": 0, + "VCPUs": 2, + "Is Public": true, + "Disk": 200, + "ID": "fa3cc551-0358-4170-be64-56ea432b064c" + }, + { + "Name": "hg-15-ssd", + "RAM": 15000, + "Ephemeral": 0, + "VCPUs": 4, + "Is Public": true, + "Disk": 200, + "ID": "ff48c2cf-c17f-4682-aaf6-31d66786f808" + } + ]""" + + @classmethod + def setup_class(self): + if 'OS_AUTH_URL' not in os.environ: + pytest.skip('no OS_AUTH_URL environment variable') + + @patch('teuthology.misc.sh') + def test_sorted_flavors(self, m_sh): + o = OpenStack() + select = '^(vps|hg)-.*ssd' + m_sh.return_value = TestOpenStack.flavors + flavors = o.get_sorted_flavors('arch', select) + assert [u'vps-ssd-1', + u'vps-ssd-2', + u'hg-7-ssd-flex', + u'hg-7-ssd', + u'vps-ssd-3', + u'hg-15-ssd-flex', + u'hg-15-ssd', + u'hg-30-ssd-flex', + u'hg-30-ssd', + u'hg-60-ssd-flex', + u'hg-60-ssd', + u'hg-120-ssd-flex', + u'hg-120-ssd', + ] == [ f['Name'] for f in flavors ] + 
m_sh.assert_called_with("openstack --quiet flavor list -f json") + + def test_flavor(self): + def get_sorted_flavors(self, arch, select): + return [ + { + 'Name': 'too_small', + 'RAM': 2048, + 'Disk': 50, + 'VCPUs': 1, + }, + ] + with patch.multiple( + OpenStack, + get_sorted_flavors=get_sorted_flavors, + ): + with pytest.raises(NoFlavorException): + hint = { 'ram': 1000, 'disk': 40, 'cpus': 2 } + OpenStack().flavor(hint, 'arch') + + flavor = 'good-flavor' + def get_sorted_flavors(self, arch, select): + return [ + { + 'Name': flavor, + 'RAM': 2048, + 'Disk': 50, + 'VCPUs': 2, + }, + ] + with patch.multiple( + OpenStack, + get_sorted_flavors=get_sorted_flavors, + ): + hint = { 'ram': 1000, 'disk': 40, 'cpus': 2 } + assert flavor == OpenStack().flavor(hint, 'arch') + + def test_flavor_range(self): + flavors = [ + { + 'Name': 'too_small', + 'RAM': 2048, + 'Disk': 50, + 'VCPUs': 1, + }, + ] + def get_sorted_flavors(self, arch, select): + return flavors + + min = { 'ram': 1000, 'disk': 40, 'cpus': 2 } + good = { 'ram': 4000, 'disk': 40, 'cpus': 2 } + + # + # there are no flavors in the required range + # + with patch.multiple( + OpenStack, + get_sorted_flavors=get_sorted_flavors, + ): + with pytest.raises(NoFlavorException): + OpenStack().flavor_range(min, good, 'arch') + + # + # there is one flavor in the required range + # + flavors.append({ + 'Name': 'min', + 'RAM': 2048, + 'Disk': 40, + 'VCPUs': 2, + }) + + with patch.multiple( + OpenStack, + get_sorted_flavors=get_sorted_flavors, + ): + + assert 'min' == OpenStack().flavor_range(min, good, 'arch') + + # + # out of the two flavors in the required range, get the bigger one + # + flavors.append({ + 'Name': 'good', + 'RAM': 3000, + 'Disk': 40, + 'VCPUs': 2, + }) + + with patch.multiple( + OpenStack, + get_sorted_flavors=get_sorted_flavors, + ): + + assert 'good' == OpenStack().flavor_range(min, good, 'arch') + + # + # there is one flavor bigger or equal to good, get this one + # + flavors.append({ + 'Name': 'best', + 
'RAM': 4000, + 'Disk': 40, + 'VCPUs': 2, + }) + + with patch.multiple( + OpenStack, + get_sorted_flavors=get_sorted_flavors, + ): + + assert 'best' == OpenStack().flavor_range(min, good, 'arch') + + # + # there are two flavors bigger or equal to good, get the smallest one + # + flavors.append({ + 'Name': 'too_big', + 'RAM': 30000, + 'Disk': 400, + 'VCPUs': 20, + }) + + with patch.multiple( + OpenStack, + get_sorted_flavors=get_sorted_flavors, + ): + + assert 'best' == OpenStack().flavor_range(min, good, 'arch') + + + def test_interpret_hints(self): + defaults = { + 'machine': { + 'ram': 0, + 'disk': 0, + 'cpus': 0, + }, + 'volumes': { + 'count': 0, + 'size': 0, + }, + } + expected_disk = 10 # first hint larger than the second + expected_ram = 20 # second hint larger than the first + expected_cpus = 0 # not set, hence zero by default + expected_count = 30 # second hint larger than the first + expected_size = 40 # does not exist in the first hint + hints = [ + { + 'machine': { + 'ram': 2, + 'disk': expected_disk, + }, + 'volumes': { + 'count': 9, + 'size': expected_size, + }, + }, + { + 'machine': { + 'ram': expected_ram, + 'disk': 3, + }, + 'volumes': { + 'count': expected_count, + }, + }, + ] + hint = OpenStack().interpret_hints(defaults, hints) + assert hint == { + 'machine': { + 'ram': expected_ram, + 'disk': expected_disk, + 'cpus': expected_cpus, + }, + 'volumes': { + 'count': expected_count, + 'size': expected_size, + } + } + assert defaults == OpenStack().interpret_hints(defaults, None) + + def test_get_provider(self): + auth = os.environ.get('OS_AUTH_URL', None) + os.environ['OS_AUTH_URL'] = 'cloud.ovh.net' + assert OpenStack().get_provider() == 'ovh' + if auth != None: + os.environ['OS_AUTH_URL'] = auth + else: + del os.environ['OS_AUTH_URL'] + + def test_get_os_url(self): + o = OpenStack() + # + # Only for OVH + # + o.provider = 'something' + assert "" == o.get_os_url("server ") + o.provider = 'ovh' + assert "" == o.get_os_url("unknown ") + type2cmd = { + 
'compute': ('server', 'flavor'), + 'network': ('ip', 'security', 'network'), + 'image': ('image',), + 'volume': ('volume',), + } + os.environ['OS_REGION_NAME'] = 'REGION' + os.environ['OS_TENANT_ID'] = 'TENANT' + for (type, cmds) in type2cmd.items(): + for cmd in cmds: + assert ("//" + type) in o.get_os_url(cmd + " ") + for type in type2cmd.keys(): + assert ("//" + type) in o.get_os_url("whatever ", type=type) + + @patch('teuthology.misc.sh') + def test_cache_token(self, m_sh): + token = 'TOKEN VALUE' + m_sh.return_value = token + OpenStack.token = None + o = OpenStack() + # + # Only for OVH + # + o.provider = 'something' + assert False == o.cache_token() + o.provider = 'ovh' + # + # Set the environment with the token + # + assert 'OS_TOKEN_VALUE' not in os.environ + assert 'OS_TOKEN_EXPIRES' not in os.environ + assert True == o.cache_token() + m_sh.assert_called_with('openstack -q token issue -c id -f value') + assert token == os.environ['OS_TOKEN_VALUE'] + assert token == OpenStack.token + assert time.time() < int(os.environ['OS_TOKEN_EXPIRES']) + assert time.time() < OpenStack.token_expires + # + # Reset after it expires + # + token_expires = int(time.time()) - 2000 + OpenStack.token_expires = token_expires + assert True == o.cache_token() + assert time.time() < int(os.environ['OS_TOKEN_EXPIRES']) + assert time.time() < OpenStack.token_expires + + @patch('teuthology.misc.sh') + def test_cache_token_from_environment(self, m_sh): + OpenStack.token = None + o = OpenStack() + o.provider = 'ovh' + token = 'TOKEN VALUE' + os.environ['OS_TOKEN_VALUE'] = token + token_expires = int(time.time()) + OpenStack.token_cache_duration + os.environ['OS_TOKEN_EXPIRES'] = str(token_expires) + assert True == o.cache_token() + assert token == OpenStack.token + assert token_expires == OpenStack.token_expires + m_sh.assert_not_called() + + @patch('teuthology.misc.sh') + def test_cache_token_expired_environment(self, m_sh): + token = 'TOKEN VALUE' + m_sh.return_value = token + 
OpenStack.token = None + o = OpenStack() + o.provider = 'ovh' + os.environ['OS_TOKEN_VALUE'] = token + token_expires = int(time.time()) - 2000 + os.environ['OS_TOKEN_EXPIRES'] = str(token_expires) + assert True == o.cache_token() + m_sh.assert_called_with('openstack -q token issue -c id -f value') + assert token == os.environ['OS_TOKEN_VALUE'] + assert token == OpenStack.token + assert time.time() < int(os.environ['OS_TOKEN_EXPIRES']) + assert time.time() < OpenStack.token_expires + +class TestTeuthologyOpenStack(TestOpenStackBase): + + @classmethod + def setup_class(self): + if 'OS_AUTH_URL' not in os.environ: + pytest.skip('no OS_AUTH_URL environment variable') + + teuthology.log.setLevel(logging.DEBUG) + set_config_attr(argparse.Namespace()) + + ip = TeuthologyOpenStack.create_floating_ip() + if ip: + ip_id = TeuthologyOpenStack.get_floating_ip_id(ip) + OpenStack().run("ip floating delete " + ip_id) + self.can_create_floating_ips = True + else: + self.can_create_floating_ips = False + + def setup_method(self): + super(TestTeuthologyOpenStack, self).setup_method() + self.key_filename = tempfile.mktemp() + self.key_name = 'teuthology-test' + self.name = 'teuthology-test' + self.clobber() + misc.sh(""" +openstack keypair create {key_name} > {key_filename} +chmod 600 {key_filename} + """.format(key_filename=self.key_filename, + key_name=self.key_name)) + self.options = ['--key-name', self.key_name, + '--key-filename', self.key_filename, + '--name', self.name, + '--verbose'] + + def teardown_method(self): + super(TestTeuthologyOpenStack, self).teardown_method() + self.clobber() + os.unlink(self.key_filename) + + def clobber(self): + misc.sh(""" +openstack server delete {name} --wait || true +openstack keypair delete {key_name} || true + """.format(key_name=self.key_name, + name=self.name)) + + def test_create(self, caplog): + teuthology_argv = [ + '--suite', 'upgrade/hammer', + '--dry-run', + '--ceph', 'main', + '--kernel', 'distro', + '--flavor', 'gcov', + 
'--distro', 'ubuntu', + '--suite-branch', 'hammer', + '--email', 'loic@dachary.org', + '--num', '10', + '--limit', '23', + '--subset', '1/2', + '--priority', '101', + '--timeout', '234', + '--filter', 'trasher', + '--filter-out', 'erasure-code', + '--throttle', '3', + ] + archive_upload = 'user@archive:/tmp' + argv = (self.options + + ['--teuthology-git-url', 'TEUTHOLOGY_URL', + '--teuthology-branch', 'TEUTHOLOGY_BRANCH', + '--ceph-workbench-git-url', 'CEPH_WORKBENCH_URL', + '--ceph-workbench-branch', 'CEPH_WORKBENCH_BRANCH', + '--upload', + '--archive-upload', archive_upload] + + teuthology_argv) + args = scripts.openstack.parse_args(argv) + teuthology_argv.extend([ + '--archive-upload', archive_upload, + '--archive-upload-url', args.archive_upload_url, + ]) + teuthology = TeuthologyOpenStack(args, None, argv) + teuthology.user_data = 'teuthology/openstack/test/user-data-test1.txt' + teuthology.teuthology_suite = 'echo --' + + teuthology.main() + assert 0 == teuthology.ssh("lsb_release -a") + assert 0 == teuthology.ssh("grep 'substituded variables' /var/log/cloud-init.log") + l = caplog.text + assert 'Ubuntu 14.04' in l + assert "nworkers=" + str(args.simultaneous_jobs) in l + assert "username=" + teuthology.username in l + assert "upload=--archive-upload user@archive:/tmp" in l + assert ("ceph_workbench=" + " --ceph-workbench-branch CEPH_WORKBENCH_BRANCH" + " --ceph-workbench-git-url CEPH_WORKBENCH_URL") in l + assert "clone=git clone -b TEUTHOLOGY_BRANCH TEUTHOLOGY_URL" in l + assert os.environ['OS_AUTH_URL'] in l + assert " ".join(teuthology_argv) in l + + if self.can_create_floating_ips: + ip = teuthology.get_floating_ip(self.name) + teuthology.teardown() + if self.can_create_floating_ips: + assert teuthology.get_floating_ip_id(ip) == None + + def test_floating_ip(self): + if not self.can_create_floating_ips: + pytest.skip('unable to create floating ips') + + expected = TeuthologyOpenStack.create_floating_ip() + ip = 
TeuthologyOpenStack.get_unassociated_floating_ip() + assert expected == ip + ip_id = TeuthologyOpenStack.get_floating_ip_id(ip) + OpenStack().run("ip floating delete " + ip_id) diff --git a/teuthology/openstack/test/user-data-test1.txt b/teuthology/openstack/test/user-data-test1.txt new file mode 100644 index 000000000..4e3e466c2 --- /dev/null +++ b/teuthology/openstack/test/user-data-test1.txt @@ -0,0 +1,5 @@ +#cloud-config +system_info: + default_user: + name: ubuntu +final_message: "teuthology is up and running after $UPTIME seconds, substituded variables nworkers=NWORKERS openrc=OPENRC username=TEUTHOLOGY_USERNAME upload=UPLOAD ceph_workbench=CEPH_WORKBENCH clone=CLONE_OPENSTACK" diff --git a/teuthology/orchestra/__init__.py b/teuthology/orchestra/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/teuthology/orchestra/cluster.py b/teuthology/orchestra/cluster.py new file mode 100644 index 000000000..654ef0c3d --- /dev/null +++ b/teuthology/orchestra/cluster.py @@ -0,0 +1,188 @@ +""" +Cluster definition +part of context, Cluster is used to save connection information. +""" +from teuthology.orchestra import run + +class Cluster(object): + """ + Manage SSH connections to a cluster of machines. 
+ """ + + def __init__(self, remotes=None): + """ + :param remotes: A sequence of 2-tuples of this format: + (Remote, [role_1, role_2 ...]) + """ + self.remotes = {} + if remotes is not None: + for remote, roles in remotes: + self.add(remote, roles) + + def __repr__(self): + remotes = [(k, v) for k, v in self.remotes.items()] + remotes.sort(key=lambda tup: tup[0].name) + remotes = '[' + ', '.join('[{remote!r}, {roles!r}]'.format( + remote=k, roles=v) for k, v in remotes) + ']' + return '{classname}(remotes={remotes})'.format( + classname=self.__class__.__name__, + remotes=remotes, + ) + + def __str__(self): + remotes = list(self.remotes.items()) + remotes.sort(key=lambda tup: tup[0].name) + remotes = ((k, ','.join(v)) for k, v in remotes) + remotes = ('{k}[{v}]'.format(k=k, v=v) for k, v in remotes) + return ' '.join(remotes) + + def add(self, remote, roles): + """ + Add roles to the list of remotes. + """ + if remote in self.remotes: + raise RuntimeError( + 'Remote {new!r} already found in remotes: {old!r}'.format( + new=remote, + old=self.remotes[remote], + ), + ) + self.remotes[remote] = list(roles) + + def run(self, wait=True, parallel=False, **kwargs): + """ + Run a command on all the nodes in this cluster. + + Goes through nodes in alphabetical order. + + The default usage is when parallel=False and wait=True, + which is a sequential run for each node one by one. + + If you specify parallel=True, it will be in parallel. + + If you specify wait=False, it returns immediately. + Since it is not possible to run sequentially and + do not wait each command run finished, the parallel value + is ignored and treated as True. + + Returns a list of `RemoteProcess`. 
+ """ + # -+-------+----------+----------+------------+--------------- + # | wait | parallel | run.wait | remote.run | comments + # -+-------+----------+----------+------------+--------------- + # 1|*True |*False | no | wait=True | sequentially + # 2| True | True | yes | wait=False | parallel + # 3| False | True | no | wait=False | parallel + # 4| False | False | no | wait=False | same as above + + # We always run in parallel if wait=False, + # that is why (4) is equivalent to (3). + + # We wait from remote.run only if run sequentially. + _wait = (parallel == False and wait == True) + + remotes = sorted(self.remotes.keys(), key=lambda rem: rem.name) + procs = [remote.run(**kwargs, wait=_wait) for remote in remotes] + + # We do run.wait only if parallel=True, because if parallel=False, + # we have run sequentially and all processes are complete. + + if parallel and wait: + run.wait(procs) + return procs + + def sh(self, script, **kwargs): + """ + Run a command on all the nodes in this cluster. + + Goes through nodes in alphabetical order. + + Returns a list of the command outputs correspondingly. + """ + remotes = sorted(self.remotes.keys(), key=lambda rem: rem.name) + return [remote.sh(script, **kwargs) for remote in remotes] + + def write_file(self, file_name, content, sudo=False, perms=None, owner=None): + """ + Write text to a file on each node. + + :param file_name: file name + :param content: file content + :param sudo: use sudo + :param perms: file permissions (passed to chmod) ONLY if sudo is True + """ + remotes = sorted(self.remotes.keys(), key=lambda rem: rem.name) + for remote in remotes: + if sudo: + remote.write_file(file_name, content, + sudo=True, mode=perms, owner=owner) + else: + if perms is not None or owner is not None: + raise ValueError("To specify perms or owner, sudo must be True") + remote.write_file(file_name, content) + + def only(self, *roles): + """ + Return a cluster with only the remotes that have all of given roles. 
+ + For roles given as strings, they are matched against the roles + on a remote, and the remote passes the check only if all the + roles listed are present. + + Argument can be callable, and will act as a match on roles of + the remote. The matcher will be evaluated one role at a time, + but a match on any role is good enough. Note that this is + subtly diffent from the behavior of string roles, but is + logical if you consider a callable to be similar to passing a + non-string object with an `__eq__` method. + + For example:: + + web = mycluster.only(lambda role: role.startswith('web-')) + """ + c = self.__class__() + want = frozenset(r for r in roles if not callable(r)) + matchers = [r for r in roles if callable(r)] + + for remote, has_roles in self.remotes.items(): + # strings given as roles must all match + if frozenset(has_roles) & want != want: + # not a match + continue + + # every matcher given must match at least one role + if not all( + any(matcher(role) for role in has_roles) + for matcher in matchers + ): + continue + + c.add(remote, has_roles) + + return c + + def exclude(self, *roles): + """ + Return a cluster *without* remotes that have all of given roles. + + This is the opposite of `only`. + """ + matches = self.only(*roles) + c = self.__class__() + for remote, has_roles in self.remotes.items(): + if remote not in matches.remotes: + c.add(remote, has_roles) + return c + + def filter(self, func): + """ + Return a cluster whose remotes are filtered by `func`. 
+ + Example:: + cluster = ctx.cluster.filter(lambda r: r.is_online) + """ + result = self.__class__() + for rem, roles in self.remotes.items(): + if func(rem): + result.add(rem, roles) + return result diff --git a/teuthology/orchestra/connection.py b/teuthology/orchestra/connection.py new file mode 100644 index 000000000..1772d37b5 --- /dev/null +++ b/teuthology/orchestra/connection.py @@ -0,0 +1,123 @@ +""" +Connection utilities +""" +import paramiko +import os +import logging + +from teuthology.config import config +from teuthology.contextutil import safe_while +from paramiko.hostkeys import HostKeyEntry + +log = logging.getLogger(__name__) + + +def split_user(user_at_host): + """ + break apart user@host fields into user and host. + """ + try: + user, host = user_at_host.rsplit('@', 1) + except ValueError: + user, host = None, user_at_host + assert user != '', \ + "Bad input to split_user: {user_at_host!r}".format(user_at_host=user_at_host) + return user, host + + +def create_key(keytype, key): + """ + Create an ssh-rsa, ssh-dss or ssh-ed25519 key. + """ + l = "{hostname} {keytype} {key}".format(hostname="x", keytype=keytype, key=key) + + ke = HostKeyEntry.from_line(l) + assert ke, f'invalid host key "{keytype} {key}"' + return ke.key + + +def connect(user_at_host, host_key=None, keep_alive=False, timeout=60, + _SSHClient=None, _create_key=None, retry=True, key_filename=None): + """ + ssh connection routine. + + :param user_at_host: user@host + :param host_key: ssh key + :param keep_alive: keep_alive indicator + :param timeout: timeout in seconds + :param _SSHClient: client, default is paramiko ssh client + :param _create_key: routine to create a key (defaults to local reate_key) + :param retry: Whether or not to retry failed connection attempts + (eventually giving up if none succeed). Default is True + :param key_filename: Optionally override which private key to use. + :return: ssh connection. 
+ """ + user, host = split_user(user_at_host) + if _SSHClient is None: + _SSHClient = paramiko.SSHClient + ssh = _SSHClient() + + if _create_key is None: + _create_key = create_key + + if host_key is None: + ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + if config.verify_host_keys is True: + ssh.load_system_host_keys() + + else: + keytype, key = host_key.split(' ', 1) + ssh.get_host_keys().add( + hostname=host, + keytype=keytype, + key=_create_key(keytype, key) + ) + + connect_args = dict( + hostname=host, + username=user, + timeout=timeout + ) + + key_filename = key_filename or config.ssh_key + ssh_config_path = config.ssh_config_path or "~/.ssh/config" + ssh_config_path = os.path.expanduser(ssh_config_path) + if os.path.exists(ssh_config_path): + ssh_config = paramiko.SSHConfig() + ssh_config.parse(open(ssh_config_path)) + opts = ssh_config.lookup(host) + if not key_filename and 'identityfile' in opts: + key_filename = opts['identityfile'] + if 'hostname' in opts: + connect_args['hostname'] = opts['hostname'] + if 'user' in opts: + connect_args['username'] = opts['user'] + + if key_filename: + if not isinstance(key_filename, list): + key_filename = [key_filename] + key_filename = [os.path.expanduser(f) for f in key_filename] + connect_args['key_filename'] = key_filename + + log.debug(connect_args) + + if not retry: + ssh.connect(**connect_args) + else: + with safe_while(sleep=1, increment=3, action='connect to ' + host) as proceed: + while proceed(): + auth_err_msg = f"Error authenticating with {host}" + try: + ssh.connect(**connect_args) + break + except EOFError: + log.error(f"{auth_err_msg}: EOFError") + except paramiko.AuthenticationException as e: + log.error(f"{auth_err_msg}: {repr(e)}") + except paramiko.SSHException as e: + auth_err_msg = f"{auth_err_msg}: {repr(e)}" + if not key_filename: + auth_err_msg = f"{auth_err_msg} (No SSH private key found!)" + log.exception(auth_err_msg) + ssh.get_transport().set_keepalive(keep_alive) + return ssh 
diff --git a/teuthology/orchestra/console.py b/teuthology/orchestra/console.py new file mode 100644 index 000000000..a9c67ebbf --- /dev/null +++ b/teuthology/orchestra/console.py @@ -0,0 +1,448 @@ +import io +import logging +import os +import pexpect +import psutil +import subprocess +import sys +import time + +from typing import Union, Literal, Optional + +import teuthology.lock.query +import teuthology.lock.util +from teuthology.config import config +from teuthology.contextutil import safe_while +from teuthology.exceptions import ConsoleError +from teuthology.misc import host_shortname + +try: + import libvirt +except ImportError: + libvirt = None + +log = logging.getLogger(__name__) +PowerOnOffState = Union[Literal["on"], Literal["off"]] + + +class RemoteConsole(): + def getShortName(self, name=None): + """ + Extract the name portion from remote name strings. + """ + hostname = (name or self.name).split('@')[-1] + return host_shortname(hostname) + + +class PhysicalConsole(RemoteConsole): + """ + Physical Console (set from getRemoteConsole) + """ + def __init__(self, name, ipmiuser=None, ipmipass=None, ipmidomain=None, + timeout=120): + self.name = name + self.shortname = self.getShortName(name) + self.log = log.getChild(self.shortname) + self.timeout = timeout + self.ipmiuser = ipmiuser or config.ipmi_user + self.ipmipass = ipmipass or config.ipmi_password + self.ipmidomain = ipmidomain or config.ipmi_domain + self.has_ipmi_credentials = all( + [self.ipmiuser, self.ipmipass, self.ipmidomain] + ) + self.conserver_master = config.conserver_master + self.conserver_port = config.conserver_port + conserver_client_found = psutil.Popen( + 'which console', + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT).wait() == 0 + self.has_conserver = all([ + config.use_conserver is not False, + self.conserver_master, + self.conserver_port, + conserver_client_found, + ]) + + def _pexpect_spawn_ipmi(self, ipmi_cmd): + """ + Run the cmd specified using ipmitool. 
+ """ + full_command = self._ipmi_command(ipmi_cmd) + return self._pexpect_spawn(full_command) + + def _pexpect_spawn(self, cmd): + """ + Run a command using pexpect.spawn(). Return the child object. + """ + self.log.debug('pexpect command: %s', cmd) + p = pexpect.spawn( + cmd, + encoding='utf-8', + codec_errors="backslashreplace", + ) + p.logfile_read = io.StringIO() + return p + + def _get_console(self, readonly=True): + def start(): + cmd = self._console_command(readonly=readonly) + return self._pexpect_spawn(cmd) + + child = start() + if self.has_conserver and not child.isalive(): + self.log.error("conserver failed to get the console; will try ipmitool") + self.has_conserver = False + child = start() + return child + + def _console_command(self, readonly=True): + if self.has_conserver: + return 'console -M {master} -p {port} {mode} {host}'.format( + master=self.conserver_master, + port=self.conserver_port, + mode='-s' if readonly else '-f', + host=self.shortname, + ) + else: + return self._ipmi_command('sol activate') + + def _ipmi_command(self, subcommand): + self._check_ipmi_credentials() + template = \ + 'ipmitool -H {s}.{dn} -I lanplus -U {ipmiuser} -P {ipmipass} {cmd}' + return template.format( + cmd=subcommand, + s=self.shortname, + dn=self.ipmidomain, + ipmiuser=self.ipmiuser, + ipmipass=self.ipmipass, + ) + + def _check_ipmi_credentials(self): + if not self.has_ipmi_credentials: + self.log.error( + "Must set ipmi_user, ipmi_password, and ipmi_domain in " + ".teuthology.yaml" + ) + + def _exit_session(self, child, timeout=None): + t = timeout or self.timeout + if self.has_conserver: + child.sendcontrol('e') + child.send('c.') + r = child.expect( + ['[disconnect]', pexpect.TIMEOUT, pexpect.EOF], + timeout=t) + if r != 0: + child.kill(15) + self.log.debug('console disconnect output: %s', child.logfile_read.getvalue().strip()) + else: + child.send('~.') + r = child.expect( + ['terminated ipmitool', pexpect.TIMEOUT, pexpect.EOF], + timeout=t) + 
self.log.debug('ipmitool disconnect output: %s', child.logfile_read.getvalue().strip()) + if r != 0: + self._pexpect_spawn_ipmi('sol deactivate') + self.log.debug('sol deactivate output: %s', child.logfile_read.getvalue().strip()) + + def _wait_for_login(self, timeout=None, attempts=2): + """ + Wait for login. Retry if timeouts occur on commands. + """ + t = timeout or self.timeout + self.log.debug('Waiting for login prompt') + # wait for login prompt to indicate boot completed + for i in range(0, attempts): + start = time.time() + while time.time() - start < t: + child = self._get_console(readonly=False) + child.send('\n') + r = child.expect( + ['{s} login: '.format(s=self.shortname), + pexpect.TIMEOUT, + pexpect.EOF], + timeout=(t - (time.time() - start))) + self.log.debug('expect before: {b}'.format(b=child.before)) + self.log.debug('expect after: {a}'.format(a=child.after)) + + self._exit_session(child) + if r == 0: + return + raise ConsoleError("Did not get a login prompt from %s!" % self.name) + + def check_power(self, state: Literal["on","off"]): + c = self._pexpect_spawn_ipmi('power status') + r = c.expect(['Chassis Power is {s}'.format( + s=state), pexpect.EOF, pexpect.TIMEOUT], timeout=1) + self.log.debug('check power output: %s', c.logfile_read.getvalue().strip()) + return r == 0 + + def set_power(self, state: PowerOnOffState, timeout: Optional[int]): + self.log.info(f"Power {state}") + timeout = timeout or self.timeout + sleep_time = 4 + reissue_after_failures = 5 + failures = 0 + issued = False + succeeded = False + with safe_while( + sleep=sleep_time, + tries=int(timeout / sleep_time), + _raise=False, + action='wait for power on') as proceed: + while proceed(): + if not issued: + child = self._pexpect_spawn_ipmi(f"power {state}") + rc = child.expect( + [ + "Up/On" if state.lower() == "on" else "Down/Off", + pexpect.EOF + ], + timeout=self.timeout + ) + self.log.debug( + f"power {state} output: {child.logfile_read.getvalue().strip()}" + ) + if rc == 0: 
+ issued = True + continue + + if not succeeded: + child = self._pexpect_spawn_ipmi('power status') + rc = child.expect( + [ + f"Chassis Power is {state}", + pexpect.EOF, + pexpect.TIMEOUT + ], + timeout=1 + ) + self.log.debug( + f"check power output: {child.logfile_read.getvalue().strip()}" + ) + if rc == 0: + succeeded = True + break + failures += 1 + if failures == reissue_after_failures: + issued = False + + if issued and succeeded: + self.log.info(f"Power {state} completed") + return True + raise RuntimeError( + f"Failed to power {state} {self.shortname} in {self.timeout}s" + ) + return False + + def check_power_retries(self, state, timeout=None): + """ + Check power. Retry if EOF encountered on power check read. + """ + timeout = timeout or self.timeout + sleep_time = 4.0 + with safe_while( + sleep=sleep_time, + tries=int(timeout / sleep_time), + _raise=False, + action='wait for power %s' % state) as proceed: + while proceed(): + c = self._pexpect_spawn_ipmi('power status') + r = c.expect(['Chassis Power is {s}'.format( + s=state), pexpect.EOF, pexpect.TIMEOUT], timeout=1) + self.log.debug('check power output: %s', c.logfile_read.getvalue().strip()) + if r == 0: + return True + return False + + def check_status(self, timeout=None): + """ + Check status. Returns True if console is at login prompt + """ + try: + # check for login prompt at console + self._wait_for_login(timeout) + return True + except Exception: + self.log.exception('Failed to get ipmi console status') + return False + + def power_cycle(self, timeout=300): + """ + Power cycle and wait for login. 
+ + :param timeout: How long to wait for login + """ + self.log.info('Power cycling') + child = self._pexpect_spawn_ipmi('power cycle') + child.expect('Chassis Power Control: Cycle', timeout=self.timeout) + self.log.debug('power cycle output: %s', child.logfile_read.getvalue().strip()) + self._wait_for_login(timeout=timeout) + self.log.info('Power cycle completed') + + def hard_reset(self, wait_for_login=True): + """ + Perform physical hard reset. Retry if EOF returned from read + and wait for login when complete. + """ + self.log.info('Performing hard reset') + start = time.time() + while time.time() - start < self.timeout: + child = self._pexpect_spawn_ipmi('power reset') + r = child.expect(['Chassis Power Control: Reset', pexpect.EOF], + timeout=self.timeout) + self.log.debug('power reset output: %s', child.logfile_read.getvalue().strip()) + if r == 0: + break + if wait_for_login: + self._wait_for_login() + self.log.info('Hard reset completed') + + def power_on(self): + """ + Physical power on. Loop checking cmd return. + """ + return self.set_power("on", timeout=None) + + def power_off(self): + """ + Physical power off. Loop checking cmd return. + """ + try: + return self.set_power("off", timeout=None) + except Exception: + pass + + def power_off_for_interval(self, interval=30): + """ + Physical power off for an interval. Wait for login when complete. + + :param interval: Length of power-off period. 
+ """ + self.log.info('Power off for {i} seconds'.format(i=interval)) + child = self._pexpect_spawn_ipmi('power off') + child.expect('Chassis Power Control: Down/Off', timeout=self.timeout) + + self.log.debug('power off output: %s', child.logfile_read.getvalue().strip()) + child.logfile_read.seek(0) + child.logfile_read.truncate() + + time.sleep(interval) + + child = self._pexpect_spawn_ipmi('power on') + child.expect('Chassis Power Control: Up/On', timeout=self.timeout) + self.log.debug('power on output: %s', child.logfile_read.getvalue().strip()) + self._wait_for_login() + self.log.info('Power off for {i} seconds completed'.format(i=interval)) + + def spawn_sol_log(self, dest_path): + """ + Using the subprocess module, spawn an ipmitool process using 'sol + activate' and redirect its output to a file. + + :returns: a psutil.Popen object + """ + pexpect_templ = \ + "import pexpect; " \ + "pexpect.run('{cmd}', logfile=open('{log}', 'wb'), timeout=None)" + + def start(): + console_cmd = self._console_command() + # use sys.executable to find python rather than /usr/bin/env. + # The latter relies on PATH, which is set in a virtualenv + # that's been activated, but is not set when binaries are + # run directly from the virtualenv's bin/ directory. 
+ python_cmd = [ + sys.executable, '-c', + pexpect_templ.format( + cmd=console_cmd, + log=dest_path, + ), + ] + return psutil.Popen( + python_cmd, + env=os.environ, + ) + + proc = start() + if self.has_conserver and proc.poll() is not None: + self.log.error("conserver failed to get the console; will try ipmitool") + self.has_conserver = False + proc = start() + return proc + + +class VirtualConsole(RemoteConsole): + """ + Virtual Console (set from getRemoteConsole) + """ + def __init__(self, name): + if libvirt is None: + raise RuntimeError("libvirt not found") + + self.shortname = self.getShortName(name) + self.log = log.getChild(self.shortname) + status_info = teuthology.lock.query.get_status(self.shortname) + try: + if teuthology.lock.query.is_vm(status=status_info): + phys_host = status_info['vm_host']['name'].split('.')[0] + except TypeError: + raise RuntimeError("Cannot create a virtual console for %s", name) + self.connection = libvirt.open(phys_host) + for i in self.connection.listDomainsID(): + d = self.connection.lookupByID(i) + if d.name() == self.shortname: + self.vm_domain = d + break + return + + def check_power(self, state, timeout=None): + """ + Return true if vm domain state indicates power is on. + """ + return self.vm_domain.info[0] in [libvirt.VIR_DOMAIN_RUNNING, + libvirt.VIR_DOMAIN_BLOCKED, + libvirt.VIR_DOMAIN_PAUSED] + + def check_status(self, timeout=None): + """ + Return true if running. 
+ """ + return self.vm_domain.info()[0] == libvirt.VIR_DOMAIN_RUNNING + + def power_cycle(self): + """ + Simiulate virtual machine power cycle + """ + self.vm_domain.info().destroy() + self.vm_domain.info().create() + + def hard_reset(self): + """ + Simiulate hard reset + """ + self.vm_domain.info().destroy() + + def power_on(self): + """ + Simiulate power on + """ + self.vm_domain.info().create() + + def power_off(self): + """ + Simiulate power off + """ + self.vm_domain.info().destroy() + + def power_off_for_interval(self, interval=30): + """ + Simiulate power off for an interval. + """ + self.log.info('Power off for {i} seconds'.format(i=interval)) + self.vm_domain.info().destroy() + time.sleep(interval) + self.vm_domain.info().create() + self.log.info('Power off for {i} seconds completed'.format(i=interval)) diff --git a/teuthology/orchestra/daemon/__init__.py b/teuthology/orchestra/daemon/__init__.py new file mode 100644 index 000000000..ff8be0c67 --- /dev/null +++ b/teuthology/orchestra/daemon/__init__.py @@ -0,0 +1 @@ +from teuthology.orchestra.daemon.group import DaemonGroup # noqa diff --git a/teuthology/orchestra/daemon/cephadmunit.py b/teuthology/orchestra/daemon/cephadmunit.py new file mode 100644 index 000000000..f4959b173 --- /dev/null +++ b/teuthology/orchestra/daemon/cephadmunit.py @@ -0,0 +1,190 @@ +import logging + +from teuthology.orchestra.daemon.state import DaemonState + +log = logging.getLogger(__name__) + +class CephadmUnit(DaemonState): + def __init__(self, remote, role, id_, *command_args, + **command_kwargs): + super(CephadmUnit, self).__init__( + remote, role, id_, *command_args, **command_kwargs) + self._set_commands() + self.log = command_kwargs.get('logger', log) + self.use_cephadm = command_kwargs.get('use_cephadm') + self.is_started = command_kwargs.get('started', False) + if self.is_started: + self._start_logger() + + def name(self): + return '%s.%s' % (self.type_, self.id_) + + def _get_systemd_cmd(self, action): + return ' 
'.join([ + 'sudo', 'systemctl', + action, + 'ceph-%s@%s.%s' % (self.fsid, self.type_, self.id_), + ]) + + def _set_commands(self): + self.start_cmd = self._get_systemd_cmd('start') + self.stop_cmd = self._get_systemd_cmd('stop') + self.restart_cmd = self._get_systemd_cmd('restart') + self.show_cmd = self._get_systemd_cmd('show') + self.status_cmd = self._get_systemd_cmd('status') + + def kill_cmd(self, sig): + return ' '.join([ + 'sudo', 'docker', 'kill', + '-s', str(int(sig)), + 'ceph-%s-%s.%s' % (self.fsid, self.type_, self.id_), + ]) + + def _start_logger(self): + name = '%s.%s' % (self.type_, self.id_) + #self.log.info('_start_logger %s' % name) + self.remote_logger = self.remote.run( + args=['sudo', 'journalctl', + '-f', + '-n', '0', + '-u', + 'ceph-%s@%s.service' % (self.fsid, name) + ], + logger=logging.getLogger('journalctl@' + self.cluster + '.' + name), + label=name, + wait=False, + check_status=False, + ) + + def _stop_logger(self): + name = '%s.%s' % (self.type_, self.id_) + # this is a horrible kludge, since i don't know how else to kill + # the journalctl process at the other end :( + #self.log.info('_stop_logger %s running pkill' % name) + self.remote.run( + args=['sudo', 'pkill', '-f', + ' '.join(['journalctl', + '-f', + '-n', '0', + '-u', + 'ceph-%s@%s.service' % (self.fsid, name)]), + ], + check_status=False, + ) + #self.log.info('_stop_logger %s waiting') + self.remote_logger.wait() + self.remote_logger = None + #self.log.info('_stop_logger done') + + def reset(self): + """ + Does nothing in this implementation + """ + pass + + def restart(self, *args, **kwargs): + """ + Restart with a new command passed in the arguments + + :param args: positional arguments passed to remote.run + :param kwargs: keyword arguments passed to remote.run + """ + if not self.running(): + self.log.info('Restarting %s (starting--it wasn\'t running)...' 
% self.name()) + self._start_logger() + self.remote.sh(self.start_cmd) + self.is_started = True + else: + self.log.info('Restarting %s...' % self.name()) + self.remote.sh(self.restart_cmd) + + def restart_with_args(self, extra_args): + """ + Restart, adding new paramaters to the current command. + + :param extra_args: Extra keyword arguments to be added. + """ + raise NotImplementedError + + def running(self): + """ + Are we running? + """ + return self.is_started + + def finished(self): + """ + Is the daemon finished? + Return False if active. + """ + proc = self.remote.run( + args=self.status_cmd, + check_status=False, + quiet=True, + ) + return proc.returncode != 0 + + def signal(self, sig, silent=False): + """ + Send a signal to associated remote command + + :param sig: signal to send + """ + if not silent: + self.log.info('Senging signal %d to %s...' % (sig, self.name())) + # Ignore exception here because sending a singal via docker can be + # quite slow and easily race with, say, the daemon shutting down. + try: + self.remote.sh(self.kill_cmd(sig)) + except Exception as e: + self.log.info(f'Ignoring exception while sending signal: {e}') + + def start(self, timeout=300): + """ + Start this daemon instance. + """ + if self.running(): + self.log.warning('Restarting a running daemon') + self.restart() + return + self._start_logger() + self.remote.run(args=self.start_cmd) + self.is_started = True + + def stop(self, timeout=300): + """ + Stop this daemon instance. + + Note: this can raise a CommandFailedError, + CommandCrashedError, or ConnectionLostError. + + :param timeout: timeout to pass to orchestra.run.wait() + """ + if not self.running(): + self.log.error('Tried to stop a non-running daemon') + return + self.log.info('Stopping %s...' % self.name()) + self.remote.sh(self.stop_cmd) + self.is_started = False + self._stop_logger() + self.log.info('Stopped %s' % self.name()) + + # FIXME why are there two wait methods? 
+ def wait(self, timeout=300): + """ + Wait for daemon to exit + + Wait for daemon to stop (but don't trigger the stop). Pass up + any exception. Mark the daemon as not running. + """ + self.log.info('Waiting for %s to exit...' % self.name()) + self.remote.sh(self.stop_cmd) + self.is_started = False + self._stop_logger() + self.log.info('Finished waiting for %s to stop' % self.name()) + + def wait_for_exit(self): + """ + clear remote run command value after waiting for exit. + """ + self.wait() diff --git a/teuthology/orchestra/daemon/group.py b/teuthology/orchestra/daemon/group.py new file mode 100644 index 000000000..656f5a0ba --- /dev/null +++ b/teuthology/orchestra/daemon/group.py @@ -0,0 +1,180 @@ +from teuthology import misc +from teuthology.orchestra.daemon.state import DaemonState +from teuthology.orchestra.daemon.systemd import SystemDState +from teuthology.orchestra.daemon.cephadmunit import CephadmUnit + + +class DaemonGroup(object): + """ + Collection of daemon state instances + """ + def __init__(self, use_systemd=False, use_cephadm=None): + """ + self.daemons is a dictionary indexed by role. Each entry is a + dictionary of DaemonState values indexed by an id parameter. + + :param use_systemd: Whether or not to use systemd when appropriate + (default: False) Note: This option may be removed + in the future. + """ + self.daemons = {} + self.use_systemd = use_systemd + self.use_cephadm = use_cephadm + + def add_daemon(self, remote, type_, id_, *args, **kwargs): + """ + Add a daemon. If there already is a daemon for this id_ and role, stop + that daemon. (Re)start the daemon once the new value is set. 
+ + :param remote: Remote site + :param type_: type of daemon (osd, mds, mon, rgw, for example) + :param id_: Id (index into role dictionary) + :param args: Daemonstate positional parameters + :param kwargs: Daemonstate keyword parameters + """ + # for backwards compatibility with older ceph-qa-suite branches, + # we can only get optional args from unused kwargs entries + self.register_daemon(remote, type_, id_, *args, **kwargs) + cluster = kwargs.pop('cluster', 'ceph') + role = cluster + '.' + type_ + self.daemons[role][id_].restart() + + def register_daemon(self, remote, type_, id_, *args, **kwargs): + """ + Add a daemon. If there already is a daemon for this id_ and role, stop + that daemon. + + :param remote: Remote site + :param type_: type of daemon (osd, mds, mon, rgw, for example) + :param id_: Id (index into role dictionary) + :param args: Daemonstate positional parameters + :param kwargs: Daemonstate keyword parameters + """ + # for backwards compatibility with older ceph-qa-suite branches, + # we can only get optional args from unused kwargs entries + cluster = kwargs.pop('cluster', 'ceph') + role = cluster + '.' + type_ + if role not in self.daemons: + self.daemons[role] = {} + if id_ in self.daemons[role]: + self.daemons[role][id_].stop() + self.daemons[role][id_] = None + + klass = DaemonState + if self.use_cephadm: + klass = CephadmUnit + kwargs['use_cephadm'] = self.use_cephadm + elif self.use_systemd and \ + not any(i == 'valgrind' for i in args) and \ + remote.init_system == 'systemd': + # We currently cannot use systemd and valgrind together because + # it would require rewriting the unit files + klass = SystemDState + self.daemons[role][id_] = klass( + remote, role, id_, *args, **kwargs) + + def get_daemon(self, type_, id_, cluster='ceph'): + """ + get the daemon associated with this id_ for this role. + + :param type_: type of daemon (osd, mds, mon, rgw, for example) + :param id_: Id (index into role dictionary) + """ + role = cluster + '.' 
+ type_ + if role not in self.daemons: + return None + return self.daemons[role].get(str(id_), None) + + def iter_daemons_of_role(self, type_, cluster='ceph'): + """ + Iterate through all daemon instances for this role. Return dictionary + of daemon values. + + :param type_: type of daemon (osd, mds, mon, rgw, for example) + """ + role = cluster + '.' + type_ + return self.daemons.get(role, {}).values() + + def resolve_role_list(self, roles, types, cluster_aware=False): + """ + Resolve a configuration setting that may be None or contain wildcards + into a list of roles (where a role is e.g. 'mds.a' or 'osd.0'). This + is useful for tasks that take user input specifying a flexible subset + of the available roles. + + The task calling this must specify what kinds of roles it can can + handle using the ``types`` argument, where a role type is 'osd' or + 'mds' for example. When selecting roles this is used as a filter, or + when an explicit list of roles is passed, the an exception is raised if + any are not of a suitable type. + + Examples: + + :: + + # Passing None (i.e. user left config blank) defaults to all roles + # (filtered by ``types``) + None, types=['osd', 'mds', 'mon'] -> + ['osd.0', 'osd.1', 'osd.2', 'mds.a', mds.b', 'mon.a'] + # Wildcards are expanded + roles=['mds.*', 'osd.0'], types=['osd', 'mds', 'mon'] -> + ['mds.a', 'mds.b', 'osd.0'] + # Boring lists are unaltered + roles=['osd.0', 'mds.a'], types=['osd', 'mds', 'mon'] -> + ['osd.0', 'mds.a'] + # Entries in role list that don't match types result in an + # exception + roles=['osd.0', 'mds.a'], types=['osd'] -> RuntimeError + + :param roles: List (of roles or wildcards) or None (select all suitable + roles) + :param types: List of acceptable role types, for example + ['osd', 'mds']. 
+ :param cluster_aware: bool to determine whether to consider include + cluster in the returned roles - just for + backwards compatibility with pre-jewel versions + of ceph-qa-suite + :return: List of strings like ["mds.0", "osd.2"] + """ + assert (isinstance(roles, list) or roles is None) + + resolved = [] + if roles is None: + # Handle default: all roles available + for type_ in types: + for role, daemons in self.daemons.items(): + if not role.endswith('.' + type_): + continue + for daemon in daemons.values(): + prefix = type_ + if cluster_aware: + prefix = daemon.role + resolved.append(prefix + '.' + daemon.id_) + else: + # Handle explicit list of roles or wildcards + for raw_role in roles: + try: + cluster, role_type, role_id = misc.split_role(raw_role) + except ValueError: + msg = ("Invalid role '{0}', roles must be of format " + "[.].").format(raw_role) + raise RuntimeError(msg) + + if role_type not in types: + msg = "Invalid role type '{0}' in role '{1}'".format( + role_type, raw_role) + raise RuntimeError(msg) + + if role_id == "*": + # Handle wildcard, all roles of the type + for daemon in self.iter_daemons_of_role(role_type, + cluster=cluster): + prefix = role_type + if cluster_aware: + prefix = daemon.role + resolved.append(prefix + '.' + daemon.id_) + else: + # Handle explicit role + resolved.append(raw_role) + + return resolved diff --git a/teuthology/orchestra/daemon/state.py b/teuthology/orchestra/daemon/state.py new file mode 100644 index 000000000..03387c999 --- /dev/null +++ b/teuthology/orchestra/daemon/state.py @@ -0,0 +1,178 @@ +import logging +import struct + +from teuthology.exceptions import CommandFailedError +from teuthology.orchestra import run + +log = logging.getLogger(__name__) + + +class DaemonState(object): + """ + Daemon State. A daemon exists for each instance of each role. 
+ """ + def __init__(self, remote, role, id_, *command_args, **command_kwargs): + """ + Pass remote command information as parameters to remote site + + :param remote: Remote site + :param role: Role (osd, rgw, mon, mds) + :param id_: Id within role (osd.1, osd.2, for eaxmple) + :param command_args: positional arguments (used in restart commands) + :param command_kwargs: keyword arguments (used in restart commands) + """ + self.remote = remote + self.command_args = command_args + self.role = role + self.cluster, self.type_ = self.role.split('.')[0:2] + self.id_ = id_ + self.log = command_kwargs.get('logger', log) + self.fsid = command_kwargs.pop('fsid', None) + self.proc = None + self.command_kwargs = command_kwargs + + def check_status(self): + """ + Check to see if the process has exited. + + :returns: The exit status, if any + :raises: CommandFailedError, if the process was run with + check_status=True + """ + if self.proc: + return self.proc.poll() + + @property + def pid(self): + raise NotImplementedError + + def reset(self): + """ + clear remote run command value. + """ + self.proc = None + + def restart(self, *args, **kwargs): + """ + Restart with a new command passed in the arguments + + :param args: positional arguments passed to remote.run + :param kwargs: keyword arguments passed to remote.run + """ + self.log.info('Restarting daemon') + if self.proc is not None: + self.log.info('Stopping old one...') + self.stop() + cmd_args = list(self.command_args) + cmd_args.extend(args) + cmd_kwargs = self.command_kwargs + cmd_kwargs.update(kwargs) + self.proc = self.remote.run(*cmd_args, **cmd_kwargs) + self.log.info('Started') + + def restart_with_args(self, extra_args): + """ + Restart, adding new paramaters to the current command. + + :param extra_args: Extra keyword arguments to be added. 
+ """ + self.log.info('Restarting daemon with args') + if self.proc is not None: + self.log.info('Stopping old one...') + self.stop() + cmd_args = list(self.command_args) + # we only want to make a temporary mod of the args list + # so we shallow copy the dict, and deepcopy the args list + cmd_kwargs = self.command_kwargs.copy() + from copy import deepcopy + cmd_kwargs['args'] = deepcopy(self.command_kwargs['args']) + cmd_kwargs['args'].extend(extra_args) + self.proc = self.remote.run(*cmd_args, **cmd_kwargs) + self.log.info('Started') + + def running(self): + """ + Are we running? + :return: True if remote run command value is set, False otherwise. + """ + return self.proc is not None + + def finished(self): + """ + Is the daemon finished? + Return False if active. + """ + return self.proc.finished if self.proc is not None else False + + def signal(self, sig, silent=False): + """ + Send a signal to associated remote command. + + :param sig: signal to send + """ + if self.running(): + try: + self.proc.stdin.write(struct.pack('!b', sig)) + except IOError as e: + log.exception('Failed to send signal %d: %s', sig, e.strerror) + if not silent: + self.log.info('Sent signal %d', sig) + else: + self.log.error('No such daemon running') + + def start(self, timeout=300): + """ + Start this daemon instance. + """ + if self.running(): + self.log.warning('Restarting a running daemon') + self.restart() + + def stop(self, timeout=300): + """ + Stop this daemon instance. + + Note: this can raise a CommandFailedError, + CommandCrashedError, or ConnectionLostError. 
+ + :param timeout: timeout to pass to orchestra.run.wait() + """ + if not self.running(): + self.log.error('tried to stop a non-running daemon') + return + self.proc.stdin.close() + self.log.debug('waiting for process to exit') + try: + run.wait([self.proc], timeout=timeout) + except CommandFailedError: + log.exception("Error while waiting for process to exit") + self.proc = None + self.log.info('Stopped') + + # FIXME why are there two wait methods? + def wait(self, timeout=300): + """ + Wait for daemon to exit + + Wait for daemon to stop (but don't trigger the stop). Pass up + any exception. Mark the daemon as not running. + """ + self.log.debug('waiting for process to exit') + try: + run.wait([self.proc], timeout=timeout) + self.log.info('Stopped') + except: + self.log.info('Failed') + raise + finally: + self.proc = None + + def wait_for_exit(self): + """ + clear remote run command value after waiting for exit. + """ + if self.proc: + try: + run.wait([self.proc]) + finally: + self.proc = None diff --git a/teuthology/orchestra/daemon/systemd.py b/teuthology/orchestra/daemon/systemd.py new file mode 100644 index 000000000..9a4bc4374 --- /dev/null +++ b/teuthology/orchestra/daemon/systemd.py @@ -0,0 +1,229 @@ +import logging +import re + +from teuthology.exceptions import CommandFailedError +from teuthology.orchestra import run +from teuthology.orchestra.daemon.state import DaemonState + +log = logging.getLogger(__name__) + +systemd_cmd_templ = 'sudo systemctl {action} {daemon}@{id_}' + + +class SystemDState(DaemonState): + def __init__(self, remote, role, id_, *command_args, + **command_kwargs): + super(SystemDState, self).__init__( + remote, role, id_, *command_args, **command_kwargs) + self._set_commands() + self.log = command_kwargs.get('logger', log) + + @property + def daemon_type(self): + if self.type_ == 'rgw': + return 'radosgw' + return self.type_ + + def _get_systemd_cmd(self, action): + cmd = systemd_cmd_templ.format( + action=action, + daemon='%s-%s' % 
(self.cluster, self.daemon_type), + id_=self.id_.replace('client.', ''), + ) + return cmd + + def _set_commands(self): + self.start_cmd = self._get_systemd_cmd('start') + self.stop_cmd = self._get_systemd_cmd('stop') + self.restart_cmd = self._get_systemd_cmd('restart') + self.show_cmd = self._get_systemd_cmd('show') + self.status_cmd = self._get_systemd_cmd('status') + cluster_and_type = '%s-%s' % (self.cluster, self.daemon_type) + if self.type_ == self.daemon_type: + syslog_id = cluster_and_type + else: + syslog_id = self.daemon_type + self.output_cmd = 'sudo journalctl -u ' \ + '{0}@{1} -t {2} -n 10'.format( + cluster_and_type, + self.id_.replace('client.', ''), + syslog_id, + ) + + def check_status(self): + """ + Check to see if the process has exited. + + :returns: The exit status, if any + :raises: CommandFailedError, if the process was run with + check_status=True + """ + output = self.remote.sh(self.show_cmd + ' | grep -i state') + + def parse_line(line): + key, value = line.strip().split('=', 1) + return {key.strip(): value.strip()} + + show_dict = dict() + + for line in output.split('\n'): + # skip empty and commented string + if not line or line.startswith("#"): + continue + show_dict.update(parse_line(line)) + + active_state = show_dict['ActiveState'] + sub_state = show_dict['SubState'] + if active_state == 'active': + return None + self.log.info("State is: %s/%s", active_state, sub_state) + out = self.remote.sh( + # This will match a line like: + # Main PID: 13394 (code=exited, status=1/FAILURE) + # Or (this is wrapped): + # Apr 26 21:29:33 ovh083 systemd[1]: ceph-osd@1.service: + # Main process exited, code=exited, status=1/FAILURE + self.status_cmd + " | grep 'Main.*code=exited'", + ) + line = out.strip().split('\n')[-1] + exit_code = int(re.match(r'.*status=(\d+).*', line).groups()[0]) + if exit_code: + self.remote.run( + args=self.output_cmd + ) + raise CommandFailedError( + self.start_cmd, + exit_code, + self.remote, + ) + return exit_code + + 
@property + def pid(self): + """ + Method to retrieve daemon process id + """ + proc_name = 'ceph-%s' % self.type_ + + # process regex to match OSD, MON, MGR, MDS process command string + # eg. "/usr/bin/ceph- -f --cluster ceph --id " + proc_regex = '"%s.*--id %s "' % (proc_name, self.id_) + + # process regex to match RADOSGW process command string + # eg. "/usr/bin/radosgw -f --cluster ceph --name " + if self.type_ == "rgw": + proc_regex = '"{}.*--name.*{}"'.format(self.daemon_type, self.id_) + + args = ['ps', '-ef', + run.Raw('|'), + 'grep', + run.Raw(proc_regex), + run.Raw('|'), + 'grep', '-v', + 'grep', run.Raw('|'), + 'awk', + run.Raw("{'print $2'}")] + pid_string = self.remote.sh(args).strip() + if not pid_string.isdigit(): + return None + return int(pid_string) + + def reset(self): + """ + Does nothing in this implementation + """ + pass + + def restart(self, *args, **kwargs): + """ + Restart with a new command passed in the arguments + + :param args: positional arguments passed to remote.run + :param kwargs: keyword arguments passed to remote.run + """ + self.log.info('Restarting daemon using systemd') + if not self.running(): + self.log.info('starting a non-running daemon') + self.remote.run(args=[run.Raw(self.start_cmd)]) + else: + self.remote.run(args=[run.Raw(self.restart_cmd)]) + # check status will also fail if the process hasn't restarted + self.check_status() + + def restart_with_args(self, extra_args): + """ + Restart, adding new paramaters to the current command. + + :param extra_args: Extra keyword arguments to be added. + """ + self.log.warning( + "restart_with_args() is not supported with systemd; performing" + "normal restart") + self.restart() + + def running(self): + """ + Are we running? + :return: The PID if remote run command value is set, False otherwise. 
+ """ + pid = self.pid + if pid is None: + return None + elif pid <= 0: + return None + else: + return pid + + def signal(self, sig, silent=False): + """ + Send a signal to associated remote command + + :param sig: signal to send + """ + self.log.warning("systemd may restart daemons automatically") + pid = self.pid + self.log.info("Sending signal %s to process %s", sig, pid) + sig = '-' + str(sig) + self.remote.run(args=['sudo', 'kill', str(sig), str(pid)]) + + def start(self, timeout=300): + """ + Start this daemon instance. + """ + if self.running(): + self.log.warning('Restarting a running daemon') + self.restart() + return + self.remote.run(args=[run.Raw(self.start_cmd)]) + + def stop(self, timeout=300): + """ + Stop this daemon instance. + + Note: this can raise a CommandFailedError, + CommandCrashedError, or ConnectionLostError. + + :param timeout: timeout to pass to orchestra.run.wait() + """ + if not self.running(): + self.log.error('tried to stop a non-running daemon') + return + self.remote.run(args=[run.Raw(self.stop_cmd)]) + self.log.info('Stopped') + + # FIXME why are there two wait methods? + def wait(self, timeout=300): + """ + Wait for daemon to exit + + Wait for daemon to stop (but don't trigger the stop). Pass up + any exception. Mark the daemon as not running. + """ + self.log.error("wait() not suported in systemd") + + def wait_for_exit(self): + """ + clear remote run command value after waiting for exit. + """ + # TODO: This ought to be possible, no? + self.log.error("wait_for_exit() is not supported with systemd") diff --git a/teuthology/orchestra/monkey.py b/teuthology/orchestra/monkey.py new file mode 100644 index 000000000..e13e77305 --- /dev/null +++ b/teuthology/orchestra/monkey.py @@ -0,0 +1,56 @@ +""" +Monkey patches (paramiko support) +""" +import logging + +log = logging.getLogger(__name__) + +def patch_001_paramiko_deprecation(): + """ + Silence an an unhelpful Deprecation Warning triggered by Paramiko. 
+ + Not strictly a monkeypatch. + """ + import warnings + warnings.filterwarnings( + category=DeprecationWarning, + message='This application uses RandomPool,', + action='ignore', + ) + + +def patch_100_paramiko_log(): + """ + Silence some noise paramiko likes to log. + + Not strictly a monkeypatch. + """ + logging.getLogger('paramiko.transport').setLevel(logging.WARNING) + + +def patch_100_logger_getChild(): + """ + Imitate Python 2.7 feature Logger.getChild. + """ + import logging + if not hasattr(logging.Logger, 'getChild'): + def getChild(self, name): + return logging.getLogger('.'.join([self.name, name])) + logging.Logger.getChild = getChild + + +def patch_100_trigger_rekey(): + # Fixes http://tracker.ceph.com/issues/15236 + from paramiko.packet import Packetizer + Packetizer._trigger_rekey = lambda self: True + + +def patch_all(): + """ + Run all the patch_* functions in this module. + """ + monkeys = [(k, v) for (k, v) in globals().items() if k.startswith('patch_') and k != 'patch_all'] + monkeys.sort() + for k, v in monkeys: + log.debug('Patching %s', k) + v() diff --git a/teuthology/orchestra/opsys.py b/teuthology/orchestra/opsys.py new file mode 100644 index 000000000..7f72234f7 --- /dev/null +++ b/teuthology/orchestra/opsys.py @@ -0,0 +1,272 @@ +import re + +from packaging.version import parse as parse_version, Version + + +DISTRO_CODENAME_MAP = { + "ubuntu": { + "24.04": "noble", + "22.04": "jammy", + "20.04": "focal", + "18.04": "bionic", + "17.10": "artful", + "17.04": "zesty", + "16.10": "yakkety", + "16.04": "xenial", + "15.10": "wily", + "15.04": "vivid", + "14.10": "utopic", + "14.04": "trusty", + "13.10": "saucy", + "12.04": "precise", + }, + "debian": { + "7": "wheezy", + "8": "jessie", + "9": "stretch", + "10": "buster", + "11": "bullseye", + "12": "bookworm", + }, + "rhel": { + "9": "plow", + "8": "ootpa", + "7": "maipo", + "6": "santiago", + }, + "alma": { + "8.10": "alma", + "9.6": "alma", + "10.0": "alma", + }, + "rocky": { + "8.10": 
"rocky", + "9.6": "rocky", + "10.0": "rocky", + }, + "centos": { + "10": "stream", + "9": "stream", + "8": "core", + "7": "core", + "6": "core", + }, + "fedora": { + "28": "28", + "27": "27", + "26": "26", + "25": "25", + "24": "24", + "23": "23", + "22": "22", + "21": "21", + "20": "heisenbug", + }, + "opensuse": { + "1.0": "tumbleweed", + "15.0": "leap", + "15.1": "leap", + "15.2": "leap", + "15.3": "leap", + "15.4": "leap", + "15.5": "leap", + "42.2": "leap", + "42.3": "leap", + }, + "sle": { + "12.1": "sle", + "12.2": "sle", + "12.3": "sle", + "15.0": "sle", + "15.1": "sle", + "15.2": "sle", + "15.3": "sle", + "15.4": "sle", + "15.5": "sle", + }, +} + +DEFAULT_OS_VERSION = dict( + ubuntu="22.04", + fedora="25", + centos="9.stream", + opensuse="15.4", + sle="15.2", + rhel="8.6", + rocky="9.6", + alma="9.6", + debian='8.0' +) + + +class OS(object): + """ + Class that parses either /etc/os-release or the output of 'lsb_release -a' + and provides OS name and version information. + + Must be initialized with OS.from_lsb_release or OS.from_os_release + """ + + __slots__ = ['name', 'version', 'codename', 'package_type'] + + _deb_distros = ('debian', 'ubuntu') + _rpm_distros = ('fedora', 'rhel', 'centos', 'opensuse', 'sle') + + def __init__(self, name=None, version=None, codename=None): + self.name = name + self.version = version or self._codename_to_version(name, codename) + self.codename = codename or self._version_to_codename(name, version) + self._set_package_type() + + @staticmethod + def _version_to_codename(name, version): + for (_version, codename) in DISTRO_CODENAME_MAP[name].items(): + if str(version) == _version or str(version).split('.')[0] == _version: + return codename + + @staticmethod + def _codename_to_version(name, codename): + for (version, _codename) in DISTRO_CODENAME_MAP[name].items(): + if codename == _codename: + return version + raise RuntimeError("No version found for %s %s !" 
% ( + name, + codename, + )) + + @classmethod + def from_lsb_release(cls, lsb_release_str): + """ + Parse output from lsb_release -a and populate attributes + + Given output like: + Distributor ID: Ubuntu + Description: Ubuntu 12.04.4 LTS + Release: 12.04 + Codename: precise + + Attributes will be: + name = 'ubuntu' + version = '12.04' + codename = 'precise' + Additionally, we set the package type: + package_type = 'deb' + """ + str_ = lsb_release_str.strip() + name = cls._get_value(str_, 'Distributor ID') + if name == 'RedHatEnterpriseServer': + name = 'rhel' + elif name.startswith('openSUSE'): + name = 'opensuse' + elif name.startswith('SUSE'): + name = 'sle' + name = name.lower() + + version = cls._get_value(str_, 'Release') + codename = cls._get_value(str_, 'Codename').lower() + obj = cls(name=name, version=version, codename=codename) + + return obj + + @classmethod + def from_os_release(cls, os_release_str): + """ + Parse /etc/os-release and populate attributes + + Given output like: + NAME="Ubuntu" + VERSION="12.04.4 LTS, Precise Pangolin" + ID=ubuntu + ID_LIKE=debian + PRETTY_NAME="Ubuntu precise (12.04.4 LTS)" + VERSION_ID="12.04" + + Attributes will be: + name = 'ubuntu' + version = '12.04' + codename = None + Additionally, we set the package type: + package_type = 'deb' + """ + str_ = os_release_str.strip() + version = cls._get_value(str_, 'VERSION_ID') + name = cls._get_value(str_, 'ID').lower() + if name == 'sles': + name = 'sle' + elif name == 'opensuse-leap': + name = 'opensuse' + elif name == 'opensuse-tumbleweed': + name = 'opensuse' + elif name == 'centos': + if parse_version(version) >= Version("8.0"): + version = f"{version}.stream" + elif name == 'almalinux': + name = 'alma' + obj = cls(name=name, version=version) + return obj + + + @classmethod + def version_codename(cls, name, version_or_codename): + """ + Return (version, codename) based on one input, trying to infer + which we're given + """ + codename = None + version = None + + try: + 
codename = OS._version_to_codename(name, version_or_codename) + except KeyError: + pass + + try: + version = OS._codename_to_version(name, version_or_codename) + except (KeyError, RuntimeError): + pass + + if version: + codename = version_or_codename + elif codename: + version = version_or_codename + else: + raise KeyError('%s not a %s version or codename' % + (version_or_codename, name)) + return version, codename + + + @staticmethod + def _get_value(str_, name): + regex = '^%s[:=](.+)' % name + match = re.search(regex, str_, flags=re.M) + if match: + return match.groups()[0].strip(' \t"\'') + return '' + + def _set_package_type(self): + if self.name in self._deb_distros: + self.package_type = "deb" + elif self.name in self._rpm_distros: + self.package_type = "rpm" + + def to_dict(self): + return dict( + name=self.name, + version=self.version, + codename=self.codename, + ) + + def __str__(self): + return " ".join([self.name, self.version]).strip() + + def __repr__(self): + return "OS(name={name}, version={version}, codename={codename})"\ + .format(name=repr(self.name), + version=repr(self.version), + codename=repr(self.codename)) + + def __eq__(self, other): + if self.name.lower() != other.name.lower(): + return False + normalize = lambda s: s.lower().removesuffix(".stream") + return normalize(self.version) == normalize(other.version) diff --git a/teuthology/orchestra/remote.py b/teuthology/orchestra/remote.py new file mode 100644 index 000000000..1790d1e2b --- /dev/null +++ b/teuthology/orchestra/remote.py @@ -0,0 +1,806 @@ +""" +Support for paramiko remote objects. 
+""" + +import teuthology.lock.query +import teuthology.lock.util +from teuthology.contextutil import safe_while +from teuthology.orchestra import run +from teuthology.orchestra import connection +from teuthology.orchestra import console +from teuthology.orchestra.opsys import OS +import teuthology.provision +from teuthology import misc +from teuthology.exceptions import CommandFailedError, UnitTestError +from teuthology.util.scanner import UnitTestScanner +from teuthology.misc import host_shortname +import errno +import re +import logging +from io import BytesIO +from io import StringIO +import os +import pwd +import tempfile +import netaddr + +log = logging.getLogger(__name__) + + +class RemoteShell(object): + """ + Contains methods to run miscellaneous shell commands on remote machines. + + These methods were originally part of orchestra.remote.Remote. The reason + for moving these methods from Remote is that applications that use + teuthology for testing usually have programs that can run tests locally on + a single node machine for development work (for example, vstart_runner.py + in case of Ceph). These programs can import and reuse these methods + without having to deal SSH stuff. In short, this class serves a shared + interface. + + To use these methods, inherit the class here and implement "run()" method in + the subclass. + """ + + def remove(self, path): + self.run(args=['rm', '-fr', path]) + + def mkdtemp(self, suffix=None, parentdir=None): + """ + Create a temporary directory on remote machine and return it's path. + """ + args = ['mktemp', '-d'] + + if suffix: + args.append('--suffix=%s' % suffix) + if parentdir: + args.append('--tmpdir=%s' % parentdir) + + return self.sh(args).strip() + + def mktemp(self, suffix=None, parentdir=None, data=None): + """ + Make a remote temporary file. 
+ + :param suffix: suffix for the temporary file + :param parentdir: parent dir where temp file should be created + :param data: write data to the file if provided + + Returns: the path of the temp file created. + """ + args = ['mktemp'] + if suffix: + args.append('--suffix=%s' % suffix) + if parentdir: + args.append('--tmpdir=%s' % parentdir) + + path = self.sh(args).strip() + + if data: + self.write_file(path=path, data=data) + + return path + + def sh(self, script, **kwargs): + """ + Shortcut for run method. + + Usage: + my_name = remote.sh('whoami') + remote_date = remote.sh('date') + """ + if 'stdout' not in kwargs: + kwargs['stdout'] = BytesIO() + if 'args' not in kwargs: + kwargs['args'] = script + proc = self.run(**kwargs) + out = proc.stdout.getvalue() + if isinstance(out, bytes): + return out.decode() + else: + return out + + def sh_file(self, script, label="script", sudo=False, **kwargs): + """ + Run shell script after copying its contents to a remote file + + :param script: string with script text, or file object + :param sudo: run command with sudo if True, + run as user name if string value (defaults to False) + :param label: string value which will be part of file name + Returns: stdout + """ + ftempl = '/tmp/teuthology-remote-$(date +%Y%m%d%H%M%S)-{}-XXXX'\ + .format(label) + script_file = self.sh("mktemp %s" % ftempl).strip() + self.sh("cat - | tee {script} ; chmod a+rx {script}"\ + .format(script=script_file), stdin=script) + if sudo: + if isinstance(sudo, str): + command="sudo -u %s %s" % (sudo, script_file) + else: + command="sudo %s" % script_file + else: + command="%s" % script_file + + return self.sh(command, **kwargs) + + def chmod(self, file_path, permissions): + """ + As super-user, set permissions on the remote file specified. + """ + args = [ + 'sudo', + 'chmod', + permissions, + file_path, + ] + self.run( + args=args, + ) + + def chcon(self, file_path, context): + """ + Set the SELinux context of a given file. 
+ + VMs and non-RPM-based hosts will skip this operation because ours + currently have SELinux disabled. + + :param file_path: The path to the file + :param context: The SELinux context to be used + """ + if self.os.package_type != 'rpm' or \ + self.os.name in ['opensuse', 'sle']: + return + if teuthology.lock.query.is_vm(self.shortname): + return + self.run(args="sudo chcon {con} {path}".format( + con=context, path=file_path)) + + def copy_file(self, src, dst, sudo=False, mode=None, owner=None, + mkdir=False, append=False): + """ + Copy data to remote file + + :param src: source file path on remote host + :param dst: destination file path on remote host + :param sudo: use sudo to write file, defaults False + :param mode: set file mode bits if provided + :param owner: set file owner if provided + :param mkdir: ensure the destination directory exists, defaults + False + :param append: append data to the file, defaults False + """ + dd = 'sudo dd' if sudo else 'dd' + args = dd + ' if=' + src + ' of=' + dst + if append: + args += ' conv=notrunc oflag=append' + if mkdir: + mkdirp = 'sudo mkdir -p' if sudo else 'mkdir -p' + dirpath = os.path.dirname(dst) + if dirpath: + args = mkdirp + ' ' + dirpath + '\n' + args + if mode: + chmod = 'sudo chmod' if sudo else 'chmod' + args += '\n' + chmod + ' ' + mode + ' ' + dst + if owner: + chown = 'sudo chown' if sudo else 'chown' + args += '\n' + chown + ' ' + owner + ' ' + dst + args = 'set -ex' + '\n' + args + self.run(args=args) + + def move_file(self, src, dst, sudo=False, mode=None, owner=None, + mkdir=False): + """ + Move data to remote file + + :param src: source file path on remote host + :param dst: destination file path on remote host + :param sudo: use sudo to write file, defaults False + :param mode: set file mode bits if provided + :param owner: set file owner if provided + :param mkdir: ensure the destination directory exists, defaults + False + """ + mv = 'sudo mv' if sudo else 'mv' + args = mv + ' ' + src + ' ' + 
dst + if mkdir: + mkdirp = 'sudo mkdir -p' if sudo else 'mkdir -p' + dirpath = os.path.dirname(dst) + if dirpath: + args = mkdirp + ' ' + dirpath + '\n' + args + if mode: + chmod = 'sudo chmod' if sudo else 'chmod' + args += ' && ' + chmod + ' ' + mode + ' ' + dst + if owner: + chown = 'sudo chown' if sudo else 'chown' + args += ' && ' + chown + ' ' + owner + ' ' + dst + self.run(args=args) + + def read_file(self, path, sudo=False, stdout=None, + offset=0, length=0): + """ + Read data from remote file + + :param path: file path on remote host + :param sudo: use sudo to read the file, defaults False + :param stdout: output object, defaults to io.BytesIO() + :param offset: number of bytes to skip from the file + :param length: number of bytes to read from the file + + :raises: :class:`FileNotFoundError`: there is no such file by the path + :raises: :class:`RuntimeError`: unexpected error occurred + + :returns: the file contents in bytes, if stdout is `io.BytesIO`, by + default + :returns: the file contents in str, if stdout is `io.StringIO` + """ + dd = 'sudo dd' if sudo else 'dd' + args = dd + ' if=' + path + ' of=/dev/stdout' + iflags=[] + # we have to set defaults here instead of the method's signature, + # because python is reusing the object from call to call + stdout = stdout or BytesIO() + if offset: + args += ' skip=' + str(offset) + iflags += 'skip_bytes' + if length: + args += ' count=' + str(length) + iflags += 'count_bytes' + if iflags: + args += ' iflag=' + ','.join(iflags) + args = 'set -ex' + '\n' + args + proc = self.run(args=args, stdout=stdout, stderr=StringIO(), + check_status=False, quiet=True) + if proc.returncode: + if 'No such file or directory' in proc.stderr.getvalue(): + raise FileNotFoundError(errno.ENOENT, + f"Cannot find file on the remote '{self.name}'", path) + else: + raise RuntimeError("Unexpected error occurred while trying to " + f"read '{path}' file on the remote '{self.name}'") + + return proc.stdout.getvalue() + + + def 
write_file(self, path, data, sudo=False, mode=None, owner=None, + mkdir=False, append=False, bs=None, + offset=None, sync=False): + """ + Write data to remote file + + The data written in 512-byte blocks, provide `bs` to use bigger blocks. + + :param path: file path on remote host + :param data: str, binary or fileobj to be written + :param sudo: use sudo to write file, defaults False + :param mode: set file mode bits if provided + :param owner: set file owner if provided + :param mkdir: preliminary create the file directory, defaults False + :param append: append data to the file, defaults False + :param bs: write up to N bytes at a time if provided, default is 512 in `dd` + :param offset: number of bs blocks to seek to in file, defaults 0 + :param sync: sync file after write is complete if provided + """ + dd = 'sudo dd' if sudo else 'dd' + args = dd + ' of=' + path + if append: + args += ' conv=notrunc oflag=append' + if bs: + args += ' bs=' + str(bs) + if offset: + args += ' seek=' + str(offset) + if sync: + args += ' conv=sync' + if mkdir: + mkdirp = 'sudo mkdir -p' if sudo else 'mkdir -p' + dirpath = os.path.dirname(path) + if dirpath: + args = mkdirp + ' ' + dirpath + '\n' + args + if mode: + chmod = 'sudo chmod' if sudo else 'chmod' + args += '\n' + chmod + ' ' + mode + ' ' + path + if owner: + chown = 'sudo chown' if sudo else 'chown' + args += '\n' + chown + ' ' + owner + ' ' + path + args = 'set -ex' + '\n' + args + self.run(args=args, stdin=data, quiet=True) + + def sudo_write_file(self, path, data, **kwargs): + """ + Write data to remote file with sudo, for more info see `write_file()`. + """ + self.write_file(path, data, sudo=True, **kwargs) + + def is_mounted(self, path): + """ + Check if the given path is mounted on the remote machine. 
+ + This method checks the contents of "/proc/self/mounts" instead of + using "mount" or "findmnt" command since these commands hang when a + CephFS client is blocked and its mount point on the remote machine + is left unhandled/unmounted. + + :param path: path on remote host + """ + # XXX: matching newline too is crucial so that "/mnt" does not match + # "/mnt/cephfs" if it's present in the output. + return f'{path}\n' in self.sh("cat /proc/self/mounts | awk '{print $2}'") + + @property + def os(self): + if not hasattr(self, '_os'): + try: + os_release = self.sh('cat /etc/os-release').strip() + self._os = OS.from_os_release(os_release) + return self._os + except CommandFailedError: + pass + + lsb_release = self.sh('lsb_release -a').strip() + self._os = OS.from_lsb_release(lsb_release) + return self._os + + @property + def arch(self): + if not hasattr(self, '_arch'): + self._arch = self.sh('uname -m').strip() + return self._arch + + +class Remote(RemoteShell): + """ + A connection to a remote host. + + This is a higher-level wrapper around Paramiko's `SSHClient`. + """ + + # for unit tests to hook into + _runner = staticmethod(run.run) + _reimage_types = None + + def __init__(self, name, ssh=None, shortname=None, console=None, + host_key=None, keep_alive=True): + self.name = name + if '@' in name: + (self.user, hostname) = name.split('@') + # Temporary workaround for 'hostname --fqdn' not working on some + # machines + self._hostname = hostname + else: + # os.getlogin() doesn't work on non-login shells. 
The following + # should work on any unix system + self.user = pwd.getpwuid(os.getuid()).pw_name + hostname = name + self._shortname = shortname or host_shortname(hostname) + self._host_key = host_key + self.keep_alive = keep_alive + self._console = console + self.ssh = ssh + + if self._reimage_types is None: + Remote._reimage_types = teuthology.provision.get_reimage_types() + + def connect(self, timeout=None, create_key=None, context='connect'): + args = dict(user_at_host=self.name, host_key=self._host_key, + keep_alive=self.keep_alive, _create_key=create_key) + if context == 'reconnect': + # The reason for the 'context' workaround is not very + # clear from the technical side. + # I'll get "[Errno 98] Address already in use" altough + # there are no open tcp(ssh) connections. + # When connecting without keepalive, host_key and _create_key + # set, it will proceed. + args = dict(user_at_host=self.name, _create_key=False, host_key=None) + if timeout: + args['timeout'] = timeout + + self.ssh = connection.connect(**args) + return self.ssh + + def reconnect(self, timeout=30, socket_timeout=None): + """ + Attempts to re-establish connection. Returns True for success; False + for failure. 
+ """ + if self.ssh is not None: + self.ssh.close() + if not timeout: + return self._reconnect(timeout=socket_timeout) + action = f"reconnect to {self.shortname}" + with safe_while(action=action, timeout=timeout, increment=3, _raise=False) as proceed: + success = False + while proceed(): + success = self._reconnect(timeout=socket_timeout) + if success: + log.info(f"Successfully reconnected to host '{self.name}'") + return success + return success + + def _reconnect(self, timeout=None): + log.info(f"Trying to reconnect to host '{self.name}'") + try: + self.connect(timeout=timeout, context='reconnect') + return self.is_online + except Exception as e: + log.debug(e) + return False + + @property + def ip_address(self): + return self.ssh.get_transport().getpeername()[0] + + @property + def interface(self): + """ + The interface used by the current SSH connection + """ + if not hasattr(self, '_interface'): + self._set_iface_and_cidr() + return self._interface + + @property + def cidr(self): + """ + The network (in CIDR notation) used by the remote's SSH connection + """ + if not hasattr(self, '_cidr'): + self._set_iface_and_cidr() + return self._cidr + + def _set_iface_and_cidr(self): + ip_addr_show = self.sh('PATH=/sbin:/usr/sbin ip addr show') + regexp = 'inet.? %s' % self.ip_address + for line in ip_addr_show.split('\n'): + line = line.strip() + if re.match(regexp, line): + items = line.split() + self._interface = items[-1] + self._cidr = str(netaddr.IPNetwork(items[1]).cidr) + return + raise RuntimeError("Could not determine interface/CIDR!") + + + def resolve_ip(self, name=None, ipv='4') -> str: + """ + Resolve IP address of the remote host via remote host + + Because remote object maybe behind bastion host we need + the remote host address resolvable from remote side. 
+ So in order to the ip address we just call `host` remotely + and parse output like: + 'smithi001.front.sepia.ceph.com has address 172.21.15.1\n' + + :param name: hostname to resolve, by defaults remote host itself. + :param ipv: the IP version, 4 or 6, defaults to 4. + + :raises: :class:`Exception`: when the hostname cannot be resolved. + :raises: :class:`ValueError`: when the ipv argument mismatch 4 or 6. + + :returns: str object, the ip addres of the remote host. + """ + hostname = name or self.hostname + version = str(ipv) + if version in ['4', '6']: + remote_host_ip = self.sh(f'host -{ipv} {hostname}') + else: + raise ValueError(f'Unknown IP version {ipv}, expected 4 or 6') + # `host -4` or `host -6` may have multiline output: a host can have + # several address; thus try and find the first suitable + for info in remote_host_ip.split("\n"): + if version == '4' and 'has address' in info: + (host, ip) = info.strip().split(' has address ') + if hostname in host: + return ip + elif version == '6' and 'has IPv6 address' in info: + (host, ip) = info.strip().split(' has IPv6 address ') + if hostname in host: + return ip + else: + raise Exception(f'Cannot get IPv{ipv} address for the host "{hostname}" via remote "{self.hostname}"') + + + @property + def hostname(self): + if not hasattr(self, '_hostname'): + self._hostname = self.sh('hostname --fqdn').strip() + return self._hostname + + @property + def machine_type(self): + if not getattr(self, '_machine_type', None): + remote_info = teuthology.lock.query.get_status(self.hostname) + if not remote_info: + return None + self._machine_type = remote_info.get("machine_type", None) + return self._machine_type + + @property + def is_reimageable(self): + return self.machine_type in self._reimage_types + + @property + def shortname(self): + if self._shortname is None: + self._shortname = host_shortname(self.hostname) + return self._shortname + + @property + def is_online(self): + if self.ssh is None: + return False + if 
self.ssh.get_transport() is None: + return False + try: + self.run(args="true") + except Exception: + return False + return self.ssh.get_transport().is_active() + + def ensure_online(self): + if self.is_online: + return + self.connect() + if not self.is_online: + raise ConnectionError(f'Failed to connect to {self.shortname}') + + @property + def system_type(self): + """ + System type decorator + """ + return misc.get_system_type(self) + + def __str__(self): + return self.name + + def __repr__(self): + return '{classname}(name={name!r})'.format( + classname=self.__class__.__name__, + name=self.name, + ) + + def run(self, **kwargs): + """ + This calls `orchestra.run.run` with our SSH client. + + TODO refactor to move run.run here? + """ + if not self.ssh or \ + not self.ssh.get_transport() or \ + not self.ssh.get_transport().is_active(): + if not self.reconnect(): + raise ConnectionError(f'Failed to reconnect to {self.shortname}') + r = self._runner(client=self.ssh, name=self.shortname, **kwargs) + r.remote = self + return r + + def run_unit_test(self, xml_path_regex, output_yaml, **kwargs): + try: + r = self.run(**kwargs) + except CommandFailedError as exc: + if xml_path_regex: + error_msg = UnitTestScanner(remote=self).scan_and_write(xml_path_regex, output_yaml) + if error_msg: + raise UnitTestError( + exitstatus=exc.exitstatus, node=exc.node, + label=exc.label, message=error_msg + ) + raise exc + return r + + def _sftp_put_file(self, local_path, remote_path): + """ + Use the paramiko.SFTPClient to put a file. Returns the remote filename. + """ + sftp = self.ssh.open_sftp() + sftp.put(local_path, remote_path) + return + + def _sftp_get_file(self, remote_path, local_path): + """ + Use the paramiko.SFTPClient to get a file. Returns the local filename. 
+ """ + file_size = self._format_size( + self._sftp_get_size(remote_path) + ).strip() + log.debug("{}:{} is {}".format(self.shortname, remote_path, file_size)) + sftp = self.ssh.open_sftp() + sftp.get(remote_path, local_path) + return local_path + + def _sftp_open_file(self, remote_path, mode=None): + """ + Use the paramiko.SFTPClient to open a file. Returns a + paramiko.SFTPFile object. + """ + sftp = self.ssh.open_sftp() + if mode: + return sftp.open(remote_path, mode) + return sftp.open(remote_path) + + def _sftp_get_size(self, remote_path): + """ + Via _sftp_open_file, return the filesize in bytes + """ + with self._sftp_open_file(remote_path) as f: + return f.stat().st_size + + @staticmethod + def _format_size(file_size): + """ + Given a file_size in bytes, returns a human-readable representation. + """ + for unit in ('B', 'KB', 'MB', 'GB', 'TB'): + if abs(file_size) < 1024.0: + break + file_size = file_size / 1024.0 + return "{:3.0f}{}".format(file_size, unit) + + def put_file(self, path, dest_path, sudo=False): + """ + Copy a local filename to a remote file + """ + if sudo: + raise NotImplementedError("sudo not supported") + + self._sftp_put_file(path, dest_path) + return + + def get_file(self, path, sudo=False, dest_dir='/tmp'): + """ + Fetch a remote file, and return its local filename. + + :param sudo: Use sudo on the remote end to read a file that + requires it. Defaults to False. + :param dest_dir: Store the file in this directory. If it is /tmp, + generate a unique filename; if not, use the original + filename. 
+ :returns: The path to the local file + """ + if not os.path.isdir(dest_dir): + raise IOError("{dir} is not a directory".format(dir=dest_dir)) + + if sudo: + orig_path = path + path = self.mktemp() + args = [ + 'sudo', + 'cp', + orig_path, + path, + ] + self.run(args=args) + self.chmod(path, '0666') + + if dest_dir == '/tmp': + # If we're storing in /tmp, generate a unique filename + (fd, local_path) = tempfile.mkstemp(dir=dest_dir) + os.close(fd) + else: + # If we are storing somewhere other than /tmp, use the original + # filename + local_path = os.path.join(dest_dir, path.split(os.path.sep)[-1]) + + self._sftp_get_file(path, local_path) + if sudo: + self.remove(path) + return local_path + + def get_tar(self, path, to_path, sudo=False, compress=True): + """ + Tar a remote directory and copy it locally + """ + remote_temp_path = self.mktemp() + args = [] + if sudo: + args.append('sudo') + args.extend([ + 'tar', + 'cz' if compress else 'c', + '-f', '-', + '-C', path, + '--', + '.', + run.Raw('>'), remote_temp_path + ]) + self.run(args=args) + if sudo: + self.chmod(remote_temp_path, '0666') + self._sftp_get_file(remote_temp_path, to_path) + self.remove(remote_temp_path) + + def get_tar_stream(self, path, sudo=False, compress=True): + """ + Tar-compress a remote directory and return the RemoteProcess + for streaming + """ + args = [] + if sudo: + args.append('sudo') + args.extend([ + 'tar', + 'cz' if compress else 'c', + '-f', '-', + '-C', path, + '--', + '.', + ]) + return self.run(args=args, wait=False, stdout=run.PIPE) + + @property + def host_key(self): + if not self._host_key: + trans = self.ssh.get_transport() + key = trans.get_remote_server_key() + self._host_key = ' '.join((key.get_name(), key.get_base64())) + return self._host_key + + @property + def inventory_info(self): + node = dict() + node['name'] = self.hostname + node['user'] = self.user + node['arch'] = self.arch + node['os_type'] = self.os.name + node['os_version'] = 
'.'.join(self.os.version.split('.')[:2]) + node['ssh_pub_key'] = self.host_key + node['up'] = True + return node + + @property + def console(self): + if not self._console: + self._console = getRemoteConsole(self.name) + return self._console + + @property + def is_vm(self): + if not hasattr(self, '_is_vm'): + self._is_vm = teuthology.lock.query.is_vm(self.name) + return self._is_vm + + @property + def is_container(self): + if not hasattr(self, '_is_container'): + self._is_container = not bool(self.run( + args="test -f /run/.containerenv -o -f /.dockerenv", + check_status=False, + ).returncode) + return self._is_container + + @property + def init_system(self): + """ + Which init system does the remote use? + + :returns: 'systemd' or None + """ + if not hasattr(self, '_init_system'): + self._init_system = None + proc = self.run( + args=['which', 'systemctl'], + check_status=False, + ) + if proc.returncode == 0: + self._init_system = 'systemd' + return self._init_system + + def __del__(self): + if self.ssh is not None: + self.ssh.close() + + +def getRemoteConsole(name, ipmiuser=None, ipmipass=None, ipmidomain=None, + timeout=60): + """ + Return either VirtualConsole or PhysicalConsole depending on name. 
+ """ + if teuthology.lock.query.is_vm(name): + try: + return console.VirtualConsole(name) + except Exception: + return None + return console.PhysicalConsole( + name, ipmiuser, ipmipass, ipmidomain, timeout) diff --git a/teuthology/orchestra/run.py b/teuthology/orchestra/run.py new file mode 100644 index 000000000..7f6fdb240 --- /dev/null +++ b/teuthology/orchestra/run.py @@ -0,0 +1,485 @@ +""" +Paramiko run support +""" + +import io + +from paramiko import ChannelFile + +import gevent +import gevent.event +import socket +import shlex +import logging +import shutil + +from teuthology.contextutil import safe_while +from teuthology.exceptions import (CommandCrashedError, CommandFailedError, + ConnectionLostError) + +log = logging.getLogger(__name__) + + +class RemoteProcess(object): + """ + An object to begin and monitor execution of a process on a remote host + """ + __slots__ = [ + 'client', 'args', 'check_status', 'command', 'hostname', + 'stdin', 'stdout', 'stderr', + '_stdin_buf', '_stdout_buf', '_stderr_buf', + 'returncode', 'exitstatus', 'timeout', + 'greenlets', + '_wait', 'logger', + # for orchestra.remote.Remote to place a backreference + 'remote', + 'label', + ] + + deadlock_warning = "Using PIPE for %s without wait=False would deadlock" + + def __init__(self, client, args, check_status=True, hostname=None, + label=None, timeout=None, wait=True, logger=None, cwd=None): + """ + Create the object. Does not initiate command execution. + + :param client: paramiko.SSHConnection to run the command with + :param args: Command to run. + :type args: String or list of strings + :param check_status: Whether to raise CommandFailedError on non-zero + exit status, and . Defaults to True. All signals + and connection loss are made to look like SIGHUP. + :param hostname: Name of remote host (optional) + :param label: Can be used to label or describe what the + command is doing. 
+ :param timeout: timeout value for arg that is passed to + exec_command of paramiko + :param wait: Whether self.wait() will be called automatically + :param logger: Alternative logger to use (optional) + :param cwd: Directory in which the command will be executed + (optional) + """ + self.client = client + self.args = args + if isinstance(args, list): + self.command = quote(args) + else: + self.command = args + + if cwd: + self.command = '(cd {cwd} && exec {cmd})'.format( + cwd=cwd, cmd=self.command) + + self.check_status = check_status + self.label = label + if timeout: + self.timeout = timeout + if hostname: + self.hostname = hostname + else: + (self.hostname, port) = client.get_transport().getpeername()[0:2] + + self.greenlets = [] + self.stdin, self.stdout, self.stderr = (None, None, None) + self.returncode = self.exitstatus = None + self._wait = wait + self.logger = logger or log + + def execute(self): + """ + Execute remote command + """ + for line in self.command.split('\n'): + log.getChild(self.hostname).debug('%s> %s' % (self.label or '', line)) + + if hasattr(self, 'timeout'): + (self._stdin_buf, self._stdout_buf, self._stderr_buf) = \ + self.client.exec_command(self.command, timeout=self.timeout) + else: + (self._stdin_buf, self._stdout_buf, self._stderr_buf) = \ + self.client.exec_command(self.command) + (self.stdin, self.stdout, self.stderr) = \ + (self._stdin_buf, self._stdout_buf, self._stderr_buf) + + def add_greenlet(self, greenlet): + self.greenlets.append(greenlet) + + def setup_stdin(self, stream_obj): + self.stdin = KludgeFile(wrapped=self.stdin) + if stream_obj is not PIPE: + greenlet = gevent.spawn(copy_and_close, stream_obj, self.stdin) + self.add_greenlet(greenlet) + self.stdin = None + elif self._wait: + # FIXME: Is this actually true? 
+ raise RuntimeError(self.deadlock_warning % 'stdin') + + def setup_output_stream(self, stream_obj, stream_name, quiet=False): + if stream_obj is not PIPE: + # Log the stream + host_log = self.logger.getChild(self.hostname) + stream_log = host_log.getChild(stream_name) + self.add_greenlet( + gevent.spawn( + copy_file_to, + getattr(self, stream_name), + stream_log, + stream_obj, + quiet, + ) + ) + setattr(self, stream_name, stream_obj) + elif self._wait: + # FIXME: Is this actually true? + raise RuntimeError(self.deadlock_warning % stream_name) + + def wait(self): + """ + Block until remote process finishes. + + :returns: self.returncode + """ + + status = self._get_exitstatus() + if status != 0: + log.debug("got remote process result: {}".format(status)) + for greenlet in self.greenlets: + try: + greenlet.get(block=True,timeout=60) + except gevent.Timeout: + log.debug("timed out waiting; will kill: {}".format(greenlet)) + greenlet.kill(block=False) + for stream in ('stdout', 'stderr'): + if hasattr(self, stream): + stream_obj = getattr(self, stream) + # Despite ChannelFile having a seek() method, it raises + # "IOError: File does not support seeking." 
+ if hasattr(stream_obj, 'seek') and \ + not isinstance(stream_obj, ChannelFile): + stream_obj.seek(0) + + self._raise_for_status() + return status + + def _raise_for_status(self): + if self.returncode is None: + self._get_exitstatus() + if self.check_status: + if self.returncode in (None, -1): + # command either died due to a signal, or the connection + # was lost + transport = self.client.get_transport() + if transport is None or not transport.is_active(): + # look like we lost the connection + raise ConnectionLostError(command=self.command, + node=self.hostname) + + # connection seems healthy still, assuming it was a + # signal; sadly SSH does not tell us which signal + raise CommandCrashedError(command=self.command) + if self.returncode != 0: + raise CommandFailedError( + command=self.command, exitstatus=self.returncode, + node=self.hostname, label=self.label + ) + + def _get_exitstatus(self): + """ + :returns: the remote command's exit status (return code). Note that + if the connection is lost, or if the process was killed by a + signal, this returns None instead of paramiko's -1. 
+ """ + status = self._stdout_buf.channel.recv_exit_status() + self.exitstatus = self.returncode = status + if status == -1: + status = None + return status + + @property + def finished(self): + gevent.wait(self.greenlets, timeout=0.1) + ready = self._stdout_buf.channel.exit_status_ready() + if ready: + self._get_exitstatus() + return ready + + def poll(self): + """ + :returns: self.returncode if the process is finished; else None + """ + if self.finished: + self._raise_for_status() + return self.returncode + return None + + def __repr__(self): + return '{classname}(client={client!r}, args={args!r}, check_status={check}, hostname={name!r})'.format( # noqa + classname=self.__class__.__name__, + client=self.client, + args=self.args, + check=self.check_status, + name=self.hostname, + ) + + +class Raw(object): + + """ + Raw objects are passed to remote objects and are not processed locally. + """ + def __init__(self, value): + self.value = value + + def __repr__(self): + return '{cls}({value!r})'.format( + cls=self.__class__.__name__, + value=self.value, + ) + + def __eq__(self, value): + return self.value == value + + +def quote(args): + """ + Internal quote wrapper. None arguments are not allowed. + + :param args: list of str or Raw objects + + :raises: :class:`RuntimeError`: if one of the args is None. + """ + def _quote(args): + """ + Handle quoted string, testing for raw charaters. 
+ """ + for i, a in enumerate(args): + if isinstance(a, Raw): + yield a.value + elif a is None: + raise RuntimeError(f"Argument at position {i} is None: {args}") + else: + yield shlex.quote(a) + if isinstance(args, list): + return ' '.join(_quote(args)) + else: + return args + + +def copy_to_log(f, logger, loglevel=logging.INFO, capture=None, quiet=False): + """ + Copy line by line from file in f to the log from logger + + :param f: source stream object + :param logger: the destination logger object + :param loglevel: the level of logging data + :param capture: an optional stream object for data copy + :param quiet: suppress `logger` usage if True, this is useful only + in combination with `capture`, defaults False + """ + # Work-around for http://tracker.ceph.com/issues/8313 + if isinstance(f, ChannelFile): + f._flags += ChannelFile.FLAG_BINARY + for line in f: + if capture: + if isinstance(capture, io.StringIO): + if isinstance(line, str): + capture.write(line) + else: + capture.write(line.decode('utf-8', 'replace')) + elif isinstance(capture, io.BytesIO): + if isinstance(line, str): + capture.write(line.encode()) + else: + capture.write(line) + line = line.rstrip() + # Second part of work-around for http://tracker.ceph.com/issues/8313 + if quiet: + continue + try: + if isinstance(line, bytes): + line = line.decode('utf-8', 'replace') + logger.log(loglevel, line) + except (UnicodeDecodeError, UnicodeEncodeError): + logger.exception("Encountered unprintable line in command output") + + +def copy_and_close(src, fdst): + """ + copyfileobj call wrapper. + """ + if src is not None: + if isinstance(src, bytes): + src = io.BytesIO(src) + elif isinstance(src, str): + src = io.StringIO(src) + shutil.copyfileobj(src, fdst) + fdst.close() + + +def copy_file_to(src, logger, stream=None, quiet=False): + """ + Copy file + :param src: file to be copied. + :param logger: the logger object + :param stream: an optional file-like object which will receive + a copy of src. 
+ :param quiet: disable logger usage if True, useful in combination + with `stream` parameter, defaults False. + """ + copy_to_log(src, logger, capture=stream, quiet=quiet) + +def spawn_asyncresult(fn, *args, **kwargs): + """ + Spawn a Greenlet and pass it's results to an AsyncResult. + + This function is useful to shuffle data from a Greenlet to + AsyncResult, which then again is useful because any Greenlets that + raise exceptions will cause tracebacks to be shown on stderr by + gevent, even when ``.link_exception`` has been called. Using an + AsyncResult avoids this. + """ + r = gevent.event.AsyncResult() + + def wrapper(): + """ + Internal wrapper. + """ + try: + value = fn(*args, **kwargs) + except Exception as e: + r.set_exception(e) + else: + r.set(value) + gevent.spawn(wrapper) + + return r + + +class Sentinel(object): + + """ + Sentinel -- used to define PIPE file-like object. + """ + def __init__(self, name): + self.name = name + + def __str__(self): + return self.name + +PIPE = Sentinel('PIPE') + + +class KludgeFile(object): + + """ + Wrap Paramiko's ChannelFile in a way that lets ``f.close()`` + actually cause an EOF for the remote command. + """ + def __init__(self, wrapped): + self._wrapped = wrapped + + def __getattr__(self, name): + return getattr(self._wrapped, name) + + def close(self): + """ + Close and shutdown. + """ + self._wrapped.close() + self._wrapped.channel.shutdown_write() + + +def run( + client, args, + stdin=None, stdout=None, stderr=None, + logger=None, + check_status=True, + wait=True, + name=None, + label=None, + quiet=False, + timeout=None, + cwd=None, + # omit_sudo is used by vstart_runner.py + omit_sudo=False +): + """ + Run a command remotely. If any of 'args' contains shell metacharacters + that you want to pass unquoted, pass it as an instance of Raw(); otherwise + it will be quoted with shlex.quote() (single quote, and single quotes + enclosed in double quotes). 
+ + :param client: SSHConnection to run the command with + :param args: command to run + :type args: list of string + :param stdin: Standard input to send; either a string, a file-like object, + None, or `PIPE`. `PIPE` means caller is responsible for + closing stdin, or command may never exit. + :param stdout: What to do with standard output. Either a file-like object, + a `logging.Logger`, `PIPE`, or `None` for copying to default + log. `PIPE` means caller is responsible for reading, or + command may never exit. + :param stderr: What to do with standard error. See `stdout`. + :param logger: If logging, write stdout/stderr to "out" and "err" children + of this logger. Defaults to logger named after this module. + :param check_status: Whether to raise CommandFailedError on non-zero exit + status, and . Defaults to True. All signals and + connection loss are made to look like SIGHUP. + :param wait: Whether to wait for process to exit. If False, returned + ``r.exitstatus`` s a `gevent.event.AsyncResult`, and the + actual status is available via ``.get()``. + :param name: Human readable name (probably hostname) of the destination + host + :param label: Can be used to label or describe what the command is doing. + :param quiet: Do not log command's stdout and stderr, defaults False. + :param timeout: timeout value for args to complete on remote channel of + paramiko + :param cwd: Directory in which the command should be executed. 
+ """ + try: + transport = client.get_transport() + if transport: + (host, port) = transport.getpeername()[0:2] + else: + raise ConnectionLostError(command=quote(args), node=name) + except socket.error: + raise ConnectionLostError(command=quote(args), node=name) + + if name is None: + name = host + + if timeout: + log.info("Running command with timeout %d", timeout) + r = RemoteProcess(client, args, check_status=check_status, hostname=name, + label=label, timeout=timeout, wait=wait, logger=logger, + cwd=cwd) + r.execute() + r.setup_stdin(stdin) + r.setup_output_stream(stderr, 'stderr', quiet) + r.setup_output_stream(stdout, 'stdout', quiet) + if wait: + r.wait() + return r + + +def wait(processes, timeout=None): + """ + Wait for all given processes to exit. + + Raise if any one of them fails. + + Optionally, timeout after 'timeout' seconds. + """ + if timeout: + log.info("waiting for %d", timeout) + if timeout and timeout > 0: + with safe_while(tries=(timeout // 6)) as check_time: + not_ready = list(processes) + while len(not_ready) > 0: + check_time() + for proc in list(not_ready): + if proc.finished: + not_ready.remove(proc) + + for proc in processes: + proc.wait() diff --git a/teuthology/orchestra/test/__init__.py b/teuthology/orchestra/test/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/teuthology/orchestra/test/files/daemon-systemdstate-pid-ps-ef.output b/teuthology/orchestra/test/files/daemon-systemdstate-pid-ps-ef.output new file mode 100644 index 000000000..ddddf571c --- /dev/null +++ b/teuthology/orchestra/test/files/daemon-systemdstate-pid-ps-ef.output @@ -0,0 +1,5 @@ +ceph 658 1 0 Jun08 ? 00:07:43 /usr/bin/ceph-mgr -f --cluster ceph --id host1 --setuser ceph --setgroup ceph +ceph 1634 1 0 Jun08 ? 00:02:17 /usr/bin/ceph-mds -f --cluster ceph --id host1 --setuser ceph --setgroup ceph +ceph 31555 1 0 Jun08 ? 01:13:50 /usr/bin/ceph-mon -f --cluster ceph --id host1 --setuser ceph --setgroup ceph +ceph 31765 1 0 Jun08 ? 
00:48:42 /usr/bin/radosgw -f --cluster ceph --name client.rgw.host1.rgw0 --setuser ceph --setgroup ceph +ceph 97427 1 0 Jun17 ? 00:41:39 /usr/bin/ceph-osd -f --cluster ceph --id 0 --setuser ceph --setgroup ceph \ No newline at end of file diff --git a/teuthology/orchestra/test/integration/__init__.py b/teuthology/orchestra/test/integration/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/teuthology/orchestra/test/integration/test_integration.py b/teuthology/orchestra/test/integration/test_integration.py new file mode 100644 index 000000000..5adeb6dcc --- /dev/null +++ b/teuthology/orchestra/test/integration/test_integration.py @@ -0,0 +1,94 @@ +from teuthology.orchestra import monkey +monkey.patch_all() + +from io import StringIO + +import os +from teuthology.orchestra import connection, remote, run +from teuthology.orchestra.test.util import assert_raises +from teuthology.exceptions import CommandCrashedError, ConnectionLostError + +from pytest import skip + +HOST = None + + +class TestIntegration(): + def setup_method(self): + try: + host = os.environ['ORCHESTRA_TEST_HOST'] + except KeyError: + skip('To run integration tests, set environment ' + + 'variable ORCHESTRA_TEST_HOST to user@host to use.') + global HOST + HOST = host + + def test_crash(self): + ssh = connection.connect(HOST) + e = assert_raises( + CommandCrashedError, + run.run, + client=ssh, + args=['sh', '-c', 'kill -ABRT $$'], + ) + assert e.command == "sh -c 'kill -ABRT $$'" + assert str(e) == "Command crashed: \"sh -c 'kill -ABRT $$'\"" + + def test_lost(self): + ssh = connection.connect(HOST) + e = assert_raises( + ConnectionLostError, + run.run, + client=ssh, + args=['sh', '-c', 'kill -ABRT $PPID'], + name=HOST, + ) + assert e.command == "sh -c 'kill -ABRT $PPID'" + assert str(e) == \ + "SSH connection to {host} was lost: ".format(host=HOST) + \ + "\"sh -c 'kill -ABRT $PPID'\"" + + def test_pipe(self): + ssh = connection.connect(HOST) + r = run.run( + client=ssh, + 
args=['cat'], + stdin=run.PIPE, + stdout=StringIO(), + wait=False, + ) + assert r.stdout.getvalue() == '' + r.stdin.write('foo\n') + r.stdin.write('bar\n') + r.stdin.close() + + r.wait() + got = r.exitstatus + assert got == 0 + assert r.stdout.getvalue() == 'foo\nbar\n' + + def test_and(self): + ssh = connection.connect(HOST) + r = run.run( + client=ssh, + args=['true', run.Raw('&&'), 'echo', 'yup'], + stdout=StringIO(), + ) + assert r.stdout.getvalue() == 'yup\n' + + def test_os(self): + rem = remote.Remote(HOST) + assert rem.os.name + assert rem.os.version + + def test_17102(self, caplog): + # http://tracker.ceph.com/issues/17102 + rem = remote.Remote(HOST) + interval = 3 + rem.run(args="echo before; sleep %s; echo after" % interval) + for record in caplog.records: + if record.msg == 'before': + before_time = record.created + elif record.msg == 'after': + after_time = record.created + assert int(round(after_time - before_time)) == interval diff --git a/teuthology/orchestra/test/test_cluster.py b/teuthology/orchestra/test/test_cluster.py new file mode 100644 index 000000000..27bef8b83 --- /dev/null +++ b/teuthology/orchestra/test/test_cluster.py @@ -0,0 +1,232 @@ +import pytest + +from mock import patch, Mock + +from teuthology.orchestra import cluster, remote, run + + +class TestCluster(object): + def test_init_empty(self): + c = cluster.Cluster() + assert c.remotes == {} + + def test_init(self): + r1 = Mock() + r2 = Mock() + c = cluster.Cluster( + remotes=[ + (r1, ['foo', 'bar']), + (r2, ['baz']), + ], + ) + r3 = Mock() + c.add(r3, ['xyzzy', 'thud', 'foo']) + assert c.remotes == { + r1: ['foo', 'bar'], + r2: ['baz'], + r3: ['xyzzy', 'thud', 'foo'], + } + + def test_repr(self): + r1 = remote.Remote('r1', ssh=Mock()) + r2 = remote.Remote('r2', ssh=Mock()) + c = cluster.Cluster( + remotes=[ + (r1, ['foo', 'bar']), + (r2, ['baz']), + ], + ) + assert repr(c) == \ + "Cluster(remotes=[[Remote(name='r1'), ['foo', 'bar']], " \ + "[Remote(name='r2'), ['baz']]])" + + def 
test_str(self): + r1 = remote.Remote('r1', ssh=Mock()) + r2 = remote.Remote('r2', ssh=Mock()) + c = cluster.Cluster( + remotes=[ + (r1, ['foo', 'bar']), + (r2, ['baz']), + ], + ) + assert str(c) == "r1[foo,bar] r2[baz]" + + def test_run_all(self): + r1 = Mock(spec=remote.Remote) + r1.configure_mock(name='r1') + ret1 = Mock(spec=run.RemoteProcess) + r1.run.return_value = ret1 + r2 = Mock(spec=remote.Remote) + r2.configure_mock(name='r2') + ret2 = Mock(spec=run.RemoteProcess) + r2.run.return_value = ret2 + c = cluster.Cluster( + remotes=[ + (r1, ['foo', 'bar']), + (r2, ['baz']), + ], + ) + got = c.run(args=['test']) + r1.run.assert_called_once_with(args=['test'], wait=True) + r2.run.assert_called_once_with(args=['test'], wait=True) + assert len(got) == 2 + assert got, [ret1 == ret2] + # check identity not equality + assert got[0] is ret1 + assert got[1] is ret2 + + def test_only_one(self): + r1 = Mock() + r2 = Mock() + r3 = Mock() + c = cluster.Cluster( + remotes=[ + (r1, ['foo', 'bar']), + (r2, ['bar']), + (r3, ['foo']), + ], + ) + c_foo = c.only('foo') + assert c_foo.remotes == {r1: ['foo', 'bar'], r3: ['foo']} + + def test_only_two(self): + r1 = Mock() + r2 = Mock() + r3 = Mock() + c = cluster.Cluster( + remotes=[ + (r1, ['foo', 'bar']), + (r2, ['bar']), + (r3, ['foo']), + ], + ) + c_both = c.only('foo', 'bar') + assert c_both.remotes, {r1: ['foo' == 'bar']} + + def test_only_none(self): + r1 = Mock() + r2 = Mock() + r3 = Mock() + c = cluster.Cluster( + remotes=[ + (r1, ['foo', 'bar']), + (r2, ['bar']), + (r3, ['foo']), + ], + ) + c_none = c.only('impossible') + assert c_none.remotes == {} + + def test_only_match(self): + r1 = Mock() + r2 = Mock() + r3 = Mock() + c = cluster.Cluster( + remotes=[ + (r1, ['foo', 'bar']), + (r2, ['bar']), + (r3, ['foo']), + ], + ) + c_foo = c.only('foo', lambda role: role.startswith('b')) + assert c_foo.remotes, {r1: ['foo' == 'bar']} + + def test_exclude_one(self): + r1 = Mock() + r2 = Mock() + r3 = Mock() + c = cluster.Cluster( + 
remotes=[ + (r1, ['foo', 'bar']), + (r2, ['bar']), + (r3, ['foo']), + ], + ) + c_foo = c.exclude('foo') + assert c_foo.remotes == {r2: ['bar']} + + def test_exclude_two(self): + r1 = Mock() + r2 = Mock() + r3 = Mock() + c = cluster.Cluster( + remotes=[ + (r1, ['foo', 'bar']), + (r2, ['bar']), + (r3, ['foo']), + ], + ) + c_both = c.exclude('foo', 'bar') + assert c_both.remotes == {r2: ['bar'], r3: ['foo']} + + def test_exclude_none(self): + r1 = Mock() + r2 = Mock() + r3 = Mock() + c = cluster.Cluster( + remotes=[ + (r1, ['foo', 'bar']), + (r2, ['bar']), + (r3, ['foo']), + ], + ) + c_none = c.exclude('impossible') + assert c_none.remotes == {r1: ['foo', 'bar'], r2: ['bar'], r3: ['foo']} + + def test_exclude_match(self): + r1 = Mock() + r2 = Mock() + r3 = Mock() + c = cluster.Cluster( + remotes=[ + (r1, ['foo', 'bar']), + (r2, ['bar']), + (r3, ['foo']), + ], + ) + c_foo = c.exclude('foo', lambda role: role.startswith('b')) + assert c_foo.remotes == {r2: ['bar'], r3: ['foo']} + + def test_filter(self): + r1 = Mock(_name='r1') + r2 = Mock(_name='r2') + def func(r): + return r._name == "r1" + c = cluster.Cluster(remotes=[ + (r1, ['foo']), + (r2, ['bar']), + ]) + assert c.filter(func).remotes == { + r1: ['foo'] + } + + +class TestWriteFile(object): + """ Tests for cluster.write_file """ + def setup_method(self): + self.r1 = remote.Remote('r1', ssh=Mock()) + self.c = cluster.Cluster( + remotes=[ + (self.r1, ['foo', 'bar']), + ], + ) + + @patch("teuthology.orchestra.remote.RemoteShell.write_file") + def test_write_file(self, m_write_file): + self.c.write_file("filename", "content") + m_write_file.assert_called_with("filename", "content") + + @patch("teuthology.orchestra.remote.RemoteShell.write_file") + def test_fails_with_invalid_perms(self, m_write_file): + with pytest.raises(ValueError): + self.c.write_file("filename", "content", sudo=False, perms="invalid") + + @patch("teuthology.orchestra.remote.RemoteShell.write_file") + def test_fails_with_invalid_owner(self, 
m_write_file): + with pytest.raises(ValueError): + self.c.write_file("filename", "content", sudo=False, owner="invalid") + + @patch("teuthology.orchestra.remote.RemoteShell.write_file") + def test_with_sudo(self, m_write_file): + self.c.write_file("filename", "content", sudo=True) + m_write_file.assert_called_with("filename", "content", sudo=True, owner=None, mode=None) diff --git a/teuthology/orchestra/test/test_connection.py b/teuthology/orchestra/test/test_connection.py new file mode 100644 index 000000000..b9f7296aa --- /dev/null +++ b/teuthology/orchestra/test/test_connection.py @@ -0,0 +1,120 @@ +from mock import patch, Mock + +from teuthology import config +from teuthology.orchestra import connection +from teuthology.orchestra.test.util import assert_raises + + +class TestConnection(object): + def setup_method(self): + self.start_patchers() + + def teardown_method(self): + self.stop_patchers() + + def start_patchers(self): + self.patcher_sleep = patch( + 'time.sleep', + ) + self.patcher_sleep.start() + self.m_ssh = Mock() + self.patcher_ssh = patch( + 'teuthology.orchestra.connection.paramiko.SSHClient', + self.m_ssh, + ) + self.patcher_ssh.start() + + def stop_patchers(self): + self.patcher_ssh.stop() + self.patcher_sleep.stop() + + def clear_config(self): + config.config.teuthology_yaml = '' + config.config.load() + config.config.ssh_key = None + + def test_split_user_just_host(self): + got = connection.split_user('somehost.example.com') + assert got == (None, 'somehost.example.com') + + def test_split_user_both(self): + got = connection.split_user('jdoe@somehost.example.com') + assert got == ('jdoe', 'somehost.example.com') + + def test_split_user_empty_user(self): + s = '@somehost.example.com' + e = assert_raises(AssertionError, connection.split_user, s) + assert str(e) == 'Bad input to split_user: {s!r}'.format(s=s) + + def test_connect(self): + self.clear_config() + config.config.verify_host_keys = True + m_ssh_instance = self.m_ssh.return_value = 
Mock(); + m_transport = Mock() + m_ssh_instance.get_transport.return_value = m_transport + got = connection.connect( + 'jdoe@orchestra.test.newdream.net.invalid', + _SSHClient=self.m_ssh, + ) + self.m_ssh.assert_called_once() + m_ssh_instance.set_missing_host_key_policy.assert_called_once() + m_ssh_instance.load_system_host_keys.assert_called_once_with() + m_ssh_instance.connect.assert_called_once_with( + hostname='orchestra.test.newdream.net.invalid', + username='jdoe', + timeout=60, + ) + m_transport.set_keepalive.assert_called_once_with(False) + assert got is m_ssh_instance + + def test_connect_no_verify_host_keys(self): + self.clear_config() + config.config.verify_host_keys = False + m_ssh_instance = self.m_ssh.return_value = Mock(); + m_transport = Mock() + m_ssh_instance.get_transport.return_value = m_transport + got = connection.connect( + 'jdoe@orchestra.test.newdream.net.invalid', + _SSHClient=self.m_ssh, + ) + self.m_ssh.assert_called_once() + m_ssh_instance.set_missing_host_key_policy.assert_called_once() + assert not m_ssh_instance.load_system_host_keys.called + m_ssh_instance.connect.assert_called_once_with( + hostname='orchestra.test.newdream.net.invalid', + username='jdoe', + timeout=60, + ) + m_transport.set_keepalive.assert_called_once_with(False) + assert got is m_ssh_instance + + def test_connect_override_hostkeys(self): + self.clear_config() + m_ssh_instance = self.m_ssh.return_value = Mock(); + m_transport = Mock() + m_ssh_instance.get_transport.return_value = m_transport + m_host_keys = Mock() + m_ssh_instance.get_host_keys.return_value = m_host_keys + m_create_key = Mock() + m_create_key.return_value = "frobnitz" + got = connection.connect( + 'jdoe@orchestra.test.newdream.net.invalid', + host_key='ssh-rsa testkey', + _SSHClient=self.m_ssh, + _create_key=m_create_key, + ) + self.m_ssh.assert_called_once() + m_ssh_instance.get_host_keys.assert_called_once() + m_host_keys.add.assert_called_once_with( + 
hostname='orchestra.test.newdream.net.invalid', + keytype='ssh-rsa', + key='frobnitz', + ) + m_create_key.assert_called_once_with('ssh-rsa', 'testkey') + m_ssh_instance.connect.assert_called_once_with( + hostname='orchestra.test.newdream.net.invalid', + username='jdoe', + timeout=60, + ) + m_transport.set_keepalive.assert_called_once_with(False) + assert got is m_ssh_instance diff --git a/teuthology/orchestra/test/test_console.py b/teuthology/orchestra/test/test_console.py new file mode 100644 index 000000000..fe0399b48 --- /dev/null +++ b/teuthology/orchestra/test/test_console.py @@ -0,0 +1,217 @@ +from mock import patch + +from teuthology.config import config as teuth_config + +from teuthology.orchestra import console + + +class TestConsole(object): + pass + + +class TestPhysicalConsole(TestConsole): + klass = console.PhysicalConsole + ipmi_cmd_templ = 'ipmitool -H {h}.{d} -I lanplus -U {u} -P {p} {c}' + conserver_cmd_templ = 'console -M {m} -p {p} {mode} {h}' + + def setup_method(self): + self.hostname = 'host' + teuth_config.ipmi_domain = 'ipmi_domain' + teuth_config.ipmi_user = 'ipmi_user' + teuth_config.ipmi_password = 'ipmi_pass' + teuth_config.conserver_master = 'conserver_master' + teuth_config.conserver_port = 3109 + teuth_config.use_conserver = True + + def test_has_ipmi_creds(self): + cons = self.klass(self.hostname) + assert cons.has_ipmi_credentials is True + teuth_config.ipmi_domain = None + cons = self.klass(self.hostname) + assert cons.has_ipmi_credentials is False + + def test_console_command_conserver(self): + cons = self.klass( + self.hostname, + teuth_config.ipmi_user, + teuth_config.ipmi_password, + teuth_config.ipmi_domain, + ) + cons.has_conserver = True + console_cmd = cons._console_command() + assert console_cmd == self.conserver_cmd_templ.format( + m=teuth_config.conserver_master, + p=teuth_config.conserver_port, + mode='-s', + h=self.hostname, + ) + console_cmd = cons._console_command(readonly=False) + assert console_cmd == 
self.conserver_cmd_templ.format( + m=teuth_config.conserver_master, + p=teuth_config.conserver_port, + mode='-f', + h=self.hostname, + ) + + def test_console_command_ipmi(self): + teuth_config.conserver_master = None + cons = self.klass( + self.hostname, + teuth_config.ipmi_user, + teuth_config.ipmi_password, + teuth_config.ipmi_domain, + ) + sol_cmd = cons._console_command() + assert sol_cmd == self.ipmi_cmd_templ.format( + h=self.hostname, + d=teuth_config.ipmi_domain, + u=teuth_config.ipmi_user, + p=teuth_config.ipmi_password, + c='sol activate', + ) + + def test_ipmi_command_ipmi(self): + cons = self.klass( + self.hostname, + teuth_config.ipmi_user, + teuth_config.ipmi_password, + teuth_config.ipmi_domain, + ) + pc_cmd = cons._ipmi_command('power cycle') + assert pc_cmd == self.ipmi_cmd_templ.format( + h=self.hostname, + d=teuth_config.ipmi_domain, + u=teuth_config.ipmi_user, + p=teuth_config.ipmi_password, + c='power cycle', + ) + + def test_spawn_log_conserver(self): + with patch( + 'teuthology.orchestra.console.psutil.subprocess.Popen', + autospec=True, + ) as m_popen: + m_popen.return_value.pid = 42 + m_popen.return_value.returncode = 0 + m_popen.return_value.wait.return_value = 0 + cons = self.klass(self.hostname) + assert cons.has_conserver is True + m_popen.reset_mock() + m_popen.return_value.poll.return_value = None + cons.spawn_sol_log('/fake/path') + assert m_popen.call_count == 1 + call_args = m_popen.call_args_list[0][0][0] + assert any( + [teuth_config.conserver_master in arg for arg in call_args] + ) + + def test_spawn_log_ipmi(self): + with patch( + 'teuthology.orchestra.console.psutil.subprocess.Popen', + autospec=True, + ) as m_popen: + m_popen.return_value.pid = 42 + m_popen.return_value.returncode = 1 + m_popen.return_value.wait.return_value = 1 + cons = self.klass(self.hostname) + assert cons.has_conserver is False + m_popen.reset_mock() + m_popen.return_value.poll.return_value = 1 + cons.spawn_sol_log('/fake/path') + assert 
m_popen.call_count == 1 + call_args = m_popen.call_args_list[0][0][0] + assert any( + ['ipmitool' in arg for arg in call_args] + ) + + def test_spawn_log_fallback(self): + with patch( + 'teuthology.orchestra.console.psutil.subprocess.Popen', + autospec=True, + ) as m_popen: + m_popen.return_value.pid = 42 + m_popen.return_value.returncode = 0 + m_popen.return_value.wait.return_value = 0 + cons = self.klass(self.hostname) + assert cons.has_conserver is True + m_popen.reset_mock() + m_popen.return_value.poll.return_value = 1 + cons.spawn_sol_log('/fake/path') + assert cons.has_conserver is False + assert m_popen.call_count == 2 + call_args = m_popen.call_args_list[1][0][0] + assert any( + ['ipmitool' in arg for arg in call_args] + ) + + def test_get_console_conserver(self): + with patch( + 'teuthology.orchestra.console.psutil.subprocess.Popen', + autospec=True, + ) as m_popen: + m_popen.return_value.pid = 42 + m_popen.return_value.returncode = 0 + m_popen.return_value.wait.return_value = 0 + cons = self.klass(self.hostname) + assert cons.has_conserver is True + with patch( + 'teuthology.orchestra.console.pexpect.spawn', + autospec=True, + ) as m_spawn: + cons._get_console() + assert m_spawn.call_count == 1 + assert teuth_config.conserver_master in \ + m_spawn.call_args_list[0][0][0] + + def test_get_console_ipmitool(self): + with patch( + 'teuthology.orchestra.console.psutil.subprocess.Popen', + autospec=True, + ) as m_popen: + m_popen.return_value.pid = 42 + m_popen.return_value.returncode = 0 + m_popen.return_value.wait.return_value = 0 + cons = self.klass(self.hostname) + assert cons.has_conserver is True + with patch( + 'teuthology.orchestra.console.pexpect.spawn', + autospec=True, + ) as m_spawn: + cons.has_conserver = False + cons._get_console() + assert m_spawn.call_count == 1 + assert 'ipmitool' in m_spawn.call_args_list[0][0][0] + + def test_get_console_fallback(self): + with patch( + 'teuthology.orchestra.console.psutil.subprocess.Popen', + autospec=True, + 
) as m_popen: + m_popen.return_value.pid = 42 + m_popen.return_value.returncode = 0 + m_popen.return_value.wait.return_value = 0 + cons = self.klass(self.hostname) + assert cons.has_conserver is True + with patch( + 'teuthology.orchestra.console.pexpect.spawn', + autospec=True, + ) as m_spawn: + cons.has_conserver = True + m_spawn.return_value.isalive.return_value = False + cons._get_console() + assert m_spawn.return_value.isalive.call_count == 1 + assert m_spawn.call_count == 2 + assert cons.has_conserver is False + assert 'ipmitool' in m_spawn.call_args_list[1][0][0] + + def test_disable_conserver(self): + with patch( + 'teuthology.orchestra.console.psutil.subprocess.Popen', + autospec=True, + ) as m_popen: + m_popen.return_value.pid = 42 + m_popen.return_value.returncode = 0 + m_popen.return_value.wait.return_value = 0 + teuth_config.use_conserver = False + cons = self.klass(self.hostname) + assert cons.has_conserver is False diff --git a/teuthology/orchestra/test/test_opsys.py b/teuthology/orchestra/test/test_opsys.py new file mode 100644 index 000000000..fed0e7025 --- /dev/null +++ b/teuthology/orchestra/test/test_opsys.py @@ -0,0 +1,428 @@ +from textwrap import dedent +from teuthology.orchestra.opsys import OS +import pytest + + +class TestOS(object): + str_centos_9_os_release = dedent(""" + NAME="CentOS Stream" + VERSION="9" + ID="centos" + ID_LIKE="rhel fedora" + VERSION_ID="9" + PLATFORM_ID="platform:el9" + PRETTY_NAME="CentOS Stream 9" + ANSI_COLOR="0;31" + LOGO="fedora-logo-icon" + CPE_NAME="cpe:/o:centos:centos:9" + HOME_URL="https://centos.org/" + BUG_REPORT_URL="https://issues.redhat.com/" + REDHAT_SUPPORT_PRODUCT="Red Hat Enterprise Linux 9" + REDHAT_SUPPORT_PRODUCT_VERSION="CentOS Stream" + """) + + str_centos_7_os_release = dedent(""" + NAME="CentOS Linux" + VERSION="7 (Core)" + ID="centos" + ID_LIKE="rhel fedora" + VERSION_ID="7" + PRETTY_NAME="CentOS Linux 7 (Core)" + ANSI_COLOR="0;31" + CPE_NAME="cpe:/o:centos:centos:7" + 
HOME_URL="https://www.centos.org/" + BUG_REPORT_URL="https://bugs.centos.org/" + """) + + str_centos_7_os_release_newer = dedent(""" + NAME="CentOS Linux" + VERSION="7 (Core)" + ID="centos" + ID_LIKE="rhel fedora" + VERSION_ID="7" + PRETTY_NAME="CentOS Linux 7 (Core)" + ANSI_COLOR="0;31" + CPE_NAME="cpe:/o:centos:centos:7" + HOME_URL="https://www.centos.org/" + BUG_REPORT_URL="https://bugs.centos.org/" + + CENTOS_MANTISBT_PROJECT="CentOS-7" + CENTOS_MANTISBT_PROJECT_VERSION="7" + REDHAT_SUPPORT_PRODUCT="centos" + REDHAT_SUPPORT_PRODUCT_VERSION="7" + """) + + str_debian_7_lsb_release = dedent(""" + Distributor ID: Debian + Description: Debian GNU/Linux 7.1 (wheezy) + Release: 7.1 + Codename: wheezy + """) + + str_debian_7_os_release = dedent(""" + PRETTY_NAME="Debian GNU/Linux 7 (wheezy)" + NAME="Debian GNU/Linux" + VERSION_ID="7" + VERSION="7 (wheezy)" + ID=debian + ANSI_COLOR="1;31" + HOME_URL="http://www.debian.org/" + SUPPORT_URL="http://www.debian.org/support/" + BUG_REPORT_URL="http://bugs.debian.org/" + """) + + str_debian_8_os_release = dedent(""" + PRETTY_NAME="Debian GNU/Linux 8 (jessie)" + NAME="Debian GNU/Linux" + VERSION_ID="8" + VERSION="8 (jessie)" + ID=debian + HOME_URL="http://www.debian.org/" + SUPPORT_URL="http://www.debian.org/support/" + BUG_REPORT_URL="https://bugs.debian.org/" + """) + + str_debian_9_os_release = dedent(""" + PRETTY_NAME="Debian GNU/Linux 9 (stretch)" + NAME="Debian GNU/Linux" + VERSION_ID="9" + VERSION="9 (stretch)" + ID=debian + HOME_URL="https://www.debian.org/" + SUPPORT_URL="https://www.debian.org/support" + BUG_REPORT_URL="https://bugs.debian.org/" + """) + + str_ubuntu_12_04_lsb_release = dedent(""" + Distributor ID: Ubuntu + Description: Ubuntu 12.04.4 LTS + Release: 12.04 + Codename: precise + """) + + str_ubuntu_12_04_os_release = dedent(""" + NAME="Ubuntu" + VERSION="12.04.4 LTS, Precise Pangolin" + ID=ubuntu + ID_LIKE=debian + PRETTY_NAME="Ubuntu precise (12.04.4 LTS)" + VERSION_ID="12.04" + """) + + 
str_ubuntu_14_04_os_release = dedent(""" + NAME="Ubuntu" + VERSION="14.04.4 LTS, Trusty Tahr" + ID=ubuntu + ID_LIKE=debian + PRETTY_NAME="Ubuntu 14.04.4 LTS" + VERSION_ID="14.04" + HOME_URL="http://www.ubuntu.com/" + SUPPORT_URL="http://help.ubuntu.com/" + BUG_REPORT_URL="http://bugs.launchpad.net/ubuntu/" + """) + + str_ubuntu_16_04_os_release = dedent(""" + NAME="Ubuntu" + VERSION="16.04 LTS (Xenial Xerus)" + ID=ubuntu + ID_LIKE=debian + PRETTY_NAME="Ubuntu 16.04 LTS" + VERSION_ID="16.04" + HOME_URL="http://www.ubuntu.com/" + SUPPORT_URL="http://help.ubuntu.com/" + BUG_REPORT_URL="http://bugs.launchpad.net/ubuntu/" + UBUNTU_CODENAME=xenial + """) + + str_ubuntu_18_04_os_release = dedent(""" + NAME="Ubuntu" + VERSION="18.04 LTS (Bionic Beaver)" + ID=ubuntu + ID_LIKE=debian + PRETTY_NAME="Ubuntu 18.04 LTS" + VERSION_ID="18.04" + HOME_URL="https://www.ubuntu.com/" + SUPPORT_URL="https://help.ubuntu.com/" + BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/" + PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy" + VERSION_CODENAME=bionic + UBUNTU_CODENAME=bionic + """) + + str_rhel_6_4_lsb_release = dedent(""" + LSB Version: :base-4.0-amd64:base-4.0-noarch:core-4.0-amd64:core-4.0-noarch:graphics-4.0-amd64:graphics-4.0-noarch:printing-4.0-amd64:printing-4.0-noarch + Distributor ID: RedHatEnterpriseServer + Description: Red Hat Enterprise Linux Server release 6.4 (Santiago) + Release: 6.4 + Codename: Santiago + """) + + str_rhel_7_lsb_release = dedent(""" + LSB Version: :core-4.1-amd64:core-4.1-noarch:cxx-4.1-amd64:cxx-4.1-noarch:desktop-4.1-amd64:desktop-4.1-noarch:languages-4.1-amd64:languages-4.1-noarch:printing-4.1-amd64:printing-4.1-noarch + Distributor ID: RedHatEnterpriseServer + Description: Red Hat Enterprise Linux Server release 7.0 (Maipo) + Release: 7.0 + Codename: Maipo + """) + + str_rhel_7_os_release = dedent(""" + NAME="Red Hat Enterprise Linux Server" + VERSION="7.0 (Maipo)" + ID="rhel" + ID_LIKE="fedora" + 
VERSION_ID="7.0" + PRETTY_NAME="Red Hat Enterprise Linux Server 7.0 (Maipo)" + ANSI_COLOR="0;31" + CPE_NAME="cpe:/o:redhat:enterprise_linux:7.0:GA:server" + HOME_URL="https://www.redhat.com/" + BUG_REPORT_URL="https://bugzilla.redhat.com/" + + REDHAT_BUGZILLA_PRODUCT="Red Hat Enterprise Linux 7" + REDHAT_BUGZILLA_PRODUCT_VERSION=7.0 + REDHAT_SUPPORT_PRODUCT="Red Hat Enterprise Linux" + REDHAT_SUPPORT_PRODUCT_VERSION=7.0 + """) + + str_fedora_26_os_release = dedent(""" + NAME=Fedora + VERSION="26 (Twenty Six)" + ID=fedora + VERSION_ID=26 + PRETTY_NAME="Fedora 26 (Twenty Six)" + ANSI_COLOR="0;34" + CPE_NAME="cpe:/o:fedoraproject:fedora:26" + HOME_URL="https://fedoraproject.org/" + BUG_REPORT_URL="https://bugzilla.redhat.com/" + REDHAT_BUGZILLA_PRODUCT="Fedora" + REDHAT_BUGZILLA_PRODUCT_VERSION=26 + REDHAT_SUPPORT_PRODUCT="Fedora" + REDHAT_SUPPORT_PRODUCT_VERSION=26 + PRIVACY_POLICY_URL=https://fedoraproject.org/wiki/Legal:PrivacyPolicy + """) + + str_opensuse_42_2_os_release = dedent(""" + NAME="openSUSE Leap" + VERSION="42.2" + ID=opensuse + ID_LIKE="suse" + VERSION_ID="42.2" + PRETTY_NAME="openSUSE Leap 42.2" + ANSI_COLOR="0;32" + CPE_NAME="cpe:/o:opensuse:leap:42.2" + BUG_REPORT_URL="https://bugs.opensuse.org" + HOME_URL="https://www.opensuse.org/" + """) + + str_opensuse_42_3_os_release = dedent(""" + NAME="openSUSE Leap" + VERSION="42.3" + ID=opensuse + ID_LIKE="suse" + VERSION_ID="42.3" + PRETTY_NAME="openSUSE Leap 42.3" + ANSI_COLOR="0;32" + CPE_NAME="cpe:/o:opensuse:leap:42.3" + BUG_REPORT_URL="https://bugs.opensuse.org" + HOME_URL="https://www.opensuse.org/" + """) + + str_opensuse_15_0_os_release = dedent(""" + NAME="openSUSE Leap" + VERSION="15.0" + ID="opensuse-leap" + ID_LIKE="suse opensuse" + VERSION_ID="15.0" + PRETTY_NAME="openSUSE Leap 15.0" + ANSI_COLOR="0;32" + CPE_NAME="cpe:/o:opensuse:leap:15.0" + BUG_REPORT_URL="https://bugs.opensuse.org" + HOME_URL="https://www.opensuse.org/" + """) + + str_opensuse_15_1_os_release = dedent(""" + NAME="openSUSE 
Leap" + VERSION="15.1" + ID="opensuse-leap" + ID_LIKE="suse opensuse" + VERSION_ID="15.1" + PRETTY_NAME="openSUSE Leap 15.1" + ANSI_COLOR="0;32" + CPE_NAME="cpe:/o:opensuse:leap:15.1" + BUG_REPORT_URL="https://bugs.opensuse.org" + HOME_URL="https://www.opensuse.org/" + """) + + def test_centos_9_os_release(self): + os = OS.from_os_release(self.str_centos_9_os_release) + assert os.name == 'centos' + assert os.version == '9.stream' + assert os.codename == 'stream' + assert os.package_type == 'rpm' + + def test_centos_7_os_release(self): + os = OS.from_os_release(self.str_centos_7_os_release) + assert os.name == 'centos' + assert os.version == '7' + assert os.codename == 'core' + assert os.package_type == 'rpm' + + def test_centos_7_os_release_newer(self): + os = OS.from_os_release(self.str_centos_7_os_release_newer) + assert os.name == 'centos' + assert os.version == '7' + assert os.codename == 'core' + assert os.package_type == 'rpm' + + def test_debian_7_lsb_release(self): + os = OS.from_lsb_release(self.str_debian_7_lsb_release) + assert os.name == 'debian' + assert os.version == '7.1' + assert os.codename == 'wheezy' + assert os.package_type == 'deb' + + def test_debian_7_os_release(self): + os = OS.from_os_release(self.str_debian_7_os_release) + assert os.name == 'debian' + assert os.version == '7' + assert os.codename == 'wheezy' + assert os.package_type == 'deb' + + def test_debian_8_os_release(self): + os = OS.from_os_release(self.str_debian_8_os_release) + assert os.name == 'debian' + assert os.version == '8' + assert os.codename == 'jessie' + assert os.package_type == 'deb' + + def test_debian_9_os_release(self): + os = OS.from_os_release(self.str_debian_9_os_release) + assert os.name == 'debian' + assert os.version == '9' + assert os.codename == 'stretch' + assert os.package_type == 'deb' + + def test_ubuntu_12_04_lsb_release(self): + os = OS.from_lsb_release(self.str_ubuntu_12_04_lsb_release) + assert os.name == 'ubuntu' + assert os.version == '12.04' + 
assert os.codename == 'precise' + assert os.package_type == 'deb' + + def test_ubuntu_12_04_os_release(self): + os = OS.from_os_release(self.str_ubuntu_12_04_os_release) + assert os.name == 'ubuntu' + assert os.version == '12.04' + assert os.codename == 'precise' + assert os.package_type == 'deb' + + def test_ubuntu_14_04_os_release(self): + os = OS.from_os_release(self.str_ubuntu_14_04_os_release) + assert os.name == 'ubuntu' + assert os.version == '14.04' + assert os.codename == 'trusty' + assert os.package_type == 'deb' + + def test_ubuntu_16_04_os_release(self): + os = OS.from_os_release(self.str_ubuntu_16_04_os_release) + assert os.name == 'ubuntu' + assert os.version == '16.04' + assert os.codename == 'xenial' + assert os.package_type == 'deb' + + def test_ubuntu_18_04_os_release(self): + os = OS.from_os_release(self.str_ubuntu_18_04_os_release) + assert os.name == 'ubuntu' + assert os.version == '18.04' + assert os.codename == 'bionic' + assert os.package_type == 'deb' + + def test_rhel_6_4_lsb_release(self): + os = OS.from_lsb_release(self.str_rhel_6_4_lsb_release) + assert os.name == 'rhel' + assert os.version == '6.4' + assert os.codename == 'santiago' + assert os.package_type == 'rpm' + + def test_rhel_7_lsb_release(self): + os = OS.from_lsb_release(self.str_rhel_7_lsb_release) + assert os.name == 'rhel' + assert os.version == '7.0' + assert os.codename == 'maipo' + assert os.package_type == 'rpm' + + def test_rhel_7_os_release(self): + os = OS.from_os_release(self.str_rhel_7_os_release) + assert os.name == 'rhel' + assert os.version == '7.0' + assert os.codename == 'maipo' + assert os.package_type == 'rpm' + + def test_fedora_26_os_release(self): + os = OS.from_os_release(self.str_fedora_26_os_release) + assert os.name == 'fedora' + assert os.version == '26' + assert os.codename == '26' + assert os.package_type == 'rpm' + + def test_opensuse_42_2_os_release(self): + os = OS.from_os_release(self.str_opensuse_42_2_os_release) + assert os.name == 
'opensuse' + assert os.version == '42.2' + assert os.codename == 'leap' + assert os.package_type == 'rpm' + + def test_opensuse_42_3_os_release(self): + os = OS.from_os_release(self.str_opensuse_42_3_os_release) + assert os.name == 'opensuse' + assert os.version == '42.3' + assert os.codename == 'leap' + assert os.package_type == 'rpm' + + def test_opensuse_15_0_os_release(self): + os = OS.from_os_release(self.str_opensuse_15_0_os_release) + assert os.name == 'opensuse' + assert os.version == '15.0' + assert os.codename == 'leap' + assert os.package_type == 'rpm' + + def test_opensuse_15_1_os_release(self): + os = OS.from_os_release(self.str_opensuse_15_1_os_release) + assert os.name == 'opensuse' + assert os.version == '15.1' + assert os.codename == 'leap' + assert os.package_type == 'rpm' + + def test_version_codename_success(self): + assert OS.version_codename('ubuntu', '14.04') == ('14.04', 'trusty') + assert OS.version_codename('ubuntu', 'trusty') == ('14.04', 'trusty') + + def test_version_codename_failure(self): + with pytest.raises(KeyError) as excinfo: + OS.version_codename('ubuntu', 'frog') + assert excinfo.type == KeyError + assert 'frog' in excinfo.value.args[0] + + def test_repr(self): + os = OS(name='NAME', version='0.1.2', codename='code') + assert repr(os) == "OS(name='NAME', version='0.1.2', codename='code')" + + def test_to_dict(self): + os = OS(name='NAME', version='0.1.2', codename='code') + ref_dict = dict(name='NAME', version='0.1.2', codename='code') + assert os.to_dict() == ref_dict + + def test_version_no_codename(self): + os = OS(name='ubuntu', version='16.04') + assert os.codename == 'xenial' + + def test_codename_no_version(self): + os = OS(name='ubuntu', codename='trusty') + assert os.version == '14.04' + + def test_eq_equal(self): + os = OS(name='ubuntu', codename='trusty', version='14.04') + assert OS(name='ubuntu', codename='trusty', version='14.04') == os + + def test_eq_not_equal(self): + os = OS(name='ubuntu', codename='trusty', 
version='16.04') + assert OS(name='ubuntu', codename='trusty', version='14.04') != os diff --git a/teuthology/orchestra/test/test_remote.py b/teuthology/orchestra/test/test_remote.py new file mode 100644 index 000000000..1328bc83e --- /dev/null +++ b/teuthology/orchestra/test/test_remote.py @@ -0,0 +1,259 @@ +from mock import patch, Mock, MagicMock +from pytest import raises + +from io import BytesIO + +from teuthology.orchestra import remote +from teuthology.orchestra import opsys +from teuthology.orchestra.run import RemoteProcess +from teuthology.exceptions import CommandFailedError, UnitTestError + + +class TestRemote(object): + + def setup_method(self): + self.start_patchers() + + def teardown_method(self): + self.stop_patchers() + + def start_patchers(self): + self.m_ssh = MagicMock() + self.patcher_ssh = patch( + 'teuthology.orchestra.connection.paramiko.SSHClient', + self.m_ssh, + ) + self.patcher_ssh.start() + + def stop_patchers(self): + self.patcher_ssh.stop() + + def test_shortname(self): + r = remote.Remote( + name='jdoe@xyzzy.example.com', + shortname='xyz', + ssh=self.m_ssh, + ) + assert r.shortname == 'xyz' + assert str(r) == 'jdoe@xyzzy.example.com' + + def test_shortname_default(self): + r = remote.Remote( + name='jdoe@xyzzy.example.com', + ssh=self.m_ssh, + ) + assert r.shortname == 'xyzzy' + assert str(r) == 'jdoe@xyzzy.example.com' + + def test_run(self): + m_transport = MagicMock() + m_transport.getpeername.return_value = ('name', 22) + self.m_ssh.get_transport.return_value = m_transport + m_run = MagicMock() + args = [ + 'something', + 'more', + ] + proc = RemoteProcess( + client=self.m_ssh, + args=args, + ) + m_run.return_value = proc + rem = remote.Remote(name='jdoe@xyzzy.example.com', ssh=self.m_ssh) + rem._runner = m_run + result = rem.run(args=args) + m_transport.getpeername.assert_called_once_with() + m_run_call_kwargs = m_run.call_args_list[0][1] + assert m_run_call_kwargs['args'] == args + assert result is proc + assert result.remote 
is rem + + @patch('teuthology.util.scanner.UnitTestScanner.scan_and_write') + def test_run_unit_test(self, m_scan_and_write): + m_transport = MagicMock() + m_transport.getpeername.return_value = ('name', 22) + self.m_ssh.get_transport.return_value = m_transport + m_run = MagicMock(name="run", side_effect=CommandFailedError('mocked error', 1, 'smithi')) + args = [ + 'something', + 'more', + ] + rem = remote.Remote(name='jdoe@xyzzy.example.com', ssh=self.m_ssh) + rem._runner = m_run + m_scan_and_write.return_value = "Error Message" + with raises(UnitTestError) as exc: + rem.run_unit_test(args=args, xml_path_regex="xml_path", output_yaml="yaml_path") + assert str(exc.value) == "Unit test failed on smithi with status 1: 'Error Message'" + + def test_hostname(self): + m_transport = MagicMock() + m_transport.getpeername.return_value = ('name', 22) + self.m_ssh.get_transport.return_value = m_transport + m_run = MagicMock() + args = [ + 'hostname', + '--fqdn', + ] + stdout = BytesIO(b'test_hostname') + stdout.seek(0) + proc = RemoteProcess( + client=self.m_ssh, + args=args, + ) + proc.stdout = stdout + proc._stdout_buf = Mock() + proc._stdout_buf.channel.recv_exit_status.return_value = 0 + r = remote.Remote(name='xyzzy.example.com', ssh=self.m_ssh) + m_run.return_value = proc + r._runner = m_run + assert r.hostname == 'test_hostname' + + def test_arch(self): + m_transport = MagicMock() + m_transport.getpeername.return_value = ('name', 22) + self.m_ssh.get_transport.return_value = m_transport + m_run = MagicMock() + args = [ + 'uname', + '-m', + ] + stdout = BytesIO(b'test_arch') + stdout.seek(0) + proc = RemoteProcess( + client=self.m_ssh, + args=args, + ) + proc._stdout_buf = Mock() + proc._stdout_buf.channel = Mock() + proc._stdout_buf.channel.recv_exit_status.return_value = 0 + proc._stdout_buf.channel.expects('recv_exit_status').returns(0) + proc.stdout = stdout + m_run.return_value = proc + r = remote.Remote(name='jdoe@xyzzy.example.com', ssh=self.m_ssh) + r._runner = 
m_run + assert r.arch == 'test_arch' + assert len(m_run.call_args_list) == 1 + m_run_call_kwargs = m_run.call_args_list[0][1] + assert m_run_call_kwargs['client'] == self.m_ssh + assert m_run_call_kwargs['name'] == r.shortname + assert m_run_call_kwargs['args'] == ' '.join(args) + + def test_host_key(self): + m_key = MagicMock() + m_key.get_name.return_value = 'key_type' + m_key.get_base64.return_value = 'test ssh key' + m_transport = MagicMock() + m_transport.get_remote_server_key.return_value = m_key + self.m_ssh.get_transport.return_value = m_transport + r = remote.Remote(name='jdoe@xyzzy.example.com', ssh=self.m_ssh) + assert r.host_key == 'key_type test ssh key' + self.m_ssh.get_transport.assert_called_once_with() + m_transport.get_remote_server_key.assert_called_once_with() + + def test_inventory_info(self): + r = remote.Remote('user@host', host_key='host_key') + r._arch = 'arch' + r._os = opsys.OS(name='os_name', version='1.2.3', codename='code') + inv_info = r.inventory_info + assert inv_info == dict( + name='host', + user='user', + arch='arch', + os_type='os_name', + os_version='1.2', + ssh_pub_key='host_key', + up=True, + ) + + def test_sftp_open_file(self): + m_file_obj = MagicMock() + m_stat = Mock() + m_stat.st_size = 42 + m_file_obj.stat.return_value = m_stat + m_open = MagicMock() + m_open.return_value = m_file_obj + m_open.return_value.__enter__.return_value = m_file_obj + with patch.object(remote.Remote, '_sftp_open_file', new=m_open): + rem = remote.Remote(name='jdoe@xyzzy.example.com', ssh=self.m_ssh) + assert rem._sftp_open_file('x') is m_file_obj + assert rem._sftp_open_file('x').stat() is m_stat + assert rem._sftp_open_file('x').stat().st_size == 42 + with rem._sftp_open_file('x') as f: + assert f == m_file_obj + + def test_sftp_get_size(self): + m_file_obj = MagicMock() + m_stat = Mock() + m_stat.st_size = 42 + m_file_obj.stat.return_value = m_stat + m_open = MagicMock() + m_open.return_value = m_file_obj + 
m_open.return_value.__enter__.return_value = m_file_obj + with patch.object(remote.Remote, '_sftp_open_file', new=m_open): + rem = remote.Remote(name='jdoe@xyzzy.example.com', ssh=self.m_ssh) + assert rem._sftp_get_size('/fake/file') == 42 + + def test_format_size(self): + assert remote.Remote._format_size(1023).strip() == '1023B' + assert remote.Remote._format_size(1024).strip() == '1KB' + assert remote.Remote._format_size(1024**2).strip() == '1MB' + assert remote.Remote._format_size(1024**5).strip() == '1TB' + assert remote.Remote._format_size(1021112).strip() == '997KB' + assert remote.Remote._format_size(1021112**2).strip() == '971GB' + + def test_is_container(self): + m_transport = MagicMock() + m_transport.getpeername.return_value = ('name', 22) + self.m_ssh.get_transport.return_value = m_transport + m_run = MagicMock() + args = [] + proc = RemoteProcess( + client=self.m_ssh, + args=args, + ) + proc.returncode = 0 + m_run.return_value = proc + rem = remote.Remote(name='jdoe@xyzzy.example.com', ssh=self.m_ssh) + rem._runner = m_run + assert rem.is_container + proc.returncode = 1 + rem2 = remote.Remote(name='jdoe@xyzzy.example.com', ssh=self.m_ssh) + rem2._runner = m_run + assert not rem2.is_container + + @patch("teuthology.orchestra.remote.Remote.sh") + def test_resolve_ip(self, m_sh): + r = remote.Remote(name="jdoe@xyzzy.example.com", ssh=self.m_ssh) + m_sh.return_value = "smithi001.front.sepia.ceph.com has address 172.21.15.1\n" + ip4 = remote.Remote.resolve_ip(r, 'smithi001') + assert ip4 == "172.21.15.1" + m_sh.return_value = "\n" + try: + ip4 = remote.Remote.resolve_ip(r, 'smithi001') + except Exception as e: + assert 'Cannot get IPv4 address' in str(e) + try: + ip4 = remote.Remote.resolve_ip(r, 'smithi001', 5) + except Exception as e: + assert 'Unknown IP version' in str(e) + + m_sh.return_value = ("google.com has address 142.251.37.14\n" + "google.com has IPv6 address 2a00:1450:4016:80b::200e\n" + "google.com mail is handled by 10 smtp.google.com.\n") + 
ip4 = remote.Remote.resolve_ip(r, 'google.com') + assert ip4 == "142.251.37.14" + ip6 = remote.Remote.resolve_ip(r, 'google.com', '6') + assert ip6 == "2a00:1450:4016:80b::200e" + + + @patch("teuthology.orchestra.remote.Remote.run") + def test_write_file(self, m_run): + file = "fakefile" + contents = "fakecontents" + rem = remote.Remote(name='jdoe@xyzzy.example.com', ssh=self.m_ssh) + + remote.Remote.write_file(rem, file, contents, bs=1, offset=1024) + m_run.assert_called_with(args=f"set -ex\ndd of={file} bs=1 seek=1024", stdin=contents, quiet=True) + + remote.Remote.write_file(rem, file, contents, sync=True) + m_run.assert_called_with(args=f"set -ex\ndd of={file} conv=sync", stdin=contents, quiet=True) diff --git a/teuthology/orchestra/test/test_run.py b/teuthology/orchestra/test/test_run.py new file mode 100644 index 000000000..e8051ccbc --- /dev/null +++ b/teuthology/orchestra/test/test_run.py @@ -0,0 +1,286 @@ +from io import BytesIO + +import paramiko +import socket + +from mock import MagicMock, patch +from pytest import raises + +from teuthology.orchestra import run +from teuthology.exceptions import (CommandCrashedError, CommandFailedError, + ConnectionLostError) + +def set_buffer_contents(buf, contents): + buf.seek(0) + if isinstance(contents, bytes): + buf.write(contents) + elif isinstance(contents, (list, tuple)): + buf.writelines(contents) + elif isinstance(contents, str): + buf.write(contents.encode()) + else: + raise TypeError( + "%s is a %s; should be a byte string, list or tuple" % ( + contents, type(contents) + ) + ) + buf.seek(0) + + +class TestRun(object): + def setup_method(self): + self.start_patchers() + + def teardown_method(self): + self.stop_patchers() + + def start_patchers(self): + self.m_remote_process = MagicMock(wraps=run.RemoteProcess) + self.patcher_remote_proc = patch( + 'teuthology.orchestra.run.RemoteProcess', + self.m_remote_process, + ) + self.m_channel = MagicMock(spec=paramiko.Channel)() + """ + self.m_channelfile = 
MagicMock(wraps=paramiko.ChannelFile) + self.m_stdin_buf = self.m_channelfile(self.m_channel()) + self.m_stdout_buf = self.m_channelfile(self.m_channel()) + self.m_stderr_buf = self.m_channelfile(self.m_channel()) + """ + class M_ChannelFile(BytesIO): + channel = MagicMock(spec=paramiko.Channel)() + + self.m_channelfile = M_ChannelFile + self.m_stdin_buf = self.m_channelfile() + self.m_stdout_buf = self.m_channelfile() + self.m_stderr_buf = self.m_channelfile() + self.m_ssh = MagicMock() + self.m_ssh.exec_command.return_value = ( + self.m_stdin_buf, + self.m_stdout_buf, + self.m_stderr_buf, + ) + self.m_transport = MagicMock() + self.m_transport.getpeername.return_value = ('name', 22) + self.m_ssh.get_transport.return_value = self.m_transport + self.patcher_ssh = patch( + 'teuthology.orchestra.connection.paramiko.SSHClient', + self.m_ssh, + ) + self.patcher_ssh.start() + # Tests must start this if they wish to use it + # self.patcher_remote_proc.start() + + def stop_patchers(self): + # If this patcher wasn't started, it's ok + try: + self.patcher_remote_proc.stop() + except RuntimeError: + pass + self.patcher_ssh.stop() + + def test_exitstatus(self): + self.m_stdout_buf.channel.recv_exit_status.return_value = 0 + proc = run.run( + client=self.m_ssh, + args=['foo', 'bar baz'], + ) + assert proc.exitstatus == 0 + + def test_run_cwd(self): + self.m_stdout_buf.channel.recv_exit_status.return_value = 0 + run.run( + client=self.m_ssh, + args=['foo_bar_baz'], + cwd='/cwd/test', + ) + self.m_ssh.exec_command.assert_called_with('(cd /cwd/test && exec foo_bar_baz)') + + def test_capture_stdout(self): + output = 'foo\nbar' + set_buffer_contents(self.m_stdout_buf, output) + self.m_stdout_buf.channel.recv_exit_status.return_value = 0 + stdout = BytesIO() + proc = run.run( + client=self.m_ssh, + args=['foo', 'bar baz'], + stdout=stdout, + ) + assert proc.stdout is stdout + assert proc.stdout.read().decode() == output + assert proc.stdout.getvalue().decode() == output + + def 
test_capture_stderr_newline(self): + output = 'foo\nbar\n' + set_buffer_contents(self.m_stderr_buf, output) + self.m_stderr_buf.channel.recv_exit_status.return_value = 0 + stderr = BytesIO() + proc = run.run( + client=self.m_ssh, + args=['foo', 'bar baz'], + stderr=stderr, + ) + assert proc.stderr is stderr + assert proc.stderr.read().decode() == output + assert proc.stderr.getvalue().decode() == output + + def test_status_bad(self): + self.m_stdout_buf.channel.recv_exit_status.return_value = 42 + with raises(CommandFailedError) as exc: + run.run( + client=self.m_ssh, + args=['foo'], + ) + assert str(exc.value) == "Command failed on name with status 42: 'foo'" + + def test_status_bad_nocheck(self): + self.m_stdout_buf.channel.recv_exit_status.return_value = 42 + proc = run.run( + client=self.m_ssh, + args=['foo'], + check_status=False, + ) + assert proc.exitstatus == 42 + + def test_status_crash(self): + self.m_stdout_buf.channel.recv_exit_status.return_value = -1 + with raises(CommandCrashedError) as exc: + run.run( + client=self.m_ssh, + args=['foo'], + ) + assert str(exc.value) == "Command crashed: 'foo'" + + def test_status_crash_nocheck(self): + self.m_stdout_buf.channel.recv_exit_status.return_value = -1 + proc = run.run( + client=self.m_ssh, + args=['foo'], + check_status=False, + ) + assert proc.exitstatus == -1 + + def test_status_lost(self): + m_transport = MagicMock() + m_transport.getpeername.return_value = ('name', 22) + m_transport.is_active.return_value = False + self.m_stdout_buf.channel.recv_exit_status.return_value = -1 + self.m_ssh.get_transport.return_value = m_transport + with raises(ConnectionLostError) as exc: + run.run( + client=self.m_ssh, + args=['foo'], + ) + assert str(exc.value) == "SSH connection to name was lost: 'foo'" + + def test_status_lost_socket(self): + m_transport = MagicMock() + m_transport.getpeername.side_effect = socket.error + self.m_ssh.get_transport.return_value = m_transport + with raises(ConnectionLostError) as exc: + 
run.run( + client=self.m_ssh, + args=['foo'], + ) + assert str(exc.value) == "SSH connection was lost: 'foo'" + + def test_status_lost_nocheck(self): + m_transport = MagicMock() + m_transport.getpeername.return_value = ('name', 22) + m_transport.is_active.return_value = False + self.m_stdout_buf.channel.recv_exit_status.return_value = -1 + self.m_ssh.get_transport.return_value = m_transport + proc = run.run( + client=self.m_ssh, + args=['foo'], + check_status=False, + ) + assert proc.exitstatus == -1 + + def test_status_bad_nowait(self): + self.m_stdout_buf.channel.recv_exit_status.return_value = 42 + proc = run.run( + client=self.m_ssh, + args=['foo'], + wait=False, + ) + with raises(CommandFailedError) as exc: + proc.wait() + assert proc.returncode == 42 + assert str(exc.value) == "Command failed on name with status 42: 'foo'" + + def test_stdin_pipe(self): + self.m_stdout_buf.channel.recv_exit_status.return_value = 0 + proc = run.run( + client=self.m_ssh, + args=['foo'], + stdin=run.PIPE, + wait=False + ) + assert proc.poll() == 0 + code = proc.wait() + assert code == 0 + assert proc.exitstatus == 0 + + def test_stdout_pipe(self): + self.m_stdout_buf.channel.recv_exit_status.return_value = 0 + lines = [b'one\n', b'two', b''] + set_buffer_contents(self.m_stdout_buf, lines) + proc = run.run( + client=self.m_ssh, + args=['foo'], + stdout=run.PIPE, + wait=False + ) + assert proc.poll() == 0 + assert proc.stdout.readline() == lines[0] + assert proc.stdout.readline() == lines[1] + assert proc.stdout.readline() == lines[2] + code = proc.wait() + assert code == 0 + assert proc.exitstatus == 0 + + def test_stderr_pipe(self): + self.m_stdout_buf.channel.recv_exit_status.return_value = 0 + lines = [b'one\n', b'two', b''] + set_buffer_contents(self.m_stderr_buf, lines) + proc = run.run( + client=self.m_ssh, + args=['foo'], + stderr=run.PIPE, + wait=False + ) + assert proc.poll() == 0 + assert proc.stderr.readline() == lines[0] + assert proc.stderr.readline() == lines[1] + 
assert proc.stderr.readline() == lines[2] + code = proc.wait() + assert code == 0 + assert proc.exitstatus == 0 + + def test_copy_and_close(self): + run.copy_and_close(None, MagicMock()) + run.copy_and_close('', MagicMock()) + run.copy_and_close(b'', MagicMock()) + + +class TestQuote(object): + def test_quote_simple(self): + got = run.quote(['a b', ' c', 'd e ']) + assert got == "'a b' ' c' 'd e '" + + def test_quote_and_quote(self): + got = run.quote(['echo', 'this && is embedded', '&&', + 'that was standalone']) + assert got == "echo 'this && is embedded' '&&' 'that was standalone'" + + def test_quote_and_raw(self): + got = run.quote(['true', run.Raw('&&'), 'echo', 'yay']) + assert got == "true && echo yay" + + +class TestRaw(object): + def test_eq(self): + str_ = "I am a raw something or other" + raw = run.Raw(str_) + assert raw == run.Raw(str_) diff --git a/teuthology/orchestra/test/test_systemd.py b/teuthology/orchestra/test/test_systemd.py new file mode 100644 index 000000000..c7cb3425f --- /dev/null +++ b/teuthology/orchestra/test/test_systemd.py @@ -0,0 +1,54 @@ +import argparse +import os + +from logging import debug +from teuthology import misc +from teuthology.orchestra import cluster +from teuthology.orchestra.run import quote +from teuthology.orchestra.daemon.group import DaemonGroup +import subprocess + + +class FakeRemote(object): + pass + + +def test_pid(): + ctx = argparse.Namespace() + ctx.daemons = DaemonGroup(use_systemd=True) + remote = FakeRemote() + + ps_ef_output_path = os.path.join( + os.path.dirname(__file__), + "files/daemon-systemdstate-pid-ps-ef.output" + ) + + # patching ps -ef command output using a file + def sh(args): + args[0:2] = ["cat", ps_ef_output_path] + debug(args) + return subprocess.getoutput(quote(args)) + + remote.sh = sh + remote.init_system = 'systemd' + remote.shortname = 'host1' + + ctx.cluster = cluster.Cluster( + remotes=[ + (remote, ['rgw.0', 'mon.a', 'mgr.a', 'mds.a', 'osd.0']) + ], + ) + + for remote, roles in 
ctx.cluster.remotes.items(): + for role in roles: + _, rol, id_ = misc.split_role(role) + if any(rol.startswith(x) for x in ['mon', 'mgr', 'mds']): + ctx.daemons.register_daemon(remote, rol, remote.shortname) + else: + ctx.daemons.register_daemon(remote, rol, id_) + + for _, daemons in ctx.daemons.daemons.items(): + for daemon in daemons.values(): + pid = daemon.pid + debug(pid) + assert pid diff --git a/teuthology/orchestra/test/util.py b/teuthology/orchestra/test/util.py new file mode 100644 index 000000000..4aedc2ee3 --- /dev/null +++ b/teuthology/orchestra/test/util.py @@ -0,0 +1,12 @@ +def assert_raises(excClass, callableObj, *args, **kwargs): + """ + Like unittest.TestCase.assertRaises, but returns the exception. + """ + try: + callableObj(*args, **kwargs) + except excClass as e: + return e + else: + if hasattr(excClass,'__name__'): excName = excClass.__name__ + else: excName = str(excClass) + raise AssertionError("%s not raised" % excName) diff --git a/teuthology/packaging.py b/teuthology/packaging.py new file mode 100644 index 000000000..2f6e6ba13 --- /dev/null +++ b/teuthology/packaging.py @@ -0,0 +1,1064 @@ +import logging +import ast +import re +import requests + +from teuthology.util.compat import urljoin, urlencode + +from collections import OrderedDict +from teuthology.util.compat import PY3 +if PY3: + from io import StringIO +else: + from io import BytesIO as StringIO +from teuthology import repo_utils + +from teuthology.config import config +from teuthology.contextutil import safe_while +from teuthology.exceptions import (VersionNotFoundError, CommitNotFoundError, + NoRemoteError) +from teuthology.misc import sudo_write_file +from teuthology.orchestra.opsys import OS, DEFAULT_OS_VERSION +from teuthology.orchestra.run import Raw + +log = logging.getLogger(__name__) + +''' +Map 'generic' package name to 'flavor-specific' package name. 
+If entry is None, either the package isn't known here, or +it's known but should not be installed on remotes of this flavor +''' + +_PACKAGE_MAP = { + 'sqlite': {'deb': 'sqlite3', 'rpm': None} +} + +''' +Map 'generic' service name to 'flavor-specific' service name. +''' +_SERVICE_MAP = { + 'httpd': {'deb': 'apache2', 'rpm': 'httpd'} +} + + +def get_package_name(pkg, rem): + """ + Find the remote-specific name of the generic 'pkg' + """ + flavor = rem.os.package_type + + try: + return _PACKAGE_MAP[pkg][flavor] + except KeyError: + return None + + +def get_service_name(service, rem): + """ + Find the remote-specific name of the generic 'service' + """ + flavor = rem.os.package_type + try: + return _SERVICE_MAP[service][flavor] + except KeyError: + return None + + +def install_package(package, remote): + """ + Install 'package' on 'remote' + Assumes repo has already been set up (perhaps with install_repo) + """ + log.info('Installing package %s on %s', package, remote) + flavor = remote.os.package_type + if flavor == 'deb': + pkgcmd = ['DEBIAN_FRONTEND=noninteractive', + 'sudo', + '-E', + 'apt-get', + '-y', + '--force-yes', + 'install', + '{package}'.format(package=package)] + elif flavor == 'rpm': + # FIXME: zypper + pkgcmd = ['sudo', + 'yum', + '-y', + 'install', + '{package}'.format(package=package)] + else: + log.error('install_package: bad flavor ' + flavor + '\n') + return False + return remote.run(args=pkgcmd) + + +def remove_package(package, remote): + """ + Remove package from remote + """ + flavor = remote.os.package_type + if flavor == 'deb': + pkgcmd = ['DEBIAN_FRONTEND=noninteractive', + 'sudo', + '-E', + 'apt-get', + '-y', + 'purge', + '{package}'.format(package=package)] + elif flavor == 'rpm': + # FIXME: zypper + pkgcmd = ['sudo', + 'yum', + '-y', + 'erase', + '{package}'.format(package=package)] + else: + log.error('remove_package: bad flavor ' + flavor + '\n') + return False + return remote.run(args=pkgcmd) + + +def get_koji_task_result(task_id, 
remote, ctx): + """ + Queries kojihub and retrieves information about + the given task_id. The package, koji, must be installed + on the remote for this command to work. + + We need a remote here because koji can only be installed + on rpm based machines and teuthology runs on Ubuntu. + + The results of the given task are returned. For example: + + { + 'brootid': 3303567, + 'srpms': [], + 'rpms': [ + 'tasks/6745/9666745/kernel-4.1.0-0.rc2.git2.1.fc23.x86_64.rpm', + 'tasks/6745/9666745/kernel-modules-4.1.0-0.rc2.git2.1.fc23.x86_64.rpm', + ], + 'logs': [] + } + + :param task_id: The koji task_id we want to retrieve results for. + :param remote: The remote to run the koji command on. + :param ctx: The ctx from the current run, used to provide a + failure_reason and status if the koji command fails. + :returns: A python dict containing info about the task results. + """ + py_cmd = ('import koji; ' + 'hub = koji.ClientSession("{kojihub_url}"); ' + 'print(hub.getTaskResult({task_id}))') + py_cmd = py_cmd.format( + task_id=task_id, + kojihub_url=config.kojihub_url + ) + log.info("Querying kojihub for the result of task {0}".format(task_id)) + task_result = _run_python_command(py_cmd, remote, ctx) + return task_result + + +def get_koji_task_rpm_info(package, task_rpms): + """ + Extracts information about a given package from the provided + rpm results of a koji task. + + For example, if trying to retrieve the package 'kernel' from + the results of a task, the output would look like this: + + { + 'base_url': 'https://kojipkgs.fedoraproject.org/work/tasks/6745/9666745/', + 'rpm_name': 'kernel-4.1.0-0.rc2.git2.1.fc23.x86_64.rpm', + 'package_name': 'kernel', + 'version': '4.1.0-0.rc2.git2.1.fc23.x86_64', + } + + :param task_rpms: A list of rpms from a tasks reusults. + :param package: The name of the package to retrieve. + :returns: A python dict containing info about the package. 
+ """ + result = dict() + result['package_name'] = package + found_pkg = _find_koji_task_result(package, task_rpms) + if not found_pkg: + raise RuntimeError("The package {pkg} was not found in: {rpms}".format( + pkg=package, + rpms=task_rpms, + )) + + path, rpm_name = found_pkg.rsplit("/", 1) + result['rpm_name'] = rpm_name + result['base_url'] = "{koji_task_url}/{path}/".format( + koji_task_url=config.koji_task_url, + path=path, + ) + # removes the package name from the beginning of rpm_name + version = rpm_name.split("{0}-".format(package), 1)[1] + # removes .rpm from the rpm_name + version = version.split(".rpm")[0] + result['version'] = version + return result + + +def _find_koji_task_result(package, rpm_list): + """ + Looks in the list of rpms from koji task results to see if + the package we are looking for is present. + + Returns the full list item, including the path, if found. + + If not found, returns None. + """ + for rpm in rpm_list: + if package == _get_koji_task_result_package_name(rpm): + return rpm + return None + + +def _get_koji_task_result_package_name(path): + """ + Strips the package name from a koji rpm result. + + This makes the assumption that rpm names are in the following + format: -...rpm + + For example, given a koji rpm result might look like: + + tasks/6745/9666745/kernel-4.1.0-0.rc2.git2.1.fc23.x86_64.rpm + + This method would return "kernel". + """ + filename = path.split('/')[-1] + trimmed = [] + for part in filename.split('-'): + # assumes that when the next part is not a digit + # we're past the name and at the version + if part[0].isdigit(): + return '-'.join(trimmed) + trimmed.append(part) + + return '-'.join(trimmed) + + +def get_koji_build_info(build_id, remote, ctx): + """ + Queries kojihub and retrieves information about + the given build_id. The package, koji, must be installed + on the remote for this command to work. 
+ + We need a remote here because koji can only be installed + on rpm based machines and teuthology runs on Ubuntu. + + Here is an example of the build info returned: + + {'owner_name': 'kdreyer', 'package_name': 'ceph', + 'task_id': 8534149, 'completion_ts': 1421278726.1171, + 'creation_event_id': 10486804, 'creation_time': '2015-01-14 18:15:17.003134', + 'epoch': None, 'nvr': 'ceph-0.80.5-4.el7ost', 'name': 'ceph', + 'completion_time': '2015-01-14 18:38:46.1171', 'state': 1, 'version': '0.80.5', + 'volume_name': 'DEFAULT', 'release': '4.el7ost', 'creation_ts': 1421277317.00313, + 'package_id': 34590, 'id': 412677, 'volume_id': 0, 'owner_id': 2826 + } + + :param build_id: The koji build_id we want to retrieve info on. + :param remote: The remote to run the koji command on. + :param ctx: The ctx from the current run, used to provide a + failure_reason and status if the koji command fails. + :returns: A python dict containing info about the build. + """ + py_cmd = ('import koji; ' + 'hub = koji.ClientSession("{kojihub_url}"); ' + 'print(hub.getBuild({build_id}))') + py_cmd = py_cmd.format( + build_id=build_id, + kojihub_url=config.kojihub_url + ) + log.info('Querying kojihub for info on build {0}'.format(build_id)) + build_info = _run_python_command(py_cmd, remote, ctx) + return build_info + + +def _run_python_command(py_cmd, remote, ctx): + """ + Runs the given python code on the remote + and returns the stdout from the code as + a python object. 
+ """ + proc = remote.run( + args=[ + 'python', '-c', py_cmd + ], + stdout=StringIO(), stderr=StringIO(), check_status=False + ) + if proc.exitstatus == 0: + # returns the __repr__ of a python dict + stdout = proc.stdout.getvalue().strip() + # take the __repr__ and makes it a python dict again + result = ast.literal_eval(stdout) + else: + msg = "Error running the following on {0}: {1}".format(remote, py_cmd) + log.error(msg) + log.error("stdout: {0}".format(proc.stdout.getvalue().strip())) + log.error("stderr: {0}".format(proc.stderr.getvalue().strip())) + ctx.summary["failure_reason"] = msg + ctx.summary["status"] = "dead" + raise RuntimeError(msg) + + return result + + +def get_kojiroot_base_url(build_info, arch="x86_64"): + """ + Builds the base download url for kojiroot given the current + build information. + + :param build_info: A dict of koji build information, possibly + retrieved from get_koji_build_info. + :param arch: The arch you want to download rpms for. + :returns: The base_url to use when downloading rpms + from brew. + """ + base_url = "{kojiroot}/{package_name}/{ver}/{rel}/{arch}/".format( + kojiroot=config.kojiroot_url, + package_name=build_info["package_name"], + ver=build_info["version"], + rel=build_info["release"], + arch=arch, + ) + return base_url + + +def get_koji_package_name(package, build_info, arch="x86_64"): + """ + Builds the package name for a brew rpm. + + :param package: The name of the package + :param build_info: A dict of koji build information, possibly + retrieved from get_brew_build_info. + :param arch: The arch you want to download rpms for. + :returns: A string representing the file name for the + requested package in koji. 
+ """ + pkg_name = "{name}-{ver}-{rel}.{arch}.rpm".format( + name=package, + ver=build_info["version"], + rel=build_info["release"], + arch=arch, + ) + + return pkg_name + + +def get_package_version(remote, package): + installed_ver = None + if remote.os.package_type == "deb": + proc = remote.run( + args=[ + 'dpkg-query', '-W', '-f', '${Version}', package + ], + stdout=StringIO(), + ) + else: + proc = remote.run( + args=[ + 'rpm', '-q', package, '--qf', '%{VERSION}-%{RELEASE}' + ], + stdout=StringIO(), + ) + if proc.exitstatus == 0: + installed_ver = proc.stdout.getvalue().strip() + # Does this look like a version string? + # this assumes a version string starts with non-alpha characters + if installed_ver and re.match('^[^a-zA-Z]', installed_ver): + log.info("The installed version of {pkg} is {ver}".format( + pkg=package, + ver=installed_ver, + )) + else: + installed_ver = None + else: + # should this throw an exception and stop the job? + log.warning( + "Unable to determine if {pkg} is installed: {stdout}".format( + pkg=package, + stdout=proc.stdout.getvalue().strip(), + ) + ) + + return installed_ver + + +def _get_config_value_for_remote(ctx, remote, config, key): + """ + Look through config, and attempt to determine the "best" value to use + for a given key. For example, given:: + + config = { + 'all': + {'branch': 'main'}, + 'branch': 'next' + } + _get_config_value_for_remote(ctx, remote, config, 'branch') + + would return 'main'. 
+ + :param ctx: the argparse.Namespace object + :param remote: the teuthology.orchestra.remote.Remote object + :param config: the config dict + :param key: the name of the value to retrieve + """ + roles = ctx.cluster.remotes[remote] if ctx else None + if 'all' in config: + return config['all'].get(key) + elif roles: + for role in roles: + if role in config and key in config[role]: + return config[role].get(key) + return config.get(key) + + +def _get_response(url, wait=False, sleep=15, tries=10): + with safe_while(sleep=sleep, tries=tries, _raise=False) as proceed: + while proceed(): + resp = requests.get(url) + if resp.ok: + log.info('Package found...') + break + + if not wait: + log.info( + 'Package is not found at: %s (got HTTP code %s)...', + url, + resp.status_code, + ) + break + + log.info( + 'Package not there yet (got HTTP code %s), waiting...', + resp.status_code, + ) + + return resp + + +class GitbuilderProject(object): + """ + Represents a project that is built by gitbuilder. + """ + # gitbuilder always uses this value + rpm_release = "1-0" + + def __init__(self, project, job_config, ctx=None, remote=None): + self.project = project + self.job_config = job_config + #TODO: we could get around the need for ctx by using a list + # of roles instead, ctx is only used in _get_config_value_for_remote. 
+ self.ctx = ctx + self.remote = remote + + if remote and ctx: + self._init_from_remote() + else: + self._init_from_config() + + self.dist_release = self._get_dist_release() + + def _init_from_remote(self): + """ + Initializes the class from a teuthology.orchestra.remote.Remote object + """ + self.arch = self.remote.arch + self.os_type = self.remote.os.name + self.os_version = self.remote.os.version + self.codename = self.remote.os.codename + self.pkg_type = self.remote.system_type + self.distro = self._get_distro( + distro=self.remote.os.name, + version=self.remote.os.version, + codename=self.remote.os.codename, + ) + # when we're initializing with a remote we most likely have + # a task config, not the entire teuthology job config + self.flavor = self.job_config.get("flavor", "default") + self.tag = self.job_config.get("tag") + + def _init_from_config(self): + """ + Initializes the class from a teuthology job config + """ + self.arch = self.job_config.get('arch', 'x86_64') + self.os_type = self.job_config.get("os_type") + self.flavor = self.job_config.get("flavor") + self.codename = self.job_config.get("codename") + self.os_version = self._get_version() + # if os_version is given, prefer version/codename derived from it + if self.os_version: + self.os_version, self.codename = \ + OS.version_codename(self.os_type, self.os_version) + self.branch = self.job_config.get("branch") + self.tag = self.job_config.get("tag") + self.ref = self.job_config.get("ref") + self.distro = self._get_distro( + distro=self.os_type, + version=self.os_version, + codename=self.codename, + ) + self.pkg_type = "deb" if self.os_type.lower() in ( + "ubuntu", + "debian", + ) else "rpm" + + if not getattr(self, 'flavor'): + # avoiding circular imports + from teuthology.suite.util import get_install_task_flavor + # when we're initializing from a full teuthology config, not just a + # task config we need to make sure we're looking at the flavor for + # the install task + self.flavor = 
get_install_task_flavor(self.job_config) + + @property + def sha1(self): + """ + Performs a call to gitbuilder to retrieve the sha1 if not provided in + the job_config. The returned value is cached so that this call only + happens once. + + :returns: The sha1 of the project as a string. + """ + if not hasattr(self, "_sha1"): + self._sha1 = self.job_config.get('sha1') + if not self._sha1: + self._sha1 = self._get_package_sha1() + return self._sha1 + + @property + def version(self): + """ + Performs a call to gitubilder to retrieve the version number for the + project. The returned value is cached so that this call only happens + once. + + :returns: The version number of the project as a string. + """ + if not hasattr(self, '_version'): + self._version = self._get_package_version() + return self._version + + @property + def base_url(self): + """ + The base url that points at this project on gitbuilder. + + :returns: A string of the base url for this project + """ + return self._get_base_url() + + @property + def uri_reference(self): + """ + The URI reference that identifies what build of the project + we'd like to use. + + For example, the following could be returned:: + + ref/ + sha1/ + ref/ + + :returns: The uri_reference as a string. + """ + return self._get_uri_reference() + + def _get_dist_release(self): + version = self._parse_version(self.os_version) + if self.os_type in ('centos', 'rhel'): + return "el{0}".format(version) + elif self.os_type == "fedora": + return "fc{0}".format(version) + else: + # debian and ubuntu just use the distro name + return self.os_type + + @staticmethod + def _parse_version(version): + """ + Parses a distro version string and returns a modified string + that matches the format needed for the gitbuilder url. + + Minor version numbers are ignored. 
+ """ + return version.split(".")[0] + + @classmethod + def _get_distro(cls, distro=None, version=None, codename=None): + """ + Given a distro and a version, returned the combined string + to use in a gitbuilder url. + + :param distro: The distro as a string + :param version: The version as a string + :param codename: The codename for the distro. + Used for deb based distros. + """ + if distro in ('centos', 'rhel'): + distro = "centos" + elif distro == "fedora": + distro = "fedora" + elif distro == "opensuse": + distro = "opensuse" + elif distro == "sle": + distro == "sle" + else: + # deb based systems use codename instead of a distro/version combo + if not codename: + # lookup codename based on distro string + codename = OS._version_to_codename(distro, version) + if not codename: + msg = "No codename found for: {distro} {version}".format( + distro=distro, + version=version, + ) + log.exception(msg) + raise RuntimeError() + return codename + + return "{distro}{version}".format( + distro=distro, + version=cls._parse_version(version), + ) + + def _get_version(self): + """ + Attempts to find the distro version from the job_config. + + If not found, it will return the default version for + the distro found in job_config. + + :returns: A string distro version + """ + version = self.job_config.get("os_version") + if not version: + version = DEFAULT_OS_VERSION.get(self.os_type) + + return str(version) + + def _get_uri_reference(self): + """ + Returns the URI reference that identifies what build of the project + we'd like to use. + + If a remote is given, it will attempt to read the config for the given + remote to find either a tag, branch or sha1 defined. If there is no + remote, the sha1 from the config will be used. + + If a tag, branch or sha1 can't be found it will default to use the + build from the main branch. + + :returns: A string URI. 
Ex: ref/main + """ + ref_name, ref_val = next(iter(self._choose_reference().items())) + if ref_name == 'sha1': + return 'sha1/%s' % ref_val + else: + return 'ref/%s' % ref_val + + def _choose_reference(self): + """ + Since it's only meaningful to search for one of: + ref, tag, branch, sha1 + Decide which to use. + + :returns: a single-key dict containing the name and value of the + reference to use, e.g. {'branch': 'main'} + """ + tag = branch = sha1 = None + if self.remote: + tag = _get_config_value_for_remote(self.ctx, self.remote, + self.job_config, 'tag') + branch = _get_config_value_for_remote(self.ctx, self.remote, + self.job_config, 'branch') + sha1 = _get_config_value_for_remote(self.ctx, self.remote, + self.job_config, 'sha1') + ref = None + else: + ref = self.ref + tag = self.tag + branch = self.branch + sha1 = self.sha1 + + def warn(attrname): + names = ('ref', 'tag', 'branch', 'sha1') + vars = (ref, tag, branch, sha1) + # filter(None,) filters for truth + if sum(1 for _ in vars if _) > 1: + log.warning( + "More than one of ref, tag, branch, or sha1 supplied; " + "using %s", + attrname + ) + for n, v in zip(names, vars): + log.info('%s: %s' % (n, v)) + + if ref: + warn('ref') + return dict(ref=ref) + elif tag: + warn('tag') + return dict(tag=tag) + elif branch: + warn('branch') + return dict(branch=branch) + elif sha1: + warn('sha1') + return dict(sha1=sha1) + else: + log.warning("defaulting to main branch") + return dict(branch='main') + + def _get_base_url(self): + """ + Figures out which package repo base URL to use. + """ + template = config.baseurl_template + # get distro name and arch + base_url = template.format( + host=config.gitbuilder_host, + proj=self.project, + pkg_type=self.pkg_type, + arch=self.arch, + dist=self.distro, + flavor=self.flavor, + uri=self.uri_reference, + ) + return base_url + + def _get_package_version(self): + """ + Look for, and parse, a file called 'version' in base_url. 
+ """ + url = "{0}/version".format(self.base_url) + log.info("Looking for package version: {0}".format(url)) + # will loop and retry until a 200 is returned or the retry + # limits are reached + resp = _get_response(url, wait=self.job_config.get("wait_for_package", False)) + + if not resp.ok: + raise VersionNotFoundError(url) + version = resp.text.strip().lstrip('v') + log.info("Found version: {0}".format(version)) + return version + + def _get_package_sha1(self): + """ + Look for, and parse, a file called 'sha1' in base_url. + """ + url = "{0}/sha1".format(self.base_url) + log.info("Looking for package sha1: {0}".format(url)) + resp = requests.get(url) + sha1 = None + if not resp.ok: + # TODO: maybe we should have this retry a few times? + log.error( + 'Package sha1 was not there (got HTTP code %s)...', + resp.status_code, + ) + else: + sha1 = resp.text.strip() + log.info("Found sha1: {0}".format(sha1)) + + return sha1 + + def install_repo(self): + """ + Install the .repo file or sources.list fragment on self.remote if there + is one. 
If not, raises an exception + """ + if not self.remote: + raise NoRemoteError() + if self.remote.os.package_type == 'rpm': + self._install_rpm_repo() + elif self.remote.os.package_type == 'deb': + self._install_deb_repo() + + def _install_rpm_repo(self): + dist_release = self.dist_release + project = self.project + proj_release = \ + '{proj}-release-{release}.{dist_release}.noarch'.format( + proj=project, release=self.rpm_release, + dist_release=dist_release + ) + rpm_name = "{rpm_nm}.rpm".format(rpm_nm=proj_release) + url = "{base_url}/noarch/{rpm_name}".format( + base_url=self.base_url, rpm_name=rpm_name) + if dist_release in ['opensuse', 'sle']: + url = "{base_url}/{arch}".format( + base_url=self.base_url, arch=self.arch) + self.remote.run(args=[ + 'sudo', 'zypper', '-n', 'addrepo', '--refresh', '--no-gpgcheck', + '-p', '1', url, 'ceph-rpm-under-test', + ]) + else: + self.remote.run(args=['sudo', 'yum', '-y', 'install', url]) + + def _install_deb_repo(self): + self.remote.run( + args=[ + 'echo', 'deb', self.base_url, self.codename, 'main', + Raw('|'), + 'sudo', 'tee', + '/etc/apt/sources.list.d/{proj}.list'.format( + proj=self.project), + ], + stdout=StringIO(), + ) + + def remove_repo(self): + """ + Remove the .repo file or sources.list fragment on self.remote if there + is one. 
If not, raises an exception + """ + if not self.remote: + raise NoRemoteError() + if self.remote.os.package_type == 'rpm': + self._remove_rpm_repo() + elif self.remote.os.package_type == 'deb': + self._remove_deb_repo() + + def _remove_rpm_repo(self): + if self.dist_release in ['opensuse', 'sle']: + self.remote.run(args=[ + 'sudo', 'zypper', '-n', 'removerepo', 'ceph-rpm-under-test' + ]) + else: + remove_package('%s-release' % self.project, self.remote) + + def _remove_deb_repo(self): + self.remote.run( + args=[ + 'sudo', + 'rm', '-f', + '/etc/apt/sources.list.d/{proj}.list'.format( + proj=self.project), + ] + ) + + +class ShamanProject(GitbuilderProject): + def __init__(self, project, job_config, ctx=None, remote=None): + super(ShamanProject, self).__init__(project, job_config, ctx, remote) + self.query_url = 'https://%s/api/' % config.shaman_host + + # Force to use the "noarch" instead to build the uri. + self.force_noarch = self.job_config.get("shaman", {}).get("force_noarch", False) + + def _get_base_url(self): + self.assert_result() + return self._result.json()[0]['url'] + + @property + def _result(self): + if getattr(self, '_result_obj', None) is None: + self._result_obj = self._search() + return self._result_obj + + def _search(self): + uri = self._search_uri + log.debug("Querying %s", uri) + resp = requests.get( + uri, + headers={'content-type': 'application/json'}, + ) + resp.raise_for_status() + return resp + + @property + def _search_uri(self): + flavor = self.flavor + req_obj = OrderedDict() + req_obj['status'] = 'ready' + req_obj['project'] = self.project + req_obj['flavor'] = flavor + arch = "noarch" if self.force_noarch else self.arch + req_obj['distros'] = '%s/%s' % (self.distro, arch) + ref_name, ref_val = list(self._choose_reference().items())[0] + if ref_name == 'tag': + req_obj['sha1'] = self._sha1 = self._tag_to_sha1() + elif ref_name == 'sha1': + req_obj['sha1'] = ref_val + else: + req_obj['ref'] = ref_val + req_str = urlencode(req_obj) + uri 
= urljoin( + self.query_url, + 'search', + ) + '?%s' % req_str + return uri + + def _tag_to_sha1(self): + """ + Shaman doesn't know about tags. Use git ls-remote to query the remote + repo in order to map tags to their sha1 value. + + This method will also retry against ceph.git if the original request + uses ceph-ci.git and fails. + """ + def get_sha1(url): + # Ceph (and other projects) uses annotated tags for releases. This + # has the side-effect of making git ls-remote return the sha1 for + # the annotated tag object and not the last "real" commit in that + # tag. By contrast, when a person (or a build system) issues a + # "git checkout " command, HEAD will be the last "real" commit + # and not the tag. + # Below we have to append "^{}" to the tag value to work around + # this in order to query for the sha1 that the build system uses. + return repo_utils.ls_remote(url, "%s^{}" % self.tag) + + git_url = repo_utils.build_git_url(self.project) + result = get_sha1(git_url) + # For upgrade tests that are otherwise using ceph-ci.git, we need to + # also look in ceph.git to lookup released tags. + if result is None and 'ceph-ci' in git_url: + alt_git_url = git_url.replace('ceph-ci', 'ceph') + log.info( + "Tag '%s' not found in %s; will also look in %s", + self.tag, + git_url, + alt_git_url, + ) + result = get_sha1(alt_git_url) + + if result is None: + raise CommitNotFoundError(self.tag, git_url) + return result + + def assert_result(self): + if len(self._result.json()) == 0: + raise VersionNotFoundError(self._result.url) + + @classmethod + def _get_distro(cls, distro=None, version=None, codename=None): + if distro in ('centos', 'rhel'): + distro = 'centos' + version = cls._parse_version(version) + return "%s/%s" % (distro, version) + + def _get_package_sha1(self): + # This doesn't raise because GitbuilderProject._get_package_sha1() + # doesn't either. 
+ if not len(self._result.json()): + log.error("sha1 not found: %s", self._result.url) + else: + return self._result.json()[0]['sha1'] + + def _get_package_version(self): + self.assert_result() + return self._result.json()[0]['extra']['package_manager_version'] + + @property + def scm_version(self): + self.assert_result() + return self._result.json()[0]['extra']['version'] + + @property + def repo_url(self): + self.assert_result() + return urljoin( + self._result.json()[0]['chacra_url'], + 'repo', + ) + + @property + def build_complete(self): + # use the repo search results to get a ref and a sha1; the + # input to teuthology-suite doesn't contain both + try: + self.assert_result() + except VersionNotFoundError: + return False + + # self._result has status, project, flavor, distros, arch, and sha1 + # restrictions, so the only reason for multiples should be "multiple + # builds of the same sha1 etc."; the first entry is the newest + search_result = self._result.json()[0] + + # now look for the build complete status + path = '/'.join( + ('builds/ceph', search_result['ref'], search_result['sha1']) + ) + build_url = urljoin(self.query_url, path) + + try: + resp = requests.get(build_url) + resp.raise_for_status() + except requests.HttpError: + return False + log.debug(f'looking for {self.distro} {self.arch} {self.flavor}') + for build in resp.json(): + log.debug(f'build: {build["distro"]}/{build["distro_version"]} {build["distro_arch"]} {build["flavor"]}') + if ( + # we must compare build arch to self.arch, since shaman's + # results can have multiple arches but we're searching + # for precisely one here + build['distro'] == search_result['distro'] and + build['distro_version'] == search_result['distro_version'] and + build['flavor'] == search_result['flavor'] and + build['distro_arch'] == self.arch and + build['status'] == 'completed' + ): + return True + return False + + def _get_repo(self): + resp = requests.get(self.repo_url) + resp.raise_for_status() + return 
str(resp.text) + + def _install_rpm_repo(self): + dist_release = self.dist_release + repo = self._get_repo() + if dist_release in ['opensuse', 'sle']: + log.info("Writing zypper repo:\n{}".format(repo)) + sudo_write_file( + self.remote, + '/etc/zypp/repos.d/{proj}.repo'.format(proj=self.project), + repo, + ) + else: + log.info("Writing yum repo:\n{}".format(repo)) + sudo_write_file( + self.remote, + '/etc/yum.repos.d/{proj}.repo'.format(proj=self.project), + repo, + ) + + def _install_deb_repo(self): + repo = self._get_repo() + sudo_write_file( + self.remote, + '/etc/apt/sources.list.d/{proj}.list'.format( + proj=self.project), + repo, + ) + + def _remove_rpm_repo(self): + # FIXME: zypper + self.remote.run( + args=[ + 'sudo', + 'rm', '-f', + '/etc/yum.repos.d/{proj}.repo'.format(proj=self.project), + ] + ) + + +def get_builder_project(): + """ + Depending on whether config.use_shaman is True or False, return + GitbuilderProject or ShamanProject (the class, not an instance). + """ + if config.use_shaman is True: + builder_class = ShamanProject + else: + builder_class = GitbuilderProject + return builder_class diff --git a/teuthology/parallel.py b/teuthology/parallel.py new file mode 100644 index 000000000..0a7d3ab35 --- /dev/null +++ b/teuthology/parallel.py @@ -0,0 +1,115 @@ +import logging +import sys + +import gevent +import gevent.pool +import gevent.queue + + +log = logging.getLogger(__name__) + + +class ExceptionHolder(object): + def __init__(self, exc_info): + self.exc_info = exc_info + + +def capture_traceback(func, *args, **kwargs): + """ + Utility function to capture tracebacks of any exception func + raises. 
+ """ + try: + return func(*args, **kwargs) + except Exception: + return ExceptionHolder(sys.exc_info()) + + +def resurrect_traceback(exc): + if isinstance(exc, ExceptionHolder): + raise exc.exc_info[1] + elif isinstance(exc, BaseException): + raise exc + else: + return + + +class parallel(object): + """ + This class is a context manager for running functions in parallel. + + You add functions to be run with the spawn method:: + + with parallel() as p: + for foo in bar: + p.spawn(quux, foo, baz=True) + + You can iterate over the results (which are in arbitrary order):: + + with parallel() as p: + for foo in bar: + p.spawn(quux, foo, baz=True) + for result in p: + print(result) + + If one of the spawned functions throws an exception, it will be thrown + when iterating over the results, or when the with block ends. + + At the end of the with block, the main thread waits until all + spawned functions have completed, or, if one exited with an exception, + kills the rest and raises the exception. 
+ """ + + def __init__(self): + self.group = gevent.pool.Group() + self.results = gevent.queue.Queue() + self.count = 0 + self.any_spawned = False + self.iteration_stopped = False + + def spawn(self, func, *args, **kwargs): + self.count += 1 + self.any_spawned = True + greenlet = self.group.spawn(capture_traceback, func, *args, **kwargs) + greenlet.link(self._finish) + + def __enter__(self): + return self + + def __exit__(self, type_, value, traceback): + if value is not None: + return False + + # raises if any greenlets exited with an exception + for result in self: + log.debug('result is %s', repr(result)) + + return True + + def __iter__(self): + return self + + def __next__(self): + if not self.any_spawned or self.iteration_stopped: + raise StopIteration() + result = self.results.get() + + try: + resurrect_traceback(result) + except StopIteration: + self.iteration_stopped = True + raise + + return result + + next = __next__ + + def _finish(self, greenlet): + if greenlet.successful(): + self.results.put(greenlet.value) + else: + self.results.put(greenlet.exception) + + self.count -= 1 + if self.count <= 0: + self.results.put(StopIteration()) diff --git a/teuthology/provision/__init__.py b/teuthology/provision/__init__.py new file mode 100644 index 000000000..48392eaba --- /dev/null +++ b/teuthology/provision/__init__.py @@ -0,0 +1,141 @@ +import logging +import os + +import teuthology.exporter +import teuthology.lock.query +from teuthology.misc import decanonicalize_hostname, get_distro, get_distro_version + +from teuthology.provision import cloud +from teuthology.provision import downburst +from teuthology.provision import fog +from teuthology.provision import openstack +from teuthology.provision import pelagos + +log = logging.getLogger(__name__) + + +def _logfile(shortname: str, archive_path: str = ""): + if os.path.isfile(archive_path): + return f"{archive_path}/{shortname}.downburst.log" + + +def get_reimage_types(): + return pelagos.get_types() + 
fog.get_types() + + +def reimage(ctx, machine_name, machine_type): + os_type = get_distro(ctx) + os_version = get_distro_version(ctx) + + pelagos_types = pelagos.get_types() + fog_types = fog.get_types() + if machine_type in pelagos_types and machine_type in fog_types: + raise Exception('machine_type can be used with one provisioner only') + elif machine_type in pelagos_types: + obj = pelagos.Pelagos(machine_name, os_type, os_version) + elif machine_type in fog_types: + obj = fog.FOG(machine_name, os_type, os_version) + else: + raise Exception("The machine_type '%s' is not known to any " + "of configured provisioners" % machine_type) + status = "fail" + try: + result = obj.create() + status = "success" + except Exception: + # We only need this clause so that we avoid triggering the finally + # clause below in cases where the exception raised is KeyboardInterrupt + # or SystemExit + raise + finally: + teuthology.exporter.NodeReimagingResults().record( + machine_type=machine_type, + status=status, + ) + return result + + +def create_if_vm(ctx, machine_name, _downburst=None): + """ + Use downburst to create a virtual machine + + :param _downburst: Only used for unit testing. + """ + if _downburst: + status_info = _downburst.status + else: + status_info = teuthology.lock.query.get_status(machine_name) + shortname = decanonicalize_hostname(machine_name) + machine_type = status_info['machine_type'] + os_type = get_distro(ctx) + os_version = get_distro_version(ctx) + if not teuthology.lock.query.is_vm(status=status_info): + return False + + if machine_type in cloud.get_types(): + return cloud.get_provisioner( + machine_type, + shortname, + os_type, + os_version, + conf=getattr(ctx, 'config', dict()), + ).create() + + has_config = hasattr(ctx, 'config') and ctx.config is not None + if has_config and 'downburst' in ctx.config: + log.warning( + 'Usage of a custom downburst config has been deprecated.' 
+ ) + + dbrst = _downburst or \ + downburst.Downburst(name=machine_name, os_type=os_type, + os_version=os_version, status=status_info, + logfile=_logfile(ctx, shortname)) + return dbrst.create() + + +def destroy_if_vm( + machine_name: str, + user: str = "", + description: str = "", + _downburst=None +): + """ + Use downburst to destroy a virtual machine + + Return False only on vm downburst failures. + + :param _downburst: Only used for unit testing. + """ + if _downburst: + status_info = _downburst.status + else: + status_info = teuthology.lock.query.get_status(machine_name) + if not status_info or not teuthology.lock.query.is_vm(status=status_info): + return True + if user is not None and user != status_info['locked_by']: + msg = "Tried to destroy {node} as {as_user} but it is locked " + \ + "by {locked_by}" + log.error(msg.format(node=machine_name, as_user=user, + locked_by=status_info['locked_by'])) + return False + if (description and description != + status_info['description']): + msg = "Tried to destroy {node} with description {desc_arg} " + \ + "but it is locked with description {desc_lock}" + log.error(msg.format(node=machine_name, desc_arg=description, + desc_lock=status_info['description'])) + return False + machine_type = status_info.get('machine_type') + shortname = decanonicalize_hostname(machine_name) + if machine_type == 'openstack': + return openstack.ProvisionOpenStack().destroy(shortname) + elif machine_type in cloud.get_types(): + return cloud.get_provisioner( + machine_type, shortname, None, None).destroy() + + dbrst = _downburst or \ + downburst.Downburst(name=machine_name, os_type=None, + os_version=None, status=status_info, + logfile=_logfile(description, shortname)) + return dbrst.destroy() diff --git a/teuthology/provision/cloud/__init__.py b/teuthology/provision/cloud/__init__.py new file mode 100644 index 000000000..d30ad3338 --- /dev/null +++ b/teuthology/provision/cloud/__init__.py @@ -0,0 +1,49 @@ +import logging + +from 
teuthology.config import config + +from teuthology.provision.cloud import openstack + +log = logging.getLogger(__name__) + + +supported_drivers = dict( + openstack=dict( + provider=openstack.OpenStackProvider, + provisioner=openstack.OpenStackProvisioner, + ), +) + + +def get_types(): + types = list() + if 'libcloud' in config and 'providers' in config.libcloud: + types = list(config.libcloud['providers'].keys()) + return types + + +def get_provider_conf(node_type): + all_providers = config.libcloud['providers'] + provider_conf = all_providers[node_type] + return provider_conf + + +def get_provider(node_type): + provider_conf = get_provider_conf(node_type) + driver = provider_conf['driver'] + provider_cls = supported_drivers[driver]['provider'] + return provider_cls(name=node_type, conf=provider_conf) + + +def get_provisioner(node_type, name, os_type, os_version, conf=None): + provider = get_provider(node_type) + provider_conf = get_provider_conf(node_type) + driver = provider_conf['driver'] + provisioner_cls = supported_drivers[driver]['provisioner'] + return provisioner_cls( + provider=provider, + name=name, + os_type=os_type, + os_version=os_version, + conf=conf, + ) diff --git a/teuthology/provision/cloud/base.py b/teuthology/provision/cloud/base.py new file mode 100644 index 000000000..1700fa9ed --- /dev/null +++ b/teuthology/provision/cloud/base.py @@ -0,0 +1,89 @@ +import logging +from copy import deepcopy + +from libcloud.compute.providers import get_driver +from libcloud.compute.types import Provider as lc_Provider + +import teuthology.orchestra.remote +import teuthology.provision.cloud +from teuthology.misc import canonicalize_hostname, decanonicalize_hostname + +log = logging.getLogger(__name__) + + +class Provider(object): + _driver_posargs = list() + + def __init__(self, name, conf): + self.name = name + self.conf = conf + self.driver_name = self.conf['driver'] + + def _get_driver(self): + driver_type = get_driver( + getattr(lc_Provider, 
self.driver_name.upper()) + ) + driver_args = self._get_driver_args() + driver = driver_type( + *[driver_args.pop(arg_name) for arg_name in self._driver_posargs], + **driver_args + ) + return driver + driver = property(fget=_get_driver) + + def _get_driver_args(self): + return deepcopy(self.conf['driver_args']) + + +class Provisioner(object): + def __init__( + self, provider, name, os_type=None, os_version=None, + conf=None, user='ubuntu', + ): + if isinstance(provider, str): + provider = teuthology.provision.cloud.get_provider(provider) + self.provider = provider + self.name = decanonicalize_hostname(name) + self.hostname = canonicalize_hostname(name, user=None) + self.os_type = os_type + self.os_version = os_version + self.user = user + + def create(self): + try: + return self._create() + except Exception: + log.exception("Failed to create %s", self.name) + return False + + def _create(self): + pass + + def destroy(self): + try: + return self._destroy() + except Exception: + log.exception("Failed to destroy %s", self.name) + return False + + def _destroy(self): + pass + + @property + def remote(self): + if not hasattr(self, '_remote'): + self._remote = teuthology.orchestra.remote.Remote( + "%s@%s" % (self.user, self.name), + ) + return self._remote + + def __repr__(self): + template = "%s(provider='%s', name='%s', os_type='%s', " \ + "os_version='%s')" + return template % ( + self.__class__.__name__, + self.provider.name, + self.name, + self.os_type, + self.os_version, + ) diff --git a/teuthology/provision/cloud/openstack.py b/teuthology/provision/cloud/openstack.py new file mode 100644 index 000000000..d8b838b13 --- /dev/null +++ b/teuthology/provision/cloud/openstack.py @@ -0,0 +1,452 @@ +import logging +import re +import requests +import socket +import time +import yaml + +from teuthology.util.compat import urlencode + +from copy import deepcopy +from libcloud.common.exceptions import RateLimitReachedError, BaseHTTPError + +from paramiko import 
AuthenticationException +from paramiko.ssh_exception import NoValidConnectionsError + +from teuthology.config import config +from teuthology.contextutil import safe_while + +from teuthology.provision.cloud import base +from teuthology.provision.cloud import util +from teuthology.provision.cloud.base import Provider + + +log = logging.getLogger(__name__) + + +RETRY_EXCEPTIONS = (RateLimitReachedError, BaseHTTPError) + + +def retry(function, *args, **kwargs): + """ + Call a function (returning its results), retrying if any of the exceptions + in RETRY_EXCEPTIONS are raised + """ + with safe_while(sleep=1, tries=24, increment=1) as proceed: + tries = 0 + while proceed(): + tries += 1 + try: + result = function(*args, **kwargs) + if tries > 1: + log.debug( + "'%s' succeeded after %s tries", + function.__name__, + tries, + ) + return result + except RETRY_EXCEPTIONS: + pass + + +class OpenStackProvider(Provider): + _driver_posargs = ['username', 'password'] + + def _get_driver(self): + self._auth_token = util.AuthToken(name='teuthology_%s' % self.name) + with self._auth_token as token: + driver = super(OpenStackProvider, self)._get_driver() + # We must apparently call get_service_catalog() so that + # get_endpoint() works. 
+ driver.connection.get_service_catalog() + if not token.value: + token.write( + driver.connection.auth_token, + driver.connection.auth_token_expires, + driver.connection.get_endpoint(), + ) + return driver + driver = property(fget=_get_driver) + + def _get_driver_args(self): + driver_args = super(OpenStackProvider, self)._get_driver_args() + if self._auth_token.value: + driver_args['ex_force_auth_token'] = self._auth_token.value + driver_args['ex_force_base_url'] = self._auth_token.endpoint + return driver_args + + @property + def ssh_interface(self): + if not hasattr(self, '_ssh_interface'): + self._ssh_interface = self.conf.get('ssh_interface', 'public_ips') + return self._ssh_interface + + @property + def images(self): + if not hasattr(self, '_images'): + exclude_image = self.conf.get('exclude_image', []) + if exclude_image and not isinstance(exclude_image, list): + exclude_image = [exclude_image] + exclude_re = [re.compile(x) for x in exclude_image] + images = retry(self.driver.list_images) + self._images = [_ for _ in images + if not any(x.match(_.name) for x in exclude_re)] + return self._images + + @property + def sizes(self): + if not hasattr(self, '_sizes'): + allow_sizes = self.conf.get('allow_sizes', '.*') + if not isinstance(allow_sizes, list): + allow_sizes = [allow_sizes] + allow_re = [re.compile(x) for x in allow_sizes] + # By default, exclude instance types meant for Windows + exclude_sizes = self.conf.get('exclude_sizes', 'win-.*') + if not isinstance(exclude_sizes, list): + exclude_sizes = [exclude_sizes] + exclude_re = [re.compile(x) for x in exclude_sizes] + sizes = retry(self.driver.list_sizes) + self._sizes = list(filter( + lambda s: + any(x.match(s.name) for x in allow_re) + and not + all(x.match(s.name) for x in exclude_re), + sizes + )) + return self._sizes + + @property + def networks(self): + if not hasattr(self, '_networks'): + allow_networks = self.conf.get('allow_networks', '.*') + if not isinstance(allow_networks, list): + 
allow_networks=[allow_networks] + networks_re = [re.compile(x) for x in allow_networks] + try: + networks = retry(self.driver.ex_list_networks) + if networks: + self._networks = filter( + lambda s: any(x.match(s.name) for x in networks_re), + networks + ) + else: + self._networks = list() + except AttributeError: + log.warning("Unable to list networks for %s", self.driver) + self._networks = list() + return self._networks + + @property + def default_userdata(self): + if not hasattr(self, '_default_userdata'): + self._default_userdata = self.conf.get('userdata', dict()) + return self._default_userdata + + @property + def security_groups(self): + if not hasattr(self, '_security_groups'): + try: + self._security_groups = retry( + self.driver.ex_list_security_groups + ) + except AttributeError: + log.warning("Unable to list security groups for %s", self.driver) + self._security_groups = list() + return self._security_groups + + +class OpenStackProvisioner(base.Provisioner): + _sentinel_path = '/.teuth_provisioned' + + defaults = dict( + openstack=dict( + machine=dict( + disk=20, + ram=8000, + cpus=1, + ), + volumes=dict( + count=0, + size=0, + ), + ) + ) + + def __init__( + self, + provider, name, os_type=None, os_version=None, + conf=None, + user='ubuntu', + ): + super(OpenStackProvisioner, self).__init__( + provider, name, os_type, os_version, conf=conf, user=user, + ) + self._read_conf(conf) + + def _read_conf(self, conf=None): + """ + Looks through the following in order: + + the 'conf' arg + conf[DRIVER_NAME] + teuthology.config.config.DRIVER_NAME + self.defaults[DRIVER_NAME] + + It will use the highest value for each of the following: disk, RAM, + cpu, volume size and count + + The resulting configuration becomes the new instance configuration + and is stored as self.conf + + :param conf: The instance configuration + + :return: None + """ + driver_name = self.provider.driver_name.lower() + full_conf = conf or dict() + driver_conf = full_conf.get(driver_name, 
dict()) + legacy_conf = getattr(config, driver_name) or dict() + defaults = self.defaults.get(driver_name, dict()) + confs = list() + for obj in (full_conf, driver_conf, legacy_conf, defaults): + obj = deepcopy(obj) + if isinstance(obj, list): + confs.extend(obj) + else: + confs.append(obj) + self.conf = util.combine_dicts(confs, lambda x, y: x > y) + + def _create(self): + userdata = self.userdata + log.debug("Creating node: %s", self) + log.debug("Selected size: %s", self.size) + log.debug("Selected image: %s", self.image) + log.debug("Using userdata: %s", userdata) + create_args = dict( + name=self.name, + size=self.size, + image=self.image, + ex_userdata=userdata, + ) + networks = self.provider.networks + if networks: + create_args['networks'] = networks + security_groups = self.security_groups + if security_groups: + create_args['ex_security_groups'] = security_groups + self._node = retry( + self.provider.driver.create_node, + **create_args + ) + log.debug("Created node: %s", self.node) + results = retry( + self.provider.driver.wait_until_running, + nodes=[self.node], + ssh_interface=self.provider.ssh_interface, + ) + self._node, self.ips = results[0] + log.debug("Node started: %s", self.node) + if not self._create_volumes(): + self._destroy_volumes() + return False + self._update_dns() + # Give cloud-init a few seconds to bring up the network, start sshd, + # and install the public key + time.sleep(20) + self._wait_for_ready() + return self.node + + def _create_volumes(self): + vol_count = self.conf['volumes']['count'] + vol_size = self.conf['volumes']['size'] + name_templ = "%s_%0{0}d".format(len(str(vol_count - 1))) + vol_names = [name_templ % (self.name, i) + for i in range(vol_count)] + try: + for name in vol_names: + volume = retry( + self.provider.driver.create_volume, + vol_size, + name, + ) + log.info("Created volume %s", volume) + retry( + self.provider.driver.attach_volume, + self.node, + volume, + device=None, + ) + except Exception: + 
log.exception("Failed to create or attach volume!") + return False + return True + + def _destroy_volumes(self): + all_volumes = retry(self.provider.driver.list_volumes) + our_volumes = [vol for vol in all_volumes + if vol.name.startswith("%s_" % self.name)] + for vol in our_volumes: + try: + retry(self.provider.driver.detach_volume, vol) + except Exception: + log.exception("Could not detach volume %s", vol) + try: + retry(self.provider.driver.destroy_volume, vol) + except Exception: + log.exception("Could not destroy volume %s", vol) + + def _update_dns(self): + query = urlencode(dict( + name=self.name, + ip=self.ips[0], + )) + nsupdate_url = "%s?%s" % ( + config.nsupdate_url, + query, + ) + resp = requests.get(nsupdate_url) + resp.raise_for_status() + + def _wait_for_ready(self): + with safe_while(sleep=6, tries=20) as proceed: + while proceed(): + try: + self.remote.connect() + break + except ( + socket.error, + NoValidConnectionsError, + AuthenticationException, + ): + pass + cmd = "while [ ! 
-e '%s' ]; do sleep 5; done" % self._sentinel_path + self.remote.run(args=cmd, timeout=600) + log.info("Node is ready: %s", self.node) + + @property + def image(self): + os_specs = [ + '{os_type} {os_version}', + '{os_type}-{os_version}', + ] + for spec in os_specs: + matches = [image for image in self.provider.images + if spec.format( + os_type=self.os_type, + os_version=self.os_version, + ) in image.name.lower()] + if matches: + break + if not matches: + raise RuntimeError( + "Could not find an image for %s %s" % + (self.os_type, self.os_version)) + return matches[0] + + @property + def size(self): + ram = self.conf['machine']['ram'] + disk = self.conf['machine']['disk'] + cpu = self.conf['machine']['cpus'] + + def good_size(size): + if (size.ram < ram or size.disk < disk or size.vcpus < cpu): + return False + return True + + all_sizes = self.provider.sizes + good_sizes = filter(good_size, all_sizes) + smallest_match = sorted( + good_sizes, + key=lambda s: (s.ram, s.disk, s.vcpus) + )[0] + return smallest_match + + @property + def security_groups(self): + group_names = self.provider.conf.get('security_groups') + if group_names is None: + return + result = list() + groups = self.provider.security_groups + for name in group_names: + matches = [group for group in groups if group.name == name] + if not matches: + msg = "No security groups found with name '%s'" + elif len(matches) > 1: + msg = "More than one security group found with name '%s'" + elif len(matches) == 1: + result.append(matches[0]) + continue + raise RuntimeError(msg % name) + return result + + @property + def userdata(self): + spec="{t}-{v}".format(t=self.os_type, + v=self.os_version) + base_config = dict( + packages=[ + 'git', + 'wget', + 'python', + 'ntp', + ], + ) + runcmd=[ + # Remove the user's password so that console logins are + # possible + ['passwd', '-d', self.user], + ['touch', self._sentinel_path] + ] + if spec in self.provider.default_userdata: + base_config = deepcopy( + 
self.provider.default_userdata.get(spec, dict())) + base_config.update(user=self.user) + if 'manage_etc_hosts' not in base_config: + base_config.update( + manage_etc_hosts=True, + hostname=self.hostname, + ) + base_config['runcmd'] = base_config.get('runcmd', list()) + base_config['runcmd'].extend(runcmd) + ssh_pubkey = util.get_user_ssh_pubkey() + if ssh_pubkey: + authorized_keys = base_config.get('ssh_authorized_keys', list()) + authorized_keys.append(ssh_pubkey) + base_config['ssh_authorized_keys'] = authorized_keys + user_str = "#cloud-config\n" + yaml.safe_dump(base_config) + return user_str + + @property + def node(self): + if hasattr(self, '_node'): + return self._node + matches = self._find_nodes() + msg = "Unknown error locating %s" + if not matches: + msg = "No nodes found with name '%s'" % self.name + log.warning(msg) + return + elif len(matches) > 1: + msg = "More than one node found with name '%s'" + elif len(matches) == 1: + self._node = matches[0] + return self._node + raise RuntimeError(msg % self.name) + + def _find_nodes(self): + nodes = retry(self.provider.driver.list_nodes) + matches = [node for node in nodes if node.name == self.name] + return matches + + def _destroy(self): + self._destroy_volumes() + nodes = self._find_nodes() + if not nodes: + log.warning("Didn't find any nodes named '%s' to destroy!", self.name) + return True + if len(nodes) > 1: + log.warning("Found multiple nodes named '%s' to destroy!", self.name) + log.info("Destroying nodes: %s", nodes) + return all([node.destroy() for node in nodes]) diff --git a/teuthology/provision/cloud/test/test_base.py b/teuthology/provision/cloud/test/test_base.py new file mode 100644 index 000000000..67e838070 --- /dev/null +++ b/teuthology/provision/cloud/test/test_base.py @@ -0,0 +1,90 @@ +from libcloud.compute.providers import get_driver +from mock import patch + +from teuthology.config import config +from teuthology.provision import cloud + +from test_cloud_init import dummy_config, 
dummy_drivers + + +class TestBase(object): + def setup_method(self): + config.load() + config.libcloud = dummy_config + cloud.supported_drivers['dummy'] = dummy_drivers + + def teardown_method(self): + del cloud.supported_drivers['dummy'] + + +class TestProvider(TestBase): + def test_init(self): + obj = cloud.get_provider('my_provider') + assert obj.name == 'my_provider' + assert obj.driver_name == 'dummy' + assert obj.conf == dummy_config['providers']['my_provider'] + + def test_driver(self): + obj = cloud.get_provider('my_provider') + assert isinstance(obj.driver, get_driver('dummy')) + + +class TestProvisioner(TestBase): + klass = cloud.base.Provisioner + + def get_obj( + self, name='node_name', os_type='ubuntu', os_version='ubuntu'): + return cloud.get_provisioner( + 'my_provider', + 'node_name', + 'ubuntu', + '16.04', + ) + + def test_init_provider_string(self): + obj = self.klass('my_provider', 'ubuntu', '16.04') + assert obj.provider.name == 'my_provider' + + def test_create(self): + obj = self.get_obj() + with patch.object( + self.klass, + '_create', + ) as m_create: + for val in [True, False]: + m_create.return_value = val + res = obj.create() + assert res is val + m_create.assert_called_once_with() + m_create.reset_mock() + m_create.side_effect = RuntimeError + res = obj.create() + assert res is False + assert obj.create() is None + + def test_destroy(self): + obj = self.get_obj() + with patch.object( + self.klass, + '_destroy', + ) as m_destroy: + for val in [True, False]: + m_destroy.return_value = val + res = obj.destroy() + assert res is val + m_destroy.assert_called_once_with() + m_destroy.reset_mock() + m_destroy.side_effect = RuntimeError + res = obj.destroy() + assert res is False + assert obj.destroy() is None + + def test_remote(self): + obj = self.get_obj() + assert obj.remote.shortname == 'node_name' + + def test_repr(self): + obj = self.get_obj() + assert repr(obj) == \ + "Provisioner(provider='my_provider', name='node_name', 
os_type='ubuntu', os_version='16.04')" # noqa + diff --git a/teuthology/provision/cloud/test/test_cloud_init.py b/teuthology/provision/cloud/test/test_cloud_init.py new file mode 100644 index 000000000..1b0df968d --- /dev/null +++ b/teuthology/provision/cloud/test/test_cloud_init.py @@ -0,0 +1,60 @@ +from teuthology.config import config +from teuthology.provision import cloud + +dummy_config = dict( + providers=dict( + my_provider=dict( + driver='dummy', + driver_args=dict( + creds=0, + ), + conf_1='1', + conf_2='2', + ) + ) +) + + +class DummyProvider(cloud.base.Provider): + # For libcloud's dummy driver + _driver_posargs = ['creds'] + +dummy_drivers = dict( + provider=DummyProvider, + provisioner=cloud.base.Provisioner, +) + + +class TestInit(object): + def setup_method(self): + config.load() + config.libcloud = dummy_config + cloud.supported_drivers['dummy'] = dummy_drivers + + def teardown_method(self): + del cloud.supported_drivers['dummy'] + + def test_get_types(self): + assert list(cloud.get_types()) == ['my_provider'] + + def test_get_provider_conf(self): + expected = dummy_config['providers']['my_provider'] + assert cloud.get_provider_conf('my_provider') == expected + + def test_get_provider(self): + obj = cloud.get_provider('my_provider') + assert obj.name == 'my_provider' + assert obj.driver_name == 'dummy' + + def test_get_provisioner(self): + obj = cloud.get_provisioner( + 'my_provider', + 'node_name', + 'ubuntu', + '16.04', + dict(foo='bar'), + ) + assert obj.provider.name == 'my_provider' + assert obj.name == 'node_name' + assert obj.os_type == 'ubuntu' + assert obj.os_version == '16.04' diff --git a/teuthology/provision/cloud/test/test_cloud_util.py b/teuthology/provision/cloud/test/test_cloud_util.py new file mode 100644 index 000000000..3093d2c40 --- /dev/null +++ b/teuthology/provision/cloud/test/test_cloud_util.py @@ -0,0 +1,172 @@ +import datetime +import dateutil +import json + +from mock import patch, mock_open +from pytest import mark + 
+from teuthology.provision.cloud import util + + +@mark.parametrize( + 'path, exists', + [ + ('/fake/path', True), + ('/fake/path', False), + ] +) +def test_get_user_ssh_pubkey(path, exists): + with patch('os.path.exists') as m_exists: + m_exists.return_value = exists + with patch('teuthology.provision.cloud.util.open', mock_open(), create=True) as m_open: + util.get_user_ssh_pubkey(path) + if exists: + m_open.assert_called_once_with(path) + + +@mark.parametrize( + 'input_, func, expected', + [ + [ + [ + dict(sub0=dict(key0=0, key1=0)), + dict(sub0=dict(key1=1, key2=2)), + ], + lambda x, y: x > y, + dict(sub0=dict(key0=0, key1=1, key2=2)) + ], + [ + [ + dict(), + dict(sub0=dict(key1=1, key2=2)), + ], + lambda x, y: x > y, + dict(sub0=dict(key1=1, key2=2)) + ], + [ + [ + dict(sub0=dict(key1=1, key2=2)), + dict(), + ], + lambda x, y: x > y, + dict(sub0=dict(key1=1, key2=2)) + ], + [ + [ + dict(sub0=dict(key0=0, key1=0, key2=0)), + dict(sub0=dict(key0=1, key2=3), sub1=dict(key0=0)), + dict(sub0=dict(key0=3, key1=2, key2=1)), + dict(sub0=dict(key1=3), + sub1=dict(key0=3, key1=0)), + ], + lambda x, y: x > y, + dict(sub0=dict(key0=3, key1=3, key2=3), + sub1=dict(key0=3, key1=0)) + ], + ] +) +def test_combine_dicts(input_, func, expected): + assert util.combine_dicts(input_, func) == expected + + +def get_datetime(offset_hours=0): + delta = datetime.timedelta(hours=offset_hours) + return datetime.datetime.now(dateutil.tz.tzutc()) + delta + + +def get_datetime_string(offset_hours=0): + obj = get_datetime(offset_hours) + return obj.strftime(util.AuthToken.time_format) + + +class TestAuthToken(object): + klass = util.AuthToken + + def setup_method(self): + default_expires = get_datetime_string(0) + self.test_data = dict( + value='token_value', + endpoint='endpoint', + expires=default_expires, + ) + self.patchers = dict() + self.patchers['m_open'] = patch( + 'teuthology.provision.cloud.util.open' + ) + self.patchers['m_exists'] = patch( + 'os.path.exists' + ) + 
self.patchers['m_file_lock'] = patch( + 'teuthology.provision.cloud.util.FileLock' + ) + self.mocks = dict() + for name, patcher in self.patchers.items(): + self.mocks[name] = patcher.start() + + def teardown_method(self): + for patcher in self.patchers.values(): + patcher.stop() + + def get_obj(self, name='name', directory='/fake/directory'): + return self.klass( + name=name, + directory=directory, + ) + + def test_no_token(self): + obj = self.get_obj() + self.mocks['m_exists'].return_value = False + with obj: + assert obj.value is None + assert obj.expired is True + + @mark.parametrize( + 'test_data, expired', + [ + [ + dict( + value='token_value', + endpoint='endpoint', + expires=get_datetime_string(-1), + ), + True + ], + [ + dict( + value='token_value', + endpoint='endpoint', + expires=get_datetime_string(1), + ), + False + ], + ] + ) + def test_token_read(self, test_data, expired): + obj = self.get_obj() + self.mocks['m_exists'].return_value = True + self.mocks['m_open'].return_value.__enter__.return_value.read.return_value = \ + json.dumps(test_data) + with obj: + if expired: + assert obj.value is None + assert obj.expired is True + else: + assert obj.value == test_data['value'] + + def test_token_write(self): + obj = self.get_obj() + datetime_obj = get_datetime(0) + datetime_string = get_datetime_string(0) + self.mocks['m_exists'].return_value = False + with obj: + obj.write('value', datetime_obj, 'endpoint') + m_open = self.mocks['m_open'] + write_calls = m_open.return_value.__enter__.return_value.write\ + .call_args_list + assert len(write_calls) == 1 + expected = json.dumps(dict( + value='value', + expires=datetime_string, + endpoint='endpoint', + )) + assert write_calls[0][0][0] == expected diff --git a/teuthology/provision/cloud/test/test_openstack.py b/teuthology/provision/cloud/test/test_openstack.py new file mode 100644 index 000000000..c1521054a --- /dev/null +++ b/teuthology/provision/cloud/test/test_openstack.py @@ -0,0 +1,781 @@ +import socket 
+import yaml +import os + +from teuthology.util.compat import parse_qs + +from copy import deepcopy +from libcloud.compute.providers import get_driver +from mock import patch, Mock, DEFAULT +from pytest import raises, mark + +from teuthology.config import config +from teuthology.exceptions import MaxWhileTries +from teuthology.provision import cloud + +test_config = dict( + providers=dict( + my_provider=dict( + driver='openstack', + driver_args=dict( + username='user', + password='password', + ex_force_auth_url='http://127.0.0.1:9999/v2.0/tokens', + ), + ), + image_exclude_provider=dict( + driver='openstack', + exclude_image=['.*-exclude1', '.*-exclude2'], + driver_args=dict( + username='user', + password='password', + ex_force_auth_url='http://127.0.0.1:9999/v2.0/tokens', + ), + ) + ) +) + + +@patch('time.sleep') +def test_retry(m_sleep): + orig_exceptions = cloud.openstack.RETRY_EXCEPTIONS + new_exceptions = orig_exceptions + (RuntimeError, ) + + class test_cls(object): + def __init__(self, min_val): + self.min_val = min_val + self.cur_val = 0 + + def func(self): + self.cur_val += 1 + if self.cur_val < self.min_val: + raise RuntimeError + return self.cur_val + + with patch.object( + cloud.openstack, + 'RETRY_EXCEPTIONS', + new=new_exceptions, + ): + test_obj = test_cls(min_val=5) + assert cloud.openstack.retry(test_obj.func) == 5 + test_obj = test_cls(min_val=1000) + with raises(MaxWhileTries): + cloud.openstack.retry(test_obj.func) + + +def get_fake_obj(mock_args=None, attributes=None): + if mock_args is None: + mock_args = dict() + if attributes is None: + attributes = dict() + obj = Mock(**mock_args) + for name, value in attributes.items(): + setattr(obj, name, value) + return obj + + +class TestOpenStackBase(object): + def setup_method(self): + config.load(dict(libcloud=deepcopy(test_config))) + self.start_patchers() + + def start_patchers(self): + self.patchers = dict() + self.patchers['m_list_images'] = patch( + 'libcloud.compute.drivers.openstack' + 
'.OpenStackNodeDriver.list_images' + ) + self.patchers['m_list_sizes'] = patch( + 'libcloud.compute.drivers.openstack' + '.OpenStackNodeDriver.list_sizes' + ) + self.patchers['m_ex_list_networks'] = patch( + 'libcloud.compute.drivers.openstack' + '.OpenStack_1_1_NodeDriver.ex_list_networks' + ) + self.patchers['m_ex_list_security_groups'] = patch( + 'libcloud.compute.drivers.openstack' + '.OpenStack_1_1_NodeDriver.ex_list_security_groups' + ) + self.patchers['m_get_user_ssh_pubkey'] = patch( + 'teuthology.provision.cloud.util.get_user_ssh_pubkey' + ) + self.patchers['m_list_nodes'] = patch( + 'libcloud.compute.drivers.openstack' + '.OpenStackNodeDriver.list_nodes' + ) + self.patchers['m_create_node'] = patch( + 'libcloud.compute.drivers.openstack' + '.OpenStack_1_1_NodeDriver.create_node' + ) + self.patchers['m_wait_until_running'] = patch( + 'libcloud.compute.drivers.openstack' + '.OpenStackNodeDriver.wait_until_running' + ) + self.patchers['m_create_volume'] = patch( + 'libcloud.compute.drivers.openstack' + '.OpenStackNodeDriver.create_volume' + ) + self.patchers['m_attach_volume'] = patch( + 'libcloud.compute.drivers.openstack' + '.OpenStackNodeDriver.attach_volume' + ) + self.patchers['m_detach_volume'] = patch( + 'libcloud.compute.drivers.openstack' + '.OpenStackNodeDriver.detach_volume' + ) + self.patchers['m_list_volumes'] = patch( + 'libcloud.compute.drivers.openstack' + '.OpenStackNodeDriver.list_volumes' + ) + self.patchers['m_destroy_volume'] = patch( + 'libcloud.compute.drivers.openstack' + '.OpenStackNodeDriver.destroy_volume' + ) + self.patchers['m_get_service_catalog'] = patch( + 'libcloud.common.openstack' + '.OpenStackBaseConnection.get_service_catalog' + ) + self.patchers['m_auth_token'] = patch( + 'teuthology.provision.cloud.util.AuthToken' + ) + self.patchers['m_get_endpoint'] = patch( + 'libcloud.common.openstack' + '.OpenStackBaseConnection.get_endpoint', + ) + self.patchers['m_connect'] = patch( + 'libcloud.common.base' + 
'.Connection.connect', + ) + self.patchers['m_sleep'] = patch( + 'time.sleep' + ) + self.patchers['m_get'] = patch( + 'requests.get' + ) + self.mocks = dict() + for name, patcher in self.patchers.items(): + self.mocks[name] = patcher.start() + self.mocks['m_get_endpoint'].return_value = 'endpoint' + + def teardown_method(self): + for patcher in self.patchers.values(): + patcher.stop() + + +class TestOpenStackProvider(TestOpenStackBase): + klass = cloud.openstack.OpenStackProvider + + def test_init(self): + obj = cloud.get_provider('my_provider') + assert obj.name == 'my_provider' + assert obj.driver_name == 'openstack' + assert obj.conf == test_config['providers']['my_provider'] + + def test_driver(self): + token = self.mocks['m_auth_token'].return_value + self.mocks['m_auth_token'].return_value.__enter__.return_value = token + token.value = None + obj = cloud.get_provider('my_provider') + assert isinstance(obj.driver, get_driver('openstack')) + assert obj._auth_token.value is None + + def test_images(self): + obj = cloud.get_provider('my_provider') + self.mocks['m_list_images'].return_value = [ + get_fake_obj(attributes=dict(name=_)) + for _ in ['image0', 'image1']] + assert not hasattr(obj, '_images') + assert [_.name for _ in obj.images] == ['image0', 'image1'] + assert hasattr(obj, '_images') + + def test_exclude_image(self): + obj = cloud.get_provider('image_exclude_provider') + self.mocks['m_list_images'].return_value = [ + get_fake_obj(attributes=dict(name=_)) + for _ in ['image0', 'image1', + 'image2-exclude1', 'image3-exclude2']] + assert not hasattr(obj, '_images') + assert [_.name for _ in obj.images] == ['image0', 'image1'] + assert hasattr(obj, '_images') + + def test_sizes(self): + obj = cloud.get_provider('my_provider') + fake_sizes = [get_fake_obj(attributes=dict(name='size%s' % i)) for + i in range(2)] + self.mocks['m_list_sizes'].return_value = fake_sizes + assert not hasattr(obj, '_sizes') + assert [s.name for s in obj.sizes] == ['size0', 
'size1'] + assert hasattr(obj, '_sizes') + + def test_networks(self): + obj = cloud.get_provider('my_provider') + nets = [get_fake_obj(attributes=dict(name=i)) for i in ['net0', 'net1']] + self.mocks['m_ex_list_networks'].return_value = nets + assert not hasattr(obj, '_networks') + assert [i.name for i in obj.networks] == [i.name for i in nets] + assert hasattr(obj, '_networks') + self.mocks['m_ex_list_networks'].side_effect = AttributeError + obj = cloud.get_provider('my_provider') + assert not hasattr(obj, '_networks') + assert obj.networks == list() + assert hasattr(obj, '_networks') + + def test_security_groups(self): + obj = cloud.get_provider('my_provider') + self.mocks['m_ex_list_security_groups'].return_value = ['sg0', 'sg1'] + assert not hasattr(obj, '_security_groups') + assert obj.security_groups == ['sg0', 'sg1'] + assert hasattr(obj, '_security_groups') + self.mocks['m_ex_list_security_groups'].side_effect = AttributeError + obj = cloud.get_provider('my_provider') + assert not hasattr(obj, '_security_groups') + assert obj.security_groups == list() + assert hasattr(obj, '_security_groups') + + +class TestOpenStackCustomProvisioner(TestOpenStackBase): + klass = cloud.openstack.OpenStackProvisioner + def get_obj( + self, name='node_name', os_type='ubuntu', + os_version='16.04', conf=None, test_conf=None): + + if test_conf: + yaml_file = os.path.dirname(__file__) + '/' + test_conf + print("Reading conf: %s" % yaml_file) + with open(yaml_file) as f: + teuth_conf=yaml.safe_load(f) + print(teuth_conf) + config.libcloud = deepcopy(teuth_conf['libcloud'] or test_config) + else: + config.libcloud = deepcopy(test_config) + return cloud.get_provisioner( + node_type='my_provider', + name=name, + os_type=os_type, + os_version=os_version, + conf=conf, + ) + + @mark.parametrize( + "conf", + [ + dict( + path='test_openstack_userdata_conf.yaml', + runcmd_head=['uptime', 'date'], + ssh_authorized_keys=['user_public_key1', 'user_public_key2'], + 
user_ssh_pubkey='my_ssh_key', + os_version='16.04', + os_type='ubuntu', + ), + dict( + path='test_openstack_userdata_conf.yaml', + runcmd_head=['uptime', 'date'], + ssh_authorized_keys=['user_public_key1', 'user_public_key2'], + user_ssh_pubkey=None, + os_version='16.04', + os_type='ubuntu', + ), + dict( + os_version='16.04', + os_type='ubuntu', + path=None, + user_ssh_pubkey=None, + ), + ] + ) + def test_userdata_conf(self, conf): + self.mocks['m_get_user_ssh_pubkey'].return_value = conf['user_ssh_pubkey'] + obj = self.get_obj(os_version=conf['os_version'], + os_type=conf['os_type'], + test_conf=conf['path']) + userdata = yaml.safe_load(obj.userdata) + print(">>>> ", obj.conf) + print(">>>> ", obj.provider.conf) + print(">>>> ", obj.provider) + print(obj.userdata) + if conf and 'path' in conf and conf['path']: + assert userdata['runcmd'][0:len(conf['runcmd_head'])] == conf['runcmd_head'] + assert userdata['bootcmd'] == [ + 'SuSEfirewall2 stop || true', + 'service firewalld stop || true', + ] + assert 'packages' not in userdata + else: + assert 'bootcmd' not in userdata + assert userdata['packages'] == ['git', 'wget', 'python', 'ntp'] + assert userdata['user'] == obj.user + assert userdata['hostname'] == obj.hostname + if 'user_ssh_pubkey' in conf and conf['user_ssh_pubkey']: + assert userdata['ssh_authorized_keys'][-1] == conf['user_ssh_pubkey'] + if 'ssh_authorized_keys' in conf: + keys = conf['ssh_authorized_keys'] + assert userdata['ssh_authorized_keys'][0:len(keys)] == keys + else: + if 'ssh_authorized_keys' in conf: + keys = conf['ssh_authorized_keys'] + assert userdata['ssh_authorized_keys'][0:len(keys)] == keys + else: + assert 'ssh_authorized_keys' not in userdata + + @mark.parametrize( + "conf", + [ + dict( + path='test_openstack_userdata_conf.yaml', + runcmd_head=['uptime', 'date'], + ), + dict( + path=None, + ), + ] + ) + def test_userdata_conf_runcmd(self, conf): + self.mocks['m_get_user_ssh_pubkey'].return_value = None + obj = 
self.get_obj(test_conf=conf['path']) + userdata = yaml.safe_load(obj.userdata) + assert userdata['runcmd'][-2:] == [['passwd', '-d', 'ubuntu'], ['touch', '/.teuth_provisioned']] + + @mark.parametrize( + "conf", + [ + dict( + path='test_openstack_userdata_conf.yaml', + packages=None, + ), + dict( + path=None, + packages=['git', 'wget', 'python', 'ntp'] + ), + ] + ) + def test_userdata_conf_packages(self, conf): + self.mocks['m_get_user_ssh_pubkey'].return_value = None + obj = self.get_obj(test_conf=conf['path']) + userdata = yaml.safe_load(obj.userdata) + assert userdata.get('packages', None) == conf['packages'] + +class TestOpenStackProvisioner(TestOpenStackBase): + klass = cloud.openstack.OpenStackProvisioner + + def get_obj( + self, name='node_name', os_type='ubuntu', + os_version='16.04', conf=None): + return cloud.get_provisioner( + node_type='my_provider', + name=name, + os_type=os_type, + os_version=os_version, + conf=conf, + ) + + def test_init(self): + with patch.object( + self.klass, + '_read_conf', + ) as m_read_conf: + self.get_obj() + assert len(m_read_conf.call_args_list) == 1 + + @mark.parametrize( + 'input_conf', + [ + dict(machine=dict( + disk=42, + ram=9001, + cpus=3, + )), + dict(volumes=dict( + count=3, + size=100, + )), + dict(), + dict( + machine=dict( + disk=1, + ram=2, + cpus=3, + ), + volumes=dict( + count=4, + size=5, + ) + ), + dict( + machine=dict( + disk=100, + ), + ), + ] + ) + def test_read_conf(self, input_conf): + obj = self.get_obj(conf=input_conf) + for topic in ['machine', 'volumes']: + combined = cloud.util.combine_dicts( + [input_conf, config.openstack], + lambda x, y: x > y, + ) + assert obj.conf[topic] == combined[topic] + + @mark.parametrize( + 'input_conf, expected_machine, expected_vols', + [ + [ + dict(openstack=[ + dict(machine=dict(disk=64, ram=10000, cpus=3)), + dict(volumes=dict(count=1, size=1)), + ]), + dict(disk=64, ram=10000, cpus=3), + dict(count=1, size=1), + ], + [ + dict(openstack=[ + 
dict(machine=dict(cpus=3)), + dict(machine=dict(disk=1, ram=9000)), + dict(machine=dict(disk=50, ram=2, cpus=1)), + dict(machine=dict()), + dict(volumes=dict()), + dict(volumes=dict(count=0, size=0)), + dict(volumes=dict(count=1, size=0)), + dict(volumes=dict(size=1)), + ]), + dict(disk=50, ram=9000, cpus=3), + dict(count=1, size=1), + ], + [ + dict(openstack=[ + dict(volumes=dict(count=3, size=30)), + dict(volumes=dict(size=50)), + ]), + None, + dict(count=3, size=50), + ], + [ + dict(openstack=[ + dict(machine=dict(disk=100)), + dict(volumes=dict(count=3, size=30)), + ]), + dict(disk=100, ram=8000, cpus=1), + dict(count=3, size=30), + ], + ] + ) + def test_read_conf_legacy( + self, input_conf, expected_machine, expected_vols): + obj = self.get_obj(conf=input_conf) + if expected_machine is not None: + assert obj.conf['machine'] == expected_machine + else: + assert obj.conf['machine'] == config.openstack['machine'] + if expected_vols is not None: + assert obj.conf['volumes'] == expected_vols + + @mark.parametrize( + "os_type, os_version, should_find", + [ + ('centos', '7', True), + ('BeOS', '42', False), + ] + ) + def test_image(self, os_type, os_version, should_find): + image_attrs = [ + dict(name='ubuntu-14.04'), + dict(name='ubuntu-16.04'), + dict(name='centos-7.0'), + ] + fake_images = list() + for item in image_attrs: + fake_images.append( + get_fake_obj(attributes=item) + ) + obj = self.get_obj(os_type=os_type, os_version=os_version) + self.mocks['m_list_images'].return_value = fake_images + if should_find: + assert obj.os_version in obj.image.name + assert obj.image in fake_images + else: + with raises(RuntimeError): + obj.image + + @mark.parametrize( + "input_attrs, func_or_exc", + [ + (dict(ram=2**16), + lambda s: s.ram == 2**16), + (dict(disk=9999), + lambda s: s.disk == 9999), + (dict(cpus=99), + lambda s: s.vcpus == 99), + (dict(ram=2**16, disk=9999, cpus=99), + IndexError), + ] + ) + def test_size(self, input_attrs, func_or_exc): + size_attrs = [ + 
dict(ram=8000, disk=9999, vcpus=99, name='s0'), + dict(ram=2**16, disk=20, vcpus=99, name='s1'), + dict(ram=2**16, disk=9999, vcpus=1, name='s2'), + ] + fake_sizes = list() + for item in size_attrs: + fake_sizes.append( + get_fake_obj(attributes=item) + ) + base_spec = dict(machine=dict( + ram=1, + disk=1, + cpus=1, + )) + spec = deepcopy(base_spec) + spec['machine'].update(input_attrs) + obj = self.get_obj(conf=spec) + self.mocks['m_list_sizes'].return_value = fake_sizes + if isinstance(func_or_exc, type): + with raises(func_or_exc): + obj.size + else: + assert obj.size in fake_sizes + assert func_or_exc(obj.size) is True + + @mark.parametrize( + "wanted_groups", + [ + ['group1'], + ['group0', 'group2'], + [], + ] + ) + def test_security_groups(self, wanted_groups): + group_names = ['group0', 'group1', 'group2'] + fake_groups = list() + for name in group_names: + fake_groups.append( + get_fake_obj(attributes=dict(name=name)) + ) + self.mocks['m_ex_list_security_groups'].return_value = fake_groups + obj = self.get_obj() + assert obj.security_groups is None + obj = self.get_obj() + obj.provider.conf['security_groups'] = wanted_groups + assert [g.name for g in obj.security_groups] == wanted_groups + + def test_security_groups_exc(self): + fake_groups = [ + get_fake_obj(attributes=dict(name='sg')) for i in range(2) + ] + obj = self.get_obj() + obj.provider.conf['security_groups'] = ['sg'] + with raises(RuntimeError): + obj.security_groups + self.mocks['m_ex_list_security_groups'].return_value = fake_groups + obj = self.get_obj() + obj.provider.conf['security_groups'] = ['sg'] + with raises(RuntimeError): + obj.security_groups + + @mark.parametrize( + "ssh_key", + [ + 'my_ssh_key', + None, + ] + ) + def test_userdata(self, ssh_key): + self.mocks['m_get_user_ssh_pubkey'].return_value = ssh_key + obj = self.get_obj() + userdata = yaml.safe_load(obj.userdata) + assert userdata['user'] == obj.user + assert userdata['hostname'] == obj.hostname + if ssh_key: + assert 
userdata['ssh_authorized_keys'] == [ssh_key] + else: + assert 'ssh_authorized_keys' not in userdata + + @mark.parametrize( + 'wanted_name, should_find, exception', + [ + ('node0', True, None), + ('node1', True, None), + ('node2', False, RuntimeError), + ('node3', False, None), + ] + ) + def test_node(self, wanted_name, should_find, exception): + node_names = ['node0', 'node1', 'node2', 'node2'] + fake_nodes = list() + for name in node_names: + fake_nodes.append( + get_fake_obj(attributes=dict(name=name)) + ) + self.mocks['m_list_nodes'].return_value = fake_nodes + obj = self.get_obj(name=wanted_name) + if should_find: + assert obj.node.name == wanted_name + elif exception: + with raises(exception) as excinfo: + obj.node + assert excinfo.value.message + else: + assert obj.node is None + + @mark.parametrize( + 'networks, security_groups', + [ + ([], []), + (['net0'], []), + ([], ['sg0']), + (['net0'], ['sg0']), + ] + ) + def test_create(self, networks, security_groups): + node_name = 'node0' + fake_sizes = [ + get_fake_obj( + attributes=dict(ram=2**16, disk=9999, vcpus=99, name='s0')), + ] + fake_security_groups = [ + get_fake_obj(attributes=dict(name=name)) + for name in security_groups + ] + self.mocks['m_ex_list_networks'].return_value = networks + self.mocks['m_ex_list_security_groups'].return_value = \ + fake_security_groups + self.mocks['m_list_sizes'].return_value = fake_sizes + fake_images = [ + get_fake_obj(attributes=dict(name='ubuntu-16.04')), + ] + self.mocks['m_list_images'].return_value = fake_images + self.mocks['m_get_user_ssh_pubkey'].return_value = 'ssh_key' + fake_node = get_fake_obj(attributes=dict(name=node_name)) + fake_ips = ['555.123.4.0'] + self.mocks['m_create_node'].return_value = fake_node + self.mocks['m_wait_until_running'].return_value = \ + [(fake_node, fake_ips)] + obj = self.get_obj(name=node_name) + obj._networks = networks + obj.provider.conf['security_groups'] = security_groups + p_wait_for_ready = patch( + 
'teuthology.provision.cloud.openstack.OpenStackProvisioner' + '._wait_for_ready' + ) + with p_wait_for_ready: + res = obj.create() + assert res is obj.node + # Test once again to ensure that if volume creation/attachment fails, + # we destroy any remaining volumes and consider the node creation to + # have failed as well. + del obj._node + with p_wait_for_ready: + obj.conf['volumes']['count'] = 1 + obj.provider.driver.create_volume.side_effect = Exception + with patch.object(obj, '_destroy_volumes'): + assert obj.create() is False + obj._destroy_volumes.assert_called_once_with() + + def test_update_dns(self): + config.nsupdate_url = 'nsupdate_url' + obj = self.get_obj() + obj.name = 'x' + obj.ips = ['y'] + obj._update_dns() + call_args = self.mocks['m_get'].call_args_list + assert len(call_args) == 1 + url_base, query_string = call_args[0][0][0].split('?') + assert url_base == 'nsupdate_url' + parsed_query = parse_qs(query_string) + assert parsed_query == dict(name=['x'], ip=['y']) + + @mark.parametrize( + 'nodes', + [[], [Mock()], [Mock(), Mock()]] + ) + def test_destroy(self, nodes): + with patch( + 'teuthology.provision.cloud.openstack.' 
+ 'OpenStackProvisioner._find_nodes' + ) as m_find_nodes: + m_find_nodes.return_value = nodes + obj = self.get_obj() + result = obj.destroy() + if not all(nodes): + assert result is True + else: + for node in nodes: + node.destroy.assert_called_once_with() + + _volume_matrix = ( + 'count, size, should_succeed', + [ + (1, 10, True), + (0, 10, True), + (10, 1, True), + (1, 10, False), + (10, 1, False), + ] + ) + + @mark.parametrize(*_volume_matrix) + def test_create_volumes(self, count, size, should_succeed): + obj_conf = dict(volumes=dict(count=count, size=size)) + obj = self.get_obj(conf=obj_conf) + node = get_fake_obj() + if not should_succeed: + obj.provider.driver.create_volume.side_effect = Exception + obj._node = node + result = obj._create_volumes() + assert result is should_succeed + if should_succeed: + create_calls = obj.provider.driver.create_volume.call_args_list + attach_calls = obj.provider.driver.attach_volume.call_args_list + assert len(create_calls) == count + assert len(attach_calls) == count + for i in range(count): + vol_size, vol_name = create_calls[i][0] + assert vol_size == size + assert vol_name == '%s_%s' % (obj.name, i) + assert attach_calls[i][0][0] is obj._node + assert attach_calls[i][1]['device'] is None + + @mark.parametrize(*_volume_matrix) + def test_destroy_volumes(self, count, size, should_succeed): + obj_conf = dict(volumes=dict(count=count, size=size)) + obj = self.get_obj(conf=obj_conf) + fake_volumes = list() + for i in range(count): + vol_name = '%s_%s' % (obj.name, i) + fake_volumes.append( + get_fake_obj(attributes=dict(name=vol_name)) + ) + obj.provider.driver.list_volumes.return_value = fake_volumes + obj._destroy_volumes() + detach_calls = obj.provider.driver.detach_volume.call_args_list + destroy_calls = obj.provider.driver.destroy_volume.call_args_list + assert len(detach_calls) == count + assert len(destroy_calls) == count + assert len(obj.provider.driver.detach_volume.call_args_list) == count + assert 
len(obj.provider.driver.destroy_volume.call_args_list) == count + obj.provider.driver.detach_volume.reset_mock() + obj.provider.driver.destroy_volume.reset_mock() + obj.provider.driver.detach_volume.side_effect = Exception + obj.provider.driver.destroy_volume.side_effect = Exception + obj._destroy_volumes() + assert len(obj.provider.driver.detach_volume.call_args_list) == count + assert len(obj.provider.driver.destroy_volume.call_args_list) == count + + def test_destroy_volumes_exc(self): + obj = self.get_obj() + obj.provider.driver.detach_volume.side_effect = Exception + + def test_wait_for_ready(self): + obj = self.get_obj() + obj._node = get_fake_obj(attributes=dict(name='node_name')) + with patch.multiple( + 'teuthology.orchestra.remote.Remote', + connect=DEFAULT, + run=DEFAULT, + ) as mocks: + obj._wait_for_ready() + mocks['connect'].side_effect = socket.error + with raises(MaxWhileTries): + obj._wait_for_ready() diff --git a/teuthology/provision/cloud/test/test_openstack_userdata_conf.yaml b/teuthology/provision/cloud/test/test_openstack_userdata_conf.yaml new file mode 100644 index 000000000..f3e87a846 --- /dev/null +++ b/teuthology/provision/cloud/test/test_openstack_userdata_conf.yaml @@ -0,0 +1,24 @@ +libcloud: + providers: + my_provider: + allow_networks: + - sesci + userdata: + 'ubuntu-16.04': + bootcmd: + - 'SuSEfirewall2 stop || true' + - 'service firewalld stop || true' + runcmd: + - 'uptime' + - 'date' + - 'zypper in -y lsb-release make gcc gcc-c++ chrony || true' + - 'systemctl enable chronyd.service || true' + - 'systemctl start chronyd.service || true' + ssh_authorized_keys: + - user_public_key1 + - user_public_key2 + driver: openstack + driver_args: + username: user + password: password + ex_force_auth_url: 'http://127.0.0.1:9999/v2.0/tokens' diff --git a/teuthology/provision/cloud/util.py b/teuthology/provision/cloud/util.py new file mode 100644 index 000000000..03ea7796f --- /dev/null +++ b/teuthology/provision/cloud/util.py @@ -0,0 +1,115 @@ 
+import datetime +import dateutil.parser +import json +import os + +from teuthology.util.flock import FileLock + +def get_user_ssh_pubkey(path='~/.ssh/id_rsa.pub'): + full_path = os.path.expanduser(path) + if not os.path.exists(full_path): + return + with open(full_path) as f: + return f.read().strip() + + +def combine_dicts(list_of_dicts, func): + """ + A useful function to merge a list of dicts. Most of the work is done by + selective_update(). + + :param list_of_dicts: A list of dicts to combine using selective_update() + :param func: A comparison function that will be passed to + selective_update() along with values from each input + dict + :returns: The new, merged, dict + """ + new_dict = dict() + for item in list_of_dicts: + selective_update(new_dict, item, func) + return new_dict + + +def selective_update(a, b, func): + """ + Given two dicts and a comparison function, recursively inspects key-value + pairs in the second dict and merges them into the first dict if func() + returns a "Truthy" value. + + Example:: + + >>> a = dict(x=0, y=1, z=3) + >>> b = dict(x=1, y=2, z=0) + >>> selective_update(a, b, lambda foo, bar: foo > bar) + >>> print(a) + {'x': 1, 'y': 2, 'z': 3} + + :param a: A dict. This is modified in-place! + :param b: Another dict. + :param func: A binary comparison function that will be called similarly to: + func(a[key], b[key]) for each key in b. 
+ """ + for key, value in b.items(): + if key not in a: + a[key] = value + continue + if isinstance(value, dict): + selective_update(a[key], value, func) + elif func(value, a[key]): + a[key] = value + + +class AuthToken(object): + time_format = '%Y-%m-%d %H:%M:%S%z' + + def __init__(self, name, directory=os.path.expanduser('~/.cache/')): + self.name = name + self.directory = directory + self.path = os.path.join(directory, name) + self.lock_path = "%s.lock" % self.path + self.expires = None + self.value = None + self.endpoint = None + + def read(self): + if not os.path.exists(self.path): + self.value = None + self.expires = None + self.endpoint = None + return + with open(self.path, 'r') as obj: + string = obj.read() + obj = json.loads(string) + self.expires = dateutil.parser.parse(obj['expires']) + if self.expired: + self.value = None + self.endpoint = None + else: + self.value = obj['value'] + self.endpoint = obj['endpoint'] + + def write(self, value, expires, endpoint): + obj = dict( + value=value, + expires=datetime.datetime.strftime(expires, self.time_format), + endpoint=endpoint, + ) + string = json.dumps(obj) + with open(self.path, 'w') as obj: + obj.write(string) + + @property + def expired(self): + if self.expires is None: + return True + utcnow = datetime.datetime.now(datetime.timezone.utc) + offset = datetime.timedelta(minutes=30) + return self.expires < (utcnow + offset) + + def __enter__(self): + with FileLock(self.lock_path): + self.read() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + pass diff --git a/teuthology/provision/downburst.py b/teuthology/provision/downburst.py new file mode 100644 index 000000000..e4dbe5332 --- /dev/null +++ b/teuthology/provision/downburst.py @@ -0,0 +1,364 @@ +import json +import logging +import os +import subprocess +import tempfile +import yaml + +from teuthology.config import config +from teuthology.contextutil import safe_while +from teuthology.misc import decanonicalize_hostname +from 
teuthology.misc import deep_merge +from teuthology.lock import query + +log = logging.getLogger(__name__) + + +def get_types(): + types = ['vps'] + if 'downburst' in config and 'machine' in config.downburst: + machine = config.downburst.get('machine') + if isinstance(machine, list): + types = list(m.get('type') for m in machine) + return types + + +def downburst_executable(): + """ + First check for downburst in the user's path. + Then check in ~/src, ~ubuntu/src, and ~teuthology/src. + Return '' if no executable downburst is found. + """ + if config.downburst: + if isinstance(config.downburst, dict): + if 'path' in config.downburst: + return config.downburst['path'] + else: + return config.downburst + path = os.environ.get('PATH', None) + if path: + for p in os.environ.get('PATH', '').split(os.pathsep): + pth = os.path.join(p, 'downburst') + if os.access(pth, os.X_OK): + return pth + import pwd + little_old_me = pwd.getpwuid(os.getuid()).pw_name + for user in [little_old_me, 'ubuntu', 'teuthology']: + pth = os.path.expanduser( + "~%s/src/downburst/virtualenv/bin/downburst" % user) + if os.access(pth, os.X_OK): + return pth + return '' + + +def downburst_environment(): + env = dict() + env['PATH'] = os.environ.get('PATH') + discover_url = os.environ.get('DOWNBURST_DISCOVER_URL') + if config.downburst and not discover_url: + if isinstance(config.downburst, dict): + discover_url = config.downburst.get('discover_url') + if discover_url: + env['DOWNBURST_DISCOVER_URL'] = discover_url + return env + + +class Downburst(object): + """ + A class that provides methods for creating and destroying virtual machine + instances using downburst: https://github.com/ceph/downburst + """ + def __init__(self, name, os_type, os_version, status=None, user='ubuntu', + logfile=None): + self.name = name + self.shortname = decanonicalize_hostname(self.name) + self.os_type = os_type + self.os_version = os_version + self.status = status or query.get_status(self.name) + self.config_path = 
None + self.user_path = None + self.user = user + self.logfile = logfile + self.host = decanonicalize_hostname(self.status['vm_host']['name']) + self.executable = downburst_executable() + self.environment = downburst_environment() + + def create(self): + """ + Launch a virtual machine instance. + + If creation fails because an instance with the specified name is + already running, first destroy it, then try again. This process will + repeat two more times, waiting 60s between tries, before giving up. + """ + if not self.executable: + log.error("No downburst executable found.") + return False + self.build_config() + success = None + with safe_while(sleep=60, tries=3, + action="downburst create") as proceed: + while proceed(): + (returncode, stdout, stderr) = self._run_create() + log.info(stdout) + log.info(stderr) + if returncode == 0: + log.info("Downburst created %s: %s" % (self.name, + stdout.strip())) + success = True + break + elif stderr: + # If the guest already exists first destroy then re-create: + if 'exists' in stderr: + success = False + log.info("Guest files exist. Re-creating guest: %s" % + (self.name)) + self.destroy() + else: + success = False + log.error("Downburst failed on %s" % self.name) + for i in stderr.split('\n'): + log.error(f">>> {i}") + break + return success + + def _run_create(self): + """ + Used by create(), this method is what actually calls downburst when + creating a virtual machine instance. 
+ """ + if not self.config_path: + raise ValueError("I need a config_path!") + if not self.user_path: + raise ValueError("I need a user_path!") + + args = [self.executable, '-v', '-c', self.host] + if self.logfile: + args.extend(['-l', self.logfile]) + args.extend([ + 'create', + '--wait', + '--meta-data=%s' % self.config_path, + '--user-data=%s' % self.user_path, + self.shortname, + ]) + log.info("Provisioning a {distro} {distroversion} vps".format( + distro=self.os_type, + distroversion=self.os_version + )) + log.debug(args) + proc = subprocess.Popen(args, universal_newlines=True, + env=self.environment, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + out, err = proc.communicate() + return (proc.returncode, out, err) + + def destroy(self): + """ + Destroy (shutdown and delete) a virtual machine instance. + """ + executable = self.executable + if not executable: + log.error("No downburst executable found.") + return False + args = [executable, '-v', '-c', self.host] + if self.logfile: + args.extend(['-l', self.logfile]) + args.extend(['destroy', self.shortname]) + log.debug(args) + proc = subprocess.Popen(args, universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE,) + out, err = proc.communicate() + log.info(out) + log.info(err) + if proc.returncode != 0: + not_found_msg = "no domain with matching name '%s'" % self.shortname + if not_found_msg in err: + log.warning("Ignoring error during destroy: %s", err) + return True + log.error("Error destroying %s: %s", self.name, err) + return False + else: + out_str = ': %s' % out if out else '' + log.info("Destroyed %s%s" % (self.name, out_str)) + return True + + def build_config(self): + """ + Assemble a configuration to pass to downburst, and write it to a file. 
+ """ + config_fd = tempfile.NamedTemporaryFile(delete=False, mode='wt') + + os_type = self.os_type.lower() + os_version = self.os_version.lower() + + mac_address = self.status['mac_address'] + machine = dict( + disk=os.environ.get('DOWNBURST_DISK_SIZE', '100G'), + ram=os.environ.get('DOWNBURST_RAM_SIZE', '3.8G'), + cpus=int(os.environ.get('DOWNBURST_CPUS', 1)), + volumes=dict( + count=int(os.environ.get('DOWNBURST_EXTRA_DISK_NUMBER', 4)), + size=os.environ.get('DOWNBURST_EXTRA_DISK_SIZE', '100G'), + ), + ) + def belongs_machine_type(machine_config: dict, machine_type: str) -> bool: + if isinstance(machine_config, dict): + t = machine_config.get('type', None) + if isinstance(t, str): + return machine_type == t + elif isinstance(t, list): + return machine_type in t + return False + if isinstance(config.downburst, dict) and isinstance(config.downburst.get('machine'), list): + machine_type = self.status['machine_type'] + machine_config = next((m for m in config.downburst.get('machine') + if belongs_machine_type(m, machine_type)), None) + if machine_config is None: + raise RuntimeError(f"Cannot find config for machine type {machine_type}.") + elif isinstance(config.downburst, dict) and isinstance(config.downburst.get('machine'), dict): + machine_config = config.downburst.get('machine') + deep_merge(machine, machine_config) + log.debug('Using machine config: %s', machine) + file_info = { + 'disk-size': machine['disk'], + 'ram': machine['ram'], + 'cpus': machine['cpus'], + 'networks': [ + {'source': 'front', 'mac': mac_address}], + 'distro': os_type, + 'distroversion': self.os_version, + 'additional-disks': machine['volumes']['count'], + 'additional-disks-size': machine['volumes']['size'], + 'arch': 'x86_64', + } + fqdn = self.name.split('@')[-1] + file_out = { + 'downburst': file_info, + 'local-hostname': fqdn, + } + yaml.safe_dump(file_out, config_fd) + self.config_path = config_fd.name + + user_info = { + 'user': self.user, + # Remove the user's password so console 
logins are possible + 'runcmd': [ + ['passwd', '-d', self.user], + ] + } + # for opensuse-15.2 we need to replace systemd-logger with rsyslog for teuthology + if os_type == 'opensuse' and os_version == '15.2': + user_info['runcmd'].extend([ + ['zypper', 'rm', '-y', 'systemd-logger'], + ['zypper', 'in', '-y', 'rsyslog'], + ]) + # Install git on downbursted VMs to clone upstream linux-firmware. + # Issue #17154 + if 'packages' not in user_info: + user_info['packages'] = list() + user_info['packages'].extend([ + 'git', + 'wget', + ]) + if os_type in ('centos', 'opensuse'): + user_info['packages'].extend([ + 'chrony', + ]) + if os_type in ('ubuntu', 'debian'): + user_info['packages'].extend([ + 'ntp', + ]) + + # On CentOS/RHEL/Fedora, write the correct mac address and + if os_type in ['centos', 'rhel', 'fedora']: + user_info['runcmd'].extend([ + ['sed', '-ie', 's/HWADDR=".*"/HWADDR="%s"/' % mac_address, + '/etc/sysconfig/network-scripts/ifcfg-eth0'], + ]) + # On Ubuntu, starting with 16.04, and Fedora, starting with 24, we need + # to install 'python' to get python2.7, which ansible needs + if os_type in ('ubuntu', 'fedora'): + user_info['packages'].append('python') + if os_type in ('centos'): + user_info['packages'].extend([ + 'python3-pip', + 'bind-utils', + ]) + user_fd = tempfile.NamedTemporaryFile(delete=False, mode='wt') + user_str = "#cloud-config\n" + yaml.safe_dump(user_info) + user_fd.write(user_str) + self.user_path = user_fd.name + return True + + def remove_config(self): + """ + Remove the downburst configuration file created by build_config() + """ + if self.config_path and os.path.exists(self.config_path): + os.remove(self.config_path) + self.config_path = None + return True + if self.user_path and os.path.exists(self.user_path): + os.remove(self.user_path) + self.user_path = None + return True + return False + + def __del__(self): + self.remove_config() + + +_known_downburst_distros = { + 'rhel_minimal': ['6.4', '6.5'], + 'centos': ['9.stream', 
'10.stream'], + 'centos_minimal': ['6.4', '6.5'], + 'debian': ['6.0', '7.0', '7.9', '8.0'], + 'fedora': ['41', '42'], + 'opensuse': ['1.0(tumbleweed)', + '15.5(leap)', '15.6(leap)', + '16.0(leap)', + ], + 'sles': ['12-sp3', '15-sp1', '15-sp2'], + 'alma': ['10.0', '8.10', '9.6'], + 'rocky': ['10.0', '8.10', '9.6'], + 'ubuntu': ['20.04(focal)', '20.10(groovy)', + '21.04(hirsute)', '21.10(impish)', + '22.04(jammy)', '22.10(kinetic)', + '23.04(lunar)', '23.10(mantic)', + '24.04(noble)', '24.10(oracular)', + '25.04(plucky)', + ], +} + +def get_distro_from_downburst(): + """ + Return a table of valid distros. + + If downburst is in path use it. If either downburst is unavailable, + or if downburst is unable to produce a json list, then use a default + table or a table from previous successful call. + """ + # because sometimes downburst fails to complete list-json + # due to temporary issues with vendor site accessibility + # we cache known downburst distros from previous call + # to be reused in such cases of outage + global _known_downburst_distros + executable_cmd = downburst_executable() + environment_dict = downburst_environment() + if not executable_cmd: + log.warning("Downburst not found!") + log.info('Using default values for supported os_type/os_version') + return _known_downburst_distros + try: + log.debug(executable_cmd) + output = subprocess.check_output([executable_cmd, 'list-json'], + env=environment_dict) + _known_downburst_distros = json.loads(output) + return _known_downburst_distros + except (subprocess.CalledProcessError, OSError): + log.exception("Error calling downburst!") + log.info('Using default values for supported os_type/os_version or values from previous call...') + return _known_downburst_distros diff --git a/teuthology/provision/fog.py b/teuthology/provision/fog.py new file mode 100644 index 000000000..101da2464 --- /dev/null +++ b/teuthology/provision/fog.py @@ -0,0 +1,360 @@ +import datetime +import json +import logging +import requests 
+import socket +import re + +from paramiko import SSHException +from paramiko.ssh_exception import NoValidConnectionsError + +import teuthology.orchestra + +from teuthology.config import config +from teuthology.contextutil import safe_while +from teuthology.exceptions import MaxWhileTries +from teuthology.orchestra.opsys import OS +from teuthology import misc + +log = logging.getLogger(__name__) + + +def enabled(warn=False): + """ + Check for required FOG settings + + :param warn: Whether or not to log a message containing unset parameters + :returns: True if they are present; False if they are not + """ + fog_conf = config.get('fog', dict()) + params = ['endpoint', 'api_token', 'user_token', 'machine_types'] + unset = [param for param in params if not fog_conf.get(param)] + if unset and warn: + log.warning( + "FOG disabled; set the following config options to enable: %s", + ' '.join(unset), + ) + return (unset == []) + + +def get_types(): + """ + Fetch and parse config.fog['machine_types'] + + :returns: The list of FOG-configured machine types. An empty list if FOG is + not configured. 
+ """ + if not enabled(): + return [] + fog_conf = config.get('fog', dict()) + types = fog_conf.get('machine_types', '') + if not isinstance(types, list): + types = types.split(',') + return [type_ for type_ in types if type_] + + +class FOG(object): + """ + Reimage bare-metal machines with https://fogproject.org/ + """ + timestamp_format = '%Y-%m-%d %H:%M:%S' + + def __init__(self, name, os_type, os_version): + self.remote = teuthology.orchestra.remote.Remote( + misc.canonicalize_hostname(name)) + self.name = self.remote.hostname + self.shortname = self.remote.shortname + self.os_type = os_type + self.os_version = os_version + self.log = log.getChild(self.shortname) + + def create(self): + """ + Initiate deployment and wait until completion + """ + if not enabled(): + raise RuntimeError("FOG is not configured!") + host_data = self.get_host_data() + host_id = int(host_data['id']) + self.set_image(host_id) + task_id = self.schedule_deploy_task(host_id) + try: + # Use power_off/power_on because other methods call + # _wait_for_login, which will not work here since the newly-imaged + # host will have an incorrect hostname + self.remote.console.power_off() + self.remote.console.power_on() + self.wait_for_deploy_task(task_id) + except Exception: + self.cancel_deploy_task(task_id) + raise + self._wait_for_ready() + self._fix_hostname() + self._verify_installed_os() + self.log.info("Deploy complete!") + + def do_request(self, url_suffix, data=None, method='GET', verify=True): + """ + A convenience method to submit a request to the FOG server + :param url_suffix: The portion of the URL to append to the endpoint, + e.g. 
'/system/info' + :param data: Optional JSON data to submit with the request + :param method: The HTTP method to use for the request (default: 'GET') + :param verify: Whether or not to raise an exception if the request is + unsuccessful (default: True) + :returns: A requests.models.Response object + """ + req_kwargs = dict( + headers={ + 'fog-api-token': config.fog['api_token'], + 'fog-user-token': config.fog['user_token'], + }, + ) + if data is not None: + req_kwargs['data'] = data + req = requests.Request( + method, + config.fog['endpoint'] + url_suffix, + **req_kwargs + ) + prepped = req.prepare() + resp = requests.Session().send(prepped) + if not resp.ok: + self.log.error(f"Got status {resp.status_code} from {url_suffix}: '{resp.text}'") + if verify: + resp.raise_for_status() + return resp + + def get_host_data(self): + """ + Locate the host we want to use, and return the FOG object which + represents it + :returns: A dict describing the host + """ + resp = self.do_request( + '/host', + data=json.dumps(dict(name=self.shortname)), + ) + obj = resp.json() + if obj['count'] == 0: + raise RuntimeError("Host %s not found!" % self.shortname) + if obj['count'] > 1: + raise RuntimeError( + "More than one host found for %s" % self.shortname) + return obj['hosts'][0] + + def get_image_data(self): + """ + Locate the image we want to use, and return the FOG object which + represents it + :returns: A dict describing the image + """ + def do_get(name): + resp = self.do_request( + '/image', + data=json.dumps(dict(name=name)), + ) + obj = resp.json() + if obj['count']: + return obj['images'][0] + + os_type = self.os_type.lower() + os_version = self.os_version + name = f"{self.remote.machine_type}_{os_type}_{os_version}" + if image := do_get(name): + return image + elif os_type == 'centos' and not os_version.endswith('.stream'): + image = do_get(f"{name}.stream") + if image: + return image + else: + raise RuntimeError( + "Fog has no %s image. 
Available %s images: %s" % + (name, self.remote.machine_type, self.suggest_image_names())) + + def suggest_image_names(self): + """ + Suggest available image names for this machine type. + + :returns: A list of image names. + """ + resp = self.do_request('/image/search/%s' % self.remote.machine_type) + obj = resp.json() + images = obj['images'] + return [image['name'] for image in images] + + def set_image(self, host_id): + """ + Tell FOG to use the proper image on the next deploy + :param host_id: The id of the host to deploy + """ + image_data = self.get_image_data() + image_id = int(image_data['id']) + image_name = image_data.get("name") + self.log.debug(f"Requesting image {image_name} (ID {image_id})") + self.do_request( + '/host/%s' % host_id, + method='PUT', + data=json.dumps(dict(imageID=image_id)), + ) + + def schedule_deploy_task(self, host_id): + """ + :param host_id: The id of the host to deploy + :returns: The id of the scheduled task + """ + self.log.info( + "Scheduling deploy of %s %s", + self.os_type, self.os_version) + # First, let's find and cancel any existing deploy tasks for the host. + for task in self.get_deploy_tasks(): + self.cancel_deploy_task(task['id']) + # Next, we need to find the right tasktype ID + resp = self.do_request( + '/tasktype', + data=json.dumps(dict(name='deploy')), + ) + tasktypes = [obj for obj in resp.json()['tasktypes'] + if obj['name'].lower() == 'deploy'] + deploy_id = int(tasktypes[0]['id']) + # Next, schedule the task + resp = self.do_request( + '/host/%i/task' % host_id, + method='POST', + data='{"taskTypeID": %i}' % deploy_id, + ) + host_tasks = self.get_deploy_tasks() + for task in host_tasks: + timestamp = task['createdTime'] + time_delta = ( + datetime.datetime.now(datetime.timezone.utc) - datetime.datetime.strptime( + timestamp, self.timestamp_format).replace(tzinfo=datetime.timezone.utc) + ).total_seconds() + # There should only be one deploy task matching our host. 
Just in + # case there are multiple, select a very recent one. + if time_delta < 5: + return task['id'] + + def get_deploy_tasks(self): + """ + :returns: A list of deploy tasks which are active on our host + """ + resp = self.do_request('/task/active') + try: + tasks = resp.json()['tasks'] + except Exception: + self.log.exception("Failed to get deploy tasks!") + return list() + host_tasks = [obj for obj in tasks + if obj['host']['name'] == self.shortname] + return host_tasks + + def deploy_task_active(self, task_id): + """ + :param task_id: The id of the task to query + :returns: True if the task is active + """ + host_tasks = self.get_deploy_tasks() + return any( + [task['id'] == task_id for task in host_tasks] + ) + + def wait_for_deploy_task(self, task_id): + """ + Wait until the specified task is no longer active (i.e., it has + completed) + """ + self.log.info("Waiting for deploy to finish") + with safe_while(sleep=15, tries=120, timeout=config.fog_reimage_timeout) as proceed: + while proceed(): + if not self.deploy_task_active(task_id): + break + + def cancel_deploy_task(self, task_id): + """ Cancel an active deploy task """ + self.log.debug(f"Canceling deploy task with ID {task_id}") + resp = self.do_request( + '/task/cancel', + method='DELETE', + data='{"id": %s}' % task_id, + ) + resp.raise_for_status() + + def _wait_for_ready(self): + """ Attempt to connect to the machine via SSH """ + with safe_while(sleep=6, timeout=config.fog_wait_for_ssh_timeout) as proceed: + while proceed(): + try: + self.remote.connect() + break + except ( + socket.error, + SSHException, + NoValidConnectionsError, + MaxWhileTries, + EOFError, + ) as e: + # log this, because otherwise lots of failures just + # keep retrying without any notification (like, say, + # a mismatched host key in ~/.ssh/known_hosts, or + # something) + self.log.warning(e) + sentinel_file = config.fog.get('sentinel_file', None) + if sentinel_file: + cmd = "while [ ! 
-e '%s' ]; do sleep 5; done" % sentinel_file + action = f"wait for sentinel on {self.shortname}" + with safe_while(action=action, timeout=1800, increment=3) as proceed: + while proceed(): + try: + self.remote.run(args=cmd, timeout=600) + break + except ( + ConnectionError, + EOFError, + ) as e: + log.error(f"{e} on {self.shortname}") + self.log.info("Node is ready") + + def _fix_hostname(self): + """ + After a reimage, the host will still have the hostname of the machine + used to create the image initially. Fix that by making a call to + /binhostname and tweaking /etc/hosts. + """ + wrong_hostname = self.remote.sh('hostname').strip() + etc_hosts = self.remote.sh( + 'grep %s /etc/hosts' % wrong_hostname, + check_status=False, + ).strip() + if etc_hosts: + wrong_ip = re.split(r'\s+', etc_hosts.split('\n')[0].strip())[0] + self.remote.run(args="sudo hostname %s" % self.shortname) + self.remote.run( + args="sudo sed -i -e 's/%s/%s/g' /etc/hosts" % ( + wrong_hostname, self.shortname), + ) + self.remote.run( + args="sudo sed -i -e 's/%s/%s/g' /etc/hosts" % ( + wrong_ip, self.remote.ip_address), + ) + self.remote.run( + args="sudo sed -i -e 's/%s/%s/g' /etc/hostname" % ( + wrong_hostname, self.shortname), + check_status=False, + ) + self.remote.run( + args="sudo hostname %s" % self.shortname, + check_status=False, + ) + + def _verify_installed_os(self): + wanted_os = OS(name=self.os_type, version=self.os_version) + if self.remote.os != wanted_os: + raise RuntimeError( + f"Expected {self.remote.shortname}'s OS to be {wanted_os} but " + f"found {self.remote.os}" + ) + + def destroy(self): + """A no-op; we just leave idle nodes as-is""" + pass diff --git a/teuthology/provision/openstack.py b/teuthology/provision/openstack.py new file mode 100644 index 000000000..d829b4ee5 --- /dev/null +++ b/teuthology/provision/openstack.py @@ -0,0 +1,235 @@ +import json +import logging +import os +import random +import re +import subprocess +import time +import tempfile + +from subprocess 
import CalledProcessError + +from teuthology import misc + +from teuthology.openstack import OpenStack, OpenStackInstance +from teuthology.config import config +from teuthology.contextutil import safe_while +from teuthology.exceptions import QuotaExceededError + + +log = logging.getLogger(__name__) + + +class ProvisionOpenStack(OpenStack): + """ + A class that provides methods for creating and destroying virtual machine + instances using OpenStack + """ + def __init__(self): + super(ProvisionOpenStack, self).__init__() + fd, self.user_data = tempfile.mkstemp() + os.close(fd) + log.debug("ProvisionOpenStack: " + str(config.openstack)) + self.basename = 'target' + self.up_string = 'The system is finally up' + self.property = "%16x" % random.getrandbits(128) + + def __del__(self): + if os.path.exists(self.user_data): + os.unlink(self.user_data) + + def init_user_data(self, os_type, os_version): + """ + Get the user-data file that is fit for os_type and os_version. + It is responsible for setting up enough for ansible to take + over. 
+ """ + template_path = config['openstack']['user-data'].format( + os_type=os_type, + os_version=os_version) + nameserver = config['openstack'].get('nameserver', '8.8.8.8') + user_data_template = open(template_path).read() + user_data = user_data_template.format( + up=self.up_string, + nameserver=nameserver, + username=self.username, + lab_domain=config.lab_domain) + open(self.user_data, 'w').write(user_data) + + def _openstack(self, subcommand, get=None): + # do not use OpenStack().run because its + # bugous for volume create as of openstackclient 3.2.0 + # https://bugs.launchpad.net/python-openstackclient/+bug/1619726 + #r = OpenStack().run("%s -f json " % command) + json_result = misc.sh("openstack %s -f json" % subcommand) + r = json.loads(json_result) + if get: + return self.get_value(r, get) + return r + + def _create_volume(self, volume_name, size): + """ + Create a volume and return valume id + """ + volume_id = None + try: + volume_id = self._openstack("volume show %s" % volume_name, 'id') + except subprocess.CalledProcessError as e: + if 'No volume with a name or ID' not in e.output: + raise e + if volume_id: + log.warning("Volume {} already exists with ID {}; using it" + .format(volume_name, volume_id)) + volume_id = self._openstack( + "volume create %s" % config['openstack'].get('volume-create','') + + " --property ownedby=%s" % config['openstack']['ip'] + + " --size %s" % str(size) + ' ' + volume_name, 'id') + if volume_id: + log.info("Volume {} created with ID {}" + .format(volume_name, volume_id)) + return volume_id + else: + raise Exception("Failed to create volume %s" % volume_name) + + def _await_volume_status(self, volume_id, status='available'): + """ + Wait for volume to have status, like 'available' or 'in-use' + """ + with safe_while(sleep=4, tries=50, + action="volume " + volume_id) as proceed: + while proceed(): + try: + volume_status = \ + self._openstack("volume show %s" % volume_id, 'status') + if volume_status == status: + break + else: 
+ log.debug("volume %s not in '%s' status yet" + % (volume_id, status)) + except subprocess.CalledProcessError: + log.warning("volume " + volume_id + + " not information available yet") + + def _attach_volume(self, volume_id, name): + """ + Attach volume to OpenStack instance. + + Try and attach volume to server, wait until volume gets in-use state. + """ + with safe_while(sleep=20, increment=20, tries=3, + action="add volume " + volume_id) as proceed: + while proceed(): + try: + misc.sh("openstack server add volume " + name + " " + volume_id) + break + except subprocess.CalledProcessError: + log.warning("openstack add volume failed unexpectedly; retrying") + self._await_volume_status(volume_id, 'in-use') + + def attach_volumes(self, server_name, volumes): + """ + Create and attach volumes to the named OpenStack instance. + If attachment is failed, make another try. + """ + for i in range(volumes['count']): + volume_name = server_name + '-' + str(i) + volume_id = None + with safe_while(sleep=10, tries=3, + action="volume " + volume_name) as proceed: + while proceed(): + try: + volume_id = self._create_volume(volume_name, volumes['size']) + self._await_volume_status(volume_id, 'available') + self._attach_volume(volume_id, server_name) + break + except Exception as e: + log.warning("%s" % e) + if volume_id: + OpenStack().volume_delete(volume_id) + + @staticmethod + def ip2name(prefix, ip): + """ + return the instance name suffixed with the IP address. + """ + digits = map(int, re.findall(r'(\d+)\.(\d+)\.(\d+)\.(\d+)', ip)[0]) + return prefix + "%03d%03d%03d%03d" % tuple(digits) + + def create(self, num, os_type, os_version, arch, resources_hint): + """ + Create num OpenStack instances running os_type os_version and + return their names. Each instance has at least the resources + described in resources_hint. 
+ """ + log.debug('ProvisionOpenStack:create') + if arch is None: + arch = self.get_default_arch() + resources_hint = self.interpret_hints({ + 'machine': config['openstack']['machine'], + 'volumes': config['openstack']['volumes'], + }, resources_hint) + self.init_user_data(os_type, os_version) + image = self.image(os_type, os_version, arch) + if 'network' in config['openstack']: + net = "--nic net-id=" + str(self.net_id(config['openstack']['network'])) + else: + net = '' + flavor = self.flavor(resources_hint['machine'], arch) + keypair = config['openstack']['keypair'] or 'teuthology' + worker_group = config['openstack']['worker_group'] or 'teuthology-worker' + cmd = ("flock --close --timeout 28800 /tmp/teuthology-server-create.lock" + + " openstack server create" + + " " + config['openstack'].get('server-create', '') + + " -f json " + + " --image '" + str(image) + "'" + + " --flavor '" + str(flavor) + "'" + + " --key-name %s " % keypair + + " --user-data " + str(self.user_data) + + " " + net + + " --min " + str(num) + + " --max " + str(num) + + " --security-group %s" % worker_group + + " --property teuthology=" + self.property + + " --property ownedby=" + config.openstack['ip'] + + " --wait " + + " " + self.basename) + try: + self.run(cmd, type='compute') + except CalledProcessError as exc: + if "quota exceeded" in exc.output.lower(): + raise QuotaExceededError(message=exc.output) + raise + instances = filter( + lambda instance: self.property in instance['Properties'], + self.list_instances()) + instances = [OpenStackInstance(i['ID']) for i in instances] + fqdns = [] + try: + network = config['openstack'].get('network', '') + for instance in instances: + ip = instance.get_ip(network) + name = self.ip2name(self.basename, ip) + self.run("server set " + + "--name " + name + " " + + instance['ID']) + fqdn = name + '.' 
+ config.lab_domain + if not misc.ssh_keyscan_wait(fqdn): + console_log = misc.sh("openstack console log show %s " + "|| true" % instance['ID']) + log.error(console_log) + raise ValueError('ssh_keyscan_wait failed for ' + fqdn) + time.sleep(15) + if not self.cloud_init_wait(instance): + raise ValueError('cloud_init_wait failed for ' + fqdn) + self.attach_volumes(name, resources_hint['volumes']) + fqdns.append(fqdn) + except Exception as e: + log.exception(str(e)) + for id in [instance['ID'] for instance in instances]: + self.destroy(id) + raise e + return fqdns + + def destroy(self, name_or_id): + log.debug('ProvisionOpenStack:destroy ' + name_or_id) + return OpenStackInstance(name_or_id).destroy() diff --git a/teuthology/provision/pelagos.py b/teuthology/provision/pelagos.py new file mode 100644 index 000000000..5dd04a4fa --- /dev/null +++ b/teuthology/provision/pelagos.py @@ -0,0 +1,173 @@ + +import logging +import requests +import re +import time + +from teuthology.config import config +from teuthology.contextutil import safe_while +from teuthology.misc import canonicalize_hostname +from teuthology.util.compat import HTTPError + +log = logging.getLogger(__name__) +config_section = 'pelagos' + +# Provisioner configuration section description see in +# docs/siteconfig.rst + +def enabled(warn=False): + """ + Check for required Pelagos settings + + :param warn: Whether or not to log a message containing unset parameters + :returns: True if they are present; False if they are not + """ + conf = config.get(config_section, dict()) + params = ['endpoint', 'machine_types'] + unset = [_ for _ in params if not conf.get(_)] + if unset and warn: + log.warning( + "Pelagos is disabled; set the following config options to enable: %s", + ' '.join(unset), + ) + return (unset == []) + + +def get_types(): + """ + Fetch and parse config.pelagos['machine_types'] + + :returns: The list of Pelagos-configured machine types. An empty list if Pelagos is + not configured. 
+ """ + if not enabled(): + return [] + conf = config.get(config_section, dict()) + types = conf.get('machine_types', '') + if not isinstance(types, list): + types = [_ for _ in types.split(',') if _] + return [_ for _ in types if _] + +def park_node(name): + p = Pelagos(name, "maintenance_image") + p.create(wait=False) + + +class Pelagos(object): + + def __init__(self, name, os_type, os_version=""): + #for service should be a hostname, not a user@host + split_uri = re.search(r'(\w*)@(.+)', canonicalize_hostname(name)) + if split_uri is not None: + self.name = split_uri.groups()[1] + else: + self.name = name + + self.os_type = os_type + self.os_version = os_version + if os_version: + self.os_name = os_type + "-" + os_version + else: + self.os_name = os_type + self.log = log.getChild(self.name) + + def create(self, wait=True): + """ + Initiate deployment via REST requests and wait until completion + :param wait: optional, by default is True, if set to False, function + doesn't wait for the end of node provisioning + :returns: http response code if operation is successful + :raises: :class:`Exception`: if node provision failure reported by + Pelagos or if timeout is reached + :raises: :class:`RuntimeError`: if pelagos is not configured + + """ + if not enabled(): + raise RuntimeError("Pelagos is not configured!") + location = None + try: + params = dict(os=self.os_name, node=self.name) + response = self.do_request('node/provision', + data=params, method='POST') + if not wait: + return response + location = response.headers.get('Location') + self.log.debug("provision task: '%s'", location) + # gracefully wait till provision task gets created on pelagos + time.sleep(2) + self.log.info("Waiting for deploy to finish") + sleep_time=15 + with safe_while(sleep=sleep_time, tries=60) as proceed: + while proceed(): + if not self.is_task_active(location): + break + self.log.info('Sleeping %s seconds' % sleep_time) + except Exception as e: + if location: + 
self.cancel_deploy_task(location) + else: + self.log.error("Failed to start deploy tasks!") + raise e + self.log.info("Deploy complete!") + if self.task_status_response.status_code != 200: + raise Exception("Deploy failed") + return self.task_status_response + + def cancel_deploy_task(self, task_id): + # TODO implement it + return + + def is_task_active(self, task_url): + try: + status_response = self.do_request('', url=task_url, verify=False) + except HTTPError as err: + self.log.error("Task fail reason: '%s'", err.reason) + if err.status_code == 404: + self.log.error(err.reason) + self.task_status_response = 'failed' + return False + else: + raise HTTPError(err.code, err.reason) + self.log.debug("Response code '%s'", + str(status_response.status_code)) + self.task_status_response = status_response + if status_response.status_code == 202: + status = status_response.headers['status'] + self.log.debug("Status response: '%s'", status) + if status == 'not completed': + return True + return False + + def do_request(self, url_suffix, url="" , data=None, method='GET', verify=True): + """ + A convenience method to submit a request to the Pelagos server + :param url_suffix: The portion of the URL to append to the endpoint, + e.g. 
'/system/info' + :param data: Optional JSON data to submit with the request + :param method: The HTTP method to use for the request (default: 'GET') + :param verify: Whether or not to raise an exception if the request is + unsuccessful (default: True) + :returns: A requests.models.Response object + """ + prepared_url = url or config.pelagos['endpoint'] + url_suffix + self.log.debug("Sending %s request to: '%s'", method, prepared_url) + if data: + self.log.debug("Using data: '%s'", str(data)) + req = requests.Request( + method, + prepared_url, + data=data + ) + prepared = req.prepare() + resp = requests.Session().send(prepared) + if not resp.ok and resp.text: + self.log.error("Returned status code: '%s', text: %s", + resp.status_code, resp.text or 'Empty') + if verify: + resp.raise_for_status() + return resp + + def destroy(self): + """A no-op; we just leave idle nodes as-is""" + pass + diff --git a/teuthology/provision/test/test_downburst.py b/teuthology/provision/test/test_downburst.py new file mode 100644 index 000000000..4ba7d9a8a --- /dev/null +++ b/teuthology/provision/test/test_downburst.py @@ -0,0 +1,105 @@ +from mock import Mock, MagicMock, patch + +from teuthology import provision + + +class TestDownburst(object): + def setup_method(self): + self.ctx = Mock() + self.ctx.os_type = 'rhel' + self.ctx.os_version = '7.0' + self.ctx.config = dict() + self.name = 'vpm999' + self.status = dict( + vm_host=dict(name='host999'), + is_vm=True, + machine_type='mtype', + locked_by='user@a', + description="desc", + ) + + def test_create_if_vm_success(self): + name = self.name + ctx = self.ctx + status = self.status + + dbrst = provision.downburst.Downburst( + name, ctx.os_type, ctx.os_version, status) + dbrst.executable = '/fake/path' + dbrst.build_config = MagicMock(name='build_config') + dbrst._run_create = MagicMock(name='_run_create') + dbrst._run_create.return_value = (0, '', '') + remove_config = MagicMock(name='remove_config') + dbrst.remove_config = remove_config 
+ + result = provision.create_if_vm(ctx, name, dbrst) + assert result is True + + dbrst._run_create.assert_called_with() + dbrst.build_config.assert_called_with() + del dbrst + remove_config.assert_called_with() + + def test_destroy_if_vm_success(self): + name = self.name + ctx = self.ctx + status = self.status + + dbrst = provision.downburst.Downburst( + name, ctx.os_type, ctx.os_version, status) + dbrst.destroy = MagicMock(name='destroy') + dbrst.destroy.return_value = True + + result = provision.destroy_if_vm(name, user="user@a", _downburst=dbrst) + assert result is True + + dbrst.destroy.assert_called_with() + + def test_destroy_if_vm_wrong_owner(self): + name = self.name + ctx = self.ctx + status = self.status + + dbrst = provision.downburst.Downburst( + name, ctx.os_type, ctx.os_version, status) + dbrst.destroy = MagicMock(name='destroy', side_effect=RuntimeError) + + result = provision.destroy_if_vm(name, user='user@b', + _downburst=dbrst) + assert result is False + + def test_destroy_if_vm_wrong_description(self): + name = self.name + ctx = self.ctx + status = self.status + + dbrst = provision.downburst.Downburst( + name, ctx.os_type, ctx.os_version, status) + dbrst.destroy = MagicMock(name='destroy') + dbrst.destroy = MagicMock(name='destroy', side_effect=RuntimeError) + + result = provision.destroy_if_vm(name, description='desc_b', + _downburst=dbrst) + assert result is False + + @patch('teuthology.provision.downburst.downburst_executable') + def test_create_fails_without_executable(self, m_exec): + name = self.name + ctx = self.ctx + status = self.status + m_exec.return_value = '' + dbrst = provision.downburst.Downburst( + name, ctx.os_type, ctx.os_version, status) + result = dbrst.create() + assert result is False + + @patch('teuthology.provision.downburst.downburst_executable') + def test_destroy_fails_without_executable(self, m_exec): + name = self.name + ctx = self.ctx + status = self.status + m_exec.return_value = '' + dbrst = 
provision.downburst.Downburst( + name, ctx.os_type, ctx.os_version, status) + result = dbrst.destroy() + assert result is False diff --git a/teuthology/provision/test/test_fog.py b/teuthology/provision/test/test_fog.py new file mode 100644 index 000000000..3d0baa752 --- /dev/null +++ b/teuthology/provision/test/test_fog.py @@ -0,0 +1,339 @@ +import datetime + +from copy import deepcopy +from mock import patch, DEFAULT, PropertyMock +from pytest import raises, mark + +from teuthology.config import config +from teuthology.exceptions import MaxWhileTries, CommandFailedError +from teuthology.provision import fog + + +test_config = dict( + fog=dict( + endpoint='http://fog.example.com/fog', + api_token='API_TOKEN', + user_token='USER_TOKEN', + machine_types='type1,type2', + ), + fog_reimage_timeout=1800, +) + + +class TestFOG(object): + klass = fog.FOG + + def setup_method(self): + config.load() + config.update(deepcopy(test_config)) + self.start_patchers() + + def start_patchers(self): + self.patchers = dict() + self.patchers['m_sleep'] = patch( + 'time.sleep', + ) + self.patchers['m_requests_Session_send'] = patch( + 'requests.Session.send', + ) + self.patchers['m_Remote_connect'] = patch( + 'teuthology.orchestra.remote.Remote.connect' + ) + self.patchers['m_Remote_run'] = patch( + 'teuthology.orchestra.remote.Remote.run' + ) + self.patchers['m_Remote_console'] = patch( + 'teuthology.orchestra.remote.Remote.console', + new_callable=PropertyMock, + ) + self.patchers['m_Remote_hostname'] = patch( + 'teuthology.orchestra.remote.Remote.hostname', + new_callable=PropertyMock, + ) + self.patchers['m_Remote_machine_type'] = patch( + 'teuthology.orchestra.remote.Remote.machine_type', + new_callable=PropertyMock, + ) + self.mocks = dict() + for name, patcher in self.patchers.items(): + self.mocks[name] = patcher.start() + + def teardown_method(self): + for patcher in self.patchers.values(): + patcher.stop() + + @mark.parametrize('enabled', [True, False]) + def 
test_get_types(self, enabled): + with patch('teuthology.provision.fog.enabled') as m_enabled: + m_enabled.return_value = enabled + types = fog.get_types() + if enabled: + assert types == test_config['fog']['machine_types'].split(',') + else: + assert types == [] + + def test_disabled(self): + config.fog['endpoint'] = None + obj = self.klass('name.fqdn', 'type', '1.0') + with raises(RuntimeError): + obj.create() + + def test_init(self): + self.mocks['m_Remote_hostname'].return_value = 'name.fqdn' + obj = self.klass('name.fqdn', 'type', '1.0') + assert obj.name == 'name.fqdn' + assert obj.shortname == 'name' + assert obj.os_type == 'type' + assert obj.os_version == '1.0' + + @mark.parametrize('success', [True, False]) + def test_create(self, success): + self.mocks['m_Remote_hostname'].return_value = 'name.fqdn' + self.mocks['m_Remote_machine_type'].return_value = 'type1' + obj = self.klass('name.fqdn', 'type', '1.0') + host_id = 99 + task_id = 1234 + with patch.multiple( + 'teuthology.provision.fog.FOG', + get_host_data=DEFAULT, + set_image=DEFAULT, + schedule_deploy_task=DEFAULT, + wait_for_deploy_task=DEFAULT, + cancel_deploy_task=DEFAULT, + _wait_for_ready=DEFAULT, + _fix_hostname=DEFAULT, + _verify_installed_os=DEFAULT, + ) as local_mocks: + local_mocks['get_host_data'].return_value = dict(id=host_id) + local_mocks['schedule_deploy_task'].return_value = task_id + if not success: + local_mocks['wait_for_deploy_task'].side_effect = RuntimeError + with raises(RuntimeError): + obj.create() + else: + obj.create() + local_mocks['get_host_data'].assert_called_once_with() + local_mocks['set_image'].assert_called_once_with(host_id) + local_mocks['schedule_deploy_task'].assert_called_once_with(host_id) + local_mocks['wait_for_deploy_task'].assert_called_once_with(task_id) + if success: + local_mocks['_wait_for_ready'].assert_called_once_with() + local_mocks['_fix_hostname'].assert_called_once_with() + else: + assert len(local_mocks['cancel_deploy_task'].call_args_list) == 
1 + self.mocks['m_Remote_console'].return_value.power_off.assert_called_once_with() + self.mocks['m_Remote_console'].return_value.power_on.assert_called_once_with() + + def test_do_request(self): + obj = self.klass('name.fqdn', 'type', '1.0') + obj.do_request('test_url', data='DATA', method='GET') + assert len(self.mocks['m_requests_Session_send'].call_args_list) == 1 + req = self.mocks['m_requests_Session_send'].call_args_list[0][0][0] + assert req.url == test_config['fog']['endpoint'] + 'test_url' + assert req.method == 'GET' + assert req.headers['fog-api-token'] == test_config['fog']['api_token'] + assert req.headers['fog-user-token'] == test_config['fog']['user_token'] + assert req.body == 'DATA' + + @mark.parametrize( + 'count', + [0, 1, 2], + ) + def test_get_host_data(self, count): + host_objs = [dict(id=i) for i in range(count)] + resp_obj = dict(count=count, hosts=host_objs) + self.mocks['m_requests_Session_send']\ + .return_value.json.return_value = resp_obj + obj = self.klass('name.fqdn', 'type', '1.0') + if count != 1: + with raises(RuntimeError): + result = obj.get_host_data() + return + result = obj.get_host_data() + assert len(self.mocks['m_requests_Session_send'].call_args_list) == 1 + req = self.mocks['m_requests_Session_send'].call_args_list[0][0][0] + assert req.url == test_config['fog']['endpoint'] + '/host' + assert req.body == '{"name": "name"}' + assert result == host_objs[0] + + @mark.parametrize( + 'count', + [0, 1, 2], + ) + def test_get_image_data(self, count): + img_objs = [dict(id=i) for i in range(count)] + resp_obj = dict(count=count, images=img_objs) + self.mocks['m_requests_Session_send']\ + .return_value.json.return_value = resp_obj + self.mocks['m_Remote_machine_type'].return_value = 'type1' + obj = self.klass('name.fqdn', 'windows', 'xp') + if count < 1: + with raises(RuntimeError): + result = obj.get_image_data() + return + result = obj.get_image_data() + assert len(self.mocks['m_requests_Session_send'].call_args_list) == 1 + 
req = self.mocks['m_requests_Session_send'].call_args_list[0][0][0] + assert req.url == test_config['fog']['endpoint'] + '/image' + assert req.body == '{"name": "type1_windows_xp"}' + assert result == img_objs[0] + + def test_suggest_image_names(self): + data = {'images': [ + {'name': 'mira_rhel_9.1'}, + {'name': 'mira_rhel_9.2'}, + ]} + self.mocks['m_requests_Session_send']\ + .return_value.json.return_value = data + self.mocks['m_Remote_machine_type'].return_value = 'mira' + # Instantiate the provisioner under test to call suggest_image_names() + obj = self.klass('name.fqdn', 'mira', '1.0') + result = obj.suggest_image_names() + assert len(self.mocks['m_requests_Session_send'].call_args_list) == 1 + req = self.mocks['m_requests_Session_send'].call_args_list[0][0][0] + assert req.url == test_config['fog']['endpoint'] + '/image/search/mira' + assert result == ['mira_rhel_9.1', 'mira_rhel_9.2'] + + def test_set_image(self): + self.mocks['m_Remote_hostname'].return_value = 'name.fqdn' + self.mocks['m_Remote_machine_type'].return_value = 'type1' + host_id = 999 + obj = self.klass('name.fqdn', 'type', '1.0') + with patch.multiple( + 'teuthology.provision.fog.FOG', + get_image_data=DEFAULT, + do_request=DEFAULT, + ) as local_mocks: + local_mocks['get_image_data'].return_value = dict(id='13') + obj.set_image(host_id) + local_mocks['do_request'].assert_called_once_with( + '/host/999', method='PUT', data='{"imageID": 13}', + ) + + def test_schedule_deploy_task(self): + host_id = 12 + tasktype_id = 6 + task_id = 5 + tasktype_result = dict(tasktypes=[dict(name='deploy', id=tasktype_id)]) + schedule_result = dict() + host_tasks = [dict( + createdTime=datetime.datetime.strftime( + datetime.datetime.now(datetime.timezone.utc), + self.klass.timestamp_format + ), + id=task_id, + )] + self.mocks['m_requests_Session_send']\ + .return_value.json.side_effect = [ + tasktype_result, schedule_result, + ] + with patch.multiple( + 'teuthology.provision.fog.FOG', + get_deploy_tasks=DEFAULT, + ) as local_mocks: 
local_mocks['get_deploy_tasks'].return_value = host_tasks + obj = self.klass('name.fqdn', 'type', '1.0') + result = obj.schedule_deploy_task(host_id) + assert len(local_mocks['get_deploy_tasks'].call_args_list) == 2 + assert len(self.mocks['m_requests_Session_send'].call_args_list) == 3 + assert result == task_id + + def test_get_deploy_tasks(self): + obj = self.klass('name.fqdn', 'type', '1.0') + resp_obj = dict( + count=2, + tasks=[ + dict(host=dict(name='notme')), + dict(host=dict(name='name')), + ] + ) + self.mocks['m_requests_Session_send']\ + .return_value.json.return_value = resp_obj + result = obj.get_deploy_tasks() + assert result[0]['host']['name'] == 'name' + + @mark.parametrize( + 'active_ids', + [ + [2, 4, 6, 8], + [1], + [], + ] + ) + def test_deploy_task_active(self, active_ids): + our_task_id = 4 + result_objs = [dict(id=task_id) for task_id in active_ids] + obj = self.klass('name.fqdn', 'type', '1.0') + with patch.multiple( + 'teuthology.provision.fog.FOG', + get_deploy_tasks=DEFAULT, + ) as local_mocks: + local_mocks['get_deploy_tasks'].return_value = result_objs + result = obj.deploy_task_active(our_task_id) + assert result is (our_task_id in active_ids) + + @mark.parametrize( + 'tries', + [3, 121], + ) + def test_wait_for_deploy_task(self, tries): + wait_results = [True for i in range(tries)] + [False] + obj = self.klass('name.fqdn', 'type', '1.0') + with patch.multiple( + 'teuthology.provision.fog.FOG', + deploy_task_active=DEFAULT, + ) as local_mocks: + local_mocks['deploy_task_active'].side_effect = wait_results + if tries >= 60: + with raises(MaxWhileTries): + obj.wait_for_deploy_task(9) + return + obj.wait_for_deploy_task(9) + assert len(local_mocks['deploy_task_active'].call_args_list) == \ + tries + 1 + + def test_cancel_deploy_task(self): + obj = self.klass('name.fqdn', 'type', '1.0') + with patch.multiple( + 'teuthology.provision.fog.FOG', + do_request=DEFAULT, + ) as local_mocks: + obj.cancel_deploy_task(10) + 
local_mocks['do_request'].assert_called_once_with( + '/task/cancel', + method='DELETE', + data='{"id": 10}', + ) + + @mark.parametrize( + 'tries', + [1, 101], + ) + def test_wait_for_ready_tries(self, tries): + connect_results = [MaxWhileTries for i in range(tries)] + [True] + obj = self.klass('name.fqdn', 'type', '1.0') + self.mocks['m_Remote_connect'].side_effect = connect_results + if tries >= 100: + with raises(MaxWhileTries): + obj._wait_for_ready() + return + obj._wait_for_ready() + assert len(self.mocks['m_Remote_connect'].call_args_list) == tries + 1 + + @mark.parametrize( + 'sentinel_present', + ([False, True]), + ) + def test_wait_for_ready_sentinel(self, sentinel_present): + config.fog['sentinel_file'] = '/a_file' + obj = self.klass('name.fqdn', 'type', '1.0') + if not sentinel_present: + self.mocks['m_Remote_run'].side_effect = [ + CommandFailedError(command='cmd', exitstatus=1)] + with raises(CommandFailedError): + obj._wait_for_ready() + else: + obj._wait_for_ready() + assert len(self.mocks['m_Remote_run'].call_args_list) == 1 + assert "'/a_file'" in \ + self.mocks['m_Remote_run'].call_args_list[0][1]['args'] diff --git a/teuthology/provision/test/test_init_provision.py b/teuthology/provision/test/test_init_provision.py new file mode 100644 index 000000000..e63c5aa36 --- /dev/null +++ b/teuthology/provision/test/test_init_provision.py @@ -0,0 +1,46 @@ +from copy import deepcopy +from pytest import raises +from teuthology.config import config + +import teuthology.provision + +test_config = dict( + pelagos=dict( + endpoint='http://pelagos.example:5000/', + machine_types='ptype1,ptype2,common_type', + ), + fog=dict( + endpoint='http://fog.example.com/fog', + api_token='API_TOKEN', + user_token='USER_TOKEN', + machine_types='ftype1,ftype2,common_type', + ) +) + +class TestInitProvision(object): + + def setup_method(self): + config.load(deepcopy(test_config)) + + def test_get_reimage_types(self): + reimage_types = teuthology.provision.get_reimage_types() + 
assert reimage_types == ["ptype1", "ptype2", "common_type", + "ftype1", "ftype2", "common_type"] + + def test_reimage(self): + class context: + pass + ctx = context() + ctx.os_type = 'sle' + ctx.os_version = '15.1' + with raises(Exception) as e_info: + teuthology.provision.reimage(ctx, 'f.q.d.n.org', 'not-defined-type') + e_str = str(e_info) + print("Caught exception: " + e_str) + assert e_str.find(r"configured\sprovisioners") == -1 + + with raises(Exception) as e_info: + teuthology.provision.reimage(ctx, 'f.q.d.n.org', 'common_type') + e_str = str(e_info) + print("Caught exception: " + e_str) + assert e_str.find(r"used\swith\sone\sprovisioner\sonly") == -1 diff --git a/teuthology/provision/test/test_pelagos.py b/teuthology/provision/test/test_pelagos.py new file mode 100644 index 000000000..43d474176 --- /dev/null +++ b/teuthology/provision/test/test_pelagos.py @@ -0,0 +1,46 @@ +from copy import deepcopy +from pytest import raises +from teuthology.config import config +from teuthology.provision import pelagos + +import teuthology.provision + + +test_config = dict( + pelagos=dict( + endpoint='http://pelagos.example:5000/', + machine_types='ptype1,ptype2', + ), +) + +class TestPelagos(object): + + def setup_method(self): + config.load(deepcopy(test_config)) + + def teardown_method(self): + pass + + def test_get_types(self): + #klass = pelagos.Pelagos + types = pelagos.get_types() + assert types == ["ptype1", "ptype2"] + + def test_disabled(self): + config.pelagos['endpoint'] = None + enabled = pelagos.enabled() + assert enabled == False + + def test_pelagos(self): + class context: + pass + + ctx = context() + ctx.os_type ='sle' + ctx.os_version = '15.1' + with raises(Exception) as e_info: + teuthology.provision.reimage(ctx, 'f.q.d.n.org', 'ptype1') + e_str = str(e_info) + print("Caught exception: " + e_str) + assert e_str.find(r"Name\sor\sservice\snot\sknown") == -1 + diff --git a/teuthology/prune.py b/teuthology/prune.py new file mode 100644 index 
000000000..dc720cb1e --- /dev/null +++ b/teuthology/prune.py @@ -0,0 +1,237 @@ +import gzip +import logging +import os +import shutil +import time + +import teuthology +from teuthology.contextutil import safe_while + +log = logging.getLogger(__name__) + + +# If we see this in any directory, we do not prune it +PRESERVE_FILE = '.preserve' + + +def main(args): + """ + Main function; parses args and calls prune_archive() + """ + verbose = args['--verbose'] + if verbose: + teuthology.log.setLevel(logging.DEBUG) + archive_dir = args['--archive'] + dry_run = args['--dry-run'] + pass_days = int(args['--pass']) + fail_days = int(args['--fail']) + remotes_days = int(args['--remotes']) + compress_days = int(args['--compress']) + + prune_archive( + archive_dir, pass_days, fail_days, remotes_days, compress_days, dry_run + ) + + +def prune_archive( + archive_dir, + pass_days, + fail_days, + remotes_days, + compress_days, + dry_run=False, +): + """ + Walk through the archive_dir, calling the cleanup functions to process + directories that might be old enough + """ + min_days = min(filter( + lambda n: n >= 0, [pass_days, fail_days, remotes_days])) + log.debug("Archive {archive} has {count} children".format( + archive=archive_dir, count=len(os.listdir(archive_dir)))) + # Use full paths + children = [os.path.join(archive_dir, p) for p in listdir(archive_dir)] + run_dirs = list() + for child in children: + # Ensure that the path is not a symlink, is a directory, and is old + # enough to process + if (not os.path.islink(child) and os.path.isdir(child) and + is_old_enough(child, min_days)): + run_dirs.append(child) + run_dirs.sort(key=lambda p: os.path.getctime(p), reverse=True) + for run_dir in run_dirs: + log.debug("Processing %s ..." 
% run_dir) + maybe_remove_jobs(run_dir, pass_days, fail_days, dry_run) + maybe_remove_remotes(run_dir, remotes_days, dry_run) + maybe_compress_logs(run_dir, compress_days, dry_run) + + +def listdir(path): + with safe_while(sleep=1, increment=1, tries=10) as proceed: + while proceed(): + try: + return os.listdir(path) + except OSError: + log.exception("Failed to list %s !" % path) + + +def should_preserve(dir_name): + """ + Should the directory be preserved? + + :returns: True if the directory contains a file named '.preserve'; False + otherwise + """ + preserve_path = os.path.join(dir_name, PRESERVE_FILE) + if os.path.isdir(dir_name) and os.path.exists(preserve_path): + return True + return False + + +def is_old_enough(file_name, days): + """ + :returns: True if the file's modification date is earlier than the amount + of days specified + """ + if days < 0: + return False + now = time.time() + secs_to_days = lambda s: s / (60 * 60 * 24) + age = now - os.path.getmtime(file_name) + if secs_to_days(age) > days: + return True + return False + + +def remove(path): + """ + Attempt to recursively remove a directory. If an OSError is encountered, + log it and continue. + """ + try: + shutil.rmtree(path) + except OSError: + log.exception("Failed to remove %s !" % path) + + +def maybe_remove_jobs(run_dir, pass_days, fail_days, dry_run=False): + """ + Remove entire job log directories if they are old enough and the job passed + """ + if pass_days < 0 and fail_days < 0: + return + contents = listdir(run_dir) + if PRESERVE_FILE in contents: + return + for child in contents: + job_path = os.path.join(run_dir, child) + # Ensure the path isn't marked for preservation and that it is a + # directory + if should_preserve(job_path) or not os.path.isdir(job_path): + continue + # Is it a job dir? 
+ summary_path = os.path.join(job_path, 'summary.yaml') + if not os.path.exists(summary_path): + continue + # Depending on whether it passed or failed, we have a different age + # threshold + summary_lines = [line.strip() for line in + open(summary_path).readlines()] + if 'success: true' in summary_lines: + status = 'passed' + days = pass_days + elif 'success: false' in summary_lines: + status = 'failed' + days = fail_days + else: + continue + # Ensure the directory is old enough to remove + if not is_old_enough(summary_path, days): + continue + log.info("{job} is a {days}-day old {status} job; removing".format( + job=job_path, days=days, status=status)) + if not dry_run: + remove(job_path) + + +def maybe_remove_remotes(run_dir, days, dry_run=False): + """ + Remove remote logs (not teuthology logs) from job directories if they are + old enough + """ + if days < 0: + return + contents = listdir(run_dir) + subdirs = dict( + remote='remote logs', + data='mon data', + ) + if PRESERVE_FILE in contents: + return + for child in contents: + item = os.path.join(run_dir, child) + # Ensure the path isn't marked for preservation, that it is a + # directory, and that it is old enough + if (should_preserve(item) or not os.path.isdir(item) or not + is_old_enough(item, days)): + continue + for (subdir, description) in subdirs.items(): + _maybe_remove_subdir(item, subdir, days, description, dry_run) + + +def _maybe_remove_subdir(job_dir, subdir, days, description, dry_run=False): + # Does the subdir exist? 
+ subdir_path = os.path.join(job_dir, subdir) + if not os.path.isdir(subdir_path): + return + log.info("{job} is {days} days old; removing {desc}".format( + job=job_dir, + days=days, + desc=description, + )) + if not dry_run: + remove(subdir_path) + + +def maybe_compress_logs(run_dir, days, dry_run=False): + if days < 0: + return + contents = listdir(run_dir) + if PRESERVE_FILE in contents: + return + for child in contents: + item = os.path.join(run_dir, child) + # Ensure the path isn't marked for preservation, that it is a + # directory, and that it is old enough + if (should_preserve(item) or not os.path.isdir(item) or not + is_old_enough(item, days)): + continue + log_name = 'teuthology.log' + log_path = os.path.join(item, log_name) + if not os.path.exists(log_path): + continue + log.info("{job} is {days} days old; compressing {name}".format( + job=item, + days=days, + name=log_name, + )) + if dry_run: + continue + zlog_path = log_path + '.gz' + try: + _compress(log_path, zlog_path) + except Exception: + log.exception("Failed to compress %s", log_path) + os.remove(zlog_path) + else: + os.remove(log_path) + + +def _compress(in_path, out_path): + """ + Compresses a file using gzip, preserving the original permissions, atime, + and mtime. Does not remove the original. 
+ """ + with open(in_path, 'rb') as src, gzip.open(out_path, 'wb') as dest: + shutil.copyfileobj(src, dest) + shutil.copystat(in_path, out_path) diff --git a/teuthology/reimage.py b/teuthology/reimage.py new file mode 100644 index 000000000..fdc90543a --- /dev/null +++ b/teuthology/reimage.py @@ -0,0 +1,57 @@ +import argparse +import logging + +import teuthology + +from teuthology.parallel import parallel +from teuthology.provision import reimage, get_reimage_types +from teuthology.lock import query, ops +from teuthology.misc import get_user +from teuthology.misc import decanonicalize_hostname as shortname + +log = logging.getLogger(__name__) + +def main(args): + if (args['--verbose']): + teuthology.log.setLevel(logging.DEBUG) + + ctx = argparse.Namespace() + ctx.os_type = args['--os-type'] + ctx.os_version = args['--os-version'] + + nodes = args['<nodes>'] + + reimage_types = get_reimage_types() + statuses = query.get_statuses(nodes) + owner = args['--owner'] or get_user() + unlocked = [shortname(_['name']) + for _ in statuses if not _['locked']] + if unlocked: + log.error( + "Some of the nodes are not locked: %s", unlocked) + exit(1) + + improper = [shortname(_['name']) for _ in statuses if _['locked_by'] != owner] + if improper: + log.error( + "Some of the nodes are not owned by '%s': %s", owner, improper) + exit(1) + + irreimageable = [shortname(_['name']) for _ in statuses + if _['machine_type'] not in reimage_types] + if irreimageable: + log.error( + "Following nodes cannot be reimaged because their machine type " + "is not reimageable: %s", irreimageable) + exit(1) + + def reimage_node(ctx, machine_name, machine_type): + ops.update_nodes([machine_name], True) + reimage(ctx, machine_name, machine_type) + ops.update_nodes([machine_name]) + log.debug("Node '%s' reimaging is complete", machine_name) + + with parallel() as p: + for node in statuses: + log.debug("Start node '%s' reimaging", node['name']) + p.spawn(reimage_node, ctx, shortname(node['name']), 
node['machine_type']) diff --git a/teuthology/repo_utils.py b/teuthology/repo_utils.py new file mode 100644 index 000000000..db6e87a8c --- /dev/null +++ b/teuthology/repo_utils.py @@ -0,0 +1,470 @@ +import functools +import logging +import os +import re +import shutil +import subprocess +import time + +import teuthology.exporter as exporter + +from teuthology import misc +from teuthology.util.flock import FileLock +from teuthology.config import config +from teuthology.contextutil import MaxWhileTries, safe_while +from teuthology.exceptions import BootstrapError, BranchNotFoundError, CommitNotFoundError, GitError + +log = logging.getLogger(__name__) + + +# Repos must not have been fetched in the last X seconds to get fetched again. +# Similar for teuthology's bootstrap +FRESHNESS_INTERVAL = 60 + + +def touch_file(path): + out = subprocess.check_output(('touch', path)) + if out: + log.info(out) + + +def is_fresh(path): + """ + Has this file been modified in the last FRESHNESS_INTERVAL seconds? 
+ + Returns False if the file does not exist + """ + if not os.path.exists(path): + return False + elif time.time() - os.stat(path).st_mtime < FRESHNESS_INTERVAL: + return True + return False + + +def build_git_url(project, project_owner='ceph'): + """ + Return the git URL to clone the project + """ + if project == 'ceph-qa-suite': + base = config.get_ceph_qa_suite_git_url() + elif project == 'ceph-cm-ansible': + base = config.get_ceph_cm_ansible_git_url() + elif project == 'ceph': + base = config.get_ceph_git_url() + elif project == 'teuthology': + base = config.get_teuthology_git_url() + else: + base = 'https://github.com/{project_owner}/{project}' + url_templ = re.sub(r'\.git$', '', base) + return url_templ.format(project_owner=project_owner, project=project) + + +@functools.lru_cache() +def ls_remote(url, ref): + """ + Return the current sha1 for a given repository and ref + + :returns: The sha1 if found; else None + """ + sha1 = None + cmd = "git ls-remote {} {}".format(url, ref) + result = subprocess.check_output( + cmd, shell=True).split() + if result: + sha1 = result[0].decode() + log.debug("{} -> {}".format(cmd, sha1)) + return sha1 + + +def current_branch(path: str) -> str: + """ + Return the current branch for a given on-disk repository. + + :returns: the current branch, or an empty string if none is found. + """ + # git branch --show-current was added in 2.22.0, and we can't assume + # our version is new enough. + cmd = "git rev-parse --abbrev-ref HEAD" + result = subprocess.Popen( + cmd, + shell=True, + cwd=path, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + ).communicate()[0].strip().decode() + if result == "HEAD": + return "" + return result + + +def enforce_repo_state(repo_url, dest_path, branch, commit=None, remove_on_error=True): + """ + Use git to either clone or update a given repo, forcing it to switch to the + specified branch. 
+ + :param repo_url: The full URL to the repo (not including the branch) + :param dest_path: The full path to the destination directory + :param branch: The branch. + :param commit: The sha1 to checkout. Defaults to None, which uses HEAD of the branch. + :param remove_on_error: Whether or not to remove dest_dir when an error occurs + :raises: BranchNotFoundError if the branch is not found; + CommitNotFoundError if the commit is not found; + GitError for other errors + """ + validate_branch(branch) + sentinel = os.path.join(dest_path, '.fetched') + # sentinel to track whether the repo has checked out the intended + # version, in addition to being cloned + repo_reset = os.path.join(dest_path, '.fetched_and_reset') + try: + if not os.path.isdir(dest_path): + clone_repo(repo_url, dest_path, branch, shallow=commit is None) + elif not commit and not is_fresh(sentinel): + set_remote(dest_path, repo_url) + fetch_branch(dest_path, branch) + touch_file(sentinel) + + if commit and os.path.exists(repo_reset): + return + + reset_repo(repo_url, dest_path, branch, commit) + touch_file(repo_reset) + # remove_pyc_files(dest_path) + except (BranchNotFoundError, CommitNotFoundError): + if remove_on_error: + shutil.rmtree(dest_path, ignore_errors=True) + raise + + +def clone_repo(repo_url, dest_path, branch, shallow=True): + """ + Clone a repo into a path + + :param repo_url: The full URL to the repo (not including the branch) + :param dest_path: The full path to the destination directory + :param branch: The branch. 
+ :param shallow: Whether to perform a shallow clone (--depth 1) + :raises: BranchNotFoundError if the branch is not found; + GitError for other errors + """ + validate_branch(branch) + log.info("Cloning %s %s from upstream", repo_url, branch) + if branch.startswith('refs/'): + clone_repo_ref(repo_url, dest_path, branch) + return + args = ['git', 'clone', '--single-branch'] + if shallow: + args.extend(['--depth', '1']) + args.extend(['--branch', branch, repo_url, dest_path]) + proc = subprocess.Popen( + args, + cwd=os.path.dirname(dest_path), + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + + not_found_str = "Remote branch %s not found" % branch + out = proc.stdout.read().decode() + result = proc.wait() + # Newer git versions will bail if the branch is not found, but older ones + # will not. Fortunately they both output similar text. + if result != 0: + log.error(out) + if not_found_str in out: + if result == 0: + # Old git left a repo with the wrong branch. Remove it. + shutil.rmtree(dest_path, ignore_errors=True) + raise BranchNotFoundError(branch, repo_url) + elif result != 0: + # Unknown error + raise GitError("git clone failed!") + + +def rsstrip(s, suffix): + return s[:-len(suffix)] if s.endswith(suffix) else s + + +def lsstrip(s, prefix): + return s[len(prefix):] if s.startswith(prefix) else s + + +def remote_ref_from_ref(ref, remote='origin'): + if ref.startswith('refs/pull/'): + return 'refs/remotes/' + remote + lsstrip(ref, 'refs') + elif ref.startswith('refs/heads/'): + return 'refs/remotes/' + remote + lsstrip(ref, 'refs/heads') + raise GitError("Unsupported ref '%s'" % ref) + + +def local_branch_from_ref(ref): + if ref.startswith('refs/pull/'): + s = lsstrip(ref, 'refs/pull/') + s = rsstrip(s, '/merge') + s = rsstrip(s, '/head') + return "PR#%s" % s + elif ref.startswith('refs/heads/'): + return lsstrip(ref, 'refs/heads/') + raise GitError("Unsupported ref '%s', try 'refs/heads/' or 'refs/pull/'" % ref) + + +def fetch_refspec(ref): + if '/' in 
ref: + remote_ref = remote_ref_from_ref(ref) + return "+%s:%s" % (ref, remote_ref) + else: + # looks like a branch name + return ref + + +def clone_repo_ref(repo_url, dest_path, ref): + branch_name = local_branch_from_ref(ref) + remote_ref = remote_ref_from_ref(ref) + misc.sh('git init %s' % dest_path) + misc.sh('git remote add origin %s' % repo_url, cwd=dest_path) + #misc.sh('git fetch --depth 1 origin %s' % fetch_refspec(ref), + # cwd=dest_path) + fetch_branch(dest_path, ref) + misc.sh('git checkout -b %s %s' % (branch_name, remote_ref), + cwd=dest_path) + + +def set_remote(repo_path, repo_url): + """ + Call "git remote set-url origin <repo_url>" + + :param repo_url: The full URL to the repo (not including the branch) + :param repo_path: The full path to the repository + :raises: GitError if the operation fails + """ + log.debug("Setting repo remote to %s", repo_url) + proc = subprocess.Popen( + ('git', 'remote', 'set-url', 'origin', repo_url), + cwd=repo_path, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + if proc.wait() != 0: + out = proc.stdout.read() + log.error(out) + raise GitError("git remote set-url failed!") + + +def fetch(repo_path): + """ + Call "git fetch -p origin" + + :param repo_path: The full path to the repository + :raises: GitError if the operation fails + """ + log.info("Fetching from upstream into %s", repo_path) + proc = subprocess.Popen( + ('git', 'fetch', '-p', 'origin'), + cwd=repo_path, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + if proc.wait() != 0: + out = proc.stdout.read().decode() + log.error(out) + raise GitError("git fetch failed!") + + +def fetch_branch(repo_path, branch, shallow=True): + """ + Call "git fetch -p origin <branch>" + + :param repo_path: The full path to the repository on-disk + :param branch: The branch. 
+ :param shallow: Whether to perform a shallow fetch (--depth 1) + :raises: BranchNotFoundError if the branch is not found; + GitError for other errors + """ + validate_branch(branch) + log.info("Fetching %s from origin", repo_path.split("/")[-1]) + args = ['git', 'fetch'] + if shallow: + args.extend(['--depth', '1']) + args.extend(['-p', 'origin', fetch_refspec(branch)]) + proc = subprocess.Popen( + args, + cwd=repo_path, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + if proc.wait() != 0: + not_found_str = "fatal: couldn't find remote ref %s" % branch + out = proc.stdout.read().decode() + log.error(out) + if not_found_str in out.lower(): + raise BranchNotFoundError(branch) + else: + raise GitError("git fetch failed!") + + +def reset_repo(repo_url, dest_path, branch, commit=None): + """ + + :param repo_url: The full URL to the repo (not including the branch) + :param dest_path: The full path to the destination directory + :param branch: The branch. + :param commit: The sha1 to checkout. Defaults to None, which uses HEAD of the branch. + :raises: BranchNotFoundError if the branch is not found; + CommitNotFoundError if the commit is not found; + GitError for other errors + """ + validate_branch(branch) + if '/' in branch: + reset_branch = lsstrip(remote_ref_from_ref(branch), 'refs/remotes/') + else: + reset_branch = 'origin/%s' % branch + reset_ref = commit or reset_branch + log.debug('Resetting repo at %s to %s', dest_path, reset_ref) + # This try/except block will notice if the requested branch doesn't + # exist, whether it was cloned or fetched. 
+ try: + subprocess.check_output( + ('git', 'reset', '--hard', reset_ref), + cwd=dest_path, + ) + except subprocess.CalledProcessError: + if commit: + raise CommitNotFoundError(commit, repo_url) + raise BranchNotFoundError(branch, repo_url) + + +def remove_pyc_files(dest_path): + subprocess.check_call( + ['find', dest_path, '-name', '*.pyc', '-exec', 'rm', '{}', ';'] + ) + + +def validate_branch(branch): + if ' ' in branch: + raise ValueError("Illegal branch name: '%s'" % branch) + + +def fetch_repo(url, branch, commit=None, bootstrap=None, lock=True): + """ + Make sure we have a given project's repo checked out and up-to-date with + the current branch requested + + :param url: The URL to the repo + :param bootstrap: An optional callback function to execute. Gets passed a + dest_dir argument: the path to the repo on-disk. + :param branch: The branch we want + :param commit: The sha1 to checkout. Defaults to None, which uses HEAD of the branch. + :returns: The destination path + """ + src_base_path = config.src_base_path + if not os.path.exists(src_base_path): + os.mkdir(src_base_path) + ref_dir = ref_to_dirname(commit or branch) + dirname = '%s_%s' % (url_to_dirname(url), ref_dir) + dest_path = os.path.join(src_base_path, dirname) + # only let one worker create/update the checkout at a time + lock_path = dest_path.rstrip('/') + '.lock' + with FileLock(lock_path, noop=not lock): + with safe_while(sleep=10, tries=6) as proceed: + try: + while proceed(): + try: + enforce_repo_state(url, dest_path, branch, commit) + if bootstrap: + sentinel = os.path.join(dest_path, '.bootstrapped') + if commit and os.path.exists(sentinel) or is_fresh(sentinel): + log.info( + "Skipping bootstrap as it was already done in the last %ss", + FRESHNESS_INTERVAL, + ) + break + bootstrap(dest_path) + touch_file(sentinel) + break + except GitError: + log.exception("Git error encountered; retrying") + except BootstrapError: + log.exception("Bootstrap error encountered; retrying") + except 
MaxWhileTries: + shutil.rmtree(dest_path, ignore_errors=True) + raise + return dest_path + + +def ref_to_dirname(branch): + if '/' in branch: + return local_branch_from_ref(branch) + else: + return branch + + +def url_to_dirname(url): + """ + Given a URL, returns a string that's safe to use as a directory name. + Examples: + + git@git.ceph.com/ceph-qa-suite.git -> git.ceph.com_ceph-qa-suite + git://git.ceph.com/ceph-qa-suite.git -> git.ceph.com_ceph-qa-suite + https://github.com/ceph/ceph -> github.com_ceph_ceph + https://github.com/liewegas/ceph.git -> github.com_liewegas_ceph + file:///my/dir/has/ceph.git -> my_dir_has_ceph + """ + # Strip protocol from left-hand side + string = re.match('(?:.*://|.*@)(.*)', url).groups()[0] + # Strip '.git' from the right-hand side + string = string.rstrip('.git') + # Replace certain characters with underscores + string = re.sub('[:/]', '_', string) + # Remove duplicate underscores + string = re.sub('_+', '_', string) + # Remove leading or trailing underscore + string = string.strip('_') + return string + + +def fetch_qa_suite(branch, commit=None, lock=True): + """ + Make sure ceph-qa-suite is checked out. + + :param branch: The branch to fetch + :param commit: The sha1 to checkout. Defaults to None, which uses HEAD of the branch. + :returns: The destination path + """ + return fetch_repo(config.get_ceph_qa_suite_git_url(), + branch, commit, lock=lock) + + +def fetch_teuthology(branch, commit=None, lock=True): + """ + Make sure we have the correct teuthology branch checked out and up-to-date + + :param branch: The branch we want + :param commit: The sha1 to checkout. Defaults to None, which uses HEAD of the branch. 
+ :returns: The destination path + """ + url = config.get_teuthology_git_url() + return fetch_repo(url, branch, commit, bootstrap_teuthology, lock) + + +def bootstrap_teuthology(dest_path): + with exporter.BootstrapTime().time(): + log.info("Bootstrapping %s", dest_path) + # This magic makes the bootstrap script not attempt to clobber an + # existing virtualenv. But the branch's bootstrap needs to actually + # check for the NO_CLOBBER variable. + env = os.environ.copy() + env['NO_CLOBBER'] = '1' + cmd = './bootstrap' + boot_proc = subprocess.Popen( + cmd, shell=True, + cwd=dest_path, + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + out, _ = boot_proc.communicate() + returncode = boot_proc.wait() + log.info("Bootstrap exited with status %s", returncode) + if returncode != 0: + for line in out.split("\n"): + log.warning(line.strip()) + venv_path = os.path.join(dest_path, 'virtualenv') + log.info("Removing %s", venv_path) + shutil.rmtree(venv_path, ignore_errors=True) + raise BootstrapError("Bootstrap failed!") diff --git a/teuthology/report.py b/teuthology/report.py new file mode 100644 index 000000000..f0a447201 --- /dev/null +++ b/teuthology/report.py @@ -0,0 +1,596 @@ +import os +import yaml +import json +import re +import requests +import logging +import random +import socket +from datetime import datetime + +import teuthology +import teuthology.exporter +from teuthology.config import config +from teuthology.contextutil import safe_while +from teuthology.job_status import get_status, set_status + +report_exceptions = (requests.exceptions.RequestException, socket.error) + + +def init_logging(): + """ + Set up logging for the module + + :returns: a logger + """ + log = logging.getLogger(__name__) + return log + + +def main(args): + run = args['--run'] + job = args['--job'] + dead = args['--dead'] + refresh = dead or args['--refresh'] + server = args['--server'] + if server: + config.results_server = server + if 
args['--verbose']: + teuthology.log.setLevel(logging.DEBUG) + + archive_base = os.path.abspath(os.path.expanduser(args['--archive'])) or \ + config.archive_base + save = not args['--no-save'] + + log = init_logging() + reporter = ResultsReporter(archive_base, save=save, refresh=refresh, + log=log) + if dead and not job: + for run_name in run: + try_mark_run_dead(run[0]) + elif dead and len(run) == 1 and job: + reporter.report_jobs(run[0], job, dead=True) + elif len(run) == 1 and job: + reporter.report_jobs(run[0], job) + elif run and len(run) > 1: + reporter.report_runs(run) + elif run: + reporter.report_run(run[0]) + elif args['--all-runs']: + reporter.report_all_runs() + + +class ResultsSerializer(object): + """ + This class exists to poke around in the archive directory doing things like + assembling lists of test runs, lists of their jobs, and merging sets of job + YAML files together to form JSON objects. + """ + yamls = ('orig.config.yaml', 'config.yaml', 'info.yaml', 'summary.yaml') + + def __init__(self, archive_base, log=None): + self.archive_base = archive_base or config.archive_base + self.log = log or init_logging() + + + def job_info(self, run_name, job_id, pretty=False, simple=False): + """ + Given a run name and job id, merge the job's YAML files together. + + :param run_name: The name of the run. + :param job_id: The job's id. + :param simple(bool): Read less data for speed (only orig.config.yaml/info.yaml) + :returns: A dict. 
+ """ + job_archive_dir = os.path.join(self.archive_base, + run_name, + job_id) + job_info = {} + + if simple: + self.yamls = ('orig.config.yaml', 'info.yaml') + + for yaml_name in self.yamls: + yaml_path = os.path.join(job_archive_dir, yaml_name) + if not os.path.exists(yaml_path): + continue + with open(yaml_path) as yaml_file: + partial_info = yaml.safe_load(yaml_file) + if partial_info is not None: + job_info.update(partial_info) + + if 'job_id' not in job_info: + job_info['job_id'] = job_id + + if simple: + return job_info + + log_path = os.path.join(job_archive_dir, 'teuthology.log') + if os.path.exists(log_path): + mtime = int(os.path.getmtime(log_path)) + mtime_dt = datetime.fromtimestamp(mtime) + job_info['updated'] = str(mtime_dt) + + + return job_info + + def json_for_job(self, run_name, job_id, pretty=False): + """ + Given a run name and job id, merge the job's YAML files together to + create a JSON object. + + :param run_name: The name of the run. + :param job_id: The job's id. + :returns: A JSON object. + """ + job_info = self.job_info(run_name, job_id, pretty) + if pretty: + job_json = json.dumps(job_info, sort_keys=True, indent=4) + else: + job_json = json.dumps(job_info) + + return job_json + + def jobs_for_run(self, run_name): + """ + Given a run name, look on the filesystem for directories containing job + information, and return a dict mapping job IDs to job directories. + + :param run_name: The name of the run. 
+ :returns: A dict like: {'1': '/path/to/1', '2': 'path/to/2'} + """ + archive_dir = os.path.join(self.archive_base, run_name) + if not os.path.isdir(archive_dir): + return {} + jobs = {} + for item in os.listdir(archive_dir): + if not re.match(r'\d+$', item): + continue + job_id = item + job_dir = os.path.join(archive_dir, job_id) + if os.path.isdir(job_dir): + jobs[job_id] = job_dir + return jobs + + def running_jobs_for_run(self, run_name): + """ + Like jobs_for_run(), but only returns jobs with no summary.yaml + + :param run_name: The name of the run. + :returns: A dict like: {'1': '/path/to/1', '2': 'path/to/2'} + """ + jobs = self.jobs_for_run(run_name) + for job_id in list(jobs): + if os.path.exists(os.path.join(jobs[job_id], 'summary.yaml')): + jobs.pop(job_id) + return jobs + + @property + def all_runs(self): + """ + Look in the base archive directory for all test runs. Return a list of + their names. + """ + archive_base = self.archive_base + if not os.path.isdir(archive_base): + return [] + runs = [] + for run_name in os.listdir(archive_base): + if not os.path.isdir(os.path.join(archive_base, run_name)): + continue + runs.append(run_name) + return runs + + +class ResultsReporter(object): + last_run_file = 'last_successful_run' + + def __init__(self, archive_base=None, base_uri=None, save=False, + refresh=False, log=None): + self.log = log or init_logging() + self.archive_base = archive_base or config.archive_base + self.base_uri = base_uri or config.results_server + if self.base_uri: + self.base_uri = self.base_uri.rstrip('/') + + self.serializer = ResultsSerializer(archive_base, log=self.log) + self.save_last_run = save + self.refresh = refresh + self.session = self._make_session() + + if not self.base_uri: + msg = "No results_server set in {yaml}; cannot report results" + self.log.warning(msg.format(yaml=config.yaml_path)) + + def _make_session(self, max_retries=10): + session = requests.Session() + adapter = 
requests.adapters.HTTPAdapter(max_retries=max_retries) + session.mount('http://', adapter) + return session + + def report_all_runs(self): + """ + Report *all* runs in self.archive_dir to the results server. + """ + all_runs = self.serializer.all_runs + last_run = self.last_run + if self.save_last_run and last_run and last_run in all_runs: + next_index = all_runs.index(last_run) + 1 + runs = all_runs[next_index:] + else: + runs = all_runs + return self.report_runs(runs) + + def report_runs(self, run_names): + """ + Report several runs to the results server. + + :param run_names: The names of the runs. + """ + num_runs = len(run_names) + num_jobs = 0 + self.log.info("Posting %s runs", num_runs) + for run in run_names: + job_count = self.report_run(run) + num_jobs += job_count + if self.save_last_run: + self.last_run = run + del self.last_run + self.log.info("Total: %s jobs in %s runs", num_jobs, len(run_names)) + + def report_run(self, run_name, dead=False): + """ + Report a single run to the results server. + + :param run_name: The name of the run. + :returns: The number of jobs reported. + """ + jobs = self.serializer.jobs_for_run(run_name) + self.log.info("{name} {jobs} jobs dead={dead}".format( + name=run_name, + jobs=len(jobs), + dead=str(dead), + )) + if jobs: + if not self.refresh: + response = self.session.head("{base}/runs/{name}/".format( + base=self.base_uri, name=run_name)) + if response.status_code == 200: + self.log.info(" already present; skipped") + return 0 + self.report_jobs(run_name, jobs.keys(), dead=dead) + elif not jobs: + self.log.debug(" no jobs; skipped") + return len(jobs) + + def report_jobs(self, run_name, job_ids, dead=False): + """ + Report several jobs to the results server. + + :param run_name: The name of the run. 
+ :param job_ids: The jobs' ids + """ + for job_id in job_ids: + self.report_job(run_name, job_id, dead=dead) + + def report_job(self, run_name, job_id, job_info=None, dead=False): + """ + Report a single job to the results server. + + :param run_name: The name of the run. The run must already exist. + :param job_id: The job's id + :param job_info: The job's info dict. Optional - if not present, we + look at the archive. + """ + if job_info is not None and not isinstance(job_info, dict): + raise TypeError("job_info must be a dict") + run_uri = "{base}/runs/{name}/jobs/".format( + base=self.base_uri, name=run_name,) + if job_info is None: + job_info = self.serializer.job_info(run_name, job_id) + if dead and get_status(job_info) is None: + set_status(job_info, 'dead') + job_json = json.dumps(job_info) + headers = {'content-type': 'application/json'} + + inc = random.uniform(0, 1) + with safe_while( + sleep=1, increment=inc, action=f'report job {job_id}') as proceed: + while proceed(): + response = self.session.post(run_uri, data=job_json, headers=headers) + + if response.status_code == 200: + return + + # This call is wrapped in a try/except because of: + # http://tracker.ceph.com/issues/8166 + try: + resp_json = response.json() + except ValueError: + resp_json = dict() + + if resp_json: + msg = resp_json.get('message', '') + else: + msg = response.text + + if msg and msg.endswith('already exists'): + job_uri = os.path.join(run_uri, job_id, '') + response = self.session.put(job_uri, data=job_json, + headers=headers) + if response.status_code == 200: + return + elif msg: + self.log.error( + "POST to {uri} failed with status {status}: {msg}".format( + uri=run_uri, + status=response.status_code, + msg=msg, + )) + response.raise_for_status() + + @property + def last_run(self): + """ + The last run to be successfully reported. 
+ """ + if hasattr(self, '__last_run'): + return self.__last_run + elif os.path.exists(self.last_run_file): + with open(self.last_run_file) as f: + self.__last_run = f.read().strip() + return self.__last_run + + @last_run.setter + def last_run(self, run_name): + self.__last_run = run_name + with open(self.last_run_file, 'w') as f: + f.write(run_name) + + @last_run.deleter + def last_run(self): + self.__last_run = None + if os.path.exists(self.last_run_file): + os.remove(self.last_run_file) + + def get_jobs(self, run_name, job_id=None, fields=None): + """ + Query the results server for jobs in a run + + :param run_name: The name of the run + :param job_id: Optionally get a single job instead of all + :param fields: Optional. A list of fields to include in the result. + Defaults to returning all fields. + """ + uri = "{base}/runs/{name}/jobs/".format(base=self.base_uri, + name=run_name) + if job_id: + uri = os.path.join(uri, job_id) + if fields: + if 'job_id' not in fields: + fields.append('job_id') + uri += "?fields=" + ','.join(fields) + response = self.session.get(uri) + response.raise_for_status() + return response.json() + + def get_run(self, run_name, fields=None): + """ + Query the results server for a run + + :param run_name: The name of the run + :param fields: Optional. A list of fields to include in the result. + Defaults to returning all fields. 
+ """ + uri = "{base}/runs/{name}".format(base=self.base_uri, name=run_name) + if fields: + uri += "?fields=" + ','.join(fields) + response = self.session.get(uri) + response.raise_for_status() + return response.json() + + def _parse_log_line(self, line, prefix): + # parse log lines like + # 2018-07-27T00:30:55.967 INFO:teuthology.results:subset: '35/999' + msg = line.split(' ', 1)[1].split(':', 2)[-1] + if not msg.startswith(prefix): + return None + else: + return msg[len(prefix):].strip(" '") + + def get_rerun_conf(self, run_name): + log_path = os.path.join(self.archive_base, run_name, 'results.log') + # parse the log file generated by teuthology.results.results() + subset = None + no_nested_subset = None + seed = None + with open(log_path) as results_log: + for line in results_log: + if ':' not in line: + # stop if this does not look line a log line + break + line = line.strip() + if subset is None: + subset = self._parse_log_line(line, 'subset:') + if no_nested_subset is None: + no_nested_subset = self._parse_log_line(line, 'no_nested_subset:') + if seed is None: + seed = self._parse_log_line(line, 'seed:') + if subset is not None: + subset = tuple(int(i) for i in subset.split('/')) + if no_nested_subset is not None: + no_nested_subset = bool(no_nested_subset) + if seed is not None: + seed = int(seed) + return subset, no_nested_subset, seed + + def delete_job(self, run_name, job_id): + """ + Delete a job from the results server. + + :param run_name: The name of the run + :param job_id: The job's id + """ + uri = "{base}/runs/{name}/jobs/{job_id}/".format( + base=self.base_uri, name=run_name, job_id=job_id) + response = self.session.delete(uri) + response.raise_for_status() + + def delete_jobs(self, run_name, job_ids): + """ + Delete multiple jobs from the results server. 
+ + :param run_name: The name of the run + :param job_ids: A list of job ids + """ + for job_id in job_ids: + self.delete_job(self, run_name, job_id) + + def delete_run(self, run_name): + """ + Delete a run from the results server. + + :param run_name: The name of the run + """ + uri = "{base}/runs/{name}/".format( + base=self.base_uri, name=run_name) + response = self.session.delete(uri) + response.raise_for_status() + + +def push_job_info(run_name, job_id, job_info, base_uri=None): + """ + Push a job's info (example: ctx.config) to the results server. + + :param run_name: The name of the run. + :param job_id: The job's id + :param job_info: A dict containing the job's information. + :param base_uri: The endpoint of the results server. If you leave it out + ResultsReporter will ask teuthology.config. + """ + reporter = ResultsReporter() + if not reporter.base_uri: + return + reporter.report_job(run_name, job_id, job_info) + status = get_status(job_info) + if status in ["pass", "fail", "dead"] and "machine_type" in job_info: + teuthology.exporter.JobResults().record( + machine_type=job_info["machine_type"], + status=status, + ) + + +def try_push_job_info(job_config, extra_info=None): + """ + Wrap push_job_info, gracefully doing nothing if: + Anything inheriting from requests.exceptions.RequestException is raised + A socket.error is raised + config.results_server is not set + config['job_id'] is not present or is None + + :param job_config: The ctx.config object to push + :param extra_info: Optional second dict to push + """ + log = init_logging() + + if not config.results_server: + log.warning('No results_server in config; not reporting results') + return + + if job_config.get('job_id') is None: + log.warning('No job_id found; not reporting results') + return + + run_name = job_config['name'] + job_id = job_config['job_id'] + + if extra_info is not None: + job_info = job_config.copy() + job_info.update(extra_info) + else: + job_info = job_config + + try: + 
log.debug("Pushing job info to %s", config.results_server) + push_job_info(run_name, job_id, job_info) + return + except report_exceptions: + log.exception("Could not report results to %s", + config.results_server) + + +def try_delete_jobs(run_name, job_ids, delete_empty_run=True): + """ + Using the same error checking and retry mechanism as try_push_job_info(), + delete one or more jobs + + :param run_name: The name of the run. + :param job_ids: Either a single job_id, or a list of job_ids + :param delete_empty_run: If this would empty the run, delete it. + """ + log = init_logging() + + if not isinstance(job_ids, list): + if isinstance(job_ids, int): + job_ids = [str(job_ids)] + elif isinstance(job_ids, bytes): + job_ids = [str(job_ids.decode())] + else: + job_ids = [job_ids] + + reporter = ResultsReporter() + if not reporter.base_uri: + return + + log.debug("Deleting jobs from {server}: {jobs}".format( + server=config.results_server, jobs=str(job_ids))) + + if delete_empty_run: + got_jobs = reporter.get_jobs(run_name, fields=['job_id']) + got_job_ids = [j['job_id'] for j in got_jobs] + if sorted(got_job_ids) == sorted(job_ids): + try: + reporter.delete_run(run_name) + return + except report_exceptions: + log.exception("Run deletion failed") + + def try_delete_job(job_id): + try: + reporter.delete_job(run_name, job_id) + return + except report_exceptions: + log.exception("Job deletion failed") + + for job_id in job_ids: + try_delete_job(job_id) + + +def try_mark_run_dead(run_name): + """ + Using the same error checking and retry mechanism as try_push_job_info(), + mark any unfinished runs as dead. + + :param run_name: The name of the run. 
+ """ + log = init_logging() + reporter = ResultsReporter() + if not reporter.base_uri: + return + + log.debug("Marking run as dead: {name}".format(name=run_name)) + jobs = reporter.get_jobs(run_name, fields=['status']) + for job in jobs: + if job['status'] not in ['pass', 'fail', 'dead']: + job_id = job['job_id'] + try: + log.info("Marking job {job_id} as dead".format(job_id=job_id)) + reporter.report_job(run_name, job['job_id'], dead=True) + if "machine_type" in job: + teuthology.exporter.JobResults().record( + machine_type=job["machine_type"], + status=job["status"], + ) + except report_exceptions: + log.exception("Could not mark job as dead: {job_id}".format( + job_id=job_id)) diff --git a/teuthology/results.py b/teuthology/results.py new file mode 100644 index 000000000..aae991eaf --- /dev/null +++ b/teuthology/results.py @@ -0,0 +1,272 @@ +import os +import time +import logging +from collections import OrderedDict +from textwrap import dedent +from textwrap import fill + +import teuthology +from teuthology.config import config +from teuthology import misc +from teuthology.report import ResultsReporter +from teuthology.scrape import Scraper + +log = logging.getLogger(__name__) + +UNFINISHED_STATUSES = ('queued', 'running', 'waiting') + + +def main(args): + + log = logging.getLogger(__name__) + if args['--verbose']: + teuthology.log.setLevel(logging.DEBUG) + + if not args['--dry-run']: + log_path = os.path.join(args['--archive-dir'], 'results.log') + teuthology.setup_log_file(log_path) + + try: + if args['--seed']: + note_rerun_params(args['--subset'], args['--no-nested-subset'], args['--seed']) + else: + results(args['--archive-dir'], args['--name'], args['--email'], + int(args['--timeout']), args['--dry-run']) + except Exception: + log.exception('error generating memo/results') + raise + + +def note_rerun_params(subset, no_nested_subset, seed): + if subset: + log.info('subset: %r', subset) + if no_nested_subset: + log.info('no_nested_subset: %r', 
no_nested_subset) + if seed: + log.info('seed: %r', seed) + + +def results(archive_dir, name, email, timeout, dry_run): + starttime = time.time() + + if timeout: + log.info('Waiting up to %d seconds for tests to finish...', timeout) + + reporter = ResultsReporter() + while timeout > 0: + if time.time() - starttime > timeout: + log.warning('test(s) did not finish before timeout of %d seconds', + timeout) + break + jobs = reporter.get_jobs(name, fields=['job_id', 'status']) + unfinished_jobs = [job for job in jobs if job['status'] in + UNFINISHED_STATUSES] + if not unfinished_jobs: + log.info('Tests finished! gathering results...') + break + time.sleep(60) + + (subject, body) = build_email_body(name) + + Scraper(archive_dir).analyze() + if email and dry_run: + print("From: %s" % (config.results_sending_email or 'teuthology')) + print("To: %s" % email) + print("Subject: %s" % subject) + print(body) + elif email: + email_results( + subject=subject, + from_=(config.results_sending_email or 'teuthology'), + to=email, + body=body, + ) + + +def email_results(subject, from_, to, body): + log.info('Sending results to {to}: {body}'.format(to=to, body=body)) + import smtplib + from email.mime.text import MIMEText + msg = MIMEText(body) + msg['Subject'] = subject + msg['From'] = from_ + msg['To'] = to + log.debug('sending email %s', msg.as_string()) + smtp = smtplib.SMTP('localhost') + smtp.sendmail(msg['From'], [msg['To']], msg.as_string()) + smtp.quit() + + +def build_email_body(name, _reporter=None): + stanzas = OrderedDict([ + ('fail', dict()), + ('dead', dict()), + ('running', dict()), + ('waiting', dict()), + ('queued', dict()), + ('pass', dict()), + ]) + reporter = _reporter or ResultsReporter() + fields = ('job_id', 'status', 'description', 'duration', 'failure_reason', + 'sentry_event', 'log_href') + jobs = reporter.get_jobs(name, fields=fields) + jobs.sort(key=lambda job: job['job_id']) + + for job in jobs: + job_stanza = format_job(name, job) + 
stanzas[job['status']][job['job_id']] = job_stanza + + sections = OrderedDict.fromkeys(stanzas.keys(), '') + subject_fragments = [] + for status in sections.keys(): + stanza = stanzas[status] + if stanza: + subject_fragments.append('%s %s' % (len(stanza), status)) + sections[status] = email_templates['sect_templ'].format( + title=status.title(), + jobs=''.join(stanza.values()), + ) + subject = ', '.join(subject_fragments) + ' ' + + if config.archive_server: + log_root = os.path.join(config.archive_server, name, '') + else: + log_root = None + + body = email_templates['body_templ'].format( + name=name, + info_root=misc.get_results_url(name), + log_root=log_root, + fail_count=len(stanzas['fail']), + dead_count=len(stanzas['dead']), + running_count=len(stanzas['running']), + waiting_count=len(stanzas['waiting']), + queued_count=len(stanzas['queued']), + pass_count=len(stanzas['pass']), + fail_sect=sections['fail'], + dead_sect=sections['dead'], + running_sect=sections['running'], + waiting_sect=sections['waiting'], + queued_sect=sections['queued'], + pass_sect=sections['pass'], + ) + + subject += 'in {suite}'.format(suite=name) + return (subject.strip(), body.strip()) + + +def format_job(run_name, job): + job_id = job['job_id'] + status = job['status'] + description = job['description'] + duration = seconds_to_hms(int(job['duration'] or 0)) + + # Every job gets a link to e.g. 
pulpito's pages + info_url = misc.get_results_url(run_name, job_id) + if info_url: + info_line = email_templates['info_url_templ'].format(info=info_url) + else: + info_line = '' + + if status in UNFINISHED_STATUSES: + format_args = dict( + job_id=job_id, + desc=description, + time=duration, + info_line=info_line, + ) + return email_templates['running_templ'].format(**format_args) + + if status == 'pass': + return email_templates['pass_templ'].format( + job_id=job_id, + desc=description, + time=duration, + info_line=info_line, + ) + else: + log_dir_url = job['log_href'].rstrip('teuthology.yaml') + if log_dir_url: + log_line = email_templates['fail_log_templ'].format( + log=log_dir_url) + else: + log_line = '' + sentry_event = job.get('sentry_event') + if sentry_event: + sentry_line = email_templates['fail_sentry_templ'].format( + sentry_event=sentry_event) + else: + sentry_line = '' + + if job['failure_reason']: + # 'fill' is from the textwrap module and it collapses a given + # string into multiple lines of a maximum width as specified. + # We want 75 characters here so that when we indent by 4 on the + # next line, we have 79-character exception paragraphs. 
+ reason = fill(job['failure_reason'] or '', 75) + reason = \ + '\n'.join((' ') + line for line in reason.splitlines()) + reason_lines = email_templates['fail_reason_templ'].format( + reason=reason).rstrip() + else: + reason_lines = '' + + format_args = dict( + job_id=job_id, + desc=description, + time=duration, + info_line=info_line, + log_line=log_line, + sentry_line=sentry_line, + reason_lines=reason_lines, + ) + return email_templates['fail_templ'].format(**format_args) + + +def seconds_to_hms(seconds): + (minutes, seconds) = divmod(seconds, 60) + (hours, minutes) = divmod(minutes, 60) + return "%02d:%02d:%02d" % (hours, minutes, seconds) + + +email_templates = { + 'body_templ': dedent("""\ + Test Run: {name} + ================================================================= + info: {info_root} + logs: {log_root} + failed: {fail_count} + dead: {dead_count} + running: {running_count} + waiting: {waiting_count} + queued: {queued_count} + passed: {pass_count} + + {fail_sect}{dead_sect}{running_sect}{waiting_sect}{queued_sect}{pass_sect} + """), + 'sect_templ': dedent("""\ + + {title} + ================================================================= + {jobs} + """), + 'fail_templ': dedent("""\ + [{job_id}] {desc} + ----------------------------------------------------------------- + time: {time}{info_line}{log_line}{sentry_line}{reason_lines} + + """), + 'info_url_templ': "\ninfo: {info}", + 'fail_log_templ': "\nlog: {log}", + 'fail_sentry_templ': "\nsentry: {sentry_event}", + 'fail_reason_templ': "\n\n{reason}\n", + 'running_templ': dedent("""\ + [{job_id}] {desc}{info_line} + + """), + 'pass_templ': dedent("""\ + [{job_id}] {desc} + time: {time}{info_line} + + """), +} diff --git a/teuthology/run.py b/teuthology/run.py new file mode 100644 index 000000000..383a74c93 --- /dev/null +++ b/teuthology/run.py @@ -0,0 +1,411 @@ +import os +import yaml +import sys +import logging + +import teuthology +from teuthology import install_except_hook +from teuthology import 
report +from teuthology.job_status import get_status +from teuthology.misc import get_user, merge_configs +from teuthology.run_tasks import run_tasks +from teuthology.repo_utils import fetch_qa_suite +from teuthology.results import email_results +from teuthology.config import FakeNamespace +from teuthology.config import config as teuth_config + +log = logging.getLogger(__name__) + + +def set_up_logging(verbose, archive): + if verbose: + teuthology.log.setLevel(logging.DEBUG) + + if archive is not None: + if not os.path.isdir(archive): + os.mkdir(archive) + + teuthology.setup_log_file(os.path.join(archive, 'teuthology.log')) + + install_except_hook() + + +def write_initial_metadata(archive, config, name, description, owner): + if archive is not None: + with open(os.path.join(archive, 'pid'), 'w') as f: + f.write('%d' % os.getpid()) + + with open(os.path.join(archive, 'owner'), 'w') as f: + f.write(owner + '\n') + + with open(os.path.join(archive, 'orig.config.yaml'), 'w') as f: + yaml.safe_dump(config, f, default_flow_style=False) + + info = { + 'name': name, + 'description': description, + 'owner': owner, + 'pid': os.getpid(), + } + if 'job_id' in config: + info['job_id'] = config['job_id'] + + with open(os.path.join(archive, 'info.yaml'), 'w') as f: + yaml.safe_dump(info, f, default_flow_style=False) + + +def fetch_tasks_if_needed(job_config): + """ + Fetch the suite repo (and include it in sys.path) so that we can use its + tasks. + + Returns the suite_path. The existing suite_path will be returned if the + tasks can be imported, if not a new suite_path will try to be determined. + """ + # Any scheduled job will already have the suite checked out and its + # $PYTHONPATH set. We can check for this by looking for 'suite_path' + # in its config. 
+ suite_path = job_config.get('suite_path') + if suite_path: + log.info("suite_path is set to %s; will attempt to use it", suite_path) + if suite_path not in sys.path: + sys.path.insert(1, suite_path) + + try: + import tasks + log.info("Found tasks at %s", os.path.dirname(tasks.__file__)) + # tasks found with the existing suite branch, return it + return suite_path + except ImportError: + log.info("Tasks not found; will attempt to fetch") + + ceph_branch = job_config.get('branch', 'main') + suite_repo = job_config.get('suite_repo') + if suite_repo: + teuth_config.ceph_qa_suite_git_url = suite_repo + suite_branch = job_config.get('suite_branch', ceph_branch) + suite_sha1 = job_config.get('suite_sha1') + suite_path = os.path.normpath(os.path.join( + fetch_qa_suite(suite_branch, commit=suite_sha1), + job_config.get('suite_relpath', 'qa'), + )) + sys.path.insert(1, suite_path) + return suite_path + + +def setup_config(config_paths) -> dict: + """ + Takes a list of config yaml files and combines them + into a single dictionary. Processes / validates the dictionary and then + returns it. + """ + config = merge_configs(config_paths) + + # Older versions of teuthology stored job_id as an int. Convert it to a str + # if necessary. + job_id = config.get('job_id') + if job_id is not None: + job_id = str(job_id) + config['job_id'] = job_id + + # targets must be >= than roles + if 'targets' in config and 'roles' in config: + targets = len(config['targets']) + roles = len(config['roles']) + assert targets >= roles, \ + '%d targets are needed for all roles but found %d listed.' % ( + roles, targets) + + return config + + +def get_machine_type(machine_type, config): + """ + If no machine_type is given, find the appropriate machine_type + from the given config. 
+ """ + if machine_type is None: + fallback_default = config.get('machine_type', + teuth_config.default_machine_type) + machine_type = config.get('machine-type', fallback_default) + + return machine_type + + +def get_summary(owner, description): + summary = dict(success=True) + summary['owner'] = owner + + if description is not None: + summary['description'] = description + + return summary + + +def validate_tasks(config): + """ + Ensures that config tasks is a list and doesn't include 'kernel'. + + Returns the original tasks key if found. If not, returns an + empty list. + """ + if 'tasks' not in config: + log.warning('No tasks specified. Continuing anyway...') + # return the default value for tasks + return [] + + msg = "Expected list in 'tasks'; instead got: {0}".format(config['tasks']) + assert isinstance(config['tasks'], list), msg + + for task in config['tasks']: + msg = ('kernel installation should be a base-level item, not part ' + + 'of the tasks list') + assert 'kernel' not in task, msg + + return config["tasks"] + + +def get_initial_tasks(lock, config, machine_type): + init_tasks = [] + overrides = config.get('overrides', {}) + having_repos = ('repos' in config.get('install', {}) or + 'repos' in overrides.get('install', {})) + if 'redhat' in config: + pass + elif having_repos: + pass + elif not config.get('verify_ceph_hash', True): + pass + else: + init_tasks += [ + {'internal.check_packages': None}, + {'internal.buildpackages_prep': None}, + ] + if 'roles' in config and lock: + msg = ('You cannot specify targets in a config file when using the ' + + '--lock option') + assert 'targets' not in config, msg + init_tasks.append({'internal.lock_machines': ( + len(config['roles']), machine_type)}) + + init_tasks.append({'internal.save_config': None}) + + if 'roles' in config: + init_tasks.append({'internal.check_lock': None}) + + init_tasks.append({'internal.add_remotes': None}) + + if 'roles' in config: + init_tasks.extend([ + {'console_log': None}, + 
{'internal.connect': None}, + {'internal.push_inventory': None}, + {'internal.serialize_remote_roles': None}, + {'internal.check_conflict': None}, + ]) + + if ('roles' in config and + not config.get('use_existing_cluster', False)): + init_tasks.extend([ + {'internal.check_ceph_data': None}, + {'internal.vm_setup': None}, + ]) + + # install_latest_rh_kernel is used for redhat config + if 'redhat' not in config and 'kernel' in config: + init_tasks.append({'kernel': config['kernel']}) + + if 'roles' in config: + init_tasks.append({'internal.base': None}) + init_tasks.append({'internal.archive_upload': None}) + if 'roles' in config: + init_tasks.extend([ + {'internal.archive': None}, + {'internal.coredump': None}, + {'internal.sudo': None}, + {'internal.syslog': None}, + ]) + init_tasks.append({'internal.timer': None}) + + if 'roles' in config: + init_tasks.extend([ + {'pcp': None}, + {'selinux': None}, + ]) + + if 'redhat' in config: + init_tasks.extend([ + {'internal.setup_stage_cdn': None}]) + + if config.get('ceph_cm_ansible', True): + init_tasks.append({'ansible.cephlab': None}) + + # clock_sync_task: 'clock' or 'clock.check' + clock_sync_task = config.get('clock_sync_task', 'clock') + init_tasks.append({clock_sync_task: None}) + + if 'redhat' in config: + init_tasks.extend([ + {'internal.git_ignore_ssl': None}, + {'internal.setup_cdn_repo': None}, + {'internal.setup_base_repo': None}, + {'internal.setup_additional_repo': None}, + {'internal.setup_container_registry': None}, + {'install': None}, + ]) + # Install latest kernel task for redhat downstream runs + if config.get('redhat').get('install_latest_rh_kernel', False): + init_tasks.extend({'kernel.install_latest_rh_kernel': None}) + + return init_tasks + + +def report_outcome(config, archive, summary): + """ Reports on the final outcome of the command. 
""" + status = get_status(summary) + passed = status == 'pass' + + if archive is not None: + with open(os.path.join(archive, 'summary.yaml'), 'w') as f: + yaml.safe_dump(summary, f, default_flow_style=False) + + summary_dump = yaml.safe_dump(summary) + log.info('Summary data:\n%s' % summary_dump) + + if ('email-on-error' in config + and not passed): + config_dump = yaml.safe_dump(config) + subject = "Teuthology error -- %s" % summary['failure_reason'] + email_results(subject, "Teuthology", config['email-on-error'], + "\n".join([summary_dump, config_dump])) + + + report.try_push_job_info(config, summary) + + if passed: + log.info(status) + else: + log.info(str(status).upper()) + sys.exit(1) + + +def get_teuthology_command(args): + """ + Rebuilds the teuthology command used to run this job + and returns it as a string. + """ + cmd = ["teuthology"] + for key, value in args.items(): + if value: + # an option, not an argument + if not key.startswith("<"): + cmd.append(key) + else: + # this is the argument + for arg in value: + cmd.append(str(arg)) + continue + # so we don't print something like --verbose True + if isinstance(value, str): + cmd.append(value) + return " ".join(cmd) + + +def main(args): + verbose = args["--verbose"] + archive = args["--archive"] + owner = args["--owner"] + config = args[""] + name = args["--name"] + description = args["--description"] + machine_type = args["--machine-type"] + block = args["--block"] + lock = args["--lock"] + suite_path = args["--suite-path"] + os_type = args["--os-type"] + os_version = args["--os-version"] + interactive_on_error = args["--interactive-on-error"] + + # print the command being ran + log.debug("Teuthology command: {0}".format(get_teuthology_command(args))) + + if owner is None: + args["--owner"] = owner = get_user() + + config = setup_config(config) + + if archive is not None and 'archive_path' not in config: + config['archive_path'] = archive + elif archive is None and 'archive_path' in config: + archive = 
args['--archive'] = config['archive_path'] + + set_up_logging(verbose, archive) + + write_initial_metadata(archive, config, name, description, owner) + report.try_push_job_info(config, dict(status='running')) + + machine_type = get_machine_type(machine_type, config) + args["--machine-type"] = machine_type + + if block: + assert lock, \ + 'the --block option is only supported with the --lock option' + + log.info( + '\n '.join(['Config:', ] + yaml.safe_dump( + config, default_flow_style=False).splitlines())) + + args["summary"] = get_summary(owner, description) + + ceph_repo = config.get('repo') + if ceph_repo: + teuth_config.ceph_git_url = ceph_repo + suite_repo = config.get('suite_repo') + if suite_repo: + teuth_config.ceph_qa_suite_git_url = suite_repo + + # overwrite the config values of os_{type,version} if corresponding + # command-line arguments are provided + if os_type: + config["os_type"] = os_type + if os_version: + config["os_version"] = os_version + + config["tasks"] = validate_tasks(config) + + init_tasks = get_initial_tasks(lock, config, machine_type) + + # prepend init_tasks to the front of the task list + config['tasks'][:0] = init_tasks + + if suite_path is not None: + config['suite_path'] = suite_path + + # fetches the tasks and returns a new suite_path if needed + config["suite_path"] = fetch_tasks_if_needed(config) + + # If the job has a 'use_shaman' key, use that value to override the global + # config's value. 
+ if config.get('use_shaman') is not None: + teuth_config.use_shaman = config['use_shaman'] + + #could be refactored for setting and unsetting in hackish way + if interactive_on_error: + config['interactive-on-error'] = True + # create a FakeNamespace instance that mimics the old argparse way of doing + # things we do this so we can pass it to run_tasks without porting those + # tasks to the new way of doing things right now + args[""] = config + fake_ctx = FakeNamespace(args) + + # store on global config if interactive-on-error, for contextutil.nested() + # FIXME this should become more generic, and the keys should use + # '_' uniformly + if fake_ctx.config.get('interactive-on-error'): + teuth_config.ctx = fake_ctx + + try: + run_tasks(tasks=config['tasks'], ctx=fake_ctx) + finally: + # print to stdout the results and possibly send an email on any errors + report_outcome(config, archive, fake_ctx.summary) diff --git a/teuthology/run_tasks.py b/teuthology/run_tasks.py new file mode 100644 index 000000000..267d8fd3f --- /dev/null +++ b/teuthology/run_tasks.py @@ -0,0 +1,332 @@ +import importlib +import jinja2 +import logging +import os +import sys +import time +import types +import yaml + +from humanfriendly import format_timespan + +import teuthology.exporter as exporter + +from teuthology.config import config as teuth_config +from teuthology.exceptions import ConnectionLostError +from teuthology.job_status import set_status, get_status +from teuthology.misc import get_http_log_path, get_results_url +from teuthology.timer import Timer +from teuthology.util import sentry + +log = logging.getLogger(__name__) + + +def get_task(name): + # todo: support of submodules + if '.' 
in name: + module_name, task_name = name.split('.') + else: + module_name, task_name = (name, 'task') + + # First look for the tasks's module inside teuthology + module = _import('teuthology.task', module_name, task_name) + # If it is not found, try qa/ directory (if it is in sys.path) + if not module: + module = _import('tasks', module_name, task_name, fail_on_import_error=True) + try: + # Attempt to locate the task object inside the module + task = getattr(module, task_name) + # If we get another module, we need to go deeper + if isinstance(task, types.ModuleType): + task = getattr(task, task_name) + except AttributeError: + log.error("No subtask of '{}' named '{}' was found".format( + module_name, + task_name, + )) + raise + return task + + +def _import(from_package, module_name, task_name, fail_on_import_error=False): + full_module_name = '.'.join([from_package, module_name]) + try: + module = __import__( + full_module_name, + globals(), + locals(), + [task_name], + 0, + ) + except ImportError: + if fail_on_import_error: + raise + else: + if ( + importlib.util.find_spec(from_package) is not None and + importlib.util.find_spec(full_module_name) is not None + ): + # If we get here, it means we could _find_ both the module and + # the package that contains it, but still got an ImportError. + # Typically that means the module failed to import because it + # could not find one of its dependencies; if we don't raise + # here it will look like we just could't find the module, + # making the dependency issue difficult to discover. 
+ raise + return None + return module + + +def run_one_task(taskname, **kwargs): + taskname = taskname.replace('-', '_') + task = get_task(taskname) + return task(**kwargs) + + +def run_tasks(tasks, ctx): + archive_path = ctx.config.get('archive_path') + if archive_path: + timer = Timer( + path=os.path.join(archive_path, 'timing.yaml'), + sync=True, + ) + else: + timer = Timer() + stack = [] + taskname = "" + try: + for taskdict in tasks: + try: + ((taskname, config),) = taskdict.items() + except (ValueError, AttributeError): + raise RuntimeError('Invalid task definition: %s' % taskdict) + log.info('Running task %s...', taskname) + timer.mark('%s enter' % taskname) + manager = run_one_task(taskname, ctx=ctx, config=config) + if hasattr(manager, '__enter__'): + stack.append((taskname, manager)) + with exporter.TaskTime().time( + name=taskname, + phase="enter" + ): + manager.__enter__() + except BaseException as e: + if isinstance(e, ConnectionLostError): + # Prevent connection issues being flagged as failures + set_status(ctx.summary, 'dead') + else: + # the status may have been set to dead, leave it as-is if so + if not ctx.summary.get('status', '') == 'dead': + set_status(ctx.summary, 'fail') + if 'failure_reason' not in ctx.summary: + ctx.summary['failure_reason'] = str(e) + log.exception('Saw exception from tasks.') + + ctx.summary['sentry_event'] = sentry.report_error(ctx.config, e, taskname) + if ctx.config.get('interactive-on-error'): + ctx.config['interactive-on-error'] = False + from teuthology.task import interactive + log.warning('Saw failure during task execution, going into interactive mode...') + interactive.task(ctx=ctx, config=None) + # Throughout teuthology, (x,) = y has been used to assign values + # from yaml files where only one entry of type y is correct. This + # causes failures with 'too many values to unpack.' We want to + # fail as before, but with easier to understand error indicators. 
+ if isinstance(e, ValueError): + if str(e) == 'too many values to unpack': + emsg = 'Possible configuration error in yaml file' + log.error(emsg) + ctx.summary['failure_info'] = emsg + finally: + try: + exc_info = sys.exc_info() + sleep_before_teardown = ctx.config.get('sleep_before_teardown') + if sleep_before_teardown: + log.info( + 'Sleeping for {} seconds before unwinding because' + ' --sleep-before-teardown was given...' + .format(sleep_before_teardown)) + notify_sleep_before_teardown(ctx, stack, sleep_before_teardown) + time.sleep(sleep_before_teardown) + while stack: + taskname, manager = stack.pop() + log.debug('Unwinding manager %s', taskname) + timer.mark('%s exit' % taskname) + try: + with exporter.TaskTime().time( + name=taskname, + phase="exit" + ): + suppress = manager.__exit__(*exc_info) + except Exception as e: + if isinstance(e, ConnectionLostError): + # Prevent connection issues being flagged as failures + set_status(ctx.summary, 'dead') + else: + set_status(ctx.summary, 'fail') + if 'failure_reason' not in ctx.summary: + ctx.summary['failure_reason'] = str(e) + log.exception('Manager failed: %s', taskname) + + if exc_info == (None, None, None): + # if first failure is in an __exit__, we don't + # have exc_info set yet + exc_info = sys.exc_info() + + if ctx.config.get('interactive-on-error'): + from teuthology.task import interactive + log.warning( + 'Saw failure during task cleanup, going into interactive mode...') + interactive.task(ctx=ctx, config=None) + else: + if suppress: + exc_info = (None, None, None) + + if exc_info != (None, None, None): + log.debug('Exception was not quenched, exiting: %s: %s', + exc_info[0].__name__, exc_info[1]) + raise SystemExit(1) + finally: + # be careful about cyclic references + del exc_info + timer.mark("tasks complete") + + +def build_rocketchat_message(ctx, stack, sleep_time_sec, template_path=None): + message_template_path = template_path or os.path.dirname(__file__) + \ + 
'/templates/rocketchat-sleep-before-teardown.jinja2' + + with open(message_template_path) as f: + template_text = f.read() + + template = jinja2.Template(template_text) + archive_path = ctx.config.get('archive_path') + job_id = ctx.config.get('job_id') + status = get_status(ctx.summary) + stack_path = ' -> '.join(task for task, _ in stack) + suite_name=ctx.config.get('suite') + sleep_date=time.time() + sleep_date_str=time.strftime('%Y-%m-%d %H:%M:%S', + time.gmtime(sleep_date)) + + message = template.render( + sleep_time=format_timespan(sleep_time_sec), + sleep_time_sec=sleep_time_sec, + sleep_date=sleep_date_str, + owner=ctx.owner, + run_name=ctx.name, + job_id=ctx.config.get('job_id'), + job_desc=ctx.config.get('description'), + job_info=get_results_url(ctx.name, job_id), + job_logs=get_http_log_path(archive_path, job_id), + suite_name=suite_name, + status=status, + task_stack=stack_path, + ) + return message + + +def build_email_body(ctx, stack, sleep_time_sec): + email_template_path = os.path.dirname(__file__) + \ + '/templates/email-sleep-before-teardown.jinja2' + + with open(email_template_path) as f: + template_text = f.read() + + email_template = jinja2.Template(template_text) + archive_path = ctx.config.get('archive_path') + job_id = ctx.config.get('job_id') + status = get_status(ctx.summary) + stack_path = '/'.join(task for task, _ in stack) + suite_name=ctx.config.get('suite') + sleep_date=time.time() + sleep_date_str=time.strftime('%Y-%m-%d %H:%M:%S', + time.gmtime(sleep_date)) + + body = email_template.render( + sleep_time=format_timespan(sleep_time_sec), + sleep_time_sec=sleep_time_sec, + sleep_date=sleep_date_str, + owner=ctx.owner, + run_name=ctx.name, + job_id=ctx.config.get('job_id'), + job_info=get_results_url(ctx.name), + job_logs=get_http_log_path(archive_path, job_id), + suite_name=suite_name, + status=status, + task_stack=stack_path, + ) + subject = ( + 'teuthology job {run}/{job} has fallen asleep at {date}' + .format(run=ctx.name, 
job=job_id, date=sleep_date_str) + ) + return (subject.strip(), body.strip()) + + +def rocketchat_send_message(ctx, message, channels): + """ + Send the message to the given RocketChat channels + + Before sending the message we read the config file + from `~/.config/rocketchat.api/settings.yaml` which + must include next records: + + username: 'userloginname' + password: 'userbigsecret' + domain: 'https://chat.suse.de' + + :param message: plain text message content in the Rocket.Chat + messaging format + :param channels: a list of channels where to send the message, + the user private channel should be prefixed + with '@' symbol + """ + try: + from rocketchat.api import RocketChatAPI + except Exception as e: + log.warning(f'rocketchat: Failed to import rocketchat.api: {e}') + return + + settings_path = \ + os.environ.get('HOME') + '/.config/rocketchat.api/settings.yaml' + + try: + with open(settings_path) as f: + settings = yaml.safe_load(f) + except Exception as e: + log.warning(f'rocketchat: Failed to load settings from {settings_path}: {e}') + + r = RocketChatAPI(settings=settings) + for channel in channels: + try: + r.send_message(message, channel) + except Exception as e: + log.warning(f'rocketchat: Failed to send message to "{channel}" channel: {e}') + + +def notify_sleep_before_teardown(ctx, stack, sleep_time): + rocketchat = ctx.config.get('rocketchat', None) + + if rocketchat: + channels = [_ for _ in [_.strip() for _ in rocketchat.split(',')] if _] + log.info("Sending a message to Rocket.Chat channels: %s", channels) + message = build_rocketchat_message(ctx, stack, sleep_time) + rocketchat_send_message(ctx, message, channels) + + email = ctx.config.get('email', None) + if not email: + # we have no email configured, return silently + return + (subject, body) = build_email_body(ctx, stack, sleep_time) + log.info('Sending no to {to}: {body}'.format(to=email, body=body)) + import smtplib + from email.mime.text import MIMEText + msg = MIMEText(body) + 
msg['Subject'] = subject + msg['From'] = teuth_config.results_sending_email or 'teuthology' + msg['To'] = email + log.debug('sending email %s', msg.as_string()) + smtp = smtplib.SMTP('localhost') + smtp.sendmail(msg['From'], [msg['To']], msg.as_string()) + smtp.quit() + diff --git a/teuthology/safepath.py b/teuthology/safepath.py new file mode 100644 index 000000000..b8115a25e --- /dev/null +++ b/teuthology/safepath.py @@ -0,0 +1,42 @@ +import errno +import os + +def munge(path): + """ + Munge a potentially hostile path name to be safe to use. + + This very definitely changes the meaning of the path, + but it only does that for unsafe paths. + """ + # explicitly ignoring windows as a platform + segments = path.split('/') + # filter out empty segments like foo//bar + segments = [s for s in segments if s!=''] + # filter out no-op segments like foo/./bar + segments = [s for s in segments if s!='.'] + # all leading dots become underscores; makes .. safe too + for idx, seg in enumerate(segments): + if seg.startswith('.'): + segments[idx] = '_'+seg[1:] + # empty string, "/", "//", etc + if not segments: + segments = ['_'] + return '/'.join(segments) + + +def makedirs(root, path): + """ + os.makedirs gets confused if the path contains '..', and root might. + + This relies on the fact that `path` has been normalized by munge(). 
+ """ + segments = path.split('/') + for seg in segments: + root = os.path.join(root, seg) + try: + os.mkdir(root) + except OSError as e: + if e.errno == errno.EEXIST: + pass + else: + raise diff --git a/teuthology/schedule.py b/teuthology/schedule.py new file mode 100644 index 000000000..d9af64efc --- /dev/null +++ b/teuthology/schedule.py @@ -0,0 +1,143 @@ +import os +import yaml + +import teuthology.beanstalk +from teuthology.misc import get_user, merge_configs +from teuthology import report + + +def main(args): + if not args['--first-in-suite']: + first_job_args = ['subset', 'no-nested-subset', 'seed'] + for arg in first_job_args: + opt = '--{arg}'.format(arg=arg) + msg_fmt = '{opt} is only applicable to the first job in a suite' + if args.get(opt): + raise ValueError(msg_fmt.format(opt=opt)) + + if not args['--last-in-suite']: + last_job_args = ['email', 'timeout'] + for arg in last_job_args: + opt = '--{arg}'.format(arg=arg) + msg_fmt = '{opt} is only applicable to the last job in a suite' + if args[opt]: + raise ValueError(msg_fmt.format(opt=opt)) + + if args['--first-in-suite'] or args['--last-in-suite']: + report_status = False + else: + report_status = True + + name = args['--name'] + if not name or name.isdigit(): + raise ValueError("Please use a more descriptive value for --name") + job_config = build_config(args) + backend = args['--queue-backend'] + if args['--dry-run']: + print('---\n' + yaml.safe_dump(job_config)) + elif backend == 'beanstalk': + schedule_job(job_config, args['--num'], report_status) + elif backend.startswith('@'): + dump_job_to_file(backend.lstrip('@'), job_config, args['--num']) + else: + raise ValueError("Provided schedule backend '%s' is not supported. 
" + "Try 'beanstalk' or '@path-to-a-file" % backend) + + +def build_config(args): + """ + Given a dict of arguments, build a job config + """ + config_paths = args.get('', list()) + conf_dict = merge_configs(config_paths) + # strip out targets; the worker will allocate new ones when we run + # the job with --lock. + if 'targets' in conf_dict: + del conf_dict['targets'] + args['config'] = conf_dict + + owner = args['--owner'] + if owner is None: + owner = 'scheduled_{user}'.format(user=get_user()) + + job_config = dict( + name=args['--name'], + first_in_suite=args['--first-in-suite'], + last_in_suite=args['--last-in-suite'], + email=args['--email'], + description=args['--description'], + owner=owner, + verbose=args['--verbose'], + machine_type=args['--worker'], + tube=args['--worker'], + priority=int(args['--priority']), + ) + # Update the dict we just created, and not the other way around, to let + # settings in the yaml override what's passed on the command line. This is + # primarily to accommodate jobs with multiple machine types. + job_config.update(conf_dict) + for arg,conf in {'--timeout':'results_timeout', + '--seed': 'seed', + '--subset': 'subset', + '--no-nested-subset': 'no_nested_subset'}.items(): + val = args.get(arg, None) + if val is not None: + job_config[conf] = val + + return job_config + + +def schedule_job(job_config, num=1, report_status=True): + """ + Schedule a job. 
+ + :param job_config: The complete job dict + :param num: The number of times to schedule the job + """ + num = int(num) + job = yaml.safe_dump(job_config) + tube = job_config.pop('tube') + beanstalk = teuthology.beanstalk.connect() + beanstalk.use(tube) + while num > 0: + jid = beanstalk.put( + job, + ttr=60 * 60 * 24, + priority=job_config['priority'], + ) + print('Job scheduled with name {name} and ID {jid}'.format( + name=job_config['name'], jid=jid)) + job_config['job_id'] = str(jid) + if report_status: + report.try_push_job_info(job_config, dict(status='queued')) + num -= 1 + + +def dump_job_to_file(path, job_config, num=1): + """ + Schedule a job. + + :param job_config: The complete job dict + :param num: The number of times to schedule the job + :param path: The file path where the job config to append + """ + num = int(num) + count_file_path = path + '.count' + + jid = 0 + if os.path.exists(count_file_path): + with open(count_file_path, 'r') as f: + jid=int(f.read() or '0') + + with open(path, 'a') as f: + while num > 0: + jid += 1 + job_config['job_id'] = str(jid) + job = yaml.safe_dump(job_config) + print('Job scheduled with name {name} and ID {jid}'.format( + name=job_config['name'], jid=jid)) + f.write('---\n' + job) + num -= 1 + with open(count_file_path, 'w') as f: + f.write(str(jid)) + diff --git a/teuthology/scrape.py b/teuthology/scrape.py new file mode 100644 index 000000000..33a38e81c --- /dev/null +++ b/teuthology/scrape.py @@ -0,0 +1,524 @@ +# Origin: https://github.com/jcsp/scrape/blob/master/scrape.py +# Author: John Spray (github.com/jcsp) + +import difflib +from errno import ENOENT +import gzip +import sys +import os +import yaml +from collections import defaultdict +import re +import logging +import subprocess + + +log = logging.getLogger('scrape') +log.addHandler(logging.StreamHandler()) +log.setLevel(logging.INFO) + + +class Reason(object): + def get_description(self): + return self.description + + def get_detail(self): + return None + 
+ +def grep(path, expr): + """ + Call out to native grep rather than feeding massive log files through python line by line + """ + p = subprocess.Popen(["grep", expr, path], stdout=subprocess.PIPE, + universal_newlines=True) + p.wait() + out, err = p.communicate() + if p.returncode == 0: + return out.split("\n") + else: + return [] + + +class GenericReason(Reason): + """ + A reason inferred from a Job: matches Jobs with an apparently-similar failure + """ + + def __init__(self, job, description=None): + self.failure_reason = job.get_failure_reason() + self.description = description + + self.backtrace = job.get_backtrace() + if self.backtrace: + log.debug("Found a backtrace!\n{0}".format(self.backtrace)) + + def get_detail(self): + return self.backtrace + + def get_description(self): + if self.description: + return self.description + else: + if self.backtrace: + return "Crash: {0}".format(self.failure_reason) + else: + return "Failure: {0}".format(self.failure_reason) + + def match(self, job): + # I never match dead jobs + if job.get_failure_reason() is None: + return False + + # If one has a backtrace but the other doesn't, we're a different thing even if the official + # failure_reason is the same + if (self.backtrace is None) != (job.get_backtrace() is None): + return False + + # If we have the same backtrace, we're a match even if the teuthology failure_reason + # doesn't match (a crash is a crash, it can have different symptoms) + if self.backtrace: + ratio = difflib.SequenceMatcher(None, self.backtrace, job.get_backtrace()).ratio() + return ratio > 0.5 + else: + if "Test failure:" in self.failure_reason: + return self.failure_reason == job.get_failure_reason() + elif re.search(r"workunit test (.*)\) on ", self.failure_reason): + workunit_name = re.search(r"workunit test (.*)\) on ", self.failure_reason).group(1) + other_match = re.search(r"workunit test (.*)\) on ", job.get_failure_reason()) + return other_match is not None and workunit_name == 
other_match.group(1) + else: + reason_ratio = difflib.SequenceMatcher(None, self.failure_reason, job.get_failure_reason()).ratio() + return reason_ratio > 0.5 + + +class RegexReason(Reason): + """ + A known reason matching a particular regex to failure reason + """ + + def __init__(self, regexes, description): + self.description = description + if isinstance(regexes, list): + self.regexes = regexes + else: + self.regexes = [regexes] + + def match(self, job): + # I never match dead jobs + if job.get_failure_reason() is None: + return False + + for regex in self.regexes: + if re.match(regex, job.get_failure_reason()): + return True + + return False + + +class AssertionReason(Reason): + def __init__(self, job): + self.assertion = job.get_assertion() + self.backtrace = job.get_backtrace() + + def get_description(self): + return "Assertion: {0}".format(self.assertion) + + def get_detail(self): + return self.backtrace + + @classmethod + def could_be(cls, job): + return job.get_assertion() is not None + + def match(self, job): + return self.assertion == job.get_assertion() + + +class LockdepReason(AssertionReason): + """ + Different to a normal assertion, because matches should not only + have the same assertion but the same backtrace (don't want to glob + all lockdep failures together if they are really being tripped in + different places) + """ + @classmethod + def could_be(cls, job): + if not super(LockdepReason, cls).could_be(job): + return False + + return "common/lockdep" in job.get_assertion() + + def get_description(self): + return "Lockdep: {0}".format(self.assertion) + + def match(self, job): + if not super(LockdepReason, self).match(job): + return False + + if self.backtrace: + if job.get_backtrace(): + ratio = difflib.SequenceMatcher(None, self.backtrace, job.get_backtrace()).ratio() + return ratio > 0.5 + else: + return False + else: + # No backtrace to compare about, allow matches based purely on assertion + return True + + +class DeadReason(Reason): + """ + 
A reason for picking up jobs with no summary.yaml + """ + def __init__(self, job): + self.description = "Dead" + self.last_tlog_line = job.get_last_tlog_line() + self.backtrace = job.get_backtrace() + + def get_description(self): + return "Dead: {0}".format(self.last_tlog_line) + + def get_detail(self): + return self.backtrace + + @classmethod + def could_be(cls, job): + return job.summary_data is None + + def match(self, job): + if job.summary_data: + return False + + if self.backtrace: + if job.get_backtrace(): + # We both have backtrace: use that to decide if we're the same + ratio = difflib.SequenceMatcher(None, self.backtrace, job.get_backtrace()).ratio() + return ratio > 0.5 + else: + # I have BT but he doesn't, so we're different + return False + last_tlog_line = job.get_last_tlog_line() + if self.last_tlog_line is not None and last_tlog_line is not None: + ratio = difflib.SequenceMatcher(None, self.last_tlog_line, + last_tlog_line).ratio() + return ratio > 0.5 + else: + return self.last_tlog_line == last_tlog_line + + +class TimeoutReason(Reason): + def __init__(self, job): + self.timeout, self.command = self.get_timeout(job) + + def get_description(self): + return "Timeout {0} running {1}".format( + self.timeout, self.command + ) + + @classmethod + def could_be(cls, job): + return cls.get_timeout(job) is not None + + @classmethod + def get_timeout(cls, job): + if job.get_failure_reason() is None: + return None + + match = re.search("status 124:.* timeout ([^ ]+) ([^']+)'", job.get_failure_reason()) + if not match: + return + + timeout, bin_path = match.groups() + + # Given a path like /home/ubuntu/cephtest/workunit.client.0/cephtool/test.sh + # ... strip it down to cephtool/test.sh + parts = bin_path.split(os.path.sep) + parts.reverse() + rparts = [] + for p in parts: + if 'workunit.' 
in p or 'cephtest' in p: + break + else: + rparts.append(p) + rparts.reverse() + command = os.path.sep.join(rparts) + + return timeout, command + + def match(self, job): + return self.get_timeout(job) == (self.timeout, self.command) + +MAX_TEUTHOLOGY_LOG = 1024 * 1024 * 100 +MAX_SVC_LOG = 100 * 1024 * 1024 +MAX_BT_LINES = 100 + + +class Job(object): + def __init__(self, path, job_id): + self.path = path + self.job_id = job_id + + try: + self.config = yaml.safe_load(open(os.path.join(self.path, "config.yaml"), 'r')) + self.description = self.config['description'] + assert self.description + except IOError: + self.config = None + self.description = None + + summary_path = os.path.join(self.path, "summary.yaml") + try: + self.summary_data = yaml.safe_load(open(summary_path, 'r')) + except IOError: + self.summary_data = None + + self.backtrace = None + self.assertion = None + self.populated = False + + def get_success(self): + if self.summary_data: + return self.summary_data['success'] + else: + return False + + def get_failure_reason(self): + if self.summary_data: + return self.summary_data['failure_reason'] + else: + return None + + def get_last_tlog_line(self): + t_path = os.path.join(self.path, "teuthology.log") + if not os.path.exists(t_path): + return None + else: + out, err = subprocess.Popen(["tail", "-n", "1", t_path], stdout=subprocess.PIPE).communicate() + return out.strip() + + def _search_backtrace(self, file_obj): + bt_lines = [] + assertion = None + for line in file_obj: + # Log prefix from teuthology.log + if ".stderr:" in line: + # Only captures the error and disregard what ever comes before + line = line.split(".stderr:")[1] + + if "FAILED assert" in line or "__ceph_assert_fail" in line: + assertion = line.strip() + + if line.startswith(" ceph version"): + # The start of a backtrace! 
+ bt_lines = [line] + elif line.startswith(" NOTE: a copy of the executable") or \ + line.strip().endswith("clone()") or \ + "clone()+0x" in line: + # The backtrace terminated, if we have a buffer return it + if len(bt_lines): + return ("".join(bt_lines)).strip(), assertion + else: + log.warning("Saw end of BT but not start") + elif bt_lines: + # We're in a backtrace, push the line onto the list + if len(bt_lines) > MAX_BT_LINES: + # We exceeded MAX_BT_LINES, drop it + log.warning("Ignoring backtrace that exceeds MAX_BACKTRACE_LINES: {0}".format( + ", ".join(bt_lines[0:3]) + )) + log.warning("Either the backtrace is too long or we did a bad job of checking for end of backtrace!") + bt_lines = [] + else: + bt_lines.append(line) + + return None, assertion + + def get_assertion(self): + if not self.populated: + self._populate_backtrace() + return self.assertion + + def get_backtrace(self): + if not self.populated: + self._populate_backtrace() + return self.backtrace + + def _populate_backtrace(self): + tlog_path = os.path.join(self.path, "teuthology.log") + try: + s = os.stat(tlog_path) + except OSError: + log.warning("Missing teuthology log {0}".format(tlog_path)) + return None + size = s.st_size + if size > MAX_TEUTHOLOGY_LOG: + log.debug("Ignoring teuthology log for job {0}, it is {1} bytes".format(self.job_id, size)) + return None + + self.backtrace, self.assertion = self._search_backtrace(open(tlog_path)) + if self.backtrace: + return + + for line in grep(tlog_path, "command crashed with signal"): + if not line: + continue + log.debug("Found a crash indication: {0}".format(line)) + # tasks.ceph.osd.1.plana82.stderr + match = re.search(r"tasks.ceph.([^\.]+).([^\.]+).([^\.]+).stderr", line) + if not match: + log.warning("Not-understood crash indication {0}".format(line)) + continue + svc, svc_id, hostname = match.groups() + gzipped_log_path = os.path.join( + self.path, "remote", hostname, "log", "ceph-{0}.{1}.log.gz".format(svc, svc_id)) + + try: + s = 
os.stat(gzipped_log_path) + except OSError as e: + if e.errno == ENOENT: + log.warning("Missing log {0}".format(gzipped_log_path)) + continue + else: + raise + + size = s.st_size + if size > MAX_SVC_LOG: + log.warning("Not checking for backtrace from {0}:{1}.{2} log, too large ({3})".format( + hostname, svc, svc_id, size + )) + continue + + with gzip.open(gzipped_log_path, 'rt', errors='ignore') as f: + bt, ass = self._search_backtrace(f) + if ass and not self.assertion: + self.assertion = ass + if bt: + self.backtrace = bt + return + + return None + + +class ValgrindReason(Reason): + def __init__(self, job): + assert self.could_be(job) + self.service_types = self._get_service_types(job) + + def _get_service_types(self, job): + """ + Get dict mapping service type 'osd' etc to sorted list of violation types 'Leak_PossiblyLost' etc + """ + + result = defaultdict(list) + # Lines like: + # 2014-08-22T20:07:18.668 ERROR:tasks.ceph:saw valgrind issue Leak_DefinitelyLost in /var/log/ceph/valgrind/osd.3.log.gz + for line in grep(os.path.join(job.path, "teuthology.log"), " in "): + match = re.search("(.+) in .+/(.+)", line) + if not match: + log.warning("Misunderstood line: {0}".format(line)) + continue + err_typ, log_basename = match.groups() + svc_typ = log_basename.split(".")[0] + if err_typ not in result[svc_typ]: + result[svc_typ].append(err_typ) + result[svc_typ] = sorted(result[svc_typ]) + + return dict(result) + + def get_description(self): + desc_bits = [] + for service, types in list(self.service_types.items()): + desc_bits.append("{0} ({1})".format(service, ", ".join(types))) + return "Valgrind: " + ", ".join(desc_bits) + + @classmethod + def could_be(cls, job): + return job.get_failure_reason() is not None and "saw valgrind issues" in job.get_failure_reason() + + def match(self, job): + return self._get_service_types(job) == self.service_types + + +known_reasons = [ + # If the failure reason indicates no packages found... 
+ RegexReason(["Failed to fetch package version from http://", + "Command failed on .* with status 100: 'sudo apt-get update"] + , "Missing packages"), +] + + +def give_me_a_reason(job): + """ + If no existing reasons match the job, generate the most specific reason we can + """ + + # Note: because we match known reasons, including GenericReasons, before any of + # the Timeout/Valgrind whatever, even if a run is a timeout or a valgrind failure, + # it will get matched up with a backtrace or assertion if one is there, hiding + # the valgrind/timeout aspect. + + for r in known_reasons: + if r.match(job): + return r + + # NB ordering matters, LockdepReason must come before AssertionReason + for klass in [DeadReason, LockdepReason, AssertionReason, TimeoutReason, ValgrindReason]: + if klass.could_be(job): + return klass(job) + + return GenericReason(job) + + +class Scraper(object): + def __init__(self, target_dir): + self.target_dir = target_dir + log.addHandler(logging.FileHandler(os.path.join(target_dir, + "scrape.log"))) + + def analyze(self): + entries = os.listdir(self.target_dir) + jobs = [] + for entry in entries: + job_dir = os.path.join(self.target_dir, entry) + if os.path.isdir(job_dir): + jobs.append(Job(job_dir, entry)) + + log.info("Found {0} jobs".format(len(jobs))) + + passes = [] + reasons = defaultdict(list) + + for job in jobs: + if job.get_success(): + passes.append(job) + continue + + matched = False + for reason, reason_jobs in reasons.items(): + if reason.match(job): + reason_jobs.append(job) + matched = True + break + + if not matched: + reasons[give_me_a_reason(job)].append(job) + + log.info("Found {0} distinct failure reasons".format(len(reasons))) + for reason, jobs in list(reasons.items()): + job_spec = "{0} jobs: {1}".format(len(jobs), [j.job_id for j in jobs]) if len(jobs) < 30 else "{0} jobs".format(len(jobs)) + log.info(reason.get_description()) + detail = reason.get_detail() + if detail: + log.info(detail) + log.info(job_spec) + suites = 
[set(j.description.split()) for j in jobs if j.description != None] + if len(suites) > 1: + log.info("suites intersection: {0}".format(sorted(set.intersection(*suites)))) + log.info("suites union: {0}".format(sorted(set.union(*suites)))) + elif len(suites) == 1: + log.info("suites: {0}".format(sorted(suites[0]))) + log.info("") + +if __name__ == '__main__': + Scraper(sys.argv[1]).analyze() diff --git a/teuthology/suite/__init__.py b/teuthology/suite/__init__.py new file mode 100644 index 000000000..8a17cf5f1 --- /dev/null +++ b/teuthology/suite/__init__.py @@ -0,0 +1,261 @@ +# this file is responsible for submitting tests into the queue +# by generating combinations of facets found in +# https://github.com/ceph/ceph-qa-suite.git + +import logging +import os +import random +import sys +import time +from distutils.util import strtobool + +import teuthology +from teuthology.config import config, YamlConfig +from teuthology.report import ResultsReporter +from teuthology.results import UNFINISHED_STATUSES + +from teuthology.suite.run import Run +from teuthology.suite.util import schedule_fail + +log = logging.getLogger(__name__) + + +def override_arg_defaults(name, default, env=os.environ): + env_arg = { + '--ceph-repo' : 'TEUTH_CEPH_REPO', + '--suite-repo' : 'TEUTH_SUITE_REPO', + '--ceph-branch' : 'TEUTH_CEPH_BRANCH', + '--suite-branch' : 'TEUTH_SUITE_BRANCH', + } + if name in env_arg and env_arg[name] in env.keys(): + variable = env_arg[name] + value = env[variable] + log.debug("Default value for '{arg}' is overridden " + "from environment with: {val}" + .format(arg=name, val=value)) + return value + else: + return default + + +def process_args(args): + conf = YamlConfig() + rename_args = { + 'ceph': 'ceph_branch', + 'sha1': 'ceph_sha1', + 'kernel': 'kernel_branch', + '': 'base_yaml_paths', + 'filter': 'filter_in', + } + for (key, value) in args.items(): + # Translate --foo-bar to foo_bar + key = key.lstrip('--').replace('-', '_') + # Rename the key if necessary + key 
= rename_args.get(key) or key + if key == 'suite_branch': + value = value or override_arg_defaults('--suite-branch', None) + if key == 'suite' and value is not None: + value = normalize_suite_name(value) + if key == 'suite_relpath' and value is None: + value = '' + elif key in ('limit', 'priority', 'num', 'newest', 'seed', 'job_threshold'): + value = int(value) + elif key == 'subset' and value is not None: + # take input string '2/3' and turn into (2, 3) + value = tuple(map(int, value.split('/'))) + elif key == 'expire' and value is None: + # Skip empty 'expire' values + continue + elif key in ('filter_all', 'filter_in', 'filter_out', 'rerun_statuses'): + if not value: + value = [] + else: + value = [x.strip() for x in value.split(',')] + elif key == 'ceph_repo': + value = expand_short_repo_name( + value, + config.get_ceph_git_url()) + elif key == 'suite_repo': + value = expand_short_repo_name( + value, + config.get_ceph_qa_suite_git_url()) + elif key in ('validate_sha1', 'filter_fragments', 'kdb'): + value = strtobool(value) + conf[key] = value + return conf + + +def normalize_suite_name(name): + return name.replace('/', ':') + +def expand_short_repo_name(name, orig): + # Allow shortname repo name 'foo' or 'foo/bar'. This works with + # github URLs, e.g. + # + # foo -> https://github.com/ceph/foo + # foo/bar -> https://github.com/foo/bar + # + # when the orig URL is also github. The two-level substitution may not + # work with some configs. 
+ name_vec = name.split('/') + if name_vec[-1] == '': + del name_vec[-1] + if len(name_vec) <= 2 and name.count(':') == 0: + orig_vec = orig.split('/') + if orig_vec[-1] == '': + del orig_vec[-1] + return '/'.join(orig_vec[:-len(name_vec)] + name_vec) + '.git' + # otherwise, assume a full URL + return name + +def main(args): + conf = process_args(args) + if conf.verbose: + teuthology.log.setLevel(logging.DEBUG) + + dry_run = conf.dry_run + if not conf.machine_type or conf.machine_type == 'None': + if not config.default_machine_type or config.default_machine_type == 'None': + schedule_fail("Must specify a machine_type", dry_run=dry_run) + else: + conf.machine_type = config.default_machine_type + elif 'multi' in conf.machine_type: + schedule_fail("'multi' is not a valid machine_type. " + + "Maybe you want 'gibba,smithi,mira' or similar", dry_run=dry_run) + + if conf.email: + config.results_email = conf.email + if conf.archive_upload: + config.archive_upload = conf.archive_upload + log.info('Will upload archives to ' + conf.archive_upload) + + if conf.rerun: + get_rerun_conf_overrides(conf) + if conf.seed < 0: + conf.seed = random.randint(0, 9999) + log.info('Using random seed=%s', conf.seed) + + run = Run(conf) + name = run.name + run.prepare_and_schedule() + if not conf.dry_run and conf.wait: + return wait(name, config.max_job_time, + conf.archive_upload_url) + + +def get_rerun_conf_overrides(conf): + reporter = ResultsReporter() + run = reporter.get_run(conf.rerun) + + conf.suite = normalize_suite_name(run['suite']) + + try: + job0 = run['jobs'][0] + except IndexError: + job0 = None + + seed = None if job0 is None else job0.get('seed') + if conf.seed >= 0 and conf.seed != seed: + log.error('--seed %s does not match with rerun seed: %s', + conf.seed, seed) + sys.exit(1) + else: + log.info('Using rerun seed=%s', seed) + conf.seed = seed + + if job0 is not None: + subset = job0.get('subset', '1/1') + if subset is None: + subset = '1/1' + subset = tuple(map(int, 
subset.split('/'))) + else: + subset = None + if conf.subset is not None and conf.subset != subset: + log.error('--subset %s does not match with ' + 'rerun subset: %s', + conf.subset, subset) + sys.exit(1) + else: + if subset == (1, 1): + conf.subset = None + else: + log.info('Using rerun subset=%s', subset) + conf.subset = subset + + no_nested_subset = False if job0 is None else job0.get('no_nested_subset', False) + if conf.no_nested_subset is not None and conf.no_nested_subset != no_nested_subset: + log.error('--no-nested-subset specified but does not match with ' + 'rerun --no-nested-subset: %s', + no_nested_subset) + sys.exit(1) + else: + log.info('Using rerun no_nested_subset=%s', no_nested_subset) + conf.no_nested_subset = no_nested_subset + + rerun_filters = get_rerun_filters(run, conf.rerun_statuses) + if len(rerun_filters['descriptions']) == 0: + log.warning( + "No jobs matched the status filters: %s", + conf.rerun_statuses, + ) + return + + conf.filter_in.extend(rerun_filters['descriptions']) + + +def get_rerun_filters(run, statuses): + filters = dict() + jobs = [] + for job in run['jobs']: + if job['status'] in statuses: + jobs.append(job) + filters['descriptions'] = [job['description'] for job in jobs if job['description']] + return filters + + +class WaitException(Exception): + pass + + +def wait(name, max_job_time, upload_url): + stale_job = max_job_time + Run.WAIT_MAX_JOB_TIME + reporter = ResultsReporter() + past_unfinished_jobs = [] + progress = time.time() + log.info(f"waiting for the run {name} to complete") + log.debug("the list of unfinished jobs will be displayed " + "every " + str(Run.WAIT_PAUSE / 60) + " minutes") + exit_code = 0 + while True: + jobs = reporter.get_jobs(name, fields=['job_id', 'status']) + unfinished_jobs = [] + for job in jobs: + if job['status'] in UNFINISHED_STATUSES: + unfinished_jobs.append(job) + elif job['status'] != 'pass': + exit_code = 1 + if len(unfinished_jobs) == 0: + log.info("wait is done") + break + if 
(len(past_unfinished_jobs) == len(unfinished_jobs) and + time.time() - progress > stale_job): + raise WaitException( + "no progress since " + str(config.max_job_time) + + " + " + str(Run.WAIT_PAUSE) + " seconds") + if len(past_unfinished_jobs) != len(unfinished_jobs): + past_unfinished_jobs = unfinished_jobs + progress = time.time() + time.sleep(Run.WAIT_PAUSE) + job_ids = [job['job_id'] for job in unfinished_jobs] + log.debug('wait for jobs ' + str(job_ids)) + jobs = reporter.get_jobs(name, fields=['job_id', 'status', + 'description', 'log_href']) + # dead, fail, pass : show fail/dead jobs first + jobs = sorted(jobs, key=lambda x: x['status']) + for job in jobs: + if upload_url: + url = os.path.join(upload_url, name, job['job_id']) + else: + url = job['log_href'] + log.info(f"{job['status']} {url} {job['description']}") + return exit_code diff --git a/teuthology/suite/build_matrix.py b/teuthology/suite/build_matrix.py new file mode 100644 index 000000000..e9ee9e60c --- /dev/null +++ b/teuthology/suite/build_matrix.py @@ -0,0 +1,209 @@ +import logging +import os +import random + +from teuthology.suite import matrix + +log = logging.getLogger(__name__) + + +def build_matrix(path, subset=None, no_nested_subset=False, seed=None): + """ + Return a list of items descibed by path such that if the list of + items is chunked into mincyclicity pieces, each piece is still a + good subset of the suite. + + A good subset of a product ensures that each facet member appears + at least once. A good subset of a sum ensures that the subset of + each sub collection reflected in the subset is a good subset. + + A mincyclicity of 0 does not attempt to enforce the good subset + property. + + The input is just a path. The output is an array of (description, + [file list]) tuples. + + For a normal file we generate a new item for the result list. + + For a directory, we (recursively) generate a new item for each + file/dir. 
+ + For a directory with a magic '+' file, we generate a single item + that concatenates all files/subdirs (A Sum). + + For a directory with a magic '%' file, we generate a result set + for each item in the directory, and then do a product to generate + a result list with all combinations (A Product). If the file + contains an integer, it is used as the divisor for a random + subset. + + For a directory with a magic '$' file, or for a directory whose name + ends in '$', we generate a list of all items that we will randomly + choose from. + + The final description (after recursion) for each item will look + like a relative path. If there was a % product, that path + component will appear as a file with braces listing the selection + of chosen subitems. + + :param path: The path to search for yaml fragments + :param subset: (index, outof) + :param no_nested_subset: disable nested subsets + :param seed: The seed for repeatable random test + """ + if subset: + log.info( + 'Subset=%s/%s' % + (str(subset[0]), str(subset[1])) + ) + if no_nested_subset: + log.info("no_nested_subset") + random.seed(seed) + mat, first, matlimit = _get_matrix(path, subset, no_nested_subset) + return generate_combinations(path, mat, first, matlimit) + + +def _get_matrix(path, subset=None, no_nested_subset=False): + (which, divisions) = (0,1) if subset is None else subset + if divisions > 1: + mat = _build_matrix(path, mincyclicity=divisions, no_nested_subset=no_nested_subset) + mat = matrix.Subset(mat, divisions, which=which) + else: + mat = _build_matrix(path, no_nested_subset=no_nested_subset) + return mat, 0, mat.size() + + +def _build_matrix(path, mincyclicity=0, no_nested_subset=False, item=''): + if os.path.basename(path)[0] == '.': + return None + if not os.path.exists(path): + raise IOError('%s does not exist (abs %s)' % (path, os.path.abspath(path))) + if os.path.isfile(path): + if path.endswith('.yaml'): + return matrix.Base(item) + return None + if os.path.isdir(path): + if 
path.endswith('.disable'): + return None + files = sorted(os.listdir(path)) + if len(files) == 0: + return None + if '+' in files: + # concatenate items + files.remove('+') + submats = [] + for fn in sorted(files): + submat = _build_matrix( + os.path.join(path, fn), + mincyclicity, + no_nested_subset, + fn) + if submat is not None: + submats.append(submat) + return matrix.Concat(item, submats) + elif path.endswith('$') or '$' in files: + # pick a random item -- make sure we don't pick any magic files + if '$' in files: + files.remove('$') + if '%' in files: + files.remove('%') + submats = [] + for fn in sorted(files): + submat = _build_matrix( + os.path.join(path, fn), + mincyclicity, + no_nested_subset, + fn) + if submat is not None: + submats.append(submat) + return matrix.PickRandom(item, submats) + elif '%' in files: + # convolve items + files.remove('%') + with open(os.path.join(path, '%')) as f: + divisions = f.read() + if no_nested_subset or len(divisions) == 0: + divisions = 1 + else: + divisions = int(divisions) + assert divisions > 0 + submats = [] + for fn in sorted(files): + submat = _build_matrix( + os.path.join(path, fn), + 0, + no_nested_subset, + fn) + if submat is not None: + submats.append(submat) + mat = matrix.Product(item, submats) + minc = mincyclicity * divisions + if mat and mat.cyclicity() < minc: + mat = matrix.Cycle( + (minc + mat.cyclicity() - 1) // mat.cyclicity(), mat + ) + if divisions > 1: + mat = matrix.Subset(mat, divisions) + return mat + else: + # list items + submats = [] + for fn in sorted(files): + submat = _build_matrix( + os.path.join(path, fn), + mincyclicity, + no_nested_subset, + fn) + if submat is None: + continue + if submat.cyclicity() < mincyclicity: + submat = matrix.Cycle( + ((mincyclicity + submat.cyclicity() - 1) // + submat.cyclicity()), + submat) + submats.append(submat) + return matrix.Sum(item, submats) + assert False, "Invalid path %s seen in _build_matrix" % path + return None + + +def 
generate_combinations(path, mat, generate_from, generate_to): + """ + Return a list of items describe by path + + The input is just a path. The output is an array of (description, + [file list]) tuples. + + For a normal file we generate a new item for the result list. + + For a directory, we (recursively) generate a new item for each + file/dir. + + For a directory with a magic '+' file, we generate a single item + that concatenates all files/subdirs. + + For a directory with a magic '%' file, we generate a result set + for each item in the directory, and then do a product to generate + a result list with all combinations. + + The final description (after recursion) for each item will look + like a relative path. If there was a % product, that path + component will appear as a file with braces listing the selection + of chosen subitems. + """ + ret = [] + for i in range(generate_from, generate_to): + output = mat.index(i) + ret.append(( + matrix.generate_desc(combine_path, output).replace('.yaml', ''), + matrix.generate_paths(path, output, combine_path))) + return ret + + +def combine_path(left, right): + """ + os.path.join(a, b) doesn't like it when b is None + """ + if right: + return os.path.join(left, right) + return left diff --git a/teuthology/suite/fragment-merge.lua b/teuthology/suite/fragment-merge.lua new file mode 100644 index 000000000..6be2e0b87 --- /dev/null +++ b/teuthology/suite/fragment-merge.lua @@ -0,0 +1,105 @@ +-- allow only some Lua (and lunatic) builtins for use by scripts +local lua_allowlist = { + assert = assert, + error = error, + ipairs = ipairs, + next = next, + pairs = pairs, + tonumber = tonumber, + tostring = tostring, + py_attrgetter = python.as_attrgetter, + py_dict = python.builtins.dict, + py_len = python.builtins.len, + py_list = python.builtins.list, + py_tuple = python.builtins.tuple, + py_enumerate = python.enumerate, + py_iterex = python.iterex, + py_itemgetter = python.as_itemgetter, + math = math, +} +lua_allowlist.__index 
= lua_allowlist + +-- accept a fragment/config (or just return true from the script!) +local function accept() + coroutine.yield(true) +end +-- reject a fragment/config (or just return false from the script!) +local function reject() + coroutine.yield(false) +end +-- this implements logic for filtering (via teuthology-suite CLI flags) +local function matches(_ENV, f) + if description:find(f, 1, true) then + return true + end + if filter_fragments then + for i,path in py_enumerate(base_frag_paths) do + if path:find(f) then + return true + end + end + end +end + +local function check_filters(_ENV) + if filter_all then + for i,f in py_enumerate(filter_all) do + if not matches(_ENV, f) then + reject() + end + end + end + if filter_in then + local found, tried = false, false + for i,f in py_enumerate(filter_in) do + tried = true + if matches(_ENV, f) then + found = true + break + end + end + if tried and not found then + reject() + end + end + if filter_out then + for i,f in py_enumerate(filter_out) do + if matches(_ENV, f) then + reject() + end + end + end +end + +function new_script(script, log, deep_merge, yaml_load) + -- create a restricted sandbox for the script: + local env = setmetatable({ + accept = accept, + deep_merge = deep_merge, + log = log, + reject = reject, + yaml_load = yaml_load, + }, lua_allowlist) + + -- avoid putting check_filters in _ENV + -- try to keep line numbers correct: + local header = [[do local check_filters = ...; accept(); check_filters(_ENV) end local function main() do ]] + local footer = [[ end return true end return main()]] + local function chunks() + coroutine.yield(header) + if #script > 0 then + coroutine.yield(script) + end + coroutine.yield(footer) + end + + -- put the script in a coroutine so we can yield success/failure from + -- anywhere in the script, including in nested function calls. 
+ local f, err = load(coroutine.wrap(chunks), 'teuthology', 't', env) + if f == nil then + error("failure to load script: "..err) + end + f = coroutine.wrap(f) + f(check_filters) + return env, f +end diff --git a/teuthology/suite/matrix.py b/teuthology/suite/matrix.py new file mode 100644 index 000000000..e713bc443 --- /dev/null +++ b/teuthology/suite/matrix.py @@ -0,0 +1,388 @@ +import os +import random +import heapq +from math import gcd +from functools import reduce + +def lcm(a, b): + return a*b // gcd(a, b) +def lcml(l): + return reduce(lcm, l) + +class Matrix: + """ + Interface for sets + """ + def size(self): + pass + + def index(self, i): + """ + index() should return a recursive structure represending the paths + to concatenate for index i: + + Result :: (PathSegment, Result) | {Result} + Path :: string + + {Result} is a frozen_set of Results indicating that + the set of paths resulting from each of the contained + Results should be concatenated. (PathSegment, Result) + indicates that PathSegment should be prepended to the + paths resulting from Result. + """ + pass + + def minscanlen(self): + """ + min run require to get a good sample + """ + pass + + def cyclicity(self): + """ + A cyclicity of N means that the set represented by the Matrix + can be chopped into N good subsets of sequential indices. 
+ """ + return self.size() // self.minscanlen() + + def tostr(self, depth): + pass + + def __str__(self): + """ + str method + """ + return self.tostr(0) + + +class Cycle(Matrix): + """ + Run a matrix multiple times + """ + def __init__(self, num, mat): + self.mat = mat + self.num = num + + def size(self): + return self.mat.size() * self.num + + def index(self, i): + return self.mat.index(i % self.mat.size()) + + def minscanlen(self): + return self.mat.minscanlen() + + def tostr(self, depth): + return '\t'*depth + "Cycle({num}):\n".format(num=self.num) + self.mat.tostr(depth + 1) + +# Logically, inverse of Cycle +class Subset(Matrix): + """ + Run a matrix subset. + """ + def __init__(self, mat, divisions, which=None): + self.mat = mat + self.divisions = divisions + if which is None: + self.which = random.randint(0, divisions-1) + else: + assert which < divisions + self.which = which + + def size(self): + return self.mat.size() // self.divisions + + def index(self, i): + i += self.which * self.size() + assert i < self.mat.size() + return self.mat.index(i) + + def minscanlen(self): + return self.mat.minscanlen() + + def tostr(self, depth): + return '\t'*depth + "Subset({num}, {index}):\n".format(num=self.num, index=self.index) + self.mat.tostr(depth + 1) + + +class Base(Matrix): + """ + Just a single item. + """ + def __init__(self, item): + self.item = item + + def size(self): + return 1 + + def index(self, i): + return self.item + + def minscanlen(self): + return 1 + + def tostr(self, depth): + return '\t'*depth + "Base({item})\n".format(item=self.item) + + +class Product(Matrix): + """ + Builds items by taking one item from each submatrix. Contiguous + subsequences should move through all dimensions. 
+ """ + def __init__(self, item, _submats): + assert len(_submats) > 0, \ + "Product requires child submats to be passed in" + self.item = item + + submats = sorted( + [((i.size(), ind), i) for (i, ind) in + zip(_submats, range(len(_submats)))], reverse=True) + self.submats = [] + self._size = 1 + for ((size, _), submat) in submats: + self.submats.append((self._size, submat)) + self._size *= size + self.submats.reverse() + + self._minscanlen = max([i.minscanlen() for i in _submats]) + if self._minscanlen + 1 > self._size: + self._minscanlen = self._size + else: + self._minscanlen += 1 + + def tostr(self, depth): + ret = '\t'*depth + "Product({item}):\n".format(item=self.item) + return ret + ''.join([i[1].tostr(depth+1) for i in self.submats]) + + def minscanlen(self): + return self._minscanlen + + def size(self): + return self._size + + def _index(self, i, submats): + """ + We recursively reduce the N dimension problem to a two + dimension problem. + + index(i) = (lmat.index(i % lmat.size()), rmat.index(i % + rmat.size())) would simply work if lmat.size() and rmat.size() + are relatively prime. + + In general, if the gcd(lmat.size(), rmat.size()) == N, + index(i) would be periodic on the interval (lmat.size() * + rmat.size()) / N. To adjust, we decrement the lmat index + number on each repeat. Each of the N repeats must therefore + be distinct from the previous ones resulting in lmat.size() * + rmat.size() combinations. 
+ """ + assert len(submats) > 0, \ + "_index requires non-empty submats" + if len(submats) == 1: + return frozenset([submats[0][1].index(i)]) + + lmat = submats[0][1] + lsize = lmat.size() + + rsize = submats[0][0] + + cycles = gcd(rsize, lsize) + clen = (rsize * lsize) // cycles + off = (i // clen) % cycles + + def combine(r, s=frozenset()): + if isinstance(r, frozenset): + return s | r + return s | frozenset([r]) + + litems = lmat.index((i - off) % lmat.size()) + ritems = self._index(i, submats[1:]) + return combine(litems, combine(ritems)) + + def index(self, i): + items = self._index(i, self.submats) + return (self.item, items) + +class Concat(Matrix): + """ + Concatenates all items in child matrices + """ + def __init__(self, item, submats): + self.submats = submats + self.item = item + + def size(self): + return 1 + + def minscanlen(self): + return 1 + + def index(self, i): + out = frozenset() + for submat in self.submats: + for i in range(submat.size()): + out = out | frozenset([submat.index(i)]) + return (self.item, out) + + def tostr(self, depth): + ret = '\t'*depth + "Concat({item}):\n".format(item=self.item) + return ret + ''.join([i.tostr(depth+1) for i in self.submats]) + +class PickRandom(Matrix): + """ + Select a random item from the child matrices. + """ + def __init__(self, item, submats): + self.submats = submats + self.item = item + + def size(self): + return 1 + + def minscanlen(self): + return 1 + + def index(self, i): + indx = random.randint(0, len(self.submats) - 1) + submat = self.submats[indx] + out = frozenset([submat.index(indx)]) + return (self.item, out) + + def tostr(self, depth): + ret = '\t'*depth + "PickRandom({item}):\n".format(item=self.item) + return ret + ''.join([i.tostr(depth+1) for i in self.submats]) + +class Sum(Matrix): + """ + We want to mix the subsequences proportionately to their size. + + The intuition is that we map all of the subsequences uniformly + onto rational numbers in [0, 1). 
The ith subsequence with length + l will have index k map onto i* + k*(1/l). i* + ensures that no two subsequences have an index which shares a + mapping in [0, 1) as long as is chosen to be small + enough. + + Rather than actually dealing with rational numbers, however, we'll + instead map onto whole numbers in [0, pseudo_size) where + pseudo_size is the lcm of the subsequence lengths * the number of + subsequences. Including the number of subsequences in the product + allows us to use 1 as . For each subsequence, we designate + an offset (position in input list) and a multiple (pseudo_size / size) + such that the psuedo_index for index i is + i*. + + I don't have a good way to map index to pseudo index, so we'll + precompute a mapping in the constructor (self._i_so_sis) from + index to (subset_index, subset). + """ + def __init__(self, item, _submats): + assert len(_submats) > 0, \ + f"Sum requires non-empty _submats: {item}" + self.item = item + + self._pseudo_size = lcml((i.size() for i in _submats)) * len(_submats) + self._size = sum((i.size() for i in _submats)) + self._submats = [ + ((i, self._pseudo_size // s.size()), s) for (i, s) in \ + zip(range(len(_submats)), _submats) + ] + + def sm_to_pmsl(offset_multiple_submat): + """ + offset_multiple_submat tuple to pseudo minscanlen + """ + ((offset, multiple), submat) = offset_multiple_submat + + return submat.minscanlen() * multiple + + def index_to_pindex_generator(submats): + assert len(submats) > 0, "submats must be non-empty" + h = [] + for (offset, multiple), submat in submats: + heapq.heappush(h, (offset, 0, multiple, submat)) + while True: + cur, si, multiple, submat = heapq.heappop(h) + heapq.heappush( + h, + (cur + multiple, si + 1, multiple, submat)) + yield si, submat + + self._i_to_sis = dict( + zip(range(self._size), index_to_pindex_generator(self._submats)) + ) + + self._minscanlen = self.pseudo_index_to_index( + max(map(sm_to_pmsl, self._submats))) + + def pi_to_sis(self, pi, offset_multiple): + 
""" + offset_multiple tuple of offset and multiple + + max(i) s.t. offset + i*multiple <= pi + """ + (offset, multiple) = offset_multiple + if pi < offset: + return -1 + return (pi - offset) // multiple + + def pseudo_index_to_index(self, pi): + """ + Count all pseudoindex values <= pi with corresponding subset indices + """ + return sum((self.pi_to_sis(pi, i) + 1 for i, _ in self._submats)) - 1 + + def tostr(self, depth): + ret = '\t'*depth + "Sum({item}):\n".format(item=self.item) + return ret + ''.join([i[1].tostr(depth+1) for i in self._submats]) + + def minscanlen(self): + return self._minscanlen + + def size(self): + return self._size + + def index(self, i): + si, submat = self._i_to_sis[i % self._size] + return (self.item, submat.index(si)) + +def generate_lists(result): + """ + Generates a set of tuples representing paths to concatenate + """ + if isinstance(result, frozenset): + ret = [] + for i in result: + ret.extend(generate_lists(i)) + return frozenset(ret) + elif isinstance(result, tuple): + ret = [] + (item, children) = result + for f in generate_lists(children): + nf = [item] + nf.extend(f) + ret.append(tuple(nf)) + return frozenset(ret) + else: + return frozenset([(result,)]) + + +def generate_paths(path, result, joinf=os.path.join): + """ + Generates from the result set a list of sorted paths to concatenate + """ + return [reduce(joinf, i, path) for i in sorted(generate_lists(result))] + + +def generate_desc(joinf, result): + """ + Generates the text description of the test represented by result + """ + if isinstance(result, frozenset): + ret = sorted([generate_desc(joinf, i) for i in result]) + return '{' + ' '.join(ret) + '}' + elif isinstance(result, tuple): + (item, children) = result + cdesc = generate_desc(joinf, children) + return joinf(str(item), cdesc) + else: + return str(result) diff --git a/teuthology/suite/merge.py b/teuthology/suite/merge.py new file mode 100644 index 000000000..0e109af02 --- /dev/null +++ b/teuthology/suite/merge.py 
@@ -0,0 +1,177 @@ +import copy +import logging +import lupa +import os +from types import MappingProxyType +import yaml + +from teuthology.config import JobConfig +from teuthology.suite.build_matrix import combine_path +from teuthology.suite.util import strip_fragment_path +from teuthology.misc import deep_merge + +log = logging.getLogger(__name__) + +TEUTHOLOGY_TEMPLATE = MappingProxyType({ + "teuthology": { + "fragments_dropped": [], + "meta": {}, + "postmerge": [], + } +}) + +L = lupa.LuaRuntime() +FRAGMENT_MERGE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fragment-merge.lua") +with open(FRAGMENT_MERGE) as f: + L.execute(f.read()) + +def config_merge(configs, suite_name=None, **kwargs): + """ + This procedure selects and merges YAML fragments for each job in the + configs array generated for the matrix of jobs. + + The primary task here is to run premerge and postmerge scripts specified + with the YAML fragments as part of filtering out jobs or individual YAML + fragments. This is done with Lua scripting (via "lupa", a "lunatic" + derivative). + + A premerge script looks like: + + + teuthology: + premerge: | + if yaml.os_type == 'ubuntu' then reject() end + + + This script runs prior to a YAML fragment merging into the complete YAML + specification for a job. The script has access to the complete YAML + description generated so far as part of merging earlier fragments + (remember: fragments are ordered lexicographically). In the above case, the + os_type is checked with the foo.yaml fragment dropped if the job is + configured to run on Ubuntu (note: this does not account for a jobs' + default os_type which is not yet known). + + The postmerge scripts look like: + + + teuthology: + postmerge: + - if yaml.os_type == "ubuntu" then reject() end + + + This script is the same but has a different effect: if, after combining all + the YAML fragments for a job, the os_type is "ubuntu", then the entire job + is dropped (filtered out / rejected). 
postmerge scripts are also specified + as a list of strings in the teuthology.postmerge array. All of these + strings are concatenated and then executed as a single script. So, + postmerge scripts from multiple fragments are all combined. You may use + this to define variables, functions, or anything else you need. + + Scripts have access to the entire yaml object and may do any desired advanced + checks. It is also possible to programatically change the YAML definition: + + + teuthology: + postmerge: + - | + local attr = py_attrgetter + local tasks = py_list() + for i = 1, 3 do + local task = py_dict( + exec = py_dict(py_list( + py_tuple("mon.a", py_list( + "echo "..i + ) + )) + ) + attr(tasks).append(task) + end + deep_merge(yaml.tasks, tasks) + + + This will be as if the yaml file contained: + + + tasks: + exec: + mon.a: + - echo 1 + exec: + mon.a: + - echo 2 + exec: + mon.a: + - echo 3 + + + Which will be merged normally (via deep_merge) after the script is run. + + Scripts are well sandboxed with access to a small selection of the Lua + builtin libraries. There is also access to some python/lupa specific + functions which are prefixed with "py_". No I/O or other system functions + permitted. + + The teuthology-suite filtering options are now implemented via builtin + postmerge scripts. Logically, if a filter matches then reject will drop + the entire job (config) from the list. 
+ """ + seed = kwargs.setdefault('seed', 1) + base_config = kwargs.setdefault('base_config', JobConfig()) + if not isinstance(seed, int): + log.debug("no valid seed input: using 1") + seed = 1 + log.debug("configuring Lua randomseed to %d", seed) + L.execute(f'local math = require"math"; math.randomseed({seed});') + new_script = L.eval('new_script') + yaml_cache = {} + for desc, paths in configs: + log.debug("merging config %s", desc) + + if suite_name is not None: + desc = combine_path(suite_name, desc) + + yaml_complete_obj = copy.deepcopy(base_config.to_dict()) + deep_merge(yaml_complete_obj, dict(TEUTHOLOGY_TEMPLATE)) + for path in paths: + if path not in yaml_cache: + with open(path) as f: + txt = f.read() + yaml_cache[path] = (txt, yaml.safe_load(txt)) + + yaml_fragment_txt, yaml_fragment_obj = yaml_cache[path] + if yaml_fragment_obj is None: + continue + yaml_fragment_obj = copy.deepcopy(yaml_fragment_obj) + premerge = yaml_fragment_obj.get('teuthology', {}).pop('premerge', '') + if premerge: + log.debug("premerge script running:\n%s", premerge) + env, script = new_script(premerge, log, deep_merge, yaml.safe_load) + env['base_frag_paths'] = [strip_fragment_path(x) for x in paths] + env['description'] = desc + env['frag_paths'] = paths + env['suite_name'] = suite_name + env['yaml'] = yaml_complete_obj + env['yaml_fragment'] = yaml_fragment_obj + for k,v in kwargs.items(): + env[k] = v + if not script(): + log.debug("skipping merge of fragment %s due to premerge filter", path) + yaml_complete_obj['teuthology']['fragments_dropped'].append(path) + continue + deep_merge(yaml_complete_obj, yaml_fragment_obj) + + postmerge = yaml_complete_obj.get('teuthology', {}).get('postmerge', []) + postmerge = "\n".join(postmerge) + log.debug("postmerge script running:\n%s", postmerge) + env, script = new_script(postmerge, log, deep_merge, yaml.safe_load) + env['base_frag_paths'] = [strip_fragment_path(x) for x in paths] + env['description'] = desc + env['frag_paths'] = paths 
+ env['suite_name'] = suite_name + env['yaml'] = yaml_complete_obj + for k,v in kwargs.items(): + env[k] = v + if not script(): + log.debug("skipping config %s due to postmerge filter", desc) + continue + yield desc, paths, yaml_complete_obj diff --git a/teuthology/suite/placeholder.py b/teuthology/suite/placeholder.py new file mode 100644 index 000000000..f812fccac --- /dev/null +++ b/teuthology/suite/placeholder.py @@ -0,0 +1,108 @@ +import copy + + +class Placeholder(object): + """ + A placeholder for use with substitute_placeholders. Simply has a 'name' + attribute. + """ + def __init__(self, name): + self.name = name + + +def substitute_placeholders(input_dict, values_dict): + """ + Replace any Placeholder instances with values named in values_dict. In the + case of None values, the key is omitted from the result. + + Searches through nested dicts. + + :param input_dict: A dict which may contain one or more Placeholder + instances as values. + :param values_dict: A dict, with keys matching the 'name' attributes of all + of the Placeholder instances in the input_dict, and + values to be substituted. + :returns: The modified input_dict + """ + input_dict = copy.deepcopy(input_dict) + + def _substitute(input_dict, values_dict): + for key, value in list(input_dict.items()): + if isinstance(value, dict): + _substitute(value, values_dict) + elif isinstance(value, Placeholder): + if values_dict[value.name] is None: + del input_dict[key] + continue + # If there is a Placeholder without a corresponding entry in + # values_dict, we will hit a KeyError - we want this. 
+ input_dict[key] = values_dict[value.name] + return input_dict + + return _substitute(input_dict, values_dict) + + +# Template for the config that becomes the base for each generated job config +dict_templ = { + 'branch': Placeholder('ceph_branch'), + 'expire': Placeholder('expire'), + 'sha1': Placeholder('ceph_hash'), + 'teuthology_branch': Placeholder('teuthology_branch'), + 'teuthology_sha1': Placeholder('teuthology_sha1'), + 'archive_upload': Placeholder('archive_upload'), + 'archive_upload_key': Placeholder('archive_upload_key'), + 'machine_type': Placeholder('machine_type'), + 'os_type': Placeholder('distro'), + 'os_version': Placeholder('distro_version'), + 'overrides': { + 'admin_socket': { + 'branch': Placeholder('ceph_branch'), + }, + 'ceph': { + 'conf': { + 'mon': { + 'debug mon': 20, + 'debug ms': 1, + 'debug paxos': 20}, + 'mgr': { + 'debug mgr': 20, + 'debug ms': 1}, + 'osd': { + 'debug ms': 1, + 'debug osd': 20 + } + }, + 'flavor': Placeholder('flavor'), + 'log-ignorelist': [r'\(MDS_ALL_DOWN\)', + r'\(MDS_UP_LESS_THAN_MAX\)'], + 'sha1': Placeholder('ceph_hash'), + }, + 'ceph-deploy': { + 'conf': { + 'client': { + 'log file': '/var/log/ceph/ceph-$name.$pid.log' + }, + 'mon': { + } + } + }, + 'install': { + 'ceph': { + 'sha1': Placeholder('ceph_hash'), + 'flavor': Placeholder('flavor'), + } + }, + 'workunit': { + 'branch': Placeholder('suite_branch'), + 'sha1': Placeholder('suite_hash'), + } + }, + 'repo': Placeholder('ceph_repo'), + 'sleep_before_teardown': 0, + 'suite': Placeholder('suite'), + 'suite_repo': Placeholder('suite_repo'), + 'suite_relpath': Placeholder('suite_relpath'), + 'suite_branch': Placeholder('suite_branch'), + 'suite_sha1': Placeholder('suite_hash'), + 'tasks': [], +} diff --git a/teuthology/suite/run.py b/teuthology/suite/run.py new file mode 100644 index 000000000..984231dfb --- /dev/null +++ b/teuthology/suite/run.py @@ -0,0 +1,749 @@ +import copy +import datetime +import logging +import os +import pwd +import yaml +import re 
+import time + +from pathlib import Path + +from humanfriendly import format_timespan + +from teuthology import repo_utils + +from teuthology.config import config, JobConfig +from teuthology.exceptions import ( + BranchMismatchError, BranchNotFoundError, CommitNotFoundError, +) +from teuthology.misc import deep_merge, get_results_url, update_key +from teuthology.orchestra.opsys import OS +from teuthology.repo_utils import build_git_url + +from teuthology.suite import util +from teuthology.suite.merge import config_merge +from teuthology.suite.build_matrix import build_matrix +from teuthology.suite.placeholder import substitute_placeholders, dict_templ +from teuthology.util.time import parse_offset, parse_timestamp, TIMESTAMP_FMT + +log = logging.getLogger(__name__) + + +class Run(object): + WAIT_MAX_JOB_TIME = 30 * 60 + WAIT_PAUSE = 5 * 60 + __slots__ = ( + 'args', 'name', 'base_config', 'suite_repo_path', 'base_yaml_paths', + 'base_args', 'kernel_dict', 'config_input', 'timestamp', 'user', 'os', + ) + + def __init__(self, args): + """ + args must be a config.YamlConfig object + """ + self.args = args + # We assume timestamp is a datetime.datetime object + self.timestamp = self.args.timestamp or \ + datetime.datetime.now().strftime(TIMESTAMP_FMT) + self.user = self.args.owner or pwd.getpwuid(os.getuid()).pw_name + self.name = self.make_run_name() + if self.args.ceph_repo: + config.ceph_git_url = self.args.ceph_repo + if self.args.suite_repo: + config.ceph_qa_suite_git_url = self.args.suite_repo + + self.base_config = self.create_initial_config() + + # Interpret any relative paths as being relative to ceph-qa-suite + # (absolute paths are unchanged by this) + self.base_yaml_paths = [os.path.join(self.suite_repo_path, b) for b in + self.args.base_yaml_paths] + + def make_run_name(self): + """ + Generate a run name. 
A run name looks like: + teuthology-2014-06-23_19:00:37-rados-dumpling-testing-basic-plana + """ + worker = util.get_worker(self.args.machine_type) + return '-'.join( + [ + self.user, + str(self.timestamp), + self.args.suite, + self.args.ceph_branch, + self.args.kernel_branch or '-', + self.args.flavor, worker + ] + ).replace('/', ':') + + def create_initial_config(self): + """ + Put together the config file used as the basis for each job in the run. + Grabs hashes for the latest ceph, kernel and teuthology versions in the + branches specified and specifies them so we know exactly what we're + testing. + + :returns: A JobConfig object + """ + now = datetime.datetime.now(datetime.timezone.utc) + expires = self.get_expiration() + if expires: + if now > expires: + util.schedule_fail( + f"Refusing to schedule because the expiration date is in the past: {self.args.expire}", + dry_run=self.args.dry_run, + ) + + self.os = self.choose_os() + self.kernel_dict = self.choose_kernel() + ceph_hash = self.choose_ceph_hash() + # We don't store ceph_version because we don't use it yet outside of + # logging. 
+ self.choose_ceph_version(ceph_hash) + suite_branch = self.choose_suite_branch() + suite_hash = self.choose_suite_hash(suite_branch) + if self.args.suite_dir: + self.suite_repo_path = self.args.suite_dir + else: + self.suite_repo_path = util.fetch_repos( + suite_branch, test_name=self.name, dry_run=self.args.dry_run, commit=suite_hash) + teuthology_branch, teuthology_sha1 = self.choose_teuthology_branch() + + + if self.args.distro_version: + self.args.distro_version, _ = \ + OS.version_codename(self.args.distro, self.args.distro_version) + self.config_input = dict( + suite=self.args.suite, + suite_branch=suite_branch, + suite_hash=suite_hash, + ceph_branch=self.args.ceph_branch, + ceph_hash=ceph_hash, + ceph_repo=config.get_ceph_git_url(), + teuthology_branch=teuthology_branch, + teuthology_sha1=teuthology_sha1, + machine_type=self.args.machine_type, + distro=self.os.name, + distro_version=self.os.version, + archive_upload=config.archive_upload, + archive_upload_key=config.archive_upload_key, + suite_repo=config.get_ceph_qa_suite_git_url(), + suite_relpath=self.args.suite_relpath, + flavor=self.args.flavor, + expire=expires.strftime(TIMESTAMP_FMT) if expires else None, + ) + return self.build_base_config() + + def get_expiration(self, _base_time: datetime.datetime | None = None) -> datetime.datetime | None: + """ + _base_time: For testing, calculate relative offsets from this base time + + :returns: True if the job should run; False if it has expired + """ + log.info(f"Checking for expiration ({self.args.expire})") + expires_str = self.args.expire + if expires_str is None: + return None + now = datetime.datetime.now(datetime.timezone.utc) + if _base_time is None: + _base_time = now + try: + expires = parse_timestamp(expires_str) + except ValueError: + expires = _base_time + parse_offset(expires_str) + return expires + + def choose_os(self): + os_type = self.args.distro + os_version = self.args.distro_version + if not (os_type and os_version): + os_ = 
util.get_distro_defaults( + self.args.distro, self.args.machine_type)[2] + else: + os_ = OS(os_type, os_version) + return os_ + + def choose_kernel(self): + # Put together a stanza specifying the kernel hash + if self.args.kernel_branch == 'distro': + kernel_hash = 'distro' + kernel_branch = 'distro' + # Skip the stanza if '-k none' is given + elif self.args.kernel_branch is None or \ + self.args.kernel_branch.lower() == 'none': + kernel_hash = None + kernel_branch = None + else: + kernel_branch = self.args.kernel_branch + kernel_hash = util.get_gitbuilder_hash( + 'kernel', kernel_branch, 'default', + self.args.machine_type, self.args.distro, + self.args.distro_version, + ) + if not kernel_hash: + util.schedule_fail( + "Kernel branch '{branch}' not found".format( + branch=self.args.kernel_branch), + dry_run=self.args.dry_run, + ) + kdb = True + if self.args.kdb is not None: + kdb = self.args.kdb + + if kernel_hash: + log.info("kernel sha1: {hash}".format(hash=kernel_hash)) + kernel_dict = dict(kernel=dict(branch=kernel_branch, kdb=kdb, sha1=kernel_hash)) + if kernel_hash != 'distro': + kernel_dict['kernel']['flavor'] = 'default' + else: + kernel_dict = dict() + return kernel_dict + + def choose_ceph_hash(self): + """ + Get the ceph hash: if --sha1/-S is supplied, use it if it is valid, and + just keep the ceph_branch around. Otherwise use the current git branch + tip. 
+ """ + repo_name = self.ceph_repo_name + + ceph_hash = None + if self.args.ceph_sha1: + ceph_hash = self.args.ceph_sha1 + if self.args.validate_sha1: + ceph_hash = util.git_validate_sha1(repo_name, ceph_hash) + if not ceph_hash: + exc = CommitNotFoundError( + self.args.ceph_sha1, + '%s.git' % repo_name + ) + util.schedule_fail(message=str(exc), name=self.name, dry_run=self.args.dry_run) + log.info("ceph sha1 explicitly supplied") + + elif self.args.ceph_branch: + ceph_hash = util.git_ls_remote( + self.args.ceph_repo, self.args.ceph_branch) + if not ceph_hash: + exc = BranchNotFoundError( + self.args.ceph_branch, + '%s.git' % repo_name + ) + util.schedule_fail(message=str(exc), name=self.name, dry_run=self.args.dry_run) + + log.info("ceph sha1: {hash}".format(hash=ceph_hash)) + return ceph_hash + + def choose_ceph_version(self, ceph_hash): + if config.suite_verify_ceph_hash and not self.args.newest: + # don't bother if newest; we'll search for an older one + # Get the ceph package version + ceph_version = util.package_version_for_hash( + ceph_hash, self.args.flavor, self.os.name, + self.os.version, self.args.machine_type, + ) + if not ceph_version: + msg = f"Packages for os_type '{self.os.name}', flavor " \ + f"{self.args.flavor} and ceph hash '{ceph_hash}' not found" + util.schedule_fail(msg, self.name, dry_run=self.args.dry_run) + log.info("ceph version: {ver}".format(ver=ceph_version)) + return ceph_version + else: + log.info('skipping ceph package verification') + + def choose_teuthology_branch(self): + """Select teuthology branch, check if it is present in repo and return + tuple (branch, hash) where hash is commit sha1 corresponding + to the HEAD of the branch. + + The branch name value is determined in the following order: + + Use ``--teuthology-branch`` argument value if supplied. + + Use ``TEUTH_BRANCH`` environment variable value if declared. 
+ + If file ``qa/.teuthology_branch`` can be found in the suite repo + supplied with ``--suite-repo`` or ``--suite-dir`` and contains + non-empty string then use it as the branch name. + + Use ``teuthology_branch`` value if it is set in the one + of the teuthology config files ``$HOME/teuthology.yaml`` + or ``/etc/teuthology.yaml`` correspondingly. + + Use ``main``. + + Generate exception if the branch is not present in the repo. + + """ + teuthology_branch = self.args.teuthology_branch + if not teuthology_branch: + teuthology_branch = os.environ.get('TEUTH_BRANCH', None) + if not teuthology_branch: + branch_file_path = self.suite_repo_path + '/qa/.teuthology_branch' + log.debug('Check file %s exists', branch_file_path) + if os.path.exists(branch_file_path): + log.debug('Found teuthology branch config file %s', + branch_file_path) + with open(branch_file_path) as f: + teuthology_branch = f.read().strip() + if teuthology_branch: + log.debug( + 'The teuthology branch is overridden with %s', + teuthology_branch) + else: + log.warning( + 'The teuthology branch config is empty, skipping') + if not teuthology_branch: + teuthology_branch = config.get('teuthology_branch') + + if config.teuthology_path: + actual_branch = repo_utils.current_branch(config.teuthology_path) + if teuthology_branch and actual_branch != teuthology_branch: + raise BranchMismatchError( + teuthology_branch, + config.teuthology_path, + "config.teuthology_path is set", + ) + if not teuthology_branch: + teuthology_branch = actual_branch + teuthology_sha1 = util.git_ls_remote( + f"file://{Path(config.teuthology_path).resolve()}", + teuthology_branch + ) + else: + if not teuthology_branch: + teuthology_branch = 'main' + teuthology_sha1 = util.git_ls_remote( + 'teuthology', + teuthology_branch + ) + if not teuthology_sha1: + exc = BranchNotFoundError(teuthology_branch, build_git_url('teuthology')) + util.schedule_fail(message=str(exc), name=self.name, dry_run=self.args.dry_run) + log.info("teuthology 
branch: %s %s", teuthology_branch, teuthology_sha1) + return teuthology_branch, teuthology_sha1 + + @property + def ceph_repo_name(self): + if self.args.ceph_repo: + return self._repo_name(self.args.ceph_repo) + else: + return 'ceph' + + @property + def suite_repo_name(self): + if self.args.suite_repo: + return self._repo_name(self.args.suite_repo) + else: + return 'ceph-qa-suite' + + @staticmethod + def _repo_name(url): + return re.sub(r'\.git$', '', url.split('/')[-1]) + + def choose_suite_branch(self): + suite_repo_name = self.suite_repo_name + suite_repo_project_or_url = self.args.suite_repo or 'ceph-qa-suite' + suite_branch = self.args.suite_branch + ceph_branch = self.args.ceph_branch + if suite_branch and suite_branch != 'main': + if not util.git_branch_exists( + suite_repo_project_or_url, + suite_branch + ): + exc = BranchNotFoundError(suite_branch, suite_repo_name) + util.schedule_fail(message=str(exc), name=self.name, dry_run=self.args.dry_run) + elif not suite_branch: + # Decide what branch of the suite repo to use + if util.git_branch_exists(suite_repo_project_or_url, ceph_branch): + suite_branch = ceph_branch + else: + log.info( + "branch {0} not in {1}; will use main for" + " ceph-qa-suite".format( + ceph_branch, + suite_repo_name + )) + suite_branch = 'main' + return suite_branch + + def choose_suite_hash(self, suite_branch): + suite_repo_name = self.suite_repo_name + suite_hash = None + if self.args.suite_sha1: + suite_hash = self.args.suite_sha1 + if self.args.validate_sha1: + suite_hash = util.git_validate_sha1(suite_repo_name, suite_hash) + if not suite_hash: + exc = CommitNotFoundError( + self.args.suite_sha1, + '%s.git' % suite_repo_name + ) + util.schedule_fail(message=str(exc), name=self.name, dry_run=self.args.dry_run) + log.info("suite sha1 explicitly supplied") + else: + suite_repo_project_or_url = self.args.suite_repo or 'ceph-qa-suite' + suite_hash = util.git_ls_remote( + suite_repo_project_or_url, + suite_branch + ) + if not suite_hash: 
+ exc = BranchNotFoundError(suite_branch, suite_repo_name) + util.schedule_fail(message=str(exc), name=self.name, dry_run=self.args.dry_run) + log.info("%s branch: %s %s", suite_repo_name, suite_branch, suite_hash) + return suite_hash + + def build_base_config(self): + conf_dict = substitute_placeholders(dict_templ, self.config_input) + conf_dict.update(self.kernel_dict) + job_config = JobConfig.from_dict(conf_dict) + job_config.name = self.name + job_config.user = self.user + job_config.timestamp = self.timestamp + job_config.priority = self.args.priority + job_config.seed = self.args.seed + if self.args.subset: + job_config.subset = '/'.join(str(i) for i in self.args.subset) + if self.args.email: + job_config.email = self.args.email + if self.args.owner: + job_config.owner = self.args.owner + if self.args.sleep_before_teardown: + job_config.sleep_before_teardown = int(self.args.sleep_before_teardown) + if self.args.rocketchat: + job_config.rocketchat = self.args.rocketchat + return job_config + + def build_base_args(self): + base_args = [ + '--name', self.name, + '--worker', util.get_worker(self.args.machine_type), + ] + if self.args.dry_run: + base_args.append('--dry-run') + if self.args.priority is not None: + base_args.extend(['--priority', str(self.args.priority)]) + if self.args.verbose: + base_args.append('-v') + if self.args.owner: + base_args.extend(['--owner', self.args.owner]) + if self.args.queue_backend: + base_args.extend(['--queue-backend', self.args.queue_backend]) + return base_args + + + def write_rerun_memo(self): + args = copy.deepcopy(self.base_args) + args.append('--first-in-suite') + if self.args.subset: + subset = '/'.join(str(i) for i in self.args.subset) + args.extend(['--subset', subset]) + if self.args.no_nested_subset: + args.extend(['--no-nested-subset']) + args.extend(['--seed', str(self.args.seed)]) + util.teuthology_schedule( + args=args, + dry_run=self.args.dry_run, + verbose=self.args.verbose, + log_prefix="Memo: ") + + + def 
write_result(self): + arg = copy.deepcopy(self.base_args) + arg.append('--last-in-suite') + if self.base_config.email: + arg.extend(['--email', self.base_config.email]) + if self.args.timeout: + arg.extend(['--timeout', self.args.timeout]) + util.teuthology_schedule( + args=arg, + dry_run=self.args.dry_run, + verbose=self.args.verbose, + log_prefix="Results: ") + results_url = get_results_url(self.base_config.name) + if results_url: + log.info("Test results viewable at %s", results_url) + + + def prepare_and_schedule(self): + """ + Puts together some "base arguments" with which to execute + teuthology-schedule for each job, then passes them and other parameters + to schedule_suite(). Finally, schedules a "last-in-suite" job that + sends an email to the specified address (if one is configured). + """ + self.base_args = self.build_base_args() + + # Make sure the yaml paths are actually valid + for yaml_path in self.base_yaml_paths: + full_yaml_path = os.path.join(self.suite_repo_path, yaml_path) + if not os.path.exists(full_yaml_path): + raise IOError("File not found: " + full_yaml_path) + + num_jobs = self.schedule_suite() + + if num_jobs: + self.write_result() + + def collect_jobs(self, arch, configs, newest=False, limit=0): + jobs_to_schedule = [] + jobs_missing_packages = [] + for description, fragment_paths, parsed_yaml in configs: + if limit > 0 and len(jobs_to_schedule) >= limit: + log.info( + 'Stopped after {limit} jobs due to --limit={limit}'.format( + limit=limit)) + break + + os_type = parsed_yaml.get('os_type') or self.base_config.os_type + os_version = parsed_yaml.get('os_version') or self.base_config.os_version + exclude_arch = parsed_yaml.get('exclude_arch') + exclude_os_type = parsed_yaml.get('exclude_os_type') + + if exclude_arch and exclude_arch == arch: + log.info('Skipping due to excluded_arch: %s facets %s', + exclude_arch, description) + continue + if exclude_os_type and exclude_os_type == os_type: + log.info('Skipping due to excluded_os_type: 
%s facets %s', + exclude_os_type, description) + continue + update_key('sha1', parsed_yaml, self.base_config) + update_key('suite_sha1', parsed_yaml, self.base_config) + + full_job_config = copy.deepcopy(self.base_config.to_dict()) + deep_merge(full_job_config, parsed_yaml) + flavor = util.get_install_task_flavor(full_job_config) + + parsed_yaml['flavor'] = flavor + + arg = copy.deepcopy(self.base_args) + arg.extend([ + '--num', str(self.args.num), + '--description', description, + '--', + ]) + arg.extend(self.base_yaml_paths) + + parsed_yaml_txt = yaml.dump(parsed_yaml) + arg.append('-') + + job = dict( + yaml=parsed_yaml, + desc=description, + sha1=self.base_config.sha1, + args=arg, + stdin=parsed_yaml_txt, + ) + + sha1 = self.base_config.sha1 + if parsed_yaml.get('verify_ceph_hash', + config.suite_verify_ceph_hash): + version = util.package_version_for_hash(sha1, flavor, os_type, + os_version, self.args.machine_type) + if not version: + jobs_missing_packages.append(job) + log.error(f"Packages for os_type '{os_type}', flavor {flavor} and " + f"ceph hash '{sha1}' not found") + # optimization: one missing package causes backtrack in newest mode; + # no point in continuing the search + if newest: + return jobs_missing_packages, [] + + jobs_to_schedule.append(job) + return jobs_missing_packages, jobs_to_schedule + + def schedule_jobs(self, jobs_missing_packages, jobs_to_schedule, name): + for job in jobs_to_schedule: + log.info( + 'Scheduling %s', job['desc'] + ) + + log_prefix = '' + if job in jobs_missing_packages: + log_prefix = "Missing Packages: " + if not config.suite_allow_missing_packages: + util.schedule_fail( + "At least one job needs packages that don't exist " + f"for hash {self.base_config.sha1}.", + name, + dry_run=self.args.dry_run, + ) + util.teuthology_schedule( + args=job['args'], + dry_run=self.args.dry_run, + verbose=self.args.verbose, + log_prefix=log_prefix, + stdin=job['stdin'], + ) + throttle = self.args.throttle + if not self.args.dry_run and 
throttle: + log.info("pause between jobs : --throttle " + str(throttle)) + time.sleep(int(throttle)) + + def check_priority(self, jobs_to_schedule): + priority = self.args.priority + msg=f'''Unable to schedule {jobs_to_schedule} jobs with priority {priority}. + +Use the following testing priority +10 to 49: Tests which are urgent and blocking other important development. +50 to 74: Testing a particular feature/fix with less than 25 jobs and can also be used for urgent release testing. +75 to 99: Tech Leads usually schedule integration tests with this priority to verify pull requests against main. +100 to 149: QE validation of point releases. +150 to 199: Testing a particular feature/fix with less than 100 jobs and results will be available in a day or so. +200 to 1000: Large test runs that can be done over the course of a week. +Note: To force run, use --force-priority''' + if priority < 50: + util.schedule_fail(msg, dry_run=self.args.dry_run) + elif priority < 75 and jobs_to_schedule > 25: + util.schedule_fail(msg, dry_run=self.args.dry_run) + elif priority < 150 and jobs_to_schedule > 100: + util.schedule_fail(msg, dry_run=self.args.dry_run) + + def check_num_jobs(self, jobs_to_schedule): + """ + Fail schedule if number of jobs exceeds job threshold. + """ + threshold = self.args.job_threshold + msg=f'''Unable to schedule {jobs_to_schedule} jobs, too many jobs, when maximum {threshold} jobs allowed. + +Note: If you still want to go ahead, use --job-threshold 0''' + if threshold and jobs_to_schedule > threshold: + util.schedule_fail(msg, dry_run=self.args.dry_run) + + def schedule_suite(self): + """ + Schedule the suite-run. Returns the number of jobs scheduled. 
+ """ + name = self.name + if self.args.arch: + arch = self.args.arch + log.debug("Using '%s' as an arch" % arch) + else: + arch = util.get_arch(self.base_config.machine_type) + suite_name = self.base_config.suite + suite_path = os.path.normpath(os.path.join( + self.suite_repo_path, + self.args.suite_relpath, + 'suites', + self.base_config.suite.replace(':', '/'), + )) + log.debug('Suite %s in %s' % (suite_name, suite_path)) + log.debug(f"subset = {self.args.subset}") + log.debug(f"no_nested_subset = {self.args.no_nested_subset}") + if self.args.dry_run: + log.debug("Base job config:\n%s" % self.base_config) + + configs = build_matrix(suite_path, + subset=self.args.subset, + no_nested_subset=self.args.no_nested_subset, + seed=self.args.seed) + generated = len(configs) + log.info(f'Suite {suite_name} in {suite_path} generated {generated} jobs (not yet filtered or merged)') + configs = list(config_merge(configs, + filter_in=self.args.filter_in, + filter_out=self.args.filter_out, + filter_all=self.args.filter_all, + filter_fragments=self.args.filter_fragments, + base_config=self.base_config, + seed=self.args.seed, + suite_name=suite_name)) + + # compute job limit in respect of --sleep-before-teardown + job_limit = self.args.limit or 0 + sleep_before_teardown = int(self.args.sleep_before_teardown or 0) + if sleep_before_teardown: + if job_limit == 0: + log.warning('The --sleep-before-teardown option was provided: ' + 'only 1 job will be scheduled. ' + 'Use --limit to run more jobs') + # give user a moment to read this warning + time.sleep(5) + job_limit = 1 + elif self.args.non_interactive: + log.warning( + 'The --sleep-before-teardown option is active. ' + 'There will be a maximum {} jobs running ' + 'which will fall asleep for {}' + .format(job_limit, format_timespan(sleep_before_teardown))) + elif job_limit > 4: + are_you_insane=( + 'There are {total} configs and {maximum} job limit is used. 
' + 'Do you really want to lock all machines needed for ' + 'this run for {that_long}? (y/N):' + .format( + that_long=format_timespan(sleep_before_teardown), + total=generated, + maximum=job_limit)) + while True: + insane=(input(are_you_insane) or 'n').lower() + if insane == 'y': + break + elif insane == 'n': + exit(0) + + # if newest, do this until there are no missing packages + # if not, do it once + backtrack = 0 + limit = self.args.newest + sha1s = [] + jobs_to_schedule = [] + jobs_missing_packages = [] + while backtrack <= limit: + jobs_missing_packages, jobs_to_schedule = \ + self.collect_jobs(arch, configs, self.args.newest, job_limit) + if jobs_missing_packages and self.args.newest: + if not sha1s: + sha1s = util.find_git_parents('ceph', str(self.base_config.sha1), self.args.newest) + if not sha1s: + util.schedule_fail('Backtrack for --newest failed', name, dry_run=self.args.dry_run) + cur_sha1 = sha1s.pop(0) + self.config_input['ceph_hash'] = cur_sha1 + # If ceph_branch and suite_branch are the same and + # ceph_repo and suite_repo are the same, update suite_hash + if (self.args.ceph_repo == self.args.suite_repo) and \ + (self.args.ceph_branch == self.args.suite_branch): + self.config_input['suite_hash'] = cur_sha1 + self.base_config = self.build_base_config() + backtrack += 1 + continue + if backtrack: + log.info("--newest supplied, backtracked %d commits to %s" % + (backtrack, self.base_config.sha1)) + break + else: + if self.args.newest: + util.schedule_fail( + 'Exceeded %d backtracks; raise --newest value' % limit, + name, + dry_run=self.args.dry_run, + ) + + if jobs_to_schedule: + self.write_rerun_memo() + + # Before scheduling jobs, check the priority + if self.args.priority and jobs_to_schedule and not self.args.force_priority: + self.check_priority(len(jobs_to_schedule)) + + self.check_num_jobs(len(jobs_to_schedule)) + + self.schedule_jobs(jobs_missing_packages, jobs_to_schedule, name) + + count = len(jobs_to_schedule) + missing_count = 
len(jobs_missing_packages) + total_count = count + if self.args.num: + total_count *= self.args.num + log.info( + 'Suite %s in %s scheduled %d jobs.' % + (suite_name, suite_path, count) + ) + log.info('%d/%d jobs were filtered out.', + (generated - count), + generated) + if missing_count: + log.warning('Scheduled %d/%d jobs that are missing packages!', + missing_count, count) + log.info('Scheduled %d jobs in total.', total_count) + return count diff --git a/teuthology/suite/test/conftest.py b/teuthology/suite/test/conftest.py new file mode 100644 index 000000000..4285bdcfc --- /dev/null +++ b/teuthology/suite/test/conftest.py @@ -0,0 +1,4 @@ +from teuthology.config import config + +def pytest_runtest_setup(): + config.load({}) diff --git a/teuthology/suite/test/suites/noop/noop.yaml b/teuthology/suite/test/suites/noop/noop.yaml new file mode 100644 index 000000000..fb674b1b1 --- /dev/null +++ b/teuthology/suite/test/suites/noop/noop.yaml @@ -0,0 +1,7 @@ +roles: +- - mon.a + - osd.0 +tasks: +- exec: + mon.a: + - echo "Well done !" 
diff --git a/teuthology/suite/test/test_build_matrix.py b/teuthology/suite/test/test_build_matrix.py new file mode 100644 index 000000000..ad710d2ef --- /dev/null +++ b/teuthology/suite/test/test_build_matrix.py @@ -0,0 +1,815 @@ +import os +import random + +from mock import patch, MagicMock + +from teuthology.suite import build_matrix +from teuthology.test.fake_fs import make_fake_fstools + + +class TestBuildMatrixSimple(object): + def test_combine_path(self): + result = build_matrix.combine_path("/path/to/left", "right/side") + assert result == "/path/to/left/right/side" + + def test_combine_path_no_right(self): + result = build_matrix.combine_path("/path/to/left", None) + assert result == "/path/to/left" + + +class TestBuildMatrix(object): + + patchpoints = [ + 'os.path.exists', + 'os.listdir', + 'os.path.isfile', + 'os.path.isdir', + 'builtins.open', + ] + + def setup_method(self): + self.mocks = dict() + self.patchers = dict() + for ppoint in self.__class__.patchpoints: + self.mocks[ppoint] = MagicMock() + self.patchers[ppoint] = patch(ppoint, self.mocks[ppoint]) + + def start_patchers(self, fake_fs): + fake_fns = make_fake_fstools(fake_fs) + # N.B.: relies on fake_fns being in same order as patchpoints + for ppoint, fn in zip(self.__class__.patchpoints, fake_fns): + self.mocks[ppoint].side_effect = fn + self.patchers[ppoint].start() + + def stop_patchers(self): + for patcher in self.patchers.values(): + patcher.stop() + + def teardown_method(self): + self.patchers.clear() + self.mocks.clear() + + def fragment_occurences(self, jobs, fragment): + # What fraction of jobs contain fragment? 
+ count = 0 + for (description, fragment_list) in jobs: + for item in fragment_list: + if item.endswith(fragment): + count += 1 + return count / float(len(jobs)) + + def test_concatenate_1x2x3(self): + fake_fs = { + 'd0_0': { + '+': None, + 'd1_0': { + 'd1_0_0.yaml': None, + }, + 'd1_1': { + 'd1_1_0.yaml': None, + 'd1_1_1.yaml': None, + }, + 'd1_2': { + 'd1_2_0.yaml': None, + 'd1_2_1.yaml': None, + 'd1_2_2.yaml': None, + }, + }, + } + self.start_patchers(fake_fs) + try: + result = build_matrix.build_matrix('d0_0') + finally: + self.stop_patchers() + assert len(result) == 1 + + def test_convolve_2x2(self): + fake_fs = { + 'd0_0': { + '%': None, + 'd1_0': { + 'd1_0_0.yaml': None, + 'd1_0_1.yaml': None, + }, + 'd1_1': { + 'd1_1_0.yaml': None, + 'd1_1_1.yaml': None, + }, + }, + } + self.start_patchers(fake_fs) + try: + result = build_matrix.build_matrix('d0_0') + finally: + self.stop_patchers() + assert len(result) == 4 + assert self.fragment_occurences(result, 'd1_1_1.yaml') == 0.5 + + def test_convolve_2x2x2(self): + fake_fs = { + 'd0_0': { + '%': None, + 'd1_0': { + 'd1_0_0.yaml': None, + 'd1_0_1.yaml': None, + }, + 'd1_1': { + 'd1_1_0.yaml': None, + 'd1_1_1.yaml': None, + }, + 'd1_2': { + 'd1_2_0.yaml': None, + 'd1_2_1.yaml': None, + }, + }, + } + self.start_patchers(fake_fs) + try: + result = build_matrix.build_matrix('d0_0') + finally: + self.stop_patchers() + assert len(result) == 8 + assert self.fragment_occurences(result, 'd1_2_0.yaml') == 0.5 + + def test_convolve_1x2x4(self): + fake_fs = { + 'd0_0': { + '%': None, + 'd1_0': { + 'd1_0_0.yaml': None, + }, + 'd1_1': { + 'd1_1_0.yaml': None, + 'd1_1_1.yaml': None, + }, + 'd1_2': { + 'd1_2_0.yaml': None, + 'd1_2_1.yaml': None, + 'd1_2_2.yaml': None, + 'd1_2_3.yaml': None, + }, + }, + } + self.start_patchers(fake_fs) + try: + result = build_matrix.build_matrix('d0_0') + finally: + self.stop_patchers() + assert len(result) == 8 + assert self.fragment_occurences(result, 'd1_2_2.yaml') == 0.25 + + def 
test_convolve_with_concat(self): + fake_fs = { + 'd0_0': { + '%': None, + 'd1_0': { + 'd1_0_0.yaml': None, + }, + 'd1_1': { + 'd1_1_0.yaml': None, + 'd1_1_1.yaml': None, + }, + 'd1_2': { + '+': None, + 'd1_2_0.yaml': None, + 'd1_2_1.yaml': None, + 'd1_2_2.yaml': None, + 'd1_2_3.yaml': None, + }, + }, + } + self.start_patchers(fake_fs) + try: + result = build_matrix.build_matrix('d0_0') + finally: + self.stop_patchers() + assert len(result) == 2 + for i in result: + assert 'd0_0/d1_2/d1_2_0.yaml' in i[1] + assert 'd0_0/d1_2/d1_2_1.yaml' in i[1] + assert 'd0_0/d1_2/d1_2_2.yaml' in i[1] + assert 'd0_0/d1_2/d1_2_3.yaml' in i[1] + + def test_convolve_nested(self): + fake_fs = { + 'd0_0': { + '%': None, + 'd1_0': { + 'd1_0_0.yaml': None, + '%': '2', + 'd1_0_1': { + 'd1_0_1_0.yaml': None, + 'd1_0_1_1.yaml': None, + }, + 'd1_0_2': { + 'd1_0_2_0.yaml': None, + 'd1_0_2_1.yaml': None, + }, + }, + 'd1_2': { + 'd1_2_0.yaml': None, + 'd1_2_1.yaml': None, + 'd1_2_2.yaml': None, + 'd1_2_3.yaml': None, + }, + }, + } + self.start_patchers(fake_fs) + try: + result = build_matrix.build_matrix('d0_0') + finally: + self.stop_patchers() + assert len(result) == 8 + assert self.fragment_occurences(result, 'd1_0_0.yaml') == 1 + assert self.fragment_occurences(result, 'd1_0_1_0.yaml') == 0.5 + assert self.fragment_occurences(result, 'd1_0_1_1.yaml') == 0.5 + assert self.fragment_occurences(result, 'd1_0_2_0.yaml') == 0.5 + assert self.fragment_occurences(result, 'd1_0_2_1.yaml') == 0.5 + assert self.fragment_occurences(result, 'd1_2_0.yaml') == 0.25 + assert self.fragment_occurences(result, 'd1_2_1.yaml') == 0.25 + assert self.fragment_occurences(result, 'd1_2_2.yaml') == 0.25 + assert self.fragment_occurences(result, 'd1_2_3.yaml') == 0.25 + + + def test_random_dollar_sign_2x2x3(self): + fake_fs = { + 'd0_0': { + '$': None, + 'd1_0': { + 'd1_0_0.yaml': None, + 'd1_0_1.yaml': None, + }, + 'd1_1': { + 'd1_1_0.yaml': None, + 'd1_1_1.yaml': None, + }, + 'd1_2': { + 'd1_2_0.yaml': None, + 
'd1_2_1.yaml': None, + 'd1_2_2.yaml': None, + }, + }, + } + fake_fs1 = { + 'd0_0$': { + 'd1_0': { + 'd1_0_0.yaml': None, + 'd1_0_1.yaml': None, + }, + 'd1_1': { + 'd1_1_0.yaml': None, + 'd1_1_1.yaml': None, + }, + 'd1_2': { + 'd1_2_0.yaml': None, + 'd1_2_1.yaml': None, + 'd1_2_2.yaml': None, + }, + }, + } + self.start_patchers(fake_fs) + try: + result = build_matrix.build_matrix('d0_0') + finally: + self.stop_patchers() + assert len(result) == 1 + self.start_patchers(fake_fs1) + try: + result = build_matrix.build_matrix('d0_0$') + finally: + self.stop_patchers() + assert len(result) == 1 + + def test_random_dollar_sign_with_concat(self): + fake_fs = { + 'd0_0': { + '$': None, + 'd1_0': { + 'd1_0_0.yaml': None, + }, + 'd1_1': { + 'd1_1_0.yaml': None, + 'd1_1_1.yaml': None, + }, + 'd1_2': { + '+': None, + 'd1_2_0.yaml': None, + 'd1_2_1.yaml': None, + 'd1_2_2.yaml': None, + 'd1_2_3.yaml': None, + }, + }, + } + fake_fs1 = { + 'd0_0$': { + 'd1_0': { + 'd1_0_0.yaml': None, + }, + 'd1_1': { + 'd1_1_0.yaml': None, + 'd1_1_1.yaml': None, + }, + 'd1_2': { + '+': None, + 'd1_2_0.yaml': None, + 'd1_2_1.yaml': None, + 'd1_2_2.yaml': None, + 'd1_2_3.yaml': None, + }, + }, + } + for fs, root in [(fake_fs,'d0_0'), (fake_fs1,'d0_0$')]: + self.start_patchers(fs) + try: + result = build_matrix.build_matrix(root) + finally: + self.stop_patchers() + assert len(result) == 1 + if result[0][0][1:].startswith('d1_2'): + for i in result: + assert os.path.join(root, 'd1_2/d1_2_0.yaml') in i[1] + assert os.path.join(root, 'd1_2/d1_2_1.yaml') in i[1] + assert os.path.join(root, 'd1_2/d1_2_2.yaml') in i[1] + assert os.path.join(root, 'd1_2/d1_2_3.yaml') in i[1] + + def test_random_dollar_sign_with_convolve(self): + fake_fs = { + 'd0_0': { + '%': None, + 'd1_0': { + 'd1_0_0.yaml': None, + 'd1_0_1.yaml': None, + }, + 'd1_1': { + 'd1_1_0.yaml': None, + 'd1_1_1.yaml': None, + }, + 'd1_2': { + '$': None, + 'd1_2_0.yaml': None, + 'd1_2_1.yaml': None, + 'd1_2_2.yaml': None, + }, + }, + } + 
self.start_patchers(fake_fs) + try: + result = build_matrix.build_matrix('d0_0') + finally: + self.stop_patchers() + assert len(result) == 4 + fake_fs1 = { + 'd0_0': { + '%': None, + 'd1_0': { + 'd1_0_0.yaml': None, + 'd1_0_1.yaml': None, + }, + 'd1_1': { + 'd1_1_0.yaml': None, + 'd1_1_1.yaml': None, + }, + 'd1_2$': { + 'd1_2_0.yaml': None, + 'd1_2_1.yaml': None, + 'd1_2_2.yaml': None, + }, + }, + } + self.start_patchers(fake_fs1) + try: + result = build_matrix.build_matrix('d0_0') + finally: + self.stop_patchers() + assert len(result) == 4 + + def test_emulate_teuthology_noceph(self): + fake_fs = { + 'teuthology': { + 'no-ceph': { + '%': None, + 'clusters': { + 'single.yaml': None, + }, + 'distros': { + 'baremetal.yaml': None, + 'rhel7.0.yaml': None, + 'ubuntu12.04.yaml': None, + 'ubuntu14.04.yaml': None, + 'vps.yaml': None, + 'vps_centos6.5.yaml': None, + 'vps_debian7.yaml': None, + 'vps_rhel6.4.yaml': None, + 'vps_rhel6.5.yaml': None, + 'vps_rhel7.0.yaml': None, + 'vps_ubuntu14.04.yaml': None, + }, + 'tasks': { + 'teuthology.yaml': None, + }, + }, + }, + } + self.start_patchers(fake_fs) + try: + result = build_matrix.build_matrix('teuthology/no-ceph') + finally: + self.stop_patchers() + assert len(result) == 11 + assert self.fragment_occurences(result, 'vps.yaml') == 1 / 11.0 + + def test_empty_dirs(self): + fake_fs = { + 'teuthology': { + 'no-ceph': { + '%': None, + 'clusters': { + 'single.yaml': None, + }, + 'distros': { + 'baremetal.yaml': None, + 'rhel7.0.yaml': None, + 'ubuntu12.04.yaml': None, + 'ubuntu14.04.yaml': None, + 'vps.yaml': None, + 'vps_centos6.5.yaml': None, + 'vps_debian7.yaml': None, + 'vps_rhel6.4.yaml': None, + 'vps_rhel6.5.yaml': None, + 'vps_rhel7.0.yaml': None, + 'vps_ubuntu14.04.yaml': None, + }, + 'tasks': { + 'teuthology.yaml': None, + }, + }, + }, + } + self.start_patchers(fake_fs) + try: + result = build_matrix.build_matrix('teuthology/no-ceph') + finally: + self.stop_patchers() + + fake_fs2 = { + 'teuthology': { + 'no-ceph': { + 
'%': None, + 'clusters': { + 'single.yaml': None, + }, + 'distros': { + 'empty': {}, + 'baremetal.yaml': None, + 'rhel7.0.yaml': None, + 'ubuntu12.04.yaml': None, + 'ubuntu14.04.yaml': None, + 'vps.yaml': None, + 'vps_centos6.5.yaml': None, + 'vps_debian7.yaml': None, + 'vps_rhel6.4.yaml': None, + 'vps_rhel6.5.yaml': None, + 'vps_rhel7.0.yaml': None, + 'vps_ubuntu14.04.yaml': None, + }, + 'tasks': { + 'teuthology.yaml': None, + }, + 'empty': {}, + }, + }, + } + self.start_patchers(fake_fs2) + try: + result2 = build_matrix.build_matrix('teuthology/no-ceph') + finally: + self.stop_patchers() + assert len(result) == 11 + assert len(result2) == len(result) + + def test_hidden(self): + fake_fs = { + 'teuthology': { + 'no-ceph': { + '%': None, + '.qa': None, + 'clusters': { + 'single.yaml': None, + '.qa': None, + }, + 'distros': { + '.qa': None, + 'baremetal.yaml': None, + 'rhel7.0.yaml': None, + 'ubuntu12.04.yaml': None, + 'ubuntu14.04.yaml': None, + 'vps.yaml': None, + 'vps_centos6.5.yaml': None, + 'vps_debian7.yaml': None, + 'vps_rhel6.4.yaml': None, + 'vps_rhel6.5.yaml': None, + 'vps_rhel7.0.yaml': None, + 'vps_ubuntu14.04.yaml': None, + }, + 'tasks': { + '.qa': None, + 'teuthology.yaml': None, + }, + '.foo': { + '.qa': None, + 'teuthology.yaml': None, + }, + }, + }, + } + self.start_patchers(fake_fs) + try: + result = build_matrix.build_matrix('teuthology/no-ceph') + finally: + self.stop_patchers() + + fake_fs2 = { + 'teuthology': { + 'no-ceph': { + '%': None, + 'clusters': { + 'single.yaml': None, + }, + 'distros': { + 'baremetal.yaml': None, + 'rhel7.0.yaml': None, + 'ubuntu12.04.yaml': None, + 'ubuntu14.04.yaml': None, + 'vps.yaml': None, + 'vps_centos6.5.yaml': None, + 'vps_debian7.yaml': None, + 'vps_rhel6.4.yaml': None, + 'vps_rhel6.5.yaml': None, + 'vps_rhel7.0.yaml': None, + 'vps_ubuntu14.04.yaml': None, + }, + 'tasks': { + 'teuthology.yaml': None, + }, + }, + }, + } + self.start_patchers(fake_fs2) + try: + result2 = 
build_matrix.build_matrix('teuthology/no-ceph') + finally: + self.stop_patchers() + assert len(result) == 11 + assert len(result2) == len(result) + + def test_disable_extension(self): + fake_fs = { + 'teuthology': { + 'no-ceph': { + '%': None, + 'clusters': { + 'single.yaml': None, + }, + 'distros': { + 'baremetal.yaml': None, + 'rhel7.0.yaml': None, + 'ubuntu12.04.yaml': None, + 'ubuntu14.04.yaml': None, + 'vps.yaml': None, + 'vps_centos6.5.yaml': None, + 'vps_debian7.yaml': None, + 'vps_rhel6.4.yaml': None, + 'vps_rhel6.5.yaml': None, + 'vps_rhel7.0.yaml': None, + 'vps_ubuntu14.04.yaml': None, + }, + 'tasks': { + 'teuthology.yaml': None, + }, + }, + }, + } + self.start_patchers(fake_fs) + try: + result = build_matrix.build_matrix('teuthology/no-ceph') + finally: + self.stop_patchers() + + fake_fs2 = { + 'teuthology': { + 'no-ceph': { + '%': None, + 'clusters': { + 'single.yaml': None, + }, + 'distros': { + 'baremetal.yaml': None, + 'rhel7.0.yaml': None, + 'ubuntu12.04.yaml': None, + 'ubuntu14.04.yaml': None, + 'vps.yaml': None, + 'vps_centos6.5.yaml': None, + 'vps_debian7.yaml': None, + 'vps_rhel6.4.yaml': None, + 'vps_rhel6.5.yaml': None, + 'vps_rhel7.0.yaml': None, + 'vps_ubuntu14.04.yaml': None, + 'forcefilevps_ubuntu14.04.yaml.disable': None, + 'forcefilevps_ubuntu14.04.yaml.anotherextension': None, + }, + 'tasks': { + 'teuthology.yaml': None, + 'forcefilevps_ubuntu14.04notyaml': None, + }, + 'forcefilevps_ubuntu14.04notyaml': None, + 'tasks.disable': { + 'teuthology2.yaml': None, + 'forcefilevps_ubuntu14.04notyaml': None, + }, + }, + }, + } + self.start_patchers(fake_fs2) + try: + result2 = build_matrix.build_matrix('teuthology/no-ceph') + finally: + self.stop_patchers() + assert len(result) == 11 + assert len(result2) == len(result) + + def test_sort_order(self): + # This test ensures that 'ceph' comes before 'ceph-thrash' when yaml + # fragments are sorted. 
+ fake_fs = { + 'thrash': { + '%': None, + 'ceph-thrash': {'default.yaml': None}, + 'ceph': {'base.yaml': None}, + 'clusters': {'mds-1active-1standby.yaml': None}, + 'debug': {'mds_client.yaml': None}, + 'fs': {'btrfs.yaml': None}, + 'msgr-failures': {'none.yaml': None}, + 'overrides': {'allowlist_wrongly_marked_down.yaml': None}, + 'tasks': {'cfuse_workunit_suites_fsstress.yaml': None}, + }, + } + self.start_patchers(fake_fs) + try: + result = build_matrix.build_matrix('thrash') + finally: + self.stop_patchers() + assert len(result) == 1 + assert self.fragment_occurences(result, 'base.yaml') == 1 + fragments = result[0][1] + assert fragments[0] == 'thrash/ceph/base.yaml' + assert fragments[1] == 'thrash/ceph-thrash/default.yaml' + +class TestSubset(object): + patchpoints = [ + 'os.path.exists', + 'os.listdir', + 'os.path.isfile', + 'os.path.isdir', + 'builtins.open', + ] + + def setup_method(self): + self.mocks = dict() + self.patchers = dict() + for ppoint in self.__class__.patchpoints: + self.mocks[ppoint] = MagicMock() + self.patchers[ppoint] = patch(ppoint, self.mocks[ppoint]) + + def start_patchers(self, fake_fs): + fake_fns = make_fake_fstools(fake_fs) + # N.B.: relies on fake_fns being in same order as patchpoints + for ppoint, fn in zip(self.__class__.patchpoints, fake_fns): + self.mocks[ppoint].side_effect = fn + self.patchers[ppoint].start() + + def stop_patchers(self): + for patcher in self.patchers.values(): + patcher.stop() + + def teardown_method(self): + self.patchers.clear() + self.mocks.clear() + + MAX_FACETS = 10 + MAX_FANOUT = 3 + MAX_DEPTH = 3 + MAX_SUBSET = 10 + @staticmethod + def generate_fake_fs(max_facets, max_fanout, max_depth): + def yamilify(name): + return name + ".yaml" + def name_generator(): + x = 0 + while True: + yield(str(x)) + x += 1 + def generate_tree( + max_facets, max_fanout, max_depth, namegen, top=True): + if max_depth == 0: + return None + if max_facets == 0: + return None + items = random.choice(range(max_fanout)) + if 
items == 0 and top: + items = 1 + if items == 0: + return None + sub_max_facets = max_facets / items + tree = {} + for i in range(items): + subtree = generate_tree( + sub_max_facets, max_fanout, + max_depth - 1, namegen, top=False) + if subtree is not None: + tree['d' + next(namegen)] = subtree + else: + tree[yamilify('f' + next(namegen))] = None + random.choice([ + lambda: tree.update({'%': None}), + lambda: None])() + return tree + return { + 'root': generate_tree( + max_facets, max_fanout, max_depth, name_generator()) + } + + @staticmethod + def generate_subset(maxsub): + divisions = random.choice(range(maxsub-1))+1 + return (random.choice(range(divisions)), divisions) + + @staticmethod + def generate_description_list(tree, subset): + mat, first, matlimit = build_matrix._get_matrix( + 'root', subset=subset) + return [i[0] for i in build_matrix.generate_combinations( + 'root', mat, first, matlimit)], mat, first, matlimit + + @staticmethod + def verify_facets(tree, description_list, subset, mat, first, matlimit): + def flatten(tree): + for k,v in tree.items(): + if v is None and '.yaml' in k: + yield k + elif v is not None and '.disable' not in k: + for x in flatten(v): + yield x + + def pptree(tree, tabs=0): + ret = "" + for k, v in tree.items(): + if v is None: + ret += ('\t'*tabs) + k.ljust(10) + "\n" + else: + ret += ('\t'*tabs) + (k + ':').ljust(10) + "\n" + ret += pptree(v, tabs+1) + return ret + def deyamlify(name): + if name.endswith('.yaml'): + return name[:-5] + else: + return name + for facet in (deyamlify(_) for _ in flatten(tree)): + found = False + for i in description_list: + if facet in i: + found = True + break + if not found: + print("tree\n{tree}\ngenerated list\n{desc}\n\nfrom matrix\n\n{matrix}\nsubset {subset} without facet {fac}".format( + tree=pptree(tree), + desc='\n'.join(description_list), + subset=subset, + matrix=str(mat), + fac=facet)) + all_desc = build_matrix.generate_combinations( + 'root', + mat, + 0, + mat.size()) + for i, desc 
in zip(range(mat.size()), all_desc): + if i == first: + print('==========') + print("{} {}".format(i, desc)) + if i + 1 == matlimit: + print('==========') + assert found + + def test_random(self): + for i in range(10000): + tree = self.generate_fake_fs( + self.MAX_FACETS, + self.MAX_FANOUT, + self.MAX_DEPTH) + subset = self.generate_subset(self.MAX_SUBSET) + self.start_patchers(tree) + try: + dlist, mat, first, matlimit = self.generate_description_list(tree, subset) + finally: + self.stop_patchers() + self.verify_facets(tree, dlist, subset, mat, first, matlimit) diff --git a/teuthology/suite/test/test_init.py b/teuthology/suite/test/test_init.py new file mode 100644 index 000000000..6e91eefa1 --- /dev/null +++ b/teuthology/suite/test/test_init.py @@ -0,0 +1,267 @@ +import os + +from copy import deepcopy + +from mock import patch, Mock, DEFAULT + +from teuthology import suite +from scripts.suite import main +from teuthology.config import config + +import pytest +import time + +from teuthology.exceptions import ScheduleFailError + +def get_fake_time_and_sleep(): + # Below we set m_time.side_effect, but we also set m_time.return_value. + # The reason for this is that we need to store a 'fake time' that + # increments when m_sleep() is called; we could use any variable name we + # wanted for the return value, but since 'return_value' is already a + # standard term in mock, and since setting side_effect causes return_value + # to be ignored, it's safe to just reuse the name here. 
+ m_time = Mock() + m_time.return_value = time.time() + + def m_time_side_effect(): + # Fake the slow passage of time + m_time.return_value += 0.1 + return m_time.return_value + m_time.side_effect = m_time_side_effect + + def f_sleep(seconds): + m_time.return_value += seconds + m_sleep = Mock(wraps=f_sleep) + return m_time, m_sleep + + +def setup_module(): + global m_time + global m_sleep + m_time, m_sleep = get_fake_time_and_sleep() + global patcher_time_sleep + patcher_time_sleep = patch.multiple( + 'teuthology.suite.time', + time=m_time, + sleep=m_sleep, + ) + patcher_time_sleep.start() + + +def teardown_module(): + patcher_time_sleep.stop() + + +@patch.object(suite.ResultsReporter, 'get_jobs') +def test_wait_success(m_get_jobs, caplog): + results = [ + [{'status': 'queued', 'job_id': '2'}], + [], + ] + final = [ + {'status': 'pass', 'job_id': '1', + 'description': 'DESC1', 'log_href': 'http://URL1'}, + {'status': 'fail', 'job_id': '2', + 'description': 'DESC2', 'log_href': 'http://URL2'}, + {'status': 'pass', 'job_id': '3', + 'description': 'DESC3', 'log_href': 'http://URL3'}, + ] + + def get_jobs(name, **kwargs): + if kwargs['fields'] == ['job_id', 'status']: + return in_progress.pop(0) + else: + return final + m_get_jobs.side_effect = get_jobs + suite.Run.WAIT_PAUSE = 1 + + in_progress = deepcopy(results) + assert 0 == suite.wait('name', 1, 'http://UPLOAD_URL') + m_get_jobs.assert_any_call('name', fields=['job_id', 'status']) + assert 0 == len(in_progress) + assert 'fail http://UPLOAD_URL/name/2' in caplog.text + + m_get_jobs.reset_mock() + in_progress = deepcopy(results) + assert 0 == suite.wait('name', 1, None) + m_get_jobs.assert_any_call('name', fields=['job_id', 'status']) + assert 0 == len(in_progress) + assert 'fail http://URL2' in caplog.text + + +@patch.object(suite.ResultsReporter, 'get_jobs') +def test_wait_fails(m_get_jobs): + results = [] + results.append([{'status': 'queued', 'job_id': '2'}]) + results.append([{'status': 'queued', 'job_id': 
'2'}]) + results.append([{'status': 'queued', 'job_id': '2'}]) + + def get_jobs(name, **kwargs): + return results.pop(0) + m_get_jobs.side_effect = get_jobs + suite.Run.WAIT_PAUSE = 1 + suite.Run.WAIT_MAX_JOB_TIME = 1 + with pytest.raises(suite.WaitException): + suite.wait('name', 1, None) + + +REPO_SHORTHAND = [ + ['https://github.com/dude/foo', 'bar', + 'https://github.com/dude/bar.git'], + ['https://github.com/dude/foo/', 'bar', + 'https://github.com/dude/bar.git'], + ['https://github.com/ceph/ceph', 'ceph', + 'https://github.com/ceph/ceph.git'], + ['https://github.com/ceph/ceph/', 'ceph', + 'https://github.com/ceph/ceph.git'], + ['https://github.com/ceph/ceph.git', 'ceph', + 'https://github.com/ceph/ceph.git'], + ['https://github.com/ceph/ceph', 'ceph-ci', + 'https://github.com/ceph/ceph-ci.git'], + ['https://github.com/ceph/ceph-ci', 'ceph', + 'https://github.com/ceph/ceph.git'], + ['git://git.ceph.com/ceph.git', 'ceph', + 'git://git.ceph.com/ceph.git'], + ['git://git.ceph.com/ceph.git', 'ceph-ci', + 'git://git.ceph.com/ceph-ci.git'], + ['git://git.ceph.com/ceph-ci.git', 'ceph', + 'git://git.ceph.com/ceph.git'], + ['https://github.com/ceph/ceph.git', 'ceph/ceph-ci', + 'https://github.com/ceph/ceph-ci.git'], + ['https://github.com/ceph/ceph.git', 'https://github.com/ceph/ceph-ci', + 'https://github.com/ceph/ceph-ci'], + ['https://github.com/ceph/ceph.git', 'https://github.com/ceph/ceph-ci/', + 'https://github.com/ceph/ceph-ci/'], + ['https://github.com/ceph/ceph.git', 'https://github.com/ceph/ceph-ci.git', + 'https://github.com/ceph/ceph-ci.git'], +] + + +@pytest.mark.parametrize(['orig', 'shorthand', 'result'], REPO_SHORTHAND) +def test_expand_short_repo_name(orig, shorthand, result): + assert suite.expand_short_repo_name(shorthand, orig) == result + + +class TestSuiteMain(object): + def test_main(self): + suite_name = 'SUITE' + throttle = '3' + machine_type = 'burnupi' + + def prepare_and_schedule(obj): + assert obj.base_config.suite == suite_name + assert 
obj.args.throttle == throttle + + def fake_str(*args, **kwargs): + return 'fake' + + def fake_bool(*args, **kwargs): + return True + + def fake_false(*args, **kwargs): + return False + + with patch.multiple( + 'teuthology.suite.run.util', + fetch_repos=DEFAULT, + package_version_for_hash=fake_str, + git_branch_exists=fake_bool, + git_ls_remote=fake_str, + ): + with patch.multiple( + 'teuthology.suite.run.Run', + prepare_and_schedule=prepare_and_schedule, + ), patch.multiple( + 'teuthology.suite.run.os.path', + exists=fake_false, + ): + main([ + '--ceph', 'main', + '--suite', suite_name, + '--throttle', throttle, + '--machine-type', machine_type, + ]) + + @patch('teuthology.suite.util.smtplib.SMTP') + def test_machine_type_multi_error(self, m_smtp): + config.results_email = "example@example.com" + with pytest.raises(ScheduleFailError) as exc: + main([ + '--ceph', 'main', + '--suite', 'suite_name', + '--throttle', '3', + '--machine-type', 'multi', + '--dry-run', + ]) + assert str(exc.value) == "Scheduling failed: 'multi' is not a valid machine_type. 
\ +Maybe you want 'gibba,smithi,mira' or similar" + m_smtp.assert_not_called() + + @patch('teuthology.suite.util.smtplib.SMTP') + def test_machine_type_none_error(self, m_smtp): + config.result_email = 'example@example.com' + with pytest.raises(ScheduleFailError) as exc: + main([ + '--ceph', 'main', + '--suite', 'suite_name', + '--throttle', '3', + '--machine-type', 'None', + '--dry-run', + ]) + assert str(exc.value) == "Scheduling failed: Must specify a machine_type" + m_smtp.assert_not_called() + + def test_schedule_suite_noverify(self): + suite_name = 'noop' + suite_dir = os.path.dirname(__file__) + throttle = '3' + machine_type = 'burnupi' + + with patch.multiple( + 'teuthology.suite.util', + fetch_repos=DEFAULT, + teuthology_schedule=DEFAULT, + get_arch=lambda x: 'x86_64', + get_gitbuilder_hash=DEFAULT, + git_ls_remote=lambda *args: '1234', + package_version_for_hash=DEFAULT, + ) as m: + m['package_version_for_hash'].return_value = 'fake-9.5' + config.suite_verify_ceph_hash = False + main([ + '--ceph', 'main', + '--suite', suite_name, + '--suite-dir', suite_dir, + '--suite-relpath', '', + '--throttle', throttle, + '--machine-type', machine_type + ]) + m_sleep.assert_called_with(int(throttle)) + m['get_gitbuilder_hash'].assert_not_called() + + def test_schedule_suite(self): + suite_name = 'noop' + suite_dir = os.path.dirname(__file__) + throttle = '3' + machine_type = 'burnupi' + + with patch.multiple( + 'teuthology.suite.util', + fetch_repos=DEFAULT, + teuthology_schedule=DEFAULT, + get_arch=lambda x: 'x86_64', + get_gitbuilder_hash=DEFAULT, + git_ls_remote=lambda *args: '12345', + package_version_for_hash=DEFAULT, + ) as m: + m['package_version_for_hash'].return_value = 'fake-9.5' + config.suite_verify_ceph_hash = True + main([ + '--ceph', 'main', + '--suite', suite_name, + '--suite-dir', suite_dir, + '--suite-relpath', '', + '--throttle', throttle, + '--machine-type', machine_type + ]) + m_sleep.assert_called_with(int(throttle)) diff --git 
a/teuthology/suite/test/test_matrix.py b/teuthology/suite/test/test_matrix.py new file mode 100644 index 000000000..596bb37a7 --- /dev/null +++ b/teuthology/suite/test/test_matrix.py @@ -0,0 +1,82 @@ +from teuthology.suite import matrix + + +def verify_matrix_output_diversity(res): + """ + Verifies that the size of the matrix passed matches the number of unique + outputs from res.index + """ + sz = res.size() + s = frozenset([matrix.generate_lists(res.index(i)) for i in range(sz)]) + for i in range(res.size()): + assert sz == len(s) + + +def mbs(num, l): + return matrix.Sum(num*10, [matrix.Base(i + (100*num)) for i in l]) + + +class TestMatrix(object): + def test_simple(self): + verify_matrix_output_diversity(mbs(1, range(6))) + + def test_simple2(self): + verify_matrix_output_diversity(mbs(1, range(5))) + + # The test_product* tests differ by the degree by which dimension + # sizes share prime factors + def test_product_simple(self): + verify_matrix_output_diversity( + matrix.Product(1, [mbs(1, range(6)), mbs(2, range(2))])) + + def test_product_3_facets_2_prime_factors(self): + verify_matrix_output_diversity(matrix.Product(1, [ + mbs(1, range(6)), + mbs(2, range(2)), + mbs(3, range(3)), + ])) + + def test_product_3_facets_2_prime_factors_one_larger(self): + verify_matrix_output_diversity(matrix.Product(1, [ + mbs(1, range(2)), + mbs(2, range(5)), + mbs(4, range(4)), + ])) + + def test_product_4_facets_2_prime_factors(self): + verify_matrix_output_diversity(matrix.Sum(1, [ + mbs(1, range(6)), + mbs(3, range(3)), + mbs(2, range(2)), + mbs(4, range(9)), + ])) + + def test_product_2_facets_2_prime_factors(self): + verify_matrix_output_diversity(matrix.Sum(1, [ + mbs(1, range(2)), + mbs(2, range(5)), + ])) + + def test_product_with_sum(self): + verify_matrix_output_diversity(matrix.Sum( + 9, + [ + mbs(10, range(6)), + matrix.Product(1, [ + mbs(1, range(2)), + mbs(2, range(5)), + mbs(4, range(4))]), + matrix.Product(8, [ + mbs(7, range(2)), + mbs(6, range(5)), + mbs(5, 
range(4))]) + ] + )) + + def test_product_with_pick_random(self): + verify_matrix_output_diversity(matrix.PickRandom(1, [ + mbs(1, range(6)), + mbs(3, range(3)), + mbs(2, range(2)), + mbs(4, range(9)), + ])) diff --git a/teuthology/suite/test/test_merge.py b/teuthology/suite/test/test_merge.py new file mode 100644 index 000000000..82a0bb67b --- /dev/null +++ b/teuthology/suite/test/test_merge.py @@ -0,0 +1,231 @@ +import logging +from textwrap import dedent + +from mock import patch, MagicMock + +from teuthology.suite import build_matrix +from teuthology.suite.merge import config_merge +from teuthology.test.fake_fs import make_fake_fstools + +log = logging.getLogger(__name__) + +class TestMerge: + patchpoints = [ + 'os.path.exists', + 'os.listdir', + 'os.path.isfile', + 'os.path.isdir', + 'builtins.open', + ] + + def setup_method(self): + self.mocks = dict() + self.patchers = dict() + for ppoint in self.__class__.patchpoints: + self.mocks[ppoint] = MagicMock() + self.patchers[ppoint] = patch(ppoint, self.mocks[ppoint]) + + def start_patchers(self, fake_fs): + fake_fns = make_fake_fstools(fake_fs) + # N.B.: relies on fake_fns being in same order as patchpoints + for ppoint, fn in zip(self.__class__.patchpoints, fake_fns): + self.mocks[ppoint].side_effect = fn + self.patchers[ppoint].start() + + def stop_patchers(self): + for patcher in self.patchers.values(): + patcher.stop() + + def teardown_method(self): + self.patchers.clear() + self.mocks.clear() + + def test_premerge(self): + fake_fs = { + 'd0_0': { + '%': None, + 'd1_0': { + 'a.yaml': dedent(""" + teuthology: + premerge: reject() + foo: bar + """), + }, + 'c.yaml': dedent(""" + top: pot + """), + }, + } + self.start_patchers(fake_fs) + try: + result = build_matrix.build_matrix('d0_0') + assert 1 == len(result) + configs = list(config_merge(result)) + assert 1 == len(configs) + desc, frags, yaml = configs[0] + assert "top" in yaml + assert "foo" not in yaml + finally: + self.stop_patchers() + + def 
test_postmerge(self): + fake_fs = { + 'd0_0': { + '%': None, + 'd1_0': { + 'a.yaml': dedent(""" + teuthology: + postmerge: + - reject() + foo: bar + """), + 'b.yaml': dedent(""" + baz: zab + """), + }, + 'c.yaml': dedent(""" + top: pot + """), + }, + } + self.start_patchers(fake_fs) + try: + result = build_matrix.build_matrix('d0_0') + assert 2 == len(result) + configs = list(config_merge(result)) + assert 1 == len(configs) + desc, frags, yaml = configs[0] + assert "top" in yaml + assert "baz" in yaml + assert "foo" not in yaml + finally: + self.stop_patchers() + + def test_postmerge_concat(self): + fake_fs = { + 'd0_0': { + '%': None, + 'd1_0': { + 'a.yaml': dedent(""" + teuthology: + postmerge: + - local a = 1 + foo: bar + """), + 'b.yaml': dedent(""" + teuthology: + postmerge: + - local a = 2 + baz: zab + """), + }, + 'z.yaml': dedent(""" + teuthology: + postmerge: + - if a == 1 then reject() end + top: pot + """), + }, + } + self.start_patchers(fake_fs) + try: + result = build_matrix.build_matrix('d0_0') + assert 2 == len(result) + configs = list(config_merge(result)) + assert 1 == len(configs) + desc, frags, yaml = configs[0] + assert "top" in yaml + assert "baz" in yaml + assert "foo" not in yaml + finally: + self.stop_patchers() + + + def test_yaml_mutation(self): + fake_fs = { + 'd0_0': { + '%': None, + 'c.yaml': dedent(""" + teuthology: + postmerge: + - | + yaml["test"] = py_dict() + top: pot + """), + }, + } + self.start_patchers(fake_fs) + try: + result = build_matrix.build_matrix('d0_0') + assert 1 == len(result) + configs = list(config_merge(result)) + assert 1 == len(configs) + desc, frags, yaml = configs[0] + assert "test" in yaml + assert {} == yaml["test"] + finally: + self.stop_patchers() + + def test_sandbox(self): + fake_fs = { + 'd0_0': { + '%': None, + 'c.yaml': dedent(""" + teuthology: + postmerge: + - | + log.debug(_ENV) + log.debug("_ENV contains:") + for k,v in pairs(_ENV) do + log.debug("_ENV['%s'] = %s", tostring(k), tostring(v)) + end + 
local check = { + "assert", + "error", + "ipairs", + "next", + "pairs", + "tonumber", + "tostring", + "py_attrgetter", + "py_dict", + "py_list", + "py_tuple", + "py_enumerate", + "py_iterex", + "py_itemgetter", + "math", + "reject", + "accept", + "deep_merge", + "log", + "reject", + "yaml_load", + } + for _,v in ipairs(check) do + log.debug("checking %s", tostring(v)) + assert(_ENV[v]) + end + local block = { + "coroutine", + "debug", + "io", + "os", + "package", + } + for _,v in ipairs(block) do + log.debug("checking %s", tostring(v)) + assert(_ENV[v] == nil) + end + top: pot + """), + }, + } + self.start_patchers(fake_fs) + try: + result = build_matrix.build_matrix('d0_0') + assert 1 == len(result) + configs = list(config_merge(result)) + assert 1 == len(configs) + finally: + self.stop_patchers() diff --git a/teuthology/suite/test/test_placeholder.py b/teuthology/suite/test/test_placeholder.py new file mode 100644 index 000000000..31b51755d --- /dev/null +++ b/teuthology/suite/test/test_placeholder.py @@ -0,0 +1,57 @@ +from teuthology.suite.placeholder import ( + substitute_placeholders, dict_templ, Placeholder +) + + +class TestPlaceholder(object): + def test_substitute_placeholders(self): + suite_hash = 'suite_hash' + input_dict = dict( + suite='suite', + suite_branch='suite_branch', + suite_hash=suite_hash, + ceph_branch='ceph_branch', + ceph_hash='ceph_hash', + teuthology_branch='teuthology_branch', + teuthology_sha1='teuthology_sha1', + machine_type='machine_type', + distro='distro', + distro_version='distro_version', + archive_upload='archive_upload', + archive_upload_key='archive_upload_key', + suite_repo='https://example.com/ceph/suite.git', + suite_relpath='', + ceph_repo='https://example.com/ceph/ceph.git', + flavor='default', + expire='expire', + ) + output_dict = substitute_placeholders(dict_templ, input_dict) + assert output_dict['suite'] == 'suite' + assert output_dict['suite_sha1'] == suite_hash + assert isinstance(dict_templ['suite'], Placeholder) 
+ assert isinstance( + dict_templ['overrides']['admin_socket']['branch'], + Placeholder) + + def test_null_placeholders_dropped(self): + input_dict = dict( + suite='suite', + suite_branch='suite_branch', + suite_hash='suite_hash', + ceph_branch='ceph_branch', + ceph_hash='ceph_hash', + teuthology_branch='teuthology_branch', + teuthology_sha1='teuthology_sha1', + machine_type='machine_type', + archive_upload='archive_upload', + archive_upload_key='archive_upload_key', + distro=None, + distro_version=None, + suite_repo='https://example.com/ceph/suite.git', + suite_relpath='', + ceph_repo='https://example.com/ceph/ceph.git', + flavor=None, + expire='expire', + ) + output_dict = substitute_placeholders(dict_templ, input_dict) + assert 'os_type' not in output_dict diff --git a/teuthology/suite/test/test_run_.py b/teuthology/suite/test/test_run_.py new file mode 100644 index 000000000..a3e6d12b3 --- /dev/null +++ b/teuthology/suite/test/test_run_.py @@ -0,0 +1,644 @@ +import logging +import os +import pytest +import requests +import contextlib +import yaml + +from datetime import datetime, timedelta, timezone +from mock import patch, call, ANY +from io import StringIO +from io import BytesIO + +from teuthology.config import config, YamlConfig +from teuthology.exceptions import ScheduleFailError +from teuthology.suite import run +from teuthology.util.time import TIMESTAMP_FMT + +log = logging.getLogger(__name__) + +class TestRun(object): + klass = run.Run + + def setup_method(self): + self.args_dict = dict( + suite='suite', + suite_branch='suite_branch', + suite_relpath='', + ceph_branch='ceph_branch', + ceph_sha1='ceph_sha1', + email='address@example.com', + teuthology_branch='teuthology_branch', + kernel_branch=None, + flavor='flavor', + distro='ubuntu', + machine_type='machine_type', + base_yaml_paths=list(), + ) + self.args = YamlConfig.from_dict(self.args_dict) + + @patch('teuthology.suite.run.util.fetch_repos') + @patch('teuthology.suite.run.util.git_ls_remote') + 
@patch('teuthology.suite.run.Run.choose_ceph_version') + @patch('teuthology.suite.run.util.git_validate_sha1') + def test_email_addr(self, m_git_validate_sha1, m_choose_ceph_version, + m_git_ls_remote, m_fetch_repos): + # neuter choose_X_branch + m_git_validate_sha1.return_value = self.args_dict['ceph_sha1'] + m_choose_ceph_version.return_value = self.args_dict['ceph_sha1'] + self.args_dict['teuthology_branch'] = 'main' + self.args_dict['suite_branch'] = 'main' + m_git_ls_remote.return_value = 'suite_sha1' + + runobj = self.klass(self.args) + assert runobj.base_config.email == self.args_dict['email'] + + @patch('teuthology.suite.run.util.fetch_repos') + def test_name(self, m_fetch_repos): + stamp = datetime.now().strftime(TIMESTAMP_FMT) + with patch.object(run.Run, 'create_initial_config', + return_value=run.JobConfig()): + name = run.Run(self.args).name + assert str(stamp) in name + + @patch('teuthology.suite.run.util.fetch_repos') + def test_name_owner(self, m_fetch_repos): + self.args.owner = 'USER' + with patch.object(run.Run, 'create_initial_config', + return_value=run.JobConfig()): + name = run.Run(self.args).name + assert name.startswith('USER-') + + @patch('teuthology.suite.run.util.git_branch_exists') + @patch('teuthology.suite.run.util.package_version_for_hash') + @patch('teuthology.suite.run.util.git_ls_remote') + def test_branch_nonexistent( + self, + m_git_ls_remote, + m_package_version_for_hash, + m_git_branch_exists, + ): + config.gitbuilder_host = 'example.com' + m_git_ls_remote.side_effect = [ + # First call will be for the ceph hash + None, + # Second call will be for the suite hash + 'suite_hash', + ] + m_package_version_for_hash.return_value = 'a_version' + m_git_branch_exists.return_value = True + self.args.ceph_branch = 'ceph_sha1' + self.args.ceph_sha1 = None + with pytest.raises(ScheduleFailError): + self.klass(self.args) + + @pytest.mark.parametrize( + ["expire", "delta", "result"], + [ + [None, timedelta(), False], + ["1m", timedelta(), 
True], + ["1m", timedelta(minutes=-2), False], + ["1m", timedelta(minutes=2), True], + ["7d", timedelta(days=-14), False], + ] + ) + @patch('teuthology.repo_utils.fetch_repo') + @patch('teuthology.suite.run.util.git_branch_exists') + @patch('teuthology.suite.run.util.package_version_for_hash') + @patch('teuthology.suite.run.util.git_ls_remote') + def test_get_expiration( + self, + m_git_ls_remote, + m_package_version_for_hash, + m_git_branch_exists, + m_fetch_repo, + expire, + delta, + result, + ): + m_git_ls_remote.side_effect = 'hash' + m_package_version_for_hash.return_value = 'a_version' + m_git_branch_exists.return_value = True + self.args.expire = expire + obj = self.klass(self.args) + now = datetime.now(timezone.utc) + expires_result = obj.get_expiration(_base_time=now + delta) + if expire is None: + assert expires_result is None + assert obj.base_config['expire'] is None + else: + assert expires_result is not None + assert (now < expires_result) is result + assert obj.base_config['expire'] + + @patch('teuthology.suite.run.util.fetch_repos') + @patch('requests.head') + @patch('teuthology.suite.run.util.git_branch_exists') + @patch('teuthology.suite.run.util.package_version_for_hash') + @patch('teuthology.suite.run.util.git_ls_remote') + def test_sha1_exists( + self, + m_git_ls_remote, + m_package_version_for_hash, + m_git_branch_exists, + m_requests_head, + m_fetch_repos, + ): + config.gitbuilder_host = 'example.com' + m_package_version_for_hash.return_value = 'ceph_hash' + m_git_branch_exists.return_value = True + resp = requests.Response() + resp.reason = 'OK' + resp.status_code = 200 + m_requests_head.return_value = resp + # only one call to git_ls_remote in this case + m_git_ls_remote.return_value = "suite_branch" + run = self.klass(self.args) + assert run.base_config.sha1 == 'ceph_sha1' + assert run.base_config.branch == 'ceph_branch' + + @patch('teuthology.suite.run.util.git_ls_remote') + @patch('requests.head') + 
@patch('teuthology.suite.util.git_branch_exists') + @patch('teuthology.suite.util.package_version_for_hash') + def test_sha1_nonexistent( + self, + m_git_ls_remote, + m_package_version_for_hash, + m_git_branch_exists, + m_requests_head, + ): + config.gitbuilder_host = 'example.com' + m_package_version_for_hash.return_value = 'ceph_hash' + m_git_branch_exists.return_value = True + resp = requests.Response() + resp.reason = 'Not Found' + resp.status_code = 404 + m_requests_head.return_value = resp + self.args.ceph_sha1 = 'ceph_hash_dne' + with pytest.raises(ScheduleFailError): + self.klass(self.args) + + @patch('teuthology.suite.util.smtplib.SMTP') + @patch('teuthology.suite.util.git_ls_remote') + @patch('teuthology.suite.util.package_version_for_hash') + def test_teuthology_branch_nonexistent( + self, + m_pvfh, + m_git_ls_remote, + m_smtp, + ): + m_git_ls_remote.return_value = None + config.teuthology_path = None + config.results_email = "example@example.com" + self.args.dry_run = True + self.args.teuthology_branch = 'no_branch' + with pytest.raises(ScheduleFailError): + self.klass(self.args) + m_smtp.assert_not_called() + + @patch('teuthology.suite.run.util.fetch_repos') + @patch('teuthology.suite.util.git_ls_remote') + @patch('teuthology.suite.run.util.package_version_for_hash') + def test_os_type(self, m_pvfh, m_git_ls_remote, m_fetch_repos): + m_git_ls_remote.return_value = "sha1" + del self.args['distro'] + run_ = run.Run(self.args) + run_.base_args = run_.build_base_args() + run_.base_config = run_.build_base_config() + configs = [ + ["desc", [], {"os_type": "debian", "os_version": "8.0"}], + ["desc", [], {"os_type": "ubuntu", "os_version": "24.0"}], + ] + missing, to_schedule = run_.collect_jobs('x86_64', configs, False, False) + assert to_schedule[0]['yaml']['os_type'] == "debian" + assert to_schedule[0]['yaml']['os_version'] == "8.0" + assert to_schedule[1]['yaml']['os_type'] == "ubuntu" + assert to_schedule[1]['yaml']['os_version'] == "24.0" + + 
@patch('teuthology.suite.run.util.fetch_repos') + @patch('teuthology.suite.util.git_ls_remote') + @patch('teuthology.suite.run.util.package_version_for_hash') + def test_sha1(self, m_pvfh, m_git_ls_remote, m_fetch_repos): + m_git_ls_remote.return_value = "sha1" + del self.args['distro'] + run_ = run.Run(self.args) + run_.base_args = run_.build_base_args() + for i in range(5): # mock backtracking + run_.config_input['ceph_hash'] = f"boo{i}" + run_.config_input['suite_hash'] = f"bar{i}" + run_.base_config = run_.build_base_config() + configs = [ + ["desc", [], {"os_type": "debian", "os_version": "8.0", + "sha1": "old_sha", "suite_sha1": "old_sha", + "overrides": { "workunit": {"sha1": "old_sha"}, "ceph": {"sha1": "old_sha"} } + }], + ] + missing, to_schedule = run_.collect_jobs('x86_64', configs, False, False) + assert to_schedule[0]['yaml']['sha1'] == "boo4" + assert to_schedule[0]['yaml']['suite_sha1'] == "bar4" + assert to_schedule[0]['yaml']['overrides']['workunit']["sha1"] == "bar4" + assert to_schedule[0]['yaml']['overrides']['ceph']["sha1"] == "boo4" + +class TestScheduleSuite(object): + klass = run.Run + + def setup_method(self): + self.args_dict = dict( + suite='suite', + suite_relpath='', + suite_dir='suite_dir', + suite_branch='main', + suite_repo='main', + ceph_repo='main', + ceph_branch='main', + ceph_sha1='ceph_sha1', + teuthology_branch='main', + kernel_branch=None, + flavor='flavor', + distro='ubuntu', + distro_version='14.04', + machine_type='machine_type', + base_yaml_paths=list(), + ) + self.args = YamlConfig.from_dict(self.args_dict) + + @patch('teuthology.suite.run.Run.schedule_jobs') + @patch('teuthology.suite.run.Run.write_rerun_memo') + @patch('teuthology.suite.util.get_install_task_flavor') + @patch('teuthology.suite.merge.open') + @patch('teuthology.suite.run.build_matrix') + @patch('teuthology.suite.util.git_ls_remote') + @patch('teuthology.suite.util.package_version_for_hash') + @patch('teuthology.suite.util.git_validate_sha1') + 
@patch('teuthology.suite.util.get_arch') + def test_successful_schedule( + self, + m_get_arch, + m_git_validate_sha1, + m_package_version_for_hash, + m_git_ls_remote, + m_build_matrix, + m_open, + m_get_install_task_flavor, + m_write_rerun_memo, + m_schedule_jobs, + ): + m_get_arch.return_value = 'x86_64' + m_git_validate_sha1.return_value = self.args.ceph_sha1 + m_package_version_for_hash.return_value = 'ceph_version' + m_git_ls_remote.return_value = 'suite_hash' + build_matrix_desc = 'desc' + build_matrix_frags = ['frag1.yml', 'frag2.yml'] + build_matrix_output = [ + (build_matrix_desc, build_matrix_frags), + ] + m_build_matrix.return_value = build_matrix_output + frag1_read_output = 'field1: val1' + frag2_read_output = 'field2: val2' + m_open.side_effect = [ + StringIO(frag1_read_output), + StringIO(frag2_read_output), + contextlib.closing(BytesIO()) + ] + m_get_install_task_flavor.return_value = 'default' + m_package_version_for_hash.return_value = "v1" + # schedule_jobs() is just neutered; check calls below + + self.args.newest = 0 + self.args.num = 42 + runobj = self.klass(self.args) + runobj.base_args = list() + count = runobj.schedule_suite() + assert(count == 1) + assert runobj.base_config['suite_sha1'] == 'suite_hash' + m_package_version_for_hash.assert_has_calls( + [call('ceph_sha1', 'default', 'ubuntu', '14.04', 'machine_type')], + ) + y = { + 'field1': 'val1', + 'field2': 'val2' + } + teuthology_keys = [ + 'branch', + 'machine_type', + 'name', + 'os_type', + 'os_version', + 'overrides', + 'priority', + 'repo', + 'seed', + 'sha1', + 'sleep_before_teardown', + 'suite', + 'suite_branch', + 'suite_relpath', + 'suite_repo', + 'suite_sha1', + 'tasks', + 'teuthology_branch', + 'teuthology_sha1', + 'timestamp', + 'user', + 'teuthology', + 'flavor', + ] + for t in teuthology_keys: + y[t] = ANY + expected_job = dict( + yaml=y, + sha1='ceph_sha1', + args=[ + '--num', + '42', + '--description', + os.path.join(self.args.suite, build_matrix_desc), + '--', + '-' + ], 
+ stdin=ANY, + desc=os.path.join(self.args.suite, build_matrix_desc), + ) + + m_schedule_jobs.assert_has_calls( + [call([], [expected_job], runobj.name)], + ) + args = m_schedule_jobs.call_args.args + log.debug("args =\n%s", args) + jobargs = args[1][0] + stdin_yaml = yaml.safe_load(jobargs['stdin']) + for k in y: + assert y[k] == stdin_yaml[k] + for k in teuthology_keys: + assert k in stdin_yaml + m_write_rerun_memo.assert_called_once_with() + + @patch('teuthology.suite.util.find_git_parents') + @patch('teuthology.suite.run.Run.schedule_jobs') + @patch('teuthology.suite.util.get_install_task_flavor') + @patch('teuthology.suite.run.config_merge') + @patch('teuthology.suite.run.build_matrix') + @patch('teuthology.suite.util.git_ls_remote') + @patch('teuthology.suite.util.package_version_for_hash') + @patch('teuthology.suite.util.git_validate_sha1') + @patch('teuthology.suite.util.get_arch') + def test_newest_failure( + self, + m_get_arch, + m_git_validate_sha1, + m_package_version_for_hash, + m_git_ls_remote, + m_build_matrix, + m_config_merge, + m_get_install_task_flavor, + m_schedule_jobs, + m_find_git_parents, + ): + m_get_arch.return_value = 'x86_64' + m_git_validate_sha1.return_value = self.args.ceph_sha1 + m_package_version_for_hash.return_value = None + m_git_ls_remote.return_value = 'suite_hash' + build_matrix_desc = 'desc' + build_matrix_frags = ['frag.yml'] + build_matrix_output = [ + (build_matrix_desc, build_matrix_frags), + ] + m_build_matrix.return_value = build_matrix_output + m_config_merge.return_value = [(a, b, {}) for a, b in build_matrix_output] + m_get_install_task_flavor.return_value = 'default' + + m_find_git_parents.side_effect = lambda proj, sha1, count: [f"{sha1}_{i}" for i in range(11)] + + self.args.newest = 10 + runobj = self.klass(self.args) + runobj.base_args = list() + with pytest.raises(ScheduleFailError) as exc: + runobj.schedule_suite() + assert 'Exceeded 10 backtracks' in str(exc.value) + m_find_git_parents.assert_has_calls( + 
[call('ceph', 'ceph_sha1', 10)] + ) + + @patch('teuthology.suite.util.find_git_parents') + @patch('teuthology.suite.run.Run.schedule_jobs') + @patch('teuthology.suite.run.Run.write_rerun_memo') + @patch('teuthology.suite.util.get_install_task_flavor') + @patch('teuthology.suite.run.config_merge') + @patch('teuthology.suite.run.build_matrix') + @patch('teuthology.suite.util.git_ls_remote') + @patch('teuthology.suite.util.package_version_for_hash') + @patch('teuthology.suite.util.git_validate_sha1') + @patch('teuthology.suite.util.get_arch') + def test_newest_success_same_branch_same_repo( + self, + m_get_arch, + m_git_validate_sha1, + m_package_version_for_hash, + m_git_ls_remote, + m_build_matrix, + m_config_merge, + m_get_install_task_flavor, + m_write_rerun_memo, + m_schedule_jobs, + m_find_git_parents, + ): + """ + Test that we can successfully schedule a job with newest + backtracking when the ceph and suite branches are the same + and the ceph_sha1 is not supplied. We should expect that the + ceph_hash and suite_hash will be updated to the working sha1 + """ + m_get_arch.return_value = 'x86_64' + # rig has_packages_for_distro to fail this many times, so + # everything will run NUM_FAILS+1 times + NUM_FAILS = 5 + # Here we just assume that even fi ceph_sha1 is not supplied, + # in git_valid_sha1, util.git_ls_remote will give us ceph_sha1 + m_git_validate_sha1.return_value = self.args.ceph_sha1 + # Here we know that in create_initial_config, we call + # git_ls_remote 3 times, choose_ceph_hash, choose_suite_hash, + # and choose_teuthology_branch + sha1_side_effect = [ + self.args.ceph_sha1, # ceph_sha1 + 'suite_sha1', # suite_sha1 + 'teuthology_sha1', # teuthology_sha1 + ] + m_git_ls_remote.side_effect = sha1_side_effect + build_matrix_desc = 'desc' + build_matrix_frags = ['frag.yml'] + build_matrix_output = [ + (build_matrix_desc, build_matrix_frags), + ] + m_build_matrix.return_value = build_matrix_output + m_config_merge.return_value = [(a, b, {}) for a, b in 
build_matrix_output] + m_get_install_task_flavor.return_value = 'default' + + # Generate backtracked parent sha1s + parent_sha1s = [f"ceph_sha1_{i}" for i in range(NUM_FAILS)] + assert len(parent_sha1s) + # Last sha1 will be the one that works! + working_sha1 = parent_sha1s[-1] + + # NUM_FAILS attempts, then success on the last parent sha1 + m_package_version_for_hash.side_effect = \ + [None for i in range(NUM_FAILS)] + ["ceph_version"] + + m_find_git_parents.return_value = parent_sha1s + + self.args.newest = 10 + runobj = self.klass(self.args) + runobj.base_args = list() + + # Call schedule_suite() + count = runobj.schedule_suite() + # Epect only 1 job to be scheduled + assert count == 1 + # Expect that we called package_version_for_hash NUM_FAILS times + 1 for the working sha1 + m_package_version_for_hash.assert_has_calls( + [call(self.args.ceph_sha1, 'default', 'ubuntu', '14.04', 'machine_type')] + + [call(f"ceph_sha1_{i}", 'default', 'ubuntu', '14.04', 'machine_type') + for i in range(0, NUM_FAILS)] + ) + # (ceph, base_config.sha1, newest) called once to get grab the backtrace + m_find_git_parents.assert_called_once_with('ceph', 'ceph_sha1', 10) + + # Verify that base_config was updated with the working SHA1 + assert runobj.base_config.sha1 == working_sha1 + + # Verify that config_input's ceph_hash and suite_hash was updated + assert runobj.config_input['ceph_hash'] == working_sha1 + assert runobj.config_input['suite_hash'] == working_sha1 + + # Verify that config_input's ceph_hash and suite_hash are not the same as the original sha1s + assert runobj.config_input['ceph_hash'] != sha1_side_effect[0] # ceph_sha1 + assert runobj.config_input['suite_hash'] != sha1_side_effect[1] # suite_sha1 + + # Verify the sha1 in scheduled jobs + args = m_schedule_jobs.call_args.args + scheduled_jobs = args[1] + + # Check each job has the correct SHA1 + for job in scheduled_jobs: + assert job['sha1'] == working_sha1 + + # Parse YAML from stdin to check for sha1 and suite_hash + 
if 'stdin' in job: + job_yaml = yaml.safe_load(job['stdin']) + assert job_yaml.get('sha1') == working_sha1 + assert job_yaml.get('suite_sha1') == working_sha1 + + @patch('teuthology.suite.util.find_git_parents') + @patch('teuthology.suite.run.Run.schedule_jobs') + @patch('teuthology.suite.run.Run.write_rerun_memo') + @patch('teuthology.suite.util.get_install_task_flavor') + @patch('teuthology.suite.run.config_merge') + @patch('teuthology.suite.run.build_matrix') + @patch('teuthology.suite.util.git_ls_remote') + @patch('teuthology.suite.util.package_version_for_hash') + @patch('teuthology.suite.util.git_validate_sha1') + @patch('teuthology.suite.util.get_arch') + def test_newest_success_diff_branch_diff_repo( + self, + m_get_arch, + m_git_validate_sha1, + m_package_version_for_hash, + m_git_ls_remote, + m_build_matrix, + m_config_merge, + m_get_install_task_flavor, + m_write_rerun_memo, + m_schedule_jobs, + m_find_git_parents, + ): + """ + Test that we can successfully schedule a job with newest + backtracking when the ceph and suite branches are different + and the ceph_sha1 is not supplied. We should expect that the + ceph_hash will be updated to the working sha1, + but the suite_hash will remain the original suite_sha1. 
+ """ + m_get_arch.return_value = 'x86_64' + # Set different branches + self.args.ceph_branch = 'ceph_different_branch' + self.args.suite_branch = 'suite_different_branch' + + # rig has_packages_for_distro to fail this many times, so + # everything will run NUM_FAILS+1 times + NUM_FAILS = 5 + # Here we just assume that even fi ceph_sha1 is not supplied, + # in git_valid_sha1, util.git_ls_remote will give us ceph_sha1 + m_git_validate_sha1.return_value = self.args.ceph_sha1 + # Here we know that in create_initial_config, we call + # git_ls_remote 3 times, choose_ceph_hash, choose_suite_hash, + # and choose_teuthology_branch + sha1_side_effect = [ + self.args.ceph_sha1, # ceph_sha1 + 'suite_sha1', # suite_sha1 + 'teuthology_sha1', # teuthology_sha1 + ] + m_git_ls_remote.side_effect = sha1_side_effect + build_matrix_desc = 'desc' + build_matrix_frags = ['frag.yml'] + build_matrix_output = [ + (build_matrix_desc, build_matrix_frags), + ] + m_build_matrix.return_value = build_matrix_output + m_config_merge.return_value = [(a, b, {}) for a, b in build_matrix_output] + m_get_install_task_flavor.return_value = 'default' + + # Generate backtracked parent sha1s + parent_sha1s = [f"ceph_sha1_{i}" for i in range(NUM_FAILS)] + assert len(parent_sha1s) + # Last sha1 will be the one that works! 
+ working_sha1 = parent_sha1s[-1] + + # NUM_FAILS attempts, then success on the last parent sha1 + m_package_version_for_hash.side_effect = \ + [None for i in range(NUM_FAILS)] + ["ceph_version"] + + m_find_git_parents.return_value = parent_sha1s + + self.args.newest = 10 + runobj = self.klass(self.args) + runobj.base_args = list() + + # Call schedule_suite() + count = runobj.schedule_suite() + # Epect only 1 job to be scheduled + assert count == 1 + # Expect that we called package_version_for_hash NUM_FAILS times + 1 for the working sha1 + m_package_version_for_hash.assert_has_calls( + [call(self.args.ceph_sha1, 'default', 'ubuntu', '14.04', 'machine_type')] + + [call(f"ceph_sha1_{i}", 'default', 'ubuntu', '14.04', 'machine_type') + for i in range(0, NUM_FAILS)] + ) + # (ceph, base_config.sha1, newest) called once to get grab the backtrace + m_find_git_parents.assert_called_once_with('ceph', 'ceph_sha1', 10) + + # Verify that base_config was updated with the working SHA1 + assert runobj.base_config.sha1 == working_sha1 + + # Verify that config_input's ceph_hash was updated, + # but suite_hash is still the original suite_sha1 + assert runobj.config_input['ceph_hash'] == working_sha1 + assert runobj.config_input['suite_hash'] != working_sha1 + + # Verify that config_input's ceph_hash is not the same as the original sha1s + # but suite_hash is still the original suite_sha1 + assert runobj.config_input['ceph_hash'] != sha1_side_effect[0] # ceph_sha1 + assert runobj.config_input['suite_hash'] == sha1_side_effect[1] # suite_sha1 + + # Verify the sha1 in scheduled jobs + args = m_schedule_jobs.call_args.args + scheduled_jobs = args[1] + + # Check each job has the correct SHA1 + for job in scheduled_jobs: + assert job['sha1'] == working_sha1 + + # Parse YAML from stdin to check for sha1 and suite_hash + if 'stdin' in job: + job_yaml = yaml.safe_load(job['stdin']) + assert job_yaml.get('sha1') == working_sha1 + assert job_yaml.get('suite_sha1') == sha1_side_effect[1] diff 
--git a/teuthology/suite/test/test_util.py b/teuthology/suite/test/test_util.py new file mode 100644 index 000000000..daa583023 --- /dev/null +++ b/teuthology/suite/test/test_util.py @@ -0,0 +1,267 @@ +import os +import pytest +import tempfile + +from mock import Mock, patch + +from teuthology.config import config +from teuthology.orchestra.opsys import OS +from teuthology.suite import util +from teuthology.exceptions import BranchNotFoundError, ScheduleFailError + + +REPO_PROJECTS_AND_URLS = [ + 'ceph', + 'https://github.com/not_ceph/ceph.git', +] + + +@pytest.mark.parametrize('project_or_url', REPO_PROJECTS_AND_URLS) +@patch('subprocess.check_output') +def test_git_branch_exists(m_check_output, project_or_url): + m_check_output.return_value = '' + assert False == util.git_branch_exists( + project_or_url, 'nobranchnowaycanthappen') + m_check_output.return_value = b'HHH branch' + assert True == util.git_branch_exists(project_or_url, 'main') + + +@pytest.fixture +def git_repository(request): + d = tempfile.mkdtemp() + os.system(""" + cd {d} + git init + touch A + git config user.email 'you@example.com' + git config user.name 'Your Name' + git add A + git commit -m 'A' A + git rev-parse --abbrev-ref main || git checkout -b main + """.format(d=d)) + + def fin(): + os.system("rm -fr " + d) + request.addfinalizer(fin) + return d + + +class TestUtil(object): + @patch('teuthology.suite.util.smtplib.SMTP') + def test_schedule_fail(self, m_smtp): + config.results_email = "example@example.com" + with pytest.raises(ScheduleFailError) as exc: + util.schedule_fail(message="error msg", dry_run=False) + assert str(exc.value) == "Scheduling failed: error msg" + m_smtp.assert_called() + + @patch('teuthology.suite.util.smtplib.SMTP') + def test_schedule_fail_dryrun(self, m_smtp): + config.results_email = "example@example.com" + with pytest.raises(ScheduleFailError) as exc: + util.schedule_fail(message="error msg", dry_run=True) + assert str(exc.value) == "Scheduling failed: error 
msg" + m_smtp.assert_not_called() + + @patch('teuthology.suite.util.fetch_qa_suite') + @patch('teuthology.suite.util.smtplib.SMTP') + def test_fetch_repo_no_branch(self, m_smtp, m_fetch_qa_suite): + m_fetch_qa_suite.side_effect = BranchNotFoundError( + "no-branch", "https://github.com/ceph/ceph-ci.git") + config.results_email = "example@example.com" + with pytest.raises(ScheduleFailError) as exc: + util.fetch_repos("no-branch", "test1", dry_run=False) + assert str(exc.value) == "Scheduling test1 failed: \ +Branch 'no-branch' not found in repo: https://github.com/ceph/ceph-ci.git!" + m_smtp.assert_called() + + @patch('teuthology.suite.util.fetch_qa_suite') + @patch('teuthology.suite.util.smtplib.SMTP') + def test_fetch_repo_no_branch_dryrun(self, m_smtp, m_fetch_qa_suite): + m_fetch_qa_suite.side_effect = BranchNotFoundError( + "no-branch", "https://github.com/ceph/ceph-ci.git") + config.results_email = "example@example.com" + with pytest.raises(ScheduleFailError) as exc: + util.fetch_repos("no-branch", "test1", dry_run=True) + assert str(exc.value) == "Scheduling test1 failed: \ +Branch 'no-branch' not found in repo: https://github.com/ceph/ceph-ci.git!" 
+ m_smtp.assert_not_called() + + @patch('requests.get') + def test_get_branch_info(self, m_get): + mock_resp = Mock() + mock_resp.ok = True + mock_resp.json.return_value = "some json" + m_get.return_value = mock_resp + result = util.get_branch_info("teuthology", "main") + m_get.assert_called_with( + "https://api.github.com/repos/ceph/teuthology/git/refs/heads/main" + ) + assert result == "some json" + + @patch('teuthology.lock.query') + def test_get_arch_fail(self, m_query): + m_query.list_locks.return_value = False + util.get_arch('magna') + m_query.list_locks.assert_called_with(machine_type="magna", count=1, tries=1) + + @patch('teuthology.lock.query') + def test_get_arch_success(self, m_query): + m_query.list_locks.return_value = [{"arch": "arch"}] + result = util.get_arch('magna') + m_query.list_locks.assert_called_with( + machine_type="magna", + count=1, tries=1 + ) + assert result == "arch" + + def test_build_git_url_github(self): + assert 'project' in util.build_git_url('project') + owner = 'OWNER' + git_url = util.build_git_url('project', project_owner=owner) + assert owner in git_url + + @patch('teuthology.config.TeuthologyConfig.get_ceph_qa_suite_git_url') + def test_build_git_url_ceph_qa_suite_custom( + self, + m_get_ceph_qa_suite_git_url): + url = 'http://foo.com/some' + m_get_ceph_qa_suite_git_url.return_value = url + '.git' + assert url == util.build_git_url('ceph-qa-suite') + + @patch('teuthology.config.TeuthologyConfig.get_ceph_git_url') + def test_build_git_url_ceph_custom(self, m_get_ceph_git_url): + url = 'http://foo.com/some' + m_get_ceph_git_url.return_value = url + '.git' + assert url == util.build_git_url('ceph') + + @patch('teuthology.config.TeuthologyConfig.get_ceph_cm_ansible_git_url') + def test_build_git_url_ceph_cm_ansible_custom(self, m_get_ceph_cm_ansible_git_url): + url = 'http://foo.com/some' + m_get_ceph_cm_ansible_git_url.return_value = url + '.git' + assert url == util.build_git_url('ceph-cm-ansible') + + 
@patch('teuthology.config.TeuthologyConfig.get_ceph_git_url') + def test_git_ls_remote(self, m_get_ceph_git_url, git_repository): + m_get_ceph_git_url.return_value = git_repository + assert util.git_ls_remote('ceph', 'nobranch') is None + assert util.git_ls_remote('ceph', 'main') is not None + + @patch('teuthology.suite.util.requests.get') + def test_find_git_parents(self, m_requests_get): + history_resp = Mock(ok=True) + history_resp.json.return_value = {'sha1s': ['sha1', 'sha1_p']} + m_requests_get.return_value = history_resp + parent_sha1s = util.find_git_parents('ceph', 'sha1') + assert m_requests_get.call_count == 1 + assert parent_sha1s == ['sha1_p'] + + +class TestFlavor(object): + + def test_get_install_task_flavor_bare(self): + config = dict( + tasks=[ + dict( + install=dict(), + ), + ], + ) + assert util.get_install_task_flavor(config) == 'default' + + def test_get_install_task_flavor_simple(self): + config = dict( + tasks=[ + dict( + install=dict( + flavor='notcmalloc', + ), + ), + ], + ) + assert util.get_install_task_flavor(config) == 'notcmalloc' + + def test_get_install_task_flavor_override_simple(self): + config = dict( + tasks=[ + dict(install=dict()), + ], + overrides=dict( + install=dict( + flavor='notcmalloc', + ), + ), + ) + assert util.get_install_task_flavor(config) == 'notcmalloc' + + def test_get_install_task_flavor_override_project(self): + config = dict( + tasks=[ + dict(install=dict()), + ], + overrides=dict( + install=dict( + ceph=dict( + flavor='notcmalloc', + ), + ), + ), + ) + assert util.get_install_task_flavor(config) == 'notcmalloc' + + +class TestMissingPackages(object): + """ + Tests the functionality that checks to see if a + scheduled job will have missing packages in shaman. 
+ """ + @patch("teuthology.packaging.ShamanProject._get_package_version") + def test_distro_has_packages(self, m_gpv): + m_gpv.return_value = "v1" + result = util.package_version_for_hash( + "sha1", + "basic", + "ubuntu", + "14.04", + "mtype", + ) + assert result + + @patch("teuthology.packaging.ShamanProject._get_package_version") + def test_distro_does_not_have_packages(self, m_gpv): + m_gpv.return_value = None + result = util.package_version_for_hash( + "sha1", + "basic", + "rhel", + "7.0", + "mtype", + ) + assert not result + + +class TestDistroDefaults(object): + def test_distro_defaults_plana(self): + expected = ('x86_64', 'ubuntu/22.04', + OS(name='ubuntu', version='22.04', codename='jammy')) + assert util.get_distro_defaults('ubuntu', 'plana') == expected + + def test_distro_defaults_debian(self): + expected = ('x86_64', 'debian/8.0', + OS(name='debian', version='8.0', codename='jessie')) + assert util.get_distro_defaults('debian', 'magna') == expected + + def test_distro_defaults_centos(self): + expected = ('x86_64', 'centos/9', + OS(name='centos', version='9.stream', codename='stream')) + assert util.get_distro_defaults('centos', 'magna') == expected + + def test_distro_defaults_fedora(self): + expected = ('x86_64', 'fedora/25', + OS(name='fedora', version='25', codename='25')) + assert util.get_distro_defaults('fedora', 'magna') == expected + + def test_distro_defaults_default(self): + expected = ('x86_64', 'centos/9', + OS(name='centos', version='9.stream', codename='stream')) + assert util.get_distro_defaults('rhel', 'magna') == expected diff --git a/teuthology/suite/util.py b/teuthology/suite/util.py new file mode 100644 index 000000000..cc884ebf9 --- /dev/null +++ b/teuthology/suite/util.py @@ -0,0 +1,380 @@ +import copy +import functools +import logging +import os +import requests +import smtplib +import socket +from subprocess import Popen, PIPE, DEVNULL +import sys + +from email.mime.text import MIMEText + +import teuthology.lock.query +import 
teuthology.lock.util +from teuthology import repo_utils + +from teuthology.config import config +from teuthology.exceptions import BranchNotFoundError, ScheduleFailError +from teuthology.misc import deep_merge +from teuthology.repo_utils import fetch_qa_suite, fetch_teuthology +from teuthology.orchestra.opsys import OS, DEFAULT_OS_VERSION +from teuthology.packaging import get_builder_project, VersionNotFoundError +from teuthology.repo_utils import build_git_url +from teuthology.task.install import get_flavor + +log = logging.getLogger(__name__) + +CONTAINER_DISTRO = 'centos/9' # the one to check for build_complete +CONTAINER_FLAVOR = 'default' + + +def fetch_repos(branch, test_name, dry_run, commit=None): + """ + Fetch the suite repo (and also the teuthology repo) so that we can use it + to build jobs. Repos are stored in ~/src/. + + The reason the teuthology repo is also fetched is that currently we use + subprocess to call teuthology-schedule to schedule jobs so we need to make + sure it is up-to-date. For that reason we always fetch the main branch + for test scheduling, regardless of what teuthology branch is requested for + testing. + + :returns: The path to the suite repo on disk + """ + try: + # When a user is scheduling a test run from their own copy of + # teuthology, let's not wreak havoc on it. + if config.automated_scheduling: + # We use teuthology's main branch in all cases right now + if config.teuthology_path is None: + fetch_teuthology('main') + suite_repo_path = fetch_qa_suite(branch, commit) + except BranchNotFoundError as exc: + schedule_fail(message=str(exc), name=test_name, dry_run=dry_run) + return suite_repo_path + + +def schedule_fail(message, name='', dry_run=None): + """ + If an email address has been specified anywhere, send an alert there. Then + raise a ScheduleFailError. + Don't send the mail if --dry-run has been passed. 
+ """ + email = config.results_email + if email and not dry_run: + subject = "Failed to schedule {name}".format(name=name) + msg = MIMEText(message) + msg['Subject'] = subject + msg['From'] = config.results_sending_email + msg['To'] = email + try: + smtp = smtplib.SMTP('localhost') + smtp.sendmail(msg['From'], [msg['To']], msg.as_string()) + smtp.quit() + except socket.error: + log.exception("Failed to connect to mail server!") + raise ScheduleFailError(message, name) + + +def get_worker(machine_type): + """ + Map a given machine_type to a beanstalkd worker. If machine_type mentions + multiple machine types - e.g. 'plana,mira', then this returns 'multi'. + Otherwise it returns what was passed. + """ + if ',' in machine_type: + return 'multi' + else: + return machine_type + + +def get_gitbuilder_hash(project=None, branch=None, flavor=None, + machine_type=None, distro=None, + distro_version=None): + """ + Find the hash representing the head of the project's repository via + querying a gitbuilder repo. + + Will return None in the case of a 404 or any other HTTP error. + """ + # Alternate method for github-hosted projects - left here for informational + # purposes + # resp = requests.get( + # 'https://api.github.com/repos/ceph/ceph/git/refs/heads/main') + # hash = .json()['object']['sha'] + (arch, release, _os) = get_distro_defaults(distro, machine_type) + if distro is None: + distro = _os.name + bp = get_builder_project()( + project, + dict( + branch=branch, + flavor=flavor, + os_type=distro, + os_version=distro_version, + arch=arch, + ), + ) + return bp.sha1 + + +def get_distro_defaults(distro, machine_type): + """ + Given a distro (e.g. 
'ubuntu') and machine type, return: + (arch, release, pkg_type) + """ + arch = 'x86_64' + if distro in (None, 'None', 'rhel'): + distro = 'centos' + + try: + os_version = DEFAULT_OS_VERSION[distro] + os_type = distro + except IndexError: + raise ValueError("Invalid distro value passed: %s", distro) + _os = OS(name=os_type, version=os_version) + release = get_builder_project()._get_distro( + _os.name, + _os.version, + _os.codename, + ) + return ( + arch, + release, + _os, + ) + + +def git_ls_remote(project_or_url, branch, project_owner='ceph'): + """ + Find the latest sha1 for a given project's branch. + + :param project_or_url: Either a project name or a full URL + :param branch: The branch to query + :param project_owner: The GitHub project owner. Only used when a project + name is passed; not when a URL is passed + :returns: The sha1 if found; else None + """ + if '://' in project_or_url or project_or_url.startswith('git@'): + url = project_or_url + else: + url = build_git_url(project_or_url, project_owner) + return repo_utils.ls_remote(url, branch) + + +def git_validate_sha1(project, sha1, project_owner='ceph'): + ''' + Use http to validate that project contains sha1 + I can't find a way to do this with git, period, so + we have specific urls to HEAD for github and git.ceph.com/gitweb + for now + ''' + url = build_git_url(project, project_owner) + + if '/github.com/' in url: + url = '/'.join((url, 'commit', sha1)) + elif '/git.ceph.com/' in url: + # kinda specific to knowing git.ceph.com is gitweb + url = ('http://git.ceph.com/?p=%s.git;a=blob_plain;f=.gitignore;hb=%s' + % (project, sha1)) + else: + raise RuntimeError( + 'git_validate_sha1: how do I check %s for a sha1?' 
% url + ) + + resp = requests.head(url) + if resp.ok: + return sha1 + return None + + +def git_branch_exists(project_or_url, branch, project_owner='ceph'): + """ + Query the git repository to check the existence of a project's branch + + :param project_or_url: Either a project name or a full URL + :param branch: The branch to query + :param project_owner: The GitHub project owner. Only used when a project + name is passed; not when a URL is passed + """ + return git_ls_remote(project_or_url, branch, project_owner) is not None + + +def get_branch_info(project, branch, project_owner='ceph'): + """ + NOTE: This is currently not being used because of GitHub's API rate + limiting. We use github_branch_exists() instead. + + Use the GitHub API to query a project's branch. Returns: + {u'object': {u'sha': , + u'type': , + u'url': }, + u'ref': u'refs/heads/', + u'url': } + + We mainly use this to check if a branch exists. + """ + url_templ = 'https://api.github.com/repos/{project_owner}/{project}/git/refs/heads/{branch}' # noqa + url = url_templ.format(project_owner=project_owner, project=project, + branch=branch) + resp = requests.get(url) + if resp.ok: + return resp.json() + + +@functools.lru_cache() +def package_version_for_hash(hash, flavor='default', distro='rhel', + distro_version='8.0', machine_type='smithi'): + """ + Does what it says on the tin. Uses gitbuilder repos. + + :returns: a string. 
+ """ + (arch, release, _os) = get_distro_defaults(distro, machine_type) + if distro in (None, 'None'): + distro = _os.name + bp = get_builder_project()( + 'ceph', + dict( + flavor=flavor, + os_type=distro, + os_version=distro_version, + arch=arch, + sha1=hash, + ), + ) + + if (bp.distro == CONTAINER_DISTRO and bp.flavor == CONTAINER_FLAVOR and + not bp.build_complete): + log.info("Container build incomplete") + return None + + try: + return bp.version + except VersionNotFoundError: + return None + + +def get_arch(machine_type): + """ + Based on a given machine_type, return its architecture by querying the lock + server. + + :returns: A string or None + """ + result = teuthology.lock.query.list_locks(machine_type=machine_type, count=1, tries=1) + if not result: + log.warning("No machines found with machine_type %s!", machine_type) + else: + return result[0]['arch'] + + +def strip_fragment_path(original_path): + """ + Given a path, remove the text before '/suites/'. Part of the fix for + http://tracker.ceph.com/issues/15470 + """ + scan_after = '/suites/' + scan_start = original_path.find(scan_after) + if scan_start > 0: + return original_path[scan_start + len(scan_after):] + return original_path + + +def get_install_task_flavor(job_config): + """ + Pokes through the install task's configuration (including its overrides) to + figure out which flavor it will want to install. + + Only looks at the first instance of the install task in job_config. 
+ """ + project, = job_config.get('project', 'ceph'), + tasks = job_config.get('tasks', dict()) + overrides = job_config.get('overrides', dict()) + install_overrides = overrides.get('install', dict()) + project_overrides = install_overrides.get(project, dict()) + first_install_config = dict() + for task in tasks: + if list(task.keys())[0] == 'install': + first_install_config = list(task.values())[0] or dict() + break + first_install_config = copy.deepcopy(first_install_config) + deep_merge(first_install_config, install_overrides) + deep_merge(first_install_config, project_overrides) + return get_flavor(first_install_config) + + +def teuthology_schedule(args, verbose, dry_run, log_prefix='', stdin=None): + """ + Run teuthology-schedule to schedule individual jobs. + + If --dry-run has been passed but --verbose has been passed just once, don't + actually run the command - only print what would be executed. + + If --dry-run has been passed and --verbose has been passed multiple times, + do both. + """ + exec_path = os.path.join( + os.path.dirname(sys.argv[0]), + 'teuthology-schedule') + args.insert(0, exec_path) + if dry_run: + # Quote any individual args so that individual commands can be copied + # and pasted in order to execute them individually. 
+ printable_args = [] + for item in args: + if ' ' in item: + printable_args.append("'%s'" % item) + else: + printable_args.append(item) + log.debug('{0} command: {1}'.format( + log_prefix, + ' '.join(printable_args), + )) + if not dry_run or (dry_run and verbose > 1): + astdin = DEVNULL if stdin is None else PIPE + p = Popen(args, stdin=astdin) + if stdin is not None: + p.communicate(input=stdin.encode('utf-8')) + else: + p.communicate() + +def find_git_parents(project: str, sha1: str, count=1): + + base_url = config.githelper_base_url + if not base_url: + log.warning('githelper_base_url not set, --newest disabled') + return [] + + def refresh(): + url = f"{base_url}/{project}.git/refresh" + log.info(f"Forcing refresh of git mirror: {url}") + resp = requests.get(url) + if not resp.ok: + log.error('git refresh failed for %s: %s', + project, resp.content.decode()) + + def get_sha1s(project, committish, count): + url = f"{base_url}/{project}.git/history?committish={committish}&count={count}" + log.info(f"Looking for parent commits: {url}") + resp = requests.get(url) + resp.raise_for_status() + sha1s = resp.json()['sha1s'] + if len(sha1s) != count: + resp_json = resp.json() + err_msg = resp_json.get("error") or resp_json.get("err") + log.debug(f"Got {resp.status_code} response: {resp_json}") + log.error(f"Can't find {count} parents of {sha1} in {project}: {err_msg}") + return sha1s + + # index 0 will be the commit whose parents we want to find. + # So we will query for count+1, and strip index 0 from the result. 
+ sha1s = get_sha1s(project, sha1, count + 1) + if not sha1s: + log.error("Will try to refresh git mirror and try again") + refresh() + sha1s = get_sha1s(project, sha1, count + 1) + if sha1s: + return sha1s[1:] + return [] diff --git a/teuthology/task/__init__.py b/teuthology/task/__init__.py new file mode 100644 index 000000000..98330a7bf --- /dev/null +++ b/teuthology/task/__init__.py @@ -0,0 +1,136 @@ +import logging + +from teuthology.misc import deep_merge +from teuthology.orchestra.cluster import Cluster + +log = logging.getLogger(__name__) + + +class Task(object): + """ + A base-class for "new-style" teuthology tasks. + + Can be used as a drop-in replacement for the old-style task functions with + @contextmanager decorators. + + Note: While looking up overrides, we use the lowercase name of the class by + default. While this works well for the main task in a module, other + tasks or 'subtasks' may want to override that name using a class + variable called 'name' e.g.: + + class MyTask(Task): + pass + class MySubtask(MyTask): + name = 'mytask.mysubtask' + """ + + def __init__(self, ctx, config=None): + if not hasattr(self, 'name'): + self.name = self.__class__.__name__.lower() + self.log = log + self.ctx = ctx + self.config = config or dict() + if not isinstance(self.config, dict): + raise TypeError("config must be a dict") + self.apply_overrides() + self.filter_hosts() + + def apply_overrides(self): + """ + Look for an 'overrides' dict in self.ctx.config; look inside that for a + dict with the same name as this task. 
Override any settings in + self.config with those overrides + """ + if not hasattr(self.ctx, 'config'): + return + all_overrides = self.ctx.config.get('overrides', dict()) + if not all_overrides: + return + task_overrides = all_overrides.get(self.name) + if task_overrides: + self.log.debug( + "Applying overrides for task {name}: {overrides}".format( + name=self.name, overrides=task_overrides) + ) + deep_merge(self.config, task_overrides) + + def filter_hosts(self): + """ + Look for a 'hosts' list in self.config. Each item in the list may + either be a role or a hostname. Builds a new Cluster object containing + only those hosts which match one (or more) of the roles or hostnames + specified. The filtered Cluster object is stored as self.cluster so + that the task may only run against those hosts. + """ + if not hasattr(self.ctx, 'cluster'): + return + elif 'hosts' not in self.config: + self.cluster = self.ctx.cluster + return self.cluster + host_specs = self.config.get('hosts', list()) + cluster = Cluster() + for host_spec in host_specs: + role_matches = self.ctx.cluster.only(host_spec) + if len(role_matches.remotes) > 0: + for (remote, roles) in role_matches.remotes.items(): + cluster.add(remote, roles) + elif isinstance(host_spec, str): + for (remote, roles) in self.ctx.cluster.remotes.items(): + if remote.name.split('@')[-1] == host_spec or \ + remote.shortname == host_spec: + cluster.add(remote, roles) + if not cluster.remotes: + raise RuntimeError("All target hosts were excluded!") + self.cluster = cluster + hostnames = [h.shortname for h in self.cluster.remotes.keys()] + self.log.debug("Restricting task {name} to hosts: {hosts}".format( + name=self.name, hosts=' '.join(hostnames)) + ) + return self.cluster + + def setup(self): + """ + Perform any setup that is needed by the task before it executes + """ + pass + + def begin(self): + """ + Execute the main functionality of the task + """ + pass + + def end(self): + """ + Perform any work needed to stop 
processes started in begin() + """ + pass + + def teardown(self): + """ + Perform any work needed to restore configuration to a previous state. + + Can be skipped by setting 'skip_teardown' to True in self.config + """ + pass + + def __enter__(self): + """ + When using an instance of the class as a context manager, this method + calls self.setup(), then calls self.begin() and returns self. + """ + self.setup() + self.begin() + return self + + def __exit__(self, type_, value, traceback): + """ + When using an instance of the class as a context manager, this method + calls self.end() and self.teardown() - unless + self.config['skip_teardown'] is True + """ + self.end() + if self.config.get('skip_teardown', False): + self.log.info("Skipping teardown") + else: + self.teardown() diff --git a/teuthology/task/ansible.py b/teuthology/task/ansible.py new file mode 100644 index 000000000..29d1170d1 --- /dev/null +++ b/teuthology/task/ansible.py @@ -0,0 +1,497 @@ +import json +import logging +import re +import requests +import os +import pathlib +import pexpect +import yaml +import shutil + +from tempfile import mkdtemp, NamedTemporaryFile + +from teuthology import repo_utils +from teuthology.config import config as teuth_config +from teuthology.exceptions import CommandFailedError, AnsibleFailedError +from teuthology.job_status import set_status +from teuthology.task import Task +from teuthology.util.loggerfile import LoggerFile + +log = logging.getLogger(__name__) + + +class FailureAnalyzer: + def analyze(self, failure_log): + failure_obj = yaml.safe_load(failure_log) + lines = set() + if failure_obj is None: + return lines + for host_obj in failure_obj.values(): + if not isinstance(host_obj, dict): + continue + lines = lines.union(self.analyze_host_record(host_obj)) + return sorted(lines) + + def analyze_host_record(self, record): + lines = set() + for result in record.get("results", [record]): + cmd = result.get("cmd", "") + # When a CPAN task fails, we get _lots_ of 
stderr_lines, and they + # aren't practical to reduce meaningfully. Instead of analyzing lines, + # just report the command that failed. + if "cpan" in cmd: + lines.add(f"CPAN command failed: {cmd}") + continue + lines_to_analyze = [] + if "stderr_lines" in result: + lines_to_analyze = result["stderr_lines"] + elif "msg" in result: + lines_to_analyze = result["msg"].split("\n") + lines_to_analyze.extend(result.get("err", "").split("\n")) + for line in lines_to_analyze: + line = self.analyze_line(line.strip()) + if line: + lines.add(line) + return list(lines) + + def analyze_line(self, line): + if line.startswith("W: ") or line.endswith("?"): + return "" + drop_phrases = [ + # apt output sometimes contains warnings or suggestions. Those won't be + # helpful, so throw them out. + r"^W: ", + r"\?$", + # some output from SSH is not useful + r"Warning: Permanently added .+ to the list of known hosts.", + r"^@+$", + ] + for phrase in drop_phrases: + match = re.search(rf"({phrase})", line, flags=re.IGNORECASE) + if match: + return "" + + # Next, we can normalize some common phrases. + phrases = [ + "connection timed out", + r"(unable to|could not) connect to [^ ]+", + r"temporary failure resolving [^ ]+", + r"Permissions \d+ for '.+' are too open.", + ] + for phrase in phrases: + match = re.search(rf"({phrase})", line, flags=re.IGNORECASE) + if match: + line = match.groups()[0] + break + + # Strip out URLs for specific packages + package_re = re.compile(r"https?://.*\.(deb|rpm)") + line = package_re.sub("", line) + # Strip out IP addresses + ip_re = re.compile(r"\[IP: \d+\.\d+\.\d+\.\d+( \d+)?\]") + line = ip_re.sub("", line) + return line + + +class Ansible(Task): + """ + A task to run ansible playbooks + + Required configuration parameters: + playbook: Required; can either be a list of plays, or a path/URL to a + playbook. In the case of a path, it may be relative to the + repo's on-disk location (if a repo is provided), or + teuthology's working directory. 
+ + Optional configuration parameters: + repo: A path or URL to a repo (defaults to '.'). Given a repo + value of 'foo', ANSIBLE_ROLES_PATH is set to 'foo/roles' + branch: If pointing to a remote git repo, use this branch. Defaults + to 'main'. + hosts: A list of teuthology roles or partial hostnames (or a + combination of the two). ansible-playbook will only be run + against hosts that match. + inventory: A path to be passed to ansible-playbook with the + --inventory-file flag; useful for playbooks that also have + vars they need access to. If this is not set, we check for + /etc/ansible/hosts and use that if it exists. If it does + not, we generate a temporary file to use. + tags: A string including any (comma-separated) tags to be passed + directly to ansible-playbook. + skip_tags: A string of comma-separated tags that will be skipped by + passing them to ansible-playbook using --skip-tags. + vars: A dict of vars to be passed to ansible-playbook via the + --extra-vars flag + group_vars: A dict with keys matching relevant group names in the + playbook, and values to be written in the corresponding + inventory's group_vars files. Only applies to inventories + generated by this task. + cleanup: If present, the given or generated playbook will be run + again during teardown with a 'cleanup' var set to True. + This will allow the playbook to clean up after itself, + if the playbook supports this feature. + reconnect: If set to True (the default), then reconnect to hosts after + ansible-playbook completes. This is in case the playbook + makes changes to the SSH configuration, or user accounts - + we would want to reflect those changes immediately. 
+ + Examples: + + tasks: + - ansible: + repo: https://github.com/ceph/ceph-cm-ansible.git + playbook: + - roles: + - some_role + - another_role + hosts: + - client.0 + - host1 + + tasks: + - ansible: + repo: /path/to/repo + inventory: /path/to/inventory + playbook: /path/to/playbook.yml + tags: my_tags + skip_tags: my_skipped_tags + vars: + var1: string_value + var2: + - list_item + var3: + key: value + + """ + # set this in subclasses to provide a group to + # assign hosts to for dynamic inventory creation + inventory_group = None + + def __init__(self, ctx, config): + super(Ansible, self).__init__(ctx, config) + self.generated_inventory = False + self.generated_playbook = False + self.log = logging.Logger(__name__) + if ctx.archive: + self.log.addHandler(logging.FileHandler( + os.path.join(ctx.archive, "ansible.log"))) + + def setup(self): + super(Ansible, self).setup() + self.find_repo() + self.get_playbook() + self.get_inventory() or self.generate_inventory() + if not hasattr(self, 'playbook_file'): + self.generate_playbook() + + @property + def failure_log(self): + if not hasattr(self, '_failure_log'): + self._failure_log = NamedTemporaryFile( + prefix="teuth_ansible_failures_", + delete=False, + ) + return self._failure_log + + def find_repo(self): + """ + Locate the repo we're using; cloning it from a remote repo if necessary + """ + repo = self.config.get('repo', '.') + if repo.startswith(('http://', 'https://', 'git@', 'git://')): + repo_path = repo_utils.fetch_repo( + repo, + self.config.get('branch', 'main'), + ) + else: + repo_path = os.path.abspath(os.path.expanduser(repo)) + self.repo_path = repo_path + + def get_playbook(self): + """ + If necessary, fetch and read the playbook file + """ + playbook = self.config['playbook'] + if isinstance(playbook, list): + # Multiple plays in a list + self.playbook = playbook + elif isinstance(playbook, str) and playbook.startswith(('http://', + 'https://')): + response = requests.get(playbook) + 
response.raise_for_status() + self.playbook = yaml.safe_load(response.text) + elif isinstance(playbook, str): + try: + playbook_path = os.path.expanduser(playbook) + if not playbook_path.startswith('/'): + # If the path is not absolute at this point, look for the + # playbook in the repo dir. If it's not there, we assume + # the path is relative to the working directory + pb_in_repo = os.path.join(self.repo_path, playbook_path) + if os.path.exists(pb_in_repo): + playbook_path = pb_in_repo + self.playbook_file = open(playbook_path) + playbook_yaml = yaml.safe_load(self.playbook_file) + self.playbook = playbook_yaml + except Exception: + log.error("Unable to read playbook file %s", playbook) + raise + else: + raise TypeError( + "playbook value must either be a list, URL or a filename") + log.info("Playbook: %s", self.playbook) + + def get_inventory(self): + """ + Determine whether or not we're using an existing inventory file + """ + self.inventory = self.config.get('inventory') + etc_ansible_hosts = '/etc/ansible/hosts' + if self.inventory: + self.inventory = os.path.expanduser(self.inventory) + elif os.path.exists(etc_ansible_hosts): + self.inventory = etc_ansible_hosts + return self.inventory + + def generate_inventory(self): + """ + Generate a hosts (inventory) file to use. This should not be called if + we're using an existing file. + """ + hosts = self.cluster.remotes.keys() + hostnames = [remote.hostname for remote in hosts] + hostnames.sort() + inventory = [] + if self.inventory_group: + inventory.append('[{0}]'.format(self.inventory_group)) + inventory.extend(hostnames + ['']) + hosts_str = '\n'.join(inventory) + self.inventory = self._write_inventory_files(hosts_str) + self.generated_inventory = True + + def _write_inventory_files(self, inventory, inv_suffix=''): + """ + Actually write the inventory files. Writes out group_vars files as + necessary based on configuration. 
+ + :param inventory: The content of the inventory file itself, as a + string + :param inv_suffix: The suffix to use for the inventory filename + """ + # First, create the inventory directory + inventory_dir = mkdtemp( + prefix="teuth_ansible_inventory", + ) + inv_fn = os.path.join(inventory_dir, 'inventory') + if inv_suffix: + inv_fn = '.'.join(inv_fn, inv_suffix) + # Write out the inventory file + inv_file = open(inv_fn, 'w') + inv_file.write(inventory) + # Next, write the group_vars files + all_group_vars = self.config.get('group_vars') + if all_group_vars: + group_vars_dir = os.path.join(inventory_dir, 'group_vars') + os.mkdir(group_vars_dir) + # We loop over a sorted list of keys here because we want our tests + # to be able to mock predictably + for group_name in sorted(all_group_vars): + group_vars = all_group_vars[group_name] + path = os.path.join(group_vars_dir, group_name + '.yml') + gv_file = open(path, 'w') + yaml.safe_dump(group_vars, gv_file) + + return inventory_dir + + def generate_playbook(self): + """ + Generate a playbook file to use. This should not be called if we're + using an existing file. 
+ """ + playbook_file = NamedTemporaryFile( + prefix="teuth_ansible_playbook_", + dir=self.repo_path, + delete=False, + ) + yaml.safe_dump(self.playbook, playbook_file, explicit_start=True) + playbook_file.flush() + self.playbook_file = playbook_file + self.generated_playbook = True + + def begin(self): + super(Ansible, self).begin() + if len(self.cluster.remotes) > 0: + self.execute_playbook() + else: + log.info("There are no remotes; skipping playbook execution") + + def execute_playbook(self, _logfile=None): + """ + Execute ansible-playbook + + :param _logfile: Use this file-like object instead of a LoggerFile for + testing + """ + environ = os.environ + environ['ANSIBLE_SSH_PIPELINING'] = '1' + environ['ANSIBLE_FAILURE_LOG'] = self.failure_log.name + environ['ANSIBLE_ROLES_PATH'] = "%s/roles" % self.repo_path + environ['ANSIBLE_NOCOLOR'] = "1" + # Store collections in /.ansible/ + # This is the same path used in /ansible.cfg + environ['ANSIBLE_COLLECTIONS_PATH'] = str( + pathlib.Path(__file__).parents[2] / ".ansible") + args = self._build_args() + command = ' '.join(args) + log.debug("Running %s", command) + + out, status = pexpect.run( + command, + cwd=self.repo_path, + logfile=_logfile or LoggerFile(self.log, logging.INFO), + withexitstatus=True, + timeout=None, + ) + if status != 0: + self._handle_failure(command, status) + + if self.config.get('reconnect', True) is True: + remotes = list(self.cluster.remotes) + log.debug("Reconnecting to %s", remotes) + for remote in remotes: + remote.reconnect() + + def _handle_failure(self, command, status): + self._set_status('dead') + failures = None + with open(self.failure_log.name, 'r') as fail_log_file: + fail_log = fail_log_file.read() + try: + analyzer = FailureAnalyzer() + failures = analyzer.analyze(fail_log) + except yaml.YAMLError as e: + log.error( + f"Failed to parse ansible failure log: {self.failure_log.name} ({e})" + ) + except Exception: + log.exception(f"Failed to analyze ansible failure log: 
{self.failure_log.name}") + # If we hit an exception, or if analyze() returned nothing, use the log as-is + if not failures: + failures = fail_log.replace('\n', '') + + if failures: + self._archive_failures() + raise AnsibleFailedError(failures) + raise CommandFailedError(command, status) + + def _set_status(self, status): + """ + Not implemented in the base class + """ + pass + + def _archive_failures(self): + if self.ctx.archive: + archive_path = "{0}/ansible_failures.yaml".format(self.ctx.archive) + log.info("Archiving ansible failure log at: {0}".format( + archive_path, + )) + shutil.move( + self.failure_log.name, + archive_path + ) + os.chmod(archive_path, 0o664) + + def _build_args(self): + """ + Assemble the list of args to be executed + """ + fqdns = [r.hostname for r in self.cluster.remotes.keys()] + # Assume all remotes use the same username + user = list(self.cluster.remotes)[0].user + extra_vars = dict(ansible_ssh_user=user) + extra_vars.update(self.config.get('vars', dict())) + args = [ + 'ansible-playbook', '-v', + "--extra-vars", "'%s'" % json.dumps(extra_vars), + '-i', self.inventory, + '--limit', ','.join(fqdns), + self.playbook_file.name, + ] + tags = self.config.get('tags') + if tags: + args.extend(['--tags', tags]) + skip_tags = self.config.get('skip_tags') + if skip_tags: + args.extend(['--skip-tags', skip_tags]) + return args + + def teardown(self): + self._cleanup() + if self.generated_inventory: + shutil.rmtree(self.inventory) + if self.generated_playbook: + os.remove(self.playbook_file.name) + super(Ansible, self).teardown() + + def _cleanup(self): + """ + If the ``cleanup`` key exists in config the same playbook will be + run again during the teardown step with the var ``cleanup`` given with + a value of ``True``. If supported, this will allow the playbook to + cleanup after itself during teardown. 
+ """ + if self.config.get("cleanup"): + log.info("Running ansible cleanup...") + extra = dict(cleanup=True) + if self.config.get('vars'): + self.config.get('vars').update(extra) + else: + self.config['vars'] = extra + self.execute_playbook() + else: + log.info("Skipping ansible cleanup...") + + +class CephLab(Ansible): + __doc__ = """ + A very simple subclass of Ansible that defaults to: + + - ansible.cephlab: + repo: {git_base}ceph-cm-ansible.git + branch: main + playbook: cephlab.yml + + If a dynamic inventory is used, all hosts will be assigned to the + group 'testnodes'. + """.format(git_base=teuth_config.ceph_git_base_url) + + # Set the name so that Task knows to look up overrides for + # 'ansible.cephlab' instead of just 'cephlab' + name = 'ansible.cephlab' + inventory_group = 'testnodes' + + def __init__(self, ctx, config): + config = config or dict() + if 'playbook' not in config: + config['playbook'] = 'cephlab.yml' + if 'repo' not in config: + config['repo'] = teuth_config.get_ceph_cm_ansible_git_url() + super(CephLab, self).__init__(ctx, config) + + def begin(self): + # Write foo to ~/.vault_pass.txt if it's missing. + # In almost all cases we don't need the actual vault password. + # Touching an empty file broke as of Ansible 2.4 + vault_pass_path = os.path.expanduser('~/.vault_pass.txt') + if not os.path.exists(vault_pass_path): + with open(vault_pass_path, 'w') as f: + f.write('foo') + super(CephLab, self).begin() + + def _set_status(self, status): + set_status(self.ctx.summary, status) + + +task = Ansible +cephlab = CephLab diff --git a/teuthology/task/args.py b/teuthology/task/args.py new file mode 100644 index 000000000..17e9e9dc0 --- /dev/null +++ b/teuthology/task/args.py @@ -0,0 +1,60 @@ +""" +These routines only appear to be used by the peering_speed tests. +""" +def gen_args(name, args): + """ + Called from argify to generate arguments. 
+ """ + usage = [""] + usage += [name + ':'] + usage += \ + [" {key}: <{usage}> ({default})".format( + key=key, usage=_usage, default=default) + for (key, _usage, default, _) in args] + usage.append('') + usage.append(name + ':') + usage += \ + [" {key}: {default}".format( + key = key, default = default) + for (key, _, default, _) in args] + usage = '\n'.join(' ' + i for i in usage) + def ret(config): + """ + return an object with attributes set from args. + """ + class Object(object): + """ + simple object + """ + pass + obj = Object() + for (key, usage, default, conv) in args: + if key in config: + setattr(obj, key, conv(config[key])) + else: + setattr(obj, key, conv(default)) + return obj + return usage, ret + +def argify(name, args): + """ + Object used as a decorator for the peering speed tests. + See peering_spee_test.py + """ + (usage, config_func) = gen_args(name, args) + def ret1(f): + """ + Wrapper to handle doc and usage information + """ + def ret2(**kwargs): + """ + Call f (the parameter passed to ret1) + """ + config = kwargs.get('config', {}) + if config is None: + config = {} + kwargs['config'] = config_func(config) + return f(**kwargs) + ret2.__doc__ = f.__doc__ + usage + return ret2 + return ret1 diff --git a/teuthology/task/background_exec.py b/teuthology/task/background_exec.py new file mode 100644 index 000000000..897b52531 --- /dev/null +++ b/teuthology/task/background_exec.py @@ -0,0 +1,76 @@ +""" +Background task +""" + +import contextlib +import logging + +from teuthology import misc +from teuthology.orchestra import run + +log = logging.getLogger(__name__) + + +@contextlib.contextmanager +def task(ctx, config): + """ + Run a background task. + + Run the given command on a client, similar to exec. However, when + we hit the finally because the subsequent task is ready to exit, kill + the child process. + + We do not do any error code checking here since we are forcefully killing + off the child when we are done. 
+ + If the command a list, we simply join it with ;'s. + + Example:: + + tasks: + - install: + - background_exec: + client.0: while true ; do date ; sleep 1 ; done + client.1: + - while true + - do id + - sleep 1 + - done + - exec: + client.0: + - sleep 10 + + """ + assert isinstance(config, dict), "task background got invalid config" + + testdir = misc.get_testdir(ctx) + + tasks = {} + for role, cmd in config.items(): + (remote,) = ctx.cluster.only(role).remotes.keys() + log.info('Running background command on role %s host %s', role, + remote.name) + if isinstance(cmd, list): + cmd = '; '.join(cmd) + cmd.replace('$TESTDIR', testdir) + tasks[remote.name] = remote.run( + args=[ + 'sudo', + 'TESTDIR=%s' % testdir, + 'daemon-helper', 'kill', '--kill-group', + 'bash', '-c', cmd, + ], + wait=False, + stdin=run.PIPE, + check_status=False, + logger=log.getChild(remote.name) + ) + + try: + yield + + finally: + for name, task in tasks.items(): + log.info('Stopping background command on %s', name) + task.stdin.close() + run.wait(tasks.values()) diff --git a/teuthology/task/buildpackages.py b/teuthology/task/buildpackages.py new file mode 100644 index 000000000..ae56af01f --- /dev/null +++ b/teuthology/task/buildpackages.py @@ -0,0 +1,245 @@ +""" +Build ceph packages + +Unit tests: + +py.test -v -s tests/test_buildpackages.py + +Integration tests: + +teuthology-openstack --verbose --key-name myself --key-filename ~/Downloads/myself --ceph infernalis --suite teuthology/buildpackages + +""" +import copy +import logging +import os +import types +from teuthology import packaging +from teuthology import misc +from teuthology.config import config as teuth_config +from teuthology.openstack import OpenStack + +log = logging.getLogger(__name__) + +class LocalGitbuilderProject(packaging.GitbuilderProject): + + def __init__(self): + pass + + +def get_pkg_type(os_type): + if os_type in ('centos', 'fedora', 'opensuse', 'rhel', 'sle'): + return 'rpm' + else: + return 'deb' + +def 
apply_overrides(ctx, config): + if config is None: + config = {} + else: + config = copy.deepcopy(config) + + assert isinstance(config, dict), \ + "task install only supports a dictionary for configuration" + + project, = config.get('project', 'ceph'), + log.debug('project %s' % project) + overrides = ctx.config.get('overrides') + if overrides: + install_overrides = overrides.get('install', {}) + misc.deep_merge(config, install_overrides.get(project, {})) + return config + +def get_config_install(ctx, config): + config = apply_overrides(ctx, config) + log.debug('install config %s' % config) + return [(config.get('flavor', 'default'), + config.get('tag', ''), + config.get('branch', ''), + config.get('sha1'))] + +def get_config_install_upgrade(ctx, config): + log.debug('install.upgrade config before override %s' % config) + configs = [] + for (role, role_config) in config.items(): + if role_config is None: + role_config = {} + o = apply_overrides(ctx, role_config) + + log.debug('install.upgrade config ' + str(role_config) + + ' and with overrides ' + str(o)) + # for install.upgrade overrides are actually defaults + configs.append((o.get('flavor', 'default'), + role_config.get('tag', o.get('tag', '')), + role_config.get('branch', o.get('branch', '')), + role_config.get('sha1', o.get('sha1')))) + return configs + +GET_CONFIG_FUNCTIONS = { + 'install': get_config_install, + 'install.upgrade': get_config_install_upgrade, +} + +def lookup_configs(ctx, node): + configs = [] + if type(node) is types.ListType: + for leaf in node: + configs.extend(lookup_configs(ctx, leaf)) + elif type(node) is types.DictType: + for (key, value) in node.items(): + if key in ('install', 'install.upgrade'): + configs.extend(GET_CONFIG_FUNCTIONS[key](ctx, value)) + elif key in ('overrides',): + pass + else: + configs.extend(lookup_configs(ctx, value)) + return configs + +def get_sha1(ref): + url = teuth_config.get_ceph_git_url() + ls_remote = misc.sh("git ls-remote " + url + " " + ref) + return 
ls_remote.split()[0] + +def task(ctx, config): + """ + Build Ceph packages. This task will automagically be run + before the task that need to install packages (this is taken + care of by the internal teuthology task). + + The config should be as follows: + + buildpackages: + good_machine: + disk: 40 # GB + ram: 48000 # MB + cpus: 16 + min_machine: + disk: 40 # GB + ram: 8000 # MB + cpus: 1 + + example: + + tasks: + - buildpackages: + good_machine: + disk: 40 # GB + ram: 15000 # MB + cpus: 16 + min_machine: + disk: 40 # GB + ram: 8000 # MB + cpus: 1 + - install: + + When a buildpackages task is already included, the values it contains can be + overriden with: + + overrides: + buildpackages: + good_machine: + disk: 20 # GB + ram: 2000 # MB + cpus: 2 + min_machine: + disk: 10 # GB + ram: 1000 # MB + cpus: 1 + + """ + log.info('Beginning buildpackages...') + if config is None: + config = {} + assert isinstance(config, dict), \ + 'task only accepts a dict for config not ' + str(config) + overrides = ctx.config.get('overrides', {}) + misc.deep_merge(config, overrides.get('buildpackages', {})) + d = os.path.join(os.path.dirname(__file__), 'buildpackages') + os_type = misc.get_distro(ctx) + os_version = misc.get_distro_version(ctx) + arch = ctx.config.get('arch', OpenStack().get_default_arch()) + dist = LocalGitbuilderProject()._get_distro(distro=os_type, + version=os_version) + pkg_type = get_pkg_type(os_type) + misc.sh( + "flock --close /tmp/buildpackages " + + "make -C " + d + " " + os.environ['HOME'] + "/.ssh_agent") + for (flavor, tag, branch, sha1) in lookup_configs(ctx, ctx.config): + if tag: + sha1 = get_sha1(tag) + elif branch: + sha1 = get_sha1(branch) + log.info("building flavor = " + flavor + "," + + " tag = " + tag + "," + + " branch = " + branch + "," + + " sha1 = " + sha1) + self_name = 'teuthology' + key_name = 'teuthology' + pkg_repo = 'packages-repository' + security_group = 'teuthology' + if teuth_config.openstack.has_key('selfname'): + self_name = 
teuth_config.openstack['selfname'] + if teuth_config.openstack.has_key('keypair'): + key_name = teuth_config.openstack['keypair'] + if teuth_config.openstack.has_key('package_repo'): + pkg_repo = teuth_config.openstack['package_repo'] + if teuth_config.openstack.has_key('server_group'): + security_group = teuth_config.openstack['server_group'] + target = (self_name + '-ceph-' + + pkg_type + '-' + + dist + '-' + + arch + '-' + + flavor + '-' + + sha1) + openstack = OpenStack() + openstack.set_provider() + network = openstack.net() + if network != "": + network = " OPENSTACK_NETWORK='" + network + "' " + openstack.image(os_type, os_version, arch) # create if it does not exist + build_flavor = openstack.flavor_range( + config['min_machine'], config['good_machine'], arch) + default_arch = openstack.get_default_arch() + http_flavor = openstack.flavor({ + 'disk': 30, # GB + 'ram': 1024, # MB + 'cpus': 1, + }, default_arch) + + lock = "/tmp/buildpackages-" + sha1 + "-" + os_type + "-" + os_version + cmd = (". " + os.environ['HOME'] + "/.ssh_agent ; " + + " flock --close " + lock + + " make -C " + d + + network + + " SELFNAME=" + self_name + + " KEY_NAME=" + key_name + + " PKG_REPO=" + pkg_repo + + " SEC_GROUP=" + security_group + + " CEPH_GIT_URL=" + teuth_config.get_ceph_git_url() + + " CEPH_PKG_TYPE=" + pkg_type + + " CEPH_OS_TYPE=" + os_type + + " CEPH_OS_VERSION=" + os_version + + " CEPH_DIST=" + dist + + " CEPH_ARCH=" + arch + + " CEPH_SHA1=" + sha1 + + " CEPH_TAG=" + tag + + " CEPH_BRANCH=" + branch + + " CEPH_FLAVOR=" + flavor + + " BUILD_FLAVOR=" + build_flavor + + " HTTP_FLAVOR=" + http_flavor + + " HTTP_ARCH=" + default_arch + + " BUILDPACKAGES_CANONICAL_TAGS=" + + ("true" if teuth_config.canonical_tags else "false") + + " " + target + + " ") + log.info("Executing the following make command to build {} packages. 
" \ + "Note that some values in the command, like CEPH_GIT_URL " \ + "and BUILDPACKAGES_CANONICAL_TAGS, may differ from similar " \ + "command-line parameter values. This is because " \ + "the values used by this task are taken from the teuthology " \ + "configuration file. If in doubt, tear down your teuthology " \ + "instance and start again from scratch.".format(pkg_type)) + log.info("buildpackages make command: " + cmd) + misc.sh(cmd) + teuth_config.gitbuilder_host = openstack.get_ip(pkg_repo, '') + log.info('Finished buildpackages') diff --git a/teuthology/task/buildpackages/Makefile b/teuthology/task/buildpackages/Makefile new file mode 100644 index 000000000..9de81db4b --- /dev/null +++ b/teuthology/task/buildpackages/Makefile @@ -0,0 +1,84 @@ +SHELL=/bin/bash +D=/tmp/stampsdir +VPATH=${D} +TIMEOUT_SERVER_CREATE = 30m +TIMEOUT_BUILD = 220m # 20 minutes short of 4 hours +SEC_GROUP=teuthology +KEY_NAME=teuthology +SELFNAME=teuthology +PKG_REPO=packages-repository +PKG_REPO_OS_TYPE=ubuntu +PKG_REPO_OS_VERSION=14.04 +PKG_REPO_USER_DATA=${PKG_REPO_OS_TYPE}-${PKG_REPO_OS_VERSION}-user-data.txt + +# We want to extract the first listed IPv4 address! 
+# Openstack will provide the addresses field in this format: +# "net1-name=ip(, ip)+(; net2-name=ip(, ip)+)+" +# Each IP may be v4 or v6 (including shortened forms and IPv4-mapped-IPv6 forms) +# 1.2.3.4 +# 2001:db8:6050:ed4d:f816:3eff:fe48:3b36 +# 2001:db8::fe48:3b36 +# 2001:db8::1.2.3.4 +# Example long-form input: +# private-network=10.10.10.69, 2001:db8:6050:ed4d:f816:3eff:fed1:d9f8;net-name2=2001:db8::fe48:3b36, 2001:db8::1.2.3.4, 1.2.3.4; +# TODO: allow selection of the network instead of taking the first network +# TODO: Support IPv6 in future +define get_ip +$$(openstack server show -f value -c addresses $(1) |perl -pe 's/^[^=]+=([^;]+).*/\1/g; s/[ ,]/\n/g; ' |grep -v -e ':' -e '^$$' |head -n1) +endef + +MY_IP=$(shell hostname -I | cut -f1 -d' ') + +${HOME}/.ssh_agent: + ssh-agent -s > ${HOME}/.ssh_agent + source ${HOME}/.ssh_agent ; ssh-add ; ssh-add -l + grep -q ssh_agent ~/.bashrc_teuthology || echo 'source ${HOME}/.ssh_agent' >> ~/.bashrc_teuthology + +flock-${PKG_REPO}: + timeout $(TIMEOUT_SERVER_CREATE) openstack server create --image 'teuthology-ubuntu-14.04-${HTTP_ARCH}' ${OPENSTACK_NETWORK} --flavor ${HTTP_FLAVOR} --key-name ${KEY_NAME} --security-group ${SEC_GROUP} --property ownedby=${MY_IP} --user-data ${PKG_REPO_USER_DATA} --wait ${PKG_REPO} + sleep 30 + set -ex ; \ + ip=$(call get_ip,${PKG_REPO}) ; \ + for delay in 60 60 60 60 2 2 2; do sleep $$delay ; if ssh -o 'ConnectTimeout=3' $$ip bash -c '"grep -q READYTORUN /var/log/cloud-init*.log"' ; then break ; else echo "ssh status code $$?" 
; fi ; done ; \ + ssh $$ip sudo apt-get update ; \ + ssh $$ip sudo apt-get install -y nginx rsync && \ + ssh $$ip sudo chown -R ubuntu /usr/share/nginx/html && \ + ssh $$ip sudo rm /usr/share/nginx/html/\* && \ + ssh $$ip sudo perl -pi -e '"s|location / {|location / { autoindex on;|"' /etc/nginx/sites-available/default && \ + ssh $$ip sudo /etc/init.d/nginx restart && \ + perl -pi -e "s/^gitbuilder_host:.*/gitbuilder_host: $$ip/" ~/.teuthology.yaml + touch ${D}/$@ + +${PKG_REPO}: + mkdir -p ${D} + flock --close ${D}/flock-$@.lock ${MAKE} flock-$@ + touch ${D}/$@ + +# Just because 'server create' return success does not mean it actually succeeded! +# Check the server status before we proceed. +# If it's a weird status, bail out and let the delete fire +# eg: ERROR status can happen if there is no VM host without enough capacity for the request. +${SELFNAME}-ceph-${CEPH_PKG_TYPE}-${CEPH_DIST}-${CEPH_ARCH}-${CEPH_FLAVOR}-${CEPH_SHA1}: ${PKG_REPO} + timeout $(TIMEOUT_SERVER_CREATE) openstack server create --image 'makecheck-${CEPH_OS_TYPE}-${CEPH_OS_VERSION}-${CEPH_ARCH}' ${OPENSTACK_NETWORK} --flavor ${BUILD_FLAVOR} --key-name ${KEY_NAME} --security-group ${SEC_GROUP} --property ownedby=${MY_IP} --user-data ${CEPH_OS_TYPE}-${CEPH_OS_VERSION}-user-data.txt --wait $@ + set -ex ; \ + trap "openstack server delete --wait $@" EXIT ; \ + for delay in 30 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 ; do \ + status=$$(openstack server show -c status -f value $@) ; \ + case $$status in \ + ACTIVE) break ;; \ + NOSTATE|*BUILD|*BOOT|*RESIZE) sleep $$delay ;; \ + *) exit 1 ;; \ + esac ; \ + done ; \ + ip=$(call get_ip,$@) ; \ + test -n "$$ip" || exit ; \ + for delay in 60 60 60 60 2 2 2 ; do sleep $$delay ; if ssh -o 'ConnectTimeout=3' $$ip bash -c '"grep -q READYTORUN /var/log/cloud-init*.log"' ; then break ; else echo "ssh status code $$?" 
; fi ; done ; \ + scp make-${CEPH_PKG_TYPE}.sh common.sh ubuntu@$$ip: ; \ + packages_repository=$(call get_ip,${> /etc/ssh/sshd_config + - ( echo 'Defaults !requiretty' ; echo 'Defaults visiblepw' ) | tee /etc/sudoers.d/cephlab_sudo +preserve_hostname: true +system_info: + default_user: + name: ubuntu +packages: + - dracut-modules-growroot +runcmd: + - mkinitrd --force /boot/initramfs-2.6.32-573.3.1.el6.x86_64.img 2.6.32-573.3.1.el6.x86_64 + - reboot +final_message: "READYTORUN" diff --git a/teuthology/task/buildpackages/centos-7.0-user-data.txt b/teuthology/task/buildpackages/centos-7.0-user-data.txt new file mode 120000 index 000000000..2eb0e3c88 --- /dev/null +++ b/teuthology/task/buildpackages/centos-7.0-user-data.txt @@ -0,0 +1 @@ +user-data.txt \ No newline at end of file diff --git a/teuthology/task/buildpackages/centos-7.1-user-data.txt b/teuthology/task/buildpackages/centos-7.1-user-data.txt new file mode 120000 index 000000000..2eb0e3c88 --- /dev/null +++ b/teuthology/task/buildpackages/centos-7.1-user-data.txt @@ -0,0 +1 @@ +user-data.txt \ No newline at end of file diff --git a/teuthology/task/buildpackages/centos-7.2-user-data.txt b/teuthology/task/buildpackages/centos-7.2-user-data.txt new file mode 120000 index 000000000..2eb0e3c88 --- /dev/null +++ b/teuthology/task/buildpackages/centos-7.2-user-data.txt @@ -0,0 +1 @@ +user-data.txt \ No newline at end of file diff --git a/teuthology/task/buildpackages/centos-7.3-user-data.txt b/teuthology/task/buildpackages/centos-7.3-user-data.txt new file mode 120000 index 000000000..2eb0e3c88 --- /dev/null +++ b/teuthology/task/buildpackages/centos-7.3-user-data.txt @@ -0,0 +1 @@ +user-data.txt \ No newline at end of file diff --git a/teuthology/task/buildpackages/common.sh b/teuthology/task/buildpackages/common.sh new file mode 100644 index 000000000..4bc18adc3 --- /dev/null +++ b/teuthology/task/buildpackages/common.sh @@ -0,0 +1,169 @@ +#!/bin/bash +# +# Copyright (C) 2015 Red Hat +# +# Author: Loic Dachary 
+# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# +function install_deps() { + if [ ! -f install-deps.sh ]; then + git archive --remote=git://git.ceph.com/ceph.git main install-deps.sh | tar -xvf - + fi + # + # drop the following hack when trusty is not supported anymore + # there is no other way as long as we maintain a debian directory that tries + # to be the same for all distributions + # + if grep --quiet 14.04 /etc/issue 2>/dev/null && sudo apt-get install --force-yes -qq -y dpkg-dev && test "$(dpkg-architecture -qDEB_BUILD_GNU_CPU 2>/dev/null)" = aarch64 ; then + sed -i -e '/libgoogle-perftools-dev/d' debian/control + fi + bash -x install-deps.sh +} + +function git_submodules() { + # see http://tracker.ceph.com/issues/13426 + perl -pi -e 's|git://ceph.com/git/ceph-object-corpus.git|https://github.com/ceph/ceph-object-corpus.git|' .gitmodules + local force=$(if git submodule usage 2>&1 | grep --quiet 'update.*--force'; then echo --force ; fi) + git submodule sync || return 1 + git submodule update $force --init --recursive || return 1 +} + +function get_ceph() { + local git_ceph_url=$1 + local sha1=$2 + + test -d ceph || git clone ${git_ceph_url} ceph + cd ceph + if test -d src ; then # so we don't try to fetch when using a fixture + if test "x$BUILDPACKAGES_CANONICAL_TAGS" != "xfalse" ; then + echo "Fetching canonical tags from http://github.com/ceph/ceph (to disable, " \ + "set BUILDPACKAGES_CANONICAL_TAGS=false in the environment)" + git fetch --tags http://github.com/ceph/ceph + fi + fi + git fetch --tags ${git_ceph_url} + 
git checkout ${sha1} +} + +function init_ceph() { + local git_ceph_url=$1 + local sha1=$2 + get_ceph $git_ceph_url $sha1 || return 1 + git_submodules || return 1 + install_deps || return 1 +} + +function flavor2configure() { + local flavor=$1 + + eval $(dpkg-architecture) + + if test $flavor = notcmalloc || test "$DEB_HOST_GNU_CPU" = aarch64 ; then + echo --without-tcmalloc --without-cryptopp + fi +} + +# +# for a given $sha1 in the $ceph_dir repository, lookup all references +# from the remote origin and tags matching the sha1. Add a symbolic +# link in $ref_dir to the $sha1 for each reference found. If the +# reference is a tag, also add a symbolic link to the commit to which +# the tag points, if it is an annotated tag. +# +function link_same() { + local ref_dir=$1 + local ceph_dir=$2 + local sha1=$3 + + mkdir -p $ref_dir + ( + cd ${ceph_dir} + git for-each-ref refs/tags/** refs/remotes/origin/** | grep $sha1 | \ + while read sha1 type ref ; do + if test $type = 'tag' ; then + commit_sha1=$(git rev-parse $ref^{commit}) + if test $commit_sha1 != $sha1 ; then + echo ../sha1/$sha1 ../sha1/$commit_sha1 + fi + fi + echo ../sha1/$sha1 $(basename $ref) + done + ) | while read from to ; do + ( cd $ref_dir ; ln -sf $from $to ) + done +} + +function test_link_same() { + local d=/tmp/link_same$$ + mkdir -p $d/primary + cd $d/primary + git init + touch a ; git add a ; git commit -m 'm' a + git tag tag1 + tag1=$(git rev-parse HEAD) + git branch branch1 + touch b ; git add b ; git commit -m 'm' b + git tag --annotate -m 'a' tag2 + tag2=$(git rev-parse tag2) + sha1_tag2=$(git rev-parse tag2^{commit}) + git branch branch2 + touch c ; git add c ; git commit -m 'm' c + git branch branch3 + sha1_branch3=$(git rev-parse branch3) + + git clone $d/primary $d/secondary + cd $d/secondary + mkdir $d/ref $d/sha1 + + touch $d/sha1/$sha1_branch3 + link_same $d/ref $d/secondary $sha1_branch3 + test $(readlink --canonicalize $d/ref/branch3) = $d/sha1/$sha1_branch3 || return 1 + test 
$(readlink --canonicalize $d/ref/main) = $d/sha1/$sha1_branch3 || return 1 + + touch $d/sha1/$tag2 + link_same $d/ref $d/secondary $tag2 + test $(readlink --canonicalize $d/ref/tag2) = $d/sha1/$tag2 || return 1 + test $(readlink --canonicalize $d/sha1/$sha1_tag2) = $d/sha1/$tag2 || return 1 + + touch $d/sha1/$tag1 + link_same $d/ref $d/secondary $tag1 + test $(readlink --canonicalize $d/ref/tag1) = $d/sha1/$tag1 || return 1 + test $(readlink --canonicalize $d/ref/branch1) = $d/sha1/$tag1 || return 1 + + rm -fr $d +} + +function maybe_parallel() { + local nproc=$1 + local vers=$2 + + if echo $vers | grep --quiet '0\.67' ; then + return + fi + + if test $nproc -gt 1 ; then + echo -j${nproc} + fi +} + +function test_maybe_parallel() { + test "$(maybe_parallel 1 0.72)" = "" || return 1 + test "$(maybe_parallel 8 0.67)" = "" || return 1 + test "$(maybe_parallel 8 0.72)" = "-j8" || return 1 +} + +if test "$1" = "TEST" ; then + shopt -s -o xtrace + PS4='${BASH_SOURCE[0]}:$LINENO: ${FUNCNAME[0]}: ' + test_link_same + test_maybe_parallel +fi diff --git a/teuthology/task/buildpackages/debian-8.0-user-data.txt b/teuthology/task/buildpackages/debian-8.0-user-data.txt new file mode 100644 index 000000000..13aba9876 --- /dev/null +++ b/teuthology/task/buildpackages/debian-8.0-user-data.txt @@ -0,0 +1,12 @@ +#cloud-config +bootcmd: + - echo 'APT::Get::AllowUnauthenticated "true";' | tee /etc/apt/apt.conf.d/99disablesigs + - echo nameserver 8.8.8.8 | tee -a /etc/resolv.conf # last resort, in case the DHCP server does not provide a resolver +manage_etc_hosts: true +preserve_hostname: true +system_info: + default_user: + name: ubuntu +runcmd: + - echo 'ubuntu ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers +final_message: "READYTORUN" diff --git a/teuthology/task/buildpackages/make-deb.sh b/teuthology/task/buildpackages/make-deb.sh new file mode 100755 index 000000000..fb7f4176d --- /dev/null +++ b/teuthology/task/buildpackages/make-deb.sh @@ -0,0 +1,160 @@ +#!/bin/bash +# +# Copyright (C) 
2015 Red Hat +# +# Author: Loic Dachary +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# + +# +# Create and upload a deb repository with the same naming conventions +# as https://github.com/ceph/autobuild-ceph/blob/main/build-ceph-deb.sh +# +set -xe + +base=/tmp/release +gitbuilder_host=$1 +codename=$2 +git_ceph_url=$3 +sha1=$4 +flavor=$5 +arch=$6 +canonical_tags=$7 + +sudo apt-get update +sudo apt-get install -y git + +export BUILDPACKAGES_CANONICAL_TAGS=$canonical_tags +source $(dirname $0)/common.sh + +init_ceph $git_ceph_url $sha1 + +#codename=$(lsb_release -sc) +releasedir=$base/$(lsb_release -si)/WORKDIR +# +# git describe provides a version that is +# a) human readable +# b) is unique for each commit +# c) compares higher than any previous commit +# d) contains the short hash of the commit +# +vers=$(git describe --match "v*" | sed s/^v//) +# +# always set the debian version to 1 which is ok because the debian +# directory is included in the sources and the upstream version will +# change each time it is modified. +# +dvers="$vers-1" +: ${NPROC:=$(nproc)} +ceph_dir=$(pwd) + +function build_package() { + + rm -fr $releasedir + mkdir -p $releasedir + # + # remove all files not under git so they are not + # included in the distribution. + # + git clean -qdxff + + fileext="gz" + # autotools only works in jewel and below + if [[ ! -e "make-dist" ]] ; then + # + # creating the distribution tarbal requires some configure + # options (otherwise parts of the source tree will be left out). 
+ # + ./autogen.sh + # Building with LTTNG on Ubuntu Precise is not possible. + # It fails the LTTNG-is-sane check (it misses headers) + # And the Debian rules files leave it out anyway + case $codename in + precise) lttng_opt="--without-lttng" ;; + *) lttng_opt="--with-lttng" ;; + esac + ./configure $(flavor2configure $flavor) \ + --with-rocksdb --with-ocf \ + --with-nss --with-debug --enable-cephfs-java \ + $lttng_opt --with-babeltrace + # + # use distdir= to set the name of the top level directory of the + # tarbal to match the desired version + # + make distdir=ceph-$vers dist + else + ./make-dist + fileext="bz2" + fi + # + # rename the tarbal to match debian conventions and extract it + # + mv ceph-$vers.tar.$fileext $releasedir/ceph_$vers.orig.tar.$fileext + tar -C $releasedir -xf $releasedir/ceph_$vers.orig.tar.$fileext + # + # copy the debian directory over + # + cp -a debian $releasedir/ceph-$vers/debian + cd $releasedir + # + # uncomment to remove -dbg packages + # because they are large and take time to build + # + #perl -ni -e 'print if(!(/^Package: .*-dbg$/../^$/))' ceph-$vers/debian/control + #perl -pi -e 's/--dbg-package.*//' ceph-$vers/debian/rules + # + # update the changelog to match the desired version + # + cd ceph-$vers + local chvers=$(head -1 debian/changelog | perl -ne 's/.*\(//; s/\).*//; print') + if [ "$chvers" != "$dvers" ]; then + DEBEMAIL="contact@ceph.com" dch -D $codename --force-distribution -b -v "$dvers" "new version" + fi + # + # create the packages (with ccache) + # + export CEPH_EXTRA_CONFIGURE_ARGS=$(flavor2configure $flavor) + j=$(maybe_parallel $NPROC $vers) + PATH=/usr/lib/ccache:$PATH dpkg-buildpackage $j -uc -us -sa +} + +function build_repo() { + local gitbuilder_host=$1 + + sudo apt-get install -y reprepro + cd ${releasedir}/.. 
+ # + # Create a repository in a directory with a name structured + # as + # + base=ceph-deb-$codename-$arch-$flavor + sha1_dir=$codename/$base/sha1/$sha1 + mkdir -p $sha1_dir/conf + cat > $sha1_dir/conf/distributions < $sha1_dir/version + echo $sha1 > $sha1_dir/sha1 + link_same $codename/$base/ref $ceph_dir $sha1 + if test "$gitbuilder_host" ; then + cd $codename + sudo apt-get install -y rsync + RSYNC_RSH='ssh -o StrictHostKeyChecking=false' rsync -av $base/ $gitbuilder_host:/usr/share/nginx/html/$base/ + fi +} + +build_package +build_repo $gitbuilder_host diff --git a/teuthology/task/buildpackages/make-rpm.sh b/teuthology/task/buildpackages/make-rpm.sh new file mode 100755 index 000000000..11cac7000 --- /dev/null +++ b/teuthology/task/buildpackages/make-rpm.sh @@ -0,0 +1,294 @@ +#!/bin/bash +# +# Copyright (C) 2015 Red Hat +# +# Author: Loic Dachary +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library Public License for more details. +# + +# +# Create and upload a RPM repository with the same naming conventions +# as https://github.com/ceph/autobuild-ceph/blob/main/build-ceph-rpm.sh +# + +set -xe + +base=/tmp/release +gitbuilder_host=$1 +codename=$2 +git_ceph_url=$3 +sha1=$4 +flavor=$5 +arch=$6 +canonical_tags=$7 + +suse=false +[[ $codename =~ suse ]] && suse=true +[[ $codename =~ sle ]] && suse=true + +CREATEREPO=createrepo +if [ "$suse" = true ] ; then + source /etc/os-release + majorvers=$(echo $VERSION_ID | cut -d \. 
-f 1-1) + test $majorvers -ge 15 && CREATEREPO=createrepo_c + for delay in 60 60 60 60 ; do + sudo zypper --non-interactive --no-gpg-checks refresh && break + sleep $delay + done + sudo zypper --non-interactive install --no-recommends git $CREATEREPO +else + sudo yum install -y git $CREATEREPO +fi + +export BUILDPACKAGES_CANONICAL_TAGS=$canonical_tags +source $(dirname $0)/common.sh + +init_ceph $git_ceph_url $sha1 + +distro=$( source /etc/os-release ; echo $ID ) +distro_version=$( source /etc/os-release ; echo $VERSION ) +releasedir=$base/$distro/WORKDIR +# +# git describe provides a version that is +# a) human readable +# b) is unique for each commit +# c) compares higher than any previous commit +# WAIT, c) DOES NOT HOLD: +# >>> print 'v10.2.5-7-g000000' < 'v10.2.5-8-g000000' +# True +# >>> print 'v10.2.5-9-g000000' < 'v10.2.5-10-g000000' +# False +# d) contains the short hash of the commit +# +# Regardless, we use it for the RPM version number, but strip the leading 'v' +# and replace the '-' before the 'g000000' with a '.' to match the output of +# "rpm -q $PKG --qf %{VERSION}-%{RELEASE}" +# +vers=$(git describe --match "v*" | sed -r -e 's/^v//' -e 's/\-([[:digit:]]+)\-g/\-\1\.g/') +ceph_dir=$(pwd) + +# +# Create a repository in a directory with a name structured +# as +# +base=ceph-rpm-$codename-$arch-$flavor + +function setup_rpmmacros() { + if ! grep -q find_debuginfo_dwz_opts $HOME/.rpmmacros ; then + echo '%_find_debuginfo_dwz_opts %{nil}' >> $HOME/.rpmmacros + fi + if [ "x${distro}x" = "xcentosx" ] && echo $distro_version | grep -q '7' ; then + if ! grep -q '%dist .el7' $HOME/.rpmmacros ; then + echo '%dist .el7' >> $HOME/.rpmmacros + fi + fi +} + +function build_package() { + rm -fr $releasedir + mkdir -p $releasedir + # + # remove all files not under git so they are not + # included in the distribution. + # + git clean -qdxff + # autotools only works in jewel and below + if [[ ! 
-e "make-dist" ]] ; then + # lsb-release is required by install-deps.sh + # which is required by autogen.sh + if [ "$suse" = true ] ; then + sudo zypper -n install bzip2 lsb-release which + else + sudo yum install -y bzip2 redhat-lsb-core which + fi + ./autogen.sh + # + # creating the distribution tarball requires some configure + # options (otherwise parts of the source tree will be left out). + # + ./configure $(flavor2configure $flavor) --with-debug --with-radosgw --with-fuse --with-libatomic-ops --with-gtk2 --with-nss + + # + # use distdir= to set the name of the top level directory of the + # tarbal to match the desired version + # + make dist-bzip2 + else + # kraken and above + ./make-dist + fi + # Set up build area + setup_rpmmacros + if [ "$suse" = true ] ; then + sudo zypper -n install rpm-build + else + sudo yum install -y rpm-build + fi + local buildarea=$releasedir + mkdir -p ${buildarea}/SOURCES + mkdir -p ${buildarea}/SRPMS + mkdir -p ${buildarea}/SPECS + cp ceph.spec ${buildarea}/SPECS + mkdir -p ${buildarea}/RPMS + mkdir -p ${buildarea}/BUILD + CEPH_TARBALL=( ceph-*.tar.bz2 ) + CEPH_TARBALL_BASE=$(echo $CEPH_TARBALL | sed -e 's/.tar.bz2$//') + CEPH_VERSION=$(echo $CEPH_TARBALL_BASE | cut -d - -f 2-2) + CEPH_RELEASE=$(echo $CEPH_TARBALL_BASE | cut -d - -f 3- | tr - .) + cp -a $CEPH_TARBALL ${buildarea}/SOURCES/. 
+ cp -a rpm/*.patch ${buildarea}/SOURCES || true + ( + cd ${buildarea}/SPECS + ccache=$(echo /usr/lib*/ccache) + if [ "$suse" = true ]; then + sed -i \ + -e '0,/%package/s//%debug_package\n\n&/' \ + -e 's/%bcond_with ceph_test_package/%bcond_without ceph_test_package/g' \ + -e "s/^Version:.*/Version: $CEPH_VERSION/g" \ + -e "s/^Release:.*/Release: $CEPH_RELEASE/g" \ + -e "s/^Source0:.*/Source0: $CEPH_TARBALL/g" \ + -e '/^Source9/d' \ + -e "s/^%autosetup -p1.*/%autosetup -p1 -n $CEPH_TARBALL_BASE/g" \ + ceph.spec + fi + cat ceph.spec + buildarea=`readlink -fn ${releasedir}` ### rpm wants absolute path + PATH=$ccache:$PATH rpmbuild -ba --nosignature \ + --define '_srcdefattr (-,root,root)' \ + --define "_unpackaged_files_terminate_build 0" \ + --define "_topdir ${buildarea}" \ + ceph.spec + ) +} + +function build_rpm_release() { + local buildarea=$1 + local sha1=$2 + local gitbuilder_host=$3 + local base=$4 + + cat < ${buildarea}/SPECS/ceph-release.spec +Name: ceph-release +Version: 1 +Release: 0%{?dist} +Summary: Ceph repository configuration +Group: System Environment/Base +License: GPLv2 +URL: http://gitbuilder.ceph.com/$dist +Source0: ceph.repo +#Source0: RPM-GPG-KEY-CEPH +#Source1: ceph.repo +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) +BuildArch: noarch + +%description +This package contains the Ceph repository GPG key as well as configuration +for yum and up2date. + +%prep + +%setup -q -c -T +install -pm 644 %{SOURCE0} . +#install -pm 644 %{SOURCE1} . 
+ +%build + +%install +rm -rf %{buildroot} +#install -Dpm 644 %{SOURCE0} \ +# %{buildroot}/%{_sysconfdir}/pki/rpm-gpg/RPM-GPG-KEY-CEPH +install -dm 755 %{buildroot}/%{_sysconfdir}/yum.repos.d +install -pm 644 %{SOURCE0} \ + %{buildroot}/%{_sysconfdir}/yum.repos.d + +%clean +#rm -rf %{buildroot} + +%post + +%postun + +%files +%defattr(-,root,root,-) +#%doc GPL +/etc/yum.repos.d/* +#/etc/pki/rpm-gpg/* + +%changelog +* Tue Mar 12 2013 Gary Lowell - 1-0 +- Handle both yum and zypper +- Use URL to ceph git repo for key +- remove config attribute from repo file +* Tue Aug 28 2012 Gary Lowell - 1-0 +- Initial Package +EOF + + cat < $buildarea/SOURCES/ceph.repo +[Ceph] +name=Ceph packages for \$basearch +baseurl=http://${gitbuilder_host}/${base}/sha1/${sha1}/\$basearch +enabled=1 +gpgcheck=0 +type=rpm-md + +[Ceph-noarch] +name=Ceph noarch packages +baseurl=http://${gitbuilder_host}/${base}/sha1/${sha1}/noarch +enabled=1 +gpgcheck=0 +type=rpm-md + +[ceph-source] +name=Ceph source packages +baseurl=http://${gitbuilder_host}/${base}/sha1/${sha1}/SRPMS +enabled=1 +gpgcheck=0 +type=rpm-md +EOF + + rpmbuild -bb --define "_topdir ${buildarea}" ${buildarea}/SPECS/ceph-release.spec +} + +function build_rpm_repo() { + local buildarea=$1 + local gitbuilder_host=$2 + local base=$3 + + for dir in ${buildarea}/SRPMS ${buildarea}/RPMS/* + do + $CREATEREPO ${dir} + done + + local sha1_dir=${buildarea}/../$codename/$base/sha1/$sha1 + mkdir -p $sha1_dir + echo $vers > $sha1_dir/version + echo $sha1 > $sha1_dir/sha1 + echo ceph > $sha1_dir/name + + for dir in ${buildarea}/SRPMS ${buildarea}/RPMS/* + do + cp -fla ${dir} $sha1_dir + done + + link_same ${buildarea}/../$codename/$base/ref $ceph_dir $sha1 + if test "$gitbuilder_host" ; then + ( + cd ${buildarea}/../$codename + RSYNC_RSH='ssh -o StrictHostKeyChecking=false' rsync -av $base/ ubuntu@$gitbuilder_host:/usr/share/nginx/html/$base/ + ) + fi +} + +setup_rpmmacros +build_package +build_rpm_release $releasedir $sha1 $gitbuilder_host $base 
+build_rpm_repo $releasedir $gitbuilder_host $base diff --git a/teuthology/task/buildpackages/opensuse-15.0-user-data.txt b/teuthology/task/buildpackages/opensuse-15.0-user-data.txt new file mode 100644 index 000000000..8b9e2244c --- /dev/null +++ b/teuthology/task/buildpackages/opensuse-15.0-user-data.txt @@ -0,0 +1,16 @@ +#cloud-config +bootcmd: + - echo nameserver 8.8.8.8 | tee -a /etc/resolv.conf # last resort, in case the DHCP server does not provide a resolver +manage_etc_hosts: true +preserve_hostname: true +users: + - name: ubuntu + gecos: User + sudo: ["ALL=(ALL) NOPASSWD:ALL"] + groups: users +runcmd: + - ( MYHOME=/home/ubuntu ; mkdir $MYHOME/.ssh ; chmod 700 $MYHOME/.ssh ; cp /root/.ssh/authorized_keys $MYHOME/.ssh ; chown -R ubuntu.users $MYHOME/.ssh ) + - zypper --non-interactive --no-gpg-checks rm gettext-runtime-mini grub2 grub2-branding-openSUSE grub2-i386-pc grub2-snapper-plugin grub2-systemd-sleep-plugin + - zypper --non-interactive --no-gpg-checks install --no-recommends wget git-core rsyslog lsb-release make gcc gcc-c++ grub2 rpm-build + - sleep 30 +final_message: "READYTORUN" diff --git a/teuthology/task/buildpackages/opensuse-42.1-user-data.txt b/teuthology/task/buildpackages/opensuse-42.1-user-data.txt new file mode 100644 index 000000000..190cac2b1 --- /dev/null +++ b/teuthology/task/buildpackages/opensuse-42.1-user-data.txt @@ -0,0 +1,13 @@ +#cloud-config +bootcmd: + - echo nameserver 8.8.8.8 | tee -a /etc/resolv.conf # last resort, in case the DHCP server does not provide a resolver +manage_etc_hosts: true +preserve_hostname: true +users: + - name: ubuntu + gecos: User + sudo: ["ALL=(ALL) NOPASSWD:ALL"] + groups: users +runcmd: + - ( MYHOME=/home/ubuntu ; mkdir $MYHOME/.ssh ; chmod 700 $MYHOME/.ssh ; cp /root/.ssh/authorized_keys $MYHOME/.ssh ; chown -R ubuntu.users $MYHOME/.ssh ) +final_message: "READYTORUN" diff --git a/teuthology/task/buildpackages/opensuse-42.2-user-data.txt b/teuthology/task/buildpackages/opensuse-42.2-user-data.txt 
new file mode 100644 index 000000000..fd35c9db0 --- /dev/null +++ b/teuthology/task/buildpackages/opensuse-42.2-user-data.txt @@ -0,0 +1,14 @@ +#cloud-config +bootcmd: + - echo nameserver 8.8.8.8 | tee -a /etc/resolv.conf # last resort, in case the DHCP server does not provide a resolver +manage_etc_hosts: true +preserve_hostname: true +users: + - name: ubuntu + gecos: User + sudo: ["ALL=(ALL) NOPASSWD:ALL"] + groups: users +runcmd: + - ( MYHOME=/home/ubuntu ; mkdir $MYHOME/.ssh ; chmod 700 $MYHOME/.ssh ; cp /root/.ssh/authorized_keys $MYHOME/.ssh ; chown -R ubuntu.users $MYHOME/.ssh ) + - 'zypper rr openSUSE-Leap-Cloud-Tools || :' +final_message: "READYTORUN" diff --git a/teuthology/task/buildpackages/opensuse-42.3-user-data.txt b/teuthology/task/buildpackages/opensuse-42.3-user-data.txt new file mode 120000 index 000000000..1aa71c406 --- /dev/null +++ b/teuthology/task/buildpackages/opensuse-42.3-user-data.txt @@ -0,0 +1 @@ +opensuse-42.2-user-data.txt \ No newline at end of file diff --git a/teuthology/task/buildpackages/sle-12.1-user-data.txt b/teuthology/task/buildpackages/sle-12.1-user-data.txt new file mode 100644 index 000000000..b3edb878a --- /dev/null +++ b/teuthology/task/buildpackages/sle-12.1-user-data.txt @@ -0,0 +1,14 @@ +#cloud-config +bootcmd: + - echo nameserver 8.8.8.8 | tee -a /etc/resolv.conf # last resort, in case the DHCP server does not provide a resolver +manage_etc_hosts: true +preserve_hostname: true +users: + - name: ubuntu + gecos: User + sudo: ["ALL=(ALL) NOPASSWD:ALL"] + groups: users +runcmd: + - ( MYHOME=/home/ubuntu ; mkdir $MYHOME/.ssh ; chmod 700 $MYHOME/.ssh ; cp /root/.ssh/authorized_keys $MYHOME/.ssh ; chown -R ubuntu.users $MYHOME/.ssh ) + - zypper --non-interactive install --no-recommends python wget git ntp rsyslog lsb-release +final_message: "READYTORUN" diff --git a/teuthology/task/buildpackages/sle-12.2-user-data.txt b/teuthology/task/buildpackages/sle-12.2-user-data.txt new file mode 120000 index 000000000..d3697ebdf 
--- /dev/null +++ b/teuthology/task/buildpackages/sle-12.2-user-data.txt @@ -0,0 +1 @@ +sle-12.1-user-data.txt \ No newline at end of file diff --git a/teuthology/task/buildpackages/sle-12.3-user-data.txt b/teuthology/task/buildpackages/sle-12.3-user-data.txt new file mode 120000 index 000000000..d3697ebdf --- /dev/null +++ b/teuthology/task/buildpackages/sle-12.3-user-data.txt @@ -0,0 +1 @@ +sle-12.1-user-data.txt \ No newline at end of file diff --git a/teuthology/task/buildpackages/sle-15.0-user-data.txt b/teuthology/task/buildpackages/sle-15.0-user-data.txt new file mode 100644 index 000000000..b837125c8 --- /dev/null +++ b/teuthology/task/buildpackages/sle-15.0-user-data.txt @@ -0,0 +1,14 @@ +#cloud-config +bootcmd: + - echo nameserver 8.8.8.8 | tee -a /etc/resolv.conf # last resort, in case the DHCP server does not provide a resolver +manage_etc_hosts: true +preserve_hostname: true +users: + - name: ubuntu + gecos: User + sudo: ["ALL=(ALL) NOPASSWD:ALL"] + groups: users +runcmd: + - ( MYHOME=/home/ubuntu ; mkdir $MYHOME/.ssh ; chmod 700 $MYHOME/.ssh ; cp /root/.ssh/authorized_keys $MYHOME/.ssh ; chown -R ubuntu.users $MYHOME/.ssh ) + - zypper --non-interactive --no-gpg-checks install --no-recommends wget git-core rsyslog lsb-release +final_message: "READYTORUN" diff --git a/teuthology/task/buildpackages/ubuntu-12.04-user-data.txt b/teuthology/task/buildpackages/ubuntu-12.04-user-data.txt new file mode 120000 index 000000000..2eb0e3c88 --- /dev/null +++ b/teuthology/task/buildpackages/ubuntu-12.04-user-data.txt @@ -0,0 +1 @@ +user-data.txt \ No newline at end of file diff --git a/teuthology/task/buildpackages/ubuntu-14.04-user-data.txt b/teuthology/task/buildpackages/ubuntu-14.04-user-data.txt new file mode 120000 index 000000000..2eb0e3c88 --- /dev/null +++ b/teuthology/task/buildpackages/ubuntu-14.04-user-data.txt @@ -0,0 +1 @@ +user-data.txt \ No newline at end of file diff --git a/teuthology/task/buildpackages/ubuntu-16.04-user-data.txt 
b/teuthology/task/buildpackages/ubuntu-16.04-user-data.txt new file mode 120000 index 000000000..2eb0e3c88 --- /dev/null +++ b/teuthology/task/buildpackages/ubuntu-16.04-user-data.txt @@ -0,0 +1 @@ +user-data.txt \ No newline at end of file diff --git a/teuthology/task/buildpackages/user-data.txt b/teuthology/task/buildpackages/user-data.txt new file mode 100644 index 000000000..d5016929d --- /dev/null +++ b/teuthology/task/buildpackages/user-data.txt @@ -0,0 +1,10 @@ +#cloud-config +bootcmd: + - echo 'APT::Get::AllowUnauthenticated "true";' | tee /etc/apt/apt.conf.d/99disablesigs + - echo nameserver 8.8.8.8 | tee -a /etc/resolv.conf # last resort, in case the DHCP server does not provide a resolver +manage_etc_hosts: true +preserve_hostname: true +system_info: + default_user: + name: ubuntu +final_message: "READYTORUN" diff --git a/teuthology/task/ceph_ansible.py b/teuthology/task/ceph_ansible.py new file mode 100644 index 000000000..0e7d483c3 --- /dev/null +++ b/teuthology/task/ceph_ansible.py @@ -0,0 +1,500 @@ +import json +import os +import re +import logging +import yaml + +from teuthology.task import Task +from tempfile import NamedTemporaryFile +from teuthology.config import config as teuth_config +from teuthology.misc import get_scratch_devices, get_file +from teuthology import contextutil +from teuthology.orchestra import run +from teuthology import misc +log = logging.getLogger(__name__) + + +class CephAnsible(Task): + name = 'ceph_ansible' + + __doc__ = """ + A task to setup ceph cluster using ceph-ansible + + - ceph-ansible: + repo: {git_base}ceph-ansible.git + branch: mybranch # defaults to main + ansible-version: 2.4 # defaults to 2.5 + vars: + ceph_dev: True ( default) + ceph_conf_overrides: + global: + mon pg warn min per osd: 2 + + It always uses a dynamic inventory. 
+ + It will optionally do the following automatically based on ``vars`` that + are passed in: + * Set ``devices`` for each host if ``osd_auto_discovery`` is not True + * Set ``monitor_interface`` for each host if ``monitor_interface`` is + unset + * Set ``public_network`` for each host if ``public_network`` is unset + + The machine that ceph-ansible runs on can be specified using the + installer.0 role. If installer.0 is not used, the first mon will be the + machine on which ceph-ansible runs. + """.format(git_base=teuth_config.ceph_git_base_url) + + groups_to_roles = dict( + mons='mon', + mgrs='mgr', + mdss='mds', + osds='osd', + rgws='rgw', + clients='client', + nfss='nfs', + ) + + def __init__(self, ctx, config): + super(CephAnsible, self).__init__(ctx, config) + config = self.config or dict() + self.playbook = None + if 'playbook' in config: + self.playbook = self.config['playbook'] + if 'repo' not in config: + self.config['repo'] = os.path.join(teuth_config.ceph_git_base_url, + 'ceph-ansible.git') + # default vars to dev builds + if 'vars' not in config: + vars = dict() + config['vars'] = vars + vars = config['vars'] + if 'ceph_dev' not in vars: + vars['ceph_dev'] = True + if 'ceph_dev_key' not in vars: + vars['ceph_dev_key'] = 'https://download.ceph.com/keys/autobuild.asc' + if 'ceph_dev_branch' not in vars: + vars['ceph_dev_branch'] = ctx.config.get('branch', 'main') + self.cluster_name = vars.get('cluster', 'ceph') + + def setup(self): + super(CephAnsible, self).setup() + # generate hosts file based on test config + self.generate_hosts_file() + # generate playbook file if it exists in config + self.playbook_file = None + if self.playbook is not None: + playbook_file = NamedTemporaryFile( + prefix="ceph_ansible_playbook_", dir='/tmp/', + delete=False, + ) + yaml.safe_dump(self.playbook, playbook_file, explicit_start=True) + playbook_file.flush() + self.playbook_file = playbook_file.name + # everything from vars in config go into group_vars/all file + 
extra_vars = dict() + extra_vars.update(self.config.get('vars', dict())) + gvar = yaml.dump(extra_vars, default_flow_style=False) + self.extra_vars_file = self._write_hosts_file(prefix='teuth_ansible_gvar', + content=gvar) + + def execute_playbook(self): + """ + Execute ansible-playbook + + :param _logfile: Use this file-like object instead of a LoggerFile for + testing + """ + + args = [ + 'ANSIBLE_STDOUT_CALLBACK=debug', + 'ansible-playbook', '-vv', + '-i', 'inven.yml', 'site.yml' + ] + log.debug("Running %s", args) + # If there is an installer.0 node, use that for the installer. + # Otherwise, use the first mon node as installer node. + ansible_loc = self.ctx.cluster.only('installer.0') + (ceph_first_mon,) = self.ctx.cluster.only( + misc.get_first_mon(self.ctx, + self.config)).remotes.keys() + if ansible_loc.remotes: + (ceph_installer,) = ansible_loc.remotes.keys() + else: + ceph_installer = ceph_first_mon + self.ceph_first_mon = ceph_first_mon + self.ceph_installer = ceph_installer + self.args = args + if self.config.get('rhbuild'): + self.run_rh_playbook() + else: + self.run_playbook() + + def generate_hosts_file(self): + hosts_dict = dict() + for group in sorted(self.groups_to_roles.keys()): + role_prefix = self.groups_to_roles[group] + want = lambda role: role.startswith(role_prefix) + for (remote, roles) in self.cluster.only(want).remotes.items(): + hostname = remote.hostname + host_vars = self.get_host_vars(remote) + if group not in hosts_dict: + hosts_dict[group] = {hostname: host_vars} + elif hostname not in hosts_dict[group]: + hosts_dict[group][hostname] = host_vars + + hosts_content = '' + for group in sorted(hosts_dict.keys()): + hosts_content += '[%s]\n' % group + for hostname in sorted(hosts_dict[group].keys()): + vars = hosts_dict[group][hostname] + if vars: + vars_list = [] + for key in sorted(vars.keys()): + vars_list.append( + "%s='%s'" % (key, json.dumps(vars[key]).strip('"')) + ) + host_line = "{hostname} {vars}".format( + hostname=hostname, 
+ vars=' '.join(vars_list), + ) + else: + host_line = hostname + hosts_content += '%s\n' % host_line + hosts_content += '\n' + self.inventory = self._write_hosts_file(prefix='teuth_ansible_hosts_', + content=hosts_content.strip()) + self.generated_inventory = True + + def begin(self): + super(CephAnsible, self).begin() + self.execute_playbook() + + def _write_hosts_file(self, prefix, content): + """ + Actually write the hosts file + """ + hosts_file = NamedTemporaryFile(prefix=prefix, mode='w+', + delete=False) + hosts_file.write(content) + hosts_file.flush() + return hosts_file.name + + def teardown(self): + log.info("Cleaning up temporary files") + os.remove(self.inventory) + if self.playbook is not None: + os.remove(self.playbook_file) + os.remove(self.extra_vars_file) + # collect logs + self.collect_logs() + # run purge-cluster that teardowns the cluster + args = [ + 'ANSIBLE_STDOUT_CALLBACK=debug', + 'ansible-playbook', '-vv', + '-e', 'ireallymeanit=yes', + '-i', 'inven.yml', 'purge-cluster.yml' + ] + log.debug("Running %s", args) + str_args = ' '.join(args) + installer_node = self.ceph_installer + # copy purge-cluster playbook from infra dir to top level dir + # as required by ceph-ansible + installer_node.run( + args=[ + 'cp', + run.Raw('~/ceph-ansible/infrastructure-playbooks/purge-cluster.yml'), + run.Raw('~/ceph-ansible/'), + ] + ) + if self.config.get('rhbuild'): + installer_node.run( + args=[ + run.Raw('cd ~/ceph-ansible'), + run.Raw(';'), + run.Raw(str_args) + ] + ) + else: + installer_node.run( + args=[ + run.Raw('cd ~/ceph-ansible'), + run.Raw(';'), + run.Raw('source venv/bin/activate'), + run.Raw(';'), + run.Raw(str_args) + ] + ) + # cleanup the ansible ppa repository we added + # and also remove the dependency pkgs we installed + if installer_node.os.package_type == 'deb': + installer_node.run(args=[ + 'sudo', + 'add-apt-repository', + '--remove', + run.Raw('ppa:ansible/ansible'), + ]) + installer_node.run(args=[ + 'sudo', + 'apt-get', + 'update', 
+ ]) + installer_node.run(args=[ + 'sudo', + 'apt-get', + 'remove', + '-y', + 'ansible', + 'libssl-dev', + 'libffi-dev', + 'python-dev' + ]) + + def collect_logs(self): + ctx = self.ctx + if ctx.archive is not None and \ + not (ctx.config.get('archive-on-error') and ctx.summary['success']): + log.info('Archiving logs...') + path = os.path.join(ctx.archive, 'remote') + os.makedirs(path) + + def wanted(role): + # Only attempt to collect logs from hosts which are part of the + # cluster + return any(map( + lambda role_stub: role.startswith(role_stub), + self.groups_to_roles.values(), + )) + for remote in ctx.cluster.only(wanted).remotes.keys(): + sub = os.path.join(path, remote.shortname) + os.makedirs(sub) + misc.pull_directory(remote, '/var/log/ceph', + os.path.join(sub, 'log')) + + def wait_for_ceph_health(self): + with contextutil.safe_while(sleep=15, tries=6, + action='check health') as proceed: + (remote,) = self.ctx.cluster.only('mon.a').remotes + remote.run(args=[ + 'sudo', 'ceph', '--cluster', self.cluster_name, 'osd', 'tree' + ]) + remote.run(args=[ + 'sudo', 'ceph', '--cluster', self.cluster_name, '-s' + ]) + log.info("Waiting for Ceph health to reach HEALTH_OK \ + or HEALTH WARN") + while proceed(): + out = remote.sh('sudo ceph --cluster %s health' % self.cluster_name) + state = out.split(None, 1)[0] + log.info("cluster in state: %s", state) + if state in ('HEALTH_OK', 'HEALTH_WARN'): + break + + def get_host_vars(self, remote): + extra_vars = self.config.get('vars', dict()) + host_vars = dict() + if not extra_vars.get('osd_auto_discovery', False): + roles = self.ctx.cluster.remotes[remote] + dev_needed = len([role for role in roles + if role.startswith('osd')]) + if ( + teuth_config.get('ceph_ansible') and + hasattr(self.ctx, "machine_type") and + self.ctx.machine_type in teuth_config['ceph_ansible']['has_lvm_scratch_disks'] + ): + devices = get_file(remote, "/scratch_devs").decode().split() + vols = [] + + for dev in devices: + if 'vg_nvme' in dev: + 
splitpath = dev.split('/') + vol = dict() + vol['data_vg'] = splitpath[2] + vol['data'] = splitpath[3] + vols.append(vol) + extra_vars['lvm_volumes'] = vols + self.config.update({'vars': extra_vars}) + else: + host_vars['devices'] = get_scratch_devices(remote)[0:dev_needed] + if 'monitor_interface' not in extra_vars: + host_vars['monitor_interface'] = remote.interface + if 'radosgw_interface' not in extra_vars: + host_vars['radosgw_interface'] = remote.interface + if 'public_network' not in extra_vars: + host_vars['public_network'] = remote.cidr + return host_vars + + def run_rh_playbook(self): + ceph_installer = self.ceph_installer + args = self.args + ceph_installer.run(args=[ + 'cp', + '-R', + '/usr/share/ceph-ansible', + '.' + ]) + self._copy_and_print_config() + str_args = ' '.join(args) + out = ceph_installer.sh( + [ + 'cd', + 'ceph-ansible', + run.Raw(';'), + run.Raw(str_args) + ], + timeout=4200, + check_status=False, + ) + log.info(out) + if re.search(r'all hosts have already failed', out): + log.error("Failed during ceph-ansible execution") + raise CephAnsibleError("Failed during ceph-ansible execution") + self._create_rbd_pool() + + def run_playbook(self): + # setup ansible on first mon node + ceph_installer = self.ceph_installer + args = self.args + if ceph_installer.os.package_type == 'deb': + # update ansible from ppa + ceph_installer.run(args=[ + 'sudo', + 'add-apt-repository', + run.Raw('ppa:ansible/ansible'), + ]) + ceph_installer.run(args=[ + 'sudo', + 'apt-get', + 'update', + ]) + ceph_installer.run(args=[ + 'sudo', + 'apt-get', + 'install', + '-y', + 'ansible', + 'libssl-dev', + 'python-openssl', + 'libffi-dev', + 'python-dev' + ]) + ansible_repo = self.config['repo'] + branch = 'main' + if self.config.get('branch'): + branch = self.config.get('branch') + ansible_ver = 'ansible==2.5' + if self.config.get('ansible-version'): + ansible_ver = 'ansible==' + self.config.get('ansible-version') + ceph_installer.run( + args=[ + 'rm', + '-rf', + 
run.Raw('~/ceph-ansible'), + ], + check_status=False + ) + ceph_installer.run(args=[ + 'mkdir', + run.Raw('~/ceph-ansible'), + run.Raw(';'), + 'git', + 'clone', + run.Raw('-b %s' % branch), + run.Raw(ansible_repo), + ]) + self._copy_and_print_config() + str_args = ' '.join(args) + ceph_installer.run(args=[ + run.Raw('cd ~/ceph-ansible'), + run.Raw(';'), + 'virtualenv', + run.Raw('--python=python3'), + 'venv', + run.Raw(';'), + run.Raw('source venv/bin/activate'), + run.Raw(';'), + 'pip', + 'install', + '--upgrade', + 'pip', + run.Raw(';'), + 'pip', + 'install', + '--upgrade', + 'cryptography>=2.5', + run.Raw(';'), + 'pip', + 'install', + run.Raw('setuptools>=11.3'), + run.Raw('notario>=0.0.13'), # FIXME: use requirements.txt + run.Raw('netaddr'), + run.Raw('six'), + run.Raw(';'), + 'LANG=en_US.utf8', + 'pip', + 'install', + run.Raw(ansible_ver), + run.Raw(';'), + run.Raw(str_args) + ]) + wait_for_health = self.config.get('wait-for-health', True) + if wait_for_health: + self.wait_for_ceph_health() + # for the teuthology workunits to work we + # need to fix the permission on keyring to be readable by them + self._create_rbd_pool() + self.fix_keyring_permission() + + def _copy_and_print_config(self): + ceph_installer = self.ceph_installer + # copy the inventory file to installer node + ceph_installer.put_file(self.inventory, 'ceph-ansible/inven.yml') + # copy the config provided site file or use sample + if self.playbook_file is not None: + ceph_installer.put_file(self.playbook_file, 'ceph-ansible/site.yml') + else: + # use the site.yml.sample provided by the repo as the main site.yml file + ceph_installer.run( + args=[ + 'cp', + 'ceph-ansible/site.yml.sample', + 'ceph-ansible/site.yml' + ] + ) + # copy extra vars to groups/all + ceph_installer.put_file(self.extra_vars_file, 'ceph-ansible/group_vars/all') + # print for debug info + ceph_installer.run(args=['cat', 'ceph-ansible/inven.yml']) + ceph_installer.run(args=['cat', 'ceph-ansible/site.yml']) + 
ceph_installer.run(args=['cat', 'ceph-ansible/group_vars/all']) + + def _create_rbd_pool(self): + mon_node = self.ceph_first_mon + log.info('Creating RBD pool') + mon_node.run( + args=[ + 'sudo', 'ceph', '--cluster', self.cluster_name, + 'osd', 'pool', 'create', 'rbd', '128', '128'], + check_status=False) + mon_node.run( + args=[ + 'sudo', 'ceph', '--cluster', self.cluster_name, + 'osd', 'pool', 'application', 'enable', + 'rbd', 'rbd', '--yes-i-really-mean-it' + ], + check_status=False) + + def fix_keyring_permission(self): + clients_only = lambda role: role.startswith('client') + for client in self.cluster.only(clients_only).remotes.keys(): + client.run(args=[ + 'sudo', + 'chmod', + run.Raw('o+r'), + '/etc/ceph/%s.client.admin.keyring' % self.cluster_name + ]) + + +class CephAnsibleError(Exception): + pass + +task = CephAnsible diff --git a/teuthology/task/cephmetrics.py b/teuthology/task/cephmetrics.py new file mode 100644 index 000000000..813d266ad --- /dev/null +++ b/teuthology/task/cephmetrics.py @@ -0,0 +1,96 @@ +import logging +import os +import pexpect +import time + +from teuthology.config import config as teuth_config +from teuthology.exceptions import CommandFailedError +from teuthology.task.ansible import Ansible +from teuthology.util.loggerfile import LoggerFile + + +log = logging.getLogger(__name__) + + +class CephMetrics(Ansible): + def __init__(self, ctx, config): + super(CephMetrics, self).__init__(ctx, config) + if 'repo' not in self.config: + self.config['repo'] = os.path.join( + teuth_config.ceph_git_base_url, 'cephmetrics.git') + if 'playbook' not in self.config: + self.config['playbook'] = './ansible/playbook.yml' + + def get_inventory(self): + return False + + def generate_inventory(self): + groups_to_roles = { + 'mons': 'mon', + 'mgrs': 'mgr', + 'mdss': 'mds', + 'osds': 'osd', + 'rgws': 'rgw', + 'clients': 'client', + 'ceph-grafana': 'cephmetrics', + } + hosts_dict = dict() + for group in sorted(groups_to_roles.keys()): + role_prefix = 
groups_to_roles[group] + want = lambda role: role.startswith(role_prefix) + if group not in hosts_dict: + hosts_dict[group] = dict(hosts=dict()) + group_dict = hosts_dict[group]['hosts'] + for (remote, roles) in self.cluster.only(want).remotes.items(): + hostname = remote.hostname + group_dict[hostname] = dict( + ansible_user=remote.user, + ) + hosts_dict[group]['hosts'] = group_dict + # It might be preferable to use a YAML inventory file, but + # that won't work until an ansible release is out with: + # https://github.com/ansible/ansible/pull/30730 + # Once that is done, we can simply do this: + # hosts_str = yaml.safe_dump(hosts_dict, default_flow_style=False) + # And then pass suffix='.yml' to _write_hosts_file(). + hosts_lines = [] + for group in hosts_dict.keys(): + hosts_lines.append('[%s]' % group) + for host, vars_ in hosts_dict[group]['hosts'].items(): + host_line = ' '.join( + [host] + map( + lambda tuple_: '='.join(tuple_), + vars_.items(), + ) + ) + hosts_lines.append(host_line) + hosts_lines.append('') + hosts_str = '\n'.join(hosts_lines) + self.inventory = self._write_inventory_files(hosts_str) + self.generated_inventory = True + + def begin(self): + super(CephMetrics, self).begin() + wait_time = 5 * 60 + self.log.info( + "Waiting %ss for data collection before running tests...", + wait_time, + ) + time.sleep(wait_time) + self.run_tests() + + def run_tests(self): + self.log.info("Running tests...") + command = "tox -e integration %s" % self.inventory + out, status = pexpect.run( + command, + cwd=self.repo_path, + logfile=LoggerFile(self.log.getChild('tests'), logging.INFO), + withexitstatus=True, + timeout=None, + ) + if status != 0: + raise CommandFailedError(command, status) + + +task = CephMetrics diff --git a/teuthology/task/clock.py b/teuthology/task/clock.py new file mode 100644 index 000000000..982eb8e1b --- /dev/null +++ b/teuthology/task/clock.py @@ -0,0 +1,122 @@ +""" +Clock synchronizer +""" +import logging +import contextlib + +from 
teuthology.orchestra import run + +log = logging.getLogger(__name__) + +def filter_out_containers(cluster): + """ + Returns a cluster that excludes remotes which should skip this task. + Currently, only skips containerized remotes. + """ + return cluster.filter(lambda r: not r.is_container) + +@contextlib.contextmanager +def task(ctx, config): + """ + Sync or skew clock + + This will initially sync the clocks. Eventually it should let us also + skew by some number of seconds. + + example:: + + tasks: + - clock: + - ceph: + - interactive: + + to sync. + + :param ctx: Context + :param config: Configuration + """ + + log.info('Syncing clocks and checking initial clock skew...') + cluster = filter_out_containers(ctx.cluster) + run.wait( + cluster.run( + args = [ + 'sudo', 'systemctl', 'stop', 'ntp.service', run.Raw('||'), + 'sudo', 'systemctl', 'stop', 'ntpd.service', run.Raw('||'), + 'sudo', 'systemctl', 'stop', 'chronyd.service', + run.Raw(';'), + 'sudo', 'ntpd', '-gq', run.Raw('||'), + 'sudo', 'chronyc', 'makestep', + run.Raw(';'), + 'sudo', 'systemctl', 'start', 'ntp.service', run.Raw('||'), + 'sudo', 'systemctl', 'start', 'ntpd.service', run.Raw('||'), + 'sudo', 'systemctl', 'start', 'chronyd.service', + run.Raw(';'), + 'PATH=/usr/bin:/usr/sbin', 'ntpq', '-p', run.Raw('||'), + 'PATH=/usr/bin:/usr/sbin', 'chronyc', 'sources', + run.Raw('||'), + 'true' + ], + timeout = 360, + wait=False, + ) + ) + + try: + yield + + finally: + log.info('Checking final clock skew...') + cluster = filter_out_containers(ctx.cluster) + run.wait( + cluster.run( + args=[ + 'PATH=/usr/bin:/usr/sbin', 'ntpq', '-p', run.Raw('||'), + 'PATH=/usr/bin:/usr/sbin', 'chronyc', 'sources', + run.Raw('||'), + 'true' + ], + wait=False, + ) + ) + + +@contextlib.contextmanager +def check(ctx, config): + """ + Run ntpq at the start and the end of the task. 
+ + :param ctx: Context + :param config: Configuration + """ + log.info('Checking initial clock skew...') + cluster = filter_out_containers(ctx.cluster) + run.wait( + cluster.run( + args=[ + 'PATH=/usr/bin:/usr/sbin', 'ntpq', '-p', run.Raw('||'), + 'PATH=/usr/bin:/usr/sbin', 'chronyc', 'sources', + run.Raw('||'), + 'true' + ], + wait=False, + ) + ) + + try: + yield + + finally: + log.info('Checking final clock skew...') + cluster = filter_out_containers(ctx.cluster) + run.wait( + cluster.run( + args=[ + 'PATH=/usr/bin:/usr/sbin', 'ntpq', '-p', run.Raw('||'), + 'PATH=/usr/bin:/usr/sbin', 'chronyc', 'sources', + run.Raw('||'), + 'true' + ], + wait=False, + ) + ) diff --git a/teuthology/task/common_fs_utils.py b/teuthology/task/common_fs_utils.py new file mode 100644 index 000000000..584897968 --- /dev/null +++ b/teuthology/task/common_fs_utils.py @@ -0,0 +1,123 @@ +""" +Common filesystem related utilities. Originally this +code was part of rbd.py. It was broken out so that it +could be used by other modules (tgt.py and iscsi.py for instance). 
+""" +import logging +import contextlib +from teuthology import misc as teuthology + +log = logging.getLogger(__name__) + + +def default_image_name(role): + """ + Image name used by rbd and iscsi + """ + return 'testimage.{role}'.format(role=role) + + +@contextlib.contextmanager +def generic_mkfs(ctx, config, devname_rtn): + """ + Create a filesystem (either rbd or tgt, depending on devname_rtn) + + Rbd for example, now makes the following calls: + - rbd.create_image: [client.0] + - rbd.modprobe: [client.0] + - rbd.dev_create: [client.0] + - common_fs_utils.generic_mkfs: [client.0] + - common_fs_utils.generic_mount: + client.0: testimage.client.0 + """ + assert isinstance(config, list) or isinstance(config, dict), \ + "task mkfs must be configured with a list or dictionary" + if isinstance(config, dict): + images = config.items() + else: + images = [(role, None) for role in config] + + for role, properties in images: + if properties is None: + properties = {} + (remote,) = ctx.cluster.only(role).remotes.keys() + image = properties.get('image_name', default_image_name(role)) + fs_type = properties.get('fs_type', 'ext3') + remote.run( + args=[ + 'sudo', + 'mkfs', + '-t', fs_type, + devname_rtn(ctx, image), + ], + ) + yield + + +@contextlib.contextmanager +def generic_mount(ctx, config, devname_rtn): + """ + Generic Mount an rbd or tgt image. 
+ + Rbd for example, now makes the following calls: + - rbd.create_image: [client.0] + - rbd.modprobe: [client.0] + - rbd.dev_create: [client.0] + - common_fs_utils.generic_mkfs: [client.0] + - common_fs_utils.generic_mount: + client.0: testimage.client.0 + """ + assert isinstance(config, list) or isinstance(config, dict), \ + "task mount must be configured with a list or dictionary" + if isinstance(config, dict): + role_images = config.items() + else: + role_images = [(role, None) for role in config] + + testdir = teuthology.get_testdir(ctx) + + mnt_template = '{tdir}/mnt.{id}' + mounted = [] + for role, image in role_images: + if image is None: + image = default_image_name(role) + (remote,) = ctx.cluster.only(role).remotes.keys() + _, _, id_ = teuthology.split_role(role) + mnt = mnt_template.format(tdir=testdir, id=id_) + mounted.append((remote, mnt)) + remote.run( + args=[ + 'mkdir', + '--', + mnt, + ] + ) + + remote.run( + args=[ + 'sudo', + 'mount', + devname_rtn(ctx, image), + mnt, + ], + ) + + try: + yield + finally: + log.info("Unmounting rbd images... 
%s", mounted) + for remote, mnt in mounted: + remote.run( + args=[ + 'sudo', + 'umount', + mnt, + ], + ) + remote.run( + args=[ + 'rmdir', + '--', + mnt, + ] + ) diff --git a/teuthology/task/console_log.py b/teuthology/task/console_log.py new file mode 100644 index 000000000..01b89351f --- /dev/null +++ b/teuthology/task/console_log.py @@ -0,0 +1,112 @@ +import logging +import os + +from teuthology.orchestra.cluster import Cluster +from teuthology.exit import exiter +from teuthology.task import Task + +log = logging.getLogger(__name__) + + +class ConsoleLog(Task): + enabled = True + name = 'console_log' + logfile_name = '{shortname}.log' + + def __init__(self, ctx=None, config=None): + super(ConsoleLog, self).__init__(ctx, config) + if self.config.get('enabled') is False: + self.enabled = False + if not getattr(self.ctx, 'archive', None): + self.enabled = False + if 'logfile_name' in self.config: + self.logfile_name = self.config['logfile_name'] + if 'remotes' in self.config: + self.remotes = self.config['remotes'] + + def filter_hosts(self): + super(ConsoleLog, self).filter_hosts() + if not hasattr(self.ctx, 'cluster'): + return + new_cluster = Cluster() + for (remote, roles) in self.cluster.remotes.items(): + if not hasattr(remote.console, 'spawn_sol_log'): + log.debug("%s does not support IPMI; excluding", + remote.shortname) + elif not (remote.console.has_ipmi_credentials or + remote.console.has_conserver): + log.debug("Cannot find IPMI credentials or conserver settings " + "for %s; excluding", + remote.shortname) + else: + new_cluster.add(remote, roles) + self.cluster = new_cluster + self.remotes = self.cluster.remotes.keys() + return self.cluster + + def setup(self): + if not self.enabled: + return + super(ConsoleLog, self).setup() + self.processes = dict() + self.signal_handlers = list() + self.setup_archive() + + def setup_archive(self): + self.archive_dir = os.path.join( + self.ctx.archive, + 'console_logs', + ) + if not os.path.isdir(self.archive_dir): + 
os.makedirs(self.archive_dir) + + def begin(self): + if not self.enabled: + return + super(ConsoleLog, self).begin() + self.start_logging() + + def start_logging(self): + for remote in self.remotes: + log_path = os.path.join( + self.archive_dir, + self.logfile_name.format(shortname=remote.shortname), + ) + proc = remote.console.spawn_sol_log(log_path) + self.processes[remote.shortname] = proc + + # Install a signal handler to make sure the console-logging + # processes are terminated if the job is killed + def kill_console_loggers(signal_, frame): + for (name, proc) in self.processes.items(): + log.debug("Killing console logger for %s", name) + proc.terminate() + exiter.add_handler(15, kill_console_loggers) + + def end(self): + if not self.enabled: + return + super(ConsoleLog, self).end() + self.stop_logging() + + def stop_logging(self, force=False): + for proc in self.processes.values(): + if proc.poll() is not None: + continue + if force: + proc.kill() + else: + proc.terminate() + + # Remove any signal handlers + for handler in self.signal_handlers: + handler.remove() + + def teardown(self): + if not self.enabled: + return + self.stop_logging(force=True) + super(ConsoleLog, self).teardown() + + +task = ConsoleLog diff --git a/teuthology/task/dump_ctx.py b/teuthology/task/dump_ctx.py new file mode 100644 index 000000000..f2da22e12 --- /dev/null +++ b/teuthology/task/dump_ctx.py @@ -0,0 +1,19 @@ +import logging +import pprint + +log = logging.getLogger(__name__) +pp = pprint.PrettyPrinter(indent=4) + +def _pprint_me(thing, prefix): + return prefix + "\n" + pp.pformat(thing) + +def task(ctx, config): + """ + Dump task context and config in teuthology log/output + + The intended use case is didactic - to provide an easy way for newbies, who + are working on teuthology tasks for the first time, to find out what + is inside the ctx and config variables that are passed to each task. 
+ """ + log.info(_pprint_me(ctx, "Task context:")) + log.info(_pprint_me(config, "Task config:")) diff --git a/teuthology/task/exec.py b/teuthology/task/exec.py new file mode 100644 index 000000000..b3548c332 --- /dev/null +++ b/teuthology/task/exec.py @@ -0,0 +1,74 @@ +""" +Exececute custom commands +""" +import logging + +from teuthology import misc as teuthology + +log = logging.getLogger(__name__) + +def task(ctx, config): + """ + Execute commands on a given role + + tasks: + - ceph: + - kclient: [client.a] + - exec: + client.a: + - "echo 'module libceph +p' > /sys/kernel/debug/dynamic_debug/control" + - "echo 'module ceph +p' > /sys/kernel/debug/dynamic_debug/control" + - interactive: + + It stops and fails with the first command that does not return on success. It means + that if the first command fails, the second won't run at all. + + You can run a command on all hosts `all-hosts`, or all roles with `all-roles`: + + tasks: + - exec: + all-hosts: + - touch /etc/passwd + - exec: + all-roles: + - pwd + + To avoid confusion it is recommended to explicitly enclose the commands in + double quotes. For instance if the command is false (without double quotes) it will + be interpreted as a boolean by the YAML parser. 
+ + :param ctx: Context + :param config: Configuration + """ + log.info('Executing custom commands...') + assert isinstance(config, dict), "task exec got invalid config" + + testdir = teuthology.get_testdir(ctx) + + if 'all' in config and len(config) == 1: + a = config['all'] + roles = teuthology.all_roles(ctx.cluster) + config = dict((id_, a) for id_ in roles) + elif 'all-roles' in config and len(config) == 1: + a = config['all-roles'] + roles = teuthology.all_roles(ctx.cluster) + config = dict((id_, a) for id_ in roles) + elif 'all-hosts' in config and len(config) == 1: + a = config['all-hosts'] + roles = [roles[0] for roles in ctx.cluster.remotes.values()] + config = dict((id_, a) for id_ in roles) + + for role, ls in config.items(): + (remote,) = ctx.cluster.only(role).remotes.keys() + log.info('Running commands on role %s host %s', role, remote.name) + for c in ls: + c.replace('$TESTDIR', testdir) + remote.run( + args=[ + 'sudo', + 'TESTDIR={tdir}'.format(tdir=testdir), + 'bash', + '-c', + c], + ) + diff --git a/teuthology/task/full_sequential.py b/teuthology/task/full_sequential.py new file mode 100644 index 000000000..a9990f2aa --- /dev/null +++ b/teuthology/task/full_sequential.py @@ -0,0 +1,39 @@ +""" +Task sequencer - full +""" +import sys +import logging + +from teuthology import run_tasks + +log = logging.getLogger(__name__) + + +def task(ctx, config): + """ + Run a set of tasks to completion in order. __exit__ is called on a task + before __enter__ on the next + + example:: + - full_sequential: + - tasktest: + - tasktest: + + :param ctx: Context + :param config: Configuration + """ + for entry in config: + if not isinstance(entry, dict): + entry = ctx.config.get(entry, {}) + ((taskname, confg),) = entry.items() + log.info('In full_sequential, running task %s...' 
% taskname) + mgr = run_tasks.run_one_task(taskname, ctx=ctx, config=confg) + if hasattr(mgr, '__enter__'): + try: + mgr.__enter__() + finally: + try: + exc_info = sys.exc_info() + mgr.__exit__(*exc_info) + finally: + del exc_info diff --git a/teuthology/task/full_sequential_finally.py b/teuthology/task/full_sequential_finally.py new file mode 100644 index 000000000..76e3bbbde --- /dev/null +++ b/teuthology/task/full_sequential_finally.py @@ -0,0 +1,54 @@ +""" +Task sequencer finally +""" +import sys +import logging +import contextlib + +from teuthology import run_tasks + +log = logging.getLogger(__name__) + + +@contextlib.contextmanager +def task(ctx, config): + """ + Sequentialize a group of tasks into one executable block, run on cleanup + + example:: + + tasks: + - foo: + - full_sequential_finally: + - final1: + - final2: + - bar: + - baz: + + The final1 and final2 tasks will run when full_sequentiall_finally is torn + down, after the nested bar and baz tasks have run to completion, and right + before the preceding foo task is torn down. This is useful if there are + additional steps you want to interject in a job during the shutdown (instead + of startup) phase. + + :param ctx: Context + :param config: Configuration + """ + try: + yield + finally: + for entry in config: + if not isinstance(entry, dict): + entry = ctx.config.get(entry, {}) + ((taskname, confg),) = entry.items() + log.info('In full_sequential_finally, running task %s...' 
% taskname) + mgr = run_tasks.run_one_task(taskname, ctx=ctx, config=confg) + if hasattr(mgr, '__enter__'): + try: + mgr.__enter__() + finally: + try: + exc_info = sys.exc_info() + mgr.__exit__(*exc_info) + finally: + del exc_info diff --git a/teuthology/task/hadoop.py b/teuthology/task/hadoop.py new file mode 100644 index 000000000..7754a7695 --- /dev/null +++ b/teuthology/task/hadoop.py @@ -0,0 +1,424 @@ +from io import StringIO +import contextlib +import logging +from teuthology import misc as teuthology +from teuthology import contextutil +from teuthology.orchestra import run +from teuthology.exceptions import UnsupportedPackageTypeError + +log = logging.getLogger(__name__) + +HADOOP_2x_URL = "https://archive.apache.org/dist/hadoop/core/hadoop-2.5.2/hadoop-2.5.2.tar.gz" + +def dict_to_hadoop_conf(items): + out = "\n" + for key, value in items.items(): + out += " \n" + out += " " + key + "\n" + out += " " + value + "\n" + out += " \n" + out += "\n" + return out + +def is_hadoop_type(type_): + return lambda role: role.startswith('hadoop.' 
+ type_) + +def get_slaves_data(ctx): + tempdir = teuthology.get_testdir(ctx) + path = "{tdir}/hadoop/etc/hadoop/slaves".format(tdir=tempdir) + nodes = ctx.cluster.only(is_hadoop_type('slave')) + hosts = [s.ssh.get_transport().getpeername()[0] for s in nodes.remotes] + data = '\n'.join(hosts) + return path, data + +def get_masters_data(ctx): + tempdir = teuthology.get_testdir(ctx) + path = "{tdir}/hadoop/etc/hadoop/masters".format(tdir=tempdir) + nodes = ctx.cluster.only(is_hadoop_type('master')) + hosts = [s.ssh.get_transport().getpeername()[0] for s in nodes.remotes] + data = '\n'.join(hosts) + return path, data + +def get_core_site_data(ctx, config): + tempdir = teuthology.get_testdir(ctx) + path = "{tdir}/hadoop/etc/hadoop/core-site.xml".format(tdir=tempdir) + nodes = ctx.cluster.only(is_hadoop_type('master')) + host = [s.ssh.get_transport().getpeername()[0] for s in nodes.remotes][0] + + conf = {} + if config.get('hdfs', False): + conf.update({ + 'fs.defaultFS': 'hdfs://{namenode}:9000', + 'hadoop.tmp.dir': '{tdir}/hadoop_tmp', + }) + else: + conf.update({ + 'fs.default.name': 'ceph://{namenode}:6789/', + 'fs.defaultFS': 'ceph://{namenode}:6789/', + 'ceph.conf.file': '/etc/ceph/ceph.conf', + 'ceph.mon.address': '{namenode}:6789', + 'ceph.auth.id': 'admin', + #'ceph.data.pools': 'cephfs_data', + 'fs.AbstractFileSystem.ceph.impl': 'org.apache.hadoop.fs.ceph.CephFs', + 'fs.ceph.impl': 'org.apache.hadoop.fs.ceph.CephFileSystem', + }) + + data_tmpl = dict_to_hadoop_conf(conf) + return path, data_tmpl.format(tdir=tempdir, namenode=host) + +def get_mapred_site_data(ctx): + data_tmpl = """ + + + mapred.job.tracker + {namenode}:9001 + + + mapreduce.framework.name + yarn + + +""" + tempdir = teuthology.get_testdir(ctx) + path = "{tdir}/hadoop/etc/hadoop/mapred-site.xml".format(tdir=tempdir) + nodes = ctx.cluster.only(is_hadoop_type('master')) + hosts = [s.ssh.get_transport().getpeername()[0] for s in nodes.remotes] + assert len(hosts) == 1 + host = hosts[0] + return 
path, data_tmpl.format(namenode=host) + +def get_yarn_site_data(ctx): + conf = {} + conf.update({ + 'yarn.resourcemanager.resourcetracker.address': '{namenode}:8025', + 'yarn.resourcemanager.scheduler.address': '{namenode}:8030', + 'yarn.resourcemanager.address': '{namenode}:8050', + 'yarn.resourcemanager.admin.address': '{namenode}:8041', + 'yarn.resourcemanager.hostname': '{namenode}', + 'yarn.nodemanager.aux-services': 'mapreduce_shuffle', + 'yarn.nodemanager.sleep-delay-before-sigkill.ms': '10000', + }) + data_tmpl = dict_to_hadoop_conf(conf) + + tempdir = teuthology.get_testdir(ctx) + path = "{tdir}/hadoop/etc/hadoop/yarn-site.xml".format(tdir=tempdir) + nodes = ctx.cluster.only(is_hadoop_type('master')) + hosts = [s.ssh.get_transport().getpeername()[0] for s in nodes.remotes] + assert len(hosts) == 1 + host = hosts[0] + return path, data_tmpl.format(namenode=host) + +def get_hdfs_site_data(ctx): + data = """ + + + dfs.replication + 1 + + +""" + tempdir = teuthology.get_testdir(ctx) + path = "{tdir}/hadoop/etc/hadoop/hdfs-site.xml".format(tdir=tempdir) + return path, data + +def configure(ctx, config, hadoops): + tempdir = teuthology.get_testdir(ctx) + + log.info("Writing Hadoop slaves file...") + for remote in hadoops.remotes: + path, data = get_slaves_data(ctx) + teuthology.write_file(remote, path, StringIO(data)) + + log.info("Writing Hadoop masters file...") + for remote in hadoops.remotes: + path, data = get_masters_data(ctx) + teuthology.write_file(remote, path, StringIO(data)) + + log.info("Writing Hadoop core-site.xml file...") + for remote in hadoops.remotes: + path, data = get_core_site_data(ctx, config) + teuthology.write_file(remote, path, StringIO(data)) + + log.info("Writing Hadoop yarn-site.xml file...") + for remote in hadoops.remotes: + path, data = get_yarn_site_data(ctx) + teuthology.write_file(remote, path, StringIO(data)) + + log.info("Writing Hadoop hdfs-site.xml file...") + for remote in hadoops.remotes: + path, data = 
get_hdfs_site_data(ctx) + teuthology.write_file(remote, path, StringIO(data)) + + log.info("Writing Hadoop mapred-site.xml file...") + for remote in hadoops.remotes: + path, data = get_mapred_site_data(ctx) + teuthology.write_file(remote, path, StringIO(data)) + + log.info("Setting JAVA_HOME in hadoop-env.sh...") + for remote in hadoops.remotes: + path = "{tdir}/hadoop/etc/hadoop/hadoop-env.sh".format(tdir=tempdir) + if remote.os.package_type == 'rpm': + data = "JAVA_HOME=/usr/lib/jvm/java\n" + elif remote.os.package_type == 'deb': + data = "JAVA_HOME=/usr/lib/jvm/default-java\n" + else: + raise UnsupportedPackageTypeError(remote) + teuthology.prepend_lines_to_file(remote, path, data) + + if config.get('hdfs', False): + log.info("Formatting HDFS...") + testdir = teuthology.get_testdir(ctx) + hadoop_dir = "{tdir}/hadoop/".format(tdir=testdir) + masters = ctx.cluster.only(is_hadoop_type('master')) + assert len(masters.remotes) == 1 + master = next(iter(masters.remotes.keys())) + master.run( + args = [ + hadoop_dir + "bin/hadoop", + "namenode", + "-format" + ], + wait = True, + ) + +@contextlib.contextmanager +def install_hadoop(ctx, config): + testdir = teuthology.get_testdir(ctx) + + log.info("Downloading Hadoop...") + hadoop_tarball = "{tdir}/hadoop.tar.gz".format(tdir=testdir) + hadoops = ctx.cluster.only(is_hadoop_type('')) + run.wait( + hadoops.run( + args = [ + 'wget', + '-nv', + '-O', + hadoop_tarball, + HADOOP_2x_URL + ], + wait = False, + ) + ) + + log.info("Create directory for Hadoop install...") + hadoop_dir = "{tdir}/hadoop".format(tdir=testdir) + run.wait( + hadoops.run( + args = [ + 'mkdir', + hadoop_dir + ], + wait = False, + ) + ) + + log.info("Unpacking Hadoop...") + run.wait( + hadoops.run( + args = [ + 'tar', + 'xzf', + hadoop_tarball, + '--strip-components=1', + '-C', + hadoop_dir + ], + wait = False, + ) + ) + + log.info("Removing Hadoop download...") + run.wait( + hadoops.run( + args = [ + 'rm', + hadoop_tarball + ], + wait = False, + ) + ) + + 
log.info("Create Hadoop temporary directory...") + hadoop_tmp_dir = "{tdir}/hadoop_tmp".format(tdir=testdir) + run.wait( + hadoops.run( + args = [ + 'mkdir', + hadoop_tmp_dir + ], + wait = False, + ) + ) + + if not config.get('hdfs', False): + log.info("Fetching cephfs-hadoop...") + + sha1, url = teuthology.get_ceph_binary_url( + package = "hadoop", + format = "jar", + dist = "precise", + arch = "x86_64", + flavor = "default", + branch = "main") + + run.wait( + hadoops.run( + args = [ + 'wget', + '-nv', + '-O', + "{tdir}/cephfs-hadoop.jar".format(tdir=testdir), # FIXME + url + "/cephfs-hadoop-0.80.6.jar", # FIXME + ], + wait = False, + ) + ) + + run.wait( + hadoops.run( + args = [ + 'mv', + "{tdir}/cephfs-hadoop.jar".format(tdir=testdir), + "{tdir}/hadoop/share/hadoop/common/".format(tdir=testdir), + ], + wait = False, + ) + ) + + # Copy JNI native bits. Need to do this explicitly because the + # handling is dependent on the os-type. + for remote in hadoops.remotes: + libcephfs_jni_path = None + if remote.os.package_type == 'rpm': + libcephfs_jni_path = "/usr/lib64/libcephfs_jni.so.1.0.0" + elif remote.os.package_type == 'deb': + libcephfs_jni_path = "/usr/lib/jni/libcephfs_jni.so" + else: + raise UnsupportedPackageTypeError(remote) + + libcephfs_jni_fname = "libcephfs_jni.so" + remote.run( + args = [ + 'cp', + libcephfs_jni_path, + "{tdir}/hadoop/lib/native/{fname}".format(tdir=testdir, + fname=libcephfs_jni_fname), + ]) + + run.wait( + hadoops.run( + args = [ + 'cp', + "/usr/share/java/libcephfs.jar", + "{tdir}/hadoop/share/hadoop/common/".format(tdir=testdir), + ], + wait = False, + ) + ) + + configure(ctx, config, hadoops) + + try: + yield + finally: + run.wait( + hadoops.run( + args = [ + 'rm', + '-rf', + hadoop_dir, + hadoop_tmp_dir + ], + wait = False, + ) + ) + +@contextlib.contextmanager +def start_hadoop(ctx, config): + testdir = teuthology.get_testdir(ctx) + hadoop_dir = "{tdir}/hadoop/".format(tdir=testdir) + masters = 
ctx.cluster.only(is_hadoop_type('master')) + assert len(masters.remotes) == 1 + master = next(iter(masters.remotes.keys())) + + log.info("Stopping Hadoop daemons") + master.run( + args = [ + hadoop_dir + "sbin/stop-yarn.sh" + ], + wait = True, + ) + + master.run( + args = [ + hadoop_dir + "sbin/stop-dfs.sh" + ], + wait = True, + ) + + if config.get('hdfs', False): + log.info("Starting HDFS...") + master.run( + args = [ + hadoop_dir + "sbin/start-dfs.sh" + ], + wait = True, + ) + + log.info("Starting YARN...") + master.run( + args = [ + hadoop_dir + "sbin/start-yarn.sh" + ], + wait = True, + ) + + try: + yield + + finally: + log.info("Stopping Hadoop daemons") + + master.run( + args = [ + hadoop_dir + "sbin/stop-yarn.sh" + ], + wait = True, + ) + + master.run( + args = [ + hadoop_dir + "sbin/stop-dfs.sh" + ], + wait = True, + ) + + run.wait( + ctx.cluster.run( + args = [ + 'sudo', + 'skill', + '-9', + 'java' + ], + wait = False + ) + ) + +@contextlib.contextmanager +def task(ctx, config): + if config is None: + config = {} + assert isinstance(config, dict), "task hadoop config must be dictionary" + + overrides = ctx.config.get('overrides', {}) + teuthology.deep_merge(config, overrides.get('hadoop', {})) + + tasks = [ + lambda: install_hadoop(ctx=ctx, config=config), + lambda: start_hadoop(ctx=ctx, config=config), + ] + + with contextutil.nested(*tasks): + yield diff --git a/teuthology/task/install/__init__.py b/teuthology/task/install/__init__.py new file mode 100644 index 000000000..8d6c15988 --- /dev/null +++ b/teuthology/task/install/__init__.py @@ -0,0 +1,641 @@ +import contextlib +import copy +import logging +import os +import subprocess +import yaml + +from teuthology import misc as teuthology +from teuthology import contextutil, packaging +from teuthology.parallel import parallel +from teuthology.task import ansible +from teuthology.exceptions import ConfigError + +from distutils.version import LooseVersion +from teuthology.task.install.util import ( + 
_get_builder_project, get_flavor, ship_utilities, +) + +from teuthology.task.install import rpm, deb, redhat + +log = logging.getLogger(__name__) + +def get_upgrade_version(ctx, config, remote): + builder = _get_builder_project(ctx, remote, config) + version = builder.version + return version + +def verify_package_version(ctx, config, remote): + """ + Ensures that the version of package installed is what + was asked for in the config. + + For most cases this is for ceph, but we also install samba + for example. + """ + # Do not verify the version if the ceph-deploy task is being used to + # install ceph. Verifying the ceph installed by ceph-deploy should work, + # but the qa suites will need reorganized first to run ceph-deploy + # before the install task. + # see: http://tracker.ceph.com/issues/11248 + if config.get("extras"): + log.info("Skipping version verification...") + return True + if 'repos' in config and config.get('repos'): + log.info("Skipping version verification because we have custom repos...") + return True + builder = _get_builder_project(ctx, remote, config) + version = builder.version + pkg_to_check = builder.project + installed_ver = packaging.get_package_version(remote, pkg_to_check) + if installed_ver and version in installed_ver: + msg = "The correct {pkg} version {ver} is installed.".format( + ver=version, + pkg=pkg_to_check + ) + log.info(msg) + else: + raise RuntimeError( + "{pkg} version {ver} was not installed, found {installed}.".format( + ver=version, + installed=installed_ver, + pkg=pkg_to_check + ) + ) + + +def install_packages(ctx, pkgs, config): + """ + Installs packages on each remote in ctx. 
+ + :param ctx: the argparse.Namespace object + :param pkgs: list of packages names to install + :param config: the config dict + """ + install_pkgs = { + "deb": deb._update_package_list_and_install, + "rpm": rpm._update_package_list_and_install, + } + with parallel() as p: + for remote in ctx.cluster.remotes.keys(): + system_type = teuthology.get_system_type(remote) + p.spawn( + install_pkgs[system_type], + ctx, remote, pkgs[system_type], config) + + for remote in ctx.cluster.remotes.keys(): + # verifies that the install worked as expected + verify_package_version(ctx, config, remote) + + +def remove_packages(ctx, config, pkgs): + """ + Removes packages from each remote in ctx. + + :param ctx: the argparse.Namespace object + :param config: the config dict + :param pkgs: list of packages names to remove + """ + remove_pkgs = { + "deb": deb._remove, + "rpm": rpm._remove, + } + cleanup = config.get('cleanup', False) + with parallel() as p: + for remote in ctx.cluster.remotes.keys(): + if not remote.is_reimageable or cleanup: + system_type = teuthology.get_system_type(remote) + p.spawn(remove_pkgs[ + system_type], ctx, config, remote, pkgs[system_type]) + + +def remove_sources(ctx, config): + """ + Removes repo source files from each remote in ctx. 
+ + :param ctx: the argparse.Namespace object + :param config: the config dict + """ + remove_sources_pkgs = { + 'deb': deb._remove_sources_list, + 'rpm': rpm._remove_sources_list, + } + cleanup = config.get('cleanup', False) + project = config.get('project', 'ceph') + with parallel() as p: + for remote in ctx.cluster.remotes.keys(): + if not remote.is_reimageable or cleanup: + log.info("Removing {p} sources lists on {r}" + .format(p=project,r=remote)) + remove_fn = remove_sources_pkgs[remote.os.package_type] + p.spawn(remove_fn, ctx, config, remote) + + +def get_package_list(ctx, config): + debug = config.get('debuginfo', False) + project = config.get('project', 'ceph') + yaml_path = None + # Look for /packages/packages.yaml + if hasattr(ctx, 'config') and 'suite_path' in ctx.config: + suite_packages_path = os.path.join( + ctx.config['suite_path'], + 'packages', + 'packages.yaml', + ) + if os.path.exists(suite_packages_path): + yaml_path = suite_packages_path + # If packages.yaml isn't found in the suite_path, potentially use + # teuthology's + yaml_path = yaml_path or os.path.join( + os.path.dirname(__file__), + 'packages.yaml', + ) + default_packages = yaml.safe_load(open(yaml_path)) + default_debs = default_packages.get(project, dict()).get('deb', []) + default_rpms = default_packages.get(project, dict()).get('rpm', []) + # If a custom deb and/or rpm list is provided via the task config, use + # that. 
Otherwise, use the list from whichever packages.yaml was found + # first + debs = config.get('packages', dict()).get('deb', default_debs) + rpms = config.get('packages', dict()).get('rpm', default_rpms) + # Optionally include or exclude debug packages + if not debug: + debs = [p for p in debs if not p.endswith('-dbg')] + rpms = [p for p in rpms if not p.endswith('-debuginfo')] + + def exclude(pkgs, exclude_list): + return list(pkg for pkg in pkgs if pkg not in exclude_list) + + excluded_packages = config.get('exclude_packages', []) + if isinstance(excluded_packages, dict): + log.debug("Excluding packages: {}".format(excluded_packages)) + debs = exclude(debs, excluded_packages.get('deb', [])) + rpms = exclude(rpms, excluded_packages.get('rpm', [])) + else: + debs = exclude(debs, excluded_packages) + rpms = exclude(rpms, excluded_packages) + + package_list = dict(deb=debs, rpm=rpms) + log.debug("Package list is: {}".format(package_list)) + return package_list + + +@contextlib.contextmanager +def install(ctx, config): + """ + The install task. Installs packages for a given project on all hosts in + ctx. May work for projects besides ceph, but may not. Patches welcomed! + + :param ctx: the argparse.Namespace object + :param config: the config dict + """ + + package_list = get_package_list(ctx, config) + debs = package_list['deb'] + rpms = package_list['rpm'] + + # pull any additional packages out of config + extra_pkgs = config.get('extra_packages', []) + log.info('extra packages: {packages}'.format(packages=extra_pkgs)) + if isinstance(extra_pkgs, dict): + debs += extra_pkgs.get('deb', []) + rpms += extra_pkgs.get('rpm', []) + else: + debs += extra_pkgs + rpms += extra_pkgs + + # When extras is in the config we want to purposely not install ceph. + # This is typically used on jobs that use ceph-deploy to install ceph + # or when we are testing ceph-deploy directly. 
The packages being + # installed are needed to properly test ceph as ceph-deploy won't + # install these. 'extras' might not be the best name for this. + extras = config.get('extras') + if extras is not None: + debs = ['ceph-test', 'ceph-fuse', + 'librados2', 'librbd1', + 'python-ceph'] + rpms = ['ceph-fuse', 'librbd1', 'librados2', 'ceph-test', 'python-ceph'] + package_list = dict(deb=debs, rpm=rpms) + install_packages(ctx, package_list, config) + try: + yield + finally: + remove_packages(ctx, config, package_list) + remove_sources(ctx, config) + + +def upgrade_old_style(ctx, node, remote, pkgs, system_type): + """ + Handle the upgrade using methods in use prior to ceph-deploy. + """ + if system_type == 'deb': + deb._upgrade_packages(ctx, node, remote, pkgs) + elif system_type == 'rpm': + rpm._upgrade_packages(ctx, node, remote, pkgs) + + +def upgrade_with_ceph_deploy(ctx, node, remote, pkgs, sys_type): + """ + Upgrade using ceph-deploy + """ + dev_table = ['branch', 'tag', 'dev'] + ceph_dev_parm = '' + ceph_rel_parm = '' + for entry in node.keys(): + if entry in dev_table: + ceph_dev_parm = node[entry] + if entry == 'release': + ceph_rel_parm = node[entry] + params = [] + if ceph_dev_parm: + params += ['--dev', ceph_dev_parm] + if ceph_rel_parm: + params += ['--release', ceph_rel_parm] + params.append(remote.name) + subprocess.call(['ceph-deploy', 'install'] + params) + remote.run(args=['sudo', 'restart', 'ceph-all']) + + +def upgrade_remote_to_config(ctx, config): + assert config is None or isinstance(config, dict), \ + "install.upgrade only supports a dictionary for configuration" + + project = config.get('project', 'ceph') + + # use 'install' overrides here, in case the upgrade target is left + # unspecified/implicit. 
+ install_overrides = ctx.config.get( + 'overrides', {}).get('install', {}).get(project, {}) + log.info('project %s config %s overrides %s', project, config, + install_overrides) + + # build a normalized remote -> config dict + remotes = {} + if 'all' in config: + for remote in ctx.cluster.remotes.keys(): + remotes[remote] = config.get('all') + else: + for role in config.keys(): + remotes_dict = ctx.cluster.only(role).remotes + if not remotes_dict: + # This is a regular config argument, not a role + continue + # take any remote in the dict + remote = next(iter(remotes_dict)) + if remote in remotes: + log.warning('remote %s came up twice (role %s)', remote, role) + continue + remotes[remote] = config.get(role) + + result = {} + for remote, node in remotes.items(): + if not node: + node = {} + + this_overrides = copy.deepcopy(install_overrides) + if 'sha1' in node or 'tag' in node or 'branch' in node: + log.info("config contains sha1|tag|branch, " + "removing those keys from override") + this_overrides.pop('sha1', None) + this_overrides.pop('tag', None) + this_overrides.pop('branch', None) + teuthology.deep_merge(node, this_overrides) + log.info('remote %s config %s', remote, node) + node['project'] = project + + result[remote] = node + + return result + +def _upgrade_is_downgrade(installed_version, upgrade_version): + assert installed_version, "installed_version is empty" + assert upgrade_version, "upgrade_version is empty" + return LooseVersion(installed_version) > LooseVersion(upgrade_version) + +def upgrade_common(ctx, config, deploy_style): + """ + Common code for upgrading + """ + remotes = upgrade_remote_to_config(ctx, config) + project = config.get('project', 'ceph') + + extra_pkgs = config.get('extra_packages', []) + log.info('extra packages: {packages}'.format(packages=extra_pkgs)) + + for remote, node in remotes.items(): + + system_type = teuthology.get_system_type(remote) + assert system_type in ('deb', 'rpm') + pkgs = get_package_list(ctx, 
config)[system_type] + log.info("Upgrading {proj} {system_type} packages: {pkgs}".format( + proj=project, system_type=system_type, pkgs=', '.join(pkgs))) + if isinstance(extra_pkgs, dict): + pkgs += extra_pkgs.get(system_type, []) + else: + pkgs += extra_pkgs + + installed_version = packaging.get_package_version(remote, 'ceph-common') + upgrade_version = get_upgrade_version(ctx, node, remote) + log.info("Ceph {s} upgrade from {i} to {u}".format( + s=system_type, + i=installed_version, + u=upgrade_version + )) + if _upgrade_is_downgrade(installed_version, upgrade_version): + raise RuntimeError( + "An attempt to upgrade from a higher version to a lower one " + "will always fail. Hint: check tags in the target git branch." + ) + + + deploy_style(ctx, node, remote, pkgs, system_type) + verify_package_version(ctx, node, remote) + return len(remotes) + +docstring_for_upgrade = """" + Upgrades packages for a given project. + + For example:: + + tasks: + - install.{cmd_parameter}: + all: + branch: end + + or specify specific roles:: + + tasks: + - install.{cmd_parameter}: + mon.a: + branch: end + osd.0: + branch: other + + or rely on the overrides for the target version:: + + overrides: + install: + ceph: + sha1: ... + tasks: + - install.{cmd_parameter}: + all: + + (HACK: the overrides will *only* apply the sha1/branch/tag if those + keys are not present in the config.) + + It is also possible to attempt to exclude packages from the upgrade set: + + tasks: + - install.{cmd_parameter}: + exclude_packages: ['ceph-test', 'ceph-test-dbg'] + + :param ctx: the argparse.Namespace object + :param config: the config dict + """ + +# +# __doc__ strings for upgrade and ceph_deploy_upgrade are set from +# the same string so that help(upgrade) and help(ceph_deploy_upgrade) +# look the same. 
+# + + +@contextlib.contextmanager +def upgrade(ctx, config): + upgrade_common(ctx, config, upgrade_old_style) + yield + +upgrade.__doc__ = docstring_for_upgrade.format(cmd_parameter='upgrade') + + +@contextlib.contextmanager +def ceph_deploy_upgrade(ctx, config): + upgrade_common(ctx, config, upgrade_with_ceph_deploy) + yield + +ceph_deploy_upgrade.__doc__ = docstring_for_upgrade.format( + cmd_parameter='ceph_deploy_upgrade') + + +@contextlib.contextmanager +def task(ctx, config): + """ + Install packages for a given project. + + tasks: + - install: + project: ceph + branch: bar + - install: + project: samba + branch: foo + extra_packages: ['samba'] + - install: + extra_packages: + deb: ['librados-dev', 'libradosstriper-dev'] + rpm: ['librados-devel', 'libradosstriper-devel'] + extra_system_packages: + deb: ['libboost-system-dev'] + rpm: ['boost-devel'] + - install: + rhbuild: 1.3.0 + playbook: downstream_setup.yml + vars: + yum_repos: + - url: "http://location.repo" + name: "ceph_repo" + + Add repos before trying to install any package (all Shaman-related tasks + will be ignored): + + - install: + repos: + - name: "repo-alias" + priority: 1 + url: "http://location.repo" + + Note: The 'repos' are supported for SUSE-based distros only, but patches + are welcome to add support for other distros. + + + Enable Fedora copr repositories using enable_coprs: + + - install: + enable_coprs: [ceph/el9] + + + Overrides are project specific: + + overrides: + install: + ceph: + sha1: ... + + + Debug packages may optionally be installed: + + overrides: + install: + ceph: + debuginfo: true + + + Default package lists (which come from packages.yaml) may be overridden: + + overrides: + install: + ceph: + packages: + deb: + - ceph-osd + - ceph-mon + rpm: + - ceph-devel + - rbd-fuse + + When tag, branch and sha1 do not reference the same commit hash, the + tag takes precedence over the branch and the branch takes precedence + over the sha1. 
+ + When the overrides have a sha1 that is different from the sha1 of + the project to be installed, it will be a noop if the project has + a branch or tag, because they take precedence over the sha1. For + instance: + + overrides: + install: + ceph: + sha1: 1234 + + tasks: + - install: + project: ceph + sha1: 4567 + branch: foobar # which has sha1 4567 + + The override will transform the tasks as follows: + + tasks: + - install: + project: ceph + sha1: 1234 + branch: foobar # which has sha1 4567 + + But the branch takes precedence over the sha1 and foobar + will be installed. The override of the sha1 has no effect. + + When passed 'rhbuild' as a key, it will attempt to install an rh ceph build + using ceph-deploy + + Normally, the package management system will try to install or upgrade + specified packages as instructed. But if newer versions of these packages + to be installed have been installed on test node, we will have to uninstall + or downgrade them. To downgrade multiple packages in a single shot: + + tasks: + - install: + project: ceph + branch: hammer + downgrade_packages: ['librados2', 'librbd1'] + + Reminder regarding teuthology-suite side effects: + + The teuthology-suite command always adds the following: + + overrides: + install: + ceph: + sha1: 1234 + + where sha1 matches the --ceph argument. For instance if + teuthology-suite is called with --ceph main, the sha1 will be + the tip of main. 
If called with --ceph v0.94.1, the sha1 will be + the v0.94.1 (as returned by git rev-parse v0.94.1 which is not to + be confused with git rev-parse v0.94.1^{commit}) + + :param ctx: the argparse.Namespace object + :param config: the config dict + """ + if config is None: + config = {} + assert isinstance(config, dict), \ + "task install only supports a dictionary for configuration" + + project, = config.get('project', 'ceph'), + log.debug('project %s' % project) + overrides = ctx.config.get('overrides') + repos = None + + if overrides: + try: + install_overrides = overrides.get('install', {}) + log.debug('INSTALL overrides: %s' % install_overrides) + teuthology.deep_merge(config, install_overrides.get(project, {})) + overrides_extra_system_packages = install_overrides.get('extra_system_packages') + if overrides_extra_system_packages: + extra_system_packages = config.get('extra_system_packages') + config['extra_system_packages'] = teuthology.deep_merge(extra_system_packages, overrides_extra_system_packages) + repos = install_overrides.get('repos', None) + except AssertionError: + raise ConfigError( + "'install' task config and its overrides contain" \ + "conflicting types for the same config key. Ensure that " \ + "the configuration is of the same type (dict, list, etc.) " \ + "in both the task definition and its overrides." 
+ ) + + log.debug('config %s' % config) + + rhbuild = None + if config.get('rhbuild'): + rhbuild = config.get('rhbuild') + log.info("Build is %s " % rhbuild) + + flavor = get_flavor(config) + log.info("Using flavor: %s", flavor) + + ctx.summary['flavor'] = flavor + nested_tasks = [lambda: redhat.install(ctx=ctx, config=config), + lambda: ship_utilities(ctx=ctx, config=None)] + + if config.get('rhbuild'): + if config.get('playbook'): + ansible_config = dict(config) + # remove key not required by ansible task + del ansible_config['rhbuild'] + nested_tasks.insert( + 0, + lambda: ansible.CephLab(ctx, config=ansible_config) + ) + with contextutil.nested(*nested_tasks): + yield + else: + nested_config = dict( + branch=config.get('branch'), + cleanup=config.get('cleanup'), + debuginfo=config.get('debuginfo'), + downgrade_packages=config.get('downgrade_packages', []), + exclude_packages=config.get('exclude_packages', []), + extra_packages=config.get('extra_packages', []), + extra_system_packages=config.get('extra_system_packages', []), + extras=config.get('extras', None), + enable_coprs=config.get('enable_coprs', []), + flavor=flavor, + install_ceph_packages=config.get('install_ceph_packages', True), + packages=config.get('packages', dict()), + project=project, + repos_only=config.get('repos_only', False), + sha1=config.get('sha1'), + tag=config.get('tag'), + wait_for_package=config.get('wait_for_package', False), + ) + if repos: + nested_config['repos'] = repos + if 'shaman' in config: + nested_config['shaman'] = config['shaman'] + with contextutil.nested( + lambda: install(ctx=ctx, config=nested_config), + lambda: ship_utilities(ctx=ctx, config=None), + ): + yield diff --git a/teuthology/task/install/bin/adjust-ulimits b/teuthology/task/install/bin/adjust-ulimits new file mode 100755 index 000000000..6f05392b9 --- /dev/null +++ b/teuthology/task/install/bin/adjust-ulimits @@ -0,0 +1,16 @@ +#!/bin/sh +# If we're running as root, allow large amounts of open files. 
+USER=$(whoami) + +# If a ulimit call fails, exit immediately. +set -e + +if [ "$USER" = "root" ] +then + # Enable large number of open files + ulimit -n 65536 +fi + +# Enable core dumps for everything +ulimit -c unlimited +exec "$@" diff --git a/teuthology/task/install/bin/daemon-helper b/teuthology/task/install/bin/daemon-helper new file mode 100755 index 000000000..3638a6d73 --- /dev/null +++ b/teuthology/task/install/bin/daemon-helper @@ -0,0 +1,114 @@ +#!/usr/bin/python3 + +""" +Helper script for running long-living processes. + +(Name says daemon, but that is intended to mean "long-living", we +assume child process does not double-fork.) + +We start the command passed as arguments, with /dev/null as stdin, and +then wait for EOF on stdin. + +When EOF is seen on stdin, the child process is killed. + +When the child process exits, this helper exits too. + +Usage: + daemon-helper [--kill-group] [nostdin] COMMAND ... +""" + +from __future__ import print_function + +import fcntl +import os +import select +import signal +import struct +import subprocess +import sys +from argparse import ArgumentParser + +parser = ArgumentParser(epilog= + 'The remaining parameters are the command to be run. 
If these\n' + + 'parameters start wih nostdin, then no stdin input is expected.') +parser.add_argument('signal') +parser.add_argument('--kill-group', action='store_true', + help='kill all processes in the group') +parser.add_argument('--nostdin', action='store_true', + help='no stdin input expected') +parsed, args = parser.parse_known_args() +end_signal = signal.SIGKILL +if parsed.signal == 'term': + end_signal = signal.SIGTERM +group = parsed.kill_group +nostdin = parsed.nostdin +skip_nostdin = 0 +try: + if args[0] == 'nostdin': + nostdin = True + skip_nostdin = 1 +except IndexError: + print('No command specified') + sys.exit(1) + + +proc = None +if nostdin: + if len(args) - skip_nostdin == 0: + print('No command specified') + sys.exit(1) + proc = subprocess.Popen( + args=args[skip_nostdin:], + ) +else: + with open('/dev/null', 'rb') as devnull: + proc = subprocess.Popen( + args=args, + stdin=devnull, + preexec_fn=os.setsid, + ) + +flags = fcntl.fcntl(0, fcntl.F_GETFL) +fcntl.fcntl(0, fcntl.F_SETFL, flags | os.O_NDELAY) + +saw_eof = False +while True: + r,w,x = select.select([0], [], [0], 0.2) + if r: + data = os.read(0, 1) + if not data: + saw_eof = True + if not group: + proc.send_signal(end_signal) + else: + os.killpg(proc.pid, end_signal) + break + else: + sig, = struct.unpack('!b', data) + if not group: + proc.send_signal(sig) + else: + os.killpg(proc.pid, end_signal) + + + if proc.poll() is not None: + # child exited + break + +exitstatus = proc.wait() +if exitstatus > 0: + print('{me}: command failed with exit status {exitstatus:d}'.format( + me=os.path.basename(sys.argv[0]), + exitstatus=exitstatus, + ), file=sys.stderr) + sys.exit(exitstatus) +elif exitstatus < 0: + if saw_eof and exitstatus == -end_signal: + # suppress error from the exit we intentionally caused + pass + else: + print('{me}: command crashed with signal {signal:d}'.format( + me=os.path.basename(sys.argv[0]), + signal=-exitstatus, + ), file=sys.stderr) + sys.exit(1) diff --git 
a/teuthology/task/install/bin/stdin-killer b/teuthology/task/install/bin/stdin-killer new file mode 100755 index 000000000..d1c9ba4ec --- /dev/null +++ b/teuthology/task/install/bin/stdin-killer @@ -0,0 +1,263 @@ +#!/bin/python3 + +# Forward stdin to a subcommand. If EOF is read from stdin or +# stdin/stdout/stderr are closed or hungup, then give the command "timeout" +# seconds to complete before it is killed. +# +# The command is run in a separate process group. This is mostly to simplify +# killing the set of processes (if well-behaving). You can configure that with +# --setpgrp switch. + +# usage: stdin-killer [-h] [--timeout TIMEOUT] [--debug DEBUG] [--signal SIGNAL] [--verbose] [--setpgrp {no,self,child}] command [arguments ...] +# +# wait for stdin EOF then kill forked subcommand +# +# positional arguments: +# command command to execute +# arguments arguments to command +# +# options: +# -h, --help show this help message and exit +# --timeout TIMEOUT time to wait for forked subcommand to willing terminate +# --debug DEBUG debug file +# --signal SIGNAL signal to send +# --verbose increase debugging +# --setpgrp {no,self,child} +# create process group + + +import argparse +import fcntl +import logging +import os +import select +import signal +import struct +import subprocess +import sys +import time + +NAME = "stdin-killer" + +log = logging.getLogger(NAME) +PAGE_SIZE = 4096 + +POLL_HANGUP = select.POLLHUP | (select.POLLRDHUP if hasattr(select, 'POLLRDHUP') else 0) | select.POLLERR + + +def handle_event(poll, buffer, fd, event, p): + if sigfdr == fd: + b = os.read(sigfdr, 1) + (signum,) = struct.unpack("B", b) + log.debug("got signal %d", signum) + try: + p.wait(timeout=0) + return True + except subprocess.TimeoutExpired: + pass + elif 0 == fd: + if event & POLL_HANGUP: + log.debug("peer closed connection, waiting for process exit") + poll.unregister(0) + sys.stdin.close() + if len(buffer) == 0 and p.stdin is not None: + p.stdin.close() + p.stdin = None + 
return True + elif event & select.POLLIN: + b = os.read(0, PAGE_SIZE) + if b == b"": + log.debug("read EOF") + poll.unregister(0) + sys.stdin.close() + if len(buffer) == 0: + p.stdin.close() + return True + if p.stdin is not None: + buffer += b + # ignore further POLLIN until buffer is written to p.stdin + poll.register(0, POLL_HANGUP) + poll.register(p.stdin.fileno(), select.POLLOUT) + elif p.stdin is not None and p.stdin.fileno() == fd: + assert event & select.POLLOUT + b = buffer[:PAGE_SIZE] + log.debug("sending %d bytes to process", len(b)) + try: + n = p.stdin.write(b) + p.stdin.flush() + log.debug("wrote %d bytes", n) + buffer = buffer[n:] + poll.register(0, select.POLLIN | POLL_HANGUP) + poll.unregister(p.stdin.fileno()) + except BrokenPipeError: + log.debug("got SIGPIPE") + poll.unregister(p.stdin.fileno()) + p.stdin.close() + p.stdin = None + return True + except BlockingIOError: + poll.register(p.stdin.fileno(), select.POLLOUT | POLL_HANGUP) + elif 1 == fd: + assert event & POLL_HANGUP + log.debug("stdout pipe has closed") + poll.unregister(1) + return True + elif 2 == fd: + assert event & POLL_HANGUP + log.debug("stderr pipe has closed") + poll.unregister(2) + return True + else: + assert False + return False + + +def listen_for_events(sigfdr, p, timeout): + poll = select.poll() + # listen for data on stdin + poll.register(0, select.POLLIN | POLL_HANGUP) + # listen for stdout/stderr to be closed, if they are closed then my parent + # is gone and I should expire the command and myself. 
+ poll.register(1, POLL_HANGUP) + poll.register(2, POLL_HANGUP) + # for SIGCHLD + poll.register(sigfdr, select.POLLIN) + buffer = bytearray() + expired = 0.0 + while True: + if expired > 0.0: + since = time.monotonic() - expired + wait = int((timeout - since) * 1000.0) + if wait <= 0: + return + else: + wait = 5000 + log.debug("polling for %d milliseconds", wait) + events = poll.poll(wait) + for fd, event in events: + log.debug("event: (%d, %d)", fd, event) + if handle_event(poll, buffer, fd, event, p): + if p.returncode is not None: + return + if expired == 0.0: + expired = time.monotonic() + log.info( + "expiration expected; waiting %d seconds for command to complete", + NS.timeout, + ) + + +if __name__ == "__main__": + signal.signal(signal.SIGPIPE, signal.SIG_IGN) + try: + (sigfdr, sigfdw) = os.pipe2(os.O_NONBLOCK | os.O_CLOEXEC) + except AttributeError: + # pipe2 is only available on "some flavors of Unix" + # https://docs.python.org/3.10/library/os.html?highlight=pipe2#os.pipe2 + pipe_ends = os.pipe() + for fd in pipe_ends: + flags = fcntl.fcntl(fd, fcntl.F_GETFL) + fcntl.fcntl(fd, fcntl.F_SETFL, flags | os.O_NONBLOCK | os.O_CLOEXEC) + (sigfdr, sigfdw) = pipe_ends + + signal.set_wakeup_fd(sigfdw) + + def do_nothing(signum, frame): + pass + + signal.signal(signal.SIGCHLD, do_nothing) + + P = argparse.ArgumentParser( + description="wait for stdin EOF then kill forked subcommand" + ) + P.add_argument( + "--timeout", + action="store", + default=5, + help="time to wait for forked subcommand to willing terminate", + type=int, + ) + P.add_argument("--debug", action="store", help="debug file", type=str) + P.add_argument( + "--signal", + action="store", + help="signal to send", + type=int, + default=signal.SIGKILL, + ) + P.add_argument("--verbose", action="store_true", help="increase debugging") + P.add_argument( + "--setpgrp", + action="store", + choices=["no", "self", "child"], + default="self", + help="create process group", + ) + P.add_argument( + "cmd", 
metavar="command", type=str, nargs=1, help="command to execute" + ) + P.add_argument( + "args", metavar="arguments", type=str, nargs="*", help="arguments to command" + ) + NS = P.parse_args() + + logargs = {} + if NS.debug is not None: + logargs["filename"] = NS.debug + else: + logargs["stream"] = sys.stderr + if NS.verbose: + logargs["level"] = logging.DEBUG + else: + logargs["level"] = logging.INFO + logargs["format"] = f"%(asctime)s {NAME} %(levelname)s: %(message)s" + logargs["datefmt"] = "%Y-%m-%dT%H:%M:%S" + logging.basicConfig(**logargs) + + cargs = NS.cmd + NS.args + popen_kwargs = { + "stdin": subprocess.PIPE, + } + + if NS.setpgrp == "self": + pgrp = os.getpgrp() + if pgrp != os.getpid(): + os.setpgrp() + pgrp = os.getpgrp() + elif NS.setpgrp == "child": + popen_kwargs["preexec_fn"] = os.setpgrp + pgrp = None + elif NS.setpgrp == "no": + pgrp = 0 + else: + assert False + + log.debug("executing %s", cargs) + p = subprocess.Popen(cargs, **popen_kwargs) + if pgrp is None: + pgrp = p.pid + flags = fcntl.fcntl(p.stdin.fileno(), fcntl.F_GETFL) + fcntl.fcntl(p.stdin.fileno(), fcntl.F_SETFL, flags | os.O_NONBLOCK) + + listen_for_events(sigfdr, p, NS.timeout) + + if p.returncode is None: + log.error("timeout expired: sending signal %d to command and myself", NS.signal) + if pgrp == 0: + os.kill(p.pid, NS.signal) + else: + os.killpg(pgrp, NS.signal) # should kill me too + os.kill(os.getpid(), NS.signal) # to exit abnormally with same signal + log.error("signal did not cause termination, sending myself SIGKILL") + os.kill(os.getpid(), signal.SIGKILL) # failsafe + rc = p.returncode + log.debug("rc = %d", rc) + assert rc is not None + if rc < 0: + log.error("command terminated with signal %d: sending same signal to myself!", -rc) + os.kill(os.getpid(), -rc) # kill myself with the same signal + log.error("signal did not cause termination, sending myself SIGKILL") + os.kill(os.getpid(), signal.SIGKILL) # failsafe + else: + log.info("command exited with status %d: 
exiting normally with same code!", rc) + sys.exit(rc) diff --git a/teuthology/task/install/deb.py b/teuthology/task/install/deb.py new file mode 100644 index 000000000..e1a290f78 --- /dev/null +++ b/teuthology/task/install/deb.py @@ -0,0 +1,226 @@ +import logging +import os + +from io import StringIO + +from teuthology.orchestra import run +from teuthology.contextutil import safe_while + +from teuthology.task.install.util import _get_builder_project, _get_local_dir + + +log = logging.getLogger(__name__) + +def _retry_if_eagain_in_output(remote, args): + # wait at most 5 minutes + with safe_while(sleep=10, tries=30) as proceed: + while proceed(): + stderr = StringIO() + try: + return remote.run(args=args, stderr=stderr) + except run.CommandFailedError: + if "could not get lock" in stderr.getvalue().lower(): + stdout = StringIO() + args = ['sudo', 'fuser', '-v', '/var/lib/dpkg/lock-frontend'] + remote.run(args=args, stdout=stdout) + log.info("The processes holding 'lock-frontend':\n{}".format(stdout.getvalue())) + continue + else: + raise + +def install_dep_packages(remote, args): + _retry_if_eagain_in_output(remote, args) + +def _update_package_list_and_install(ctx, remote, debs, config): + """ + Runs ``apt-get update`` first, then runs ``apt-get install``, installing + the requested packages on the remote system. + + TODO: split this into at least two functions. 
+ + :param ctx: the argparse.Namespace object + :param remote: the teuthology.orchestra.remote.Remote object + :param debs: list of packages names to install + :param config: the config dict + """ + + # check for ceph release key + r = remote.run( + args=[ + 'sudo', 'apt-key', 'list', run.Raw('|'), 'grep', 'Ceph', + ], + stdout=StringIO(), + check_status=False, + ) + if r.stdout.getvalue().find('Ceph automated package') == -1: + # if it doesn't exist, add it + remote.run( + args=[ + 'wget', '-q', '-O-', + 'http://git.ceph.com/?p=ceph.git;a=blob_plain;f=keys/autobuild.asc', # noqa + run.Raw('|'), + 'sudo', 'apt-key', 'add', '-', + ], + stdout=StringIO(), + ) + + builder = _get_builder_project(ctx, remote, config) + log.info("Installing packages: {pkglist} on remote deb {arch}".format( + pkglist=", ".join(debs), arch=builder.arch) + ) + system_pkglist = config.get('extra_system_packages') + if system_pkglist: + if isinstance(system_pkglist, dict): + system_pkglist = system_pkglist.get('deb') + log.info("Installing system (non-project) packages: {pkglist} on remote deb {arch}".format( + pkglist=", ".join(system_pkglist), arch=builder.arch) + ) + # get baseurl + log.info('Pulling from %s', builder.base_url) + + version = builder.version + log.info('Package version is %s', version) + + builder.install_repo() + + remote.run(args=['sudo', 'apt-get', 'update'], check_status=False) + install_cmd = [ + 'sudo', 'DEBIAN_FRONTEND=noninteractive', 'apt-get', '-y', + '--force-yes', + '-o', run.Raw('Dpkg::Options::="--force-confdef"'), '-o', run.Raw( + 'Dpkg::Options::="--force-confold"'), + 'install', + ] + install_dep_packages(remote, + args=install_cmd + ['%s=%s' % (d, version) for d in debs], + ) + if system_pkglist: + install_dep_packages(remote, + args=install_cmd + system_pkglist, + ) + ldir = _get_local_dir(config, remote) + if ldir: + for fyle in os.listdir(ldir): + fname = "%s/%s" % (ldir, fyle) + remote.run(args=['sudo', 'dpkg', '-i', fname],) + + +def _remove(ctx, 
config, remote, debs): + """ + Removes Debian packages from remote, rudely + + TODO: be less rude (e.g. using --force-yes) + + :param ctx: the argparse.Namespace object + :param config: the config dict + :param remote: the teuthology.orchestra.remote.Remote object + :param debs: list of packages names to install + """ + log.info("Removing packages: {pkglist} on Debian system.".format( + pkglist=", ".join(debs))) + # first ask nicely + remote.run( + args=[ + 'for', 'd', 'in', + ] + debs + [ + run.Raw(';'), + 'do', + 'sudo', + 'DEBIAN_FRONTEND=noninteractive', 'apt-get', '-y', '--force-yes', + '-o', run.Raw('Dpkg::Options::="--force-confdef"'), '-o', run.Raw( + 'Dpkg::Options::="--force-confold"'), 'purge', + run.Raw('$d'), + run.Raw('||'), + 'true', + run.Raw(';'), + 'done', + ]) + # mop up anything that is broken + remote.run( + args=[ + 'dpkg', '-l', + run.Raw('|'), + # Any package that is unpacked or half-installed and also requires + # reinstallation + 'grep', r'^.\(U\|H\)R', + run.Raw('|'), + 'awk', '{print $2}', + run.Raw('|'), + 'sudo', + 'xargs', '--no-run-if-empty', + 'dpkg', '-P', '--force-remove-reinstreq', + ]) + # then let apt clean up + remote.run( + args=[ + 'sudo', + 'DEBIAN_FRONTEND=noninteractive', 'apt-get', '-y', '--force-yes', + '-o', run.Raw('Dpkg::Options::="--force-confdef"'), '-o', run.Raw( + 'Dpkg::Options::="--force-confold"'), + 'autoremove', + ], + ) + + +def _remove_sources_list(ctx, config, remote): + builder = _get_builder_project(ctx, remote, config) + builder.remove_repo() + remote.run( + args=[ + 'sudo', 'apt-get', 'update', + ], + check_status=False, + ) + + +def _upgrade_packages(ctx, config, remote, debs): + """ + Upgrade project's packages on remote Debian host + Before doing so, installs the project's GPG key, writes a sources.list + file, and runs ``apt-get update``. 
+ + :param ctx: the argparse.Namespace object + :param config: the config dict + :param remote: the teuthology.orchestra.remote.Remote object + :param debs: the Debian packages to be installed + :param branch: the branch of the project to be used + """ + # check for ceph release key + r = remote.run( + args=[ + 'sudo', 'apt-key', 'list', run.Raw('|'), 'grep', 'Ceph', + ], + stdout=StringIO(), + check_status=False, + ) + if r.stdout.getvalue().find('Ceph automated package') == -1: + # if it doesn't exist, add it + remote.run( + args=[ + 'wget', '-q', '-O-', + 'http://git.ceph.com/?p=ceph.git;a=blob_plain;f=keys/autobuild.asc', # noqa + run.Raw('|'), + 'sudo', 'apt-key', 'add', '-', + ], + stdout=StringIO(), + ) + + builder = _get_builder_project(ctx, remote, config) + base_url = builder.base_url + log.info('Pulling from %s', base_url) + + version = builder.version + log.info('Package version is %s', version) + + builder.install_repo() + + remote.run(args=['sudo', 'apt-get', 'update'], check_status=False) + install_dep_packages(remote, + args=[ + 'sudo', + 'DEBIAN_FRONTEND=noninteractive', 'apt-get', '-y', '--force-yes', + '-o', run.Raw('Dpkg::Options::="--force-confdef"'), '-o', run.Raw( + 'Dpkg::Options::="--force-confold"'), + 'install', + ] + ['%s=%s' % (d, version) for d in debs], + ) diff --git a/teuthology/task/install/packages.yaml b/teuthology/task/install/packages.yaml new file mode 100644 index 000000000..602edacbd --- /dev/null +++ b/teuthology/task/install/packages.yaml @@ -0,0 +1,39 @@ +--- +ceph: + deb: + - ceph + - ceph-mds + - ceph-common + - ceph-fuse + - ceph-test + - radosgw + - python3-rados + - python3-rgw + - python3-cephfs + - python3-rbd + - libcephfs2 + - librados2 + - librbd1 + - rbd-fuse + - ceph-dbg + - ceph-mds-dbg + - ceph-common-dbg + - ceph-fuse-dbg + - radosgw-dbg + - libcephfs2-dbg + - librados2-dbg + - librbd1-dbg + rpm: + - ceph-radosgw + - ceph-test + - ceph + - ceph-fuse + - libcephfs2 + - librados2 + - librbd1 + - python3-rados 
+ - python3-rgw + - python3-cephfs + - python3-rbd + - rbd-fuse + - ceph-debuginfo diff --git a/teuthology/task/install/redhat.py b/teuthology/task/install/redhat.py new file mode 100644 index 000000000..511808865 --- /dev/null +++ b/teuthology/task/install/redhat.py @@ -0,0 +1,217 @@ +import contextlib +import logging +import yaml +import os + +from teuthology import packaging +from teuthology.orchestra import run +from teuthology.parallel import parallel +from teuthology.config import config as teuth_config + +log = logging.getLogger(__name__) + + +@contextlib.contextmanager +def install(ctx, config): + """ + Installs rh ceph on all hosts in ctx. + + :param ctx: the argparse.Namespace object + :param config: the config dict + + uses yaml defined in qa suite or in users + home dir to check for supported versions and + packages to install. + + the format of yaml is: + versions: + supported: + - '1.3.0' + rpm: + mapped: + '1.3.0' : '0.94.1' + deb: + mapped: + '1.3.0' : '0.94.1' + pkgs: + rpm: + - ceph-mon + - ceph-osd + deb: + - ceph-osd + - ceph-mds + """ + # Look for rh specific packages + ds_yaml = os.path.join( + teuth_config.get('ds_yaml_dir'), + config.get('rhbuild') + ".yaml", + ) + if not os.path.exists(ds_yaml): + raise FileNotFoundError(f'Downstream rh version yaml file missing: {ds_yaml}') + log.info("using yaml path %s", ds_yaml) + downstream_config = yaml.safe_load(open(ds_yaml)) + rh_versions = downstream_config.get('versions', dict()).get('supported', []) + external_config = dict(extra_system_packages=config.get('extra_system_packages'), + extra_packages=config.get('extra_packages'), + ) + downstream_config.update(external_config) + version = config.get('rhbuild') + if version in rh_versions: + log.info("%s is a supported version", version) + else: + raise RuntimeError("Unsupported RH Ceph version %s", version) + with parallel() as p: + for remote in ctx.cluster.remotes.keys(): + if remote.os.name == 'rhel': + log.info("Installing on RHEL node: %s", 
remote.shortname) + p.spawn(install_pkgs, ctx, remote, version, downstream_config) + else: + log.info("Install on Ubuntu node: %s", remote.shortname) + p.spawn(install_deb_pkgs, ctx, remote, version, + downstream_config) + try: + yield + finally: + if config.get('skip_uninstall'): + log.info("Skipping uninstall of Ceph") + else: + with parallel() as p: + for remote in ctx.cluster.remotes.keys(): + p.spawn(uninstall_pkgs, ctx, remote, downstream_config) + + +def install_pkgs(ctx, remote, version, downstream_config): + """ + Installs RH build using ceph-deploy. + + :param ctx: the argparse.Namespace object + :param remote: the teuthology.orchestra.remote.Remote object + :param downstream_config the dict object that has downstream pkg info + """ + rh_version_check = downstream_config.get('versions').get('rpm').get('mapped') + rh_rpm_pkgs = downstream_config.get('pkgs').get('rpm') + extras = [downstream_config.get('extra_system_packages'), + downstream_config.get('extra_packages')] + for extra in extras: + if isinstance(extra, dict): + rh_rpm_pkgs += extra.get('rpm', []) + elif isinstance(extra, list): + rh_rpm_pkgs += extra + pkgs = str.join(' ', rh_rpm_pkgs) + + log.info("Remove any epel packages installed on node %s", remote.shortname) + # below packages can come from epel and still work, ensure we use cdn pkgs + remote.run( + args=[ + 'sudo', + 'yum', + 'remove', + run.Raw("leveldb xmlstarlet fcgi"), + '-y'], + check_status=False) + + log.info("Installing redhat ceph packages") + remote.run(args=['sudo', 'yum', '-y', 'install', + run.Raw(pkgs)]) + # check package version + installed_version = packaging.get_package_version(remote, 'ceph-common') + log.info( + "Node: {n} Ceph version installed is {v}".format( + n=remote.shortname, + v=version)) + req_ver = rh_version_check[version] + if installed_version.startswith(req_ver): + log.info("Installed version matches on %s", remote.shortname) + else: + raise RuntimeError("Version check failed on node %s", 
remote.shortname) + + +def set_deb_repo(remote, deb_repo, deb_gpg_key=None): + """ + Sets up debian repo and gpg key for package verification + :param remote - remote node object + :param deb_repo - debian repo root path + :param deb_gpg_key - gpg key for the package + """ + repos = ['MON', 'OSD', 'Tools'] + log.info("deb repo: %s", deb_repo) + log.info("gpg key url: %s", deb_gpg_key) + # remove any additional repo so that upstream packages are not used + # all required packages come from downstream repo + remote.run(args=['sudo', 'rm', '-f', run.Raw('/etc/apt/sources.list.d/*')], + check_status=False) + for repo in repos: + cmd = 'echo deb {root}/{repo} $(lsb_release -sc) main'.format( + root=deb_repo, repo=repo) + remote.run(args=['sudo', run.Raw(cmd), run.Raw('>'), + "/tmp/{0}.list".format(repo)]) + remote.run(args=['sudo', 'cp', "/tmp/{0}.list".format(repo), + '/etc/apt/sources.list.d/']) + # add ds gpgkey + ds_keys = ['https://www.redhat.com/security/897da07a.txt', + 'https://www.redhat.com/security/f21541eb.txt'] + if deb_gpg_key is not None: + ds_keys.append(deb_gpg_key) + for key in ds_keys: + wget_cmd = 'wget -O - ' + key + remote.run(args=['sudo', run.Raw(wget_cmd), + run.Raw('|'), 'sudo', 'apt-key', 'add', run.Raw('-')]) + remote.run(args=['sudo', 'apt-get', 'update']) + + +def install_deb_pkgs( + ctx, + remote, + version, + downstream_config): + """ + Setup debian repo, Install gpg key + and Install on debian packages + : param ctx + : param remote + : param downstream_config the dict object that has downstream pkg info + """ + rh_version_check = downstream_config.get('versions').get('deb').get('mapped') + rh_deb_pkgs = downstream_config.get('pkgs').get('deb') + extras = [downstream_config.get('extra_system_packages'), + downstream_config.get('extra_packages')] + for extra in extras: + if isinstance(extra, dict): + rh_deb_pkgs += extra.get('deb', []) + elif isinstance(extra, list): + rh_deb_pkgs += extra + pkgs = str.join(' ', rh_deb_pkgs) + 
log.info("Installing redhat ceph packages") + remote.run(args=['sudo', 'apt-get', '-y', 'install', + run.Raw(pkgs)]) + # check package version + installed_version = packaging.get_package_version(remote, 'ceph-common') + log.info( + "Node: {n} Ceph version installed is {v}".format( + n=remote.shortname, + v=version)) + req_ver = rh_version_check[version] + if installed_version.startswith(req_ver): + log.info("Installed version matches on %s", remote.shortname) + else: + raise RuntimeError("Version check failed on node %s", remote.shortname) + + +def uninstall_pkgs(ctx, remote, downstream_config): + """ + Removes Ceph from all RH hosts + + :param ctx: the argparse.Namespace object + :param remote: the teuthology.orchestra.remote.Remote object + :param downstream_config the dict object that has downstream pkg info + """ + + if remote.os.name == 'rhel': + pkgs = downstream_config.get('pkgs').get('rpm') + if pkgs: + remote.sh(['sudo', 'yum', 'remove'] + pkgs + ['-y']) + else: + pkgs = downstream_config.get('pkgs').get('deb') + if pkgs: + remote.sh(['sudo', 'apt-get', 'remove'] + pkgs + ['-y']) + remote.run(args=['sudo', 'rm', '-rf', '/var/lib/ceph']) diff --git a/teuthology/task/install/rpm.py b/teuthology/task/install/rpm.py new file mode 100644 index 000000000..6a8999800 --- /dev/null +++ b/teuthology/task/install/rpm.py @@ -0,0 +1,433 @@ +import logging +import os.path +from io import StringIO + +from distutils.version import LooseVersion + +from teuthology.config import config as teuth_config +from teuthology.contextutil import safe_while +from teuthology.orchestra import run +from teuthology import packaging + +from teuthology.task.install.util import _get_builder_project, _get_local_dir + +log = logging.getLogger(__name__) + + +def _remove(ctx, config, remote, rpm): + """ + Removes RPM packages from remote + + :param ctx: the argparse.Namespace object + :param config: the config dict + :param remote: the teuthology.orchestra.remote.Remote object + :param rpm: list 
of packages names to remove + """ + remote_os = remote.os + dist_release = remote_os.name + + install_ceph_packages = config.get('install_ceph_packages') + if install_ceph_packages: + log.info("Removing packages: {pkglist} on rpm system.".format( + pkglist=", ".join(rpm))) + if dist_release in ['opensuse', 'sle']: + remote.run(args=''' + for d in {rpms} ; do + sudo zypper -n --no-gpg-checks remove --capability $d || true + done'''.format(rpms=' '.join(rpm))) + remote.run(args='sudo zypper clean -a') + else: + remote.run(args=''' + for d in {rpms} ; do + sudo yum -y remove $d || true + done'''.format(rpms=' '.join(rpm))) + remote.run(args='sudo yum clean all') + else: + log.info("install task did not install any packages, " + "so not removing any, either") + + repos = config.get('repos') + if repos: + if dist_release in ['opensuse', 'sle']: + _zypper_removerepo(remote, repos) + else: + raise Exception('Custom repos were specified for %s ' % remote_os + + 'but these are currently not supported') + else: + builder = _get_builder_project(ctx, remote, config) + builder.remove_repo() + + if dist_release in ['opensuse', 'sle']: + #remote.run(args='sudo zypper clean -a') + log.info("Not cleaning zypper cache: this might fail, and is not needed " + "because the test machine will be destroyed or reimaged anyway") + else: + remote.run(args='sudo yum clean expire-cache') + + +def _zypper_addrepo(remote, repo_list): + """ + Add zypper repos to the remote system. 
+ + :param remote: remote node where to add packages + :param repo_list: list of dictionaries with keys 'name', 'url' + :return: + """ + for repo in repo_list: + if 'priority' in repo: + remote.run(args=[ + 'sudo', 'zypper', '-n', 'addrepo', '--refresh', '--no-gpgcheck', + '-p', str(repo['priority']), repo['url'], repo['name'], + ]) + else: + remote.run(args=[ + 'sudo', 'zypper', '-n', 'addrepo', '--refresh', '--no-gpgcheck', + repo['url'], repo['name'], + ]) + # Because 'zypper addrepo --check' does not work as expected + # we need call zypper ref in order to fail early if the repo + # is invalid + remote.run(args='sudo zypper ref ' + repo['name']) + +def _zypper_removerepo(remote, repo_list): + """ + Remove zypper repos on the remote system. + + :param remote: remote node where to remove packages from + :param repo_list: list of dictionaries with keys 'name', 'url' + :return: + """ + for repo in repo_list: + remote.run(args=[ + 'sudo', 'zypper', '-n', 'removerepo', repo['name'], + ]) + +def _zypper_wipe_all_repos(remote): + """ + Completely "wipe" (remove) all zypper repos + + :param remote: remote node where to wipe zypper repos + :return: + """ + log.info("Wiping zypper repos (if any)") + remote.sh('sudo zypper repos -upEP && ' + 'sudo rm -f /etc/zypp/repos.d/* || ' + 'true') + +def _downgrade_packages(ctx, remote, pkgs, pkg_version, config): + """ + Downgrade packages listed by 'downgrade_packages' + + Downgrade specified packages to given version. The list of packages + downgrade is provided by 'downgrade_packages' as a property of "install" + task. 
+ + :param ctx: the argparse.Namespace object + :param remote: the teuthology.orchestra.remote.Remote object + :param pkgs: list of package names to install + :param pkg_version: the version to which all packages will be downgraded + :param config: the config dict + :return: list of package names from 'pkgs' which are not yet + installed/downgraded + """ + downgrade_pkgs = config.get('downgrade_packages', []) + if not downgrade_pkgs: + return pkgs + log.info('Downgrading packages: {pkglist}'.format( + pkglist=', '.join(downgrade_pkgs))) + # assuming we are going to downgrade packages with the same version + first_pkg = downgrade_pkgs[0] + installed_version = packaging.get_package_version(remote, first_pkg) + assert installed_version, "failed to get version of {}".format(first_pkg) + assert LooseVersion(installed_version) > LooseVersion(pkg_version) + # to compose package name like "librados2-0.94.10-87.g116a558.el7" + pkgs_opt = ['-'.join([pkg, pkg_version]) for pkg in downgrade_pkgs] + remote.run(args='sudo yum -y downgrade {}'.format(' '.join(pkgs_opt))) + return [pkg for pkg in pkgs if pkg not in downgrade_pkgs] + +def _retry_if_failures_are_recoverable(remote, args): + # wait at most 5 minutes + with safe_while(sleep=10, tries=30) as proceed: + while proceed(): + stdout = StringIO() + stderr = StringIO() + try: + return remote.run(args=args, stderr=stderr, stdout=stdout) + except run.CommandFailedError: + if "status code: 503" in stdout.getvalue().lower(): + continue + if "failed to download metadata for repo" in stderr.getvalue().lower(): + continue + else: + raise + +def _update_package_list_and_install(ctx, remote, rpm, config): + """ + Installs the repository for the relevant branch, then installs + the requested packages on the remote system. + + TODO: split this into at least two functions. 
+ + :param ctx: the argparse.Namespace object + :param remote: the teuthology.orchestra.remote.Remote object + :param rpm: list of packages names to install + :param config: the config dict + """ + + enable_coprs = config.get('enable_coprs', []) + if len(enable_coprs): + remote.run(args=['sudo', 'dnf', '-y', 'install', 'dnf-command(copr)']) + for copr in enable_coprs: + remote.run(args=['sudo', 'dnf', '-y', 'copr', 'enable', copr]) + + remote_os = remote.os + dist_release = remote_os.name + log.debug("_update_package_list_and_install: config is {}".format(config)) + repos = config.get('repos') + install_ceph_packages = config.get('install_ceph_packages') + repos_only = config.get('repos_only') + + if repos: + log.debug("Adding repos: %s" % repos) + if dist_release in ['opensuse', 'sle']: + _zypper_wipe_all_repos(remote) + _zypper_addrepo(remote, repos) + else: + raise Exception('Custom repos were specified for %s ' % remote_os + + 'but these are currently not supported') + else: + builder = _get_builder_project(ctx, remote, config) + log.info('Pulling from %s', builder.base_url) + log.info('Package version is %s', builder.version) + builder.install_repo() + + if repos_only: + log.info("repos_only was specified: not installing any packages") + return None + + packages = list(rpm) + if not install_ceph_packages: + log.info("install_ceph_packages set to False: not installing Ceph packages") + # Although "librados2" is an indirect dependency of ceph-test, we + # install it separately because, otherwise, ceph-test cannot be + # installed (even with --force) when there are several conflicting + # repos from different vendors. 
+ packages = ["librados2", "ceph-test"] + + # rpm does not force installation of a particular version of the project + # packages, so we can put extra_system_packages together with the rest + system_pkglist = config.get('extra_system_packages', []) + if system_pkglist: + if isinstance(system_pkglist, dict): + packages += system_pkglist.get('rpm') + else: + packages += system_pkglist + + log.info("Installing packages: {pkglist} on remote rpm {arch}".format( + pkglist=", ".join(packages), arch=remote.arch)) + + if dist_release not in ['opensuse', 'sle']: + project = builder.project + uri = builder.uri_reference + _yum_fix_repo_priority(remote, project, uri) + _yum_fix_repo_host(remote, project) + _yum_set_check_obsoletes(remote) + + if dist_release in ['opensuse', 'sle']: + remote.run(args='sudo zypper clean -a') + else: + remote.run(args='sudo yum clean all') + + ldir = _get_local_dir(config, remote) + + if dist_release in ['opensuse', 'sle']: + remove_cmd = 'sudo zypper -n remove --capability' + # NOTE: --capability contradicts --force + install_cmd = 'sudo zypper -n --no-gpg-checks install --force --no-recommends' + else: + remove_cmd = 'sudo yum -y remove' + install_cmd = 'sudo yum -y install' + # to compose version string like "0.94.10-87.g116a558.el7" + pkg_version = '.'.join([builder.version, builder.dist_release]) + packages = _downgrade_packages(ctx, remote, packages, pkg_version, config) + + if system_pkglist: + _retry_if_failures_are_recoverable(remote, + args='{install_cmd} {rpms}' + .format(install_cmd=install_cmd, rpms=' '.join(packages)) + ) + else: + for cpack in packages: + if ldir: + _retry_if_failures_are_recoverable(remote, + args=''' + if test -e {pkg} ; then + {remove_cmd} {pkg} ; + {install_cmd} {pkg} ; + else + {install_cmd} {cpack} ; + fi + '''.format(remove_cmd=remove_cmd, + install_cmd=install_cmd, + pkg=os.path.join(ldir, cpack), + cpack=cpack)) + else: + _retry_if_failures_are_recoverable(remote, + args='{install_cmd} {cpack}' + 
.format(install_cmd=install_cmd, cpack=cpack) + ) + +def _yum_fix_repo_priority(remote, project, uri): + """ + On the remote, 'priority=1' lines to each enabled repo in: + + /etc/yum.repos.d/{project}.repo + + :param remote: the teuthology.orchestra.remote.Remote object + :param project: the project whose repos need modification + """ + repo_path = '/etc/yum.repos.d/%s.repo' % project + remote.run( + args=[ + 'if', 'test', '-f', repo_path, run.Raw(';'), 'then', + 'sudo', 'sed', '-i', '-e', + run.Raw('\':a;N;$!ba;s/enabled=1\\ngpg/enabled=1\\npriority=1\\ngpg/g\''), + '-e', + run.Raw("'s;ref/[a-zA-Z0-9_-]*/;{uri}/;g'".format(uri=uri)), + repo_path, run.Raw(';'), 'fi' + ] + ) + + +def _yum_fix_repo_host(remote, project): + """ + Update the hostname to reflect the gitbuilder_host setting. + """ + # Skip this bit if we're not using gitbuilder + if not isinstance(packaging.get_builder_project(), + packaging.GitbuilderProject): + return + old_host = teuth_config._defaults['gitbuilder_host'] + new_host = teuth_config.gitbuilder_host + if new_host == old_host: + return + repo_path = '/etc/yum.repos.d/%s.repo' % project + host_sed_expr = "'s/{0}/{1}/'".format(old_host, new_host) + remote.run( + args=[ + 'if', 'test', '-f', repo_path, run.Raw(';'), 'then', + 'sudo', 'sed', '-i', '-e', run.Raw(host_sed_expr), + repo_path, run.Raw(';'), 'fi'] + ) + + +def _yum_set_check_obsoletes(remote): + """ + Set check_obsoletes = 1 in /etc/yum/pluginconf.d/priorities.conf + + Creates a backup at /etc/yum/pluginconf.d/priorities.conf.orig so we can + restore later. 
+ """ + conf_path = '/etc/yum/pluginconf.d/priorities.conf' + conf_path_orig = conf_path + '.orig' + cmd = [ + 'sudo', 'touch', '-a', '/etc/yum/pluginconf.d/priorities.conf', run.Raw(';'), + 'test', '-e', conf_path_orig, run.Raw('||'), 'sudo', 'cp', '-af', + conf_path, conf_path_orig, + ] + remote.run(args=cmd) + cmd = [ + 'grep', 'check_obsoletes', conf_path, run.Raw('&&'), 'sudo', 'sed', + '-i', 's/check_obsoletes.*0/check_obsoletes = 1/g', conf_path, + run.Raw('||'), 'echo', 'check_obsoletes = 1', run.Raw('|'), 'sudo', + 'tee', '-a', conf_path, + ] + remote.run(args=cmd) + + +def _yum_unset_check_obsoletes(remote): + """ + Restore the /etc/yum/pluginconf.d/priorities.conf backup + """ + conf_path = '/etc/yum/pluginconf.d/priorities.conf' + conf_path_orig = conf_path + '.orig' + remote.run(args=['sudo', 'mv', '-f', conf_path_orig, conf_path], + check_status=False) + + +def _remove_sources_list(ctx, config, remote): + """ + Removes /etc/yum.repos.d/{proj}.repo + + :param remote: the teuthology.orchestra.remote.Remote object + :param proj: the project whose .repo needs removing + """ + builder = _get_builder_project(ctx, remote, config) + builder.remove_repo() + if remote.os.name not in ['opensuse', 'sle']: + _yum_unset_check_obsoletes(remote) + + for copr in config.get('enable_coprs', []): + remote.run(args=['sudo', 'dnf', '-y', 'copr', 'disable', copr]) + +def _upgrade_packages(ctx, config, remote, pkgs): + """ + Upgrade project's packages on remote RPM-based host + Before doing so, it makes sure the project's repository is installed - + removing any previous version first. 
+ + :param ctx: the argparse.Namespace object + :param config: the config dict + :param remote: the teuthology.orchestra.remote.Remote object + :param pkgs: the RPM packages to be installed + :param branch: the branch of the project to be used + """ + builder = _get_builder_project(ctx, remote, config) + log.info( + "Host {host} is: {distro} {ver} {arch}".format( + host=remote.shortname, + distro=builder.os_type, + ver=builder.os_version, + arch=builder.arch,) + ) + + base_url = builder.base_url + log.info('Repo base URL: %s', base_url) + project = builder.project + + # Remove the repository before re-adding it + builder.remove_repo() + builder.install_repo() + + if builder.dist_release not in ['opensuse', 'sle']: + uri = builder.uri_reference + _yum_fix_repo_priority(remote, project, uri) + _yum_fix_repo_host(remote, project) + _yum_set_check_obsoletes(remote) + + if builder.dist_release in ['opensuse', 'sle']: + pkg_mng_cmd = 'zypper' + pkg_mng_opts = '-a' + else: + pkg_mng_cmd = 'yum' + pkg_mng_opts = 'all' + + remote.run( + args=[ + 'sudo', pkg_mng_cmd, 'clean', pkg_mng_opts, + ]) + + # Actually upgrade the project packages + if builder.dist_release in ['opensuse', 'sle']: + pkg_mng_opts = '-n' + pkg_mng_subcommand = 'install' + pkg_mng_subcommand_opts = ['--capability', '--no-recommends'] + else: + pkg_mng_opts = '-y' + pkg_mng_subcommand = 'upgrade' + pkg_mng_subcommand_opts = [] + args = ['sudo', pkg_mng_cmd, pkg_mng_opts, pkg_mng_subcommand] + if pkg_mng_subcommand_opts: + args += pkg_mng_subcommand_opts + args += pkgs + remote.run(args=args) diff --git a/teuthology/task/install/util.py b/teuthology/task/install/util.py new file mode 100644 index 000000000..46fbde9c9 --- /dev/null +++ b/teuthology/task/install/util.py @@ -0,0 +1,153 @@ +import contextlib +import logging +import os + +from teuthology import misc as teuthology +from teuthology import packaging +from teuthology.orchestra import run + +log = logging.getLogger(__name__) + + +def 
_get_builder_project(ctx, remote, config): + return packaging.get_builder_project()( + config.get('project', 'ceph'), + config, + remote=remote, + ctx=ctx + ) + + +def _get_local_dir(config, remote): + """ + Extract local directory name from the task lists. + Copy files over to the remote site. + """ + ldir = config.get('local', None) + if ldir: + remote.run(args=['sudo', 'mkdir', '-p', ldir]) + for fyle in os.listdir(ldir): + fname = "%s/%s" % (ldir, fyle) + teuthology.sudo_write_file( + remote, fname, open(fname).read(), '644') + return ldir + + +def get_flavor(config): + """ + Determine the flavor to use. + """ + config = config or dict() + flavor = config.get('flavor', 'default') + + if config.get('path'): + # local dir precludes any other flavors + flavor = 'local' + else: + if config.get('valgrind'): + flavor = 'notcmalloc' + else: + if config.get('coverage'): + flavor = 'gcov' + return flavor + +def _ship_utilities(ctx): + """ + Write a copy of valgrind.supp to each of the remote sites. Set executables + used by Ceph in /usr/local/bin. When finished (upon exit of the teuthology + run), remove these files. 
+ + :param ctx: Context + """ + testdir = teuthology.get_testdir(ctx) + filenames = [] + + log.info('Shipping valgrind.supp...') + assert 'suite_path' in ctx.config + try: + with open( + os.path.join(ctx.config['suite_path'], 'valgrind.supp'), + 'rb' + ) as f: + fn = os.path.join(testdir, 'valgrind.supp') + filenames.append(fn) + for rem in ctx.cluster.remotes.keys(): + teuthology.sudo_write_file( + remote=rem, + path=fn, + data=f, + ) + f.seek(0) + except IOError as e: + log.info('Cannot ship supression file for valgrind: %s...', e.strerror) + + FILES = ['daemon-helper', 'adjust-ulimits', 'stdin-killer'] + destdir = '/usr/bin' + for filename in FILES: + log.info('Shipping %r...', filename) + src = os.path.join(os.path.dirname(__file__), 'bin', filename) + dst = os.path.join(destdir, filename) + filenames.append(dst) + with open(src, 'rb') as f: + for rem in ctx.cluster.remotes.keys(): + teuthology.sudo_write_file( + remote=rem, + path=dst, + data=f, + ) + f.seek(0) + rem.run( + args=[ + 'sudo', + 'chmod', + 'a=rx', + '--', + dst, + ], + ) + return filenames + +def _remove_utilities(ctx, filenames): + """ + Remove the shipped utilities. + + :param ctx: Context + :param filenames: The utilities install paths + """ + log.info('Removing shipped files: %s...', ' '.join(filenames)) + if filenames == []: + return + run.wait( + ctx.cluster.run( + args=[ + 'sudo', + 'rm', + '-f', + '--', + ] + list(filenames), + wait=False, + ), + ) + +@contextlib.contextmanager +def ship_utilities(ctx, config): + """ + Ship utilities during the first call, and skip it in the following ones. + See also `_ship_utilities`. 
+ + :param ctx: Context + :param config: Configuration + """ + assert config is None + + do_ship_utilities = ctx.get('do_ship_utilities', True) + if do_ship_utilities: + ctx['do_ship_utilities'] = False + filenames = _ship_utilities(ctx) + try: + yield + finally: + _remove_utilities(ctx, filenames) + else: + log.info('Utilities already shipped, skip it...') + yield diff --git a/teuthology/task/interactive.py b/teuthology/task/interactive.py new file mode 100644 index 000000000..dd1676e49 --- /dev/null +++ b/teuthology/task/interactive.py @@ -0,0 +1,40 @@ +""" +Drop into a python shell +""" +import code +import readline +import rlcompleter +rlcompleter.__name__ # silence pyflakes +import pprint + +readline.parse_and_bind('tab: complete') + +def task(ctx, config): + """ + Run an interactive Python shell, with the cluster accessible via + the ``ctx`` variable. + + Hit ``control-D`` to continue. + + This is also useful to pause the execution of the test between two + tasks, either to perform ad hoc operations, or to examine the + state of the cluster. You can also use it to easily bring up a + Ceph cluster for ad hoc testing. + + For example:: + + tasks: + - ceph: + - interactive: + """ + + pp = pprint.PrettyPrinter().pprint + code.interact( + banner='Ceph test interactive mode, use ctx to interact with the cluster, press control-D to exit...', + # TODO simplify this + local=dict( + ctx=ctx, + config=config, + pp=pp, + ), + ) diff --git a/teuthology/task/internal/__init__.py b/teuthology/task/internal/__init__.py new file mode 100644 index 000000000..15b8f81f5 --- /dev/null +++ b/teuthology/task/internal/__init__.py @@ -0,0 +1,549 @@ +""" +Internal tasks are tasks that are started from the teuthology infrastructure. +Note that there is no corresponding task defined for this module. 
All of +the calls are made from other modules, most notably teuthology/run.py +""" +import contextlib +import functools +import gzip +import logging +import os +import shutil +import time +import yaml +import subprocess +import tempfile +import re +import humanfriendly + +import teuthology.lock.ops +from teuthology import misc, packaging +from teuthology import report +from teuthology.config import config as teuth_config +from teuthology.exceptions import ConfigError, VersionNotFoundError +from teuthology.job_status import get_status, set_status +from teuthology.orchestra import cluster, remote, run +# the below import with noqa is to workaround run.py which does not support multilevel submodule import +from teuthology.task.internal.redhat import (setup_cdn_repo, setup_base_repo, # noqa + setup_additional_repo, # noqa + setup_stage_cdn, setup_container_registry) # noqa + +log = logging.getLogger(__name__) + + +@contextlib.contextmanager +def base(ctx, config): + """ + Create the test directory that we will be using on the remote system + """ + log.info('Creating test directory...') + testdir = misc.get_testdir(ctx) + run.wait( + ctx.cluster.run( + args=['mkdir', '-p', '-m0755', '--', testdir], + wait=False, + ) + ) + try: + yield + finally: + log.info('Tidying up after the test...') + # if this fails, one of the earlier cleanups is flawed; don't + # just cram an rm -rf here + run.wait( + ctx.cluster.run( + args=['find', testdir, '-ls', + run.Raw(';'), + 'rmdir', '--', testdir], + wait=False, + ), + ) + + +def save_config(ctx, config): + """ + Store the config in a yaml file + """ + log.info('Saving configuration') + if ctx.archive is not None: + with open(os.path.join(ctx.archive, 'config.yaml'), 'w') as f: + yaml.safe_dump(ctx.config, f, default_flow_style=False) + + +def check_packages(ctx, config): + """ + Checks gitbuilder to determine if there are missing packages for this job. + + If there are missing packages, fail the job. 
+ """ + for task in ctx.config['tasks']: + if list(task.keys())[0] == 'buildpackages': + log.info("Checking packages skipped because " + "the task buildpackages was found.") + return + + log.info("Checking packages...") + os_type = ctx.config.get("os_type") + sha1 = ctx.config.get("sha1") + # We can only do this check if there are a defined sha1 and os_type + # in the job config. + if os_type and sha1: + package = packaging.get_builder_project()("ceph", ctx.config) + template = "Checking packages for os_type '{os}', " \ + "flavor '{flav}' and ceph hash '{ver}'" + log.info( + template.format( + os=package.os_type, + flav=package.flavor, + ver=package.sha1, + ) + ) + if package.version: + log.info("Found packages for ceph version {ver}".format( + ver=package.version + )) + else: + msg = "Packages for distro '{d}' and ceph hash '{ver}' not found" + msg = msg.format( + d=package.distro, + ver=package.sha1, + ) + log.error(msg) + # set the failure message and update paddles with the status + ctx.summary["failure_reason"] = msg + set_status(ctx.summary, "dead") + report.try_push_job_info(ctx.config, dict(status='dead')) + raise VersionNotFoundError(package.base_url) + else: + log.info( + "Checking packages skipped, missing os_type '{os}' or ceph hash '{ver}'".format( + os=os_type, + ver=sha1, + ) + ) + + +@contextlib.contextmanager +def timer(ctx, config): + """ + Start the timer used by teuthology + """ + log.info('Starting timer...') + start = time.time() + try: + yield + finally: + duration = time.time() - start + log.info('Duration was %f seconds', duration) + ctx.summary['duration'] = duration + + +def add_remotes(ctx, config): + """ + Create a ctx.cluster object populated with remotes mapped to roles + """ + ctx.cluster = cluster.Cluster() + # Allow jobs to run without using nodes, for self-testing + if 'roles' not in ctx.config and 'targets' not in ctx.config: + return + remotes = [] + machs = [] + for name in ctx.config['targets'].keys(): + machs.append(name) + 
for t, key in ctx.config['targets'].items(): + t = misc.canonicalize_hostname(t) + try: + if ctx.config['sshkeys'] == 'ignore': + key = None + except (AttributeError, KeyError): + pass + rem = remote.Remote(name=t, host_key=key, keep_alive=True) + remotes.append(rem) + if 'roles' in ctx.config: + for rem, roles in zip(remotes, ctx.config['roles']): + assert all(isinstance(role, str) for role in roles), \ + "Roles in config must be strings: %r" % roles + ctx.cluster.add(rem, roles) + log.info('roles: %s - %s' % (rem, roles)) + else: + for rem in remotes: + ctx.cluster.add(rem, rem.name) + + +def connect(ctx, config): + """ + Connect to all remotes in ctx.cluster + """ + log.info('Opening connections...') + for rem in ctx.cluster.remotes.keys(): + log.debug('connecting to %s', rem.name) + rem.connect() + + +def push_inventory(ctx, config): + if not teuth_config.lock_server: + return + + def push(): + for rem in ctx.cluster.remotes.keys(): + info = rem.inventory_info + teuthology.lock.ops.update_inventory(info) + try: + push() + except Exception: + log.exception("Error pushing inventory") + +BUILDPACKAGES_FIRST = 0 +BUILDPACKAGES_OK = 1 +BUILDPACKAGES_REMOVED = 2 +BUILDPACKAGES_NOTHING = 3 + +def buildpackages_prep(ctx, config): + """ + Make sure the 'buildpackages' task happens before + the 'install' task. 
+ + Return: + + BUILDPACKAGES_NOTHING if there is no buildpackages task + BUILDPACKAGES_REMOVED if there is a buildpackages task but no install task + BUILDPACKAGES_FIRST if a buildpackages task was moved at the beginning + BUILDPACKAGES_OK if a buildpackages task already at the beginning + """ + index = 0 + install_index = None + buildpackages_index = None + buildpackages_prep_index = None + for task in ctx.config['tasks']: + t = list(task)[0] + if t == 'install': + install_index = index + if t == 'buildpackages': + buildpackages_index = index + if t == 'internal.buildpackages_prep': + buildpackages_prep_index = index + index += 1 + if (buildpackages_index is not None and + install_index is not None): + if buildpackages_index > buildpackages_prep_index + 1: + log.info('buildpackages moved to be the first task') + buildpackages = ctx.config['tasks'].pop(buildpackages_index) + ctx.config['tasks'].insert(buildpackages_prep_index + 1, + buildpackages) + return BUILDPACKAGES_FIRST + else: + log.info('buildpackages is already the first task') + return BUILDPACKAGES_OK + elif buildpackages_index is not None and install_index is None: + ctx.config['tasks'].pop(buildpackages_index) + all_tasks = [list(x.keys())[0] for x in ctx.config['tasks']] + log.info('buildpackages removed because no install task found in ' + + str(all_tasks)) + return BUILDPACKAGES_REMOVED + elif buildpackages_index is None: + log.info('no buildpackages task found') + return BUILDPACKAGES_NOTHING + + +def serialize_remote_roles(ctx, config): + """ + Provides an explicit mapping for which remotes have been assigned what roles + So that other software can be loosely coupled to teuthology + """ + if ctx.archive is not None: + with open(os.path.join(ctx.archive, 'info.yaml'), 'r+') as info_file: + info_yaml = yaml.safe_load(info_file) + info_file.seek(0) + info_yaml['cluster'] = dict([(rem.name, {'roles': roles}) for rem, roles in ctx.cluster.remotes.items()]) + yaml.safe_dump(info_yaml, info_file, 
default_flow_style=False) + + +def check_ceph_data(ctx, config): + """ + Check for old /var/lib/ceph subdirectories and detect staleness. + """ + log.info('Checking for non-empty /var/lib/ceph...') + processes = ctx.cluster.run( + args='test -z $(ls -A /var/lib/ceph)', + wait=False, + ) + failed = False + for proc in processes: + try: + proc.wait() + except run.CommandFailedError: + log.error('Host %s has stale /var/lib/ceph!', proc.remote.shortname) + failed = True + if failed: + raise RuntimeError('Stale /var/lib/ceph detected, aborting.') + + +def check_conflict(ctx, config): + """ + Note directory use conflicts and stale directories. + """ + log.info('Checking for old test directory...') + testdir = misc.get_testdir(ctx) + processes = ctx.cluster.run( + args=['test', '!', '-e', testdir], + wait=False, + ) + failed = False + for proc in processes: + try: + proc.wait() + except run.CommandFailedError: + log.error('Host %s has stale test directory %s, check lock and cleanup.', proc.remote.shortname, testdir) + failed = True + if failed: + raise RuntimeError('Stale jobs detected, aborting.') + + +def fetch_binaries_for_coredumps(path, remote): + """ + Pul ELFs (debug and stripped) for each coredump found + """ + # Check for Coredumps: + coredump_path = os.path.join(path, 'coredump') + if os.path.isdir(coredump_path): + log.info('Transferring binaries for coredumps...') + for dump in os.listdir(coredump_path): + # Pull program from core file + dump_path = os.path.join(coredump_path, dump) + dump_info = subprocess.Popen(['file', dump_path], + stdout=subprocess.PIPE) + dump_out = dump_info.communicate()[0].decode() + + # Parse file output to get program, Example output: + # 1422917770.7450.core: ELF 64-bit LSB core file x86-64, version 1 (SYSV), SVR4-style, \ + # from 'radosgw --rgw-socket-path /home/ubuntu/cephtest/apache/tmp.client.0/fastcgi_soc' + log.info(f' core looks like: {dump_out}') + + if 'gzip' in dump_out: + try: + log.info("core is compressed, try 
accessing gzip file ...") + with gzip.open(dump_path, 'rb') as f_in, \ + tempfile.NamedTemporaryFile(mode='w+b') as f_out: + shutil.copyfileobj(f_in, f_out) + dump_info = subprocess.Popen(['file', f_out.name], + stdout=subprocess.PIPE) + dump_out = dump_info.communicate()[0].decode() + log.info(f' core looks like: {dump_out}') + except Exception as e: + log.info('Something went wrong while opening the compressed file') + log.error(e) + continue + try: + dump_command = re.findall("from '([^ ']+)", dump_out)[0] + dump_program = dump_command.split()[0] + log.info(f' dump_program: {dump_program}') + except Exception as e: + log.info("core doesn't have the desired format, moving on ...") + log.error(e) + continue + + # Find path on remote server: + remote_path = remote.sh(['which', dump_program]).rstrip() + + # Pull remote program into coredump folder: + local_path = os.path.join(coredump_path, + dump_program.lstrip(os.path.sep)) + local_dir = os.path.dirname(local_path) + if not os.path.exists(local_dir): + os.makedirs(local_dir) + remote._sftp_get_file(remote_path, local_path) + + # Pull Debug symbols: + debug_path = os.path.join('/usr/lib/debug', remote_path) + + # RPM distro's append their non-stripped ELF's with .debug + # When deb based distro's do not. + if remote.system_type == 'rpm': + debug_path = '{debug_path}.debug'.format(debug_path=debug_path) + + remote.get_file(debug_path, coredump_path) + + +def gzip_if_too_large(compress_min_size, src, tarinfo, local_path): + if tarinfo.size >= compress_min_size: + with gzip.open(local_path + '.gz', 'wb') as dest: + shutil.copyfileobj(src, dest) + else: + misc.copy_fileobj(src, tarinfo, local_path) + + +@contextlib.contextmanager +def archive(ctx, config): + """ + Handle the creation and deletion of the archive directory. 
+ """ + log.info('Creating archive directory...') + archive_dir = misc.get_archive_dir(ctx) + run.wait( + ctx.cluster.run( + args=['install', '-d', '-m0755', '--', archive_dir], + wait=False, + ) + ) + + # Add logs directory to job's info log file + misc.add_remote_path(ctx, 'init', archive_dir) + + try: + yield + except Exception: + # we need to know this below + set_status(ctx.summary, 'fail') + raise + finally: + passed = get_status(ctx.summary) == 'pass' + if ctx.archive is not None and \ + not (ctx.config.get('archive-on-error') and passed): + log.info('Transferring archived files...') + logdir = os.path.join(ctx.archive, 'remote') + if (not os.path.exists(logdir)): + os.mkdir(logdir) + for rem in ctx.cluster.remotes.keys(): + path = os.path.join(logdir, rem.shortname) + min_size_option = ctx.config.get('log-compress-min-size', + '128MB') + try: + compress_min_size_bytes = \ + humanfriendly.parse_size(min_size_option) + except humanfriendly.InvalidSize: + msg = 'invalid "log-compress-min-size": {}'.format(min_size_option) + log.error(msg) + raise ConfigError(msg) + maybe_compress = functools.partial(gzip_if_too_large, + compress_min_size_bytes) + misc.pull_directory(rem, archive_dir, path, maybe_compress) + # Check for coredumps and pull binaries + fetch_binaries_for_coredumps(path, rem) + + log.info('Removing archive directory...') + run.wait( + ctx.cluster.run( + args=['rm', '-rf', '--', archive_dir], + wait=False, + ), + ) + + +@contextlib.contextmanager +def sudo(ctx, config): + """ + Enable use of sudo + """ + log.info('Configuring sudo...') + sudoers_file = '/etc/sudoers' + backup_ext = '.orig.teuthology' + tty_expr = r's/^\([^#]*\) \(requiretty\)/\1 !\2/g' + pw_expr = r's/^\([^#]*\) !\(visiblepw\)/\1 \2/g' + + run.wait( + ctx.cluster.run( + args="sudo sed -i{ext} -e '{tty}' -e '{pw}' {path}".format( + ext=backup_ext, tty=tty_expr, pw=pw_expr, + path=sudoers_file + ), + wait=False, + ) + ) + try: + yield + finally: + log.info('Restoring 
{0}...'.format(sudoers_file)) + ctx.cluster.run( + args="sudo mv -f {path}{ext} {path}".format( + path=sudoers_file, ext=backup_ext + ) + ) + + +@contextlib.contextmanager +def coredump(ctx, config): + """ + Stash a coredump of this system if an error occurs. + """ + log.info('Enabling coredump saving...') + cluster = ctx.cluster.filter(lambda r: not r.is_container) + archive_dir = misc.get_archive_dir(ctx) + run.wait( + cluster.run( + args=[ + 'install', '-d', '-m0755', '--', + '{adir}/coredump'.format(adir=archive_dir), + run.Raw('&&'), + 'sudo', 'sysctl', '-w', 'kernel.core_pattern={adir}/coredump/%t.%p.core'.format(adir=archive_dir), + run.Raw('&&'), + 'echo', + 'kernel.core_pattern={adir}/coredump/%t.%p.core'.format(adir=archive_dir), + run.Raw('|'), + 'sudo', 'tee', '-a', '/etc/sysctl.conf', + ], + wait=False, + ) + ) + + try: + yield + finally: + cluster = ctx.cluster.filter(lambda r: not r.is_container) + run.wait( + cluster.run( + args=[ + 'sudo', 'sysctl', '-w', 'kernel.core_pattern=core', + run.Raw('&&'), + 'sudo', 'bash', '-c', + (f'for f in `find {archive_dir}/coredump -type f`; do ' + 'file $f | grep -q systemd-sysusers && rm $f || true ; ' + 'done'), + run.Raw('&&'), + # don't litter the archive dir if there were no cores dumped + 'rmdir', + '--ignore-fail-on-non-empty', + '--', + '{adir}/coredump'.format(adir=archive_dir), + ], + wait=False, + ) + ) + + # set status = 'fail' if the dir is still there = coredumps were + # seen + for rem in cluster.remotes.keys(): + try: + rem.sh("test -e " + archive_dir + "/coredump") + except run.CommandFailedError: + continue + log.warning('Found coredumps on %s, flagging run as failed', rem) + set_status(ctx.summary, 'fail') + if 'failure_reason' not in ctx.summary: + ctx.summary['failure_reason'] = \ + 'Found coredumps on {rem}'.format(rem=rem) + + +@contextlib.contextmanager +def archive_upload(ctx, config): + """ + Upload the archive directory to a designated location + """ + try: + yield + finally: + upload = 
ctx.config.get('archive_upload') + archive_path = ctx.config.get('archive_path') + if upload and archive_path: + log.info('Uploading archives ...') + upload_key = ctx.config.get('archive_upload_key') + if upload_key: + ssh = "RSYNC_RSH='ssh -i " + upload_key + "'" + else: + ssh = '' + split_path = archive_path.split('/') + split_path.insert(-2, '.') + misc.sh(ssh + " rsync -avz --relative /" + + os.path.join(*split_path) + " " + + upload) + else: + log.info('Not uploading archives.') diff --git a/teuthology/task/internal/check_lock.py b/teuthology/task/internal/check_lock.py new file mode 100644 index 000000000..152e41c2d --- /dev/null +++ b/teuthology/task/internal/check_lock.py @@ -0,0 +1,35 @@ +import logging + +import teuthology.lock.query +import teuthology.lock.util + +from teuthology.config import config as teuth_config + +log = logging.getLogger(__name__) + + +def check_lock(ctx, config, check_up=True): + """ + Check lock status of remote machines. + """ + if not teuth_config.lock_server or ctx.config.get('check-locks') is False: + log.info('Lock checking disabled.') + return + log.info('Checking locks...') + for machine in ctx.config['targets'].keys(): + status = teuthology.lock.query.get_status(machine) + log.debug('machine status is %s', repr(status)) + assert status is not None, \ + 'could not read lock status for {name}'.format(name=machine) + if check_up: + assert status['up'], 'machine {name} is marked down'.format( + name=machine + ) + assert status['locked'], \ + 'machine {name} is not locked'.format(name=machine) + assert status['locked_by'] == ctx.owner, \ + 'machine {name} is locked by {user}, not {owner}'.format( + name=machine, + user=status['locked_by'], + owner=ctx.owner, + ) diff --git a/teuthology/task/internal/edit_sudoers.sh b/teuthology/task/internal/edit_sudoers.sh new file mode 100755 index 000000000..6ab40a5d8 --- /dev/null +++ b/teuthology/task/internal/edit_sudoers.sh @@ -0,0 +1,10 @@ +#! 
/bin/sh + +sudo vi -e /etc/sudoers < # registry-name + """ + if ctx.config.get('redhat').get('setup_container_registry', None): + registry = ctx.config['redhat']['setup_container_registry'] + + # fetch credentials from teuth_config + creds = teuthconfig.get('registries', dict()).get(registry) + if not creds: + raise ConfigError("Registry not found....") + + # container-tool login + for remote in ctx.cluster.remotes.keys(): + container_tool = "podman" + if remote.os.version.startswith('7'): + container_tool = "docker" + + remote.run(args=[ + 'sudo', container_tool, + 'login', registry, + '--username', creds['username'], + '--password', creds['password'], + ] + ) + yield + +@contextlib.contextmanager +def setup_additional_repo(ctx, config): + """ + set additional repo's for testing + redhat: + set-add-repo: 'http://example.com/internal.repo' + """ + if ctx.config.get('redhat').get('set-add-repo', None): + add_repo = ctx.config.get('redhat').get('set-add-repo') + for remote in ctx.cluster.remotes.keys(): + if remote.os.package_type == 'rpm': + remote.run(args=['sudo', 'wget', '-O', '/etc/yum.repos.d/rh_add.repo', + add_repo]) + if not remote.os.version.startswith('8'): + remote.run(args=['sudo', 'yum', 'update', 'metadata']) + + yield + + +def _enable_rhel_repos(remote): + + # Look for rh specific repos + ds_yaml = os.path.join( + teuthconfig.get('ds_yaml_dir'), + teuthconfig.rhbuild + ".yaml" + ) + + rhel_repos = yaml.safe_load(open(ds_yaml)) + repos_to_subscribe = rhel_repos.get('rhel_repos').get(remote.os.version[0]) + + for repo in repos_to_subscribe: + remote.run(args=['sudo', 'subscription-manager', + 'repos', '--enable={r}'.format(r=repo)]) + + +@contextlib.contextmanager +def setup_base_repo(ctx, config): + """ + Setup repo based on redhat nodes + redhat: + base-repo-url: base url that provides Mon, OSD, Tools etc + installer-repo-url: Installer url that provides Agent, Installer + deb-repo-url: debian repo url + deb-gpg-key: gpg key used for signing the build 
+ """ + rh_config = ctx.config.get('redhat') + if not rh_config.get('base-repo-url'): + # no repo defined + yield + if rh_config.get('set-cdn-repo'): + log.info("CDN repo already set, skipping rh repo") + yield + else: + _setup_latest_repo(ctx, rh_config) + try: + yield + finally: + log.info("Cleaning up repo's") + for remote in ctx.cluster.remotes.keys(): + if remote.os.package_type == 'rpm': + remote.run(args=['sudo', 'rm', + run.Raw('/etc/yum.repos.d/rh*.repo'), + ], check_status=False) + + +def _setup_latest_repo(ctx, config): + """ + Setup repo based on redhat nodes + """ + with parallel(): + for remote in ctx.cluster.remotes.keys(): + if remote.os.package_type == 'rpm': + # pre-cleanup + remote.run(args=['sudo', 'rm', run.Raw('/etc/yum.repos.d/rh*')], + check_status=False) + remote.run(args=['sudo', 'yum', 'clean', 'metadata']) + if not remote.os.version.startswith('8'): + remote.run(args=['sudo', 'yum', 'update', 'metadata']) + # skip is required for beta iso testing + if config.get('skip-subscription-manager', False) is True: + log.info("Skipping subscription-manager command") + else: + remote.run(args=['sudo', 'subscription-manager', 'repos', + run.Raw('--disable=*ceph*')], + check_status=False + ) + base_url = config.get('base-repo-url', '') + installer_url = config.get('installer-repo-url', '') + repos = ['MON', 'OSD', 'Tools', 'Calamari', 'Installer'] + installer_repos = ['Agent', 'Main', 'Installer'] + if config.get('base-rh-repos'): + repos = ctx.config.get('base-rh-repos') + if config.get('installer-repos'): + installer_repos = ctx.config.get('installer-repos') + # create base repo + if base_url.startswith('http'): + repo_to_use = _get_repos_to_use(base_url, repos) + base_repo_file = NamedTemporaryFile(mode='w', delete=False) + _create_temp_repo_file(repo_to_use, base_repo_file) + remote.put_file(base_repo_file.name, base_repo_file.name) + remote.run(args=['sudo', 'cp', base_repo_file.name, + '/etc/yum.repos.d/rh_ceph.repo']) + 
remote.run(args=['sudo', 'yum', 'clean', 'metadata']) + if installer_url.startswith('http'): + irepo_to_use = _get_repos_to_use( + installer_url, installer_repos) + installer_file = NamedTemporaryFile(delete=False) + _create_temp_repo_file(irepo_to_use, installer_file) + remote.put_file(installer_file.name, installer_file.name) + remote.run(args=['sudo', 'cp', installer_file.name, + '/etc/yum.repos.d/rh_inst.repo']) + remote.run(args=['sudo', 'yum', 'clean', 'metadata']) + if not remote.os.version.startswith('8'): + remote.run(args=['sudo', 'yum', 'update', 'metadata']) + else: + if config.get('deb-repo-url'): + deb_repo = config.get('deb-repo-url') + deb_gpg_key = config.get('deb-gpg-key', None) + set_deb_repo(remote, deb_repo, deb_gpg_key) + + +def _get_repos_to_use(base_url, repos): + repod = dict() + for repo in repos: + repo_to_use = base_url + "compose/" + repo + "/x86_64/os/" + r = requests.get(repo_to_use) + log.info("Checking %s", repo_to_use) + if r.status_code == 200: + log.info("Using %s", repo_to_use) + repod[repo] = repo_to_use + return repod + + +def _create_temp_repo_file(repos, repo_file): + for repo in repos.keys(): + header = "[ceph-" + repo + "]" + "\n" + name = "name=ceph-" + repo + "\n" + baseurl = "baseurl=" + repos[repo] + "\n" + gpgcheck = "gpgcheck=0\n" + enabled = "enabled=1\n\n" + repo_file.write(header) + repo_file.write(name) + repo_file.write(baseurl) + repo_file.write(gpgcheck) + repo_file.write(enabled) + repo_file.close() diff --git a/teuthology/task/internal/syslog.py b/teuthology/task/internal/syslog.py new file mode 100644 index 000000000..1e1cc3b5c --- /dev/null +++ b/teuthology/task/internal/syslog.py @@ -0,0 +1,200 @@ +import contextlib +import logging + +from io import BytesIO + +from teuthology import misc +from teuthology.job_status import set_status +from teuthology.orchestra import run + + +log = logging.getLogger(__name__) + + +@contextlib.contextmanager +def syslog(ctx, config): + """ + start syslog / stop syslog on 
exit. + """ + if ctx.archive is None: + # disable this whole feature if we're not going to archive the data + # anyway + yield + return + + cluster = ctx.cluster.filter(lambda r: not r.is_container) + if not len(cluster.remotes.keys()): + yield + return + + log.info('Starting syslog monitoring...') + + archive_dir = misc.get_archive_dir(ctx) + log_dir = '{adir}/syslog'.format(adir=archive_dir) + run.wait( + cluster.run( + args=['mkdir', '-p', '-m0755', '--', log_dir], + wait=False, + ) + ) + + CONF = '/etc/rsyslog.d/80-cephtest.conf' + kern_log = '{log_dir}/kern.log'.format(log_dir=log_dir) + misc_log = '{log_dir}/misc.log'.format(log_dir=log_dir) + conf_lines = [ + 'kern.* -{kern_log};RSYSLOG_FileFormat'.format(kern_log=kern_log), + '*.*;kern.none -{misc_log};RSYSLOG_FileFormat'.format( + misc_log=misc_log), + ] + conf_fp = BytesIO('\n'.join(conf_lines).encode()) + try: + for rem in cluster.remotes.keys(): + log_context = 'system_u:object_r:var_log_t:s0' + for log_path in (kern_log, misc_log): + rem.run(args=['install', '-m', '666', '/dev/null', log_path]) + rem.chcon(log_path, log_context) + misc.sudo_write_file( + remote=rem, + path=CONF, + data=conf_fp, + ) + conf_fp.seek(0) + run.wait( + cluster.run( + args=[ + 'sudo', + 'service', + # a mere reload (SIGHUP) doesn't seem to make + # rsyslog open the files + 'rsyslog', + 'restart', + ], + wait=False, + ), + ) + + yield + finally: + cluster = ctx.cluster.filter(lambda r: not r.is_container) + if not len(cluster.remotes.keys()): + return + + log.info('Shutting down syslog monitoring...') + + run.wait( + cluster.run( + args=[ + 'sudo', + 'rm', + '-f', + '--', + CONF, + run.Raw('&&'), + 'sudo', + 'service', + 'rsyslog', + 'restart', + ], + wait=False, + ), + ) + # race condition: nothing actually says rsyslog had time to + # flush the file fully. oh well. 
+ + log.info('Checking logs for errors...') + for rem in cluster.remotes.keys(): + log.debug('Checking %s', rem.name) + stdout = rem.sh( + [ + 'egrep', '--binary-files=text', + '\\bBUG\\b|\\bINFO\\b|\\bDEADLOCK\\b', + run.Raw(f'{archive_dir}/syslog/kern.log'), + run.Raw('|'), + 'grep', '-v', 'task .* blocked for more than .* seconds', + run.Raw('|'), + 'grep', '-v', 'lockdep is turned off', + run.Raw('|'), + 'grep', '-v', 'trying to register non-static key', + run.Raw('|'), + 'grep', '-v', 'DEBUG: fsize', # xfs_fsr + run.Raw('|'), + 'grep', '-v', 'CRON', # ignore cron noise + run.Raw('|'), + 'grep', '-v', 'BUG: bad unlock balance detected', # #6097 + run.Raw('|'), + 'grep', '-v', 'inconsistent lock state', # FIXME see #2523 + run.Raw('|'), + 'grep', '-v', '*** DEADLOCK ***', # part of lockdep output + run.Raw('|'), + 'grep', '-v', + # FIXME see #2590 and #147 + 'INFO: possible irq lock inversion dependency detected', + run.Raw('|'), + 'grep', '-v', + 'INFO: NMI handler (perf_event_nmi_handler) took too long to run', # noqa + run.Raw('|'), + 'grep', '-v', 'INFO: recovery required on readonly', + run.Raw('|'), + 'grep', '-v', 'ceph-create-keys: INFO', + run.Raw('|'), + 'grep', '-v', 'INFO:ceph-create-keys', + run.Raw('|'), + 'grep', '-v', 'Loaded datasource DataSourceOpenStack', + run.Raw('|'), + 'grep', '-v', 'container-storage-setup: INFO: Volume group backing root filesystem could not be determined', # noqa + run.Raw('|'), + 'egrep', '-v', '\\bsalt-master\\b|\\bsalt-minion\\b|\\bsalt-api\\b', + run.Raw('|'), + 'grep', '-v', 'ceph-crash', + run.Raw('|'), + 'egrep', '-v', '\\btcmu-runner\\b.*\\bINFO\\b', + run.Raw('|'), + 'head', '-n', '1', + ], + ) + if stdout != '': + log.error('Error in syslog on %s: %s', rem.name, stdout) + set_status(ctx.summary, 'fail') + if 'failure_reason' not in ctx.summary: + ctx.summary['failure_reason'] = \ + "'{error}' in syslog".format(error=stdout) + + log.info('Gathering journactl...') + run.wait( + cluster.run( + args=[ + 'sudo', 
'journalctl', + run.Raw('>'), + f'{archive_dir}/syslog/journalctl.log', + ], + wait=False, + ) + ) + + log.info('Compressing syslogs...') + run.wait( + cluster.run( + args=[ + 'find', + '{adir}/syslog'.format(adir=archive_dir), + '-name', + '*.log', + '-print0', + run.Raw('|'), + 'sudo', + 'xargs', + '-0', + '--max-args=1', + '--max-procs=0', + '--verbose', + '--no-run-if-empty', + '--', + 'gzip', + '-5', + '--verbose', + '--', + ], + wait=False, + ) + ) + diff --git a/teuthology/task/internal/vm_setup.py b/teuthology/task/internal/vm_setup.py new file mode 100644 index 000000000..f210bc7f4 --- /dev/null +++ b/teuthology/task/internal/vm_setup.py @@ -0,0 +1,51 @@ +import logging +import os +import subprocess + +from teuthology.parallel import parallel +from teuthology.task import ansible +from teuthology.exceptions import CommandFailedError + +log = logging.getLogger(__name__) + + +def vm_setup(ctx, config): + """ + Look for virtual machines and handle their initialization + """ + all_tasks = [list(x.keys())[0] for x in ctx.config['tasks']] + need_ansible = False + if 'kernel' in all_tasks and 'ansible.cephlab' not in all_tasks: + need_ansible = True + ansible_hosts = set() + with parallel(): + editinfo = os.path.join(os.path.dirname(__file__), 'edit_sudoers.sh') + for rem in ctx.cluster.remotes.keys(): + if rem.is_vm: + ansible_hosts.add(rem.shortname) + try: + rem.sh('test -e /ceph-qa-ready') + except CommandFailedError: + p1 = subprocess.Popen(['cat', editinfo], + stdout=subprocess.PIPE) + p2 = subprocess.Popen( + [ + 'ssh', + '-o', 'StrictHostKeyChecking=no', + '-t', '-t', + str(rem), + 'sudo', + 'sh' + ], + stdin=p1.stdout, stdout=subprocess.PIPE + ) + _, err = p2.communicate() + if err: + log.error("Edit of /etc/sudoers failed: %s", err) + if need_ansible and ansible_hosts: + log.info("Running ansible on %s", list(ansible_hosts)) + ansible_config = dict( + hosts=list(ansible_hosts), + ) + with ansible.CephLab(ctx, config=ansible_config): + pass diff --git 
a/teuthology/task/iscsi.py b/teuthology/task/iscsi.py new file mode 100644 index 000000000..80d01cb8f --- /dev/null +++ b/teuthology/task/iscsi.py @@ -0,0 +1,214 @@ +""" +Handle iscsi adm commands for tgt connections. +""" +import logging +import contextlib +import socket + +from teuthology import misc as teuthology +from teuthology import contextutil +from teuthology.task.common_fs_utils import generic_mkfs +from teuthology.task.common_fs_utils import generic_mount +from teuthology.orchestra import run + +log = logging.getLogger(__name__) + + +def _get_remote(remotes, client): + """ + Get remote object that is associated with the client specified. + """ + for rem in remotes: + if client in remotes[rem]: + return rem + + +def _get_remote_name(remotes, client): + """ + Get remote name that is associated with the client specified. + """ + rem_name = _get_remote(remotes, client).name + rem_name = rem_name[rem_name.find('@') + 1:] + return rem_name + + +def tgt_devname_get(ctx, test_image): + """ + Get the name of the newly created device by following the by-path + link (which is symbolically linked to the appropriate /dev/sd* file). + """ + remotes = ctx.cluster.only(teuthology.is_type('client')).remotes + rem_name = _get_remote_name(remotes, test_image) + lnkpath = '/dev/disk/by-path/ip-%s:3260-iscsi-rbd-lun-1' % \ + socket.gethostbyname(rem_name) + return lnkpath + + +def tgt_devname_rtn(ctx, test_image): + """ + Wrapper passed to common_fs_util functions. 
+ """ + image = test_image[test_image.find('.') + 1:] + return tgt_devname_get(ctx, image) + + +def file_io_test(rem, file_from, lnkpath): + """ + dd to the iscsi inteface, read it, and compare with original + """ + rem.run( + args=[ + 'sudo', + 'dd', + 'if=%s' % file_from, + 'of=%s' % lnkpath, + 'bs=1024', + 'conv=fsync', + ]) + tfile2 = rem.sh('mktemp').strip() + rem.run( + args=[ + 'sudo', + 'rbd', + 'export', + 'iscsi-image', + run.Raw('-'), + run.Raw('>'), + tfile2, + ]) + size = rem.sh( + [ + 'ls', + '-l', + file_from, + run.Raw('|'), + 'awk', + '{print $5}', ], + ).strip() + rem.run( + args=[ + 'cmp', + '-n', + size, + file_from, + tfile2, + ]) + rem.run(args=['rm', tfile2]) + + +def general_io_test(ctx, rem, image_name): + """ + Do simple I/O tests to the iscsi interface before putting a + filesystem on it. + """ + rem.run( + args=[ + 'udevadm', + 'settle', + ]) + test_phrase = 'The time has come the walrus said to speak of many things.' + lnkpath = tgt_devname_get(ctx, image_name) + tfile1 = rem.sh('mktemp').strip() + rem.run( + args=[ + 'echo', + test_phrase, + run.Raw('>'), + tfile1, + ]) + file_io_test(rem, tfile1, lnkpath) + rem.run(args=['rm', tfile1]) + file_io_test(rem, '/bin/ls', lnkpath) + + +@contextlib.contextmanager +def start_iscsi_initiators(ctx, tgt_link): + """ + This is the sub-task that assigns an rbd to an iscsiadm control and + performs a login (thereby creating a /dev/sd device). It performs + a logout when finished. 
+ """ + remotes = ctx.cluster.only(teuthology.is_type('client')).remotes + tgtd_list = [] + for role, host in tgt_link: + rem = _get_remote(remotes, role) + rem_name = _get_remote_name(remotes, host) + rem.run( + args=[ + 'sudo', + 'iscsiadm', + '-m', + 'discovery', + '-t', + 'st', + '-p', + rem_name, + ]) + proc = rem.run( + args=[ + 'sudo', + 'iscsiadm', + '-m', + 'node', + '--login', + ]) + if proc.exitstatus == 0: + tgtd_list.append((rem, rem_name)) + general_io_test(ctx, rem, host) + try: + with contextutil.nested( + lambda: generic_mkfs(ctx=ctx, config={host: {'fs_type': 'xfs'}}, + devname_rtn=tgt_devname_rtn), + lambda: generic_mount(ctx=ctx, config={host: None}, + devname_rtn=tgt_devname_rtn), + ): + yield + finally: + for rem_info in tgtd_list: + rem = rem_info[0] + rem_name = rem_info[1] + rem.run( + args=[ + 'sudo', + 'iscsiadm', + '-m', + 'node', + '--logout', + ]) + +@contextlib.contextmanager +def task(ctx, config): + """ + handle iscsi admin login after a tgt connection has been established. + + Assume a default host client of client.0 and a sending client of + client.0 if not specified otherwise. + + Sample tests could be: + + iscsi: + + This sets up a tgt link from client.0 to client.0 + + iscsi: [client.1, client.2] + + This sets up a tgt link from client.1 to client.0 and a tgt link + from client.2 to client.0 + + iscsi: + client.0: client.1 + client.1: client.0 + + This sets up a tgt link from client.0 to client.1 and a tgt link + from client.1 to client.0 + + Note that the iscsi image name is iscsi-image, so this only works + for one image being tested at any one time. 
+ """ + try: + pairs = config.items() + except AttributeError: + pairs = [('client.0', 'client.0')] + with contextutil.nested( + lambda: start_iscsi_initiators(ctx=ctx, tgt_link=pairs),): + yield diff --git a/teuthology/task/kernel.py b/teuthology/task/kernel.py new file mode 100644 index 000000000..22cd1a69c --- /dev/null +++ b/teuthology/task/kernel.py @@ -0,0 +1,1349 @@ +""" +Kernel installation task +""" + +import contextlib +import logging +import os +import re +import shlex +from io import StringIO + +from teuthology.util.compat import urljoin + +from teuthology import misc as teuthology +from teuthology.parallel import parallel +from teuthology.config import config as teuth_config +from teuthology.orchestra import run +from teuthology.exceptions import ( + UnsupportedPackageTypeError, + ConfigError, + VersionNotFoundError, +) +from teuthology.packaging import ( + install_package, + get_koji_build_info, + get_kojiroot_base_url, + get_koji_package_name, + get_koji_task_rpm_info, + get_koji_task_result, + get_builder_project, +) +from teuthology.task.install.deb import install_dep_packages + +log = logging.getLogger(__name__) + +CONFIG_DEFAULT = {'branch': 'distro', 'sha1': 'distro'} +TIMEOUT_DEFAULT = 300 + +VERSION_KEYS = ['branch', 'tag', 'sha1', 'deb', 'rpm', 'koji', 'koji_task'] + + +def normalize_config(ctx, config): + """ + Returns a config whose keys are all real roles. + Generic roles (client, mon, osd, etc.) are replaced with + the actual roles (client.0, client.1, etc.). If the config + specifies a different version for a specific role, this is + unchanged. 
+ + For example, with 4 OSDs this:: + + osd: + tag: v3.0 + kdb: true + osd.1: + branch: new_btrfs + kdb: false + osd.3: + deb: /path/to/linux-whatever.deb + + is transformed into:: + + osd.0: + tag: v3.0 + kdb: true + osd.1: + branch: new_btrfs + kdb: false + osd.2: + tag: v3.0 + kdb: true + osd.3: + deb: /path/to/linux-whatever.deb + + If config is None or just specifies a version to use, + it is applied to all nodes. + + :param ctx: Context + :param config: Configuration + """ + log.info(f'normalize config orig: {config}') + if not config or \ + len([x for x in config.keys() if x in + VERSION_KEYS + ['kdb', 'flavor', 'hwe']]) == len(config.keys()): + new_config = {} + if not config: + config = CONFIG_DEFAULT + for role in teuthology.all_roles(ctx.cluster): + new_config[role] = config.copy() + return new_config + + new_config = {} + for role, role_config in config.items(): + if role_config is None: + role_config = CONFIG_DEFAULT + if '.' in role: + new_config[role] = role_config.copy() + else: + for id_ in teuthology.all_roles_of_type(ctx.cluster, role): + name = '{type}.{id}'.format(type=role, id=id_) + # specific overrides generic + if name not in config: + new_config[name] = role_config.copy() + log.info(f'normalize config final: {new_config}') + return new_config + + +def normalize_and_apply_overrides(ctx, config, overrides): + """ + kernel task config is hierarchical and needs to be transformed into + a normal form, see normalize_config() for details. Applying overrides is + also more involved compared to other tasks because of the number of ways + a version of the kernel to install can be specified. + + Returns a (normalized config, timeout) tuple. 
+ + :param ctx: Context + :param config: Configuration + """ + timeout = TIMEOUT_DEFAULT + if 'timeout' in config: + timeout = config.pop('timeout') + config = normalize_config(ctx, config) + + if 'timeout' in overrides: + timeout = overrides.pop('timeout') + if overrides: + overrides = normalize_config(ctx, overrides) + log.debug('normalized overrides %s' % overrides) + + # Handle a case when a version specified with one type of version key + # is overridden by a version specified with another type of version key + # (e.g. 'branch: foo' is overridden with 'tag: bar'). To be able to + # use deep_merge(), drop all version keys from the original config if + # the corresponding override has a version key. + for role, role_config in config.items(): + if (role in overrides and + any(k in overrides[role] for k in VERSION_KEYS)): + for k in VERSION_KEYS: + role_config.pop(k, None) + teuthology.deep_merge(config, overrides) + + return (config, timeout) + + +def validate_config(ctx, config): + """ + Make sure that all kernels in the list of remove kernels + refer to the same kernel. + + :param ctx: Context + :param config: Configuration + """ + for _, roles_for_host in ctx.cluster.remotes.items(): + kernel = None + for role in roles_for_host: + role_kernel = config.get(role, kernel) + if kernel is None: + kernel = role_kernel + elif role_kernel is not None: + assert kernel == role_kernel, \ + "everything on the same host must use the same kernel" + if role in config: + del config[role] + + +def need_to_install(ctx, role, version): + """ + Check to see if we need to install a kernel. Get the version of the + currently running kernel, and compare it against the value passed in. + + :param ctx: Context + :param role: Role + :param version: value to compare against (used in checking), can be either + a utsrelease string (e.g. '3.13.0-rc3-ceph-00049-ge2817b3') + or a sha1. 
+ """ + ret = True + log.info('Checking kernel version of {role}, want "{ver}"...'.format( + role=role, ver=version)) + uname_fp = StringIO() + ctx.cluster.only(role).run( + args=[ + 'uname', + '-r', + ], + stdout=uname_fp, + ) + cur_version = uname_fp.getvalue().rstrip('\n') + log.debug('current kernel version is {ver} vs {want}'.format(ver=cur_version, + want=version)) + + if '.' in str(version): + if cur_version == version: + log.debug('utsrelease strings match, do not need to install') + ret = False + os_type = teuthology.get_distro(ctx) + log.debug("Distro of this test job: {}".format(os_type)) + if os_type in ['sle', 'opensuse']: + cur_version_match = re.search('(.*)-default$', cur_version) + if cur_version_match: + cur_version_rp = cur_version_match.group(1) + if cur_version_rp in version: + log.debug('"{}" is a substring of "{}" - the latest {} kernel is running' + .format(cur_version_rp, version, os_type)) + ret = False + else: + log.debug('failed to parse current kernel version {} (os_type is "{}")' + .format(cur_version, os_type)) + else: + # version is sha1, need to try to extract sha1 from cur_version + match = re.search('[-_]g([0-9a-f]{6,40})', cur_version) + if match: + cur_sha1 = match.group(1) + log.debug('extracting sha1, {ver} -> {sha1}'.format( + ver=cur_version, sha1=cur_sha1)) + m = min(len(cur_sha1), len(version)) + assert m >= 6, "cur_sha1 and/or version is too short, m = %d" % m + if cur_sha1[0:m] == version[0:m]: + log.debug('extracted sha1 matches, do not need to install') + ret = False + else: + log.debug('failed to parse current kernel version') + uname_fp.close() + return ret + + +def install_firmware(ctx, config): + """ + Go to the github to get the latest firmware. 
+ + :param ctx: Context + :param config: Configuration + """ + linux_firmware_git_upstream = 'git://git.kernel.org/pub/scm/linux/kernel/git/firmware/linux-firmware.git' + uri = teuth_config.linux_firmware_git_url or linux_firmware_git_upstream + fw_dir = '/lib/firmware/updates' + + for role in config.keys(): + if isinstance(config[role], str) and config[role].find('distro') >= 0: + log.info('Skipping firmware on distro kernel'); + return + (role_remote,) = ctx.cluster.only(role).remotes.keys() + package_type = role_remote.os.package_type + if package_type == 'rpm': + role_remote.run(args=[ + 'sudo', 'yum', 'upgrade', '-y', 'linux-firmware', + ]) + continue + log.info('Installing linux-firmware on {role}...'.format(role=role)) + role_remote.run( + args=[ + # kludge around mysterious 0-byte .git/HEAD files + 'cd', fw_dir, + run.Raw('&&'), + 'test', '-d', '.git', + run.Raw('&&'), + 'test', '!', '-s', '.git/HEAD', + run.Raw('&&'), + 'sudo', 'rm', '-rf', '.git', + run.Raw(';'), + # init + 'sudo', 'install', '-d', '-m0755', fw_dir, + run.Raw('&&'), + 'cd', fw_dir, + run.Raw('&&'), + 'sudo', 'git', 'init', + ], + ) + role_remote.run( + args=[ + 'sudo', 'git', '--git-dir=%s/.git' % fw_dir, 'config', + '--get', 'remote.origin.url', run.Raw('>/dev/null'), + run.Raw('||'), + 'sudo', 'git', '--git-dir=%s/.git' % fw_dir, + 'remote', 'add', 'origin', uri, + ], + ) + # In case the remote already existed, set its url + role_remote.run( + args=[ + 'sudo', 'git', '--git-dir=%s/.git' % fw_dir, 'remote', + 'set-url', 'origin', uri, run.Raw('>/dev/null') + ] + ) + role_remote.run( + args=[ + 'cd', fw_dir, + run.Raw('&&'), + 'sudo', 'git', 'fetch', 'origin', + run.Raw('&&'), + 'sudo', 'git', 'reset', '--hard', 'origin/main' + ], + ) + + +def gitbuilder_pkg_name(remote): + if remote.os.package_type == 'rpm': + pkg_name = 'kernel.x86_64.rpm' + elif remote.os.package_type == 'deb': + pkg_name = 'linux-image.deb' + else: + raise UnsupportedPackageTypeError(remote) + return pkg_name + + +def 
remote_pkg_path(remote): + """ + This is where kernel packages are copied over (in case of local + packages) or downloaded to (in case of gitbuilder packages) and + then installed from. + """ + return os.path.join('/tmp', gitbuilder_pkg_name(remote)) + + +def download_kernel(ctx, config): + """ + Supply each remote with a kernel package: + - local kernels are copied over + - gitbuilder kernels are downloaded + - nothing is done for distro kernels + + :param ctx: Context + :param config: Configuration + """ + procs = {} + for role, src in config.items(): + needs_download = False + + if src == 'distro': + # don't need to download distro kernels + log.debug("src is distro, skipping download"); + continue + + (role_remote,) = ctx.cluster.only(role).remotes.keys() + if isinstance(src, dict): + # we're downloading a kernel from koji, the src dict here + # is the build_info retrieved from koji using get_koji_build_info + if src.get("id"): + build_id = src["id"] + log.info("Downloading kernel with build_id {build_id} on {role}...".format( + build_id=build_id, + role=role + )) + needs_download = True + baseurl = get_kojiroot_base_url(src) + pkg_name = get_koji_package_name("kernel", src) + elif src.get("task_id"): + needs_download = True + log.info("Downloading kernel with task_id {task_id} on {role}...".format( + task_id=src["task_id"], + role=role + )) + baseurl = src["base_url"] + # this var is also poorly named as it's not the package name, + # but the full name of the rpm file to download. 
+ pkg_name = src["rpm_name"] + elif src.find('/') >= 0: + # local package - src is path + log.info('Copying kernel package {path} to {role}...'.format( + path=src, role=role)) + role_remote.put_file(src,remote_pkg_path(role_remote)) + else: + # gitbuilder package - src is sha1 + log.info('Downloading kernel {sha1} on {role}...'.format( + sha1=src, + role=role, + )) + needs_download = True + + builder = get_builder_project()( + 'kernel', + {'sha1': src}, + ctx=ctx, + remote=role_remote, + ) + if teuth_config.use_shaman: + if role_remote.os.package_type == 'rpm': + arch = builder.arch + baseurl = urljoin( + builder.base_url, + '/'.join([arch, '']) + ) + pkg_name = "kernel-%s.%s.rpm" % ( + builder.version, + arch, + ) + elif role_remote.os.package_type == 'deb': + arch = 'amd64' # FIXME + baseurl = urljoin( + builder.base_url, + '/'.join([ + 'pool', 'main', 'l', + 'linux-%s' % builder.scm_version, '' + ]) + ) + pkg_name = 'linux-image-%s_%s_%s.deb' % ( + builder.scm_version, + builder.version, + arch, + ) + else: + baseurl = builder.base_url + "/" + pkg_name = gitbuilder_pkg_name(role_remote) + + log.info("fetching, builder baseurl is %s", baseurl) + + if needs_download: + proc = role_remote.run( + args=[ + 'rm', '-f', remote_pkg_path(role_remote), + run.Raw('&&'), + 'echo', + pkg_name, + run.Raw('|'), + 'wget', + '-nv', + '-O', + remote_pkg_path(role_remote), + '--base={url}'.format(url=baseurl), + '--input-file=-', + ], + wait=False) + procs[role_remote.name] = proc + + for name, proc in procs.items(): + log.debug('Waiting for download/copy to %s to complete...', name) + proc.wait() + + +def _no_grub_link(in_file, remote, kernel_ver): + """ + Copy and link kernel related files if grub cannot be used + (as is the case in Arm kernels) + + :param infile: kernel file or image file to be copied. 
+ :param remote: remote machine + :param kernel_ver: kernel version + """ + boot1 = '/boot/%s' % in_file + boot2 = '%s.old' % boot1 + remote.run( + args=[ + 'if', 'test', '-e', boot1, run.Raw(';'), 'then', + 'sudo', 'mv', boot1, boot2, run.Raw(';'), 'fi',], + ) + remote.run( + args=['sudo', 'ln', '-s', '%s-%s' % (in_file, kernel_ver) , boot1, ], + ) + + +def install_latest_rh_kernel(ctx, config): + """ + Installs the lastest z stream kernel + Reboot for the new kernel to take effect + """ + if config is None: + config = {} + if config.get('skip'): + return + with parallel() as p: + for remote in ctx.cluster.remotes.keys(): + p.spawn(update_rh_kernel, remote) + + +def update_rh_kernel(remote): + package_type = remote.os.package_type + remote.run(args=['uname', '-a']) + import time + if package_type == 'rpm': + update_log = remote.sh('sudo yum update -y kernel') + log.info(update_log) + if not update_log.find("Installed") == -1: + log.info("Kernel updated to latest z stream on %s", remote.shortname) + log.info("Rebooting %s", remote.shortname) + remote.run(args=['sudo', 'shutdown', '-r', 'now'], wait=False) + time.sleep(40) + log.info("Reconnecting after reboot") + remote.reconnect(timeout=300) + remote.run(args=['uname', '-a']) + elif not update_log.find('No packages marked for update') == -1: + log.info("Latest version already installed on %s", remote.shortname) + + +def install_and_reboot(ctx, need_install, config): + """ + Install and reboot the kernel. This mostly performs remote + installation operations. The code does check for Arm images + and skips grub operations if the kernel is Arm. Otherwise, it + extracts kernel titles from submenu entries and makes the appropriate + grub calls. The assumptions here are somewhat simplified in that + it expects kernel entries to be present under submenu entries. 
+ + :param ctx: Context + :param need_install: map from caller + :param config: Configuration + """ + procs = {} + kernel_title = '' + for role, src in need_install.items(): + (role_remote,) = ctx.cluster.only(role).remotes.keys() + if isinstance(src, str) and src.find('distro') >= 0: + log.info('Installing distro kernel on {role}...'.format(role=role)) + install_kernel(role_remote, config[role], version=src) + continue + + log.info('Installing kernel {src} on {role}...'.format(src=src, + role=role)) + package_type = role_remote.os.package_type + if package_type == 'rpm': + proc = role_remote.run( + args=[ + 'sudo', + 'rpm', + '-ivh', + '--oldpackage', + '--replacefiles', + '--replacepkgs', + remote_pkg_path(role_remote), + ]) + install_kernel(role_remote, config[role], path=remote_pkg_path(role_remote)) + continue + + # TODO: Refactor this into install_kernel() so that it handles all + # cases for both rpm and deb packages. + proc = role_remote.run( + args=[ + # install the kernel deb + 'sudo', + 'dpkg', + '-i', + remote_pkg_path(role_remote), + ], + ) + + # collect kernel image name from the .deb + kernel_title = get_image_version(role_remote, + remote_pkg_path(role_remote)) + log.info('searching for kernel {}'.format(kernel_title)) + + if kernel_title.endswith("-highbank"): + _no_grub_link('vmlinuz', role_remote, kernel_title) + _no_grub_link('initrd.img', role_remote, kernel_title) + proc = role_remote.run( + args=[ + 'sudo', + 'shutdown', + '-r', + 'now', + ], + wait=False, + ) + procs[role_remote.name] = proc + continue + + # look for menuentry for our kernel, and collect any + # submenu entries for their titles. Assume that if our + # kernel entry appears later in the file than a submenu entry, + # it's actually nested under that submenu. If it gets more + # complex this will totally break. 
+ + kernel_entries = role_remote.sh([ + 'egrep', + '(submenu|menuentry.*' + kernel_title + ').*{', + '/boot/grub/grub.cfg' + ]).split('\n') + submenu_title = '' + default_title = '' + for l in kernel_entries: + fields = shlex.split(l) + if len(fields) >= 2: + command, title = fields[:2] + if command == 'submenu': + submenu_title = title + '>' + if command == 'menuentry': + if title.endswith(kernel_title): + default_title = title + break + log.info('submenu_title:{}'.format(submenu_title)) + log.info('default_title:{}'.format(default_title)) + + proc = role_remote.run( + args=[ + # use the title(s) to construct the content of + # the grub menu entry, so we can default to it. + '/bin/echo', + '-e', + r'cat </dev/null'), + run.Raw('&&'), + 'sudo', + 'chmod', + 'a+x', + '--', + '/etc/grub.d/01_ceph_kernel.tmp~', + run.Raw('&&'), + 'sudo', + 'mv', + '--', + '/etc/grub.d/01_ceph_kernel.tmp~', + '/etc/grub.d/01_ceph_kernel', + # update grub again so it accepts our default + run.Raw('&&'), + 'sudo', + 'update-grub', + run.Raw('&&'), + 'rm', + remote_pkg_path(role_remote), + run.Raw('&&'), + # work around a systemd issue, where network gets shut down + # before ssh can close its session + run.Raw('('), + 'sleep', + '1', + run.Raw('&&'), + 'sudo', + 'shutdown', + '-r', + 'now', + run.Raw('&'), + run.Raw(')'), + ], + wait=False, + ) + procs[role_remote.name] = proc + + for name, proc in procs.items(): + log.debug('Waiting for install on %s to complete...', name) + proc.wait() + + +def enable_disable_kdb(ctx, config): + """ + Enable kdb on remote machines in use. Disable on those that are + not in use. 
+ + :param ctx: Context + :param config: Configuration + """ + for role, enable in config.items(): + (role_remote,) = ctx.cluster.only(role).remotes.keys() + if "mira" in role_remote.name: + serialdev = "ttyS2" + else: + serialdev = "ttyS1" + if enable: + log.info('Enabling kdb on {role}...'.format(role=role)) + try: + role_remote.run( + args=[ + 'echo', serialdev, + run.Raw('|'), + 'sudo', 'tee', '/sys/module/kgdboc/parameters/kgdboc' + ]) + except run.CommandFailedError: + log.warning('Kernel does not support kdb') + else: + log.info('Disabling kdb on {role}...'.format(role=role)) + # Add true pipe so command doesn't fail on kernel without kdb support. + try: + role_remote.run( + args=[ + 'echo', '', + run.Raw('|'), + 'sudo', 'tee', '/sys/module/kgdboc/parameters/kgdboc', + run.Raw('|'), + 'true', + ]) + except run.CommandFailedError: + log.warning('Kernel does not support kdb') + + +def wait_for_reboot(ctx, need_install, timeout, config, distro=False): + """ + Loop reconnecting and checking kernel versions until + they're all correct or the timeout is exceeded. + + :param ctx: Context + :param need_install: list of packages that we need to reinstall. + :param timeout: number of second before we timeout. + """ + import time + # do not try to reconnect immediately after triggering the reboot, + # because the reboot sequence might not have started yet (!) 
-- + # see https://tracker.ceph.com/issues/44187 + time.sleep(30) + starttime = time.time() + while need_install: + for client in list(need_install.keys()): + if 'distro' in str(need_install[client]): + distro = True + log.info('Checking client {client} for new kernel version...'.format(client=client)) + try: + (remote,) = ctx.cluster.only(client).remotes.keys() + remote.reconnect(timeout=timeout) + if distro: + assert not need_to_install_distro(remote, config[client]), \ + 'failed to install new distro kernel version within timeout' + + else: + assert not need_to_install(ctx, client, need_install[client]), \ + 'failed to install new kernel version within timeout' + del need_install[client] + except Exception: + log.exception("Saw exception") + # ignore connection resets and asserts while time is left + if time.time() - starttime > timeout: + raise + time.sleep(1) + + +def get_version_of_running_kernel(remote): + """ + Get the current running kernel version in a format that can be compared + with the output of "rpm -q kernel..." + """ + dist_release = remote.os.name + uname_r = remote.sh("uname -r").strip() + current = None + if dist_release in ['opensuse', 'sle']: + # "uname -r" returns 4.12.14-lp151.28.36-default + # "rpm -q kernel-default" returns 4.12.14-lp151.28.36.1.x86_64 + # In order to be able to meaningfully check whether the former + # is "in" the latter, we have to chop off the "-default". + current = re.sub(r"-default$", "", uname_r) + else: + current = uname_r + return current + + +def need_to_install_distro(remote, role_config): + """ + Installing kernels on rpm won't setup grub/boot into them. This installs + the newest kernel package and checks its version and compares against + the running kernel (uname -r). Similar check for deb. + + :returns: False if running the newest distro kernel. Returns the version of + the newest if it is not running. 
+ """ + dist_release = remote.os.name + package_type = remote.os.package_type + current = get_version_of_running_kernel(remote) + log.info("Running kernel on {node}: {version}".format( + node=remote.shortname, version=current)) + installed_version = None + if package_type == 'rpm': + if dist_release in ['opensuse', 'sle']: + install_stdout = remote.sh( + 'sudo zypper --non-interactive install kernel-default' + ) + else: + install_stdout = remote.sh( + 'sudo yum install -y kernel' + ) + match = re.search( + "Package (.*) already installed", + install_stdout, flags=re.MULTILINE) + if 'Nothing to do' in install_stdout: + installed_version = match.groups()[0] if match else '' + err_mess = StringIO() + err_mess.truncate(0) + remote.run(args=['echo', 'no', run.Raw('|'), 'sudo', 'yum', + 'reinstall', 'kernel', run.Raw('||'), 'true'], + stderr=err_mess) + reinstall_stderr = err_mess.getvalue() + err_mess.close() + if 'Skipping the running kernel' in reinstall_stderr: + running_version = re.search( + "Skipping the running kernel: (.*)", + reinstall_stderr, flags=re.MULTILINE).groups()[0] + if installed_version == running_version: + log.info( + 'Newest distro kernel already installed and running') + return False + else: + remote.run(args=['sudo', 'yum', 'reinstall', '-y', 'kernel', + run.Raw('||'), 'true']) + newest = get_latest_image_version_rpm(remote) + + if package_type == 'deb': + newest = get_latest_image_version_deb(remote, dist_release, role_config) + + if current in newest or current.replace('-', '_') in newest: + log.info('Newest distro kernel installed and running') + return False + log.info( + 'Not newest distro kernel. Current: {cur} Expected: {new}'.format( + cur=current, new=newest)) + return newest + + +def maybe_generate_initrd_rpm(remote, path, version): + """ + Generate initrd with mkinitrd if the hooks that should make it + happen on its own aren't there. + + :param path: rpm package path + :param version: kernel version to generate initrd for + e.g. 
3.18.0-rc6-ceph-00562-g79a9fa5 + """ + out = remote.sh(['rpm', '--scripts', '-qp', path]) + if 'bin/installkernel' in out or 'bin/kernel-install' in out: + return + + log.info("No installkernel or kernel-install hook in %s, " + "will generate initrd for %s", path, version) + remote.run( + args=[ + 'sudo', + 'mkinitrd', + '--allow-missing', + '-f', # overwrite existing initrd + '/boot/initramfs-' + version + '.img', + version, + ]) + + +def install_kernel(remote, role_config, path=None, version=None): + """ + A bit of misnomer perhaps - the actual kernel package is installed + elsewhere, this function deals with initrd and grub. Currently the + following cases are handled: + - local, gitbuilder, distro for rpm packages + - distro for deb packages - see TODO in install_and_reboot() + + TODO: reboots should be issued from install_and_reboot() + + :param path: package path (for local and gitbuilder cases) + :param version: for RPM distro kernels, pass this to update_grub_rpm + """ + dist_release = remote.os.name + templ = "install_kernel(remote={remote}, path={path}, version={version})" + log.debug(templ.format(remote=remote, path=path, version=version)) + package_type = remote.os.package_type + if package_type == 'rpm': + if dist_release in ['opensuse', 'sle']: + # FIXME + pass + else: + if path: + version = get_image_version(remote, path) + # This is either a gitbuilder or a local package and both of these + # could have been built with upstream rpm targets with specs that + # don't have a %post section at all, which means no initrd. 
+ maybe_generate_initrd_rpm(remote, path, version) + elif not version or version == 'distro': + version = get_latest_image_version_rpm(remote) + update_grub_rpm(remote, version) + remote.run( args=['sudo', 'shutdown', '-r', 'now'], wait=False ) + return + + if package_type == 'deb': + newversion = get_latest_image_version_deb(remote, dist_release, role_config) + if 'ubuntu' in dist_release: + grub2conf = teuthology.get_file(remote, + '/boot/grub/grub.cfg', sudo=True).decode() + submenu = '' + menuentry = '' + for line in grub2conf.split('\n'): + if 'submenu' in line: + submenu = line.split('submenu ')[1] + # Ubuntu likes to be sneaky and change formatting of + # grub.cfg between quotes/doublequotes between versions + if submenu.startswith("'"): + submenu = submenu.split("'")[1] + if submenu.startswith('"'): + submenu = submenu.split('"')[1] + if 'menuentry' in line: + if newversion in line and 'recovery' not in line: + menuentry = line.split('\'')[1] + break + if submenu: + grubvalue = submenu + '>' + menuentry + else: + grubvalue = menuentry + grubfile = 'cat < %s -> %s", path, basename, sha1) + return sha1 + + +@contextlib.contextmanager +def task(ctx, config): + """ + Make sure the specified kernel is installed. + This can be a branch, tag, or sha1 of ceph-client.git or a local + kernel package. + + To install ceph-client.git branch (default: main):: + + kernel: + branch: testing + + To install ceph-client.git tag:: + + kernel: + tag: v3.18 + + To install ceph-client.git sha1:: + + kernel: + sha1: 275dd19ea4e84c34f985ba097f9cddb539f54a50 + + To install from a koji build_id:: + + kernel: + koji: 416058 + + To install from a koji task_id:: + + kernel: + koji_task: 9678206 + + When installing from koji you also need to set the urls for koji hub + and the koji root in your teuthology.yaml config file. 
These are shown + below with their default values:: + + kojihub_url: http://koji.fedoraproject.org/kojihub + kojiroot_url: http://kojipkgs.fedoraproject.org/packages + + When installing from a koji task_id you also need to set koji_task_url, + which is the base url used to download rpms from koji task results:: + + koji_task_url: https://kojipkgs.fedoraproject.org/work/ + + To install local rpm (target should be an rpm system):: + + kernel: + rpm: /path/to/appropriately-named.rpm + + To install local deb (target should be a deb system):: + + kernel: + deb: /path/to/appropriately-named.deb + + For rpm: or deb: to work it should be able to figure out sha1 from + local kernel package basename, see get_sha1_from_pkg_name(). This + means that you can't for example install a local tag - package built + with upstream {rpm,deb}-pkg targets won't have a sha1 in its name. + + If you want to schedule a run and use a local kernel package, you + have to copy the package over to a box teuthology workers are + running on and specify a path to the package on that box. + + All of the above will install a specified kernel on all targets. 
+ You can specify different kernels for each role or for all roles of + a certain type (more specific roles override less specific, see + normalize_config() for details):: + + kernel: + client: + tag: v3.0 + osd: + branch: btrfs_fixes + client.1: + branch: more_specific + osd.3: + branch: main + + To wait 3 minutes for hosts to reboot (default: 300):: + + kernel: + timeout: 180 + + To enable kdb:: + + kernel: + kdb: true + + :param ctx: Context + :param config: Configuration + """ + if config is None: + config = {} + assert isinstance(config, dict), \ + "task kernel only supports a dictionary for configuration" + + overrides = ctx.config.get('overrides', {}).get('kernel', {}) + config, timeout = normalize_and_apply_overrides(ctx, config, overrides) + validate_config(ctx, config) + log.info('config %s, timeout %d' % (config, timeout)) + + with parallel() as p: + for role, role_config in config.items(): + p.spawn(process_role, ctx, config, timeout, role, role_config) + + try: + yield + finally: + pass + + +def process_role(ctx, config, timeout, role, role_config): + need_install = None # sha1 to dl, or path to rpm or deb + need_version = None # utsrelease or sha1 + + # gather information about this remote + (role_remote,) = ctx.cluster.only(role).remotes.keys() + system_type = role_remote.os.name + if role_remote.is_container: + log.info(f"Remote f{role_remote.shortname} is a container; skipping kernel installation") + return + if role_config.get('rpm') or role_config.get('deb'): + # We only care about path - deb: vs rpm: is meaningless, + # rpm: just happens to be parsed first. Nothing is stopping + # 'deb: /path/to/foo.rpm' and it will work provided remote's + # os.package_type is 'rpm' and vice versa. 
+ path = role_config.get('rpm') + if not path: + path = role_config.get('deb') + sha1 = get_sha1_from_pkg_name(path) + assert sha1, "failed to extract commit hash from path %s" % path + if need_to_install(ctx, role, sha1): + need_install = path + need_version = sha1 + elif role_config.get('sha1') == 'distro': + version = need_to_install_distro(role_remote, role_config) + if version: + need_install = 'distro' + need_version = version + elif role_config.get("koji") or role_config.get('koji_task'): + # installing a kernel from koji + build_id = role_config.get("koji") + task_id = role_config.get("koji_task") + if role_remote.os.package_type != "rpm": + msg = ( + "Installing a kernel from koji is only supported " + "on rpm based systems. System type is {system_type}." + ) + msg = msg.format(system_type=system_type) + log.error(msg) + ctx.summary["failure_reason"] = msg + ctx.summary["status"] = "dead" + raise ConfigError(msg) + + # FIXME: this install should probably happen somewhere else + # but I'm not sure where, so we'll leave it here for now. + install_package('koji', role_remote) + + if build_id: + # get information about this build from koji + build_info = get_koji_build_info(build_id, role_remote, ctx) + version = "{ver}-{rel}.x86_64".format( + ver=build_info["version"], + rel=build_info["release"] + ) + elif task_id: + # get information about results of this task from koji + task_result = get_koji_task_result(task_id, role_remote, ctx) + # this is not really 'build_info', it's a dict of information + # about the kernel rpm from the task results, but for the sake + # of reusing the code below I'll still call it that. + build_info = get_koji_task_rpm_info( + 'kernel', + task_result['rpms'] + ) + # add task_id so we can know later that we're installing + # from a task and not a build. 
+ build_info["task_id"] = task_id + version = build_info["version"] + + if need_to_install(ctx, role, version): + need_install = build_info + need_version = version + else: + builder = get_builder_project()( + "kernel", + role_config, + ctx=ctx, + remote=role_remote, + ) + sha1 = builder.sha1 + log.debug('sha1 for {role} is {sha1}'.format(role=role, sha1=sha1)) + ctx.summary['{role}-kernel-sha1'.format(role=role)] = sha1 + + if need_to_install(ctx, role, sha1): + if teuth_config.use_shaman: + version = builder.scm_version + else: + version = builder.version + if not version: + raise VersionNotFoundError(builder.base_url) + need_install = sha1 + need_version = version + + if need_install: + install_firmware(ctx, {role: need_install}) + download_kernel(ctx, {role: need_install}) + install_and_reboot(ctx, {role: need_install}, config) + wait_for_reboot(ctx, {role: need_version}, timeout, config) + + # enable or disable kdb if specified, otherwise do not touch + if role_config.get('kdb') is not None: + kdb = role_config.get('kdb') + enable_disable_kdb(ctx, {role: kdb}) + diff --git a/teuthology/task/knfsd.py b/teuthology/task/knfsd.py new file mode 100644 index 000000000..100671d82 --- /dev/null +++ b/teuthology/task/knfsd.py @@ -0,0 +1,169 @@ +""" +Export/Unexport a ``nfs server`` client. +""" +import contextlib +import logging +import os + +from teuthology import misc as teuthology + +log = logging.getLogger(__name__) + +def get_nfsd_args(remote, cmd): + args=[ + 'sudo', + 'service', + 'nfs', + cmd, + ] + if remote.os.package_type == 'deb': + args[2] = 'nfs-kernel-server' + return args + +@contextlib.contextmanager +def task(ctx, config): + """ + Export/Unexport a ``nfs server`` client. + + The config is optional and defaults to exporting on all clients. If + a config is given, it is expected to be a list or dict of clients to do + this operation on. You must have specified ``ceph-fuse`` or + ``kclient`` on all clients specified for knfsd. 
+ + Example that exports all clients:: + + tasks: + - ceph: + - kclient: + - knfsd: + - interactive: + + Example that uses both ``kclient` and ``ceph-fuse``:: + + tasks: + - ceph: + - ceph-fuse: [client.0] + - kclient: [client.1] + - knfsd: [client.0, client.1] + - interactive: + + Example that specifies export options:: + + tasks: + - ceph: + - kclient: [client.0, client.1] + - knfsd: + client.0: + options: [rw,root_squash] + client.1: + - interactive: + + Note that when options aren't specified, rw,no_root_squash is the default. + When you specify options, the defaults are as specified by exports(5). + + So if empty options are specified, i.e. options: [] these are the defaults: + ro,sync,wdelay,hide,nocrossmnt,secure,root_squash,no_all_squash, + no_subtree_check,secure_locks,acl,anonuid=65534,anongid=65534 + + :param ctx: Context + :param config: Configuration + """ + log.info('Exporting nfs server...') + + if config is None: + config = dict(('client.{id}'.format(id=id_), None) + for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')) + elif isinstance(config, list): + config = dict((name, None) for name in config) + + clients = list(teuthology.get_clients(ctx=ctx, roles=config.keys())) + + for id_, remote in clients: + mnt = os.path.join(teuthology.get_testdir(ctx), 'mnt.{id}'.format(id=id_)) + client_config = config.get("client.%s" % id_) + if client_config is None: + client_config = {} + log.debug("Client client.%s config is %s" % (id_, client_config)) + + if client_config.get('options') is not None: + opts = ','.join(client_config.get('options')) + else: + opts = 'rw,no_root_squash' + + # Undocumented option to export to any client in case + # testing in interactive mode from other unspecified clients. 
+ wildcard = False + if client_config.get('wildcard') is not None: + wildcard = True + + log.info('Exporting knfsd client.{id} at {remote} *:{mnt} ({opt})...'.format( + id=id_, remote=remote, mnt=mnt, opt=opts)) + + """ + Should the user want to run with root_squash enabled, there is no + way to write anything to the initial ceph root dir which is set to + rwxr-xr-x root root. + + This could possibly break test cases that make assumptions about + the initial state of the root dir. + """ + remote.run( + args=[ + 'sudo', + 'chmod', + "777", + '{MNT}'.format(MNT=mnt), + ], + ) + """ + Start NFS kernel server + """ + remote.run( args=get_nfsd_args(remote, 'restart') ) + args=[ + 'sudo', + "exportfs", + '-o', + 'fsid=123{id},{opt}'.format(id=id_,opt=opts), + ] + + if wildcard: + args += ['*:{MNT}'.format(MNT=mnt)] + else: + """ + DEFAULT + Prevent bogus clients from old runs from access our + export. Specify all specify node addresses for this run. + """ + ips = [host for (host, _) in (remote.ssh.get_transport().getpeername() for (remote, roles) in ctx.cluster.remotes.items())] + for ip in ips: + args += [ '{ip}:{MNT}'.format(ip=ip, MNT=mnt) ] + + log.info('remote run {args}'.format(args=args)) + remote.run( args=args ) + + try: + yield + finally: + log.info('Unexporting nfs server...') + for id_, remote in clients: + log.debug('Unexporting client client.{id}...'.format(id=id_)) + mnt = os.path.join(teuthology.get_testdir(ctx), 'mnt.{id}'.format(id=id_)) + try: + log.debug('Checking active files on mount {mnt}'.format(mnt=mnt)) + remote.run( + args=[ + 'sudo', + 'lsof', '-V', '+D', + '{mnt}'.format(mnt=mnt), + ], + check_status=False + ) + finally: + log.debug('Stopping NFS server on client.{id}...'.format(id=id_)) + remote.run( args=get_nfsd_args(remote, 'stop') ) + log.debug('Syncing client client.{id}'.format(id=id_)) + remote.run( + args=[ + 'sync' + ] + ) diff --git a/teuthology/task/localdir.py b/teuthology/task/localdir.py new file mode 100644 index 
000000000..8a8451465 --- /dev/null +++ b/teuthology/task/localdir.py @@ -0,0 +1,69 @@ +""" +Localdir +""" +import contextlib +import logging +import os + +from teuthology import misc as teuthology + +log = logging.getLogger(__name__) + + +@contextlib.contextmanager +def task(ctx, config): + """ + Create a mount dir 'client' that is just the local disk: + + Example that "mounts" all clients: + + tasks: + - localdir: + - interactive: + + Example for a specific client: + + tasks: + - localdir: [client.2] + - interactive: + + :param ctx: Context + :param config: Configuration + """ + log.info('Creating local mnt dirs...') + + testdir = teuthology.get_testdir(ctx) + + if config is None: + config = list('client.{id}'.format(id=id_) + for id_ in teuthology.all_roles_of_type(ctx.cluster, + 'client')) + + clients = list(teuthology.get_clients(ctx=ctx, roles=config)) + for id_, remote in clients: + mnt = os.path.join(testdir, 'mnt.{id}'.format(id=id_)) + log.info('Creating dir {remote} {mnt}...'.format( + remote=remote, mnt=mnt)) + remote.run( + args=[ + 'mkdir', + '--', + mnt, + ], + ) + + try: + yield + + finally: + log.info('Removing local mnt dirs...') + for id_, remote in clients: + mnt = os.path.join(testdir, 'mnt.{id}'.format(id=id_)) + remote.run( + args=[ + 'rm', + '-rf', + '--', + mnt, + ], + ) diff --git a/teuthology/task/lockfile.py b/teuthology/task/lockfile.py new file mode 100644 index 000000000..63ff9f3b1 --- /dev/null +++ b/teuthology/task/lockfile.py @@ -0,0 +1,241 @@ +""" +Locking tests +""" +import logging +import os + +from teuthology.orchestra import run +from teuthology import misc as teuthology +import time +import gevent + + +log = logging.getLogger(__name__) + +def task(ctx, config): + """ + This task is designed to test locking. It runs an executable + for each lock attempt you specify, at 0.01 second intervals (to + preserve ordering of the locks). 
+ You can also introduce longer intervals by setting an entry + as a number of seconds, rather than the lock dictionary. + The config is a list of dictionaries. For each entry in the list, you + must name the "client" to run on, the "file" to lock, and + the "holdtime" to hold the lock. + Optional entries are the "offset" and "length" of the lock. You can also specify a + "maxwait" timeout period which fails if the executable takes longer + to complete, and an "expectfail". + + An example:: + + tasks: + - ceph: + - ceph-fuse: [client.0, client.1] + - lockfile: + [{client:client.0, file:testfile, holdtime:10}, + {client:client.1, file:testfile, holdtime:0, maxwait:0, expectfail:true}, + {client:client.1, file:testfile, holdtime:0, maxwait:15, expectfail:false}, + 10, + {client: client.1, lockfile: testfile, holdtime: 5}, + {client: client.2, lockfile: testfile, holdtime: 5, maxwait: 1, expectfail: True}] + + + In the past this test would have failed; there was a bug where waitlocks weren't + cleaned up if the process failed. More involved scenarios are also possible. 
+ + :param ctx: Context + :param config: Configuration + """ + log.info('Starting lockfile') + try: + assert isinstance(config, list), \ + "task lockfile got invalid config" + + log.info("building executable on each host") + buildprocs = list() + # build the locker executable on each client + clients = list() + files = list() + for op in config: + if not isinstance(op, dict): + continue + log.info("got an op") + log.info("op['client'] = %s", op['client']) + clients.append(op['client']) + files.append(op['lockfile']) + if not "expectfail" in op: + op["expectfail"] = False + badconfig = False + if not "client" in op: + badconfig = True + if not "lockfile" in op: + badconfig = True + if not "holdtime" in op: + badconfig = True + if badconfig: + raise KeyError("bad config {op_}".format(op_=op)) + + testdir = teuthology.get_testdir(ctx) + clients = set(clients) + files = set(files) + lock_procs = list() + for client in clients: + (client_remote,) = ctx.cluster.only(client).remotes.keys() + log.info("got a client remote") + (_, _, client_id) = client.partition('.') + filepath = os.path.join(testdir, 'mnt.{id}'.format(id=client_id), op["lockfile"]) + + proc = client_remote.run( + args=[ + 'mkdir', '-p', '{tdir}/archive/lockfile'.format(tdir=testdir), + run.Raw('&&'), + 'mkdir', '-p', '{tdir}/lockfile'.format(tdir=testdir), + run.Raw('&&'), + 'wget', + '-nv', + '--no-check-certificate', + 'https://raw.github.com/gregsfortytwo/FileLocker/main/sclockandhold.cpp', + '-O', '{tdir}/lockfile/sclockandhold.cpp'.format(tdir=testdir), + run.Raw('&&'), + 'g++', '{tdir}/lockfile/sclockandhold.cpp'.format(tdir=testdir), + '-o', '{tdir}/lockfile/sclockandhold'.format(tdir=testdir) + ], + logger=log.getChild('lockfile_client.{id}'.format(id=client_id)), + wait=False + ) + log.info('building sclockandhold on client{id}'.format(id=client_id)) + buildprocs.append(proc) + + # wait for builds to finish + run.wait(buildprocs) + log.info('finished building sclockandhold on all clients') + + # 
create the files to run these locks on + client = clients.pop() + clients.add(client) + (client_remote,) = ctx.cluster.only(client).remotes.keys() + (_, _, client_id) = client.partition('.') + file_procs = list() + for lockfile in files: + filepath = os.path.join(testdir, 'mnt.{id}'.format(id=client_id), lockfile) + proc = client_remote.run( + args=[ + 'sudo', + 'touch', + filepath, + ], + logger=log.getChild('lockfile_createfile'), + wait=False + ) + file_procs.append(proc) + run.wait(file_procs) + file_procs = list() + for lockfile in files: + filepath = os.path.join(testdir, 'mnt.{id}'.format(id=client_id), lockfile) + proc = client_remote.run( + args=[ + 'sudo', 'chown', 'ubuntu.ubuntu', filepath + ], + logger=log.getChild('lockfile_createfile'), + wait=False + ) + file_procs.append(proc) + run.wait(file_procs) + log.debug('created files to lock') + + # now actually run the locktests + for op in config: + if not isinstance(op, dict): + assert isinstance(op, int) or isinstance(op, float) + log.info("sleeping for {sleep} seconds".format(sleep=op)) + time.sleep(op) + continue + greenlet = gevent.spawn(lock_one, op, ctx) + lock_procs.append((greenlet, op)) + time.sleep(0.1) # to provide proper ordering + #for op in config + + for (greenlet, op) in lock_procs: + log.debug('checking lock for op {op_}'.format(op_=op)) + result = greenlet.get() + if not result: + raise Exception("Got wrong result for op {op_}".format(op_=op)) + # for (greenlet, op) in lock_procs + + finally: + #cleanup! 
+ if lock_procs: + for (greenlet, op) in lock_procs: + log.debug('closing proc for op {op_}'.format(op_=op)) + greenlet.kill(block=True) + + for client in clients: + (client_remote,) = ctx.cluster.only(client).remotes.keys() + (_, _, client_id) = client.partition('.') + filepath = os.path.join(testdir, 'mnt.{id}'.format(id=client_id), op["lockfile"]) + proc = client_remote.run( + args=[ + 'rm', '-rf', '{tdir}/lockfile'.format(tdir=testdir), + run.Raw(';'), + 'sudo', 'rm', '-rf', filepath + ], + wait=True + ) #proc + #done! +# task + +def lock_one(op, ctx): + """ + Perform the individual lock + """ + log.debug('spinning up locker with op={op_}'.format(op_=op)) + timeout = None + proc = None + result = None + (client_remote,) = ctx.cluster.only(op['client']).remotes.keys() + (_, _, client_id) = op['client'].partition('.') + testdir = teuthology.get_testdir(ctx) + filepath = os.path.join(testdir, 'mnt.{id}'.format(id=client_id), op["lockfile"]) + + if "maxwait" in op: + timeout = gevent.Timeout(seconds=float(op["maxwait"])) + timeout.start() + try: + proc = client_remote.run( + args=[ + 'adjust-ulimits', + 'ceph-coverage', + '{tdir}/archive/coverage'.format(tdir=testdir), + 'daemon-helper', + 'kill', + '{tdir}/lockfile/sclockandhold'.format(tdir=testdir), + filepath, + '{holdtime}'.format(holdtime=op["holdtime"]), + '{offset}'.format(offset=op.get("offset", '0')), + '{length}'.format(length=op.get("length", '1')), + ], + logger=log.getChild('lockfile_client.{id}'.format(id=client_id)), + wait=False, + stdin=run.PIPE, + check_status=False + ) + result = proc.wait() + except gevent.Timeout as tout: + if tout is not timeout: + raise + if bool(op["expectfail"]): + result = 1 + if result == 1: + if bool(op["expectfail"]): + log.info("failed as expected for op {op_}".format(op_=op)) + else: + raise Exception("Unexpectedly failed to lock {op_} within given timeout!".format(op_=op)) + finally: #clean up proc + if timeout is not None: + timeout.cancel() + if proc is not None: 
+ proc.stdin.close() + + ret = (result == 0 and not bool(op["expectfail"])) or (result == 1 and bool(op["expectfail"])) + + return ret #we made it through diff --git a/teuthology/task/loop.py b/teuthology/task/loop.py new file mode 100644 index 000000000..cd48df1cc --- /dev/null +++ b/teuthology/task/loop.py @@ -0,0 +1,45 @@ +""" +Task to loop a list of items +""" +import sys +import logging + +from teuthology import run_tasks + +log = logging.getLogger(__name__) + +def task(ctx, config): + """ + Loop a sequential group of tasks + + example:: + + - loop: + count: 10 + body: + - tasktest: + - tasktest: + + :param ctx: Context + :param config: Configuration + """ + for i in range(config.get('count', 1)): + stack = [] + try: + for entry in config.get('body', []): + if not isinstance(entry, dict): + entry = ctx.config.get(entry, {}) + ((taskname, confg),) = entry.items() + log.info('In sequential, running task %s...' % taskname) + mgr = run_tasks.run_one_task(taskname, ctx=ctx, config=confg) + if hasattr(mgr, '__enter__'): + mgr.__enter__() + stack.append(mgr) + finally: + try: + exc_info = sys.exc_info() + while stack: + mgr = stack.pop() + mgr.__exit__(*exc_info) + finally: + del exc_info diff --git a/teuthology/task/mpi.py b/teuthology/task/mpi.py new file mode 100644 index 000000000..0e1b504dd --- /dev/null +++ b/teuthology/task/mpi.py @@ -0,0 +1,137 @@ +""" +Start mpi processes (and allow commands to be run inside process) +""" +import logging +import re + +from teuthology import misc as teuthology + +log = logging.getLogger(__name__) + + +def _check_mpi_version(remotes): + """ + Retrieve the MPI version from each of `remotes` and raise an exception + if they are not all the same version. 
+ """ + versions = set() + for remote in remotes: + version_str = remote.sh("mpiexec --version") + try: + version = re.search(r"^\s+Version:\s+(.+)$", version_str, re.MULTILINE).group(1) + except AttributeError: + raise RuntimeError("Malformed MPI version output: {0}".format(version_str)) + else: + versions.add(version) + + if len(versions) != 1: + raise RuntimeError("MPI version mismatch. Versions are: {0}".format(", ".join(versions))) + else: + log.info("MPI version {0}".format(list(versions)[0])) + + +def task(ctx, config): + """ + Setup MPI and execute commands + + Example that starts an MPI process on specific clients:: + + tasks: + - ceph: + - ceph-fuse: [client.0, client.1] + - ssh_keys: + - mpi: + nodes: [client.0, client.1] + exec: ior ... + + Example that starts MPI processes on all clients:: + + tasks: + - ceph: + - ceph-fuse: + - ssh_keys: + - mpi: + exec: ior ... + + Example that starts MPI processes on all roles:: + + tasks: + - ceph: + - ssh_keys: + - mpi: + nodes: all + exec: ... 
+ + Example that specifies a working directory for MPI processes: + + tasks: + - ceph: + - ceph-fuse: + - pexec: + clients: + - ln -s {testdir}/mnt.* {testdir}/gmnt + - ssh_keys: + - mpi: + exec: fsx-mpi + workdir: {testdir}/gmnt + - pexec: + clients: + - rm -f {testdir}/gmnt + + :param ctx: Context + :param config: Configuration + """ + assert isinstance(config, dict), 'task mpi got invalid config' + assert 'exec' in config, 'task mpi got invalid config, missing exec' + + testdir = teuthology.get_testdir(ctx) + + mpiexec = config['exec'].replace('$TESTDIR', testdir) + hosts = [] + remotes = [] + main_remote = None + if 'nodes' in config: + if isinstance(config['nodes'], str) and config['nodes'] == 'all': + for role in teuthology.all_roles(ctx.cluster): + (remote,) = ctx.cluster.only(role).remotes.keys() + ip,port = remote.ssh.get_transport().getpeername() + hosts.append(ip) + remotes.append(remote) + (main_remote,) = ctx.cluster.only(config['nodes'][0]).remotes.keys() + elif isinstance(config['nodes'], list): + for role in config['nodes']: + (remote,) = ctx.cluster.only(role).remotes.keys() + ip,port = remote.ssh.get_transport().getpeername() + hosts.append(ip) + remotes.append(remote) + (main_remote,) = ctx.cluster.only(config['nodes'][0]).remotes.keys() + else: + roles = ['client.{id}'.format(id=id_) for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')] + (main_remote,) = ctx.cluster.only(roles[0]).remotes.keys() + for role in roles: + (remote,) = ctx.cluster.only(role).remotes.keys() + ip,port = remote.ssh.get_transport().getpeername() + hosts.append(ip) + remotes.append(remote) + + # mpich is sensitive to different versions on different nodes + _check_mpi_version(remotes) + + workdir = [] + if 'workdir' in config: + workdir = ['-wdir', config['workdir'].replace('$TESTDIR', testdir) ] + + log.info('mpi rank 0 is: {name}'.format(name=main_remote.name)) + + # write out the mpi hosts file + log.info('mpi nodes: [%s]' % (', '.join(hosts))) + 
teuthology.write_file(remote=main_remote, + path='{tdir}/mpi-hosts'.format(tdir=testdir), + data='\n'.join(hosts)) + log.info('mpiexec on {name}: {cmd}'.format(name=main_remote.name, cmd=mpiexec)) + args=['mpiexec', '-f', '{tdir}/mpi-hosts'.format(tdir=testdir)] + args.extend(workdir) + args.extend(mpiexec.split(' ')) + main_remote.run(args=args, ) + log.info('mpi task completed') + main_remote.run(args=['rm', '{tdir}/mpi-hosts'.format(tdir=testdir)]) diff --git a/teuthology/task/nfs.py b/teuthology/task/nfs.py new file mode 100644 index 000000000..5cd3aac81 --- /dev/null +++ b/teuthology/task/nfs.py @@ -0,0 +1,146 @@ +""" +Nfs client tester +""" +import contextlib +import logging +import os + +from teuthology import misc as teuthology + +log = logging.getLogger(__name__) + +@contextlib.contextmanager +def task(ctx, config): + """ + Mount nfs client (requires nfs server export like knfsd or ganesh) + + Example that mounts a single nfs client:: + + tasks: + - ceph: + - kclient: [client.0] + - knfsd: [client.0] + - nfs: + client.1: + server: client.0 + - interactive: + + Example that mounts multiple nfs clients with options:: + + tasks: + - ceph: + - kclient: [client.0, client.1] + - knfsd: [client.0, client.1] + - nfs: + client.2: + server: client.0 + options: [rw,hard,intr,nfsvers=3] + client.3: + server: client.1 + options: [ro] + - workunit: + clients: + client.2: + - suites/dbench.sh + client.3: + - suites/blogbench.sh + + It is not recommended that the nfs client and nfs server reside on the same node. So in the example above client.0-3 should be on 4 distinct + nodes. The client nfs testing would be using only client.2 and client.3. 
+ """ + log.info('Mounting nfs clients...') + assert isinstance(config, dict) + + clients = list(teuthology.get_clients(ctx=ctx, roles=config.keys())) + + testdir = teuthology.get_testdir(ctx) + for id_, remote in clients: + mnt = os.path.join(testdir, 'mnt.{id}'.format(id=id_)) + client_config = config.get("client.%s" % id_) + if client_config is None: + client_config = {} + log.debug("Client client.%s config is %s" % (id_, client_config)) + + assert client_config.get('server') is not None + server = client_config.get('server'); + + svr_id = server[len('client.'):] + svr_mnt = os.path.join(testdir, 'mnt.{id}'.format(id=svr_id)) + + svr_remote = None + all_config = ['client.{id}'.format(id=tmpid) + for tmpid in teuthology.all_roles_of_type(ctx.cluster, 'client')] + all_clients = list(teuthology.get_clients(ctx=ctx, roles=all_config)) + for tmpid, tmpremote in all_clients: + if tmpid == svr_id: + svr_remote = tmpremote + break + + assert svr_remote is not None + svr_remote = svr_remote.name.split('@', 2)[1] + + if client_config.get('options') is not None: + opts = ','.join(client_config.get('options')) + else: + opts = 'rw' + + log.info('Mounting client.{id} from client.{sid}'.format(id=id_, sid=svr_id)) + log.debug('mount -o {opts} {remote}:{svr_mnt} {mnt}'.format( + remote=svr_remote, svr_mnt=svr_mnt, opts=opts, mnt=mnt)) + + remote.run( + args=[ + 'mkdir', + '--', + mnt, + ], + ) + + remote.run( + args=[ + 'sudo', + "mount", + "-o", + opts, + '{remote}:{mnt}'.format(remote=svr_remote, mnt=svr_mnt), + mnt + ], + ) + + try: + yield + finally: + log.info('Unmounting nfs clients...') + for id_, remote in clients: + log.debug('Unmounting nfs client client.{id}...'.format(id=id_)) + mnt = os.path.join(testdir, 'mnt.{id}'.format(id=id_)) + try: + log.debug('First, syncing client client.{id}'.format(id=id_)) + remote.run( + args=[ + 'sync' + ] + ) + remote.run( + args=[ + 'sudo', + 'lsof', '-V', '+D', + '{mnt}'.format(mnt=mnt), + ], + check_status=False + ) + finally: + 
remote.run( + args=[ + 'sudo', + 'umount', + mnt, + ], + ) + remote.run( + args=[ + 'rmdir', + '--', + mnt, + ], + ) diff --git a/teuthology/task/nop.py b/teuthology/task/nop.py new file mode 100644 index 000000000..c7b181403 --- /dev/null +++ b/teuthology/task/nop.py @@ -0,0 +1,13 @@ +""" +Null task +""" +def task(ctx, config): + """ + This task does nothing. + + For example:: + + tasks: + - nop: + """ + pass diff --git a/teuthology/task/parallel.py b/teuthology/task/parallel.py new file mode 100644 index 000000000..6999c0aae --- /dev/null +++ b/teuthology/task/parallel.py @@ -0,0 +1,71 @@ +""" +Task to group parallel running tasks +""" +import sys +import logging + +from teuthology import run_tasks +from teuthology import parallel + +log = logging.getLogger(__name__) + + +def task(ctx, config): + """ + Run a group of tasks in parallel. + + example:: + + - parallel: + - tasktest: + - tasktest: + + You can also define tasks in a top-level section outside of + 'tasks:', and reference them here. + + The referenced section must contain a list of tasks to run + sequentially, or a single task as a dict. The latter is only + available for backwards compatibility with existing suites:: + + tasks: + - parallel: + - tasktest: # task inline + - foo # reference to top-level 'foo' section + - bar # reference to top-level 'bar' section + foo: + - tasktest1: + - tasktest2: + bar: + tasktest: # note the list syntax from 'foo' is preferred + + That is, if the entry is not a dict, we will look it up in the top-level + config. + + Sequential tasks and Parallel tasks can be nested. 
+ """ + + log.info('starting parallel...') + with parallel.parallel() as p: + for entry in config: + if not isinstance(entry, dict): + entry = ctx.config.get(entry, {}) + # support the usual list syntax for tasks + if isinstance(entry, list): + entry = dict(sequential=entry) + ((taskname, confg),) = entry.items() + p.spawn(_run_spawned, ctx, confg, taskname) + + +def _run_spawned(ctx, config, taskname): + """Run one of the tasks (this runs in parallel with others)""" + mgr = {} + try: + log.info('In parallel, running task %s...' % taskname) + mgr = run_tasks.run_one_task(taskname, ctx=ctx, config=config) + if hasattr(mgr, '__enter__'): + mgr.__enter__() + finally: + exc_info = sys.exc_info() + if hasattr(mgr, '__exit__'): + mgr.__exit__(*exc_info) + del exc_info diff --git a/teuthology/task/parallel_example.py b/teuthology/task/parallel_example.py new file mode 100644 index 000000000..eb9659a81 --- /dev/null +++ b/teuthology/task/parallel_example.py @@ -0,0 +1,58 @@ +""" +Parallel contextmanager test +""" +import contextlib +import logging + +from teuthology import misc as teuthology +from teuthology import contextutil +from teuthology.orchestra import run + +log = logging.getLogger(__name__) + +@contextlib.contextmanager +def sequential_test(ctx, config): + """Example contextmanager that executes a command on remote hosts sequentially.""" + for role in config: + """Create a cluster composed of all hosts with the given role, and run the command on them sequentially.""" + log.info('Executing command on all hosts sequentially with role "%s"' % role) + ctx.cluster.only(role).run(args=['sleep', '5', run.Raw(';'), 'date', run.Raw(';'), 'hostname']) + yield + +@contextlib.contextmanager +def parallel_test(ctx, config): + """Example contextmanager that executes a command on remote hosts in parallel.""" + for role in config: + """Create a cluster composed of all hosts with the given role, and run the command on them concurrently.""" + log.info('Executing command on all 
hosts concurrently with role "%s"' % role) + cluster = ctx.cluster.only(role) + nodes = {} + for remote in cluster.remotes.keys(): + """Call run for each remote host, but use 'wait=False' to have it return immediately.""" + proc = remote.run(args=['sleep', '5', run.Raw(';'), 'date', run.Raw(';'), 'hostname'], wait=False,) + nodes[remote.name] = proc + for name, proc in nodes.items(): + """Wait for each process to finish before yielding and allowing other contextmanagers to run.""" + proc.wait() + yield + +@contextlib.contextmanager +def task(ctx, config): + """This is the main body of the task that gets run.""" + + """Take car of some yaml parsing here""" + if config is not None and not isinstance(config, list) and not isinstance(config, dict): + assert(False), "task parallel_example only supports a list or dictionary for configuration" + if config is None: + config = ['client.{id}'.format(id=id_) + for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')] + if isinstance(config, list): + config = dict.fromkeys(config) + clients = config.keys() + + """Run Multiple contextmanagers sequentially by nesting them.""" + with contextutil.nested( + lambda: parallel_test(ctx=ctx, config=clients), + lambda: sequential_test(ctx=ctx, config=clients), + ): + yield diff --git a/teuthology/task/pcp.j2 b/teuthology/task/pcp.j2 new file mode 100644 index 000000000..fe8261188 --- /dev/null +++ b/teuthology/task/pcp.j2 @@ -0,0 +1,15 @@ + + +{% if job_id %}job {{ job_id }} {% endif %}performance data + +{% for metric in graphs.keys() %} +{% if mode == 'static' %} +{% set url = graphs[metric].file.split('/')[-1] %} +{% else %} +{% set url = graphs[metric].url %} +{% endif %} +

{{ metric }} + +

+{% endfor %} + diff --git a/teuthology/task/pcp.py b/teuthology/task/pcp.py new file mode 100644 index 000000000..80458a131 --- /dev/null +++ b/teuthology/task/pcp.py @@ -0,0 +1,335 @@ +# maybe run pcp role? +import datetime +import dateutil.tz +import jinja2 +import logging +import os +import requests +import time + +from teuthology.util.compat import urljoin, urlencode + +from teuthology.config import config as teuth_config +from teuthology.orchestra import run + +from teuthology import misc +from teuthology.task import Task + +log = logging.getLogger(__name__) + + +# Because PCP output is nonessential, set a timeout to avoid stalling +# tests if the server does not respond promptly. +GRAPHITE_DOWNLOAD_TIMEOUT = 60 + + +class PCPDataSource(object): + def __init__(self, hosts, time_from, time_until='now'): + self.hosts = hosts + self.time_from = time_from + self.time_until = time_until + + +class PCPArchive(PCPDataSource): + archive_base_path = '/var/log/pcp/pmlogger' + archive_file_extensions = ('0', 'index', 'meta') + + def get_archive_input_dir(self, host): + return os.path.join( + self.archive_base_path, + host, + ) + + def get_pmlogextract_cmd(self, host): + cmd = [ + 'pmlogextract', + '-S', self._format_time(self.time_from), + '-T', self._format_time(self.time_until), + run.Raw(os.path.join( + self.get_archive_input_dir(host), + '*.0')), + ] + return cmd + + @staticmethod + def _format_time(seconds): + if isinstance(seconds, str): + return seconds + return "@ %s" % time.asctime(time.gmtime(seconds)) + + +class PCPGrapher(PCPDataSource): + _endpoint = '/' + + def __init__(self, hosts, time_from, time_until='now'): + super(PCPGrapher, self).__init__(hosts, time_from, time_until) + self.base_url = urljoin( + teuth_config.pcp_host, + self._endpoint) + + +class GrafanaGrapher(PCPGrapher): + _endpoint = '/grafana/index.html#/dashboard/script/index.js' + + def __init__(self, hosts, time_from, time_until='now', job_id=None): + super(GrafanaGrapher, 
self).__init__(hosts, time_from, time_until) + self.job_id = job_id + + def build_graph_url(self): + config = dict( + hosts=','.join(self.hosts), + time_from=self._format_time(self.time_from), + ) + if self.time_until: + config['time_to'] = self._format_time(self.time_until) + args = urlencode(config) + template = "{base_url}?{args}" + return template.format(base_url=self.base_url, args=args) + + @staticmethod + def _format_time(seconds): + if isinstance(seconds, str): + return seconds + seconds = int(seconds) + dt = datetime.datetime.fromtimestamp(seconds, dateutil.tz.tzutc()) + return dt.strftime('%Y-%m-%dT%H:%M:%S') + + +class GraphiteGrapher(PCPGrapher): + metrics = [ + 'kernel.all.load.1 minute', + 'mem.util.free', + 'mem.util.used', + 'network.interface.*.bytes.*', + 'disk.all.read_bytes', + 'disk.all.write_bytes', + ] + + graph_defaults = dict( + width='1200', + height='300', + hideLegend='false', + format='png', + ) + _endpoint = '/graphite/render' + + def __init__(self, hosts, time_from, time_until='now', dest_dir=None, + job_id=None): + super(GraphiteGrapher, self).__init__(hosts, time_from, time_until) + self.dest_dir = dest_dir + self.job_id = job_id + + def build_graph_urls(self): + if not hasattr(self, 'graphs'): + self.graphs = dict() + for metric in self.metrics: + metric_dict = self.graphs.get(metric, dict()) + metric_dict['url'] = self.get_graph_url(metric) + self.graphs[metric] = metric_dict + + def _check_dest_dir(self): + if not self.dest_dir: + raise RuntimeError("Must provide a dest_dir!") + + def write_html(self, mode='dynamic'): + self._check_dest_dir() + generated_html = self.generate_html(mode=mode) + html_path = os.path.join(self.dest_dir, 'pcp.html') + with open(html_path, 'w') as f: + f.write(generated_html) + + def generate_html(self, mode='dynamic'): + self.build_graph_urls() + cwd = os.path.dirname(__file__) + loader = jinja2.loaders.FileSystemLoader(cwd) + env = jinja2.Environment(loader=loader) + template = 
env.get_template('pcp.j2') + data = template.render( + job_id=self.job_id, + graphs=self.graphs, + mode=mode, + ) + return data + + def download_graphs(self): + self._check_dest_dir() + self.build_graph_urls() + for metric in self.graphs.keys(): + url = self.graphs[metric]['url'] + filename = self._sanitize_metric_name(metric) + '.png' + self.graphs[metric]['file'] = graph_path = os.path.join( + self.dest_dir, + filename, + ) + resp = requests.get(url, timeout=GRAPHITE_DOWNLOAD_TIMEOUT) + if not resp.ok: + log.warning( + "Graph download failed with error %s %s: %s", + resp.status_code, + resp.reason, + url, + ) + continue + with open(graph_path, 'wb') as f: + f.write(resp.content) + + def get_graph_url(self, metric): + config = dict(self.graph_defaults) + config.update({ + 'from': self.time_from, + 'until': self.time_until, + # urlencode with doseq=True encodes each item as a separate + # 'target=' arg + 'target': self.get_target_globs(metric), + }) + args = urlencode(config, doseq=True) + template = "{base_url}?{args}" + return template.format(base_url=self.base_url, args=args) + + def get_target_globs(self, metric=''): + globs = ['*{}*'.format(host) for host in self.hosts] + if metric: + globs = ['{}.{}'.format(glob, metric) for glob in globs] + return globs + + @staticmethod + def _sanitize_metric_name(metric): + result = metric + replacements = [ + (' ', '_'), + ('*', '_all_'), + ] + for rep in replacements: + result = result.replace(rep[0], rep[1]) + return result + + +class PCP(Task): + """ + Collects performance data using PCP during a job. 
+ + Configuration options include: + ``graphite``: Whether to render PNG graphs using Graphite (default: + True) + ``grafana``: Whether to build (and submit to paddles) a link to a + dynamic Grafana dashboard containing graphs of performance data + (default: True) + ``fetch_archives``: Whether to assemble and ship a raw PCP archive + containing performance data to the job's output archive (default: + False) + """ + enabled = True + + def __init__(self, ctx, config): + super(PCP, self).__init__(ctx, config) + if teuth_config.get('pcp_host') is None: + self.enabled = False + self.log = log + self.job_id = self.ctx.config.get('job_id') + # until the job stops, we may want to render graphs reflecting the most + # current data + self.stop_time = 'now' + self.use_graphite = self.config.get('graphite', True) + self.use_grafana = self.config.get('grafana', True) + # fetch_archives defaults to False for now because of various bugs in + # pmlogextract + self.fetch_archives = self.config.get('fetch_archives', False) + + def setup(self): + if not self.enabled: + return + super(PCP, self).setup() + self.start_time = int(time.time()) + log.debug("start_time: %s", self.start_time) + self.setup_collectors() + + def setup_collectors(self): + log.debug("cluster: %s", self.cluster) + hosts = [rem.shortname for rem in self.cluster.remotes.keys()] + self.setup_grafana(hosts) + self.setup_graphite(hosts) + self.setup_archive(hosts) + + def setup_grafana(self, hosts): + if self.use_grafana: + self.grafana = GrafanaGrapher( + hosts=hosts, + time_from=self.start_time, + time_until=self.stop_time, + job_id=self.job_id, + ) + + def setup_graphite(self, hosts): + if not getattr(self.ctx, 'archive', None): + self.use_graphite = False + if self.use_graphite: + out_dir = os.path.join( + self.ctx.archive, + 'pcp', + 'graphite', + ) + if not os.path.exists(out_dir): + os.makedirs(out_dir) + self.graphite = GraphiteGrapher( + hosts=hosts, + time_from=self.start_time, + time_until=self.stop_time, + 
dest_dir=out_dir, + job_id=self.job_id, + ) + + def setup_archive(self, hosts): + if not getattr(self.ctx, 'archive', None): + self.fetch_archives = False + if self.fetch_archives: + self.archiver = PCPArchive( + hosts=hosts, + time_from=self.start_time, + time_until=self.stop_time, + ) + + def begin(self): + if not self.enabled: + return + if self.use_grafana: + log.info( + "PCP+Grafana dashboard: %s", + self.grafana.build_graph_url(), + ) + if self.use_graphite: + self.graphite.write_html() + + def end(self): + if not self.enabled: + return + self.stop_time = int(time.time()) + self.setup_collectors() + log.debug("stop_time: %s", self.stop_time) + if self.use_grafana: + grafana_url = self.grafana.build_graph_url() + log.info( + "PCP+Grafana dashboard: %s", + grafana_url, + ) + if hasattr(self.ctx, 'summary'): + self.ctx.summary['pcp_grafana_url'] = grafana_url + if self.use_graphite: + try: + self.graphite.download_graphs() + self.graphite.write_html(mode='static') + except (requests.ConnectionError, requests.ReadTimeout): + log.exception("Downloading graphs failed!") + self.graphite.write_html() + if self.fetch_archives: + for remote in self.cluster.remotes.keys(): + log.info("Copying PCP data into archive...") + cmd = self.archiver.get_pmlogextract_cmd(remote.shortname) + archive_out_path = os.path.join( + misc.get_testdir(), + 'pcp_archive_%s' % remote.shortname, + ) + cmd.append(archive_out_path) + remote.run(args=cmd) + + +task = PCP diff --git a/teuthology/task/pexec.py b/teuthology/task/pexec.py new file mode 100644 index 000000000..4d18d2719 --- /dev/null +++ b/teuthology/task/pexec.py @@ -0,0 +1,149 @@ +""" +Handle parallel execution on remote hosts +""" +import logging + +from teuthology import misc as teuthology +from teuthology.parallel import parallel +from teuthology.orchestra import run as tor + +log = logging.getLogger(__name__) + +from gevent import queue as queue +from gevent import event as event + +def _init_barrier(barrier_queue, remote): + 
"""current just queues a remote host""" + barrier_queue.put(remote) + +def _do_barrier(barrier, barrier_queue, remote): + """special case for barrier""" + barrier_queue.get() + if barrier_queue.empty(): + barrier.set() + barrier.clear() + else: + barrier.wait() + + barrier_queue.put(remote) + if barrier_queue.full(): + barrier.set() + barrier.clear() + else: + barrier.wait() + +def _exec_host(barrier, barrier_queue, remote, sudo, testdir, ls): + """Execute command remotely""" + log.info('Running commands on host %s', remote.name) + args = [ + 'TESTDIR={tdir}'.format(tdir=testdir), + 'bash', + '-s' + ] + if sudo: + args.insert(0, 'sudo') + + r = remote.run( args=args, stdin=tor.PIPE, wait=False) + r.stdin.writelines(['set -e\n']) + r.stdin.flush() + for l in ls: + l.replace('$TESTDIR', testdir) + if l == "barrier": + _do_barrier(barrier, barrier_queue, remote) + continue + + r.stdin.writelines([l, '\n']) + r.stdin.flush() + r.stdin.writelines(['\n']) + r.stdin.flush() + r.stdin.close() + tor.wait([r]) + +def _generate_remotes(ctx, config): + """Return remote roles and the type of role specified in config""" + if 'all' in config and len(config) == 1: + ls = config['all'] + for remote in ctx.cluster.remotes.keys(): + yield (remote, ls) + elif 'clients' in config: + ls = config['clients'] + for role in teuthology.all_roles_of_type(ctx.cluster, 'client'): + (remote,) = ctx.cluster.only('client.{r}'.format(r=role)).remotes.keys() + yield (remote, ls) + del config['clients'] + for role, ls in config.items(): + (remote,) = ctx.cluster.only(role).remotes.keys() + yield (remote, ls) + else: + for role, ls in config.items(): + (remote,) = ctx.cluster.only(role).remotes.keys() + yield (remote, ls) + +def task(ctx, config): + """ + Execute commands on multiple hosts in parallel + + tasks: + - ceph: + - ceph-fuse: [client.0, client.1] + - pexec: + client.0: + - while true; do echo foo >> bar; done + client.1: + - sleep 1 + - tail -f bar + - interactive: + + Execute commands on 
    all hosts in the cluster in parallel. This
    is useful if there are many hosts and you want to run the same
    command on all:

        tasks:
        - pexec:
            all:
              - grep FAIL /var/log/ceph/*

    Or if you want to run in parallel on all clients:

        tasks:
        - pexec:
            clients:
              - dd if=/dev/zero of={testdir}/mnt.* count=1024 bs=1024

    You can also ensure that parallel commands are synchronized with the
    special 'barrier' statement:

        tasks:
        - pexec:
            clients:
              - cd {testdir}/mnt.*
              - while true; do
              - barrier
              - dd if=/dev/zero of=./foo count=1024 bs=1024
              - done

    The above writes to the file foo on all clients over and over, but ensures that
    all clients perform each write command in sync. If one client takes longer to
    write, all the other clients will wait.

    """
    log.info('Executing custom commands...')
    assert isinstance(config, dict), "task pexec got invalid config"

    # 'sudo' is a task-level flag, not a role; remove it before the
    # remaining keys are interpreted as roles/command lists.
    sudo = False
    if 'sudo' in config:
        sudo = config['sudo']
        del config['sudo']

    testdir = teuthology.get_testdir(ctx)

    remotes = list(_generate_remotes(ctx, config))
    count = len(remotes)
    # One barrier slot per participating remote; see _do_barrier.
    barrier_queue = queue.Queue(count)
    barrier = event.Event()

    for remote in remotes:
        _init_barrier(barrier_queue, remote[0])
    with parallel() as p:
        for remote in remotes:
            # remote is a (remote_obj, command_list) pair from _generate_remotes.
            p.spawn(_exec_host, barrier, barrier_queue, remote[0], sudo, testdir, remote[1])
diff --git a/teuthology/task/print.py b/teuthology/task/print.py
new file mode 100644
index 000000000..6594c1681
--- /dev/null
+++ b/teuthology/task/print.py
@@ -0,0 +1,25 @@
"""
Print task

A task that logs whatever is given to it as an argument. Can be used
like any other task (under sequential, etc...).

For example, the following would cause the strings "String" and "Another
String" to appear in the teuthology.log before and after the chef task
runs, respectively.
+ +tasks: +- print: "String" +- chef: null +- print: "Another String" +""" + +import logging + +log = logging.getLogger(__name__) + +def task(ctx, config): + """ + Print out config argument in teuthology log/output + """ + log.info('{config}'.format(config=config)) diff --git a/teuthology/task/proc_thrasher.py b/teuthology/task/proc_thrasher.py new file mode 100644 index 000000000..c01911c5a --- /dev/null +++ b/teuthology/task/proc_thrasher.py @@ -0,0 +1,80 @@ +""" +Process thrasher +""" +import logging +import gevent +import random +import time + +from teuthology.orchestra import run + +log = logging.getLogger(__name__) + +class ProcThrasher: + """ Kills and restarts some number of the specified process on the specified + remote + """ + def __init__(self, config, remote, *proc_args, **proc_kwargs): + self.proc_kwargs = proc_kwargs + self.proc_args = proc_args + self.config = config + self.greenlet = None + self.logger = proc_kwargs.get("logger", log.getChild('proc_thrasher')) + self.remote = remote + + # config: + self.num_procs = self.config.get("num_procs", 5) + self.rest_period = self.config.get("rest_period", 100) # seconds + self.run_time = self.config.get("run_time", 1000) # seconds + + def log(self, msg): + """ + Local log wrapper + """ + self.logger.info(msg) + + def start(self): + """ + Start thrasher. This also makes sure that the greenlet interface + is used. + """ + if self.greenlet is not None: + return + self.greenlet = gevent.Greenlet(self.loop) + self.greenlet.start() + + def join(self): + """ + Local join + """ + self.greenlet.join() + + def loop(self): + """ + Thrashing loop -- loops at time intervals. Inside that loop, the + code loops through the individual procs, creating new procs. 
+ """ + time_started = time.time() + procs = [] + self.log("Starting") + while time_started + self.run_time > time.time(): + if len(procs) > 0: + self.log("Killing proc") + proc = random.choice(procs) + procs.remove(proc) + proc.stdin.close() + self.log("About to wait") + run.wait([proc]) + self.log("Killed proc") + + while len(procs) < self.num_procs: + self.log("Creating proc " + str(len(procs) + 1)) + self.log("args are " + str(self.proc_args) + " kwargs: " + str(self.proc_kwargs)) + procs.append(self.remote.run( + *self.proc_args, + ** self.proc_kwargs)) + self.log("About to sleep") + time.sleep(self.rest_period) + self.log("Just woke") + + run.wait(procs) diff --git a/teuthology/task/selinux.py b/teuthology/task/selinux.py new file mode 100644 index 000000000..7b33b11b3 --- /dev/null +++ b/teuthology/task/selinux.py @@ -0,0 +1,219 @@ +import logging +import os + +from io import StringIO + +from teuthology.exceptions import SELinuxError +from teuthology.misc import get_archive_dir +from teuthology.orchestra.cluster import Cluster +from teuthology.orchestra import run + +from teuthology.task import Task + +log = logging.getLogger(__name__) + + +class SELinux(Task): + """ + A task to set the SELinux mode during test execution. Note that SELinux + must first be enabled and the filesystem must have been labeled. + + On teardown, also checks the audit log for any denials. + By default selinux will ignore few known denials(listed below). The test + will fail for any other denials seen in audit.log. For the test not to + fail for other denials one can add the overrides with appropriate escapes + overrides: + selinux: + allowlist: + - 'name="cephtest"' + - 'dmidecode' + - 'comm="logrotate"' + - 'comm="idontcare"' + + Known denials which are ignored: + comm="dmidecode" + chronyd.service + name="cephtest" + + + Automatically skips hosts running non-RPM-based OSes. 
+ """ + def __init__(self, ctx, config): + super(SELinux, self).__init__(ctx, config) + self.log = log + self.mode = self.config.get('mode', 'permissive') + + def filter_hosts(self): + """ + Exclude any non-RPM-based hosts, and any downburst VMs + """ + super(SELinux, self).filter_hosts() + new_cluster = Cluster() + for (remote, roles) in self.cluster.remotes.items(): + if remote.is_vm: + msg = "Excluding {host}: VMs are not yet supported" + log.info(msg.format(host=remote.shortname)) + elif remote.is_container: + msg = "Excluding {host}: containers are not yet supported" + log.info(msg.format(host=remote.shortname)) + elif remote.os.name in ['opensuse', 'sle']: + msg = "Excluding {host}: \ + SELinux is not supported for '{os}' os_type yet" + log.info(msg.format(host=remote.shortname, os=remote.os.name)) + elif remote.os.package_type == 'rpm': + new_cluster.add(remote, roles) + else: + msg = "Excluding {host}: OS '{os}' does not support SELinux" + log.debug(msg.format(host=remote.shortname, os=remote.os.name)) + self.cluster = new_cluster + return self.cluster + + def setup(self): + super(SELinux, self).setup() + self.rotate_log() + self.old_modes = self.get_modes() + self.old_denials = self.get_denials() + self.set_mode() + + def rotate_log(self): + self.cluster.run(args="sudo service auditd rotate") + + def get_modes(self): + """ + Get the current SELinux mode from each host so that we can restore + during teardown + """ + + log.debug("Getting current SELinux state") + modes = dict() + for remote in self.cluster.remotes.keys(): + result = remote.run( + args=['/usr/sbin/getenforce'], + stdout=StringIO(), + ) + modes[remote.name] = result.stdout.getvalue().strip().lower() + log.debug("Existing SELinux modes: %s", modes) + return modes + + def set_mode(self): + """ + Set the requested SELinux mode + """ + log.info("Putting SELinux into %s mode", self.mode) + for remote in self.cluster.remotes.keys(): + mode = self.old_modes[remote.name] + if mode == "Disabled" or 
mode == "disabled": + continue + remote.run( + args=['sudo', '/usr/sbin/setenforce', self.mode], + ) + + def get_denials(self): + """ + Look for denials in the audit log + """ + all_denials = dict() + # dmidecode issue: + # https://bugzilla.redhat.com/show_bug.cgi?id=1289274 + # tracker for chronyd/cephtest issue: + # http://tracker.ceph.com/issues/14244 + known_denials = [ + 'comm="dmidecode"', + 'chronyd.service', + 'name="cephtest"', + 'scontext=system_u:system_r:nrpe_t:s0', + 'scontext=system_u:system_r:pcp_pmlogger_t', + 'scontext=system_u:system_r:pcp_pmcd_t:s0', + 'comm="rhsmd"', + 'scontext=system_u:system_r:syslogd_t:s0', + 'tcontext=system_u:system_r:nrpe_t:s0', + 'comm="updatedb"', + 'comm="smartd"', + 'comm="rhsmcertd-worke"', + 'comm="setroubleshootd"', + 'comm="rpm"', + 'tcontext=system_u:object_r:container_runtime_exec_t:s0', + 'comm="ksmtuned"', + 'comm="sssd"', + 'comm="sss_cache"', + 'context=system_u:system_r:NetworkManager_dispatcher_t:s0', + 'context=system_u:system_r:getty_t:s0', + ] + se_allowlist = self.config.get('allowlist', []) + if se_allowlist: + known_denials.extend(se_allowlist) + get_denials_cmd = ['sudo', 'grep', '-a', 'avc: .*denied', '/var/log/audit/audit.log'] + filter_denials_cmd = ['grep', '-av'] + for known_denial in known_denials: + filter_denials_cmd.extend(['-e', known_denial]) + for remote in self.cluster.remotes.keys(): + proc = remote.run( + args = get_denials_cmd + [run.Raw('|')] + filter_denials_cmd, + stdout=StringIO(), + check_status=False, + ) + output = proc.stdout.getvalue() + if output: + denials = output.strip().split('\n') + log.debug("%s has %s denials", remote.name, len(denials)) + else: + denials = [] + all_denials[remote.name] = denials + return all_denials + + def teardown(self): + self.restore_modes() + self.archive_log() + self.get_new_denials() + + def restore_modes(self): + """ + If necessary, restore previous SELinux modes + """ + # If there's nothing to do, skip this + if not 
set(self.old_modes.values()).difference(set([self.mode])): + return + log.info("Restoring old SELinux modes") + for remote in self.cluster.remotes.keys(): + mode = self.old_modes[remote.name] + if mode == "Disabled" or mode == "disabled": + continue + if mode != self.mode: + remote.run( + args=['sudo', '/usr/sbin/setenforce', mode], + ) + + def archive_log(self): + if not hasattr(self.ctx, 'archive') or not self.ctx.archive: + return + archive_dir = get_archive_dir(self.ctx) + audit_archive = os.path.join(archive_dir, 'audit') + mkdir_cmd = "mkdir {audit_archive}" + cp_cmd = "sudo cp /var/log/audit/audit.log {audit_archive}" + chown_cmd = "sudo chown $USER {audit_archive}/audit.log" + gzip_cmd = "gzip {audit_archive}/audit.log" + full_cmd = " && ".join((mkdir_cmd, cp_cmd, chown_cmd, gzip_cmd)) + self.cluster.run( + args=full_cmd.format(audit_archive=audit_archive) + ) + + def get_new_denials(self): + """ + Determine if there are any new denials in the audit log + """ + all_denials = self.get_denials() + new_denials = dict() + for remote in self.cluster.remotes.keys(): + old_host_denials = self.old_denials[remote.name] + all_host_denials = all_denials[remote.name] + new_host_denials = set(all_host_denials).difference( + set(old_host_denials) + ) + new_denials[remote.name] = list(new_host_denials) + + for remote in self.cluster.remotes.keys(): + if len(new_denials[remote.name]): + raise SELinuxError(node=remote, + denials=new_denials[remote.name]) + +task = SELinux diff --git a/teuthology/task/sequential.py b/teuthology/task/sequential.py new file mode 100644 index 000000000..2414336fe --- /dev/null +++ b/teuthology/task/sequential.py @@ -0,0 +1,58 @@ +""" +Task sequencer +""" +import sys +import logging + +from teuthology import run_tasks + +log = logging.getLogger(__name__) + + +def task(ctx, config): + """ + Sequentialize a group of tasks into one executable block + + example:: + + - sequential: + - tasktest: + - tasktest: + + You can also reference the job from 
elsewhere:: + + foo: + tasktest: + tasks: + - sequential: + - tasktest: + - foo + - tasktest: + + That is, if the entry is not a dict, we will look it up in the top-level + config. + + Sequential tasks and Parallel tasks can be nested. + + :param ctx: Context + :param config: Configuration + """ + stack = [] + try: + for entry in config: + if not isinstance(entry, dict): + entry = ctx.config.get(entry, {}) + ((taskname, confg),) = entry.items() + log.info('In sequential, running task %s...' % taskname) + mgr = run_tasks.run_one_task(taskname, ctx=ctx, config=confg) + if hasattr(mgr, '__enter__'): + mgr.__enter__() + stack.append(mgr) + finally: + try: + exc_info = sys.exc_info() + while stack: + mgr = stack.pop() + mgr.__exit__(*exc_info) + finally: + del exc_info diff --git a/teuthology/task/sleep.py b/teuthology/task/sleep.py new file mode 100644 index 000000000..bd6d44544 --- /dev/null +++ b/teuthology/task/sleep.py @@ -0,0 +1,32 @@ +""" +Sleep task +""" +import logging +import time + +log = logging.getLogger(__name__) + + +def task(ctx, config): + """ + Sleep for some number of seconds. 
+ + Example:: + + + tasks: + - install: + - ceph: + - sleep: + duration: 10 + - interactive: + + :param ctx: Context + :param config: Configuration + """ + if not config: + config = {} + assert isinstance(config, dict) + duration = int(config.get('duration', 5)) + log.info('Sleeping for {} seconds'.format(duration)) + time.sleep(duration) diff --git a/teuthology/task/ssh_keys.py b/teuthology/task/ssh_keys.py new file mode 100644 index 000000000..0f497ebe9 --- /dev/null +++ b/teuthology/task/ssh_keys.py @@ -0,0 +1,207 @@ +#!/usr/bin/python +""" +Ssh-key key handlers and associated routines +""" +import contextlib +import datetime +import logging +import paramiko +import re + +from io import StringIO +from teuthology import contextutil +import teuthology.misc as misc +from teuthology.orchestra import run + +log = logging.getLogger(__name__) +ssh_keys_user = 'ssh-keys-user' + + +def timestamp(format_='%Y-%m-%d_%H:%M:%S:%f'): + """ + Return a UTC timestamp suitable for use in filenames + """ + return datetime.datetime.now(datetime.timezone.utc).strftime(format_) + + +def backup_file(remote, path, sudo=False): + """ + Creates a backup of a file on the remote, simply by copying it and adding a + timestamp to the name. 
+ """ + backup_path = "{path}_{timestamp}".format( + path=path, timestamp=timestamp() + ) + args = [ + 'cp', '-v', '-a', path, backup_path, + ] + if sudo: + args.insert(0, 'sudo') + remote.run(args=args) + return backup_path + + +def generate_keys(): + """ + Generatees a public and private key + """ + key = paramiko.RSAKey.generate(2048) + privateString = StringIO() + key.write_private_key(privateString) + return key.get_base64(), privateString.getvalue() + +def particular_ssh_key_test(line_to_test, ssh_key): + """ + Check the validity of the ssh_key + """ + match = re.match(r'[\w-]+ {key} \S+@\S+'.format(key=re.escape(ssh_key)), line_to_test) + + if match: + return False + else: + return True + +def ssh_keys_user_line_test(line_to_test, username ): + """ + Check the validity of the username + """ + match = re.match(r'[\w-]+ \S+ {username}@\S+'.format(username=username), line_to_test) + + if match: + return False + else: + return True + +def cleanup_added_key(ctx, key_backup_files, path): + """ + Delete the keys and removes ~/.ssh/authorized_keys entries we added + """ + log.info('cleaning up keys added for testing') + + for remote in ctx.cluster.remotes: + username, hostname = str(remote).split('@') + if "" == username or "" == hostname: + continue + else: + log.info(' cleaning up keys for user {user} on {host}'.format(host=hostname, user=username)) + misc.delete_file(remote, '/home/{user}/.ssh/id_rsa'.format(user=username)) + misc.delete_file(remote, '/home/{user}/.ssh/id_rsa.pub'.format(user=username)) + misc.move_file(remote, key_backup_files[remote], path) + +@contextlib.contextmanager +def tweak_ssh_config(ctx, config): + """ + Turn off StrictHostKeyChecking + """ + run.wait( + ctx.cluster.run( + args=[ + 'echo', + 'StrictHostKeyChecking no\n', + run.Raw('>'), + run.Raw('/home/ubuntu/.ssh/config'), + run.Raw('&&'), + 'echo', + 'UserKnownHostsFile ', + run.Raw('/dev/null'), + run.Raw('>>'), + run.Raw('/home/ubuntu/.ssh/config'), + run.Raw('&&'), + 
run.Raw('chmod 600 /home/ubuntu/.ssh/config'), + ], + wait=False, + ) + ) + + try: + yield + + finally: + run.wait( + ctx.cluster.run( + args=['rm',run.Raw('/home/ubuntu/.ssh/config')], + wait=False + ), + ) + +@contextlib.contextmanager +def push_keys_to_host(ctx, config, public_key, private_key): + """ + Push keys to all hosts + """ + log.info('generated public key {pub_key}'.format(pub_key=public_key)) + + # add an entry for all hosts in ctx to auth_keys_data + auth_keys_data = '' + + for inner_host in ctx.cluster.remotes.keys(): + inner_username, inner_hostname = str(inner_host).split('@') + # create a 'user@hostname' string using our fake hostname + fake_hostname = '{user}@{host}'.format(user=ssh_keys_user, host=str(inner_hostname)) + auth_keys_data += '\nssh-rsa {pub_key} {user_host}\n'.format(pub_key=public_key, user_host=fake_hostname) + + key_backup_files = dict() + # for each host in ctx, add keys for all other hosts + for remote in ctx.cluster.remotes: + username, hostname = str(remote).split('@') + if "" == username or "" == hostname: + continue + else: + log.info('pushing keys to {host} for {user}'.format(host=hostname, user=username)) + + # adding a private key + priv_key_file = '/home/{user}/.ssh/id_rsa'.format(user=username) + priv_key_data = '{priv_key}'.format(priv_key=private_key) + misc.delete_file(remote, priv_key_file, force=True) + # Hadoop requires that .ssh/id_rsa have permissions of '500' + remote.write_file(priv_key_file, priv_key_data, mode='0500') + + # then a private key + pub_key_file = '/home/{user}/.ssh/id_rsa.pub'.format(user=username) + pub_key_data = 'ssh-rsa {pub_key} {user_host}'.format(pub_key=public_key, user_host=str(remote)) + misc.delete_file(remote, pub_key_file, force=True) + remote.write_file(pub_key_file, pub_key_data) + + # add appropriate entries to the authorized_keys file for this host + auth_keys_file = '/home/{user}/.ssh/authorized_keys'.format( + user=username) + key_backup_files[remote] = backup_file(remote, 
auth_keys_file) + misc.append_lines_to_file(remote, auth_keys_file, auth_keys_data) + + try: + yield + + finally: + # cleanup the keys + log.info("Cleaning up SSH keys") + cleanup_added_key(ctx, key_backup_files, auth_keys_file) + + +@contextlib.contextmanager +def task(ctx, config): + """ + Creates a set of RSA keys, distributes the same key pair + to all hosts listed in ctx.cluster, and adds all hosts + to all others authorized_keys list. + + During cleanup it will delete .ssh/id_rsa, .ssh/id_rsa.pub + and remove the entries in .ssh/authorized_keys while leaving + pre-existing entries in place. + """ + + if config is None: + config = {} + assert isinstance(config, dict), \ + "task hadoop only supports a dictionary for configuration" + + # this does not need to do cleanup and does not depend on + # ctx, so I'm keeping it outside of the nested calls + public_key_string, private_key_string = generate_keys() + + with contextutil.nested( + lambda: tweak_ssh_config(ctx, config), + lambda: push_keys_to_host(ctx, config, public_key_string, private_key_string), + #lambda: tweak_ssh_config(ctx, config), + ): + yield + diff --git a/teuthology/task/tasktest.py b/teuthology/task/tasktest.py new file mode 100644 index 000000000..40926c569 --- /dev/null +++ b/teuthology/task/tasktest.py @@ -0,0 +1,50 @@ +""" +Parallel and sequential task tester. Not used by any ceph tests, but used to +unit test the parallel and sequential tasks +""" +import logging +import contextlib +import time + +log = logging.getLogger(__name__) + +@contextlib.contextmanager +def task(ctx, config): + """ + Task that just displays information when it is create and when it is + destroyed/cleaned up. This task was used to test parallel and + sequential task options. + + example:: + + tasks: + - sequential: + - tasktest: + - id: 'foo' + - tasktest: + - id: 'bar' + - delay:5 + - tasktest: + + The above yaml will sequentially start a test task named foo and a test + task named bar. 
Bar will take 5 seconds to complete. After foo and bar + have finished, an unidentified tasktest task will run. + """ + try: + delay = config.get('delay', 0) + id = config.get('id', 'UNKNOWN') + except AttributeError: + delay = 0 + id = 'UNKNOWN' + try: + log.info('**************************************************') + log.info('Started task test -- %s' % id) + log.info('**************************************************') + time.sleep(delay) + yield + + finally: + log.info('**************************************************') + log.info('Task test is being cleaned up -- %s' % id) + log.info('**************************************************') + diff --git a/teuthology/task/tests/__init__.py b/teuthology/task/tests/__init__.py new file mode 100644 index 000000000..b558341cf --- /dev/null +++ b/teuthology/task/tests/__init__.py @@ -0,0 +1,170 @@ +""" +This task runs teuthology's unit tests and integration tests. +It can run in one of two modes: "py" or "cli". The latter executes py.test in a +separate process, whereas the former invokes it in the teuthology job's python +process. +If the running job has remotes available to it, it will attempt to run integration tests. +Note that this requires running in "py" mode - the default. 
+ +An example:: + + tasks + - tests: +""" +import logging +import os +import pathlib +import pexpect +import pytest + +from teuthology.job_status import set_status +from teuthology.task import Task +from teuthology.util.loggerfile import LoggerFile + + +log = logging.getLogger(__name__) + + +class TeuthologyContextPlugin(object): + def __init__(self, ctx, config): + self.ctx = ctx + self.config = config + self.failures = list() + self.stats = dict() + + # this is pytest hook for generating tests with custom parameters + def pytest_generate_tests(self, metafunc): + # pass the teuthology ctx and config to each test method + if "ctx" in metafunc.fixturenames and \ + "config" in metafunc.fixturenames: + metafunc.parametrize(["ctx", "config"], [(self.ctx, self.config),]) + + # log the outcome of each test + @pytest.hookimpl(hookwrapper=True) + def pytest_runtest_makereport(self, item: pytest.Item, call: pytest.CallInfo): + outcome = yield + report = outcome.get_result() + test_path = item.location[0] + line_no = item.location[1] + test_name = item.location[2] + name = f"{test_path}:{line_no}:{test_name}" + log_msg = f"{report.outcome.upper()} {name}" + outcome_str = report.outcome.lower() + self.stats.setdefault(outcome_str, 0) + self.stats[outcome_str] += 1 + if outcome_str in ['passed', 'skipped']: + if call.when == 'call': + log.info(log_msg) + else: + log.info(f"----- {name} {call.when} -----") + else: + log_msg = f"{log_msg}:{call.when}" + if call.excinfo: + self.failures.append(name) + log_msg = f"{log_msg}: {call.excinfo.getrepr()}" + else: + self.failures.append(log_msg) + log.error(log_msg) + + return + + +# https://docs.pytest.org/en/stable/reference/exit-codes.html +exit_codes = { + 0: "All tests were collected and passed successfully", + 1: "Tests were collected and run but some of the tests failed", + 2: "Test execution was interrupted by the user", + 3: "Internal error happened while executing tests", + 4: "pytest command line usage error", + 5: "No tests 
were collected", +} + + +class Tests(Task): + """ + Use pytest to recurse through this directory, finding any tests + and then executing them with the teuthology ctx and config args. + Your tests must follow standard pytest conventions to be discovered. + + If config["mode"] == "py", (the default), it will be run in the job's process. + If config["mode"] == "cli" py.test will be invoked as a subprocess. + """ + base_args = ['-v', '--color=no'] + + def setup(self): + super().setup() + mode = self.config.get("mode", "py") + assert mode in ["py", "cli"], "mode must either be 'py' or 'cli'" + if mode == "cli": + # integration tests need ctx from this process, so we need to invoke + # pytest via python to be able to pass them + assert len(self.cluster.remotes) == 0, \ + "Tests requiring remote nodes conflicts with CLI mode" + self.mode = mode + self.stats = dict() + self.orig_curdir = os.curdir + + def begin(self): + super().begin() + try: + if self.mode == "py": + self.status, self.failures = self.run_py() + else: + self.status, self.failures = self.run_cli() + except Exception as e: + log.exception("Saw non-test failure!") + self.ctx.summary['failure_reason'] = str(e) + set_status(self.ctx.summary, "dead") + + def end(self): + if os.curdir != self.orig_curdir: + os.chdir(self.orig_curdir) + if self.stats: + log.info(f"Stats: {self.stats}") + if self.status == 0: + log.info("OK. 
All tests passed!") + set_status(self.ctx.summary, "pass") + else: + status_msg = str(self.status) + if self.status in exit_codes: + status_msg = f"{status_msg}: {exit_codes[self.status]}" + log.error(f"FAIL (exit code {status_msg})") + if self.failures: + msg = f"{len(self.failures)} Failures: {self.failures}" + self.ctx.summary['failure_reason'] = msg + log.error(msg) + set_status(self.ctx.summary, "fail") + super().end() + + def run_cli(self): + pytest_args = self.base_args + ['./teuthology/test', './scripts'] + if len(self.cluster.remotes): + pytest_args.append('./teuthology/task/tests') + self.log.info(f"pytest args: {pytest_args}") + cwd = str(pathlib.Path(__file__).parents[3]) + log.info(f"pytest cwd: {cwd}") + _, status = pexpect.run( + "py.test " + " ".join(pytest_args), + cwd=cwd, + withexitstatus=True, + timeout=None, + logfile=LoggerFile(self.log, logging.INFO), + ) + return status, [] + + def run_py(self): + pytest_args = self.base_args + ['--pyargs', 'teuthology', 'scripts'] + if len(self.cluster.remotes): + pytest_args.append(__name__) + self.log.info(f"pytest args: {pytest_args}") + context_plugin = TeuthologyContextPlugin(self.ctx, self.config) + # the cwd needs to change so that FakeArchive can find files in this repo + os.chdir(str(pathlib.Path(__file__).parents[3])) + status = pytest.main( + args=pytest_args, + plugins=[context_plugin], + ) + self.stats = context_plugin.stats + return status, context_plugin.failures + +task = Tests diff --git a/teuthology/task/tests/test_fetch_coredumps.py b/teuthology/task/tests/test_fetch_coredumps.py new file mode 100644 index 000000000..2a9a7de09 --- /dev/null +++ b/teuthology/task/tests/test_fetch_coredumps.py @@ -0,0 +1,116 @@ +from teuthology.task.internal import fetch_binaries_for_coredumps +from unittest.mock import patch, Mock +import gzip +import os + +class TestFetchCoreDumps(object): + class MockDecode(object): + def __init__(self, ret): + self.ret = ret + pass + + def decode(self): + return 
self.ret + + class MockPopen(object): + def __init__(self, ret): + self.ret = ret + + def communicate(self, input=None): + return [TestFetchCoreDumps.MockDecode(self.ret)] + + def setup_method(self): + self.the_function = fetch_binaries_for_coredumps + with gzip.open('file.gz', 'wb') as f: + f.write(b'Hello world!') + self.core_dump_path = "file.gz" + self.m_remote = Mock() + self.uncompressed_correct = self.MockPopen( + "ELF 64-bit LSB core file,"\ + " x86-64, version 1 (SYSV), SVR4-style, from 'ceph_test_rados_api_io',"\ + " real uid: 1194, effective uid: 1194, real gid: 1194,"\ + " effective gid: 1194, execfn: '/usr/bin/ceph_test_rados_api_io', platform: 'x86_64'" + ) + self.uncompressed_incorrect = self.MockPopen("ASCII text") + self.compressed_correct = self.MockPopen( + "gzip compressed data, was "\ + "'correct.format.core', last modified: Wed Jun 29"\ + " 19:55:29 2022, from Unix, original size modulo 2^32 3167080" + ) + + self.compressed_incorrect = self.MockPopen( + "gzip compressed data, was "\ + "'incorrect.format.core', last modified: Wed Jun 29"\ + " 19:56:56 2022, from Unix, original size modulo 2^32 11" + ) + + # Core is not compressed and file is in the correct format + @patch('teuthology.task.internal.subprocess.Popen') + @patch('teuthology.task.internal.os') + def test_uncompressed_correct_format(self, m_os, m_subproc_popen): + m_subproc_popen.side_effect = [ + self.uncompressed_correct, + Exception("We shouldn't be hitting this!") + ] + m_os.path.join.return_value = self.core_dump_path + m_os.path.sep = self.core_dump_path + m_os.path.isdir.return_value = True + m_os.path.dirname.return_value = self.core_dump_path + m_os.path.exists.return_value = True + m_os.listdir.return_value = [self.core_dump_path] + self.the_function(None, self.m_remote) + assert self.m_remote.get_file.called + + # Core is not compressed and file is in the wrong format + @patch('teuthology.task.internal.subprocess.Popen') + @patch('teuthology.task.internal.os') + def 
test_uncompressed_incorrect_format(self, m_os, m_subproc_popen): + m_subproc_popen.side_effect = [ + self.uncompressed_incorrect, + Exception("We shouldn't be hitting this!") + ] + m_os.path.join.return_value = self.core_dump_path + m_os.path.sep = self.core_dump_path + m_os.path.isdir.return_value = True + m_os.path.dirname.return_value = self.core_dump_path + m_os.path.exists.return_value = True + m_os.listdir.return_value = [self.core_dump_path] + self.the_function(None, self.m_remote) + assert self.m_remote.get_file.called == False + + # Core is compressed and file is in the correct format + @patch('teuthology.task.internal.subprocess.Popen') + @patch('teuthology.task.internal.os') + def test_compressed_correct_format(self, m_os, m_subproc_popen): + m_subproc_popen.side_effect = [ + self.compressed_correct, + self.uncompressed_correct + ] + m_os.path.join.return_value = self.core_dump_path + m_os.path.sep = self.core_dump_path + m_os.path.isdir.return_value = True + m_os.path.dirname.return_value = self.core_dump_path + m_os.path.exists.return_value = True + m_os.listdir.return_value = [self.core_dump_path] + self.the_function(None, self.m_remote) + assert self.m_remote.get_file.called + + # Core is compressed and file is in the wrong format + @patch('teuthology.task.internal.subprocess.Popen') + @patch('teuthology.task.internal.os') + def test_compressed_incorrect_format(self, m_os, m_subproc_popen): + m_subproc_popen.side_effect = [ + self.compressed_incorrect, + self.uncompressed_incorrect + ] + m_os.path.join.return_value = self.core_dump_path + m_os.path.sep = self.core_dump_path + m_os.path.isdir.return_value = True + m_os.path.dirname.return_value = self.core_dump_path + m_os.path.exists.return_value = True + m_os.listdir.return_value = [self.core_dump_path] + self.the_function(None, self.m_remote) + assert self.m_remote.get_file.called == False + + def teardown(self): + os.remove(self.core_dump_path) \ No newline at end of file diff --git 
a/teuthology/task/tests/test_locking.py b/teuthology/task/tests/test_locking.py new file mode 100644 index 000000000..05b0f45ad --- /dev/null +++ b/teuthology/task/tests/test_locking.py @@ -0,0 +1,25 @@ +import pytest + + +class TestLocking(object): + + def test_correct_os_type(self, ctx, config): + os_type = ctx.config.get("os_type") + if os_type is None: + pytest.skip('os_type was not defined') + for remote in ctx.cluster.remotes.keys(): + assert remote.os.name == os_type + + def test_correct_os_version(self, ctx, config): + os_version = ctx.config.get("os_version") + if os_version is None: + pytest.skip('os_version was not defined') + if ctx.config.get("os_type") == "debian": + pytest.skip('known issue with debian versions; see: issue #10878') + for remote in ctx.cluster.remotes.keys(): + assert remote.inventory_info['os_version'] == os_version + + def test_correct_machine_type(self, ctx, config): + machine_type = ctx.machine_type + for remote in ctx.cluster.remotes.keys(): + assert remote.machine_type in machine_type diff --git a/teuthology/task/tests/test_run.py b/teuthology/task/tests/test_run.py new file mode 100644 index 000000000..f86b0b4f1 --- /dev/null +++ b/teuthology/task/tests/test_run.py @@ -0,0 +1,40 @@ +import logging +import pytest + +from io import StringIO + +from teuthology.exceptions import CommandFailedError + +log = logging.getLogger(__name__) + + +class TestRun(object): + """ + Tests to see if we can make remote procedure calls to the current cluster + """ + + def test_command_failed_label(self, ctx, config): + result = "" + try: + ctx.cluster.run( + args=["python3", "-c", "assert False"], + label="working as expected, nothing to see here" + ) + except CommandFailedError as e: + result = str(e) + + assert "working as expected" in result + + def test_command_failed_no_label(self, ctx, config): + with pytest.raises(CommandFailedError): + ctx.cluster.run( + args=["python3", "-c", "assert False"], + ) + + def test_command_success(self, ctx, 
config): + result = StringIO() + ctx.cluster.run( + args=["python3", "-c", "print('hi')"], + stdout=result + ) + assert result.getvalue().strip() == "hi" diff --git a/teuthology/task/timer.py b/teuthology/task/timer.py new file mode 100644 index 000000000..2abf18827 --- /dev/null +++ b/teuthology/task/timer.py @@ -0,0 +1,46 @@ +""" +Timer task +""" +import logging +import contextlib +import datetime + +log = logging.getLogger(__name__) + +@contextlib.contextmanager +def task(ctx, config): + """ + Timer + + Measure the time that this set of tasks takes and save that value in the summary file. + Config is a description of what we are timing. + + example:: + + tasks: + - ceph: + - foo: + - timer: "fsx run" + - fsx: + + """ + start = datetime.datetime.now() + log.debug("got here in timer") + try: + yield + finally: + nowinfo = datetime.datetime.now() + elapsed = nowinfo - start + datesaved = nowinfo.isoformat(' ') + hourz, remainder = divmod(elapsed.seconds, 3600) + minutez, secondz = divmod(remainder, 60) + elapsedtime = "%02d:%02d:%02d.%06d" % (hourz,minutez,secondz, elapsed.microseconds) + dateinfo = (datesaved, elapsedtime) + if not 'timer' in ctx.summary: + ctx.summary['timer'] = {config : [dateinfo]} + else: + if config in ctx.summary['timer']: + ctx.summary['timer'][config].append(dateinfo) + else: + ctx.summary['timer'][config] = [dateinfo] + log.info('Elapsed time for %s -- %s' % (config,elapsedtime)) diff --git a/teuthology/templates/email-sleep-before-teardown.jinja2 b/teuthology/templates/email-sleep-before-teardown.jinja2 new file mode 100644 index 000000000..9cc054f40 --- /dev/null +++ b/teuthology/templates/email-sleep-before-teardown.jinja2 @@ -0,0 +1,10 @@ +Teuthology job {{ run_name }}/{{ job_id }} has fallen asleep at {{ sleep_date }} for {{ sleep_time }} + +Owner: {{ owner }} +Suite Name: {{ suite_name }} +Sleep Date: {{ sleep_date }} +Sleep Time: {{ sleep_time_sec }} seconds ({{ sleep_time }}) +Job Info: {{ job_info }} +Job Logs: {{ job_logs }} 
+Task Stack: {{ task_stack }} +Current Status: {{ status }} diff --git a/teuthology/templates/rocketchat-sleep-before-teardown.jinja2 b/teuthology/templates/rocketchat-sleep-before-teardown.jinja2 new file mode 100644 index 000000000..4109ec5a0 --- /dev/null +++ b/teuthology/templates/rocketchat-sleep-before-teardown.jinja2 @@ -0,0 +1,6 @@ +The teuthology job [{{ job_id }}]({{ job_info }}) for suite *{{ suite_name }}* owned by '{{ owner }}' has fallen asleep with status '{{ status }}' at {{ sleep_date }} for __{{ sleep_time }}__ ({{ sleep_time_sec }} seconds). +Open [teuthology.log]({{ job_logs }}teuthology.log) for details, or go to [all logs]({{ job_logs}}). + +Job Description: {{ job_desc }} +Run Name: {{ run_name }} +Task Stack: {{ task_stack }} diff --git a/teuthology/test/__init__.py b/teuthology/test/__init__.py new file mode 100644 index 000000000..1eb9e8108 --- /dev/null +++ b/teuthology/test/__init__.py @@ -0,0 +1,9 @@ +import os +import pytest +import sys + +skipif_teuthology_process = pytest.mark.skipif( + os.path.basename(sys.argv[0]) == "teuthology", + reason="Skipped because this test cannot pass when run in a teuthology " \ + "process (as opposed to py.test)" +) \ No newline at end of file diff --git a/teuthology/test/fake_archive.py b/teuthology/test/fake_archive.py new file mode 100644 index 000000000..76a944f46 --- /dev/null +++ b/teuthology/test/fake_archive.py @@ -0,0 +1,107 @@ +import os +import shutil +import yaml +import random + + +class FakeArchive(object): + def __init__(self, archive_base="./test_archive"): + self.archive_base = archive_base + + def get_random_metadata(self, run_name, job_id=None, hung=False): + """ + Generate a random info dict for a fake job. If 'hung' is not True, also + generate a summary dict. + + :param run_name: Run name e.g. 'test_foo' + :param job_id: Job ID e.g. '12345' + :param hung: Simulate a hung job e.g. 
don't return a summary.yaml + :return: A dict with keys 'job_id', 'info' and possibly + 'summary', with corresponding values + """ + rand = random.Random() + + description = 'description for job with id %s' % job_id + owner = 'job@owner' + duration = rand.randint(1, 36000) + pid = rand.randint(1000, 99999) + job_id = rand.randint(1, 99999) + + info = { + 'description': description, + 'job_id': job_id, + 'run_name': run_name, + 'owner': owner, + 'pid': pid, + } + + metadata = { + 'info': info, + 'job_id': job_id, + } + + if not hung: + success = True if rand.randint(0, 1) != 1 else False + + summary = { + 'description': description, + 'duration': duration, + 'owner': owner, + 'success': success, + } + + if not success: + summary['failure_reason'] = 'Failure reason!' + metadata['summary'] = summary + + return metadata + + def setup(self): + if os.path.exists(self.archive_base): + shutil.rmtree(self.archive_base) + os.mkdir(self.archive_base) + + def teardown(self): + shutil.rmtree(self.archive_base) + + def populate_archive(self, run_name, jobs): + run_archive_dir = os.path.join(self.archive_base, run_name) + os.mkdir(run_archive_dir) + for job in jobs: + archive_dir = os.path.join(run_archive_dir, str(job['job_id'])) + os.mkdir(archive_dir) + + with open(os.path.join(archive_dir, 'info.yaml'), 'w') as yfile: + yaml.safe_dump(job['info'], yfile) + + if 'summary' in job: + summary_path = os.path.join(archive_dir, 'summary.yaml') + with open(summary_path, 'w') as yfile: + yaml.safe_dump(job['summary'], yfile) + + def create_fake_run(self, run_name, job_count, yaml_path, num_hung=0): + """ + Creates a fake run using run_name. 
Uses the YAML specified for each + job's config.yaml + + Returns a list of job_ids + """ + assert os.path.exists(yaml_path) + assert job_count > 0 + jobs = [] + made_hung = 0 + for i in range(job_count): + if made_hung < num_hung: + jobs.append(self.get_random_metadata(run_name, hung=True)) + made_hung += 1 + else: + jobs.append(self.get_random_metadata(run_name, hung=False)) + #job_config = yaml.safe_load(yaml_path) + self.populate_archive(run_name, jobs) + for job in jobs: + job_id = job['job_id'] + job_yaml_path = os.path.join(self.archive_base, run_name, + str(job_id), 'config.yaml') + shutil.copyfile(yaml_path, job_yaml_path) + return jobs + diff --git a/teuthology/test/fake_fs.py b/teuthology/test/fake_fs.py new file mode 100644 index 000000000..c5cb6e4f0 --- /dev/null +++ b/teuthology/test/fake_fs.py @@ -0,0 +1,90 @@ +from io import BytesIO +from contextlib import closing + + +try: + FileNotFoundError, NotADirectoryError +except NameError: + FileNotFoundError = NotADirectoryError = OSError + + +def make_fake_fstools(fake_filesystem): + """ + Build fake versions of os.listdir(), os.isfile(), etc. 
for use in + unit tests + + An example fake_filesystem value: + >>> fake_fs = {\ + 'a_directory': {\ + 'another_directory': {\ + 'empty_file': None,\ + 'another_empty_file': None,\ + },\ + 'random_file': None,\ + 'yet_another_directory': {\ + 'empty_directory': {},\ + },\ + 'file_with_contents': 'data',\ + },\ + } + >>> fake_listdir, fake_isfile, _, _ = \ + make_fake_fstools(fake_fs) + >>> fake_listdir('a_directory/yet_another_directory') + ['empty_directory'] + >>> fake_isfile('a_directory/yet_another_directory') + False + + :param fake_filesystem: A dict representing a filesystem + """ + assert isinstance(fake_filesystem, dict) + + def fake_listdir(path, fsdict=False): + if fsdict is False: + fsdict = fake_filesystem + + remainder = path.strip('/') + '/' + subdict = fsdict + while '/' in remainder: + next_dir, remainder = remainder.split('/', 1) + if next_dir not in subdict: + raise FileNotFoundError( + '[Errno 2] No such file or directory: %s' % next_dir) + subdict = subdict.get(next_dir) + if not isinstance(subdict, dict): + raise NotADirectoryError('[Errno 20] Not a directory: %s' % next_dir) + if subdict and not remainder: + return list(subdict) + return [] + + def fake_isfile(path, fsdict=False): + if fsdict is False: + fsdict = fake_filesystem + + components = path.strip('/').split('/') + subdict = fsdict + for component in components: + if component not in subdict: + raise FileNotFoundError('[Errno 2] No such file or directory: %s' % component) + subdict = subdict.get(component) + return subdict is None or isinstance(subdict, str) + + def fake_isdir(path, fsdict=False): + return not fake_isfile(path) + + def fake_exists(path, fsdict=False): + return fake_isfile(path, fsdict) or fake_isdir(path, fsdict) + + def fake_open(path, mode=None, buffering=None): + components = path.strip('/').split('/') + subdict = fake_filesystem + for component in components: + if component not in subdict: + raise FileNotFoundError('[Errno 2] No such file or directory: %s' % 
component) + subdict = subdict.get(component) + if isinstance(subdict, dict): + raise IOError('[Errno 21] Is a directory: %s' % path) + elif subdict is None: + return closing(BytesIO(b'')) + return closing(BytesIO(subdict.encode())) + + return fake_exists, fake_listdir, fake_isfile, fake_isdir, fake_open diff --git a/teuthology/test/integration/__init__.py b/teuthology/test/integration/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/teuthology/test/integration/test_suite.py b/teuthology/test/integration/test_suite.py new file mode 100644 index 000000000..a618c4cf7 --- /dev/null +++ b/teuthology/test/integration/test_suite.py @@ -0,0 +1,86 @@ +import os +import requests +from pytest import raises, skip + +from teuthology.config import config +from teuthology import suite + + +class TestSuiteOnline(object): + def setup_method(self): + if 'TEST_ONLINE' not in os.environ: + skip("To run these sets, set the environment variable TEST_ONLINE") + + def test_ceph_hash_simple(self): + resp = requests.get( + 'https://api.github.com/repos/ceph/ceph/git/refs/heads/main') + ref_hash = resp.json()['object']['sha'] + assert suite.get_hash('ceph') == ref_hash + + def test_kernel_hash_saya(self): + # We don't currently have these packages. 
+ assert suite.get_hash('kernel', 'main', 'default', 'saya') is None + + def test_all_main_branches(self): + # Don't attempt to send email + config.results_email = None + job_config = suite.create_initial_config('suite', 'main', + 'main', 'main', 'testing', + 'default', 'centos', 'plana') + assert ((job_config.branch, job_config.teuthology_branch, + job_config.suite_branch) == ('main', 'main', 'main')) + + def test_config_bogus_kernel_branch(self): + # Don't attempt to send email + config.results_email = None + with raises(suite.ScheduleFailError): + suite.create_initial_config('s', None, 'main', 't', + 'bogus_kernel_branch', 'f', 'd', 'm') + + def test_config_bogus_flavor(self): + # Don't attempt to send email + config.results_email = None + with raises(suite.ScheduleFailError): + suite.create_initial_config('s', None, 'main', 't', 'k', + 'bogus_flavor', 'd', 'm') + + def test_config_bogus_ceph_branch(self): + # Don't attempt to send email + config.results_email = None + with raises(suite.ScheduleFailError): + suite.create_initial_config('s', None, 'bogus_ceph_branch', 't', + 'k', 'f', 'd', 'm') + + def test_config_bogus_suite_branch(self): + # Don't attempt to send email + config.results_email = None + with raises(suite.ScheduleFailError): + suite.create_initial_config('s', 'bogus_suite_branch', 'main', + 't', 'k', 'f', 'd', 'm') + + def test_config_bogus_teuthology_branch(self): + # Don't attempt to send email + config.results_email = None + with raises(suite.ScheduleFailError): + suite.create_initial_config('s', None, 'main', + 'bogus_teuth_branch', 'k', 'f', 'd', + 'm') + + def test_config_substitution(self): + # Don't attempt to send email + config.results_email = None + job_config = suite.create_initial_config('MY_SUITE', 'main', + 'main', 'main', 'testing', + 'default', 'centos', 'plana') + assert job_config['suite'] == 'MY_SUITE' + + def test_config_kernel_section(self): + # Don't attempt to send email + config.results_email = None + job_config = 
suite.create_initial_config('MY_SUITE', 'main', + 'main', 'main', 'testing', + 'default', 'centos', 'plana') + assert job_config['kernel']['kdb'] is True + + +# maybe use notario for the above? diff --git a/teuthology/test/task/__init__.py b/teuthology/test/task/__init__.py new file mode 100644 index 000000000..d000cff88 --- /dev/null +++ b/teuthology/test/task/__init__.py @@ -0,0 +1,205 @@ +from mock import patch, DEFAULT +from pytest import raises + +from teuthology.config import FakeNamespace +from teuthology.orchestra.cluster import Cluster +from teuthology.orchestra.remote import Remote +from teuthology.task import Task + + +class TestTask(object): + klass = Task + task_name = 'task' + + def setup_method(self): + self.ctx = FakeNamespace() + self.ctx.config = dict() + self.task_config = dict() + + def test_overrides(self): + self.ctx.config['overrides'] = dict() + self.ctx.config['overrides'][self.task_name] = dict( + key_1='overridden', + ) + self.task_config.update(dict( + key_1='default', + key_2='default', + )) + with patch.multiple( + self.klass, + begin=DEFAULT, + end=DEFAULT, + ): + with self.klass(self.ctx, self.task_config) as task: + assert task.config['key_1'] == 'overridden' + assert task.config['key_2'] == 'default' + + def test_hosts_no_filter(self): + self.ctx.cluster = Cluster() + self.ctx.cluster.add(Remote('user@remote1'), ['role1']) + self.ctx.cluster.add(Remote('user@remote2'), ['role2']) + with patch.multiple( + self.klass, + begin=DEFAULT, + end=DEFAULT, + ): + with self.klass(self.ctx, self.task_config) as task: + task_hosts = list(task.cluster.remotes) + assert len(task_hosts) == 2 + assert sorted(host.shortname for host in task_hosts) == \ + ['remote1', 'remote2'] + + def test_hosts_no_results(self): + self.ctx.cluster = Cluster() + self.ctx.cluster.add(Remote('user@remote1'), ['role1']) + self.task_config.update(dict( + hosts=['role2'], + )) + with patch.multiple( + self.klass, + begin=DEFAULT, + end=DEFAULT, + ): + with 
raises(RuntimeError): + with self.klass(self.ctx, self.task_config): + pass + + def test_hosts_one_role(self): + self.ctx.cluster = Cluster() + self.ctx.cluster.add(Remote('user@remote1'), ['role1']) + self.ctx.cluster.add(Remote('user@remote2'), ['role2']) + self.task_config.update(dict( + hosts=['role1'], + )) + with patch.multiple( + self.klass, + begin=DEFAULT, + end=DEFAULT, + ): + with self.klass(self.ctx, self.task_config) as task: + task_hosts = list(task.cluster.remotes) + assert len(task_hosts) == 1 + assert task_hosts[0].shortname == 'remote1' + + def test_hosts_two_roles(self): + self.ctx.cluster = Cluster() + self.ctx.cluster.add(Remote('user@remote1'), ['role1']) + self.ctx.cluster.add(Remote('user@remote2'), ['role2']) + self.ctx.cluster.add(Remote('user@remote3'), ['role3']) + self.task_config.update(dict( + hosts=['role1', 'role3'], + )) + with patch.multiple( + self.klass, + begin=DEFAULT, + end=DEFAULT, + ): + with self.klass(self.ctx, self.task_config) as task: + task_hosts = list(task.cluster.remotes) + assert len(task_hosts) == 2 + hostnames = [host.shortname for host in task_hosts] + assert sorted(hostnames) == ['remote1', 'remote3'] + + def test_hosts_two_hostnames(self): + self.ctx.cluster = Cluster() + self.ctx.cluster.add(Remote('user@remote1.example.com'), ['role1']) + self.ctx.cluster.add(Remote('user@remote2.example.com'), ['role2']) + self.ctx.cluster.add(Remote('user@remote3.example.com'), ['role3']) + self.task_config.update(dict( + hosts=['remote1', 'remote2.example.com'], + )) + with patch.multiple( + self.klass, + begin=DEFAULT, + end=DEFAULT, + ): + with self.klass(self.ctx, self.task_config) as task: + task_hosts = list(task.cluster.remotes) + assert len(task_hosts) == 2 + hostnames = [host.hostname for host in task_hosts] + assert sorted(hostnames) == ['remote1.example.com', + 'remote2.example.com'] + + def test_hosts_one_role_one_hostname(self): + self.ctx.cluster = Cluster() + 
self.ctx.cluster.add(Remote('user@remote1.example.com'), ['role1']) + self.ctx.cluster.add(Remote('user@remote2.example.com'), ['role2']) + self.ctx.cluster.add(Remote('user@remote3.example.com'), ['role3']) + self.task_config.update(dict( + hosts=['role1', 'remote2.example.com'], + )) + with patch.multiple( + self.klass, + begin=DEFAULT, + end=DEFAULT, + ): + with self.klass(self.ctx, self.task_config) as task: + task_hosts = list(task.cluster.remotes) + assert len(task_hosts) == 2 + hostnames = [host.hostname for host in task_hosts] + assert sorted(hostnames) == ['remote1.example.com', + 'remote2.example.com'] + + def test_setup_called(self): + with patch.multiple( + self.klass, + setup=DEFAULT, + begin=DEFAULT, + end=DEFAULT, + teardown=DEFAULT, + ): + with self.klass(self.ctx, self.task_config) as task: + task.setup.assert_called_once_with() + + def test_begin_called(self): + with patch.multiple( + self.klass, + setup=DEFAULT, + begin=DEFAULT, + end=DEFAULT, + teardown=DEFAULT, + ): + with self.klass(self.ctx, self.task_config) as task: + task.begin.assert_called_once_with() + + def test_end_called(self): + self.task_config.update(dict()) + with patch.multiple( + self.klass, + begin=DEFAULT, + end=DEFAULT, + ): + with self.klass(self.ctx, self.task_config) as task: + pass + task.end.assert_called_once_with() + + def test_teardown_called(self): + self.task_config.update(dict()) + with patch.multiple( + self.klass, + setup=DEFAULT, + begin=DEFAULT, + end=DEFAULT, + teardown=DEFAULT, + ): + with self.klass(self.ctx, self.task_config) as task: + pass + task.teardown.assert_called_once_with() + + def test_skip_teardown(self): + self.task_config.update(dict( + skip_teardown=True, + )) + + def fake_teardown(self): + assert False + + with patch.multiple( + self.klass, + setup=DEFAULT, + begin=DEFAULT, + end=DEFAULT, + teardown=fake_teardown, + ): + with self.klass(self.ctx, self.task_config): + pass diff --git a/teuthology/test/task/test_ansible.py 
b/teuthology/test/task/test_ansible.py new file mode 100644 index 000000000..1b7afc01d --- /dev/null +++ b/teuthology/test/task/test_ansible.py @@ -0,0 +1,659 @@ +import json +import os +import yaml + +from unittest.mock import patch, DEFAULT, Mock, mock_open +from pytest import raises, mark +from teuthology.util.compat import PY3 +if PY3: + from io import StringIO as StringIO +else: + from io import BytesIO as StringIO + +from teuthology.config import config, FakeNamespace +from teuthology.exceptions import CommandFailedError +from teuthology.orchestra.cluster import Cluster +from teuthology.orchestra.remote import Remote +from teuthology.task import ansible +from teuthology.task.ansible import Ansible, CephLab, FailureAnalyzer + +from teuthology.test.task import TestTask + + +class TestFailureAnalyzer: + klass = FailureAnalyzer + + @mark.parametrize( + 'line,result', + [ + [ + "W: --force-yes is deprecated, use one of the options starting with --allow instead.", + "", + ], + [ + "E: Unable to fetch some archives, maybe run apt-get update or try with --fix-missing?", + "", + ], + [ + "E: Failed to fetch http://security.ubuntu.com/ubuntu/pool/main/a/apache2/apache2-bin_2.4.41-4ubuntu3.14_amd64.deb Unable to connect to archive.ubuntu.com:http:", + "Unable to connect to archive.ubuntu.com:http:" + ], + [ + "E: Failed to fetch http://archive.ubuntu.com/ubuntu/pool/main/libb/libb-hooks-op-check-perl/libb-hooks-op-check-perl_0.22-1build2_amd64.deb Temporary failure resolving 'archive.ubuntu.com'", + "Temporary failure resolving 'archive.ubuntu.com'" + ], + [ + "Data could not be sent to remote host \"smithi068.front.sepia.ceph.com\".", + "Data could not be sent to remote host \"smithi068.front.sepia.ceph.com\"." + ], + [ + "Permissions 0644 for '/root/.ssh/id_rsa' are too open.", + "Permissions 0644 for '/root/.ssh/id_rsa' are too open." 
+ ], + ] + ) + def test_lines(self, line, result): + obj = self.klass() + assert obj.analyze_line(line) == result + + +class TestAnsibleTask(TestTask): + klass = Ansible + task_name = 'ansible' + + def setup_method(self): + self.ctx = FakeNamespace() + self.ctx.cluster = Cluster() + self.ctx.cluster.add(Remote('user@remote1'), ['role1']) + self.ctx.cluster.add(Remote('user@remote2'), ['role2']) + self.ctx.config = dict() + self.ctx.summary = dict() + self.ctx.archive = "" + self.task_config = dict(playbook=[]) + self.start_patchers() + + def start_patchers(self): + self.patchers = dict() + self.mocks = dict() + self.patchers['mkdtemp'] = patch( + 'teuthology.task.ansible.mkdtemp', return_value='/tmp/' + ) + m_NTF = Mock() + m_file = Mock() + m_file.name = 'file_name' + m_NTF.return_value = m_file + self.patchers['NTF'] = patch( + 'teuthology.task.ansible.NamedTemporaryFile', + m_NTF, + ) + self.patchers['file'] = patch( + 'teuthology.task.ansible.open', create=True) + self.patchers['os_mkdir'] = patch( + 'teuthology.task.ansible.os.mkdir', + ) + self.patchers['os_remove'] = patch( + 'teuthology.task.ansible.os.remove', + ) + self.patchers['shutil_rmtree'] = patch( + 'teuthology.task.ansible.shutil.rmtree', + ) + for name in self.patchers.keys(): + self.start_patcher(name) + + def start_patcher(self, name): + if name not in self.mocks.keys(): + self.mocks[name] = self.patchers[name].start() + + def teardown_method(self, method): + self.stop_patchers() + + def stop_patchers(self): + for name in list(self.mocks): + self.stop_patcher(name) + + def stop_patcher(self, name): + self.patchers[name].stop() + del self.mocks[name] + + def test_setup(self): + self.task_config.update(dict( + playbook=[] + )) + + def fake_get_playbook(self): + self.playbook_file = 'fake' + + with patch.multiple( + self.klass, + find_repo=DEFAULT, + get_playbook=fake_get_playbook, + get_inventory=DEFAULT, + generate_inventory=DEFAULT, + generate_playbook=Mock(side_effect=Exception), + ): + task = 
self.klass(self.ctx, self.task_config) + task.setup() + + def test_setup_generate_playbook(self): + self.task_config.update(dict( + playbook=[] + )) + with patch.multiple( + self.klass, + find_repo=DEFAULT, + get_playbook=DEFAULT, + get_inventory=DEFAULT, + generate_inventory=DEFAULT, + generate_playbook=DEFAULT, + ): + task = self.klass(self.ctx, self.task_config) + task.setup() + task.generate_playbook.assert_called_once_with() + + def test_find_repo_path(self): + self.task_config.update(dict( + repo='~/my/repo', + )) + task = self.klass(self.ctx, self.task_config) + task.find_repo() + assert task.repo_path == os.path.expanduser(self.task_config['repo']) + + @patch('teuthology.repo_utils.fetch_repo') + def test_find_repo_path_remote(self, m_fetch_repo): + self.task_config.update(dict( + repo='git://fake_host/repo.git', + )) + m_fetch_repo.return_value = '/tmp/repo' + task = self.klass(self.ctx, self.task_config) + task.find_repo() + assert task.repo_path == os.path.expanduser('/tmp/repo') + + @patch('teuthology.repo_utils.fetch_repo') + def test_find_repo_http(self, m_fetch_repo): + self.task_config.update(dict( + repo='http://example.com/my/repo', + )) + task = self.klass(self.ctx, self.task_config) + task.find_repo() + m_fetch_repo.assert_called_once_with(self.task_config['repo'], + 'main') + + @patch('teuthology.repo_utils.fetch_repo') + def test_find_repo_git(self, m_fetch_repo): + self.task_config.update(dict( + repo='git@example.com/my/repo', + )) + task = self.klass(self.ctx, self.task_config) + task.find_repo() + m_fetch_repo.assert_called_once_with(self.task_config['repo'], + 'main') + + def test_playbook_none(self): + del self.task_config['playbook'] + task = self.klass(self.ctx, self.task_config) + with raises(KeyError): + task.get_playbook() + + def test_playbook_wrong_type(self): + self.task_config.update(dict( + playbook=dict(), + )) + task = self.klass(self.ctx, self.task_config) + with raises(TypeError): + task.get_playbook() + + def 
test_playbook_list(self): + playbook = [ + dict( + roles=['role1'], + ), + ] + self.task_config.update(dict( + playbook=playbook, + )) + task = self.klass(self.ctx, self.task_config) + task.get_playbook() + assert task.playbook == playbook + + @patch.object(ansible.requests, 'get') + def test_playbook_http(self, m_get): + m_get.return_value = Mock() + m_get.return_value.text = 'fake playbook text' + playbook = "http://example.com/my_playbook.yml" + self.task_config.update(dict( + playbook=playbook, + )) + task = self.klass(self.ctx, self.task_config) + task.get_playbook() + m_get.assert_called_once_with(playbook) + + def test_playbook_file(self): + fake_playbook = [dict(fake_playbook=True)] + fake_playbook_obj = StringIO(yaml.safe_dump(fake_playbook)) + self.task_config.update(dict( + playbook='~/fake/playbook', + )) + task = self.klass(self.ctx, self.task_config) + self.mocks['file'].return_value = fake_playbook_obj + task.get_playbook() + assert task.playbook == fake_playbook + + def test_playbook_file_missing(self): + self.task_config.update(dict( + playbook='~/fake/playbook', + )) + task = self.klass(self.ctx, self.task_config) + self.mocks['file'].side_effect = IOError + with raises(IOError): + task.get_playbook() + + def test_inventory_none(self): + self.task_config.update(dict( + playbook=[] + )) + task = self.klass(self.ctx, self.task_config) + with patch.object(ansible.os.path, 'exists') as m_exists: + m_exists.return_value = False + task.get_inventory() + assert task.inventory is None + + def test_inventory_path(self): + inventory = '/my/inventory' + self.task_config.update(dict( + playbook=[], + inventory=inventory, + )) + task = self.klass(self.ctx, self.task_config) + task.get_inventory() + assert task.inventory == inventory + assert task.generated_inventory is False + + def test_inventory_etc(self): + self.task_config.update(dict( + playbook=[] + )) + task = self.klass(self.ctx, self.task_config) + with patch.object(ansible.os.path, 'exists') as 
m_exists: + m_exists.return_value = True + task.get_inventory() + assert task.inventory == '/etc/ansible/hosts' + assert task.generated_inventory is False + + @mark.parametrize( + 'group_vars', + [ + dict(), + dict(all=dict(var0=0, var1=1)), + dict(foo=dict(var0=0), bar=dict(var0=1)), + ] + ) + def test_generate_inventory(self, group_vars): + self.task_config.update(dict( + playbook=[] + )) + if group_vars: + self.task_config.update(dict(group_vars=group_vars)) + task = self.klass(self.ctx, self.task_config) + hosts_file_path = '/my/hosts/inventory' + hosts_file_obj = StringIO() + hosts_file_obj.name = hosts_file_path + inventory_dir = os.path.dirname(hosts_file_path) + gv_dir = os.path.join(inventory_dir, 'group_vars') + self.mocks['mkdtemp'].return_value = inventory_dir + m_file = self.mocks['file'] + fake_files = [hosts_file_obj] + # Create StringIO object for each group_vars file + if group_vars: + fake_files += [StringIO() for i in sorted(group_vars)] + m_file.side_effect = fake_files + task.generate_inventory() + file_calls = m_file.call_args_list + # Verify the inventory file was created + assert file_calls[0][0][0] == hosts_file_path + # Verify each group_vars file was created + for gv_name, call_obj in zip(sorted(group_vars), file_calls[1:]): + gv_path = call_obj[0][0] + assert gv_path == os.path.join(gv_dir, '%s.yml' % gv_name) + # Verify the group_vars dir was created + if group_vars: + mkdir_call = self.mocks['os_mkdir'].call_args_list + assert mkdir_call[0][0][0] == gv_dir + assert task.generated_inventory is True + assert task.inventory == inventory_dir + # Verify the content of the inventory *file* + hosts_file_obj.seek(0) + assert hosts_file_obj.readlines() == [ + 'remote1\n', + 'remote2\n', + ] + # Verify the contents of each group_vars file + gv_names = sorted(group_vars) + for i in range(len(gv_names)): + gv_name = gv_names[i] + in_val = group_vars[gv_name] + gv_stringio = fake_files[1 + i] + gv_stringio.seek(0) + out_val = 
yaml.safe_load(gv_stringio) + assert in_val == out_val + + def test_generate_playbook(self): + playbook = [ + dict( + roles=['role1', 'role2'], + ), + ] + self.task_config.update(dict( + playbook=playbook + )) + task = self.klass(self.ctx, self.task_config) + playbook_file_path = '/my/playbook/file' + playbook_file_obj = StringIO() + playbook_file_obj.name = playbook_file_path + with patch.object(ansible, 'NamedTemporaryFile') as m_NTF: + m_NTF.return_value = playbook_file_obj + task.find_repo() + task.get_playbook() + task.generate_playbook() + m_NTF.assert_called_once_with( + prefix="teuth_ansible_playbook_", + dir=task.repo_path, + delete=False, + ) + assert task.generated_playbook is True + assert task.playbook_file == playbook_file_obj + playbook_file_obj.seek(0) + playbook_result = yaml.safe_load(playbook_file_obj) + assert playbook_result == playbook + + def test_execute_playbook(self): + playbook = '/my/playbook' + self.task_config.update(dict( + playbook=playbook + )) + fake_playbook = [dict(fake_playbook=True)] + fake_playbook_obj = StringIO(yaml.safe_dump(fake_playbook)) + fake_playbook_obj.name = playbook + self.mocks['mkdtemp'].return_value = '/inventory/dir' + + task = self.klass(self.ctx, self.task_config) + self.mocks['file'].return_value = fake_playbook_obj + task.setup() + args = task._build_args() + logger = StringIO() + with patch.object(ansible.pexpect, 'run') as m_run: + m_run.return_value = ('', 0) + with patch.object(Remote, 'reconnect') as m_reconnect: + m_reconnect.return_value = True + task.execute_playbook(_logfile=logger) + m_run.assert_called_once_with( + ' '.join(args), + cwd=task.repo_path, + logfile=logger, + withexitstatus=True, + timeout=None, + ) + + def test_execute_playbook_fail(self): + self.task_config.update(dict( + playbook=[], + )) + self.mocks['mkdtemp'].return_value = '/inventory/dir' + task = self.klass(self.ctx, self.task_config) + task.setup() + with patch.object(ansible.pexpect, 'run') as m_run: + with 
patch('teuthology.task.ansible.open', mock_open()): + m_run.return_value = ('', 1) + with raises(CommandFailedError): + task.execute_playbook() + assert task.ctx.summary.get('status') is None + + def test_build_args_no_tags(self): + self.task_config.update(dict( + playbook=[], + )) + task = self.klass(self.ctx, self.task_config) + task.setup() + args = task._build_args() + assert '--tags' not in args + + def test_build_args_tags(self): + self.task_config.update(dict( + playbook=[], + tags="user,pubkeys" + )) + task = self.klass(self.ctx, self.task_config) + task.setup() + args = task._build_args() + assert args.count('--tags') == 1 + assert args[args.index('--tags') + 1] == 'user,pubkeys' + + def test_build_args_skip_tags(self): + self.task_config.update(dict( + playbook=[], + skip_tags="user,pubkeys" + )) + task = self.klass(self.ctx, self.task_config) + task.setup() + args = task._build_args() + assert args.count('--skip-tags') == 1 + assert args[args.index('--skip-tags') + 1] == 'user,pubkeys' + + def test_build_args_no_vars(self): + self.task_config.update(dict( + playbook=[], + )) + task = self.klass(self.ctx, self.task_config) + task.setup() + args = task._build_args() + assert args.count('--extra-vars') == 1 + vars_str = args[args.index('--extra-vars') + 1].strip("'") + extra_vars = json.loads(vars_str) + assert list(extra_vars) == ['ansible_ssh_user'] + + def test_build_args_vars(self): + extra_vars = dict( + string1='value1', + list1=['item1'], + dict1=dict(key='value'), + ) + + self.task_config.update(dict( + playbook=[], + vars=extra_vars, + )) + task = self.klass(self.ctx, self.task_config) + task.setup() + args = task._build_args() + assert args.count('--extra-vars') == 1 + vars_str = args[args.index('--extra-vars') + 1].strip("'") + got_extra_vars = json.loads(vars_str) + assert 'ansible_ssh_user' in got_extra_vars + assert got_extra_vars['string1'] == extra_vars['string1'] + assert got_extra_vars['list1'] == extra_vars['list1'] + assert 
got_extra_vars['dict1'] == extra_vars['dict1'] + + def test_teardown_inventory(self): + self.task_config.update(dict( + playbook=[], + )) + task = self.klass(self.ctx, self.task_config) + task.generated_inventory = True + task.inventory = 'fake' + with patch.object(ansible.shutil, 'rmtree') as m_rmtree: + task.teardown() + m_rmtree.assert_called_once_with('fake') + + def test_teardown_playbook(self): + self.task_config.update(dict( + playbook=[], + )) + task = self.klass(self.ctx, self.task_config) + task.generated_playbook = True + task.playbook_file = Mock() + task.playbook_file.name = 'fake' + with patch.object(ansible.os, 'remove') as m_remove: + task.teardown() + m_remove.assert_called_once_with('fake') + + def test_teardown_cleanup_with_vars(self): + self.task_config.update(dict( + playbook=[], + cleanup=True, + vars=dict(yum_repos="testing"), + )) + task = self.klass(self.ctx, self.task_config) + task.inventory = "fake" + task.generated_playbook = True + task.playbook_file = Mock() + task.playbook_file.name = 'fake' + with patch.object(self.klass, 'execute_playbook') as m_execute: + with patch.object(ansible.os, 'remove'): + task.teardown() + task._build_args() + assert m_execute.called + assert 'cleanup' in task.config['vars'] + assert 'yum_repos' in task.config['vars'] + + def test_teardown_cleanup_with_no_vars(self): + self.task_config.update(dict( + playbook=[], + cleanup=True, + )) + task = self.klass(self.ctx, self.task_config) + task.inventory = "fake" + task.generated_playbook = True + task.playbook_file = Mock() + task.playbook_file.name = 'fake' + with patch.object(self.klass, 'execute_playbook') as m_execute: + with patch.object(ansible.os, 'remove'): + task.teardown() + task._build_args() + assert m_execute.called + assert 'cleanup' in task.config['vars'] + + def test_no_remotes(self): + self.task_config.update(dict( + playbook=[], + )) + self.ctx.cluster.remotes = dict() + task = self.klass(self.ctx, self.task_config) + with 
patch.object(ansible.pexpect, 'run') as m_run: + task.setup() + task.begin() + assert not m_run.called + + +class TestCephLabTask(TestAnsibleTask): + klass = CephLab + task_name = 'ansible.cephlab' + + def setup_method(self): + super(TestCephLabTask, self).setup_method() + self.task_config = dict() + + def start_patchers(self): + super(TestCephLabTask, self).start_patchers() + self.patchers['fetch_repo'] = patch( + 'teuthology.repo_utils.fetch_repo', + ) + self.patchers['fetch_repo'].return_value = 'PATH' + + def fake_get_playbook(self): + self.playbook_file = Mock() + self.playbook_file.name = 'cephlab.yml' + + self.patchers['get_playbook'] = patch( + 'teuthology.task.ansible.CephLab.get_playbook', + new=fake_get_playbook, + ) + for name in self.patchers.keys(): + self.start_patcher(name) + + @patch('teuthology.repo_utils.fetch_repo') + def test_find_repo_http(self, m_fetch_repo): + repo = os.path.join(config.ceph_git_base_url, + 'ceph-cm-ansible.git') + task = self.klass(self.ctx, dict()) + task.find_repo() + m_fetch_repo.assert_called_once_with(repo, 'main') + + def test_playbook_file(self): + fake_playbook = [dict(fake_playbook=True)] + fake_playbook_obj = StringIO(yaml.safe_dump(fake_playbook)) + playbook = 'cephlab.yml' + fake_playbook_obj.name = playbook + task = self.klass(self.ctx, dict()) + task.repo_path = '/tmp/fake/repo' + self.mocks['file'].return_value = fake_playbook_obj + task.get_playbook() + assert task.playbook_file.name == playbook + + def test_generate_inventory(self): + self.task_config.update(dict( + playbook=[] + )) + task = self.klass(self.ctx, self.task_config) + hosts_file_path = '/my/hosts/file' + hosts_file_obj = StringIO() + hosts_file_obj.name = hosts_file_path + self.mocks['mkdtemp'].return_value = os.path.dirname(hosts_file_path) + self.mocks['file'].return_value = hosts_file_obj + task.generate_inventory() + assert task.generated_inventory is True + assert task.inventory == os.path.dirname(hosts_file_path) + hosts_file_obj.seek(0) 
+ assert hosts_file_obj.readlines() == [ + '[testnodes]\n', + 'remote1\n', + 'remote2\n', + ] + + def test_fail_status_dead(self): + self.task_config.update(dict( + playbook=[], + )) + self.mocks['mkdtemp'].return_value = '/inventory/dir' + task = self.klass(self.ctx, self.task_config) + task.ctx.summary = dict() + task.setup() + with patch.object(ansible.pexpect, 'run') as m_run: + with patch('teuthology.task.ansible.open', mock_open()): + m_run.return_value = ('', 1) + with raises(CommandFailedError): + task.execute_playbook() + assert task.ctx.summary.get('status') == 'dead' + + def test_execute_playbook_fail(self): + self.mocks['mkdtemp'].return_value = '/inventory/dir' + task = self.klass(self.ctx, self.task_config) + task.setup() + with patch.object(ansible.pexpect, 'run') as m_run: + with patch('teuthology.task.ansible.open', mock_open()): + m_run.return_value = ('', 1) + with raises(CommandFailedError): + task.execute_playbook() + assert task.ctx.summary.get('status') == 'dead' + + @mark.skip("Unsupported") + def test_generate_playbook(self): + pass + + @mark.skip("Unsupported") + def test_playbook_http(self): + pass + + @mark.skip("Unsupported") + def test_playbook_none(self): + pass + + @mark.skip("Unsupported") + def test_playbook_wrong_type(self): + pass + + @mark.skip("Unsupported") + def test_playbook_list(self): + pass + + @mark.skip("Test needs to be reimplemented for this class") + def test_playbook_file_missing(self): + pass diff --git a/teuthology/test/task/test_ceph_ansible.py b/teuthology/test/task/test_ceph_ansible.py new file mode 100644 index 000000000..29359c102 --- /dev/null +++ b/teuthology/test/task/test_ceph_ansible.py @@ -0,0 +1,177 @@ +from mock import patch, MagicMock +from pytest import skip +from teuthology.util.compat import PY3 +if PY3: + from io import StringIO as StringIO +else: + from io import BytesIO as StringIO + +from teuthology.config import FakeNamespace +from teuthology.orchestra.cluster import Cluster +from 
teuthology.orchestra.remote import Remote +from teuthology.task import ceph_ansible +from teuthology.task.ceph_ansible import CephAnsible + +from teuthology.test.task import TestTask + +SKIP_IRRELEVANT = "Not relevant to this subclass" + + +class TestCephAnsibleTask(TestTask): + klass = CephAnsible + task_name = 'ceph_ansible' + + def setup_method(self): + self.ctx = FakeNamespace() + self.ctx.cluster = Cluster() + self.ctx.cluster.add(Remote('user@remote1'), ['mon.0']) + self.ctx.cluster.add(Remote('user@remote2'), ['mds.0']) + self.ctx.cluster.add(Remote('user@remote3'), ['osd.0']) + self.ctx.summary = dict() + self.ctx.config = dict() + self.ctx.archive = '../' + self.task_config = dict() + self.start_patchers() + + def start_patchers(self): + m_fetch_repo = MagicMock() + m_fetch_repo.return_value = 'PATH' + + def fake_get_scratch_devices(remote): + return ['/dev/%s' % remote.shortname] + + self.patcher_get_scratch_devices = patch( + 'teuthology.task.ceph_ansible.get_scratch_devices', + fake_get_scratch_devices, + ) + self.patcher_get_scratch_devices.start() + + self.patcher_teardown = patch( + 'teuthology.task.ceph_ansible.CephAnsible.teardown', + ) + self.patcher_teardown.start() + + def fake_set_iface_and_cidr(self): + self._interface = 'eth0' + self._cidr = '172.21.0.0/20' + + self.patcher_remote = patch.multiple( + Remote, + _set_iface_and_cidr=fake_set_iface_and_cidr, + ) + self.patcher_remote.start() + + def stop_patchers(self): + self.patcher_get_scratch_devices.stop() + self.patcher_remote.stop() + self.patcher_teardown.stop() + + def test_playbook_none(self): + skip(SKIP_IRRELEVANT) + + def test_inventory_none(self): + skip(SKIP_IRRELEVANT) + + def test_inventory_path(self): + skip(SKIP_IRRELEVANT) + + def test_inventory_etc(self): + skip(SKIP_IRRELEVANT) + + def test_generate_hosts_file(self): + self.task_config.update(dict( + playbook=[], + vars=dict( + osd_auto_discovery=True, + monitor_interface='eth0', + radosgw_interface='eth0', + 
public_network='172.21.0.0/20', + ), + )) + task = self.klass(self.ctx, self.task_config) + hosts_file_path = '/my/hosts/file' + hosts_file_obj = StringIO() + hosts_file_obj.name = hosts_file_path + with patch.object(ceph_ansible, 'NamedTemporaryFile') as m_NTF: + m_NTF.return_value = hosts_file_obj + task.generate_hosts_file() + m_NTF.assert_called_once_with(prefix="teuth_ansible_hosts_", + mode='w+', + delete=False) + assert task.generated_inventory is True + assert task.inventory == hosts_file_path + hosts_file_obj.seek(0) + assert hosts_file_obj.read() == '\n'.join([ + '[mdss]', + 'remote2', + '', + '[mons]', + 'remote1', + '', + '[osds]', + 'remote3', + ]) + + def test_generate_hosts_file_with_devices(self): + self.task_config.update(dict( + playbook=[], + vars=dict( + monitor_interface='eth0', + radosgw_interface='eth0', + public_network='172.21.0.0/20', + ), + )) + task = self.klass(self.ctx, self.task_config) + hosts_file_path = '/my/hosts/file' + hosts_file_obj = StringIO() + hosts_file_obj.name = hosts_file_path + with patch.object(ceph_ansible, 'NamedTemporaryFile') as m_NTF: + m_NTF.return_value = hosts_file_obj + task.generate_hosts_file() + m_NTF.assert_called_once_with(prefix="teuth_ansible_hosts_", + mode='w+', + delete=False) + assert task.generated_inventory is True + assert task.inventory == hosts_file_path + hosts_file_obj.seek(0) + assert hosts_file_obj.read() == '\n'.join([ + '[mdss]', + 'remote2 devices=\'[]\'', + '', + '[mons]', + 'remote1 devices=\'[]\'', + '', + '[osds]', + 'remote3 devices=\'["/dev/remote3"]\'', + ]) + + def test_generate_hosts_file_with_network(self): + self.task_config.update(dict( + playbook=[], + vars=dict( + osd_auto_discovery=True, + ), + )) + task = self.klass(self.ctx, self.task_config) + hosts_file_path = '/my/hosts/file' + hosts_file_obj = StringIO() + hosts_file_obj.name = hosts_file_path + with patch.object(ceph_ansible, 'NamedTemporaryFile') as m_NTF: + m_NTF.return_value = hosts_file_obj + 
task.generate_hosts_file() + m_NTF.assert_called_once_with(prefix="teuth_ansible_hosts_", + mode='w+', + delete=False) + assert task.generated_inventory is True + assert task.inventory == hosts_file_path + hosts_file_obj.seek(0) + assert hosts_file_obj.read() == '\n'.join([ + '[mdss]', + "remote2 monitor_interface='eth0' public_network='172.21.0.0/20' radosgw_interface='eth0'", + '', + '[mons]', + "remote1 monitor_interface='eth0' public_network='172.21.0.0/20' radosgw_interface='eth0'", + '', + '[osds]', + "remote3 monitor_interface='eth0' public_network='172.21.0.0/20' radosgw_interface='eth0'", + ]) diff --git a/teuthology/test/task/test_console_log.py b/teuthology/test/task/test_console_log.py new file mode 100644 index 000000000..e2ad6981e --- /dev/null +++ b/teuthology/test/task/test_console_log.py @@ -0,0 +1,92 @@ +import os + +from mock import patch + +from teuthology.config import FakeNamespace +from teuthology.config import config as teuth_config +from teuthology.orchestra.cluster import Cluster +from teuthology.orchestra.remote import Remote +from teuthology.task.console_log import ConsoleLog + +from teuthology.test.task import TestTask + + +class TestConsoleLog(TestTask): + klass = ConsoleLog + task_name = 'console_log' + + def setup_method(self): + teuth_config.ipmi_domain = 'ipmi.domain' + teuth_config.ipmi_user = 'ipmi_user' + teuth_config.ipmi_password = 'ipmi_pass' + self.ctx = FakeNamespace() + self.ctx.cluster = Cluster() + self.ctx.cluster.add(Remote('user@remote1'), ['role1']) + self.ctx.cluster.add(Remote('user@remote2'), ['role2']) + self.ctx.config = dict() + self.ctx.archive = '/fake/path' + self.task_config = dict() + self.start_patchers() + + def start_patchers(self): + self.patchers = dict() + self.patchers['makedirs'] = patch( + 'teuthology.task.console_log.os.makedirs', + ) + self.patchers['is_vm'] = patch( + 'teuthology.lock.query.is_vm', + ) + self.patchers['is_vm'].return_value = False + self.patchers['get_status'] = patch( + 
'teuthology.lock.query.get_status', + ) + self.mocks = dict() + for name, patcher in self.patchers.items(): + self.mocks[name] = patcher.start() + self.mocks['is_vm'].return_value = False + + def teardown_method(self): + for patcher in self.patchers.values(): + patcher.stop() + + def test_enabled(self): + task = self.klass(self.ctx, self.task_config) + assert task.enabled is True + + def test_disabled_noarchive(self): + self.ctx.archive = None + task = self.klass(self.ctx, self.task_config) + assert task.enabled is False + + def test_has_ipmi_credentials(self): + for remote in self.ctx.cluster.remotes.keys(): + remote.console.has_ipmi_credentials = False + remote.console.has_conserver = False + task = self.klass(self.ctx, self.task_config) + assert len(task.cluster.remotes.keys()) == 0 + + def test_remotes(self): + with self.klass(self.ctx, self.task_config) as task: + assert len(task.cluster.remotes) == len(self.ctx.cluster.remotes) + + @patch('teuthology.orchestra.console.PhysicalConsole') + def test_begin(self, m_pconsole): + with self.klass(self.ctx, self.task_config) as task: + assert len(task.processes) == len(self.ctx.cluster.remotes) + expected_log_paths = [] + for remote in task.cluster.remotes.keys(): + expected_log_paths.append( + os.path.join(self.ctx.archive, 'console_logs', '%s.log' % remote.shortname) + ) + assert len(m_pconsole().spawn_sol_log.call_args_list) == len(task.cluster.remotes) + got_log_paths = [c[0][0] for c in m_pconsole().spawn_sol_log.call_args_list] + assert got_log_paths == expected_log_paths + + @patch('teuthology.orchestra.console.PhysicalConsole') + def test_end(self, m_pconsole): + m_proc = m_pconsole().spawn_sol_log.return_value + m_proc.poll.return_value = None + with self.klass(self.ctx, self.task_config): + pass + assert len(m_proc.terminate.call_args_list) == len(self.ctx.cluster.remotes) + assert len(m_proc.kill.call_args_list) == len(self.ctx.cluster.remotes) diff --git a/teuthology/test/task/test_install.py 
b/teuthology/test/task/test_install.py new file mode 100644 index 000000000..3c5be9048 --- /dev/null +++ b/teuthology/test/task/test_install.py @@ -0,0 +1,337 @@ +import os +import pytest +import yaml + +from mock import patch, Mock + +from teuthology.task import install + + +class TestInstall(object): + + def _get_default_package_list(self, project='ceph', debug=False): + path = os.path.join( + os.path.dirname(__file__), + '..', '..', 'task', 'install', 'packages.yaml', + ) + pkgs = yaml.safe_load(open(path))[project] + if not debug: + pkgs['deb'] = [p for p in pkgs['deb'] + if not p.endswith('-dbg')] + pkgs['rpm'] = [p for p in pkgs['rpm'] + if not p.endswith('-debuginfo')] + return pkgs + + def test_get_package_list_debug(self): + default_pkgs = self._get_default_package_list(debug=True) + default_pkgs['rpm'].sort() + default_pkgs['deb'].sort() + config = dict(debuginfo=True) + result = install.get_package_list(ctx=None, config=config) + result['rpm'].sort() + result['deb'].sort() + assert result == default_pkgs + + def test_get_package_list_no_debug(self): + default_pkgs = self._get_default_package_list(debug=False) + default_pkgs['rpm'].sort() + default_pkgs['deb'].sort() + config = dict(debuginfo=False) + result = install.get_package_list(ctx=None, config=config) + result['rpm'].sort() + result['deb'].sort() + assert result == default_pkgs + + def test_get_package_list_custom_rpm(self): + default_pkgs = self._get_default_package_list(debug=False) + default_pkgs['rpm'].sort() + default_pkgs['deb'].sort() + rpms = ['rpm1', 'rpm2', 'rpm2-debuginfo'] + config = dict(packages=dict(rpm=rpms)) + result = install.get_package_list(ctx=None, config=config) + result['rpm'].sort() + result['deb'].sort() + assert result['rpm'] == ['rpm1', 'rpm2'] + assert result['deb'] == default_pkgs['deb'] + + @patch("teuthology.task.install._get_builder_project") + @patch("teuthology.task.install.packaging.get_package_version") + def test_get_upgrade_version(self, 
m_get_package_version, + m_gitbuilder_project): + gb = Mock() + gb.version = "11.0.0" + gb.project = "ceph" + m_gitbuilder_project.return_value = gb + m_get_package_version.return_value = "11.0.0" + install.get_upgrade_version(Mock(), Mock(), Mock()) + + @patch("teuthology.task.install._get_builder_project") + @patch("teuthology.task.install.packaging.get_package_version") + def test_verify_ceph_version_success(self, m_get_package_version, + m_gitbuilder_project): + gb = Mock() + gb.version = "0.89.0" + gb.project = "ceph" + m_gitbuilder_project.return_value = gb + m_get_package_version.return_value = "0.89.0" + config = dict() + install.verify_package_version(Mock(), config, Mock()) + + @patch("teuthology.task.install._get_builder_project") + @patch("teuthology.task.install.packaging.get_package_version") + def test_verify_ceph_version_failed(self, m_get_package_version, + m_gitbuilder_project): + gb = Mock() + gb.version = "0.89.0" + gb.project = "ceph" + m_gitbuilder_project.return_value = gb + m_get_package_version.return_value = "0.89.1" + config = dict() + with pytest.raises(RuntimeError): + install.verify_package_version(Mock(), config, Mock()) + + @patch("teuthology.task.install._get_builder_project") + @patch("teuthology.task.install.packaging.get_package_version") + def test_skip_when_using_ceph_deploy(self, m_get_package_version, + m_gitbuilder_project): + gb = Mock() + gb.version = "0.89.0" + gb.project = "ceph" + m_gitbuilder_project.return_value = gb + # ceph isn't installed because ceph-deploy would install it + m_get_package_version.return_value = None + config = dict() + config['extras'] = True + install.verify_package_version(Mock(), config, Mock()) + + def test_get_flavor_default(self): + config = dict() + assert install.get_flavor(config) == 'default' + + def test_get_flavor_simple(self): + config = dict( + flavor='notcmalloc' + ) + assert install.get_flavor(config) == 'notcmalloc' + + def test_get_flavor_valgrind(self): + config = dict( + 
valgrind=True + ) + assert install.get_flavor(config) == 'notcmalloc' + + def test_upgrade_is_downgrade(self): + assert_ok_vals = [ + ('9.0.0', '10.0.0'), + ('10.2.2-63-g8542898-1trusty', '10.2.2-64-gabcdef1-1trusty'), + ('11.0.0-918.g13c13c7', '11.0.0-2165.gabcdef1') + ] + for t in assert_ok_vals: + assert install._upgrade_is_downgrade(t[0], t[1]) == False + + @patch("teuthology.packaging.get_package_version") + @patch("teuthology.misc.get_system_type") + @patch("teuthology.task.install.verify_package_version") + @patch("teuthology.task.install.get_upgrade_version") + def test_upgrade_common(self, + m_get_upgrade_version, + m_verify_package_version, + m_get_system_type, + m_get_package_version): + expected_system_type = 'deb' + def make_remote(): + remote = Mock() + remote.arch = 'x86_64' + remote.os = Mock() + remote.os.name = 'ubuntu' + remote.os.version = '14.04' + remote.os.codename = 'trusty' + remote.system_type = expected_system_type + return remote + ctx = Mock() + class cluster: + remote1 = make_remote() + remote2 = make_remote() + remotes = { + remote1: ['client.0'], + remote2: ['mon.a','osd.0'], + } + def only(self, role): + result = Mock() + if role in ('client.0',): + result.remotes = { cluster.remote1: None } + if role in ('osd.0', 'mon.a'): + result.remotes = { cluster.remote2: None } + return result + ctx.cluster = cluster() + config = { + 'client.0': { + 'sha1': 'expectedsha1', + }, + } + ctx.config = { + 'roles': [ ['client.0'], ['mon.a','osd.0'] ], + 'tasks': [ + { + 'install.upgrade': config, + }, + ], + } + m_get_upgrade_version.return_value = "11.0.0" + m_get_package_version.return_value = "10.2.4" + m_get_system_type.return_value = "deb" + def upgrade(ctx, node, remote, pkgs, system_type): + assert system_type == expected_system_type + assert install.upgrade_common(ctx, config, upgrade) == 1 + expected_config = { + 'project': 'ceph', + 'sha1': 'expectedsha1', + } + m_verify_package_version.assert_called_with(ctx, + expected_config, + 
cluster.remote1) + def test_upgrade_remote_to_config(self): + expected_system_type = 'deb' + def make_remote(): + remote = Mock() + remote.arch = 'x86_64' + remote.os = Mock() + remote.os.name = 'ubuntu' + remote.os.version = '14.04' + remote.os.codename = 'trusty' + remote.system_type = expected_system_type + return remote + ctx = Mock() + class cluster: + remote1 = make_remote() + remote2 = make_remote() + remotes = { + remote1: ['client.0'], + remote2: ['mon.a','osd.0'], + } + def only(self, role): + result = Mock() + if role in ('client.0',): + result.remotes = { cluster.remote1: None } + elif role in ('osd.0', 'mon.a'): + result.remotes = { cluster.remote2: None } + else: + result.remotes = None + return result + ctx.cluster = cluster() + ctx.config = { + 'roles': [ ['client.0'], ['mon.a','osd.0'] ], + } + + # nothing -> nothing + assert install.upgrade_remote_to_config(ctx, {}) == {} + + # select the remote for the osd.0 role + # the 'ignored' role does not exist and is ignored + # the remote for mon.a is the same as for osd.0 and + # is silently ignored (actually it could be the other + # way around, depending on how the keys are hashed) + config = { + 'osd.0': { + 'sha1': 'expectedsha1', + }, + 'ignored': None, + 'mon.a': { + 'sha1': 'expectedsha1', + }, + } + expected_config = { + cluster.remote2: { + 'project': 'ceph', + 'sha1': 'expectedsha1', + }, + } + assert install.upgrade_remote_to_config(ctx, config) == expected_config + + # select all nodes, regardless + config = { + 'all': { + 'sha1': 'expectedsha1', + }, + } + expected_config = { + cluster.remote1: { + 'project': 'ceph', + 'sha1': 'expectedsha1', + }, + cluster.remote2: { + 'project': 'ceph', + 'sha1': 'expectedsha1', + }, + } + assert install.upgrade_remote_to_config(ctx, config) == expected_config + + # verify that install overrides are used as default + # values for the upgrade task, not as override + ctx.config['overrides'] = { + 'install': { + 'ceph': { + 'sha1': 'overridesha1', + 'tag': 
'overridetag', + 'branch': 'overridebranch', + }, + }, + } + config = { + 'client.0': { + 'sha1': 'expectedsha1', + }, + 'osd.0': { + }, + } + expected_config = { + cluster.remote1: { + 'project': 'ceph', + 'sha1': 'expectedsha1', + }, + cluster.remote2: { + 'project': 'ceph', + 'sha1': 'overridesha1', + 'tag': 'overridetag', + 'branch': 'overridebranch', + }, + } + assert install.upgrade_remote_to_config(ctx, config) == expected_config + + + @patch("teuthology.task.install.packaging.get_package_version") + @patch("teuthology.task.install.redhat.set_deb_repo") + def test_rh_install_deb_pkgs(self, m_set_rh_deb_repo, m_get_pkg_version): + ctx = Mock() + remote = Mock() + version = '1.3.2' + rh_ds_yaml = dict() + rh_ds_yaml = { + 'versions': {'deb': {'mapped': {'1.3.2': '0.94.5'}}}, + 'pkgs': {'deb': ['pkg1', 'pkg2']}, + 'extra_system_packages': {'deb': ['es_pkg1', 'es_pkg2']}, + 'extra_packages': {'deb': ['e_pkg1', 'e_pkg2']}, + } + m_get_pkg_version.return_value = "0.94.5" + install.redhat.install_deb_pkgs(ctx, remote, version, rh_ds_yaml) + + @patch("teuthology.task.install.packaging.get_package_version") + def test_rh_install_pkgs(self, m_get_pkg_version): + ctx = Mock() + remote = Mock() + version = '1.3.2' + rh_ds_yaml = dict() + rh_ds_yaml = { + 'versions': {'rpm': {'mapped': {'1.3.2': '0.94.5', + '1.3.1': '0.94.3'}}}, + 'pkgs': {'rpm': ['pkg1', 'pkg2']}, + 'extra_system_packages': {'rpm': ['es_pkg1', 'es_pkg2']}, + 'extra_packages': {'rpm': ['e_pkg1', 'e_pkg2']}, + } + + m_get_pkg_version.return_value = "0.94.5" + install.redhat.install_pkgs(ctx, remote, version, rh_ds_yaml) + version = '1.3.1' + with pytest.raises(RuntimeError) as e: + install.redhat.install_pkgs(ctx, remote, version, rh_ds_yaml) + assert "Version check failed" in str(e) diff --git a/teuthology/test/task/test_internal.py b/teuthology/test/task/test_internal.py new file mode 100644 index 000000000..1340b1822 --- /dev/null +++ b/teuthology/test/task/test_internal.py @@ -0,0 +1,57 @@ +from 
teuthology.config import FakeNamespace +from teuthology.task import internal + + +class TestInternal(object): + def setup_method(self): + self.ctx = FakeNamespace() + self.ctx.config = dict() + + def test_buildpackages_prep(self): + # + # no buildpackages nor install tasks + # + self.ctx.config = { 'tasks': [] } + assert internal.buildpackages_prep(self.ctx, + self.ctx.config) == internal.BUILDPACKAGES_NOTHING + # + # make the buildpackages tasks the first to run + # + self.ctx.config = { + 'tasks': [ { 'atask': None }, + { 'internal.buildpackages_prep': None }, + { 'btask': None }, + { 'install': None }, + { 'buildpackages': None } ], + } + assert internal.buildpackages_prep(self.ctx, + self.ctx.config) == internal.BUILDPACKAGES_FIRST + assert self.ctx.config == { + 'tasks': [ { 'atask': None }, + { 'internal.buildpackages_prep': None }, + { 'buildpackages': None }, + { 'btask': None }, + { 'install': None } ], + } + # + # the buildpackages task already the first task to run + # + assert internal.buildpackages_prep(self.ctx, + self.ctx.config) == internal.BUILDPACKAGES_OK + # + # no buildpackages task + # + self.ctx.config = { + 'tasks': [ { 'install': None } ], + } + assert internal.buildpackages_prep(self.ctx, + self.ctx.config) == internal.BUILDPACKAGES_NOTHING + # + # no install task: the buildpackages task must be removed + # + self.ctx.config = { + 'tasks': [ { 'buildpackages': None } ], + } + assert internal.buildpackages_prep(self.ctx, + self.ctx.config) == internal.BUILDPACKAGES_REMOVED + assert self.ctx.config == {'tasks': []} diff --git a/teuthology/test/task/test_kernel.py b/teuthology/test/task/test_kernel.py new file mode 100644 index 000000000..593b204fa --- /dev/null +++ b/teuthology/test/task/test_kernel.py @@ -0,0 +1,243 @@ +from teuthology.config import FakeNamespace +from teuthology.orchestra.cluster import Cluster +from teuthology.orchestra.remote import Remote +from teuthology.task.kernel import ( + normalize_and_apply_overrides, + 
CONFIG_DEFAULT, + TIMEOUT_DEFAULT, +) + +class TestKernelNormalizeAndApplyOverrides(object): + + def setup_method(self): + self.ctx = FakeNamespace() + self.ctx.cluster = Cluster() + self.ctx.cluster.add(Remote('remote1'), ['mon.a', 'client.0']) + self.ctx.cluster.add(Remote('remote2'), ['osd.0', 'osd.1', 'osd.2']) + self.ctx.cluster.add(Remote('remote3'), ['client.1']) + + def test_default(self): + config = {} + overrides = {} + config, t = normalize_and_apply_overrides(self.ctx, config, overrides) + assert config == { + 'mon.a': CONFIG_DEFAULT, + 'osd.0': CONFIG_DEFAULT, + 'osd.1': CONFIG_DEFAULT, + 'osd.2': CONFIG_DEFAULT, + 'client.0': CONFIG_DEFAULT, + 'client.1': CONFIG_DEFAULT, + } + assert t == TIMEOUT_DEFAULT + + def test_timeout_default(self): + config = { + 'client.0': {'branch': 'testing'}, + } + overrides = {} + config, t = normalize_and_apply_overrides(self.ctx, config, overrides) + assert config == { + 'client.0': {'branch': 'testing'}, + } + assert t == TIMEOUT_DEFAULT + + def test_timeout(self): + config = { + 'client.0': {'branch': 'testing'}, + 'timeout': 100, + } + overrides = {} + config, t = normalize_and_apply_overrides(self.ctx, config, overrides) + assert config == { + 'client.0': {'branch': 'testing'}, + } + assert t == 100 + + def test_override_timeout(self): + config = { + 'client.0': {'branch': 'testing'}, + 'timeout': 100, + } + overrides = { + 'timeout': 200, + } + config, t = normalize_and_apply_overrides(self.ctx, config, overrides) + assert config == { + 'client.0': {'branch': 'testing'}, + } + assert t == 200 + + def test_override_same_version_key(self): + config = { + 'client.0': {'branch': 'testing'}, + } + overrides = { + 'client.0': {'branch': 'wip-foobar'}, + } + config, t = normalize_and_apply_overrides(self.ctx, config, overrides) + assert config == { + 'client.0': {'branch': 'wip-foobar'}, + } + assert t == TIMEOUT_DEFAULT + + def test_override_different_version_key(self): + config = { + 'client.0': {'branch': 'testing'}, 
+ } + overrides = { + 'client.0': {'tag': 'v4.1'}, + } + config, t = normalize_and_apply_overrides(self.ctx, config, overrides) + assert config == { + 'client.0': {'tag': 'v4.1'}, + } + assert t == TIMEOUT_DEFAULT + + def test_override_actual(self): + config = { + 'osd.1': {'tag': 'v4.1'}, + 'client.0': {'branch': 'testing'}, + } + overrides = { + 'osd.1': {'koji': 1234, 'kdb': True}, + } + config, t = normalize_and_apply_overrides(self.ctx, config, overrides) + assert config == { + 'osd.1': {'koji': 1234, 'kdb': True}, + 'client.0': {'branch': 'testing'}, + } + assert t == TIMEOUT_DEFAULT + + def test_override_actual_with_generic(self): + config = { + 'osd.1': {'tag': 'v4.1', 'kdb': False}, + 'client.0': {'branch': 'testing'}, + } + overrides = { + 'osd': {'koji': 1234}, + } + config, t = normalize_and_apply_overrides(self.ctx, config, overrides) + assert config == { + 'osd.0': {'koji': 1234}, + 'osd.1': {'koji': 1234, 'kdb': False}, + 'osd.2': {'koji': 1234}, + 'client.0': {'branch': 'testing'}, + } + assert t == TIMEOUT_DEFAULT + + def test_override_actual_with_top_level(self): + config = { + 'osd.1': {'tag': 'v4.1'}, + 'client.0': {'branch': 'testing', 'kdb': False}, + } + overrides = {'koji': 1234, 'kdb': True} + config, t = normalize_and_apply_overrides(self.ctx, config, overrides) + assert config == { + 'mon.a': {'koji': 1234, 'kdb': True}, + 'osd.0': {'koji': 1234, 'kdb': True}, + 'osd.1': {'koji': 1234, 'kdb': True}, + 'osd.2': {'koji': 1234, 'kdb': True}, + 'client.0': {'koji': 1234, 'kdb': True}, + 'client.1': {'koji': 1234, 'kdb': True}, + } + assert t == TIMEOUT_DEFAULT + + def test_override_generic(self): + config = { + 'osd': {'tag': 'v4.1'}, + 'client': {'branch': 'testing'}, + } + overrides = { + 'client': {'koji': 1234, 'kdb': True}, + } + config, t = normalize_and_apply_overrides(self.ctx, config, overrides) + assert config == { + 'osd.0': {'tag': 'v4.1'}, + 'osd.1': {'tag': 'v4.1'}, + 'osd.2': {'tag': 'v4.1'}, + 'client.0': {'koji': 1234, 'kdb': 
True}, + 'client.1': {'koji': 1234, 'kdb': True}, + } + assert t == TIMEOUT_DEFAULT + + def test_override_generic_with_top_level(self): + config = { + 'osd': {'tag': 'v4.1'}, + 'client': {'branch': 'testing', 'kdb': False}, + } + overrides = { + 'client': {'koji': 1234}, + } + config, t = normalize_and_apply_overrides(self.ctx, config, overrides) + assert config == { + 'osd.0': {'tag': 'v4.1'}, + 'osd.1': {'tag': 'v4.1'}, + 'osd.2': {'tag': 'v4.1'}, + 'client.0': {'koji': 1234, 'kdb': False}, + 'client.1': {'koji': 1234, 'kdb': False}, + } + assert t == TIMEOUT_DEFAULT + + def test_override_generic_with_actual(self): + config = { + 'osd': {'tag': 'v4.1', 'kdb': False}, + 'client': {'branch': 'testing'}, + } + overrides = { + 'osd.2': {'koji': 1234, 'kdb': True}, + } + config, t = normalize_and_apply_overrides(self.ctx, config, overrides) + assert config == { + 'osd.0': {'tag': 'v4.1', 'kdb': False}, + 'osd.1': {'tag': 'v4.1', 'kdb': False}, + 'osd.2': {'koji': 1234, 'kdb': True}, + 'client.0': {'branch': 'testing'}, + 'client.1': {'branch': 'testing'}, + } + assert t == TIMEOUT_DEFAULT + + def test_override_top_level(self): + config = {'branch': 'testing'} + overrides = {'koji': 1234, 'kdb': True} + config, t = normalize_and_apply_overrides(self.ctx, config, overrides) + assert config == { + 'mon.a': {'koji': 1234, 'kdb': True}, + 'osd.0': {'koji': 1234, 'kdb': True}, + 'osd.1': {'koji': 1234, 'kdb': True}, + 'osd.2': {'koji': 1234, 'kdb': True}, + 'client.0': {'koji': 1234, 'kdb': True}, + 'client.1': {'koji': 1234, 'kdb': True}, + } + assert t == TIMEOUT_DEFAULT + + def test_override_top_level_with_actual(self): + config = {'branch': 'testing', 'kdb': False} + overrides = { + 'mon.a': {'koji': 1234}, + } + config, t = normalize_and_apply_overrides(self.ctx, config, overrides) + assert config == { + 'mon.a': {'koji': 1234, 'kdb': False}, + 'osd.0': {'branch': 'testing', 'kdb': False}, + 'osd.1': {'branch': 'testing', 'kdb': False}, + 'osd.2': {'branch': 'testing', 
'kdb': False}, + 'client.0': {'branch': 'testing', 'kdb': False}, + 'client.1': {'branch': 'testing', 'kdb': False}, + } + assert t == TIMEOUT_DEFAULT + + def test_override_top_level_with_generic(self): + config = {'branch': 'testing', 'kdb': False} + overrides = { + 'client': {'koji': 1234, 'kdb': True}, + } + config, t = normalize_and_apply_overrides(self.ctx, config, overrides) + assert config == { + 'mon.a': {'branch': 'testing', 'kdb': False}, + 'osd.0': {'branch': 'testing', 'kdb': False}, + 'osd.1': {'branch': 'testing', 'kdb': False}, + 'osd.2': {'branch': 'testing', 'kdb': False}, + 'client.0': {'koji': 1234, 'kdb': True}, + 'client.1': {'koji': 1234, 'kdb': True}, + } + assert t == TIMEOUT_DEFAULT diff --git a/teuthology/test/task/test_pcp.py b/teuthology/test/task/test_pcp.py new file mode 100644 index 000000000..777740779 --- /dev/null +++ b/teuthology/test/task/test_pcp.py @@ -0,0 +1,379 @@ +import os +import requests + +from teuthology.util.compat import parse_qs, urljoin + +from mock import patch, DEFAULT, Mock, mock_open, call +from pytest import raises + +from teuthology.config import config, FakeNamespace +from teuthology.orchestra.cluster import Cluster +from teuthology.orchestra.remote import Remote +from teuthology.orchestra.run import Raw +from teuthology.task.pcp import (PCPDataSource, PCPArchive, PCPGrapher, + GrafanaGrapher, GraphiteGrapher, PCP) + +from teuthology.test.task import TestTask + +pcp_host = 'http://pcp.front.sepia.ceph.com:44323/' + + +class TestPCPDataSource(object): + klass = PCPDataSource + + def setup_method(self): + config.pcp_host = pcp_host + + def test_init(self): + hosts = ['host1', 'host2'] + time_from = 'now-2h' + time_until = 'now' + obj = self.klass( + hosts=hosts, + time_from=time_from, + time_until=time_until, + ) + assert obj.hosts == hosts + assert obj.time_from == time_from + assert obj.time_until == time_until + + +class TestPCPArchive(TestPCPDataSource): + klass = PCPArchive + + def 
test_get_archive_input_dir(self): + hosts = ['host1', 'host2'] + time_from = 'now-1d' + obj = self.klass( + hosts=hosts, + time_from=time_from, + ) + assert obj.get_archive_input_dir('host1') == \ + '/var/log/pcp/pmlogger/host1' + + def test_get_pmlogextract_cmd(self): + obj = self.klass( + hosts=['host1'], + time_from='now-3h', + time_until='now-1h', + ) + expected = [ + 'pmlogextract', + '-S', 'now-3h', + '-T', 'now-1h', + Raw('/var/log/pcp/pmlogger/host1/*.0'), + ] + assert obj.get_pmlogextract_cmd('host1') == expected + + def test_format_time(self): + assert self.klass._format_time(1462893484) == \ + '@ Tue May 10 15:18:04 2016' + + def test_format_time_now(self): + assert self.klass._format_time('now-1h') == 'now-1h' + + +class TestPCPGrapher(TestPCPDataSource): + klass = PCPGrapher + + def test_init(self): + hosts = ['host1', 'host2'] + time_from = 'now-2h' + time_until = 'now' + obj = self.klass( + hosts=hosts, + time_from=time_from, + time_until=time_until, + ) + assert obj.hosts == hosts + assert obj.time_from == time_from + assert obj.time_until == time_until + expected_url = urljoin(config.pcp_host, self.klass._endpoint) + assert obj.base_url == expected_url + + +class TestGrafanaGrapher(TestPCPGrapher): + klass = GrafanaGrapher + + def test_build_graph_url(self): + hosts = ['host1'] + time_from = 'now-3h' + time_until = 'now-1h' + obj = self.klass( + hosts=hosts, + time_from=time_from, + time_until=time_until, + ) + base_url = urljoin( + config.pcp_host, + 'grafana/index.html#/dashboard/script/index.js', + ) + assert obj.base_url == base_url + got_url = obj.build_graph_url() + parsed_query = parse_qs(got_url.split('?')[1]) + assert parsed_query['hosts'] == hosts + assert len(parsed_query['time_from']) == 1 + assert parsed_query['time_from'][0] == time_from + assert len(parsed_query['time_to']) == 1 + assert parsed_query['time_to'][0] == time_until + + def test_format_time(self): + assert self.klass._format_time(1462893484) == \ + '2016-05-10T15:18:04' + 
+ def test_format_time_now(self): + assert self.klass._format_time('now-1h') == 'now-1h' + + +class TestGraphiteGrapher(TestPCPGrapher): + klass = GraphiteGrapher + + def test_build_graph_urls(self): + obj = self.klass( + hosts=['host1', 'host2'], + time_from='now-3h', + time_until='now-1h', + ) + expected_urls = [obj.get_graph_url(m) for m in obj.metrics] + obj.build_graph_urls() + built_urls = [] + for metric in obj.graphs.keys(): + built_urls.append(obj.graphs[metric]['url']) + assert len(built_urls) == len(expected_urls) + assert sorted(built_urls) == sorted(expected_urls) + + def test_check_dest_dir(self): + obj = self.klass( + hosts=['host1'], + time_from='now-3h', + ) + assert obj.dest_dir is None + with raises(RuntimeError): + obj._check_dest_dir() + + def test_generate_html_dynamic(self): + obj = self.klass( + hosts=['host1'], + time_from='now-3h', + ) + html = obj.generate_html() + assert config.pcp_host in html + + def test_download_graphs(self): + dest_dir = '/fake/path' + obj = self.klass( + hosts=['host1'], + time_from='now-3h', + dest_dir=dest_dir, + ) + _format = obj.graph_defaults.get('format') + with patch('teuthology.task.pcp.requests.get', create=True) as m_get: + m_resp = Mock() + m_resp.ok = True + m_get.return_value = m_resp + with patch('teuthology.task.pcp.open', mock_open(), create=True): + obj.download_graphs() + expected_filenames = [] + for metric in obj.metrics: + expected_filenames.append( + "{}.{}".format( + os.path.join( + dest_dir, + obj._sanitize_metric_name(metric), + ), + _format, + ) + ) + graph_filenames = [] + for metric in obj.graphs.keys(): + graph_filenames.append(obj.graphs[metric]['file']) + assert sorted(graph_filenames) == sorted(expected_filenames) + + def test_generate_html_static(self): + obj = self.klass( + hosts=['host1'], + time_from='now-3h', + dest_dir='/fake/path', + ) + with patch('teuthology.task.pcp.requests.get', create=True) as m_get: + m_resp = Mock() + m_resp.ok = True + m_get.return_value = m_resp + 
with patch('teuthology.task.pcp.open', mock_open(), create=True): + obj.download_graphs() + html = obj.generate_html(mode='static') + assert config.pcp_host not in html + + def test_sanitize_metric_name(self): + sanitized_metrics = { + 'foo.bar': 'foo.bar', + 'foo.*': 'foo._all_', + 'foo.bar baz': 'foo.bar_baz', + 'foo.*.bar baz': 'foo._all_.bar_baz', + } + for in_, out in sanitized_metrics.items(): + assert self.klass._sanitize_metric_name(in_) == out + + def test_get_target_globs(self): + obj = self.klass( + hosts=['host1'], + time_from='now-3h', + ) + assert obj.get_target_globs() == ['*host1*'] + assert obj.get_target_globs('a.metric') == ['*host1*.a.metric'] + obj.hosts.append('host2') + assert obj.get_target_globs() == ['*host1*', '*host2*'] + assert obj.get_target_globs('a.metric') == \ + ['*host1*.a.metric', '*host2*.a.metric'] + + +class TestPCPTask(TestTask): + klass = PCP + task_name = 'pcp' + + def setup_method(self): + self.ctx = FakeNamespace() + self.ctx.cluster = Cluster() + self.ctx.cluster.add(Remote('user@remote1'), ['role1']) + self.ctx.cluster.add(Remote('user@remote2'), ['role2']) + self.ctx.config = dict() + self.task_config = dict() + config.pcp_host = pcp_host + + def test_init(self): + task = self.klass(self.ctx, self.task_config) + assert task.stop_time == 'now' + + def test_disabled(self): + config.pcp_host = None + with self.klass(self.ctx, self.task_config) as task: + assert task.enabled is False + assert not hasattr(task, 'grafana') + assert not hasattr(task, 'graphite') + assert not hasattr(task, 'archiver') + + def test_setup(self): + with patch.multiple( + self.klass, + setup_collectors=DEFAULT, + begin=DEFAULT, + end=DEFAULT, + ): + with self.klass(self.ctx, self.task_config) as task: + task.setup_collectors.assert_called_once_with() + assert isinstance(task.start_time, int) + + def test_setup_collectors(self): + with patch.multiple( + self.klass, + begin=DEFAULT, + end=DEFAULT, + ): + with self.klass(self.ctx, self.task_config) 
as task: + assert hasattr(task, 'grafana') + assert not hasattr(task, 'graphite') + assert not hasattr(task, 'archiver') + self.task_config['grafana'] = False + with self.klass(self.ctx, self.task_config) as task: + assert not hasattr(task, 'grafana') + + @patch('os.makedirs') + def test_setup_grafana(self, m_makedirs): + with patch.multiple( + self.klass, + begin=DEFAULT, + end=DEFAULT, + ): + self.ctx.archive = '/fake/path' + with self.klass(self.ctx, self.task_config) as task: + assert hasattr(task, 'grafana') + self.task_config['grafana'] = False + with self.klass(self.ctx, self.task_config) as task: + assert not hasattr(task, 'grafana') + + @patch('os.makedirs') + @patch('teuthology.task.pcp.GraphiteGrapher') + def test_setup_graphite(self, m_graphite_grapher, m_makedirs): + with patch.multiple( + self.klass, + begin=DEFAULT, + end=DEFAULT, + ): + with self.klass(self.ctx, self.task_config) as task: + assert not hasattr(task, 'graphite') + self.task_config['graphite'] = False + with self.klass(self.ctx, self.task_config) as task: + assert not hasattr(task, 'graphite') + self.ctx.archive = '/fake/path' + self.task_config['graphite'] = True + with self.klass(self.ctx, self.task_config) as task: + assert hasattr(task, 'graphite') + self.task_config['graphite'] = False + with self.klass(self.ctx, self.task_config) as task: + assert not hasattr(task, 'graphite') + + @patch('os.makedirs') + @patch('teuthology.task.pcp.PCPArchive') + def test_setup_archiver(self, m_archive, m_makedirs): + with patch.multiple( + self.klass, + begin=DEFAULT, + end=DEFAULT, + ): + self.task_config['fetch_archives'] = True + with self.klass(self.ctx, self.task_config) as task: + assert not hasattr(task, 'archiver') + self.task_config['fetch_archives'] = False + with self.klass(self.ctx, self.task_config) as task: + assert not hasattr(task, 'archiver') + self.ctx.archive = '/fake/path' + self.task_config['fetch_archives'] = True + with self.klass(self.ctx, self.task_config) as task: + 
assert hasattr(task, 'archiver') + self.task_config['fetch_archives'] = False + with self.klass(self.ctx, self.task_config) as task: + assert not hasattr(task, 'archiver') + + @patch('os.makedirs') + @patch('teuthology.task.pcp.GrafanaGrapher') + @patch('teuthology.task.pcp.GraphiteGrapher') + def test_begin(self, m_grafana, m_graphite, m_makedirs): + with patch.multiple( + self.klass, + end=DEFAULT, + ): + with self.klass(self.ctx, self.task_config) as task: + task.grafana.build_graph_url.assert_called_once_with() + self.task_config['graphite'] = True + self.ctx.archive = '/fake/path' + with self.klass(self.ctx, self.task_config) as task: + task.graphite.write_html.assert_called_once_with() + + @patch('os.makedirs') + @patch('teuthology.task.pcp.GrafanaGrapher') + @patch('teuthology.task.pcp.GraphiteGrapher') + def test_end(self, m_grafana, m_graphite, m_makedirs): + self.ctx.archive = '/fake/path' + with self.klass(self.ctx, self.task_config) as task: + # begin() should have called write_html() once by now, with no args + task.graphite.write_html.assert_called_once_with() + # end() should have called write_html() a second time by now, with + # mode=static + second_call = task.graphite.write_html.call_args_list[1] + assert second_call[1]['mode'] == 'static' + assert isinstance(task.stop_time, int) + + @patch('os.makedirs') + @patch('teuthology.task.pcp.GrafanaGrapher') + @patch('teuthology.task.pcp.GraphiteGrapher') + def test_end_16049(self, m_grafana, m_graphite, m_makedirs): + # http://tracker.ceph.com/issues/16049 + # Jobs were failing if graph downloading failed. We don't want that. 
+ self.ctx.archive = '/fake/path' + with self.klass(self.ctx, self.task_config) as task: + task.graphite.download_graphs.side_effect = \ + requests.ConnectionError + # Even though downloading graphs failed, we should have called + # write_html() a second time, again with no args + assert task.graphite.write_html.call_args_list == [call(), call()] + assert isinstance(task.stop_time, int) diff --git a/teuthology/test/task/test_selinux.py b/teuthology/test/task/test_selinux.py new file mode 100644 index 000000000..68c8e9c03 --- /dev/null +++ b/teuthology/test/task/test_selinux.py @@ -0,0 +1,35 @@ +from mock import patch, Mock, DEFAULT + +from teuthology.config import FakeNamespace +from teuthology.orchestra.cluster import Cluster +from teuthology.orchestra.remote import Remote +from teuthology.task.selinux import SELinux + + +class TestSELinux(object): + def setup_method(self): + self.ctx = FakeNamespace() + self.ctx.config = dict() + + def test_host_exclusion(self): + with patch.multiple( + Remote, + os=DEFAULT, + run=DEFAULT, + ): + self.ctx.cluster = Cluster() + remote1 = Remote('remote1') + remote1.os = Mock() + remote1.os.package_type = 'rpm' + remote1._is_vm = False + self.ctx.cluster.add(remote1, ['role1']) + remote2 = Remote('remote1') + remote2.os = Mock() + remote2.os.package_type = 'deb' + remote2._is_vm = False + self.ctx.cluster.add(remote2, ['role2']) + task_config = dict() + with SELinux(self.ctx, task_config) as task: + remotes = list(task.cluster.remotes) + assert remotes == [remote1] + diff --git a/teuthology/test/test_config.py b/teuthology/test/test_config.py new file mode 100644 index 000000000..262000890 --- /dev/null +++ b/teuthology/test/test_config.py @@ -0,0 +1,189 @@ +import pytest + +from teuthology import config + + +class TestYamlConfig(object): + def setup_method(self): + self.test_class = config.YamlConfig + + def test_set_multiple(self): + conf_obj = self.test_class() + conf_obj.foo = 'foo' + conf_obj.bar = 'bar' + assert conf_obj.foo 
== 'foo' + assert conf_obj.bar == 'bar' + assert conf_obj.to_dict()['foo'] == 'foo' + + def test_from_dict(self): + in_dict = dict(foo='bar') + conf_obj = self.test_class.from_dict(in_dict) + assert conf_obj.foo == 'bar' + + def test_contains(self): + in_dict = dict(foo='bar') + conf_obj = self.test_class.from_dict(in_dict) + conf_obj.bar = "foo" + assert "bar" in conf_obj + assert "foo" in conf_obj + assert "baz" not in conf_obj + + def test_to_dict(self): + in_dict = dict(foo='bar') + conf_obj = self.test_class.from_dict(in_dict) + assert conf_obj.to_dict() == in_dict + + def test_from_str(self): + in_str = "foo: bar" + conf_obj = self.test_class.from_str(in_str) + assert conf_obj.foo == 'bar' + + def test_to_str(self): + in_str = "foo: bar" + conf_obj = self.test_class.from_str(in_str) + assert conf_obj.to_str() == in_str + + def test_update(self): + conf_obj = self.test_class(dict()) + conf_obj.foo = 'foo' + conf_obj.bar = 'bar' + conf_obj.update(dict(bar='baz')) + assert conf_obj.foo == 'foo' + assert conf_obj.bar == 'baz' + + def test_delattr(self): + conf_obj = self.test_class() + conf_obj.foo = 'bar' + assert conf_obj.foo == 'bar' + del conf_obj.foo + assert conf_obj.foo is None + + def test_assignment(self): + conf_obj = self.test_class() + conf_obj["foo"] = "bar" + assert conf_obj["foo"] == "bar" + assert conf_obj.foo == "bar" + + def test_used_with_update(self): + d = dict() + conf_obj = self.test_class.from_dict({"foo": "bar"}) + d.update(conf_obj) + assert d["foo"] == "bar" + + def test_get(self): + conf_obj = self.test_class() + assert conf_obj.get('foo') is None + assert conf_obj.get('foo', 'bar') == 'bar' + conf_obj.foo = 'baz' + assert conf_obj.get('foo') == 'baz' + + +class TestTeuthologyConfig(TestYamlConfig): + def setup_method(self): + self.test_class = config.TeuthologyConfig + + def test_get_ceph_git_base_default(self): + conf_obj = self.test_class() + conf_obj.yaml_path = '' + conf_obj.load() + assert conf_obj.ceph_git_base_url == 
"https://github.com/ceph/" + + def test_set_ceph_git_base_via_private(self): + conf_obj = self.test_class() + conf_obj._conf['ceph_git_base_url'] = \ + "git://git.ceph.com/" + assert conf_obj.ceph_git_base_url == "git://git.ceph.com/" + + def test_get_reserve_machines_default(self): + conf_obj = self.test_class() + conf_obj.yaml_path = '' + conf_obj.load() + assert conf_obj.reserve_machines == 5 + + def test_set_reserve_machines_via_private(self): + conf_obj = self.test_class() + conf_obj._conf['reserve_machines'] = 2 + assert conf_obj.reserve_machines == 2 + + def test_set_nonstandard(self): + conf_obj = self.test_class() + conf_obj.something = 'something else' + assert conf_obj.something == 'something else' + + +class TestJobConfig(TestYamlConfig): + def setup_method(self): + self.test_class = config.JobConfig + + +class TestFakeNamespace(TestYamlConfig): + def setup_method(self): + self.test_class = config.FakeNamespace + + def test_docopt_dict(self): + """ + Tests if a dict in the format that docopt returns can + be parsed correctly. 
+ """ + d = { + "--verbose": True, + "--an-option": "some_option", + "": "the_arg", + "something": "some_thing", + } + conf_obj = self.test_class(d) + assert conf_obj.verbose + assert conf_obj.an_option == "some_option" + assert conf_obj.an_arg == "the_arg" + assert conf_obj.something == "some_thing" + + def test_config(self): + """ + Tests that a teuthology_config property is automatically added + to the conf_obj + """ + conf_obj = self.test_class(dict(foo="bar")) + assert conf_obj["foo"] == "bar" + assert conf_obj.foo == "bar" + assert conf_obj.teuthology_config.get("fake key") is None + + def test_getattr(self): + conf_obj = self.test_class.from_dict({"foo": "bar"}) + result = getattr(conf_obj, "not_there", "default") + assert result == "default" + result = getattr(conf_obj, "foo") + assert result == "bar" + + def test_none(self): + conf_obj = self.test_class.from_dict(dict(null=None)) + assert conf_obj.null is None + + def test_delattr(self): + conf_obj = self.test_class() + conf_obj.foo = 'bar' + assert conf_obj.foo == 'bar' + del conf_obj.foo + with pytest.raises(AttributeError): + conf_obj.foo + + def test_to_str(self): + in_str = "foo: bar" + conf_obj = self.test_class.from_str(in_str) + assert conf_obj.to_str() == "{'foo': 'bar'}" + + def test_multiple_access(self): + """ + Test that config.config and FakeNamespace.teuthology_config reflect + each others' modifications + """ + in_str = "foo: bar" + conf_obj = self.test_class.from_str(in_str) + assert config.config.get('test_key_1') is None + assert conf_obj.teuthology_config.get('test_key_1') is None + config.config.test_key_1 = 'test value' + assert conf_obj.teuthology_config['test_key_1'] == 'test value' + + assert config.config.get('test_key_2') is None + assert conf_obj.teuthology_config.get('test_key_2') is None + conf_obj.teuthology_config['test_key_2'] = 'test value' + assert config.config['test_key_2'] == 'test value' diff --git a/teuthology/test/test_contextutil.py 
b/teuthology/test/test_contextutil.py new file mode 100644 index 000000000..980415e3d --- /dev/null +++ b/teuthology/test/test_contextutil.py @@ -0,0 +1,97 @@ +from pytest import raises +from teuthology import contextutil +from logging import ERROR + + +class TestSafeWhile(object): + + def setup_method(self): + contextutil.log.setLevel(ERROR) + self.fake_sleep = lambda s: True + self.s_while = contextutil.safe_while + + def test_6_5_10_deal(self): + with raises(contextutil.MaxWhileTries): + with self.s_while(_sleeper=self.fake_sleep) as proceed: + while proceed(): + pass + + def test_6_0_1_deal(self): + with raises(contextutil.MaxWhileTries) as error: + with self.s_while( + tries=1, + _sleeper=self.fake_sleep + ) as proceed: + while proceed(): + pass + + assert 'waiting for 6 seconds' in str(error) + + def test_1_0_10_deal(self): + with raises(contextutil.MaxWhileTries) as error: + with self.s_while( + sleep=1, + _sleeper=self.fake_sleep + ) as proceed: + while proceed(): + pass + + assert 'waiting for 10 seconds' in str(error) + + def test_6_1_10_deal(self): + with raises(contextutil.MaxWhileTries) as error: + with self.s_while( + increment=1, + _sleeper=self.fake_sleep + ) as proceed: + while proceed(): + pass + + assert 'waiting for 105 seconds' in str(error) + + def test_timeout(self): + # series of sleep, increment, timeout params to test + params = [(10, 0, 100), + (1, 2, 30), + (10, 0.5, 100), + (2, 0, 5), + (2, 3, 5), + (10, 0, 15), + (20, 10, 60)] + for sleep, increment, timeout in params: + print("trying ", sleep, increment, timeout) + with raises(contextutil.MaxWhileTries) as error: + with self.s_while( + sleep=sleep, + increment=increment, + timeout=timeout, + _sleeper=self.fake_sleep + ) as proceed: + while proceed(): + pass + + assert 'waiting for {timeout}'.format(timeout=timeout) in str(error) + + def test_action(self): + with raises(contextutil.MaxWhileTries) as error: + with self.s_while( + action='doing the thing', + _sleeper=self.fake_sleep + ) 
as proceed: + while proceed(): + pass + + assert "'doing the thing' reached maximum tries" in str(error) + + def test_no_raise(self): + with self.s_while(_raise=False, _sleeper=self.fake_sleep) as proceed: + while proceed(): + pass + + assert True + + def test_tries(self): + attempts = 0 + with self.s_while(tries=-1, _sleeper=self.fake_sleep) as proceed: + while attempts < 100 and proceed(): + attempts += 1 diff --git a/teuthology/test/test_describe_tests.py b/teuthology/test/test_describe_tests.py new file mode 100644 index 000000000..04d177702 --- /dev/null +++ b/teuthology/test/test_describe_tests.py @@ -0,0 +1,317 @@ +# -*- coding: utf-8 -*- +import pytest + +from teuthology.test.fake_fs import make_fake_fstools +from teuthology.describe_tests import (tree_with_info, extract_info, + get_combinations) +from teuthology.exceptions import ParseError +from mock import MagicMock, patch + +realistic_fs = { + 'basic': { + '%': None, + 'base': { + 'install.yaml': + """meta: +- desc: install ceph +install: +""" + }, + 'clusters': { + 'fixed-1.yaml': + """meta: +- desc: single node cluster +roles: +- [osd.0, osd.1, osd.2, mon.a, mon.b, mon.c, client.0] +""", + 'fixed-2.yaml': + """meta: +- desc: couple node cluster +roles: +- [osd.0, osd.1, osd.2, mon.a, mon.b, mon.c] +- [client.0] +""", + 'fixed-3.yaml': + """meta: +- desc: triple node cluster +roles: +- [osd.0, osd.1, osd.2, mon.a, mon.b, mon.c] +- [client.0] +- [client.1] +""" + }, + 'workloads': { + 'rbd_api_tests_old_format.yaml': + """meta: +- desc: c/c++ librbd api tests with format 1 images + rbd_features: none +overrides: + ceph: + conf: + client: + rbd default format: 1 +tasks: +- workunit: + env: + RBD_FEATURES: 0 + clients: + client.0: + - rbd/test_librbd.sh +""", + 'rbd_api_tests.yaml': + """meta: +- desc: c/c++ librbd api tests with default settings + rbd_features: default +tasks: +- workunit: + clients: + client.0: + - rbd/test_librbd.sh +""", + }, + }, +} + + +expected_tree = """├── % +├── base +│ └── 
install.yaml +├── clusters +│ ├── fixed-1.yaml +│ ├── fixed-2.yaml +│ └── fixed-3.yaml +└── workloads + ├── rbd_api_tests.yaml + └── rbd_api_tests_old_format.yaml""".split('\n') + + +expected_facets = [ + '', + '', + 'base', + '', + 'clusters', + 'clusters', + 'clusters', + '', + 'workloads', + 'workloads', +] + + +expected_desc = [ + '', + '', + 'install ceph', + '', + 'single node cluster', + 'couple node cluster', + 'triple node cluster', + '', + 'c/c++ librbd api tests with default settings', + 'c/c++ librbd api tests with format 1 images', +] + + +expected_rbd_features = [ + '', + '', + '', + '', + '', + '', + '', + '', + 'default', + 'none', +] + + +class TestDescribeTests(object): + + def setup_method(self): + self.mocks = dict() + self.patchers = dict() + exists, listdir, isfile, isdir, open = make_fake_fstools(realistic_fs) + for ppoint, fn in { + 'os.listdir': listdir, + 'os.path.isdir': isdir, + 'teuthology.describe_tests.open': open, + 'builtins.open': open, + 'os.path.exists': exists, + 'os.listdir': listdir, + 'os.path.isfile': isfile, + }.items(): + mockobj = MagicMock() + patcher = patch(ppoint, mockobj) + mockobj.side_effect = fn + patcher.start() + self.mocks[ppoint] = mockobj + self.patchers[ppoint] = patcher + + def stop_patchers(self): + for patcher in self.patchers.values(): + patcher.stop() + + def teardown_method(self): + self.stop_patchers() + + @staticmethod + def assert_expected_combo_headers(headers): + assert headers == (['subsuite depth 0'] + + sorted(set(filter(bool, expected_facets)))) + + def test_no_filters(self): + rows = tree_with_info('basic', [], False, '', []) + assert rows == [[x] for x in expected_tree] + + def test_single_filter(self): + rows = tree_with_info('basic', ['desc'], False, '', []) + assert rows == [list(_) for _ in zip(expected_tree, expected_desc)] + + rows = tree_with_info('basic', ['rbd_features'], False, '', []) + assert rows == [list(_) for _ in zip(expected_tree, expected_rbd_features)] + + def 
test_single_filter_with_facets(self): + rows = tree_with_info('basic', ['desc'], True, '', []) + assert rows == [list(_) for _ in zip(expected_tree, expected_facets, + expected_desc)] + + rows = tree_with_info('basic', ['rbd_features'], True, '', []) + assert rows == [list(_) for _ in zip(expected_tree, expected_facets, + expected_rbd_features)] + + def test_no_matching(self): + rows = tree_with_info('basic', ['extra'], False, '', []) + assert rows == [list(_) for _ in zip(expected_tree, [''] * len(expected_tree))] + + rows = tree_with_info('basic', ['extra'], True, '', []) + assert rows == [list(_) for _ in zip(expected_tree, expected_facets, + [''] * len(expected_tree))] + + def test_multiple_filters(self): + rows = tree_with_info('basic', ['desc', 'rbd_features'], False, '', []) + assert rows == [list(_) for _ in zip(expected_tree, + expected_desc, + expected_rbd_features)] + + rows = tree_with_info('basic', ['rbd_features', 'desc'], False, '', []) + assert rows == [list(_) for _ in zip(expected_tree, + expected_rbd_features, + expected_desc)] + + def test_multiple_filters_with_facets(self): + rows = tree_with_info('basic', ['desc', 'rbd_features'], True, '', []) + assert rows == [list(_) for _ in zip(expected_tree, + expected_facets, + expected_desc, + expected_rbd_features)] + + rows = tree_with_info('basic', ['rbd_features', 'desc'], True, '', []) + assert rows == [list(_) for _ in zip(expected_tree, + expected_facets, + expected_rbd_features, + expected_desc)] + + def test_combinations_only_facets(self): + headers, rows = get_combinations('basic', + fields=[], subset=None, limit=1, + filter_in=None, filter_out=None, filter_all=None, + include_facet=True) + self.assert_expected_combo_headers(headers) + assert rows == [['basic', 'install', 'fixed-1', 'rbd_api_tests']] + + def test_combinations_desc_features(self): + headers, rows = get_combinations('basic', + fields=['desc', 'rbd_features'], subset=None, limit=1, + filter_in=None, filter_out=None, 
filter_all=None, + include_facet=False) + assert headers == ['desc', 'rbd_features'] + descriptions = '\n'.join([ + 'install ceph', + 'single node cluster', + 'c/c++ librbd api tests with default settings', + ]) + assert rows == [[descriptions, 'default']] + + def test_combinations_filter_in(self): + headers, rows = get_combinations('basic', + fields=[], subset=None, limit=0, + filter_in=['old_format'], filter_out=None, filter_all=None, + include_facet=True) + self.assert_expected_combo_headers(headers) + assert rows == [ + ['basic', 'install', 'fixed-1', 'rbd_api_tests_old_format'], + ['basic', 'install', 'fixed-2', 'rbd_api_tests_old_format'], + ['basic', 'install', 'fixed-3', 'rbd_api_tests_old_format'], + ] + + def test_combinations_filter_out(self): + headers, rows = get_combinations('basic', + fields=[], subset=None, limit=0, + filter_in=None, filter_out=['old_format'], filter_all=None, + include_facet=True) + self.assert_expected_combo_headers(headers) + assert rows == [ + ['basic', 'install', 'fixed-1', 'rbd_api_tests'], + ['basic', 'install', 'fixed-2', 'rbd_api_tests'], + ['basic', 'install', 'fixed-3', 'rbd_api_tests'], + ] + + def test_combinations_filter_all(self): + headers, rows = get_combinations('basic', + fields=[], subset=None, limit=0, + filter_in=None, filter_out=None, + filter_all=['fixed-2', 'old_format'], + include_facet=True) + self.assert_expected_combo_headers(headers) + assert rows == [ + ['basic', 'install', 'fixed-2', 'rbd_api_tests_old_format'] + ] + + +@patch('teuthology.describe_tests.open') +@patch('os.path.isdir') +def test_extract_info_dir(m_isdir, m_open): + simple_fs = {'a': {'b.yaml': 'meta: [{foo: c}]'}} + _, _, _, m_isdir.side_effect, m_open.side_effect = \ + make_fake_fstools(simple_fs) + info = extract_info('a', []) + assert info == {} + + info = extract_info('a', ['foo', 'bar']) + assert info == {'foo': '', 'bar': ''} + + info = extract_info('a/b.yaml', ['foo', 'bar']) + assert info == {'foo': 'c', 'bar': ''} + + 
+@patch('teuthology.describe_tests.open') +@patch('os.path.isdir') +def check_parse_error(fs, m_isdir, m_open): + _, _, _, m_isdir.side_effect, m_open.side_effect = make_fake_fstools(fs) + with pytest.raises(ParseError): + a = extract_info('a.yaml', ['a']) + raise Exception(str(a)) + + +def test_extract_info_too_many_elements(): + check_parse_error({'a.yaml': 'meta: [{a: b}, {b: c}]'}) + + +def test_extract_info_not_a_list(): + check_parse_error({'a.yaml': 'meta: {a: b}'}) + + +def test_extract_info_not_a_dict(): + check_parse_error({'a.yaml': 'meta: [[a, b]]'}) + + +@patch('teuthology.describe_tests.open') +@patch('os.path.isdir') +def test_extract_info_empty_file(m_isdir, m_open): + simple_fs = {'a.yaml': ''} + _, _, _, m_isdir.side_effect, m_open.side_effect = \ + make_fake_fstools(simple_fs) + info = extract_info('a.yaml', []) + assert info == {} diff --git a/teuthology/test/test_email_sleep_before_teardown.py b/teuthology/test/test_email_sleep_before_teardown.py new file mode 100644 index 000000000..60fcd245c --- /dev/null +++ b/teuthology/test/test_email_sleep_before_teardown.py @@ -0,0 +1,81 @@ +from humanfriendly import format_timespan +from mock import Mock, patch +from pytest import mark +from teuthology.config import config +from teuthology.run_tasks import build_email_body as email_body +from textwrap import dedent + +class TestSleepBeforeTeardownEmail(object): + def setup_method(self): + config.results_ui_server = "http://example.com/" + config.archive_server = "http://qa-proxy.ceph.com/teuthology/" + + @mark.parametrize( + ['status', 'owner', 'suite_name', 'run_name', 'job_id', 'dura'], + [ + [ + 'pass', + 'noreply@host', + 'dummy', + 'run-name', + 123, + 3600, + ], + [ + 'fail', + 'noname', + 'yummy', + 'next-run', + 1000, + 99999, + ], + ] + ) + @patch("teuthology.run_tasks.time.time") + def test_sleep_before_teardown_email_body(self, m_time, status, owner, + suite_name, run_name, job_id, dura): + ctx = Mock() + archive_path='archive/path' + 
archive_dir='/archive/dir' + date_sec=3661 + date_str='1970-01-01 01:01:01' + m_time.return_value=float(date_sec) + duration_sec=dura + duration_str=format_timespan(duration_sec) + ref_body=dedent(""" + Teuthology job {run}/{job} has fallen asleep at {date} for {duration_str} + + Owner: {owner} + Suite Name: {suite} + Sleep Date: {date} + Sleep Time: {duration_sec} seconds ({duration_str}) + Job Info: http://example.com/{run}/ + Job Logs: http://qa-proxy.ceph.com/teuthology/path/{job}/ + Task Stack: a/b/c + Current Status: {status}""" + .format(duration_sec=duration_sec, duration_str=duration_str, + owner=owner, suite=suite_name, run=run_name, + job=job_id, status=status, date=date_str)) + print(ref_body) + ctx.config = dict( + archive_path=archive_path, + job_id=job_id, + suite=suite_name, + ) + if status == 'pass': + ctx.summary = dict( + success=True, + ) + elif status == 'fail': + ctx.summary = dict( + success=False, + ) + else: + ctx.summary = dict() + + ctx.owner = owner + ctx.name = run_name + ctx.archive_dir = archive_dir + tasks = [('a', None), ('b', None), ('c', None)] + (subj, body) = email_body(ctx, tasks, dura) + assert body == ref_body.lstrip('\n') diff --git a/teuthology/test/test_exit.py b/teuthology/test/test_exit.py new file mode 100644 index 000000000..e8c22bf20 --- /dev/null +++ b/teuthology/test/test_exit.py @@ -0,0 +1,90 @@ +import os +import random + +from unittest.mock import patch, Mock + +from teuthology import exit + + +class TestExiter(object): + klass = exit.Exiter + + def setup_method(self): + self.pid = os.getpid() + + # Below, we patch os.kill() in such a way that the first time it is + # invoked it does actually send the signal. Any subsequent invocation + # won't send any signal - this is so we don't kill the process running + # our unit tests! 
+ self.patcher_kill = patch( + 'teuthology.exit.os.kill', + wraps=os.kill, + ) + + #Keep a copy of the unpatched kill and call this in place of os.kill + #In the Exiter objects, the os.kill calls are patched. + #So the call_count should be 1. + self.kill_unpatched = os.kill + self.m_kill = self.patcher_kill.start() + + def m_kill_unwrap(pid, sig): + # Setting return_value of a mocked object disables the wrapping + if self.m_kill.call_count > 1: + self.m_kill.return_value = None + + self.m_kill.side_effect = m_kill_unwrap + + def teardown_method(self): + self.patcher_kill.stop() + del self.m_kill + + def test_basic(self): + sig = 15 + obj = self.klass() + m_func = Mock() + obj.add_handler(sig, m_func) + assert len(obj.handlers) == 1 + self.kill_unpatched(self.pid, sig) + assert m_func.call_count == 1 + assert self.m_kill.call_count == 1 + for arg_list in self.m_kill.call_args_list: + assert arg_list[0] == (self.pid, sig) + + def test_remove_handlers(self): + sig = [1, 15] + send_sig = random.choice(sig) + n = 3 + obj = self.klass() + handlers = list() + for i in range(n): + m_func = Mock(name="handler %s" % i) + handlers.append(obj.add_handler(sig, m_func)) + assert obj.handlers == handlers + for handler in handlers: + handler.remove() + assert obj.handlers == list() + self.kill_unpatched(self.pid, send_sig) + assert self.m_kill.call_count == 1 + for handler in handlers: + assert handler.func.call_count == 0 + + def test_n_handlers(self, n=10, sig=11): + if isinstance(sig, int): + send_sig = sig + else: + send_sig = random.choice(sig) + obj = self.klass() + handlers = list() + for i in range(n): + m_func = Mock(name="handler %s" % i) + handlers.append(obj.add_handler(sig, m_func)) + assert obj.handlers == handlers + self.kill_unpatched(self.pid, send_sig) + for i in range(n): + assert handlers[i].func.call_count == 1 + assert self.m_kill.call_count == 1 + for arg_list in self.m_kill.call_args_list: + assert arg_list[0] == (self.pid, send_sig) + + def 
test_multiple_signals(self): + self.test_n_handlers(n=3, sig=[1, 6, 11, 15]) diff --git a/teuthology/test/test_get_distro.py b/teuthology/test/test_get_distro.py new file mode 100644 index 000000000..b03ba7b63 --- /dev/null +++ b/teuthology/test/test_get_distro.py @@ -0,0 +1,47 @@ +from teuthology.misc import get_distro + + +class Mock: + pass + + +class TestGetDistro(object): + + def setup_method(self): + self.fake_ctx = Mock() + self.fake_ctx.config = {} + # os_type in ctx will always default to None + self.fake_ctx.os_type = None + + def test_default_distro(self): + distro = get_distro(self.fake_ctx) + assert distro == 'ubuntu' + + def test_argument(self): + # we don't want fake_ctx to have a config + self.fake_ctx = Mock() + self.fake_ctx.os_type = 'centos' + distro = get_distro(self.fake_ctx) + assert distro == 'centos' + + def test_teuth_config(self): + self.fake_ctx.config = {'os_type': 'fedora'} + distro = get_distro(self.fake_ctx) + assert distro == 'fedora' + + def test_argument_takes_precedence(self): + self.fake_ctx.config = {'os_type': 'fedora'} + self.fake_ctx.os_type = "centos" + distro = get_distro(self.fake_ctx) + assert distro == 'centos' + + def test_no_config_or_os_type(self): + self.fake_ctx = Mock() + self.fake_ctx.os_type = None + distro = get_distro(self.fake_ctx) + assert distro == 'ubuntu' + + def test_config_os_type_is_none(self): + self.fake_ctx.config["os_type"] = None + distro = get_distro(self.fake_ctx) + assert distro == 'ubuntu' diff --git a/teuthology/test/test_get_distro_version.py b/teuthology/test/test_get_distro_version.py new file mode 100644 index 000000000..8a77e39be --- /dev/null +++ b/teuthology/test/test_get_distro_version.py @@ -0,0 +1,47 @@ +from teuthology.misc import get_distro_version + + +class Mock: + pass + + +class TestGetDistroVersion(object): + + def setup_method(self): + self.fake_ctx = Mock() + self.fake_ctx.config = {} + self.fake_ctx_noarg = Mock() + self.fake_ctx_noarg.config = {} + 
self.fake_ctx_noarg.os_version = None + self.fake_ctx.os_type = None + self.fake_ctx_noarg.os_type = None + + def test_default_distro_version(self): + # Default distro is ubuntu, default version of ubuntu is 22.04 + self.fake_ctx.os_version = None + distroversion = get_distro_version(self.fake_ctx) + assert distroversion == '22.04' + + def test_argument_version(self): + self.fake_ctx.os_version = '13.04' + distroversion = get_distro_version(self.fake_ctx) + assert distroversion == '13.04' + + def test_teuth_config_version(self): + # Argument takes precedence. + self.fake_ctx.os_version = '13.04' + self.fake_ctx.config = {'os_version': '13.10'} + distroversion = get_distro_version(self.fake_ctx) + assert distroversion == '13.04' + + def test_teuth_config_noarg_version(self): + self.fake_ctx_noarg.config = {'os_version': '13.04'} + distroversion = get_distro_version(self.fake_ctx_noarg) + assert distroversion == '13.04' + + def test_no_teuth_config(self): + self.fake_ctx = Mock() + self.fake_ctx.os_type = None + self.fake_ctx.os_version = '13.04' + distroversion = get_distro_version(self.fake_ctx) + assert distroversion == '13.04' diff --git a/teuthology/test/test_get_multi_machine_types.py b/teuthology/test/test_get_multi_machine_types.py new file mode 100644 index 000000000..32a6b0263 --- /dev/null +++ b/teuthology/test/test_get_multi_machine_types.py @@ -0,0 +1,27 @@ +from teuthology import misc as teuthology + +class Mock: pass + +class TestGetMultiMachineTypes(object): + + def test_space(self): + give = 'burnupi plana vps' + expect = ['burnupi','plana','vps'] + assert teuthology.get_multi_machine_types(give) == expect + + def test_tab(self): + give = 'burnupi plana vps' + expect = ['burnupi','plana','vps'] + assert teuthology.get_multi_machine_types(give) == expect + + def test_comma(self): + give = 'burnupi,plana,vps' + expect = ['burnupi','plana','vps'] + assert teuthology.get_multi_machine_types(give) == expect + + def test_single(self): + give = 'burnupi' + 
expect = ['burnupi'] + assert teuthology.get_multi_machine_types(give) == expect + + diff --git a/teuthology/test/test_imports.py b/teuthology/test/test_imports.py new file mode 100644 index 000000000..cb25c8d0a --- /dev/null +++ b/teuthology/test/test_imports.py @@ -0,0 +1,31 @@ +import importlib +import pytest +import sys + +from pathlib import Path +from typing import List + +root = Path("./teuthology") + + +def find_modules() -> List[str]: + modules = [] + for path in root.rglob("*.py"): + if path.name.startswith("test_"): + continue + if "-" in path.name: + continue + if path.name == "__init__.py": + path = path.parent + + path_name = str(path).replace("/", ".") + if path_name.endswith(".py"): + path_name = path_name[:-3] + modules.append(path_name) + return sorted(modules) + + +@pytest.mark.parametrize("module", find_modules()) +def test_import_modules(module): + importlib.import_module(module) + assert module in sys.modules diff --git a/teuthology/test/test_job_status.py b/teuthology/test/test_job_status.py new file mode 100644 index 000000000..ee1b764e4 --- /dev/null +++ b/teuthology/test/test_job_status.py @@ -0,0 +1,60 @@ +from teuthology import job_status + + +class TestJobStatus(object): + def test_get_only_success_true(self): + summary = dict(success=True) + status = job_status.get_status(summary) + assert status == 'pass' + + def test_get_only_success_false(self): + summary = dict(success=False) + status = job_status.get_status(summary) + assert status == 'fail' + + def test_get_status_pass(self): + summary = dict(status='pass') + status = job_status.get_status(summary) + assert status == 'pass' + + def test_get_status_fail(self): + summary = dict(status='fail') + status = job_status.get_status(summary) + assert status == 'fail' + + def test_get_status_dead(self): + summary = dict(status='dead') + status = job_status.get_status(summary) + assert status == 'dead' + + def test_get_status_none(self): + summary = dict() + status = 
job_status.get_status(summary) + assert status is None + + def test_set_status_pass(self): + summary = dict() + job_status.set_status(summary, 'pass') + assert summary == dict(status='pass', success=True) + + def test_set_status_dead(self): + summary = dict() + job_status.set_status(summary, 'dead') + assert summary == dict(status='dead', success=False) + + def test_set_then_get_status_dead(self): + summary = dict() + job_status.set_status(summary, 'dead') + status = job_status.get_status(summary) + assert status == 'dead' + + def test_set_status_none(self): + summary = dict() + job_status.set_status(summary, None) + assert summary == dict() + + def test_legacy_fail(self): + summary = dict(success=True) + summary['success'] = False + status = job_status.get_status(summary) + assert status == 'fail' diff --git a/teuthology/test/test_kill.py b/teuthology/test/test_kill.py new file mode 100644 index 000000000..21ec99718 --- /dev/null +++ b/teuthology/test/test_kill.py @@ -0,0 +1,46 @@ +from unittest.mock import patch + +from teuthology.kill import find_targets + + +class TestFindTargets(object): + """ Tests for teuthology.kill.find_targets """ + + @patch('teuthology.kill.report.ResultsReporter.get_jobs') + def test_missing_run_find_targets(self, m_get_jobs): + m_get_jobs.return_value = [] + run_targets = find_targets("run-name") + assert run_targets == {} + + @patch('teuthology.kill.report.ResultsReporter.get_jobs') + def test_missing_job_find_targets(self, m_get_jobs): + m_get_jobs.return_value = {} + job_targets = find_targets("run-name", "3") + assert job_targets == {} + + @patch('teuthology.kill.report.ResultsReporter.get_jobs') + def test_missing_run_targets_find_targets(self, m_get_jobs): + m_get_jobs.return_value = [{"targets": None, "status": "waiting"}] + run_targets = find_targets("run-name") + assert run_targets == {} + + @patch('teuthology.kill.report.ResultsReporter.get_jobs') + def test_missing_job_targets_find_targets(self, m_get_jobs): + 
m_get_jobs.return_value = {"targets": None} + job_targets = find_targets("run-name", "3") + assert job_targets == {} + + @patch('teuthology.kill.report.ResultsReporter.get_jobs') + def test_run_find_targets(self, m_get_jobs): + m_get_jobs.return_value = [{"targets": {"node1": ""}, "status": "running"}] + run_targets = find_targets("run-name") + assert run_targets == {"node1": ""} + m_get_jobs.return_value = [{"targets": {"node1": ""}}] + run_targets = find_targets("run-name") + assert run_targets == {} + + @patch('teuthology.kill.report.ResultsReporter.get_jobs') + def test_job_find_targets(self, m_get_jobs): + m_get_jobs.return_value = {"targets": {"node1": ""}} + job_targets = find_targets("run-name", "3") + assert job_targets == {"node1": ""} diff --git a/teuthology/test/test_ls.py b/teuthology/test/test_ls.py new file mode 100644 index 000000000..631dcfd46 --- /dev/null +++ b/teuthology/test/test_ls.py @@ -0,0 +1,48 @@ +import pytest + +from unittest.mock import patch, Mock + +from teuthology import ls + + +class TestLs(object): + """ Tests for teuthology.ls """ + + @patch('os.path.isdir') + @patch('os.listdir') + def test_get_jobs(self, m_listdir, m_isdir): + m_listdir.return_value = ["1", "a", "3"] + m_isdir.return_value = True + results = ls.get_jobs("some/archive/dir") + assert results == ["1", "3"] + + @patch("yaml.safe_load_all") + @patch("teuthology.ls.get_jobs") + def test_ls(self, m_get_jobs, m_safe_load_all): + m_get_jobs.return_value = ["1", "2"] + m_safe_load_all.return_value = [{"failure_reason": "reasons"}] + ls.ls("some/archive/div", True) + + @patch("teuthology.ls.open") + @patch("teuthology.ls.get_jobs") + def test_ls_ioerror(self, m_get_jobs, m_open): + m_get_jobs.return_value = ["1", "2"] + m_open.side_effect = IOError() + with pytest.raises(IOError): + ls.ls("some/archive/dir", True) + + @patch("teuthology.ls.open") + @patch("os.popen") + @patch("os.path.isdir") + @patch("os.path.isfile") + def test_print_debug_info(self, m_isfile, m_isdir, 
m_popen, m_open): + m_isfile.return_value = True + m_isdir.return_value = True + m_popen.return_value = Mock() + cmdline = Mock() + cmdline.find = Mock(return_value=0) + m1 = Mock() + m2 = Mock() + m2.read = Mock(return_value=cmdline) + m_open.side_effect = [m1, m2] + ls.print_debug_info("the_job", "job/dir", "some/archive/dir") diff --git a/teuthology/test/test_misc.py b/teuthology/test/test_misc.py new file mode 100644 index 000000000..ca7533643 --- /dev/null +++ b/teuthology/test/test_misc.py @@ -0,0 +1,387 @@ +import argparse +import pytest +import subprocess + +from unittest.mock import Mock, patch + +from teuthology import misc +from teuthology.config import config +from teuthology.orchestra import cluster +from teuthology.orchestra.remote import Remote + + +class FakeRemote(object): + pass + + +def test_sh_normal(caplog): + assert misc.sh("/bin/echo ABC") == "ABC\n" + assert "truncated" not in caplog.text + + +def test_sh_truncate(caplog): + assert misc.sh("/bin/echo -n AB ; /bin/echo C", 2) == "ABC\n" + assert "truncated" in caplog.text + assert "ABC" not in caplog.text + + +def test_sh_fail(caplog): + with pytest.raises(subprocess.CalledProcessError) as excinfo: + misc.sh("/bin/echo -n AB ; /bin/echo C ; exit 111", 2) + assert excinfo.value.returncode == 111 + for record in caplog.records: + if record.levelname == 'ERROR': + assert ('replay full' in record.message or + 'ABC\n' == record.message) + +def test_sh_progress(caplog): + assert misc.sh("echo AB ; sleep 0.1 ; /bin/echo C", 2) == "AB\nC\n" + records = caplog.records + assert ':sh: ' in records[0].message + assert 'AB' == records[1].message + assert 'C' == records[2].message + assert records[2].created > records[1].created + + +def test_wait_until_osds_up(): + ctx = argparse.Namespace() + ctx.daemons = Mock() + ctx.daemons.iter_daemons_of_role.return_value = list() + remote = Mock(spec=Remote) + remote.sh.return_value = 'IGNORED\n{"osds":[{"state":["up"]}]}' + ctx.cluster = cluster.Cluster( + 
remotes=[ + (remote, ['osd.0', 'client.1']) + ], + ) + with patch.multiple( + misc, + get_testdir=lambda _: "TESTDIR", + ): + misc.wait_until_osds_up(ctx, ctx.cluster, remote) + + +def test_get_clients_simple(): + ctx = argparse.Namespace() + remote = FakeRemote() + ctx.cluster = cluster.Cluster( + remotes=[ + (remote, ['client.0', 'client.1']) + ], + ) + g = misc.get_clients(ctx=ctx, roles=['client.1']) + got = next(g) + assert len(got) == 2 + assert got[0] == ('1') + assert got[1] is remote + with pytest.raises(StopIteration): + next(g) + + +def test_get_mon_names(): + expected = [ + ([['mon.a', 'osd.0', 'mon.c']], 'ceph', ['mon.a', 'mon.c']), + ([['ceph.mon.a', 'osd.0', 'ceph.mon.c']], 'ceph', ['ceph.mon.a', 'ceph.mon.c']), + ([['mon.a', 'osd.0', 'mon.c'], ['ceph.mon.b']], 'ceph', ['mon.a', 'mon.c', 'ceph.mon.b']), + ([['mon.a', 'osd.0', 'mon.c'], ['foo.mon.a']], 'ceph', ['mon.a', 'mon.c']), + ([['mon.a', 'osd.0', 'mon.c'], ['foo.mon.a']], 'foo', ['foo.mon.a']), + ] + for remote_roles, cluster_name, expected_mons in expected: + ctx = argparse.Namespace() + ctx.cluster = Mock() + ctx.cluster.remotes = {i: roles for i, roles in enumerate(remote_roles)} + mons = misc.get_mon_names(ctx, cluster_name) + assert expected_mons == mons + + +def test_get_first_mon(): + expected = [ + ([['mon.a', 'osd.0', 'mon.c']], 'ceph', 'mon.a'), + ([['ceph.mon.a', 'osd.0', 'ceph.mon.c']], 'ceph', 'ceph.mon.a'), + ([['mon.a', 'osd.0', 'mon.c'], ['ceph.mon.b']], 'ceph', 'ceph.mon.b'), + ([['mon.a', 'osd.0', 'mon.c'], ['foo.mon.a']], 'ceph', 'mon.a'), + ([['foo.mon.b', 'osd.0', 'mon.c'], ['foo.mon.a']], 'foo', 'foo.mon.a'), + ] + for remote_roles, cluster_name, expected_mon in expected: + ctx = argparse.Namespace() + ctx.cluster = Mock() + ctx.cluster.remotes = {i: roles for i, roles in enumerate(remote_roles)} + mon = misc.get_first_mon(ctx, None, cluster_name) + assert expected_mon == mon + + +def test_roles_of_type(): + expected = [ + (['client.0', 'osd.0', 'ceph.osd.1'], 'osd', ['0', 
'1']), + (['client.0', 'osd.0', 'ceph.osd.1'], 'client', ['0']), + (['foo.client.1', 'bar.client.2.3', 'baz.osd.1'], 'mon', []), + (['foo.client.1', 'bar.client.2.3', 'baz.osd.1'], 'client', + ['1', '2.3']), + ] + for roles_for_host, type_, expected_ids in expected: + ids = list(misc.roles_of_type(roles_for_host, type_)) + assert ids == expected_ids + + +def test_cluster_roles_of_type(): + expected = [ + (['client.0', 'osd.0', 'ceph.osd.1'], 'osd', 'ceph', + ['osd.0', 'ceph.osd.1']), + (['client.0', 'osd.0', 'ceph.osd.1'], 'client', 'ceph', + ['client.0']), + (['foo.client.1', 'bar.client.2.3', 'baz.osd.1'], 'mon', None, []), + (['foo.client.1', 'bar.client.2.3', 'baz.osd.1'], 'client', None, + ['foo.client.1', 'bar.client.2.3']), + (['foo.client.1', 'bar.client.2.3', 'baz.osd.1'], 'client', 'bar', + ['bar.client.2.3']), + ] + for roles_for_host, type_, cluster_, expected_roles in expected: + roles = list(misc.cluster_roles_of_type(roles_for_host, type_, cluster_)) + assert roles == expected_roles + + +def test_all_roles_of_type(): + expected = [ + ([['client.0', 'osd.0', 'ceph.osd.1'], ['bar.osd.2']], + 'osd', ['0', '1', '2']), + ([['client.0', 'osd.0', 'ceph.osd.1'], ['bar.osd.2', 'baz.client.1']], + 'client', ['0', '1']), + ([['foo.client.1', 'bar.client.2.3'], ['baz.osd.1']], 'mon', []), + ([['foo.client.1', 'bar.client.2.3'], ['baz.osd.1', 'ceph.client.bar']], + 'client', ['1', '2.3', 'bar']), + ] + for host_roles, type_, expected_ids in expected: + cluster_ = Mock() + cluster_.remotes = dict(enumerate(host_roles)) + ids = list(misc.all_roles_of_type(cluster_, type_)) + assert ids == expected_ids + + +def test_get_http_log_path(): + # Fake configuration + archive_server = "http://example.com/server_root" + config.archive_server = archive_server + archive_dir = "/var/www/archives" + + path = misc.get_http_log_path(archive_dir) + assert path == "http://example.com/server_root/archives/" + + job_id = '12345' + path = misc.get_http_log_path(archive_dir, job_id) + 
assert path == "http://example.com/server_root/archives/12345/" + + # Inktank configuration + archive_server = "http://qa-proxy.ceph.com/teuthology/" + config.archive_server = archive_server + archive_dir = "/var/lib/teuthworker/archive/teuthology-2013-09-12_11:49:50-ceph-deploy-main-testing-basic-vps" + job_id = 31087 + path = misc.get_http_log_path(archive_dir, job_id) + assert path == "http://qa-proxy.ceph.com/teuthology/teuthology-2013-09-12_11:49:50-ceph-deploy-main-testing-basic-vps/31087/" + + path = misc.get_http_log_path(archive_dir) + assert path == "http://qa-proxy.ceph.com/teuthology/teuthology-2013-09-12_11:49:50-ceph-deploy-main-testing-basic-vps/" + + +def test_is_type(): + is_client = misc.is_type('client') + assert is_client('client.0') + assert is_client('ceph.client.0') + assert is_client('foo.client.0') + assert is_client('foo.client.bar.baz') + + with pytest.raises(ValueError): + is_client('') + is_client('client') + assert not is_client('foo.bar.baz') + assert not is_client('ceph.client') + assert not is_client('hadoop.main.0') + + +def test_is_type_in_cluster(): + is_c1_osd = misc.is_type('osd', 'c1') + with pytest.raises(ValueError): + is_c1_osd('') + assert not is_c1_osd('osd.0') + assert not is_c1_osd('ceph.osd.0') + assert not is_c1_osd('ceph.osd.0') + assert not is_c1_osd('c11.osd.0') + assert is_c1_osd('c1.osd.0') + assert is_c1_osd('c1.osd.999') + + +def test_get_mons(): + ips = ['1.1.1.1', '2.2.2.2', '3.3.3.3'] + addrs = ['1.1.1.1:6789', '1.1.1.1:6790', '1.1.1.1:6791'] + + mons = misc.get_mons([['mon.a']], ips) + assert mons == {'mon.a': addrs[0]} + + mons = misc.get_mons([['cluster-a.mon.foo', 'client.b'], ['osd.0']], ips) + assert mons == {'cluster-a.mon.foo': addrs[0]} + + mons = misc.get_mons([['mon.a', 'mon.b', 'ceph.mon.c']], ips) + assert mons == {'mon.a': addrs[0], + 'mon.b': addrs[1], + 'ceph.mon.c': addrs[2]} + + mons = misc.get_mons([['mon.a'], ['mon.b'], ['ceph.mon.c']], ips) + assert mons == {'mon.a': addrs[0], + 'mon.b': 
ips[1] + ':6789', + 'ceph.mon.c': ips[2] + ':6789'} + + +def test_split_role(): + expected = { + 'client.0': ('ceph', 'client', '0'), + 'foo.client.0': ('foo', 'client', '0'), + 'bar.baz.x.y.z': ('bar', 'baz', 'x.y.z'), + 'mds.a-s-b': ('ceph', 'mds', 'a-s-b'), + } + + for role, expected_split in expected.items(): + actual_split = misc.split_role(role) + assert actual_split == expected_split + +def test_update_key(): + a = { "sha": "foo", "workunit": { "sha": "foo" }, "tasks": [{"task1": "ceph"}], "overrides": [{"sha": "foo"}] } + b = { "sha": "blah", "workunit": { "sha": "bar" }, "tasks": [] } + + misc.update_key("sha", a, b) + assert a == { "sha": "blah", "workunit": { "sha": "bar" }, "tasks": [{"task1": "ceph"}], "overrides": [{"sha": "foo"}] } + +class TestHostnames(object): + def setup_method(self): + config._conf = dict() + + def teardown_method(self): + config.load() + + def test_canonicalize_hostname(self): + host_base = 'box1' + result = misc.canonicalize_hostname(host_base) + assert result == 'ubuntu@box1.front.sepia.ceph.com' + + def test_decanonicalize_hostname(self): + host = 'ubuntu@box1.front.sepia.ceph.com' + result = misc.decanonicalize_hostname(host) + assert result == 'box1' + + def test_canonicalize_hostname_nouser(self): + host_base = 'box1' + result = misc.canonicalize_hostname(host_base, user=None) + assert result == 'box1.front.sepia.ceph.com' + + def test_decanonicalize_hostname_nouser(self): + host = 'box1.front.sepia.ceph.com' + result = misc.decanonicalize_hostname(host) + assert result == 'box1' + + def test_canonicalize_hostname_otherlab(self): + config.lab_domain = 'example.com' + host_base = 'box1' + result = misc.canonicalize_hostname(host_base) + assert result == 'ubuntu@box1.example.com' + + def test_decanonicalize_hostname_otherlab(self): + config.lab_domain = 'example.com' + host = 'ubuntu@box1.example.com' + result = misc.decanonicalize_hostname(host) + assert result == 'box1' + + def test_canonicalize_hostname_nodomain(self): + 
config.lab_domain = '' + host = 'box2' + result = misc.canonicalize_hostname(host) + assert result == 'ubuntu@' + host + + def test_decanonicalize_hostname_nodomain(self): + config.lab_domain = '' + host = 'ubuntu@box2' + result = misc.decanonicalize_hostname(host) + assert result == 'box2' + + def test_canonicalize_hostname_full_other_user(self): + config.lab_domain = 'example.com' + host = 'user1@box1.example.come' + result = misc.canonicalize_hostname(host) + assert result == 'user1@box1.example.com' + + def test_decanonicalize_hostname_full_other_user(self): + config.lab_domain = 'example.com' + host = 'user1@box1.example.come' + result = misc.decanonicalize_hostname(host) + assert result == 'box1' + +class TestMergeConfigs(object): + """ Tests merge_config and deep_merge in teuthology.misc """ + + @patch("os.path.exists") + @patch("yaml.safe_load") + @patch("teuthology.misc.open") + def test_merge_configs(self, m_open, m_safe_load, m_exists): + """ Only tests with one yaml file being passed, mainly just to test + the loop logic. The actual merge will be tested in subsequent + tests. 
+ """ + expected = {"a": "b", "b": "c"} + m_exists.return_value = True + m_safe_load.return_value = expected + result = misc.merge_configs(["path/to/config1"]) + assert result == expected + m_open.assert_called_once_with("path/to/config1") + + def test_merge_configs_empty(self): + assert misc.merge_configs([]) == {} + + def test_deep_merge(self): + a = {"a": "b"} + b = {"b": "c"} + result = misc.deep_merge(a, b) + assert result == {"a": "b", "b": "c"} + + def test_overwrite_deep_merge(self): + a = {"a": "b"} + b = {"a": "overwritten", "b": "c"} + result = misc.deep_merge(a, b) + assert result == {"a": "overwritten", "b": "c"} + + def test_list_deep_merge(self): + a = [1, 2] + b = [3, 4] + result = misc.deep_merge(a, b) + assert result == [1, 2, 3, 4] + + def test_missing_list_deep_merge(self): + a = [1, 2] + b = "not a list" + with pytest.raises(AssertionError): + misc.deep_merge(a, b) + + def test_missing_a_deep_merge(self): + result = misc.deep_merge(None, [1, 2]) + assert result == [1, 2] + + def test_missing_b_deep_merge(self): + result = misc.deep_merge([1, 2], None) + assert result == [1, 2] + + def test_invalid_b_deep_merge(self): + with pytest.raises(AssertionError): + misc.deep_merge({"a": "b"}, "invalid") + + +class TestIsInDict(object): + def test_simple_membership(self): + assert misc.is_in_dict('a', 'foo', {'a':'foo', 'b':'bar'}) + + def test_dict_membership(self): + assert misc.is_in_dict( + 'a', {'sub1':'key1', 'sub2':'key2'}, + {'a':{'sub1':'key1', 'sub2':'key2', 'sub3':'key3'}} + ) + + def test_simple_nonmembership(self): + assert not misc.is_in_dict('a', 'foo', {'a':'bar', 'b':'foo'}) + + def test_nonmembership_with_presence_at_lower_level(self): + assert not misc.is_in_dict('a', 'foo', {'a':{'a': 'foo'}}) diff --git a/teuthology/test/test_packaging.py b/teuthology/test/test_packaging.py new file mode 100644 index 000000000..265b2f8cf --- /dev/null +++ b/teuthology/test/test_packaging.py @@ -0,0 +1,794 @@ +import pytest + +from unittest.mock 
import patch, Mock + +from teuthology import packaging +from teuthology.exceptions import VersionNotFoundError + +KOJI_TASK_RPMS_MATRIX = [ + ('tasks/6745/9666745/kernel-4.1.0-0.rc2.git2.1.fc23.x86_64.rpm', 'kernel'), + ('tasks/6745/9666745/kernel-modules-4.1.0-0.rc2.git2.1.fc23.x86_64.rpm', 'kernel-modules'), + ('tasks/6745/9666745/kernel-tools-4.1.0-0.rc2.git2.1.fc23.x86_64.rpm', 'kernel-tools'), + ('tasks/6745/9666745/kernel-tools-libs-devel-4.1.0-0.rc2.git2.1.fc23.x86_64.rpm', 'kernel-tools-libs-devel'), + ('tasks/6745/9666745/kernel-headers-4.1.0-0.rc2.git2.1.fc23.x86_64.rpm', 'kernel-headers'), + ('tasks/6745/9666745/kernel-tools-debuginfo-4.1.0-0.rc2.git2.1.fc23.x86_64.rpm', 'kernel-tools-debuginfo'), + ('tasks/6745/9666745/kernel-debuginfo-common-x86_64-4.1.0-0.rc2.git2.1.fc23.x86_64.rpm', 'kernel-debuginfo-common-x86_64'), + ('tasks/6745/9666745/perf-debuginfo-4.1.0-0.rc2.git2.1.fc23.x86_64.rpm', 'perf-debuginfo'), + ('tasks/6745/9666745/kernel-modules-extra-4.1.0-0.rc2.git2.1.fc23.x86_64.rpm', 'kernel-modules-extra'), + ('tasks/6745/9666745/kernel-tools-libs-4.1.0-0.rc2.git2.1.fc23.x86_64.rpm', 'kernel-tools-libs'), + ('tasks/6745/9666745/kernel-core-4.1.0-0.rc2.git2.1.fc23.x86_64.rpm', 'kernel-core'), + ('tasks/6745/9666745/kernel-debuginfo-4.1.0-0.rc2.git2.1.fc23.x86_64.rpm', 'kernel-debuginfo'), + ('tasks/6745/9666745/python-perf-4.1.0-0.rc2.git2.1.fc23.x86_64.rpm', 'python-perf'), + ('tasks/6745/9666745/kernel-devel-4.1.0-0.rc2.git2.1.fc23.x86_64.rpm', 'kernel-devel'), + ('tasks/6745/9666745/python-perf-debuginfo-4.1.0-0.rc2.git2.1.fc23.x86_64.rpm', 'python-perf-debuginfo'), + ('tasks/6745/9666745/perf-4.1.0-0.rc2.git2.1.fc23.x86_64.rpm', 'perf'), +] + +KOJI_TASK_RPMS = [rpm[0] for rpm in KOJI_TASK_RPMS_MATRIX] + + +class TestPackaging(object): + + def test_get_package_name_deb(self): + remote = Mock() + remote.os.package_type = "deb" + assert packaging.get_package_name('sqlite', remote) == "sqlite3" + + def test_get_package_name_rpm(self): + remote = 
Mock() + remote.os.package_type = "rpm" + assert packaging.get_package_name('sqlite', remote) is None + + def test_get_package_name_not_found(self): + remote = Mock() + remote.os.package_type = "rpm" + assert packaging.get_package_name('notthere', remote) is None + + def test_get_service_name_deb(self): + remote = Mock() + remote.os.package_type = "deb" + assert packaging.get_service_name('httpd', remote) == 'apache2' + + def test_get_service_name_rpm(self): + remote = Mock() + remote.os.package_type = "rpm" + assert packaging.get_service_name('httpd', remote) == 'httpd' + + def test_get_service_name_not_found(self): + remote = Mock() + remote.os.package_type = "rpm" + assert packaging.get_service_name('notthere', remote) is None + + def test_install_package_deb(self): + m_remote = Mock() + m_remote.os.package_type = "deb" + expected = [ + 'DEBIAN_FRONTEND=noninteractive', + 'sudo', + '-E', + 'apt-get', + '-y', + '--force-yes', + 'install', + 'apache2' + ] + packaging.install_package('apache2', m_remote) + m_remote.run.assert_called_with(args=expected) + + def test_install_package_rpm(self): + m_remote = Mock() + m_remote.os.package_type = "rpm" + expected = [ + 'sudo', + 'yum', + '-y', + 'install', + 'httpd' + ] + packaging.install_package('httpd', m_remote) + m_remote.run.assert_called_with(args=expected) + + def test_remove_package_deb(self): + m_remote = Mock() + m_remote.os.package_type = "deb" + expected = [ + 'DEBIAN_FRONTEND=noninteractive', + 'sudo', + '-E', + 'apt-get', + '-y', + 'purge', + 'apache2' + ] + packaging.remove_package('apache2', m_remote) + m_remote.run.assert_called_with(args=expected) + + def test_remove_package_rpm(self): + m_remote = Mock() + m_remote.os.package_type = "rpm" + expected = [ + 'sudo', + 'yum', + '-y', + 'erase', + 'httpd' + ] + packaging.remove_package('httpd', m_remote) + m_remote.run.assert_called_with(args=expected) + + def test_get_koji_package_name(self): + build_info = dict(version="3.10.0", release="123.20.1") + 
result = packaging.get_koji_package_name("kernel", build_info) + assert result == "kernel-3.10.0-123.20.1.x86_64.rpm" + + @patch("teuthology.packaging.config") + def test_get_kojiroot_base_url(self, m_config): + m_config.kojiroot_url = "http://kojiroot.com" + build_info = dict( + package_name="kernel", + version="3.10.0", + release="123.20.1", + ) + result = packaging.get_kojiroot_base_url(build_info) + expected = "http://kojiroot.com/kernel/3.10.0/123.20.1/x86_64/" + assert result == expected + + @patch("teuthology.packaging.config") + def test_get_koji_build_info_success(self, m_config): + m_config.kojihub_url = "http://kojihub.com" + m_proc = Mock() + expected = dict(foo="bar") + m_proc.exitstatus = 0 + m_proc.stdout.getvalue.return_value = str(expected) + m_remote = Mock() + m_remote.run.return_value = m_proc + result = packaging.get_koji_build_info(1, m_remote, dict()) + assert result == expected + args, kwargs = m_remote.run.call_args + expected_args = [ + 'python', '-c', + 'import koji; ' + 'hub = koji.ClientSession("http://kojihub.com"); ' + 'print(hub.getBuild(1))', + ] + assert expected_args == kwargs['args'] + + @patch("teuthology.packaging.config") + def test_get_koji_build_info_fail(self, m_config): + m_config.kojihub_url = "http://kojihub.com" + m_proc = Mock() + m_proc.exitstatus = 1 + m_remote = Mock() + m_remote.run.return_value = m_proc + m_ctx = Mock() + m_ctx.summary = dict() + with pytest.raises(RuntimeError): + packaging.get_koji_build_info(1, m_remote, m_ctx) + + @patch("teuthology.packaging.config") + def test_get_koji_task_result_success(self, m_config): + m_config.kojihub_url = "http://kojihub.com" + m_proc = Mock() + expected = dict(foo="bar") + m_proc.exitstatus = 0 + m_proc.stdout.getvalue.return_value = str(expected) + m_remote = Mock() + m_remote.run.return_value = m_proc + result = packaging.get_koji_task_result(1, m_remote, dict()) + assert result == expected + args, kwargs = m_remote.run.call_args + expected_args = [ + 'python', 
'-c', + 'import koji; ' + 'hub = koji.ClientSession("http://kojihub.com"); ' + 'print(hub.getTaskResult(1))', + ] + assert expected_args == kwargs['args'] + + @patch("teuthology.packaging.config") + def test_get_koji_task_result_fail(self, m_config): + m_config.kojihub_url = "http://kojihub.com" + m_proc = Mock() + m_proc.exitstatus = 1 + m_remote = Mock() + m_remote.run.return_value = m_proc + m_ctx = Mock() + m_ctx.summary = dict() + with pytest.raises(RuntimeError): + packaging.get_koji_task_result(1, m_remote, m_ctx) + + @patch("teuthology.packaging.config") + def test_get_koji_task_rpm_info_success(self, m_config): + m_config.koji_task_url = "http://kojihub.com/work" + expected = dict( + base_url="http://kojihub.com/work/tasks/6745/9666745/", + version="4.1.0-0.rc2.git2.1.fc23.x86_64", + rpm_name="kernel-4.1.0-0.rc2.git2.1.fc23.x86_64.rpm", + package_name="kernel", + ) + result = packaging.get_koji_task_rpm_info('kernel', KOJI_TASK_RPMS) + assert expected == result + + @patch("teuthology.packaging.config") + def test_get_koji_task_rpm_info_fail(self, m_config): + m_config.koji_task_url = "http://kojihub.com/work" + with pytest.raises(RuntimeError): + packaging.get_koji_task_rpm_info('ceph', KOJI_TASK_RPMS) + + def test_get_package_version_deb_found(self): + remote = Mock() + remote.os.package_type = "deb" + proc = Mock() + proc.exitstatus = 0 + proc.stdout.getvalue.return_value = "2.2" + remote.run.return_value = proc + result = packaging.get_package_version(remote, "apache2") + assert result == "2.2" + + def test_get_package_version_deb_command(self): + remote = Mock() + remote.os.package_type = "deb" + packaging.get_package_version(remote, "apache2") + args, kwargs = remote.run.call_args + expected_args = ['dpkg-query', '-W', '-f', '${Version}', 'apache2'] + assert expected_args == kwargs['args'] + + def test_get_package_version_rpm_found(self): + remote = Mock() + remote.os.package_type = "rpm" + proc = Mock() + proc.exitstatus = 0 + 
proc.stdout.getvalue.return_value = "2.2" + remote.run.return_value = proc + result = packaging.get_package_version(remote, "httpd") + assert result == "2.2" + + def test_get_package_version_rpm_command(self): + remote = Mock() + remote.os.package_type = "rpm" + packaging.get_package_version(remote, "httpd") + args, kwargs = remote.run.call_args + expected_args = ['rpm', '-q', 'httpd', '--qf', '%{VERSION}-%{RELEASE}'] + assert expected_args == kwargs['args'] + + def test_get_package_version_not_found(self): + remote = Mock() + remote.os.package_type = "rpm" + proc = Mock() + proc.exitstatus = 1 + proc.stdout.getvalue.return_value = "not installed" + remote.run.return_value = proc + result = packaging.get_package_version(remote, "httpd") + assert result is None + + def test_get_package_version_invalid_version(self): + # this tests the possibility that the package is not found + # but the exitstatus is still 0. Not entirely sure we'll ever + # hit this condition, but I want to test the codepath regardless + remote = Mock() + remote.os.package_type = "rpm" + proc = Mock() + proc.exitstatus = 0 + proc.stdout.getvalue.return_value = "not installed" + remote.run.return_value = proc + result = packaging.get_package_version(remote, "httpd") + assert result is None + + @pytest.mark.parametrize("input, expected", KOJI_TASK_RPMS_MATRIX) + def test_get_koji_task_result_package_name(self, input, expected): + assert packaging._get_koji_task_result_package_name(input) == expected + + @patch("requests.get") + def test_get_response_success(self, m_get): + resp = Mock() + resp.ok = True + m_get.return_value = resp + result = packaging._get_response("google.com") + assert result == resp + + @patch("requests.get") + def test_get_response_failed_wait(self, m_get): + resp = Mock() + resp.ok = False + m_get.return_value = resp + packaging._get_response("google.com", wait=True, sleep=1, tries=2) + assert m_get.call_count == 2 + + @patch("requests.get") + def 
test_get_response_failed_no_wait(self, m_get): + resp = Mock() + resp.ok = False + m_get.return_value = resp + packaging._get_response("google.com", sleep=1, tries=2) + assert m_get.call_count == 1 + + +class TestBuilderProject(object): + klass = None + + def setup_method(self): + if self.klass is None: + pytest.skip() + + def _get_remote(self, arch="x86_64", system_type="deb", distro="ubuntu", + codename="focal", version="20.04"): + rem = Mock() + rem.system_type = system_type + rem.os.name = distro + rem.os.codename = codename + rem.os.version = version + rem.arch = arch + + return rem + + def test_init_from_remote_base_url(self, expected=None): + assert expected is not None + rem = self._get_remote() + ctx = dict(foo="bar") + gp = self.klass("ceph", {}, ctx=ctx, remote=rem) + result = gp.base_url + assert result == expected + + def test_init_from_remote_base_url_debian(self, expected=None): + assert expected is not None + # remote.os.codename returns and empty string on debian + rem = self._get_remote(distro="debian", codename='', version="7.1") + ctx = dict(foo="bar") + gp = self.klass("ceph", {}, ctx=ctx, remote=rem) + result = gp.base_url + assert result == expected + + def test_init_from_config_base_url(self, expected=None): + assert expected is not None + config = dict( + os_type="ubuntu", + os_version="20.04", + sha1="sha1", + ) + gp = self.klass("ceph", config) + result = gp.base_url + print(self.m_get.call_args_list) + assert result == expected + + def test_init_from_config_branch_ref(self): + config = dict( + os_type="ubuntu", + os_version="20.04", + branch='jewel', + ) + gp = self.klass("ceph", config) + result = gp.uri_reference + expected = 'ref/jewel' + assert result == expected + + def test_init_from_config_tag_ref(self): + config = dict( + os_type="ubuntu", + os_version="20.04", + tag='v10.0.1', + ) + gp = self.klass("ceph", config) + result = gp.uri_reference + expected = 'ref/v10.0.1' + assert result == expected + + def 
test_init_from_config_tag_overrides_branch_ref(self, caplog): + config = dict( + os_type="ubuntu", + os_version="20.04", + branch='jewel', + tag='v10.0.1', + ) + gp = self.klass("ceph", config) + result = gp.uri_reference + expected = 'ref/v10.0.1' + assert result == expected + expected_log = 'More than one of ref, tag, branch, or sha1 supplied; using tag' + assert expected_log in caplog.text + return gp + + def test_init_from_config_branch_overrides_sha1(self, caplog): + config = dict( + os_type="ubuntu", + os_version="20.04", + branch='jewel', + sha1='sha1', + ) + gp = self.klass("ceph", config) + result = gp.uri_reference + expected = 'ref/jewel' + assert result == expected + expected_log = 'More than one of ref, tag, branch, or sha1 supplied; using branch' + assert expected_log in caplog.text + return gp + + REFERENCE_MATRIX = [ + ('the_ref', 'the_tag', 'the_branch', 'the_sha1', dict(ref='the_ref')), + (None, 'the_tag', 'the_branch', 'the_sha1', dict(tag='the_tag')), + (None, None, 'the_branch', 'the_sha1', dict(branch='the_branch')), + (None, None, None, 'the_sha1', dict(sha1='the_sha1')), + (None, None, 'the_branch', None, dict(branch='the_branch')), + ] + + @pytest.mark.parametrize( + "ref, tag, branch, sha1, expected", + REFERENCE_MATRIX, + ) + def test_choose_reference(self, ref, tag, branch, sha1, expected): + config = dict( + os_type='ubuntu', + os_version='18.04', + ) + if ref: + config['ref'] = ref + if tag: + config['tag'] = tag + if branch: + config['branch'] = branch + if sha1: + config['sha1'] = sha1 + gp = self.klass("ceph", config) + assert gp._choose_reference() == expected + + def test_get_package_version_found(self): + rem = self._get_remote() + ctx = dict(foo="bar") + gp = self.klass("ceph", {}, ctx=ctx, remote=rem) + assert gp.version == "0.90.0" + + @patch("teuthology.packaging._get_response") + def test_get_package_version_not_found(self, m_get_response): + rem = self._get_remote() + ctx = dict(foo="bar") + resp = Mock() + resp.ok = False 
+ m_get_response.return_value = resp + gp = self.klass("ceph", {}, ctx=ctx, remote=rem) + with pytest.raises(VersionNotFoundError): + gp.version + + def test_get_package_sha1_fetched_found(self): + rem = self._get_remote() + ctx = dict(foo="bar") + gp = self.klass("ceph", {}, ctx=ctx, remote=rem) + assert gp.sha1 == "the_sha1" + + def test_get_package_sha1_fetched_not_found(self): + rem = self._get_remote() + ctx = dict(foo="bar") + gp = self.klass("ceph", {}, ctx=ctx, remote=rem) + assert not gp.sha1 + + DISTRO_MATRIX = [None] * 12 + + @pytest.mark.parametrize( + "matrix_index", + range(len(DISTRO_MATRIX)), + ) + def test_get_distro_remote(self, matrix_index): + (distro, version, codename, expected) = \ + self.DISTRO_MATRIX[matrix_index] + rem = self._get_remote(distro=distro, version=version, + codename=codename) + ctx = dict(foo="bar") + gp = self.klass("ceph", {}, ctx=ctx, remote=rem) + assert gp.distro == expected + + DISTRO_MATRIX_NOVER = [ + ('rhel', None, None, 'centos8'), + ('centos', None, None, 'centos8'), + ('fedora', None, None, 'fedora25'), + ('ubuntu', None, None, 'focal'), + ('debian', None, None, 'jessie'), + ] + + @pytest.mark.parametrize( + "matrix_index", + range(len(DISTRO_MATRIX) + len(DISTRO_MATRIX_NOVER)), + ) + def test_get_distro_config(self, matrix_index): + (distro, version, codename, expected) = \ + (self.DISTRO_MATRIX + self.DISTRO_MATRIX_NOVER)[matrix_index] + config = dict( + os_type=distro, + os_version=version + ) + gp = self.klass("ceph", config) + assert gp.distro == expected + + DIST_RELEASE_MATRIX = [ + ('rhel', '7.0', None, 'el7'), + ('centos', '6.5', None, 'el6'), + ('centos', '7.0', None, 'el7'), + ('centos', '7.1', None, 'el7'), + ('centos', '8.1', None, 'el8'), + ('fedora', '20', None, 'fc20'), + ('debian', '7.0', None, 'debian'), + ('debian', '7', None, 'debian'), + ('debian', '7.1', None, 'debian'), + ('ubuntu', '12.04', None, 'ubuntu'), + ('ubuntu', '14.04', None, 'ubuntu'), + ('ubuntu', '16.04', None, 'ubuntu'), + 
('ubuntu', '18.04', None, 'ubuntu'), + ('ubuntu', '20.04', None, 'ubuntu'), + ] + + @pytest.mark.parametrize( + "matrix_index", + range(len(DIST_RELEASE_MATRIX)), + ) + def test_get_dist_release(self, matrix_index): + (distro, version, codename, expected) = \ + (self.DIST_RELEASE_MATRIX)[matrix_index] + rem = self._get_remote(distro=distro, version=version, + codename=codename) + ctx = dict(foo="bar") + gp = self.klass("ceph", {}, ctx=ctx, remote=rem) + assert gp.dist_release == expected + + +class TestShamanProject(TestBuilderProject): + klass = packaging.ShamanProject + + def setup_method(self): + self.p_config = patch('teuthology.packaging.config') + self.m_config = self.p_config.start() + self.m_config.use_shaman = True + self.m_config.shaman_host = 'shaman.ceph.com' + self.p_get_config_value = \ + patch('teuthology.packaging._get_config_value_for_remote') + self.m_get_config_value = self.p_get_config_value.start() + self.m_get_config_value.return_value = None + self.p_get = patch('requests.get') + self.m_get = self.p_get.start() + + def teardown_method(self): + self.p_config.stop() + self.p_get_config_value.stop() + self.p_get.stop() + + def test_init_from_remote_base_url(self): + # Here, we really just need to make sure ShamanProject._search() + # queries the right URL. So let's make _get_base_url() just pass that + # URL through and test that value. + def m_get_base_url(obj): + obj._search() + return self.m_get.call_args_list[0][0][0] + with patch( + 'teuthology.packaging.ShamanProject._get_base_url', + new=m_get_base_url, + ): + super(TestShamanProject, self)\ + .test_init_from_remote_base_url( + "https://shaman.ceph.com/api/search?status=ready" + "&project=ceph&flavor=default" + "&distros=ubuntu%2F20.04%2Fx86_64&ref=main" + ) + + def test_init_from_remote_base_url_debian(self): + # Here, we really just need to make sure ShamanProject._search() + # queries the right URL. So let's make _get_base_url() just pass that + # URL through and test that value. 
+ def m_get_base_url(obj): + obj._search() + return self.m_get.call_args_list[0][0][0] + with patch( + 'teuthology.packaging.ShamanProject._get_base_url', + new=m_get_base_url, + ): + super(TestShamanProject, self)\ + .test_init_from_remote_base_url_debian( + "https://shaman.ceph.com/api/search?status=ready" + "&project=ceph&flavor=default" + "&distros=debian%2F7.1%2Fx86_64&ref=main" + ) + + def test_init_from_config_base_url(self): + # Here, we really just need to make sure ShamanProject._search() + # queries the right URL. So let's make _get_base_url() just pass that + # URL through and test that value. + def m_get_base_url(obj): + obj._search() + return self.m_get.call_args_list[0][0][0] + with patch( + 'teuthology.packaging.ShamanProject._get_base_url', + new=m_get_base_url, + ): + super(TestShamanProject, self).test_init_from_config_base_url( + "https://shaman.ceph.com/api/search?status=ready&project=ceph" \ + "&flavor=default&distros=ubuntu%2F20.04%2Fx86_64&sha1=sha1" + ) + + @patch('teuthology.packaging.ShamanProject._get_package_sha1') + def test_init_from_config_tag_ref(self, m_get_package_sha1): + m_get_package_sha1.return_value = 'the_sha1' + super(TestShamanProject, self).test_init_from_config_tag_ref() + + def test_init_from_config_tag_overrides_branch_ref(self, caplog): + with patch( + 'teuthology.packaging.repo_utils.ls_remote', + ) as m_ls_remote: + m_ls_remote.return_value = 'sha1_from_my_tag' + obj = super(TestShamanProject, self)\ + .test_init_from_config_tag_overrides_branch_ref(caplog) + search_uri = obj._search_uri + assert 'sha1=sha1_from_my_tag' in search_uri + assert 'jewel' not in search_uri + + def test_init_from_config_branch_overrides_sha1(self, caplog): + obj = super(TestShamanProject, self)\ + .test_init_from_config_branch_overrides_sha1(caplog) + search_uri = obj._search_uri + assert 'jewel' in search_uri + assert 'sha1' not in search_uri + + def test_get_package_version_found(self): + resp = Mock() + resp.ok = True + 
resp.json.return_value = [ + dict( + sha1='the_sha1', + extra=dict(package_manager_version='0.90.0'), + ) + ] + self.m_get.return_value = resp + super(TestShamanProject, self)\ + .test_get_package_version_found() + + def test_get_package_sha1_fetched_found(self): + resp = Mock() + resp.ok = True + resp.json.return_value = [dict(sha1='the_sha1')] + self.m_get.return_value = resp + super(TestShamanProject, self)\ + .test_get_package_sha1_fetched_found() + + def test_get_package_sha1_fetched_not_found(self): + resp = Mock() + resp.json.return_value = [] + self.m_get.return_value = resp + super(TestShamanProject, self)\ + .test_get_package_sha1_fetched_not_found() + + SHAMAN_SEARCH_RESPONSE = [ + { + "status": "ready", + "sha1": "534fc6d936bd506119f9e0921ff8cf8d47caa323", + "extra": { + "build_url": "https://jenkins.ceph.com/job/ceph-dev-build/ARCH=x86_64,AVAILABLE_ARCH=x86_64,AVAILABLE_DIST=centos8,DIST=centos8,MACHINE_SIZE=gigantic/48556/", + "root_build_cause": "SCMTRIGGER", + "version": "17.0.0-8856-g534fc6d9", + "node_name": "172.21.2.7+braggi07", + "job_name": "ceph-dev-build/ARCH=x86_64,AVAILABLE_ARCH=x86_64,AVAILABLE_DIST=centos8,DIST=centos8,MACHINE_SIZE=gigantic", + "package_manager_version": "17.0.0-8856.g534fc6d9" + }, + "url": "https://3.chacra.ceph.com/r/ceph/main/534fc6d936bd506119f9e0921ff8cf8d47caa323/centos/8/flavors/default/", + "modified": "2021-11-06 21:40:40.669823", + "distro_version": "8", + "project": "ceph", + "flavor": "default", + "ref": "main", + "chacra_url": "https://3.chacra.ceph.com/repos/ceph/main/534fc6d936bd506119f9e0921ff8cf8d47caa323/centos/8/flavors/default/", + "archs": [ + "x86_64", + "arm64", + "source" + ], + "distro": "centos" + } + ] + + SHAMAN_BUILDS_RESPONSE = [ + { + "status": "completed", + "sha1": "534fc6d936bd506119f9e0921ff8cf8d47caa323", + "distro_arch": "arm64", + "started": "2021-11-06 20:20:15.121203", + "completed": "2021-11-06 22:36:27.115950", + "extra": { + "node_name": "172.21.4.66+confusa04", + "version": 
"17.0.0-8856-g534fc6d9", + "build_user": "", + "root_build_cause": "SCMTRIGGER", + + "job_name": "ceph-dev-build/ARCH=arm64,AVAILABLE_ARCH=arm64,AVAILABLE_DIST=centos8,DIST=centos8,MACHINE_SIZE=gigantic" + }, + "modified": "2021-11-06 22:36:27.118043", + "distro_version": "8", + "project": "ceph", + "url": "https://jenkins.ceph.com/job/ceph-dev-build/ARCH=arm64,AVAILABLE_ARCH=arm64,AVAILABLE_DIST=centos8,DIST=centos8,MACHINE_SIZE=gigantic/48556/", + "log_url": "https://jenkins.ceph.com/job/ceph-dev-build/ARCH=arm64,AVAILABLE_ARCH=arm64,AVAILABLE_DIST=centos8,DIST=centos8,MACHINE_SIZE=gigantic/48556//consoleFull", + "flavor": "default", + "ref": "main", + "distro": "centos" + }, + { + "status": "completed", + "sha1": "534fc6d936bd506119f9e0921ff8cf8d47caa323", + "distro_arch": "x86_64", + "started": "2021-11-06 20:20:06.740692", + "completed": "2021-11-06 21:43:51.711970", + "extra": { + "node_name": "172.21.2.7+braggi07", + "version": "17.0.0-8856-g534fc6d9", + "build_user": "", + "root_build_cause": "SCMTRIGGER", + "job_name": "ceph-dev-build/ARCH=x86_64,AVAILABLE_ARCH=x86_64,AVAILABLE_DIST=centos8,DIST=centos8,MACHINE_SIZE=gigantic" + }, + "modified": "2021-11-06 21:43:51.713487", + "distro_version": "8", + "project": "ceph", + "url": "https://jenkins.ceph.com/job/ceph-dev-build/ARCH=x86_64,AVAILABLE_ARCH=x86_64,AVAILABLE_DIST=centos8,DIST=centos8,MACHINE_SIZE=gigantic/48556/", + "log_url": "https://jenkins.ceph.com/job/ceph-dev-build/ARCH=x86_64,AVAILABLE_ARCH=x86_64,AVAILABLE_DIST=centos8,DIST=centos8,MACHINE_SIZE=gigantic/48556//consoleFull", + "flavor": "default", + "ref": "main", + "distro": "centos" + } + ] + + def test_build_complete_success(self): + config = dict( + os_type="centos", + os_version="8", + branch='main', + arch='x86_64', + flavor='default', + ) + builder = self.klass("ceph", config) + + search_resp = Mock() + search_resp.ok = True + search_resp.json.return_value = self.SHAMAN_SEARCH_RESPONSE + self.m_get.return_value = search_resp + # cause 
builder to call requests.get and cache search_resp + builder.assert_result() + + build_resp = Mock() + build_resp.ok = True + self.m_get.return_value = build_resp + + # both archs completed, so x86_64 build is complete + builds = build_resp.json.return_value = self.SHAMAN_BUILDS_RESPONSE + assert builder.build_complete + + # mark the arm64 build failed, x86_64 should still be complete + builds[0]['status'] = "failed" + build_resp.json.return_value = builds + assert builder.build_complete + + # mark the x86_64 build failed, should show incomplete + builds[1]['status'] = "failed" + build_resp.json.return_value = builds + assert not builder.build_complete + + # mark the arm64 build complete again, x86_64 still incomplete + builds[0]['status'] = "completed" + build_resp.json.return_value = builds + assert not builder.build_complete + + DISTRO_MATRIX = [ + ('rhel', '7.0', None, 'centos/7'), + ('centos', '6.5', None, 'centos/6'), + ('centos', '7.0', None, 'centos/7'), + ('centos', '7.1', None, 'centos/7'), + ('centos', '8.1', None, 'centos/8'), + ('fedora', '20', None, 'fedora/20'), + ('ubuntu', '14.04', 'trusty', 'ubuntu/14.04'), + ('ubuntu', '14.04', None, 'ubuntu/14.04'), + ('debian', '7.0', None, 'debian/7.0'), + ('debian', '7', None, 'debian/7'), + ('debian', '7.1', None, 'debian/7.1'), + ('ubuntu', '12.04', None, 'ubuntu/12.04'), + ('ubuntu', '14.04', None, 'ubuntu/14.04'), + ('ubuntu', '16.04', None, 'ubuntu/16.04'), + ('ubuntu', '18.04', None, 'ubuntu/18.04'), + ('ubuntu', '20.04', None, 'ubuntu/20.04'), + ] + + DISTRO_MATRIX_NOVER = [ + ('rhel', None, None, 'centos/8'), + ('centos', None, None, 'centos/8'), + ('fedora', None, None, 'fedora/25'), + ('ubuntu', None, None, 'ubuntu/20.04'), + ('debian', None, None, 'debian/8.0'), + ] diff --git a/teuthology/test/test_parallel.py b/teuthology/test/test_parallel.py new file mode 100644 index 000000000..bba1d57bf --- /dev/null +++ b/teuthology/test/test_parallel.py @@ -0,0 +1,28 @@ +from teuthology.parallel import 
parallel + + +def identity(item, input_set=None, remove=False): + if input_set is not None: + assert item in input_set + if remove: + input_set.remove(item) + return item + + +class TestParallel(object): + def test_basic(self): + in_set = set(range(10)) + with parallel() as para: + for i in in_set: + para.spawn(identity, i, in_set, remove=True) + assert para.any_spawned is True + assert para.count == len(in_set) + + def test_result(self): + in_set = set(range(10)) + with parallel() as para: + for i in in_set: + para.spawn(identity, i, in_set) + for result in para: + in_set.remove(result) + diff --git a/teuthology/test/test_repo_utils.py b/teuthology/test/test_repo_utils.py new file mode 100644 index 000000000..969f82513 --- /dev/null +++ b/teuthology/test/test_repo_utils.py @@ -0,0 +1,250 @@ +import logging +import unittest.mock as mock +import os +import os.path +from pytest import raises, mark +import re +import shutil +import subprocess +import tempfile +from packaging.version import parse + +from teuthology.exceptions import BranchNotFoundError, CommitNotFoundError +from teuthology import repo_utils +from teuthology import parallel +repo_utils.log.setLevel(logging.WARNING) + + +class TestRepoUtils(object): + + @classmethod + def setup_class(cls): + cls.temp_path = tempfile.mkdtemp(prefix='test_repo-') + cls.dest_path = cls.temp_path + '/empty_dest' + cls.src_path = cls.temp_path + '/empty_src' + + if 'TEST_ONLINE' in os.environ: + cls.repo_url = 'https://github.com/ceph/empty.git' + cls.commit = '71245d8e454a06a38a00bff09d8f19607c72e8bf' + else: + cls.repo_url = 'file://' + cls.src_path + cls.commit = None + + cls.git_version = parse(cls.get_system_git_version()) + + @classmethod + def teardown_class(cls): + shutil.rmtree(cls.temp_path) + + @classmethod + def get_system_git_version(cls): + # parsing following patterns + # 1) git version 2.45.2 + # 2) git version 2.39.3 (Apple Git-146) + git_version = subprocess.check_output(('git', 'version')).decode() + m = 
re.match(r"git version (?P\d+.\d+.\d+) ?", git_version) + return m['ver'] + + def setup_method(self, method): + # In git 2.28.0, the --initial-branch flag was added. + if self.git_version >= parse("2.28.0"): + subprocess.check_call( + ('git', 'init', '--initial-branch', 'main', self.src_path) + ) + else: + subprocess.check_call(('git', 'init', self.src_path)) + subprocess.check_call( + ('git', 'checkout', '-b', 'main'), + cwd=self.src_path, + ) + proc = subprocess.Popen( + ('git', 'config', 'user.email', 'test@ceph.com'), + cwd=self.src_path, + stdout=subprocess.PIPE, + ) + assert proc.wait() == 0 + proc = subprocess.Popen( + ('git', 'config', 'user.name', 'Test User'), + cwd=self.src_path, + stdout=subprocess.PIPE, + ) + assert proc.wait() == 0 + proc = subprocess.Popen( + ('git', 'commit', '--allow-empty', '--allow-empty-message', + '--no-edit'), + cwd=self.src_path, + stdout=subprocess.PIPE, + ) + assert proc.wait() == 0 + if not self.commit: + result = subprocess.check_output( + 'git rev-parse HEAD', + shell=True, + cwd=self.src_path, + ).split() + assert result + self.commit = result[0].decode() + + def teardown_method(self, method): + shutil.rmtree(self.src_path, ignore_errors=True) + shutil.rmtree(self.dest_path, ignore_errors=True) + + def test_clone_repo_existing_branch(self): + repo_utils.clone_repo(self.repo_url, self.dest_path, 'main', self.commit) + assert os.path.exists(self.dest_path) + + def test_clone_repo_non_existing_branch(self): + with raises(BranchNotFoundError): + repo_utils.clone_repo(self.repo_url, self.dest_path, 'nobranch', self.commit) + assert not os.path.exists(self.dest_path) + + def test_fetch_no_repo(self): + fake_dest_path = self.temp_path + '/not_a_repo' + assert not os.path.exists(fake_dest_path) + with raises(OSError): + repo_utils.fetch(fake_dest_path) + assert not os.path.exists(fake_dest_path) + + def test_fetch_noop(self): + repo_utils.clone_repo(self.repo_url, self.dest_path, 'main', self.commit) + 
repo_utils.fetch(self.dest_path) + assert os.path.exists(self.dest_path) + + def test_fetch_branch_no_repo(self): + fake_dest_path = self.temp_path + '/not_a_repo' + assert not os.path.exists(fake_dest_path) + with raises(OSError): + repo_utils.fetch_branch(fake_dest_path, 'main') + assert not os.path.exists(fake_dest_path) + + def test_fetch_branch_fake_branch(self): + repo_utils.clone_repo(self.repo_url, self.dest_path, 'main', self.commit) + with raises(BranchNotFoundError): + repo_utils.fetch_branch(self.dest_path, 'nobranch') + + @mark.parametrize('git_str', + ["fatal: couldn't find remote ref", + "fatal: Couldn't find remote ref"]) + @mock.patch('subprocess.Popen') + def test_fetch_branch_different_git_versions(self, mock_popen, git_str): + """ + Newer git versions return a lower case string + See: https://github.com/git/git/commit/0b9c3afdbfb629363 + """ + branch_name = 'nobranch' + process_mock = mock.Mock() + attrs = { + 'wait.return_value': 1, + 'stdout.read.return_value': f"{git_str} {branch_name}".encode(), + } + process_mock.configure_mock(**attrs) + mock_popen.return_value = process_mock + with raises(BranchNotFoundError): + repo_utils.fetch_branch('', branch_name) + + def test_enforce_existing_branch(self): + repo_utils.enforce_repo_state(self.repo_url, self.dest_path, + 'main') + assert os.path.exists(self.dest_path) + + def test_enforce_existing_commit(self): + import logging + logging.getLogger().info(subprocess.check_output("git branch", shell=True, cwd=self.src_path)) + repo_utils.enforce_repo_state(self.repo_url, self.dest_path, + 'main', self.commit) + assert os.path.exists(self.dest_path) + + def test_enforce_non_existing_branch(self): + with raises(BranchNotFoundError): + repo_utils.enforce_repo_state(self.repo_url, self.dest_path, + 'blah', self.commit) + assert not os.path.exists(self.dest_path) + + def test_enforce_non_existing_commit(self): + with raises(CommitNotFoundError): + repo_utils.enforce_repo_state(self.repo_url, self.dest_path, 
+ 'main', 'c69e90807d222c1719c45c8c758bf6fac3d985f1') + assert not os.path.exists(self.dest_path) + + def test_enforce_multiple_calls_same_branch(self): + repo_utils.enforce_repo_state(self.repo_url, self.dest_path, + 'main', self.commit) + assert os.path.exists(self.dest_path) + repo_utils.enforce_repo_state(self.repo_url, self.dest_path, + 'main', self.commit) + assert os.path.exists(self.dest_path) + repo_utils.enforce_repo_state(self.repo_url, self.dest_path, + 'main', self.commit) + assert os.path.exists(self.dest_path) + + def test_enforce_multiple_calls_different_branches(self): + with raises(BranchNotFoundError): + repo_utils.enforce_repo_state(self.repo_url, self.dest_path, + 'blah1') + assert not os.path.exists(self.dest_path) + repo_utils.enforce_repo_state(self.repo_url, self.dest_path, + 'main', self.commit) + assert os.path.exists(self.dest_path) + repo_utils.enforce_repo_state(self.repo_url, self.dest_path, + 'main', self.commit) + assert os.path.exists(self.dest_path) + with raises(BranchNotFoundError): + repo_utils.enforce_repo_state(self.repo_url, self.dest_path, + 'blah2') + assert not os.path.exists(self.dest_path) + repo_utils.enforce_repo_state(self.repo_url, self.dest_path, + 'main', self.commit) + assert os.path.exists(self.dest_path) + + def test_enforce_invalid_branch(self): + with raises(ValueError): + repo_utils.enforce_repo_state(self.repo_url, self.dest_path, 'a b', self.commit) + + def test_simultaneous_access(self): + count = 5 + with parallel.parallel() as p: + for i in range(count): + p.spawn(repo_utils.enforce_repo_state, self.repo_url, + self.dest_path, 'main', self.commit) + for result in p: + assert result is None + + def test_simultaneous_access_different_branches(self): + branches = [('main', self.commit), ('main', self.commit), ('nobranch', 'nocommit'), + ('nobranch', 'nocommit'), ('main', self.commit), ('nobranch', 'nocommit')] + + with parallel.parallel() as p: + for branch, commit in branches: + if branch == 'main': + 
p.spawn(repo_utils.enforce_repo_state, self.repo_url, + self.dest_path, branch, commit) + else: + dest_path = self.dest_path + '_' + branch + + def func(): + repo_utils.enforce_repo_state( + self.repo_url, dest_path, + branch, commit) + p.spawn( + raises, + BranchNotFoundError, + func, + ) + for result in p: + pass + + URLS_AND_DIRNAMES = [ + ('git@git.ceph.com/ceph-qa-suite.git', 'git.ceph.com_ceph-qa-suite'), + ('git://git.ceph.com/ceph-qa-suite.git', 'git.ceph.com_ceph-qa-suite'), + ('https://github.com/ceph/ceph', 'github.com_ceph_ceph'), + ('https://github.com/liewegas/ceph.git', 'github.com_liewegas_ceph'), + ('file:///my/dir/has/ceph.git', 'my_dir_has_ceph'), + ] + + @mark.parametrize("input_, expected", URLS_AND_DIRNAMES) + def test_url_to_dirname(self, input_, expected): + assert repo_utils.url_to_dirname(input_) == expected + + def test_current_branch(self): + repo_utils.clone_repo(self.repo_url, self.dest_path, 'main', self.commit) + assert repo_utils.current_branch(self.dest_path) == "main" diff --git a/teuthology/test/test_report.py b/teuthology/test/test_report.py new file mode 100644 index 000000000..a8535dfc8 --- /dev/null +++ b/teuthology/test/test_report.py @@ -0,0 +1,86 @@ +import json +import pytest +import yaml + +from teuthology.test import fake_archive +from teuthology import report + + +@pytest.fixture +def archive(tmp_path): + archive = fake_archive.FakeArchive(archive_base=str(tmp_path)) + yield archive + archive.teardown() + + +@pytest.fixture(autouse=True) +def reporter(archive): + archive.setup() + return report.ResultsReporter(archive_base=archive.archive_base) + + +def test_all_runs_one_run(archive, reporter): + run_name = "test_all_runs" + yaml_path = "examples/3node_ceph.yaml" + job_count = 3 + archive.create_fake_run(run_name, job_count, yaml_path) + assert [run_name] == reporter.serializer.all_runs + + +def test_all_runs_three_runs(archive, reporter): + run_count = 3 + runs = {} + for i in range(run_count): + run_name = "run #%s" 
% i + yaml_path = "examples/3node_ceph.yaml" + job_count = 3 + job_ids = archive.create_fake_run( + run_name, + job_count, + yaml_path) + runs[run_name] = job_ids + assert sorted(runs.keys()) == sorted(reporter.serializer.all_runs) + + +def test_jobs_for_run(archive, reporter): + run_name = "test_jobs_for_run" + yaml_path = "examples/3node_ceph.yaml" + job_count = 3 + jobs = archive.create_fake_run(run_name, job_count, yaml_path) + job_ids = [str(job['job_id']) for job in jobs] + + got_jobs = reporter.serializer.jobs_for_run(run_name) + assert sorted(job_ids) == sorted(got_jobs.keys()) + + +def test_running_jobs_for_run(archive, reporter): + run_name = "test_jobs_for_run" + yaml_path = "examples/3node_ceph.yaml" + job_count = 10 + num_hung = 3 + archive.create_fake_run(run_name, job_count, yaml_path, + num_hung=num_hung) + + got_jobs = reporter.serializer.running_jobs_for_run(run_name) + assert len(got_jobs) == num_hung + + +def test_json_for_job(archive, reporter): + run_name = "test_json_for_job" + yaml_path = "examples/3node_ceph.yaml" + job_count = 1 + jobs = archive.create_fake_run(run_name, job_count, yaml_path) + job = jobs[0] + + with open(yaml_path) as yaml_file: + obj_from_yaml = yaml.safe_load(yaml_file) + full_obj = obj_from_yaml.copy() + full_obj.update(job['info']) + full_obj.update(job['summary']) + + out_json = reporter.serializer.json_for_job( + run_name, str(job['job_id'])) + out_obj = json.loads(out_json) + assert full_obj == out_obj + + diff --git a/teuthology/test/test_results.py b/teuthology/test/test_results.py new file mode 100644 index 000000000..3d7ba93bd --- /dev/null +++ b/teuthology/test/test_results.py @@ -0,0 +1,155 @@ +import textwrap +from teuthology.config import config +from teuthology import results +from teuthology import report + +from unittest.mock import patch, DEFAULT + + +class TestResultsEmail(object): + reference = { + 'run_name': 'test_name', + 'jobs': [ + # Running + {'description': 'description for job with name 
test_name', + 'job_id': 30481, + 'name': 'test_name', + 'log_href': 'http://qa-proxy.ceph.com/teuthology/test_name/30481/teuthology.log', # noqa + 'owner': 'job@owner', + 'duration': None, + 'status': 'running', + }, + # Waiting + {'description': 'description for job with name test_name', + 'job_id': 62965, + 'name': 'test_name', + 'log_href': 'http://qa-proxy.ceph.com/teuthology/test_name/30481/teuthology.log', # noqa + 'owner': 'job@owner', + 'duration': None, + 'status': 'waiting', + }, + # Queued + {'description': 'description for job with name test_name', + 'job_id': 79063, + 'name': 'test_name', + 'log_href': 'http://qa-proxy.ceph.com/teuthology/test_name/30481/teuthology.log', # noqa + 'owner': 'job@owner', + 'duration': None, + 'status': 'queued', + }, + # Failed + {'description': 'description for job with name test_name', + 'job_id': 88979, + 'name': 'test_name', + 'log_href': 'http://qa-proxy.ceph.com/teuthology/test_name/88979/teuthology.log', # noqa + 'owner': 'job@owner', + 'duration': 35190, + 'success': False, + 'status': 'fail', + 'failure_reason': 'Failure reason!', + }, + # Dead + {'description': 'description for job with name test_name', + 'job_id': 69152, + 'name': 'test_name', + 'log_href': 'http://qa-proxy.ceph.com/teuthology/test_name/69152/teuthology.log', # noqa + 'owner': 'job@owner', + 'duration': 5225, + 'success': False, + 'status': 'dead', + 'failure_reason': 'Dead reason!', + }, + # Passed + {'description': 'description for job with name test_name', + 'job_id': 68369, + 'name': 'test_name', + 'log_href': 'http://qa-proxy.ceph.com/teuthology/test_name/68369/teuthology.log', # noqa + 'owner': 'job@owner', + 'duration': 33771, + 'success': True, + 'status': 'pass', + }, + ], + 'subject': '1 fail, 1 dead, 1 running, 1 waiting, 1 queued, 1 pass in test_name', # noqa + 'body': textwrap.dedent(""" + Test Run: test_name + ================================================================= + info: http://example.com/test_name/ + logs: 
http://qa-proxy.ceph.com/teuthology/test_name/ + failed: 1 + dead: 1 + running: 1 + waiting: 1 + queued: 1 + passed: 1 + + + Fail + ================================================================= + [88979] description for job with name test_name + ----------------------------------------------------------------- + time: 09:46:30 + info: http://example.com/test_name/88979/ + log: http://qa-proxy.ceph.com/teuthology/test_name/88979/ + + Failure reason! + + + + Dead + ================================================================= + [69152] description for job with name test_name + ----------------------------------------------------------------- + time: 01:27:05 + info: http://example.com/test_name/69152/ + log: http://qa-proxy.ceph.com/teuthology/test_name/69152/ + + Dead reason! + + + + Running + ================================================================= + [30481] description for job with name test_name + info: http://example.com/test_name/30481/ + + + + Waiting + ================================================================= + [62965] description for job with name test_name + info: http://example.com/test_name/62965/ + + + + Queued + ================================================================= + [79063] description for job with name test_name + info: http://example.com/test_name/79063/ + + + + Pass + ================================================================= + [68369] description for job with name test_name + time: 09:22:51 + info: http://example.com/test_name/68369/ + """).strip(), + } + + def setup_method(self): + config.results_ui_server = "http://example.com/" + config.archive_server = "http://qa-proxy.ceph.com/teuthology/" + + def test_build_email_body(self): + run_name = self.reference['run_name'] + with patch.multiple( + report, + ResultsReporter=DEFAULT, + ): + reporter = report.ResultsReporter() + reporter.get_jobs.return_value = self.reference['jobs'] + (subject, body) = results.build_email_body( + run_name, _reporter=reporter) 
+ assert subject == self.reference['subject'] + assert body == self.reference['body'] diff --git a/teuthology/test/test_run.py b/teuthology/test/test_run.py new file mode 100644 index 000000000..fbb74403a --- /dev/null +++ b/teuthology/test/test_run.py @@ -0,0 +1,286 @@ +import pytest +import docopt + +from unittest.mock import patch, call + +from teuthology import run +from scripts import run as scripts_run +from teuthology.test import skipif_teuthology_process + + +class TestRun(object): + """ Tests for teuthology.run """ + + @patch("teuthology.log.setLevel") + @patch("teuthology.setup_log_file") + @patch("os.mkdir") + def test_set_up_logging(self, m_mkdir, m_setup_log_file, m_setLevel): + run.set_up_logging(True, "path/to/archive") + m_mkdir.assert_called_with("path/to/archive") + m_setup_log_file.assert_called_with("path/to/archive/teuthology.log") + assert m_setLevel.called + + # because of how we import things, mock merge_configs from run - where it's used + # see: http://www.voidspace.org.uk/python/mock/patch.html#where-to-patch + @patch("teuthology.run.merge_configs") + def test_setup_config(self, m_merge_configs): + config = {"job_id": 1, "foo": "bar"} + m_merge_configs.return_value = config + result = run.setup_config(["some/config.yaml"]) + assert m_merge_configs.called + assert result["job_id"] == "1" + assert result["foo"] == "bar" + + @patch("teuthology.run.merge_configs") + def test_setup_config_targets_ok(self, m_merge_configs): + config = {"targets": list(range(4)), "roles": list(range(2))} + m_merge_configs.return_value = config + result = run.setup_config(["some/config.yaml"]) + assert result["targets"] == [0, 1, 2, 3] + assert result["roles"] == [0, 1] + + @patch("teuthology.run.merge_configs") + def test_setup_config_targets_invalid(self, m_merge_configs): + config = {"targets": range(2), "roles": range(4)} + m_merge_configs.return_value = config + with pytest.raises(AssertionError): + run.setup_config(["some/config.yaml"]) + + 
@patch("teuthology.run.open") + def test_write_initial_metadata(self, m_open): + config = {"job_id": "123", "foo": "bar"} + run.write_initial_metadata( + "some/archive/dir", + config, + "the_name", + "the_description", + "the_owner", + ) + expected = [ + call('some/archive/dir/pid', 'w'), + call('some/archive/dir/owner', 'w'), + call('some/archive/dir/orig.config.yaml', 'w'), + call('some/archive/dir/info.yaml', 'w') + ] + assert m_open.call_args_list == expected + + def test_get_machine_type(self): + result = run.get_machine_type(None, {"machine-type": "the_machine_type"}) + assert result == "the_machine_type" + + def test_get_summary(self): + result = run.get_summary("the_owner", "the_description") + assert result == {"owner": "the_owner", "description": "the_description", "success": True} + result = run.get_summary("the_owner", None) + assert result == {"owner": "the_owner", "success": True} + + def test_validate_tasks_invalid(self): + config = {"tasks": [{"kernel": "can't be here"}]} + with pytest.raises(AssertionError) as excinfo: + run.validate_tasks(config) + assert excinfo.value.args[0].startswith("kernel installation") + + def test_validate_task_no_tasks(self): + result = run.validate_tasks({}) + assert result == [] + + def test_validate_tasks_valid(self): + expected = [{"foo": "bar"}, {"bar": "foo"}] + result = run.validate_tasks({"tasks": expected}) + assert result == expected + + def test_validate_tasks_is_list(self): + with pytest.raises(AssertionError) as excinfo: + run.validate_tasks({"tasks": {"foo": "bar"}}) + assert excinfo.value.args[0].startswith("Expected list") + + def test_get_initial_tasks_invalid(self): + with pytest.raises(AssertionError) as excinfo: + run.get_initial_tasks(True, {"targets": "can't be here", + "roles": "roles" }, "machine_type") + assert excinfo.value.args[0].startswith("You cannot") + + def test_get_inital_tasks(self): + config = {"roles": range(2), "kernel": "the_kernel", "use_existing_cluster": False} + result = 
run.get_initial_tasks(True, config, "machine_type") + assert {"internal.lock_machines": (2, "machine_type")} in result + assert {"kernel": "the_kernel"} in result + # added because use_existing_cluster == False + assert {'internal.vm_setup': None} in result + assert {'internal.buildpackages_prep': None} in result + + # When tests are run in a teuthology process using the py.test + # API, tasks will have already been imported. Patching sys.path + # (and even calling sys.path_importer_cache.clear()) doesn't seem + # to help "forget" where the tasks are, keeping this test from + # passing. The test isn't critical to run in every single + # environment, so skip. + @skipif_teuthology_process + @patch("teuthology.run.fetch_qa_suite") + def test_fetch_tasks_if_needed(self, m_fetch_qa_suite): + config = {"suite_path": "/some/suite/path", "suite_branch": "feature_branch", + "suite_sha1": "commit"} + m_fetch_qa_suite.return_value = "/some/other/suite/path" + result = run.fetch_tasks_if_needed(config) + m_fetch_qa_suite.assert_called_with("feature_branch", commit="commit") + assert result == "/some/other/suite/path/qa" + + @patch("teuthology.run.get_status") + @patch("yaml.safe_dump") + @patch("teuthology.report.try_push_job_info") + @patch("teuthology.run.email_results") + @patch("teuthology.run.open") + @patch("sys.exit") + def test_report_outcome(self, m_sys_exit, m_open, m_email_results, m_try_push_job_info, m_safe_dump, m_get_status): + m_get_status.return_value = "fail" + summary = {"failure_reason": "reasons"} + summary_dump = "failure_reason: reasons\n" + config = {"email-on-error": True} + config_dump = "email-on-error: true\n" + m_safe_dump.side_effect = [None, summary_dump, config_dump] + run.report_outcome(config, "the/archive/path", summary) + m_try_push_job_info.assert_called_with(config, summary) + m_open.assert_called_with("the/archive/path/summary.yaml", "w") + assert m_email_results.called + assert m_open.called + assert m_sys_exit.called + + 
@patch("teuthology.run.set_up_logging") + @patch("teuthology.run.setup_config") + @patch("teuthology.run.get_user") + @patch("teuthology.run.write_initial_metadata") + @patch("teuthology.report.try_push_job_info") + @patch("teuthology.run.get_machine_type") + @patch("teuthology.run.get_summary") + @patch("yaml.safe_dump") + @patch("teuthology.run.validate_tasks") + @patch("teuthology.run.get_initial_tasks") + @patch("teuthology.run.fetch_tasks_if_needed") + @patch("teuthology.run.run_tasks") + @patch("teuthology.run.report_outcome") + def test_main(self, m_report_outcome, m_run_tasks, m_fetch_tasks_if_needed, m_get_initial_tasks, m_validate_tasks, + m_safe_dump, m_get_summary, m_get_machine_type, m_try_push_job_info, m_write_initial_metadata, + m_get_user, m_setup_config, m_set_up_logging): + """ This really should be an integration test of some sort. """ + config = {"job_id": 1} + m_setup_config.return_value = config + m_get_machine_type.return_value = "machine_type" + doc = scripts_run.__doc__ + args = docopt.docopt(doc, [ + "--verbose", + "--archive", "some/archive/dir", + "--description", "the_description", + "--lock", + "--os-type", "os_type", + "--os-version", "os_version", + "--block", + "--name", "the_name", + "--suite-path", "some/suite/dir", + "path/to/config.yml", + ]) + m_get_user.return_value = "the_owner" + m_get_summary.return_value = dict(success=True, owner="the_owner", description="the_description") + m_validate_tasks.return_value = ['task3'] + m_get_initial_tasks.return_value = ['task1', 'task2'] + m_fetch_tasks_if_needed.return_value = "some/suite/dir" + run.main(args) + m_set_up_logging.assert_called_with(True, "some/archive/dir") + m_setup_config.assert_called_with(["path/to/config.yml"]) + m_write_initial_metadata.assert_called_with( + "some/archive/dir", + config, + "the_name", + "the_description", + "the_owner" + ) + m_try_push_job_info.assert_called_with(config, dict(status='running')) + m_get_machine_type.assert_called_with(None, config) 
+ m_get_summary.assert_called_with("the_owner", "the_description") + m_get_initial_tasks.assert_called_with(True, config, "machine_type") + m_fetch_tasks_if_needed.assert_called_with(config) + assert m_report_outcome.called + args, kwargs = m_run_tasks.call_args + fake_ctx = kwargs["ctx"]._conf + # fields that must be in ctx for the tasks to behave + expected_ctx = ["verbose", "archive", "description", "owner", "lock", "machine_type", "os_type", "os_version", + "block", "name", "suite_path", "config", "summary"] + for key in expected_ctx: + assert key in fake_ctx + assert isinstance(fake_ctx["config"], dict) + assert isinstance(fake_ctx["summary"], dict) + assert "tasks" in fake_ctx["config"] + # ensures that values missing in args are added with the correct value + assert fake_ctx["owner"] == "the_owner" + assert fake_ctx["machine_type"] == "machine_type" + # ensures os_type and os_version are property overwritten + assert fake_ctx["config"]["os_type"] == "os_type" + assert fake_ctx["config"]["os_version"] == "os_version" + + @patch("teuthology.run.set_up_logging") + @patch("teuthology.run.setup_config") + @patch("teuthology.run.get_user") + @patch("teuthology.run.write_initial_metadata") + @patch("teuthology.report.try_push_job_info") + @patch("teuthology.run.get_machine_type") + @patch("teuthology.run.get_summary") + @patch("yaml.safe_dump") + @patch("teuthology.run.validate_tasks") + @patch("teuthology.run.get_initial_tasks") + @patch("teuthology.run.fetch_tasks_if_needed") + @patch("teuthology.run.run_tasks") + @patch("teuthology.run.report_outcome") + def test_main_interactive( + self, + m_report_outcome, + m_run_tasks, + m_fetch_tasks_if_needed, + m_get_initial_tasks, + m_validate_tasks, + m_safe_dump, + m_get_summary, + m_get_machine_type, + m_try_push_job_info, + m_write_initial_metadata, + m_get_user, + m_setup_config, + m_set_up_logging, + ): + config = {"job_id": 1} + m_setup_config.return_value = config + m_get_machine_type.return_value = 
"machine_type" + doc = scripts_run.__doc__ + args = docopt.docopt(doc, [ + "--interactive-on-error", + "path/to/config.yml", + ]) + run.main(args) + args, kwargs = m_run_tasks.call_args + fake_ctx = kwargs["ctx"]._conf + assert fake_ctx['interactive_on_error'] is True + + def test_get_teuthology_command(self): + doc = scripts_run.__doc__ + args = docopt.docopt(doc, [ + "--archive", "some/archive/dir", + "--description", "the_description", + "--lock", + "--block", + "--name", "the_name", + "--suite-path", "some/suite/dir", + "path/to/config.yml", "path/to/config2.yaml", + ]) + result = run.get_teuthology_command(args) + result = result.split() + expected = [ + "teuthology", + "path/to/config.yml", "path/to/config2.yaml", + "--suite-path", "some/suite/dir", + "--lock", + "--description", "the_description", + "--name", "the_name", + "--block", + "--archive", "some/archive/dir", + ] + assert len(result) == len(expected) + for arg in expected: + assert arg in result diff --git a/teuthology/test/test_safepath.py b/teuthology/test/test_safepath.py new file mode 100644 index 000000000..afc81cdad --- /dev/null +++ b/teuthology/test/test_safepath.py @@ -0,0 +1,55 @@ +from teuthology import safepath + +class TestSafepath(object): + def test_simple(self): + got = safepath.munge('foo') + assert got == 'foo' + + def test_empty(self): + # really odd corner case + got = safepath.munge('') + assert got == '_' + + def test_slash(self): + got = safepath.munge('/') + assert got == '_' + + def test_slashslash(self): + got = safepath.munge('//') + assert got == '_' + + def test_absolute(self): + got = safepath.munge('/evil') + assert got == 'evil' + + def test_absolute_subdir(self): + got = safepath.munge('/evil/here') + assert got == 'evil/here' + + def test_dot_leading(self): + got = safepath.munge('./foo') + assert got == 'foo' + + def test_dot_middle(self): + got = safepath.munge('evil/./foo') + assert got == 'evil/foo' + + def test_dot_trailing(self): + got = 
safepath.munge('evil/foo/.') + assert got == 'evil/foo' + + def test_dotdot(self): + got = safepath.munge('../evil/foo') + assert got == '_./evil/foo' + + def test_dotdot_subdir(self): + got = safepath.munge('evil/../foo') + assert got == 'evil/_./foo' + + def test_hidden(self): + got = safepath.munge('.evil') + assert got == '_evil' + + def test_hidden_subdir(self): + got = safepath.munge('foo/.evil') + assert got == 'foo/_evil' diff --git a/teuthology/test/test_schedule.py b/teuthology/test/test_schedule.py new file mode 100644 index 000000000..dd0a68f84 --- /dev/null +++ b/teuthology/test/test_schedule.py @@ -0,0 +1,45 @@ +from teuthology.schedule import build_config +from teuthology.misc import get_user + + +class TestSchedule(object): + basic_args = { + '--verbose': False, + '--owner': 'OWNER', + '--description': 'DESC', + '--email': 'EMAIL', + '--first-in-suite': False, + '--last-in-suite': True, + '--name': 'NAME', + '--worker': 'tala', + '--timeout': '6', + '--priority': '99', + # TODO: make this work regardless of $PWD + #'': ['../../examples/3node_ceph.yaml', + # '../../examples/3node_rgw.yaml'], + } + + def test_basic(self): + expected = { + 'description': 'DESC', + 'email': 'EMAIL', + 'first_in_suite': False, + 'last_in_suite': True, + 'machine_type': 'tala', + 'name': 'NAME', + 'owner': 'OWNER', + 'priority': 99, + 'results_timeout': '6', + 'verbose': False, + 'tube': 'tala', + } + + job_dict = build_config(self.basic_args) + assert job_dict == expected + + def test_owner(self): + args = self.basic_args + args['--owner'] = None + job_dict = build_config(self.basic_args) + assert job_dict['owner'] == 'scheduled_%s' % get_user() + diff --git a/teuthology/test/test_scrape.py b/teuthology/test/test_scrape.py new file mode 100644 index 000000000..ed281b4b2 --- /dev/null +++ b/teuthology/test/test_scrape.py @@ -0,0 +1,205 @@ +from __future__ import with_statement + +import glob +import gzip +import os +import shutil +import tempfile +import yaml +from 
teuthology import scrape + +class FakeResultDir(object): + """Mocks a Result Directory""" + + def __init__(self, + failure_reason="Dummy reason", + assertion="FAILED assert 1 == 2\n", + blank_backtrace=False, + assertion_osd=False, + ): + self.failure_reason = failure_reason + self.assertion = assertion + self.blank_backtrace = blank_backtrace + self.path = tempfile.mkdtemp() + + with open(os.path.join(self.path, "config.yaml"), "w") as f: + yaml.dump({"description": "Dummy test"}, f) + + with open(os.path.join(self.path, "summary.yaml"), "w") as f: + yaml.dump({ + "success": "false", + "failure_reason": self.failure_reason + }, f) + + with open(os.path.join(self.path, "teuthology.log"), "w") as f: + if not self.blank_backtrace: + f.write(" ceph version 1000\n") + f.write(".stderr: Dummy error\n") + f.write(self.assertion) + f.write(" NOTE: a copy of the executable dummy text\n") + + if assertion_osd: + host = "host1" + rem_log_dir = os.path.join(self.path, "remote", host, "log") + os.makedirs(rem_log_dir, exist_ok=True) + ceph_mon_log = os.path.join(rem_log_dir, "ceph-osd.0.log") + with open(ceph_mon_log, "w") as f: + f.write("ceph version 1000\n") + f.write(self.assertion) + + def __enter__(self): + return self + + def __exit__(self, exc_typ, exc_val, exc_tb): + shutil.rmtree(self.path) + +class TestScrape(object): + """Tests for teuthology.scrape""" + + def test_grep(self): + with FakeResultDir() as d: + filepath = os.path.join(d.path, "scrapetest.txt") + with open(filepath, 'w') as f: + f.write("Ceph is an open-source software storage platform\n\ + Teuthology is used for testing.") + + #System level grep is called + value1 = scrape.grep(filepath, "software") + value2 = scrape.grep(filepath, "device") + + assert value1 ==\ + ['Ceph is an open-source software storage platform', ''] + assert value2 == [] + + def test_job(self): + with FakeResultDir() as d: + job = scrape.Job(d.path, 1) + assert job.get_success() == "false" + assert job.get_assertion() == "FAILED 
assert 1 == 2" + assert job.get_last_tlog_line() ==\ + b"NOTE: a copy of the executable dummy text" + assert job.get_failure_reason() == "Dummy reason" + + def test_timeoutreason(self): + with FakeResultDir(failure_reason=\ + "status 124: timeout '123 /home/ubuntu/cephtest/workunit.client.0/cephtool/test.sh'") as d: + job = scrape.Job(d.path, 1) + assert scrape.TimeoutReason.could_be(job) + assert scrape.TimeoutReason(job).match(job) + + def test_deadreason(self): + with FakeResultDir() as d: + job = scrape.Job(d.path, 1) + #Summary is present + #So this cannot be a DeadReason + assert not scrape.DeadReason.could_be(job) + + def test_lockdepreason(self): + lkReason = None + with FakeResultDir(assertion=\ + "FAILED assert common/lockdep reason\n") as d: + job = scrape.Job(d.path, 1) + assert scrape.LockdepReason.could_be(job) + + lkReason = scrape.LockdepReason(job) + #Backtraces of same jobs must match 100% + assert lkReason.match(job) + with FakeResultDir(blank_backtrace=True) as d: + #Corresponding to 0% match + assert not lkReason.match(scrape.Job(d.path, 2)) + + def test_assertionreason(self): + with FakeResultDir() as d: + job = scrape.Job(d.path, 1) + assert scrape.AssertionReason.could_be(job) + + def test_genericreason(self): + d1 = FakeResultDir(blank_backtrace=True) + d2 = FakeResultDir(failure_reason="Dummy dummy") + d3 = FakeResultDir() + + job1 = scrape.Job(d1.path, 1) + job2 = scrape.Job(d2.path, 2) + job3 = scrape.Job(d3.path, 3) + + reason = scrape.GenericReason(job3) + + assert reason.match(job2) + assert not reason.match(job1) + + shutil.rmtree(d1.path) + shutil.rmtree(d2.path) + shutil.rmtree(d3.path) + + def test_valgrindreason(self): + vreason = None + with FakeResultDir( + failure_reason="saw valgrind issues", + assertion="2014-08-22T20:07:18.668 ERROR:tasks.ceph:saw valgrind issue Leak_DefinitelyLost in /var/log/ceph/valgrind/osd.3.log.gz\n" + ) as d: + job = scrape.Job(d.path, 1) + assert scrape.ValgrindReason.could_be(job) + + vreason = 
scrape.ValgrindReason(job) + assert vreason.match(job) + + def test_give_me_a_reason(self): + with FakeResultDir() as d: + job = scrape.Job(d.path, 1) + + assert type(scrape.give_me_a_reason(job)) == scrape.AssertionReason + + #Test the lockdep ordering + with FakeResultDir(assertion=\ + "FAILED assert common/lockdep reason\n") as d: + job = scrape.Job(d.path, 1) + assert type(scrape.give_me_a_reason(job)) == scrape.LockdepReason + + def test_scraper(self): + d = FakeResultDir() + os.mkdir(os.path.join(d.path, "test")) + shutil.move( + os.path.join(d.path, "config.yaml"), + os.path.join(d.path, "test", "config.yaml") + ) + shutil.move( + os.path.join(d.path, "summary.yaml"), + os.path.join(d.path, "test", "summary.yaml") + ) + shutil.move( + os.path.join(d.path, "teuthology.log"), + os.path.join(d.path, "test", "teuthology.log") + ) + + scrape.Scraper(d.path).analyze() + + #scrape.log should be created + assert os.path.exists(os.path.join(d.path, "scrape.log")) + + shutil.rmtree(d.path) + + def test_gzip_backtrace_decode(self): + with FakeResultDir(assertion="FAILED assert dummy backtrace line", + blank_backtrace=True, + assertion_osd=True) as d: + + with open(os.path.join(d.path, "teuthology.log"), "a") as root_log: + root_log.write( + "command crashed with signal SIGSEGV tasks.ceph.osd.0.host1.stderr\n" + ) + + pattern = os.path.join(d.path, "**", "ceph-osd.0.log") + raws = glob.glob(pattern, recursive=True) + assert len(raws) == 1, f"expected one raw log, found: {raws}" + raw_log = raws[0] + gz_log = raw_log + ".gz" + + with gzip.open(gz_log, "wb") as out: + out.write(open(raw_log, "rb").read()) + os.remove(raw_log) + + assert not os.path.exists(raw_log) + assert os.path.exists(gz_log) + + job = scrape.Job(d.path, 1) + assert job.get_assertion() == "FAILED assert dummy backtrace line" \ No newline at end of file diff --git a/teuthology/test/test_timer.py b/teuthology/test/test_timer.py new file mode 100644 index 000000000..312a9c8b8 --- /dev/null +++ 
b/teuthology/test/test_timer.py @@ -0,0 +1,80 @@ +from teuthology import timer + +from unittest.mock import MagicMock, patch, mock_open +from time import time + + +class TestTimer(object): + def test_data_empty(self): + self.timer = timer.Timer() + assert self.timer.data == dict() + + def test_data_one_mark(self): + self.timer = timer.Timer() + # Avoid failing if ~1ms elapses between these two calls + self.timer.precision = 2 + self.timer.mark() + assert len(self.timer.data['marks']) == 1 + assert self.timer.data['marks'][0]['interval'] == 0 + assert self.timer.data['marks'][0]['message'] == '' + + def test_data_five_marks(self): + self.timer = timer.Timer() + for i in range(5): + self.timer.mark(str(i)) + assert len(self.timer.data['marks']) == 5 + assert [m['message'] for m in self.timer.data['marks']] == \ + ['0', '1', '2', '3', '4'] + + def test_intervals(self): + fake_time = MagicMock() + with patch('teuthology.timer.time.time', fake_time): + self.timer = timer.Timer() + now = start_time = fake_time.return_value = time() + intervals = [0, 1, 1, 2, 3, 5, 8] + for i in intervals: + now += i + fake_time.return_value = now + self.timer.mark(str(i)) + + summed_intervals = [sum(intervals[:x + 1]) for x in range(len(intervals))] + result_intervals = [m['interval'] for m in self.timer.data['marks']] + assert result_intervals == summed_intervals + assert self.timer.data['start'] == \ + self.timer.get_datetime_string(start_time) + assert self.timer.data['end'] == \ + self.timer.get_datetime_string(start_time + summed_intervals[-1]) + assert [m['message'] for m in self.timer.data['marks']] == \ + [str(i) for i in intervals] + assert self.timer.data['elapsed'] == summed_intervals[-1] + + def test_write(self): + _path = '/path' + _safe_dump = MagicMock(name='safe_dump') + with patch('teuthology.timer.yaml.safe_dump', _safe_dump): + with patch('teuthology.timer.open', mock_open(), create=True) as _open: + self.timer = timer.Timer(path=_path) + assert self.timer.path == 
_path + self.timer.write() + _open.assert_called_once_with(_path, 'w') + _safe_dump.assert_called_once_with( + dict(), + _open.return_value.__enter__.return_value, + default_flow_style=False, + ) + + def test_sync(self): + _path = '/path' + _safe_dump = MagicMock(name='safe_dump') + with patch('teuthology.timer.yaml.safe_dump', _safe_dump): + with patch('teuthology.timer.open', mock_open(), create=True) as _open: + self.timer = timer.Timer(path=_path, sync=True) + assert self.timer.path == _path + assert self.timer.sync is True + self.timer.mark() + _open.assert_called_once_with(_path, 'w') + _safe_dump.assert_called_once_with( + self.timer.data, + _open.return_value.__enter__.return_value, + default_flow_style=False, + ) diff --git a/teuthology/test/test_vps_os_vers_parameter_checking.py b/teuthology/test/test_vps_os_vers_parameter_checking.py new file mode 100644 index 000000000..8acfed046 --- /dev/null +++ b/teuthology/test/test_vps_os_vers_parameter_checking.py @@ -0,0 +1,84 @@ +from unittest.mock import patch, Mock + +import teuthology.lock.util +from teuthology import provision + + +class TestVpsOsVersionParamCheck(object): + + def setup_method(self): + self.fake_ctx = Mock() + self.fake_ctx.machine_type = 'vps' + self.fake_ctx.num_to_lock = 1 + self.fake_ctx.lock = False + + def fake_downburst_executable(): + return '' + + self.fake_downburst_executable = fake_downburst_executable + + def test_ubuntu_noble(self): + self.fake_ctx.os_type = 'ubuntu' + self.fake_ctx.os_version = 'noble' + with patch.multiple( + provision.downburst, + downburst_executable=self.fake_downburst_executable, + ): + check_value = teuthology.lock.util.vps_version_or_type_valid( + self.fake_ctx.machine_type, + self.fake_ctx.os_type, + self.fake_ctx.os_version) + + assert check_value + + def test_ubuntu_number(self): + self.fake_ctx.os_type = 'ubuntu' + self.fake_ctx.os_version = '24.04' + with patch.multiple( + provision.downburst, + downburst_executable=self.fake_downburst_executable, 
+ ): + check_value = teuthology.lock.util.vps_version_or_type_valid( + self.fake_ctx.machine_type, + self.fake_ctx.os_type, + self.fake_ctx.os_version) + assert check_value + + def test_mixup(self): + self.fake_ctx.os_type = '6.5' + self.fake_ctx.os_version = 'rhel' + with patch.multiple( + provision.downburst, + downburst_executable=self.fake_downburst_executable, + ): + check_value = teuthology.lock.util.vps_version_or_type_valid( + self.fake_ctx.machine_type, + self.fake_ctx.os_type, + self.fake_ctx.os_version) + assert not check_value + + def test_bad_type(self): + self.fake_ctx.os_type = 'aardvark' + self.fake_ctx.os_version = '6.5' + with patch.multiple( + provision.downburst, + downburst_executable=self.fake_downburst_executable, + ): + check_value = teuthology.lock.util.vps_version_or_type_valid( + self.fake_ctx.machine_type, + self.fake_ctx.os_type, + self.fake_ctx.os_version) + assert not check_value + + def test_bad_version(self): + self.fake_ctx.os_type = 'ubuntu' + self.fake_ctx.os_version = 'vampire_bat' + with patch.multiple( + provision.downburst, + downburst_executable=self.fake_downburst_executable, + ): + check_value = teuthology.lock.util.vps_version_or_type_valid( + self.fake_ctx.machine_type, + self.fake_ctx.os_type, + self.fake_ctx.os_version) + assert not check_value diff --git a/teuthology/timer.py b/teuthology/timer.py new file mode 100644 index 000000000..9ca5d76a7 --- /dev/null +++ b/teuthology/timer.py @@ -0,0 +1,114 @@ +import logging +import time +import yaml + +import datetime + +log = logging.getLogger(__name__) + + +class Timer(object): + """ + A class that records timing data. + + It was created in order to record time intervals between the execution of + different tasks' enter and exit methods. 
+ """ + # How many decimal places to use for time intervals + precision = 3 + # The format to use for date-time strings + datetime_format = '%Y-%m-%d_%H:%M:%S' + + def __init__(self, path=None, sync=False): + """ + :param path: A path to a file to be written when self.write() is + called. The file will contain self.data in yaml + format. + :param sync: Whether or not to call self.write() from within + self.mark() + """ + if sync and not path: + raise ValueError( + "When providing sync=True, a path must be specified!") + self.path = path + self.sync = sync + self.marks = list() + self.start_time = None + self.start_string = None + + def mark(self, message=''): + """ + Create a time mark + + If necessary, call self._mark_start() to begin time-keeping. Then, + create a new entry in self.marks with the message provided, along with + the time elapsed in seconds since time-keeping began. + """ + if self.start_time is None: + self._mark_start(message) + interval = round(time.time() - self.start_time, self.precision) + mark = dict( + interval=interval, + message=message, + ) + self.marks.append(mark) + if self.sync: + self.write() + + def _mark_start(self, message): + """ + Create the initial time mark + """ + self.start_time = time.time() + self.start_string = self.get_datetime_string(self.start_time) + + def get_datetime_string(self, time): + """ + Return a human-readable timestamp in UTC + + :param time: Time in seconds; like from time.time() + """ + _datetime = datetime.datetime.fromtimestamp(time, datetime.timezone.utc) + return datetime.datetime.strftime( + _datetime, + self.datetime_format, + ) + + @property + def data(self): + """ + Return an object similar to:: + + {'start': '2016-02-02_23:19:51', + 'elapsed': 10.65, + 'end': '2016-02-02_23:20:01', + 'marks': [ + {'message': 'event 1', 'interval': 0.0}, + {'message': 'event 2', 'interval': 8.58}, + {'message': 'event 3', 'interval': 10.65} + ], + } + + 'start' and 'end' times are in UTC. 
+ """ + if not self.start_string: + return dict() + if len(self.marks) <= 1: + end_interval = 0 + else: + end_interval = self.marks[-1]['interval'] + end_time = self.start_time + end_interval + result = dict( + start=self.start_string, + marks=self.marks, + end=self.get_datetime_string(end_time), + elapsed=end_interval, + ) + return result + + def write(self): + try: + with open(self.path, 'w') as f: + yaml.safe_dump(self.data, f, default_flow_style=False) + except Exception: + log.exception("Failed to write timing.yaml !") diff --git a/teuthology/util/__init__.py b/teuthology/util/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/teuthology/util/compat.py b/teuthology/util/compat.py new file mode 100644 index 000000000..fc654e3d6 --- /dev/null +++ b/teuthology/util/compat.py @@ -0,0 +1,16 @@ +import sys + +PY3 = False + +if sys.version_info >= (3, 0): + PY3 = True + +if PY3: + from urllib.parse import parse_qs, urljoin, urlparse, urlencode # noqa: F401 + from urllib.request import urlopen, Request # noqa: F401 + from urllib.error import HTTPError # noqa: F401 +else: + from urlparse import parse_qs, urljoin, urlparse # noqa: F401 + from urllib import urlencode # noqa: F401 + from urllib2 import urlopen, Request, HTTPError # noqa: F401 + diff --git a/teuthology/util/flock.py b/teuthology/util/flock.py new file mode 100644 index 000000000..f381d8b51 --- /dev/null +++ b/teuthology/util/flock.py @@ -0,0 +1,22 @@ +import fcntl + + +class FileLock(object): + def __init__(self, filename, noop=False): + self.filename = filename + self.file = None + self.noop = noop + + def __enter__(self): + if not self.noop: + assert self.file is None + self.file = open(self.filename, 'w') + fcntl.lockf(self.file, fcntl.LOCK_EX) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if not self.noop: + assert self.file is not None + fcntl.lockf(self.file, fcntl.LOCK_UN) + self.file.close() + self.file = None diff --git a/teuthology/util/loggerfile.py 
b/teuthology/util/loggerfile.py new file mode 100644 index 000000000..3dd786258 --- /dev/null +++ b/teuthology/util/loggerfile.py @@ -0,0 +1,19 @@ +import logging + +class LoggerFile(object): + """ + A thin wrapper around a logging.Logger instance that provides a file-like + interface. + + Used by Ansible.execute_playbook() when it calls pexpect.run() + """ + def __init__(self, logger: logging.Logger, level: int): + self.logger = logger + self.level = level + + def write(self, string): + self.logger.log(self.level, string.decode('utf-8', 'ignore')) + + def flush(self): + pass + diff --git a/teuthology/util/scanner.py b/teuthology/util/scanner.py new file mode 100644 index 000000000..421d5a028 --- /dev/null +++ b/teuthology/util/scanner.py @@ -0,0 +1,159 @@ +import logging +import yaml +from typing import Optional, Tuple +from collections import defaultdict +from lxml import etree + +log = logging.getLogger(__name__) + + +class Scanner(): + def __init__(self, remote=None) -> None: + self.summary_data = [] + self.remote = remote + + def _parse(self, file_content) -> Tuple[str, dict]: + """ + This parses file_content and returns: + :returns: a message string + :returns: data dictionary with additional info + + Just an abstract method in Scanner class, + to be defined in inherited classes. + """ + raise NotImplementedError + + def scan_file(self, path: str) -> Optional[str]: + if not path: + return None + try: + file = self.remote._sftp_open_file(path, 'r') + file_content = file.read() + txt, data = self._parse(file_content) + if data: + data["file_path"] = path + self.summary_data += [data] + file.close() + return txt + except Exception as exc: + log.error(str(exc)) + + def scan_all_files(self, path_regex: str) -> [str]: + """ + Scans all files matching path_regex + and collect additional data in self.summary_data + + :param path_regex: Regex string to find all the files which have to be scanned. 
+ Example: /path/to/dir/*.xml + """ + (_, stdout, _) = self.remote.ssh.exec_command(f'ls -d {path_regex}', timeout=200) + + files = stdout.read().decode().split('\n') + + extracted_txts = [] + for fpath in files: + txt = self.scan_file(fpath) + if txt: + extracted_txts += [txt] + return extracted_txts + + def write_summary(self, yaml_path: str) -> None: + """ + Create yaml file locally + with self.summary_data. + """ + if self.summary_data and yaml_path: + with open(yaml_path, 'a') as f: + yaml.safe_dump(self.summary_data, f, default_flow_style=False) + else: + log.info("summary_data or yaml_file is empty!") + + +class UnitTestScanner(Scanner): + def __init__(self, remote=None) -> None: + super().__init__(remote) + + def _parse(self, file_content: str) -> Tuple[Optional[str], Optional[dict]]: + xml_tree = etree.fromstring(file_content) + + failed_testcases = xml_tree.xpath('.//failure/.. | .//error/..') + if len(failed_testcases) == 0: + return None, None + + exception_txt = "" + error_data = defaultdict(list) + for testcase in failed_testcases: + testcase_name = testcase.get("name", "test-name") + testcase_suitename = testcase.get("classname", "suite-name") + for child in testcase: + if child.tag in ['failure', 'error']: + fault_kind = child.tag + reason = child.get('message', 'No message found in xml output, check logs.') + short_reason = (reason[:200].strip() + '...') if len(reason) > 200 else reason.strip() + error_data[testcase_suitename] += [{ + "kind": fault_kind, + "testcase": testcase_name, + "message": reason, + }] + if not exception_txt: + exception_txt = f'{fault_kind.upper()}: Test `{testcase_name}` of `{testcase_suitename}`. Reason: {short_reason}.' 
+ + return exception_txt, { "failed_testsuites": dict(error_data), "num_of_failures": len(failed_testcases) } + + @property + def num_of_total_failures(self): + total_failed_testcases = 0 + if self.summary_data: + for file_data in self.summary_data: + failed_tests = file_data.get("num_of_failures", 0) + total_failed_testcases += failed_tests + return total_failed_testcases + + def scan_and_write(self, path_regex: str, summary_path: str) -> Optional[str]: + """ + Scan all files matching 'path_regex' + and write summary in 'summary_path'. + """ + try: + errors = self.scan_all_files(path_regex) + self.write_summary(summary_path) + if errors: + count = self.num_of_total_failures + return f"(total {count} failed) " + errors[0] + except Exception as scanner_exc: + log.error(str(scanner_exc)) + + +class ValgrindScanner(Scanner): + def __init__(self, remote=None) -> None: + super().__init__(remote) + + def _parse(self, file_content: str) -> Tuple[Optional[str], Optional[dict]]: + xml_tree = etree.fromstring(file_content) + if xml_tree is None: + return None, None + + error_tree = xml_tree.find('error') + if error_tree is None: + return None, None + + error_data = { + "kind": error_tree.findtext("kind"), + "traceback": [], + } + for frame in error_tree.xpath("stack/frame"): + if len(error_data["traceback"]) >= 5: + break + curr_frame = { + "file": f"{frame.findtext('dir', '')}/{frame.findtext('file', '')}", + "line": frame.findtext("line", ''), + "function": frame.findtext("fn", ''), + } + error_data["traceback"].append(curr_frame) + + traceback_functions = "\n".join( + frame.get("function", "N/A") + for frame in error_data["traceback"][:3] + ) + exception_text = f"valgrind error: {error_data['kind']}\n{traceback_functions}" + return exception_text, error_data diff --git a/teuthology/util/sentry.py b/teuthology/util/sentry.py new file mode 100644 index 000000000..ed767745b --- /dev/null +++ b/teuthology/util/sentry.py @@ -0,0 +1,52 @@ +import logging +import sentry_sdk + 
+from copy import deepcopy + +from teuthology.config import config as teuth_config +from teuthology.misc import get_http_log_path + +log = logging.getLogger(__name__) + + +def report_error(job_config, exception, task_name=None): + if not teuth_config.sentry_dsn: + return None + sentry_sdk.init(teuth_config.sentry_dsn) + job_config = deepcopy(job_config) + + tags = { + 'task': task_name, + 'owner': job_config.get("owner"), + } + optional_tags = ('teuthology_branch', 'branch', 'suite', + 'machine_type', 'os_type', 'os_version') + for tag in optional_tags: + if tag in job_config: + tags[tag] = job_config[tag] + + # Remove ssh keys from reported config + if 'targets' in job_config: + targets = job_config['targets'] + for host in targets.keys(): + targets[host] = '' + + job_id = job_config.get('job_id') + archive_path = job_config.get('archive_path') + extras = dict(config=job_config) + if job_id: + extras['logs'] = get_http_log_path(archive_path, job_id) + + fingerprint = exception.fingerprint() if hasattr(exception, 'fingerprint') else None + exc_id = sentry_sdk.capture_exception( + error=exception, + tags=tags, + extras=extras, + fingerprint=fingerprint, + ) + event_url = "{server}/?query={id}".format( + server=teuth_config.sentry_server.strip('/'), id=exc_id) + log.exception(" Sentry event: %s" % event_url) + return event_url + + diff --git a/teuthology/util/test/files/test_unit_test.xml b/teuthology/util/test/files/test_unit_test.xml new file mode 100644 index 000000000..bd9c73434 --- /dev/null +++ b/teuthology/util/test/files/test_unit_test.xml @@ -0,0 +1,7 @@ + + + + + + + \ No newline at end of file diff --git a/teuthology/util/test/files/test_valgrind.xml b/teuthology/util/test/files/test_valgrind.xml new file mode 100644 index 000000000..41bf8375f --- /dev/null +++ b/teuthology/util/test/files/test_valgrind.xml @@ -0,0 +1,31 @@ + + + + 0x870fc + 1 + Leak_DefinitelyLost + + 1,234 bytes in 1 blocks are definitely lost in loss record 198 of 201 + 1234 + 1 + + + + 
0x4C39B6F + /usr/libexec/valgrind/vgpreload_memcheck-amd64-linux.so + operator new[](unsigned long) + /builddir/build/BUILD/valgrind-3.19.0/coregrind/m_replacemalloc + vg_replace_malloc.c + 640 + + + 0xF3F4B5 + /usr/bin/ceph-osd + ceph::common::leak_some_memory() + /usr/src/debug/ceph-18.0.0-5567.g64a4fc94.el8.x86_64/src/common + ceph_context.cc + 510 + + + + diff --git a/teuthology/util/test/test_scanner.py b/teuthology/util/test/test_scanner.py new file mode 100644 index 000000000..928d4305b --- /dev/null +++ b/teuthology/util/test/test_scanner.py @@ -0,0 +1,191 @@ +from mock import patch, MagicMock + +from io import BytesIO +import os, io + +from teuthology.orchestra import remote +from teuthology.util.scanner import UnitTestScanner, ValgrindScanner + + +class MockFile(io.StringIO): + def close(self): + pass + + +class TestUnitTestScanner(object): + + def setup_method(self): + self.remote = remote.Remote( + name='jdoe@xyzzy.example.com', ssh=MagicMock()) + self.test_values = { + "xml_path": os.path.dirname(__file__) + "/files/test_unit_test.xml", + "error_msg": "FAILURE: Test `test_set_bucket_tagging` of `s3tests_boto3.functional.test_s3`. 
\ +Reason: 'NoSuchTagSetError' != 'NoSuchTagSet'.", + "summary_data": [{'failed_testsuites': {'s3tests_boto3.functional.test_s3': + [{'kind': 'failure', 'testcase': 'test_set_bucket_tagging', + 'message': "'NoSuchTagSetError' != 'NoSuchTagSet'"}]}, + 'num_of_failures': 1, + 'file_path': f'{os.path.dirname(__file__)}/files/test_unit_test.xml'}], + "yaml_data": r"""- failed_testsuites: + s3tests_boto3.functional.test_s3: + - kind: failure + message: '''NoSuchTagSetError'' != ''NoSuchTagSet''' + testcase: test_set_bucket_tagging + file_path: {file_dir}/files/test_unit_test.xml + num_of_failures: 1 +""".format(file_dir=os.path.dirname(__file__)) + } + + @patch('teuthology.util.scanner.UnitTestScanner.write_summary') + def test_scan_and_write(self, m_write_summary): + xml_path = self.test_values["xml_path"] + self.remote.ssh.exec_command.return_value = (None, BytesIO(xml_path.encode('utf-8')), None) + m_open = MagicMock() + m_open.return_value = open(xml_path, "rb") + self.remote._sftp_open_file = m_open + result = UnitTestScanner(remote=self.remote).scan_and_write(xml_path, "test_summary.yaml") + assert result == "(total 1 failed) " + self.test_values["error_msg"] + + def test_parse(self): + xml_content = b'\n\n\n' + scanner = UnitTestScanner(self.remote) + result = scanner._parse(xml_content) + assert result == ( + 'FAILURE: Test `abc` of `xyz`. 
Reason: error_msg.', + {'failed_testsuites': {'xyz': + [{'kind': 'failure','message': 'error_msg','testcase': 'abc'}]}, + 'num_of_failures': 1 + } + ) + + def test_scan_file(self): + xml_path = self.test_values["xml_path"] + m_open = MagicMock() + m_open.return_value = open(xml_path, "rb") + self.remote._sftp_open_file = m_open + scanner = UnitTestScanner(remote=self.remote) + result = scanner.scan_file(xml_path) + assert result == self.test_values["error_msg"] + assert scanner.summary_data == self.test_values["summary_data"] + + def test_scan_all_files(self): + xml_path = self.test_values["xml_path"] + self.remote.ssh.exec_command.return_value = (None, BytesIO(xml_path.encode('utf-8')), None) + m_open = MagicMock() + m_open.return_value = open(xml_path, "rb") + self.remote._sftp_open_file = m_open + scanner = UnitTestScanner(remote=self.remote) + result = scanner.scan_all_files(xml_path) + assert result == [self.test_values["error_msg"]] + + @patch('builtins.open') + def test_write_summary(self, m_open): + scanner = UnitTestScanner(self.remote) + mock_yaml_file = MockFile() + scanner.summary_data = self.test_values["summary_data"] + m_open.return_value = mock_yaml_file + scanner.write_summary("path/file.yaml") + written_content = mock_yaml_file.getvalue() + assert written_content == self.test_values["yaml_data"] + + +class TestValgrindScanner(object): + + def setup_method(self): + self.remote = remote.Remote( + name='jdoe@xyzzy.example.com', ssh=MagicMock()) + self.test_values = { + "xml_path": os.path.dirname(__file__) + "/files/test_valgrind.xml", + "error_msg": "valgrind error: Leak_DefinitelyLost\noperator new[]\ +(unsigned long)\nceph::common::leak_some_memory()", + "summary_data": [{'kind': 'Leak_DefinitelyLost', 'traceback': [{'file': + '/builddir/build/BUILD/valgrind-3.19.0/coregrind/m_replacemalloc/vg_replace_malloc.c', + 'line': '640', 'function': 'operator new[](unsigned long)'}, + {'file': 
'/usr/src/debug/ceph-18.0.0-5567.g64a4fc94.el8.x86_64/src/common/ceph_context.cc', + 'line': '510', 'function': 'ceph::common::leak_some_memory()'}], 'file_path': + f'{os.path.dirname(__file__)}/files/test_valgrind.xml'}], + "yaml_data": r"""- file_path: {file_dir}/files/test_valgrind.xml + kind: Leak_DefinitelyLost + traceback: + - file: /builddir/build/BUILD/valgrind-3.19.0/coregrind/m_replacemalloc/vg_replace_malloc.c + function: operator new[](unsigned long) + line: '640' + - file: /usr/src/debug/ceph-18.0.0-5567.g64a4fc94.el8.x86_64/src/common/ceph_context.cc + function: ceph::common::leak_some_memory() + line: '510' +""".format(file_dir=os.path.dirname(__file__)) + } + + def test_parse_with_traceback(self): + xml_content = b''' + + + Leak_DefinitelyLost + + + func() + /dir + file1.ext + 640 + + + + +''' + scanner = ValgrindScanner(self.remote) + result = scanner._parse(xml_content) + assert result == ( + 'valgrind error: Leak_DefinitelyLost\nfunc()', + {'kind': 'Leak_DefinitelyLost', 'traceback': + [{'file': '/dir/file1.ext', 'line': '640', 'function': 'func()'}] + } + ) + + def test_parse_without_trackback(self): + xml_content = b''' + + + Leak_DefinitelyLost + + + + +''' + scanner = ValgrindScanner(self.remote) + result = scanner._parse(xml_content) + assert result == ( + 'valgrind error: Leak_DefinitelyLost\n', + {'kind': 'Leak_DefinitelyLost', 'traceback': []} + ) + + def test_scan_file(self): + xml_path = self.test_values["xml_path"] + m_open = MagicMock() + m_open.return_value = open(xml_path, "rb") + self.remote._sftp_open_file = m_open + scanner = ValgrindScanner(remote=self.remote) + result = scanner.scan_file(xml_path) + assert result == self.test_values["error_msg"] + assert scanner.summary_data == self.test_values["summary_data"] + + def test_scan_all_files(self): + xml_path = self.test_values["xml_path"] + self.remote.ssh.exec_command.return_value = (None, BytesIO(xml_path.encode('utf-8')), None) + m_open = MagicMock() + m_open.return_value = 
open(xml_path, "rb") + self.remote._sftp_open_file = m_open + scanner = ValgrindScanner(remote=self.remote) + result = scanner.scan_all_files(xml_path) + assert result == [self.test_values["error_msg"]] + + @patch('builtins.open') + def test_write_summary(self, m_open): + scanner = ValgrindScanner(self.remote) + mock_yaml_file = MockFile() + scanner.summary_data = self.test_values["summary_data"] + m_open.return_value = mock_yaml_file + scanner.write_summary("path/file.yaml") + written_content = mock_yaml_file.getvalue() + assert written_content == self.test_values["yaml_data"] \ No newline at end of file diff --git a/teuthology/util/test/test_time.py b/teuthology/util/test/test_time.py new file mode 100644 index 000000000..c37b5f2f5 --- /dev/null +++ b/teuthology/util/test/test_time.py @@ -0,0 +1,54 @@ +import pytest + +from datetime import datetime, timedelta, timezone +from typing import Type + +from teuthology.util import time + + +@pytest.mark.parametrize( + ["timestamp", "result"], + [ + ["1999-12-31_23:59:59", datetime(1999, 12, 31, 23, 59, 59, tzinfo=timezone.utc)], + ["1999-12-31_23:59", datetime(1999, 12, 31, 23, 59, 0, tzinfo=timezone.utc)], + ["1999-12-31T23:59:59", datetime(1999, 12, 31, 23, 59, 59, tzinfo=timezone.utc)], + ["1999-12-31T23:59:59+00:00", datetime(1999, 12, 31, 23, 59, 59, tzinfo=timezone.utc)], + ["1999-12-31T17:59:59-06:00", datetime(1999, 12, 31, 23, 59, 59, tzinfo=timezone.utc)], + ["2024-01-01", datetime(2024, 1, 1, 0, 0, tzinfo=timezone.utc)], + ["tomorrow", ValueError], + ["1d", ValueError], + ["", ValueError], + ["2024", ValueError], + + ] +) +def test_parse_timestamp(timestamp: str, result: datetime | Type[Exception]): + if isinstance(result, datetime): + assert time.parse_timestamp(timestamp) == result + else: + with pytest.raises(result): + time.parse_timestamp(timestamp) + + +@pytest.mark.parametrize( + ["offset", "result"], + [ + ["1s", timedelta(seconds=1)], + ["1m", timedelta(minutes=1)], + ["1h", timedelta(hours=1)], + 
["1d", timedelta(days=1)], + ["1w", timedelta(weeks=1)], + ["365d", timedelta(days=365)], + ["1x", ValueError], + ["-1m", ValueError], + ["0xde", ValueError], + ["frog", ValueError], + ["7dwarfs", ValueError], + ] +) +def test_parse_offset(offset: str, result: timedelta | Type[Exception]): + if isinstance(result, timedelta): + assert time.parse_offset(offset) == result + else: + with pytest.raises(result): + time.parse_offset(offset) diff --git a/teuthology/util/time.py b/teuthology/util/time.py new file mode 100644 index 000000000..8e0525fcc --- /dev/null +++ b/teuthology/util/time.py @@ -0,0 +1,52 @@ +import re + +from datetime import datetime, timedelta, timezone + +# When we're not using ISO format, we're using this +TIMESTAMP_FMT = "%Y-%m-%d_%H:%M:%S" + +def parse_timestamp(timestamp: str) -> datetime: + """ + timestamp: A string either in ISO 8601 format or TIMESTAMP_FMT. + If no timezone is specified, UTC is assumed. + + :returns: a datetime object + """ + try: + dt = datetime.fromisoformat(timestamp) + except ValueError: + dt = datetime.strptime(timestamp, TIMESTAMP_FMT) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt + +def parse_offset(offset: str) -> timedelta: + """ + offset: A string consisting of digits followed by one of the following + characters: + s: seconds + m: minutes + h: hours + d: days + w: weeks + """ + err_msg = "Offsets must either be an ISO 8601-formatted timestamp or " \ + f"a relative value like '2w', '1d', '7h', '45m', '90s'. 
Got: {offset}" + match = re.match(r'(\d+)(s|m|h|d|w)$', offset) + if match is None: + raise ValueError(err_msg) + num = int(match.groups()[0]) + unit = match.groups()[1] + match unit: + case 's': + return timedelta(seconds=num) + case 'm': + return timedelta(minutes=num) + case 'h': + return timedelta(hours=num) + case 'd': + return timedelta(days=num) + case 'w': + return timedelta(weeks=num) + case _: + raise ValueError(err_msg) diff --git a/tox.ini b/tox.ini new file mode 100644 index 000000000..40c48b071 --- /dev/null +++ b/tox.ini @@ -0,0 +1,82 @@ +[tox] +envlist = docs, py3, flake8 +isolated_build = True + +[testenv] +setenv = + LC_ALL=en_US.UTF-8 + LANG=en_US + +[testenv:py3] +install_command = pip install --upgrade {opts} {packages} +passenv = HOME +deps= + -r{toxinidir}/requirements.txt + pytest-cov + coverage + mock +extras = test +log_format = %(asctime)s %(levelname)s %(message)s +commands= + python -m pytest --cov=teuthology --cov-report=term -v {posargs:teuthology scripts} + +[testenv:flake8] +install_command = pip install --upgrade {opts} {packages} +deps= + flake8 +commands=flake8 --select=F,E9 {posargs:teuthology scripts} + +[testenv:docs] +install_command = pip install --upgrade {opts} {packages} +changedir=docs +deps= + -r{toxinidir}/requirements.txt + sphinx != 7.2.0, != 7.2.1, != 7.2.2 + sphinxcontrib-programoutput +commands= + sphinx-apidoc -f -o . ../teuthology ../teuthology/test ../teuthology/orchestra/test ../teuthology/task/test + sphinx-build -b html -d {envtmpdir}/doctrees . 
{envtmpdir}/html + +[testenv:openstack] +install_command = pip install --upgrade {opts} {packages} +passenv = + HOME + OS_REGION_NAME + OS_AUTH_URL + OS_TENANT_ID + OS_TENANT_NAME + OS_PASSWORD + OS_USERNAME +deps= + -r{toxinidir}/requirements.txt +extras = test +commands=py.test -v {posargs:teuthology/openstack/test/test_openstack.py} + +[testenv:openstack-integration] +passenv = + HOME + OS_REGION_NAME + OS_AUTH_URL + OS_TENANT_ID + OS_TENANT_NAME + OS_PASSWORD + OS_USERNAME +deps= + -r{toxinidir}/requirements.txt +extras = test +commands= + py.test -v {posargs} teuthology/openstack/test/openstack-integration.py + +[testenv:openstack-delegate] +passenv = + HOME + OS_REGION_NAME + OS_AUTH_URL + OS_TENANT_ID + OS_TENANT_NAME + OS_PASSWORD + OS_USERNAME +sitepackages=True +deps= + -r{toxinidir}/requirements.txt +commands={toxinidir}/openstack-delegate.sh diff --git a/update-requirements.sh b/update-requirements.sh new file mode 100755 index 000000000..a2b56ba8d --- /dev/null +++ b/update-requirements.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +pip-compile --extra=test $@ pyproject.toml diff --git a/watch-suite.sh b/watch-suite.sh new file mode 100755 index 000000000..04a5d34ee --- /dev/null +++ b/watch-suite.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +watch "pwd ; echo \`teuthology-ls . | grep -c pass\` passes ; teuthology-ls . | grep -v pass" +