From: David Galloway Date: Wed, 18 Feb 2026 19:12:02 +0000 (-0500) Subject: prep-fog-capture: Refactor, split, re-add cobbler-provided hacks X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=refs%2Fheads%2Fprepfog-v2;p=ceph-cm-ansible.git prep-fog-capture: Refactor, split, re-add cobbler-provided hacks - Using systemd instead of rc.local: - Configure netplan on boot for Ubuntu - Configure NetworkManager on boot for CentOS/Rocky - Set hostname - Update packages and reboot Signed-off-by: David Galloway --- diff --git a/tools/prep-fog-capture.yml b/tools/prep-fog-capture.yml index 260e2890..bee0c477 100644 --- a/tools/prep-fog-capture.yml +++ b/tools/prep-fog-capture.yml @@ -1,181 +1,14 @@ --- -### This standalone playbook can be used to prep a COBBLER-IMAGED testnode +### This role is used to prep a {FOG|MAAS}-IMAGED testnode ### so that it can be used to capture an OS image for FOG. ### This playbook is needed for a couple reasons ### - NIC configs get hard coded into the captured FOG images so nodes reimaged by FOG don't come up with network +### - SSH host keys need to be deleted +### - apt and cloud-init services need to be disabled - hosts: - testnodes - become: true + roles: + - prep-fog-capture gather_facts: false - tasks: - - # (Missing in RHEL8) - - name: Check for /usr/bin/python - shell: echo marco - register: polo - ignore_errors: true - - - name: Set ansible_python_interpreter=/usr/bin/python3 - set_fact: - ansible_python_interpreter: /usr/bin/python3 - when: polo is failed - - # Now that we know where python is, we can gather_facts - - setup: - - # We need to leave /.cephlab_rc_local or else each FOG reimage would tell Cobbler to run ceph-cm-ansible - - name: Remove lock files and udev rules - file: - path: "{{ item }}" - state: absent - with_items: - - /etc/udev/rules.d/70-persistent-net.rules - - /.cephlab_net_configured - - /ceph-qa-ready - - - name: Get list of ifcfg scripts from host used to capture image - shell: 
"ls -1 /etc/sysconfig/network-scripts/ifcfg-* | grep -v ifcfg-lo" - register: ifcfg_scripts - when: ansible_os_family == "RedHat" - ignore_errors: true - - - name: Get list of ifcfg scripts from host used to capture image - shell: "ls -1 /etc/sysconfig/network/ifcfg-* | grep -v ifcfg-lo" - register: ifcfg_scripts - when: ansible_os_family == "Suse" - ignore_errors: true - - - name: Delete ifcfg scripts - file: - path: "{{ item }}" - state: absent - with_items: "{{ ifcfg_scripts.stdout_lines|default([]) }}" - when: ifcfg_scripts is defined - - - name: Remove /var/lib/ceph mountpoint from fstab - shell: sed -i '/\/var\/lib\/ceph/d' /etc/fstab - - - name: Unmount /var/lib/ceph - ansible.posix.mount: - path: /var/lib/ceph - state: unmounted - - - name: Install one-shot service to regenerate SSH host keys on first boot - copy: - dest: /etc/systemd/system/regen-ssh-hostkeys.service - owner: root - group: root - mode: '0644' - content: | - [Unit] - Description=Regenerate SSH host keys on first boot - ConditionPathExists=!/etc/ssh/ssh_host_ed25519_key - Before=ssh.service - - [Service] - Type=oneshot - ExecStart=/usr/bin/ssh-keygen -A - ExecStartPost=/bin/systemctl disable regen-ssh-hostkeys.service - - [Install] - WantedBy=multi-user.target - - - name: Reload systemd daemon - systemd: - daemon_reload: true - - - name: Enable regen-ssh-hostkeys.service - systemd: - name: regen-ssh-hostkeys.service - enabled: true - - - name: Get list of SSH host keys - shell: "ls -1 /etc/ssh/ssh_host_*" - register: ssh_host_keys - ignore_errors: true - - # Key regeneration is done automatically on CentOS firstboot. 
- # For Ubuntu, we'll add `dpkg-reconfigure openssh-server` to rc.local - - name: Delete SSH host keys so they're generated during firstboot on cloned machines - file: - path: "{{ item }}" - state: absent - with_items: "{{ ssh_host_keys.stdout_lines|default([]) }}" - when: ssh_host_keys is defined - - - name: Unsubscribe RHEL - command: subscription-manager unregister - when: ansible_distribution == "RedHat" - failed_when: false - - # A file gets leftover when a testnode is registered with Satellite that caused - # each registered subsequent testnode to report the wrong hostname - - name: Clean up katello facts - file: - path: /etc/rhsm/facts/katello.facts - state: absent - when: ansible_distribution == "RedHat" - - # https://bugzilla.redhat.com/show_bug.cgi?id=1814337 - - name: Disable dnf-makecache service - service: - name: dnf-makecache.timer - state: stopped - enabled: no - when: - - ansible_os_family == "RedHat" - - ansible_distribution_major_version|int >= 8 - - # Hopefully fixes https://github.com/ceph/ceph-cm-ansible/pull/544#issuecomment-599076564 - - name: Clean DNF cache - shell: "dnf clean all && rm -rf /var/cache/dnf/*" - when: - - ansible_os_family == "RedHat" - - ansible_distribution_major_version|int >= 8 - - - set_fact: - ntp_service: ntp - when: ansible_os_family == "Debian" - - - set_fact: - ntp_service: ntpd - when: ansible_os_family == "RedHat" and ansible_distribution_major_version|int <= 7 - - - set_fact: - ntp_service: chronyd - when: (ansible_os_family == "RedHat" and ansible_distribution_major_version|int >= 8) or - ansible_os_family == "Suse" - - - name: "Stop {{ ntp_service }} service" - service: - name: "{{ ntp_service }}" - state: stopped - when: '"ntp" in ntp_service' - - # The theory here is although we do have the ntp service running on boot, - # if the time is off, it slowly drifts back in sync. Since our testnodes - # are ephemeral, they don't ever have enough time to correctly drift - # back to the correct time. 
So we'll force it in the captured OS images. - - name: Install ntpdate command if missing - package: - name: ntpdate - state: present - when: '"ntp" in ntp_service' - - - name: Force time synchronization using stepping | ntp - command: "ntpdate -b {{ ntp_servers|join(' ') }}" - when: '"ntp" in ntp_service' - - - name: "Start {{ ntp_service }}" - service: - name: "{{ ntp_service }}" - state: started - - # chronyd needs to be started in order to force time sync. This differs from ntpd. - - name: Force time synchronization using stepping | chrony - command: chronyc -a makestep - when: '"chrony" in ntp_service' - - - name: Sync the hardware clock - command: "hwclock --systohc" + become: true diff --git a/tools/roles/prep-fog-capture/files/cephlab-set-hostname.service b/tools/roles/prep-fog-capture/files/cephlab-set-hostname.service new file mode 100644 index 00000000..180af02e --- /dev/null +++ b/tools/roles/prep-fog-capture/files/cephlab-set-hostname.service @@ -0,0 +1,14 @@ +[Unit] +Description=Ceph Lab hostname configuration +After=network-online.target nss-lookup.target +Wants=nss-lookup.target + +[Service] +StandardOutput=journal+console +StandardError=journal+console +Type=oneshot +ExecStart=/usr/local/sbin/cephlab-set-hostname.sh +RemainAfterExit=yes + +[Install] +WantedBy=multi-user.target diff --git a/tools/roles/prep-fog-capture/files/cephlab-set-hostname.sh b/tools/roles/prep-fog-capture/files/cephlab-set-hostname.sh new file mode 100644 index 00000000..482df749 --- /dev/null +++ b/tools/roles/prep-fog-capture/files/cephlab-set-hostname.sh @@ -0,0 +1,193 @@ +#!/usr/bin/env bash +# Wait for /.cephlab_net_configured, then set hostname + /etc/hostname + /etc/hosts +# Flow: +# 1) Wait for DHCP/global IPv4 +# 2) Ping CHECK_HOST for up to 10 minutes (from any local IP) +# 3) Once ping works, try reverse DNS for up to 10 minutes (for an IP that can ping) +# 4) Set hostname and rewrite /etc/hostname + /etc/hosts +set -euo pipefail + +# --- Config --- 
+CHECK_HOST="10.20.192.14" # soko04 (must be reachable before we trust DNS) +DEFAULT_NAMESERVER="10.20.192.11" # override via env NAMESERVER or arg1 + +WAIT_FOR_FILE="/.cephlab_net_configured" +HOSTNAME_IS_SET_FILE="/.cephlab_hostname_set" +LOG="/var/log/cephlab-set-hostname.log" + +NAMESERVER="${NAMESERVER:-${1:-${DEFAULT_NAMESERVER}}}" + +MAX_WAIT_SECONDS="300" # wait for /.cephlab_net_configured +PING_WINDOW_SECONDS="600" # 10 minutes +DNS_WINDOW_SECONDS="600" # 10 minutes +LOOP_SLEEP_SECONDS="2" + +# --- Logging --- +touch "$LOG" +chmod 0644 "$LOG" +exec > >(tee -a "$LOG") 2>&1 + +log() { + echo "$(date -u +%FT%T.%N | cut -c1-23) cephlab-set-hostname: $*" >&2 +} + +# --- Helpers --- +get_my_ips() { + ip -4 -o addr show scope global 2>/dev/null \ + | awk '$2 != "docker0" {print $4}' \ + | cut -d/ -f1 \ + || true +} + +# Reverse lookup helper (never non-zero; safe with set -euo pipefail) +reverse_lookup() { + local ip="$1" + local ns="$2" + local name="" + + if command -v dig >/dev/null 2>&1; then + name="$(dig +time=1 +tries=1 +short -x "${ip}" @"${ns}" 2>/dev/null | head -n1 | sed 's/\.$//' || true)" + elif command -v host >/dev/null 2>&1; then + name="$(host -W 1 "${ip}" "${ns}" 2>/dev/null | awk '/domain name pointer/ {print $5}' | sed 's/\.$//' | head -n1 || true)" + elif command -v getent >/dev/null 2>&1; then + name="$(getent hosts "${ip}" 2>/dev/null | awk '{print $2}' | head -n1 || true)" + fi + + echo "${name}" +} + +set_hostname() { + local fqdn="$1" + if command -v hostnamectl >/dev/null 2>&1; then + hostnamectl set-hostname "${fqdn}" + else + hostname "${fqdn}" + fi +} + +can_ping_from_ip() { + local src_ip="$1" + # More tolerant per-attempt check but bounded: + # 3 packets, 1s apart, wait up to 2s each; hard cap 10s. + timeout 10s ping -I "${src_ip}" -nq -c3 -i 1 -W 2 "${CHECK_HOST}" >/dev/null 2>&1 +} + +# --- Main --- +if [[ -f "${HOSTNAME_IS_SET_FILE}" ]]; then + log "We've already set the hostname before. Exiting..." 
+ exit 0 +fi + +log "Waiting for ${WAIT_FOR_FILE} (up to ${MAX_WAIT_SECONDS}s)..." +end=$((SECONDS + MAX_WAIT_SECONDS)) +while [[ ! -f "${WAIT_FOR_FILE}" ]]; do + if (( SECONDS >= end )); then + log "Timed out waiting for ${WAIT_FOR_FILE}. Exiting." + exit 1 + fi + sleep 1 +done +log "Flag file present. Proceeding." + +# Wait for at least one global IPv4 +myips="$(get_my_ips)" +if [[ -z "${myips}" ]]; then + log "No non-loopback IPv4 addresses found yet. Will continue, but ping/DNS will likely fail until DHCP is up." +fi + +# 1) Ping CHECK_HOST for up to 10 minutes (find a working source IP) +log "Checking connectivity to ${CHECK_HOST} for up to ${PING_WINDOW_SECONDS}s..." +ping_deadline=$((SECONDS + PING_WINDOW_SECONDS)) +good_ip="" + +while (( SECONDS < ping_deadline )); do + myips="$(get_my_ips)" + if [[ -z "${myips}" ]]; then + log "No global IPv4 yet; waiting..." + sleep "${LOOP_SLEEP_SECONDS}" + continue + fi + + for ip in ${myips}; do + log "Pinging ${CHECK_HOST} from ${ip}..." + if can_ping_from_ip "${ip}"; then + good_ip="${ip}" + log "Connectivity confirmed: ${ip} -> ${CHECK_HOST}" + break + fi + log "Ping failed from ${ip}" + done + + [[ -n "${good_ip}" ]] && break + sleep "${LOOP_SLEEP_SECONDS}" +done + +if [[ -z "${good_ip}" ]]; then + log "Timed out (${PING_WINDOW_SECONDS}s) waiting for connectivity to ${CHECK_HOST}. Nothing changed." + exit 1 +fi + +# 2) Now that we can reach CHECK_HOST, try reverse DNS for up to 10 minutes +log "Connectivity is good. Attempting reverse DNS via ${NAMESERVER} for up to ${DNS_WINDOW_SECONDS}s..." +dns_deadline=$((SECONDS + DNS_WINDOW_SECONDS)) +newhostname="" + +while (( SECONDS < dns_deadline )); do + # Prefer the IP that proved connectivity; if it disappeared, re-find a good one. + myips="$(get_my_ips)" + if [[ -z "${myips}" ]]; then + log "Lost all global-scope IPv4 addresses; waiting..." + sleep "${LOOP_SLEEP_SECONDS}" + continue + fi + + if ! 
echo "${myips}" | tr ' ' '\n' | grep -qx "${good_ip}"; then + log "Previously-good IP ${good_ip} is gone; re-checking connectivity..." + good_ip="" + for ip in ${myips}; do + log "Pinging ${CHECK_HOST} from ${ip}..." + if can_ping_from_ip "${ip}"; then + good_ip="${ip}" + log "Connectivity confirmed: ${ip} -> ${CHECK_HOST}" + break + fi + done + [[ -z "${good_ip}" ]] && { sleep "${LOOP_SLEEP_SECONDS}"; continue; } + fi + + log "Reverse lookup for ${good_ip} via ${NAMESERVER}..." + newhostname="$(reverse_lookup "${good_ip}" "${NAMESERVER}")" + + if [[ -n "${newhostname}" ]]; then + log "Resolved ${good_ip} -> ${newhostname}" + break + fi + + log "Reverse lookup failed/empty for ${good_ip}" + sleep "${LOOP_SLEEP_SECONDS}" +done + +if [[ -z "${newhostname}" ]]; then + log "Timed out (${DNS_WINDOW_SECONDS}s) waiting for reverse DNS via ${NAMESERVER}. Nothing changed." + exit 1 +fi + +# Apply hostname + persist +set_hostname "${newhostname}" +shorthostname="${newhostname%%.*}" +echo "${newhostname}" > /etc/hostname + +log "Rewriting /etc/hosts from scratch" +cat > /etc/hosts < >(tee -a "$LOG") 2>&1 + +log() { + echo "$(date -u +%FT%T.%N | cut -c1-23) netplan-from-link: $*" >&2 +} + +log "starting" +log "kernel=$(uname -r)" +log "cmdline=$(cat /proc/cmdline || true)" + +rm -f /etc/netplan/*.yaml || true + +pick_iface() { + for d in /sys/class/net/*; do + iface="$(basename "$d")" + c="$d/carrier" + + case "$iface" in + lo|docker*|veth*|virbr*|br*|cni*|flannel*|weave*|zt*|wg*|tun*|tap*|sit*|ip6tnl*|gre*|gretap*|erspan*|bond* ) + continue + ;; + esac + + ip link set dev "$iface" up 2>/dev/null || true + v="$(cat "$c" 2>/dev/null || true)" + log "probe iface=$iface carrier='${v}' path=$c" + if [[ -r "$c" ]] && [[ "$v" == "1" ]]; then + log "selected iface=$iface via carrier" + echo "$iface" + return 0 + fi + done + + dflt="$(ip -4 route show default 2>/dev/null | awk '{for(i=1;i<=NF;i++) if ($i=="dev") {print $(i+1); exit}}' || true)" + if [[ -n "${dflt:-}" ]]; then + log 
"selected iface=$dflt via default-route" + echo "$dflt" + return 0 + fi + + return 1 +} + +iface="" +for i in $(seq 1 30); do + iface="$(pick_iface || true)" + if [[ -n "${iface:-}" ]]; then + break + fi + log "no iface yet (attempt $i/30); sleeping 1s" + sleep 1 +done + +if [[ -z "${iface:-}" ]]; then + log "netplan-from-link could not find an uplink interface" + log "ip -o link:" + ip -o link show || true + log "ip -4 addr:" + ip -4 addr show || true + log "ip -4 route:" + ip -4 route show || true + exit 0 +fi + +log "writing netplan to $OUT for iface=$iface" +cat >"$OUT" </dev/null 2>&1; then + log "netplan generate" + netplan generate || true + log "netplan apply" + netplan apply || true +else + log "netplan not found; skipping generate/apply" +fi + +log "final ip -4 addr for iface=$iface" +ip -4 addr show dev "$iface" || true + +touch "$STAMP" +log "done; touched $STAMP" diff --git a/tools/roles/prep-fog-capture/files/nm-from-link.service b/tools/roles/prep-fog-capture/files/nm-from-link.service new file mode 100644 index 00000000..d59b6e94 --- /dev/null +++ b/tools/roles/prep-fog-capture/files/nm-from-link.service @@ -0,0 +1,14 @@ +[Unit] +Description=Write NetworkManager connection from link carrier once +After=systemd-udev-settle.service local-fs.target +Wants=systemd-udev-settle.service + +[Service] +StandardOutput=journal+console +StandardError=journal+console +Type=oneshot +ExecStart=/usr/local/sbin/nm-from-link.sh +RemainAfterExit=yes + +[Install] +WantedBy=multi-user.target diff --git a/tools/roles/prep-fog-capture/files/nm-from-link.sh b/tools/roles/prep-fog-capture/files/nm-from-link.sh new file mode 100644 index 00000000..4e03f20d --- /dev/null +++ b/tools/roles/prep-fog-capture/files/nm-from-link.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash + +set -euo pipefail + +STAMP="/.cephlab_net_configured" +LOG="/var/log/nm-from-link.log" + +touch "$LOG" +chmod 0644 "$LOG" +exec > >(tee -a "$LOG") 2>&1 + +log() { + echo "$(date -u +%FT%T.%N | cut -c1-23) 
nm-from-link: $*" >&2 +} + +log "starting" + +pick_iface() { + for c in /sys/class/net/*/carrier; do + iface="$(basename "$(dirname "$c")")" + + case "$iface" in + lo|docker*|veth*|virbr*|br*|cni*|flannel*|weave*|zt*|wg*|tun*|tap*|sit*|ip6tnl*|gre*|gretap*|erspan*|bond* ) + continue + ;; + esac + + if [[ -r "$c" ]] && [[ "$(cat "$c")" == "1" ]]; then + echo "$iface" + return 0 + fi + done + + ip -4 route show default 2>/dev/null | awk '{for(i=1;i<=NF;i++) if ($i=="dev") {print $(i+1); exit}}' || true +} + +iface="" +for _ in $(seq 1 30); do + iface="$(pick_iface || true)" + if [[ -n "${iface:-}" ]]; then + break + fi + sleep 1 +done + +if [[ -z "${iface:-}" ]]; then + log "nm-from-link could not find an uplink interface" >&2 + exit 0 +fi + +systemctl enable --now NetworkManager || true + +IFACE="$iface" +CONN="fog-dhcp-${IFACE}" + +# Remove existing connections pinned to this interface (prevents stale MAC/IP settings) +nmcli -t -f NAME,DEVICE con show | awk -F: -v d="$IFACE" '$2==d {print $1}' | while read -r n; do + [[ -n "$n" ]] && nmcli con delete "$n" || true +done + +# Remove same-named conn if present +nmcli -t -f NAME con show | grep -qx "$CONN" && nmcli con delete "$CONN" || true + +nmcli con add type ethernet ifname "$IFACE" con-name "$CONN" ipv4.method auto ipv6.method ignore +nmcli con mod "$CONN" connection.autoconnect yes +nmcli con mod "$CONN" ipv4.ignore-auto-dns yes +nmcli con mod "$CONN" ipv4.dns "10.20.192.11" +nmcli con up "$CONN" || true + +touch "$STAMP" diff --git a/tools/roles/prep-fog-capture/files/regen-ssh-hostkeys.service b/tools/roles/prep-fog-capture/files/regen-ssh-hostkeys.service new file mode 100644 index 00000000..90af2f42 --- /dev/null +++ b/tools/roles/prep-fog-capture/files/regen-ssh-hostkeys.service @@ -0,0 +1,12 @@ +[Unit] +Description=Regenerate SSH host keys on first boot +ConditionPathExists=!/etc/ssh/ssh_host_ed25519_key +Before=ssh.service + +[Service] +Type=oneshot +ExecStart=/usr/bin/ssh-keygen -A 
+ExecStartPost=/bin/systemctl disable regen-ssh-hostkeys.service + +[Install] +WantedBy=multi-user.target diff --git a/tools/roles/prep-fog-capture/tasks/apt.yml b/tools/roles/prep-fog-capture/tasks/apt.yml new file mode 100644 index 00000000..7d75764c --- /dev/null +++ b/tools/roles/prep-fog-capture/tasks/apt.yml @@ -0,0 +1,97 @@ +--- +- name: Update apt cache + apt: + update_cache: yes + cache_valid_time: 3600 + when: ansible_facts.os_family == "Debian" + +- name: Full upgrade (apt dist-upgrade) + apt: + upgrade: dist + when: ansible_facts.os_family == "Debian" + +- name: Check if reboot is required (Debian/Ubuntu) + stat: + path: /var/run/reboot-required + register: deb_reboot_required + when: ansible_facts.os_family == "Debian" + +- name: Install one-shot service to regenerate SSH host keys on first boot + copy: + src: files/regen-ssh-hostkeys.service + dest: /etc/systemd/system/regen-ssh-hostkeys.service + owner: root + group: root + mode: '0644' + +- name: Reload systemd daemon + systemd: + daemon_reload: true + +- name: Enable regen-ssh-hostkeys.service + systemd: + name: regen-ssh-hostkeys.service + enabled: true + +- set_fact: + ntp_service: ntp + +- name: Remove cloud init netplan file + file: + path: /etc/netplan/50-cloud-init.yaml + state: absent + failed_when: false + +- name: Install netplan link selection script + copy: + src: files/netplan-from-link.sh + dest: /usr/local/sbin/netplan-from-link.sh + owner: root + group: root + mode: "0755" + +- name: Install netplan-from-link systemd unit + copy: + src: files/netplan-from-link.service + dest: /etc/systemd/system/netplan-from-link.service + owner: root + group: root + mode: "0644" + +- name: Enable netplan link selection systemd unit + systemd: + name: netplan-from-link.service + enabled: true + state: started + daemon_reload: true + +- name: Disable NetworkManager + systemd: + name: NetworkManager + enabled: false + state: stopped + failed_when: false + +- name: Enable networkd + systemd: + name: 
systemd-networkd + enabled: true + state: started + daemon_reload: true + +- name: Avoid wait online hang + systemd: + name: systemd-networkd-wait-online + enabled: false + state: stopped + failed_when: false + +- name: Fog prep netplan generate + command: netplan generate + changed_when: false + failed_when: false + +- name: Fog prep netplan apply + command: netplan apply + changed_when: true + failed_when: false diff --git a/tools/roles/prep-fog-capture/tasks/main.yml b/tools/roles/prep-fog-capture/tasks/main.yml new file mode 100644 index 00000000..49f73efc --- /dev/null +++ b/tools/roles/prep-fog-capture/tasks/main.yml @@ -0,0 +1,161 @@ +--- +# Tasks common to all distros +# We import tasks based on ansible_os_family about halfway through + +- setup: + +- name: Remove lock files, udev rules, logs + file: + path: "{{ item }}" + state: absent + with_items: + - /etc/udev/rules.d/70-persistent-net.rules + - /.cephlab_net_configured + - /.cephlab_hostname_set + - /ceph-qa-ready + - /var/log/netplan-from-link.log + - /var/log/nm-from-link.log + - /var/log/cephlab-set-hostname.log + - /var/log/cloud-init-output.log + - /var/log/cloud-init.log + +- name: Remove /var/lib/ceph mountpoint from fstab + shell: sed -i '/\/var\/lib\/ceph/d' /etc/fstab + +- name: Unmount /var/lib/ceph + mount: + path: /var/lib/ceph + state: unmounted + +- name: Import tasks for RPM-based distros + import_tasks: rpm.yml + when: ansible_os_family == "RedHat" or ansible_os_family == "Suse" + +- name: Import tasks for APT-based distros + import_tasks: apt.yml + when: ansible_os_family == "Debian" + +# If we updated the kernel in apt/rpm.yml +- name: Reboot if required + reboot: + msg: "Rebooting trial node after kernel/package updates" + reboot_timeout: 1800 + connect_timeout: 10 + test_command: whoami + when: > + (ansible_facts.os_family == "Debian" and deb_reboot_required.stat.exists) or + (ansible_facts.os_family == "RedHat" and rhel_needs_reboot.rc != 0) or + (ansible_facts.os_family == "Suse" 
and suse_reboot_required.stat.exists) + +- name: Get list of SSH host keys + shell: "ls -1 /etc/ssh/ssh_host_*" + register: ssh_host_keys + ignore_errors: true + +- name: Delete SSH host keys so they're generated during firstboot on cloned machines + file: + path: "{{ item }}" + state: absent + with_items: "{{ ssh_host_keys.stdout_lines|default([]) }}" + when: ssh_host_keys is defined + +# el <= 7 = ntpd +# el >= 8 = chronyd +# Ubuntu = ntp +- name: "Stop {{ ntp_service }} service" + service: + name: "{{ ntp_service }}" + state: stopped + when: '"ntp" in ntp_service' + +# The theory here is although we do have the ntp service running on boot, +# if the time is off, it slowly drifts back in sync. Since our testnodes +# are ephemeral, they don't ever have enough time to correctly drift +# back to the correct time. So we'll force it in the captured OS images. +- name: Install ntpdate command if missing + package: + name: ntpdate + state: present + when: '"ntp" in ntp_service' + +- name: Force time synchronization using stepping | ntp + command: "ntpdate -b {{ ntp_servers|join(' ') }}" + when: '"ntp" in ntp_service' + +- name: "Start {{ ntp_service }}" + service: + name: "{{ ntp_service }}" + state: started + +# chronyd needs to be started in order to force time sync. This differs from ntpd. 
+- name: Force time synchronization using stepping | chrony
+  command: chronyc -a makestep
+  when: '"chrony" in ntp_service'
+
+- name: Sync the hardware clock
+  command: "hwclock --systohc"
+
+- name: Disable cloud init and disruptive apt services
+  systemd:
+    name: "{{ item }}"
+    enabled: false
+    state: stopped
+    masked: true
+  loop:
+    - cloud-init-local.service
+    - cloud-init.service
+    - cloud-config.service
+    - cloud-final.service
+    - unattended-upgrades.service
+    - apt-daily.service
+    - apt-daily-upgrade.service
+    - apt-daily.timer
+    - apt-daily-upgrade.timer
+  failed_when: false
+
+- name: Disable cloud init networking config
+  copy:
+    dest: /etc/cloud/cloud.cfg.d/99-disable-network-config.cfg
+    owner: root
+    group: root
+    mode: "0644"
+    content: |
+      network:
+        config: disabled
+  failed_when: false
+
+- name: Disable cloud init completely
+  file:
+    path: /etc/cloud/cloud-init.disabled
+    state: touch
+    owner: root
+    group: root
+    mode: "0644"
+
+- name: Remove cloud init state
+  file:
+    path: /var/lib/cloud
+    state: absent
+  failed_when: false
+
+- name: Install cephlab-set-hostname script
+  copy:
+    src: files/cephlab-set-hostname.sh
+    dest: /usr/local/sbin/
+    owner: root
+    group: root
+    mode: "0755"
+
+- name: Install cephlab-set-hostname systemd unit
+  copy:
+    src: files/cephlab-set-hostname.service
+    dest: /etc/systemd/system/cephlab-set-hostname.service
+    owner: root
+    group: root
+    mode: "0644"
+
+- name: Enable cephlab-set-hostname service
+  systemd:
+    name: cephlab-set-hostname.service
+    enabled: true
+    daemon_reload: true
diff --git a/tools/roles/prep-fog-capture/tasks/rpm.yml b/tools/roles/prep-fog-capture/tasks/rpm.yml
new file mode 100644
index 00000000..4cfbb3de
--- /dev/null
+++ b/tools/roles/prep-fog-capture/tasks/rpm.yml
@@ -0,0 +1,110 @@
+---
+- name: Upgrade all packages to latest (dnf)
+  dnf:
+    name: "*"
+    state: latest
+    update_cache: true
+  when: ansible_facts.os_family == "RedHat"
+
+- name: Ensure dnf-utils present (for needs-restarting)
+  package:
+    name: dnf-utils
+    state: present
+  when: ansible_facts.os_family == "RedHat"
+
+# main.yml reboots the node when this command's rc is non-zero, so a
+# non-zero exit must not fail the play here.
+- name: Check if reboot is required (RHEL family)
+  command: needs-restarting -r
+  register: rhel_needs_reboot
+  changed_when: false
+  failed_when: false
+  when: ansible_facts.os_family == "RedHat"
+
+- name: Find existing ifcfg scripts
+  shell: |
+    ls -1 {{ ifcfg_dir }}/ifcfg-* 2>/dev/null | grep -v ifcfg-lo || true
+  vars:
+    ifcfg_dir: >-
+      {{ '/etc/sysconfig/network-scripts'
+         if ansible_os_family == 'RedHat'
+         else '/etc/sysconfig/network'
+         if ansible_os_family == 'Suse'
+         else '' }}
+  register: ifcfg_scripts
+
+- name: Delete ifcfg scripts
+  file:
+    path: "{{ item }}"
+    state: absent
+  loop: "{{ ifcfg_scripts.stdout_lines | default([]) }}"
+  when: ifcfg_scripts is defined
+
+- name: Unsubscribe RHEL
+  command: subscription-manager unregister
+  when: ansible_distribution == "RedHat"
+  failed_when: false
+
+# A file gets leftover when a testnode is registered with Satellite that caused
+# each registered subsequent testnode to report the wrong hostname
+- name: Clean up katello facts
+  file:
+    path: /etc/rhsm/facts/katello.facts
+    state: absent
+  when: ansible_distribution == "RedHat"
+
+# https://bugzilla.redhat.com/show_bug.cgi?id=1814337
+- name: Disable dnf-makecache service
+  service:
+    name: dnf-makecache.timer
+    state: stopped
+    enabled: false
+  when:
+    - ansible_os_family == "RedHat"
+    - ansible_distribution_major_version|int >= 8
+
+# Hopefully fixes https://github.com/ceph/ceph-cm-ansible/pull/544#issuecomment-599076564
+- name: Clean DNF cache
+  shell: "dnf clean all && rm -rf /var/cache/dnf/*"
+  when:
+    - ansible_os_family == "RedHat"
+    - ansible_distribution_major_version|int >= 8
+
+- name: Ensure sshd-keygen is enabled so host keys get regenerated on boot
+  systemd:
+    name: sshd-keygen.target
+    enabled: true
+
+- set_fact:
+    ntp_service: ntpd
+  when: ansible_os_family == "RedHat" and ansible_distribution_major_version|int <= 7
+
+- set_fact:
+    ntp_service: chronyd
+  when: (ansible_os_family == "RedHat" and ansible_distribution_major_version|int >= 8) or
+        ansible_os_family == "Suse"
+
+# nm-from-link.service runs /usr/local/sbin/nm-from-link.sh as its ExecStart,
+# so the script has to be on disk before the unit below is started.
+- name: Install network manager link selection script
+  copy:
+    src: files/nm-from-link.sh
+    dest: /usr/local/sbin/nm-from-link.sh
+    owner: root
+    group: root
+    mode: "0755"
+
+- name: Install systemd unit for network manager link selection
+  copy:
+    src: files/nm-from-link.service
+    dest: /etc/systemd/system/nm-from-link.service
+    owner: root
+    group: root
+    mode: "0644"
+
+- name: Enable network manager link selection unit
+  systemd:
+    name: nm-from-link.service
+    enabled: true
+    state: started
+    daemon_reload: true