]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph-cm-ansible.git/commitdiff
prep-fog-capture: Refactor, split, re-add cobbler-provided hacks prepfog-v2 821/head
author David Galloway <david.galloway@ibm.com>
Wed, 18 Feb 2026 19:12:02 +0000 (14:12 -0500)
committer David Galloway <david.galloway@ibm.com>
Wed, 18 Feb 2026 19:12:02 +0000 (14:12 -0500)
- Using systemd instead of rc.local:
  - Configure netplan on boot for Ubuntu
  - Configure NetworkManager on boot for CentOS/Rocky
  - Set hostname
  - Update packages and reboot

Signed-off-by: David Galloway <david.galloway@ibm.com>
tools/prep-fog-capture.yml
tools/roles/prep-fog-capture/files/cephlab-set-hostname.service [new file with mode: 0644]
tools/roles/prep-fog-capture/files/cephlab-set-hostname.sh [new file with mode: 0644]
tools/roles/prep-fog-capture/files/netplan-from-link.service [new file with mode: 0644]
tools/roles/prep-fog-capture/files/netplan-from-link.sh [new file with mode: 0644]
tools/roles/prep-fog-capture/files/nm-from-link.service [new file with mode: 0644]
tools/roles/prep-fog-capture/files/nm-from-link.sh [new file with mode: 0644]
tools/roles/prep-fog-capture/files/regen-ssh-hostkeys.service [new file with mode: 0644]
tools/roles/prep-fog-capture/tasks/apt.yml [new file with mode: 0644]
tools/roles/prep-fog-capture/tasks/main.yml [new file with mode: 0644]
tools/roles/prep-fog-capture/tasks/rpm.yml [new file with mode: 0644]

index 260e28901eca1994921e25f0e60c13ee87b22dd5..bee0c477ed437974785a76079241d664343447ec 100644 (file)
 ---
-### This standalone playbook can be used to prep a COBBLER-IMAGED testnode
+### This role is used to prep a {FOG|MAAS}-IMAGED testnode
 ### so that it can be used to capture an OS image for FOG.
 ### This playbook is needed for a couple reasons
 ###   - NIC configs get hard coded into the captured FOG images so nodes reimaged by FOG don't come up with network
+###   - SSH host keys need to be deleted
+###   - apt and cloud-init services need to be disabled
 
 - hosts:
     - testnodes
-  become: true
+  roles:
+    - prep-fog-capture
   gather_facts: false
-  tasks:
-
-  # (Missing in RHEL8)
-  - name: Check for /usr/bin/python
-    shell: echo marco
-    register: polo
-    ignore_errors: true
-
-  - name: Set ansible_python_interpreter=/usr/bin/python3
-    set_fact:
-      ansible_python_interpreter: /usr/bin/python3
-    when: polo is failed
-
-  # Now that we know where python is, we can gather_facts
-  - setup:
-
-  # We need to leave /.cephlab_rc_local or else each FOG reimage would tell Cobbler to run ceph-cm-ansible
-  - name: Remove lock files and udev rules
-    file:
-      path: "{{ item }}"
-      state: absent
-    with_items:
-      - /etc/udev/rules.d/70-persistent-net.rules
-      - /.cephlab_net_configured
-      - /ceph-qa-ready
-
-  - name: Get list of ifcfg scripts from host used to capture image
-    shell: "ls -1 /etc/sysconfig/network-scripts/ifcfg-* | grep -v ifcfg-lo"
-    register: ifcfg_scripts
-    when: ansible_os_family == "RedHat"
-    ignore_errors: true
-
-  - name: Get list of ifcfg scripts from host used to capture image
-    shell: "ls -1 /etc/sysconfig/network/ifcfg-* | grep -v ifcfg-lo"
-    register: ifcfg_scripts
-    when: ansible_os_family == "Suse"
-    ignore_errors: true
-
-  - name: Delete ifcfg scripts
-    file:
-      path: "{{ item }}"
-      state: absent
-    with_items: "{{ ifcfg_scripts.stdout_lines|default([]) }}"
-    when: ifcfg_scripts is defined
-
-  - name: Remove /var/lib/ceph mountpoint from fstab
-    shell: sed -i '/\/var\/lib\/ceph/d' /etc/fstab
-
-  - name: Unmount /var/lib/ceph
-    ansible.posix.mount:
-      path: /var/lib/ceph
-      state: unmounted
-
-  - name: Install one-shot service to regenerate SSH host keys on first boot
-    copy:
-      dest: /etc/systemd/system/regen-ssh-hostkeys.service
-      owner: root
-      group: root
-      mode: '0644'
-      content: |
-        [Unit]
-        Description=Regenerate SSH host keys on first boot
-        ConditionPathExists=!/etc/ssh/ssh_host_ed25519_key
-        Before=ssh.service
-  
-        [Service]
-        Type=oneshot
-        ExecStart=/usr/bin/ssh-keygen -A
-        ExecStartPost=/bin/systemctl disable regen-ssh-hostkeys.service
-  
-        [Install]
-        WantedBy=multi-user.target
-  
-  - name: Reload systemd daemon
-    systemd:
-      daemon_reload: true
-  
-  - name: Enable regen-ssh-hostkeys.service
-    systemd:
-      name: regen-ssh-hostkeys.service
-      enabled: true
-
-  - name: Get list of SSH host keys
-    shell: "ls -1 /etc/ssh/ssh_host_*"
-    register: ssh_host_keys
-    ignore_errors: true
-
-  # Key regeneration is done automatically on CentOS firstboot.
-  # For Ubuntu, we'll add `dpkg-reconfigure openssh-server` to rc.local
-  - name: Delete SSH host keys so they're generated during firstboot on cloned machines
-    file:
-      path: "{{ item }}"
-      state: absent
-    with_items: "{{ ssh_host_keys.stdout_lines|default([]) }}"
-    when: ssh_host_keys is defined
-
-  - name: Unsubscribe RHEL
-    command: subscription-manager unregister
-    when: ansible_distribution == "RedHat"
-    failed_when: false
-
-  # A file gets leftover when a testnode is registered with Satellite that caused
-  # each registered subsequent testnode to report the wrong hostname
-  - name: Clean up katello facts
-    file:
-      path: /etc/rhsm/facts/katello.facts
-      state: absent
-    when: ansible_distribution == "RedHat"
-
-  # https://bugzilla.redhat.com/show_bug.cgi?id=1814337
-  - name: Disable dnf-makecache service
-    service:
-      name: dnf-makecache.timer
-      state: stopped
-      enabled: no
-    when:
-      - ansible_os_family == "RedHat"
-      - ansible_distribution_major_version|int >= 8
-
-  # Hopefully fixes https://github.com/ceph/ceph-cm-ansible/pull/544#issuecomment-599076564
-  - name: Clean DNF cache
-    shell: "dnf clean all && rm -rf /var/cache/dnf/*"
-    when:
-      - ansible_os_family == "RedHat"
-      - ansible_distribution_major_version|int >= 8
-
-  - set_fact:
-      ntp_service: ntp
-    when: ansible_os_family == "Debian"
-
-  - set_fact:
-      ntp_service: ntpd
-    when: ansible_os_family == "RedHat" and ansible_distribution_major_version|int <= 7
-
-  - set_fact:
-      ntp_service: chronyd
-    when: (ansible_os_family == "RedHat" and ansible_distribution_major_version|int >= 8) or
-          ansible_os_family == "Suse"
-
-  - name: "Stop {{ ntp_service }} service"
-    service:
-      name: "{{ ntp_service }}"
-      state: stopped
-    when: '"ntp" in ntp_service'
-
-  # The theory here is although we do have the ntp service running on boot,
-  # if the time is off, it slowly drifts back in sync.  Since our testnodes
-  # are ephemeral, they don't ever have enough time to correctly drift
-  # back to the correct time.  So we'll force it in the captured OS images.
-  - name: Install ntpdate command if missing
-    package:
-      name: ntpdate
-      state: present
-    when: '"ntp" in ntp_service'
-
-  - name: Force time synchronization using stepping | ntp
-    command: "ntpdate -b {{ ntp_servers|join(' ') }}"
-    when: '"ntp" in ntp_service'
-
-  - name: "Start {{ ntp_service }}"
-    service:
-      name: "{{ ntp_service }}"
-      state: started
-
-  # chronyd needs to be started in order to force time sync. This differs from ntpd.
-  - name: Force time synchronization using stepping | chrony
-    command: chronyc -a makestep
-    when: '"chrony" in ntp_service'
-
-  - name: Sync the hardware clock
-    command: "hwclock --systohc"
+  become: true
diff --git a/tools/roles/prep-fog-capture/files/cephlab-set-hostname.service b/tools/roles/prep-fog-capture/files/cephlab-set-hostname.service
new file mode 100644 (file)
index 0000000..180af02
--- /dev/null
@@ -0,0 +1,14 @@
+# One-shot unit that runs the Ceph Lab hostname script during boot.
+[Unit]
+Description=Ceph Lab hostname configuration
+# After= only orders this unit; nss-lookup.target is additionally pulled in by Wants=.
+After=network-online.target nss-lookup.target
+Wants=nss-lookup.target
+
+[Service]
+# Send the script's output to both the journal and the console.
+StandardOutput=journal+console
+StandardError=journal+console
+Type=oneshot
+ExecStart=/usr/local/sbin/cephlab-set-hostname.sh
+# Keep the unit reported as active after the script has exited.
+RemainAfterExit=yes
+
+[Install]
+WantedBy=multi-user.target
diff --git a/tools/roles/prep-fog-capture/files/cephlab-set-hostname.sh b/tools/roles/prep-fog-capture/files/cephlab-set-hostname.sh
new file mode 100644 (file)
index 0000000..482df74
--- /dev/null
@@ -0,0 +1,193 @@
+#!/usr/bin/env bash
+# Wait for /.cephlab_net_configured, then set hostname + /etc/hostname + /etc/hosts
+# Flow:
+#   0) Exit immediately if /.cephlab_hostname_set already exists
+#   1) Wait up to MAX_WAIT_SECONDS for /.cephlab_net_configured to appear
+#   2) Ping CHECK_HOST for up to 10 minutes (from any global IPv4; keeps polling until one exists)
+#   3) Once ping works, try reverse DNS for up to 10 minutes (for an IP that can ping)
+#   4) Set hostname and rewrite /etc/hostname + /etc/hosts
+set -euo pipefail
+
+# --- Config ---
+CHECK_HOST="10.20.192.14"         # soko04 (must be reachable before we trust DNS)
+DEFAULT_NAMESERVER="10.20.192.11" # override via env NAMESERVER or arg1
+
+WAIT_FOR_FILE="/.cephlab_net_configured"
+HOSTNAME_IS_SET_FILE="/.cephlab_hostname_set"
+LOG="/var/log/cephlab-set-hostname.log"
+
+# Precedence: $NAMESERVER from the environment, then the first argument, then the default.
+NAMESERVER="${NAMESERVER:-${1:-${DEFAULT_NAMESERVER}}}"
+
+MAX_WAIT_SECONDS="300"        # wait for /.cephlab_net_configured
+PING_WINDOW_SECONDS="600"     # 10 minutes
+DNS_WINDOW_SECONDS="600"      # 10 minutes
+LOOP_SLEEP_SECONDS="2"
+
+# --- Logging ---
+touch "$LOG"
+chmod 0644 "$LOG"
+# From here on, stdout and stderr are both piped through tee and appended to $LOG.
+exec > >(tee -a "$LOG") 2>&1
+
+# Emit a message on stderr prefixed with a UTC timestamp truncated to milliseconds.
+log() {
+  echo "$(date -u +%FT%T.%N | cut -c1-23) cephlab-set-hostname: $*" >&2
+}
+
+# --- Helpers ---
+get_my_ips() {
+  ip -4 -o addr show scope global 2>/dev/null \
+    | awk '$2 != "docker0" {print $4}' \
+    | cut -d/ -f1 \
+    || true
+}
+
+# Reverse lookup helper (never non-zero; safe with set -euo pipefail)
+reverse_lookup() {
+  local ip="$1"
+  local ns="$2"
+  local name=""
+
+  if command -v dig >/dev/null 2>&1; then
+    name="$(dig +time=1 +tries=1 +short -x "${ip}" @"${ns}" 2>/dev/null | head -n1 | sed 's/\.$//' || true)"
+  elif command -v host >/dev/null 2>&1; then
+    name="$(host -W 1 "${ip}" "${ns}" 2>/dev/null | awk '/domain name pointer/ {print $5}' | sed 's/\.$//' | head -n1 || true)"
+  elif command -v getent >/dev/null 2>&1; then
+    name="$(getent hosts "${ip}" 2>/dev/null | awk '{print $2}' | head -n1 || true)"
+  fi
+
+  echo "${name}"
+}
+
+set_hostname() {
+  local fqdn="$1"
+  if command -v hostnamectl >/dev/null 2>&1; then
+    hostnamectl set-hostname "${fqdn}"
+  else
+    hostname "${fqdn}"
+  fi
+}
+
+can_ping_from_ip() {
+  local src_ip="$1"
+  # More tolerant per-attempt check but bounded:
+  # 3 packets, 1s apart, wait up to 2s each; hard cap 10s.
+  timeout 10s ping -I "${src_ip}" -nq -c3 -i 1 -W 2 "${CHECK_HOST}" >/dev/null 2>&1
+}
+
+# --- Main ---
+if [[ -f "${HOSTNAME_IS_SET_FILE}" ]]; then
+  log "We've already set the hostname before. Exiting..."
+  exit 0
+fi
+
+log "Waiting for ${WAIT_FOR_FILE} (up to ${MAX_WAIT_SECONDS}s)..."
+end=$((SECONDS + MAX_WAIT_SECONDS))
+while [[ ! -f "${WAIT_FOR_FILE}" ]]; do
+  if (( SECONDS >= end )); then
+    log "Timed out waiting for ${WAIT_FOR_FILE}. Exiting."
+    exit 1
+  fi
+  sleep 1
+done
+log "Flag file present. Proceeding."
+
+# One-off check for a global IPv4: this only logs a warning and does not wait.
+# The actual waiting for an address happens inside the ping loop below.
+myips="$(get_my_ips)"
+if [[ -z "${myips}" ]]; then
+  log "No non-loopback IPv4 addresses found yet. Will continue, but ping/DNS will likely fail until DHCP is up."
+fi
+
+# 1) Ping CHECK_HOST for up to 10 minutes (find a working source IP)
+log "Checking connectivity to ${CHECK_HOST} for up to ${PING_WINDOW_SECONDS}s..."
+ping_deadline=$((SECONDS + PING_WINDOW_SECONDS))
+good_ip=""
+
+while (( SECONDS < ping_deadline )); do
+  # Re-read the address list every pass so a late DHCP lease is picked up.
+  myips="$(get_my_ips)"
+  if [[ -z "${myips}" ]]; then
+    log "No global IPv4 yet; waiting..."
+    sleep "${LOOP_SLEEP_SECONDS}"
+    continue
+  fi
+
+  # Unquoted on purpose: split the address list into one word per IP.
+  for ip in ${myips}; do
+    log "Pinging ${CHECK_HOST} from ${ip}..."
+    if can_ping_from_ip "${ip}"; then
+      good_ip="${ip}"
+      log "Connectivity confirmed: ${ip} -> ${CHECK_HOST}"
+      break
+    fi
+    log "Ping failed from ${ip}"
+  done
+
+  [[ -n "${good_ip}" ]] && break
+  sleep "${LOOP_SLEEP_SECONDS}"
+done
+
+# No source address could reach CHECK_HOST within the window: fail without touching anything.
+if [[ -z "${good_ip}" ]]; then
+  log "Timed out (${PING_WINDOW_SECONDS}s) waiting for connectivity to ${CHECK_HOST}. Nothing changed."
+  exit 1
+fi
+
+# 2) Now that we can reach CHECK_HOST, try reverse DNS for up to 10 minutes
+# Leaves the resolved name in newhostname; exits 1 if the window expires.
+log "Connectivity is good. Attempting reverse DNS via ${NAMESERVER} for up to ${DNS_WINDOW_SECONDS}s..."
+dns_deadline=$((SECONDS + DNS_WINDOW_SECONDS))
+newhostname=""
+
+while (( SECONDS < dns_deadline )); do
+  # Prefer the IP that proved connectivity; if it disappeared, re-find a good one.
+  myips="$(get_my_ips)"
+  if [[ -z "${myips}" ]]; then
+    log "Lost all global-scope IPv4 addresses; waiting..."
+    sleep "${LOOP_SLEEP_SECONDS}"
+    continue
+  fi
+
+  # -F makes this a literal whole-line comparison. Without it the dots in the
+  # address are regex wildcards (10.2.0.1 would also match 10.210.1).
+  if ! printf '%s\n' "${myips}" | tr ' ' '\n' | grep -qxF -- "${good_ip}"; then
+    log "Previously-good IP ${good_ip} is gone; re-checking connectivity..."
+    good_ip=""
+    for ip in ${myips}; do
+      log "Pinging ${CHECK_HOST} from ${ip}..."
+      if can_ping_from_ip "${ip}"; then
+        good_ip="${ip}"
+        log "Connectivity confirmed: ${ip} -> ${CHECK_HOST}"
+        break
+      fi
+    done
+    # Still nothing reachable: back off and start the pass over.
+    [[ -z "${good_ip}" ]] && { sleep "${LOOP_SLEEP_SECONDS}"; continue; }
+  fi
+
+  log "Reverse lookup for ${good_ip} via ${NAMESERVER}..."
+  newhostname="$(reverse_lookup "${good_ip}" "${NAMESERVER}")"
+
+  if [[ -n "${newhostname}" ]]; then
+    log "Resolved ${good_ip} -> ${newhostname}"
+    break
+  fi
+
+  log "Reverse lookup failed/empty for ${good_ip}"
+  sleep "${LOOP_SLEEP_SECONDS}"
+done
+
+if [[ -z "${newhostname}" ]]; then
+  log "Timed out (${DNS_WINDOW_SECONDS}s) waiting for reverse DNS via ${NAMESERVER}. Nothing changed."
+  exit 1
+fi
+
+# Apply hostname + persist
+set_hostname "${newhostname}"
+printf '%s\n' "${newhostname}" > /etc/hostname
+# Short name = everything before the first dot of the FQDN.
+shorthostname="${newhostname%%.*}"
+
+log "Rewriting /etc/hosts from scratch"
+{
+  echo "127.0.0.1 localhost"
+  echo "${good_ip} ${newhostname} ${shorthostname}"
+  echo ""
+  echo "# IPv6"
+  echo "::1 localhost ip6-localhost ip6-loopback"
+  echo "ff02::1 ip6-allnodes"
+  echo "ff02::2 ip6-allrouters"
+} > /etc/hosts
+
+log "Hostname updated: $(hostname); /etc/hostname and /etc/hosts rewritten."
+# Marker checked at the top of the script so later boots exit early.
+touch "${HOSTNAME_IS_SET_FILE}"
+exit 0
diff --git a/tools/roles/prep-fog-capture/files/netplan-from-link.service b/tools/roles/prep-fog-capture/files/netplan-from-link.service
new file mode 100644 (file)
index 0000000..47efea2
--- /dev/null
@@ -0,0 +1,14 @@
+# One-shot unit that runs netplan-from-link.sh during boot.
+[Unit]
+Description=Write netplan from link carrier once
+# Start after local filesystems and systemd-networkd; Wants= also pulls networkd in.
+After=systemd-networkd.service local-fs.target
+Wants=systemd-networkd.service
+
+[Service]
+# Send the script's output to both the journal and the console.
+StandardOutput=journal+console
+StandardError=journal+console
+Type=oneshot
+ExecStart=/usr/local/sbin/netplan-from-link.sh
+# Keep the unit reported as active after the script has exited.
+RemainAfterExit=yes
+
+[Install]
+WantedBy=multi-user.target
diff --git a/tools/roles/prep-fog-capture/files/netplan-from-link.sh b/tools/roles/prep-fog-capture/files/netplan-from-link.sh
new file mode 100644 (file)
index 0000000..dd98855
--- /dev/null
@@ -0,0 +1,107 @@
+#!/usr/bin/env bash
+# Pick an uplink interface, write a DHCP netplan config for it, apply it,
+# and touch a stamp file when finished.
+set -euo pipefail
+
+OUT="/etc/netplan/01-fog.yaml"     # netplan file this script writes
+STAMP="/.cephlab_net_configured"   # touched at the very end of the script
+LOG="/var/log/netplan-from-link.log"
+
+touch "$LOG"
+chmod 0644 "$LOG"
+# From here on, stdout and stderr are both piped through tee and appended to $LOG.
+exec > >(tee -a "$LOG") 2>&1
+
+# Emit a message on stderr prefixed with a UTC timestamp truncated to milliseconds.
+log() {
+  echo "$(date -u +%FT%T.%N | cut -c1-23) netplan-from-link: $*" >&2
+}
+
+log "starting"
+log "kernel=$(uname -r)"
+log "cmdline=$(cat /proc/cmdline || true)"
+
+# Every existing netplan YAML is removed up front, before an interface has
+# been chosen; only $OUT is written back later.
+rm -f /etc/netplan/*.yaml || true
+
+pick_iface() {
+  for d in /sys/class/net/*; do
+    iface="$(basename "$d")"
+    c="$d/carrier"
+
+    case "$iface" in
+      lo|docker*|veth*|virbr*|br*|cni*|flannel*|weave*|zt*|wg*|tun*|tap*|sit*|ip6tnl*|gre*|gretap*|erspan*|bond* )
+        continue
+        ;;
+    esac
+
+    ip link set dev "$iface" up 2>/dev/null || true
+    v="$(cat "$c" 2>/dev/null || true)"
+    log "probe iface=$iface carrier='${v}' path=$c"
+    if [[ -r "$c" ]] && [[ "$v" == "1" ]]; then
+      log "selected iface=$iface via carrier"
+      echo "$iface"
+      return 0
+    fi
+  done
+
+  dflt="$(ip -4 route show default 2>/dev/null | awk '{for(i=1;i<=NF;i++) if ($i=="dev") {print $(i+1); exit}}' || true)"
+  if [[ -n "${dflt:-}" ]]; then
+    log "selected iface=$dflt via default-route"
+    echo "$dflt"
+    return 0
+  fi
+
+  return 1
+}
+
+iface=""
+for i in $(seq 1 30); do
+  iface="$(pick_iface || true)"
+  if [[ -n "${iface:-}" ]]; then
+    break
+  fi
+  log "no iface yet (attempt $i/30); sleeping 1s"
+  sleep 1
+done
+
+if [[ -z "${iface:-}" ]]; then
+  log "netplan-from-link could not find an uplink interface"
+  log "ip -o link:"
+  ip -o link show || true
+  log "ip -4 addr:"
+  ip -4 addr show || true
+  log "ip -4 route:"
+  ip -4 route show || true
+  exit 0
+fi
+
+log "writing netplan to $OUT for iface=$iface"
+cat >"$OUT" <<EOF
+network:
+  version: 2
+  renderer: networkd
+  ethernets:
+    ${iface}:
+      dhcp4: true
+      dhcp6: false
+      optional: false
+      dhcp4-overrides:
+        use-dns: true
+        use-hostname: true
+      nameservers:
+        addresses: [10.20.192.11]
+EOF
+
+chmod 0600 "$OUT"
+
+if command -v netplan >/dev/null 2>&1; then
+  log "netplan generate"
+  netplan generate || true
+  log "netplan apply"
+  netplan apply || true
+else
+  log "netplan not found; skipping generate/apply"
+fi
+
+log "final ip -4 addr for iface=$iface"
+ip -4 addr show dev "$iface" || true
+
+touch "$STAMP"
+log "done; touched $STAMP"
diff --git a/tools/roles/prep-fog-capture/files/nm-from-link.service b/tools/roles/prep-fog-capture/files/nm-from-link.service
new file mode 100644 (file)
index 0000000..d59b6e9
--- /dev/null
@@ -0,0 +1,14 @@
+# One-shot unit that runs nm-from-link.sh during boot.
+[Unit]
+Description=Write NetworkManager connection from link carrier once
+# Start after local filesystems and udev settle; Wants= also pulls udev-settle in.
+After=systemd-udev-settle.service local-fs.target
+Wants=systemd-udev-settle.service
+
+[Service]
+# Send the script's output to both the journal and the console.
+StandardOutput=journal+console
+StandardError=journal+console
+Type=oneshot
+ExecStart=/usr/local/sbin/nm-from-link.sh
+# Keep the unit reported as active after the script has exited.
+RemainAfterExit=yes
+
+[Install]
+WantedBy=multi-user.target
diff --git a/tools/roles/prep-fog-capture/files/nm-from-link.sh b/tools/roles/prep-fog-capture/files/nm-from-link.sh
new file mode 100644 (file)
index 0000000..4e03f20
--- /dev/null
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+# Pick an uplink interface, create a DHCP NetworkManager connection for it,
+# and touch a stamp file when finished.
+
+set -euo pipefail
+
+STAMP="/.cephlab_net_configured"   # touched at the very end of the script
+LOG="/var/log/nm-from-link.log"
+
+touch "$LOG"
+chmod 0644 "$LOG"
+# From here on, stdout and stderr are both piped through tee and appended to $LOG.
+exec > >(tee -a "$LOG") 2>&1
+
+# Emit a message on stderr prefixed with a UTC timestamp truncated to milliseconds.
+log() {
+  echo "$(date -u +%FT%T.%N | cut -c1-23) nm-from-link: $*" >&2
+}
+
+log "starting"
+
+pick_iface() {
+  for c in /sys/class/net/*/carrier; do
+    iface="$(basename "$(dirname "$c")")"
+
+    case "$iface" in
+      lo|docker*|veth*|virbr*|br*|cni*|flannel*|weave*|zt*|wg*|tun*|tap*|sit*|ip6tnl*|gre*|gretap*|erspan*|bond* )
+        continue
+        ;;
+    esac
+
+    if [[ -r "$c" ]] && [[ "$(cat "$c")" == "1" ]]; then
+      echo "$iface"
+      return 0
+    fi
+  done
+
+  ip -4 route show default 2>/dev/null | awk '{for(i=1;i<=NF;i++) if ($i=="dev") {print $(i+1); exit}}' || true
+}
+
+iface=""
+for _ in $(seq 1 30); do
+  iface="$(pick_iface || true)"
+  if [[ -n "${iface:-}" ]]; then
+    break
+  fi
+  sleep 1
+done
+
+if [[ -z "${iface:-}" ]]; then
+  log "nm-from-link could not find an uplink interface" >&2
+  exit 0
+fi
+
+systemctl enable --now NetworkManager || true
+
+IFACE="$iface"
+CONN="fog-dhcp-${IFACE}"
+
+# Remove existing connections pinned to this interface (prevents stale MAC/IP settings)
+nmcli -t -f NAME,DEVICE con show | awk -F: -v d="$IFACE" '$2==d {print $1}' | while read -r n; do
+  [[ -n "$n" ]] && nmcli con delete "$n" || true
+done
+
+# Remove same-named conn if present
+nmcli -t -f NAME con show | grep -qx "$CONN" && nmcli con delete "$CONN" || true
+
+nmcli con add type ethernet ifname "$IFACE" con-name "$CONN" ipv4.method auto ipv6.method ignore
+nmcli con mod "$CONN" connection.autoconnect yes
+nmcli con mod "$CONN" ipv4.ignore-auto-dns yes
+nmcli con mod "$CONN" ipv4.dns "10.20.192.11"
+nmcli con up "$CONN" || true
+
+touch "$STAMP"
diff --git a/tools/roles/prep-fog-capture/files/regen-ssh-hostkeys.service b/tools/roles/prep-fog-capture/files/regen-ssh-hostkeys.service
new file mode 100644 (file)
index 0000000..90af2f4
--- /dev/null
@@ -0,0 +1,12 @@
+# One-shot unit that recreates missing SSH host keys and then disables itself.
+[Unit]
+Description=Regenerate SSH host keys on first boot
+# Only runs while the ed25519 host key is absent.
+ConditionPathExists=!/etc/ssh/ssh_host_ed25519_key
+Before=ssh.service
+
+[Service]
+Type=oneshot
+# ssh-keygen -A generates any host key types that are missing.
+ExecStart=/usr/bin/ssh-keygen -A
+ExecStartPost=/bin/systemctl disable regen-ssh-hostkeys.service
+
+[Install]
+WantedBy=multi-user.target
diff --git a/tools/roles/prep-fog-capture/tasks/apt.yml b/tools/roles/prep-fog-capture/tasks/apt.yml
new file mode 100644 (file)
index 0000000..7d75764
--- /dev/null
@@ -0,0 +1,97 @@
+---
+# Bring the capture host up to date and record whether a reboot is pending.
+- name: Update apt cache
+  when: ansible_facts.os_family == "Debian"
+  apt:
+    cache_valid_time: 3600
+    update_cache: true
+
+- name: Full upgrade (apt dist-upgrade)
+  when: ansible_facts.os_family == "Debian"
+  apt:
+    upgrade: dist
+
+- name: Check if reboot is required (Debian/Ubuntu)
+  when: ansible_facts.os_family == "Debian"
+  stat:
+    path: /var/run/reboot-required
+  register: deb_reboot_required
+
+# Host keys are recreated by this unit on the first boot of a cloned image.
+- name: Install one-shot service to regenerate SSH host keys on first boot
+  copy:
+    dest: /etc/systemd/system/regen-ssh-hostkeys.service
+    src: files/regen-ssh-hostkeys.service
+    mode: "0644"
+    owner: root
+    group: root
+
+- name: Reload systemd daemon
+  systemd:
+    daemon_reload: true
+
+- name: Enable regen-ssh-hostkeys.service
+  systemd:
+    enabled: true
+    name: regen-ssh-hostkeys.service
+
+# Time service name used for Debian-family hosts.
+- set_fact:
+    ntp_service: ntp
+
+# Replace cloud-init's netplan config with the boot-time link selection unit.
+- name: Remove cloud init netplan file
+  failed_when: false
+  file:
+    state: absent
+    path: /etc/netplan/50-cloud-init.yaml
+
+- name: Install netplan link selection script
+  copy:
+    dest: /usr/local/sbin/netplan-from-link.sh
+    src: files/netplan-from-link.sh
+    mode: "0755"
+    owner: root
+    group: root
+
+- name: Install netplan-from-link systemd unit
+  copy:
+    dest: /etc/systemd/system/netplan-from-link.service
+    src: files/netplan-from-link.service
+    mode: "0644"
+    owner: root
+    group: root
+
+- name: Enable netplan link selection systemd unit
+  systemd:
+    daemon_reload: true
+    name: netplan-from-link.service
+    state: started
+    enabled: true
+
+# networkd is the renderer named in the generated netplan file, so
+# NetworkManager is switched off and networkd switched on.
+- name: Disable NetworkManager
+  failed_when: false
+  systemd:
+    name: NetworkManager
+    state: stopped
+    enabled: false
+
+- name: Enable networkd
+  systemd:
+    daemon_reload: true
+    name: systemd-networkd
+    state: started
+    enabled: true
+
+- name: Avoid wait online hang
+  failed_when: false
+  systemd:
+    name: systemd-networkd-wait-online
+    state: stopped
+    enabled: false
+
+# Both netplan commands are best effort; failures are ignored.
+- name: Fog prep netplan generate
+  command: netplan generate
+  failed_when: false
+  changed_when: false
+
+- name: Fog prep netplan apply
+  command: netplan apply
+  failed_when: false
+  changed_when: true
diff --git a/tools/roles/prep-fog-capture/tasks/main.yml b/tools/roles/prep-fog-capture/tasks/main.yml
new file mode 100644 (file)
index 0000000..49f73ef
--- /dev/null
@@ -0,0 +1,161 @@
+---
+# Tasks common to all distros
+# We import tasks based on ansible_os_family about halfway through
+
+# Facts are gathered here explicitly.
+- setup:
+
+# Clear per-host markers and logs so they are not baked into the image.
+- name: Remove lock files, udev rules, logs
+  file:
+    state: absent
+    path: "{{ item }}"
+  loop:
+    - /etc/udev/rules.d/70-persistent-net.rules
+    - /.cephlab_net_configured
+    - /.cephlab_hostname_set
+    - /ceph-qa-ready
+    - /var/log/netplan-from-link.log
+    - /var/log/nm-from-link.log
+    - /var/log/cephlab-set-hostname.log
+    - /var/log/cloud-init-output.log
+    - /var/log/cloud-init.log
+
+- name: Remove /var/lib/ceph mountpoint from fstab
+  shell: sed -i '/\/var\/lib\/ceph/d' /etc/fstab
+
+- name: Unmount /var/lib/ceph
+  mount:
+    state: unmounted
+    path: /var/lib/ceph
+
+# Distro-specific work lives in separate task files.
+- name: Import tasks for RPM-based distros
+  when: ansible_os_family == "RedHat" or ansible_os_family == "Suse"
+  import_tasks: rpm.yml
+
+- name: Import tasks for APT-based distros
+  when: ansible_os_family == "Debian"
+  import_tasks: apt.yml
+
+# If we updated the kernel in apt/rpm.yml
+# Each family's reboot probe is registered only by that family's tasks, and
+# no task visible in this role registers suse_reboot_required at all. Every
+# lookup therefore carries a default, so a missing or skipped result reads
+# as "no reboot needed" instead of raising an undefined-variable error.
+- name: Reboot if required
+  reboot:
+    msg: "Rebooting trial node after kernel/package updates"
+    reboot_timeout: 1800
+    connect_timeout: 10
+    test_command: whoami
+  when: >
+    (ansible_facts.os_family == "Debian" and (deb_reboot_required.stat.exists | default(false))) or
+    (ansible_facts.os_family == "RedHat" and (rhel_needs_reboot.rc | default(0)) != 0) or
+    (ansible_facts.os_family == "Suse" and (suse_reboot_required.stat.exists | default(false)))
+
+# A failed listing is tolerated; the delete task then loops over an empty list.
+- name: Get list of SSH host keys
+  shell: "ls -1 /etc/ssh/ssh_host_*"
+  ignore_errors: true
+  register: ssh_host_keys
+
+- name: Delete SSH host keys so they're generated during firstboot on cloned machines
+  when: ssh_host_keys is defined
+  file:
+    state: absent
+    path: "{{ item }}"
+  loop: "{{ ssh_host_keys.stdout_lines|default([]) }}"
+
+# el <= 7 = ntpd
+# el >= 8 = chronyd
+# Ubuntu  = ntp
+- name: "Stop {{ ntp_service }} service"
+  when: '"ntp" in ntp_service'
+  service:
+    state: stopped
+    name: "{{ ntp_service }}"
+
+# The theory here is although we do have the ntp service running on boot,
+# if the time is off, it slowly drifts back in sync.  Since our testnodes
+# are ephemeral, they don't ever have enough time to correctly drift
+# back to the correct time.  So we'll force it in the captured OS images.
+- name: Install ntpdate command if missing
+  when: '"ntp" in ntp_service'
+  package:
+    state: present
+    name: ntpdate
+
+- name: Force time synchronization using stepping | ntp
+  when: '"ntp" in ntp_service'
+  command: "ntpdate -b {{ ntp_servers|join(' ') }}"
+
+- name: "Start {{ ntp_service }}"
+  service:
+    state: started
+    name: "{{ ntp_service }}"
+
+# chronyd needs to be started in order to force time sync. This differs from ntpd.
+- name: Force time synchronization using stepping | chrony
+  when: '"chrony" in ntp_service'
+  command: chronyc -a makestep
+
+- name: Sync the hardware clock
+  command: "hwclock --systohc"
+
+# Every task in this group is best effort (failed_when: false): hosts
+# without cloud-init or the apt timers must not abort the play.
+- name: Disable cloud init and disruptive apt services
+  systemd:
+    name: "{{ item }}"
+    enabled: false
+    state: stopped
+    masked: true
+  loop:
+    - cloud-init-local.service
+    - cloud-init.service
+    - cloud-config.service
+    - cloud-final.service
+    - unattended-upgrades.service
+    - apt-daily.service
+    - apt-daily-upgrade.service
+    - apt-daily.timer
+    - apt-daily-upgrade.timer
+  failed_when: false
+
+- name: Disable cloud init networking config
+  copy:
+    dest: /etc/cloud/cloud.cfg.d/99-disable-network-config.cfg
+    owner: root
+    group: root
+    mode: "0644"
+    content: |
+      network:
+        config: disabled
+  failed_when: false
+
+# Touching the file fails when /etc/cloud does not exist (cloud-init not
+# installed), so tolerate failure like the neighbouring cloud-init tasks.
+- name: Disable cloud init completely
+  file:
+    path: /etc/cloud/cloud-init.disabled
+    state: touch
+    owner: root
+    group: root
+    mode: "0644"
+  failed_when: false
+
+- name: Remove cloud init state
+  file:
+    path: /var/lib/cloud
+    state: absent
+  failed_when: false
+
+# The hostname unit is enabled for later boots but not started here.
+- name: Install cephlab-set-hostname script
+  copy:
+    dest: /usr/local/sbin/
+    src: files/cephlab-set-hostname.sh
+    mode: "0755"
+    owner: root
+    group: root
+
+- name: Install cephlab-set-hostname systemd unit
+  copy:
+    dest: /etc/systemd/system/cephlab-set-hostname.service
+    src: files/cephlab-set-hostname.service
+    mode: "0644"
+    owner: root
+    group: root
+
+- name: Enable cephlab-set-hostname service
+  systemd:
+    daemon_reload: true
+    name: cephlab-set-hostname.service
+    enabled: true
diff --git a/tools/roles/prep-fog-capture/tasks/rpm.yml b/tools/roles/prep-fog-capture/tasks/rpm.yml
new file mode 100644 (file)
index 0000000..4cfbb3d
--- /dev/null
@@ -0,0 +1,98 @@
+---
+# Bring the capture host up to date and record whether a reboot is pending.
+- name: Upgrade all packages to latest (dnf)
+  when: ansible_facts.os_family == "RedHat"
+  dnf:
+    update_cache: true
+    state: latest
+    name: "*"
+
+- name: Ensure dnf-utils present (for needs-restarting)
+  when: ansible_facts.os_family == "RedHat"
+  package:
+    state: present
+    name: dnf-utils
+
+# The exit code is kept in rhel_needs_reboot; the task itself never fails.
+- name: Check if reboot is required (RHEL family)
+  when: ansible_facts.os_family == "RedHat"
+  command: needs-restarting -r
+  failed_when: false
+  changed_when: false
+  register: rhel_needs_reboot
+
+# The ifcfg directory depends on the OS family (empty for anything else).
+- name: Find existing ifcfg scripts
+  vars:
+    ifcfg_dir: >-
+      {{ {'RedHat': '/etc/sysconfig/network-scripts',
+      'Suse': '/etc/sysconfig/network'}.get(ansible_os_family, '') }}
+  shell: |
+    ls -1 {{ ifcfg_dir }}/ifcfg-* 2>/dev/null | grep -v ifcfg-lo || true
+  register: ifcfg_scripts
+
+- name: Delete ifcfg scripts
+  when: ifcfg_scripts is defined
+  file:
+    state: absent
+    path: "{{ item }}"
+  loop: "{{ ifcfg_scripts.stdout_lines | default([]) }}"
+
+- name: Unsubscribe RHEL
+  when: ansible_distribution == "RedHat"
+  command: subscription-manager unregister
+  failed_when: false
+
+# A file gets leftover when a testnode is registered with Satellite that caused
+# each registered subsequent testnode to report the wrong hostname
+- name: Clean up katello facts
+  when: ansible_distribution == "RedHat"
+  file:
+    state: absent
+    path: /etc/rhsm/facts/katello.facts
+
+# https://bugzilla.redhat.com/show_bug.cgi?id=1814337
+- name: Disable dnf-makecache service
+  when:
+    - ansible_os_family == "RedHat"
+    - ansible_distribution_major_version|int >= 8
+  service:
+    name: dnf-makecache.timer
+    enabled: false
+    state: stopped
+
+# Hopefully fixes https://github.com/ceph/ceph-cm-ansible/pull/544#issuecomment-599076564
+- name: Clean DNF cache
+  when:
+    - ansible_os_family == "RedHat"
+    - ansible_distribution_major_version|int >= 8
+  shell: "dnf clean all && rm -rf /var/cache/dnf/*"
+
+- name: Ensure sshd-keygen is enabled so host keys get regenerated on boot
+  systemd:
+    enabled: true
+    name: sshd-keygen.target
+
+# Time service name: ntpd on EL <= 7, chronyd on EL >= 8 and on Suse.
+- set_fact:
+    ntp_service: ntpd
+  when: ansible_os_family == "RedHat" and ansible_distribution_major_version|int <= 7
+
+- set_fact:
+    ntp_service: chronyd
+  when: (ansible_os_family == "RedHat" and ansible_distribution_major_version|int >= 8) or
+        ansible_os_family == "Suse"
+
+# The unit's ExecStart is /usr/local/sbin/nm-from-link.sh, so the script has
+# to be installed before the unit is enabled and started (apt.yml does the
+# same for netplan-from-link.sh).
+- name: Install network manager link selection script
+  copy:
+    src: files/nm-from-link.sh
+    dest: /usr/local/sbin/nm-from-link.sh
+    owner: root
+    group: root
+    mode: "0755"
+
+- name: Install systemd unit for network manager link selection
+  copy:
+    src: files/nm-from-link.service
+    dest: /etc/systemd/system/nm-from-link.service
+    owner: root
+    group: root
+    mode: "0644"
+
+- name: Enable network manager link selection unit
+  systemd:
+    name: nm-from-link.service
+    enabled: true
+    state: started
+    daemon_reload: true