diff --git a/misc/runners/common/README.md b/misc/runners/common/README.md new file mode 100644 index 0000000000..2e840920a3 --- /dev/null +++ b/misc/runners/common/README.md @@ -0,0 +1,25 @@ +# Common Runner Management Scripts + +Site-agnostic scripts shared between the Frontier and Phoenix runner setups. +All shared logic lives here; site directories contain only site-specific files +(`config.sh` and scripts unique to that cluster). + +Scripts are invoked via the dispatcher at `misc/runners/runner.sh`: +```bash +bash misc/runners/runner.sh [args...] +``` + +## Scripts + +| Script | Purpose | +|---|---| +| `runner-lib.sh` | Shared library: GitHub API helpers, EXE-based process discovery, parallel node sweep, start/stop primitives. Sourced by site `config.sh` files. | +| `check-runners.sh` | Per-node health check: Runner.Listener processes with name, idle/BUSY, slurm PATH, RSS. Optional cgroup memory footer. | +| `list-runners.sh` | Full table: GitHub API status × parallel node sweep. Shows slurm status, flags stale `runner.node`. | +| `rebalance-runners.sh` | Compute optimal distribution and move runners across nodes. Handles offline runners. Writes `runner.node`. Dry run by default. | +| `restart-runner.sh` | Stop and restart one runner on a given node. Verifies slurm in PATH. Writes `runner.node`. | +| `restart-all.sh` | Restart all runners in place. Skips busy unless `FORCE=1`. Dry run by default. | +| `move-runner.sh` | Move a runner to a different login node by name. Stops on current node, starts on target. Writes `runner.node`. | +| `stop-runner.sh` | Stop a runner process and remove its GitHub registration. | +| `rerun-failed.sh` | Rerun failed GitHub Actions workflows on open non-draft PRs and master. Dry run by default. | +| `create-runner.sh` | Download, register, and start a new runner. Requires `runner_install_dir()` and `TARBALL_CACHE_DIR` from site config. 
Usage: `create-runner <name> <node> [install-dir]`
rss )) MB free) ---" + fi + echo "" +done diff --git a/misc/runners/common/create-runner.sh b/misc/runners/common/create-runner.sh new file mode 100644 index 0000000000..2e77c0e4bf --- /dev/null +++ b/misc/runners/common/create-runner.sh @@ -0,0 +1,114 @@ +#!/usr/bin/env bash +# Create, register, and start a GitHub Actions runner. +# +# Sourced by misc/runners/runner.sh after config is loaded. +# Config must define runner_install_dir() and may set TARBALL_CACHE_DIR. +# +# runner_install_dir [override-dir] +# Returns the directory where the runner should be installed. +# If override-dir is given it is used directly; otherwise the site +# computes the path (e.g. SHARED_DIR/ on Frontier, or an +# auto-numbered actions-runner-N/ directory on Phoenix). +# +# TARBALL_CACHE_DIR +# If non-empty, the runner tarball is cached here and reused across +# installs (useful on Frontier where shared Lustre is visible from all +# login nodes). If empty or unset, a fresh download is made for each +# runner and the temporary file is removed after extraction. +# +# Usage: runner.sh create-runner [install-dir] +# name Runner name (e.g. 
frontier-23, phoenix-11) +# node Login node to start the runner on +# install-dir Optional: override the computed installation directory +set -euo pipefail + +RUNNER_NAME="${1:?Usage: create-runner [install-dir]}" +TARGET_NODE="${2:?Usage: create-runner [install-dir]}" +INSTALL_DIR_OVERRIDE="${3:-}" + +RUNNER_DIR=$(runner_install_dir "$RUNNER_NAME" "$INSTALL_DIR_OVERRIDE") +RUNNER_VERSION="${RUNNER_VERSION:-$(gh_latest_runner_version 2>/dev/null || echo "2.332.0")}" +TARBALL="actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz" +TARBALL_URL="https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/${TARBALL}" + +echo "=== Creating runner ===" +echo " Name: $RUNNER_NAME" +echo " Node: $TARGET_NODE" +echo " Directory: $RUNNER_DIR" +echo " Org: $ORG" +echo " Group: $RUNNER_GROUP" +echo " Label: $RUNNER_LABEL" +echo " Version: $RUNNER_VERSION" +echo "" + +if [ -d "$RUNNER_DIR" ]; then + echo "ERROR: Directory already exists: $RUNNER_DIR" >&2 + exit 1 +fi + +# --- Download tarball --- +if [ -n "${TARBALL_CACHE_DIR:-}" ]; then + if [ ! -f "$TARBALL_CACHE_DIR/$TARBALL" ]; then + echo "==> Downloading runner v${RUNNER_VERSION} to cache..." + tmp="$TARBALL_CACHE_DIR/$TARBALL.tmp.$$" + curl -fsSL "$TARBALL_URL" -o "$tmp" + mv "$tmp" "$TARBALL_CACHE_DIR/$TARBALL" + fi + tarball_path="$TARBALL_CACHE_DIR/$TARBALL" +else + echo "==> Downloading runner v${RUNNER_VERSION}..." + mkdir -p "$RUNNER_DIR" + tarball_path="$RUNNER_DIR/runner-download.tmp.$$" + curl -fsSL "$TARBALL_URL" -o "$tarball_path" +fi + +# --- Extract --- +mkdir -p "$RUNNER_DIR" +echo "==> Extracting into $RUNNER_DIR..." +tar xzf "$tarball_path" -C "$RUNNER_DIR" +[ -z "${TARBALL_CACHE_DIR:-}" ] && rm -f "$tarball_path" + +if [ ! -f "$RUNNER_DIR/run.sh" ]; then + echo "ERROR: Extraction failed — run.sh not found in $RUNNER_DIR" >&2 + exit 1 +fi + +# --- Register --- +echo "==> Fetching registration token..." 
+token=$(gh_registration_token) +if [ -z "$token" ]; then + echo "ERROR: Failed to get registration token." >&2 + echo " Run: gh auth refresh -h github.com -s admin:org" >&2 + exit 1 +fi + +echo "==> Configuring runner..." +"$RUNNER_DIR/config.sh" \ + --url "https://github.com/$ORG" \ + --token "$token" \ + --name "$RUNNER_NAME" \ + --runnergroup "$RUNNER_GROUP" \ + --labels "$RUNNER_LABEL" \ + --work "_work" \ + --unattended \ + --replace +echo "==> Configured." + +# --- Start --- +echo "==> Starting on $TARGET_NODE..." +if start_runner "$TARGET_NODE" "$RUNNER_DIR"; then + echo "$TARGET_NODE" > "$RUNNER_DIR/runner.node" + pids=$(find_pids "$TARGET_NODE" "$RUNNER_DIR") + pid=${pids%% *} + if has_slurm "$TARGET_NODE" "$pid"; then + echo "==> OK: $RUNNER_NAME running on $TARGET_NODE (PID $pid, slurm in PATH)" + else + echo "==> WARNING: $RUNNER_NAME running on $TARGET_NODE (PID $pid) but slurm MISSING from PATH" + fi +else + echo "ERROR: $RUNNER_NAME did not start on $TARGET_NODE" >&2 + exit 1 +fi + +echo "" +echo "==> Log: $RUNNER_DIR/runner.log" diff --git a/misc/runners/common/list-runners.sh b/misc/runners/common/list-runners.sh new file mode 100644 index 0000000000..6077dfe47e --- /dev/null +++ b/misc/runners/common/list-runners.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash +# List all runners combining GitHub API status with live node process info. +# +# Sourced by site wrappers (frontier/list-runners.sh, phoenix/list-runners.sh) +# after config.sh is loaded. Uses a parallel SSH sweep across all nodes +# simultaneously (one SSH per node regardless of runner count). +# Shows name, GitHub status, node, slurm availability, and RSS. +# If CGROUP_LIMIT > 0, also shows a per-node memory summary. +# +# Usage: bash list-runners.sh +set -euo pipefail + +declare -f sync_runner_nodes > /dev/null 2>&1 && { + echo "==> Syncing runner node locations..." 
+ sync_runner_nodes +} + +tmpdir=$(mktemp -d) +trap 'rm -rf "$tmpdir"' EXIT + +sweep_all_nodes "$tmpdir" + +# Parse sweep results into associative arrays +declare -A runner_node runner_rss runner_slurm +for node in "${NODES[@]}"; do + while IFS= read -r line; do + read -r _s sweep_node dir rss slurm_ok <<< "$line" + runner_node["$dir"]="$sweep_node" + runner_rss["$dir"]="$rss" + runner_slurm["$dir"]="$slurm_ok" + done < <(grep '^RUNNER ' "$tmpdir/$node.out" 2>/dev/null || true) +done + +# Fetch GitHub API status +declare -A gh_status gh_busy +while read -r _id name status busy; do + gh_status["$name"]="$status" + gh_busy["$name"]="$busy" +done < <(gh_list_runners) + +# Print table +printf "%-25s %-8s %-20s %-8s %s\n" "NAME" "GITHUB" "NODE" "SLURM" "RSS" +printf "%s\n" "$(printf '%.0s-' {1..70})" + +while IFS= read -r dir; do + name=$(get_runner_name "$dir") + [ -z "$name" ] && continue + + [ "${gh_busy[$name]:-false}" = "true" ] && gh_col="BUSY" || gh_col="${gh_status[$name]:-unknown}" + + actual_node="${runner_node[$dir]:-}" + rss="${runner_rss[$dir]:-—}" + slurm="${runner_slurm[$dir]:-—}" + + if [ -z "$actual_node" ]; then + printf "%-25s %-8s %-20s %-8s %s\n" "$name" "$gh_col" "offline" "—" "—" + continue + fi + + # Flag stale runner.node entries + node_col="$actual_node" + if [ -f "$dir/runner.node" ]; then + recorded=$(cat "$dir/runner.node") + [ "$actual_node" != "$recorded" ] && node_col="${actual_node} *(stale: ${recorded})" + fi + + printf "%-25s %-8s %-20s %-8s %sMB\n" "$name" "$gh_col" "$node_col" "$slurm" "$rss" +done < <(find_runner_dirs) + +# Per-node memory summary (only when site has a cgroup limit) +if [ "${CGROUP_LIMIT:-0}" -gt 0 ]; then + echo "" + echo "=== Per-node memory ===" + for node in "${NODES[@]}"; do + count=$(ssh $SSH_OPTS "$node" \ + "ps aux | grep Runner.Listener | grep -v grep | wc -l" 2>/dev/null || echo 0) + rss=$(ssh $SSH_OPTS "$node" \ + "ps -u \$(whoami) -o rss= 2>/dev/null | awk '{sum+=\$1} END {printf \"%.0f\", sum/1024}'" \ 
+ 2>/dev/null || echo "?") + [[ "$rss" =~ ^[0-9]+$ ]] || rss=0 + echo " $node: $count runners, ${rss} MB / ${CGROUP_LIMIT} MB ($(( CGROUP_LIMIT - rss )) MB free)" + done +fi diff --git a/misc/runners/common/move-runner.sh b/misc/runners/common/move-runner.sh new file mode 100644 index 0000000000..ffbadb269b --- /dev/null +++ b/misc/runners/common/move-runner.sh @@ -0,0 +1,71 @@ +#!/usr/bin/env bash +# Move a runner to a different login node. +# +# Sourced by site wrappers (frontier/move-runner.sh, phoenix/move-runner.sh) +# after config.sh is loaded. Finds the runner by name, stops it on its current +# node, and starts it on the target node. Retries start once after 5 seconds. +# +# Usage: bash move-runner.sh +set -euo pipefail + +RUNNER_NAME="${1:?Usage: $0 }" +TARGET_NODE="${2:?Usage: $0 }" + +# Validate target node +valid=0 +for node in "${NODES[@]}"; do + [ "$node" = "$TARGET_NODE" ] && valid=1 && break +done +if [ "$valid" -eq 0 ]; then + echo "ERROR: '$TARGET_NODE' is not a valid login node." >&2 + echo " Valid nodes: ${NODES[*]}" >&2 + exit 1 +fi + +# Find runner directory by name +runner_dir="" +while IFS= read -r dir; do + if [ "$(get_runner_name "$dir")" = "$RUNNER_NAME" ]; then + runner_dir="$dir" + break + fi +done < <(find_runner_dirs) + +if [ -z "$runner_dir" ]; then + echo "ERROR: Runner '$RUNNER_NAME' not found in known runner directories." >&2 + exit 1 +fi + +declare -f sync_runner_nodes > /dev/null 2>&1 && { + echo "==> Syncing runner node locations..." + sync_runner_nodes +} + +echo "==> Locating $RUNNER_NAME..." +current_node=$(find_node "$runner_dir") + +if [ "$current_node" = "$TARGET_NODE" ]; then + echo "==> $RUNNER_NAME is already running on $TARGET_NODE. Nothing to do." + exit 0 +fi + +if [ "$current_node" != "offline" ]; then + echo "==> Stopping $RUNNER_NAME on $current_node..." + stop_runner "$current_node" "$runner_dir" +fi + +echo "==> Starting $RUNNER_NAME on $TARGET_NODE..." 
+if start_runner "$TARGET_NODE" "$runner_dir"; then + echo "$TARGET_NODE" > "$runner_dir/runner.node" + echo "==> $RUNNER_NAME is now running on $TARGET_NODE." +else + echo " First start attempt failed. Retrying in 5 seconds..." + sleep 5 + if start_runner "$TARGET_NODE" "$runner_dir"; then + echo "$TARGET_NODE" > "$runner_dir/runner.node" + echo "==> $RUNNER_NAME is now running on $TARGET_NODE." + else + echo "ERROR: $RUNNER_NAME failed to start on $TARGET_NODE after retry." >&2 + exit 1 + fi +fi diff --git a/misc/runners/common/rebalance-runners.sh b/misc/runners/common/rebalance-runners.sh new file mode 100644 index 0000000000..20a20b0f7c --- /dev/null +++ b/misc/runners/common/rebalance-runners.sh @@ -0,0 +1,179 @@ +#!/usr/bin/env bash +# Core rebalance algorithm for GitHub Actions runners. +# +# Sourced by site wrappers (frontier/rebalance-runners.sh, +# phoenix/rebalance-runners.sh) after config.sh is loaded. +# Discovers all runner directories, checks which node each is on, +# computes the optimal distribution, and moves runners to balance. +# Prefers moving idle runners over busy ones. Also places offline runners. +# +# Usage: bash rebalance-runners.sh # dry run +# APPLY=1 bash rebalance-runners.sh # execute +# APPLY=1 FORCE=1 bash rebalance-runners.sh # move busy runners too +set -euo pipefail + +declare -f sync_runner_nodes > /dev/null 2>&1 && { + echo "==> Syncing runner node locations..." 
+ sync_runner_nodes +} + +# Discover runners +declare -a dirs=() names=() +while IFS= read -r dir; do + name=$(get_runner_name "$dir") + [ -z "$name" ] && continue + dirs+=("$dir") + names+=("$name") +done < <(find_runner_dirs) + +num_nodes=${#NODES[@]} +num_runners=${#dirs[@]} +target=$(( num_runners / num_nodes )) +remainder=$(( num_runners % num_nodes )) + +echo "=== Current state ===" +echo "Runners: $num_runners across $num_nodes nodes" +echo "Target: $target per node (+1 on first $remainder nodes)" +echo "" + +# Map runners to nodes and check busy status +declare -A node_runners +declare -A runner_node runner_busy + +for node in "${NODES[@]}"; do node_runners[$node]=""; done + +for i in "${!dirs[@]}"; do + node=$(find_node "${dirs[$i]}") + runner_node[$i]="$node" + if [ "$node" != "offline" ]; then + node_runners[$node]="${node_runners[$node]:-} $i" + worker=$(ssh $SSH_OPTS "$node" "ps aux | grep Runner.Worker | grep '${dirs[$i]}/' | grep -v grep" 2>/dev/null || true) + [ -n "$worker" ] && runner_busy[$i]=1 || runner_busy[$i]=0 + else + runner_busy[$i]=0 + fi +done + +# Show current distribution +for node in "${NODES[@]}"; do + indices=(${node_runners[$node]:-}) + echo "$node: ${#indices[@]} runners" + for i in "${indices[@]}"; do + busy="" + [ "${runner_busy[$i]:-0}" = "1" ] && busy=" (BUSY)" + echo " ${names[$i]}$busy" + done +done + +offline=() +for i in "${!dirs[@]}"; do + [ "${runner_node[$i]}" = "offline" ] && offline+=("$i") +done +if [ ${#offline[@]} -gt 0 ]; then + echo "" + echo "OFFLINE:" + for i in "${offline[@]}"; do echo " ${names[$i]}"; done +fi +echo "" + +# Compute per-node targets +declare -A node_target +n=0 +for node in "${NODES[@]}"; do + node_target[$node]=$target + [ $n -lt $remainder ] && node_target[$node]=$(( target + 1 )) + n=$((n + 1)) +done + +# Plan moves: pull runners from overloaded nodes (idle first) +to_place=() +for node in "${NODES[@]}"; do + indices=(${node_runners[$node]:-}) + excess=$(( ${#indices[@]} - 
${node_target[$node]} )) + [ $excess -le 0 ] && continue + idle=() busy=() + for i in "${indices[@]}"; do + [ "${runner_busy[$i]:-0}" = "1" ] && busy+=("$i") || idle+=("$i") + done + moved=0 + for i in "${idle[@]}" "${busy[@]}"; do + [ $moved -ge $excess ] && break + to_place+=("$node $i") + moved=$((moved + 1)) + done +done + +# Add offline runners to be placed +for i in "${offline[@]}"; do to_place+=("offline $i"); done + +# Assign to underloaded nodes +moves=() +for entry in "${to_place[@]}"; do + read -r src idx <<< "$entry" + best="" best_deficit=-999 + for node in "${NODES[@]}"; do + cur=(${node_runners[$node]:-}) + deficit=$(( ${node_target[$node]} - ${#cur[@]} )) + [ $deficit -gt $best_deficit ] && best_deficit=$deficit && best=$node + done + [ -z "$best" ] || [ "$best_deficit" -le 0 ] && continue + moves+=("$src $best $idx") + # Update bookkeeping so subsequent assignments reflect this move + if [ "$src" != "offline" ]; then + new="" + for j in ${node_runners[$src]}; do [ "$j" != "$idx" ] && new="$new $j"; done + node_runners[$src]="$new" + fi + node_runners[$best]="${node_runners[$best]:-} $idx" +done + +if [ ${#moves[@]} -eq 0 ]; then + echo "Already balanced." + exit 0 +fi + +echo "=== Planned moves ===" +has_busy=false +for move in "${moves[@]}"; do + read -r src dst idx <<< "$move" + busy="" + [ "${runner_busy[$idx]:-0}" = "1" ] && busy=" (BUSY!)" && has_busy=true + echo " ${names[$idx]}: $src -> $dst$busy" +done +echo "" +echo "=== Target ===" +for node in "${NODES[@]}"; do + cur=(${node_runners[$node]:-}) + echo " $node: ${#cur[@]} runners" +done + +[ "$has_busy" = true ] && [ "${FORCE:-0}" != "1" ] && echo "" && echo "Set FORCE=1 to move busy runners." && exit 1 +[ "${APPLY:-0}" != "1" ] && echo "" && echo "Dry run — set APPLY=1 to execute." && exit 0 + +echo "" +echo "=== Executing ===" +for move in "${moves[@]}"; do + read -r src dst idx <<< "$move" + echo "Moving ${names[$idx]}: $src -> $dst" + if [ "$src" != "offline" ]; then + if ! 
stop_runner "$src" "${dirs[$idx]}"; then + echo " ERROR: Failed to stop ${names[$idx]} on $src; skipping move" >&2 + continue + fi + fi + if start_runner "$dst" "${dirs[$idx]}"; then + echo "$dst" > "${dirs[$idx]}/runner.node" + pids=$(find_pids "$dst" "${dirs[$idx]}") + pid=${pids%% *} + if has_slurm "$dst" "$pid"; then + echo " OK: ${names[$idx]} started on $dst (slurm ok)" + else + echo " WARNING: ${names[$idx]} started on $dst but slurm MISSING from PATH" + fi + else + echo " ERROR: Failed to start ${names[$idx]} on $dst" + fi +done + +echo "" +source "$(dirname "${BASH_SOURCE[0]}")/check-runners.sh" diff --git a/misc/runners/common/rerun-failed.sh b/misc/runners/common/rerun-failed.sh new file mode 100755 index 0000000000..8fc827216d --- /dev/null +++ b/misc/runners/common/rerun-failed.sh @@ -0,0 +1,86 @@ +#!/usr/bin/env bash +# Rerun failed GitHub Actions workflows on open non-draft MFC PRs and master. +# +# Checks the 5 most recent workflow runs per branch. Only the failed jobs +# within each run are rerun (via `gh run rerun --failed`), not the entire +# workflow. Runs that are already in progress or queued are skipped by `gh`. +# +# Requires: gh CLI authenticated with access to MFlowCode/MFC +# +# Usage: bash rerun-failed.sh # dry run (show what would be rerun) +# APPLY=1 bash rerun-failed.sh # actually rerun failed workflows + +set -euo pipefail + +REPO="MFlowCode/MFC" + +echo "Checking open non-draft PRs on $REPO..." +prs=$(gh pr list --repo "$REPO" --state open --json number,title,isDraft --jq '.[] | select(.isDraft == false) | .number') + +if [ -z "$prs" ]; then + echo "No open non-draft PRs found." 
+ exit 0 +fi + +for pr in $prs; do + title=$(gh pr view --repo "$REPO" "$pr" --json title --jq .title) + branch=$(gh pr view --repo "$REPO" "$pr" --json headRefName --jq .headRefName) + + # Find failed workflow runs on this PR's branch + failed_runs=$(gh run list --repo "$REPO" --branch "$branch" --limit 5 \ + --json databaseId,status,conclusion,name \ + --jq '.[] | select(.conclusion == "failure") | "\(.databaseId) \(.name)"') + + if [ -n "$failed_runs" ]; then + echo "" + echo "=== PR #$pr: $title ===" + while read -r run_id run_name; do + # Check which jobs failed; skip if run has expired or been deleted + failed_jobs=$(gh run view --repo "$REPO" "$run_id" \ + --json jobs --jq '.jobs[] | select(.conclusion == "failure" or .conclusion == "cancelled") | .name' \ + 2>/dev/null) || { echo " WARNING: could not fetch jobs for run $run_id, skipping"; continue; } + echo " Run $run_id ($run_name):" + if [ -n "$failed_jobs" ]; then + while read -r job; do + echo " - $job" + done <<< "$failed_jobs" + fi + + if [ "${APPLY:-0}" = "1" ]; then + echo " Rerunning failed jobs..." 
+ gh run rerun --repo "$REPO" "$run_id" --failed || echo " WARNING: rerun failed (may already be rerunning)" + fi + done < <(echo "$failed_runs") + fi +done + +# Also check master branch +echo "" +echo "=== master branch ===" +master_failed=$(gh run list --repo "$REPO" --branch master --limit 5 \ + --json databaseId,status,conclusion,name \ + --jq '.[] | select(.conclusion == "failure") | "\(.databaseId) \(.name)"') +if [ -n "$master_failed" ]; then + while read -r run_id run_name; do + failed_jobs=$(gh run view --repo "$REPO" "$run_id" \ + --json jobs --jq '.jobs[] | select(.conclusion == "failure" or .conclusion == "cancelled") | .name' \ + 2>/dev/null) || { echo " WARNING: could not fetch jobs for run $run_id, skipping"; continue; } + echo " Run $run_id ($run_name):" + if [ -n "$failed_jobs" ]; then + while read -r job; do + echo " - $job" + done <<< "$failed_jobs" + fi + if [ "${APPLY:-0}" = "1" ]; then + echo " Rerunning failed jobs..." + gh run rerun --repo "$REPO" "$run_id" --failed || echo " WARNING: rerun failed" + fi + done < <(echo "$master_failed") +else + echo " All passing" +fi + +if [ "${APPLY:-0}" != "1" ]; then + echo "" + echo "Dry run — set APPLY=1 to actually rerun failed workflows." +fi diff --git a/misc/runners/common/restart-all.sh b/misc/runners/common/restart-all.sh new file mode 100644 index 0000000000..3254a4f7cb --- /dev/null +++ b/misc/runners/common/restart-all.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash +# Restart all runners in place on their current nodes. +# +# Sourced by site wrappers (frontier/restart-all.sh, phoenix/restart-all.sh) +# after config.sh is loaded. Useful after a login node reboot or to pick up +# environment changes. Skips busy runners unless FORCE=1. Dry run by default. 
+# +# Usage: bash restart-all.sh # dry run +# APPLY=1 bash restart-all.sh # execute +# APPLY=1 FORCE=1 bash restart-all.sh # restart busy runners too +set -euo pipefail + +declare -f sync_runner_nodes > /dev/null 2>&1 && { + echo "==> Syncing runner node locations..." + sync_runner_nodes +} + +echo "=== Discovering runners ===" +declare -a restart_nodes=() restart_dirs=() restart_names=() + +while IFS= read -r dir; do + name=$(get_runner_name "$dir") + [ -z "$name" ] && continue + node=$(find_node "$dir") + + if [ "$node" = "offline" ]; then + echo " $name: OFFLINE (use rebalance-runners.sh to place)" + continue + fi + + worker=$(ssh $SSH_OPTS "$node" "ps aux | grep Runner.Worker | grep '$dir/' | grep -v grep" 2>/dev/null || true) + if [ -n "$worker" ]; then + echo " $name: BUSY on $node" + if [ "${FORCE:-0}" != "1" ]; then + echo " Skipping. Set FORCE=1 to restart anyway." + continue + fi + else + echo " $name: idle on $node" + fi + + restart_nodes+=("$node") + restart_dirs+=("$dir") + restart_names+=("$name") +done < <(find_runner_dirs) + +if [ ${#restart_nodes[@]} -eq 0 ]; then + echo "Nothing to restart." + exit 0 +fi + +echo "" +echo "${#restart_nodes[@]} runners will be restarted." + +if [ "${APPLY:-0}" != "1" ]; then + echo "Dry run — set APPLY=1 to execute." + exit 0 +fi + +echo "" +echo "=== Restarting ===" +success=0; fail=0 +for i in "${!restart_nodes[@]}"; do + node="${restart_nodes[$i]}" + dir="${restart_dirs[$i]}" + name="${restart_names[$i]}" + echo "--- $name on $node ---" + if ! 
stop_runner "$node" "$dir"; then + echo " ERROR: Failed to stop; skipping restart" >&2 + fail=$((fail + 1)) + continue + fi + if start_runner "$node" "$dir"; then + echo "$node" > "$dir/runner.node" + pids=$(find_pids "$node" "$dir") + pid=${pids%% *} + if has_slurm "$node" "$pid"; then + echo " OK: PID $pid, slurm in PATH" + success=$((success + 1)) + else + echo " WARNING: PID $pid but slurm MISSING" + fail=$((fail + 1)) + fi + else + echo " ERROR: Failed to start" + fail=$((fail + 1)) + fi +done + +echo "" +echo "=== Summary: $success succeeded, $fail failed ===" diff --git a/misc/runners/common/restart-runner.sh b/misc/runners/common/restart-runner.sh new file mode 100644 index 0000000000..1517d69d32 --- /dev/null +++ b/misc/runners/common/restart-runner.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +# Restart a single GitHub Actions runner on a given node. +# +# Sourced by site wrappers (frontier/restart-runner.sh, phoenix/restart-runner.sh) +# after config.sh is loaded. Stops any existing process, starts fresh, and +# verifies slurm is in PATH. +# +# Usage: bash restart-runner.sh +set -euo pipefail + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + echo "Nodes: ${NODES[*]}" + exit 1 +fi + +node="$1" +dir="$2" +name=$(get_runner_name "$dir" 2>/dev/null || basename "$dir") + +echo "Restarting $name on $node..." +stop_runner "$node" "$dir" + +if start_runner "$node" "$dir"; then + echo "$node" > "$dir/runner.node" + pids=$(find_pids "$node" "$dir") + pid=${pids%% *} + if has_slurm "$node" "$pid"; then + echo " OK: PID $pid, slurm in PATH" + else + echo " WARNING: PID $pid but slurm MISSING from PATH" + fi +else + echo " ERROR: Failed to start on $node" + exit 1 +fi diff --git a/misc/runners/common/runner-lib.sh b/misc/runners/common/runner-lib.sh new file mode 100755 index 0000000000..91f5a6adc3 --- /dev/null +++ b/misc/runners/common/runner-lib.sh @@ -0,0 +1,165 @@ +#!/usr/bin/env bash +# Shared GitHub Actions runner management library. 
+# +# Sourced by site-specific config.sh files (misc/frontier/config.sh, +# misc/phoenix/config.sh). Callers must define ORG, NODES, and SSH_OPTS +# before sourcing this file. + +# Default: no cgroup memory limit displayed. Override in site config (e.g. CGROUP_LIMIT=4096). +CGROUP_LIMIT=${CGROUP_LIMIT:-0} + +# --- GitHub API --- + +# List runners from the GitHub API, filtered to this site's label. +# Prints: id name status busy (one runner per line) +gh_list_runners() { + gh api "orgs/$ORG/actions/runners" --paginate \ + --jq ".runners[] + | select(.labels | map(.name) | index(\"$RUNNER_LABEL\")) + | \"\(.id) \(.name) \(.status) \(.busy)\"" +} + +# Get a registration token for new runners. +gh_registration_token() { + gh api "orgs/$ORG/actions/runners/registration-token" --jq .token +} + +# Get the latest runner binary version. +gh_latest_runner_version() { + gh api repos/actions/runner/releases/latest --jq '.tag_name | ltrimstr("v")' +} + +# Remove a runner registration from GitHub. +# Args: $1 = runner ID (numeric, from API) +gh_remove_runner() { + gh api "orgs/$ORG/actions/runners/$1" -X DELETE +} + +# --- Local filesystem --- + +# Get the GitHub runner name from a .runner config file. +# Uses sys.argv to avoid path injection into Python source code. +# Prints the agentName, or empty string if the file is missing or unparsable +# (with a warning to stderr). +# Args: $1 = runner directory +get_runner_name() { + python3 -c " +import json, sys +d = json.loads(open(sys.argv[1] + '/.runner').read().lstrip('\ufeff')) +print(d.get('agentName', '')) +" "$1" 2>/dev/null \ + || echo "WARNING: could not read runner name from '$1/.runner'" >&2 +} + +# --- Login-node process management --- + +# Find PIDs of a runner on a node by matching its executable path. +# Candidate PIDs are found by grepping ps for Runner.Listener; each +# candidate's /proc/$p/exe is then resolved and matched against +# $dir/bin/Runner.Listener to confirm identity. 
This makes the confirmation +# step independent of CWD or process arguments. +# Output is filtered to numeric lines only to strip SSH MOTD noise. +# Args: $1 = node, $2 = runner directory +# Prints: space-separated PIDs, or empty. +find_pids() { + ssh $SSH_OPTS "$1" ' + for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do + exe=$(readlink -f /proc/$p/exe 2>/dev/null || true) + [ "$exe" = "'"$2"'/bin/Runner.Listener" ] && echo "$p" + done + ' 2>/dev/null | grep -E '^[0-9]+$' | tr '\n' ' ' || true +} + +# Find which login node a runner is on. +# Args: $1 = runner directory +# Prints: node hostname, or "offline". +find_node() { + for node in "${NODES[@]}"; do + [ -n "$(find_pids "$node" "$1")" ] && echo "$node" && return + done + echo "offline" +} + +# Check if a runner process has a slurm directory in its PATH. +# Works across sites regardless of the specific slurm installation path. +# Returns non-zero if slurm is absent OR if the SSH check itself fails +# (callers should treat non-zero as "could not confirm slurm present"). +# Args: $1 = node, $2 = PID +has_slurm() { + local node="$1" pid="$2" + ssh $SSH_OPTS "$node" \ + "tr '\0' '\n' < /proc/$pid/environ 2>/dev/null | grep -q '^PATH=.*slurm'" \ + 2>/dev/null +} + +# Sweep all nodes in parallel, writing per-node result files to tmpdir. +# Each output line: RUNNER +# dir = runner directory derived from the Runner.Listener exe path +# slurm_ok = "ok" if slurm appears in the process PATH, "MISSING" otherwise +# Warns to stderr for any node whose output file is empty (SSH likely failed). +# Caller must create tmpdir and parse the output files. 
+# Args: $1 = tmpdir +sweep_all_nodes() { + local tmpdir="$1" node + for node in "${NODES[@]}"; do + ssh $SSH_OPTS "$node" ' + for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do + exe=$(readlink -f /proc/$p/exe 2>/dev/null || true) + [ -z "$exe" ] && continue + dir=$(dirname "$(dirname "$exe")") + rss=$(ps -p $p -o rss= 2>/dev/null | awk "{printf \"%.0f\", \$1/1024}" || echo 0) + slurm=$(tr "\0" "\n" < /proc/$p/environ 2>/dev/null | grep -c "^PATH=.*slurm" || echo 0) + [ "$slurm" -gt 0 ] && slurm_ok="ok" || slurm_ok="MISSING" + echo "RUNNER '"$node"' $dir $rss $slurm_ok" + done + ' 2>/dev/null > "$tmpdir/$node.out" & + done + wait + for node in "${NODES[@]}"; do + if [ ! -s "$tmpdir/$node.out" ]; then + echo "WARNING: no runner data from $node (SSH may have failed)" >&2 + fi + done +} + +# Start a runner on a node. +# Uses a login shell (bash -lc) so site PATH (e.g. SLURM) is available. +# SSH launch failures are logged to stderr but do not prevent the function +# from checking whether the process appeared (find_pids after 3s). +# Args: $1 = node, $2 = runner directory +# Returns: 0 if a Runner.Listener process is found after start, 1 otherwise. +start_runner() { + local node="$1" dir="$2" + timeout 15 ssh $SSH_OPTS "$node" \ + "cd $dir && setsid bash -lc 'nohup ./run.sh >> runner.log 2>&1 < /dev/null &'" \ + /dev/null \ + || echo "WARNING: SSH launch to $node failed; checking for process anyway..." >&2 + sleep 3 + [ -n "$(find_pids "$node" "$dir")" ] +} + +# Stop a runner on a node (SIGTERM, 3s grace, then SIGKILL). +# After SIGKILL, waits 1s then re-checks with find_pids. If the process is +# still alive (e.g. SSH kill failed, wrong UID, kernel stuck), emits a warning +# to stderr and returns 1 so callers can react. Returns 0 if the runner is +# confirmed stopped or was never running. 
+# Args: $1 = node, $2 = runner directory +stop_runner() { + local node="$1" dir="$2" pids + pids=$(find_pids "$node" "$dir") + [ -z "$pids" ] && return 0 + for pid in $pids; do + ssh $SSH_OPTS "$node" "kill $pid" 2>/dev/null || true + done + sleep 3 + pids=$(find_pids "$node" "$dir") + for pid in $pids; do + ssh $SSH_OPTS "$node" "kill -9 $pid" 2>/dev/null || true + done + sleep 1 + pids=$(find_pids "$node" "$dir") + if [ -n "$pids" ]; then + echo "WARNING: process(es) $pids on $node survived SIGKILL; runner may still be running" >&2 + return 1 + fi +} diff --git a/misc/runners/common/stop-runner.sh b/misc/runners/common/stop-runner.sh new file mode 100644 index 0000000000..b9a8661846 --- /dev/null +++ b/misc/runners/common/stop-runner.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +# Stop and deregister a GitHub Actions runner. +# +# Sourced by site wrappers (frontier/stop-runner.sh, phoenix/stop-runner.sh) +# after config.sh is loaded. Finds the runner directory by name via +# find_runner_dirs(), stops the process, and removes the GitHub registration. +# +# Usage: bash stop-runner.sh +set -euo pipefail + +RUNNER_NAME="${1:?Usage: $0 }" + +# Find runner directory by name +runner_dir="" +while IFS= read -r dir; do + if [ "$(get_runner_name "$dir")" = "$RUNNER_NAME" ]; then + runner_dir="$dir" + break + fi +done < <(find_runner_dirs) + +if [ -z "$runner_dir" ]; then + echo "ERROR: Runner '$RUNNER_NAME' not found in known runner directories." >&2 + exit 1 +fi + +# Locate and stop the process +echo "==> Locating $RUNNER_NAME..." +node=$(find_node "$runner_dir") + +if [ "$node" != "offline" ]; then + echo "==> Stopping $RUNNER_NAME on $node..." + stop_runner "$node" "$runner_dir" + echo "==> Process stopped." +else + echo "==> $RUNNER_NAME is not running (already offline)." +fi + +# Deregister from GitHub +echo "==> Fetching runner ID from GitHub..." 
+runner_id="" +runner_list=$(gh_list_runners 2>/dev/null) || { + echo "WARNING: GitHub API call failed; runner may still be registered on GitHub." >&2 + exit 0 +} +while read -r id name _status _busy; do + [ "$name" = "$RUNNER_NAME" ] && runner_id="$id" && break +done <<< "$runner_list" + +if [ -n "$runner_id" ]; then + echo "==> Deregistering runner (ID $runner_id)..." + gh_remove_runner "$runner_id" + echo "==> Done." +else + echo "==> Runner not found in GitHub API (may already be deregistered)." +fi diff --git a/misc/runners/frontier/README.md b/misc/runners/frontier/README.md new file mode 100644 index 0000000000..ca3bb07ce2 --- /dev/null +++ b/misc/runners/frontier/README.md @@ -0,0 +1,97 @@ +# Frontier Runner Management Scripts + +Scripts for managing GitHub Actions self-hosted runners on OLCF Frontier login +nodes. The runners submit SLURM jobs to Frontier compute nodes for MFC's CI/CD +pipeline using the `service` partition with the `develop` QOS. + +## Background + +Frontier has 11 login nodes (`login01`–`login11`). Runner binaries live on +shared Lustre storage (`/lustre/orion/cfd154/proj-shared/runners/`), so moving +a runner between nodes only requires stopping the process on one node and +starting it on another — no binary copying needed. + +Each runner directory contains a `runner.node` file recording which login node +it was last started on. This is used as a fallback hint when restarting offline +runners. The authoritative source of truth for whether a runner is running (and +on which node) is EXE-based process discovery via `/proc/$pid/exe` — not any +PID file. + +`runner.node` is self-healing: `rebalance-runners` calls `sync_runner_nodes` +at startup, which sweeps all nodes in parallel and corrects any stale +`runner.node` files automatically. + +Runners occasionally die due to OLCF's firewall/proxy dropping long-lived TCP +connections to GitHub's broker. Run `rebalance-runners` to restart and +redistribute them. 
Login nodes vary in stability — if a runner keeps dying on a +particular node, move it to a quieter one (login01 tends to have low load). + +All commands are run via the dispatcher at `misc/runners/runner.sh`: + +```bash +bash misc/runners/runner.sh frontier [args...] +``` + +## Quick Reference + +```bash +R="bash misc/runners/runner.sh frontier" + +# List all runners with GitHub status, node, slurm, and memory usage +$R list-runners + +# Check runner health across all login nodes +$R check-runners + +# Rebalance runners across all 11 nodes (also restarts offline runners) +$R rebalance-runners # dry run +APPLY=1 $R rebalance-runners # execute +APPLY=1 FORCE=1 $R rebalance-runners # move busy runners too + +# Restart all runners in place (e.g. after a node reboot) +APPLY=1 $R restart-all + +# Restart one specific runner +$R restart-runner login01 /path/to/runner-dir + +# Move a runner to a different login node +$R move-runner frontier-1 login01 + +# Stop and deregister a runner +$R stop-runner frontier-12 + +# Deploy a new runner on a specific node +$R make-runner 23 login01 + +# Deploy multiple runners across nodes (e.g. runners 23, 24, 25) +$R deploy-runners 23 login01 login02 login03 + +# Rerun failed CI workflows +$R rerun-failed +APPLY=1 $R rerun-failed +``` + +## Scripts + +| Script | Purpose | +|---|---| +| `config.sh` | Shared configuration: Frontier constants, `find_runner_dirs()`, and `sync_runner_nodes()`. Sources `../common/runner-lib.sh` for shared functions. | +| `make-runner.sh` | Download runner binary, register with GitHub, start on target node. Usage: `make-runner [node]` | +| `deploy-runners.sh` | Deploy multiple runners across nodes in parallel. Usage: `deploy-runners [node2 ...]` | +| `../common/` | All other commands (`check-runners`, `list-runners`, `rebalance-runners`, etc.) live here and are dispatched via `misc/runners/runner.sh`. 
| + +## Troubleshooting + +**Runner goes OFFLINE repeatedly on the same node** — That login node may have +process culling or high memory pressure. Move it to a different node: +```bash +bash misc/runners/runner.sh frontier move-runner frontier-1 login01 +``` + +**Multiple runners OFFLINE at once** — Usually a transient OLCF network blip +to GitHub. Run `rebalance-runners` to recover and redistribute all at once. + +**Runner appears offline on GitHub but process is running** — GitHub status can +lag. `rebalance-runners` uses EXE-based process discovery first: if a +process is found running, it will stop it before restarting, preventing +duplicate runner processes. diff --git a/misc/runners/frontier/config.sh b/misc/runners/frontier/config.sh new file mode 100755 index 0000000000..ce6962c0ab --- /dev/null +++ b/misc/runners/frontier/config.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash +# Shared configuration for Frontier GitHub Actions runner management. +# +# Sourced by all other scripts. Provides Frontier constants and +# site-specific functions. Common functions live in ../common/runner-lib.sh. + +# --- Frontier constants --- +ORG="MFlowCode" +RUNNER_GROUP="phoenix" # Both sites share one GitHub runner group named "phoenix" +RUNNER_LABEL="frontier" +NODES=(login01 login02 login03 login04 login05 login06 login07 login08 login09 login10 login11) +SHARED_DIR="/lustre/orion/cfd154/proj-shared/runners" + +SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=10 -o BatchMode=yes -o ServerAliveInterval=10 -o ServerAliveCountMax=3" + +source "$(dirname "${BASH_SOURCE[0]}")/../common/runner-lib.sh" + +# --- Local filesystem --- + +# Cache downloaded runner tarballs here so parallel deployments don't race. +TARBALL_CACHE_DIR="$SHARED_DIR" + +# Return the directory where a named runner should be installed. 
# Args: $1 = runner name, $2 = optional override dir
runner_install_dir() {
  local name="$1" override="${2:-}"
  [ -n "$override" ] && echo "$override" && return
  # Frontier runners are installed under their own name on shared Lustre.
  echo "$SHARED_DIR/$name"
}

# Find all runner directories on shared storage.
# A directory counts as a runner only if it holds a .runner registration file.
# Prints: one directory path per line.
find_runner_dirs() {
  for conf in "$SHARED_DIR"/frontier-*/.runner; do
    [ -f "$conf" ] && dirname "$conf"
  done
}

# --- Login-node process management ---

# Sweep all nodes in parallel and update runner.node for any runner
# found running on a different node than recorded. Called at the top of
# every primary script to ensure runner.node always reflects reality,
# even if a runner was manually restarted on a different node.
sync_runner_nodes() {
  local tmpdir node
  tmpdir=$(mktemp -d)
  # A RETURN trap set inside a function fires when this function returns,
  # cleaning the sweep scratch dir on every exit path.
  trap 'rm -rf "$tmpdir"' RETURN

  sweep_all_nodes "$tmpdir"

  for node in "${NODES[@]}"; do
    while IFS= read -r line; do
      local dir sweep_node
      # Sweep lines look like: RUNNER <node> <dir> <rss> <slurm-status>
      read -r _s sweep_node dir _rss _slurm <<< "$line"
      [ -f "$dir/runner.node" ] || continue
      local recorded
      recorded=$(cat "$dir/runner.node" 2>/dev/null || echo "")
      if [ "$sweep_node" != "$recorded" ]; then
        echo "==> $(basename "$dir"): runner.node updated $recorded -> $sweep_node"
        echo "$sweep_node" > "$dir/runner.node"
      fi
    done < <(grep '^RUNNER ' "$tmpdir/$node.out" 2>/dev/null || true)
  done
}
diff --git a/misc/runners/frontier/deploy-runners.sh b/misc/runners/frontier/deploy-runners.sh
new file mode 100755
index 0000000000..f4ad3ecbb5
--- /dev/null
+++ b/misc/runners/frontier/deploy-runners.sh
@@ -0,0 +1,59 @@
#!/usr/bin/env bash
# Deploy one runner per login node in parallel.
# Usage: deploy-runners.sh <start-num> <node1> [node2 ...]
# Example: deploy-runners.sh 17 login08 login09 login10
# Deploys frontier-17 on login08, frontier-18 on login09, frontier-19 on login10.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/config.sh"

# Fix: usage messages had lost their "<start-num> <node1>" placeholders.
START_NUM="${1:?Usage: $0 <start-num> <node1> [node2 ...]}"
shift
TARGET_NODES=("$@")

if [ ${#TARGET_NODES[@]} -eq 0 ]; then
  echo "Error: no login nodes specified." >&2
  echo "Usage: $0 <start-num> <node1> [node2 ...]" >&2
  exit 1
fi

# Pre-download the runner tarball once before spawning parallel make-runner.sh
# instances. Without this, all instances race to download the same file
# concurrently and corrupt it. The tmp+mv ensures an atomic final placement.
RUNNER_VERSION="${RUNNER_VERSION:-$(gh_latest_runner_version 2>/dev/null || echo "2.332.0")}"
TARBALL="actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz"
if [ ! -f "${TARBALL_CACHE_DIR}/${TARBALL}" ]; then
  echo "==> Downloading runner v${RUNNER_VERSION}..."
  tmp="${TARBALL_CACHE_DIR}/${TARBALL}.tmp.$$"
  curl -fsSL \
    "https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/${TARBALL}" \
    -o "$tmp"
  mv "$tmp" "${TARBALL_CACHE_DIR}/${TARBALL}"
fi
# Children (make-runner.sh) reuse the already-resolved version.
export RUNNER_VERSION

# Fan out one make-runner.sh per node; record PID/name/node for reporting.
declare -a pids=() nums=() deploy_nodes=()
for i in "${!TARGET_NODES[@]}"; do
  NODE="${TARGET_NODES[$i]}"
  NUM=$((START_NUM + i))
  echo "==> Deploying frontier-${NUM} on ${NODE}..."
  "$SCRIPT_DIR/make-runner.sh" "${NUM}" "${NODE}" &
  pids+=("$!")
  nums+=("$NUM")
  deploy_nodes+=("$NODE")
done

# Reap every child individually so one failure doesn't mask the others.
failed=0
for i in "${!pids[@]}"; do
  if ! wait "${pids[$i]}"; then
    echo "ERROR: frontier-${nums[$i]} on ${deploy_nodes[$i]} failed." >&2
    failed=$((failed + 1))
  fi
done

if [ "$failed" -gt 0 ]; then
  echo "==> $failed runner(s) failed to deploy." >&2
  exit 1
fi
echo "==> All runners deployed."
diff --git a/misc/runners/frontier/make-runner.sh b/misc/runners/frontier/make-runner.sh
new file mode 100755
index 0000000000..bfd664da65
--- /dev/null
+++ b/misc/runners/frontier/make-runner.sh
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
# Frontier convenience wrapper: derives runner name "frontier-<num>" from a number.
+# For full name control, use: runner.sh frontier create-runner +# +# Usage: runner.sh frontier make-runner [node] +# num Runner number (e.g. 23 creates "frontier-23") +# node Login node to start on (default: current host) +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$SCRIPT_DIR/config.sh" + +NUM="${1:?Usage: $0 [node]}" +NODE="${2:-$(hostname -s)}" + +source "$SCRIPT_DIR/../common/create-runner.sh" "frontier-$NUM" "$NODE" diff --git a/misc/runners/phoenix/README.md b/misc/runners/phoenix/README.md new file mode 100644 index 0000000000..3e632a2ae6 --- /dev/null +++ b/misc/runners/phoenix/README.md @@ -0,0 +1,96 @@ +# Phoenix Runner Management Scripts + +Scripts for managing GitHub Actions self-hosted runners on Georgia Tech Phoenix +login nodes. The runners submit SLURM jobs to Phoenix compute nodes for MFC's +CI/CD pipeline. + +## Background + +Phoenix has 3 physical login nodes (`login-phoenix-gnr-{1,2,3}`), each with a +**4 GB per-user cgroup memory limit**. Each runner uses ~60-130 MB, so +distributing them evenly (~3-4 per node) is important to avoid OOM kills. + +Runner binaries live on shared storage, so moving a runner between nodes only +requires stopping the process on one node and starting it on another. + +Runners must be started with a **login shell** (`bash -l`) so they inherit +`/opt/slurm/current/bin` in PATH (required for `sbatch`, `squeue`, `sacct`). + +All commands are run via the dispatcher at `misc/runners/runner.sh`: + +```bash +bash misc/runners/runner.sh phoenix [args...] +``` + +## Quick Reference + +```bash +R="bash misc/runners/runner.sh phoenix" + +# Check health (quick, one SSH per node) +$R check-runners + +# Detailed table with GitHub API status +$R list-runners + +# Auto-rebalance across nodes (also restarts offline runners) +$R rebalance-runners # dry run +APPLY=1 $R rebalance-runners # execute + +# Restart all runners in place (e.g. 
after a node reboot) +APPLY=1 $R restart-all + +# Restart one specific runner +$R restart-runner login-phoenix-gnr-2 /path/to/actions-runner-3 + +# Move a runner to a different login node +$R move-runner phoenix-3 login-phoenix-gnr-1 + +# Stop and deregister a runner +$R stop-runner phoenix-3 + +# Create a new runner (needs gh CLI with admin:org scope) +$R create-runner phoenix-11 login-phoenix-gnr-2 + +# Rerun failed CI on open PRs +$R rerun-failed # dry run +APPLY=1 $R rerun-failed # execute +``` + +## Scripts + +| Script | Purpose | +|---|---| +| `config.sh` | Shared config: Phoenix constants (`ORG`, `RUNNER_GROUP`, `RUNNER_LABEL`, `NODES`, `CGROUP_LIMIT`, `RUNNER_PARENT_DIRS`), `find_runner_dirs()`, and `runner_install_dir()`. Sources `../common/runner-lib.sh` for shared functions. | +| `../common/` | All commands (`check-runners`, `list-runners`, `create-runner`, `rebalance-runners`, etc.) live here and are dispatched via `misc/runners/runner.sh`. | + +## Safety + +- **Dry run by default**: `rebalance-runners`, `restart-all`, and + `rerun-failed` show what they would do unless `APPLY=1` is set. +- **Busy runner protection**: Scripts skip BUSY runners unless `FORCE=1`. +- **Slurm PATH verification**: After starting, scripts verify `slurm` appears + in the runner's PATH and warn if missing. + +## Configuration + +Edit `config.sh` to change: + +- `ORG` / `RUNNER_GROUP` / `RUNNER_LABEL` — GitHub org and runner registration +- `NODES` — physical login node hostnames +- `CGROUP_LIMIT` — per-user memory limit in MB +- `RUNNER_PARENT_DIRS` — directories containing `actions-runner-*/` installations + +## Troubleshooting + +**"sbatch: command not found"** — Runner started without login shell. +Fix: `bash misc/runners/runner.sh phoenix restart-runner ` + +**OOM kills** — Too many runners on one node. 
+Fix: `bash misc/runners/runner.sh phoenix check-runners` then `APPLY=1 bash misc/runners/runner.sh phoenix rebalance-runners` + +**Runner OFFLINE** — Process died or node rebooted. +Fix: `APPLY=1 bash misc/runners/runner.sh phoenix rebalance-runners` (auto-places on least-loaded node) + +**All runners down** — Node maintenance. +Fix: `APPLY=1 bash misc/runners/runner.sh phoenix restart-all` diff --git a/misc/runners/phoenix/config.sh b/misc/runners/phoenix/config.sh new file mode 100755 index 0000000000..11826019c2 --- /dev/null +++ b/misc/runners/phoenix/config.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Shared configuration for Phoenix GitHub Actions runner management. +# +# Sourced by all other scripts. Provides Phoenix constants and +# site-specific functions. Common functions live in ../common/runner-lib.sh. + +# --- Phoenix constants --- +ORG="MFlowCode" +RUNNER_GROUP="phoenix" +RUNNER_LABEL="gt" +NODES=(login-phoenix-gnr-1 login-phoenix-gnr-2 login-phoenix-gnr-3) +CGROUP_LIMIT=4096 # per-user memory limit in MB on login nodes + +SSH_OPTS="-o ConnectTimeout=5" + +# Parent directories containing actions-runner-*/ installations on shared storage. +RUNNER_PARENT_DIRS=( + /storage/scratch1/6/sbryngelson3/mfc-runners + /storage/project/r-sbryngelson3-0/sbryngelson3/mfc-runners-2 +) + +source "$(dirname "${BASH_SOURCE[0]}")/../common/runner-lib.sh" + +# --- Local filesystem --- + +# No shared cache: each runner downloads its own tarball independently. +TARBALL_CACHE_DIR="" + +# Return the directory where a named runner should be installed. +# Auto-increments the actions-runner-N suffix within RUNNER_PARENT_DIRS[0]. 
# Args: $1 = runner name (unused; directory is numbered, not named), $2 = optional override dir
runner_install_dir() {
  local override="${2:-}"
  [ -n "$override" ] && echo "$override" && return
  # NOTE(review): this installs under RUNNER_PARENT_DIRS[1] (mfc-runners-2),
  # but the header comment above says RUNNER_PARENT_DIRS[0] — confirm which
  # parent directory is actually intended. Behavior kept as-is.
  local parent="${RUNNER_PARENT_DIRS[1]}"
  local existing next_num
  # Highest existing actions-runner-N suffix; empty when none exist yet.
  existing=$(ls -d "$parent"/actions-runner-* 2>/dev/null | sed 's/.*actions-runner-//' | sort -n | tail -1)
  next_num=$(( ${existing:-0} + 1 ))
  echo "$parent/actions-runner-$next_num"
}

# Find all runner directories on shared storage.
# A directory counts as a runner only if it holds a .runner registration file.
# Prints: one directory path per line.
find_runner_dirs() {
  for parent in "${RUNNER_PARENT_DIRS[@]}"; do
    for conf in "$parent"/actions-runner-*/.runner; do
      [ -f "$conf" ] && dirname "$conf"
    done
  done
}
diff --git a/misc/runners/runner.sh b/misc/runners/runner.sh
new file mode 100644
index 0000000000..0de3f2d625
--- /dev/null
+++ b/misc/runners/runner.sh
@@ -0,0 +1,55 @@
#!/usr/bin/env bash
# Dispatcher for GitHub Actions runner management scripts.
#
# Loads site-specific configuration then runs the requested command.
# Common commands live in misc/runners/common/; site-specific commands live in
# misc/runners/<site>/. All site-specific scripts source their own config, so
# the dispatcher only pre-loads config for common commands.
#
# Usage: bash misc/runners/runner.sh <site> <command> [args...]
#
# Sites: frontier phoenix
# Common: check-runners list-runners move-runner rebalance-runners
# restart-all restart-runner stop-runner rerun-failed
# Frontier: make-runner deploy-runners
# Phoenix: create-runner
#
# Examples:
# bash misc/runners/runner.sh frontier check-runners
# bash misc/runners/runner.sh phoenix list-runners
# APPLY=1 bash misc/runners/runner.sh frontier rebalance-runners
# bash misc/runners/runner.sh frontier restart-runner login01 /path/to/runner
# bash misc/runners/runner.sh frontier make-runner 23 login01
# bash misc/runners/runner.sh phoenix create-runner phoenix-11 login-phoenix-gnr-2

set -euo pipefail

MISC_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Fix: the :? usage messages had lost their "<site> <command>" placeholders.
SITE="${1:?Usage: $0 <site> <command> [args...]}"
CMD="${2:?Usage: $0 <site> <command> [args...]}"
shift 2

if [ ! -f "$MISC_DIR/$SITE/config.sh" ]; then
  echo "ERROR: Unknown site '$SITE'. Known sites: frontier, phoenix" >&2
  exit 1
fi

# Site-specific scripts are standalone — they source their own config.
if [ -f "$MISC_DIR/$SITE/$CMD.sh" ]; then
  exec bash "$MISC_DIR/$SITE/$CMD.sh" "$@"
fi

# Common scripts need the site config pre-loaded.
if [ -f "$MISC_DIR/common/$CMD.sh" ]; then
  SITE_SCRIPT_DIR="$MISC_DIR/$SITE"
  source "$MISC_DIR/$SITE/config.sh"
  source "$MISC_DIR/common/$CMD.sh" "$@"
  exit
fi

echo "ERROR: Unknown command '$CMD' for site '$SITE'." >&2
echo "Common: check-runners list-runners move-runner rebalance-runners restart-all restart-runner stop-runner rerun-failed" >&2
echo "Frontier: make-runner deploy-runners" >&2
echo "Phoenix: create-runner" >&2
exit 1
diff --git a/misc/starting-phoenix-runners.md b/misc/starting-phoenix-runners.md
deleted file mode 100644
index 5e77fbd189..0000000000
--- a/misc/starting-phoenix-runners.md
+++ /dev/null
@@ -1,110 +0,0 @@
-# Launching Phoenix Runners
-
-The Phoenix runners were repeatedly failing due to a network error.
-Spencer managed to fix it via [this PR](https://github.com/MFlowCode/MFC/pull/933) and by running things through a socks5 proxy on each login node that holds a runner. -These are documented for Spencer or his next of kin. - -__The runners are started via the following process__ - -1. Log in to the login node via `ssh login-phoenix-rh9-.pace.gatech.edu`. `` can be `1` through `6` on Phoenix. - * Detour: Make sure no stray `ssh` daemons are sitting around: `pkill -9 sshd`. - * You can probably keep your terminal alive via `fuser -k -9 ~/nohup.out`, which kills (signal 9) whatever process is writing to that no-hangup file (the daemon we care about) -2. Log back into the same login node because you may have just nuked your session - * Detour: Make sure stray runners on that login node are dead (one liner): `pkill -9 -f -E 'run.sh|Runner.listener|Runner.helper'` - * If cautious, check that no runner processes are left over. `top` followed by `u` and `` and return. -3. Execute from your home directory: `nohup ssh -N -D 1080 -vvv login-phoenix-rh9-.pace.gatech.edu &`, replacing `` with the login node number - * This starts a proxy to tunnel a new ssh session through -4. Navigate to your runner's directory (or create a runner directory if you need). - * Right now they are in Spencer's `scratch/mfc-runners/action-runner-` -5. 
Run the alias `start_runner`, which dumps output `~/runner.out` - * If one doesn't have this alias yet, create and source it in your `.bashrc` or similar: -```bash -alias start_runner=' \ - http_proxy="socks5://localhost:1080" \ - https_proxy="socks5://localhost:1080" \ - no_proxy="localhost,127.0.0.1,github.com,api.github.com,pipelines.actions.githubusercontent.com,alive.github.com,pypi.org,files.pythonhosted.org,fftw.org,www.fftw.org" \ - NO_PROXY="localhost,127.0.0.1,github.com,api.github.com,pipelines.actions.githubusercontent.com,alive.github.com,pypi.org,files.pythonhosted.org,fftw.org,www.fftw.org" \ - RUNNER_DEBUG=1 \ - ACTIONS_STEP_DEBUG=1 \ - GITHUB_ACTIONS_RUNNER_PREFER_IP_FAMILY=ipv4 \ - DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_TIME=00:01:00 \ - DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_INTERVAL=00:00:20 \ - DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_RETRYCOUNT=5 \ - nohup ./run.sh > ~/runner.out 2>&1 &' -``` -6. You're done - - -### For inquisitive minds - -__Why the `start_runner` alias?__ - -1. `alias start_runner='…'` - Defines a new shell alias named `start_runner`. Whenever you run `start_runner`, the shell will execute everything between the single quotes as if you’d typed it at the prompt. - -2. `http_proxy="socks5://localhost:1080"` - Sets the `http_proxy` environment variable so that any HTTP traffic from the runner is sent through a SOCKS5 proxy listening on `localhost:1080`. - -3. `https_proxy="socks5://localhost:1080"` - Tells HTTPS-aware tools to use that same local SOCKS5 proxy for HTTPS requests. - -4. `no_proxy="localhost,127.0.0.1,github.com,api.github.com,pipelines.actions.githubusercontent.com,alive.github.com,pypi.org,files.pythonhosted.org,fftw.org,www.fftw.org"` - Lists hosts and domains that should bypass the proxy entirely. Commonly used for internal or high-volume endpoints where you don’t want proxy overhead. - -5. 
`NO_PROXY="localhost,127.0.0.1,github.com,api.github.com,pipelines.actions.githubusercontent.com,alive.github.com,pypi.org,files.pythonhosted.org,fftw.org,www.fftw.org"` - Same list as `no_proxy`—some programs only check the uppercase `NO_PROXY` variable. - -6. `RUNNER_DEBUG=1` - Enables debug-level logging in the GitHub Actions runner itself, so you’ll see more verbose internal messages in its logs. - -7. `ACTIONS_STEP_DEBUG=1` - Turns on step-level debug logging for actions you invoke—handy if you need to trace exactly what each action is doing under the hood. - -8. `GITHUB_ACTIONS_RUNNER_PREFER_IP_FAMILY=ipv4` - Forces the runner to resolve DNS names to IPv4 addresses only. Useful if your proxy or network has spotty IPv6 support. - -9. `DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_TIME=00:01:00` - For .NET–based tasks: sets the initial TCP keepalive timeout to 1 minute (after 1 minute of idle, a keepalive probe is sent). - -10. `DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_INTERVAL=00:00:20` - If the first keepalive probe gets no response, wait 20 seconds between subsequent probes. - -11. `DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_RETRYCOUNT=5` - If probes continue to go unanswered, retry up to 5 times before declaring the connection dead. - -12. `nohup ./run.sh > ~/runner.out 2>&1 &` - - `nohup … &` runs `./run.sh` in the background and makes it immune to hangups (so it keeps running if you log out). - - `> ~/runner.out` redirects **stdout** to the file `runner.out` in your home directory. - - `2>&1` redirects **stderr** into the same file, so you get a combined log of everything the script prints. - -__Why the extra ssh command?__ - -1. `http_proxy="socks5://localhost:1080"` - Routes all HTTP traffic through a local SOCKS5 proxy on port 1080. - -2. `https_proxy="socks5://localhost:1080"` - Routes all HTTPS traffic through the same proxy. - -3. 
`no_proxy="localhost,127.0.0.1,github.com,api.github.com,pipelines.actions.githubusercontent.com,alive.github.com,pypi.org,files.pythonhosted.org,fftw.org,www.fftw.org"` - Specifies hosts and domains that bypass the proxy entirely. Includes specific things that MFC's CMake will try to `wget` (e.g., `fftw`) or some other non `git` command. Allows `git clone` to work. - -4. `NO_PROXY="localhost,127.0.0.1,github.com,api.github.com,pipelines.actions.githubusercontent.com,alive.github.com,pypi.org,files.pythonhosted.org,fftw.org,www.fftw.org"` - Same bypass list for applications that only check the uppercase variable. - -5. `RUNNER_DEBUG=1` - Enables verbose internal logging in the GitHub Actions runner. - -6. `GITHUB_ACTIONS_RUNNER_PREFER_IP_FAMILY=ipv4` - Forces DNS resolution to IPv4 to avoid IPv6 issues. - -7. `DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_TIME=00:01:00` - (For .NET tasks) sends the first TCP keepalive probe after 1 minute of idle. - -8. `DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_INTERVAL=00:00:20` - Waits 20 seconds between subsequent TCP keepalive probes. - -9. `DOTNET_SYSTEM_NET_SOCKETS_KEEPALIVE_RETRYCOUNT=5` - Retries keepalive probes up to 5 times before closing the connection. - -10. `nohup ./run.sh > ~/runner.out 2>&1 &` - Runs `run.sh` in the background, immune to hangups, redirecting both stdout and stderr to `~/runner.out`.