MFlowCode · sbryngelson · Mar 16, 2026 · Mar 15, 2026 · Mar 15, 2026 · Mar 15, 2026
@@ -0,0 +1,25 @@
+# Common Runner Management Scripts
+
+Site-agnostic scripts shared between the Frontier and Phoenix runner setups.
+All shared logic lives here; site directories contain only site-specific files
+(`config.sh` and scripts unique to that cluster).
+
+Scripts are invoked via the dispatcher at `misc/runners/runner.sh`:
+```bash
+bash misc/runners/runner.sh <site> <command> [args...]
+```
+
+## Scripts
+
+| Script | Purpose |
+|---|---|
+| `runner-lib.sh` | Shared library: GitHub API helpers, EXE-based process discovery, parallel node sweep, start/stop primitives. Sourced by site `config.sh` files. |
+| `check-runners.sh` | Per-node health check: Runner.Listener processes with name, idle/BUSY, slurm PATH, RSS. Optional cgroup memory footer. |
+| `list-runners.sh` | Full table: GitHub API status × parallel node sweep. Shows slurm status, flags stale `runner.node`. |
+| `rebalance-runners.sh` | Compute optimal distribution and move runners across nodes. Handles offline runners. Writes `runner.node`. Dry run by default. |
+| `restart-runner.sh` | Stop and restart one runner on a given node. Verifies slurm in PATH. Writes `runner.node`. |
+| `restart-all.sh` | Restart all runners in place. Skips busy unless `FORCE=1`. Dry run by default. |
+| `move-runner.sh` | Move a runner to a different login node by name. Stops on current node, starts on target. Writes `runner.node`. |
+| `stop-runner.sh` | Stop a runner process and remove its GitHub registration. |
+| `rerun-failed.sh` | Rerun failed GitHub Actions workflows on open non-draft PRs and master. Dry run by default. |
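+| `create-runner.sh` | Download, register, and start a new runner. Requires `runner_install_dir()` from site config; `TARBALL_CACHE_DIR` is optional (when set, the downloaded tarball is cached there and reused). Usage: `bash misc/runners/runner.sh <site> create-runner <name> <node> [install-dir]` |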
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# Check runner health across all login nodes.
+#
+# Sourced by site wrappers (frontier/check-runners.sh, phoenix/check-runners.sh)
+# after config.sh is loaded. Shows Runner.Listener processes per node with
+# name, busy/idle status, slurm availability, and RSS memory.
+# If CGROUP_LIMIT > 0, also shows per-node total memory vs the cgroup limit.
+#
+# Expects from the loaded config:
+#   NODES              Array of login nodes to visit.
+#   SSH_OPTS           ssh option string; expanded unquoted so it splits into words.
+#   CGROUP_LIMIT       Optional limit in MB; the footer is skipped when unset or 0.
+#   sync_runner_nodes  Optional function; called first when it is defined.
+#
+# Usage: bash check-runners.sh
+set -euo pipefail
+
+if declare -f sync_runner_nodes > /dev/null 2>&1; then
+    echo "==> Syncing runner node locations..."
+    sync_runner_nodes
+fi
+
+for node in "${NODES[@]}"; do
+    echo "=== $node ==="
+    # The remote script is single-quoted, so it is sent verbatim and every
+    # expansion inside it happens on the login node.
+    # shellcheck disable=SC2086  # SSH_OPTS is an option string, split on purpose
+    ssh $SSH_OPTS "$node" '
+        found=0
+        for p in $(ps aux | grep Runner.Listener | grep -v grep | awk "{print \$2}"); do
+            found=1
+            # The runner directory is two levels above the listener executable.
+            exe=$(readlink -f /proc/$p/exe 2>/dev/null || echo "???")
+            dir=$(dirname "$(dirname "$exe")" 2>/dev/null || echo "???")
+            name=$(basename "$dir")
+            # BUSY when a Runner.Worker command line mentions this runner directory.
+            worker=$(ps aux | grep "Runner.Worker" | grep "$dir" | grep -v grep | awk "{print \$2}" | head -1)
+            if [ -n "$worker" ]; then status="BUSY"; else status="idle"; fi
+            rss=$(ps -p $p -o rss= 2>/dev/null | awk "{printf \"%.0f\", \$1/1024}" || echo "?")
+            # grep -c already prints 0 when nothing matches (and exits 1), so only
+            # the exit status is neutralised; echoing a second 0 would corrupt the count.
+            slurm=$(tr "\0" "\n" < /proc/$p/environ 2>/dev/null | grep -c "^PATH=.*slurm" || true)
+            if [ "${slurm:-0}" -gt 0 ]; then slurm_ok="ok"; else slurm_ok="MISSING"; fi
+            printf "  %-30s %5s  slurm=%-7s  %s MB\n" "$name" "$status" "$slurm_ok" "$rss"
+        done
+        if [ "$found" -eq 0 ]; then
+            echo "  (no runners)"
+        fi
+        # Always exit 0 so that only an ssh failure triggers the unreachable message.
+        exit 0
+    ' 2>/dev/null || echo "  (unreachable)"
+
+    if [ "${CGROUP_LIMIT:-0}" -gt 0 ]; then
+        # Sum the RSS of every process owned by the remote user, in MB.
+        # shellcheck disable=SC2086  # SSH_OPTS is an option string, split on purpose
+        rss=$(ssh $SSH_OPTS "$node" \
+            "ps -u \$(whoami) -o rss= 2>/dev/null | awk '{sum+=\$1} END {printf \"%.0f\", sum/1024}'" \
+            2>/dev/null || echo "?")
+        # Anything non-numeric (e.g. the "?" fallback) is treated as 0 so the
+        # arithmetic below cannot fail.
+        [[ "$rss" =~ ^[0-9]+$ ]] || rss=0
+        echo "  --- Total: ${rss} MB / ${CGROUP_LIMIT} MB ($(( CGROUP_LIMIT - rss )) MB free) ---"
+    fi
+    echo ""
+done
@@ -0,0 +1,114 @@
+#!/usr/bin/env bash
+# Create, register, and start a GitHub Actions runner.
+#
+# Sourced by misc/runners/runner.sh after config is loaded.
+# Config must define runner_install_dir() and may set TARBALL_CACHE_DIR.
+# The script also reads ORG, RUNNER_GROUP and RUNNER_LABEL, and calls
+# gh_latest_runner_version, gh_registration_token, start_runner, find_pids
+# and has_slurm, all of which must already be in scope.
+#
+# runner_install_dir <name> [override-dir]
+#   Returns the directory where the runner should be installed.
+#   If override-dir is given it is used directly; otherwise the site
+#   computes the path (e.g. SHARED_DIR/<name> on Frontier, or an
+#   auto-numbered actions-runner-N/ directory on Phoenix).
+#
+# TARBALL_CACHE_DIR
+#   If non-empty, the runner tarball is cached here and reused across
+#   installs (useful on Frontier where shared Lustre is visible from all
+#   login nodes). If empty or unset, a fresh download is made for each
+#   runner and the temporary file is removed after extraction.
+#
+# RUNNER_VERSION
+#   Optional override. When unset, the latest version is looked up and
+#   2.332.0 is used if that lookup fails.
+#
+# Usage: runner.sh <site> create-runner <name> <node> [install-dir]
+#   name         Runner name (e.g. frontier-23, phoenix-11)
+#   node         Login node to start the runner on
+#   install-dir  Optional: override the computed installation directory
+set -euo pipefail
+
+RUNNER_NAME="${1:?Usage: create-runner <name> <node> [install-dir]}"
+TARGET_NODE="${2:?Usage: create-runner <name> <node> [install-dir]}"
+INSTALL_DIR_OVERRIDE="${3:-}"
+
+RUNNER_DIR=$(runner_install_dir "$RUNNER_NAME" "$INSTALL_DIR_OVERRIDE")
+RUNNER_VERSION="${RUNNER_VERSION:-$(gh_latest_runner_version 2>/dev/null || echo "2.332.0")}"
+TARBALL="actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz"
+TARBALL_URL="https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/${TARBALL}"
+
+echo "=== Creating runner ==="
+echo "  Name:      $RUNNER_NAME"
+echo "  Node:      $TARGET_NODE"
+echo "  Directory: $RUNNER_DIR"
+echo "  Org:       $ORG"
+echo "  Group:     $RUNNER_GROUP"
+echo "  Label:     $RUNNER_LABEL"
+echo "  Version:   $RUNNER_VERSION"
+echo ""
+
+# Refuse to install over an existing directory.
+if [ -d "$RUNNER_DIR" ]; then
+    echo "ERROR: Directory already exists: $RUNNER_DIR" >&2
+    exit 1
+fi
+
+# --- Download tarball ---
+if [ -n "${TARBALL_CACHE_DIR:-}" ]; then
+    tarball_path="$TARBALL_CACHE_DIR/$TARBALL"
+    if [ ! -f "$tarball_path" ]; then
+        echo "==> Downloading runner v${RUNNER_VERSION} to cache..."
+        mkdir -p "$TARBALL_CACHE_DIR"
+        # Download under a temporary name and rename afterwards, so a partial
+        # download is never mistaken for a complete cached tarball.
+        tmp="$tarball_path.tmp.$$"
+        if ! curl -fsSL "$TARBALL_URL" -o "$tmp"; then
+            rm -f "$tmp"
+            echo "ERROR: Download failed: $TARBALL_URL" >&2
+            exit 1
+        fi
+        mv "$tmp" "$tarball_path"
+    fi
+else
+    echo "==> Downloading runner v${RUNNER_VERSION}..."
+    mkdir -p "$RUNNER_DIR"
+    tarball_path="$RUNNER_DIR/runner-download.tmp.$$"
+    if ! curl -fsSL "$TARBALL_URL" -o "$tarball_path"; then
+        # Undo the directory created above; otherwise a retry would stop at
+        # the "Directory already exists" check. rmdir only removes it if empty.
+        rm -f "$tarball_path"
+        rmdir "$RUNNER_DIR" 2>/dev/null || true
+        echo "ERROR: Download failed: $TARBALL_URL" >&2
+        exit 1
+    fi
+fi
+
+# --- Extract ---
+mkdir -p "$RUNNER_DIR"
+echo "==> Extracting into $RUNNER_DIR..."
+tar xzf "$tarball_path" -C "$RUNNER_DIR"
+# Only the uncached download is temporary; a cached tarball is kept for reuse.
+if [ -z "${TARBALL_CACHE_DIR:-}" ]; then
+    rm -f "$tarball_path"
+fi
+
+if [ ! -f "$RUNNER_DIR/run.sh" ]; then
+    echo "ERROR: Extraction failed — run.sh not found in $RUNNER_DIR" >&2
+    exit 1
+fi
+
+# --- Register ---
+echo "==> Fetching registration token..."
+# Tolerate a failing helper here: under `set -e` a bare assignment would abort
+# the script before the hint below could be printed.
+token=$(gh_registration_token) || token=""
+if [ -z "$token" ]; then
+    echo "ERROR: Failed to get registration token." >&2
+    echo "       Run: gh auth refresh -h github.com -s admin:org" >&2
+    exit 1
+fi
+
+echo "==> Configuring runner..."
+"$RUNNER_DIR/config.sh" \
+    --url "https://github.com/$ORG" \
+    --token "$token" \
+    --name "$RUNNER_NAME" \
+    --runnergroup "$RUNNER_GROUP" \
+    --labels "$RUNNER_LABEL" \
+    --work "_work" \
+    --unattended \
+    --replace
+echo "==> Configured."
+
+# --- Start ---
+echo "==> Starting on $TARGET_NODE..."
+if start_runner "$TARGET_NODE" "$RUNNER_DIR"; then
+    # Record where the runner lives.
+    echo "$TARGET_NODE" > "$RUNNER_DIR/runner.node"
+    pids=$(find_pids "$TARGET_NODE" "$RUNNER_DIR")
+    # Keep the text before the first space.
+    # NOTE(review): assumes find_pids prints space-separated PIDs — confirm.
+    pid=${pids%% *}
+    if has_slurm "$TARGET_NODE" "$pid"; then
+        echo "==> OK: $RUNNER_NAME running on $TARGET_NODE (PID $pid, slurm in PATH)"
+    else
+        echo "==> WARNING: $RUNNER_NAME running on $TARGET_NODE (PID $pid) but slurm MISSING from PATH"
+    fi
+else
+    echo "ERROR: $RUNNER_NAME did not start on $TARGET_NODE" >&2
+    exit 1
+fi
+
+echo ""
+echo "==> Log: $RUNNER_DIR/runner.log"
@@ -0,0 +1,83 @@
+#!/usr/bin/env bash
+# Print one table row per known runner, merging the GitHub API view with what
+# is actually running on the login nodes.
+#
+# Sourced by site wrappers (frontier/list-runners.sh, phoenix/list-runners.sh)
+# after config.sh is loaded. The node data comes from a single call to
+# sweep_all_nodes, which leaves one <node>.out file per node in a temp dir.
+# Columns: name, GitHub status, node, slurm availability, RSS.
+# When CGROUP_LIMIT > 0 a per-node memory summary follows the table.
+#
+# Usage: bash list-runners.sh
+set -euo pipefail
+
+if declare -f sync_runner_nodes > /dev/null 2>&1; then
+    echo "==> Syncing runner node locations..."
+    sync_runner_nodes
+fi
+
+# Scratch directory for the sweep output; removed on any exit path.
+tmpdir=$(mktemp -d)
+trap 'rm -rf "$tmpdir"' EXIT
+
+sweep_all_nodes "$tmpdir"
+
+# Index the sweep output by runner directory. Each "RUNNER" line is read as:
+# tag, node, directory, RSS, slurm flag.
+declare -A runner_node runner_rss runner_slurm
+for node in "${NODES[@]}"; do
+    while read -r _s sweep_node dir rss slurm_ok; do
+        runner_node["$dir"]="$sweep_node"
+        runner_rss["$dir"]="$rss"
+        runner_slurm["$dir"]="$slurm_ok"
+    done < <(grep '^RUNNER ' "$tmpdir/$node.out" 2>/dev/null || true)
+done
+
+# Index the GitHub API listing by runner name (columns: id, name, status, busy).
+declare -A gh_status gh_busy
+while read -r _id name status busy; do
+    gh_status["$name"]="$status"
+    gh_busy["$name"]="$busy"
+done < <(gh_list_runners)
+
+# Table header followed by a 70-character rule.
+printf "%-25s %-8s %-20s %-8s %s\n" "NAME" "GITHUB" "NODE" "SLURM" "RSS"
+printf '%.0s-' {1..70}
+printf '\n'
+
+while IFS= read -r dir; do
+    name=$(get_runner_name "$dir")
+    if [ -z "$name" ]; then
+        continue
+    fi
+
+    # A busy runner is shown as BUSY; otherwise show the API status, or
+    # "unknown" when the API listing has no entry for this name.
+    if [ "${gh_busy[$name]:-false}" = "true" ]; then
+        gh_col="BUSY"
+    else
+        gh_col="${gh_status[$name]:-unknown}"
+    fi
+
+    actual_node="${runner_node[$dir]:-}"
+    rss="${runner_rss[$dir]:-—}"
+    slurm="${runner_slurm[$dir]:-—}"
+
+    if [ -n "$actual_node" ]; then
+        # Flag a runner.node file that disagrees with where the sweep found
+        # the runner.
+        node_col="$actual_node"
+        if [ -f "$dir/runner.node" ]; then
+            recorded=$(cat "$dir/runner.node")
+            if [ "$actual_node" != "$recorded" ]; then
+                node_col="${actual_node} *(stale: ${recorded})"
+            fi
+        fi
+        printf "%-25s %-8s %-20s %-8s %sMB\n" "$name" "$gh_col" "$node_col" "$slurm" "$rss"
+    else
+        # The sweep did not see this runner on any node.
+        printf "%-25s %-8s %-20s %-8s %s\n" "$name" "$gh_col" "offline" "—" "—"
+    fi
+done < <(find_runner_dirs)
+
+# Per-node memory summary (only when site has a cgroup limit)
+if [ "${CGROUP_LIMIT:-0}" -gt 0 ]; then
+    echo ""
+    echo "=== Per-node memory ==="
+    for node in "${NODES[@]}"; do
+        # Number of Runner.Listener processes on the node; 0 if ssh fails.
+        count=$(ssh $SSH_OPTS "$node" \
+            "ps aux | grep Runner.Listener | grep -v grep | wc -l" 2>/dev/null || echo 0)
+        # Total RSS in MB of every process owned by the remote user.
+        rss=$(ssh $SSH_OPTS "$node" \
+            "ps -u \$(whoami) -o rss= 2>/dev/null | awk '{sum+=\$1} END {printf \"%.0f\", sum/1024}'" \
+            2>/dev/null || echo "?")
+        # Non-numeric results (such as the "?" fallback) count as 0.
+        if ! [[ "$rss" =~ ^[0-9]+$ ]]; then
+            rss=0
+        fi
+        free_mb=$(( CGROUP_LIMIT - rss ))
+        echo "  $node: $count runners, ${rss} MB / ${CGROUP_LIMIT} MB (${free_mb} MB free)"
+    done
+fi
@@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+# Relocate a named runner onto another login node.
+#
+# Sourced by site wrappers (frontier/move-runner.sh, phoenix/move-runner.sh)
+# after config.sh is loaded. Looks the runner up by name, stops it where it
+# currently runs (unless it is offline), then starts it on the target node.
+# The start is attempted at most twice, with a 5 second pause in between.
+# On success the target node is written to <runner-dir>/runner.node.
+#
+# Usage: bash move-runner.sh <runner-name> <target-node>
+set -euo pipefail
+
+RUNNER_NAME="${1:?Usage: $0 <runner-name> <target-node>}"
+TARGET_NODE="${2:?Usage: $0 <runner-name> <target-node>}"
+
+# The target must be one of the configured login nodes.
+valid=0
+for node in "${NODES[@]}"; do
+    if [ "$node" = "$TARGET_NODE" ]; then
+        valid=1
+        break
+    fi
+done
+if [ "$valid" -ne 1 ]; then
+    echo "ERROR: '$TARGET_NODE' is not a valid login node." >&2
+    echo "       Valid nodes: ${NODES[*]}" >&2
+    exit 1
+fi
+
+# Pick the first known runner directory whose runner name matches.
+runner_dir=""
+while IFS= read -r dir; do
+    [ "$(get_runner_name "$dir")" = "$RUNNER_NAME" ] || continue
+    runner_dir="$dir"
+    break
+done < <(find_runner_dirs)
+
+if [ -z "$runner_dir" ]; then
+    echo "ERROR: Runner '$RUNNER_NAME' not found in known runner directories." >&2
+    exit 1
+fi
+
+if declare -f sync_runner_nodes > /dev/null 2>&1; then
+    echo "==> Syncing runner node locations..."
+    sync_runner_nodes
+fi
+
+echo "==> Locating $RUNNER_NAME..."
+current_node=$(find_node "$runner_dir")
+
+if [ "$current_node" = "$TARGET_NODE" ]; then
+    echo "==> $RUNNER_NAME is already running on $TARGET_NODE. Nothing to do."
+    exit 0
+fi
+
+# find_node reporting "offline" means there is nothing to stop.
+if [ "$current_node" != "offline" ]; then
+    echo "==> Stopping $RUNNER_NAME on $current_node..."
+    stop_runner "$current_node" "$runner_dir"
+fi
+
+echo "==> Starting $RUNNER_NAME on $TARGET_NODE..."
+started=0
+for attempt in 1 2; do
+    if start_runner "$TARGET_NODE" "$runner_dir"; then
+        started=1
+        break
+    fi
+    # Only the first failure is followed by a pause and a second attempt.
+    if [ "$attempt" -eq 1 ]; then
+        echo "    First start attempt failed. Retrying in 5 seconds..."
+        sleep 5
+    fi
+done
+
+if [ "$started" -eq 1 ]; then
+    echo "$TARGET_NODE" > "$runner_dir/runner.node"
+    echo "==> $RUNNER_NAME is now running on $TARGET_NODE."
+else
+    echo "ERROR: $RUNNER_NAME failed to start on $TARGET_NODE after retry." >&2
+    exit 1
+fi