#!/bin/bash
#
# air-cast-selfheal
#
# Boot-time ExecStartPre for cast.service. Validates
# /usr/local/bin/cast-server and, if it's broken, atomic-swaps
# in the newest intact backup so systemd's ExecStart=
# /usr/local/bin/cast-server actually loads.
#
# # Why this exists
#
# A brownout / SD I/O fault during a write to /usr/local/bin can
# leave cast-server truncated to 0 bytes (observed in the wild on
# a Pi 5 + 45 W laptop-PD charger that was sagging under load).
# The kernel then rejects ExecStart with `status=203/EXEC — Exec
# format error` and systemd bangs out 15 restart attempts in
# ~25 s before giving up with "Start request repeated too
# quickly". The device is then cast-server-down with no auto-
# recovery until an operator SSHes in with a known-good binary.
#
# This helper checks the binary before every ExecStart, and
# transparently heals from a backup when one is available. The
# operator then only has to repower; no SSH required.
#
# # What's "broken"
#
#   * file missing                                (ENOENT)
#   * size <= 1024 bytes                          (truncated)
#   * first four bytes != 7f 45 4c 46 ELF magic   (non-ELF garbage)
#
# These three states cover the brownout-truncation case AND the
# "someone ran `echo > cast-server`" human-error case. They are not
# bulletproof against subtler corruption (e.g. a block flipped
# mid-binary): this helper does not detect that. The ELF format
# itself carries no checksum, so for those cases we rely on the ELF
# loader rejecting a malformed image and on cast-server's build.rs.
#
# # Restore chain
#
# Try each of these in order, keep the first intact match:
#
#   1. /usr/local/bin/cast-server.bak-last-good
#                                            — refreshed by this helper
#                                              on every start where
#                                              cast-server passes the
#                                              checks above.
#   2. /usr/local/bin/cast-server.prev       — written by air-ota-apply
#                                              on every successful OTA,
#                                              so always the last-known-
#                                              good version.
#   3. /usr/local/bin/cast-server.bak-v*     — operator-created backups
#                                              (manual swap-in workflow).
#                                              Sorted newest version first
#                                              by a naive numeric sort on
#                                              the version suffix.
#
# If NOTHING intact is found, log and exit 0. cast.service's own
# ExecStart will then hit `status=203/EXEC` as before — the helper
# never makes things worse.
#
# # Exit codes
#
# Always 0. The point is to transparently heal; if we can't heal,
# fall through to the normal-failure path and let the operator see
# cast.service fail as it would without us.

# Treat any use of an unset variable as an error. `errexit` is left
# off on purpose: each failure below is handled explicitly, and the
# helper wants to exit 0 no matter which intermediate step failed.
set -o nounset

# The binary this helper validates and, if needed, replaces.
TARGET="/usr/local/bin/cast-server"
# Tag passed to logger(1) and used as the stderr prefix.
LOG_TAG="air-cast-selfheal"

# log MESSAGE...
#
# Emit MESSAGE (all arguments joined into one string) twice: once to
# syslog through logger(1) tagged with $LOG_TAG, and once to stderr
# prefixed with "[$LOG_TAG] ".
log() {
    local msg="$*"
    logger -t "$LOG_TAG" "$msg"
    printf '%s\n' "[${LOG_TAG}] ${msg}" >&2
}

# is_intact FILE
#
# Cheap sanity check that FILE looks like a usable cast-server ELF.
# Returns 0 when ALL of the following hold, 1 otherwise:
#   * FILE exists and is a regular file (symlinks are followed);
#   * its size is strictly greater than 1024 bytes;
#   * its first four bytes are the ELF magic 7f 45 4c 46.
# This detects truncation and non-ELF garbage only; nothing past the
# first four bytes is read, so it is not a full integrity check.
is_intact() {
    local f="$1"
    [ -f "$f" ] || return 1
    # -L: size the file a symlink points at, not the link itself. Both
    # `[ -f ]` above and `head` below follow symlinks; without -L a
    # symlinked binary reports the length of the link path and would be
    # misjudged as truncated.
    local sz
    sz=$(stat -L -c%s "$f" 2>/dev/null)
    [ -n "$sz" ] || return 1
    # Arbitrary floor. A real cast-server binary is ~10 MB, so anything
    # of 1 KiB or less is treated as truncated. A floor above zero also
    # rejects tiny non-empty leftovers, not just 0-byte files.
    [ "$sz" -gt 1024 ] || return 1
    # ELF magic check. A file whose size looks right but whose content
    # was never written (reads back as zeros or garbage) fails here.
    local magic
    magic=$(head -c 4 "$f" 2>/dev/null | od -An -tx1 | tr -d ' \n')
    [ "$magic" = "7f454c46" ]
}

# Backup slot maintained by refresh_last_good below.
TARGET_LAST_GOOD=/usr/local/bin/cast-server.bak-last-good

# refresh_last_good
#
# Copy $TARGET into the $TARGET_LAST_GOOD backup slot unless the slot
# already appears to hold the same file. Called on every ExecStartPre
# run in which $TARGET passed is_intact, so a later boot's self-heal
# has a recent backup to fall back on. Added in wave 55 after a user
# Pi found itself with no usable backup (a brownout truncated both
# cast-server AND cast-server.bak-v44 in the same I/O event; only an
# older .bak-v41 survived).
#
# Freshness check: sizes are compared first (cheap stat); only if they
# match are the first and last 4 KiB compared. This keeps healthy
# boots free of unnecessary SD writes and avoids hashing ~10 MB on
# every start. It is a heuristic, not validation: a difference confined
# to the middle of the file goes unnoticed.
#
# Globals:  TARGET (read), TARGET_LAST_GOOD (read)
# Returns:  0 in every case; failures are reported through log.
refresh_last_good() {
    local src="$TARGET"
    local dst="$TARGET_LAST_GOOD"
    local src_sz dst_sz
    # -L so a symlinked source/slot is sized by what it points at,
    # matching what head/tail/cmp below actually read.
    src_sz=$(stat -L -c%s "$src" 2>/dev/null)
    dst_sz=$(stat -L -c%s "$dst" 2>/dev/null || echo -1)
    if [ "$src_sz" = "$dst_sz" ]; then
        # Same size — compare 4 KiB at each end instead of the whole
        # file.
        if cmp -s <(head -c 4096 "$src" 2>/dev/null) \
                  <(head -c 4096 "$dst" 2>/dev/null) && \
           cmp -s <(tail -c 4096 "$src" 2>/dev/null) \
                  <(tail -c 4096 "$dst" 2>/dev/null); then
            return 0
        fi
    fi

    # Remove temp copies orphaned by an earlier run that was cut off
    # mid-copy; their names embed that run's PID, so nothing else would
    # ever clean them up.
    rm -f "${dst}".refresh-* 2>/dev/null

    # Different — refresh the backup via temp file + rename so the slot
    # never holds a half-written copy.
    local tmp="${dst}.refresh-$$"
    # -L: always store a real file, never a symlink back to the source.
    if ! cp -aL "$src" "$tmp" 2>/dev/null; then
        # cp may have written part of the file before failing.
        rm -f "$tmp"
        log "backup refresh: cp from $src failed"
        return 0
    fi
    chmod 0755 "$tmp"
    # Flush the copy's data BEFORE the rename. Otherwise a power cut
    # shortly after the rename can leave the new name pointing at data
    # that never reached the card. If the flush fails, keep the old
    # backup rather than replace it with a copy of unknown durability.
    if ! sync "$tmp"; then
        rm -f "$tmp"
        log "backup refresh: sync of $tmp failed"
        return 0
    fi
    if mv -f "$tmp" "$dst"; then
        # Persist the rename itself.
        sync
        log "refreshed $dst from intact $src"
    else
        rm -f "$tmp"
        log "backup refresh: mv $tmp → $dst failed"
    fi
}

# Fast path: the installed binary passes the checks. Keep the
# last-good slot current and leave without logging anything here.
is_intact "$TARGET" && {
    refresh_last_good
    exit 0
}

# From here on TARGET is known to be broken. Record its size (or
# ENOENT when stat cannot read it) before attempting a restore.
broken_size=$(stat -c%s "$TARGET" 2>/dev/null || echo ENOENT)
log "TARGET ($TARGET) is broken — size=${broken_size}. Attempting restore."

# list_versioned_backups PREFIX
#
# Print, one path per line, every regular file named PREFIX<digits>,
# ordered by <digits> as a decimal number, highest first. Files whose
# suffix is empty or contains a non-digit (`.bak-vold` etc.) are
# skipped. Prints nothing when no file matches.
list_versioned_backups() {
    local prefix="$1"
    local bak v
    for bak in "$prefix"*; do
        [ -f "$bak" ] || continue
        # Version tail, e.g. `.bak-v53` -> `53`.
        v=${bak#"$prefix"}
        case "$v" in
            ''|*[!0-9]*) continue ;;
        esac
        # Emit the digits verbatim (%s, not %d): printf's %d reads a
        # leading 0 as octal, so `010` would become 8 and `08` would be
        # an error. `sort -n` compares the digit string as decimal.
        printf '%s\t%s\n' "$v" "$bak"
    done | sort -t $'\t' -k1,1 -rn | cut -f2-
}

# Build the candidate list in preference order. `last-good` first —
# it is refreshed by this helper whenever TARGET passes is_intact.
# `.prev` next — OTA-written, last successful version. Operator
# .bak-v* last, newest version first (so a .bak-v100 outranks a
# .bak-v53).
candidates=()
if [ -f "$TARGET_LAST_GOOD" ]; then
    candidates+=("$TARGET_LAST_GOOD")
fi
if [ -f /usr/local/bin/cast-server.prev ]; then
    candidates+=(/usr/local/bin/cast-server.prev)
fi
while IFS= read -r f; do
    candidates+=("$f")
done < <(list_versioned_backups /usr/local/bin/cast-server.bak-v)

# Nothing to restore from: report it and let cast.service fail the
# way it would have without this helper.
if [ "${#candidates[@]}" -eq 0 ]; then
    log "no backup candidates found; leaving $TARGET as-is. cast.service will fail; restore a binary manually (scp + systemctl restart cast)."
    exit 0
fi

# Remove temp copies orphaned by an earlier heal attempt that was cut
# off mid-copy. Their names embed that run's PID, so nothing else would
# ever clean them up.
rm -f "${TARGET}".heal-* 2>/dev/null

# Walk the candidates in preference order and install the first one
# that passes is_intact. Any failure moves on to the next candidate.
for c in "${candidates[@]}"; do
    if ! is_intact "$c"; then
        log "candidate $c is itself broken; skipping"
        continue
    fi

    # Copy next to TARGET, then rename over it, so TARGET is never
    # seen half-written.
    tmp="${TARGET}.heal-$$"
    # -L: install a real file even if the candidate is a symlink.
    if ! cp -aL "$c" "$tmp" 2>/dev/null; then
        # cp may have written part of the file before failing.
        rm -f "$tmp"
        log "cp from $c failed, trying next candidate"
        continue
    fi

    # Match the perms cast.service expects.
    chmod 0755 "$tmp"
    chown root:root "$tmp" 2>/dev/null || true

    # Flush the copy's data BEFORE the rename, so a power cut shortly
    # afterwards cannot leave TARGET naming data that never reached the
    # card. Best-effort: TARGET is already broken, so a failed flush
    # does not stop the swap.
    sync "$tmp" 2>/dev/null || true

    if mv -f "$tmp" "$TARGET"; then
        # Persist the rename itself.
        sync
        log "restored $TARGET from $c"
        exit 0
    fi
    rm -f "$tmp"
    log "mv $tmp → $TARGET failed, trying next candidate"
done

log "all candidates tried; $TARGET still broken. cast.service will fail."
exit 0
