#!/bin/bash
#
# air-update-watchdog
#
# Boot-time oneshot that decides whether the running cast-server build
# is healthy, and rolls back to the previous version if it isn't. Runs
# once per boot, ordered After= cast.service, with a 90-second wait on
# /api/health before declaring failure.
#
# # What counts as a failed update
#
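# 1. /etc/air/version exists and is non-empty, AND at least one `.prev`
#    rollback artifact is still staged (our marker for "this version
#    was just applied and has not been committed yet"), AND
# 2. cast-server has not responded 200 on http://127.0.0.1/api/health
#    within HEALTH_TIMEOUT seconds.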
#
# An older-style failure where cast-server crashes repeatedly is
# already caught by systemd's Restart=on-failure + StartLimitBurst
# (see cast.service). Once that limit trips, the service stays
# failed — and this watchdog, running After= cast.service with
# RemainAfterExit=false, observes the lack of /api/health and rolls
# back.
#
# # What we roll back
#
# services::ota writes `.prev` siblings next to every swapped artifact:
#
#   /usr/local/bin/cast-server.prev
#   /var/lib/cast/web.prev/
#
# plus `/etc/air/version` carries the newly-applied version. Rollback:
#
#   1. mv cast-server.prev → cast-server (restoring old binary)
#   2. rm -rf web; mv web.prev → web (restoring old tree)
#   3. rm /etc/air/version (signal "no known-good version installed")
#   4. systemctl restart cast.service
#
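# If only the web `.prev` sibling is missing, the binary is still rolled
# back and the web tree is left as-is. If the binary `.prev` sibling is
# missing, we log loudly and leave the state as-is — the operator needs
# to re-flash manually. If neither sibling is staged, there is no
# pending update to judge and the script exits quietly.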
#
# # Why a shell script and not Rust
#
# By definition this runs when cast-server itself is unhealthy. A
# rollback tool baked into the very binary we're rolling back is a
# foot-gun: a corrupt binary can't roll itself back. Shell + systemd
# is the smallest possible independent agent.

set -euo pipefail

HEALTH_URL="http://127.0.0.1/api/health"
HEALTH_TIMEOUT="${HEALTH_TIMEOUT:-90}"
VERSION_FILE="/etc/air/version"
BIN=/usr/local/bin/cast-server
BIN_PREV=/usr/local/bin/cast-server.prev
WEB=/var/lib/cast/web
WEB_PREV=/var/lib/cast/web.prev

# Write one diagnostic line to stderr, tagged with the script name.
# All arguments are joined into a single message.
log() {
    printf '%s\n' "air-update-watchdog: $*" 1>&2
}

# A missing or empty version marker means the device is either freshly
# flashed or has already been rolled back; in both cases there is no
# update to judge, so stop here successfully.
[ -s "$VERSION_FILE" ] || {
    log "no $VERSION_FILE — nothing to watchdog"
    exit 0
}

# With neither `.prev` artifact staged this is an ordinary boot of an
# image/package install that merely keeps /etc/air/version around for
# OTA version comparison. Skip the probe entirely (and the alarming
# "cannot roll back" message with it); health recovery outside an OTA
# apply window belongs to the liveness timer.
if ! { [ -x "$BIN_PREV" ] || [ -d "$WEB_PREV" ]; }; then
    log "no rollback artifacts staged — nothing to watchdog"
    exit 0
fi

# Some rollback artifact is staged. Record whether the previous binary
# is among them: without it we can still run the health check and
# report a broken partial state, but the executable cannot be restored.
if [ -x "$BIN_PREV" ]; then
    HAVE_ROLLBACK=1
else
    log "no $BIN_PREV — cannot roll back even if health check fails"
    HAVE_ROLLBACK=0
fi

# Version string under test, with newlines stripped so it logs on one line.
version_now=$(tr -d '\n' <"$VERSION_FILE")
log "current version: $version_now"

# Print a monotonically increasing seconds counter.
#
# Prefers the first field of /proc/uptime (seconds since boot), which is
# unaffected by wall-clock adjustments. Falls back to `date +%s` when
# /proc/uptime cannot be read.
monotonic_seconds() {
    local up _idle
    if [ -r /proc/uptime ] && read -r up _idle < /proc/uptime; then
        # "12345.67" -> "12345"
        printf '%s\n' "${up%%.*}"
    else
        date +%s
    fi
}

# Poll /api/health until it answers successfully or the budget runs out.
# A single curl call with a long --max-time would be simpler, but it
# would not notice a server that comes up late; probing in a loop lets a
# healthy server end the wait within about one poll cycle (a 3s sleep
# plus up to 3s for a timed-out probe) of coming up.
#
# The deadline is measured against uptime rather than wall-clock time.
# This script runs early in boot, and a wall clock that gets stepped
# while we wait (e.g. by time synchronisation) would otherwise expire
# the deadline at once — rolling back a healthy update — or stretch the
# wait far beyond HEALTH_TIMEOUT.
deadline=$(( $(monotonic_seconds) + HEALTH_TIMEOUT ))
healthy=0
while [ "$(monotonic_seconds)" -lt "$deadline" ]; do
    # -f: exit non-zero on HTTP error statuses (>= 400); --max-time
    # bounds a hung connection so one probe cannot eat the whole budget.
    if curl -fsS --max-time 3 "$HEALTH_URL" >/dev/null 2>&1; then
        healthy=1
        break
    fi
    sleep 3
done

case "$healthy" in
    1)
        log "$HEALTH_URL OK — update $version_now committed"
        # Committing means discarding the `.prev` siblings. Policy: keep
        # exactly one known-good baseline. Once the current build has
        # proven itself it *is* that baseline, so a later failed update
        # rolls back to it instead of walking a chain of ever-older
        # versions.
        [ ! -e "$BIN_PREV" ] || rm -f "$BIN_PREV"
        [ ! -d "$WEB_PREV" ] || rm -rf "$WEB_PREV"
        exit 0
        ;;
esac

log "$HEALTH_URL did not return 200 within ${HEALTH_TIMEOUT}s — considering rollback"

# Without a previous binary there is nothing to restore: report the
# situation and fail, leaving the system untouched for the operator.
[ "$HAVE_ROLLBACK" != "0" ] || {
    log "no rollback target available — leaving system in its current broken state"
    log "operator must re-flash the SD card or manually restore /usr/local/bin/cast-server"
    exit 1
}

# Rollback sequence.
#
# Ordering is deliberate, and differs from the order listed in the file
# header: the web tree is restored FIRST and the binary LAST. The
# presence of $BIN_PREV is what the earlier checks in this script use to
# decide that a rollback is still possible, so it must be the last
# artifact consumed. If we are interrupted after the web tree is
# restored, the next boot still finds $BIN_PREV and a non-empty
# $VERSION_FILE, fails the health check again, and finishes the job.
# (Restoring the binary first would leave an interrupted run with no
# $BIN_PREV: the next boot could then see the old binary answer the
# health check, "commit", and delete the still-pending $WEB_PREV.)
#
# A small window remains between restoring the binary and clearing
# $VERSION_FILE; an interruption there leaves the old binary in place
# with a stale version marker.
if [ -d "$WEB_PREV" ]; then
    log "rolling back web tree $WEB"
    # ${WEB:?} aborts instead of running `rm -rf ""` if WEB is ever empty.
    rm -rf "${WEB:?}"
    mv -f "$WEB_PREV" "$WEB"
else
    log "no $WEB_PREV — leaving web tree as-is (possibly not updated this cycle)"
fi

log "rolling back binary $BIN -> $BIN_PREV"
mv -f "$BIN_PREV" "$BIN"

# Wipe the version marker so a subsequent OTA poll will recognise the
# device as "unknown version" and treat the next applicable manifest
# as a fresh install rather than skipping because it equals the
# (now-wrong) version string.
log "clearing $VERSION_FILE"
rm -f "$VERSION_FILE"

log "rollback complete — restarting cast.service"
# If the broken build crash-looped until systemd's start limit tripped,
# the unit is in a failed state and a start request may be refused as
# "repeated too quickly". Clear that state first; this is best-effort
# and harmless when the unit is not failed.
systemctl reset-failed cast.service >/dev/null 2>&1 || true
systemctl restart cast.service || {
    log "systemctl restart failed — next natural restart picks it up"
}

# Exit non-zero so journalctl flags the watchdog invocation as a
# failure event; operators grepping for air-update-watchdog.service
# failures see exactly the boots where a rollback was needed.
exit 1
