#!/bin/bash
#
# air-cast-liveness
#
# Passive health probe: if cast-server's HTTP surface stops responding
# for two consecutive checks (120 s window), bounce cast.service.
#
# Why this exists:
#   Historically, cast-server has hit "service is active in systemd's
#   view, process alive, port :80 listening, but handlers never return"
#   states — caused variously by (a) Mutex guards held across blocking
#   GStreamer calls, (b) TasksMax=100 cgroup exhaustion on spawn
#   bursts, and (c) Hailo vdma_buffer_map kernel-trace wedging actix
#   workers. The individual fixes landed in separate waves; this
#   timer-driven watchdog is the belt-and-suspenders defence for any
#   new variant of the same failure mode.
#
# Probe target:
#   /api/health is unauthenticated, returns 200 + a tiny JSON body,
#   runs zero DB / GStreamer / sudo calls. `curl -f` would otherwise
#   treat 401 from auth-gated endpoints as a probe miss (e.g. the
#   initial attempt used /api/sim-mode, which looks unauth'd in
#   main.rs but still returns 401 through the middleware stack —
#   the watchdog ended up restarting the service on every tick).
#
# Cadence:
#   Fires from air-cast-liveness.timer every 60 s. Miss counter lives
#   in /run/air/cast-liveness.state (tmpfs — zeroed on every boot, and
#   reset whenever a probe succeeds). Restart threshold is 2
#   consecutive misses, so a single transient curl failure from an
#   unrelated hiccup does not bounce the service. Curl's per-probe
#   timeout is 8 s — headroom for a Pi 5 that's momentarily saturated
#   by the in-process GStreamer pipeline under load (software x264enc
#   at 720p30 + the camera preview websocket fan-out can stall the
#   actix workers briefly). A genuine lockup still restarts in
#   ~2 minutes.
#
# Manual test:
#   sudo /usr/local/sbin/air-cast-liveness        # one tick
#   journalctl -t air-cast-liveness --since -1h   # history
#
# Treat any reference to an unset variable as an error.
set -o nounset

# --- Tunables ----------------------------------------------------------
# THRESHOLD  consecutive failed probes needed before cast.service is
#            bounced.
# TIMEOUT    per-probe curl budget, in seconds.
#
# History: on 2026-04-23 TIMEOUT went 15 → 8 s and THRESHOLD went 3 → 2,
# after the field Pi logged 22 hang-triggered restarts in 4 days. With
# the old values the worst-case dead time was estimated at
# 15 × 3 + timer-interval ≈ 4 min, and an operator refreshing a tab saw
# "cannot reach device" for that entire window. The current values give
# a 16 s detection budget plus one timer tick for the second miss
# ≈ 2 min end-to-end: still conservative relative to the 60 s liveness
# timer, but recovery on the operator's screen is 2× faster.
THRESHOLD=2
TIMEOUT=8

# Health endpoint hit on every tick.
PROBE_URL="http://localhost/api/health"

# File holding the consecutive-miss counter.
STATE=/run/air/cast-liveness.state

# Make sure the directory that holds the state file exists, and pin its
# mode to 0755 even when it was already there.
STATE_DIR=${STATE%/*}
mkdir -p "$STATE_DIR"
chmod 0755 "$STATE_DIR"

# Probe the health endpoint: quiet except for errors, HTTP error
# statuses count as failure, the body is discarded, and the whole
# request is capped at $TIMEOUT seconds.
probe_args=(
    --silent
    --show-error
    --fail
    --output /dev/null
    --max-time "$TIMEOUT"
)
if curl "${probe_args[@]}" "$PROBE_URL"; then
    # Service answered — clear the miss counter and finish successfully.
    printf '0\n' > "$STATE"
    exit 0
fi

# Increment the miss counter. A missing, unreadable, or corrupt state
# file counts as 0 so we don't restart on the very first tick after
# boot (which would also race cast.service's own startup).

# read_miss_count FILE
#   Print the miss count stored in FILE as a canonical decimal integer.
#   Prints 0 when FILE is not readable, cannot be read, is empty,
#   contains any non-digit character, or holds ten or more digits.
#   Leading zeros are stripped ("08" -> 8).
read_miss_count() {
    local file=$1 raw=0
    if [ -r "$file" ]; then
        raw=$(cat -- "$file" 2>/dev/null) || raw=0
    fi
    case "$raw" in
        # Empty, or anything that is not purely decimal digits: corrupt.
        ''|*[!0-9]*) raw=0 ;;
        # Ten or more digits cannot be a real miss count and could wrap
        # in shell arithmetic: treat as corrupt too.
        ??????????*) raw=0 ;;
    esac
    # Force base 10. Without the 10# prefix bash reads a leading 0 as
    # octal, so "08" would be an arithmetic error and "010" would mean 8.
    printf '%s\n' "$((10#$raw))"
}

MISSES=$(read_miss_count "$STATE")
MISSES=$((MISSES + 1))
echo "$MISSES" > "$STATE"

# Record every miss under the air-cast-liveness tag so the journal
# shows the run-up to any restart.
logger -t air-cast-liveness "probe failed (consecutive=$MISSES url=$PROBE_URL)"

# Once the consecutive-miss count reaches the threshold, bounce the
# service and start counting from zero again.
if [ "$MISSES" -ge "$THRESHOLD" ]; then
    logger -t air-cast-liveness "threshold reached (${MISSES}/${THRESHOLD}) — restarting cast.service"
    # Capture systemctl's exit status so a restart that did not go
    # through is visible under this script's journal tag instead of
    # looking like a successful bounce.
    restart_rc=0
    systemctl restart cast.service || restart_rc=$?
    if [ "$restart_rc" -ne 0 ]; then
        logger -t air-cast-liveness "restart of cast.service failed (exit=$restart_rc)"
    fi
    # The counter is reset whether or not the restart succeeded, so a
    # failed restart is retried only after THRESHOLD further misses.
    echo "0" > "$STATE"
fi
