#!/bin/bash
#
# air-zerotier-watchdog: probe zerotier-one and force-restart it
# if the daemon is wedged or stopped.
#
# Triggered by a 60-second systemd timer (air-zerotier-watchdog.timer)
# at boot AND every minute thereafter. Independent of cast-server
# lifecycle so it survives a wedged or never-started cast.service.
#
# # Why this exists
#
# Operator complaint 2026-04-26: "ZeroTier should never stop, under
# any conditions". Defense-in-depth around the systemd-level
# `Restart=always` policy:
#
#   1. Pi powers up → systemd brings up zerotier-one (enabled in
#      image, auto-start). 95% of the time, this is enough.
#   2. zerotier-one crashes for some reason → Restart=always fires
#      within 5 s. Another 4% of the time covered.
#   3. Daemon is alive but wedged (process exists, no PID exit, but
#      the control socket on /var/lib/zerotier-one/zerotier-one.port
#      is unresponsive). systemd does not notice; cast-server's
#      `is-active --quiet` returns 0 (lying). This watchdog catches
#      the remaining 1% — pings the control plane, restarts on
#      timeout. Forced restart kills the wedged process and
#      Restart=always brings up a fresh one.
#
# # Probe
#
# `zerotier-cli info` over the local control socket. Returns
# "200 info <addr> <ver> ONLINE" on healthy. Anything else (incl.
# timeout, ECONNREFUSED, daemon not running, identity not yet
# generated on first boot) is treated as failure.
#
# # Failure action
#
# `systemctl restart zerotier-one`. The Restart=always policy +
# StartLimitBurst=30/300s drop-in on the unit means we have plenty
# of headroom; the watchdog itself only triggers a restart at most
# once per 60 s tick.
#
# # First-boot grace
#
# Skip the first 90 s after boot — zerotier-one is still generating
# its identity and joining the network, the control socket isn't
# ready yet, and a watchdog restart at 60 s would just slow things
# down. After 90 s, healthy state is well-established and any
# unhealthy state is a real fault.

set -u

# Seconds after boot during which the probe is skipped entirely.
readonly GRACE_SECS=90

# Print the whole number of seconds since boot.
#   $1 - file to read (optional, defaults to /proc/uptime); only the
#        first whitespace-separated field is used, truncated at the
#        decimal point.
# Prints 999 when the file is missing, unreadable, empty, or its first
# field is not a plain non-negative integer. 999 is past the grace
# window, so an unknown uptime never suppresses the probe.
read_uptime_secs() {
    local src=${1:-/proc/uptime} first=""
    # `read` returns non-zero at EOF even when it filled the variable,
    # and the redirect fails outright when the file is missing; both
    # outcomes are settled by the numeric check below, so the status
    # (and any redirect error text) is deliberately discarded here.
    { read -r first _ < "$src"; } 2>/dev/null || true
    first=${first%%.*}
    if [[ "$first" =~ ^[0-9]+$ ]]; then
        printf '%s\n' "$first"
    else
        printf '999\n'
    fi
}

UPTIME_SECS=$(read_uptime_secs)
# UPTIME_SECS is guaranteed numeric here, so `-lt` cannot raise
# "integer expression expected".
if [ "$UPTIME_SECS" -lt "$GRACE_SECS" ]; then
    echo "watchdog: skipping (uptime ${UPTIME_SECS}s, grace ${GRACE_SECS}s)"
    exit 0
fi

# The probe depends on the zerotier-cli client. When it is not on
# PATH (an extremely degraded host) there is nothing to probe with,
# and a host without the client is assumed to be without the daemon
# as well — restarting on every tick could never recover it. So
# soft-fail: log the condition and exit 0 instead of loop-restarting.
command -v zerotier-cli >/dev/null 2>&1 || {
    echo "watchdog: zerotier-cli missing, nothing to probe"
    exit 0
}

# Decide whether a `zerotier-cli info` result counts as healthy.
#   $1 - exit status of the probe command
#   $2 - captured probe output (stdout and stderr combined)
# Returns 0 (healthy) only when the exit status is 0 AND the output
# contains ONLINE or TUNNELED; returns 1 otherwise.
#
# The status word is checked in addition to the exit status because
# zerotier-cli returns 0 even when it reports OFFLINE (it is still
# talking to the daemon, but the daemon has not reached the planet).
# OFFLINE persisting to the next tick typically means the planet
# handshake is broken and a restart helps.
#
# NOTE(review): TUNNELED is understood to be what zerotier-cli prints
# in place of ONLINE while TCP fallback relaying is active (daemon is
# up and connected, just not over UDP). It is accepted so that a node
# on a UDP-blocked network is not restarted on every tick — confirm
# against the zerotier-one version shipped in the image.
zt_probe_healthy() {
    local rc=$1 output=$2
    [[ "$rc" == 0 ]] || return 1
    [[ "$output" == *ONLINE* || "$output" == *TUNNELED* ]]
}

# Use timeout(1) so the probe cannot hang the watchdog against a
# wedged daemon socket. 5 s is generous — a healthy local-socket
# query returns in <50 ms. `-k 2` escalates to SIGKILL two seconds
# after the initial SIGTERM, so a probe that does not die on TERM
# still cannot stall this unit (and with it, every later timer tick).
# Any timeout yields a non-zero status and is treated as a failure.
INFO_OUTPUT=$(timeout -k 2 5 zerotier-cli info 2>&1)
INFO_RC=$?

if zt_probe_healthy "$INFO_RC" "$INFO_OUTPUT"; then
    # Daemon answered and reports a connected state; nothing to do.
    exit 0
fi

# Reaching this point means the probe did not report a healthy
# daemon: it is stopped, wedged, or offline. Record what was seen
# (readable via journalctl -u air-zerotier-watchdog.service) before
# forcing a restart. The unit's StartLimitBurst setting is what keeps
# repeated restarts from flapping into a permanent-failed state.
printf '%s\n' "watchdog: zerotier-cli info failed (rc=$INFO_RC); restarting zerotier-one"
# Only the first 200 characters of the probe output are logged.
probe_excerpt=${INFO_OUTPUT:0:200}
printf '%s\n' "watchdog: probe output: ${probe_excerpt}"

# `systemctl restart` performs stop then start. The stop phase is
# bounded by the unit's TimeoutStopSec (stated as 10 s for
# zerotier-one); the start follows immediately. A non-zero result is
# logged and surfaced as exit status 1.
if ! systemctl restart zerotier-one 2>&1; then
    printf '%s\n' "watchdog: systemctl restart returned non-zero (probably an interlock)"
    exit 1
fi

printf '%s\n' "watchdog: restart dispatched"
exit 0
