#!/usr/bin/env bash
# Vloud public bootstrap — `curl -fsSL https://install.vloud.app | sudo bash`.
#
# Modes (auto-detected; explicit flags override):
#
#   (default, no flag)     — auto.  If /opt/vloud/packages/server/dist
#                            exists, switches to upgrade mode.  Otherwise
#                            runs fresh install.
#   --upgrade              — explicit upgrade. Preserves /etc/vloud.env,
#                            /var/lib/vloud, SQLite DB, generated nginx
#                            vhosts, SSL certs, pm2 deployments. Swaps
#                            binaries atomically via /opt/vloud-staging-<ts>,
#                            health-checks the new engine, rolls back on
#                            failure.
#   --repair               — re-emit systemd units, daemon-reload, restart
#                            services. No state touched. For when the
#                            install dir is fine but systemd got tangled.
#   --doctor / --health    — read-only health check. Prints services,
#                            ports, version, DB integrity. No changes.
#   --force-clean-install  — explicit destructive reinstall.  Stops
#                            services, removes /opt/vloud, /etc/vloud.env,
#                            /var/lib/vloud, /etc/systemd/system/vloud*,
#                            then runs fresh install. Last resort.
#   --fresh                — explicit fresh install. Refuses to run if
#                            /opt/vloud exists (use --force-clean-install
#                            for the destructive path).
#
# Curl-pipe-bash:
#   curl -fsSL https://install.vloud.app | sudo bash
#   curl -fsSL https://install.vloud.app | sudo bash -s -- --upgrade
#   curl -fsSL https://install.vloud.app | sudo bash -s -- --repair
#   curl -fsSL https://install.vloud.app | sudo bash -s -- --doctor
#   curl -fsSL https://install.vloud.app | sudo bash -s -- --force-clean-install
#
# Fresh-install phases (auto / --fresh / --force-clean-install):
#   1. Pre-flight (OS / arch / disk / RAM / outbound network)
#   2. Apt install (curl, nginx, node 20, sqlite3, dnsutils, certbot, php-fpm)
#   3. Vloud release download + extract to /opt/vloud
#   4. Configure: vloud user, /etc/vloud.env (with trial state), systemd unit
#   5. Start engine + emit dashboard URL + trial status
#   6. Daemon stack (Phase 1) — postfix/dovecot/rspamd/fail2ban/bind/ftp/
#      firewall/tenant-slices/storage-quota via bootstrap-daemon-stack.sh.
#      Default ON; cPanel hosts skip by default; per-daemon opt-out env vars.
#
# Idempotent: every step gates on existing state.  Re-runs are safe.
# Zero-interaction: defaults everywhere, no prompts.
# Logs: /var/log/vloud/install.log
set -euo pipefail

# ─── Defaults (operator can override via env) ───
# `: "${VAR:=value}"` keeps a non-empty value inherited from the
# environment and assigns the default shown otherwise.
: "${VLOUD_INSTALL_DIR:=/opt/vloud}"
: "${VLOUD_RELEASE_URL:=https://install.vloud.app/releases/latest/vloud-latest.tar.gz}"
: "${VLOUD_PORT:=2544}"
: "${VLOUD_BIND_HOST:=127.0.0.1}"
: "${VLOUD_TRIAL_DAYS:=30}"
# 0.6.0c — defined up here (not in the fresh-install env phase) so
# `do_reconcile_env_file()`, reached from --upgrade and --repair via
# the mode dispatcher, can read it without tripping `set -u`.
# Point it at a private license server for offline / behind-firewall
# installs.
: "${VLOUD_LICENSE_SERVER_URL_DEFAULT:=https://license.vloud.app}"

# Fixed paths — not overridable from the environment.
ENV_FILE=/etc/vloud.env
LOG_DIR=/var/log/vloud
LOG_FILE="${LOG_DIR}/install.log"
SYSTEMD_UNIT=/etc/systemd/system/vloud.service
SYSTEMD_SLICE=/etc/systemd/system/vloud.slice
SYSTEMD_WORKER=/etc/systemd/system/vloud-job-worker.service
SYSTEMD_SCHEDULER=/etc/systemd/system/vloud-scheduler.service
# Phase E (2026-05-14): auto-rollback unit fired by vloud.service's
# OnFailure when the engine crash-loops past StartLimitBurst.
SYSTEMD_ROLLBACK=/etc/systemd/system/vloud-rollback.service

# ─── Lifecycle progress + apply-run correlation (PR1) ───
#
# Engine-orchestrated upgrades (POST /api/v1/updates/apply →
# executePlan → spawns bootstrap.sh) pass --apply-run-id=<n> and
# --version=<v>; bootstrap.sh appends JSONL events to
# VLOUD_PROGRESS_FILE so the engine can resolve update_status
# without parsing logs. Operator-run bootstrap.sh (curl-pipe-bash)
# omits these flags and behaves identically to pre-PR1 — every new
# code path is conditional on the flags being set.
: "${VLOUD_PROGRESS_FILE:=/var/lib/vloud/.upgrade-progress}"
: "${VLOUD_PROGRESS_RETAIN_BYTES:=262144}"  # 256 KiB before rotation
: "${VLOUD_PROGRESS_STALE_DAYS:=30}"        # how long rotated logs live
# The next four default to the empty string; flag parsing may fill them.
: "${VLOUD_APPLY_RUN_ID:=}"
: "${VLOUD_TARGET_VERSION:=}"
: "${VLOUD_ROLLBACK_TO:=}"
# Phase E (2026-05-14): set to 1 when systemd's vloud-rollback.service
# fires us via OnFailure. Currently informational only.
: "${VLOUD_ROLLBACK_AUTOMATIC:=}"

# ─── 1. Pre-flight ───
# Root check is mode-aware — --doctor is read-only and runs fine as
# any user that can read systemd state + reach 127.0.0.1:2544. The
# four mutating modes (fresh, upgrade, repair, force-clean) need root.
# Root is required unless one of the read-only flags appears anywhere
# on the command line.
# NOTE(review): this scan does not look at which mode flag ends up
# selected — e.g. `--doctor --upgrade` clears the requirement here even
# though the mode parser further down lets the last mode flag win.
_NEEDS_ROOT=1
for _a in "$@"; do
  case "$_a" in
    --doctor|--health|--help|-h) _NEEDS_ROOT=0 ;;
  esac
done
# EUID is bash's effective user id; 0 means root.
if (( _NEEDS_ROOT == 1 )) && [[ $EUID -ne 0 ]]; then
  echo "ERROR: bootstrap.sh must be run as root for this mode (curl -fsSL ... | sudo bash)" >&2
  echo "       (--doctor / --help do not require root)" >&2
  exit 1
fi
# Scratch variables only — drop them so they do not linger as globals.
unset _NEEDS_ROOT _a

# Distro gate. Sourcing /etc/os-release defines its keys as shell
# variables in this script; ID is matched below and PRETTY_NAME is
# printed by the start-up banner further down.
if [[ -f /etc/os-release ]]; then
  . /etc/os-release
else
  echo "ERROR: /etc/os-release missing — unsupported distro" >&2
  exit 1
fi
# Only the exact IDs "ubuntu" and "debian" are accepted.
# NOTE(review): under `set -u` an os-release file without an ID key
# aborts here with an unbound-variable error instead of the message below.
case "$ID" in
  ubuntu|debian) ;;
  *) echo "ERROR: Vloud bootstrap supports Ubuntu and Debian only (you have $ID)" >&2; exit 1 ;;
esac

# Logging tee — set up before any output so we capture everything.
# From here on stdout AND stderr of the whole script are piped through
# `tee -a`, i.e. shown on the terminal and appended to $LOG_FILE.
# NOTE(review): `mkdir -p` runs under `set -e`, so a non-root --doctor
# run on a host where $LOG_DIR does not exist yet stops here.
mkdir -p "$LOG_DIR"
exec > >(tee -a "$LOG_FILE") 2>&1

# ANSI colour escapes; expanded by printf because they are embedded in
# the format string.
CYAN='\033[1;36m'
GREEN='\033[1;32m'
YELLOW='\033[1;33m'
RED='\033[1;31m'
RESET='\033[0m'

# say MSG — top-level step heading (stdout).
say() {
  printf "${CYAN}▸${RESET} %s\n" "$1"
}

# ok MSG — indented success line (stdout).
ok() {
  printf "  ${GREEN}✓${RESET} %s\n" "$1"
}

# warn MSG — indented warning line (stdout).
warn() {
  printf "  ${YELLOW}⚠${RESET} %s\n" "$1"
}

# fail MSG — indented error line (stderr).
fail() {
  printf "  ${RED}✗${RESET} %s\n" "$1" >&2
}

# die MSG [CODE] — print MSG via fail, record a "die"/"failed" progress
# event, then exit with CODE (default 1).
die() {
  fail "$1"
  emit_progress die failed "$1"
  exit "${2:-1}"
}

# ─── Lifecycle progress JSONL emitter ───
#
# Append one JSON line per lifecycle event to VLOUD_PROGRESS_FILE.
# Schema (stable):
#
#   { "ts":  "<iso8601-z>",
#     "apply_run_id": <int|null>,
#     "phase": "upgrade" | "rollback" | "fresh" | "repair",
#     "stage": "<short-tag>",
#     "status": "started" | "ok" | "warn" | "failed",
#     "version_from": "<string|null>",
#     "version_to":   "<string|null>",
#     "msg": "<human-readable>" }
#
# Terminal markers emit with stage="terminal" so the engine can
# resolve apply_run state without parsing every intermediate line.
#
# - Append-only; never rewrites prior events
# - Rotates to ${VLOUD_PROGRESS_FILE}.<ts> when >RETAIN_BYTES
# - Stale rotated files older than STALE_DAYS are GC'd
# - Safe to call BEFORE the progress file directory exists (will
#   try to create it; failure is non-fatal — bootstrap.sh must
#   never abort because of progress logging)
# - Safe to call even when VLOUD_APPLY_RUN_ID is unset (engine
#   never reads those rows; operator-run upgrades still emit them
#   for the install.log forensics value)
# emit_progress [STAGE] [STATUS] [MSG]
#
# Append one JSON line describing a lifecycle event to
# $VLOUD_PROGRESS_FILE.
#
# Arguments:
#   $1  stage tag            (default "unknown")
#   $2  status               (default "ok")
#   $3  human-readable text  (default empty)
# Globals read:
#   VLOUD_PROGRESS_FILE, VLOUD_PROGRESS_RETAIN_BYTES,
#   VLOUD_PROGRESS_STALE_DAYS, VLOUD_APPLY_RUN_ID, MODE,
#   VLOUD_VERSION_FROM, VLOUD_VERSION_TO
# Returns: always 0 — progress logging must never abort bootstrap.sh.
#
# All work happens in a subshell guarded by `|| true`, so a failure in
# here cannot reach the caller and the caller's shell options
# (errexit in particular) are left exactly as they were.
emit_progress() {
  local stage="${1:-unknown}"
  local status="${2:-ok}"
  local msg="${3:-}"

  (
    set +e

    # Escape a value for use inside a JSON string (no jq dependency):
    # backslash and double quote are escaped, newline / CR / tab become
    # \n \r \t, and any remaining control character is dropped so one
    # event can never span several lines of the JSONL file.
    _json_str() {
      local s="$1" nl=$'\n' cr=$'\r' tab=$'\t'
      s="${s//\\/\\\\}"
      s="${s//\"/\\\"}"
      s="${s//$nl/\\n}"
      s="${s//$cr/\\r}"
      s="${s//$tab/\\t}"
      s="${s//[[:cntrl:]]/}"
      printf '%s' "$s"
    }

    local dir base
    dir=$(dirname "$VLOUD_PROGRESS_FILE")
    base=$(basename "$VLOUD_PROGRESS_FILE")
    [[ -d "$dir" ]] || install -d -m 0755 "$dir" 2>/dev/null

    # Rotate if oversize.
    if [[ -f "$VLOUD_PROGRESS_FILE" ]]; then
      local sz; sz=$(stat -c '%s' "$VLOUD_PROGRESS_FILE" 2>/dev/null || echo 0)
      if (( sz > VLOUD_PROGRESS_RETAIN_BYTES )); then
        local ts; ts=$(date -u +%Y%m%dT%H%M%SZ)
        mv "$VLOUD_PROGRESS_FILE" "${VLOUD_PROGRESS_FILE}.${ts}" 2>/dev/null
      fi
    fi

    # GC stale rotated files (best-effort).
    find "$dir" -maxdepth 1 -type f \
        -name "${base}.*" \
        -mtime "+${VLOUD_PROGRESS_STALE_DAYS}" -delete 2>/dev/null

    local ts_now; ts_now=$(date -u +%Y-%m-%dT%H:%M:%SZ)

    # apply_run_id is emitted as a raw JSON number when it is all
    # digits and as JSON null otherwise.
    local arid="${VLOUD_APPLY_RUN_ID:-}"
    [[ "$arid" =~ ^[0-9]+$ ]] || arid=null

    # Phase is derived from MODE; default 'upgrade' until MODE is set.
    local phase="${MODE:-upgrade}"

    # Versions are JSON strings when set, JSON null otherwise (the
    # literal value "null" is also treated as unset).
    local vfrom="${VLOUD_VERSION_FROM:-null}"
    local vto="${VLOUD_VERSION_TO:-null}"
    [[ "$vfrom" != "null" ]] && vfrom="\"$(_json_str "$vfrom")\""
    [[ "$vto"   != "null" ]] && vto="\"$(_json_str "$vto")\""

    printf '{"ts":"%s","apply_run_id":%s,"phase":"%s","stage":"%s","status":"%s","version_from":%s,"version_to":%s,"msg":"%s"}\n' \
      "$ts_now" "$arid" "$(_json_str "$phase")" "$(_json_str "$stage")" \
      "$(_json_str "$status")" "$vfrom" "$vto" "$(_json_str "$msg")" \
      >> "$VLOUD_PROGRESS_FILE" 2>/dev/null
  ) || true
}

# Mark a terminal status for the current apply_run. Engine watchers
# resolve the run state from the most recent terminal event.
# emit_progress_terminal [STATUS] [MSG]
# Record a terminal event: forwards to emit_progress with the stage
# fixed to "terminal". STATUS defaults to "ok", MSG to empty.
emit_progress_terminal() {
  emit_progress terminal "${1:-ok}" "${2:-}"
}

# Start-up banner. PRETTY_NAME comes from the sourced /etc/os-release.
# NOTE(review): under `set -u` this aborts if os-release has no
# PRETTY_NAME key.
say "Vloud bootstrap starting on $PRETTY_NAME"
ok  "logging to $LOG_FILE"

# ─── Release-verification helpers (hoisted before mode dispatch) ───
#
# The build-release.sh pipeline substitutes the literal placeholder
# `__VLOUD_RELEASE_PUBKEY_PEM__` with the real PEM at release time. If
# the placeholder remains, $VLOUD_RELEASE_PUBKEY_PEM must be set OR
# VLOUD_INSECURE_SKIP_VERIFY=1 + VLOUD_TESTING=1 must both be set.
# These three primitives (constant + b64url_decode + verify_release_chain)
# are used by BOTH the fresh-install flow (phase 3 below) AND the
# upgrade flow (do_upgrade_flow above the dispatch). They must be
# defined here so both can call them.
# PEM public key that verify_release_chain falls back to when
# $VLOUD_RELEASE_PUBKEY_PEM is empty. verify_release_chain treats the
# literal value `__VLOUD_RELEASE_PUBKEY_PEM__` as "no key embedded".
EMBEDDED_RELEASE_PUBKEY_PEM='-----BEGIN PUBLIC KEY-----
MCowBQYDK2VwAyEAzsJeHF09hogJbjNXUsOynOS1ucVeGRdLoHpPPOoqjFU=
-----END PUBLIC KEY-----'

# b64url_decode STRING
# Decode unpadded base64url text and write the raw bytes to stdout.
# The URL-safe alphabet is mapped back to the standard one and the
# '=' padding stripped by the encoder is restored before handing the
# text to `base64 -d`.
b64url_decode() {
  local encoded="$1"
  encoded="${encoded//-/+}"
  encoded="${encoded//_//}"
  case $(( ${#encoded} % 4 )) in
    1) encoded+="===" ;;
    2) encoded+="==" ;;
    3) encoded+="=" ;;
  esac
  printf '%s' "$encoded" | base64 -d
}

# verify_release_chain TARBALL CHECKSUM SIGNATURE [EXPECTED_ARTIFACT]
#
# Verify a downloaded release before it is installed. Any failed check
# calls `die` (which exits the script); on success prints an `ok` line.
#
# Arguments:
#   $1  path to the release tarball
#   $2  path to its sha256 checksum file (checked with `sha256sum -c`)
#   $3  path to the signature file (JWS compact serialization)
#   $4  artifact name the signed payload must carry
#       (default: basename of $1)
#
# Checks, in order:
#   1. a public key is available
#   2. `sha256sum -c` on the checksum file passes
#   3. the signature has three non-empty dot-separated parts
#   4. header alg / typ / kid have the accepted values
#   5. the signature over "<header>.<payload>" verifies with the key
#   6. the payload's sha256 equals the tarball's actual sha256
#   7. the payload's artifact equals the expected artifact name
#
# NOTE(review): jws_header, jws_payload and jws_sig are not declared
# `local`, so they remain set as globals after the call.
verify_release_chain() {
  local tarball="$1" checksum="$2" signature="$3"
  local expected_artifact="${4:-$(basename "$tarball")}"
  local pubkey_pem_data
  # Key precedence: env override, then the embedded key (unless it is
  # still the unsubstituted placeholder). With no key at all, skipping
  # is allowed only when BOTH test-mode variables are set to 1.
  if [[ -n "${VLOUD_RELEASE_PUBKEY_PEM:-}" ]]; then
    pubkey_pem_data="$VLOUD_RELEASE_PUBKEY_PEM"
  elif [[ "$EMBEDDED_RELEASE_PUBKEY_PEM" != "__VLOUD_RELEASE_PUBKEY_PEM__" ]]; then
    pubkey_pem_data="$EMBEDDED_RELEASE_PUBKEY_PEM"
  else
    if [[ "${VLOUD_INSECURE_SKIP_VERIFY:-0}" == "1" && "${VLOUD_TESTING:-0}" == "1" ]]; then
      warn "VERIFICATION SKIPPED — VLOUD_INSECURE_SKIP_VERIFY=1 set (test mode)"
      return 0
    fi
    die "release public key not configured.  Set VLOUD_RELEASE_PUBKEY_PEM or use a release-built bootstrap.sh.  To skip for testing: VLOUD_INSECURE_SKIP_VERIFY=1 VLOUD_TESTING=1."
  fi
  # The key is written to a temp file because openssl takes it via -inkey.
  local pubkey_file
  pubkey_file=$(mktemp /tmp/vloud-release-key-XXXXXX.pem)
  printf '%s' "$pubkey_pem_data" > "$pubkey_file"

  # Runs inside the tarball's directory and opens the checksum file by
  # its basename, i.e. the checksum file is expected next to the tarball.
  if ! ( cd "$(dirname "$tarball")" && sha256sum -c "$(basename "$checksum")" >/dev/null 2>&1 ); then
    rm -f "$pubkey_file"
    die "release tarball sha256 MISMATCH — refusing install"
  fi

  # Read the signature, strip newlines / CRs / spaces, split on '.'.
  local jws
  jws=$(<"$signature")
  jws="${jws//[$'\n\r ']/}"
  IFS='.' read -r jws_header jws_payload jws_sig <<<"$jws"
  if [[ -z "$jws_header" || -z "$jws_payload" || -z "$jws_sig" ]]; then
    rm -f "$pubkey_file"
    die "release signature is not a JWS compact serialization — refusing install"
  fi

  # Header fields are pulled out with sed; the patterns only match the
  # compact `"key":"value"` form (no whitespace around the colon).
  local header_json alg typ kid
  header_json=$(b64url_decode "$jws_header")
  alg=$(printf '%s' "$header_json" | sed -n 's/.*"alg":"\([^"]*\)".*/\1/p')
  typ=$(printf '%s' "$header_json" | sed -n 's/.*"typ":"\([^"]*\)".*/\1/p')
  kid=$(printf '%s' "$header_json" | sed -n 's/.*"kid":"\([^"]*\)".*/\1/p')
  if [[ "$alg" != "EdDSA" || "$typ" != "vloud-release+jws" ]]; then
    rm -f "$pubkey_file"
    die "release signature header invalid (alg=$alg typ=$typ) — refusing install"
  fi
  if [[ "$kid" != "release-v1" && "$kid" != "release-v2" ]]; then
    rm -f "$pubkey_file"
    die "release signature kid '$kid' not in allowed set — refusing install"
  fi

  # The signed message is the text "<header>.<payload>" exactly as it
  # appeared in the file; the signature part is decoded to raw bytes.
  local sig_bin sign_input
  sig_bin=$(mktemp /tmp/vloud-sig-XXXXXX.bin)
  b64url_decode "$jws_sig" > "$sig_bin"
  sign_input=$(mktemp /tmp/vloud-signin-XXXXXX)
  printf '%s' "$jws_header.$jws_payload" > "$sign_input"
  if ! openssl pkeyutl -verify -pubin -inkey "$pubkey_file" \
       -rawin -in "$sign_input" -sigfile "$sig_bin" >/dev/null 2>&1; then
    rm -f "$pubkey_file" "$sig_bin" "$sign_input"
    die "release signature INVALID — refusing install"
  fi
  rm -f "$sig_bin" "$sign_input"

  # Bind the (now trusted) payload to this tarball: both the digest and
  # the artifact name must match.
  local payload_json payload_sha payload_artifact actual_sha
  payload_json=$(b64url_decode "$jws_payload")
  payload_sha=$(printf '%s' "$payload_json" | sed -n 's/.*"sha256":"\([0-9a-f]*\)".*/\1/p')
  payload_artifact=$(printf '%s' "$payload_json" | sed -n 's/.*"artifact":"\([^"]*\)".*/\1/p')
  actual_sha=$(sha256sum "$tarball" | awk '{print $1}')
  if [[ "$payload_sha" != "$actual_sha" ]]; then
    rm -f "$pubkey_file"
    die "release signed sha256 ($payload_sha) does not match tarball ($actual_sha) — refusing install"
  fi
  if [[ "$payload_artifact" != "$expected_artifact" ]]; then
    rm -f "$pubkey_file"
    die "release signed artifact ($payload_artifact) does not match expected artifact ($expected_artifact) — refusing install"
  fi

  rm -f "$pubkey_file"
  ok "release verified: $expected_artifact (kid=$kid)"
}

# ─── Mode detection + dispatch ───
#
# This block decides what kind of install we're running. After this,
# the script either dispatches to a mode-specific function (and exits)
# or falls through to the linear fresh-install phases 1-6 below.
MODE="${VLOUD_INSTALL_MODE:-}"

# _vloud_parse_args ARG...
#
# Translate command-line flags into MODE and the VLOUD_* option
# globals. The last mode flag wins; unknown arguments are ignored.
#
# Value flags accept both `--flag=value` and `--flag value`. The
# arguments are walked by index so the space-separated form picks up
# the word that follows the flag wherever the flag appears, that word
# is consumed rather than re-read as a flag, and the script's own "$@"
# is left untouched. A value flag given as the very last argument
# yields an empty value.
#
# Globals written: MODE, VLOUD_TARGET_VERSION, VLOUD_ROLLBACK_TO,
#                  VLOUD_APPLY_RUN_ID, VLOUD_ROLLBACK_AUTOMATIC
# --help / -h prints the header comment of "$0" and exits 0.
_vloud_parse_args() {
  local -a argv=("$@")
  local argc=$#
  local idx=0
  local arg
  while (( idx < argc )); do
    arg="${argv[idx]}"
    idx=$(( idx + 1 ))
    case "$arg" in
      --upgrade)              MODE=upgrade ;;
      --rollback)             MODE=rollback ;;
      --repair)               MODE=repair ;;
      --doctor|--health)      MODE=doctor ;;
      --force-clean-install)  MODE=force-clean ;;
      --fresh)                MODE=fresh ;;
      # PR1: per-version targeting. When absent, behaviour is unchanged
      # (resolve to install.vloud.app/releases/latest/…).
      --version=*)            VLOUD_TARGET_VERSION="${arg#--version=}" ;;
      --version)              VLOUD_TARGET_VERSION="${argv[idx]:-}" ; idx=$(( idx + 1 )) ;;
      # PR1: rollback target. When set with MODE=rollback, restore that
      # specific .pre-upgrade-* slot; otherwise restore the newest.
      --to=*)                 VLOUD_ROLLBACK_TO="${arg#--to=}" ;;
      --to)                   VLOUD_ROLLBACK_TO="${argv[idx]:-}" ; idx=$(( idx + 1 )) ;;
      # PR1: engine apply-run correlation. Tags every progress event
      # so the engine can match JSONL rows to its apply-run row.
      --apply-run-id=*)       VLOUD_APPLY_RUN_ID="${arg#--apply-run-id=}" ;;
      --apply-run-id)         VLOUD_APPLY_RUN_ID="${argv[idx]:-}" ; idx=$(( idx + 1 )) ;;
      # Phase E (2026-05-14): explicit "this rollback was fired by
      # systemd, not by an operator click". Accepted but currently
      # behaves identically to plain --rollback. Mostly here so the
      # systemd unit's ExecStart doesn't get rejected as unknown args.
      --automatic)            VLOUD_ROLLBACK_AUTOMATIC=1 ;;
      --help|-h)
        # Lines 2-33 of this file are the mode list plus the
        # curl-pipe-bash examples; strip the leading "# ".
        sed -n '2,33p' "$0" | sed 's/^# \{0,1\}//'
        exit 0
        ;;
      *)
        # Ignore unknown args here; phase scripts may consume their own.
        ;;
    esac
  done
}
_vloud_parse_args "$@"

# PR1: when --version=<v> is passed, override the default URL with
# the per-version path. Operators who already set VLOUD_RELEASE_URL
# explicitly (private mirror / offline-mirror customers) win.
# A requested version only re-points the download while
# VLOUD_RELEASE_URL still equals the stock "latest" URL.
if [[ -n "$VLOUD_TARGET_VERSION" && "$VLOUD_RELEASE_URL" == "https://install.vloud.app/releases/latest/vloud-latest.tar.gz" ]]; then
  VLOUD_RELEASE_URL="https://install.vloud.app/releases/${VLOUD_TARGET_VERSION}/vloud-${VLOUD_TARGET_VERSION}.tar.gz"
fi

# No explicit mode: pick one from what is on disk. A built server
# tree under the install dir means upgrade, anything else means fresh.
if [[ -z "$MODE" ]]; then
  if [[ ! -d "$VLOUD_INSTALL_DIR/packages/server/dist" ]]; then
    MODE=fresh
    say "no existing install — auto-mode → FRESH INSTALL"
  else
    MODE=upgrade
    say "existing install detected at $VLOUD_INSTALL_DIR — auto-mode → UPGRADE"
    warn "running upgrade (preserves DB + config). Use --force-clean-install to wipe."
  fi
fi

# ── mode helpers (defined here so dispatch below can use them) ──

# Backup critical state to /var/lib/vloud/backups/<label>-<ts>.tar.gz
# AND copy /etc/vloud.env. Idempotent + non-destructive.
# do_backup_state [LABEL]
#
# Archive critical state into
# /var/lib/vloud/backups/<LABEL>-<utc-timestamp>.tar.gz
# (LABEL defaults to "pre-upgrade"). Only sources that exist are
# included: $ENV_FILE, /etc/vloud, the SQLite DB under the install
# dir, /var/lib/vloud, and the vloud-*.conf vhosts in nginx's
# sites-enabled other than the system app vhosts.
#
# The backups directory itself is excluded from the archive: it lives
# under /var/lib/vloud, so without the exclusion every backup would
# contain all earlier backups.
#
# Outputs: progress lines on stdout; the LAST line is the archive path
#          (printed even when nothing was archived).
do_backup_state() {
  local label="${1:-pre-upgrade}"
  local ts; ts=$(date -u +%Y%m%dT%H%M%SZ)
  local backup_dir=/var/lib/vloud/backups
  mkdir -p "$backup_dir"
  local backup="$backup_dir/$label-$ts.tar.gz"
  say "backing up critical state → $backup"

  local sources=()
  [[ -f $ENV_FILE ]]                                    && sources+=("$ENV_FILE")
  [[ -d /etc/vloud ]]                                   && sources+=("/etc/vloud")
  [[ -f $VLOUD_INSTALL_DIR/packages/server/vloud.db ]]  && sources+=("$VLOUD_INSTALL_DIR/packages/server/vloud.db")
  [[ -d /var/lib/vloud ]]                               && sources+=("/var/lib/vloud")

  # nginx vhosts the engine generated (vloud-*.conf in sites-enabled,
  # excluding system app names). Globbing instead of parsing `ls`;
  # the -e/-L test skips the unexpanded pattern when nothing matches
  # while still keeping dangling symlinks.
  local vhost
  for vhost in /etc/nginx/sites-enabled/vloud-*.conf; do
    [[ -e "$vhost" || -L "$vhost" ]] || continue
    case "${vhost##*/}" in
      vloud-app-*|vloud-admin*|vloud-commercial-*|vloud-marketing*|vloud-portal*|vloud-license*)
        continue ;;
    esac
    sources+=("$vhost")
  done

  if [[ ${#sources[@]} -gt 0 ]]; then
    tar -czf "$backup" --ignore-failed-read --exclude="$backup_dir" \
        "${sources[@]}" 2>/dev/null || warn "backup partial — some files unreadable"
    ok "backup: $backup ($(du -h "$backup" 2>/dev/null | awk '{print $1}'))"
  else
    warn "nothing to back up (no existing state)"
  fi
  echo "$backup"
}

# Stop vloud services gracefully. Idempotent.
# do_stop_services
# Stop the three vloud units that are currently active; units that
# are not active are skipped silently. A failed stop is reported with
# `warn` and does not abort. The loop variable is local so a caller's
# own variables are not clobbered.
do_stop_services() {
  local unit
  say "stopping vloud services"
  for unit in vloud.service vloud-job-worker.service vloud-scheduler.service; do
    systemctl is-active --quiet "$unit" 2>/dev/null || continue
    if systemctl stop "$unit" 2>/dev/null; then
      ok "stopped $unit"
    else
      warn "could not stop $unit"
    fi
  done
}

# Start vloud services. Reports health within 30s.
#
# Critical: `systemctl reset-failed` BEFORE each start. Without this,
# a previous failed-start cascade (e.g. the new engine crash-looped
# during an upgrade attempt) leaves the unit in `start-limit-hit`.
# systemd then refuses every subsequent start with "Start request
# repeated too quickly" — including the rollback restart, which is
# exactly the cascading-failure mode that bit the 0.5.1-beta upgrade
# on 188.245.113.223 (2026-05-13).
#
# We also no longer swallow stderr on `systemctl start` — the actual
# systemd refusal message is the single most useful piece of
# forensics when a start fails, and `2>/dev/null` was hiding it.
# do_start_services_with_healthcheck
#
# Clear any start-limit state, reload systemd, start the three vloud
# units, then poll http://127.0.0.1:$VLOUD_PORT/api/health up to 15
# times (2 s curl timeout, 2 s pause between attempts).
#
# Returns: 0 as soon as the endpoint answers HTTP 200, 1 otherwise.
# A non-zero `systemctl start` is only warned about — the health
# probe decides the outcome. systemctl's stderr is deliberately left
# visible.
do_start_services_with_healthcheck() {
  say "starting vloud services"
  systemctl reset-failed vloud.service vloud-job-worker.service vloud-scheduler.service 2>/dev/null || true
  systemctl daemon-reload
  systemctl start vloud.service             || warn "vloud.service start returned $?"
  systemctl start vloud-job-worker.service  || warn "vloud-job-worker.service start returned $?"
  systemctl start vloud-scheduler.service   || warn "vloud-scheduler.service start returned $?"

  local attempt rc
  # Measure real elapsed time: the attempt counter is not a number of
  # seconds (each round costs the curl time plus a 2 s sleep).
  local started=$SECONDS
  for attempt in {1..15}; do
    # Any curl failure counts as "no answer", whatever it printed.
    rc=$(curl -s -o /dev/null -w '%{http_code}' --max-time 2 "http://127.0.0.1:$VLOUD_PORT/api/health" 2>/dev/null) || rc=000
    if [[ "$rc" == "200" ]]; then
      ok "engine health-check passed in $(( SECONDS - started ))s"
      return 0
    fi
    sleep 2
  done
  fail "engine did not pass health-check within 30s — see journalctl -u vloud"
  return 1
}

# Dump the last lines of vloud.service's journal so operators have
# forensics in the install log when a start failed. Cheap; only runs
# on failure paths.
# do_dump_failure_journal
# Print the last 40 journal lines of vloud.service, each prefixed with
# "  | ", so the install log carries forensics after a failed start.
# Best-effort: always returns 0, so a journalctl failure under
# `set -e` / `pipefail` cannot abort the failure path that called it.
do_dump_failure_journal() {
  say "vloud.service journal (last 40 lines) — for forensics:"
  journalctl -u vloud.service -n 40 --no-pager 2>&1 | sed 's/^/  | /' || true
}

# Per-file integrity verification (PR1).
#
# The release tarball SHA is already covered by the signed manifest
# (verify_release_chain). This adds a defence-in-depth check: if the
# release build pipeline shipped a dist/SHA256SUMS file, verify every
# extracted file against it. Catches the (uncommon but plausible)
# case where the tarball decompressed but a single file was corrupted
# OR the build pipeline shipped an inconsistent tarball.
#
# Missing SHA256SUMS is NOT fatal — older releases don't ship it,
# and offline/private-mirror customers may strip it. Operator gets
# a `warn` instead of an abort.
# do_verify_sha256sums STAGING_ROOT
#
# Defence-in-depth per-file check of an extracted release. For each of
#   STAGING_ROOT/packages/server/dist/SHA256SUMS
#   STAGING_ROOT/packages/web/dist/SHA256SUMS
# that exists, run `sha256sum -c` from inside that directory. Files
# listed in SHA256SUMS but absent on disk are ignored
# (--ignore-missing).
#
# A failed check emits a "failed" progress event and dies with exit
# code 9. When neither file exists a warning plus a "warn" event is
# emitted (not fatal); otherwise an "ok" event records how many
# SHA256SUMS files were verified.
do_verify_sha256sums() {
  local staging_root="$1"
  local checked=0 missing=0
  # Loop variables are local so nothing leaks into the caller's scope.
  local sums sums_dir
  for sums in \
      "$staging_root/packages/server/dist/SHA256SUMS" \
      "$staging_root/packages/web/dist/SHA256SUMS"; do
    if [[ ! -f "$sums" ]]; then
      missing=$((missing+1))
      continue
    fi
    sums_dir=$(dirname "$sums")
    # SHA256SUMS uses paths relative to its own directory.
    if ( cd "$sums_dir" && sha256sum -c --quiet --ignore-missing SHA256SUMS >/dev/null 2>&1 ); then
      ok "per-file integrity OK: $sums"
      checked=$((checked+1))
    else
      emit_progress verify-sha256sums failed "mismatch in $sums"
      die "per-file integrity FAILED in $sums_dir — refusing swap" 9
    fi
  done
  if (( checked == 0 )) && (( missing > 0 )); then
    warn "release ships no SHA256SUMS — per-file integrity SKIPPED (manifest signature still in force)"
    emit_progress verify-sha256sums warn "no SHA256SUMS in tarball"
  else
    emit_progress verify-sha256sums ok "verified=$checked"
  fi
}

# ABI preflight: confirm the staging tree's native modules can load
# against the host's Node ABI. If the release-runtime.json's
# node_module_version differs from the host's, attempt `npm rebuild`
# inside the staging tree (better-sqlite3's prebuild-install will
# fetch a binary matching the host's Node). Only proceed if the
# rebuild succeeds AND a post-rebuild require() of better-sqlite3
# loads cleanly.
#
# Called from do_upgrade_flow BEFORE the atomic swap so a mismatch
# never reaches the running system.
# do_abi_preflight_or_rebuild STAGING
#
# Make sure the native modules under STAGING/packages/server load on
# this host's Node before the staging tree is swapped in.
#   1. Compare node_module_version from STAGING/release-runtime.json
#      with the host's process.versions.modules (report only).
#   2. Run `npm rebuild` in the staging server package (always).
#   3. Load better-sqlite3 from staging and query an in-memory DB.
# Returns 0 when steps 2 and 3 succeed, 1 otherwise. Relies on
# `pipefail` to see an npm failure through the output filter.
do_abi_preflight_or_rebuild() {
  local staging="$1"
  local server_dir="$staging/packages/server"
  local manifest="$staging/release-runtime.json"
  local host_abi release_abi

  host_abi=$(node -e 'process.stdout.write(String(process.versions.modules))')

  if [[ -f "$manifest" ]]; then
    release_abi=$(sed -n 's/.*"node_module_version":[[:space:]]*\([0-9]*\).*/\1/p' "$manifest" | head -1)
    if [[ -z "$release_abi" || "$release_abi" != "$host_abi" ]]; then
      warn "ABI mismatch: tarball NMV=${release_abi:-?} host NMV=$host_abi — rebuilding native modules in staging"
    else
      # Equal NMV is necessary but not sufficient (glibc / arch can
      # still differ), so the load test below runs regardless.
      ok "ABI match: tarball NMV=$release_abi = host NMV=$host_abi"
    fi
  else
    warn "release missing release-runtime.json — older tarball, doing best-effort rebuild"
  fi

  # Rebuild all native modules in staging and let npm decide which of
  # them need work; only the last 10 output lines are shown.
  say "rebuilding native bindings against host Node $(node --version) (NMV=$host_abi)"
  ( cd "$server_dir" && npm rebuild --no-audit --no-fund 2>&1 ) \
    | tail -10 \
    | sed 's/^/  | /' \
    || {
      fail "npm rebuild failed in staging — see lines above"
      return 1
    }

  # Load test: a .node binary that is still ABI-wrong throws here and
  # the caller gets 1 before anything is swapped.
  if ! ( cd "$server_dir" && \
         node -e '
           try {
             const Db = require("better-sqlite3");
             const db = new Db(":memory:");
             db.prepare("SELECT 1").get();
             db.close();
             process.stdout.write("ok");
           } catch (e) {
             process.stderr.write("LOAD_FAILED: " + e.message + "\n");
             process.exit(1);
           }
         ' >/dev/null ); then
    fail "better-sqlite3 still fails to load after rebuild — release is incompatible with this host"
    return 1
  fi

  ok "ABI preflight passed — native modules load cleanly against host Node"
  return 0
}

# ── --doctor ──────────────────────────────────────────────────
# do_doctor_flow
#
# Read-only health report: unit states, listener on $VLOUD_PORT,
# engine /api/health and /api/system/version, SQLite integrity check,
# disk usage. Makes no changes.
#
# Every probe is best-effort. The script runs under
# `set -euo pipefail`, so each probe that can legitimately fail
# (engine down, unreadable DB, missing install dir) is guarded —
# the report must complete precisely when something is broken.
do_doctor_flow() {
  say "DOCTOR: read-only health check"
  echo
  local unit
  for unit in vloud vloud-job-worker vloud-scheduler; do
    printf "  %-22s active=%s enabled=%s\n" "$unit" \
      "$(systemctl is-active "$unit" 2>/dev/null)" \
      "$(systemctl is-enabled "$unit" 2>/dev/null)"
  done
  for unit in nginx named; do
    printf "  %-22s active=%s\n" "$unit" "$(systemctl is-active "$unit" 2>/dev/null)"
  done
  echo
  # Probe the configured port rather than a hard-coded 2544.
  say "port $VLOUD_PORT listener"
  ss -lntp 2>/dev/null | grep ":${VLOUD_PORT}\b" | head -3 || warn "no listener on :$VLOUD_PORT"
  echo
  say "engine health"
  local health
  health=$(curl -s --max-time 4 "http://127.0.0.1:$VLOUD_PORT/api/health" 2>&1 || echo "(timeout)")
  echo "  $health"
  echo
  say "engine version"
  # With pipefail a refused connection would otherwise end the report.
  curl -s --max-time 4 "http://127.0.0.1:$VLOUD_PORT/api/system/version" 2>&1 | head -c 400 || true
  echo
  echo
  say "DB integrity"
  local db="$VLOUD_INSTALL_DIR/packages/server/vloud.db"
  if [[ -f "$db" ]]; then
    if command -v sqlite3 >/dev/null 2>&1; then
      # stderr is folded in, so on failure the first line is the error text.
      local r
      r=$(sqlite3 "$db" 'PRAGMA integrity_check' 2>&1 | head -1) || true
      echo "  $r"
    else
      warn "sqlite3 not installed"
    fi
  else
    warn "DB not found at $db"
  fi
  echo
  say "disk usage"
  # df exits non-zero when the install dir does not exist; still show "/".
  df -h "$VLOUD_INSTALL_DIR" / 2>/dev/null | awk 'NR==1 || /\// {print "  "$0}' | head -3 || true
  echo
  ok "DOCTOR done — no changes made"
}

# ── --repair ──────────────────────────────────────────────────
# do_repair_flow
#
# Operator's "make it right again" lever for an existing install:
#   1. re-copy the packaged systemd units from the install tree
#   2. reconcile /etc/vloud.env (dies with code 7 if that fails)
#   3. refresh the license public key from the license server
#   4. heal /var/lib/vloud ownership when it is not vloud:vloud
#   5. daemon-reload, stop + start services with health check
#   6. print the post-upgrade health summary and verify dependencies
#
# Dies with code 3 when no install exists at $VLOUD_INSTALL_DIR.
# Working variables are local so nothing leaks into the caller.
do_repair_flow() {
  say "REPAIR: re-emit systemd units + restart"
  if [[ ! -d "$VLOUD_INSTALL_DIR/packages/server/dist" ]]; then
    die "no install at $VLOUD_INSTALL_DIR — repair needs an existing install; run --fresh first" 3
  fi

  # Re-emit systemd units from the install tree. The packaged units
  # live under $VLOUD_INSTALL_DIR/packaging/systemd/.
  local pkg_dir=$VLOUD_INSTALL_DIR/packaging/systemd
  local unit
  if [[ -d $pkg_dir ]]; then
    for unit in vloud.service vloud-job-worker.service vloud-scheduler.service vloud.slice; do
      if [[ -f "$pkg_dir/$unit" ]]; then
        cp -f "$pkg_dir/$unit" "/etc/systemd/system/$unit"
        ok "wrote /etc/systemd/system/$unit"
      fi
    done
  else
    warn "packaging/systemd/ not found in install tree — using existing unit files"
  fi

  # 0.6.0b — repair also reconciles env-file + license-pubkey.
  # Previously it only re-emitted systemd units, which left the broken
  # state from a partial 0.6.0 → 0.6.0a upgrade in place.
  if ! do_reconcile_env_file 1; then
    die "env-file reconcile failed — refusing to restart services in known-broken state" 7
  fi

  # license pubkey (mirrors do_upgrade_flow). Only a response that
  # contains a PEM public-key header replaces the installed key.
  if [[ -n "${VLOUD_LICENSE_SERVER_URL_DEFAULT:-}" ]]; then
    install -d -m 0755 /etc/vloud
    local tmp_pem
    tmp_pem=$(mktemp /tmp/vloud-license-pubkey.XXXXXX.pem)
    if curl -fsSL --max-time 15 -o "$tmp_pem" "$VLOUD_LICENSE_SERVER_URL_DEFAULT/v1/license-pubkey" \
       && grep -q 'BEGIN PUBLIC KEY' "$tmp_pem"; then
      install -m 0644 -o root -g root "$tmp_pem" /etc/vloud/license-v1.pub.pem
      ok "license-v1.pub.pem refreshed"
    else
      # Not fatal, but say so instead of failing silently.
      warn "license-v1.pub.pem NOT refreshed — could not fetch a public key from $VLOUD_LICENSE_SERVER_URL_DEFAULT"
    fi
    rm -f "$tmp_pem"
  fi

  # 2026-05-14: --repair heals known-bad ownership state. The
  # /var/lib/vloud parent was created as root:root by pre-2026-05-14
  # bootstraps and silently broke engine writes.
  if id -u vloud >/dev/null 2>&1 && [[ -d /var/lib/vloud ]]; then
    local cur_owner
    cur_owner=$(stat -c '%U:%G' /var/lib/vloud 2>/dev/null)
    if [[ "$cur_owner" != "vloud:vloud" ]]; then
      say "healing /var/lib/vloud ownership: $cur_owner → vloud:vloud"
      chown vloud:vloud /var/lib/vloud 2>/dev/null
      chmod 0750 /var/lib/vloud 2>/dev/null
      [[ -d /var/lib/vloud/staging ]] && \
        chown -R vloud:vloud /var/lib/vloud/staging 2>/dev/null
      ok "/var/lib/vloud ownership healed"
    fi
  fi

  systemctl daemon-reload
  do_stop_services
  do_start_services_with_healthcheck
  do_post_upgrade_health_summary || true
  # 2026-05-14: --repair is the operator's "fix this host" knob —
  # verify all runtime deps before declaring success.
  do_verify_dependencies || warn "post-repair verify-dependencies reported failures (operator action recommended)"
  ok "REPAIR done"
}

# ── --upgrade ─────────────────────────────────────────────────
# 0.6.0b — env-file reconciliation used by BOTH fresh-install and
# upgrade flows. Pre-0.6.0b the env-write happened only inside the
# fresh-install linear phases, so `--upgrade` skipped it entirely
# and engines ended up on 0.6.0a binary with stale (or missing)
# VLOUD_LICENSE_SERVER_URL. That's what bit 188.245.113.223's
# first 0.6.0a upgrade.
#
# Idempotent. Always writes the file atomically via mktemp + install.
# Refuses to continue (returns non-zero) if a required var couldn't
# be written — fail-closed semantics so the caller surfaces a clear
# error instead of silently entering a broken state.
# do_reconcile_env_file [FORCE_URL]
#
# Bring $ENV_FILE up to date without losing operator settings. Works
# on a temp copy next to $ENV_FILE and installs it (0640 root:vloud)
# only after every required key is present and non-empty.
#
# Arguments:
#   $1  1 (default) — always set VLOUD_LICENSE_SERVER_URL to
#                     $VLOUD_LICENSE_SERVER_URL_DEFAULT
#       0           — only replace a missing value or one pointing at
#                     127.0.0.1 / localhost / adminpanel.vloud.app
# Returns: 0 on success; 1 (after removing the temp file) when a
#          required key is missing or the final install fails.
do_reconcile_env_file() {
  local force_url="${1:-1}"   # default: ALWAYS rewrite the cloud URL
  local TMP
  TMP=$(mktemp "${ENV_FILE}.XXXXXX")

  # Resolve identity inputs. machine_id derived the same way fresh-install does.
  local raw_mid
  if [[ -f /etc/machine-id && -s /etc/machine-id ]]; then
    raw_mid=$(cat /etc/machine-id)
  else
    raw_mid=$(head -c 32 /dev/urandom | sha256sum | cut -d' ' -f1)
  fi
  local mid; mid=$(printf 'vloud:%s' "$raw_mid" | sha256sum | cut -d' ' -f1 | head -c 32)

  if [[ -f "$ENV_FILE" ]]; then
    cp "$ENV_FILE" "$TMP"
    # A hand-edited file may lack the final newline; add it so the
    # first appended key does not get glued onto the last line.
    if [[ -s "$TMP" && -n "$(tail -c 1 "$TMP")" ]]; then
      printf '\n' >> "$TMP"
    fi
  fi

  # Local upsert: add the line only if the key is entirely absent.
  _upsert_env() {
    local k="$1" v="$2"
    if grep -q "^$k=" "$TMP"; then return; fi
    printf '%s=%s\n' "$k" "$v" >> "$TMP"
  }

  # Local rewrite: replace the value of key (or insert if missing).
  # The file is rewritten line by line in bash rather than with sed, so
  # the value is taken literally whatever it contains ('&', '|', '\').
  _set_env() {
    local k="$1" v="$2"
    if grep -q "^$k=" "$TMP"; then
      local line
      local -a kept=()
      while IFS= read -r line || [[ -n "$line" ]]; do
        if [[ "$line" == "$k="* ]]; then
          kept+=("$k=$v")
        else
          kept+=("$line")
        fi
      done < "$TMP"
      printf '%s\n' "${kept[@]}" > "$TMP"
    else
      printf '%s=%s\n' "$k" "$v" >> "$TMP"
    fi
  }

  # Stable bootstrap fields — upsert only (preserve existing values).
  _upsert_env PORT                  "$VLOUD_PORT"
  _upsert_env NODE_ENV              production
  _upsert_env VLOUD_BIND_HOST       "$VLOUD_BIND_HOST"
  _upsert_env VLOUD_JWT_SECRET      "$(openssl rand -hex 64)"
  _upsert_env VLOUD_SUDO_FALLBACK   0
  _upsert_env VLOUD_MACHINE_ID      "$mid"
  # 16 random bytes as 32 hex chars; openssl is already required for
  # the JWT secret above, so no separate hex-dump tool is needed.
  _upsert_env VLOUD_INSTALL_ID      "$(openssl rand -hex 16)"
  _upsert_env VLOUD_TRIAL_EXPIRES_AT \
              "$(date -u -d "+${VLOUD_TRIAL_DAYS} days" +%Y-%m-%dT%H:%M:%SZ)"
  # Cloud email relay — enables OTP verification during onboarding.
  # Customer engines relay OTP emails through the license server
  # instead of requiring local SMTP or Resend API key configuration.
  _upsert_env VLOUD_EMAIL_RELAY_ENABLED 1
  _upsert_env VLOUD_EMAIL_RELAY_URL     "$VLOUD_LICENSE_SERVER_URL_DEFAULT"
  if [[ "${COEXIST:-0}" -eq 1 ]]; then _upsert_env VLOUD_COEXIST 1; fi

  # Cloud URL. force_url=1 always writes the current default.
  # force_url=0 keeps an operator's custom URL and only replaces a
  # missing value or one of the known-broken earlier defaults
  # (localhost / 127.0.0.1 / adminpanel.vloud.app).
  if [[ "$force_url" -eq 1 ]]; then
    _set_env VLOUD_LICENSE_SERVER_URL "$VLOUD_LICENSE_SERVER_URL_DEFAULT"
  else
    local existing
    existing=$(grep -E '^VLOUD_LICENSE_SERVER_URL=' "$TMP" | head -1 | cut -d= -f2- || true)
    if [[ -z "$existing" ]] \
       || echo "$existing" | grep -qE '^https?://(127\.0\.0\.1|localhost|adminpanel\.vloud\.app)'; then
      _set_env VLOUD_LICENSE_SERVER_URL "$VLOUD_LICENSE_SERVER_URL_DEFAULT"
    fi
  fi

  # Fail-closed verification: every required var must be present + non-empty.
  local required=(PORT NODE_ENV VLOUD_MACHINE_ID VLOUD_INSTALL_ID
                  VLOUD_LICENSE_SERVER_URL VLOUD_JWT_SECRET)
  local missing=()
  local k v
  for k in "${required[@]}"; do
    v=$(grep -E "^$k=" "$TMP" | head -1 | cut -d= -f2- || true)
    if [[ -z "$v" ]]; then missing+=("$k"); fi
  done
  if [[ ${#missing[@]} -gt 0 ]]; then
    rm -f "$TMP"
    fail "env-file reconcile FAILED — missing required vars: ${missing[*]}"
    return 1
  fi

  # Report the URL that actually ends up in the file (with force_url=0
  # it may be an operator's custom value, not the default).
  local final_url
  final_url=$(grep -E '^VLOUD_LICENSE_SERVER_URL=' "$TMP" | head -1 | cut -d= -f2- || true)

  # Never leave the temp copy (it holds the JWT secret) behind.
  if ! install -m 0640 -o root -g vloud "$TMP" "$ENV_FILE"; then
    rm -f "$TMP"
    fail "env-file reconcile FAILED — could not install $ENV_FILE"
    return 1
  fi
  rm -f "$TMP"
  ok "$ENV_FILE reconciled (VLOUD_LICENSE_SERVER_URL=$final_url)"
  return 0
}

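# 0.6.0b — post-upgrade health summary (do_post_upgrade_health_summary,
# defined after do_verify_dependencies below). Prints the engine's
# /api/health and /api/system/version responses, then checks the
# cloud-sync prerequisites: the license-server URL in the env file,
# the local license pubkey, and whether the cloud URL is reachable.
# Non-zero exit signals broken state to the caller; do_upgrade_flow
# surfaces it as a warn (not a rollback trigger — the binary may
# still be perfectly serviceable for non-cloud workflows).
#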
# Final dependency-and-readiness gate. Runs at the END of every
# install flow (fresh, upgrade, repair). Asserts every prerequisite
# the engine relies on is actually present + healthy. Fails LOUD
# with a clear diagnostic per missing/broken item — the install is
# considered complete only when this passes.
#
# 2026-05-14: added after the fresh-host validation found that
# bootstrap was silently skipping Redis. This gate would have caught
# that pre-release. Treat any new runtime dep as a required addition
# here.
do_verify_dependencies() {
  # Final readiness gate: checks binaries, Node ABI, systemd services,
  # engine HTTP health, redis, required files, listening ports and
  # ownership/writability of state paths.
  #
  # Environment switches:
  #   VLOUD_BOOTSTRAP_NO_REDIS=1 — skip the redis service, the two
  #     worker services, the redis ping and the 6379 port check.
  #   VLOUD_VERIFY_SKIP_HTTP=1   — skip the /api/health probe.
  # Globals read: VLOUD_PORT, VLOUD_INSTALL_DIR.
  # Returns: 0 when no hard failure was recorded (warnings are
  #   non-fatal), 1 otherwise. Every problem is logged via fail/warn.
  say "verifying runtime dependencies"
  emit_progress verify-deps started ""
  local failures=0
  local warnings=0
  local b s f p   # loop variables — kept out of the global scope

  # Binaries on PATH.
  local need_bin=(node npm curl openssl sqlite3 nginx redis-cli sudo systemctl)
  for b in "${need_bin[@]}"; do
    if ! command -v "$b" >/dev/null 2>&1; then
      fail "missing binary: $b"
      failures=$((failures+1))
    fi
  done

  # Node ABI sanity (we already enforced 20.x earlier; double-check).
  if command -v node >/dev/null; then
    local nmv
    # `|| true`: a crashing node must be reported below as an ABI
    # mismatch, not abort the whole gate under errexit.
    nmv=$(node -e 'process.stdout.write(String(process.versions.modules))' 2>/dev/null) || true
    if [[ "$nmv" != "115" ]]; then
      fail "Node ABI is NMV=$nmv, expected 115 (Node 20.x)"
      failures=$((failures+1))
    fi
  fi

  # Required services.
  local need_service=(redis-server vloud vloud-job-worker vloud-scheduler nginx)
  for s in "${need_service[@]}"; do
    if [[ "$s" == redis-server ]] && [[ "${VLOUD_BOOTSTRAP_NO_REDIS:-0}" == "1" ]]; then
      continue
    fi
    if [[ "$s" == vloud-job-worker || "$s" == vloud-scheduler ]] && [[ "${VLOUD_BOOTSTRAP_NO_REDIS:-0}" == "1" ]]; then
      continue
    fi
    # redis service name varies across Ubuntu generations.
    if [[ "$s" == redis-server ]]; then
      if ! systemctl is-active --quiet redis-server.service 2>/dev/null \
         && ! systemctl is-active --quiet redis.service 2>/dev/null; then
        fail "redis-server.service / redis.service is not active"
        failures=$((failures+1))
      fi
      continue
    fi
    if ! systemctl is-active --quiet "$s.service" 2>/dev/null; then
      fail "$s.service is not active"
      failures=$((failures+1))
    fi
  done

  # Engine HTTP readiness — bypassed when explicitly skipped (e.g.
  # repair flow on a half-broken host that doesn't pass yet).
  if [[ "${VLOUD_VERIFY_SKIP_HTTP:-0}" != "1" ]]; then
    local rc
    # curl itself prints "000" for %{http_code} when no response was
    # received, so appending another "000" on failure would log
    # "000000". Only fall back when curl printed nothing at all.
    rc=$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 "http://127.0.0.1:$VLOUD_PORT/api/health" 2>/dev/null) || true
    rc=${rc:-000}
    if [[ "$rc" != "200" ]]; then
      fail "engine /api/health returned $rc (expected 200)"
      failures=$((failures+1))
    fi
  fi

  # Redis client ping.
  if [[ "${VLOUD_BOOTSTRAP_NO_REDIS:-0}" != "1" ]]; then
    if ! redis-cli ping 2>/dev/null | grep -q '^PONG$'; then
      fail "redis-cli ping did not return PONG"
      failures=$((failures+1))
    fi
  fi

  # Required files.
  # SEA binary or legacy dist — at least one must exist.
  local engine_ok=0
  if [[ -f "$VLOUD_INSTALL_DIR/vloud-engine" ]]; then engine_ok=1; fi
  if [[ -f "$VLOUD_INSTALL_DIR/packages/server/dist/index.js" ]]; then engine_ok=1; fi
  if [[ "$engine_ok" -eq 0 ]]; then
    fail "missing engine entrypoint (vloud-engine or dist/index.js)"
    failures=$((failures+1))
  fi
  # Quoted so an install dir containing spaces/globs stays one path.
  local need_file=(/etc/vloud.env /etc/vloud/license-v1.pub.pem
                   "$VLOUD_INSTALL_DIR/VERSION"
                   "$VLOUD_INSTALL_DIR/scripts/bootstrap.sh")
  for f in "${need_file[@]}"; do
    if [[ ! -e "$f" ]]; then
      fail "missing file: $f"
      failures=$((failures+1))
    fi
  done

  # Required listening ports: the engine port, 80, and (unless redis
  # is disabled) 6379. Matching is done on the local-address column
  # of `ss -lnt` with an anchored ":<port>$" regex.
  local need_port=("$VLOUD_PORT" 80)
  if [[ "${VLOUD_BOOTSTRAP_NO_REDIS:-0}" != "1" ]]; then
    need_port+=(6379)
  fi
  for p in "${need_port[@]}"; do
    if ! ss -lnt 2>/dev/null | awk -v port=":$p$" '$4 ~ port { found=1 } END { exit !found }'; then
      fail "no process listening on port $p"
      failures=$((failures+1))
    fi
  done

  # File ownership sanity — the DB file MUST be vloud:vloud or the
  # engine fails the first write attempt with SQLITE_READONLY_DIRECTORY.
  # NOTE(review): do_upgrade_flow chowns packages/ (DB included) to
  # root:root because the engine runs as root since 0.7.8 — confirm
  # whether this expectation is still current.
  if [[ -f "$VLOUD_INSTALL_DIR/packages/server/vloud.db" ]]; then
    local owner
    owner=$(stat -c '%U:%G' "$VLOUD_INSTALL_DIR/packages/server/vloud.db" 2>/dev/null) || true
    if [[ "$owner" != "vloud:vloud" ]]; then
      warn "DB file owner is $owner, expected vloud:vloud (engine WAL writes will fail)"
      warnings=$((warnings+1))
    fi
  fi

  # /var/lib/vloud writable by vloud user.
  if ! sudo -u vloud test -w /var/lib/vloud 2>/dev/null; then
    warn "/var/lib/vloud is not writable by user 'vloud' (state-write paths will fail)"
    warnings=$((warnings+1))
  fi

  if [[ $failures -gt 0 ]]; then
    emit_progress verify-deps failed "$failures dependency check(s) failed"
    fail "$failures dependency check(s) failed — install is INCOMPLETE"
    fail "see 'systemctl status vloud vloud-job-worker vloud-scheduler redis-server' + 'journalctl -u vloud' for diagnostics"
    return 1
  fi
  if [[ $warnings -gt 0 ]]; then
    warn "$warnings non-fatal dependency warning(s) above"
  fi
  emit_progress verify-deps ok "all dependencies healthy"
  ok "all runtime dependencies present + healthy"
  return 0
}

do_post_upgrade_health_summary() {
  # Print a five-row status table (engine health body, engine version,
  # env-file cloud URL, local license pubkey, cloud pubkey endpoint)
  # and judge whether cloud sync can work.
  #
  # Globals read: VLOUD_PORT, ENV_FILE, VLOUD_LICENSE_SERVER_URL_DEFAULT.
  # Returns: 0 when the env URL equals the default, the pubkey file is
  #   non-empty and the cloud endpoint answered; 1 otherwise.
  local engine_base="http://127.0.0.1:${VLOUD_PORT}"
  local pem_path=/etc/vloud/license-v1.pub.pem
  local row="  %-28s %s\n"

  echo
  say "post-upgrade health summary"

  # Row 1: raw /api/health body, capped at 200 bytes.
  printf "$row" "engine /api/health" \
    "$(curl -s --max-time 4 "${engine_base}/api/health" 2>/dev/null | head -c 200)"

  # Row 2: the "display" field scraped out of the version JSON.
  local ver
  ver=$(curl -s --max-time 4 "${engine_base}/api/system/version" 2>/dev/null \
        | sed -n 's/.*"display":"\([^"]*\)".*/\1/p')
  printf "$row" "engine version" "${ver:-?}"

  # Row 3: license-server URL as recorded in the env file.
  local env_url
  env_url=$(grep -E '^VLOUD_LICENSE_SERVER_URL=' "$ENV_FILE" 2>/dev/null | cut -d= -f2-)
  printf "$row" "/etc/vloud.env URL" "${env_url:-(missing)}"

  # Row 4: local copy of the license-signing public key.
  local pubkey_state="missing"
  if [[ -s "$pem_path" ]]; then
    pubkey_state="present ($(wc -c <"$pem_path") bytes)"
  fi
  printf "$row" "/etc/vloud/license-v1.pub.pem" "$pubkey_state"

  # Row 5: only probe the cloud when we actually have a URL to probe.
  local cloud_reach="unreachable"
  if [[ -n "$env_url" ]] \
     && curl -fsS --max-time 5 "$env_url/v1/license-pubkey" >/dev/null 2>&1; then
    cloud_reach="ok"
  fi
  printf "$row" "cloud /v1/license-pubkey" "$cloud_reach"

  echo
  # Any single unmet prerequisite downgrades the verdict to a warning.
  if [[ "$env_url" != "$VLOUD_LICENSE_SERVER_URL_DEFAULT" \
        || "$pubkey_state" == "missing" \
        || "$cloud_reach" != "ok" ]]; then
    warn "cloud sync prerequisites NOT FULLY MET. See /var/log/vloud/install.log."
    return 1
  fi
  ok "cloud sync prerequisites OK — engine should register within ~60s"
  return 0
}

do_upgrade_flow() {
  # In-place upgrade with automatic rollback.
  #
  # Sequence: backup state → download + verify release into a staging
  # dir → per-file integrity + ABI preflight → stop services → swap
  # the trees → carry the DB over → normalize ownership → reconcile
  # env file → refresh license pubkey → nginx migration → sync systemd
  # units → start + health-check (roll back on failure).
  #
  # Globals read: VLOUD_INSTALL_DIR, VLOUD_RELEASE_URL, VLOUD_PORT,
  #   VLOUD_TARGET_VERSION (optional), VLOUD_LICENSE_SERVER_URL_DEFAULT,
  #   VLOUD_INSECURE_SKIP_VERIFY / VLOUD_TESTING (test-only), LOG_FILE.
  # Globals written: VLOUD_VERSION_FROM, VLOUD_VERSION_TO.
  # Exits via die on every unrecoverable path; returns normally only
  # after the new version passed its health-check.
  #
  # NOTE: verify_release_chain and do_verify_sha256sums are called as
  # plain statements on purpose — wrapping them in `if`/`||` would
  # suspend errexit inside them.
  local ts staging rollback
  local TMPTAR TMPSHA TMPSIG TMP_PEM
  local SHA_HEX EXPECTED_ARTIFACT
  local cur_owner d unit suffix

  say "UPGRADE: atomic swap with rollback"
  emit_progress upgrade-start started "target=${VLOUD_TARGET_VERSION:-latest}"
  if [[ ! -d "$VLOUD_INSTALL_DIR/packages/server/dist" ]] && [[ ! -f "$VLOUD_INSTALL_DIR/vloud-engine" ]]; then
    die "no existing install at $VLOUD_INSTALL_DIR — use --fresh instead" 3
  fi

  # Capture current version for progress events. Read from the
  # installed tree's VERSION file (canonical) before we touch anything.
  if [[ -f "$VLOUD_INSTALL_DIR/VERSION" ]]; then
    VLOUD_VERSION_FROM=$(tr -d '[:space:]' <"$VLOUD_INSTALL_DIR/VERSION" 2>/dev/null) || true
  fi
  # ${…:-} keeps this safe when the target version was never set.
  if [[ -n "${VLOUD_TARGET_VERSION:-}" ]]; then
    VLOUD_VERSION_TO="$VLOUD_TARGET_VERSION"
  fi

  ts=$(date -u +%Y%m%dT%H%M%SZ)
  staging=/opt/vloud-staging-$ts
  rollback=/opt/vloud.pre-upgrade-$ts

  # 1. Backup state BEFORE touching anything.
  emit_progress backup started ""
  do_backup_state "pre-upgrade" >/dev/null
  emit_progress backup ok "state preserved"

  # 2. Download + verify the new release into staging (a fresh dir
  #    next to /opt/vloud, NOT inside it, so we can atomic-mv on success
  #    and rm-rf on failure).
  say "downloading new release to $staging"
  emit_progress downloading started "$VLOUD_RELEASE_URL"
  install -d -m 0755 "$staging"
  TMPTAR=$(mktemp /tmp/vloud-upgrade-XXXXXX.tar.gz)
  TMPSHA="$TMPTAR.sha256"
  TMPSIG="$TMPTAR.sig"
  if ! curl -fsSL --max-time 600 -o "$TMPTAR" "$VLOUD_RELEASE_URL"; then
    rm -f "$TMPTAR" "$TMPSHA" "$TMPSIG"; rm -rf "$staging"
    die "release download failed: $VLOUD_RELEASE_URL" 4
  fi
  if [[ "${VLOUD_INSECURE_SKIP_VERIFY:-0}" != "1" ]]; then
    if ! curl -fsSL --max-time 60 -o "$TMPSHA" "$VLOUD_RELEASE_URL.sha256"; then
      rm -f "$TMPTAR" "$TMPSHA" "$TMPSIG"; rm -rf "$staging"
      die "release sha256 download failed: $VLOUD_RELEASE_URL.sha256" 4
    fi
    if ! curl -fsSL --max-time 60 -o "$TMPSIG" "$VLOUD_RELEASE_URL.sig"; then
      rm -f "$TMPTAR" "$TMPSHA" "$TMPSIG"; rm -rf "$staging"
      die "release sig download failed: $VLOUD_RELEASE_URL.sig" 4
    fi
    # Rewrite sha256 line to match the on-disk mktemp filename.
    SHA_HEX=$(awk '{print $1}' "$TMPSHA")
    printf '%s  %s\n' "$SHA_HEX" "$(basename "$TMPTAR")" > "$TMPSHA"
    # Artifact name = last URL path component with any ?query stripped.
    EXPECTED_ARTIFACT="$(basename "${VLOUD_RELEASE_URL%%\?*}")"
    emit_progress verifying started "verify_release_chain"
    verify_release_chain "$TMPTAR" "$TMPSHA" "$TMPSIG" "$EXPECTED_ARTIFACT"
    emit_progress verifying ok "manifest signature + sha256 verified"
  else
    if [[ "${VLOUD_TESTING:-0}" != "1" ]]; then
      rm -f "$TMPTAR"; rm -rf "$staging"
      die "VLOUD_INSECURE_SKIP_VERIFY=1 requires VLOUD_TESTING=1 (refusing on production host)" 4
    fi
    warn "VERIFICATION SKIPPED — VLOUD_INSECURE_SKIP_VERIFY=1 (test mode)"
    emit_progress verifying warn "skipped (test mode)"
  fi
  # A truncated/corrupt archive must stop here, while the live install
  # is still untouched, and must not leave temp files behind.
  if ! tar -xzf "$TMPTAR" -C "$staging" --strip-components=1; then
    rm -f "$TMPTAR" "$TMPSHA" "$TMPSIG"; rm -rf "$staging"
    emit_progress downloading failed "extract failed"
    die "release extraction failed — existing install untouched" 4
  fi
  rm -f "$TMPTAR" "$TMPSHA" "$TMPSIG"
  ok "release extracted to $staging"
  emit_progress downloading ok "extracted to $staging"

  # 2a. Per-file integrity (PR1) — defence-in-depth on top of the
  #     signed manifest's tarball-SHA. Catches single-file corruption
  #     in the staging dir before we swap it into /opt/vloud.
  do_verify_sha256sums "$staging"

  # 2b. ABI preflight — refuse to swap if the release's prebuilt
  #     native modules can't load against the host's Node. The
  #     0.5.1-beta upgrade against 188.245.113.223 (2026-05-13)
  #     shipped better-sqlite3 built for NODE_MODULE_VERSION 127 to
  #     a host with NODE_MODULE_VERSION 115, the new engine couldn't
  #     boot, and the resulting systemd start-limit-hit cascade
  #     poisoned the rollback. Fail-closed BEFORE the swap.
  emit_progress abi-preflight started ""
  do_abi_preflight_or_rebuild "$staging" || {
    rm -rf "$staging"
    emit_progress abi-preflight failed "rebuild failed; swap refused"
    die "ABI preflight failed — refusing to swap; existing install untouched" 8
  }
  emit_progress abi-preflight ok ""

  # 3. Stop services BEFORE the swap so they don't half-read the new tree.
  emit_progress stop-services started ""
  do_stop_services
  emit_progress stop-services ok ""

  # 4. Swap: rename current → rollback, staging → current. Each rename
  #    is checked so a failed move can never leave the host without an
  #    install tree while services are stopped.
  say "swapping $VLOUD_INSTALL_DIR ↔ $rollback (atomic)"
  emit_progress swap started "$VLOUD_INSTALL_DIR ↔ $rollback"
  if ! mv "$VLOUD_INSTALL_DIR" "$rollback"; then
    rm -rf "$staging"
    emit_progress swap failed "could not move live tree aside"
    do_start_services_with_healthcheck \
      || warn "services did not come back healthy after aborted swap"
    die "swap failed — could not move $VLOUD_INSTALL_DIR aside; existing install untouched" 6
  fi
  if ! mv "$staging" "$VLOUD_INSTALL_DIR"; then
    emit_progress swap failed "could not move staging into place"
    mv "$rollback" "$VLOUD_INSTALL_DIR" \
      || die "swap failed AND could not restore $rollback — manual intervention required" 5
    do_start_services_with_healthcheck \
      || die "rollback also failed after swap abort — manual intervention required" 5
    die "upgrade aborted, previous version restored (swap failed)" 6
  fi
  emit_progress swap ok ""

  # 5. Re-link state: copy the live DB from rollback into the new tree.
  if [[ -f "$rollback/packages/server/vloud.db" ]]; then
    # The release may not ship the directory the DB lives in.
    mkdir -p "$VLOUD_INSTALL_DIR/packages/server"
    cp -a "$rollback/packages/server/vloud.db" "$VLOUD_INSTALL_DIR/packages/server/vloud.db"
    for suffix in -wal -shm; do
      if [[ -f "$rollback/packages/server/vloud.db$suffix" ]]; then
        cp -a "$rollback/packages/server/vloud.db$suffix" \
              "$VLOUD_INSTALL_DIR/packages/server/vloud.db$suffix"
      fi
    done
    ok "preserved DB from rollback tree"
  fi
  # Note: /etc/vloud.env was NEVER touched — it's outside $VLOUD_INSTALL_DIR.
  # Same for /var/lib/vloud (engine state, geoip, snapshots).

  # 5a. CRITICAL: normalize ownership of the swapped-in tree.
  # Engine runs as root (0.7.8+). Ensure the install tree is root-owned
  # so the engine can write DB files, configs, etc. directly. `cp -a`
  # above carries over whatever owner the DB had in the old tree, so
  # packages/ (and node_modules/ when present) are re-stamped as a whole.
  #
  # History: SQLite in WAL mode needs to CREATE vloud.db-wal /
  # vloud.db-shm in the DB's parent dir at boot; when that dir was not
  # writable by the engine user it failed with
  # SQLITE_READONLY_DIRECTORY and background workers (monitor
  # collector, guardian, migration worker, ssl worker) crashed on every
  # write. This bit 0.5.1 → 0.5.3 upgrades, where the upgrade flow
  # lacked the normalization that fresh-install performs. Don't remove
  # this step.
  # NOTE(review): do_verify_dependencies still expects the DB to be
  # vloud:vloud and do_rollback_flow chowns to vloud:vloud — confirm
  # which owner is intended across all three.
  chown -R root:root "$VLOUD_INSTALL_DIR/packages" 2>/dev/null || warn "chown of packages/ partial"
  if [[ -d "$VLOUD_INSTALL_DIR/node_modules" ]]; then
    chown -R root:root "$VLOUD_INSTALL_DIR/node_modules" 2>/dev/null || true
  fi
  chmod 0755 "$VLOUD_INSTALL_DIR/packages/server" 2>/dev/null || true
  ok "normalized ownership: $VLOUD_INSTALL_DIR/packages → root:root"

  if id -u vloud >/dev/null 2>&1; then

    # 2026-05-14: heal /var/lib/vloud parent ownership. Older
    # bootstraps created the dir as root:root, which silently broke
    # engine state-writes — caught by the do_verify_dependencies gate
    # during fresh-host validation. Always reconcile on upgrade so
    # hosts on the buggy old bootstrap heal automatically on the next
    # upgrade tick. Subdirs (backups, crontabs, acme) keep their own
    # ownership.
    if [[ -d /var/lib/vloud ]]; then
      cur_owner=$(stat -c '%U:%G' /var/lib/vloud 2>/dev/null) || true
      if [[ "$cur_owner" != "vloud:vloud" ]]; then
        say "healing /var/lib/vloud ownership: $cur_owner → vloud:vloud"
        # Best-effort: a failed heal must not abort the upgrade after
        # the swap, but it also must not be reported as healed.
        if chown vloud:vloud /var/lib/vloud 2>/dev/null \
           && chmod 0750 /var/lib/vloud 2>/dev/null; then
          # Re-stamp the engine-owned subtrees (staging/) just in case
          # they were also clobbered. backups/, crontabs/, acme/ keep
          # their canonical owners (root:root + root:www-data).
          for d in staging; do
            if [[ -d /var/lib/vloud/$d ]]; then
              chown -R vloud:vloud "/var/lib/vloud/$d" 2>/dev/null || true
            fi
          done
          ok "/var/lib/vloud ownership healed"
        else
          warn "could not heal /var/lib/vloud ownership (still $cur_owner)"
        fi
      fi
    fi
  else
    warn "vloud user not present — skipping ownership normalization (engine will fail to write)"
  fi

  # 5a-new. 0.6.0b — env-file reconciliation. The upgrade flow used
  # to skip this entirely (only the fresh-install linear phases
  # touched /etc/vloud.env), so 0.6.0 → 0.6.0a upgrades that
  # depended on a new env var landed broken. Fail-closed: if the
  # env file isn't sane post-reconcile, abort the upgrade (the
  # rollback tree is still on disk, so this is a recoverable point).
  if ! do_reconcile_env_file 1; then
    fail "env-file reconcile failed — refusing to start services"
    do_stop_services
    rm -rf "$VLOUD_INSTALL_DIR"
    mv "$rollback" "$VLOUD_INSTALL_DIR"
    do_start_services_with_healthcheck || die "rollback also failed after env-reconcile abort" 7
    die "upgrade aborted, previous version restored (env-reconcile failed)" 7
  fi

  # 5a-new. 0.6.0a — refresh the cloud license-signing pubkey.
  # Only installed when the download succeeded AND looks like a PEM
  # public key; otherwise the existing file is left alone.
  if [[ -n "${VLOUD_LICENSE_SERVER_URL_DEFAULT:-}" ]]; then
    say "refreshing /etc/vloud/license-v1.pub.pem from $VLOUD_LICENSE_SERVER_URL_DEFAULT"
    install -d -m 0755 /etc/vloud
    TMP_PEM=$(mktemp /tmp/vloud-license-pubkey.XXXXXX.pem)
    if curl -fsSL --max-time 15 -o "$TMP_PEM" "$VLOUD_LICENSE_SERVER_URL_DEFAULT/v1/license-pubkey" \
       && grep -q 'BEGIN PUBLIC KEY' "$TMP_PEM"; then
      install -m 0644 -o root -g root "$TMP_PEM" /etc/vloud/license-v1.pub.pem
      ok "license-v1.pub.pem refreshed"
    else
      warn "could not refresh license-v1.pub.pem; cloud-pull will retry"
    fi
    rm -f "$TMP_PEM"
  fi

  # 5b. VLoud nginx reverse proxy.
  #
  # Architecture (0.7.0+): VLoud binds to 127.0.0.1:2544 ONLY.
  # It does NOT claim port 80 or default_server. nginx acts as a
  # gateway — the operator configures a panel domain during onboarding,
  # and VLoud generates a domain-specific vhost (no wildcard catch-all).
  #
  # For initial onboarding (before a domain is configured), the operator
  # accesses VLoud directly at http://SERVER-IP:2544/onboarding.
  #
  # Migration: if a vloud-default vhost exists from a pre-0.7.0 install,
  # remove it so VLoud no longer claims default_server. The operator
  # can re-enable it via Settings > Panel Domain if needed.
  if [[ -f /etc/nginx/sites-enabled/vloud-default ]]; then
    say "migrating: removing vloud-default (VLoud no longer claims port 80)"
    rm -f /etc/nginx/sites-enabled/vloud-default
    if nginx -t 2>&1 | grep -qE 'test is successful|syntax is ok'; then
      systemctl reload nginx 2>/dev/null || true
      ok "vloud-default removed — access VLoud at http://<server-ip>:${VLOUD_PORT}"
    fi
  fi

  # 5c. Sync systemd units from the new release into /etc/systemd/system/.
  # Without this, the engine keeps running under the OLD unit config
  # (e.g. User=vloud) even after an upgrade ships User=root.
  if [[ -d "$VLOUD_INSTALL_DIR/packaging/systemd" ]]; then
    for unit in vloud.service vloud-job-worker.service vloud-scheduler.service; do
      if [[ -f "$VLOUD_INSTALL_DIR/packaging/systemd/$unit" ]]; then
        cp "$VLOUD_INSTALL_DIR/packaging/systemd/$unit" "/etc/systemd/system/$unit"
      fi
    done
    systemctl daemon-reload 2>/dev/null || true
    ok "systemd units synced from release"
  fi

  # 6. Start + health-check. On failure, roll back.
  emit_progress healthcheck started "starting services + curl /api/health"
  if do_start_services_with_healthcheck; then
    ok "upgrade succeeded — new version is live"
    emit_progress healthcheck ok "new version healthy"
    # Cleanup old rollback dirs older than 7 days.
    find /opt -maxdepth 1 -type d -name 'vloud.pre-upgrade-*' -mtime +7 -exec rm -rf {} + 2>/dev/null || true
    say "version after upgrade:"
    curl -s --max-time 4 "http://127.0.0.1:$VLOUD_PORT/api/system/version" 2>&1 | head -c 400; echo
    # 0.6.0b — surface cloud-sync prerequisites. Warn-only (non-fatal)
    # because a broken cloud-sync state is recoverable via --repair
    # and shouldn't trigger an automatic rollback.
    do_post_upgrade_health_summary || true
    # 2026-05-14: dependency gate at the END of the upgrade. Failure
    # here means the new version is up + serving but something in
    # the runtime environment regressed (redis disabled, worker
    # crash, port not listening). Surface loudly; the operator can
    # rollback via 'bootstrap.sh --rollback' if needed.
    do_verify_dependencies || warn "post-upgrade verify-dependencies reported failures"
  else
    fail "new version failed health-check — rolling back"
    emit_progress healthcheck failed "engine did not become healthy in 30s"
    do_dump_failure_journal
    emit_progress auto-rollback started "restoring $rollback"
    do_stop_services
    rm -rf "$VLOUD_INSTALL_DIR"
    mv "$rollback" "$VLOUD_INSTALL_DIR"
    if ! do_start_services_with_healthcheck; then
      do_dump_failure_journal
      emit_progress auto-rollback failed "rollback restart failed"
      die "rollback also failed — manual intervention required. See $LOG_FILE and journalctl -u vloud. To recover: systemctl reset-failed vloud.service && systemctl start vloud.service" 5
    fi
    emit_progress auto-rollback ok "previous version restored"
    die "upgrade aborted, previous version restored" 6
  fi

  ok "UPGRADE done"
}

# ─── --rollback: operator-triggered restore from .pre-upgrade-<ts>/ ───
#
# Two forms:
#   bootstrap.sh --rollback               → restore newest slot
#   bootstrap.sh --rollback --to <v|path> → restore matching slot
#
# Slot semantics: `/opt/vloud.pre-upgrade-<ts>/` directories are
# created by do_upgrade_flow's atomic swap. Each holds the FULL
# previous /opt/vloud tree. Operator picks one to restore; we
# reverse the swap.
#
# Safety:
#   - DB is preserved by copying the CURRENT live DB INTO the slot
#     before the swap (otherwise rolling back also rolls the DB
#     back, which is rarely what the operator wants).
#   - The pre-rollback tree is archived as
#     /opt/vloud.pre-rollback-<ts>/ so we can roll forward again
#     if the operator changes their mind.
#   - Healthcheck after restart; on failure, undo the rollback
#     (re-swap to the pre-rollback tree).
do_rollback_flow() {
  # Operator-triggered restore of a /opt/vloud.pre-upgrade-<ts>/ slot.
  #
  # Globals read: VLOUD_INSTALL_DIR, VLOUD_ROLLBACK_TO (optional:
  #   explicit path, directory name under /opt, or a VERSION label).
  # Globals written: VLOUD_VERSION_FROM, VLOUD_VERSION_TO.
  # Exits via die when no slot matches (3), when the rollback had to
  # be undone (6), or when the undo failed too (5); returns normally
  # only after the restored tree passed its health-check.
  local want="${VLOUD_ROLLBACK_TO:-}"   # ${…:-}: may be unset when --to was not given
  local slot="" d ext

  say "ROLLBACK: restore from .pre-upgrade-<ts>/ slot"
  emit_progress rollback-start started "to=${want:-newest}"

  if [[ ! -d "$VLOUD_INSTALL_DIR" ]]; then
    die "no live install at $VLOUD_INSTALL_DIR — nothing to roll back from" 3
  fi

  # 1. Resolve the target slot. Slot names embed a sortable UTC
  #    timestamp, so a reverse sort lists the newest first.
  if [[ -n "$want" ]]; then
    # --to may be an explicit path OR a version string.
    if [[ -d "$want" ]]; then
      slot="$want"
    elif [[ -d "/opt/$want" ]]; then
      slot="/opt/$want"
    else
      # Treat as version label; find a slot whose VERSION matches.
      while IFS= read -r d; do
        if [[ -f "$d/VERSION" ]] && [[ "$(tr -d '[:space:]' <"$d/VERSION")" == "$want" ]]; then
          slot="$d"; break
        fi
      done < <(find /opt -maxdepth 1 -type d -name 'vloud.pre-upgrade-*' 2>/dev/null | sort -r)
    fi
    if [[ -z "$slot" ]]; then
      die "no rollback slot matches --to '$want' (looked for explicit path and matching VERSION)" 3
    fi
  else
    # No --to: pick the newest .pre-upgrade-* slot.
    slot=$(find /opt -maxdepth 1 -type d -name 'vloud.pre-upgrade-*' 2>/dev/null | sort -r | head -n1)
    if [[ -z "$slot" ]]; then
      die "no .pre-upgrade-* slots available — nothing to roll back to" 3
    fi
  fi
  local slot_version="(unknown)"
  if [[ -f "$slot/VERSION" ]]; then
    slot_version=$(tr -d '[:space:]' <"$slot/VERSION")
  fi
  say "rollback target: $slot (version: $slot_version)"
  emit_progress rollback-resolve ok "slot=$slot version=$slot_version"

  # Capture from/to for progress payloads.
  if [[ -f "$VLOUD_INSTALL_DIR/VERSION" ]]; then
    VLOUD_VERSION_FROM=$(tr -d '[:space:]' <"$VLOUD_INSTALL_DIR/VERSION")
  fi
  VLOUD_VERSION_TO="$slot_version"

  # 2. Pre-rollback backup.
  emit_progress backup started "pre-rollback state snapshot"
  do_backup_state "pre-rollback" >/dev/null
  emit_progress backup ok ""

  # 3. Optional: verify slot integrity (best-effort — older slots
  #    may predate SHA256SUMS).
  do_verify_sha256sums "$slot" || true

  # 4. Stop services.
  emit_progress stop-services started ""
  do_stop_services
  emit_progress stop-services ok ""

  # 5. Preserve the LIVE DB into the slot before swap. Without this,
  #    rolling back also rolls the DB back; that's a foot-gun.
  #    The -wal/-shm sidecars must always belong to the main file they
  #    sit next to: copy the live ones when present, otherwise remove
  #    the slot's stale ones. If the main copy fails, the slot's own
  #    DB + sidecars are left alone as a consistent (older) set.
  local live_db="$VLOUD_INSTALL_DIR/packages/server/vloud.db"
  local slot_db="$slot/packages/server/vloud.db"
  if [[ -f "$live_db" ]]; then
    if cp -a "$live_db" "$slot_db" 2>/dev/null; then
      for ext in -wal -shm; do
        if [[ -f "$live_db$ext" ]]; then
          cp -a "$live_db$ext" "$slot_db$ext" 2>/dev/null \
            || warn "could not copy vloud.db$ext into rollback slot"
        else
          rm -f "$slot_db$ext"
        fi
      done
      ok "preserved live DB into rollback slot"
    else
      warn "could not preserve live DB into slot (rollback will use slot's older DB)"
    fi
  fi

  # 6. Reverse swap. Archive the current tree as pre-rollback-<ts>/
  #    so we can roll forward again if needed. Each rename is checked
  #    so a failed move cannot leave the host without an install tree.
  local ts; ts=$(date -u +%Y%m%dT%H%M%SZ)
  local archive=/opt/vloud.pre-rollback-$ts
  say "swapping $VLOUD_INSTALL_DIR → $archive ; $slot → $VLOUD_INSTALL_DIR"
  emit_progress swap started "rollback swap"
  if ! mv "$VLOUD_INSTALL_DIR" "$archive"; then
    emit_progress swap failed "could not archive live tree"
    do_start_services_with_healthcheck \
      || warn "services did not come back healthy after aborted rollback"
    die "rollback aborted — could not move $VLOUD_INSTALL_DIR aside; live install untouched" 6
  fi
  if ! mv "$slot" "$VLOUD_INSTALL_DIR"; then
    emit_progress swap failed "could not move slot into place"
    mv "$archive" "$VLOUD_INSTALL_DIR" \
      || die "rollback swap failed AND could not restore $archive — manual intervention required" 5
    do_start_services_with_healthcheck \
      || die "rollback failed AND undo failed — manual intervention required" 5
    die "rollback aborted — engine restored to pre-rollback state" 6
  fi
  emit_progress swap ok ""

  # 7. Normalize ownership on the restored tree.
  # NOTE(review): this stamps vloud:vloud while do_upgrade_flow stamps
  # root:root (engine runs as root since 0.7.8) — confirm which owner
  # a restored tree should get.
  if id -u vloud >/dev/null 2>&1; then
    chown -R vloud:vloud "$VLOUD_INSTALL_DIR/packages" 2>/dev/null || true
    if [[ -d "$VLOUD_INSTALL_DIR/node_modules" ]]; then
      chown -R vloud:vloud "$VLOUD_INSTALL_DIR/node_modules" 2>/dev/null || true
    fi
    chmod 0775 "$VLOUD_INSTALL_DIR/packages/server" 2>/dev/null || true
  fi

  # 8. Start + healthcheck. On failure, undo (restore the archive).
  emit_progress healthcheck started ""
  if do_start_services_with_healthcheck; then
    ok "rollback succeeded — engine on version $slot_version"
    emit_progress healthcheck ok "engine healthy on $slot_version"
  else
    fail "post-rollback healthcheck failed — undoing rollback"
    emit_progress healthcheck failed "engine did not become healthy"
    do_dump_failure_journal
    do_stop_services
    rm -rf "$VLOUD_INSTALL_DIR"
    mv "$archive" "$VLOUD_INSTALL_DIR"
    if ! do_start_services_with_healthcheck; then
      emit_progress rollback-undo failed "could not restore pre-rollback tree"
      die "rollback failed AND undo failed — manual intervention required" 5
    fi
    emit_progress rollback-undo ok "pre-rollback tree restored"
    die "rollback aborted — engine restored to pre-rollback state" 6
  fi

  ok "ROLLBACK done"
}

# ── --force-clean-install (purge then fresh) ──────────────────
do_force_clean_purge() {
  # Destructive purge that precedes a --force-clean-install: stops and
  # disables the three vloud units, deletes their unit files and
  # drop-in dirs, removes the install dir, env file, state and log
  # dirs, sudoers/logrotate snippets and the engine-generated nginx
  # vhosts, then reloads nginx if its config still validates.
  #
  # Globals read: VLOUD_INSTALL_DIR, ENV_FILE (both must be non-empty).
  local f

  say "FORCE-CLEAN: purging existing install before fresh"
  do_stop_services
  systemctl disable vloud.service vloud-job-worker.service vloud-scheduler.service 2>/dev/null || true
  rm -f /etc/systemd/system/vloud.service \
        /etc/systemd/system/vloud-job-worker.service \
        /etc/systemd/system/vloud-scheduler.service \
        /etc/systemd/system/vloud.slice
  rm -rf /etc/systemd/system/vloud.service.d \
         /etc/systemd/system/vloud-job-worker.service.d \
         /etc/systemd/system/vloud-scheduler.service.d
  systemctl daemon-reload
  # ${…:?} aborts instead of handing rm -rf an empty path.
  rm -rf -- "${VLOUD_INSTALL_DIR:?}" "${ENV_FILE:?}" /var/lib/vloud /var/log/vloud
  rm -f  /etc/sudoers.d/vloud /etc/logrotate.d/vloud
  # Engine-generated nginx vhosts only (preserve commercial + customer vhosts).
  # Globbed directly instead of parsing `ls`, so odd file names cannot
  # be split or mangled.
  for f in /etc/nginx/sites-enabled/vloud-*.conf; do
    # Unmatched glob stays literal; -L keeps dangling symlinks in scope.
    if [[ ! -e "$f" && ! -L "$f" ]]; then continue; fi
    case "${f##*/}" in
      vloud-app-*|vloud-admin*|vloud-commercial-*|vloud-marketing*|vloud-portal*|vloud-license*)
        continue
        ;;
    esac
    rm -f -- "$f"
  done
  # 0.5.3 default vhost (extensionless symlink, not in the .conf glob above).
  rm -f /etc/nginx/sites-enabled/vloud-default /etc/nginx/sites-available/vloud-default
  # Reload is best-effort: as the last command of an `&&` list, a
  # failed reload would trip errexit right after everything was deleted.
  if nginx -t 2>/dev/null; then
    systemctl reload nginx 2>/dev/null || warn "nginx reload failed after purge"
  fi
  ok "purge complete — proceeding with fresh install"
}

# ─── Dispatch ───
# This is where we either run a mode-specific flow and exit, or fall
# through to the linear fresh-install phases below.
# Mode dispatch. doctor / repair / upgrade / rollback run their flow
# and exit with its status; force-clean purges and then falls through
# to the fresh-install phases; fresh falls through after a safety
# check; anything else is rejected.
#
# The flows are invoked as plain statements followed by `rc=$?` on
# purpose: calling them as `flow || rc=$?` would suspend errexit
# inside the whole flow.
case "$MODE" in
  doctor)
    do_doctor_flow
    exit 0
    ;;
  repair)
    emit_progress dispatch started "repair flow"
    do_repair_flow
    rc=$?
    if [[ $rc -eq 0 ]]; then
      emit_progress_terminal ok "repair complete"
    else
      emit_progress_terminal failed "repair exit=$rc"
    fi
    exit $rc
    ;;
  upgrade)
    emit_progress dispatch started "upgrade flow target=${VLOUD_TARGET_VERSION:-latest}"
    do_upgrade_flow
    rc=$?
    if [[ $rc -eq 0 ]]; then
      emit_progress_terminal ok "upgrade complete"
    else
      emit_progress_terminal failed "upgrade exit=$rc"
    fi
    exit $rc
    ;;
  rollback)
    emit_progress dispatch started "rollback flow target=${VLOUD_ROLLBACK_TO:-newest}"
    do_rollback_flow
    rc=$?
    if [[ $rc -eq 0 ]]; then
      emit_progress_terminal ok "rollback complete"
    else
      emit_progress_terminal failed "rollback exit=$rc"
    fi
    exit $rc
    ;;
  force-clean)
    do_force_clean_purge
    # Fall through to fresh install below.
    ;;
  fresh)
    # An existing install is recognised the same way do_upgrade_flow
    # recognises one: legacy dist/ tree OR the single-binary engine.
    # Checking only dist/ would let --fresh run over a binary install.
    if [[ -d "$VLOUD_INSTALL_DIR/packages/server/dist" \
          || -f "$VLOUD_INSTALL_DIR/vloud-engine" ]]; then
      die "$VLOUD_INSTALL_DIR exists — refusing destructive --fresh. Use --upgrade or --force-clean-install." 7
    fi
    ;;
  *)
    die "unknown install mode: $MODE" 2
    ;;
esac
# (fresh / force-clean continue with phases 1-6 below)

# CPU architecture gate — dpkg's name is preferred, `uname -m` is the
# fallback; only amd64 and arm64 are accepted.
ARCH=$(dpkg --print-architecture 2>/dev/null || uname -m)
if [[ "$ARCH" == "amd64" || "$ARCH" == "arm64" ]]; then
  ok "arch: $ARCH"
else
  die "unsupported architecture: $ARCH (need amd64 or arm64)"
fi

# Free space on / — engine, app data and DB together need about 3GB.
DISK_FREE_GB=$(df -BG --output=avail / | tail -1 | tr -d 'G ')
[[ "$DISK_FREE_GB" -lt 3 ]] && die "/, only ${DISK_FREE_GB}GB free; need at least 3GB"
ok "disk: ${DISK_FREE_GB}GB free"

# Memory — below 1GB is the hard floor (warn only), below 2GB is
# merely under the recommendation.
MEM_MB=$(awk '/MemTotal/ {print int($2/1024)}' /proc/meminfo)
if [[ "$MEM_MB" -lt 2048 ]]; then
  if [[ "$MEM_MB" -lt 1024 ]]; then
    warn "ram: ${MEM_MB}MB (1GB hard floor — engine may be unstable)"
  else
    warn "ram: ${MEM_MB}MB (2GB recommended)"
  fi
else
  ok "ram: ${MEM_MB}MB"
fi

# Outbound HTTPS probe against the Debian and Ubuntu mirrors. Failure
# is only a warning: the apt step reports the real error if there is one.
if ! curl -fsS --max-time 5 -o /dev/null -w '' https://deb.debian.org/ 2>/dev/null \
   && ! curl -fsS --max-time 5 -o /dev/null -w '' https://archive.ubuntu.com/ 2>/dev/null; then
  warn "outbound HTTPS appears unreachable (apt + release download may fail)"
else
  ok "outbound HTTPS reachable"
fi

# cPanel detection — COEXIST is written to /etc/vloud.env later so the
# engine knows it shares the host.
COEXIST=0
if [[ -d /usr/local/cpanel ]] || [[ -d /var/cpanel ]]; then
  COEXIST=1
fi
if [[ "$COEXIST" -eq 1 ]]; then
  ok "cPanel detected — Vloud will run in coexist mode (port ${VLOUD_PORT} only)"
else
  ok "cPanel not present"
fi

# Engine port — a listener is acceptable only when it is our own
# engine (idempotent re-run); anything else is refused.
if ss -ltn "sport = :$VLOUD_PORT" 2>/dev/null | tail -n +2 | grep -q .; then
  if pgrep -f "node.*vloud.*dist/index" >/dev/null 2>&1; then
    # A node process that looks like the Vloud engine holds the port.
    ok "port $VLOUD_PORT in use by an existing Vloud engine — will restart"
  elif systemctl is-active --quiet vloud.service 2>/dev/null; then
    # No matching process name, but the unit is active.
    ok "port $VLOUD_PORT held by vloud.service — will restart"
  else
    die "port $VLOUD_PORT already bound by another process (set VLOUD_PORT= to override)"
  fi
fi

# ─── 2. Apt install ───
#
# Vloud's runtime dependencies, grouped by concern. Every package
# below is REQUIRED for a working engine on a fresh Ubuntu 24.04
# host — the installer contract is "single command → fully working
# system", which means we don't get to ask the operator to apt-install
# anything afterward. Each group has a comment explaining why; if you
# remove something, document the new contract first.
#
# 2026-05-14: added redis-server (gap caught in fresh-host validation
# — engine workers depend on redis but it wasn't being installed).
# See docs/operations/host-nuke-and-revalidate.md.
say "installing system packages"
export DEBIAN_FRONTEND=noninteractive
apt-get update -qq

# Core — tools the engine and this bootstrap shell out to.
APT_CORE=(
  curl ca-certificates gnupg lsb-release jq tar gzip rsync
  openssh-client unzip whois dnsutils sqlite3 openssl
)

# Runtime — services the engine talks to as a CLIENT. The tenant-facing
# daemon stack (mail/DNS/FTP servers the engine *manages*) is a
# separate concern handled later by bootstrap-daemon-stack.sh.
APT_RUNTIME=(
  # Redis backs the BullMQ queues (workers/job-worker.ts) and the
  # scheduler pubsub (workers/scheduler.ts), both defaulting to
  # redis://127.0.0.1:6379. The vloud.service unit lists
  # `After=redis.service`. With redis missing the engine still boots,
  # but the workers crash-loop.
  redis-server
  # nginx is the reverse proxy in front of the engine (:80 → :2544);
  # the release UI's SPA assets are served by nginx, not the engine.
  nginx
)

# Build toolchain (~150 MB) — present on every host because
# better-sqlite3 and node-pty compile from source when no prebuilt
# binary matches the host's NMV, and `npm install` postinstall scripts
# need it too.
APT_BUILD=(build-essential python3 python3-dev)

# SSL automation — the hosting-accounts flow issues and renews
# Let's Encrypt certificates through certbot.
APT_SSL=(certbot)

# PHP runtime for hosting accounts (~80 MB). Installed up front even
# if no PHP app is ever deployed, to save an apt round-trip at first
# deploy; deferring it until the first PHP app is provisioned is a
# long-term TODO.
APT_HOSTING_PHP=(
  php8.3-fpm php8.3-cli php8.3-mbstring php8.3-xml php8.3-mysql
  php8.3-curl php8.3-gd php8.3-zip php8.3-intl php8.3-bcmath
)

# One transaction for every group, in the order declared above.
apt-get install -y -qq \
  "${APT_CORE[@]}" \
  "${APT_RUNTIME[@]}" \
  "${APT_BUILD[@]}" \
  "${APT_SSL[@]}" \
  "${APT_HOSTING_PHP[@]}"
ok "apt packages ready"

# Redis MUST be enabled + running before we start the engine — the
# vloud.service unit has Wants=/After=redis.service so systemd will
# wait for it, but if redis is masked/disabled the wait turns into a
# silent fail. Fail-fast here with an actionable diagnostic.
say "enabling + starting redis"
emit_progress install-redis started ""
# Ubuntu 24.04 registers the daemon as redis-server.service; older
# releases used redis.service. Take the first name systemd knows about.
REDIS_UNIT=
for redis_candidate in redis-server.service redis.service; do
  # Query once and match the captured listing literally. This avoids a
  # second systemctl call and treating the dots in the unit name as
  # regex wildcards.
  unit_listing=$(systemctl list-unit-files "$redis_candidate" 2>/dev/null) || continue
  if [[ "$unit_listing" == *"$redis_candidate"* ]]; then
    REDIS_UNIT="$redis_candidate"
    break
  fi
done
if [[ -z "$REDIS_UNIT" ]]; then
  emit_progress install-redis failed "redis unit not registered after apt install"
  die "redis-server package installed but neither redis-server.service nor redis.service registered with systemd. Inspect: dpkg -l redis-server; systemctl list-unit-files | grep -i redis" 8
fi

# The command's output is discarded, so its status must be checked
# explicitly: an unchecked failure would either abort the script with no
# message (errexit) or pass unnoticed. On failure we still fall through
# to the readiness probe, which produces the actionable diagnostic.
if ! systemctl enable --now "$REDIS_UNIT" >/dev/null 2>&1; then
  warn "systemctl enable --now $REDIS_UNIT returned non-zero — probing readiness anyway"
fi

# Poll `redis-cli ping` once per second, up to 15 attempts, until it
# answers PONG. The budget is generous to allow for slow VMs.
redis_ready=0
for (( attempt = 1; attempt <= 15; attempt++ )); do
  if redis-cli ping 2>/dev/null | grep -q '^PONG$'; then
    redis_ready=1
    break
  fi
  sleep 1
done
if [[ "$redis_ready" -ne 1 ]]; then
  emit_progress install-redis failed "redis-cli ping never returned PONG"
  fail "redis-cli ping did not return PONG within 15s — check 'journalctl -u $REDIS_UNIT'"
  die "redis readiness check failed" 8
fi
ok "redis ready ($REDIS_UNIT, port 6379)"
emit_progress install-redis ok "$REDIS_UNIT ready"

# Node 20 LTS (NodeSource). Vloud standardizes on Node 20 — the
# release tarball ships better-sqlite3 + bcrypt prebuilt-binaries
# against NODE_MODULE_VERSION 115 (Node 20.x). A non-20 Node here
# means native modules will fail to load. We REINSTALL when the
# installed Node isn't 20.x — not just when it's missing — to repair
# hosts that were previously installed against Node 18 or Node 22.
# Decide whether NodeSource's Node 20 must be (re)installed: either no
# node on PATH, or a node whose major version is not 20.
need_node=0
if command -v node >/dev/null; then
  if [[ $(node --version | sed -E 's/^v([0-9]+).*/\1/') -ne 20 ]]; then
    warn "found $(node --version) — Vloud requires Node 20.x, will reinstall"
    need_node=1
  fi
else
  need_node=1
fi

if (( need_node )); then
  nodesource_keyring=/etc/apt/keyrings/nodesource.gpg
  install -d -m 0755 /etc/apt/keyrings
  # Fetch the NodeSource signing key and store it de-armored.
  curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key \
    | gpg --dearmor -o "$nodesource_keyring" --yes 2>/dev/null
  printf 'deb [signed-by=%s] https://deb.nodesource.com/node_20.x nodistro main\n' \
    "$nodesource_keyring" > /etc/apt/sources.list.d/nodesource.list
  apt-get update -qq
  apt-get install -y -qq nodejs
fi

# Hard gate: whatever happened above, continue only on Node 20.x.
NODE_MAJOR=$(node --version | sed -E 's/^v([0-9]+).*/\1/')
[[ "$NODE_MAJOR" -eq 20 ]] \
  || die "Node 20.x is required but $(node --version) is installed. Install Node 20 manually and re-run." 7

LOCAL_NMV=$(node -e 'process.stdout.write(String(process.versions.modules))')
ok "node $(node --version) (NODE_MODULE_VERSION=$LOCAL_NMV), npm $(npm --version)"

# pm2 — optional fallback when the operator wants to bypass systemd later.
# We DO NOT use it for the unit; systemd is canonical.  Best-effort install.
# Best-effort: a failed pm2 install is only a warning. Written as a real
# if/else rather than `a && b || c`, which would also run the warn
# branch if `ok` itself returned non-zero.
if ! command -v pm2 >/dev/null; then
  if npm install -g --silent pm2 2>/dev/null; then
    ok "pm2 installed (optional)"
  else
    warn "pm2 install skipped"
  fi
fi

# ─── 3. Vloud release download + verification ───
#
# Install-time verification chain (matches vloud-commercial Slice 2):
#   1. Download tarball + .sha256 + .sig.
#   2. Verify sha256 matches.
#   3. Verify Ed25519 JWS over {artifact, version, channel, sha256}.
#   4. Cross-check payload sha256 + artifact name.
#   5. Only after all checks pass: tar -xzf.
#
# Refusal modes (refuse-never-warn):
#   - sha256 mismatch         → die with hash detail
#   - signature missing       → die with URL hint
#   - signature invalid       → die
#   - artifact-name mismatch  → die (prevents repurposed-sig attack)
#
# The release public key is supplied via $VLOUD_RELEASE_PUBKEY_PEM
# (env var) or, post-build-substitution, the embedded heredoc below.
# Production builds of bootstrap.sh substitute the placeholder
# `__VLOUD_RELEASE_PUBKEY_PEM__` with the real PEM at release time.
#
# ESCAPE HATCH for testing only: VLOUD_INSECURE_SKIP_VERIFY=1 skips
# step 3.  Refuses to run on Ubuntu/Debian production hosts unless
# the operator also sets VLOUD_TESTING=1 (so accidental skips on a
# real install loudly require both).

# (EMBEDDED_RELEASE_PUBKEY_PEM, b64url_decode, verify_release_chain
# were hoisted above the mode dispatcher in the 0.5.1 multi-mode
# installer rewrite. Both fresh + upgrade flows call them.)

say "fetching Vloud release"
if [[ -d "$VLOUD_INSTALL_DIR/packages/server/dist" ]]; then
  ok "$VLOUD_INSTALL_DIR exists — keeping existing release (re-run safe)"
else
  install -d -m 0755 "$VLOUD_INSTALL_DIR"
  TMPTAR=$(mktemp /tmp/vloud-release-XXXXXX.tar.gz)
  TMPSHA="$TMPTAR.sha256"
  TMPSIG="$TMPTAR.sig"
  # Remove the temp files on every exit path, including each die below.
  trap 'rm -f "$TMPTAR" "$TMPSHA" "$TMPSIG"' EXIT
  if ! curl -fsSL --max-time 600 -o "$TMPTAR" "$VLOUD_RELEASE_URL"; then
    die "release download failed: $VLOUD_RELEASE_URL"
  fi
  # Companion files.  Skip-verify mode (test only) tolerates 404s.
  if [[ "${VLOUD_INSECURE_SKIP_VERIFY:-0}" != "1" ]]; then
    if ! curl -fsSL --max-time 60 -o "$TMPSHA" "$VLOUD_RELEASE_URL.sha256"; then
      die "release sha256 download failed: $VLOUD_RELEASE_URL.sha256"
    fi
    if ! curl -fsSL --max-time 60 -o "$TMPSIG" "$VLOUD_RELEASE_URL.sig"; then
      die "release sig download failed: $VLOUD_RELEASE_URL.sig"
    fi
    # The sha256sum -c check expects the basename of the tarball
    # in the .sha256 file to match what's on disk; our tmpfile is
    # named differently, so rewrite the line to match.
    #
    # Only the first field of the FIRST line is the digest. Validate its
    # shape before rewriting, so a multi-line file or an error page
    # served with HTTP 200 is refused here with a clear message instead
    # of producing a malformed checksum file.
    SHA_HEX=$(awk 'NR == 1 { print $1; exit }' "$TMPSHA")
    if [[ ! "$SHA_HEX" =~ ^[0-9A-Fa-f]{64}$ ]]; then
      die "release sha256 file is malformed (expected a 64-char hex digest): $VLOUD_RELEASE_URL.sha256"
    fi
    printf '%s  %s\n' "$SHA_HEX" "$(basename "$TMPTAR")" > "$TMPSHA"
    # The signed JWS payload names the artifact as published on the
    # release host (e.g. vloud-latest.tar.gz). Pass that name through
    # so the cross-check compares published-name vs signed-name, not
    # tmpfile-name vs signed-name. Any ?query suffix is stripped first.
    EXPECTED_ARTIFACT="$(basename "${VLOUD_RELEASE_URL%%\?*}")"
    verify_release_chain "$TMPTAR" "$TMPSHA" "$TMPSIG" "$EXPECTED_ARTIFACT"
  else
    # Skipping verification needs BOTH switches, so it cannot happen by
    # accident on a real install.
    if [[ "${VLOUD_TESTING:-0}" != "1" ]]; then
      die "VLOUD_INSECURE_SKIP_VERIFY=1 requires VLOUD_TESTING=1 (refusing on production host)"
    fi
    warn "VERIFICATION SKIPPED — VLOUD_INSECURE_SKIP_VERIFY=1 set (test mode)"
  fi
  # Extract only after the checks above; the tarball's top-level
  # directory is dropped so its contents land directly in the install dir.
  tar -xzf "$TMPTAR" -C "$VLOUD_INSTALL_DIR" --strip-components=1
  rm -f "$TMPTAR" "$TMPSHA" "$TMPSIG"
  ok "release extracted to $VLOUD_INSTALL_DIR"
fi

# Sanity — engine entrypoint must exist.
# SEA binary format (0.7.0+): vloud-engine at install root
# Legacy format (0.6.x): packages/server/dist/index.js
# Classify the extracted release by which entrypoint it carries; the SEA
# binary wins when both are present.
sea_entrypoint="$VLOUD_INSTALL_DIR/vloud-engine"
legacy_entrypoint="$VLOUD_INSTALL_DIR/packages/server/dist/index.js"

if [[ -f "$sea_entrypoint" ]]; then
  VLOUD_ENGINE_FORMAT=sea
elif [[ -f "$legacy_entrypoint" ]]; then
  VLOUD_ENGINE_FORMAT=legacy
else
  die "release missing both vloud-engine and dist/index.js — corrupt download or wrong layout"
fi

case "$VLOUD_ENGINE_FORMAT" in
  sea)
    say "detected SEA binary release"
    chmod +x "$sea_entrypoint"
    ;;
  legacy)
    say "detected legacy dist release"
    ;;
esac

if [[ "$VLOUD_ENGINE_FORMAT" == "legacy" ]]; then
  # Rebuild native bindings against the installed Node ABI. The release
  # tarball is built on whatever Node version the packager used; if that
  # differs from the customer's Node major, better-sqlite3's .node file
  # fails with "Module did not self-register" at every db query.
  # `npm rebuild` re-runs the install scripts, which pull the matching
  # prebuilt binary (or compile from source if none exists).
  #
  # The rebuild runs exactly ONCE and npm's exit status decides the
  # outcome. Its output is captured rather than discarded, so a failure
  # shows the operator the tail of the log; a failed source compile is
  # also not repeated a second time.
  say "rebuilding native bindings against local Node $(node --version)"
  rebuild_rc=0
  rebuild_log=$(
    cd "$VLOUD_INSTALL_DIR/packages/server" \
      && npm rebuild better-sqlite3 --no-audit --no-fund 2>&1
  ) || rebuild_rc=$?

  if [[ "$rebuild_rc" -eq 0 ]]; then
    ok "native bindings ABI-matched to local Node"
  else
    warn "npm rebuild reported non-zero; install build-essential + python3 if a source compile is needed"
    # Last lines of npm's output, on stderr, for diagnosis.
    printf '%s\n' "$rebuild_log" | tail -n 20 >&2
  fi
fi  # end VLOUD_ENGINE_FORMAT == legacy

# ─── 4. Configure ───
say "configuring vloud system user + env file"

# System account with no login shell; created only when absent.
if id -u vloud >/dev/null 2>&1; then
  ok "vloud user exists"
else
  useradd --system --home-dir /var/lib/vloud --shell /usr/sbin/nologin --user-group vloud
  ok "created vloud system user"
fi

# State and log directories, owned by the service account.
for vloud_dir in /var/lib/vloud /var/log/vloud; do
  install -d -m 0750 -o vloud -g vloud "$vloud_dir"
done

# Trial bootstrap — once-per-install set of values.
# /etc/machine-id is the canonical Linux machine identity since systemd ~2014.
# Seed: the host's machine-id when the file exists and is non-empty,
# otherwise a one-off random digest.
if [[ -f /etc/machine-id && -s /etc/machine-id ]]; then
  RAW_MACHINE_ID=$(< /etc/machine-id)
else
  RAW_MACHINE_ID=$(head -c 32 /dev/urandom | sha256sum | cut -d' ' -f1)
fi
# Same prefix-and-truncate the engine uses, so env and DB-side machine_id
# agree from the very first call: sha256("vloud:<seed>"), first 32 hex chars.
machine_digest=$(printf 'vloud:%s' "$RAW_MACHINE_ID" | sha256sum | cut -d' ' -f1)
MACHINE_ID=${machine_digest:0:32}

# Stage the env file next to its destination; the EXIT trap removes the
# staging copy if we die before the install below.
ENV_TMP=$(mktemp /etc/vloud.env.XXXXXX)
trap 'rm -f "$ENV_TMP"' EXIT

# 0.6.0 — cloud license sync. Engine registers itself with the
# license-server + pulls its trial/license. Without this URL set
# correctly, the heartbeat client defaults to http://127.0.0.1:3000
# (a local stub) and the engine never reaches the cloud.
#
# 0.6.0a fix: the URL is the LICENSE-SERVER hostname
# (license.vloud.app), NOT the admin-dashboard hostname
# (adminpanel.vloud.app). Pre-fix installs pointed at the admin
# dashboard and registration silently swallowed every POST as a
# SPA-HTML 200 because adminpanel.vloud.app has no /v1/* route.
#
# 0.6.0c: VLOUD_LICENSE_SERVER_URL_DEFAULT is now hoisted to the
# top of the script alongside VLOUD_INSTALL_DIR etc., so that
# `do_reconcile_env_file()` (reachable from --upgrade and --repair
# via the early mode dispatcher) can reference it without
# tripping `set -u`. Reference here kept as documentation.
: "${VLOUD_LICENSE_SERVER_URL_DEFAULT:?must be set in top-of-script defaults}"

if [[ ! -f "$ENV_FILE" ]]; then
  # First-time install — fresh env file with trial values.
  # The install id is 16 random bytes as 32 hex chars. openssl is used
  # because it is in the APT_CORE package list (and already generates
  # the JWT secret below); xxd is not in any list this script installs.
  INSTALL_ID=$(openssl rand -hex 16)
  TRIAL_EXPIRES_AT=$(date -u -d "+${VLOUD_TRIAL_DAYS} days" +%Y-%m-%dT%H:%M:%SZ)
  cat > "$ENV_TMP" <<EOF
# Vloud engine environment.  Edit-don't-edit: changes to VLOUD_MACHINE_ID
# / VLOUD_INSTALL_ID / VLOUD_TRIAL_EXPIRES_AT without recomputing the HMAC
# in trial_state will mark the install as tampered.
PORT=$VLOUD_PORT
NODE_ENV=production
VLOUD_BIND_HOST=$VLOUD_BIND_HOST
VLOUD_JWT_SECRET=$(openssl rand -hex 64)
VLOUD_SUDO_FALLBACK=0
VLOUD_MACHINE_ID=$MACHINE_ID
VLOUD_INSTALL_ID=$INSTALL_ID
VLOUD_TRIAL_EXPIRES_AT=$TRIAL_EXPIRES_AT
# 0.6.0: cloud license-server URL. Registration + license-pull workers
# POST/GET against this. To disable cloud sync entirely (offline mode),
# set VLOUD_REGISTRATION_DISABLE=1 and VLOUD_LICENSE_PULL_DISABLE=1
# below — the engine will continue to honour a manually-installed
# /var/lib/vloud/license.dat.
VLOUD_LICENSE_SERVER_URL=$VLOUD_LICENSE_SERVER_URL_DEFAULT
EOF
  if [[ "$COEXIST" -eq 1 ]]; then
    echo "VLOUD_COEXIST=1" >> "$ENV_TMP"
  fi
  install -m 0640 -o root -g vloud "$ENV_TMP" "$ENV_FILE"
  rm -f "$ENV_TMP"
  # Report the URL actually written, not a hard-coded host name.
  ok "wrote fresh $ENV_FILE (trial $VLOUD_TRIAL_DAYS days, cloud sync → $VLOUD_LICENSE_SERVER_URL_DEFAULT)"
else
  # Re-run path — preserve existing values, only fill gaps.
  cp "$ENV_FILE" "$ENV_TMP"
  # upsert KEY VALUE — append KEY=VALUE to the staged file unless a line
  # starting with "KEY=" already exists. Existing values are never changed.
  upsert() {
    local k="$1" v="$2"
    if grep -q "^$k=" "$ENV_TMP"; then return; fi
    printf '%s=%s\n' "$k" "$v" >> "$ENV_TMP"
  }
  upsert PORT "$VLOUD_PORT"
  upsert NODE_ENV production
  upsert VLOUD_BIND_HOST "$VLOUD_BIND_HOST"
  upsert VLOUD_JWT_SECRET "$(openssl rand -hex 64)"
  # NOTE(review): a fresh install writes VLOUD_SUDO_FALLBACK=0 but this
  # gap-fill default is 1 — confirm the asymmetry is intentional.
  upsert VLOUD_SUDO_FALLBACK 1
  upsert VLOUD_MACHINE_ID "$MACHINE_ID"
  # Same generator as the fresh-install path (openssl, not xxd).
  upsert VLOUD_INSTALL_ID "$(openssl rand -hex 16)"
  upsert VLOUD_TRIAL_EXPIRES_AT "$(date -u -d "+${VLOUD_TRIAL_DAYS} days" +%Y-%m-%dT%H:%M:%SZ)"
  if [[ "$COEXIST" -eq 1 ]]; then upsert VLOUD_COEXIST 1; fi

  # 0.6.0: upsert VLOUD_LICENSE_SERVER_URL. Three rewrite cases:
  #
  #   1. Localhost (pre-0.6 default — engine never reached cloud).
  #   2. adminpanel.vloud.app (0.6.0 default — but adminpanel doesn't
  #      proxy /v1/* to the license-server; that was the 0.6.0a bug).
  #   3. Missing → just upsert.
  #
  # Rewriting case 2 in place is critical: every 0.6.0 install
  # bootstrap'd with adminpanel.vloud.app, and those engines never
  # successfully registered. The upgrade path corrects them.
  if grep -qE '^VLOUD_LICENSE_SERVER_URL=https?://(127\.0\.0\.1|localhost|adminpanel\.vloud\.app)' "$ENV_TMP"; then
    sed -i -E "s|^VLOUD_LICENSE_SERVER_URL=.*|VLOUD_LICENSE_SERVER_URL=$VLOUD_LICENSE_SERVER_URL_DEFAULT|" "$ENV_TMP"
    ok "rewrote VLOUD_LICENSE_SERVER_URL → $VLOUD_LICENSE_SERVER_URL_DEFAULT"
  else
    upsert VLOUD_LICENSE_SERVER_URL "$VLOUD_LICENSE_SERVER_URL_DEFAULT"
  fi

  install -m 0640 -o root -g vloud "$ENV_TMP" "$ENV_FILE"
  rm -f "$ENV_TMP"
  # Name the default target only: a custom URL already in the file is
  # preserved by the upsert above.
  ok "updated $ENV_FILE (preserving existing values; cloud sync default → $VLOUD_LICENSE_SERVER_URL_DEFAULT)"
fi

# Read the trial expiry back from the installed env file (not from the
# variable computed above), so a value preserved from an earlier run is
# the one that gets picked up. `cut -f2-` keeps everything after the
# first '=' on the matching line.
TRIAL_EXPIRES_AT=$(grep -oE '^VLOUD_TRIAL_EXPIRES_AT=.*' "$ENV_FILE" | cut -d= -f2-)

# Vloud-managed dirs.
#
# 2026-05-14: /var/lib/vloud (the PARENT) is owned by vloud:vloud
# because the engine writes state files directly there at runtime:
# .upgrade-progress (PR1 JSONL progress; bootstrap.sh writes as root,
# engine reads as vloud), license.dat, .tamper-events, .config-ok.
# bootstrap.sh's earlier (pre-2026-05-14) bug created this as
# root:root, which silently broke the engine's state writes; the
# do_verify_dependencies gate catches it now ("/var/lib/vloud not
# writable by user vloud"). Sub-directories below can be root-owned
# safely because the engine doesn't traverse into them — backups +
# crontabs are write-by-root-only (bootstrap.sh's do_backup_state +
# operator-provisioned cron).
# Parent state directory first, owned by the service account.
install -d -m 0750 -o vloud -g vloud /var/lib/vloud

# Root-only locations: cron + backup areas and the /etc/vloud tree.
root_owned_dirs=(
  /var/lib/vloud/crontabs
  /var/lib/vloud/backups
  /etc/vloud
  /etc/vloud/ssl
  /etc/vloud/certs
)
install -d -m 0750 -o root -g root "${root_owned_dirs[@]}"

# Staging tree, owned by the service account: the root, then one
# sub-directory per area.
staging_root=/var/lib/vloud/staging
install -d -m 0750 -o vloud -g vloud "$staging_root"
for staging_area in nginx php ssl fs mail installers; do
  install -d -m 0750 -o vloud -g vloud "$staging_root/$staging_area"
done

# ACME challenge webroot, group www-data and world-readable.
acme_root=/var/lib/vloud/acme
install -d -m 0755 -o root -g www-data \
  "$acme_root" \
  "$acme_root/.well-known" \
  "$acme_root/.well-known/acme-challenge"

ok "/var/lib/vloud/* + /etc/vloud/* directories ready"

# 0.6.0a — fetch the active license-signing public key and install
# it at /etc/vloud/license-v1.pub.pem. The engine's license-pull
# worker uses this PEM to verify JWS issued by the cloud. Without
# it, every pull is rejected as "signature/shape mismatch" and the
# sidebar stays on "Pulling license…" forever.
#
# Idempotent: re-runs on upgrade fetch a fresh copy (the signing
# key rotates rarely, but when it does this is how customers pick
# up the new public counterpart without a manual scp).
LICENSE_PUBKEY_PATH=/etc/vloud/license-v1.pub.pem
LICENSE_PUBKEY_URL="${VLOUD_LICENSE_SERVER_URL_DEFAULT}/v1/license-pubkey"
say "fetching license-signing public key from $LICENSE_PUBKEY_URL"

# Download to a scratch file and accept it only if it contains a
# 'BEGIN PUBLIC KEY' marker; anything else leaves the installed key
# untouched.
TMP_PEM=$(mktemp /tmp/vloud-license-pubkey.XXXXXX.pem)
pubkey_valid=0
if curl -fsSL --max-time 15 -o "$TMP_PEM" "$LICENSE_PUBKEY_URL"; then
  if grep -q 'BEGIN PUBLIC KEY' "$TMP_PEM"; then
    pubkey_valid=1
  fi
fi

if (( pubkey_valid )); then
  install -m 0644 -o root -g root "$TMP_PEM" "$LICENSE_PUBKEY_PATH"
  ok "$LICENSE_PUBKEY_PATH installed ($(wc -c < "$LICENSE_PUBKEY_PATH") bytes)"
else
  warn "could not fetch license public key — engine will retry on first pull but trial sync will fail until reachable"
fi
rm -f "$TMP_PEM"

# Sudoers fragment so the engine can run privileged verbs.  We DO this here
# so the bootstrap is fully end-to-end; the operator can drop the file
# afterwards if they want a more locked-down policy.
sudoers_template="$VLOUD_INSTALL_DIR/packages/vloud-agent/install/sudoers.d/vloud"
if [[ -f "$sudoers_template" ]]; then
  # Render the template into a scratch file, substituting the service
  # user, and install it only if visudo accepts the result.
  STAGE=$(mktemp /tmp/vloud-sudoers.XXXXXX)
  trap 'rm -f "$STAGE"' EXIT
  sed 's/__VLOUD_USER__/vloud/g' "$sudoers_template" > "$STAGE"
  if ! visudo -c -f "$STAGE" >/dev/null; then
    warn "sudoers template failed visudo -c — skipping"
  else
    install -m 0440 -o root -g root "$STAGE" /etc/sudoers.d/vloud
    ok "sudoers fragment installed"
  fi
  rm -f "$STAGE"
fi

# Ownership of the engine tree — vloud user must own it (npm install dir +
# dist).  We do NOT chown /opt/vloud as a whole because the user may have
# other things there; we chown the packages/ subtree only.
# Hand packages/ and node_modules/ to the vloud user. Only paths that
# exist are passed to chown, so a release without one of them is not an
# error, and the outcome is reported honestly instead of printing
# success after a swallowed failure.
engine_owned_paths=()
for engine_path in "$VLOUD_INSTALL_DIR/packages" "$VLOUD_INSTALL_DIR/node_modules"; do
  if [[ -e "$engine_path" ]]; then
    engine_owned_paths+=("$engine_path")
  fi
done

if (( ${#engine_owned_paths[@]} == 0 )); then
  warn "no packages/ or node_modules/ under $VLOUD_INSTALL_DIR — nothing to chown"
elif chown -R vloud:vloud "${engine_owned_paths[@]}"; then
  ok "engine tree owned by vloud:vloud"
else
  warn "could not chown engine tree to vloud:vloud — see chown errors above"
fi

# A3 (2026-05-10) — logrotate config for non-journald log paths.
# journald rotates the engine's stdout/stderr automatically via its own
# config; this file covers /var/log/vloud/*.log, the modsec audit log,
# and per-domain nginx access/error logs.
say "installing logrotate config"
logrotate_source="$VLOUD_INSTALL_DIR/packaging/logrotate/vloud"
if [[ ! -f "$logrotate_source" ]]; then
  warn "packaging/logrotate/vloud missing in install tree; skipping"
else
  install -m 0644 -o root -g root "$logrotate_source" /etc/logrotate.d/vloud
  ok "/etc/logrotate.d/vloud installed"
fi

# Make sure the log directory exists and belongs to the service user,
# whether or not the logrotate file was installed.
mkdir -p /var/log/vloud
chown vloud:vloud /var/log/vloud

# 0.7.0 architecture: VLoud does NOT claim port 80 or default_server.
#
# The engine binds to 127.0.0.1:$VLOUD_PORT only. nginx is a gateway
# that the operator configures with a panel domain during onboarding.
# Initial onboarding is accessed directly at http://SERVER-IP:$VLOUD_PORT.
#
# Migration from pre-0.7.0: if vloud-default exists, remove it.
legacy_default_vhost=/etc/nginx/sites-enabled/vloud-default
if [[ -f "$legacy_default_vhost" ]]; then
  say "migrating: removing vloud-default (VLoud no longer claims port 80/default_server)"
  rm -f "$legacy_default_vhost"
  # Reload nginx only when the remaining config still validates; a
  # failed reload is tolerated.
  if nginx -t 2>&1 | grep -qE 'test is successful|syntax is ok'; then
    systemctl reload nginx 2>/dev/null || true
  fi
  ok "vloud-default removed"
fi

say "VLoud panel accessible at http://<server-ip>:${VLOUD_PORT}"

# systemd units — D5. Three services (engine + worker + scheduler)
# under one slice. Canonical sources live in
# packaging/systemd/*.service in the engine repo; we template
# WorkingDirectory + Port here because $VLOUD_INSTALL_DIR /
# $VLOUD_PORT can differ per install. Matching changes to the
# canonical files MUST be reflected here (no shared templater
# yet — the indirection wasn't worth a sed pass).
say "installing systemd units"

# vloud.slice — the slice all Vloud services are placed in (Slice= below).
cat > "$SYSTEMD_SLICE" <<EOF
[Unit]
Description=Vloud control plane (engine + workers)
Documentation=https://docs.vloud.sh/
DefaultDependencies=true
Before=vloud.service vloud-job-worker.service vloud-scheduler.service

[Slice]
EOF

# Resolve the engine's working directory and command line once, by
# release format. An unset format is treated as SEA.
if [[ "${VLOUD_ENGINE_FORMAT:-sea}" == "sea" ]]; then
  engine_workdir="$VLOUD_INSTALL_DIR"
  engine_exec="$VLOUD_INSTALL_DIR/vloud-engine"
else
  engine_workdir="$VLOUD_INSTALL_DIR/packages/server"
  engine_exec="/usr/bin/node dist/index.js"
fi

# vloud.service — the engine.
#
# OnFailure= and StartLimitIntervalSec= are [Unit] settings
# (systemd.unit(5)). They used to be written under [Service], where the
# rollback hook cannot take effect, while [Unit] carried a conflicting
# 60s limit. The start limit now appears once, in [Unit], with the
# Phase E values.
#
# redis-server.service is listed alongside redis.service so the engine
# unit orders against the same Redis unit names as the worker units.
cat > "$SYSTEMD_UNIT" <<EOF
[Unit]
Description=Vloud control plane (Fastify on $VLOUD_PORT)
Documentation=https://docs.vloud.sh/
After=network-online.target redis-server.service redis.service
Wants=network-online.target redis-server.service redis.service
# Phase E (2026-05-14): when the engine repeatedly fails to start
# (StartLimitBurst hit), trigger an automatic rollback via the
# vloud-rollback.service unit. The unit shells to
# bootstrap.sh --rollback --automatic, which restores the newest
# /opt/vloud.pre-upgrade-<ts>/ slot. Operator can suppress by
# touching /var/lib/vloud/.rollback-disabled (debugging mode).
StartLimitIntervalSec=120
StartLimitBurst=5
OnFailure=vloud-rollback.service

[Service]
# Type=notify (A1, 2026-05-10) — engine boots through migrations +
# integrity chain + listen before ready. Mirrors
# packaging/systemd/vloud.service; keep these two in sync.
# NotifyAccess=all (not =main): the engine's sd-notify implementation
# shells out to the \`systemd-notify\` binary (Node has no native
# AF_UNIX_DGRAM); the spawned binary is a child pid so =main rejects
# every READY=1. =all accepts notifies from any process in the unit's
# cgroup. Switch back to =main when sd-notify is rewritten in pure JS.
Type=notify
NotifyAccess=all
TimeoutStartSec=120s
WatchdogSec=60s
User=vloud
Group=vloud
Slice=vloud.slice
WorkingDirectory=$engine_workdir
EnvironmentFile=$ENV_FILE
ExecStart=$engine_exec

Restart=on-failure
RestartSec=3s
TimeoutStopSec=30s
KillSignal=SIGTERM
SendSIGKILL=yes

StandardOutput=journal
StandardError=journal
SyslogIdentifier=vloud

NoNewPrivileges=false
ProtectSystem=false
ProtectHome=false
PrivateTmp=true
ProtectKernelTunables=true
ProtectKernelModules=true
ProtectControlGroups=true
RestrictRealtime=true
ReadWritePaths=/var/lib/vloud /var/log/vloud
AmbientCapabilities=CAP_NET_BIND_SERVICE
CapabilityBoundingSet=CAP_NET_BIND_SERVICE CAP_SETUID CAP_SETGID

[Install]
WantedBy=multi-user.target
EOF

# The job-worker and scheduler units share one template; only the fields
# passed as arguments differ between them.
#
# Arguments:
#   $1 - destination unit file path
#   $2 - Description= text
#   $3 - extra unit appended to After= (what this service starts behind)
#   $4 - JS entrypoint, relative to WorkingDirectory
#   $5 - TimeoutStopSec= value
#   $6 - SyslogIdentifier= value
# Globals: VLOUD_INSTALL_DIR, ENV_FILE (read)
_write_vloud_worker_unit() {
  local unit_path=$1 description=$2 after_unit=$3
  local entry_js=$4 stop_timeout=$5 syslog_id=$6
  cat > "$unit_path" <<EOF
[Unit]
Description=$description
Documentation=https://docs.vloud.sh/
After=network-online.target redis-server.service redis.service $after_unit
Wants=network-online.target redis-server.service redis.service
PartOf=vloud.service
StartLimitBurst=5
StartLimitIntervalSec=60s

[Service]
Type=simple
User=vloud
Group=vloud
Slice=vloud.slice
WorkingDirectory=$VLOUD_INSTALL_DIR/packages/server
EnvironmentFile=$ENV_FILE
ExecStart=/usr/bin/node $entry_js

Restart=on-failure
RestartSec=3s
TimeoutStartSec=30s
TimeoutStopSec=$stop_timeout
KillSignal=SIGTERM
SendSIGKILL=yes

StandardOutput=journal
StandardError=journal
SyslogIdentifier=$syslog_id

NoNewPrivileges=false
ProtectSystem=false
ProtectHome=false
PrivateTmp=true
ProtectKernelTunables=true
ProtectKernelModules=true
ProtectControlGroups=true
RestrictRealtime=true
ReadWritePaths=/var/lib/vloud /var/log/vloud

[Install]
WantedBy=multi-user.target
EOF
}

# Job worker: ordered after the engine, with a 120s stop timeout.
_write_vloud_worker_unit "$SYSTEMD_WORKER" \
  "Vloud job worker (BullMQ consumer for deploys + SSL)" \
  vloud.service \
  dist/workers/job-worker.js \
  120s \
  vloud-job-worker

# Scheduler: ordered after the job worker, with a 30s stop timeout.
_write_vloud_worker_unit "$SYSTEMD_SCHEDULER" \
  "Vloud scheduler (periodic job enqueuer + sweepers)" \
  vloud-job-worker.service \
  dist/workers/scheduler.js \
  30s \
  vloud-scheduler

# Phase E (2026-05-14): vloud-rollback.service — triggered by
# vloud.service's OnFailure hook when the engine crash-loops past
# StartLimitBurst. Shells to bootstrap.sh --rollback --automatic
# which restores /opt/vloud.pre-upgrade-<ts>/ (the newest slot).
#
# The unit guards on the absence of /var/lib/vloud/.rollback-disabled
# so a debugging operator can suppress auto-rollback by touching that
# file. Re-enable by deleting it.
#
# Type=oneshot — fires, does its work, exits. Not persistent.
# Write the oneshot rollback unit. The heredoc is unquoted, so
# $VLOUD_INSTALL_DIR is expanded now and the unit file carries the
# absolute path of the installed bootstrap.sh. Everything else in the
# heredoc, including its '#' comment lines, is written into the unit
# file verbatim.
cat > "$SYSTEMD_ROLLBACK" <<EOF
[Unit]
Description=Vloud automatic rollback on repeated engine start-failure
Documentation=https://docs.vloud.sh/operations/install-upgrade-runbook.md
# Suppress when operator is debugging.
ConditionPathExists=!/var/lib/vloud/.rollback-disabled
# Avoid an infinite rollback loop: if rollback ITSELF starts firing
# repeatedly, stop trying after 3 attempts in 10 minutes.
StartLimitBurst=3
StartLimitIntervalSec=600

[Service]
Type=oneshot
User=root
# --automatic flag tells bootstrap.sh this is a programmatic invocation,
# not an operator click. (Currently identical to plain --rollback; the
# flag exists so operator-vs-systemd-driven rollbacks are
# distinguishable in the progress log.)
ExecStart=$VLOUD_INSTALL_DIR/scripts/bootstrap.sh --rollback --automatic

StandardOutput=journal
StandardError=journal
SyslogIdentifier=vloud-rollback
EOF

# Pick up the unit files written above, then enable the boot-time units.
systemctl daemon-reload
boot_enabled_units=(
  vloud.service
  vloud-job-worker.service
  vloud-scheduler.service
)
systemctl enable "${boot_enabled_units[@]}" >/dev/null 2>&1
# vloud-rollback.service is deliberately left out of the list: it is
# started only through vloud.service's OnFailure hook, never at boot.
ok "vloud.{service,job-worker,scheduler,rollback} installed + enabled"

# ─── 5. Start engine + workers ───
# Order: engine first, then worker, then scheduler. PartOf= on the
# workers means a `systemctl restart vloud` cycles all three;
# starting them individually here lets us surface failures
# distinctly (a worker boot loop won't get blamed on the engine).
say "starting engine"
systemctl restart vloud.service

# Poll the health endpoint once per second, for at most 30 attempts.
engine_health_url="http://127.0.0.1:$VLOUD_PORT/api/health"
HEALTHY=0
attempts_left=30
while (( attempts_left > 0 )); do
  if curl -fsS --max-time 1 -o /dev/null -w '' "$engine_health_url" 2>/dev/null; then
    HEALTHY=1
    break
  fi
  sleep 1
  attempts_left=$(( attempts_left - 1 ))
done

if [[ "$HEALTHY" -ne 1 ]]; then
  die "engine did not become healthy on :$VLOUD_PORT within 30s — see journalctl -u vloud" 4
fi

ok "engine healthy on :$VLOUD_PORT"

# Start the worker + scheduler now that the engine is up. This is
# fail-fast, not best-effort: BullMQ has no Redis fallback, and the
# worker units only declare Wants= (not Requires=) on redis, so
# systemd alone would not stop them from starting without it. The
# bootstrap therefore checks that Redis is active itself and aborts if
# it is not. Operators on Redis-less installs can disable these units.
say "starting workers"
# Three outcomes:
#   - VLOUD_BOOTSTRAP_NO_REDIS=1 (undocumented offline escape hatch):
#     workers are left stopped, with a warning.
#   - Redis active under either unit name: restart both workers; a
#     failed restart dumps the journal and aborts.
#   - Redis not active: abort — step 2 should have guaranteed it.
if [[ "${VLOUD_BOOTSTRAP_NO_REDIS:-0}" == "1" ]]; then
  warn "VLOUD_BOOTSTRAP_NO_REDIS=1 — workers NOT started (BullMQ paths disabled)"
else
  redis_active=0
  for redis_unit_name in redis-server.service redis.service; do
    if systemctl is-active --quiet "$redis_unit_name" 2>/dev/null; then
      redis_active=1
      break
    fi
  done

  if (( ! redis_active )); then
    die "redis is not active — bootstrap should have ensured this in step 2. Inspect 'systemctl status redis-server' + 'journalctl -u redis-server'" 9
  fi

  if ! systemctl restart vloud-job-worker.service vloud-scheduler.service 2>/dev/null; then
    do_dump_failure_journal
    die "worker or scheduler failed to start — check 'journalctl -u vloud-job-worker' / 'journalctl -u vloud-scheduler'" 9
  fi
  ok "vloud-job-worker + vloud-scheduler running"
fi

# ─── 6. Phase 1 daemon stack ───
# Postfix / Dovecot / Rspamd / fail2ban / BIND / pure-ftpd / nftables /
# tenant.slice templates / storage-quota tools. Each child script is
# idempotent and short-circuits if the daemon is already configured.
# cPanel-detected hosts skip the stack by default (cPanel owns those
# services). Master skip: VLOUD_BOOTSTRAP_NO_DAEMON_STACK=1.
say "configuring Phase 1 daemon stack"

DAEMON_STACK_SCRIPT="$VLOUD_INSTALL_DIR/scripts/bootstrap-daemon-stack.sh"
# Fallback when bootstrap.sh is run from the source tree (e.g. dev test
# of the bootstrap pipeline before a tarball ships).
#
# Under `curl … | sudo bash` there is no script file and BASH_SOURCE is
# empty, so a bare ${BASH_SOURCE[0]} aborts the install under `set -u`
# instead of reaching the "missing" warning below. The fallback is
# therefore attempted only when a source path actually exists.
if [[ ! -x "$DAEMON_STACK_SCRIPT" && -n "${BASH_SOURCE[0]:-}" ]]; then
  ALT="$(dirname "${BASH_SOURCE[0]}")/bootstrap-daemon-stack.sh"
  if [[ -x "$ALT" ]]; then
    DAEMON_STACK_SCRIPT="$ALT"
  fi
fi

if [[ ! -x "$DAEMON_STACK_SCRIPT" ]]; then
  warn "bootstrap-daemon-stack.sh missing — daemons can be installed manually later"
else
  # We pass through the cPanel detection result; the child script will
  # also detect on its own (defence in depth).
  if [[ "$COEXIST" -eq 1 ]] && [[ "${VLOUD_BOOTSTRAP_DAEMON_STACK:-0}" != "1" ]]; then
    warn "cPanel detected — daemon stack skipped (set VLOUD_BOOTSTRAP_DAEMON_STACK=1 to override)"
  else
    # Capture the child's status without letting a non-zero exit abort
    # this script; every outcome is reported and the install continues.
    DAEMON_RC=0
    bash "$DAEMON_STACK_SCRIPT" || DAEMON_RC=$?
    case "$DAEMON_RC" in
      0) ok "daemon stack: all installed" ;;
      2) warn "daemon stack: one or more daemons failed (continued; see log above)" ;;
      *) warn "daemon stack: aborted (rc=$DAEMON_RC)" ;;
    esac
  fi
fi

# 2026-05-14: Final dependency gate. Bootstrap is considered complete
# only when every required runtime dep is present + healthy + reachable.
# Fresh-host validation on 188.245.113.223 found that bootstrap was
# silently producing engines without redis; this gate prevents that
# from ever shipping again. Failure here is fatal — install summary
# below is meaningless if deps don't pass.
# Fatal gate: exit code 10 when the dependency verification fails.
do_verify_dependencies \
  || die "install verification failed — see failures above. Bootstrap did NOT complete cleanly." 10

# Trial status read-back.
# Ask the running engine for its trial status; an unreachable endpoint
# degrades to an empty JSON object.
trial_status_url="http://127.0.0.1:$VLOUD_PORT/api/install-trial/status"
TRIAL_JSON=$(curl -fsS "$trial_status_url" 2>/dev/null || echo '{}')
# Pull the two fields out with grep/cut.
TRIAL_STATE=$(grep -oE '"state":"[^"]+"' <<<"$TRIAL_JSON" | cut -d'"' -f4 || echo unknown)
TRIAL_DAYS_LEFT=$(grep -oE '"days_remaining":[0-9]+' <<<"$TRIAL_JSON" | cut -d: -f2 || echo "?")

# Address for the summary: first address from `hostname -I`; when that is
# empty or loopback, ask ifconfig.io instead.
PUBLIC_IP=$(hostname -I 2>/dev/null | awk '{print $1}')
case "$PUBLIC_IP" in
  '' | 127.*)
    PUBLIC_IP=$(curl -fsS --max-time 5 https://ifconfig.io 2>/dev/null || echo "<unknown>")
    ;;
esac

# ─── Summary ───
# Final banner + cheat-sheet, then a clean exit.
if [[ "$COEXIST" -eq 1 ]]; then
  coexist_label="yes (cPanel detected)"
else
  coexist_label="no"
fi

echo
printf "${GREEN}═══════════════════════════════════════════════════════════════${RESET}\n"
printf "${GREEN}  ✓ Vloud is up.${RESET}\n"
printf "${GREEN}═══════════════════════════════════════════════════════════════${RESET}\n"
cat <<EOF

  Server IP:    $PUBLIC_IP
  Dashboard:    http://$PUBLIC_IP:$VLOUD_PORT/onboarding
  Trial:        ${TRIAL_DAYS_LEFT}-day trial active (state: $TRIAL_STATE)
  Coexist:      $coexist_label

  Manage:       sudo systemctl {start,stop,restart,status} vloud
  Tail logs:    sudo journalctl -u vloud -f
  Env file:     $ENV_FILE  (mode 0640 root:vloud)
  Install dir:  $VLOUD_INSTALL_DIR

  Open the dashboard URL above to run the first-run wizard.

EOF

exit 0
