HOA_Financial_Platform/scripts/deploy-prod.sh

#!/usr/bin/env bash
# ---------------------------------------------------------------------------
# deploy-prod.sh — Production deployment script for HOA LedgerIQ
#
# Usage:
#   ./scripts/deploy-prod.sh [--seed-existing]
#
# This script performs a full production deployment:
#   1. Takes a pre-upgrade database backup
#   2. Pulls the latest code from the main branch
#   3. Rebuilds and restarts Docker containers
#   4. Runs any pending database migrations (tracked in shared.schema_migrations)
#   5. Verifies the application is healthy
#   6. Takes a post-upgrade database backup
#
# On failure after the pre-upgrade backup has been taken (for example a failed
# pull, rebuild, migration or health check), the script automatically:
#   - Restores the pre-upgrade database backup
#   - Reverts the code to the previous commit
#   - Rebuilds containers from the reverted code
#
# Flags:
#   --seed-existing   Mark all existing migration files as applied without
#                     executing them. Use this ONLY on the first deployment
#                     against an existing database where migrations were
#                     previously applied manually.
#
# Environment:
#   PROJECT_DIR        Override the project directory (default: /opt/hoa-ledgeriq)
#   POSTGRES_USER      Database user (default: hoafinance)
#   POSTGRES_DB        Database name (default: hoafinance)
# ---------------------------------------------------------------------------

# Strict mode: stop on the first failing command, on any reference to an
# unset variable, and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# ---- Defaults ----
# Directory containing this script; db-backup.sh is invoked from here.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

# Values that can be overridden from the environment.
: "${PROJECT_DIR:=/opt/hoa-ledgeriq}"
DB_USER="${POSTGRES_USER:-hoafinance}"
DB_NAME="${POSTGRES_DB:-hoafinance}"

# Commands and paths derived from the project directory.
COMPOSE_CMD="docker compose -f $PROJECT_DIR/docker-compose.yml -f $PROJECT_DIR/docker-compose.prod.yml"
MIGRATION_DIR="$PROJECT_DIR/db/migrations"
LOG_DIR="$PROJECT_DIR/logs"
LOG_FILE="$LOG_DIR/deploy-$(date +%Y%m%d_%H%M%S).log"

# Backend health probe: endpoint, number of attempts, seconds between
# attempts, and seconds to wait before the first attempt.
HEALTH_URL="http://localhost:3000/api"
HEALTH_RETRIES=36
HEALTH_INTERVAL=5
HEALTH_START_WAIT=10

# State tracking (updated as the deployment progresses, read by the EXIT trap)
SEED_EXISTING=false
PREV_COMMIT=""
BACKUP_FILE=""
ROLLBACK_NEEDED=false
DEPLOY_SUCCESS=false
DEPLOY_START_TIME=""

# ---- Colors ----
# ANSI color sequences, stored with a literal backslash and expanded only when
# printed (via printf's %b).
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; CYAN='\033[0;36m'; NC='\033[0m'

# ---- Logging ----
# Every helper prints one line: "<ISO-8601 timestamp> <colored tag><gap><message>".
#
# The message is printed with %s, so backslash sequences inside it (a path, a
# SQL error, container output) are shown verbatim instead of being interpreted
# as escapes, which is what `echo -e` would do.

# Shared formatter.
# Arguments: $1 - color sequence, $2 - tag text, $3 - spacing after the tag,
#            remaining - message words (joined with a space)
_log_emit() {
  local color="$1" tag="$2" gap="$3"
  shift 3
  printf '%s %b%s%b%s%s\n' "$(date -Iseconds)" "$color" "$tag" "$NC" "$gap" "$*"
}

# Informational message to stdout.
log()  { _log_emit "$CYAN"   "[DEPLOY]" " "     "$@"; }
# Success message to stdout.
ok()   { _log_emit "$GREEN"  "[OK]"     "     " "$@"; }
# Warning message to stdout.
warn() { _log_emit "$YELLOW" "[WARN]"   "   "   "$@"; }
# Error message to stderr.
err()  { _log_emit "$RED"    "[ERROR]"  "  "    "$@" >&2; }
# Print an error and terminate the script with status 1.
die()  { err "$@"; exit 1; }

# ---- Parse flags ----

# Print the usage text: the header comment block at the top of the script,
# from its first dashed rule line through its closing dashed rule line.
# Locating the block by its delimiters (rather than by fixed line numbers)
# keeps --help correct when the header grows or shrinks.
# Arguments: $1 - file to read the header from (default: this script)
print_usage() {
  sed -n '/^# ---*$/,/^# ---*$/p' "${1:-$0}"
}

# Parse command-line flags.
# Globals:   SEED_EXISTING (written)
# Arguments: the script's command-line arguments
# Exits:     0 after printing usage for --help/-h; via die on an unknown argument
parse_flags() {
  while [ $# -gt 0 ]; do
    case "$1" in
      --seed-existing)
        SEED_EXISTING=true
        ;;
      --help | -h)
        print_usage
        exit 0
        ;;
      *)
        die "Unknown argument: $1"
        ;;
    esac
    shift
  done
}

parse_flags "$@"

# ---- Setup logging ----
# From here on, everything the script writes to stdout or stderr is passed
# through tee, which appends it to this run's log file.
mkdir -p "$LOG_DIR"
exec &> >(tee -a "$LOG_FILE")

# ---- Cleanup / Rollback trap ----
# EXIT handler. Does nothing after a successful deployment or when the script
# stopped before a rollback became necessary. Otherwise it undoes the
# deployment in three steps: restore the pre-upgrade database backup, reset
# the checkout to the previous commit, rebuild the containers.
#
# Every step is checked explicitly. If one fails, the handler says which step
# failed and stops there, so that (for example) containers are never rebuilt
# on top of code that could not be reverted.
#
# Globals: DEPLOY_SUCCESS, ROLLBACK_NEEDED, BACKUP_FILE, PREV_COMMIT,
#          SCRIPT_DIR, PROJECT_DIR, COMPOSE_CMD, LOG_FILE (all read)
# Exits:   1 whenever a rollback was needed (whether or not it succeeded)
cleanup() {
  if [ "$DEPLOY_SUCCESS" = true ]; then
    return 0
  fi

  # Failure happened before anything was changed: nothing to undo.
  if [ "$ROLLBACK_NEEDED" != true ]; then
    return 0
  fi

  if [ -z "$BACKUP_FILE" ]; then
    err "Rollback needed but no backup file available — manual intervention required!"
    exit 1
  fi

  echo ""
  err "=========================================="
  err "  DEPLOYMENT FAILED — STARTING ROLLBACK"
  err "=========================================="
  echo ""

  # Step 1: Restore the pre-upgrade database backup
  log "Restoring database from pre-upgrade backup: $(basename "$BACKUP_FILE")"
  if ! "$SCRIPT_DIR/db-backup.sh" restore --yes "$BACKUP_FILE"; then
    err "DATABASE RESTORE FAILED — manual intervention required!"
    err "Backup file: $BACKUP_FILE"
    exit 1
  fi
  ok "Database restored successfully"

  if ! cd "$PROJECT_DIR"; then
    err "Cannot enter $PROJECT_DIR — code and containers were NOT rolled back; manual intervention required!"
    exit 1
  fi

  # Step 2: Revert code to previous commit
  if [ -n "$PREV_COMMIT" ]; then
    log "Reverting code to previous commit: $PREV_COMMIT"
    if ! git reset --hard "$PREV_COMMIT"; then
      err "CODE REVERT FAILED — database is restored but the code is NOT reverted; manual intervention required!"
      err "Target commit: $PREV_COMMIT"
      exit 1
    fi
    ok "Code reverted to $PREV_COMMIT"
  fi

  # Step 3: Rebuild containers from old code
  log "Rebuilding containers from reverted code ..."
  if ! $COMPOSE_CMD up -d --build; then
    err "CONTAINER REBUILD FAILED — database and code are reverted but the containers were not rebuilt; manual intervention required!"
    err "Review the deploy log for details: $LOG_FILE"
    exit 1
  fi
  ok "Containers rebuilt from previous version"

  echo ""
  err "Rollback complete. The system is restored to the pre-deployment state."
  err "Review the deploy log for details: $LOG_FILE"
  exit 1
}

trap cleanup EXIT

# ====================================================================
#  STEP 1: Pre-flight checks
# ====================================================================
# Print the banner, record the start time, then verify — before anything is
# modified — that the tools, the checkout and the database container are there.
log "============================================"
log "  HOA LedgerIQ — Production Deployment"
log "============================================"
log "Project directory: $PROJECT_DIR"
log "Timestamp: $(date -Iseconds)"
DEPLOY_START_TIME="$(date +%s)"
printf '\n'

cd "$PROJECT_DIR"

# Required tooling
for required_tool in git docker; do
  command -v "$required_tool" >/dev/null 2>&1 || die "$required_tool is not installed"
done
docker compose version >/dev/null 2>&1 || die "docker compose is not available"

# The project directory must be a git checkout
if [[ ! -d .git ]]; then
  die "$PROJECT_DIR is not a git repository"
fi

# The postgres service must already be up
$COMPOSE_CMD ps postgres 2>/dev/null | grep -q "running\|Up" \
  || die "PostgreSQL container is not running. Start it with: $COMPOSE_CMD up -d postgres"

# Remember where the checkout is now so a rollback can return to it
PREV_COMMIT="$(git rev-parse HEAD)"
log "Current commit: $PREV_COMMIT"

# ====================================================================
#  STEP 2: Pre-upgrade database backup
# ====================================================================

# Read db-backup.sh output on stdin and print the path announced by its
# "Backup complete: <path>" line. ANSI color codes are stripped first. If the
# marker appears more than once the last occurrence wins, so the result is
# always a single line. Prints nothing when the marker is absent.
extract_backup_path() {
  sed -n \
    -e $'s/\x1b\\[[0-9;]*m//g' \
    -e 's/.*Backup complete: \([^[:space:]]\{1,\}\).*/\1/p' \
    | tail -n 1
}

echo ""
log "--- Step 1/6: Pre-upgrade database backup ---"

# Capture the exit status instead of letting `set -e` abort inside the
# assignment: the captured output is the only place the backup's error
# messages exist, so it must be printed before the script stops.
BACKUP_RC=0
BACKUP_OUTPUT=$("$SCRIPT_DIR/db-backup.sh" backup 2>&1) || BACKUP_RC=$?
printf '%s\n' "$BACKUP_OUTPUT"

if [ "$BACKUP_RC" -ne 0 ]; then
  die "Pre-upgrade backup failed (db-backup.sh exited with status $BACKUP_RC)"
fi

# Extract the backup file path from the output
BACKUP_FILE=$(printf '%s\n' "$BACKUP_OUTPUT" | extract_backup_path)

if [ -z "$BACKUP_FILE" ]; then
  die "Failed to capture backup file path from db-backup.sh output"
fi

if [ ! -f "$BACKUP_FILE" ]; then
  die "Backup file does not exist: $BACKUP_FILE"
fi

ok "Pre-upgrade backup saved: $(basename "$BACKUP_FILE")"

# From this point forward, rollback is possible
ROLLBACK_NEEDED=true

# ====================================================================
#  STEP 3: Pull latest code
# ====================================================================
printf '\n'
log "--- Step 2/6: Pulling latest code from main ---"

# Make the checkout match origin/main exactly
git fetch origin main
git reset --hard origin/main

NEW_COMMIT="$(git rev-parse HEAD)"
log "Updated to commit: $NEW_COMMIT"

# An unchanged HEAD is not an error: the remaining steps still run
if [[ "$NEW_COMMIT" == "$PREV_COMMIT" ]]; then
  warn "No new commits — continuing anyway (migrations or rebuilds may still be needed)"
fi

# ====================================================================
#  STEP 4: Rebuild and restart containers
# ====================================================================
echo ""
log "--- Step 3/6: Rebuilding and restarting containers ---"

$COMPOSE_CMD up -d --build

# Wait for postgres to be healthy before running migrations.
# pg_isready is probed up to PG_RETRIES times, sleeping 2s between attempts.
log "Waiting for PostgreSQL to be healthy ..."
PG_RETRIES=30
PG_COUNT=0
while [ "$PG_COUNT" -lt "$PG_RETRIES" ]; do
  if $COMPOSE_CMD exec -T postgres pg_isready -U "$DB_USER" -d "$DB_NAME" >/dev/null 2>&1; then
    ok "PostgreSQL is ready"
    break
  fi
  # Plain assignment on purpose: `((PG_COUNT++))` has exit status 1 when the
  # old value is 0, which under `set -e` would abort the whole deployment on
  # the very first not-ready probe.
  PG_COUNT=$((PG_COUNT + 1))
  if [ "$PG_COUNT" -eq "$PG_RETRIES" ]; then
    die "PostgreSQL did not become healthy after $((PG_RETRIES * 2))s"
  fi
  sleep 2
done

# ====================================================================
#  STEP 5: Run database migrations
# ====================================================================
printf '\n'
log "--- Step 4/6: Running database migrations ---"

# Helper: invoke psql inside the postgres container.
# Connects as DB_USER to DB_NAME, stops at the first SQL error and runs
# quietly. Stdin is passed through (-T), so SQL can be piped or heredoc'd in.
# Arguments: any extra psql arguments (for example -t, -c "<sql>")
run_sql() {
  local -a psql_argv=(
    psql
    -U "$DB_USER"
    -d "$DB_NAME"
    -v ON_ERROR_STOP=1
    --quiet
  )
  $COMPOSE_CMD exec -T postgres "${psql_argv[@]}" "$@"
}

# Step 5a: Ensure the migration tracking table exists
#
# Both statements use IF NOT EXISTS, so this is a no-op when the schema and
# table are already present. The heredoc delimiter is quoted ('SQL'), so the
# SQL text is sent to psql exactly as written, with no shell expansion.
# Each row of shared.schema_migrations records one migration file: its
# filename (unique), when it was recorded, and a checksum of the file.
log "Ensuring shared.schema_migrations table exists ..."
run_sql <<'SQL'
CREATE SCHEMA IF NOT EXISTS shared;
CREATE TABLE IF NOT EXISTS shared.schema_migrations (
    id SERIAL PRIMARY KEY,
    filename TEXT NOT NULL UNIQUE,
    applied_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    checksum TEXT
);
SQL
ok "Migration tracking table ready"

# Helper: report whether a migration is already recorded as applied.
# Globals:   APPLIED_MIGRATIONS (read) - associative array keyed by filename
# Arguments: $1 - migration filename
# Returns:   0 if the filename has a non-empty entry, 1 otherwise
is_applied() {
  local migration_name="$1"
  # The ":-" default makes a missing key expand to an empty string instead of
  # tripping `set -u`.
  test -n "${APPLIED_MIGRATIONS[$migration_name]:-}"
}

# Step 5b: Get list of already-applied migrations
declare -A APPLIED_MIGRATIONS=()

# Fill APPLIED_MIGRATIONS with one entry per filename recorded in
# shared.schema_migrations.
#
# A failed query is fatal. Treating it as "nothing applied yet" would make
# every migration file look pending and get executed again.
# Globals: APPLIED_MIGRATIONS (written)
load_applied_migrations() {
  local applied_raw fname

  if ! applied_raw=$(run_sql -t -A -c "SELECT filename FROM shared.schema_migrations ORDER BY filename;"); then
    die "Could not read shared.schema_migrations — cannot determine which migrations are applied"
  fi

  while IFS= read -r fname; do
    # Trim leading/trailing whitespace without touching the characters in
    # between (unlike xargs, which also interprets quotes and backslashes).
    fname="${fname#"${fname%%[![:space:]]*}"}"
    fname="${fname%"${fname##*[![:space:]]}"}"
    if [ -n "$fname" ]; then
      APPLIED_MIGRATIONS["$fname"]=1
    fi
  done <<<"$applied_raw"
}

load_applied_migrations

APPLIED_COUNT=${#APPLIED_MIGRATIONS[@]}
log "Previously applied migrations: $APPLIED_COUNT"

# Step 5c: Scan migration directory for .sql files
# Only the file name is kept; files are ordered by sorting their paths.
MIGRATION_FILES=()
if [ -d "$MIGRATION_DIR" ]; then
  while IFS= read -r f; do
    MIGRATION_FILES+=("${f##*/}")
  done < <(find "$MIGRATION_DIR" -name "*.sql" -type f | sort)
fi

TOTAL_MIGRATIONS=${#MIGRATION_FILES[@]}
log "Total migration files found: $TOTAL_MIGRATIONS"

# Step 5d: Handle --seed-existing (first deployment only)
# Records every migration file as applied WITHOUT executing it. Skipped when
# the tracking table already has rows.
if [ "$SEED_EXISTING" = true ]; then
  if [ "$APPLIED_COUNT" -gt 0 ]; then
    warn "--seed-existing flag set but $APPLIED_COUNT migrations are already tracked. Skipping seed."
  else
    log "Seeding migration tracking table with ${TOTAL_MIGRATIONS} existing migration files ..."
    sq="'"
    for filename in "${MIGRATION_FILES[@]}"; do
      checksum=$(md5sum "$MIGRATION_DIR/$filename" | awk '{print $1}')
      # The filename is embedded in a SQL string literal, so any single quote
      # in it is doubled; otherwise such a name would end the literal early.
      sql_filename=${filename//$sq/$sq$sq}
      run_sql -c "INSERT INTO shared.schema_migrations (filename, checksum) VALUES ('$sql_filename', '$checksum') ON CONFLICT (filename) DO NOTHING;"
      log "  Seeded: $filename"
      # Keep the in-memory applied list in step with the table
      APPLIED_MIGRATIONS["$filename"]=1
    done
    ok "All existing migrations marked as applied (not executed)"
    APPLIED_COUNT=$TOTAL_MIGRATIONS
  fi
fi

# Step 5e: Detect first-run without --seed-existing
# Nothing is tracked but migration files exist: give the operator a chance to
# abort before every file is executed.
if [ "$APPLIED_COUNT" -eq 0 ] && [ "$TOTAL_MIGRATIONS" -gt 0 ] && [ "$SEED_EXISTING" = false ]; then
  warn "The migration tracking table is empty but $TOTAL_MIGRATIONS migration files exist."
  warn "If these migrations were previously applied manually, re-run with --seed-existing"
  warn "to register them without re-executing. Otherwise, all migrations will be applied."
  warn ""
  warn "Continuing in 10 seconds ... (Ctrl+C to abort)"
  sleep 10
fi

# Step 5f: Apply pending migrations
PENDING_COUNT=0
APPLIED_THIS_RUN=0

# Execute every migration file that is not yet recorded as applied, in the
# order of MIGRATION_FILES, and record each one after it succeeds.
#
# Counters are updated with plain assignments on purpose: `((n++))` has exit
# status 1 when n is 0, which under `set -e` would abort the deployment on
# the first pending migration.
#
# Globals: MIGRATION_FILES, MIGRATION_DIR, COMPOSE_CMD, DB_USER, DB_NAME (read)
#          PENDING_COUNT, APPLIED_THIS_RUN (written)
# Exits:   1 if a migration fails (the EXIT trap then performs the rollback)
apply_pending_migrations() {
  local filename checksum sql_filename
  local sq="'"
  local -a pending=()

  for filename in "${MIGRATION_FILES[@]}"; do
    is_applied "$filename" || pending+=("$filename")
  done
  PENDING_COUNT=${#pending[@]}

  if [ "$PENDING_COUNT" -eq 0 ]; then
    ok "No pending migrations to apply"
    return 0
  fi

  log "$PENDING_COUNT pending migration(s) to apply"
  echo ""

  for filename in "${pending[@]}"; do
    checksum=$(md5sum "$MIGRATION_DIR/$filename" | awk '{print $1}')
    log "  Applying: $filename ..."

    # Run the migration in a single transaction with error stopping
    if ! $COMPOSE_CMD exec -T postgres psql \
      -U "$DB_USER" \
      -d "$DB_NAME" \
      -v ON_ERROR_STOP=1 \
      --single-transaction \
      --quiet <"$MIGRATION_DIR/$filename" 2>&1; then
      err "Migration FAILED: $filename"
      err "Triggering automatic rollback ..."
      exit 1  # trap will handle rollback
    fi

    # Record successful migration. The filename goes into a SQL string
    # literal, so any single quote in it is doubled.
    sql_filename=${filename//$sq/$sq$sq}
    run_sql -c "INSERT INTO shared.schema_migrations (filename, checksum) VALUES ('$sql_filename', '$checksum');"
    ok "  Applied: $filename"
    APPLIED_THIS_RUN=$((APPLIED_THIS_RUN + 1))
  done

  echo ""
  ok "Successfully applied $APPLIED_THIS_RUN migration(s)"
}

apply_pending_migrations

# ====================================================================
#  STEP 6: Health check
# ====================================================================
printf '\n'
log "--- Step 5/6: Verifying application health ---"

# Why the HTTP endpoint is probed from the host instead of trusting Docker's
# health status: after a fresh image build, NestJS cold-start can take 2-3
# minutes (New Relic init → TypeORM connections → Redis → BullMQ → NestJS
# bootstrap). Docker's own healthcheck (start_period:30s + 3×15s retries =
# ~75s) gives up sooner than that and marks the container "unhealthy" while
# the app is still booting. The probe below allows up to ~3 minutes in total.
TOTAL_WAIT=$((HEALTH_START_WAIT + HEALTH_RETRIES * HEALTH_INTERVAL))
log "Will wait up to ${TOTAL_WAIT}s for backend to respond at $HEALTH_URL ..."
sleep "$HEALTH_START_WAIT"

HEALTHY=false
attempt=0
while [ "$attempt" -lt "$HEALTH_RETRIES" ]; do
  attempt=$((attempt + 1))

  # Direct HTTP check from the host using wget (available on Ubuntu)
  if wget -qO- --timeout=5 "$HEALTH_URL" >/dev/null 2>&1; then
    HEALTHY=true
    break
  fi

  # Docker's view of the container: health is only reported in the log line,
  # status decides whether it is still worth waiting.
  CONTAINER_HEALTH=$($COMPOSE_CMD ps backend --format '{{.Health}}' 2>/dev/null || echo "unknown")
  CONTAINER_STATUS=$($COMPOSE_CMD ps backend --format '{{.Status}}' 2>/dev/null || echo "unknown")

  # A container that exited or was removed will never answer: stop waiting
  if grep -qi "exit\|dead\|removed" <<<"$CONTAINER_STATUS"; then
    err "Backend container has stopped unexpectedly: $CONTAINER_STATUS"
    break
  fi

  log "  Health check attempt $attempt/$HEALTH_RETRIES — docker: ${CONTAINER_HEALTH}, retrying in ${HEALTH_INTERVAL}s ..."
  sleep "$HEALTH_INTERVAL"
done

if [ "$HEALTHY" != true ]; then
  # Print diagnostics, then let the EXIT trap roll the deployment back
  err "Backend failed to respond after ${TOTAL_WAIT}s"
  warn "Container status: $($COMPOSE_CMD ps backend 2>/dev/null || echo 'unknown')"
  warn "Recent backend logs:"
  $COMPOSE_CMD logs --tail=30 backend 2>/dev/null || true
  err "Triggering automatic rollback ..."
  exit 1  # trap will handle rollback
fi

ok "Backend is healthy and responding at $HEALTH_URL"

# ====================================================================
#  STEP 7: Post-upgrade database backup
# ====================================================================
echo ""
log "--- Step 6/6: Post-upgrade database backup ---"

# The new version is migrated and has passed its health check, so the
# deployment is recorded as successful BEFORE the backup runs. A failing
# backup must not make the EXIT trap restore the pre-upgrade database and
# tear down a working release; it is reported as a warning instead.
DEPLOY_SUCCESS=true
ROLLBACK_NEEDED=false

POST_BACKUP_OK=true
if ! "$SCRIPT_DIR/db-backup.sh" backup; then
  POST_BACKUP_OK=false
  warn "Post-upgrade backup FAILED — the deployment is healthy and has NOT been rolled back."
  warn "Take a backup manually as soon as possible: $SCRIPT_DIR/db-backup.sh backup"
fi

# ====================================================================
#  Deployment complete
# ====================================================================
DEPLOY_END_TIME=$(date +%s)
DEPLOY_DURATION=$((DEPLOY_END_TIME - DEPLOY_START_TIME))

echo ""
log "============================================"
ok "  DEPLOYMENT COMPLETE"
log "============================================"
log "  Previous commit : $PREV_COMMIT"
log "  Current commit  : $NEW_COMMIT"
log "  Migrations run  : $APPLIED_THIS_RUN"
log "  Duration        : ${DEPLOY_DURATION}s"
log "  Log file        : $LOG_FILE"
if [ "$POST_BACKUP_OK" != true ]; then
  warn "  Post-upgrade backup was NOT taken — see warnings above"
fi
log "============================================"
echo ""