Files
HOA_Financial_Platform/scripts/deploy-prod.sh
olsch01 a025c9e979 fix: health check now probes HTTP directly with 3-min timeout
The previous approach relied on Docker's container health status, but
Docker's healthcheck (start_period:30s + 3x15s retries = ~75s) marks
the container "unhealthy" before NestJS finishes cold-starting after a
fresh image build (New Relic + TypeORM + Redis + BullMQ init can take
2-3 minutes).

Changes:
- Primary check is now direct wget to localhost:3000/api from the host
- Docker health status used only for informational logging
- Total timeout increased from 130s to 190s (~3 min) for cold starts
- Early exit if container has stopped/exited (no point waiting)
- More backend log lines (30 vs 20) shown on failure for diagnostics

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-09 09:42:49 -04:00

435 lines
15 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# ---------------------------------------------------------------------------
# deploy-prod.sh — Production deployment script for HOA LedgerIQ
#
# Usage:
# ./scripts/deploy-prod.sh [--seed-existing]
#
# This script performs a full production deployment:
# 1. Takes a pre-upgrade database backup
# 2. Pulls the latest code from the main branch
# 3. Rebuilds and restarts Docker containers
# 4. Runs any pending database migrations (tracked in shared.schema_migrations)
# 5. Verifies the application is healthy
# 6. Takes a post-upgrade database backup
#
# On failure (migration error or health check), the script automatically:
# - Restores the pre-upgrade database backup
# - Reverts the code to the previous commit
# - Rebuilds containers from the reverted code
#
# Flags:
# --seed-existing Mark all existing migration files as applied without
# executing them. Use this ONLY on the first deployment
# against an existing database where migrations were
# previously applied manually.
#
# Environment:
# PROJECT_DIR Override the project directory (default: /opt/hoa-ledgeriq)
# POSTGRES_USER Database user (default: hoafinance)
# POSTGRES_DB Database name (default: hoafinance)
# ---------------------------------------------------------------------------
set -euo pipefail
# ---- Defaults ----
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="${PROJECT_DIR:-/opt/hoa-ledgeriq}"
COMPOSE_CMD="docker compose -f $PROJECT_DIR/docker-compose.yml -f $PROJECT_DIR/docker-compose.prod.yml"
DB_USER="${POSTGRES_USER:-hoafinance}"
DB_NAME="${POSTGRES_DB:-hoafinance}"
MIGRATION_DIR="$PROJECT_DIR/db/migrations"
HEALTH_URL="http://localhost:3000/api"
HEALTH_RETRIES=36
HEALTH_INTERVAL=5
HEALTH_START_WAIT=10
LOG_DIR="$PROJECT_DIR/logs"
LOG_FILE="$LOG_DIR/deploy-$(date +%Y%m%d_%H%M%S).log"
# State tracking
SEED_EXISTING=false
PREV_COMMIT=""
BACKUP_FILE=""
ROLLBACK_NEEDED=false
DEPLOY_SUCCESS=false
DEPLOY_START_TIME=""
# ---- Colors ----
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; CYAN='\033[0;36m'; NC='\033[0m'
# ---- Logging ----
log() { echo -e "$(date -Iseconds) ${CYAN}[DEPLOY]${NC} $*"; }
ok() { echo -e "$(date -Iseconds) ${GREEN}[OK]${NC} $*"; }
warn() { echo -e "$(date -Iseconds) ${YELLOW}[WARN]${NC} $*"; }
err() { echo -e "$(date -Iseconds) ${RED}[ERROR]${NC} $*" >&2; }
die() { err "$@"; exit 1; }
# ---- Parse flags ----
while [ $# -gt 0 ]; do
case "$1" in
--seed-existing) SEED_EXISTING=true; shift ;;
--help|-h)
head -35 "$0" | tail -33
exit 0
;;
*) die "Unknown argument: $1" ;;
esac
done
# ---- Setup logging ----
mkdir -p "$LOG_DIR"
exec > >(tee -a "$LOG_FILE") 2>&1
# ---- Cleanup / Rollback trap ----
cleanup() {
if [ "$DEPLOY_SUCCESS" = true ]; then
return 0
fi
if [ "$ROLLBACK_NEEDED" = true ] && [ -n "$BACKUP_FILE" ]; then
echo ""
err "=========================================="
err " DEPLOYMENT FAILED — STARTING ROLLBACK"
err "=========================================="
echo ""
# Step 1: Restore the pre-upgrade database backup
log "Restoring database from pre-upgrade backup: $(basename "$BACKUP_FILE")"
if "$SCRIPT_DIR/db-backup.sh" restore --yes "$BACKUP_FILE"; then
ok "Database restored successfully"
else
err "DATABASE RESTORE FAILED — manual intervention required!"
err "Backup file: $BACKUP_FILE"
exit 1
fi
# Step 2: Revert code to previous commit
if [ -n "$PREV_COMMIT" ]; then
log "Reverting code to previous commit: $PREV_COMMIT"
cd "$PROJECT_DIR"
git reset --hard "$PREV_COMMIT"
ok "Code reverted to $PREV_COMMIT"
fi
# Step 3: Rebuild containers from old code
log "Rebuilding containers from reverted code ..."
cd "$PROJECT_DIR"
$COMPOSE_CMD up -d --build
ok "Containers rebuilt from previous version"
echo ""
err "Rollback complete. The system is restored to the pre-deployment state."
err "Review the deploy log for details: $LOG_FILE"
exit 1
elif [ "$ROLLBACK_NEEDED" = true ]; then
err "Rollback needed but no backup file available — manual intervention required!"
exit 1
fi
}
trap cleanup EXIT
# ====================================================================
# STEP 1: Pre-flight checks
# ====================================================================
log "============================================"
log " HOA LedgerIQ — Production Deployment"
log "============================================"
log "Project directory: $PROJECT_DIR"
log "Timestamp: $(date -Iseconds)"
DEPLOY_START_TIME=$(date +%s)
echo ""
cd "$PROJECT_DIR"
# Verify prerequisites
command -v git >/dev/null 2>&1 || die "git is not installed"
command -v docker >/dev/null 2>&1 || die "docker is not installed"
docker compose version >/dev/null 2>&1 || die "docker compose is not available"
# Verify we're in a git repo
[ -d ".git" ] || die "$PROJECT_DIR is not a git repository"
# Verify postgres is running
if ! $COMPOSE_CMD ps postgres 2>/dev/null | grep -q "running\|Up"; then
die "PostgreSQL container is not running. Start it with: $COMPOSE_CMD up -d postgres"
fi
# Store current commit for rollback
PREV_COMMIT=$(git rev-parse HEAD)
log "Current commit: $PREV_COMMIT"
# ====================================================================
# STEP 2: Pre-upgrade database backup
# ====================================================================
echo ""
log "--- Step 1/6: Pre-upgrade database backup ---"
BACKUP_OUTPUT=$("$SCRIPT_DIR/db-backup.sh" backup 2>&1)
echo "$BACKUP_OUTPUT"
# Extract the backup file path from the output (strip ANSI color codes first)
BACKUP_FILE=$(echo "$BACKUP_OUTPUT" | sed 's/\x1b\[[0-9;]*m//g' | grep -oP 'Backup complete: \K\S+' || true)
if [ -z "$BACKUP_FILE" ]; then
die "Failed to capture backup file path from db-backup.sh output"
fi
if [ ! -f "$BACKUP_FILE" ]; then
die "Backup file does not exist: $BACKUP_FILE"
fi
ok "Pre-upgrade backup saved: $(basename "$BACKUP_FILE")"
# From this point forward, rollback is possible
ROLLBACK_NEEDED=true
# ====================================================================
# STEP 3: Pull latest code
# ====================================================================
echo ""
log "--- Step 2/6: Pulling latest code from main ---"
git fetch origin main
git reset --hard origin/main
NEW_COMMIT=$(git rev-parse HEAD)
log "Updated to commit: $NEW_COMMIT"
if [ "$PREV_COMMIT" = "$NEW_COMMIT" ]; then
warn "No new commits — continuing anyway (migrations or rebuilds may still be needed)"
fi
# ====================================================================
# STEP 4: Rebuild and restart containers
# ====================================================================
echo ""
log "--- Step 3/6: Rebuilding and restarting containers ---"
$COMPOSE_CMD up -d --build
# Wait for postgres to be healthy before running migrations
log "Waiting for PostgreSQL to be healthy ..."
PG_RETRIES=30
PG_COUNT=0
while [ $PG_COUNT -lt $PG_RETRIES ]; do
if $COMPOSE_CMD exec -T postgres pg_isready -U "$DB_USER" -d "$DB_NAME" >/dev/null 2>&1; then
ok "PostgreSQL is ready"
break
fi
((PG_COUNT++))
if [ $PG_COUNT -eq $PG_RETRIES ]; then
die "PostgreSQL did not become healthy after $((PG_RETRIES * 2))s"
fi
sleep 2
done
# ====================================================================
# STEP 5: Run database migrations
# ====================================================================
echo ""
log "--- Step 4/6: Running database migrations ---"
# Helper: run SQL via psql in the postgres container
run_sql() {
$COMPOSE_CMD exec -T postgres psql -U "$DB_USER" -d "$DB_NAME" -v ON_ERROR_STOP=1 --quiet "$@"
}
# Step 5a: Ensure the migration tracking table exists
log "Ensuring shared.schema_migrations table exists ..."
run_sql <<'SQL'
CREATE SCHEMA IF NOT EXISTS shared;
CREATE TABLE IF NOT EXISTS shared.schema_migrations (
id SERIAL PRIMARY KEY,
filename TEXT NOT NULL UNIQUE,
applied_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
checksum TEXT
);
SQL
ok "Migration tracking table ready"
# Helper: check if a migration has been applied (safe with set -u)
is_applied() {
local key="$1"
# Use a subshell test to avoid unbound variable with set -u on empty associative arrays
[[ -n "${APPLIED_MIGRATIONS[$key]:-}" ]]
}
# Step 5b: Get list of already-applied migrations
declare -A APPLIED_MIGRATIONS=()
while IFS= read -r fname; do
fname=$(echo "$fname" | xargs) # trim whitespace
[ -n "$fname" ] && APPLIED_MIGRATIONS["$fname"]=1
done < <(run_sql -t -c "SELECT filename FROM shared.schema_migrations ORDER BY filename;" 2>/dev/null || true)
APPLIED_COUNT=${#APPLIED_MIGRATIONS[@]}
log "Previously applied migrations: $APPLIED_COUNT"
# Step 5c: Scan migration directory for .sql files
MIGRATION_FILES=()
if [ -d "$MIGRATION_DIR" ]; then
while IFS= read -r f; do
MIGRATION_FILES+=("$(basename "$f")")
done < <(find "$MIGRATION_DIR" -name "*.sql" -type f | sort)
fi
TOTAL_MIGRATIONS=${#MIGRATION_FILES[@]}
log "Total migration files found: $TOTAL_MIGRATIONS"
# Step 5d: Handle --seed-existing (first deployment only)
if [ "$SEED_EXISTING" = true ]; then
if [ "$APPLIED_COUNT" -gt 0 ]; then
warn "--seed-existing flag set but $APPLIED_COUNT migrations are already tracked. Skipping seed."
else
log "Seeding migration tracking table with ${TOTAL_MIGRATIONS} existing migration files ..."
for filename in "${MIGRATION_FILES[@]}"; do
checksum=$(md5sum "$MIGRATION_DIR/$filename" | awk '{print $1}')
run_sql -c "INSERT INTO shared.schema_migrations (filename, checksum) VALUES ('$filename', '$checksum') ON CONFLICT (filename) DO NOTHING;"
log " Seeded: $filename"
done
ok "All existing migrations marked as applied (not executed)"
# Refresh the applied list
APPLIED_COUNT=$TOTAL_MIGRATIONS
for filename in "${MIGRATION_FILES[@]}"; do
APPLIED_MIGRATIONS["$filename"]=1
done
fi
fi
# Step 5e: Detect first-run without --seed-existing
if [ "$APPLIED_COUNT" -eq 0 ] && [ "$TOTAL_MIGRATIONS" -gt 0 ] && [ "$SEED_EXISTING" = false ]; then
warn "The migration tracking table is empty but $TOTAL_MIGRATIONS migration files exist."
warn "If these migrations were previously applied manually, re-run with --seed-existing"
warn "to register them without re-executing. Otherwise, all migrations will be applied."
warn ""
warn "Continuing in 10 seconds ... (Ctrl+C to abort)"
sleep 10
fi
# Step 5f: Apply pending migrations
PENDING_COUNT=0
APPLIED_THIS_RUN=0
for filename in "${MIGRATION_FILES[@]}"; do
if is_applied "$filename"; then
continue
fi
((PENDING_COUNT++))
done
if [ "$PENDING_COUNT" -eq 0 ]; then
ok "No pending migrations to apply"
else
log "$PENDING_COUNT pending migration(s) to apply"
echo ""
for filename in "${MIGRATION_FILES[@]}"; do
if is_applied "$filename"; then
continue
fi
checksum=$(md5sum "$MIGRATION_DIR/$filename" | awk '{print $1}')
log " Applying: $filename ..."
# Run the migration in a single transaction with error stopping
if cat "$MIGRATION_DIR/$filename" | $COMPOSE_CMD exec -T postgres psql \
-U "$DB_USER" \
-d "$DB_NAME" \
-v ON_ERROR_STOP=1 \
--single-transaction \
--quiet 2>&1; then
# Record successful migration
run_sql -c "INSERT INTO shared.schema_migrations (filename, checksum) VALUES ('$filename', '$checksum');"
ok " Applied: $filename"
((APPLIED_THIS_RUN++))
else
err "Migration FAILED: $filename"
err "Triggering automatic rollback ..."
exit 1 # trap will handle rollback
fi
done
echo ""
ok "Successfully applied $APPLIED_THIS_RUN migration(s)"
fi
# ====================================================================
# STEP 6: Health check
# ====================================================================
echo ""
log "--- Step 5/6: Verifying application health ---"
# After a fresh image build, NestJS cold-start can take 2-3 minutes:
# New Relic init → TypeORM connections → Redis → BullMQ → NestJS bootstrap
# Docker's own healthcheck (start_period:30s + 3×15s retries = ~75s) is too
# aggressive and will mark the container "unhealthy" before the app finishes
# booting. So we do NOT rely on Docker's health status — we probe the HTTP
# endpoint directly from the host and give it up to ~3 minutes total.
TOTAL_WAIT=$((HEALTH_START_WAIT + HEALTH_RETRIES * HEALTH_INTERVAL))
log "Will wait up to ${TOTAL_WAIT}s for backend to respond at $HEALTH_URL ..."
sleep "$HEALTH_START_WAIT"
HEALTHY=false
for ((i=1; i<=HEALTH_RETRIES; i++)); do
# Direct HTTP check from the host using wget (available on Ubuntu)
if wget -qO- --timeout=5 "$HEALTH_URL" >/dev/null 2>&1; then
HEALTHY=true
break
fi
# Also check Docker's container health for informational logging
CONTAINER_HEALTH=$($COMPOSE_CMD ps backend --format '{{.Health}}' 2>/dev/null || echo "unknown")
# If the container exited or was removed, fail immediately — no point waiting
CONTAINER_STATUS=$($COMPOSE_CMD ps backend --format '{{.Status}}' 2>/dev/null || echo "unknown")
if echo "$CONTAINER_STATUS" | grep -qi "exit\|dead\|removed"; then
err "Backend container has stopped unexpectedly: $CONTAINER_STATUS"
break
fi
log " Health check attempt $i/$HEALTH_RETRIES — docker: ${CONTAINER_HEALTH}, retrying in ${HEALTH_INTERVAL}s ..."
sleep "$HEALTH_INTERVAL"
done
if [ "$HEALTHY" = true ]; then
ok "Backend is healthy and responding at $HEALTH_URL"
else
# Log diagnostics before triggering rollback
err "Backend failed to respond after ${TOTAL_WAIT}s"
warn "Container status: $($COMPOSE_CMD ps backend 2>/dev/null || echo 'unknown')"
warn "Recent backend logs:"
$COMPOSE_CMD logs --tail=30 backend 2>/dev/null || true
err "Triggering automatic rollback ..."
exit 1 # trap will handle rollback
fi
# ====================================================================
# STEP 7: Post-upgrade database backup
# ====================================================================
echo ""
log "--- Step 6/6: Post-upgrade database backup ---"
"$SCRIPT_DIR/db-backup.sh" backup
# ====================================================================
# Deployment complete
# ====================================================================
DEPLOY_SUCCESS=true
ROLLBACK_NEEDED=false
DEPLOY_END_TIME=$(date +%s)
DEPLOY_DURATION=$((DEPLOY_END_TIME - DEPLOY_START_TIME))
echo ""
log "============================================"
ok " DEPLOYMENT COMPLETE"
log "============================================"
log " Previous commit : $PREV_COMMIT"
log " Current commit : $NEW_COMMIT"
log " Migrations run : $APPLIED_THIS_RUN"
log " Duration : ${DEPLOY_DURATION}s"
log " Log file : $LOG_FILE"
log "============================================"
echo ""