The previous approach relied on Docker's container health status, but Docker's healthcheck (start_period:30s + 3x15s retries = ~75s) marks the container "unhealthy" before NestJS finishes cold-starting after a fresh image build (New Relic + TypeORM + Redis + BullMQ init can take 2-3 minutes). Changes: - Primary check is now direct wget to localhost:3000/api from the host - Docker health status used only for informational logging - Total timeout increased from 130s to 190s (~3 min) for cold starts - Early exit if container has stopped/exited (no point waiting) - More backend log lines (30 vs 20) shown on failure for diagnostics Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
435 lines
15 KiB
Bash
Executable File
435 lines
15 KiB
Bash
Executable File
#!/usr/bin/env bash
# ---------------------------------------------------------------------------
# deploy-prod.sh — Production deployment script for HOA LedgerIQ
#
# Usage:
#   ./scripts/deploy-prod.sh [--seed-existing]
#
# This script performs a full production deployment:
#   1. Takes a pre-upgrade database backup
#   2. Pulls the latest code from the main branch
#   3. Rebuilds and restarts Docker containers
#   4. Runs any pending database migrations (tracked in shared.schema_migrations)
#   5. Verifies the application is healthy
#   6. Takes a post-upgrade database backup
#
# On failure (migration error or health check), the script automatically:
#   - Restores the pre-upgrade database backup
#   - Reverts the code to the previous commit
#   - Rebuilds containers from the reverted code
#
# Flags:
#   --seed-existing   Mark all existing migration files as applied without
#                     executing them. Use this ONLY on the first deployment
#                     against an existing database where migrations were
#                     previously applied manually.
#
# Environment:
#   PROJECT_DIR       Override the project directory (default: /opt/hoa-ledgeriq)
#   POSTGRES_USER     Database user (default: hoafinance)
#   POSTGRES_DB       Database name (default: hoafinance)
# ---------------------------------------------------------------------------

set -euo pipefail

# ---- Defaults ----
# Directory holding this script; the db-backup.sh helper is resolved relative
# to it.
SCRIPT_DIR="$(cd -- "$(dirname -- "$0")" && pwd)"
PROJECT_DIR="${PROJECT_DIR:-/opt/hoa-ledgeriq}"
# Kept as a plain string and expanded unquoted ($COMPOSE_CMD ...) so that it
# word-splits into a command line. PROJECT_DIR must not contain whitespace.
COMPOSE_CMD="docker compose -f $PROJECT_DIR/docker-compose.yml -f $PROJECT_DIR/docker-compose.prod.yml"
DB_USER="${POSTGRES_USER:-hoafinance}"
DB_NAME="${POSTGRES_DB:-hoafinance}"
MIGRATION_DIR="$PROJECT_DIR/db/migrations"
# Health probe: wait HEALTH_START_WAIT seconds, then try HEALTH_RETRIES times
# with HEALTH_INTERVAL seconds between attempts.
HEALTH_URL="http://localhost:3000/api"
HEALTH_RETRIES=36
HEALTH_INTERVAL=5
HEALTH_START_WAIT=10
LOG_DIR="$PROJECT_DIR/logs"
LOG_FILE="$LOG_DIR/deploy-$(date +%Y%m%d_%H%M%S).log"

# State tracking (read by the EXIT trap to decide whether to roll back)
SEED_EXISTING=false
PREV_COMMIT=""
BACKUP_FILE=""
ROLLBACK_NEEDED=false
DEPLOY_SUCCESS=false
DEPLOY_START_TIME=""

# ---- Colors ----
# Stored as backslash escapes; the logging helpers interpret them via echo -e.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
NC='\033[0m'

# ---- Logging ----
# Every helper prints "<ISO-8601 timestamp> [TAG] <message>". echo -e is used
# so the colour escapes held in RED/GREEN/YELLOW/CYAN/NC are interpreted.

# log MESSAGE... — progress message on stdout.
log() {
  echo -e "$(date -Iseconds) ${CYAN}[DEPLOY]${NC} $*"
}

# ok MESSAGE... — success message on stdout.
ok() {
  echo -e "$(date -Iseconds) ${GREEN}[OK]${NC} $*"
}

# warn MESSAGE... — warning on stdout.
warn() {
  echo -e "$(date -Iseconds) ${YELLOW}[WARN]${NC} $*"
}

# err MESSAGE... — error message on stderr.
err() {
  echo -e "$(date -Iseconds) ${RED}[ERROR]${NC} $*" >&2
}

# die MESSAGE... — report via err, then terminate the script with status 1.
die() {
  err "$@"
  exit 1
}

# ---- Parse flags ----
# Recognised: --seed-existing (sets SEED_EXISTING=true) and --help/-h.
# Anything else aborts via die.
while [ $# -gt 0 ]; do
  case "$1" in
    --seed-existing)
      SEED_EXISTING=true
      shift
      ;;
    --help | -h)
      # Print the comment header that follows the shebang, stopping at the
      # first non-comment line. A fixed head/tail line range drifts as soon
      # as the header grows or shrinks and ends up printing code.
      awk 'NR == 1 && /^#!/ { next } /^#/ { print; next } { exit }' "$0"
      exit 0
      ;;
    *)
      die "Unknown argument: $1"
      ;;
  esac
done

# ---- Setup logging ----
# From here on, everything written to stdout or stderr is shown on the
# terminal and appended to the per-run log file.
mkdir -p "$LOG_DIR"
exec > >(tee -a "$LOG_FILE") 2>&1

# ---- Cleanup / Rollback trap ----
#######################################
# EXIT handler: rolls the deployment back when the script ends without
# having set DEPLOY_SUCCESS=true.
#
# Rollback order: restore the database backup, reset the git checkout to
# PREV_COMMIT, rebuild the containers. Each step is checked explicitly so a
# failure is reported instead of silently aborting the handler under set -e.
#
# Globals (read): DEPLOY_SUCCESS, ROLLBACK_NEEDED, BACKUP_FILE, PREV_COMMIT,
#                 SCRIPT_DIR, PROJECT_DIR, COMPOSE_CMD, LOG_FILE
# Returns: 0 when the deployment succeeded or no rollback was armed.
#          Exits 1 after a rollback, or when a rollback is needed but cannot
#          be carried out.
#######################################
cleanup() {
  if [ "$DEPLOY_SUCCESS" = true ]; then
    return 0
  fi

  if [ "$ROLLBACK_NEEDED" = true ] && [ -n "$BACKUP_FILE" ]; then
    echo ""
    err "=========================================="
    err " DEPLOYMENT FAILED — STARTING ROLLBACK"
    err "=========================================="
    echo ""

    # Step 1: Restore the pre-upgrade database backup
    log "Restoring database from pre-upgrade backup: $(basename "$BACKUP_FILE")"
    if "$SCRIPT_DIR/db-backup.sh" restore --yes "$BACKUP_FILE"; then
      ok "Database restored successfully"
    else
      err "DATABASE RESTORE FAILED — manual intervention required!"
      err "Backup file: $BACKUP_FILE"
      exit 1
    fi

    # Step 2: Revert code to previous commit
    if [ -n "$PREV_COMMIT" ]; then
      log "Reverting code to previous commit: $PREV_COMMIT"
      if ! { cd "$PROJECT_DIR" && git reset --hard "$PREV_COMMIT"; }; then
        err "CODE REVERT FAILED — manual intervention required!"
        err "Target commit: $PREV_COMMIT"
        exit 1
      fi
      ok "Code reverted to $PREV_COMMIT"
    fi

    # Step 3: Rebuild containers from old code
    log "Rebuilding containers from reverted code ..."
    # COMPOSE_CMD is a command line stored in a string; it is split on purpose.
    # shellcheck disable=SC2086
    if ! { cd "$PROJECT_DIR" && $COMPOSE_CMD up -d --build; }; then
      err "CONTAINER REBUILD FAILED — manual intervention required!"
      exit 1
    fi
    ok "Containers rebuilt from previous version"

    echo ""
    err "Rollback complete. The system is restored to the pre-deployment state."
    err "Review the deploy log for details: $LOG_FILE"
    exit 1
  elif [ "$ROLLBACK_NEEDED" = true ]; then
    err "Rollback needed but no backup file available — manual intervention required!"
    exit 1
  fi
}

trap cleanup EXIT

# ====================================================================
# STEP 1: Pre-flight checks
# ====================================================================
log "============================================"
log " HOA LedgerIQ — Production Deployment"
log "============================================"
log "Project directory: $PROJECT_DIR"
log "Timestamp: $(date -Iseconds)"
DEPLOY_START_TIME=$(date +%s)
echo ""

cd "$PROJECT_DIR" || die "Cannot change directory to $PROJECT_DIR"

# Verify prerequisites
command -v git >/dev/null 2>&1 || die "git is not installed"
command -v docker >/dev/null 2>&1 || die "docker is not installed"
docker compose version >/dev/null 2>&1 || die "docker compose is not available"

# Verify we're in a git repo
[ -d ".git" ] || die "$PROJECT_DIR is not a git repository"

# Verify postgres is running.
# The listing is captured before it is searched: piping straight into
# "grep -q" lets grep exit on the first match, and under pipefail a producer
# killed by the resulting SIGPIPE would make the check fail spuriously.
# shellcheck disable=SC2086
pg_ps_output=$($COMPOSE_CMD ps postgres 2>/dev/null || true)
if ! grep -q "running\|Up" <<<"$pg_ps_output"; then
  die "PostgreSQL container is not running. Start it with: $COMPOSE_CMD up -d postgres"
fi

# Store current commit for rollback
PREV_COMMIT=$(git rev-parse HEAD)
log "Current commit: $PREV_COMMIT"

# ====================================================================
# STEP 2: Pre-upgrade database backup
# ====================================================================
echo ""
log "--- Step 1/6: Pre-upgrade database backup ---"

# Capture the exit status instead of letting set -e abort on the assignment,
# so the helper's output is always shown before the script gives up.
BACKUP_STATUS=0
BACKUP_OUTPUT=$("$SCRIPT_DIR/db-backup.sh" backup 2>&1) || BACKUP_STATUS=$?
echo "$BACKUP_OUTPUT"

if [ "$BACKUP_STATUS" -ne 0 ]; then
  die "Pre-upgrade backup failed (db-backup.sh exited with status $BACKUP_STATUS)"
fi

# Extract the backup file path from the output (strip ANSI color codes first).
# "|| true" keeps a non-matching grep from aborting under set -e; the empty
# result is handled just below.
BACKUP_FILE=$(echo "$BACKUP_OUTPUT" | sed 's/\x1b\[[0-9;]*m//g' | grep -oP 'Backup complete: \K\S+' || true)

if [ -z "$BACKUP_FILE" ]; then
  die "Failed to capture backup file path from db-backup.sh output"
fi

if [ ! -f "$BACKUP_FILE" ]; then
  die "Backup file does not exist: $BACKUP_FILE"
fi

ok "Pre-upgrade backup saved: $(basename "$BACKUP_FILE")"

# From this point forward, rollback is possible
ROLLBACK_NEEDED=true

# ====================================================================
# STEP 3: Pull latest code
# ====================================================================
echo ""
log "--- Step 2/6: Pulling latest code from main ---"

# The checkout is forced to match origin/main exactly; any local changes in
# the working tree are discarded by the hard reset.
git fetch origin main
git reset --hard origin/main

NEW_COMMIT=$(git rev-parse HEAD)
log "Updated to commit: $NEW_COMMIT"

if [ "$PREV_COMMIT" = "$NEW_COMMIT" ]; then
  warn "No new commits — continuing anyway (migrations or rebuilds may still be needed)"
fi

# ====================================================================
# STEP 4: Rebuild and restart containers
# ====================================================================
echo ""
log "--- Step 3/6: Rebuilding and restarting containers ---"

# shellcheck disable=SC2086
$COMPOSE_CMD up -d --build

# Wait for postgres to be healthy before running migrations
log "Waiting for PostgreSQL to be healthy ..."
PG_RETRIES=30
PG_COUNT=0
while [ "$PG_COUNT" -lt "$PG_RETRIES" ]; do
  # shellcheck disable=SC2086
  if $COMPOSE_CMD exec -T postgres pg_isready -U "$DB_USER" -d "$DB_NAME" >/dev/null 2>&1; then
    ok "PostgreSQL is ready"
    break
  fi

  # Plain arithmetic assignment on purpose: "((PG_COUNT++))" yields the old
  # value, so incrementing from 0 returns status 1 and set -e would abort the
  # script on the very first failed probe.
  PG_COUNT=$((PG_COUNT + 1))
  if [ "$PG_COUNT" -eq "$PG_RETRIES" ]; then
    die "PostgreSQL did not become healthy after $((PG_RETRIES * 2))s"
  fi
  sleep 2
done

# ====================================================================
# STEP 5: Run database migrations
# ====================================================================
echo ""
log "--- Step 4/6: Running database migrations ---"

#######################################
# Helper: run SQL via psql in the postgres container.
# psql stops at the first error (ON_ERROR_STOP=1) and runs with --quiet.
# Arguments: any extra psql arguments (e.g. -c "...", -t); with no -c the
#            SQL is read from stdin.
# Globals (read): COMPOSE_CMD, DB_USER, DB_NAME
# Returns: the exit status of the docker compose / psql invocation.
#######################################
run_sql() {
  # shellcheck disable=SC2086
  $COMPOSE_CMD exec -T postgres psql -U "$DB_USER" -d "$DB_NAME" -v ON_ERROR_STOP=1 --quiet "$@"
}

# Step 5a: Ensure the migration tracking table exists
log "Ensuring shared.schema_migrations table exists ..."
run_sql <<'SQL'
CREATE SCHEMA IF NOT EXISTS shared;
CREATE TABLE IF NOT EXISTS shared.schema_migrations (
    id SERIAL PRIMARY KEY,
    filename TEXT NOT NULL UNIQUE,
    applied_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    checksum TEXT
);
SQL
ok "Migration tracking table ready"

#######################################
# Helper: check if a migration has been applied (safe with set -u).
# Arguments: $1 - migration filename used as the key in APPLIED_MIGRATIONS
# Globals (read): APPLIED_MIGRATIONS (associative array)
# Returns: 0 if the key has a non-empty entry, 1 otherwise.
#######################################
is_applied() {
  local key="$1"
  # The ":-" default makes a missing key expand to an empty string instead
  # of raising an "unbound variable" error under set -u.
  [[ -n "${APPLIED_MIGRATIONS[$key]:-}" ]]
}

# Step 5b: Get list of already-applied migrations
declare -A APPLIED_MIGRATIONS=()

# A failed query must not be mistaken for "nothing applied yet": that would
# make every migration file look pending and re-run all of them. -A/-t give
# unaligned, tuples-only output, i.e. one bare filename per line.
if ! APPLIED_RAW=$(run_sql -At -c "SELECT filename FROM shared.schema_migrations ORDER BY filename;"); then
  die "Could not read shared.schema_migrations — cannot determine which migrations are applied"
fi

while IFS= read -r fname; do
  fname=${fname%$'\r'} # tolerate CRLF line endings
  [ -n "$fname" ] || continue
  APPLIED_MIGRATIONS["$fname"]=1
done <<<"$APPLIED_RAW"

APPLIED_COUNT=${#APPLIED_MIGRATIONS[@]}
log "Previously applied migrations: $APPLIED_COUNT"

# Step 5c: Scan migration directory for .sql files
# Only basenames are stored; later steps rebuild the path as
# "$MIGRATION_DIR/<name>". The list is in sort(1) order of the full paths.
MIGRATION_FILES=()
if [ -d "$MIGRATION_DIR" ]; then
  while IFS= read -r f; do
    MIGRATION_FILES+=("$(basename "$f")")
  done < <(find "$MIGRATION_DIR" -name "*.sql" -type f | sort)
fi

TOTAL_MIGRATIONS=${#MIGRATION_FILES[@]}
log "Total migration files found: $TOTAL_MIGRATIONS"

# Step 5d: Handle --seed-existing (first deployment only)
# Registers every migration file as applied WITHOUT executing it. Skipped
# when the tracking table already has entries.
if [ "$SEED_EXISTING" = true ]; then
  if [ "$APPLIED_COUNT" -gt 0 ]; then
    warn "--seed-existing flag set but $APPLIED_COUNT migrations are already tracked. Skipping seed."
  else
    log "Seeding migration tracking table with ${TOTAL_MIGRATIONS} existing migration files ..."
    sql_quote="'"
    for filename in "${MIGRATION_FILES[@]}"; do
      checksum=$(md5sum "$MIGRATION_DIR/$filename" | awk '{print $1}')
      # Double any single quote so the filename is a valid SQL string literal.
      sql_filename=${filename//"$sql_quote"/$sql_quote$sql_quote}
      run_sql -c "INSERT INTO shared.schema_migrations (filename, checksum) VALUES ('$sql_filename', '$checksum') ON CONFLICT (filename) DO NOTHING;"
      log " Seeded: $filename"
    done
    ok "All existing migrations marked as applied (not executed)"

    # Refresh the applied list
    APPLIED_COUNT=$TOTAL_MIGRATIONS
    for filename in "${MIGRATION_FILES[@]}"; do
      APPLIED_MIGRATIONS["$filename"]=1
    done
  fi
fi

# Step 5e: Detect first-run without --seed-existing
# Gives the operator a 10 second window to abort before every migration file
# is executed against a database whose tracking table is empty.
if [ "$APPLIED_COUNT" -eq 0 ] && [ "$TOTAL_MIGRATIONS" -gt 0 ] && [ "$SEED_EXISTING" = false ]; then
  warn "The migration tracking table is empty but $TOTAL_MIGRATIONS migration files exist."
  warn "If these migrations were previously applied manually, re-run with --seed-existing"
  warn "to register them without re-executing. Otherwise, all migrations will be applied."
  warn ""
  warn "Continuing in 10 seconds ... (Ctrl+C to abort)"
  sleep 10
fi

# Step 5f: Apply pending migrations
# Counters use plain arithmetic assignment / array length on purpose:
# "((VAR++))" returns status 1 when VAR is 0, which under set -e would abort
# the script (and trigger a rollback) on the first pending migration.
APPLIED_THIS_RUN=0

PENDING_MIGRATIONS=()
for filename in "${MIGRATION_FILES[@]}"; do
  if ! is_applied "$filename"; then
    PENDING_MIGRATIONS+=("$filename")
  fi
done
PENDING_COUNT=${#PENDING_MIGRATIONS[@]}

if [ "$PENDING_COUNT" -eq 0 ]; then
  ok "No pending migrations to apply"
else
  log "$PENDING_COUNT pending migration(s) to apply"
  echo ""

  sql_quote="'"
  for filename in "${PENDING_MIGRATIONS[@]}"; do
    checksum=$(md5sum "$MIGRATION_DIR/$filename" | awk '{print $1}')
    log " Applying: $filename ..."

    # Run the migration in a single transaction with error stopping; the
    # file is fed to psql on stdin.
    # shellcheck disable=SC2086
    if $COMPOSE_CMD exec -T postgres psql \
      -U "$DB_USER" \
      -d "$DB_NAME" \
      -v ON_ERROR_STOP=1 \
      --single-transaction \
      --quiet <"$MIGRATION_DIR/$filename" 2>&1; then

      # Record successful migration. Single quotes in the filename are
      # doubled so it is a valid SQL string literal.
      sql_filename=${filename//"$sql_quote"/$sql_quote$sql_quote}
      run_sql -c "INSERT INTO shared.schema_migrations (filename, checksum) VALUES ('$sql_filename', '$checksum');"
      ok " Applied: $filename"
      APPLIED_THIS_RUN=$((APPLIED_THIS_RUN + 1))
    else
      err "Migration FAILED: $filename"
      err "Triggering automatic rollback ..."
      exit 1 # trap will handle rollback
    fi
  done

  echo ""
  ok "Successfully applied $APPLIED_THIS_RUN migration(s)"
fi

# ====================================================================
# STEP 6: Health check
# ====================================================================
echo ""
log "--- Step 5/6: Verifying application health ---"

# After a fresh image build, NestJS cold-start can take 2-3 minutes:
#   New Relic init → TypeORM connections → Redis → BullMQ → NestJS bootstrap
# Docker's own healthcheck (start_period:30s + 3×15s retries = ~75s) is too
# aggressive and will mark the container "unhealthy" before the app finishes
# booting. So we do NOT rely on Docker's health status — we probe the HTTP
# endpoint directly from the host and give it up to ~3 minutes total.
TOTAL_WAIT=$((HEALTH_START_WAIT + HEALTH_RETRIES * HEALTH_INTERVAL))
log "Will wait up to ${TOTAL_WAIT}s for backend to respond at $HEALTH_URL ..."
sleep "$HEALTH_START_WAIT"

HEALTHY=false
# Replaced below when the loop stops early because the container is gone, so
# the final error does not claim a timeout that never happened.
HEALTH_FAILURE="Backend failed to respond after ${TOTAL_WAIT}s"

for ((i = 1; i <= HEALTH_RETRIES; i++)); do
  # Direct HTTP check from the host using wget (available on Ubuntu).
  # --tries=1 keeps each probe bounded by --timeout: wget otherwise retries
  # a timed-out request many times before giving up.
  if wget -qO- --tries=1 --timeout=5 "$HEALTH_URL" >/dev/null 2>&1; then
    HEALTHY=true
    break
  fi

  # Also check Docker's container health for informational logging
  # shellcheck disable=SC2086
  CONTAINER_HEALTH=$($COMPOSE_CMD ps backend --format '{{.Health}}' 2>/dev/null || echo "unknown")

  # If the container exited or was removed, fail immediately — no point waiting.
  # NOTE(review): some Compose versions list only running containers unless
  # --all is given; confirm an exited backend is actually reported here.
  # shellcheck disable=SC2086
  CONTAINER_STATUS=$($COMPOSE_CMD ps backend --format '{{.Status}}' 2>/dev/null || echo "unknown")
  if grep -qi "exit\|dead\|removed" <<<"$CONTAINER_STATUS"; then
    err "Backend container has stopped unexpectedly: $CONTAINER_STATUS"
    HEALTH_FAILURE="Backend container stopped before responding at $HEALTH_URL"
    break
  fi

  log " Health check attempt $i/$HEALTH_RETRIES — docker: ${CONTAINER_HEALTH}, retrying in ${HEALTH_INTERVAL}s ..."
  sleep "$HEALTH_INTERVAL"
done

if [ "$HEALTHY" = true ]; then
  ok "Backend is healthy and responding at $HEALTH_URL"
else
  # Log diagnostics before triggering rollback
  err "$HEALTH_FAILURE"
  # shellcheck disable=SC2086
  warn "Container status: $($COMPOSE_CMD ps backend 2>/dev/null || echo 'unknown')"
  warn "Recent backend logs:"
  # Best effort: missing logs must not mask the real failure.
  # shellcheck disable=SC2086
  $COMPOSE_CMD logs --tail=30 backend 2>/dev/null || true
  err "Triggering automatic rollback ..."
  exit 1 # trap will handle rollback
fi

# ====================================================================
# STEP 7: Post-upgrade database backup
# ====================================================================
echo ""
log "--- Step 6/6: Post-upgrade database backup ---"

# The new version has passed its health check and is live. A failure from
# here on must not restore the pre-upgrade database (that would discard data
# written since the backup), so the rollback is disarmed before the backup
# runs. A failed backup still ends the script with a non-zero status.
ROLLBACK_NEEDED=false

if ! "$SCRIPT_DIR/db-backup.sh" backup; then
  die "Post-upgrade backup failed — the new version is live and was NOT rolled back; take a backup manually"
fi

# ====================================================================
# Deployment complete
# ====================================================================
DEPLOY_SUCCESS=true
ROLLBACK_NEEDED=false

DEPLOY_END_TIME=$(date +%s)
DEPLOY_DURATION=$((DEPLOY_END_TIME - DEPLOY_START_TIME))

echo ""
log "============================================"
ok " DEPLOYMENT COMPLETE"
log "============================================"
log " Previous commit : $PREV_COMMIT"
log " Current commit : $NEW_COMMIT"
log " Migrations run : $APPLIED_THIS_RUN"
log " Duration : ${DEPLOY_DURATION}s"
log " Log file : $LOG_FILE"
log "============================================"
echo ""