fix: health check now probes HTTP directly with 3-min timeout

The previous approach relied on Docker's container health status, but
Docker's healthcheck (start_period:30s + 3x15s retries = ~75s) marks
the container "unhealthy" before NestJS finishes cold-starting after a
fresh image build (New Relic + TypeORM + Redis + BullMQ init can take
2-3 minutes).

Changes:
- Primary check is now a direct wget probe of localhost:3000/api from the host
- Docker health status used only for informational logging
- Total wait budget increased from 130s (30s + 20x5s) to 190s (10s + 36x5s, ~3 min) for cold starts
- Early exit from the retry loop if the container status reports exited, dead, or removed (no point waiting)
- More backend log lines (30 vs 20) shown on failure for diagnostics

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-09 09:42:49 -04:00
parent 19bd19b0c4
commit a025c9e979

View File

@@ -40,9 +40,9 @@ DB_USER="${POSTGRES_USER:-hoafinance}"
DB_NAME="${POSTGRES_DB:-hoafinance}" DB_NAME="${POSTGRES_DB:-hoafinance}"
MIGRATION_DIR="$PROJECT_DIR/db/migrations" MIGRATION_DIR="$PROJECT_DIR/db/migrations"
HEALTH_URL="http://localhost:3000/api" HEALTH_URL="http://localhost:3000/api"
HEALTH_RETRIES=20 HEALTH_RETRIES=36
HEALTH_INTERVAL=5 HEALTH_INTERVAL=5
HEALTH_START_WAIT=30 HEALTH_START_WAIT=10
LOG_DIR="$PROJECT_DIR/logs" LOG_DIR="$PROJECT_DIR/logs"
LOG_FILE="$LOG_DIR/deploy-$(date +%Y%m%d_%H%M%S).log" LOG_FILE="$LOG_DIR/deploy-$(date +%Y%m%d_%H%M%S).log"
@@ -359,38 +359,47 @@ fi
# ==================================================================== # ====================================================================
echo "" echo ""
log "--- Step 5/6: Verifying application health ---" log "--- Step 5/6: Verifying application health ---"
log "Waiting ${HEALTH_START_WAIT}s for backend to initialize (matches Docker start_period) ..."
# After a fresh image build, NestJS cold-start can take 2-3 minutes:
# New Relic init → TypeORM connections → Redis → BullMQ → NestJS bootstrap
# Docker's own healthcheck (start_period:30s + 3×15s retries = ~75s) is too
# aggressive and will mark the container "unhealthy" before the app finishes
# booting. So we do NOT rely on Docker's health status — we probe the HTTP
# endpoint directly from the host and give it up to ~3 minutes total.
TOTAL_WAIT=$((HEALTH_START_WAIT + HEALTH_RETRIES * HEALTH_INTERVAL))
log "Will wait up to ${TOTAL_WAIT}s for backend to respond at $HEALTH_URL ..."
sleep "$HEALTH_START_WAIT" sleep "$HEALTH_START_WAIT"
# Primary check: Docker's own container health status
# (docker-compose.prod.yml already defines a healthcheck using wget inside the container)
HEALTHY=false HEALTHY=false
for ((i=1; i<=HEALTH_RETRIES; i++)); do for ((i=1; i<=HEALTH_RETRIES; i++)); do
CONTAINER_HEALTH=$($COMPOSE_CMD ps backend --format '{{.Health}}' 2>/dev/null || echo "unknown") # Direct HTTP check from the host using wget (available on Ubuntu)
if [ "$CONTAINER_HEALTH" = "healthy" ]; then
HEALTHY=true
break
fi
# Also try a direct HTTP check from the host as a secondary signal
# Use wget (available on Ubuntu) since curl may not be installed
if wget -qO- --timeout=5 "$HEALTH_URL" >/dev/null 2>&1; then if wget -qO- --timeout=5 "$HEALTH_URL" >/dev/null 2>&1; then
HEALTHY=true HEALTHY=true
break break
fi fi
log " Health check attempt $i/$HEALTH_RETRIES — container status: ${CONTAINER_HEALTH}, retrying in ${HEALTH_INTERVAL}s ..." # Also check Docker's container health for informational logging
CONTAINER_HEALTH=$($COMPOSE_CMD ps backend --format '{{.Health}}' 2>/dev/null || echo "unknown")
# If the container exited or was removed, fail immediately — no point waiting
CONTAINER_STATUS=$($COMPOSE_CMD ps backend --format '{{.Status}}' 2>/dev/null || echo "unknown")
if echo "$CONTAINER_STATUS" | grep -qi "exit\|dead\|removed"; then
err "Backend container has stopped unexpectedly: $CONTAINER_STATUS"
break
fi
log " Health check attempt $i/$HEALTH_RETRIES — docker: ${CONTAINER_HEALTH}, retrying in ${HEALTH_INTERVAL}s ..."
sleep "$HEALTH_INTERVAL" sleep "$HEALTH_INTERVAL"
done done
if [ "$HEALTHY" = true ]; then if [ "$HEALTHY" = true ]; then
ok "Backend is healthy and responding" ok "Backend is healthy and responding at $HEALTH_URL"
else else
# Log diagnostics before triggering rollback # Log diagnostics before triggering rollback
err "Backend failed to respond after $((HEALTH_START_WAIT + HEALTH_RETRIES * HEALTH_INTERVAL))s" err "Backend failed to respond after ${TOTAL_WAIT}s"
warn "Container status: $($COMPOSE_CMD ps backend 2>/dev/null || echo 'unknown')" warn "Container status: $($COMPOSE_CMD ps backend 2>/dev/null || echo 'unknown')"
warn "Recent backend logs:" warn "Recent backend logs:"
$COMPOSE_CMD logs --tail=20 backend 2>/dev/null || true $COMPOSE_CMD logs --tail=30 backend 2>/dev/null || true
err "Triggering automatic rollback ..." err "Triggering automatic rollback ..."
exit 1 # trap will handle rollback exit 1 # trap will handle rollback
fi fi