From a025c9e9792a90899c6c9a674748d98530518be6 Mon Sep 17 00:00:00 2001 From: olsch01 Date: Thu, 9 Apr 2026 09:42:49 -0400 Subject: [PATCH] fix: health check now probes HTTP directly with 3-min timeout The previous approach relied on Docker's container health status, but Docker's healthcheck (start_period:30s + 3x15s retries = ~75s) marks the container "unhealthy" before NestJS finishes cold-starting after a fresh image build (New Relic + TypeORM + Redis + BullMQ init can take 2-3 minutes). Changes: - Primary check is now direct wget to localhost:3000/api from the host - Docker health status used only for informational logging - Total timeout increased from 130s to 190s (~3 min) for cold starts - Early exit if container has stopped/exited (no point waiting) - More backend log lines (30 vs 20) shown on failure for diagnostics Co-Authored-By: Claude Opus 4.6 --- scripts/deploy-prod.sh | 43 +++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/scripts/deploy-prod.sh b/scripts/deploy-prod.sh index 63df3d1..4260543 100755 --- a/scripts/deploy-prod.sh +++ b/scripts/deploy-prod.sh @@ -40,9 +40,9 @@ DB_USER="${POSTGRES_USER:-hoafinance}" DB_NAME="${POSTGRES_DB:-hoafinance}" MIGRATION_DIR="$PROJECT_DIR/db/migrations" HEALTH_URL="http://localhost:3000/api" -HEALTH_RETRIES=20 +HEALTH_RETRIES=36 HEALTH_INTERVAL=5 -HEALTH_START_WAIT=30 +HEALTH_START_WAIT=10 LOG_DIR="$PROJECT_DIR/logs" LOG_FILE="$LOG_DIR/deploy-$(date +%Y%m%d_%H%M%S).log" @@ -359,38 +359,47 @@ fi # ==================================================================== echo "" log "--- Step 5/6: Verifying application health ---" -log "Waiting ${HEALTH_START_WAIT}s for backend to initialize (matches Docker start_period) ..." + +# After a fresh image build, NestJS cold-start can take 2-3 minutes: +# New Relic init → TypeORM connections → Redis → BullMQ → NestJS bootstrap +# Docker's own healthcheck (start_period:30s + 3×15s retries = ~75s) is too +# aggressive and will mark the container "unhealthy" before the app finishes +# booting. So we do NOT rely on Docker's health status — we probe the HTTP +# endpoint directly from the host and give it up to ~3 minutes total. +TOTAL_WAIT=$((HEALTH_START_WAIT + HEALTH_RETRIES * HEALTH_INTERVAL)) +log "Will wait up to ${TOTAL_WAIT}s for backend to respond at $HEALTH_URL ..." sleep "$HEALTH_START_WAIT" -# Primary check: Docker's own container health status -# (docker-compose.prod.yml already defines a healthcheck using wget inside the container) HEALTHY=false for ((i=1; i<=HEALTH_RETRIES; i++)); do - CONTAINER_HEALTH=$($COMPOSE_CMD ps backend --format '{{.Health}}' 2>/dev/null || echo "unknown") - if [ "$CONTAINER_HEALTH" = "healthy" ]; then - HEALTHY=true - break - fi - - # Also try a direct HTTP check from the host as a secondary signal - # Use wget (available on Ubuntu) since curl may not be installed + # Direct HTTP check from the host using wget (available on Ubuntu) if wget -qO- --timeout=5 "$HEALTH_URL" >/dev/null 2>&1; then HEALTHY=true break fi - log " Health check attempt $i/$HEALTH_RETRIES — container status: ${CONTAINER_HEALTH}, retrying in ${HEALTH_INTERVAL}s ..." + # Also check Docker's container health for informational logging + CONTAINER_HEALTH=$($COMPOSE_CMD ps backend --format '{{.Health}}' 2>/dev/null || echo "unknown") + + # If the container exited or was removed, fail immediately — no point waiting + CONTAINER_STATUS=$($COMPOSE_CMD ps backend --format '{{.Status}}' 2>/dev/null || echo "unknown") + if echo "$CONTAINER_STATUS" | grep -qi "exit\|dead\|removed"; then + err "Backend container has stopped unexpectedly: $CONTAINER_STATUS" + break + fi + + log " Health check attempt $i/$HEALTH_RETRIES — docker: ${CONTAINER_HEALTH}, retrying in ${HEALTH_INTERVAL}s ..." sleep "$HEALTH_INTERVAL" done if [ "$HEALTHY" = true ]; then - ok "Backend is healthy and responding" + ok "Backend is healthy and responding at $HEALTH_URL" else # Log diagnostics before triggering rollback - err "Backend failed to respond after $((HEALTH_START_WAIT + HEALTH_RETRIES * HEALTH_INTERVAL))s" + err "Backend failed to respond after ${TOTAL_WAIT}s" warn "Container status: $($COMPOSE_CMD ps backend 2>/dev/null || echo 'unknown')" warn "Recent backend logs:" - $COMPOSE_CMD logs --tail=20 backend 2>/dev/null || true + $COMPOSE_CMD logs --tail=30 backend 2>/dev/null || true err "Triggering automatic rollback ..." exit 1 # trap will handle rollback fi