Add production infrastructure: compiled builds, clustering, connection pooling

Root cause of 502 errors under 30 concurrent users: the production server
was running dev-mode infrastructure (Vite dev server, NestJS --watch,
no DB connection pooling, single Node.js process).

Changes:
- backend/Dockerfile: multi-stage prod build (compiled JS, no devDeps)
- frontend/Dockerfile: multi-stage prod build (static assets served by nginx)
- frontend/nginx.conf: SPA routing config for frontend container
- docker-compose.prod.yml: production overlay with tuned Postgres, memory
  limits, health checks, restart policies
- nginx/production.conf: keepalive upstreams, proxy buffering, rate limiting
- backend/src/main.ts: Node.js clustering (1 worker per CPU, up to 4),
  conditional request logging, production CORS
- backend/src/app.module.ts: TypeORM connection pool (max 30, min 5)
- docs/DEPLOYMENT.md: new Production Deployment section

Deploy with: docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d --build

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-02 16:55:19 -05:00
parent e719f593de
commit 8db89373e0
8 changed files with 408 additions and 18 deletions

26
backend/Dockerfile Normal file
View File

@@ -0,0 +1,26 @@
# ---- Production Dockerfile for NestJS backend ----
# Multi-stage build: compile TypeScript in a throwaway stage, then run the
# compiled JS on a minimal image that ships only production dependencies.

# Stage 1: Build
FROM node:20-alpine AS builder
WORKDIR /app
# Full dependency tree here — the TypeScript compiler lives in devDeps
COPY package*.json ./
RUN npm ci
COPY . .
RUN npm run build

# Stage 2: Production
FROM node:20-alpine
WORKDIR /app
# Bake NODE_ENV into the image so the app runs in production mode even if the
# orchestrator forgets to set it (docker-compose.prod.yml also sets it; this
# is a belt-and-braces default, not a replacement).
ENV NODE_ENV=production
# Only install production dependencies
COPY package*.json ./
RUN npm ci --omit=dev && npm cache clean --force
# Copy compiled output from builder
COPY --from=builder /app/dist ./dist
EXPOSE 3000
# NOTE(review): the process runs as root; consider `USER node` once it is
# confirmed the app writes nothing outside /tmp at runtime.
# Run the compiled JS directly — no ts-node, no watch, no devDeps
CMD ["node", "dist/main"]

View File

@@ -43,6 +43,13 @@ import { ScheduleModule } from '@nestjs/schedule';
autoLoadEntities: true, autoLoadEntities: true,
synchronize: false, synchronize: false,
logging: false, logging: false,
// Connection pool — reuse connections instead of creating new ones per query
extra: {
max: 30, // max pool size (across all concurrent requests)
min: 5, // keep at least 5 idle connections warm
idleTimeoutMillis: 30000, // close idle connections after 30s
connectionTimeoutMillis: 5000, // fail fast if pool is exhausted
},
}), }),
}), }),
DatabaseModule, DatabaseModule,

View File

@@ -1,18 +1,51 @@
import cluster from 'node:cluster';
import os from 'node:os';
import { NestFactory } from '@nestjs/core'; import { NestFactory } from '@nestjs/core';
import { ValidationPipe } from '@nestjs/common'; import { ValidationPipe } from '@nestjs/common';
import { SwaggerModule, DocumentBuilder } from '@nestjs/swagger'; import { SwaggerModule, DocumentBuilder } from '@nestjs/swagger';
import { AppModule } from './app.module'; import { AppModule } from './app.module';
const isProduction = process.env.NODE_ENV === 'production';
// ---------------------------------------------------------------------------
// Clustering — fork one worker per CPU core in production
// ---------------------------------------------------------------------------
const WORKERS = isProduction
? Math.min(os.cpus().length, 4) // cap at 4 workers to stay within DB pool
: 1; // single process in dev
if (WORKERS > 1 && cluster.isPrimary) {
console.log(`Primary ${process.pid} forking ${WORKERS} workers ...`);
for (let i = 0; i < WORKERS; i++) {
cluster.fork();
}
cluster.on('exit', (worker, code) => {
console.warn(`Worker ${worker.process.pid} exited (code ${code}), restarting ...`);
cluster.fork();
});
} else {
bootstrap();
}
// ---------------------------------------------------------------------------
// NestJS bootstrap
// ---------------------------------------------------------------------------
async function bootstrap() { async function bootstrap() {
const app = await NestFactory.create(AppModule); const app = await NestFactory.create(AppModule, {
logger: isProduction ? ['error', 'warn', 'log'] : ['error', 'warn', 'log', 'debug', 'verbose'],
});
app.setGlobalPrefix('api'); app.setGlobalPrefix('api');
// Request logging // Request logging — only in development (too noisy / slow for prod)
if (!isProduction) {
app.use((req: any, _res: any, next: any) => { app.use((req: any, _res: any, next: any) => {
console.log(`[REQ] ${req.method} ${req.url} auth=${req.headers.authorization ? 'yes' : 'no'}`); console.log(`[REQ] ${req.method} ${req.url} auth=${req.headers.authorization ? 'yes' : 'no'}`);
next(); next();
}); });
}
app.useGlobalPipes( app.useGlobalPipes(
new ValidationPipe({ new ValidationPipe({
@@ -22,21 +55,22 @@ async function bootstrap() {
}), }),
); );
// CORS — in production nginx handles this; accept all origins behind the proxy
app.enableCors({ app.enableCors({
origin: ['http://localhost', 'http://localhost:5173'], origin: isProduction ? true : ['http://localhost', 'http://localhost:5173'],
credentials: true, credentials: true,
}); });
// Swagger docs — available in all environments
const config = new DocumentBuilder() const config = new DocumentBuilder()
.setTitle('HOA LedgerIQ API') .setTitle('HOA LedgerIQ API')
.setDescription('API for the HOA LedgerIQ') .setDescription('API for the HOA LedgerIQ')
.setVersion('0.1.0') .setVersion('2026.3.2')
.addBearerAuth() .addBearerAuth()
.build(); .build();
const document = SwaggerModule.createDocument(app, config); const document = SwaggerModule.createDocument(app, config);
SwaggerModule.setup('api/docs', app, document); SwaggerModule.setup('api/docs', app, document);
await app.listen(3000); await app.listen(3000);
console.log('Backend running on port 3000'); console.log(`Backend worker ${process.pid} listening on port 3000`);
} }
bootstrap();

96
docker-compose.prod.yml Normal file
View File

@@ -0,0 +1,96 @@
# Production override — use with:
#   docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d --build
#
# For SSL add docker-compose.ssl.yml as well:
#   docker compose -f docker-compose.yml -f docker-compose.prod.yml -f docker-compose.ssl.yml up -d --build
#
# What this changes from the base (dev) config:
#   - Backend: production Dockerfile (compiled JS, no watch, no devDeps)
#   - Frontend: production Dockerfile (static build served by nginx, not Vite)
#   - No source-code volume mounts (uses baked-in built code)
#   - Memory limits and health checks on backend
#   - Restart policies for reliability
#
# NOTE(review): the `deploy.resources` limits below are honored by Docker
# Compose v2 even outside Swarm — confirm the target host runs Compose v2,
# otherwise they are silently ignored.

services:
  nginx:
    ports:
      - "80:80"
      - "443:443"
    volumes:
      # NOTE(review): production.conf references Let's Encrypt certificates;
      # on a brand-new host nginx will fail to start until the first
      # certificate has been issued (classic certbot chicken-and-egg).
      # Verify the documented first-boot procedure covers this.
      - ./nginx/production.conf:/etc/nginx/conf.d/default.conf:ro
      - certbot_www:/var/www/certbot:ro
      - certbot_conf:/etc/letsencrypt:ro
    restart: unless-stopped

  backend:
    build:
      context: ./backend
      dockerfile: Dockerfile # production Dockerfile (compiled JS)
    volumes: [] # override: no source mounts in prod
    environment:
      - DATABASE_URL=${DATABASE_URL}
      - REDIS_URL=${REDIS_URL}
      - JWT_SECRET=${JWT_SECRET}
      - NODE_ENV=production
      - AI_API_URL=${AI_API_URL}
      - AI_API_KEY=${AI_API_KEY}
      - AI_MODEL=${AI_MODEL}
      - AI_DEBUG=${AI_DEBUG:-false}
    deploy:
      resources:
        limits:
          memory: 1024M
        reservations:
          memory: 256M
    healthcheck:
      # busybox wget ships with node:20-alpine, so no extra packages needed
      test: ["CMD-SHELL", "wget -qO- http://localhost:3000/api || exit 1"]
      interval: 15s
      timeout: 5s
      retries: 3
      start_period: 30s
    restart: unless-stopped

  frontend:
    build:
      context: ./frontend
      dockerfile: Dockerfile # production Dockerfile (static nginx)
    volumes: [] # override: no source mounts in prod
    environment:
      - NODE_ENV=production # no-op for a static nginx container; kept for consistency
    restart: unless-stopped

  postgres:
    # Tune PostgreSQL for production workloads
    command: >
      postgres
      -c max_connections=200
      -c shared_buffers=256MB
      -c effective_cache_size=512MB
      -c work_mem=4MB
      -c maintenance_work_mem=64MB
      -c checkpoint_completion_target=0.9
      -c wal_buffers=16MB
      -c random_page_cost=1.1
    deploy:
      resources:
        limits:
          memory: 1024M
        reservations:
          memory: 512M
    restart: unless-stopped

  redis:
    restart: unless-stopped

  certbot:
    image: certbot/certbot:latest
    volumes:
      - certbot_www:/var/www/certbot
      - certbot_conf:/etc/letsencrypt
    networks:
      - hoanet
    # Renewal loop. `$$` is Compose's escape for a literal `$`, so the
    # container shell sees `wait ${!}` — the PID of the backgrounded sleep —
    # which lets a TERM signal interrupt the 12h wait promptly.
    entrypoint: "/bin/sh -c 'trap exit TERM; while :; do certbot renew --quiet; sleep 12h & wait $${!}; done'"

volumes:
  certbot_www:
  certbot_conf:

View File

@@ -9,12 +9,13 @@
1. [Prerequisites](#prerequisites) 1. [Prerequisites](#prerequisites)
2. [Deploy to a Fresh Docker Server](#deploy-to-a-fresh-docker-server) 2. [Deploy to a Fresh Docker Server](#deploy-to-a-fresh-docker-server)
3. [SSL with Certbot (Let's Encrypt)](#ssl-with-certbot-lets-encrypt) 3. [Production Deployment](#production-deployment)
4. [Backup the Local Test Database](#backup-the-local-test-database) 4. [SSL with Certbot (Let's Encrypt)](#ssl-with-certbot-lets-encrypt)
5. [Restore a Backup into the Staged Environment](#restore-a-backup-into-the-staged-environment) 5. [Backup the Local Test Database](#backup-the-local-test-database)
6. [Running Migrations on the Staged Environment](#running-migrations-on-the-staged-environment) 6. [Restore a Backup into the Staged Environment](#restore-a-backup-into-the-staged-environment)
7. [Verifying the Deployment](#verifying-the-deployment) 7. [Running Migrations on the Staged Environment](#running-migrations-on-the-staged-environment)
8. [Environment Variable Reference](#environment-variable-reference) 8. [Verifying the Deployment](#verifying-the-deployment)
9. [Environment Variable Reference](#environment-variable-reference)
--- ---
@@ -135,8 +136,95 @@ This creates:
| API | `http://<server-ip>/api` | | API | `http://<server-ip>/api` |
| Postgres | `<server-ip>:5432` (direct) | | Postgres | `<server-ip>:5432` (direct) |
> At this point the app is running over **plain HTTP**. Continue to the next > At this point the app is running over **plain HTTP** in development mode.
> section to enable HTTPS. > For any environment that will serve real traffic, continue to the Production
> Deployment section.
---
## Production Deployment
The base `docker-compose.yml` runs everything in **development mode** (Vite
dev server, NestJS in watch mode, no connection pooling). This is fine for
local development but will fail under even light production load.
`docker-compose.prod.yml` provides a production overlay that fixes this:
| Component | Dev mode | Production mode |
|-----------|----------|-----------------|
| Frontend | Vite dev server (single-threaded, HMR) | Static build served by nginx |
| Backend | `nest start --watch` (ts-node, file watcher) | Compiled JS, clustered across CPU cores |
| DB pooling | None (new connection per query) | Pool of 30 reusable connections |
| Postgres | Default config (100 connections) | Tuned: 200 connections, optimized buffers |
| Nginx | Basic proxy | Keepalive upstreams, buffering, rate limiting |
| Restart | None | `unless-stopped` on all services |
### Deploy for production
```bash
cd /opt/hoa-ledgeriq
# Ensure .env has NODE_ENV=production and strong secrets
nano .env
# Build and start with the production overlay
docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d --build
```
To add SSL on top of the production stack:
```bash
docker compose \
-f docker-compose.yml \
-f docker-compose.prod.yml \
-f docker-compose.ssl.yml \
up -d --build
```
> **Tip:** Create a shell alias to avoid typing the compose files every time:
> ```bash
> echo 'alias dc="docker compose -f docker-compose.yml -f docker-compose.prod.yml"' >> ~/.bashrc
> source ~/.bashrc
> dc up -d --build
> ```
### What the production overlay does
**Backend (`backend/Dockerfile`)**
- Multi-stage build: compiles TypeScript once, runs `node dist/main`
- No dev dependencies shipped (smaller image, faster startup)
- Node.js clustering: forks one worker per CPU core (up to 4)
- Connection pool: 30 reusable PostgreSQL connections shared across workers
**Frontend (`frontend/Dockerfile`)**
- Multi-stage build: `npm run build` produces optimized static assets
- Served by a lightweight nginx container (not Vite)
- Static assets cached with immutable headers (Vite filename hashing)
**Nginx (`nginx/production.conf`)**
- Keepalive connections to upstream services (connection reuse)
- Proxy buffering to prevent 502s during slow responses
- Rate limiting on API routes (10 req/s per IP, burst 30)
- Proper timeouts tuned per endpoint type
**PostgreSQL**
- `max_connections=200` (up from default 100)
- `shared_buffers=256MB`, `effective_cache_size=512MB`
- Tuned checkpoint, WAL, and memory settings
### Capacity guidelines
With the production stack on a 2-core / 4GB server:
| Metric | Expected capacity |
|--------|-------------------|
| Concurrent users | 50–100 |
| API requests/sec | ~200 |
| DB connections | 30 per backend worker × workers |
| Frontend serving | Static files, effectively unlimited |
For higher loads, scale the backend horizontally with Docker Swarm or
Kubernetes replicas.
--- ---

22
frontend/Dockerfile Normal file
View File

@@ -0,0 +1,22 @@
# ---- Production Dockerfile for React frontend ----
# Multi-stage build: compile the React app to static assets with Vite,
# then serve them from a minimal nginx image (no Node.js at runtime).

# Stage 1: Build
FROM node:20-alpine AS builder
WORKDIR /app
# Full dependency install — Vite and the toolchain live in devDeps
COPY package*.json ./
RUN npm ci
COPY . .
# NOTE(review): Vite inlines VITE_* environment variables at build time; if
# the app reads any, they must be supplied here as build args — confirm none
# are required for production.
RUN npm run build

# Stage 2: Serve with nginx
FROM nginx:alpine
# Copy the built static files
COPY --from=builder /app/dist /usr/share/nginx/html
# Copy a small nginx config for SPA routing
COPY nginx.conf /etc/nginx/conf.d/default.conf
EXPOSE 80
# Foreground nginx so the container stays alive under Docker supervision
CMD ["nginx", "-g", "daemon off;"]

20
frontend/nginx.conf Normal file
View File

@@ -0,0 +1,20 @@
# Minimal nginx config for serving the React SPA inside the frontend container.
# The outer nginx reverse proxy forwards non-API requests here.
server {
    listen 80;
    server_name _;

    root /usr/share/nginx/html;
    index index.html;

    # Serve static assets with long cache (Vite hashes filenames, so any
    # content change produces a new URL — safe to mark immutable)
    location /assets/ {
        expires 1y;
        add_header Cache-Control "public, immutable";
    }

    # SPA fallback — any non-file route returns index.html.
    # index.html itself must NOT be cached long-term: a cached stale shell
    # would reference old (deleted) hashed asset files after a redeploy.
    # "no-cache" still allows conditional revalidation, so it stays cheap.
    location / {
        add_header Cache-Control "no-cache";
        try_files $uri $uri/ /index.html;
    }
}

97
nginx/production.conf Normal file
View File

@@ -0,0 +1,97 @@
# Production reverse-proxy config. This file is included from
# /etc/nginx/conf.d/, so every top-level directive here lives in the
# `http` context.

upstream backend {
    server backend:3000;
    keepalive 32; # reuse connections to backend
}

upstream frontend {
    server frontend:80;
    keepalive 16;
}

# --- Rate limit zone (10 req/s per IP for API) ---
# FIX: limit_req_zone is only valid in the `http` context. It was previously
# declared inside the HTTPS `server` block, which makes nginx fail to start
# with: '"limit_req_zone" directive is not allowed here'. Declaring it at
# file top level (http context via conf.d include) is the correct placement.
limit_req_zone $binary_remote_addr zone=api_limit:10m rate=10r/s;

# Shared proxy settings — inherited by every server/location below
proxy_http_version 1.1;
proxy_set_header Connection ""; # empty Connection header enables upstream keepalive
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;

# Buffer settings — prevent 502s when backend is slow to respond
proxy_buffering on;
proxy_buffer_size 16k;
proxy_buffers 8 16k;
proxy_busy_buffers_size 32k;

# Redirect HTTP → HTTPS (ACME challenges excepted)
server {
    listen 80;
    server_name _;

    location /.well-known/acme-challenge/ {
        root /var/www/certbot;
    }

    location / {
        return 301 https://$host$request_uri;
    }
}

# HTTPS server
server {
    listen 443 ssl;
    # Replace with your hostname:
    server_name staging.example.com;

    # --- TLS certificates ---
    ssl_certificate /etc/letsencrypt/live/staging.example.com/fullchain.pem;
    ssl_certificate_key /etc/letsencrypt/live/staging.example.com/privkey.pem;
    ssl_protocols TLSv1.2 TLSv1.3;
    ssl_ciphers 'ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384';
    ssl_prefer_server_ciphers on;
    ssl_session_cache shared:SSL:10m;
    ssl_session_timeout 1d;
    ssl_session_tickets off;

    add_header Strict-Transport-Security "max-age=63072000; includeSubDomains" always;
    add_header X-Content-Type-Options nosniff always;
    add_header X-Frame-Options SAMEORIGIN always;

    # --- API routes → backend ---
    location /api/ {
        limit_req zone=api_limit burst=30 nodelay;
        proxy_pass http://backend;
        proxy_read_timeout 30s;
        proxy_connect_timeout 5s;
        proxy_send_timeout 15s;
    }

    # AI endpoints → longer timeouts (longest-prefix match wins over /api/,
    # so these bypass the rate limit — NOTE(review): confirm that is intended)
    location /api/investment-planning/recommendations {
        proxy_pass http://backend;
        proxy_read_timeout 180s;
        proxy_connect_timeout 10s;
        proxy_send_timeout 30s;
    }

    location /api/health-scores/calculate {
        proxy_pass http://backend;
        proxy_read_timeout 180s;
        proxy_connect_timeout 10s;
        proxy_send_timeout 30s;
    }

    # --- Static frontend → built React assets ---
    location / {
        proxy_pass http://frontend;
        proxy_read_timeout 10s;
        proxy_connect_timeout 5s;
        # Cache static assets aggressively at the proxy level
        proxy_cache_bypass $http_upgrade;
    }
}
}