Add production infrastructure: compiled builds, clustering, connection pooling

Root cause of 502 errors under 30 concurrent users: the production server
was running dev-mode infrastructure (Vite dev server, NestJS --watch,
no DB connection pooling, single Node.js process).

Changes:
- backend/Dockerfile: multi-stage prod build (compiled JS, no devDeps)
- frontend/Dockerfile: multi-stage prod build (static assets served by nginx)
- frontend/nginx.conf: SPA routing config for frontend container
- docker-compose.prod.yml: production overlay with tuned Postgres, memory
  limits, health checks, restart policies
- nginx/production.conf: keepalive upstreams, proxy buffering, rate limiting
- backend/src/main.ts: Node.js clustering (1 worker per CPU, up to 4),
  conditional request logging, production CORS
- backend/src/app.module.ts: TypeORM connection pool (max 30, min 5)
- docs/DEPLOYMENT.md: new Production Deployment section

Deploy with: docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d --build

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-02 16:55:19 -05:00
parent e719f593de
commit 8db89373e0
8 changed files with 408 additions and 18 deletions

26
backend/Dockerfile Normal file
View File

@@ -0,0 +1,26 @@
# ---- Production Dockerfile for NestJS backend ----
# Multi-stage build: compile TypeScript in a throwaway stage, then run the
# compiled JS on a minimal image that ships only production dependencies.

# Stage 1: Build
FROM node:20-alpine AS builder
WORKDIR /app
# Full dependency tree here — the TypeScript compiler lives in devDeps
COPY package*.json ./
RUN npm ci
COPY . .
RUN npm run build

# Stage 2: Production
FROM node:20-alpine
WORKDIR /app
# Bake NODE_ENV into the image so the app runs in production mode even if the
# orchestrator forgets to set it (docker-compose.prod.yml also sets it; this
# is a belt-and-braces default, not a replacement).
ENV NODE_ENV=production
# Only install production dependencies
COPY package*.json ./
RUN npm ci --omit=dev && npm cache clean --force
# Copy compiled output from builder
COPY --from=builder /app/dist ./dist
EXPOSE 3000
# NOTE(review): the process runs as root; consider `USER node` once it is
# confirmed the app writes nothing outside /tmp at runtime.
# Run the compiled JS directly — no ts-node, no watch, no devDeps
CMD ["node", "dist/main"]

View File

@@ -43,6 +43,13 @@ import { ScheduleModule } from '@nestjs/schedule';
autoLoadEntities: true, autoLoadEntities: true,
synchronize: false, synchronize: false,
logging: false, logging: false,
// Connection pool — reuse connections instead of creating new ones per query
extra: {
max: 30, // max pool size (across all concurrent requests)
min: 5, // keep at least 5 idle connections warm
idleTimeoutMillis: 30000, // close idle connections after 30s
connectionTimeoutMillis: 5000, // fail fast if pool is exhausted
},
}), }),
}), }),
DatabaseModule, DatabaseModule,

View File

@@ -1,18 +1,51 @@
import cluster from 'node:cluster';
import os from 'node:os';
import { NestFactory } from '@nestjs/core'; import { NestFactory } from '@nestjs/core';
import { ValidationPipe } from '@nestjs/common'; import { ValidationPipe } from '@nestjs/common';
import { SwaggerModule, DocumentBuilder } from '@nestjs/swagger'; import { SwaggerModule, DocumentBuilder } from '@nestjs/swagger';
import { AppModule } from './app.module'; import { AppModule } from './app.module';
const isProduction = process.env.NODE_ENV === 'production';
// ---------------------------------------------------------------------------
// Clustering — fork one worker per CPU core in production
// ---------------------------------------------------------------------------
const WORKERS = isProduction
? Math.min(os.cpus().length, 4) // cap at 4 workers to stay within DB pool
: 1; // single process in dev
if (WORKERS > 1 && cluster.isPrimary) {
console.log(`Primary ${process.pid} forking ${WORKERS} workers ...`);
for (let i = 0; i < WORKERS; i++) {
cluster.fork();
}
cluster.on('exit', (worker, code) => {
console.warn(`Worker ${worker.process.pid} exited (code ${code}), restarting ...`);
cluster.fork();
});
} else {
bootstrap();
}
// ---------------------------------------------------------------------------
// NestJS bootstrap
// ---------------------------------------------------------------------------
async function bootstrap() { async function bootstrap() {
const app = await NestFactory.create(AppModule); const app = await NestFactory.create(AppModule, {
logger: isProduction ? ['error', 'warn', 'log'] : ['error', 'warn', 'log', 'debug', 'verbose'],
});
app.setGlobalPrefix('api'); app.setGlobalPrefix('api');
// Request logging // Request logging — only in development (too noisy / slow for prod)
if (!isProduction) {
app.use((req: any, _res: any, next: any) => { app.use((req: any, _res: any, next: any) => {
console.log(`[REQ] ${req.method} ${req.url} auth=${req.headers.authorization ? 'yes' : 'no'}`); console.log(`[REQ] ${req.method} ${req.url} auth=${req.headers.authorization ? 'yes' : 'no'}`);
next(); next();
}); });
}
app.useGlobalPipes( app.useGlobalPipes(
new ValidationPipe({ new ValidationPipe({
@@ -22,21 +55,22 @@ async function bootstrap() {
}), }),
); );
// CORS — in production nginx handles this; accept all origins behind the proxy
app.enableCors({ app.enableCors({
origin: ['http://localhost', 'http://localhost:5173'], origin: isProduction ? true : ['http://localhost', 'http://localhost:5173'],
credentials: true, credentials: true,
}); });
// Swagger docs — available in all environments
const config = new DocumentBuilder() const config = new DocumentBuilder()
.setTitle('HOA LedgerIQ API') .setTitle('HOA LedgerIQ API')
.setDescription('API for the HOA LedgerIQ') .setDescription('API for the HOA LedgerIQ')
.setVersion('0.1.0') .setVersion('2026.3.2')
.addBearerAuth() .addBearerAuth()
.build(); .build();
const document = SwaggerModule.createDocument(app, config); const document = SwaggerModule.createDocument(app, config);
SwaggerModule.setup('api/docs', app, document); SwaggerModule.setup('api/docs', app, document);
await app.listen(3000); await app.listen(3000);
console.log('Backend running on port 3000'); console.log(`Backend worker ${process.pid} listening on port 3000`);
} }
bootstrap();

96
docker-compose.prod.yml Normal file
View File

@@ -0,0 +1,96 @@
# Production override — use with:
#   docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d --build
#
# For SSL add docker-compose.ssl.yml as well:
#   docker compose -f docker-compose.yml -f docker-compose.prod.yml -f docker-compose.ssl.yml up -d --build
#
# What this changes from the base (dev) config:
#   - Backend: production Dockerfile (compiled JS, no watch, no devDeps)
#   - Frontend: production Dockerfile (static build served by nginx, not Vite)
#   - No source-code volume mounts (uses baked-in built code)
#   - Memory limits and health checks on backend
#   - Restart policies for reliability
#
# NOTE(review): the `deploy.resources` limits below are honored by Docker
# Compose v2 even outside Swarm — confirm the target host runs Compose v2,
# otherwise they are silently ignored.

services:
  nginx:
    ports:
      - "80:80"
      - "443:443"
    volumes:
      # NOTE(review): production.conf references Let's Encrypt certificates;
      # on a brand-new host nginx will fail to start until the first
      # certificate has been issued (classic certbot chicken-and-egg).
      # Verify the documented first-boot procedure covers this.
      - ./nginx/production.conf:/etc/nginx/conf.d/default.conf:ro
      - certbot_www:/var/www/certbot:ro
      - certbot_conf:/etc/letsencrypt:ro
    restart: unless-stopped

  backend:
    build:
      context: ./backend
      dockerfile: Dockerfile # production Dockerfile (compiled JS)
    volumes: [] # override: no source mounts in prod
    environment:
      - DATABASE_URL=${DATABASE_URL}
      - REDIS_URL=${REDIS_URL}
      - JWT_SECRET=${JWT_SECRET}
      - NODE_ENV=production
      - AI_API_URL=${AI_API_URL}
      - AI_API_KEY=${AI_API_KEY}
      - AI_MODEL=${AI_MODEL}
      - AI_DEBUG=${AI_DEBUG:-false}
    deploy:
      resources:
        limits:
          memory: 1024M
        reservations:
          memory: 256M
    healthcheck:
      # busybox wget ships with node:20-alpine, so no extra packages needed
      test: ["CMD-SHELL", "wget -qO- http://localhost:3000/api || exit 1"]
      interval: 15s
      timeout: 5s
      retries: 3
      start_period: 30s
    restart: unless-stopped

  frontend:
    build:
      context: ./frontend
      dockerfile: Dockerfile # production Dockerfile (static nginx)
    volumes: [] # override: no source mounts in prod
    environment:
      - NODE_ENV=production # no-op for a static nginx container; kept for consistency
    restart: unless-stopped

  postgres:
    # Tune PostgreSQL for production workloads
    command: >
      postgres
      -c max_connections=200
      -c shared_buffers=256MB
      -c effective_cache_size=512MB
      -c work_mem=4MB
      -c maintenance_work_mem=64MB
      -c checkpoint_completion_target=0.9
      -c wal_buffers=16MB
      -c random_page_cost=1.1
    deploy:
      resources:
        limits:
          memory: 1024M
        reservations:
          memory: 512M
    restart: unless-stopped

  redis:
    restart: unless-stopped

  certbot:
    image: certbot/certbot:latest
    volumes:
      - certbot_www:/var/www/certbot
      - certbot_conf:/etc/letsencrypt
    networks:
      - hoanet
    # Renewal loop. `$$` is Compose's escape for a literal `$`, so the
    # container shell sees `wait ${!}` — the PID of the backgrounded sleep —
    # which lets a TERM signal interrupt the 12h wait promptly.
    entrypoint: "/bin/sh -c 'trap exit TERM; while :; do certbot renew --quiet; sleep 12h & wait $${!}; done'"

volumes:
  certbot_www:
  certbot_conf:

View File

@@ -9,12 +9,13 @@
1. [Prerequisites](#prerequisites) 1. [Prerequisites](#prerequisites)
2. [Deploy to a Fresh Docker Server](#deploy-to-a-fresh-docker-server) 2. [Deploy to a Fresh Docker Server](#deploy-to-a-fresh-docker-server)
3. [SSL with Certbot (Let's Encrypt)](#ssl-with-certbot-lets-encrypt) 3. [Production Deployment](#production-deployment)
4. [Backup the Local Test Database](#backup-the-local-test-database) 4. [SSL with Certbot (Let's Encrypt)](#ssl-with-certbot-lets-encrypt)
5. [Restore a Backup into the Staged Environment](#restore-a-backup-into-the-staged-environment) 5. [Backup the Local Test Database](#backup-the-local-test-database)
6. [Running Migrations on the Staged Environment](#running-migrations-on-the-staged-environment) 6. [Restore a Backup into the Staged Environment](#restore-a-backup-into-the-staged-environment)
7. [Verifying the Deployment](#verifying-the-deployment) 7. [Running Migrations on the Staged Environment](#running-migrations-on-the-staged-environment)
8. [Environment Variable Reference](#environment-variable-reference) 8. [Verifying the Deployment](#verifying-the-deployment)
9. [Environment Variable Reference](#environment-variable-reference)
--- ---
@@ -135,8 +136,95 @@ This creates:
| API | `http://<server-ip>/api` | | API | `http://<server-ip>/api` |
| Postgres | `<server-ip>:5432` (direct) | | Postgres | `<server-ip>:5432` (direct) |
> At this point the app is running over **plain HTTP**. Continue to the next > At this point the app is running over **plain HTTP** in development mode.
> section to enable HTTPS. > For any environment that will serve real traffic, continue to the Production
> Deployment section.
---
## Production Deployment
The base `docker-compose.yml` runs everything in **development mode** (Vite
dev server, NestJS in watch mode, no connection pooling). This is fine for
local development but will fail under even light production load.
`docker-compose.prod.yml` provides a production overlay that fixes this:
| Component | Dev mode | Production mode |
|-----------|----------|-----------------|
| Frontend | Vite dev server (single-threaded, HMR) | Static build served by nginx |
| Backend | `nest start --watch` (ts-node, file watcher) | Compiled JS, clustered across CPU cores |
| DB pooling | None (new connection per query) | Pool of 30 reusable connections |
| Postgres | Default config (100 connections) | Tuned: 200 connections, optimized buffers |
| Nginx | Basic proxy | Keepalive upstreams, buffering, rate limiting |
| Restart | None | `unless-stopped` on all services |
### Deploy for production
```bash
cd /opt/hoa-ledgeriq
# Ensure .env has NODE_ENV=production and strong secrets
nano .env
# Build and start with the production overlay
docker compose -f docker-compose.yml -f docker-compose.prod.yml up -d --build
```
To add SSL on top of the production stack:
```bash
docker compose \
-f docker-compose.yml \
-f docker-compose.prod.yml \
-f docker-compose.ssl.yml \
up -d --build
```
> **Tip:** Create a shell alias to avoid typing the compose files every time:
> ```bash
> echo 'alias dc="docker compose -f docker-compose.yml -f docker-compose.prod.yml"' >> ~/.bashrc
> source ~/.bashrc
> dc up -d --build
> ```
### What the production overlay does
**Backend (`backend/Dockerfile`)**
- Multi-stage build: compiles TypeScript once, runs `node dist/main`
- No dev dependencies shipped (smaller image, faster startup)
- Node.js clustering: forks one worker per CPU core (up to 4)
- Connection pool: 30 reusable PostgreSQL connections shared across workers
**Frontend (`frontend/Dockerfile`)**
- Multi-stage build: `npm run build` produces optimized static assets
- Served by a lightweight nginx container (not Vite)
- Static assets cached with immutable headers (Vite filename hashing)
**Nginx (`nginx/production.conf`)**
- Keepalive connections to upstream services (connection reuse)
- Proxy buffering to prevent 502s during slow responses
- Rate limiting on API routes (10 req/s per IP, burst 30)
- Proper timeouts tuned per endpoint type
**PostgreSQL**
- `max_connections=200` (up from default 100)
- `shared_buffers=256MB`, `effective_cache_size=512MB`
- Tuned checkpoint, WAL, and memory settings
### Capacity guidelines
With the production stack on a 2-core / 4GB server:
| Metric | Expected capacity |
|--------|-------------------|
| Concurrent users | 50–100 |
| API requests/sec | ~200 |
| DB connections | 30 per backend worker × workers |
| Frontend serving | Static files, effectively unlimited |
For higher loads, scale the backend horizontally with Docker Swarm or
Kubernetes replicas.
--- ---

22
frontend/Dockerfile Normal file
View File

@@ -0,0 +1,22 @@
# ---- Production Dockerfile for React frontend ----
# Multi-stage build: compile the React app to static assets with Vite,
# then serve them from a minimal nginx image (no Node.js at runtime).

# Stage 1: Build
FROM node:20-alpine AS builder
WORKDIR /app
# Full dependency install — Vite and the toolchain live in devDeps
COPY package*.json ./
RUN npm ci
COPY . .
# NOTE(review): Vite inlines VITE_* environment variables at build time; if
# the app reads any, they must be supplied here as build args — confirm none
# are required for production.
RUN npm run build

# Stage 2: Serve with nginx
FROM nginx:alpine
# Copy the built static files
COPY --from=builder /app/dist /usr/share/nginx/html
# Copy a small nginx config for SPA routing
COPY nginx.conf /etc/nginx/conf.d/default.conf
EXPOSE 80
# Foreground nginx so the container stays alive under Docker supervision
CMD ["nginx", "-g", "daemon off;"]

20
frontend/nginx.conf Normal file
View File

@@ -0,0 +1,20 @@
# Minimal nginx config for serving the React SPA inside the frontend container.
# The outer nginx reverse proxy forwards non-API requests here.
server {
    listen 80;
    server_name _;

    root /usr/share/nginx/html;
    index index.html;

    # Serve static assets with long cache (Vite hashes filenames, so any
    # content change produces a new URL — safe to mark immutable)
    location /assets/ {
        expires 1y;
        add_header Cache-Control "public, immutable";
    }

    # SPA fallback — any non-file route returns index.html.
    # index.html itself must NOT be cached long-term: a cached stale shell
    # would reference old (deleted) hashed asset files after a redeploy.
    # "no-cache" still allows conditional revalidation, so it stays cheap.
    location / {
        add_header Cache-Control "no-cache";
        try_files $uri $uri/ /index.html;
    }
}

97
nginx/production.conf Normal file
View File

@@ -0,0 +1,97 @@
# Production reverse-proxy config. This file is included from
# /etc/nginx/conf.d/, so every top-level directive here lives in the
# `http` context.

upstream backend {
    server backend:3000;
    keepalive 32; # reuse connections to backend
}

upstream frontend {
    server frontend:80;
    keepalive 16;
}

# --- Rate limit zone (10 req/s per IP for API) ---
# FIX: limit_req_zone is only valid in the `http` context. It was previously
# declared inside the HTTPS `server` block, which makes nginx fail to start
# with: '"limit_req_zone" directive is not allowed here'. Declaring it at
# file top level (http context via conf.d include) is the correct placement.
limit_req_zone $binary_remote_addr zone=api_limit:10m rate=10r/s;

# Shared proxy settings — inherited by every server/location below
proxy_http_version 1.1;
proxy_set_header Connection ""; # empty Connection header enables upstream keepalive
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;

# Buffer settings — prevent 502s when backend is slow to respond
proxy_buffering on;
proxy_buffer_size 16k;
proxy_buffers 8 16k;
proxy_busy_buffers_size 32k;

# Redirect HTTP → HTTPS (ACME challenges excepted)
server {
    listen 80;
    server_name _;

    location /.well-known/acme-challenge/ {
        root /var/www/certbot;
    }

    location / {
        return 301 https://$host$request_uri;
    }
}

# HTTPS server
server {
    listen 443 ssl;
    # Replace with your hostname:
    server_name staging.example.com;

    # --- TLS certificates ---
    ssl_certificate /etc/letsencrypt/live/staging.example.com/fullchain.pem;
    ssl_certificate_key /etc/letsencrypt/live/staging.example.com/privkey.pem;
    ssl_protocols TLSv1.2 TLSv1.3;
    ssl_ciphers 'ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384';
    ssl_prefer_server_ciphers on;
    ssl_session_cache shared:SSL:10m;
    ssl_session_timeout 1d;
    ssl_session_tickets off;

    add_header Strict-Transport-Security "max-age=63072000; includeSubDomains" always;
    add_header X-Content-Type-Options nosniff always;
    add_header X-Frame-Options SAMEORIGIN always;

    # --- API routes → backend ---
    location /api/ {
        limit_req zone=api_limit burst=30 nodelay;
        proxy_pass http://backend;
        proxy_read_timeout 30s;
        proxy_connect_timeout 5s;
        proxy_send_timeout 15s;
    }

    # AI endpoints → longer timeouts (longest-prefix match wins over /api/,
    # so these bypass the rate limit — NOTE(review): confirm that is intended)
    location /api/investment-planning/recommendations {
        proxy_pass http://backend;
        proxy_read_timeout 180s;
        proxy_connect_timeout 10s;
        proxy_send_timeout 30s;
    }

    location /api/health-scores/calculate {
        proxy_pass http://backend;
        proxy_read_timeout 180s;
        proxy_connect_timeout 10s;
        proxy_send_timeout 30s;
    }

    # --- Static frontend → built React assets ---
    location / {
        proxy_pass http://frontend;
        proxy_read_timeout 10s;
        proxy_connect_timeout 5s;
        # Cache static assets aggressively at the proxy level
        proxy_cache_bypass $http_upgrade;
    }
}
}