Node.js 生产环境异常处理与进程守护实战：从崩溃到自愈-是一个基于内容分享折腾笔记的网站序与诗

为什么 Node.js 进程会崩溃？

Node.js 单线程模型的致命弱点：一个未捕获的异常就能让整个进程退出。在生产环境中，这意味着服务瞬间不可用，用户看到 502 错误，业务中断。很多人觉得“我的代码没有 bug”，但现实是——第三方库抛异常、内存泄漏导致 OOM、网络波动引发连锁反应——这些问题你无法完全预防。

所以，真正的问题不是“如何避免崩溃”，而是“崩溃之后怎么办”。

第一道防线：process 全局异常捕获

uncaughtException — 最后一道墙

当异常一路冒泡到事件循环都没有被 catch，uncaughtException 就会被触发。这是你最后的抢救机会：

// Last-resort handler for exceptions nothing else caught:
// log the error, send an alert, then exit with a failure code.
process.on('uncaughtException', function onUncaughtException(err) {
  console.error('[FATAL] 未捕获异常:', err);

  // Notify the alerting channel before the process goes away.
  const alertText = `进程即将退出: ${err.message}`;
  alertService.send(alertText);

  // Exit with code 1; restarting is left to the process manager.
  process.exit(1);
});

注意：Node.js 官方文档明确建议，uncaughtException 触发后应该退出进程。因为你无法确认应用状态是否完好，继续运行可能导致数据损坏或更严重的错误。捕获它是为了“善后”，不是为了“续命”。

unhandledRejection — Promise 的隐形杀手

比 uncaughtException 更隐蔽的是未处理的 Promise rejection。在 Node.js 15+ 中，未处理的 rejection 默认会导致进程退出：

// Handler for promise rejections that no one handled:
// log the reason, send an alert, then exit with a failure code.
process.on('unhandledRejection', function onUnhandledRejection(reason, promise) {
  console.error('[FATAL] 未处理的Promise拒绝:', reason);

  const alertText = `未处理拒绝: ${reason}`;
  alertService.send(alertText);

  process.exit(1);
});

很多人在 async 函数里忘了 try-catch，或者 .then() 忘了加 .catch()，结果 rejection 静默丢失，直到某天线上炸了才发现。

第二道防线：优雅退出（Graceful Shutdown）

当进程要退出时（无论是异常还是收到 SIGTERM），你需要做好清理工作：停止接收新请求、处理完已有请求、关闭 HTTP 服务器，最后关闭数据库连接。

// Start listening on port 3000 and track whether a shutdown is in progress.
const server = app.listen(3000);
let isShuttingDown = false;

// Gate middleware: once shutdown has begun, answer 503 instead of routing the request.
app.use((req, res, next) => {
  if (!isShuttingDown) {
    next();
    return;
  }
  res.status(503).json({ error: '服务正在关闭' });
});

/**
 * Shut the service down in order: stop accepting connections, wait for the
 * HTTP server to close, close the database, then exit.
 *
 * Exits with code 0 when the database closed cleanly, and with code 1 when
 * closing the database failed or the whole sequence took longer than 10 s.
 * Calling it again while a shutdown is already running is a no-op, so several
 * signals (or several registered handlers) cannot close resources twice.
 *
 * @param {string} signal - Name of the signal that triggered the shutdown; used for logging only.
 * @returns {void}
 */
function gracefulShutdown(signal) {
  // Re-entrancy guard: only the first call runs the shutdown sequence.
  if (isShuttingDown) {
    return;
  }
  isShuttingDown = true;
  console.log(`收到 ${signal}，开始优雅退出...`);

  // Safety net: force the exit if the graceful path does not finish in time.
  const forceExitTimer = setTimeout(() => {
    console.error('优雅退出超时，强制退出');
    process.exit(1);
  }, 10000);

  const exit = (code) => {
    clearTimeout(forceExitTimer);
    process.exit(code);
  };

  // Stop accepting new connections; the callback runs once the server has closed.
  server.close((closeErr) => {
    if (closeErr) {
      console.error('HTTP 服务器关闭失败:', closeErr);
    } else {
      console.log('HTTP 服务器已关闭');
    }

    // Promise.resolve().then(...) also captures a synchronous throw from database.close().
    Promise.resolve()
      .then(() => database.close())
      .then(
        () => {
          console.log('数据库连接已关闭');
          exit(0);
        },
        (dbErr) => {
          console.error('数据库连接关闭失败:', dbErr);
          exit(1);
        }
      );
  });
}

// Route both termination signals through the same shutdown routine.
for (const signal of ['SIGTERM', 'SIGINT']) {
  process.on(signal, () => gracefulShutdown(signal));
}

第三道防线：PM2 进程守护

单靠代码里的异常处理还不够——如果进程退出了，谁来重启它？答案就是进程管理器，PM2 是最流行的选择。

安装与启动

# Install PM2 globally
npm install --global pm2

# Start the app in cluster mode, using all CPU cores
pm2 start app.js --instances max --name my-api

# Show process status
pm2 status

# Show the logs of my-api
pm2 logs my-api

ecosystem.config.js 配置文件

生产环境推荐使用配置文件，一次性搞定所有参数：

module.exports = {
  apps: [{
    name: 'my-api',
    script: './dist/app.js',
    instances: 'max',        // 集群模式，按 CPU 核心数启动
    exec_mode: 'cluster',
    watch: false,
    max_memory_restart: '512M',  // 内存超限自动重启
    env_production: {
      NODE_ENV: 'production',
      PORT: 3000
    },
    // 异常自动重启策略
    exp_backoff_restart_delay: 100,  // 指数退避，避免快速重启循环
    max_restarts: 10,                // 1分钟内最多重启10次
    restart_time: 60000,
    // 优雅退出
    kill_timeout: 8000,              // 给进程8秒优雅退出
    listen_timeout: 10000,
    // 日志
    error_file: './logs/error.log',
    out_file: './logs/out.log',
    merge_logs: true,
    log_date_format: 'YYYY-MM-DD HH:mm:ss Z'
  }]
};

PM2 的关键能力

自动重启：进程崩溃后自动拉起，无需人工干预
零停机重载：pm2 reload my-api 逐个重启集群实例
内存监控：超出 max_memory_restart 自动重启，缓解内存泄漏带来的影响
指数退避：连续崩溃时逐渐增大重启间隔，避免重启风暴
日志管理：统一收集 stdout/stderr，配合 pm2-logrotate 模块可实现日志轮转

第四道防线：健康检查与自动摘除

在 Kubernetes 或负载均衡环境中，你需要健康检查接口，让编排系统知道服务是否正常：

const express = require('express');
const app = express();
let isReady = true;

// Liveness probe: answers 200 "ok" unconditionally.
app.get('/healthz', function livenessProbe(req, res) {
  res.status(200);
  res.send('ok');
});

// Readiness probe: reports whether this instance can currently serve traffic.
// Answers 503 while the instance is marked not ready or when a critical
// dependency (database, cache) fails its ping; answers 200 "ready" otherwise.
app.get('/readyz', async (req, res) => {
  if (!isReady) {
    return res.status(503).send('not ready');
  }
  try {
    // The two pings are independent, so run them concurrently;
    // the probe fails if either one rejects.
    await Promise.all([database.ping(), redis.ping()]);
    return res.status(200).send('ready');
  } catch (err) {
    // Log the cause so a failing probe can be diagnosed from the logs.
    console.error('就绪检查失败:', err);
    return res.status(503).send('dependencies unavailable');
  }
});

// On SIGTERM, mark the instance as not ready first, then begin the shutdown sequence.
process.on('SIGTERM', function onSigterm() {
  // The readiness probe answers 503 from this point on.
  isReady = false;
  gracefulShutdown('SIGTERM');
});

第五道防线：全局错误中间件（Express/Koa）

框架层面的错误兜底，确保每个请求都有错误响应，不会让异常变成未捕获错误：

// Express error middleware (must be registered after all routes).
// Logs the error and turns it into a JSON error response:
//   - AppError instances use their own statusCode, code and message;
//   - anything else becomes a 500, with the message hidden in production.
app.use((err, req, res, next) => {
  console.error('请求处理异常:', err);

  // If the response has already started, a new status/body cannot be sent.
  // Delegate to Express's default error handler, which closes the connection.
  if (res.headersSent) {
    return next(err);
  }

  // Known business errors
  if (err instanceof AppError) {
    return res.status(err.statusCode).json({
      code: err.code,
      message: err.message
    });
  }

  // Unknown errors: do not leak internal details in production
  return res.status(500).json({
    code: 'INTERNAL_ERROR',
    message: process.env.NODE_ENV === 'production'
      ? '服务器内部错误'
      : err.message
  });
});

// Koa version: app-level 'error' listener that logs every error and alerts on server errors.
app.on('error', (err, ctx) => {
  console.error(`[${ctx.method}] ${ctx.url}:`, err);

  // When Koa's default handler emits 'error', ctx.status has not yet been set
  // from the error, so derive the status from the error itself
  // (status / statusCode, defaulting to 500). ctx.status is still checked for
  // middleware that sets it before emitting the event manually.
  const declaredStatus = err.status || err.statusCode;
  const errorStatus = typeof declaredStatus === 'number' ? declaredStatus : 500;

  if (errorStatus >= 500 || ctx.status >= 500) {
    alertService.send(`服务异常: ${ctx.method} ${ctx.url} - ${err.message}`);
  }
});

实战：一个完整的守护方案

把以上所有防线组合起来，形成一个完整的守护体系：

// guard.js - process guard module
/**
 * Installs process-level handlers (uncaught exceptions, unhandled rejections,
 * SIGTERM/SIGINT, warnings) and runs an ordered graceful shutdown.
 */
class ProcessGuard {
  /**
   * @param {object} [options]
   * @param {(message: string) => void} [options.alertFn] - Receives alert messages; defaults to console.error.
   * @param {number} [options.shutdownTimeout] - Milliseconds before a graceful shutdown is forced; defaults to 10000.
   */
  constructor(options = {}) {
    this.isShuttingDown = false;
    this.server = null;
    this.cleanupFns = [];
    this.alertFn = options.alertFn || console.error;
    this.shutdownTimeout = options.shutdownTimeout || 10000;
  }

  /**
   * Register the server to close during shutdown.
   * @param {{ close: (callback: (err?: Error) => void) => unknown }} server
   * @returns {this} The guard, for chaining.
   */
  registerServer(server) {
    this.server = server;
    return this;
  }

  /**
   * Add a cleanup function; functions run in registration order during shutdown.
   * @param {() => unknown} fn - May return a promise, which is awaited.
   * @returns {this} The guard, for chaining.
   */
  onCleanup(fn) {
    this.cleanupFns.push(fn);
    return this;
  }

  /**
   * Attach the process-level listeners.
   * @returns {this} The guard, for chaining.
   */
  install() {
    // Uncaught exception: alert, then shut down with a failure code.
    // `err?.stack ?? err` also copes with thrown non-Error values.
    process.on('uncaughtException', (err) => {
      this.alertFn(`[FATAL] 未捕获异常: ${err?.stack ?? err}`);
      this.gracefulShutdown(1);
    });

    // Unhandled promise rejection: include the stack when the reason is an Error.
    process.on('unhandledRejection', (reason) => {
      this.alertFn(`[FATAL] 未处理Rejection: ${reason?.stack ?? reason}`);
      this.gracefulShutdown(1);
    });

    // Termination signals: normal shutdown.
    process.on('SIGTERM', () => this.gracefulShutdown(0));
    process.on('SIGINT', () => this.gracefulShutdown(0));

    // Listener-count warnings are reported as a possible leak.
    process.on('warning', (warning) => {
      if (warning.name === 'MaxListenersExceededWarning') {
        this.alertFn(`[WARN] 可能存在内存泄漏: ${warning.message}`);
      }
    });

    return this;
  }

  /**
   * Run the shutdown sequence once: close the registered server and wait for
   * it to finish, run every cleanup function, then exit with `exitCode`.
   * If the sequence exceeds `shutdownTimeout` the process exits with code 1.
   * Calls made while a shutdown is already in progress return immediately.
   *
   * @param {number} exitCode - Exit code to use when the sequence completes.
   * @returns {Promise<void>}
   */
  async gracefulShutdown(exitCode) {
    if (this.isShuttingDown) return;
    this.isShuttingDown = true;
    console.log('开始优雅退出...');

    const timer = setTimeout(() => {
      console.error('优雅退出超时，强制退出');
      process.exit(1);
    }, this.shutdownTimeout);

    // Close the server first and wait for its callback, so the cleanup
    // functions below do not run while the server is still open.
    if (this.server) {
      try {
        await new Promise((resolve, reject) => {
          this.server.close((err) => (err ? reject(err) : resolve()));
        });
      } catch (e) {
        console.error('HTTP 服务器关闭失败:', e);
      }
    }

    // Run cleanup functions in order; one failure must not block the rest.
    for (const fn of this.cleanupFns) {
      try {
        await fn();
      } catch (e) {
        console.error('清理函数执行失败:', e);
      }
    }

    clearTimeout(timer);
    process.exit(exitCode);
  }
}

// Usage
const guardOptions = {
  shutdownTimeout: 8000,
  // Alert sink: print locally, then post to the alert webhook on a best-effort basis.
  alertFn(msg) {
    console.error(msg);
    // Send to your alerting system; delivery failures are deliberately ignored.
    const delivery = fetch('https://your-alert-webhook', {
      method: 'POST',
      body: JSON.stringify({ text: msg })
    });
    delivery.catch(() => {});
  }
};
const guard = new ProcessGuard(guardOptions);

// Register the server and the resources to release, then attach the handlers.
guard.registerServer(server);
guard.onCleanup(() => database.close());
guard.onCleanup(() => redis.disconnect());
guard.install();

日志与告警：出问题时第一时间知道

进程守护只是“保活”，你还需要知道“为什么挂了”。推荐使用结构化日志：

const pino = require('pino');
// Logger configuration: level comes from LOG_LEVEL (falling back to 'info'),
// the level formatter emits the level label, and timestamps use pino's isoTime function.
const loggerOptions = {
  level: process.env.LOG_LEVEL || 'info',
  formatters: {
    level(label) {
      return { level: label };
    }
  },
  timestamp: pino.stdTimeFunctions.isoTime
};
const logger = pino(loggerOptions);

// Use the logger in exception handling: record the uncaught exception at
// fatal level, then exit with code 1.
process.on('uncaughtException', function logFatalAndExit(err) {
  const details = { err, stack: err.stack };
  logger.fatal(details, '未捕获异常，进程即将退出');
  process.exit(1);
});

// Request logging middleware: writes one info entry per request when the
// response emits 'finish', including the elapsed time in milliseconds.
app.use(function requestLogger(req, res, next) {
  const startedAt = Date.now();

  const logCompletion = () => {
    const entry = {
      method: req.method,
      url: req.url,
      status: res.statusCode,
      duration: Date.now() - startedAt,
      ip: req.ip
    };
    logger.info(entry, '请求完成');
  };

  res.on('finish', logCompletion);
  next();
});