SMTP 服务器搭建完全指南 / 第 11 章:监控与日志分析
第 11 章:监控与日志分析
你无法管理你看不到的东西——监控是邮件服务器运维的眼睛。
11.1 邮件队列管理
11.1.1 查看队列状态
# Show the mail queue
mailq
# Detailed queue listing
postqueue -p
# Queue statistics
qshape
# Disk usage of each queue directory
du -sh \
  /var/spool/postfix/active \
  /var/spool/postfix/deferred \
  /var/spool/postfix/corrupt \
  /var/spool/postfix/hold \
  /var/spool/postfix/maildrop \
  /var/spool/postfix/incoming
# Count the files in one specific queue (here: deferred)
find /var/spool/postfix/deferred -type f | wc -l
11.1.2 队列操作命令
| 命令 | 作用 |
|---|---|
| mailq | 显示队列中的邮件 |
| postqueue -p | 详细显示队列 |
| postqueue -f | 刷新队列(尝试立即发送) |
| postqueue -s domain | 刷新特定域名的队列 |
| postsuper -d ID | 删除特定邮件 |
| postsuper -d ALL | 删除所有邮件 |
| postsuper -h ID | 暂停特定邮件 |
| postsuper -H ID | 恢复特定邮件 |
| postcat -q ID | 查看邮件内容 |
11.1.3 队列管理脚本
#!/bin/bash
# mail-queue-status.sh — Postfix mail queue status report
#
# Prints message counts for the active, deferred and hold queues, a warning
# when the deferred queue is over the threshold, the size of the queue
# directories, the five oldest deferred queue files and the qshape summary of
# the deferred queue.

QUEUE_ROOT="/var/spool/postfix"
DEFERRED_WARN_THRESHOLD=1000

# Print the number of regular files below directory $1.
# Prints 0 when the directory is missing or cannot be read.
count_queue_files() {
  find "$1" -type f 2>/dev/null | wc -l | tr -d '[:space:]'
}

# Produce the whole report on stdout.
main() {
  local active deferred hold total

  echo "=== 邮件队列状态 $(date) ==="
  echo ""

  # Without read access every count below would silently be 0, which looks
  # like a healthy queue; say so instead of hiding it.
  if [[ ! -r "$QUEUE_ROOT/deferred" ]]; then
    echo "warning: cannot read $QUEUE_ROOT/deferred (run as root?)" >&2
  fi

  # Queue counters. The total is derived from the three counters so that the
  # printed lines always add up (a second scan could see a different state).
  active=$(count_queue_files "$QUEUE_ROOT/active")
  deferred=$(count_queue_files "$QUEUE_ROOT/deferred")
  hold=$(count_queue_files "$QUEUE_ROOT/hold")
  total=$((active + deferred + hold))

  echo "队列统计:"
  echo " 总计: $total"
  echo " 活跃: $active"
  echo " 延迟: $deferred"
  echo " 暂停: $hold"
  echo ""

  # Alert threshold
  if ((deferred > DEFERRED_WARN_THRESHOLD)); then
    echo "⚠️ 警告:延迟队列超过 ${DEFERRED_WARN_THRESHOLD} 封!"
  fi

  # Queue directory sizes
  echo "队列目录大小:"
  du -sh "$QUEUE_ROOT/active" "$QUEUE_ROOT/deferred" "$QUEUE_ROOT/hold" 2>/dev/null
  echo ""

  # Oldest deferred queue files (sorted by modification time; GNU find)
  echo "最旧的延迟邮件:"
  find "$QUEUE_ROOT/deferred" -type f -printf '%T+ %p\n' 2>/dev/null | sort | head -5
  echo ""

  # Destination domain distribution of the deferred queue
  echo "延迟邮件目标域名 TOP 10:"
  qshape deferred 2>/dev/null | head -12
}

main "$@"
11.1.4 队列清理策略
# /etc/postfix/main.cf — 队列配置
# 队列扫描间隔
queue_run_delay = 300s
# 最小重试间隔(延迟邮件两次投递尝试之间的最短时间)
minimal_backoff_time = 300s
# 最大重试间隔(延迟邮件两次投递尝试之间的最长时间)
maximal_backoff_time = 4000s
# 邮件最大生命周期
maximal_queue_lifetime = 5d
# 退信最大生命周期
bounce_queue_lifetime = 2d
11.2 日志分析
11.2.1 Postfix 日志格式
# 日志文件位置
/var/log/mail.log # Debian/Ubuntu
/var/log/maillog # RHEL/CentOS
# 日志示例
May 10 10:15:23 mail postfix/smtpd[12345]: connect from unknown[203.0.113.50]
May 10 10:15:24 mail postfix/smtpd[12345]: 1A2B3C4D5E: client=unknown[203.0.113.50]
May 10 10:15:24 mail postfix/cleanup[12346]: 1A2B3C4D5E: message-id=<abc@sender.com>
May 10 10:15:24 mail postfix/qmgr[12347]: 1A2B3C4D5E: from=<sender@example.com>, size=1234, nrcpt=1
May 10 10:15:25 mail postfix/smtp[12348]: 1A2B3C4D5E: to=<user@example.com>, relay=mail.example.com[203.0.113.10]:25, delay=2, status=sent (250 OK)
May 10 10:15:25 mail postfix/qmgr[12347]: 1A2B3C4D5E: removed
11.2.2 常用日志分析命令
# Log file to analyse (Debian/Ubuntu path; RHEL/CentOS uses /var/log/maillog)
MAIL_LOG="/var/log/mail.log"
# Today's date the way the sample log lines above print it: English month
# abbreviation plus a space-padded day ("May  5", not "May 05"). LC_ALL=C
# keeps the month name English regardless of the shell's locale.
TODAY=$(LC_ALL=C date '+%b %e')

# Show today's mail log entries
grep -- "^${TODAY} " "$MAIL_LOG"
# Count deliveries per status (the last field of such a line is only the tail
# of the remote server's reply, so extract the status= token itself)
grep -o 'status=[[:alpha:]]*' "$MAIL_LOG" | sort | uniq -c | sort -rn
# Show failed deliveries; rejected mail is logged as "reject:", not as a
# status= value
grep -E 'status=(bounced|deferred|expired)|reject:' "$MAIL_LOG"
# Follow one message through the log by its queue ID
grep -F "1A2B3C4D5E" "$MAIL_LOG"
# Top 10 connecting clients (": connect from" avoids matching "disconnect from")
grep ": connect from" "$MAIL_LOG" | awk '{print $NF}' | sort | uniq -c | sort -rn | head -10
# Show rejected connections
grep "reject:" "$MAIL_LOG"
# Show the most recent TLS-related entries
grep "TLS" "$MAIL_LOG" | tail -20
# Show authentication failures
grep "auth failed\|authentication failed" "$MAIL_LOG"
# Log entries per hour: field 3 is HH:MM:SS (field 2 is the day of month)
awk '{print $3}' "$MAIL_LOG" | cut -d: -f1 | uniq -c
11.2.3 日志分析脚本
#!/bin/bash
# mail-log-stats.sh — Postfix mail log statistics for today
#
# Usage: mail-log-stats.sh [LOG_FILE]    (default: /var/log/mail.log)
#
# Every figure in the report is restricted to today's log lines, matching the
# date printed in the report header.

LOG_FILE="${1:-/var/log/mail.log}"
# Traditional syslog timestamps use an English month abbreviation and a
# space-padded day ("May  5"); %d would give "May 05" and match nothing on
# the first nine days of a month.
DATE=$(LC_ALL=C date '+%b %e')

# Print the lines of $LOG_FILE whose timestamp starts with $DATE.
today_lines() {
  grep -- "^${DATE} " "$LOG_FILE" 2>/dev/null
}

# Print how many of today's lines contain the fixed string $1.
count_today() {
  today_lines | grep -c -F -- "$1"
}

# Read log lines on stdin; print the $1 most frequent DSN codes with counts.
# Postfix writes the code unquoted, e.g. "dsn=5.1.1".
top_dsn_codes() {
  grep -oE 'dsn=[0-9]+\.[0-9]+\.[0-9]+' | sort | uniq -c | sort -rn | head -n "$1"
}

# Read log lines on stdin; print the $1 most frequent recipient domains with
# counts. The leading character class keeps "orig_to=<...>" from being
# counted as a second recipient.
top_rcpt_domains() {
  grep -oE '(^|[^[:alnum:]_])to=<[^>]*>' \
    | sed 's/^.*to=<//; s/>$//' \
    | cut -d@ -f2 \
    | sort | uniq -c | sort -rn | head -n "$1"
}

# Print the report; returns 1 when the log file cannot be read.
main() {
  echo "=== 邮件日志统计 ($DATE) ==="
  echo ""

  if [[ ! -r "$LOG_FILE" ]]; then
    echo "error: cannot read $LOG_FILE" >&2
    return 1
  fi

  # Connections (": connect from" does not match "disconnect from")
  echo "连接数: $(count_today ': connect from')"

  # Delivery results; rejections are logged as "reject:", never as a status=
  echo "发送成功: $(count_today 'status=sent')"
  echo "退信: $(count_today 'status=bounced')"
  echo "延迟: $(count_today 'status=deferred')"
  echo "拒绝: $(count_today 'reject:')"
  echo ""

  # Top 5 bounce reasons (by DSN code)
  echo "退信原因 TOP 5:"
  today_lines | grep -F 'status=bounced' | top_dsn_codes 5
  echo ""

  # Top 5 recipient domains of rejected mail
  echo "被拒绝的域名 TOP 5:"
  today_lines | grep -F 'reject:' | top_rcpt_domains 5
  echo ""

  # Top 5 recipient domains of delivered mail
  echo "发送量最大的域名 TOP 5:"
  today_lines | grep -F 'status=sent' | top_rcpt_domains 5
}

main "$@"
11.3 系统监控
11.3.1 Postfix 服务监控
# Show the Postfix service state
sudo systemctl status postfix
# List the key Postfix daemons by exact process name (a "ps | grep" pipeline
# would also list the grep itself and any unrelated process containing
# "master" in its command line)
ps -C master,qmgr,smtpd,cleanup -o pid,user,etime,args
# Show listeners on exactly ports 25, 465 and 587; the trailing whitespace
# match keeps ports such as 2525 or 5870 out of the result
sudo ss -tlnp | grep -E ':(25|465|587)[[:space:]]'
# Check the Postfix installation (the postfix command is reserved for root)
sudo postfix check
11.3.2 健康检查脚本
#!/bin/bash
# mail-health-check.sh — mail server health check
#
# Runs eight checks: Postfix service, Dovecot service, listening ports,
# root filesystem usage, deferred queue size, TLS certificate expiry,
# today's log errors and memory usage.
#
# Exit status: 0 = everything OK, 1 = warnings only, 2 = at least one error.

ERRORS=0
WARNINGS=0
MAIL_LOG="/var/log/mail.log"
CERT_FILE="/etc/letsencrypt/live/mail.example.com/cert.pem"

# Result reporters: print one result line; warn/fail also bump the counters.
pass() { echo " ✅ $*"; }
warn() { echo " ⚠️ $*"; WARNINGS=$((WARNINGS + 1)); }
fail() { echo " ❌ $*"; ERRORS=$((ERRORS + 1)); }

# Succeed when $1 is a non-empty string of decimal digits.
is_uint() {
  [[ "$1" =~ ^[0-9]+$ ]]
}

# Print "ok" when $1 < $2, "warn" when $1 < $3, otherwise "crit".
rising_level() {
  if (($1 < $2)); then
    echo ok
  elif (($1 < $3)); then
    echo warn
  else
    echo crit
  fi
}

# Print today's date as a traditional syslog timestamp prefix: English month
# abbreviation and space-padded day ("May  5"), independent of the locale.
syslog_today() {
  LC_ALL=C date '+%b %e'
}

# $1 = systemd unit name, $2 = display name used in the messages.
check_service() {
  if systemctl is-active --quiet "$1"; then
    pass "$2 运行正常"
  else
    fail "$2 未运行!"
  fi
}

# Check that SMTP (25), submission (587) and IMAPS (993) are listening.
check_ports() {
  local listeners port
  listeners=$(ss -tln 2>/dev/null)
  for port in 25 587 993; do
    if grep -q ":${port} " <<<"$listeners"; then
      pass "端口 $port 监听中"
    else
      fail "端口 $port 未监听!"
    fi
  done
}

# Usage of the root filesystem: warn from 80 %, error from 90 %.
check_disk() {
  local usage
  # -P keeps every filesystem on a single line even for long device names.
  usage=$(df -P / | awk 'NR == 2 {gsub(/%/, "", $5); print $5}')
  if ! is_uint "$usage"; then
    fail "无法获取磁盘使用率!"
    return
  fi
  case "$(rising_level "$usage" 80 90)" in
    ok) pass "磁盘使用率: ${usage}%" ;;
    warn) warn "磁盘使用率较高: ${usage}%" ;;
    *) fail "磁盘空间严重不足: ${usage}%!" ;;
  esac
}

# Deferred queue size: warn from 100 messages, error from 1000.
check_queue() {
  local queued
  queued=$(find /var/spool/postfix/deferred -type f 2>/dev/null | wc -l | tr -d '[:space:]')
  case "$(rising_level "$queued" 100 1000)" in
    ok) pass "延迟队列: $queued 封" ;;
    warn) warn "延迟队列较大: $queued 封" ;;
    *) fail "延迟队列过大: $queued 封!" ;;
  esac
}

# Certificate lifetime: OK above 30 days, warning above 7, error otherwise.
check_cert() {
  local not_after expiry_epoch days_left
  if [[ ! -f "$CERT_FILE" ]]; then
    fail "证书文件不存在!"
    return
  fi
  not_after=$(openssl x509 -enddate -noout -in "$CERT_FILE" 2>/dev/null | cut -d= -f2)
  # An empty date string must be caught here: GNU date would otherwise parse
  # it as "today" and report a certificate that expires now.
  if [[ -n "$not_after" ]]; then
    expiry_epoch=$(date -d "$not_after" +%s 2>/dev/null)
  fi
  if ! is_uint "$expiry_epoch"; then
    fail "无法解析证书有效期!"
    return
  fi
  days_left=$(((expiry_epoch - $(date +%s)) / 86400))
  if ((days_left > 30)); then
    pass "证书剩余 $days_left 天"
  elif ((days_left > 7)); then
    warn "证书即将过期: 剩余 $days_left 天"
  else
    fail "证书即将过期: 剩余 $days_left 天!"
  fi
}

# Today's error/fatal/panic log lines: warn from 1, error from 10.
check_log_errors() {
  local count
  # An unreadable log must not be reported as "no errors today".
  if [[ ! -r "$MAIL_LOG" ]]; then
    warn "无法读取日志 $MAIL_LOG"
    return
  fi
  count=$(grep -- "^$(syslog_today) " "$MAIL_LOG" | grep -ciE 'error|fatal|panic')
  case "$(rising_level "$count" 1 10)" in
    ok) pass "今日无严重错误" ;;
    warn) warn "今日有 $count 条错误" ;;
    *) fail "今日有 $count 条错误!" ;;
  esac
}

# Memory usage (used / total): warn from 80 %, error from 90 %.
check_memory() {
  local usage
  # LC_ALL=C keeps the "Mem:" row label untranslated.
  usage=$(LC_ALL=C free | awk '/^Mem:/ && $2 > 0 {printf "%.0f", $3 / $2 * 100}')
  if ! is_uint "$usage"; then
    fail "无法获取内存使用率!"
    return
  fi
  case "$(rising_level "$usage" 80 90)" in
    ok) pass "内存使用率: ${usage}%" ;;
    warn) warn "内存使用率较高: ${usage}%" ;;
    *) fail "内存使用率过高: ${usage}%!" ;;
  esac
}

# Run all checks and return the overall status (0, 1 or 2).
main() {
  echo "=== 邮件服务器健康检查 $(date) ==="
  echo ""
  echo "[1/8] 检查 Postfix 服务..."
  check_service postfix Postfix
  echo "[2/8] 检查 Dovecot 服务..."
  check_service dovecot Dovecot
  echo "[3/8] 检查端口监听..."
  check_ports
  echo "[4/8] 检查磁盘空间..."
  check_disk
  echo "[5/8] 检查邮件队列..."
  check_queue
  echo "[6/8] 检查 TLS 证书..."
  check_cert
  echo "[7/8] 检查最近日志错误..."
  check_log_errors
  echo "[8/8] 检查内存使用..."
  check_memory
  echo ""
  echo "=== 检查完成 ==="
  echo "错误: $ERRORS | 警告: $WARNINGS"
  if ((ERRORS > 0)); then
    return 2
  elif ((WARNINGS > 0)); then
    return 1
  fi
  return 0
}

# main is the last command, so its return value is the script's exit status.
main "$@"
11.4 Prometheus 监控集成
11.4.1 安装 Postfix Exporter
# Download postfix_exporter
wget https://github.com/kumina/postfix_exporter/releases/download/0.3.0/postfix_exporter-0.3.0.linux-amd64.tar.gz
tar xzf postfix_exporter-0.3.0.linux-amd64.tar.gz
sudo mv postfix_exporter-0.3.0.linux-amd64/postfix_exporter /usr/local/bin/
# Create the systemd service.
# The exporter reads queue data from Postfix's showq UNIX socket and counters
# from the mail log; it has no HTTP "showq URL" option, so both paths are
# passed explicitly. On RHEL/CentOS the log is /var/log/maillog.
# tee's copy of the unit file on stdout is discarded.
sudo tee /etc/systemd/system/postfix_exporter.service > /dev/null << 'EOF'
[Unit]
Description=Postfix Exporter
After=network.target
[Service]
Type=simple
User=root
ExecStart=/usr/local/bin/postfix_exporter \
    --postfix.showq_path=/var/spool/postfix/public/showq \
    --postfix.logfile_path=/var/log/mail.log \
    --web.listen-address=:9154
Restart=always
[Install]
WantedBy=multi-user.target
EOF
sudo systemctl daemon-reload
sudo systemctl enable --now postfix_exporter
11.4.2 配置 Prometheus
# /etc/prometheus/prometheus.yml — add the Postfix scrape jobs
# (nesting is significant in YAML; keep the indentation exactly as shown)
scrape_configs:
  - job_name: 'postfix'
    static_configs:
      - targets: ['localhost:9154']
  - job_name: 'node'
    static_configs:
      - targets: ['localhost:9100']
11.4.3 Postfix Exporter 指标
| 指标 | 说明 |
|---|---|
| postfix_showq_messages | 队列中的邮件数量 |
| postfix_showq_message_size_bytes | 邮件大小分布 |
| postfix_smtpd_connects_total | SMTP 连接总数 |
| postfix_smtpd_disconnects_total | SMTP 断开总数 |
| postfix_smtp_sent_total | 发送邮件总数 |
| postfix_smtp_deferred_total | 延迟邮件总数 |
| postfix_cleanup_messages_total | 清理邮件总数 |
11.4.4 Grafana 仪表板
{
"dashboard": {
"title": "Postfix Mail Server",
"panels": [
{
"title": "Queue Size",
"type": "stat",
"targets": [{
"expr": "postfix_showq_messages"
}]
},
{
"title": "SMTP Connections",
"type": "graph",
"targets": [{
"expr": "rate(postfix_smtpd_connects_total[5m])"
}]
},
{
"title": "Delivery Rate",
"type": "graph",
"targets": [{
"expr": "rate(postfix_smtp_sent_total[5m])"
}]
},
{
"title": "Deferred Rate",
"type": "graph",
"targets": [{
"expr": "rate(postfix_smtp_deferred_total[5m])"
}]
}
]
}
}
11.5 告警配置
11.5.1 告警规则示例
# /etc/prometheus/rules/postfix.yml
# (nesting is significant in YAML; keep the indentation exactly as shown)
groups:
  - name: postfix_alerts
    rules:
      # Queue backlog alert
      - alert: PostfixQueueBacklog
        expr: postfix_showq_messages > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "邮件队列积压"
          description: "邮件队列中积压了 {{ $value }} 封邮件"
      # Service down alert. `up` is Prometheus' own scrape-health metric, so
      # this fires when the job="postfix" scrape target cannot be scraped.
      - alert: PostfixDown
        expr: up{job="postfix"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Postfix 服务宕机"
      # Disk space alert: less than 20 % of the root filesystem available
      - alert: MailDiskSpaceLow
        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 < 20
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "磁盘空间不足"
          description: "磁盘剩余空间: {{ $value }}%"
      # Certificate expiry alert: fewer than 30 days left
      - alert: CertificateExpiringSoon
        expr: (probe_ssl_earliest_cert_expiry - time()) / 86400 < 30
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "TLS 证书即将过期"
          description: "证书将在 {{ $value }} 天后过期"
11.5.2 邮件告警通知
#!/bin/bash
# mail-alert.sh — send an alert e-mail to the administrator
#
# Usage: mail-alert.sh "alert text"
#
# Exit status: 2 when no alert text is given, otherwise the status of the
# mail pipeline.
ALERT_TO="admin@example.com"
ALERT_SUBJECT="邮件服务器告警"
ALERT_BODY="${1:-}"

# Refuse to send a blank alert when the argument is missing or empty.
if [[ -z "$ALERT_BODY" ]]; then
  echo "usage: ${0##*/} \"alert text\"" >&2
  exit 2
fi

# printf instead of echo: a body such as "-n" or "-e" would otherwise be
# taken as an echo option and never reach the mail.
printf '%s\n' "$ALERT_BODY" | mail -s "$ALERT_SUBJECT" "$ALERT_TO"
11.5.3 日志轮转配置
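# /etc/logrotate.d/postfix
# 注意:Debian/Ubuntu 自带的 /etc/logrotate.d/rsyslog 通常已包含 /var/log/mail.log,
# 重复定义同一文件会导致 logrotate 报 "duplicate log entry" 错误;启用本配置前请先从该文件中移除 mail.log。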
/var/log/mail.log {
daily
rotate 30
compress
delaycompress
missingok
notifempty
create 0640 root adm
sharedscripts
postrotate
/usr/lib/rsyslog/rsyslog-rotate
endscript
}
11.6 业务场景:生产环境监控方案
场景描述
一家中型企业需要全面的邮件服务器监控:
- 实时监控队列状态
- 告警通知(邮件 + 企业微信/钉钉)
- 历史趋势分析
- 自动化运维
监控架构
┌─────────────────────────────────────────┐
│ 监控面板 (Grafana) │
└─────────────┬───────────────────────────┘
│
┌─────────────▼───────────────────────────┐
│ Prometheus + Alertmanager │
└───────┬─────────┬─────────┬─────────────┘
│ │ │
┌───────▼──┐ ┌────▼────┐ ┌─▼───────────┐
│Postfix │ │Node │ │Blackbox │
│Exporter │ │Exporter │ │Exporter │
└──────────┘ └─────────┘ └─────────────┘
11.7 注意事项
⚠️ 日志安全:
- 邮件日志可能包含敏感信息(发件人、收件人)
- 限制日志文件权限
- 定期清理旧日志
⚠️ 监控性能:
- Prometheus 指标采集可能影响性能
- 合理设置采集间隔
- 避免过多的告警规则
💡 日志分析建议:
- 使用 ELK (Elasticsearch + Logstash + Kibana) 进行大规模日志分析
- 定期生成日志报告
- 建立基线,便于异常检测