Nagios 监控运维完整教程 / 第7章:命令与插件开发
第7章:命令与插件开发
命令(Command)是连接 Nagios 核心与插件的桥梁。本章详细讲解命令定义、插件开发规范、返回值标准、性能数据格式以及远程监控协议 NRPE 和 NSCA。
一、命令定义
1.1 命令对象语法
define command {
    command_name    check_example    ; unique identifier of this command
    # Command line that is actually executed; $...$ macros are substituted first.
    # Inline comments must use ';' - a '#' is only a comment at the start of a line.
    command_line    $USER1$/check_example -H $HOSTADDRESS$ -w $ARG1$ -c $ARG2$
}
1.2 命令分类
| 类型 | 命名约定 | 用途 | 示例 |
|---|---|---|---|
| 检查命令 | check_* | 主机/服务检查 | check_ping, check_http |
| 通知命令 | notify-* | 发送通知 | notify-service-by-email |
| 事件处理 | event-* | 事件处理脚本 | event-restart-service |
| 性能数据 | process-* | 处理性能数据 | process-service-perfdata |
1.3 常用检查命令定义
########################################
# 主机检查命令
########################################
# ICMP Ping 检查
define command {
command_name check_ping
command_line $USER1$/check_ping -H $HOSTADDRESS$ -w $ARG1$ -c $ARG2$ -p 5
}
# TCP 端口检查
define command {
command_name check_tcp
command_line $USER1$/check_tcp -H $HOSTADDRESS$ -p $ARG1$ -t 10
}
# SSH 检查
define command {
command_name check_ssh
command_line $USER1$/check_ssh -H $HOSTADDRESS$ -p $ARG1$
}
# NRPE 远程检查
define command {
command_name check_nrpe
command_line $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$ -a $ARG2$
}
########################################
# Web 服务检查命令
########################################
# HTTP 检查
define command {
command_name check_http
command_line $USER1$/check_http -H $HOSTADDRESS$ -p $ARG1$ -u $ARG2$ -e 200
}
# HTTPS 检查
define command {
command_name check_https
command_line $USER1$/check_http -H $HOSTADDRESS$ -S -p $ARG1$ -u $ARG2$ --sni -e 200
}
# SSL 证书检查
define command {
command_name check_ssl_cert
command_line $USER1$/check_http -H $HOSTADDRESS$ -C $ARG1$,$ARG2$
}
# URL 内容检查
define command {
command_name check_url_content
command_line $USER1$/check_http -H $HOSTADDRESS$ -u $ARG1$ -s "$ARG2$"
}
########################################
# 系统资源检查命令
########################################
# 磁盘使用率
define command {
command_name check_disk
command_line $USER1$/check_disk -w $ARG1$ -c $ARG2$ -p $ARG3$
}
# CPU 负载
define command {
command_name check_load
command_line $USER1$/check_load -w $ARG1$ -c $ARG2$
}
# 内存使用率(需要 check_mem 插件)
define command {
command_name check_mem
command_line $USER1$/check_mem -w $ARG1$ -c $ARG2$ -f
}
# 进程数
define command {
command_name check_procs
command_line $USER1$/check_procs -w $ARG1$ -c $ARG2$ -s $ARG3$
}
# 用户数
define command {
command_name check_users
command_line $USER1$/check_users -w $ARG1$ -c $ARG2$
}
# Swap 使用率
define command {
command_name check_swap
command_line $USER1$/check_swap -w $ARG1$ -c $ARG2$
}
# 文件描述符
define command {
command_name check_file_handles
command_line $USER1$/check_file_handles -w $ARG1$ -c $ARG2$
}
########################################
# 网络服务检查命令
########################################
# DNS 检查
define command {
command_name check_dns
command_line $USER1$/check_dns -H $ARG1$ -s $HOSTADDRESS$ -t 10
}
# SMTP 检查
define command {
command_name check_smtp
command_line $USER1$/check_smtp -H $HOSTADDRESS$ -p $ARG1$
}
# POP3 检查
define command {
command_name check_pop
command_line $USER1$/check_pop -H $HOSTADDRESS$ -p 110
}
# IMAP 检查
define command {
command_name check_imap
command_line $USER1$/check_imap -H $HOSTADDRESS$ -p 143
}
# NTP 检查
define command {
command_name check_ntp
command_line $USER1$/check_ntp -H $HOSTADDRESS$ -w $ARG1$ -c $ARG2$
}
########################################
# 数据库检查命令
########################################
# MySQL 检查
define command {
command_name check_mysql
command_line $USER1$/check_mysql -H $HOSTADDRESS$ -u $USER5$ -p $USER6$ -d $ARG1$
}
# PostgreSQL check
# NOTE(review): in the stock check_pgsql plugin, -p appears to be the password
# option (-P is the port, -d the database) - confirm what $ARG1$ is meant to
# carry here; a password is usually better kept in a $USERn$ resource macro.
define command {
command_name check_pgsql
command_line $USER1$/check_pgsql -H $HOSTADDRESS$ -l $USER5$ -p $ARG1$
}
########################################
# 通知命令
########################################
# 邮件通知(主机)
define command {
command_name notify-host-by-email
command_line /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\nHost: $HOSTNAME$\nState: $HOSTSTATE$\nAddress: $HOSTADDRESS$\nInfo: $HOSTOUTPUT$\n\nDate/Time: $LONGDATETIME$\n" | /usr/bin/mail -s "** $NOTIFICATIONTYPE$ Host Alert: $HOSTNAME$ is $HOSTSTATE$ **" $CONTACTEMAIL$
}
# 邮件通知(服务)
define command {
command_name notify-service-by-email
command_line /usr/bin/printf "%b" "***** Nagios *****\n\nNotification Type: $NOTIFICATIONTYPE$\n\nService: $SERVICEDESC$\nHost: $HOSTNAME$\nAddress: $HOSTADDRESS$\nState: $SERVICESTATE$\n\nDate/Time: $LONGDATETIME$\n\nAdditional Info:\n\n$SERVICEOUTPUT$\n" | /usr/bin/mail -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTNAME$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$
}
########################################
# 事件处理命令
########################################
# 自动重启服务
define command {
command_name event-restart-service
command_line $USER2$/restart_service.sh $HOSTNAME$ $SERVICEDESC$
}
# 性能数据处理
define command {
command_name process-service-perfdata
command_line /usr/bin/printf "%b" "$LASTSERVICECHECK$\t$HOSTNAME$\t$SERVICEDESC$\t$SERVICESTATE$\t$SERVICEATTEMPT$\t$SERVICESTATETYPE$\t$SERVICEEXECUTIONTIME$\t$SERVICELATENCY$\t$SERVICEOUTPUT$\t$SERVICEPERFDATA$\n" >> /var/log/nagios/service-perfdata.out
}
二、插件开发规范
2.1 插件输出规范
Nagios 插件必须遵循严格的输出规范:
# 标准输出格式(第一行为状态文本和可选的性能数据,其后为可选的多行长文本)
TEXT OUTPUT | OPTIONAL PERFDATA
LONG TEXT LINE 1
LONG TEXT LINE 2
...
# 示例
PING OK - Packet Loss = 0%, RTA = 10.50ms | rta=10.50ms;100.00;200.00;0; pl=0%;20;50;0;100
2.2 退出码规范
| 退出码 | 状态 | 含义 |
|---|---|---|
| 0 | OK | 正常 |
| 1 | WARNING | 警告 |
| 2 | CRITICAL | 严重 |
| 3 | UNKNOWN | 未知 |
2.3 性能数据格式
# 格式:
# label=value[UOM];[warn];[crit];[min];[max]
# 示例:
rta=10.50ms;100.00;200.00;0; # RTA 延迟
pl=0%;20;50;0;100 # 丢包率
time=0.15s;1;5;0; # 响应时间
users=5;10;20;0; # 用户数
size=1024B;2048;4096;0; # 文件大小
temperature=45;60;80;0;100 # 温度
# 单位(UOM):
# (无) = 无单位(整数/浮点数)
# s = 秒(也可使用 us、ms)
# % = 百分比
# B = 字节
# KB, MB, GB, TB = 存储单位
# c = 计数器
2.4 性能数据多行输出
# 第一行:状态输出和主要性能数据
HTTP OK - Response time = 0.15s | time=0.15s;1;5;0; size=15234B;;;0;
# 长输出(可选)
HTTP/1.1 200 OK
Content-Length: 15234
Server: Apache/2.4.41
# 多性能数据项
| time=0.15s;1;5;0; size=15234B;;;0; pages=25;;;0;
三、Shell 插件开发
3.1 基本框架
#!/bin/bash
# check_example.sh - example Nagios check plugin
# Usage: check_example.sh -H <host> -w <warning> -c <critical> [-v]
#
# Compares a numeric metric against the warning/critical thresholds and exits
# with 0/1/2/3 (OK/WARNING/CRITICAL/UNKNOWN). The first line written to stdout
# is always the status line; verbose details (-v) follow it as long output.

########################################
# Exit codes
########################################
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3

########################################
# Defaults
########################################
HOST=""
WARNING=""
CRITICAL=""
VERBOSE=0

########################################
# Usage help (always exits UNKNOWN)
########################################
print_help() {
    echo "Usage: $0 -H <host> -w <warning> -c <critical>"
    echo ""
    echo "Options:"
    echo " -H Host address"
    echo " -w Warning threshold"
    echo " -c Critical threshold"
    echo " -v Verbose output"
    echo " -h Show this help"
    exit $STATE_UNKNOWN
}

# Succeeds when $1 is a plain decimal number (optional leading minus sign).
is_number() {
    local re='^-?([0-9]+\.?[0-9]*|\.[0-9]+)$'
    [[ $1 =~ $re ]]
}

########################################
# Argument parsing
########################################
while getopts "H:w:c:vh" opt; do
    case $opt in
        H) HOST=$OPTARG ;;
        w) WARNING=$OPTARG ;;
        c) CRITICAL=$OPTARG ;;
        v) VERBOSE=1 ;;
        h) print_help ;;
        *) print_help ;;
    esac
done

# Argument validation
if [ -z "$HOST" ] || [ -z "$WARNING" ] || [ -z "$CRITICAL" ]; then
    echo "UNKNOWN: Missing required parameters"
    print_help
fi

# bc prints nothing for malformed input, which would make the comparisons
# below evaluate to false and report OK - reject bad thresholds explicitly.
if ! is_number "$WARNING" || ! is_number "$CRITICAL"; then
    echo "UNKNOWN: Thresholds must be numeric (warning=$WARNING, critical=$CRITICAL)"
    exit $STATE_UNKNOWN
fi

########################################
# Check logic
########################################
# Implement your own check here; some_check_command is a placeholder that is
# expected to print a single number.
METRIC=$(some_check_command "$HOST")

if [ -z "$METRIC" ]; then
    echo "UNKNOWN: Unable to retrieve metric"
    exit $STATE_UNKNOWN
fi

if ! is_number "$METRIC"; then
    echo "UNKNOWN: Non-numeric metric value: $METRIC"
    exit $STATE_UNKNOWN
fi

########################################
# State evaluation
########################################
PERFDATA="metric=$METRIC;$WARNING;$CRITICAL"

# bc handles the floating-point comparison (prints 1 when true, 0 otherwise)
if (( $(echo "$METRIC > $CRITICAL" | bc -l) )); then
    STATE=$STATE_CRITICAL
    STATUS="CRITICAL: Metric=$METRIC (threshold=$CRITICAL)"
elif (( $(echo "$METRIC > $WARNING" | bc -l) )); then
    STATE=$STATE_WARNING
    STATUS="WARNING: Metric=$METRIC (threshold=$WARNING)"
else
    STATE=$STATE_OK
    STATUS="OK: Metric=$METRIC"
fi

# Status line first: Nagios takes the first output line as the plugin status.
echo "$STATUS | $PERFDATA"

if [ "$VERBOSE" -eq 1 ]; then
    echo "Metric: $METRIC, Warning: $WARNING, Critical: $CRITICAL"
fi

exit $STATE
3.2 实用示例:磁盘 I/O 检查
#!/bin/bash
# check_disk_io.sh - check disk read/write throughput using iostat
# Usage: check_disk_io.sh -d <device> -r <read_warn>,<read_crit> -w <write_warn>,<write_crit> [-v]
#
# Thresholds are in KB/s; any of them may be omitted. Exit codes follow the
# Nagios plugin convention: 0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN.

STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3

DEVICE=""
READ_WARN=""
READ_CRIT=""
WRITE_WARN=""
WRITE_CRIT=""
VERBOSE=0

# Print usage and exit UNKNOWN.
print_help() {
    echo "Usage: $0 -d <device> -r <read_warn>,<read_crit> -w <write_warn>,<write_crit>"
    echo "Example: $0 -d sda -r 50,100 -w 50,100"
    exit $STATE_UNKNOWN
}

# Succeeds when $1 is a non-negative decimal number.
is_number() {
    local re='^([0-9]+\.?[0-9]*|\.[0-9]+)$'
    [[ $1 =~ $re ]]
}

# Succeeds when threshold $2 is set and value $1 is strictly greater than it.
exceeds() {
    [ -n "$2" ] && (( $(echo "$1 > $2" | bc -l) ))
}

while getopts "d:r:w:vh" opt; do
    case $opt in
        d) DEVICE=$OPTARG ;;
        # The second variable receives everything after the first comma, so an
        # omitted critical value stays empty instead of repeating the warning.
        r) IFS=',' read -r READ_WARN READ_CRIT <<< "$OPTARG" ;;
        w) IFS=',' read -r WRITE_WARN WRITE_CRIT <<< "$OPTARG" ;;
        v) VERBOSE=1 ;;
        h) print_help ;;
        *) print_help ;;
    esac
done

if [ -z "$DEVICE" ]; then
    echo "UNKNOWN: Device not specified"
    print_help
fi

# Reject malformed thresholds: bc would print nothing for them and the
# comparison would silently evaluate to false.
for threshold in "$READ_WARN" "$READ_CRIT" "$WRITE_WARN" "$WRITE_CRIT"; do
    if [ -n "$threshold" ] && ! is_number "$threshold"; then
        echo "UNKNOWN: Invalid threshold '$threshold'"
        exit $STATE_UNKNOWN
    fi
done

# iostat lists devices by bare name, so accept /dev/sda as well as sda.
DEVICE=${DEVICE#/dev/}

# Collect I/O figures with iostat:
#   LC_ALL=C  forces '.' as the decimal separator so bc can parse the values
#   -k        forces kilobytes per second
#   1 2       two reports one second apart; the first is the since-boot average
# iostat ends every report with a blank line, so keep the last line whose
# first field is the device rather than relying on "tail -1".
read -r READ_KB WRITE_KB < <(
    LC_ALL=C iostat -dk "$DEVICE" 1 2 2>/dev/null |
        awk -v dev="$DEVICE" '$1 == dev { r = $3; w = $4 } END { print r, w }'
)

if [ -z "$READ_KB" ] || [ -z "$WRITE_KB" ]; then
    echo "UNKNOWN: Unable to get I/O stats for $DEVICE"
    exit $STATE_UNKNOWN
fi

if ! is_number "$READ_KB" || ! is_number "$WRITE_KB"; then
    echo "UNKNOWN: Unexpected iostat output for $DEVICE (read=$READ_KB, write=$WRITE_KB)"
    exit $STATE_UNKNOWN
fi

# Evaluate state: critical thresholds take precedence over warning ones.
STATE=$STATE_OK
OUTPUT="OK"
if exceeds "$READ_KB" "$READ_CRIT"; then
    STATE=$STATE_CRITICAL
    OUTPUT="CRITICAL: Read=$READ_KB KB/s > $READ_CRIT"
elif exceeds "$WRITE_KB" "$WRITE_CRIT"; then
    STATE=$STATE_CRITICAL
    OUTPUT="CRITICAL: Write=$WRITE_KB KB/s > $WRITE_CRIT"
elif exceeds "$READ_KB" "$READ_WARN"; then
    STATE=$STATE_WARNING
    OUTPUT="WARNING: Read=$READ_KB KB/s > $READ_WARN"
elif exceeds "$WRITE_KB" "$WRITE_WARN"; then
    STATE=$STATE_WARNING
    OUTPUT="WARNING: Write=$WRITE_KB KB/s > $WRITE_WARN"
else
    OUTPUT="OK: Read=$READ_KB KB/s, Write=$WRITE_KB KB/s"
fi

# "KB/s" is not a valid perfdata unit of measurement; use KB.
echo "$OUTPUT | read=${READ_KB}KB;${READ_WARN};${READ_CRIT};0; write=${WRITE_KB}KB;${WRITE_WARN};${WRITE_CRIT};0;"

if [ "$VERBOSE" -eq 1 ]; then
    echo "Device: $DEVICE, Read thresholds: $READ_WARN/$READ_CRIT, Write thresholds: $WRITE_WARN/$WRITE_CRIT"
fi

exit $STATE
四、Python 插件开发
4.1 基本框架
#!/usr/bin/env python3
# check_example.py - Python 插件示例
import sys
import argparse
import subprocess
# 退出码定义
STATE_OK = 0
STATE_WARNING = 1
STATE_CRITICAL = 2
STATE_UNKNOWN = 3
def parse_args():
    """Parse the command line of the example check plugin.

    Returns:
        argparse.Namespace with the attributes ``host``, ``warning``,
        ``critical``, ``port``, ``timeout`` and ``verbose``.
    """
    cli = argparse.ArgumentParser(description='Nagios check plugin example')
    # One (flags, keyword-arguments) pair per option, registered in order.
    option_table = (
        (('-H', '--host'), dict(required=True, help='Host address')),
        (('-w', '--warning'), dict(type=float, required=True, help='Warning threshold')),
        (('-c', '--critical'), dict(type=float, required=True, help='Critical threshold')),
        (('-p', '--port'), dict(type=int, default=80, help='Port number')),
        (('-t', '--timeout'), dict(type=int, default=10, help='Timeout in seconds')),
        (('-v', '--verbose'), dict(action='store_true', help='Verbose output')),
    )
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)
    return cli.parse_args()
def check_metric(host, port, timeout):
    """Measure how long a TCP connect to ``host:port`` takes.

    Args:
        host: Host name or IPv4 address to connect to.
        port: TCP port number.
        timeout: Socket timeout in seconds.

    Returns:
        The connect time in milliseconds as a float, or ``None`` when the
        connection could not be established (refused, timed out, name
        resolution failure, invalid port or timeout value).
    """
    import socket
    import time
    try:
        # The context manager closes the socket on every path, including
        # when connect() raises.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(timeout)
            # perf_counter() is monotonic, so the interval cannot be skewed
            # by wall-clock adjustments.
            start = time.perf_counter()
            sock.connect((host, port))
            elapsed = time.perf_counter() - start
    except (OSError, OverflowError, ValueError):
        # OSError covers refusals, timeouts and resolver errors;
        # OverflowError/ValueError cover out-of-range ports and timeouts.
        return None
    return elapsed * 1000  # convert to milliseconds
def main():
    """Run the check and report the result in Nagios plugin format.

    Prints one status line with perfdata and exits with STATE_OK,
    STATE_WARNING, STATE_CRITICAL or STATE_UNKNOWN.
    """
    opts = parse_args()
    latency_ms = check_metric(opts.host, opts.port, opts.timeout)

    # No measurement at all means the state cannot be determined.
    if latency_ms is None:
        print(f"UNKNOWN: Unable to connect to {opts.host}:{opts.port}")
        sys.exit(STATE_UNKNOWN)

    perf = f"response_time={latency_ms:.2f}ms;{opts.warning};{opts.critical};0;"

    # Pick exit code and summary first, then emit once; critical wins over warning.
    if latency_ms >= opts.critical:
        exit_code = STATE_CRITICAL
        summary = f"CRITICAL: Response time = {latency_ms:.2f}ms (threshold = {opts.critical}ms)"
    elif latency_ms >= opts.warning:
        exit_code = STATE_WARNING
        summary = f"WARNING: Response time = {latency_ms:.2f}ms (threshold = {opts.warning}ms)"
    else:
        exit_code = STATE_OK
        summary = f"OK: Response time = {latency_ms:.2f}ms"

    print(f"{summary} | {perf}")
    sys.exit(exit_code)


if __name__ == '__main__':
    main()
4.2 HTTP API 检查插件
#!/usr/bin/env python3
# check_rest_api.py - REST API 健康检查插件
import sys
import argparse
import requests
import time
import json
STATE_OK = 0
STATE_WARNING = 1
STATE_CRITICAL = 2
STATE_UNKNOWN = 3
def parse_args():
    """Parse the command line of the REST API health check.

    Returns:
        argparse.Namespace with ``url``, ``method``, ``expected``, ``string``,
        ``warning``, ``critical``, ``header``, ``data``, ``timeout`` and
        ``ssl_verify``.
    """
    cli = argparse.ArgumentParser(description='REST API health check')
    # One (flags, keyword-arguments) pair per option, registered in order.
    option_table = (
        (('-u', '--url'), dict(required=True, help='API URL')),
        (('-m', '--method'), dict(default='GET', help='HTTP method')),
        (('-e', '--expected'), dict(default='200', help='Expected status code')),
        (('-s', '--string'), dict(help='Expected string in response')),
        (('-w', '--warning'), dict(type=float, default=5.0, help='Response time warning (seconds)')),
        (('-c', '--critical'), dict(type=float, default=10.0, help='Response time critical (seconds)')),
        (('--header',), dict(action='append', help='HTTP header (format: Key: Value)')),
        (('--data',), dict(help='POST data')),
        (('--timeout',), dict(type=int, default=30, help='Request timeout')),
        # Both switches write to ssl_verify; verification is on by default.
        (('--ssl-verify',), dict(action='store_true', default=True, help='Verify SSL')),
        (('--no-ssl-verify',), dict(dest='ssl_verify', action='store_false', help='Disable SSL verify')),
    )
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)
    return cli.parse_args()
def main():
    """Call the API once and report the result in Nagios plugin format.

    Checks, in order: the request succeeds, the status code is one of the
    expected codes, the optional expected string is in the body, and the
    response time is below the thresholds. Prints one status line (with
    perfdata where a timing is available) and exits with the matching code.
    """
    args = parse_args()

    # Validate user input before doing any work. An uncaught exception would
    # make Python exit with status 1, which Nagios interprets as WARNING.
    headers = {}
    for raw_header in args.header or []:
        key, sep, value = raw_header.partition(':')
        if not sep or not key.strip():
            print(f"UNKNOWN: Invalid header '{raw_header}' (expected 'Key: Value')")
            sys.exit(STATE_UNKNOWN)
        headers[key.strip()] = value.strip()

    try:
        expected_codes = [int(c) for c in args.expected.split(',')]
    except ValueError:
        print(f"UNKNOWN: Invalid expected status code list '{args.expected}'")
        sys.exit(STATE_UNKNOWN)

    try:
        # perf_counter() is monotonic, so wall-clock adjustments cannot skew
        # the measured interval.
        start = time.perf_counter()
        response = requests.request(
            method=args.method,
            url=args.url,
            headers=headers,
            data=args.data,
            timeout=args.timeout,
            verify=args.ssl_verify
        )
        elapsed = time.perf_counter() - start
    except requests.exceptions.Timeout:
        print(f"CRITICAL: Connection timeout after {args.timeout}s | time={args.timeout}s;{args.warning};{args.critical}")
        sys.exit(STATE_CRITICAL)
    except requests.exceptions.ConnectionError as e:
        # A '|' in the message would be parsed as the start of perfdata, and a
        # newline would push the perfdata off the status line.
        detail = ' '.join(str(e).replace('|', '/').split())
        print(f"CRITICAL: Connection failed - {detail} | time=0;{args.warning};{args.critical}")
        sys.exit(STATE_CRITICAL)
    except Exception as e:
        print(f"UNKNOWN: {e}")
        sys.exit(STATE_UNKNOWN)

    # Status code check
    if response.status_code not in expected_codes:
        print(f"CRITICAL: HTTP {response.status_code} (expected {args.expected}) | time={elapsed:.3f}s;{args.warning};{args.critical}")
        sys.exit(STATE_CRITICAL)

    # Response body check
    if args.string and args.string not in response.text:
        print(f"CRITICAL: Expected string '{args.string}' not found | time={elapsed:.3f}s;{args.warning};{args.critical}")
        sys.exit(STATE_CRITICAL)

    # Response time check
    perfdata = f"time={elapsed:.3f}s;{args.warning};{args.critical};0;"
    if elapsed >= args.critical:
        print(f"CRITICAL: Response time = {elapsed:.3f}s | {perfdata}")
        sys.exit(STATE_CRITICAL)
    elif elapsed >= args.warning:
        print(f"WARNING: Response time = {elapsed:.3f}s | {perfdata}")
        sys.exit(STATE_WARNING)
    else:
        print(f"OK: HTTP {response.status_code}, Response time = {elapsed:.3f}s | {perfdata}")
        sys.exit(STATE_OK)


if __name__ == '__main__':
    main()
五、返回值与状态处理
5.1 返回码处理流程
插件执行
│
▼
返回码判断
│
├─ 0 (OK) → 状态设为 OK
│
├─ 1 (WARNING) → 状态设为 WARNING
│ │
│ ├─ 软状态 → 重试检查
│ └─ 硬状态 → 触发通知
│
├─ 2 (CRITICAL) → 状态设为 CRITICAL
│ │
│ ├─ 软状态 → 重试检查
│ └─ 硬状态 → 触发通知
│
└─ 3 (UNKNOWN) → 状态设为 UNKNOWN
│
├─ 软状态 → 重试检查
└─ 硬状态 → 触发通知
5.2 状态类型(Soft/Hard)
# 软状态(SOFT):未达到 max_check_attempts
# 不会触发通知,只记录日志
# 持续重试直到达到阈值或状态恢复
# 硬状态(HARD):达到 max_check_attempts
# 触发通知
# 后续检查如果仍为异常,继续通知(根据 notification_interval)
# 示例:max_check_attempts = 3
# 检查1: CRITICAL → SOFT 1
# 检查2: CRITICAL → SOFT 2
# 检查3: CRITICAL → HARD 1 → 发送通知
# 检查4: CRITICAL → HARD 2 → 按 notification_interval 发送
# 检查5: OK → RECOVERY → 发送恢复通知
六、NRPE 命令定义
6.1 NRPE 远程检查
# NRPE 命令定义(在监控服务器上)
define command {
command_name check_nrpe
command_line $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$ -a $ARG2$
}
# NRPE command with an explicit timeout (the transport uses SSL/TLS by default)
define command {
    command_name    check_nrpe_ssl
    # -a must be the LAST option: check_nrpe forwards everything after it to the
    # remote command, so -t has to come before it. SSL is enabled unless -n is
    # given, so no extra switch is needed to turn it on.
    command_line    $USER1$/check_nrpe -H $HOSTADDRESS$ -t 30 -c $ARG1$ -a $ARG2$
}
# 使用示例
define service {
use generic-service
host_name remote-server-01
service_description Disk Space
check_command check_nrpe!check_disk!-w 20% -c 10% -p /
}
6.2 被动检查命令(NSCA)
# NSCA 提交命令
define command {
command_name submit_service_check
command_line /usr/bin/printf "%s\t%s\t%s\t%s\n" "$HOSTNAME$" "$SERVICEDESC$" "$SERVICESTATEID$" "$SERVICEOUTPUT$" | /usr/local/nagios/bin/send_nsca -H nagios-server -c /etc/send_nsca.cfg
}
# Placeholder check command for passively checked services
define command {
    command_name    process_passive_check
    command_line    /bin/true    ; passive checks never need to run a real command
}
七、事件处理命令
7.1 事件处理器定义
# Automatically restart a failed service
define command {
    command_name    restart-service
    # $SERVICEDESC$ is quoted so that a description containing spaces
    # (e.g. "Disk Space") reaches the handler as a single third argument.
    command_line    $USER2$/event_handlers/restart_service.sh $SERVICESTATE$ $SERVICESTATETYPE$ "$SERVICEDESC$"
}
# 自动重启主机
define command {
command_name restart-host
command_line $USER2$/event_handlers/restart_host.sh $HOSTSTATE$ $HOSTSTATETYPE$ $HOSTNAME$
}
7.2 事件处理脚本示例
#!/bin/bash
# restart_service.sh - event handler that restarts a failed service
# Arguments: $1 = service state, $2 = state type (SOFT/HARD), $3 = service description

SERVICESTATE=$1
SERVICESTATETYPE=$2
SERVICEDESC=$3
LOGFILE="/var/log/nagios/event_handlers.log"

# Append a timestamped line to the handler log.
log() {
    echo "$(date): $*" >> "$LOGFILE"
}

log "$SERVICEDESC is $SERVICESTATE ($SERVICESTATETYPE)"

# Only act once the failure is a HARD CRITICAL state
if [ "$SERVICESTATE" = "CRITICAL" ] && [ "$SERVICESTATETYPE" = "HARD" ]; then
    log "Attempting to restart $SERVICEDESC"

    # Map the Nagios service description to the systemd unit that backs it.
    case "$SERVICEDESC" in
        "Apache" | "HTTP")
            UNIT="httpd"
            ;;
        "MySQL")
            UNIT="mysqld"
            ;;
        "SSH")
            UNIT="sshd"
            ;;
        *)
            log "No restart handler for $SERVICEDESC"
            exit 0
            ;;
    esac

    sudo systemctl restart "$UNIT"

    # Give the service time to come up
    sleep 5

    # Verify the unit that was actually restarted; the lowercased description
    # (e.g. "apache") is not a systemd unit name.
    if systemctl is-active --quiet "$UNIT"; then
        log "$SERVICEDESC restarted successfully"
    else
        log "Failed to restart $SERVICEDESC"
    fi
fi
八、注意事项
| 注意事项 | 说明 |
|---|---|
| 插件权限 | 插件文件需要执行权限 chmod +x |
| 插件超时 | 设置合理的插件超时避免卡死 |
| 返回码 | 必须正确返回 0-3 的退出码 |
| 性能数据 | 格式必须严格遵循规范 |
| 安全性 | 插件中不要硬编码密码 |
| 日志记录 | 插件应输出有意义的状态信息 |
九、本章小结
- 命令定义连接 Nagios 和插件,支持宏变量替换
- 插件输出必须遵循文本和性能数据规范
- 返回码决定服务/主机状态(0-3)
- Shell/Python 是常用的插件开发语言
- NRPE/NSCA 实现远程和被动检查
下一章:第8章:插件体系详解 - 深入了解 Nagios 插件生态系统。