强曰为道
与天地相似,故不违。知周乎万物,而道济天下,故不过。旁行而不流,乐天知命,故不忧。
文档目录

Nagios 监控运维完整教程 / 第6章:联系人与通知

第6章:联系人与通知

通知是监控系统的核心价值之一——及时将告警信息传达给相关人员。本章详细讲解联系人管理、通知命令、通知升级、过滤和静默机制。


一、联系人定义

1.1 联系人对象

define contact {
    # Inherit defaults from the generic-contact template; the directives
    # below override whatever the template sets.
    use                             generic-contact

    contact_name                    zhangsan
    alias                           张三 - 系统运维工程师
    email                           zhangsan@example.com
    # Bare phone number: the SMS and voice notification commands expand
    # $CONTACTPAGER$ as the recipient phone number, so an e-mail style
    # address (number@gateway) would be passed to them verbatim.
    pager                           13800138000

    # Host notifications: any time, on DOWN (d), UNREACHABLE (u), RECOVERY (r)
    host_notification_period        24x7
    host_notification_options       d,u,r
    host_notification_commands      notify-host-by-email,notify-host-by-sms

    # Service notifications: any time, on WARNING (w), UNKNOWN (u),
    # CRITICAL (c), RECOVERY (r)
    service_notification_period     24x7
    service_notification_options    w,u,c,r
    service_notification_commands   notify-service-by-email,notify-service-by-sms

    # Advanced options
    can_submit_commands             1
    retain_status_information       1
    retain_nonstatus_information    1
}

1.2 通知选项详解

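| 标志 | 主机通知 | 服务通知 | 说明 |
| --- | --- | --- | --- |
| d | DOWN | - | 主机变为 DOWN 状态 |
| u | UNREACHABLE | UNKNOWN | 不可达/未知状态 |
| r | RECOVERY | RECOVERY | 恢复正常 |
| w | - | WARNING | 警告状态 |
| c | - | CRITICAL | 严重状态 |
| f | FLAPPING | FLAPPING | 抖动状态 |
| s | DOWNTIME | DOWNTIME | 计划维护时段 |
| n | None | None | 不发送通知 |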

1.3 联系人模板

# Base template: e-mail notifications around the clock.
define contact {
    name                                generic-contact
    register                            0       ; template only, not a real contact
    host_notification_period            24x7
    host_notification_options           d,u,r
    host_notification_commands          notify-host-by-email
    service_notification_period         24x7
    service_notification_options        w,u,c,r
    service_notification_commands       notify-service-by-email
}

# Template: e-mail plus SMS notifications (overrides only the commands).
define contact {
    name                                contact-with-sms
    register                            0
    use                                 generic-contact
    host_notification_commands          notify-host-by-email,notify-host-by-sms
    service_notification_commands       notify-service-by-email,notify-service-by-sms
}

# Template: e-mail only (restates the commands inherited from generic-contact).
define contact {
    name                                email-only-contact
    register                            0
    use                                 generic-contact
    host_notification_commands          notify-host-by-email
    service_notification_commands       notify-service-by-email
}

二、联系人组

2.1 联系人组定义

# Administrators group: three direct members plus every member of senior-ops.
define contactgroup {
    contactgroup_name       admins
    alias                   系统管理员
    members                 zhangsan,lisi,wangwu
    contactgroup_members    senior-ops      ; nested contact group (inline comments need ';' - a '#' here would become part of the value)
}

# Web operations team.
define contactgroup {
    contactgroup_name       web-ops
    members                 zhangsan,lisi
    alias                   Web 运维组
}

# Database administrators.
define contactgroup {
    contactgroup_name       dba-team
    members                 wangwu,zhaoliu
    alias                   DBA 团队
}

# Senior operations.
define contactgroup {
    contactgroup_name       senior-ops
    members                 ops-manager
    alias                   高级运维
}

# Umbrella group built purely from other groups (no direct members).
define contactgroup {
    contactgroup_name       all-ops
    contactgroup_members    admins,web-ops,dba-team
    alias                   全部运维人员
}

2.2 联系人组在主机/服务中的应用

# Host whose notifications go to the web-ops contact group.
define host {
    host_name           web-server-01
    address             192.168.1.100
    use                 linux-server
    contact_groups      web-ops
}

# Service whose notifications go to the dba-team contact group.
define service {
    host_name           db-server-01
    service_description MySQL
    use                 generic-service
    check_command       check_mysql
    contact_groups      dba-team
}

三、通知命令

3.1 邮件通知命令

# Host notification by e-mail.
# printf "%b" renders the \n escapes in the message body as newlines; the
# result is piped to /usr/bin/mail and sent to the contact's $CONTACTEMAIL$.
# The trailing backslashes join the lines into one logical command_line.
define command {
    command_name    notify-host-by-email
    command_line    /usr/bin/printf "%b" "\
        ***** Nagios *****\n\n\
        Notification Type: $NOTIFICATIONTYPE$\n\
        Host: $HOSTNAME$\n\
        State: $HOSTSTATE$\n\
        Address: $HOSTADDRESS$\n\
        Info: $HOSTOUTPUT$\n\n\
        Date/Time: $LONGDATETIME$\n" | \
        /usr/bin/mail -s "** $NOTIFICATIONTYPE$ Host Alert: $HOSTNAME$ is $HOSTSTATE$ **" $CONTACTEMAIL$
}

# Service notification by e-mail.
# Same mechanism as the host variant: printf "%b" builds the body (service,
# host, state, plugin output and the acknowledgement comment macro) and
# /usr/bin/mail delivers it to $CONTACTEMAIL$.
# The trailing backslashes join the lines into one logical command_line.
define command {
    command_name    notify-service-by-email
    command_line    /usr/bin/printf "%b" "\
        ***** Nagios *****\n\n\
        Notification Type: $NOTIFICATIONTYPE$\n\n\
        Service: $SERVICEDESC$\n\
        Host: $HOSTNAME$\n\
        Address: $HOSTADDRESS$\n\
        State: $SERVICESTATE$\n\n\
        Date/Time: $LONGDATETIME$\n\
        Additional Info:\n\n\
        $SERVICEOUTPUT$\n\n\
        Comment: [$SERVICEACKCOMMENT$]\n" | \
        /usr/bin/mail -s "** $NOTIFICATIONTYPE$ Service Alert: $HOSTNAME$/$SERVICEDESC$ is $SERVICESTATE$ **" $CONTACTEMAIL$
}

3.2 短信通知命令

# Send a host alert as SMS through the HTTP SMS gateway.
# -G together with --data-urlencode lets curl build and percent-encode the
# query string, so reserved characters in macro values cannot corrupt the URL.
define command {
    command_name    notify-host-by-sms
    command_line    /usr/bin/curl -s -G "http://sms.gateway.com/send" \
        --data-urlencode "phone=$CONTACTPAGER$" \
        --data-urlencode "message=Host $HOSTNAME$ is $HOSTSTATE$" \
        > /dev/null
}

# Send a service alert as SMS through the HTTP SMS gateway.
# Service descriptions may contain spaces or other reserved characters, so the
# query string is built and percent-encoded by curl (-G + --data-urlencode)
# instead of being spliced into the URL by hand.
define command {
    command_name    notify-service-by-sms
    command_line    /usr/bin/curl -s -G "http://sms.gateway.com/send" \
        --data-urlencode "phone=$CONTACTPAGER$" \
        --data-urlencode "message=$HOSTNAME$/$SERVICEDESC$ is $SERVICESTATE$" \
        > /dev/null
}

3.3 企业微信/钉钉通知

# DingTalk robot notification (markdown message).
# The JSON payload is kept on ONE line: a command_line only continues onto the
# next line when the current line ends with a backslash, so a payload spread
# over several unterminated lines would be cut off after "-d '{".
define command {
    command_name    notify-service-by-dingtalk
    command_line    /usr/bin/curl -s -X POST \
        "https://oapi.dingtalk.com/robot/send?access_token=YOUR_TOKEN" \
        -H 'Content-Type: application/json' \
        -d '{"msgtype": "markdown", "markdown": {"title": "Nagios 告警", "text": "## Nagios 告警通知\n\n- **通知类型**: $NOTIFICATIONTYPE$\n- **主机**: $HOSTNAME$\n- **服务**: $SERVICEDESC$\n- **状态**: $SERVICESTATE$\n- **信息**: $SERVICEOUTPUT$\n- **时间**: $LONGDATETIME$"}}'
}

# WeCom (enterprise WeChat) webhook notification (markdown message).
# The JSON payload is kept on ONE line: a command_line only continues onto the
# next line when the current line ends with a backslash, so a payload spread
# over several unterminated lines would be cut off after "-d '{".
define command {
    command_name    notify-service-by-wechat
    command_line    /usr/bin/curl -s -X POST \
        "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=YOUR_KEY" \
        -H 'Content-Type: application/json' \
        -d '{"msgtype": "markdown", "markdown": {"content": "## Nagios 告警\n> 主机: $HOSTNAME$\n> 服务: $SERVICEDESC$\n> 状态: <font color=\"warning\">$SERVICESTATE$</font>\n> 时间: $LONGDATETIME$"}}'
}

3.4 电话通知

# Voice alert: asks the Twilio Calls API to ring the contact's pager number
# and play the content served by the voice-alert.php URL.
define command {
    command_name    notify-host-by-phone
    command_line    /usr/bin/curl -s -X POST \
        -u "YOUR_SID:YOUR_AUTH_TOKEN" \
        --data-urlencode "To=$CONTACTPAGER$" \
        --data-urlencode "From=+1234567890" \
        --data-urlencode "Url=http://your-server/voice-alert.php?host=$HOSTNAME$&state=$HOSTSTATE$" \
        "https://api.twilio.com/2010-04-01/Accounts/YOUR_SID/Calls.json" \
        > /dev/null
}

四、通知升级

4.1 主机通知升级

# Escalate web-server-01 host problems to senior ops from the 3rd notification
# onward. How long that takes depends on the host's own notification_interval,
# which is not part of this definition.
# Inline comments use ';' - a '#' after a value is not treated as a comment
# and would be read as part of the value.
define hostescalation {
    host_name               web-server-01
    first_notification      3       ; escalation starts with the 3rd notification
    last_notification       0       ; 0 = no upper bound (until recovery)
    notification_interval   30      ; re-notify every 30 minutes while escalated
    contact_groups          senior-ops
    escalation_period       24x7
    escalation_options      d,u
}

# Second escalation tier for web-server-01: management is notified from the
# 7th notification onward, once every 60 minutes, for DOWN/UNREACHABLE states.
define hostescalation {
    host_name               web-server-01
    contact_groups          management
    escalation_options      d,u
    escalation_period       24x7
    first_notification      7
    last_notification       0
    notification_interval   60
}

4.2 服务通知升级

# Tier 0: the first notification goes to the contacts configured on the
# service definition itself (nothing to declare here).

# Tier 1: notifications 2 through 5 for web-server-01/HTTP go to senior ops,
# repeated every 10 minutes.
define serviceescalation {
    host_name               web-server-01
    service_description     HTTP
    contact_groups          senior-ops
    escalation_options      w,u,c,r
    escalation_period       24x7
    first_notification      2
    last_notification       5
    notification_interval   10
}

# Tier 2: from notification 6 onward management is notified every 30 minutes,
# for CRITICAL and RECOVERY only.
define serviceescalation {
    host_name               web-server-01
    service_description     HTTP
    contact_groups          management
    escalation_options      c,r
    escalation_period       24x7
    first_notification      6
    last_notification       0
    notification_interval   30
}

4.3 批量升级配置

# Bulk escalation: applies to the HTTP service on every host in the
# "webservers" host group instead of naming hosts one by one.
define serviceescalation {
    hostgroup_name          webservers
    service_description     HTTP
    contact_groups          senior-ops
    escalation_options      c
    escalation_period       24x7
    first_notification      3
    last_notification       0
    notification_interval   15
}

4.4 通知升级流程图

服务变为 CRITICAL
    │
    ▼
第 1 次通知 → 运维团队(web-ops)
    │ (5分钟后未恢复,间隔取自服务自身的 notification_interval)
    ▼
第 2 次通知 → 高级运维(senior-ops)
    │ (10分钟后未恢复)
    ▼
第 3 次通知 → 高级运维(senior-ops)
    │ (10分钟后未恢复)
    ▼
第 4 次通知 → 高级运维(senior-ops)
    │ (10分钟后未恢复)
    ▼
第 5 次通知 → 高级运维(senior-ops)
    │ (10分钟后未恢复)
    ▼
第 6 次通知 → 管理层(management)
    │ (此后每30分钟通知一次)
    ▼
直到恢复或确认

五、通知过滤

5.1 基于时间段过滤

# Notify only during working hours.
# "workhours" names a timeperiod object that is not shown in this section;
# "..." stands for the rest of the service definition.
define service {
    ...
    notification_period     workhours
}

# Notify around the clock (critical services).
define service {
    ...
    notification_period     24x7
}

5.2 基于状态过滤

# Notify on CRITICAL (c) and RECOVERY (r) only.
# "..." stands for the rest of the service definition.
define service {
    ...
    notification_options    c,r
}

# Notify on every state change: WARNING (w), UNKNOWN (u), CRITICAL (c), RECOVERY (r).
define service {
    ...
    notification_options    w,u,c,r
}

5.3 基于联系人过滤

# Route different services to different teams.
# Inline comments use ';' - a '#' after a value is not treated as a comment
# and would be read as part of the contact group name.
define service {
    use                 generic-service
    host_name           web-server-01
    service_description HTTP
    contact_groups      web-ops      ; web team
}

define service {
    use                 generic-service
    host_name           db-server-01
    service_description MySQL
    contact_groups      dba-team     ; DBA team
}

六、静默与维护

6.1 按计划停机(Downtime)

# Scheduled downtime can be set in the web UI, or by writing an external
# command to the Nagios command file as shown here.

# Put web-server-01 into a fixed 2-hour downtime.
# Format: SCHEDULE_HOST_DOWNTIME;host;start;end;fixed;trigger_id;duration;author;comment
#   fixed:    1 = fixed window (start..end), 0 = flexible
#   duration: length in SECONDS (used by flexible downtime); kept consistent
#             with the window here: 7200 s = 2 h
# The timestamp is taken once so the entry time, start and end all agree.
now=$(date +%s)
duration=7200
printf '[%s] SCHEDULE_HOST_DOWNTIME;web-server-01;%s;%s;1;0;%s;admin;Planned maintenance\n' \
    "$now" "$now" "$((now + duration))" "$duration" \
    >> /var/log/nagios/rw/nagios.cmd

6.2 问题确认(Acknowledge)

# Acknowledge a host problem (stops the repeated notifications).
echo "[$(date +%s)] ACKNOWLEDGE_HOST_PROBLEM;web-server-01;2;1;0;zhangsan;已知故障,正在处理" \
    >> /var/log/nagios/rw/nagios.cmd

# Format: ACKNOWLEDGE_HOST_PROBLEM;host;sticky;notify;persistent;author;comment
# sticky:     2 = sticky (acknowledgement stays until the host is UP again);
#             any other value (normally 1) = removed on the next state change
# notify:     1 = send an ACKNOWLEDGEMENT notification to the contacts
# persistent: 1 = keep the acknowledgement comment across Nagios restarts

6.3 禁用/启用通知

# Helper: prefix an external command with the current epoch timestamp and
# append it to the Nagios command file.
submit() {
    echo "[$(date +%s)] $1" \
        >> /var/log/nagios/rw/nagios.cmd
}

# Disable notifications for one host
submit "DISABLE_HOST_NOTIFICATIONS;web-server-01"

# Re-enable notifications for that host
submit "ENABLE_HOST_NOTIFICATIONS;web-server-01"

# Disable notifications for one service
submit "DISABLE_SERVICE_NOTIFICATIONS;web-server-01;HTTP"

# Disable notifications globally
submit "DISABLE_NOTIFICATIONS"

6.4 批量静默脚本

#!/bin/bash
# batch_downtime.sh - schedule a fixed maintenance window for every host in a
# host group.
#
# Usage: batch_downtime.sh <hostgroup> <duration-minutes>
#
# One SCHEDULE_HOSTGROUP_HOST_DOWNTIME external command is submitted so that
# Nagios itself resolves the group members. Parsing the output of `nagios -v`
# does not work: that command only validates the configuration and does not
# print host group member lists.
#
# The command file defaults to /var/log/nagios/rw/nagios.cmd and can be
# overridden with the NAGIOS_CMD_FILE environment variable.

HOSTGROUP=$1
DURATION=$2  # minutes
CMD_FILE=${NAGIOS_CMD_FILE:-/var/log/nagios/rw/nagios.cmd}

# Require a group name and a positive integer duration (no leading zero, which
# shell arithmetic would otherwise read as octal).
if [ -z "$HOSTGROUP" ] || ! [[ "$DURATION" =~ ^[1-9][0-9]*$ ]]; then
    echo "Usage: $0 <hostgroup> <duration-minutes>" >&2
    exit 1
fi

# The command file is a named pipe created by the running Nagios daemon;
# appending to a missing path would silently create a regular file instead.
if [ ! -p "$CMD_FILE" ]; then
    echo "Nagios command file is not a named pipe: $CMD_FILE" >&2
    exit 1
fi

START=$(date +%s)
DURATION_SECONDS=$((DURATION * 60))
END=$((START + DURATION_SECONDS))

# Fields: hostgroup;start;end;fixed(1);trigger_id(0);duration(seconds);author;comment
printf '[%s] SCHEDULE_HOSTGROUP_HOST_DOWNTIME;%s;%s;%s;1;0;%s;admin;Scheduled maintenance\n' \
    "$START" "$HOSTGROUP" "$START" "$END" "$DURATION_SECONDS" \
    >> "$CMD_FILE"
echo "已设置主机组 ${HOSTGROUP} 维护窗口: ${DURATION} 分钟"

七、通知模板定制

7.1 丰富格式的通知

# HTML-formatted service notification sent through /usr/bin/mail.
# Two Nagios parsing rules shape this definition:
#   * a ';' starts a comment anywhere in an object definition line, so the CSS
#     is written without semicolons (one declaration per rule) - otherwise the
#     command would be cut off at the first ';';
#   * '$' delimits macros, so shell substitutions of the form $(...) are
#     avoided - an unescaped '$' shifts macro pairing for the rest of the line.
#     $SERVICESTATE$ is used directly as the CSS class and as the subject.
# NOTE(review): '-a "Header: value"' is the bsd-mailx way of adding a header;
# other mail clients use -a for attachments - confirm against the installed
# /usr/bin/mail.
define command {
    command_name    notify-service-by-email-html
    command_line    /usr/bin/printf "%b" "\
        <html>\
        <head><style>\
            body { font-family: Arial }\
            .CRITICAL { color: red }\
            .CRITICAL { font-weight: bold }\
            .WARNING { color: orange }\
            .OK { color: green }\
        </style></head>\
        <body>\
        <h2>Nagios 告警通知</h2>\
        <table border='1' cellpadding='5'>\
        <tr><td><b>通知类型</b></td><td>$NOTIFICATIONTYPE$</td></tr>\
        <tr><td><b>主机</b></td><td>$HOSTNAME$ ($HOSTADDRESS$)</td></tr>\
        <tr><td><b>服务</b></td><td>$SERVICEDESC$</td></tr>\
        <tr><td><b>状态</b></td><td class='$SERVICESTATE$'>$SERVICESTATE$</td></tr>\
        <tr><td><b>输出</b></td><td>$SERVICEOUTPUT$</td></tr>\
        <tr><td><b>时间</b></td><td>$LONGDATETIME$</td></tr>\
        </table>\
        </body></html>" | \
        /usr/bin/mail -s "$SERVICESTATE$: $HOSTNAME$/$SERVICEDESC$" \
        -a "Content-Type: text/html" $CONTACTEMAIL$
}

7.2 通知宏变量参考

| 宏变量 | 说明 | 示例值 |
| --- | --- | --- |
| $NOTIFICATIONTYPE$ | 通知类型 | PROBLEM, RECOVERY, ACKNOWLEDGEMENT |
| $CONTACTEMAIL$ | 联系人邮箱 | zhangsan@example.com |
| $CONTACTNAME$ | 联系人名称 | zhangsan |
| $HOSTNAME$ | 主机名 | web-server-01 |
| $HOSTADDRESS$ | 主机地址 | 192.168.1.100 |
| $HOSTSTATE$ | 主机状态 | DOWN, UP, UNREACHABLE |
| $HOSTOUTPUT$ | 主机检查输出 | PING CRITICAL - Packet Loss = 100% |
| $SERVICEDESC$ | 服务描述 | HTTP |
| $SERVICESTATE$ | 服务状态 | CRITICAL, WARNING, OK |
| $SERVICEOUTPUT$ | 服务检查输出 | Connection refused |
| $LONGDATETIME$ | 完整日期时间 | Mon Jan 01 12:00:00 CST 2024 |
| $SERVICEACKCOMMENT$ | 确认备注 | 已知故障 |

八、业务场景示例

8.1 分级通知策略

# Tier 1 operators: receive every alert.
define contactgroup {
    contactgroup_name   l1-ops
    members             operator1,operator2
    alias               一线运维
}

# Tier 2 specialists: receive severe alerts.
define contactgroup {
    contactgroup_name   l2-ops
    members             expert1,expert2
    alias               二线专家
}

# Management: receives long-running severe alerts.
define contactgroup {
    contactgroup_name   management
    members             ops-manager,cto
    alias               管理层
}

# Escalation policy for every service (*) on hosts in the "production" group.

# Notifications 1-3: tier 1 operators, every 5 minutes.
define serviceescalation {
    hostgroup_name          production
    service_description     *
    contact_groups          l1-ops
    first_notification      1
    last_notification       3
    notification_interval   5
}

# Notifications 4-8: tier 2 specialists, every 10 minutes.
define serviceescalation {
    hostgroup_name          production
    service_description     *
    contact_groups          l2-ops
    first_notification      4
    last_notification       8
    notification_interval   10
}

# Notification 9 onward (0 = no upper bound): management, every 30 minutes.
define serviceescalation {
    hostgroup_name          production
    service_description     *
    contact_groups          management
    first_notification      9
    last_notification       0
    notification_interval   30
}

九、注意事项

| 注意事项 | 说明 |
| --- | --- |
| 通知风暴防护 | 设置合理的 notification_interval,避免重复告警 |
| 联系人轮值 | 使用多个联系人组实现轮值机制 |
| 通知确认 | 鼓励使用 ACKNOWLEDGE 减少重复通知 |
| 静默审批 | 计划停机应有审批流程 |
| 测试通知 | 定期测试通知命令确保可达 |
| 敏感信息 | 通知内容中不要包含密码等敏感信息 |

十、本章小结

  1. 联系人:定义接收通知的人员及其联系方式
  2. 联系人组:将联系人组织为团队
  3. 通知命令:定义如何发送通知(邮件、短信、IM)
  4. 通知升级:实现分层响应机制
  5. 静默机制:通过停机和确认管理告警

下一章:第7章 命令与插件开发 - 学习命令定义和自定义插件开发。