强曰为道
与天地相似,故不违。知周乎万物,而道济天下,故不过。旁行而不流,乐天知命,故不忧。
文档目录

AWK & SED 生产力教程 / 第 11 章:日志分析

第 11 章:日志分析

日志是系统的"黑匣子"。掌握日志分析,你就能在问题发生时快速定位根因。

11.1 常见日志格式

Nginx/Apache 访问日志

192.168.1.1 - - [15/Jan/2024:10:23:45 +0800] "GET /index.html HTTP/1.1" 200 1234 "https://example.com" "Mozilla/5.0"
字段位置说明
客户端IP$1客户端地址
用户标识$2通常为 -
认证用户$3通常为 -
时间戳$4 $5[15/Jan/2024:10:23:45 +0800]
请求方法$6"GET
请求路径$7/index.html
协议版本$8HTTP/1.1"
状态码$9200
响应大小$101234
来源页$11"https://example.com"
用户代理$12 及之后"Mozilla/5.0"(UA 含空格时会跨多个字段)

Syslog 格式

Jan 15 10:23:45 hostname sshd[12345]: Accepted password for user from 192.168.1.1 port 52413 ssh2

应用日志格式

2024-01-15 10:23:45 [ERROR] module=auth message="Login failed" user=alice ip=192.168.1.1

11.2 访问日志分析

基本统计

# 创建示例日志
cat > access.log << 'EOF'
192.168.1.1 - - [15/Jan/2024:10:00:00 +0800] "GET /index.html HTTP/1.1" 200 1234
192.168.1.2 - - [15/Jan/2024:10:00:01 +0800] "GET /style.css HTTP/1.1" 200 5678
192.168.1.1 - - [15/Jan/2024:10:00:02 +0800] "GET /api/users HTTP/1.1" 200 890
192.168.1.3 - - [15/Jan/2024:10:00:03 +0800] "POST /api/login HTTP/1.1" 401 123
192.168.1.1 - - [15/Jan/2024:10:00:04 +0800] "GET /missing.html HTTP/1.1" 404 0
192.168.1.2 - - [15/Jan/2024:10:00:05 +0800] "GET /admin HTTP/1.1" 403 0
192.168.1.4 - - [15/Jan/2024:10:00:06 +0800] "GET /index.html HTTP/1.1" 200 1234
192.168.1.1 - - [15/Jan/2024:10:00:07 +0800] "GET /api/data HTTP/1.1" 500 0
192.168.1.3 - - [15/Jan/2024:10:00:08 +0800] "GET /index.html HTTP/1.1" 200 1234
192.168.1.5 - - [15/Jan/2024:10:00:09 +0800] "GET /robots.txt HTTP/1.1" 200 50
192.168.1.1 - - [15/Jan/2024:10:01:00 +0800] "GET /index.html HTTP/1.1" 200 1234
192.168.1.6 - - [15/Jan/2024:10:01:01 +0800] "POST /api/login HTTP/1.1" 200 456
192.168.1.7 - - [15/Jan/2024:10:01:02 +0800] "GET /dashboard HTTP/1.1" 301 0
192.168.1.8 - - [15/Jan/2024:10:01:03 +0800] "GET /index.html HTTP/1.1" 200 1234
192.168.1.1 - - [15/Jan/2024:10:01:04 +0800] "GET /api/data HTTP/1.1" 200 5678
EOF
# 统计总请求数
$ wc -l < access.log
15

# 统计独立 IP 数
$ awk '{print $1}' access.log | sort -u | wc -l
8

# 统计各状态码分布
$ awk '{count[$9]++} END {
    for (s in count) printf "%-6s %4d\n", s, count[s]
}' access.log | sort
200      10
301      1
401      1
403      1
404      1
500      1

流量统计

# 计算总传输量
$ awk '{sum+=$10} END {printf "总传输量: %.2f KB (%.2f MB)\n", sum/1024, sum/1048576}' access.log

# 每个 IP 的传输量
$ awk '{ip_bytes[$1]+=$10} END {
    for (ip in ip_bytes) printf "%12.2f KB  %s\n", ip_bytes[ip]/1024, ip
}' access.log | sort -rn

# 每个请求路径的平均响应大小
$ awk '{
    path_count[$7]++
    path_bytes[$7]+=$10
} END {
    for (p in path_count)
        printf "%8.0f B  %s\n", path_bytes[p]/path_count[p], p
}' access.log | sort -rn

11.3 错误分析

错误请求分类

# 列出所有错误请求
$ awk '$9 >= 400 {print $9, $6, $7, $1}' access.log

# 按状态码统计错误
$ awk '$9 >= 400 {count[$9]++} END {
    for (s in count) printf "%s: %d 次\n", s, count[s]
}' access.log | sort

# 按请求路径统计错误
$ awk '$9 >= 400 {count[$7]++} END {
    for (p in count) printf "%4d %s\n", count[p], p
}' access.log | sort -rn

# 找出触发 500 错误的 IP
$ awk '$9 == 500 {print $1}' access.log | sort | uniq -c | sort -rn

🏢 场景:错误趋势分析

# 按分钟统计错误数量
$ awk '$9 >= 400 {
    # 提取时间中的分钟部分
    split($4, t, ":")
    minute = t[2]":"t[3]
    count[minute]++
} END {
    for (m in count) printf "%s %d\n", m, count[m]
}' access.log | sort

# 错误率变化趋势
$ awk '{
    split($4, t, ":")
    minute = t[2]":"t[3]
    total[minute]++
    if ($9 >= 400) errors[minute]++
} END {
    for (m in total) {
        e = (m in errors) ? errors[m] : 0
        printf "%s  总请求: %3d  错误: %3d  错误率: %.1f%%\n", m, total[m], e, e/total[m]*100
    }
}' access.log | sort

11.4 趋势分析

时间维度分析

# 每小时请求数趋势
$ awk -F'[/: ]' '{
    hour = $7
    count[hour]++
} END {
    for (h in count) {
        printf "%s:00 %4d ", h, count[h]
        for (i=0; i<count[h]; i++) printf "█"
        printf "\n"
    }
}' access.log | sort

# 每天请求数趋势(适用于多天日志)
$ awk '{
    split($4, d, ":")
    date = substr(d[1], 2)  # 去掉开头的 [
    count[date]++
} END {
    for (d in count) printf "%s %d\n", d, count[d]
}' access.log | sort

峰值检测

# 找出请求最频繁的时段
$ awk '{
    split($4, t, ":")
    hour = t[2]
    count[hour]++
} END {
    max_count = 0
    for (h in count) {
        if (count[h] > max_count) {
            max_count = count[h]
            peak_hour = h
        }
    }
    printf "峰值时段: %s:00 (%d 次请求)\n", peak_hour, max_count
}' access.log

# 找出请求最频繁的分钟
$ awk '{
    split($4, t, ":")
    minute = t[2]":"t[3]
    count[minute]++
} END {
    for (m in count) {
        if (count[m] > max) {
            max = count[m]
            peak = m
        }
    }
    printf "峰值分钟: %s (%d 次请求)\n", peak, max
}' access.log

11.5 告警系统

🏢 场景:实时日志告警

#!/bin/bash
# log_alert.sh — real-time log alerting.
# Tails an Nginx access log and prints an alert whenever the number of
# error responses (HTTP status >= 400) seen within one minute reaches
# ALERT_THRESHOLD. The check fires on each minute boundary, so the
# currently accumulating minute is only reported once the next minute's
# first line arrives.

LOG_FILE="/var/log/nginx/access.log"
ALERT_THRESHOLD=10  # errors-per-minute alert threshold
# Reserved for forwarding alerts to a webhook (see the system() hint below).
# shellcheck disable=SC2034
SLACK_WEBHOOK="https://hooks.slack.com/services/xxx"

# Reads access-log lines on stdin; emits one alert line per minute whose
# error count reaches the threshold passed as $1.
alert_on_errors() {
    awk -v threshold="$1" '
    BEGIN {
        minute = ""
        errors = 0
    }
    {
        # $4 looks like [15/Jan/2024:10:23:45 — t[2] is the hour, t[3] the minute.
        split($4, t, ":")
        current_minute = t[2]":"t[3]

        # On a minute boundary, check the just-finished minute against the threshold.
        if (current_minute != minute) {
            if (errors >= threshold) {
                printf "🚨 告警: %s 分钟内错误数 %d 超过阈值 %d\n", minute, errors, threshold
                # stdout is block-buffered when piped; flush so the alert
                # is delivered immediately while streaming via tail -f.
                fflush()
                # Forward to Slack/DingTalk etc., e.g.:
                # system("curl -X POST ...")
            }
            minute = current_minute
            errors = 0
        }

        # Count error responses (status code is field 9).
        if ($9 >= 400) errors++
    }'
}

tail -f "$LOG_FILE" | alert_on_errors "$ALERT_THRESHOLD"

🏢 场景:异常 IP 检测

#!/bin/bash
# detect_anomaly.sh — flag IPs whose hourly request count exceeds a threshold.
# Usage: detect_anomaly.sh [logfile] [threshold]
#   logfile   defaults to /var/log/nginx/access.log (consistent with log_report.sh)
#   threshold defaults to 100 requests per hour

LOG_FILE="${1:-/var/log/nginx/access.log}"
THRESHOLD="${2:-100}"  # requests-per-hour threshold

# Scans the access log given as $1 and prints every IP/hour bucket whose
# request count is at or above the threshold given as $2.
find_anomalous_ips() {
    local log=$1 threshold=$2
    awk -v threshold="$threshold" '
    {
        ip = $1
        # $4 looks like [15/Jan/2024:10:23:45 — t[2] is the hour.
        split($4, t, ":")
        hour = t[2]
        key = ip":"hour
        count[key]++
    }
    END {
        for (k in count) {
            if (count[k] >= threshold) {
                # IPv4 addresses contain no ":", so the key splits back
                # cleanly into ip (parts[1]) and hour (parts[2]).
                split(k, parts, ":")
                printf "⚠️  异常 IP: %s 在 %s:00 时段请求 %d 次\n", parts[1], parts[2], count[k]
            }
        }
    }' "$log"
}

find_anomalous_ips "$LOG_FILE" "$THRESHOLD"

🏢 场景:慢请求检测

# 假设日志格式中包含响应时间字段
# 192.168.1.1 - - [15/Jan/2024:10:00:00 +0800] "GET /api/data HTTP/1.1" 200 1234 1.234

cat > slow_access.log << 'EOF'
192.168.1.1 GET /api/users 200 0.045
192.168.1.2 GET /api/data 200 2.345
192.168.1.1 POST /api/upload 200 5.678
192.168.1.3 GET /index.html 200 0.012
192.168.1.1 GET /api/report 200 10.234
192.168.1.4 GET /api/users 200 0.089
EOF

# 找出响应时间超过 1 秒的请求
$ awk '$5 > 1.0 {
    printf "⚠️  慢请求: %s %s %s 响应时间 %.3fs\n", $1, $2, $3, $5
}' slow_access.log

# 统计各接口的平均响应时间
$ awk '{
    api = $2 " " $3
    time_sum[api] += $5
    time_count[api]++
    if ($5 > time_max[api]) time_max[api] = $5
} END {
    printf "%-30s %10s %10s %10s\n", "接口", "请求数", "平均(s)", "最大(s)"
    printf "%-30s %10s %10s %10s\n", "------------------------------", "----------", "----------", "----------"
    for (a in time_sum)
        printf "%-30s %10d %10.3f %10.3f\n", a, time_count[a], time_sum[a]/time_count[a], time_max[a]
}' slow_access.log

11.6 高级日志分析

用户行为分析

# 用户访问路径追踪
$ awk '{
    ip = $1
    path = $7
    time = $4 " " $5
    printf "%s %s %s\n", ip, time, path
}' access.log | sort | awk '
BEGIN { prev_ip = "" }
{
    if ($1 != prev_ip) {
        if (prev_ip != "") print ""
        printf "用户 %s:\n", $1
        prev_ip = $1
    }
    printf "  %s → %s\n", $2, $3
}'

# 找出典型的用户访问模式
$ awk '{print $7}' access.log | awk '
BEGIN { prev = "START" }
{
    pattern = prev " → " $0
    count[pattern]++
    prev = $0
}
END {
    for (p in count) printf "%4d %s\n", count[p], p
}' | sort -rn | head -10

会话分析

# 基于 IP 的会话分析(简化版)
$ awk '{
    ip = $1
    split($4, t, ":")
    hour = t[2]
    minute = t[3]
    seconds = t[4]
    gsub(/\]/, "", seconds)
    time_in_seconds = hour*3600 + minute*60 + seconds
    
    if (ip in last_time) {
        gap = time_in_seconds - last_time[ip]
        if (gap > 1800) {  # 超过 30 分钟认为新会话
            sessions[ip]++
        }
    } else {
        sessions[ip] = 1
    }
    last_time[ip] = time_in_seconds
    requests[ip]++
} END {
    printf "%-16s %8s %10s\n", "IP", "请求数", "会话数"
    for (ip in requests)
        printf "%-16s %8d %10d\n", ip, requests[ip], sessions[ip]
}' access.log

爬虫检测

# 检测可能的爬虫
$ awk '{
    ip = $1
    path = $7
    count[ip]++
    
    # 检测高频访问特定路径的 IP
    if (path ~ /\.(css|js|png|jpg|gif|ico)$/) {
        static[ip]++
    }
} END {
    for (ip in count) {
        total = count[ip]
        static_count = (ip in static) ? static[ip] : 0
        if (total > 50 || (static_count > 0 && static_count/total > 0.8)) {
            printf "🤖 疑似爬虫: %-16s 总请求: %d 静态资源: %d (%.0f%%)\n",
                ip, total, static_count, static_count/total*100
        }
    }
}' access.log

11.7 日志分析报告

综合分析报告

#!/bin/bash
# log_report.sh — generate an analysis report for an Nginx access log.
# Usage: log_report.sh [logfile]   (defaults to /var/log/nginx/access.log)
# The report is written to log_report_<timestamp>.txt in the current directory.

LOG_FILE="${1:-/var/log/nginx/access.log}"
REPORT_FILE="log_report_$(date +%Y%m%d_%H%M%S).txt"

# Prints the complete report for the log file given as $1 to stdout.
generate_report() {
    local log=$1

    echo "╔══════════════════════════════════════════════════════╗"
    echo "║             Nginx 日志分析报告                       ║"
    echo "╠══════════════════════════════════════════════════════╣"
    echo "║  生成时间: $(date '+%Y-%m-%d %H:%M:%S')"
    echo "║  日志文件: ${log}"
    echo "╚══════════════════════════════════════════════════════╝"
    echo ""

    echo "=== 总体统计 ==="
    awk '{
        total++
        bytes += $10
        if ($9 >= 400) errors++
    } END {
        if (total == 0) exit  # empty log: avoid division by zero below
        printf "总请求数:     %d\n", total
        printf "错误请求数:   %d (%.2f%%)\n", errors, errors/total*100
        printf "总传输量:     %.2f MB\n", bytes/1048576
        printf "平均响应大小: %.0f B\n", bytes/total
    }' "$log"

    echo ""
    echo "=== 状态码分布 ==="
    awk '{count[$9]++} END {
        for (s in count) printf "  %s: %d\n", s, count[s]
    }' "$log" | sort

    echo ""
    echo "=== Top 10 IP ==="
    awk '{count[$1]++} END {
        for (ip in count) printf "%6d  %s\n", count[ip], ip
    }' "$log" | sort -rn | head -10

    echo ""
    echo "=== Top 10 请求路径 ==="
    awk '{count[$7]++} END {
        for (p in count) printf "%6d  %s\n", count[p], p
    }' "$log" | sort -rn | head -10

    echo ""
    echo "=== 错误请求详情 ==="
    awk '$9 >= 400 {
        printf "  %s %s %s %s\n", $9, $1, $6, $7
    }' "$log" | sort | uniq -c | sort -rn | head -10
}

generate_report "$LOG_FILE" > "$REPORT_FILE"

echo "报告已生成: ${REPORT_FILE}"

11.8 日志分析速查

# 基本统计
wc -l < logfile                          # 总行数
awk '{print $1}' log | sort -u | wc -l   # 独立 IP 数
awk '{print $9}' log | sort | uniq -c    # 状态码分布

# 错误分析
awk '$9 >= 400' log                      # 所有错误
awk '$9 == 500' log                      # 服务器错误
awk '$9 == 404' log                      # 未找到

# 流量分析
awk '{sum+=$10} END{print sum}' log      # 总字节数
awk '{ip[$1]+=$10} END{for(i in ip) print ip[i], i}' log | sort -rn  # IP 流量

# 时间分析
awk -F'[/: ]' '{print $7}' log | sort | uniq -c  # 按小时统计
awk '{print $7}' log | sort | uniq -c | sort -rn  # 热门路径

# 安全分析
awk '{print $1}' log | sort | uniq -c | sort -rn | head -20  # 高频 IP
awk '$9 == 401 || $9 == 403' log                      # 认证失败

扩展阅读


下一章:第 12 章:报告生成 — 数据汇总、格式化输出、HTML/CSV 报告。