强曰为道
与天地相似,故不违。知周乎万物,而道济天下,故不过。旁行而不流,乐天知命,故不忧。
文档目录

Tesseract OCR 完整教程 / 第 4 章:图像预处理

第 4 章:图像预处理

掌握图像预处理技术,显著提升 OCR 识别精度。

4.1 为什么需要预处理

原始图像 → 预处理 → OCR 识别 → 输出文本
   ↑          ↑
  噪声多     二值化、去噪、校正
  倾斜       → 精度提升 20-50%
  低对比度

预处理效果对比

| 预处理步骤 | 精度提升 | 适用场景 |
| --- | --- | --- |
| 灰度化 | +5% | 所有场景 |
| 二值化 | +10-20% | 文档扫描件 |
| 去噪 | +5-15% | 噪声明显的图片 |
| 倾斜校正 | +10-30% | 倾斜文档 |
| 缩放 | +5-10% | 低分辨率图片 |
| 边缘检测 | +10% | 复杂背景 |

4.2 环境准备

# Install the dependencies
pip install opencv-python numpy Pillow pytesseract

# Verify that OpenCV imports and print its version
python3 -c "import cv2; print(cv2.__version__)"

4.3 灰度化

import cv2

# Load the image (the conversion below treats it as BGR channel order)
img = cv2.imread('image.png')

# Option 1: cvtColor
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Option 2: take a single channel (simple cases only)
# NOTE: this rebinds `gray`, so the result of option 1 is discarded;
# keep only one of the two options in real code.
gray = img[:, :, 0]  # channel 0 (blue)

# Save the result
cv2.imwrite('gray.png', gray)

业务场景:所有 OCR 任务的第一步,将三通道图像转为单通道。

4.4 二值化

4.4.1 全局阈值(Global Thresholding)

import cv2

gray = cv2.imread('image.png', cv2.IMREAD_GRAYSCALE)

# Fixed threshold variants (threshold 127, max value 255)
_, binary = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)
_, binary_inv = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY_INV)
_, trunc = cv2.threshold(gray, 127, 255, cv2.THRESH_TRUNC)
_, tozero = cv2.threshold(gray, 127, 255, cv2.THRESH_TOZERO)

# Otsu automatic threshold (recommended).
# cv2.threshold returns (threshold_used, image), so a single call yields both
# the binarized image and the threshold Otsu selected.
thresh_val, binary_otsu = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

# Print the threshold chosen by Otsu
print(f"Otsu 阈值: {thresh_val}")

4.4.2 自适应阈值(Adaptive Thresholding)

import cv2

gray = cv2.imread('image.png', cv2.IMREAD_GRAYSCALE)

# Mean adaptive threshold (last two arguments: blockSize=11, C=2)
adaptive_mean = cv2.adaptiveThreshold(
    gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 11, 2
)

# Gaussian adaptive threshold (recommended; same blockSize=11, C=2)
adaptive_gauss = cv2.adaptiveThreshold(
    gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
)

二值化方法对比

| 方法 | 适用场景 | 参数 | 推荐度 |
| --- | --- | --- | --- |
| 全局阈值 | 光照均匀 | threshold | ⭐⭐ |
| Otsu | 双峰分布 | 自动 | ⭐⭐⭐⭐ |
| 自适应均值 | 光照不均 | blockSize, C | ⭐⭐⭐ |
| 自适应高斯 | 光照不均 | blockSize, C | ⭐⭐⭐⭐ |

4.4.3 自动选择方法

import cv2
import numpy as np

def auto_threshold(gray):
    """Choose a binarization method from simple global statistics of ``gray``.

    The branches are checked in this order:

    1. More than one histogram bin holds over 1% of the pixels AND the
       standard deviation exceeds 50: plain Otsu thresholding.
    2. Standard deviation below 30 (low contrast): CLAHE enhancement followed
       by Otsu thresholding.
    3. Otherwise: Gaussian adaptive thresholding (blockSize=11, C=2).

    The selected method is reported with ``print``.

    Args:
        gray: Grayscale image passed directly to ``cv2.calcHist``,
            ``cv2.threshold`` and ``cv2.adaptiveThreshold``
            (presumably single-channel 8-bit; not validated here).

    Returns:
        The binarized image produced by the selected method.
    """
    std_val = np.std(gray)

    # Normalized 256-bin intensity histogram
    hist = cv2.calcHist([gray], [0], None, [256], [0, 256])
    hist_norm = hist.ravel() / hist.sum()

    # Heuristic only: this counts bins holding more than 1% of the pixels.
    # It is not a true bimodality test, so the std-dev check below carries
    # most of the decision.
    peaks = np.where(hist_norm > 0.01)[0]

    if len(peaks) > 1 and std_val > 50:
        # High-contrast image: Otsu is computed only here, where it is used
        _, otsu_binary = cv2.threshold(
            gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
        )
        print("使用 Otsu 二值化")
        return otsu_binary

    if std_val < 30:
        # Low contrast: stretch local contrast first, then let Otsu pick
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)
        _, binary = cv2.threshold(
            enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
        )
        print("使用 CLAHE 增强 + Otsu")
        return binary

    # Remaining case is treated as uneven lighting: threshold per neighbourhood
    binary = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )
    print("使用自适应阈值")
    return binary

4.5 去噪

4.5.1 常用去噪方法

import cv2

gray = cv2.imread('noisy.png', cv2.IMREAD_GRAYSCALE)

# Method 1: Gaussian blur (5x5 kernel, sigma derived from the kernel size)
denoised_gauss = cv2.GaussianBlur(gray, (5, 5), 0)

# Method 2: median filter (recommended for salt-and-pepper noise)
denoised_median = cv2.medianBlur(gray, 3)

# Method 3: bilateral filter (edge-preserving); d=9, sigmaColor=75, sigmaSpace=75
denoised_bilateral = cv2.bilateralFilter(gray, 9, 75, 75)

# Method 4: non-local means (best quality, slower);
# h=10, templateWindowSize=7, searchWindowSize=21
denoised_nlm = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)

# Method 5: morphology (removes small specks): opening followed by closing
# with the same 3x3 rectangular kernel
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
morph_open = cv2.morphologyEx(gray, cv2.MORPH_OPEN, kernel)
morph_close = cv2.morphologyEx(morph_open, cv2.MORPH_CLOSE, kernel)

4.5.2 去噪方法对比

方法速度效果保留边缘适用场景
高斯模糊⭐⭐⭐⭐⭐⭐⭐⭐⭐轻微噪声
中值滤波⭐⭐⭐⭐⭐⭐⭐⭐⭐⭐椒盐噪声
双边滤波⭐⭐⭐⭐⭐⭐⭐⭐⭐⭐⭐⭐需保留边缘
非局部均值⭐⭐⭐⭐⭐⭐⭐⭐⭐⭐⭐高质量要求
形态学⭐⭐⭐⭐⭐⭐⭐⭐⭐⭐小噪点

4.5.3 组合去噪流程

import cv2

def denoise_pipeline(gray):
    """Run a three-stage denoising chain on ``gray`` and return the result.

    Stages, in order: 3x3 median filter, morphological opening with a 2x2
    rectangular kernel, then a light 3x3 Gaussian blur.
    """
    small_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))

    # Median filter first: targets salt-and-pepper noise
    despeckled = cv2.medianBlur(gray, 3)

    # Opening removes small isolated specks left by the median filter
    opened = cv2.morphologyEx(despeckled, cv2.MORPH_OPEN, small_kernel)

    # Final light smoothing
    return cv2.GaussianBlur(opened, (3, 3), 0)

4.6 倾斜校正

4.6.1 基于霍夫变换

import cv2
import numpy as np

def deskew_hough(image_path):
    """Deskew an image using the median angle of Hough line segments.

    Args:
        image_path: Path of the image to load with ``cv2.imread``.

    Returns:
        The rotated image, or the unmodified image when no line segments or
        no near-horizontal segments (|angle| < 45 degrees) are detected.

    Raises:
        ValueError: If the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising
        raise ValueError(f"无法读取图像: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Edge detection
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)

    # Probabilistic Hough transform: detect line segments
    lines = cv2.HoughLinesP(edges, 1, np.pi/180, 100, minLineLength=100, maxLineGap=10)

    if lines is None:
        return img

    # Each detected entry holds one (x1, y1, x2, y2) segment; flatten to (N, 4)
    segments = lines.reshape(-1, 4)
    angles = np.degrees(
        np.arctan2(segments[:, 3] - segments[:, 1],
                   segments[:, 2] - segments[:, 0])
    )
    # Keep near-horizontal segments only (drops vertical lines)
    angles = angles[np.abs(angles) < 45]

    if angles.size == 0:
        return img

    # Median is robust against a few outlier segments
    median_angle = float(np.median(angles))
    print(f"检测倾斜角度: {median_angle:.2f}°")

    # Rotate about the image centre, keeping the original canvas size
    h, w = img.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, median_angle, 1.0)
    rotated = cv2.warpAffine(img, M, (w, h), flags=cv2.INTER_CUBIC,
                             borderMode=cv2.BORDER_REPLICATE)

    return rotated

4.6.2 基于最小面积矩形

import cv2
import numpy as np

def deskew_minarea(image_path):
    """Deskew an image using the minimum-area rectangle around all foreground.

    The image is binarized with inverted Otsu thresholding, all external
    contour points are merged, and the angle of their minimum-area bounding
    rectangle is used as the skew estimate.

    Args:
        image_path: Path of the image to load with ``cv2.imread``.

    Returns:
        The rotated image, or the unmodified image when no contours are found.

    Raises:
        ValueError: If the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising
        raise ValueError(f"无法读取图像: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Binarize (inverted, so dark content becomes foreground)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Find contours
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if not contours:
        return img

    # Merge all contour points into one point set
    all_points = np.vstack(contours)

    # Minimum-area rectangle; rect[2] is its rotation angle in degrees
    rect = cv2.minAreaRect(all_points)
    angle = rect[2]

    # The reported angle range depends on the OpenCV version: [-90, 0) in
    # older releases, (0, 90] in newer ones. Either way the rectangle's edges
    # are 90 degrees apart, so fold the value into [-45, 45] to get the
    # smallest tilt of the block regardless of which edge was reported.
    if angle > 45:
        angle -= 90
    elif angle < -45:
        angle += 90

    print(f"检测倾斜角度: {angle:.2f}°")

    # RotatedRect angles are clockwise in image coordinates, while
    # getRotationMatrix2D treats positive angles as counter-clockwise, so
    # rotating by the tilt itself (not its negation) cancels the skew.
    h, w = img.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(img, M, (w, h), flags=cv2.INTER_CUBIC,
                             borderMode=cv2.BORDER_REPLICATE)

    return rotated

4.6.3 基于 Tesseract OSD

import pytesseract
from PIL import Image
import cv2
import numpy as np

def deskew_osd(image_path):
    """Correct page orientation using Tesseract OSD.

    Tesseract's orientation detection reports the needed rotation in
    quarter turns (0/90/180/270), so this fixes sideways or upside-down
    pages; it does not measure small skew angles.

    Args:
        image_path: Path of the image; opened with PIL for OSD and with
            ``cv2.imread`` for the returned array.

    Returns:
        The image as loaded by OpenCV, rotated clockwise by the angle OSD
        reports under the ``'rotate'`` key (unchanged when that angle is 0).

    Raises:
        ValueError: If OpenCV cannot read the image.
    """
    # The context manager closes the file handle that Image.open keeps open
    with Image.open(image_path) as img:
        # Query orientation information
        osd = pytesseract.image_to_osd(img, output_type=pytesseract.Output.DICT)

    angle = osd['rotate']
    print(f"OSD 检测角度: {angle}°")

    img_cv = cv2.imread(image_path)
    if img_cv is None:
        # cv2.imread signals failure by returning None instead of raising
        raise ValueError(f"无法读取图像: {image_path}")

    if angle == 0:
        return img_cv

    # Quarter turns are done losslessly with cv2.rotate: warping a 90/270
    # degree rotation into the original (w, h) canvas would crop any
    # non-square image.
    quarter_turns = {
        90: cv2.ROTATE_90_CLOCKWISE,
        180: cv2.ROTATE_180,
        270: cv2.ROTATE_90_COUNTERCLOCKWISE,
    }
    rotate_code = quarter_turns.get(angle % 360)
    if rotate_code is not None:
        return cv2.rotate(img_cv, rotate_code)

    # Fallback for any other value: clockwise rotation about the centre
    # (negative angle = clockwise for getRotationMatrix2D)
    h, w = img_cv.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, -angle, 1.0)
    rotated = cv2.warpAffine(img_cv, M, (w, h), flags=cv2.INTER_CUBIC,
                             borderMode=cv2.BORDER_REPLICATE)

    return rotated

4.6.4 倾斜校正方法对比

方法速度精度适用场景
霍夫变换⭐⭐⭐⭐⭐⭐文档、表格
最小面积矩形⭐⭐⭐⭐⭐⭐⭐⭐文字块
Tesseract OSD⭐⭐⭐⭐⭐⭐⭐⭐⭐通用场景

4.7 缩放与分辨率调整

import cv2

def resize_for_ocr(image_path, target_dpi=300):
    """Load an image and rescale it by height so text is a workable size for OCR.

    Scaling rule (height in pixels):
      * below 500: upscale 2x with cubic interpolation;
      * above 4000: downscale to 0.5x with area interpolation;
      * otherwise: returned unchanged.

    The aim is text roughly 20-40 px tall; standard print of about 3 mm is
    about 35 px at 300 DPI.

    Args:
        image_path: Path of the image to load with ``cv2.imread``.
        target_dpi: Currently unused; kept for interface compatibility.
            The rule above looks only at pixel height.

    Returns:
        The (possibly rescaled) image.

    Raises:
        ValueError: If the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising
        raise ValueError(f"无法读取图像: {image_path}")

    h = img.shape[0]

    if h < 500:
        # Too small: enlarge
        return cv2.resize(img, None, fx=2.0, fy=2.0, interpolation=cv2.INTER_CUBIC)

    if h > 4000:
        # Too large: shrink (INTER_AREA is the usual choice for downscaling)
        return cv2.resize(img, None, fx=0.5, fy=0.5, interpolation=cv2.INTER_AREA)

    return img

分辨率建议

| 输入类型 | 建议操作 | 目标高度 |
| --- | --- | --- |
| 照片(<500px) | 放大 2x | 1000px+ |
| 扫描件(300 DPI) | 保持 | - |
| 高分辨率(>4000px) | 缩小 | 2000-4000px |
| 小字体 | 放大 3-4x | 文字 30-50px |

4.8 边框处理

import cv2
import numpy as np

def remove_border(image_path):
    """Crop an image to the bounding box of its largest foreground contour.

    The image is binarized with inverted Otsu thresholding and the external
    contour with the largest area is assumed to be the main content.

    Args:
        image_path: Path of the image to load with ``cv2.imread``.

    Returns:
        The cropped image, or the unmodified image when no contours are found.

    Raises:
        ValueError: If the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising
        raise ValueError(f"无法读取图像: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Binarize (inverted, so dark content becomes foreground)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Find external contours
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if not contours:
        return img

    # Largest contour by area is taken to be the main subject
    largest = max(contours, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(largest)

    # Crop to its bounding box
    return img[y:y+h, x:x+w]

4.9 完整预处理流水线

import cv2
import numpy as np
import pytesseract
from PIL import Image

def preprocess_for_ocr(image_path, output_path=None):
    """Run the full OCR preprocessing pipeline on one image.

    Steps: load, grayscale, orientation correction via Tesseract OSD
    (best-effort), non-local-means denoising, CLAHE contrast enhancement,
    Otsu binarization, and a 2x2 morphological opening.

    Args:
        image_path: Path of the image to load with ``cv2.imread``.
        output_path: If truthy, the result is also written there with
            ``cv2.imwrite``.

    Returns:
        The cleaned, binarized image.

    Raises:
        ValueError: If the image cannot be read.
    """
    # 1. Load the image
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"无法读取图像: {image_path}")

    # 2. Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # 3. Orientation correction (OSD). Deliberately best-effort: any OSD
    #    failure is reported and the pipeline continues with the
    #    uncorrected image.
    try:
        osd = pytesseract.image_to_osd(gray, output_type=pytesseract.Output.DICT)
        angle = osd['rotate']
        if abs(angle) > 0.5:
            # OSD reports quarter turns; cv2.rotate performs them without
            # the cropping that warping into the original (w, h) canvas
            # causes for non-square images.
            quarter_turns = {
                90: cv2.ROTATE_90_CLOCKWISE,
                180: cv2.ROTATE_180,
                270: cv2.ROTATE_90_COUNTERCLOCKWISE,
            }
            rotate_code = quarter_turns.get(angle % 360)
            if rotate_code is not None:
                gray = cv2.rotate(gray, rotate_code)
            else:
                # Fallback for any other value: clockwise rotation about
                # the centre on the original canvas
                h, w = gray.shape
                M = cv2.getRotationMatrix2D((w//2, h//2), -angle, 1.0)
                gray = cv2.warpAffine(gray, M, (w, h), flags=cv2.INTER_CUBIC,
                                      borderMode=cv2.BORDER_REPLICATE)
            print(f"校正倾斜: {angle}°")
    except Exception as e:
        print(f"OSD 检测失败: {e}")

    # 4. Denoise (h=10, templateWindowSize=7, searchWindowSize=21)
    denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)

    # 5. Contrast enhancement
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(denoised)

    # 6. Binarize
    _, binary = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # 7. Morphological opening to remove small specks.
    #    NOTE(review): opening removes small *bright* regions; for dark text
    #    on a white background confirm this is the intended polarity.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    cleaned = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

    # 8. Optionally save the result
    if output_path:
        cv2.imwrite(output_path, cleaned)

    return cleaned

def compare_ocr(image_path):
    """Print OCR output for the raw image and for the preprocessed image.

    Both passes use the ``chi_sim+eng`` language setting and print only the
    first 200 characters of the recognized text. As a side effect the
    preprocessed image is written to ``preprocessed.png`` in the current
    working directory.

    Args:
        image_path: Path of the image to compare.
    """
    import pytesseract
    from PIL import Image

    # OCR on the original image; the context manager closes the file handle
    # that Image.open would otherwise leave open
    with Image.open(image_path) as img:
        text_original = pytesseract.image_to_string(img, lang='chi_sim+eng')

    # OCR on the preprocessed image
    preprocessed = preprocess_for_ocr(image_path, 'preprocessed.png')
    text_processed = pytesseract.image_to_string(
        Image.fromarray(preprocessed), lang='chi_sim+eng'
    )

    print("=== 原图识别 ===")
    print(text_original[:200])
    print("\n=== 预处理后识别 ===")
    print(text_processed[:200])

# Script entry point: run the before/after OCR comparison on 'test.png'
if __name__ == '__main__':
    compare_ocr('test.png')

4.10 特殊场景预处理

4.10.1 手机拍照文档

def preprocess_phone_photo(image_path):
    """Preprocess a document photographed with a phone.

    Steps: downscale tall images (height > 3000 px) to a height of 2000 px,
    convert to grayscale, apply CLAHE (clipLimit=3.0), then Gaussian adaptive
    thresholding (blockSize=15, C=8).

    Args:
        image_path: Path of the image to load with ``cv2.imread``.

    Returns:
        The binarized image.

    Raises:
        ValueError: If the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising
        raise ValueError(f"无法读取图像: {image_path}")

    # 1. Downscale (phone photos are usually very large); only the height
    #    is checked
    h = img.shape[0]
    if h > 3000:
        scale = 2000 / h
        img = cv2.resize(img, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)

    # 2. Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # 3. CLAHE contrast enhancement (phone photos often contain shadows)
    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(gray)

    # 4. Adaptive binarization (copes with uneven lighting)
    binary = cv2.adaptiveThreshold(
        enhanced, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, 8
    )

    return binary

4.10.2 老旧扫描件

def preprocess_old_scan(image_path):
    """Preprocess an old, noisy scan.

    Steps: grayscale, strong non-local-means denoising (h=15), stroke
    thickening to reconnect broken characters, then Otsu binarization.

    Assumes dark text on a light background (the usual case for scans).

    Args:
        image_path: Path of the image to load with ``cv2.imread``.

    Returns:
        The binarized image.

    Raises:
        ValueError: If the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising
        raise ValueError(f"无法读取图像: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # 1. Denoise (old scans are noisy, hence the stronger h=15)
    denoised = cv2.fastNlMeansDenoising(gray, None, 15, 7, 21)

    # 2. Repair broken strokes. Erosion takes the local minimum, which grows
    #    dark regions: on dark-on-light text this thickens the strokes and
    #    bridges small gaps. (Dilation would grow the light background and
    #    thin the strokes instead.)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    thickened = cv2.erode(denoised, kernel, iterations=1)

    # 3. Binarize
    _, binary = cv2.threshold(thickened, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    return binary

4.11 预处理参数调优

| 参数 | 范围 | 调优建议 |
| --- | --- | --- |
| blockSize (自适应阈值) | 3-51 (奇数) | 11-21,越大越平滑 |
| C (自适应阈值) | -10 到 10 | 2-8,越大阈值越低、背景噪点越少,但细笔画更易丢失 |
| clipLimit (CLAHE) | 1-5 | 2-3,越大对比度越强 |
| h (NLM 去噪) | 3-20 | 10-15,越大去噪越强 |
| 滤波核大小 | 3-15 (奇数) | 3-5,越大越模糊 |

4.12 本章小结

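| 步骤 | 方法 | 推荐 |
| --- | --- | --- |
| 灰度化 | cvtColor | 所有场景 |
| 二值化 | Otsu / 自适应 | 根据光照选择 |
| 去噪 | NLM / 中值 | 根据噪声类型选择 |
| 倾斜校正 | OSD / 霍夫变换 | OSD 判断 90° 整数倍的页面方向;小角度倾斜用霍夫变换 |
| 缩放 | resize | 目标文字 20-40px |
| 对比度 | CLAHE | 低对比度场景 |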

4.13 扩展阅读


上一章: 基本使用 | 下一章: 多语言支持