Tesseract OCR 完整教程 / 第 4 章：图像预处理

第 4 章：图像预处理

掌握图像预处理技术，显著提升 OCR 识别精度。

4.1 为什么需要预处理

原始图像 → 预处理 → OCR 识别 → 输出文本
   ↑          ↑
  噪声多     二值化、去噪、校正
  倾斜       → 精度提升 20-50%
  低对比度

预处理效果对比：

预处理步骤	精度提升	适用场景
灰度化	+5%	所有场景
二值化	+10-20%	文档扫描件
去噪	+5-15%	噪声明显的图片
倾斜校正	+10-30%	倾斜文档
缩放	+5-10%	低分辨率图片
边缘检测	+10%	复杂背景

4.2 环境准备

# 安装依赖
pip install opencv-python numpy Pillow pytesseract

# 验证
python3 -c "import cv2; print(cv2.__version__)"

4.3 灰度化

import cv2

# Load the image; the conversions below treat it as a 3-channel BGR array.
img = cv2.imread('image.png')

# Option 1: convert BGR to grayscale with cvtColor.
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Option 2: single-channel extraction (simple cases only).
# This rebinds `gray`, replacing the result of option 1.
gray = img[:, :, 0]  # index 0 is the blue channel

# Save the grayscale result.
cv2.imwrite('gray.png', gray)

业务场景：所有 OCR 任务的第一步，将三通道图像转为单通道。

4.4 二值化

4.4.1 全局阈值（Global Thresholding）

import cv2

# Load the image directly as single-channel grayscale.
gray = cv2.imread('image.png', cv2.IMREAD_GRAYSCALE)

# Fixed threshold of 127, shown with four different thresholding modes.
_, binary = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)
_, binary_inv = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY_INV)
_, trunc = cv2.threshold(gray, 127, 255, cv2.THRESH_TRUNC)
_, tozero = cv2.threshold(gray, 127, 255, cv2.THRESH_TOZERO)

# Otsu automatic threshold (recommended). A single call returns both the
# threshold that was chosen and the binarized image, so there is no need to
# run the computation a second time just to read the threshold.
thresh_val, binary_otsu = cv2.threshold(
    gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
)

# Report the threshold Otsu selected.
print(f"Otsu 阈值: {thresh_val}")

4.4.2 自适应阈值（Adaptive Thresholding）

import cv2

# Load the image directly as single-channel grayscale.
gray = cv2.imread('image.png', cv2.IMREAD_GRAYSCALE)

# Adaptive threshold based on the neighborhood mean.
# The last two positional arguments are blockSize=11 and C=2.
adaptive_mean = cv2.adaptiveThreshold(
    gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 11, 2
)

# Adaptive threshold based on a Gaussian-weighted neighborhood (recommended).
adaptive_gauss = cv2.adaptiveThreshold(
    gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
)

二值化方法对比：

方法	适用场景	参数	推荐度
全局阈值	光照均匀	`threshold`	⭐⭐
Otsu	双峰分布	自动	⭐⭐⭐⭐
自适应均值	光照不均	`blockSize, C`	⭐⭐⭐
自适应高斯	光照不均	`blockSize, C`	⭐⭐⭐⭐

4.4.3 自动选择方法

import cv2
import numpy as np

def auto_threshold(gray):
    """Choose a binarization method from global image statistics and apply it.

    The decision uses the standard deviation of the pixel values:

    * std > 50 and more than one histogram bin holding over 1% of the
      pixels: plain Otsu thresholding.
    * std < 30: CLAHE contrast enhancement followed by Otsu.
    * otherwise: Gaussian adaptive thresholding (blockSize=11, C=2).

    Args:
        gray: Grayscale image array. Presumably single-channel 8-bit, since
            it is passed to a 256-bin histogram over [0, 256), CLAHE and
            adaptiveThreshold -- confirm against callers.

    Returns:
        The binarized image produced by the selected method.
    """
    contrast = np.std(gray)

    if contrast > 50:
        # Only build the histogram when the contrast test has already
        # passed; the other branches never look at it.
        hist = cv2.calcHist([gray], [0], None, [256], [0, 256])
        hist_norm = hist.ravel() / hist.sum()
        # Heuristic stand-in for a bimodality test: count the bins that
        # each hold more than 1% of all pixels.
        populated_bins = np.count_nonzero(hist_norm > 0.01)
        if populated_bins > 1:
            _, binary = cv2.threshold(
                gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
            )
            print("使用 Otsu 二值化")
            return binary

    if contrast < 30:
        # Low contrast: enhance locally first, then let Otsu pick a threshold.
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)
        _, binary = cv2.threshold(
            enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
        )
        print("使用 CLAHE 增强 + Otsu")
        return binary

    # Everything else is treated as uneven illumination.
    binary = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )
    print("使用自适应阈值")
    return binary

4.5 去噪

4.5.1 常用去噪方法

import cv2

# Load the noisy sample directly as single-channel grayscale.
gray = cv2.imread('noisy.png', cv2.IMREAD_GRAYSCALE)

# Method 1: Gaussian blur with a 5x5 kernel.
denoised_gauss = cv2.GaussianBlur(gray, (5, 5), 0)

# Method 2: median filter (recommended for salt-and-pepper noise).
denoised_median = cv2.medianBlur(gray, 3)

# Method 3: bilateral filter (edge-preserving).
denoised_bilateral = cv2.bilateralFilter(gray, 9, 75, 75)

# Method 4: non-local means denoising (best quality, slower).
denoised_nlm = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)

# Method 5: morphological operations (remove small specks):
# an opening followed by a closing, both with the same 3x3 rectangular kernel.
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
morph_open = cv2.morphologyEx(gray, cv2.MORPH_OPEN, kernel)
morph_close = cv2.morphologyEx(morph_open, cv2.MORPH_CLOSE, kernel)

4.5.2 去噪方法对比

方法	速度	效果	保留边缘	适用场景
高斯模糊	⭐⭐⭐⭐⭐	⭐⭐	⭐⭐	轻微噪声
中值滤波	⭐⭐⭐⭐	⭐⭐⭐	⭐⭐⭐	椒盐噪声
双边滤波	⭐⭐⭐	⭐⭐⭐⭐	⭐⭐⭐⭐⭐	需保留边缘
非局部均值	⭐⭐	⭐⭐⭐⭐⭐	⭐⭐⭐⭐	高质量要求
形态学	⭐⭐⭐⭐	⭐⭐⭐	⭐⭐⭐	小噪点

4.5.3 组合去噪流程

import cv2

def denoise_pipeline(gray):
    """Apply a three-stage denoising chain and return the final image.

    Stages, in order: a 3x3 median filter, a morphological opening with a
    2x2 rectangular kernel, and a 3x3 Gaussian blur.

    Args:
        gray: Input image array; it is passed unchanged to cv2.medianBlur.

    Returns:
        The image after all three stages.
    """
    opening_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))

    # Stage 1: median filter against salt-and-pepper noise.
    despeckled = cv2.medianBlur(gray, 3)
    # Stage 2: opening to drop small specks.
    opened = cv2.morphologyEx(despeckled, cv2.MORPH_OPEN, opening_kernel)
    # Stage 3: light Gaussian smoothing.
    return cv2.GaussianBlur(opened, (3, 3), 0)

4.6 倾斜校正

4.6.1 基于霍夫变换

import cv2
import numpy as np

def deskew_hough(image_path):
    """Deskew an image using the median angle of Hough line segments.

    Canny edges are fed to the probabilistic Hough transform. The angle of
    every near-horizontal segment (within 45 degrees of horizontal) is
    collected, and the image is rotated about its center by the median
    angle.

    Args:
        image_path: Path handed to cv2.imread.

    Returns:
        The rotated image, or the image as loaded when no usable line
        segment is found.

    Raises:
        ValueError: If the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None; fail with a clear
        # message instead of an opaque error inside cvtColor.
        raise ValueError(f"无法读取图像: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Edge map for the Hough transform.
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)

    # Detect line segments.
    lines = cv2.HoughLinesP(edges, 1, np.pi/180, 100, minLineLength=100, maxLineGap=10)
    if lines is None:
        return img

    # Each entry is (x1, y1, x2, y2); compute all segment angles at once.
    segments = lines.reshape(-1, 4)
    dx = segments[:, 2] - segments[:, 0]
    dy = segments[:, 3] - segments[:, 1]
    angles = np.degrees(np.arctan2(dy, dx))

    # Fold into [-90, 90) so a segment listed right-to-left (angle near
    # +/-180) counts as the same near-horizontal line instead of being
    # discarded by the filter below.
    angles = (angles + 90.0) % 180.0 - 90.0

    # Keep near-horizontal segments only (drops vertical lines).
    angles = angles[np.abs(angles) < 45]
    if angles.size == 0:
        return img

    median_angle = float(np.median(angles))
    print(f"检测倾斜角度: {median_angle:.2f}°")

    # Rotate about the image center, keeping the original canvas size.
    h, w = img.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, median_angle, 1.0)
    rotated = cv2.warpAffine(img, M, (w, h), flags=cv2.INTER_CUBIC,
                             borderMode=cv2.BORDER_REPLICATE)

    return rotated

4.6.2 基于最小面积矩形

import cv2
import numpy as np

def deskew_minarea(image_path):
    """Deskew an image using the minimum-area rectangle around its content.

    The image is binarized with inverted Otsu, all external contours are
    merged into one point set, and the angle of the minimum-area rectangle
    around those points is used as the skew estimate.

    Args:
        image_path: Path handed to cv2.imread.

    Returns:
        The rotated image, or the image as loaded when no contour is found.

    Raises:
        ValueError: If the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"无法读取图像: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Inverted Otsu: pixels darker than the threshold become foreground.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return img

    # Merge every contour into a single point set.
    all_points = np.vstack(contours)

    rect = cv2.minAreaRect(all_points)
    angle = rect[2]

    # minAreaRect reports the angle in [-90, 0) on older OpenCV releases and
    # in (0, 90] on newer ones. Fold either convention into [-45, 45] so the
    # value is the deviation from the nearest axis; an upright block then
    # yields 0 rather than +/-90.
    if angle < -45:
        angle += 90
    elif angle > 45:
        angle -= 90

    print(f"检测倾斜角度: {angle:.2f}°")

    # The rectangle angle is measured clockwise in image coordinates, while
    # getRotationMatrix2D treats positive angles as counter-clockwise, so
    # passing the angle unchanged undoes the skew.
    h, w = img.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(img, M, (w, h), flags=cv2.INTER_CUBIC,
                             borderMode=cv2.BORDER_REPLICATE)

    return rotated

4.6.3 基于 Tesseract OSD

import pytesseract
from PIL import Image
import cv2
import numpy as np

def deskew_osd(image_path):
    """Correct page orientation using Tesseract's OSD result.

    The 'rotate' value from pytesseract.image_to_osd is applied as a
    clockwise rotation. Right-angle values are applied with cv2.rotate so
    the output canvas swaps width and height as needed; a fixed-size affine
    warp would crop a non-square page on a 90 or 270 degree turn.

    Args:
        image_path: Path of the image; opened with PIL for OSD and with
            cv2.imread for the returned array.

    Returns:
        The image array as loaded by cv2.imread, rotated when OSD reports a
        non-zero angle.

    Raises:
        ValueError: If cv2.imread cannot read the image.
    """
    # Close the PIL file handle as soon as OSD has run.
    with Image.open(image_path) as img:
        osd = pytesseract.image_to_osd(img, output_type=pytesseract.Output.DICT)

    angle = osd['rotate']
    print(f"OSD 检测角度: {angle}°")

    # Read the pixel data once and reuse it on every return path.
    img_cv = cv2.imread(image_path)
    if img_cv is None:
        raise ValueError(f"无法读取图像: {image_path}")

    if angle == 0:
        return img_cv

    # Lossless right-angle rotations, keyed by clockwise degrees.
    quarter_turns = {
        90: cv2.ROTATE_90_CLOCKWISE,
        180: cv2.ROTATE_180,
        270: cv2.ROTATE_90_COUNTERCLOCKWISE,
    }
    rotate_code = quarter_turns.get(angle % 360)
    if rotate_code is not None:
        return cv2.rotate(img_cv, rotate_code)

    # Fallback for any other angle: affine rotation about the center on the
    # original canvas (negative angle = clockwise for getRotationMatrix2D).
    h, w = img_cv.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, -angle, 1.0)
    rotated = cv2.warpAffine(img_cv, M, (w, h), flags=cv2.INTER_CUBIC,
                             borderMode=cv2.BORDER_REPLICATE)

    return rotated

4.6.4 倾斜校正方法对比

方法	速度	精度	适用场景
霍夫变换	⭐⭐⭐	⭐⭐⭐	文档、表格
最小面积矩形	⭐⭐⭐⭐	⭐⭐⭐⭐	文字块
Tesseract OSD	⭐⭐⭐⭐	⭐⭐⭐⭐⭐	整页方向校正（仅识别 0°/90°/180°/270°，不检测小角度倾斜）

4.7 缩放与分辨率调整

import cv2

def resize_for_ocr(image_path, target_dpi=300):
    """Load an image and rescale it when its height is far from OCR-friendly.

    Images shorter than 500 px are enlarged 2x with cubic interpolation;
    images taller than 4000 px are halved with area interpolation; anything
    in between is returned as loaded.

    Args:
        image_path: Path handed to cv2.imread.
        target_dpi: Kept for interface compatibility; the current heuristic
            does not read it.

    Returns:
        The (possibly rescaled) image array.

    Raises:
        ValueError: If the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"无法读取图像: {image_path}")
    h = img.shape[0]

    # Goal: glyph height of roughly 20-40 px. Standard printed text is about
    # 3 mm tall, which is about 35 px at 300 DPI.
    if h < 500:
        # Too small: upscale.
        return cv2.resize(img, None, fx=2.0, fy=2.0, interpolation=cv2.INTER_CUBIC)
    if h > 4000:
        # Too large: downscale.
        return cv2.resize(img, None, fx=0.5, fy=0.5, interpolation=cv2.INTER_AREA)

    return img

分辨率建议：

输入类型	建议操作	目标高度
照片（<500px）	放大 2x	1000px+
扫描件（300 DPI）	保持	-
高分辨率（>4000px）	缩小	2000-4000px
小字体	放大 3-4x	文字 30-50px

4.8 边框处理

import cv2
import numpy as np

def remove_border(image_path):
    """Crop an image to the bounding box of its largest external contour.

    The image is binarized with inverted Otsu, external contours are
    extracted, and the one with the largest area is assumed to be the main
    content.

    Args:
        image_path: Path handed to cv2.imread.

    Returns:
        The cropped image, or the image as loaded when no contour is found.

    Raises:
        ValueError: If the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"无法读取图像: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Inverted Otsu: pixels darker than the threshold become foreground.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return img

    # Largest contour by area is taken to be the subject.
    largest = max(contours, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(largest)

    return img[y:y+h, x:x+w]

4.9 完整预处理流水线

import cv2
import numpy as np
import pytesseract
from PIL import Image

def preprocess_for_ocr(image_path, output_path=None):
    """Run the full OCR preprocessing pipeline on one image.

    Steps: load, grayscale, OSD-based orientation correction (best effort),
    non-local-means denoising, CLAHE contrast enhancement, Otsu
    binarization, and a 2x2 morphological opening.

    Args:
        image_path: Path handed to cv2.imread.
        output_path: Optional path; when truthy the final image is written
            there with cv2.imwrite.

    Returns:
        The final binarized image.

    Raises:
        ValueError: If the image cannot be read.
    """
    # 1. Load.
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"无法读取图像: {image_path}")

    # 2. Grayscale.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # 3. Orientation correction from OSD. This step is deliberately
    #    best-effort: any failure is reported and the pipeline continues
    #    with the uncorrected image.
    try:
        osd = pytesseract.image_to_osd(gray, output_type=pytesseract.Output.DICT)
        angle = osd['rotate']
        if abs(angle) > 0.5:
            # Right-angle turns go through cv2.rotate so the canvas swaps
            # width and height; warping onto a fixed (w, h) canvas would
            # crop a non-square page on a 90 or 270 degree turn.
            quarter_turns = {
                90: cv2.ROTATE_90_CLOCKWISE,
                180: cv2.ROTATE_180,
                270: cv2.ROTATE_90_COUNTERCLOCKWISE,
            }
            rotate_code = quarter_turns.get(angle % 360)
            if rotate_code is not None:
                gray = cv2.rotate(gray, rotate_code)
            else:
                h, w = gray.shape
                M = cv2.getRotationMatrix2D((w//2, h//2), -angle, 1.0)
                gray = cv2.warpAffine(gray, M, (w, h), flags=cv2.INTER_CUBIC,
                                      borderMode=cv2.BORDER_REPLICATE)
            print(f"校正倾斜: {angle}°")
    except Exception as e:
        print(f"OSD 检测失败: {e}")

    # 4. Denoise.
    denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)

    # 5. Contrast enhancement.
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(denoised)

    # 6. Binarize.
    _, binary = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # 7. Morphological cleanup of small specks.
    # NOTE(review): opening removes small *bright* specks; if the noise to
    # remove is dark specks on a light page, MORPH_CLOSE may be what is
    # wanted here -- confirm against sample images.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    cleaned = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

    # 8. Optionally persist the result.
    if output_path:
        cv2.imwrite(output_path, cleaned)

    return cleaned

def compare_ocr(image_path):
    """Print OCR output for an image before and after preprocessing.

    The original image is recognized as-is, then the image is run through
    preprocess_for_ocr (which also writes 'preprocessed.png') and recognized
    again. The first 200 characters of each result are printed.

    Args:
        image_path: Path of the image to compare.
    """
    import pytesseract
    from PIL import Image

    # OCR on the untouched image; the context manager releases the file
    # handle once recognition is done.
    with Image.open(image_path) as img:
        text_original = pytesseract.image_to_string(img, lang='chi_sim+eng')

    # OCR on the preprocessed image.
    preprocessed = preprocess_for_ocr(image_path, 'preprocessed.png')
    text_processed = pytesseract.image_to_string(
        Image.fromarray(preprocessed), lang='chi_sim+eng'
    )

    print("=== 原图识别 ===")
    print(text_original[:200])
    print("\n=== 预处理后识别 ===")
    print(text_processed[:200])

# Script entry point: run the before/after OCR comparison on test.png.
if __name__ == '__main__':
    compare_ocr('test.png')

4.10 特殊场景预处理

4.10.1 手机拍照文档

def preprocess_phone_photo(image_path):
    """Preprocess a document photographed with a phone.

    Steps: shrink images taller than 3000 px to a height of 2000 px,
    convert to grayscale, apply CLAHE (clipLimit=3.0), then binarize with
    Gaussian adaptive thresholding (blockSize=15, C=8).

    Args:
        image_path: Path handed to cv2.imread.

    Returns:
        The binarized image.

    Raises:
        ValueError: If the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"无法读取图像: {image_path}")

    # 1. Downscale (phone photos are usually very large).
    h = img.shape[0]
    if h > 3000:
        scale = 2000 / h
        img = cv2.resize(img, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)

    # 2. Grayscale.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # 3. CLAHE contrast enhancement (phone photos often contain shadows).
    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(gray)

    # 4. Adaptive binarization for uneven illumination.
    binary = cv2.adaptiveThreshold(
        enhanced, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, 8
    )

    return binary

4.10.2 老旧扫描件

def preprocess_old_scan(image_path):
    """Preprocess an old, noisy scan.

    Steps: grayscale, strong non-local-means denoising (h=15), a one-pass
    2x2 stroke-thickening step to reconnect broken characters, then Otsu
    binarization.

    Args:
        image_path: Path handed to cv2.imread.

    Returns:
        The binarized image.

    Raises:
        ValueError: If the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"无法读取图像: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # 1. Denoise (old scans are noisy).
    denoised = cv2.fastNlMeansDenoising(gray, None, 15, 7, 21)

    # 2. Reconnect broken strokes. Erosion is a local minimum, so it grows
    #    dark regions; on dark text over a light page that thickens the
    #    strokes. Dilation would grow the light background into the text and
    #    thin it further.
    #    Assumes dark text on a light background -- confirm for inverted scans.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    thickened = cv2.erode(denoised, kernel, iterations=1)

    # 3. Binarize.
    _, binary = cv2.threshold(thickened, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    return binary

4.11 预处理参数调优

参数	范围	调优建议
`blockSize` (自适应阈值)	3-51 (奇数)	11-21，越大越平滑
`C` (自适应阈值)	-10 到 10	2-8，越大阈值越低：噪点更少，但细笔画可能丢失
`clipLimit` (CLAHE)	1-5	2-3，越大对比度越强
`h` (NLM 去噪)	3-20	10-15，越大去噪越强
滤波核大小	3-15 (奇数)	3-5，越大越模糊

4.12 本章小结

步骤	方法	推荐
灰度化	`cvtColor`	所有场景
二值化	Otsu / 自适应	根据光照选择
去噪	NLM / 中值	根据噪声类型选择
倾斜校正	OSD / 霍夫变换	OSD 校正整页方向（90° 倍数），霍夫变换校正小角度倾斜
缩放	`resize`	目标文字 20-40px
对比度	CLAHE	低对比度场景

4.13 扩展阅读

上一章: 基本使用 | 下一章: 多语言支持