Tesseract OCR 完整教程 / 第 4 章:图像预处理
第 4 章:图像预处理
掌握图像预处理技术,显著提升 OCR 识别精度。
4.1 为什么需要预处理
原始图像 → 预处理 → OCR 识别 → 输出文本
   ↑         ↑
 噪声多    二值化、去噪、校正
 倾斜      → 精度提升 20-50%
 低对比度
预处理效果对比:
| 预处理步骤 | 精度提升 | 适用场景 |
|---|---|---|
| 灰度化 | +5% | 所有场景 |
| 二值化 | +10-20% | 文档扫描件 |
| 去噪 | +5-15% | 噪声明显的图片 |
| 倾斜校正 | +10-30% | 倾斜文档 |
| 缩放 | +5-10% | 低分辨率图片 |
| 边缘检测 | +10% | 复杂背景 |
4.2 环境准备
# Install the Python packages used by the snippets in this chapter
pip install opencv-python numpy Pillow pytesseract
# Verify the installation by importing OpenCV and printing its version
python3 -c "import cv2; print(cv2.__version__)"
4.3 灰度化
import cv2

# Load the image; cv2.imread returns None (it does not raise) when the file
# is missing or cannot be decoded, so check before using the result.
img = cv2.imread('image.png')
if img is None:
    raise ValueError("无法读取图像: image.png")

# Method 1: cvtColor, converting the BGR image to a single gray channel.
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Method 2: single-channel extraction (simple cases only). Index 0 is the
# blue channel because the conversion above treats the data as BGR. It is
# stored under its own name so it no longer overwrites the method 1 result
# that is saved below.
gray_blue = img[:, :, 0]

# Save the grayscale image produced by method 1.
cv2.imwrite('gray.png', gray)
业务场景:所有 OCR 任务的第一步,将三通道图像转为单通道。
4.4 二值化
4.4.1 全局阈值(Global Thresholding)
import cv2

gray = cv2.imread('image.png', cv2.IMREAD_GRAYSCALE)

# Simple fixed threshold at 127, shown with four different threshold types.
_, binary = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)
_, binary_inv = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY_INV)
_, trunc = cv2.threshold(gray, 127, 255, cv2.THRESH_TRUNC)
_, tozero = cv2.threshold(gray, 127, 255, cv2.THRESH_TOZERO)

# Otsu automatic threshold (recommended). cv2.threshold returns both the
# threshold it used and the binarized image, so a single call provides the
# image and the value to report; there is no need to run Otsu twice.
thresh_val, binary_otsu = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

# Print the threshold chosen by Otsu.
print(f"Otsu 阈值: {thresh_val}")
4.4.2 自适应阈值(Adaptive Thresholding)
import cv2

gray = cv2.imread('image.png', cv2.IMREAD_GRAYSCALE)

# Mean adaptive threshold: block size 11, constant C = 2.
adaptive_mean = cv2.adaptiveThreshold(
    gray,
    maxValue=255,
    adaptiveMethod=cv2.ADAPTIVE_THRESH_MEAN_C,
    thresholdType=cv2.THRESH_BINARY,
    blockSize=11,
    C=2,
)

# Gaussian adaptive threshold (recommended): same block size and constant.
adaptive_gauss = cv2.adaptiveThreshold(
    gray,
    maxValue=255,
    adaptiveMethod=cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
    thresholdType=cv2.THRESH_BINARY,
    blockSize=11,
    C=2,
)
二值化方法对比:
| 方法 | 适用场景 | 参数 | 推荐度 |
|---|---|---|---|
| 全局阈值 | 光照均匀 | threshold | ⭐⭐ |
| Otsu | 双峰分布 | 自动 | ⭐⭐⭐⭐ |
| 自适应均值 | 光照不均 | blockSize, C | ⭐⭐⭐ |
| 自适应高斯 | 光照不均 | blockSize, C | ⭐⭐⭐⭐ |
4.4.3 自动选择方法
import cv2
import numpy as np
def auto_threshold(gray):
    """Pick a binarization method from simple image statistics and apply it.

    The choice is driven by the standard deviation of the pixel values:

    * std > 50 and more than one histogram bin holding over 1% of the
      pixels: plain Otsu threshold.
    * std < 30 (low contrast): CLAHE enhancement followed by Otsu.
    * otherwise: Gaussian adaptive threshold (block size 11, C = 2).

    Args:
        gray: Grayscale image array. Presumably single-channel 8-bit, which
            is what the OpenCV calls below are normally given (not checked
            here).

    Returns:
        The binarized image produced by the selected method.
    """
    std_val = np.std(gray)

    if std_val > 50:
        # Coarse "bimodal" heuristic: count the histogram bins that each hold
        # more than 1% of all pixels. The histogram is only needed on this
        # branch, so it is computed lazily.
        hist = cv2.calcHist([gray], [0], None, [256], [0, 256])
        hist_norm = hist.ravel() / hist.sum()
        populated_bins = np.where(hist_norm > 0.01)[0]
        if len(populated_bins) > 1:
            _, otsu_binary = cv2.threshold(
                gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
            )
            print("使用 Otsu 二值化")
            return otsu_binary

    if std_val < 30:
        # Low contrast: enhance local contrast first, then let Otsu decide.
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)
        _, binary = cv2.threshold(
            enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
        )
        print("使用 CLAHE 增强 + Otsu")
        return binary

    # Remaining cases are treated as uneven lighting: threshold locally.
    binary = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )
    print("使用自适应阈值")
    return binary
4.5 去噪
4.5.1 常用去噪方法
import cv2

gray = cv2.imread('noisy.png', cv2.IMREAD_GRAYSCALE)

# Method 1: Gaussian blur with a 5x5 kernel.
denoised_gauss = cv2.GaussianBlur(gray, ksize=(5, 5), sigmaX=0)

# Method 2: median filter (recommended for salt-and-pepper noise).
denoised_median = cv2.medianBlur(gray, ksize=3)

# Method 3: bilateral filter (edge-preserving).
denoised_bilateral = cv2.bilateralFilter(gray, d=9, sigmaColor=75, sigmaSpace=75)

# Method 4: non-local means denoising (best quality, slower).
denoised_nlm = cv2.fastNlMeansDenoising(
    gray, None, h=10, templateWindowSize=7, searchWindowSize=21
)

# Method 5: morphological opening followed by closing (removes small specks).
kernel = cv2.getStructuringElement(shape=cv2.MORPH_RECT, ksize=(3, 3))
morph_open = cv2.morphologyEx(gray, op=cv2.MORPH_OPEN, kernel=kernel)
morph_close = cv2.morphologyEx(morph_open, op=cv2.MORPH_CLOSE, kernel=kernel)
4.5.2 去噪方法对比
| 方法 | 速度 | 效果 | 保留边缘 | 适用场景 |
|---|---|---|---|---|
| 高斯模糊 | ⭐⭐⭐⭐⭐ | ⭐⭐ | ⭐⭐ | 轻微噪声 |
| 中值滤波 | ⭐⭐⭐⭐ | ⭐⭐⭐ | ⭐⭐⭐ | 椒盐噪声 |
| 双边滤波 | ⭐⭐⭐ | ⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | 需保留边缘 |
| 非局部均值 | ⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐ | 高质量要求 |
| 形态学 | ⭐⭐⭐⭐ | ⭐⭐⭐ | ⭐⭐⭐ | 小噪点 |
4.5.3 组合去噪流程
import cv2
def denoise_pipeline(gray):
    """Run a three-stage denoising chain on a grayscale image.

    Stages, in order: 3x3 median filter, morphological opening with a 2x2
    rectangular kernel, and a light 3x3 Gaussian blur.

    Args:
        gray: Grayscale image array to clean up.

    Returns:
        The image after all three stages.
    """
    # Stage 1: median filter against salt-and-pepper noise.
    despeckled = cv2.medianBlur(gray, ksize=3)

    # Stage 2: morphological opening to drop small specks.
    small_rect = cv2.getStructuringElement(cv2.MORPH_RECT, ksize=(2, 2))
    opened = cv2.morphologyEx(despeckled, cv2.MORPH_OPEN, kernel=small_rect)

    # Stage 3: slight Gaussian smoothing of the result.
    return cv2.GaussianBlur(opened, ksize=(3, 3), sigmaX=0)
4.6 倾斜校正
4.6.1 基于霍夫变换
import cv2
import numpy as np
def deskew_hough(image_path):
    """Deskew an image using the median angle of detected Hough segments.

    Args:
        image_path: Path of the image file to load with cv2.imread.

    Returns:
        The rotated image, or the image as loaded when no line segment (or
        no segment flatter than 45 degrees) is found.

    Raises:
        ValueError: If the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None instead of raising; fail with a clear
        # message, matching preprocess_for_ocr.
        raise ValueError(f"无法读取图像: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Edge detection, then probabilistic Hough transform to find segments.
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
    lines = cv2.HoughLinesP(edges, 1, np.pi/180, 100, minLineLength=100, maxLineGap=10)
    if lines is None:
        return img

    # Each entry of `lines` wraps one (x1, y1, x2, y2) segment; compute all
    # segment angles at once instead of appending in a Python loop.
    segments = lines[:, 0]
    angles = np.degrees(
        np.arctan2(segments[:, 3] - segments[:, 1], segments[:, 2] - segments[:, 0])
    )
    # Keep only segments flatter than 45 degrees (filters out vertical lines).
    angles = angles[np.abs(angles) < 45]
    if angles.size == 0:
        return img

    # The median is less sensitive to a few stray segments than the mean.
    median_angle = np.median(angles)
    print(f"检测倾斜角度: {median_angle:.2f}°")

    # Rotate about the image centre, keeping the original canvas size.
    h, w = img.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, median_angle, 1.0)
    rotated = cv2.warpAffine(img, M, (w, h), flags=cv2.INTER_CUBIC,
                             borderMode=cv2.BORDER_REPLICATE)
    return rotated
4.6.2 基于最小面积矩形
import cv2
import numpy as np
def deskew_minarea(image_path):
    """Deskew an image using the minimum-area rectangle around its content.

    Args:
        image_path: Path of the image file to load with cv2.imread.

    Returns:
        The rotated image, or the image as loaded when no contour is found.

    Raises:
        ValueError: If the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None instead of raising; fail with a clear
        # message, matching preprocess_for_ocr.
        raise ValueError(f"无法读取图像: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Inverted Otsu binarization, so dark content becomes the foreground.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return img

    # Merge every contour into one point set and fit a single rectangle.
    all_points = np.vstack(contours)
    rect = cv2.minAreaRect(all_points)

    # The reported angle is a clockwise rotation in image coordinates, but its
    # range depends on the OpenCV version ([-90, 0) in older releases, (0, 90]
    # in newer ones, changed around 4.5.1 -- NOTE(review): confirm) and it may
    # refer to either side of the rectangle. Folding it into [-45, 45) gives
    # the smallest rotation that levels a side under both conventions.
    # The points are (x, y) contour coordinates, so no sign flip is needed:
    # a positive getRotationMatrix2D angle rotates counter-clockwise, which
    # undoes a clockwise skew of the same size.
    angle = (rect[2] + 45) % 90 - 45
    print(f"检测倾斜角度: {angle:.2f}°")

    # Rotate about the image centre, keeping the original canvas size.
    h, w = img.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(img, M, (w, h), flags=cv2.INTER_CUBIC,
                             borderMode=cv2.BORDER_REPLICATE)
    return rotated
4.6.3 基于 Tesseract OSD
import pytesseract
from PIL import Image
import cv2
import numpy as np
def deskew_osd(image_path):
    """Correct page orientation using Tesseract's OSD result.

    Args:
        image_path: Path of the image file.

    Returns:
        The image as a cv2 array, rotated clockwise by the angle OSD reports
        under 'rotate'; the image as loaded when that angle is 0.

    Raises:
        ValueError: If cv2 cannot read the image.
    """
    # Query orientation info. The context manager closes the file handle that
    # Image.open would otherwise leave open.
    with Image.open(image_path) as img:
        osd = pytesseract.image_to_osd(img, output_type=pytesseract.Output.DICT)
    angle = osd['rotate']
    print(f"OSD 检测角度: {angle}°")

    # Read the cv2 copy once and use it for both return paths.
    img_cv = cv2.imread(image_path)
    if img_cv is None:
        # cv2.imread returns None instead of raising on an unreadable file.
        raise ValueError(f"无法读取图像: {image_path}")
    if angle == 0:
        return img_cv

    # Quarter turns are done with cv2.rotate, which swaps width and height as
    # needed. Warping a 90/270 degree rotation onto the original (w, h)
    # canvas would crop any non-square image.
    quarter_turns = {
        90: cv2.ROTATE_90_CLOCKWISE,
        180: cv2.ROTATE_180,
        270: cv2.ROTATE_90_COUNTERCLOCKWISE,
    }
    rotate_code = quarter_turns.get(angle % 360)
    if rotate_code is not None:
        return cv2.rotate(img_cv, rotate_code)

    # Any other angle: rotate about the centre on the original canvas. The
    # negative sign makes the rotation clockwise.
    h, w = img_cv.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, -angle, 1.0)
    rotated = cv2.warpAffine(img_cv, M, (w, h), flags=cv2.INTER_CUBIC,
                             borderMode=cv2.BORDER_REPLICATE)
    return rotated
4.6.4 倾斜校正方法对比
| 方法 | 速度 | 精度 | 适用场景 |
|---|---|---|---|
| 霍夫变换 | ⭐⭐⭐ | ⭐⭐⭐ | 文档、表格 |
| 最小面积矩形 | ⭐⭐⭐⭐ | ⭐⭐⭐⭐ | 文字块 |
| Tesseract OSD | ⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | 通用场景 |
4.7 缩放与分辨率调整
import cv2
def resize_for_ocr(image_path, target_dpi=300):
    """Rescale an image whose height is outside a range suited to OCR.

    Images shorter than 500 px are enlarged 2x (cubic interpolation), images
    taller than 4000 px are halved (area interpolation), and everything else
    is returned as loaded.

    Args:
        image_path: Path of the image file to load with cv2.imread.
        target_dpi: Currently unused; the scaling decision is based on the
            pixel height only. Kept so existing callers keep working.

    Returns:
        The resized (or untouched) image.

    Raises:
        ValueError: If the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None instead of raising; fail with a clear
        # message, matching preprocess_for_ocr.
        raise ValueError(f"无法读取图像: {image_path}")
    h = img.shape[0]

    # Goal: text height of roughly 20-40 px. Standard printed text is about
    # 3 mm tall, which is about 35 px at 300 DPI. The real DPI is not known
    # here, so a simple height-based strategy is used instead.
    if h < 500:
        # Too small: enlarge.
        return cv2.resize(img, None, fx=2.0, fy=2.0, interpolation=cv2.INTER_CUBIC)
    if h > 4000:
        # Too large: shrink.
        return cv2.resize(img, None, fx=0.5, fy=0.5, interpolation=cv2.INTER_AREA)
    return img
分辨率建议:
| 输入类型 | 建议操作 | 目标高度 |
|---|---|---|
| 照片(<500px) | 放大 2x | 1000px+ |
| 扫描件(300 DPI) | 保持 | - |
| 高分辨率(>4000px) | 缩小 | 2000-4000px |
| 小字体 | 放大 3-4x | 文字 30-50px |
4.8 边框处理
import cv2
import numpy as np
def remove_border(image_path):
    """Crop an image to the bounding box of its largest external contour.

    Args:
        image_path: Path of the image file to load with cv2.imread.

    Returns:
        The cropped image, or the image as loaded when no contour is found.

    Raises:
        ValueError: If the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None instead of raising; fail with a clear
        # message, matching preprocess_for_ocr.
        raise ValueError(f"无法读取图像: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Inverted Otsu binarization, so dark content becomes the foreground.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return img

    # The largest contour by area is assumed to be the main content.
    largest = max(contours, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(largest)

    # Crop to that contour's bounding rectangle.
    return img[y:y+h, x:x+w]
4.9 完整预处理流水线
import cv2
import numpy as np
import pytesseract
from PIL import Image
def preprocess_for_ocr(image_path, output_path=None):
    """Run the full OCR preprocessing pipeline on one image file.

    Steps: load, grayscale, OSD-based orientation correction (best effort),
    non-local-means denoising, CLAHE contrast enhancement, Otsu binarization,
    and a small morphological opening.

    Args:
        image_path: Path of the image file to load with cv2.imread.
        output_path: Optional path; when truthy, the result is also written
            there with cv2.imwrite.

    Returns:
        The final binarized image.

    Raises:
        ValueError: If the image cannot be read.
    """
    # 1. Load the image.
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"无法读取图像: {image_path}")

    # 2. Convert to grayscale.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # 3. Orientation correction based on OSD. Deliberately best effort: when
    #    detection fails the pipeline continues with the uncorrected image.
    #    Only the OSD query sits inside the try body.
    try:
        osd = pytesseract.image_to_osd(gray, output_type=pytesseract.Output.DICT)
        angle = osd['rotate']
    except Exception as e:
        print(f"OSD 检测失败: {e}")
    else:
        if abs(angle) > 0.5:
            # Quarter turns go through cv2.rotate, which swaps width and
            # height as needed. Warping a 90/270 degree rotation onto the
            # original (w, h) canvas would crop any non-square image.
            quarter_turns = {
                90: cv2.ROTATE_90_CLOCKWISE,
                180: cv2.ROTATE_180,
                270: cv2.ROTATE_90_COUNTERCLOCKWISE,
            }
            rotate_code = quarter_turns.get(angle % 360)
            if rotate_code is not None:
                gray = cv2.rotate(gray, rotate_code)
            else:
                # Other angles: rotate clockwise about the centre on the
                # original canvas.
                h, w = gray.shape
                M = cv2.getRotationMatrix2D((w//2, h//2), -angle, 1.0)
                gray = cv2.warpAffine(gray, M, (w, h), flags=cv2.INTER_CUBIC,
                                      borderMode=cv2.BORDER_REPLICATE)
            print(f"校正倾斜: {angle}°")

    # 4. Denoise.
    denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)

    # 5. Contrast enhancement.
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(denoised)

    # 6. Binarize with Otsu.
    _, binary = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # 7. Morphological opening, intended to remove small specks.
    #    NOTE(review): opening acts on the white pixels, so on a page with
    #    dark text on a white background it removes small white specks rather
    #    than small black ones -- confirm this is the intended polarity.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    cleaned = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

    # 8. Optionally save the result.
    if output_path:
        cv2.imwrite(output_path, cleaned)
    return cleaned
def compare_ocr(image_path):
    """Print OCR output for an image before and after preprocessing.

    Runs Tesseract (languages 'chi_sim+eng') on the original file and on the
    result of preprocess_for_ocr, then prints the first 200 characters of
    each. As a side effect the preprocessed image is written to
    'preprocessed.png'.

    Args:
        image_path: Path of the image file to compare.
    """
    import pytesseract
    from PIL import Image

    # OCR on the original image. The context manager closes the file handle
    # that Image.open would otherwise leave open.
    with Image.open(image_path) as img:
        text_original = pytesseract.image_to_string(img, lang='chi_sim+eng')

    # OCR on the preprocessed image.
    preprocessed = preprocess_for_ocr(image_path, 'preprocessed.png')
    text_processed = pytesseract.image_to_string(
        Image.fromarray(preprocessed), lang='chi_sim+eng'
    )

    print("=== 原图识别 ===")
    print(text_original[:200])
    print("\n=== 预处理后识别 ===")
    print(text_processed[:200])
# Script entry point: run the before/after OCR comparison on 'test.png'.
if __name__ == '__main__':
    compare_ocr('test.png')
4.10 特殊场景预处理
4.10.1 手机拍照文档
def preprocess_phone_photo(image_path):
    """Preprocess a document photographed with a phone.

    Args:
        image_path: Path of the image file to load with cv2.imread.

    Returns:
        The adaptively binarized image.

    Raises:
        ValueError: If the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None instead of raising; without this check the
        # .shape access below fails with an unrelated AttributeError.
        raise ValueError(f"无法读取图像: {image_path}")

    # 1. Downscale (phone photos are usually very large): images taller than
    #    3000 px are scaled to a height of 2000 px.
    h, w = img.shape[:2]
    if h > 3000:
        scale = 2000 / h
        img = cv2.resize(img, None, fx=scale, fy=scale, interpolation=cv2.INTER_AREA)

    # 2. Convert to grayscale.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # 3. CLAHE contrast enhancement (phone photos often contain shadows).
    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(gray)

    # 4. Adaptive binarization to cope with uneven lighting.
    binary = cv2.adaptiveThreshold(
        enhanced, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, 8
    )
    return binary
4.10.2 老旧扫描件
def preprocess_old_scan(image_path):
    """Preprocess an old, noisy scanned document.

    Args:
        image_path: Path of the image file to load with cv2.imread.

    Returns:
        The Otsu-binarized image.

    Raises:
        ValueError: If the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None instead of raising; fail with a clear
        # message, matching preprocess_for_ocr.
        raise ValueError(f"无法读取图像: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # 1. Denoise (old scans are noisy), using a stronger h of 15.
    denoised = cv2.fastNlMeansDenoising(gray, None, 15, 7, 21)

    # 2. Repair broken characters. This assumes dark text on a light
    #    background: there, thickening the strokes means growing the dark
    #    regions, which is an erosion (local minimum). Dilation would grow the
    #    bright background and make the strokes thinner.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    thickened = cv2.erode(denoised, kernel, iterations=1)

    # 3. Binarize with Otsu.
    _, binary = cv2.threshold(thickened, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary
4.11 预处理参数调优
| 参数 | 范围 | 调优建议 |
|---|---|---|
| blockSize (自适应阈值) | 3-51 (奇数) | 11-21,越大越平滑 |
| C (自适应阈值) | -10 到 10 | 2-8,越大越敏感 |
| clipLimit (CLAHE) | 1-5 | 2-3,越大对比度越强 |
| h (NLM 去噪) | 3-20 | 10-15,越大去噪越强 |
| 滤波核大小 | 3-15 (奇数) | 3-5,越大越模糊 |
4.12 本章小结
| 步骤 | 方法 | 推荐 |
|---|---|---|
| 灰度化 | cvtColor | 所有场景 |
| 二值化 | Otsu / 自适应 | 根据光照选择 |
| 去噪 | NLM / 中值 | 根据噪声类型选择 |
| 倾斜校正 | OSD / 霍夫变换 | OSD 最准 |
| 缩放 | resize | 目标文字 20-40px |
| 对比度 | CLAHE | 低对比度场景 |
4.13 扩展阅读
上一章: 基本使用 | 下一章: 多语言支持