强曰为道
与天地相似,故不违。知周乎万物,而道济天下,故不过。旁行而不流,乐天知命,故不忧。
文档目录

Tesseract OCR 完整教程 / 第 8 章:Python 集成

第 8 章:Python 集成

使用 Python 高效集成 Tesseract OCR。

8.1 环境配置

# 安装 Python 依赖
pip install pytesseract Pillow opencv-python numpy pandas

# 验证
python3 -c "import pytesseract; print(pytesseract.get_tesseract_version())"

8.2 pytesseract 基础

8.2.1 核心 API

import pytesseract
from PIL import Image

img = Image.open('test.png')

# Every recognition call below passes the same language string. Without
# `lang`, pytesseract adds no `-l` option and Tesseract falls back to its
# default language, so the outputs would not match the text from step 1.

# 1. Recognize as plain text.
text = pytesseract.image_to_string(img, lang='chi_sim+eng')

# 2. Recognize as structured data (positions, confidences) in a dict of lists.
data = pytesseract.image_to_data(img, lang='chi_sim+eng',
                                 output_type=pytesseract.Output.DICT)

# 3. Recognize as hOCR (returned as bytes).
hocr = pytesseract.image_to_pdf_or_hocr(img, lang='chi_sim+eng', extension='hocr')

# 4. Recognize as a PDF (returned as bytes; write it with open(..., 'wb')).
pdf = pytesseract.image_to_pdf_or_hocr(img, lang='chi_sim+eng', extension='pdf')

# 5. Orientation and script detection (OSD).
osd = pytesseract.image_to_osd(img, output_type=pytesseract.Output.DICT)

8.2.2 输出类型

| 函数 | 输出 | 用途 |
| --- | --- | --- |
| image_to_string | 字符串 | 简单文本提取 |
| image_to_data | 字典/DataFrame | 位置、置信度 |
| image_to_boxes | 字符框 | 字符级位置 |
| image_to_pdf_or_hocr | PDF/hOCR | 生成文件 |
| image_to_osd | OSD 信息 | 方向、脚本检测 |

8.3 结果解析

8.3.1 image_to_data 详解

import pytesseract
from PIL import Image

img = Image.open('test.png')

# Fetch detailed data; pass the languages explicitly so Chinese text is not
# recognized with Tesseract's default language.
data = pytesseract.image_to_data(img, lang='chi_sim+eng',
                                 output_type=pytesseract.Output.DICT)

# Field reference (each value is a list with one entry per output row):
# data['level']     - hierarchy level: 1=page, 2=block, 3=para, 4=line, 5=word
# data['page_num']  - page number
# data['block_num'] - block number
# data['par_num']   - paragraph number
# data['line_num']  - line number
# data['word_num']  - word number
# data['left']      - left offset
# data['top']       - top offset
# data['width']     - width
# data['height']    - height
# data['conf']      - confidence; -1 on rows that are not words
#                     (page/block/paragraph/line rows carry no confidence)
# data['text']      - recognized text

# Walk the rows in parallel and print confident, non-empty entries.
for raw_text, conf, left, top in zip(data['text'], data['conf'],
                                     data['left'], data['top']):
    if int(conf) > 60:  # confidence filter
        text = raw_text.strip()
        if text:
            print(f"文本: {text:20s} 置信度: {conf:6.1f} "
                  f"位置: ({left}, {top})")

8.3.2 结果转换为 DataFrame

import pytesseract
import pandas as pd
from PIL import Image

def ocr_to_dataframe(image_path, lang='chi_sim+eng'):
    """Run OCR on an image file and return the usable rows as a DataFrame.

    Args:
        image_path: Path of the image to open.
        lang: Language string forwarded to ``pytesseract.image_to_data``.

    Returns:
        A DataFrame built from the ``image_to_data`` dict, with ``conf`` cast
        to int and ``text`` cast to str, keeping only rows whose confidence is
        greater than 0 and whose text is not blank.
    """
    image = Image.open(image_path)
    raw = pytesseract.image_to_data(image, lang=lang, output_type=pytesseract.Output.DICT)

    frame = pd.DataFrame(raw)

    # Normalize the two columns the filters below rely on.
    frame['conf'] = frame['conf'].astype(int)
    frame['text'] = frame['text'].astype(str)

    # Both conditions must hold for a row to be kept.
    has_confidence = frame['conf'] > 0
    has_text = frame['text'].str.strip() != ''
    return frame[has_confidence & has_text]

# Usage: OCR one image and print the main columns of the result.
df = ocr_to_dataframe('test.png')
print(df[['text', 'conf', 'left', 'top']].to_string())

# Save as CSV; the 'utf-8-sig' codec writes a UTF-8 BOM at the start of the file.
df.to_csv('ocr_results.csv', index=False, encoding='utf-8-sig')

8.3.3 按层级组织结果

def ocr_structured(image_path, lang='chi_sim+eng', data=None):
    """Group OCR words into a block -> line -> word hierarchy.

    Only word rows (level 5) are filtered by confidence and text. Block and
    line rows carry a confidence of -1 and empty text, so filtering them the
    same way would discard the whole structure.

    Args:
        image_path: Path of the image to recognize. Not used when ``data``
            is supplied.
        lang: Language string forwarded to ``pytesseract.image_to_data``.
        data: Optional precomputed result of ``pytesseract.image_to_data``
            with ``output_type=Output.DICT``. When given, OCR is not run.

    Returns:
        ``{'blocks': {block_num: {'bbox': (left, top, right, bottom),
        'lines': {line_index: {'words': [...], 'text': str}}}}}``.
        ``line_index`` counts lines within the block starting at 1, because
        Tesseract's ``line_num`` restarts in every paragraph and would
        otherwise collide. Each word is ``{'text', 'conf', 'bbox'}``; words
        with confidence <= 0 or blank text are dropped.

    Note:
        Blocks are keyed by ``block_num`` only, so this assumes a single-page
        input -- TODO confirm behavior wanted for multi-page images.
    """
    if data is None:
        import pytesseract
        from PIL import Image

        with Image.open(image_path) as img:
            data = pytesseract.image_to_data(
                img, lang=lang, output_type=pytesseract.Output.DICT)

    blocks = {}
    # (block_num, par_num, line_num) -> sequential line index inside the block.
    line_index_of = {}

    for i, level in enumerate(data['level']):
        block_id = data['block_num'][i]
        left, top = data['left'][i], data['top'][i]
        bbox = (left, top, left + data['width'][i], top + data['height'][i])
        line_key = (block_id, data['par_num'][i], data['line_num'][i])

        if level == 2:  # Block
            if block_id not in blocks:
                blocks[block_id] = {'lines': {}, 'bbox': bbox}

        elif level == 4:  # Line
            block = blocks.get(block_id)
            if block is not None and line_key not in line_index_of:
                index = len(block['lines']) + 1
                line_index_of[line_key] = index
                block['lines'][index] = {'words': [], 'text': ''}

        elif level == 5:  # Word
            conf = int(data['conf'][i])
            text = data['text'][i].strip()
            if conf <= 0 or not text:
                continue

            index = line_index_of.get(line_key)
            if index is None:
                continue

            line = blocks[block_id]['lines'][index]
            line['words'].append({'text': text, 'conf': conf, 'bbox': bbox})
            line['text'] = f"{line['text']} {text}" if line['text'] else text

    return {'blocks': blocks}

# Usage: print the recognized text grouped by block, then by line.
result = ocr_structured('test.png')
for block_id, block in result['blocks'].items():
    print(f"\nBlock {block_id}:")
    for line_id, line in block['lines'].items():
        # strip() removes surrounding whitespace from the stored line text.
        print(f"  Line {line_id}: {line['text'].strip()}")

8.4 置信度过滤

8.4.1 基本过滤

import pytesseract
from PIL import Image

def ocr_with_confidence(image_path, min_conf=60, lang='chi_sim+eng'):
    """Run OCR and keep only entries at or above a confidence threshold.

    Args:
        image_path: Path of the image to open.
        min_conf: Minimum confidence (inclusive) an entry needs to be kept.
        lang: Language string forwarded to ``pytesseract.image_to_data``.

    Returns:
        A list of dicts with keys ``text``, ``confidence``, ``bbox`` (a dict
        with ``left``/``top``/``width``/``height``) and ``level``, in the
        order the rows were reported. Blank-text rows are skipped.
    """
    image = Image.open(image_path)
    ocr = pytesseract.image_to_data(image, lang=lang, output_type=pytesseract.Output.DICT)

    kept = []
    for idx, raw in enumerate(ocr['text']):
        score = int(ocr['conf'][idx])
        word = raw.strip()

        # Guard clause: drop low-confidence or empty rows.
        if score < min_conf or not word:
            continue

        box = {side: ocr[side][idx] for side in ('left', 'top', 'width', 'height')}
        kept.append({
            'text': word,
            'confidence': score,
            'bbox': box,
            'level': ocr['level'][idx],
        })

    return kept

# Usage: keep entries with confidence >= 70 and print each with its score.
results = ocr_with_confidence('test.png', min_conf=70)
for r in results:
    print(f"{r['text']:20s} conf={r['confidence']:.0f}")

8.4.2 置信度统计分析

import numpy as np

def confidence_analysis(image_path, lang='chi_sim+eng'):
    """Print summary statistics and a histogram of OCR confidences.

    Only confidences greater than 0 are analyzed. The function prints its
    report and returns ``None``; when no row qualifies it prints a notice
    and returns early.

    Args:
        image_path: Path of the image to open.
        lang: Language string forwarded to ``pytesseract.image_to_data``.
    """
    import pytesseract
    from PIL import Image

    # Close the file handle as soon as recognition is done.
    with Image.open(image_path) as img:
        data = pytesseract.image_to_data(img, lang=lang, output_type=pytesseract.Output.DICT)

    confs = [int(c) for c in data['conf'] if int(c) > 0]

    if not confs:
        print("无有效结果")
        return

    print(f"样本数: {len(confs)}")
    print(f"平均置信度: {np.mean(confs):.1f}")
    print(f"中位数: {np.median(confs):.1f}")
    print(f"标准差: {np.std(confs):.1f}")
    print(f"最低: {min(confs)}")
    print(f"最高: {max(confs)}")

    # Distribution. Buckets are half-open [low, high) except the top one,
    # which also includes 100; otherwise a confidence of exactly 100 would
    # fall into no bucket and the percentages would not add up to 100%.
    ranges = [(90, 100), (80, 90), (70, 80), (60, 70), (0, 60)]
    total = len(confs)
    for low, high in ranges:
        is_top = high == 100
        count = sum(1 for c in confs if low <= c < high or (is_top and c == high))
        print(f"  {low}-{high}: {count} ({count/total*100:.1f}%)")

8.4.3 自适应置信度阈值

def adaptive_confidence_threshold(image_path, lang='chi_sim+eng'):
    """Filter OCR entries with a threshold derived from the image itself.

    The threshold is ``mean - standard deviation`` of the confidences that
    are greater than 0, but never lower than 50. The chosen threshold is
    printed.

    Args:
        image_path: Path of the image to open.
        lang: Language string forwarded to ``pytesseract.image_to_data``.

    Returns:
        A list of ``{'text', 'confidence'}`` dicts for non-blank entries at or
        above the threshold, or an empty list when no confidence is positive.
    """
    import pytesseract
    from PIL import Image
    import numpy as np

    image = Image.open(image_path)
    ocr = pytesseract.image_to_data(image, lang=lang, output_type=pytesseract.Output.DICT)

    scores = [int(c) for c in ocr['conf']]
    positive = [s for s in scores if s > 0]
    if not positive:
        return []

    # Mean-minus-standard-deviation rule with a floor of 50.
    threshold = max(np.mean(positive) - np.std(positive), 50)
    print(f"自适应阈值: {threshold:.1f}")

    selected = []
    for raw, score in zip(ocr['text'], scores):
        word = raw.strip()
        if word and score >= threshold:
            selected.append({'text': word, 'confidence': score})
    return selected

8.5 批量处理

8.5.1 简单批量处理

import os
import pytesseract
from PIL import Image

def batch_ocr(input_dir, output_dir, lang='chi_sim+eng'):
    """OCR every image in a directory and write one ``.txt`` file per image.

    Files are selected by extension, ignoring case, so ``SCAN.PNG`` or
    ``photo.JPG`` are not silently skipped. A failure on one file is
    reported and does not stop the batch.

    Args:
        input_dir: Directory that is listed for image files (not recursive).
        output_dir: Directory for the text files; created if missing.
        lang: Language string forwarded to ``pytesseract.image_to_string``.
    """
    image_exts = ('.png', '.jpg', '.jpeg', '.tif', '.tiff')
    os.makedirs(output_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort for stable output.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.lower().endswith(image_exts):
            continue

        input_path = os.path.join(input_dir, filename)
        output_path = os.path.join(output_dir, filename.rsplit('.', 1)[0] + '.txt')

        try:
            # The context manager closes the file even for multi-frame
            # images, whose handle otherwise stays open after reading.
            with Image.open(input_path) as img:
                text = pytesseract.image_to_string(img, lang=lang)

            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(text)

            print(f"✓ {filename}")
        except Exception as e:
            # Best effort: report the failing file and continue.
            print(f"✗ {filename}: {e}")

# Example run: read images from ./images and write the text files to ./texts.
batch_ocr('./images', './texts')

8.5.2 并行批量处理

import os
import pytesseract
from PIL import Image
from concurrent.futures import ProcessPoolExecutor, as_completed

def process_single(args):
    """OCR one image and write the text to a file.

    Takes a single tuple so it can be submitted to an executor as one
    argument.

    Args:
        args: ``(input_path, output_path, lang)``.

    Returns:
        ``(True, input_path, number_of_characters)`` on success, or
        ``(False, input_path, error_message)`` if anything raised.
    """
    input_path, output_path, lang = args
    try:
        # Close the image explicitly; multi-frame files (e.g. TIFF) keep
        # their handle open until closed.
        with Image.open(input_path) as img:
            text = pytesseract.image_to_string(img, lang=lang)

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(text)
    except Exception as e:
        # Errors are returned rather than raised so one bad file does not
        # abort the caller's batch.
        return False, input_path, str(e)

    return True, input_path, len(text)

def parallel_batch_ocr(input_dir, output_dir, lang='chi_sim+eng', workers=4):
    """OCR every image in a directory using a pool of worker processes.

    Files are selected by extension, ignoring case. Each file is handed to
    ``process_single``; progress and a final success/failure summary are
    printed.

    Args:
        input_dir: Directory that is listed for image files (not recursive).
        output_dir: Directory for the text files; created if missing.
        lang: Language string passed through to each task.
        workers: ``max_workers`` of the ``ProcessPoolExecutor``.
    """
    image_exts = ('.png', '.jpg', '.jpeg', '.tif', '.tiff')
    os.makedirs(output_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort for stable task order.
    tasks = [
        (
            os.path.join(input_dir, filename),
            os.path.join(output_dir, filename.rsplit('.', 1)[0] + '.txt'),
            lang,
        )
        for filename in sorted(os.listdir(input_dir))
        if filename.lower().endswith(image_exts)
    ]

    print(f"待处理: {len(tasks)} 个文件,使用 {workers} 个进程")

    success = 0
    fail = 0

    with ProcessPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(process_single, task) for task in tasks]

        # Report results in completion order, not submission order.
        for future in as_completed(futures):
            ok, path, info = future.result()
            if ok:
                success += 1
                print(f"✓ {os.path.basename(path)} ({info} 字符)")
            else:
                fail += 1
                print(f"✗ {os.path.basename(path)}: {info}")

    print(f"\n完成: 成功 {success}, 失败 {fail}")

# Worker processes may re-import this module (the "spawn" start method does
# so). Without this guard each worker would start the whole batch again.
if __name__ == '__main__':
    parallel_batch_ocr('./images', './texts', workers=4)

8.6 OpenCV 集成

8.6.1 OpenCV 预处理 + pytesseract

import cv2
import numpy as np
import pytesseract

def ocr_with_preprocessing(image_path, lang='chi_sim+eng'):
    """Clean up an image with OpenCV, then recognize it with Tesseract.

    Pipeline: grayscale -> non-local-means denoising -> CLAHE contrast
    enhancement -> Otsu binarization -> morphological closing -> OCR.

    Args:
        image_path: Path handed to ``cv2.imread``.
        lang: Language string forwarded to ``pytesseract.image_to_string``.

    Returns:
        The recognized text exactly as returned by pytesseract.
    """
    # Load and convert to a single-channel image.
    grayscale = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2GRAY)

    # Step 1: denoise.
    smoothed = cv2.fastNlMeansDenoising(grayscale, None, 10, 7, 21)

    # Step 2: local contrast enhancement.
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    contrasted = equalizer.apply(smoothed)

    # Step 3: binarize; with THRESH_OTSU the threshold argument 0 is a placeholder.
    otsu_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    bw = cv2.threshold(contrasted, 0, 255, otsu_flags)[1]

    # Step 4: closing with a 2x2 rectangular kernel.
    closing_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    cleaned = cv2.morphologyEx(bw, cv2.MORPH_CLOSE, closing_kernel)

    # Hand the result to Tesseract as a PIL image.
    from PIL import Image
    return pytesseract.image_to_string(Image.fromarray(cleaned), lang=lang)

# Usage: preprocess and recognize a scanned page, then print the text.
text = ocr_with_preprocessing('scan.png')
print(text)

8.6.2 区域提取 OCR

import cv2
import pytesseract

def region_ocr(image_path, region, lang='chi_sim+eng'):
    """Recognize the text inside one rectangular region of an image.

    Args:
        image_path: Path handed to ``cv2.imread``.
        region: ``(x, y, width, height)`` of the rectangle, in pixels.
        lang: Language string forwarded to ``pytesseract.image_to_string``.

    Returns:
        The recognized text with surrounding whitespace removed.
    """
    full_image = cv2.imread(image_path)

    left, top, width, height = region
    # NumPy images are indexed rows first, i.e. [y, x].
    crop = full_image[top:top + height, left:left + width]

    # Grayscale + Otsu binarization before recognition.
    crop_gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)
    crop_bw = cv2.threshold(crop_gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    recognized = pytesseract.image_to_string(crop_bw, lang=lang, config='--psm 6')
    return recognized.strip()

# Recognize the top-right area of the page.
# The image is read once here only to obtain its height and width.
h, w = cv2.imread('document.png').shape[:2]
# Region is (x, y, width, height): right half of the width, top quarter of the height.
top_right = region_ocr('document.png', (w//2, 0, w//2, h//4))
print(f"右上角文字: {top_right}")

8.6.3 可视化结果

import cv2
import pytesseract
from PIL import Image

def visualize_ocr(image_path, output_path='ocr_result.png', lang=None):
    """Draw OCR bounding boxes and confidences onto a copy of the image.

    Entries with confidence above 60 and non-blank text get a rectangle and
    a ``NN%`` label. The color is (0, 255, 0) when confidence is above 80,
    otherwise (0, 255, 255) -- green and yellow in OpenCV's BGR order.

    Args:
        image_path: Path of the image to annotate.
        output_path: Where the annotated image is written.
        lang: Optional language string for ``pytesseract.image_to_data``.
            ``None`` keeps pytesseract's default, as before.
    """
    img = cv2.imread(image_path)

    # Close the PIL handle once recognition is done.
    with Image.open(image_path) as pil_img:
        data = pytesseract.image_to_data(pil_img, lang=lang,
                                         output_type=pytesseract.Output.DICT)

    rows = zip(data['text'], data['conf'], data['left'],
               data['top'], data['width'], data['height'])
    for raw_text, raw_conf, x, y, w, h in rows:
        conf = int(raw_conf)
        if conf <= 60 or not raw_text.strip():
            continue

        # Draw the bounding box.
        color = (0, 255, 0) if conf > 80 else (0, 255, 255)
        cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)

        # Draw the confidence label. putText anchors at the text's
        # bottom-left corner, so keep the baseline at least 12 px from the
        # top edge; otherwise labels of boxes near the top are off-canvas.
        label_y = max(y - 5, 12)
        cv2.putText(img, f"{conf}%", (x, label_y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.4, color, 1)

    cv2.imwrite(output_path, img)
    print(f"可视化结果已保存: {output_path}")

# Example run: annotate test.png and write the result to the default output path.
visualize_ocr('test.png')

8.7 高级用法

8.7.1 自定义配置

import pytesseract
from PIL import Image

img = Image.open('test.png')

# Combined configuration: engine mode, page segmentation mode and a
# character whitelist passed as one command-line style string.
# NOTE(review): whitelist support with the LSTM engine depends on the
# Tesseract version -- confirm against the installed release.
custom_config = r'--oem 1 --psm 6 -c tessedit_char_whitelist=0123456789'
text = pytesseract.image_to_string(img, config=custom_config)

# Commonly used configurations, keyed by use case.
configs = {
    '数字': '--psm 7 -c tessedit_char_whitelist=0123456789',  # digits
    '单行英文': '--psm 7',  # single line of English
    '表格': '--psm 6',  # table
    '稀疏文本': '--psm 11',  # sparse text
    '垂直文本': '--psm 5',  # vertical text
}

8.7.2 多引擎结果合并

import pytesseract
from PIL import Image

def multi_config_ocr(image_path, configs, lang='chi_sim+eng'):
    """Run OCR with several configurations and pick the longest result.

    Failed configurations are recorded in the returned mapping but never
    compete for "best"; otherwise a long error message could be returned as
    the recognized text.

    Args:
        image_path: Path of the image to open.
        configs: Mapping of name -> Tesseract config string.
        lang: Language string forwarded to ``pytesseract.image_to_string``.

    Returns:
        ``(best_name, best_text, results)``. ``results`` maps every config
        name to its stripped text, or to ``"Error: ..."`` if it failed.
        When no configuration succeeded (or ``configs`` is empty) the first
        two items are ``None`` and ``''``.
    """
    results = {}
    succeeded = {}

    with Image.open(image_path) as img:
        for name, config in configs.items():
            try:
                text = pytesseract.image_to_string(img, lang=lang, config=config)
            except Exception as e:
                # Keep going: one bad config must not hide the others.
                results[name] = f"Error: {e}"
            else:
                results[name] = succeeded[name] = text.strip()

    if not succeeded:
        return None, '', results

    # Simple strategy: the longest text wins; ties go to the earliest config.
    best_name = max(succeeded, key=lambda name: len(succeeded[name]))
    return best_name, succeeded[best_name], results

# Three page segmentation modes to compare on the same image.
configs = {
    'psm3': '--psm 3',
    'psm4': '--psm 4',
    'psm6': '--psm 6',
}

# Run all configurations and report the one selected as best.
best_name, best_text, all_results = multi_config_ocr('test.png', configs)
print(f"最佳配置: {best_name}")
print(f"识别结果:\n{best_text}")

8.8 异常处理

import pytesseract
from PIL import Image

def safe_ocr(image_path, lang='chi_sim+eng', retries=3):
    """Run OCR without raising, retrying on Tesseract errors.

    Args:
        image_path: Path of the image to open.
        lang: Language string forwarded to ``pytesseract.image_to_string``.
        retries: Maximum number of attempts.

    Returns:
        ``(text, None)`` on success, otherwise ``(None, error_message)``.
        Images narrower or shorter than 10 pixels are rejected. A
        ``TesseractError`` is retried after a one-second pause until the
        attempts are used up; every other exception ends the call at once.
    """
    import time

    final_attempt = retries - 1

    for attempt in range(retries):
        try:
            picture = Image.open(image_path)

            # Reject images too small to contain readable text.
            if min(picture.size[0], picture.size[1]) < 10:
                return None, "图片太小"

            recognized = pytesseract.image_to_string(picture, lang=lang)
            return recognized.strip(), None

        except pytesseract.TesseractNotFoundError:
            return None, "Tesseract 未安装"
        except pytesseract.TesseractError as exc:
            if attempt == final_attempt:
                return None, f"Tesseract 错误: {exc}"
            # Pause briefly, then let the loop try again.
            time.sleep(1)
        except Exception as exc:
            return None, f"未知错误: {exc}"

    # Reached only when the loop body never ran (retries <= 0).
    return None, "重试次数用完"

# Usage: safe_ocr returns (text, error); a truthy error means OCR failed.
text, error = safe_ocr('test.png')
if error:
    print(f"错误: {error}")
else:
    print(f"结果: {text}")

8.9 实用工具函数

import pytesseract
from PIL import Image
import os

def ocr_info():
    """Print the Tesseract version, available languages and executable path.

    The path line reports the command pytesseract is configured to run
    (``pytesseract.pytesseract.tesseract_cmd``), resolved through ``PATH``
    when possible, instead of repeating the version.
    """
    import shutil

    command = pytesseract.pytesseract.tesseract_cmd
    print(f"版本: {pytesseract.get_tesseract_version()}")
    print(f"语言: {pytesseract.get_languages()}")
    # shutil.which returns None if the command cannot be found; fall back
    # to the configured value so something useful is still shown.
    print(f"路径: {shutil.which(command) or command}")

def quick_ocr(image_path, lang='chi_sim+eng'):
    """Open an image file and return its recognized text.

    Args:
        image_path: Path of the image to open.
        lang: Language string forwarded to ``pytesseract.image_to_string``.

    Returns:
        The text exactly as returned by pytesseract (not stripped).
    """
    image = Image.open(image_path)
    recognized = pytesseract.image_to_string(image, lang=lang)
    return recognized

def word_count(text):
    """Count Chinese characters and English words in a string.

    Chinese characters are code points in U+4E00..U+9FFF. An English word
    is a run of ASCII letters, optionally joined by internal apostrophes or
    hyphens (``it's``, ``well-known``). Words are found anywhere in the
    text, so ``Python集成`` counts one word although there is no space.
    Punctuation-only or numeric tokens such as ``...`` or ``123`` are not
    counted as words.

    Args:
        text: The string to analyze.

    Returns:
        ``{'中文字符': <int>, '英文单词': <int>}``.
    """
    import re

    cn_chars = sum(1 for c in text if '\u4e00' <= c <= '\u9fff')
    en_words = len(re.findall(r"[A-Za-z]+(?:['-][A-Za-z]+)*", text))
    return {'中文字符': cn_chars, '英文单词': en_words}

8.10 本章小结

| 要点 | 说明 |
| --- | --- |
| 核心库 | pytesseract + Pillow |
| 结果解析 | image_to_data 获取详细信息 |
| 置信度过滤 | conf > 60 一般可用 |
| 批量处理 | ProcessPoolExecutor 并行 |
| 预处理 | OpenCV + pytesseract 组合 |

8.11 扩展阅读


上一章: PDF 处理 | 下一章: 版面分析