强曰为道
与天地相似,故不违;知周乎万物,而道济天下,故不过;旁行而不流,乐天知命,故不忧。
文档目录

Tesseract OCR 完整教程 / 第 9 章:版面分析

第 9 章:版面分析

掌握复杂文档布局的分析与处理技术。

9.1 版面分析概述

文档布局类型
├── 单栏文本
├── 多栏文本
├── 表格
├── 图文混排
├── 表单
└── 复杂布局(报纸、杂志)

9.1.1 Tesseract 版面分析能力

| 功能 | 支持程度 | 说明 |
| --- | --- | --- |
| 单栏文本 | ⭐⭐⭐⭐⭐ | 最佳 |
| 多栏文本 | ⭐⭐⭐ | 一般 |
| 表格 | ⭐⭐ | 有限 |
| 图文混排 | ⭐⭐⭐ | 一般 |
| 表单 | ⭐⭐⭐ | 需配置 |
| 手写体 | — | 很有限 |

9.2 页面分割模式详解

import pytesseract
from PIL import Image

def test_psm_modes(image_path, lang='chi_sim+eng'):
    """Run OCR once per page-segmentation mode and collect the outcomes.

    Returns a dict keyed by PSM number; each entry has the mode
    description plus either a text preview and length, or an error string.
    """
    image = Image.open(image_path)

    mode_descriptions = {
        0: 'OSD only',
        1: 'Auto + OSD',
        2: 'Auto',
        3: 'Fully auto',
        4: 'Single column',
        5: 'Vertical block',
        6: 'Uniform block',
        7: 'Single line',
        8: 'Single word',
        11: 'Sparse text',
        12: 'Sparse + OSD',
    }

    outcomes = {}
    for mode, description in mode_descriptions.items():
        try:
            recognized = pytesseract.image_to_string(
                image, lang=lang, config=f'--psm {mode}')
        except Exception as exc:
            # Some modes fail on some inputs; record the error instead of aborting.
            outcomes[mode] = {'desc': description, 'error': str(exc)}
        else:
            stripped = recognized.strip()
            outcomes[mode] = {
                'desc': description,
                'text': stripped[:100],
                'length': len(stripped),
            }

    return outcomes

# Example: try every PSM mode on a sample page, then pick the best one
# by inspecting the printed previews.
results = test_psm_modes('document.png')
for psm, info in results.items():
    print(f"PSM {psm:2d} ({info['desc']:15s}): {info.get('text', info.get('error', ''))[:60]}")

9.3 表格识别

9.3.1 基本表格识别

import pytesseract
from PIL import Image

def ocr_table(image_path, lang='chi_sim+eng'):
    """OCR a table image and group the recognized words by line number.

    Returns a dict mapping line number -> list of word dicts
    (text, left, top, width, conf), each list sorted left-to-right.
    """
    image = Image.open(image_path)

    # PSM 6 (assume a uniform block of text) tends to suit tables.
    data = pytesseract.image_to_data(image, lang=lang,
                                     config='--psm 6',
                                     output_type=pytesseract.Output.DICT)

    # Keep only confident, non-empty words, bucketed by their line number.
    lines = {}
    for i, raw in enumerate(data['text']):
        confidence = int(data['conf'][i])
        token = raw.strip()
        if confidence > 30 and token:
            entry = {
                'text': token,
                'left': data['left'][i],
                'top': data['top'][i],
                'width': data['width'][i],
                'conf': confidence,
            }
            lines.setdefault(data['line_num'][i], []).append(entry)

    # Order the words of each line by x coordinate (reading order).
    for words in lines.values():
        words.sort(key=lambda w: w['left'])

    return lines

# Example: OCR a table image and print each reconstructed row.
lines = ocr_table('table.png')
for line_num, words in sorted(lines.items()):
    row = ' | '.join(w['text'] for w in words)
    print(f"行 {line_num}: {row}")

9.3.2 表格结构检测

import cv2
import numpy as np

def detect_table_structure(image_path):
    """Detect the row/column grid lines of a table image.

    Isolates horizontal and vertical ruling lines with morphological
    opening (long thin kernels), projects them onto each axis, and
    keeps the peak positions, merging detections closer than 10 px.

    Args:
        image_path: Path to the table image.

    Returns:
        (rows, cols): lists of plain-int pixel positions of the detected
        horizontal and vertical grid lines.

    Raises:
        FileNotFoundError: if the image cannot be read.
    """
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread signals a bad path by returning None rather than raising;
        # fail loudly here instead of with an opaque error in cv2.threshold.
        raise FileNotFoundError(f"cannot read image: {image_path}")
    _, binary = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Horizontal lines: opening with a wide 1-px-tall kernel keeps only
    # ink runs at least 40 px long.
    h_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
    h_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, h_kernel)

    # Vertical lines: same idea with a tall 1-px-wide kernel.
    v_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
    v_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, v_kernel)

    # Project the line images onto each axis; peaks mark grid positions.
    h_proj = np.sum(h_lines, axis=1)
    v_proj = np.sum(v_lines, axis=0)

    row_positions = np.where(h_proj > np.max(h_proj) * 0.5)[0]
    col_positions = np.where(v_proj > np.max(v_proj) * 0.5)[0]

    def merge_positions(positions, threshold=10):
        """Collapse runs of nearby positions into one representative each."""
        if len(positions) == 0:
            return []
        # int() so callers get plain Python ints, not numpy scalars.
        merged = [int(positions[0])]
        for pos in positions[1:]:
            if pos - merged[-1] > threshold:
                merged.append(int(pos))
        return merged

    rows = merge_positions(row_positions)
    cols = merge_positions(col_positions)

    return rows, cols

# Example: report the table size implied by the detected grid lines
# (N grid lines delimit N-1 cells on that axis).
rows, cols = detect_table_structure('table.png')
print(f"检测到 {len(rows)-1} 行, {len(cols)-1} 列")

9.3.3 完整表格 OCR

import cv2
import numpy as np
import pytesseract
from PIL import Image

def full_table_ocr(image_path, lang='chi_sim+eng'):
    """Full table OCR: detect the grid, then recognize each cell.

    Returns a list of rows (lists of cell strings) when a grid is found;
    falls back to plain page OCR (a single string) otherwise.
    """
    img = cv2.imread(image_path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # 1. Locate the table grid lines.
    rows, cols = detect_table_structure(image_path)

    # Fewer than two lines on either axis means there is no usable grid.
    if len(rows) < 2 or len(cols) < 2:
        print("未检测到表格结构,尝试普通 OCR")
        return pytesseract.image_to_string(gray, lang=lang)

    # 2. OCR every cell delimited by consecutive grid lines.
    margin = 5  # shave the grid line itself off each cell crop
    table_data = []
    for row_idx in range(len(rows) - 1):
        current_row = []
        for col_idx in range(len(cols) - 1):
            top, bottom = rows[row_idx], rows[row_idx + 1]
            left, right = cols[col_idx], cols[col_idx + 1]

            cell = gray[top + margin:bottom - margin, left + margin:right - margin]

            # Degenerate crop (cell narrower than the margins): record empty.
            if cell.size == 0:
                current_row.append('')
                continue

            # PSM 7 (single text line) suits the short content of one cell.
            recognized = pytesseract.image_to_string(
                Image.fromarray(cell), lang=lang, config='--psm 7'
            ).strip()
            current_row.append(recognized)

        table_data.append(current_row)

    return table_data

# Example: OCR a table image and print it row by row.
table = full_table_ocr('table.png')
for i, row in enumerate(table):
    print(f"行 {i+1}: {' | '.join(row)}")

9.3.4 专业表格识别工具

# 使用其他库增强表格识别
# pip install tabula-py camelot-py[cv]

# Camelot(PDF 表格提取)
import camelot

def extract_pdf_tables(pdf_path):
    """Extract every table from a PDF with Camelot and print a summary."""
    tables = camelot.read_pdf(pdf_path, pages='all')

    for index, table in enumerate(tables, start=1):
        print(f"表格 {index}: {table.shape}")
        print(table.df)  # pandas DataFrame with the table contents
        print()

    return tables

# tabula-py(Java 依赖)
import tabula

def extract_tables_tabula(pdf_path):
    """Extract all tables from a PDF via tabula (requires a Java runtime)."""
    return tabula.read_pdf(pdf_path, pages='all')

9.4 多栏文档处理

9.4.1 栏检测

import cv2
import numpy as np

def detect_columns(image_path):
    """Estimate the number of text columns in a document image.

    Computes a smoothed vertical ink projection, finds wide blank bands
    (low projection), and treats each interior band as a column gutter.

    Args:
        image_path: Path to the document image.

    Returns:
        (num_columns, separators): the column count and the x positions
        of the detected gutters (plain ints).

    Raises:
        FileNotFoundError: if the image cannot be read.
    """
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread returns None on a bad path instead of raising.
        raise FileNotFoundError(f"cannot read image: {image_path}")
    _, binary = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    width = binary.shape[1]

    # Vertical projection: amount of ink in each pixel column.
    v_proj = np.sum(binary, axis=0)

    # Smooth with a moving average so inter-word gaps don't register.
    kernel_size = 20
    v_proj_smooth = np.convolve(v_proj, np.ones(kernel_size)/kernel_size, mode='same')

    # Blank regions are where the smoothed projection is well below average.
    threshold = np.mean(v_proj_smooth) * 0.3
    blank_regions = np.where(v_proj_smooth < threshold)[0]

    columns = []
    if len(blank_regions) > 0:
        # Group consecutive blank x positions into contiguous bands.
        groups = []
        current_group = [blank_regions[0]]
        for pos in blank_regions[1:]:
            if pos - current_group[-1] <= 5:
                current_group.append(pos)
            else:
                groups.append(current_group)
                current_group = [pos]
        groups.append(current_group)

        for group in groups:
            # Bug fix: blank bands touching the image edge are page margins,
            # not column gutters — previously they inflated the column count
            # and made split_columns emit empty strips.
            if group[0] <= 0 or group[-1] >= width - 1:
                continue
            # Only a sufficiently wide blank band counts as a gutter.
            if len(group) > 20:
                columns.append(int(np.mean(group)))

    return len(columns) + 1, columns  # column count, separator positions

# Example: detect the column layout of a newspaper scan.
num_cols, separators = detect_columns('newspaper.png')
print(f"检测到 {num_cols} 栏,分隔位置: {separators}")

9.4.2 多栏文档分割

import cv2
import pytesseract
from PIL import Image

def split_columns(image_path, separators):
    """Cut a page image into vertical strips at the given x positions.

    Returns one BGR image array per column, left to right.
    """
    img = cv2.imread(image_path)
    page_width = img.shape[1]

    # The page edges act as implicit outer boundaries.
    boundaries = [0] + separators + [page_width]

    # Pair consecutive boundaries and slice the image between them.
    return [img[:, left:right]
            for left, right in zip(boundaries, boundaries[1:])]

def ocr_multicolumn(image_path, lang='chi_sim+eng'):
    """OCR a possibly multi-column document, recognizing column by column."""
    # Probe the column layout first.
    num_cols, separators = detect_columns(image_path)

    # Single column: recognize the whole page directly.
    if num_cols == 1:
        return pytesseract.image_to_string(Image.open(image_path), lang=lang)

    all_text = []
    for index, column_img in enumerate(split_columns(image_path, separators)):
        # OpenCV gives BGR; PIL expects RGB.
        rgb = cv2.cvtColor(column_img, cv2.COLOR_BGR2RGB)
        # PSM 4 assumes a single column of variable-height text.
        text = pytesseract.image_to_string(
            Image.fromarray(rgb), lang=lang, config='--psm 4')
        all_text.append(f"=== 栏 {index+1} ===\n{text}")

    return '\n\n'.join(all_text)

9.5 图文混排处理

9.5.1 区域类型检测

import cv2
import numpy as np

def detect_regions(image_path):
    """Classify document areas into text, image, and table regions.

    Returns a dict with keys 'text', 'image', 'table', each holding
    (x, y, w, h) bounding boxes.
    """
    img = cv2.imread(image_path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Outer contours only; each one is a candidate region.
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    regions = {'text': [], 'image': [], 'table': []}

    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        area = w * h

        # Skip specks.
        if area < 1000:
            continue

        # Fraction of foreground pixels inside the bounding box.
        roi = binary[y:y+h, x:x+w]
        density = np.sum(roi > 0) / area

        aspect_ratio = w / h
        box = (x, y, w, h)

        # Heuristic classification by ink density and shape.
        if density > 0.3 and 0.1 < aspect_ratio < 10:
            # Dense with text-like proportions -> text region.
            regions['text'].append(box)
        elif density < 0.1 and area > 10000:
            # Large and sparse -> image region.
            regions['image'].append(box)
        else:
            # Anything else is likely a table.
            regions['table'].append(box)

    return regions

# Example: count the detected region types in a mixed-layout page.
regions = detect_regions('mixed.png')
print(f"文本区域: {len(regions['text'])}")
print(f"图像区域: {len(regions['image'])}")
print(f"表格区域: {len(regions['table'])}")

9.5.2 图文混排 OCR

import cv2
import pytesseract
from PIL import Image

def ocr_mixed_layout(image_path, lang='chi_sim+eng'):
    """OCR a document that mixes text with images/tables.

    Returns a list of dicts ({'type', 'position', 'content'}) sorted in
    reading order (top-to-bottom, then left-to-right).
    """
    img = cv2.imread(image_path)
    regions = detect_regions(image_path)

    results = []

    # Recognize each detected text region separately.
    for x, y, w, h in regions['text']:
        crop = cv2.cvtColor(img[y:y+h, x:x+w], cv2.COLOR_BGR2RGB)
        text = pytesseract.image_to_string(
            Image.fromarray(crop), lang=lang, config='--psm 6')

        stripped = text.strip()
        if stripped:
            results.append({
                'type': 'text',
                'position': (x, y),
                'content': stripped,
            })

    # Sort into reading order: top first, then left.
    results.sort(key=lambda entry: (entry['position'][1], entry['position'][0]))

    return results

9.6 版面分析可视化

import cv2
import pytesseract
from PIL import Image

def visualize_layout(image_path, output_path='layout.png'):
    """Draw Tesseract's detected layout boxes onto the image and save it."""
    canvas = cv2.imread(image_path)
    data = pytesseract.image_to_data(Image.open(image_path),
                                     output_type=pytesseract.Output.DICT)

    # BGR colors keyed by Tesseract hierarchy level.
    colors = {
        1: (255, 0, 0),    # page - red
        2: (0, 255, 0),    # block - green
        3: (0, 0, 255),    # para - blue
        4: (255, 255, 0),  # line - yellow
        5: (255, 0, 255),  # word - magenta
    }

    for i, level in enumerate(data['level']):
        # Draw only block and line boxes to keep the output readable.
        if level not in (2, 4):
            continue
        x, y = data['left'][i], data['top'][i]
        w, h = data['width'][i], data['height'][i]

        color = colors.get(level, (128, 128, 128))
        cv2.rectangle(canvas, (x, y), (x + w, y + h), color, 2)

        # Label each box with its hierarchy level.
        label = f"L{level}"
        cv2.putText(canvas, label, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.4, color, 1)

    cv2.imwrite(output_path, canvas)
    print(f"版面分析可视化: {output_path}")

# Example: render Tesseract's layout boxes for a sample document.
visualize_layout('document.png')

9.7 表单识别

import pytesseract
from PIL import Image
import re

def ocr_form(image_path, fields, lang='chi_sim+eng'):
    """Form OCR: locate each labeled field and read its value.

    `fields` maps a field name to a config dict with a 'label' string.
    A field's value is assumed to sit to the right of or below its label.
    Returns a dict of field name -> extracted string (None if the label
    was not found).
    """
    img = Image.open(image_path)

    # Full-page OCR with word-level geometry.
    data = pytesseract.image_to_data(img, lang=lang, output_type=pytesseract.Output.DICT)

    # Flatten to a list of confident words with their bounding boxes.
    words = []
    for i, raw in enumerate(data['text']):
        token = raw.strip()
        if int(data['conf'][i]) > 30 and token:
            left, top = data['left'][i], data['top'][i]
            words.append({
                'text': token,
                'left': left,
                'top': top,
                'right': left + data['width'][i],
                'bottom': top + data['height'][i],
            })

    results = {}
    for field_name, field_config in fields.items():
        label_text = field_config.get('label', '')

        # The first word containing the label text anchors the field.
        label_pos = next((w for w in words if label_text in w['text']), None)

        if label_pos is None:
            results[field_name] = None
            continue

        # Gather candidate value words near the label.
        value_words = []
        for word in words:
            if word == label_pos:
                continue

            # Right of the label, roughly on the same baseline.
            same_row = (word['left'] > label_pos['right'] and
                        abs(word['top'] - label_pos['top']) < 30)
            # Below the label, roughly left-aligned with it.
            below = (word['top'] > label_pos['bottom'] and
                     abs(word['left'] - label_pos['left']) < 50)

            if same_row or below:
                value_words.append(word)

        results[field_name] = ' '.join(w['text'] for w in value_words).strip()

    return results

# Example: extract three labeled fields from a form image.
fields = {
    'name': {'label': '姓名'},
    'id': {'label': '身份证'},
    'phone': {'label': '电话'},
}

result = ocr_form('form.png', fields)
for field, value in result.items():
    print(f"{field}: {value}")

9.8 复杂布局处理策略

9.8.1 分治策略

def process_complex_layout(image_path, lang='chi_sim+eng'):
    """Pick an OCR strategy based on the detected page layout."""
    # 1. Probe the layout.
    num_cols, _ = detect_columns(image_path)
    regions = detect_regions(image_path)

    # 2. Dispatch to the most specific handler that applies.
    if regions['table']:
        # Tables take priority: grid-based cell OCR.
        return full_table_ocr(image_path, lang)

    if num_cols > 1:
        # Multi-column page: split and recognize column by column.
        return ocr_multicolumn(image_path, lang)

    if regions['image']:
        # Mixed text and graphics.
        return ocr_mixed_layout(image_path, lang)

    # Plain single-column document: recognize directly.
    return pytesseract.image_to_string(Image.open(image_path), lang=lang)

9.9 第三方版面分析工具

| 工具 | 特点 | 安装 |
| --- | --- | --- |
| LayoutParser | 深度学习版面分析 | pip install layoutparser |
| Detectron2 | Facebook 检测框架 | 需编译 |
| PaddleDetection | 百度检测 | pip install paddle-det |
| YOLOv8 | 通用目标检测 | pip install ultralytics |
# LayoutParser 示例
# LayoutParser 示例
import layoutparser as lp

def layout_analysis_advanced(image_path):
    """Run deep-learning layout analysis with LayoutParser's PubLayNet model."""
    model = lp.Detectron2LayoutModel(
        config_path='lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
        extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.5]
    )

    # The model expects RGB input; OpenCV loads BGR.
    rgb = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)

    layout = model.detect(rgb)

    # Report every detected block: class, confidence, and bounding box.
    for block in layout:
        print(f"类型: {block.type}, 置信度: {block.score:.2f}")
        print(f"  位置: {block.block.x_1:.0f}, {block.block.y_1:.0f}, "
              f"{block.block.x_2:.0f}, {block.block.y_2:.0f}")

    return layout

9.10 本章小结

| 要点 | 说明 |
| --- | --- |
| PSM 选择 | 根据布局选择合适模式 |
| 表格识别 | 检测结构 + 单元格 OCR |
| 多栏处理 | 垂直投影 + 分割 + 逐栏 OCR |
| 图文混排 | 区域检测 + 分类处理 |
| 可视化 | image_to_data + OpenCV 绘制 |

9.11 扩展阅读


上一章: Python 集成 | 下一章: 精度优化