主要使用python-docx 与pandas
因为python-docx对表格的解析不够友好且效率低,故需转换一次
代码如下
# coding:utf-8
import os, re
import docx
from docx.document import Document as dc
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
from docx.shared import RGBColor # 设置字体颜色
from docx import Document
from docx.shared import Pt # 设置字体
from docx.oxml.ns import qn # 设置中文字体
import pandas as pd
FILE_PATH = r"D:\xxxx\xxxx\xxxx\xxxx.docx"
obj = docx.Document(FILE_PATH)
def iter_block_items(parent):
# print('utils.py ----> iter_block_items:', 2)
if isinstance(parent, dc):
parent_elm = parent.element.body
elif isinstance(parent, _Cell):
parent_elm = parent._tc
else:
raise ValueError("[TypeError] Document in insuitable type.")
for child in parent_elm.iterchildren():
if isinstance(child, CT_P):
yield Paragraph(child, parent)
elif isinstance(child, CT_Tbl):
yield Table(child, parent)
def table2list(table):
data = []
for i, row in enumerate(table.rows):
row_data = []
for cell in row.cells:
row_data.append(cell.text)
data.append(row_data)
return data
#替换的段落关键字
word = '段落关键字'
#替换的表格关键字
table_text = '表格关键字'
def set_run(run, font_size, bold, color, name):
'''
设置run对象
:param run:
:param font_size: 字体大小
:param bold: 是否加粗
:param color: 字体颜色
:param name: 字体名
:return:
'''
run.font.size = font_size
run.bold = bold
run.font.color.rgb = color
run.font.name = name
# 设置字体必须要下面2步
s = run._element
s.rPr.rFonts.set(qn('w:eastAsia'), name)
def paragraphs_utils(obj):
for p in obj.paragraphs:
# 先循环得到单个段落p
for r in p.runs:
if word not in r.text:
# 判断关键字是否存在于段落文本中
continue
# print(r.text)
# print(r.style.name)
font_size = r.font.size
bold = r.bold
color = r.font.color.rgb
name = u'楷体'
# 使用关键词切分当前run的文本
rest = r.text.split(word)
# 清除当前run的内容
r.text = ''
for text in rest[:-1]:
# 循环切割出来的列表 ['','xxxxxxx']或者['xxxxx','']
run = p.add_run(text=text)
set_run(run, font_size, bold, color, name)
run = p.add_run(word)
# 重写关键字部分
set_run(run, font_size, bold, color, name)
run.font.color.rgb = RGBColor(255, 0, 0)
run = p.add_run(rest[-1])
# 在补齐r.text的内容
set_run(run, font_size, bold, color, name)
obj.save('标注后的文档.docx')
def table_utils(obj):
for p in obj.tables:
# 先循环得到单个表格p
pd_block = pd.DataFrame(table2list(p))
# 使用table2list 将table转成列表,然后转成pandas的DateFrame对象
for rows in range(pd_block.shape[0]):
# 循环pd_block(DateFrame对象)的行数 -》shape方法得到元祖 为行数和列数
if rows == 0: continue
if table_text != pd_block.iloc[rows, 0]: continue
# 判断关键字是否等于当前表的 rows行0列,否则跳过
for cols in range(pd_block.shape[1]):
if cols == 0: continue
rs = p.cell(rows, cols).paragraphs[0]
# 此时rows和cols肯定为关键字所在的那行数据,用document对象获取paragraphs取0
for r in rs.runs: # paragraphs中有个runs 是个列表
font_size = r.font.size
bold = r.bold
color = r.font.color.rgb
name = u'楷体'
data = r.text.strip()
# 清除当前run的内容
r.text = ''
run = rs.add_run(data)
# 此时要使用paragraphs的add_run方法重写data数据
set_run(run, font_size, bold, color, name)
run.font.color.rgb = RGBColor(255, 0, 0)
obj.save('标注后的表格.docx')
for block in iter_block_items(obj):
if isinstance(block, Paragraph):
for r in block.runs:
if word not in r.text:
continue
print(r.text)
print(r.style.name)
font_size = r.font.size
bold = r.bold
color = r.font.color.rgb
name = u'楷体'
# 使用关键词切分当前run的文本
rest = r.text.split(word)
# 清除当前run的内容
r.text = ''
for text in rest[:-1]:
run = block.add_run(text=text)
set_run(run, font_size, bold, color, name)
run = block.add_run(word)
set_run(run, font_size, bold, color, name)
run.font.color.rgb = RGBColor(255, 0, 0)
run = block.add_run(rest[-1])
set_run(run, font_size, bold, color, name)
else:
pd_block = pd.DataFrame(table2list(block))
# 使用table2list 将table转成列表,然后转成pandas的DateFrame对象
for rows in range(pd_block.shape[0]):
# 循环pd_block(DateFrame对象)的行数 -》shape方法得到元祖 为行数和列数
if rows == 0: continue
if table_text != pd_block.iloc[rows, 0]: continue
# 判断关键字是否等于当前表的 rows行0列,否则跳过
for cols in range(pd_block.shape[1]):
if cols == 0: continue
rs = block.cell(rows, cols).paragraphs[0]
# 此时rows和cols肯定为关键字所在的那行数据,用document对象获取paragraphs取0
for r in rs.runs: # paragraphs中有个runs 是个列表
font_size = r.font.size
bold = r.bold
color = r.font.color.rgb
name = u'楷体'
data = r.text.strip()
# 清除当前run的内容
r.text = ''
run = rs.add_run(data)
# 此时要使用paragraphs的add_run方法重写data数据
set_run(run, font_size, bold, color, name)
run.font.color.rgb = RGBColor(255, 0, 0)
obj.save('段落与表格标注后的文档.docx')