1 Star 0 Fork 2

w4sevens / python-word-process

forked from ypftest / python-word-process 
加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
python-docx-keyword.py 6.46 KB
一键复制 编辑 原始数据 按行查看 历史
杨鹏飞 提交于 2020-08-21 17:41 . 分步处理更新
"""
将文章中的关键字加粗(第一行中关键字不加粗, 第一行后面_下划线删除),去除标题中的日期
"""
import os
from docx import Document
import pythoncom
import re
from docx.opc.exceptions import PackageNotFoundError
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
# 读取文件夹下的docx文件名列表
def docx_file_name(file_dir):
    """Collect the paths of all ``.docx`` files below *file_dir*.

    The tree is walked recursively with ``os.walk``. A file is selected
    when its extension is exactly ``.docx`` (the comparison is
    case-sensitive).

    Args:
        file_dir: Root directory to search.

    Returns:
        A list of paths, each the containing directory joined with the
        file name, in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(file_dir)
        for name in names
        if os.path.splitext(name)[1] == '.docx'
    ]
# 取关键词
def keyword_find(doc):
    """Return the keyword encoded in the title (the first paragraph).

    The keyword is the text after the last ``_`` in the first
    paragraph. When the title contains no underscore the whole title is
    returned, because ``str.split`` then yields a single element.

    Args:
        doc: Document-like object exposing ``paragraphs``, each with a
            ``text`` attribute.

    Returns:
        The keyword string, or ``""`` when the document has no
        paragraphs.
    """
    paragraphs = doc.paragraphs
    if not paragraphs:
        # No title to read; report "no keyword" instead of raising
        # IndexError.
        return ""
    return paragraphs[0].text.split("_")[-1]
# 去除标题中的_
def replace_(doc):
    """Remove every ``_`` from the runs of the title (first paragraph).

    Runs are edited one by one, so the title keeps its run structure.
    A run is written only when it actually contains an underscore,
    because assigning ``Run.text`` in python-docx replaces all existing
    content of the run.

    Args:
        doc: Document-like object exposing ``paragraphs``; the first
            paragraph must expose ``runs`` with a ``text`` attribute.

    Returns:
        The same ``doc`` object, modified in place. A document without
        paragraphs is returned unchanged.
    """
    paragraphs = doc.paragraphs
    if not paragraphs:
        return doc
    for run in paragraphs[0].runs:
        text = run.text
        if "_" in text:
            run.text = text.replace("_", "")
    return doc
# 删除标题中的日期
def replace_date(doc):
    """Strip date stamps from the title (the first paragraph).

    The following forms are removed, in this order:

    * ``-2018-09-20`` / ``-2018-2-13`` (date with a leading dash)
    * ``2018-09-20`` / ``2018-2-13``
    * ``2018年09月20日`` / ``2018年2月13日``
    * any run of eight digits such as ``20180324``

    The patterns are applied to a local copy of the text and the result
    is written back once, so the paragraph is rebuilt a single time.
    The write is unconditional, as before, and assigning
    ``Paragraph.text`` replaces the paragraph's runs with one run.

    Args:
        doc: Document-like object exposing ``paragraphs``, each with a
            ``text`` attribute.

    Returns:
        The same ``doc`` object, modified in place. A document without
        paragraphs is returned unchanged.
    """
    paragraphs = doc.paragraphs
    if not paragraphs:
        return doc
    title = paragraphs[0]
    date_patterns = (
        # Dash-prefixed form first, so the dash goes with the date.
        r"(-\d{4}-\d{1,2}-\d{1,2})",
        r"(\d{4}-\d{1,2}-\d{1,2})",
        r"(\d{4}\u5E74\d{1,2}\u6708\d{1,2}\u65E5)",
        r"(\d{8})",
    )
    text = title.text
    for pattern in date_patterns:
        text = re.sub(pattern, "", text)
    title.text = text
    return doc
# 删除正文中的一句话
def replace_text(doc):
    """Delete boilerplate phrases from every body paragraph.

    The title (paragraph 0) is skipped. The patterns below are applied
    to each remaining paragraph in the listed order, and the result is
    written back to ``para.text`` after every substitution.

    Args:
        doc: Document-like object exposing ``paragraphs``, each with a
            readable and writable ``text`` attribute.

    Returns:
        The same ``doc`` object, modified in place.
    """
    # "<lead> ... 发布" optionally followed by "的"; the gap may contain
    # only CJK characters and ASCII letters.
    published_by = r"([\u4e00-\u9fa5a-zA-Z]*)(\u53d1\u5e03)([\u7684]*)"
    removal_patterns = (
        # 这*发布(的)
        r"(\u8fd9)" + published_by,
        # 此*发布(的)
        r"(\u6b64)" + published_by,
        # 本*发布(的)
        r"(\u672c)" + published_by,
        # 战况报告由*发布(的)
        r"(\u6218\u51b5\u62a5\u544a\u7531)" + published_by,
        # 比赛报告由*发布(的)
        r"(\u6bd4\u8d5b\u62a5\u544a\u7531)" + published_by,
        # 由*发布(的)
        r"(\u7531)" + published_by,
        # [来 ... 在等着你
        r"(\[\u6765)([\s\S]*)(\u5728\u7b49\u7740\u4f60)",
        # [来 ... 等着您!
        r"(\[\u6765)([\s\S]*)(\u7b49\u7740\u60a8\uff01)",
        # 【足球 ... 应用
        r"(\u3010\u8db3\u7403)([\s\S]*)(\u5e94\u7528)",
        # 【 ... 等你来拿。
        r"(\u3010)([\s\S]*)(\u7b49\u4f60\u6765\u62ff\u3002)",
        # 【来 ... 等你领!
        r"(\u3010\u6765)([\s\S]*)(\u7b49\u4f60\u9886\uff01)",
        # 【来 ... 等您来
        r"(\u3010\u6765)([\s\S]*)(\u7b49\u60a8\u6765)",
        # [如果 ... 应用中找到。
        r"(\[\u5982\u679c)([\s\S]*)(\u5e94\u7528\u4e2d\u627e\u5230\u3002)",
        # [, ... 查看!
        r"(\[\uff0c)([\s\S]*)(\u67e5\u770b\uff01)",
        # 新用户 ... 等着你去取。
        r"(\u65b0\u7528\u6237)([\s\S]*)(\u7b49\u7740\u4f60\u53bb\u53d6\u3002)",
        # 来-掷-赌!
        r"(\u6765\u002d\u63b7\u002d\u8d4c\uff01)",
    )
    for para in doc.paragraphs[1:]:
        for pattern in removal_patterns:
            para.text = re.sub(pattern, "", para.text)
    return doc
# 删除正文中的日期
def replace_date_1(doc):
    """Remove ``YYYY年M月D日`` dates from every body paragraph.

    The title (paragraph 0) is skipped. A date followed by a full-width
    comma is removed together with that comma first; remaining bare
    dates are removed afterwards.

    Args:
        doc: Document-like object exposing ``paragraphs``, each with a
            readable and writable ``text`` attribute.

    Returns:
        The same ``doc`` object, modified in place.
    """
    date_with_comma = r"(\d{4}\u5E74\d{1,2}\u6708\d{1,2}\u65E5\uFF0C)"
    bare_date = r"(\d{4}\u5E74\d{1,2}\u6708\d{1,2}\u65E5)"
    for body_para in doc.paragraphs[1:]:
        body_para.text = re.sub(date_with_comma, "", body_para.text)
        body_para.text = re.sub(bare_date, "", body_para.text)
    return doc
# 加粗关键词
def key_word_bold(doc, keyword):
    """Make every occurrence of *keyword* in the body paragraphs bold.

    The title (paragraph 0) is skipped. A run containing the keyword is
    emptied and its text is re-added at the end of the paragraph as
    separate runs: keyword pieces with ``bold = True``, the text between
    them with ``bold = False``.

    Runs that do not contain the keyword keep their bold state:

    * runs before the first matching run are left untouched;
    * runs after it are re-appended with their previous ``bold`` value,
      so reading order stays intact.

    Because of this, calling the function again with another keyword
    does not undo earlier bolding.

    Only a keyword that lies entirely within one run is detected.

    Args:
        doc: Document-like object exposing ``paragraphs``; each
            paragraph exposes ``runs`` and ``add_run(text)``.
        keyword: Text to embolden. An empty keyword is a no-op
            (``str.split('')`` would raise ``ValueError``).

    Returns:
        The same ``doc`` object, modified in place.
    """
    if not keyword:
        return doc
    for p in doc.paragraphs[1:]:
        # Becomes True once a run has been split and moved to the end;
        # every later run must then move too, to keep reading order.
        relocating = False
        # Snapshot: add_run() appends to the paragraph while we iterate.
        for r in list(p.runs):
            text = r.text
            if keyword not in text:
                if relocating and text:
                    previous_bold = r.bold
                    r.text = ''
                    p.add_run(text).bold = previous_bold
                continue
            relocating = True
            r.text = ''
            for index, piece in enumerate(text.split(keyword)):
                if index:
                    # One keyword sits between consecutive pieces.
                    p.add_run(keyword).bold = True
                if piece:
                    p.add_run(piece).bold = False
    return doc
# 替换换行符(即向下箭头)
def replace_n_n(doc):
    """Replace each doubled line break (``"\\n\\n"``) with a single one.

    Every run of every paragraph is inspected. A run is written only
    when it actually contains ``"\\n\\n"``, because assigning
    ``Run.text`` in python-docx replaces all existing content of the
    run.

    The replacement is a single non-overlapping pass, so three
    consecutive line breaks become two.

    Args:
        doc: Document-like object exposing ``paragraphs``; each
            paragraph exposes ``runs`` with a ``text`` attribute.

    Returns:
        The same ``doc`` object, modified in place.
    """
    for para in doc.paragraphs:
        for run in para.runs:
            text = run.text
            if "\n\n" in text:
                run.text = text.replace("\n\n", "\n")
    return doc
# 设置段落左对齐
def align_left(doc):
    """Set the alignment of every paragraph in *doc* to left.

    Args:
        doc: Document-like object exposing ``paragraphs``, each with a
            writable ``alignment`` attribute.

    Returns:
        The same ``doc`` object, modified in place.
    """
    for paragraph in doc.paragraphs:
        # Enum values: 0 = left, 1 = center, 2 = right, 3 = justify, ...
        paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
    return doc
def main(file_dir="D:\\文件处理\\2020.8.12-350篇足球"):
    """Process every ``.docx`` file under *file_dir* in place.

    For each document: read the keyword from the title, clean the title
    (underscores, dates), clean the body (dates, boilerplate phrases,
    doubled line breaks), embolden the keyword in the body, then save
    the document over the original file.

    Args:
        file_dir: Root directory to scan. Defaults to the directory the
            script was originally written for.
    """
    file_list = docx_file_name(file_dir)
    print(len(file_list))
    for file in file_list:
        pythoncom.CoInitialize()
        try:
            doc = Document(file)
            print(len(doc.paragraphs))
            if not doc.paragraphs:
                # No title and no body: leave the file untouched rather
                # than letting one odd document abort the batch.
                continue
            # The keyword must be read before the underscores are
            # stripped from the title.
            keyword = keyword_find(doc)
            # Title clean-up: underscores, then dates.
            replace_(doc)
            replace_date(doc)
            # Body clean-up: dates, boilerplate phrases, then "\n\n" ->
            # "\n" so that no extra blank lines appear.
            replace_date_1(doc)
            replace_text(doc)
            replace_n_n(doc)
            # Embolden the keyword. A keyword containing "2020" is
            # handled as two separate keywords: "2020" and the rest.
            if "2020" in keyword:
                key_word_bold(doc, "2020")
                remainder = keyword.replace("2020", "")
                if remainder:
                    # A keyword that is exactly "2020" leaves nothing
                    # further to embolden.
                    key_word_bold(doc, remainder)
            elif keyword:
                key_word_bold(doc, keyword)
            doc.save(file)
        except PackageNotFoundError:
            # Not a readable .docx package; report it and move on.
            print("skipped (not a valid .docx package): " + file)
        finally:
            # Release the COM resources acquired by CoInitialize().
            pythoncom.CoUninitialize()


if __name__ == '__main__':
    main()
Python
1
https://gitee.com/w4dll/python-word-process.git
git@gitee.com:w4dll/python-word-process.git
w4dll
python-word-process
python-word-process
master

搜索帮助