1 Star 0 Fork 2

w4sevens / python-word-process

forked from ypftest / python-word-process 
加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
python-test.py 7.58 KB
一键复制 编辑 原始数据 按行查看 历史
杨鹏飞 提交于 2020-06-29 11:50 . 更新说明文档及程序
"""
Modify document formatting.
"""
import os
import re
import pythoncom
import win32com.client as wc
from docx import Document
from docx.shared import Inches
def doc_file_name(file_dir):
    """Collect every ``.doc`` file under *file_dir* and prepare output folders.

    Walks *file_dir* recursively, printing each visited directory and its list
    of sub-directories. Every directory that directly contains at least one
    ``.doc`` file gets a ``docx`` sub-directory (created if missing), which is
    where ``doc_to_docx`` saves converted files.

    Args:
        file_dir: Root directory to scan.

    Returns:
        list: Full paths of all files whose extension is exactly ``.doc``.
    """
    fileList = []
    for root, dirs, files in os.walk(file_dir):
        print(root)
        print(dirs)
        doc_paths = [
            os.path.join(root, file)
            for file in files
            if os.path.splitext(file)[1] == '.doc'
        ]
        if not doc_paths:
            # Nothing to convert here, so no output folder is needed. This
            # also keeps repeated runs from nesting docx/docx/... folders.
            continue
        fileList.extend(doc_paths)
        # os.path.join instead of a hard-coded backslash; exist_ok avoids the
        # check-then-create race of exists() + makedirs().
        os.makedirs(os.path.join(root, 'docx'), exist_ok=True)
    return fileList
def rreplace(s, old, new, occurrence):
    """Replace up to *occurrence* occurrences of *old* in *s*, from the right.

    Args:
        s: String to process.
        old: Substring to look for.
        new: Replacement substring.
        occurrence: Maximum number of right-most matches to replace.

    Returns:
        str: *s* with the selected right-most matches of *old* replaced.
    """
    # Splitting from the right at most `occurrence` times and re-joining with
    # the replacement swaps exactly those right-most separators.
    return new.join(s.rsplit(old, occurrence))
def doc_to_docx(docName):
    """Save a ``.doc`` file as ``.docx`` through Word COM automation.

    The converted file is written into a ``docx`` sub-folder located in the
    same directory as the source file, keeping the base name and using the
    ``.docx`` extension. Errors are printed rather than raised, and the Word
    application is always asked to quit afterwards.

    Args:
        docName: Path of the ``.doc`` file. Per the original author's note,
            Word needs a complete absolute path written with backslashes;
            relative paths and forward slashes were not found.

    Returns:
        None.
    """
    pythoncom.CoInitialize()
    # Pre-bind so the finally block cannot hit an unbound name when Dispatch
    # itself fails.
    word = None
    try:
        word = wc.Dispatch("Word.Application")
        doc = word.Documents.Open(docName)
        folder, base = os.path.split(docName)
        # Swap only the real extension; str.replace(".doc", ".docx") would
        # also rewrite any other ".doc" appearing earlier in the path.
        target = os.path.join(folder, "docx", os.path.splitext(base)[0] + ".docx")
        # FileFormat=12 saves as docx; Encoding=65001 is the UTF-8 code page.
        doc.SaveAs(target, FileFormat=12, Encoding=65001)
        # Close/Quit must actually be called; a bare attribute reference
        # does not invoke the COM method.
        doc.Close()
    except Exception as e:
        # Exceptions have no .message attribute in Python 3; print the
        # exception itself.
        print(e)
    finally:
        # When automating COM, always make sure the Word application exits.
        if word is not None:
            word.Quit()
        del word
        # Release COM resources for this thread.
        pythoncom.CoUninitialize()
def keyword_find(doc):
    """Extract the keyword from the first paragraph of *doc*.

    The keyword is the part of the first paragraph's text that follows the
    last underscore; when the text has no underscore the whole text is
    returned.

    Args:
        doc: Document object exposing ``paragraphs[0].text``.

    Returns:
        str: The keyword.
    """
    _, _, key_word = doc.paragraphs[0].text.rpartition("_")
    return key_word
def replace_(doc):
    """Remove every underscore from the title (first paragraph) of *doc*.

    The text of each run in ``doc.paragraphs[0]`` is rewritten in place.

    Args:
        doc: Document object exposing ``paragraphs[0].runs``.

    Returns:
        The same *doc* object.
    """
    for title_run in doc.paragraphs[0].runs:
        title_run.text = title_run.text.replace("_", "")
    return doc
def replace_n_n(doc):
    """Collapse doubled line breaks into single ones in every run of *doc*.

    Each run's text has every non-overlapping pair of consecutive newline
    characters replaced by one newline (a single pass, so a run of four
    newlines becomes two).

    Args:
        doc: Document object exposing ``paragraphs`` whose items have ``runs``.

    Returns:
        The same *doc* object.
    """
    for paragraph in doc.paragraphs:
        for text_run in paragraph.runs:
            text_run.text = text_run.text.replace("\n\n", "\n")
    return doc
def replace_n_picture(doc, picture1, picture2):
    """Insert *picture1* after the first line break of the second paragraph.

    Scans the runs of ``doc.paragraphs[1]`` for the first run whose text
    contains a newline. The picture (5.25 inches wide) and a trailing line
    break are added to the run that follows it. The run count and each
    visited index are printed, as in the original script.

    Args:
        doc: Document object exposing ``paragraphs[1]``.
        picture1: Picture to insert.
        picture2: Accepted for interface compatibility; not used by this
            function.

    Returns:
        The same *doc* object.
    """
    para = doc.paragraphs[1]
    runs = para.runs
    print(len(runs))
    for i, run in enumerate(runs):
        print(i)
        if '\n' not in run.text:
            continue
        # The picture goes into the run after the newline. When the newline
        # sits in the last run there is no such run (previously an
        # IndexError), so append a fresh one.
        target = runs[i + 1] if i + 1 < len(runs) else para.add_run()
        target.add_picture(picture1, width=Inches(5.25))
        target.add_break()
        break
    return doc
def replace_date(doc):
    """Strip date strings from the first paragraph of *doc*.

    The paragraph text is rewritten once per pattern, in this order:

    1. "-2018-09-20" or "-2018-2-13" (dashed date with a leading dash)
    2. "2018-09-20" or "2018-2-13" (dashed date)
    3. Chinese year/month/day form (4 digits, U+5E74, 1-2 digits, U+6708,
       1-2 digits, U+65E5)
    4. "20180324" (any eight consecutive digits)

    Args:
        doc: Document object exposing ``paragraphs[0].text``.

    Returns:
        The same *doc* object.
    """
    date_patterns = (
        r"(-\d{4}-\d{1,2}-\d{1,2})",
        r"(\d{4}-\d{1,2}-\d{1,2})",
        r"(\d{4}\u5E74\d{1,2}\u6708\d{1,2}\u65E5)",
        r"(\d{8})",
    )
    para = doc.paragraphs[0]
    for pattern in date_patterns:
        para.text = re.sub(pattern, "", para.text)
    return doc
def add_null_line(doc):
    """Append an empty line to the first paragraph of *doc*.

    A new run is added to ``doc.paragraphs[0]`` and a break is added to it.

    Args:
        doc: Document object exposing ``paragraphs[0].add_run()``.

    Returns:
        The same *doc* object.
    """
    doc.paragraphs[0].add_run().add_break()
    return doc
def replace(string, new_string):
    """Run a Word find-and-replace of *string* with *new_string* via COM.

    Starts a new, visible Word application with alerts disabled and calls
    ``Find.Execute`` on its current selection.

    NOTE(review): no document is opened here and the Word instance is never
    quit, so this presumably relies on state that is not visible in this
    function. Confirm the intended usage before relying on it.

    Args:
        string: Text to search for.
        new_string: Replacement text.

    Returns:
        None.
    """
    # Values from Word's WdFindWrap / WdReplace enumerations.
    wd_find_continue = 1
    wd_replace_all = 2

    app = wc.DispatchEx('Word.Application')
    app.Visible = 1
    app.DisplayAlerts = 0
    finder = app.Selection.Find
    # Positional order per the Word object model: FindText, MatchCase,
    # MatchWholeWord, MatchWildcards, MatchSoundsLike, MatchAllWordForms,
    # Forward, Wrap, Format, ReplaceWith, Replace.
    finder.Execute(
        string, False, False, False, False, False,
        True, wd_find_continue, True, new_string, wd_replace_all,
    )
def add_picture(doc, picture1, picture2):
    """Insert two pictures after the title line.

    A new run is appended to ``doc.paragraphs[0]``; for each picture, in
    order, a line break and then the picture (5.25 inches wide) are added to
    that run.

    Args:
        doc: Document object exposing ``paragraphs[0].add_run()``.
        picture1: First picture to insert.
        picture2: Second picture to insert.

    Returns:
        None.
    """
    title_run = doc.paragraphs[0].add_run()
    for picture in (picture1, picture2):
        title_run.add_break()
        title_run.add_picture(picture, width=Inches(5.25))
def add_para_picture(doc, para, picture):
    """Insert a piece of text and one picture after the title line.

    A new run holding a newline followed by *para* is appended to
    ``doc.paragraphs[0]``; a line break and the picture (5.25 inches wide)
    are then added to that same run.

    Args:
        doc: Document object exposing ``paragraphs[0].add_run()``.
        para: Text to insert (concatenated to a leading newline).
        picture: Picture to insert.

    Returns:
        None.
    """
    title_paragraph = doc.paragraphs[0]
    inserted_text = "\n" + para
    text_run = title_paragraph.add_run(inserted_text)
    text_run.add_break()
    picture_width = Inches(5.25)
    text_run.add_picture(picture, width=picture_width)
def pick_random_one(list):
    """Return one randomly chosen element of *list*.

    Args:
        list: Non-empty sequence to pick from. The name shadows the builtin,
            but it is kept because it is part of the function's signature.

    Returns:
        One element of *list*, selected with ``random.choice``.
    """
    from random import choice

    picked = choice(list)
    return picked
def key_word_bold(doc, keyword):
    """Make every occurrence of *keyword* bold in the body paragraphs.

    The first paragraph (index 0) is skipped. In each other paragraph that
    contains the keyword, the runs from the first matching run onward are
    emptied and their text is re-appended at the end of the paragraph as new
    runs: keyword pieces with ``bold = True``, all other pieces with
    ``bold = False``. Rebuilding the trailing runs too (not only the matching
    ones) keeps the paragraph text in its original order, because new runs
    can only be appended.

    Paragraphs without the keyword, and runs before the first match, are left
    untouched. A keyword that spans more than one run is not detected.

    Args:
        doc: Document object exposing ``paragraphs`` with ``runs`` and
            ``add_run``.
        keyword: Text to make bold. An empty keyword is a no-op (it used to
            raise ``ValueError`` from ``str.split``).

    Returns:
        None.
    """
    if not keyword:
        return
    for p in doc.paragraphs[1:]:
        # Snapshot: runs are appended to the paragraph while we iterate.
        runs = list(p.runs)
        first_hit = next(
            (i for i, r in enumerate(runs) if keyword in r.text), None
        )
        if first_hit is None:
            # The original guard for this case was a no-op `pass`, so every
            # paragraph was rebuilt and lost its run formatting.
            continue
        for r in runs[first_hit:]:
            pieces = r.text.split(keyword)
            r.text = ''
            for piece in pieces[:-1]:
                if piece:
                    p.add_run(text=piece).bold = False
                p.add_run(keyword).bold = True
            if pieces[-1]:
                p.add_run(pieces[-1]).bold = False
def add_pic_after_text(doc, expect_text, picture_name):
    """Insert a picture right after *expect_text* in the body paragraphs.

    The first paragraph (index 0) is skipped. In each other paragraph that
    contains *expect_text*, the runs from the first matching run onward are
    emptied and their text is re-appended at the end of the paragraph as new
    runs, so the text order is preserved. For every run that contains
    *expect_text*, the text is split at its first occurrence into: the part
    before it (``bold = False``), a run holding *expect_text* that also
    receives the picture, and the whole remainder (``bold = False``).

    Previously, runs without *expect_text* were duplicated and given a
    spurious *expect_text* plus picture, the leading part was added one
    character per run, and text between repeated occurrences was dropped.

    Args:
        doc: Document object exposing ``paragraphs`` with ``runs`` and
            ``add_run``.
        expect_text: Text after which the picture is inserted. An empty
            value is a no-op.
        picture_name: Picture passed to the run's ``add_picture``.

    Returns:
        None.
    """
    if not expect_text:
        return
    for p in doc.paragraphs[1:]:
        # Snapshot: runs are appended to the paragraph while we iterate.
        runs = list(p.runs)
        first_hit = next(
            (i for i, r in enumerate(runs) if expect_text in r.text), None
        )
        if first_hit is None:
            continue
        for r in runs[first_hit:]:
            original = r.text
            r.text = ''
            before, found, after = original.partition(expect_text)
            if not found:
                # No match in this run: just move its text to the end so it
                # stays after the runs rebuilt before it.
                if original:
                    p.add_run(original).bold = False
                continue
            if before:
                p.add_run(text=before).bold = False
            p.add_run(expect_text).add_picture(picture_name)
            if after:
                p.add_run(after).bold = False
def find_char_pos(doc):
    """Return the second paragraph's text with its second "。" replaced.

    Looks up every ideographic full stop ("。") in ``doc.paragraphs[1].text``
    and replaces the second one with a CR LF pair followed by "[". The
    document itself is not modified.

    The original computed this value and discarded it (returning ``None``),
    replaced the first "。" even though it indexed the second, and raised
    ``IndexError`` when fewer than two were present.
    NOTE(review): the intended use of the result is not visible in this file.

    Args:
        doc: Document object exposing ``paragraphs[1].text``.

    Returns:
        str: The rewritten text, or the unchanged text when it contains fewer
        than two "。" characters.
    """
    text = doc.paragraphs[1].text
    stops = [i for i, ch in enumerate(text) if ch == "。"]
    if len(stops) < 2:
        return text
    cut = stops[1]
    # Splice at the exact index; str.replace(..., 1) would always hit the
    # first full stop instead of the second.
    return text[:cut] + "\r\n[" + text[cut + 1:]
def file_name(file_dir, file_prefix):
    """List the files under *file_dir* whose name starts with *file_prefix*.

    The directory tree is walked recursively and the prefix is checked
    against each file name with its extension removed.

    Args:
        file_dir: Root directory to scan.
        file_prefix: Required prefix of the extension-less file name.

    Returns:
        list: Full paths of the matching files, in ``os.walk`` order.
    """
    matches = []
    for root, _dirs, names in os.walk(file_dir):
        matches.extend(
            os.path.join(root, name)
            for name in names
            if os.path.splitext(name)[0].startswith(file_prefix)
        )
    return matches
def main():
    """Script entry point.

    Passes the hard-coded article folder to ``doc_file_name``; the returned
    list is not used. The commented-out statements are the original author's
    manual experiments with the other helpers and are kept for reference.
    """
    # doc_to_docx("D:\\workspace\\python-test\\N标题81.doc")
    source_dir = "D:\\文件处理\\2020.6.29-200篇足球文章"
    doc_file_name(source_dir)
    # doc = Document("D:\\workspace\\python-test\\N标题8.docx")
    # print(len(doc.paragraphs))
    #
    # for parg in doc.paragraphs:
    # # modify the text only
    # # runt = []
    # # for run in parg.runs:
    # # if run.text:
    # # runt.append(run.text)
    # # run.text = ''
    # # parg.add_run('test***'+''.join(runt))
    # # keep the text only
    # if parg.text:
    # parg.text = parg.text
    # doc.save("D:\\workspace\\python-test\\new-N标题8.docx")
    # replace_date(doc)
    # doc.save("D:\\workspace\\python-test\\docx\\2016年11月5日 第11轮格拉纳达VS拉科鲁尼亚全战巴塞罗那.doc")
    # replace_n_n(doc)
    # replace_n_picture(doc, "D:\\workspace\\python-test\\足球图片\\002807uf4yb2dccedfojmi.jpg", "D:\\workspace\\python-test\\足球图片\\002807uf4yb2dccedfojmi.jpg")
    # para = ['test1', 'test2', 'test3']
    # picture = "D:\\workspace\\python-test\\足球图片\\002807uf4yb2dccedfojmi.jpg"
    # add_para_picture(doc, pick_random_one(para), picture)
    # doc.save("D:\\workspace\\python-test\\N标题8-addpara.doc")
    # keyWord = keyword_find(doc)
    # replace_(doc)
    # if keyWord == "":
    # pass
    # elif "2020" in keyWord:
    # key_word_bold(doc, "2020")
    # key_word_bold(doc, keyWord.replace("2020", ""))
    # else:
    # key_word_bold(doc, keyWord)
    #find_char_pos(doc)
    #add_pic_after_text(doc, "。", "003019z4a1swz4ya2agava_wps图片.jpg")
    # add_null_line(doc)
    # print(len(doc.paragraphs))
    #add_picture(doc, "003019z4a1swz4ya2agava_wps图片.jpg", "002730v6pxgdd2462fe78z_wps图片.jpg")
    # replace_date(doc)
    #doc.save("replace_date.doc")
    # doc.save("D标题44_temp.docx".replace(".docx", ".doc"))
# Run the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Python
1
https://gitee.com/w4dll/python-word-process.git
git@gitee.com:w4dll/python-word-process.git
w4dll
python-word-process
python-word-process
master

搜索帮助