1 Star 0 Fork 2

小蜗牛012 / funny_repo

forked from mxdon / funny_repo 
加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
convert_blog_2_md.py 9.26 KB
一键复制 编辑 原始数据 按行查看 历史
# get title from title
# get content from article
# finish requests while article
# Difficulty: filtering the correct content out of the many <p>...</p> tags
# and formatting the different kinds of content.
import re
import requests
import time
import random
import urllib3
from bs4 import BeautifulSoup
#def getimg(link):
# Markdown prefix written for each HTML heading tag. <h1> is deliberately
# demoted to "## " because "# " is reserved for the page <title> line.
_HEADING_MARKS = (
    ('h1', '## '),
    ('h2', '## '),
    ('h3', '### '),
    ('h4', '#### '),
    ('h5', '##### '),
    ('h6', '###### '),
)


def _image_to_markdown(match):
    """Return the Markdown image for one self-closed ``<img .../>`` tag."""
    tag = match.group(0)
    src = re.search(r'(?<=src=")[^"]+?(?=")', tag)
    if src is None:
        # Placeholder text kept byte-for-byte from the original script.
        return '![load img faild]()'
    alt = re.search(r'(?<=alt=")[^"]+?(?=")', tag)
    name = alt.group() if alt else ''
    # Alt texts of one character or starting with "im" are blanked, as in the
    # original (presumably auto-generated names such as "img" -- confirm).
    if len(name) <= 1 or name.startswith('im'):
        name = ''
    return '\n![' + name + '](' + src.group() + ')\n'


def _link_to_markdown(match):
    """Return ``[text](href)`` for one ``<a ...>text</a>`` element."""
    href = re.search(r'href="([^"]*)"', match.group(1))
    if href is None:
        # No double-quoted href: leave the tag alone; it is stripped later
        # together with every other remaining tag, keeping only the text.
        return match.group(0)
    return '[' + match.group(2) + '](' + href.group(1) + ')'


def run(filename, textString):
    """Convert HTML text to Markdown line by line and append it to *filename*.

    The input is split on newlines; every non-empty line is run through a
    fixed sequence of regex substitutions (code, images, bold/italic/strike,
    list items, links, paragraphs, block quotes, headings, title) and any tag
    still left is removed. Each converted line is echoed with ``print`` and
    appended to the file followed by a blank line.

    Args:
        filename: Path of the Markdown file to append to (opened in ``'a'``
            mode, UTF-8 encoded).
        textString: HTML source to convert.

    Returns:
        None.
    """
    toc_flag = 0    # > 0 while inside a <div ...toc...> block
    quote_flag = 0  # > 0 while inside a <blockquote>

    # The context manager closes the file even if a substitution raises.
    with open(filename, 'a', encoding='utf-8') as src:
        for line in textString.split('\n'):
            if not line:
                continue
            templine = line

            # NOTE: the original figcaption filter used a lookbehind that
            # could never match, so figcaption lines were never skipped. That
            # behaviour is kept: caption text survives and its tags are
            # removed with all other tags at the end of the loop.

            #### skip the site-generated table of contents
            if re.search(r'(?<=<div)[^<]+?(?=toc)[^<]+?>', templine):
                toc_flag += 1
            elif toc_flag > 0 and re.search('</div>', line):
                # Only close a toc div that was actually opened; an unrelated
                # </div> must not drive the counter negative, which used to
                # suppress all following content.
                toc_flag -= 1
            if toc_flag:
                continue

            #### inline code
            templine = re.sub(r'<code\s[^<]*inline[^<]*>', '<code>', templine)
            if re.search(r'(?<=<code>).*(?=</code>)', templine):
                templine = re.sub(r'</?code>', '`', templine)

            #### fenced code blocks
            if re.search(r'<code\s[^<]+?>', templine):
                codetype = re.search(r'(?<=language-)([^"]+?)(?=")', templine)
                fence = '```' + (codetype.group() if codetype else '') + '\n'
                # A callable replacement keeps backslashes in the language
                # name from being read as regex escapes.
                templine = re.sub(r'<code\s[^<]+?>', lambda _m: fence,
                                  templine).strip()
            templine = re.sub(r'</?code>', '\n```', templine).strip()

            #### images (each tag converted on its own)
            templine = re.sub(r'<img[^<]+?/>', _image_to_markdown,
                              templine).strip()

            #### bold (<em> is rendered bold, as in the original)
            templine = re.sub(r'</?strong[^<]*>', '**', templine).strip()
            # Must not match <blockquote>, hence the explicit "b>" / "b\s".
            templine = re.sub(r'</?(b>|b\s[^<]*>)', '**', templine).strip()
            templine = re.sub(r'</?em(?:\s[^<]*)?>', '**', templine).strip()
            #### italics -- tag name must end right after "i" so that <img>,
            #### <iframe>, <input> ... are not turned into "*"
            templine = re.sub(r'</?i(?:\s[^<]*)?>', '*', templine).strip()
            #### strike-through
            templine = re.sub(r'</?del(?:\s[^<]*)?>', '~~', templine).strip()
            templine = re.sub(r'</?s>', '~~', templine).strip()

            #### list items (ordered-list numbering was disabled in the
            #### original, so every item is rendered as a bullet)
            templine = re.sub(r'<li>', '- ', templine).strip()
            templine = re.sub(r'</li>', '\n', templine).strip()

            #### links (each anchor converted on its own)
            templine = re.sub(r'<a(\s[^<>]*)>([^<\n]*)</a>',
                              _link_to_markdown, templine).strip()

            #### paragraphs
            templine = re.sub(r'<p[^<]*>', '', templine).strip()
            # The trailing newline matters for lines in the middle of a block
            # quote, which are written below without further stripping.
            templine = re.sub(r'</p[^<]*>', '\n', templine).strip() + '\n'

            #### block quotes
            opens_quote = re.search(r'<blockquote[^<]*>', templine)
            closes_quote = re.search(r'</blockquote[^<]*>', templine)
            if opens_quote:
                quote_flag += 1
                print("> ", end="")
                templine = re.sub(r'<blockquote[^<]*>', '', templine).strip()
                src.write('> ')
            # Checked independently of the opening tag so that a quote opened
            # and closed on one line does not leave the flag stuck.
            if closes_quote and (opens_quote or quote_flag > 0):
                quote_flag -= 1
                templine = re.sub(r'</blockquote[^<]*>', '', templine).strip()
            elif closes_quote:
                templine = re.sub(r'</blockquote[^<]*>', '', templine).strip()
            if quote_flag:
                print(templine, end=" ")
                src.write(templine)
                continue

            #### headings
            for tag, mark in _HEADING_MARKS:
                templine = re.sub('<' + tag + r'[^<]*>', mark,
                                  templine).strip()
            # Non-greedy on purpose: only the closing tag itself is removed,
            # not the text that follows it on the same line.
            templine = re.sub(r'</h[0-9][^<]*>', '', templine).strip()

            #### page title -> top-level heading followed by a [toc] marker
            if re.search(r'</?title[^<]*>', templine):
                templine = re.sub(r'</?title[^<]*>', '', templine).strip()
                templine = re.sub(r'<[^<]+?>', '', templine).strip()
                print('# ' + templine + '\n[toc]\n')
                src.write('# ' + templine + '\n[toc]\n')
                continue

            #### drop every remaining tag
            templine = re.sub(r'<[^<]+?>', '', templine).strip()
            if not templine:
                # Lines that consisted only of markup produce no output.
                continue
            lineval = templine + '\n'
            print(lineval)
            src.write(lineval + '\n')
if __name__ == '__main__':
    # Script entry point: ask for a blog URL, download the page, pick the
    # article container for the supported sites (zhihu, csdn, cnblogs) and
    # convert title + article to Markdown in blog.md.

    # NOTE(review): the request below is sent with verify=False, i.e. TLS
    # certificates are not checked; the matching urllib3 warning is silenced
    # here so it does not clutter the console.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    heads = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}

    # Truncate (or create) the output file; run() appends to it afterwards.
    with open("blog.md", "w") as des:
        des.write('')

    url = input("Please input a legal URL: ")
    while not url.startswith('http'):
        url = input("Please input a legal URL again : \n")

    # Only the network call is guarded, so programming errors are no longer
    # reported as a request failure.
    try:
        res = requests.get(url, timeout=30, headers=heads, verify=False)
    except requests.RequestException:
        print("request error")
        raise SystemExit(1)
    res.encoding = 'UTF-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    # Each supported site keeps the article body in a different container.
    if 'zhihu.com' in url:
        temp_article = soup.find('div', class_='Post-RichTextContainer')
    elif 'csdn.net' in url:
        temp_article = soup.article
    elif 'cnblogs.com' in url:
        temp_article = soup.find('div', class_='markdown-here-wrapper')
    else:
        # Previously this case surfaced as a NameError hidden behind the
        # generic "request error" message.
        print("unsupported site: only zhihu.com, csdn.net and cnblogs.com are handled")
        raise SystemExit(1)
    if temp_article is None:
        # Without this check the literal text "None" was converted and
        # written to blog.md.
        print("article content not found")
        raise SystemExit(1)
    article = str(temp_article)

    # soup.title is None (it does not raise) when the page has no <title>,
    # so the <h1> fallback has to be an explicit check.
    head = soup.title
    if head is None:
        print("===== title not found =====")
        head = soup.h1
    target_head = str(head) if head is not None else ''

    # The newline keeps the title on its own line; run() works line by line
    # and would otherwise merge the first article line into the title.
    target_text = target_head + '\n' + article
    run('blog.md', target_text)
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/beatas/funny_repo.git
git@gitee.com:beatas/funny_repo.git
beatas
funny_repo
funny_repo
master

搜索帮助

344bd9b3 5694891 D2dac590 5694891