1 Star 1 Fork 0

ghwngzw / sina_sars_spider

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
url_spider.py 2.49 KB
一键复制 编辑 原始数据 按行查看 历史
ghwngzw 提交于 2023-01-31 21:28 . delte history git log,first commit
import datetime as dt
import json
import os
import re

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
# Keywords that mark a headline as SARS/epidemic-related.
key_words = ["疫情", "非典", "病例", "sars", "SARS", "肺炎"]
# Date range of the crawl (end date exclusive).
# Short range for testing:
# start_date = dt.date(2003, 5, 11)
# end_date = dt.date(2003, 5, 13)
# Full range:
start_date = dt.date(2002, 11, 1)
end_date = dt.date(2003, 9, 1)
delta = dt.timedelta(days=1)
now = start_date
dates = []
spider_urls = []
# One morning-edition index page exists per day:
#   https://news.sina.com.cn/head/news<YYYYMMDD>am.shtml
# '<' (not '!=') so an inverted range cannot loop forever.
while now < end_date:
    stamp = now.strftime("%Y%m%d")  # hoist: format the date once per day
    dates.append(stamp)
    spider_urls.append(f'https://news.sina.com.cn/head/news{stamp}am.shtml')
    now += delta
# Map each page url back to its date string (used to key the output JSON).
dates_and_urls = dict(zip(spider_urls, dates))
# 爬取单个网页
def crawl(url):
    """Fetch one Sina daily news index page and extract epidemic-related links.

    Returns a dict mapping headline text -> article url for every <a> tag
    whose text contains at least one keyword from ``key_words``.  Duplicate
    headlines keep the last url seen (same semantics as the original
    dict(zip(...)) dedup).
    """
    # timeout so a dead server cannot hang the whole crawl
    response = requests.get(url, timeout=30)
    # Sina's 2002-03 archive is GB-encoded; GB18030 is a superset of GBK/GB2312.
    home_page = response.content.decode("GB18030")
    soup = BeautifulSoup(home_page, 'lxml')
    # Compile once instead of re-running re.findall several times per tag.
    r_title = re.compile(r'>(.*?)</a>')
    r_url = re.compile(r'<a href="(.*?)" ')
    r_font = re.compile(r'>(.*?)</font>')
    result = {}
    for a_tag in soup.find_all('a'):
        tag_html = str(a_tag)
        title_match = r_title.search(tag_html)
        url_match = r_url.search(tag_html)
        if title_match is None or url_match is None:
            continue
        title = title_match.group(1)
        if not any(key_word in title for key_word in key_words):
            continue
        # Some headlines are wrapped in a <font> tag — unwrap the inner text.
        # (startswith also guards the IndexError the old title[0]/title[1]
        # indexing hit on empty titles.)
        if title.startswith("<f"):
            font_match = r_font.search(title)
            if font_match is not None:
                title = font_match.group(1)
        # Drop image-only anchors (<img ...>): they carry no headline text.
        if title.startswith("<i"):
            continue
        result[title] = url_match.group(1)
    return result
# Crawl every daily index page; dic maps date string -> {title: article url}.
dic = {}
for spider_url in tqdm(spider_urls, "爬取进度"):
    # Recover the date this url belongs to.
    today = dates_and_urls[spider_url]
    titles_and_news_urls = crawl(spider_url)
    dic[today] = titles_and_news_urls
# Create the output directory up front so open() cannot fail with
# FileNotFoundError on a fresh checkout.
os.makedirs('data', exist_ok=True)
with open('data/sars_news_url.json', 'w', encoding='utf8') as file:
    # ensure_ascii=False keeps Chinese text readable instead of \uXXXX escapes;
    # indent=2 pretty-prints the JSON.
    json.dump(dic, file, ensure_ascii=False, indent=2)
Python
1
https://gitee.com/ghwngzw/sina_sars_spider.git
git@gitee.com:ghwngzw/sina_sars_spider.git
ghwngzw
sina_sars_spider
sina_sars_spider
master

搜索帮助