# Crawl Sina daily front pages (Nov 2002 - Sep 2003) for SARS-related headlines
# and save {date: {headline: article_url}} to data/sars_news_url.json.
import datetime as dt
import json
import os
import re

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
# Headlines must contain at least one of these key words to be kept.
key_words = ["疫情", "非典", "病例", "sars", "SARS", "肺炎"]

# Date range of the crawl: [start_date, end_date), one front page per day.
# For a quick smoke test use e.g. dt.date(2003, 5, 11) .. dt.date(2003, 5, 13).
start_date = dt.date(2002, 11, 1)
end_date = dt.date(2003, 9, 1)

total_days = (end_date - start_date).days
dates = [
    (start_date + dt.timedelta(days=offset)).strftime("%Y%m%d")
    for offset in range(total_days)
]
spider_urls = [f'https://news.sina.com.cn/head/news{stamp}am.shtml' for stamp in dates]

# Lookup table: front-page URL -> its YYYYMMDD date string.
dates_and_urls = dict(zip(spider_urls, dates))
# 爬取单个网页
def crawl(url):
    """Fetch one Sina daily front page and return {headline: article_url}.

    Only anchors whose inner text contains one of ``key_words`` are kept.

    Parameters
    ----------
    url : str
        Address of one daily front page, e.g.
        ``https://news.sina.com.cn/head/news20030511am.shtml``.

    Returns
    -------
    dict[str, str]
        Headline text mapped to article URL.  Duplicate headlines keep the
        last URL seen (same as the original ``dict(zip(...))`` behaviour).
    """
    # Compile once before the loop instead of calling re.findall on the same
    # uncompiled pattern up to five times per <a> tag.
    title_re = re.compile(r'>(.*?)</a>')
    url_re = re.compile(r'<a href="(.*?)" ')
    font_re = re.compile(r'>(.*?)</font>')

    # timeout guards against a single dead server hanging the whole crawl.
    response = requests.get(url, timeout=30)
    # The pages are GB-encoded; GB18030 is a superset of GB2312/GBK.
    home_page = response.content.decode("GB18030")
    soup = BeautifulSoup(home_page, 'lxml')

    result = {}
    for a_tag in soup.find_all('a'):
        tag_html = str(a_tag)  # stringify once, not per regex call
        title_match = title_re.search(tag_html)
        url_match = url_re.search(tag_html)
        if title_match is None or url_match is None:
            continue
        title = title_match.group(1)
        if not any(key_word in title for key_word in key_words):
            continue
        # A headline wrapped in <font ...>...</font>: keep only the inner text.
        # Guarded so a non-matching fragment no longer raises IndexError.
        if title.startswith('<f'):
            font_match = font_re.search(title)
            if font_match:
                title = font_match.group(1)
        # Image-only anchors (<img ...>) carry no headline text — drop them
        # instead of removing them from parallel lists after the fact.
        if title.startswith('<i'):
            continue
        result[title] = url_match.group(1)
    return result
# 爬取所有日期的新闻,用dic储存
# Crawl every date's front page; accumulate {date: {headline: url}}.
dic = {}
for spider_url in tqdm(spider_urls, "爬取进度"):
    today = dates_and_urls[spider_url]
    try:
        dic[today] = crawl(spider_url)
    except requests.RequestException as err:
        # One dead page must not abort a ten-month crawl: record an empty
        # day and keep going.
        print(f"skip {today}: {err}")
        dic[today] = {}

# Create the output directory if missing so open() below cannot fail on it.
os.makedirs('data', exist_ok=True)
with open('data/sars_news_url.json', 'w', encoding='utf8') as file:
    # ensure_ascii=False keeps Chinese readable instead of \uXXXX escapes;
    # indent=2 pretty-prints the JSON.
    json.dump(dic, file, ensure_ascii=False, indent=2)