1 Star 0 Fork 9

Roby&奋斗码农 / BeiJingSubwayFlows

forked from 五十风 / BeiJingSubwayFlows
暂停
 
加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
get_sub_flows.py 5.61 KB
一键复制 编辑 原始数据 按行查看 历史
五十风 提交于 2018-03-05 18:12 . 增加从新浪微博爬数据的方法
import json
import re
from urllib import request

import requests
from bs4 import BeautifulSoup as bs

from datetime_helper import get_yesterday_date,datetime_to_str,str_to_date
# Text file the script appends to: one "<year>.<month>.<day>,<flow>" line per record.
save_flows_file = 'sub_flows.txt'
# Output file that receives the generated "BASEX=...;\nBASEY =...;" chart data.
sub_data_js_file = 'sub_data.js'
def get_page_url(page):
    """Build the URL of one index page under bjsubway.com's ``klxx`` section.

    Page ``1`` maps to ``index.html``; any other value ``n`` maps to
    ``index_<n>.html``.
    """
    prefix = 'https://www.bjsubway.com/support/cxyd/klxx/index'
    suffix = ".html" if page == 1 else "_" + str(page) + ".html"
    return prefix + suffix
def get_html(url):
    """Fetch *url* with ``urllib`` and return the raw response body as bytes.

    The response object is used as a context manager so the underlying
    connection is closed as soon as the body has been read, instead of being
    left open until garbage collection.

    Raises:
        urllib.error.URLError: propagated unchanged from ``urlopen``.
    """
    with request.urlopen(url) as resp:
        return resp.read()
def get_flow_from_html(html, year=2018):
    """Extract the first ridership record from a listing page.

    Reads the text of the first ``<li>`` inside the first
    ``<div class="work_list">`` and slices it around the markers
    ``月``, ``日``, ``运量为`` and ``万人次``.

    Args:
        html: Page markup to parse with BeautifulSoup.
        year: Year to stamp on the record; the parsing itself only reads a
            month and a day from the text. Defaults to 2018, the value that
            was previously hard-coded.

    Returns:
        ``"<year>.<month>.<day>,<flow>"`` on success, or ``None`` (after
        printing an ``error: ...`` line) when the expected markup or any of
        the text markers is missing.
    """
    soup = bs(html, 'html.parser')
    work_list = soup.find_all('div', class_='work_list')
    items = work_list[0].find_all('li') if work_list else []
    if not items:
        print("error: %s" % ("no work_list entries found in page"))
        return None

    s = items[0].get_text()
    m_pos = s.find('月')
    d_pos = s.find('日')
    flow_marker = s.find('运量为')
    f_end = s.find('万人次')
    # str.find reports a missing marker as -1 rather than raising, so the
    # positions have to be checked explicitly; otherwise the slices below
    # would quietly produce a garbage record.
    if m_pos < 0 or d_pos < m_pos or flow_marker < 0 or f_end < flow_marker:
        print("error: %s" % (s))
        return None

    m = s[0:m_pos]
    d = s[m_pos + 1:d_pos]
    # Skip the three characters of the '运量为' marker itself.
    flow = s[flow_marker + 3:f_end]
    return "%s.%s.%s,%s" % (year, m, d, flow)
def get_html_from_sina(timeout=30):
    """Download the Weibo page that carries the ridership posts.

    Sends a GET request with a fixed set of browser-like headers and cookies.

    Args:
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without a timeout a stalled connection would block the script
            indefinitely.

    Returns:
        The raw response body (``r.content``) as bytes. The HTTP status code
        is not checked.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout expires.
    """
    url = 'https://weibo.com/p/1008088aa1b647b52515da3acfee8879bd747e'
    # NOTE(review): 'br' is advertised in Accept-Encoding; presumably the
    # server answers with gzip/deflate, otherwise a Brotli decoder must be
    # installed for requests to decode the body - confirm.
    headers = {
        'Host':'weibo.com',
        'Connection':'keep-alive',
        'Cache-Control':'max-age=0',
        'Upgrade-Insecure-Requests':'1',
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36 QQBrowser/4.3.4986.400',
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Referer':'http://s.weibo.com/weibo/%25E5%25AE%25A2%25E6%25B5%2581%25E8%25A7%2582%25E5%25AF%259F&Refer=index',
        'Accept-Encoding':'gzip, deflate, br',
        'Accept-Language':'zh-CN,zh;q=0.8',
    }
    # NOTE(review): these look like cookies captured from a browser session;
    # presumably they expire and need refreshing - verify.
    cookies = {
        'SUB':'_2AkMtL0HNf8NxqwJRmPAcymPqZIV0zgnEieKbc7AWJRMxHRl-yT9kql06tRB6Bq9vIptsi0itG6AEcsGYeVX0F0WoBgob',
        'SUBP':'0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWldK7b-QoN.la7Gp5smllK',
        'SINAGLOBAL':'3163792802659.2153.1517539069473',
        'YF-V5-G0':'cd5d86283b86b0d506628aedd6f8896e',
        '_s_tentry':'www.baidu.com',
        'UOR':',,www.baidu.com',
        'Apache':'4557240479899.627.1520217434514',
        'ULV':'1520217434521:6:2:1:4557240479899.627.1520217434514:1519971759990',
        'YF-Page-G0':'f70469e0b5607cacf38b47457e34254f'
    }
    r = requests.get(url, headers=headers, cookies=cookies, timeout=timeout)
    return r.content
def _find_weibo_posts(html):
    """Return the post ``<div>`` elements embedded in the page's FM.view script, or ``[]``."""
    soup = bs(html, 'html.parser')
    # NOTE(review): only the last <script> tag is inspected, exactly as the
    # original ``scripts[-1:]`` did - confirm this was not meant to walk all
    # scripts in reverse (``scripts[::-1]``).
    for script in soup.find_all('script')[-1:]:
        text = script.text
        # The tag that carries the data starts with "FM.view".
        if not text.startswith('FM.view'):
            continue
        # Keep what sits between the call's parentheses; it looks like
        # '{"domid":"xx","css":["style/xx"],"html":"<div>..."}'.
        raw = text[text.find('(') + 1:text.rfind(')')]
        # SECURITY: this text comes straight from the network. It used to be
        # passed to eval(); it is decoded as JSON instead so page content can
        # never execute code. json.loads also unescapes "\/" itself.
        try:
            payload = json.loads(raw)
        except ValueError:
            # Not decodable, so this is not the tag we are looking for.
            continue
        if not isinstance(payload, dict) or 'html' not in payload:
            continue
        inner = bs(payload['html'], 'html.parser')
        feeds = inner.find_all('div', class_='WB_feed WB_feed_v3 WB_feed_v4')
        if feeds:
            posts = feeds[0].find_all('div', class_='WB_feed_detail clearfix')
            if posts:
                return posts
    return []


def _parse_flow_post(content):
    """Return ``(month, day, flow)`` strings sliced from a post's text, or ``None``."""
    m_end = content.find('月')
    d_end = content.find('日')
    flow_marker = content.find('运量为')
    f_end = content.find('万人次')
    # str.find returns -1 for a missing marker instead of raising.
    if m_end < 0 or d_end < m_end or flow_marker < 0 or f_end < flow_marker:
        return None
    # The month is the run of digits within the two characters before '月'.
    digits = re.findall(r"\d+", content[max(0, m_end - 2):m_end])
    if not digits:
        return None
    day = content[m_end + 1:d_end]
    # Skip the three characters of the '运量为' marker itself.
    flow = content[flow_marker + 3:f_end]
    return digits[0], day, flow


def get_flow_from_sina_html(html):
    """Find yesterday's ridership record in a Weibo page.

    The data Weibo returns is rendered through ``<script>`` tags, so it has
    to be scraped from the script; the script payload carries plain HTML
    under its ``html`` key, which is parsed a second time.

    Args:
        html: Page markup, e.g. the bytes returned by ``get_html_from_sina``.

    Returns:
        ``"<year>.<month>.<day>,<flow>"`` for the first post by ``北京地铁``
        whose month and day equal those of ``get_yesterday_date()``, or
        ``None`` when no such post is found. Posts whose text cannot be
        parsed are reported with an ``error: ...`` line and skipped.
    """
    yesterday = get_yesterday_date()
    for post in _find_weibo_posts(html):
        info = post.find('div', class_='WB_info')
        author = info.find('a') if info is not None else None
        if author is None or author.get_text().strip() != "北京地铁":
            continue
        text_div = post.find('div', class_='WB_text W_f14')
        if text_div is None:
            continue
        content = text_div.get_text().strip()
        # Drop #hashtag# and 【title】 decorations before slicing.
        content = re.sub(r'#.*?#', '', content)
        content = re.sub(r'【.*?】', '', content)

        parsed = _parse_flow_post(content)
        if parsed is None:
            print("error: %s" % (content))
            continue
        month, day, flow = parsed
        # Compare numerically so "05" or " 5" still matches day 5.
        try:
            is_yesterday = (int(month), int(day)) == (yesterday.month, yesterday.day)
        except ValueError:
            is_yesterday = False
        if is_yesterday:
            # The post matched yesterday's month and day, so it belongs to
            # yesterday's year; a fixed 2018 made the caller's date check
            # fail in every later year.
            return "%s.%s.%s,%s" % (yesterday.year, yesterday.month, yesterday.day, flow)
    return None
def get_flows_from_file():
    """Load every saved record from ``save_flows_file``.

    Each non-blank line is split on commas; the first field is parsed with
    ``str_to_date(..., '%Y.%m.%d')`` and the second with ``float``.

    Returns:
        A list of ``[date, flow]`` pairs in file order.

    Raises:
        OSError: if the file cannot be opened.
        IndexError, ValueError: if a non-blank line lacks a comma or its
            flow field is not a number.
    """
    records = []
    with open(save_flows_file, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A stray blank line (e.g. from hand editing) has no comma
                # and would otherwise raise IndexError below.
                continue
            fields = line.split(',')
            records.append([str_to_date(fields[0], '%Y.%m.%d'), float(fields[1])])
    return records
def get_cahrt_xy(flows):
    """Render records as the ``BASEX``/``BASEY`` assignments for the chart file.

    Args:
        flows: Iterable of records where item 0 exposes ``year``, ``month``
            and ``day`` and item 1 is the value to plot.

    Returns:
        ``"BASEX=<labels>;\\nBASEY =<values>;"`` where both lists are
        formatted with Python's list repr and each label is
        ``"<year>-<month>-<day>"``.
    """
    # Single pass over the input, building (label, value) pairs.
    points = [
        ("%s-%s-%s" % (rec[0].year, rec[0].month, rec[0].day), rec[1])
        for rec in flows
    ]
    labels = [label for label, _ in points]
    values = [value for _, value in points]
    return "BASEX=%s;\nBASEY =%s;" % (labels, values)
# Alternative source: the bjsubway.com listing can be scraped instead of Weibo:
#   html = get_html(get_page_url(1))
#   d, flow = get_flow_from_html(html).split(',')
html = get_html_from_sina()
record = get_flow_from_sina_html(html)
if record is None:
    # The parser returns None when no record was found; calling .split on
    # it would crash before the chart file is refreshed.
    print("error: %s" % ("no ridership record for yesterday was found"))
else:
    d, flow = record.split(',')
    # Append only a record that really is yesterday's.
    if get_yesterday_date() == str_to_date(d, "%Y.%m.%d"):
        with open(save_flows_file, 'a') as f:
            f.write("%s,%s\n" % (d, flow))

# Rebuild the chart data from everything saved so far, ordered by date.
flows = get_flows_from_file()
flows = sorted(flows, key=lambda f: f[0])
with open(sub_data_js_file, 'w') as f:
    f.write(get_cahrt_xy(flows))
Python
1
https://gitee.com/beilutesoft/BeiJingSubwayFlows.git
git@gitee.com:beilutesoft/BeiJingSubwayFlows.git
beilutesoft
BeiJingSubwayFlows
BeiJingSubwayFlows
master

搜索帮助