
梁新斌 / Scrpay

This repository does not declare an open-source license file (LICENSE); before using it, check the project description and the code's upstream dependencies.
weatherData.py 5.14 KB
梁新斌 committed on 2019-01-20 22:03: added support for using proxy IPs.
import requests
from lxml import etree
import tool  # local helper module: DB connection, cursor, and dynamic inserts
import time
from bs4 import BeautifulSoup
# Build the list of month-page URLs for one city: pass a city name plus a
# start year (inclusive) and end year (exclusive) to get back the URL list.
def get_url(cityname, start_year, end_year):
    years = list(range(start_year, end_year))
    months = list(range(1, 13))
    # Month pages look like 'http://www.tianqihoubao.com/lishi/beijing/month/201601.html'
    suburl = 'http://www.tianqihoubao.com/lishi/'
    urllist = []
    for year in years:
        for month in months:
            # Zero-pad single-digit months so the filename is always YYYYMM.html.
            if month < 10:
                url = suburl + cityname + '/month/' + str(year) + str(0) + str(month) + '.html'
            else:
                url = suburl + cityname + '/month/' + str(year) + str(month) + '.html'
            urllist.append(url.strip())
    return urllist
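# For example (arguments are hypothetical), get_url('beijing', 2016, 2017)
# returns the twelve month pages for 2016:
#   http://www.tianqihoubao.com/lishi/beijing/month/201601.html
#   ...
#   http://www.tianqihoubao.com/lishi/beijing/month/201612.html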
# Fetch all city names from the `city` table; returns a list of strings.
def get_cityname(db_conn, db_cur):
    sql = 'select cityname from city order by cityid'
    db_cur.execute(sql)
    cityname = db_cur.fetchall()
    citylist = []
    for citys in cityname:
        citylist.append(citys[0])
    return citylist
# Parse the city name out of a URL and look up the matching city id.
def get_cityid(db_conn, db_cur, url):
    suburl = url.split('/')
    sql = 'select cityid from city where cityname = %s'
    # Pass the parameter as a one-element tuple, the form the DB-API expects.
    db_cur.execute(sql, (suburl[4],))
    cityid = db_cur.fetchone()
    return cityid[0]
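# For example, 'http://www.tianqihoubao.com/lishi/beijing/month/201601.html'
# splits on '/' into ['http:', '', 'www.tianqihoubao.com', 'lishi', 'beijing',
# 'month', '201601.html'], so suburl[4] is the city name 'beijing'.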
# Parse the page with lxml XPath; returns the month's weather as a 2-D list.
def parse_html(db_conn, db_cur, url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'Connection': 'close'
    }
    # Look up the city id for this URL.
    cityid = get_cityid(db_conn, db_cur, url)
    # Fetch the HTML source of the weather page.
    weather_data = requests.get(url=url, headers=headers).text
    weather_html = etree.HTML(weather_data)
    table = weather_html.xpath('//table//tr//td//text()')
    # Collect the text of every table cell, skipping blanks.
    listall = []
    for t in table:
        if t.strip() == '':
            continue
        # Remove embedded spaces and \r\n from each cell.
        t1 = t.replace(' ', '')
        t2 = t1.replace('\r\n', '')
        listall.append(t2.strip())
    # Regroup the flat cell list into one four-field row per day, the shape
    # the database insert expects.
    n = 4
    sublist = [listall[i:i + n] for i in range(0, len(listall), n)]
    # Drop the table header row.
    del sublist[0]
    flist = []
    # Split the combined high/low temperature field into two columns for
    # later analysis, and prepend the city id.
    for sub in sublist:
        sub2 = sub[2].split('/')
        sub.remove(sub[2])
        sub.insert(2, sub2[0])
        sub.insert(3, sub2[1])
        sub.insert(0, cityid)  # prepend the city id
        flist.append(sub)
        # tool.dyn_insert_sql('weather', tuple(sub), db_conn, db_cur)
    return flist
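# Assuming the site's four-column layout (date, weather, temperature, wind),
# each raw row ['date', 'weather', 'high/low', 'wind'] comes back as the
# six-element row [cityid, date, weather, high, low, wind].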
# Fetch one proxy address from a locally running proxy pool.
def get_proxy():
    proxy_pool_url = 'http://localhost:5000/random'
    try:
        response = requests.get(proxy_pool_url)
        if response.status_code == 200:
            return response.text
    except requests.exceptions.ConnectionError:
        # requests raises its own ConnectionError, which the bare built-in
        # ConnectionError would not catch.
        return None
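# The proxy pool is assumed to be a separate local service whose /random
# endpoint returns a bare 'host:port' string, e.g. '127.0.0.1:8080'.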
# Parse the page with BeautifulSoup (through a proxy); returns a 2-D list.
def parse_html_bs(db_conn, db_cur, url):
    proxy = get_proxy()
    # Guard against get_proxy() returning None (pool unreachable); fall back
    # to a direct connection in that case.
    if proxy:
        proxies = {
            'http': 'http://' + proxy,
            'https': 'https://' + proxy,
        }
    else:
        proxies = None
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        'Connection': 'close'
    }
    # Fetch the HTML source of the weather page.
    weather_data = requests.get(url=url, headers=headers, proxies=proxies).text
    weather_data_new = weather_data.replace('\n', '').replace('\r', '').replace(' ', '')
    soup = BeautifulSoup(weather_data_new, 'lxml')
    table = soup.find_all(['td'])
    # Look up the city id for this URL.
    cityid = get_cityid(db_conn, db_cur, url)
    listall = []
    for t in list(table):
        ts = t.string
        listall.append(ts)
    # Regroup into one four-field row per day and drop the header row.
    n = 4
    sublist = [listall[i:i + n] for i in range(0, len(listall), n)]
    del sublist[0]
    flist = []
    # Split the combined high/low temperature field and prepend the city id,
    # exactly as in parse_html above.
    for sub in sublist:
        sub2 = sub[2].split('/')
        sub.remove(sub[2])
        sub.insert(2, sub2[0])
        sub.insert(3, sub2[1])
        sub.insert(0, cityid)  # prepend the city id
        flist.append(sub)
    return flist
# Insert the parsed rows into the given table, then close the connection.
def insert_mysql(db_conn, db_cur, flist, tabname):
    for f in flist:
        tool.dyn_insert_sql(tabname, tuple(f), db_conn, db_cur)
    db_conn.close()
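# A minimal sketch of the `weather` table these inserts assume (column names
# are hypothetical; only the order matters, matching the six-element rows
# built above):
#   CREATE TABLE weather (
#       cityid INT,
#       wdate VARCHAR(20),
#       weather VARCHAR(50),
#       high VARCHAR(10),
#       low VARCHAR(10),
#       wind VARCHAR(50)
#   );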
if __name__ == '__main__':
    db_conn = tool.get_connect()
    db_cur = tool.get_cursor(db_conn)
    citylist = get_cityname(db_conn, db_cur)
    for city in citylist:
        # Month pages for 2016-2018 (the end year is exclusive).
        urllist = get_url(city, 2016, 2019)
        for url in urllist:
            time.sleep(1)  # throttle requests to be gentle on the site
            flist = parse_html_bs(db_conn, db_cur, url)
            for li in flist:
                tool.dyn_insert_sql('weather', tuple(li), db_conn, db_cur)
            time.sleep(1)
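The script depends on a local `tool` module that is not shown on this page. Below is a minimal sketch of what it would need to provide, assuming a MySQL database accessed through pymysql; the connection parameters and the dynamic-insert implementation are guesses, not the author's code.

    # tool.py (sketch, assumed interface)
    import pymysql

    def get_connect():
        # Connection parameters are placeholders; adjust to the real database.
        return pymysql.connect(host='localhost', user='root', password='',
                               database='weather', charset='utf8mb4')

    def get_cursor(db_conn):
        return db_conn.cursor()

    def dyn_insert_sql(tabname, data, db_conn, db_cur):
        # Build an INSERT with one %s placeholder per value in the row.
        placeholders = ','.join(['%s'] * len(data))
        sql = 'insert into ' + tabname + ' values (' + placeholders + ')'
        db_cur.execute(sql, data)
        db_conn.commit()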