zhenghua / Scrpay

forked from 梁新斌 / Scrpay
This repository has not declared an open source license file (LICENSE); before using it, check the project description and the upstream dependencies of its code.
zhaopinData.py 8.37 KB
梁新斌 committed on 2019-01-25 22:40 · Crawl job data from Lagou
import requests
from requests import Session
from bs4 import BeautifulSoup
import tool  # local helper module: DB connection, cursor, logging, dynamic INSERT (sketched below)
import time
class Spider():
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Cache-Control': 'max-age=0',
        # This Cookie is a captured logged-in Lagou session; it expires quickly
        # and has to be replaced with a fresh value before the spider will work.
        'Cookie': '_ga=GA1.2.1564052695.1545472099; user_trace_token=20181222174726-96d1cc6e-05ce-11e9-88d7-525400f775ce; LGUID=20181222174726-96d1cfb9-05ce-11e9-88d7-525400f775ce; fromsite="localhost:63342"; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22167ea87c48db9-0394bf9a87f259-444a022e-1049088-167ea87c48e99%22%2C%22%24device_id%22%3A%22167ea87c48db9-0394bf9a87f259-444a022e-1049088-167ea87c48e99%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D; _gid=GA1.2.873913553.1548247267; SEARCH_ID=8aef70ed1cee45bcac69ea2ef9242db1; JSESSIONID=ABAAABAAADEAAFIFE26EFB130DFC26C8276AF1415D7ECF5; _gat=1; LGSID=20190125213628-379b597a-20a6-11e9-b7e9-5254005c3644; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DOwb9BrLg4713KUplaExWdFvbgB3V0wwWJBai91E7Vyi%26wd%3D%26eqid%3Dbb02f6ad000034b9000000035c4b10d7; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1548338988,1548338993,1548338999,1548423499; index_location_city=%E5%8C%97%E4%BA%AC; sm_auth_id=yrtxdqqtb1sv85ad; LG_LOGIN_USER_ID=45303e1d698f488ce6f213a5c4d904ae3ef805cab8f9a96d; _putrc=3C0FA0629129C357; login=true; unick=%E6%A2%81%E6%96%B0%E6%96%8C; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=56; gate_login_token=d826a72e9dfff62f62cfb317e5d0deffa5b20235c4fe8cf7; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1548423701; LGRID=20190125213951-b0cbcc0d-20a6-11e9-a694-525400f775ce',
        'Host': 'www.lagou.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        # The original dict listed 'Connection' twice ('keep-alive', then 'close');
        # later keys silently overwrite earlier ones, so only 'close' took effect.
        'Connection': 'close'
    }
    session = Session()
    def get_proxy(self):
        '''Fetch one proxy from a local proxy-pool service assumed to run on port 5000.'''
        try:
            proxy_pool_url = 'http://localhost:5000/random'
            response = requests.get(proxy_pool_url)
            if response.status_code == 200:
                return response.text
            return None
        except requests.exceptions.ConnectionError:
            # The original caught the builtin ConnectionError, which requests'
            # connection failures do not inherit from, so it never matched.
            return None
    def get_url(self):
        '''
        Build the index-page URLs to crawl and queue them in the index_url table.
        :return: None (the URLs are written to the database)
        '''
        # Update the session headers globally so every request carries them.
        self.session.headers.update(self.headers)
        db_conn = tool.get_connect()
        db_cur = tool.get_cursor(db_conn)
        url_1 = 'https://www.lagou.com/zhaopin/shujuwajue/'
        url_2 = '/?filterOption=3'
        tool.loging('Start building the URLs to crawl', 'DEBUG')
        for i in range(11, 20):  # index pages 11-19 of the data-mining job listings
            url = url_1 + str(i) + url_2
            urllist = (url, '1', 'parse_index_lagou', '1', 30, '0')
            tool.dyn_insert_sql('index_url', urllist, db_conn, db_cur)
        tool.loging('Finished building the URLs to crawl', 'DEBUG')
        db_conn.close()
    def get_one_url(self, tabname, callback):
        '''Fetch one unprocessed URL (flag = '0') for the given callback from the queue table.'''
        db_conn = tool.get_connect()
        db_cur = tool.get_cursor(db_conn)
        sql = "select url,callback,need_proxy from " + tabname + " where flag = '0' and callback = %s "
        db_cur.execute(sql, (callback,))
        tup = db_cur.fetchone()
        db_conn.close()  # originally placed after the return, so it never ran
        return tup
    def update_one_url(self, tabname, url):
        '''Mark a URL as processed (flag = '1').'''
        db_conn = tool.get_connect()
        db_cur = tool.get_cursor(db_conn)
        sql = "update " + tabname + " set flag = '1' where url = %s"
        db_cur.execute(sql, (url,))
        db_conn.commit()
        db_conn.close()
    def parse_index_lagou(self, url, need_proxy):
        db_conn = tool.get_connect()
        db_cur = tool.get_cursor(db_conn)
        # Fetch the HTML source of the Lagou index page.
        if need_proxy == '1':
            proxy = self.get_proxy()  # the original called get_proxy() twice in a row here
            proxies = {
                'http': 'http://' + proxy,
                'https': 'https://' + proxy,
            }
        python_data = requests.get(url=url, headers=self.headers)  # , proxies=proxies
        if python_data.status_code == 200:
            soup = BeautifulSoup(python_data.text, 'lxml')
            con_item = soup.find_all(name='div', class_='s_position_list')
            tool.loging('Start inserting detail-page URLs', 'DEBUG')
            for item in con_item:
                urls = item.find_all(name='a', class_='position_link')
                for url in urls:
                    url = url['href']
                    urllist = (url, '2', 'parse_detail_lagou', '1', 30, '0')
                    tool.dyn_insert_sql('index_url', urllist, db_conn, db_cur)
            tool.loging('Finished inserting detail-page URLs', 'DEBUG')
        db_conn.close()
    def parse_detail_lagou(self, url, need_proxy):
        db_conn = tool.get_connect()
        db_cur = tool.get_cursor(db_conn)
        tool.loging('Start parsing job detail page', 'DEBUG')
        if need_proxy == '1':
            proxy = self.get_proxy()
            proxies = {
                'http': 'http://' + proxy,
                'https': 'https://' + proxy,
            }
        python_data = requests.get(url=url, headers=self.headers)  # , proxies=proxies
        if python_data.status_code == 200:
            soup = BeautifulSoup(python_data.text, 'lxml')
            pos_head = soup.find_all(name='div', class_='position-head')
            pos_list = []
            for pos in pos_head:
                for company in pos.find_all(name='div', class_='company'):
                    pos_list.append(company.text)
                for pos_name in pos.find_all(name='span', class_='name'):
                    pos_list.append(pos_name.text)
                for request in pos.find_all(name='dd', class_='job_request'):
                    # job_request text reads like "salary / city / experience / education / ...";
                    # the original stripped the literal string 'xa0' instead of the
                    # non-breaking space '\xa0', which is fixed here.
                    p = request.text.strip().replace('\n', '').replace('\xa0', '').split('/')
                    pos_list.append(p[0])
                    pos_list.append(p[2])
                    pos_list.append(p[3])
            pos_details = soup.find_all(name='div', class_='container clearfix')
            for details in pos_details:
                for zwxy in details.find_all(name='dd', class_='job-advantage'):
                    pos_list.append(zwxy.text.strip().replace('\n', ''))
                for zwms in details.find_all(name='dd', class_='job_bt'):
                    pos_list.append(zwms.text.strip().replace('\n', ''))
                for pos_base in details.find_all(name='div', class_='work_addr'):
                    pos_list.append(pos_base.text.replace('\n', '').replace(' ', ''))
            tool.dyn_insert_sql('job_detail', tuple(pos_list), db_conn, db_cur)
        tool.loging('Finished parsing job detail page', 'DEBUG')
        db_conn.close()  # missing in the original, which leaked one connection per page
    def run(self):
        # Phase 1: build the index-page URL queue.
        tool.loging('Start building URLs', 'INFO')
        self.get_url()
        tool.loging('Finished building URLs', 'INFO')
        # Phase 2: parse index pages, queueing detail-page URLs as they are found.
        tool.loging('Start parsing index pages', 'INFO')
        u_list = self.get_one_url('index_url', 'parse_index_lagou')
        while u_list:
            time.sleep(5)  # crude rate limiting between requests
            url = u_list[0]
            callback = u_list[1]
            need_proxy = u_list[2]
            self.parse_index_lagou(url, need_proxy)
            self.update_one_url('index_url', url)
            u_list = self.get_one_url('index_url', 'parse_index_lagou')
        tool.loging('Finished parsing index pages', 'INFO')
        # Phase 3: parse each queued job detail page.
        tool.loging('Start parsing detail pages', 'INFO')
        detail_list = self.get_one_url('index_url', 'parse_detail_lagou')
        while detail_list:
            time.sleep(5)
            url = detail_list[0]
            callback = detail_list[1]
            need_proxy = detail_list[2]
            self.parse_detail_lagou(url, need_proxy)
            self.update_one_url('index_url', url)
            detail_list = self.get_one_url('index_url', 'parse_detail_lagou')
        tool.loging('Finished parsing detail pages', 'INFO')

if __name__ == '__main__':
    sp = Spider()
    sp.run()
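
The script imports a local `tool` module that is not shown on this page. The sketch below is a hypothetical reconstruction of the interface it would need, inferred purely from the call sites above. It assumes a MySQL backend accessed through pymysql (the `%s` placeholders and explicit `commit()` calls fit that driver); every name, credential, and column layout in it is an assumption, not the upstream author's code.

# tool.py -- hypothetical sketch, NOT the real helper module from this repository.
# Everything here is inferred from how zhaopinData.py calls it.
import time
import pymysql

def get_connect():
    # Placeholder credentials; the real module's connection details are unknown.
    return pymysql.connect(host='localhost', user='root', password='***',
                           db='lagou', charset='utf8mb4')

def get_cursor(conn):
    return conn.cursor()

def loging(msg, level):
    # The spider calls tool.loging(message, 'DEBUG') / tool.loging(message, 'INFO').
    print('%s [%s] %s' % (time.strftime('%Y-%m-%d %H:%M:%S'), level, msg))

def dyn_insert_sql(tabname, values, conn, cur):
    # Builds an INSERT with one %s placeholder per tuple element, matching calls
    # like dyn_insert_sql('index_url', (url, '1', 'parse_index_lagou', '1', 30, '0'), ...).
    # That tuple suggests an index_url table of six columns, something like
    # (url, level, callback, need_proxy, timeout, flag).
    placeholders = ','.join(['%s'] * len(values))
    sql = 'insert into ' + tabname + ' values (' + placeholders + ')'
    cur.execute(sql, values)
    conn.commit()

Under this reading, index_url doubles as a simple crawl queue: rows start with flag = '0', get_one_url pulls one pending row per callback, and update_one_url flips the flag to '1' once the page has been processed. The proxy-pool endpoint http://localhost:5000/random is likewise assumed to be a separately running service that returns one host:port proxy per request; note that both parse methods build the proxies dict but leave it commented out of the actual requests.get call.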