I'm a beginner at web scraping; my original plan was to go into Python backend development.
This project crawls all the app information on 360手机助手 (360 Mobile Assistant): company name, developer, and other details.
I started with multithreading, but the results were poor and my machine got sluggish (an i5 with 8 GB of RAM).
I then switched to coroutines, which worked reasonably well, but at first I did not limit the concurrency, so a large share of the proxied requests failed.
Once I capped the number of concurrent coroutines, things improved; a minimal sketch of that approach follows below.
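A rough sketch of a gevent pool with capped concurrency, using a placeholder fetch function and a dummy URL list rather than the real crawler:

from gevent import monkey
monkey.patch_all()  # patch sockets before requests is imported

import gevent
import requests
from gevent.pool import Pool

def fetch(url):
    # placeholder fetch; the real crawler adds proxies, retries and HTML parsing
    return requests.get(url, timeout=30).status_code

pool = Pool(200)  # at most 200 greenlets in flight; spawn() waits when the pool is full
urls = ['https://example.com/page/{}'.format(i) for i in range(1000)]  # dummy seed list
jobs = [pool.spawn(fetch, u) for u in urls]
gevent.joinall(jobs)

With the pool size capped, the proxy no longer gets hit by thousands of simultaneous connections at once.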
Crawling a detail page needs that page's URL, which I call a seed. At first I stored the seeds in Redis, which also worked fine.
Later I switched to storing the seeds in a plain text file, because the detail-page URLs on 360 follow a regular pattern,
so a loop can generate the seeds; even if some of the generated links point to no app at all, that is not a problem.
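As a rough illustration, a generator like the one below could write the seed file. The soft_id URL pattern here is an assumption about what the detail links look like, not something taken from this repo; the 4000000-5000000 id range matches the one mentioned in run() below:

# Hypothetical seed generator; the soft_id URL pattern is assumed, adjust it to the real detail links.
with open('360app_details_url_7.txt', 'w') as f:
    for soft_id in range(4000000, 5000000):
        f.write('http://zhushou.360.cn/detail/index/soft_id/{}\n'.format(soft_id))

The full crawler script follows.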
from gevent import monkey
monkey.patch_all()  # patch the standard library before requests/redis are imported

import time
import json
import sys
import os
import base64

import gevent
import requests
import redis
from gevent.pool import Pool
from lxml import etree

# result queue, plus the seed store used only by the commented-out Redis-seed code
conn = redis.Redis(host="123.56.101.144", db=0, port=3718, password="")
seed_conn = redis.Redis(host='123.56.101.144', db=21, port=8379, password='', decode_responses=True)
# proxy settings
http_config = {
    "user": "",
    "pass": "",
    "concurrent": 100,
    "proxyServer": "http://http-proxy-t3.dobel.cn:9180",
    "type": "duobeiyun"
}
# inject "user:pass@" into the proxy URL: http://user:pass@host:port
count = "//{}:{}".format(http_config['user'], http_config['pass'])
proxies = {
    "http": http_config['proxyServer'].replace('//', count + '@'),
    "https": http_config['proxyServer'].replace('//', count + '@')
}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
    "Accept": "application/json, text/plain, */*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
}
def get_res(url):
    source_url = url
    print(source_url)
    res = ''
    # fetch the detail page, retrying up to 5 times through the proxy
    for i in range(5):
        try:
            res = requests.get(url=source_url, headers=headers, proxies=proxies, timeout=30)
            break
        except Exception as e:
            fun_name = sys._getframe().f_code.co_name
            with open('tww_logs', 'a+') as f:
                f.write(fun_name)
                f.write('\n')
                f.write(str(e))
                f.write('\n')
                f.flush()
            if i < 4:  # no need to sleep after the last attempt
                time.sleep(0.5)
    if not res:
        return
    print(res.status_code)
    if res.status_code == 200:
        html_content = res.text
        html = etree.HTML(html_content)
        app_name = ''
        company_website = ''
        tag = ''
        apk = ''
        company_name = ''
        avatar = ''
        avatar_url = ''
        desc = ''
        catagory = ''
        size = ''
        score = '暂无好评'
        update_time = ''
        system = ''
        lowest_system = ''
        version = ''
        download_count = ''
        publish_time = ''
        platform = '360手机助手'
        try:
            app_name = html.xpath("//h2[contains(@id,'app-name')]/span/@title")
            if len(app_name):
                app_name = app_name[0]
            if not app_name:
                return
            tag = html.xpath("//div[contains(@class,'app-tags')]/a/text()")
            company_name = html.xpath("//strong[contains(text(),'作者')]/parent::td/text()")
            if len(company_name):
                company_name = company_name[0]
            avatar_url = html.xpath("//dt/img/@src")
            if avatar_url:
                avatar_url = avatar_url[0]
                for i in range(5):
                    # use a separate variable so the page response in `res` is not overwritten
                    img_res = requests.get(avatar_url, proxies=proxies, headers=headers, timeout=30)
                    if img_res.status_code == 200:
                        # store the icon as base64-encoded binary
                        avatar = base64.b64encode(img_res.content).decode()
                        break
            desc = html.xpath("//div[contains(@class,'breif')]/text()")
            if len(desc):
                desc = desc[0].replace('\n', '')
                desc = desc.replace('\r', '')
                desc = desc.replace(' ', '')
            size = html.xpath("//span[contains(@class,'s-3')][2]/text()")
            if len(size):
                size = size[0]
            update_time = html.xpath("//strong[contains(text(),'更新时间')]/parent::td/text()")
            if len(update_time):
                update_time = update_time[0].replace("-", "")
            system = html.xpath("//strong[contains(text(),'系统:')]/parent::td/text()")
            lowest_system = html.xpath("//strong[contains(text(),'系统:')]/parent::td/text()")
            if len(lowest_system):
                lowest_system = lowest_system[0]
            if len(system):
                system = system[0].split(" ")[0]
            # keep the '暂无好评' default when no score element is present
            score_node = html.xpath("//em[contains(text(),'分')]/parent::span/text()")
            if len(score_node):
                score = score_node[0] + '分'
            version = html.xpath("//strong[contains(text(),'版本:')]/parent::td/text()")
            if len(version):
                version = version[0]
            # substring-after() returns a string directly, so no indexing is needed
            download_count = html.xpath("substring-after(//span[contains(text(),'下载:')]/text(),':')")
        except Exception as e:
            fun_name = sys._getframe().f_code.co_name
            with open('tww_logs', 'a+') as f:
                f.write(fun_name)
                f.write('\n')
                f.write(str(e))
                f.write('\n')
        if app_name:
            print(score)
            print('-------------------------------------------------------------')
            time_tup = time.time()
            ctime = '%.1f' % time_tup
            redis_data = {
                "spider_config": {
                    "city": "all",
                    "task_type": "360app",
                    "job_type": "details",
                    "channel": "360app",
                    "type": "sell"
                },
                "data": {
                    "response_code": {
                        source_url: res.status_code
                    },
                    "source_url": source_url,
                    "app_name": app_name,
                    "company_name": company_name,
                    "url_crc": "",
                    "avatar": avatar,
                    "company_website": company_website,
                    "tag": tag,
                    "catagory": catagory,
                    "desc": desc,
                    "apk": apk,
                    "version": version,
                    "download_count": download_count,
                    "platform": platform,
                    "system": system,
                    "publish_time": publish_time,
                    "score": score,
                    "update_time": update_time,
                    "size": size,
                    "lowest_system": lowest_system
                },
                "ctime": ctime
            }
            if redis_data:
                print('存储数据')
                for i in range(5):
                    try:
                        conn.lpush("360app_details", json.dumps(redis_data))
                        break
                    except Exception as e:
                        fun_name = sys._getframe().f_code.co_name
                        with open('tww_logs', 'a+') as f:
                            f.write(fun_name)
                            f.write('\n')
                            f.write(str(e))
                            f.write('\n')

def get_seed():
    print('获取种子')
    s_time = time.time()
    g_list = []
    key = True
    # if not seed_conn.llen('itjuzi_seed'):
    #     return
    p = Pool(200)
    i = 2000
    try:
        with open('zhizhen_7.txt', 'r') as fp:
            print('获取位置--------------------------------------')
            zz = fp.read()
    except Exception as e:
        return
    with open('360app_details_url_7.txt', 'r') as f:
        f.seek(int(zz or 0))  # resume from the saved offset, 0 if the position file is empty
        while i:
            url = f.readline().replace('\n', '')
            if not url:
                key = False
                break
            i -= 1
            g = p.spawn(get_res, url)
            g_list.append(g)
        zz = f.tell()
    with open('zhizhen_7.txt', 'w') as fp:
        print('更新位置-------------------------')
        fp.write(str(zz))
    gevent.joinall(g_list)
    print('----------------------------这一轮结束了--------------------------------------------')
    if not key:
        print('脚本7没有拿到url')
        os._exit(0)
    print("爬取自定义个页面共花费{}时间".format(time.time() - s_time))
    return

def run():
    print("开始爬取————————————————————————————————————————4000000-5000000")
    monkey.patch_socket()  # redundant here: monkey.patch_all() already ran at import time
    start_time = time.time()
    while 1:
        # if not seed_conn.llen('itjuzi_seed'):
        #     cost_time = time.time() - start_time
        #     os._exit(0)
        # print('获取id')
        get_seed()


if __name__ == '__main__':
    run()