1 Star 5 Fork 8

1264644959 / blog

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
协程爬虫脚本的分享.md 9.30 KB
一键复制 编辑 原始数据 按行查看 历史
zhangyunlei 提交于 2020-07-29 10:42 . commit

协程爬虫脚本的分享

本人爬虫入门小白,工作原来是想走python后端的

img

(一)介绍

爬取360手机助手上所有app的信息,包括公司名、开发者等

一开始用的是多线程,发现效果并不好,而且电脑有点卡(本人的电脑是 i5 处理器,8G内存)

后来改用协程,效果还可以,但是一开始我并没有控制并发数,导致通过代理的请求失败了很多

之后控制协程并发数,就好起来了

(二)种子

爬取详情页面需要用到该详情的链接,称之为种子,开始把种子存储到redis效果也可以

后来我把种子存储到文本中,因为360每个app的详情页链接是有规律的

image-20200729103358071

循环可以生成种子,就算有的链接没有app信息,也没事

(三)代码

from gevent.pool import Pool
from gevent import monkey;monkey.patch_all()
import time
from decimal import Decimal
from urllib.parse import quote

import gevent
import requests
import re
import json
import redis
from lxml import etree
import threading
import sys
import os
import base64

# Redis connection that get_res() pushes scraped records to.
conn = redis.Redis(
    host="123.56.101.144",
    db=0,
    port=3718,
    password="",
)
# Second Redis connection (db 21, decoded responses). Only referenced from
# commented-out code in the visible part of this script.
seed_conn = redis.Redis(
    host='123.56.101.144',
    db=21,
    port=8379,
    password='',
    decode_responses=True,
)

# Proxy settings
http_config = {
    "user": "",
    "pass": "",
    "concurrent": 100,
    "proxyServer": "http://http-proxy-t3.dobel.cn:9180",
    "type": "duobeiyun",
}
# "//user:pass" fragment; it is spliced into the proxy URL right after the
# scheme so the final URL has the form scheme://user:pass@host:port.
count = "//" + http_config['user'] + ":" + http_config['pass']
_proxy_url = http_config['proxyServer'].replace('//', count + '@')
# The same authenticated proxy URL is used for both schemes.
proxies = dict.fromkeys(("http", "https"), _proxy_url)

# Request headers sent with every HTTP request.
headers = dict([
    ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"),
    ("Accept", "application/json, text/plain, */*"),
    ("Accept-Encoding", "gzip, deflate, br"),
    ("Accept-Language", "zh-CN,zh;q=0.9"),
])


def _log_error(fun_name, err):
    """Append *fun_name* and the text of *err* to the ``tww_logs`` file."""
    with open('tww_logs', 'a+') as f:
        f.write(fun_name)
        f.write('\n')
        f.write(str(err))
        f.write('\n')


def _first(nodes, default=''):
    """Return the first item of an XPath result list, or *default* if it is empty."""
    return nodes[0] if nodes else default


def _fetch_page(url, attempts=5, pause=0.5):
    """GET *url* through the proxy, retrying on errors.

    Returns the response of the first attempt that does not raise, or None
    when every attempt raised. Sleeps *pause* seconds between attempts, but
    not after the last one.
    """
    for attempt in range(attempts):
        try:
            return requests.get(url=url, headers=headers, proxies=proxies, timeout=30)
        except Exception as e:
            # Best-effort crawler: any failure is logged and retried.
            _log_error('get_res', e)
            if attempt < attempts - 1:
                time.sleep(pause)
    return None


def _download_avatar(avatar_url, attempts=5):
    """Return the image at *avatar_url* base64-encoded as text, or '' on failure.

    Each attempt is guarded separately so that a failing icon download
    cannot abort the extraction of the remaining fields.
    """
    for _ in range(attempts):
        try:
            img = requests.get(avatar_url, proxies=proxies, headers=headers, timeout=30)
        except Exception as e:
            _log_error('get_res', e)
            continue
        if img.status_code == 200:
            # Stored as base64 text so the binary image fits into JSON.
            return base64.b64encode(img.content).decode()
    return ''


def _parse_details(html):
    """Extract the app fields from a parsed detail page.

    Returns a dict of fields. Fields that are missing on the page keep
    their default value. If extraction fails part-way, the error is logged
    and the fields collected so far are returned.
    """
    fields = {
        'app_name': '',
        'company_name': '',
        'avatar': '',
        'tag': '',
        'desc': '',
        'size': '',
        'score': '暂无好评',
        'update_time': '',
        'system': '',
        'lowest_system': '',
        'version': '',
        'download_count': '',
    }
    try:
        fields['app_name'] = _first(html.xpath("//h2[contains(@id,'app-name')]/span/@title"))
        if not fields['app_name']:
            # No app on this page; nothing else is worth extracting.
            return fields

        # Tags are kept as a list of strings.
        fields['tag'] = html.xpath("//div[contains(@class,'app-tags')]/a/text()")

        fields['company_name'] = _first(
            html.xpath("//strong[contains(text(),'作者')]/parent::td/text()"))

        avatar_url = _first(html.xpath("//dt/img/@src"))
        if avatar_url:
            fields['avatar'] = _download_avatar(avatar_url)

        desc = _first(html.xpath("//div[contains(@class,'breif')]/text()"))
        fields['desc'] = desc.replace('\n', '').replace('\r', '').replace(' ', '')

        fields['size'] = _first(html.xpath("//span[contains(@class,'s-3')][2]/text()"))

        update_time = _first(
            html.xpath("//strong[contains(text(),'更新时间')]/parent::td/text()"))
        fields['update_time'] = update_time.replace("-", "")

        # Both system fields come from the same cell: "lowest_system" is the
        # full text, "system" is its first space-separated token.
        system_text = _first(html.xpath("//strong[contains(text(),'系统:')]/parent::td/text()"))
        fields['lowest_system'] = system_text
        if system_text:
            fields['system'] = system_text.split(" ")[0]

        score = html.xpath("//em[contains(text(),'分')]/parent::span/text()")
        if score:
            fields['score'] = score[0] + '分'

        fields['version'] = _first(
            html.xpath("//strong[contains(text(),'版本:')]/parent::td/text()"))

        # An XPath string function yields a single string, not a node list,
        # so the whole value is used (indexing it would keep one character).
        download_count = html.xpath(
            "substring-after(//span[contains(text(),'下载:')]/text(),':')")
        fields['download_count'] = str(download_count).strip()
    except Exception as e:
        _log_error('get_res', e)
    return fields


def get_res(url):
    """Fetch one app detail page, extract its fields and push them to Redis.

    The page is requested through the configured proxy with up to five
    attempts. When the page answers 200 and contains an app name, a JSON
    record is pushed onto the ``360app_details`` Redis list (up to five
    attempts). Errors are appended to the ``tww_logs`` file.

    Args:
        url: URL of the detail page.

    Returns:
        None. The function works purely through side effects.
    """
    source_url = url
    print(source_url)

    res = _fetch_page(source_url)
    if res is None:
        return
    print(res.status_code)
    if res.status_code != 200:
        return

    try:
        html = etree.HTML(res.text)
    except Exception as e:
        _log_error('get_res', e)
        return
    if html is None:
        return

    fields = _parse_details(html)
    if not fields['app_name']:
        return

    print(fields['score'])
    print('-------------------------------------------------------------')

    ctime = '%.1f' % time.time()

    redis_data = {
        "spider_config": {
            "city": "all",
            "task_type": "360app",
            "job_type": "details",
            "channel": "360app",
            "type": "sell"
        },
        "data": {
            "response_code": {
                # Status of the detail page itself (not of the icon download).
                source_url: res.status_code
            },
            "source_url": source_url,
            "app_name": fields['app_name'],
            "company_name": fields['company_name'],
            "url_crc": "",
            "avatar": fields['avatar'],
            "company_website": '',
            "tag": fields['tag'],
            "catagory": '',
            "desc": fields['desc'],
            "apk": '',
            "version": fields['version'],
            "download_count": fields['download_count'],
            "platform": '360手机助手',
            "system": fields['system'],
            "publish_time": '',
            "score": fields['score'],
            "update_time": fields['update_time'],
            "size": fields['size'],
            "lowest_system": fields['lowest_system']
        },
        "ctime": ctime
    }

    print('存储数据')
    payload = json.dumps(redis_data)
    for _ in range(5):
        try:
            conn.lpush("360app_details", payload)
            break
        except Exception as e:
            _log_error('get_res', e)










def get_seed():
    """Crawl the next batch of detail URLs from the seed file.

    Reads the saved offset from ``zhizhen_7.txt``, seeks to it in
    ``360app_details_url_7.txt``, spawns ``get_res`` for up to 2000 URLs on
    a pool of 200 greenlets, waits for all of them, and then saves the new
    offset. When the end of the seed file is reached the process exits via
    ``os._exit(0)``.

    A missing or empty checkpoint file is treated as offset 0. Blank lines
    in the seed file are skipped.

    Raises:
        ValueError: if the checkpoint file holds non-numeric text.
    """
    print('获取种子')
    s_time = time.time()
    g_list = []
    exhausted = False

    p = Pool(200)

    remaining = 2000

    try:
        with open('zhizhen_7.txt', 'r') as fp:
            print('获取位置--------------------------------------')
            zz = fp.read().strip()
    except FileNotFoundError:
        # First run: no checkpoint yet, start from the top of the seed file.
        zz = ''
    offset = int(zz) if zz else 0

    with open('360app_details_url_7.txt', 'r') as f:
        f.seek(offset)
        while remaining:
            line = f.readline()
            if not line:
                # readline() returns '' only at end of file.
                exhausted = True
                break
            url = line.strip()
            if not url:
                # A blank line is not the end of the file; skip it.
                continue
            remaining -= 1
            g_list.append(p.spawn(get_res, url))

        zz = f.tell()

    # Wait for the batch before moving the checkpoint, so URLs that are
    # still in flight are not skipped if the process dies here.
    gevent.joinall(g_list)

    with open('zhizhen_7.txt', 'w') as fp:
        print('更新位置-------------------------')
        fp.write(str(zz))

    print('----------------------------这一轮结束了--------------------------------------------')
    if exhausted:
        print('脚本7没有拿到url')
        # os._exit() does not flush stdio buffers, so flush explicitly.
        sys.stdout.flush()
        os._exit(0)

    print("爬取自定义个页面共花费{}时间".format(time.time() - s_time))
    return

def run():
    """Run the crawler: process batches with get_seed() in an endless loop.

    The loop has no exit condition of its own; the process ends when
    get_seed() terminates it.
    """
    print("开始爬取————————————————————————————————————————4000000-5000000")
    # monkey.patch_all() already ran at import time; this call is kept as-is.
    monkey.patch_socket()
    while True:
        get_seed()

# Start the crawl loop only when this file is executed as a script.
if __name__ == '__main__':
    run()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/super__man/blog.git
git@gitee.com:super__man/blog.git
super__man
blog
blog
master

搜索帮助

344bd9b3 5694891 D2dac590 5694891