2 Star 2 Fork 0

绝世尘封 / SpiderWxContent

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
weixin_info.py 4.91 KB
一键复制 编辑 原始数据 按行查看 历史
绝世尘封 提交于 2022-08-25 11:06 . add
#!/usr/bin/python
# -*-coding:utf-8-*-
"""微信公众号历史文章获取"""
import asyncio
from pyppeteer import launch
import logging
from pyppeteer.errors import TimeoutError
import re
import common
# 插入数据库
import sys
import requests
import json
from pyppeteer_stealth import stealth
import qrcode
import zxing
import pandas as pd
import numpy as np
import time
import pymongo
import os
from bs4 import BeautifulSoup
# Connect to MongoDB using host/port taken from the environment,
# e.g. mongodb://admin:123456@192.168.221.128:27017
try:
    # os.getenv returns a str (or None), but pymongo requires the port as
    # an int — passing the raw string raises TypeError. Convert when set.
    _mongo_port = os.getenv('MONGOD_PORT')
    mongo_collection = pymongo.MongoClient(
        os.getenv('MONGOD_HOST'),
        int(_mongo_port) if _mongo_port else None)
    # Force a round-trip now so a bad configuration fails fast here
    # instead of on the first query.
    mongo_collection.server_info()
except Exception:
    print("数据库配置异常,请检查...")
    sys.exit()
# Download images
def downImg(url, path_name="IMG"):
    """
    Download every image URL in *url* into the *path_name* directory.

    Files are named 1.jpg, 2.jpg, ... in iteration order.

    :param url: iterable of image URLs (may be empty)
    :param path_name: directory to save images into (created if missing)
    :return: list of the file paths the images were actually written to
    """
    # Hoisted out of the loop: the headers never change per request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
    }
    url_path = []
    for index, image_url in enumerate(url, start=1):
        print(image_url)
        # NOTE: relies on the module-level `import requests`; the previous
        # function-local re-import was redundant.
        r = requests.get(image_url, headers=headers)
        # exist_ok avoids the check-then-create race of the old
        # os.path.exists / os.mkdir pair.
        os.makedirs(str(path_name), exist_ok=True)
        file_path = str(path_name) + "/" + str(index) + ".jpg"
        with open(file_path, mode="wb") as f:
            f.write(r.content)  # write the image bytes to disk
        # BUG FIX: previously appended a hard-coded
        # "/Users/wankaifa/穿搭图集/" prefix that did not match where the
        # file was saved, so callers stored wrong paths. Return the real
        # save path instead.
        url_path.append(file_path)
    return url_path
class Tencent:
    """Fetch WeChat official-account article pages and store their content.

    On construction this class queries MongoDB (``article.man``) for article
    links whose title matches a regex and that have no ``new_img`` yet, then
    visits each link in a headless pyppeteer browser (batches of 2), downloads
    the article images via ``downImg`` and writes the article HTML plus the
    old/new image path lists back into the same Mongo document.
    """

    # Query site
    def __init__(self):
        # Filled in after a successful login (unused in the visible flow).
        self.token = 0
        self.cookies = ''
        # Whether we are logged in (0 = not logged in).
        self.is_login = 0
        self.keyword_url = "https://mp.weixin.qq.com/"
        # NOTE(review): hard-coded credentials — should come from config/env.
        self.user = "674514904@qq.com"
        self.pwd = "wkf674514904"
        mongo_db = mongo_collection['article']
        mongo_collection_model = mongo_db['man']
        # Alternative query kept for reference:
        # url_list = mongo_collection_model.find({"$or": [{"title": {'$regex': "发型"}},{"title": {'$regex': "穿搭"}},{"title": {'$regex': "搭配"}},{"title": {'$regex': "发型"}}]})
        # Articles whose title contains "秋" and that have no new_img yet.
        url_list = mongo_collection_model.find({"$and": [{"title": {'$regex': "秋"}},{'new_img': None}]})
        # url_list_one = mongo_collection_model.find_one({"$and": [{"title": {'$regex': "早秋"}},{'new_img': None}]})
        # print(url_list_one)
        # Collect all article URLs and remember link -> title for later use.
        url_arr = []
        self.title_arr = {}
        print(url_list)
        for url in url_list:
            print(url)
            url_arr.append(url['link'])
            self.title_arr[url['link']] = url['title']
        if len(url_arr)==0:
            print("数据为空...")
            sys.exit()
        # Split the URL list into chunks of 2 and process one chunk at a time.
        list2 = common.list_split(url_arr, 2)
        for i in range(len(list2)):
            print(list2[i])
            # Start crawling this chunk.
            self.sem = asyncio.Semaphore(2)  # bounded semaphore (at most 2 coroutines at once)
            # One crawl task per URL in the current chunk.
            tasks = [asyncio.ensure_future(self.main(new_total_url)) for new_total_url in list2[i]]
            asyncio.get_event_loop().run_until_complete(asyncio.wait(tasks))

    async def main(self, num):
        """Crawl a single article URL *num*: scrape, download images, update Mongo.

        :param num: article URL (also the ``link`` key of the Mongo document)
        """
        print(num)
        async with self.sem:
            browser = await launch(headless=True, userDataDir='./userdata', args=['--disable-infobars', '--no-sandbox'],
                dumpio=True)
            page = await browser.newPage()
            await stealth(page)  # hide headless-browser fingerprints before navigating
            await page.goto(num)
            # print(await page.content())
            soup = BeautifulSoup(await page.content(), "html.parser")
            # Browser is no longer needed once the HTML has been captured.
            await browser.close()
            print(soup.select("#activity-name")[0].get_text())
            wxw_img = soup.select("#js_content")
            # Extract the images from the article body.
            new_img = wxw_img[0].find_all("img")
            # Raw HTML of the article body.
            js_content = str(wxw_img[0])
            print(new_img)
            detail_url = []
            for img in new_img:
                # WeChat lazy-loads images: real URL lives in data-src, not src.
                detail_url.append(img.get('data-src'))
            print(detail_url)
            print(len(detail_url))
            url_path = downImg(detail_url, "../../../穿搭图集/" + self.title_arr[num])
            # Persist the scraped content back onto the matching document.
            mongo_db = mongo_collection['article']
            mongo_collection_model = mongo_db['man']
            ret2find = mongo_collection_model.find_one({"link": num})
            print(ret2find)
            mongo_collection_model.update_one({
                '_id': ret2find['_id']
            }, {
                '$set': {
                    'content': js_content,
                    'old_img': json.dumps(detail_url),
                    'new_img': json.dumps(url_path),
                }
            }, upsert=False)
# Test entry point: only run the crawler when executed as a script,
# not as a side effect of importing this module.
if __name__ == "__main__":
    Tencent()
Python
1
https://gitee.com/kaifakaixin.com/SpiderWxContent.git
git@gitee.com:kaifakaixin.com/SpiderWxContent.git
kaifakaixin.com
SpiderWxContent
SpiderWxContent
wkf

搜索帮助