1 Star 0 Fork 0

gigi0103 / python_homework

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
request.py 3.62 KB
一键复制 编辑 原始数据 按行查看 历史
4-8-6 提交于 2022-01-13 12:25 . first input
import requests
import os
import re
import time
import random
import json
from PIL import Image
from train import resize
#定义一个爬虫类
class crawler():
#爬虫项目初始化
#tex:表示项目的名字
#dic一个字典保存键值对(英文单词:中文单词)
#num表示准备爬取的数量
def __init__(self,name:str,dic:dict,num=150):
self.name = name
self.dic = dic
self.num = num
if not os.path.exists('projects'):
os.mkdir('projects')
if os.path.exists(os.path.join('projects',self.name)):
self.ch = False
else:
self.ch = True
os.mkdir(os.path.join('projects',self.name))
oth = {}
for i in sorted(dic.keys()):
oth[i] = len(oth)
f = open(os.path.join('projects',self.name,'config.json'),'w',encoding='utf-8')
json.dump(oth,f)
f.close()
for k in dic.keys():
os.mkdir(os.path.join('projects',self.name,k))
#开始爬取内容
#从百度之中爬取文件,一页会有30张对应的照片
#对字典的内容
def craw(self,size=64):
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:66.0) Gecko/20100101 Firefox/66.0',
'Referer':'https://www.baidu.com',}
step = self.num/4
for k,v in self.dic.items():
num = 0 #保存爬取的照片爬取的数量,当爬取到足够数量停止爬虫
offset = 0
url = 'https://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&dyTabStr=MCwzLDYsMSw0LDUsOCw3LDIsOQ%3D%3D&word='
while True:
offset += random.randint(2,5) * 30
url_use = url + v + '&pn=' + str(offset)
req = requests.get(url_use,headers=header)
assert req.status_code == 200
text = req.text
url_re = re.compile('"thumbURL":"(.*?)","adType"')
url_list = url_re.findall(text)
for url_found in url_list:
try:
img_req = requests.get(url_found,headers=header,timeout=30)
img_req.raise_for_status()
except:
continue
time.sleep(random.random()/2)
if img_req.status_code != 200:
continue
else:
num += 1
s = str(num) + '.jpg'
file_name = os.path.join('projects',self.name,k,s)
with open(file_name,'wb') as f:
f.write(img_req.content)
if num % step == 0:
print('{}:{}/{}'.format(k,num,self.num))
if num == self.num:
break
if num == self.num:
#将图形进行压缩
for name in os.listdir(os.path.join('projects',self.name,k)):
file_name = os.path.join('projects', self.name, k, name)
img = Image.open(file_name)
img = img.convert('RGB')
img = img.resize((resize,resize))
img.save(file_name)
print("{}:capture has complete".format(k))
break
print("craw complete")
def check(self):
return self.ch
def main():
craw1 = crawler('first',{'cat':'一只猫','dog':'一只狗','chicken':'一只鸡'})
craw1.craw()
if __name__ == '__main__':
main()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/gigi0103/python_homework.git
git@gitee.com:gigi0103/python_homework.git
gigi0103
python_homework
python_homework
master

搜索帮助

344bd9b3 5694891 D2dac590 5694891