代码拉取完成,页面将自动刷新
import requests
import os
import re
import time
import random
import json
from PIL import Image
from train import resize
#定义一个爬虫类
class crawler():
#爬虫项目初始化
#tex:表示项目的名字
#dic一个字典保存键值对(英文单词:中文单词)
#num表示准备爬取的数量
def __init__(self,name:str,dic:dict,num=150):
self.name = name
self.dic = dic
self.num = num
if not os.path.exists('projects'):
os.mkdir('projects')
if os.path.exists(os.path.join('projects',self.name)):
self.ch = False
else:
self.ch = True
os.mkdir(os.path.join('projects',self.name))
oth = {}
for i in sorted(dic.keys()):
oth[i] = len(oth)
f = open(os.path.join('projects',self.name,'config.json'),'w',encoding='utf-8')
json.dump(oth,f)
f.close()
for k in dic.keys():
os.mkdir(os.path.join('projects',self.name,k))
#开始爬取内容
#从百度之中爬取文件,一页会有30张对应的照片
#对字典的内容
def craw(self,size=64):
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:66.0) Gecko/20100101 Firefox/66.0',
'Referer':'https://www.baidu.com',}
step = self.num/4
for k,v in self.dic.items():
num = 0 #保存爬取的照片爬取的数量,当爬取到足够数量停止爬虫
offset = 0
url = 'https://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&dyTabStr=MCwzLDYsMSw0LDUsOCw3LDIsOQ%3D%3D&word='
while True:
offset += random.randint(2,5) * 30
url_use = url + v + '&pn=' + str(offset)
req = requests.get(url_use,headers=header)
assert req.status_code == 200
text = req.text
url_re = re.compile('"thumbURL":"(.*?)","adType"')
url_list = url_re.findall(text)
for url_found in url_list:
try:
img_req = requests.get(url_found,headers=header,timeout=30)
img_req.raise_for_status()
except:
continue
time.sleep(random.random()/2)
if img_req.status_code != 200:
continue
else:
num += 1
s = str(num) + '.jpg'
file_name = os.path.join('projects',self.name,k,s)
with open(file_name,'wb') as f:
f.write(img_req.content)
if num % step == 0:
print('{}:{}/{}'.format(k,num,self.num))
if num == self.num:
break
if num == self.num:
#将图形进行压缩
for name in os.listdir(os.path.join('projects',self.name,k)):
file_name = os.path.join('projects', self.name, k, name)
img = Image.open(file_name)
img = img.convert('RGB')
img = img.resize((resize,resize))
img.save(file_name)
print("{}:capture has complete".format(k))
break
print("craw complete")
def check(self):
return self.ch
def main():
craw1 = crawler('first',{'cat':'一只猫','dog':'一只狗','chicken':'一只鸡'})
craw1.craw()
if __name__ == '__main__':
main()
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。