2 Star 11 Fork 4

leon / 汽车之家车型配置参数爬虫

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
AutoHomeCarConfigSpider.py 17.41 KB
一键复制 编辑 原始数据 按行查看 历史
leon 提交于 2021-09-30 00:47 . update AutoHomeCarConfigSpider.py.
import bs4
import requests as req
import os
import re
from selenium import webdriver
from lxml import html
import shutil
import json
import xlwt
import openpyxl
import pandas as pd
# Browser-like HTTP headers sent with every request so autohome.com.cn
# serves the normal desktop pages instead of blocking the crawler.
headers = {
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.8',
'Cache-Control': 'max-age=0',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
'Connection': 'keep-alive',
'Referer': 'http://www.baidu.com/'
}
req.adapters.DEFAULT_RETRIES = 10 # raise the retry count for flaky connections
req.keep_alive = False # close spare connections instead of pooling them
# download all cars informations
def getCarlists():
li = [chr(i) for i in range(ord("A"),ord("Z")+1)]
firstSite="https://www.autohome.com.cn/grade/carhtml/"
firstSiteSurfixe=".html"
secondSite = "https://car.autohome.com.cn/config/series/"
secondSiteSurfixe = ".html"
html_dir="./data/html/"
# mkdir if not exist
if not os.path.exists(html_dir):
os.mkdir(html_dir)
for a in li:
if a is not None:
requestUrl = firstSite+a+firstSiteSurfixe
print ('---------------------------------------------------------------------')
print (' ')
print (' Getting car lists begin with letter ',a,' from url: ')
print(' '+requestUrl)
print (' ')
print ('---------------------------------------------------------------------')
print (' ')
#get car lists
resp = req.get(requestUrl,headers=headers)
bs = bs4.BeautifulSoup(str(resp.content,"gbk"),"html.parser")
bss = bs.find_all("li")
con = 0
for b in bss:
d = b.h4
if d is not None:
her = str(d.a.attrs['href'])
her = her.split("#")[0]
her = her[her.index(".cn")+3:].replace("/",'')
if her is not None:
if os.path.exists(html_dir+str(her)):
print(' File ',her,' exists, skip...')
else:
secSite = secondSite +her + secondSiteSurfixe
if her is not None:
resp = req.get(secSite,headers=headers)
try:
text = str(resp.content,encoding="utf-8")
fil = open(html_dir+str(her),"a",encoding="utf-8")
fil.write(text)
print (' Save file from: ',secSite,' Done.')
except ConnectionError:
print(' Connection Error. Skip...')
con = (con+1)
else:
print(' Skip empty url...')
print (' ')
print (' Getting car lists start with',a,' is done. Total ',con,' files saved.')
print (' ')
print ('---------------------------------------------------------------------')
print (' ')
# decode downloaded html information
def decodeHtml():
    """Re-package each downloaded page's obfuscation JavaScript.

    Autohome hides spec values behind CSS rules injected by inline JS.
    For every file in ./data/html/ this extracts those IIFE snippets,
    prepends a small DOM shim that records every injected CSS rule into
    the ``rules`` string, and writes a standalone HTML file
    (./data/newhtml/<file>.html) that prints the rules when rendered.
    """
    # mkdir if not exist
    newhtml_dir = "./data/newhtml/"
    if not os.path.exists(newhtml_dir):
        os.mkdir(newhtml_dir)
    print(" Start to decode the html file information...")
    print(' ')
    rootPath = "./data/html/"
    for file in os.listdir(rootPath):
        print(" Start decoding file: " + file.title() + " ...")
        # read the whole page at once; 'with' closes the handle
        with open(rootPath + file, 'r', encoding="utf-8") as src:
            text = src.read()
        # DOM shim: fakes document/window so the site's obfuscation JS can
        # run standalone and every CSS rule it inserts lands in `rules`.
        alljs = ("var rules = '2';"
                 "var document = {};"
                 "function getRules(){return rules}"
                 "document.createElement = function() {"
                 " return {"
                 " sheet: {"
                 " insertRule: function(rule, i) {"
                 " if (rules.length == 0) {"
                 " rules = rule;"
                 " } else {"
                 " rules = rules + '#' + rule;"
                 " }"
                 " }"
                 " }"
                 " }"
                 "};"
                 "document.querySelectorAll = function() {"
                 " return {};"
                 "};"
                 "document.head = {};"
                 "document.head.appendChild = function() {};"
                 "var window = {};"
                 "window.decodeURIComponent = decodeURIComponent;")
        try:
            # grab every obfuscation IIFE: (function(xx..._){...}(document);
            js = re.findall(r'(\(function\([a-zA-Z]{2}.*?_\).*?\(document\);)', text)
            for item in js:
                alljs = alljs + item
            print(' File decodeing successfully...')
        except Exception:
            print(' File decodeing failed...')
        print(' ')
        newHtml = "<html><meta http-equiv='Content-Type' content='text/html; charset=utf-8' /><head></head><body> <script type='text/javascript'>"
        alljs = newHtml + alljs + " document.write(rules)</script></body></html>"
        # 'with' closes the output handle (original relied on manual close)
        with open(newhtml_dir + file + ".html", "a", encoding="utf-8") as f:
            f.write(alljs)
    print('---------------------------------------------------------------------')
    print(' ')
#decode json data
# decode json data
def decodejson():
    """Extract the ``config``/``option``/``bag`` JS variables of each page.

    For every file in ./data/html/ the three inline javascript variable
    assignments holding the spec data are located with regexes and their
    raw text is concatenated into ./data/json/<file>.
    """
    # mkdir if not exist
    json_dir = "./data/json/"
    if not os.path.exists(json_dir):
        os.mkdir(json_dir)
    print(" Start to decode the json data...")
    print(' ')
    rootPath = "./data/html/"
    for file in os.listdir(rootPath):
        print(" Start decoding file: " + file.title() + " ...")
        # read the whole page at once; 'with' closes the handle
        with open(rootPath + file, 'r', encoding="utf-8") as src:
            text = src.read()
        jsonData = ""
        config = re.search(r'var config = (.*?){1,};', text)
        if config is not None:
            print(' Decode Config Data Done...')
            jsonData = jsonData + config.group(0)
        else:
            print(' Get Empty Config Data...')
        option = re.search(r'var option = (.*?)};', text)
        if option is not None:
            print(' Decode Option Data Done...')
            jsonData = jsonData + option.group(0)
        else:
            print(' Get Empty Option Data...')
        bag = re.search(r'var bag = (.*?);', text)
        if bag is not None:
            print(' Decode Bag Data Done...')
            jsonData = jsonData + bag.group(0)
        else:
            print(' Get Empty Bag Data...')
        with open(json_dir + file, "a", encoding="utf-8") as f:
            f.write(jsonData)
        print(' ')
    print('---------------------------------------------------------------------')
    print(' ')
#catch results from chrome web
class Crack():
def __init__(self,keyword,username,passod):
self.url = 'https://www.baidu.com'
self.browser = webdriver.Chrome('C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
def captureResults():
    """Render each shim page in Chrome and save the decoded CSS rules.

    Opens every ./data/newhtml/*.html file produced by decodeHtml() in a
    local Chrome instance and writes the rendered <body> text (the decoded
    rules) to ./data/content/<file>.  Files already captured are skipped.
    """
    newhtml_dir = "./data/newhtml/"
    lists = os.listdir(newhtml_dir)
    crack = Crack('测试公司', '17610177519', '17610177519')
    content_dir = "./data/content/"
    if not os.path.exists(content_dir):
        os.mkdir(content_dir)
    print(' ')
    print(" Start to capture results data...")
    print(' ')
    for fil in lists:
        if os.path.exists(content_dir + fil):
            # log the file name (the original printed the useless 'True')
            print(' File already decoded...' + str(fil))
            continue
        print(' Capture results of file: ' + fil)
        current_path = os.path.abspath(__file__)
        # directory containing this script, used to build a file:// URL
        father_path = os.path.abspath(os.path.dirname(current_path) + os.path.sep + ".")
        crack.browser.get("file:///" + father_path + newhtml_dir[1:len(newhtml_dir)] + fil + "")
        text = crack.browser.find_element_by_tag_name('body')
        print(' Capture results done... ')
        print(' ')
        # close every output file: the original's for/else closed only the
        # handle of the LAST iteration and leaked all the others
        with open(content_dir + fil, "a", encoding="utf-8") as f:
            f.write(text.text)
    crack.browser.close()
    print('---------------------------------------------------------------------')
    print(' ')
#generate normal json
def generateNormjson():
rootPath = "./data/json/"
content_dir="./data/content/"
newjson_dir="./data/newjson/"
if not os.path.exists(newjson_dir):
os.mkdir(newjson_dir)
listdir = os.listdir(rootPath)
for json_s in listdir:
print(' Generate json data from file: '+json_s.title())
jso = ""
#读取json数据文件
for fi in open(rootPath+json_s,'r',encoding="utf-8"):
jso = jso+fi
content = ""
#读取样式文件
spansPath = content_dir+json_s.title()+".html"
for spans in open(spansPath,"r",encoding="utf-8"):
content = content+ spans
#获取所有span对象
jsos = re.findall("<span(.*?)></span>",jso)
num = 0
for js in jsos:
num = num +1
#获取class属性值
sea = re.search("'(.*?)'",js)
spanContent = str(sea.group(1))+"::before { content:(.*?)}"
#匹配样式值
spanContentRe = re.search(spanContent,content)
if spanContentRe != None:
if sea.group(1) != None:
jso = jso.replace(str("<span class='"+sea.group(1)+"'></span>"),re.search("\"(.*?)\"",spanContentRe.group(1)).group(1))
print(' Gnerate json data done...')
print(' ')
fi = open(newjson_dir+json_s.title(),"a",encoding="utf-8")
fi.write(jso)
fi.close()
print ('---------------------------------------------------------------------')
print (' ')
#get ev car ID lists
# get ev car ID lists
def get_urls():
    """Return the absolute URL of every EV category in the left-hand tree
    of the diandongche (electric vehicle) index page."""
    base_url = "https://car.autohome.com.cn"
    index_url = "https://car.autohome.com.cn/diandongche/index.html"
    page = req.get(index_url, headers=headers).text
    tree = html.etree.HTML(page)
    # category links live under the #cartree navigation list
    hrefs = tree.xpath("//div[@id='cartree']/ul/li/h3/a/@href")
    return [base_url + href for href in hrefs]
def getEvList():
    """Write the numeric series id of every EV to ./data/CarHref.txt.

    Visits each EV category page from get_urls(), pulls the series links
    (elements whose id contains 'series_') and stores one bare series id
    per line.
    """
    carlist = get_urls()
    etree = html.etree
    # 'with' closes the output file even if a request raises
    # (the original leaked the handle on error)
    with open(r"./data/CarHref.txt", "w") as file:
        for url in carlist:
            response = req.get(url, headers=headers).text
            res = etree.HTML(response)
            chlist_href = res.xpath("//*[contains(@id,'series_')]/@href")
            for ele in chlist_href:
                print(" Ev Car ID url: " + ele)
                # hrefs look like '/xxx-<id>.html': keep only the id
                file.write(ele[ele.find('-') + 1:len(ele) - 5])
                file.write("\n")
#move Ev car json file to new folder
def moveEvjson():
print(" Start to move the Ev car json file...")
print (' ')
evjson_dir="./data/evjson/"
if not os.path.exists(evjson_dir):
os.mkdir(evjson_dir)
newjson_dir="./data/newjson/"
evlistfil="./data/CarHref.txt"
file_object = open(evlistfil, 'r')
try:
for line in file_object:
if os.path.exists(newjson_dir+line.rstrip('\n')):
shutil.move(newjson_dir+line.rstrip('\n'), evjson_dir)
print(' Move file: '+line.rstrip('\n')+' Done.')
else:
print(' Can not find file: '+line.rstrip('\n')+'.')
finally:
file_object.close()
print (' ')
print ('---------------------------------------------------------------------')
print (' ')
#write xls
def extractXls(rootPath):
err_dir="./data/skipdata/"
if(rootPath == "./data/evjson/"):
xls_dir="./data/evxls/" #电动车
else:
xls_dir="./data/gasxls/" #燃油车
if not os.path.exists(err_dir):
os.mkdir(err_dir)
if not os.path.exists(xls_dir):
os.mkdir(xls_dir)
files = os.listdir(rootPath)
#startRow = 0
isFlag = True #默认记录表头
for file in files:
workbook = xlwt.Workbook(encoding = 'ascii')#创建一个文件
worksheet = workbook.add_sheet(file)#创建一个表
list = []
carItem = {}
text = ""
for fi in open(rootPath+file,'r',encoding="utf-8"):
text = text+fi
#解析基本参数配置参数,颜色三种参数,其他参数
config = "var config = (.*?);"
option = "var option = (.*?);var"
bag = "var bag = (.*?);"
configRe = re.findall(config,text)
optionRe = re.findall(option,text)
bagRe = re.findall(bag,text)
for a in configRe:
config = a
for b in optionRe:
option = b
for c in bagRe:
bag = c
try:
config = json.loads(config)
option = json.loads(option)
#bag = json.loads(bag) # 卡宴车型的选装包格式有问题,导致无法解析,故取消所有选装包的解析
except Exception as e:
f = open(err_dir+'exception.txt',"a",encoding="utf-8")
f.write(file.title()+"\n")
continue
#-----------开始解析参数-----------------------------------------
confignum=len (config['result']['paramtypeitems'])
optionnum=len (option['result']['configtypeitems'])
for i in range(0,confignum):
configItem = config['result']['paramtypeitems'][i]['paramitems']
for car in configItem:
carItem[car['name']]=[]
for ca in car['valueitems']:
carItem[car['name']].append(ca['value'])
for j in range(0,optionnum):
optionItem = option['result']['configtypeitems'][j]['configitems']
for car in optionItem:
carItem[car['name']]=[]
for ca in car['valueitems']:
value = ca['value']
for sblst in ca['sublist']:
if sblst != None:
value = value + '●' + sblst['subname']
value = value.replace("&nbsp;/&nbsp;"," / ")
value = value.replace("&amp;"," ")
carItem[car['name']].append(value)
#---------------------------------------------------------------
startRow = 0
if isFlag:
co1s = 0
for co in carItem:
co1s = co1s +1
worksheet.write(startRow,co1s-1,co) #写入表头
else:
startRow = startRow+1
#计算起止行号
endRowNum = startRow + len(carItem['车型名称']) #车辆款式记录数
for row in range(startRow,endRowNum):
colNum = 0
for col in carItem:
colNum = colNum +1
worksheet.write(row,colNum-1,str(carItem[col][row-startRow]))
print(" Write ",str(carItem['车型名称'][row-startRow])," . Done...")
workbook.save(xls_dir+file+'.xlsx')
#merge xls
def mergeXls(CarType):
print(" ")
print("------------------------------------------------------------")
print(" ")
if(CarType =="EvCar"):
xls_dir="./data/evxls/" #电动车
xlsfile="./AutoHomeEvDatasheet.xlsx"
if(CarType =="GasCar"):
xls_dir="./data/gasxls/" #燃油车
xlsfile="./AutoHomeGasDatasheet.xlsx" #燃油车
dfs = []
files = os.listdir(xls_dir)
num=0
for file in files:
evxls = pd.DataFrame(pd.read_excel(xls_dir+file))
dfs.append(evxls)
num=num+1
print(" ")
print(" Read Car Parameters file " + xls_dir+file + " . Done...")
print(" ")
print("------------------------------------------------------------------")
print(" ")
print(" Start merging file, please wait...")
result = pd.concat(dfs,sort = False)
print(" ")
print(" Total " + str(num) + " files have been successfully merged. Writing file...")
writer = pd.ExcelWriter(xlsfile)
result.to_excel(writer,index=False,engine='openpyxl')
writer.save()
print(" ")
print(" " + xlsfile + " has been successfully saved.")
if __name__ == "__main__":
    # Full pipeline: download -> decode JS -> extract json -> render in
    # Chrome -> normalize json -> split EV/gas -> export & merge sheets.
    data_dir = "./data/"
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)
    evlist = "./data/CarHref.txt"
    getCarlists()
    decodeHtml()
    decodejson()
    captureResults()
    generateNormjson()
    # build the EV series list only once; reuse it on later runs
    if not os.path.exists(evlist):
        getEvList()
    else:
        print('Ev Car list is already exist, Skip...')
        print(' ')
        print('---------------------------------------------------------------------')
    moveEvjson()
    extractXls(r"./data/evjson/")    # electric cars
    mergeXls(r"EvCar")               # electric cars
    extractXls(r"./data/newjson/")   # gasoline cars
    mergeXls(r"GasCar")              # gasoline cars
Python
1
https://gitee.com/leon_young/AutoHome-CarConfig-Spider.git
git@gitee.com:leon_young/AutoHome-CarConfig-Spider.git
leon_young
AutoHome-CarConfig-Spider
汽车之家车型配置参数爬虫
master

搜索帮助