地狱星星 (hellstar) / checkproxy: proxy.py (19.60 KB)
Last commit: QinZhou, 2014-01-21 19:52, "changed the file encoding from gbk to utf8"
Clone: https://gitee.com/hellstar/checkproxy.git (git@gitee.com:hellstar/checkproxy.git)
# -*- coding: utf-8 -*-
import urllib2,time,random,re,threading,string,sys,urllib,getopt
import MySQLdb,socks,socket
from bs4 import BeautifulSoup
web_site_count=13 # number of source sites to scrape
indebug=1
thread_num=200 # number of threads used to check proxies
check_in_one_call=thread_num*10 # max number of proxies checked per run
db_host='localhost' # database settings
db_port=3306
db_user='root'
db_passwd='123456'
db_database='social'
db_charset='utf8'
target_url="http://www.baidu.com/" # a proxy is verified by fetching this URL through it
target_string="030173" # a proxy passes if the returned html contains this string
                       # ("030173" is part of the ICP licence number in Baidu's page footer)
target_timeout=10 # ...and the response arrives within target_timeout seconds
proxy_use=1 # whether to scrape through a local proxy (1 = yes)
proxy_ip='127.0.0.1'
proxy_port='8087' # 8087 is the default local GoAgent port
proxy_array=[] # proxies waiting to be inserted into the database
update_array=[] # check results waiting to be written back
conn=None # global database handles
cursor=None
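# Assumed `proxy` table, inferred from the SQL statements in this file (no
# schema ships with this snippet), as a minimal sketch:
#
#   CREATE TABLE `proxy` (
#     `ip`        VARCHAR(15) NOT NULL,
#     `port`      INT         NOT NULL,
#     `method`    INT,             -- 1=HTTP, 2=SOCKS4, 3=SOCKS5
#     `type`      INT,             -- -1=unknown, 0=transparent, 1=anonymous, 2=high anonymity
#     `area`      VARCHAR(64),     -- free-text location scraped from the source site
#     `active`    INT DEFAULT 0,   -- 1 once a check succeeds
#     `speed`     FLOAT,           -- seconds the last check took
#     `intime`    DATETIME,        -- first seen
#     `checktime` DATETIME         -- last checked; NULL until the first check
#   );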
def usage():
    print u"-h help"
    print u"-g grab proxies from every source site"
    print u"-c http|connect  check proxies (http = plain HTTP proxies, connect = also CONNECT/SOCKS)"
    print u"-t id  test-scrape the site with this id; results are printed, not stored"
def get_html(url=''):
if proxy_use==1:
opener = urllib.FancyURLopener({'http': 'http://'+proxy_ip+':'+proxy_port+'/'})
else:
opener = urllib.FancyURLopener({})
opener.addheaders = [('User-agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36')]
try:
f = opener.open(url)
return f.read()
except Exception,e:
print e
return ''
def build_list_urls_1(page=10):
page=page+1
ret=[]
for i in range(1,page):
ret.append('http://www.cnproxy.com/proxy%(num)01d.html'%{'num':i})
return ret
def parse_page_1(html=''):
matches=re.findall('<tr><td>(.*?)<script.*?>document.write\(\":\"\+(.*?)\)</script></td><td>(.*?)</td><td>.*?</td><td>(.*?)</td>',html,re.IGNORECASE)
ret=[]
for match in matches:
ip=match[0]
port=match[1]
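        # cnproxy obfuscates ports as document.write(":"+v+m+...), where each
        # letter is a JS variable holding one digit; the same mapping is
        # replayed below, so a captured "v+m" evaluates to the string "34"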
v="3";m="4";a="2";l="9";q="0";b="5";i="7";w="6";r="8";c="1";
portlist=port.split('+')
porttmp=''
for intstr in portlist:
porttmp=porttmp+eval(intstr)
port=porttmp
method=match[2]
if method=='HTTP':
method=1
elif method=='SOCKS4':
method=2
elif method=='SOCKS5':
method=3
        type=-1 # this site does not report the proxy's anonymity type
area=match[3]
if indebug:
print '1',ip,port,method,type,area
area=unicode(area, 'cp936')
area=area.encode('utf8')
ret.append([ip,port,method,type,area])
return ret
def build_list_urls_2(page=1):
return ['http://www.proxylists.net/http_highanon.txt']
def parse_page_2(html=''):
matches=re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})',html)
ret=[]
for match in matches:
ip=match[0]
port=match[1]
method=1
type=2
area='--'
ret.append([ip,port,method,type,area])
if indebug:
print '2',ip,port,method,type,area
return ret
def build_list_urls_3(page=1):
return ['http://www.proxylists.net/http.txt']
def parse_page_3(html=''):
matches=re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})',html)
ret=[]
for match in matches:
ip=match[0]
port=match[1]
method=1
type=-1
area='--'
ret.append([ip,port,method,type,area])
if indebug:
print '3',ip,port,method,type,area
return ret
def build_list_urls_4(page=5):
page=page+1
ret=[]
for i in range(0,page):
ret.append('http://proxylist.sakura.ne.jp/index.htm?pages=%(n)01d'%{'n':i})
return ret
def parse_page_4(html=''):
matches=re.findall('<TD.*?>[\s\S]*?<script.*?>[\s\S]*?proxy\((.*?)\);[\s\S]*?<\/script>[\s\S]*?<\/TD>[\s\S]*?<TD>(.*?)<\/TD>[\s\S]*?',html,re.IGNORECASE)
ret=[]
for match in matches:
        # the site renders addresses via a JS call proxy(k,'o1','o2','o3','o4',port):
        # k selects which rotation of the four quoted octets reassembles the IP
        ipandport=match[0]
        ipandportlist=ipandport.split(',')
        port=ipandportlist[5]
        if ipandportlist[0]=='1':
            ip=ipandportlist[1].strip("'")+"."+ipandportlist[2].strip("'")+"."+ipandportlist[3].strip("'")+"."+ipandportlist[4].strip("'")
        elif ipandportlist[0]=='2':
            ip=ipandportlist[4].strip("'")+"."+ipandportlist[1].strip("'")+"."+ipandportlist[2].strip("'")+"."+ipandportlist[3].strip("'")
        elif ipandportlist[0]=='3':
            ip=ipandportlist[3].strip("'")+"."+ipandportlist[4].strip("'")+"."+ipandportlist[1].strip("'")+"."+ipandportlist[2].strip("'")
        elif ipandportlist[0]=='4':
            ip=ipandportlist[2].strip("'")+"."+ipandportlist[3].strip("'")+"."+ipandportlist[4].strip("'")+"."+ipandportlist[1].strip("'")
        else:
            continue # unrecognised rotation code; ip would be undefined below
        method=1
        area=match[1]
        # the captured cell holds the site's anonymity label; comparing the
        # just-reset `type` here would make the branch unreachable
        if area=='Anonymous':
            type=1
        else:
            type=-1
ret.append([ip,port,method,type,area])
if indebug:
print '4',ip,port,method,type,area
return ret
def build_list_urls_5(page=10):
page=page+1
ret=[]
for i in range(1,page):
ret.append('http://www.my-proxy.com/free-proxy-list-%(n)01d.html'%{'n':i})
ret.append('http://www.my-proxy.com/free-proxy-list-s1.html')
ret.append('http://www.my-proxy.com/free-proxy-list-s2.html')
ret.append('http://www.my-proxy.com/free-proxy-list-s3.html')
return ret
def parse_page_5(html=''):
matches=re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})',html)
ret=[]
method=1
type=-1
for match in matches:
ip=match[0]
port=match[1]
area='--'
ret.append([ip,port,method,type,area])
if indebug:
print '5',ip,port,method,type,area
return ret
def build_list_urls_6(page=4):
ret=[]
ret.append('http://www.cybersyndrome.net/plr5.html')
ret.append('http://www.cybersyndrome.net/pla5.html')
ret.append('http://www.cybersyndrome.net/pld5.html')
return ret
def parse_page_6(html=''):
matches=re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})',html)
ret=[]
for match in matches:
ip=match[0]
port=match[1]
method=1
area='--'
type=-1
ret.append([ip,port,method,type,area])
if indebug:
print '6',ip,port,method,type,area
return ret
def build_list_urls_7(page=3):
ret=[]
ret.append('http://cn-proxy.com/')
return ret
def parse_page_7(html=''):
ret=[]
matches=re.findall('<tr>[\s\S]*?<td>(.*?)</td>[\s\S]*?<td>(.*?)</td>[\s\S]*?<td>(.*?)</td>[\s\S]*?</tr>',html,re.IGNORECASE)
for match in matches:
ip=match[0]
port=match[1]
method=1
type=-1
area=match[2]
ipmatch=re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})',ip)
if len(ipmatch)>0:
ret.append([ip,port,method,type,area])
if indebug:
print '7',ip,port,method,type,area.decode('utf8')
return ret
def build_list_urls_8(page=3):
ret=[]
ret.append('http://cn-proxy.com/archives/218')
return ret
def parse_page_8(html=''):
ret=[]
matches=re.findall('<tr>[\s\S]*?<td>(.*?)</td>[\s\S]*?<td>(.*?)</td>[\s\S]*?<td>(.*?)</td>[\s\S]*?<td>(.*?)</td>[\s\S]*?</tr>',html,re.IGNORECASE)
for match in matches:
ip=match[0]
port=match[1]
method=1
type=match[2].decode('utf8')
        if type==u'透明':       # transparent
            type=0
        elif type==u'普通匿名': # ordinary anonymous
            type=1
        elif type==u'高度匿名': # high anonymity
            type=2
        else:
            type=-1
area=match[3]
ipmatch=re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})',ip)
if len(ipmatch)>0:
ret.append([ip,port,method,type,area])
if indebug:
print '8',ip,port,method,type,area.decode('utf8')
return ret
def build_list_urls_9(page=4):
    ret=[]
    # the five category pages share identical list markup, so scrape them in one loop
    for cat in ('http','QQ','guonei','guowai','Socks'):
        rehtml=get_html('http://www.youdaili.cn/Daili/'+cat+'/')
        soup=BeautifulSoup(rehtml,from_encoding='utf-8')
        urllist=soup.find_all('ul','newslist_line')[0].find_all('li')
        for url in urllist:
            ret.append(url.find_all('a')[0]['href'])
    return ret
def parse_page_9(html=''):
matches=re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})\@(.*?)\#(.*?)<br \/>',html)
ret=[]
for match in matches:
ip=match[0]
port=match[1]
method=match[2]
if method=='HTTP':
method=1
elif method=='SOCKS4':
method=2
elif method=='SOCKS5':
method=3
else:
continue
area=match[3]
type=-1
ret.append([ip,port,method,type,area])
if indebug:
print '9',ip,port,method,type,area.decode('utf8')
return ret
def build_list_urls_10(page=4):
ret=[]
rehtml=get_html("http://www.itmop.com/proxy/")
soup=BeautifulSoup(rehtml,from_encoding='utf-8')
urllist=soup.find_all('dt')
for url in urllist:
ret.append(url.find_all('a')[0]['href'])
return ret
def parse_page_10(html=''):
matches=re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\:(\d{2,5})\@(.*?)\;(.*?)<br \/>',html)
ret=[]
for match in matches:
ip=match[0]
port=match[1]
method=match[2]
if method=='HTTP':
method=1
elif method=='SOCKS4':
method=2
elif method=='SOCKS5':
method=3
else:
continue
area=match[3]
type=-1
ret.append([ip,port,method,type,area])
if indebug:
print '10',ip,port,method,type,area.decode('utf8')
return ret
def build_list_urls_11(page=4):
ret=[]
ret.append('http://pachong.org/')
ret.append('http://pachong.org/anonymous.html')
ret.append('http://pachong.org/transparent.html')
ret.append('http://pachong.org/area/short/name/cn.html')
ret.append('http://pachong.org/area/short/name/br.html')
ret.append('http://pachong.org/area/short/name/us.html')
ret.append('http://pachong.org/area/short/name/ve.html')
ret.append('http://pachong.org/area/short/name/in.html')
return ret
def parse_page_11(html=''):
matches=re.findall('<tr.*?data-type=\"(.*?)\">[\s\S]*?<td.*?>.*?</td>[\s\S]*?<td>(.*?)</td>[\s\S]*?<td>(.*?)</td>[\s\S]*?<td>[\s\S]*?<img.*?>[\s\S]*?<a.*?>(.*?)</a>',html,re.IGNORECASE)
ret=[]
for match in matches:
ip=match[1]
port=match[2]
method=1
area=match[3]
type=match[0]
if type=='anonymous':
type=1
elif type=='transparent':
type=0
elif type=='high':
type=2
elif type=='socks4':
method=2
type=-1
elif type=='socks5':
method=3
type=-1
else:
type=-1
ret.append([ip,port,method,type,area])
if indebug:
print '11',ip,port,method,type,area.decode('utf8')
return ret
def build_list_urls_12(page=10):
ret=[]
ret.append('http://www.cz88.net/proxy/index.aspx')
page=page+1
for i in range(2,page):
ret.append('http://www.cz88.net/proxy/http_%(n)01d.aspx'%{'n':i})
return ret
def parse_page_12(html=''):
matches=re.findall('<tr><td>(.*?)</td><td>(.*?)</td><td>.*?</td><td>.*?</td><td><div.*?>(.*?)</div></td></tr>',html,re.IGNORECASE)
ret=[]
for match in matches:
ip=match[0].decode('gbk').encode('utf8')
port=match[1].decode('gbk').encode('utf8')
method=1
area=match[2].decode('gbk').encode('utf8')
type=-1
ret.append([ip,port,method,type,area])
if indebug:
print '12',ip,port,method,type,area.decode('utf8')
return ret
def build_list_urls_13(page=10):
ret=[]
ret.append('http://www.cz88.net/proxy/socks4.aspx')
ret.append('http://www.cz88.net/proxy/socks4_2.aspx')
ret.append('http://www.cz88.net/proxy/socks4_3.aspx')
ret.append('http://www.cz88.net/proxy/socks5.aspx')
ret.append('http://www.cz88.net/proxy/socks5_2.aspx')
return ret
def parse_page_13(html=''):
matches=re.findall('<tr><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>.*?</td><td><div.*?>(.*?)</div></td></tr>',html,re.IGNORECASE)
ret=[]
for match in matches:
ip=match[0].decode('gbk').encode('utf8')
port=match[1].decode('gbk').encode('utf8')
method=match[2].decode('gbk').encode('utf8')
if method.decode('utf8')==u'SOCKS4':
method=2
elif method.decode('utf8')==u'SOCKS5':
method=3
else:
continue
area=match[3].decode('gbk').encode('utf8')
type=-1
ret.append([ip,port,method,type,area])
if indebug:
print '13',ip,port,method,type,area.decode('utf8')
return ret
# worker thread: runs either one site's grab job or one slice of the proxy checks
class TEST(threading.Thread):
def __init__(self,action,index=None,checklist=None,checkmothed='http'):
threading.Thread.__init__(self)
self.index =index
self.action=action
self.checklist=checklist
self.checkmothed=checkmothed
def run(self):
if (self.action=='getproxy'):
get_proxy_one_website(self.index)
else:
check_proxy(self.index,self.checklist,self.checkmothed)
def check_proxy(index,checklist,checkmothed):
for item in checklist:
check_one_proxy(checkmothed,item[0],item[1],item[2])
def check_one_proxy(checkmothed,ip,port,method):
global update_array
global check_in_one_call
global target_url,target_string,target_timeout
url=target_url
checkstr=target_string
timeout=target_timeout
    opener=None
    if checkmothed=='http':
        if method==1:
            # build a per-call opener; installing a global opener with
            # urllib2.install_opener() would race across the checker threads
            proxy_handler = urllib2.ProxyHandler({'http': 'http://'+ip+':'+str(port)+'/'})
            opener = urllib2.build_opener(proxy_handler)
else:
            return # socks4/socks5 proxies cannot be checked in plain http mode
elif checkmothed=='connect':
if method==1:
socks.setdefaultproxy(socks.PROXY_TYPE_HTTP, ip, int(port))
elif method==2:
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS4, ip, int(port))
elif method==3:
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, ip, int(port))
socks.wrap_module(urllib2)
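        # socksipy routes traffic by monkey-patching the socket class that
        # urllib2 uses (upstream SocksiPy names this hook wrapmodule(); the
        # socks.py bundled here apparently exposes it as wrap_module).  Note
        # the default proxy is process-wide state, so concurrent connect-mode
        # checks can overwrite each other's proxy setting.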
send_headers = {
'User-agent':'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)'
}
t1=time.time()
try:
        req = urllib2.Request(url,headers=send_headers)
        # cap the request at target_timeout so dead proxies fail fast
        if opener is not None:
            r = opener.open(req,timeout=timeout)     # http mode: per-call opener
        else:
            r = urllib2.urlopen(req,timeout=timeout) # connect mode: sockets already wrapped
rehtml=r.read()
pos=rehtml.find(checkstr)
except Exception,e:
pos=-1
print e
t2=time.time()
timeused=t2-t1
if (timeused<timeout and pos>0):
active=1
else:
active=0
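    # list.append is atomic under CPython's GIL, which is what makes these
    # concurrent appends from the checker threads safe without a lock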
update_array.append([ip,port,active,timeused])
def check_all_proxy(threadCount,checkmothed='http'):
    global check_in_one_call,cursor
    threads=[]
    cursor.execute('select ip,port,method from proxy where active=0')
    rows = cursor.fetchall()
    rows=rows[:check_in_one_call] # enforce the per-run cap declared at the top
    # work out how many proxies each thread should check
    if len(rows)>=threadCount:
        num_in_one_thread=len(rows)/threadCount
    else:
        num_in_one_thread=1
    threadCount=threadCount+1 # range(1,threadCount) below is exclusive, so +1 yields one worker per slice
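    # worked example: 23 rows with threadCount=5 gives num_in_one_thread=4;
    # workers 1-4 check rows[0:4], rows[4:8], rows[8:12], rows[12:16], and the
    # last worker (index+1==threadCount) checks rows[16:], taking the remainder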
print u"现在开始验证以下代理服务器....."
for index in range(1,threadCount):
        # hand each thread its slice of the checklist; the last thread also takes the remainder
checklist=rows[(index-1)*num_in_one_thread:index*num_in_one_thread]
if (index+1==threadCount):
checklist=rows[(index-1)*num_in_one_thread:]
t=TEST('',index,checklist,checkmothed)
t.setDaemon(True)
t.start()
threads.append((t))
for thread in threads:
thread.join(60)
    update_proxies() # flush all check results back to the database
def get_proxy_one_website(index):
global proxy_array
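    # dispatch by name: site N is handled by the pair build_list_urls_N() /
    # parse_page_N(), resolved here via eval()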
func='build_list_urls_'+str(index)
parse_func=eval('parse_page_'+str(index))
urls=eval(func+'()')
for url in urls:
print url
html=get_html(url)
proxylist=parse_func(html)
for proxy in proxylist:
ip=string.strip(proxy[0])
port=string.strip(proxy[1])
if (re.compile("^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$").search(ip)):
method=str(proxy[2])
type=str(proxy[3])
area=string.strip(proxy[4])
proxy_array.append([ip,port,method,type,area])
def get_all_proxies(webindex='0'):
global web_site_count,cursor
threads=[]
if webindex=='0':
print u"现在开始从以下"+str(web_site_count)+u"个网站抓取代理列表...."
count=web_site_count+1
for index in range(1,count):
t=TEST('getproxy',index)
t.setDaemon(True)
t.start()
threads.append((t))
for thread in threads:
thread.join(60)
add_proxies_to_db()
else:
print u"现在开始从以下第"+webindex+u"个网站抓取代理列表...."
t=TEST('getproxy',webindex)
t.setDaemon(True)
t.start()
threads.append((t))
for thread in threads:
thread.join(60)
def add_proxies_to_db():
global proxy_array
count=len(proxy_array)
for i in range(count):
item=proxy_array[i]
cursor.execute("select ip from proxy where ip='"+item[0]+"'")
iplist=cursor.fetchall()
if len(iplist)==0:
sql="""insert into `proxy` (`ip`,`port`,`method`,`type`,`intime`,`area`) values
('"""+item[0]+"',"+item[1]+","+item[2]+","+item[3]+",now(),'"+clean_string(item[4])+"')"
try:
cursor.execute(sql)
except Exception,e:
print e
def update_proxies():
global update_array
    for item in update_array:
        # item = [ip, port, active, seconds_taken]; parameterised like the insert above
        sql="""update `proxy` set `checktime`=now(),
            `active`=%s,
            `speed`=%s
            where `ip`=%s and `port`=%s"""
        try:
            cursor.execute(sql,(item[2],item[3],item[0],item[1]))
        except:
            pass
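# clean_string() defangs scraped free-text before it is stored, e.g.
#   clean_string("Beijing,  'CN' / telecom")  ->  "Beijing CN telecom"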
def clean_string(s):
tmp=re.sub(r"['\,\s\\\/]", ' ', s)
return re.sub(r"\s+", ' ', tmp)
def open_database():
    global conn,cursor,db_host,db_port,db_user,db_passwd,db_database,db_charset,webindex
    try:
        conn=MySQLdb.connect(host=db_host,port=db_port,user=db_user,passwd=db_passwd,db=db_database,charset=db_charset)
        cursor=conn.cursor()
    except:
        print u"failed to connect to the database"
        sys.exit()
    if webindex=='0': # real run (not a -t test of a single site)
        cursor.execute("delete from proxy where active=0 and checktime is not null") # purge proxies that failed their last check
def close_database():
global cursor,conn
cursor.close()
conn.close()
if __name__ == '__main__':
if len(sys.argv)<=1:
usage()
sys.exit()
    grab=False # whether to scrape the source sites
    check=False # whether to verify proxies
try:
opts, args = getopt.getopt(sys.argv[1:], "hgc:t:")
except:
usage()
sys.exit()
for op, value in opts:
if op == "-h":
usage()
sys.exit()
elif op== "-g":
webindex='0'
grab=True
elif op=='-c':
if value not in ['http','connect']:
usage()
sys.exit()
webindex='0'
check=True
checkmethod=value
elif op=="-t":
webindex=value
grab=True
else:
usage()
sys.exit()
open_database()
if grab==True:
get_all_proxies(webindex)
if check==True:
check_all_proxy(thread_num,checkmethod)
close_database()
print u"所有工作已经完成"