# python-learn/learn-thread.py

from threading import Thread
from Queue import Queue
import random, time
from bs4 import BeautifulSoup
import os, sys, urllib2
import urllib2,os,socket
import thread


'''
    date:   2014-03-07 20:50:15
    url:    http://www.dbmeizi.com
    desc:   download pictures from dbmeizi.com using multiple threads
    email:  withfaker@gmail.com
'''

# Directory that downloaded images are written to (under the working directory).
pic_path = os.path.join(os.curdir, "images")
# Shared work queue: page_loop() puts image URLs in, ConsumerThread takes them out.
queue = Queue()

class ProducerThread(Thread):
    """Thread whose only job is to run the page crawler, page_loop()."""

    def run(self):
        # Invoked in the new thread by Thread.start(); crawl from the default page.
        page_loop()

class ConsumerThread(Thread):
    """Worker thread that downloads image URLs taken from the shared queue.

    The thread keeps pulling URLs and passing them to fetch(). It stops once
    no URL has arrived for ``idle_timeout`` seconds, which is treated as
    "the producer has finished".
    """

    # Seconds to wait for new work before giving up. Must be long enough to
    # cover the producer loading and parsing one listing page.
    idle_timeout = 30

    def run(self):
        # Function-scope import: the module itself only imports Queue.Queue.
        from Queue import Empty

        while True:
            try:
                # One blocking get with a timeout replaces the original
                # "if queue.empty(): exit / queue.get()" pair. That pair was
                # racy (another consumer could take the last item between the
                # two calls, leaving get() blocked forever) and made consumers
                # quit immediately if they started before the producer had
                # queued anything.
                url = queue.get(True, self.idle_timeout)
            except Empty:
                return
            try:
                fetch(url)
            finally:
                # Mark the item done only after it was actually processed, so
                # queue.join() (if ever used) reflects completed downloads.
                queue.task_done()

def page_loop(page=0, max_failures=5):
    """Crawl listing pages from ``page`` onward and queue every image URL.

    Args:
        page: index of the first listing page; anything ``int()`` accepts.
        max_failures: stop after this many consecutive pages fail to load
            or parse.

    Raises:
        SystemExit: via ``sys.exit(0)`` when a page contains no <img> tags,
            which is treated as the end of the listing (unchanged from the
            original behaviour).

    Returns normally (None) only when ``max_failures`` consecutive pages
    could not be fetched.
    """
    # Function-scope import: urlparse is not imported at module level.
    from urlparse import urljoin

    page = int(page)
    failures = 0
    # Iterate instead of recursing once per page: the recursive version hit
    # the interpreter's recursion limit on long listings and, after a failed
    # page, could fall through to code that used an unassigned ``soup``.
    while failures < max_failures:
        url = 'http://www.dbmeizi.com/?p=%s' % page
        try:
            response = urllib2.urlopen(url)
            try:
                soup = BeautifulSoup(response)
            finally:
                response.close()
        except Exception:
            # Skip the broken page, but count it so a dead site or network
            # cannot keep this loop spinning forever.
            print("internal error:[%s]" % url)
            failures += 1
            page += 1
            continue

        failures = 0
        images = soup.find_all('img')
        if not images:
            print('finished!')
            sys.exit(0)
        #print "BEGIN TO FETCH PAGE:[%s]" % page
        for image in images:
            src = image.get('src')
            if not src:
                # <img> without a src attribute: nothing to download.
                continue
            # urljoin turns protocol-relative ('//host/x.jpg') links into
            # 'http://host/x.jpg' exactly as the old "'http:' + link" did,
            # and additionally leaves absolute URLs intact and resolves
            # relative paths against the page URL.
            queue.put(urljoin(url, src))
        page += 1

    print("giving up after %d consecutive failed pages" % max_failures)

#fetch pictures
def fetch(url):
    """Download ``url`` into the ``pic_path`` directory.

    The local file name is the last 11 characters of the URL. The download
    is skipped when a non-empty file with that name already exists; an empty
    file is downloaded again. Network and file errors are reported on stdout
    and the function then returns without raising them.
    """
    print("url:[%s]" % url)
    name = url[-11:]
    p = os.path.join(os.curdir, pic_path, name)
    if os.path.isfile(p):
        if os.path.getsize(p) > 0:
            print("file[%s] is already exists." % name)
            return
        print("file[%s] exists. but size is too small." % name)

    req = urllib2.Request(url)
    # Browser-like headers. 'Accept-Encoding: gzip,deflate,sdch' is
    # deliberately NOT sent: urllib2 does not decompress responses, so a
    # server honouring it would make us save compressed bytes as the image.
    headers = (
        ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
        ('Accept-Language', 'zh-CN,zh;q=0.8,en;q=0.6'),
        ('Cache-Control', 'max-age=0'),
        ('Connection', 'keep-alive'),
        ('Referer', 'https://www.dbmeizi.com'),
        ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1660.0 Safari/537.36'),
    )
    for key, value in headers:
        req.add_header(key, value)

    # Read the whole body BEFORE touching the file system, so a failed or
    # interrupted download never leaves a truncated file that a later run
    # would mistake for a finished one.
    try:
        resp = urllib2.urlopen(req)
        try:
            data = resp.read()
        finally:
            resp.close()
    except Exception:
        print("internal error:[%s]" % url)
        return

    try:
        with open(p, 'wb') as f:
            f.write(data)
    except (IOError, OSError):
        # e.g. the derived name contains a path separator or the disk is full;
        # report it instead of killing the calling worker thread.
        print("write error:[%s]" % p)
    #print "fetch url done.[%s]" % url

if __name__ == '__main__':
    # Make sure the download directory is there before any worker writes to it.
    if not os.path.isdir(pic_path):
        os.mkdir(pic_path)
    # One crawler thread feeds the queue ...
    ProducerThread(name="Producer1").start()
    # ... and five download workers, Consumer1 through Consumer5, drain it.
    for worker_no in range(1, 6):
        ConsumerThread(name="Consumer%d" % worker_no).start()