#! /usr/bin/env python
# -*- coding=utf-8 -*-
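#
# Crawl photo albums from p.lexun.net in four stages:
#   1. pageloop1 walks the ranking list pages and records their URLs in List1.txt
#   2. pageloop2 extracts each album's page URL into List2.txt
#   3. pageloop3 extracts every image URL of an album into List3.txt
#   4. pageloop4 downloads the images into the local 'mnsfz' folder
# Stages 2-4 are fanned out over simple threading.Thread workers.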
import urllib2,urllib
import re,time,socket
import os
import sys
import threading
path = os.getcwd()
new_path = os.path.join(path,r'mnsfz')
if not os.path.isdir(new_path):
    os.mkdir(new_path)
# create / truncate the three URL list files
path1 = new_path + '/' + r'List1.txt'
open(path1, 'wt').close()
path2 = new_path + '/' + r'List2.txt'
open(path2, 'wt').close()
path3 = new_path + '/' + r'List3.txt'
open(path3, 'wt').close()
# regular expressions used by the crawler
match1 = r'<div class="bgyellow_bsb"><a href="(info_toplist.*?)"'  # next list page
match2 = r'(http.*?)#'  # URLs stored in List1.txt
match3 = r'<a href="(unit_info.*?ps=18)">'  # album page URLs inside a list page
match4 = r'(unit_info.*?ps=18)'  # album page URLs stored in List2.txt
match5 = r'value="(http://.*?\.jpg)" emptyok="true" />'  # image URL inside an opened album, e.g. <input name="picurl" type="hidden" value="http://d4.lexun.net/d43/act/20150324/18/94798621.jpg" emptyok="true" />
match6 = r'<a href="(unit_info.*?ps=18)">下一页'  # the album's "next page" link
match7 = r'(http.*?\.jpg)'  # image URLs stored in List3.txt
match8 = r'<img src="(http.*?\.jpg)" alt='  # original-image URL on the download page
match9 = r'(http.*?\.jpg)'  # real image URL on the original-image download page
# entry URL: first page of the album ranking list
url1 = r'http://p.lexun.net/w/info_toplist.aspx?flag=1&ps=18&total=17967&total=17967&cd=0&lxt=404dd8b222b4d64dsggshhtgrq&vs=1&_r=451103666'
def pageloop1(url1):
    # walk the first 40 list pages, appending each page URL (suffixed with '#') to List1.txt
    for i in range(1, 41):
        putintotxt(url1 + r'#', path1)
        html = useragent(url1)
        bturl = geturl(match1, html)
        if bturl:
            src = bturl[0]
            url1 = r'http://p.lexun.net/w/' + src.replace(r'amp;', '')
def pageloop2(url2):
    # fetch one list page and append every album page URL it contains to List2.txt
    print r'page', url2
    html2 = useragent(url2)
    pagelist = geturl(match3, html2)
    putintotxt(pagelist, path2)
def pageloop3(pageurl):
    # fetch one album page, append its image URLs to List3.txt,
    # then recurse into the album's next page if there is one
    url2 = r'http://p.lexun.net/w/' + pageurl.replace(r'amp;', '')
    # print r'next page', url2
    html3 = useragent(url2)
    imglist = geturl(match5, html3)
    # print imglist
    putintotxt(imglist, path3)
    nextimgurl = geturl(match6, html3)
    if nextimgurl:
        src = nextimgurl[0]
        pageurl2 = r'http://p.lexun.net/w/' + src.replace(r'amp;', '')
        pageloop3(pageurl2)
def pageloop4(urlimg):
    # download a single image into the local folder; skip it if it already exists
    try:
        name = os.path.basename(urlimg)
        if os.path.isfile(new_path + '/' + name):
            print u'already exists'
        else:
            content = urllib2.urlopen(urlimg, None, timeout=20).read()
            with open(new_path + '/' + name, 'wb') as code:
                code.write(content)
            # also push the URL through the resize gateway to fetch the original-size image
            print u'need host'
            useragent2(urlimg)
    except:
        useragent2(urlimg)
def useragent2(urlimg):
    # ask the resize gateway for the original image URL, then download it via pageloop4
    try:
        url = r'http://app.lexun.com/resizepic/pic_zoomr.aspx?cd=0&lxt=404dd8b222b4d64dsggshhtgrq&vs=1&_r=3925580'
        values = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36",
            "picurl": urlimg
        }
        data = urllib.urlencode(values)
        req = urllib2.Request(url, data)
        proxy_support = urllib2.ProxyHandler({'http': 'http://190.79.62.76:8080'})
        opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
        urllib2.install_opener(opener)
        response = urllib2.urlopen(req)
        html = response.read()
        HTL = geturl(match8, html)
        if HTL:
            print HTL[0]
            pageloop4(HTL[0])
    except:
        pass
class getallpag(threading.Thread):
    # worker: process list pages ALLPAG[begin:end]
    def __init__(self, begin, end):
        threading.Thread.__init__(self)
        self.begin = begin
        self.end = end
    def run(self):
        for i in range(self.begin, self.end):
            pageloop2(ALLPAG[i])

class getimgpag(threading.Thread):
    # worker: process album pages ALLPAG2[begin:end]
    def __init__(self, begin, end):
        threading.Thread.__init__(self)
        self.begin = begin
        self.end = end
    def run(self):
        for i in range(self.begin, self.end):
            pageloop3(ALLPAG2[i])

class getmypic(threading.Thread):
    # worker: download images ALLPIC[begin:end]
    def __init__(self, begin, end):
        threading.Thread.__init__(self)
        self.begin = begin
        self.end = end
    def run(self):
        for i in range(self.begin, self.end):
            pageloop4(ALLPIC[i])
def geturl(match, html):
    # return all regex matches of `match` in `html`
    reg = re.compile(match)
    URLNEXT = re.findall(reg, html)
    return URLNEXT

def putintotxt(url, path):
    # append the URL string (or list of URLs) to the list file; no newlines are written
    with open(path, 'a+') as code:
        code.writelines(url)
def useragent(url):
    # fetch a page; on any error return a dummy string so callers simply find no matches
    try:
        html = urllib2.urlopen(url, None, timeout=10).read()
        #time.sleep(1)
    except:
        html = r'123456'
    return html
def listmk(path, match):
    # read the list file (everything sits on a single line) and extract URLs with `match`
    with open(path, 'r+') as f:
        allurl = f.readlines()
    reg = re.compile(match)
    urllist = re.findall(reg, allurl[0])
    return urllist
pageloop1(url1)
ALLPAG=listmk(path1,match2)
l=len(ALLPAG)
print l
if __name__ == '__main__':
    # stage 2: one worker thread per block of 10 list pages extracts the album page URLs
    threads = []
    for m in range(0, l, 10):
        threads.append(getallpag(m, min(m + 10, l)))
    for t in threads:
        t.start()
    for t in threads:
        t.join()
ALLPAG2=listmk(path2,match4)
l2=len(ALLPAG2)
print l2
if __name__ == '__main__':
    # stage 3: one worker thread per block of 100 album pages extracts the image URLs
    threads = []
    for m in range(0, l2, 100):
        threads.append(getimgpag(m, min(m + 100, l2)))
    for t in threads:
        t.start()
    for t in threads:
        t.join()
ALLPIC = listmk(path3, match7)
print u'total:', len(ALLPIC)
if __name__ == '__main__':
    # stage 4: one worker thread per block of 100 images downloads them
    threads = []
    kl = len(ALLPIC)
    for i in range(0, kl, 100):
        threads.append(getmypic(i, min(i + 100, kl)))
    for t in threads:
        t.start()
    # wait for the worker threads to finish
    for t in threads:
        t.join()
print "the end!!"