#! /usr/bin/env python
# -*- coding=utf-8 -*-
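#
# Crawl photo albums from p.lexun.net in four stages:
#   1. pageloop1 walks the ranking list pages and records their URLs in List1.txt
#   2. pageloop2 extracts each album's page URL into List2.txt
#   3. pageloop3 extracts every image URL of an album into List3.txt
#   4. pageloop4 downloads the images into the local 'mnsfz' folder
# Stages 2-4 are fanned out over simple threading.Thread workers.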
import urllib2,urllib
import re,time,socket
import os
import sys
import threading
path = os.getcwd()
new_path = os.path.join(path,r'mnsfz')
if not os.path.isdir(new_path):
    os.mkdir(new_path)
# create / truncate the three URL list files
path1 = new_path + '/' + r'List1.txt'
open(path1, 'wt').close()
path2 = new_path + '/' + r'List2.txt'
open(path2, 'wt').close()
path3 = new_path + '/' + r'List3.txt'
open(path3, 'wt').close()
# regular expressions used by the crawler
match1 = r'<div class="bgyellow_bsb"><a href="(info_toplist.*?)"'  # next list page
match2 = r'(http.*?)#'  # URLs stored in List1.txt
match3 = r'<a href="(unit_info.*?ps=18)">'  # album page URLs inside a list page
match4 = r'(unit_info.*?ps=18)'  # album page URLs stored in List2.txt
match5 = r'value="(http://.*?\.jpg)" emptyok="true" />'  # image URL inside an opened album, e.g. <input name="picurl" type="hidden" value="http://d4.lexun.net/d43/act/20150324/18/94798621.jpg" emptyok="true" />
match6 = r'<a href="(unit_info.*?ps=18)">下一页'  # the album's "next page" link
match7 = r'(http.*?\.jpg)'  # image URLs stored in List3.txt
match8 = r'<img src="(http.*?\.jpg)" alt='  # original-image URL on the download page
match9 = r'(http.*?\.jpg)'  # real image URL on the original-image download page
# entry URL: first page of the album ranking list
url1 = r'http://p.lexun.net/w/info_toplist.aspx?flag=1&ps=18&total=17967&total=17967&cd=0&lxt=404dd8b222b4d64dsggshhtgrq&vs=1&_r=451103666'
def pageloop1(url1):
    # walk the first 40 list pages, appending each page URL (suffixed with '#') to List1.txt
    for i in range(1, 41):
        putintotxt(url1 + r'#', path1)
        html = useragent(url1)
        bturl = geturl(match1, html)
        if bturl:
            src = bturl[0]
            url1 = r'http://p.lexun.net/w/' + src.replace(r'amp;', '')
def pageloop2(url2):
    # fetch one list page and append every album page URL it contains to List2.txt
    print r'page', url2
    html2 = useragent(url2)
    pagelist = geturl(match3, html2)
    putintotxt(pagelist, path2)
def pageloop3(pageurl):
    # fetch one album page, append its image URLs to List3.txt,
    # then recurse into the album's next page if there is one
    url2 = r'http://p.lexun.net/w/' + pageurl.replace(r'amp;', '')
    # print r'next page', url2
    html3 = useragent(url2)
    imglist = geturl(match5, html3)
    # print imglist
    putintotxt(imglist, path3)
    nextimgurl = geturl(match6, html3)
    if nextimgurl:
        src = nextimgurl[0]
        pageurl2 = r'http://p.lexun.net/w/' + src.replace(r'amp;', '')
        pageloop3(pageurl2)
def pageloop4(urlimg):
    # download a single image into the local folder; skip it if it already exists
    try:
        name = os.path.basename(urlimg)
        if os.path.isfile(new_path + '/' + name):
            print u'already exists'
        else:
            content = urllib2.urlopen(urlimg, None, timeout=20).read()
            with open(new_path + '/' + name, 'wb') as code:
                code.write(content)
            # also push the URL through the resize gateway to fetch the original-size image
            print u'need host'
            useragent2(urlimg)
    except:
        useragent2(urlimg)
def useragent2(urlimg):
    # ask the resize gateway for the original image URL, then download it via pageloop4
    try:
        url = r'http://app.lexun.com/resizepic/pic_zoomr.aspx?cd=0&lxt=404dd8b222b4d64dsggshhtgrq&vs=1&_r=3925580'
        values = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36",
            "picurl": urlimg
        }
        data = urllib.urlencode(values)
        req = urllib2.Request(url, data)
        proxy_support = urllib2.ProxyHandler({'http': 'http://190.79.62.76:8080'})
        opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
        urllib2.install_opener(opener)
        response = urllib2.urlopen(req)
        html = response.read()
        HTL = geturl(match8, html)
        if HTL:
            print HTL[0]
            pageloop4(HTL[0])
    except:
        pass
class getallpag(threading.Thread):
    # worker: process list pages ALLPAG[begin:end]
    def __init__(self, begin, end):
        threading.Thread.__init__(self)
        self.begin = begin
        self.end = end
    def run(self):
        for i in range(self.begin, self.end):
            pageloop2(ALLPAG[i])

class getimgpag(threading.Thread):
    # worker: process album pages ALLPAG2[begin:end]
    def __init__(self, begin, end):
        threading.Thread.__init__(self)
        self.begin = begin
        self.end = end
    def run(self):
        for i in range(self.begin, self.end):
            pageloop3(ALLPAG2[i])

class getmypic(threading.Thread):
    # worker: download images ALLPIC[begin:end]
    def __init__(self, begin, end):
        threading.Thread.__init__(self)
        self.begin = begin
        self.end = end
    def run(self):
        for i in range(self.begin, self.end):
            pageloop4(ALLPIC[i])
def geturl(match, html):
    # return all regex matches of `match` in `html`
    reg = re.compile(match)
    URLNEXT = re.findall(reg, html)
    return URLNEXT

def putintotxt(url, path):
    # append the URL string (or list of URLs) to the list file; no newlines are written
    with open(path, 'a+') as code:
        code.writelines(url)
def useragent(url):
    # fetch a page; on any error return a dummy string so callers simply find no matches
    try:
        html = urllib2.urlopen(url, None, timeout=10).read()
        #time.sleep(1)
    except:
        html = r'123456'
    return html
def listmk(path, match):
    # read the list file (everything sits on a single line) and extract URLs with `match`
    with open(path, 'r+') as f:
        allurl = f.readlines()
    reg = re.compile(match)
    urllist = re.findall(reg, allurl[0])
    return urllist
pageloop1(url1)
ALLPAG=listmk(path1,match2)
l=len(ALLPAG)
print l
if __name__ == '__main__':
    # stage 2: one worker thread per block of 10 list pages extracts the album page URLs
    threads = []
    for m in range(0, l, 10):
        threads.append(getallpag(m, min(m + 10, l)))
    for t in threads:
        t.start()
    for t in threads:
        t.join()
ALLPAG2=listmk(path2,match4)
l2=len(ALLPAG2)
print l2
if __name__ == '__main__':
    # stage 3: one worker thread per block of 100 album pages extracts the image URLs
    threads = []
    for m in range(0, l2, 100):
        threads.append(getimgpag(m, min(m + 100, l2)))
    for t in threads:
        t.start()
    for t in threads:
        t.join()
ALLPIC = listmk(path3, match7)
print u'total:', len(ALLPIC)
if __name__ == '__main__':
    # stage 4: one worker thread per block of 100 images downloads them
    threads = []
    kl = len(ALLPIC)
    for i in range(0, kl, 100):
        threads.append(getmypic(i, min(i + 100, kl)))
    for t in threads:
        t.start()
    # wait for the worker threads to finish
    for t in threads:
        t.join()
print "the end!!"