import re
import os
import urllib.request

home = "http://www.gamersky.com/ent/wp/"
i_count = 5  # number of article links to crawl from the index page


def downImg(url, title):
    # Download every image linked from one article page into a folder named after the article.
    if not os.path.exists(title):
        os.makedirs(title)
    page = urllib.request.urlopen(url).read()
    try:
        page = page.decode('utf-8')
    except UnicodeDecodeError:
        print("try gbk code")
        page = page.decode('gbk')
    # s_key = r'src="(http://img1.gamersky.com/.+?\.jpg)"'
    # Each image is wrapped in a "showimage" viewer link; capture the real .jpg URL behind it.
    s_key = r'href="http://www.gamersky.com/showimage/id_gamersky.shtml\?(http://img1.gamersky.com/.+?\.jpg)"'
    re_c = re.compile(s_key)
    ls2 = re.findall(re_c, page)
    for l2 in ls2:
        try:
            (p2, f2) = os.path.split(l2)
            if os.path.exists(os.path.join(title, f2)):
                continue  # skip images that were already downloaded
            print(l2)
            urllib.request.urlretrieve(l2, os.path.join(title, f2))
        except Exception:
            print('down image error!')


if __name__ == '__main__':
    try:
        # Fetch the index page and collect links to individual articles.
        url = home
        page = urllib.request.urlopen(url).read()
        try:
            page = page.decode('utf-8')
        except UnicodeDecodeError:
            print("try gbk code")
            page = page.decode('gbk')
        print(len(page))
        s_key = r'href="(http://www.gamersky.com/ent.+?\.shtml)"'
        re_c = re.compile(s_key)
        ls = re.findall(re_c, page)
        i = 0
        for l in ls:
            if i >= i_count:
                break
            i += 1
            print("(" + str(i) + "/" + str(i_count) + ") " + l)
            try:
                # Use the article's file name (minus .shtml) as the folder title.
                (path, file) = os.path.split(l)
                title = file.replace('.shtml', '')
                page = urllib.request.urlopen(l).read()
                page = page.decode('utf-8')
                # Multi-page articles link to sub-pages such as <url>_2.shtml;
                # escape the base URL so its dots are matched literally.
                url2 = l.replace('.shtml', '')
                s_key = 'href="(' + re.escape(url2) + '.+?)"'
                re_c = re.compile(s_key)
                ls2 = re.findall(re_c, page)
                j = 0
                for l2 in ls2:
                    j += 1
                    print("(" + str(j) + "/" + str(len(ls2)) + ") " + l2)
                    try:
                        downImg(l2, title)
                    except Exception:
                        print('error II !')
            except Exception:
                print('error!')
    except Exception:
        print("read index error!")
    print('finish!')