Python初学小试

2015/04/23 36246
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2015-06-01 10:30:06
# @Author  : HG (hg0728@qq.com)
# @Version : 1.0
import urllib2
import re
import os

def getHtml(url):
    """Fetch ``url`` and return the body of the response.

    Args:
        url: Address to request; passed unchanged to ``urllib2.urlopen``.

    Returns:
        The result of ``read()`` on the opened response (the page source).

    Raises:
        Any error raised by ``urllib2.urlopen`` or ``read()`` propagates
        to the caller unchanged.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Always release the connection, even when read() raises.
        response.close()

def urlPages(page):
    """Return the listing URL for the given page.

    ``page`` is converted with ``str()`` and appended to the fixed
    ``http://www.cnblogs.com/sitehome/p/`` prefix.
    """
    return 'http://www.cnblogs.com/sitehome/p/{0}'.format(str(page))

def findList(html):
    """Extract the title links from a listing page.

    Args:
        html: Page source to search.

    Returns:
        A list of ``(href, title)`` tuples, one per
        ``<h3><a class="titlelnk" ...>`` element matched, in document order.
        ``re.S`` lets a captured group span line breaks.
    """
    title_link = re.compile(
        '<h3><a class="titlelnk" href="(.*?)" target="_blank">(.*?)</a></h3>',
        re.S
    )
    return title_link.findall(html)

LAST_PAGE = 200               # listing pages 1..LAST_PAGE are crawled
OUTPUT_FILE = 'thefile.txt'   # results are appended, one "href title" line per link

for page in range(1, LAST_PAGE + 1):
    items = findList(getHtml(urlPages(page)))
    if not items:
        # Nothing matched on this page; skip it without touching the output file.
        continue
    # Open the output once per page instead of once per item; ``with``
    # closes the file even if a write fails part-way through.
    with open(OUTPUT_FILE, 'a') as output:
        for item in items:
            # item is an (href, title) pair. The parenthesised print emits
            # the same text under both Python 2 and Python 3.
            print(item[0])
            output.write(item[0] + ' ' + item[1] + '\n')
代码片段