# -*- coding: utf-8 -*-
# Author: 天一
# Blog: http://www.90blog.org
# Version: 1.0
# 功能: Python抓取百度站长平台用户名脚本
import urllib
import urllib2
import re
import time
def BiduSpider():
pattern = re.compile(r'
(.*)的个人资料 百度站长社区 ')
uid=1
thedatas = []
while uid <400000:
theUrl = "http://bbs.zhanzhang.baidu.com/home.php?mod=space&uid="+str(uid)
uid +=1
theResponse = urllib2.urlopen(theUrl)
thePage = theResponse.read()
#正则匹配用户名
theFindall = re.findall(pattern,thePage)
#等待0.5秒,以防频繁访问被禁止
time.sleep(0.5)
if theFindall :
#中文编码防止乱码输出
thedatas = theFindall[0].decode('utf-8').encode('gbk')
#写入txt文本文档
f = open('theUid.txt','a')
f.writelines(thedatas+'\n')
f.close()
if __name__ == '__main__':
BiduSpider()
最终成果如下: