时间:2021-07-01 10:21:17 帮助过:24人阅读
参考: 用python爬虫抓站的一些技巧总结 zz
1.访问网站 #最简单的得到网页代码的方法
import urllib2

# Simplest way to get a page: open the URL and print the raw response body.
response = urllib2.urlopen("http://www.xx.com")
print(response.read())
2.伪装成浏览器(User-Agent,Referer等) #为了不被服务器禁止访问所以还是伪装成浏览器比较好
# Headers sent with the request so that it looks like it comes from a browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Referer': 'http://www.xx.com/xx',
    'Accept': 'application/javascript, */*;q=0.8',
}
# urllib2.Request only describes the request; urlopen() is what sends it.
request = urllib2.Request(url="http://www.xx.com", data=None, headers=headers)
response = urllib2.urlopen(request)
3.Post数据转码
import urllib
import urllib2

# Form fields to submit.
values = {
    'username': 'xxx',
    'password': 'xxx',
    'key': 'xxx',
}
# urlencode() turns the dict into a "k1=v1&k2=v2" form body.
postdata = urllib.urlencode(values)
# NOTE: `url` is a placeholder for the target address; define it before use.
# Passing `data` makes urllib2 send a POST instead of a GET.
request = urllib2.Request(url, data=postdata)
response = urllib2.urlopen(request)
4.Cookies
import cookielib
import urllib2

# Store cookies in an in-memory CookieJar and let urllib2 send them back.
cookie_handler = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener = urllib2.build_opener(cookie_handler)
# install_opener() makes this opener the one used by urllib2.urlopen().
urllib2.install_opener(opener)
# NOTE: `url` is a placeholder for the target address; define it before use.
response = urllib2.urlopen(url)
5.代理服务器 #重复多次访问同一网址 结果被封了ip或限制了访问次数
import urllib2

# Route plain-http requests through the given proxy (host:port).
proxy_handler = urllib2.ProxyHandler({"http": '42.121.6.80:8080'})
opener = urllib2.build_opener(proxy_handler)
# install_opener() makes this opener the one used by urllib2.urlopen().
urllib2.install_opener(opener)
# NOTE: `url` is a placeholder for the target address; define it before use.
response = urllib2.urlopen(url)
问:如果想cookie和proxy一起用怎么办?
答:urllib2.build_opener可以同时接收多个handler参数,例如 urllib2.build_opener(cookie_handler, proxy_handler);可用的handler还有HTTPHandler、FileHandler、FTPHandler、CacheFTPHandler等等,它们都继承自BaseHandler。
6.gzip #现在普遍支持gzip压缩,我们默认获取压缩后的网页,大大提高了抓取网页的效率,减少了带宽负荷。
import urllib2
import zlib

# NOTE: `url` is a placeholder for the target address; define it before use.
req = urllib2.Request(url)
# Tell the server that a gzip-compressed body is acceptable.
req.add_header('Accept-encoding', 'gzip')
response = urllib2.urlopen(req, timeout=120)
html = response.read()
# Decompress only when the server really answered with gzip: the header may be
# missing, or name another encoding that the gzip decoder would choke on.
gzipped = 'gzip' in (response.headers.get('Content-Encoding') or '').lower()
if gzipped:
    # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
    html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
7.其他
设置线程栈大小:栈大小显著影响python的内存占用,方法如下:
from threading import stack_size

# Stack size, in bytes, for threads created from now on: 32768 * 16 = 512 KiB.
stack_size(32768 * 16)
设置超时
import socket

# Default timeout, in seconds, applied to socket objects created afterwards.
socket.setdefaulttimeout(10)
失败后重试
def get(self, req, retries=3):
    """Open ``req`` through ``self.opener`` and return the response body.

    Meant to live on a client class that provides an ``opener`` attribute
    (the surrounding class is not shown in this snippet).

    Args:
        req: URL string or request object accepted by ``self.opener.open``.
        retries: number of extra attempts after the first failure, so at
            most ``retries + 1`` requests are made.

    Returns:
        The body returned by ``response.read()``, or ``''`` once every
        attempt has failed.
    """
    attempts_left = retries
    while True:
        try:
            response = self.opener.open(req)
            try:
                return response.read()
            finally:
                # Release the connection whether or not read() succeeded.
                response.close()
        except Exception as what:
            # Deliberately broad: any failure counts as a failed attempt.
            print('%s %s' % (what, req))
            if attempts_left <= 0:
                print('GET Failed %s' % (req,))
                return ''
            attempts_left -= 1
根据以上内容,我们可以写出一个便于配置、能省去重复性工作的helper类:
1 # -*- coding: utf-8 -*- 2 import cookielib, urllib, urllib2, socket 3 import zlib,StringIO 4 class HttpClient: 5 __cookie = cookielib.CookieJar() 6 #代理设置,需要时添加(后续设置为多代理切换) 7 #__proxy_handler = urllib2.ProxyHandler({"http" : '42.121.6.80:8080'}) 8 __req = urllib2.build_opener(urllib2.HTTPCookieProcessor(__cookie))#,__proxy_handler) 9 __req.addheaders = [10 ('Accept', 'application/javascript, */*;q=0.8'),11 ('User-Agent', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)')12 ]13 urllib2.install_opener(__req)14 15 def Get(self, url, refer=None,retries=3):16 try:17 req = urllib2.Request(url)18 req.add_header('Accept-encoding', 'gzip')19 if not (refer is None):20 req.add_header('Referer', refer)21 response = urllib2.urlopen(req, timeout=120)22 html = response.read()23 gzipped = response.headers.get('Content-Encoding')24 if gzipped:25 html = zlib.decompress(html, 16+zlib.MAX_WBITS)26 return html27 except Exception,what:28 print what29 if retries>0:30 return self.Get(url,refer,retries-1)31 else:32 print "Get Failed",url33 return ''34 #except urllib2.HTTPError, e:35 # return e.read()36 #except socket.timeout, e:37 # return ''38 #except socket.error, e:39 # return ''40 41 def Post(self, url, data, refer=None):42 try:43 req = urllib2.Request(url, urllib.urlencode(data))44 #req = urllib2.Request(url,data)45 if not (refer is None):46 req.add_header('Referer', refer)47 return urllib2.urlopen(req, timeout=120).read()48 except urllib2.HTTPError, e:49 return e.read()50 except socket.timeout, e:51 return ''52 except socket.error, e:53 return ''54 55 def Download(self, url, file):56 output = open(file, 'wb')57 output.write(urllib2.urlopen(url).read())58 output.close()59 60 def getCookie(self, key):61 for c in self.__cookie:62 if c.name == key:63 return c.value64 return ''65 66 def setCookie(self, key, val, domain):67 ck = cookielib.Cookie(version=0, name=key, value=val, port=None, port_specified=False, domain=domain, domain_specified=False, 
domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)68 self.__cookie.set_cookie(ck)HttpClient
至于多线程就参考网上找的这段代码好了,还支持并发。。。
# ThreadDemo: a fixed pool of worker threads consuming jobs from a queue.
from threading import Thread
from Queue import Queue
from time import sleep

# q is the task queue
# NUM is the number of concurrent worker threads
# JOBS is the number of tasks to run
q = Queue()
NUM = 2
JOBS = 10


# Handler that processes a single task.
def do_somthing_using(arguments):
    print(arguments)


# Worker thread body: keep taking tasks from the queue and processing them.
def working():
    while True:
        arguments = q.get()
        try:
            do_somthing_using(arguments)
            sleep(1)
        except Exception as what:
            # Report and keep the worker alive so the remaining tasks still run.
            print('task %r failed: %s' % (arguments, what))
        finally:
            # Always acknowledge the task, otherwise q.join() never returns.
            q.task_done()


# Start NUM worker threads that wait on the queue.
for i in range(NUM):
    t = Thread(target=working)
    # Daemon threads do not keep the interpreter alive once the main thread ends.
    t.daemon = True
    t.start()

# Put JOBS tasks on the queue.
for i in range(JOBS):
    q.put(i)

# Wait until every task has been processed.
q.join()
爬虫就告一段落吧,更深入的爬虫框架以及html解析库暂时放一放,让我考虑考虑接下来的内容,是pygame还是django!