当前位置:Gxlcms > Python > python实现爬虫下载漫画示例

python实现爬虫下载漫画示例

时间:2021-07-01 10:21:17 帮助过:46人阅读

代码如下:


#!/usr/bin/python3.2
import os,socket
import urllib
import urllib.request,threading,time
import re,sys
# Module-level settings shared by the download functions in this script.
# The original module-level ``global`` statement was removed: at module scope
# it has no effect.  ``manhuaweb``, ``mutex`` and ``mutex2`` are assigned in
# the ``__main__`` section of this file.

weburl = ''            # URL of the comic index page (argv[1])
floder = ''            # destination folder prefix (argv[2]); spelling kept
chapterbegin = 0       # index of the first chapter to process (argv[3])
currentthreadnum = 0   # number of download threads currently accounted for
threadcount = 6        # upper bound for currentthreadnum (argv[4])

if len(sys.argv) >= 3:
    weburl = sys.argv[1]
    floder = sys.argv[2]
else:
    print("usag: downloadmanhua weburl floder chapterbegin=0 threadnnum=6")
    sys.exit(0)
if len(sys.argv) >= 4:
    chapterbegin = int(sys.argv[3])
if len(sys.argv) >= 5:
    threadcount = int(sys.argv[4])

def jin(i, jinzhi):
    """Return the non-negative integer ``i`` written in base ``jinzhi``.

    Digit values 0-9 are rendered as decimal digits; larger digit values are
    rendered as letters counting up from ``'a'`` (10 -> 'a', 35 -> 'z').

    Args:
        i: The non-negative integer to convert.
        jinzhi: The target base; must be at least 2.

    Returns:
        The most-significant-digit-first string representation of ``i``.

    Raises:
        ValueError: If ``i`` is negative or ``jinzhi`` is smaller than 2.
    """
    if jinzhi < 2:
        raise ValueError("jinzhi (the base) must be at least 2")
    if i < 0:
        raise ValueError("i must be non-negative")
    digits = []
    while True:
        # divmod keeps everything in exact integer arithmetic; the previous
        # int(i / jinzhi) went through a float and lost precision on large i.
        i, remainder = divmod(i, jinzhi)
        if remainder > 9:
            digits.append(chr(ord('a') + (remainder - 10)))
        else:
            digits.append(str(remainder))
        if i == 0:
            break
    # Digits were produced least-significant first.
    return "".join(reversed(digits))
def urlparse(p, a, c, k):
    """Expand single-character tokens in ``p`` using the word list ``k``.

    Every character of ``p`` that is a digit ``0``-``9`` or a letter
    ``a``-``f`` is treated as an index (0-15) into ``k`` and replaced by
    ``k[index]``.  An empty entry in ``k`` maps the token to itself.  All
    other characters are copied through unchanged.

    Args:
        p: The packed text to decode.
        a: Accepted for signature compatibility; not used by the decoding.
        c: Number of leading entries of ``k`` that are valid replacements.
        k: List of replacement words, indexed by token value.

    Returns:
        The decoded string.

    Note:
        Only single characters in ``0-9a-f`` are ever substituted, so at most
        the first 16 entries of ``k`` can take part.  Tokens outside that
        range (``g``-``z`` or multi-character ones) are left as they are.
        NOTE(review): the parameter names match a JavaScript "packer"
        style payload; if such payloads with more than 16 words must be
        supported, whole-word substitution would be needed - confirm.
    """
    hex_tokens = "0123456789abcdef"
    # Tolerate c being larger than the word list instead of raising.
    usable = min(c, len(k), len(hex_tokens))
    table = {}
    for index in range(usable):
        token = hex_tokens[index]
        table[token] = k[index] or token
    # Characters without a table entry (including hex characters whose index
    # is >= c) are kept verbatim rather than raising KeyError.
    return "".join(table.get(ch, ch) for ch in p)
def meispower(s):
    """Extract the ``imgpath`` value from a packed script string.

    The text from the first ``}(`` to the end of that line is taken, its last
    19 characters are dropped, and the remainder is split into four
    comma-separated fields: payload, two integers and a ``|``-separated word
    list.  The fields are decoded with ``urlparse`` and the value assigned to
    ``imgpath`` in the decoded text is returned.

    Args:
        s: Script source containing the packed call.

    Returns:
        The decoded ``imgpath=...`` match with its first 10 and last 2
        characters removed.

    Raises:
        IndexError: If ``s`` contains no ``}(``, has fewer than four fields,
            or the decoded text has no ``imgpath=`` assignment.
        ValueError: If the second or third field is not an integer.
    """
    tail_pattern = re.compile(r"(?=\}\().*", re.IGNORECASE)
    packed_call = tail_pattern.findall(s)[0]
    # NOTE(review): 19 presumably is the length of a fixed trailer such as
    # "'.split('|'),0,{}))" - confirm against the target page.
    packed_call = packed_call[0:(len(packed_call) - 19)]
    # Split from the right so commas inside the payload (first field) do not
    # shift the numeric fields and the word list.
    par = packed_call.rsplit(',', 3)
    # Drop the first character of the word-list field before splitting it
    # (presumably its opening quote).
    words = par[3][1:].split('|')
    chapterpath = urlparse(par[0], int(par[1]), int(par[2]), words)
    allurl = re.findall('imgpath=[^;]*', chapterpath)[0]
    return allurl[10:(len(allurl) - 2)]
def pictofile(weburl, filename, loop=100):
    """Download ``weburl`` into ``filename``, retrying on failure.

    Nothing is downloaded when ``filename`` already exists.  A response
    shorter than 2048 bytes is treated as a failed attempt and retried.
    Timeouts and other exceptions are printed and retried as well.  After
    ``loop + 1`` unsuccessful attempts (or immediately when ``loop`` is
    negative) a "can't download" message is printed.

    Args:
        weburl: URL of the picture.
        filename: Path of the file to create.
        loop: Number of retries allowed after the first attempt.

    Returns:
        None.
    """
    # A loop replaces the original self-recursion, so pending retries no
    # longer pile up stack frames and open responses.
    for _attempt in range(loop + 1):
        if os.path.exists(filename):
            return
        try:
            # The context managers close the response and the file even when
            # reading or writing raises.
            with urllib.request.urlopen(weburl) as response:
                data = response.read()
            if len(data) < 2048:
                continue
            print('download from %s name is %s\n' % (weburl, filename))
            with open(filename, 'wb') as myfile:
                myfile.write(data)
            return
        except socket.timeout:
            print('timeout')
        except Exception as e:
            # Best-effort downloader: report the problem and try again.
            print('error', e)
    print('can\'t download the picture %s' % weburl)
def downloadpic(url, loadpicdir, num):
    """Thread target: download one picture from ``url`` into ``loadpicdir``.

    The folder is created first when it does not exist.  The picture is
    stored as ``<num>-manhua<tail>`` where ``<tail>`` is the trailing run of
    ``[0-9a-z.]`` characters of ``url``.  Whatever happens, the module-level
    ``currentthreadnum`` counter is decremented under ``mutex`` before the
    function returns.

    Args:
        url: URL of the picture to download.
        loadpicdir: Folder that receives the picture.
        num: Number used as the file-name prefix.

    Returns:
        None.
    """
    global currentthreadnum
    try:
        # mutex2 serialises folder creation between worker threads.  The
        # ``with`` block releases it on every path; the original left it
        # locked when the folder could not be created.
        with mutex2:
            # The original probed the folder with os.chdir(), which changes
            # the working directory of the whole process and breaks relative
            # target paths; a plain existence check avoids that.
            if not os.path.isdir(loadpicdir):
                print("can't open the floder %s will be create" % loadpicdir)
                try:
                    os.makedirs(loadpicdir, exist_ok=True)
                except OSError:
                    print("can't create floder %s" % loadpicdir)
                    return
                print('create floder succeed')
        name = re.findall(r'[0-9a-z.]*\Z', url)
        filename = 'manhua' + name[0]
        pictofile(url, loadpicdir + '//' + str(num) + '-' + filename)
    finally:
        # Always give the thread slot back, otherwise code waiting on the
        # counter would stall.
        with mutex:
            currentthreadnum = currentthreadnum - 1
def downloadchapter(url, loadpicdir, num, begin=0):
    """Start one download thread per picture of a chapter.

    The chapter page at ``manhuaweb + url`` is fetched, the chapter name is
    taken from its title, and the picture base path is decoded from the
    page's ``eval`` script with ``meispower``.  Pictures ``begin`` ..
    ``num - 1`` are then downloaded by ``downloadpic`` threads, never letting
    ``currentthreadnum`` exceed ``threadcount``.

    Args:
        url: Chapter path, appended to the module-level ``manhuaweb``.
        loadpicdir: Folder prefix; the chapter name is appended directly.
        num: One past the last picture index; also passed to ``downloadpic``.
        begin: Index of the first picture to download.

    Returns:
        None.  The function does not wait for the started threads.
    """
    global currentthreadnum
    print(manhuaweb + url)
    with urllib.request.urlopen(manhuaweb + url) as response:
        webdata = response.read().decode('UTF-8')
    # NOTE(review): the published listing showed r'[^_]*' here, followed by
    # the 7-character slice below.  "<title>" is exactly 7 characters and
    # appears to have been stripped as markup when the listing was rendered;
    # confirm against the target page.
    chaptername = re.findall(r'<title>[^_]*', webdata)[0]
    chaptername = chaptername[7:len(chaptername)]
    webscrip = re.findall(r'eval.*[^<>]', webdata)
    chapterurl = 'http://mhimg.ali213.net' + meispower(webscrip[0])
    for i in range(begin, num):
        # Wait for a free thread slot.
        while currentthreadnum >= threadcount:
            time.sleep(0.5)
        with mutex:
            currentthreadnum = currentthreadnum + 1
        try:
            threading.Thread(
                target=downloadpic,
                args=(r'%s%d.jpg' % (chapterurl, i), loadpicdir + chaptername, num),
            ).start()
        except socket.error:
            # The thread never ran, so give its slot back.  The original also
            # did ``i = i - 1`` here, which has no effect inside a ``for``
            # loop; the picture is skipped, as before.
            with mutex:
                currentthreadnum = currentthreadnum - 1
        except Exception as error:
            with mutex:
                currentthreadnum = currentthreadnum - 1
            print(error, 'break')
            print('download chapter %d of picture make a error' % i)
            break


if __name__ == '__main__':
    manhuaweb = r'http://manhua.ali213.net'
    socket.setdefaulttimeout(60.0)
    mutex = threading.Lock()    # guards currentthreadnum
    mutex2 = threading.Lock()   # guards folder creation in downloadpic

    with urllib.request.urlopen(weburl) as webfile:
        webdata = webfile.read().decode('UTF-8')
    meshmode = re.compile(r'<div class="detail_body_right_sec_con">.*</div>')
    meshdata = meshmode.findall(webdata)[0]
    indexmode = re.compile(r'([0-9]*页)')
    indexdata = indexmode.findall(meshdata)

    picurlmode = re.compile(r'/comic/[0-9/]*.html')
    picurldata = picurlmode.findall(meshdata)

    chapterlength = len(picurldata)
    nummode = re.compile(r'[\d]+')

    # Chapters are taken from the end of the page's list towards its start.
    for i in range(chapterbegin, chapterlength):
        position = chapterlength - i - 1
        manhuachapter = picurldata[position]
        pagecount = int(nummode.findall(indexdata[position])[0])
        downloadchapter(manhuachapter, floder, pagecount)
class="title-txt">人气教程排行</h2> </div> <div class="m-rank u-dashed mb40"> <ul> <li class="rank-item"> <a href="/python-361871.html" title='对Python2.7pandas中的read_excel详解' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">384次</span> <span class="g-sort-num top">1</span> 对Python2.7pandas中的read_excel详解 </a> </li> <li class="rank-item"> <a href="/python-357851.html" title='Python实现定时弹窗提醒' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">383次</span> <span class="g-sort-num second">2</span> Python实现定时弹窗提醒 </a> </li> <li class="rank-item"> <a href="/python-359898.html" title='python爬虫入门(3)--利用requests构建知乎API' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">383次</span> <span class="g-sort-num third">3</span> python爬虫入门(3)--利用requests构建知乎API </a> </li> <li class="rank-item"> <a href="/python-361328.html" title='python如何爬取搜狗微信公众号文章永久链接的思路解析' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">382次</span> <span class="g-sort-num ">4</span> python如何爬取搜狗微信公众号文章永久链接的思路解析 </a> </li> <li class="rank-item"> <a href="/python-363639.html" title='python字典的键可以相同吗' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">381次</span> <span class="g-sort-num ">5</span> python字典的键可以相同吗 </a> </li> <li class="rank-item"> <a href="/python-462846.html" title='python是一种面向什么的语言?' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">381次</span> <span class="g-sort-num ">6</span> python是一种面向什么的语言? 
</a> </li> <li class="rank-item"> <a href="/python-355903.html" title='python通过pil为png图片填充上背景颜色的方法' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">381次</span> <span class="g-sort-num ">7</span> python通过pil为png图片填充上背景颜色的方法 </a> </li> <li class="rank-item"> <a href="/python-364233.html" title='python语言的编程模式有什么' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">380次</span> <span class="g-sort-num ">8</span> python语言的编程模式有什么 </a> </li> <li class="rank-item"> <a href="/python-353438.html" title='使用python获取进程pid号的方法' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">380次</span> <span class="g-sort-num ">9</span> 使用python获取进程pid号的方法 </a> </li> <li class="rank-item"> <a href="/python-362615.html" title='Python中如何解决无限循环的问题' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">380次</span> <span class="g-sort-num ">10</span> Python中如何解决无限循环的问题 </a> </li> <li class="rank-item"> <a href="/python-466149.html" title='怎么解决pip不是内部或外部命令' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">378次</span> <span class="g-sort-num ">11</span> 怎么解决pip不是内部或外部命令 </a> </li> <li class="rank-item"> <a href="/python-374795.html" title='python中def是什么意思' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">378次</span> <span class="g-sort-num ">12</span> python中def是什么意思 </a> </li> <li class="rank-item"> <a href="/python-361381.html" title='对numpy中数组元素的统一赋值实例' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">376次</span> <span class="g-sort-num ">13</span> 对numpy中数组元素的统一赋值实例 </a> </li> <li class="rank-item"> <a href="/python-378450.html" title='python的选择语句是什么语句' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">374次</span> <span class="g-sort-num ">14</span> python的选择语句是什么语句 </a> </li> <li class="rank-item"> <a href="/python-362375.html" title='Python中构造方法的解析(附示例)' class="item-name ellipsis" target="_blank"> <span 
class="g-art-count fr">374次</span> <span class="g-sort-num ">15</span> Python中构造方法的解析(附示例) </a> </li> <li class="rank-item"> <a href="/python-360729.html" title='关于python中引入导入与自定义模块以及外部文件的实例分享' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">373次</span> <span class="g-sort-num ">16</span> 关于python中引入导入与自定义模块以及外部文件的实例分享 </a> </li> <li class="rank-item"> <a href="/python-364421.html" title='python如何在不同类之间调用方法' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">372次</span> <span class="g-sort-num ">17</span> python如何在不同类之间调用方法 </a> </li> <li class="rank-item"> <a href="/python-462395.html" title='python中的【//】是什么运算符号' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">372次</span> <span class="g-sort-num ">18</span> python中的【//】是什么运算符号 </a> </li> <li class="rank-item"> <a href="/python-363743.html" title='python中╲t是什么' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">371次</span> <span class="g-sort-num ">19</span> python中╲t是什么 </a> </li> <li class="rank-item"> <a href="/python-357501.html" title='python同时给多个变量赋值' class="item-name ellipsis" target="_blank"> <span class="g-art-count fr">371次</span> <span class="g-sort-num ">20</span> python同时给多个变量赋值 </a> </li> </ul> </div> </div> </div> <!-- / 教程内容页 --> </div> </div> <!-- 页尾 --> <div class="footer"> 本站所有资源全部来源于网络,若本站发布的内容侵害到您的隐私或者利益,请联系我们删除!</div> <!-- / 页尾 --> <script type="text/javascript" src="/kan/js/read.js"></script> <div style="display:none"> <div class="login-box" id="login-dialog"> <div class="login-top"><a class="current" rel="nofollow" id="login1" onclick="setTab('login',1,2);" >登录</a></div> <div class="login-form" id="nav-signin"> <!-- <div class="login-ico"><a rel="nofollow" class="qq" id="qqlogin" target="_blank" href="/user-center-qqlogin.html"> QQ </a></div> --> <div class="login-box-form" id="con_login_1"> <form id="loginform" action="/user-center-login.html" method="post" onsubmit="return false;"> <p 
class="int-text"> <input class="email" id="username" name="username" type="text" value="用户名或Email" onfocus="if(this.value=='用户名或Email'){this.value='';}" onblur="if(this.value==''){this.value='用户名或Email';};" ></p> <p class="int-text"> <input class="password1" type="password" id="password" name="password" value="******" onBlur="if(this.value=='') this.value='******';" onFocus="if(this.value=='******') this.value='';" > </p> <p class="int-info"> <label class="ui-label"> </label> <label for="agreement" class="ui-label-checkbox"> <input type="checkbox" value="" name="cookietime" id="cookietime" checked="checked" value="2592000"> <input type="hidden" name="notforward" id="notforward" value="1"> <input type="hidden" name="dosubmit" id="dosubmit" value="1">记住我的登录 </label> <a rel="nofollow" class="aright" href="/user-center-forgetpwd.html" target="_blank"> 忘记密码? </a></p> <p class="int-btn"><a rel="nofollow" id="loginbt" class="loginbtn"><span>登录</span></a></p> </form> </div> <form id="regform" action="/user-center-reg.html" method="post"> <div class="login-reg" style="display: none;" id="con_login_2"> <input type="hidden" name="t" id="t"/> <p class="int-text"> <input id="email" name="email" type="text" value="Email" onfocus="if(this.value=='Email'){this.value='';}" onblur="if(this.value==''){this.value='Email';};"></p> <p class="int-text"> <input id="uname" name="username" type="text" value="用户名或昵称" onfocus="if(this.value=='用户名或昵称'){this.value='';}" onblur="if(this.value==''){this.value='用户名或昵称';};"></p> <p class="int-text"> <input type="password" id="pwd" name="password" value="******" onBlur="if(this.value=='') this.value='******';" onFocus="if(this.value=='******') this.value='';"> </p> <p class="int-text1"><span class="inputbox"> <input id="validate" name="validate" type="text" value="验证码" onfocus="if(this.value=='验证码'){this.value='';}" onblur="if(this.value==''){this.value='验证码';};"> </span><span class="yzm-img"><img src="/user-checkcode-index" alt="看不清楚换一张" 
id="indexlogin"></p> <p class="int-info"> <label> <input value="" name="agreement" id="agreement" CHECKED="checked" type="checkbox"> 我已阅读<a rel="nofollow" href="/user-center-agreement.html">用户协议</a>及<a rel="nofollow" href="/user-center-agreement.html">版权声明</a></label> </p> <p class="int-btn"><input type="hidden" name="dosubmit"/> <a rel="nofollow" class="loginbtn" id="register"><span>注册</span></a></p> </div> </form> </div> </div> </div> </div> <script type="text/javascript" src="/kan/js/foot_js.js"></script> <script> var _hmt = _hmt || []; (function() { var hm = document.createElement("script"); hm.src = "https://hm.baidu.com/hm.js?6dc1c3c5281cf70f49bc0bc860ec24f2"; var s = document.getElementsByTagName("script")[0]; s.parentNode.insertBefore(hm, s); })(); </script> <script type="text/javascript" src="/layui/layui.js"></script> <script> layui.use('code', function() { layui.code({ elem: 'pre', //默认值为.layui-code about: false, skin: 'notepad', title: 'php怎么实现数据库验证跳转代码块', encode: true //是否转义html标签。默认不开启 }); }); </script> </body> </html>