时间:2021-07-01 10:21:17 帮助过:93人阅读
- # zhouxianglh 2013.05.03 python3.3
- import urllib.request
- from html.parser import HTMLParser
- import re
- import os
- import shutil
- import time
url = "http://www.douban.com/"
filePath = "D:\\temp"
# Read the HTML page.
# The body arrives as bytes; str(bytes) would produce the "b'...'" repr
# (escaped quotes, \x.. sequences) rather than the page text, so decode it
# with the charset the server declares, falling back to UTF-8.
with urllib.request.urlopen(url) as urlContent:
    rawPage = urlContent.read()
    pageCharset = urlContent.headers.get_content_charset() or "utf-8"
# Undecodable bytes are replaced instead of aborting the whole run.
data = rawPage.decode(pageCharset, errors="replace")
# Reset the output location: whatever currently lives at filePath (a plain
# file or a whole directory tree) is deleted, then an empty directory is
# created in its place.
if os.path.isfile(filePath):
    os.remove(filePath)
elif os.path.isdir(filePath):
    shutil.rmtree(filePath)
os.makedirs(filePath)
# Counter appended to generated names so that two names produced within the
# same second still differ.
intFlag = 0


def getTimeStr():
    """Return the current time as HHMMSS followed by a running counter."""
    global intFlag
    intFlag += 1
    stamp = time.strftime("%H%M%S")
    return "{0}{1}".format(stamp, intFlag)
# Parse the HTML.
# HTMLParser acts like an abstract base class here: it drives the parsing and
# calls the handler methods that a subclass overrides.
class MyHtmlParser(HTMLParser):
    """HTML parser that downloads the image behind every <img src=...>.

    Files are written into the module-level ``filePath`` directory and named
    with the help of the module-level ``getTimeStr()``.
    """

    def handle_starttag(self, tag, attrs):
        """Download the target of each ``src`` attribute of an <img> tag.

        tag   -- name of the opening tag being reported.
        attrs -- list of (name, value) pairs; value is None when the
                 attribute was written without a value.
        """
        if tag != "img":
            return
        for name, value in attrs:
            # A valueless src (<img src>) has nothing to fetch; skipping it
            # keeps one malformed tag from aborting the whole parse.
            if name != 'src' or value is None:
                continue
            self._download_image(value)

    def _download_image(self, imageUrl):
        """Fetch imageUrl, save it under filePath and print the outcome."""
        # Drop backslashes and single quotes; they can leak into attribute
        # values when the fed text came from the repr of a bytes object.
        imageUrl = re.sub("[\\\\']", "", imageUrl)
        # Last path segment of the URL serves as the base of the file name.
        baseName = imageUrl.split("/")[-1]
        try:
            with urllib.request.urlopen(imageUrl) as response:
                imgData = response.read()
            imgFilePath = filePath + os.sep + baseName + getTimeStr() + ".jpg"
            with open(imgFilePath, "wb") as imageFile:
                imageFile.write(imgData)
        except Exception as err:
            # Best effort: one failed image must not stop the others, but
            # say why it failed. Ctrl-C / SystemExit still propagate.
            print("****下载文件 ", imageUrl, " 出错:", err)
        else:
            print("下载文件", imageUrl, "成功,另存路径:" + imgFilePath)
parser = MyHtmlParser()
# Parse the HTML; image downloads happen inside the parser callbacks.
parser.feed(data)
# feed() may keep trailing, incomplete-looking markup buffered; close()
# forces it to be processed as if end-of-file had been reached.
parser.close()
print("获取图片操作完成")