# 时间:2021-07-01 10:21:17 帮助过:93人阅读
# zhouxianglh 2013.05.03 python3.3
"""Download every image referenced by ``<img>`` tags on a web page.

Run as a script, this fetches ``url``, wipes and recreates ``filePath``,
then saves each image that can be downloaded into that directory.
"""
import os
import re
import shutil
import time
import urllib.parse
import urllib.request
from html.parser import HTMLParser

# Page whose images are downloaded.
url = "http://www.douban.com/"
# Output directory; anything already at this path is deleted by main().
filePath = "D:\\temp"

# Counter appended to the timestamp so names stay unique within one second.
intFlag = 0

# Characters that cannot appear in a Windows file name (e.g. the "?" of a
# query string left in the last URL segment).
_UNSAFE_NAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def getTimeStr():
    """Return the current ``HHMMSS`` time followed by an incrementing counter."""
    global intFlag
    intFlag += 1
    return time.strftime("%H%M%S") + str(intFlag)


class MyHtmlParser(HTMLParser):
    """HTML parser that downloads the ``src`` of every ``<img>`` tag it sees.

    Args:
        base_url: URL that relative ``src`` values are resolved against.
            Defaults to the module-level ``url``.
        save_dir: Existing directory the images are written into.
            Defaults to the module-level ``filePath``.
        **kwargs: Forwarded unchanged to ``HTMLParser``.
    """

    def __init__(self, base_url=None, save_dir=None, **kwargs):
        super().__init__(**kwargs)
        # Resolve the defaults at construction time so later changes to the
        # module globals are honoured.
        self.base_url = url if base_url is None else base_url
        self.save_dir = filePath if save_dir is None else save_dir

    def handle_starttag(self, tag, attrs):
        """Download the image when ``tag`` is ``img`` and it has a ``src``."""
        if tag != "img":
            return
        for attrName, attrValue in attrs:
            # attrValue is None for a bare attribute and "" for src="";
            # neither names anything to download.
            if attrName == "src" and attrValue:
                # urljoin leaves absolute URLs untouched and resolves
                # relative / protocol-relative ones against the page URL.
                self._saveImage(urllib.parse.urljoin(self.base_url, attrValue))

    def _saveImage(self, imageUrl):
        """Fetch ``imageUrl`` and store it in ``save_dir``.

        The file is named after the last URL segment plus a unique suffix
        from ``getTimeStr()`` and a ``.jpg`` extension. Failures are
        reported on stdout and never raised, so one broken image does not
        stop the rest of the page from being processed.
        """
        baseName = _UNSAFE_NAME_CHARS.sub("_", imageUrl.split("/")[-1])
        try:
            with urllib.request.urlopen(imageUrl) as response:
                imgData = response.read()
            # The unique suffix is taken only after a successful download,
            # so failed downloads do not consume counter values.
            imgFilePath = os.path.join(
                self.save_dir, baseName + getTimeStr() + ".jpg"
            )
            with open(imgFilePath, "wb") as imageFile:
                imageFile.write(imgData)
        except Exception as exc:
            # Best effort by design, but Exception (not a bare except) lets
            # Ctrl-C and SystemExit propagate.
            print("****下载文件 ", imageUrl, " 出错:", exc)
        else:
            print("下载文件", imageUrl, "成功,另存路径:" + imgFilePath)


def _fetchHtml(pageUrl):
    """Return the page at ``pageUrl`` decoded to text.

    Uses the charset declared in the response headers and falls back to
    UTF-8; undecodable bytes are replaced rather than raising.
    """
    with urllib.request.urlopen(pageUrl) as response:
        charset = response.headers.get_content_charset() or "utf-8"
        return response.read().decode(charset, errors="replace")


def _resetDir(path):
    """Delete whatever exists at ``path`` and create an empty directory there."""
    if os.path.isdir(path):
        shutil.rmtree(path)
    elif os.path.isfile(path):
        os.remove(path)
    os.makedirs(path)


def main():
    """Fetch ``url``, reset ``filePath`` and download every image on the page."""
    data = _fetchHtml(url)
    _resetDir(filePath)
    parser = MyHtmlParser()
    parser.feed(data)
    parser.close()
    print("获取图片操作完成")


if __name__ == "__main__":
    main()