# zhouxianglh 2013.05.03 python3.3
import os
import re
import shutil
import time
import urllib.request
from html.parser import HTMLParser
# Page to scan for images and the local directory the images are saved to.
url = "http://www.douban.com/"
filepath = "d:\\temp"

# Read the HTML.  The body is decoded to real text (str() on bytes would yield
# the "b'...'" repr, full of escape sequences) and the response is closed as
# soon as it has been read.
with urllib.request.urlopen(url) as urlcontent:
    # Use the charset announced by the server; fall back to UTF-8 and replace
    # undecodable bytes so a bad byte cannot abort the run.
    charset = urlcontent.headers.get_content_charset() or "utf-8"
    data = urlcontent.read().decode(charset, errors="replace")
# Initialise the output directory: remove whatever currently occupies the
# path, then create it afresh so every run starts from an empty directory.
if os.path.isdir(filepath):
    # rmtree also removes a directory that still has content.
    shutil.rmtree(filepath)
elif os.path.isfile(filepath):
    os.remove(filepath)
os.makedirs(filepath)
# Generate unique file names.
# Running counter appended to every generated name so that names produced
# within the same second still differ.
intflag = 0


def gettimestr():
    """Return a name suffix made of the current time and a running counter.

    The result is the local time formatted as HHMMSS followed by the value of
    the module-level counter ``intflag``, which is incremented on every call.

    Returns:
        str: six time digits plus the decimal counter, e.g. ``"1530071"``.
    """
    global intflag
    intflag += 1
    # Upper-case directives: %H hour, %M minute, %S second.  The lower-case
    # forms mean month name / month number / a non-portable extension.
    return time.strftime("%H%M%S") + str(intflag)
# Parse the HTML.
# HTMLParser is used like an abstract base class: a subclass overrides the
# handle_* callbacks it is interested in.
class myhtmlparser(HTMLParser):
    """HTML parser that downloads the image referenced by every <img> tag.

    Uses the module-level ``filepath`` as the target directory and
    ``gettimestr()`` to build distinct file names.
    """

    def handle_starttag(self, tag, attrs):
        """Download the ``src`` of an ``<img>`` start tag; ignore other tags.

        Args:
            tag: Tag name as reported by the parser.
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
        """
        if tag != "img":
            return
        for name, value in attrs:
            # A valueless attribute (``<img src>``) is reported as None and
            # carries nothing to download.
            if name == "src" and value is not None:
                self._save_image(value)

    def _save_image(self, src):
        """Fetch one image URL and store it under ``filepath``.

        A failure is reported on stdout and does not stop the parse.

        Args:
            src: Value of the ``src`` attribute.
        """
        # Strip backslashes and single quotes; they can show up as escape
        # residue when the page text was produced from a repr of raw bytes.
        imageurl = re.sub(r"[\\']", "", src)
        # The last path segment serves as the base of the local file name.
        imgname = imageurl.split("/")[-1]
        try:
            with urllib.request.urlopen(imageurl) as response:
                imgdata = response.read()
            # The name is built only after a successful download, so the
            # counter behind gettimestr() advances once per fetched image.
            imgfilepath = filepath + os.sep + imgname + gettimestr() + ".jpg"
            with open(imgfilepath, "wb") as imagefile:
                imagefile.write(imgdata)
        except Exception as err:
            # Best effort: one broken image must not abort the whole run, but
            # Ctrl-C still propagates and the cause is shown.
            print("****下载文件 ", imageurl, " 出错:", err)
        else:
            print("下载文件", imageurl, "成功,另存路径:" + imgfilepath)
parser = myhtmlparser()
# Parse the HTML; the image downloads happen inside the parser's tag callbacks.
parser.feed(data)
# Force processing of any input the parser is still buffering (for example an
# incomplete trailing tag).
parser.close()
print("获取图片操作完成")