An example of fetching HTML page resources in Python with the urllib2 module

First, put the URLs to be fetched in a separate list file (u1.list, which the script below reads):

http://www.jb51.net/article/83440.html
http://www.jb51.net/article/83437.html
http://www.jb51.net/article/83430.html
http://www.jb51.net/article/83449.html

Then let's look at how the program works; the code is as follows:

#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import sys
import urllib2
import re

def cdown_data(fileurl, fpath, dpath):
    # Create the target directory if it does not exist yet
    if not os.path.exists(dpath):
        os.makedirs(dpath)
    try:
        # Fetch the page and write its contents to the local file
        getfile = urllib2.urlopen(fileurl)
        data = getfile.read()
        f = open(fpath, 'wb')
        f.write(data)
        f.close()
    except:
        print 'download failed:', fileurl

with open('u1.list') as lines:
    for line in lines:
        uri = line.strip()
        # Skip dynamic or percent-encoded URLs
        if '?' in uri or '%' in uri:
            continue
        # Skip bare domains such as http://www.jb51.net
        elif uri.count('/') == 2:
            continue
        elif uri.count('/') > 2:
            #print uri, uri.count('/')
            try:
                # For http://www.jb51.net/article/83440.html:
                #   dirpath  -> www.jb51.net/article
                #   filepath -> www.jb51.net/article/83440.html
                dirpath = uri.rpartition('/')[0].split('//')[1]
                #filepath = uri.split('//')[1].split('/')[1]
                filepath = uri.split('//')[1]
                if filepath:
                    print uri, filepath, dirpath
                    cdown_data(uri, filepath, dirpath)
            except:
                print uri, 'error'
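
Note that urllib2 only exists in Python 2. As a rough sketch (not part of the original article), the same logic can be written for Python 3 with urllib.request, assuming the same u1.list input file and directory layout:

#!/usr/bin/env python3
# Python 3 sketch of the same downloader, using urllib.request instead of urllib2.
import os
import urllib.request

def cdown_data(fileurl, fpath, dpath):
    # Create the target directory if needed, then save the page bytes
    os.makedirs(dpath, exist_ok=True)
    try:
        data = urllib.request.urlopen(fileurl).read()
        with open(fpath, 'wb') as f:
            f.write(data)
    except Exception as e:
        print('download failed:', fileurl, e)

with open('u1.list') as lines:
    for line in lines:
        uri = line.strip()
        # Skip dynamic URLs, percent-encoded URLs, and bare domains
        if '?' in uri or '%' in uri or uri.count('/') <= 2:
            continue
        dirpath = uri.rpartition('/')[0].split('//')[1]
        filepath = uri.split('//')[1]
        if filepath:
            print(uri, filepath, dirpath)
            cdown_data(uri, filepath, dirpath)

Either version mirrors the site's path structure on disk, e.g. saving http://www.jb51.net/article/83440.html under the local directory www.jb51.net/article/.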

The original article is at: http://www.diyoms.com/python/1806.html
