本文实例讲述了Python妹子图简单爬虫的实现方法，分享给大家供大家参考。具体如下：
#!/usr/bin/env python
#coding: utf-8
import urllib
import urllib2
import os
import re
import sys
def schedule(a, b, c):
    """Print the download progress to standard output.

    Passed to ``urllib.urlretrieve`` as its progress callback.

    a: number of data blocks downloaded so far
    b: size of one data block
    c: size of the remote file; when it is zero or negative (no usable
       total) the amount downloaded so far is printed instead of a
       percentage
    """
    if c <= 0:
        # urlretrieve may pass -1 when the total size is unknown; a
        # percentage would then be negative, or divide by zero for 0.
        print('%d bytes' % (a * b))
        return
    # The last block usually overshoots the file size, so cap at 100.
    per = min(100.0 * a * b / c, 100)
    print('%.2f%%' % per)
def gethtml(url):
    """Fetch *url* and return the response body as read from the server.

    Relies on ``urllib.urlopen``, i.e. the Python 2 ``urllib`` module that
    this script imports.

    url: address of the page to download
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Release the connection even when read() raises.
        page.close()
def downloadimg(html, num, foldername, pattern=r'<img[^>]*?\bsrc="(.*?)"'):
    """Download the first image found in *html* to ``<foldername>/<num>.jpg``.

    html: page source that is searched for the image address
    num: file name (without extension) for the saved picture
    foldername: local target directory; created when it does not exist
    pattern: regular expression whose first group captures the image URL.
        NOTE(review): the pattern in the published listing was lost (its
        HTML markup appears to have been stripped), so this default is a
        generic stand-in that takes the first ``<img ... src="...">`` on
        the page -- confirm against the real page markup or pass the
        exact pattern.

    Raises IndexError when *html* contains no match for *pattern*.
    """
    myitems = re.findall(pattern, html, re.S)
    if not myitems:
        # Fail with a clear message before touching the file system.
        raise IndexError('no image address found in html')

    picpath = '%s' % (foldername,)  # local download directory
    if not os.path.exists(picpath):
        os.makedirs(picpath)
    target = os.path.join(picpath, '%s.jpg' % num)

    print('downloading image to location: ' + target)
    urllib.urlretrieve(myitems[0], target, schedule)
def findpage(html, pattern=r'<span>(\d*)</span>'):
    """Return the last page number found in *html*, as a string of digits.

    html: page source containing the pagination markup
    pattern: regular expression whose single group captures a page number.
        NOTE(review): only ``(\\d*)`` survives in the published listing;
        the surrounding markup appears to have been stripped, so the
        ``<span>`` wrapper in this default is an assumption -- confirm
        against the real page markup or pass the exact pattern.

    Raises IndexError when no page number is found.
    """
    # ``\d*`` can match an empty string, which is never a usable page
    # number (callers feed the result to int()), so drop empty captures.
    numbers = [found for found in re.findall(pattern, html, re.S) if found]
    if not numbers:
        raise IndexError('no page number found in html')
    return numbers[-1]
def findlist(html, pattern=r'<a href="http://www\.mzitu\.com/(\d+)"[^>]*?title="(.*?)"'):
    """Return the album entries found in a listing page.

    html: source of the listing page
    pattern: regular expression with two groups. The caller appends the
        first group to ``http://www.mzitu.com/`` to build the album URL
        and uses the second group as the download folder name.
        NOTE(review): only ``.*?`` survives in the published listing; the
        markup appears to have been stripped, so this default (numeric id
        in the link, folder name taken from a ``title`` attribute) is an
        assumption -- confirm against the real page markup or pass the
        exact pattern.

    Returns a list of ``(id, title)`` tuples; empty when nothing matches.
    """
    return re.findall(pattern, html, re.S)
def totaldownload(modelurl):
    """Download every picture of every album listed on the page *modelurl*.

    For each entry returned by ``findlist`` the album page is fetched,
    ``findpage`` gives its number of pages, and ``downloadimg`` saves one
    picture per page into a folder named after the entry.

    modelurl: address of one listing page
    """
    base = 'http://www.mzitu.com/'
    # Parse the page fetched for *this* call. The published code parsed the
    # module-level ``listhtml`` instead, so every listing page re-processed
    # the first page and the function failed when imported.
    listhtml5 = gethtml(modelurl)
    for entry in findlist(listhtml5):
        album_url = base + str(entry[0])
        folder = str(entry[1])
        first_page = gethtml(album_url)
        totalnum = int(findpage(first_page))
        for num in range(1, totalnum + 1):
            if num == 1:
                # Page 1 is the album URL itself, which was already fetched.
                html5 = first_page
            else:
                html5 = gethtml(album_url + '/' + str(num))
            downloadimg(html5, str(num), folder)
if __name__ == '__main__':
    # This is the URL of one section of the site; other section URLs can be
    # added here to crawl the whole site.
    section_url = 'http://www.mzitu.com/model'
    listhtml = gethtml(section_url)
    for model in range(1, int(findpage(listhtml)) + 1):
        # The first listing page is the section URL itself; later pages
        # live under "/page/<n>".
        if model == 1:
            modelurl = section_url
        else:
            modelurl = section_url + '/page/' + str(model)
        totaldownload(modelurl)
    print("download has finished.")
希望本文所述对大家的Python程序设计有所帮助。