Python抓取网页图片示例(Python爬虫)

代码如下:

# -*- encoding: utf-8 -*-
'''
created on 2014-4-24

@author: leon wong
'''

import os
import re
import sys
import time
import urllib
import urllib2
import uuid

#获取二级页面urldef findurl2(html): re1 = r’http://tuchong.com/\d+/\d+/|http://\w+(?# @blocknum: 已经下载的数据块# @blocksize: 数据块的大小# @totalsize: 远程文件的大小# ”’# print str(blocknum),str(blocksize),str(totalsize)# if blocknum * blocksize >= totalsize:# print ‘下载完成’

def quitit():
    """Print a farewell message and terminate the program with exit status 0.

    Raises:
        SystemExit: always, carrying exit code 0.
    """
    print("bye!")
    # sys.exit() is the programmatic way to stop the interpreter; the bare
    # exit() helper is injected by the site module for interactive sessions
    # and is not guaranteed to exist (e.g. when running with -S).
    sys.exit(0)

if __name__ == '__main__':
    # Script entry point: ask for a listing page number, fetch that listing
    # page, then hand every detail page it links to over to download().
    # NOTE(review): the banner's line breaks and padding were lost in the
    # published copy of this script; the layout below is a reconstruction.
    print('''
*****************************************
**    welcome to spider for tuchong    **
**        created on 2014-4-24         **
**         @author: leon wong          **
*****************************************''')

    # raw_input is kept because this script targets Python 2 (it imports urllib2).
    pageno = raw_input("input the page number you want to scratch (1-100),please input 'quit' if you want to quit>")

    # Re-prompt until the input is a whole number inside the advertised
    # 1-100 range. The lower bound is checked explicitly so that "0" is
    # rejected; isdigit() already excludes negative numbers.
    while not pageno.isdigit() or not 1 <= int(pageno) <= 100:
        if pageno == 'quit':
            quitit()
        print("param is invalid , please try again.")
        pageno = raw_input("input the page number you want to scratch >")

    # Crawl the listing for the tuchong "portrait" tag (the URL-encoded tag
    # name in the address below).
    html = gethtml("http://tuchong.com/tags/%e4%ba%ba%e5%83%8f/?page=" + str(pageno))
    detllst = findurl2(html)
    for detail in detllst:
        html2 = gethtml(detail)
        download(html2, pageno)
    print("finished.")

Posted in 未分类

发表评论