In a previous article we shared a PHP implementation for batch-fetching remote web images and saving them locally; interested readers can refer to that post for details.
# -*- coding: utf-8 -*-
import os
import uuid
import urllib2
import cookielib

'''Get the file extension'''
def get_file_extension(file):
    return os.path.splitext(file)[1]

'''Create the directory if it does not exist, and return its path'''
def mkdir(path):
    # strip leading and trailing whitespace
    path = path.strip()
    # strip a trailing \ character
    path = path.rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
    return path

'''Generate a unique string with a fixed length of 36'''
def unique_str():
    return str(uuid.uuid1())

'''
Fetch the contents of a remote file into memory
@url the file to fetch, path + filename
'''
def get_file(url):
    try:
        cj = cookielib.LWPCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        urllib2.install_opener(opener)
        req = urllib2.Request(url)
        operate = opener.open(req)
        data = operate.read()
        return data
    except BaseException, e:
        print e
        return None

'''
Save a file locally
@path local directory
@file_name file name
@data file contents
'''
def save_file(path, file_name, data):
    if data is None:
        return
    mkdir(path)
    if not path.endswith("/"):
        path = path + "/"
    file = open(path + file_name, "wb")
    file.write(data)
    file.flush()
    file.close()

# get the file extension
print get_file_extension("123.jpg")
# create the directory and return it
#print mkdir("d:/ljq")
# generate a unique 36-character string
print unique_str()
url = "http://qlogo1.store.qq.com/qzone/416501600/416501600/100?0"
save_file("d:/ljq/", "123.jpg", get_file(url))
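The script above targets Python 2 (urllib2, cookielib). On Python 3 those modules became urllib.request and http.cookiejar; what follows is a minimal, hypothetical sketch of an equivalent get_file under that assumption, not part of the original script:

# Minimal Python 3 sketch of get_file (hypothetical port).
import urllib.request
import http.cookiejar

def get_file(url):
    try:
        cj = http.cookiejar.LWPCookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
        req = urllib.request.Request(url)
        with opener.open(req) as operate:
            return operate.read()
    except Exception as e:
        print(e)
        return None

The rest of the script carries over once the print statements are changed to function-call syntax.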
The following script uses Python to fetch the images referenced by a given URL and save them locally.
# *** encoding: utf-8 ***
__author__ = 'jiangyt'
"""
fetch images from specific url
v1.0
"""
import urllib, httplib, urlparse
import re
import random

"""judge whether a url exists or not"""
def httpExists(url):
    host, path = urlparse.urlsplit(url)[1:3]
    if ':' in host:
        # port specified, try to use it
        host, port = host.split(':', 1)
        try:
            port = int(port)
        except ValueError:
            print 'invalid port number %r' % (port,)
            return False
    else:
        # no port specified, use the default port
        port = None
    try:
        connection = httplib.HTTPConnection(host, port=port)
        connection.request("HEAD", path)
        resp = connection.getresponse()
        if resp.status == 200: # normal 'found' status
            found = True
        elif resp.status == 302: # recurse on temporary redirect
            found = httpExists(urlparse.urljoin(url, resp.getheader('location', '')))
        else: # everything else -> not found
            print "status %d %s : %s" % (resp.status, resp.reason, url)
            found = False
    except Exception, e:
        print e.__class__, e, url
        found = False
    return found

"""get html source, return lines[]"""
def gGetHtmlLines(url):
    if url is None: return
    if not httpExists(url): return
    try:
        page = urllib.urlopen(url)
        html = page.readlines()
        page.close()
        return html
    except Exception, e:
        print "gGetHtmlLines() error! exception ==>> %s" % e
        return

"""get html source, return string"""
def gGetHtml(url):
    if url is None: return
    if not httpExists(url): return
    try:
        page = urllib.urlopen(url)
        html = page.read()
        page.close()
        return html
    except Exception, e:
        print "gGetHtml() error! exception ==>> %s" % e
        return

"""derive the file name from a url"""
def gGetFileName(url):
    if url is None: return None
    if url == "": return ""
    arr = url.split("/")
    return arr[len(arr) - 1]
"""generate a random file name"""
def gRandFilename(type):
    fname = ''
    for i in range(16):
        fname = fname + chr(random.randint(65, 90))
        fname = fname + chr(random.randint(48, 57))
    return fname + '.' + type
"""resolve a link found on the page at url to an absolute address"""
def gGetAbslLink(url, link):
    if url is None or link is None: return
    if url == '' or link == '': return url
    addr = ''
    if link[0] == '/':
        addr = gGetHttpAddr(url) + link
    elif len(link) > 3 and link[0:4] == 'http':
        addr = link
    elif len(link) > 2 and link[0:2] == '..':
        addr = gGetHttpAddrFatherAssign(url, link)
    else:
        addr = gGetHttpAddrFather(url) + link
    return addr

"""match the regular expression against the input lines, return a list"""
def gGetRegList(lineslist, regx):
    if lineslist is None: return
    rtnlist = []
    for line in lineslist:
        matchs = re.search(regx, line, re.IGNORECASE)
        if matchs is not None:
            allgroups = matchs.groups()
            for foundstr in allgroups:
                if foundstr not in rtnlist:
                    rtnlist.append(foundstr)
    return rtnlist

"""download a file from a url, with the file name given as a parameter"""
def gDownloadWithFilename(url, savepath, file):
    # parameter checks omitted for now
    try:
        urlopen = urllib.URLopener()
        fp = urlopen.open(url)
        data = fp.read()
        fp.close()
        f = open(savepath + file, 'w+b')
        f.write(data)
        f.close()
    except IOError, error:
        print "download %s error!==>>%s" % (url, error)
    except Exception, e:
        print "exception==>> %s" % e

"""download a file from a url, deriving the file name from the url"""
def gDownload(url, savepath):
    # parameter checks omitted for now
    filename = gGetFileName(url)
    #filename = gRandFilename('jpg')
    gDownloadWithFilename(url, savepath, filename)

"""download the jpgs referenced by the page at downloadurl"""
def gDownloadHtmlJpg(downloadurl, savepath):
    lines = gGetHtmlLines(downloadurl) # get the page source
    regx = r"""src\s*="?(\S+)\.jpg"""
    lists = gGetRegList(lines, regx) # get the links that match the regular expression
    if lists is None: return
    for jpg in lists:
        jpg = gGetAbslLink(downloadurl, jpg) + '.jpg'
        gDownload(jpg, savepath)
        print gGetFileName(jpg)

"""get the site root address from a url"""
def gGetHttpAddr(url):
    if url == '': return ''
    arr = url.split("/")
    return arr[0] + "//" + arr[2]

"""get the parent directory of a url"""
def gGetHttpAddrFather(url):
    if url == '': return ''
    arr = url.split("/")
    addr = arr[0] + '//' + arr[2] + '/'
    if len(arr) - 1 > 3:
        for i in range(3, len(arr) - 1):
            addr = addr + arr[i] + '/'
    return addr

"""resolve a parent-relative link (../...) against a url to an absolute address"""
def gGetHttpAddrFatherAssign(url, link):
    if url == '': return ''
    if link == '': return ''
    linkarray = link.split("/")
    urlarray = url.split("/")
    partlink = ''
    parturl = ''
    numoffather = 0 # number of parent levels in the link
    for i in range(len(linkarray)):
        if linkarray[i] == '..':
            numoffather = i + 1
        else:
            partlink = partlink + '/' + linkarray[i]
    for i in range(len(urlarray) - 1 - numoffather):
        parturl = parturl + urlarray[i]
        if i < len(urlarray) - 1 - numoffather - 1:
            parturl = parturl + '/'
    return parturl + partlink
"""根据url获取其上的相关htm、html链接,返回list"""
def ggethtmllink(url):
#参数检查,现忽略
rtnlist=[]
lines=ggethtmllines(url)
regx = r"""href="?(\s+)\.htm"""
for link in ggetreglist(lines,regx):
link = ggetabsllink(url,link) + '.htm'
if link not in rtnlist:
rtnlist.append(link)
print link
return rtnlist
"""根据url,抓取其上的jpg和其链接htm上的jpg"""
def gdownloadalljpg(url,savepath):
#参数检查,现忽略
gdownloadhtmljpg(url,savepath)
#抓取link上的jpg
links=ggethtmllink(url)
for link in links:
gdownloadhtmljpg(link,savepath)
"""test"""
def main():
u='http://site.douban.com/196738/room/2462453/'#想要抓取图片的地址
save='/root/python/tmp/' #图片所要存放的目录
print 'download pic from [' + u +']'
print 'save to [' +save+'] ...'
gdownloadhtmljpg(u,save)
print "download finished"
if __name__ == "__main__":
main()
else:
print "called from intern."
That concludes this walkthrough of grabbing the images on a web page with Python and saving them locally; we hope you find it useful.