最基本的抓取网页内容的代码实现:
#!/usr/bin/env python
from urllib import urlretrieve
def firstnonblank(lines):
    """Return the first line in *lines* that is not blank.

    A line is considered blank when ``strip()`` leaves nothing, i.e. it is
    empty or consists only of whitespace.

    Args:
        lines: iterable of strings.

    Returns:
        The first non-blank line, unchanged (any trailing newline is kept),
        or None when *lines* is empty or contains only blank lines.
    """
    for eachline in lines:
        if eachline.strip():
            return eachline
    # Made explicit: the caller has to cope with "no non-blank line at all".
    return None
def firstlast(webpage):
    """Print the first and the last non-blank line of a file.

    Args:
        webpage: path of a local file to read; ``download`` passes the
            filename returned by ``urlretrieve``.

    Nothing is printed for a position where no non-blank line exists
    (empty or all-blank file).
    """
    # Local import keeps this function self-contained; sys.stdout.write
    # behaves the same on Python 2 and 3, unlike the print statement.
    import sys

    # The context manager closes the file even if reading raises.
    with open(webpage) as f:
        lines = f.readlines()

    # reversed() scans from the end without mutating the list in place.
    for line in (firstnonblank(lines), firstnonblank(reversed(lines))):
        if line is not None:
            # Lines keep their own trailing newline, so none is added here.
            sys.stdout.write(line)
def download(url='http://www', process=firstlast):
    """Retrieve *url* to a local file and pass that file to *process*.

    Args:
        url: address to retrieve.
        process: callable invoked with the local filename produced by
            ``urlretrieve``; defaults to ``firstlast``.

    A retrieval that fails with IOError is ignored: *process* is simply
    not called in that case.
    """
    try:
        # urlretrieve returns a tuple whose first item is the local filename.
        retval = urlretrieve(url)[0]
    except IOError:
        retval = None
    if retval:
        process(retval)
if __name__ == '__main__':
    # Script entry point: run download() with its default arguments.
    download()
利用 urllib 模块,实现抓取网页中图片的功能:
import urllib.request
import socket
import re
import sys
import os
# Default directory into which downloaded pictures are saved.
targetdir = r"c:\users\elqstux\desktop\pic"


def destfile(path, directory=None):
    """Return the local file path under which the picture at *path* is saved.

    The destination directory is created if it does not exist yet
    (including missing parent directories).

    Args:
        path: URL (or any '/'-separated path) of the picture; only the part
            after the last '/' is used as the file name.
        directory: destination directory; defaults to the module-level
            ``targetdir``.

    Returns:
        ``directory`` joined with the file name taken from *path*.
    """
    if directory is None:
        directory = targetdir
    # makedirs copes with nested missing directories and with the directory
    # already existing, which a bare isdir()/mkdir() pair does not.
    os.makedirs(directory, exist_ok=True)
    # rsplit keeps the whole string when *path* contains no '/', instead of
    # raising like rindex() would.
    filename = path.rsplit('/', 1)[-1]
    return os.path.join(directory, filename)
if __name__ == "__main__":
    # Download every jpg/png/gif picture referenced by the page at `hostname`.
    hostname = "http://www.douban.com"
    req = urllib.request.Request(hostname)
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(req) as webpage:
        contentbytes = webpage.read()
    # Decode the body rather than calling str() on the bytes object: str()
    # yields the b'...' repr, in which whitespace appears as escape
    # sequences and therefore no longer terminates the [^\s] match.
    content = contentbytes.decode("utf-8", errors="replace")
    # The pattern has two groups, so findall yields (url, extension) tuples;
    # set() drops pictures referenced more than once.
    for link, _extension in set(re.findall(r'(http:[^\s]*?(jpg|png|gif))', content)):
        print(link)
        urllib.request.urlretrieve(link, destfile(link))
import urllib.request
import socket
import re
import sys
import os
# Default directory into which downloaded pictures are saved.
targetdir = r"h:\pic"


def destfile(path, directory=None):
    """Return the local file path under which the picture at *path* is saved.

    The destination directory is created if it does not exist yet
    (including missing parent directories).

    Args:
        path: URL (or any '/'-separated path) of the picture; only the part
            after the last '/' is used as the file name.
        directory: destination directory; defaults to the module-level
            ``targetdir``.

    Returns:
        ``directory`` joined with the file name taken from *path*.
    """
    if directory is None:
        directory = targetdir
    # makedirs copes with nested missing directories and with the directory
    # already existing, which a bare isdir()/mkdir() pair does not.
    os.makedirs(directory, exist_ok=True)
    # The path is split on '/'; rsplit keeps the whole string when there is
    # no '/', instead of raising like rindex() would.
    filename = path.rsplit('/', 1)[-1]
    return os.path.join(directory, filename)
if __name__ == "__main__":
    # List the URL and extension of every jpg/png/gif picture referenced by
    # the page at `hostname`.
    hostname = "http://www.douban.com/"
    req = urllib.request.Request(hostname)
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(req) as webpage:
        contentbytes = webpage.read()
    # Decode the body rather than calling str() on the bytes object: str()
    # yields the b'...' repr, in which whitespace appears as escape
    # sequences and therefore no longer terminates the [^\s] match.
    content = contentbytes.decode("utf-8", errors="replace")
    # The pattern contains two nested pairs of parentheses, hence two groups.
    # findall returns a list in which only the text matched by the groups
    # appears, i.e. (full URL, extension) tuples.
    match = re.findall(r'(http:[^\s]*?(jpg|png|gif))', content)
    for picname, pictype in match:
        print(picname)
        print(pictype)
    # Output:
    # http://img3.douban.com/pics/blank.gif
    # gif
    # http://img3.douban.com/icon/g111328-1.jpg
    # jpg
    # http://img3.douban.com/pics/blank.gif
    # gif
    # http://img3.douban.com/icon/g197523-19.jpg
    # jpg
    # http://img3.douban.com/pics/blank.gif
    # gif
    # …