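# Environment: Python 2.7, macOS.
# Scrapes the "latest movies" listing page of dytt8.net. Page URL: http://www.dytt8.net/html/gndy/dyzz/index.html
# First step: collect the links to the movie detail pages found on that listing page.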
import urllib2
import os
import re
import string
# Detail-page URLs collected by querymovielist(); shared module-level list.
movieurls = []


# Fetch the movie listing page and collect detail-page URLs.
def querymovielist():
    """Download the listing page and append detail-page URLs to ``movieurls``.

    The page is fetched with ``urllib2``, decoded from GB2312 (undecodable
    bytes are dropped) and re-encoded as UTF-8. A first regular expression
    narrows the page to the fragment(s) of interest; a second one is run
    over the joined fragments, and the first capture group of every match
    is treated as a site-relative path and prefixed with the site root.

    Returns:
        None. Results are appended to the module-level ``movieurls`` list.

    Raises:
        urllib2.URLError: propagated unchanged if the page cannot be fetched.
    """
    url = 'http://www.dytt8.net/html/gndy/dyzz/index.html'
    response = urllib2.urlopen(url)
    try:
        content = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()
    content = content.decode('gb2312', 'ignore').encode('utf-8', 'ignore')

    # NOTE(review): the HTML tag text these two patterns originally matched
    # appears to have been stripped from this copy of the source; only the
    # wildcard/capture fragments survive and are reproduced here as found.
    # Restore the tag literals from the live page markup before relying on
    # the output. TODO confirm against the original script.
    pattern = re.compile('.*?>' +
                         '(.*?) ', re.S)
    items = re.findall(pattern, content)
    listing = ''.join(items)

    pattern = re.compile('(.*?).*?(.*?)', re.S)
    news = re.findall(pattern, listing)
    for j in news:
        # Skip empty captures so the bare site root is never queued as a
        # "movie" URL.
        if j[0]:
            movieurls.append('http://www.dytt8.net' + j[0])
# Scrape the movie data from each detail page.
def querymovieinfo(movieurls):
for index, item in enumerate(movieurls):
print(‘电影url: ‘ + item)
conent = urllib2.urlopen(item)
conent = conent.read()
conent = conent.decode(‘gb2312′,’ignore’).encode(‘utf-8′,’ignore’)
moviename = re.findall(r’
(.*?)
‘, conent, re.s)
if (len(moviename) > 0):
moviename = moviename[0] + “”
# 截取名称
moviename = moviename[moviename.find(“《”) + 3:moviename.find(“》”)]
else:
moviename = “”
print(“电影名称: ” + moviename.strip())
moviecontent = re.findall(r’
(.*?)’,conent , re.s)
pattern = re.compile(‘(.*?)’, re.s)
moviedate = re.findall(pattern,moviecontent[0])
if (len(moviedate) > 0):
moviedate = moviedate[0].strip() + ”
else:
moviedate = “”
print(“电影发布时间: ” + moviedate[-10:])
pattern = re.compile(‘(.*?)