代码如下:
# -*- coding: utf-8 -*-
'''
created on 2013-12-5
@author: good-temper
'''
import time
import urllib2

import bs4
def getpage(urlstr):
    '''Fetch a page and return its body.

    Args:
        urlstr: URL handed to ``urllib2.urlopen``.

    Returns:
        Whatever ``read()`` on the opened response returns.
    '''
    response = urllib2.urlopen(urlstr)
    try:
        return response.read()
    finally:
        # urllib2 responses are not context managers on Python 2, so close
        # explicitly; this releases the connection even if read() raises.
        response.close()
def getnextpageurl(currpagenum):
    '''Return the URL of the listing page after ``currpagenum``, or ''.

    The candidate page is downloaded and searched for a
    ``<span class="next-disabled">`` element; the URL is returned only when
    that page contains no such element.

    Args:
        currpagenum: number of the page already processed (0 before the
            first page has been requested).

    Returns:
        The URL of page ``currpagenum + 1``, or the empty string when that
        page contains a ``next-disabled`` span.
    '''
    # URL pattern:
    # http://list.jd.com/9987-653-655-0-0-0-0-0-0-0-1-1-<page number>-1-1-72-4137-33.html
    url = (u'http://list.jd.com/9987-653-655-0-0-0-0-0-0-0-1-1-'
           + str(currpagenum + 1)
           + '-1-1-72-4137-33.html')

    # Is there a next page?
    # NOTE(review): the marker is looked up on the very page whose URL is
    # about to be returned, so a page that carries the marker is never
    # returned. If the marker only appears on the last listing page, that
    # page is never scraped -- confirm against the site before changing.
    soup = bs4.BeautifulSoup(getpage(url))
    disabled_markers = soup.find_all('span', {'class': 'next-disabled'})
    if not disabled_markers:
        return url
    return ''


def analyzelist():
    '''Walk the listing pages and collect the product links found on them.

    Pages are requested through ``getnextpageurl`` until it returns the
    empty string. The running page count is printed after each page.

    Returns:
        A list holding the ``href`` of the first ``<a>`` inside every
        ``<p class="p-name">`` element, in the order encountered. Elements
        without an ``<a>`` or without an ``href`` are skipped.
    '''
    pagenum = 0
    links = []
    url = getnextpageurl(pagenum)
    while url != '':
        soup = bs4.BeautifulSoup(getpage(url))
        for elem in soup.find_all('p', {'class': 'p-name'}):
            # elem is already a parsed tag: search it directly instead of
            # serialising it and parsing the markup a second time.
            anchor = elem.find('a')
            href = anchor.get('href') if anchor is not None else None
            if href:
                links.append(href)
        pagenum = pagenum + 1
        print(pagenum)  # progress: number of pages processed so far
        url = getnextpageurl(pagenum)
    return links
def analyzecontent(url):
    '''Placeholder for analysing a single page.

    Args:
        url: currently ignored.

    Returns:
        Always the empty string.
    '''
    return ''
def writetofile(list, path):
    '''Append every element of ``list`` to the file at ``path``, one per line.

    Args:
        list: iterable of strings to write. The parameter name is kept for
            backward compatibility even though it shadows the builtin inside
            this function.
        path: file to open in append mode.
    '''
    # ``with`` closes the handle even when a write raises part-way through.
    with open(path, 'a') as f:
        f.writelines(elem + '\n' for elem in list)
if __name__ == '__main__':
    # Script entry point: collect the links from every listing page, report
    # how many were found, then append them to the output file.
    # The result is not called ``list`` so the builtin stays usable at
    # module level.
    links = analyzelist()
    print('共抓取' + str(len(links)) + '条\n')
    writetofile(links, u'e:\\jd_phone_list.dat')