Source code for a web crawler written in Python

Writing a crawler is complex, tedious, repetitive work: you have to think about collection efficiency, handling broken links and other network errors, and data quality (which depends heavily on how consistently the target site is coded). Below is a crawler I wrote and cleaned up; a single server can run 1 to 8 instances collecting in parallel, and the collected data is then written to the database.

#!/usr/local/bin/python
# -*- coding: utf-8 -*-
import sys, time, os, string
import mechanize
import urlparse
from BeautifulSoup import BeautifulSoup
import re
import MySQLdb
import logging
import cgi
from optparse import OptionParser
#----------------------------------------------------------------------------#
# name:      tyspider.py                                                      #
# purpose:   website spider module                                            #
# author:    刘天斯                                                           #
# email:     liutiansi@gamil.com                                              #
# created:   2010/02/16                                                       #
# copyright: (c) 2010                                                         #
#----------------------------------------------------------------------------#
"""
|--------------------------------------------------------------------------
| Logging class definition
|--------------------------------------------------------------------------
|
| Purpose: record the system's log messages.
|
"""
class pubclilog():
    def __init__(self):
        self.logfile = 'website_log.txt'

    def inilog(self):
        logger = logging.getLogger()
        filehandler = logging.FileHandler(self.logfile)
        streamhandler = logging.StreamHandler()
        fmt = logging.Formatter('%(asctime)s, %(funcName)s, %(message)s')
        # Attach the formatter to both handlers
        filehandler.setFormatter(fmt)
        streamhandler.setFormatter(fmt)
        logger.setLevel(logging.DEBUG)
        logger.addHandler(filehandler)
        logger.addHandler(streamhandler)
        return [logger, filehandler]
"""
|--------------------------------------------------------------------------
| tyspider class definition
|--------------------------------------------------------------------------
|
| Purpose: crawl categories, titles and post content.
|
"""
class basetyspider:
    # Initialize instance members
    def __init__(self, x, log_switch):
        # Database connection
        self.conn = MySQLdb.connect(db='dbname', host='192.168.0.10',
                                    user='dbuser', passwd='sdflkj934y5jsdgfjh435',
                                    charset='utf8')
        # Category and title listing page (community)
        self.class_url = 'http://test.abc.com/aa/commtopicspage?'
        # Post/reply page
        self.content_url = 'http://test.bac.com/aa/commmsgspage?'
        # Starting comm value
        self.x = x
        # Current comm id modulo 5, used to spread rows evenly across the tables
        self.mod = self.x % 5
        # Downloaded body of the community page
        self.body = ""
        # BeautifulSoup object for self.body
        self.soup = None
        # Downloaded body of the post/reply page
        self.contentbody = ""
        # BeautifulSoup object for self.contentbody
        self.contentsoup = None
        # Logging switch
        self.log_switch = log_switch
    #====================== Fetch name and category ==========================
    def _spiderclass(self, nextpage=None):
        if nextpage == None:
            fixed_query = 'cmm=' + str(self.x)
        else:
            fixed_query = nextpage[1:]
        try:
            rd = mechanize.Browser()
            rd.addheaders = [("User-agent", "tianya/2010 (compatible; MSIE 6.0; Windows NT 5.1)")]
            rd.open(self.class_url + fixed_query)
            self.body = rd.response().read()
            #rd = mechanize.Request(self.class_url + fixed_query)
            #response = mechanize.urlopen(rd)
            #self.body = response.read()
        except Exception, e:
            if self.log_switch == "on":
                logapp = pubclilog()
                logger, hdlr = logapp.inilog()
                logger.info(self.class_url + fixed_query + str(e))
                hdlr.flush()
                logger.removeHandler(hdlr)
            return
        self.soup = BeautifulSoup(self.body)
        nextpageobj = self.soup("a", {'class': re.compile("fs-paging-item fs-paging-next")})
        self.cursor = self.conn.cursor()
        if nextpage == None:
            try:
                ttag = str(self.soup.table)
                #print ttag
                """
                ------------------ structure being parsed ------------------
                dunhill
                中国 » 人民
                """
                souptable = BeautifulSoup(ttag)
                # Locate the first h1 tag
                tableh1 = souptable("h1")
                #print self.x
                #print "name:" + tableh1[0].string.strip().encode('utf-8')
                # Handle entries without a category
                try:
                    # Locate the <a> links in the table whose href matches "^topbycategory";
                    # tablea[0] is the text of the first matching link, tablea[1] the second, ...
                    tablea = souptable("a", {'href': re.compile("^topbycategory")})
                    if tablea[0].string.strip() == "":
                        pass
                    #print "bigclass:" + tablea[0].string.strip().encode('utf-8')
                    #print "subclass:" + tablea[1].string.strip().encode('utf-8')
                except Exception, e:
                    if self.log_switch == "on":
                        logapp = pubclilog()
                        logger, hdlr = logapp.inilog()
                        logger.info("[noclassinfo]" + str(self.x) + str(e))
                        hdlr.flush()
                        logger.removeHandler(hdlr)
                    self.cursor.execute("insert into baname" + str(self.mod) + " values('%d','%d','%s')" % (self.x, -1, tableh1[0].string.strip().encode('utf-8')))
                    self.conn.commit()
                    self._spidertitle()
                    if nextpageobj:
                        nextpageurl = nextpageobj[0]['href']
                        self._spiderclass(nextpageurl)
                        return
                    else:
                        return
                # href value of the second matching link
                classlink = tablea[1]['href']
                par_dict = cgi.parse_qs(urlparse.urlparse(classlink).query)
                #print "cid:" + par_dict["cid"][0]
                #print "subcid:" + par_dict["subcid"][0]
                #print "---------------------------------------"
                # Insert into the database
                self.cursor.execute("insert into class values('%d','%s')" % (int(par_dict["cid"][0]), tablea[0].string.strip().encode('utf-8')))
                self.cursor.execute("insert into subclass values('%d','%d','%s')" % (int(par_dict["subcid"][0]), int(par_dict["cid"][0]), tablea[1].string.strip().encode('utf-8')))
                self.cursor.execute("insert into baname" + str(self.mod) + " values('%d','%d','%s')" % (self.x, int(par_dict["subcid"][0]), tableh1[0].string.strip().encode('utf-8')))
                self.conn.commit()
                self._spidertitle()
                if nextpageobj:
                    nextpageurl = nextpageobj[0]['href']
                    self._spiderclass(nextpageurl)
                self.body = None
                self.soup = None
                ttag = None
                souptable = None
                table = None
                table1 = None
                classlink = None
                par_dict = None
            except Exception, e:
                if self.log_switch == "on":
                    logapp = pubclilog()
                    logger, hdlr = logapp.inilog()
                    logger.info("[classinfo]" + str(self.x) + str(e))
                    hdlr.flush()
                    logger.removeHandler(hdlr)
        else:
            self._spidertitle()
            if nextpageobj:
                nextpageurl = nextpageobj[0]['href']
                self._spiderclass(nextpageurl)
    #==================== Fetch titles ==========================
    def _spidertitle(self):
        # Locate the title table (table)
        souptitletable = self.soup("table", {'class': "fs-topic-list"})
        # Locate the title rows (tr)
        titletr = souptitletable[0]("tr", {'onmouseover': re.compile("^this\.className='fs-row-hover'")})
        """
        ----------- structure being parsed --------------
        【新人报到】欢迎美国人民加入
        0
        /
        12
        中国人
        2-14
        """
        for currtr in titletr:
            try:
                # Initialize the sticky and starred flags
                title_starred = 'n'
                title_sticky = 'n'
                # BeautifulSoup object for the current row
                soupcurrtr = BeautifulSoup(str(currtr))
                # BeautifulSoup mis-parses this HTML, so the post status can only be inferred
                # from the number of span tags, which introduces some error:
                # e.g. a starred-only post is also treated as sticky.
                titlestatus = soupcurrtr("span", {'title': ""})
                titlephotoviewer = soupcurrtr("a", {'href': re.compile("^photoviewer")})
                if len(titlephotoviewer) == 1:
                    titlephotoviewerbool = 0
                else:
                    titlephotoviewerbool = 1
                if len(titlestatus) == 3 - titlephotoviewerbool:
                    title_starred = 'y'
                    title_sticky = 'y'
                elif len(titlestatus) == 2 - titlephotoviewerbool:
                    title_sticky = 'y'
                # Post title
                title = soupcurrtr.a.next.strip()
                # Post id
                par_dict = cgi.parse_qs(urlparse.urlparse(soupcurrtr.a['href']).query)
                # Reply count and view count
                titlenum = soupcurrtr("td", {'class': "fs-topic-name"})
                titlearray = string.split(str(titlenum[0]), '\n')
                title_replynum = string.split(titlearray[len(titlearray) - 4], '>')[2]
                title_viewnum = string.split(titlearray[len(titlearray) - 2], '>')[2][:-6]
                # Post author
                titleauthorobj = soupcurrtr("td", {'style': "padding-left:4px"})
                title_author = titleauthorobj[0].next.next.next.string.strip().encode('utf-8')
                # Last reply time
                titletime = soupcurrtr("td", {'class': re.compile("^fs-topic-last-mdfy fs-meta")})
                """
                print "x:" + str(self.x)
                print "title_starred:" + title_starred
                print "title_sticky:" + title_sticky
                print "title:" + title
                # URL of the post content page
                print "title_link:" + soupcurrtr.a['href']
                print "cid:" + par_dict["tid"][0]
                print "title_replynum:" + title_replynum
                print "title_viewnum:" + title_viewnum
                print "title_author:" + title_author
                print "titletime:" + titletime[0].string.strip().encode('utf-8')
                """
                # Insert into the database
                self.cursor.execute("insert into title" + str(self.mod) + " values('%s','%d','%s','%d','%d','%s','%s','%s','%s')" % (par_dict["tid"][0], \
                    self.x, title, int(title_replynum), int(title_viewnum), title_starred, title_sticky, \
                    title_author.decode('utf-8'), titletime[0].string.strip().encode('utf-8')))
                self.conn.commit()
                self._spidercontent(par_dict["tid"][0])
            except Exception, e:
                if self.log_switch == "on":
                    logapp = pubclilog()
                    logger, hdlr = logapp.inilog()
                    logger.info("[title]" + str(self.x) + '-' + par_dict["tid"][0] + '-' + str(e))
                    hdlr.flush()
                    logger.removeHandler(hdlr)
    #====================== Fetch posts and replies =======================
    def _spidercontent(self, id, nextpage=None):
        if nextpage == None:
            fixed_query = 'cmm=' + str(self.x) + '&t&ref=regulartopics'
        else:
            fixed_query = nextpage[9:]
        rd = mechanize.Browser()
        rd.addheaders = [("User-agent", "tianya/2010 (compatible; MSIE 6.0; Windows NT 5.1)")]
        rd.open(self.content_url + fixed_query)
        self.contentbody = rd.response().read()
        #rd = mechanize.Request(self.content_url + fixed_query)
        #response = mechanize.urlopen(rd)
        #self.contentbody = response.read()
        self.contentsoup = BeautifulSoup(self.contentbody)
        nextpageobj = self.contentsoup("a", {'class': re.compile("fs-paging-item fs-paging-next")})
        try:
            tp = self.contentsoup("p", {'class': "fs-user-action"})
            i = 0
            for currp in tp:
                # The first block is the original post, the rest are replies
                if i == 0:
                    ctype = 'y'
                else:
                    ctype = 'n'
                # Post time
                soupcurrp = BeautifulSoup(str(currp))
                posttimeobj = soupcurrp("span", {'class': "fs-meta"})
                posttime = posttimeobj[0].next[1:]
                posttime = posttime[0:-3]
                # IP address
                ipobj = soupcurrp("a", {'href': re.compile("commmsgaddress")})
                if ipobj:
                    ip = ipobj[0].next.strip()
                else:
                    ip = ""
                # Post/reply content
                contentobj = soupcurrp("p", {'class': "fs-user-action-body"})
                content = contentobj[0].renderContents().strip()
                """
                print "id:" + str(self.x)
                print "id:" + id
                print "ctype:" + ctype
                print "posttime:" + posttime
                print "ip:" + ip
                print "content:" + content
                """
                self.cursor.execute("insert into content" + str(self.mod) + " values('%s','%d','%s','%s','%s','%s')" % (id, self.x, ctype, posttime, ip, content.decode('utf-8')))
                self.conn.commit()
                i += 1
        except Exception, e:
            if self.log_switch == "on":
                logapp = pubclilog()
                logger, hdlr = logapp.inilog()
                logger.info("[content]" + str(self.x) + '-' + id + '-' + str(e))
                hdlr.flush()
                logger.removeHandler(hdlr)
        # If there is a "next page" link, keep traversing
        if nextpageobj:
            nextpageurl = nextpageobj[0]['href']
            self._spidercontent(id, nextpageurl)

    def __del__(self):
        try:
            self.cursor.close()
            self.conn.close()
        except Exception, e:
            pass
# Traverse the comm range
def initapp(startvalue, endvalue, log_switch):
    for x in range(startvalue, endvalue):
        app = basetyspider(x, log_switch)
        app._spiderclass()
        app = None

if __name__ == "__main__":
    # Define the command-line options
    msg_usage = "tyspider.py [ -s startnumber endnumber ] -l [on|off] [-v][-h]"
    parser = OptionParser(msg_usage)
    parser.add_option("-s", "--set", nargs=2, action="store",
                      dest="comm_value",
                      type="int",
                      default=False,
                      help="Range of community id values to crawl.".decode('utf-8'))
    parser.add_option("-l", "--log", action="store",
                      dest="log_switch",
                      type="string",
                      default="on",
                      help="Error log switch.".decode('utf-8'))
    parser.add_option("-v", "--version", action="store_true", dest="verbose",
                      help="Show version information.".decode('utf-8'))
    opts, args = parser.parse_args()
    if opts.comm_value:
        if opts.comm_value[0] > opts.comm_value[1]:
            print "The end value is smaller than the start value?"
            exit()
        if opts.log_switch == "on":
            log_switch = "on"
        else:
            log_switch = "off"
        initapp(opts.comm_value[0], opts.comm_value[1], log_switch)
        exit()
    if opts.verbose:
        print "website spider v1.0 beta."
        exit()
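
Based on the optparse definitions above, the script is driven entirely from the command line. A minimal usage sketch is shown below; the id ranges are only illustrative, and running several processes over disjoint ranges is one way to get the multiple parallel instances mentioned in the introduction (note that range(start, end) excludes the end value):

# crawl community ids 1..9999 with error logging enabled
python tyspider.py -s 1 10000 -l on

# run two instances in parallel over disjoint ranges on the same server
python tyspider.py -s 1 5000 -l on &
python tyspider.py -s 5000 10000 -l on &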
