下载糗事百科的内容

代码如下:

# coding: utf-8
"""Crawl qiushibaike.com articles and store them in a SQLite database.

Each article page is downloaded, parsed as XML, and its text plus the ids of
the linked previous/next articles are written to the ``qiushibaike`` table.
Newly discovered ids are inserted as "not downloaded" (success=1) so that the
loop in ``main`` keeps following links until no pending ids remain.
"""
import sqlite3
import threading
import time
import urllib.request
import xml.dom.minidom


class logger(object):
    """Minimal console logger shared by every component of the script."""

    def log(self, *msg):
        """Print each positional argument on its own line."""
        for item in msg:
            print(item)


log = logger()
log.log('测试下')


class downloader(object):
    """Fetches the raw bytes behind a single URL."""

    def __init__(self, url):
        self.url = url

    def download(self):
        """Return the response body as bytes, or None if the fetch failed."""
        log.log('开始下载', self.url)
        try:
            # The context manager closes the connection even if read() fails.
            with urllib.request.urlopen(self.url) as response:
                content = response.read()
        except Exception as exc:
            # Best-effort boundary: callers treat None as "retry later", so
            # the failure is logged (with its cause) instead of propagated.
            log.log('下载出错', exc)
            return None
        log.log('下载完毕')
        return content


class parser(object):
    """Extracts the article text and neighbouring article ids from a page."""

    def __init__(self, content):
        # Root DOM node. parseString raises on markup that is not
        # well-formed XML; that error is deliberately not handled here.
        self.html = xml.dom.minidom.parseString(content)

    def parse(self):
        """Return ``{'content': <text>, 'url': [<qid>, ...]}`` for the page.

        'content' is the data of the first child of the last
        ``<p class="content">`` element ('' when none is found); 'url' holds
        the ids cut out of the href of every ``<a>`` that directly wraps a
        ``<span>``.
        """
        log.log('开始提取数据')
        contents = {'content': '', 'url': []}

        # Article text lives in <p class="content">.
        for p in self.html.getElementsByTagName('p'):
            if p.hasAttribute('class') and p.getAttribute('class') == 'content':
                if not p.childNodes:
                    # Empty paragraph: nothing to read, keep the default.
                    continue
                contents['content'] = p.childNodes[0].data

        # NOTE(review): the pasted source lost its indentation; this loop is
        # assumed to sit at method level because it searches the whole
        # document rather than the current <p>.
        # Previous/next links are <a href="/articles/<qid>.htm"><span>...
        for span in self.html.getElementsByTagName('span'):
            pspan = span.parentNode
            if pspan.tagName == 'a':
                url = pspan.getAttribute('href')
                # Strip the 10-char '/articles/' prefix and '.htm' suffix.
                contents['url'].append(url[10:-4])

        log.log('提取数据完毕')
        return contents


def downloadpage(qid, db):
    """Download article ``qid``, store its text and queue its neighbours.

    ``db`` must provide ``updatecontent``, ``addqid`` and ``updatestatus``.
    The article is marked complete (status 2) only when exactly two
    neighbour links were found.
    """
    url = 'http://www.qiushibaike.com/articles/' + str(qid) + '.htm'
    content = downloader(url).download()
    if not content:
        return

    # NOTE(review): nesting reconstructed from the flattened source; the
    # three steps below are assumed to be siblings under "if content".
    contents = parser(content).parse()
    if contents['content']:
        db.updatecontent(qid, contents['content'])
    for neighbour in contents['url']:
        db.addqid(neighbour)
    if len(contents['url']) == 2:
        db.updatestatus(qid, 2)


# Download pool: bounds how many links are downloaded at the same time.
class downloaderpool(object):
    """Runs ``downloadpage`` for queued ids on up to ``maxlength`` threads."""

    def __init__(self, maxlength=15):
        self.downloaders = [None] * maxlength  # one slot per worker thread
        self.downloadlist = []
        self.db = None

    def setdownloadlist(self, downloadlist):
        """Merge ``downloadlist`` into the pending ids, dropping duplicates."""
        self.downloadlist = list(set(self.downloadlist + downloadlist))

    def setdb(self, db):
        self.db = db

    def daemon(self):
        """Run one scheduling pass: free dead slots, fill free slots, wait.

        Workers for all free slots are started first and joined afterwards,
        so a batch downloads concurrently. Joining each thread right after
        starting it would serialise the downloads.
        """
        log.log('设置守护进程')
        # Release slots whose thread has finished.
        for index, worker in enumerate(self.downloaders):
            if worker and not worker.is_alive():
                log.log('将下载器置空', index)
                self.downloaders[index] = None

        # Fill every free slot with a new worker thread.
        started = []
        for index, worker in enumerate(self.downloaders):
            if worker:
                continue
            qid = self.getqid()
            if qid:
                t = threading.Thread(target=downloadpage,
                                     args=(qid, self.db))
                self.downloaders[index] = t
                t.start()
                started.append(t)
                log.log('设置下载器', index)

        # Wait for the whole batch so the caller sees up-to-date DB state.
        for t in started:
            t.join()

        # Pause one second between passes.
        time.sleep(1)

    def getqid(self):
        """Pop and return the first pending id, or None when none is left."""
        if not self.downloadlist:
            return None
        return self.downloadlist.pop(0)

    def begindownload(self):
        """Run one ``daemon`` pass on a daemon thread and wait for it."""
        daemon = threading.Thread(target=self.daemon, daemon=True)
        daemon.start()
        daemon.join()

    def getdownloader(self):
        """Return the index of the first free slot, or None if all are busy."""
        for index, worker in enumerate(self.downloaders):
            if not worker:
                return index
        return None


add_q_id = 'insert into qiushibaike(id,success) values(?,?)'
# NOTE(review): the "where ..." tails of the next two statements were cut off
# in the pasted source; "id=?" is reconstructed from the (value, qid)
# parameter tuples passed by updatecontent/updatestatus.
update_q_content = 'update qiushibaike set content=? where id=?'
update_q_status = 'update qiushibaike set success=? where id=?'
q_list = 'select id from qiushibaike where success=?'
# NOTE(review): also truncated in the source and unused by the visible code;
# "id=?" is a guess based on the constant's name -- confirm before using.
q_list_by_id = 'select count(*) from qiushibaike where id=?'


class dbconnect(object):
    """SQLite access layer for the ``qiushibaike`` table.

    Expected schema::

        create table qiushibaike(
            id integer,
            content varchar,
            success integer
        )

    id      -- id of the article
    content -- text of the article
    success -- download state; an article counts as done once its content
               has been downloaded and its ids obtained:
               1 = not finished, 2 = finished
    """

    def __init__(self, dbpath='db.sqlite'):
        self.dbpath = dbpath

    def _execute(self, sql, params, fetch=False):
        """Run one statement on a fresh connection; always close it.

        Returns the fetched rows when ``fetch`` is true; otherwise commits
        and returns None.
        """
        cn = sqlite3.connect(self.dbpath)
        try:
            c = cn.cursor()
            try:
                c.execute(sql, params)
                if fetch:
                    return c.fetchall()
                cn.commit()
                return None
            finally:
                c.close()
        finally:
            cn.close()

    def addqid(self, qid):
        """Insert ``qid`` as not-yet-downloaded; log instead of raising.

        Any sqlite3 error (including failure to open the database) is
        logged, and the success message is printed only when the insert
        actually succeeded.
        """
        log.log('插入糗事百科', qid)
        try:
            self._execute(add_q_id, (qid, 1))
        except sqlite3.Error:
            log.log('添加id出错', qid)
        else:
            log.log('插入成功')

    def updatecontent(self, qid, content):
        """Store ``content`` as the text of article ``qid``."""
        log.log('更新糗事百科', qid, content)
        self._execute(update_q_content, (content, qid))
        log.log('更新成功')

    def updatestatus(self, qid, flag):
        """Set the ``success`` column of article ``qid`` to ``flag``."""
        log.log('更新状态', qid, flag)
        self._execute(update_q_status, (flag, qid))
        log.log('更新状态成功')

    def getlist(self, undonloaded=1):
        """Return the ids whose ``success`` column equals ``undonloaded``."""
        log.log('获得列表')
        rows = self._execute(q_list, (undonloaded,), fetch=True)
        ids = [row[0] for row in rows]
        log.log('获得列表成功')
        return ids


class singledownloader(object):
    """Sequential drop-in for ``downloaderpool`` (same three methods)."""

    def __init__(self):
        self.downloadlist = []
        self.db = None

    def setdb(self, db):
        self.db = db

    def setdownloadlist(self, downloadlist):
        """Merge ``downloadlist`` into the pending ids, dropping duplicates."""
        self.downloadlist = list(set(self.downloadlist + downloadlist))

    def begindownload(self):
        """Download every pending id, one after another.

        The pending list is drained first; otherwise ids that are already
        finished would be merged back in and downloaded again on every
        round of the loop in ``main``.
        """
        pending, self.downloadlist = self.downloadlist, []
        for qid in pending:
            downloadpage(qid, self.db)


def main():
    """Keep downloading until the database lists no unfinished article."""
    db = dbconnect('db.sqlite')
    # Swap in downloaderpool() here for threaded downloading.
    dp = singledownloader()
    dp.setdb(db)

    undownloadedlist = db.getlist()
    # Continue as long as some articles are still not downloaded.
    while undownloadedlist:
        # Feed the pending ids to the downloader and process them.
        dp.setdownloadlist(undownloadedlist)
        dp.begindownload()
        time.sleep(1)
        # Refresh the pending list from the database.
        undownloadedlist = db.getlist()


if __name__ == '__main__':
    main()

代码是没问题的,可以正常运行,但是希望做到以下2方面:1、多线程下载;2、代码分离度更高,更面向对象。

Posted in 未分类

发表评论