使用 Python 2.7 编写的脚本:读取 HTML 文件中的指定元素,并生成 Excel 文件
代码如下:
# coding=gbk
import codecs
import logging
import os
import string
import time

import xlrd
import xlwt
from bs4 import BeautifulSoup
from xlrd import open_workbook
class logmsg(object):
    """Small wrapper that logs to a file through the root ``logging`` logger.

    The constructor attaches a ``FileHandler`` for *logfile* to the root
    logger (``logging.getLogger()`` with no name) and sets the logger level.
    Any failure in a method prints a short message and exits with status 1.

    Args:
        logfile: Path of the log file to append to.
        level: Numeric logging level (10, 20, 30, 40 or 50). Any other value
            selects ``logging.NOTSET``.
    """

    # Numeric levels that map one-to-one onto a Logger method
    # (debug / info / warning / error / critical).
    _LEVELS = (
        logging.DEBUG,
        logging.INFO,
        logging.WARNING,
        logging.ERROR,
        logging.CRITICAL,
    )

    def __init__(self, logfile, level=0):
        try:
            self.logger = logging.getLogger()
            self.hdlr = logging.FileHandler(logfile)
            formatter = logging.Formatter(
                "[%(asctime)s]: %(message)s", "%Y%m%d %H:%M:%S"
            )
            self.hdlr.setFormatter(formatter)
            self.logger.addHandler(self.hdlr)
            if level in self._LEVELS:
                self.logger.setLevel(level)
            else:
                self.logger.setLevel(logging.NOTSET)
        except Exception:
            print("log init error!")
            raise SystemExit(1)

    def output(self, loginfo):
        """Write *loginfo* at the logger's current effective level.

        When the effective level is not one of the five standard levels
        (for example ``NOTSET``), the message is written with ``info``.
        """
        level = self.logger.getEffectiveLevel()
        try:
            if level in self._LEVELS:
                # Logging at exactly the effective level always passes the
                # logger's own level check, like the per-level methods do.
                self.logger.log(level, loginfo)
            else:
                self.logger.info(loginfo)
        except Exception:
            print("log output error!")
            raise SystemExit(1)

    def close(self):
        """Detach the file handler from the logger and close its file."""
        try:
            self.logger.removeHandler(self.hdlr)
            # Removing the handler alone leaves the log file open.
            self.hdlr.close()
        except Exception:
            print("log closed error!")
            raise SystemExit(1)
# Run-wide configuration: timestamps, the shared logger and the data paths.
# The clock is read once so the log-file date and the xls timestamp always
# describe the same instant.
_now = time.localtime()
logtime = time.strftime("%Y%m%d%H%M%S", _now)
logfiletime = time.strftime("%Y%m%d", _now)

# One log file per day; the logger is created at INFO level (20).
logfile = '/data/pyexample/logs/htmlparser_%s.log' % logfiletime
log = logmsg(logfile, 20)

# Directory scanned for .html input; the generated .xls is written here too.
datapath = '/data/pyexample/'
xlsname = 'dangjian_' + logtime + '.xls'
def _read_td_cells(path):
    """Return the GBK-encoded text of every ``<td>`` in the HTML file at *path*.

    The file is decoded as GBK and parsed one line at a time; cells are
    returned in the order they are found.
    """
    cells = []
    with codecs.open(path, 'r', 'gbk') as htmlfile:
        lines = htmlfile.readlines()
    if not lines:
        log.output('not line')
    for line in lines:
        # strip() removes the newline too, so comparing its result with '\n'
        # could never detect a blank line; test for emptiness instead.
        if not line.strip():
            log.output('该处是空行')
            continue
        # NOTE(review): the published listing shows a plain space being
        # removed here; it may originally have been an HTML entity such as a
        # non-breaking space. Confirm against real input.
        soup = BeautifulSoup(line.replace(' ', ''))
        for td in soup.findAll('td'):
            cells.append(td.text.encode("gbk"))
    return cells


def _minutes_to_seconds(text):
    """Return the digits in *text*, read as minutes, as a string of seconds.

    Raises:
        ValueError: If *text* contains no digits.
    """
    digits = ''.join(ch for ch in text if ch.isdigit())
    return "%i" % (int(digits) * 60)


if __name__ == '__main__':
    wbk = xlwt.Workbook(encoding='gbk')
    sheet = wbk.add_sheet('基本内容导入模板')

    # Header row of the import template.
    headers = (
        '内容类型 ', '栏目名称', '栏目编号', '内容名称', '时长', '关键字',
        '看点', '作者', '来源', '子内容1', '子内容2',
    )
    for col, title in enumerate(headers):
        sheet.write(0, col, title)

    k = 0  # number of data rows written so far
    for f in os.listdir(datapath):
        if os.path.splitext(f)[1] != '.html':
            continue
        log.output('当前文件:' + f)
        content = _read_td_cells(os.path.join(datapath, f))

        # Dump every cell with its position; enumerate gives the true index
        # even when two cells hold the same text.
        for idx, cell in enumerate(content):
            print('%s , %s' % (idx, cell))
            log.output(cell)
            log.output(idx)
        print('—————————————-')

        try:
            # Fixed cell positions of the fields in the source pages.
            foldername = content[6]
            contentname = content[4]
            str_duration = _minutes_to_seconds(content[16])
            keyword = content[6]
            desciption = content[36]
            videoname_1 = content[10]
        except (IndexError, ValueError) as err:
            # A page with too few cells or no duration digits is skipped so
            # one malformed file does not abort the whole export.
            log.output('skip %s: %s' % (f, err))
            continue

        print(foldername)
        print(contentname)
        print(str_duration)
        print(keyword)
        print(desciption)
        print(videoname_1)
        log.output('输出xls数据:' + ',' + foldername + ',,' + contentname + ','
                   + str_duration + ',' + keyword + ',' + desciption
                   + ',管理员,华数编辑,' + videoname_1 + ',,')
        print(k)

        row = (
            '', foldername, '', contentname, str_duration, keyword,
            desciption, '管理员', '华数编辑', videoname_1, '',
        )
        for col, value in enumerate(row):
            sheet.write(k + 1, col, value)
        k += 1

    # Written once after all files are processed.
    wbk.save(os.path.join(datapath, xlsname))
    print('=========================================')