sax将dd.xml解析成html。当然啦，如果得到了xml对应的xsl文件可以直接用libxml2将其转换成html。
代码如下:
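#!/usr/bin/env python
# -*- coding: utf-8 -*-
#—————————————
# Program:  XML parser
# Version:  01.0
# Author:   mupeng
# Date:     2013-12-18
# Language: Python 2.7
# Purpose:  parse XML into the corresponding HTML
# Notes:    the program uses the parse function of the xml.sax module to
#           parse the XML and generate events;
#           it inherits contenthandler and overrides its event-handling
#           functions;
#           dispatcher is mainly used to dispatch the start and end events
#           of the corresponding tags
#—————————————
from xml.sax.handler import contenthandler
from xml.sax import parse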
class dispatcher:
    """Mixin that forwards SAX element events to per-tag handler methods.

    Combine it with a SAX content handler. For an element named ``title``
    the start event is routed to ``startTitle`` and the end event to
    ``endTitle``. If no tag-specific handler exists, ``defaultStart`` /
    ``defaultEnd`` is used instead. All-lowercase spellings
    (``starttitle``, ``defaultstart``) are accepted as well, because that
    is how the handlers in this file are written.
    """

    def dispatch(self, prefix, name, attrs=None):
        """Call the handler for ``prefix`` + ``name``, if one is defined.

        Parameters:
            prefix: event kind, ``'start'`` or ``'end'``.
            name: element (tag) name as reported by the parser.
            attrs: element attributes. Accepted so the start callback can
                pass them through, but not forwarded: handlers are invoked
                without arguments.

        Returns:
            None. Nothing happens when neither a tag-specific nor a
            default handler is defined.
        """
        # Tag-specific handlers win over defaults; within each group the
        # capitalized spelling is tried before the all-lowercase one.
        candidates = (
            prefix + name.capitalize(),
            prefix + name.lower(),
            'default' + prefix.capitalize(),
            'default' + prefix.lower(),
        )
        for candidate in candidates:
            handler = getattr(self, candidate, None)
            if callable(handler):
                handler()
                return

    def startElement(self, name, attrs):
        """SAX callback: an element opened; dispatch its start event."""
        self.dispatch('start', name, attrs)

    def endElement(self, name):
        """SAX callback: an element closed; dispatch its end event."""
        self.dispatch('end', name)

    # Lowercase spellings kept so existing callers keep working; the SAX
    # parser itself only ever calls the camelCase names above.
    startelement = startElement
    endelement = endElement
class website(dispatcher, contenthandler):
def __init__(self): self.fout = open(‘ddt_sax.html’, ‘w’) self.imagein = false self.desflag = false self.item = false self.title = ” self.link = ” self.guid = ” self.url = ” self.pubdate = ” self.description = ” self.temp = ” self.prx = ” def startchannel(self): self.fout.write(”’\n\n rss-”’) def endchannel(self): self.fout.write(”’ ãscriptã function gettimediff(str) { if(str == ”) { return ”; }
var pubdate = new date(str); var nowdate = new date(); var diffmilseconds = nowdate.valueof()-pubdate.valueof(); var days = diffmilseconds/86400000; days = parseint(days);
diffmilseconds = diffmilseconds-(days*86400000); var hours = diffmilseconds/3600000; hours = parseint(hours);
diffmilseconds = diffmilseconds-(hours*3600000); var minutes = diffmilseconds/60000; minutes = parseint(minutes);
diffmilseconds = diffmilseconds-(minutes*60000); var seconds = diffmilseconds/1000; seconds = parseint(seconds); var returnstr = “±±¾©Â·¢²¼ê±¼Ã¤£º” + pubdate.tolocalestring();
if(days > 0) { returnstr = returnstr + ” £Â¨¾Ã àëïöôú” + days + “ìì” + hours + “ð¡ê±” + minutes + “·öö󣩔; } else if (hours > 0) { returnstr = returnstr + ” £Â¨¾Ã àëïöôú” + hours + “ð¡ê±” + minutes + “·öö󣩔; } else if (minutes > 0) { returnstr = returnstr + ” £Â¨¾Ã àëïöôú” + minutes + “·öö󣩔; }
return returnstr;
}
function getspantext() { var pubdate; var pubdatearray; var spanarray = document.getelementsbytagname(“span”);
for(var i = 0; i < spanarray.length; i++) { pubdate = spanarray[i].innerhtml; document.getelementsbytagname("span")[i].innerhtml = gettimediff(pubdate); } }
getspantext(); ãscriptã ”’) self.fout.close()
def characters(self, chars): if chars.strip(): #chars = chars.strip() self.temp += chars #print self.temp def starttitle(self): if self.item: self.fout.write(”’ \n\n ”’) def endtitle(self): if not self.imagein and not self.item: self.title = self.temp self.temp = ” self.fout.write(self.title.encode(‘gb2312′)) #self.title = self.temp self.fout.write(”’ \n\n\n\n ãscriptã\n
function copylink() { clipboarddata.setdata(“text”,window.location.href); alert(“rssá´½óòñ¾¸´Ã¶æµ½¼ôìù°å”); }
function subscibelink() { var str = window.location.pathname; while(str.match(/^\//)) { str = str.replace(/^\//,””); } window.open(“http://rss.sina.com.cn/my_sina_web_rss_news.html?url=” + str,”_self”);
} ãscriptã\n \n \n \n ”’) if self.item: self.title = self.temp self.temp = ” self.fout.write(self.title.encode(‘gb2312′)) self.fout.write(”’ ”’)
def startimage(self): self.imagein = true
def endimage(self): self.imagein = false def startlink(self): if self.imagein: self.fout.write(”’ def endlink(self): self.link = self.temp self.temp = ” if self.imagein: self.fout.write(self.link.encode(‘gb2312′)) self.fout.write(”'” target=”_blank”>\n ”’) elif self.item: #self.link = self.temp pass else: self.fout.write(self.link) self.fout.write(”’ ” target=” _blank “> ”’) self.fout.write(self.title.encode(‘gb2312′)) self.fout.write(”’ ”’) self.fout.write(self.description.encode(‘gb2312′)) self.fout.write(”’ ¸´Ã¶æ´ëò³á´½ó îòòªç¶èë¸ãðâîåáð±õ½îòµÃ¤ò³ãæ£Â¨¼Ã²µ¥¡¢¿Ã¬ëù¡¢êµê±¡¢ãâ·ñ£© ”’)
def starturl(self): if self.imagein: self.fout.write(”’\n ”’) if self.item: #self.url = self.temp pass
def defaultstart(self): pass def defaultend(self): self.temp = ” def startdescription(self): pass def enddescription(self): self.description = self.temp self.temp = ” if self.item: #self.fout.write(‘¡¡¡¡’) self.fout.write(self.description.encode(‘gb2312’)) def endguid(self): self.guid = self.temp def endpubdate(self): if not self.temp.startswith(‘http’): self.pubdate = self.temp self.temp = ” else: self.pubdate = ” def startitem(self): self.item = true def enditem(self): self.item = false self.fout.write(”’ self.fout.write(self.link) self.fout.write(”’ ” target=”_blank”> ”’) self.fout.write(self.guid) self.fout.write(”’ ”’) self.fout.write(self.pubdate) self.fout.write(”’ ”’)
# Program entry point: parse ddt.xml, feeding the SAX events to a website
# handler instance. The guard sits on its own line so that it actually
# executes instead of being swallowed by the comment.
if __name__ == '__main__':
    parse('ddt.xml', website())