代码如下:
def get_seed_data(filename, verbose=True):
    """Parse a seed-configuration XML file into a flat list of seed records.

    The code reads the following layout: ``system`` elements (attributes
    ``id`` and ``name``) containing ``section`` elements (attributes ``id``
    and ``name``).  Each section provides a ``crawl_cycle`` element and any
    number of ``seed`` elements (attribute ``id``), and each seed wraps a
    ``userblog_url`` element whose text is the URL.

    Args:
        filename: Path (or open file object) passed to ``minidom.parse``.
        verbose: When true (the default, matching the original behaviour),
            dump every seed to stdout and run the ``pause`` shell command
            after it.  Pass ``False`` to load the data silently.

    Returns:
        A list of dicts, one per ``seed`` element in document order, with
        the keys ``crawl_cycle``, ``system_id``, ``system_name``,
        ``section_id``, ``section_name``, ``seed_id`` and ``userblog_url``.
        The three ``*_id`` values are converted to ``int``; the rest are
        the strings read from the document.

    Raises:
        IndexError: If a section has no (or an empty) ``crawl_cycle``
            element, or a seed has no (or an empty) ``userblog_url``.
        ValueError: If an ``id`` attribute is missing or not an integer.
    """
    dom = minidom.parse(filename)
    root = dom.documentElement

    seed_list = []
    for system_node in root.getElementsByTagName("system"):
        system_id = int(system_node.getAttribute("id"))
        system_name = system_node.getAttribute("name")

        for section_node in system_node.getElementsByTagName("section"):
            section_id = int(section_node.getAttribute("id"))
            section_name = section_node.getAttribute("name")

            # The crawl cycle is declared once per section and shared by
            # every seed inside it; only the first match is used.
            crawl_cycle_node = section_node.getElementsByTagName("crawl_cycle")
            crawl_cycle = crawl_cycle_node[0].childNodes[0].nodeValue

            for seed_node in section_node.getElementsByTagName("seed"):
                userblog_url_node = seed_node.getElementsByTagName("userblog_url")
                seed = {
                    "crawl_cycle": crawl_cycle,
                    "system_id": system_id,
                    "system_name": system_name,
                    "section_id": section_id,
                    "section_name": section_name,
                    "seed_id": int(seed_node.getAttribute("id")),
                    "userblog_url": userblog_url_node[0].childNodes[0].nodeValue,
                }
                seed_list.append(seed)

                if verbose:
                    # Single-argument print(...) parses identically on
                    # Python 2 and Python 3.
                    print("-------------------------------------------")
                    print("system_id:%d" % seed["system_id"])
                    print("system_name:%s" % seed["system_name"])
                    print(" section_id:%d" % seed["section_id"])
                    print(" section_name:%s" % seed["section_name"])
                    print(" seed_id:%d" % seed["seed_id"])
                    print(" userblog_url:%s" % seed["userblog_url"])
                    print("=========================")
                    print(seed)
                    # "pause" is a Windows cmd built-in that waits for a
                    # key press; on other shells the command just fails.
                    os.system("pause")

    return seed_list
示例 XML 文件内容如下(标签似乎已在转载时丢失,仅剩其中的 URL 文本):
http://aaa.com.cn/loveissuuny http://aaa.com.cn/loveissuuny http://aaa.com.cn/sanxiazaixian http://aaa.com.cn/twocold http://aaa.com.cn/u/1233526741