代码如下:
# Third-party: creepy (lightweight crawler framework) and BeautifulSoup v3.
from creepy import Crawler
from BeautifulSoup import BeautifulSoup

# Standard library (Python 2).
import urllib2
import json
class mycrawler(crawler): def process_document(self, doc): if doc.status == 200: print ‘[%d] %s’ % (doc.status, doc.url) try: soup = beautifulsoup(doc.text.decode(‘gb18030’).encode(‘utf-8′)) except exception as e: print e soup = beautifulsoup(doc.text) print soup.find().p.h1.text url_utf8’).split(‘/’)[-1].split(‘.’)[0] f = urllib2.urlopen(‘http://p.3.cn/prices/get?sku+url_id,timeout=5) price=json.loads(f.read()) f.close() print price[0][‘p’] else: pass
# Driver: crawl the JD item page, staying on the same host, skipping
# static assets, with 16 concurrent fetchers.
crawler = MyCrawler()
crawler.set_follow_mode(Crawler.F_SAME_HOST)  # class constant, not an instance attr
crawler.set_concurrency_level(16)
# Don't fetch images/scripts/styles — only HTML pages matter here.
crawler.add_url_filter(r'\.(jpg|jpeg|gif|png|js|css|swf)$')
crawler.crawl('http://item.jd.com/982040.html')