Example: implementing a web crawler in Python with RabbitMQ

Writing tasks.py

The code is as follows:

from celery import Celery
from tornado.httpclient import HTTPClient, HTTPError

app = Celery('tasks')
app.config_from_object('celeryconfig')

@app.task
def get_html(url):
    """Fetch a page with Tornado's blocking HTTP client and return its body."""
    http_client = HTTPClient()
    try:
        response = http_client.fetch(url, follow_redirects=True)
        return response.body
    except HTTPError:
        return None
    finally:
        http_client.close()
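Before the task can be called from other code, a Celery worker has to be running against the same RabbitMQ broker (for example, started with something like celery -A tasks worker --loglevel=info once RabbitMQ is up). The snippet below is a minimal sketch, not part of the original article, of dispatching the task and waiting for its result; the seed URL is only an illustration.

    # A minimal sketch (assumes RabbitMQ is running locally and a Celery worker
    # for the tasks module has been started); the URL is just an example.
    from tasks import get_html

    result = get_html.delay("http://www.bitscn.com/")  # queue the fetch on RabbitMQ
    html = result.get(timeout=10)                       # block until a worker returns the body
    if html is not None:
        print(len(html))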

Writing celeryconfig.py

The code is as follows:

CELERY_IMPORTS = ('tasks',)
BROKER_URL = 'amqp://guest@localhost:5672//'
CELERY_RESULT_BACKEND = 'amqp://'
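If you prefer not to keep a separate configuration module, the same settings can be passed directly to the Celery constructor. This is only an equivalent sketch, not part of the original example; the broker and backend URLs are the same ones used above.

    # Equivalent configuration without celeryconfig.py (a sketch).
    from celery import Celery

    app = Celery('tasks',
                 broker='amqp://guest@localhost:5672//',
                 backend='amqp://')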

Writing spider.py

The code is as follows:

from tasks import get_html
from queue import Queue
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import threading


class Spider(object):
    def __init__(self):
        self.visited = {}
        self.queue = Queue()

    def process_html(self, html):
        pass
        # print(html)

    def _add_links_to_queue(self, url_base, html):
        soup = BeautifulSoup(html)
        links = soup.find_all('a')
        for link in links:
            try:
                url = link['href']
            except KeyError:
                pass
            else:
                url_com = urlparse(url)
                if not url_com.netloc:
                    # Relative link: resolve it against the page it was found on.
                    self.queue.put(urljoin(url_base, url))
                else:
                    self.queue.put(url_com.geturl())

    def start(self, url):
        self.queue.put(url)
        for i in range(20):
            t = threading.Thread(target=self._worker)
            t.daemon = True
            t.start()
        self.queue.join()

    def _worker(self):
        while 1:
            url = self.queue.get()
            if url in self.visited:
                self.queue.task_done()
                continue
            # Hand the actual download off to a Celery worker via RabbitMQ.
            result = get_html.delay(url)
            try:
                html = result.get(timeout=5)
            except Exception as e:
                print(url)
                print(e)
                self.queue.task_done()
                continue
            self.process_html(html)
            self._add_links_to_queue(url, html)
            self.visited[url] = True
            self.queue.task_done()


s = Spider()
s.start("http://www.bitscn.com/")

Because of some special cases that occur in real-world HTML, the program still needs further refinement; one possible improvement is sketched below.
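For example, a page sometimes fails to download (get_html returns None), and anchors can carry mailto: or javascript: links. The following is only a sketch, assuming the Spider class above, of one way to guard _add_links_to_queue against those cases.

    # A possible hardening of _add_links_to_queue (a sketch, not the article's code):
    # skip pages that failed to download and links that are not plain HTTP(S).
    def _add_links_to_queue(self, url_base, html):
        if not html:                      # get_html returns None on an HTTPError
            return
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all('a', href=True):
            url = link['href']
            url_com = urlparse(url)
            if url_com.scheme in ('', 'http', 'https'):
                if not url_com.netloc:
                    self.queue.put(urljoin(url_base, url))
                else:
                    self.queue.put(url_com.geturl())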

