Writing tasks.py
The code is as follows:
from celery import Celery
from tornado.httpclient import HTTPClient, HTTPError

app = Celery('tasks')
app.config_from_object('celeryconfig')

@app.task
def get_html(url):
    # Fetch the page with Tornado's blocking HTTPClient; since this runs
    # inside a Celery worker process, a synchronous client is fine here.
    http_client = HTTPClient()
    try:
        response = http_client.fetch(url, follow_redirects=True)
        return response.body
    except HTTPError:
        return None
    finally:
        # Release the client whether the fetch succeeded or failed.
        http_client.close()
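Since a function decorated with @app.task remains directly callable, the fetch logic can be tried in-process, without a broker or worker. A minimal sanity check, assuming the celeryconfig.py from the next step is already on the import path:

from tasks import get_html

# Direct, in-process call: this bypasses Celery's broker entirely and
# runs get_html like a plain function.
body = get_html("http://www.bitscn.com/")
print(len(body) if body is not None else "fetch failed")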
Writing celeryconfig.py
The code is as follows:
CELERY_IMPORTS = ('tasks',)
BROKER_URL = 'amqp://guest@localhost:5672//'
CELERY_RESULT_BACKEND = 'amqp://'
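With both files in place, start the RabbitMQ broker and a worker (for example, celery -A tasks worker --loglevel=info), and the task can then be exercised end to end from a Python shell. A minimal smoke test, assuming the broker URL above is reachable:

from tasks import get_html

# .delay() enqueues the task on the broker; .get() blocks until a
# worker has fetched the page and returned its body.
result = get_html.delay("http://www.bitscn.com/")
body = result.get(timeout=10)
print(len(body) if body is not None else "fetch failed")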
Writing spider.py
The code is as follows:
from tasks import get_html
from queue import Queue
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import threading

class Spider(object):
    def __init__(self):
        self.visited = {}
        self.queue = Queue()

    def process_html(self, html):
        # Hook for page-level processing; left empty here.
        pass
        # print(html)

    def _add_links_to_queue(self, url_base, html):
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a')
        for link in links:
            try:
                url = link['href']
            except KeyError:
                pass
            else:
                url_com = urlparse(url)
                if not url_com.netloc:
                    # Relative link: resolve it against the current page.
                    self.queue.put(urljoin(url_base, url))
                else:
                    self.queue.put(url_com.geturl())

    def start(self, url):
        self.queue.put(url)
        for i in range(20):
            t = threading.Thread(target=self._worker)
            t.daemon = True
            t.start()
        self.queue.join()

    def _worker(self):
        while 1:
            url = self.queue.get()
            if url in self.visited:
                # Already crawled; still mark the item done so that
                # queue.join() can eventually return.
                self.queue.task_done()
                continue
            # Hand the fetch off to a Celery worker and wait for the result.
            result = get_html.delay(url)
            try:
                html = result.get(timeout=5)
            except Exception as e:
                print(url)
                print(e)
                self.queue.task_done()
                continue
            if html is not None:
                self.process_html(html)
                self._add_links_to_queue(url, html)
            self.visited[url] = True
            self.queue.task_done()

s = Spider()
s.start("http://www.bitscn.com/")
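process_html is deliberately left as a stub: it is where page-level processing belongs. A minimal subclass sketch of how it could be filled in, assumed to live in the same file as Spider (TitleSpider and the title extraction are illustrative, not part of the original program):

from bs4 import BeautifulSoup

class TitleSpider(Spider):
    def process_html(self, html):
        # Print each crawled page's <title> instead of discarding the HTML.
        soup = BeautifulSoup(html, 'html.parser')
        if soup.title and soup.title.string:
            print(soup.title.string.strip())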
Owing to various edge cases found in real-world HTML, the program still needs further refinement.