Python example: implementing a web crawler with RabbitMQ
Write tasks.py
The code is as follows:
from celery import Celery
from tornado.httpclient import HTTPClient, HTTPError

app = Celery('tasks')
app.config_from_object('celeryconfig')

@app.task
def get_html(url):
    # Download the page inside the Celery worker and hand back the raw body.
    http_client = HTTPClient()
    try:
        response = http_client.fetch(url, follow_redirects=True)
        return response.body
    except HTTPError:
        return None
    finally:
        # Always release the client, even when fetch() raised.
        http_client.close()
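To confirm the task round-trips through RabbitMQ, it can be called on its own before wiring up the crawler. The snippet below is a minimal sketch of mine, not part of the original article; it assumes a worker has already been started (for example with celery -A tasks worker --loglevel=info) and that RabbitMQ is listening on localhost:

from tasks import get_html

# Enqueue the fetch on the broker and block until the worker returns the body.
result = get_html.delay('http://www.example.com/')
print(result.get(timeout=10))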
Write celeryconfig.py
The code is as follows:
CELERY_IMPORTS = ('tasks',)
BROKER_URL = 'amqp://guest@localhost:5672//'
CELERY_RESULT_BACKEND = 'amqp://'
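The uppercase setting names above target the older Celery releases this article was written against. On Celery 4 or later the lowercase names are preferred; a rough equivalent of the same config, offered as an assumption about your installed version, would be:

imports = ('tasks',)
broker_url = 'amqp://guest@localhost:5672//'
result_backend = 'amqp://'   # newer Celery releases prefer 'rpc://' here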
Write spider.py
The code is as follows:
from tasks import get_html
from queue import Queue
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import threading

class spider(object):
    def __init__(self):
        self.visited = {}
        self.queue = Queue()

    def process_html(self, html):
        pass
        # print(html)

    def _add_links_to_queue(self, url_base, html):
        # Pull every <a href> out of the page and queue it as an absolute URL.
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a')
        for link in links:
            try:
                url = link['href']
            except KeyError:
                pass
            else:
                url_com = urlparse(url)
                if not url_com.netloc:
                    # Relative link: resolve it against the page it came from.
                    self.queue.put(urljoin(url_base, url))
                else:
                    self.queue.put(url_com.geturl())

    def start(self, url):
        self.queue.put(url)
        for i in range(20):
            t = threading.Thread(target=self._worker)
            t.daemon = True
            t.start()
        self.queue.join()

    def _worker(self):
        while 1:
            url = self.queue.get()
            if url in self.visited:
                self.queue.task_done()
                continue
            self.visited[url] = True
            # Hand the download off to a Celery worker over RabbitMQ.
            result = get_html.delay(url)
            try:
                html = result.get(timeout=5)
            except Exception as e:
                print(url)
                print(e)
                self.queue.task_done()
                continue
            if html:
                self.process_html(html)
                self._add_links_to_queue(url, html)
            self.queue.task_done()
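A minimal way to drive the crawler, as a sketch of mine (the start URL is only a placeholder), assuming the Celery worker and RabbitMQ are already running:

if __name__ == '__main__':
    s = spider()
    s.start('http://www.example.com/')   # crawl until the queue drains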
Because of some special cases that occur in real-world HTML, the program still needs further refinement.
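One such special case is hrefs that are not fetchable pages at all (mailto:, javascript:, tel:, or bare #fragments). A helper along these lines could be called in _add_links_to_queue before putting a URL on the queue; the function name and exact rules are my own illustration, not from the original article:

from urllib.parse import urldefrag, urlparse

def is_crawlable(url):
    # Drop the #fragment so the same page is not queued twice,
    # then keep only http/https (or scheme-less relative) links.
    url, _fragment = urldefrag(url)
    return urlparse(url).scheme in ('', 'http', 'https')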