Scrapy（五）：CrawlSpider的使用

wumxiaozhu

2020-05-26

Scrapy（五）：CrawlSpider的使用

说明：CrawlSpider 是 Scrapy 官方提供的一个类，是 Spider 的子类。因为是子类，所以功能更加强大，比 Spider 多了一项功能：从指定的页面中提取符合指定规则的 url。
比如：很多页码，都需要自己去查找规律，然后写代码实现其它页面的爬取，学完crawlspider之后，可以让它直接提取符合要求的页码url，将这些url扔给调度器即可
链接提取器，在scrapy中就是一个类，LinkExtractor

一、创建项目：

scrapy startproject qiubaiproject

二、创建CrawlSpider爬虫文件

scrapy genspider -t crawl qiu www.qiushibaike.com

三、书写spider文件

# -*- coding: utf-8 -*-

"""
file:  qiu.py
"""
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from  qiubaiproject.items import QiubaiprojectItem


class QiuSpider(CrawlSpider):
    """Follow the "8hr" pagination links of qiushibaike.com and scrape each post.

    Every page whose URL matches ``/8hr/page/<number>/`` is downloaded, passed
    to :meth:`parse_item`, and scanned again for further pagination links.
    """

    name = 'qiu'
    allowed_domains = ['www.qiushibaike.com']
    start_urls = ['http://www.qiushibaike.com/']

    # NOTE: when subclassing CrawlSpider, do not override ``parse``;
    # CrawlSpider implements important functionality in it.

    # Link extractor: links matching its rules are extracted from a response.
    # scrapy.linkextractors.LinkExtractor accepts:
    #   allow = ()            regular expression(s) a link must match
    #   deny = ()             regular expression(s) to exclude (rarely used)
    #   allow_domains = ()    only extract links under these domains (rarely used)
    #   deny_domains = ()     never extract links under these domains (rarely used)
    #   restrict_xpaths = ()  only extract links found under these XPath regions
    #   restrict_css = ()     only extract links found under these CSS selectors
    page_link = LinkExtractor(allow=r'/8hr/page/\d+/')

    # Rule arguments:
    #   callback  name of the method that handles each response. Unlike a plain
    #             Spider (callback=self.parse), a CrawlSpider rule takes the
    #             method name as a string: callback='parse_item'.
    #   follow    whether links should in turn be extracted from the responses
    #             of the extracted links. Defaults to False when a callback is
    #             given and to True when it is not.
    rules = (
        Rule(page_link, callback='parse_item', follow=True),
    )

    @staticmethod
    def _first(node, query):
        """Return the first result of *query* under *node*, stripped, or None.

        Using ``extract_first`` instead of indexing ``[0]`` means a post that
        lacks the node yields ``None`` rather than raising ``IndexError`` and
        aborting the whole page.
        """
        value = node.xpath(query).extract_first()
        return value.strip() if value is not None else None

    def parse_item(self, response):
        """Yield one ``QiubaiprojectItem`` per post found in *response*.

        Fields that cannot be found are set to ``None``, except ``content``,
        which falls back to an empty string. ``name``, ``age`` and ``content``
        hold text rather than HTML markup, and ``content`` is a single string.
        """
        # Each direct child div of the content container is one post.
        for post in response.xpath('//div[@id="content-left"]/div'):
            # Avatar: the site serves protocol-relative URLs ("//host/path"),
            # so force https for those and resolve anything else normally.
            avatar = self._first(post, './/div[1]//img/@src')
            if avatar:
                if avatar.startswith('//'):
                    avatar = 'https:' + avatar
                else:
                    avatar = response.urljoin(avatar)

            # XPath string() flattens the first matching span to its text;
            # it evaluates to '' when nothing matches.
            body = post.xpath(
                'string(.//div[@class="content"]/span[1])'
            ).extract_first(default='').strip()

            item = QiubaiprojectItem()
            item['image_src'] = avatar
            # User name (text of the h2, not the element markup).
            item['name'] = self._first(post, './/div[1]//h2/text()')
            # Age (text of the articleGender div).
            item['age'] = self._first(
                post, './/div[starts-with(@class,"articleGender")]/text()'
            )
            item['content'] = body
            # Number of "funny" votes.
            item['haha_count'] = self._first(
                post, './/div[@class="stats"]/span[@class="stats-vote"]/i/text()'
            )
            # Number of comments.
            item['ping_count'] = self._first(
                post, './/div[@class="stats"]/span[@class="stats-comments"]//i/text()'
            )

            yield item

四、书写items.py文件

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class QiubaiprojectItem(scrapy.Item):
    """Container for the fields scraped from a single post."""
    # Image link
    image_src = scrapy.Field()
    # User name
    name = scrapy.Field()
    # Age
    age = scrapy.Field()
    # Content
    content = scrapy.Field()
    # Number of "funny" votes
    haha_count = scrapy.Field()
    # Number of comments
    ping_count = scrapy.Field()

五、书写管道文件

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json

class QiubaiprojectPipeline(object):
    """Write every item to ``qiubai.json``, one JSON object per line."""

    def open_spider(self, spider):
        """Open the output file; called only once, when the spider starts."""
        # Opening the file here keeps a single handle for the whole crawl.
        self.fp = open('qiubai.json', 'w', encoding='utf8')

    def process_item(self, item, spider):
        """Serialize *item* as one JSON line and return it unchanged.

        This method is called back for every item that comes through.
        """
        # Convert the item to a plain dict, then to a JSON string.
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        line = json.dumps(dict(item), ensure_ascii=False)
        self.fp.write(line + '\n')
        return item

    def close_spider(self, spider):
        """Close the output file; called when the spider finishes."""
        self.fp.close()

六、修改settings.py文件

ROBOTSTXT_OBEY = True

# Delay between requests, to avoid losing data by crawling too fast.
DOWNLOAD_DELAY = 4

# Headers sent with every request.
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}

# Must name this project's pipeline class; a path into another project
# means the pipeline defined above is never run.
ITEM_PIPELINES = {
    'qiubaiproject.pipelines.QiubaiprojectPipeline': 300,
}