Scraping images from the xiaohua site (xueshengmai.com) with Scrapy

Part 1: Basic version (scrape the images on the front page)

Spider (.py file) code:

# -*- coding: utf-8 -*-
import scrapy
import sys
import io
import re
from scrapy.selector import Selector
from scrapy.http import Request
from ..items import Day96XiaohuaItem

# Force UTF-8 stdout so Chinese titles print correctly on a GBK console
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')


class XiaohuaSpider(scrapy.Spider):
    name = 'xiaohua'
    # allowed_domains takes domain names only, not URLs with a path
    allowed_domains = ['www.xueshengmai.com']
    start_urls = ['http://www.xueshengmai.com/hua/']

    def parse(self, response):
        # ------------ extract the data for persistence ------------
        hxs = Selector(response=response).xpath("//div[@class='item_t']/div[@class='img']/a/img").extract()
        for i in hxs:
            # Parse the file name (alt) and the image URL (src) out of the raw <img> tag
            title = re.findall('alt=(.*) src=', i)[0].strip('"') + ".jpg"
            src = "http://www.xueshengmai.com%s" % re.findall('src=(.*)>', i)[0].strip('"')
            print(title, src)
            item_obj = Day96XiaohuaItem(title=title, src=src)
            yield item_obj
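
If the regex parsing feels brittle, the same fields can be read straight from the tag attributes. A sketch of an alternative parse method, assuming the same page structure (response.urljoin resolves the relative src):

    def parse(self, response):
        # Read alt/src directly off each <img> node instead of regex-parsing the raw tag
        for img in response.xpath("//div[@class='item_t']/div[@class='img']/a/img"):
            title = img.xpath("./@alt").extract_first("") + ".jpg"
            src = response.urljoin(img.xpath("./@src").extract_first(""))
            yield Day96XiaohuaItem(title=title, src=src)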

items.py code:

import scrapy


class Day96XiaohuaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    src = scrapy.Field()

pipelines.py code:

import os

import requests


class Day96XiaohuaPipeline(object):
    def process_item(self, item, spider):
        # Download the image and save it under imgs/, using the title as the file name
        os.makedirs("imgs", exist_ok=True)
        file_path = "imgs/%s" % item["title"]
        img_data = requests.get(item["src"])
        with open(file_path, "wb") as f:
            f.write(img_data.content)
        return item
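
For the pipeline to actually run, it has to be registered in settings.py. A minimal sketch, assuming the project module is named day96_xiaohua (use whatever name scrapy startproject generated for you):

# settings.py
ITEM_PIPELINES = {
    "day96_xiaohua.pipelines.Day96XiaohuaPipeline": 300,
}

Then start the crawl from the project root:

scrapy crawl xiaohua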

Part 2: Scraping xiaohua images page by page
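
The idea is the same as the basic version, plus extra Requests that feed the pagination links back into parse (Request is already imported from scrapy.http in the spider above). A minimal sketch of the extra loop; the pager's class name, page_num here, is an assumption and needs to be checked against the real page source:

    def parse(self, response):
        # ... extract and yield the image items exactly as in the basic version ...

        # Follow each pagination link and parse it with the same callback
        for href in response.xpath("//div[@class='page_num']/a/@href").extract():
            yield Request(url=response.urljoin(href), callback=self.parse)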
