cnblogs 博客爬取 + scrapy + 持久化
cnblogs_spider.py
# -*- coding: utf-8 -*-
"""Scrapy spider that walks the cnblogs.com post listing and every linked post."""
import scrapy

from ..items import TttItem


class ChoutiSpider(scrapy.Spider):
    """Crawl the cnblogs listing pages and yield one ``TttItem`` per post.

    ``parse`` reads the summary fields (title, outline, author, url) from a
    listing page, then requests each post so that ``get_detail`` can attach
    the post body before the item is emitted.
    """

    name = 'chouti'  # spider name
    start_urls = ['https://www.cnblogs.com']

    def parse(self, response):
        """Yield a detail request per listed post, then follow the pager.

        Args:
            response: The downloaded listing page.

        Yields:
            ``scrapy.Request`` objects: one per post (handled by
            ``get_detail``) and, when a pager link exists, one for the next
            listing page (handled by ``parse`` again).
        """
        for div in response.xpath('//div[@class="post_item_body"]'):
            title = div.xpath('./h3/a/text()').extract_first()
            url = div.xpath('./h3/a/@href').extract_first()
            # The summary element may hold several text nodes; the last one
            # is used. Guard the empty case instead of raising IndexError.
            summary_parts = div.css('.post_item_summary::text').extract()
            outline = summary_parts[-1] if summary_parts else None
            author = div.xpath(
                './div[@class="post_item_foot"]/a/text()'
            ).extract_first()

            if not url:
                # Without a link there is no detail page to fetch.
                continue

            item = TttItem()
            item['title'] = title
            item['outline'] = outline
            item['author'] = author
            item['url'] = url
            # The half-filled item travels with the request via ``meta`` so
            # the detail callback can complete it.
            yield scrapy.Request(
                response.urljoin(url),
                callback=self.get_detail,
                meta={'item': item},
            )

        self.logger.debug('listing page: %s', response.url)
        # Take the last <a> inside the pager as the "next page" link.
        next_url = response.xpath(
            '//div[@class="pager"]/a[last()]/@href'
        ).extract_first()
        self.logger.debug('next_url %s', next_url)
        if next_url:
            # urljoin resolves relative and absolute hrefs alike; plain
            # string concatenation failed on None and on non-root-relative
            # links.
            yield scrapy.Request(response.urljoin(next_url), callback=self.parse)

    def get_detail(self, response):
        """Attach the post body to the item carried in ``response.meta``.

        Args:
            response: The downloaded post page; ``response.meta['item']`` is
                the item created in ``parse``.

        Yields:
            The completed item, with ``content`` set to the extracted markup
            (``None`` when neither selector matches).
        """
        content = response.xpath('//div[@id="cnblogs_post_body"]').extract_first()
        if not content:
            # NOTE(review): this CSS selector matches <content> *elements*,
            # not a class or id -- confirm that is the intended fallback.
            content = response.css('content').extract_first()
        item = response.meta['item']
        item['content'] = content
        yield item
pipelines.py
import pymysql


class CnblogsSaveMysqlPipline(object):
    """Item pipeline that stores every scraped post in the MySQL table ``cnb``.

    One connection is opened when the spider starts and closed when it
    finishes; each item is inserted and committed individually.
    """

    def open_spider(self, spider):
        """Open the MySQL connection used for the whole crawl."""
        # NOTE(review): credentials are hard-coded; consider reading them
        # from the project settings instead.
        self.conn = pymysql.connect(
            user='root',
            password='123123',
            db='cnblogs',
            # Scraped posts contain non-ASCII text; pin the connection
            # charset rather than relying on the driver default.
            charset='utf8mb4',
        )

    def close_spider(self, spider):
        """Close the MySQL connection when the spider finishes."""
        self.conn.close()

    def process_item(self, item, spider):
        """Insert one item into ``cnb`` and pass it on.

        Args:
            item: Mapping with ``title``, ``outline``, ``author``, ``url``
                and ``content`` keys.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines still receive it.

        Raises:
            pymysql.MySQLError: If the insert fails; the transaction is
                rolled back before the error propagates.
        """
        sql = '''insert into cnb (title, outline, author, url, content) values (%s,%s,%s,%s,%s)'''
        params = (
            item['title'],
            item['outline'],
            item['author'],
            item['url'],
            item['content'],
        )
        try:
            # The context manager closes the cursor even when execute fails.
            with self.conn.cursor() as cursor:
                cursor.execute(sql, args=params)
            self.conn.commit()
        except pymysql.MySQLError:
            # Do not leave a half-open transaction on the shared connection.
            self.conn.rollback()
            raise
        return item
相关推荐
温攀峰 2020-10-23
Dyancsdn 2020-07-28
鲁氏汤包王 2020-06-08
Callmesmallpure 2020-05-31
无能力者只知抱怨 2020-05-09
技术之博大精深 2020-05-03
shenzhenzsw 2020-05-01
jokewinl 2020-04-20
ljbhander 2020-04-11
JavaWDB 2020-03-26
fansenjun 2020-03-03
wangxiaoxue 2020-02-17
Java高知 2020-02-14
wangxiaoxue 2020-01-29
qiqizhiyun 2020-01-29
杜引强 2020-01-13