scrapy爬取美剧天堂排名100
import scrapy


class MeiJuItem(scrapy.Item):
    """Container for one entry scraped from the meijutt.tv top-100 list.

    Declares the stored fields up front so the spider and pipelines
    agree on the item schema.

    Fields:
        name:  show title (from the <a> title attribute)
        href:  relative link to the show's detail page
        state: update status text (e.g. current episode count)
        tv:    broadcasting TV station
        time:  last-updated timestamp text
    """

    name = scrapy.Field()
    href = scrapy.Field()
    state = scrapy.Field()
    tv = scrapy.Field()
    time = scrapy.Field()
from Example.example.items import MeiJuItem


class MeiJuSpider(scrapy.Spider):
    """Spider scraping the US-TV top-100 ranking page of meijutt.tv."""

    name = "meiju"
    start_urls = ["https://www.meijutt.tv/new100.html"]

    def parse(self, response):
        """Yield one MeiJuItem per <li> row of the top-100 list.

        Uses extract_first() so a missing node yields None instead of
        raising IndexError on an empty selector result.
        """
        for li in response.xpath('.//ul[@class="top-list fn-clear"]/li'):
            item = MeiJuItem()
            item["name"] = li.xpath("./h5/a/@title").extract_first()
            item["href"] = li.xpath("./h5/a/@href").extract_first()
            # text() is the node test for text content; the XPath string()
            # function is not a valid location step here.
            item["state"] = li.xpath(
                './span[@class="state1 new100state1"]/text()'
            ).extract_first()
            item["tv"] = li.xpath(
                './span[@class="mjtv"]/text()'
            ).extract_first()
            item["time"] = li.xpath(
                './div[@class="lasted-time new100time fn-right"]/text()'
            ).extract_first()
            yield item
import json


class MeiJuPipeline(object):
    """Item pipeline that appends each scraped item to ./meiju.json.

    Writes one JSON object per line (JSON Lines) so the output stays
    valid even across multiple runs, using a single file handle opened
    once per crawl instead of re-opening the file for every item.
    """

    def __init__(self):
        # The handle is opened lazily in open_spider; nothing to do here.
        self.file = None

    def open_spider(self, spider):
        # Called once when the spider is opened: acquire the output file.
        # 'a' + explicit utf-8 replaces the original's unused 'wb' handle.
        self.file = open("./meiju.json", "a", encoding="utf-8")

    def process_item(self, item, spider):
        """Serialize one item and pass it on unchanged.

        Args:
            item:   the scraped Item (dict-like).
            spider: the spider that produced the item (unused).

        Returns:
            The same item, so later pipeline components still receive it.
        """
        json.dump(dict(item), self.file, ensure_ascii=False)
        self.file.write("\n")
        return item

    def close_spider(self, spider):
        # Called once when the spider is closed: release the file handle.
        if self.file is not None:
            self.file.close()
            self.file = None


if __name__ == "__main__":
    # Guarded so importing this module does not launch a crawl.
    from scrapy import cmdline

    cmdline.execute("scrapy crawl meiju".split())
相关推荐
paleyellow 2020-10-25
baifanwudi 2020-10-25
fangjack 2020-06-25
andrewwf 2020-05-08
andrewwf 2020-11-11
Arvinzx 2020-10-28
CycloneKid 2020-10-27
heyboz 2020-10-21
wumxiaozhu 2020-10-16
ZHANGRENXIANG00 2020-07-27
zhangll00 2020-07-05
javaraylu 2020-06-28
ZHANGRENXIANG00 2020-06-28
Catastrophe 2020-06-26
Catastrophe 2020-06-26
andrewwf 2020-06-16
qyf 2020-06-14