Python Crawler (1) - Scrapy Introduction
> python --version
Python 2.7.13
> pip --version
pip 9.0.1 from /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages (python 2.7)
> pip install scrapy
https://docs.scrapy.org/en/latest/intro/overview.html
First example, quotes_spider.py:
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/tag/humor/',
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.xpath('span/small/text()').extract_first(),
            }

        next_page = response.css('li.next a::attr("href")').extract_first()
        if next_page is not None:
            yield response.follow(next_page, self.parse)
Command to run it:
> scrapy runspider quotes_spider.py -o quotes.json
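Once the crawl finishes, the scraped items land in quotes.json as a JSON array. A minimal sketch to verify the output, assuming the command above was run in the current directory:

import json

# Load the file produced by `scrapy runspider ... -o quotes.json`
with open('quotes.json') as f:
    quotes = json.load(f)

print(len(quotes))          # number of scraped items
print(quotes[0]['author'])  # author of the first quote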
https://docs.scrapy.org/en/latest/intro/tutorial.html
Start a New Project
> scrapy startproject tutorial
First spider, placed under the project's spiders/ directory as quotes_spider.py:
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        page = response.url.split("/")[-2]
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
Run the Project
From the project's top-level directory (quotes here must match the spider's name attribute):
> scrapy crawl quotes
A shortcut to start_requests: instead of implementing the method, define a start_urls class attribute and Scrapy will generate the initial requests for you, sending the responses to parse() by default (a full spider using this shortcut is sketched after the snippet):
start_urls = [
    'http://quotes.toscrape.com/page/1/',
    'http://quotes.toscrape.com/page/2/',
]
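As a minimal sketch, the earlier file-saving spider rewritten with the shortcut; the behavior is the same, only start_requests is gone:

import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
        'http://quotes.toscrape.com/page/2/',
    ]

    # parse() is the default callback for requests generated from start_urls
    def parse(self, response):
        page = response.url.split("/")[-2]
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)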
The Scrapy shell lets you try selectors against the page's DOM interactively:
> scrapy shell 'http://quotes.toscrape.com/page/1'
[s] Available Scrapy objects:
[s]   scrapy     scrapy module (contains scrapy.Request, scrapy.Selector, etc)
[s]   crawler    <scrapy.crawler.Crawler object at 0x104c3db90>
[s]   item       {}
[s]   request    <GET http://quotes.toscrape.com/page/1>
[s]   response   <200 http://quotes.toscrape.com/page/1/>
[s]   settings   <scrapy.settings.Settings object at 0x104c3d110>
[s]   spider     <DefaultSpider 'default' at 0x10582e550>
[s] Useful shortcuts:
[s]   fetch(url[, redirect=True])  Fetch URL and update local objects (by default, redirects are followed)
[s]   fetch(req)                   Fetch a scrapy.Request and update local objects
[s]   shelp()                      Shell help (print this help)
[s]   view(response)               View response in a browser
> response.css('title')
[<Selector xpath=u'descendant-or-self::title' data=u'<title>Quotes to Scrape</title>'>]
> response.css('title::text').extract()
[u'Quotes to Scrape']
> response.css('title::text').extract_first()
u'Quotes to Scrape'
> response.xpath('//title/text()').extract_first()
u'Quotes to Scrape'
> quote = response.css("div.quote")[0]
> title = quote.css("span.text::text").extract_first()
> title
u'\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d'
> for quote in response.css("div.quote"):
...     text = quote.css("span.text::text").extract_first()
...     author = quote.css("small.author::text").extract_first()
...     tags = quote.css("div.tags a.tag::text").extract()
...     print(dict(text=text, author=author, tags=tags))
...
{'text': u'\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d', 'tags': [u'change', u'deep-thoughts', u'thinking', u'world'], 'author': u'Albert Einstein'}
{'text': u'\u201cIt is our choices, Harry, that show what we truly are, far more than our abilities.\u201d', 'tags': [u'abilities', u'choices'], 'author': u'J.K. Rowling'}
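The same selector API also works outside the shell. A minimal sketch using scrapy.Selector on a static snippet; the HTML fragment here is made up for illustration, loosely mimicking the quotes.toscrape.com markup:

from scrapy import Selector

html = '''
<div class="quote">
  <span class="text">"A made-up quote."</span>
  <small class="author">Some Author</small>
  <div class="tags"><a class="tag">example</a></div>
</div>
'''

sel = Selector(text=html)
for quote in sel.css('div.quote'):
    print(quote.css('span.text::text').extract_first())
    print(quote.css('small.author::text').extract_first())
    print(quote.css('div.tags a.tag::text').extract())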
Change the spider's parse() to extract the data instead of saving raw HTML:
    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('small.author::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }
Output the scraped items to a JSON file:
> scrapy crawl quotes -o quotes.json
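Note that -o appends to an existing file rather than overwriting it, which leaves a JSON array invalid across repeated runs; for incremental runs the JSON Lines format (one object per line, inferred from the .jl extension) is safer:
> scrapy crawl quotes -o quotes.jl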
> response.css('li.next a::attr(href)').extract_first()
u'/page/2/'
Find the Next Page
The extracted href is relative, so response.urljoin() builds an absolute URL before requesting it:
next_page = response.css('li.next a::attr(href)').extract_first()
if next_page is not None:
    next_page = response.urljoin(next_page)
    yield scrapy.Request(next_page, callback=self.parse)
Or, alternatively, response.follow() accepts relative URLs directly; combining it with the extraction loop gives the complete parse() shown below:
if next_page is not None:
    yield response.follow(next_page, callback=self.parse)
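Putting the two snippets above together, the full parse() extracts every quote on the page and then follows the pagination link:

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('small.author::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }

        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)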
Author Spider
This spider follows both the author links on each page and the pagination links. Scrapy filters duplicate requests by default, so an author quoted on many pages is only fetched once.
import scrapy


class AuthorSpider(scrapy.Spider):
    name = 'author'

    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # follow links to author pages
        for href in response.css('.author + a::attr(href)'):
            yield response.follow(href, self.parse_author)

        # follow pagination links
        for href in response.css('li.next a::attr(href)'):
            yield response.follow(href, self.parse)

    def parse_author(self, response):
        def extract_with_css(query):
            return response.css(query).extract_first().strip()

        yield {
            'name': extract_with_css('h3.author-title::text'),
            'birthdate': extract_with_css('.author-born-date::text'),
            'bio': extract_with_css('.author-description::text'),
        }
> scrapy crawl author -o authors.json
Receive Parameters
Spider arguments are passed with the -a option and become attributes on the spider instance (a full spider using this pattern is sketched below):
> scrapy crawl quotes -o quotes-humor.json -a tag=humor

    def start_requests(self):
        url = 'http://quotes.toscrape.com/'
        tag = getattr(self, 'tag', None)
        if tag is not None:
            url = url + 'tag/' + tag
        yield scrapy.Request(url, self.parse)
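As a complete sketch, the quotes spider with the tag argument wired in, combining the start_requests above with the parse() from earlier:

import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        url = 'http://quotes.toscrape.com/'
        # `tag` is set from the command line via -a tag=humor
        tag = getattr(self, 'tag', None)
        if tag is not None:
            url = url + 'tag/' + tag
        yield scrapy.Request(url, self.parse)

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('small.author::text').extract_first(),
            }

        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            yield response.follow(next_page, self.parse)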