Python Crawler (1) - Scrapy Introduction

Python Crawler (1) - Scrapy Introduction

> python --version

Python 2.7.13

> pip --version

pip 9.0.1 from /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages (python 2.7)

> pip install scrapy

https://docs.scrapy.org/en/latest/intro/overview.html

First example here: quotes_spider.py

import scrapy


class QuotesSpider(scrapy.Spider):
    """Scrape quotes tagged "humor" and follow pagination links.

    Yields one dict per quote with 'text' and 'author' keys.
    """

    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/tag/humor/',
    ]

    def parse(self, response):
        # Each quote lives in a <div class="quote"> container.
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.xpath('span/small/text()').extract_first(),
            }

        # Follow the "Next" pagination link if one exists.
        # NOTE: the selector needs the descendant space: 'li.next a'.
        next_page = response.css('li.next a::attr("href")').extract_first()
        if next_page is not None:
            # response.follow resolves relative URLs automatically.
            yield response.follow(next_page, self.parse)

Command to check:

> scrapy runspider quotes_spider.py -o quotes.json

https://docs.scrapy.org/en/latest/intro/tutorial.html

Start a New Project

> scrapy startproject tutorial

First Spider under spiders/, quotes_spider.py

import scrapy


class QuotesSpider(scrapy.Spider):
    """Download quote listing pages and save each one as a local HTML file."""

    name = "quotes"

    def start_requests(self):
        # Explicit request generation; start_urls is the shortcut for this.
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # The second-to-last path segment is the page number,
        # e.g. '.../page/1/' -> '1'.
        page = response.url.split("/")[-2]
        filename = 'quotes-%s.html' % page
        # Write raw bytes ('wb') — response.body is bytes, not text.
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)

Run the Project

> scrapy crawl quotes

A shortcut to the start_requests

# Shortcut: defining start_urls replaces a hand-written start_requests()
# method; Scrapy generates a Request for each URL with parse() as callback.
start_urls = [
    'http://quotes.toscrape.com/page/1/',
    'http://quotes.toscrape.com/page/2/',
]

This shell command will open all the DOM elements on the page

> scrapy shell 'http://quotes.toscrape.com/page/1'

[s] Available Scrapy objects:

[s]   scrapy     scrapy module (contains scrapy.Request, scrapy.Selector, etc)

[s]   crawler    <scrapy.crawler.Crawler object at 0x104c3db90>

[s]   item       {}

[s]   request    <GET http://quotes.toscrape.com/page/1>

[s]   response   <200 http://quotes.toscrape.com/page/1/>

[s]   settings   <scrapy.settings.Settings object at 0x104c3d110>

[s]   spider     <DefaultSpider 'default' at 0x10582e550>

[s] Useful shortcuts:

[s]   fetch(url[, redirect=True])  Fetch URL and update local objects (by default, redirects are followed)

[s]   fetch(req)                   Fetch a scrapy.Request and update local objects

[s]   shelp()                      Shell help (print this help)

[s]   view(response)               View response in a browser

> response.css('title')

[<Selector xpath=u'descendant-or-self::title' data=u'<title>Quotes to Scrape</title>'>]

> response.css('title::text').extract()

[u'Quotes to Scrape']

> response.css('title::text').extract_first()

u'Quotes to Scrape'

> response.xpath('//title/text()').extract_first()

u'Quotes to Scrape'

> quote = response.css("div.quote")[0]

> title = quote.css("span.text::text").extract_first()

> title

u'\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d'

> for quote in response.css("div.quote"):

...     text = quote.css("span.text::text").extract_first()

...     author = quote.css("small.author::text").extract_first()

...     tags = quote.css("div.tags a.tag::text").extract()

...     print(dict(text=text, author=author, tags=tags))

...

{'text': u'\u201cThe world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.\u201d', 'tags': [u'change', u'deep-thoughts', u'thinking', u'world'], 'author': u'Albert Einstein'}

{'text': u'\u201cIt is our choices, Harry, that show what we truly are, far more than our abilities.\u201d', 'tags': [u'abilities', u'choices'], 'author': u'J.K. Rowling'}

Change the Python Script to Parse the Data in the Spider

def parse(self, response):
    """Yield a dict of text, author and tags for each quote on the page."""
    for quote in response.css('div.quote'):
        yield {
            'text': quote.css('span.text::text').extract_first(),
            'author': quote.css('small.author::text').extract_first(),
            # Descendant selector: <a class="tag"> inside <div class="tags">.
            'tags': quote.css('div.tags a.tag::text').extract(),
        }

Output the JSON somewhere

> scrapy crawl quotes -o quotes.json

> response.css('li.next a::attr(href)').extract_first()

u'/page/2/'

Find the Next Page

next_page=response.css('li.nexta::attr(href)').extract_first()

ifnext_pageisnotNone:

next_page=response.urljoin(next_page)

yieldscrapy.Request(next_page,callback=self.parse)

Or alternatively:

ifnext_pageisnotNone:

yieldresponse.follow(next_page,callback=self.parse)

Author Spider

import scrapy


class AuthorSpider(scrapy.Spider):
    """Crawl author detail pages reachable from the quote listings.

    Yields one dict per author with name, birthdate and bio.
    """

    name = 'author'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # Follow the "(about)" link next to each author name.
        # Adjacent-sibling selector needs spaces: '.author + a'.
        for href in response.css('.author + a::attr(href)'):
            yield response.follow(href, self.parse_author)

        # Follow pagination so every listing page is visited.
        for href in response.css('li.next a::attr(href)'):
            yield response.follow(href, self.parse)

    def parse_author(self, response):
        def extract_with_css(query):
            # Helper: first match for the selector, stripped of whitespace.
            return response.css(query).extract_first().strip()

        yield {
            'name': extract_with_css('h3.author-title::text'),
            'birthdate': extract_with_css('.author-born-date::text'),
            'bio': extract_with_css('.author-description::text'),
        }

> scrapy crawl author -o authors.json

Receive Parameters

> scrapy crawl quotes -o quotes-humor.json -a tag=humor

def start_requests(self):
    """Build the start URL, optionally narrowed to one tag.

    The -a tag=... command-line option becomes a spider attribute,
    so getattr() falls back to None when no tag was passed.
    """
    url = 'http://quotes.toscrape.com/'
    tag = getattr(self, 'tag', None)
    if tag is not None:
        url = url + 'tag/' + tag
    yield scrapy.Request(url, self.parse)

References:

https://www.debrice.com/building-a-simple-crawler/

https://gist.github.com/debrice/a34563fb078d9d2d15e8

https://scrapy.org/

https://medium.com/python-pandemonium/develop-your-first-web-crawler-in-python-scrapy-6b2ee4baf954

Related recommendations