使用python scrapy爬取网页中带有地图展示的数据
最近有个需求,是要爬取某个物流公司的官网信息。我看了下官网,基本上都是静态页面,比较好抓取,不像资讯类、电子商务类网站那样结构复杂、反爬严格、AJAX众多,还内心暗自庆幸。但当我进一步分析时,发现它并非普通的静态页面。
例如这个URL界面,我要获取全中国各大城市的物流园区分布信息,并且要获取详情信息,
这个页面里面是有个地图镶嵌,每个城市物流信息你要单独点击地图上的信息才能显示。
https://www.glprop.com.cn/our...
我刚开始想,这种会不会是AJAX请求呢?通过Chrome抓包并没有发现相关请求,然后我查看了网页源代码,
发现所有城市信息在一个scripts里面
如图:
然后各个园区的信息存在一个叫 parks 的变量(var parks = {...})里面
原来都在这里面,直接获取源代码,正则匹配,开干。
item:
# Puluosi (普洛斯) items
class PuluosiNewsItem(scrapy.Item):
    """One news entry scraped from a news listing table."""

    # Headline text of the news entry.
    newstitle = scrapy.Field()
    # Text of the row's first table cell; presumably the publication
    # date. The spelling "newtiems" is kept because consumers of this
    # item address the field by that exact key.
    newtiems = scrapy.Field()
    # Absolute URL of the news entry.
    newslink = scrapy.Field()


class PuluosiItem(scrapy.Item):
    """One logistics park (asset) record."""

    # Name of the park.
    assetstitle = scrapy.Field()
    # Location of the park (presumably the city name -- confirm with
    # the spider that fills it).
    assetaddress = scrapy.Field()
    # Overview text of the park ("gaikuang" = overview).
    assetgaikuang = scrapy.Field()
    # Additional descriptive text for the park.
    assetpeople = scrapy.Field()
    # URL associated with the park record.
    asseturl = scrapy.Field()
pipelines:
class PuluosiNewsPipeline(object):
    """Collect scraped items into two Excel workbooks.

    ``PuluosiNewsItem`` rows go to ``PuluosiNews.xlsx`` and
    ``PuluosiItem`` rows go to ``PuluosiAsset.xlsx``. Rows are buffered
    in memory while the spider runs and each workbook is written once,
    in ``close_spider``, instead of being re-serialized to disk after
    every single item (which made the total I/O quadratic in the number
    of items).
    """

    def __init__(self):
        # Workbook for news entries, with its header row.
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.append(['普洛斯新闻标题', '新闻发布时间', '新闻URL'])

        # Workbook for park/asset records, with its header row.
        self.wb2 = Workbook()
        self.ws2 = self.wb2.active
        self.ws2.append(['资产标题', '资产地址', '资产概况', '其他信息', 'URL'])

        # Track which workbooks actually received rows, so a file is
        # only created when at least one matching item was processed
        # (same set of output files as saving per item).
        self._news_dirty = False
        self._asset_dirty = False

    def process_item(self, item, spider):
        """Append *item* as a row to the matching worksheet.

        Args:
            item: The scraped item; only ``PuluosiNewsItem`` and
                ``PuluosiItem`` instances are recorded, anything else
                is passed through untouched.
            spider: The spider that produced the item (unused).

        Returns:
            The unmodified item, so later pipelines can process it.
        """
        if isinstance(item, PuluosiNewsItem):
            self.ws.append(
                [item['newstitle'], item['newtiems'], item['newslink']]
            )
            self._news_dirty = True
        elif isinstance(item, PuluosiItem):
            self.ws2.append([
                item['assetstitle'],
                item['assetaddress'],
                item['assetgaikuang'],
                item['assetpeople'],
                item['asseturl'],
            ])
            self._asset_dirty = True
        return item

    def close_spider(self, spider):
        """Write the buffered workbooks to disk when the spider closes.

        NOTE: rows are held in memory until this hook runs, so a hard
        kill of the process loses rows that per-item saving would have
        persisted.
        """
        if self._news_dirty:
            self.wb.save('PuluosiNews.xlsx')
        if self._asset_dirty:
            self.wb2.save('PuluosiAsset.xlsx')
spider:
# -*- coding: utf-8 -*-
import json
import re

import scrapy
from news.items import PuluosiNewsItem, PuluosiItem
from scrapy.linkextractors import LinkExtractor


class PuluosiSpider(scrapy.Spider):
    """Crawl news listings and logistics-park data from glprop.com.cn.

    Three news listing pages are parsed into ``PuluosiNewsItem`` objects
    and the network-detail page, whose data is embedded as JavaScript
    variables (``var cities`` / ``var parks``), is parsed into
    ``PuluosiItem`` objects.
    """

    name = 'puluosi'
    allowed_domains = ['glprop.com.cn']

    def start_requests(self):
        """Schedule the four entry pages, each with its own callback."""
        yield scrapy.Request('https://www.glprop.com.cn/press-releases.html', self.parse1)
        yield scrapy.Request('https://www.glprop.com.cn/in-the-news.html', self.parse2)
        yield scrapy.Request('https://www.glprop.com.cn/proposed-privatization.html', self.parse3)
        yield scrapy.Request('https://www.glprop.com.cn/our-network/network-detail.html', self.parse4)

    def _news_items(self, response, rows):
        """Yield one fresh ``PuluosiNewsItem`` per usable table row.

        A new item is created for every row (a single shared instance
        would be mutated after it has already been yielded), and the
        link is read from the row itself rather than from the whole row
        list, which always produced the first link on the page.
        Rows lacking a title, a cell text or a link are skipped.
        """
        for row in rows:
            title = row.xpath('.//a/text()').extract_first()
            published = row.xpath('.//td/text()').extract_first()
            href = row.xpath('.//a/@href').extract_first()
            if title is None or published is None or href is None:
                self.logger.debug('skipping incomplete news row on %s', response.url)
                continue

            item = PuluosiNewsItem()
            item['newstitle'] = title.strip()
            item['newtiems'] = published.strip()
            # urljoin turns a relative href into an absolute URL.
            item['newslink'] = response.urljoin(href)
            self.logger.debug(
                '%s | %s | %s',
                item['newstitle'], item['newtiems'], item['newslink'],
            )
            yield item

    def _follow_news_pages(self, response, callback):
        """Yield requests for the next page and for every year tab.

        Both kinds of link are resolved with ``response.urljoin`` and
        handled by *callback*.
        """
        next_href = response.xpath(
            '//div[@class="page"]/a[contains(text(),"下一页")]/@href'
        ).extract_first()
        if next_href:
            yield scrapy.Request(response.urljoin(next_href), callback=callback)
        else:
            self.logger.debug("当前页面没有下一页")

        for href in response.xpath('//ul[@class="timeList"]/li/a/@href').extract():
            if href:
                yield scrapy.Request(response.urljoin(href), callback=callback)

    def parse1(self, response):
        """Parse the press-releases listing and follow its pagination."""
        self.logger.info('此时启动的爬虫为:puluosi')
        # The first <tr> is dropped, as in the original pop(0);
        # presumably it is the table header.
        rows = response.xpath('//tbody/tr')[1:]
        for item in self._news_items(response, rows):
            yield item
        for request in self._follow_news_pages(response, self.parse1):
            yield request

    def parse2(self, response):
        """Parse the in-the-news listing and follow its pagination."""
        # The first <tr> is dropped, as in the original pop(0);
        # presumably it is the table header.
        rows = response.xpath('//tbody/tr')[1:]
        for item in self._news_items(response, rows):
            yield item
        for request in self._follow_news_pages(response, self.parse2):
            yield request

    def parse3(self, response):
        """Parse the proposed-privatization listing (single page)."""
        # NOTE(review): unlike parse1/parse2 this drops the LAST row
        # (the original used pop() without an index) -- confirm that
        # this page really ends with a non-data row.
        rows = response.xpath('//tbody/tr')[:-1]
        for item in self._news_items(response, rows):
            yield item

    @staticmethod
    def _extract_js_json(text, var_name):
        """Return the JSON value assigned to ``var <var_name>`` in *text*.

        Returns ``None`` when the assignment is not found. Only the
        first match is used.
        """
        match = re.search(r'var %s =(.*);' % re.escape(var_name), text)
        if match is None:
            return None
        return json.loads(match.group(1))

    # A previous, shadowed definition of parse4 (it followed city links
    # to a non-existent parse5 callback) was dead code and has been
    # removed; this is the definition Python actually used.
    def parse4(self, response):
        """Parse park records embedded in the network-detail page.

        The page carries two JavaScript variables: ``cities`` (a list of
        objects with ``id`` and ``name``) and ``parks`` (a two-level
        mapping whose leaves are park objects). One ``PuluosiItem`` is
        yielded per park.
        """
        cities = self._extract_js_json(response.text, 'cities')
        parks = self._extract_js_json(response.text, 'parks')
        if cities is None or parks is None:
            self.logger.error(
                'cities/parks data not found in %s', response.url
            )
            return

        # Map each city id to its display name.
        city_names = {city['id']: city['name'] for city in cities}

        for group in parks.values():
            for park in group.values():
                city_name = city_names.get(park['city_id'])
                if city_name is None:
                    # One unknown city id must not abort every
                    # remaining park.
                    self.logger.warning(
                        'unknown city_id %r for park %r',
                        park['city_id'], park.get('name'),
                    )
                    continue

                item = PuluosiItem()
                item['assetaddress'] = city_name
                item['assetstitle'] = park['name']
                # NOTE(review): the original chained two
                # .replace(' ', '') calls whose targets look identical
                # in this source (possibly a plain and a non-breaking
                # space). Dropping all Unicode whitespace covers both --
                # confirm that newlines/tabs should be removed as well.
                item['assetgaikuang'] = ''.join(park['detail_single'].split())
                # Strip HTML tags, then plain spaces, from the description.
                description = park['description'].strip()
                item['assetpeople'] = re.sub(r'<.*?>', '', description).replace(' ', '')
                item['asseturl'] = (
                    'https://www.glprop.com.cn/network-city-detail.html?city='
                    + item['assetaddress']
                )
                yield item
然后我顺便把页面的新闻信息也爬取了。
相关推荐
andrewwf 2020-11-11
Arvinzx 2020-10-28
CycloneKid 2020-10-27
paleyellow 2020-10-25
baifanwudi 2020-10-25
heyboz 2020-10-21
wumxiaozhu 2020-10-16
ZHANGRENXIANG00 2020-07-27
zhangll00 2020-07-05
javaraylu 2020-06-28
ZHANGRENXIANG00 2020-06-28
Catastrophe 2020-06-26
Catastrophe 2020-06-26
fangjack 2020-06-25
andrewwf 2020-06-16
qyf 2020-06-14
荒乱的没日没夜 2020-06-14