nspider:Node 版的轻量级爬虫框架

项目简介
nspider 是 Node 版的轻量级爬虫框架。

特性:
- 在服务端进行 DOM 处理,默认使用 jQuery 语法
- 请求控制
- 优先请求队列
- 兼容 Node 4.x 或更高版本

快速开始

如何安装:
$ npm install nspider22

使用示例:
//简单示例
// Create a spider named 'baidu' from the nspider22 package.
var nspider = require('nspider22');
var nsp = new nspider({ name: 'baidu' });

// Callback registered for the 'a' selector: logs the href attribute
// read through the jQuery-style `$` wrapper on the argument it receives.
var printHref = function (element) {
    var href = element.$.attr('href');
    console.log(href);
};

nsp.onHtml('a', printHref);

// Start the crawl at the Baidu home page.
nsp.visit("http://www.baidu.com");
// Setting headers
var nspider = require('nspider22');
var nsp = new nspider({ name: 'zhihu' });

// Header map handed to setHeaders(); names and values are kept exactly as written.
// NOTE(review): several entries (e.g. "Set-Cookie", "Server", "Transfer-Encoding")
// are normally response headers -- confirm they are really meant to be sent by the crawler.
var zhihuHeaders = {
    "Cache-Control": "private,no-store,max-age=0,no-cache,must-revalidate,post-check=0,pre-check=0",
    "Connection": "keep-alive",
    "Content-Encoding": "gzip",
    "Content-Security-Policy": "default-src * blob:;img-src * data: blob:;frame-src 'self' *.zhihu.com *.zhihu.dev getpocket.com note.youdao.com safari-extension://com.evernote.safari.clipper-Q79WDW8YH9 weixin: zhihujs: v.qq.com v.youku.com www.bilibili.com *.vzuu.com;script-src 'self' *.zhihu.com *.google-analytics.com zhstatic.zhihu.com res.wx.qq.com 'unsafe-eval' unpkg.zhimg.com unicom.zhimg.com blob:;style-src 'self' *.zhihu.com *.zhihu.dev unicom.zhimg.com 'unsafe-inline';connect-src * wss:",
    "Content-Type": "text/html; charset=utf-8",
    "Date": "Thu, 19 Oct 2017 02:37:30 GMT",
    "Expires": "Fri, 02 Jan 2000 00:00:00 GMT",
    "Pragma": "no-cache",
    "Server": "ZWS",
    "Set-Cookie": "",
    "Transfer-Encoding": "chunked",
    "Vary": "Accept-Encoding",
    "X-Backend-Server": "heifetz.heifetz.fba20226---10.3.183.2:31036[10.3.183.2:31036]",
    "X-Frame-Options": "DENY",
    "X-Req-ID": "3570E3F59E80FE9",
    "X-Req-SSL": "proto=TLSv1.2,sni=,cipher=ECDHE-RSA-AES256-GCM-SHA384"
};
nsp.setHeaders(zhihuHeaders);

// Callback for question links: only acts when the argument's `tag` is 'zhihu'
// (presumably the second argument given to visit() -- verify against the library).
// Logs the href, then visits the question page with 'item' as the second argument.
var followQuestionLink = function (element) {
    if (element.tag != 'zhihu') {
        return;
    }
    var href = element.$.attr('href');
    console.log(href);
    nsp.visit('https://www.zhihu.com' + href, 'item');
};

// Callback for <body>: only acts when `tag` is 'item'; logs the text content.
var printItemText = function (element) {
    if (element.tag != 'item') {
        return;
    }
    console.log(element.$.text());
};

nsp.onHtml('.tab-panel a.question_link', followQuestionLink);
nsp.onHtml("body", printItemText);
nsp.visit("https://www.zhihu.com/explore","zhihu");

todolist:
- 处理复杂的 ajax 请求
- 重构代码
- 命令支持
- 中间件支持
// Create a spider named 'baidu' from the nspider22 package.
var nspider = require('nspider22');
var nsp = new nspider({ name: 'baidu' });

// Callback registered for the 'a' selector: logs the href attribute
// read through the jQuery-style `$` wrapper on the argument it receives.
var printHref = function (element) {
    var href = element.$.attr('href');
    console.log(href);
};

nsp.onHtml('a', printHref);

// Start the crawl at the Baidu home page.
nsp.visit("http://www.baidu.com");
// Setting headers
var nspider = require('nspider22');
var nsp = new nspider({ name: 'zhihu' });

// Header map handed to setHeaders(); names and values are kept exactly as written.
// NOTE(review): several entries (e.g. "Set-Cookie", "Server", "Transfer-Encoding")
// are normally response headers -- confirm they are really meant to be sent by the crawler.
var zhihuHeaders = {
    "Cache-Control": "private,no-store,max-age=0,no-cache,must-revalidate,post-check=0,pre-check=0",
    "Connection": "keep-alive",
    "Content-Encoding": "gzip",
    "Content-Security-Policy": "default-src * blob:;img-src * data: blob:;frame-src 'self' *.zhihu.com *.zhihu.dev getpocket.com note.youdao.com safari-extension://com.evernote.safari.clipper-Q79WDW8YH9 weixin: zhihujs: v.qq.com v.youku.com www.bilibili.com *.vzuu.com;script-src 'self' *.zhihu.com *.google-analytics.com zhstatic.zhihu.com res.wx.qq.com 'unsafe-eval' unpkg.zhimg.com unicom.zhimg.com blob:;style-src 'self' *.zhihu.com *.zhihu.dev unicom.zhimg.com 'unsafe-inline';connect-src * wss:",
    "Content-Type": "text/html; charset=utf-8",
    "Date": "Thu, 19 Oct 2017 02:37:30 GMT",
    "Expires": "Fri, 02 Jan 2000 00:00:00 GMT",
    "Pragma": "no-cache",
    "Server": "ZWS",
    "Set-Cookie": "",
    "Transfer-Encoding": "chunked",
    "Vary": "Accept-Encoding",
    "X-Backend-Server": "heifetz.heifetz.fba20226---10.3.183.2:31036[10.3.183.2:31036]",
    "X-Frame-Options": "DENY",
    "X-Req-ID": "3570E3F59E80FE9",
    "X-Req-SSL": "proto=TLSv1.2,sni=,cipher=ECDHE-RSA-AES256-GCM-SHA384"
};
nsp.setHeaders(zhihuHeaders);

// Callback for question links: only acts when the argument's `tag` is 'zhihu'
// (presumably the second argument given to visit() -- verify against the library).
// Logs the href, then visits the question page with 'item' as the second argument.
var followQuestionLink = function (element) {
    if (element.tag != 'zhihu') {
        return;
    }
    var href = element.$.attr('href');
    console.log(href);
    nsp.visit('https://www.zhihu.com' + href, 'item');
};

// Callback for <body>: only acts when `tag` is 'item'; logs the text content.
var printItemText = function (element) {
    if (element.tag != 'item') {
        return;
    }
    console.log(element.$.text());
};

nsp.onHtml('.tab-panel a.question_link', followQuestionLink);
nsp.onHtml("body", printItemText);
nsp.visit("https://www.zhihu.com/explore","zhihu");

todolist:
- 处理复杂的 ajax 请求
- 重构代码
- 命令支持
- 中间件支持