// A simple, easy-to-use web crawler (spider)

package spider;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A simple single-threaded web crawler.
 *
 * <p>Starting from {@code startUrl} it downloads pages over HTTP, follows the
 * links found on each page (breadth-first, up to {@code maxUrl} pages), honours
 * each host's robots.txt exclusion rules, and records every page URL whose
 * content contains all whitespace-separated terms of {@code searchString}.
 * Implements {@link Runnable} so a crawl can run on its own thread.
 */
public class SearchCrawler implements Runnable {

    /*
     * Caches, per host, the URL paths that robots are not allowed to visit.
     * The robots exclusion protocol puts a robots.txt file in a site's root
     * directory listing paths that crawlers must skip, for example:
     *
     *   # robots.txt for http://somehost.com/
     *   User-agent: *
     *   Disallow: /cgi-bin/
     *   Disallow: /registration   # Disallow robots on registration page
     *   Disallow: /login
     *
     * (FIX: the original comment contained a stray close-comment token that
     * broke compilation.)
     */
    private HashMap<String, ArrayList<String>> disallowListCache =
            new HashMap<String, ArrayList<String>>();

    /** Matches anchor tags, capturing the href value; compiled once and reused for every page. */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]", Pattern.CASE_INSENSITIVE);

    ArrayList<String> errorList = new ArrayList<String>(); // argument-validation error messages
    ArrayList<String> result = new ArrayList<String>();    // URLs whose pages matched the search string

    String startUrl;      // starting point of the crawl
    int maxUrl;           // maximum number of URLs to process (-1 = unlimited)
    String searchString;  // the string to search for
    boolean caseSensitive = false; // whether term matching is case sensitive
    boolean limitHost = false;     // whether to stay on the start URL's host

    /**
     * @param startUrl     the URL the crawl starts from (must be an http:// URL)
     * @param maxUrl       maximum number of pages to crawl
     * @param searchString whitespace-separated terms that must all appear on a page
     */
    public SearchCrawler(String startUrl, int maxUrl, String searchString) {
        this.startUrl = startUrl;
        this.maxUrl = maxUrl;
        this.searchString = searchString;
    }

    /** @return the URLs found so far whose pages matched the search string */
    public ArrayList<String> getResult() {
        return result;
    }

    /** Runs the crawl with the parameters given to the constructor. */
    public void run() {
        crawl(startUrl, maxUrl, searchString, limitHost, caseSensitive);
    }

    /**
     * Validates a URL string.
     *
     * @return the parsed {@link URL}, or {@code null} if the string is null,
     *         not an http:// URL, or malformed
     */
    private URL verifyUrl(String url) {
        // Only plain HTTP URLs are crawled.
        if (url == null || !url.toLowerCase().startsWith("http://")) {
            return null;
        }
        try {
            return new URL(url);
        } catch (Exception e) {
            return null;
        }
    }

    /**
     * Checks the host's robots.txt to decide whether crawlers may fetch the URL.
     * Results are cached per host; if robots.txt cannot be read, the host is
     * treated as fully allowed.
     */
    private boolean isRobotAllowed(URL urlToCheck) {
        String host = urlToCheck.getHost().toLowerCase();

        ArrayList<String> disallowList = disallowListCache.get(host);
        if (disallowList == null) {
            // Not cached yet: fetch and parse robots.txt for this host.
            disallowList = new ArrayList<String>();
            BufferedReader reader = null;
            try {
                URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
                reader = new BufferedReader(
                        new InputStreamReader(robotsFileUrl.openStream()));
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.indexOf("Disallow:") == 0) {
                        String disallowPath = line.substring("Disallow:".length());
                        // Strip a trailing "#..." comment, if any.
                        int commentIndex = disallowPath.indexOf("#");
                        if (commentIndex != -1) {
                            disallowPath = disallowPath.substring(0, commentIndex);
                        }
                        disallowPath = disallowPath.trim();
                        // FIX: an empty "Disallow:" means "allow everything"; adding ""
                        // would make file.startsWith("") block the whole host.
                        if (disallowPath.length() > 0) {
                            disallowList.add(disallowPath);
                        }
                    }
                }
            } catch (Exception e) {
                // No readable robots.txt: fall through with an empty list,
                // i.e. everything on this host is allowed.
            } finally {
                // FIX: the original leaked the reader.
                if (reader != null) {
                    try {
                        reader.close();
                    } catch (Exception ignored) {
                        // best effort
                    }
                }
            }
            // Cache the list (FIX: also on failure, so the file isn't re-fetched
            // for every URL on a host without robots.txt).
            disallowListCache.put(host, disallowList);
        }

        String file = urlToCheck.getFile();
        for (int i = 0; i < disallowList.size(); i++) {
            if (file.startsWith(disallowList.get(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Downloads a page's content.
     *
     * @return the page body with line terminators removed, or {@code null} on any error
     */
    private String downloadPage(URL pageUrl) {
        BufferedReader reader = null;
        try {
            // Open a connection to the URL for reading.
            reader = new BufferedReader(new InputStreamReader(pageUrl.openStream()));
            String line;
            StringBuilder pageBuffer = new StringBuilder();
            while ((line = reader.readLine()) != null) {
                pageBuffer.append(line);
            }
            return pageBuffer.toString();
        } catch (Exception e) {
            return null; // unreachable/unreadable page — callers treat null as "skip"
        } finally {
            // FIX: the original leaked the reader.
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception ignored) {
                    // best effort
                }
            }
        }
    }

    /** Removes "www." right after the scheme so the same site dedupes to one URL form. */
    private String removeWwwFromUrl(String url) {
        int index = url.indexOf("://www.");
        if (index != -1) {
            return url.substring(0, index + 3) + url.substring(index + 7);
        }
        return url;
    }

    /**
     * Parses a page and returns the absolute, verified, not-yet-crawled links it contains.
     *
     * @param pageUrl      the URL the page was fetched from (used to resolve relative links)
     * @param pageContents the page HTML
     * @param crawledList  URLs already processed; links in this set are skipped
     * @param limitHost    if true, drop links that leave {@code pageUrl}'s host
     */
    private ArrayList<String> retrieveLinks(URL pageUrl, String pageContents,
            HashSet<String> crawledList, boolean limitHost) {
        Matcher m = LINK_PATTERN.matcher(pageContents);

        // FIX: the original emitted "host:-1" for URLs without an explicit port.
        String portPart = (pageUrl.getPort() == -1) ? "" : ":" + pageUrl.getPort();

        ArrayList<String> linkList = new ArrayList<String>();
        while (m.find()) {
            String link = m.group(1).trim();
            if (link.length() < 1) {
                continue;
            }
            // Skip in-page anchor links.
            if (link.charAt(0) == '#') {
                continue;
            }
            // Skip mail links.
            if (link.indexOf("mailto:") != -1) {
                continue;
            }
            // Skip javascript: links.
            if (link.toLowerCase().indexOf("javascript") != -1) {
                continue;
            }
            if (link.indexOf("://") == -1) {
                if (link.charAt(0) == '/') {
                    // Site-absolute path.
                    link = "http://" + pageUrl.getHost() + portPart + link;
                } else {
                    String file = pageUrl.getFile();
                    if (file.indexOf('/') == -1) {
                        // Relative path from the site root.
                        link = "http://" + pageUrl.getHost() + portPart + "/" + link;
                    } else {
                        // Relative path from the current page's directory.
                        String path = file.substring(0, file.lastIndexOf('/') + 1);
                        link = "http://" + pageUrl.getHost() + portPart + path + link;
                    }
                }
            }
            // Drop any fragment.
            int index = link.indexOf('#');
            if (index != -1) {
                link = link.substring(0, index);
            }
            link = removeWwwFromUrl(link);
            URL verifiedLink = verifyUrl(link);
            if (verifiedLink == null) {
                continue;
            }
            // When limiting to one host, drop links that leave the page's host.
            if (limitHost && !pageUrl.getHost().toLowerCase()
                    .equals(verifiedLink.getHost().toLowerCase())) {
                continue;
            }
            // Skip links that have already been crawled.
            if (crawledList.contains(link)) {
                continue;
            }
            linkList.add(link);
        }
        return linkList;
    }

    /**
     * Returns true if the page contains every whitespace-separated term of
     * {@code searchString}, honouring {@code caseSensitive}.
     */
    private boolean searchStringMatches(String pageContents, String searchString,
            boolean caseSensitive) {
        String searchContents = pageContents;
        if (!caseSensitive) {
            searchContents = pageContents.toLowerCase();
        }
        String[] terms = searchString.split("[\\s]+");
        for (int i = 0; i < terms.length; i++) {
            String term = caseSensitive ? terms[i] : terms[i].toLowerCase();
            if (searchContents.indexOf(term) == -1) {
                return false;
            }
        }
        return true;
    }

    /**
     * Performs the actual crawl.
     *
     * @param startUrl      URL to start from
     * @param maxUrls       maximum number of pages to crawl (-1 = unlimited)
     * @param searchString  whitespace-separated terms that must all appear on a page
     * @param limithost     whether to stay on the start URL's host
     * @param caseSensitive whether term matching is case sensitive
     * @return the matching URLs, or the error list if the arguments are invalid
     */
    public ArrayList<String> crawl(String startUrl, int maxUrls, String searchString,
            boolean limithost, boolean caseSensitive) {
        HashSet<String> crawledList = new HashSet<String>();
        LinkedHashSet<String> toCrawlList = new LinkedHashSet<String>();

        // Validate arguments before touching the network.
        if (maxUrls < 1) {
            errorList.add("Invalid Max URLs value.");
            System.out.println("Invalid Max URLs value.");
        }
        if (searchString == null || searchString.length() < 1) {
            errorList.add("Missing Search String.");
            System.out.println("Missing Search String.");
        }
        if (errorList.size() > 0) {
            System.out.println("err!!!");
            return errorList;
        }

        // Normalize the start URL and seed the work queue.
        startUrl = removeWwwFromUrl(startUrl);
        toCrawlList.add(startUrl);

        while (toCrawlList.size() > 0) {
            if (maxUrls != -1 && crawledList.size() >= maxUrls) {
                break;
            }

            // Take the oldest URL off the queue (FIFO via LinkedHashSet order).
            String url = toCrawlList.iterator().next();
            toCrawlList.remove(url);

            URL verifiedUrl = verifyUrl(url);
            // FIX: the original passed a possibly-null URL into isRobotAllowed,
            // which dereferences it — NPE for any malformed/non-http link.
            if (verifiedUrl == null) {
                continue;
            }

            // Skip URLs that robots.txt forbids.
            if (!isRobotAllowed(verifiedUrl)) {
                continue;
            }

            // Mark the URL as processed.
            crawledList.add(url);

            String pageContents = downloadPage(verifiedUrl);
            if (pageContents != null && pageContents.length() > 0) {
                // Queue the valid links found on this page.
                // FIX: the original passed the limitHost FIELD here, silently
                // ignoring this method's limithost parameter.
                ArrayList<String> links =
                        retrieveLinks(verifiedUrl, pageContents, crawledList, limithost);
                toCrawlList.addAll(links);

                if (searchStringMatches(pageContents, searchString, caseSensitive)) {
                    result.add(url);
                    System.out.println(url);
                }
            }
        }
        return result;
    }

    /** Demo entry point: crawls a blog looking for the word "jack". */
    public static void main(String[] args) {
        SearchCrawler crawler = new SearchCrawler(
                "http://www.blogjava.net/Jack2007/", 20, "jack");
        Thread search = new Thread(crawler);
        System.out.println("Start searching...");
        System.out.println("result:");
        search.start();
        try {
            search.join();
        } catch (InterruptedException e) {
            // FIX: restore the interrupt flag instead of only printing the trace.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }
}

// (Related recommendations — web-page footer artifact from the original paste)