A simple, easy-to-use web spider/crawler
package spider;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class SearchCrawler implements Runnable {

    /*
     * disallowListCache caches the URLs that robots are not allowed to crawl.
     * The robots exclusion protocol places a robots.txt file in the root
     * directory of a web site listing the paths that are off limits to
     * crawlers; a crawler should skip those areas. An example robots.txt:
     *
     *   # robots.txt for http://somehost.com/
     *   User-agent: *
     *   Disallow: /cgi-bin/
     *   Disallow: /registration  # Disallow robots on registration page
     *   Disallow: /login
     */
    private HashMap<String, ArrayList<String>> disallowListCache = new HashMap<String, ArrayList<String>>();
    ArrayList<String> errorList = new ArrayList<String>(); // error messages
    ArrayList<String> result = new ArrayList<String>();    // URLs that matched the search
    String startUrl;       // URL the crawl starts from
    int maxUrl;            // maximum number of URLs to process
    String searchString;   // the string to search for (English)
    boolean caseSensitive = false; // whether matching is case sensitive
    boolean limitHost = false;     // whether to stay within the start host
    public SearchCrawler(String startUrl, int maxUrl, String searchString) {
        this.startUrl = startUrl;
        this.maxUrl = maxUrl;
        this.searchString = searchString;
    }

    public ArrayList<String> getResult() {
        return result;
    }
    public void run() { // entry point of the search thread
        crawl(startUrl, maxUrl, searchString, limitHost, caseSensitive);
    }
    // Validate the format of a URL.
    private URL verifyUrl(String url) {
        // Only handle HTTP URLs.
        if (!url.toLowerCase().startsWith("http://"))
            return null;
        URL verifiedUrl = null;
        try {
            verifiedUrl = new URL(url);
        } catch (Exception e) {
            return null;
        }
        return verifiedUrl;
    }
    // Check whether robots are allowed to access the given URL.
    private boolean isRobotAllowed(URL urlToCheck) {
        String host = urlToCheck.getHost().toLowerCase(); // host of the given URL
        // System.out.println("host = " + host);
        // Look up the cached list of paths this host disallows.
        ArrayList<String> disallowList = disallowListCache.get(host);
        // If it is not cached yet, download and cache it.
        if (disallowList == null) {
            disallowList = new ArrayList<String>();
            try {
                URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(robotsFileUrl.openStream()));
                // Read the robots file and build the list of disallowed paths.
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.indexOf("Disallow:") == 0) { // line starts with "Disallow:"
                        String disallowPath = line.substring("Disallow:"
                                .length()); // the disallowed path
                        // Strip a trailing comment, if any.
                        int commentIndex = disallowPath.indexOf("#");
                        if (commentIndex != -1) {
                            disallowPath = disallowPath.substring(0,
                                    commentIndex);
                        }
                        disallowPath = disallowPath.trim();
                        disallowList.add(disallowPath);
                    }
                }
                // Cache the disallowed paths for this host.
                disallowListCache.put(host, disallowList);
            } catch (Exception e) {
                return true; // no robots.txt in the site root, so everything is allowed
            }
        }
        String file = urlToCheck.getFile();
        // System.out.println("file getFile() = " + file);
        for (int i = 0; i < disallowList.size(); i++) {
            String disallow = disallowList.get(i);
            if (file.startsWith(disallow)) {
                return false;
            }
        }
        return true;
    }
    private String downloadPage(URL pageUrl) {
        try {
            // Open a connection to the URL for reading.
            BufferedReader reader = new BufferedReader(new InputStreamReader(
                    pageUrl.openStream()));
            // Read the page into a buffer, keeping line breaks so that
            // words on adjacent lines do not run together.
            String line;
            StringBuffer pageBuffer = new StringBuffer();
            while ((line = reader.readLine()) != null) {
                pageBuffer.append(line).append('\n');
            }
            return pageBuffer.toString();
        } catch (Exception e) {
            // Treat any download failure as an empty page.
        }
        return null;
    }
    // Strip "www" from a URL so duplicates are recognized.
    private String removeWwwFromUrl(String url) {
        int index = url.indexOf("://www.");
        if (index != -1) {
            return url.substring(0, index + 3) + url.substring(index + 7);
        }
        return url;
    }
    // Parse the page and extract its links.
    private ArrayList<String> retrieveLinks(URL pageUrl, String pageContents,
            HashSet<String> crawledList, boolean limitHost) {
        // Compile the regular expression that matches anchor hrefs; the
        // character class ends the match at a quote or '>'.
        Pattern p = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\"'>]",
                Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(pageContents);
        ArrayList<String> linkList = new ArrayList<String>();
        while (m.find()) {
            String link = m.group(1).trim();
            if (link.length() < 1) {
                continue;
            }
            // Skip links that point back into the same page.
            if (link.charAt(0) == '#') {
                continue;
            }
            if (link.indexOf("mailto:") != -1) {
                continue;
            }
            if (link.toLowerCase().indexOf("javascript") != -1) {
                continue;
            }
            if (link.indexOf("://") == -1) {
                // getPort() returns -1 when the URL carries no explicit
                // port, so only append it when one is present.
                int port = pageUrl.getPort();
                String base = "http://" + pageUrl.getHost()
                        + (port == -1 ? "" : ":" + port);
                if (link.charAt(0) == '/') { // host-relative address
                    link = base + link;
                } else {
                    String file = pageUrl.getFile();
                    if (file.indexOf('/') == -1) { // relative address
                        link = base + "/" + link;
                    } else {
                        String path = file.substring(0,
                                file.lastIndexOf('/') + 1);
                        link = base + path + link;
                    }
                }
            }
            int index = link.indexOf('#');
            if (index != -1) {
                link = link.substring(0, index);
            }
            link = removeWwwFromUrl(link);
            URL verifiedLink = verifyUrl(link);
            if (verifiedLink == null) {
                continue;
            }
            /* If limited to one host, skip URLs on other hosts. */
            if (limitHost
                    && !pageUrl.getHost().toLowerCase().equals(
                            verifiedLink.getHost().toLowerCase())) {
                continue;
            }
            // Skip links that have already been crawled.
            if (crawledList.contains(link)) {
                continue;
            }
            linkList.add(link);
        }
        return linkList;
    }
    // Check whether the downloaded page contains the search string.
    private boolean searchStringMatches(String pageContents,
            String searchString, boolean caseSensitive) {
        String searchContents = pageContents;
        if (!caseSensitive) { // fold case when matching is case insensitive
            searchContents = pageContents.toLowerCase();
        }
        // Split the search string into individual terms; every term must appear.
        Pattern p = Pattern.compile("[\\s]+");
        String[] terms = p.split(searchString);
        for (int i = 0; i < terms.length; i++) {
            if (caseSensitive) {
                if (searchContents.indexOf(terms[i]) == -1) {
                    return false;
                }
            } else {
                if (searchContents.indexOf(terms[i].toLowerCase()) == -1) {
                    return false;
                }
            }
        }
        return true;
    }
    // Perform the actual crawl.
    public ArrayList<String> crawl(String startUrl, int maxUrls,
            String searchString, boolean limithost, boolean caseSensitive) {
        HashSet<String> crawledList = new HashSet<String>();
        LinkedHashSet<String> toCrawlList = new LinkedHashSet<String>();
        if (maxUrls < 1) {
            errorList.add("Invalid Max URLs value.");
            System.out.println("Invalid Max URLs value.");
        }
        if (searchString.length() < 1) {
            errorList.add("Missing Search String.");
            System.out.println("Missing Search String.");
        }
        if (errorList.size() > 0) {
            System.out.println("err!!!");
            return errorList;
        }
        // Remove "www" from the start URL.
        startUrl = removeWwwFromUrl(startUrl);
        toCrawlList.add(startUrl);
        while (toCrawlList.size() > 0) {
            if (maxUrls != -1) {
                if (crawledList.size() == maxUrls) {
                    break;
                }
            }
            // Get the URL at the head of the list.
            String url = toCrawlList.iterator().next();
            // Remove the URL from the to-crawl list.
            toCrawlList.remove(url);
            // Convert the string url to a URL object.
            URL verifiedUrl = verifyUrl(url);
            if (verifiedUrl == null) { // skip malformed URLs
                continue;
            }
            // Skip the URL if robots are not allowed to access it.
            if (!isRobotAllowed(verifiedUrl)) {
                continue;
            }
            // Add the processed URL to crawledList.
            crawledList.add(url);
            String pageContents = downloadPage(verifiedUrl);
            if (pageContents != null && pageContents.length() > 0) {
                // Extract the valid links from the page.
                ArrayList<String> links = retrieveLinks(verifiedUrl,
                        pageContents, crawledList, limithost);
                toCrawlList.addAll(links);
                if (searchStringMatches(pageContents, searchString,
                        caseSensitive)) {
                    result.add(url);
                    System.out.println(url);
                }
            }
        }
        return result;
    }
    // Entry point.
    public static void main(String[] args) {
        SearchCrawler crawler = new SearchCrawler(
                "http://www.blogjava.net/Jack2007/", 20, "jack");
        Thread search = new Thread(crawler);
        System.out.println("Start searching...");
        System.out.println("result:");
        search.start();
        try {
            search.join(); // wait for the crawl thread to finish
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}
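
For reference, here is a minimal sketch of driving the crawler from your own code and collecting the matched URLs through getResult() once the worker thread finishes. The class name CrawlerDemo, the start URL, the URL limit, and the search term below are placeholders of my own choosing, not values from the listing above:

public class CrawlerDemo {
    public static void main(String[] args) throws InterruptedException {
        // Hypothetical arguments: substitute your own start URL and term.
        SearchCrawler crawler = new SearchCrawler(
                "http://example.com/", 50, "crawler");
        Thread worker = new Thread(crawler);
        worker.start();
        worker.join(); // block until the crawl completes
        // getResult() returns the URLs whose pages contained every term.
        for (String url : crawler.getResult()) {
            System.out.println("match: " + url);
        }
    }
}

As a design note, the manual relative-link arithmetic in retrieveLinks could also be delegated to java.net.URL's two-argument constructor, new URL(pageUrl, link), which resolves a relative reference against a base URL and sidesteps the missing-port case entirely.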