Java实现之网络爬虫
最近公司里事情不多,看到了 Apache Nutch 项目,想起前段时间做过网上数据的抓取:当时用的是别人给的代码,自己稍作改动后实现了功能。当初没有深究,所以现在重新研究了一下。
从网上看了很多例子,实现网络爬虫大概分三步:一是下载所要爬取的网站页面;二是抓取页面中的链接;三是抓取匹配的内容。以下是原始的实现方法,代码如下:
packagecom.shangkang.pzf.xywy;
importjava.io.File;
importjava.io.FileNotFoundException;
importjava.io.IOException;
importjava.io.InputStream;
importorg.apache.commons.io.FileUtils;
importorg.apache.http.HttpEntity;
importorg.apache.http.HttpResponse;
importorg.apache.http.client.ClientProtocolException;
importorg.apache.http.client.HttpClient;
importorg.apache.http.client.methods.HttpGet;
importorg.apache.http.impl.client.DefaultHttpClient;
importorg.apache.http.params.CoreConnectionPNames;
importorg.htmlparser.Node;
importorg.htmlparser.NodeFilter;
importorg.htmlparser.Parser;
importorg.htmlparser.tags.LinkTag;
importorg.htmlparser.util.NodeList;
importorg.htmlparser.util.ParserException;
importcom.shangkang.yjw.manager.LinkQueue;
importcom.shangkang.yjw.manager.Queue;
importcom.shangkang.yjw.util.Constant;
publicclassGetStartPoint{
publicstaticvoidmain(String[]args){
StringbaseUrl="http://club.xywy.com/";
newGetStartPoint().downloadFile(baseUrl,"xywy");
StringfilePath="d:/crawler-cust/xywy.html";
testParserHtml2NeedLink(filePath);
//加载所要爬虫的网站
publicvoiddownloadFile(Stringurl,StringfileName){
StringsaveFilePath="d:/crawler-cust/";
HttpClienthc=null;
try{
hc=newDefaultHttpClient();hc.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT,5000);
HttpGethttpGet=newHttpGet(url);
HttpResponseresponse=hc.execute(httpGet);
response.getParams();
HttpEntityentity=response.getEntity();
System.out.println(entity.getContentType());
if(entity!=null)
{
InputStreamis=entity.getContent();
FileUtils.copyInputStreamToFile(is,newFile(saveFilePath+fileName+".html"));
}
}catch(ClientProtocolExceptione){
e.printStackTrace();
}catch(IllegalStateExceptione){
e.printStackTrace();
}catch(IOExceptione){
e.printStackTrace();
}finally{
hc.getConnectionManager().shutdown();
}
}
//爬虫链接
publicstaticvoidtestParserHtml2NeedLink(StringfilePath)
{
try{
Parserparser=newParser(filePath);
NodeListnodeList=parser.extractAllNodesThatMatch(newNodeFilter(){
@Override
publicbooleanaccept(Nodenode){
if(node.getText().startsWith("dlclass=\"clearfix\""))
{System.out.println("node.getText()"+node.getText());//class="clearfix"<dlclass="clearfix">
returntrue;
}else
{
returnfalse;
}
}
});
NodeListnodeListA=newNodeList();
NodeListnodeListDd=newNodeList();
if(nodeList!=null)
{
intsize=nodeList.size();
for(inti=0;i<size;i++)
{
NodedlNode=nodeList.elementAt(i);
nodeListDd=dlNode.getChildren();
nodeListA.add(nodeListDd.extractAllNodesThatMatch(newNodeFilter(){
@Override
publicbooleanaccept(Nodenode){
if(node.getText().startsWith("atarget=\"_blank\"href="))
{System.out.println(node.getText());
returntrue;
}
returnfalse;
}
},true));
}
}
System.out.println("-------------------------------");
intsize=nodeListA.size();
for(inti=0;i<size;i++)
{
//nodeListA.
Nodenode=nodeListA.elementAt(i);
if(nodeinstanceofLinkTag)
{
Stringlink=((LinkTag)node).extractLink();
//System.out.println("link=="+link.replace("file://localhost",base_url_yp900));
link=link.replace("file://localhost","");
System.out.println(link);
link=Constant.BASE_URL_XYWY+link;LinkQueue.addUnvisitedUrl(link);LinkQueue.addUnvisitedUrlName(newString(node.toPlainTextString().getBytes("ISO-8859-1"),"GBK"));
}
//System.out.println(node);
}
Filefile=newFile(Constant.SAVE_FILE_DIR+"xywy_need_links.txt");
FilefileName=newFile(Constant.SAVE_FILE_DIR+"xywy_need_links_TypeName.txt");
//Queue<String>ulrNames=LinkQueue.getUnVisitedUrlQueue();
Queue<String>ulrs=LinkQueue.getUnVisitedUrlQueue();
while(!ulrs.isEmpty())
{
Stringurl=ulrs.deQueue();
//StringurlName=ulrNames.deQueue();
//FileUtils.writeStringToFile(fileName,urlName+"\r\n",true);
FileUtils.writeStringToFile(file,url+"\r\n",true);
}
}catch(ParserExceptione){
e.printStackTrace();
}catch(FileNotFoundExceptione){
e.printStackTrace();
}catch(IOExceptione){
e.printStackTrace();
}
}
}
//爬取二级链接
/**
*COPYRIGHT(C)2010LY.ALLRIGHTSRESERVED.
*
*Nopartofthispublicationmaybereproduced,storedinaretrievalsystem,
*ortransmitted,onanyformorbyanymeans,electronic,mechanical,photocopying,
*recording,orotherwise,withoutthepriorwrittenpermissionof3KW.
*
*CreatedBy:zzqiang
*CreatedOn:2013-6-18
*
*AmendmentHistory:
*
*AmendedByAmendedOnAmendmentDescription
*--------------------------------------------------------------------
*
**/
packagecom.shangkang.pzf.xywy;
importjava.io.BufferedReader;
importjava.io.File;
importjava.io.FileInputStream;
importjava.io.FileNotFoundException;
importjava.io.IOException;
importjava.io.InputStream;
importjava.io.InputStreamReader;
importjava.util.ArrayList;
importjava.util.List;
importorg.apache.commons.io.FileUtils;
importorg.apache.http.HttpEntity;
importorg.apache.http.HttpResponse;
importorg.apache.http.StatusLine;
importorg.apache.http.client.ClientProtocolException;
importorg.apache.http.client.HttpClient;
importorg.apache.http.client.methods.HttpGet;
importorg.apache.http.impl.client.DefaultHttpClient;
importorg.apache.http.params.CoreConnectionPNames;
importorg.htmlparser.Node;
importorg.htmlparser.NodeFilter;
importorg.htmlparser.Parser;
importorg.htmlparser.tags.LinkTag;
importorg.htmlparser.util.NodeList;
importcom.shangkang.yjw.manager.LinkQueue;
importcom.shangkang.yjw.util.Constant;
publicclassGetValuedLink{
publicstaticvoidmain(String[]args)throwsIOException
{
List<String>urls=newArrayList<String>();
//获取
urls=FileUtils.readLines(newFile(Constant.SAVE_FILE_DIR+"xywy_need_links.txt"));
for(Stringurl:urls)
{
StringstartPoint=url;
System.out.println(startPoint);
LinkQueue.addUnvisitedUrl(startPoint);
}
while(!LinkQueue.getUnVisitedUrlQueue().isEmpty())
{
Stringurl=LinkQueue.getUnVisitedUrlQueue().deQueue();
System.out.println("---------------------正在处理Url----------------==="+url);
if(!LinkQueue.getVisitedUrl().contains(url))
{
downloadFileAndParserLink(url);
LinkQueue.addVisitedUrl(url);
}
}
StringfilePath=Constant.SAVE_FILE_DIR+"valued_link_"+Constant.WWWXYWYCOM+".txt";
LinkQueue.flushContent2File(LinkQueue.getValuedUrls(),filePath);
}
publicstaticvoiddownloadFileAndParserLink(StringstartPoint)
{
StringaccessUrl=startPoint;
//http://www.yp900.com/ZY-HXXT/index_2.htm
//http://www.yp900.com/ZY-HXXT/
StringurlEnd=startPoint.substring(startPoint.lastIndexOf("/")+1);
intlastPoint=startPoint.lastIndexOf("/");
intlastLastPoint=startPoint.substring(0,lastPoint).lastIndexOf("/");
StringsonDir=startPoint.substring(lastLastPoint+1,lastPoint);
startPoint=startPoint.replace(urlEnd,"");
StringfileName=urlEnd.equals("")?sonDir:urlEnd.substring(0,urlEnd.lastIndexOf("."));
HttpClienthc=null;
StringfilePath=null;
try{
hc=newDefaultHttpClient();
hc.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT,8000);
HttpGethttpGet=newHttpGet(accessUrl);
HttpResponseresponse=hc.execute(httpGet);
response.getParams();
StatusLinestatusLine=response.getStatusLine();
if(statusLine.getStatusCode()==200)
{
HttpEntityentity=response.getEntity();
//System.out.println(entity.getContentType());
if(entity!=null)
{
InputStreamis=entity.getContent();
filePath=Constant.SAVE_FILE_DIR+Constant.WWWXYWYCOM+"/"+sonDir+"/"+fileName+".htm";
System.out.println("savefilePath="+filePath);
FileUtils.copyInputStreamToFile(is,newFile(filePath));
System.out.println("filedownloadsuccuss:sourceurl="+startPoint);
}
}elseif(statusLine.getStatusCode()==404)
{
System.err.println("http404:::"+startPoint);;
}
else
{
System.err.println("httpconnecterror");
}
if(null!=filePath)
{
parserValuedLinkAndNextLink(filePath,startPoint);
System.out.println("--删除下载的文件--"+filePath);
newFile(filePath).delete();
}
}catch(ClientProtocolExceptione){
e.printStackTrace();
}catch(IllegalStateExceptione){
e.printStackTrace();
}catch(IOExceptione){
e.printStackTrace();
}finally{
hc.getConnectionManager().shutdown();
}
}
publicstaticvoidparserValuedLinkAndNextLink(StringfilePath,StringstartPoint)
{
//divclass="r_btnf_r"
try
{
Parserparser=newParser(filePath);
NodeListnodeListDiv=parser.extractAllNodesThatMatch(newNodeFilter(){
@Override
publicbooleanaccept(Nodenode)
{
//System.out.println(node);
if(node.getText().startsWith(
"tdclass=\"pl20w340\""))
{
//class="clearfix"<dlclass="clearfix">
returntrue;
}else
{
returnfalse;
}
}
});
NodeListnodeListA=newNodeList();
NodeListnodeListDd=newNodeList();
if(nodeListDiv!=null)
{
intsize=nodeListDiv.size();
for(inti=0;i<size;i++)
{
NodedivNode=nodeListDiv.elementAt(i);
NodeListnodes=divNode.getChildren();
nodeListA.add(nodes.extractAllNodesThatMatch(newNodeFilter(){
@Override
publicbooleanaccept(Nodenode)
{
if(nodeinstanceofLinkTag)
{
returntrue;
}
else
{
returnfalse;
}
}
},true));
}
}
System.out.println("-------抽取有价值的连接---start----");
intsize=nodeListA.size();
for(inti=0;i<size;i++)
{
Nodenode=nodeListA.elementAt(i);
if(nodeinstanceofLinkTag)
{
Stringlink=((LinkTag)node).extractLink();
//link=link.replace("file://localhost","");
//System.out.println(link);
if(link.indexOf("static")!=-1)
{
//link=Constant.BASE_URL_XYWY+link;
//link=link.replace("file://localhost","");
System.out.println("valuedlink="+link);
LinkQueue.addValuedUrl(link,Constant.WWWXYWYCOM);
}
}
}
System.out.println("-------抽取有价值的连接---end---");
System.out.println("-------抽取Next下载的连接-start------");
NodeListnextNodeList=newNodeList();
parser=newParser(filePath);
NodeListpageNumNodeList=parser.extractAllNodesThatMatch(newNodeFilter(){
@Override
publicbooleanaccept(Nodenode)
{
if(node.getText().startsWith("divclass=\"clearfixpageStyletcmt20pb20f12pagelink\""))
{
returntrue;
}else
{
returnfalse;
}
}
});
intdivSize=pageNumNodeList.size();
StringnextLink=null;
for(inti=0;i<divSize;i++)
{
NodedivNode=pageNumNodeList.elementAt(i);
nextNodeList=divNode.getChildren().extractAllNodesThatMatch(newNodeFilter(){
@Override
publicbooleanaccept(Nodenode)
{
if(node.getText().startsWith("ahref=")&&nodeinstanceofLinkTag)
{
LinkTaglinkTag=(LinkTag)node;
Stringlink=linkTag.extractLink();
StringlinkText=linkTag.getLinkText();
//System.out.println("linkText="+linkText);
if(linkText.contains("下一页")&&link!=null&&!link.equals(""))
{
returntrue;
}
}
returnfalse;
}
},true);
}
if(null!=nextNodeList&&nextNodeList.size()>0)
{
Nodenode=nextNodeList.elementAt(0);
if(nodeinstanceofLinkTag)
{
LinkTaglinkTag=(LinkTag)node;
nextLink=linkTag.extractLink();
System.out.println("nextLink=="+nextLink);
nextLink=Constant.BASE_URL_XYWY+nextLink;
System.out.println("找到新的下载链接:"+nextLink);
StringfileName=nextLink.substring(nextLink.lastIndexOf("/"));
System.out.println("fileName===="+fileName);
LinkQueue.addUnvisitedUrl(nextLink);
}
}
System.out.println("-------抽取Next下载的连接---end----");
}catch(Exceptione)
{
e.printStackTrace();
}
}
}