Java实现之网络爬虫

最近公司闲来无事,看到了 Apache Nutch 项目,想起前段时间做过网上数据的抓取:当时用的是别人给的代码,自己稍作改动就实现了功能,没有深究,所以现在重新研究了一下。

从网上看了很多例子,实现网络爬虫大概分三步走:一是下载所要爬取的网站页面;二是抓取页面中的链接;三是抓取与规则匹配的内容。以下是原始的实现方法,代码如下:

packagecom.shangkang.pzf.xywy;

importjava.io.File;

importjava.io.FileNotFoundException;

importjava.io.IOException;

importjava.io.InputStream;

importorg.apache.commons.io.FileUtils;

importorg.apache.http.HttpEntity;

importorg.apache.http.HttpResponse;

importorg.apache.http.client.ClientProtocolException;

importorg.apache.http.client.HttpClient;

importorg.apache.http.client.methods.HttpGet;

importorg.apache.http.impl.client.DefaultHttpClient;

importorg.apache.http.params.CoreConnectionPNames;

importorg.htmlparser.Node;

importorg.htmlparser.NodeFilter;

importorg.htmlparser.Parser;

importorg.htmlparser.tags.LinkTag;

importorg.htmlparser.util.NodeList;

importorg.htmlparser.util.ParserException;

importcom.shangkang.yjw.manager.LinkQueue;

importcom.shangkang.yjw.manager.Queue;

importcom.shangkang.yjw.util.Constant;

publicclassGetStartPoint{

publicstaticvoidmain(String[]args){

StringbaseUrl="http://club.xywy.com/";

newGetStartPoint().downloadFile(baseUrl,"xywy");

StringfilePath="d:/crawler-cust/xywy.html";

testParserHtml2NeedLink(filePath);

//加载所要爬虫的网站

publicvoiddownloadFile(Stringurl,StringfileName){

StringsaveFilePath="d:/crawler-cust/";

HttpClienthc=null;

try{

hc=newDefaultHttpClient();hc.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT,5000);

HttpGethttpGet=newHttpGet(url);

HttpResponseresponse=hc.execute(httpGet);

response.getParams();

HttpEntityentity=response.getEntity();

System.out.println(entity.getContentType());

if(entity!=null)

{

InputStreamis=entity.getContent();

FileUtils.copyInputStreamToFile(is,newFile(saveFilePath+fileName+".html"));

}

}catch(ClientProtocolExceptione){

e.printStackTrace();

}catch(IllegalStateExceptione){

e.printStackTrace();

}catch(IOExceptione){

e.printStackTrace();

}finally{

hc.getConnectionManager().shutdown();

}

}

//爬虫链接

publicstaticvoidtestParserHtml2NeedLink(StringfilePath)

{

try{

Parserparser=newParser(filePath);

NodeListnodeList=parser.extractAllNodesThatMatch(newNodeFilter(){

@Override

publicbooleanaccept(Nodenode){

if(node.getText().startsWith("dlclass=\"clearfix\""))

{System.out.println("node.getText()"+node.getText());//class="clearfix"<dlclass="clearfix">

returntrue;

}else

{

returnfalse;

}

}

});

NodeListnodeListA=newNodeList();

NodeListnodeListDd=newNodeList();

if(nodeList!=null)

{

intsize=nodeList.size();

for(inti=0;i<size;i++)

{

NodedlNode=nodeList.elementAt(i);

nodeListDd=dlNode.getChildren();

nodeListA.add(nodeListDd.extractAllNodesThatMatch(newNodeFilter(){

@Override

publicbooleanaccept(Nodenode){

if(node.getText().startsWith("atarget=\"_blank\"href="))

{System.out.println(node.getText());

returntrue;

}

returnfalse;

}

},true));

}

}

System.out.println("-------------------------------");

intsize=nodeListA.size();

for(inti=0;i<size;i++)

{

//nodeListA.

Nodenode=nodeListA.elementAt(i);

if(nodeinstanceofLinkTag)

{

Stringlink=((LinkTag)node).extractLink();

//System.out.println("link=="+link.replace("file://localhost",base_url_yp900));

link=link.replace("file://localhost","");

System.out.println(link);

link=Constant.BASE_URL_XYWY+link;LinkQueue.addUnvisitedUrl(link);LinkQueue.addUnvisitedUrlName(newString(node.toPlainTextString().getBytes("ISO-8859-1"),"GBK"));

}

//System.out.println(node);

}

Filefile=newFile(Constant.SAVE_FILE_DIR+"xywy_need_links.txt");

FilefileName=newFile(Constant.SAVE_FILE_DIR+"xywy_need_links_TypeName.txt");

//Queue<String>ulrNames=LinkQueue.getUnVisitedUrlQueue();

Queue<String>ulrs=LinkQueue.getUnVisitedUrlQueue();

while(!ulrs.isEmpty())

{

Stringurl=ulrs.deQueue();

//StringurlName=ulrNames.deQueue();

//FileUtils.writeStringToFile(fileName,urlName+"\r\n",true);

FileUtils.writeStringToFile(file,url+"\r\n",true);

}

}catch(ParserExceptione){

e.printStackTrace();

}catch(FileNotFoundExceptione){

e.printStackTrace();

}catch(IOExceptione){

e.printStackTrace();

}

}

}

//爬虫二级连接

/**

*COPYRIGHT(C)2010LY.ALLRIGHTSRESERVED.

*

*Nopartofthispublicationmaybereproduced,storedinaretrievalsystem,

*ortransmitted,onanyformorbyanymeans,electronic,mechanical,photocopying,

*recording,orotherwise,withoutthepriorwrittenpermissionof3KW.

*

*CreatedBy:zzqiang

*CreatedOn:2013-6-18

*

*AmendmentHistory:

*

*AmendedByAmendedOnAmendmentDescription

*--------------------------------------------------------------------

*

**/

packagecom.shangkang.pzf.xywy;

importjava.io.BufferedReader;

importjava.io.File;

importjava.io.FileInputStream;

importjava.io.FileNotFoundException;

importjava.io.IOException;

importjava.io.InputStream;

importjava.io.InputStreamReader;

importjava.util.ArrayList;

importjava.util.List;

importorg.apache.commons.io.FileUtils;

importorg.apache.http.HttpEntity;

importorg.apache.http.HttpResponse;

importorg.apache.http.StatusLine;

importorg.apache.http.client.ClientProtocolException;

importorg.apache.http.client.HttpClient;

importorg.apache.http.client.methods.HttpGet;

importorg.apache.http.impl.client.DefaultHttpClient;

importorg.apache.http.params.CoreConnectionPNames;

importorg.htmlparser.Node;

importorg.htmlparser.NodeFilter;

importorg.htmlparser.Parser;

importorg.htmlparser.tags.LinkTag;

importorg.htmlparser.util.NodeList;

importcom.shangkang.yjw.manager.LinkQueue;

importcom.shangkang.yjw.util.Constant;

publicclassGetValuedLink{

publicstaticvoidmain(String[]args)throwsIOException

{

List<String>urls=newArrayList<String>();

//获取

urls=FileUtils.readLines(newFile(Constant.SAVE_FILE_DIR+"xywy_need_links.txt"));

for(Stringurl:urls)

{

StringstartPoint=url;

System.out.println(startPoint);

LinkQueue.addUnvisitedUrl(startPoint);

}

while(!LinkQueue.getUnVisitedUrlQueue().isEmpty())

{

Stringurl=LinkQueue.getUnVisitedUrlQueue().deQueue();

System.out.println("---------------------正在处理Url----------------==="+url);

if(!LinkQueue.getVisitedUrl().contains(url))

{

downloadFileAndParserLink(url);

LinkQueue.addVisitedUrl(url);

}

}

StringfilePath=Constant.SAVE_FILE_DIR+"valued_link_"+Constant.WWWXYWYCOM+".txt";

LinkQueue.flushContent2File(LinkQueue.getValuedUrls(),filePath);

}

publicstaticvoiddownloadFileAndParserLink(StringstartPoint)

{

StringaccessUrl=startPoint;

//http://www.yp900.com/ZY-HXXT/index_2.htm

//http://www.yp900.com/ZY-HXXT/

StringurlEnd=startPoint.substring(startPoint.lastIndexOf("/")+1);

intlastPoint=startPoint.lastIndexOf("/");

intlastLastPoint=startPoint.substring(0,lastPoint).lastIndexOf("/");

StringsonDir=startPoint.substring(lastLastPoint+1,lastPoint);

startPoint=startPoint.replace(urlEnd,"");

StringfileName=urlEnd.equals("")?sonDir:urlEnd.substring(0,urlEnd.lastIndexOf("."));

HttpClienthc=null;

StringfilePath=null;

try{

hc=newDefaultHttpClient();

hc.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT,8000);

HttpGethttpGet=newHttpGet(accessUrl);

HttpResponseresponse=hc.execute(httpGet);

response.getParams();

StatusLinestatusLine=response.getStatusLine();

if(statusLine.getStatusCode()==200)

{

HttpEntityentity=response.getEntity();

//System.out.println(entity.getContentType());

if(entity!=null)

{

InputStreamis=entity.getContent();

filePath=Constant.SAVE_FILE_DIR+Constant.WWWXYWYCOM+"/"+sonDir+"/"+fileName+".htm";

System.out.println("savefilePath="+filePath);

FileUtils.copyInputStreamToFile(is,newFile(filePath));

System.out.println("filedownloadsuccuss:sourceurl="+startPoint);

}

}elseif(statusLine.getStatusCode()==404)

{

System.err.println("http404:::"+startPoint);;

}

else

{

System.err.println("httpconnecterror");

}

if(null!=filePath)

{

parserValuedLinkAndNextLink(filePath,startPoint);

System.out.println("--删除下载的文件--"+filePath);

newFile(filePath).delete();

}

}catch(ClientProtocolExceptione){

e.printStackTrace();

}catch(IllegalStateExceptione){

e.printStackTrace();

}catch(IOExceptione){

e.printStackTrace();

}finally{

hc.getConnectionManager().shutdown();

}

}

publicstaticvoidparserValuedLinkAndNextLink(StringfilePath,StringstartPoint)

{

//divclass="r_btnf_r"

try

{

Parserparser=newParser(filePath);

NodeListnodeListDiv=parser.extractAllNodesThatMatch(newNodeFilter(){

@Override

publicbooleanaccept(Nodenode)

{

//System.out.println(node);

if(node.getText().startsWith(

"tdclass=\"pl20w340\""))

{

//class="clearfix"<dlclass="clearfix">

returntrue;

}else

{

returnfalse;

}

}

});

NodeListnodeListA=newNodeList();

NodeListnodeListDd=newNodeList();

if(nodeListDiv!=null)

{

intsize=nodeListDiv.size();

for(inti=0;i<size;i++)

{

NodedivNode=nodeListDiv.elementAt(i);

NodeListnodes=divNode.getChildren();

nodeListA.add(nodes.extractAllNodesThatMatch(newNodeFilter(){

@Override

publicbooleanaccept(Nodenode)

{

if(nodeinstanceofLinkTag)

{

returntrue;

}

else

{

returnfalse;

}

}

},true));

}

}

System.out.println("-------抽取有价值的连接---start----");

intsize=nodeListA.size();

for(inti=0;i<size;i++)

{

Nodenode=nodeListA.elementAt(i);

if(nodeinstanceofLinkTag)

{

Stringlink=((LinkTag)node).extractLink();

//link=link.replace("file://localhost","");

//System.out.println(link);

if(link.indexOf("static")!=-1)

{

//link=Constant.BASE_URL_XYWY+link;

//link=link.replace("file://localhost","");

System.out.println("valuedlink="+link);

LinkQueue.addValuedUrl(link,Constant.WWWXYWYCOM);

}

}

}

System.out.println("-------抽取有价值的连接---end---");

System.out.println("-------抽取Next下载的连接-start------");

NodeListnextNodeList=newNodeList();

parser=newParser(filePath);

NodeListpageNumNodeList=parser.extractAllNodesThatMatch(newNodeFilter(){

@Override

publicbooleanaccept(Nodenode)

{

if(node.getText().startsWith("divclass=\"clearfixpageStyletcmt20pb20f12pagelink\""))

{

returntrue;

}else

{

returnfalse;

}

}

});

intdivSize=pageNumNodeList.size();

StringnextLink=null;

for(inti=0;i<divSize;i++)

{

NodedivNode=pageNumNodeList.elementAt(i);

nextNodeList=divNode.getChildren().extractAllNodesThatMatch(newNodeFilter(){

@Override

publicbooleanaccept(Nodenode)

{

if(node.getText().startsWith("ahref=")&&nodeinstanceofLinkTag)

{

LinkTaglinkTag=(LinkTag)node;

Stringlink=linkTag.extractLink();

StringlinkText=linkTag.getLinkText();

//System.out.println("linkText="+linkText);

if(linkText.contains("下一页")&&link!=null&&!link.equals(""))

{

returntrue;

}

}

returnfalse;

}

},true);

}

if(null!=nextNodeList&&nextNodeList.size()>0)

{

Nodenode=nextNodeList.elementAt(0);

if(nodeinstanceofLinkTag)

{

LinkTaglinkTag=(LinkTag)node;

nextLink=linkTag.extractLink();

System.out.println("nextLink=="+nextLink);

nextLink=Constant.BASE_URL_XYWY+nextLink;

System.out.println("找到新的下载链接:"+nextLink);

StringfileName=nextLink.substring(nextLink.lastIndexOf("/"));

System.out.println("fileName===="+fileName);

LinkQueue.addUnvisitedUrl(nextLink);

}

}

System.out.println("-------抽取Next下载的连接---end----");

}catch(Exceptione)

{

e.printStackTrace();

}

}

}

相关推荐