spider简单的爬虫程序
spider简单的爬虫程序
1、基础准备
htmlparser
首页:http://sourceforge.net/projects/htmlparser/
下载:http://sourceforge.net/project/showfiles.php?group_id=24399
文件:htmlparser1_6_20060610.zip
<dependency>
<groupId>org.htmlparser</groupId>
<artifactId>htmlparser</artifactId>
<version>1.6</version>
</dependency>
cpdetector
首页:http://cpdetector.sourceforge.net/
下载:http://sourceforge.net/project/showfiles.php?group_id=114421
文件:cpdetector_eclipse_project_1.0.7.zip
<dependency>
<groupId>cpdetector</groupId>
<artifactId>cpdetector</artifactId>
<version>1.0.5</version>
</dependency>
spindle
首页:http://www.bitmechanic.com/projects/spindle/(但是已经无法访问)
2、修改spindle代码得到的spider
目前只是简单地将URL打印出来,解析得到的内容等尚未做进一步处理
解析HTML的基类HtmlParserUtil.java
packagecom.sillycat.api.commons.utils.html;
importjava.io.BufferedReader;
importjava.io.FileNotFoundException;
importjava.io.IOException;
importjava.io.InputStream;
importjava.io.InputStreamReader;
importjava.io.UnsupportedEncodingException;
importjava.net.MalformedURLException;
importjava.net.SocketException;
importjava.net.SocketTimeoutException;
importjava.net.URL;
importjava.net.UnknownHostException;
importjava.nio.charset.Charset;
importorg.htmlparser.Parser;
importorg.htmlparser.util.NodeList;
importorg.htmlparser.util.ParserException;
importorg.htmlparser.visitors.HtmlPage;
importcpdetector.io.ASCIIDetector;
importcpdetector.io.CodepageDetectorProxy;
importcpdetector.io.JChardetFacade;
importcpdetector.io.ParsingDetector;
importcpdetector.io.UnicodeDetector;
publicclassHtmlParserUtil{
/*StringBuffer的缓冲区大小*/
publicstaticintTRANSFER_SIZE=4096;
/*当前平台的行分隔符*/
publicstaticStringlineSep=System.getProperty("line.separator");
/*自动探测页面编码,避免中文乱码的出现*/
publicstaticStringautoDetectCharset(URLurl){
CodepageDetectorProxydetector=CodepageDetectorProxy.getInstance();
/**
*ParsingDetector可用于检查HTML、XML等文件或字符流的编码构造方法中的参数用于指示是否显示探测过程的详细信息
*为false则不显示
*/
detector.add(newParsingDetector(false));
detector.add(JChardetFacade.getInstance());
detector.add(ASCIIDetector.getInstance());
detector.add(UnicodeDetector.getInstance());
Charsetcharset=null;
try{
charset=detector.detectCodepage(url);
}catch(MalformedURLExceptionmue){
mue.printStackTrace();
}catch(IOExceptionie){
ie.printStackTrace();
}
if(charset==null)
charset=Charset.defaultCharset();
returncharset.name();
}
/*按照指定编码解析标准的html页面,为建立索引做准备*/
publicstaticString[]parseHtml(Stringurl,Stringcharset){
Stringresult[]=null;
Stringcontent=null;
try{
URLsource=newURL(url);
InputStreamin=source.openStream();
BufferedReaderreader=newBufferedReader(newInputStreamReader(
in,charset));
Stringline=newString();
StringBuffertemp=newStringBuffer(TRANSFER_SIZE);
while((line=reader.readLine())!=null){
temp.append(line);
temp.append(lineSep);
}
reader.close();
in.close();
content=temp.toString();
}catch(UnsupportedEncodingExceptionuee){
uee.printStackTrace();
}catch(MalformedURLExceptionmue){
System.err.println("InvalidURL:"+url);
}catch(UnknownHostExceptionuhe){
System.err.println("UnknowHost:"+url);
}catch(SocketExceptionse){
System.err.println("SocketError:"+se.getMessage()+""+url);
}catch(SocketTimeoutExceptionste){
System.err.println("SocketConnectionTimeOut:"+url);
}catch(FileNotFoundExceptionfnfe){
System.err.println("brokenlink"
+((FileNotFoundException)fnfe.getCause()).getMessage()
+"ignored");
}catch(IOExceptionie){
ie.printStackTrace();
}
if(content!=null){
ParsermyParser=Parser.createParser(content,charset);
HtmlPagevisitor=newHtmlPage(myParser);
try{
myParser.visitAllNodesWith(visitor);
Stringbody=null;
Stringtitle="Untitled";
if(visitor.getBody()!=null){
NodeListnodelist=visitor.getBody();
body=nodelist.asString().trim();
}
if(visitor.getTitle()!=null){
title=visitor.getTitle();
}
result=newString[]{body,title};
}catch(ParserExceptionpe){
pe.printStackTrace();
}
}
returnresult;
}
}
多线程爬虫类HtmlCaptureRunner.java
packagecom.sillycat.api.thread.runner;
importjava.io.FileNotFoundException;
importjava.io.IOException;
importjava.net.HttpURLConnection;
importjava.net.MalformedURLException;
importjava.net.SocketException;
importjava.net.SocketTimeoutException;
importjava.net.URL;
importjava.net.UnknownHostException;
importjava.util.ArrayList;
importjava.util.HashSet;
importorg.apache.commons.logging.Log;
importorg.apache.commons.logging.LogFactory;
importorg.htmlparser.Parser;
importorg.htmlparser.PrototypicalNodeFactory;
importorg.htmlparser.filters.AndFilter;
importorg.htmlparser.filters.HasAttributeFilter;
importorg.htmlparser.filters.NodeClassFilter;
importorg.htmlparser.tags.BaseHrefTag;
importorg.htmlparser.tags.FrameTag;
importorg.htmlparser.tags.LinkTag;
importorg.htmlparser.tags.MetaTag;
importorg.htmlparser.util.EncodingChangeException;
importorg.htmlparser.util.NodeIterator;
importorg.htmlparser.util.NodeList;
importorg.htmlparser.util.ParserException;
importcom.sillycat.api.commons.utils.StringUtil;
importcom.sillycat.api.commons.utils.html.HtmlParserUtil;
publicclassHtmlCaptureRunnerimplementsRunnable{
publicLoglogger=LogFactory.getLog(getClass());
/*基准(初始)URL*/
protectedStringbaseURL=null;
privateStringcontentPath=null;
/**
*待解析的URL地址集合,所有新检测到的链接均存放于此;解析时按照先入先出(First-InFirst-Out)法则线性取出
*/
protectedArrayListURLs=newArrayList();
/*已存储的URL地址集合,避免链接的重复抓取*/
protectedHashSetindexedURLs=newHashSet();
protectedParserparser=newParser();;
/*程序运行线程数,默认2个线程*/
protectedintthreads=2;
/*解析页面时的字符编码*/
protectedStringcharset;
/*基准端口*/
protectedintbasePort;
/*基准主机*/
protectedStringbaseHost;
/*是否存储,默认true*/
protectedbooleanjustDatabase=true;
/*检测索引中是否存在当前URL信息,避免重复抓取*/
protectedbooleanisRepeatedCheck=false;
publicHtmlCaptureRunner(){
PrototypicalNodeFactoryfactory=newPrototypicalNodeFactory();
factory.registerTag(newLocalLinkTag());
factory.registerTag(newLocalFrameTag());
factory.registerTag(newLocalBaseHrefTag());
parser.setNodeFactory(factory);
}
publicvoidcapture(){
URLs.clear();
URLs.add(getBaseURL());
intresponseCode=0;
StringcontentType="";
try{
HttpURLConnectionuc=(HttpURLConnection)newURL(baseURL)
.openConnection();
responseCode=uc.getResponseCode();
contentType=uc.getContentType();
}catch(MalformedURLExceptionmue){
logger.error("InvalidURL:"+getBaseURL());
}catch(UnknownHostExceptionuhe){
logger.error("UnknowHost:"+getBaseURL());
}catch(SocketExceptionse){
logger.error("SocketError:"+se.getMessage()+""
+getBaseURL());
}catch(IOExceptionie){
logger.error("IOException:"+ie);
}
if(responseCode==HttpURLConnection.HTTP_OK
&&contentType.startsWith("text/html")){
try{
charset=HtmlParserUtil.autoDetectCharset(newURL(baseURL));
basePort=newURL(baseURL).getPort();
baseHost=newURL(baseURL).getHost();
if(charset.equals("windows-1252"))
charset="GBK";
longstart=System.currentTimeMillis();
ArrayListthreadList=newArrayList();
for(inti=0;i<threads;i++){
Threadt=newThread(this,"SpiderThread#"+(i+1));
t.start();
threadList.add(t);
}
while(threadList.size()>0){
Threadchild=(Thread)threadList.remove(0);
try{
child.join();
}catch(InterruptedExceptionie){
logger.error("InterruptedException:"+ie);
}
}
//for(inti=0;i<threads;i++){
//threadPool.getThreadPoolExcutor().execute(new
//Thread(this,"SpiderThread#"+(i+1)));
//}
longelapsed=System.currentTimeMillis()-start;
logger.info("Finishedin"+(elapsed/1000)+"seconds");
logger.info("TheCountoftheLinksCapturedis"
+indexedURLs.size());
}catch(MalformedURLExceptione){
e.printStackTrace();
}
}
}
publicvoidrun(){
Stringurl;
while((url=dequeueURL())!=null){
if(justDatabase){
process(url);
}
}
threads--;
}
/**
*处理单独的URL地址,解析页面并加入到lucene索引中;通过自动探测页面编码保证抓取工作的顺利执行
*/
protectedvoidprocess(Stringurl){
Stringresult[];
Stringcontent=null;
Stringtitle=null;
result=HtmlParserUtil.parseHtml(url,charset);
content=result[0];
title=result[1];
if(content!=null&&content.trim().length()>0){
//content
System.out.println(url);
//title
//DateTools.timeToString(System.currentTimeMillis()
}
}
/*从URL队列mPages里取出单个的URL*/
publicsynchronizedStringdequeueURL(){
while(true)
if(URLs.size()>0){
Stringurl=(String)URLs.remove(0);
indexedURLs.add(url);
if(isToBeCaptured(url)){
NodeListlist;
try{
intbookmark=URLs.size();
/*获取页面所有节点*/
parser.setURL(url);
try{
list=newNodeList();
for(NodeIteratore=parser.elements();e
.hasMoreNodes();)
list.add(e.nextNode());
}catch(EncodingChangeExceptionece){
/*解码出错的异常处理*/
parser.reset();
list=newNodeList();
for(NodeIteratore=parser.elements();e
.hasMoreNodes();)
list.add(e.nextNode());
}
/**
*依据http://www.robotstxt.org/wc/meta-user.html处理
*Robots<META>tag
*/
NodeListrobots=list
.extractAllNodesThatMatch(
newAndFilter(newNodeClassFilter(
MetaTag.class),
newHasAttributeFilter("name",
"robots")),true);
if(0!=robots.size()){
MetaTagrobot=(MetaTag)robots.elementAt(0);
Stringcontent=robot.getAttribute("content")
.toLowerCase();
if((-1!=content.indexOf("none"))
||(-1!=content.indexOf("nofollow")))
for(inti=bookmark;i<URLs.size();i++)
URLs.remove(i);
}
}catch(ParserExceptionpe){
logger.error("ParserException:"+pe);
}
returnurl;
}
}else{
threads--;
if(threads>0){
try{
wait();
threads++;
}catch(InterruptedExceptionie){
logger.error("InterruptedException:"+ie);
}
}else{
notifyAll();
returnnull;
}
}
}
privatebooleanisHTML(Stringurl){
if(!url.endsWith(".html")){
returnfalse;
}
if(StringUtil.isNotBlank(contentPath)){
if(!url.startsWith(baseURL+"/"+contentPath)){
returnfalse;
}
}
returntrue;
}
/**
*判断提取到的链接是否符合解析条件;标准为Port及Host与基准URL相同且类型为text/html或text/plain
*/
publicbooleanisToBeCaptured(Stringurl){
booleanflag=false;
HttpURLConnectionuc=null;
intresponseCode=0;
StringcontentType="";
Stringhost="";
intport=0;
try{
URLsource=newURL(url);
Stringprotocol=source.getProtocol();
if(protocol!=null&&protocol.equals("http")){
host=source.getHost();
port=source.getPort();
uc=(HttpURLConnection)source.openConnection();
uc.setConnectTimeout(8000);
responseCode=uc.getResponseCode();
contentType=uc.getContentType();
}
}catch(MalformedURLExceptionmue){
logger.error("InvalidURL:"+url);
}catch(UnknownHostExceptionuhe){
logger.error("UnknowHost:"+url);
}catch(SocketExceptionse){
logger.error("SocketError:"+se.getMessage()+""+url);
}catch(SocketTimeoutExceptionste){
logger.error("SocketConnectionTimeOut:"+url);
}catch(FileNotFoundExceptionfnfe){
logger.error("brokenlink"+url+"ignored");
}catch(IOExceptionie){
logger.error("IOException:"+ie);
}
if(port==basePort
&&responseCode==HttpURLConnection.HTTP_OK
&&host.equals(baseHost)
&&(contentType.startsWith("text/html")||contentType
.startsWith("text/plain")))
flag=true;
returnflag;
}
classLocalLinkTagextendsLinkTag{
publicvoiddoSemanticAction(){
Stringlink=getLink();
if(link.endsWith("/"))
link=link.substring(0,link.length()-1);
intpos=link.indexOf("#");
if(pos!=-1)
link=link.substring(0,pos);
/*将链接加入到处理队列中*/
if(!(indexedURLs.contains(link)||URLs.contains(link))){
if(isHTML(link)){
URLs.add(link);
}
}
setLink(link);
}
}
/**
*FrametagthatrewritestheSRCURLs.TheSRCURLsaremappedtolocal
*targetsiftheymatchthesource.
*/
classLocalFrameTagextendsFrameTag{
publicvoiddoSemanticAction(){
Stringlink=getFrameLocation();
if(link.endsWith("/"))
link=link.substring(0,link.length()-1);
intpos=link.indexOf("#");
if(pos!=-1)
link=link.substring(0,pos);
/*将链接加入到处理队列中*/
if(!(indexedURLs.contains(link)||URLs.contains(link))){
if(isHTML(link)){
URLs.add(link);
}
}
setFrameLocation(link);
}
}
/**
*Basetagthatdoesn'tshow.ThetoHtml()methodisoverriddentoreturn
*anemptystring,effectivelyshuttingoffthebasereference.
*/
classLocalBaseHrefTagextendsBaseHrefTag{
publicStringtoHtml(){
return("");
}
}
publicStringgetBaseURL(){
returnbaseURL;
}
publicvoidsetBaseURL(StringbaseURL){
this.baseURL=baseURL;
}
publicintgetThreads(){
returnthreads;
}
publicvoidsetThreads(intthreads){
this.threads=threads;
}
publicStringgetCharset(){
returncharset;
}
publicvoidsetCharset(Stringcharset){
this.charset=charset;
}
publicintgetBasePort(){
returnbasePort;
}
publicvoidsetBasePort(intbasePort){
this.basePort=basePort;
}
publicStringgetBaseHost(){
returnbaseHost;
}
publicvoidsetBaseHost(StringbaseHost){
this.baseHost=baseHost;
}
publicbooleanisJustDatabase(){
returnjustDatabase;
}
publicvoidsetJustDatabase(booleanjustDatabase){
this.justDatabase=justDatabase;
}
publicStringgetContentPath(){
returncontentPath;
}
publicvoidsetContentPath(StringcontentPath){
this.contentPath=contentPath;
}
}
spring上的配置文件applicationContext-bean.xml:
<bean id="productCapture"
      class="com.sillycat.api.thread.runner.HtmlCaptureRunner">
    <property name="contentPath" value="${product.contentPath}"/>
    <property name="basePort" value="${product.base.port}"/>
    <property name="baseURL" value="${product.base.url}"/>
    <property name="charset" value="${product.base.code}"/>
    <property name="threads" value="${product.base.threads}"/>
</bean>
<bean id="messageCapture"
      class="com.sillycat.api.thread.runner.HtmlCaptureRunner">
    <property name="contentPath" value="${message.contentPath}"/>
    <property name="basePort" value="${message.base.port}"/>
    <property name="baseURL" value="${message.base.url}"/>
    <property name="charset" value="${message.base.code}"/>
    <property name="threads" value="${message.base.threads}"/>
</bean>
easySearch.properties配置文件:
#==========================================
# spider configuration
#=========================================
product.contentPath=product
product.base.port=80
product.base.url=http://www.safedv.com
product.base.code=UTF-8
product.base.threads=3
message.contentPath=message
message.base.port=80
message.base.url=http://www.safedv.com
message.base.code=UTF-8
message.base.threads=3
单元测试类HtmlRunnerTest.java文件:
packagecom.sillycat.api.thread;
importcom.sillycat.api.commons.base.BaseManagerTest;
importcom.sillycat.api.thread.runner.HtmlCaptureRunner;
publicclassHtmlRunnerTestextendsBaseManagerTest{
privateHtmlCaptureRunnerproductCapture;
privateHtmlCaptureRunnermessageCapture;
protectedvoidsetUp()throwsException{
super.setUp();
productCapture=(HtmlCaptureRunner)appContext.getBean("productCapture");
messageCapture=(HtmlCaptureRunner)appContext.getBean("messageCapture");
}
protectedvoidtearDown()throwsException{
super.tearDown();
}
publicvoidtestDumy(){
assertTrue(true);
}
publicvoidntestProductCapture(){
productCapture.capture();
}
publicvoidtestMessageCapture(){
messageCapture.capture();
}
}