jeecms 采集功能优化,基于htmlparser实现,多线程版
为了熟悉多线程相关知识,我把jeecms采集器类改成了多线程版。目前还不是很完善,贴出来请大家一起完善、改进。
说明:暂不支持暂停,停止功能。
用法:和我上一篇jeecms采集功能优化,基于htmlparser实现里面的用法一样。
思路:想法很简单,在主线程处理类中,先取得当前采集任务下所有URL,并放入队列中,然后开启指定数目的线程(默认是2个),由这些线程从队列中取出URL并采集内容。
代码清单:
采集器主类:MultiThreadAcquisitionSvcImpl.java
HTML解析工具类接口:ParseHtmlTool.java
HTML解析工具,HtmlParser实现类:HtmlParserImpl.java
采集参数封装bean:ParamBean.java
队列类:Queue.java
URL队列:UrlQueue.java
代码如下:
采集器主类:MultiThreadAcquisitionSvcImpl.java
packagecom.jeecms.cms.service;
importjava.io.IOException;
importjava.net.URI;
importjava.net.URISyntaxException;
importjava.util.List;
importjava.util.Map;
importjava.util.concurrent.CountDownLatch;
importjava.util.concurrent.ExecutorService;
importjava.util.concurrent.Executors;
importorg.apache.commons.lang.StringUtils;
importorg.apache.http.HttpEntity;
importorg.apache.http.HttpHost;
importorg.apache.http.HttpResponse;
importorg.apache.http.StatusLine;
importorg.apache.http.client.ClientProtocolException;
importorg.apache.http.client.HttpClient;
importorg.apache.http.client.HttpResponseException;
importorg.apache.http.client.ResponseHandler;
importorg.apache.http.client.methods.HttpGet;
importorg.apache.http.conn.params.ConnRoutePNames;
importorg.apache.http.impl.client.DefaultHttpClient;
importorg.apache.http.util.EntityUtils;
importorg.slf4j.Logger;
importorg.slf4j.LoggerFactory;
importorg.springframework.beans.factory.annotation.Autowired;
importorg.springframework.stereotype.Service;
importcom.jeecms.cms.entity.assist.CmsAcquisition;
importcom.jeecms.cms.entity.main.Content;
importcom.jeecms.cms.manager.assist.CmsAcquisitionMng;
/**
*采集器-多线程版
*@authorjavacoo
*@since2011-11-02
*@version1.0
*/
@Service
publicclassMultiThreadAcquisitionSvcImplimplementsAcquisitionSvc{
privateLoggerlog=LoggerFactory.getLogger(MultiThreadAcquisitionSvcImpl.class);
/**开启线程数*/
privatestaticintTHREAD_NUM=2;
/**每个线程休眠毫秒数*/
privatestaticintSLEEP_TIME=100;
/**连接集合标志*/
privatestaticStringLINK_KEY="linkKey";
/**标题集合标志*/
privatestaticStringTITLE_KEY="titleKey";
/**采集管理对象*/
privateCmsAcquisitionMngcmsAcquisitionMng;
/**存放HttpClient的ThreadLocal对象*/
privatestaticThreadLocal<HttpClient>httpClientThreadLocal=newThreadLocal<HttpClient>();
/**存放ParseHtmlTool的ThreadLocal对象*/
privatestaticThreadLocal<ParseHtmlTool>parseHtmlToolThreadLocal=newThreadLocal<ParseHtmlTool>();
/**存放UrlQueue的ThreadLocal对象*/
privatestaticThreadLocal<UrlQueue>urlQueueThreadLocal=newThreadLocal<UrlQueue>();
@Autowired
publicvoidsetCmsAcquisitionMng(CmsAcquisitionMngcmsAcquisitionMng){
this.cmsAcquisitionMng=cmsAcquisitionMng;
}
/**
*开始执行采集任务
*/
publicbooleanstart(Integerid){
CmsAcquisitionacqu=cmsAcquisitionMng.findById(id);
if(acqu==null||acqu.getStatus()==CmsAcquisition.START){
returnfalse;
}
newThread(newMainThreadProcesser(this,acqu)).start();
returntrue;
}
/**
*主线程处理类
*@authorjavacoo
*@since2011-11-02
*/
privateclassMainThreadProcesserimplementsRunnable{
privateCmsAcquisitionacqu;
privateAcquisitionSvcacquisitionSvc;
publicMainThreadProcesser(AcquisitionSvcacquisitionSvc,CmsAcquisitionacqu){
this.acqu=acqu;
this.acquisitionSvc=acquisitionSvc;
}
publicvoidrun(){
longtStart=System.currentTimeMillis();
System.out.println("主线程:"+Thread.currentThread().getName()+"开始...");
try{
getHttpClient().getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY,newHttpHost("128.160.64.5",1235));
CharsetHandlerhandler=newCharsetHandler(acqu.getPageEncoding());
getAllUrls(acqu,handler);
CountDownLatchlatch=newCountDownLatch(THREAD_NUM);
ExecutorServiceexec=Executors.newCachedThreadPool();
for(inti=0;i<THREAD_NUM;i++){
Threadthread=newThread(newProcesser(acquisitionSvc,acqu,latch,getHttpClient(),getUrlQueue(),getParseHtmlTool(acqu),handler));
exec.execute(thread);
}
latch.await();
exec.shutdown();
}catch(InterruptedExceptione){
e.printStackTrace();
}catch(ClientProtocolExceptione){
e.printStackTrace();
}catch(URISyntaxExceptione){
e.printStackTrace();
}catch(IOExceptione){
e.printStackTrace();
}finally{
httpClientThreadLocal.get().getConnectionManager().shutdown();
cmsAcquisitionMng.end(acqu.getId());
httpClientThreadLocal.remove();
parseHtmlToolThreadLocal.remove();
urlQueueThreadLocal.remove();
longtEnd=System.currentTimeMillis();
System.out.println("主线程:"+Thread.currentThread().getName()+"结束...");
System.out.println("主线程:"+Thread.currentThread().getName()+"总共用时:"+(tEnd-tStart)+"ms");
}
}
}
/**
*处理类
*@authorjavacoo
*@since2011-11-02
*/
privateclassProcesserimplementsRunnable{
privateAcquisitionSvcacquisitionSvc;
privateCmsAcquisitionacqu;
privateCountDownLatchlatch;
privateUrlQueueurlQueue;
privateHttpClienthttpClient;
privateParseHtmlToolparseHtmlTool;
privateCharsetHandlerhandler;
publicProcesser(AcquisitionSvcacquisitionSvc,CmsAcquisitionacqu,CountDownLatchlatch,HttpClienthttpClient,UrlQueueurlQueue,ParseHtmlToolparseHtmlTool,CharsetHandlerhandler){
this.acquisitionSvc=acquisitionSvc;
this.acqu=acqu;
this.latch=latch;
this.urlQueue=urlQueue;
this.httpClient=httpClient;
this.parseHtmlTool=parseHtmlTool;
this.handler=handler;
}
publicvoidrun(){
System.out.println("======================子线程:"+Thread.currentThread().getName()+"开始...");
try{
Map<String,String>urlMap=null;
while(!urlAndTitleMapIsEmpty(urlQueue)){
urlMap=getUrlAndTitleMap(urlQueue);
saveContent(acqu,httpClient,parseHtmlTool,handler,urlMap);
Thread.sleep(SLEEP_TIME);
}
}catch(Exceptione){
e.printStackTrace();
log.warn(null,e);
}finally{
System.out.println("======================子线程:"+Thread.currentThread().getName()+"结束.");
log.info("Acquisition#{}complete",acqu.getId());
latch.countDown();
}
}
}
/**
*取得当前主线程的HttpClient对象
*@return当前主线程的HttpClient对象
*/
privatestaticHttpClientgetHttpClient(){
if(httpClientThreadLocal.get()==null){
HttpClientclient=newDefaultHttpClient();
httpClientThreadLocal.set(client);
returnclient;
}else{
returnhttpClientThreadLocal.get();
}
}
/**
*取得当前主线程的UrlQueue对象
*@return当前主线程的UrlQueue对象
*/
privatestaticUrlQueuegetUrlQueue(){
if(urlQueueThreadLocal.get()==null){
UrlQueueurlQueue=newUrlQueue();
urlQueueThreadLocal.set(urlQueue);
returnurlQueue;
}else{
returnurlQueueThreadLocal.get();
}
}
/**
*取得当前主线程的ParseHtmlTool对象
*@paramacqu采集参数对象
*@return当前主线程的ParseHtmlTool对象
*/
privatestaticParseHtmlToolgetParseHtmlTool(CmsAcquisitionacqu){
if(parseHtmlToolThreadLocal.get()==null){
ParseHtmlToolparseHtmlTool=newHtmlParserImpl(acqu);
parseHtmlToolThreadLocal.set(parseHtmlTool);
returnparseHtmlTool;
}else{
returnparseHtmlToolThreadLocal.get();
}
}
/**
*连接和标题map对象入队列
*@parammap连接和标题map对象
*/
privatesynchronizedvoidaddUrlAndTitleMap(Map<String,String>map){
getUrlQueue().addUnVisitedUrl(map);
}
/**
*连接和标题map对象出队列
*@paramurlQueue当前线程的队列
*@return连接和标题map对象
*/
privatesynchronizedMap<String,String>getUrlAndTitleMap(UrlQueueurlQueue){
returnurlQueue.unVisitedUrlDeQueue();
}
/**
*判断当前对象是否为空
*@paramurlQueue当前线程的队列
*@returntrue/flase
*/
privatesynchronizedbooleanurlAndTitleMapIsEmpty(UrlQueueurlQueue){
returnurlQueue.isEmpty();
}
/**
*取得当前线程下所有计划的连接,并加入队列
*@paramacqu采集参数对象
*@paramhandler字符集对象
*@throwsURISyntaxException
*@throwsIOException
*@throwsClientProtocolException
*/
privatevoidgetAllUrls(CmsAcquisitionacqu,CharsetHandlerhandler)throwsURISyntaxException,ClientProtocolException,IOException{
acqu=cmsAcquisitionMng.start(acqu.getId());
String[]plans=acqu.getAllPlans();
Stringurl=null;
Stringhtml=null;
List<Map<String,String>>urlAndTitleListMap=null;
HttpGethttpGet=null;
for(inti=plans.length-acqu.getCurrNum();i>=0;i--){
url=plans[i];
httpGet=newHttpGet(newURI(url.trim()));
html=getHttpClient().execute(httpGet,handler);
urlAndTitleListMap=getParseHtmlTool(acqu).getUrlAndTitleMap(html);
for(Map<String,String>map:urlAndTitleListMap){
addUrlAndTitleMap(map);
}
}
System.out.println("=======当前线程:"+Thread.currentThread().getName()+"URL连接数:"+getUrlQueue().getUnVisitedUrl().getSize());
}
/**
*保存内容
*@paramacqu请求参数对象
*@paramhttpClienthttpClient对象
*@paramparseHtmlToolparseHtmlTool对象
*@paramhandlerCharsetHandler对象
*@parammap连接和标题map对象
*@returnContent
*/
privatesynchronizedContentsaveContent(CmsAcquisitionacqu,HttpClienthttpClient,ParseHtmlToolparseHtmlTool,CharsetHandlerhandler,Map<String,String>map){
try{
HttpGethttpGet=null;
if(map.get(LINK_KEY).contains("http://")){
httpGet=newHttpGet(newURI(map.get(LINK_KEY).trim()));
}else{
httpGet=newHttpGet(newURI("http://localhost/v7/"+map.get(LINK_KEY).trim()));
}
Stringhtml=httpClient.execute(httpGet,handler);
System.out.println("=============================子线程:"+Thread.currentThread().getName()+"执行");
Stringtxt=parseHtmlTool.getHtml(html);
returncmsAcquisitionMng.saveContent(map.get(TITLE_KEY),txt,acqu.getId());
//returnnull;
}catch(Exceptione){
log.warn(null,e);
e.printStackTrace();
returnnull;
}
}
/**
*字符集帮助类
*@authorAdministrator
*
*/
privateclassCharsetHandlerimplementsResponseHandler<String>{
privateStringcharset;
publicCharsetHandler(Stringcharset){
this.charset=charset;
}
publicStringhandleResponse(HttpResponseresponse)
throwsClientProtocolException,IOException{
StatusLinestatusLine=response.getStatusLine();
if(statusLine.getStatusCode()>=300){
thrownewHttpResponseException(statusLine.getStatusCode(),
statusLine.getReasonPhrase());
}
HttpEntityentity=response.getEntity();
if(entity!=null){
if(!StringUtils.isBlank(charset)){
returnEntityUtils.toString(entity,charset);
}else{
returnEntityUtils.toString(entity);
}
}else{
returnnull;
}
}
}
}
相关辅助类
HTML解析工具类接口:ParseHtmlTool.java
packagecom.jeecms.cms.service;
importjava.util.List;
importjava.util.Map;
/**
 * Contract for tools that pull links, titles and content regions out of a raw
 * HTML page.
 *
 * @author javacoo
 * @since 2011-10-31
 */
public interface ParseHtmlTool {

    /**
     * Extracts the links found in the page.
     *
     * @param orginHtml raw HTML of the page
     * @return the links
     */
    List<String> getUrlList(String orginHtml);

    /**
     * Extracts the titles found in the page.
     *
     * @param orginHtml raw HTML of the page
     * @return the titles
     */
    List<String> getTitleList(String orginHtml);

    /**
     * Extracts the HTML of the configured region of the page.
     *
     * @param orginHtml raw HTML of the page
     * @return HTML of the selected region
     */
    String getHtml(String orginHtml);

    /**
     * Extracts link/title pairs from the page, one map per pair.
     *
     * @param orginHtml raw HTML of the page
     * @return list of link/title maps
     */
    List<Map<String, String>> getUrlAndTitleMap(String orginHtml);
}
HTML解析工具,HtmlParser实现类:HtmlParserImpl.java
packagecom.jeecms.cms.service;
importjava.io.BufferedReader;
importjava.io.File;
importjava.io.FileInputStream;
importjava.io.IOException;
importjava.io.InputStreamReader;
importjava.net.URISyntaxException;
importjava.util.ArrayList;
importjava.util.HashMap;
importjava.util.Iterator;
importjava.util.List;
importjava.util.Map;
importjava.util.regex.Matcher;
importjava.util.regex.Pattern;
importorg.apache.commons.lang.StringUtils;
importorg.htmlparser.Node;
importorg.htmlparser.NodeFilter;
importorg.htmlparser.Parser;
importorg.htmlparser.filters.HasAttributeFilter;
importorg.htmlparser.filters.NodeClassFilter;
importorg.htmlparser.filters.TagNameFilter;
importorg.htmlparser.nodes.RemarkNode;
importorg.htmlparser.util.NodeList;
importorg.htmlparser.util.ParserException;
importcom.jeecms.cms.entity.assist.CmsAcquisition;
/**
*HTML解析工具,HtmlParser实现类
*@authorjavacoo
*@since2011-10-31
*/
publicclassHtmlParserImplimplementsParseHtmlTool{
/**连接集合标志*/
privatestaticStringLINK_KEY="linkKey";
/**标题集合标志*/
privatestaticStringTITLE_KEY="titleKey";
/**单标签标志*/
privatestaticStringSINGLE_TAG="singleTag";
/**连接正则表达式*/
privatestaticStringLINK_REGX="<a.*href=\"(.*?)\".*>(.*?)</a>";
/**正则表达式对象*/
privatePatternpt=Pattern.compile(LINK_REGX);
/**采集参数bean*/
privateParamBeanparamBean;
publicHtmlParserImpl(CmsAcquisitionacqu){
parseRequestParam(acqu);
}
/**
*取得标题集合
*@paramorginHtml原始HTML
*@return标题集合
*/
publicList<String>getTitleList(StringorginHtml){
orginHtml=getHtmlByFilter(paramBean.getLinksetStartMap(),paramBean.getLinksetEndMap(),orginHtml);
if(StringUtils.isNotEmpty(orginHtml)){
returngetUrlOrTitleListByType(orginHtml,TITLE_KEY);
}
returnnull;
}
/**
*取得连接集合
*@paramorginHtml原始HTML
*@return连接集合
*/
publicList<String>getUrlList(StringorginHtml){
orginHtml=getHtmlByFilter(paramBean.getLinksetStartMap(),paramBean.getLinksetEndMap(),orginHtml);
if(StringUtils.isNotEmpty(orginHtml)){
returngetUrlOrTitleListByType(orginHtml,LINK_KEY);
}
returnnull;
}
/**
*取得指定区域的HTML内容
*@paramorginHtml原始HTML
*@return指定区域的HTML内容
*@throwsParserException
*/
publicStringgetHtml(StringorginHtml){
orginHtml=getHtmlByFilter(paramBean.getContentStartMap(),paramBean.getContentEndMap(),orginHtml);
returnorginHtml;
}
/**
*取得连接标题Map
*@paramorginHtml原始HTML
*@return连接标题Map
*/
publicList<Map<String,String>>getUrlAndTitleMap(StringorginHtml){
returngetUrlAandTitleMap(orginHtml);
}
/**
*解析采集参数,并封装到ParamBean
*@paramacqu原始采集参数
*@return采集参数封装bean
*/
privatevoidparseRequestParam(CmsAcquisitionacqu){
paramBean=newParamBean();
if(!StringUtils.isEmpty(acqu.getLinksetStart())){
paramBean.setLinksetStartMap(populateParamMap(acqu.getLinksetStart()));
}
if(!StringUtils.isEmpty(acqu.getLinksetEnd())){
paramBean.setLinksetEndMap(populateParamMap(acqu.getLinksetEnd()));
}
if(!StringUtils.isEmpty(acqu.getContentStart())){
paramBean.setContentStartMap(populateParamMap(acqu.getContentStart()));
}
if(!StringUtils.isEmpty(acqu.getContentEnd())){
paramBean.setContentEndMap(populateParamMap(acqu.getContentEnd()));
}
}
/**
*得到连接标题MAP
*@paramhtmlhtml内容
*@return连接或者标题集合
*/
privateList<Map<String,String>>getUrlAandTitleMap(Stringhtml){
html=getHtmlByFilter(paramBean.getLinksetStartMap(),paramBean.getLinksetEndMap(),html);
List<Map<String,String>>resultMapList=newArrayList<Map<String,String>>();
Map<String,String>resultMap=null;
Matcherm=pt.matcher(html);
while(m.find()){
if(StringUtils.isNotEmpty(m.group(1))&&StringUtils.isNotEmpty(m.group(2))){
resultMap=newHashMap<String,String>();
resultMap.put(LINK_KEY,m.group(1));
resultMap.put(TITLE_KEY,m.group(2));
resultMapList.add(resultMap);
}
}
returnresultMapList;
}
/**
*得到地址集
*@paramhtmlhtml内容
*@paramtype1:取得连接集合,2:取得标题集合
*@return连接或者标题集合
*/
privateList<String>getUrlOrTitleListByType(Stringhtml,Stringtype){
List<String>resultList=newArrayList<String>();
Matcherm=pt.matcher(html);
Stringresult="";
intpos=1;
if(TITLE_KEY.equals(type)){
pos=2;
}
while(m.find()){
result=m.group(pos);
resultList.add(result);
}
returnresultList;
}
/**
*取得指定区域的HTML内容
*@paramtagMap标签MAP
*@paramremoveTagMap要过滤的标签MAP
*@paramorginHtml原始HTML
*@return指定区域的HTML内容
*@throwsParserException
*/
privateStringgetHtmlByFilter(Map<String,String>tagMap,
Map<String,String>removeTagMap,StringorginHtml){
try{
Parserparser=newParser();
parser.setInputHTML(orginHtml);
//第一步取得指定属性/标签内容
StringtempKey=null;
StringtempValue=null;
String[]tempValueArr=null;
StringBuildersb=newStringBuilder();
NodeFilterfilter=null;
for(Iterator<String>it=tagMap.keySet().iterator();it.hasNext();){
tempKey=it.next();
tempValue=tagMap.get(tempKey);
if(tempValue.contains("|")){
tempValueArr=tempValue.split("\\|");
}else{
tempValueArr=newString[]{tempValue};
}
for(Stringvalue:tempValueArr){
filter=populateFilter(tempKey,value);
appendHtmlByFilter(parser,filter,sb);
}
}
//第二步过滤指定属性/标签内容
StringcontentHtml=sb.toString();
for(Iterator<String>it=removeTagMap.keySet().iterator();it
.hasNext();){
tempKey=it.next();
tempValue=removeTagMap.get(tempKey);
if(tempValue.contains("|")){
tempValueArr=tempValue.split("\\|");
}else{
tempValueArr=newString[]{tempValue};
}
for(Stringvalue:tempValueArr){
filter=populateFilter(tempKey,value);
contentHtml=removeHtmlByFilter(parser,filter,contentHtml);
}
}
//第三步过滤注释
filter=newNodeClassFilter(RemarkNode.class);
contentHtml=removeHtmlByFilter(parser,filter,contentHtml);
//System.out.println("=================================结果=======================================");
//System.out.println(contentHtml);
returncontentHtml;
}catch(ParserExceptione){
//TODOAuto-generatedcatchblock
e.printStackTrace();
}
return"";
}
/**
*解析并组装采集参数,支持标签属性/值形式和标签名称形式,可混合使用
*
约定采集参数格式如下*
1,标签属性/值形式,如:class=articleList|tips,id=fxwb|fxMSN|fxMSN*
2,标签名称形式,如:div,p,span*
3,混合形式,如:class=articleList|tips,id=fxwb|fxMSN|fxMSN,div,p,span*@paramparamStr参数字符串
*/
privateMap<String,String>populateParamMap(StringparamStr){
Map<String,String>paramMap=newHashMap<String,String>();
String[]paramStrArr=paramStr.split(",");
String[]tempStrArr=null;
StringBuildersb=newStringBuilder();
for(Stringtemp:paramStrArr){
if(temp.contains("=")){
tempStrArr=temp.split("=");
paramMap.put(tempStrArr[0],tempStrArr[1]);
}else{
if(StringUtils.isNotEmpty(temp)){
sb.append(temp).append("|");
}
}
}
if(StringUtils.isNotEmpty(sb.toString())){
paramMap.put(SINGLE_TAG,sb.substring(0,sb.length()-1));
}
returnparamMap;
}
/**
*组装过滤器
*@paramkey键
*@paramvalue值
*@return过滤器
*/
privateNodeFilterpopulateFilter(Stringkey,Stringvalue){
NodeFilterfilter;
if(SINGLE_TAG.equals(key)){
filter=newTagNameFilter(value);
}else{
filter=newHasAttributeFilter(key,value);
}
returnfilter;
}
/**
*过滤指定属性标签HTML
*@paramparser解析器
*@paramfilter属性过滤器
*@paramorginHtml原始HTML
*@return过滤后HTML
*@throwsParserException
*/
privateStringremoveHtmlByFilter(Parserparser,NodeFilterfilter,StringorginHtml)throwsParserException{
parser.setInputHTML(orginHtml);
NodeListnodes=parser.extractAllNodesThatMatch(filter);
for(inti=0;i<nodes.size();i++){
Nodetextnode=(Node)nodes.elementAt(i);
orginHtml=StringUtils.remove(orginHtml,textnode.toHtml());
}
returnorginHtml;
}
/**
*取得所有指定属性/标签的HTML
*@paramparser解析器
*@paramfilter过滤器
*@paramsb
*@throwsParserException
*/
privatevoidappendHtmlByFilter(Parserparser,NodeFilterfilter,
StringBuildersb)throwsParserException{
NodeListnodes=parser.extractAllNodesThatMatch(filter);
for(inti=0;i<nodes.size();i++){
Nodetextnode=(Node)nodes.elementAt(i);
sb.append(textnode.toHtml());
}
}
/**
*解析并组装采集参数,支持标签属性/值形式和标签名称形式,可混合使用
*
约定采集参数格式如下*
1,标签属性/值形式,如:class=articleList|tips,id=fxwb|fxMSN|fxMSN*
2,标签名称形式,如:div,p,span*
3,混合形式,如:class=articleList|tips,id=fxwb|fxMSN|fxMSN,div,p,span*@paramparamMap参数map
*@paramstr参数字符串
*/
privatevoidpopulateParamMap(Map<String,String>paramMap,StringparamStr){
String[]paramStrArr=paramStr.split(",");
String[]tempStrArr=null;
StringBuildersb=newStringBuilder();
for(Stringtemp:paramStrArr){
if(temp.contains("=")){
tempStrArr=temp.split("=");
paramMap.put(tempStrArr[0],tempStrArr[1]);
}else{
if(StringUtils.isNotEmpty(temp)){
sb.append(temp).append("|");
}
}
}
if(StringUtils.isNotEmpty(sb.toString())){
paramMap.put(SINGLE_TAG,sb.substring(0,sb.length()-1));
}
}
/**
*测试方法-打开文件并返回内容
*@paramszFileName文件绝对地址
*@paramcharset字符集
*@return内容
*/
publicstaticStringopenFile(StringszFileName,Stringcharset){
try{
BufferedReaderbis=newBufferedReader(newInputStreamReader(
newFileInputStream(newFile(szFileName)),charset));
StringBuilderszContent=newStringBuilder();
StringszTemp;
while((szTemp=bis.readLine())!=null){
szContent.append(szTemp).append("\n");
}
bis.close();
returnszContent.toString();
}catch(Exceptione){
return"";
}
}
/**
*测试取得连接地址和标题
*@throwsParserException
*/
publicvoidtestFetchLinkAndTitle()throwsParserException{
Stringhtml=openFile("F:\\4.htm","UTF-8");
Stringresult="";
Map<String,String>map=newHashMap<String,String>();
map.put("class","m_list");
Map<String,String>notMap=newHashMap<String,String>();
//notMap.put("class","atc_ic_f");
result=getHtmlByFilter(map,notMap,html);
System.out.println("=============================result============================");
System.out.println(result);
System.out.println("==========================================================");
Patternpt=Pattern.compile("<a.*href=\"(.*?)\".*>(.*?)</a>");
Matcherm=pt.matcher(result);
Stringlink=null;
Stringtitle=null;
while(m.find()){
link=m.group(1);
title=m.group(2);
if(StringUtils.isNotEmpty(link)){
System.out.println("url:"+link);
System.out.println("title:"+title);
}
}
}
/**
*测试取得内容
*@throwsParserException
*/
publicvoidtestFetchContent()throwsParserException{
Stringhtml=openFile("F:\\6.shtml","GB2312");
Map<String,String>map=newHashMap<String,String>();
map.put("id","artibody");
Map<String,String>notMap=newHashMap<String,String>();
notMap.put(SINGLE_TAG,"style|script");
notMap.put("type","text/javascript");
notMap.put("class","icon_fx|blkCommentotherContent_01");
notMap.put("style","text-align:right;padding-right:10px;|margin-top:6px;|font-size:12px!important;|font-size:12px");
notMap.put("id","fxwb|fxMSN|fxMSN|comment_t_show_top");
getHtmlByFilter(map,notMap,html);
}
/**
*测试解析参数
*/
publicvoidtestParseParam(){
Map<String,String>map=newHashMap<String,String>();
populateParamMap(map,"class=articleList|tips,p,div");
StringtempKey=null;
StringtempValue=null;
String[]tempValueArr=null;
for(Iterator<String>it=map.keySet().iterator();it.hasNext();){
tempKey=it.next();
tempValue=map.get(tempKey);
if(tempValue.contains("|")){
tempValueArr=tempValue.split("\\|");
}else{
tempValueArr=newString[]{tempValue};
}
for(Stringvalue:tempValueArr){
System.out.println("tempKey:"+tempKey);
System.out.println("value:"+value);
}
}
}
/**
*测试过滤标签
*@throwsParserException
*/
publicvoidtestRemarkFilter()throwsParserException{
Stringhtml=openFile("F:\\6.shtml","GB2312");
System.out.println("=========================过滤注释前HTML==================================");
System.out.println(html);
NodeFilterfilter=newNodeClassFilter(RemarkNode.class);
html=removeHtmlByFilter(newParser(),filter,html);
System.out.println("=========================过滤注释后HTML==================================");
System.out.println(html);
}
publicstaticvoidmain(String[]args)throwsParserException,
URISyntaxException,IOException{
HtmlParserImplparseHtmlTool=newHtmlParserImpl(newCmsAcquisition());
//parseHtmlTool.testParseParam();
//parseHtmlTool.testFetchLinkAndTitle();
//parseHtmlTool.testFetchContent();
//parseHtmlTool.testRemarkFilter();
}
}
采集参数封装bean:ParamBean.java
packagecom.jeecms.cms.service;
importjava.util.HashMap;
importjava.util.Map;
/**
 * Holder for the parsed acquisition parameters: one "select" map and one
 * "strip" map each for the link-set region and for the content region.
 * Every map starts out empty.
 *
 * @author javacoo
 * @since 2011-10-31
 */
public class ParamBean {
    /** Filters selecting the link-set region. */
    private Map<String, String> linksetStartMap = new HashMap<String, String>();
    /** Filters stripped from the link-set region. */
    private Map<String, String> linksetEndMap = new HashMap<String, String>();
    /** Filters selecting the content region. */
    private Map<String, String> contentStartMap = new HashMap<String, String>();
    /** Filters stripped from the content region. */
    private Map<String, String> contentEndMap = new HashMap<String, String>();

    /** @return filters selecting the link-set region */
    public Map<String, String> getLinksetStartMap() {
        return this.linksetStartMap;
    }

    /** @param linksetStartMap filters selecting the link-set region */
    public void setLinksetStartMap(Map<String, String> linksetStartMap) {
        this.linksetStartMap = linksetStartMap;
    }

    /** @return filters stripped from the link-set region */
    public Map<String, String> getLinksetEndMap() {
        return this.linksetEndMap;
    }

    /** @param linksetEndMap filters stripped from the link-set region */
    public void setLinksetEndMap(Map<String, String> linksetEndMap) {
        this.linksetEndMap = linksetEndMap;
    }

    /** @return filters selecting the content region */
    public Map<String, String> getContentStartMap() {
        return this.contentStartMap;
    }

    /** @param contentStartMap filters selecting the content region */
    public void setContentStartMap(Map<String, String> contentStartMap) {
        this.contentStartMap = contentStartMap;
    }

    /** @return filters stripped from the content region */
    public Map<String, String> getContentEndMap() {
        return this.contentEndMap;
    }

    /** @param contentEndMap filters stripped from the content region */
    public void setContentEndMap(Map<String, String> contentEndMap) {
        this.contentEndMap = contentEndMap;
    }
}
队列类:Queue.java
packagecom.jeecms.cms.service;
importjava.util.LinkedList;
/**
 * Minimal first-in/first-out queue backed by a linked list.
 * The class does no locking of its own.
 *
 * @author javacoo
 * @since 2011-11-01
 * @param <T> element type
 */
public class Queue<T> {
    private LinkedList<T> items = new LinkedList<T>();

    /**
     * Appends an element at the tail.
     *
     * @param t element to add
     */
    public void enQueue(T t) {
        items.add(t);
    }

    /**
     * Removes and returns the head element.
     *
     * @return the former head
     * @throws java.util.NoSuchElementException if the queue is empty
     */
    public T deQueue() {
        return items.remove();
    }

    /**
     * Tells whether the queue holds no elements.
     *
     * @return {@code true} if empty
     */
    public boolean isEmpty() {
        return items.size() == 0;
    }

    /**
     * Tells whether the queue holds an element equal to {@code t}.
     *
     * @param t element to look for
     * @return {@code true} if present
     */
    public boolean contains(T t) {
        return items.indexOf(t) >= 0;
    }

    /**
     * Returns the number of queued elements.
     *
     * @return the size
     */
    public int getSize() {
        return items.size();
    }
}
URL队列:UrlQueue.java
packagecom.jeecms.cms.service;
importjava.util.Map;
importorg.springframework.util.CollectionUtils;
/**
*URL队列
*@authorjavacoo
*@since2011-11-01
*@param<T>
*/
publicclassUrlQueue{
/**待访问URL集合*/
privateQueue<Map<String,String>>unVisitedUrl=newQueue<Map<String,String>>();
/**
*获得URL队列
*@return
*/
publicQueue<Map<String,String>>getUnVisitedUrl(){
returnunVisitedUrl;
}
/**
*未访问的URL出队列
*@return
*/
publicMap<String,String>unVisitedUrlDeQueue(){
returnunVisitedUrl.deQueue();
}
/**
*保证每个URL只被访问一次
*@paramurl
*/
publicvoidaddUnVisitedUrl(Map<String,String>urlMap){
if(!CollectionUtils.isEmpty(urlMap)&&!unVisitedUrl.contains(urlMap)){
unVisitedUrl.enQueue(urlMap);
}
}
/**
*判断是否为空
*@return
*/
publicbooleanisEmpty(){
returnunVisitedUrl.isEmpty();
}
}