// htmlparser web page scraping
import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

import org.apache.log4j.Logger;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;
/**
*分析www.cheshi.com首页新闻
*@authorj.li
*/
publicclassHtmlParser{
privatestaticLoggerlogger;
privateConnectionconn=null;
privatestaticfinalStringSitename="";
publicvoidindexNewsContent(Stringsitepath)throwsException{
logger.info("分析网站【"+sitepath+"】首页的新闻列表,内容为【<divclass=\"hotjd\"></div>】所有网页新闻地址的HTML内容。");
ParsermyParser=newParser(sitepath);
myParser.setEncoding("UTF-8");
NodeListnodeList=myParser.extractAllNodesThatMatch(newNodeFilter(){
publicbooleanaccept(Nodenode){
return((nodeinstanceofTag)
&&!((Tag)node).isEndTag()
&&((Tag)node).getTagName().equals("DIV")
&&((Tag)node).getAttribute("class")!=null
&&((Tag)node).getAttribute("class").equals("descclearfix"));
}
});
for(inti=0,len=nodeList.size();i<len;i++){
Nodenode=nodeList.elementAt(i);
logger.debug(node.toHtml());
System.out.println(node.toHtml());
System.out.println("------------------------------------------------------------------------------------------------------");
//extractText(node.toHtml());
}
}
publicvoidextractText(StringinputHtml)throwsException{
Parserparser=Parser.createParser(inputHtml,"GBK");
TagNameFilterfilter=newTagNameFilter("a");
NodeListnodeList=parser.extractAllNodesThatMatch(filter);
NodeIteratorit=nodeList.elements();
getConnection();
while(it.hasMoreNodes()){
LinkTagnode=(LinkTag)it.nextNode();
Stringhref=node.getLink();
Stringtitle=node.getLinkText();
logger.info("分析首页新闻【"+title+"】,链接地址【"+href+"】");
try{
if(!newsExist(title)){
insertDataBase(title,extractContent(href));
}else{
logger.info("新闻【"+title+"】数据库中已经存在,忽略进入下一个新闻分析!");
}
}catch(SQLExceptione){
logger.error("插入数据库新闻记录异常!"+e.getMessage());
e.printStackTrace();
}catch(Exceptione){
logger.error(e.getMessage());
logger.info("分析新闻【"+title+"】,链接地址【"+href+"】失败,进入下一个新闻分析。");
e.printStackTrace();
}
}
closeConnection();
}
publicStringextractContent(Stringcontent)throwsException{
try{
ParsermyParser=newParser(content);
myParser.setEncoding("GBK");
NodeListnodeList=myParser.extractAllNodesThatMatch(newNodeFilter(){
publicbooleanaccept(Nodenode){
return((nodeinstanceofTag)
&&!((Tag)node).isEndTag()
&&((Tag)node).getTagName().equals("DIV")
&&((Tag)node).getAttribute("class")!=null
&&((Tag)node).getAttribute("class").equals("cs_content"));
}
});
intsize=nodeList.size();
Nodenode=nodeList.elementAt(size-1);
content=node.toHtml();
logger.debug("==========extractContent==============");
logger.debug(content);
}catch(Exceptionpe){
logger.error("分析新闻页面出现异常!"+pe.getMessage()+"原因可能出现于新闻页面不存在<divclass=\"cs_content\"></div>标记。");
throwpe;
}
returnremoveTagA(content);
}
/**
*去除新闻中href包含cheshi.com的<a>标签
*@paramcontent分析html内容
*@return分析处理后的html内容
*/
publicStringremoveTagA(Stringcontent)throwsParserException{
ParsermyParser=newParser(content);
myParser.setEncoding("GBK");
NodeListnodeList=myParser.extractAllNodesThatMatch(newTagNameFilter("a"));
SimpleNodeIteratorit=nodeList.elements();
while(it.hasMoreNodes()){
LinkTagnode=(LinkTag)it.nextNode();
logger.info("移除新闻内容中包含的文字、图片的链接【"+node.toHtml()+"】。");
if(node.getLink().indexOf("cheshi.com")>-1)
content=content.replace(node.toHtml(),node.getStringText());
}
logger.debug("==========removeTagA==============");
logger.debug(content);
returndownloadImages(content,"D:\\autodata\\upload\\intersite",SiteName+"upload/intersite");
}
publicStringdownloadImages(Stringcontent,StringuploadImgPath,Stringlocalhost)throwsParserException{
Filef=newFile(uploadImgPath);
if(!f.exists()){
f.mkdirs();
}
ParsermyParser=newParser(content);
myParser.setEncoding("GBK");
NodeListnodeList=myParser.extractAllNodesThatMatch(newTagNameFilter("img"));
SimpleNodeIteratorit=nodeList.elements();
while(it.hasMoreNodes()){
Tagtag=(Tag)it.nextNode();
Stringsrc=tag.getAttribute("src");
Stringfilename=src.substring(src.lastIndexOf("/")+1);
InputStreamis=null;
FileOutputStreamfos=null;
try{
URLurl=newURL(src);
is=url.openStream();
intbytesRead=0;
byte[]buff=newbyte[1024];
fos=newFileOutputStream(uploadImgPath+"/"+filename);
while((bytesRead=is.read(buff,0,buff.length))!=-1){
fos.write(buff,0,bytesRead);
}
content=content.replace(src,localhost+"/"+filename);
}catch(FileNotFoundExceptionnotFoundException){
notFoundException.printStackTrace();
}catch(IOExceptionioe){
ioe.printStackTrace();
}finally{
try{
if(fos!=null)fos.close();
if(is!=null)is.close();
}catch(IOExceptionioe){
ioe.printStackTrace();
}
}
}
logger.debug("=================downloadImages==================");
logger.debug(content);
returncontent;
}
publicvoidgetConnection(){
try{
Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver");
StringstrCon="jdbc:microsoft:sqlserver://192.168.99.188:12580;databaseName=Project2009;SelectMethod=cursor";
StringstrUsername="sa";
StringstrPWD="qsyjcsxdl@@@web2009@@@";
conn=DriverManager.getConnection(strCon,strUserName,strPWD);
}catch(java.lang.ClassNotFoundExceptioncnfe){
cnfe.printStackTrace();
}catch(SQLExceptionse){
se.printStackTrace();
}
}
publicvoidcloseConnection(){
try{
if(conn!=null&&!conn.isClosed())conn.close();
}catch(SQLExceptionse){
se.printStackTrace();
}
}
publicvoidinsertDataBase(StringnewsTitle,StringnewsContent)throwsSQLException{
PreparedStatementpstmt=null;
try{
pstmt=conn.prepareStatement("INSERTINTOFumNews(NewsTitle,NewsContext,NewsState)values(?,?,?)");
pstmt.setString(1,newsTitle);
pstmt.setString(2,newsContent);
pstmt.setInt(3,1);
pstmt.executeUpdate();
}catch(SQLExceptione){
throwe;
}finally{
try{
if(pstmt!=null)pstmt.close();
}catch(SQLExceptione){
e.printStackTrace();
}
}
}
publicbooleannewsExist(Stringtitle)throwsSQLException{
PreparedStatementpstmt=null;
try{
pstmt=conn.prepareStatement("SELECTtop1NewsIdfromFumNewswhereNewsTitle=?");
pstmt.setString(1,title);
ResultSetrs=pstmt.executeQuery();
returnrs.next();
}catch(SQLExceptione){
throwe;
}finally{
try{
if(pstmt!=null)pstmt.close();
}catch(SQLExceptione){
e.printStackTrace();
}
}
}
publicstaticvoidmain(String[]args){
HtmlParserhtml=newHtmlParser();
//设置代理链接网络
//System.getProperties().put("proxySet","true");
//System.getProperties().put("proxyHost","192.168.99.100");
//System.getProperties().put("proxyPort","80");
//URLurl=html.getClass().getResource("log4j.properties");
//PropertyConfigurator.configure("www.cheshi.com");
logger=Logger.getLogger(HtmlParser.class);
try{
html.indexNewsContent("http://www.kaola.com/activity/detail/3245.html?navindex=1");
}catch(Exceptione){
e.printStackTrace();
logger.error("分析网页遇到错误,原因:"+e.getMessage());
}
logger.info("分析网页内容完成。");
}
}