解析得到除去标签的txt内容
代码如下:
// Collect the tag-free text of every node matched by body_filter into `body`.
NodeList matched = this.getParser().parse(body_filter);
final int total = matched.size();
int idx = 0;
while (idx < total) {
    // Re-parse this node's own HTML so the visitor only walks that fragment.
    Parser fragmentParser = new Parser(matched.elementAt(idx).toHtml());
    TextExtractingVisitor textVisitor = new TextExtractingVisitor();
    fragmentParser.visitAllNodesWith(textVisitor);
    body.append(textVisitor.getExtractedText());
    idx++;
}
TextExtractingVisitor、visitAllNodesWith 等类及方法在 Visitor 中比较重要,但也比较少见。
下面附源代码:
代码如下:
import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.Date; import org.htmlparser.Node; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.filters.AndFilter; import org.htmlparser.filters.HasAttributeFilter; import org.htmlparser.filters.HasChildFilter; import org.htmlparser.filters.TagNameFilter; import org.htmlparser.util.NodeList; import org.htmlparser.visitors.TextExtractingVisitor; import com.extractor.Extractor; public class ExtractorHangdian extends Extractor{ public void extract() { BufferedWriter bw=null; String indextime; String title; StringBuffer body=new StringBuffer();; NodeFilter time_filter=new AndFilter(new TagNameFilter("font"),new HasAttributeFilter("color","#808080")); NodeFilter title_filter1=new AndFilter(new TagNameFilter("td"),new HasChildFilter(new TagNameFilter("b"))); NodeFilter body_filter=new AndFilter(new TagNameFilter("td"),new HasChildFilter(new TagNameFilter("p"))); try { NodeList title_nodes=this.getParser().parse(title_filter1); Node node=title_nodes.elementAt(0); NodeList node2=node.getChildren(); //title=node2.elementAt(0).toHtml(); /* '\r\n' */ //title=node2.elementAt(1).toHtml(); /*font color="#000080" style="font-size:14.4px*/ //title=node2.elementAt(2).toHtml(); /* b */ title=node2.elementAt(3).toHtml(); /* 教材征订及教师用书登记通知 */ bw=new BufferedWriter(new FileWriter(new File(this.getOutputPath()+title+".txt"))); String url_seg1=getInputFilePath().substring(3,30); int end=getInputFilePath().lastIndexOf("."); String url_seg2=getInputFilePath().substring(30, end); String url_seg=url_seg1+".asp?"+url_seg2; url_seg=url_seg.replaceAll("\\\\","/"); String url="http://"+url_seg; bw.write(url+NEWLINE); bw.write(title+NEWLINE); } catch(Exception e) { e.printStackTrace(); } this.getParser().reset(); try { NodeList time_nodes=this.getParser().parse(time_filter); Node time_node=time_nodes.elementAt(1);//这里的“1”表示符合time_filter的第二个元素 
indextime=time_node.getNextSibling().toHtml(); bw.write(indextime+NEWLINE); } catch(Exception e) { e.printStackTrace(); } this.getParser().reset();//得到除去标签的所有txt文本 try { NodeList body_nodes=this.getParser().parse(body_filter); for(int i=0;i<body_nodes.size();i++) { Node node=body_nodes.elementAt(i); Parser body_parser=new Parser(node.toHtml()); TextExtractingVisitor visitor=new TextExtractingVisitor(); body_parser.visitAllNodesWith(visitor); body.append(visitor.getExtractedText()); } bw.write(body+NEWLINE); } catch(Exception e) { e.printStackTrace(); } try { if(bw!=null) bw.close(); }catch(IOException e) { e.printStackTrace(); } } }
这里顺便提一下:当年因为 bw 没有关闭(close),缓冲区里的内容没有真正写入文件,结果怎么也读不到内容,折腾了我好几天,郁闷死了,想起来就火大。务必注意:写完之后一定要关闭 bw!
相关推荐
爱好HtmlCssJs 2019-11-25
wgPython 2019-04-05
jkshangss 2012-08-06
panyingdao 2011-11-03
loverlucky 2010-11-22
GATSBYER 2012-03-16
souhugirl 2011-11-30
taowanyy 2014-12-08
souhugirl 2010-04-12
SPARK 2010-04-12
谷歌架构师 2019-06-21
Yellowpython 2019-06-21
wusiye 2008-07-08
happyzhangyin 2012-08-28
wangnan0 2012-03-15
andyhu00 2011-04-07
zhangpeng 2011-02-22