java获取网页主信息之三:html to tree(转)
1.所需文件
param.txt:存放需要提取信息的网页路径
TestPage:存放需要提取信息的网页
out.txt:输出的网页内容
2.测试程序
package test; import java.io.*; import Source.*; //提取页面主要信息测试 public class ETest{ public static void main(String args[]) { //输出文件 String out = "out.txt"; File outfile = new File(out); //建立html树 HTML2Tree h2t = new HTML2Tree(); String file = getFilename(); h2t.main(file); HTree tree = h2t.getTree(); //允许标准差 double th = 0.79; //选择主要信息块 ChooseBlock cb = new ChooseBlock(th); //输出主要信息 String str = cb.getContent(tree); if(str == null) { System.out.println("文件为空"); System.exit(1); } try { PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(outfile))); p.println(str); p.close(); } catch(IOException e) { System.out.println(e); System.exit(1); } } //获取要提取的网页文件名 private static String getFilename() { String file = ""; try { File f = new File("param.txt"); BufferedReader fis = new BufferedReader(new FileReader(f)); String s; while((s = fis.readLine()) != null) if(!s.equalsIgnoreCase("")) { file = s; break; } } catch(IOException e) { System.out.println(e); System.exit(1); } return file; } }
相关推荐
Lzs 2020-10-23
聚合室 2020-11-16
零 2020-09-18
Justhavefun 2020-10-22
jacktangj 2020-10-14
ChaITSimpleLove 2020-10-06
Andrea0 2020-09-18
周游列国之仕子 2020-09-15
afanti 2020-09-16
88234852 2020-09-15
YClimb 2020-09-15
风雨断肠人 2020-09-04
卖口粥湛蓝的天空 2020-09-15
stulen 2020-09-15
pythonxuexi 2020-09-06
abfdada 2020-08-26
梦的天空 2020-08-25