java获取网页主信息之三:html to tree(转)
1.所需文件
param.txt:存放需要提取信息的网页路径
TestPage:存放需要提取信息的网页
out.txt:输出的网页内容
2.测试程序
package test;
import java.io.*;
import Source.*;
//提取页面主要信息测试
public class ETest{
/**
 * Extracts the main content of one web page and writes it to {@code out.txt}.
 *
 * <p>The page path is obtained from {@code getFilename()}. The page is turned
 * into an {@code HTree} by {@code HTML2Tree}, {@code ChooseBlock} selects the
 * content, and the resulting string is written to the output file.
 *
 * <p>The process exits with status 1 when no page path is available, when
 * {@code ChooseBlock.getContent} returns {@code null}, or when the output file
 * cannot be opened or written.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String args[])
{
    // Output file
    String out = "out.txt";
    File outfile = new File(out);
    // Build the HTML tree
    HTML2Tree h2t = new HTML2Tree();
    String file = getFilename();
    // getFilename() returns "" when it found no usable line; do not hand an
    // empty path to the parser.
    if(file.length() == 0)
    {
        System.out.println("param.txt中没有网页路径");
        System.exit(1);
    }
    h2t.main(file);
    HTree tree = h2t.getTree();
    // Allowed standard deviation
    double th = 0.79;
    // Select the main information block
    ChooseBlock cb = new ChooseBlock(th);
    // Get the main information to output
    String str = cb.getContent(tree);
    if(str == null)
    {
        System.out.println("文件为空");
        System.exit(1);
    }
    boolean writeFailed = false;
    try (PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(outfile))))
    {
        p.println(str);
        // PrintWriter never throws on a failed write; checkError() flushes the
        // stream and reports whether any error occurred.
        writeFailed = p.checkError();
    }
    catch(IOException e)
    {
        System.out.println(e);
        System.exit(1);
    }
    if(writeFailed)
    {
        System.out.println("写入" + out + "失败");
        System.exit(1);
    }
}
//获取要提取的网页文件名
/**
 * Reads {@code param.txt} from the current working directory and returns its
 * first non-blank line, with surrounding whitespace removed.
 *
 * <p>If the file cannot be opened or read, the exception is printed and the
 * process exits with status 1.
 *
 * @return the first non-blank line of {@code param.txt}, trimmed, or an empty
 *         string when the file contains no such line
 */
private static String getFilename()
{
    String file = "";
    // try-with-resources closes the reader on every path; the original never
    // closed it.
    try (BufferedReader reader = new BufferedReader(new FileReader(new File("param.txt"))))
    {
        String line;
        while((line = reader.readLine()) != null)
        {
            // Trim so that whitespace-only lines count as blank and stray
            // spaces around the path are dropped.
            String candidate = line.trim();
            if(candidate.length() != 0)
            {
                file = candidate;
                break;
            }
        }
    }
    catch(IOException e)
    {
        System.out.println(e);
        System.exit(1);
    }
    return file;
}
}
相关推荐
Lzs 2020-10-23
聚合室 2020-11-16
零 2020-09-18
Justhavefun 2020-10-22
ChaITSimpleLove 2020-10-06
周游列国之仕子 2020-09-15
afanti 2020-09-16
88234852 2020-09-15
YClimb 2020-09-15
风雨断肠人 2020-09-04
卖口粥湛蓝的天空 2020-09-15
stulen 2020-09-15
pythonxuexi 2020-09-06
abfdada 2020-08-26
梦的天空 2020-08-25