Lucene:根据文件类型自动选择解析器的工厂类
阅读本文之前,请先参考以下几篇讲解各类文档解析的文章:
http://wuzhaohuixy-qq-com.iteye.com/blog/780437
http://wuzhaohuixy-qq-com.iteye.com/blog/780431
http://wuzhaohuixy-qq-com.iteye.com/blog/780426
http://wuzhaohuixy-qq-com.iteye.com/blog/780423
这里主要讲解如何根据文件类型自动解析文档(ppt、pdf、txt、doc、rtf、html、htm),
实现方式是使用 Java 中的反射机制。
先准备属性文件
parser.properties
txt=com.cs.TextParser
doc=com.cs.DocParser
rtf=com.cs.DocParser
ppt=com.cs.PPTParser
pdf=com.cs.PdfParser
html=com.cs.EasyHtmlParser
htm=com.cs.EasyHtmlParser
工厂类ParserFactory
package com.cs;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.util.Locale;
import java.util.Properties;

/**
 * Factory that selects a {@link Parsable} implementation for a file based on
 * the file's extension.
 *
 * <p>The mapping from extension to fully qualified parser class name is read
 * from {@code parser.properties} once, when this class is initialized. Each
 * parser class is instantiated reflectively through its public constructor
 * that takes a single {@link File} argument.
 */
public class ParserFactory {

    /** Extension-to-class-name mapping; filled in by the static initializer. */
    static Properties ps;

    // The configuration file is read into memory as soon as the class is loaded.
    // NOTE(review): the location is a hard-coded absolute path; a load failure
    // is only reported on stderr and leaves the mapping empty, so every lookup
    // then returns null.
    static {
        ps = new Properties();
        FileInputStream in = null;
        try {
            in = new FileInputStream("E:\\EclipseStudyWorkspace\\LuceneParse\\src\\parser.properties");
            ps.load(in);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Properties.load does not close the stream, so release it here.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Returns a parser for the given file, chosen by the file's extension.
     *
     * <p>The extension is the text after the last {@code '.'} of the file
     * name (not of the whole path). It is looked up as written first and, if
     * that finds nothing, again in lower case, so {@code REPORT.PDF} resolves
     * through the {@code pdf} entry.
     *
     * @param file the file to parse; must not be {@code null}
     * @return a parser constructed with {@code file}, or {@code null} when the
     *         file name has no extension, no parser is configured for the
     *         extension, or the configured class cannot be instantiated
     */
    public static Parsable getParser(File file) {
        String fileName = file.getName();
        int dot = fileName.lastIndexOf('.');
        if (dot < 0) {
            // No extension at all: there is nothing to look up.
            return null;
        }

        String ext = fileName.substring(dot + 1);
        String className = ps.getProperty(ext);
        if (className == null) {
            className = ps.getProperty(ext.toLowerCase(Locale.ENGLISH));
        }
        if (className == null) {
            return null;
        }
        // Property values keep trailing whitespace, which would break Class.forName.
        className = className.trim();

        Parsable parser = null;
        try {
            // Class.forName(...).newInstance() cannot be used here because the
            // parser needs the file as a constructor argument; fetch the
            // File-taking constructor and invoke that instead.
            Class<?> clazz = Class.forName(className);
            Constructor<?> constructor = clazz.getConstructor(new Class<?>[] {File.class});
            parser = (Parsable) constructor.newInstance(new Object[] {file});
        } catch (ClassNotFoundException e) {
            reportFailure(className, file, e);
        } catch (SecurityException e) {
            reportFailure(className, file, e);
        } catch (NoSuchMethodException e) {
            reportFailure(className, file, e);
        } catch (IllegalArgumentException e) {
            reportFailure(className, file, e);
        } catch (InstantiationException e) {
            reportFailure(className, file, e);
        } catch (IllegalAccessException e) {
            reportFailure(className, file, e);
        } catch (InvocationTargetException e) {
            reportFailure(className, file, e);
        }
        return parser;
    }

    /** Prints which parser class failed for which file, followed by the stack trace. */
    private static void reportFailure(String className, File file, Exception e) {
        System.err.println("Cannot create parser " + className + " for " + file);
        e.printStackTrace();
    }
}
测试类
package com.cs;

import java.io.File;

/**
 * Command-line demo: asks {@link ParserFactory} for a parser matching a
 * file's extension and prints the content the parser returns.
 */
public class Test {

    /**
     * Parses one file and prints its content to standard output.
     *
     * @param args optional; when {@code args[0]} is present it is used as the
     *             path of the file to parse, otherwise the built-in sample
     *             presentation is used
     */
    public static void main(String[] args) {
        String path = "E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\搜索引擎-基础.ppt";
        if (args != null && args.length > 0) {
            path = args[0];
        }

        // Other sample files that can be tried the same way:
        // parser = ParserFactory.getParser(new File("E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\123.pdf"));
        // parser = ParserFactory.getParser(new File("E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\www.htm"));
        // parser = ParserFactory.getParser(new File("E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\XPDF使用文档.doc"));
        // parser = ParserFactory.getParser(new File("E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\文档.txt"));
        // parser = ParserFactory.getParser(new File("E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\BaseItem.html"));
        Parsable parser = ParserFactory.getParser(new File(path));

        // getParser returns null when it has no usable parser for the file;
        // report that instead of failing with a bare NullPointerException.
        if (parser == null) {
            System.err.println("No parser available for file: " + path);
            return;
        }
        System.out.println(" content : " + parser.getContent());
    }
}