Lucene 如何解析 Word(.doc)文档
将 poi-scratchpad-3.0.2-FINAL-20080204.jar 加入到项目的 lib 目录下
package com.cs; public interface Parsable { public String getTitle() ; public String getContent() ; public String getSummary() ; }
package com.cs;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;

import org.apache.poi.hwpf.extractor.WordExtractor;

/**
 * {@link Parsable} implementation for a Word document: the text is extracted
 * with POI's {@link WordExtractor} and cached after the first successful read.
 */
public class DocParser implements Parsable {

    /** Maximum number of characters returned by {@link #getSummary()}. */
    private static final int SUMMARY_LENGTH = 200;

    private final File file;

    /** Extracted text; stays {@code null} until a read succeeds. */
    private String content;

    /**
     * @param file the Word document to parse
     */
    public DocParser(File file) {
        this.file = file;
    }

    /**
     * Returns the text of the document, extracting it on the first call and
     * serving the cached value afterwards.
     *
     * @return the extracted text, or {@code null} if the file could not be
     *         opened or read (the stack trace is printed in that case)
     */
    @Override
    public String getContent() {
        if (content != null) {
            return content;
        }
        try {
            InputStream is = new FileInputStream(file);
            String text;
            try {
                text = new WordExtractor(is).getText();
            } finally {
                // Always release the file handle, even when extraction fails.
                is.close();
            }
            content = text;
            return content;
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so one handler covers both.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Returns the first {@value #SUMMARY_LENGTH} characters of the content, or
     * the whole content when it is shorter than that.
     *
     * @return the summary, or {@code null} if the content could not be read
     */
    @Override
    public String getSummary() {
        String text = getContent();
        if (text == null) {
            // Reading failed; mirror getContent() instead of throwing an NPE.
            return null;
        }
        if (text.length() > SUMMARY_LENGTH) {
            return text.substring(0, SUMMARY_LENGTH);
        }
        return text;
    }

    /**
     * @return the file name of the document
     */
    @Override
    public String getTitle() {
        return file.getName();
    }

    /** Small manual check: prints the content of a sample document. */
    public static void main(String[] args) {
        DocParser docParser = new DocParser(
                new File("E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\XPDF使用文档.doc"));
        System.out.println("doc content : " + docParser.getContent());
    }
}
TXT 文本文件的解析
package com.cs;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;

/**
 * {@link Parsable} implementation for a plain-text file: the file is read line
 * by line and the result is cached after the first successful read.
 */
public class TextParser implements Parsable {

    /** Maximum number of characters returned by {@link #getSummary()}. */
    private static final int SUMMARY_LENGTH = 200;

    private final File file;

    /** File text; stays {@code null} until a read succeeds. */
    private String content;

    /**
     * @param file the text file to parse
     */
    public TextParser(File file) {
        this.file = file;
    }

    /**
     * Returns the text of the file, reading it on the first call and serving
     * the cached value afterwards. Every line is terminated with {@code "\n"},
     * including the last one.
     *
     * @return the file text, or {@code null} if the file could not be opened
     *         or read (the stack trace is printed in that case)
     */
    @Override
    public String getContent() {
        if (content != null) {
            return content;
        }
        try {
            // NOTE(review): no charset is given, so the platform default is used;
            // kept as-is because the encoding of the input files is not known here.
            BufferedReader reader =
                    new BufferedReader(new InputStreamReader(new FileInputStream(file)));
            String text;
            try {
                StringBuilder sb = new StringBuilder();
                String line;
                while ((line = reader.readLine()) != null) {
                    sb.append(line).append("\n");
                }
                text = sb.toString();
            } finally {
                // Always release the file handle, even when reading fails.
                reader.close();
            }
            content = text;
            return content;
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so one handler covers both.
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Returns the first {@value #SUMMARY_LENGTH} characters of the content, or
     * the whole content when it is shorter than that.
     *
     * @return the summary, or {@code null} if the content could not be read
     */
    @Override
    public String getSummary() {
        String text = getContent();
        if (text == null) {
            // Reading failed; mirror getContent() instead of throwing an NPE.
            return null;
        }
        if (text.length() > SUMMARY_LENGTH) {
            return text.substring(0, SUMMARY_LENGTH);
        }
        return text;
    }

    /**
     * @return the file name of the text file
     */
    @Override
    public String getTitle() {
        return file.getName();
    }

    /** Small manual check: prints the content of a sample text file. */
    public static void main(String[] args) {
        TextParser textParser = new TextParser(
                new File("E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\文档.txt"));
        System.out.println("text content : " + textParser.getContent());
    }
}
相关推荐
renjinlong 2020-09-03
Jacry 2020-07-04
IceStreamLab 2020-06-26
mengyue 2020-06-09
PasserbyX 2020-05-16
mameng 2020-05-12
心丨悦 2020-05-06
编码之路 2020-05-03
mengyue 2020-05-02
qiuzhuoxian 2020-02-23
编码之路 2020-02-20
lionelf 2020-02-03
TyCoding 2020-02-01
heniancheng 2020-01-31
某某某 2020-01-30
PinkBean 2020-01-29
某某某 2020-01-12
编码之路 2020-01-01
itmale 2020-01-01