Lucene 如何解析 PPT 文档

加入 jar 包(poi-3.0.2-FINAL-20080204.jar、poi-contrib-3.0.2-FINAL-20080204.jar、poi-scratchpad-3.0.2-FINAL-20080204.jar)

package com.cs;

public interface Parsable {
	
	public String getTitle() ;
	public String getContent()  ;
	public String getSummary()  ;
}
package com.cs;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;

import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.SlideShow;

/**
 * {@link Parsable} implementation that extracts text from a PowerPoint file
 * through the POI HSLF API.
 *
 * The extracted text is cached in {@link #content} after the first successful
 * parse. A failed parse is not cached, so the next call tries again.
 */
public class PPTParser implements Parsable {

	/** Maximum number of characters returned by {@link #getSummary()}. */
	private static final int SUMMARY_MAX_LENGTH = 200;

	private File file;

	/** Cached extraction result; {@code null} until a parse succeeds. */
	private String content;

	/**
	 * @param file the PowerPoint file to read; it is opened lazily by
	 *             {@link #getContent()}, not by this constructor
	 */
	public PPTParser(File file) {
		this.file = file;
	}

	/**
	 * Returns the text of every slide: for each slide, all of its text runs
	 * followed by the slide title (when the slide has one), concatenated
	 * without separators.
	 *
	 * @return the extracted text, or {@code null} if the file could not be
	 *         opened or read (the stack trace is printed in that case)
	 */
	public String getContent() {

		if (content != null) {
			return content;
		}

		// HSLFSlideShow is the low-level PowerPoint reader; SlideShow is the
		// friendlier wrapper on top of it.
		InputStream is = null;
		try {
			is = new FileInputStream(file);

			SlideShow ss = new SlideShow(new HSLFSlideShow(is));
			Slide[] slides = ss.getSlides();

			StringBuffer sb = new StringBuffer();

			for (int i = 0; i < slides.length; i++) {
				// A TextRun is a run of text on a sheet or in a note.
				TextRun[] runs = slides[i].getTextRuns();
				for (int j = 0; j < runs.length; j++) {
					sb.append(runs[j].getText());
				}
				// A missing title comes back as null; appending it unguarded
				// would put the literal text "null" into the content.
				String title = slides[i].getTitle();
				if (title != null) {
					sb.append(title);
				}
			}
			content = sb.toString();
			return content;
		} catch (IOException e) {
			// Also covers FileNotFoundException, which is an IOException.
			// Best effort: report the failure and fall through to return null.
			e.printStackTrace();
		} finally {
			// Always release the file handle, whether or not parsing succeeded.
			if (is != null) {
				try {
					is.close();
				} catch (IOException ignored) {
					// Nothing useful can be done if close itself fails.
				}
			}
		}

		return null;
	}

	/**
	 * Returns at most the first {@value #SUMMARY_MAX_LENGTH} characters of
	 * {@link #getContent()}.
	 *
	 * @return the summary, or {@code null} when the content could not be
	 *         extracted
	 */
	public String getSummary() {
		String text = getContent();
		if (text == null) {
			// Parsing failed; mirror getContent() instead of throwing an NPE.
			return null;
		}

		if (text.length() > SUMMARY_MAX_LENGTH) {
			return text.substring(0, SUMMARY_MAX_LENGTH);
		}
		return text;
	}

	/**
	 * @return the name of the underlying file (not a title read from the
	 *         presentation itself)
	 */
	public String getTitle() {
		return file.getName();
	}

	/**
	 * Prints the extracted content of a presentation to standard output.
	 *
	 * @param args optional; {@code args[0]} is the path of the file to parse.
	 *             Without arguments the original sample path is used.
	 */
	public static void main(String[] args) {
		String path = args.length > 0
				? args[0]
				: "E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\搜索引擎-基础.ppt";
		PPTParser pptParser = new PPTParser(new File(path));
		System.out.println("ppt content : " + pptParser.getContent());
	}

}

相关推荐