lucene入门

exitzhang

2011-08-12

Lucene 是一个高性能、可伸缩的全文搜索工具包，可以使用它为你的应用程序添加索引和搜索能力。下面是一个建立索引并对索引进行查询的小例子。

package com.lamp.lucene.util;

import java.io.File;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumberTools;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.junit.Test;

/**
 * JUnit tests that drive {@link IndexDao}: indexing every {@code .txt} file under a
 * data directory, then deleting, updating and searching the resulting index.
 */
public class IndexDaoTest {

	// Directory containing the source files that should be indexed
	public String dataPath = "D:\\javaEEProject\\Lucene\\dataPath";

	// Paths of individual source files used by the delete/update tests
	public String filePath = "D:\\javaEEProject\\Lucene\\dataPath\\JDBC.txt";
	public String filePath2 = "D:\\javaEEProject\\Lucene\\dataPath\\反射机制.txt";

	public IndexDao indexDao = new IndexDao();

	/**
	 * Builds the index for every source file found under {@link #dataPath}.
	 */
	@Test
	public void testSave() throws Exception {
		traverseFile(new File(dataPath));
	}

	/**
	 * Walks {@code file} recursively and indexes every non-directory entry whose
	 * name ends with {@code .txt}.
	 *
	 * @param file a file or directory to process
	 * @throws Exception if a matching file cannot be converted into a document
	 */
	public void traverseFile(File file) throws Exception {
		if (file.isDirectory()) {
			File[] children = file.listFiles();
			// listFiles() returns null when the directory cannot be read
			if (children != null) {
				for (File child : children) {
					traverseFile(child);
				}
			}
			// A directory is never indexed itself, even if its name ends with ".txt"
			return;
		}
		if (file.getName().endsWith(".txt")) {
			Document doc = FileToDocument.fileToDocument(file);
			indexDao.save(doc);
		}
	}

	/**
	 * Deletes the index entries whose "path" field equals {@link #filePath}.
	 */
	@Test
	public void testDelete() {
		Term term = new Term("path", filePath);
		indexDao.delete(term);
	}

	/**
	 * Replaces the index entry for {@link #filePath} with a document whose
	 * content has been overwritten.
	 */
	@Test
	public void testUpdate() throws Exception {
		Term term = new Term("path", filePath);
		Document doc = FileToDocument.fileToDocument(filePath);
		doc.getField("content").setValue("这是更新后的文件内容");
		indexDao.update(term, doc);
	}

	/**
	 * Searches the index by keyword and prints every returned document.
	 */
	@Test
	public void testSearch() {
		String queryString = "操作";
		// search(queryString, 0, 10): start at the first match and return at most 10 records
		QueryResult qr = indexDao.search(queryString, 0, 10);
		System.out.println("总共有" + qr.getRecordCount() + "条匹配结果");
		for (Document doc : qr.getRecordList()) {
			System.out.println(doc.get("name"));
			System.out.println(doc.get("content"));
			System.out.println(NumberTools.stringToLong(doc.get("size")));
		}
	}

}

package com.lamp.lucene.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.NumberTools;

/**
 * Converts a file on disk into a Lucene {@link Document} with the fields
 * "name", "content", "size" and "path".
 */
public class FileToDocument {

	/**
	 * Builds a document for the file at the given path.
	 *
	 * @param dataPath path of the file to convert
	 * @return the document produced by {@link #fileToDocument(File)}
	 * @throws Exception if the file content cannot be read
	 */
	public static Document fileToDocument(String dataPath) throws Exception {
		// Delegate so both overloads always produce the same set of fields
		return fileToDocument(new File(dataPath));
	}

	/**
	 * Reads the whole file as text, one line at a time, ending every line with "\n".
	 * The file is decoded with the platform default charset (plain {@link FileReader}).
	 *
	 * @param file the file to read
	 * @return the file content
	 * @throws Exception if the file cannot be opened or read
	 */
	private static String getContent(File file) throws Exception {
		StringBuilder text = new StringBuilder();
		BufferedReader reader = new BufferedReader(new FileReader(file));
		try {
			String line;
			while ((line = reader.readLine()) != null) {
				text.append(line).append("\n");
			}
		} finally {
			// Always release the file handle, even when reading fails
			reader.close();
		}
		return text.toString();
	}

	/**
	 * Builds a document for the given file. "name" and "content" are stored and
	 * analyzed; "size" (encoded with {@link NumberTools#longToString(long)}) and
	 * "path" (the absolute path) are stored and indexed without analysis.
	 *
	 * @param file the file to convert
	 * @return the populated document
	 * @throws Exception if the file content cannot be read
	 */
	public static Document fileToDocument(File file) throws Exception {
		Document doc = new Document();
		doc.add(new Field("name", file.getName(), Store.YES, Index.ANALYZED));
		doc.add(new Field("content", getContent(file), Store.YES,
				Index.ANALYZED));
		doc.add(new Field("size", NumberTools.longToString(file.length()), Store.YES,
				Index.NOT_ANALYZED));
		doc.add(new Field("path", file.getAbsolutePath(), Store.YES,
				Index.NOT_ANALYZED));
		return doc;
	}

}

package com.lamp.lucene.util;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumberTools;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeFilter;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

/**
 * Data-access helper around a Lucene index stored at {@link #indexPath}: adds,
 * deletes and updates documents, and runs paged, highlighted searches.
 */
public class IndexDao {

	public String indexPath = "D:\\javaEEProject\\Lucene\\indexPath";

	// Dictionary-based word segmentation analyzer
	public Analyzer analyzer = new MMAnalyzer();

	// Maximum number of hits requested from the searcher in one query
	private static final int MAX_HITS = 10000;

	// Highlight fragment size, also the length of the fallback summary
	private static final int SUMMARY_LENGTH = 50;

	/**
	 * Adds a document to the index.
	 *
	 * @param doc the document to add
	 * @throws RuntimeException wrapping any failure while opening the writer or adding
	 */
	public void save(Document doc) {
		IndexWriter indexWriter = null;
		try {
			indexWriter = new IndexWriter(indexPath, analyzer,
					MaxFieldLength.LIMITED);
			indexWriter.addDocument(doc);
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			closeQuietly(indexWriter);
		}
	}

	/**
	 * Deletes every document that contains the given term.
	 * <p>
	 * A Term is the smallest unit of search and represents one keyword in one
	 * field, e.g. {@code new Term("title", "lucene")} or {@code new Term("id", "5")}.
	 *
	 * @param term identifies the documents to delete
	 * @throws RuntimeException wrapping any failure while opening the writer or deleting
	 */
	public void delete(Term term) {
		IndexWriter indexWriter = null;
		try {
			indexWriter = new IndexWriter(indexPath, analyzer,
					MaxFieldLength.LIMITED);
			indexWriter.deleteDocuments(term);
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			closeQuietly(indexWriter);
		}
	}

	/**
	 * Updates the index through {@code IndexWriter.updateDocument(term, doc)},
	 * replacing the documents matched by {@code term} with {@code doc}.
	 *
	 * @param term identifies the documents to replace
	 * @param doc  the replacement document
	 * @throws RuntimeException wrapping any failure while opening the writer or updating
	 */
	public void update(Term term, Document doc) {
		IndexWriter indexWriter = null;
		try {
			indexWriter = new IndexWriter(indexPath, analyzer,
					MaxFieldLength.LIMITED);
			indexWriter.updateDocument(term, doc);
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			closeQuietly(indexWriter);
		}
	}

	/**
	 * Parses {@code queryString} against the "name" and "content" fields and
	 * returns one page of results.
	 * <p>
	 * Callers can derive the page count from the returned record count:
	 *
	 * <pre>
	 * totalPage = recordCount / pageSize;
	 * if (recordCount % pageSize &gt; 0)
	 * 	totalPage++;
	 * </pre>
	 *
	 * @param queryString the text to search for
	 * @param firstResult zero-based index of the first hit to return
	 * @param maxResults  maximum number of hits to return
	 * @return the total hit count and the documents of the requested page
	 * @throws RuntimeException wrapping any parsing or search failure
	 */
	public QueryResult search(String queryString, int firstResult,
			int maxResults) {
		try {
			// Fields the query is evaluated against
			String[] fields = { "name", "content" };
			// Per-field boost; fields without an entry keep the default of 1f
			Map<String, Float> boosts = new HashMap<String, Float>();
			boosts.put("name", 5f);

			QueryParser queryParser = new MultiFieldQueryParser(fields,
					analyzer, boosts);
			// Step 1: turn the search text into a Query
			Query query = queryParser.parse(queryString);

			return search(query, firstResult, maxResults);
		} catch (Exception e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * Runs {@code query}, sorted ascending by the "size" field, and returns one
	 * page of results. The "content" value of each returned document is replaced
	 * by its best highlighted fragment, or by its first {@value #SUMMARY_LENGTH}
	 * characters when no fragment matches.
	 *
	 * @param query       the query to execute
	 * @param firstResult zero-based index of the first hit to return
	 * @param maxResults  maximum number of hits to return
	 * @return the total hit count and the documents of the requested page
	 * @throws RuntimeException wrapping any search failure
	 */
	public QueryResult search(Query query, int firstResult, int maxResults) {
		IndexSearcher indexSearcher = null;

		try {
			// Step 2: run the query
			indexSearcher = new IndexSearcher(indexPath);

			// Custom ordering: sort by document size (ascending by default;
			// new SortField("size", true) would sort descending)
			Sort sort = new Sort();
			sort.setSort(new SortField("size"));

			// No filter is applied. To restrict hits by file size, pass e.g.
			// new RangeFilter("size", NumberTools.longToString(200),
			// NumberTools.longToString(1000), true, true) instead of null.
			TopDocs topDocs = indexSearcher.search(query, null, MAX_HITS, sort);
			// Total number of matches
			int recordCount = topDocs.totalHits;
			ScoreDoc[] hits = topDocs.scoreDocs;
			List<Document> recordList = new ArrayList<Document>();

			// Highlighter: wraps matched keywords in markup so they stand out
			Formatter formatter = new SimpleHTMLFormatter("<font color='red'>",
					"</font>");
			Scorer scorer = new QueryScorer(query);
			Highlighter highlighter = new Highlighter(formatter, scorer);
			Fragmenter fragmenter = new SimpleFragmenter(SUMMARY_LENGTH);
			highlighter.setTextFragmenter(fragmenter);

			// Step 3: collect the requested page. Bound by the hits actually
			// returned: scoreDocs holds at most MAX_HITS entries even when
			// totalHits is larger.
			int end = Math.min(firstResult + maxResults, hits.length);
			for (int i = firstResult; i < end; i++) {
				// Load the stored document by its internal document number
				Document doc = indexSearcher.doc(hits[i].doc);
				String content = doc.get("content");
				// Documents without a stored "content" value are returned unchanged
				if (content != null) {
					// getBestFragment returns null when the keyword does not occur in the text
					String hc = highlighter.getBestFragment(analyzer, "content",
							content);
					if (hc == null) {
						// Fall back to at most the first SUMMARY_LENGTH characters
						hc = content.substring(0,
								Math.min(SUMMARY_LENGTH, content.length()));
					}
					doc.getField("content").setValue(hc);
				}
				recordList.add(doc);
			}

			return new QueryResult(recordCount, recordList);
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			// The searcher is null when its constructor failed; closing it then
			// would raise an NPE that hides the original exception
			if (indexSearcher != null) {
				try {
					indexSearcher.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Closes the writer if it was opened; a failure on close is printed, not rethrown.
	 */
	private void closeQuietly(IndexWriter indexWriter) {
		if (indexWriter == null) {
			return;
		}
		try {
			indexWriter.close();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}

package com.lamp.lucene.util;

import java.util.List;

import org.apache.lucene.document.Document;

/**
 * Holds the outcome of a search: the number of matching records and the list
 * of {@link Document}s returned.
 */
public class QueryResult {
	private int recordCount;
	private List<Document> recordList;

	/**
	 * @param recordCount number of matching records
	 * @param recordList  the documents returned
	 */
	public QueryResult(int recordCount, List<Document> recordList) {
		this.recordCount = recordCount;
		this.recordList = recordList;
	}

	/** @return number of matching records */
	public int getRecordCount() {
		return this.recordCount;
	}

	/** @return the documents returned */
	public List<Document> getRecordList() {
		return this.recordList;
	}

	/** @param recordCount number of matching records */
	public void setRecordCount(int recordCount) {
		this.recordCount = recordCount;
	}

	/** @param recordList the documents returned */
	public void setRecordList(List<Document> recordList) {
		this.recordList = recordList;
	}

}

当然，除了关键字查询外，还可以按文件大小范围进行查询，以及进行通配符查询、短语查询和布尔查询，示例如下：

package com.lamp.lucene.util;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumberTools;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.junit.Test;


/**
 * JUnit examples of the query types used with {@link IndexDao}: term, range,
 * wildcard, phrase and boolean queries.
 */
public class QueryTest {

	public IndexDao indexDao = new IndexDao();

	/**
	 * Term query: matches a single keyword in the given field.
	 */
	@Test
	public void testTermQuery() {
		Query query = new TermQuery(new Term("name", "反射"));
		printResult(indexDao.search(query, 0, 100));
	}

	/**
	 * Range query on file size. Comparing numeric strings of different lengths
	 * gives wrong results, so the bounds are encoded with
	 * NumberTools.longToString(long).
	 */
	@Test
	public void testRangeQuery() {
		printResult(indexDao.search(sizeRangeQuery(), 0, 100));
	}

	/**
	 * Wildcard query: '?' stands for one character, '*' for zero or more characters.
	 */
	@Test
	public void testWildcardQuery() {
		Query query = new WildcardQuery(new Term("name", "jd*"));
		printResult(indexDao.search(query, 0, 100));
	}

	/**
	 * Phrase query: finds two words that occur in the same field with some
	 * other words between them, e.g. two words remembered from a song lyric.
	 */
	@Test
	public void testPhraseQuery() {
		printResult(indexDao.search(contentPhraseQuery(), 0, 100));
	}

	/**
	 * Boolean query: combines several conditions, such as size and content.
	 */
	@Test
	public void testBoolQuery() {
		BooleanQuery booleanQuery = new BooleanQuery();
		// Occur.MUST: the clause has to match
		booleanQuery.add(sizeRangeQuery(), Occur.MUST);
		booleanQuery.add(contentPhraseQuery(), Occur.MUST);
		printResult(indexDao.search(booleanQuery, 0, 100));
	}

	/**
	 * Prints the record count followed by the name and content of each document.
	 */
	public void printResult(QueryResult result) {
		System.out.println("找到的匹配结果有:" + result.getRecordCount() + "条");
		for (Document hit : result.getRecordList()) {
			System.out.println(hit.get("name"));
			System.out.println(hit.get("content"));
		}
	}

	// Builds the inclusive range query on "size" between 200 and 1000
	private Query sizeRangeQuery() {
		Term lowerTerm = new Term("size", NumberTools.longToString(200));
		Term upperTerm = new Term("size", NumberTools.longToString(1000));
		return new RangeQuery(lowerTerm, upperTerm, true);
	}

	// Builds the phrase query on "content" for the two sample terms, slop 10
	private PhraseQuery contentPhraseQuery() {
		PhraseQuery phrase = new PhraseQuery();
		phrase.add(new Term("content", "反射"));
		phrase.add(new Term("content", "属性"));
		// Allow a maximum distance (slop) of 10 between the terms
		phrase.setSlop(10);
		return phrase;
	}

}

我使用的 Lucene 版本是 2.4，引入的 jar 包有 lucene-core-2.4.0.jar、lucene-highlighter-2.4.0.jar、lucene-analyzers-2.4.0.jar 和 je-analysis-1.5.3.jar。

lucene apache