Lucene 第一个Lucene例子
第一个 Lucene 例子，使用 lucene-4.0.0。已知问题：中文查询没有结果。
1.创建索引
package lucene.index; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.LongField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; /** * 创建文档索引 * 步骤1:创建Lucene Index Writer * 步骤2:索引文档 */ public class Indexer { /* * 创建索引的目录 */ private String indexDir = "F:/project/Lucene/index"; /* * 文档目录 */ private String dataDir = "F:/project/Lucene/docs"; /* * 是否第一次创建索引 */ private boolean create = true; /* * 这个类负责创建索引或打开已有索引,以及向索引中添加、删除或更新被索引文档的信息。 提供针对索引文件的写入操作,但不能读取或搜索索引。 */ private IndexWriter writer; /** * 创建Lucene Index Writer * 步骤1:Directory创建索引存放的位置 * 步骤2:创建分析器Analyzer * 步骤3:配置IndexWriterConfig,使用分析器Analyzer * 步骤4:创建IndexWriter,使用Directory和IndexWriterConfig */ public Indexer() throws IOException { /* * 它是一个抽象类,它的子类负责具体指定索引的存储路径。 */ Directory dir = FSDirectory.open(new File(indexDir)); /* * 分析器,它负责从被索引文本文件中提取语汇单元,并剔除剩下的无用信息。 */ Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer); if (create) { iwc.setOpenMode(OpenMode.CREATE); } else { iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); } iwc.setInfoStream(System.out); writer = new IndexWriter(dir, iwc); } /** * 关闭Lucene Index Writer */ public void close() throws IOException { writer.close(); } /** * 索引文档 * 步骤1:找到文档目录下所有文件 * 步骤2:循环每个文档,如果是txt文档则步骤3,否则继续循环,或到步骤6 * 
步骤3:文档作为输入流FileInputStream,创建Document,为Document添加多个域 * 步骤4:创建或更新索引文档 * 步骤5:关闭输入流 * 步骤6:返回索引文档的数目 */ public int index() throws Exception { File[] files = new File(dataDir).listFiles(); for (File f : files) { FileInputStream fis = null; try { /* * 只索引目录下所有txt文档 */ if (!f.isDirectory() && !f.isHidden() && f.exists() && f.canRead() && f.getName().toLowerCase().endsWith(".txt")) { System.out.println("Indexing " + f.getCanonicalPath()); fis = new FileInputStream(f); /* * Document对象代表Field的集合。文档的Field代表文档或文档相关的一些元数据。 */ Document doc = new Document(); /* * TextField、StringField、LongField等Field是包含能被索引的文本内容的类。每个Field包含一个名称和值,以及一组选项来控制Lucene索引操作各个域值。 */ doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8")))); doc.add(new StringField("filename", f.getName(), Field.Store.YES)); doc.add(new StringField("fullpath", f.getCanonicalPath(), Field.Store.YES)); doc.add(new LongField("modified", f.lastModified(), Field.Store.NO)); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { System.out.println("adding " + f); writer.addDocument(doc); } else { System.out.println("updating " + f); writer.updateDocument(new Term("path", f.getPath()), doc); } } } finally { if (fis != null) { fis.close(); } } } return writer.numDocs(); } public static void main(String[] args) throws Exception { Indexer indexer = null; int numIndexed; long start = System.currentTimeMillis(); try { indexer = new Indexer(); numIndexed = indexer.index(); } finally { if (indexer != null) { indexer.close(); } } long end = System.currentTimeMillis(); System.out.println("Indexing " + numIndexed + " files took " + (end - start) + " milliseconds"); } }
2.搜索
package lucene.index; import java.io.File; import java.io.IOException; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; /** * 搜索文档 * 步骤1:创建IndexReader * 步骤2:创建IndexSearcher * 步骤3:创建Query * 步骤4:搜索searcher.search */ public class Searcher { /* * 索引存放目录 */ private String indexDir = "F:/project/Lucene/index"; /** * 搜索 * * @param 搜索的域名 * ,如contents或filename * @param 搜索的值 */ public void search(String where, String q) throws IOException, ParseException { IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexDir))); /* * 用于搜索由IndexWriter类创建的索引 */ IndexSearcher searcher = new IndexSearcher(reader); /* * Query 方法一 */ Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40); QueryParser parser = new QueryParser(Version.LUCENE_40, where, analyzer); /* * Lucene含有许多具体的Query子类,TermQuery、BooleanQuery、PhraseQuery、PrefixQuery、PhrasePrefixQuery、TermRangeQuery、NumericRangeQuery、FilteredQuery和SpanQuery * */ Query query1 = parser.parse(q); /* * Query 方法二 */ /* * TermQuery是Lucene提供的最基本的查询类型,也是简单查询类型之一。用来匹配指定域中包含特定项的文档。 */ Query query2 = new TermQuery(new Term(where, q)); long start = System.currentTimeMillis(); /* * 一个简单的指针容器,指向前N个排名的搜索结果。 */ TopDocs hits = searcher.search(query1, null, 10); long end = System.currentTimeMillis(); System.err.println("Found " + hits.totalHits + " document(s) (in " 
+ (end - start) + " milliseconds) that matched query '" + q + "':"); for (ScoreDoc scoreDoc : hits.scoreDocs) { Document doc = searcher.doc(scoreDoc.doc); System.out.println(doc.get("fullpath")); System.out.println(doc.get("filename")); } } public static void main(String[] args) throws IOException, ParseException { Searcher searcher = new Searcher(); searcher.search("filename", "b.txt"); searcher.search("contents", "abc"); } }
相关推荐
Jacry 2020-07-04
renjinlong 2020-09-03
IceStreamLab 2020-06-26
mengyue 2020-06-09
PasserbyX 2020-05-16
mameng 2020-05-12
心丨悦 2020-05-06
编码之路 2020-05-03
mengyue 2020-05-02
qiuzhuoxian 2020-02-23
编码之路 2020-02-20
lionelf 2020-02-03
TyCoding 2020-02-01
heniancheng 2020-01-31
某某某 2020-01-30
PinkBean 2020-01-29
某某某 2020-01-12
编码之路 2020-01-01
itmale 2020-01-01