实现像百度一样的自动补全功能

bxqybxqy

2013-11-27

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class Sugesstion {

private static final String GRAMMED_WORDS_FIELD = "words";

private static final String SOURCE_WORD_FIELD = "sourceWord";

private static final String COUNT_FIELD = "count";

     private static final String[] ENGLISH_STOP_WORDS = {
     "a", "an", "and", "are", "as", "at", "be", "but", "by",
     "for", "i", "if", "in", "into", "is",
     "no", "not", "of", "on", "or", "s", "such",
     "t", "that", "the", "their", "then", "there", "these",
     "they", "this", "to", "was", "will", "with"
     };

private final Directory autoCompleteDirectory;

private IndexReader autoCompleteReader;

private IndexSearcher autoCompleteSearcher;

     public Sugesstion(String autoCompleteDir) throws IOException {
     this.autoCompleteDirectory = FSDirectory.getDirectory(autoCompleteDir,
        null);

reOpenReader();
}

     public List<String> suggestTermsFor(String term) throws IOException {
     // get the top 5 terms for query
     Query query = new TermQuery(new Term(GRAMMED_WORDS_FIELD, term));
     Sort sort = new Sort(COUNT_FIELD, true);

     TopDocs docs = autoCompleteSearcher.search(query, null, 5, sort);
     List<String> suggestions = new ArrayList<String>();
     for (ScoreDoc doc : docs.scoreDocs) {
       suggestions.add(autoCompleteReader.document(doc.doc).get(
         SOURCE_WORD_FIELD));
     }

return suggestions;
}

     @SuppressWarnings("unchecked")
     public void reIndex(Directory sourceDirectory, String fieldToAutocomplete)
       throws CorruptIndexException, IOException {
     // build a dictionary (from the spell package)
     IndexReader sourceReader = IndexReader.open(sourceDirectory);

LuceneDictionary dict = new LuceneDictionary(sourceReader,
fieldToAutocomplete);

     // code from
     // org.apache.lucene.search.spell.SpellChecker.indexDictionary(
     // Dictionary)
     IndexReader.unlock(autoCompleteDirectory);

     // use a custom analyzer so we can do EdgeNGramFiltering
     IndexWriter writer = new IndexWriter(autoCompleteDirectory,
     new Analyzer() {
       public TokenStream tokenStream(String fieldName,
         Reader reader) {
        TokenStream result = new StandardTokenizer(reader);

        result = new StandardFilter(result);
        result = new LowerCaseFilter(result);
        result = new ISOLatin1AccentFilter(result);
        result = new StopFilter(result,
         ENGLISH_STOP_WORDS);
        result = new EdgeNGramTokenFilter(
         result, Side.FRONT,1, 20);

        return result;
       }
     }, true);

writer.setMergeFactor(300);
writer.setMaxBufferedDocs(150);

     // go through every word, storing the original word (incl. n-grams)
     // and the number of times it occurs
     Map<String, Integer> wordsMap = new HashMap<String, Integer>();

     Iterator<String> iter = (Iterator<String>) dict.getWordsIterator();
     while (iter.hasNext()) {
       String word = iter.next();

       int len = word.length();
       if (len < 3) {
        continue; // too short we bail but "too long" is fine...
       }

       if (wordsMap.containsKey(word)) {
        throw new IllegalStateException(
          "This should never happen in Lucene 2.3.2");
        // wordsMap.put(word, wordsMap.get(word) + 1);
       } else {
        // use the number of documents this word appears in
        wordsMap.put(word, sourceReader.docFreq(new Term(
          fieldToAutocomplete, word)));
       }
     }

     for (String word : wordsMap.keySet()) {
       // ok index the word
       Document doc = new Document();
       doc.add(new Field(SOURCE_WORD_FIELD, word, Field.Store.YES,
         Field.Index.UN_TOKENIZED)); // orig term
       doc.add(new Field(GRAMMED_WORDS_FIELD, word, Field.Store.YES,
         Field.Index.TOKENIZED)); // grammed
       doc.add(new Field(COUNT_FIELD,
         Integer.toString(wordsMap.get(word)), Field.Store.NO,
         Field.Index.UN_TOKENIZED)); // count

writer.addDocument(doc);
}

sourceReader.close();

     // close writer
     writer.optimize();
     writer.close();

     // re-open our reader
     reOpenReader();
     }

     private void reOpenReader() throws CorruptIndexException, IOException {
     if (autoCompleteReader == null) {
       autoCompleteReader = IndexReader.open(autoCompleteDirectory);
     } else {
       autoCompleteReader.reopen();
     }

autoCompleteSearcher = new IndexSearcher(autoCompleteReader);
}

public static void main(String[] args) throws Exception {
Sugesstion autocomplete = new Sugesstion("/index/autocomplete");

     // run this to re-index from the current index, shouldn't need to do
     // this very often
     // autocomplete.reIndex(FSDirectory.getDirectory("/index/live", null),
     // "content");

String term = "steve";

     System.out.println(autocomplete.suggestTermsFor(term));
     // prints [steve, steven, stevens, stevenson, stevenage]
     }

}

apache lucene

安科网

实现像百度一样的自动补全功能

bxqybxqy

bxqybxqy

相关推荐

MAC OS 10.15 Lucene 源码分析环境搭建

.NET Core下使用Kafka的方法步骤

解决PHPstudy Apache无法启动的问题【亲测有效】

Web安全：文件解析漏洞

终于有人把Nginx说清楚了，图文详解！

为什么Java仍将是未来的主流语言？

如何使用Apache Web服务器来安装和配置网站？

CentOS 8 Apache 安装后 SSL 重定向提示证书错误

如何使用 Apache Directory Studio 连接 JumpCloud

初学者和专业技术人员使用的十大机器学习软件

每个Java开发人员都应该知道的10大Github仓库

漫话：应用程序被拖慢？罪魁祸首竟然是Log4j！

JSP动态网页开发原理详解

centos8使用Apache httpd2.4.37安装web服务器的步骤详解

Tomcat启动springboot项目war包报错：启动子级时出错的问题

如何通过Apache在本地配置多个虚拟主机

Apache Shiro 反序列化(CVE-2016-4437)复现

Apache Shiro 反序列化(CVE-2016-4437)复现

Apache DolphinScheduler 诞生记

【Shiro】05 自定义Realm认证实现

bxqybxqy