lucene 查询示例

ReganHoo

2012-03-26

关注关注

排序：Lucene 默认按照相关度（score）排序。为了支持其他排序方式（例如按日期排序），在添加 Field 时必须保证该 field 被索引（indexed）且不被分词（tokenized），并且用于排序的字段只能是数字、日期、字符三种类型之一。

实体类

public class Article {

privateStringid;

privateStringtitle;

privateStringkeyWords;

privateStringcontent;

privateintorder;

省略set..get方法

}

组织数据

import java.util.ArrayList;
import java.util.List;

import com.company.project.entity.Article;


/**
 * Holder of the fixed sample articles used to build and query the demo index.
 */
public class DATAUTIls {
    /** Sample articles, populated once when the class is loaded. */
    public static List<Article> luceneDatas = new ArrayList<Article>();

    static {
        Article first = sample("1", "法眼看中国是怎么样的一个中国", "中国，中国，中国", "我们都是中国人", 1);
        Article second = sample("2", "法眼看中国是怎么样的一个中国 中国", "中国，中国", "我们是两个中国 中国", 2);
        Article third = sample("3", "法眼看怎么样的一个中国", "中国 ", "我们都是中国人", 3);
        Article fourth = sample("4", "法眼看", "无", "我们都是国中人", 4);

        // The list is filled in the order 1, 3, 2, 4 (not by ascending id).
        luceneDatas.add(first);
        luceneDatas.add(third);
        luceneDatas.add(second);
        luceneDatas.add(fourth);
    }

    /**
     * Creates one sample article with every property populated.
     */
    private static Article sample(String id, String title, String keyWords, String content, int order) {
        Article item = new Article();
        item.setContent(content);
        item.setId(id);
        item.setTitle(title);
        item.setKeyWords(keyWords);
        item.setOrder(order);
        return item;
    }

}

建造索引

import java.io.File;
import java.io.IOException;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.NRTCachingDirectory;
import org.apache.lucene.util.Version;

import com.company.project.entity.Article;


/**
 * Builds a Lucene index on disk from a list of {@link Article} objects.
 */
public class IndexRunner {
    /** Filesystem directory in which the index files are stored. */
    private String INDEX_STORe_PATH = "D:\\workplace\\company\\mylucene\\indexstore";

    /** Creates a runner that writes to the default index location. */
    public IndexRunner() {
    }

    /**
     * Creates a runner that writes to the given location, creating the
     * directory (including missing parents) when it does not exist yet.
     *
     * @param index_path directory in which the index is stored
     */
    public IndexRunner(String index_path) {
        this.INDEX_STORe_PATH = index_path;
        File dir = new File(index_path);
        // Only create the directory when it is missing.
        if (!dir.exists()) {
            dir.mkdirs();
        }
    }

    /**
     * Indexes every article in {@code datas} and merges the index down to a
     * single segment.
     *
     * @param datas    articles to index
     * @param isCreate {@code true} to overwrite any existing index,
     *                 {@code false} to create it if missing or append to it otherwise
     * @throws IOException if the index cannot be opened or written
     */
    public void createIndex(List<Article> datas, boolean isCreate) throws IOException {
        // Analyzer used to tokenize the ANALYZED fields.
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_34, analyzer);
        // CREATE replaces the existing index; CREATE_OR_APPEND adds to it.
        iwc.setOpenMode(isCreate ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);

        Directory dir = FSDirectory.open(new File(INDEX_STORe_PATH));
        NRTCachingDirectory cachedFSDir = new NRTCachingDirectory(dir, 5.0, 60.0);
        try {
            iwc.setMergeScheduler(cachedFSDir.getMergeScheduler());
            // The writer must be opened on the caching directory itself; opening
            // it on the wrapped directory would bypass the cache entirely.
            IndexWriter writer = new IndexWriter(cachedFSDir, iwc);
            try {
                for (Article article : datas) {
                    writer.addDocument(toDocument(article));
                }
                // Merge the scattered segment files into one segment.
                writer.forceMerge(1);
            } finally {
                // Always close the writer so the index write lock is released.
                writer.close();
            }
        } finally {
            // Closing the caching directory also closes the wrapped directory.
            cachedFSDir.close();
        }
        System.out.println("索引创建成功");
    }

    /**
     * Converts an article into a Lucene document.
     *
     * <p>"title" and "content" are stored and analyzed (tokenized) for full-text
     * search; "order" and "id" are stored and indexed as a single un-analyzed
     * term each.
     */
    private static Document toDocument(Article article) {
        Document doc = new Document();
        doc.add(new Field("title", article.getTitle(), Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("content", article.getContent(), Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("order", String.valueOf(article.getOrder()), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("id", String.valueOf(article.getId()), Field.Store.YES, Field.Index.NOT_ANALYZED));
        return doc;
    }

    /** Rebuilds the demo index from the sample data, overwriting any existing index. */
    public static void main(String[] args) {
        IndexRunner indexRunner = new IndexRunner();
        try {
            indexRunner.createIndex(DATAUTIls.luceneDatas, true);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}

查询

此处有三种查询，一种是多字段查询一个关键字，一种是多字段组合查询，还有一种是分页查询

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import com.company.project.entity.Article;


/**
 * Demonstrates three ways of querying the demo index: one keyword across
 * several fields, a combined multi-field query, and a paged query with
 * highlighting.
 */
public class SearchRunner {
    /** Filesystem directory that holds the index to search. */
    private static String PATH = "D:\\workplace\\company\\mylucene\\indexstore";

    public static void main(String[] arg) throws Exception {

        String[] queryFileds = { "title", "content" };
        String queryString = "中国";
        // Alternative demos:
        // SearchRunner.searchList(queryFileds, queryString);
        // SearchRunner.combinationSearch();

        SearchRunner.pagingSearch("中", null);
    }

    /**
     * Searches several fields for the same value and prints the title and
     * content of up to 100 matching documents.
     *
     * @param queryFileds names of the fields to search
     * @param queryString value to look for
     */
    public static void searchList(String[] queryFileds, String queryString) throws Exception {
        IndexReader reader = openReader();
        IndexSearcher searcher = new IndexSearcher(reader);
        try {
            Query query = LuceneUtils.createQuery(queryFileds, queryString);
            // No additional filtering of the results.
            Filter filter = null;
            // Maximum number of hits fetched in one search.
            int queryCount = 100;

            TopDocs results = searcher.search(query, filter, queryCount);
            System.out.println("总符合: " + results.totalHits + "条数！");

            for (ScoreDoc sr : results.scoreDocs) {
                // sr.doc is the internal document number; load the stored fields.
                Document doc = searcher.doc(sr.doc);
                System.out.println("inof = " + doc.get("title"));
                System.out.println("info2 = " + doc.get("content"));
            }
        } finally {
            close(searcher, reader);
        }
    }

    /**
     * Runs a fixed query that combines conditions on "content" and "title" and
     * prints up to 100 matching documents.
     */
    public static void combinationSearch() throws CorruptIndexException, IOException, ParseException {
        IndexReader reader = openReader();
        IndexSearcher searcher = new IndexSearcher(reader);
        try {
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);
            QueryParser parser = new QueryParser(Version.LUCENE_34, "content", analyzer);
            // The boolean operator AND must be written in upper case.
            Query query = parser.parse("content:中      国  AND title:中      国");
            // Maximum number of hits fetched in one search.
            int queryCount = 100;

            // Results are ordered by relevance; a Sort could be passed to
            // searcher.search(query, queryCount, sort) to order by a field instead.
            TopDocs results = searcher.search(query, queryCount);

            System.out.println("总符合: " + results.totalHits + "条数！");

            for (ScoreDoc sr : results.scoreDocs) {
                Document doc = searcher.doc(sr.doc);
                System.out.println("id="+doc.get("id")+"\torder="+doc.get("order")+"\ttitle = " + doc.get("title")+"\tcontent = " + doc.get("content"));
            }
        } finally {
            close(searcher, reader);
        }
    }

    /**
     * Paged search that skips the first two hits and returns the next two.
     *
     * <p>Equivalent to {@code pagingSearch(title, content, 2, 2)}.
     *
     * @param title   text to look for in the "title" field, or {@code null}/empty to ignore
     * @param content text to look for in the "content" field, or {@code null}/empty to ignore
     * @return map with the hit count under "content" and the page of articles under "data"
     */
    public static Map pagingSearch(String title, String content) throws CorruptIndexException, IOException, ParseException, InvalidTokenOffsetsException {
        return pagingSearch(title, content, 2, 2);
    }

    /**
     * Paged search over the "title" and/or "content" fields with highlighting.
     *
     * <p>When both criteria are given they are combined with AND. Fields that
     * were searched are returned as the best highlighted fragment of the stored
     * text (matches wrapped in a red font tag), falling back to the stored text
     * when nothing can be highlighted. Each article is also printed to stdout.
     *
     * @param title    text to look for in the "title" field, or {@code null}/empty to ignore
     * @param content  text to look for in the "content" field, or {@code null}/empty to ignore
     * @param offset   number of leading hits to skip (0 for the first page)
     * @param pageSize maximum number of hits to return
     * @return map with the hit count under "content" and the list of
     *         {@link Article} for the requested page under "data"; the list is
     *         empty when no criteria are given or the offset is past the last hit
     * @throws IllegalArgumentException if {@code offset} is negative or {@code pageSize} is not positive
     */
    public static Map pagingSearch(String title, String content, int offset, int pageSize) throws CorruptIndexException, IOException, ParseException, InvalidTokenOffsetsException {
        if (offset < 0) {
            throw new IllegalArgumentException("offset must not be negative: " + offset);
        }
        if (pageSize <= 0) {
            throw new IllegalArgumentException("pageSize must be positive: " + pageSize);
        }

        Map<String, Object> pager = new HashMap<String, Object>();
        List<Article> blogList = new ArrayList<Article>();

        String queryText = buildQueryText(title, content);
        if (queryText.length() == 0) {
            // Nothing to search for; an empty string cannot be parsed as a query.
            pager.put("content", 0);
            pager.put("data", blogList);
            return pager;
        }

        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);
        IndexReader reader = openReader();
        IndexSearcher indexSearch = new IndexSearcher(reader);
        try {
            QueryParser queryParser = new QueryParser(Version.LUCENE_34, "content", analyzer);
            Query query = queryParser.parse(queryText);

            TopDocs hits;
            if (offset == 0) {
                // First page: a plain top-N search.
                hits = indexSearch.search(query, pageSize);
            } else {
                // Fetch the hits to skip; the last one anchors the next page.
                TopDocs skipped = indexSearch.search(query, offset);
                if (skipped.scoreDocs.length < offset) {
                    // Fewer hits than the offset: the requested page is empty.
                    pager.put("content", skipped.totalHits);
                    pager.put("data", blogList);
                    return pager;
                }
                hits = indexSearch.searchAfter(skipped.scoreDocs[offset - 1], query, pageSize);
            }

            // One highlighter is enough for all hits of the same query.
            SimpleHTMLFormatter simplehtml = new SimpleHTMLFormatter("<font color='red'>", "</font>");
            Highlighter highlighter = new Highlighter(simplehtml, new QueryScorer(query));

            for (ScoreDoc sdoc : hits.scoreDocs) {
                Document doc = indexSearch.doc(sdoc.doc);
                String stitle = doc.get("title");
                String scontent = doc.get("content");

                Article article = new Article();
                article.setId(doc.get("id"));
                // Highlight the stored text of the fields that were searched.
                article.setTitle(hasText(title) ? highlight(highlighter, analyzer, "title", stitle) : stitle);
                article.setContent(hasText(content) ? highlight(highlighter, analyzer, "content", scontent) : scontent);

                System.out.println(article);
                blogList.add(article);
            }
            pager.put("content", hits.totalHits);
            pager.put("data", blogList);
        } finally {
            close(indexSearch, reader);
        }

        return pager;
    }

    /** Opens a reader on the index stored under {@link #PATH}. */
    private static IndexReader openReader() throws IOException {
        return IndexReader.open(FSDirectory.open(new File(PATH)));
    }

    /** Closes the searcher and then the reader, even if closing the searcher fails. */
    private static void close(IndexSearcher searcher, IndexReader reader) throws IOException {
        try {
            searcher.close();
        } finally {
            reader.close();
        }
    }

    /** Returns whether the given criterion is neither {@code null} nor empty. */
    private static boolean hasText(String value) {
        return value != null && value.length() > 0;
    }

    /** Builds the query string from the given criteria, joined with AND; empty when none is given. */
    private static String buildQueryText(String title, String content) {
        StringBuilder text = new StringBuilder();
        if (hasText(title)) {
            text.append("title:").append(title);
        }
        if (hasText(content)) {
            if (text.length() > 0) {
                text.append(" AND ");
            }
            text.append("content:").append(content);
        }
        return text.toString();
    }

    /** Returns the best highlighted fragment of {@code text}, or {@code text} itself when there is none. */
    private static String highlight(Highlighter highlighter, Analyzer analyzer, String field, String text) throws IOException, InvalidTokenOffsetsException {
        if (text == null) {
            return null;
        }
        TokenStream tokenStream = analyzer.tokenStream(field, new StringReader(text));
        String highLightText = highlighter.getBestFragment(tokenStream, text);
        return highLightText == null ? text : highLightText;
    }

}

lucene