Solr 获取分词

个人博客:http://demi-panda.com

Solr 1.4 开始支持对字段进行分词分析。FieldAnalysisRequestHandler 可以使用某个字段或字段类型所配置的分词器,对查询串进行分词,并返回分词结果。

用 solr 的默认配置,如 solr 1.4.0。

我用 mmseg4j 为例。在 solr.root/example/solr/conf/schema.xml 的 types 元素内加:

  <!-- Chinese text field type: tokenizes content with the mmseg4j tokenizer factory. -->
  <fieldType name="text_cn" class="solr.TextField" positionIncrementGap="100">
    <analyzer>
      <tokenizer class="com.chenlb.mmseg4j.solr.MMSegTokenizerFactory"/>
    </analyzer>
  </fieldType>

把分词器的 jar 包(这里是 IKAnalyzer-3.2.8.jar)放到 solr.home/example/solr/lib 下;如果没有 lib 目录,就创建一个。更多关于在 solr 中使用中文分词器的内容请看:solr 中文分词 IKAnalyzer 使用例子。

例如,要查看“北京烤鸭”的分词结果,可以访问:http://localhost:6091/solr-web-shop/shop/analysis/field?q=北京烤鸭&analysis.fieldtype=text&indent=on&wt=json

{

  • responseHeader: {
    • status: 0,
    • QTime: 3
    },
  • analysis: {
    • field_types: {
      • text: {
        • query: [
          • "org.wltea.analyzer.lucene.IKTokenizer",
          • [
            • {
              • text: "北京烤鸭",
              • start: 0,
              • end: 4,
              • position: 1,
              • positionHistory: [
                • 1
                ],
              • type: "word"
              },
            • {
              • text: "北京烤",
              • start: 0,
              • end: 3,
              • position: 2,
              • positionHistory: [
                • 2
                ],
              • type: "word"
              },
            • {
              • text: "北京",
              • start: 0,
              • end: 2,
              • position: 3,
              • positionHistory: [
                • 3
                ],
              • type: "word"
              },
            • {
              • text: "烤鸭",
              • start: 2,
              • end: 4,
              • position: 4,
              • positionHistory: [
                • 4
                ],
              • type: "word"
              }
            ],
          • "org.apache.lucene.analysis.synonym.SynonymFilter",
          • "org.apache.lucene.analysis.StopFilter",
          • "org.apache.lucene.analysis.LowerCaseFilter",
          • "org.apache.solr.analysis.RemoveDuplicatesTokenFilter",
          ]
        }
      },
    • field_names: { }
    }

}

代码实现:

/**
 * Sends a field-analysis request to the Solr core at
 * {@code http://localhost:6091/solr-web-shop/shop} and prints the text of every
 * token produced for the query string in each query-time analysis phase of the
 * {@code shopName} field.
 *
 * @param args command-line arguments (unused)
 * @throws MalformedURLException if the Solr base URL is malformed
 * @throws SolrServerException   if the analysis request fails on the Solr side
 * @throws IOException           if communication with the Solr server fails
 */
public static void main(String[] args) throws MalformedURLException, SolrServerException, IOException {
    CommonsHttpSolrServer solrServer = new CommonsHttpSolrServer("http://localhost:6091/solr-web-shop/shop");

    FieldAnalysisRequest request = new FieldAnalysisRequest("/analysis/field");
    request.addFieldName("shopName");
    // NOTE(review): "text" looks like a placeholder for the index-time value to
    // analyze (the request presumably requires one) — confirm it is not meant to
    // be a field type name.
    request.setFieldValue("text");
    request.setQuery("北京烤鸭");

    FieldAnalysisResponse response = request.process(solrServer);
    System.out.println(response.toString());

    // Fail with a descriptive message instead of a bare NullPointerException when
    // the response carries no analysis for the requested field.
    FieldAnalysisResponse.Analysis analysis = response.getFieldNameAnalysis("shopName");
    if (analysis == null) {
        throw new IllegalStateException("No analysis returned for field: shopName");
    }

    // Walk each query-time phase (tokenizer, then each filter) and print its tokens.
    for (AnalysisPhase phase : analysis.getQueryPhases()) {
        for (TokenInfo info : phase.getTokens()) {
            System.out.println(" text : " + info.getText());
        }
    }
}