Writing a custom Lucene tokenizer
Thanks to http://qindongliang1922.iteye.com/blog/1927605; that article is what made this finally click for me.
If you are studying Lucene, I strongly recommend downloading the source code.
The code below originally had a bug that took me a while to figure out: after indexing with this tokenizer, searches returned nothing. The culprits were tokenStart and tokenEnd, the two variables that record where each token sits in the input, i.e. its start and end character offsets. I'm not sure that's the most precise way to put it, but it gets the idea across.
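To make the offset handling concrete, here is the delimiter branch of incrementToken in both forms (a minimal sketch; the variable names match the full code below). Incrementing tokenEnd before recording the offsets leaves the end offset pointing one character past the token:

// buggy: tokenEnd has already been pushed past the delimiter, so the
// recorded end offset points one character too far to the right
tokenEnd++;
offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(tokenEnd));

// fixed: record the token's offsets first, then consume the delimiter
offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(tokenEnd));
tokenEnd++;

The full tokenizer, with this fix applied: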
package TEST;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeFactory;

public class My extends Tokenizer {

    private final StringBuilder buffer = new StringBuilder();
    // character offsets of the current token within the input
    private int tokenStart = 0, tokenEnd = 0;
    // every character in this string is treated as a token delimiter
    private final static String PUNCTION = " -()/";

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    public My(Reader reader) {
        super(reader);
    }

    public My(AttributeFactory factory, Reader input) {
        super(factory, input);
    }

    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();
        buffer.setLength(0);
        // the new token starts where the previous one (plus its delimiter) ended
        tokenStart = tokenEnd;
        int ci = readLowerCased();
        char ch = (char) ci;
        while (true) {
            if (ci == -1) {
                // end of input: emit whatever is left in the buffer, if anything
                if (buffer.length() == 0) {
                    return false;
                }
                termAtt.setEmpty().append(buffer);
                offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(tokenEnd));
                return true;
            } else if (PUNCTION.indexOf(ch) != -1) {
                if (buffer.length() > 0) {
                    // record the offsets BEFORE counting the delimiter;
                    // bumping tokenEnd first was the off-by-one described above
                    termAtt.setEmpty().append(buffer);
                    offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(tokenEnd));
                    tokenEnd++;
                    return true;
                }
                // leading delimiter: skip it and move the start mark forward
                tokenEnd++;
                tokenStart = tokenEnd;
                ci = readLowerCased();
                ch = (char) ci;
            } else {
                buffer.append(ch);
                tokenEnd++;
                ci = readLowerCased();
                ch = (char) ci;
            }
        }
    }

    // read one character, lower-casing ASCII upper-case letters ('A'..'Z')
    private int readLowerCased() throws IOException {
        int ci = input.read();
        if (ci > 64 && ci < 91) {
            ci += 32;
        }
        return ci;
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        tokenStart = tokenEnd = 0;
    }

    @Override
    public void end() throws IOException {
        super.end();
        final int finalOffset = correctOffset(tokenEnd);
        offsetAtt.setOffset(finalOffset, finalOffset);
    }
}
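A quick way to sanity-check the offsets (my own addition, not from the original article) is to pull the OffsetAttribute alongside the term while consuming the stream; the class name OffsetCheck is made up for illustration:

package TEST;

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class OffsetCheck {
    public static void main(String[] args) throws Exception {
        TokenStream ts = new My(new StringReader("ab cd"));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // with the fix, "ab" prints [0,2] and "cd" prints [3,5]
            System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
        }
        ts.end();
        ts.close();
    }
}

Offsets do not affect term matching itself, but anything that relies on them, such as a highlighter, will point at the wrong characters if the delimiter gets counted into the token.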
Next, wrap the tokenizer in an Analyzer:
package TEST;

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;

public class MyAnalyzer extends Analyzer {

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        return new TokenStreamComponents(new My(reader));
    }
}
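To see the analyzer end to end, here is a minimal index-and-search sketch. It is not from the original article; it assumes Lucene 4.10 (matching the Reader-based createComponents signature above), and the class name IndexSearchDemo plus the field name content are mine:

package TEST;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class IndexSearchDemo {
    public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_10_0, new MyAnalyzer());
        IndexWriter writer = new IndexWriter(dir, iwc);

        Document doc = new Document();
        doc.add(new TextField("content", "Norther 雪中悍刀行 AC DF-II-SDFzd(asd)/小时", Field.Store.YES));
        writer.addDocument(doc);
        writer.close();

        IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir));
        // the tokenizer lower-cases ASCII letters at index time,
        // so the indexed term is "norther", not "Norther"
        TopDocs hits = searcher.search(new TermQuery(new Term("content", "norther")), 10);
        System.out.println("hits: " + hits.totalHits); // expect 1
    }
}

Note that the query term is lower-case: since the tokenizer lower-cases ASCII letters at index time, a TermQuery for "Norther" would find nothing.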
Finally, a quick test:
package TEST;

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TestMy {

    public static void main(String[] args) throws Exception {
        MyAnalyzer ma = new MyAnalyzer();
        String str = "Norther 雪中悍刀行 河北邯郸 AC DF-II-SDFzd(asd)/小时";
        // MyChineseAnalyzer mc = new MyChineseAnalyzer(); // 三劫散仙's analyzer from the article above
        TokenStream ts = ma.tokenStream("field", new StringReader(str));
        CharTermAttribute c = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(c.toString());
        }
        ts.end();
        ts.close();
    }
}
Test output:
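Tracing the code by hand (I have not pasted an actual run here), the test should print one token per line, since the input is split on every character in PUNCTION and ASCII letters are lower-cased:

norther
雪中悍刀行
河北邯郸
ac
df
ii
sdfzd
asd
小时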