Lucene5.x 中文同义词

查了好多资料，英文同义词的例子都能正常工作，但中文就不行。多谢网友支持，我拼接了不少代码并做了一些修改；不足之处，欢迎指正。

直接上代码吧，在代码中了解怎么分词的最好

1，定义同义词引擎接口

 /**
  * Lookup contract for synonyms: given a term, supplies the words that
  * should be treated as equivalent to it.
  */
 public interface SamewordContext {

     /**
      * Looks up the synonyms of a term.
      *
      * @param name the term to look up
      * @return the synonyms of {@code name}; what is returned for a term
      *         without synonyms is decided by the implementation
      */
     String[] getSamewords(String name);
 }

2，同义词引擎的简单实现

 import java.util.HashMap;

 import java.util.Map;

 /**
  * {@link SamewordContext} backed by a small in-memory table that is filled
  * with sample entries when the object is constructed.
  */
 public class SimpleSamewordContext implements SamewordContext {

     /** Synonym table: term mapped to the array of its synonyms. */
     Map<String, String[]> maps = new HashMap<>();

     /** Builds the context and registers the sample synonym entries. */
     public SimpleSamewordContext() {
         register("中国", "天朝", "大陆");
         register("我家", "family", "伐木累");
     }

     /** Stores {@code synonyms} in the table under {@code term}. */
     private void register(String term, String... synonyms) {
         maps.put(term, synonyms);
     }

     /**
      * Returns the synonyms registered for a term.
      *
      * @param name the term to look up
      * @return the array stored in the table for {@code name} (the stored
      *         instance itself, not a copy), or {@code null} when the table
      *         has no entry for it
      */
     @Override
     public String[] getSamewords(String name) {
         String[] synonyms = maps.get(name);
         return synonyms;
     }
 }

3,TokenFilter

import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.Stack;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * Token filter that injects synonyms into a token stream.
 *
 * <p>Every token of the wrapped stream is passed through unchanged. When the
 * {@link SamewordContext} knows synonyms for a token, those synonyms are
 * emitted right after it, each with a position increment of 0 and with all
 * other attributes restored from the original token. Synonyms come out in
 * the reverse of their order in the array returned by the context, because
 * they are kept on a LIFO stack.
 */
public class MySameTokenFilter extends TokenFilter {

    /** Term text of the token currently exposed by this filter. */
    private final CharTermAttribute cta;

    /** Position increment of the token currently exposed by this filter. */
    private final PositionIncrementAttribute pia;

    /** Attribute snapshot of the last token that had synonyms, or {@code null}. */
    private AttributeSource.State current;

    /** Synonyms of the token captured in {@link #current} that are still to be emitted. */
    private final Deque<String> sames = new ArrayDeque<String>();

    /** Source of synonyms for each token. */
    private final SamewordContext samewordContext;

    /**
     * Creates the filter.
     *
     * @param input           the token stream to decorate
     * @param samewordContext the synonym lookup consulted for every token
     */
    protected MySameTokenFilter(TokenStream input, SamewordContext samewordContext) {
        super(input);
        cta = addAttribute(CharTermAttribute.class);
        pia = addAttribute(PositionIncrementAttribute.class);
        this.samewordContext = samewordContext;
    }

    /**
     * Advances to the next token: a pending synonym if there is one,
     * otherwise the next token of the wrapped stream.
     *
     * <p>Declared {@code final} because Lucene requires non-final
     * TokenStream classes to have a final {@code incrementToken()} and
     * checks this when assertions are enabled.
     *
     * @return {@code true} if a token was produced, {@code false} at end of stream
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (!sames.isEmpty()) {
            String synonym = sames.pop();
            // Start from the original token's attributes, then swap in the
            // synonym text and stack it on the original token's position.
            restoreState(current);
            cta.setEmpty();
            cta.append(synonym);
            pia.setPositionIncrement(0);
            return true;
        }

        if (!input.incrementToken()) {
            return false;
        }

        if (addSames(cta.toString())) {
            // Remember this token's attributes so the queued synonyms can reuse them.
            current = captureState();
        }
        return true;
    }

    /**
     * Resets the wrapped stream and discards synonym state left over from a
     * previous, possibly unfinished, pass so it cannot leak into the next one.
     *
     * @throws IOException if the wrapped stream fails to reset
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        sames.clear();
        current = null;
    }

    /**
     * Queues the synonyms of {@code name}, if any.
     *
     * @param name the term text of the current token
     * @return {@code true} if at least one synonym was queued
     */
    private boolean addSames(String name) {
        String[] sws = samewordContext.getSamewords(name);
        if (sws == null) {
            return false;
        }
        boolean queued = false;
        for (String str : sws) {
            // A null entry would otherwise be emitted as the literal text "null".
            if (str != null) {
                sames.push(str);
                queued = true;
            }
        }
        return queued;
    }
}

4,Analyzer

import java.io.Reader;

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;

import org.apache.lucene.analysis.core.LowerCaseFilter;

import org.apache.lucene.analysis.core.StopAnalyzer;

import org.apache.lucene.analysis.core.StopFilter;

import org.wltea.analyzer.lucene.IKTokenizer;

import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;

/**
 * Analyzer whose chain is: {@code IKTokenizer}, then {@code MySameTokenFilter}
 * (synonym injection), then {@code LowerCaseFilter}, then {@code StopFilter}
 * with the English stop-word set.
 *
 * <p>NOTE(review): the class extends {@code MMSegAnalyzer} but
 * {@link #createComponents(String)} builds its own chain around
 * {@code IKTokenizer} and never calls the superclass version; confirm the
 * base class is intended.
 */
public class MySameworkAnalyzer extends MMSegAnalyzer {

    /** Synonym lookup handed to every {@code MySameTokenFilter} this analyzer creates. */
    private SamewordContext samewordContext;

    /**
     * Creates the analyzer.
     *
     * @param samewordContext the synonym lookup used by the synonym filter
     */
    public MySameworkAnalyzer(SamewordContext samewordContext) {
        this.samewordContext = samewordContext;
    }

    /**
     * Builds the tokenizer and filter chain.
     *
     * <p>NOTE(review): Lucene's {@code Analyzer.createComponents} documents
     * its argument as the field name, while this code wraps the argument in
     * a {@code StringReader} and passes it to the tokenizer as if it were
     * the text. Verify against the Lucene and IKTokenizer versions in use.
     *
     * @param text the string wrapped in a reader for the tokenizer (see note above)
     * @return the components made of the tokenizer and the end of the filter chain
     */
    @Override
    protected TokenStreamComponents createComponents(String text) {
        // The boolean presumably selects IK's smart segmentation mode; TODO confirm.
        IKTokenizer source = new IKTokenizer(new StringReader(text), true);

        // Order matters: synonyms are looked up on the raw tokens, then
        // everything is lower-cased, then English stop words are dropped.
        TokenStream withSynonyms = new MySameTokenFilter(source, samewordContext);
        TokenStream lowerCased = new LowerCaseFilter(withSynonyms);
        TokenStream withoutStopWords =
                new StopFilter(lowerCased, StopAnalyzer.ENGLISH_STOP_WORDS_SET);

        return new TokenStreamComponents(source, withoutStopWords);
    }
}

5，测试

/**
 * Runs the synonym analyzer over a sample sentence and hands the result to
 * {@code AnalyzerUtils.displayAllToken} for display.
 */
@Test
    public void test01() {
        Analyzer synonymAnalyzer = new MySameworkAnalyzer(new SimpleSamewordContext());
        AnalyzerUtils.displayAllToken("我家在中国", synonymAnalyzer);
    }

运行结果：

巴特西

Lucene5.x 中文同义词

最新文章

热门文章

巴特西

Lucene5.x 中文 同义词

最新文章

热门文章

Lucene5.x 中文同义词