Lucene5.x 中文 同义词
2024-08-25 00:57:24
查询好好多资料,英文同义词好好的,中文就不行,多谢网友支持,拼接了好多代码,然后修改了一些,不足之处,多谢指正。
直接上代码吧,在代码中了解怎么分词的最好
1,创建分词引擎
public interface SamewordContext {
String[] getSamewords(String name);
}
2,同义词
import java.util.HashMap;
import java.util.Map; public class SimpleSamewordContext implements SamewordContext {
Map<String,String[]> maps = new HashMap<String,String[]>();
public SimpleSamewordContext() {
maps.put("中国",new String[]{"天朝","大陆"});
maps.put("我家",new String[]{"family","伐木累"});
}
@Override
public String[] getSamewords(String name) {
// TODO Auto-generated method stub
return maps.get(name);
}
}
3,TokenFilter
import java.io.IOException;
import java.util.Stack; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource; public class MySameTokenFilter extends TokenFilter {
private CharTermAttribute cta = null;
private PositionIncrementAttribute pia = null;
private AttributeSource.State current;
private Stack<String> sames = null;
private SamewordContext samewordContext; protected MySameTokenFilter(TokenStream input,SamewordContext samewordContext) {
super(input);
cta = this.addAttribute(CharTermAttribute.class);
pia = this.addAttribute(PositionIncrementAttribute.class);
sames = new Stack<String>();
this.samewordContext = samewordContext;
} @Override
public boolean incrementToken() throws IOException {
if(sames.size()>0) {
//将元素出栈,并且获取这个同义词
String str = sames.pop();
//还原状态
restoreState(current);
cta.setEmpty();
cta.append(str);
//设置位置0
pia.setPositionIncrement(0);
return true;
} if(!this.input.incrementToken()) return false; if(addSames(cta.toString())) {
//如果有同义词将当前状态先保存
current = captureState();
}
return true;
} private boolean addSames(String name) {
String[] sws = samewordContext.getSamewords(name);
if(sws!=null) {
for(String str:sws) {
sames.push(str);
}
return true;
}
return false;
} }
4,Analyzer
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.wltea.analyzer.lucene.IKTokenizer; import com.chenlb.mmseg4j.analysis.MMSegAnalyzer; public class MySameworkAnalyzer extends MMSegAnalyzer {
private SamewordContext samewordContext; public MySameworkAnalyzer(SamewordContext samewordContext) {
// TODO Auto-generated constructor stub
this.samewordContext = samewordContext;
} @Override
protected TokenStreamComponents createComponents(String text) {
Reader in = new StringReader(text);
IKTokenizer tokenizer = new IKTokenizer(in , true);
TokenStream tokenStream = new MySameTokenFilter(tokenizer,
samewordContext);
tokenStream = new LowerCaseFilter(tokenStream);
tokenStream = new StopFilter(tokenStream,
StopAnalyzer.ENGLISH_STOP_WORDS_SET);
return new TokenStreamComponents(tokenizer, tokenStream);
} }
5,测试
@Test
public void test01() {
String text = "我家在中国";
Analyzer analyzer = new MySameworkAnalyzer(new SimpleSamewordContext());
AnalyzerUtils.displayAllToken(text,analyzer);
}
运行结果:
最新文章
- 《高性能javascript》一书要点和延伸(上)
- opencv计算运行时间
- 详解PHP输入流php://input
- css常用中文字体的英文名称写法
- Openstack学习历程_1_视频
- [数据库连接字符串] Access 连接字符串
- Servlet小知识点
- 登录MD5加盐处理
- zzuoj 10408: C.最少换乘【最短路dijkstra】
- Cocos2d-x3.1回调函数具体解释
- CSS3秘笈:第四章
- 初识Html:Html和CSS的关系
- 开源解析库 - JSON
- ResizeObserver - 元素resize监听API
- LeetCode专题-Python实现之第21题:Merge Two Sorted Lists
- 第十二节:Lambda、linq、SQL的相爱相杀(1)
- 一个小误区 JS中的contains
- 【作业三】结队任务二-----CourseManagement
- Aliyun cdn访问日志下载
- Unity 灯光系统详解