参考:

https://blog.csdn.net/u014209975/article/details/50525624

https://www.cnblogs.com/hanyinglong/p/5395600.html

http://lucene.apache.org/core/4_0_0/core/overview-summary.html

https://www.jianshu.com/p/0a2bbe0f4c42

依赖:

lucene-analyzers.jar
lucene-benchmark.jar
lucene-core.jar
lucene-highlighter.jar
lucene-memory.jar
lucene-parser.jar
lucene-remote.jar
lucene-smartcn.jar

实体类:

package com.h3c.lucence;

import java.io.Serializable;

/**
 * Searchable entity: an id, a type, a flattened "virtual document" used for
 * full-text indexing, plus the highlighted summary and relevance score of a
 * search hit.
 */
public class Entity implements Serializable {

    private static final long serialVersionUID = 3701082756628915138L;

    /** Opening highlight tag that is stripped before locating the summary in the virtual document. */
    private static final String HIGHLIGHT_OPEN_TAG = "<SPAN style=\"color:red;\">";

    /** Closing highlight tag that is stripped before locating the summary in the virtual document. */
    private static final String HIGHLIGHT_CLOSE_TAG = "</SPAN>";

    /** Marker added where the summary omits text of the virtual document. */
    private static final String ELLIPSIS = "...";

    private Integer id;

    private String type;

    private String virtualDoc;

    private String summary;

    private float score;

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    /**
     * Returns the virtual document used for full-text search.
     *
     * @return the virtual document, or {@code null} if none has been set
     */
    public String getVirtualDoc() {
        if (null == virtualDoc) {
            // TODO build the virtual document from this entity's values, covering every
            // attribute and its value, for use in full-text search.
            // Format: field1:value1,field2:value2,...
        }
        return virtualDoc;
    }

    public void setVirtualDoc(String virtualDoc) {
        this.virtualDoc = virtualDoc;
    }

    /**
     * Returns the stored summary, decorated with {@code "..."} on each side where
     * the virtual document has text outside the summary fragment.
     *
     * <p>The highlight tags are removed from a copy of the summary only to find
     * the fragment's position; the returned text keeps them.
     *
     * @return {@code null} if no summary has been set; the summary unchanged if
     *         there is no virtual document or the fragment cannot be found in it;
     *         otherwise the summary with leading and/or trailing ellipses
     */
    public String getSummary() {
        if (summary == null) {
            return null;
        }
        String document = getVirtualDoc();
        if (document == null) {
            // Nothing to compare against, so truncation cannot be determined.
            return summary;
        }

        String plainFragment = summary.replace(HIGHLIGHT_OPEN_TAG, "").replace(HIGHLIGHT_CLOSE_TAG, "");
        int start = document.indexOf(plainFragment);
        if (start < 0) {
            // Position unknown: do not guess at which side text was cut off.
            return summary;
        }

        StringBuilder sb = new StringBuilder();
        if (start > 0) {
            sb.append(ELLIPSIS);
        }
        sb.append(summary);
        if (start + plainFragment.length() < document.length()) {
            sb.append(ELLIPSIS);
        }
        return sb.toString();
    }

    public void setSummary(String summary) {
        this.summary = summary;
    }

    public float getScore() {
        return score;
    }

    public void setScore(float score) {
        this.score = score;
    }
}

Demo类:

package com.h3c.lucence;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Demo of indexing {@link Entity} objects with Lucene and running a
 * full-text search (optionally with highlighting) over their virtual documents.
 *
 * <p>All state is static: one shared index directory and one shared
 * {@link IndexWriter}, both created when the class is initialized.
 */
public class Demo {

    /** Lucene index directory; opened lazily by {@link #getCiIndexDir()}. */
    private static Directory ciIndexDir;

    /** Name of the analyzed field holding an entity's virtual document. */
    private static final String CI_CONTENT_FLAG = "virtualDoc";

    /**
     * Analyzer used for indexing and highlighting. The standard analyzer is used,
     * indexing single Chinese characters and runs of English letters as terms.
     */
    private static final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);

    private static final String ipv4Pattern = "(([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.){3}([01]?\\d\\d?|2[0-4]\\d|25[0-5])";
    private static final String ipv6Pattern = "([0-9a-f]{1,4}:){7}([0-9a-f]){1,4}";

    private static final Pattern VALID_IPV4_PATTERN = Pattern.compile(ipv4Pattern, Pattern.CASE_INSENSITIVE);
    private static final Pattern VALID_IPV6_PATTERN = Pattern.compile(ipv6Pattern, Pattern.CASE_INSENSITIVE);

    /** Shared writer; stays {@code null} if the index could not be opened. */
    private static IndexWriter indexWriter;

    static {
        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        try {
            Directory dir = getCiIndexDir();
            // getCiIndexDir() returns null when opening failed; skipping the writer here
            // keeps class initialization from failing with an unrelated NullPointerException.
            if (dir != null) {
                indexWriter = new IndexWriter(dir, conf);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the index directory, opening it on first use.
     *
     * @return the directory, or {@code null} if it could not be opened
     */
    private static Directory getCiIndexDir() {
        if (null == ciIndexDir) {
            try {
                ciIndexDir = FSDirectory.open(new File("D://indexs"));
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return ciIndexDir;
    }

    /**
     * Tells whether the whole string is an IPv4 address or a full (eight-group) IPv6 address.
     *
     * @param ipAddress text to test
     * @return {@code true} if it matches either pattern entirely
     */
    private static boolean isIpAddress(String ipAddress) {
        return VALID_IPV4_PATTERN.matcher(ipAddress).matches()
                || VALID_IPV6_PATTERN.matcher(ipAddress).matches();
    }

    /**
     * Tells whether a character belongs to one of the CJK ideograph blocks or to
     * the CJK / full-width / general punctuation blocks.
     *
     * <p>Not referenced by {@link #parseChineseCharacters(String)}, which treats
     * CJK characters and delimiters alike.
     *
     * @param c character to test
     * @return {@code true} if the character is in one of those Unicode blocks
     */
    private static boolean isChinese(char c) {
        Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
        return ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
                || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
                || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION;
    }

    /**
     * Builds the search query for a user input string.
     *
     * <p>An IP address becomes a single required term. Any other input is
     * lower-cased and split into runs of {@code [a-z0-9]} and single other
     * characters; every piece is added as an optional (SHOULD) clause, together
     * with wildcard clauses over the whole input.
     *
     * @param inputString raw user input
     * @return the assembled boolean query
     */
    private static BooleanQuery parseChineseCharacters(String inputString) {
        BooleanQuery query = new BooleanQuery();
        if (isIpAddress(inputString)) {
            query.add(new TermQuery(new Term(CI_CONTENT_FLAG, inputString)), BooleanClause.Occur.MUST);
            return query;
        }

        // Fixed locale so the result does not depend on the JVM default (e.g. Turkish dotless i).
        inputString = inputString.toLowerCase(java.util.Locale.ENGLISH);

        BooleanQuery booleanQuery = new BooleanQuery();
        StringBuilder tempWord = new StringBuilder();
        int length = inputString.length();
        for (int i = 0; i < length; i++) {
            char c = inputString.charAt(i);
            if (c >= 'a' && c <= 'z' || c >= '0' && c <= '9') {
                // Letter or digit: keep accumulating the current word.
                tempWord.append(c);
                continue;
            }
            // Delimiter or Chinese character: flush the pending word as a prefix match.
            if (tempWord.length() > 0) {
                booleanQuery.add(new PrefixQuery(new Term(CI_CONTENT_FLAG, tempWord.toString())),
                        BooleanClause.Occur.SHOULD);
                tempWord.setLength(0);
            }
            // The character itself is an optional single-character term.
            booleanQuery.add(new TermQuery(new Term(CI_CONTENT_FLAG, String.valueOf(c))),
                    BooleanClause.Occur.SHOULD);
        }

        // Trailing word: match it as a prefix and as a suffix.
        if (tempWord.length() > 0) {
            String word = tempWord.toString();
            booleanQuery.add(new WildcardQuery(new Term(CI_CONTENT_FLAG, word + "*")),
                    BooleanClause.Occur.SHOULD);
            booleanQuery.add(new WildcardQuery(new Term(CI_CONTENT_FLAG, "*" + word)),
                    BooleanClause.Occur.SHOULD);
        }

        // Begin: match the whole input against a term (prefix, suffix, substring).
        booleanQuery.add(new WildcardQuery(new Term(CI_CONTENT_FLAG, inputString + "*")),
                BooleanClause.Occur.SHOULD);
        booleanQuery.add(new WildcardQuery(new Term(CI_CONTENT_FLAG, "*" + inputString)),
                BooleanClause.Occur.SHOULD);
        booleanQuery.add(new WildcardQuery(new Term(CI_CONTENT_FLAG, "*" + inputString + "*")),
                BooleanClause.Occur.SHOULD);
        // End: whole-input matching.

        BooleanQuery fieldQuery = new BooleanQuery();
        BooleanClause clause = new BooleanClause(booleanQuery, BooleanClause.Occur.MUST);
        fieldQuery.add(clause);

        BooleanClause fieldClause = new BooleanClause(fieldQuery, BooleanClause.Occur.MUST);
        query.add(fieldClause);

        return query;
    }

    /**
     * Runs a full-text search and prints the virtual document of every hit.
     *
     * @param queryStr  raw user input to search for
     * @param highlight whether to compute a highlighted summary for each hit
     * @throws Exception if highlighting or query evaluation fails with anything
     *                   other than an {@link IOException} (those are printed)
     */
    private static void contentSearch(String queryStr, boolean highlight) throws Exception {
        IndexReader indexReader = null;
        IndexSearcher indexSearcher = null;
        try {
            indexReader = IndexReader.open(getCiIndexDir());
            indexSearcher = new IndexSearcher(indexReader);

            // Query construction is business specific; adapt as needed.
            Query query = parseChineseCharacters(queryStr);

            TopDocs hits = indexSearcher.search(query, Integer.MAX_VALUE);
            if (hits.totalHits > 0) {
                Highlighter highlighter = null;
                if (highlight) {
                    QueryScorer scorer = new QueryScorer(query, CI_CONTENT_FLAG);
                    SimpleHTMLFormatter formatter =
                            new SimpleHTMLFormatter("<SPAN style=\"color:red;\">", "</SPAN>");
                    highlighter = new Highlighter(formatter, scorer);
                    highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 100));
                }

                for (ScoreDoc scoreDoc : hits.scoreDocs) {
                    Document doc = indexSearcher.doc(scoreDoc.doc);
                    System.out.println(doc.get(CI_CONTENT_FLAG));
                    Entity entity = highlight
                            ? convertToEntity(doc, indexSearcher.getIndexReader(), scoreDoc.doc, highlighter)
                            : convertToEntity(doc);
                    // NOTE(review): the entity is populated but not returned or printed.
                    entity.setScore(scoreDoc.score);
                }
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
        } finally {
            close(indexSearcher);
            close(indexReader);
        }
    }

    /**
     * Closes a resource if it is non-null; a failure to close is reported but
     * not propagated.
     *
     * @param object resource to close, may be {@code null}
     */
    private static void close(Closeable object) {
        if (null != object) {
            try {
                object.close();
            } catch (IOException e) {
                // Best effort: report instead of silently dropping the failure.
                e.printStackTrace();
            }
        }
    }

    /**
     * Converts an entity to a Lucene document.
     *
     * @param entity entity to convert
     * @return document with stored, non-analyzed {@code id} and {@code type}
     *         fields and a stored, analyzed virtual-document field
     */
    public static Document convertToDocument(Entity entity) {
        Document doc = new Document();
        String virtualDoc = entity.getVirtualDoc();
        // Field.Store.YES stores the value; Field.Index.ANALYZED tokenizes it.
        doc.add(new Field("id", String.valueOf(entity.getId()), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("type", entity.getType(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(CI_CONTENT_FLAG, null == virtualDoc ? " " : virtualDoc, Field.Store.YES, Field.Index.ANALYZED));
        return doc;
    }

    /**
     * Converts a Lucene document back to an entity.
     *
     * @param doc document carrying the {@code id}, {@code type} and virtual-document fields
     * @return the populated entity (no summary, no score)
     */
    public static Entity convertToEntity(Document doc) {
        Entity ci = new Entity();
        ci.setId(Integer.valueOf(doc.get("id")));
        ci.setType(doc.get("type"));
        ci.setVirtualDoc(doc.get(CI_CONTENT_FLAG));
        return ci;
    }

    /**
     * Converts a search hit to an entity and attaches a highlighted summary.
     *
     * @param doc         stored document of the hit
     * @param indexReader reader the hit came from
     * @param docId       Lucene document number of the hit
     * @param highlighter highlighter configured for the current query
     * @return entity whose summary is the best highlighted fragment, or the whole
     *         virtual document when the highlighter yields none
     * @throws IOException                  if reading the token stream fails
     * @throws InvalidTokenOffsetsException if token offsets do not fit the text
     */
    public static Entity convertToEntity(Document doc, IndexReader indexReader, int docId, Highlighter highlighter)
            throws IOException, InvalidTokenOffsetsException {
        Entity entity = convertToEntity(doc);
        String virtualDoc = entity.getVirtualDoc();
        TokenStream stream = TokenSources.getAnyTokenStream(indexReader, docId, CI_CONTENT_FLAG, doc, analyzer);
        String highlighterSummary = highlighter.getBestFragment(stream, virtualDoc);
        if (highlighterSummary == null) {
            highlighterSummary = virtualDoc;
        }
        entity.setSummary(highlighterSummary);
        return entity;
    }

    /**
     * Indexes an entity, replacing any document already indexed under the same id.
     *
     * @param entity entity to index
     */
    public static void addIndex(Entity entity) {
        try {
            Document doc = convertToDocument(entity);
            // Delete-then-add in one atomic operation and a single commit, so the index
            // never exposes a state where the old document is gone and the new one is missing.
            indexWriter.updateDocument(new Term("id", String.valueOf(entity.getId())), doc);
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Indexes a batch of entities, replacing documents already indexed under the same ids.
     *
     * @param list entities to index; {@code null} or empty is a no-op
     */
    public static void addIndexs(List<Entity> list) {
        if (list == null || list.isEmpty()) {
            return;
        }
        try {
            Term[] terms = idTerms(list);
            List<Document> docs = new ArrayList<Document>(list.size());
            for (Entity entity : list) {
                docs.add(convertToDocument(entity));
            }
            // Delete and add under one commit so readers never see the intermediate state.
            indexWriter.deleteDocuments(terms);
            indexWriter.addDocuments(docs);
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Re-indexes an entity; equivalent to {@link #addIndex(Entity)}.
     *
     * @param entity entity to re-index
     */
    public static void updateIndex(Entity entity) {
        addIndex(entity);
    }

    /**
     * Removes the index entries of every entity in the list.
     *
     * @param list entities whose ids should be removed; {@code null} or empty is a no-op
     */
    public static void deleteIndexs(List<Entity> list) {
        if (list == null || list.isEmpty()) {
            return;
        }
        try {
            indexWriter.deleteDocuments(idTerms(list));
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Removes the index entry of one entity.
     *
     * @param entity entity whose id should be removed
     */
    public static void deleteIndex(Entity entity) {
        try {
            indexWriter.deleteDocuments(new Term("id", entity.getId().toString()));
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Removes all index entries of the given entity type.
     *
     * @param type value of the {@code type} field to remove
     */
    public static void deleteIndexByType(String type) {
        try {
            indexWriter.deleteDocuments(new Term("type", type));
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Builds one {@code id} term per entity, in list order. */
    private static Term[] idTerms(List<Entity> list) {
        int size = list.size();
        Term[] terms = new Term[size];
        for (int i = 0; i < size; i++) {
            terms[i] = new Term("id", list.get(i).getId().toString());
        }
        return terms;
    }

    /**
     * Closes the shared writer when an instance is finalized.
     *
     * <p>NOTE(review): the writer is static, so this only runs if some code
     * instantiates {@code Demo}; nothing in this class does.
     */
    @Override
    protected void finalize() throws Throwable {
        try {
            if (indexWriter != null) {
                indexWriter.close();
            }
        } finally {
            super.finalize();
        }
    }

    public static void main(String[] args) throws Exception {
        String queryStr = "http://mail6c1.shenzhenair.com";
        contentSearch(queryStr, true);
    }
}

最新文章

  1. [WARNING] Using platform encoding (GBK actually) to copy filtered resources, i.e. build is platform
  2. Maven下SiteMesh的使用
  3. jQuery 效果 - 隐藏和显示
  4. [整]C#获取天气预报信息(baidu api)包括pm2.5
  5. 低版本的无法打开高版本的VM
  6. latex+bibtex+jabref(zz)
  7. MySQL 5.7.12新增MySQL Shell命令行功能
  8. Catalog与Schema
  9. Webservice 调用方式整理
  10. 音频播放(iOS开发)
  11. Python新手学习基础之函数-全局变量和局部变量
  12. Exception in thread "main" java.net.BindException: Address already in use: JVM_Bind
  13. HDFS建筑与shell操作
  14. python第三课
  15. 空数组在以下三种遍历中均不可更改:forEach、map和for...in
  16. man -f/-k [keyword]在fedora 29 中报错nothing appropriate
  17. __x__(41)0909第五天__长表格
  18. asp.net core Session的测试使用心得及注意事项
  19. system generator学习笔记【01】
  20. Jquery 图片走马灯效果原理

热门文章

  1. QTP Code Segment
  2. luoguP1315 观光公交 题解(NOIP2011)(贪心)
  3. HDU 1029Ignatius and the Princess IV
  4. SnowFlake --- 分布式id生成算法
  5. Moco 框架以及其在 Web 集成测试的应用
  6. JavaScript — event介绍以及兼容处理
  7. Linux快速显示图片
  8. 以python为例讲解闭包机制
  9. mysql slave节点多线程复制
  10. Java高频经典面试题(第一季)五:递归与迭代