Lucene笔记一

Lucene就是一个全文检索的工具，建立索引用的，类似于新华字典的目录

这里使用的是lucene-4.4.0版本，入门代码所需jar包如下图所示（解压lucene-4.4.0后的目录）：

入门代码：

import java.io.File;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.IntField;

import org.apache.lucene.document.StringField;

import org.apache.lucene.document.TextField;

import org.apache.lucene.document.Field.Store;

import org.apache.lucene.index.DirectoryReader;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.index.IndexableField;

import org.apache.lucene.index.Term;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TermQuery;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

import org.junit.Test;

/*8

 * luceneDemo

 *

 */

public class TestLucene {

    /**

     * 通过lucene 提供的api 对数据建立索引，indexWriter

     * @throws IOException

     *

     */

    @Test

    public void testAdd() throws IOException{

        //索引在硬盘上面存放的位置..

        Directory directory=FSDirectory.open(new File("D:/INDEX"));

        //lucene 当前使用的版本...

        Version matchVersion=Version.LUCENE_44;

        //分词器...(把一段文本分词)（黑马程序员是高端的培训机构）

        //analzyer 是一个抽象类，具体的切分词规则由子类实现...

        Analyzer analyzer=new StandardAnalyzer(matchVersion);

        IndexWriterConfig config=new IndexWriterConfig(matchVersion, analyzer);

        //构造索引写入的对象..

        IndexWriter indexWriter=new IndexWriter(directory, config);

        //往索引库里面写数据..

        //索引库里面的数据都是document 一个document相当于是一条记录

        //这个document里面的数据相当于索引结构..

        Document document=new Document();

        IndexableField indexableField=new IntField("id",1, Store.YES);

        IndexableField stringfield=new StringField("title","对王召廷的个人评价",Store.YES);

        IndexableField teIndexableField=new TextField("content","风流倜傥有点黄",Store.YES);

        document.add(indexableField);

        document.add(stringfield);

        document.add(teIndexableField);

        //索引库里面接收的数据都是document对象

        indexWriter.addDocument(document);

        indexWriter.close();

    }

    /**

     * 对建立的索引进行搜索...

     * 通过indexSearcher 去搜索...

     * @throws IOException

     */

    @Test

    public void testSearcher() throws IOException{

        //索引在硬盘上面存放的位置..

        Directory directory=FSDirectory.open(new File("D:/INDEX"));

        //把索引目录里面的索引读取到IndexReader 当中...

        IndexReader indexReader=DirectoryReader.open(directory);

//        /构造搜索索引的对象..

        IndexSearcher indexSearcher=new IndexSearcher(indexReader);

        //Query 它是一个查询条件对象，它是一个抽象类，不同的查询规则就构造不同的子类...

        Query query=new TermQuery(new Term("title", "对王召廷的个人评价"));

        //检索符合query 条件的前面N 条记录..

        //

        TopDocs topDocs=indexSearcher.search(query, 10);

        //返回总记录数...

        System.out.println(topDocs.totalHits);

        //存放的都是document 的id

        ScoreDoc scoreDocs []=topDocs.scoreDocs;

        for(ScoreDoc scoreDoc:scoreDocs){

            //返回的就是document id

            int docID=scoreDoc.doc;

            //我还需要根据id 检索到对应的document

            Document document=indexSearcher.doc(docID);

            System.out.println("id=="+document.get("id"));

            System.out.println("title=="+document.get("title"));

            System.out.println("content=="+document.get("content"));

        }

    }

}

原理分析图：

demo演示：

根据入门代码流程提炼工具类代码：

import java.io.File;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.index.DirectoryReader;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Version;

/**

 * lucene 工具类...

 * @author Administrator

 *

 */

/**

 * 提炼规则，假设这段代码可以完成一个功能，把这个代码提炼到一个方法里面去，假设这个方法在某个业务罗继承可以共用，那么往上抽取，

 * 假设在其它逻辑层也可以用，提炼到工具类里面去。

 *

 */

public class LuceneUtils {

    private static IndexWriter indexWriter=null;

    private static IndexSearcher indexSearcher=null;

    //索引存放目录..

    private static Directory directory=null;

    private static IndexWriterConfig indexWriterConfig=null;

    private static Version version=null;

    private static Analyzer analyzer=null;

    static {

        try {

            directory=FSDirectory.open(new File(Constants.URL));

            version=Version.LUCENE_44;

            analyzer=new StandardAnalyzer(version);

            indexWriterConfig=new IndexWriterConfig(version, analyzer);

        } catch (IOException e) {

            e.printStackTrace();

        }

    }

    /**

     *

     * @return 返回用于操作索引的对象...

     * @throws IOException

     */

    public static IndexWriter getIndexWriter() throws IOException{

        indexWriter=new IndexWriter(directory, indexWriterConfig);

        return indexWriter;

    }

    /**

     * 返回用于搜索索引的对象...

     * @return

     * @throws IOException

     */

    public static IndexSearcher  getIndexSearcher() throws IOException{

        IndexReader indexReader=DirectoryReader.open(directory);

        indexSearcher=new IndexSearcher(indexReader);

        return indexSearcher;

    }

    /**

     *

     * 返回lucene 当前的版本...

     * @return

     */

    public static Version getVersion() {

        return version;

    }

    /**

     *

     * 返回lucene 当前使用的分词器..

     * @return

     */

    public static Analyzer getAnalyzer() {

        return analyzer;

    }

}

public class Constants {

    /**

     * 索引存放的目录

     */

    public static final String URL="d:/indexdir/news";

}

bean：

package cn.itcast.bean;

public class Article {

    private int id;

    public int getId() {

        return id;

    }

    public void setId(int id) {

        this.id = id;

    }

    public String getTitle() {

        return title;

    }

    public void setTitle(String title) {

        this.title = title;

    }

    public String getContent() {

        return content;

    }

    public void setContent(String content) {

        this.content = content;

    }

    public String getAuthor() {

        return author;

    }

    public void setAuthor(String author) {

        this.author = author;

    }

    public String getUrl() {

        return url;

    }

    public void setUrl(String url) {

        this.url = url;

    }

    private String title;

    private String content;

    private String author;

    private String url;

}

转换工具类：

package cn.itcast.lucene;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.IntField;

import org.apache.lucene.document.StringField;

import org.apache.lucene.document.TextField;

import org.apache.lucene.document.Field.Store;

import org.apache.lucene.index.IndexableField;

import cn.itcast.bean.Article;

/*8

 * 对象与索引库document 之间的转化

 *

 */

public class ArticleToDocument {

    public static Document articleToDocument(Article article){

        Document document=new Document();

        IntField idfield=new IntField("id", article.getId(), Store.YES);

        //StringField 对应的值不分词，textField 分词..

        TextField titleField=new TextField("title", article.getTitle(),Store.YES);

        TextField contentField=new TextField("content", article.getContent(),Store.YES);

        //修改这个字段对应的权重值，默认这个值为1f

//        contentField.setBoost(3f);

        StringField authorField=new StringField("author", article.getAuthor(), Store.YES);

        StringField urlField=new StringField("url", article.getUrl(), Store.YES);

        document.add(idfield);

        document.add(titleField);

        document.add(contentField);

        document.add(authorField);

        document.add(urlField);

        return document;

    }

}

Dao层：

package cn.itcast.dao;

import java.io.IOException;

import org.apache.lucene.document.Document;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.Term;

import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;

import org.apache.lucene.queryparser.classic.QueryParser;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopDocs;

import cn.itcast.bean.Article;

import cn.itcast.lucene.ArticleToDocument;

import cn.itcast.uitls.LuceneUtils;

/**

 * 使用lucene 的API 来操作索引库..

 * @author Administrator

 *

 */

public class LuceneDao {

    public void addIndex(Article article) throws IOException{

        IndexWriter indexWriter=LuceneUtils.getIndexWriter();

        Document doc=ArticleToDocument.articleToDocument(article);

        indexWriter.addDocument(doc);

        indexWriter.close();

    }

    /**

     * 删除符合条件的记录...

     * @param fieldName

     * @param fieldValue

     * @throws IOException

     */

    public void delIndex(String fieldName,String fieldValue) throws IOException{

        IndexWriter indexWriter=LuceneUtils.getIndexWriter();

        //一定要梦想，万一实现了勒

        Term term=new Term(fieldName, fieldValue);

        indexWriter.deleteDocuments(term);

        indexWriter.close();

    }

    /**

     *

     * 更新

     *

     * update table set ?  where condtion

     * @throws IOException

     *

     *

     */

    public void updateIndex(String fieldName,String fieldValue,Article article) throws IOException{

        IndexWriter indexWriter=LuceneUtils.getIndexWriter();

        /**

         * 1:term 设置更新的条件...

         *

         * 2:设置更新的内容的对象..

         *

         */

        Term term=new Term(fieldName,fieldValue);

        Document doc=ArticleToDocument.articleToDocument(article);

        /**

         *

         * 在lucene 里面是先删除符合这个条件term 的记录，在创建一个doc 记录...

         *

         */

        indexWriter.updateDocument(term, doc);

        indexWriter.close();

    }

    /**

     * 0,10

     * 10,10

     * 20,10

     * @param keywords

     * @throws Exception

     */

    public void findIndex(String keywords,int firstResult,int maxResult) throws Exception{

        IndexSearcher indexSearcher=LuceneUtils.getIndexSearcher();

        //第一个条件.. 单字段查询...

//        Query query=new TermQuery(new Term("title","梦想"))

        //select *  from  table where fieldname="" or content=""

        String fields []={"title","content"};

        //第二种条件：使用查询解析器，多字段。。。 我们需要重新导入一个jar queryParser 的jar... 位置在lucene解压后的queryparser文件夹下

        QueryParser queryParser=new MultiFieldQueryParser(LuceneUtils.getVersion(),fields,LuceneUtils.getAnalyzer());

//        /这个事一个条件..

        Query query=queryParser.parse(keywords);

        //query 它是一个查询条件，query 是一个抽象类，不同的查询规则构造部同的子类即可

        //检索符合query 条件的前面N 条记录...

        //检索的是索引目录... (总记录数，socreDOC (docID))

        //使用lucene 提供的api 进行操作...

        TopDocs topDocs=indexSearcher.search(query,firstResult+maxResult);

//        /存放的是docID

        ScoreDoc scoreDocs []=topDocs.scoreDocs;

        //判断:scoreDocs 的length  (实际取出来的数量..) 与 firstResult+maxResult 的值取小值...

        //在java jdk 里面提供了一个api

        int endResult=Math.min(scoreDocs.length, firstResult+maxResult);

        for(int i=firstResult;i<endResult;i++){

//            /取出来的是docID,这个id 是lucene 自己来维护。

            int docID=scoreDocs[i].doc;

            Document document=indexSearcher.doc(docID);

            System.out.println("id==="+document.get("id"));

            System.out.println("title==="+document.get("title"));

            System.out.println("content==="+document.get("content"));

            System.out.println("url==="+document.get("url"));

            System.out.println("author==="+document.get("author"));

        }

    }

}

测试类：

package cn.itcast.junit;

import java.io.IOException;

import org.junit.Test;

import cn.itcast.bean.Article;

import cn.itcast.dao.LuceneDao;

/**

 * 测试luceneDao

 * @author Administrator

 *

 */

public class LuceneDaoTest {

    private LuceneDao luceneDao=new LuceneDao();

    @Test

    public void testCreate() throws IOException{

        for(int i=28;i<=28;i++){

            Article article=new Article();

            article.setId(i);

            article.setTitle("一定要梦想，万一实现了勒");

            article.setContent("矫情我觉得这句话太矫情了矫情矫情矫情矫情矫情矫情");

            article.setUrl("http://www.tianmao.com");

            article.setAuthor("马云");

            luceneDao.addIndex(article);

        }

    }

    @Test

    public void testsearcher() throws Exception{

//        article.setTitle("一定要梦想，万一实现了勒");   textfield   分词     标准分词器

//        article.setContent("我觉得这句话太矫情了");   textfield   分词    标准分词器

        luceneDao.findIndex("梦想",20,10);

    }

    @Test

    public void testdelete() throws IOException{

        String fieldName="title";

        String fieldValue="定";

        luceneDao.delIndex(fieldName, fieldValue);

    }

    @Test

    public void testUpdate() throws IOException{

        String fieldName="title";

        String fieldValue="定";

        Article article=new Article();

        article.setId(9527);

        article.setTitle("一定要梦想，万一实现了勒");

        article.setContent("我觉得这句话太矫情了");

        article.setUrl("http://www.tianmao.com");

        article.setAuthor("马云");

        luceneDao.updateIndex(fieldName, fieldValue, article);

    }

}

分词器的流程图：

关于分词器，网上可以找到很多种类的分词器配合Lucene使用，相关分词规则查看对应说明。

举例如下：

Analyzer analyzer=new StandardAnalyzer(Version.LUCENE_44);//中文单字切分、英文按空格切分成单词

Analyzer analyzer=new CJKAnalyzer(Version.LUCENE_44);//二分法分词，中文相连的两个词作为一个索引

Analyzer analyzer=new IKAnalyzer();//第三方的分词器，对中文支持较好，可以自定义分词单词与停用词

索引库优化

package cn.itcast.lucene;

import java.io.File;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.index.DirectoryReader;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.index.LogDocMergePolicy;

import org.apache.lucene.index.MergePolicy;

import org.apache.lucene.index.Term;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.TermQuery;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.store.IOContext;

import org.apache.lucene.store.RAMDirectory;

import org.apache.lucene.util.Version;

import org.junit.Test;

import cn.itcast.uitls.Constants;

public class TestOptimise {

    /*8

     * 优化的第一种方式:通过 IndexWriterConfig 优化设置mergePolicy（合并策略）

     *

     *

     */

    public void testoptimise() throws IOException{

        Directory directory=FSDirectory.open(new File(Constants.URL));

        Analyzer analyzer=new StandardAnalyzer(Version.LUCENE_44);

        IndexWriterConfig config=new IndexWriterConfig(Version.LUCENE_44, analyzer);

        LogDocMergePolicy mergePolicy=new LogDocMergePolicy();

        /**

         * 当这个值越小，更少的内存会被运用当创建索引的时候，搜索的时候越快，创建的时候越慢。

         * 当这个值越大，更多的内存会被运用当创建索引的时候，搜索的时候越慢，创建的时候越快..

         * larger values >10

         *

         * 2<=smaller<=10

         *

         */

        //设置合并因子..

        mergePolicy.setMergeFactor(10);

//        /设置索引的合并策略..

        config.setMergePolicy(mergePolicy);

        IndexWriter indexWriter=new IndexWriter(directory, config);

    }

    /**

     * 通过directory 去优化....

     * @throws IOException

     *

     */

    @Test

    public void testoptimise2() throws IOException{

        //现在的索引放在硬盘上面...

        Directory directory=FSDirectory.open(new File(Constants.URL));

//        /通过这个对象吧directory 里面的数据读取到directory1 里面来..

        IOContext ioContext=new IOContext();

        //相办法吧directory 的索引读取到内存当中来...

        Directory directory1=new RAMDirectory(directory,ioContext);

        IndexReader indexReader=DirectoryReader.open(directory1);

        IndexSearcher indexSearcher=new IndexSearcher(indexReader);

        Query query=new TermQuery(new Term("title", "想"));

        TopDocs topDocs=indexSearcher.search(query, 100);

        System.out.println(topDocs.totalHits);

    }

    /**

     * 索引文件越大，会影响检索的速度..  (减少索引文件的大小)

     *

     * 1:排除停用词..

     *

     */

    public void testoptimise3(){

    }

    /**

     * 将索引分目盘存放  将数据归类...

     *

     */

    public void testoptimise4(){

    }

}

巴特西

Lucene笔记一

最新文章

热门文章