天天看点

Lucene+springboot 实现一个简单的搜索

1、背景:网站需要实现检索功能,但 MySQL 的 like 已经不能满足需求,需要类似全文检索的能力。之前简单接触过 Elasticsearch,感觉类似 Elasticsearch 的搜索可以满足需求,最后决定集成 Lucene 实现搜索。(也可以直接使用 ES,至于为什么没有使用,这里就不多说了)

2、环境:java8、springboot2.2,maven,lucene7.6

3、在pom文件中添加依赖

<!-- Lucene -->
		<!-- Core library -->
		<dependency>
		  <groupId>org.apache.lucene</groupId>
		  <artifactId>lucene-core</artifactId>
		  <version>7.6.0</version>
		</dependency>
		<!-- Query parsing for the analyzed index -->
		<dependency>
		  <groupId>org.apache.lucene</groupId>
		  <artifactId>lucene-queryparser</artifactId>
		  <version>7.6.0</version>
		</dependency>
		<!-- Common analyzers, suitable for English tokenization -->
		<dependency>
		  <groupId>org.apache.lucene</groupId>
		  <artifactId>lucene-analyzers-common</artifactId>
		  <version>7.6.0</version>
		</dependency>
		<!-- Highlighting of matched search keywords -->
		<dependency>
		  <groupId>org.apache.lucene</groupId>
		  <artifactId>lucene-highlighter</artifactId>
		  <version>7.6.0</version>
		</dependency>
		<!-- smartcn Chinese analyzer -->
		<dependency>
		  <groupId>org.apache.lucene</groupId>
		  <artifactId>lucene-analyzers-smartcn</artifactId>
		  <version>7.6.0</version>
		</dependency>
           

4、创建查询索引

package 包名;
import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.List;

import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;

import com.cuiyanet.entity.inter.EncyRetrievalIN;
import com.cuiyanet.service.EncyRetrievalService;

/**
 * @Description lucene搜索引擎的实现方法
 */
@Controller
@RequestMapping("/test/lucene")
public class test {
	private static final Logger logger = LoggerFactory.getLogger(test.class);
	
	@Autowired
	private EncyRetrievalService encyRetrievalService;

	/**
	 * @Description 创建查询索引		
	 * @throws IOException
	 */
	@RequestMapping("/creat")
	public void indexCreate() throws IOException {
		
		指定索引的生成位置,后面查询会使用
		Directory directory = FSDirectory.open(Paths.get(new File("D:\\lucene\\lucene_index").getPath()));		
        logger.info("===================>索引位置:"+Paths.get(new File("D:\\lucene\\lucene_index").getPath()));
        
        //创建一个分词器,表示你存入的内容使用的是该分词器
        SmartChineseAnalyzer smartChineseAnalyzer = new SmartChineseAnalyzer();
        //创建indexwriterConfig(参数分词器)
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(smartChineseAnalyzer);
        //创建indexwrite 对象(文件对象,索引配置对象)
        IndexWriter indexWriter = new IndexWriter(directory,indexWriterConfig);
		
        //此处是从数据库中查询你需要建索引的东西
		List<EncyRetrievalIN> retrievalList = encyRetrievalService.searchEncyRetrievalIN(new EncyRetrievalIN());
		//如果做测试可以直接写几个字符串就可以了
		
		//循环生成索引
		for(int i = 0; i < retrievalList.size(); i ++) {
			
			/**这个很关键,忘记了从哪个文章看到的
	         * 1、LongPoint(String name, int... point) : 在Lucene 6.0中,LongField替换为LongPoint,IntField替换为IntPoint,FloatField替换为FloatPoint,DoubleField替换为DoublePoint。对int型字段索引,索引不存储,提供了一些静态工厂方法用于创建一般的查询,提供了不同于文本的数值类型存储方式,使用KD-trees索引
	         * 2、StringField(FieldName, FieldValue,Store.YES)) : 只索引但不分词,会将整个串存储在索引中,比如(订单号,身份证号等)是否存储在文档中用Store.YES或Store.NO决定
	         * 3、StoredField(FieldName, FieldValue) : 存储Field的值,不分析,不索引,可以用IndexSearcher.doc和IndexReader.document来获取存储的Field和存储的值
	         * 4、TextField(FieldName,FieldValue, Store.NO) :索引并分词,不包括term vectors(词向量,下面会讲),例如通常用于一个body Field
	         */
			Field titleField = new TextField("tilte", retrievalList.get(i).getTitle() , Field.Store.YES);
			Field contentField = new TextField("content", retrievalList.get(i).getProfiles() , Field.Store.YES);
			Field keyWordField = new TextField("keyWord", retrievalList.get(i).getKeyWord() , Field.Store.YES);
			Field clicksField = new StringField("clicks", String.valueOf(retrievalList.get(i).getClicks()) , Field.Store.YES);
			Field idField = new StringField("id", String.valueOf(retrievalList.get(i).getId()) , Field.Store.YES);
			Document doc = new Document();
			doc.add(titleField);
			doc.add(contentField);
			doc.add(keyWordField);
			doc.add(clicksField);
			doc.add(idField);
			doc.add(new NumericDocValuesField("sortid",retrievalList.get(i).getId())); 		//这个东西是后面查询的时候可以用这个排序用的
			doc.add(new NumericDocValuesField("click",retrievalList.get(i).getClicks()));  
			indexWriter.addDocument(doc);							//写入文档	
		}
		
	    // 查看IndexWriter里面有多少个索引
	    logger.info("================IndexWriter创建索引个数===============》:"+indexWriter.numDocs());
	    // 关闭索引
	    indexWriter.close();
	}
	
}
           

5、查询

package 包名;
import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery.Builder;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Controller;
import org.springframework.ui.Model;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.ResponseBody;

import com.alibaba.fastjson.JSONObject;
import com.cuiyanet.common.util.PageBean;
import com.cuiyanet.common.util.PageInfo;
import com.cuiyanet.entity.WebResult;
import com.cuiyanet.entity.inter.EncyRetrievalIN;
import com.cuiyanet.entity.vo.EncyRetrievalSearchVO;
import com.cuiyanet.service.EncyRetrievalSearchService;
import com.cuiyanet.service.EncyRetrievalService;
import com.github.pagehelper.PageHelper;

import io.swagger.annotations.ApiImplicitParam;
import io.swagger.annotations.ApiImplicitParams;
import io.swagger.annotations.ApiOperation;

/**
 * Demo controller that runs a paged, sorted, multi-field search against the Lucene index.
 */
@Controller
@RequestMapping("/test/lucene")
public class test {
	private static final Logger logger = LoggerFactory.getLogger(test.class);

	/** Location of the on-disk index; must be the directory the index was written to. */
	private static final String INDEX_PATH = "D:\\lucene\\lucene_index";

	/** Upper bound on how many sorted hits are collected for paging. */
	private static final int MAX_HITS = 100000;

	@Autowired
	private EncyRetrievalService encyRetrievalService;
	@Autowired
	private EncyRetrievalSearchService encyRetrievalSearchService;

	/**
	 * Searches the {@code title} and {@code keyword} fields for the given text and prints the
	 * stored title of every hit on the requested page. Blank search text is ignored.
	 *
	 * @param page  1-based page number; values below 1 are treated as 1
	 * @param limit page size; values below 1 are treated as 10
	 * @param title text to search for
	 */
	@GetMapping("/searchList")
	public void searchList(@RequestParam(value = "page", defaultValue = "1") int page,	// paging
			@RequestParam(value = "limit", defaultValue = "7") int limit,		// paging
			@RequestParam(value = "title", defaultValue = "") String title		// search text
	) {
		// Compare by content, not by reference: `title != ""` is true for almost any String.
		if (title == null || title.trim().isEmpty()) {
			return;
		}
		if (page < 1) {
			page = 1;
		}
		if (limit < 1) {
			limit = 10;
		}
		// Computed in long so that huge page numbers cannot overflow into a negative offset.
		long offset = (long) (page - 1) * limit;
		int startIndex = (int) Math.min(offset, MAX_HITS);
		// Only the first startIndex + limit sorted hits are needed to serve this page.
		int numHits = (int) Math.min(offset + limit, MAX_HITS);

		// Use the same analyzer for indexing and querying, otherwise terms may not match.
		try (Analyzer smartChineseAnalyzer = new SmartChineseAnalyzer();
				Directory directory = FSDirectory.open(Paths.get(INDEX_PATH));
				IndexReader indexReader = DirectoryReader.open(directory)) {

			// The text comes straight from the request: escape query-syntax characters so
			// input such as "C++" or "a:b" is searched literally instead of failing to parse.
			String escapedTitle = QueryParser.escape(title);

			// Match the text in either field (SHOULD + SHOULD = union of both fields).
			String[] fields = { "title", "keyword" };
			String[] queries = { escapedTitle, escapedTitle };
			BooleanClause.Occur[] flags = { BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD };
			Query textQuery = MultiFieldQueryParser.parse(queries, fields, flags, smartChineseAnalyzer);

			// How clauses combine inside a BooleanQuery (one add() per clause):
			// 1. MUST + MUST: intersection of the two clauses.
			// 2. MUST + MUST_NOT: results of MUST minus anything matching MUST_NOT.
			// 3. SHOULD + MUST_NOT: behaves like MUST + MUST_NOT.
			// 4. SHOULD + MUST: results are those of MUST; SHOULD only influences ranking.
			// 5. SHOULD + SHOULD: "or" relation, union of all clauses.
			// 6. MUST_NOT + MUST_NOT: meaningless, matches nothing.
			Builder builder = new BooleanQuery.Builder();
			// Business rule: only typeid 11 is searched. NOTE: this requires a "typeid"
			// field in the index; documents without it can never match.
			builder.add(new QueryParser("typeid", smartChineseAnalyzer).parse(String.valueOf(11)), Occur.MUST);
			builder.add(textQuery, Occur.MUST);
			Query booleanQuery = builder.build();

			IndexSearcher indexSearcher = new IndexSearcher(indexReader);

			// Sort by relevance first, then by click count (largest first). Sorting needs a
			// doc-values field: click counts are indexed as NumericDocValuesField "click",
			// whereas "clicks" is a plain StringField and cannot be sorted as LONG.
			Sort sort = new Sort(
					SortField.FIELD_SCORE,
					new SortField("click", SortField.Type.LONG, true));

			// Arguments: fillFields, trackDocScores, trackMaxScore, trackTotalHits.
			// Tracking total hits here avoids running the query a second time for the count.
			TopFieldCollector collector = TopFieldCollector.create(sort, numHits, false, false, false, true);
			indexSearcher.search(booleanQuery, collector);
			logger.debug("total hits: {}", collector.getTotalHits());

			// Paging: skip startIndex hits, then take up to `limit` hits.
			ScoreDoc[] hits = collector.topDocs(startIndex, limit).scoreDocs;
			for (ScoreDoc sdoc : hits) {
				Document hitDoc = indexSearcher.doc(sdoc.doc);	// load stored fields by doc id
				System.out.println(hitDoc.get("title"));		// stored value of the searched title field
			}
		} catch (Exception e) {
			logger.error("Exception", e);
		}

	}

}
           

6、除了 BooleanQuery 查询,Lucene 还提供了短语查询、模糊查询、范围查询等多种查询方式。不过目前网上的资料大多基于低版本,其中很多查询方法在新版本中已经不好用了,而且简单的搜索一般也满足不了实际业务。本文只介绍了创建索引和查询的基础方法,更新和删除不做介绍。

继续阅读