
Lucene 5 Learning: A Simple Custom Synonym Analyzer Example

Everyone should already understand why synonym support matters in full-text search, so today I put together a simple synonym analysis demo. Most of the code is adapted from the companion code of the (English) book Lucene in Action; the book's latest sample code only goes up to Lucene 4.x, so it needs a few small changes for Lucene 5.x. Those changes are sketched first, followed by the complete custom synonym analyzer demo based on Lucene 5.x.
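The two API changes that matter here: in Lucene 5.x, Analyzer.createComponents no longer receives a Reader (the analyzer hands the input to the tokenizer via setReader), and the setTermBuffer()/term() calls from earlier versions are replaced by CharTermAttribute's copyBuffer()/toString(). A minimal sketch of the resulting 5.x skeleton, with a placeholder class name used purely for illustration:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;

// Minimal Lucene 5.x Analyzer skeleton (illustration only; the real demo classes follow).
public class MinimalLucene5Analyzer extends Analyzer {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // No Reader parameter here, and StandardTokenizer is built without one;
        // the Analyzer passes the input to the tokenizer later via setReader().
        Tokenizer tokenizer = new StandardTokenizer();
        return new TokenStreamComponents(tokenizer);
    }
}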


package com.yida.framework.lucene5.analyzer.synonym;

import java.io.IOException;

/**
 * Synonym lookup engine
 * @author lanxiaowei
 */
public interface SynonymEngine {
    String[] getSynonyms(String s) throws IOException;
}


package com.yida.framework.lucene5.analyzer.synonym;

import java.io.IOException;
import java.util.HashMap;

/**
 * A simple in-memory synonym engine backed by a HashMap
 * @author lanxiaowei
 */
public class BaseSynonymEngine implements SynonymEngine {
    private static HashMap<String, String[]> map = new HashMap<String, String[]>();

    static {
        map.put("quick", new String[] {"fast", "speedy"});
        map.put("jumps", new String[] {"leaps", "hops"});
        map.put("over", new String[] {"above"});
        map.put("lazy", new String[] {"apathetic", "sluggish"});
        map.put("dog", new String[] {"canine", "pooch"});
    }

    public String[] getSynonyms(String s) throws IOException {
        return map.get(s);
    }
}


package com.yida.framework.lucene5.analyzer.synonym;

import java.io.IOException;
import java.util.Stack;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * Custom synonym filter
 * @author lanxiaowei
 */
public class SynonymFilter extends TokenFilter {
    public static final String TOKEN_TYPE_SYNONYM = "SYNONYM";

    private Stack<String> synonymStack;
    private SynonymEngine engine;
    private AttributeSource.State current;

    private final CharTermAttribute termAtt;
    private final PositionIncrementAttribute posIncrAtt;

    public SynonymFilter(TokenStream in, SynonymEngine engine) {
        super(in);
        synonymStack = new Stack<String>(); // #1 define the synonym buffer
        this.engine = engine;
        this.termAtt = addAttribute(CharTermAttribute.class);
        this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (synonymStack.size() > 0) { // #2 pop buffered synonyms
            String syn = synonymStack.pop();
            restoreState(current);
            // Lucene 4.x style:
            // termAtt.setTermBuffer(syn);
            // Lucene 5.x style:
            termAtt.copyBuffer(syn.toCharArray(), 0, syn.length());
            posIncrAtt.setPositionIncrement(0); // #3 set position increment to 0
            return true;
        }

        if (!input.incrementToken()) { // #4 read the next token
            return false;
        }

        if (addAliasesToStack()) { // #5 push synonyms onto the stack
            current = captureState(); // #6 save the current token
        }
        return true; // #7 return the current token
    }

    private boolean addAliasesToStack() throws IOException {
        // Lucene 4.x style:
        // String[] synonyms = engine.getSynonyms(termAtt.term()); // #8 retrieve synonyms
        // Lucene 5.x style:
        String[] synonyms = engine.getSynonyms(termAtt.toString()); // #8 retrieve synonyms
        if (synonyms == null) {
            return false;
        }
        for (String synonym : synonyms) { // #9 push synonyms onto the stack
            synonymStack.push(synonym);
        }
        return true;
    }
}


package com.yida.framework.lucene5.analyzer.synonym;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

/**
 * Custom synonym analyzer
 * @author lanxiaowei
 * @createTime 2015-03-31 10:15:23
 */
public class SynonymAnalyzer extends Analyzer {
    private SynonymEngine engine;

    public SynonymAnalyzer(SynonymEngine engine) {
        this.engine = engine;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // Lucene 5.x: createComponents only receives the field name; the Reader parameter is gone
        Tokenizer tokenizer = new StandardTokenizer();
        TokenStream tokenStream = new SynonymFilter(tokenizer, engine);
        tokenStream = new LowerCaseFilter(tokenStream);
        tokenStream = new StopFilter(tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
        return new TokenStreamComponents(tokenizer, tokenStream);
    }
}


package com.yida.framework.lucene5.analyzer.synonym;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;

import com.yida.framework.lucene5.util.AnalyzerUtils;

public class SynonymAnalyzerTest {
    public static void main(String[] args) throws IOException {
        String text = "the quick brown fox jumps over the lazy dog";
        Analyzer analyzer = new SynonymAnalyzer(new BaseSynonymEngine());
        AnalyzerUtils.displayTokens(analyzer, text);
    }
}
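Beyond printing the token stream, it is worth seeing the practical effect of index-time expansion. The sketch below is my own addition rather than part of the original demo: it indexes the sample sentence with SynonymAnalyzer and then searches for a term that never occurs in the text. The class name, the field name "content", and the RAMDirectory are arbitrary choices for illustration, and the class is assumed to live in the same package as the demo classes. Assuming Lucene 5.x, the TermQuery for "hops" should match because the filter indexed it at the same position as "jumps".

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.RAMDirectory;

public class SynonymSearchDemo {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new SynonymAnalyzer(new BaseSynonymEngine());

        // Index one document with the synonym analyzer (RAMDirectory keeps the demo self-contained).
        RAMDirectory directory = new RAMDirectory();
        IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(analyzer));
        Document doc = new Document();
        doc.add(new TextField("content", "the quick brown fox jumps over the lazy dog", Store.YES));
        writer.addDocument(doc);
        writer.close();

        // "hops" never appears in the text, but it was indexed as a synonym of "jumps".
        IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(directory));
        TopDocs topDocs = searcher.search(new TermQuery(new Term("content", "hops")), 1);
        System.out.println("hits for hops: " + topDocs.totalHits);
    }
}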


package com.yida.framework.lucene5.util;

import java.io.IOException;

import junit.framework.Assert;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

/**
 * A small utility class for testing analyzers: for every token it prints the
 * position, the term text, the start and end offsets, and the token type
 * (e.g. plain word vs. number).
 */
public class AnalyzerUtils {

    public static void displayTokens(Analyzer analyzer, String text) throws IOException {
        TokenStream tokenStream = analyzer.tokenStream("text", text);
        displayTokens(tokenStream);
    }

    public static void displayTokens(TokenStream tokenStream) throws IOException {
        OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);

        tokenStream.reset();
        int position = 0;
        while (tokenStream.incrementToken()) {
            int increment = positionIncrementAttribute.getPositionIncrement();
            if (increment > 0) {
                position = position + increment;
                System.out.print(position + ":");
            }
            int startOffset = offsetAttribute.startOffset();
            int endOffset = offsetAttribute.endOffset();
            String term = charTermAttribute.toString();
            System.out.println("[" + term + "]" + ":(" + startOffset + "-->" + endOffset + "):" + typeAttribute.type());
        }
        tokenStream.end();
        tokenStream.close();
    }

    /**
     * Asserts the analysis result.
     * @param analyzer    the analyzer under test
     * @param text        the source string
     * @param expecteds   the expected terms after analysis, in order
     * @throws IOException
     */
    public static void assertAnalyzerTo(Analyzer analyzer, String text, String[] expecteds) throws IOException {
        TokenStream tokenStream = analyzer.tokenStream("text", text);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        for (String expected : expecteds) {
            Assert.assertTrue(tokenStream.incrementToken());
            Assert.assertEquals(expected, charTermAttribute.toString());
        }
        Assert.assertFalse(tokenStream.incrementToken());
        tokenStream.end();
        tokenStream.close();
    }
}
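As a quick sanity check that ties the pieces together, assertAnalyzerTo can verify the expanded terms directly. A minimal sketch (the class name is mine; the expected terms follow the BaseSynonymEngine mappings above, with synonyms emerging in the stack's pop order, i.e. the reverse of the order they were registered):

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;

import com.yida.framework.lucene5.analyzer.synonym.BaseSynonymEngine;
import com.yida.framework.lucene5.analyzer.synonym.SynonymAnalyzer;
import com.yida.framework.lucene5.util.AnalyzerUtils;

public class SynonymAnalyzerAssertTest {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new SynonymAnalyzer(new BaseSynonymEngine());
        // "fox" has no mapping and passes through; "jumps" is followed by its synonyms,
        // popped off the stack in the reverse of the order they were registered.
        AnalyzerUtils.assertAnalyzerTo(analyzer, "fox jumps",
                new String[] {"fox", "jumps", "hops", "leaps"});
        System.out.println("Synonym expansion looks correct.");
    }
}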

All of the code above comes from the Lucene in Action sample code; I have simply rewritten it against Lucene 5.x and debugged it until it ran, and I am sharing it here in the hope that it helps others learning Lucene 5. I will upload the demo code as an attachment below; if you need the demo source, please download it from the attachment there. I have also uploaded the companion source code of Lucene in Action to my Baidu netdisk and share it as well; the Baidu netdisk download link for the Lucene in Action companion source code:

The code speaks for itself, so I will not say much more. That's a wrap!

If you still have questions, add me on QQ: 7-3-6-0-3-1-3-0-5, or join the QQ group, and let's discuss and learn together!

Reposted from: http://iamyida.iteye.com/blog/2197355