同義詞功能在全文搜尋時的意義,大家應該都懂的。今天我就試著寫了一個中文同義詞分詞的示例demo,其實主要代碼還是參考lucene in action 這本英文版書籍的隨書代碼,只不過lucene in action書裡的示例代碼目前最新版只支援到lucene4.x,對於lucene5.x,代碼需要稍作修改,下面是基於lucene5.x的自定義同義詞分詞器demo:
package com.yida.framework.lucene5.analyzer.synonym;
import java.io.IOException;

/**
 * Synonym lookup engine: maps a single term to its synonyms.
 *
 * @author lanxiaowei
 */
public interface SynonymEngine {
    /**
     * Returns the synonyms of the given term.
     *
     * @param s the term to look up
     * @return an array of synonyms, or {@code null} when the term has none
     *         (callers such as SynonymFilter rely on the {@code null} contract)
     * @throws IOException if the underlying synonym source cannot be read
     */
    String[] getSynonyms(String s) throws IOException;
}
import java.util.hashmap;
public class basesynonymengine implements synonymengine {
private static hashmap<string, string[]> map = new hashmap<string, string[]>();
{
map.put("quick", new string[] {"fast","speedy"});
map.put("jumps", new string[] {"leaps","hops"});
map.put("over", new string[] {"above"});
map.put("lazy", new string[] {"apathetic","slugish"});
map.put("dog", new string[] {"canine","pooch"});
}
public string[] getsynonyms(string s) throws ioexception {
return map.get(s);
import java.util.stack;
import org.apache.lucene.analysis.tokenfilter;
import org.apache.lucene.analysis.tokenstream;
import org.apache.lucene.analysis.tokenattributes.chartermattribute;
import org.apache.lucene.analysis.tokenattributes.positionincrementattribute;
import org.apache.lucene.util.attributesource;
* 自定義同義詞過濾器
*
public class synonymfilter extends tokenfilter {
public static final string token_type_synonym = "synonym";
private stack<string> synonymstack;
private synonymengine engine;
private attributesource.state current;
private final chartermattribute termatt;
private final positionincrementattribute posincratt;
public synonymfilter(tokenstream in, synonymengine engine) {
super(in);
synonymstack = new stack<string>(); // #1
this.engine = engine;
this.termatt = addattribute(chartermattribute.class);
this.posincratt = addattribute(positionincrementattribute.class);
public boolean incrementtoken() throws ioexception {
if (synonymstack.size() > 0) { // #2
string syn = synonymstack.pop(); // #2
restorestate(current); // #2
// 這裡lucene4.x的寫法
// termatt.settermbuffer(syn);
// 這是lucene5.x的寫法
termatt.copybuffer(syn.tochararray(), 0, syn.length());
posincratt.setpositionincrement(0); // #3
return true;
}
if (!input.incrementtoken()) // #4
return false;
if (addaliasestostack()) { // #5
current = capturestate(); // #6
return true; // #7
private boolean addaliasestostack() throws ioexception {
// 這裡lucene4.x的寫法
// string[] synonyms = engine.getsynonyms(termatt.term()); //#8
// 這裡lucene5.x的寫法
string[] synonyms = engine.getsynonyms(termatt.tostring()); // #8
if (synonyms == null) {
for (string synonym : synonyms) { // #9
synonymstack.push(synonym);
return true;
/*
#1 define synonym buffer
#2 pop buffered synonyms
#3 set position increment to 0
#4 read next token
#5 push synonyms onto stack
#6 save current token
#7 return current token
#8 retrieve synonyms
#9 push synonyms onto stack
*/
import java.io.bufferedreader;
import java.io.reader;
import java.io.stringreader;
import org.apache.lucene.analysis.analyzer;
import org.apache.lucene.analysis.tokenizer;
import org.apache.lucene.analysis.analyzer.tokenstreamcomponents;
import org.apache.lucene.analysis.core.lettertokenizer;
import org.apache.lucene.analysis.core.lowercasefilter;
import org.apache.lucene.analysis.core.stopanalyzer;
import org.apache.lucene.analysis.core.stopfilter;
import org.apache.lucene.analysis.standard.standardfilter;
import org.apache.lucene.analysis.standard.standardtokenizer;
import com.yida.framework.lucene5.util.analyzer.codec.metaphonereplacementfilter;
* 自定義同義詞分詞器
* @createtime 2015-03-31 10:15:23
public class synonymanalyzer extends analyzer {
public synonymanalyzer(synonymengine engine) {
@override
protected tokenstreamcomponents createcomponents(string text) {
tokenizer tokenizer = new standardtokenizer();
tokenstream tokenstream = new synonymfilter(tokenizer, engine);
tokenstream = new lowercasefilter(tokenstream);
tokenstream = new stopfilter(tokenstream,stopanalyzer.english_stop_words_set);
return new tokenstreamcomponents(tokenizer, tokenstream);
import com.yida.framework.lucene5.util.analyzerutils;
public class synonymanalyzertest {
public static void main(string[] args) throws ioexception {
string text = "the quick brown fox jumps over the lazy dog";
analyzer analyzer = new synonymanalyzer(new basesynonymengine());
analyzerutils.displaytokens(analyzer, text);
package com.yida.framework.lucene5.util;
import junit.framework.assert;
import org.apache.lucene.analysis.tokenattributes.offsetattribute;
import org.apache.lucene.analysis.tokenattributes.typeattribute;
* 用于分詞器測試的一個簡單工具類(用于列印分詞情況,包括term的起始位置和結束位置(即所謂的偏 * 移量),位置增量,term字元串,term字元串類型(字元串/阿拉伯數字之類的))
public class analyzerutils {
public static void displaytokens(analyzer analyzer,string text) throws ioexception {
tokenstream tokenstream = analyzer.tokenstream("text", text);
displaytokens(tokenstream);
public static void displaytokens(tokenstream tokenstream) throws ioexception {
offsetattribute offsetattribute = tokenstream.addattribute(offsetattribute.class);
positionincrementattribute positionincrementattribute = tokenstream.addattribute(positionincrementattribute.class);
chartermattribute chartermattribute = tokenstream.addattribute(chartermattribute.class);
typeattribute typeattribute = tokenstream.addattribute(typeattribute.class);
tokenstream.reset();
int position = 0;
while (tokenstream.incrementtoken()) {
int increment = positionincrementattribute.getpositionincrement();
if(increment > 0) {
position = position + increment;
system.out.print(position + ":");
}
int startoffset = offsetattribute.startoffset();
int endoffset = offsetattribute.endoffset();
string term = chartermattribute.tostring();
system.out.println("[" + term + "]" + ":(" + startoffset + "-->" + endoffset + "):" + typeattribute.type());
/**
* 斷言分詞結果
* @param analyzer
* @param text 源字元串
* @param expecteds 期望分詞後結果
* @throws ioexception
*/
public static void assertanalyzerto(analyzer analyzer,string text,string[] expecteds) throws ioexception {
for(string expected : expecteds) {
assert.asserttrue(tokenstream.incrementtoken());
assert.assertequals(expected, chartermattribute.tostring());
assert.assertfalse(tokenstream.incrementtoken());
tokenstream.close();
以上代碼都是lucene in action這本書裡面的示例代碼,我只不過是基於lucene5.x把它重寫並調試成功了,特此分享,希望對正在學習lucene5的童鞋們有所幫助。demo代碼我會在底下附件裡上傳,有需要demo源碼的請自己在底下的附件裡下載,lucene in action這本書的隨書源碼我已上傳到我的百度網盤,也一併分享給大家,lucene in action隨書源碼百度網盤下載地址:
千言萬語都在代碼中,就不多說了,打完收工!
如果你還有什麼問題請加我Q-q:7-3-6-0-3-1-3-0-5,
或者加裙
一起交流學習!
轉載:http://iamyida.iteye.com/blog/2197355