
Lucene 5 Learning: A Simple Custom Synonym Analyzer Example

Everyone should already understand why synonym support matters in full-text search, so today I put together a simple synonym analysis demo. Most of the code is adapted from the companion code of the (English) book Lucene in Action; the book's latest sample code only goes up to Lucene 4.x, so it needs a few small changes for Lucene 5.x. Those changes are sketched first, followed by the complete custom synonym analyzer demo based on Lucene 5.x.
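The two API changes that matter here: in Lucene 5.x, Analyzer.createComponents no longer receives a Reader (the analyzer hands the input to the tokenizer via setReader), and the setTermBuffer()/term() calls from earlier versions are replaced by CharTermAttribute's copyBuffer()/toString(). A minimal sketch of the resulting 5.x skeleton, with a placeholder class name used purely for illustration:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;

// Minimal Lucene 5.x Analyzer skeleton (illustration only; the real demo classes follow).
public class MinimalLucene5Analyzer extends Analyzer {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // No Reader parameter here, and StandardTokenizer is built without one;
        // the Analyzer passes the input to the tokenizer later via setReader().
        Tokenizer tokenizer = new StandardTokenizer();
        return new TokenStreamComponents(tokenizer);
    }
}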


package com.yida.framework.lucene5.analyzer.synonym;

import java.io.IOException;

/**
 * Synonym lookup engine
 * @author lanxiaowei
 */
public interface SynonymEngine {
    String[] getSynonyms(String s) throws IOException;
}


package com.yida.framework.lucene5.analyzer.synonym;

import java.io.IOException;
import java.util.HashMap;

/**
 * A simple in-memory synonym engine backed by a HashMap
 * @author lanxiaowei
 */
public class BaseSynonymEngine implements SynonymEngine {
    private static HashMap<String, String[]> map = new HashMap<String, String[]>();

    static {
        map.put("quick", new String[] {"fast", "speedy"});
        map.put("jumps", new String[] {"leaps", "hops"});
        map.put("over", new String[] {"above"});
        map.put("lazy", new String[] {"apathetic", "sluggish"});
        map.put("dog", new String[] {"canine", "pooch"});
    }

    public String[] getSynonyms(String s) throws IOException {
        return map.get(s);
    }
}


package com.yida.framework.lucene5.analyzer.synonym;

import java.io.IOException;
import java.util.Stack;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * Custom synonym filter
 * @author lanxiaowei
 */
public class SynonymFilter extends TokenFilter {
    public static final String TOKEN_TYPE_SYNONYM = "SYNONYM";

    private Stack<String> synonymStack;
    private SynonymEngine engine;
    private AttributeSource.State current;

    private final CharTermAttribute termAtt;
    private final PositionIncrementAttribute posIncrAtt;

    public SynonymFilter(TokenStream in, SynonymEngine engine) {
        super(in);
        synonymStack = new Stack<String>(); // #1 define the synonym buffer
        this.engine = engine;
        this.termAtt = addAttribute(CharTermAttribute.class);
        this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (synonymStack.size() > 0) { // #2 pop buffered synonyms
            String syn = synonymStack.pop();
            restoreState(current);
            // Lucene 4.x style:
            // termAtt.setTermBuffer(syn);
            // Lucene 5.x style:
            termAtt.copyBuffer(syn.toCharArray(), 0, syn.length());
            posIncrAtt.setPositionIncrement(0); // #3 set position increment to 0
            return true;
        }

        if (!input.incrementToken()) { // #4 read the next token
            return false;
        }

        if (addAliasesToStack()) { // #5 push synonyms onto the stack
            current = captureState(); // #6 save the current token
        }
        return true; // #7 return the current token
    }

    private boolean addAliasesToStack() throws IOException {
        // Lucene 4.x style:
        // String[] synonyms = engine.getSynonyms(termAtt.term()); // #8 retrieve synonyms
        // Lucene 5.x style:
        String[] synonyms = engine.getSynonyms(termAtt.toString()); // #8 retrieve synonyms
        if (synonyms == null) {
            return false;
        }
        for (String synonym : synonyms) { // #9 push synonyms onto the stack
            synonymStack.push(synonym);
        }
        return true;
    }
}


package com.yida.framework.lucene5.analyzer.synonym;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

/**
 * Custom synonym analyzer
 * @author lanxiaowei
 * @createTime 2015-03-31 10:15:23
 */
public class SynonymAnalyzer extends Analyzer {
    private SynonymEngine engine;

    public SynonymAnalyzer(SynonymEngine engine) {
        this.engine = engine;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // Lucene 5.x: createComponents only receives the field name; the Reader parameter is gone
        Tokenizer tokenizer = new StandardTokenizer();
        TokenStream tokenStream = new SynonymFilter(tokenizer, engine);
        tokenStream = new LowerCaseFilter(tokenStream);
        tokenStream = new StopFilter(tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
        return new TokenStreamComponents(tokenizer, tokenStream);
    }
}


package com.yida.framework.lucene5.analyzer.synonym;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;

import com.yida.framework.lucene5.util.AnalyzerUtils;

public class SynonymAnalyzerTest {
    public static void main(String[] args) throws IOException {
        String text = "the quick brown fox jumps over the lazy dog";
        Analyzer analyzer = new SynonymAnalyzer(new BaseSynonymEngine());
        AnalyzerUtils.displayTokens(analyzer, text);
    }
}
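Beyond printing the token stream, it is worth seeing the practical effect of index-time expansion. The sketch below is my own addition rather than part of the original demo: it indexes the sample sentence with SynonymAnalyzer and then searches for a term that never occurs in the text. The class name, the field name "content", and the RAMDirectory are arbitrary choices for illustration, and the class is assumed to live in the same package as the demo classes. Assuming Lucene 5.x, the TermQuery for "hops" should match because the filter indexed it at the same position as "jumps".

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.RAMDirectory;

public class SynonymSearchDemo {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new SynonymAnalyzer(new BaseSynonymEngine());

        // Index one document with the synonym analyzer (RAMDirectory keeps the demo self-contained).
        RAMDirectory directory = new RAMDirectory();
        IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(analyzer));
        Document doc = new Document();
        doc.add(new TextField("content", "the quick brown fox jumps over the lazy dog", Store.YES));
        writer.addDocument(doc);
        writer.close();

        // "hops" never appears in the text, but it was indexed as a synonym of "jumps".
        IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(directory));
        TopDocs topDocs = searcher.search(new TermQuery(new Term("content", "hops")), 1);
        System.out.println("hits for hops: " + topDocs.totalHits);
    }
}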


package com.yida.framework.lucene5.util;

import java.io.IOException;

import junit.framework.Assert;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

/**
 * A small utility class for testing analyzers: for every token it prints the
 * position, the term text, the start and end offsets, and the token type
 * (e.g. plain word vs. number).
 */
public class AnalyzerUtils {

    public static void displayTokens(Analyzer analyzer, String text) throws IOException {
        TokenStream tokenStream = analyzer.tokenStream("text", text);
        displayTokens(tokenStream);
    }

    public static void displayTokens(TokenStream tokenStream) throws IOException {
        OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);

        tokenStream.reset();
        int position = 0;
        while (tokenStream.incrementToken()) {
            int increment = positionIncrementAttribute.getPositionIncrement();
            if (increment > 0) {
                position = position + increment;
                System.out.print(position + ":");
            }
            int startOffset = offsetAttribute.startOffset();
            int endOffset = offsetAttribute.endOffset();
            String term = charTermAttribute.toString();
            System.out.println("[" + term + "]" + ":(" + startOffset + "-->" + endOffset + "):" + typeAttribute.type());
        }
        tokenStream.end();
        tokenStream.close();
    }

    /**
     * Asserts the analysis result.
     * @param analyzer    the analyzer under test
     * @param text        the source string
     * @param expecteds   the expected terms after analysis, in order
     * @throws IOException
     */
    public static void assertAnalyzerTo(Analyzer analyzer, String text, String[] expecteds) throws IOException {
        TokenStream tokenStream = analyzer.tokenStream("text", text);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        for (String expected : expecteds) {
            Assert.assertTrue(tokenStream.incrementToken());
            Assert.assertEquals(expected, charTermAttribute.toString());
        }
        Assert.assertFalse(tokenStream.incrementToken());
        tokenStream.end();
        tokenStream.close();
    }
}
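As a quick sanity check that ties the pieces together, assertAnalyzerTo can verify the expanded terms directly. A minimal sketch (the class name is mine; the expected terms follow the BaseSynonymEngine mappings above, with synonyms emerging in the stack's pop order, i.e. the reverse of the order they were registered):

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;

import com.yida.framework.lucene5.analyzer.synonym.BaseSynonymEngine;
import com.yida.framework.lucene5.analyzer.synonym.SynonymAnalyzer;
import com.yida.framework.lucene5.util.AnalyzerUtils;

public class SynonymAnalyzerAssertTest {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new SynonymAnalyzer(new BaseSynonymEngine());
        // "fox" has no mapping and passes through; "jumps" is followed by its synonyms,
        // popped off the stack in the reverse of the order they were registered.
        AnalyzerUtils.assertAnalyzerTo(analyzer, "fox jumps",
                new String[] {"fox", "jumps", "hops", "leaps"});
        System.out.println("Synonym expansion looks correct.");
    }
}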

All of the code above comes from the Lucene in Action sample code; I have simply rewritten it against Lucene 5.x and debugged it until it ran, and I am sharing it here in the hope that it helps others learning Lucene 5. I will upload the demo code as an attachment below; if you need the demo source, please download it from the attachment there. I have also uploaded the companion source code of Lucene in Action to my Baidu netdisk and share it as well; the Baidu netdisk download link for the Lucene in Action companion source code:

The code speaks for itself, so I will not say much more. That's a wrap!

If you still have questions, add me on QQ: 7-3-6-0-3-1-3-0-5, or join the QQ group, and let's discuss and learn together!

Reposted from: http://iamyida.iteye.com/blog/2197355