Lucene Custom Analysis: Combining IK Segmentation with Bigram Segmentation

Index hits were unsatisfactory: neither IK segmentation nor bigram segmentation alone worked particularly well, so I designed a new custom analyzer that first segments a sentence with IK and then applies bigram segmentation to every term of three or more characters.

The analyzer's output looks like this:

[Figure: sample token output of the combined IK + bigram analyzer]

The idea: create an IK tokenizer first; a first-layer filter intercepts the terms IK produces and feeds every term of three or more characters into a CJK bigram tokenizer; the combined output then passes through a second-layer filter that removes duplicates, since the first layer emits many repeated terms. The code is below.
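
For example, supposing IK splits 「今天是個好日子」 into 今天 / 是 / 個 / 好日子 (the exact split depends on the IK dictionary), only the three-character term 好日子 gets the second pass, yielding the bigrams 好日 and 日子; after deduplication the final terms are 今天, 是, 個, 好日子, 好日, 日子.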

package com.huang.analyzer;

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.wltea.analyzer.lucene.IKTokenizer;

public class MyAnalyzer extends Analyzer {

    private static final String text = "今天是個好日子";

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader in) {
        // Layer 0: IK tokenizer (useSmart = false, so it emits fine-grained terms)
        Tokenizer tokenizer = new IKTokenizer(in, false);
        // Layer 1: re-segment terms of three or more characters into CJK bigrams
        TokenStream normsFilter = new NormsFilter(tokenizer);
        // Layer 2: drop the duplicate terms the bigram layer produces
        TokenStream distinctFilter = new DistinctFilter(normsFilter);
        return new TokenStreamComponents(tokenizer, distinctFilter);
    }

    public static void main(String[] args) {
        MyAnalyzer myAnalyzer = new MyAnalyzer();
        StringReader reader = new StringReader(text);

        // Tokenize the sample text and print each term with its offsets
        try {
            TokenStream ts = myAnalyzer.tokenStream("", reader);
            CharTermAttribute cta = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(cta.toString() + "(" + offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + ")");
            }
            ts.end();
            ts.close();
            reader.close();
            System.out.println();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
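
For reference, here is a minimal sketch of plugging the analyzer into an index writer. This is an assumption-laden example, not part of the original post: it assumes a Lucene 4.x jar matching the createComponents(String, Reader) signature above, and the Version constant, RAMDirectory, and the "content" field name are placeholders for whatever your setup uses.

package com.huang.analyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

// Hypothetical indexing sketch (assumes Lucene 4.9; pick the Version
// constant that matches your jar).
public class IndexDemo {
    public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        Analyzer analyzer = new MyAnalyzer();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
        IndexWriter writer = new IndexWriter(dir, config);

        Document doc = new Document();
        doc.add(new TextField("content", "今天是個好日子", Field.Store.YES));
        writer.addDocument(doc);
        writer.close();
    }
}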
package com.huang.analyzer;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

/**
 * First-layer filter: passes every IK term through unchanged, and
 * additionally re-segments terms of three or more characters into
 * CJK bigrams.
 */
public class NormsFilter extends TokenFilter {

    /** Whether the current input term has already been emitted */
    private boolean hasCurOut;

    /** Buffered copy of the current input term */
    private char[] curTermBuffer;

    /** Length of the current input term */
    private int curTermLength;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    /** Bigrams produced from the current term */
    private Collection<String> terms;

    /** Iterator over the bigram result set */
    private Iterator<String> termIte;

    /** Offset of the current term within the original sentence */
    private int firstStartOffset;

    /** True while the bigrams of the current term are being emitted */
    private boolean norms;

    public NormsFilter(TokenStream input) {
        super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
        while (true) {
            // No buffered term: pull the next one from the IK tokenizer
            if (this.curTermBuffer == null) {
                if (!this.input.incrementToken()) {
                    // No further input: signal the caller that we are done
                    return false;
                }
                this.curTermBuffer = this.termAtt.buffer().clone();
                this.curTermLength = this.termAtt.length();
            }

            // Emit the original term first
            if (!hasCurOut && this.termIte == null) {
                // Mark it as emitted so the next call skips this branch
                this.hasCurOut = true;
                this.termAtt.copyBuffer(this.curTermBuffer, 0, this.curTermLength);
                this.typeAtt.setType("CN_WORD");

                // Terms of one or two characters need no bigram pass
                if (curTermLength <= 2) {
                    this.curTermBuffer = null;
                    this.hasCurOut = false;
                }
                return true;
            }

            String text = this.termAtt.toString();

            // norms == true means we are inside the second (bigram) layer
            if (!norms) {
                if (text.length() >= 3) {
                    // Remember where the term starts in the sentence
                    this.firstStartOffset = offsetAtt.startOffset();
                    // Enter the second layer
                    norms = true;

                    // Run CJKTokenizer over the term to collect its bigrams
                    Collection<String> coll = new ArrayList<String>();
                    StringReader reader = new StringReader(text);
                    Tokenizer cjk = new CJKTokenizer(reader);
                    cjk.reset();
                    while (cjk.incrementToken()) {
                        coll.add(cjk.getAttribute(CharTermAttribute.class).toString());
                    }
                    cjk.end();
                    cjk.close();

                    this.terms = coll;
                    this.termIte = this.terms.iterator();
                } else {
                    // Defensive: short terms are normally handled above
                    this.termAtt.copyBuffer(curTermBuffer, 0, curTermLength);
                    this.typeAtt.setType("CN_WORD");
                    this.curTermBuffer = null;
                    return true;
                }
            }

            // Emit the buffered bigrams, one per call
            if (norms) {
                while (this.termIte.hasNext()) {
                    String bigram = this.termIte.next();
                    this.termAtt.copyBuffer(bigram.toCharArray(), 0, bigram.length());
                    this.typeAtt.setType("CN_WORD");
                    // Each successive bigram starts one character further in
                    this.offsetAtt.setOffset(this.firstStartOffset, this.firstStartOffset + bigram.length());
                    this.firstStartOffset++;

                    if (!this.termIte.hasNext()) {
                        // Last bigram: clear all state so the next call pulls a new term
                        this.curTermBuffer = null;
                        this.termIte = null;
                        this.hasCurOut = false;
                        this.firstStartOffset = 0;
                        norms = false;
                    }
                    return true;
                }
            }
        }
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        this.curTermBuffer = null;
        this.termIte = null;
        this.hasCurOut = false;
        this.norms = false;
        this.firstStartOffset = 0;
    }
}
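
To see what the inner CJK pass does on its own, here is a small standalone sketch (not from the original post): CJKTokenizer emits overlapping two-character grams for runs of CJK text, so a three-character word yields two bigrams.

package com.huang.analyzer;

import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Standalone sketch of the inner bigram pass: for a three-character
// word like 好日子, CJKTokenizer emits the overlapping bigrams 好日 and 日子.
public class CjkDemo {
    public static void main(String[] args) throws Exception {
        Tokenizer cjk = new CJKTokenizer(new StringReader("好日子"));
        CharTermAttribute term = cjk.addAttribute(CharTermAttribute.class);
        cjk.reset();
        while (cjk.incrementToken()) {
            System.out.println(term.toString());
        }
        cjk.end();
        cjk.close();
    }
}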
package com.huang.analyzer;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
 * Second-layer filter: drops terms that have already been emitted.
 * A HashSet keeps the seen-term lookup O(1), where the original
 * LinkedList version did the same job with a linear contains() scan.
 */
public class DistinctFilter extends TokenFilter {

    /** Terms emitted so far */
    private final Set<String> terms;

    /** Term text attribute */
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    public DistinctFilter(TokenStream input) {
        super(input);
        this.terms = new HashSet<String>();
    }

    @Override
    public boolean incrementToken() throws IOException {
        while (true) {
            if (!this.input.incrementToken()) {
                // No further input: signal the caller that we are done
                return false;
            }
            String text = this.termAtt.toString();
            if (terms.contains(text)) {
                // Duplicate: skip it and pull the next term
                continue;
            }
            terms.add(text);
            return true;
        }
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        terms.clear();
    }
}
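
Finally, a quick standalone check of the dedup layer. This sketch is my own addition and assumes a Lucene 4.x WhitespaceTokenizer (whose constructor takes a Version constant); the sample tokens are arbitrary.

package com.huang.analyzer;

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

// Standalone check of DistinctFilter: the duplicated input tokens
// ("foo" and "bar") should each be emitted only once.
public class DistinctDemo {
    public static void main(String[] args) throws Exception {
        Tokenizer src = new WhitespaceTokenizer(Version.LUCENE_4_9, new StringReader("foo bar foo baz bar"));
        TokenStream ts = new DistinctFilter(src);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString());   // prints foo, bar, baz
        }
        ts.end();
        ts.close();
    }
}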