索引命中效果不佳,采用IK分詞與二進制分詞的效果都不是特别好,于是設計了新的自定義分詞器,先将句子用IK分詞分開,再對長度超過3的詞進行二進制分詞。
以下是分詞器的實作效果圖。
![](https://img.laitimes.com/img/9ZDMuAjOiMmIsIjOiQnIsIyN3gDNykjM1EzMykDM1EDMy8CX0Vmbu4GZzNmLn9Gbi1yZtl2Lc9CX6MHc0RHaiojIsJye.jpg)
實作思路:先建立IK分詞器,再通過第一層filter将IK分的詞截留,長度大于等于3的詞置入CJK分詞器進行處理,然後得到的結果送入第二層filter中進行去重,因為在上一層中會出現大量重複詞。下面貼代碼。
package com.huang.analyzer;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.cjk.CJKTokenizer;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.core.UpperCaseFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.wltea.analyzer.lucene.IKTokenizer;
/**
 * Custom analyzer combining IK word segmentation with CJK bigram expansion.
 *
 * Chain: IKTokenizer (smart mode off) -> NormsFilter (re-tokenizes terms of
 * length >= 3 into CJK bigrams, emitted after the original term)
 * -> DistinctFilter (drops terms already emitted earlier in the stream).
 */
public class MyAnalyzer extends Analyzer {

    /** Sample sentence used by the {@link #main} demo. */
    private static final String text = "今天是個好日子";

    /**
     * Builds the token stream for a field.
     *
     * @param fieldname field name (unused by this analyzer)
     * @param in        reader over the field content
     * @return components wiring the IK tokenizer to the two filters
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldname, Reader in) {
        Tokenizer tokenizer = new IKTokenizer(in, false);
        TokenStream normsFilter = new NormsFilter(tokenizer);
        TokenStream distinctFilter = new DistinctFilter(normsFilter);
        return new TokenStreamComponents(tokenizer, distinctFilter);
    }

    /**
     * Demo entry point: tokenizes the sample text and prints each term with
     * its start/end offsets, e.g. {@code 今天(0-2)}.
     */
    public static void main(String[] args) {
        MyAnalyzer myAnalyzer = new MyAnalyzer();
        StringReader reader = new StringReader(text);
        TokenStream ts = null;
        try {
            ts = myAnalyzer.tokenStream("", reader);
            // addAttribute returns the existing instance when already present,
            // and never throws for a missing one (unlike getAttribute).
            CharTermAttribute cta = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(cta.toString() + "(" + offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + ")");
            }
            ts.end();
            System.out.println();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close the stream even when tokenization fails part-way through.
            if (ts != null) {
                try {
                    ts.close();
                } catch (IOException ignored) {
                    // best effort on cleanup
                }
            }
            reader.close();
        }
    }
}
package com.huang.analyzer;
import java.io.IOException;
import java.io.StringReader;
import java.nio.CharBuffer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cjk.CJKTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/**
 * First-stage filter of the custom analyzer.
 *
 * For every term produced by the upstream (IK) tokenizer it first emits the
 * term unchanged; if that term is 3 characters or longer it then re-tokenizes
 * the term text with {@link CJKTokenizer} and emits each resulting
 * two-character bigram as an additional token.
 *
 * Implemented as a small state machine driven by {@code hasCurOut} ("the
 * original term has been emitted") and {@code norms} ("currently draining the
 * CJK bigrams of the previous term").
 *
 * NOTE(review): several identifiers and comments ("pinyin") suggest this class
 * was adapted from a pinyin-conversion filter; the values handled here are CJK
 * bigrams, not pinyin.
 */
public class NormsFilter extends TokenFilter{
/** Minimum term length to expand. NOTE(review): never assigned or read — dead field. */
private int minTermLength;
/** True once the current input term itself has been emitted. */
private boolean hasCurOut;
/** Buffered copy of the current input term; null means "fetch the next token". */
private char[] curTermBuffer;
/** Length of the buffered term. */
private int curTermLength;
/** Term text attribute (the cast is redundant; addAttribute is already typed). */
private final CharTermAttribute termAtt = (CharTermAttribute) addAttribute(CharTermAttribute.class);
/** Position increment attribute. */
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
/** Token type attribute. */
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
/** Offset attribute. */
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
/** Bigrams produced by the CJK re-tokenization of the current term. */
private Collection<String> terms;
/** Iterator over {@link #terms}; non-null while bigrams are being drained. */
private Iterator<String> termIte;
/** Start offset of the current bigram, advanced one character per bigram. */
private int firstStartOffset;
/** NOTE(review): never assigned or read — dead field. */
private int secondStartOffset;
/** True while in the "drain CJK bigrams" state. */
private boolean norms;
protected NormsFilter(TokenStream input) {
super(input);
}
/**
 * Emits the next token: either the next upstream term, or the next CJK
 * bigram of the previously emitted term.
 *
 * @return true if a token was produced, false at end of stream
 */
@Override
public boolean incrementToken() throws IOException {
// Loop until a token is produced or the upstream is exhausted.
while (true) {
if (this.curTermBuffer == null) {
// No buffered term: pull the next token from upstream.
if (!this.input.incrementToken()) {
// No further input; end this filter's stream as well.
return false;
}
this.curTermBuffer = ((char[]) this.termAtt.buffer().clone());
this.curTermLength = this.termAtt.length();
}
// State 1: the original input term has not been emitted yet.
if (!hasCurOut && (this.termIte == null)) {
// Mark it emitted so the next call moves on to expansion.
this.hasCurOut = true;
// Re-write the buffered term into the attributes (upstream may have
// been advanced since the buffer was taken).
this.termAtt.copyBuffer(this.curTermBuffer, 0, this.curTermLength);
this.posIncrAtt.setPositionIncrement(this.posIncrAtt.getPositionIncrement());
this.typeAtt.setType("CN_WORD");
if (curTermLength <= 2) {
// Short terms get no bigram expansion: reset for the next token.
this.curTermBuffer = null;
this.hasCurOut = false;
}
return true;
}
String text = this.termAtt.toString();
// State 2: decide whether the just-emitted term needs bigram expansion.
if (norms == false) {
if (text.length() >= 3) {
// Remember where the term starts in the original sentence; the
// bigrams' offsets are derived from it below.
this.firstStartOffset = offsetAtt.startOffset();
// Enter the bigram-draining state.
norms = true;
// Collect the CJK bigrams of this term.
Collection<String> coll = new ArrayList<String>();
// Run the CJK tokenizer over the term text.
// NOTE(review): not closed in a finally block — leaks on exception.
StringReader reader = new StringReader(text);
Tokenizer cjk = new CJKTokenizer(reader);
cjk.reset();
while (cjk.incrementToken()) {
coll.add(cjk.getAttribute(CharTermAttribute.class).toString());
}
cjk.end();
cjk.close();
this.terms = coll;
if (this.terms != null) {
this.termIte = this.terms.iterator();
}
}else {
// Length exactly... NOTE(review): this branch re-emits the buffered
// term and clears state; it appears unreachable for terms <= 2
// (handled in state 1) — confirm whether it ever fires.
this.termAtt.copyBuffer(curTermBuffer, 0, curTermLength);
this.posIncrAtt.setPositionIncrement(this.posIncrAtt.getPositionIncrement());
this.typeAtt.setType("CN_WORD");
this.curTermBuffer = null;
return true;
}
}
// State 3: drain the CJK bigrams one per call.
if (norms == true) {
// NOTE(review): if the CJK tokenizer produced no tokens (empty
// iterator), this while is skipped and the outer loop spins forever.
while (this.termIte.hasNext()) {
// "pinyin" is a leftover name: this is the next CJK bigram.
String pinyin = this.termIte.next();
this.termAtt.copyBuffer(pinyin.toCharArray(), 0, pinyin.length());
this.posIncrAtt.setPositionIncrement(this.posIncrAtt.getPositionIncrement());
this.typeAtt.setType("CN_WORD");
// Each bigram spans [start, start + len) and starts one character
// after the previous one — assumes CJK bigrams advance one char
// per token; TODO confirm for non-CJK content inside the term.
this.offsetAtt.setOffset(this.firstStartOffset, this.firstStartOffset + pinyin.length());
this.firstStartOffset++;
if (!this.termIte.hasNext()) {
// Last bigram emitted: reset all state so the next call pulls a
// fresh token from upstream.
this.curTermBuffer = null;
this.termIte = null;
this.hasCurOut = false;
this.firstStartOffset = 0;
norms = false;
}
return true;
}
}
}
}
}
package com.huang.analyzer;
import java.io.IOException;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.Set;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/**
 * Second-stage filter of the custom analyzer: drops any term that has already
 * been emitted earlier in the stream, removing the duplicates produced by the
 * upstream CJK bigram expansion.
 *
 * Seen terms are remembered for the whole stream and cleared only on
 * {@link #reset()}.
 */
public class DistinctFilter extends TokenFilter{

    /**
     * Terms already emitted. A hash-based set gives O(1) membership checks;
     * the previous LinkedList made each check O(n) and the stream O(n^2).
     * LinkedHashSet keeps the original insertion-order semantics.
     */
    private final Set<String> terms;

    /** Buffered copy of the current input term; null means "fetch the next token". */
    private char[] curTermBuffer;

    /** Length of the buffered term. */
    private int curTermLength;

    /** Term text attribute. */
    private final CharTermAttribute termAtt = (CharTermAttribute) addAttribute(CharTermAttribute.class);

    /** Position increment attribute. */
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

    /** Token type attribute. */
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

    protected DistinctFilter(TokenStream input) {
        super(input);
        this.terms = new LinkedHashSet<String>();
    }

    /**
     * Emits the next not-yet-seen token from the upstream filter.
     *
     * @return true if a token was produced, false at end of stream
     */
    @Override
    public boolean incrementToken() throws IOException {
        while (true) {
            if (this.curTermBuffer == null) {
                // Pull the next token from upstream; end of input ends this stream too.
                if (!this.input.incrementToken()) {
                    return false;
                }
                this.curTermBuffer = ((char[]) this.termAtt.buffer().clone());
                this.curTermLength = this.termAtt.length();
            }
            String text = this.termAtt.toString();
            if (terms.contains(text)) {
                // Duplicate: discard and fetch the next token.
                this.curTermBuffer = null;
                continue;
            }
            terms.add(text);
            // First occurrence: forward it, normalizing the type to CN_WORD.
            this.termAtt.copyBuffer(this.curTermBuffer, 0, this.curTermLength);
            this.posIncrAtt.setPositionIncrement(this.posIncrAtt.getPositionIncrement());
            this.typeAtt.setType("CN_WORD");
            this.curTermBuffer = null;
            return true;
        }
    }

    /** Resets the filter state along with the upstream stream. */
    @Override
    public void reset() throws IOException {
        super.reset();
        this.curTermBuffer = null;
        terms.clear();
    }
}