天天看點

從零開始搭建.NET Core版搜尋引擎(五):1.查詢多個字段的檢索方法;2.可設定權重的檢索方法;3.測試示例

目前的查詢方法過於簡單,而且無法與實際業務中的實體建立關係,因此本篇文章就來描述對查詢方法的擴充。

1.查詢多個字段的檢索方法

1.1.定義接口及輸入輸出項

查詢輸入項SingleSearchOption:

/// <summary>
/// Input options for the plain multi-field search: one keyword looked up in a set of index fields.
/// </summary>
public class SingleSearchOption:SearchOptionBase
{
    /// <summary>
    /// Hit limit used when the caller does not supply one.
    /// </summary>
    private const int DefaultMaxHits = 100;

    /// <summary>
    /// Search keyword.
    /// </summary>
    public string Keyword { get; set; }

    /// <summary>
    /// Index fields the search is restricted to.
    /// Starts as an empty list (not null) so that consumers calling
    /// <c>Fields.ToArray()</c> do not fail on a default-constructed instance.
    /// </summary>
    public List<string> Fields { get; set; } = new List<string>();

    /// <summary>
    /// Creates a fully specified option set.
    /// </summary>
    /// <param name="keyword">Search keyword; must not be null, empty or whitespace.</param>
    /// <param name="fields">Fields to search; null is treated as an empty list.</param>
    /// <param name="maxHits">Maximum number of hits to retrieve.</param>
    /// <exception cref="ArgumentException">Thrown when <paramref name="keyword"/> is blank.</exception>
    public SingleSearchOption(string keyword,List<string> fields,int maxHits=DefaultMaxHits)
    {
        if (string.IsNullOrWhiteSpace(keyword))
        {
            throw new ArgumentException("搜尋關鍵詞不能為空");
        }

        Keyword = keyword;
        Fields = fields ?? new List<string>();
        MaxHits = maxHits;
    }

    /// <summary>
    /// Creates an empty option set whose properties are filled in afterwards
    /// (for example by an object initializer).
    /// </summary>
    public SingleSearchOption()
    {
        // Same default as the other constructor, so the hit limit is never left at 0.
        MaxHits = DefaultMaxHits;
    }
}
           

其中SearchOptionBase:

/// <summary>
/// Base class for search option types; carries the settings shared by every search.
/// </summary>
public class SearchOptionBase : ISearchOption
{
    /// <summary>
    /// Maximum number of hits to retrieve.
    /// Defaults to 100, the same default the option constructors use, so that an
    /// instance created without arguments does not carry a limit of 0
    /// (Lucene's top-docs search rejects a non-positive hit count).
    /// </summary>
    public int MaxHits { get ; set; } = 100;
}
           

輸出結果SingleSearchResult:

/// <summary>
/// Result envelope returned by the plain (unweighted) search.
/// </summary>
public class SingleSearchResult : ISearchResult<SearchResultItem>
{
    /// <summary>
    /// Matched items; a new instance starts with an empty list.
    /// </summary>
    public List<SearchResultItem> Items { get; set; } = new List<SearchResultItem>();

    /// <summary>
    /// Time the search took.
    /// </summary>
    public long Elapsed { get; set; }

    /// <summary>
    /// Number of matched results.
    /// </summary>
    public int TotalHits { get; set; }
}
           

其中查詢結果項SearchResultItem:

/// <summary>
/// One hit of a search: the relevance score plus the identifiers needed to
/// locate the matching business entity.
/// </summary>
public class SearchResultItem : ISearchResultItem
{
    /// <summary>
    /// Relevance score of this hit.
    /// </summary>
    public float Score { get; set; }
    /// <summary>
    /// Id of the entity the hit refers to.
    /// </summary>
    public string EntityId { get; set; }
    /// <summary>
    /// Class name of the entity the hit refers to.
    /// </summary>
    public string EntityName { get; set; }
 
}
           

1.2.方法實作

/// <summary>
/// Plain search: parses the keyword against every field listed in
/// <paramref name="option"/> and returns the top hits.
/// </summary>
/// <param name="option">Keyword, fields to search and maximum number of hits.</param>
/// <returns>
/// The matched items (score, entity id, entity type name), the number of items
/// returned and the elapsed time in milliseconds.
/// </returns>
public SingleSearchResult SingleSearch(SingleSearchOption option)
{
    SingleSearchResult result = new SingleSearchResult();
    Stopwatch watch = Stopwatch.StartNew();
    using (DirectoryReader reader = DirectoryReader.Open(Directory))
    {
        // The searcher lives only as long as this reader.
        IndexSearcher searcher = new IndexSearcher(reader);
        var queryParser = new MultiFieldQueryParser(LuceneVersion.LUCENE_48, option.Fields.ToArray(), Analyzer);
        Query query = queryParser.Parse(option.Keyword);
        var matches = searcher.Search(query, option.MaxHits).ScoreDocs;
        result.TotalHits = matches.Length;
        foreach (var match in matches)
        {
            var doc = searcher.Doc(match.Doc);
            result.Items.Add(new SearchResultItem
            {
                Score = match.Score,
                // Document.Get yields null for a missing field instead of throwing,
                // which keeps this method consistent with ScoredSearch.
                EntityId = doc.Get(CoreConstant.EntityId),
                EntityName = doc.Get(CoreConstant.EntityType)
            });
        }
    }
    watch.Stop();
    result.Elapsed = watch.ElapsedMilliseconds;
    return result;
}
           

其中實體標識EntityId、實體類名EntityName這兩個域是在建立索引時添加進去的,這樣可確保每個Document和資料庫中的每條記錄都能通過Id互相找到。

從零開始搭建.NET Core版搜尋引擎(五)1.查詢多個字段的檢索方法2.可設定權重的檢索方法3.測試示例

2.可設定權重的檢索方法

2.1.定義接口及輸入輸出項

輸入項為:

/// <summary>
/// Input options for the weighted search: keyword, fields, per-field boosts,
/// a minimum score and an optional filter.
/// </summary>
public class ScoredSearchOption:SearchOptionBase
{
    /// <summary>
    /// Boost given to a searched field that has no explicit boost.
    /// </summary>
    private const float DefaultBoost = 2.0f;

    /// <summary>
    /// Search keyword.
    /// </summary>
    public string Keyword { get; set; }

    /// <summary>
    /// Index fields the search is restricted to.
    /// </summary>
    public List<string> Fields { get; set; }

    /// <summary>
    /// Per-field boosts. Keys are compared case-insensitively so that the
    /// "already has a boost" check and the later lookup by field name agree.
    /// </summary>
    private readonly Dictionary<string, float> _boosts;

    /// <summary>
    /// Per-field boosts with every entry of <see cref="Fields"/> present:
    /// fields without an explicit boost are added with the default boost.
    /// </summary>
    internal Dictionary<string, float> Boosts
    {
        get
        {
            if (Fields != null)
            {
                foreach (var field in Fields)
                {
                    // Null entries cannot be dictionary keys; skip them here.
                    if (field != null && !_boosts.ContainsKey(field))
                    {
                        _boosts.Add(field, DefaultBoost);
                    }
                }
            }

            return _boosts;
        }
    }

    /// <summary>
    /// Minimum score a hit must reach to be returned (documented range 0-1;
    /// the higher the value, the stricter the match).
    /// </summary>
    public float Score { get; set; } = 0.5f;

    /// <summary>
    /// Optional filter applied to the search.
    /// </summary>
    public Filter Filter { get; set; }

    /// <summary>
    /// Creates the option set.
    /// </summary>
    /// <param name="keyword">Search keyword; must not be null, empty or whitespace.</param>
    /// <param name="fields">Fields to search; null is treated as an empty list.</param>
    /// <param name="maxHits">Maximum number of hits to retrieve.</param>
    /// <param name="boosts">
    /// Optional initial boosts. The entries are copied, so the caller's
    /// dictionary is not modified by this object.
    /// </param>
    /// <exception cref="ArgumentException">Thrown when <paramref name="keyword"/> is blank.</exception>
    public ScoredSearchOption(string keyword,List<string> fields,int maxHits=100,Dictionary<string,float> boosts=null)
    {
        if (string.IsNullOrWhiteSpace(keyword))
        {
            throw new ArgumentException("搜尋關鍵詞不能為空");
        }

        Keyword = keyword;
        Fields = fields ?? new List<string>();
        MaxHits = maxHits;

        _boosts = new Dictionary<string, float>(StringComparer.OrdinalIgnoreCase);
        if (boosts != null)
        {
            foreach (var pair in boosts)
            {
                // Indexer assignment: keys differing only by case collapse, last one wins.
                _boosts[pair.Key] = pair.Value;
            }
        }
    }

    /// <summary>
    /// Sets (or replaces) the boost of one field.
    /// </summary>
    /// <param name="field">Field name.</param>
    /// <param name="boost">Boost to apply to that field.</param>
    public void SetBoosts(string field,float boost)
    {
        _boosts[field] = boost;
    }

}
           

輸出項為:

/// <summary>
/// Result envelope returned by the weighted search.
/// </summary>
public class ScoredSearchResult : ISearchResult<SearchResultItem>
{
    /// <summary>
    /// Matched items; a new instance starts with an empty list.
    /// </summary>
    public List<SearchResultItem> Items { get; set; } = new List<SearchResultItem>();

    /// <summary>
    /// Time the search took.
    /// </summary>
    public long Elapsed { get;set;}

    /// <summary>
    /// Number of matched results.
    /// </summary>
    public int TotalHits { get; set; }
}
           

2.2.方法實作

/// <summary>
/// Weighted search: splits the keyword into terms, combines them into one
/// boolean query with per-field boosts, and returns the hits whose score
/// reaches the requested minimum.
/// </summary>
/// <param name="option">Keyword, fields, boosts, minimum score, filter and hit limit.</param>
/// <returns>
/// The accepted items (score, entity id, entity type name), their count and
/// the elapsed time in milliseconds.
/// </returns>
public ScoredSearchResult ScoredSearch(ScoredSearchOption option)
{
    ScoredSearchResult result = new ScoredSearchResult();
    Stopwatch watch = Stopwatch.StartNew();

    using (DirectoryReader reader = DirectoryReader.Open(Directory))
    {
        IndexSearcher searcher = new IndexSearcher(reader);
        var queryParser = new MultiFieldQueryParser(LuceneVersion.LUCENE_48, option.Fields.ToArray(), Analyzer, option.Boosts);
        var terms = Cut(option.Keyword);
        Query query = QueryExpression(queryParser, terms);
        Sort sort = new Sort(SortField.FIELD_SCORE);
        float minScore = option.Score;

        // The two trailing flags request per-hit scores and the max score;
        // the per-hit score is needed for the minimum-score check below.
        var hits = searcher.Search(query, option.Filter, option.MaxHits, sort, true, true).ScoreDocs;

        foreach (var match in hits)
        {
            // Plain comparison instead of compiling an expression tree on every call.
            if (match.Score >= minScore)
            {
                var doc = searcher.Doc(match.Doc);
                result.Items.Add(new SearchResultItem
                {
                    Score = match.Score,
                    EntityId = doc.Get(CoreConstant.EntityId),
                    EntityName = doc.Get(CoreConstant.EntityType)
                });
            }
        }

        // Every accepted hit became exactly one item, so no second pass over the hits is needed.
        result.TotalHits = result.Items.Count;
    }
    watch.Stop();
    result.Elapsed = watch.ElapsedMilliseconds;
    return result;
}
           

其中私有方法Cut用于關鍵詞的分割:

/// <summary>
/// Splits a raw keyword into at most ten candidate search terms.
/// </summary>
/// <remarks>
/// Candidates are: the keyword as typed, each double-quoted phrase (quotes kept),
/// each exclusion term (leading '-' kept), runs of CJK characters, English words,
/// and letter+digit combinations together with their letter and digit parts.
/// Candidates shorter than two characters are dropped, duplicates removed, and
/// the ten longest are returned, longest first.
/// </remarks>
/// <param name="keyword">Raw keyword typed by the user.</param>
/// <returns>
/// The candidate terms; an empty list for a null or blank keyword; the keyword
/// alone when it is at most two characters long.
/// </returns>
private List<string> Cut(string keyword)
{
    if (string.IsNullOrWhiteSpace(keyword))
    {
        return new List<string>();
    }

    List<string> result = new List<string> { keyword };
    if (keyword.Length <= 2)
    {
        // Too short to be worth splitting.
        return result;
    }

    // Operator shorthands: AND -> '+', NOT -> '-', OR -> space.
    // The look-behind keeps words that merely end in these letters
    // ("BRAND ", "KNOT ", "FOR ") from being rewritten.
    keyword = Regex.Replace(keyword, @"(?<![A-Za-z0-9])AND ", "+");
    keyword = Regex.Replace(keyword, @"(?<![A-Za-z0-9])NOT ", "-");
    keyword = Regex.Replace(keyword, @"(?<![A-Za-z0-9])OR ", " ");

    // Required phrases: each "..." pair separately (a greedy ".+" would merge
    // everything between the first and the last quote into one phrase).
    List<string> phrases = Regex.Matches(keyword, @"""[^""]+""").Cast<Match>().Select(m => m.Value).ToList();
    foreach (string phrase in phrases)
    {
        keyword = keyword.Replace(phrase, "");
    }
    result.AddRange(phrases);

    // Excluded terms: a '-' at the start of a token and the token itself only,
    // not the remainder of the keyword.
    List<string> exclusions = Regex.Matches(keyword, @"(?<!\S)-\S+").Cast<Match>().Select(m => m.Value).ToList();
    foreach (string exclusion in exclusions)
    {
        keyword = keyword.Replace(exclusion, "");
    }
    result.AddRange(exclusions);

    // Runs of CJK characters.
    result.AddRange(Regex.Matches(keyword, @"[\u4e00-\u9fa5]+").Cast<Match>().Select(m => m.Value));
    // English words, optionally with adjacent punctuation/symbols.
    result.AddRange(Regex.Matches(keyword, @"\p{P}?[A-Z]*[a-z]*[\p{P}|\p{S}]*").Cast<Match>().Select(m => m.Value));
    // Letters followed by digits: the whole match plus both parts.
    // [A-Za-z] rather than [A-z], which would also accept [ \ ] ^ _ and `.
    result.AddRange(Regex.Matches(keyword, "([A-Za-z]+)([0-9.]+)").Cast<Match>().SelectMany(m => m.Groups.Cast<Group>().Select(g => g.Value)));

    result.RemoveAll(s => s.Length < 2);
    result = result.Distinct().OrderByDescending(s => s.Length).Take(10).ToList();

    return result;
}
           

私有方法QueryExpression用于查詢語句的拼接:

/// <summary>
/// Combines the split terms into one boolean query.
/// </summary>
/// <remarks>
/// A term starting with a double quote must match; a term starting with '-'
/// must not match; every other term is optional and searched as a fuzzy term.
/// </remarks>
/// <param name="queryParser">Parser used to turn each term into a query.</param>
/// <param name="terms">Terms to combine.</param>
/// <returns>The combined query.</returns>
private BooleanQuery QueryExpression(MultiFieldQueryParser queryParser, List<string> terms)
{
    BooleanQuery query = new BooleanQuery();
    foreach (var term in terms)
    {
        char marker = term.Length > 0 ? term[0] : '\0';
        if (marker == '"')
        {
            // Required phrase: parse the content without the surrounding quotes.
            query.Add(queryParser.Parse(term.Trim('"')), Occur.MUST);
        }
        else if (marker == '-')
        {
            // Parse the bare term. Parsing "-term" already yields a purely negative
            // query, and wrapping that in MUST_NOT would exclude nothing.
            string excluded = term.TrimStart('-');
            if (excluded.Length > 0)
            {
                query.Add(queryParser.Parse(excluded), Occur.MUST_NOT);
            }
        }
        else
        {
            // Optional term: drop any '~' the user typed and append exactly one,
            // which requests a fuzzy match in the query syntax.
            query.Add(queryParser.Parse(term.Replace("~", "") + "~"), Occur.SHOULD);
        }
    }
    return query;
}
           

3.測試示例

寫一個示例方法對簡單查詢進行測試:

/// <summary>
/// Sample usage of the plain search: runs the index search, then loads the
/// entity for every hit by its id.
/// </summary>
/// <param name="option">Options passed through to the search manager.</param>
/// <returns>One loaded entity per hit, in hit order.</returns>
public List<DataContent> SingleSearch(SingleSearchOption option)
{
    SingleSearchResult hits = _searchManager.SingleSearch(option);

    // One repository lookup per hit, in the order the hits were returned.
    return hits.Items.ConvertAll<DataContent>(hit => _repository.Get(hit.EntityId));
}
           

目前索引的查詢和實體的查詢並沒有強關聯,因此實際上是查詢了兩次(先查索引,再按Id逐條查詢實體),後續會考慮根據業務需要將兩者結合起來。

繼續閱讀