275  
查询码:00000435
基于Lucene.Net构建.NET Core版搜索引擎(五)--查询方法扩展
作者: 潘帅 于 2021年09月29日 发布在分类 / 人防组 / 人防后端 下,并于 2021年09月29日 编辑
Lucene.NET 搜索引擎

目前的查询方法过于简单,而且无法与实际业务中的实体建立关系,因此本篇文章就来描述对查询方法的扩展。


1.查询多个字段的检索方法

1.1.定义接口及输入输出项

查询输入项SingleSearchOption:

    /// <summary>
    /// Input for a plain keyword search across several index fields.
    /// </summary>
    public class SingleSearchOption : SearchOptionBase
    {
        /// <summary>
        /// Keyword (query text) to search for.
        /// </summary>
        public string Keyword { get; set; }

        /// <summary>
        /// Index fields the search is restricted to. Starts out as an empty list, so it
        /// is only null if a caller explicitly assigns null through the setter.
        /// </summary>
        public List<string> Fields { get; set; } = new List<string>();

        /// <summary>
        /// Creates an option that searches <paramref name="keyword"/> in <paramref name="fields"/>.
        /// </summary>
        /// <param name="keyword">Query text; must not be null, empty or whitespace.</param>
        /// <param name="fields">Fields to search; null is stored as an empty list.</param>
        /// <param name="maxHits">Maximum number of hits to retrieve.</param>
        /// <exception cref="ArgumentException">
        /// <paramref name="keyword"/> is null, empty or whitespace.
        /// </exception>
        public SingleSearchOption(string keyword, List<string> fields, int maxHits = 100)
        {
            if (string.IsNullOrWhiteSpace(keyword))
            {
                throw new ArgumentException("搜索关键词不能为空");
            }

            Keyword = keyword;
            // Consumers call Fields.ToArray(); never hand them a null list from here.
            Fields = fields ?? new List<string>();
            MaxHits = maxHits;
        }

        /// <summary>
        /// Creates an empty option whose properties are filled in afterwards.
        /// </summary>
        public SingleSearchOption()
        {
        }
    }
其中SearchOptionBase:

    /// <summary>
    /// Base class for search options; carries the settings shared by every search kind.
    /// </summary>
    public class SearchOptionBase : ISearchOption
    {
        /// <summary>
        /// Maximum number of hits to retrieve. Defaults to 100, the same default the
        /// option constructors use, because the CLR default of 0 is not a usable limit
        /// (Lucene's top-docs collector is documented to require a positive hit count).
        /// </summary>
        public int MaxHits { get; set; } = 100;
    }
输出结果SingleSearchResult:

    /// <summary>
    /// Result of a plain multi-field search.
    /// </summary>
    public class SingleSearchResult : ISearchResult<SearchResultItem>
    {
        /// <summary>
        /// Matched items; a new result starts with an empty list.
        /// </summary>
        public List<SearchResultItem> Items { get; set; } = new List<SearchResultItem>();

        /// <summary>
        /// Time the search took.
        /// </summary>
        public long Elapsed { get; set; }

        /// <summary>
        /// Number of matches.
        /// </summary>
        public int TotalHits { get; set; }
    }
其中查询结果项SearchResultItem:

    /// <summary>
    /// One hit of a search: the score plus the identity of the entity it refers to.
    /// </summary>
    public class SearchResultItem : ISearchResultItem
    {
        /// <summary>Score of this hit.</summary>
        public float Score { get; set; }

        /// <summary>Id of the entity the hit refers to.</summary>
        public string EntityId { get; set; }

        /// <summary>Class name of the entity the hit refers to.</summary>
        public string EntityName { get; set; }
    }

1.2.方法实现

        /// <summary>
        /// Plain search: parses the keyword against all requested fields and returns
        /// the hits together with the elapsed time.
        /// </summary>
        /// <param name="option">Keyword, fields to search and maximum number of hits.</param>
        /// <returns>
        /// The hits (score, entity id, entity class name), their count and the elapsed
        /// time in milliseconds.
        /// </returns>
        public SingleSearchResult SingleSearch(SingleSearchOption option)
        {
            SingleSearchResult result = new SingleSearchResult();
            Stopwatch watch = Stopwatch.StartNew();
            using (DirectoryReader reader = DirectoryReader.Open(Directory))
            {
                // Create the index searcher on top of the opened reader.
                IndexSearcher searcher = new IndexSearcher(reader);
                var queryParser = new MultiFieldQueryParser(LuceneVersion.LUCENE_48, option.Fields.ToArray(), Analyzer);
                Query query = queryParser.Parse(option.Keyword);
                var matches = searcher.Search(query, option.MaxHits).ScoreDocs;
                result.TotalHits = matches.Length;
                foreach (var match in matches)
                {
                    var doc = searcher.Doc(match.Doc);
                    // doc.Get yields null for a missing field instead of throwing the way
                    // GetField(...).GetStringValue() does; ScoredSearch reads the fields the same way.
                    result.Items.Add(new SearchResultItem
                    {
                        Score = match.Score,
                        EntityId = doc.Get(CoreConstant.EntityId),
                        EntityName = doc.Get(CoreConstant.EntityType)
                    });
                }
            }
            watch.Stop();
            result.Elapsed = watch.ElapsedMilliseconds;
            return result;
        }
其中实体标识(EntityId)和实体类名(EntityName)分别取自索引中的CoreConstant.EntityId域和CoreConstant.EntityType域,这两个域是在创建索引时添加进去的,这样可以确保每个Document和数据库中的每条记录都能通过Id互相找到。


2.可设置权重的检索方法

2.1.定义接口及输入输出项

输入项为:

    /// <summary>
    /// Input for a multi-field search in which each field can carry its own boost.
    /// </summary>
    public class ScoredSearchOption : SearchOptionBase
    {
        /// <summary>
        /// Boost given to every field in <see cref="Fields"/> that has no explicit boost.
        /// </summary>
        private const float DefaultBoost = 2.0f;

        /// <summary>
        /// Keyword (query text) to search for.
        /// </summary>
        public string Keyword { get; set; }

        /// <summary>
        /// Index fields the search is restricted to.
        /// </summary>
        public List<string> Fields { get; set; }

        /// <summary>
        /// Per-field boosts. Keys are compared case-insensitively (ordinal), so a boost
        /// registered as "TITLE" is also found when the field is listed as "title".
        /// </summary>
        private readonly Dictionary<string, float> _boosts;

        /// <summary>
        /// Per-field boosts, completed with <see cref="DefaultBoost"/> for every field in
        /// <see cref="Fields"/> that has no boost yet.
        /// </summary>
        internal Dictionary<string, float> Boosts
        {
            get
            {
                // Fields has a public setter and may have been set to null.
                if (Fields != null)
                {
                    foreach (var field in Fields)
                    {
                        if (field != null && !_boosts.ContainsKey(field))
                        {
                            _boosts.Add(field, DefaultBoost);
                        }
                    }
                }

                return _boosts;
            }
        }

        /// <summary>
        /// Minimum score a hit must reach to be returned; a higher value gives fewer,
        /// more precise results. Defaults to 0.5.
        /// NOTE(review): the original comment gives the range as 0-1; confirm that the
        /// scores produced by the index actually fall into that range.
        /// </summary>
        public float Score { get; set; } = 0.5f;

        /// <summary>
        /// Optional filter applied to the search.
        /// </summary>
        public Filter Filter { get; set; }

        /// <summary>
        /// Creates an option that searches <paramref name="keyword"/> in <paramref name="fields"/>.
        /// </summary>
        /// <param name="keyword">Query text; must not be null, empty or whitespace.</param>
        /// <param name="fields">Fields to search.</param>
        /// <param name="maxHits">Maximum number of hits to retrieve.</param>
        /// <param name="boosts">
        /// Optional initial boosts per field. The entries are copied, so the dictionary
        /// passed in is never modified by this class.
        /// </param>
        /// <exception cref="ArgumentException">
        /// <paramref name="keyword"/> is null, empty or whitespace.
        /// </exception>
        public ScoredSearchOption(string keyword, List<string> fields, int maxHits = 100, Dictionary<string, float> boosts = null)
        {
            if (string.IsNullOrWhiteSpace(keyword))
            {
                throw new ArgumentException("搜索关键词不能为空");
            }

            Keyword = keyword;
            Fields = fields;
            MaxHits = maxHits;

            _boosts = new Dictionary<string, float>(StringComparer.OrdinalIgnoreCase);
            if (boosts != null)
            {
                // The indexer (rather than Add) tolerates keys that differ only in casing; the last one wins.
                foreach (var pair in boosts)
                {
                    _boosts[pair.Key] = pair.Value;
                }
            }
        }

        /// <summary>
        /// Sets (or replaces) the boost of a field.
        /// </summary>
        /// <param name="field">Name of the field.</param>
        /// <param name="boost">Boost to use for the field.</param>
        public void SetBoosts(string field, float boost)
        {
            _boosts[field] = boost;
        }
    }

输出项为:

    /// <summary>
    /// Result of a boosted (weighted) search.
    /// </summary>
    public class ScoredSearchResult : ISearchResult<SearchResultItem>
    {
        /// <summary>Matched items; a new result starts with an empty list.</summary>
        public List<SearchResultItem> Items { get; set; } = new List<SearchResultItem>();

        /// <summary>Time the search took.</summary>
        public long Elapsed { get; set; }

        /// <summary>Number of matches.</summary>
        public int TotalHits { get; set; }
    }


2.2.方法实现

        /// <summary>
        /// Boosted search: splits the keyword into terms, combines them into one query
        /// with per-field boosts and keeps only the hits that reach the requested score.
        /// </summary>
        /// <param name="option">Keyword, fields, boosts, filter, score threshold and hit limit.</param>
        /// <returns>
        /// The hits whose score is at least <c>option.Score</c>, their count and the
        /// elapsed time in milliseconds.
        /// </returns>
        public ScoredSearchResult ScoredSearch(ScoredSearchOption option)
        {
            ScoredSearchResult result = new ScoredSearchResult();
            Stopwatch watch = Stopwatch.StartNew(); // start timing

            using (DirectoryReader reader = DirectoryReader.Open(Directory))
            {
                IndexSearcher searcher = new IndexSearcher(reader);
                var queryParser = new MultiFieldQueryParser(LuceneVersion.LUCENE_48, option.Fields.ToArray(), Analyzer, option.Boosts);
                var terms = Cut(option.Keyword); // split the keyword into search terms
                Query query = QueryExpression(queryParser, terms); // combine the terms into one query
                Sort sort = new Sort(SortField.FIELD_SCORE); // sort by score
                var hits = searcher.Search(query, option.Filter, option.MaxHits, sort, true, true).ScoreDocs;

                // Filter in the same pass that builds the items: no expression tree has to be
                // compiled per call and the hit list is walked only once.
                foreach (var match in hits)
                {
                    if (match.Score >= option.Score)
                    {
                        var doc = searcher.Doc(match.Doc);
                        result.Items.Add(new SearchResultItem
                        {
                            Score = match.Score,
                            EntityId = doc.Get(CoreConstant.EntityId),
                            EntityName = doc.Get(CoreConstant.EntityType)
                        });
                    }
                }

                result.TotalHits = result.Items.Count;
            }

            watch.Stop(); // stop timing
            result.Elapsed = watch.ElapsedMilliseconds;
            return result;
        }
其中私有方法Cut用于关键词的分割:

        /// <summary>
        /// Splits a keyword into search terms: the keyword itself, quoted parts,
        /// excluded ("-") parts, runs of Chinese characters, English words and
        /// letter+number combinations.
        /// </summary>
        /// <param name="keyword">Raw keyword as entered by the user.</param>
        /// <returns>
        /// At most ten distinct terms of two or more characters, longest first. A keyword
        /// of at most two characters is returned as the only term; null yields an empty list.
        /// </returns>
        private List<string> Cut(string keyword)
        {
            if (keyword == null)
            {
                return new List<string>();
            }

            List<string> result = new List<string> { keyword }; // the full keyword is always the first candidate
            if (keyword.Length <= 2) // too short to be worth splitting
            {
                return result;
            }

            // Map the common operators to query syntax: AND -> '+', NOT -> '-', OR -> space.
            keyword = keyword.Replace("AND ", "+").Replace("NOT ", "-").Replace("OR ", " ");

            // Quoted parts (must match); each one is removed from the keyword once collected.
            var quoted = Regex.Matches(keyword, @""".+""");
            foreach (Match m in quoted)
            {
                keyword = keyword.Replace(m.Value, "");
                result.Add(m.Value);
            }

            // Parts introduced by " -" (must not match); removed from the keyword as well.
            var excluded = Regex.Matches(keyword, @"\s-.+\s?");
            foreach (Match m in excluded)
            {
                keyword = keyword.Replace(m.Value, "");
                result.Add(m.Value.Trim());
            }

            result.AddRange(Regex.Matches(keyword, @"[\u4e00-\u9fa5]+").Cast<Match>().Select(m => m.Value)); // Chinese
            result.AddRange(Regex.Matches(keyword, @"\p{P}?[A-Z]*[a-z]*[\p{P}|\p{S}]*").Cast<Match>().Select(m => m.Value)); // English words
            // Letters followed by digits. [A-Za-z] instead of [A-z]: the latter also matches
            // the six characters between 'Z' and 'a' ([ \ ] ^ _ and the backtick).
            result.AddRange(Regex.Matches(keyword, "([A-Za-z]+)([0-9.]+)").Cast<Match>().SelectMany(m => m.Groups.Cast<Group>().Select(g => g.Value)));
            //result.AddRange(new JiebaSegmenter().Cut(keyword, true)); // Jieba segmentation (disabled)
            result.RemoveAll(s => s.Length < 2);
            result = result.Distinct().OrderByDescending(s => s.Length).Take(10).ToList();

            return result;
        }
私有方法QueryExpression用于查询语句的拼接:

        /// <summary>
        /// Combines the terms into one boolean query: quoted terms must match, terms
        /// starting with '-' must not match, all other terms are optional fuzzy matches.
        /// </summary>
        /// <param name="queryParser">Parser used to turn each term into a query.</param>
        /// <param name="terms">Terms to combine.</param>
        /// <returns>The combined query.</returns>
        private BooleanQuery QueryExpression(MultiFieldQueryParser queryParser, List<string> terms)
        {
            BooleanQuery query = new BooleanQuery();
            foreach (var term in terms)
            {
                // Ordinal character checks; string.StartsWith(string) would compare culture-sensitively.
                char first = term.Length > 0 ? term[0] : '\0';
                if (first == '"')
                {
                    query.Add(queryParser.Parse(term.Trim('"')), Occur.MUST); // must match
                }
                else if (first == '-')
                {
                    // Parse the term WITHOUT its leading '-': parsing "-x" gives a purely negative
                    // query that matches nothing, so adding it as MUST_NOT would exclude nothing.
                    query.Add(queryParser.Parse(term.TrimStart('-')), Occur.MUST_NOT); // must not match
                }
                else
                {
                    query.Add(queryParser.Parse(term.Replace("~", "") + "~"), Occur.SHOULD); // may match (fuzzy)
                }
            }
            return query;
        }

3.测试示例

写一个示例方法对简单查询进行测试:

        /// <summary>
        /// Example: runs a plain index search and loads the entity behind every hit.
        /// </summary>
        /// <param name="option">Search option passed on to the search manager.</param>
        /// <returns>One entity per hit, in hit order.</returns>
        public List<DataContent> SingleSearch(SingleSearchOption option)
        {
            SingleSearchResult searchResult = _searchManager.SingleSearch(option);
            var hits = searchResult.Items;
            List<DataContent> entities = new List<DataContent>(hits.Count);

            for (int i = 0; i < hits.Count; i++)
            {
                // Look the entity up by the id stored with the hit.
                entities.Add(_repository.Get(hits[i].EntityId));
            }

            return entities;
        }
测试结果:


目前索引的查询和实体的查询并没有强关联,所以实际上是查询了两次:先查询索引得到实体Id,再根据Id逐条查询实体。后续会考虑根据业务需要将两者结合起来。



 推荐知识

 历史版本

修改日期 修改人 备注
2021-09-29 17:24:36[当前版本] 潘帅 1.0

 附件

附件类型

PNGPNG

预览图

知识分享平台 -V 4.8.7 -wcp