目前的查询方法过于简单,而且无法与实际业务中的实体建立关系,因此本篇文章就来描述对查询方法的扩展。
查询输入项SingleSearchOption:
/// <summary>
/// Input for a simple keyword search: the keyword, the fields to search in and,
/// through <see cref="SearchOptionBase"/>, the hit limit.
/// </summary>
public class SingleSearchOption : SearchOptionBase
{
    /// <summary>
    /// Creates an empty option; every property keeps its default value.
    /// </summary>
    public SingleSearchOption()
    {
    }

    /// <summary>
    /// Creates an option for the given keyword and fields.
    /// </summary>
    /// <param name="keyword">Search keyword; must not be null, empty or whitespace.</param>
    /// <param name="fields">Names of the fields the search is restricted to.</param>
    /// <param name="maxHits">Maximum number of hits to retrieve (100 by default).</param>
    /// <exception cref="ArgumentException">The keyword is null, empty or whitespace.</exception>
    public SingleSearchOption(string keyword, List<string> fields, int maxHits = 100)
    {
        bool keywordMissing = string.IsNullOrWhiteSpace(keyword);
        if (keywordMissing)
        {
            throw new ArgumentException("搜索关键词不能为空");
        }

        MaxHits = maxHits;
        Fields = fields;
        Keyword = keyword;
    }

    /// <summary>
    /// Search keyword.
    /// </summary>
    public string Keyword { get; set; }

    /// <summary>
    /// Fields the search is restricted to.
    /// </summary>
    public List<string> Fields { get; set; }
}
其中SearchOptionBase:
/// <summary>
/// Base class for search options; holds the hit limit.
/// </summary>
public class SearchOptionBase : ISearchOption
{
    /// <summary>
    /// Maximum number of hits to retrieve.
    /// </summary>
    public int MaxHits { get; set; }
}
输出结果SingleSearchResult:
/// <summary>
/// Result of a simple search: the matched items plus timing and count information.
/// </summary>
public class SingleSearchResult : ISearchResult<SearchResultItem>
{
    /// <summary>
    /// Matched items; starts out as an empty list.
    /// </summary>
    public List<SearchResultItem> Items { get; set; } = new List<SearchResultItem>();

    /// <summary>
    /// Time the search took.
    /// </summary>
    public long Elapsed { get; set; }

    /// <summary>
    /// Number of matched results.
    /// </summary>
    public int TotalHits { get; set; }
}
其中查询结果项SearchResultItem:
/// <summary>
/// One search hit: its score and the identity of the entity it refers to.
/// </summary>
public class SearchResultItem : ISearchResultItem
{
    /// <summary>
    /// Score of this hit.
    /// </summary>
    public float Score { get; set; }

    /// <summary>
    /// Id of the entity.
    /// </summary>
    public string EntityId { get; set; }

    /// <summary>
    /// Class name of the entity.
    /// </summary>
    public string EntityName { get; set; }
}
/// <summary>
/// Simple search: parses the keyword against the requested fields and maps every
/// hit to a <see cref="SearchResultItem"/>.
/// </summary>
/// <param name="option">Keyword, fields to search in and maximum number of hits.</param>
/// <returns>
/// The hits with their scores, the number of hits returned (at most
/// <c>option.MaxHits</c>) and the elapsed time in milliseconds.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="option"/> is null.</exception>
public SingleSearchResult SingleSearch(SingleSearchOption option)
{
    if (option == null)
    {
        throw new ArgumentNullException(nameof(option));
    }

    SingleSearchResult result = new SingleSearchResult();
    Stopwatch watch = Stopwatch.StartNew();
    using (DirectoryReader reader = DirectoryReader.Open(Directory))
    {
        // Create the index searcher over the freshly opened reader.
        IndexSearcher searcher = new IndexSearcher(reader);
        var queryParser = new MultiFieldQueryParser(LuceneVersion.LUCENE_48, option.Fields.ToArray(), Analyzer);
        Query query = queryParser.Parse(option.Keyword);
        var matches = searcher.Search(query, option.MaxHits).ScoreDocs;

        // ScoreDocs is an array, so Length is enough; no LINQ Count() needed.
        result.TotalHits = matches.Length;
        foreach (var match in matches)
        {
            var doc = searcher.Doc(match.Doc);

            // Document.Get yields null for a missing stored field instead of
            // throwing, and matches how ScoredSearch reads the same fields.
            SearchResultItem item = new SearchResultItem
            {
                Score = match.Score,
                EntityId = doc.Get(CoreConstant.EntityId),
                EntityName = doc.Get(CoreConstant.EntityType)
            };
            result.Items.Add(item);
        }
    }

    watch.Stop();
    result.Elapsed = watch.ElapsedMilliseconds;
    return result;
}
其中实体标识EntityId、实体类名EntityName这两个域是在创建索引时添加进去的,这样确保每个Document和数据库的每条记录都能通过Id被互相找到。
带权重查询(ScoredSearch)的输入项ScoredSearchOption为:
/// <summary>
/// Input for a weighted search: keyword, fields, per-field boosts, a minimum
/// score and an optional filter.
/// </summary>
public class ScoredSearchOption : SearchOptionBase
{
    /// <summary>
    /// Boost given to a field that has no explicit boost.
    /// </summary>
    private const float DefaultBoost = 2.0f;

    /// <summary>
    /// Search keyword.
    /// </summary>
    public string Keyword { get; set; }

    /// <summary>
    /// Fields the search is restricted to.
    /// </summary>
    public List<string> Fields { get; set; }

    /// <summary>
    /// Per-field search weights used when searching multiple fields.
    /// </summary>
    private readonly Dictionary<string, float> _boosts;

    /// <summary>
    /// Per-field search weights used when searching multiple fields. Every field in
    /// <see cref="Fields"/> that has no boost yet (compared case-insensitively) is
    /// given the default boost before the dictionary is returned.
    /// </summary>
    internal Dictionary<string, float> Boosts
    {
        get
        {
            // Fields has a public setter and may be null; nothing to fill in then.
            if (Fields == null)
            {
                return _boosts;
            }

            foreach (var field in Fields)
            {
                if (field == null)
                {
                    continue;
                }

                // Ordinal, case-insensitive comparison instead of ToUpper(), whose
                // result depends on the current culture.
                bool alreadyBoosted = _boosts.Keys.Any(
                    key => string.Equals(key, field, StringComparison.OrdinalIgnoreCase));
                if (!alreadyBoosted)
                {
                    _boosts.Add(field, DefaultBoost);
                }
            }

            return _boosts;
        }
    }

    /// <summary>
    /// Match threshold, described by the author as 0-1: the larger the value, the
    /// more precise the results. Defaults to 0.5.
    /// </summary>
    public float Score { get; set; } = 0.5f;

    /// <summary>
    /// Filter condition.
    /// </summary>
    public Filter Filter { get; set; }

    /// <summary>
    /// Creates an option for the given keyword and fields.
    /// </summary>
    /// <param name="keyword">Search keyword; must not be null, empty or whitespace.</param>
    /// <param name="fields">Names of the fields the search is restricted to.</param>
    /// <param name="maxHits">Maximum number of hits to retrieve (100 by default).</param>
    /// <param name="boosts">Optional per-field boosts; an empty dictionary is used when null.</param>
    /// <exception cref="ArgumentException">The keyword is null, empty or whitespace.</exception>
    public ScoredSearchOption(string keyword, List<string> fields, int maxHits = 100, Dictionary<string, float> boosts = null)
    {
        if (string.IsNullOrWhiteSpace(keyword))
        {
            throw new ArgumentException("搜索关键词不能为空");
        }

        Keyword = keyword;
        Fields = fields;
        MaxHits = maxHits;
        _boosts = boosts ?? new Dictionary<string, float>();
    }

    /// <summary>
    /// Sets (or overwrites) the boost of one field.
    /// </summary>
    /// <param name="field">Field name.</param>
    /// <param name="boost">Boost to apply to the field.</param>
    public void SetBoosts(string field, float boost)
    {
        _boosts[field] = boost;
    }
}
带权重查询的输出项ScoredSearchResult为:
/// <summary>
/// Result of a weighted search: the matched items plus timing and count information.
/// </summary>
public class ScoredSearchResult : ISearchResult<SearchResultItem>
{
    /// <summary>
    /// Matched items; starts out as an empty list.
    /// </summary>
    public List<SearchResultItem> Items { get; set; } = new List<SearchResultItem>();

    /// <summary>
    /// Time the search took.
    /// </summary>
    public long Elapsed { get; set; }

    /// <summary>
    /// Number of matched results.
    /// </summary>
    public int TotalHits { get; set; }
}
/// <summary>
/// Weighted search: splits the keyword into terms, builds a boolean query with the
/// per-field boosts, and keeps only the hits whose score reaches
/// <c>option.Score</c>.
/// </summary>
/// <param name="option">Keyword, fields, boosts, minimum score, filter and hit limit.</param>
/// <returns>
/// The retained hits, their count and the elapsed time in milliseconds.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="option"/> is null.</exception>
public ScoredSearchResult ScoredSearch(ScoredSearchOption option)
{
    if (option == null)
    {
        throw new ArgumentNullException(nameof(option));
    }

    ScoredSearchResult result = new ScoredSearchResult();
    Stopwatch watch = Stopwatch.StartNew(); // start the timer
    using (DirectoryReader reader = DirectoryReader.Open(Directory))
    {
        IndexSearcher searcher = new IndexSearcher(reader);
        var queryParser = new MultiFieldQueryParser(LuceneVersion.LUCENE_48, option.Fields.ToArray(), Analyzer, option.Boosts);
        var terms = Cut(option.Keyword); // split the keyword into terms
        Query query = QueryExpression(queryParser, terms); // assemble the boolean query
        Sort sort = new Sort(SortField.FIELD_SCORE); // sort by score by default

        var hits = searcher.Search(query, option.Filter, option.MaxHits, sort, true, true).ScoreDocs;

        // Single pass with a plain comparison: no expression tree is compiled per
        // call and the hit list is not enumerated a second time just to count it.
        foreach (var hit in hits)
        {
            if (hit.Score >= option.Score)
            {
                var doc = searcher.Doc(hit.Doc);
                SearchResultItem item = new SearchResultItem
                {
                    Score = hit.Score,
                    EntityId = doc.Get(CoreConstant.EntityId),
                    EntityName = doc.Get(CoreConstant.EntityType)
                };
                result.Items.Add(item);
            }
        }

        // Every retained hit produced exactly one item.
        result.TotalHits = result.Items.Count;
    }

    watch.Stop(); // stop the timer
    result.Elapsed = watch.ElapsedMilliseconds;
    return result;
}
其中私有方法Cut用于关键词的分割:
/// <summary>
/// Splits a keyword into search terms: the whole keyword, quoted phrases,
/// exclusions (prefixed with '-'), runs of Chinese characters, English words and
/// letter+digit combinations.
/// </summary>
/// <param name="keyword">Raw keyword entered by the user.</param>
/// <returns>
/// Up to 10 distinct terms of at least 2 characters, longest first. A keyword of
/// 2 characters or fewer is returned as the only term; a null keyword yields an
/// empty list.
/// </returns>
private List<string> Cut(string keyword)
{
    if (keyword == null)
    {
        return new List<string>();
    }

    // The whole keyword is always the first candidate term.
    List<string> result = new List<string> { keyword };

    // Too short to be worth splitting: return it as is.
    if (keyword.Length <= 2)
    {
        return result;
    }

    // Common query operators: "AND " becomes '+', "NOT " becomes '-', "OR " becomes a space.
    keyword = keyword.Replace("AND ", "+").Replace("NOT ", "-").Replace("OR ", " ");

    // Quoted phrases (must match); each one is removed from the keyword once collected.
    foreach (Match phrase in Regex.Matches(keyword, @""".+"""))
    {
        keyword = keyword.Replace(phrase.Value, "");
        result.Add(phrase.Value);
    }

    // Exclusions (must not match); also removed from the keyword once collected.
    foreach (Match exclusion in Regex.Matches(keyword, @"\s-.+\s?"))
    {
        keyword = keyword.Replace(exclusion.Value, "");
        result.Add(exclusion.Value.Trim());
    }

    // Runs of Chinese characters.
    result.AddRange(Regex.Matches(keyword, @"[\u4e00-\u9fa5]+").Cast<Match>().Select(m => m.Value));

    // English words, with optional surrounding punctuation/symbols. This pattern can
    // match the empty string; those matches are dropped by the length filter below.
    result.AddRange(Regex.Matches(keyword, @"\p{P}?[A-Z]*[a-z]*[\p{P}|\p{S}]*").Cast<Match>().Select(m => m.Value));

    // Letters followed by digits: the whole match plus both parts. [A-Za-z] is used
    // instead of [A-z], which would also match the characters [ \ ] ^ _ and `.
    result.AddRange(Regex.Matches(keyword, "([A-Za-z]+)([0-9.]+)").Cast<Match>().SelectMany(m => m.Groups.Cast<Group>().Select(g => g.Value)));

    // NOTE: the original contained a commented-out Jieba segmenter pass
    // (new JiebaSegmenter().Cut(keyword, true)); it remains disabled.

    result.RemoveAll(s => s.Length < 2);
    result = result.Distinct().OrderByDescending(s => s.Length).Take(10).ToList();
    return result;
}
私有方法QueryExpression用于查询语句的拼接:
/// <summary>
/// Combines the given terms into one boolean query: quoted terms must match, terms
/// starting with '-' must not match, and all other terms are optional fuzzy matches.
/// </summary>
/// <param name="queryParser">Parser used to turn each term into a query.</param>
/// <param name="terms">Terms to combine.</param>
/// <returns>The combined boolean query.</returns>
private BooleanQuery QueryExpression(MultiFieldQueryParser queryParser, List<string> terms)
{
    BooleanQuery query = new BooleanQuery();
    foreach (var term in terms)
    {
        if (string.IsNullOrWhiteSpace(term))
        {
            continue;
        }

        if (term.StartsWith("\"", StringComparison.Ordinal))
        {
            // Must match: parse the phrase without its surrounding quotes.
            string required = term.Trim('"');
            if (!string.IsNullOrWhiteSpace(required))
            {
                query.Add(queryParser.Parse(required), Occur.MUST);
            }
        }
        else if (term.StartsWith("-", StringComparison.Ordinal))
        {
            // Must not match: strip the leading '-' before parsing. Parsing "-x"
            // yields a purely negative query that matches no document, so adding it
            // as MUST_NOT would exclude nothing.
            string excluded = term.TrimStart('-');
            if (!string.IsNullOrWhiteSpace(excluded))
            {
                query.Add(queryParser.Parse(excluded), Occur.MUST_NOT);
            }
        }
        else
        {
            // May match: make sure the term ends with exactly one fuzzy operator.
            query.Add(queryParser.Parse(term.Replace("~", "") + "~"), Occur.SHOULD);
        }
    }

    return query;
}
写一个示例方法对简单查询进行测试:
/// <summary>
/// Example: runs a simple index search and loads the entity behind each hit.
/// </summary>
/// <param name="option">Options passed on to the search manager.</param>
/// <returns>One entity per hit, in hit order, as returned by the repository.</returns>
public List<DataContent> SingleSearch(SingleSearchOption option)
{
    SingleSearchResult hits = _searchManager.SingleSearch(option);

    // Look up each hit's entity by its id.
    List<DataContent> entities = hits.Items.ConvertAll<DataContent>(hit => _repository.Get(hit.EntityId));
    return entities;
}
测试结果:
目前索引的查询和实体的查询并没有强关联,所以实际上是查询了两次,后续会考虑根据业务需要将两者结合起来。