在了解了Lucene的工作原理和流程后,就可以更进一步对原有代码进行改进了。原有项目中使用的是默认的StandardAnalyzer,它只能将中文文本切分成单个字,对中文并不友好,所以本次将替换为自定义分词器。
根据Lucene的源码,只需要继承基类Analyzer即可实现自定义的分词器。此外,原有的SearchManager中已经预留了泛型接口,也可以很方便地接入。
/// <summary>
/// Lucene Analyzer backed by the jieba.NET Chinese word segmenter.
/// Produces JieBa tokens and lower-cases them (normalizes any Latin terms).
/// </summary>
public class JieBaAnalyzer : Analyzer
{
    private readonly TokenizerMode _mode;
    private readonly string _stopUrl;

    /// <summary>
    /// Creates the analyzer.
    /// </summary>
    /// <param name="mode">JieBa segmentation mode (e.g. Search vs. Default).</param>
    /// <param name="stopUrl">
    /// Relative path to the stop-word file; resolved against the application
    /// base directory inside <see cref="JieBaTokenizer"/>.
    /// </param>
    public JieBaAnalyzer(TokenizerMode mode, string stopUrl = "./Resources/stopwords.txt")
        : base()
    {
        _mode = mode;
        _stopUrl = stopUrl;
    }

    /// <summary>
    /// Builds the per-field tokenizer/filter chain: JieBa segmentation
    /// followed by a lower-case filter.
    /// </summary>
    protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
    {
        var tokenizer = new JieBaTokenizer(reader, _mode, _stopUrl);
        // Note: the tokenizer registers its own char-term/offset attributes in
        // its constructor, and LowerCaseFilter acquires the term attribute it
        // needs — the original explicit AddAttribute calls here were redundant,
        // as were the (TokenStream)/(Tokenizer) casts.
        TokenStream stream = new LowerCaseFilter(Lucene.Net.Util.LuceneVersion.LUCENE_48, tokenizer);
        return new TokenStreamComponents(tokenizer, stream);
    }
}
/// <summary>
/// Lucene Tokenizer that delegates segmentation to JiebaNet and removes
/// stop words loaded from a text file (one word per line).
/// </summary>
public class JieBaTokenizer : Tokenizer
{
    private readonly List<JiebaNet.Segmenter.Token> _wordList = new List<JiebaNet.Segmenter.Token>();
    private string _inputText;
    private ICharTermAttribute _termAtt;
    private IOffsetAttribute _offsetAtt;
    private IPositionIncrementAttribute _posIncrAtt;
    private ITypeAttribute _typeAtt;
    // Used as a set: the int value is never read, only key presence matters.
    private readonly Dictionary<string, int> _stopWords = new Dictionary<string, int>();
    private IEnumerator<JiebaNet.Segmenter.Token> _iter;
    private readonly JiebaSegmenter _segmenter;
    private readonly TokenizerMode _mode;

    /// <summary>
    /// Creates the tokenizer and eagerly loads the stop-word file.
    /// </summary>
    /// <param name="input">Source text reader (consumed in <see cref="Reset"/>).</param>
    /// <param name="mode">JieBa segmentation mode.</param>
    /// <param name="stopUrl">Stop-word file path, relative to the application base directory.</param>
    public JieBaTokenizer(TextReader input, TokenizerMode mode, string stopUrl = "./Resources/stopwords.txt")
        : base(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input)
    {
        _segmenter = new JiebaSegmenter();
        _mode = mode;
        LoadStopWords(stopUrl);
        Init();
    }

    /// <summary>
    /// Loads stop words, one per line; blank lines and duplicates are skipped.
    /// </summary>
    /// <param name="filePath">Path relative to the application base directory.</param>
    private void LoadStopWords(string filePath)
    {
        // Path.Combine is safer than the original string concatenation: it
        // also handles a rooted filePath correctly.
        using (StreamReader reader = File.OpenText(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, filePath)))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                if (string.IsNullOrEmpty(line))
                {
                    continue;
                }
                if (_stopWords.ContainsKey(line))
                {
                    continue;
                }
                _stopWords.Add(line, 1);
            }
        }
    }

    /// <summary>
    /// Registers the token attributes this stream populates.
    /// </summary>
    private void Init()
    {
        _termAtt = AddAttribute<ICharTermAttribute>();
        _offsetAtt = AddAttribute<IOffsetAttribute>();
        _posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
        _typeAtt = AddAttribute<ITypeAttribute>();
    }

    private string ReadToEnd(TextReader input)
    {
        return input.ReadToEnd();
    }

    /// <summary>
    /// Advances to the next JieBa token, copying term text, corrected
    /// offsets and type into the registered attributes.
    /// </summary>
    /// <returns>true while tokens remain; false when the stream is exhausted.</returns>
    public sealed override bool IncrementToken()
    {
        ClearAttributes();
        Lucene.Net.Analysis.Token token = Next();
        if (token != null)
        {
            _termAtt.SetEmpty().Append(token.ToString());
            _offsetAtt.SetOffset(CorrectOffset(token.StartOffset), CorrectOffset(token.EndOffset));
            _typeAtt.Type = token.Type;
            return true;
        }
        // BUG FIX: the original called End() and this.Dispose() here. Per the
        // Lucene TokenStream lifecycle the CONSUMER calls End()/Dispose() after
        // IncrementToken() returns false; self-disposing here closes the
        // underlying reader mid-workflow and breaks analyzer/stream reuse.
        return false;
    }

    /// <summary>
    /// Pulls the next segmented word from the filtered word list, converted
    /// to a Lucene token carrying its start/end offsets.
    /// </summary>
    public Lucene.Net.Analysis.Token Next()
    {
        if (!_iter.MoveNext())
        {
            return null;
        }
        JiebaNet.Segmenter.Token current = _iter.Current;
        if (current == null)
        {
            return null;
        }
        return new Lucene.Net.Analysis.Token(current.Word, current.StartIndex, current.EndIndex);
    }

    /// <summary>
    /// Reads the whole input, segments it with JieBa, drops stop words and
    /// positions the enumerator at the start of the resulting word list.
    /// </summary>
    public override void Reset()
    {
        base.Reset();
        _inputText = ReadToEnd(base.m_input);
        // Segment the full input with JieBa.
        IEnumerable<JiebaNet.Segmenter.Token> tokens = _segmenter.Tokenize(_inputText, _mode);
        _wordList.Clear(); // discard tokens from any previous reuse
        foreach (var token in tokens)
        {
            // Skip stop words.
            if (!_stopWords.ContainsKey(token.Word))
            {
                _wordList.Add(token);
            }
        }
        _iter = _wordList.GetEnumerator();
    }
}
JieBa分词提供了词库资源,能够更好地按照中文习惯分词。词库资源包位于NuGet包里的Resources文件夹。
包含了字典、专有名词、停用词等,也可以根据自己的需要添加自己想要的分词。
由于在自定义的JieBaTokenizer里包含了读取停用词词库的方法,所以需要将Resources文件夹复制到程序运行目录,以便于初始化时加载停用词,并从分词列表中移除停用词。
// Register the Lucene index directory from configuration.
services.AddSingleton<Lucene.Net.Store.Directory>(Lucene.Net.Store.FSDirectory.Open(configuration["Search:DefaultPath"]));
// Register the JieBa analyzer with the configured stop-word file.
services.AddSingleton<Lucene.Net.Analysis.Analyzer>(new JieBaAnalyzer(TokenizerMode.Search, configuration["Search:StopWords"]));
// Previous registration, kept for reference:
//services.AddSingleton<Lucene.Net.Analysis.Analyzer>(new StandardAnalyzer(LuceneVersion.LUCENE_48));
// NOTE: restored onto its own line — in the flattened original the comment
// above accidentally swallowed this registration.
services.AddTransient<ISearchManager, SearchManager>();
在.NET Core应用的Startup启动类中通过依赖注入的形式替换默认的Analyzer。
原有方法逻辑不受影响,替换完毕。