经过上次的改造,可以实现对实体对象中的特定成员创建索引,但从实际的应用上来看,需要检索的数据内容格式多种多样,可能会有HTML、XML等。另外一些自定义的数据结构可能是以JSON等特殊规则形式存储的,对于这些情况就需要单独对数据进行分割处理。
索引文件的结构是:
-- 索引(Index)
-- 段(Segment)
-- 文档(Document)
-- 域(Field)
-- 词(Term)
new StringField("sField", StringField, Field.Store.YES);
当为数据实体创建索引时,我们不仅要了解索引文件的结构,也要了解索引与数据库实体的关系,这样才能更好地利用索引进行查询检索。
上图描述了数据库表与索引文件的大致对应关系,索引Index对应数据库表Table,文档Document对应数据库表中的一条记录Row,域Field对应这条记录中的每个列字段Column。而词Term则是将列字段值分词后的结果。
举个例子:
分词只是示例,实际上可能不是这么分的。
/// <summary>
/// Describes the format of a property's value so that the indexer can decide
/// how to convert it into an index field.
/// </summary>
public enum FieldDataType
{
    /// <summary>HTML text.</summary>
    Html = 0,

    /// <summary>JSON text.</summary>
    Json = 1,

    /// <summary>Plain text.</summary>
    Text = 2,

    /// <summary>XML text.</summary>
    Xml = 3,

    /// <summary>CSV text.</summary>
    Csv = 4,

    /// <summary>Year, in <c>yyyy</c> form.</summary>
    DateYear = 5,

    /// <summary>Date and time.</summary>
    DateTime = 6,

    /// <summary>32-bit integer number.</summary>
    Int32 = 7,

    /// <summary>64-bit integer number.</summary>
    Int64 = 8,

    /// <summary>Floating-point number.</summary>
    Double = 9
}
/// <summary>
/// Marks a property as indexable and describes how its value should be indexed.
/// A freshly constructed attribute stores the value (<c>Field.Store.YES</c>)
/// and treats it as plain text (<see cref="FieldDataType.Text"/>).
/// </summary>
[AttributeUsage(AttributeTargets.Property)]
public class IndexAttribute : Attribute
{
    /// <summary>
    /// Name for the index field. Null unless set explicitly.
    /// </summary>
    public string FieldName { get; set; }

    /// <summary>
    /// Whether the field value is stored in the index.
    /// </summary>
    public Field.Store IsStore { get; set; } = Field.Store.YES;

    /// <summary>
    /// Format of the property's data.
    /// </summary>
    public FieldDataType FieldType { get; set; } = FieldDataType.Text;
}
/// <summary>
/// Format used when a <see cref="DateTime"/> value is written to the index.
/// </summary>
private const string IndexDateTimeFormat = "yyyy-MM-dd HH:mm:ss";

/// <summary>
/// Builds one document from the properties of <paramref name="entity"/>, adds it
/// to the index and commits it.
/// </summary>
/// <param name="entity">Entity to index.</param>
/// <param name="isFiltered">
/// When <c>true</c>, only properties decorated with <see cref="IndexAttribute"/>
/// are indexed, using the attribute's field name, data type and store setting.
/// When <c>false</c>, every readable property is indexed and stored, with the
/// field type chosen from the runtime type of the value.
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="entity"/> is null.</exception>
public virtual void CreateIndexByEntity(IEntity<string> entity, bool isFiltered = true)
{
    if (entity == null)
    {
        throw new ArgumentNullException(nameof(entity));
    }

    var config = new IndexWriterConfig(LuceneVersion.LUCENE_48, Analyzer);
    using (IndexWriter writer = new IndexWriter(Directory, config))
    {
        Document doc = new Document();
        var type = entity.GetType();

        // Record the entity's type name in its own field so the document can be
        // identified by entity type when searching later.
        doc.Add(new StringField(CoreConstant.EntityType, type.AssemblyQualifiedName, Field.Store.YES));

        foreach (var propertyInfo in type.GetProperties())
        {
            // Indexers and write-only properties cannot be read with GetValue(entity);
            // skip them instead of failing the whole document.
            if (!propertyInfo.CanRead || propertyInfo.GetIndexParameters().Length > 0)
            {
                continue;
            }

            var propertyValue = propertyInfo.GetValue(entity);
            if (propertyValue == null)
            {
                continue;
            }

            string fieldName = propertyInfo.Name;
            if (isFiltered)
            {
                foreach (var attribute in propertyInfo.GetCustomAttributes<IndexAttribute>())
                {
                    // An explicit FieldName on the attribute wins over the property name.
                    string name = string.IsNullOrEmpty(attribute.FieldName) ? fieldName : attribute.FieldName;
                    doc.Add(CreateAttributedField(name, propertyValue, attribute));
                }
            }
            else
            {
                doc.Add(CreateDefaultField(fieldName, propertyValue));
            }
        }

        writer.AddDocument(doc);
        // Flush and commit, as the original implementation did.
        writer.Flush(true, true);
        writer.Commit();
    }
}

/// <summary>
/// Creates the field for a property decorated with <see cref="IndexAttribute"/>,
/// choosing the field class and any clean-up step from the attribute's data type.
/// </summary>
private static Field CreateAttributedField(string name, object value, IndexAttribute attribute)
{
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    switch (attribute.FieldType)
    {
        case FieldDataType.DateTime:
            // Invariant culture keeps the indexed text identical on every machine
            // (":" and the calendar are culture-sensitive otherwise).
            return new StringField(name, ((DateTime)value).ToString(IndexDateTimeFormat, invariant), attribute.IsStore);
        case FieldDataType.DateYear:
            // Index the four-digit year; values that are not DateTime (for example a
            // year already held as a number or string) are indexed as their text.
            string year = value is DateTime date
                ? date.ToString("yyyy", invariant)
                : Convert.ToString(value, invariant);
            return new StringField(name, year, attribute.IsStore);
        case FieldDataType.Int32:
            // Convert accepts any convertible numeric type, not only a boxed Int32.
            return new Int32Field(name, Convert.ToInt32(value, invariant), attribute.IsStore);
        case FieldDataType.Int64:
            return new Int64Field(name, Convert.ToInt64(value, invariant), attribute.IsStore);
        case FieldDataType.Double:
            return new DoubleField(name, Convert.ToDouble(value, invariant), attribute.IsStore);
        case FieldDataType.Html:
            return new TextField(name, value.ToString().ClearHtml(), attribute.IsStore);
        case FieldDataType.Json:
            return new TextField(name, value.ToString().ClearJson(), attribute.IsStore);
        case FieldDataType.Xml:
            return new TextField(name, value.ToString().ClearXml(), attribute.IsStore);
        case FieldDataType.Csv:
            return new TextField(name, value.ToString().ClearCsv(), attribute.IsStore);
        default:
            return new TextField(name, value.ToString(), attribute.IsStore);
    }
}

/// <summary>
/// Creates a stored field for an undecorated property, choosing the field class
/// from the runtime type of the value.
/// </summary>
private static Field CreateDefaultField(string name, object value)
{
    switch (value)
    {
        case DateTime time:
            return new StringField(
                name,
                time.ToString(IndexDateTimeFormat, System.Globalization.CultureInfo.InvariantCulture),
                Field.Store.YES);
        case int number:
            return new Int32Field(name, number, Field.Store.YES);
        case long number:
            return new Int64Field(name, number, Field.Store.YES);
        case double number:
            return new DoubleField(name, number, Field.Store.YES);
        default:
            return new TextField(name, value.ToString(), Field.Store.YES);
    }
}
/// <summary>
/// Removes every occurrence of the given characters from a string.
/// </summary>
/// <param name="source">Text to filter; null or empty yields an empty string.</param>
/// <param name="chars">Characters to remove.</param>
/// <returns>
/// <paramref name="source"/> without any character contained in
/// <paramref name="chars"/>, or <see cref="string.Empty"/> when
/// <paramref name="source"/> is null or empty.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="chars"/> is null and <paramref name="source"/> is not empty.
/// </exception>
internal static string ClearChar(this string source, IEnumerable<char> chars)
{
    if (string.IsNullOrEmpty(source))
    {
        return string.Empty;
    }

    // Materialize once: a lazy sequence is not re-enumerated for every character,
    // and each membership test becomes a hash lookup instead of a linear scan.
    var unwanted = new HashSet<char>(chars);
    if (unwanted.Count == 0)
    {
        return source;
    }

    return new string(source.Where(c => !unwanted.Contains(c)).ToArray());
}

/// <summary>
/// Removes HTML tags and character references (such as <c>&amp;nbsp;</c> or
/// <c>&amp;#169;</c>) from a string.
/// </summary>
/// <param name="source">HTML text; null or empty yields an empty string.</param>
/// <returns>The text with tags and character references removed.</returns>
internal static string ClearHtml(this string source)
{
    if (string.IsNullOrEmpty(source))
    {
        return string.Empty;
    }

    string withoutTags = Regex.Replace(source, "<[^>]+>", "");

    // Match only well-formed references (named, decimal or hexadecimal). A looser
    // "&[^;]+;" would also delete ordinary text between a bare '&' and a later ';'.
    return Regex.Replace(withoutTags, "&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);", "");
}

/// <summary>
/// Removes XML tags from a string, keeping the text between them.
/// </summary>
/// <param name="source">XML text; null or empty yields an empty string.</param>
/// <returns>The text with tags removed.</returns>
internal static string ClearXml(this string source)
{
    if (string.IsNullOrEmpty(source))
    {
        return string.Empty;
    }

    return Regex.Replace(source, "<[^>]+>", "");
}