oqtane.framework/Oqtane.Server/Providers/DatabaseSearchProvider.cs
2024-07-06 07:58:04 -04:00

315 lines
12 KiB
C#

using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using HtmlAgilityPack;
using Oqtane.Models;
using Oqtane.Repository;
using Oqtane.Services;
using Oqtane.Shared;
namespace Oqtane.Providers
{
/// <summary>
/// Search provider that stores search content and its word index through an
/// <see cref="ISearchContentRepository"/> and answers queries from that store.
/// </summary>
public class DatabaseSearchProvider : ISearchProvider
{
    private readonly ISearchContentRepository _searchContentRepository;

    // Comma separated list of common words that are never written to the word index.
    private const string IgnoreWords = "the,be,to,of,and,a,i,in,that,have,it,for,not,on,with,he,as,you,do,at,this,but,his,by,from,they,we,say,her,she,or,an,will,my,one,all,would,there,their,what,so,up,out,if,about,who,get,which,go,me,when,make,can,like,time,no,just,him,know,take,people,into,year,your,good,some,could,them,see,other,than,then,now,look,only,come,its,over,think,also,back,after,use,two,how,our,work,first,well,way,even,new,want,because,any,these,give,day,most,us";

    // Words shorter than this are not indexed.
    private const int WordMinLength = 3;

    // Number of characters of context kept in front of the first keyword match in a snippet.
    private const int SnippetLeadLength = 20;

    // Parsed once so that indexing does not re-split and linearly scan the list for every word.
    private static readonly HashSet<string> IgnoreWordSet = new HashSet<string>(IgnoreWords.Split(','), StringComparer.Ordinal);

    // Characters that are treated as word separators when indexing.
    private static readonly char[] PunctuationCharacters = ".?!,;:-_()[]{}'\"/\\".ToCharArray();

    // Matches any run of whitespace (spaces, tabs, line breaks, non-breaking spaces, ...).
    private static readonly Regex WhitespaceRun = new Regex(@"\s+", RegexOptions.Compiled);

    /// <summary>The name under which this provider is registered.</summary>
    public string Name => Constants.DefaultSearchProviderName;

    /// <summary>Creates the provider on top of the given search content repository.</summary>
    /// <param name="searchContentRepository">Repository used for all reads and writes.</param>
    public DatabaseSearchProvider(ISearchContentRepository searchContentRepository)
    {
        _searchContentRepository = searchContentRepository;
    }

    /// <summary>No-op: this provider has nothing to flush.</summary>
    public void Commit()
    {
    }

    /// <summary>Deletes the search content identified by <paramref name="id"/>.</summary>
    /// <param name="id">Identifier passed through to the repository.</param>
    public void DeleteSearchContent(string id)
    {
        _searchContentRepository.DeleteSearchContent(id);
    }

    /// <summary>No-op: always reports success.</summary>
    /// <returns><c>true</c>.</returns>
    public bool Optimize()
    {
        return true;
    }

    /// <summary>Removes all search content from the repository.</summary>
    public void ResetIndex()
    {
        _searchContentRepository.DeleteAllSearchContent();
    }

    /// <summary>
    /// Replaces the stored search content of an entity. Any existing content for the same
    /// entity name / entity id is deleted first; unless the content is flagged as deleted
    /// it is then cleaned of markup, stored, and its words are indexed.
    /// </summary>
    /// <param name="searchContent">The content to store (or to remove when <c>IsDeleted</c> is set).</param>
    /// <param name="autoCommit">Not used by this provider.</param>
    public void SaveSearchContent(SearchContent searchContent, bool autoCommit = false)
    {
        //remove existing search content
        _searchContentRepository.DeleteSearchContent(searchContent.EntityName, searchContent.EntityId);

        if (searchContent.IsDeleted)
        {
            return;
        }

        //clean the search content to remove html tags
        CleanSearchContent(searchContent);
        _searchContentRepository.AddSearchContent(searchContent);

        //save the index words
        AnalyzeSearchContent(searchContent);
    }

    /// <summary>
    /// Runs a query against the repository, filters the hits through
    /// <paramref name="validateFunc"/>, sorts them, removes duplicates and returns the requested page.
    /// </summary>
    /// <param name="searchQuery">Keywords, sort options and paging information.</param>
    /// <param name="validateFunc">Predicate deciding whether a hit may be included in the results.</param>
    /// <returns>The requested page of results together with the total number of distinct results.</returns>
    public async Task<SearchResults> SearchAsync(SearchQuery searchQuery, Func<SearchContent, SearchQuery, bool> validateFunc)
    {
        var searchContentList = await _searchContentRepository.GetSearchContentsAsync(searchQuery);

        //convert the search content to search results.
        var results = searchContentList
            .Where(i => validateFunc(i, searchQuery))
            .Select(i => ConvertToSearchResult(i, searchQuery));

        results = SortResults(results, searchQuery);

        //remove duplicated results based on page id for Page and Module types
        //materialized once so validation, snippet building and scoring are not repeated
        //for the count and again for the page
        var distinctResults = results.DistinctBy(i =>
        {
            if (i.EntityName == EntityNames.Page || i.EntityName == EntityNames.Module)
            {
                var pageId = i.SearchContentProperties.FirstOrDefault(p => p.Name == Constants.SearchPageIdPropertyName)?.Value ?? string.Empty;
                return !string.IsNullOrEmpty(pageId) ? pageId : i.UniqueKey;
            }
            else
            {
                return i.UniqueKey;
            }
        }).ToList();

        return new SearchResults
        {
            Results = distinctResults.Skip(searchQuery.PageIndex * searchQuery.PageSize).Take(searchQuery.PageSize).ToList(),
            TotalResults = distinctResults.Count
        };
    }

    /// <summary>
    /// Orders results by the query's sort field and direction. Relevance and title sorts
    /// use the modification date (newest first) as tie breaker; any other sort field
    /// orders by modification date only.
    /// </summary>
    private static IEnumerable<SearchResult> SortResults(IEnumerable<SearchResult> results, SearchQuery searchQuery)
    {
        if (searchQuery.SortDirection == SearchSortDirections.Descending)
        {
            switch (searchQuery.SortField)
            {
                case SearchSortFields.Relevance:
                    return results.OrderByDescending(i => i.Score).ThenByDescending(i => i.ContentModifiedOn);
                case SearchSortFields.Title:
                    return results.OrderByDescending(i => i.Title).ThenByDescending(i => i.ContentModifiedOn);
                default:
                    return results.OrderByDescending(i => i.ContentModifiedOn);
            }
        }

        switch (searchQuery.SortField)
        {
            case SearchSortFields.Relevance:
                return results.OrderBy(i => i.Score).ThenByDescending(i => i.ContentModifiedOn);
            case SearchSortFields.Title:
                return results.OrderBy(i => i.Title).ThenByDescending(i => i.ContentModifiedOn);
            default:
                return results.OrderBy(i => i.ContentModifiedOn);
        }
    }

    /// <summary>Copies a stored search content item into a result and adds its snippet and score.</summary>
    private SearchResult ConvertToSearchResult(SearchContent searchContent, SearchQuery searchQuery)
    {
        var searchResult = new SearchResult()
        {
            SearchContentId = searchContent.SearchContentId,
            SiteId = searchContent.SiteId,
            EntityName = searchContent.EntityName,
            EntityId = searchContent.EntityId,
            Title = searchContent.Title,
            Description = searchContent.Description,
            Body = searchContent.Body,
            Url = searchContent.Url,
            Permissions = searchContent.Permissions,
            ContentModifiedBy = searchContent.ContentModifiedBy,
            ContentModifiedOn = searchContent.ContentModifiedOn,
            SearchContentProperties = searchContent.SearchContentProperties,
            Snippet = BuildSnippet(searchContent, searchQuery),
            Score = CalculateScore(searchContent, searchQuery)
        };

        return searchResult;
    }

    /// <summary>
    /// Scores an item as the summed occurrence count of every indexed word that starts
    /// with one of the query keywords, divided by 100.
    /// </summary>
    private float CalculateScore(SearchContent searchContent, SearchQuery searchQuery)
    {
        var score = 0f;
        foreach (var keyword in SearchUtils.GetKeywords(searchQuery.Keywords))
        {
            // a blank keyword is a prefix of every word and would inflate the score
            if (string.IsNullOrWhiteSpace(keyword))
            {
                continue;
            }

            // ordinal comparison: index words are plain tokens, not culture-sensitive text
            score += searchContent.SearchContentWords
                .Where(i => i.SearchWord.Word.StartsWith(keyword, StringComparison.OrdinalIgnoreCase))
                .Sum(i => i.Count);
        }

        return score / 100;
    }

    /// <summary>
    /// Builds the excerpt shown for a result: a window of at most <c>BodySnippetLength</c>
    /// characters of title, description and body around the first keyword found (or the
    /// start of the text when no keyword is found), with every keyword wrapped in
    /// <c>&lt;b&gt;</c> tags.
    /// </summary>
    private string BuildSnippet(SearchContent searchContent, SearchQuery searchQuery)
    {
        var content = $"{searchContent.Title} {searchContent.Description} {searchContent.Body}";
        var maxLength = Math.Max(0, searchQuery.BodySnippetLength);
        var keywords = SearchUtils.GetKeywords(searchQuery.Keywords)
            .Where(k => !string.IsNullOrWhiteSpace(k))
            .ToList();

        var snippet = string.Empty;
        foreach (var keyword in keywords)
        {
            var matchIndex = content.IndexOf(keyword, StringComparison.OrdinalIgnoreCase);
            if (matchIndex < 0)
            {
                continue;
            }

            var start = matchIndex - SnippetLeadLength;
            var prefix = "...";
            var suffix = "...";
            if (start <= 0)
            {
                start = 0;
                prefix = string.Empty;
            }

            var length = maxLength;
            if (start + length >= content.Length)
            {
                length = content.Length - start;
                suffix = string.Empty;
            }

            snippet = $"{prefix}{content.Substring(start, length)}{suffix}";
            break;
        }

        if (string.IsNullOrEmpty(snippet))
        {
            // the text can be shorter than the snippet length (e.g. the match was only in
            // the additional content), so the length has to be clamped to avoid an exception
            snippet = content.Substring(0, Math.Min(maxLength, content.Length));
        }

        if (keywords.Count > 0 && snippet.Length > 0)
        {
            // Keywords are escaped so regex metacharacters are matched literally, and all
            // keywords are highlighted in a single pass so that a later keyword can never
            // match inside the <b> tags inserted for an earlier one. Longer keywords come
            // first so they win over their own prefixes.
            var pattern = string.Join("|", keywords
                .OrderByDescending(k => k.Length)
                .Select(k => Regex.Escape(k)));
            snippet = Regex.Replace(snippet, $"({pattern})", "<b>$1</b>", RegexOptions.IgnoreCase);
        }

        return snippet;
    }

    /// <summary>
    /// Splits title, description, body and additional content into words and stores the
    /// occurrence count of each word for the given search content, creating search words
    /// that do not exist yet.
    /// </summary>
    private void AnalyzeSearchContent(SearchContent searchContent)
    {
        //analyze the search content and save the index words
        var indexContent = $"{searchContent.Title} {searchContent.Description} {searchContent.Body} {searchContent.AdditionalContent}";
        var words = GetWords(indexContent, WordMinLength);
        var existingSearchContentWords = _searchContentRepository.GetSearchContentWords(searchContent.SearchContentId);

        foreach (var kvp in words)
        {
            var searchContentWord = existingSearchContentWords.FirstOrDefault(i => i.SearchWord.Word == kvp.Key);
            if (searchContentWord != null)
            {
                searchContentWord.Count = kvp.Value;
                searchContentWord.ModifiedOn = DateTime.UtcNow;
                _searchContentRepository.UpdateSearchContentWord(searchContentWord);
            }
            else
            {
                var searchWord = _searchContentRepository.GetSearchWord(kvp.Key);
                if (searchWord == null)
                {
                    searchWord = _searchContentRepository.AddSearchWord(new SearchWord { Word = kvp.Key, CreatedOn = DateTime.UtcNow });
                }

                var now = DateTime.UtcNow;
                searchContentWord = new SearchContentWord
                {
                    SearchContentId = searchContent.SearchContentId,
                    SearchWordId = searchWord.SearchWordId,
                    Count = kvp.Value,
                    CreatedOn = now,
                    ModifiedOn = now
                };
                _searchContentRepository.AddSearchContentWord(searchContentWord);
            }
        }
    }

    /// <summary>
    /// Counts the occurrences of every indexable word in <paramref name="content"/>.
    /// </summary>
    /// <param name="content">Raw text to tokenize.</param>
    /// <param name="minLength">Minimum number of characters a word needs in order to be counted.</param>
    /// <returns>Lower-cased words mapped to their number of occurrences; ignore words are excluded.</returns>
    private static Dictionary<string, int> GetWords(string content, int minLength)
    {
        var words = new Dictionary<string, int>();

        var text = FormatText(content);
        if (string.IsNullOrEmpty(text))
        {
            return words;
        }

        foreach (var word in text.Split(' ', StringSplitOptions.RemoveEmptyEntries))
        {
            if (word.Length < minLength || IgnoreWordSet.Contains(word))
            {
                continue;
            }

            // a missing key leaves count at 0, so this covers both the first and later occurrences
            words.TryGetValue(word, out var count);
            words[word] = count + 1;
        }

        return words;
    }

    /// <summary>
    /// Normalizes text for tokenizing: HTML entities are decoded, punctuation is replaced
    /// by spaces, every run of whitespace is collapsed to a single space and the result
    /// is lower-cased and trimmed.
    /// </summary>
    private static string FormatText(string text)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return string.Empty;
        }

        text = HtmlEntity.DeEntitize(text);
        foreach (var punctuation in PunctuationCharacters)
        {
            text = text.Replace(punctuation, ' ');
        }

        // Line breaks, tabs and non-breaking spaces separate words just like spaces do;
        // collapsing them here keeps such words from being indexed as a single token.
        text = WhitespaceRun.Replace(text, " ");

        // invariant casing so the stored index does not depend on the current thread culture
        return text.ToLowerInvariant().Trim();
    }

    /// <summary>Strips markup from title, description, body and additional content in place.</summary>
    private void CleanSearchContent(SearchContent searchContent)
    {
        searchContent.Title = GetCleanContent(searchContent.Title);
        searchContent.Description = GetCleanContent(searchContent.Description);
        searchContent.Body = GetCleanContent(searchContent.Body);
        searchContent.AdditionalContent = GetCleanContent(searchContent.AdditionalContent);
    }

    /// <summary>
    /// Returns the visible text of an HTML fragment: the content is HTML-decoded, parsed,
    /// and all non-blank text nodes outside of script and style elements are joined with spaces.
    /// </summary>
    /// <param name="content">HTML (possibly encoded) or plain text; may be null or blank.</param>
    /// <returns>The extracted text, or an empty string for null or blank input.</returns>
    private string GetCleanContent(string content)
    {
        if (string.IsNullOrWhiteSpace(content))
        {
            return string.Empty;
        }

        content = WebUtility.HtmlDecode(content);

        var page = new HtmlDocument();
        page.LoadHtml(content);

        var phrases = page.DocumentNode.Descendants().Where(i =>
            i.NodeType == HtmlNodeType.Text &&
            i.ParentNode.Name != "script" &&
            i.ParentNode.Name != "style" &&
            !string.IsNullOrEmpty(i.InnerText.Trim())
        ).Select(i => i.InnerText);

        return string.Join(" ", phrases);
    }
}
}