search optimizations

This commit is contained in:
sbwalker
2024-07-17 13:57:47 -04:00
parent ada8809ec0
commit 71e472f330
9 changed files with 52 additions and 86 deletions

View File

@@ -129,6 +129,36 @@ namespace Oqtane.Providers
return Task.CompletedTask;
}
/// <summary>
/// Runs each text field of the search content (Title, Description, Body and
/// AdditionalContent) through <c>GetCleanContent</c> and writes the result back
/// onto the same object.
/// </summary>
/// <param name="searchContent">The search content whose text fields are cleaned in place.</param>
private void CleanSearchContent(SearchContent searchContent)
{
    // Fields are processed one at a time, in the same order, so each one is
    // written back before the next one is read.
    var cleanedTitle = GetCleanContent(searchContent.Title);
    searchContent.Title = cleanedTitle;

    var cleanedDescription = GetCleanContent(searchContent.Description);
    searchContent.Description = cleanedDescription;

    var cleanedBody = GetCleanContent(searchContent.Body);
    searchContent.Body = cleanedBody;

    var cleanedAdditionalContent = GetCleanContent(searchContent.AdditionalContent);
    searchContent.AdditionalContent = cleanedAdditionalContent;
}
/// <summary>
/// Reduces a piece of (possibly HTML-encoded) markup to its text.
/// </summary>
/// <param name="content">The raw content; may be null, empty or whitespace.</param>
/// <returns>
/// An empty string when <paramref name="content"/> is null or whitespace; otherwise the
/// inner text of every text node that is not a direct child of a <c>script</c> or
/// <c>style</c> element, joined with single spaces.
/// </returns>
private string GetCleanContent(string content)
{
    if (string.IsNullOrWhiteSpace(content))
    {
        return string.Empty;
    }

    // Decode first, then parse the decoded markup.
    var document = new HtmlDocument();
    document.LoadHtml(WebUtility.HtmlDecode(content));

    var fragments = new List<string>();
    foreach (var node in document.DocumentNode.Descendants())
    {
        if (node.NodeType != HtmlNodeType.Text)
        {
            continue;
        }

        var parentName = node.ParentNode.Name;
        if (parentName == "script" || parentName == "style")
        {
            continue;
        }

        // Whitespace-only nodes are dropped, but kept fragments are added untrimmed.
        var text = node.InnerText;
        if (!string.IsNullOrEmpty(text.Trim()))
        {
            fragments.Add(text);
        }
    }

    return string.Join(" ", fragments);
}
private void AnalyzeSearchContent(SearchContent searchContent, Dictionary<string, string> siteSettings)
{
var ignoreWords = IgnoreWords.Split(',');
@@ -180,14 +210,15 @@ namespace Oqtane.Providers
private static Dictionary<string, int> GetWords(string content, string[] ignoreWords, int minimumWordLength)
{
content = FormatText(content);
content = FormatContent(content);
var words = new Dictionary<string, int>();
if (!string.IsNullOrEmpty(content))
{
foreach (var word in content.Split(' '))
foreach (var term in content.Split(' '))
{
var word = term.ToLower().Trim();
if (word.Length >= minimumWordLength && !ignoreWords.Contains(word))
{
if (!words.ContainsKey(word))
@@ -205,48 +236,16 @@ namespace Oqtane.Providers
return words;
}
// Prepares text for word splitting: passes it through HtmlEntity.DeEntitize and
// replaces each listed punctuation character with a space.
// NOTE(review): this span comes from a rendered diff with the +/- markers stripped, so
// two versions of the method appear interleaved (two signatures, two foreach headers).
// Presumably FormatText is the old name and FormatContent the new one — confirm
// against the committed file.
private static string FormatText(string text)
private static string FormatContent(string text)
{
text = HtmlEntity.DeEntitize(text);
// This punctuation list includes '-'.
foreach (var punctuation in ".?!,;:-_()[]{}'\"/\\".ToCharArray())
// This punctuation list omits '-', so hyphens stay in the text.
foreach (var punctuation in ".?!,;:_()[]{}'\"/\\".ToCharArray())
{
text = text.Replace(punctuation, ' ');
}
// NOTE(review): as rendered, the Replace below swaps a single space for a single space;
// the original likely collapsed double spaces and lost whitespace in the diff view — verify.
// The result is then lower-cased and trimmed.
text = text.Replace(" ", " ").ToLower().Trim();
return text;
}
/// <summary>
/// Runs each text field of the search content (Title, Description, Body and
/// AdditionalContent) through <c>GetCleanContent</c> and writes the result back
/// onto the same object.
/// </summary>
/// <param name="searchContent">The search content whose text fields are cleaned in place.</param>
private void CleanSearchContent(SearchContent searchContent)
{
    // Fields are processed one at a time, in the same order, so each one is
    // written back before the next one is read.
    var cleanedTitle = GetCleanContent(searchContent.Title);
    searchContent.Title = cleanedTitle;

    var cleanedDescription = GetCleanContent(searchContent.Description);
    searchContent.Description = cleanedDescription;

    var cleanedBody = GetCleanContent(searchContent.Body);
    searchContent.Body = cleanedBody;

    var cleanedAdditionalContent = GetCleanContent(searchContent.AdditionalContent);
    searchContent.AdditionalContent = cleanedAdditionalContent;
}
/// <summary>
/// Reduces a piece of (possibly HTML-encoded) markup to its text.
/// </summary>
/// <param name="content">The raw content; may be null, empty or whitespace.</param>
/// <returns>
/// An empty string when <paramref name="content"/> is null or whitespace; otherwise the
/// inner text of every text node that is not a direct child of a <c>script</c> or
/// <c>style</c> element, joined with single spaces.
/// </returns>
private string GetCleanContent(string content)
{
    if (string.IsNullOrWhiteSpace(content))
    {
        return string.Empty;
    }

    // Decode first, then parse the decoded markup.
    var document = new HtmlDocument();
    document.LoadHtml(WebUtility.HtmlDecode(content));

    var fragments = new List<string>();
    foreach (var node in document.DocumentNode.Descendants())
    {
        if (node.NodeType != HtmlNodeType.Text)
        {
            continue;
        }

        var parentName = node.ParentNode.Name;
        if (parentName == "script" || parentName == "style")
        {
            continue;
        }

        // Whitespace-only nodes are dropped, but kept fragments are added untrimmed.
        var text = node.InnerText;
        if (!string.IsNullOrEmpty(text.Trim()))
        {
            fragments.Add(text);
        }
    }

    return string.Join(" ", fragments);
}
public Task ResetIndex()
{
_searchContentRepository.DeleteAllSearchContent();