From e1cdc7b387257be0674c500daa4d57e7da85b394 Mon Sep 17 00:00:00 2001 From: Ben Date: Tue, 4 Jun 2024 21:57:50 +0800 Subject: [PATCH] return the words count to calculate the ranking. --- .../Providers/DatabaseSearchProvider.cs | 77 +++++++++++-------- .../Repository/SearchContentRepository.cs | 2 + Oqtane.Server/Services/SearchService.cs | 31 -------- Oqtane.Shared/Shared/SearchUtils.cs | 2 +- 4 files changed, 48 insertions(+), 64 deletions(-) diff --git a/Oqtane.Server/Providers/DatabaseSearchProvider.cs b/Oqtane.Server/Providers/DatabaseSearchProvider.cs index 74884934..adf2cee8 100644 --- a/Oqtane.Server/Providers/DatabaseSearchProvider.cs +++ b/Oqtane.Server/Providers/DatabaseSearchProvider.cs @@ -18,10 +18,6 @@ namespace Oqtane.Providers { private readonly ISearchContentRepository _searchContentRepository; - private const float TitleBoost = 100f; - private const float DescriptionBoost = 10f; - private const float BodyBoost = 10f; - private const float AdditionalContentBoost = 5f; private const string IgnoreWords = "the,be,to,of,and,a,i,in,that,have,it,for,not,on,with,he,as,you,do,at,this,but,his,by,from,they,we,say,her,she,or,an,will,my,one,all,would,there,their,what,so,up,out,if,about,who,get,which,go,me,when,make,can,like,time,no,just,him,know,take,people,into,year,your,good,some,could,them,see,other,than,then,now,look,only,come,its,over,think,also,back,after,use,two,how,our,work,first,well,way,even,new,want,because,any,these,give,day,most,us"; private const int WordMinLength = 3; public string Name => Constants.DefaultSearchProviderName; @@ -55,6 +51,9 @@ namespace Oqtane.Providers //remove exist document _searchContentRepository.DeleteSearchContent(searchContent.EntityName, searchContent.EntityId); + //clean the search content to remove html tags + CleanSearchContent(searchContent); + _searchContentRepository.AddSearchContent(searchContent); //save the index words @@ -152,10 +151,7 @@ namespace Oqtane.Providers var score = 0f; foreach (var keyword in SearchUtils.GetKeywordsList(searchQuery.Keywords)) { - score += Regex.Matches(searchContent.Title, keyword, RegexOptions.IgnoreCase).Count * TitleBoost; - score += Regex.Matches(searchContent.Description, keyword, RegexOptions.IgnoreCase).Count * DescriptionBoost; - score += Regex.Matches(searchContent.Body, keyword, RegexOptions.IgnoreCase).Count * BodyBoost; - score += Regex.Matches(searchContent.AdditionalContent, keyword, RegexOptions.IgnoreCase).Count * AdditionalContentBoost; + score += searchContent.Words.Where(i => i.WordSource.Word.StartsWith(keyword)).Sum(i => i.Count); } return score / 100; @@ -241,37 +237,24 @@ namespace Oqtane.Providers private static Dictionary GetWords(string content, int minLength) { - content = WebUtility.HtmlDecode(content); + content = FormatText(content); var words = new Dictionary(); var ignoreWords = IgnoreWords.Split(','); - var page = new HtmlDocument(); - page.LoadHtml(content); - - var phrases = page.DocumentNode.Descendants().Where(i => - i.NodeType == HtmlNodeType.Text && - i.ParentNode.Name != "script" && - i.ParentNode.Name != "style" && - !string.IsNullOrEmpty(i.InnerText.Trim()) - ).Select(i => FormatText(i.InnerText)); - - foreach (var phrase in phrases) + if (!string.IsNullOrEmpty(content)) { - if (!string.IsNullOrEmpty(phrase)) + foreach (var word in content.Split(' ')) { - foreach (var word in phrase.Split(' ')) + if (word.Length >= minLength && !ignoreWords.Contains(word)) { - if (word.Length >= minLength && !ignoreWords.Contains(word)) + if (!words.ContainsKey(word)) { - if (!words.ContainsKey(word)) - { - words.Add(word, 1); - } - else - { - words[word] += 1; - } + words.Add(word, 1); + } + else + { + words[word] += 1; } } } @@ -288,8 +271,38 @@ namespace Oqtane.Providers text = text.Replace(punctuation, ' '); } text = text.Replace(" ", " ").ToLower().Trim(); - return text; + return text; + } + + private void CleanSearchContent(SearchContent searchContent) + { + searchContent.Title = GetCleanContent(searchContent.Title); + searchContent.Description = GetCleanContent(searchContent.Description); + searchContent.Body = GetCleanContent(searchContent.Body); + searchContent.AdditionalContent = GetCleanContent(searchContent.AdditionalContent); + } + + private string GetCleanContent(string content) + { + if (string.IsNullOrWhiteSpace(content)) + { + return string.Empty; + } + + content = WebUtility.HtmlDecode(content); + + var page = new HtmlDocument(); + page.LoadHtml(content); + + var phrases = page.DocumentNode.Descendants().Where(i => + i.NodeType == HtmlNodeType.Text && + i.ParentNode.Name != "script" && + i.ParentNode.Name != "style" && + !string.IsNullOrEmpty(i.InnerText.Trim()) + ).Select(i => i.InnerText); + + return string.Join(" ", phrases); } } } diff --git a/Oqtane.Server/Repository/SearchContentRepository.cs b/Oqtane.Server/Repository/SearchContentRepository.cs index c1577d81..67ecfcf7 100644 --- a/Oqtane.Server/Repository/SearchContentRepository.cs +++ b/Oqtane.Server/Repository/SearchContentRepository.cs @@ -22,6 +22,8 @@ namespace Oqtane.Repository using var db = _dbContextFactory.CreateDbContext(); var searchContentList = db.SearchContent.AsNoTracking() .Include(i => i.Properties) + .Include(i => i.Words) + .ThenInclude(w => w.WordSource) .Where(i => i.SiteId == searchQuery.SiteId && i.IsActive); if (searchQuery.EntityNames != null && searchQuery.EntityNames.Any()) diff --git a/Oqtane.Server/Services/SearchService.cs b/Oqtane.Server/Services/SearchService.cs index 8fd9aab0..8d3fe4ef 100644 --- a/Oqtane.Server/Services/SearchService.cs +++ b/Oqtane.Server/Services/SearchService.cs @@ -186,7 +186,6 @@ namespace Oqtane.Services { try { - CleanSearchContent(searchContent); searchProvider.SaveSearchContent(searchContent); } catch(Exception ex) @@ -231,35 +230,5 @@ namespace Oqtane.Services return string.Empty; } - - private void CleanSearchContent(SearchContent searchContent) - { - searchContent.Title = GetCleanContent(searchContent.Title); - searchContent.Description = GetCleanContent(searchContent.Description); - searchContent.Body = GetCleanContent(searchContent.Body); - searchContent.AdditionalContent = GetCleanContent(searchContent.AdditionalContent); - } - - private string GetCleanContent(string content) - { - if(string.IsNullOrWhiteSpace(content)) - { - return string.Empty; - } - - content = WebUtility.HtmlDecode(content); - - var page = new HtmlDocument(); - page.LoadHtml(content); - - var phrases = page.DocumentNode.Descendants().Where(i => - i.NodeType == HtmlNodeType.Text && - i.ParentNode.Name != "script" && - i.ParentNode.Name != "style" && - !string.IsNullOrEmpty(i.InnerText.Trim()) - ).Select(i => i.InnerText); - - return string.Join(" ", phrases); - } } } diff --git a/Oqtane.Shared/Shared/SearchUtils.cs b/Oqtane.Shared/Shared/SearchUtils.cs index 745770a5..d49c932d 100644 --- a/Oqtane.Shared/Shared/SearchUtils.cs +++ b/Oqtane.Shared/Shared/SearchUtils.cs @@ -20,7 +20,7 @@ namespace Oqtane.Shared { if (!string.IsNullOrWhiteSpace(keyword.Trim())) { - keywordsList.Add(keyword.Trim()); + keywordsList.Add(keyword.Trim().ToLower()); } } }