return the words count to calculate the ranking.

This commit is contained in:
Ben
2024-06-04 21:57:50 +08:00
parent d9d917e267
commit e1cdc7b387
4 changed files with 48 additions and 64 deletions

View File

@ -18,10 +18,6 @@ namespace Oqtane.Providers
{
private readonly ISearchContentRepository _searchContentRepository;
private const float TitleBoost = 100f;
private const float DescriptionBoost = 10f;
private const float BodyBoost = 10f;
private const float AdditionalContentBoost = 5f;
private const string IgnoreWords = "the,be,to,of,and,a,i,in,that,have,it,for,not,on,with,he,as,you,do,at,this,but,his,by,from,they,we,say,her,she,or,an,will,my,one,all,would,there,their,what,so,up,out,if,about,who,get,which,go,me,when,make,can,like,time,no,just,him,know,take,people,into,year,your,good,some,could,them,see,other,than,then,now,look,only,come,its,over,think,also,back,after,use,two,how,our,work,first,well,way,even,new,want,because,any,these,give,day,most,us";
private const int WordMinLength = 3;
public string Name => Constants.DefaultSearchProviderName;
@ -55,6 +51,9 @@ namespace Oqtane.Providers
//remove exist document
_searchContentRepository.DeleteSearchContent(searchContent.EntityName, searchContent.EntityId);
//clean the search content to remove html tags
CleanSearchContent(searchContent);
_searchContentRepository.AddSearchContent(searchContent);
//save the index words
@ -152,10 +151,7 @@ namespace Oqtane.Providers
var score = 0f;
foreach (var keyword in SearchUtils.GetKeywordsList(searchQuery.Keywords))
{
score += Regex.Matches(searchContent.Title, keyword, RegexOptions.IgnoreCase).Count * TitleBoost;
score += Regex.Matches(searchContent.Description, keyword, RegexOptions.IgnoreCase).Count * DescriptionBoost;
score += Regex.Matches(searchContent.Body, keyword, RegexOptions.IgnoreCase).Count * BodyBoost;
score += Regex.Matches(searchContent.AdditionalContent, keyword, RegexOptions.IgnoreCase).Count * AdditionalContentBoost;
score += searchContent.Words.Where(i => i.WordSource.Word.StartsWith(keyword)).Sum(i => i.Count);
}
return score / 100;
@ -241,37 +237,24 @@ namespace Oqtane.Providers
private static Dictionary<string, int> GetWords(string content, int minLength)
{
content = WebUtility.HtmlDecode(content);
content = FormatText(content);
var words = new Dictionary<string, int>();
var ignoreWords = IgnoreWords.Split(',');
var page = new HtmlDocument();
page.LoadHtml(content);
var phrases = page.DocumentNode.Descendants().Where(i =>
i.NodeType == HtmlNodeType.Text &&
i.ParentNode.Name != "script" &&
i.ParentNode.Name != "style" &&
!string.IsNullOrEmpty(i.InnerText.Trim())
).Select(i => FormatText(i.InnerText));
foreach (var phrase in phrases)
if (!string.IsNullOrEmpty(content))
{
if (!string.IsNullOrEmpty(phrase))
foreach (var word in content.Split(' '))
{
foreach (var word in phrase.Split(' '))
if (word.Length >= minLength && !ignoreWords.Contains(word))
{
if (word.Length >= minLength && !ignoreWords.Contains(word))
if (!words.ContainsKey(word))
{
if (!words.ContainsKey(word))
{
words.Add(word, 1);
}
else
{
words[word] += 1;
}
words.Add(word, 1);
}
else
{
words[word] += 1;
}
}
}
@ -288,8 +271,38 @@ namespace Oqtane.Providers
text = text.Replace(punctuation, ' ');
}
text = text.Replace(" ", " ").ToLower().Trim();
return text;
return text;
}
private void CleanSearchContent(SearchContent searchContent)
{
searchContent.Title = GetCleanContent(searchContent.Title);
searchContent.Description = GetCleanContent(searchContent.Description);
searchContent.Body = GetCleanContent(searchContent.Body);
searchContent.AdditionalContent = GetCleanContent(searchContent.AdditionalContent);
}
private string GetCleanContent(string content)
{
if (string.IsNullOrWhiteSpace(content))
{
return string.Empty;
}
content = WebUtility.HtmlDecode(content);
var page = new HtmlDocument();
page.LoadHtml(content);
var phrases = page.DocumentNode.Descendants().Where(i =>
i.NodeType == HtmlNodeType.Text &&
i.ParentNode.Name != "script" &&
i.ParentNode.Name != "style" &&
!string.IsNullOrEmpty(i.InnerText.Trim())
).Select(i => i.InnerText);
return string.Join(" ", phrases);
}
}
}

View File

@ -22,6 +22,8 @@ namespace Oqtane.Repository
using var db = _dbContextFactory.CreateDbContext();
var searchContentList = db.SearchContent.AsNoTracking()
.Include(i => i.Properties)
.Include(i => i.Words)
.ThenInclude(w => w.WordSource)
.Where(i => i.SiteId == searchQuery.SiteId && i.IsActive);
if (searchQuery.EntityNames != null && searchQuery.EntityNames.Any())

View File

@ -186,7 +186,6 @@ namespace Oqtane.Services
{
try
{
CleanSearchContent(searchContent);
searchProvider.SaveSearchContent(searchContent);
}
catch(Exception ex)
@ -231,35 +230,5 @@ namespace Oqtane.Services
return string.Empty;
}
private void CleanSearchContent(SearchContent searchContent)
{
searchContent.Title = GetCleanContent(searchContent.Title);
searchContent.Description = GetCleanContent(searchContent.Description);
searchContent.Body = GetCleanContent(searchContent.Body);
searchContent.AdditionalContent = GetCleanContent(searchContent.AdditionalContent);
}
private string GetCleanContent(string content)
{
if(string.IsNullOrWhiteSpace(content))
{
return string.Empty;
}
content = WebUtility.HtmlDecode(content);
var page = new HtmlDocument();
page.LoadHtml(content);
var phrases = page.DocumentNode.Descendants().Where(i =>
i.NodeType == HtmlNodeType.Text &&
i.ParentNode.Name != "script" &&
i.ParentNode.Name != "style" &&
!string.IsNullOrEmpty(i.InnerText.Trim())
).Select(i => i.InnerText);
return string.Join(" ", phrases);
}
}
}

View File

@ -20,7 +20,7 @@ namespace Oqtane.Shared
{
if (!string.IsNullOrWhiteSpace(keyword.Trim()))
{
keywordsList.Add(keyword.Trim());
keywordsList.Add(keyword.Trim().ToLower());
}
}
}