Return the word count to calculate the ranking.

This commit is contained in:
Ben
2024-06-04 21:57:50 +08:00
parent d9d917e267
commit e1cdc7b387
4 changed files with 48 additions and 64 deletions

View File

@ -18,10 +18,6 @@ namespace Oqtane.Providers
{ {
private readonly ISearchContentRepository _searchContentRepository; private readonly ISearchContentRepository _searchContentRepository;
private const float TitleBoost = 100f;
private const float DescriptionBoost = 10f;
private const float BodyBoost = 10f;
private const float AdditionalContentBoost = 5f;
private const string IgnoreWords = "the,be,to,of,and,a,i,in,that,have,it,for,not,on,with,he,as,you,do,at,this,but,his,by,from,they,we,say,her,she,or,an,will,my,one,all,would,there,their,what,so,up,out,if,about,who,get,which,go,me,when,make,can,like,time,no,just,him,know,take,people,into,year,your,good,some,could,them,see,other,than,then,now,look,only,come,its,over,think,also,back,after,use,two,how,our,work,first,well,way,even,new,want,because,any,these,give,day,most,us"; private const string IgnoreWords = "the,be,to,of,and,a,i,in,that,have,it,for,not,on,with,he,as,you,do,at,this,but,his,by,from,they,we,say,her,she,or,an,will,my,one,all,would,there,their,what,so,up,out,if,about,who,get,which,go,me,when,make,can,like,time,no,just,him,know,take,people,into,year,your,good,some,could,them,see,other,than,then,now,look,only,come,its,over,think,also,back,after,use,two,how,our,work,first,well,way,even,new,want,because,any,these,give,day,most,us";
private const int WordMinLength = 3; private const int WordMinLength = 3;
public string Name => Constants.DefaultSearchProviderName; public string Name => Constants.DefaultSearchProviderName;
@ -55,6 +51,9 @@ namespace Oqtane.Providers
//remove exist document //remove exist document
_searchContentRepository.DeleteSearchContent(searchContent.EntityName, searchContent.EntityId); _searchContentRepository.DeleteSearchContent(searchContent.EntityName, searchContent.EntityId);
//clean the search content to remove html tags
CleanSearchContent(searchContent);
_searchContentRepository.AddSearchContent(searchContent); _searchContentRepository.AddSearchContent(searchContent);
//save the index words //save the index words
@ -152,10 +151,7 @@ namespace Oqtane.Providers
var score = 0f; var score = 0f;
foreach (var keyword in SearchUtils.GetKeywordsList(searchQuery.Keywords)) foreach (var keyword in SearchUtils.GetKeywordsList(searchQuery.Keywords))
{ {
score += Regex.Matches(searchContent.Title, keyword, RegexOptions.IgnoreCase).Count * TitleBoost; score += searchContent.Words.Where(i => i.WordSource.Word.StartsWith(keyword)).Sum(i => i.Count);
score += Regex.Matches(searchContent.Description, keyword, RegexOptions.IgnoreCase).Count * DescriptionBoost;
score += Regex.Matches(searchContent.Body, keyword, RegexOptions.IgnoreCase).Count * BodyBoost;
score += Regex.Matches(searchContent.AdditionalContent, keyword, RegexOptions.IgnoreCase).Count * AdditionalContentBoost;
} }
return score / 100; return score / 100;
@ -241,37 +237,24 @@ namespace Oqtane.Providers
private static Dictionary<string, int> GetWords(string content, int minLength) private static Dictionary<string, int> GetWords(string content, int minLength)
{ {
content = WebUtility.HtmlDecode(content); content = FormatText(content);
var words = new Dictionary<string, int>(); var words = new Dictionary<string, int>();
var ignoreWords = IgnoreWords.Split(','); var ignoreWords = IgnoreWords.Split(',');
var page = new HtmlDocument(); if (!string.IsNullOrEmpty(content))
page.LoadHtml(content);
var phrases = page.DocumentNode.Descendants().Where(i =>
i.NodeType == HtmlNodeType.Text &&
i.ParentNode.Name != "script" &&
i.ParentNode.Name != "style" &&
!string.IsNullOrEmpty(i.InnerText.Trim())
).Select(i => FormatText(i.InnerText));
foreach (var phrase in phrases)
{ {
if (!string.IsNullOrEmpty(phrase)) foreach (var word in content.Split(' '))
{ {
foreach (var word in phrase.Split(' ')) if (word.Length >= minLength && !ignoreWords.Contains(word))
{ {
if (word.Length >= minLength && !ignoreWords.Contains(word)) if (!words.ContainsKey(word))
{ {
if (!words.ContainsKey(word)) words.Add(word, 1);
{ }
words.Add(word, 1); else
} {
else words[word] += 1;
{
words[word] += 1;
}
} }
} }
} }
@ -288,8 +271,38 @@ namespace Oqtane.Providers
text = text.Replace(punctuation, ' '); text = text.Replace(punctuation, ' ');
} }
text = text.Replace(" ", " ").ToLower().Trim(); text = text.Replace(" ", " ").ToLower().Trim();
return text;
return text;
}
/// <summary>
/// Overwrites the Title, Description, Body and AdditionalContent of the given
/// search content with the result of GetCleanContent for each field.
/// The object is modified in place; nothing is returned.
/// </summary>
/// <param name="searchContent">The search content whose four text fields are replaced.</param>
private void CleanSearchContent(SearchContent searchContent)
{
searchContent.Title = GetCleanContent(searchContent.Title);
searchContent.Description = GetCleanContent(searchContent.Description);
searchContent.Body = GetCleanContent(searchContent.Body);
searchContent.AdditionalContent = GetCleanContent(searchContent.AdditionalContent);
}
/// <summary>
/// Reduces a piece of content to the text of its text nodes, joined by single spaces.
/// </summary>
/// <param name="content">Raw content; may be null, empty or whitespace.</param>
/// <returns>
/// string.Empty when the input is null, empty or whitespace; otherwise the InnerText
/// of every selected text node, joined with a single space.
/// </returns>
private string GetCleanContent(string content)
{
// Nothing to clean: return an empty string rather than null.
if (string.IsNullOrWhiteSpace(content))
{
return string.Empty;
}
// Decode HTML entities before the content is parsed.
content = WebUtility.HtmlDecode(content);
// NOTE(review): HtmlDocument / HtmlNodeType presumably come from HtmlAgilityPack - confirm against the file's using directives.
var page = new HtmlDocument();
page.LoadHtml(content);
// Keep only text nodes whose direct parent is neither a script nor a style element
// and whose text is not empty after trimming.
var phrases = page.DocumentNode.Descendants().Where(i =>
i.NodeType == HtmlNodeType.Text &&
i.ParentNode.Name != "script" &&
i.ParentNode.Name != "style" &&
!string.IsNullOrEmpty(i.InnerText.Trim())
).Select(i => i.InnerText);
// The untrimmed InnerText of each kept node is joined with one space.
return string.Join(" ", phrases);
} }
} }
} }

View File

@ -22,6 +22,8 @@ namespace Oqtane.Repository
using var db = _dbContextFactory.CreateDbContext(); using var db = _dbContextFactory.CreateDbContext();
var searchContentList = db.SearchContent.AsNoTracking() var searchContentList = db.SearchContent.AsNoTracking()
.Include(i => i.Properties) .Include(i => i.Properties)
.Include(i => i.Words)
.ThenInclude(w => w.WordSource)
.Where(i => i.SiteId == searchQuery.SiteId && i.IsActive); .Where(i => i.SiteId == searchQuery.SiteId && i.IsActive);
if (searchQuery.EntityNames != null && searchQuery.EntityNames.Any()) if (searchQuery.EntityNames != null && searchQuery.EntityNames.Any())

View File

@ -186,7 +186,6 @@ namespace Oqtane.Services
{ {
try try
{ {
CleanSearchContent(searchContent);
searchProvider.SaveSearchContent(searchContent); searchProvider.SaveSearchContent(searchContent);
} }
catch(Exception ex) catch(Exception ex)
@ -231,35 +230,5 @@ namespace Oqtane.Services
return string.Empty; return string.Empty;
} }
/// <summary>
/// Overwrites the Title, Description, Body and AdditionalContent of the given
/// search content with the result of GetCleanContent for each field.
/// The object is modified in place; nothing is returned.
/// </summary>
/// <param name="searchContent">The search content whose four text fields are replaced.</param>
private void CleanSearchContent(SearchContent searchContent)
{
searchContent.Title = GetCleanContent(searchContent.Title);
searchContent.Description = GetCleanContent(searchContent.Description);
searchContent.Body = GetCleanContent(searchContent.Body);
searchContent.AdditionalContent = GetCleanContent(searchContent.AdditionalContent);
}
/// <summary>
/// Reduces a piece of content to the text of its text nodes, joined by single spaces.
/// </summary>
/// <param name="content">Raw content; may be null, empty or whitespace.</param>
/// <returns>
/// string.Empty when the input is null, empty or whitespace; otherwise the InnerText
/// of every selected text node, joined with a single space.
/// </returns>
private string GetCleanContent(string content)
{
// Nothing to clean: return an empty string rather than null.
if(string.IsNullOrWhiteSpace(content))
{
return string.Empty;
}
// Decode HTML entities before the content is parsed.
content = WebUtility.HtmlDecode(content);
// NOTE(review): HtmlDocument / HtmlNodeType presumably come from HtmlAgilityPack - confirm against the file's using directives.
var page = new HtmlDocument();
page.LoadHtml(content);
// Keep only text nodes whose direct parent is neither a script nor a style element
// and whose text is not empty after trimming.
var phrases = page.DocumentNode.Descendants().Where(i =>
i.NodeType == HtmlNodeType.Text &&
i.ParentNode.Name != "script" &&
i.ParentNode.Name != "style" &&
!string.IsNullOrEmpty(i.InnerText.Trim())
).Select(i => i.InnerText);
// The untrimmed InnerText of each kept node is joined with one space.
return string.Join(" ", phrases);
}
} }
} }

View File

@ -20,7 +20,7 @@ namespace Oqtane.Shared
{ {
if (!string.IsNullOrWhiteSpace(keyword.Trim())) if (!string.IsNullOrWhiteSpace(keyword.Trim()))
{ {
keywordsList.Add(keyword.Trim()); keywordsList.Add(keyword.Trim().ToLower());
} }
} }
} }