using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using HtmlAgilityPack;
using Oqtane.Models;
using Oqtane.Repository;
using Oqtane.Services;
using Oqtane.Shared;

namespace Oqtane.Providers
{
    /// <summary>
    /// Search provider that reads and writes search content, and the per-content word index,
    /// through an <see cref="ISearchContentRepository"/>.
    /// </summary>
    public class DatabaseSearchProvider : ISearchProvider
    {
        private readonly ISearchContentRepository _searchContentRepository;

        // Site-setting keys that override the defaults below.
        private const string SearchIgnoreWordsSetting = "Search_IgnoreWords";
        // NOTE(review): the key text is spelled "Mininum". It is a runtime lookup key, so it is left
        // unchanged here - confirm against wherever the setting is written before correcting it.
        private const string SearchMinimumWordLengthSetting = "Search_MininumWordLength";

        // Defaults used when the site settings do not supply a value.
        private const string IgnoreWords = "the,be,to,of,and,a,i,in,that,have,it,for,not,on,with,he,as,you,do,at,this,but,his,by,from,they,we,say,her,she,or,an,will,my,one,all,would,there,their,what,so,up,out,if,about,who,get,which,go,me,when,make,can,like,time,no,just,him,know,take,people,into,year,your,good,some,could,them,see,other,than,then,now,look,only,come,its,over,think,also,back,after,use,two,how,our,work,first,well,way,even,new,want,because,any,these,give,day,most,us";
        private const int MinimumWordLength = 3;

        /// <summary>Returns <see cref="Constants.DefaultSearchProviderName"/>.</summary>
        public string Name => Constants.DefaultSearchProviderName;

        /// <summary>Creates the provider on top of the given repository.</summary>
        /// <param name="searchContentRepository">Repository used for all reads and writes.</param>
        public DatabaseSearchProvider(ISearchContentRepository searchContentRepository)
        {
            _searchContentRepository = searchContentRepository;
        }

        /// <summary>
        /// Loads the search content matching <paramref name="searchQuery"/> from the repository and
        /// converts each item to a <see cref="SearchResult"/>.
        /// </summary>
        /// <param name="searchQuery">The query passed to the repository and used to build snippets and URLs.</param>
        /// <returns>One result per repository item, in the order the repository returned them.</returns>
        public async Task<List<SearchResult>> GetSearchResultsAsync(SearchQuery searchQuery)
        {
            var searchContents = await _searchContentRepository.GetSearchContentsAsync(searchQuery);
            return searchContents.Select(item => ConvertToSearchResult(item, searchQuery)).ToList();
        }

        /// <summary>
        /// Copies a <see cref="SearchContent"/> into a <see cref="SearchResult"/>, resolving the URL
        /// against the query's alias and attaching a snippet and a score.
        /// </summary>
        private SearchResult ConvertToSearchResult(SearchContent searchContent, SearchQuery searchQuery)
        {
            var searchResult = new SearchResult()
            {
                SearchContentId = searchContent.SearchContentId,
                SiteId = searchContent.SiteId,
                EntityName = searchContent.EntityName,
                EntityId = searchContent.EntityId,
                Title = searchContent.Title,
                Description = searchContent.Description,
                Body = searchContent.Body,
                Url = Utilities.TenantUrl(searchQuery.Alias, searchContent.Url),
                Permissions = searchContent.Permissions,
                ContentModifiedBy = searchContent.ContentModifiedBy,
                ContentModifiedOn = searchContent.ContentModifiedOn,
                SearchContentProperties = searchContent.SearchContentProperties,
                Snippet = BuildSnippet(searchContent, searchQuery),
                Score = (searchContent.Count / 100f)
            };

            return searchResult;
        }

        /// <summary>
        /// Builds a text excerpt of at most <c>searchQuery.BodyLength</c> characters (plus "..." markers)
        /// from the title, description and body. The excerpt starts 20 characters before the first
        /// keyword found; when no keyword is found it is the start of the content.
        /// </summary>
        private string BuildSnippet(SearchContent searchContent, SearchQuery searchQuery)
        {
            var content = $"{searchContent.Title} {searchContent.Description} {searchContent.Body}";
            var keywords = SearchUtils.GetKeywords(searchQuery.Keywords);

            // Guard against a negative length so Substring cannot throw.
            var maxLength = Math.Max(0, searchQuery.BodyLength);
            var snippet = string.Empty;

            foreach (var keyword in keywords)
            {
                if (string.IsNullOrWhiteSpace(keyword))
                {
                    continue;
                }

                var index = content.IndexOf(keyword, StringComparison.OrdinalIgnoreCase);
                if (index < 0)
                {
                    continue;
                }

                // Show a little context before the keyword.
                var start = index - 20;
                var prefix = "...";
                var suffix = "...";
                if (start <= 0)
                {
                    start = 0;
                    prefix = string.Empty;
                }

                var length = maxLength;
                // Written as a subtraction so a very large BodyLength cannot overflow.
                if (length >= content.Length - start)
                {
                    length = content.Length - start;
                    suffix = string.Empty;
                }

                snippet = $"{prefix}{content.Substring(start, length)}{suffix}";
                break;
            }

            if (string.IsNullOrEmpty(snippet))
            {
                // Clamp to the content length: the content may be shorter than the requested length.
                snippet = content.Substring(0, Math.Min(maxLength, content.Length));
            }

            foreach (var keyword in keywords)
            {
                if (string.IsNullOrWhiteSpace(keyword))
                {
                    continue;
                }

                // Keywords are matched literally; escaping prevents input such as "c++" or "(" from
                // being interpreted as (possibly invalid) regular expression syntax.
                // NOTE(review): the replacement "$1" re-inserts the matched text unchanged, so this pass
                // does not currently alter the snippet - highlight markup may have been intended; confirm.
                snippet = Regex.Replace(snippet, $"({Regex.Escape(keyword)})", "$1", RegexOptions.IgnoreCase);
            }

            return snippet;
        }

        /// <summary>
        /// Replaces the stored search content for the entity: the existing entry is deleted and, unless
        /// the content is flagged as deleted, the cleaned content is added and its words are indexed.
        /// </summary>
        /// <param name="searchContent">The content to store; its text fields are cleaned in place.</param>
        /// <param name="siteSettings">Site settings that may override the ignore words and minimum word length.</param>
        /// <returns>A completed task; the work is performed synchronously.</returns>
        public Task SaveSearchContent(SearchContent searchContent, Dictionary<string, string> siteSettings)
        {
            // remove existing search content
            _searchContentRepository.DeleteSearchContent(searchContent.EntityName, searchContent.EntityId);

            if (!searchContent.IsDeleted)
            {
                // clean the search content to remove html tags
                CleanSearchContent(searchContent);

                _searchContentRepository.AddSearchContent(searchContent);

                // save the index words
                AnalyzeSearchContent(searchContent, siteSettings);
            }

            return Task.CompletedTask;
        }

        /// <summary>Replaces each text field of the content with its cleaned plain-text form.</summary>
        private void CleanSearchContent(SearchContent searchContent)
        {
            searchContent.Title = GetCleanContent(searchContent.Title);
            searchContent.Description = GetCleanContent(searchContent.Description);
            searchContent.Body = GetCleanContent(searchContent.Body);
            searchContent.AdditionalContent = GetCleanContent(searchContent.AdditionalContent);
        }

        /// <summary>
        /// HTML-decodes the content, parses it as HTML and joins the non-blank text nodes with single
        /// spaces, skipping text whose parent element is <c>script</c> or <c>style</c>.
        /// </summary>
        /// <returns>The joined text, or an empty string when the input is null or whitespace.</returns>
        private string GetCleanContent(string content)
        {
            if (string.IsNullOrWhiteSpace(content))
            {
                return string.Empty;
            }

            content = WebUtility.HtmlDecode(content);

            var page = new HtmlDocument();
            page.LoadHtml(content);

            var phrases = page.DocumentNode.Descendants().Where(i =>
                i.NodeType == HtmlNodeType.Text &&
                i.ParentNode.Name != "script" &&
                i.ParentNode.Name != "style" &&
                !string.IsNullOrEmpty(i.InnerText.Trim())
            ).Select(i => i.InnerText);

            return string.Join(" ", phrases);
        }

        /// <summary>
        /// Counts the indexable words in the content and stores one content-word row per distinct word,
        /// creating the word itself when the repository does not have it yet.
        /// </summary>
        private void AnalyzeSearchContent(SearchContent searchContent, Dictionary<string, string> siteSettings)
        {
            var ignoreWords = IgnoreWords.Split(',');
            if (siteSettings != null
                && siteSettings.TryGetValue(SearchIgnoreWordsSetting, out var ignoreWordsSetting)
                && !string.IsNullOrEmpty(ignoreWordsSetting))
            {
                ignoreWords = ignoreWordsSetting.Split(',');
            }

            // A missing, empty or non-numeric setting keeps the default instead of throwing.
            var minimumWordLength = MinimumWordLength;
            if (siteSettings != null
                && siteSettings.TryGetValue(SearchMinimumWordLengthSetting, out var minimumWordLengthSetting)
                && int.TryParse(minimumWordLengthSetting, out var parsedMinimumWordLength))
            {
                minimumWordLength = parsedMinimumWordLength;
            }

            // analyze the search content and save the index words
            var indexContent = $"{searchContent.Title} {searchContent.Description} {searchContent.Body} {searchContent.AdditionalContent}";
            var words = GetWords(indexContent, ignoreWords, minimumWordLength);

            var existingSearchContentWords = _searchContentRepository.GetSearchContentWords(searchContent.SearchContentId);
            foreach (var kvp in words)
            {
                var now = DateTime.UtcNow;
                var searchContentWord = existingSearchContentWords.FirstOrDefault(i => i.SearchWord.Word == kvp.Key);
                if (searchContentWord != null)
                {
                    searchContentWord.Count = kvp.Value;
                    searchContentWord.ModifiedOn = now;
                    _searchContentRepository.UpdateSearchContentWord(searchContentWord);
                }
                else
                {
                    var searchWord = _searchContentRepository.GetSearchWord(kvp.Key);
                    if (searchWord == null)
                    {
                        searchWord = _searchContentRepository.AddSearchWord(new SearchWord { Word = kvp.Key, CreatedOn = now });
                    }

                    searchContentWord = new SearchContentWord
                    {
                        SearchContentId = searchContent.SearchContentId,
                        SearchWordId = searchWord.SearchWordId,
                        Count = kvp.Value,
                        CreatedOn = now,
                        ModifiedOn = now
                    };
                    _searchContentRepository.AddSearchContentWord(searchContentWord);
                }
            }
        }

        /// <summary>
        /// Splits the content into lower-cased words and counts the occurrences of each word that is at
        /// least <paramref name="minimumWordLength"/> characters long and not in <paramref name="ignoreWords"/>.
        /// </summary>
        /// <param name="content">Text to analyze; punctuation is replaced by spaces first.</param>
        /// <param name="ignoreWords">Words to skip; entries are trimmed and compared case-insensitively.</param>
        /// <param name="minimumWordLength">Shortest word length that is counted.</param>
        /// <returns>Map from word to its number of occurrences.</returns>
        private static Dictionary<string, int> GetWords(string content, string[] ignoreWords, int minimumWordLength)
        {
            content = FormatContent(content);

            var words = new Dictionary<string, int>();
            if (string.IsNullOrEmpty(content))
            {
                return words;
            }

            // Hash set gives constant-time lookups; trimming tolerates settings such as "the, be".
            var ignored = new HashSet<string>(
                (ignoreWords ?? Array.Empty<string>()).Select(w => w.Trim()),
                StringComparer.OrdinalIgnoreCase);

            // A null separator splits on every white-space character, so words separated by
            // newlines or tabs are not glued together.
            foreach (var term in content.Split((char[])null, StringSplitOptions.RemoveEmptyEntries))
            {
                var word = term.ToLower().Trim();
                if (word.Length < minimumWordLength || ignored.Contains(word))
                {
                    continue;
                }

                // count stays 0 when the word has not been seen yet
                words.TryGetValue(word, out var count);
                words[word] = count + 1;
            }

            return words;
        }

        /// <summary>De-entitizes HTML entities and replaces a fixed set of punctuation characters with spaces.</summary>
        private static string FormatContent(string text)
        {
            text = HtmlEntity.DeEntitize(text);
            foreach (var punctuation in ".?!,;:_()[]{}'\"/\\".ToCharArray())
            {
                text = text.Replace(punctuation, ' ');
            }
            return text;
        }

        /// <summary>Deletes all search content of the given site through the repository.</summary>
        /// <param name="siteId">Identifier of the site whose search content is removed.</param>
        /// <returns>A completed task; the work is performed synchronously.</returns>
        public Task DeleteSearchContent(int siteId)
        {
            _searchContentRepository.DeleteAllSearchContent(siteId);
            return Task.CompletedTask;
        }
    }
}