// oqtane.framework/Oqtane.Shared/Shared/SearchUtils.cs
// 2024-06-03 21:19:42 +08:00
//
// 96 lines
// 2.9 KiB
// C#

using System.Collections;
using System.Collections.Generic;
using System.Net;
using System.Text.RegularExpressions;
namespace Oqtane.Shared
{
/// <summary>
/// Text helpers used when preparing content and keywords for search:
/// stripping HTML tags, decoding entities, removing punctuation and
/// collapsing whitespace.
/// </summary>
public sealed class SearchUtils
{
    // Character class of everything treated as punctuation (note that it also
    // contains the control whitespace characters \t \r \v \f \n, but not ' ').
    // The hyphen must stay escaped: an unescaped '-' between ')' and '+' is
    // parsed as the range ')'..'+' and would not match '-' itself.
    private const string PunctuationMatch = "[~!#\\$%\\^&*\\(\\)\\-+=\\{\\[\\}\\]\\|;:\\x22'<,>\\.\\?\\\\\\t\\r\\v\\f\\n]";

    private static readonly Regex _stripWhiteSpaceRegex = new Regex("\\s+", RegexOptions.Compiled);
    private static readonly Regex _stripTagsRegex = new Regex("<[^<>]*>", RegexOptions.Compiled);

    // Punctuation immediately followed by whitespace / immediately preceded by whitespace.
    private static readonly Regex _afterRegEx = new Regex(PunctuationMatch + "\\s", RegexOptions.Compiled);
    private static readonly Regex _beforeRegEx = new Regex("\\s" + PunctuationMatch, RegexOptions.Compiled);

    /// <summary>
    /// Converts an HTML fragment to plain text.
    /// </summary>
    /// <param name="html">The HTML (optionally entity-encoded) to clean.</param>
    /// <param name="removePunctuation">
    /// When <c>true</c>, punctuation adjacent to whitespace is removed and runs of
    /// whitespace are collapsed to a single space.
    /// </param>
    /// <returns>
    /// The cleaned text, or an empty string when <paramref name="html"/> is null,
    /// empty or whitespace. Each removed tag is replaced by a space, so the result
    /// may start and/or end with a space; it is not trimmed.
    /// </returns>
    public static string Clean(string html, bool removePunctuation)
    {
        if (string.IsNullOrWhiteSpace(html))
        {
            return string.Empty;
        }

        // Entity-encoded markup ("&lt;p&gt;") has to be decoded first,
        // otherwise the tag regex below would not see any tags.
        if (html.Contains("&lt;"))
        {
            html = WebUtility.HtmlDecode(html);
        }

        html = StripTags(html, true);

        // Decode the remaining entities (&amp;, &nbsp;, ...) in the text itself.
        html = WebUtility.HtmlDecode(html);

        if (removePunctuation)
        {
            html = StripPunctuation(html, true);
            html = StripWhiteSpace(html, true);
        }

        return html;
    }

    /// <summary>
    /// Splits a space-separated keyword string into individual keywords.
    /// </summary>
    /// <param name="keywords">The raw keyword string; may be null or empty.</param>
    /// <returns>
    /// The trimmed, non-empty keywords in their original order. Never null.
    /// Only the space character (' ') is used as the separator.
    /// </returns>
    public static IList<string> GetKeywordsList(string keywords)
    {
        var keywordsList = new List<string>();
        if (string.IsNullOrEmpty(keywords))
        {
            return keywordsList;
        }

        var tokens = keywords.Split(new[] { ' ' }, System.StringSplitOptions.RemoveEmptyEntries);
        foreach (var token in tokens)
        {
            // A token can still consist solely of other whitespace (e.g. a tab).
            var keyword = token.Trim();
            if (keyword.Length > 0)
            {
                keywordsList.Add(keyword);
            }
        }

        return keywordsList;
    }

    /// <summary>
    /// Removes everything that looks like a tag (<c>&lt;...&gt;</c>).
    /// </summary>
    /// <param name="html">The text to process.</param>
    /// <param name="retainSpace">Replace each tag with a space instead of nothing.</param>
    private static string StripTags(string html, bool retainSpace)
    {
        return _stripTagsRegex.Replace(html, retainSpace ? " " : string.Empty);
    }

    /// <summary>
    /// Removes punctuation characters that directly follow or directly precede
    /// whitespace. Punctuation inside a word (e.g. "well-known") is kept, as is
    /// punctuation at the very start of the text that has no whitespace before it.
    /// </summary>
    /// <param name="html">The text to process.</param>
    /// <param name="retainSpace">
    /// Replace each whitespace/punctuation pair with a space instead of nothing.
    /// </param>
    /// <returns>
    /// The processed text with leading/trailing double quotes trimmed, or an empty
    /// string for null/whitespace input. The space appended internally is not removed.
    /// </returns>
    private static string StripPunctuation(string html, bool retainSpace)
    {
        if (string.IsNullOrWhiteSpace(html))
        {
            return string.Empty;
        }

        // The appended space lets punctuation at the very end match _afterRegEx.
        string retHTML = html + " ";
        var repString = retainSpace ? " " : string.Empty;

        // Each pass replaces a two-character match with at most one character, so
        // the text shrinks on every iteration and both loops terminate. Repeating
        // is required to peel off runs of punctuation such as ")! ".
        while (_beforeRegEx.IsMatch(retHTML))
        {
            retHTML = _beforeRegEx.Replace(retHTML, repString);
        }

        while (_afterRegEx.IsMatch(retHTML))
        {
            retHTML = _afterRegEx.Replace(retHTML, repString);
        }

        return retHTML.Trim('"');
    }

    /// <summary>
    /// Collapses every run of whitespace.
    /// </summary>
    /// <param name="html">The text to process.</param>
    /// <param name="retainSpace">Replace each run with a single space instead of nothing.</param>
    /// <returns>The processed text, or an empty string for null/whitespace input.</returns>
    private static string StripWhiteSpace(string html, bool retainSpace)
    {
        if (string.IsNullOrWhiteSpace(html))
        {
            return string.Empty;
        }

        return _stripWhiteSpaceRegex.Replace(html, retainSpace ? " " : string.Empty);
    }
}
}