From 536da22cb9600d59c0a9701b91e619d445792396 Mon Sep 17 00:00:00 2001 From: Tom Bruyneel Date: Fri, 17 May 2024 14:17:28 +0200 Subject: [PATCH] remove special case for iodigital --- .../Jobs/WebsitePageIndexingJob.cs | 34 +------------------ .../Implementations/ConversationService.cs | 8 +++++ 2 files changed, 9 insertions(+), 33 deletions(-) diff --git a/src/backoffice/ConversationalSearchPlatform.BackOffice/Jobs/WebsitePageIndexingJob.cs b/src/backoffice/ConversationalSearchPlatform.BackOffice/Jobs/WebsitePageIndexingJob.cs index 39dd330..2542cef 100644 --- a/src/backoffice/ConversationalSearchPlatform.BackOffice/Jobs/WebsitePageIndexingJob.cs +++ b/src/backoffice/ConversationalSearchPlatform.BackOffice/Jobs/WebsitePageIndexingJob.cs @@ -50,7 +50,7 @@ public int TitleScore score = 90; } - if (HtmlNode.Name.ToLower() == "h3") + if (HtmlNode?.Name.ToLower() == "h3") { score = 80; } @@ -331,38 +331,6 @@ private async Task CreateEntry(ApplicationDbContext db, string tenantId, Website } } } - else if (websitePage.Url.Contains("iodigital.com")) - { - List chunks = new List(); - - var nodes = htmlDoc.DocumentNode.SelectNodes("//main"); - - if (nodes != null) - { - foreach (var node in nodes) - { - //var cleanText = Regex.Replace(node.InnerText, @"\s+", " ").Trim(); - var cleanText = WebUtility.HtmlDecode(node.InnerText); - - if (!string.IsNullOrEmpty(cleanText)) - { - var chunkResult = new ChunkResult(); - chunkResult.ArticleNumber = string.Empty; - chunkResult.Text = cleanText; - chunkResult.Packaging = string.Empty; - - chunks.Add(chunkResult); - } - } - } - - if (chunks.Count > 0) - { - ChunkCollection chunkCollection = new ChunkCollection(tenantId, websitePage.Id.ToString(), websitePage.Url, websitePage.ReferenceType.ToString(), websitePage.Language.ToString(), chunks); - - await _vectorizationService.BulkCreateAsync(nameof(WebsitePage), websitePage.Id, scrapeResult.PageTitle, tenantId, UsageType.Indexing, chunkCollection); - } - } else if(websitePage.Url.Contains("tena.co.uk")) { List chunks = new List(); diff --git a/src/backoffice/ConversationalSearchPlatform.BackOffice/Services/Implementations/ConversationService.cs b/src/backoffice/ConversationalSearchPlatform.BackOffice/Services/Implementations/ConversationService.cs index 89d52c5..00bb1ae 100644 --- a/src/backoffice/ConversationalSearchPlatform.BackOffice/Services/Implementations/ConversationService.cs +++ b/src/backoffice/ConversationalSearchPlatform.BackOffice/Services/Implementations/ConversationService.cs @@ -13,6 +13,7 @@ using ConversationalSearchPlatform.BackOffice.Tenants; using Finbuckle.MultiTenant; using GraphQL; +using HtmlAgilityPack; using Jint; using Jint.Fetch; using Microsoft.Extensions.Caching.Memory; @@ -268,6 +269,13 @@ public async Task GetConversationContext(GetConversationCon var engine = new Engine(); engine.SetValue("log", new Action((obj) => _logger.LogInformation(obj.ToString()))) + .SetValue("parseHtml", new Func((html) => + { + var htmlDoc = new HtmlDocument(); + htmlDoc.LoadHtml(html); + + return htmlDoc; + })) .SetValue("fetch", new Func>((uri, options) => FetchClass.Fetch(uri, FetchClass.ExpandoToOptionsObject(options)))) .SetValue("__keyword_string", keywordString) .SetValue("__result", 0)