Skip to content

Commit

Permalink
remove special case for iodigital
Browse files Browse the repository at this point in the history
  • Loading branch information
tom-b-iodigital committed May 17, 2024
1 parent 2418e88 commit 536da22
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 33 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ public int TitleScore
score = 90;
}

- if (HtmlNode.Name.ToLower() == "h3")
+ if (HtmlNode?.Name.ToLower() == "h3")
{
score = 80;
}
Expand Down Expand Up @@ -331,38 +331,6 @@ private async Task CreateEntry(ApplicationDbContext db, string tenantId, Website
}
}
}
else if (websitePage.Url.Contains("iodigital.com"))
{
List<ChunkResult> chunks = new List<ChunkResult>();

var nodes = htmlDoc.DocumentNode.SelectNodes("//main");

if (nodes != null)
{
foreach (var node in nodes)
{
//var cleanText = Regex.Replace(node.InnerText, @"\s+", " ").Trim();
var cleanText = WebUtility.HtmlDecode(node.InnerText);

if (!string.IsNullOrEmpty(cleanText))
{
var chunkResult = new ChunkResult();
chunkResult.ArticleNumber = string.Empty;
chunkResult.Text = cleanText;
chunkResult.Packaging = string.Empty;

chunks.Add(chunkResult);
}
}
}

if (chunks.Count > 0)
{
ChunkCollection chunkCollection = new ChunkCollection(tenantId, websitePage.Id.ToString(), websitePage.Url, websitePage.ReferenceType.ToString(), websitePage.Language.ToString(), chunks);

await _vectorizationService.BulkCreateAsync(nameof(WebsitePage), websitePage.Id, scrapeResult.PageTitle, tenantId, UsageType.Indexing, chunkCollection);
}
}
else if(websitePage.Url.Contains("tena.co.uk"))
{
List<ChunkResult> chunks = new List<ChunkResult>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
using ConversationalSearchPlatform.BackOffice.Tenants;
using Finbuckle.MultiTenant;
using GraphQL;
using HtmlAgilityPack;
using Jint;
using Jint.Fetch;
using Microsoft.Extensions.Caching.Memory;
Expand Down Expand Up @@ -268,6 +269,13 @@ public async Task<ConversationContext> GetConversationContext(GetConversationCon

var engine = new Engine();
engine.SetValue("log", new Action<object>((obj) => _logger.LogInformation(obj.ToString())))
.SetValue("parseHtml", new Func<string, HtmlAgilityPack.HtmlDocument>((html) =>
{
var htmlDoc = new HtmlDocument();
htmlDoc.LoadHtml(html);
return htmlDoc;
}))
.SetValue("fetch", new Func<string, object, Task<FetchResult>>((uri, options) => FetchClass.Fetch(uri, FetchClass.ExpandoToOptionsObject(options))))
.SetValue("__keyword_string", keywordString)
.SetValue("__result", 0)
Expand Down

0 comments on commit 536da22

Please sign in to comment.