From 804827fad08c3b59ad66cc6634b8119377795d8e Mon Sep 17 00:00:00 2001
From: Thomas Santerre
Date: Sat, 15 Jun 2024 16:59:01 -0400
Subject: [PATCH] refactor: cleanup structure on the scrape plugin

---
 .../src/web_scraping/headless_chrome.rs     | 13 +++
 mai-sdk-plugins/src/web_scraping/reqwest.rs |  3 +
 mai-sdk-plugins/src/web_scraping/state.rs   | 14 +++
 mai-sdk-plugins/src/web_scraping/task.rs    | 91 +++++++++++++++++++
 4 files changed, 121 insertions(+)
 create mode 100644 mai-sdk-plugins/src/web_scraping/headless_chrome.rs
 create mode 100644 mai-sdk-plugins/src/web_scraping/reqwest.rs
 create mode 100644 mai-sdk-plugins/src/web_scraping/state.rs
 create mode 100644 mai-sdk-plugins/src/web_scraping/task.rs

diff --git a/mai-sdk-plugins/src/web_scraping/headless_chrome.rs b/mai-sdk-plugins/src/web_scraping/headless_chrome.rs
new file mode 100644
index 0000000..970feba
--- /dev/null
+++ b/mai-sdk-plugins/src/web_scraping/headless_chrome.rs
@@ -0,0 +1,13 @@
+use headless_chrome::LaunchOptions;
+
+pub async fn get_website_content(url: &str, headless: bool) -> anyhow::Result<String> {
+    // Propagate launch failures to the caller instead of panicking with `unwrap`.
+    let browser = headless_chrome::Browser::new(LaunchOptions {
+        headless,
+        ..Default::default()
+    })?;
+    let tab = browser.new_tab()?;
+    tab.navigate_to(url)?;
+    tab.wait_until_navigated()?;
+    tab.get_content()
+}
diff --git a/mai-sdk-plugins/src/web_scraping/reqwest.rs b/mai-sdk-plugins/src/web_scraping/reqwest.rs
new file mode 100644
index 0000000..0ebde1f
--- /dev/null
+++ b/mai-sdk-plugins/src/web_scraping/reqwest.rs
@@ -0,0 +1,3 @@
+pub async fn get_website_content(url: &str) -> anyhow::Result<String> {
+    reqwest::get(url).await?.text().await.map_err(Into::into)
+}
diff --git a/mai-sdk-plugins/src/web_scraping/state.rs b/mai-sdk-plugins/src/web_scraping/state.rs
new file mode 100644
index 0000000..8f640dd
--- /dev/null
+++ b/mai-sdk-plugins/src/web_scraping/state.rs
@@ -0,0 +1,14 @@
+use slog::Logger;
+
+#[derive(Debug, Clone)]
+pub struct WebScrapingPluginState {
+    pub logger: Logger,
+}
+
+impl WebScrapingPluginState {
+    pub fn new(logger: &Logger) -> Self {
+        Self {
+            logger: logger.clone(),
+        }
+    }
+}
diff --git a/mai-sdk-plugins/src/web_scraping/task.rs b/mai-sdk-plugins/src/web_scraping/task.rs
new file mode 100644
index 0000000..70ebf57
--- /dev/null
+++ b/mai-sdk-plugins/src/web_scraping/task.rs
@@ -0,0 +1,91 @@
+use anyhow::Result;
+use mai_sdk_core::task_queue::{Runnable, TaskId};
+use serde::{Deserialize, Serialize};
+use slog::info;
+
+use super::state::WebScrapingPluginState;
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct WebScrapingPluginTaskScrape {
+    pub id: TaskId,
+    pub url: String,
+    pub enable_js: bool,
+    pub headless: bool,
+}
+
+impl WebScrapingPluginTaskScrape {
+    pub fn new(url: String, enable_js: bool, headless: bool) -> Self {
+        Self {
+            id: nanoid::nanoid!(),
+            url,
+            enable_js,
+            headless,
+        }
+    }
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
+pub struct WebScrapingPluginTaskScrapeOutput {
+    pub content: String,
+}
+
+impl Runnable<WebScrapingPluginTaskScrapeOutput, WebScrapingPluginState>
+    for WebScrapingPluginTaskScrape
+{
+    fn id(&self) -> TaskId {
+        self.id.clone()
+    }
+
+    async fn run(
+        &self,
+        state: WebScrapingPluginState,
+    ) -> Result<WebScrapingPluginTaskScrapeOutput> {
+        let content = match self.enable_js {
+            true => {
+                info!(state.logger, "running with browser");
+                super::headless_chrome::get_website_content(&self.url, self.headless).await?
+            }
+            false => {
+                info!(state.logger, "running without browser");
+                super::reqwest::get_website_content(&self.url).await?
+            }
+        };
+
+        Ok(WebScrapingPluginTaskScrapeOutput { content })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_web_scraping_plugin_task_scrape() {
+        let logger = slog::Logger::root(slog::Discard, slog::o!());
+        let state = WebScrapingPluginState::new(&logger);
+
+        let task = WebScrapingPluginTaskScrape {
+            id: "test".into(),
+            url: "https://www.google.com".to_string(),
+            enable_js: false,
+            headless: true,
+        };
+        let output = task.run(state).await.unwrap();
+        assert!(!output.content.is_empty());
+    }
+
+    #[tokio::test]
+    async fn test_web_scraping_plugin_task_scrape_with_js() {
+        let logger = slog::Logger::root(slog::Discard, slog::o!());
+        let state = WebScrapingPluginState::new(&logger);
+
+        let task = WebScrapingPluginTaskScrape {
+            id: "test".into(),
+            url: "https://www.google.com".to_string(),
+            enable_js: true,
+            headless: true,
+        };
+        let output = task.run(state).await.unwrap();
+        assert!(!output.content.is_empty());
+    }
+}
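
Usage sketch (not part of the patch): a minimal sketch of driving the scrape task end to end, assuming a Tokio runtime, that `WebScrapingPluginState` and `WebScrapingPluginTaskScrape` are re-exported from a `mai_sdk_plugins::web_scraping` module path (an assumption, not confirmed by the patch), and that the `Runnable` trait is in scope; https://example.com is an illustrative URL.

    // Module paths below are assumed; adjust to however the crate re-exports them.
    use mai_sdk_core::task_queue::Runnable;
    use mai_sdk_plugins::web_scraping::{WebScrapingPluginState, WebScrapingPluginTaskScrape};

    #[tokio::main]
    async fn main() -> anyhow::Result<()> {
        let logger = slog::Logger::root(slog::Discard, slog::o!());
        let state = WebScrapingPluginState::new(&logger);

        // enable_js = false fetches over plain HTTP via reqwest;
        // set it to true to render the page with headless Chrome instead.
        let task = WebScrapingPluginTaskScrape::new("https://example.com".to_string(), false, true);

        let output = task.run(state).await?;
        println!("scraped {} bytes", output.content.len());
        Ok(())
    }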