-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor: cleanup structure on the scrape plugin
- Loading branch information
1 parent
e2e818c
commit 804827f
Showing
4 changed files
with
121 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
use headless_chrome::LaunchOptions; | ||
|
||
pub async fn get_website_content(url: &str, headless: bool) -> anyhow::Result<String> { | ||
let browser = headless_chrome::Browser::new(LaunchOptions { | ||
headless, | ||
..Default::default() | ||
}) | ||
.unwrap(); | ||
let tab = browser.new_tab()?; | ||
tab.navigate_to(url)?; | ||
tab.wait_until_navigated()?; | ||
tab.get_content() | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
pub async fn get_website_content(url: &str) -> anyhow::Result<String> { | ||
reqwest::get(url).await?.text().await.map_err(Into::into) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
use slog::Logger; | ||
|
||
/// Shared state handed to web-scraping tasks; currently just a logger.
#[derive(Debug, Clone)]
pub struct WebScrapingPluginState {
    // Owned clone of the application logger, used for task-level logging.
    pub logger: Logger,
}
|
||
impl WebScrapingPluginState { | ||
pub fn new(logger: &Logger) -> Self { | ||
Self { | ||
logger: logger.clone(), | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
use anyhow::Result; | ||
use mai_sdk_core::task_queue::{Runnable, TaskId}; | ||
use serde::{Deserialize, Serialize}; | ||
use slog::info; | ||
|
||
use super::state::WebScrapingPluginState; | ||
|
||
/// Description of a task that scrapes a single URL.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WebScrapingPluginTaskScrape {
    // Identifier used by the task queue to track this task.
    pub id: TaskId,
    // Target URL to fetch.
    pub url: String,
    // When true, render the page via headless Chrome so JavaScript runs;
    // when false, fetch the raw body over plain HTTP.
    pub enable_js: bool,
    // Whether the browser window is hidden; only consulted when
    // `enable_js` is true.
    pub headless: bool,
}
|
||
impl WebScrapingPluginTaskScrape { | ||
pub fn new(url: String, enable_js: bool, headless: bool) -> Self { | ||
Self { | ||
id: nanoid::nanoid!(), | ||
url, | ||
enable_js, | ||
headless, | ||
} | ||
} | ||
} | ||
|
||
/// Result of a scrape task: the fetched page content as a string.
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
pub struct WebScrapingPluginTaskScrapeOutput {
    // Page content returned by the fetcher (raw HTML, or the rendered
    // document when JS was enabled).
    pub content: String,
}
|
||
impl Runnable<WebScrapingPluginTaskScrapeOutput, WebScrapingPluginState> | ||
for WebScrapingPluginTaskScrape | ||
{ | ||
fn id(&self) -> TaskId { | ||
self.id.clone() | ||
} | ||
|
||
async fn run( | ||
&self, | ||
state: WebScrapingPluginState, | ||
) -> Result<WebScrapingPluginTaskScrapeOutput> { | ||
let content = match self.enable_js { | ||
true => { | ||
info!(state.logger, "running with browser"); | ||
super::headless_chrome::get_website_content(&self.url, self.headless).await? | ||
} | ||
false => { | ||
info!(state.logger, "running without browser"); | ||
super::reqwest::get_website_content(&self.url).await? | ||
} | ||
}; | ||
|
||
Ok(WebScrapingPluginTaskScrapeOutput { content }) | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod tests {
    use super::*;

    /// Shared setup: a state backed by a discard logger, plus a scrape
    /// task for google.com with the requested JS setting.
    fn scrape_fixture(enable_js: bool) -> (WebScrapingPluginState, WebScrapingPluginTaskScrape) {
        let logger = slog::Logger::root(slog::Discard, slog::o!());
        let state = WebScrapingPluginState::new(&logger);
        let task = WebScrapingPluginTaskScrape {
            id: "test".into(),
            url: "https://www.google.com".to_string(),
            enable_js,
            headless: true,
        };
        (state, task)
    }

    // NOTE(review): both tests hit the live network (google.com) and the
    // JS variant additionally requires a local Chrome install — they will
    // fail offline / in minimal CI; consider #[ignore] or a mock server.

    #[tokio::test]
    async fn test_web_scraping_plugin_task_scrape() {
        let (state, task) = scrape_fixture(false);
        let output = task.run(state).await.unwrap();
        assert!(!output.content.is_empty());
    }

    #[tokio::test]
    async fn test_web_scraping_plugin_task_scrape_with_js() {
        let (state, task) = scrape_fixture(true);
        let output = task.run(state).await.unwrap();
        assert!(!output.content.is_empty());
    }
}