refactor: cleanup structure on the scrape plugin
tomsanbear committed Jun 15, 2024
1 parent e2e818c commit 804827f
Showing 4 changed files with 121 additions and 0 deletions.
13 changes: 13 additions & 0 deletions mai-sdk-plugins/src/web_scraping/headless_chrome.rs
@@ -0,0 +1,13 @@
use headless_chrome::LaunchOptions;

/// Fetch the rendered HTML of `url` by driving Chrome; note headless_chrome's API is blocking.
pub async fn get_website_content(url: &str, headless: bool) -> anyhow::Result<String> {
    let browser = headless_chrome::Browser::new(LaunchOptions {
        headless,
        ..Default::default()
    })?;
    let tab = browser.new_tab()?;
    tab.navigate_to(url)?;
    tab.wait_until_navigated()?;
    tab.get_content()
}
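
For context, a minimal caller of this helper might look like the sketch below. The tokio entry point and the example URL are assumptions for illustration, not part of this commit.

// Hypothetical usage sketch (assumption: a tokio runtime with the
// "macros" feature is available in the consuming crate).
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Render the page in a real (headless) Chrome so client-side JS runs.
    let html = get_website_content("https://example.com", true).await?;
    println!("fetched {} bytes of rendered HTML", html.len());
    Ok(())
}
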
3 changes: 3 additions & 0 deletions mai-sdk-plugins/src/web_scraping/reqwest.rs
@@ -0,0 +1,3 @@
pub async fn get_website_content(url: &str) -> anyhow::Result<String> {
    reqwest::get(url).await?.text().await.map_err(Into::into)
}
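
Here map_err(Into::into) converts reqwest::Error into anyhow::Error so both fetch paths share one return type. An equivalent sketch using the ? operator, which applies the same conversion through anyhow's From impl:

pub async fn get_website_content(url: &str) -> anyhow::Result<String> {
    // `?` converts reqwest::Error into anyhow::Error via From.
    let body = reqwest::get(url).await?.text().await?;
    Ok(body)
}
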
14 changes: 14 additions & 0 deletions mai-sdk-plugins/src/web_scraping/state.rs
@@ -0,0 +1,14 @@
use slog::Logger;

#[derive(Debug, Clone)]
pub struct WebScrapingPluginState {
    pub logger: Logger,
}

impl WebScrapingPluginState {
    pub fn new(logger: &Logger) -> Self {
        Self {
            logger: logger.clone(),
        }
    }
}
91 changes: 91 additions & 0 deletions mai-sdk-plugins/src/web_scraping/task.rs
@@ -0,0 +1,91 @@
use anyhow::Result;
use mai_sdk_core::task_queue::{Runnable, TaskId};
use serde::{Deserialize, Serialize};
use slog::info;

use super::state::WebScrapingPluginState;

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WebScrapingPluginTaskScrape {
    pub id: TaskId,
    pub url: String,
    pub enable_js: bool,
    pub headless: bool,
}

impl WebScrapingPluginTaskScrape {
    pub fn new(url: String, enable_js: bool, headless: bool) -> Self {
        Self {
            id: nanoid::nanoid!(),
            url,
            enable_js,
            headless,
        }
    }
}

#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
pub struct WebScrapingPluginTaskScrapeOutput {
    pub content: String,
}

impl Runnable<WebScrapingPluginTaskScrapeOutput, WebScrapingPluginState>
    for WebScrapingPluginTaskScrape
{
    fn id(&self) -> TaskId {
        self.id.clone()
    }

    async fn run(
        &self,
        state: WebScrapingPluginState,
    ) -> Result<WebScrapingPluginTaskScrapeOutput> {
        // Dispatch on enable_js: drive a full browser when client-side JS
        // must execute, otherwise do a plain HTTP fetch.
        let content = match self.enable_js {
            true => {
                info!(state.logger, "running with browser");
                super::headless_chrome::get_website_content(&self.url, self.headless).await?
            }
            false => {
                info!(state.logger, "running without browser");
                super::reqwest::get_website_content(&self.url).await?
            }
        };

        Ok(WebScrapingPluginTaskScrapeOutput { content })
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // Note: these tests hit the live network (google.com) and require
    // outbound access to pass.
    #[tokio::test]
    async fn test_web_scraping_plugin_task_scrape() {
        let logger = slog::Logger::root(slog::Discard, slog::o!());
        let state = WebScrapingPluginState::new(&logger);

        let task = WebScrapingPluginTaskScrape {
            id: "test".into(),
            url: "https://www.google.com".to_string(),
            enable_js: false,
            headless: true,
        };
        let output = task.run(state).await.unwrap();
        assert!(!output.content.is_empty());
    }

    #[tokio::test]
    async fn test_web_scraping_plugin_task_scrape_with_js() {
        let logger = slog::Logger::root(slog::Discard, slog::o!());
        let state = WebScrapingPluginState::new(&logger);

        let task = WebScrapingPluginTaskScrape {
            id: "test".into(),
            url: "https://www.google.com".to_string(),
            enable_js: true,
            headless: true,
        };
        let output = task.run(state).await.unwrap();
        assert!(!output.content.is_empty());
    }
}
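
Putting the pieces together, a caller outside the test module would typically construct a task through the new constructor (which generates a nanoid task id) rather than building the struct literally. A sketch, assuming the Runnable trait is in scope and a tokio runtime is running:

// Hypothetical caller (not part of this commit).
async fn scrape_example(state: WebScrapingPluginState) -> anyhow::Result<()> {
    let task = WebScrapingPluginTaskScrape::new(
        "https://example.com".to_string(),
        false, // enable_js: use the plain reqwest path
        true,  // headless: only consulted when enable_js is true
    );
    let output = task.run(state).await?;
    println!("scraped {} bytes", output.content.len());
    Ok(())
}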
