v2022.5.28 release
a5huynh committed May 27, 2022
2 parents a9ce4e8 + 19d8bda commit 7c62cd5
Showing 10 changed files with 103 additions and 62 deletions.
4 changes: 3 additions & 1 deletion crates/client/src/pages/search.rs
@@ -8,7 +8,9 @@ use shared::response;
 
 use crate::components::{ResultListData, SearchResultItem, SelectedLens};
 use crate::events;
-use crate::{on_clear_search, on_focus, on_refresh_results, resize_window, search_docs, search_lenses};
+use crate::{
+    on_clear_search, on_focus, on_refresh_results, resize_window, search_docs, search_lenses,
+};
 
 #[function_component(SearchPage)]
 pub fn search_page() -> Html {
17 changes: 9 additions & 8 deletions crates/entities/src/models/crawl_queue.rs
@@ -351,17 +351,18 @@ pub async fn enqueue_all(
                 }
 
                 // Should we crawl external links?
-                if !settings.crawl_external_links
-                    // Only allow crawls specified in our lenses
-                    && (!allow_list.is_empty() && !allow_list.is_match(&normalized))
-                {
-                    return None;
+                if settings.crawl_external_links {
+                    return Some(normalized);
                 }
 
-                Some(parsed.as_str().to_string())
-            } else {
-                None
+                // If external links are not allowed, only allow crawls specified
+                // in our lenses
+                if allow_list.is_empty() || allow_list.is_match(&normalized) {
+                    return Some(normalized);
+                }
             }
 
+            None
         })
         .collect();
 
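The rewritten check is now two early returns: a URL is always enqueued when `crawl_external_links` is enabled, and otherwise it must be matched by the lens allow list (or the list must be empty); both paths return the normalized URL rather than `parsed.as_str()`. A minimal sketch of that decision, assuming the allow list is a `regex::RegexSet` — the helper name and signature below are illustrative, not taken from `enqueue_all`:

```rust
use regex::RegexSet;

// Illustrative only: mirrors the early-return flow introduced in enqueue_all.
fn should_enqueue(crawl_external_links: bool, allow_list: &RegexSet, normalized: &str) -> Option<String> {
    // External links are always fair game when the user opts in.
    if crawl_external_links {
        return Some(normalized.to_string());
    }

    // Otherwise only enqueue URLs a lens matches (or everything, if no lens rules exist).
    if allow_list.is_empty() || allow_list.is_match(normalized) {
        return Some(normalized.to_string());
    }

    None
}
```
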
2 changes: 1 addition & 1 deletion crates/shared/src/config.rs
@@ -51,7 +51,7 @@ impl Lens
     }
 
     fn default_is_enabled() -> bool {
-        false
+        true
     }
 }
 
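`default_is_enabled` reads like a serde field-default helper, so flipping it from `false` to `true` means a lens definition that omits the flag now loads as enabled. A sketch of how such a default is typically wired up — the struct fields and attribute placement here are assumptions for illustration, not copied from `config.rs`:

```rust
use serde::Deserialize;

fn default_is_enabled() -> bool {
    true
}

// Assumed shape: `is_enabled` falls back to default_is_enabled() when the
// key is missing from the lens definition.
#[derive(Debug, Deserialize)]
pub struct Lens {
    pub name: String,
    #[serde(default = "default_is_enabled")]
    pub is_enabled: bool,
}
```
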
1 change: 0 additions & 1 deletion crates/shared/src/rpc.rs
@@ -36,5 +36,4 @@ pub trait Rpc
 
     #[rpc(name = "search_lenses")]
     fn search_lenses(&self, query: SearchLensesParam) -> BoxFuture<Result<SearchLensesResp>>;
-
 }
7 changes: 2 additions & 5 deletions crates/spyglass/src/api/route.rs
@@ -221,10 +221,7 @@ pub async fn search_lenses(
 }
 
 #[instrument(skip(state))]
-pub async fn delete_doc(
-    state: AppState,
-    id: String
-) -> Result<()> {
+pub async fn delete_doc(state: AppState, id: String) -> Result<()> {
     if let Ok(mut writer) = state.index.writer.lock() {
         if let Err(e) = Searcher::delete(&mut writer, &id) {
             log::error!("Unable to delete doc {} due to {}", id, e);
@@ -234,4 +231,4 @@ pub async fn delete_doc(
     }
 
     Ok(())
-}
+}
121 changes: 80 additions & 41 deletions crates/spyglass/src/crawler/mod.rs
@@ -51,24 +51,26 @@ impl CrawlResult {
     }
 }
 
-fn _normalize_href(url: &Url, href: &str) -> Option<String> {
+fn normalize_href(url: &str, href: &str) -> Option<String> {
     // Force HTTPS, crawler will fallback to HTTP if necessary.
-    if href.starts_with("//") {
-        // schema relative url
-        if let Ok(url) = Url::parse(&format!("{}:{}", "https", href)) {
-            return Some(url.to_string());
-        }
-    } else if href.starts_with("http://") || href.starts_with("https://") {
-        // Force HTTPS, crawler will fallback to HTTP if necessary.
-        if let Ok(url) = Url::parse(href) {
-            let mut url = url;
-            url.set_scheme("https").unwrap();
-            return Some(url.to_string());
-        }
-    } else {
-        // origin or directory relative url
-        if let Ok(url) = url.join(href) {
-            return Some(url.to_string());
+    if let Ok(url) = Url::parse(url) {
+        if href.starts_with("//") {
+            // schema relative url
+            if let Ok(url) = Url::parse(&format!("{}:{}", "https", href)) {
+                return Some(url.to_string());
+            }
+        } else if href.starts_with("http://") || href.starts_with("https://") {
+            // Force HTTPS, crawler will fallback to HTTP if necessary.
+            if let Ok(url) = Url::parse(href) {
+                let mut url = url;
+                url.set_scheme("https").unwrap();
+                return Some(url.to_string());
+            }
+        } else {
+            // origin or directory relative url
+            if let Ok(url) = url.join(href) {
+                return Some(url.to_string());
+            }
         }
     }
 
@@ -177,29 +179,21 @@ impl Crawler {
         let mut hasher = Sha256::new();
         hasher.update(&parse_result.content.as_bytes());
         let content_hash = Some(hex::encode(&hasher.finalize()[..]));
-
-        // Normalize links from scrape result. If the links start with "/" they
-        // should be appended to the current URL.
-        let normalized_links = parse_result
-            .links
-            .iter()
-            .filter_map(|link| _normalize_href(url, link))
-            .collect();
+        log::trace!("content hash: {:?}", content_hash);
 
         let canonical_url = match parse_result.canonical_url {
             Some(canonical) => determine_canonical(url, &canonical),
             None => url.to_string(),
         };
 
-        log::trace!("content hash: {:?}", content_hash);
         CrawlResult {
             content_hash,
             content: Some(parse_result.content),
             description: Some(parse_result.description),
             status: 200,
             title: parse_result.title,
             url: canonical_url,
-            links: normalized_links,
+            links: parse_result.links,
             raw: Some(raw_body.to_string()),
         }
     }
@@ -261,6 +255,15 @@ impl Crawler {
             }
         }
 
+        // Normalize links from scrape result. If the links start with "/" they
+        // should be appended to the current URL.
+        let normalized_links = result
+            .links
+            .iter()
+            .filter_map(|link| normalize_href(&result.url, link))
+            .collect();
+        result.links = normalized_links;
+
         log::trace!(
             "crawl result: {:?} - {:?}\n{:?}",
             result.title,
@@ -277,30 +280,29 @@ mod test {
 
 #[cfg(test)]
 mod test {
+    use entities::models::crawl_queue::CrawlType;
     use entities::models::{crawl_queue, resource_rule};
    use entities::sea_orm::{ActiveModelTrait, Set};
    use entities::test::setup_test_db;
 
-    use crate::crawler::{Crawler, _normalize_href, determine_canonical};
+    use crate::crawler::{determine_canonical, normalize_href, Crawler};
 
     use url::Url;
 
     #[tokio::test]
+    #[ignore]
     async fn test_crawl() {
         let crawler = Crawler::new();
         let url = Url::parse("https://oldschool.runescape.wiki").unwrap();
         let result = crawler.crawl(&url).await;
 
         assert_eq!(result.title, Some("Old School RuneScape Wiki".to_string()));
         assert_eq!(result.url, "https://oldschool.runescape.wiki/".to_string());
-
-        // All links should start w/ http
-        for link in result.links {
-            assert!(link.starts_with("https://"))
-        }
+        assert!(result.links.len() > 0);
     }
 
     #[tokio::test]
+    #[ignore]
     async fn test_fetch() {
         let crawler = Crawler::new();
 
@@ -319,6 +321,43 @@ mod test {
         let result = crawl_result.unwrap();
         assert_eq!(result.title, Some("Old School RuneScape Wiki".to_string()));
         assert_eq!(result.url, "https://oldschool.runescape.wiki/".to_string());
+
+        let links: Vec<String> = result.links.into_iter().collect();
+        assert!(links[0].starts_with("https://oldschool.runescape.wiki"));
     }
 
+    #[tokio::test]
+    #[ignore]
+    async fn test_fetch_bootstrap() {
+        let crawler = Crawler::new();
+
+        let db = setup_test_db().await;
+        let url = Url::parse("https://www.ign.com/wikis/luigis-mansion").unwrap();
+        let query = crawl_queue::ActiveModel {
+            domain: Set(url.host_str().unwrap().to_owned()),
+            url: Set(url.to_string()),
+            crawl_type: Set(CrawlType::Bootstrap),
+            ..Default::default()
+        };
+        let model = query.insert(&db).await.unwrap();
+
+        let crawl_result = crawler.fetch_by_job(&db, model.id).await.unwrap();
+        assert!(crawl_result.is_some());
+
+        let result = crawl_result.unwrap();
+        assert_eq!(
+            result.title,
+            Some("Luigi's Mansion Wiki Guide - IGN".to_string())
+        );
+        assert_eq!(
+            result.url,
+            "https://www.ign.com/wikis/luigis-mansion/".to_string()
+        );
+
+        let links: Vec<String> = result.links.into_iter().collect();
+        for link in links {
+            assert!(!link.starts_with("https://web.archive.org"));
+        }
+    }
+
     #[tokio::test]
@@ -353,31 +392,31 @@
     }
 
     #[test]
-    fn test_normalize_href() {
-        let url = Url::parse("https://example.com").unwrap();
+    fn testnormalize_href() {
+        let url = "https://example.com";
 
         assert_eq!(
-            _normalize_href(&url, "http://foo.com"),
+            normalize_href(&url, "http://foo.com"),
             Some("https://foo.com/".into())
         );
         assert_eq!(
-            _normalize_href(&url, "https://foo.com"),
+            normalize_href(&url, "https://foo.com"),
             Some("https://foo.com/".into())
         );
         assert_eq!(
-            _normalize_href(&url, "//foo.com"),
+            normalize_href(&url, "//foo.com"),
             Some("https://foo.com/".into())
         );
         assert_eq!(
-            _normalize_href(&url, "/foo.html"),
+            normalize_href(&url, "/foo.html"),
             Some("https://example.com/foo.html".into())
         );
         assert_eq!(
-            _normalize_href(&url, "/foo"),
+            normalize_href(&url, "/foo"),
             Some("https://example.com/foo".into())
         );
         assert_eq!(
-            _normalize_href(&url, "foo.html"),
+            normalize_href(&url, "foo.html"),
             Some("https://example.com/foo.html".into())
         );
     }
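Link normalization now runs on the assembled result (`result.links` against `result.url`) rather than on the raw parse output, and the helper takes that URL as a `&str`, so relative hrefs resolve against the canonical URL recorded in the crawl result. A small, self-contained illustration of that resolution behavior using `url::Url::join` — the example URLs are made up:

```rust
use url::Url;

fn main() {
    // Suppose the queued URL redirected to a canonical page under /docs/.
    let fetched = Url::parse("https://example.com/docs/index.html").unwrap();

    // A directory-relative href resolves against the fetched page...
    assert_eq!(
        fetched.join("guide.html").unwrap().as_str(),
        "https://example.com/docs/guide.html"
    );

    // ...while a root-relative href resolves against the origin.
    assert_eq!(
        fetched.join("/about").unwrap().as_str(),
        "https://example.com/about"
    );
}
```
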
2 changes: 1 addition & 1 deletion crates/spyglass/src/crawler/robots.rs
@@ -107,7 +107,7 @@ pub async fn check_resource_rules(
         .await?;
 
     if rules.is_empty() {
-        log::info!("No rules found for this domain, fetching robot.txt");
+        log::info!("No rules found for <{}>, fetching robot.txt", domain);
 
         let robots_url = format!("https://{}/robots.txt", domain);
         let res = client.get(robots_url).send().await;
2 changes: 2 additions & 0 deletions crates/spyglass/src/task.rs
@@ -120,11 +120,13 @@ async fn _handle_fetch(state: AppState, crawler: Crawler, task: CrawlTask) {
 
     // Add all valid, non-duplicate, non-indexed links found to crawl queue
     let to_enqueue: Vec<String> = crawl_result.links.into_iter().collect();
+
     let lenses: Vec<Lens> = state
         .lenses
         .iter()
         .map(|entry| entry.value().clone())
         .collect();
+
     if let Err(err) = crawl_queue::enqueue_all(
         &state.db,
         &to_enqueue,
7 changes: 4 additions & 3 deletions crates/tauri/src/cmd.rs
@@ -106,12 +106,13 @@ pub async fn delete_doc<'r>(
     let mut rpc = rpc.lock().await;
     match rpc
         .client
-        .call_method::<(String,), ()>("delete_doc", "", (id.into(),)).await {
-
+        .call_method::<(String,), ()>("delete_doc", "", (id.into(),))
+        .await
+    {
         Ok(_) => {
             let _ = window.emit("refresh_results", true);
             Ok(())
-        },
+        }
         Err(err) => {
             log::error!("Error sending RPC: {}", err);
             rpc.reconnect().await;
2 changes: 1 addition & 1 deletion crates/tauri/tauri.conf.json
@@ -1,7 +1,7 @@
 {
   "package": {
     "productName": "Spyglass",
-    "version": "22.5.27"
+    "version": "22.5.28"
   },
   "build": {
     "distDir": "../client/dist",
