v2022.5.28 release
a5huynh committed May 27, 2022
2 parents a9ce4e8 + 19d8bda commit 7c62cd5
Showing 10 changed files with 103 additions and 62 deletions.
4 changes: 3 additions & 1 deletion crates/client/src/pages/search.rs
@@ -8,7 +8,9 @@ use shared::response;
 
 use crate::components::{ResultListData, SearchResultItem, SelectedLens};
 use crate::events;
-use crate::{on_clear_search, on_focus, on_refresh_results, resize_window, search_docs, search_lenses};
+use crate::{
+    on_clear_search, on_focus, on_refresh_results, resize_window, search_docs, search_lenses,
+};
 
 #[function_component(SearchPage)]
 pub fn search_page() -> Html {
17 changes: 9 additions & 8 deletions crates/entities/src/models/crawl_queue.rs
@@ -351,17 +351,18 @@ pub async fn enqueue_all(
                 }
 
                 // Should we crawl external links?
-                if !settings.crawl_external_links
-                    // Only allow crawls specified in our lenses
-                    && (!allow_list.is_empty() && !allow_list.is_match(&normalized))
-                {
-                    return None;
+                if settings.crawl_external_links {
+                    return Some(normalized);
                 }
 
-                Some(parsed.as_str().to_string())
-            } else {
-                None
+                // If external links are not allowed, only allow crawls specified
+                // in our lenses
+                if allow_list.is_empty() || allow_list.is_match(&normalized) {
+                    return Some(normalized);
+                }
             }
 
+            None
         })
         .collect();
 
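The rewritten check is now two early returns: a URL is always enqueued when `crawl_external_links` is enabled, and otherwise it must be matched by the lens allow list (or the list must be empty); both paths return the normalized URL rather than `parsed.as_str()`. A minimal sketch of that decision, assuming the allow list is a `regex::RegexSet` — the helper name and signature below are illustrative, not taken from `enqueue_all`:

```rust
use regex::RegexSet;

// Illustrative only: mirrors the early-return flow introduced in enqueue_all.
fn should_enqueue(crawl_external_links: bool, allow_list: &RegexSet, normalized: &str) -> Option<String> {
    // External links are always fair game when the user opts in.
    if crawl_external_links {
        return Some(normalized.to_string());
    }

    // Otherwise only enqueue URLs a lens matches (or everything, if no lens rules exist).
    if allow_list.is_empty() || allow_list.is_match(normalized) {
        return Some(normalized.to_string());
    }

    None
}
```
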
2 changes: 1 addition & 1 deletion crates/shared/src/config.rs
@@ -51,7 +51,7 @@ impl Lens
     }
 
     fn default_is_enabled() -> bool {
-        false
+        true
     }
 }
 
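`default_is_enabled` reads like a serde field-default helper, so flipping it from `false` to `true` means a lens definition that omits the flag now loads as enabled. A sketch of how such a default is typically wired up — the struct fields and attribute placement here are assumptions for illustration, not copied from `config.rs`:

```rust
use serde::Deserialize;

fn default_is_enabled() -> bool {
    true
}

// Assumed shape: `is_enabled` falls back to default_is_enabled() when the
// key is missing from the lens definition.
#[derive(Debug, Deserialize)]
pub struct Lens {
    pub name: String,
    #[serde(default = "default_is_enabled")]
    pub is_enabled: bool,
}
```
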
1 change: 0 additions & 1 deletion crates/shared/src/rpc.rs
@@ -36,5 +36,4 @@ pub trait Rpc
 
     #[rpc(name = "search_lenses")]
     fn search_lenses(&self, query: SearchLensesParam) -> BoxFuture<Result<SearchLensesResp>>;
-
 }
7 changes: 2 additions & 5 deletions crates/spyglass/src/api/route.rs
@@ -221,10 +221,7 @@ pub async fn search_lenses(
 }
 
 #[instrument(skip(state))]
-pub async fn delete_doc(
-    state: AppState,
-    id: String
-) -> Result<()> {
+pub async fn delete_doc(state: AppState, id: String) -> Result<()> {
     if let Ok(mut writer) = state.index.writer.lock() {
         if let Err(e) = Searcher::delete(&mut writer, &id) {
             log::error!("Unable to delete doc {} due to {}", id, e);
@@ -234,4 +231,4 @@ pub async fn delete_doc(
     }
 
     Ok(())
-}
+}
121 changes: 80 additions & 41 deletions crates/spyglass/src/crawler/mod.rs
@@ -51,24 +51,26 @@ impl CrawlResult {
     }
 }
 
-fn _normalize_href(url: &Url, href: &str) -> Option<String> {
+fn normalize_href(url: &str, href: &str) -> Option<String> {
     // Force HTTPS, crawler will fallback to HTTP if necessary.
-    if href.starts_with("//") {
-        // schema relative url
-        if let Ok(url) = Url::parse(&format!("{}:{}", "https", href)) {
-            return Some(url.to_string());
-        }
-    } else if href.starts_with("http://") || href.starts_with("https://") {
-        // Force HTTPS, crawler will fallback to HTTP if necessary.
-        if let Ok(url) = Url::parse(href) {
-            let mut url = url;
-            url.set_scheme("https").unwrap();
-            return Some(url.to_string());
-        }
-    } else {
-        // origin or directory relative url
-        if let Ok(url) = url.join(href) {
-            return Some(url.to_string());
+    if let Ok(url) = Url::parse(url) {
+        if href.starts_with("//") {
+            // schema relative url
+            if let Ok(url) = Url::parse(&format!("{}:{}", "https", href)) {
+                return Some(url.to_string());
+            }
+        } else if href.starts_with("http://") || href.starts_with("https://") {
+            // Force HTTPS, crawler will fallback to HTTP if necessary.
+            if let Ok(url) = Url::parse(href) {
+                let mut url = url;
+                url.set_scheme("https").unwrap();
+                return Some(url.to_string());
+            }
+        } else {
+            // origin or directory relative url
+            if let Ok(url) = url.join(href) {
+                return Some(url.to_string());
+            }
         }
     }
 
@@ -177,29 +179,21 @@ impl Crawler {
         let mut hasher = Sha256::new();
         hasher.update(&parse_result.content.as_bytes());
         let content_hash = Some(hex::encode(&hasher.finalize()[..]));
-
-        // Normalize links from scrape result. If the links start with "/" they
-        // should be appended to the current URL.
-        let normalized_links = parse_result
-            .links
-            .iter()
-            .filter_map(|link| _normalize_href(url, link))
-            .collect();
+        log::trace!("content hash: {:?}", content_hash);
 
         let canonical_url = match parse_result.canonical_url {
             Some(canonical) => determine_canonical(url, &canonical),
             None => url.to_string(),
         };
 
-        log::trace!("content hash: {:?}", content_hash);
         CrawlResult {
             content_hash,
             content: Some(parse_result.content),
             description: Some(parse_result.description),
             status: 200,
             title: parse_result.title,
             url: canonical_url,
-            links: normalized_links,
+            links: parse_result.links,
             raw: Some(raw_body.to_string()),
         }
     }
@@ -261,6 +255,15 @@ impl Crawler {
             }
         }
 
+        // Normalize links from scrape result. If the links start with "/" they
+        // should be appended to the current URL.
+        let normalized_links = result
+            .links
+            .iter()
+            .filter_map(|link| normalize_href(&result.url, link))
+            .collect();
+        result.links = normalized_links;
+
         log::trace!(
             "crawl result: {:?} - {:?}\n{:?}",
             result.title,
@@ -277,30 +280,29 @@ mod test {
 
 #[cfg(test)]
 mod test {
+    use entities::models::crawl_queue::CrawlType;
     use entities::models::{crawl_queue, resource_rule};
    use entities::sea_orm::{ActiveModelTrait, Set};
    use entities::test::setup_test_db;
 
-    use crate::crawler::{Crawler, _normalize_href, determine_canonical};
+    use crate::crawler::{determine_canonical, normalize_href, Crawler};
 
     use url::Url;
 
     #[tokio::test]
+    #[ignore]
     async fn test_crawl() {
         let crawler = Crawler::new();
         let url = Url::parse("https://oldschool.runescape.wiki").unwrap();
         let result = crawler.crawl(&url).await;
 
         assert_eq!(result.title, Some("Old School RuneScape Wiki".to_string()));
         assert_eq!(result.url, "https://oldschool.runescape.wiki/".to_string());
-
-        // All links should start w/ http
-        for link in result.links {
-            assert!(link.starts_with("https://"))
-        }
+        assert!(result.links.len() > 0);
     }
 
     #[tokio::test]
+    #[ignore]
     async fn test_fetch() {
         let crawler = Crawler::new();
 
@@ -319,6 +321,43 @@ mod test {
         let result = crawl_result.unwrap();
         assert_eq!(result.title, Some("Old School RuneScape Wiki".to_string()));
         assert_eq!(result.url, "https://oldschool.runescape.wiki/".to_string());
+
+        let links: Vec<String> = result.links.into_iter().collect();
+        assert!(links[0].starts_with("https://oldschool.runescape.wiki"));
     }
 
+    #[tokio::test]
+    #[ignore]
+    async fn test_fetch_bootstrap() {
+        let crawler = Crawler::new();
+
+        let db = setup_test_db().await;
+        let url = Url::parse("https://www.ign.com/wikis/luigis-mansion").unwrap();
+        let query = crawl_queue::ActiveModel {
+            domain: Set(url.host_str().unwrap().to_owned()),
+            url: Set(url.to_string()),
+            crawl_type: Set(CrawlType::Bootstrap),
+            ..Default::default()
+        };
+        let model = query.insert(&db).await.unwrap();
+
+        let crawl_result = crawler.fetch_by_job(&db, model.id).await.unwrap();
+        assert!(crawl_result.is_some());
+
+        let result = crawl_result.unwrap();
+        assert_eq!(
+            result.title,
+            Some("Luigi's Mansion Wiki Guide - IGN".to_string())
+        );
+        assert_eq!(
+            result.url,
+            "https://www.ign.com/wikis/luigis-mansion/".to_string()
+        );
+
+        let links: Vec<String> = result.links.into_iter().collect();
+        for link in links {
+            assert!(!link.starts_with("https://web.archive.org"));
+        }
+    }
+
     #[tokio::test]
@@ -353,31 +392,31 @@
     }
 
     #[test]
-    fn test_normalize_href() {
-        let url = Url::parse("https://example.com").unwrap();
+    fn testnormalize_href() {
+        let url = "https://example.com";
 
         assert_eq!(
-            _normalize_href(&url, "http://foo.com"),
+            normalize_href(&url, "http://foo.com"),
             Some("https://foo.com/".into())
         );
         assert_eq!(
-            _normalize_href(&url, "https://foo.com"),
+            normalize_href(&url, "https://foo.com"),
             Some("https://foo.com/".into())
         );
         assert_eq!(
-            _normalize_href(&url, "//foo.com"),
+            normalize_href(&url, "//foo.com"),
             Some("https://foo.com/".into())
         );
         assert_eq!(
-            _normalize_href(&url, "/foo.html"),
+            normalize_href(&url, "/foo.html"),
             Some("https://example.com/foo.html".into())
         );
         assert_eq!(
-            _normalize_href(&url, "/foo"),
+            normalize_href(&url, "/foo"),
             Some("https://example.com/foo".into())
         );
         assert_eq!(
-            _normalize_href(&url, "foo.html"),
+            normalize_href(&url, "foo.html"),
             Some("https://example.com/foo.html".into())
         );
     }
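Link normalization now runs on the assembled result (`result.links` against `result.url`) rather than on the raw parse output, and the helper takes that URL as a `&str`, so relative hrefs resolve against the canonical URL recorded in the crawl result. A small, self-contained illustration of that resolution behavior using `url::Url::join` — the example URLs are made up:

```rust
use url::Url;

fn main() {
    // Suppose the queued URL redirected to a canonical page under /docs/.
    let fetched = Url::parse("https://example.com/docs/index.html").unwrap();

    // A directory-relative href resolves against the fetched page...
    assert_eq!(
        fetched.join("guide.html").unwrap().as_str(),
        "https://example.com/docs/guide.html"
    );

    // ...while a root-relative href resolves against the origin.
    assert_eq!(
        fetched.join("/about").unwrap().as_str(),
        "https://example.com/about"
    );
}
```
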
2 changes: 1 addition & 1 deletion crates/spyglass/src/crawler/robots.rs
@@ -107,7 +107,7 @@ pub async fn check_resource_rules(
         .await?;
 
     if rules.is_empty() {
-        log::info!("No rules found for this domain, fetching robot.txt");
+        log::info!("No rules found for <{}>, fetching robot.txt", domain);
 
         let robots_url = format!("https://{}/robots.txt", domain);
         let res = client.get(robots_url).send().await;
2 changes: 2 additions & 0 deletions crates/spyglass/src/task.rs
@@ -120,11 +120,13 @@ async fn _handle_fetch(state: AppState, crawler: Crawler, task: CrawlTask) {
 
     // Add all valid, non-duplicate, non-indexed links found to crawl queue
     let to_enqueue: Vec<String> = crawl_result.links.into_iter().collect();
+
     let lenses: Vec<Lens> = state
         .lenses
         .iter()
         .map(|entry| entry.value().clone())
         .collect();
+
     if let Err(err) = crawl_queue::enqueue_all(
         &state.db,
         &to_enqueue,
7 changes: 4 additions & 3 deletions crates/tauri/src/cmd.rs
@@ -106,12 +106,13 @@ pub async fn delete_doc<'r>(
     let mut rpc = rpc.lock().await;
     match rpc
         .client
-        .call_method::<(String,), ()>("delete_doc", "", (id.into(),)).await {
-
+        .call_method::<(String,), ()>("delete_doc", "", (id.into(),))
+        .await
+    {
         Ok(_) => {
             let _ = window.emit("refresh_results", true);
             Ok(())
-        },
+        }
         Err(err) => {
             log::error!("Error sending RPC: {}", err);
             rpc.reconnect().await;
2 changes: 1 addition & 1 deletion crates/tauri/tauri.conf.json
@@ -1,7 +1,7 @@
 {
   "package": {
     "productName": "Spyglass",
-    "version": "22.5.27"
+    "version": "22.5.28"
   },
   "build": {
     "distDir": "../client/dist",
