feat: Support setup headless and HTTP requests. Introduce SPA Engine. Improve typings. Update docs.
obetomuniz committed Mar 24, 2023
1 parent 1f34382 commit 4eb708c
Showing 22 changed files with 251 additions and 173 deletions.
15 changes: 6 additions & 9 deletions docs/HTML.md
@@ -1,14 +1,14 @@
# scrapeHtml

-The `scrapeHtml` function takes a page URL and an object containing `selectors` as input and returns an object with data extracted from the HTML string. It uses the `cheerio` and `utils/fetchData` functions to load the HTML and extract the required information using the provided `selectors`.
+The `scrapeHtml` function takes a page URL and an object containing `selectors` as input and returns an object with data extracted from the HTML string. It uses the `jsdom` and `utils/http` functions to load the HTML and extract the required information using the provided `selectors`.

## Selectors

The `selectors` should be an object with keys that correspond to the data that you want to extract, and values that are the corresponding CSS selectors and HTML tag attributes.

-## SPAs
+## Request

-If the `spa` option is enabled, scrapeHtml will use `puppeteer` to load the HTML content dynamically by opening a headless browser and navigating to the URL specified in the data parameter. This is useful for extracting data from single-page applications (SPAs) that load content dynamically.
+The `request` configuration should be an object that corresponds to the `AxiosRequestConfig` type from `axios`. Enjoy this flexibility.

## Example

@@ -21,12 +21,9 @@ const htmlData = await scrapeHtml(htmlUrl, {
    title: { selector: "title" },
    featured: { selector: "img", attribute: "src" },
  },
-  spa: {
-    enable: true,
-    browserConfig: {
-      headless: true,
-      args: ["--no-sandbox"],
-      executablePath: "/opt/homebrew/bin/chromium",
+  request: {
+    headers: {
+      "Cache-Control": "no-cache",
    },
  },
})
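
Since `request` is a plain `AxiosRequestConfig`, other standard axios options should pass through the same way. A hypothetical sketch (the URL-less values below are invented for illustration, not part of this commit):

```javascript
const htmlData = await scrapeHtml(htmlUrl, {
  selectors: {
    title: { selector: "title" },
  },
  request: {
    timeout: 5000, // standard axios option: abort slow requests
    headers: { "User-Agent": "tatooine-example" }, // hypothetical UA string
  },
})
```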
4 changes: 4 additions & 0 deletions docs/JSON.md
@@ -6,6 +6,10 @@ The `scrapeApi` function takes a JSON resource and an object containing `selectors`

The `selectors` should be an object with keys that correspond to the data that you want to extract in the JSON returned.

+## Request

+The `request` configuration should be an object that corresponds to the `AxiosRequestConfig` type from `axios`. Enjoy this flexibility.

## Example

```javascript
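// Sketch only: the endpoint, field paths, and values here are hypothetical,
// since the diff view truncates the original example. Selector paths follow
// lodash.get syntax, and `request` accepts any AxiosRequestConfig. Note that
// the engines index (lib/engines/index.ts) exports this engine as `scrapeJson`.
import { scrapeJson } from "tatooine"

const jsonUrl = "https://api.example.com/users/1"
const jsonData = await scrapeJson(jsonUrl, {
  selectors: {
    name: { selector: "profile.name" },
    avatar: { selector: "profile.avatar.url" },
  },
  request: {
    timeout: 5000,
  },
})

// Output (illustrative):
// {
//   name: 'Ada Lovelace',
//   avatar: 'https://api.example.com/avatars/1.png',
// }
```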
36 changes: 36 additions & 0 deletions docs/SPA.md
@@ -0,0 +1,36 @@
# scrapeSpa

The `scrapeSpa` function takes a page URL and an object containing `selectors` as input and returns an object with data extracted from the HTML string. It uses `puppeteer` and `utils/spa` to load the HTML and extract the required information from single-page applications (SPAs) that load content dynamically.

## Selectors

The `selectors` should be an object with keys that correspond to the data that you want to extract, and values that are the corresponding CSS selectors and HTML tag attributes.

## Request

The `request` configuration should be an object that corresponds to the `LaunchOptions` type from `puppeteer`. Enjoy this flexibility.

## Example

```javascript
import { scrapeSpa } from "tatooine"

const htmlUrl = "https://example.com"
const htmlData = await scrapeSpa(htmlUrl, {
  selectors: {
    title: { selector: "title" },
    featured: { selector: "img", attribute: "src" },
  },
  request: {
    headless: true,
    args: ["--no-sandbox"],
    executablePath: "/opt/homebrew/bin/chromium",
  },
})

// Output:
// {
//   title: 'My Title',
//   featured: 'https://example.com/featured.png',
// }
```
6 changes: 5 additions & 1 deletion docs/XML.md
@@ -1,11 +1,15 @@
# scrapeXml

-The `scrapeXml` function takes an XML URL resource and an object containing `selectors` as input and returns an object with data extracted from the XML. It uses the `xmldom`, `xpath`, and `utils/fetchData` functions to make the HTTP request and scrape the XML using the provided `selectors`.
+The `scrapeXml` function takes an XML URL resource and an object containing `selectors` as input and returns an object with data extracted from the XML. It uses the `xmldom`, `xpath`, and `utils/http` functions to make the HTTP request and scrape the XML using the provided `selectors`.

## Selectors

The `selectors` should be an object with keys that correspond to the data that you want to extract, and values that are the corresponding XPath expressions.

+## Request

+The `request` configuration should be an object that corresponds to the `AxiosRequestConfig` type from `axios`. Enjoy this flexibility.

## Example

```javascript
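// Sketch only: the feed URL and XPath expressions here are hypothetical,
// since the diff view truncates the original example. Selectors are XPath
// expressions, and `request` accepts any AxiosRequestConfig.
import { scrapeXml } from "tatooine"

const xmlUrl = "https://example.com/feed.xml"
const xmlData = await scrapeXml(xmlUrl, {
  selectors: {
    titles: { selector: "//item/title" },
  },
  request: {
    headers: { Accept: "application/xml" },
  },
})

// Output (illustrative): when an expression matches several nodes,
// the value becomes an array.
// {
//   titles: ['First post', 'Second post'],
// }
```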
21 changes: 1 addition & 20 deletions fixtures/engines/html.ts
@@ -1,24 +1,5 @@
import { EngineType } from "../../lib/types"

-const SpaScrapingFixture = {
-  url: "https://davidwalsh.name/demo/lazyload-2.0.php",
-  engine: EngineType.Html,
-  options: {
-    selectors: {
-      title: {
-        selector: ".demo-wrapper table tr .image img",
-        attribute: "src",
-      },
-    },
-    spa: {
-      enable: true,
-      browserConfig: {
-        executablePath: "/opt/homebrew/bin/chromium",
-      },
-    },
-  },
-}

const HtmlScrapingFixture = {
url: "https://github.com/trending/javascript",
engine: EngineType.Html,
@@ -29,4 +10,4 @@ const HtmlScrapingFixture = {

// TODO: Create Fixtures with Invalid Selectors

-export { SpaScrapingFixture, HtmlScrapingFixture }
+export { HtmlScrapingFixture }
21 changes: 21 additions & 0 deletions fixtures/engines/spa.ts
@@ -0,0 +1,21 @@
import { EngineType } from "../../lib/types"

const SpaScrapingFixture = {
  url: "https://davidwalsh.name/demo/lazyload-2.0.php",
  engine: EngineType.Spa,
  options: {
    selectors: {
      title: {
        selector: ".demo-wrapper table tr .image img",
        attribute: "src",
      },
    },
    request: {
      executablePath: "/opt/homebrew/bin/chromium",
    },
  },
}

// TODO: Create Fixtures with Invalid Selectors

export { SpaScrapingFixture }
6 changes: 6 additions & 0 deletions lib/constants.ts
@@ -0,0 +1,6 @@
export const DEFAULT_HTTP_REQUESTS_OPTIONS = {
  headers: {
    "Cache-Control": "no-cache",
    "Accept-Encoding": "gzip, deflate",
  },
}
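
These defaults presumably get merged with the caller's `request` object inside `utils/request/http`, which this commit does not show. A minimal sketch, assuming a shallow spread where caller options win:

```typescript
import axios, { AxiosRequestConfig } from "axios"
import { DEFAULT_HTTP_REQUESTS_OPTIONS } from "../../constants"

// Hypothetical lib/utils/request/http.ts: caller config overrides defaults.
const fetchHttp = async (url: string, request?: AxiosRequestConfig) => {
  const response = await axios.get(url, {
    ...DEFAULT_HTTP_REQUESTS_OPTIONS,
    ...request,
  })
  return response.data
}

export default fetchHttp
```

Note that a shallow spread replaces the entire default `headers` object whenever the caller supplies one; a deep merge would be needed to preserve individual default headers.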
2 changes: 2 additions & 0 deletions lib/engines/default/default.ts
@@ -1,10 +1,12 @@
import { TScrapedDataPromise, IScrapeOptions } from "../../types"
import scrapeHtml from "../../engines/html/html"
+import scrapeSpa from "../../engines/spa/spa"
import scrapeJson from "../../engines/json/json"
import scrapeXml from "../../engines/xml/xml"

const SCRAPE_TYPES = {
  html: scrapeHtml,
+  spa: scrapeSpa,
  json: scrapeJson,
  xml: scrapeXml,
}
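
The dispatch that consumes this map sits below the truncated hunk. One plausible shape, assuming a table-driven lookup keyed by engine name (the real signature is not shown in this diff):

```typescript
// Sketch only: pick the engine implementation by key and delegate to it.
const scrape = (
  url: string,
  options: IScrapeOptions,
  engine: keyof typeof SCRAPE_TYPES = "html"
): TScrapedDataPromise => {
  const engineFn = SCRAPE_TYPES[engine] as (
    url: string,
    options: IScrapeOptions
  ) => TScrapedDataPromise
  return engineFn(url, options)
}
```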
61 changes: 10 additions & 51 deletions lib/engines/html/html.ts
@@ -1,67 +1,26 @@
+import { AxiosRequestConfig } from "axios"
import { JSDOM } from "jsdom"
-import {
-  TScrapedData,
-  TScrapedDataPromise,
-  IScrapeHtmlOptions,
-  TSelectors,
-} from "../../types"
+import { TScrapedDataPromise, IScrapeHtmlOptions } from "../../types"
import fetchHttp from "../../utils/request/http"
-import fetchSpa from "../../utils/request/spa"
-
-const extractData = (
-  document: Document,
-  selectors: TSelectors
-): TScrapedData => {
-  const data: TScrapedData = {}
-
-  for (const [key, value] of Object.entries(selectors)) {
-    const elements = document.querySelectorAll(value.selector)
-    if (!elements.length) {
-      data[key] = ""
-    } else if (elements.length === 1) {
-      if (value?.attribute) {
-        data[key] = elements[0].getAttribute(value?.attribute) || ""
-      } else {
-        data[key] = elements[0].textContent?.trim() || ""
-      }
-    } else if (elements.length > 1) {
-      const values = Array.from(elements, (element) => {
-        if (value?.attribute) {
-          return element.getAttribute(value?.attribute) || ""
-        }
-        return element.textContent?.trim() || ""
-      })
-      data[key] = values
-    }
-  }
-
-  return data
-}
+import extractData from "../../utils/extract/html"

const processData = async (
  url: string,
  options: IScrapeHtmlOptions
): TScrapedDataPromise => {
-  const { selectors, spa } = options
+  const { selectors, request } = options

-  if (spa?.enable) {
-    const htmlContent = await fetchSpa(url, spa.browserConfig)
-    const dom = new JSDOM(htmlContent)
-    const document = dom.window.document
-    return extractData(document, selectors)
-  } else {
-    const html = await fetchHttp(url)
-    const dom = new JSDOM(html)
-    const document = dom.window.document
-    return extractData(document, selectors)
-  }
+  const html = await fetchHttp(url, request as AxiosRequestConfig)
+  const dom = new JSDOM(html)
+  const document = dom.window.document
+  return extractData(document, selectors)
}

const scrapeHtml = async (
  url: string,
-  { selectors, spa = { enable: false, browserConfig: {} } }: IScrapeHtmlOptions
+  { selectors, request }: IScrapeHtmlOptions
): TScrapedDataPromise => {
-  const data = await processData(url, { selectors, spa })
+  const data = await processData(url, { selectors, request })
  return data
}
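
Per the new import, the deleted `extractData` moves to `utils/extract/html` rather than disappearing; that module is not part of the visible diff, but presumably it keeps the removed logic, roughly:

```typescript
import { TScrapedData, TSelectors } from "../../types"

// Presumed lib/utils/extract/html.ts: the selector walk removed above,
// now shared by the HTML and SPA engines.
const extractData = (
  document: Document,
  selectors: TSelectors
): TScrapedData => {
  const data: TScrapedData = {}
  for (const [key, value] of Object.entries(selectors)) {
    const elements = document.querySelectorAll(value.selector)
    const read = (element: Element) =>
      value?.attribute
        ? element.getAttribute(value.attribute) || ""
        : element.textContent?.trim() || ""
    if (!elements.length) {
      data[key] = "" // no match: empty string rather than undefined
    } else if (elements.length === 1) {
      data[key] = read(elements[0])
    } else {
      data[key] = Array.from(elements, read) // many matches: array of values
    }
  }
  return data
}

export default extractData
```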

1 change: 1 addition & 0 deletions lib/engines/index.ts
@@ -1,4 +1,5 @@
export { default as scrape } from "./default"
export { default as scrapeHtml } from "./html"
+export { default as scrapeSpa } from "./spa"
export { default as scrapeXml } from "./xml"
export { default as scrapeJson } from "./json"
30 changes: 4 additions & 26 deletions lib/engines/json/json.ts
@@ -1,33 +1,11 @@
-import get from "lodash.get"
+import { AxiosRequestConfig } from "axios"
import {
-  TScrapedData,
  TScrapedDataPromise,
  IScrapeXmlOptions,
-  TSelectors,
} from "../../types"
import fetchHttp from "../../utils/request/http"

-const extractData = (j: any, selectors: TSelectors): TScrapedData => {
-  const data: TScrapedData = {}
-
-  if (Array.isArray(j)) {
-    return j.map((obj) => {
-      const data: TScrapedData = {}
-
-      for (const [key, value] of Object.entries(selectors)) {
-        data[key] = get(obj, value.selector, "")
-      }
-
-      return data
-    })
-  }
-
-  for (const [key, value] of Object.entries(selectors)) {
-    data[key] = get(j, value.selector, "")
-  }
-
-  return data
-}
+import extractData from "../../utils/extract/json"

const processData = (
  j: string,
@@ -38,9 +16,9 @@ const processData = (

const scrapeJson = async (
  url: string,
-  { selectors }: IScrapeXmlOptions
+  { selectors, request }: IScrapeXmlOptions
): TScrapedDataPromise => {
-  const j = await fetchHttp(url)
+  const j = await fetchHttp(url, request as AxiosRequestConfig)
  return processData(j, { selectors })
}
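
The extractor this engine now imports from `utils/extract/json` resolved values with `lodash.get` (see the removed block above), so selector strings support nested and indexed paths with an empty-string fallback:

```typescript
import get from "lodash.get"

// lodash.get walks dotted and bracketed paths, returning the supplied
// default when the path is missing; that is how absent fields become "".
get({ a: { b: [{ c: 3 }] } }, "a.b[0].c", "") // => 3
get({ a: {} }, "a.x.y", "") // => ""
```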

1 change: 1 addition & 0 deletions lib/engines/spa/index.ts
@@ -0,0 +1 @@
export { default } from "./spa"
27 changes: 27 additions & 0 deletions lib/engines/spa/spa.ts
@@ -0,0 +1,27 @@
import { LaunchOptions } from "puppeteer"
import { JSDOM } from "jsdom"
import { TScrapedDataPromise, IScrapeSpaOptions } from "../../types"
import fetchSpa from "../../utils/request/spa"
import extractData from "../../utils/extract/html"

const processData = async (
  url: string,
  options: IScrapeSpaOptions
): TScrapedDataPromise => {
  const { selectors, request } = options

  const htmlContent = await fetchSpa(url, request as LaunchOptions)
  const dom = new JSDOM(htmlContent)
  const document = dom.window.document
  return extractData(document, selectors)
}

const scrapeSpa = async (
  url: string,
  { selectors, request }: IScrapeSpaOptions
): TScrapedDataPromise => {
  const data = await processData(url, { selectors, request })
  return data
}

export default scrapeSpa
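
`fetchSpa` (from `utils/request/spa`) is not part of the visible diff. A minimal sketch of what such a helper typically looks like, assuming it launches puppeteer with the caller's `LaunchOptions` and returns the rendered HTML:

```typescript
import puppeteer, { LaunchOptions } from "puppeteer"

// Presumed lib/utils/request/spa.ts: render the page in a headless browser
// so client-side content exists before extraction.
const fetchSpa = async (
  url: string,
  options?: LaunchOptions
): Promise<string> => {
  const browser = await puppeteer.launch(options)
  try {
    const page = await browser.newPage()
    await page.goto(url, { waitUntil: "networkidle0" })
    return await page.content()
  } finally {
    await browser.close()
  }
}

export default fetchSpa
```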
40 changes: 5 additions & 35 deletions lib/engines/xml/xml.ts
@@ -1,38 +1,8 @@
+import { AxiosRequestConfig } from "axios"
import { JSDOM } from "jsdom"
import xpath, { XPathResult } from "xpath-ts"
-import {
-  TScrapedData,
-  TScrapedDataPromise,
-  IScrapeXmlOptions,
-  TSelectors,
-} from "../../types"
+import { TScrapedDataPromise, IScrapeXmlOptions } from "../../types"
import fetchHttp from "../../utils/request/http"

-const extractData = (
-  document: Document,
-  selectors: TSelectors
-): TScrapedData => {
-  const data: TScrapedData = {}
-
-  for (const [key, value] of Object.entries(selectors)) {
-    const nodes: XPathResult = xpath.evaluate(
-      value.selector,
-      document,
-      null,
-      XPathResult.ANY_TYPE,
-      null
-    )
-    const nodeValues: string[] = []
-    let node = nodes.iterateNext()
-    while (node) {
-      nodeValues.push(node.textContent || "")
-      node = nodes.iterateNext()
-    }
-    data[key] = nodeValues.length === 1 ? nodeValues[0] : nodeValues
-  }
-
-  return data
-}
+import extractData from "../../utils/extract/xml"

const processData = async (
  xml: string,
@@ -45,9 +15,9 @@ const processData = async (

const scrapeXml = async (
  url: string,
-  { selectors }: IScrapeXmlOptions
+  { selectors, request }: IScrapeXmlOptions
): TScrapedDataPromise => {
-  const xml = await fetchHttp(url)
+  const xml = await fetchHttp(url, request as AxiosRequestConfig)
  const data = await processData(xml, { selectors })

  return data