canonicization test passed

hanover-computing · Aug 30, 2021 · 41e6002 · 41e6002
1 parent dae4662
commit 41e6002
Show file tree

Hide file tree

Showing 2 changed files with 221 additions and 37 deletions.
diff --git a/utils/canonicize.js b/utils/canonicize.js
@@ -37,7 +37,7 @@ export default async function canonicizeHook(res) {
       const relStrLower = relStr.toLowerCase()
       if (relStrLower.includes('rel') && relStrLower.includes('canonical')) {
         // <https://example.com>, https://example.com, etc.
-        const url = trim(linkStr.trim(), ['<', '>'])
+        const url = trim(linkStr.trim(), ['<', '>', ' '])
         matches.push(url)
       }
     })
@@ -58,49 +58,40 @@ export default async function canonicizeHook(res) {
   // The only reason we want canonical is to make our job with normalization easier;
   // So we need to make sure the canonical link IS for the url we're trying to normalize!
 
+  const { hostname: domain } = new URL(normalizedUrl)
   const { domain: baseDomain } = parseTld(normalizedUrl)
 
-  const candidates = matches
-    .map(link => {
-      // Before processing, we need to make sure all the URLs are in absolute form
-      if (link.startsWith('//')) {
-        return `https:${link}`
-      } else if (link.startsWith('/')) {
-        return `${baseDomain}${link}`
-      } else {
-        return link
-      }
-    })
-    .filter(link => {
-      // First, ensure that every match is a valid URL w/ a matching domain
-      // In this case, we're only matching the "top-level" domain -
-      // e.g. subdomain.(domain.com) - as a lot of sites host their shit on amp.(site.com)
-      // so we want to include references to www.site.com (actually *prefer* those)
-      try {
-        return parseTld(link).domain === baseDomain
-      } catch (err) {
-        return false
-      }
-    })
-    .filter(link => {
-      // Then, ensure that links aren't AMP'd
-      return !urlIsAmp(link)
-    })
-
   let result = normalizedUrl
   let minDist = Number.POSITIVE_INFINITY
 
-  for (const candidate of candidates) {
+  for (const match of matches) {
+    let link = match
+
+    // turn relative to absolute URL
+    if (match.startsWith('/')) link = `${domain}${match}`
+
+    // Skip invalid links
     try {
-      const normalizedCandidate = await normalize(candidate)
+      link = await normalize(link)
 
-      // Then, sort by similarity to the normalized URL of the page we ended up in
-      const dist = leven(normalizedUrl, normalizedCandidate)
-      if (dist < minDist) {
-        minDist = dist
-        result = normalizedCandidate
-      }
-    } catch (err) {} // pass, the link is invalid
+      // Ensure that every match is a valid URL w/ a matching domain
+      // In this case, we're only matching the "top-level" domain -
+      // e.g. subdomain.(domain.com) - as a lot of sites host their content on amp.(site.com)
+      // so we want to include references to www.site.com (actually *prefer* those)
+      if (parseTld(link).domain !== baseDomain) continue
+
+      // Then, ensure that links aren't AMP'd
+      if (urlIsAmp(link)) continue
+    } catch (err) {
+      continue
+    }
+
+    // Then, sort by similarity to the normalized URL of the page we ended up in
+    const dist = leven(normalizedUrl, link)
+    if (dist < minDist) {
+      minDist = dist
+      result = link
+    }
   }
 
   res.url = result

diff --git a/utils/canonicize.test.js b/utils/canonicize.test.js
@@ -0,0 +1,193 @@
+import { expect, describe, it } from '@jest/globals'
+import got from 'got'
+import nock from 'nock'
+import hook from './canonicize'
+
+nock.disableNetConnect()
+
+describe('extracting canonical links', () => {
+  const httpClient = got.extend({
+    hooks: { afterResponse: [hook] },
+    context: {
+      normalize: async url =>
+        url.startsWith('http') || url.startsWith('https')
+          ? url
+          : `http://${url}`
+    }
+  })
+
+  it('picks up rel=canonical in HTML body', async () => {
+    nock('http://asdf.com')
+      .get('/somepage')
+      .reply(
+        200,
+        `<!DOCTYPE html>
+        <html lang="en">
+            <head>
+                <link rel="canonical" href="http://asdf.com/canonical" />
+                <title>Simple HTML document</title>
+            </head>
+            <body>
+                <h1>Hello World!</h1>
+            </body>
+        </html>`
+      )
+
+    const { url } = await httpClient.get('http://asdf.com/somepage')
+    expect(url).toBe('http://asdf.com/canonical')
+  })
+
+  it('picks up rel=canonical in HTTP header', async () => {
+    nock('http://asdf.com')
+      .get('/somepage')
+      .reply(
+        200,
+        `<!DOCTYPE html>
+        <html lang="en">
+            <head>
+                <title>Simple HTML document</title>
+            </head>
+            <body>
+                <h1>Hello World!</h1>
+            </body>
+        </html>`,
+        {
+          link: '< http://asdf.com/canonical>; rel="canonical"'
+        }
+      )
+
+    const { url } = await httpClient.get('http://asdf.com/somepage')
+    expect(url).toBe('http://asdf.com/canonical')
+  })
+
+  it("picks up de-AMP'd links", async () => {
+    nock('http://asdf.com')
+      .get('/somepage')
+      .reply(
+        200,
+        `<!DOCTYPE html>
+        <html lang="en">
+            <head>
+                <title>Simple HTML document</title>
+                <a class="amp-canurl" href="http://asdf.com/canonical" />
+            </head>
+            <body>
+                <h1>Hello World!</h1>
+            </body>
+        </html>`
+      )
+
+    const { url } = await httpClient.get('http://asdf.com/somepage')
+    expect(url).toBe('http://asdf.com/canonical')
+  })
+
+  it('picks up opengraph links', async () => {
+    nock('http://asdf.com')
+      .get('/somepage')
+      .reply(
+        200,
+        `<!DOCTYPE html>
+        <html lang="en">
+            <head>
+                <title>Simple HTML document</title>
+                <meta property="og:url" content="http://asdf.com/canonical" />
+            </head>
+            <body>
+                <h1>Hello World!</h1>
+            </body>
+        </html>`
+      )
+
+    const { url } = await httpClient.get('http://asdf.com/somepage')
+    expect(url).toBe('http://asdf.com/canonical')
+  })
+
+  it('puts all relative links in absolute form', async () => {
+    nock('http://sub.asdf.com')
+      .get('/somepage')
+      .reply(
+        200,
+        `<!DOCTYPE html>
+        <html lang="en">
+            <head>
+                <link rel="canonical" href="/canonical" />
+                <title>Simple HTML document</title>
+            </head>
+            <body>
+                <h1>Hello World!</h1>
+            </body>
+        </html>`
+      )
+
+    const { url } = await httpClient.get('http://sub.asdf.com/somepage')
+    expect(url).toBe('http://sub.asdf.com/canonical')
+  })
+
+  it('ignores invalid matches (domain)', async () => {
+    nock('http://sub.asdf.com')
+      .get('/somepage')
+      .reply(
+        200,
+        `<!DOCTYPE html>
+        <html lang="en">
+            <head>
+                <link rel="canonical" href="http://sub.com/canonical" />
+                <title>Simple HTML document</title>
+            </head>
+            <body>
+                <h1>Hello World!</h1>
+            </body>
+        </html>`
+      )
+
+    const { url } = await httpClient.get('http://sub.asdf.com/somepage')
+    expect(url).toBe('http://sub.asdf.com/somepage')
+  })
+
+  it('ignores invalid matches (amp)', async () => {
+    nock('http://asdf.com')
+      .get('/somepage')
+      .reply(
+        200,
+        `<!DOCTYPE html>
+        <html lang="en">
+            <head>
+                <link rel="canonical" href="http://amp.asdf.com/canonical" />
+                <title>Simple HTML document</title>
+            </head>
+            <body>
+                <h1>Hello World!</h1>
+            </body>
+        </html>`
+      )
+
+    const { url } = await httpClient.get('http://asdf.com/somepage')
+    expect(url).toBe('http://asdf.com/somepage')
+  })
+
+  it('returns the most "relevant" link', async () => {
+    nock('http://amp.asdf.com')
+      .get('/somepage')
+      .reply(
+        200,
+        `<!DOCTYPE html>
+        <html lang="en">
+            <head>
+                <link rel="canonical" href="http://asdf.com/some" />
+                <title>Simple HTML document</title>
+                <a class="amp-canurl" href="http://asdf.com" />
+                <meta property="og:url" content="http://asdf.com/somepage" />
+            </head>
+            <body>
+                <h1>Hello World!</h1>
+            </body>
+        </html>`,
+        {
+          link: '< http://asdf.com/canonical>; rel="canonical"'
+        }
+      )
+
+    const { url } = await httpClient.get('http://amp.asdf.com/somepage')
+    expect(url).toBe('http://asdf.com/somepage')
+  })
+})