Skip to content

Commit

Permalink
canonicization test passed
Browse files Browse the repository at this point in the history
  • Loading branch information
JaneJeon committed Aug 30, 2021
1 parent dae4662 commit 41e6002
Show file tree
Hide file tree
Showing 2 changed files with 221 additions and 37 deletions.
65 changes: 28 additions & 37 deletions utils/canonicize.js
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ export default async function canonicizeHook(res) {
const relStrLower = relStr.toLowerCase()
if (relStrLower.includes('rel') && relStrLower.includes('canonical')) {
// <https://example.com>, https://example.com, etc.
const url = trim(linkStr.trim(), ['<', '>'])
const url = trim(linkStr.trim(), ['<', '>', ' '])
matches.push(url)
}
})
Expand All @@ -58,49 +58,40 @@ export default async function canonicizeHook(res) {
// The only reason we want canonical is to make our job with normalization easier;
// So we need to make sure the canonical link IS for the url we're trying to normalize!

const { hostname: domain } = new URL(normalizedUrl)
const { domain: baseDomain } = parseTld(normalizedUrl)

const candidates = matches
.map(link => {
// Before processing, we need to make sure all the URLs are in absolute form
if (link.startsWith('//')) {
return `https:${link}`
} else if (link.startsWith('/')) {
return `${baseDomain}${link}`
} else {
return link
}
})
.filter(link => {
// First, ensure that every match is a valid URL w/ a matching domain
// In this case, we're only matching the "top-level" domain -
// e.g. subdomain.(domain.com) - as a lot of sites host their shit on amp.(site.com)
// so we want to include references to www.site.com (actually *prefer* those)
try {
return parseTld(link).domain === baseDomain
} catch (err) {
return false
}
})
.filter(link => {
// Then, ensure that links aren't AMP'd
return !urlIsAmp(link)
})

let result = normalizedUrl
let minDist = Number.POSITIVE_INFINITY

for (const candidate of candidates) {
for (const match of matches) {
let link = match

// turn relative to absolute URL
if (match.startsWith('/')) link = `${domain}${match}`

// Skip invalid links
try {
const normalizedCandidate = await normalize(candidate)
link = await normalize(link)

// Then, sort by similarity to the normalized URL of the page we ended up in
const dist = leven(normalizedUrl, normalizedCandidate)
if (dist < minDist) {
minDist = dist
result = normalizedCandidate
}
} catch (err) {} // pass, the link is invalid
// Ensure that every match is a valid URL w/ a matching domain
// In this case, we're only matching the "top-level" domain -
// e.g. subdomain.(domain.com) - as a lot of sites host their shit on amp.(site.com)
// so we want to include references to www.site.com (actually *prefer* those)
if (parseTld(link).domain !== baseDomain) continue

// Then, ensure that links aren't AMP'd
if (urlIsAmp(link)) continue
} catch (err) {
continue
}

// Then, sort by similarity to the normalized URL of the page we ended up in
const dist = leven(normalizedUrl, link)
if (dist < minDist) {
minDist = dist
result = link
}
}

res.url = result
Expand Down
193 changes: 193 additions & 0 deletions utils/canonicize.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
import { expect, describe, it } from '@jest/globals'
import got from 'got'
import nock from 'nock'
import hook from './canonicize'

// Refuse any real network connection: every request in this file must be
// answered by a nock interceptor declared inside the test that issues it.
nock.disableNetConnect()

describe('extracting canonical links', () => {
  // got client under test. The canonicize hook is registered as an
  // afterResponse hook, so each test only inspects the `url` of the response
  // that comes back from `httpClient.get`.
  // NOTE(review): presumably the hook reads `normalize` from got's request
  // context - the hook's header is not visible from this file; confirm.
  const httpClient = got.extend({
    hooks: { afterResponse: [hook] },
    context: {
      // Minimal stand-in for the real normalizer: leave URLs that already
      // carry an http(s) scheme untouched, prefix everything else with
      // `http://`. The check looks for the full `http://` / `https://` scheme
      // rather than a bare `http` prefix, so a scheme-less host such as
      // `httpbin.org/x` still gets a scheme.
      normalize: async url =>
        /^https?:\/\//i.test(url) ? url : `http://${url}`
    }
  })

  // Candidate source: <link rel="canonical"> in the document head.
  it('picks up rel=canonical in HTML body', async () => {
    nock('http://asdf.com')
      .get('/somepage')
      .reply(
        200,
        `<!DOCTYPE html>
<html lang="en">
<head>
<link rel="canonical" href="http://asdf.com/canonical" />
<title>Simple HTML document</title>
</head>
<body>
<h1>Hello World!</h1>
</body>
</html>`
      )

    const { url } = await httpClient.get('http://asdf.com/somepage')
    expect(url).toBe('http://asdf.com/canonical')
  })

  // Candidate source: the HTTP `Link` response header. The fixture has a
  // leading space inside the angle brackets; the expected URL has none.
  it('picks up rel=canonical in HTTP header', async () => {
    nock('http://asdf.com')
      .get('/somepage')
      .reply(
        200,
        `<!DOCTYPE html>
<html lang="en">
<head>
<title>Simple HTML document</title>
</head>
<body>
<h1>Hello World!</h1>
</body>
</html>`,
        {
          link: '< http://asdf.com/canonical>; rel="canonical"'
        }
      )

    const { url } = await httpClient.get('http://asdf.com/somepage')
    expect(url).toBe('http://asdf.com/canonical')
  })

  // Candidate source: an anchor carrying the `amp-canurl` class.
  it("picks up de-AMP'd links", async () => {
    nock('http://asdf.com')
      .get('/somepage')
      .reply(
        200,
        `<!DOCTYPE html>
<html lang="en">
<head>
<title>Simple HTML document</title>
<a class="amp-canurl" href="http://asdf.com/canonical" />
</head>
<body>
<h1>Hello World!</h1>
</body>
</html>`
      )

    const { url } = await httpClient.get('http://asdf.com/somepage')
    expect(url).toBe('http://asdf.com/canonical')
  })

  // Candidate source: the Open Graph `og:url` meta tag.
  it('picks up opengraph links', async () => {
    nock('http://asdf.com')
      .get('/somepage')
      .reply(
        200,
        `<!DOCTYPE html>
<html lang="en">
<head>
<title>Simple HTML document</title>
<meta property="og:url" content="http://asdf.com/canonical" />
</head>
<body>
<h1>Hello World!</h1>
</body>
</html>`
      )

    const { url } = await httpClient.get('http://asdf.com/somepage')
    expect(url).toBe('http://asdf.com/canonical')
  })

  // A root-relative href must come back as an absolute URL on the host that
  // served the page (sub.asdf.com), including its subdomain.
  it('puts all relative links in absolute form', async () => {
    nock('http://sub.asdf.com')
      .get('/somepage')
      .reply(
        200,
        `<!DOCTYPE html>
<html lang="en">
<head>
<link rel="canonical" href="/canonical" />
<title>Simple HTML document</title>
</head>
<body>
<h1>Hello World!</h1>
</body>
</html>`
      )

    const { url } = await httpClient.get('http://sub.asdf.com/somepage')
    expect(url).toBe('http://sub.asdf.com/canonical')
  })

  // The canonical link points at sub.com while the page lives on asdf.com;
  // the candidate is rejected and the requested URL is kept.
  it('ignores invalid matches (domain)', async () => {
    nock('http://sub.asdf.com')
      .get('/somepage')
      .reply(
        200,
        `<!DOCTYPE html>
<html lang="en">
<head>
<link rel="canonical" href="http://sub.com/canonical" />
<title>Simple HTML document</title>
</head>
<body>
<h1>Hello World!</h1>
</body>
</html>`
      )

    const { url } = await httpClient.get('http://sub.asdf.com/somepage')
    expect(url).toBe('http://sub.asdf.com/somepage')
  })

  // The canonical link points at an amp.* host; the candidate is rejected
  // and the requested URL is kept.
  it('ignores invalid matches (amp)', async () => {
    nock('http://asdf.com')
      .get('/somepage')
      .reply(
        200,
        `<!DOCTYPE html>
<html lang="en">
<head>
<link rel="canonical" href="http://amp.asdf.com/canonical" />
<title>Simple HTML document</title>
</head>
<body>
<h1>Hello World!</h1>
</body>
</html>`
      )

    const { url } = await httpClient.get('http://asdf.com/somepage')
    expect(url).toBe('http://asdf.com/somepage')
  })

  // Four competing candidates (Link header, <link rel="canonical">,
  // amp-canurl anchor, og:url) for a page served from amp.asdf.com. The
  // expected winner is the og:url value, whose path equals the request path.
  it('returns the most "relevant" link', async () => {
    nock('http://amp.asdf.com')
      .get('/somepage')
      .reply(
        200,
        `<!DOCTYPE html>
<html lang="en">
<head>
<link rel="canonical" href="http://asdf.com/some" />
<title>Simple HTML document</title>
<a class="amp-canurl" href="http://asdf.com" />
<meta property="og:url" content="http://asdf.com/somepage" />
</head>
<body>
<h1>Hello World!</h1>
</body>
</html>`,
        {
          link: '< http://asdf.com/canonical>; rel="canonical"'
        }
      )

    const { url } = await httpClient.get('http://amp.asdf.com/somepage')
    expect(url).toBe('http://asdf.com/somepage')
  })
})

0 comments on commit 41e6002

Please sign in to comment.