Skip to content

Commit

Permalink
Extractor fixes (#148)
Browse files Browse the repository at this point in the history
Especially aiming to enhance reddit.com archiving support.
  • Loading branch information
NGTmeaty authored Sep 27, 2024
1 parent 094c330 commit 5263258
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 3 deletions.
14 changes: 13 additions & 1 deletion internal/pkg/crawl/assets.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"net/http"
"net/url"
"regexp"
"strconv"
"strings"
"sync/atomic"

Expand Down Expand Up @@ -169,7 +170,12 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu

for match := range matches {
if len(matches[match]) > 0 {
rawAssets = append(rawAssets, matches[match][1])
matchFound := matches[match][1]
// Don't extract CSS elements that aren't URLs
if strings.Contains(matchFound, "%") || strings.HasPrefix(matchFound, "0.") || strings.HasPrefix(matchFound, "--font") || strings.HasPrefix(matchFound, "--size") || strings.HasPrefix(matchFound, "--color") || strings.HasPrefix(matchFound, "--shreddit") || strings.HasPrefix(matchFound, "100vh") {
continue
}
rawAssets = append(rawAssets, matchFound)
}
}
}
Expand Down Expand Up @@ -276,6 +282,12 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
scriptLinks := utils.DedupeStrings(regexOutlinks.FindAllString(outerHTML, -1))
for _, scriptLink := range scriptLinks {
if strings.HasPrefix(scriptLink, "http") {
// Escape URLs when unicode runes are present in the extracted URLs
scriptLink, err := strconv.Unquote(`"` + scriptLink + `"`)
if err != nil {
c.Log.Debug("unable to escape URL from JSON in script tag", "error", err, "url", scriptLink)
continue
}
rawAssets = append(rawAssets, scriptLink)
}
}
Expand Down
6 changes: 6 additions & 0 deletions internal/pkg/crawl/sitespecific/youtube/youtube_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@ func TestParse(t *testing.T) {
// Parse the video
streamURLs, metaURLs, rawJSON, _, err := ytdlp.Parse(f)
if err != nil {
_, found := ytdlp.FindPath()
if !found {
// TODO: install yt-dlp when running our tests in CI?
t.Skipf("yt-dlp not installed. skipping test due to missing executable.")
return
}
t.Fatal(err)
}

Expand Down
39 changes: 37 additions & 2 deletions internal/pkg/utils/url.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,14 @@ import (
func URLToString(u *url.URL) string {
var err error

q := u.Query()
u.RawQuery = q.Encode()
switch u.Host {
case "external-preview.redd.it", "styles.redditmedia.com", "preview.redd.it":
// Do nothing. We don't want to encode the URL for signature purposes. :(
break
default:
q := u.Query()
u.RawQuery = encodeQuery(q)
}
u.Host, err = idna.ToASCII(u.Host)
if err != nil {
if strings.Contains(u.Host, ":") {
Expand All @@ -38,6 +44,35 @@ func URLToString(u *url.URL) string {
return u.String()
}

// encodeQuery encodes the values into "URL encoded" form
// ("bar=baz&foo=quux").
//
// This started out as a copy of (url.Values).Encode with the key sort
// removed, the aim being to leave query parameters in their original order.
// url.Values is a map, though, and Go randomizes map iteration order, so
// dropping the sort preserved nothing: it made the output change from one
// call to the next for any query with more than one key. The order in which
// the parameters appeared in the URL cannot be recovered from a url.Values,
// so the keys are emitted in sorted order to keep the result deterministic,
// which is exactly what (url.Values).Encode does.
//
// A nil or empty v yields the empty string.
func encodeQuery(v url.Values) string {
	return v.Encode()
}

// MakeAbsolute turn all URLs in a slice of url.URL into absolute URLs, based
// on a given base *url.URL
func MakeAbsolute(base *url.URL, URLs []*url.URL) []*url.URL {
Expand Down
27 changes: 27 additions & 0 deletions internal/pkg/utils/url_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,30 @@ func TestURLwithIPv6WithPort(t *testing.T) {
t.Fatalf("Expected %s, got %s", expected, actual)
}
}

// TestURLwithSpacesandUnicode checks that URLToString percent-encodes the
// non-ASCII runes and the reserved characters (":", "/", "?", "=") that
// appear inside a query string value.
func TestURLwithSpacesandUnicode(t *testing.T) {
	// The separator between "のぞみ" and "にじさんじ" is U+FF0F FULLWIDTH SOLIDUS,
	// written as an escape so it cannot be silently normalized to an ASCII "/".
	// An ASCII slash would be encoded as %2F and would not match the
	// %EF%BC%8F sequence in the expected string below.
	u, err := url.Parse("https://www.youtube.com/watch/0HBwC_wIFF4?t=18363石神視点【Minecraft】平日もど真ん中なんだから早く寝なきゃ【石神のぞみ\uFF0Fにじさんじ所属】https://www.youtube.com/watch/L30uAR9X8Uw?t=10100【倉持エン足中")
	if err != nil {
		t.Fatalf("Error parsing URL: %v", err)
	}

	expected := "https://www.youtube.com/watch/0HBwC_wIFF4?t=18363%E7%9F%B3%E7%A5%9E%E8%A6%96%E7%82%B9%E3%80%90Minecraft%E3%80%91%E5%B9%B3%E6%97%A5%E3%82%82%E3%81%A9%E7%9C%9F%E3%82%93%E4%B8%AD%E3%81%AA%E3%82%93%E3%81%A0%E3%81%8B%E3%82%89%E6%97%A9%E3%81%8F%E5%AF%9D%E3%81%AA%E3%81%8D%E3%82%83%E3%80%90%E7%9F%B3%E7%A5%9E%E3%81%AE%E3%81%9E%E3%81%BF%EF%BC%8F%E3%81%AB%E3%81%98%E3%81%95%E3%82%93%E3%81%98%E6%89%80%E5%B1%9E%E3%80%91https%3A%2F%2Fwww.youtube.com%2Fwatch%2FL30uAR9X8Uw%3Ft%3D10100%E3%80%90%E5%80%89%E6%8C%81%E3%82%A8%E3%83%B3%E8%B6%B3%E4%B8%AD"
	actual := URLToString(u)
	if actual != expected {
		t.Fatalf("Expected %s, got %s", expected, actual)
	}
}

// TestURLwithRedditOverride verifies that URLToString hands back a
// styles.redditmedia.com URL exactly as it was given, without re-encoding
// its query string.
func TestURLwithRedditOverride(t *testing.T) {
	// The input doubles as the expected output: the query (note the raw ":"
	// and "," in the crop parameter) must come through untouched.
	const redditURL = "https://styles.redditmedia.com/t5_7wkhw/styles/profileIcon_8w6r6fr3rh2d1.jpeg?width=64&height=64&frame=1&auto=webp&crop=64:64,smart&s=6d8ab9b89c9b846c9eb65622db9ced4992dc0905"

	parsed, parseErr := url.Parse(redditURL)
	if parseErr != nil {
		t.Fatalf("Error parsing URL: %v", parseErr)
	}

	if got := URLToString(parsed); got != redditURL {
		t.Fatalf("Expected %s, got %s", redditURL, got)
	}
}

0 comments on commit 5263258

Please sign in to comment.