From 52632588ecb43fd0c3a4942c16966026b59232e2 Mon Sep 17 00:00:00 2001 From: Jake L Date: Fri, 27 Sep 2024 09:02:45 -0400 Subject: [PATCH] Extractor fixes (#148) Especially aiming to enhance reddit.com archiving support. --- internal/pkg/crawl/assets.go | 14 ++++++- .../sitespecific/youtube/youtube_test.go | 6 +++ internal/pkg/utils/url.go | 39 ++++++++++++++++++- internal/pkg/utils/url_test.go | 27 +++++++++++++ 4 files changed, 83 insertions(+), 3 deletions(-) diff --git a/internal/pkg/crawl/assets.go b/internal/pkg/crawl/assets.go index 6e60c372..81400457 100644 --- a/internal/pkg/crawl/assets.go +++ b/internal/pkg/crawl/assets.go @@ -5,6 +5,7 @@ import ( "net/http" "net/url" "regexp" + "strconv" "strings" "sync/atomic" @@ -169,7 +170,12 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu for match := range matches { if len(matches[match]) > 0 { - rawAssets = append(rawAssets, matches[match][1]) + matchFound := matches[match][1] + // Don't extract CSS elements that aren't URLs + if strings.Contains(matchFound, "%") || strings.HasPrefix(matchFound, "0.") || strings.HasPrefix(matchFound, "--font") || strings.HasPrefix(matchFound, "--size") || strings.HasPrefix(matchFound, "--color") || strings.HasPrefix(matchFound, "--shreddit") || strings.HasPrefix(matchFound, "100vh") { + continue + } + rawAssets = append(rawAssets, matchFound) } } } @@ -276,6 +282,12 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu scriptLinks := utils.DedupeStrings(regexOutlinks.FindAllString(outerHTML, -1)) for _, scriptLink := range scriptLinks { if strings.HasPrefix(scriptLink, "http") { + // Escape URLs when unicode runes are present in the extracted URLs + scriptLink, err := strconv.Unquote(`"` + scriptLink + `"`) + if err != nil { + c.Log.Debug("unable to escape URL from JSON in script tag", "error", err, "url", scriptLink) + continue + } rawAssets = append(rawAssets, scriptLink) } } diff --git 
// encodeQuery serializes v into "URL encoded" form, e.g. "bar=baz&foo=quux".
//
// Keys are written in sorted order and the values of each key in slice order,
// which is exactly what url.Values.Encode produces. An earlier version of this
// helper was a copy of Encode with the sort removed, with the aim of not
// reordering parameters. That cannot work: url.Values is a map, so the order
// in which parameters appeared in the query string is already lost once the
// query has been parsed, and Go deliberately randomizes map iteration order.
// Skipping the sort therefore did not keep the original order; it made the
// same input serialize differently from one call to the next.
//
// Sorted output is one of the orderings the unsorted loop could already
// produce, so no caller can have relied on anything else. Code that needs a
// query string preserved byte-for-byte has to keep the original RawQuery
// rather than round-tripping it through url.Values.
//
// It returns "" when v is nil or empty.
func encodeQuery(v url.Values) string {
	return v.Encode()
}
+func TestURLwithRedditOverride(t *testing.T) { + u, err := url.Parse("https://styles.redditmedia.com/t5_7wkhw/styles/profileIcon_8w6r6fr3rh2d1.jpeg?width=64&height=64&frame=1&auto=webp&crop=64:64,smart&s=6d8ab9b89c9b846c9eb65622db9ced4992dc0905") + if err != nil { + t.Fatalf("Error parsing URL: %v", err) + } + + expected := "https://styles.redditmedia.com/t5_7wkhw/styles/profileIcon_8w6r6fr3rh2d1.jpeg?width=64&height=64&frame=1&auto=webp&crop=64:64,smart&s=6d8ab9b89c9b846c9eb65622db9ced4992dc0905" + actual := URLToString(u) + if actual != expected { + t.Fatalf("Expected %s, got %s", expected, actual) + } +}