Add custom code for Reddit archiving #137

Merged 3 commits on Aug 21, 2024
26 changes: 20 additions & 6 deletions internal/pkg/crawl/assets.go
@@ -6,13 +6,15 @@ import (
"strings"

"github.com/PuerkitoBio/goquery"
"github.com/internetarchive/Zeno/internal/pkg/crawl/extractor"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream"
"github.com/internetarchive/Zeno/internal/pkg/queue"
"github.com/internetarchive/Zeno/internal/pkg/utils"
)

func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Document) (assets []*url.URL, err error) {
var rawAssets []string
var URL = utils.URLToString(item.URL)

// Execute plugins on the response
if strings.Contains(base.Host, "cloudflarestream.com") {
@@ -30,8 +32,12 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
doc.Find("[data-item]").Each(func(index int, item *goquery.Selection) {
dataItem, exists := item.Attr("data-item")
if exists {
URLsFromJSON, _ := getURLsFromJSON(dataItem)
rawAssets = append(rawAssets, URLsFromJSON...)
URLsFromJSON, err := extractor.GetURLsFromJSON(dataItem)
if err != nil {
c.Log.Error("unable to extract URLs from JSON in data-item attribute", "error", err, "url", URL)
} else {
rawAssets = append(rawAssets, URLsFromJSON...)
}
}
})

@@ -136,8 +142,12 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
scriptType, exists := item.Attr("type")
if exists {
if scriptType == "application/json" {
URLsFromJSON, _ := getURLsFromJSON(item.Text())
rawAssets = append(rawAssets, URLsFromJSON...)
URLsFromJSON, err := extractor.GetURLsFromJSON(item.Text())
if err != nil {
c.Log.Error("unable to extract URLs from JSON in script tag", "error", err, "url", URL)
} else {
rawAssets = append(rawAssets, URLsFromJSON...)
}
}
}

@@ -184,8 +194,12 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
}

if len(jsonContent[1]) > payloadEndPosition {
URLsFromJSON, _ := getURLsFromJSON(jsonContent[1][:payloadEndPosition+1])
rawAssets = append(rawAssets, removeGoogleVideoURLs(URLsFromJSON)...)
URLsFromJSON, err := extractor.GetURLsFromJSON(jsonContent[1][:payloadEndPosition+1])
if err != nil {
c.Log.Error("unable to extract URLs from JSON in script tag", "error", err, "url", URL)
} else {
rawAssets = append(rawAssets, removeGoogleVideoURLs(URLsFromJSON)...)
}
}
}
}
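For illustration, a minimal sketch of what the new error-aware call path looks like when fed a data-item attribute. The JSON payload and the standalone wrapper are made up for this example, and the import only resolves from inside the Zeno module, since the extractor package is internal:

package main

import (
	"fmt"

	"github.com/internetarchive/Zeno/internal/pkg/crawl/extractor"
)

func main() {
	// Hypothetical data-item payload; real Reddit pages embed much larger objects.
	dataItem := `{"media": {"content": "https://i.redd.it/example.jpg"}, "count": 3}`

	URLsFromJSON, err := extractor.GetURLsFromJSON(dataItem)
	if err != nil {
		// Previously the error was discarded; the crawl now logs it instead.
		fmt.Println("unable to extract URLs from JSON in data-item attribute:", err)
		return
	}

	fmt.Println(URLsFromJSON) // [https://i.redd.it/example.jpg]
}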
100 changes: 17 additions & 83 deletions internal/pkg/crawl/capture.go
@@ -1,7 +1,6 @@
package crawl

import (
"encoding/json"
"errors"
"io"
"net/http"
@@ -12,7 +11,7 @@ import (
"time"

"github.com/PuerkitoBio/goquery"
"github.com/clbanning/mxj/v2"
"github.com/internetarchive/Zeno/internal/pkg/crawl/extractor"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/facebook"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/libsyn"
@@ -224,6 +223,7 @@ func (c *Crawl) Capture(item *queue.Item) error {
var (
resp *http.Response
waitGroup sync.WaitGroup
assets []*url.URL
)

defer func(i *queue.Item) {
@@ -390,53 +390,20 @@ func (c *Crawl) Capture(item *queue.Item) error {
return err
}

// If the response is a JSON document, we want to scrape it for links
if strings.Contains(resp.Header.Get("Content-Type"), "json") {
jsonBody, err := io.ReadAll(resp.Body)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while reading JSON body")
return err
}

outlinksFromJSON, err := getURLsFromJSON(string(jsonBody))
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while getting URLs from JSON")
return err
}

waitGroup.Add(1)
go c.queueOutlinks(utils.MakeAbsolute(item.URL, utils.StringSliceToURLSlice(outlinksFromJSON)), item, &waitGroup)

return err
}

// If the response is an XML document, we want to scrape it for links
if strings.Contains(resp.Header.Get("Content-Type"), "xml") {
xmlBody, err := io.ReadAll(resp.Body)
assets, err = extractor.XML(resp)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while reading XML body")
return err
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from XML")
}

mv, err := mxj.NewMapXml(xmlBody)
} else if strings.Contains(resp.Header.Get("Content-Type"), "json") {
assets, err = extractor.JSON(resp)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while parsing XML body")
return err
}

for _, value := range mv.LeafValues() {
if _, ok := value.(string); ok {
if strings.HasPrefix(value.(string), "http") {
discovered = append(discovered, value.(string))
}
}
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from JSON")
}
}

// If the response isn't a text/*, we do not scrape it.
// We also aren't going to scrape if assets and outlinks are turned off.
if !strings.Contains(resp.Header.Get("Content-Type"), "text/") || (c.DisableAssetsCapture && !c.DomainsCrawl && (uint64(c.MaxHops) <= item.Hop)) {
// Enforce reading all data from the response for WARC writing
} else if !strings.Contains(resp.Header.Get("Content-Type"), "text/") || (c.DisableAssetsCapture && !c.DomainsCrawl && (uint64(c.MaxHops) <= item.Hop)) {
// If the response isn't a text/*, we do not scrape it.
// We also aren't going to scrape if assets and outlinks are turned off.
_, err := io.Copy(io.Discard, resp.Body)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while reading response body")
@@ -526,11 +493,13 @@ func (c *Crawl) Capture(item *queue.Item) error {
return err
}

// Extract and capture assets
assets, err := c.extractAssets(base, item, doc)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting assets")
return err
// Extract and capture assets (only if we didn't use an extractor that produces assets)
if len(assets) == 0 {
assets, err = c.extractAssets(base, item, doc)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting assets")
return err
}
}

// If we didn't find any assets, let's stop here
@@ -649,38 +618,3 @@ func (c *Crawl) Capture(item *queue.Item) error {
swg.Wait()
return err
}

func getURLsFromJSON(jsonString string) ([]string, error) {
var data interface{}
err := json.Unmarshal([]byte(jsonString), &data)
if err != nil {
return nil, err
}

links := make([]string, 0)
findURLs(data, &links)

return links, nil
}

func findURLs(data interface{}, links *[]string) {
switch v := data.(type) {
case string:
if isValidURL(v) {
*links = append(*links, v)
}
case []interface{}:
for _, element := range v {
findURLs(element, links)
}
case map[string]interface{}:
for _, value := range v {
findURLs(value, links)
}
}
}

func isValidURL(str string) bool {
u, err := url.Parse(str)
return err == nil && u.Scheme != "" && u.Host != ""
}
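Condensed, the new control flow in Capture routes a response by Content-Type before falling back to the goquery-based HTML path. A minimal sketch under the assumption that only the dispatch matters here; logging, WARC handling, and the crawl options are stripped out, and the function name is illustrative:

package sketch

import (
	"io"
	"net/http"
	"net/url"
	"strings"

	"github.com/internetarchive/Zeno/internal/pkg/crawl/extractor"
)

// scrape mirrors the Content-Type dispatch added in this change.
func scrape(resp *http.Response) (assets []*url.URL, err error) {
	contentType := resp.Header.Get("Content-Type")

	switch {
	case strings.Contains(contentType, "xml"):
		assets, err = extractor.XML(resp)
	case strings.Contains(contentType, "json"):
		assets, err = extractor.JSON(resp)
	case !strings.Contains(contentType, "text/"):
		// Not scrapable: drain the body so it is still read in full for WARC writing.
		_, err = io.Copy(io.Discard, resp.Body)
	default:
		// text/* continues through the existing HTML asset extraction.
	}

	return assets, err
}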
64 changes: 64 additions & 0 deletions internal/pkg/crawl/extractor/json.go
@@ -0,0 +1,64 @@
package extractor

import (
"encoding/json"
"io"
"net/http"
"net/url"
)

func JSON(resp *http.Response) (URLs []*url.URL, err error) {
jsonBody, err := io.ReadAll(resp.Body)
if err != nil {
return nil, err
}

rawURLs, err := GetURLsFromJSON(string(jsonBody))
if err != nil {
return nil, err
}

for _, rawURL := range rawURLs {
URL, err := url.Parse(rawURL)
if err == nil {
URLs = append(URLs, URL)
}
}

return URLs, err
}

func GetURLsFromJSON(jsonString string) ([]string, error) {
var data interface{}
err := json.Unmarshal([]byte(jsonString), &data)
if err != nil {
return nil, err
}

links := make([]string, 0)
findURLs(data, &links)

return links, nil
}

func findURLs(data interface{}, links *[]string) {
switch v := data.(type) {
case string:
if isValidURL(v) {
*links = append(*links, v)
}
case []interface{}:
for _, element := range v {
findURLs(element, links)
}
case map[string]interface{}:
for _, value := range v {
findURLs(value, links)
}
}
}

func isValidURL(str string) bool {
u, err := url.Parse(str)
return err == nil && u.Scheme != "" && u.Host != ""
}
91 changes: 91 additions & 0 deletions internal/pkg/crawl/extractor/json_test.go
@@ -0,0 +1,91 @@
package extractor

import (
"bytes"
"io"
"net/http"
"net/url"
"reflect"
"sort"
"testing"
)

func TestJSON(t *testing.T) {
tests := []struct {
name string
jsonBody string
wantURLs []*url.URL
wantErr bool
}{
{
name: "Valid JSON with URLs",
jsonBody: `{"url": "https://example.com", "nested": {"link": "http://test.com"}}`,
wantURLs: []*url.URL{
{Scheme: "https", Host: "example.com"},
{Scheme: "http", Host: "test.com"},
},
wantErr: false,
},
{
name: "Invalid JSON",
jsonBody: `{"url": "https://example.com"`,
wantURLs: nil,
wantErr: true,
},
{
name: "JSON with no URLs",
jsonBody: `{"key": "value", "number": 42}`,
wantURLs: nil,
wantErr: false,
},
{
name: "JSON with URLs in various fields",
jsonBody: `{"someField": "https://example.com", "otherField": "http://test.com", "nested": {"deepLink": "https://deep.example.com"}}`,
wantURLs: []*url.URL{
{Scheme: "https", Host: "example.com"},
{Scheme: "http", Host: "test.com"},
{Scheme: "https", Host: "deep.example.com"},
},
wantErr: false,
},
{
name: "JSON with array of URLs",
jsonBody: `{"links": ["https://example1.com", "https://example2.com"]}`,
wantURLs: []*url.URL{
{Scheme: "https", Host: "example1.com"},
{Scheme: "https", Host: "example2.com"},
},
wantErr: false,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
resp := &http.Response{
Body: io.NopCloser(bytes.NewBufferString(tt.jsonBody)),
}

gotURLs, err := JSON(resp)

if (err != nil) != tt.wantErr {
t.Errorf("JSON() error = %v, wantErr %v", err, tt.wantErr)
return
}

// Sort both slices before comparison
sortURLs(gotURLs)
sortURLs(tt.wantURLs)

if !reflect.DeepEqual(gotURLs, tt.wantURLs) {
t.Errorf("JSON() gotURLs = %v, want %v", gotURLs, tt.wantURLs)
}
})
}
}

// Helper function to sort URL slices
func sortURLs(urls []*url.URL) {
sort.Slice(urls, func(i, j int) bool {
return urls[i].String() < urls[j].String()
})
}
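With the extraction logic split into its own package, these table-driven tests can be run in isolation from the repository root with go test ./internal/pkg/crawl/extractor (assuming the standard Go toolchain and module layout).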
35 changes: 35 additions & 0 deletions internal/pkg/crawl/extractor/xml.go
@@ -0,0 +1,35 @@
package extractor

import (
"io"
"net/http"
"net/url"
"strings"

"github.com/clbanning/mxj/v2"
)

func XML(resp *http.Response) (URLs []*url.URL, err error) {
xmlBody, err := io.ReadAll(resp.Body)
if err != nil {
return nil, err
}

mv, err := mxj.NewMapXml(xmlBody)
if err != nil {
return nil, err
}

for _, value := range mv.LeafValues() {
if _, ok := value.(string); ok {
if strings.HasPrefix(value.(string), "http") {
URL, err := url.Parse(value.(string))
if err == nil {
URLs = append(URLs, URL)
}
}
}
}

return URLs, nil
}
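A hypothetical call against a sitemap-style body shows what the new XML extractor returns; the response construction and URLs are illustrative only, and the import again assumes the code lives inside the Zeno module:

package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"

	"github.com/internetarchive/Zeno/internal/pkg/crawl/extractor"
)

func main() {
	body := `<urlset><url><loc>https://example.com/page1</loc></url><url><loc>https://example.com/page2</loc></url></urlset>`

	resp := &http.Response{Body: io.NopCloser(bytes.NewBufferString(body))}

	URLs, err := extractor.XML(resp)
	if err != nil {
		fmt.Println("unable to extract URLs from XML:", err)
		return
	}

	// Prints both <loc> values, since every string leaf starting with "http" is kept.
	for _, URL := range URLs {
		fmt.Println(URL.String())
	}
}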