Add custom code for Reddit archiving #137

Merged: 3 commits, Aug 21, 2024
Changes from 2 commits
7 changes: 4 additions & 3 deletions internal/pkg/crawl/assets.go
@@ -6,6 +6,7 @@ import (
"strings"

"github.com/PuerkitoBio/goquery"
"github.com/internetarchive/Zeno/internal/pkg/crawl/extractor"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream"
"github.com/internetarchive/Zeno/internal/pkg/queue"
"github.com/internetarchive/Zeno/internal/pkg/utils"
@@ -30,7 +31,7 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
doc.Find("[data-item]").Each(func(index int, item *goquery.Selection) {
dataItem, exists := item.Attr("data-item")
if exists {
- URLsFromJSON, _ := getURLsFromJSON(dataItem)
+ URLsFromJSON, _ := extractor.GetURLsFromJSON(dataItem)
rawAssets = append(rawAssets, URLsFromJSON...)
}
})
@@ -136,7 +137,7 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
scriptType, exists := item.Attr("type")
if exists {
if scriptType == "application/json" {
- URLsFromJSON, _ := getURLsFromJSON(item.Text())
+ URLsFromJSON, _ := extractor.GetURLsFromJSON(item.Text())
rawAssets = append(rawAssets, URLsFromJSON...)
}
}
@@ -184,7 +185,7 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
}

if len(jsonContent[1]) > payloadEndPosition {
- URLsFromJSON, _ := getURLsFromJSON(jsonContent[1][:payloadEndPosition+1])
+ URLsFromJSON, _ := extractor.GetURLsFromJSON(jsonContent[1][:payloadEndPosition+1])
rawAssets = append(rawAssets, removeGoogleVideoURLs(URLsFromJSON)...)
}
}
100 changes: 17 additions & 83 deletions internal/pkg/crawl/capture.go
@@ -1,7 +1,6 @@
package crawl

import (
"encoding/json"
"errors"
"io"
"net/http"
@@ -12,7 +11,7 @@ import (
"time"

"github.com/PuerkitoBio/goquery"
"github.com/clbanning/mxj/v2"
"github.com/internetarchive/Zeno/internal/pkg/crawl/extractor"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/facebook"
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/libsyn"
@@ -224,6 +223,7 @@ func (c *Crawl) Capture(item *queue.Item) error {
var (
resp *http.Response
waitGroup sync.WaitGroup
+ assets []*url.URL
)

defer func(i *queue.Item) {
@@ -390,53 +390,20 @@ func (c *Crawl) Capture(item *queue.Item) error {
return err
}

- // If the response is a JSON document, we want to scrape it for links
- if strings.Contains(resp.Header.Get("Content-Type"), "json") {
- jsonBody, err := io.ReadAll(resp.Body)
- if err != nil {
- c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while reading JSON body")
- return err
- }

- outlinksFromJSON, err := getURLsFromJSON(string(jsonBody))
- if err != nil {
- c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while getting URLs from JSON")
- return err
- }

- waitGroup.Add(1)
- go c.queueOutlinks(utils.MakeAbsolute(item.URL, utils.StringSliceToURLSlice(outlinksFromJSON)), item, &waitGroup)

- return err
- }

// If the response is an XML document, we want to scrape it for links
if strings.Contains(resp.Header.Get("Content-Type"), "xml") {
- xmlBody, err := io.ReadAll(resp.Body)
+ assets, err = extractor.XML(resp)
if err != nil {
- c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while reading XML body")
- return err
+ c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from XML")
}

- mv, err := mxj.NewMapXml(xmlBody)
+ } else if strings.Contains(resp.Header.Get("Content-Type"), "json") {
+ assets, err = extractor.JSON(resp)
if err != nil {
- c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while parsing XML body")
- return err
- }

- for _, value := range mv.LeafValues() {
- if _, ok := value.(string); ok {
- if strings.HasPrefix(value.(string), "http") {
- discovered = append(discovered, value.(string))
- }
- }
+ c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from JSON")
}
- }

- // If the response isn't a text/*, we do not scrape it.
- // We also aren't going to scrape if assets and outlinks are turned off.
- if !strings.Contains(resp.Header.Get("Content-Type"), "text/") || (c.DisableAssetsCapture && !c.DomainsCrawl && (uint64(c.MaxHops) <= item.Hop)) {
- // Enforce reading all data from the response for WARC writing
+ } else if !strings.Contains(resp.Header.Get("Content-Type"), "text/") || (c.DisableAssetsCapture && !c.DomainsCrawl && (uint64(c.MaxHops) <= item.Hop)) {
+ // If the response isn't a text/*, we do not scrape it.
+ // We also aren't going to scrape if assets and outlinks are turned off.
_, err := io.Copy(io.Discard, resp.Body)
if err != nil {
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while reading response body")
@@ -526,11 +493,13 @@ func (c *Crawl) Capture(item *queue.Item) error {
return err
}

- // Extract and capture assets
- assets, err := c.extractAssets(base, item, doc)
- if err != nil {
- c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting assets")
- return err
+ // Extract and capture assets (only if we didn't use an extractor that produces assets)
+ if len(assets) == 0 {
+ assets, err = c.extractAssets(base, item, doc)
+ if err != nil {
+ c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting assets")
+ return err
+ }
}

// If we didn't find any assets, let's stop here
@@ -649,38 +618,3 @@ func (c *Crawl) Capture(item *queue.Item) error {
swg.Wait()
return err
}

- func getURLsFromJSON(jsonString string) ([]string, error) {
- var data interface{}
- err := json.Unmarshal([]byte(jsonString), &data)
- if err != nil {
- return nil, err
- }

- links := make([]string, 0)
- findURLs(data, &links)

- return links, nil
- }

- func findURLs(data interface{}, links *[]string) {
- switch v := data.(type) {
- case string:
- if isValidURL(v) {
- *links = append(*links, v)
- }
- case []interface{}:
- for _, element := range v {
- findURLs(element, links)
- }
- case map[string]interface{}:
- for _, value := range v {
- findURLs(value, links)
- }
- }
- }

- func isValidURL(str string) bool {
- u, err := url.Parse(str)
- return err == nil && u.Scheme != "" && u.Host != ""
- }
64 changes: 64 additions & 0 deletions internal/pkg/crawl/extractor/json.go
@@ -0,0 +1,64 @@
package extractor

import (
"encoding/json"
"io"
"net/http"
"net/url"
)

func JSON(resp *http.Response) (URLs []*url.URL, err error) {
jsonBody, err := io.ReadAll(resp.Body)
if err != nil {
return nil, err
}

rawURLs, err := GetURLsFromJSON(string(jsonBody))
if err != nil {
return nil, err
}

for _, rawURL := range rawURLs {
URL, err := url.Parse(rawURL)
if err == nil {
URLs = append(URLs, URL)
}
}

return URLs, err
}

func GetURLsFromJSON(jsonString string) ([]string, error) {
var data interface{}
err := json.Unmarshal([]byte(jsonString), &data)
if err != nil {
return nil, err
}

links := make([]string, 0)
findURLs(data, &links)

return links, nil
}

func findURLs(data interface{}, links *[]string) {
switch v := data.(type) {
case string:
if isValidURL(v) {
*links = append(*links, v)
}
case []interface{}:
for _, element := range v {
findURLs(element, links)
}
case map[string]interface{}:
for _, value := range v {
findURLs(value, links)
}
}
}

func isValidURL(str string) bool {
u, err := url.Parse(str)
return err == nil && u.Scheme != "" && u.Host != ""
}
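
For reference, a minimal sketch of calling the string-based helper directly, outside of an HTTP response. It is not part of this diff; it assumes code living inside the Zeno module (the extractor package is internal), and the Reddit-style payload is made up.

package main

import (
	"fmt"

	"github.com/internetarchive/Zeno/internal/pkg/crawl/extractor"
)

func main() {
	// Hypothetical Reddit listing fragment; any nested string that parses as an
	// absolute URL is collected by the recursive findURLs walk.
	payload := `{"data": {"children": [{"data": {"url": "https://i.redd.it/example.jpg", "score": 42}}]}}`

	rawURLs, err := extractor.GetURLsFromJSON(payload)
	if err != nil {
		panic(err)
	}

	for _, rawURL := range rawURLs {
		fmt.Println(rawURL) // prints https://i.redd.it/example.jpg
	}
}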
91 changes: 91 additions & 0 deletions internal/pkg/crawl/extractor/json_test.go
@@ -0,0 +1,91 @@
package extractor

import (
"bytes"
"io"
"net/http"
"net/url"
"reflect"
"sort"
"testing"
)

func TestJSON(t *testing.T) {
tests := []struct {
name string
jsonBody string
wantURLs []*url.URL
wantErr bool
}{
{
name: "Valid JSON with URLs",
jsonBody: `{"url": "https://example.com", "nested": {"link": "http://test.com"}}`,
wantURLs: []*url.URL{
{Scheme: "https", Host: "example.com"},
{Scheme: "http", Host: "test.com"},
},
wantErr: false,
},
{
name: "Invalid JSON",
jsonBody: `{"url": "https://example.com"`,
wantURLs: nil,
wantErr: true,
},
{
name: "JSON with no URLs",
jsonBody: `{"key": "value", "number": 42}`,
wantURLs: nil,
wantErr: false,
},
{
name: "JSON with URLs in various fields",
jsonBody: `{"someField": "https://example.com", "otherField": "http://test.com", "nested": {"deepLink": "https://deep.example.com"}}`,
wantURLs: []*url.URL{
{Scheme: "https", Host: "example.com"},
{Scheme: "http", Host: "test.com"},
{Scheme: "https", Host: "deep.example.com"},
},
wantErr: false,
},
{
name: "JSON with array of URLs",
jsonBody: `{"links": ["https://example1.com", "https://example2.com"]}`,
wantURLs: []*url.URL{
{Scheme: "https", Host: "example1.com"},
{Scheme: "https", Host: "example2.com"},
},
wantErr: false,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
resp := &http.Response{
Body: io.NopCloser(bytes.NewBufferString(tt.jsonBody)),
}

gotURLs, err := JSON(resp)

if (err != nil) != tt.wantErr {
t.Errorf("JSON() error = %v, wantErr %v", err, tt.wantErr)
return
}

// Sort both slices before comparison
sortURLs(gotURLs)
sortURLs(tt.wantURLs)

if !reflect.DeepEqual(gotURLs, tt.wantURLs) {
t.Errorf("JSON() gotURLs = %v, want %v", gotURLs, tt.wantURLs)
}
})
}
}

// Helper function to sort URL slices
func sortURLs(urls []*url.URL) {
sort.Slice(urls, func(i, j int) bool {
return urls[i].String() < urls[j].String()
})
}
35 changes: 35 additions & 0 deletions internal/pkg/crawl/extractor/xml.go
@@ -0,0 +1,35 @@
package extractor

import (
"io"
"net/http"
"net/url"
"strings"

"github.com/clbanning/mxj/v2"
)

func XML(resp *http.Response) (URLs []*url.URL, err error) {
xmlBody, err := io.ReadAll(resp.Body)
if err != nil {
return nil, err
}

mv, err := mxj.NewMapXml(xmlBody)
if err != nil {
return nil, err
}

for _, value := range mv.LeafValues() {
if _, ok := value.(string); ok {
if strings.HasPrefix(value.(string), "http") {
URL, err := url.Parse(value.(string))
if err == nil {
URLs = append(URLs, URL)
}
}
}
}

return URLs, nil
}
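
As an aside, a minimal sketch of a test that could exercise the new XML extractor in the same style as json_test.go. It is not part of this diff; the sitemap-style body and expected URL are made up.

package extractor

import (
	"bytes"
	"io"
	"net/http"
	"testing"
)

func TestXML(t *testing.T) {
	// Sitemap-like body; the single <loc> value is the only leaf that looks like a URL.
	body := `<urlset><url><loc>https://example.com/page1</loc></url></urlset>`

	resp := &http.Response{
		Body: io.NopCloser(bytes.NewBufferString(body)),
	}

	gotURLs, err := XML(resp)
	if err != nil {
		t.Fatalf("XML() error = %v", err)
	}

	if len(gotURLs) != 1 || gotURLs[0].String() != "https://example.com/page1" {
		t.Errorf("XML() = %v, want [https://example.com/page1]", gotURLs)
	}
}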
2 changes: 1 addition & 1 deletion internal/pkg/crawl/outlinks.go
@@ -111,7 +111,7 @@ func (c *Crawl) queueOutlinks(outlinks []*url.URL, item *queue.Item, wg *sync.Wa
}
}

- if !c.UseHQ {
+ if !c.UseHQ && len(items) > 0 {
err := c.Queue.BatchEnqueue(items...)
if err != nil {
c.Log.Error("unable to enqueue outlinks, discarding", "error", err)