-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkatchr.go
82 lines (68 loc) · 2.05 KB
/
katchr.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
package main
import (
"encoding/json"
"fmt"
"log"
"os"
"strings"
"github.com/gocolly/colly"
)
// Article is a single scraped post in the shape it is serialised to JSON.
type Article struct {
	// Title is emitted under the "title" key.
	Title string `json:"title"`
	// Content is emitted under the "content" key.
	Content string `json:"content"`
	// Link is emitted under the "link" key.
	Link string `json:"link"`
}
// main crawls https://rekt.news/ with a colly collector, records every
// <article> element it encounters as an Article, and rewrites
// ../articles.json with the accumulated list each time a page has finished
// scraping. It exits with a non-zero status if the JSON cannot be encoded or
// if the initial visit fails.
func main() {
	// Instantiate the default collector, restricted to the domains we care about.
	c := colly.NewCollector(
		// Visit only domains: rekt.news, hackerspaces.org
		colly.AllowedDomains("rekt.news", "hackerspaces.org"),
	)

	// A non-nil empty slice marshals as [] rather than null.
	articles := make([]Article, 0)

	// On every <article> element, extract the post and follow its link.
	c.OnHTML("article", func(e *colly.HTMLElement) {
		post := Article{
			Title: e.ChildText(".post-title"),
			// Replace the typographic apostrophe with a plain ASCII one.
			Content: strings.ReplaceAll(e.ChildText("section > p:nth-child(1)"), "’", "'"),
			Link:    e.Request.AbsoluteURL(e.ChildAttr("article a[href]", "href")),
		}

		// Append the article to the list, filtering out tagged articles.
		if !strings.Contains(post.Link, "tag=") {
			articles = append(articles, post)
		}
		fmt.Printf("Article found: %q\n", post)

		// Follow the link found on the page. The error is deliberately
		// discarded: request failures are already printed by the OnError
		// handler below, and rejections (for example a link outside
		// AllowedDomains) are an expected part of crawling.
		// NOTE(review): confirm against the colly version in go.mod that no
		// other error class is returned here that should stop the crawl.
		_ = c.Visit(post.Link)
	})

	// Before making a request print "Visiting ...", unless it is a tag URL.
	c.OnRequest(func(r *colly.Request) {
		// Abort if the URL is a tag; we only care about articles.
		if strings.Contains(r.URL.RawQuery, "tag") {
			r.Abort()
			return
		}
		fmt.Println("Visiting", r.URL.String())
	})

	c.OnResponse(func(r *colly.Response) {
		fmt.Println("Got a response from", r.Request.URL)
	})

	// Set error handler.
	c.OnError(func(r *colly.Response, err error) {
		fmt.Println("Got this error:", err)
	})

	// After each page has been scraped, persist everything collected so far.
	c.OnScraped(func(r *colly.Response) {
		fmt.Println("Finished", r.Request.URL)

		js, err := json.MarshalIndent(articles, "", " ")
		if err != nil {
			log.Fatal(err)
		}

		fmt.Println("Writing data to file")
		if err := os.WriteFile("../articles.json", js, 0664); err != nil {
			// Report the failure instead of swallowing it; the next scraped
			// page will attempt the write again.
			log.Printf("writing ../articles.json: %v", err)
			return
		}
		fmt.Println("Data written to file successfully")
	})

	// Start scraping on https://rekt.news; a failure here means the crawl
	// never got going, so surface it through the exit status.
	if err := c.Visit("https://rekt.news/"); err != nil {
		log.Fatalf("visiting https://rekt.news/: %v", err)
	}
}