Skip to content

Commit

Permalink
fix: add ability to track referrer/parent page and link
Browse files Browse the repository at this point in the history
  • Loading branch information
colinramsay committed Jun 5, 2024
1 parent 7053eeb commit dbc477c
Showing 1 changed file with 25 additions and 6 deletions.
31 changes: 25 additions & 6 deletions start.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@ import (
type Configuration struct {
Debug bool `json:"debug"`
UserAgent string `json:"userAgent"`
HtmlCache struct {
HtmlCache struct {
Directory string `json:"directory"`
} `json:"htmlCache"`
PdfCache struct {
Directory string `json:"directory"`
} `json:"pdfCache"`
Request struct {
Request struct {
TimeoutInMs int `json:"timeoutInMs"`
DomainGlob string `json:"domainGlob"`
Parallelism int `json:"parellelism"`
Expand All @@ -56,8 +56,15 @@ type Configuration struct {
// HtmlSelectorTemplateVars is the data handed to a templated HTML selector
// (a selector string containing "{{") when that template is executed for a
// scraped page.
type HtmlSelectorTemplateVars struct {
// Request is a copy of the request that produced the page being processed.
Request colly.Request
// Response is a copy of the response received for that request.
Response colly.Response
// Referrer holds the "refUrl" and "linkText" values read from the request
// context, identifying the page and link that led to this request.
Referrer Referrer
}

// Referrer records how the crawler arrived at a page. It is exposed to
// selector templates through HtmlSelectorTemplateVars.
type Referrer struct {
	// Url is the address of the page on which the followed link was found
	// (stored in the request context as "refUrl"); LinkText is the text of
	// that link's anchor element (stored as "linkText").
	Url, LinkText string
}


type PdfSelectorTemplateVars struct {
Response colly.Response
Request colly.Request
Expand Down Expand Up @@ -135,7 +142,7 @@ func regexpFromConfig(input []string) []*regexp.Regexp {
}

func main() {

log.Println("Starting...")
testUrlPtr := flag.String("testUrl", "", "A single URL. When provided, will show the output from that URL only.")

flag.Parse()
Expand Down Expand Up @@ -172,6 +179,7 @@ func main() {
}
options = append(options, colly.Async(true))

log.Println("Creating collector...")
c := colly.NewCollector(options...)

c.SetRequestTimeout(time.Duration(configuration.Request.TimeoutInMs) * time.Millisecond)
Expand All @@ -192,6 +200,7 @@ func main() {
})

c.OnHTML("html", func(htmlEl *colly.HTMLElement) {
log.Println("Starting doc...")
document := make(map[string]string)

htmlEl.DOM.Find("script,style,link,form").Remove()
Expand All @@ -201,7 +210,7 @@ func main() {
if strings.Contains(selector, "{{") {
t := template.Must(template.New("selectorTpl").Funcs(sprig.TxtFuncMap()).Parse(selector))
var tpl bytes.Buffer
data := HtmlSelectorTemplateVars{Request: *htmlEl.Request, Response: *htmlEl.Response}
data := HtmlSelectorTemplateVars{Request: *htmlEl.Request, Response: *htmlEl.Response, Referrer: Referrer{ Url: htmlEl.Request.Ctx.Get("refUrl"), LinkText: htmlEl.Request.Ctx.Get("linkText") }}
err := t.Execute(&tpl, data)

if err != nil {
Expand All @@ -223,8 +232,15 @@ func main() {
}

htmlEl.ForEach("a[href]", func(_ int, el *colly.HTMLElement) {
htmlEl.Request.Visit(el.Attr("href"))
ctx := colly.NewContext()
ctx.Put("refUrl", el.Request.URL.String())
ctx.Put("linkText", el.Text)

c.Request("GET",
el.Request.AbsoluteURL(el.Attr("href")),
nil, ctx, nil)

// htmlEl.Request.Visit(el.Attr("href"))
})
} else {
fmt.Println(document)
Expand All @@ -236,11 +252,12 @@ func main() {
err := os.Mkdir(configuration.PdfCache.Directory, 0755)

if err != nil {
log.Fatal(err)
log.Fatal("Error creating PDF cache:", configuration.PdfCache.Directory, err)
}
}

c.OnResponse(func(resp *colly.Response) {
log.Println("response")
ext := filepath.Ext(resp.Request.URL.Path)

if ext == ".pdf" {
Expand Down Expand Up @@ -279,6 +296,8 @@ func main() {
panic(err)
}
val = tpl.String()

log.Print(val)
}
document[key] = val
}
Expand Down

0 comments on commit dbc477c

Please sign in to comment.