diff --git a/README.md b/README.md
index 1893513..4d300e6 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ There are still some to-do's, and some refactoring is needed, but the app is alr
 There is nothing special to do here, just download the code, build it as you would do with any other Go app, and you are set to go.
 
 ```bash
-$ git clone https://github.com/antsanchez/go-download-web
+$ git clone https://github.com/CalderWhite/go-download-web
 $ cd go-download-web
 $ go build
 ```
diff --git a/go.mod b/go.mod
index 1d5645f..0a23f98 100644
--- a/go.mod
+++ b/go.mod
@@ -1,4 +1,4 @@
-module github.com/antsanchez/go-download-web
+module github.com/CalderWhite/go-download-web
 
 go 1.15
 
diff --git a/main.go b/main.go
index f7e5379..a088674 100644
--- a/main.go
+++ b/main.go
@@ -19,10 +19,11 @@ import (
 	"net/http"
 	"os"
 	"strings"
+	"sync"
 	"time"
 
-	"github.com/antsanchez/go-download-web/scraper"
-	"github.com/antsanchez/go-download-web/sitemap"
+	"github.com/CalderWhite/go-download-web/scraper"
+	"github.com/CalderWhite/go-download-web/sitemap"
 )
 
 type Flags struct {
@@ -168,25 +169,62 @@ func main() {
 
 	log.Println("\nFinished scraping the site...")
 
+	// Worker pool for downloading attachments.
+	attachmentJobs := make(chan string, len(files))
+	var wg sync.WaitGroup
+
+	// files is read and appended to concurrently by the workers below,
+	// so every access to it is guarded by this mutex.
+	var filesMu sync.Mutex
+
 	log.Println("\nDownloading attachments...")
-	for _, attachedFile := range files {
-		if strings.Contains(attachedFile, ".css") {
-			moreAttachments := s.GetInsideAttachments(attachedFile)
-			for _, link := range moreAttachments {
-				if !s.IsURLInSlice(link, files) {
-					log.Println("Appended: ", link)
-					files = append(files, link)
-					go func() {
-						err := s.SaveAttachment(link)
-						if err != nil {
-							log.Println(err)
-						}
-					}()
-				}
-			}
-		}
-	}
+	for i := 1; i <= *flags.Simultaneus; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			for attachedFile := range attachmentJobs {
+				if strings.Contains(attachedFile, ".css") || strings.Contains(attachedFile, ".csv") || strings.Contains(attachedFile, ".parquet") || strings.Contains(attachedFile, ".tar") {
+					log.Println(attachedFile)
+					if err := s.SaveAttachment(attachedFile); err != nil {
+						log.Println(err)
+					}
+
+					moreAttachments := s.GetInsideAttachments(attachedFile)
+					for _, link := range moreAttachments {
+						filesMu.Lock()
+						isNew := !s.IsURLInSlice(link, files)
+						if isNew {
+							log.Println("Appended: ", link)
+							files = append(files, link)
+						}
+						filesMu.Unlock()
+						if !isNew {
+							continue
+						}
+						wg.Add(1)
+						// Pass link as an argument so each goroutine gets
+						// its own copy instead of the shared loop variable.
+						go func(link string) {
+							defer wg.Done()
+							if err := s.SaveAttachment(link); err != nil {
+								log.Println(err)
+							}
+						}(link)
+					}
+				}
+			}
+		}()
+	}
+
+	for _, attachedFile := range files {
+		attachmentJobs <- attachedFile
+	}
+
+	close(attachmentJobs)
+
+	wg.Wait()
+
 	log.Println("Creating Sitemap...")
 	err = sitemap.CreateSitemap(forSitemap, *flags.Path)
 	if err != nil {
diff --git a/scraper/helpers.go b/scraper/helpers.go
index c52a647..42f3b8e 100644
--- a/scraper/helpers.go
+++ b/scraper/helpers.go
@@ -8,11 +8,11 @@ import (
 	"regexp"
 	"strings"
 
-	"github.com/antsanchez/go-download-web/commons"
+	"github.com/CalderWhite/go-download-web/commons"
 )
 
 var (
".ttf", ".msi", ".lnk", ".dll", ".db", ".css", ".csv", ".parquet", ".tar"} falseURLs = []string{"mailto:", "javascript:", "tel:", "whatsapp:", "callto:", "wtai:", "sms:", "market:", "geopoint:", "ymsgr:", "msnim:", "gtalk:", "skype:"} validURL = regexp.MustCompile(`\(([^()]*)\)`) validCSS = regexp.MustCompile(`\{(\s*?.*?)*?\}`) diff --git a/scraper/save.go b/scraper/save.go index ef82b28..a18db59 100644 --- a/scraper/save.go +++ b/scraper/save.go @@ -7,7 +7,7 @@ import ( "os" "strings" - "github.com/antsanchez/go-download-web/commons" + "github.com/CalderWhite/go-download-web/commons" ) // Download a single link diff --git a/scraper/scrapper.go b/scraper/scrapper.go index 6ffafc8..dedcf2a 100644 --- a/scraper/scrapper.go +++ b/scraper/scrapper.go @@ -5,6 +5,7 @@ import ( "fmt" "log" "net/http" + "net/url" "strings" "golang.org/x/net/html" @@ -155,6 +156,8 @@ func (s *Scraper) getLinks(domain string) (page Page, attachments []string, err } // Get links + // super lazy alphanumeric checking because I don't want to do regex. + alphanumeric := "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0987654321" if n.Type == html.ElementNode && n.Data == "a" { ok := false newLink := Links{} @@ -162,6 +165,10 @@ func (s *Scraper) getLinks(domain string) (page Page, attachments []string, err for _, a := range n.Attr { if a.Key == "href" { link, err := resp.Request.URL.Parse(a.Val) + + if strings.Contains(alphanumeric, string(a.Val[0])) { + link, _ = url.Parse(resp.Request.URL.Scheme + "://" + resp.Request.URL.Hostname() + "/" + a.Val) + } if err == nil { foundLink := s.sanitizeURL(link.String()) if s.isValidLink(foundLink) {