feat: add crawl time limiters to ensure the crawl doesn't go past a specific amount of time as specified

Co-authored-by: Corentin Barreau <[email protected]>
NGTmeaty and CorentinB committed Jan 13, 2024
1 parent 7be165a commit a8bd42a
Showing 5 changed files with 40 additions and 3 deletions.
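In short, this adds two flags: --crawl-time-limit, a soft deadline after which the crawl sets itself into the finished state, and --crawl-max-time-limit, a hard deadline after which the process force-exits, defaulting to the soft limit plus 10% of it. For illustration, an invocation might look like zeno get url <URL> --crawl-time-limit 3600 --crawl-max-time-limit 4000 (the binary and subcommand names here are assumptions, not part of this diff).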
16 changes: 15 additions & 1 deletion cmd/cmd.go
@@ -141,7 +141,7 @@ var GlobalFlags = []cli.Flag{
 &cli.IntFlag{
 	Name: "max-concurrent-per-domain",
 	Value: 16,
-	Usage: "Maximum number of concurrent requests per domain",
+	Usage: "Maximum number of concurrent requests per domain.",
 	Destination: &config.App.Flags.MaxConcurrentRequestsPerDomain,
 },
 &cli.IntFlag{
@@ -151,6 +151,20 @@ var GlobalFlags = []cli.Flag{
 	Destination: &config.App.Flags.RateLimitDelay,
 },

+&cli.IntFlag{
+	Name: "crawl-time-limit",
+	Value: 0,
+	Usage: "Number of seconds until the crawl will automatically set itself into the finished state.",
+	Destination: &config.App.Flags.CrawlTimeLimit,
+},
+
+&cli.IntFlag{
+	Name: "crawl-max-time-limit",
+	Value: 0,
+	Usage: "Number of seconds until the crawl will automatically panic. Defaults to crawl-time-limit + (crawl-time-limit / 10).",
+	Destination: &config.App.Flags.MaxCrawlTimeLimit,
+},
+
 // Proxy flags
 &cli.StringFlag{
 	Name: "proxy",
9 changes: 9 additions & 0 deletions cmd/utils.go
@@ -57,6 +57,15 @@ func InitCrawlWithCMD(flags config.Flags) *crawl.Crawl {
 	c.HTTPTimeout = flags.HTTPTimeout
 	c.MaxConcurrentRequestsPerDomain = flags.MaxConcurrentRequestsPerDomain
 	c.RateLimitDelay = flags.RateLimitDelay
+	c.CrawlTimeLimit = flags.CrawlTimeLimit
+
+	// Default --crawl-max-time-limit to 10% more than --crawl-time-limit
+	if flags.MaxCrawlTimeLimit == 0 && flags.CrawlTimeLimit != 0 {
+		c.MaxCrawlTimeLimit = flags.CrawlTimeLimit + (flags.CrawlTimeLimit / 10)
+	} else {
+		c.MaxCrawlTimeLimit = flags.MaxCrawlTimeLimit
+	}
+
 	c.MaxRetry = flags.MaxRetry
 	c.MaxRedirect = flags.MaxRedirect
 	c.MaxHops = uint8(flags.MaxHops)
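For example, --crawl-time-limit 3600 with no explicit --crawl-max-time-limit yields a hard limit of 3600 + 3600/10 = 3960 seconds; note this is integer division.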
2 changes: 2 additions & 0 deletions config/config.go
@@ -23,6 +23,8 @@ type Flags struct {
 	MaxRetry                       int
 	MaxConcurrentRequestsPerDomain int
 	RateLimitDelay                 int
+	CrawlTimeLimit                 int
+	MaxCrawlTimeLimit              int
 
 	Proxy       string
 	BypassProxy cli.StringSlice
12 changes: 12 additions & 0 deletions internal/pkg/crawl/crawl.go
@@ -60,6 +60,8 @@ type Crawl struct {
 	HTTPTimeout                    int
 	MaxConcurrentRequestsPerDomain int
 	RateLimitDelay                 int
+	CrawlTimeLimit                 int
+	MaxCrawlTimeLimit              int
 	DisableAssetsCapture           bool
 	CaptureAlternatePages          bool
 	DomainsCrawl                   bool
@@ -124,6 +126,16 @@ func (c *Crawl) Start() (err error) {
 	c.HQChannelsWg = new(sync.WaitGroup)
 	regexOutlinks = xurls.Relaxed()
 
+	// Setup the --crawl-time-limit clock
+	if c.CrawlTimeLimit != 0 {
+		go func() {
+			time.Sleep(time.Second * time.Duration(c.CrawlTimeLimit))
+			go c.finish()
+			time.Sleep((time.Duration(c.MaxCrawlTimeLimit) * time.Second) - (time.Duration(c.CrawlTimeLimit) * time.Second))
+			logError.Fatal("Crawl time limit reached, exiting..")
+		}()
+	}
+
 	// Setup logging, every day at midnight UTC a new setup
 	// is triggered in order to change the ES index's name
 	if c.ElasticSearchURL != "" {
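The goroutine added above is a two-stage clock: after CrawlTimeLimit seconds it starts a graceful finish, and if the process is still alive at MaxCrawlTimeLimit it force-exits via logError.Fatal. A minimal, self-contained sketch of the same pattern (the values, logger, and finish stub are stand-ins, not the crawler's real API):

package main

import (
	"log"
	"time"
)

// finish stands in for Crawl.finish(), which in the real code saves
// the frontier and calls os.Exit(0).
func finish() {
	log.Println("finishing crawl gracefully...")
}

func main() {
	crawlTimeLimit := 10                                    // soft deadline, in seconds
	maxCrawlTimeLimit := crawlTimeLimit + crawlTimeLimit/10 // hard deadline (the 10% default)

	go func() {
		// Stage 1: soft deadline reached, begin a graceful finish.
		time.Sleep(time.Duration(crawlTimeLimit) * time.Second)
		go finish()
		// Stage 2: if the graceful finish hasn't exited the process
		// by the hard deadline, force-exit.
		time.Sleep(time.Duration(maxCrawlTimeLimit-crawlTimeLimit) * time.Second)
		log.Fatal("Crawl time limit reached, exiting..")
	}()

	select {} // stand-in for the crawl's main work loop
}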
4 changes: 2 additions & 2 deletions internal/pkg/crawl/finish.go
@@ -22,7 +22,6 @@ func (crawl *Crawl) catchFinish() {
 		if crawl.ActiveWorkers.Value() == 0 && crawl.Frontier.QueueCount.Value() == 0 && !crawl.Finished.Get() && (crawl.CrawledSeeds.Value()+crawl.CrawledAssets.Value() > 0) {
 			logrus.Warning("No additional URL to archive, finishing")
 			crawl.finish()
-			os.Exit(0)
 		}
 	}
 }
@@ -92,6 +91,8 @@ func (crawl *Crawl) finish() {
 	crawl.Frontier.Save()
 
 	logrus.Warning("Finished!")
+
+	os.Exit(0)
 }
 
 func (crawl *Crawl) setupCloseHandler() {
@@ -101,5 +102,4 @@
logrus.Warning("CTRL+C catched.. cleaning up and exiting.")
signal.Stop(c)
crawl.finish()
os.Exit(0)
}
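Note the design change here: os.Exit(0) now lives inside finish() itself rather than at each call site, so every path that ends the crawl (the idle check in catchFinish, the CTRL+C handler, and the new --crawl-time-limit clock in crawl.go) terminates the process the same way, after the frontier has been saved.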
