Minimum delay based on download finish time #180

Open · wants to merge 1 commit into master
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,12 @@
- Upgrade gradle-node-plugin to version 2.2.4
- Upgrade gradle wrapper to version 6.6.1
- Upgrade `crawler-commons` to version 1.1
- Minimum delay between requests now considers the time when the download
  actually finished, not the time when the URL was initially scheduled to be
  downloaded (the previous behavior disregarded processing time between scheduling and the actual download)
- Refactored FetchedResultHandler to simply notify the LinkStorage that
  the download finished and to delegate data processing to other handlers
  according to the link type


## Version 0.12.0 (2020-01-18)
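The first changelog entry above changes when the per-domain politeness clock starts counting: from the download finish time rather than the scheduling time. The Java sketch below is a minimal illustration of that policy, not actual ACHE code; the class DomainDelaySketch and its methods are hypothetical names used only for this example.

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

// Illustrative only: tracks, per domain, when the last download finished and
// allows a new download only after a minimum delay counted from that moment.
public class DomainDelaySketch {

    private final long minimumDelayMs;
    private final Map<String, Long> lastFinishTimeMs = new ConcurrentHashMap<>();

    public DomainDelaySketch(long minimumDelayMs) {
        this.minimumDelayMs = minimumDelayMs;
    }

    /** Called when a download completes (or fails), recording the finish time. */
    public void notifyDownloadFinished(String domain) {
        lastFinishTimeMs.put(domain, System.currentTimeMillis());
    }

    /** The domain may be fetched again only after minimumDelayMs have passed since the last finish. */
    public boolean canDownloadNow(String domain) {
        Long lastFinish = lastFinishTimeMs.get(domain);
        return lastFinish == null
                || System.currentTimeMillis() - lastFinish >= minimumDelayMs;
    }
}

Measuring from the finish time guarantees at least the configured delay between the end of one request and the start of the next to the same server, even when downloads are slow or spend time queued inside the downloader.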
24 changes: 8 additions & 16 deletions src/main/java/focusedCrawler/crawler/async/AsyncCrawler.java
@@ -2,15 +2,13 @@

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.util.concurrent.AbstractExecutionThreadService;

import focusedCrawler.config.Configuration;
import focusedCrawler.crawler.async.HttpDownloader.Callback;
import focusedCrawler.crawler.cookies.Cookie;
import focusedCrawler.crawler.cookies.CookieUtils;
import focusedCrawler.link.LinkStorage;
@@ -26,9 +24,9 @@ public class AsyncCrawler extends AbstractExecutionThreadService {
private final TargetStorage targetStorage;
private final LinkStorage linkStorage;
private final HttpDownloader downloader;
private final Map<LinkRelevance.Type, HttpDownloader.Callback> handlers = new HashMap<>();
private MetricsManager metricsManager;
private Configuration config;
private final FetchedResultHandler fetchedResultHandler;
private final MetricsManager metricsManager;
private final Configuration config;

public AsyncCrawler(String crawlerId, TargetStorage targetStorage, LinkStorage linkStorage,
Configuration config, String dataPath, MetricsManager metricsManager) {
@@ -41,10 +39,9 @@ public AsyncCrawler(String crawlerId, TargetStorage targetStorage, LinkStorage l
HttpDownloaderConfig downloaderConfig = config.getCrawlerConfig().getDownloaderConfig();
this.downloader = new HttpDownloader(downloaderConfig, dataPath, metricsManager);

this.handlers.put(LinkRelevance.Type.FORWARD, new FetchedResultHandler(crawlerId, targetStorage));
this.handlers.put(LinkRelevance.Type.SITEMAP, new SitemapXmlHandler(linkStorage));
this.handlers.put(LinkRelevance.Type.ROBOTS, new RobotsTxtHandler(linkStorage,
downloaderConfig.getUserAgentName()));
String userAgentName = downloaderConfig.getUserAgentName();
this.fetchedResultHandler = new FetchedResultHandler(crawlerId, targetStorage, linkStorage,
userAgentName);

Runtime.getRuntime().addShutdownHook(new Thread() {
public void run() {
@@ -58,14 +55,9 @@ public void run() {
protected void run() {
while (isRunning()) {
try {
LinkRelevance link = (LinkRelevance) linkStorage.select(null);
LinkRelevance link = linkStorage.select();
if (link != null) {
Callback handler = handlers.get(link.getType());
if (handler == null) {
logger.error("No registered handler for link type: " + link.getType());
continue;
}
downloader.dipatchDownload(link, handler);
downloader.dipatchDownload(link, fetchedResultHandler);
}
} catch (DataNotFoundException e) {
// There are no more links available in the frontier right now
src/main/java/focusedCrawler/crawler/async/AsyncCrawlerConfig.java
@@ -3,7 +3,6 @@
import java.io.IOException;

import com.fasterxml.jackson.annotation.JsonUnwrapped;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

@@ -16,7 +15,7 @@ public AsyncCrawlerConfig() {
// Required for de-serialization
}

public AsyncCrawlerConfig(JsonNode config, ObjectMapper objectMapper) throws JsonProcessingException, IOException {
public AsyncCrawlerConfig(JsonNode config, ObjectMapper objectMapper) throws IOException {
objectMapper.readerForUpdating(this).readValue(config);
}

src/main/java/focusedCrawler/crawler/async/FetchedResultHandler.java
@@ -1,64 +1,65 @@
package focusedCrawler.crawler.async;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import focusedCrawler.crawler.async.HttpDownloader.Callback;
import focusedCrawler.crawler.crawlercommons.fetcher.AbortedFetchException;
import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult;
import focusedCrawler.link.LinkStorage;
import focusedCrawler.link.frontier.LinkRelevance;
import focusedCrawler.target.TargetStorage;
import focusedCrawler.target.model.Page;
import focusedCrawler.target.model.ParsedData;
import focusedCrawler.util.parser.PaginaURL;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class FetchedResultHandler implements HttpDownloader.Callback {

private static final Logger logger = LoggerFactory.getLogger(FetchedResultHandler.class);

private String crawlerId;
private TargetStorage targetStorage;

public FetchedResultHandler(String crawlerId, TargetStorage targetStorage) {
this.crawlerId = crawlerId;
this.targetStorage = targetStorage;
private final SitemapXmlHandler sitemapXmlHandler;
private final ForwardLinkHandler forwardLinkHandler;
private final RobotsTxtHandler robotsTxtHandler;
private LinkStorage linkStorage;

public FetchedResultHandler(String crawlerId, TargetStorage targetStorage,
LinkStorage linkStorage, String userAgentName) {
this.linkStorage = linkStorage;
this.forwardLinkHandler = new ForwardLinkHandler(crawlerId, targetStorage);
this.sitemapXmlHandler = new SitemapXmlHandler(linkStorage);
this.robotsTxtHandler = new RobotsTxtHandler(linkStorage, userAgentName);
}

@Override
public void completed(LinkRelevance link, FetchedResult response) {

int statusCode = response.getStatusCode();
if(statusCode >= 200 && statusCode < 300) {
processData(link, response);
}
//else {
// TODO: Update metadata about page visits in link storage
//}
linkStorage.notifyDownloadFinished(link);
Callback handler = getDownloadHandler(link);
handler.completed(link, response);
}

@Override
public void failed(LinkRelevance link, Exception e) {
if(e instanceof AbortedFetchException) {
linkStorage.notifyDownloadFinished(link);
if (e instanceof AbortedFetchException) {
AbortedFetchException afe = (AbortedFetchException) e;
logger.info("Download aborted: \n>URL: {}\n>Reason: {}", link.getURL().toString(), afe.getAbortReason());
logger.info("Download aborted: \n>URL: {}\n>Reason: {}", link.getURL().toString(),
afe.getAbortReason());
} else {
logger.info("Failed to download URL: {}\n>Reason: {}", link.getURL().toString(), e.getMessage());
logger.info("Failed to download URL: {}\n>Reason: {}", link.getURL().toString(),
e.getMessage());
}
Callback handler = getDownloadHandler(link);
handler.failed(link, e);
}

private void processData(LinkRelevance link, FetchedResult response) {
try {
Page page = new Page(response);
page.setLinkRelevance(link);
page.setCrawlerId(crawlerId);
if (page.isHtml()) {
PaginaURL pageParser = new PaginaURL(page);
page.setParsedData(new ParsedData(pageParser));
}
targetStorage.insert(page);

} catch (Exception e) {
logger.error("Problem while processing data.", e);

private Callback getDownloadHandler(LinkRelevance link) {
switch (link.getType()) {
case FORWARD:
return forwardLinkHandler;
case ROBOTS:
return robotsTxtHandler;
case SITEMAP:
return sitemapXmlHandler;
default:
// There should be a handler for each link type, so this shouldn't happen
throw new IllegalStateException("No handler for link type: " + link.getType());
}
}

}
}
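Since FetchedResultHandler now owns both the finish notification and the per-type dispatch, a focused unit test can pin that contract down. The sketch below is hypothetical and not part of this PR: it assumes JUnit 4 and Mockito are on the classpath, that LinkStorage, TargetStorage and LinkRelevance are mockable, and that LinkRelevance.getURL() returns a java.net.URL; constructor and method names follow the diff above.

package focusedCrawler.crawler.async;

import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;

import java.net.URL;

import org.junit.Test;

import focusedCrawler.link.LinkStorage;
import focusedCrawler.link.frontier.LinkRelevance;
import focusedCrawler.target.TargetStorage;

public class FetchedResultHandlerSketchTest {

    @Test
    public void failedDownloadStillNotifiesLinkStorage() throws Exception {
        LinkStorage linkStorage = mock(LinkStorage.class);
        TargetStorage targetStorage = mock(TargetStorage.class);
        LinkRelevance link = mock(LinkRelevance.class);
        when(link.getType()).thenReturn(LinkRelevance.Type.FORWARD);
        when(link.getURL()).thenReturn(new URL("http://example.com/"));

        FetchedResultHandler handler =
                new FetchedResultHandler("crawler-1", targetStorage, linkStorage, "TestAgent");

        // A failure that is not an AbortedFetchException takes the generic logging path
        // and is then delegated to the handler for the link's type (FORWARD here).
        handler.failed(link, new Exception("connection timed out"));

        // The finish-time notification happens regardless of the outcome, so the
        // minimum delay is measured from when the download actually ended.
        verify(linkStorage).notifyDownloadFinished(link);
    }
}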
57 changes: 57 additions & 0 deletions src/main/java/focusedCrawler/crawler/async/ForwardLinkHandler.java
@@ -0,0 +1,57 @@
package focusedCrawler.crawler.async;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult;
import focusedCrawler.link.frontier.LinkRelevance;
import focusedCrawler.target.TargetStorage;
import focusedCrawler.target.model.Page;
import focusedCrawler.target.model.ParsedData;
import focusedCrawler.util.parser.PaginaURL;

public class ForwardLinkHandler implements HttpDownloader.Callback {

private static final Logger logger = LoggerFactory.getLogger(ForwardLinkHandler.class);

private String crawlerId;
private TargetStorage targetStorage;

public ForwardLinkHandler(String crawlerId, TargetStorage targetStorage) {
this.crawlerId = crawlerId;
this.targetStorage = targetStorage;
}

@Override
public void completed(LinkRelevance link, FetchedResult response) {

int statusCode = response.getStatusCode();
if (statusCode >= 200 && statusCode < 300) {
processPage(link, response);
}
//else {
// TODO: Update metadata about page visits in link storage
//}
}

@Override
public void failed(LinkRelevance link, Exception e) {
}

private void processPage(LinkRelevance link, FetchedResult response) {
try {
Page page = new Page(response);
page.setLinkRelevance(link);
page.setCrawlerId(crawlerId);
if (page.isHtml()) {
PaginaURL pageParser = new PaginaURL(page);
page.setParsedData(new ParsedData(pageParser));
}
targetStorage.insert(page);

} catch (Exception e) {
logger.error("Problem while processing data.", e);
}
}

}
46 changes: 18 additions & 28 deletions src/main/java/focusedCrawler/crawler/async/RobotsTxtHandler.java
@@ -8,7 +8,6 @@

import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;
import focusedCrawler.crawler.crawlercommons.fetcher.AbortedFetchException;
import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult;
import focusedCrawler.link.LinkStorage;
import focusedCrawler.link.frontier.LinkRelevance;
@@ -26,70 +25,61 @@ public RobotsData(LinkRelevance link, SimpleRobotRules robotRules) {
this.robotRules = robotRules;
}
}

private static final Logger logger = LoggerFactory.getLogger(RobotsTxtHandler.class);

private SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
private LinkStorage linkStorage;
private String userAgentName;

public RobotsTxtHandler(LinkStorage linkStorage, String userAgentName) {
this.linkStorage = linkStorage;
this.userAgentName = userAgentName;
}

@Override
public void completed(LinkRelevance link, FetchedResult response) {
int statusCode = response.getStatusCode();
if(statusCode >= 200 && statusCode < 300) {
logger.info("Successfully downloaded URL=["+response.getBaseUrl()+"] HTTP-Response-Code="+statusCode);
if (statusCode >= 200 && statusCode < 300) {
// HTTP 2xx means the request was successful
processRobot(link, response, false);
} else {
logger.info("Server returned bad code for URL=["+response.getBaseUrl()+"] HTTP-Response-Code="+statusCode);
processRobot(link, response, true);
}
}

@Override
public void failed(LinkRelevance link, Exception e) {
if(e instanceof AbortedFetchException) {
AbortedFetchException afe = (AbortedFetchException) e;
logger.info("Download aborted: \n>URL: {}\n>Reason: {}",
link.getURL().toString(), afe.getAbortReason());
} else {
logger.info("Failed to download URL: "+link.getURL().toString(), e.getMessage());
}
processRobot(link, null, true);
}

private void processRobot(LinkRelevance link, FetchedResult response, boolean fetchFailed) {

SimpleRobotRules robotRules;
if(fetchFailed || response == null) {
robotRules = (SimpleRobotRules) parser.failedFetch(HttpStatus.SC_GONE);
}
else {
if (fetchFailed || response == null) {
robotRules = parser.failedFetch(HttpStatus.SC_GONE);
} else {
String contentType = response.getContentType();
boolean isPlainText = (contentType != null) && (contentType.startsWith("text/plain"));
if ((response.getNumRedirects() > 0) && !isPlainText) {
robotRules = (SimpleRobotRules) parser.failedFetch(HttpStatus.SC_GONE);
robotRules = parser.failedFetch(HttpStatus.SC_GONE);
} else {
robotRules = (SimpleRobotRules) parser.parseContent(
robotRules = parser.parseContent(
response.getFetchedUrl(),
response.getContent(),
response.getContentType(),
userAgentName
userAgentName
);
}
}

try {
RobotsData robotsData = new RobotsData(link, robotRules);
linkStorage.insert(robotsData);
} catch (Exception e) {
logger.error("Failed to insert robots.txt data into link storage.", e);
}

}

}