diff --git a/Dockerfile.es b/Dockerfile.es
index 931be02..f5a2698 100644
--- a/Dockerfile.es
+++ b/Dockerfile.es
@@ -1,4 +1,4 @@
-FROM docker.elastic.co/elasticsearch/elasticsearch-oss:6.3.0 as builder
+FROM docker.elastic.co/elasticsearch/elasticsearch:7.12.0 as builder
 
 ADD https://raw.githubusercontent.com/vishnubob/wait-for-it/e1f115e4ca285c3c24e847c4dd4be955e0ed51c2/wait-for-it.sh /utils/wait-for-it.sh
 
@@ -9,6 +9,6 @@ RUN /usr/local/bin/docker-entrypoint.sh elasticsearch -p /tmp/epid & /bin/bash /
     ./bin/create-es-indices.sh ; \
     kill $(cat /tmp/epid) && wait $(cat /tmp/epid); exit 0;
 
-FROM docker.elastic.co/elasticsearch/elasticsearch-oss:6.3.0
+FROM docker.elastic.co/elasticsearch/elasticsearch:7.12.0
 
 COPY --from=builder /usr/share/elasticsearch/data /usr/share/elasticsearch/data
diff --git a/administration-ui/pom.xml b/administration-ui/pom.xml
index d46d6b9..1b710fe 100644
--- a/administration-ui/pom.xml
+++ b/administration-ui/pom.xml
@@ -5,7 +5,7 @@
     <parent>
         <artifactId>crawling-framework</artifactId>
         <groupId>lt.tokenmill.crawling</groupId>
-        <version>0.3.4-SNAPSHOT</version>
+        <version>0.3.5-SNAPSHOT</version>
     </parent>
     <modelVersion>4.0.0</modelVersion>
diff --git a/analysis-ui/pom.xml b/analysis-ui/pom.xml
index 53d6b5b..f893519 100644
--- a/analysis-ui/pom.xml
+++ b/analysis-ui/pom.xml
@@ -5,7 +5,7 @@
     <parent>
         <artifactId>crawling-framework</artifactId>
         <groupId>lt.tokenmill.crawling</groupId>
-        <version>0.3.4-SNAPSHOT</version>
+        <version>0.3.5-SNAPSHOT</version>
     </parent>
     <modelVersion>4.0.0</modelVersion>
diff --git a/cache/pom.xml b/cache/pom.xml
new file mode 100644
index 0000000..b033500
--- /dev/null
+++ b/cache/pom.xml
@@ -0,0 +1,96 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>crawling-framework</artifactId>
+        <groupId>lt.tokenmill.crawling</groupId>
+        <version>0.3.5-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <artifactId>cache</artifactId>
+
+    <properties>
+        <jackson.version>2.11.1</jackson.version>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>redis.clients</groupId>
+            <artifactId>jedis</artifactId>
+            <version>3.3.0</version>
+            <type>jar</type>
+            <scope>compile</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-pool2</artifactId>
+            <version>2.3</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-core</artifactId>
+            <version>${jackson.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-databind</artifactId>
+            <version>${jackson.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.fasterxml.jackson.jr</groupId>
+            <artifactId>jackson-jr-objects</artifactId>
+            <version>${jackson.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>jcl-over-slf4j</artifactId>
+            <version>${slf4j.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>commons-codec</groupId>
+            <artifactId>commons-codec</artifactId>
+            <version>1.15</version>
+        </dependency>
+
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>4.12</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+    <profiles>
+        <profile>
+            <id>release</id>
+            <build>
+                <plugins>
+                    <plugin>
+                        <groupId>org.apache.maven.plugins</groupId>
+                        <artifactId>maven-source-plugin</artifactId>
+                    </plugin>
+
+                    <plugin>
+                        <groupId>org.apache.maven.plugins</groupId>
+                        <artifactId>maven-jar-plugin</artifactId>
+                    </plugin>
+
+                    <plugin>
+                        <groupId>org.apache.maven.plugins</groupId>
+                        <artifactId>maven-javadoc-plugin</artifactId>
+                    </plugin>
+                </plugins>
+            </build>
+        </profile>
+    </profiles>
+</project>
\ No newline at end of file
diff --git a/cache/src/main/java/lt/tokenmill/crawling/cache/CacheConstants.java b/cache/src/main/java/lt/tokenmill/crawling/cache/CacheConstants.java
new file mode 100644
index 0000000..becfffe
--- /dev/null
+++ b/cache/src/main/java/lt/tokenmill/crawling/cache/CacheConstants.java
@@ -0,0 +1,7 @@
+package lt.tokenmill.crawling.cache;
+
+public class CacheConstants {
+    public static final String REDIS_HOST = "cache.redis.host";
+    public static final String REDIS_PORT = "cache.redis.port";
+    public static final String REDIS_AUTH = "cache.redis.auth";
+}
diff --git a/cache/src/main/java/lt/tokenmill/crawling/cache/UrlProcessingCache.java b/cache/src/main/java/lt/tokenmill/crawling/cache/UrlProcessingCache.java
new file mode 100644
index 0000000..a510afb
--- /dev/null
+++ b/cache/src/main/java/lt/tokenmill/crawling/cache/UrlProcessingCache.java
@@ -0,0 +1,54 @@
+package lt.tokenmill.crawling.cache;
+
+import lt.tokenmill.crawling.cache.providers.CacheProvider;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Set;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import static lt.tokenmill.crawling.cache.utils.FutureUtils.tryGet;
+import static lt.tokenmill.crawling.cache.utils.FutureUtils.waitFor;
+import static lt.tokenmill.crawling.cache.utils.HashUtils.hashKey;
+
+public class UrlProcessingCache {
+    private static final Logger LOG = LoggerFactory.getLogger(UrlProcessingCache.class);
+    private final CacheProvider provider;
+
+    public UrlProcessingCache(CacheProvider provider) {
+        this.provider = provider;
+    }
+
+    public static String parseDomain(String url) {
+        try {
+            URL u = new URL(url);
+            return u.getHost();
+        } catch (MalformedURLException e) {
+            LOG.error("Failed to parse domain from '{}'", url, e);
+            return null;
+        }
+    }
+
+    private String globalNamespace(String domain) {
+        return String.format("global:%s", hashKey(domain));
+    }
+
+    public void addUrl(String url) {
+        String domain = parseDomain(url);
+        // Store the hash so that filterUrls(), which compares hashes, can match it
+        waitFor(provider.addKey(globalNamespace(domain), hashKey(url)));
+    }
+
+    public Set<String> filterUrls(String domain, Collection<String> urls) {
+        return filterUrls(domain, urls.stream());
+    }
+
+    public Set<String> filterUrls(String domain, Stream<String> urls) {
+        Set<String> keys = tryGet(provider.keysInNamespace(globalNamespace(domain)), () -> Collections.emptySet());
+        return urls.filter(k -> !keys.contains(hashKey(k))).collect(Collectors.toSet());
+    }
+}
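How the new cache is meant to be used end to end — a minimal sketch, assuming a Redis instance reachable on the Builder defaults (localhost:6379); the URLs and domain are illustrative:

```java
import java.util.Arrays;
import java.util.Set;

import lt.tokenmill.crawling.cache.UrlProcessingCache;
import lt.tokenmill.crawling.cache.providers.redis.RedisProvider;

public class UrlProcessingCacheExample {
    public static void main(String[] args) {
        // Build a provider with default connection settings and wrap it
        UrlProcessingCache cache = new UrlProcessingCache(RedisProvider.builder().build());

        // Records the URL's hash in the "global:<domain-hash>" set
        cache.addUrl("https://example.com/news/1");

        // Only URLs whose hashes are absent from that set come back
        Set<String> unseen = cache.filterUrls("example.com",
                Arrays.asList("https://example.com/news/1", "https://example.com/news/2"));
        System.out.println(unseen); // [https://example.com/news/2]
    }
}
```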
diff --git a/cache/src/main/java/lt/tokenmill/crawling/cache/datamodel/Pair.java b/cache/src/main/java/lt/tokenmill/crawling/cache/datamodel/Pair.java
new file mode 100644
index 0000000..f1ac8b6
--- /dev/null
+++ b/cache/src/main/java/lt/tokenmill/crawling/cache/datamodel/Pair.java
@@ -0,0 +1,36 @@
+package lt.tokenmill.crawling.cache.datamodel;
+
+import java.util.Objects;
+
+public class Pair<T, U> {
+    private final T key;
+    private final U value;
+
+    public Pair(T key, U value) {
+        this.key = key;
+        this.value = value;
+    }
+
+    public T getKey() {
+        return key;
+    }
+
+    public U getValue() {
+        return value;
+    }
+
+    @Override
+    public boolean equals(Object other) {
+        if (other instanceof Pair) {
+            Pair<?, ?> otherPair = (Pair<?, ?>) other;
+            return key.equals(otherPair.getKey()) && value.equals(otherPair.getValue());
+        }
+        return false;
+    }
+
+    // equals() without hashCode() breaks hash-based collections; keep them consistent
+    @Override
+    public int hashCode() {
+        return Objects.hash(key, value);
+    }
+}
diff --git a/cache/src/main/java/lt/tokenmill/crawling/cache/providers/CacheProvider.java b/cache/src/main/java/lt/tokenmill/crawling/cache/providers/CacheProvider.java
new file mode 100644
index 0000000..17e08bf
--- /dev/null
+++ b/cache/src/main/java/lt/tokenmill/crawling/cache/providers/CacheProvider.java
@@ -0,0 +1,29 @@
+package lt.tokenmill.crawling.cache.providers;
+
+import lt.tokenmill.crawling.cache.datamodel.Pair;
+
+import java.util.Collection;
+import java.util.Set;
+import java.util.concurrent.CompletableFuture;
+
+public interface CacheProvider {
+    <T> CompletableFuture<Boolean> set(String namespace, String key, T value);
+    <T> CompletableFuture<Boolean> setMultiple(String namespace, Set<Pair<String, T>> pairs);
+
+    CompletableFuture<Boolean> addKey(String namespace, String key);
+    CompletableFuture<Boolean> addKeyMultiple(String namespace, Collection<String> keys);
+    CompletableFuture<Boolean> removeKey(String namespace, String key);
+    CompletableFuture<Boolean> moveKey(String source, String dest, String key);
+    CompletableFuture<String> findKey(String namespace, String key);
+
+    <T> CompletableFuture<T> get(Class<T> klass, String namespace, String key);
+    <T> CompletableFuture<Set<T>> getMultiple(Class<T> klass, String namespace, String... keys);
+    <T> CompletableFuture<Set<T>> getMultiple(Class<T> klass, String namespace, Collection<String> keys);
+
+    CompletableFuture<Boolean> contains(String key);
+    CompletableFuture<Boolean> contains(String namespace, String key);
+
+    CompletableFuture<Set<String>> keysInNamespace(String namespace);
+
+    void cleanup();
+}
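Every operation on CacheProvider is asynchronous, so callers can compose steps without blocking a worker thread. A sketch of chaining a get into a set — the copy helper and namespaces are hypothetical:

```java
import java.util.concurrent.CompletableFuture;

import lt.tokenmill.crawling.cache.providers.CacheProvider;

public class CacheComposition {
    // Hypothetical helper: copy a cached value from one namespace to another
    static CompletableFuture<Boolean> copy(CacheProvider cache, String from, String to, String key) {
        return cache.get(String.class, from, key)
                .thenCompose(value -> value == null
                        ? CompletableFuture.completedFuture(false)
                        : cache.set(to, key, value));
    }
}
```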
diff --git a/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/Builder.java b/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/Builder.java
new file mode 100644
index 0000000..52ad901
--- /dev/null
+++ b/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/Builder.java
@@ -0,0 +1,51 @@
+package lt.tokenmill.crawling.cache.providers.redis;
+
+import redis.clients.jedis.JedisPool;
+import redis.clients.jedis.JedisPoolConfig;
+import redis.clients.jedis.Protocol;
+
+import java.util.Optional;
+
+public class Builder {
+    private String host = "localhost";
+    private int port = 6379;
+    private Optional<String> auth = Optional.empty();
+    private int timeout = Protocol.DEFAULT_TIMEOUT;
+
+    public Builder withHost(String host) {
+        this.host = host;
+        return this;
+    }
+
+    public Builder withPort(int port) {
+        this.port = port;
+        return this;
+    }
+
+    public Builder withAuth(String auth) {
+        if (auth != null) {
+            this.auth = Optional.of(auth);
+        }
+        return this;
+    }
+
+    public Builder withTimeoutMillis(int millis) {
+        this.timeout = millis;
+        return this;
+    }
+
+    public RedisProvider build() {
+        JedisPoolConfig config = new JedisPoolConfig();
+        config.setMaxTotal(100);
+        JedisPool pool;
+        if (auth.isPresent()) {
+            pool = new JedisPool(config, host, port, timeout, auth.get());
+        }
+        else {
+            pool = new JedisPool(config, host, port, timeout);
+        }
+
+        return new RedisProvider(pool);
+    }
+
+}
diff --git a/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/RedisKVProvider.java b/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/RedisKVProvider.java
new file mode 100644
index 0000000..11d7a54
--- /dev/null
+++ b/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/RedisKVProvider.java
@@ -0,0 +1,85 @@
+package lt.tokenmill.crawling.cache.providers.redis;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import lt.tokenmill.crawling.cache.datamodel.Pair;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import redis.clients.jedis.Pipeline;
+
+import java.util.Objects;
+import java.util.Set;
+import java.util.concurrent.CompletableFuture;
+import java.util.stream.Collectors;
+
+public class RedisKVProvider {
+    private static final Logger LOG = LoggerFactory.getLogger(RedisKVProvider.class);
+    private final RedisResourceProvider resource;
+    private static final ObjectMapper objectMapper = new ObjectMapper();
+
+    public RedisKVProvider(RedisResourceProvider resourceProvider) {
+        this.resource = resourceProvider;
+    }
+
+    public CompletableFuture<Boolean> contains(String namespace, String key) {
+        return resource.withJedis(redis -> redis.hexists(namespace, key));
+    }
+
+    public <T> CompletableFuture<Boolean> set(String namespace, String key, T value) {
+        return resource.withJedis(redis -> {
+            try {
+                redis.hset(namespace, key, objectMapper.writeValueAsString(value));
+                return true;
+            }
+            catch (Exception ex) {
+                LOG.error("Failed to set value", ex);
+                return false;
+            }
+        });
+    }
+
+    public <T> CompletableFuture<Boolean> setMultiple(String namespace, Set<Pair<String, T>> pairs) {
+        return resource.withJedis(redis -> {
+            Pipeline pipe = redis.pipelined();
+            pipe.multi();
+            for (Pair<String, T> pair : pairs) {
+                try {
+                    pipe.hset(namespace, pair.getKey(), objectMapper.writeValueAsString(pair.getValue()));
+                }
+                catch (Exception ex) {
+                    LOG.error("Failed to set value", ex);
+                }
+            }
+
+            // exec() must be queued before sync() flushes the pipeline
+            pipe.exec();
+            pipe.sync();
+            return true;
+        });
+    }
+
+    public <T> CompletableFuture<T> get(Class<T> klass, String namespace, String key) {
+        return resource.withJedis(redis -> {
+            String data = redis.hget(namespace, key);
+            return parseObj(data, klass);
+        });
+    }
+
+    private <T> T parseObj(String data, Class<T> klass) {
+        try {
+            return objectMapper.readValue(data, klass);
+        }
+        catch (Exception ex) {
+            return null;
+        }
+    }
+
+    public <T> CompletableFuture<Set<T>> getMultiple(Class<T> klass, String namespace, Set<String> keys) {
+        return resource
+                .withPipeline(pipe -> keys.stream().map(k -> pipe.hget(namespace, k)))
+                .thenApply(responses -> responses
+                        .map(data -> parseObj(data, klass))
+                        .filter(Objects::nonNull)
+                        .collect(Collectors.toSet()));
+    }
+}
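Builder usage in isolation — the host and the environment variable are placeholders; withAuth tolerates null, leaving auth as Optional.empty():

```java
import lt.tokenmill.crawling.cache.providers.redis.RedisProvider;

public class ProviderSetup {
    public static RedisProvider connect() {
        return RedisProvider.builder()
                .withHost("redis.internal")            // placeholder host
                .withPort(6379)
                .withAuth(System.getenv("REDIS_AUTH")) // null simply skips auth
                .withTimeoutMillis(2000)
                .build();
    }
}
```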
diff --git a/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/RedisProvider.java b/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/RedisProvider.java
new file mode 100644
index 0000000..3d089c4
--- /dev/null
+++ b/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/RedisProvider.java
@@ -0,0 +1,98 @@
+package lt.tokenmill.crawling.cache.providers.redis;
+
+import lt.tokenmill.crawling.cache.datamodel.Pair;
+import lt.tokenmill.crawling.cache.providers.CacheProvider;
+import redis.clients.jedis.JedisPool;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Set;
+import java.util.concurrent.CompletableFuture;
+import java.util.stream.Collectors;
+
+public class RedisProvider implements CacheProvider {
+    private final RedisResourceProvider resourceProvider;
+    private final RedisKVProvider kvProvider;
+    private final RedisSetProvider setProvider;
+
+    public static Builder builder() {
+        return new Builder();
+    }
+
+    protected RedisProvider(JedisPool pool) {
+        this.resourceProvider = new RedisResourceProvider(pool);
+        this.kvProvider = new RedisKVProvider(resourceProvider);
+        this.setProvider = new RedisSetProvider(resourceProvider);
+    }
+
+    @Override
+    public <T> CompletableFuture<Boolean> set(String namespace, String key, T value) {
+        return kvProvider.set(namespace, key, value);
+    }
+
+    @Override
+    public <T> CompletableFuture<Boolean> setMultiple(String namespace, Set<Pair<String, T>> pairs) {
+        return kvProvider.setMultiple(namespace, pairs);
+    }
+
+    @Override
+    public <T> CompletableFuture<T> get(Class<T> klass, String namespace, String key) {
+        return kvProvider.get(klass, namespace, key);
+    }
+
+    @Override
+    public <T> CompletableFuture<Set<T>> getMultiple(Class<T> klass, String namespace, String... keys) {
+        return getMultiple(klass, namespace, Arrays.asList(keys));
+    }
+
+    @Override
+    public <T> CompletableFuture<Set<T>> getMultiple(Class<T> klass, String namespace, Collection<String> initialKeys) {
+        return kvProvider.getMultiple(klass, namespace, initialKeys.stream().collect(Collectors.toSet()));
+    }
+
+    @Override
+    public CompletableFuture<Boolean> addKey(String namespace, String key) {
+        return setProvider.addKey(namespace, key);
+    }
+
+    @Override
+    public CompletableFuture<Boolean> addKeyMultiple(String namespace, Collection<String> keys) {
+        return setProvider.addKeys(namespace, keys);
+    }
+
+    @Override
+    public CompletableFuture<Boolean> removeKey(String namespace, String key) {
+        return setProvider.removeKey(namespace, key);
+    }
+
+    @Override
+    public CompletableFuture<Boolean> moveKey(String source, String dest, String key) {
+        return setProvider.moveKey(source, dest, key);
+    }
+
+    @Override
+    public CompletableFuture<String> findKey(String namespace, String key) {
+        return setProvider.findKey(namespace, key);
+    }
+
+    @Override
+    public CompletableFuture<Boolean> contains(String key) {
+        return resourceProvider.withJedis(redis -> redis.exists(key));
+    }
+
+    @Override
+    public CompletableFuture<Boolean> contains(String namespace, String key) {
+        return kvProvider.contains(namespace, key);
+    }
+
+    @Override
+    public CompletableFuture<Set<String>> keysInNamespace(String namespace) {
+        return setProvider.keysInNamespace(namespace);
+    }
+
+    @Override
+    public void cleanup() {
+        resourceProvider.withJedis(redis -> redis.flushAll());
+    }
+}
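Values stored through set() are serialized to JSON with Jackson and kept in a Redis hash per namespace, so any Jackson-friendly bean round-trips. A sketch, assuming a local Redis; the bean, namespace, and values are illustrative:

```java
import lt.tokenmill.crawling.cache.providers.CacheProvider;
import lt.tokenmill.crawling.cache.providers.redis.RedisProvider;

public class RoundTrip {
    public static class Article {      // any bean Jackson can serialize
        public String url;
        public String title;
    }

    public static void main(String[] args) throws Exception {
        CacheProvider cache = RedisProvider.builder().build();

        Article article = new Article();
        article.url = "https://example.com/news/1";
        article.title = "Example title";

        cache.set("articles", article.url, article).get();  // HSET articles <url> <json>
        Article restored = cache.get(Article.class, "articles", article.url).get();
        System.out.println(restored.title);                 // Example title
    }
}
```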
diff --git a/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/RedisResourceProvider.java b/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/RedisResourceProvider.java
new file mode 100644
index 0000000..4ee670f
--- /dev/null
+++ b/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/RedisResourceProvider.java
@@ -0,0 +1,74 @@
+package lt.tokenmill.crawling.cache.providers.redis;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import redis.clients.jedis.*;
+import redis.clients.jedis.exceptions.JedisConnectionException;
+
+import java.util.List;
+import java.util.concurrent.CompletableFuture;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+public class RedisResourceProvider {
+    private final JedisPool jedisPool;
+    private static final Logger LOG = LoggerFactory.getLogger(RedisResourceProvider.class);
+    private final static int MAX_RETRIES = 5;
+    private final static long RETRY_TIMEOUT = 300;
+
+    protected RedisResourceProvider(JedisPool pool) {
+        this.jedisPool = pool;
+    }
+
+    public <T> CompletableFuture<T> withJedis(Function<Jedis, T> body) {
+        return CompletableFuture.supplyAsync(() -> {
+            int retryCount = 0;
+
+            while (true) {
+                try (Jedis jedis = jedisPool.getResource()) {
+                    return body.apply(jedis);
+                } catch (JedisConnectionException cex) {
+                    retryCount++;
+                    if (retryCount > MAX_RETRIES) {
+                        throw cex;
+                    }
+                    else {
+                        // Linear backoff, matching the delay the log message reports
+                        long backoff = RETRY_TIMEOUT * retryCount;
+                        LOG.warn("Redis operation has failed due to connection issue. Retrying in {}ms", backoff);
+                        try {
+                            Thread.sleep(backoff);
+                        } catch (InterruptedException iex) {
+                            Thread.currentThread().interrupt();
+                        }
+                    }
+                }
+            }
+        });
+    }
+
+    public <T> CompletableFuture<Stream<T>> withPipeline(Function<Pipeline, Stream<Response<T>>> body) {
+        return withJedis(redis -> {
+            Pipeline pipe = redis.pipelined();
+            pipe.multi();
+            List<Response<T>> results = body.apply(pipe).collect(Collectors.toList());
+
+            Response<List<Object>> resultsResponse = pipe.exec();
+            pipe.sync();
+
+            resultsResponse.get();
+            return results.stream().map(r -> r.get());
+        });
+    }
+}
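withPipeline() batches many Redis commands into a single MULTI/EXEC round trip and resolves their responses after exec. A sketch of batched set-membership checks; the namespace and members are illustrative, and the caller is assumed to hold a provider instance:

```java
import java.util.concurrent.CompletableFuture;
import java.util.stream.Stream;

import lt.tokenmill.crawling.cache.providers.redis.RedisResourceProvider;

public class PipelineUsage {
    // One network round trip for all three SISMEMBER calls
    static CompletableFuture<Stream<Boolean>> checkAll(RedisResourceProvider resourceProvider) {
        return resourceProvider.withPipeline(pipe ->
                Stream.of("hash-1", "hash-2", "hash-3")
                        .map(member -> pipe.sismember("global:some-domain", member)));
    }
}
```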
diff --git a/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/RedisSetProvider.java b/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/RedisSetProvider.java
new file mode 100644
index 0000000..4db1a78
--- /dev/null
+++ b/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/RedisSetProvider.java
@@ -0,0 +1,154 @@
+package lt.tokenmill.crawling.cache.providers.redis;
+
+import lt.tokenmill.crawling.cache.datamodel.Pair;
+import lt.tokenmill.crawling.cache.utils.FutureUtils;
+import lt.tokenmill.crawling.cache.utils.KeyUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import redis.clients.jedis.Jedis;
+import redis.clients.jedis.Pipeline;
+import redis.clients.jedis.Response;
+import redis.clients.jedis.ScanResult;
+
+import java.util.*;
+import java.util.concurrent.CompletableFuture;
+import java.util.stream.Collectors;
+
+public class RedisSetProvider {
+    private static final Logger LOG = LoggerFactory.getLogger(RedisSetProvider.class);
+    private final RedisResourceProvider resource;
+
+    public RedisSetProvider(RedisResourceProvider resourceProvider) {
+        this.resource = resourceProvider;
+    }
+
+    public CompletableFuture<Boolean> addKey(String namespace, String key) {
+        return addKeys(namespace, Arrays.asList(key));
+    }
+
+    public CompletableFuture<Boolean> addKeys(String namespace, Collection<String> keys) {
+        return resource.withJedis(redis -> {
+            Pipeline pipe = redis.pipelined();
+
+            pipe.multi();
+            for (String key : keys) {
+                pipe.sadd(namespace, key);
+            }
+
+            pipe.exec();
+            pipe.sync();
+            return true;
+        });
+    }
+
+    public CompletableFuture<Boolean> removeKey(String namespace, String key) {
+        return resource.withJedis(redis -> {
+            Set<String> keys = KeyUtils.resolveKeys(namespace, redis);
+
+            Pipeline pipe = redis.pipelined();
+            pipe.multi();
+            List<Response<Long>> responses = keys
+                    .stream()
+                    .map(k -> pipe.srem(k, key))
+                    .collect(Collectors.toList());
+
+            pipe.exec();
+            pipe.sync();
+
+            responses.forEach(response -> response.get());
+            return true;
+        });
+    }
+
+    public CompletableFuture<Boolean> moveKey(String source, String dest, String key) {
+        return resource.withJedis(redis -> redis.smove(source, dest, key) == 1);
+    }
+
+    public CompletableFuture<String> findKey(String namespace, String key) {
+        return resource.withJedis(redis -> {
+            Set<String> keys = KeyUtils.resolveKeys(namespace, redis);
+
+            Pipeline pipe = redis.pipelined();
+            pipe.multi();
+            List<Pair<String, Response<Boolean>>> responses = keys
+                    .stream()
+                    .map(k -> new Pair<String, Response<Boolean>>(k, pipe.sismember(k, key)))
+                    .collect(Collectors.toList());
+
+            pipe.exec();
+            pipe.sync();
+
+            Optional<String> result = responses
+                    .stream()
+                    .filter(pair -> pair.getValue().get())
+                    .map(pair -> pair.getKey())
+                    .findFirst();
+
+            if (result.isPresent()) {
+                return result.get();
+            }
+            else {
+                return "-1";
+            }
+        });
+    }
+
+    public CompletableFuture<Set<String>> keysInNamespace(String namespace) {
+        Set<String> keys = FutureUtils.tryGet(resource.withJedis(redis -> KeyUtils.resolveKeys(namespace, redis)), () -> Collections.emptySet());
+        if (keys.size() == 0) {
+            return CompletableFuture.completedFuture(Collections.emptySet());
+        }
+
+        return resource
+                .withPipeline(pipe -> keys.stream().map(k -> pipe.smembers(k)))
+                .thenApply(results -> results.flatMap(Collection::stream).collect(Collectors.toSet()));
+    }
+
+    private Long countKeysInNamespaces(Collection<String> namespaces, Jedis redis) {
+        Pipeline pipe = redis.pipelined();
+        pipe.multi();
+        List<Response<Long>> responses = namespaces.stream().map(k -> pipe.scard(k)).collect(Collectors.toList());
+
+        pipe.exec();
+        pipe.sync();
+
+        return responses
+                .stream()
+                .map(response -> response.get())
+                .filter(Objects::nonNull)
+                .reduce(0L, Long::sum);
+    }
+
+    public CompletableFuture<Long> countKeysInNamespace(String namespace) {
+        return resource.withJedis(redis -> {
+            Set<String> keys = redis.keys(namespace);
+            if (keys.size() == 0) {
+                return 0L;
+            }
+
+            return countKeysInNamespaces(keys, redis);
+        });
+    }
+
+    public CompletableFuture<Long> countKeysInNamespaces(Collection<String> namespaces) {
+        return resource.withJedis(redis -> countKeysInNamespaces(namespaces, redis));
+    }
+
+    public CompletableFuture<Boolean> removeSet(String namespace) {
+        return resource.withJedis(redis -> {
+            String cursor = "0";
+            while (true) {
+                ScanResult<String> result = redis.sscan(namespace, cursor);
+                cursor = result.getCursor();
+                List<String> members = result.getResult();
+                // Guard against SREM with zero members, which Redis rejects
+                if (!members.isEmpty()) redis.srem(namespace, members.toArray(new String[0]));
+
+                if (cursor.equalsIgnoreCase("0")) {
+                    break;
+                }
+            }
+
+            return true;
+        });
+    }
+}
diff --git a/cache/src/main/java/lt/tokenmill/crawling/cache/utils/FutureUtils.java b/cache/src/main/java/lt/tokenmill/crawling/cache/utils/FutureUtils.java
new file mode 100644
index 0000000..d83a55e
--- /dev/null
+++ b/cache/src/main/java/lt/tokenmill/crawling/cache/utils/FutureUtils.java
@@ -0,0 +1,47 @@
+package lt.tokenmill.crawling.cache.utils;
+
+import java.util.concurrent.Callable;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.Future;
+
+public class FutureUtils {
+    public static <T> T tryGet(Future<T> future, Callable<T> onFail) {
+        try {
+            T result = future.get();
+            if (result != null) {
+                return result;
+            }
+            else {
+                return onFail.call();
+            }
+        } catch (Exception ex) {
+            ex.printStackTrace();
+            try {
+                return onFail.call();
+            } catch (Exception ex2) {
+                ex2.printStackTrace();
+                return null;
+            }
+        }
+    }
+
+    // Note: blocks the calling thread until the wrapped future resolves
+    public static <T> CompletableFuture<T> transformFuture(Future<T> future) {
+        CompletableFuture<T> newFuture = new CompletableFuture<>();
+        try {
+            newFuture.complete(future.get());
+        } catch (Exception ex) {
+            newFuture.completeExceptionally(ex);
+        }
+        return newFuture;
+    }
+
+    public static boolean waitFor(Future<?> future) {
+        try {
+            future.get();
+            return true;
+        } catch (Exception ex) {
+            ex.printStackTrace();
+            return false;
+        }
+    }
+}
\ No newline at end of file
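FutureUtils gives callers two blocking escape hatches: tryGet() substitutes a fallback when the future fails or yields null, and waitFor() reduces a future to a success flag. A sketch; the method names wrapping the calls are illustrative:

```java
import java.util.Collections;
import java.util.Set;
import java.util.concurrent.CompletableFuture;

import static lt.tokenmill.crawling.cache.utils.FutureUtils.tryGet;
import static lt.tokenmill.crawling.cache.utils.FutureUtils.waitFor;

public class FutureUtilsUsage {
    static Set<String> safeKeys(CompletableFuture<Set<String>> pending) {
        // Falls back to an empty set instead of propagating the exception
        return tryGet(pending, Collections::emptySet);
    }

    static boolean fireAndCheck(CompletableFuture<Boolean> pending) {
        // Blocks until completion; false signals any failure
        return waitFor(pending);
    }
}
```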
diff --git a/cache/src/main/java/lt/tokenmill/crawling/cache/utils/HashUtils.java b/cache/src/main/java/lt/tokenmill/crawling/cache/utils/HashUtils.java
new file mode 100644
index 0000000..3d8d8ec
--- /dev/null
+++ b/cache/src/main/java/lt/tokenmill/crawling/cache/utils/HashUtils.java
@@ -0,0 +1,21 @@
+package lt.tokenmill.crawling.cache.utils;
+
+import org.apache.commons.codec.digest.MurmurHash3;
+
+import java.util.Arrays;
+import java.util.stream.Collectors;
+
+public class HashUtils {
+    public static String hashKey(String url) {
+        // First 64 bits of the 128-bit MurmurHash3
+        return String.valueOf(MurmurHash3.hash128x64(url.getBytes())[0]);
+    }
+
+    public static long hashKey(String... s) {
+        return MurmurHash3.hash128x64(Arrays.asList(s).stream().collect(Collectors.joining("#")).getBytes())[0];
+    }
+
+    public static long domainKey(String caseId, String domain) {
+        return hashKey(caseId, domain);
+    }
+}
diff --git a/cache/src/main/java/lt/tokenmill/crawling/cache/utils/KeyUtils.java b/cache/src/main/java/lt/tokenmill/crawling/cache/utils/KeyUtils.java
new file mode 100644
index 0000000..0cae443
--- /dev/null
+++ b/cache/src/main/java/lt/tokenmill/crawling/cache/utils/KeyUtils.java
@@ -0,0 +1,22 @@
+package lt.tokenmill.crawling.cache.utils;
+
+import redis.clients.jedis.Jedis;
+
+import java.util.Arrays;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+public class KeyUtils {
+    public static boolean isWildCard(String key) {
+        return key.contains("*");
+    }
+
+    public static Set<String> resolveKeys(String key, Jedis jedis) {
+        if (isWildCard(key)) {
+            return jedis.keys(key);
+        }
+        else {
+            return Arrays.asList(key).stream().collect(Collectors.toSet());
+        }
+    }
+}
diff --git a/crawler/pom.xml b/crawler/pom.xml
index 320d8f8..89e2bf0 100644
--- a/crawler/pom.xml
+++ b/crawler/pom.xml
@@ -5,7 +5,7 @@
     <parent>
         <artifactId>crawling-framework</artifactId>
         <groupId>lt.tokenmill.crawling</groupId>
-        <version>0.3.4-SNAPSHOT</version>
+        <version>0.3.5-SNAPSHOT</version>
     </parent>
     <modelVersion>4.0.0</modelVersion>
@@ -73,6 +73,11 @@
             <version>4.13.1</version>
             <scope>test</scope>
         </dependency>
+        <dependency>
+            <groupId>lt.tokenmill.crawling</groupId>
+            <artifactId>cache</artifactId>
+            <version>0.3.5-SNAPSHOT</version>
+        </dependency>
     </dependencies>
 
 </project>
diff --git a/crawler/src/main/java/lt/tokenmill/crawling/crawler/DefaultServiceProvider.java b/crawler/src/main/java/lt/tokenmill/crawling/crawler/DefaultServiceProvider.java
index 9982445..30d7903 100644
--- a/crawler/src/main/java/lt/tokenmill/crawling/crawler/DefaultServiceProvider.java
+++ b/crawler/src/main/java/lt/tokenmill/crawling/crawler/DefaultServiceProvider.java
@@ -2,6 +2,10 @@
 
 import com.digitalpebble.stormcrawler.util.ConfUtils;
 import com.google.common.collect.Maps;
+import lt.tokenmill.crawling.cache.CacheConstants;
+import lt.tokenmill.crawling.cache.UrlProcessingCache;
+import lt.tokenmill.crawling.cache.providers.CacheProvider;
+import lt.tokenmill.crawling.cache.providers.redis.RedisProvider;
 import lt.tokenmill.crawling.es.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -49,4 +53,14 @@ public EsDocumentOperations creatEsDocumentOperations(Map conf) {
         return EsDocumentOperations.getInstance(connection, docsIndexName, docsDocumentType);
     }
 
+    @Override
+    public UrlProcessingCache createUrlProcessingCache(Map conf) {
+        CacheProvider provider = RedisProvider.builder()
+                .withHost(ConfUtils.getString(conf, CacheConstants.REDIS_HOST, "localhost"))
+                .withPort(ConfUtils.getInt(conf, CacheConstants.REDIS_PORT, 6379))
+                .withAuth(ConfUtils.getString(conf, CacheConstants.REDIS_AUTH, null))
+                .build();
+        return new UrlProcessingCache(provider);
+    }
+
 }
diff --git a/crawler/src/main/java/lt/tokenmill/crawling/crawler/ServiceProvider.java b/crawler/src/main/java/lt/tokenmill/crawling/crawler/ServiceProvider.java
index a87b498..573d25f 100644
--- a/crawler/src/main/java/lt/tokenmill/crawling/crawler/ServiceProvider.java
+++ b/crawler/src/main/java/lt/tokenmill/crawling/crawler/ServiceProvider.java
@@ -1,5 +1,6 @@
 package lt.tokenmill.crawling.crawler;
 
+import lt.tokenmill.crawling.cache.UrlProcessingCache;
 import lt.tokenmill.crawling.es.EsDocumentOperations;
 import lt.tokenmill.crawling.es.EsHttpSourceOperations;
 import lt.tokenmill.crawling.es.EsHttpUrlOperations;
@@ -16,4 +17,6 @@ public interface ServiceProvider {
     EsHttpSourceOperations createEsHttpSourceOperations(Map conf);
 
     EsDocumentOperations creatEsDocumentOperations(Map conf);
+
+    UrlProcessingCache createUrlProcessingCache(Map conf);
 }
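The crawler reads its Redis settings from the topology configuration through the three CacheConstants keys. A sketch of the configuration map the new factory method consumes — the host and password values are placeholders:

```java
import java.util.HashMap;
import java.util.Map;

import lt.tokenmill.crawling.cache.CacheConstants;
import lt.tokenmill.crawling.cache.UrlProcessingCache;
import lt.tokenmill.crawling.crawler.DefaultServiceProvider;

public class CacheWiring {
    public static UrlProcessingCache fromTopologyConf() {
        Map<String, Object> conf = new HashMap<>();
        conf.put(CacheConstants.REDIS_HOST, "redis.internal"); // placeholder
        conf.put(CacheConstants.REDIS_PORT, 6379);
        conf.put(CacheConstants.REDIS_AUTH, "s3cret");         // placeholder
        return new DefaultServiceProvider().createUrlProcessingCache(conf);
    }
}
```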
diff --git a/crawler/src/main/java/lt/tokenmill/crawling/crawler/bolt/StatusUpdaterBolt.java b/crawler/src/main/java/lt/tokenmill/crawling/crawler/bolt/StatusUpdaterBolt.java
index d84c298..7e1fbb7 100644
--- a/crawler/src/main/java/lt/tokenmill/crawling/crawler/bolt/StatusUpdaterBolt.java
+++ b/crawler/src/main/java/lt/tokenmill/crawling/crawler/bolt/StatusUpdaterBolt.java
@@ -44,6 +44,7 @@ public void store(String url, Status status, Metadata metadata, Date nextFetch)
         String filtered = filters.filter(url);
         if (isSeed || (filtered == null && status.equals(Status.DISCOVERED))) {
             LOG.debug("Url '{}' is seed or rejected by filters", url);
+            esUrlsOperations.upsertUrlStatus(url, null, source, false, Status.FETCH_ERROR);
             return;
         }
 
diff --git a/crawler/src/main/java/lt/tokenmill/crawling/crawler/spout/UrlGeneratorSpout.java b/crawler/src/main/java/lt/tokenmill/crawling/crawler/spout/UrlGeneratorSpout.java
index 259cb3f..1ecb072 100644
--- a/crawler/src/main/java/lt/tokenmill/crawling/crawler/spout/UrlGeneratorSpout.java
+++ b/crawler/src/main/java/lt/tokenmill/crawling/crawler/spout/UrlGeneratorSpout.java
@@ -5,11 +5,14 @@
 import lt.tokenmill.crawling.crawler.CrawlerConstants;
 import lt.tokenmill.crawling.crawler.ServiceProvider;
 import lt.tokenmill.crawling.crawler.utils.PrioritizedSource;
+import lt.tokenmill.crawling.crawler.utils.UrlFiltersCache;
 import lt.tokenmill.crawling.data.DataUtils;
 import lt.tokenmill.crawling.data.HttpSource;
 import lt.tokenmill.crawling.data.HttpUrl;
 import lt.tokenmill.crawling.es.EsHttpSourceOperations;
+import lt.tokenmill.crawling.es.EsHttpSourcesCache;
 import lt.tokenmill.crawling.es.EsHttpUrlOperations;
+import lt.tokenmill.crawling.parser.urls.UrlFilters;
 import org.apache.storm.spout.SpoutOutputCollector;
 import org.apache.storm.task.TopologyContext;
 import org.apache.storm.topology.OutputFieldsDeclarer;
@@ -144,6 +147,7 @@ private HttpSourceConfiguration getConfiguration() {
         if (configuration == null || HttpSourceConfiguration.needsReload()) {
             LOG.info("Loading HTTP sources");
             List<HttpSource> sources = esSourceOperations.findEnabledSources();
+            LOG.info("Have {} enabled sources", sources.size());
             configuration = HttpSourceConfiguration.reload(configuration, sources);
             return configuration;
         }
diff --git a/data-model/pom.xml b/data-model/pom.xml
index 3038c40..93f1f2b 100644
--- a/data-model/pom.xml
+++ b/data-model/pom.xml
@@ -5,7 +5,7 @@
     <parent>
         <artifactId>crawling-framework</artifactId>
        <groupId>lt.tokenmill.crawling</groupId>
-       <version>0.3.4-SNAPSHOT</version>
+       <version>0.3.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
diff --git a/elasticsearch/pom.xml b/elasticsearch/pom.xml
index 9efe894..2485e99 100644
--- a/elasticsearch/pom.xml
+++ b/elasticsearch/pom.xml
@@ -5,13 +5,12 @@
     <parent>
         <artifactId>crawling-framework</artifactId>
         <groupId>lt.tokenmill.crawling</groupId>
-        <version>0.3.4-SNAPSHOT</version>
+        <version>0.3.5-SNAPSHOT</version>
     </parent>
     <modelVersion>4.0.0</modelVersion>
 
     <artifactId>elasticsearch</artifactId>
 
-
     <dependencies>
         <dependency>
             <groupId>lt.tokenmill.crawling</groupId>
@@ -35,26 +34,6 @@
             <artifactId>elasticsearch-rest-high-level-client</artifactId>
             <version>${elasticsearch.version}</version>
         </dependency>
-        <dependency>
-            <groupId>org.apache.httpcomponents</groupId>
-            <artifactId>httpasyncclient</artifactId>
-            <version>4.1.3</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.httpcomponents</groupId>
-            <artifactId>httpcore-nio</artifactId>
-            <version>4.4.6</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.httpcomponents</groupId>
-            <artifactId>httpclient</artifactId>
-            <version>4.5.4</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.httpcomponents</groupId>
-            <artifactId>httpcore</artifactId>
-            <version>4.4.6</version>
-        </dependency>
         <dependency>
             <groupId>org.elasticsearch.plugin</groupId>
             <artifactId>transport-netty4-client</artifactId>
@@ -89,6 +68,12 @@
             <version>4.13.1</version>
             <scope>test</scope>
         </dependency>
+
+        <dependency>
+            <groupId>org.apache.httpcomponents</groupId>
+            <artifactId>httpcore</artifactId>
+            <version>4.4.14</version>
+        </dependency>
     </dependencies>
 
     <build>
@@ -115,4 +100,5 @@
         </plugins>
     </build>
 
+
 </project>
\ No newline at end of file
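The remaining hunks are the Elasticsearch 7 migration: mapping types are gone, so every request drops .types(getType()) and identifies documents by index and id alone. The old and new request shapes side by side — index name and id are illustrative:

```java
import org.elasticsearch.action.delete.DeleteRequest;
import org.elasticsearch.action.get.GetRequest;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.update.UpdateRequest;

public class TypelessRequests {
    public static void main(String[] args) {
        // 6.x style, removed by this PR: new IndexRequest(index, type, id), etc.
        // 7.x typeless equivalents:
        IndexRequest index = new IndexRequest("docs").id("42");
        GetRequest get = new GetRequest("docs", "42");
        UpdateRequest update = new UpdateRequest("docs", "42");
        DeleteRequest delete = new DeleteRequest("docs", "42");
    }
}
```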
diff --git a/elasticsearch/src/main/java/lt/tokenmill/crawling/es/BaseElasticOps.java b/elasticsearch/src/main/java/lt/tokenmill/crawling/es/BaseElasticOps.java
index 0bc532f..1583fc5 100644
--- a/elasticsearch/src/main/java/lt/tokenmill/crawling/es/BaseElasticOps.java
+++ b/elasticsearch/src/main/java/lt/tokenmill/crawling/es/BaseElasticOps.java
@@ -31,10 +31,6 @@ protected String getIndex() {
         return index;
     }
 
-    protected String getType() {
-        return type;
-    }
-
     protected RequestOptions getRequestOptions() { return requestOptions; }
 
     public void close() {
diff --git a/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsDocumentOperations.java b/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsDocumentOperations.java
index 6d13199..be1a07e 100644
--- a/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsDocumentOperations.java
+++ b/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsDocumentOperations.java
@@ -90,7 +90,6 @@ public PageableList query(NamedQuery... queries) {
                 .sort(PUBLISHED_FIELD, SortOrder.DESC)
                 .size(100).query(query);
         SearchRequest searchRequest = new SearchRequest(getIndex())
-                .types(getType())
                 .source(searchSourceBuilder);
         SearchResponse response = getConnection().getRestHighLevelClient()
                 .search(searchRequest, getRequestOptions());
@@ -148,7 +147,6 @@ public PageableList query(List included, Li
                         .field("text.stem_ci"))
                 .sort(PUBLISHED_FIELD, SortOrder.DESC);
         SearchRequest searchRequest = new SearchRequest(getIndex())
-                .types(getType())
                 .source(sourceBuilder);
         SearchResponse response = getConnection().getRestHighLevelClient()
                 .search(searchRequest, getRequestOptions());
@@ -193,7 +191,8 @@ public void store(HttpArticle article, Map fields) {
             jsonBuilder.startObject();
             applyFields(jsonBuilder, article, fields);
             jsonBuilder.endObject();
-            IndexRequest indexRequest = new IndexRequest(getIndex(), getType(), formatId(article.getUrl()))
+            IndexRequest indexRequest = new IndexRequest(getIndex())
+                    .id(formatId(article.getUrl()))
                     .source(jsonBuilder);
             getConnection().getProcessor().add(indexRequest);
         } catch (IOException e) {
@@ -216,7 +215,7 @@ public void update(HttpArticle article, Map fields) {
             XContentBuilder upsertDoc = jsonBuilder().startObject();
             applyFields(upsertDoc, article, fields);
             upsertDoc.endObject();
-            UpdateRequest upsert = new UpdateRequest(getIndex(), getType(), id)
+            UpdateRequest upsert = new UpdateRequest(getIndex(), id)
                     .doc(update)
                     .upsert(upsertDoc);
             getConnection().getProcessor().add(upsert);
@@ -231,7 +230,7 @@
     public Map<String, Object> getAsMap(String url) {
         try {
-            GetRequest getRequest = new GetRequest(getIndex(), getType(), formatId(url))
+            GetRequest getRequest = new GetRequest(getIndex(), formatId(url))
                     .fetchSourceContext(new FetchSourceContext(true));
             GetResponse response = getConnection().getRestHighLevelClient().get(getRequest, getRequestOptions());
             if (response.isExists()) {
@@ -268,7 +267,6 @@ public HttpArticle findDuplicate(HttpArticle article) {
                 .fetchSource(true);
 
         SearchRequest searchRequest = new SearchRequest(getIndex())
-                .types(getType())
                 .searchType(SearchType.DEFAULT)
                 .source(searchSourceBuilder);
         try {
@@ -294,7 +292,6 @@ public List calculateStats(String sourceUrl) {
                 .must(QueryBuilders.termQuery(SOURCE_FIELD, sourceUrl));
 
         SearchRequest searchRequest = new SearchRequest(getIndex())
-                .types(getType())
                 .searchType(SearchType.DEFAULT)
                 .source(new SearchSourceBuilder()
                         .query(filter)
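EsDocumentOperations.update() relies on the doc-plus-upsert pattern: the partial doc is merged into an existing document, while the full upsert body is indexed when the id is absent. A sketch with an illustrative index name and fields:

```java
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;

import java.io.IOException;

import org.elasticsearch.action.update.UpdateRequest;

public class DocPlusUpsert {
    static UpdateRequest build() throws IOException {
        return new UpdateRequest("docs", "42")
                .doc(jsonBuilder().startObject()            // merged if "42" exists
                        .field("title", "updated title")
                        .endObject())
                .upsert(jsonBuilder().startObject()         // indexed if "42" is absent
                        .field("url", "https://example.com/news/42")
                        .field("title", "updated title")
                        .endObject());
    }
}
```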
diff --git a/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpSourceOperations.java b/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpSourceOperations.java
index 2efd8c2..da29c0f 100644
--- a/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpSourceOperations.java
+++ b/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpSourceOperations.java
@@ -63,7 +63,6 @@ public List findEnabledSources() {
                 .fetchSource(true)
                 .explain(false);
         SearchRequest searchRequest = new SearchRequest(getIndex())
-                .types(getType())
                 .searchType(SearchType.DEFAULT)
                 .source(searchSourceBuilder);
         SearchResponse response = getConnection().getRestHighLevelClient().search(searchRequest, getRequestOptions());
@@ -74,23 +73,30 @@
                     .map(this::mapToHttpSource)
                     .collect(Collectors.toList());
         } catch (ElasticsearchStatusException e) {
+            LOG.error("Failed while searching for enabled sources", e);
         } catch (Exception e) {
             e.printStackTrace();
+            LOG.error("Failed while searching for enabled sources", e);
         }
         return Collections.emptyList();
     }
 
     public HttpSource get(String url) {
         try {
-            GetRequest getRequest = new GetRequest(getIndex(), getType(), formatId(url))
-                    .fetchSourceContext(new FetchSourceContext(true));
+            GetRequest getRequest = new GetRequest(getIndex(), /*formatId(url)*/url)
+                    .fetchSourceContext(FetchSourceContext.FETCH_SOURCE);
             GetResponse response = getConnection().getRestHighLevelClient().get(getRequest, getRequestOptions());
             if (response.isExists()) {
                 return mapToHttpSource(response.getSource());
             }
+            else {
+                LOG.error("No response for HTTP Source id: '{}'", response.getId());
+            }
         } catch (ElasticsearchStatusException e) {
+            LOG.error("Failed to fetch HTTP Source for {}", url, e);
         } catch (Exception e) {
             e.printStackTrace();
+            LOG.error("Failed to fetch HTTP Source for {}", url, e);
         }
         return null;
     }
@@ -123,7 +129,6 @@ public PageableList filter(String text, int offset) {
                 .explain(false);
 
         SearchRequest searchRequest = new SearchRequest(getIndex())
-                .types(getType())
                 .searchType(SearchType.DEFAULT)
                 .source(searchSourceBuilder);
 
@@ -158,7 +163,6 @@ public List all() {
                 .query(filter);
 
         SearchRequest searchRequest = new SearchRequest(getIndex())
-                .types(getType())
                 .scroll(keepAlive)
                 .source(searchSourceBuilder);
 
@@ -212,7 +216,7 @@ public void save(HttpSource source) {
                     .field("date_formats", Utils.listToText(source.getDateFormats()))
                     .field("updated", new Date())
                     .endObject();
-            IndexRequest indexRequest = new IndexRequest(getIndex(), getType(), formatId(url))
+            IndexRequest indexRequest = new IndexRequest(getIndex()).id(formatId(url))
                     .source(xContentBuilder)
                     .setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE);
             getConnection().getRestHighLevelClient().index(indexRequest, getRequestOptions());
@@ -225,7 +229,7 @@
     public void delete(String url) {
         if (url != null) {
             try {
-                DeleteRequest deleteRequest = new DeleteRequest(getIndex(), getType(), formatId(url))
+                DeleteRequest deleteRequest = new DeleteRequest(getIndex(), formatId(url))
                         .setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE);
                 getConnection().getRestHighLevelClient().delete(deleteRequest, getRequestOptions());
             } catch (IOException e) {
@@ -243,7 +247,6 @@ public void deleteAll() {
                 .fetchSource(true)
                 .explain(false);
         SearchRequest searchRequest = new SearchRequest(getIndex())
-                .types(getType())
                 .scroll(keepAlive)
                 .source(searchSourceBuilder);
         SearchResponse response = getConnection().getRestHighLevelClient()
diff --git a/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpSourceTestOperations.java b/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpSourceTestOperations.java
index 44beb45..4570e09 100644
--- a/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpSourceTestOperations.java
+++ b/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpSourceTestOperations.java
@@ -72,7 +72,6 @@ public PageableList filter(String prefix, int offset) {
                 .query(filter);
 
         SearchRequest searchRequest = new SearchRequest(getIndex())
-                .types(getType())
                 .source(searchSourceBuilder);
         SearchResponse response = getConnection().getRestHighLevelClient().search(searchRequest, getRequestOptions());
         List<HttpSourceTest> items = Arrays.stream(response.getHits().getHits())
@@ -90,7 +89,7 @@ public PageableList filter(String prefix, int offset) {
 
     public HttpSourceTest get(String url) {
         try {
-            GetRequest getRequest = new GetRequest(getIndex(), getType(), formatId(url))
+            GetRequest getRequest = new GetRequest(getIndex(), formatId(url))
                     .fetchSourceContext(new FetchSourceContext(true));
             GetResponse response = getConnection().getRestHighLevelClient().get(getRequest, getRequestOptions());
             if (response.isExists()) {
@@ -115,7 +114,6 @@ public List all() {
                 .query(filter)
                 .size(100);
         SearchRequest searchRequest = new SearchRequest(getIndex())
-                .types(getType())
                 .source(searchSourceBuilder)
                 .scroll(keepAlive);
 
@@ -155,7 +153,8 @@ public void save(HttpSourceTest hst) {
                     .field("date", hst.getDate() != null ? hst.getDate().trim() : null)
                     .field("updated", new Date())
                     .endObject();
-            IndexRequest indexRequest = new IndexRequest(getIndex(), getType(), formatId(hst.getUrl()))
+            IndexRequest indexRequest = new IndexRequest(getIndex())
+                    .id(formatId(hst.getUrl()))
                     .source(contentBuilder)
                     .setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE);
             getConnection().getRestHighLevelClient().index(indexRequest, getRequestOptions());
@@ -168,7 +167,7 @@
     public void delete(String url) {
         if (url != null) {
             try {
-                DeleteRequest deleteRequest = new DeleteRequest(getIndex(), getType(), formatId(url))
+                DeleteRequest deleteRequest = new DeleteRequest(getIndex(), formatId(url))
                         .setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE);
                 DeleteResponse delete = getConnection().getRestHighLevelClient().delete(deleteRequest, getRequestOptions());
                 LOG.debug("Delete HttpSourceTest url {} with response status {}", url, delete.status());
@@ -182,7 +181,6 @@ public void deleteAll() {
         try {
             TimeValue keepAlive = TimeValue.timeValueMinutes(10);
             SearchRequest searchRequest = new SearchRequest(getIndex())
-                    .types(getType())
                     .scroll(keepAlive)
                     .source(new SearchSourceBuilder()
                             .size(100)
diff --git a/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpUrlOperations.java b/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpUrlOperations.java
index e55f4af..beafd1d 100644
--- a/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpUrlOperations.java
+++ b/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpUrlOperations.java
@@ -57,7 +57,8 @@ public void upsertUrlStatus(String url, String published, String source, boolean
                     .field("published", published)
                     .field("status", status)
                     .endObject();
-            IndexRequest indexRequest = new IndexRequest(getIndex(), getType(), id)
+            IndexRequest indexRequest = new IndexRequest(getIndex())
+                    .id(id)
                     .source(insert)
                     .create(create);
 
@@ -70,7 +71,7 @@ public void upsertUrlStatus(String url, String published, String source, boolean
                     .field("published", published)
                     .field("status", status)
                     .endObject();
-            UpdateRequest upsert = new UpdateRequest(getIndex(), getType(), id)
+            UpdateRequest upsert = new UpdateRequest(getIndex(), id)
                     .doc(update)
                     .upsert(indexRequest);
             getConnection().getProcessor().add(upsert);
@@ -97,7 +98,6 @@ public List findUrlsByStatusAndSource(String status, String source, int
                 .query(filter)
                 .sort("created", SortOrder.DESC);
         SearchRequest searchRequest = new SearchRequest(getIndex())
-                .types(getType())
                 .searchType(SearchType.DEFAULT)
                 .source(searchSourceBuilder);
 
@@ -138,7 +138,6 @@ public List calculateStats(String sourceUrl) {
                 .format("yyyy-MM-dd")
                 .dateHistogramInterval(DateHistogramInterval.DAY));
         SearchRequest searchRequest = new SearchRequest(getIndex())
-                .types(getType())
                 .searchType(SearchType.DEFAULT)
                 .source(searchSourceBuilder);
         SearchResponse response = getConnection().getRestHighLevelClient().search(searchRequest, getRequestOptions());
diff --git a/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsNamedQueryOperations.java b/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsNamedQueryOperations.java
index e78de24..dcf00e2 100644
--- a/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsNamedQueryOperations.java
+++ b/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsNamedQueryOperations.java
@@ -63,7 +63,6 @@ public PageableList filter(String prefix) {
                 .size(100)
                 .query(filter);
         SearchRequest searchRequest = new SearchRequest(getIndex())
-                .types(getType())
                 .source(searchSourceBuilder);
 
         SearchResponse response = getConnection().getRestHighLevelClient()
@@ -91,7 +90,6 @@ public List suggest(String prefix) {
                 .fetchSource(true)
                 .suggest(new SuggestBuilder().addSuggestion("suggestion", suggestionBuilder));
         SearchRequest searchRequest = new SearchRequest(getIndex())
-                .types(getType())
                 .source(sourceRequestBuilder);
         SearchResponse response = getConnection().getRestHighLevelClient()
                 .search(searchRequest, getRequestOptions());
@@ -111,7 +109,7 @@
     public NamedQuery get(String name) {
         try {
             GetResponse response = getConnection().getRestHighLevelClient()
-                    .get(new GetRequest(getIndex(), getType(), formatId(name))
+                    .get(new GetRequest(getIndex(), formatId(name))
                             .fetchSourceContext(new FetchSourceContext(true)), getRequestOptions());
             if (response.isExists()) {
                 return mapToNamedQuery(response.getSource());
@@ -126,7 +124,6 @@ public List all() {
         try {
             TimeValue keepAlive = TimeValue.timeValueMinutes(10);
             SearchRequest searchRequest = new SearchRequest(getIndex())
-                    .types(getType())
                     .scroll(keepAlive)
                     .source(new SearchSourceBuilder().query(QueryBuilders.matchAllQuery()));
             SearchResponse response = getConnection().getRestHighLevelClient()
@@ -164,7 +161,8 @@ public void save(NamedQuery nq) {
                     .field("advanced", nq.getAdvanced())
                     .field("updated", new Date())
                     .endObject();
-            IndexRequest indexRequest = new IndexRequest(getIndex(), getType(), formatId(nq.getName()))
+            IndexRequest indexRequest = new IndexRequest(getIndex())
+                    .id(formatId(nq.getName()))
                     .source(contentBuilder)
                     .setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE);
             getConnection().getRestHighLevelClient().index(indexRequest, getRequestOptions());
@@ -177,7 +175,7 @@
     public void delete(NamedQuery nq) {
         if (nq != null && nq.getName() != null) {
             try {
-                DeleteRequest deleteRequest = new DeleteRequest(getIndex(), getType(), formatId(nq.getName()))
+                DeleteRequest deleteRequest = new DeleteRequest(getIndex(), formatId(nq.getName()))
                         .setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE);
                 getConnection().getRestHighLevelClient().delete(deleteRequest, getRequestOptions());
             } catch (IOException e) {
@@ -191,7 +189,6 @@ public void deleteAll() {
         try {
             TimeValue keepAlive = TimeValue.timeValueMinutes(10);
             SearchRequest searchRequest = new SearchRequest(getIndex())
                     .scroll(keepAlive)
-                    .types(getType())
                     .source(new SearchSourceBuilder()
                             .size(100)
                             .explain(false)
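The all() and deleteAll() methods in these classes page through results with the scroll API; for reference, the shape of that loop — index name and hit handling are illustrative:

```java
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchScrollRequest;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.builder.SearchSourceBuilder;

public class ScrollLoop {
    static void scanAll(RestHighLevelClient client) throws Exception {
        TimeValue keepAlive = TimeValue.timeValueMinutes(10);
        SearchResponse response = client.search(new SearchRequest("docs")
                        .scroll(keepAlive)
                        .source(new SearchSourceBuilder().size(100).query(QueryBuilders.matchAllQuery())),
                RequestOptions.DEFAULT);
        while (response.getHits().getHits().length > 0) {
            // ... consume the current page of hits ...
            response = client.scroll(new SearchScrollRequest(response.getScrollId()).scroll(keepAlive),
                    RequestOptions.DEFAULT);
        }
    }
}
```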
diff --git a/page-analyzer/pom.xml b/page-analyzer/pom.xml
index e2b3f0b..7d6c292 100644
--- a/page-analyzer/pom.xml
+++ b/page-analyzer/pom.xml
@@ -5,7 +5,7 @@
     <parent>
         <artifactId>crawling-framework</artifactId>
         <groupId>lt.tokenmill.crawling</groupId>
-        <version>0.3.4-SNAPSHOT</version>
+        <version>0.3.5-SNAPSHOT</version>
     </parent>
     <modelVersion>4.0.0</modelVersion>
diff --git a/parser/pom.xml b/parser/pom.xml
index 161fecf..5f1a160 100644
--- a/parser/pom.xml
+++ b/parser/pom.xml
@@ -5,7 +5,7 @@
     <parent>
         <artifactId>crawling-framework</artifactId>
         <groupId>lt.tokenmill.crawling</groupId>
-        <version>0.3.4-SNAPSHOT</version>
+        <version>0.3.5-SNAPSHOT</version>
     </parent>
     <modelVersion>4.0.0</modelVersion>
diff --git a/pom.xml b/pom.xml
index cb6f367..15d0b4c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -5,7 +5,7 @@
     <groupId>lt.tokenmill.crawling</groupId>
     <artifactId>crawling-framework</artifactId>
     <packaging>pom</packaging>
-    <version>0.3.4-SNAPSHOT</version>
+    <version>0.3.5-SNAPSHOT</version>
     <name>Crawling Framework</name>
     <description>Framework to simplify news crawling</description>
     <url>https://github.com/tokenmill/crawling-framework</url>
@@ -128,7 +128,7 @@
 
     <properties>
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-        <elasticsearch.version>7.11.2</elasticsearch.version>
+        <elasticsearch.version>7.12.0</elasticsearch.version>
         1.1.3
         1.5.1
         7.7.5
@@ -150,6 +150,7 @@
         <module>administration-ui</module>
         <module>analysis-ui</module>
         <module>ui-commons</module>
+        <module>cache</module>
     </modules>
 
     <dependencyManagement>
@@ -226,6 +227,17 @@
                 <artifactId>jsonld-java</artifactId>
                 <version>${jsonld.version}</version>
             </dependency>
+
+            <dependency>
+                <groupId>org.apache.httpcomponents</groupId>
+                <artifactId>httpclient</artifactId>
+                <version>4.5.10</version>
+            </dependency>
+            <dependency>
+                <groupId>org.apache.httpcomponents</groupId>
+                <artifactId>httpcore</artifactId>
+                <version>4.4.14</version>
+            </dependency>
         </dependencies>
     </dependencyManagement>
 
diff --git a/ui-commons/pom.xml b/ui-commons/pom.xml
index a2b753d..35b2442 100644
--- a/ui-commons/pom.xml
+++ b/ui-commons/pom.xml
@@ -5,7 +5,7 @@
     <parent>
         <artifactId>crawling-framework</artifactId>
         <groupId>lt.tokenmill.crawling</groupId>
-        <version>0.3.4-SNAPSHOT</version>
+        <version>0.3.5-SNAPSHOT</version>
     </parent>
     <modelVersion>4.0.0</modelVersion>