diff --git a/Dockerfile.es b/Dockerfile.es
index 931be02..f5a2698 100644
--- a/Dockerfile.es
+++ b/Dockerfile.es
@@ -1,4 +1,4 @@
-FROM docker.elastic.co/elasticsearch/elasticsearch-oss:6.3.0 as builder
+FROM docker.elastic.co/elasticsearch/elasticsearch:7.12.0 as builder
ADD https://raw.githubusercontent.com/vishnubob/wait-for-it/e1f115e4ca285c3c24e847c4dd4be955e0ed51c2/wait-for-it.sh /utils/wait-for-it.sh
@@ -9,6 +9,6 @@ RUN /usr/local/bin/docker-entrypoint.sh elasticsearch -p /tmp/epid & /bin/bash /
./bin/create-es-indices.sh ; \
kill $(cat /tmp/epid) && wait $(cat /tmp/epid); exit 0;
-FROM docker.elastic.co/elasticsearch/elasticsearch-oss:6.3.0
+FROM docker.elastic.co/elasticsearch/elasticsearch:7.12.0
COPY --from=builder /usr/share/elasticsearch/data /usr/share/elasticsearch/data
diff --git a/administration-ui/pom.xml b/administration-ui/pom.xml
index d46d6b9..1b710fe 100644
--- a/administration-ui/pom.xml
+++ b/administration-ui/pom.xml
@@ -5,7 +5,7 @@
    <parent>
        <artifactId>crawling-framework</artifactId>
        <groupId>lt.tokenmill.crawling</groupId>
-        <version>0.3.4-SNAPSHOT</version>
+        <version>0.3.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
diff --git a/analysis-ui/pom.xml b/analysis-ui/pom.xml
index 53d6b5b..f893519 100644
--- a/analysis-ui/pom.xml
+++ b/analysis-ui/pom.xml
@@ -5,7 +5,7 @@
    <parent>
        <artifactId>crawling-framework</artifactId>
        <groupId>lt.tokenmill.crawling</groupId>
-        <version>0.3.4-SNAPSHOT</version>
+        <version>0.3.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
diff --git a/cache/pom.xml b/cache/pom.xml
new file mode 100644
index 0000000..b033500
--- /dev/null
+++ b/cache/pom.xml
@@ -0,0 +1,96 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>crawling-framework</artifactId>
+        <groupId>lt.tokenmill.crawling</groupId>
+        <version>0.3.5-SNAPSHOT</version>
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+
+    <artifactId>cache</artifactId>
+
+    <properties>
+        <jackson.version>2.11.1</jackson.version>
+    </properties>
+
+    <dependencies>
+
+        <dependency>
+            <groupId>redis.clients</groupId>
+            <artifactId>jedis</artifactId>
+            <version>3.3.0</version>
+            <type>jar</type>
+            <scope>compile</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-pool2</artifactId>
+            <version>2.3</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-core</artifactId>
+            <version>${jackson.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-databind</artifactId>
+            <version>${jackson.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.fasterxml.jackson.jr</groupId>
+            <artifactId>jackson-jr-objects</artifactId>
+            <version>${jackson.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>jcl-over-slf4j</artifactId>
+            <version>${slf4j.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>commons-codec</groupId>
+            <artifactId>commons-codec</artifactId>
+            <version>1.15</version>
+        </dependency>
+
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>4.12</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+    <profiles>
+        <profile>
+            <id>release</id>
+            <build>
+                <plugins>
+                    <plugin>
+                        <groupId>org.apache.maven.plugins</groupId>
+                        <artifactId>maven-source-plugin</artifactId>
+                    </plugin>
+                    <plugin>
+                        <groupId>org.apache.maven.plugins</groupId>
+                        <artifactId>maven-jar-plugin</artifactId>
+                    </plugin>
+                    <plugin>
+                        <groupId>org.apache.maven.plugins</groupId>
+                        <artifactId>maven-javadoc-plugin</artifactId>
+                    </plugin>
+                </plugins>
+            </build>
+        </profile>
+    </profiles>
+</project>
\ No newline at end of file
diff --git a/cache/src/main/java/lt/tokenmill/crawling/cache/CacheConstants.java b/cache/src/main/java/lt/tokenmill/crawling/cache/CacheConstants.java
new file mode 100644
index 0000000..becfffe
--- /dev/null
+++ b/cache/src/main/java/lt/tokenmill/crawling/cache/CacheConstants.java
@@ -0,0 +1,7 @@
+package lt.tokenmill.crawling.cache;
+
+public class CacheConstants {
+    public static final String REDIS_HOST = "cache.redis.host";
+    public static final String REDIS_PORT = "cache.redis.port";
+    public static final String REDIS_AUTH = "cache.redis.auth";
+}
diff --git a/cache/src/main/java/lt/tokenmill/crawling/cache/UrlProcessingCache.java b/cache/src/main/java/lt/tokenmill/crawling/cache/UrlProcessingCache.java
new file mode 100644
index 0000000..a510afb
--- /dev/null
+++ b/cache/src/main/java/lt/tokenmill/crawling/cache/UrlProcessingCache.java
@@ -0,0 +1,56 @@
+package lt.tokenmill.crawling.cache;
+
+import lt.tokenmill.crawling.cache.providers.CacheProvider;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Set;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import static lt.tokenmill.crawling.cache.utils.FutureUtils.tryGet;
+import static lt.tokenmill.crawling.cache.utils.FutureUtils.waitFor;
+import static lt.tokenmill.crawling.cache.utils.HashUtils.hashKey;
+
+public class UrlProcessingCache {
+    private static final Logger LOG = LoggerFactory.getLogger(UrlProcessingCache.class);
+    private final CacheProvider provider;
+
+    public UrlProcessingCache(CacheProvider provider){
+        this.provider = provider;
+    }
+
+    public static String parseDomain(String url){
+        try {
+            URL u = new URL(url);
+            return u.getHost();
+        } catch (MalformedURLException e) {
+            LOG.error("Failed to parse domain from url '{}'", url, e);
+            return null;
+        }
+    }
+
+    private String globalNamespace(String domain){
+        return String.format("global:%s", hashKey(domain));
+    }
+
+    public void addUrl(String url){
+        String domain = parseDomain(url);
+        waitFor(provider.addKey(globalNamespace(domain), hashKey(url)));
+    }
+
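+    // Keeps only the URLs whose hashes are not yet recorded in the domain's
+    // global namespace, i.e. URLs this crawl has not seen before.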
+    public Set<String> filterUrls(String domain, Collection<String> urls) {
+        return filterUrls(domain, urls.stream());
+    }
+
+    public Set<String> filterUrls(String domain, Stream<String> urls) {
+        Set<String> keys = tryGet(provider.keysInNamespace(globalNamespace(domain)), () -> Collections.emptySet());
+        return urls.filter(k -> !keys.contains(hashKey(k))).collect(Collectors.toSet());
+    }
+}
diff --git a/cache/src/main/java/lt/tokenmill/crawling/cache/datamodel/Pair.java b/cache/src/main/java/lt/tokenmill/crawling/cache/datamodel/Pair.java
new file mode 100644
index 0000000..f1ac8b6
--- /dev/null
+++ b/cache/src/main/java/lt/tokenmill/crawling/cache/datamodel/Pair.java
@@ -0,0 +1,35 @@
+package lt.tokenmill.crawling.cache.datamodel;
+
+import java.util.Objects;
+
+public class Pair<T, U> {
+    private final T key;
+    private final U value;
+
+    public Pair(T key, U value){
+        this.key = key;
+        this.value = value;
+    }
+
+    public T getKey(){
+        return key;
+    }
+
+    public U getValue(){
+        return value;
+    }
+
+    @Override
+    public boolean equals(Object other) {
+        if(other instanceof Pair){
+            Pair otherPair = (Pair) other;
+            return key.equals(otherPair.getKey()) && value.equals(otherPair.getValue());
+        }
+        return false;
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(key, value);
+    }
+}
diff --git a/cache/src/main/java/lt/tokenmill/crawling/cache/providers/CacheProvider.java b/cache/src/main/java/lt/tokenmill/crawling/cache/providers/CacheProvider.java
new file mode 100644
index 0000000..17e08bf
--- /dev/null
+++ b/cache/src/main/java/lt/tokenmill/crawling/cache/providers/CacheProvider.java
@@ -0,0 +1,29 @@
+package lt.tokenmill.crawling.cache.providers;
+
+import lt.tokenmill.crawling.cache.datamodel.Pair;
+
+import java.util.Collection;
+import java.util.Set;
+import java.util.concurrent.CompletableFuture;
+
+public interface CacheProvider {
+    <T> CompletableFuture<Boolean> set(String namespace, String key, T value);
+    <T> CompletableFuture<Boolean> setMultiple(String namespace, Set<Pair<String, T>> pairs);
+
+    CompletableFuture<Boolean> addKey(String namespace, String key);
+    CompletableFuture<Boolean> addKeyMultiple(String namespace, Collection<String> keys);
+    CompletableFuture<Boolean> removeKey(String namespace, String key);
+    CompletableFuture<Boolean> moveKey(String source, String dest, String key);
+    CompletableFuture<String> findKey(String namespace, String key);
+
+    <T> CompletableFuture<T> get(Class<T> klass, String namespace, String key);
+    <T> CompletableFuture<Set<T>> getMultiple(Class<T> klass, String namespace, String... keys);
+    <T> CompletableFuture<Set<T>> getMultiple(Class<T> klass, String namespace, Collection<String> keys);
+
+    CompletableFuture<Boolean> contains(String key);
+    CompletableFuture<Boolean> contains(String namespace, String key);
+
+    CompletableFuture<Set<String>> keysInNamespace(String namespace);
+
+    void cleanup();
+}
diff --git a/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/Builder.java b/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/Builder.java
new file mode 100644
index 0000000..52ad901
--- /dev/null
+++ b/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/Builder.java
@@ -0,0 +1,52 @@
+package lt.tokenmill.crawling.cache.providers.redis;
+
+import redis.clients.jedis.JedisPool;
+import redis.clients.jedis.JedisPoolConfig;
+import redis.clients.jedis.Protocol;
+
+import java.util.Optional;
+
+public class Builder {
+    private String host = "localhost";
+    private int port = 6379;
+    private Optional<String> auth = Optional.empty();
+    private int timeout = Protocol.DEFAULT_TIMEOUT;
+
+    public Builder withHost(String host){
+        this.host = host;
+        return this;
+    }
+
+    public Builder withPort(int port){
+        this.port = port;
+        return this;
+    }
+
+    public Builder withAuth(String auth){
+        if(auth != null) {
+            this.auth = Optional.of(auth);
+        }
+        return this;
+    }
+
+    public Builder withTimeoutMillis(int millis){
+        this.timeout = millis;
+        return this;
+    }
+
+    public RedisProvider build(){
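+        // Cap the pool at 100 connections to bound concurrent Redis usage.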
+        JedisPoolConfig config = new JedisPoolConfig();
+        config.setMaxTotal(100);
+        JedisPool pool;
+        if(auth.isPresent()){
+            pool = new JedisPool(config, host, port, timeout, auth.get());
+        }
+        else {
+            pool = new JedisPool(config, host, port, timeout);
+        }
+
+        return new RedisProvider(pool);
+    }
+
+}
diff --git a/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/RedisKVProvider.java b/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/RedisKVProvider.java
new file mode 100644
index 0000000..11d7a54
--- /dev/null
+++ b/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/RedisKVProvider.java
@@ -0,0 +1,86 @@
+package lt.tokenmill.crawling.cache.providers.redis;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import lt.tokenmill.crawling.cache.datamodel.Pair;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import redis.clients.jedis.Pipeline;
+
+import java.util.Objects;
+import java.util.Set;
+import java.util.concurrent.CompletableFuture;
+import java.util.stream.Collectors;
+
+public class RedisKVProvider {
+    private static final Logger LOG = LoggerFactory.getLogger(RedisKVProvider.class);
+    private final RedisResourceProvider resource;
+    private static ObjectMapper objectMapper = new ObjectMapper();
+
+    public RedisKVProvider(RedisResourceProvider resourceProvider){
+        this.resource = resourceProvider;
+    }
+
+    public CompletableFuture<Boolean> contains(String namespace, String key) {
+        return resource.withJedis(redis -> redis.hexists(namespace, key));
+    }
+
+    public <T> CompletableFuture<Boolean> set(String namespace, String key, T value) {
+        return resource.withJedis(redis -> {
+            try {
+                redis.hset(namespace, key, objectMapper.writeValueAsString(value));
+                return true;
+            }
+            catch (Exception ex){
+                LOG.error("Failed to set value", ex);
+                return false;
+            }
+        });
+    }
+
+    public <T> CompletableFuture<Boolean> setMultiple(String namespace, Set<Pair<String, T>> pairs){
+        return resource.withJedis(redis -> {
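+            // Queue every HSET inside one MULTI/EXEC block so the whole batch travels in a single pipeline round trip.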
+            Pipeline pipe = redis.pipelined();
+            pipe.multi();
+            for(Pair<String, T> pair : pairs){
+                try {
+                    pipe.hset(namespace, pair.getKey(), objectMapper.writeValueAsString(pair.getValue()));
+                }
+                catch (Exception ex){
+                    LOG.error("Failed to set value", ex);
+                }
+            }
+
+            pipe.exec();
+            pipe.sync();
+            return true;
+        });
+    }
+
+    public <T> CompletableFuture<T> get(Class<T> klass, String namespace, String key) {
+        return resource.withJedis(redis -> {
+            String data = redis.hget(namespace, key);
+            return parseObj(data, klass);
+        });
+    }
+
+    private <T> T parseObj(String data, Class<T> klass){
+        try {
+            return objectMapper.readValue(data, klass);
+        }
+        catch(Exception ex){
+            return null;
+        }
+    }
+
+    public <T> CompletableFuture<Set<T>> getMultiple(Class<T> klass, String namespace, Set<String> keys) {
+        return resource
+                .withPipeline(pipe -> keys.stream().map(k -> pipe.hget(namespace, k)))
+                .thenApply(responses -> {
+                    return responses
+                            .map(data -> parseObj(data, klass))
+                            .filter(Objects::nonNull)
+                            .collect(Collectors.toSet());
+                });
+    }
+}
diff --git a/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/RedisProvider.java b/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/RedisProvider.java
new file mode 100644
index 0000000..3d089c4
--- /dev/null
+++ b/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/RedisProvider.java
@@ -0,0 +1,98 @@
+package lt.tokenmill.crawling.cache.providers.redis;
+
+import lt.tokenmill.crawling.cache.datamodel.Pair;
+import lt.tokenmill.crawling.cache.providers.CacheProvider;
+import redis.clients.jedis.JedisPool;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Set;
+import java.util.concurrent.CompletableFuture;
+import java.util.stream.Collectors;
+
+public class RedisProvider implements CacheProvider {
+    private final RedisResourceProvider resourceProvider;
+    private final RedisKVProvider kvProvider;
+    private final RedisSetProvider setProvider;
+
+    public static Builder builder(){
+        return new Builder();
+    }
+
+    protected RedisProvider(JedisPool pool){
+        this.resourceProvider = new RedisResourceProvider(pool);
+        this.kvProvider = new RedisKVProvider(resourceProvider);
+        this.setProvider = new RedisSetProvider(resourceProvider);
+    }
+
+    @Override
+    public <T> CompletableFuture<Boolean> set(String namespace, String key, T value) {
+        return kvProvider.set(namespace, key, value);
+    }
+
+    @Override
+    public <T> CompletableFuture<Boolean> setMultiple(String namespace, Set<Pair<String, T>> pairs) {
+        return kvProvider.setMultiple(namespace, pairs);
+    }
+
+
+    @Override
+    public <T> CompletableFuture<T> get(Class<T> klass, String namespace, String key) {
+        return kvProvider.get(klass, namespace, key);
+    }
+
+    @Override
+    public <T> CompletableFuture<Set<T>> getMultiple(Class<T> klass, String namespace, String... keys) {
+        return getMultiple(klass, namespace, Arrays.asList(keys));
+    }
+
+    @Override
+    public <T> CompletableFuture<Set<T>> getMultiple(Class<T> klass, String namespace, Collection<String> initialKeys) {
+        return kvProvider.getMultiple(klass, namespace, initialKeys.stream().collect(Collectors.toSet()));
+    }
+
+    @Override
+    public CompletableFuture<Boolean> addKey(String namespace, String key) {
+        return setProvider.addKey(namespace, key);
+    }
+
+    @Override
+    public CompletableFuture<Boolean> addKeyMultiple(String namespace, Collection<String> keys) {
+        return setProvider.addKeys(namespace, keys);
+    }
+
+    @Override
+    public CompletableFuture<Boolean> removeKey(String namespace, String key) {
+        return setProvider.removeKey(namespace, key);
+    }
+
+    @Override
+    public CompletableFuture<Boolean> moveKey(String source, String dest, String key) {
+        return setProvider.moveKey(source, dest, key);
+    }
+
+    @Override
+    public CompletableFuture<String> findKey(String namespace, String key) {
+        return setProvider.findKey(namespace, key);
+    }
+
+    @Override
+    public CompletableFuture<Boolean> contains(String key) {
+        return resourceProvider.withJedis(redis -> redis.exists(key));
+    }
+
+    @Override
+    public CompletableFuture<Boolean> contains(String namespace, String key) {
+        return kvProvider.contains(namespace, key);
+    }
+
+    @Override
+    public CompletableFuture<Set<String>> keysInNamespace(String namespace) {
+        return setProvider.keysInNamespace(namespace);
+    }
+
+    @Override
+    public void cleanup() {
+        resourceProvider.withJedis(redis -> redis.flushAll());
+    }
+}
diff --git a/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/RedisResourceProvider.java b/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/RedisResourceProvider.java
new file mode 100644
index 0000000..4ee670f
--- /dev/null
+++ b/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/RedisResourceProvider.java
@@ -0,0 +1,76 @@
+package lt.tokenmill.crawling.cache.providers.redis;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import redis.clients.jedis.*;
+import redis.clients.jedis.exceptions.JedisConnectionException;
+
+import java.util.List;
+import java.util.concurrent.CompletableFuture;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+public class RedisResourceProvider {
+    private final JedisPool jedisPool;
+    private static final Logger LOG = LoggerFactory.getLogger(RedisResourceProvider.class);
+    private final static int MAX_RETRIES = 5;
+    private final static long RETRY_TIMEOUT = 300;
+
+
+    protected RedisResourceProvider(JedisPool pool){
+        this.jedisPool = pool;
+    }
+
+    public <T> CompletableFuture<T> withJedis(Function<Jedis, T> body){
+        CompletableFuture<T> future = CompletableFuture.supplyAsync(() -> {
+            int retryCount = 0;
+
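+            // Retry transient connection failures with a linearly growing delay,
+            // rethrowing once MAX_RETRIES is exceeded.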
+            while(true) {
+                try (Jedis jedis = jedisPool.getResource()) {
+                    T result = body.apply(jedis);
+                    if(result != null){
+                        return result;
+                    }
+                    else {
+                        return null;
+                    }
+                } catch (JedisConnectionException cex){
+                    retryCount++;
+                    if(retryCount > MAX_RETRIES) {
+                        throw cex;
+                    }
+                    else {
+                        LOG.warn("Redis operation has failed due to connection issue. Retrying in {}ms", RETRY_TIMEOUT * retryCount);
+                        try {
+                            Thread.sleep(RETRY_TIMEOUT * retryCount);
+                        } catch (InterruptedException iex) {
+                            continue;
+                        }
+                    }
+
+                } catch (Exception ex) {
+                    throw ex;
+                }
+            }
+        });
+
+        return future;
+    }
+
+    public <T> CompletableFuture<Stream<T>> withPipeline(Function<Pipeline, Stream<Response<T>>> body){
+        return withJedis(redis -> {
+            Pipeline pipe = redis.pipelined();
+            pipe.multi();
+            List<Response<T>> results = body.apply(pipe).collect(Collectors.toList());
+
+            Response<List<Object>> resultsResponse = pipe.exec();
+            pipe.sync();
+
+            resultsResponse.get();
+            return results.stream().map(r -> r.get());
+        });
+    }
+}
diff --git a/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/RedisSetProvider.java b/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/RedisSetProvider.java
new file mode 100644
index 0000000..4db1a78
--- /dev/null
+++ b/cache/src/main/java/lt/tokenmill/crawling/cache/providers/redis/RedisSetProvider.java
@@ -0,0 +1,159 @@
+package lt.tokenmill.crawling.cache.providers.redis;
+
+import lt.tokenmill.crawling.cache.datamodel.Pair;
+import lt.tokenmill.crawling.cache.utils.FutureUtils;
+import lt.tokenmill.crawling.cache.utils.KeyUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import redis.clients.jedis.Jedis;
+import redis.clients.jedis.Pipeline;
+import redis.clients.jedis.Response;
+import redis.clients.jedis.ScanResult;
+
+import java.util.*;
+import java.util.concurrent.CompletableFuture;
+import java.util.stream.Collectors;
+
+public class RedisSetProvider {
+    private static final Logger LOG = LoggerFactory.getLogger(RedisSetProvider.class);
+    private final RedisResourceProvider resource;
+
+    public RedisSetProvider(RedisResourceProvider resourceProvider){
+        this.resource = resourceProvider;
+    }
+
+    public CompletableFuture<Boolean> addKey(String namespace, String key) {
+        return addKeys(namespace, Arrays.asList(key));
+    }
+
+    public CompletableFuture<Boolean> addKeys(String namespace, Collection<String> keys){
+        return resource.withJedis(redis -> {
+            Pipeline pipe = redis.pipelined();
+
+            pipe.multi();
+            for(String key : keys){
+                pipe.sadd(namespace, key);
+            }
+
+            pipe.exec();
+            pipe.sync();
+            return true;
+        });
+    }
+
+    public CompletableFuture<Boolean> removeKey(String namespace, String key) {
+        return resource.withJedis(redis -> {
+            Set<String> keys = KeyUtils.resolveKeys(namespace, redis);
+
+            Pipeline pipe = redis.pipelined();
+            pipe.multi();
+            List<Response<Long>> responses = keys
+                    .stream()
+                    .map(k -> pipe.srem(k, key))
+                    .collect(Collectors.toList());
+
+            pipe.exec();
+            pipe.sync();
+
+            responses.forEach(response -> response.get());
+            return true;
+        });
+    }
+
+    public CompletableFuture<Boolean> moveKey(String source, String dest, String key){
+        return resource.withJedis(redis -> redis.smove(source, dest, key) == 1);
+    }
+
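+    // Returns the name of the first set containing the key, or "-1" when none does.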
+    public CompletableFuture<String> findKey(String namespace, String key){
+        return resource.withJedis(redis -> {
+            Set<String> keys = KeyUtils.resolveKeys(namespace, redis);
+
+            Pipeline pipe = redis.pipelined();
+            pipe.multi();
+            List<Pair<String, Response<Boolean>>> responses = keys
+                    .stream()
+                    .map(k -> new Pair<String, Response<Boolean>>(k, pipe.sismember(k, key)))
+                    .collect(Collectors.toList());
+
+            pipe.exec();
+            pipe.sync();
+
+            Optional<String> result = responses
+                    .stream()
+                    .filter(pair -> pair.getValue().get())
+                    .map(pair -> pair.getKey())
+                    .findFirst();
+
+            if(result.isPresent()){
+                return result.get();
+            }
+            else{
+                return "-1";
+            }
+        });
+    }
+
+    public CompletableFuture<Set<String>> keysInNamespace(String namespace) {
+        Set<String> keys = FutureUtils.tryGet(resource.withJedis(redis -> KeyUtils.resolveKeys(namespace, redis)), () -> Collections.emptySet());
+        if(keys.size() == 0){
+            return CompletableFuture.completedFuture(Collections.emptySet());
+        }
+
+        return resource
+                .withPipeline(pipe -> keys.stream().map(k -> pipe.smembers(k)))
+                .thenApply(results -> results.flatMap(Collection::stream).collect(Collectors.toSet()));
+    }
+
+    private Long countKeysInNamespaces(Collection<String> namespaces, Jedis redis){
+        Pipeline pipe = redis.pipelined();
+        pipe.multi();
+        List<Response<Long>> responses = namespaces.stream().map(k -> pipe.scard(k)).collect(Collectors.toList());
+
+        pipe.exec();
+        pipe.sync();
+
+        return responses
+                .stream()
+                .map(response -> response.get())
+                .filter(Objects::nonNull)
+                .reduce(0L, Long::sum);
+    }
+
+    public CompletableFuture<Long> countKeysInNamespace(String namespace) {
+        return resource.withJedis(redis -> {
+            Set<String> keys = redis.keys(namespace);
+            if(keys.size() == 0){
+                return 0L;
+            }
+
+            return countKeysInNamespaces(keys, redis);
+        });
+    }
+
+    public CompletableFuture<Long> countKeysInNamespaces(Collection<String> namespaces) {
+        return resource.withJedis(redis -> countKeysInNamespaces(namespaces, redis));
+    }
+
+    public CompletableFuture<Boolean> removeSet(String namespace) {
+        return resource.withJedis(redis -> {
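+            // Drain the set with SSCAN, removing members batch by batch;
+            // the scan is complete once the cursor returns to "0".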
+ String cursor = "0";
+ while(true){
+ ScanResult result = redis.sscan(namespace, cursor);
+ cursor = result.getCursor();
+ List members = result.getResult();
+ redis.srem(namespace, members.toArray(new String[0]));
+
+ if(cursor.equalsIgnoreCase("0")){
+ break;
+ }
+ }
+
+ return true;
+ });
+ }
+}
diff --git a/cache/src/main/java/lt/tokenmill/crawling/cache/utils/FutureUtils.java b/cache/src/main/java/lt/tokenmill/crawling/cache/utils/FutureUtils.java
new file mode 100644
index 0000000..d83a55e
--- /dev/null
+++ b/cache/src/main/java/lt/tokenmill/crawling/cache/utils/FutureUtils.java
@@ -0,0 +1,49 @@
+package lt.tokenmill.crawling.cache.utils;
+
+import java.util.concurrent.Callable;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.Future;
+
+public class FutureUtils {
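+    // Blocks on the future and falls back to the supplied callable when the
+    // result is null or the future completes exceptionally.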
+    public static <T> T tryGet(Future<T> future, Callable<T> onFail){
+        try{
+            T result = future.get();
+            if(result != null){
+                return result;
+            }
+            else {
+                return onFail.call();
+            }
+        } catch (Exception ex){
+            ex.printStackTrace();
+            try {
+                return onFail.call();
+            } catch (Exception ex2){
+                ex2.printStackTrace();
+                return null;
+            }
+        }
+    }
+
+    public static <T> CompletableFuture<T> transformFuture(Future<T> future){
+        CompletableFuture<T> newFuture = new CompletableFuture<>();
+        try{
+            newFuture.complete(future.get());
+        } catch(Exception ex){
+            newFuture.completeExceptionally(ex);
+        }
+        return newFuture;
+    }
+
+    public static boolean waitFor(Future<?> future){
+        try{
+            future.get();
+            return true;
+        } catch (Exception ex){
+            ex.printStackTrace();
+            return false;
+        }
+    }
+}
\ No newline at end of file
diff --git a/cache/src/main/java/lt/tokenmill/crawling/cache/utils/HashUtils.java b/cache/src/main/java/lt/tokenmill/crawling/cache/utils/HashUtils.java
new file mode 100644
index 0000000..3d8d8ec
--- /dev/null
+++ b/cache/src/main/java/lt/tokenmill/crawling/cache/utils/HashUtils.java
@@ -0,0 +1,21 @@
+package lt.tokenmill.crawling.cache.utils;
+
+import org.apache.commons.codec.digest.MurmurHash3;
+
+import java.util.Arrays;
+import java.util.stream.Collectors;
+
+public class HashUtils {
+    public static String hashKey(String url){
+        // Lower 64 bits of the 128-bit MurmurHash3 of the URL
+        return String.valueOf(MurmurHash3.hash128x64(url.getBytes())[0]);
+    }
+
+    public static long hashKey(String... s){
+        return MurmurHash3.hash128x64(Arrays.asList(s).stream().collect(Collectors.joining("#")).getBytes())[0];
+    }
+
+    public static long domainKey(String caseId, String domain){
+        return hashKey(caseId, domain);
+    }
+}
diff --git a/cache/src/main/java/lt/tokenmill/crawling/cache/utils/KeyUtils.java b/cache/src/main/java/lt/tokenmill/crawling/cache/utils/KeyUtils.java
new file mode 100644
index 0000000..0cae443
--- /dev/null
+++ b/cache/src/main/java/lt/tokenmill/crawling/cache/utils/KeyUtils.java
@@ -0,0 +1,24 @@
+package lt.tokenmill.crawling.cache.utils;
+
+import redis.clients.jedis.Jedis;
+
+import java.util.Arrays;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+public class KeyUtils {
+    public static boolean isWildCard(String key){
+        return key.contains("*");
+    }
+
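+    // Expands a wildcard pattern to all matching Redis keys via KEYS;
+    // a literal key is returned as a singleton set.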
+    public static Set<String> resolveKeys(String key, Jedis jedis){
+        if(isWildCard(key)){
+            return jedis.keys(key);
+        }
+        else{
+            return Arrays.asList(key).stream().collect(Collectors.toSet());
+        }
+    }
+}
diff --git a/crawler/pom.xml b/crawler/pom.xml
index 320d8f8..89e2bf0 100644
--- a/crawler/pom.xml
+++ b/crawler/pom.xml
@@ -5,7 +5,7 @@
    <parent>
        <artifactId>crawling-framework</artifactId>
        <groupId>lt.tokenmill.crawling</groupId>
-        <version>0.3.4-SNAPSHOT</version>
+        <version>0.3.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
@@ -73,6 +73,11 @@
            <version>4.13.1</version>
            <scope>test</scope>
        </dependency>
+        <dependency>
+            <groupId>lt.tokenmill.crawling</groupId>
+            <artifactId>cache</artifactId>
+            <version>0.3.5-SNAPSHOT</version>
+        </dependency>
diff --git a/crawler/src/main/java/lt/tokenmill/crawling/crawler/DefaultServiceProvider.java b/crawler/src/main/java/lt/tokenmill/crawling/crawler/DefaultServiceProvider.java
index 9982445..30d7903 100644
--- a/crawler/src/main/java/lt/tokenmill/crawling/crawler/DefaultServiceProvider.java
+++ b/crawler/src/main/java/lt/tokenmill/crawling/crawler/DefaultServiceProvider.java
@@ -2,6 +2,10 @@
import com.digitalpebble.stormcrawler.util.ConfUtils;
import com.google.common.collect.Maps;
+import lt.tokenmill.crawling.cache.CacheConstants;
+import lt.tokenmill.crawling.cache.UrlProcessingCache;
+import lt.tokenmill.crawling.cache.providers.CacheProvider;
+import lt.tokenmill.crawling.cache.providers.redis.RedisProvider;
import lt.tokenmill.crawling.es.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -49,4 +53,15 @@ public EsDocumentOperations creatEsDocumentOperations(Map conf) {
return EsDocumentOperations.getInstance(connection, docsIndexName, docsDocumentType);
}
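+ // Builds the Redis-backed URL processing cache from topology configuration; defaults to localhost:6379 with no auth.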
+ @Override
+ public UrlProcessingCache createUrlProcessingCache(Map conf) {
+ CacheProvider provider = RedisProvider.builder()
+ .withHost(ConfUtils.getString(conf, CacheConstants.REDIS_HOST, "localhost"))
+ .withPort(ConfUtils.getInt(conf, CacheConstants.REDIS_PORT, 6379))
+ .withAuth(ConfUtils.getString(conf, CacheConstants.REDIS_AUTH, null))
+ .build();
+ return new UrlProcessingCache(provider);
+ }
+
}
diff --git a/crawler/src/main/java/lt/tokenmill/crawling/crawler/ServiceProvider.java b/crawler/src/main/java/lt/tokenmill/crawling/crawler/ServiceProvider.java
index a87b498..573d25f 100644
--- a/crawler/src/main/java/lt/tokenmill/crawling/crawler/ServiceProvider.java
+++ b/crawler/src/main/java/lt/tokenmill/crawling/crawler/ServiceProvider.java
@@ -1,5 +1,6 @@
package lt.tokenmill.crawling.crawler;
+import lt.tokenmill.crawling.cache.UrlProcessingCache;
import lt.tokenmill.crawling.es.EsDocumentOperations;
import lt.tokenmill.crawling.es.EsHttpSourceOperations;
import lt.tokenmill.crawling.es.EsHttpUrlOperations;
@@ -16,4 +17,6 @@ public interface ServiceProvider {
EsHttpSourceOperations createEsHttpSourceOperations(Map conf);
EsDocumentOperations creatEsDocumentOperations(Map conf);
+
+ UrlProcessingCache createUrlProcessingCache(Map conf);
}
diff --git a/crawler/src/main/java/lt/tokenmill/crawling/crawler/bolt/StatusUpdaterBolt.java b/crawler/src/main/java/lt/tokenmill/crawling/crawler/bolt/StatusUpdaterBolt.java
index d84c298..7e1fbb7 100644
--- a/crawler/src/main/java/lt/tokenmill/crawling/crawler/bolt/StatusUpdaterBolt.java
+++ b/crawler/src/main/java/lt/tokenmill/crawling/crawler/bolt/StatusUpdaterBolt.java
@@ -44,6 +44,8 @@ public void store(String url, Status status, Metadata metadata, Date nextFetch)
String filtered = filters.filter(url);
if (isSeed || (filtered == null && status.equals(Status.DISCOVERED))) {
LOG.debug("Url '{}' is seed or rejected by filters", url);
+ esUrlsOperations.upsertUrlStatus(url, null, source, false, Status.FETCH_ERROR);
return;
}
diff --git a/crawler/src/main/java/lt/tokenmill/crawling/crawler/spout/UrlGeneratorSpout.java b/crawler/src/main/java/lt/tokenmill/crawling/crawler/spout/UrlGeneratorSpout.java
index 259cb3f..1ecb072 100644
--- a/crawler/src/main/java/lt/tokenmill/crawling/crawler/spout/UrlGeneratorSpout.java
+++ b/crawler/src/main/java/lt/tokenmill/crawling/crawler/spout/UrlGeneratorSpout.java
@@ -5,11 +5,14 @@
import lt.tokenmill.crawling.crawler.CrawlerConstants;
import lt.tokenmill.crawling.crawler.ServiceProvider;
import lt.tokenmill.crawling.crawler.utils.PrioritizedSource;
+import lt.tokenmill.crawling.crawler.utils.UrlFiltersCache;
import lt.tokenmill.crawling.data.DataUtils;
import lt.tokenmill.crawling.data.HttpSource;
import lt.tokenmill.crawling.data.HttpUrl;
import lt.tokenmill.crawling.es.EsHttpSourceOperations;
+import lt.tokenmill.crawling.es.EsHttpSourcesCache;
import lt.tokenmill.crawling.es.EsHttpUrlOperations;
+import lt.tokenmill.crawling.parser.urls.UrlFilters;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
@@ -144,6 +147,7 @@ private HttpSourceConfiguration getConfiguration() {
if (configuration == null || HttpSourceConfiguration.needsReload()) {
LOG.info("Loading HTTP sources");
List<HttpSource> sources = esSourceOperations.findEnabledSources();
+ LOG.info("Have {} enabled sources", sources.size());
configuration = HttpSourceConfiguration.reload(configuration, sources);
return configuration;
}
diff --git a/data-model/pom.xml b/data-model/pom.xml
index 3038c40..93f1f2b 100644
--- a/data-model/pom.xml
+++ b/data-model/pom.xml
@@ -5,7 +5,7 @@
    <parent>
        <artifactId>crawling-framework</artifactId>
        <groupId>lt.tokenmill.crawling</groupId>
-        <version>0.3.4-SNAPSHOT</version>
+        <version>0.3.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
diff --git a/elasticsearch/pom.xml b/elasticsearch/pom.xml
index 9efe894..2485e99 100644
--- a/elasticsearch/pom.xml
+++ b/elasticsearch/pom.xml
@@ -5,13 +5,12 @@
    <parent>
        <artifactId>crawling-framework</artifactId>
        <groupId>lt.tokenmill.crawling</groupId>
-        <version>0.3.4-SNAPSHOT</version>
+        <version>0.3.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

    <artifactId>elasticsearch</artifactId>
-
        <groupId>lt.tokenmill.crawling</groupId>
@@ -35,26 +34,6 @@
            <artifactId>elasticsearch-rest-high-level-client</artifactId>
            <version>${elasticsearch.version}</version>
        </dependency>
-        <dependency>
-            <groupId>org.apache.httpcomponents</groupId>
-            <artifactId>httpasyncclient</artifactId>
-            <version>4.1.3</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.httpcomponents</groupId>
-            <artifactId>httpcore-nio</artifactId>
-            <version>4.4.6</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.httpcomponents</groupId>
-            <artifactId>httpclient</artifactId>
-            <version>4.5.4</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.httpcomponents</groupId>
-            <artifactId>httpcore</artifactId>
-            <version>4.4.6</version>
-        </dependency>
        <dependency>
            <groupId>org.elasticsearch.plugin</groupId>
            <artifactId>transport-netty4-client</artifactId>
@@ -89,6 +68,12 @@
            <version>4.13.1</version>
            <scope>test</scope>
        </dependency>
+
+        <dependency>
+            <groupId>org.apache.httpcomponents</groupId>
+            <artifactId>httpcore</artifactId>
+            <version>4.4.14</version>
+        </dependency>
    </dependencies>
@@ -115,4 +100,5 @@
+
\ No newline at end of file
diff --git a/elasticsearch/src/main/java/lt/tokenmill/crawling/es/BaseElasticOps.java b/elasticsearch/src/main/java/lt/tokenmill/crawling/es/BaseElasticOps.java
index 0bc532f..1583fc5 100644
--- a/elasticsearch/src/main/java/lt/tokenmill/crawling/es/BaseElasticOps.java
+++ b/elasticsearch/src/main/java/lt/tokenmill/crawling/es/BaseElasticOps.java
@@ -31,10 +31,6 @@ protected String getIndex() {
return index;
}
- protected String getType() {
- return type;
- }
-
protected RequestOptions getRequestOptions() { return requestOptions; }
public void close() {
diff --git a/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsDocumentOperations.java b/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsDocumentOperations.java
index 6d13199..be1a07e 100644
--- a/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsDocumentOperations.java
+++ b/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsDocumentOperations.java
@@ -90,7 +90,6 @@ public PageableList query(NamedQuery... queries) {
.sort(PUBLISHED_FIELD, SortOrder.DESC)
.size(100).query(query);
SearchRequest searchRequest = new SearchRequest(getIndex())
- .types(getType())
.source(searchSourceBuilder);
SearchResponse response = getConnection().getRestHighLevelClient()
.search(searchRequest, getRequestOptions());
@@ -148,7 +147,6 @@ public PageableList query(List included, Li
.field("text.stem_ci"))
.sort(PUBLISHED_FIELD, SortOrder.DESC);
SearchRequest searchRequest = new SearchRequest(getIndex())
- .types(getType())
.source(sourceBuilder);
SearchResponse response = getConnection().getRestHighLevelClient()
.search(searchRequest, getRequestOptions());
@@ -193,7 +191,8 @@ public void store(HttpArticle article, Map fields) {
jsonBuilder.startObject();
applyFields(jsonBuilder, article, fields);
jsonBuilder.endObject();
- IndexRequest indexRequest = new IndexRequest(getIndex(), getType(), formatId(article.getUrl()))
+ IndexRequest indexRequest = new IndexRequest(getIndex())
+ .id(formatId(article.getUrl()))
.source(jsonBuilder);
getConnection().getProcessor().add(indexRequest);
} catch (IOException e) {
@@ -216,7 +215,7 @@ public void update(HttpArticle article, Map fields) {
XContentBuilder upsertDoc = jsonBuilder().startObject();
applyFields(upsertDoc, article, fields);
upsertDoc.endObject();
- UpdateRequest upsert = new UpdateRequest(getIndex(), getType(), id)
+ UpdateRequest upsert = new UpdateRequest(getIndex(), id)
.doc(update)
.upsert(upsertDoc);
getConnection().getProcessor().add(upsert);
@@ -231,7 +230,7 @@ public HttpArticle get(String url) {
public Map getAsMap(String url) {
try {
- GetRequest getRequest = new GetRequest(getIndex(), getType(), formatId(url))
+ GetRequest getRequest = new GetRequest(getIndex(), formatId(url))
.fetchSourceContext(new FetchSourceContext(true));
GetResponse response = getConnection().getRestHighLevelClient().get(getRequest, getRequestOptions());
if (response.isExists()) {
@@ -268,7 +267,6 @@ public HttpArticle findDuplicate(HttpArticle article) {
.fetchSource(true);
SearchRequest searchRequest = new SearchRequest(getIndex())
- .types(getType())
.searchType(SearchType.DEFAULT)
.source(searchSourceBuilder);
try {
@@ -294,7 +292,6 @@ public List calculateStats(String sourceUrl) {
.must(QueryBuilders.termQuery(SOURCE_FIELD, sourceUrl));
SearchRequest searchRequest = new SearchRequest(getIndex())
- .types(getType())
.searchType(SearchType.DEFAULT)
.source(new SearchSourceBuilder()
.query(filter)
diff --git a/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpSourceOperations.java b/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpSourceOperations.java
index 2efd8c2..da29c0f 100644
--- a/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpSourceOperations.java
+++ b/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpSourceOperations.java
@@ -63,7 +63,6 @@ public List findEnabledSources() {
.fetchSource(true)
.explain(false);
SearchRequest searchRequest = new SearchRequest(getIndex())
- .types(getType())
.searchType(SearchType.DEFAULT)
.source(searchSourceBuilder);
SearchResponse response = getConnection().getRestHighLevelClient().search(searchRequest, getRequestOptions());
@@ -74,23 +73,30 @@ public List findEnabledSources() {
.map(this::mapToHttpSource)
.collect(Collectors.toList());
} catch (ElasticsearchStatusException e) {
+ LOG.error("Failed while searching for enabled sources", e);
} catch (Exception e) {
e.printStackTrace();
+ LOG.error("Failed while searching for enabled sources", e);
}
return Collections.emptyList();
}
public HttpSource get(String url) {
try {
- GetRequest getRequest = new GetRequest(getIndex(), getType(), formatId(url))
- .fetchSourceContext(new FetchSourceContext(true));
+ GetRequest getRequest = new GetRequest(getIndex(), /*formatId(url)*/url)
+ .fetchSourceContext(FetchSourceContext.FETCH_SOURCE);
GetResponse response = getConnection().getRestHighLevelClient().get(getRequest, getRequestOptions());
if (response.isExists()) {
return mapToHttpSource(response.getSource());
}
+ else {
+ LOG.error("No response for HTTP Source id: '{}'", response.getId());
+ }
} catch (ElasticsearchStatusException e) {
+ LOG.error("Failed to fetch HTTP Source for {}", url, e);
} catch (Exception e) {
e.printStackTrace();
+ LOG.error("Failed to fetch HTTP Source for {}", url, e);
}
return null;
}
@@ -123,7 +129,6 @@ public PageableList filter(String text, int offset) {
.explain(false);
SearchRequest searchRequest = new SearchRequest(getIndex())
- .types(getType())
.searchType(SearchType.DEFAULT)
.source(searchSourceBuilder);
@@ -158,7 +163,6 @@ public List all() {
.query(filter);
SearchRequest searchRequest = new SearchRequest(getIndex())
- .types(getType())
.scroll(keepAlive)
.source(searchSourceBuilder);
@@ -212,7 +216,7 @@ public void save(HttpSource source) {
.field("date_formats", Utils.listToText(source.getDateFormats()))
.field("updated", new Date())
.endObject();
- IndexRequest indexRequest = new IndexRequest(getIndex(), getType(), formatId(url))
+ IndexRequest indexRequest = new IndexRequest(getIndex()).id(formatId(url))
.source(xContentBuilder)
.setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE);
getConnection().getRestHighLevelClient().index(indexRequest, getRequestOptions());
@@ -225,7 +229,7 @@ public void save(HttpSource source) {
public void delete(String url) {
if (url != null) {
try {
- DeleteRequest deleteRequest = new DeleteRequest(getIndex(), getType(), formatId(url))
+ DeleteRequest deleteRequest = new DeleteRequest(getIndex(), formatId(url))
.setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE);
getConnection().getRestHighLevelClient().delete(deleteRequest, getRequestOptions());
} catch (IOException e) {
@@ -243,7 +247,6 @@ public void deleteAll() {
.fetchSource(true)
.explain(false);
SearchRequest searchRequest = new SearchRequest(getIndex())
- .types(getType())
.scroll(keepAlive)
.source(searchSourceBuilder);
SearchResponse response = getConnection().getRestHighLevelClient()
diff --git a/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpSourceTestOperations.java b/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpSourceTestOperations.java
index 44beb45..4570e09 100644
--- a/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpSourceTestOperations.java
+++ b/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpSourceTestOperations.java
@@ -72,7 +72,6 @@ public PageableList filter(String prefix, int offset) {
.query(filter);
SearchRequest searchRequest = new SearchRequest(getIndex())
- .types(getType())
.source(searchSourceBuilder);
SearchResponse response = getConnection().getRestHighLevelClient().search(searchRequest, getRequestOptions());
List items = Arrays.stream(response.getHits().getHits())
@@ -90,7 +89,7 @@ public PageableList filter(String prefix, int offset) {
public HttpSourceTest get(String url) {
try {
- GetRequest getRequest = new GetRequest(getIndex(), getType(), formatId(url))
+ GetRequest getRequest = new GetRequest(getIndex(), formatId(url))
.fetchSourceContext(new FetchSourceContext(true));
GetResponse response = getConnection().getRestHighLevelClient().get(getRequest, getRequestOptions());
if (response.isExists()) {
@@ -115,7 +114,6 @@ public List all() {
.query(filter)
.size(100);
SearchRequest searchRequest = new SearchRequest(getIndex())
- .types(getType())
.source(searchSourceBuilder)
.scroll(keepAlive);
@@ -155,7 +153,8 @@ public void save(HttpSourceTest hst) {
.field("date", hst.getDate() != null ? hst.getDate().trim() : null)
.field("updated", new Date())
.endObject();
- IndexRequest indexRequest = new IndexRequest(getIndex(), getType(), formatId(hst.getUrl()))
+ IndexRequest indexRequest = new IndexRequest(getIndex())
+ .id(formatId(hst.getUrl()))
.source(contentBuilder)
.setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE);
getConnection().getRestHighLevelClient().index(indexRequest, getRequestOptions());
@@ -168,7 +167,7 @@ public void save(HttpSourceTest hst) {
public void delete(String url) {
if (url != null) {
try {
- DeleteRequest deleteRequest = new DeleteRequest(getIndex(), getType(), formatId(url))
+ DeleteRequest deleteRequest = new DeleteRequest(getIndex(), formatId(url))
.setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE);
DeleteResponse delete = getConnection().getRestHighLevelClient().delete(deleteRequest, getRequestOptions());
LOG.debug("Delete HttpSourceTest url {} with response status {}", url, delete.status());
@@ -182,7 +181,6 @@ public void deleteAll() {
try {
TimeValue keepAlive = TimeValue.timeValueMinutes(10);
SearchRequest searchRequest = new SearchRequest(getIndex())
- .types(getType())
.scroll(keepAlive)
.source(new SearchSourceBuilder()
.size(100)
diff --git a/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpUrlOperations.java b/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpUrlOperations.java
index e55f4af..beafd1d 100644
--- a/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpUrlOperations.java
+++ b/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsHttpUrlOperations.java
@@ -57,7 +57,8 @@ public void upsertUrlStatus(String url, String published, String source, boolean
.field("published", published)
.field("status", status)
.endObject();
- IndexRequest indexRequest = new IndexRequest(getIndex(), getType(), id)
+ IndexRequest indexRequest = new IndexRequest(getIndex())
+ .id(id)
.source(insert)
.create(create);
@@ -70,7 +71,7 @@ public void upsertUrlStatus(String url, String published, String source, boolean
.field("published", published)
.field("status", status)
.endObject();
- UpdateRequest upsert = new UpdateRequest(getIndex(), getType(), id)
+ UpdateRequest upsert = new UpdateRequest(getIndex(), id)
.doc(update)
.upsert(indexRequest);
getConnection().getProcessor().add(upsert);
@@ -97,7 +98,6 @@ public List findUrlsByStatusAndSource(String status, String source, int
.query(filter)
.sort("created", SortOrder.DESC);
SearchRequest searchRequest = new SearchRequest(getIndex())
- .types(getType())
.searchType(SearchType.DEFAULT)
.source(searchSourceBuilder);
@@ -138,7 +138,6 @@ public List calculateStats(String sourceUrl) {
.format("yyyy-MM-dd")
.dateHistogramInterval(DateHistogramInterval.DAY));
SearchRequest searchRequest = new SearchRequest(getIndex())
- .types(getType())
.searchType(SearchType.DEFAULT)
.source(searchSourceBuilder);
SearchResponse response = getConnection().getRestHighLevelClient().search(searchRequest, getRequestOptions());
diff --git a/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsNamedQueryOperations.java b/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsNamedQueryOperations.java
index e78de24..dcf00e2 100644
--- a/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsNamedQueryOperations.java
+++ b/elasticsearch/src/main/java/lt/tokenmill/crawling/es/EsNamedQueryOperations.java
@@ -63,7 +63,6 @@ public PageableList filter(String prefix) {
.size(100)
.query(filter);
SearchRequest searchRequest = new SearchRequest(getIndex())
- .types(getType())
.source(searchSourceBuilder);
SearchResponse response = getConnection().getRestHighLevelClient()
@@ -91,7 +90,6 @@ public List suggest(String prefix) {
.fetchSource(true)
.suggest(new SuggestBuilder().addSuggestion("suggestion", suggestionBuilder));
SearchRequest searchRequest = new SearchRequest(getIndex())
- .types(getType())
.source(sourceRequestBuilder);
SearchResponse response = getConnection().getRestHighLevelClient()
.search(searchRequest, getRequestOptions());
@@ -111,7 +109,7 @@ public List suggest(String prefix) {
public NamedQuery get(String name) {
try {
GetResponse response = getConnection().getRestHighLevelClient()
- .get(new GetRequest(getIndex(), getType(), formatId(name))
+ .get(new GetRequest(getIndex(), formatId(name))
.fetchSourceContext(new FetchSourceContext(true)), getRequestOptions());
if (response.isExists()) {
return mapToNamedQuery(response.getSource());
@@ -126,7 +124,6 @@ public List all() {
try {
TimeValue keepAlive = TimeValue.timeValueMinutes(10);
SearchRequest searchRequest = new SearchRequest(getIndex())
- .types(getType())
.scroll(keepAlive)
.source(new SearchSourceBuilder().query(QueryBuilders.matchAllQuery()));
SearchResponse response = getConnection().getRestHighLevelClient()
@@ -164,7 +161,8 @@ public void save(NamedQuery nq) {
.field("advanced", nq.getAdvanced())
.field("updated", new Date())
.endObject();
- IndexRequest indexRequest = new IndexRequest(getIndex(), getType(), formatId(nq.getName()))
+ IndexRequest indexRequest = new IndexRequest(getIndex())
+ .id(formatId(nq.getName()))
.source(contentBuilder)
.setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE);
getConnection().getRestHighLevelClient().index(indexRequest, getRequestOptions());
@@ -177,7 +175,7 @@ public void save(NamedQuery nq) {
public void delete(NamedQuery nq) {
if (nq != null && nq.getName() != null) {
try {
- DeleteRequest deleteRequest = new DeleteRequest(getIndex(), getType(), formatId(nq.getName()))
+ DeleteRequest deleteRequest = new DeleteRequest(getIndex(), formatId(nq.getName()))
.setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE);
getConnection().getRestHighLevelClient().delete(deleteRequest, getRequestOptions());
} catch (IOException e) {
@@ -191,7 +189,6 @@ public void deleteAll() {
TimeValue keepAlive = TimeValue.timeValueMinutes(10);
SearchRequest searchRequest = new SearchRequest(getIndex())
.scroll(keepAlive)
- .types(getType())
.source(new SearchSourceBuilder()
.size(100)
.explain(false)
diff --git a/page-analyzer/pom.xml b/page-analyzer/pom.xml
index e2b3f0b..7d6c292 100644
--- a/page-analyzer/pom.xml
+++ b/page-analyzer/pom.xml
@@ -5,7 +5,7 @@
    <parent>
        <artifactId>crawling-framework</artifactId>
        <groupId>lt.tokenmill.crawling</groupId>
-        <version>0.3.4-SNAPSHOT</version>
+        <version>0.3.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
diff --git a/parser/pom.xml b/parser/pom.xml
index 161fecf..5f1a160 100644
--- a/parser/pom.xml
+++ b/parser/pom.xml
@@ -5,7 +5,7 @@
    <parent>
        <artifactId>crawling-framework</artifactId>
        <groupId>lt.tokenmill.crawling</groupId>
-        <version>0.3.4-SNAPSHOT</version>
+        <version>0.3.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
diff --git a/pom.xml b/pom.xml
index cb6f367..15d0b4c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -5,7 +5,7 @@
    <groupId>lt.tokenmill.crawling</groupId>
    <artifactId>crawling-framework</artifactId>
    <packaging>pom</packaging>
-    <version>0.3.4-SNAPSHOT</version>
+    <version>0.3.5-SNAPSHOT</version>
    <name>Crawling Framework</name>
    <description>Framework to simplify news crawling</description>
    <url>https://github.com/tokenmill/crawling-framework</url>
@@ -128,7 +128,7 @@
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-        <elasticsearch.version>7.11.2</elasticsearch.version>
+        <elasticsearch.version>7.12.0</elasticsearch.version>
    1.1.3
    1.5.1
    7.7.5
@@ -150,6 +150,7 @@
        <module>administration-ui</module>
        <module>analysis-ui</module>
        <module>ui-commons</module>
+        <module>cache</module>
    </modules>
@@ -226,6 +227,17 @@
                <artifactId>jsonld-java</artifactId>
                <version>${jsonld.version}</version>
            </dependency>
+
+            <dependency>
+                <groupId>org.apache.httpcomponents</groupId>
+                <artifactId>httpclient</artifactId>
+                <version>4.5.10</version>
+            </dependency>
+            <dependency>
+                <groupId>org.apache.httpcomponents</groupId>
+                <artifactId>httpcore</artifactId>
+                <version>4.4.14</version>
+            </dependency>
diff --git a/ui-commons/pom.xml b/ui-commons/pom.xml
index a2b753d..35b2442 100644
--- a/ui-commons/pom.xml
+++ b/ui-commons/pom.xml
@@ -5,7 +5,7 @@
    <parent>
        <artifactId>crawling-framework</artifactId>
        <groupId>lt.tokenmill.crawling</groupId>
-        <version>0.3.4-SNAPSHOT</version>
+        <version>0.3.5-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>