diff --git a/.github/workflows/cypress-integration.yml b/.github/workflows/cypress-integration.yml index 21582a9c0..a82b2d46d 100644 --- a/.github/workflows/cypress-integration.yml +++ b/.github/workflows/cypress-integration.yml @@ -16,20 +16,21 @@ jobs: - uses: actions/checkout@v2 with: repository: conveyal/analysis-ui - ref: dev + ref: a869cd11919343a163e110e812b5d27f3a4ad4c8 path: ui - uses: actions/checkout@v2 with: fetch-depth: 0 path: r5 - # Build .jar and copy to ./ui directory + # Build .jar and copy to ./ui directory (along with config file) - uses: actions/setup-java@v1 with: java-version: 11 - run: gradle shadowJar -x test working-directory: r5 - run: cp $(ls ./r5/build/libs/*-all.jar | head -n1) ./ui/latest.jar + - run: cp ./r5/analysis.properties.template ./ui/analysis.properties # Install / cache dependencies with Cypress to handle caching Cypress binary. - uses: actions/setup-node@v2 @@ -38,6 +39,7 @@ jobs: - uses: cypress-io/github-action@v2 env: NEXT_PUBLIC_BASEMAP_DISABLED: true + NEXT_PUBLIC_CYPRESS: true NEXT_PUBLIC_MAPBOX_ACCESS_TOKEN: ${{ secrets.MAPBOX_ACCESS_TOKEN }} with: build: yarn build diff --git a/.github/workflows/gradle.yml b/.github/workflows/gradle.yml index dc7055dbb..22691e6d1 100644 --- a/.github/workflows/gradle.yml +++ b/.github/workflows/gradle.yml @@ -26,17 +26,12 @@ jobs: # image: mherwig/docker-alpine-java-mongo:latest # env: # BUILD_TARGET:staging - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - GPR_USERNAME: ${{ secrets.GPR_USERNAME }} - GPR_TOKEN: ${{ secrets.GPR_TOKEN }} steps: # Starting in v2.2 checkout action fetches all tags when fetch-depth=0, for auto-versioning. - uses: actions/checkout@v2.3.2 with: fetch-depth: 0 - # Java setup step completes very fast, no need to run in a preconfigured docker container + # Java setup step completes very fast, no need to run in a preconfigured docker container. - name: Set up JDK 11 uses: actions/setup-java@v1 with: @@ -46,29 +41,16 @@ jobs: with: path: ~/.gradle/caches key: gradle-caches + - name: Show version string + run: gradle -q printVersion | head -n1 - name: Build and Test run: gradle build - - name: Ensure shadow JAR is runnable as backend + - name: Ensure shadow JAR is runnable as local backend run: | cp analysis.properties.template analysis.properties gradle testShadowJarRunnable - name: Publish to GH Packages + # Supply access token to build.gradle (used in publishing.repositories.maven.credentials) + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: gradle publish - - name: Upload to S3 - # Use git describe to get a similar string to the Gradle project version (possibly missing .dirty). - run: | - VERSION=$(gradle -q printVersion | head -n1) - LOCAL_FILE=$(ls build/libs/*-all.jar | head -n1) - aws s3 cp --no-progress --region eu-west-1 $LOCAL_FILE s3://r5-builds/${VERSION}.jar - - name: Publish Docker image to GH Container Registry - run: | - echo $GPR_TOKEN | docker login ghcr.io -u $GPR_USERNAME --password-stdin - gradle jib - # If we are on the head of dev or master, also copy to branch-latest.jar. 'aws s3 cp' will overwrite by default. 
- - name: Copy to branch-latest.jar on S3 - if: github.ref == 'refs/heads/dev' || github.ref == 'refs/heads/master' - run: | - VERSION=$(gradle -q printVersion | head -n1) - BRANCH=${GITHUB_REF#refs/heads/} - echo VERSION is $VERSION, BRANCH is $BRANCH - aws s3 cp --no-progress --region eu-west-1 s3://r5-builds/${VERSION}.jar s3://r5-builds/${BRANCH}-latest.jar diff --git a/README.md b/README.md index 873593d76..6cb8b12d1 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ We refer to the routing method as "realistic" because it works by planning many We say "Real-world and Reimagined" networks because R5's networks are built from widely available open OSM and GTFS data describing baseline transportation systems, but R5 includes a system for applying light-weight patches to those networks for immediate, interactive scenario comparison. -R5 is a core component of [Conveyal Analysis](https://www.conveyal.com/analysis), which allows users to create transportation scenarios and evaluate them in terms of cumulative opportunities accessibility indicators. +R5 is a core component of [Conveyal Analysis](https://www.conveyal.com/learn), which allows users to create transportation scenarios and evaluate them in terms of cumulative opportunities accessibility indicators. See the [methodology section](https://docs.conveyal.com/analysis/methodology) of the [Conveyal user manual](https://docs.conveyal.com/) for more information. **Please note** that the Conveyal team does not provide technical support for third-party deployments of its analysis platform. We provide paid subscriptions to a cloud-based deployment of this system, which performs these complex calculations hundreds of times faster using a compute cluster. This project is open source primarily to ensure transparency and reproducibility in public planning and decision making processes, and in hopes that it may help researchers, students, and potential collaborators to understand and build upon our methodology. diff --git a/analysis.properties.template b/analysis.properties.template index a98f52ebe..f5b5c2b5f 100644 --- a/analysis.properties.template +++ b/analysis.properties.template @@ -1,7 +1,10 @@ # This file contains the configuration options for Conveyal Analysis Backend -# The host and port of the remote Mongo server (if any). Comment out for local Mongo instance. +immediate-shutdown=false + +# The host and port of the Mongo server. # database-uri=mongodb://127.0.0.1:27017 +database-uri=mongodb://localhost # The name of the database in the Mongo instance. database-name=analysis @@ -11,12 +14,6 @@ database-name=analysis # In staging this should be the underlying S3 URL so files are not cached and you see the most recent deployment. frontend-url=https://localhost -# S3 buckets where Analysis inputs and results are stored. -bundle-bucket=analysis-staging-bundles -grid-bucket=analysis-staging-grids -results-bucket=analysis-staging-results -resources-bucket=analysis-staging-resources - # The S3 bucket where we can find tiles of the entire US census, built with Conveyal seamless-census. 
seamless-census-bucket=lodes-data-2014 seamless-census-region=us-east-1 diff --git a/build.gradle b/build.gradle index a08f004f3..1a676be70 100644 --- a/build.gradle +++ b/build.gradle @@ -3,15 +3,12 @@ plugins { id 'com.github.johnrengelman.shadow' version '6.0.0' id 'maven-publish' id 'com.palantir.git-version' version '0.12.3' - id 'com.google.cloud.tools.jib' version '2.6.0' } group = 'com.conveyal' // set version to `git describe --tags --always --first-parent`, plus '.dirty' if local changes are present. version gitVersion() -jib.to.image = 'ghcr.io/conveyal/r5:' + version - java { sourceCompatibility = JavaVersion.VERSION_11 targetCompatibility = JavaVersion.VERSION_11 @@ -20,12 +17,12 @@ java { jar { // For Java 11 Modules, specify a module name. // Do not create module-info.java until all our dependencies specify a module name. - // Main-Class R5Main will start a worker, BackendMain must be specified on JVM command line to start backend. + // Main-Class BackendMain will start a local backend. // Build-Jdk-Spec mimics a Maven manifest entry that helps us automatically install the right JVM. // Implementation-X attributes are needed for ImageIO (used by Geotools) to initialize in some environments. manifest { attributes 'Automatic-Module-Name': 'com.conveyal.analysis', - 'Main-Class': 'com.conveyal.r5.R5Main', + 'Main-Class': 'com.conveyal.analysis.BackendMain', 'Build-Jdk-Spec': targetCompatibility.getMajorVersion(), 'Implementation-Title': 'Conveyal Analysis Backend', 'Implementation-Vendor': 'Conveyal LLC', @@ -42,14 +39,15 @@ test { } // `gradle publish` will upload both shadow and simple JAR to Github Packages +// On GH Actions, GITHUB_ACTOR env variable is supplied without specifying it in action yml. publishing { repositories { maven { name = "GitHubPackages" url = uri("https://maven.pkg.github.com/conveyal/r5") credentials { - username = System.getenv("GPR_USERNAME") - password = System.getenv("GPR_TOKEN") + username = System.getenv("GITHUB_ACTOR") + password = System.getenv("GITHUB_TOKEN") } } } @@ -74,21 +72,14 @@ task copyDependencies(type: Copy) { into 'dependencies' } -// Run R5 as an analysis backend with all dependencies on the classpath, without building a shadowJar. +// Run R5 as a local analysis backend with all dependencies on the classpath, without building a shadowJar. task runBackend (type: JavaExec) { dependsOn(build) classpath(sourceSets.main.runtimeClasspath) main("com.conveyal.analysis.BackendMain") } -// Run R5 as an analysis worker with all dependencies on the classpath, without building a shadowJar. -task runWorker (type: JavaExec) { - dependsOn(build) - classpath(sourceSets.main.runtimeClasspath) - main("com.conveyal.r5.R5Main") -} - -// Start up the analysis backend from a shaded JAR and ask it to shut down immediately. +// Start up an analysis local backend from a shaded JAR and ask it to shut down immediately. // This is used to check in the automated build that the JAR is usable before we keep it. // Create a configuration properties file (by copying the template) before running this task. 
task testShadowJarRunnable(type: JavaExec) { @@ -103,7 +94,9 @@ task testShadowJarRunnable(type: JavaExec) { task createVersionProperties(dependsOn: processResources) { doLast { def details = versionDetails() - new File(buildDir, "resources/main/version.properties").withWriter { w -> + def dir = new File(buildDir, "resources/main/com/conveyal/r5/") + mkdir(dir) + new File(dir, "version.properties").withWriter { w -> Properties p = new Properties() p['version'] = project.version.toString() p['commit'] = details.gitHashFull @@ -111,7 +104,7 @@ task createVersionProperties(dependsOn: processResources) { p.store w, null } // Also make a simple one-line version.txt for scripts to use - new File(buildDir, "version.txt").text = "$version" + new File(dir, "version.txt").text = "$version" } } @@ -172,7 +165,7 @@ dependencies { // Provides the EPSG coordinate reference system catalog as an HSQL database. implementation group: 'org.geotools', version: geotoolsVersion, name: 'gt-epsg-hsql' - compile 'com.wdtinc:mapbox-vector-tile:3.1.0' + implementation 'com.wdtinc:mapbox-vector-tile:3.1.0' // Legacy JTS with com.vividsolutions package name. Newer Geotools compatible with Java 11 uses a newer version of // JTS with the org.locationtech package name. But our MapDB format includes serialized JTS geometries with the @@ -196,7 +189,7 @@ dependencies { implementation 'com.google.guava:guava:28.2-jre' // Java 8 rewrite of the Guava cache with asynchronous LoadingCaches. We don't currently use the async - //capabilities, but Caffeine's LoadingCache syntax is more modern idiomatic Java than Guava's. + // capabilities, but Caffeine's LoadingCache syntax is more modern idiomatic Java than Guava's. implementation 'com.github.ben-manes.caffeine:caffeine:2.8.1' implementation ('org.apache.httpcomponents:httpclient:4.5.6') { @@ -205,6 +198,7 @@ dependencies { } // Persistent storage of files / objects on Amazon S3. + // Now used only for Seamless Census TODO eliminate this final AWS dependency implementation 'com.amazonaws:aws-java-sdk-s3:1.11.341' // Old version of GraphQL-Java used by legacy gtfs-api embedded in analysis-backend. 
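Note on the createVersionProperties change above: because version.properties is now written under resources/main/com/conveyal/r5/, it lands on the classpath next to classes in the com.conveyal.r5 package. A rough, hypothetical sketch of how such a file could be read at runtime follows (class and field names here are illustrative only; the repository's actual SoftwareVersion implementation may differ):

package com.conveyal.r5;

import java.io.InputStream;
import java.util.Properties;

public class VersionPropertiesSketch {
    public final String version;
    public final String commit;

    public VersionPropertiesSketch () {
        Properties props = new Properties();
        // version.properties is written by the createVersionProperties Gradle task into
        // build/resources/main/com/conveyal/r5/, so it resolves relative to this package.
        try (InputStream is = VersionPropertiesSketch.class.getResourceAsStream("version.properties")) {
            props.load(is);
        } catch (Exception e) {
            throw new RuntimeException("Could not load version.properties from classpath.", e);
        }
        version = props.getProperty("version", "UNKNOWN");
        commit = props.getProperty("commit", "UNKNOWN");
    }
}
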
diff --git a/src/main/java/com/conveyal/analysis/AnalysisServerException.java b/src/main/java/com/conveyal/analysis/AnalysisServerException.java index 884af851e..de160c667 100644 --- a/src/main/java/com/conveyal/analysis/AnalysisServerException.java +++ b/src/main/java/com/conveyal/analysis/AnalysisServerException.java @@ -11,10 +11,10 @@ public class AnalysisServerException extends RuntimeException { private static final Logger LOG = LoggerFactory.getLogger(AnalysisServerException.class); public int httpCode; - public TYPE type; + public Type type; public String message; - public enum TYPE { + public enum Type { BAD_REQUEST, BROKER, FILE_UPLOAD, @@ -23,25 +23,26 @@ public enum TYPE { JSON_PARSING, NONCE, NOT_FOUND, + RUNTIME, UNAUTHORIZED, UNKNOWN; } public static AnalysisServerException badRequest(String message) { - return new AnalysisServerException(TYPE.BAD_REQUEST, message, 400); + return new AnalysisServerException(Type.BAD_REQUEST, message, 400); } public static AnalysisServerException fileUpload(String message) { - return new AnalysisServerException(TYPE.FILE_UPLOAD, message, 400); + return new AnalysisServerException(Type.FILE_UPLOAD, message, 400); } public static AnalysisServerException forbidden(String message) { - return new AnalysisServerException(TYPE.FORBIDDEN, message, 403); + return new AnalysisServerException(Type.FORBIDDEN, message, 403); } public static AnalysisServerException graphQL(List errors) { return new AnalysisServerException( - TYPE.GRAPHQL, + Type.GRAPHQL, errors .stream() .map(e -> e.getMessage()) @@ -51,36 +52,36 @@ public static AnalysisServerException graphQL(List errors) { } public static AnalysisServerException nonce() { - return new AnalysisServerException(TYPE.NONCE, "The data you attempted to change is out of date and could not be " + + return new AnalysisServerException(Type.NONCE, "The data you attempted to change is out of date and could not be " + "updated. 
This project may be open by another user or in another browser tab.", 400); } public static AnalysisServerException notFound(String message) { - return new AnalysisServerException(TYPE.NOT_FOUND, message, 404); + return new AnalysisServerException(Type.NOT_FOUND, message, 404); } public static AnalysisServerException unauthorized(String message) { - return new AnalysisServerException(TYPE.UNAUTHORIZED, message, 401); + return new AnalysisServerException(Type.UNAUTHORIZED, message, 401); } public static AnalysisServerException unknown(Exception e) { - return new AnalysisServerException(TYPE.UNKNOWN, ExceptionUtils.asString(e), 400); + return new AnalysisServerException(Type.UNKNOWN, ExceptionUtils.stackTraceString(e), 400); } public static AnalysisServerException unknown(String message) { - return new AnalysisServerException(TYPE.UNKNOWN, message, 400); + return new AnalysisServerException(Type.UNKNOWN, message, 400); } public AnalysisServerException(Exception e, String message) { this(message); - LOG.error(ExceptionUtils.asString(e)); + LOG.error(ExceptionUtils.stackTraceString(e)); } public AnalysisServerException(String message) { - this(TYPE.UNKNOWN, message, 400); + this(Type.UNKNOWN, message, 400); } - public AnalysisServerException(AnalysisServerException.TYPE t, String m, int c) { + public AnalysisServerException(Type t, String m, int c) { httpCode = c; type = t; message = m; diff --git a/src/main/java/com/conveyal/analysis/BackendConfig.java b/src/main/java/com/conveyal/analysis/BackendConfig.java index 649f20781..65eeb6da8 100644 --- a/src/main/java/com/conveyal/analysis/BackendConfig.java +++ b/src/main/java/com/conveyal/analysis/BackendConfig.java @@ -4,173 +4,94 @@ import com.conveyal.analysis.components.LocalWorkerLauncher; import com.conveyal.analysis.components.TaskScheduler; import com.conveyal.analysis.components.broker.Broker; -import com.conveyal.analysis.controllers.AggregationAreaController; -import com.conveyal.analysis.controllers.BundleController; import com.conveyal.analysis.controllers.OpportunityDatasetController; -import com.conveyal.analysis.controllers.RegionalAnalysisController; import com.conveyal.analysis.grids.SeamlessCensusGridExtractor; import com.conveyal.analysis.persistence.AnalysisDB; -import com.conveyal.gtfs.GTFSCache; -import com.conveyal.r5.streets.OSMCache; +import com.conveyal.file.LocalFileStorage; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.FileInputStream; -import java.util.HashSet; -import java.util.Map; import java.util.Properties; -import java.util.Set; -/** - * Represents config information for the Analysis backend server. - */ -public class BackendConfig implements +/** Loads config information for the Analysis backend server and exposes it to the Components and HttpControllers. 
*/ +public class BackendConfig extends ConfigBase implements TaskScheduler.Config, AnalysisDB.Config, Broker.Config, - BundleController.Config, - OSMCache.Config, - GTFSCache.Config, HttpApi.Config, - RegionalAnalysisController.Config, - AggregationAreaController.Config, - OpportunityDatasetController.Config, SeamlessCensusGridExtractor.Config, - LocalWorkerLauncher.Config + LocalWorkerLauncher.Config, + LocalFileStorage.Config { - private static final Logger LOG = LoggerFactory.getLogger(BackendConfig.class); - public static final String PROPERTIES_FILE_NAME = "analysis.properties"; + // CONSTANTS AND STATIC FIELDS - public static final String CONVEYAL_PROPERTY_PREFIX = "conveyal-"; + private static final Logger LOG = LoggerFactory.getLogger(BackendConfig.class); + public static final String BACKEND_CONFIG_FILE = "analysis.properties"; - protected final Properties config = new Properties(); - protected final Set missingKeys = new HashSet<>(); + // INSTANCE FIELDS - // If true, this backend instance should shut itself down after a period of inactivity. - public final boolean autoShutdown; - private final String bundleBucket; + private final boolean offline; private final String databaseName; private final String databaseUri; private final String localCacheDirectory; private final int serverPort; - private final boolean offline; private final String seamlessCensusBucket; private final String seamlessCensusRegion; - private final String gridBucket; - public final String resourcesBucket; // This appears to be unused - public final String resultsBucket; private final int lightThreads; private final int heavyThreads; private final int maxWorkers; - // If set to true, the backend will start up and immediately exit with a success code. // This is used for testing that automated builds and JAR packaging are producing a usable artifact. public final boolean immediateShutdown; - // For use in testing - setting this field will activate alternate code paths that cause intentional failures. private boolean testTaskRedelivery = false; - /** - * Construct a backend configuration object from the default non-testing properties file. - */ - public BackendConfig () { - this(PROPERTIES_FILE_NAME); - } - - /** - * Load configuration from the given properties file, overriding from environment variables and system properties. - * In the latter two sources, the keys may be in upper or lower case and use dashes, underscores, or dots as - * separators. The usual config file keys must be prefixed with "conveyal", e.g. CONVEYAL_HEAVY_THREADS=5 or - * conveyal.heavy.threads=5. - * Precedence of configuration sources is: system properties > environment variables > config file. - */ - public BackendConfig (String filename) { - try (FileInputStream is = new FileInputStream(filename)) { - config.load(is); - } catch (Exception e) { - String message = "Could not read config file " + filename; - LOG.error(message); - throw new RuntimeException(message, e); - } - - // Overwrite properties from config file with environment variables and system properties. - // This could also be done with the Properties constructor that specifies defaults, but by manually - // overwriting items we are able to log these potentially confusing changes to configuration. - setPropertiesFromMap(System.getenv(), "environment variable"); - setPropertiesFromMap(System.getProperties(), "system properties"); + // CONSTRUCTORS - // We intentionally don't supply any defaults here - any 'defaults' should be shipped in an example config file. 
- autoShutdown = Boolean.parseBoolean(getProperty("auto-shutdown", false)); - immediateShutdown = Boolean.parseBoolean(getProperty("immediate-shutdown", false)); - bundleBucket = getProperty("bundle-bucket", true); - databaseName = getProperty("database-name", true); - databaseUri = getProperty("database-uri", false); - localCacheDirectory = getProperty("local-cache", true); - serverPort = Integer.parseInt(getProperty("server-port", true)); - offline = Boolean.parseBoolean(getProperty("offline", true)); - seamlessCensusBucket = getProperty("seamless-census-bucket", true); - seamlessCensusRegion = getProperty("seamless-census-region", true); - gridBucket = getProperty("grid-bucket", true); - resourcesBucket = getProperty("resources-bucket", true); - resultsBucket = getProperty("results-bucket", true); - lightThreads = Integer.parseInt(getProperty("light-threads", true)); - heavyThreads = Integer.parseInt(getProperty("heavy-threads", true)); - maxWorkers = Integer.parseInt(getProperty("max-workers", true)); - if (!missingKeys.isEmpty()) { - LOG.error("You must provide these configuration properties: {}", String.join(", ", missingKeys)); - System.exit(1); - } + private BackendConfig (String filename) { + this(propsFromFile(filename)); } - /** - * This needs to work with both properties and environment variable conventions, so case and separators - * are normalized. Properties are Object-Object Maps so key and value are cast to String. - */ - private void setPropertiesFromMap (Map map, String sourceDescription) { - for (Map.Entry entry : map.entrySet()) { - // Normalize to String type, all lower case, all dash separators. - String key = ((String)entry.getKey()).toLowerCase().replaceAll("[\\._-]", "-"); - String value = ((String)entry.getValue()); - if (key.startsWith(CONVEYAL_PROPERTY_PREFIX)) { - // Strip off conveyal prefix to get the key that would be used in our config file. - key = key.substring(CONVEYAL_PROPERTY_PREFIX.length()); - String existingKey = config.getProperty(key); - if (existingKey != null) { - LOG.info("Overwriting existing config key {} to '{}' from {}.", key, value, sourceDescription); - } else { - LOG.info("Setting configuration key {} to '{}' from {}.", key, value, sourceDescription); - } - config.setProperty(key, value); - } - } + protected BackendConfig (Properties properties) { + super(properties); + // We intentionally don't supply any defaults here. + // Any 'defaults' should be shipped in an example config file. + immediateShutdown = boolProp("immediate-shutdown"); + databaseName = strProp("database-name"); + databaseUri = strProp("database-uri"); + localCacheDirectory = strProp("local-cache"); + serverPort = intProp("server-port"); + offline = boolProp("offline"); + seamlessCensusBucket = strProp("seamless-census-bucket"); + seamlessCensusRegion = strProp("seamless-census-region"); + lightThreads = intProp("light-threads"); + heavyThreads = intProp("heavy-threads"); + maxWorkers = intProp("max-workers"); + exitIfErrors(); } - private String getProperty (String key, boolean require) { - String value = config.getProperty(key); - if (require && value == null) { - LOG.error("Missing configuration option {}", key); - missingKeys.add(key); - } - return value; - } - - // Implementations of Component and HttpController Config interfaces + // INTERFACE IMPLEMENTATIONS + // Methods implementing Component and HttpController Config interfaces. // Note that one method can implement several Config interfaces at once. 
- @Override public int lightThreads () { return lightThreads; } - @Override public int heavyThreads () { return heavyThreads; } - @Override public String databaseUri () { return databaseUri; } - @Override public String databaseName () { return databaseName; } - @Override public String resultsBucket () { return resultsBucket; } - @Override public boolean testTaskRedelivery () { return testTaskRedelivery; } - @Override public String gridBucket () { return gridBucket; } - @Override public String seamlessCensusRegion () { return seamlessCensusRegion; } - @Override public String seamlessCensusBucket () { return seamlessCensusBucket; } - @Override public int serverPort () { return serverPort; } - @Override public String localCacheDirectory () { return localCacheDirectory;} - @Override public String bundleBucket () { return bundleBucket; } - @Override public boolean offline () { return offline; } - @Override public int maxWorkers () { return maxWorkers; } + @Override public int lightThreads() { return lightThreads; } + @Override public int heavyThreads() { return heavyThreads; } + @Override public String databaseUri() { return databaseUri; } + @Override public String databaseName() { return databaseName; } + @Override public String localCacheDirectory() { return localCacheDirectory;} + @Override public boolean testTaskRedelivery() { return testTaskRedelivery; } + @Override public String seamlessCensusRegion() { return seamlessCensusRegion; } + @Override public String seamlessCensusBucket() { return seamlessCensusBucket; } + @Override public int serverPort() { return serverPort; } + @Override public boolean offline() { return offline; } + @Override public int maxWorkers() { return maxWorkers; } + + // STATIC FACTORY METHODS + // Always use these to construct BackendConfig objects for readability. + + public static BackendConfig fromDefaultFile () { + return new BackendConfig(BACKEND_CONFIG_FILE); + } } diff --git a/src/main/java/com/conveyal/analysis/BackendMain.java b/src/main/java/com/conveyal/analysis/BackendMain.java index 4a7b4fe58..b3729682d 100644 --- a/src/main/java/com/conveyal/analysis/BackendMain.java +++ b/src/main/java/com/conveyal/analysis/BackendMain.java @@ -1,64 +1,55 @@ package com.conveyal.analysis; -import com.conveyal.analysis.components.Components; -import com.conveyal.analysis.components.LocalComponents; -import com.conveyal.analysis.grids.SeamlessCensusGridExtractor; +import com.conveyal.analysis.components.BackendComponents; +import com.conveyal.analysis.components.LocalBackendComponents; import com.conveyal.analysis.persistence.Persistence; import com.conveyal.gtfs.api.ApiMain; +import com.conveyal.r5.SoftwareVersion; import com.conveyal.r5.analyst.PointSetCache; import com.conveyal.r5.analyst.WorkerCategory; +import com.conveyal.r5.analyst.progress.Task; +import com.conveyal.r5.analyst.progress.TaskAction; import com.conveyal.r5.util.ExceptionUtils; -import org.joda.time.DateTime; -import org.json.simple.JSONObject; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import spark.Request; -import spark.Response; - -import java.net.InetAddress; -import java.net.NetworkInterface; -import java.net.SocketException; -import java.util.Enumeration; /** - * This is the main entry point for starting a Conveyal Analysis server. + * This is the main entry point for starting a local (non-cloud) Conveyal Analysis server. */ public abstract class BackendMain { private static final Logger LOG = LoggerFactory.getLogger(BackendMain.class); - /** This backend server's IP address. 
This is passed to the workers so they know how to reach the backend. */ - private static final InetAddress privateServerAddress = discoverPrivateInetAddress(); - public static void main (String... args) { - final Components components = new LocalComponents(); + final BackendComponents components = new LocalBackendComponents(); startServer(components); } - protected static void startServer (Components components, Thread... postStartupThreads) { + protected static void startServer (BackendComponents components, TaskAction... postStartupTasks) { // We have several non-daemon background thread pools which will keep the JVM alive if the main thread crashes. // If initialization fails, we need to catch the exception or error and force JVM shutdown. try { - startServerInternal(components, postStartupThreads); + startServerInternal(components, postStartupTasks); } catch (Throwable throwable) { - LOG.error("Exception while starting up backend, shutting down JVM.\n{}", ExceptionUtils.asString(throwable)); + LOG.error("Exception while starting up backend, shutting down JVM.\n{}", ExceptionUtils.stackTraceString(throwable)); System.exit(1); } } - private static void startServerInternal (Components components, Thread... postStartupThreads) { - LOG.info("Starting Conveyal analysis backend, the time is now {}", DateTime.now()); - LOG.info("Backend version is: {}", BackendVersion.instance.version); + private static void startServerInternal (BackendComponents components, TaskAction... postStartupTasks) { + LOG.info("Starting Conveyal analysis backend version {}", SoftwareVersion.instance.version); LOG.info("Connecting to database..."); // Persistence, the census extractor, and ApiMain are initialized statically, without creating instances, - // passing in non-static components we've already created. TODO migrate to non-static Components. + // passing in non-static components we've already created. + // TODO migrate to non-static Components. // TODO remove the static ApiMain abstraction layer. We do not use it anywhere but in handling GraphQL queries. + // TODO we could move this to something like BackendComponents.initialize() Persistence.initializeStatically(components.config); - SeamlessCensusGridExtractor.configureStatically(components.config); ApiMain.initialize(components.gtfsCache); - PointSetCache.initializeStatically(components.fileStorage, components.config.gridBucket()); + PointSetCache.initializeStatically(components.fileStorage); + // TODO handle this via components without explicit "if (offline)" if (components.config.offline()) { LOG.info("Running in OFFLINE mode."); LOG.info("Pre-starting local cluster of Analysis workers..."); @@ -66,9 +57,10 @@ private static void startServerInternal (Components components, Thread... postSt } LOG.info("Conveyal Analysis server is ready."); - // TODO replace postStartupThreads with something like components.taskScheduler.enqueueHeavyTask(); - for (Thread thread : postStartupThreads) { - thread.start(); + for (TaskAction taskAction : postStartupTasks) { + components.taskScheduler.enqueue( + Task.create(Runnable.class.getSimpleName()).setHeavy(true).forUser("SYSTEM").withAction(taskAction) + ); } if (components.config.immediateShutdown) { @@ -77,67 +69,4 @@ private static void startServerInternal (Components components, Thread... 
postSt } } - public static void respondToException(Exception e, Request request, Response response, String type, String message, int code) { - String stack = ExceptionUtils.asString(e); - - LOG.error("{} {} -> {} {} by {} of {}", type, message, request.requestMethod(), request.pathInfo(), request.attribute("email"), request.attribute("accessGroup")); - LOG.error(stack); - - JSONObject body = new JSONObject(); - body.put("type", type); - body.put("message", message); - body.put("stackTrace", stack); - - response.status(code); - response.type("application/json"); - response.body(body.toJSONString()); - } - - public static String getServerIpAddress() { - return privateServerAddress.getHostAddress(); - } - - // InetAddress.getLocalHost() fails on EC2 because the local hostname is not in the hosts file. - // Anyway we don't want the default, we want to search for a stable, private interface internal to the cluster, - // rather than the public one which may be reassigned during startup. - // TODO move this to an InternalHttpApi Component. - private static InetAddress discoverPrivateInetAddress() { - InetAddress privateAddress = null; - Enumeration networkInterfaces = null; - try { - networkInterfaces = NetworkInterface.getNetworkInterfaces(); - while (networkInterfaces.hasMoreElements()) { - NetworkInterface networkInterface = networkInterfaces.nextElement(); - try { - if (!networkInterface.isUp() || networkInterface.isLoopback()) { - continue; - } - } catch (SocketException e) { - continue; - } - Enumeration addressEnumeration = networkInterface.getInetAddresses(); - while (addressEnumeration.hasMoreElements()) { - InetAddress address = addressEnumeration.nextElement(); - if (address.isAnyLocalAddress() || address.isLoopbackAddress() || address.isMulticastAddress()) { - continue; - } - if (address.isSiteLocalAddress()) { - privateAddress = address; - break; - } - } - } - } catch (SocketException e) { - privateAddress = null; - } - if (privateAddress == null) { - LOG.error("Could not determine private server IP address. Workers will not be able to contact it, making regional analysis impossible."); - // privateAddress = InetAddress.getLoopbackAddress(); - // Leave the private address null to fail fast. - } else { - LOG.info("Private server IP address (which will be contacted by workers) is {}", privateAddress); - } - return privateAddress; - } - } diff --git a/src/main/java/com/conveyal/analysis/ConfigBase.java b/src/main/java/com/conveyal/analysis/ConfigBase.java new file mode 100644 index 000000000..da64e3325 --- /dev/null +++ b/src/main/java/com/conveyal/analysis/ConfigBase.java @@ -0,0 +1,138 @@ +package com.conveyal.analysis; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.FileReader; +import java.io.Reader; +import java.util.HashSet; +import java.util.Map; +import java.util.Properties; +import java.util.Set; + +/** + * Shared functionality for classes that load properties containing configuration information and expose these options + * via the Config interfaces of Components and HttpControllers. Subclasses are defined for Worker and Backend config. + * + * Some validation may be performed here, but any interpretation or conditional logic should be provided in Components + * themselves, or possibly in alternate Components implementations. + * + * Example backend config files are shipped in the repo and we always supply a machine-generated config to the workers, + * so it's easy to see an exhaustive list of all parameters. 
All configuration parameters are therefore required to
+ * avoid any confusion due to merging layers of defaults.
+ */
+public class ConfigBase {
+
+    private static final Logger LOG = LoggerFactory.getLogger(ConfigBase.class);
+
+    public static final String CONVEYAL_PROPERTY_PREFIX = "conveyal-";
+
+    // All access to these should be through the *Prop methods.
+    private final Properties properties;
+
+    protected final Set<String> keysWithErrors = new HashSet<>();
+
+    /**
+     * Prepare to load config from the given properties, overriding from environment variables and system properties.
+     * In the latter two sources, the keys may be in upper or lower case and use dashes, underscores, or dots as
+     * separators. System properties can be set on the JVM command line with -D options. The usual config file keys
+     * must be prefixed with "conveyal", e.g. CONVEYAL_HEAVY_THREADS=5 or java -Dconveyal.heavy.threads=5.
+     * Precedence of configuration sources is: system properties > environment variables > config file.
+     */
+    protected ConfigBase (Properties properties) {
+        this.properties = properties;
+        // Overwrite properties from config file with environment variables and system properties.
+        // This could also be done with the Properties constructor that specifies defaults, but by manually
+        // overwriting items we are able to log these potentially confusing changes to configuration.
+        setPropertiesFromMap(System.getenv(), "environment variable");
+        setPropertiesFromMap(System.getProperties(), "system properties");
+    }
+
+    /** Static convenience method to uniformly load files into properties and catch errors. */
+    protected static Properties propsFromFile (String filename) {
+        try (Reader propsReader = new FileReader(filename)) {
+            Properties properties = new Properties();
+            properties.load(propsReader);
+            return properties;
+        } catch (Exception e) {
+            throw new RuntimeException("Could not load configuration properties.", e);
+        }
+    }
+
+    // Always use the following *Prop methods to read properties. This will catch and log missing keys or parse
+    // exceptions, allowing config loading to continue and reporting as many problems as possible at once.
+
+    // Catches and records missing values,
+    // so methods that wrap this and parse into non-String types can just ignore null values.
+    protected String strProp (String key) {
+        String value = properties.getProperty(key);
+        if (value == null) {
+            LOG.error("Missing configuration option {}", key);
+            keysWithErrors.add(key);
+        }
+        return value;
+    }
+
+    protected int intProp (String key) {
+        String val = strProp(key);
+        if (val != null) {
+            try {
+                return Integer.parseInt(val);
+            } catch (NumberFormatException nfe) {
+                LOG.error("Value of configuration option '{}' could not be parsed as an integer: {}", key, val);
+                keysWithErrors.add(key);
+            }
+        }
+        return 0;
+    }
+
+    protected boolean boolProp (String key) {
+        String val = strProp(key);
+        if (val != null) {
+            // Boolean.parseBoolean will return false for any string other than "true".
+            // We want to be more strict.
+            if ("true".equalsIgnoreCase(val) || "yes".equalsIgnoreCase(val)) {
+                return true;
+            } else if ("false".equalsIgnoreCase(val) || "no".equalsIgnoreCase(val)) {
+                return false;
+            } else {
+                LOG.error("Value of configuration option '{}' could not be parsed as a boolean: {}", key, val);
+                keysWithErrors.add(key);
+            }
+        }
+        return false;
+    }
+
+    /** Call this after reading all properties to enforce the presence of all configuration options.
*/ + protected void exitIfErrors () { + if (!keysWithErrors.isEmpty()) { + LOG.error("You must provide these configuration properties: {}", String.join(", ", keysWithErrors)); + System.exit(1); + } + } + + /** + * Overwrite configuration options supplied in the config file with environment variables and system properties + * (e.g. supplied on the JVM command line). Case and separators are normalized to conform to both properties and + * environment variable conventions. Properties are Object-Object Maps so key and value are cast to String. + */ + private void setPropertiesFromMap (Map map, String sourceDescription) { + for (Map.Entry entry : map.entrySet()) { + // Normalize to String type, all lower case, all dash separators. + String key = ((String)entry.getKey()).toLowerCase().replaceAll("[\\._-]", "-"); + String value = ((String)entry.getValue()); + if (key.startsWith(CONVEYAL_PROPERTY_PREFIX)) { + // Strip off conveyal prefix to get the key that would be used in our config file. + key = key.substring(CONVEYAL_PROPERTY_PREFIX.length()); + String existingKey = properties.getProperty(key); + if (existingKey != null) { + LOG.info("Overwriting existing config key {} to '{}' from {}.", key, value, sourceDescription); + } else { + LOG.info("Setting configuration key {} to '{}' from {}.", key, value, sourceDescription); + } + properties.setProperty(key, value); + } + } + } + +} diff --git a/src/main/java/com/conveyal/analysis/LocalWorkerConfig.java b/src/main/java/com/conveyal/analysis/LocalWorkerConfig.java new file mode 100644 index 000000000..0ea2f6b18 --- /dev/null +++ b/src/main/java/com/conveyal/analysis/LocalWorkerConfig.java @@ -0,0 +1,38 @@ +package com.conveyal.analysis; + +import com.conveyal.file.LocalFileStorage; + +import java.util.Properties; + +/** + * Note that some local config is not supplied by config files, + * e.g. the initial graph is hard-wired to null in local operation, and files are always served on the same port. + */ +public class LocalWorkerConfig extends WorkerConfig implements LocalFileStorage.Config { + + private final String cacheDirectory; + + private LocalWorkerConfig (Properties props) { + super(props); + // Actually this is not used directly, backend storage component is passed in to local worker constructor. + cacheDirectory = strProp("cache-dir"); + exitIfErrors(); + } + + // INTERFACE IMPLEMENTATIONS + // Methods implementing Component and HttpController Config interfaces. + // Note that one method can implement several Config interfaces at once. + + // FIXME align with actual local file serving port, somehow connected to API? + @Override public int serverPort() { return -1; } + @Override public String localCacheDirectory () { return cacheDirectory; } + @Override public String initialGraphId () { return null; } // + + // STATIC FACTORY METHODS + // Use these to construct WorkerConfig objects for readability. 
+ + public static LocalWorkerConfig fromProperties (Properties properties) { + return new LocalWorkerConfig(properties); + } + +} diff --git a/src/main/java/com/conveyal/analysis/RegionalAnalysisStatus.java b/src/main/java/com/conveyal/analysis/RegionalAnalysisStatus.java deleted file mode 100644 index a3bff5ceb..000000000 --- a/src/main/java/com/conveyal/analysis/RegionalAnalysisStatus.java +++ /dev/null @@ -1,21 +0,0 @@ -package com.conveyal.analysis; - - -import com.conveyal.analysis.results.MultiOriginAssembler; - -import java.io.Serializable; - -/** - * This model object is sent to the UI serialized as JSON in order to report regional job progress. - */ -public final class RegionalAnalysisStatus implements Serializable { - public int total; - public int complete; - - public RegionalAnalysisStatus() { /* No-arg constructor for deserialization only. */ } - - public RegionalAnalysisStatus(MultiOriginAssembler assembler) { - total = assembler.nOriginsTotal; - complete = assembler.nComplete; - } -} diff --git a/src/main/java/com/conveyal/analysis/WorkerConfig.java b/src/main/java/com/conveyal/analysis/WorkerConfig.java new file mode 100644 index 000000000..6e187b18d --- /dev/null +++ b/src/main/java/com/conveyal/analysis/WorkerConfig.java @@ -0,0 +1,53 @@ +package com.conveyal.analysis; + +import com.conveyal.analysis.components.TaskScheduler; +import com.conveyal.r5.analyst.cluster.AnalysisWorker; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Properties; + +/** Loads config information for an analysis worker and exposes it to the worker's Components and HttpControllers. */ +public abstract class WorkerConfig extends ConfigBase implements TaskScheduler.Config, AnalysisWorker.Config { + + private static final Logger LOG = LoggerFactory.getLogger(ConfigBase.class); + + // INSTANCE FIELDS + + private final String brokerAddress; + private final String brokerPort; + private final int lightThreads; + private final int heavyThreads; + private final boolean testTaskRedelivery; + private final boolean listenForSinglePoint; + + // CONSTRUCTORS + + protected WorkerConfig (Properties props) { + super(props); + brokerAddress = strProp("broker-address"); + brokerPort = strProp("broker-port"); + { + // Should we supply these in properties, or should this be inferred from CPU cores elsewhere? + int availableProcessors = Runtime.getRuntime().availableProcessors(); + LOG.info("Java reports the number of available processors is: {}", availableProcessors); + lightThreads = availableProcessors; + heavyThreads = availableProcessors; + } + testTaskRedelivery = boolProp("test-task-redelivery"); + listenForSinglePoint = boolProp("listen-for-single-point"); + // No call to exitIfErrors() here, that should be done in concrete subclasses. + } + + // INTERFACE IMPLEMENTATIONS + // Methods implementing Component and HttpController Config interfaces. + // Note that one method can implement several Config interfaces at once. 
+ + @Override public String brokerAddress() { return brokerAddress; } + @Override public String brokerPort() { return brokerPort; } + @Override public int lightThreads () { return lightThreads; } + @Override public int heavyThreads () { return heavyThreads; } + @Override public boolean testTaskRedelivery() { return testTaskRedelivery; } + @Override public boolean listenForSinglePoint() { return listenForSinglePoint; } + +} diff --git a/src/main/java/com/conveyal/analysis/components/Components.java b/src/main/java/com/conveyal/analysis/components/BackendComponents.java similarity index 50% rename from src/main/java/com/conveyal/analysis/components/Components.java rename to src/main/java/com/conveyal/analysis/components/BackendComponents.java index a82296773..197f46b22 100644 --- a/src/main/java/com/conveyal/analysis/components/Components.java +++ b/src/main/java/com/conveyal/analysis/components/BackendComponents.java @@ -3,13 +3,30 @@ import com.conveyal.analysis.BackendConfig; import com.conveyal.analysis.components.broker.Broker; import com.conveyal.analysis.components.eventbus.EventBus; +import com.conveyal.analysis.controllers.AggregationAreaController; +import com.conveyal.analysis.controllers.BrokerController; +import com.conveyal.analysis.controllers.BundleController; +import com.conveyal.analysis.controllers.FileStorageController; +import com.conveyal.analysis.controllers.GTFSGraphQLController; +import com.conveyal.analysis.controllers.GtfsTileController; +import com.conveyal.analysis.controllers.HttpController; +import com.conveyal.analysis.controllers.ModificationController; +import com.conveyal.analysis.controllers.OpportunityDatasetController; +import com.conveyal.analysis.controllers.ProjectController; +import com.conveyal.analysis.controllers.RegionalAnalysisController; +import com.conveyal.analysis.controllers.TimetableController; +import com.conveyal.analysis.controllers.UserActivityController; +import com.conveyal.analysis.grids.SeamlessCensusGridExtractor; import com.conveyal.analysis.persistence.AnalysisDB; import com.conveyal.file.FileStorage; import com.conveyal.gtfs.GTFSCache; import com.conveyal.r5.streets.OSMCache; +import com.google.common.collect.Lists; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.List; + /** * We are adopting a lightweight dependency injection approach, where we manually wire up our components instead of * relying on a framework. For our simple case the approach is almost identical but we have to manage the order in @@ -36,9 +53,9 @@ * generalization. We only expect to ever have 2-3 variants of this class. Just make a new subclass and complete * constructor for each. */ -public abstract class Components { +public abstract class BackendComponents { - private static final Logger LOG = LoggerFactory.getLogger(Components.class); + private static final Logger LOG = LoggerFactory.getLogger(BackendComponents.class); public BackendConfig config; /** Verification of user identity and permissions. */ @@ -55,6 +72,38 @@ public abstract class Components { public AnalysisDB database; public HttpApi httpApi; public TaskScheduler taskScheduler; + public SeamlessCensusGridExtractor censusExtractor; public EventBus eventBus; + /** + * Create the standard list of HttpControllers used in local operation. This BackendComponents instance + * should already be initialized with all components except the HttpApi. 
+ * We pass these controllers into the HttpApi (rather than constructing them in the HttpApi constructor) to allow + * injecting custom controllers in other deployment environments. This also avoids bulk-passing the entire set + * of components into the HttpApi constructor, ensuring clear delineation of each component's dependencies. + */ + public List standardHttpControllers () { + return Lists.newArrayList( + // These handlers are at paths beginning with /api + // and therefore subject to authentication and authorization. + new ModificationController(), + new ProjectController(), + new GTFSGraphQLController(gtfsCache), + new BundleController(this), + new OpportunityDatasetController(fileStorage, taskScheduler, censusExtractor), + new RegionalAnalysisController(broker, fileStorage), + new AggregationAreaController(fileStorage), + new TimetableController(), + new FileStorageController(fileStorage, database), + // This broker controller registers at least one handler at URL paths beginning with /internal, which + // is exempted from authentication and authorization, but should be hidden from the world + // outside the cluster by the reverse proxy. Perhaps we should serve /internal on a separate + // port so they can't be accidentally exposed by the reverse proxy. It could even be a separate + // InternalHttpApi component with its own spark service, renaming this ExternalHttpApi. + new BrokerController(broker, eventBus), + new UserActivityController(taskScheduler), + new GtfsTileController(gtfsCache) + ); + } + } diff --git a/src/main/java/com/conveyal/analysis/components/Component.java b/src/main/java/com/conveyal/analysis/components/Component.java index 85b6a2d49..2f835f65d 100644 --- a/src/main/java/com/conveyal/analysis/components/Component.java +++ b/src/main/java/com/conveyal/analysis/components/Component.java @@ -8,6 +8,9 @@ * Each component should encapsulate a distinct, well-defined set of functionality. Different implementations of * components allow running locally or in other environments like AWS or potentially other cloud service providers. * Currently this is a marker interface with no methods, just to indicate the role of certain classes in the project. + * + * All Components should be threadsafe: they must not fail when concurrently used by multiple HTTP handler threads. 
*/ public interface Component { + } diff --git a/src/main/java/com/conveyal/analysis/components/HttpApi.java b/src/main/java/com/conveyal/analysis/components/HttpApi.java index bf8852337..a5315c728 100644 --- a/src/main/java/com/conveyal/analysis/components/HttpApi.java +++ b/src/main/java/com/conveyal/analysis/components/HttpApi.java @@ -1,18 +1,14 @@ package com.conveyal.analysis.components; import com.conveyal.analysis.AnalysisServerException; -import com.conveyal.analysis.BackendMain; -import com.conveyal.analysis.BackendVersion; +import com.conveyal.r5.SoftwareVersion; import com.conveyal.analysis.UserPermissions; -import com.conveyal.analysis.components.eventbus.Event; +import com.conveyal.analysis.components.eventbus.ErrorEvent; import com.conveyal.analysis.components.eventbus.EventBus; import com.conveyal.analysis.components.eventbus.HttpApiEvent; import com.conveyal.analysis.controllers.HttpController; import com.conveyal.analysis.util.JsonUtil; import com.conveyal.file.FileStorage; -import com.conveyal.file.FileStorageFormat; -import com.conveyal.file.FileStorageKey; -import com.conveyal.file.FileUtils; import org.apache.commons.fileupload.FileUploadException; import org.json.simple.JSONObject; import org.slf4j.Logger; @@ -20,12 +16,15 @@ import spark.Request; import spark.Response; -import java.io.File; import java.io.IOException; import java.time.Duration; import java.time.Instant; import java.util.List; +import static com.conveyal.analysis.AnalysisServerException.Type.BAD_REQUEST; +import static com.conveyal.analysis.AnalysisServerException.Type.RUNTIME; +import static com.conveyal.analysis.AnalysisServerException.Type.UNKNOWN; + /** * This Component is a web server that serves up our HTTP API endpoints, both to the UI and to the workers. * It must be supplied with a list of HttpController instances implementing the endpoints. @@ -41,7 +40,7 @@ public class HttpApi implements Component { public static final String USER_GROUP_ATTRIBUTE = "accessGroup"; public interface Config { - boolean offline (); + boolean offline (); // TODO remove this parameter, use different Components types instead int serverPort (); } @@ -114,9 +113,8 @@ private spark.Service configureSparkService () { // but may fail to record requests experiencing authentication problems. Instant requestStartTime = req.attribute(REQUEST_START_TIME_ATTRIBUTE); Duration elapsed = Duration.between(requestStartTime, Instant.now()); - Event event = new HttpApiEvent(req.requestMethod(), res.status(), req.pathInfo(), elapsed.toMillis()); - UserPermissions userPermissions = req.attribute(USER_PERMISSIONS_ATTRIBUTE); - eventBus.send(event.forUser(userPermissions)); + eventBus.send(new HttpApiEvent(req.requestMethod(), res.status(), req.pathInfo(), elapsed.toMillis()) + .forUser(req.attribute(USER_PERMISSIONS_ATTRIBUTE))); }); // Handle CORS preflight requests (which are OPTIONS requests). @@ -132,39 +130,17 @@ private spark.Service configureSparkService () { // Allow client to fetch information about the backend build version. sparkService.get( "/version", - (Request req, Response res) -> BackendVersion.instance, + (Request req, Response res) -> SoftwareVersion.instance, JsonUtil.objectMapper::writeValueAsString ); - // Expose all files in storage while in offline mode. - // Not done with static file serving because it automatically gzips our already gzipped files. 
- if (config.offline()) { - sparkService.get("/files/:bucket/*", (req, res) -> { - String filename = req.splat()[0]; - FileStorageKey key = new FileStorageKey(req.params("bucket"), filename); - File file = fileStorage.getFile(key); - FileStorageFormat format = FileStorageFormat.fromFilename(filename); - res.type(format.mimeType); - - // If the content-encoding is set to gzip, Spark automatically gzips the response. This mangles data - // that was already gzipped. Therefore, check if it's gzipped and pipe directly to the raw OutputStream. - res.header("Content-Encoding", "gzip"); - if (FileUtils.isGzip(file)) { - FileUtils.transferFromFileTo(file, res.raw().getOutputStream()); - return null; - } else { - return FileUtils.getInputStream(file); - } - }); - } - - // ============ + // Can we consolidate all these exception handlers and get rid of the hard-wired "BAD_REQUEST" parameters? sparkService.exception(AnalysisServerException.class, (e, request, response) -> { // Include a stack trace, except when the error is known to be about unauthenticated or unauthorized access, // in which case we don't want to leak information about the server to people scanning it for weaknesses. - if (e.type == AnalysisServerException.TYPE.UNAUTHORIZED || - e.type == AnalysisServerException.TYPE.FORBIDDEN + if (e.type == AnalysisServerException.Type.UNAUTHORIZED || + e.type == AnalysisServerException.Type.FORBIDDEN ){ JSONObject body = new JSONObject(); body.put("type", e.type.toString()); @@ -173,29 +149,46 @@ private spark.Service configureSparkService () { response.type("application/json"); response.body(body.toJSONString()); } else { - BackendMain.respondToException(e, request, response, e.type.name(), e.message, e.httpCode); + respondToException(e, request, response, e.type, e.message, e.httpCode); } }); sparkService.exception(IOException.class, (e, request, response) -> { - BackendMain.respondToException(e, request, response, "BAD_REQUEST", e.toString(), 400); + respondToException(e, request, response, BAD_REQUEST, e.toString(), 400); }); sparkService.exception(FileUploadException.class, (e, request, response) -> { - BackendMain.respondToException(e, request, response, "BAD_REQUEST", e.toString(), 400); + respondToException(e, request, response, BAD_REQUEST, e.toString(), 400); }); sparkService.exception(NullPointerException.class, (e, request, response) -> { - BackendMain.respondToException(e, request, response, "UNKNOWN", e.toString(), 400); + respondToException(e, request, response, UNKNOWN, e.toString(), 400); }); sparkService.exception(RuntimeException.class, (e, request, response) -> { - BackendMain.respondToException(e, request, response, "RUNTIME", e.toString(), 400); + respondToException(e, request, response, RUNTIME, e.toString(), 400); }); return sparkService; } + private void respondToException(Exception e, Request request, Response response, + AnalysisServerException.Type type, String message, int code) { + + // Stacktrace in ErrorEvent reused below to avoid repeatedly generating String of stacktrace. 
+ ErrorEvent errorEvent = new ErrorEvent(e); + eventBus.send(errorEvent.forUser(request.attribute(USER_PERMISSIONS_ATTRIBUTE))); + + JSONObject body = new JSONObject(); + body.put("type", type.toString()); + body.put("message", message); + body.put("stackTrace", errorEvent.stackTrace); + + response.status(code); + response.type("application/json"); + response.body(body.toJSONString()); + } + // Maybe this should be done or called with a JVM shutdown hook public void shutDown () { sparkService.stop(); diff --git a/src/main/java/com/conveyal/analysis/components/LocalBackendComponents.java b/src/main/java/com/conveyal/analysis/components/LocalBackendComponents.java new file mode 100644 index 000000000..4de8e3098 --- /dev/null +++ b/src/main/java/com/conveyal/analysis/components/LocalBackendComponents.java @@ -0,0 +1,48 @@ +package com.conveyal.analysis.components; + +import com.conveyal.analysis.BackendConfig; +import com.conveyal.analysis.components.broker.Broker; +import com.conveyal.analysis.components.eventbus.ErrorLogger; +import com.conveyal.analysis.components.eventbus.EventBus; +import com.conveyal.analysis.controllers.HttpController; +import com.conveyal.analysis.controllers.LocalFilesController; +import com.conveyal.analysis.grids.SeamlessCensusGridExtractor; +import com.conveyal.analysis.persistence.AnalysisDB; +import com.conveyal.file.LocalFileStorage; +import com.conveyal.gtfs.GTFSCache; +import com.conveyal.r5.streets.OSMCache; + +import java.util.List; + +/** + * Wires up the components for a local backend instance (as opposed to a cloud-hosted backend instance). + * This establishes the implementations and dependencies between them, and supplies configuration. + * No conditional logic should be present here. + * Differences in implementation or configuration are handled by the Components themselves. + */ +public class LocalBackendComponents extends BackendComponents { + + public LocalBackendComponents () { + config = BackendConfig.fromDefaultFile(); + taskScheduler = new TaskScheduler(config); + fileStorage = new LocalFileStorage(config); + gtfsCache = new GTFSCache(fileStorage); + osmCache = new OSMCache(fileStorage); + // New (October 2019) DB layer, this should progressively replace the Persistence class + database = new AnalysisDB(config); + eventBus = new EventBus(taskScheduler); + authentication = new LocalAuthentication(); + // TODO add nested LocalWorkerComponents here, to reuse some components, and pass it into the LocalWorkerLauncher? + workerLauncher = new LocalWorkerLauncher(config, fileStorage, gtfsCache, osmCache); + broker = new Broker(config, fileStorage, eventBus, workerLauncher); + censusExtractor = new SeamlessCensusGridExtractor(config); + // Instantiate the HttpControllers last, when all the components except the HttpApi are already created. 
+ List httpControllers = standardHttpControllers(); + httpControllers.add(new LocalFilesController(fileStorage)); + httpApi = new HttpApi(fileStorage, authentication, eventBus, config, httpControllers); + // compute = new LocalCompute(); + // persistence = persistence(local_Mongo) + eventBus.addHandlers(new ErrorLogger()); + } + +} diff --git a/src/main/java/com/conveyal/analysis/components/LocalComponents.java b/src/main/java/com/conveyal/analysis/components/LocalComponents.java deleted file mode 100644 index 90e73e8db..000000000 --- a/src/main/java/com/conveyal/analysis/components/LocalComponents.java +++ /dev/null @@ -1,84 +0,0 @@ -package com.conveyal.analysis.components; - -import com.conveyal.analysis.BackendConfig; -import com.conveyal.analysis.components.broker.Broker; -import com.conveyal.analysis.components.eventbus.EventBus; -import com.conveyal.analysis.controllers.AggregationAreaController; -import com.conveyal.analysis.controllers.BrokerController; -import com.conveyal.analysis.controllers.BundleController; -import com.conveyal.analysis.controllers.FileStorageController; -import com.conveyal.analysis.controllers.GTFSGraphQLController; -import com.conveyal.analysis.controllers.GtfsTileController; -import com.conveyal.analysis.controllers.HttpController; -import com.conveyal.analysis.controllers.ModificationController; -import com.conveyal.analysis.controllers.OpportunityDatasetController; -import com.conveyal.analysis.controllers.ProjectController; -import com.conveyal.analysis.controllers.RegionalAnalysisController; -import com.conveyal.analysis.controllers.TimetableController; -import com.conveyal.analysis.persistence.AnalysisDB; -import com.conveyal.file.LocalFileStorage; -import com.conveyal.gtfs.GTFSCache; -import com.conveyal.r5.streets.OSMCache; - -import java.util.Arrays; -import java.util.List; - -/** - * Wires up the components for a local backend instance (as opposed to a cloud cluster). - */ -public class LocalComponents extends Components { - - public LocalComponents () { - config = new BackendConfig(); - taskScheduler = new TaskScheduler(config); - fileStorage = new LocalFileStorage( - config.localCacheDirectory(), - String.format("http://localhost:%s/files", config.serverPort()) - ); - gtfsCache = new GTFSCache(fileStorage, config); - osmCache = new OSMCache(fileStorage, config); - // New (October 2019) DB layer, this should progressively replace the Persistence class - database = new AnalysisDB(config); - eventBus = new EventBus(taskScheduler); - authentication = new LocalAuthentication(); - workerLauncher = new LocalWorkerLauncher(config, fileStorage, gtfsCache, osmCache); - broker = new Broker(config, fileStorage, eventBus, workerLauncher); - // Instantiate the HttpControllers last, when all the components except the HttpApi are already created. - httpApi = new HttpApi(fileStorage, authentication, eventBus, config, standardHttpControllers(this)); - // compute = new LocalCompute(); - // persistence = persistence(local_Mongo) - } - - /** - * Create the standard list of HttpControllers used in local operation. - * The Components parameter should already be initialized with all components except the HttpApi. - * We pass these controllers into the HttpApi (rather than constructing them in the HttpApi constructor) to allow - * injecting custom controllers in other deployment environments. This also avoids bulk-passing the entire set - * of components into the HttpApi constructor, ensuring clear declaration of each component's dependencies. 
- * Such bulk-passing of components should only occur in this wiring-up code, not in component code. - */ - public static List standardHttpControllers (Components components) { - final List httpControllers = Arrays.asList( - // These handlers are at paths beginning with /api - // and therefore subject to authentication and authorization. - new ModificationController(), - new ProjectController(), - new GTFSGraphQLController(components.gtfsCache), - new BundleController(components), - new OpportunityDatasetController(components.fileStorage, components.taskScheduler, components.config), - new RegionalAnalysisController(components.broker, components.fileStorage, components.config), - new AggregationAreaController(components.fileStorage, components.config), - new TimetableController(), - new FileStorageController(components.fileStorage, components.database), - // This broker controller registers at least one handler at URL paths beginning with /internal, which - // is exempted from authentication and authorization, but should be hidden from the world - // outside the cluster by the reverse proxy. Perhaps we should serve /internal on a separate - // port so they can't be accidentally exposed by the reverse proxy. It could even be a separate - // InternalHttpApi component with its own spark service, renaming this ExternalHttpApi. - new BrokerController(components.broker, components.eventBus), - new GtfsTileController(components.gtfsCache) - ); - return httpControllers; - } - -} diff --git a/src/main/java/com/conveyal/analysis/components/LocalWorkerComponents.java b/src/main/java/com/conveyal/analysis/components/LocalWorkerComponents.java new file mode 100644 index 000000000..27148f097 --- /dev/null +++ b/src/main/java/com/conveyal/analysis/components/LocalWorkerComponents.java @@ -0,0 +1,32 @@ +package com.conveyal.analysis.components; + +import com.conveyal.analysis.WorkerConfig; +import com.conveyal.analysis.components.eventbus.EventBus; +import com.conveyal.r5.analyst.NetworkPreloader; +import com.conveyal.r5.analyst.cluster.AnalysisWorker; +import com.conveyal.r5.transit.TransportNetworkCache; + +/** + * Wires up the components for a local worker instance (as opposed to a cloud-hosted worker instance). + * This establishes the implementations and dependencies between them, and supplies configuration. + * No conditional logic should be present here. + * Differences in implementation or configuration are handled by the Components themselves. + */ +public class LocalWorkerComponents extends WorkerComponents { + + /** In local operation, share the gtfs and osm cache components that the backend has already constructed. */ + public LocalWorkerComponents (TransportNetworkCache transportNetworkCache, WorkerConfig config) { + // GTFS and OSM caches and FileStorage are already referenced in the supplied TransportNetworkCache. + this.transportNetworkCache = transportNetworkCache; + // We could conceivably use the same taskScheduler and eventBus from the backend. + // In fact since we only ever start one worker in local mode, we don't need the components at all. + // We could construct a single AnalysisWorker(Component) in LocalBackendComponents. + // The ClusterWorkerComponents could then be all final again. 
+ taskScheduler = new TaskScheduler(config); + eventBus = new EventBus(taskScheduler); + analysisWorker = new AnalysisWorker(fileStorage, transportNetworkCache, eventBus, config); + // taskScheduler.repeatRegularly(...); + // eventBus.addHandlers(...); + } + +} diff --git a/src/main/java/com/conveyal/analysis/components/LocalWorkerLauncher.java b/src/main/java/com/conveyal/analysis/components/LocalWorkerLauncher.java index be8699dac..18fe795bb 100644 --- a/src/main/java/com/conveyal/analysis/components/LocalWorkerLauncher.java +++ b/src/main/java/com/conveyal/analysis/components/LocalWorkerLauncher.java @@ -1,5 +1,7 @@ package com.conveyal.analysis.components; +import com.conveyal.analysis.LocalWorkerConfig; +import com.conveyal.analysis.WorkerConfig; import com.conveyal.analysis.components.broker.WorkerTags; import com.conveyal.file.FileStorage; import com.conveyal.gtfs.GTFSCache; @@ -24,38 +26,29 @@ public class LocalWorkerLauncher implements WorkerLauncher { private static final int N_WORKERS_LOCAL_TESTING = 4; public interface Config { - String bundleBucket (); int serverPort (); String localCacheDirectory (); - String gridBucket (); boolean testTaskRedelivery(); } private final TransportNetworkCache transportNetworkCache; - private final FileStorage fileStorage; private final Properties workerConfig = new Properties(); private final int nWorkers; private final List workerThreads = new ArrayList<>(); public LocalWorkerLauncher (Config config, FileStorage fileStorage, GTFSCache gtfsCache, OSMCache osmCache) { - LOG.info("Running in OFFLINE mode, a maximum of {} worker threads will be started locally.", N_WORKERS_LOCAL); - this.fileStorage = fileStorage; - transportNetworkCache = new TransportNetworkCache( - fileStorage, - gtfsCache, - osmCache, - config.bundleBucket() - ); + LOG.debug("Running in OFFLINE mode, a maximum of {} worker threads will be started locally.", N_WORKERS_LOCAL); + WorkerComponents.fileStorage = fileStorage; // Note this is a static field for now, should eventually be changed. + transportNetworkCache = new TransportNetworkCache(fileStorage, gtfsCache, osmCache); // Create configuration for the locally running worker workerConfig.setProperty("work-offline", "true"); - // Do not auto-shutdown the local machine workerConfig.setProperty("auto-shutdown", "false"); workerConfig.setProperty("broker-address", "localhost"); workerConfig.setProperty("broker-port", Integer.toString(config.serverPort())); workerConfig.setProperty("cache-dir", config.localCacheDirectory()); - workerConfig.setProperty("pointsets-bucket", config.gridBucket()); - workerConfig.setProperty("aws-region", "eu-west-1"); // TODO remove? Should not be necessary with local worker. + workerConfig.setProperty("test-task-redelivery", "false"); + // From a throughput perspective there is no point in running more than one worker locally, since each worker // has at least as many threads as there are processor cores. But for testing purposes (e.g. 
testing that task @@ -79,18 +72,22 @@ public void launch (WorkerCategory category, WorkerTags workerTags, int nOnDeman return; } int nTotal = nOnDemand + nSpot; - LOG.info("Number of workers requested is {}.", nTotal); - nTotal = nWorkers; - LOG.info("Ignoring that and starting {} local Analysis workers...", nTotal); - + LOG.debug("Number of workers requested is {}.", nTotal); + if (nTotal != nWorkers) { + nTotal = nWorkers; + LOG.debug("Ignoring that and starting {} local Analysis workers...", nTotal); + } + if (category.graphId != null) { + // Category is null when pre-starting local workers, but can be used when launching on demand. + workerConfig.setProperty("initial-graph-id", category.graphId); + } for (int i = 0; i < nTotal; i++) { Properties singleWorkerConfig = new Properties(workerConfig); - // singleWorkerConfig.setProperty("initial-graph-id", category.graphId); // Avoid starting more than one worker on the same machine trying to listen on the same port. - if (i > 0) { - singleWorkerConfig.setProperty("listen-for-single-point", "false"); - } - AnalysisWorker worker = new AnalysisWorker(singleWorkerConfig, fileStorage, transportNetworkCache); + singleWorkerConfig.setProperty("listen-for-single-point", Boolean.toString(i == 0).toLowerCase()); + WorkerConfig config = LocalWorkerConfig.fromProperties(singleWorkerConfig); + WorkerComponents components = new LocalWorkerComponents(transportNetworkCache, config); + AnalysisWorker worker = components.analysisWorker; Thread workerThread = new Thread(worker, "WORKER " + i); workerThreads.add(workerThread); workerThread.start(); diff --git a/src/main/java/com/conveyal/analysis/components/TaskScheduler.java b/src/main/java/com/conveyal/analysis/components/TaskScheduler.java index f6c156a8c..53ba806d3 100644 --- a/src/main/java/com/conveyal/analysis/components/TaskScheduler.java +++ b/src/main/java/com/conveyal/analysis/components/TaskScheduler.java @@ -1,15 +1,27 @@ package com.conveyal.analysis.components; +import com.conveyal.analysis.UserPermissions; +import com.conveyal.r5.analyst.progress.ApiTask; +import com.conveyal.r5.analyst.progress.Task; +import com.conveyal.r5.analyst.progress.TaskAction; +import com.google.common.collect.HashMultimap; +import com.google.common.collect.Multimaps; +import com.google.common.collect.SetMultimap; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import javax.annotation.Nonnull; import java.util.ArrayList; +import java.util.Collections; import java.util.List; +import java.util.Set; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ScheduledFuture; +import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; /** * This provides application-wide queues of one-off or repeating tasks. It ensures that some tasks repeat regularly @@ -21,25 +33,64 @@ * The heavy/light distinction is somewhat like a work-stealing pool (see Javadoc on Executors.newWorkStealingPool()). * However with the workStealingPool we have no guarantees it will create more than one queue, or on execution order. * The implementation with ExecutorServices also seems to allow more control over tasks causing exceptions. + * + * This also serves to report active tasks for a given user. + * That could be considered a separate function, but it ended up having a bidirectional dependency on this + * TaskScheduler so they've been merged. 
+ * + * This is moving in the direction of having a single unified task management and reporting system across the backend + * and workers. It could be interesting to gather task status from the whole cluster of workers and merge them together + * into one view. This could conceivably even include regional analyses and chunks of work for regional analyses on + * workers. But even if such merging doesn't occur, it will be convenient to always report progress from backend and + * workers to the UI in the same data structures. + * + * So we're moving toward a programming API where you submit Tasks with attached Actions. + * + * Eventually every asynchronous task should be handled by this one mechanism, to ensure every Throwable is caught and + * cannot kill threads, as well as standardized reporting and tracking of backend and worker activity. */ public class TaskScheduler implements Component { private static final Logger LOG = LoggerFactory.getLogger(TaskScheduler.class); + // The Javadoc on the ExecutorService interface implies that it's threadsafe by mentioning happens-before. + // Inspecting the source code of ThreadPoolExecutor we see it uses locks to make task submission threadsafe. + // So we don't need to explicitly synchronize use of these executor services from multiple simultaneous requests. private final ScheduledExecutorService scheduledExecutor; private final ExecutorService lightExecutor; private final ExecutorService heavyExecutor; - // Keep the futures returned when tasks are scheduled, which give access to status information and exceptions. + // Keep the futures returned when periodic tasks are scheduled, giving access to status information and exceptions. // This should facilitate debugging. We may want to do the same thing for one-off tasks. + // This will need to be synchronized if we ever allow canceling periodic tasks. private final List periodicTaskFutures = new ArrayList<>(); + // Keep track of tasks submitted by each user, for reporting on their progress over the HTTP API. The collection is + // synchronized because multiple users may add to and read this map from different HTTP server threads. When + // reading be aware of the synchronization requirements described on Guava Multimaps.synchronizedMultimap. + // As a Guava SynchronizedMultimap, certain compound operations such as forEach do properly lock the entire multimap. + // Calls to get() return a Guava SynchronizedSet (a subtype of SynchronizedCollection) which also properly locks the + // entire parent multimap for the duration of compound operations such as forEach and removeIf. However it appears + // that stream operations must be manually synchronized. And it seems like there is a potential for another thread + // to alter the map between a call to get() and a subsequent synchronized call like forEach(). When in doubt it + // usually can't hurt to wrap a series of operations in synchronized(tasksForUser). + private final SetMultimap tasksForUser = Multimaps.synchronizedSetMultimap(HashMultimap.create()); + + // Maybe the number of threads should always be auto-set from the number of processor cores (considering SMT). public interface Config { int lightThreads (); int heavyThreads (); } - /** Interface for all actions that we want to repeat at regular intervals. */ + /** + * Interface for all actions that we want to repeat at regular intervals. + * This could be merged with all other task actions, using a getPeriodSeconds method returning -1 for non-periodic. 
+ * However this would yield interfaces with more than one method, and single-method interfaces provide for some + * syntactic flexibility (lambdas and method references). + * TODO use PeriodicTasks to handle worker record scavenging and cluster stats reporting. + * TODO Heavy/light/periodic should be indicated on the Task rather than the TaskAction passed in. + * TODO ProgressListener might benefit from methods to markComplete() and reportError(Throwable) + */ public interface PeriodicTask extends Runnable { int getPeriodSeconds(); } @@ -50,9 +101,6 @@ public TaskScheduler (Config config) { heavyExecutor = Executors.newFixedThreadPool(config.heavyThreads()); } - /** TODO handle worker record scavenging and cluster stats reporting with this. */ - // Require an interface extending runnable to pass something to the TaskScheduler constructor? - // Perhaps start periodic tasks automatically on TaskScheduler construction. public void repeatRegularly (PeriodicTask periodicTask) { String className = periodicTask.getClass().getSimpleName(); int periodSeconds = periodicTask.getPeriodSeconds(); @@ -64,11 +112,14 @@ public void repeatRegularly (PeriodicTask periodicTask) { } public void repeatRegularly (PeriodicTask... periodicTasks) { - for (PeriodicTask periodicTask : periodicTasks) { - repeatRegularly(periodicTask); + for (PeriodicTask task : periodicTasks) { + repeatRegularly(task); } } + // TODO these methods can be eliminated in favor of the single enqueue method that gets information about + // heavy/light/periodic from its Task parameter. + public void enqueueLightTask (Runnable runnable) { lightExecutor.submit(new ErrorTrap(runnable)); } @@ -100,4 +151,65 @@ public final void run () { } } + /** + * Return the status of all background tasks for the given user, as API model objects for serialization to JSON. + * Completed tasks that finished over a minute ago will be purged after returning them. This ensures they're sent + * at least once to the UI and gives any other tabs a chance to poll for them. + * Conversion to the API model is done here to allow synchronization without copying the list of internal tasks. + * The task scheduler collections are being updated by worker threads. The fields of the individual Tasks may also + * be updated at any time. So there is a risk of sending a partially updated Task out to the UI. If this ever causes + * problems we'll need to lock each Task independently. + * Returns an empty list even when no tasks have been recorded for the user (return is always non-null). + */ + public @Nonnull List getTasksForUser (String userEmail) { + synchronized (tasksForUser) { + Set tasks = tasksForUser.get(userEmail); + if (tasks == null) return Collections.emptyList(); + List apiTaskList = tasks.stream() + .map(Task::toApiTask) + .collect(Collectors.toUnmodifiableList()); + tasks.removeIf(t -> t.durationComplete().getSeconds() > 60); + return apiTaskList; + } + } + + // Q: Should the caller ever create its own Tasks, or if are tasks created here inside the TaskScheduler from + // other raw information? Having the caller creating a Task seems like a good way to configure execution details + // like heavy/light/periodic, and submit user information without passing it in. That task request could be separate + // from the internal Task object it creates, but that seems like overkill for an internal mechanism. 
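(Editor's aside, not part of the diff.) The comment block above asks how callers should use the new user-task API. A minimal sketch of one possible caller is shown below; the controller class, the endpoint paths, and the use of the toJson response transformer are hypothetical illustrations, while the Task, TaskScheduler, HttpController, and USER_PERMISSIONS_ATTRIBUTE usages mirror code added elsewhere in this changeset.

```java
package com.conveyal.analysis.controllers;

import com.conveyal.analysis.UserPermissions;
import com.conveyal.analysis.components.TaskScheduler;
import com.conveyal.r5.analyst.progress.Task;
import spark.Service;

import static com.conveyal.analysis.components.HttpApi.USER_PERMISSIONS_ATTRIBUTE;
import static com.conveyal.analysis.util.JsonUtil.toJson;

/** Hypothetical controller, for illustration only - not part of this changeset. */
public class ExampleTaskController implements HttpController {

    private final TaskScheduler taskScheduler;

    public ExampleTaskController (TaskScheduler taskScheduler) {
        this.taskScheduler = taskScheduler;
    }

    @Override
    public void registerEndpoints (Service sparkService) {
        // Kick off a slow background task for the requesting user and return to the UI immediately.
        sparkService.post("/api/example-task", (req, res) -> {
            UserPermissions user = req.attribute(USER_PERMISSIONS_ATTRIBUTE);
            taskScheduler.enqueue(Task.create("Example background work")
                    .forUser(user)
                    .setHeavy(true)
                    .withAction(progressListener -> {
                        progressListener.beginTask("Crunching...", 100);
                        for (int i = 0; i < 100; i++) progressListener.increment();
                    }));
            return "enqueued";
        }, toJson);
        // Let the UI poll this user's background tasks; getTasksForUser returns ApiTask objects ready for JSON output.
        sparkService.get("/api/activity", (req, res) ->
                taskScheduler.getTasksForUser(req.attribute("email")), toJson);
    }
}
```

The polling endpoint name and behavior are assumptions; the point is only that enqueue() registers the task under the requesting user, and getTasksForUser() later reports progress for that same user.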
+ public void newTaskForUser (UserPermissions user, TaskAction taskAction) { + Task task = Task.create("TITLE").forUser(user).withAction(taskAction); + enqueue(task); + } + + public void enqueue (Task task) { + task.validate(); + tasksForUser.put(task.user, task); + // TODO if task.isPeriodic()... except that maybe user tasks should never be periodic. Document that. + if (task.isHeavy()) { + heavyExecutor.submit(new ErrorTrap(task)); // TODO replicate ErrorTrap inside Task + } else { + lightExecutor.submit(new ErrorTrap(task)); + } + } + + /** Just demonstrating how this can be used. */ + public void example () { + this.enqueue(Task.create("Example").forUser(new UserPermissions("abyrd@conveyal.com", true, "conveyal")) + .withAction((progressListener -> { + progressListener.beginTask("Processing complicated things...", 1024); + double sum = 0; + for (int i = 0; i < 1024; i++) { + sum += Math.sqrt(i); + progressListener.increment(); + } + })) + ); + } + + /** Get the number of slower "heavy" tasks that are queued for execution and not yet being processed. */ + public int getBacklog() { + return ((ThreadPoolExecutor) heavyExecutor).getQueue().size(); + } + } diff --git a/src/main/java/com/conveyal/analysis/components/WorkerComponents.java b/src/main/java/com/conveyal/analysis/components/WorkerComponents.java new file mode 100644 index 000000000..20eb213ab --- /dev/null +++ b/src/main/java/com/conveyal/analysis/components/WorkerComponents.java @@ -0,0 +1,34 @@ +package com.conveyal.analysis.components; + +import com.conveyal.analysis.components.eventbus.EventBus; +import com.conveyal.file.FileStorage; +import com.conveyal.gtfs.GTFSCache; +import com.conveyal.r5.analyst.cluster.AnalysisWorker; +import com.conveyal.r5.streets.OSMCache; +import com.conveyal.r5.transit.TransportNetworkCache; + +/** + * A common base class for different ways of wiring up components for analysis workers for local/cloud environments. + * This is analogous to BackendComponents (a common superclass for wiring up the backend for a particular environment). + * Many of the same components are used in the backend and workers, to increase code reuse and compatibility. + * + * For now, unlike BackendComponents this is not abstract because we have only one method (with conditional logic) for + * wiring up. We expect to eventually have separate LocalWorkerComponents and ClusterWorkerComponents, which will + * eliminate some of the conditional logic in the configuration. + */ +public abstract class WorkerComponents { + + // This static field is a hack because our worker code currently uses FileStorage deep in the call stack. + // It's here rather than FileStorage.instance to emphasize that this is a quirk of the Worker code only. + public static FileStorage fileStorage; + + // INSTANCE FIELDS + // These are all references to singleton components making up a worker. + // Unfortunately these fields can't be final because we want to initialize them in subclass constructors. + // They would need to be set in an unwieldy N-arg constructor. + public TaskScheduler taskScheduler; // TODO use for regularly recurring backend polling, shutdown, all worker tasks. 
+ public EventBus eventBus; + public TransportNetworkCache transportNetworkCache; + public AnalysisWorker analysisWorker; + +} diff --git a/src/main/java/com/conveyal/analysis/components/broker/Broker.java b/src/main/java/com/conveyal/analysis/components/broker/Broker.java index d622240b5..e9c0221da 100644 --- a/src/main/java/com/conveyal/analysis/components/broker/Broker.java +++ b/src/main/java/com/conveyal/analysis/components/broker/Broker.java @@ -1,8 +1,8 @@ package com.conveyal.analysis.components.broker; import com.conveyal.analysis.AnalysisServerException; -import com.conveyal.analysis.RegionalAnalysisStatus; import com.conveyal.analysis.components.WorkerLauncher; +import com.conveyal.analysis.components.eventbus.ErrorEvent; import com.conveyal.analysis.components.eventbus.EventBus; import com.conveyal.analysis.components.eventbus.RegionalAnalysisEvent; import com.conveyal.analysis.components.eventbus.WorkerEvent; @@ -17,6 +17,7 @@ import com.conveyal.r5.analyst.cluster.RegionalWorkResult; import com.conveyal.r5.analyst.cluster.WorkerStatus; import com.conveyal.r5.analyst.scenario.Scenario; +import com.conveyal.r5.util.ExceptionUtils; import com.google.common.collect.ListMultimap; import com.google.common.collect.MultimapBuilder; import gnu.trove.TCollections; @@ -41,6 +42,8 @@ import static com.conveyal.analysis.components.eventbus.WorkerEvent.Action.REQUESTED; import static com.conveyal.analysis.components.eventbus.WorkerEvent.Role.REGIONAL; import static com.conveyal.analysis.components.eventbus.WorkerEvent.Role.SINGLE_POINT; +import static com.conveyal.file.FileCategory.BUNDLES; +import static com.google.common.base.Preconditions.checkNotNull; /** * This class distributes the tasks making up regional jobs to workers. @@ -84,8 +87,6 @@ public interface Config { // TODO Really these first two should be WorkerLauncher / Compute config boolean offline (); int maxWorkers (); - String resultsBucket (); - String bundleBucket (); boolean testTaskRedelivery (); } @@ -170,9 +171,7 @@ public synchronized void enqueueTasksForRegionalJob (RegionalAnalysis regionalAn // TODO encapsulate MultiOriginAssemblers in a new Component // Note: if this fails with an exception we'll have a job enqueued, possibly being processed, with no assembler. // That is not catastrophic, but the user may need to recognize and delete the stalled regional job. - MultiOriginAssembler assembler = new MultiOriginAssembler( - regionalAnalysis, job, config.resultsBucket(), fileStorage - ); + MultiOriginAssembler assembler = new MultiOriginAssembler(regionalAnalysis, job, fileStorage); resultAssemblers.put(templateTask.jobId, assembler); if (config.testTaskRedelivery()) { @@ -208,7 +207,7 @@ private RegionalTask templateTaskFromRegionalAnalysis (RegionalAnalysis regional // Null out the scenario in the template task, avoiding repeated serialization to the workers as massive JSON. templateTask.scenario = null; String fileName = String.format("%s_%s.json", regionalAnalysis.bundleId, scenario.id); - FileStorageKey fileStorageKey = new FileStorageKey(config.bundleBucket(), fileName); + FileStorageKey fileStorageKey = new FileStorageKey(BUNDLES, fileName); try { File localScenario = FileUtils.createScratchFile("json"); JsonUtil.objectMapper.writeValue(localScenario, scenario); @@ -270,7 +269,7 @@ public void createWorkersInCategory (WorkerCategory category, WorkerTags workerT // If workers have already been started up, don't repeat the operation. 
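(Editor's aside, not part of the diff.) Several hunks in this changeset, including templateTaskFromRegionalAnalysis above, replace configurable bucket names with the FileCategory enum on FileStorageKey. Below is a minimal sketch of the resulting storage round trip, using only the createScratchFile, moveIntoStorage, and getFile calls that appear elsewhere in this changeset; the class and file names here are illustrative.

```java
import com.conveyal.file.FileStorage;
import com.conveyal.file.FileStorageKey;
import com.conveyal.file.FileUtils;

import java.io.File;

import static com.conveyal.file.FileCategory.BUNDLES;

/** Illustrative sketch: callers tag files with a FileCategory instead of passing bucket names around. */
public class FileStorageSketch {

    /** Store a scratch file under the BUNDLES category, then fetch it back by the same key. */
    public static File roundTrip (FileStorage fileStorage) throws Exception {
        File scratch = FileUtils.createScratchFile("json");                        // temporary local file
        FileStorageKey key = new FileStorageKey(BUNDLES, "example-manifest.json");
        fileStorage.moveIntoStorage(key, scratch);                                 // persist under the BUNDLES category
        return fileStorage.getFile(key);                                           // retrieve later by the same key
    }
}
```

How a category maps to a location (a local subdirectory here, presumably a bucket or prefix in the cloud wiring) is left to the FileStorage implementation, which is the point of removing the per-bucket configuration.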
if (recentlyRequestedWorkers.containsKey(category) && recentlyRequestedWorkers.get(category) >= System.currentTimeMillis() - WORKER_STARTUP_TIME) { - LOG.info("Workers still starting on {}, not starting more", category); + LOG.debug("Workers still starting on {}, not starting more", category); return; } @@ -322,14 +321,10 @@ public synchronized List getSomeWork (WorkerCategory workerCategor * * @return whether the task was found and removed. */ - public synchronized void markTaskCompleted (String jobId, int taskId) { - Job job = findJob(jobId); - if (job == null) { - LOG.error("Could not find a job with ID {} and therefore could not mark the task as completed.", jobId); - return; - } + public synchronized void markTaskCompleted (Job job, int taskId) { + checkNotNull(job); if (!job.markTaskCompleted(taskId)) { - LOG.error("Failed to mark task {} completed on job {}.", taskId, jobId); + LOG.error("Failed to mark task {} completed on job {}.", taskId, job.jobId); } // Once the last task is marked as completed, the job is finished. // Purge it from the list to free memory. @@ -338,11 +333,16 @@ public synchronized void markTaskCompleted (String jobId, int taskId) { jobs.remove(job.workerCategory, job); // This method is called after the regional work results are handled, finishing and closing the local file. // So we can harmlessly remove the MultiOriginAssembler now that the job is removed. - resultAssemblers.remove(jobId); + resultAssemblers.remove(job.jobId); eventBus.send(new RegionalAnalysisEvent(job.jobId, COMPLETED).forUser(job.workerTags.user, job.workerTags.group)); } } + /** This method ensures synchronization of writes to Jobs from the unsynchronized worker poll HTTP handler. */ + private synchronized void recordJobError (Job job, String error) { + job.errors.add(error); + } + /** * Simple method for querying all current job statuses. * @return List of JobStatuses @@ -446,21 +446,32 @@ public void handleRegionalWorkResult(RegionalWorkResult workResult) { job = findJob(workResult.jobId); assembler = resultAssemblers.get(workResult.jobId); } - if (assembler == null) { - LOG.error("Received result for unrecognized job ID {}, discarding.", workResult.jobId); - } else { - // FIXME this is building up to 5 grids and uploading them to S3, this should not be done synchronously in - // an HTTP handler. + if (job == null || assembler == null) { + // This will happen naturally for all delivered tasks when a job is deleted by the user. + LOG.debug("Received result for unrecognized job ID {}, discarding.", workResult.jobId); + return; + } + if (workResult.error != null) { + // Just record the error reported by the worker and don't pass the result on to regional result assembly. + // The Job will stop delivering tasks, allowing workers to shut down. User will need to manually delete it. + recordJobError(job, workResult.error); + return; + } + // When the last task is received, this will build up to 5 grids and upload them to S3. That should probably + // not be done synchronously in an HTTP handler called by the worker (likewise for starting workers below). 
+ try { assembler.handleMessage(workResult); - // When results for the task with the magic number are received, consider boosting the job by starting EC2 - // spot instances - if (workResult.taskId == AUTO_START_SPOT_INSTANCES_AT_TASK) { - requestExtraWorkersIfAppropriate(job); - } + markTaskCompleted(job, workResult.taskId); + } catch (Throwable t) { + recordJobError(job, ExceptionUtils.stackTraceString(t)); + eventBus.send(new ErrorEvent(t)); + return; + } + // When non-error results are received for several tasks we assume the regional analysis is running smoothly. + // Consider accelerating the job by starting an appropriate number of EC2 spot instances. + if (workResult.taskId == AUTO_START_SPOT_INSTANCES_AT_TASK) { + requestExtraWorkersIfAppropriate(job); } - - markTaskCompleted(workResult.jobId, workResult.taskId); - } private void requestExtraWorkersIfAppropriate(Job job) { @@ -477,18 +488,6 @@ private void requestExtraWorkersIfAppropriate(Job job) { } } - /** - * Returns a simple status object intended to inform the UI of job progress. - */ - public RegionalAnalysisStatus getJobStatus (String jobId) { - MultiOriginAssembler resultAssembler = resultAssemblers.get(jobId); - if (resultAssembler == null) { - return null; - } else { - return new RegionalAnalysisStatus(resultAssembler); - } - } - public File getPartialRegionalAnalysisResults (String jobId) { MultiOriginAssembler resultAssembler = resultAssemblers.get(jobId); if (resultAssembler == null) { @@ -500,7 +499,7 @@ public File getPartialRegionalAnalysisResults (String jobId) { public synchronized boolean anyJobsActive () { for (Job job : jobs.values()) { - if (!job.isComplete()) return true; + if (job.isActive()) return true; } return false; } diff --git a/src/main/java/com/conveyal/analysis/components/broker/Job.java b/src/main/java/com/conveyal/analysis/components/broker/Job.java index b7aa66384..fa2c2ca66 100644 --- a/src/main/java/com/conveyal/analysis/components/broker/Job.java +++ b/src/main/java/com/conveyal/analysis/components/broker/Job.java @@ -1,8 +1,6 @@ package com.conveyal.analysis.components.broker; -import com.conveyal.r5.analyst.FreeFormPointSet; import com.conveyal.r5.analyst.Grid; -import com.conveyal.r5.analyst.PointSetCache; import com.conveyal.r5.analyst.WorkerCategory; import com.conveyal.r5.analyst.cluster.RegionalTask; import org.slf4j.Logger; @@ -10,8 +8,11 @@ import java.util.ArrayList; import java.util.BitSet; +import java.util.HashSet; import java.util.List; +import java.util.Set; +import static com.conveyal.r5.common.Util.notNullOrEmpty; import static com.google.common.base.Preconditions.checkNotNull; /** @@ -121,6 +122,12 @@ private RegionalTask makeOneTask (int taskNumber) { */ public int deliveryPass = 0; + /** + * If any error compromises the usabilty or quality of results from any origin, it is recorded here. + * This is a Set because identical errors are likely to be reported from many workers or individual tasks. + */ + public final Set errors = new HashSet(); + public Job (RegionalTask templateTask, WorkerTags workerTags) { this.jobId = templateTask.jobId; this.templateTask = templateTask; @@ -155,10 +162,18 @@ public boolean markTaskCompleted(int taskId) { } } + public boolean isActive() { + return !(isComplete() || isErrored()); + } + public boolean isComplete() { return nTasksCompleted == nTasksTotal; } + public boolean isErrored () { + return notNullOrEmpty(this.errors); + } + /** * @param maxTasks the maximum number of tasks to return. 
* @return some tasks that are not yet marked as completed and have not yet been delivered in @@ -181,7 +196,7 @@ public List generateSomeTasksToDeliver (int maxTasks) { } public boolean hasTasksToDeliver() { - if (this.isComplete()) { + if (!(this.isActive())) { return false; } if (nextTaskToDeliver < nTasksTotal) { diff --git a/src/main/java/com/conveyal/analysis/components/broker/JobStatus.java b/src/main/java/com/conveyal/analysis/components/broker/JobStatus.java index cc4c250de..f94a2a6a3 100644 --- a/src/main/java/com/conveyal/analysis/components/broker/JobStatus.java +++ b/src/main/java/com/conveyal/analysis/components/broker/JobStatus.java @@ -2,6 +2,8 @@ import com.conveyal.analysis.models.RegionalAnalysis; +import java.util.Set; + /** * Describes the status of a Job in a REST API response. */ @@ -34,6 +36,9 @@ public class JobStatus { /** Active instances working on this job. **/ public int activeWorkers; + /** Error messages for any problems related to this job that occurred on workers or backend. */ + public Set errors; + /** The regional analysis associated with this job */ public RegionalAnalysis regionalAnalysis; @@ -50,5 +55,6 @@ public JobStatus (Job job) { this.incomplete = total - complete; this.deliveries = job.nTasksDelivered; this.deliveryPass = job.deliveryPass; + this.errors = job.errors; } } diff --git a/src/main/java/com/conveyal/analysis/components/broker/RedeliveryTest.java b/src/main/java/com/conveyal/analysis/components/broker/RedeliveryTest.java index a90f52a94..3e6b53326 100644 --- a/src/main/java/com/conveyal/analysis/components/broker/RedeliveryTest.java +++ b/src/main/java/com/conveyal/analysis/components/broker/RedeliveryTest.java @@ -1,7 +1,7 @@ package com.conveyal.analysis.components.broker; -import com.conveyal.analysis.components.Components; -import com.conveyal.analysis.components.LocalComponents; +import com.conveyal.analysis.components.BackendComponents; +import com.conveyal.analysis.components.LocalBackendComponents; import com.conveyal.analysis.models.RegionalAnalysis; import com.conveyal.r5.analyst.cluster.RegionalTask; import org.slf4j.Logger; @@ -30,7 +30,7 @@ public class RedeliveryTest { public static void main(String[] params) { // Start an analysis server with the default (offline) properties. - Components components = new LocalComponents(); + BackendComponents components = new LocalBackendComponents(); // FIXME this is a hackish way to test - the called method shouldn't be public. // BackendMain.startServer(components); // components.config.testTaskRedelivery = true; diff --git a/src/main/java/com/conveyal/analysis/components/eventbus/ErrorEvent.java b/src/main/java/com/conveyal/analysis/components/eventbus/ErrorEvent.java new file mode 100644 index 000000000..bc86bebc0 --- /dev/null +++ b/src/main/java/com/conveyal/analysis/components/eventbus/ErrorEvent.java @@ -0,0 +1,27 @@ +package com.conveyal.analysis.components.eventbus; + +import com.conveyal.r5.util.ExceptionUtils; + +/** + * This Event is fired each time a Throwable (usually an Exception or Error) occurs on the backend. It can then be + * recorded or tracked in various places - the console logs, Slack, etc. This could eventually be used for errors on + * the workers as well, but we'd have to be careful not to generate hundreds of messages at once. + */ +public class ErrorEvent extends Event { + + // We may serialize this object, so we convert the Throwable to two strings to control its representation. 
+    // For flexibility in event handlers, it is tempting to hold on to the original Throwable instead of derived
+    // Strings. Exceptions are famously slow, but it's the initial creation and filling in the stack trace that are
+    // slow. Once the instance exists, repeatedly examining its stack trace should not be prohibitively costly. Still,
+    // we do probably gain some efficiency by converting the stack trace to a String once and reusing that.
+
+    public final String summary;
+
+    public final String stackTrace;
+
+    public ErrorEvent (Throwable throwable) {
+        this.summary = ExceptionUtils.shortCauseString(throwable);
+        this.stackTrace = ExceptionUtils.stackTraceString(throwable);
+    }
+
+}
diff --git a/src/main/java/com/conveyal/analysis/components/eventbus/ErrorLogger.java b/src/main/java/com/conveyal/analysis/components/eventbus/ErrorLogger.java
new file mode 100644
index 000000000..539753aec
--- /dev/null
+++ b/src/main/java/com/conveyal/analysis/components/eventbus/ErrorLogger.java
@@ -0,0 +1,33 @@
+package com.conveyal.analysis.components.eventbus;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Log selected events to the console. Mostly to ensure a redundant record of severe errors.
+ * All other observations on throughput, HTTP API requests etc. can be recorded in more structured ways.
+ */
+public class ErrorLogger implements EventHandler {
+
+    private static final Logger LOG = LoggerFactory.getLogger(ErrorLogger.class);
+
+    @Override
+    public void handleEvent (Event event) {
+        if (event instanceof ErrorEvent) {
+            ErrorEvent errorEvent = (ErrorEvent) event;
+            LOG.error("User {} of {}: {}", errorEvent.user, errorEvent.accessGroup, errorEvent.stackTrace);
+        }
+    }
+
+    @Override
+    public boolean acceptEvent (Event event) {
+        return event instanceof ErrorEvent;
+    }
+
+    @Override
+    public boolean synchronous () {
+        // Log call is very fast and we want to make sure it happens. Do not hand off to another thread.
+        return true;
+    }
+
+}
diff --git a/src/main/java/com/conveyal/analysis/components/eventbus/Event.java b/src/main/java/com/conveyal/analysis/components/eventbus/Event.java
index 36871b2ee..92413be0d 100644
--- a/src/main/java/com/conveyal/analysis/components/eventbus/Event.java
+++ b/src/main/java/com/conveyal/analysis/components/eventbus/Event.java
@@ -6,8 +6,9 @@
 import java.util.Set;
 
 /**
- * This could extend BaseModel, but it's not a domain model class (describing transit or land use data or analysis),
- * it's metadata about server operation and user activity.
+ * This could extend BaseModel, but it's not a domain model class (describing transit or land use data or analysis).
+ * It's metadata about server operation and user activity. These are intended to be serialized into a database or log,
+ * so the field visibility and types of every subclass should take that into consideration.
  */
 public abstract class Event {
 
@@ -28,6 +29,8 @@ public abstract class Event {
     /**
      * Set the user and groups from the supplied userPermissions object (if any) and return the modified instance.
+     * These fluent setter methods return this abstract supertype instead of the specific subtype, which can be a
+     * little awkward. But the alternative of declaring Event and casting is more ugly.
      * @param userPermissions if this is null, the call will have no effect.
      */
     public Event forUser (UserPermissions userPermissions) {
diff --git a/src/main/java/com/conveyal/analysis/components/eventbus/EventBus.java b/src/main/java/com/conveyal/analysis/components/eventbus/EventBus.java
index f18a1d544..304c44d69 100644
--- a/src/main/java/com/conveyal/analysis/components/eventbus/EventBus.java
+++ b/src/main/java/com/conveyal/analysis/components/eventbus/EventBus.java
@@ -8,6 +8,8 @@
 import java.util.ArrayList;
 import java.util.List;
 
+import static com.google.common.base.Preconditions.checkState;
+
 /**
  * Sometimes a custom component needs to receive notifications from standard components. The standard component does
  * not know about the custom component, so cannot call it directly. The standard component can export a listener
@@ -33,8 +35,9 @@ public EventBus (TaskScheduler taskScheduler) {
         this.taskScheduler = taskScheduler;
     }
 
-    /** This is not synchronized, so you should add all handlers at once before any events are fired. */
+    /** This class is not synchronized, so you should add all handlers at once before any events are fired. */
     public void addHandlers (EventHandler... handlers) {
+        checkState(this.handlers.isEmpty());
         for (EventHandler handler : handlers) {
             LOG.info("An instance of {} will receive events.", handler.getClass().getSimpleName());
             this.handlers.add(handler);
@@ -52,10 +55,15 @@ public <T extends Event> void send (final T event) {
             try {
                 handler.handleEvent(event);
             } catch (Throwable t) {
+                // Do not recursively fire events on errors, there is some programming mistake.
                 LOG.error("Event handler failed: {}", t.toString());
                 t.printStackTrace();
             }
         } else {
+            // We do not use the full taskScheduler.Task system here because event handlers are intended to be
+            // simple and fast, and they are internal details (like logging) that shouldn't clutter up a user's
+            // progress reports until manually cleared. This may be an argument for EventBus having its own
+            // executor, or bypassing Task.
             taskScheduler.enqueueLightTask(() -> handler.handleEvent(event));
         }
     }
diff --git a/src/main/java/com/conveyal/analysis/components/eventbus/HandleRegionalEvent.java b/src/main/java/com/conveyal/analysis/components/eventbus/HandleRegionalEvent.java
new file mode 100644
index 000000000..0cac11b12
--- /dev/null
+++ b/src/main/java/com/conveyal/analysis/components/eventbus/HandleRegionalEvent.java
@@ -0,0 +1,8 @@
+package com.conveyal.analysis.components.eventbus;
+
+/**
+ * Event fired by a worker when it is handling a regional task (after it has loaded the relevant transport network).
+ */
+public class HandleRegionalEvent extends Event {
+
+}
diff --git a/src/main/java/com/conveyal/analysis/components/eventbus/HandleSinglePointEvent.java b/src/main/java/com/conveyal/analysis/components/eventbus/HandleSinglePointEvent.java
new file mode 100644
index 000000000..fcea40763
--- /dev/null
+++ b/src/main/java/com/conveyal/analysis/components/eventbus/HandleSinglePointEvent.java
@@ -0,0 +1,8 @@
+package com.conveyal.analysis.components.eventbus;
+
+/**
+ * Event fired by a worker when it is handling a single point task (after it has loaded the relevant transport network).
+ */ +public class HandleSinglePointEvent extends Event { + +} diff --git a/src/main/java/com/conveyal/analysis/components/eventbus/NetworkPreloadEvent.java b/src/main/java/com/conveyal/analysis/components/eventbus/NetworkPreloadEvent.java new file mode 100644 index 000000000..1b6e4ff3b --- /dev/null +++ b/src/main/java/com/conveyal/analysis/components/eventbus/NetworkPreloadEvent.java @@ -0,0 +1,8 @@ +package com.conveyal.analysis.components.eventbus; + +/** + * Event fired by a worker when it begins a potentially slow operation loading and processing a transit network. + */ +public class NetworkPreloadEvent { + +} diff --git a/src/main/java/com/conveyal/analysis/components/eventbus/SinglePointEvent.java b/src/main/java/com/conveyal/analysis/components/eventbus/SinglePointEvent.java index 370a33f4a..a223a5a3b 100644 --- a/src/main/java/com/conveyal/analysis/components/eventbus/SinglePointEvent.java +++ b/src/main/java/com/conveyal/analysis/components/eventbus/SinglePointEvent.java @@ -1,7 +1,7 @@ package com.conveyal.analysis.components.eventbus; /** - * Created by abyrd on 2020-06-12 + * Fired when the Backend is handling a single point request (forwarding it to a worker). */ public class SinglePointEvent extends Event { @@ -22,7 +22,6 @@ public SinglePointEvent (String scenarioId, String projectId, int variant, int d this.durationMsec = durationMsec; } - @Override public String toString () { return "SinglePointEvent{" + diff --git a/src/main/java/com/conveyal/analysis/controllers/AggregationAreaController.java b/src/main/java/com/conveyal/analysis/controllers/AggregationAreaController.java index 7085dca74..faca62d73 100644 --- a/src/main/java/com/conveyal/analysis/controllers/AggregationAreaController.java +++ b/src/main/java/com/conveyal/analysis/controllers/AggregationAreaController.java @@ -34,8 +34,9 @@ import java.util.stream.Collectors; import java.util.zip.GZIPOutputStream; -import static com.conveyal.analysis.models.OpportunityDataset.ZOOM; import static com.conveyal.analysis.util.JsonUtil.toJson; +import static com.conveyal.file.FileCategory.GRIDS; +import static com.conveyal.r5.analyst.WebMercatorGridPointSet.parseZoom; /** * Stores vector aggregationAreas (used to define the region of a weighted average accessibility metric). @@ -52,22 +53,13 @@ public class AggregationAreaController implements HttpController { private static int MAX_FEATURES = 100; private final FileStorage fileStorage; - private final Config config; - public interface Config { - // TODO this could be eliminated by hard-wiring file types to bucket subdirectories in the FileStorage. - String gridBucket (); - } - - public AggregationAreaController (FileStorage fileStorage, Config config) { + public AggregationAreaController (FileStorage fileStorage) { this.fileStorage = fileStorage; - this.config = config; } - - private FileStorageKey getStoragePath (AggregationArea area) { - return new FileStorageKey(config.gridBucket(), area.getS3Key()); + return new FileStorageKey(GRIDS, area.getS3Key()); } /** @@ -131,6 +123,8 @@ private List createAggregationAreas (Request req, Response res) Map areas = new HashMap<>(); boolean unionRequested = Boolean.parseBoolean(query.get("union").get(0).getString()); + String zoomString = query.get("zoom") == null ? 
null : query.get("zoom").get(0).getString(); + final int zoom = parseZoom(zoomString); if (!unionRequested && features.size() > MAX_FEATURES) { throw AnalysisServerException.fileUpload(MessageFormat.format("The uploaded shapefile has {0} features, " + @@ -152,7 +146,7 @@ private List createAggregationAreas (Request req, Response res) // 3. Convert to raster grids, then store them. ================================================================ areas.forEach((String name, Geometry geometry) -> { Envelope env = geometry.getEnvelopeInternal(); - Grid maskGrid = new Grid(ZOOM, env); + Grid maskGrid = new Grid(zoom, env); // Store the percentage each cell overlaps the mask, scaled as 0 to 100,000 List weights = maskGrid.getPixelWeights(geometry, true); diff --git a/src/main/java/com/conveyal/analysis/controllers/BrokerController.java b/src/main/java/com/conveyal/analysis/controllers/BrokerController.java index ccbda18a4..39f9cd18f 100644 --- a/src/main/java/com/conveyal/analysis/controllers/BrokerController.java +++ b/src/main/java/com/conveyal/analysis/controllers/BrokerController.java @@ -1,6 +1,5 @@ package com.conveyal.analysis.controllers; -import com.amazonaws.services.s3.Headers; import com.conveyal.analysis.AnalysisServerException; import com.conveyal.analysis.UserPermissions; import com.conveyal.analysis.components.broker.Broker; @@ -28,6 +27,7 @@ import com.google.common.collect.ImmutableMap; import com.google.common.io.ByteStreams; import com.mongodb.QueryBuilder; +import com.sun.net.httpserver.Headers; import org.apache.http.Header; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; @@ -181,7 +181,7 @@ private Object singlePoint(Request request, Response response) { broker.recentlyRequestedWorkers.remove(workerCategory); } String workerUrl = "http://" + address + ":7080/single"; // TODO remove hard-coded port number. - LOG.info("Re-issuing HTTP request from UI to worker at {}", workerUrl); + LOG.debug("Re-issuing HTTP request from UI to worker at {}", workerUrl); HttpPost httpPost = new HttpPost(workerUrl); // httpPost.setHeader("Accept", "application/x-analysis-time-grid"); // TODO Explore: is this unzipping and re-zipping the result from the worker? @@ -195,9 +195,9 @@ private Object singlePoint(Request request, Response response) { response.status(workerResponse.getStatusLine().getStatusCode()); // Mimic headers sent by the worker. We're mostly interested in Content-Type, maybe Content-Encoding. // We do not want to mimic all headers like Date, Server etc. - Header contentTypeHeader = workerResponse.getFirstHeader(Headers.CONTENT_TYPE); + Header contentTypeHeader = workerResponse.getFirstHeader("Content-Type"); response.header(contentTypeHeader.getName(), contentTypeHeader.getValue()); - LOG.info("Returning worker response to UI with status code {} and content type {}", + LOG.debug("Returning worker response to UI with status code {} and content type {}", workerResponse.getStatusLine(), contentTypeHeader.getValue()); // This header will cause the Spark Framework to gzip the data automatically if requested by the client. response.header("Content-Encoding", "gzip"); @@ -221,7 +221,7 @@ private Object singlePoint(Request request, Response response) { // probably degrades the perceived responsiveness of single-point requests. 
return ByteStreams.toByteArray(entity.getContent()); } catch (SocketTimeoutException ste) { - LOG.info("Timeout waiting for response from worker."); + LOG.warn("Timeout waiting for response from worker."); // Aborting the request might help release resources - we had problems with exhausting connection pools here. httpPost.abort(); return jsonResponse(response, HttpStatus.BAD_REQUEST_400, "Routing server timed out. For the " + @@ -229,7 +229,7 @@ private Object singlePoint(Request request, Response response) { "using Routing Engine version < 4.5.1, your scenario may still be in preparation and you should " + "try again in a few minutes."); } catch (NoRouteToHostException nrthe){ - LOG.info("Worker in category {} was previously cataloged but is not reachable now. This is expected if a " + + LOG.warn("Worker in category {} was previously cataloged but is not reachable now. This is expected if a " + "user made a single-point request within WORKER_RECORD_DURATION_MSEC after shutdown.", workerCategory); httpPost.abort(); broker.unregisterSinglePointWorker(workerCategory); diff --git a/src/main/java/com/conveyal/analysis/controllers/BundleController.java b/src/main/java/com/conveyal/analysis/controllers/BundleController.java index 08635381a..68d4d28a7 100644 --- a/src/main/java/com/conveyal/analysis/controllers/BundleController.java +++ b/src/main/java/com/conveyal/analysis/controllers/BundleController.java @@ -1,12 +1,14 @@ package com.conveyal.analysis.controllers; import com.conveyal.analysis.AnalysisServerException; -import com.conveyal.analysis.components.Components; +import com.conveyal.analysis.UserPermissions; +import com.conveyal.analysis.components.BackendComponents; import com.conveyal.analysis.components.TaskScheduler; import com.conveyal.analysis.models.Bundle; import com.conveyal.analysis.persistence.Persistence; import com.conveyal.analysis.util.HttpUtils; import com.conveyal.analysis.util.JsonUtil; +import com.conveyal.r5.analyst.progress.ProgressInputStream; import com.conveyal.file.FileStorage; import com.conveyal.file.FileStorageKey; import com.conveyal.file.FileUtils; @@ -15,6 +17,7 @@ import com.conveyal.gtfs.model.Stop; import com.conveyal.osmlib.OSM; import com.conveyal.r5.analyst.cluster.BundleManifest; +import com.conveyal.r5.analyst.progress.Task; import com.conveyal.r5.streets.OSMCache; import com.conveyal.r5.util.ExceptionUtils; import com.mongodb.QueryBuilder; @@ -29,7 +32,9 @@ import spark.Service; import java.io.File; +import java.io.FileWriter; import java.io.IOException; +import java.io.Writer; import java.time.LocalDate; import java.util.ArrayList; import java.util.Collection; @@ -38,7 +43,10 @@ import java.util.stream.Collectors; import java.util.zip.ZipFile; +import static com.conveyal.analysis.components.HttpApi.USER_PERMISSIONS_ATTRIBUTE; +import static com.conveyal.r5.analyst.progress.WorkProductType.BUNDLE; import static com.conveyal.analysis.util.JsonUtil.toJson; +import static com.conveyal.file.FileCategory.BUNDLES; /** * This Controller provides HTTP REST endpoints for manipulating Bundles. Bundles are sets of GTFS feeds and OSM @@ -55,16 +63,15 @@ public class BundleController implements HttpController { private final FileStorage fileStorage; private final GTFSCache gtfsCache; + // FIXME The backend appears to use an osmcache purely to get a file key at which to store incoming OSM. Refactor. 
private final OSMCache osmCache; private final TaskScheduler taskScheduler; - private final String bundleBucket; - public BundleController (Components components) { + public BundleController (BackendComponents components) { this.fileStorage = components.fileStorage; this.gtfsCache = components.gtfsCache; this.osmCache = components.osmCache; this.taskScheduler = components.taskScheduler; - this.bundleBucket = components.config.bundleBucket(); } // INTERFACE METHOD @@ -80,10 +87,6 @@ public void registerEndpoints (Service sparkService) { }); } - public interface Config { - String bundleBucket (); - } - // HTTP REQUEST HANDLERS /** @@ -100,7 +103,7 @@ public interface Config { * Or simply not have "bundles" at all, and just supply a list of OSM and GTFS unique IDs to the workers. */ private Bundle create (Request req, Response res) { - // create the bundle + // Do some initial synchronous work setting up the bundle to fail fast if the request is bad. final Map> files = HttpUtils.getRequestFiles(req.raw()); final Bundle bundle = new Bundle(); try { @@ -131,38 +134,42 @@ private Bundle create (Request req, Response res) { bundle.feedsComplete = bundleWithFeed.feedsComplete; bundle.totalFeeds = bundleWithFeed.totalFeeds; } + bundle.accessGroup = req.attribute("accessGroup"); + bundle.createdBy = req.attribute("email"); } catch (Exception e) { - throw AnalysisServerException.badRequest(ExceptionUtils.asString(e)); + throw AnalysisServerException.badRequest(ExceptionUtils.stackTraceString(e)); } - - // Set `createdBy` and `accessGroup` - bundle.accessGroup = req.attribute("accessGroup"); - bundle.createdBy = req.attribute("email"); - + // ID and create/update times are assigned here when we push into Mongo. + // FIXME Ideally we'd only set and retain the ID without inserting in Mongo, + // but existing create() method with side effects would overwrite the ID. Persistence.bundles.create(bundle); - // Process OSM first, then each feed sequentially. Asynchronous so we can respond to the HTTP API call. - taskScheduler.enqueueHeavyTask(() -> { - try { + // Submit all slower work for asynchronous processing on the backend, then immediately return the partially + // constructed bundle from the HTTP handler. Process OSM first, then each GTFS feed sequentially. + UserPermissions userPermissions = req.attribute(USER_PERMISSIONS_ATTRIBUTE); + taskScheduler.enqueue(Task.create("Processing bundle " + bundle.name) + .forUser(userPermissions) + .setHeavy(true) + .withWorkProduct(BUNDLE, bundle._id, bundle.regionId) + .withAction(progressListener -> { + try { if (bundle.osmId == null) { // Process uploaded OSM. - bundle.status = Bundle.Status.PROCESSING_OSM; bundle.osmId = new ObjectId().toString(); - Persistence.bundles.modifiyWithoutUpdatingLock(bundle); - DiskFileItem fi = (DiskFileItem) files.get("osm").get(0); + // Here we perform minimal validation by loading the OSM, but don't retain the resulting MapDB. OSM osm = new OSM(null); osm.intersectionDetection = true; - osm.readPbf(fi.getInputStream()); - + // Number of entities in an OSM file is unknown, so derive progress from the number of bytes read. + // Wrapping in buffered input stream should reduce number of progress updates. 
+ osm.readPbf(ProgressInputStream.forFileItem(fi, progressListener)); + // osm.readPbf(new BufferedInputStream(fi.getInputStream())); fileStorage.moveIntoStorage(osmCache.getKey(bundle.osmId), fi.getStoreLocation()); } if (bundle.feedGroupId == null) { // Process uploaded GTFS files - bundle.status = Bundle.Status.PROCESSING_GTFS; bundle.feedGroupId = new ObjectId().toString(); - Persistence.bundles.modifiyWithoutUpdatingLock(bundle); Envelope bundleBounds = new Envelope(); bundle.serviceStart = LocalDate.MAX; @@ -175,12 +182,14 @@ private Bundle create (Request req, Response res) { ZipFile zipFile = new ZipFile(feedFile); File tempDbFile = FileUtils.createScratchFile("db"); File tempDbpFile = new File(tempDbFile.getAbsolutePath() + ".p"); + File tempErrorJsonFile = new File(tempDbFile.getAbsolutePath() + ".error.json"); - GTFSFeed feed = new GTFSFeed(tempDbFile); + GTFSFeed feed = GTFSFeed.newWritableFile(tempDbFile); + feed.progressListener = progressListener; feed.loadFromFile(zipFile, new ObjectId().toString()); - feed.findPatterns(); // Populate the metadata while the feed is open + // TODO also get service range, hours per day etc. and error summary (and complete error JSON). Bundle.FeedSummary feedSummary = new Bundle.FeedSummary(feed, bundle.feedGroupId); bundle.feeds.add(feedSummary); @@ -196,6 +205,15 @@ private Bundle create (Request req, Response res) { bundle.serviceEnd = feedSummary.serviceEnd; } + // Save all errors to a file. + try (Writer jsonWriter = new FileWriter(tempErrorJsonFile)) { + JsonUtil.objectMapper.writeValue(jsonWriter, feed.errors); + } catch (IOException e) { + throw new RuntimeException(e); + } + // Save some space in the MapDB after we've summarized the errors to Mongo and a JSON file. + feed.errors.clear(); + // Flush db files to disk feed.close(); @@ -203,13 +221,10 @@ private Bundle create (Request req, Response res) { fileStorage.moveIntoStorage(gtfsCache.getFileKey(feedSummary.bundleScopedFeedId, "db"), tempDbFile); fileStorage.moveIntoStorage(gtfsCache.getFileKey(feedSummary.bundleScopedFeedId, "db.p"), tempDbpFile); fileStorage.moveIntoStorage(gtfsCache.getFileKey(feedSummary.bundleScopedFeedId, "zip"), feedFile); - - // Increment feeds complete for the progress handler - bundle.feedsComplete += 1; - - // Done in a loop the nonce and updatedAt would be changed repeatedly - Persistence.bundles.modifiyWithoutUpdatingLock(bundle); + fileStorage.moveIntoStorage(gtfsCache.getFileKey(feedSummary.bundleScopedFeedId, "error.json"), tempErrorJsonFile); } + // Set legacy progress field to indicate that all feeds have been loaded. + bundle.feedsComplete = bundle.totalFeeds; // TODO Handle crossing the antimeridian bundle.north = bundleBounds.getMaxY(); @@ -217,23 +232,22 @@ private Bundle create (Request req, Response res) { bundle.east = bundleBounds.getMaxX(); bundle.west = bundleBounds.getMinX(); } - writeManifestToCache(bundle); bundle.status = Bundle.Status.DONE; - } catch (Exception e) { - // This catches any error while processing a feed with the GTFS Api and needs to be more - // robust in bubbling up the specific errors to the UI. Really, we need to separate out the - // idea of bundles, track uploads of single feeds at a time, and allow the creation of a - // "bundle" at a later point. This updated error handling is a stopgap until we improve that - // flow. 
- LOG.error("Error creating bundle", e); + } catch (Throwable t) { + LOG.error("Error creating bundle", t); bundle.status = Bundle.Status.ERROR; - bundle.statusText = ExceptionUtils.asString(e); - } finally { + bundle.statusText = ExceptionUtils.shortAndLongString(t); + // Rethrow the problem so the task scheduler will attach it to the task with state ERROR. + // Eventually this whole catch and finally clause should be handled generically up in the task scheduler. + throw t; + } finally { + // ID and create/update times are assigned here when we push into Mongo. Persistence.bundles.modifiyWithoutUpdatingLock(bundle); - } - }); - + } + })); + // TODO do we really want to return the bundle here? It should not be needed until the background work is done. + // We could instead return the WorkProduct instance (or BaseModel instance or null) from TaskActions. return bundle; } @@ -246,13 +260,13 @@ private void writeManifestToCache (Bundle bundle) throws IOException { File manifestFile = FileUtils.createScratchFile("json"); JsonUtil.objectMapper.writeValue(manifestFile, manifest); - FileStorageKey key = new FileStorageKey(bundleBucket, manifestFileName); + FileStorageKey key = new FileStorageKey(BUNDLES, manifestFileName); fileStorage.moveIntoStorage(key, manifestFile); } private Bundle deleteBundle (Request req, Response res) throws IOException { Bundle bundle = Persistence.bundles.removeIfPermitted(req.params("_id"), req.attribute("accessGroup")); - FileStorageKey key = new FileStorageKey(bundleBucket, bundle._id + ".zip"); + FileStorageKey key = new FileStorageKey(BUNDLES, bundle._id + ".zip"); fileStorage.delete(key); return bundle; diff --git a/src/main/java/com/conveyal/analysis/controllers/LocalFilesController.java b/src/main/java/com/conveyal/analysis/controllers/LocalFilesController.java new file mode 100644 index 000000000..ed7d5b38d --- /dev/null +++ b/src/main/java/com/conveyal/analysis/controllers/LocalFilesController.java @@ -0,0 +1,57 @@ +package com.conveyal.analysis.controllers; + +import com.conveyal.file.FileCategory; +import com.conveyal.file.FileStorage; +import com.conveyal.file.FileStorageFormat; +import com.conveyal.file.FileStorageKey; +import com.conveyal.file.FileUtils; +import spark.Request; +import spark.Response; +import spark.Service; + +import java.io.File; +import java.io.InputStream; +import java.util.Locale; + +/** + * Expose all files in storage while in offline mode. + * Not done with Spark's built-in static file serving because that automatically gzips our already gzipped files. + * Another good reason to eventually code directly against an HTTP server instead of using this framework. + */ +public class LocalFilesController implements HttpController { + + // Something feels whack here, this should more specifically be a LocalFileStorage + private final FileStorage fileStorage; + + public LocalFilesController (FileStorage fileStorage) { + this.fileStorage = fileStorage; + } + + private InputStream getFile (Request req, Response res) throws Exception { + String filename = req.splat()[0]; + FileCategory category = FileCategory.valueOf(req.params("category").toUpperCase(Locale.ROOT)); + FileStorageKey key = new FileStorageKey(category, filename); + File file = fileStorage.getFile(key); + FileStorageFormat format = FileStorageFormat.fromFilename(filename); + res.type(format.mimeType); + + // If the content-encoding is set to gzip, Spark automatically gzips the response. This mangles data + // that was already gzipped. 
Therefore, check if it's gzipped and pipe directly to the raw OutputStream. + res.header("Content-Encoding", "gzip"); + if (FileUtils.isGzip(file)) { + // TODO Trace in debug: how does this actually work? + // Verify what this is transferring into - a buffer? In another reading thread? + // Is Jetty ServletOutputStream implementation automatically threaded or buffered? + FileUtils.transferFromFileTo(file, res.raw().getOutputStream()); + return null; + } else { + return FileUtils.getInputStream(file); + } + } + + @Override + public void registerEndpoints (Service sparkService) { + sparkService.get("/files/:category/*", this::getFile); + } + +} diff --git a/src/main/java/com/conveyal/analysis/controllers/OpportunityDatasetController.java b/src/main/java/com/conveyal/analysis/controllers/OpportunityDatasetController.java index f0a911d1d..5a6532453 100644 --- a/src/main/java/com/conveyal/analysis/controllers/OpportunityDatasetController.java +++ b/src/main/java/com/conveyal/analysis/controllers/OpportunityDatasetController.java @@ -55,8 +55,9 @@ import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; -import static com.conveyal.analysis.models.OpportunityDataset.ZOOM; import static com.conveyal.analysis.util.JsonUtil.toJson; +import static com.conveyal.file.FileCategory.GRIDS; +import static com.conveyal.r5.analyst.WebMercatorGridPointSet.parseZoom; /** * Controller that handles fetching opportunity datasets (grids and other pointset formats). @@ -71,22 +72,16 @@ public class OpportunityDatasetController implements HttpController { private final FileStorage fileStorage; private final TaskScheduler taskScheduler; - private final Config config; - - public interface Config { - String gridBucket (); - String seamlessCensusBucket(); - boolean offline (); - } + private final SeamlessCensusGridExtractor extractor; public OpportunityDatasetController ( FileStorage fileStorage, TaskScheduler taskScheduler, - Config config + SeamlessCensusGridExtractor extractor ) { this.fileStorage = fileStorage; this.taskScheduler = taskScheduler; - this.config = config; + this.extractor = extractor; } /** Store upload status objects FIXME trivial Javadoc */ @@ -142,12 +137,10 @@ private boolean clearStatus(Request req, Response res) { } private OpportunityDatasetUploadStatus downloadLODES(Request req, Response res) { - // FIXME conditionals should not be necessary, should be handled by pluggable components - if (config.offline()) { - throw AnalysisServerException.badRequest("Cannot download LODES in offline mode."); - } - final String regionId = req.params("regionId"); + final int zoom = parseZoom(req.queryParams("zoom")); + + // default final String accessGroup = req.attribute("accessGroup"); final String email = req.attribute("email"); final Region region = Persistence.regions.findByIdIfPermitted(regionId, accessGroup); @@ -155,18 +148,19 @@ private OpportunityDatasetUploadStatus downloadLODES(Request req, Response res) // deleted as a batch using deleteSourceSet) final String downloadBatchId = new ObjectId().toString(); // The bucket name contains the specific lodes data set and year so works as an appropriate name - final OpportunityDatasetUploadStatus status = new OpportunityDatasetUploadStatus(regionId, config.seamlessCensusBucket()); + final OpportunityDatasetUploadStatus status = new OpportunityDatasetUploadStatus(regionId, extractor.sourceName); addStatusAndRemoveOldStatuses(status); taskScheduler.enqueueHeavyTask(() -> { try { status.message = "Extracting census data for region"; - List 
grids = SeamlessCensusGridExtractor.retrieveAndExtractCensusDataForBounds(region.bounds); - createDatasetsFromPointSets(email, accessGroup, config.seamlessCensusBucket(), - downloadBatchId, regionId, status, grids); + List grids = extractor.censusDataForBounds(region.bounds, zoom); + createDatasetsFromPointSets( + email, accessGroup, extractor.sourceName, downloadBatchId, regionId, status, grids + ); } catch (IOException e) { status.completeWithError(e); - LOG.error("Exception processing LODES data: " + ExceptionUtils.asString(e)); + LOG.error("Exception processing LODES data: " + ExceptionUtils.stackTraceString(e)); } }); @@ -202,7 +196,6 @@ private List createDatasetsFromPointSets(String email, dataset.accessGroup = accessGroup; dataset.totalPoints = pointSet.featureCount(); dataset.regionId = regionId; - dataset.bucketName = config.gridBucket(); dataset.totalOpportunities = pointSet.sumTotalOpportunities(); dataset.format = getFormatCode(pointSet); if (dataset.format == FileStorageFormat.FREEFORM) { @@ -239,7 +232,7 @@ private List createDatasetsFromPointSets(String email, fileStorage.moveIntoStorage(dataset.getStorageKey(FileStorageFormat.GRID), gridFile); } else if (pointSet instanceof FreeFormPointSet) { // Upload serialized freeform pointset back to S3 - FileStorageKey fileStorageKey = new FileStorageKey(config.gridBucket(), regionId + "/" + dataset._id + ".pointset"); + FileStorageKey fileStorageKey = new FileStorageKey(GRIDS, regionId + "/" + dataset._id + ".pointset"); File pointsetFile = FileUtils.createScratchFile("pointset"); OutputStream os = new GZIPOutputStream(new FileOutputStream(pointsetFile)); @@ -446,12 +439,13 @@ private OpportunityDatasetUploadStatus createOpportunityDataset(Request req, Res formFields = sfu.parseParameterMap(req.raw()); } catch (FileUploadException e) { // We can't even get enough information to create a status tracking object. Re-throw an exception. - throw AnalysisServerException.fileUpload("Unable to parse opportunity dataset. " + ExceptionUtils.asString(e)); + throw AnalysisServerException.fileUpload("Unable to parse opportunity dataset. " + ExceptionUtils.stackTraceString(e)); } // Parse required fields. Will throw a ServerException on failure. final String sourceName = getFormField(formFields, "Name", true); final String regionId = getFormField(formFields, "regionId", true); + final int zoom = parseZoom(getFormField(formFields, "zoom", false)); // Create a region-wide status object tracking the processing of opportunity data. // Create the status object before doing anything including input and parameter validation, so that any problems @@ -486,7 +480,7 @@ private OpportunityDatasetUploadStatus createOpportunityDataset(Request req, Res pointsets.addAll(createGridsFromBinaryGridFiles(fileItems, status)); } else if (uploadFormat == UploadFormat.SHAPEFILE) { LOG.info("Detected opportunity dataset stored as ESRI shapefile."); - pointsets.addAll(createGridsFromShapefile(fileItems, status)); + pointsets.addAll(createGridsFromShapefile(fileItems, zoom, status)); } else if (uploadFormat == UploadFormat.CSV) { LOG.info("Detected opportunity dataset stored as CSV"); // Create a grid even when user has requested a freeform pointset so we have something to visualize. @@ -505,13 +499,13 @@ private OpportunityDatasetUploadStatus createOpportunityDataset(Request req, Res // This newer process creates a FreeFormPointSet only for the specified count fields, // as well as a Grid to assist in visualization of the uploaded data. 
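[Illustrative sketch, not part of this diff.] The zoom value threaded through the grid-creation calls above and below now comes from the request via parseZoom rather than the old fixed ZOOM constant. A plausible shape for such a parser, assuming a DEFAULT_ZOOM fallback and an allowed range of 9 to 12 (both assumptions; the real WebMercatorGridPointSet.parseZoom may differ):

    public static int parseZoom (String zoomString) {
        if (zoomString == null || zoomString.isEmpty()) {
            return DEFAULT_ZOOM;
        }
        int zoom = Integer.parseInt(zoomString);
        // Guava Preconditions.checkArgument, statically imported.
        checkArgument(zoom >= 9 && zoom <= 12, "Web Mercator zoom must be in the range 9 to 12.");
        return zoom;
    }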
for (FreeFormPointSet freeForm : createFreeFormPointSetsFromCsv(csvFileItem, parameters)) { - Grid gridFromFreeForm = Grid.fromFreeForm(freeForm, ZOOM); + Grid gridFromFreeForm = Grid.fromFreeForm(freeForm, zoom); pointsets.add(freeForm); pointsets.add(gridFromFreeForm); } } else { // This is the common default process: create a grid for every non-ignored field in the CSV. - pointsets.addAll(createGridsFromCsv(csvFileItem, formFields, status)); + pointsets.addAll(createGridsFromCsv(csvFileItem, formFields, zoom, status)); } } if (pointsets.isEmpty()) { @@ -602,6 +596,7 @@ private OpportunityDataset deleteDataset(String id, String accessGroup) { */ private List createGridsFromCsv(FileItem csvFileItem, Map> query, + int zoom, OpportunityDatasetUploadStatus status) throws Exception { String latField = getFormField(query, "latField", true); @@ -615,11 +610,11 @@ private List createGridsFromCsv(FileItem csvFileItem, List ignoreFields = Arrays.asList(idField, latField2, lonField2); InputStreamProvider csvStreamProvider = new FileItemInputStreamProvider(csvFileItem); - List grids = Grid.fromCsv(csvStreamProvider, latField, lonField, ignoreFields, ZOOM, status); + List grids = Grid.fromCsv(csvStreamProvider, latField, lonField, ignoreFields, zoom, status); // TODO verify correctness of this second pass if (latField2 != null && lonField2 != null) { ignoreFields = Arrays.asList(idField, latField, lonField); - grids.addAll(Grid.fromCsv(csvStreamProvider, latField2, lonField2, ignoreFields, ZOOM, status)); + grids.addAll(Grid.fromCsv(csvStreamProvider, latField2, lonField2, ignoreFields, zoom, status)); } return grids; @@ -652,6 +647,7 @@ private List createGridsFromBinaryGridFiles(List uploadedFiles, * same base name, and should not contain any other files but these three or four. */ private List createGridsFromShapefile(List fileItems, + int zoom, OpportunityDatasetUploadStatus status) throws Exception { // In the caller, we should have already verified that all files have the same base name and have an extension. @@ -680,7 +676,7 @@ private List createGridsFromShapefile(List fileItems, filesByExtension.get("SHX").write(shxFile); } - List grids = Grid.fromShapefile(shpFile, ZOOM, status); + List grids = Grid.fromShapefile(shpFile, zoom, status); tempDir.delete(); return grids; } @@ -698,7 +694,7 @@ private Object downloadOpportunityDataset (Request req, Response res) throws IOE // get("/api/opportunities/:regionId/:gridKey") is the same signature as this endpoint. String regionId = req.params("_id"); String gridKey = req.params("format"); - FileStorageKey storageKey = new FileStorageKey(config.gridBucket(), String.format("%s/%s.grid", regionId, gridKey)); + FileStorageKey storageKey = new FileStorageKey(GRIDS, String.format("%s/%s.grid", regionId, gridKey)); return getJSONURL(storageKey); } @@ -764,7 +760,7 @@ private void completed (Status status) { } public void completeWithError (Exception e) { - message = "Unable to create opportunity dataset. " + ExceptionUtils.asString(e); + message = "Unable to create opportunity dataset. 
" + ExceptionUtils.stackTraceString(e); completed(Status.ERROR); } diff --git a/src/main/java/com/conveyal/analysis/controllers/RegionalAnalysisController.java b/src/main/java/com/conveyal/analysis/controllers/RegionalAnalysisController.java index c5df211bb..894c62ac0 100644 --- a/src/main/java/com/conveyal/analysis/controllers/RegionalAnalysisController.java +++ b/src/main/java/com/conveyal/analysis/controllers/RegionalAnalysisController.java @@ -11,6 +11,7 @@ import com.conveyal.analysis.persistence.Persistence; import com.conveyal.analysis.results.CsvResultType; import com.conveyal.analysis.util.JsonUtil; +import com.conveyal.file.FileCategory; import com.conveyal.file.FileStorage; import com.conveyal.file.FileStorageFormat; import com.conveyal.file.FileStorageKey; @@ -42,6 +43,7 @@ import java.util.zip.GZIPOutputStream; import static com.conveyal.analysis.util.JsonUtil.toJson; +import static com.conveyal.file.FileCategory.RESULTS; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkNotNull; import static com.google.common.base.Preconditions.checkState; @@ -65,17 +67,10 @@ public class RegionalAnalysisController implements HttpController { private final Broker broker; private final FileStorage fileStorage; - private final Config config; - public interface Config { - String resultsBucket (); - String bundleBucket (); - } - - public RegionalAnalysisController (Broker broker, FileStorage fileStorage, Config config) { + public RegionalAnalysisController (Broker broker, FileStorage fileStorage) { this.broker = broker; this.fileStorage = fileStorage; - this.config = config; } private Collection getRegionalAnalysesForRegion(String regionId, String accessGroup) { @@ -137,7 +132,7 @@ private RegionalAnalysis deleteRegionalAnalysis (Request req, Response res) { if (!analysis.complete) { String jobId = analysis._id; if (broker.deleteJob(jobId)) { - LOG.info("Deleted job {} from broker.", jobId); + LOG.debug("Deleted job {} from broker.", jobId); } else { LOG.error("Deleting job {} from broker failed.", jobId); } @@ -245,7 +240,7 @@ private Object getRegionalResults (Request req, Response res) throws IOException // FIXME we need to do the equivalent of the SelectingGridReducer here. // The job is still being processed. There is a probably harmless race condition if the job happens to be // completed at the very moment we're in this block, because the file will be deleted at that moment. - LOG.info("Analysis {} is not complete, attempting to return the partial results grid.", regionalAnalysisId); + LOG.debug("Analysis {} is not complete, attempting to return the partial results grid.", regionalAnalysisId); if (!"GRID".equalsIgnoreCase(fileFormatExtension)) { throw AnalysisServerException.badRequest( "For partially completed regional analyses, we can only return grid files, not images."); @@ -267,7 +262,7 @@ private Object getRegionalResults (Request req, Response res) throws IOException } } else { // The analysis has already completed, results should be stored and retrieved from S3 via redirects. 
- LOG.info("Returning {} minute accessibility to pointset {} (percentile {}) for regional analysis {}.", + LOG.debug("Returning {} minute accessibility to pointset {} (percentile {}) for regional analysis {}.", cutoffMinutes, destinationPointSetId, percentile, regionalAnalysisId); FileStorageFormat format = FileStorageFormat.valueOf(fileFormatExtension.toUpperCase()); if (!FileStorageFormat.GRID.equals(format) && !FileStorageFormat.PNG.equals(format) && !FileStorageFormat.TIFF.equals(format)) { @@ -281,7 +276,7 @@ private Object getRegionalResults (Request req, Response res) throws IOException String.format("%s_%s_P%d_C%d.%s", regionalAnalysisId, destinationPointSetId, percentile, cutoffMinutes, fileFormatExtension); // A lot of overhead here - UI contacts backend, backend calls S3, backend responds to UI, UI contacts S3. - FileStorageKey singleCutoffFileStorageKey = new FileStorageKey(config.resultsBucket(), singleCutoffKey); + FileStorageKey singleCutoffFileStorageKey = new FileStorageKey(RESULTS, singleCutoffKey); if (!fileStorage.exists(singleCutoffFileStorageKey)) { // An accessibility grid for this particular cutoff has apparently never been extracted from the // regional results file before. Extract one and save it for future reuse. Older regional analyses @@ -300,8 +295,8 @@ private Object getRegionalResults (Request req, Response res) throws IOException multiCutoffKey = String.format("%s_%s_P%d.access", regionalAnalysisId, destinationPointSetId, percentile); } } - LOG.info("Single-cutoff grid {} not found on S3, deriving it from {}.", singleCutoffKey, multiCutoffKey); - FileStorageKey multiCutoffFileStorageKey = new FileStorageKey(config.resultsBucket(), multiCutoffKey); + LOG.debug("Single-cutoff grid {} not found on S3, deriving it from {}.", singleCutoffKey, multiCutoffKey); + FileStorageKey multiCutoffFileStorageKey = new FileStorageKey(RESULTS, multiCutoffKey); InputStream multiCutoffInputStream = new FileInputStream(fileStorage.getFile(multiCutoffFileStorageKey)); Grid grid = new SelectingGridReducer(cutoffIndex).compute(multiCutoffInputStream); @@ -350,7 +345,7 @@ private String getCsvResults (Request req, Response res) { throw AnalysisServerException.notFound("This regional analysis does not contain CSV results of type " + resultType); } - FileStorageKey fileStorageKey = new FileStorageKey(config.resultsBucket(), storageKey); + FileStorageKey fileStorageKey = new FileStorageKey(RESULTS, storageKey); res.type("text/plain"); return fileStorage.getURL(fileStorageKey); diff --git a/src/main/java/com/conveyal/analysis/controllers/UserActivityController.java b/src/main/java/com/conveyal/analysis/controllers/UserActivityController.java new file mode 100644 index 000000000..c72220079 --- /dev/null +++ b/src/main/java/com/conveyal/analysis/controllers/UserActivityController.java @@ -0,0 +1,62 @@ +package com.conveyal.analysis.controllers; + +import com.conveyal.analysis.UserPermissions; +import com.conveyal.analysis.components.TaskScheduler; +import com.conveyal.r5.analyst.progress.ApiTask; +import spark.Request; +import spark.Response; +import spark.Service; + +import java.util.List; + +import static com.conveyal.analysis.components.HttpApi.USER_PERMISSIONS_ATTRIBUTE; +import static com.conveyal.analysis.util.JsonUtil.toJson; + +/** + * Provides a lightweight heartbeat endpoint by which the UI signals that a user is active, and the backend reports + * any progress and status updates for that user. 
To determine which users are active we can't depend purely on backend + * API requests, because many UI requests read and write straight to Mongo via Next lambda functions. + * + * The UI is expected to poll this endpoint at least once every 30 seconds whenever its tab is focused and the user + * appears to be active, and whenever the user becomes active after a period of inactivity. Any user who has not hit + * this endpoint for over 30 seconds is considered idle. + * + * The backend response will contain a list of all asynchronous tasks it is currently handling for the user. + * Once a task is finished, the next time it is fetched it will be cleared from the backend. When the user is clearly + * waiting for a task to finish, the UI may poll this endpoint more frequently to get smoother progress updates. + * + * Created by abyrd on 2021-03-03 + */ +public class UserActivityController implements HttpController { + + private final TaskScheduler taskScheduler; + + public UserActivityController (TaskScheduler taskScheduler) { + this.taskScheduler = taskScheduler; + } + + @Override + public void registerEndpoints (Service sparkService) { + sparkService.get("/api/activity", this::getActivity, toJson); + } + + private ResponseModel getActivity (Request req, Response res) { + UserPermissions userPermissions = req.attribute(USER_PERMISSIONS_ATTRIBUTE); + ResponseModel responseModel = new ResponseModel(); + responseModel.systemStatusMessages = List.of(); + responseModel.taskBacklog = taskScheduler.getBacklog(); + responseModel.taskProgress = taskScheduler.getTasksForUser(userPermissions.email); + return responseModel; + } + + /** API model used only to structure activity JSON messages sent back to UI. */ + public static class ResponseModel { + /** For example: "Server going down at 17:00 GMT for maintenance" or "Working to resolve known issue [link]." */ + public List systemStatusMessages; + /** Number of tasks in the queue until this user's start processing. Just a rough indicator of progress. */ + public int taskBacklog; + /** List of tasks with percentage complete, current stage of progress, and any failures or error messages. 
*/ + public List taskProgress; + } + +} diff --git a/src/main/java/com/conveyal/analysis/controllers/WorkerProxyController.java b/src/main/java/com/conveyal/analysis/controllers/WorkerProxyController.java index 6ab3fbcd3..84b0d34e9 100644 --- a/src/main/java/com/conveyal/analysis/controllers/WorkerProxyController.java +++ b/src/main/java/com/conveyal/analysis/controllers/WorkerProxyController.java @@ -90,7 +90,7 @@ private Object proxyGet (Request request, Response response) { return httpClient.send(httpRequest, HttpResponse.BodyHandlers.ofInputStream()); } catch (Exception exception) { response.status(HttpStatus.BAD_GATEWAY_502); - response.body(ExceptionUtils.asString(exception)); + response.body(ExceptionUtils.stackTraceString(exception)); return response; } finally { diff --git a/src/main/java/com/conveyal/analysis/grids/SeamlessCensusGridExtractor.java b/src/main/java/com/conveyal/analysis/grids/SeamlessCensusGridExtractor.java index 4f1c1bbcc..6bf0b3095 100644 --- a/src/main/java/com/conveyal/analysis/grids/SeamlessCensusGridExtractor.java +++ b/src/main/java/com/conveyal/analysis/grids/SeamlessCensusGridExtractor.java @@ -1,5 +1,6 @@ package com.conveyal.analysis.grids; +import com.conveyal.analysis.components.Component; import com.conveyal.analysis.models.Bounds; import com.conveyal.data.census.S3SeamlessSource; import com.conveyal.data.geobuf.GeobufFeature; @@ -16,13 +17,11 @@ import java.util.Map; import java.util.Set; -import static com.conveyal.analysis.models.OpportunityDataset.ZOOM; - /** * Fetch data from the seamless-census s3 buckets and convert it from block-level vector data (polygons) * to raster opportunity density data (grids). */ -public class SeamlessCensusGridExtractor { +public class SeamlessCensusGridExtractor implements Component { private static final Logger LOG = LoggerFactory.getLogger(SeamlessCensusGridExtractor.class); @@ -36,17 +35,20 @@ public interface Config { String seamlessCensusBucket (); } - private static S3SeamlessSource source; + private final S3SeamlessSource source; + + /** A human-readable name for the source of extracted data, e.g. for distinguishing between different years. */ + public final String sourceName; - // TODO make this into a non-static Component - public static void configureStatically (Config config) { + public SeamlessCensusGridExtractor (Config config) { source = new S3SeamlessSource(config.seamlessCensusRegion(), config.seamlessCensusBucket()); + sourceName = config.seamlessCensusBucket(); } /** * Retrieve data for bounds and save to a bucket under a given key */ - public static List retrieveAndExtractCensusDataForBounds (Bounds bounds) throws IOException { + public List censusDataForBounds (Bounds bounds, int zoom) throws IOException { long startTime = System.currentTimeMillis(); // All the features are buffered in a Map in memory. This could be problematic on large areas. @@ -70,7 +72,7 @@ public static List retrieveAndExtractCensusDataForBounds (Bounds bounds) t // Note, the following is assuming each property has a unique name. 
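[Illustrative sketch, not part of this diff.] With the extractor now an injectable Component built from config (see the constructor above) instead of being configured statically, the backend can wire one instance into the controller at startup, roughly as follows; the surrounding variable names are assumptions.

    SeamlessCensusGridExtractor censusExtractor = new SeamlessCensusGridExtractor(config);
    OpportunityDatasetController opportunityDatasetController =
            new OpportunityDatasetController(fileStorage, taskScheduler, censusExtractor);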
Grid grid = grids.get(key); if (grid == null) { - grid = new Grid(ZOOM, bounds.envelope()); + grid = new Grid(zoom, bounds.envelope()); grid.name = key; grids.put(key, grid); } @@ -86,4 +88,5 @@ public static List retrieveAndExtractCensusDataForBounds (Bounds bounds) t return new ArrayList<>(grids.values()); } + } diff --git a/src/main/java/com/conveyal/analysis/models/Bundle.java b/src/main/java/com/conveyal/analysis/models/Bundle.java index b5758aba2..f5e7f0d27 100644 --- a/src/main/java/com/conveyal/analysis/models/Bundle.java +++ b/src/main/java/com/conveyal/analysis/models/Bundle.java @@ -2,12 +2,18 @@ import com.conveyal.analysis.AnalysisServerException; import com.conveyal.gtfs.GTFSFeed; +import com.conveyal.gtfs.error.GTFSError; +import com.conveyal.gtfs.model.FeedInfo; +import com.conveyal.gtfs.validator.model.Priority; import com.fasterxml.jackson.annotation.JsonIgnore; import java.time.LocalDate; import java.util.ArrayList; import java.util.Comparator; +import java.util.HashMap; import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; /** * Represents a transport Bundle (GTFS and OSM). @@ -53,6 +59,35 @@ public Bundle clone () { } } + /** Simplified model for storing the first N errors of each type in Mongo. */ + public static class GtfsErrorSummary { + public String file; + public Integer line; + public String field; + public String message; + public GtfsErrorSummary () { /* For deserialization. */ } + public GtfsErrorSummary (GTFSError error) { + file = error.file; + line = error.line > 0 ? (int)(error.line) : null; + field = error.field; + message = error.getMessage(); + } + } + + /** Simplified model for storing the first N errors of each type in Mongo. */ + public static class GtfsErrorTypeSummary { + public String type; + public int count; + public List someErrors = new ArrayList<>(); + public Priority priority; + public GtfsErrorTypeSummary () { /* For deserialization. */ } + public GtfsErrorTypeSummary (GTFSError error) { + this.priority = error.getPriority(); + this.type = error.errorType; + } + } + + /** Model for storing summary info in Mongo. Bundle contains one instance of FeedSummary per feed in the Bundle. */ public static class FeedSummary implements Cloneable { public String feedId; public String name; @@ -65,14 +100,55 @@ public static class FeedSummary implements Cloneable { public LocalDate serviceStart; public LocalDate serviceEnd; public long checksum; + public List errors; + + /** Default empty constructor needed for JSON mapping. */ + public FeedSummary () { } public FeedSummary(GTFSFeed feed, String feedGroupId) { feedId = feed.feedId; bundleScopedFeedId = Bundle.bundleScopeFeedId(feed.feedId, feedGroupId); - name = feed.agency.size() > 0 ? feed.agency.values().iterator().next().agency_name : feed.feedId; checksum = feed.checksum; + setServiceDates(feed); // TODO expand to record hours per day by mode. + createFeedName(feed); + summarizeErrors(feed); + } - setServiceDates(feed); + /** + * Set this.name, which seems to only be used for display purposes. + * + * If a FeedInfo file is present in the feed, the feed_id, feed_start_date, and feed_end_date are used for the + * name. If not, dates from calendar/calendar_dates files and agency_name values (up to a limit) are used. + * + * This method should be called after setServiceDates(). 
+ */ + private void createFeedName (GTFSFeed feed) { + String name = null; + LocalDate startingDate = this.serviceStart; + LocalDate endingDate = this.serviceEnd; + + if (feed.feedInfo.size() == 1) { + FeedInfo feedInfo = feed.feedInfo.values().iterator().next(); + if (feedInfo.feed_id != null) name = feedInfo.feed_id; + if (feedInfo.feed_start_date != null) startingDate = feedInfo.feed_start_date; + if (feedInfo.feed_end_date != null) endingDate = feedInfo.feed_end_date; + } + if (name == null) { + int nAgencies = feed.agency.size(); + if (nAgencies > 0) { + final int limit = 3; + String agencyNames = feed.agency.values().stream().limit(limit) + .map(a -> a.agency_name).collect(Collectors.joining(", ")); + if (nAgencies > limit) { + agencyNames += String.format(", +%d more", nAgencies - limit); + } + name = agencyNames; + } + } + + if (name == null) name = "(unknown)"; + + this.name = name + ": " + startingDate.toString() + " to " + endingDate.toString(); // ISO-8601 } /** @@ -91,9 +167,26 @@ public void setServiceDates (GTFSFeed feed) { } /** - * Default empty constructor needed for JSON mapping + * This summarization could be done on the fly during loading. + * However some users will want the whole pile of errors. */ - public FeedSummary () { } + private void summarizeErrors (GTFSFeed feed) { + final int maxErrorsPerType = 10; + Map sortedErrors = new HashMap<>(); + for (GTFSError error : feed.errors) { + String type = error.errorType; + GtfsErrorTypeSummary summary = sortedErrors.get(type); + if (summary == null) { + summary = new GtfsErrorTypeSummary(error); + sortedErrors.put(type, summary); + } + summary.count += 1; + if (summary.someErrors.size() < maxErrorsPerType) { + summary.someErrors.add(new GtfsErrorSummary(error)); + } + } + errors = new ArrayList<>(sortedErrors.values()); + } public FeedSummary clone () { try { @@ -104,6 +197,7 @@ public FeedSummary clone () { } } + // The first two PROCESSING_* values are essentially deprecated in favor of Task. Consider eliminating this field. public enum Status { PROCESSING_OSM, PROCESSING_GTFS, diff --git a/src/main/java/com/conveyal/analysis/models/FileInfo.java b/src/main/java/com/conveyal/analysis/models/FileInfo.java index 65486ceb1..8c38cbf96 100644 --- a/src/main/java/com/conveyal/analysis/models/FileInfo.java +++ b/src/main/java/com/conveyal/analysis/models/FileInfo.java @@ -1,5 +1,6 @@ package com.conveyal.analysis.models; +import com.conveyal.file.FileCategory; import com.conveyal.file.FileStorageFormat; import com.conveyal.file.FileStorageKey; import com.fasterxml.jackson.annotation.JsonIgnore; @@ -9,7 +10,8 @@ public class FileInfo extends BaseModel { public String regionId = null; // What is the bucket or folder that this file is stored in? - public String bucket = null; + // TODO database migration. category = bucket.toUpperCase and without the deployment environment prefix. 
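[Illustrative sketch, not part of this diff.] The TODO above implies a one-off Mongo migration from the old bucket string to the new category enum. Assuming legacy bucket names ended with the category word (for example "...-bundles"), the mapping might look like the following hypothetical helper.

    private static FileCategory categoryFromLegacyBucket (String bucketName) {
        // Strip any deployment-environment prefix, keeping only the final word, then match the enum constant.
        String suffix = bucketName.substring(bucketName.lastIndexOf('-') + 1);
        return FileCategory.valueOf(suffix.toUpperCase(Locale.ROOT));
    }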
+ public FileCategory category = null; // The path to create a FileStorageKey public String path = null; @@ -26,7 +28,7 @@ public class FileInfo extends BaseModel { // Get path @JsonIgnore public FileStorageKey getKey () { - return new FileStorageKey(bucket, path); + return new FileStorageKey(category, path); } /** diff --git a/src/main/java/com/conveyal/analysis/models/ModificationStop.java b/src/main/java/com/conveyal/analysis/models/ModificationStop.java index 52f18e939..3c0b55677 100644 --- a/src/main/java/com/conveyal/analysis/models/ModificationStop.java +++ b/src/main/java/com/conveyal/analysis/models/ModificationStop.java @@ -115,7 +115,7 @@ static List getStopsFromSegments (List segments, Inte // JTS orthodromic distance returns meters, considering the input coordinate system. lineSegmentMeters = JTS.orthodromicDistance(c0, c1, crs); } catch (TransformException e) { - throw AnalysisServerException.unknown(ExceptionUtils.asString(e)); + throw AnalysisServerException.unknown(ExceptionUtils.stackTraceString(e)); } double metersAtEndOfSegment = metersFromPatternStart + lineSegmentMeters; diff --git a/src/main/java/com/conveyal/analysis/models/OpportunityDataset.java b/src/main/java/com/conveyal/analysis/models/OpportunityDataset.java index bb3a84f4e..dc977b586 100644 --- a/src/main/java/com/conveyal/analysis/models/OpportunityDataset.java +++ b/src/main/java/com/conveyal/analysis/models/OpportunityDataset.java @@ -5,6 +5,9 @@ import com.conveyal.r5.analyst.WebMercatorExtents; import com.fasterxml.jackson.annotation.JsonIgnore; +import static com.conveyal.file.FileCategory.GRIDS; +import static com.conveyal.r5.analyst.WebMercatorGridPointSet.DEFAULT_ZOOM; + /** * A model object for storing metadata about opportunity datasets in Mongo, for sharing it with the frontend. * The actual contents of the opportunity datasets are persisted to files on S3 and/or in a directory of the local @@ -13,18 +16,17 @@ */ public class OpportunityDataset extends Model { - /** For now all web Mercator grids are zoom level 9. Level 10 is probably ideal but will quadruple calculation. - * TODO make adjustable - * */ - public static final int ZOOM = 9; - /** The human-readable name of the data source from which this came, provided by the user who uploaded it. */ public String sourceName; /** The unique id for the data source (CSV file, Shapefile etc.) from which this dataset was derived. */ public String sourceId; - /** Bucket name on S3 where the opportunity data itself is persisted. */ + /** + * Bucket name on S3 where the opportunity data itself is persisted. Deprecated: as of April 2021, the FileStorage + * system encapsulates how local or remote storage coordinates are derived from the FileCategory. + */ + @Deprecated public String bucketName; /** @@ -88,17 +90,17 @@ public String storageLocation() { @JsonIgnore public FileStorageKey getStorageKey () { String path = storageLocation(this.format.extension); - return new FileStorageKey(this.bucketName, path); + return new FileStorageKey(GRIDS, path); } @JsonIgnore public FileStorageKey getStorageKey (FileStorageFormat fileFormat) { - return new FileStorageKey(this.bucketName, storageLocation(fileFormat.extension)); + return new FileStorageKey(GRIDS, storageLocation(fileFormat.extension)); } @JsonIgnore public WebMercatorExtents getWebMercatorExtents () { - return new WebMercatorExtents(west, north, width, height, ZOOM); + return new WebMercatorExtents(west, north, width, height, DEFAULT_ZOOM); } /** Analysis region this dataset was uploaded in. 
*/ diff --git a/src/main/java/com/conveyal/analysis/persistence/MongoMap.java b/src/main/java/com/conveyal/analysis/persistence/MongoMap.java index 12e6bf824..80fd07072 100644 --- a/src/main/java/com/conveyal/analysis/persistence/MongoMap.java +++ b/src/main/java/com/conveyal/analysis/persistence/MongoMap.java @@ -25,6 +25,7 @@ * TODO this is using org.mongojack.JacksonDBCollection. I believe Mongo Java client library now provides POJO storage. */ public class MongoMap { + private static Logger LOG = LoggerFactory.getLogger(MongoMap.class); private JacksonDBCollection wrappedCollection; @@ -212,9 +213,7 @@ public V put(V value, DBObject optionalQuery) { throw AnalysisServerException.forbidden("The data you attempted to update is not in your access group."); } } - - // Log the result - LOG.info("{} {} updated by {} ({})", result.toString(), result.name, result.updatedBy, result.accessGroup); + LOG.debug("{} of {} updated {} ({})", result.updatedBy, result.accessGroup, result.name, result._id); // Return the object that was updated return result; diff --git a/src/main/java/com/conveyal/analysis/persistence/Persistence.java b/src/main/java/com/conveyal/analysis/persistence/Persistence.java index c0bce7e0d..71129c0a9 100644 --- a/src/main/java/com/conveyal/analysis/persistence/Persistence.java +++ b/src/main/java/com/conveyal/analysis/persistence/Persistence.java @@ -45,10 +45,10 @@ public class Persistence { // TODO progressively migrate to AnalysisDB which is non-static public static void initializeStatically (AnalysisDB.Config config) { - LOG.info("Connecting to MongoDB..."); - if (config.databaseUri() != null) { - LOG.info("Connecting to remote MongoDB instance..."); - mongo = new MongoClient(new MongoClientURI(config.databaseUri())); + String uri = config.databaseUri(); + if (uri != null) { + LOG.info("Connecting to MongoDB instance at {}...", uri); + mongo = new MongoClient(new MongoClientURI(uri)); } else { LOG.info("Connecting to local MongoDB instance..."); mongo = new MongoClient(); diff --git a/src/main/java/com/conveyal/analysis/results/AccessCsvResultWriter.java b/src/main/java/com/conveyal/analysis/results/AccessCsvResultWriter.java index 68ea7aaec..e208ac827 100644 --- a/src/main/java/com/conveyal/analysis/results/AccessCsvResultWriter.java +++ b/src/main/java/com/conveyal/analysis/results/AccessCsvResultWriter.java @@ -10,8 +10,8 @@ public class AccessCsvResultWriter extends CsvResultWriter { - public AccessCsvResultWriter (RegionalTask task, String outputBucket, FileStorage fileStorage) throws IOException { - super(task, outputBucket, fileStorage); + public AccessCsvResultWriter (RegionalTask task, FileStorage fileStorage) throws IOException { + super(task, fileStorage); } @Override diff --git a/src/main/java/com/conveyal/analysis/results/BaseResultWriter.java b/src/main/java/com/conveyal/analysis/results/BaseResultWriter.java index 227c1eca9..df289c9fe 100644 --- a/src/main/java/com/conveyal/analysis/results/BaseResultWriter.java +++ b/src/main/java/com/conveyal/analysis/results/BaseResultWriter.java @@ -1,5 +1,6 @@ package com.conveyal.analysis.results; +import com.conveyal.file.FileCategory; import com.conveyal.file.FileStorage; import com.conveyal.file.FileStorageKey; import com.conveyal.file.FileUtils; @@ -17,6 +18,7 @@ import java.io.OutputStream; import java.util.zip.GZIPOutputStream; +import static com.conveyal.file.FileCategory.RESULTS; import static com.conveyal.r5.common.Util.human; /** @@ -32,15 +34,13 @@ public abstract class BaseResultWriter { private final 
FileStorage fileStorage; protected File bufferFile; - private String outputBucket; public BaseResultWriter (FileStorage fileStorage) { this.fileStorage = fileStorage; } // Can this be merged into the constructor? - protected void prepare (String jobId, String outputBucket) { - this.outputBucket = outputBucket; + protected void prepare (String jobId) { try { bufferFile = File.createTempFile(jobId + "_", ".results"); // On unexpected server shutdown, these files should be deleted. @@ -56,7 +56,7 @@ protected void prepare (String jobId, String outputBucket) { */ protected synchronized void finish (String fileName) throws IOException { LOG.info("Compressing {} and moving into file storage.", fileName); - FileStorageKey fileStorageKey = new FileStorageKey(outputBucket, fileName); + FileStorageKey fileStorageKey = new FileStorageKey(RESULTS, fileName); File gzippedResultFile = FileUtils.createScratchFile(); // There's probably a more elegant way to do this with NIO and without closing the buffer. diff --git a/src/main/java/com/conveyal/analysis/results/CsvResultWriter.java b/src/main/java/com/conveyal/analysis/results/CsvResultWriter.java index 9680e7b8f..e07abc2af 100644 --- a/src/main/java/com/conveyal/analysis/results/CsvResultWriter.java +++ b/src/main/java/com/conveyal/analysis/results/CsvResultWriter.java @@ -53,9 +53,9 @@ public abstract class CsvResultWriter extends BaseResultWriter implements Region * "origin", "destination", and the supplied indicator. * FIXME it's strange we're manually passing injectable components into objects not wired up at application construction. */ - CsvResultWriter (RegionalTask task, String outputBucket, FileStorage fileStorage) throws IOException { + CsvResultWriter (RegionalTask task, FileStorage fileStorage) throws IOException { super(fileStorage); - super.prepare(task.jobId, outputBucket); + super.prepare(task.jobId); this.fileName = task.jobId + "_" + resultType() +".csv"; BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(bufferFile)); csvWriter = new CsvWriter(bufferedWriter, ','); diff --git a/src/main/java/com/conveyal/analysis/results/GridResultWriter.java b/src/main/java/com/conveyal/analysis/results/GridResultWriter.java index 9effc9cb8..cdfe08fd9 100644 --- a/src/main/java/com/conveyal/analysis/results/GridResultWriter.java +++ b/src/main/java/com/conveyal/analysis/results/GridResultWriter.java @@ -64,7 +64,7 @@ public class GridResultWriter extends BaseResultWriter { * Conveyal grid format. This also creates the on-disk scratch buffer into which the results * from the workers will be accumulated. */ - GridResultWriter (RegionalTask task, String outputBucket, FileStorage fileStorage) { + GridResultWriter (RegionalTask task, FileStorage fileStorage) { super(fileStorage); int width = task.width; int height = task.height; @@ -75,7 +75,7 @@ public class GridResultWriter extends BaseResultWriter { height, channels ); - super.prepare(task.jobId, outputBucket); + super.prepare(task.jobId); try { // Write the access grid file header to the temporary file. 
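[Illustrative sketch, not part of this diff.] The result writers above now address storage with the RESULTS FileCategory instead of a configured bucket name. In a local FileStorage implementation, a category-based key might resolve to a path under the cache directory via FileCategory.directoryName(); the public category/path fields on FileStorageKey and the cacheDirectory variable are assumptions.

    FileStorageKey key = new FileStorageKey(FileCategory.RESULTS, jobId + "_times.csv");
    File localFile = new File(new File(cacheDirectory, key.category.directoryName()), key.path);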
diff --git a/src/main/java/com/conveyal/analysis/results/MultiGridResultWriter.java b/src/main/java/com/conveyal/analysis/results/MultiGridResultWriter.java index 762841a1b..5f4d90f8a 100644 --- a/src/main/java/com/conveyal/analysis/results/MultiGridResultWriter.java +++ b/src/main/java/com/conveyal/analysis/results/MultiGridResultWriter.java @@ -29,7 +29,7 @@ public class MultiGridResultWriter implements RegionalResultWriter { /** Constructor */ public MultiGridResultWriter ( - RegionalAnalysis regionalAnalysis, RegionalTask task, String outputBucket, FileStorage fileStorage + RegionalAnalysis regionalAnalysis, RegionalTask task, FileStorage fileStorage ) { // We are storing the regional analysis just to get its pointset IDs (not keys) and its own ID. this.regionalAnalysis = regionalAnalysis; @@ -40,7 +40,7 @@ public MultiGridResultWriter ( accessibilityGridWriters = new GridResultWriter[nDestinationPointSets][nPercentiles]; for (int d = 0; d < nDestinationPointSets; d++) { for (int p = 0; p < nPercentiles; p++) { - accessibilityGridWriters[d][p] = new GridResultWriter(task, outputBucket, fileStorage); + accessibilityGridWriters[d][p] = new GridResultWriter(task, fileStorage); } } } diff --git a/src/main/java/com/conveyal/analysis/results/MultiOriginAssembler.java b/src/main/java/com/conveyal/analysis/results/MultiOriginAssembler.java index 3938ee0d4..ffbe11458 100644 --- a/src/main/java/com/conveyal/analysis/results/MultiOriginAssembler.java +++ b/src/main/java/com/conveyal/analysis/results/MultiOriginAssembler.java @@ -1,6 +1,5 @@ package com.conveyal.analysis.results; -import com.beust.jcommander.ParameterException; import com.conveyal.analysis.AnalysisServerException; import com.conveyal.analysis.components.broker.Job; import com.conveyal.analysis.models.RegionalAnalysis; @@ -9,15 +8,17 @@ import com.conveyal.file.FileStorageFormat; import com.conveyal.r5.analyst.PointSet; import com.conveyal.r5.analyst.cluster.RegionalWorkResult; +import com.conveyal.r5.util.ExceptionUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.File; -import java.io.IOException; import java.util.ArrayList; import java.util.BitSet; import java.util.List; +import static com.conveyal.r5.common.Util.notNullOrEmpty; +import static com.google.common.base.Preconditions.checkArgument; + /** * This assembles regional results arriving from workers into one or more files per regional analysis on * the backend. This is not a singleton component: one MultiOriginAssembler instance is created per currently active @@ -49,9 +50,6 @@ public class MultiOriginAssembler { // One writer per CSV/Grids we're outputting private List resultWriters = new ArrayList<>(); - /** TODO check if error is true before all results are received (when receiving each result?) and cancel job. */ - private boolean error = false; - /** * The number of distinct origin points for which we've received at least one result. If for * whatever reason we receive two or more results for the same origin this should only be @@ -85,14 +83,12 @@ public class MultiOriginAssembler { * file up to an umbrella location where a single reference to the file storage can be used to * store all of them. 
*/ - public MultiOriginAssembler ( - RegionalAnalysis regionalAnalysis, Job job, String outputBucket, FileStorage fileStorage - ) { - this.regionalAnalysis = regionalAnalysis; - this.job = job; - this.nOriginsTotal = job.nTasksTotal; - this.originsReceived = new BitSet(job.nTasksTotal); + public MultiOriginAssembler (RegionalAnalysis regionalAnalysis, Job job, FileStorage fileStorage) { try { + this.regionalAnalysis = regionalAnalysis; + this.job = job; + this.nOriginsTotal = job.nTasksTotal; + this.originsReceived = new BitSet(job.nTasksTotal); // Check that origin and destination sets are not too big for generating CSV files. if (!job.templateTask.makeTauiSite && job.templateTask.destinationPointSetKeys[0].endsWith(FileStorageFormat.FREEFORM.extension) @@ -103,7 +99,6 @@ public MultiOriginAssembler ( if (nOriginsTotal * destinationPointSet.featureCount() > MAX_FREEFORM_OD_PAIRS || destinationPointSet.featureCount() > MAX_FREEFORM_DESTINATIONS ) { - error = true; throw new AnalysisServerException(String.format( "Freeform requests limited to %d destinations and %d origin-destination pairs.", MAX_FREEFORM_DESTINATIONS, MAX_FREEFORM_OD_PAIRS @@ -114,27 +109,22 @@ public MultiOriginAssembler ( if (job.templateTask.recordAccessibility) { if (job.templateTask.originPointSet != null) { - resultWriters.add(new AccessCsvResultWriter(job.templateTask, outputBucket, fileStorage)); + resultWriters.add(new AccessCsvResultWriter(job.templateTask, fileStorage)); } else { - resultWriters.add( new MultiGridResultWriter( - regionalAnalysis, job.templateTask, outputBucket, fileStorage - )); + resultWriters.add( new MultiGridResultWriter(regionalAnalysis, job.templateTask, fileStorage)); } } if (job.templateTask.recordTimes) { - resultWriters.add(new TimeCsvResultWriter(job.templateTask, outputBucket, fileStorage)); + resultWriters.add(new TimeCsvResultWriter(job.templateTask, fileStorage)); } if (job.templateTask.includePathResults) { - resultWriters.add(new PathCsvResultWriter(job.templateTask, outputBucket, fileStorage)); + resultWriters.add(new PathCsvResultWriter(job.templateTask, fileStorage)); } - if (resultWriters.isEmpty() && !job.templateTask.makeTauiSite) { - // TODO handle all error conditions of this form with a single method that also cancels the job - error = true; - throw new ParameterException("A regional analysis should always create at least one grid or CSV file."); - } + checkArgument(job.templateTask.makeTauiSite || notNullOrEmpty(resultWriters), + "A non-Taui regional analysis should always create at least one grid or CSV file."); // Record the paths of any CSV files that will be produced by this analysis. // The caller must flush the RegionalAnalysis back out to the database to retain this information. @@ -146,10 +136,8 @@ public MultiOriginAssembler ( regionalAnalysis.resultStorage.put(csvWriter.resultType(), csvWriter.fileName); } } - - } catch (IOException e) { - error = true; - LOG.error("Exception while creating multi-origin assembler: " + e.toString()); + } catch (Exception e) { + throw new RuntimeException("Exception while creating multi-origin assembler: " + ExceptionUtils.stackTraceString(e)); } } @@ -177,26 +165,21 @@ private synchronized void finish() { * dimension checks) but those should take a trivial amount of time. For safety and simplicity * we will synchronize the whole method. 
The downside is that this prevents one thread from * writing accessibility while another was writing travel time CSV, but this should not be - * assumed to have any impact on performance unless measured. The writeOneValue methods are also - * synchronized for good measure. There should be no cost to retaining the lock. + * assumed to have any impact on performance unless measured. The writeOneValue methods are also synchronized + * for good measure. There should be no additional cost to retaining the lock when entering those methods. */ - public synchronized void handleMessage (RegionalWorkResult workResult) { - try { - for (RegionalResultWriter writer : resultWriters) { - writer.writeOneWorkResult(workResult); - } - // Don't double-count origins if we receive them more than once. Atomic get-and-increment requires - // synchronization, currently achieved by synchronizing this entire method. - if (!originsReceived.get(workResult.taskId)) { - originsReceived.set(workResult.taskId); - nComplete += 1; - } - if (nComplete == nOriginsTotal && !error) { - finish(); - } - } catch (Exception e) { - error = true; - LOG.error("Error assembling results for query {}", job.jobId, e); + public synchronized void handleMessage (RegionalWorkResult workResult) throws Exception { + for (RegionalResultWriter writer : resultWriters) { + writer.writeOneWorkResult(workResult); + } + // Don't double-count origins if we receive them more than once. Atomic get-and-increment requires + // synchronization, currently achieved by synchronizing this entire method. + if (!originsReceived.get(workResult.taskId)) { + originsReceived.set(workResult.taskId); + nComplete += 1; + } + if (nComplete == nOriginsTotal) { + finish(); } } diff --git a/src/main/java/com/conveyal/analysis/results/PathCsvResultWriter.java b/src/main/java/com/conveyal/analysis/results/PathCsvResultWriter.java index e5fb6784c..0dadb4337 100644 --- a/src/main/java/com/conveyal/analysis/results/PathCsvResultWriter.java +++ b/src/main/java/com/conveyal/analysis/results/PathCsvResultWriter.java @@ -14,8 +14,8 @@ public class PathCsvResultWriter extends CsvResultWriter { - public PathCsvResultWriter (RegionalTask task, String outputBucket, FileStorage fileStorage) throws IOException { - super(task, outputBucket, fileStorage); + public PathCsvResultWriter (RegionalTask task, FileStorage fileStorage) throws IOException { + super(task, fileStorage); } @Override diff --git a/src/main/java/com/conveyal/analysis/results/TimeCsvResultWriter.java b/src/main/java/com/conveyal/analysis/results/TimeCsvResultWriter.java index e8e47d151..6d9137f5a 100644 --- a/src/main/java/com/conveyal/analysis/results/TimeCsvResultWriter.java +++ b/src/main/java/com/conveyal/analysis/results/TimeCsvResultWriter.java @@ -12,8 +12,8 @@ public class TimeCsvResultWriter extends CsvResultWriter { - public TimeCsvResultWriter (RegionalTask task, String outputBucket, FileStorage fileStorage) throws IOException { - super(task, outputBucket, fileStorage); + public TimeCsvResultWriter (RegionalTask task, FileStorage fileStorage) throws IOException { + super(task, fileStorage); } @Override diff --git a/src/main/java/com/conveyal/analysis/util/HttpUtils.java b/src/main/java/com/conveyal/analysis/util/HttpUtils.java index 5a89b1d86..6226c7d8d 100644 --- a/src/main/java/com/conveyal/analysis/util/HttpUtils.java +++ b/src/main/java/com/conveyal/analysis/util/HttpUtils.java @@ -24,7 +24,7 @@ public static Map> getRequestFiles (HttpServletRequest re try { return sfu.parseParameterMap(req); } catch 
(Exception e) { - throw AnalysisServerException.badRequest(ExceptionUtils.asString(e)); + throw AnalysisServerException.badRequest(ExceptionUtils.stackTraceString(e)); } } } diff --git a/src/main/java/com/conveyal/analysis/util/Jobs.java b/src/main/java/com/conveyal/analysis/util/Jobs.java deleted file mode 100644 index bea95d6c2..000000000 --- a/src/main/java/com/conveyal/analysis/util/Jobs.java +++ /dev/null @@ -1,11 +0,0 @@ -package com.conveyal.analysis.util; - -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; - -/** - * Contains a shared ExecutorService. - */ -public class Jobs { - public static ExecutorService service = Executors.newCachedThreadPool(); -} diff --git a/src/main/java/com/conveyal/data/census/LodesSource.java b/src/main/java/com/conveyal/data/census/LodesSource.java index b8106c00c..681cfdd2a 100644 --- a/src/main/java/com/conveyal/data/census/LodesSource.java +++ b/src/main/java/com/conveyal/data/census/LodesSource.java @@ -79,20 +79,6 @@ public void load(ShapeDataStore store) throws Exception { colNames.put("CS01", "male"); colNames.put("CS02", "female"); - // only in workplace characteristics - colNames.put("CFA01", "at firms aged 0-1 years"); - colNames.put("CFA02", "at firms aged 2-3 years"); - colNames.put("CFA03", "at firms aged 4-5 years"); - colNames.put("CFA04", "at firms aged 6-10 years"); - colNames.put("CFA05", "at firms aged 11 or more years"); - - colNames.put("CFS01", "at firms with 0-19 employees"); - colNames.put("CFS02", "at firms with 20-49 employees"); - colNames.put("CFS03", "at firms with 50-249 employees"); - colNames.put("CFS04", "at firms with 250-499 employees"); - colNames.put("CFS05", "at firms with 500 or more employees"); - colNames.put("createdate", "Data creation date"); - reader.readHeaders(); String[] headers = reader.getHeaders(); diff --git a/src/main/java/com/conveyal/data/census/ShapeDataStore.java b/src/main/java/com/conveyal/data/census/ShapeDataStore.java index 6d244d7a2..5c92f7b71 100644 --- a/src/main/java/com/conveyal/data/census/ShapeDataStore.java +++ b/src/main/java/com/conveyal/data/census/ShapeDataStore.java @@ -128,7 +128,7 @@ public void writeTiles (File file) throws IOException { /** Write GeoBuf tiles to S3 */ public void writeTilesToS3 (String bucketName) throws IOException { - // set up an upload thread + // For the duration of this multiple-tile upload operation, manage a single upload thread for the S3 uploads. ExecutorService executor = Executors.newSingleThreadExecutor(); // initialize an S3 client diff --git a/src/main/java/com/conveyal/file/FileCategory.java b/src/main/java/com/conveyal/file/FileCategory.java new file mode 100644 index 000000000..7533a5110 --- /dev/null +++ b/src/main/java/com/conveyal/file/FileCategory.java @@ -0,0 +1,18 @@ +package com.conveyal.file; + +import java.util.Locale; + +/** + * Just to keep things organized and easier to find when debugging/manually manipulating files, each file we put into + * storage has a category, corresponding to the subdirectory or sub-bucket where it's stored in cache and/or on S3. + */ +public enum FileCategory { + + BUNDLES, GRIDS, RESULTS, RESOURCES, POLYGONS, TAUI; + + /** @return a String for the directory or sub-bucket name containing all files in this category. 
*/ + public String directoryName () { + return this.name().toLowerCase(Locale.ROOT); + }; + +} diff --git a/src/main/java/com/conveyal/file/FileStorage.java b/src/main/java/com/conveyal/file/FileStorage.java index 05dcf218b..41a417a44 100644 --- a/src/main/java/com/conveyal/file/FileStorage.java +++ b/src/main/java/com/conveyal/file/FileStorage.java @@ -1,58 +1,94 @@ package com.conveyal.file; +import com.conveyal.r5.analyst.PersistenceBuffer; +import com.conveyal.r5.analyst.cluster.AnalysisWorkerTask; + +import java.io.BufferedInputStream; import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.zip.GZIPInputStream; /** - * Store (and maybe mirror) immutable files. - * These are always seen as local files on the local filesystem, - * but may be made more permanent (accessible to workers and future backends). - *

- * The add/remove/etc. methods are all blocking calls now for simplicity, i.e. if you add a file, all other components - * of the system are known to be able to see it as soon as the method returns. - *

- * This does not handle storing file metadata in MongoDB. That is a separate concern. - * Workers for example need to get files without looking into our database. - * Our file metadata handling component could wrap this, so all backend file operations implicitly had metadata. - *

- * In the S3-based implementation we need to set content type and compression details on S3. - * We plan to do that by inspecting the "magic number" bytes at the beginning of the file and auto-setting the - * content type. + * Store files, optionally mirroring them to cloud storage for retrieval by workers and future backend deployments. + * These are always used as files on the local filesystem, and are treated as immutable once put into storage. + * For simplicity, methods that store and remove files are all blocking calls. If you add a file, all other components + * of the system are known to be able to see it as soon as the method returns. This does not handle storing file + * metadata in MongoDB. That is a separate concern. Workers need to get files without looking into our database. + * Our file metadata handling component could wrap FileStorage, so all backend file operations implicitly had metadata. */ public interface FileStorage { + /** * Takes an already existing file on the local filesystem and registers it as a permanent, immutable file to be - * made available to all analysis components including workers and future backends. - *

- * If a file was uploaded in a form, we can call DiskFileItem.getStoreLocation to get the file, which according - * to that method's Javadoc we are allowed to rename to our own location. - *

- * If the file was created by the backend, it should be created in a temp file. Once the file is completely - * constructed / written out, it should be closed and then this method called on it. + * made available to all analysis components including workers and future backends. If a file was uploaded in a + * form, we can call DiskFileItem.getStoreLocation to get the file, which according to that method's Javadoc we are + * allowed to rename to our own location. If the file was created by the backend, it should be created in a temp + * file. Once the file is completely constructed/written out, it should be closed and then this method called on it. */ void moveIntoStorage(FileStorageKey fileStorageKey, File file); /** - * This should be treated as immutable - never write to a file returned from this method. - * That could be enforced by making our own class with no write methods, that only allows reading the file. + * Move the data in an in-memory buffer into permanent storage, much like moveIntoStorage(key, file). + * The PersistenceBuffer must be marked 'done' before it is handed to this method. + * Files in the TAUI category are treated in a special way: they are not kept locally if mirrored remotely. + * Unlike the other file categories, these are produced on the worker (as opposed to the backend), and will never + * be read by the worker, so don't need to be kept once stored to S3. + * + * This is a blocking call and should only return when the file is completely uploaded. This prevents workers from + * producing output faster than uploads can complete, avoiding a growing queue of waiting uploads. + * + * TODO call with FileStorageKey(TAUI, analysisWorkerTask.jobId); + * TODO eventually unify with moveIntoStorage, by wrapping File in FileStorageBuffer? + */ + void moveIntoStorage(FileStorageKey fileStorageKey, PersistenceBuffer persistenceBuffer); + + /** + * Files returned from this method must be treated as immutable. Never write to them. Immutability could be + * enforced by making our own class with no write methods and only allows reading the file. It may be more + * practical to set the file's access flags to read-only so exceptions will occur if we ever write. */ File getFile(FileStorageKey fileStorageKey); /** - * Get the URL for the File located at the FileStorageKey. This can be a file:// URL when running locally or a URL - * available on the web generated by S3. + * Get the URL for the File identified by the FileStorageKey. This provides a way for a browser-based UI to read + * the file without going through the backend. This can be an S3 URL available over the web or a file:// URL when + * running locally. */ String getURL(FileStorageKey fileStorageKey); /** - * Delete the File located at the FileStorageKey. + * Delete the File identified by the FileStorageKey, in both the local cache and any remote mirror. */ void delete(FileStorageKey fileStorageKey); /** + * TODO explain what this method actually does. * When a new server is spun up there will be no local files. In instances where we derive files from other files * (ex: creating Tiffs from Grids) and if they are already created we only need to return a download URL and therefore * not need to retrieve the file at all, it would be useful to check if the file exists in the FileStorage without * actually retrieving it. */ boolean exists(FileStorageKey fileStorageKey); + + + //// Convenience methods usable with all concrete subclasses. + + /** Store Taui output in subfolders by job ID. 
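// A rough usage sketch of the storage contract described above (caller code with hypothetical names,
// not lines from this diff): a backend component registers a finished temp file under a category,
// then hands back a URL. All of these calls are blocking per the interface javadoc.
static String storeGridAndGetUrl (FileStorage fileStorage, File finishedTempFile) {
    FileStorageKey key = new FileStorageKey(FileCategory.GRIDS, "opportunities", "grid");
    fileStorage.moveIntoStorage(key, finishedTempFile); // blocking: visible to workers and future backends on return
    return fileStorage.getURL(key);                     // file:// URL locally, web-accessible URL in cloud deployments
}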
*/ + default void saveTauiData (AnalysisWorkerTask task, String fileName, PersistenceBuffer buffer) { + FileStorageKey key = new FileStorageKey(FileCategory.TAUI, String.join("/", task.jobId, fileName)); + moveIntoStorage(key, buffer); + } + + /** Read from a file as a stream, decompressing if the name indicates it's gzipped. */ + default InputStream getInputStream (FileCategory fileCategory, String fileName) throws IOException { + InputStream inputStream = new FileInputStream(getFile(new FileStorageKey(fileCategory, fileName))); + if (fileName.endsWith(".gz")) { + return new GZIPInputStream(inputStream); + } else { + return new BufferedInputStream(inputStream); + } + } + } diff --git a/src/main/java/com/conveyal/file/FileStorageKey.java b/src/main/java/com/conveyal/file/FileStorageKey.java index f4b8a8f59..22d504f3c 100644 --- a/src/main/java/com/conveyal/file/FileStorageKey.java +++ b/src/main/java/com/conveyal/file/FileStorageKey.java @@ -1,37 +1,29 @@ package com.conveyal.file; /** - * Represent S3 bucket/keys and locally cached folder/paths. Prevents multiple parameter passing or many - * separate `String.join("/", ...)`s. + * A unique identifier for a file within a namespace drawn from an enum of known categories. + * This maps to a subdirectory and filename in local storage, and a bucket and object key in S3-style cloud storage. + * While keeping stored files in multiple distinct categories, this avoids passing around a lot of directory/bucket + * names as strings, and avoids mistakes where such strings are mismatched accross different function calls. */ public class FileStorageKey { - public final String bucket; // Rather than a bucket, this could be just a folder in a cache directory. - public final String path; - public FileStorageKey(String fullPath) { - checkForDirectoryTraversal(fullPath); - int slashIndex = fullPath.indexOf("/"); - bucket = fullPath.substring(0, slashIndex); - path = fullPath.substring(slashIndex + 1); - } + public final FileCategory category; + public final String path; // rename field to id or name? these are not usually (never?) paths, just object names. - public FileStorageKey(String bucket, String path) { + public FileStorageKey(FileCategory category, String path) { checkForDirectoryTraversal(path); - this.bucket = bucket; + this.category = category; this.path = path; } - public FileStorageKey(String bucket, String path, String ext) { - this(bucket, path + "." + ext); - } - - public String getFullPath() { - return String.join("/", bucket, path); + public FileStorageKey(FileCategory category, String path, String ext) { + this(category, path + "." + ext); } @Override public String toString () { - return String.format("[File storage key: bucket='%s', key='%s']", bucket, path); + return String.format("[File storage key: category='%s', key='%s']", category, path); } /** diff --git a/src/main/java/com/conveyal/file/LocalFileStorage.java b/src/main/java/com/conveyal/file/LocalFileStorage.java index aebcc3b18..b0b398356 100644 --- a/src/main/java/com/conveyal/file/LocalFileStorage.java +++ b/src/main/java/com/conveyal/file/LocalFileStorage.java @@ -1,5 +1,6 @@ package com.conveyal.file; +import com.conveyal.r5.analyst.PersistenceBuffer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -9,28 +10,34 @@ import java.nio.file.Files; import java.nio.file.StandardCopyOption; +/** + * This implementation of FileStorage stores files in a local directory hierarchy and does not mirror anything to + * cloud storage. 
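// Sketch of the two convenience defaults above (file names are hypothetical, not from this diff).
// A worker stores Taui output for a job, and any component re-reads a stored file as a stream,
// with transparent gunzipping when the name ends in ".gz".
static void storeTauiFile (FileStorage fileStorage, AnalysisWorkerTask task, PersistenceBuffer buffer) {
    buffer.doneWriting();                                 // buffer must be marked done before it can be stored
    fileStorage.saveTauiData(task, "query.json", buffer); // lands under the TAUI category at <jobId>/query.json
                                                          // (cloud implementations only; LocalFileStorage rejects buffers)
}
static InputStream reopenStoredGrid (FileStorage fileStorage, String gridName) throws IOException {
    // Because the name ends in ".gz", the default method wraps the stream in a GZIPInputStream.
    return fileStorage.getInputStream(FileCategory.GRIDS, gridName + ".grid.gz");
}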
+ */ public class LocalFileStorage implements FileStorage { private static final Logger LOG = LoggerFactory.getLogger(LocalFileStorage.class); - public final String directory; - private final String urlPrefix; - - public LocalFileStorage (String localCacheDirectory) { - this(localCacheDirectory, "http://localhost:7070"); + public interface Config { + // The local directory where files will be stored, even if they are being mirrored to a remote storage service. + String localCacheDirectory (); + // The port where the browser can fetch files. Parameter name aligned with the HttpApi server port parameter. + int serverPort (); } - public LocalFileStorage (String localCacheDirectory, String urlPrefix) { - this.directory = localCacheDirectory; - this.urlPrefix = urlPrefix; + public final String directory; + private final String urlPrefix; - File directory = new File(localCacheDirectory); - directory.mkdirs(); + public LocalFileStorage (Config config) { + this.directory = config.localCacheDirectory(); + this.urlPrefix = String.format("http://localhost:%s/files", config.serverPort()); + new File(directory).mkdirs(); } /** * Move the File into the FileStorage by moving the passed in file to the Path represented by the FileStorageKey. */ + @Override public void moveIntoStorage(FileStorageKey key, File file) { // Get a pointer to the local file File storedFile = getFile(key); @@ -54,21 +61,32 @@ public void moveIntoStorage(FileStorageKey key, File file) { } } + @Override + public void moveIntoStorage (FileStorageKey fileStorageKey, PersistenceBuffer persistenceBuffer) { + throw new UnsupportedOperationException("In-memory buffers are only persisted to cloud storage."); + } + + @Override public File getFile(FileStorageKey key) { - return new File(String.join("/", directory, key.bucket, key.path)); + return new File(String.join("/", directory, key.category.directoryName(), key.path)); } /** - * This will only get called in offline mode. + * Return a URL for the file as accessed through the backend's own static file server. + * (Registered in HttpApi at .get("/files/:category/*")) + * This exists to allow the same UI to work locally and in cloud deployments. 
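// Minimal sketch of satisfying the new nested Config interface (values are hypothetical): with
// serverPort 7070, getURL() produces URLs like http://localhost:7070/files/grids/opportunities.grid.
static FileStorage localStorageForTesting () {
    LocalFileStorage.Config config = new LocalFileStorage.Config() {
        @Override public String localCacheDirectory () { return "/tmp/analysis-cache"; }
        @Override public int serverPort () { return 7070; }
    };
    return new LocalFileStorage(config);
}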
*/ + @Override public String getURL (FileStorageKey key) { - return String.join("/", urlPrefix, key.bucket, key.path); + return String.join("/", urlPrefix, key.category.directoryName(), key.path); } + @Override public void delete (FileStorageKey key) { getFile(key).delete(); } + @Override public boolean exists(FileStorageKey key) { return getFile(key).exists(); } diff --git a/src/main/java/com/conveyal/file/S3FileStorage.java b/src/main/java/com/conveyal/file/S3FileStorage.java deleted file mode 100644 index e148b87fe..000000000 --- a/src/main/java/com/conveyal/file/S3FileStorage.java +++ /dev/null @@ -1,97 +0,0 @@ -package com.conveyal.file; - -import com.amazonaws.HttpMethod; -import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.AmazonS3ClientBuilder; -import com.amazonaws.services.s3.model.GeneratePresignedUrlRequest; -import com.amazonaws.services.s3.model.ObjectMetadata; -import com.amazonaws.services.s3.model.PutObjectRequest; -import com.amazonaws.services.s3.model.S3Object; - -import java.io.File; -import java.io.IOException; -import java.io.OutputStream; -import java.nio.file.Files; -import java.util.Date; - -public class S3FileStorage implements FileStorage { - - private final AmazonS3 s3; - - private final LocalFileStorage localFileStorage; - - public S3FileStorage (String region, String localCacheDirectory) { - localFileStorage = new LocalFileStorage(localCacheDirectory); - s3 = AmazonS3ClientBuilder.standard().withRegion(region).build(); - } - - /** - * Move the file into S3 and then into local file storage. - */ - public void moveIntoStorage(FileStorageKey key, File file) { - PutObjectRequest putObjectRequest = new PutObjectRequest(key.bucket, key.path, file); - if (FileUtils.isGzip(file)) { - ObjectMetadata metadata = new ObjectMetadata(); - String contentType; - try { - contentType = Files.probeContentType(file.toPath()); - } catch (IOException e) { - // TODO Log error here? - contentType = "application/octet-stream"; - } - metadata.setContentType(contentType); - metadata.setContentEncoding("gzip"); - putObjectRequest.withMetadata(metadata); - } - s3.putObject(putObjectRequest); - - // Add to the file storage after. This method moves the File. - localFileStorage.moveIntoStorage(key, file); - } - - public File getFile(FileStorageKey key) { - File localFile = localFileStorage.getFile(key); - // A File object can represent a filesystem path for a file that doesn't exist yet, in which case we create it. 
- if (!localFile.exists()) { - // Before writing, ensure that the directory exists - localFile.getParentFile().mkdirs(); - - S3Object s3Object = s3.getObject(key.bucket, key.path); - try { - OutputStream fileOutputStream = FileUtils.getOutputStream(localFile); - s3Object.getObjectContent().transferTo(fileOutputStream); - fileOutputStream.close(); - s3Object.close(); - } catch (Exception e) { - throw new RuntimeException(e); - } - } - - return localFile; - } - - public String getURL (FileStorageKey key) { - Date expiration = new Date(); - // 1 week - int signedUrlTimeout = 3600 * 1000 * 24 * 7; - expiration.setTime(expiration.getTime() + signedUrlTimeout); - - GeneratePresignedUrlRequest presigned = new GeneratePresignedUrlRequest(key.bucket, key.path) - .withMethod(HttpMethod.GET) - .withExpiration(expiration); - - return s3.generatePresignedUrl(presigned).toString(); - } - - public void delete(FileStorageKey key) { - localFileStorage.delete(key); - s3.deleteObject(key.bucket, key.path); - } - - public boolean exists(FileStorageKey key) { - if (localFileStorage.exists(key)) { - return true; - } - return s3.doesObjectExist(key.bucket, key.path); - } -} diff --git a/src/main/java/com/conveyal/gtfs/CropGTFS.java b/src/main/java/com/conveyal/gtfs/CropGTFS.java index 21a570047..999d5f34a 100644 --- a/src/main/java/com/conveyal/gtfs/CropGTFS.java +++ b/src/main/java/com/conveyal/gtfs/CropGTFS.java @@ -44,7 +44,7 @@ public class CropGTFS { public static void main (String[] args) { - GTFSFeed feed = GTFSFeed.fromFile(inputFile); + GTFSFeed feed = GTFSFeed.writableTempFileFromGtfs(inputFile); // We keep two sets of trip IDs because we only keep trips that are referenced by two or more stopTimes. // A TObjectIntMap would be good for this as well, but we don't currently depend on Trove. diff --git a/src/main/java/com/conveyal/gtfs/GTFSCache.java b/src/main/java/com/conveyal/gtfs/GTFSCache.java index 7a22df9e5..cc945b9ef 100644 --- a/src/main/java/com/conveyal/gtfs/GTFSCache.java +++ b/src/main/java/com/conveyal/gtfs/GTFSCache.java @@ -9,9 +9,11 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import javax.annotation.Nonnull; import java.io.File; import java.util.concurrent.TimeUnit; -import java.util.zip.ZipFile; + +import static com.conveyal.file.FileCategory.BUNDLES; /** * Cache for GTFSFeed objects, a disk-backed (MapDB) representation of data from one GTFS feed. The source GTFS @@ -21,24 +23,19 @@ * policy is discussed in Javadoc on the class fields and methods. 
*/ public class GTFSCache { + private static final Logger LOG = LoggerFactory.getLogger(GTFSCache.class); private final LoadingCache cache; - public final String bucket; - public final FileStorage fileStore; - - public interface Config { - String bundleBucket (); - } + public final FileStorage fileStorage; public static String cleanId(String id) { return id.replaceAll("[^A-Za-z0-9_]", "-"); } - public GTFSCache(FileStorage fileStore, Config config) { + public GTFSCache (FileStorage fileStorage) { LOG.info("Initializing the GTFS cache..."); - this.fileStore = fileStore; - this.bucket = config.bundleBucket(); + this.fileStorage = fileStorage; this.cache = makeCaffeineCache(); } @@ -80,15 +77,20 @@ private LoadingCache makeCaffeineCache () { } public FileStorageKey getFileKey (String id, String extension) { - return new FileStorageKey(this.bucket, String.join(".", cleanId(id), extension)); + return new FileStorageKey(BUNDLES, String.join(".", cleanId(id), extension)); } - public void add (String id, GTFSFeed feed) { - cache.put(id, feed); - } - - public GTFSFeed get(String id) { + /** + * Retrieve the feed with the given id, lazily creating it if it's not yet loaded or built. This is expected to + * always return a non-null GTFSFeed. If it can't it will always throw an exception with a cause. The returned feed + * must be closed manually to avoid corruption, so it's preferable to have a single synchronized component managing + * when files shared between threads are opened and closed. + */ + public @Nonnull GTFSFeed get(String id) { GTFSFeed feed = cache.get(id); + // The cache can in principle return null, but only if its loader method returns null. + // This should never happen in normal use - the loader should be revised to throw a clear exception. + if (feed == null) throw new IllegalStateException("Cache should always return a feed or throw an exception."); // The feedId of the GTFSFeed objects may not be unique - we can have multiple versions of the same feed // covering different time periods, uploaded by different users. Therefore we record another ID here that is // known to be unique across the whole application - the ID used to fetch the feed. @@ -96,48 +98,34 @@ public GTFSFeed get(String id) { return feed; } - // This should only ever be called by the cache loader. The returned feed must be closed, and - // it's preferable to have a single component managing when files shared between threads are opened and closed. - private GTFSFeed retrieveAndProcessFeed(String id) { + /** This method should only ever be called by the cache loader. */ + private @Nonnull GTFSFeed retrieveAndProcessFeed(String id) throws GtfsLibException { FileStorageKey dbKey = getFileKey(id, "db"); FileStorageKey dbpKey = getFileKey(id, "db.p"); - - if (fileStore.exists(dbKey) && fileStore.exists(dbpKey)) { - // Ensure both files are local - fileStore.getFile(dbpKey); - return new GTFSFeed(fileStore.getFile(dbKey)); + if (fileStorage.exists(dbKey) && fileStorage.exists(dbpKey)) { + // Ensure both MapDB files are local, pulling them down from remote storage as needed. 
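// Illustrative sketch (hypothetical ID; in real use the cache would be a long-lived component rather
// than created per call): the cache is now constructed from a FileStorage alone, and get() never
// returns null - it reopens the cached MapDB read-only, or rebuilds it from the original zip stored
// under the BUNDLES category, and throws on failure.
static GTFSFeed loadFeed (FileStorage fileStorage, String bundleScopedFeedId) {
    GTFSCache gtfsCache = new GTFSCache(fileStorage);
    return gtfsCache.get(bundleScopedFeedId);
}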
+ fileStorage.getFile(dbKey); + fileStorage.getFile(dbpKey); + return GTFSFeed.reopenReadOnly(fileStorage.getFile(dbKey)); } - FileStorageKey zipKey = getFileKey(id, "zip"); + if (!fileStorage.exists(zipKey)) { + throw new GtfsLibException("Original GTFS zip file could not be found: " + zipKey); + } LOG.debug("Building or rebuilding MapDB from original GTFS ZIP file at {}...", zipKey); - if (fileStore.exists(zipKey)) { - try { - File tempDbFile = FileUtils.createScratchFile("db"); - File tempDbpFile = new File(tempDbFile.getAbsolutePath() + ".p"); - ZipFile zipFile = new ZipFile(fileStore.getFile(zipKey)); - - GTFSFeed feed = new GTFSFeed(tempDbFile); - feed.loadFromFile(zipFile); - feed.findPatterns(); - - // Close the DB and flush to disk before we start moving and copying files around. - feed.close(); - - // Ensure the DB and DB.p files have been fully stored. - fileStore.moveIntoStorage(dbKey, tempDbFile); - fileStore.moveIntoStorage(dbpKey, tempDbpFile); - - return new GTFSFeed(fileStore.getFile(dbKey)); - } catch (Exception e) { - LOG.error("Error loading Zip file for GTFS Feed from {}", zipKey, e); - throw new RuntimeException(e); - } - } else { - LOG.error("Original GTFS ZIP for {} could not be found.", zipKey); + try { + File tempDbFile = FileUtils.createScratchFile("db"); + File tempDbpFile = new File(tempDbFile.getAbsolutePath() + ".p"); + GTFSFeed.newFileFromGtfs(tempDbFile, fileStorage.getFile(zipKey)); + // The DB file should already be closed and flushed to disk. + // Put the DB and DB.p files in local cache, and mirror to remote storage if configured. + fileStorage.moveIntoStorage(dbKey, tempDbFile); + fileStorage.moveIntoStorage(dbpKey, tempDbpFile); + // Reopen the feed in its new location, enforcing read-only access to avoid file corruption. + return GTFSFeed.reopenReadOnly(fileStorage.getFile(dbKey)); + } catch (Exception e) { + throw new GtfsLibException("Error loading zip file for GTFS feed: " + zipKey, e); } - - LOG.error("GTFS Feed {} could not be loaded.", id); - return null; } } \ No newline at end of file diff --git a/src/main/java/com/conveyal/gtfs/GTFSFeed.java b/src/main/java/com/conveyal/gtfs/GTFSFeed.java index 588638c24..77c6c3bac 100644 --- a/src/main/java/com/conveyal/gtfs/GTFSFeed.java +++ b/src/main/java/com/conveyal/gtfs/GTFSFeed.java @@ -20,11 +20,12 @@ import com.conveyal.gtfs.model.Transfer; import com.conveyal.gtfs.model.Trip; import com.conveyal.gtfs.validator.service.GeoUtils; +import com.conveyal.r5.analyst.progress.ProgressInputStream; +import com.conveyal.r5.analyst.progress.ProgressListener; import com.google.common.collect.HashMultimap; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.collect.Multimap; -import com.google.common.util.concurrent.ExecutionError; import org.geotools.referencing.GeodeticCalculator; import org.locationtech.jts.geom.Coordinate; import org.locationtech.jts.geom.CoordinateList; @@ -45,9 +46,9 @@ import java.io.Closeable; import java.io.File; import java.io.FileOutputStream; -import java.io.IOError; import java.io.IOException; import java.io.OutputStream; +import java.nio.file.Files; import java.time.LocalDate; import java.time.format.DateTimeFormatter; import java.util.ArrayList; @@ -167,6 +168,14 @@ public class GTFSFeed implements Cloneable, Closeable { /** Once a GTFSFeed has one feed loaded into it, we set this to true to block loading any additional feeds. 
*/ private boolean loaded = false; + /** + * Set this to a ProgressListener to receive updates while loading GTFS. Like OSM files we have no idea how many + * entities are in the GTFS before we begin, so can only report progress based on the number of bytes read. + * Unlike OSM, GTFS is a zip file and will usually require random access to read the tables so we report progress + * on each table separately. This does give nice fine-grained reporting for the larger tables like stop_times. + */ + public ProgressListener progressListener = null; + /** * The order in which we load the tables is important for two reasons. * 1. We must load feed_info first so we know the feed ID before loading any other entities. This could be relaxed @@ -176,6 +185,8 @@ public class GTFSFeed implements Cloneable, Closeable { * us to associate a line number with errors in objects that don't have any other clear identifier. * * Interestingly, all references are resolvable when tables are loaded in alphabetical order. + * + * @param zip the source ZIP file to load, which will be closed when done loading. */ public void loadFromFile(ZipFile zip, String fid) throws Exception { if (this.loaded) throw new UnsupportedOperationException("Attempt to load GTFS into existing database"); @@ -236,15 +247,16 @@ else if (feedId == null || feedId.isEmpty()) { new Trip.Loader(this).loadTable(zip); new Frequency.Loader(this).loadTable(zip); new StopTime.Loader(this).loadTable(zip); - LOG.info("{} errors", errors.size()); - for (GTFSError error : errors) { - LOG.info("{}", error); - } - zip.close(); + // There are conceivably cases where the extra step of identifying and naming patterns is not necessary. + // In current usage we do always need them, and performing this step during load allows enforcing subsequent + // read-only access. + findPatterns(); + // Prevent loading additional feeds into this MapDB. loaded = true; + LOG.info("Detected {} errors in feed.", errors.size()); } public void loadFromFile(ZipFile zip) throws Exception { @@ -285,24 +297,6 @@ public void toFile (String file) { } } - /** - * Static factory method returning a new instance of GTFSFeed containing the contents of - * the GTFS file at the supplied filesystem path. - */ - public static GTFSFeed fromFile(String file) { - GTFSFeed feed = new GTFSFeed(); - ZipFile zip; - try { - zip = new ZipFile(file); - feed.loadFromFile(zip); - zip.close(); - return feed; - } catch (Exception e) { - LOG.error("Error loading GTFS: {}", e.getMessage()); - throw new RuntimeException(e); - } - } - /** * For the given trip ID, fetch all the stop times in order of increasing stop_sequence. * This is an efficient iteration over a tree map. @@ -428,14 +422,19 @@ public List getOrderedStopListForTrip (String trip_id) { } /** - * Bin all trips by stop sequence and pick/drop sequences. - * @return A map from a list of stop IDs to a list of Trip IDs that visit those stops in that sequence. + * Bin all trips by stop sequence and pick/drop sequences. + * A map from a list of stop IDs to a list of Trip IDs that visit those stops in that sequence. + * This changes the contents of the GTFSFeed (writes to it) so should be done once when the feed is first loaded. + * In normal usage this will be called automatically at the end of the feed loading process. + * The method is only public for special cases like tests where we programmatically build feeds. 
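// Sketch of the load path with progress reporting (assumes a ProgressListener implementation is
// supplied by the task system; file names are hypothetical). Progress is reported per table, by bytes read.
static void buildDbWithProgress (File tempDbFile, File gtfsZip, ProgressListener listener) throws Exception {
    GTFSFeed feed = GTFSFeed.newWritableFile(tempDbFile);
    feed.progressListener = listener;        // receives beginTask()/increment() calls while tables load
    feed.loadFromFile(new ZipFile(gtfsZip)); // also finds and names patterns, then blocks further loading
    feed.close();                            // flush to disk; reopen read-only for all subsequent use
}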
*/ public void findPatterns() { + if (this.patterns.size() > 0) { + throw new GtfsLibException("Patterns should only be found once, after all trips are loaded."); + } + if (progressListener != null) progressListener.beginTask("Grouping trips into patterns.", trips.size()); int n = 0; - Multimap tripsForPattern = HashMultimap.create(); - for (String trip_id : trips.keySet()) { if (++n % 100000 == 0) { LOG.info("trip {}", human(n)); @@ -450,6 +449,7 @@ public void findPatterns() { .forEach(key::addStopTime); tripsForPattern.put(key, trip_id); + if (progressListener != null) progressListener.increment(); } // create an in memory list because we will rename them and they need to be immutable once they hit mapdb @@ -458,6 +458,7 @@ public void findPatterns() { .map((e) -> new Pattern(this, e.getKey().stops, new ArrayList<>(e.getValue()))) .collect(Collectors.toList()); + if (progressListener != null) progressListener.beginTask("Naming and indexing patterns.", patterns.size() * 3); namePatterns(patterns); // Index patterns by ID and by the trips they contain. @@ -466,6 +467,7 @@ public void findPatterns() { for (String tripid : pattern.associatedTrips) { this.patternForTrip.put(tripid, pattern.pattern_id); } + if (progressListener != null) progressListener.increment(); } LOG.info("Total patterns: {}", tripsForPattern.keySet().size()); @@ -499,13 +501,21 @@ private void namePatterns(Collection patterns) { namingInfo.fromStops.put(fromName, pattern); namingInfo.toStops.put(toName, pattern); - pattern.orderedStops.stream().map(stops::get).forEach(stop -> { - if (fromName.equals(stop.stop_name) || toName.equals(stop.stop_name)) return; - + for (String stopId : pattern.orderedStops) { + Stop stop = stops.get(stopId); + if (stop == null) { + // A ReferentialIntegrityError should have been recorded during stop_time loading and naming should + // be halted. + return; + } + if (fromName.equals(stop.stop_name) || toName.equals(stop.stop_name)) { + continue; + } namingInfo.vias.put(stop.stop_name, pattern); - }); - + } namingInfo.patternsOnRoute.add(pattern); + if (progressListener != null) progressListener.increment(); + } // name the patterns on each route @@ -513,30 +523,9 @@ private void namePatterns(Collection patterns) { for (Pattern pattern : info.patternsOnRoute) { pattern.name = null; // clear this now so we don't get confused later on - String headsign = trips.get(pattern.associatedTrips.get(0)).trip_headsign; - String fromName = stops.get(pattern.orderedStops.get(0)).stop_name; String toName = stops.get(pattern.orderedStops.get(pattern.orderedStops.size() - 1)).stop_name; - - /* We used to use this code but decided it is better to just always have the from/to info, with via if necessary. 
- if (headsign != null && info.headsigns.get(headsign).size() == 1) { - // easy, unique headsign, we're done - pattern.name = headsign; - continue; - } - - if (info.toStops.get(toName).size() == 1) { - pattern.name = String.format(Locale.US, "to %s", toName); - continue; - } - - if (info.fromStops.get(fromName).size() == 1) { - pattern.name = String.format(Locale.US, "from %s", fromName); - continue; - } - */ - // check if combination from, to is unique Set intersection = new HashSet<>(info.fromStops.get(fromName)); intersection.retainAll(info.toStops.get(toName)); @@ -577,6 +566,7 @@ private void namePatterns(Collection patterns) { // give up pattern.name = String.format(Locale.US, "from %s to %s like trip %s", fromName, toName, pattern.associatedTrips.get(0)); } + if (progressListener != null) progressListener.increment(); } // attach a stop and trip count to each @@ -755,47 +745,13 @@ private static class PatternNamingInfo { List patternsOnRoute = new ArrayList<>(); } - /** Create a GTFS feed in a temp file */ - public GTFSFeed () { - // calls to this must be first operation in constructor - why, Java? - this(DBMaker.newTempFileDB() - .transactionDisable() - .mmapFileEnable() - .asyncWriteEnable() - .deleteFilesAfterClose() - .compressionEnable() - .closeOnJvmShutdown() - .make()); - } - /** Create a GTFS feed connected to a particular DB, which will be created if it does not exist. */ - public GTFSFeed (File dbFile) { - this(constructDB(dbFile)); - } + /// CONSTRUCTORS and associated helper methods + /// These are private, use static factory methods to create instances. - // One critical point when constructing the MapDB is the instance cache type and size. - // The instance cache is how MapDB keeps some instances in memory to avoid deserializing them repeatedly from disk. - // We perform referential integrity checks against tables which in some feeds have hundreds of thousands of rows. - // We have observed that the referential integrity checks are very slow with the instance cache disabled. - // MapDB's default cache type is a hash table, which is very sensitive to the cache size. - // It defaults to 2^15 (32ki) and only seems to run smoothly at other powers of two, so we use 2^16 (64ki). - // This might have something to do with compiler optimizations on the hash code calculations. - // Initial tests show similar speeds for the default hashtable cache of 64k or 32k size and the hardRef cache. - // By not calling any of the cacheEnable or cacheSize methods on the DB builder, we use the default values - // that seem to perform well. - private static DB constructDB(File dbFile) { - try{ - return DBMaker.newFileDB(dbFile) - .transactionDisable() - .mmapFileEnable() - .asyncWriteEnable() - .compressionEnable() - .closeOnJvmShutdown() - .make(); - } catch (ExecutionError | IOError | Exception e) { - LOG.error("Could not construct db from file.", e); - return null; - } + /** @param dbFile the file to create or connect to, or null if a temporary file should be used. */ + private GTFSFeed (File dbFile, boolean writable) { + this(constructMapDb(dbFile, writable)); } private GTFSFeed (DB db) { @@ -829,4 +785,117 @@ private GTFSFeed (DB db) { errors = db.getTreeSet("errors"); } + + // One critical point when constructing the MapDB is the instance cache type and size. + // The instance cache is how MapDB keeps some instances in memory to avoid deserializing them repeatedly from disk. 
+ // We perform referential integrity checks against tables which in some feeds have hundreds of thousands of rows. + // We have observed that the referential integrity checks are very slow with the instance cache disabled. + // MapDB's default cache type is a hash table, which is very sensitive to the cache size. + // It defaults to 2^15 (32ki) and only seems to run smoothly at other powers of two, so we use 2^16 (64ki). + // This might have something to do with compiler optimizations on the hash code calculations. + // Initial tests show similar speeds for the default hashtable cache of 64k or 32k size and the hardRef cache. + // By not calling any of the cacheEnable or cacheSize methods on the DB builder, we use the default values + // that seem to perform well. + private static DB constructMapDb (File dbFile, boolean readOnly) { + DBMaker dbMaker; + // TODO also allow for in-memory + if (dbFile == null) { + dbMaker = DBMaker.newTempFileDB(); + } else { + dbMaker = DBMaker.newFileDB(dbFile); + } + if (readOnly) { + dbMaker.readOnly(); + } else { + dbMaker.asyncWriteEnable(); + } + try{ + return dbMaker + .transactionDisable() + .mmapFileEnable() + .compressionEnable() + .closeOnJvmShutdown() + .make(); + } catch (Exception e) { + throw new GtfsLibException("Could not construct db.", e); + } + } + + + /// STATIC FACTORY METHODS + /// Use these rather than constructors to create GTFSFeed objects in a more fluent way. + + + public static GTFSFeed reopenReadOnly (File file) { + if (file.exists()) { + return new GTFSFeed(file, true); + } else { + throw new GtfsLibException("Cannot reopen file, it does not exist."); + } + } + + /** + * Create a new DB file and load the specified GTFS ZIP into it. The resulting writable feed object is not returned + * and must be reopened for subsequent read-only access. + * @param dbFile the new file in which to store the database, or null to use a temporary file + */ + public static void newFileFromGtfs (File dbFile, File gtfsFile) { + if (gtfsFile == null || !gtfsFile.exists()) { + throw new GtfsLibException("Cannot load from GTFS feed, file does not exist."); + } + try { + GTFSFeed feed = newWritableFile(dbFile); + feed.loadFromFile(new ZipFile(gtfsFile)); + feed.close(); + } catch (Exception e) { + throw new GtfsLibException("Cannot load GTFS from feed ZIP.", e); + } + } + + /** + * Ideally we wouldn't expose any readable feeds after loading them, but we need to inject progress listeners, clear + * out errors, and build some indexes. Be sure to close feeds and reopen them read-only as early as is feasible. + * Ideally we'd somehow encapsulate all that so we never reveal a writable GTFSFeed object. + * @param dbFile the new database file (which must be empty or not yet exist), or null to use a new temp file. + */ + public static GTFSFeed newWritableFile (File dbFile) { + // Length check is for cases where a newly created empty temp file is passed in. + if (dbFile != null && dbFile.exists() && dbFile.length() > 0) { + throw new GtfsLibException("Cannot create new file, it already exists."); + } + return new GTFSFeed(dbFile, false); + } + + /** + * Static factory method returning a new instance of GTFSFeed containing the contents of + * the GTFS file at the supplied filesystem path. This could probably be combined with some other factory methods. 
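// Sketch of the intended two-step lifecycle using the factory methods above (paths are hypothetical):
// build the MapDB once from the GTFS zip, then reopen it strictly read-only for all later access so
// shared readers cannot corrupt the file.
static GTFSFeed buildThenReopen (File gtfsZip, File newDbFile) {
    GTFSFeed.newFileFromGtfs(newDbFile, gtfsZip); // loads the zip, finds patterns, then closes the writable DB
    return GTFSFeed.reopenReadOnly(newDbFile);    // throws GtfsLibException if the db file does not exist
}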
+ */ + public static GTFSFeed writableTempFileFromGtfs (String file) { + GTFSFeed feed = new GTFSFeed(null, false); + try { + ZipFile zip = new ZipFile(file); + feed.loadFromFile(zip); + zip.close(); + return feed; + } catch (Exception e) { + LOG.error("Error loading GTFS: {}", e.getMessage()); + throw new RuntimeException(e); + } + } + + public static GTFSFeed readOnlyTempFileFromGtfs (String fileName) { + try { + File tempFile = File.createTempFile("com.conveyal.gtfs.", ".db"); + tempFile.deleteOnExit(); + GTFSFeed.newFileFromGtfs(tempFile, new File(fileName)); + return GTFSFeed.reopenReadOnly(tempFile); + } catch (Exception e) { + throw new GtfsLibException("Error loading GTFS.", e); + } + } + + public static GTFSFeed newWritableInMemory () { + return new GTFSFeed(DBMaker.newMemoryDB().transactionDisable().make()); + } + } diff --git a/src/main/java/com/conveyal/gtfs/GtfsLibException.java b/src/main/java/com/conveyal/gtfs/GtfsLibException.java new file mode 100644 index 000000000..9c5062ed3 --- /dev/null +++ b/src/main/java/com/conveyal/gtfs/GtfsLibException.java @@ -0,0 +1,14 @@ +package com.conveyal.gtfs; + +/** A generic exception for errors encountered within Conveyal GTFS loading and manipulation code. */ +public class GtfsLibException extends RuntimeException { + + public GtfsLibException (String message) { + super(message); + } + + public GtfsLibException (String message, Throwable cause) { + super(message, cause); + } + +} diff --git a/src/main/java/com/conveyal/gtfs/error/DateParseError.java b/src/main/java/com/conveyal/gtfs/error/DateParseError.java index 4abbdb19c..566dee393 100644 --- a/src/main/java/com/conveyal/gtfs/error/DateParseError.java +++ b/src/main/java/com/conveyal/gtfs/error/DateParseError.java @@ -1,5 +1,7 @@ package com.conveyal.gtfs.error; +import com.conveyal.gtfs.validator.model.Priority; + import java.io.Serializable; /** Represents a problem parsing a date field from a GTFS feed. */ @@ -14,4 +16,7 @@ public DateParseError(String file, long line, String field) { return "Could not parse date (format should be YYYYMMDD)."; } + @Override public Priority getPriority() { + return Priority.MEDIUM; + } } diff --git a/src/main/java/com/conveyal/gtfs/error/DuplicateKeyError.java b/src/main/java/com/conveyal/gtfs/error/DuplicateKeyError.java index fda44d996..9630449f4 100644 --- a/src/main/java/com/conveyal/gtfs/error/DuplicateKeyError.java +++ b/src/main/java/com/conveyal/gtfs/error/DuplicateKeyError.java @@ -1,5 +1,7 @@ package com.conveyal.gtfs.error; +import com.conveyal.gtfs.validator.model.Priority; + import java.io.Serializable; /** Indicates that a GTFS entity was not added to a table because another object already exists with the same primary key. 
*/ @@ -14,4 +16,7 @@ public DuplicateKeyError(String file, long line, String field) { return "Duplicate primary key."; } + @Override public Priority getPriority() { + return Priority.MEDIUM; + } } diff --git a/src/main/java/com/conveyal/gtfs/error/DuplicateStopError.java b/src/main/java/com/conveyal/gtfs/error/DuplicateStopError.java index 5ed91f7c1..1d448a1fb 100644 --- a/src/main/java/com/conveyal/gtfs/error/DuplicateStopError.java +++ b/src/main/java/com/conveyal/gtfs/error/DuplicateStopError.java @@ -1,6 +1,7 @@ package com.conveyal.gtfs.error; import com.conveyal.gtfs.validator.model.DuplicateStops; +import com.conveyal.gtfs.validator.model.Priority; import java.io.Serializable; @@ -21,4 +22,7 @@ public DuplicateStopError(DuplicateStops duplicateStop) { return message; } + @Override public Priority getPriority() { + return Priority.MEDIUM; + } } diff --git a/src/main/java/com/conveyal/gtfs/error/DuplicateTripError.java b/src/main/java/com/conveyal/gtfs/error/DuplicateTripError.java index b972e4c3c..964b5d5f9 100644 --- a/src/main/java/com/conveyal/gtfs/error/DuplicateTripError.java +++ b/src/main/java/com/conveyal/gtfs/error/DuplicateTripError.java @@ -11,7 +11,6 @@ public class DuplicateTripError extends GTFSError implements Serializable { public static final long serialVersionUID = 1L; - public final Priority priority = Priority.LOW; public final String duplicateTripId; public final String patternName; public final String routeId; @@ -34,4 +33,8 @@ public DuplicateTripError(Trip trip, long line, String duplicateTripId, String p @Override public String getMessage() { return String.format("Trip Ids %s & %s (route %s) are duplicates (pattern: %s, calendar: %s, from %s to %s)", duplicateTripId, affectedEntityId, routeId, patternName, serviceId, firstDeparture, lastArrival); } + + @Override public Priority getPriority() { + return Priority.MEDIUM; + } } diff --git a/src/main/java/com/conveyal/gtfs/error/EmptyFieldError.java b/src/main/java/com/conveyal/gtfs/error/EmptyFieldError.java index db6b0a652..4703df572 100644 --- a/src/main/java/com/conveyal/gtfs/error/EmptyFieldError.java +++ b/src/main/java/com/conveyal/gtfs/error/EmptyFieldError.java @@ -1,5 +1,7 @@ package com.conveyal.gtfs.error; +import com.conveyal.gtfs.validator.model.Priority; + import java.io.Serializable; /** Indicates that a field marked as required is not present in a GTFS feed on a particular line. 
*/ @@ -14,4 +16,7 @@ public EmptyFieldError(String file, long line, String field) { return String.format("No value supplied for a required column."); } + @Override public Priority getPriority() { + return Priority.MEDIUM; + } } diff --git a/src/main/java/com/conveyal/gtfs/error/EmptyTableError.java b/src/main/java/com/conveyal/gtfs/error/EmptyTableError.java index 1fb4cd5e6..60f1e0187 100644 --- a/src/main/java/com/conveyal/gtfs/error/EmptyTableError.java +++ b/src/main/java/com/conveyal/gtfs/error/EmptyTableError.java @@ -1,5 +1,7 @@ package com.conveyal.gtfs.error; +import com.conveyal.gtfs.validator.model.Priority; + import java.io.Serializable; /** @@ -15,4 +17,8 @@ public EmptyTableError(String file) { @Override public String getMessage() { return String.format("Table is present in zip file, but it has no entries."); } + + @Override public Priority getPriority() { + return Priority.MEDIUM; + } } diff --git a/src/main/java/com/conveyal/gtfs/error/GTFSError.java b/src/main/java/com/conveyal/gtfs/error/GTFSError.java index 223219284..84680a82d 100644 --- a/src/main/java/com/conveyal/gtfs/error/GTFSError.java +++ b/src/main/java/com/conveyal/gtfs/error/GTFSError.java @@ -50,7 +50,10 @@ public final String getErrorCode () { return this.getClass().getSimpleName(); } - public Priority getPriority () { + /** + * @return The Error priority level associated with this class. + */ + public Priority getPriority() { return Priority.UNKNOWN; } diff --git a/src/main/java/com/conveyal/gtfs/error/GeneralError.java b/src/main/java/com/conveyal/gtfs/error/GeneralError.java index a1545fe42..35219eb21 100644 --- a/src/main/java/com/conveyal/gtfs/error/GeneralError.java +++ b/src/main/java/com/conveyal/gtfs/error/GeneralError.java @@ -1,5 +1,7 @@ package com.conveyal.gtfs.error; +import com.conveyal.gtfs.validator.model.Priority; + import java.io.Serializable; /** Represents any GTFS loading problem that does not have its own class, with a free-text message. 
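// Illustrative pattern only (a hypothetical subclass, not part of this changeset): severity is now
// reported by overriding getPriority() rather than declaring a separate priority field that the base
// getPriority() never consulted, so callers holding only the GTFSError supertype see the subclass's
// priority instead of the base default (UNKNOWN).
public class ExampleFieldError extends GTFSError implements Serializable {
    public static final long serialVersionUID = 1L;
    public ExampleFieldError (String file, long line, String field) {
        super(file, line, field);
    }
    @Override public String getMessage () {
        return "Example problem detected in a GTFS field.";
    }
    @Override public Priority getPriority () {
        return Priority.MEDIUM;
    }
}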
*/ @@ -17,4 +19,7 @@ public GeneralError(String file, long line, String field, String message) { return message; } + @Override public Priority getPriority() { + return Priority.UNKNOWN; + } } diff --git a/src/main/java/com/conveyal/gtfs/error/MisplacedStopError.java b/src/main/java/com/conveyal/gtfs/error/MisplacedStopError.java index 1d06c3d5e..991289ec6 100644 --- a/src/main/java/com/conveyal/gtfs/error/MisplacedStopError.java +++ b/src/main/java/com/conveyal/gtfs/error/MisplacedStopError.java @@ -11,16 +11,18 @@ public class MisplacedStopError extends GTFSError implements Serializable { public static final long serialVersionUID = 1L; - public final Priority priority; public final Stop stop; public MisplacedStopError(String affectedEntityId, long line, Stop stop) { super("stops", line, "stop_id", affectedEntityId); - this.priority = Priority.HIGH; this.stop = stop; } @Override public String getMessage() { return String.format("Stop Id %s is misplaced.", affectedEntityId); } + + @Override public Priority getPriority() { + return Priority.MEDIUM; + } } diff --git a/src/main/java/com/conveyal/gtfs/error/MissingColumnError.java b/src/main/java/com/conveyal/gtfs/error/MissingColumnError.java index 78fb402a4..24022a66a 100644 --- a/src/main/java/com/conveyal/gtfs/error/MissingColumnError.java +++ b/src/main/java/com/conveyal/gtfs/error/MissingColumnError.java @@ -1,5 +1,7 @@ package com.conveyal.gtfs.error; +import com.conveyal.gtfs.validator.model.Priority; + import java.io.Serializable; /** Indicates that a column marked as required is entirely missing from a GTFS feed. */ @@ -14,4 +16,7 @@ public MissingColumnError(String file, String field) { return String.format("Missing required column."); } + @Override public Priority getPriority() { + return Priority.MEDIUM; + } } diff --git a/src/main/java/com/conveyal/gtfs/error/MissingShapeError.java b/src/main/java/com/conveyal/gtfs/error/MissingShapeError.java index 9e7262245..9487d8551 100644 --- a/src/main/java/com/conveyal/gtfs/error/MissingShapeError.java +++ b/src/main/java/com/conveyal/gtfs/error/MissingShapeError.java @@ -11,8 +11,6 @@ public class MissingShapeError extends GTFSError implements Serializable { public static final long serialVersionUID = 1L; - public final Priority priority = Priority.MEDIUM; - public MissingShapeError(Trip trip) { super("trips", trip.sourceFileLine, "shape_id", trip.trip_id); } @@ -20,4 +18,8 @@ public MissingShapeError(Trip trip) { @Override public String getMessage() { return "Trip " + affectedEntityId + " is missing a shape"; } + + @Override public Priority getPriority() { + return Priority.LOW; + } } diff --git a/src/main/java/com/conveyal/gtfs/error/MissingTableError.java b/src/main/java/com/conveyal/gtfs/error/MissingTableError.java index ec2ba0d5f..6a756a8b2 100644 --- a/src/main/java/com/conveyal/gtfs/error/MissingTableError.java +++ b/src/main/java/com/conveyal/gtfs/error/MissingTableError.java @@ -1,5 +1,7 @@ package com.conveyal.gtfs.error; +import com.conveyal.gtfs.validator.model.Priority; + import java.io.Serializable; /** Indicates that a table marked as required is not present in a GTFS feed. 
*/ @@ -14,4 +16,7 @@ public MissingTableError(String file) { return String.format("This table is required by the GTFS specification but is missing."); } + @Override public Priority getPriority() { + return Priority.MEDIUM; + } } diff --git a/src/main/java/com/conveyal/gtfs/error/NoAgencyInFeedError.java b/src/main/java/com/conveyal/gtfs/error/NoAgencyInFeedError.java index 71a46dc62..e82c6895d 100644 --- a/src/main/java/com/conveyal/gtfs/error/NoAgencyInFeedError.java +++ b/src/main/java/com/conveyal/gtfs/error/NoAgencyInFeedError.java @@ -6,8 +6,6 @@ * Created by landon on 5/2/17. */ public class NoAgencyInFeedError extends GTFSError { - public final Priority priority = Priority.HIGH; - public NoAgencyInFeedError() { super("agency", 0, "agency_id"); } @@ -15,4 +13,8 @@ public NoAgencyInFeedError() { @Override public String getMessage() { return String.format("No agency listed in feed (must have at least one)."); } + + @Override public Priority getPriority() { + return Priority.HIGH; + } } diff --git a/src/main/java/com/conveyal/gtfs/error/NumberParseError.java b/src/main/java/com/conveyal/gtfs/error/NumberParseError.java index bbc4d06e0..c691c911a 100644 --- a/src/main/java/com/conveyal/gtfs/error/NumberParseError.java +++ b/src/main/java/com/conveyal/gtfs/error/NumberParseError.java @@ -1,5 +1,7 @@ package com.conveyal.gtfs.error; +import com.conveyal.gtfs.validator.model.Priority; + import java.io.Serializable; /** Represents a problem parsing an integer field of GTFS feed. */ @@ -14,4 +16,7 @@ public NumberParseError(String file, long line, String field) { return String.format("Error parsing a number from a string."); } + @Override public Priority getPriority() { + return Priority.HIGH; + } } diff --git a/src/main/java/com/conveyal/gtfs/error/OverlappingTripsInBlockError.java b/src/main/java/com/conveyal/gtfs/error/OverlappingTripsInBlockError.java index 947cdce95..459c2527b 100644 --- a/src/main/java/com/conveyal/gtfs/error/OverlappingTripsInBlockError.java +++ b/src/main/java/com/conveyal/gtfs/error/OverlappingTripsInBlockError.java @@ -23,4 +23,8 @@ public OverlappingTripsInBlockError(long line, String field, String affectedEnti @Override public String getMessage() { return String.format("Trip Ids %s overlap (route: %s) and share block ID %s", String.join(" & ", tripIds), routeId, affectedEntityId); } + + @Override public Priority getPriority() { + return Priority.MEDIUM; + } } diff --git a/src/main/java/com/conveyal/gtfs/error/RangeError.java b/src/main/java/com/conveyal/gtfs/error/RangeError.java index e8427e730..42a0f51e6 100644 --- a/src/main/java/com/conveyal/gtfs/error/RangeError.java +++ b/src/main/java/com/conveyal/gtfs/error/RangeError.java @@ -1,5 +1,7 @@ package com.conveyal.gtfs.error; +import com.conveyal.gtfs.validator.model.Priority; + import java.io.Serializable; /** Indicates that a number is out of the acceptable range. 
*/ @@ -19,4 +21,7 @@ public RangeError(String file, long line, String field, double min, double max, return String.format("Number %s outside of acceptable range [%s,%s].", actual, min, max); } + @Override public Priority getPriority() { + return Priority.LOW; + } } diff --git a/src/main/java/com/conveyal/gtfs/error/ReferentialIntegrityError.java b/src/main/java/com/conveyal/gtfs/error/ReferentialIntegrityError.java index 970058c54..0ef7edad0 100644 --- a/src/main/java/com/conveyal/gtfs/error/ReferentialIntegrityError.java +++ b/src/main/java/com/conveyal/gtfs/error/ReferentialIntegrityError.java @@ -1,5 +1,7 @@ package com.conveyal.gtfs.error; +import com.conveyal.gtfs.validator.model.Priority; + import java.io.Serializable; /** Indicates that an entity referenced another entity that does not exist. */ @@ -26,4 +28,7 @@ public int compareTo (GTFSError o) { return String.format(badReference); } + @Override public Priority getPriority() { + return Priority.HIGH; + } } diff --git a/src/main/java/com/conveyal/gtfs/error/ReversedTripShapeError.java b/src/main/java/com/conveyal/gtfs/error/ReversedTripShapeError.java index eb06565cd..c29e2439e 100644 --- a/src/main/java/com/conveyal/gtfs/error/ReversedTripShapeError.java +++ b/src/main/java/com/conveyal/gtfs/error/ReversedTripShapeError.java @@ -11,7 +11,6 @@ public class ReversedTripShapeError extends GTFSError implements Serializable { public static final long serialVersionUID = 1L; - public final Priority priority = Priority.HIGH; public final String shapeId; public ReversedTripShapeError(Trip trip) { @@ -22,4 +21,8 @@ public ReversedTripShapeError(Trip trip) { @Override public String getMessage() { return "Trip " + affectedEntityId + " references reversed shape " + shapeId; } + + @Override public Priority getPriority() { + return Priority.HIGH; + } } diff --git a/src/main/java/com/conveyal/gtfs/error/ShapeMissingCoordinatesError.java b/src/main/java/com/conveyal/gtfs/error/ShapeMissingCoordinatesError.java index 0e489fc49..8ece94a15 100644 --- a/src/main/java/com/conveyal/gtfs/error/ShapeMissingCoordinatesError.java +++ b/src/main/java/com/conveyal/gtfs/error/ShapeMissingCoordinatesError.java @@ -11,7 +11,6 @@ public class ShapeMissingCoordinatesError extends GTFSError implements Serializable { public static final long serialVersionUID = 1L; - public final Priority priority = Priority.MEDIUM; public final String[] tripIds; public ShapeMissingCoordinatesError(ShapePoint shapePoint, String[] tripIds) { @@ -22,4 +21,8 @@ public ShapeMissingCoordinatesError(ShapePoint shapePoint, String[] tripIds) { @Override public String getMessage() { return "Shape " + affectedEntityId + " is missing coordinates (affects " + tripIds.length + " trips)"; } + + @Override public Priority getPriority() { + return Priority.MEDIUM; + } } \ No newline at end of file diff --git a/src/main/java/com/conveyal/gtfs/error/TableInSubdirectoryError.java b/src/main/java/com/conveyal/gtfs/error/TableInSubdirectoryError.java index 55cd9be73..e415ef6c8 100644 --- a/src/main/java/com/conveyal/gtfs/error/TableInSubdirectoryError.java +++ b/src/main/java/com/conveyal/gtfs/error/TableInSubdirectoryError.java @@ -11,7 +11,6 @@ public class TableInSubdirectoryError extends GTFSError implements Serializable public static final long serialVersionUID = 1L; public final String directory; - public final Priority priority = Priority.HIGH; public TableInSubdirectoryError(String file, String directory) { super(file, 0, null); @@ -21,4 +20,8 @@ public TableInSubdirectoryError(String file, 
String directory) { @Override public String getMessage() { return String.format("All GTFS files (including %s.txt) should be at root of zipfile, not nested in subdirectory (%s)", file, directory); } + + @Override public Priority getPriority() { + return Priority.HIGH; + } } diff --git a/src/main/java/com/conveyal/gtfs/error/TimeParseError.java b/src/main/java/com/conveyal/gtfs/error/TimeParseError.java index 59837d03e..6dcc4b282 100644 --- a/src/main/java/com/conveyal/gtfs/error/TimeParseError.java +++ b/src/main/java/com/conveyal/gtfs/error/TimeParseError.java @@ -1,5 +1,7 @@ package com.conveyal.gtfs.error; +import com.conveyal.gtfs.validator.model.Priority; + import java.io.Serializable; /** Represents a problem parsing a time of day field of GTFS feed. */ @@ -14,4 +16,7 @@ public TimeParseError(String file, long line, String field) { return "Could not parse time (format should be HH:MM:SS)."; } + @Override public Priority getPriority() { + return Priority.MEDIUM; + } } diff --git a/src/main/java/com/conveyal/gtfs/error/TimeZoneError.java b/src/main/java/com/conveyal/gtfs/error/TimeZoneError.java index f0f618bed..5a9710f2a 100644 --- a/src/main/java/com/conveyal/gtfs/error/TimeZoneError.java +++ b/src/main/java/com/conveyal/gtfs/error/TimeZoneError.java @@ -1,5 +1,7 @@ package com.conveyal.gtfs.error; +import com.conveyal.gtfs.validator.model.Priority; + import java.io.Serializable; /** @@ -27,4 +29,8 @@ public TimeZoneError(String tableName, long line, String field, String affectedE @Override public String getMessage() { return message + ". (" + field + ": " + affectedEntityId + ")"; } + + @Override public Priority getPriority() { + return Priority.MEDIUM; + } } diff --git a/src/main/java/com/conveyal/gtfs/error/URLParseError.java b/src/main/java/com/conveyal/gtfs/error/URLParseError.java index 8b21a852d..6157e861c 100644 --- a/src/main/java/com/conveyal/gtfs/error/URLParseError.java +++ b/src/main/java/com/conveyal/gtfs/error/URLParseError.java @@ -1,5 +1,7 @@ package com.conveyal.gtfs.error; +import com.conveyal.gtfs.validator.model.Priority; + import java.io.Serializable; /** Represents a problem parsing a URL field from a GTFS feed. 
*/ @@ -14,4 +16,7 @@ public URLParseError(String file, long line, String field) { return "Could not parse URL (format should be ://?#)."; } + @Override public Priority getPriority() { + return Priority.LOW; + } } diff --git a/src/main/java/com/conveyal/gtfs/error/UnusedStopError.java b/src/main/java/com/conveyal/gtfs/error/UnusedStopError.java index 19d469d18..be2c9ea47 100644 --- a/src/main/java/com/conveyal/gtfs/error/UnusedStopError.java +++ b/src/main/java/com/conveyal/gtfs/error/UnusedStopError.java @@ -9,16 +9,18 @@ public class UnusedStopError extends GTFSError implements Serializable { public static final long serialVersionUID = 1L; - public final Priority priority; public final Stop stop; public UnusedStopError(Stop stop) { super("stops", stop.sourceFileLine, "stop_id", stop.stop_id); - this.priority = Priority.LOW; this.stop = stop; } @Override public String getMessage() { return String.format("Stop Id %s is not used in any trips.", affectedEntityId); } + + @Override public Priority getPriority() { + return Priority.LOW; + } } diff --git a/src/main/java/com/conveyal/gtfs/model/Entity.java b/src/main/java/com/conveyal/gtfs/model/Entity.java index 220fedf83..7e3800778 100644 --- a/src/main/java/com/conveyal/gtfs/model/Entity.java +++ b/src/main/java/com/conveyal/gtfs/model/Entity.java @@ -1,6 +1,7 @@ package com.conveyal.gtfs.model; import com.beust.jcommander.internal.Sets; +import com.conveyal.r5.analyst.progress.ProgressInputStream; import com.conveyal.gtfs.GTFSFeed; import com.conveyal.gtfs.error.DateParseError; import com.conveyal.gtfs.error.EmptyFieldError; @@ -246,12 +247,12 @@ protected V getRefField(String column, boolean required, Map target protected abstract void loadOneRow() throws IOException; /** - * The main entry point into an Entity.Loader. Interprets each row of a CSV file within a zip file as a sinle + * The main entry point into an Entity.Loader. Interprets each row of a CSV file within a zip file as a single * GTFS entity, and loads them into a table. * * @param zip the zip file from which to read a table */ - public void loadTable(ZipFile zip) throws IOException { + public void loadTable (ZipFile zip) throws IOException { ZipEntry entry = zip.getEntry(tableName + ".txt"); if (entry == null) { Enumeration entries = zip.entries(); @@ -269,15 +270,19 @@ public void loadTable(ZipFile zip) throws IOException { } else { LOG.info("Table {} was missing but it is not required.", tableName); } - if (entry == null) return; } LOG.info("Loading GTFS table {} from {}", tableName, entry); - InputStream zis = zip.getInputStream(entry); + InputStream inStream = zip.getInputStream(entry); // skip any byte order mark that may be present. Files must be UTF-8, // but the GTFS spec says that "files that include the UTF byte order mark are acceptable" - InputStream bis = new BOMInputStream(zis); - CsvReader reader = new CsvReader(bis, ',', Charset.forName("UTF8")); + inStream = new BOMInputStream(inStream); + // TODO Would this benefit from buffering,especially considering progress reporting? Try and measure speed. 
+ if (feed.progressListener != null) { + inStream = new ProgressInputStream(feed.progressListener, inStream); + feed.progressListener.beginTask("Loading GTFS table " + entry.getName(), (int)(entry.getSize())); + } + CsvReader reader = new CsvReader(inStream, ',', Charset.forName("UTF8")); this.reader = reader; boolean hasHeaders = reader.readHeaders(); if (!hasHeaders) { diff --git a/src/main/java/com/conveyal/osmlib/OSM.java b/src/main/java/com/conveyal/osmlib/OSM.java index d2bfb7ea0..f690bb18d 100644 --- a/src/main/java/com/conveyal/osmlib/OSM.java +++ b/src/main/java/com/conveyal/osmlib/OSM.java @@ -241,13 +241,12 @@ public void readVex(InputStream inputStream) { } } - public void readPbf(InputStream inputStream) { + public void readPbf(InputStream inputStream) throws OsmLibException { try { OSMEntitySource source = new PBFInput(inputStream); source.copyTo(this); - } catch (IOException ex) { - LOG.error("Error occurred while parsing VEX stream."); - ex.printStackTrace(); + } catch (Exception exception) { + throw new OsmLibException("Failed to read OSM PBF file.", exception); } } diff --git a/src/main/java/com/conveyal/osmlib/OsmLibException.java b/src/main/java/com/conveyal/osmlib/OsmLibException.java new file mode 100644 index 000000000..96924169d --- /dev/null +++ b/src/main/java/com/conveyal/osmlib/OsmLibException.java @@ -0,0 +1,14 @@ +package com.conveyal.osmlib; + +/** A generic exception representing any problem encountered within Conveyal osm-lib related code. */ +public class OsmLibException extends RuntimeException { + + public OsmLibException (String message) { + super(message); + } + + public OsmLibException (String message, Throwable cause) { + super(message, cause); + } + +} diff --git a/src/main/java/com/conveyal/r5/R5Main.java b/src/main/java/com/conveyal/r5/R5Main.java deleted file mode 100644 index 6aaa0dd79..000000000 --- a/src/main/java/com/conveyal/r5/R5Main.java +++ /dev/null @@ -1,37 +0,0 @@ -package com.conveyal.r5; - -import com.conveyal.r5.analyst.cluster.AnalysisWorker; -import com.conveyal.r5.point_to_point.PointToPointRouterServer; - -import java.util.Arrays; - -/** - * Main entry point for R5. - * Currently only supports starting up Analyst components (not plain old journey planning). - * This will start up either an Analyst worker or a broker depending on the first argument. - */ -public class R5Main { - public static void main (String... args) throws Exception { - System.out.println("____/\\\\\\\\\\\\\\\\\\_______/\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\_ \n" + - " __/\\\\\\///////\\\\\\____\\/\\\\\\///////////__ \n" + - " _\\/\\\\\\_____\\/\\\\\\____\\/\\\\\\_____________ \n" + - " _\\/\\\\\\\\\\\\\\\\\\\\\\/_____\\/\\\\\\\\\\\\\\\\\\\\\\\\_____ \n" + - " _\\/\\\\\\//////\\\\\\_____\\////////////\\\\\\___ \n" + - " _\\/\\\\\\____\\//\\\\\\_______________\\//\\\\\\__ \n" + - " _\\/\\\\\\_____\\//\\\\\\___/\\\\\\________\\/\\\\\\__ \n" + - " _\\/\\\\\\______\\//\\\\\\_\\//\\\\\\\\\\\\\\\\\\\\\\\\\\/___ \n" + - " _\\///________\\///___\\/////////////_____\n"); - - // Pull argument 0 off as the sub-command, - // then pass the remaining args (1..n) on to that subcommand. 
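// Sketch of a caller after the readPbf change (hypothetical helper): failures are now wrapped in an
// unchecked OsmLibException instead of being logged and swallowed, so the caller decides how to surface them.
static void loadPbfInto (OSM osm, File pbfFile) throws IOException {
    try (InputStream in = new BufferedInputStream(new FileInputStream(pbfFile))) {
        osm.readPbf(in); // throws OsmLibException on any parse or read failure
    }
}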
- String command = args[0]; - String[] commandArguments = Arrays.copyOfRange(args, 1, args.length); - if ("worker".equals(command)) { - AnalysisWorker.main(commandArguments); - } else if ("point".equals(command)) { - PointToPointRouterServer.main(commandArguments); - } else { - System.err.println("Unknown command " + command); - } - } -} diff --git a/src/main/java/com/conveyal/analysis/BackendVersion.java b/src/main/java/com/conveyal/r5/SoftwareVersion.java similarity index 76% rename from src/main/java/com/conveyal/analysis/BackendVersion.java rename to src/main/java/com/conveyal/r5/SoftwareVersion.java index 6e2685bbc..f1f0acc01 100644 --- a/src/main/java/com/conveyal/analysis/BackendVersion.java +++ b/src/main/java/com/conveyal/r5/SoftwareVersion.java @@ -1,4 +1,4 @@ -package com.conveyal.analysis; +package com.conveyal.r5; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -13,12 +13,14 @@ * in automated deployment, testing, and debugging situations. Note that when building the code in an IDE, the version * information may not be supplied to this class. It may only be provided in a command line Maven build. */ -public class BackendVersion { +public class SoftwareVersion { - private static final Logger LOG = LoggerFactory.getLogger(BackendVersion.class); + private static final Logger LOG = LoggerFactory.getLogger(SoftwareVersion.class); + private static final String VERSION_PROPERTIES_FILE = "version.properties"; private static final String UNKNOWN = "UNKNOWN"; - public static final BackendVersion instance = new BackendVersion(); + // This could potentially be made into a Component so it's non-static + public static SoftwareVersion instance = new SoftwareVersion(); private final Properties properties = new Properties(); @@ -27,13 +29,11 @@ public class BackendVersion { public final String commit; public final String branch; - private BackendVersion () { - try { - InputStream is = BackendVersion.class.getClassLoader().getResourceAsStream("version.properties"); + protected SoftwareVersion () { + try (InputStream is = getClass().getResourceAsStream(VERSION_PROPERTIES_FILE)) { properties.load(is); - is.close(); } catch (IOException | NullPointerException e) { - LOG.error("Error loading version and commit information for Analysis Backend: {}", e.toString()); + LOG.error("Error loading version and commit information: {}", e.toString()); } version = getPropertyOrUnknown("version"); commit = getPropertyOrUnknown("commit"); diff --git a/src/main/java/com/conveyal/r5/analyst/FileCategory.java b/src/main/java/com/conveyal/r5/analyst/FileCategory.java deleted file mode 100644 index 3bf2f1bd3..000000000 --- a/src/main/java/com/conveyal/r5/analyst/FileCategory.java +++ /dev/null @@ -1,12 +0,0 @@ -package com.conveyal.r5.analyst; - -/** - * For use by FilePersistence. Avoids specifying bucket names or subfolders with Strings. - */ -public enum FileCategory { - - POLYGON, // Only this one is currently used, others are examples - GRID, - BUNDLE; - -} diff --git a/src/main/java/com/conveyal/r5/analyst/FilePersistence.java b/src/main/java/com/conveyal/r5/analyst/FilePersistence.java deleted file mode 100644 index 1c7872fd5..000000000 --- a/src/main/java/com/conveyal/r5/analyst/FilePersistence.java +++ /dev/null @@ -1,44 +0,0 @@ -package com.conveyal.r5.analyst; - -import com.conveyal.r5.analyst.cluster.AnalysisWorkerTask; - -import java.io.InputStream; - -/** - * This is an abstraction for long term file storage. 
- * We used to use only S3 for this, but we now provide an abstract class to allow multiple implementations. - * Files saved by an implementation should be available to both the backend and the workers indefinitely into the future. - */ -public abstract class FilePersistence { - - /** - * Convenience method to ensure that all results files for a particular static site end up in the same place, - * which is typically a bucket on S3. The top level directory is hard-coded for now but could be configurable - * if and when actual use cases require it. - */ - public void saveStaticSiteData (AnalysisWorkerTask task, String fileName, PersistenceBuffer persistenceBuffer) { - String directoryName = "analysis-static/" + task.jobId; - saveData(directoryName, fileName, persistenceBuffer); - } - - /** - * This is a blocking call and should only return when the file is completely uploaded. - * That prevents our workers from producing output faster than uploads can complete, - * and building up a queue of waiting uploads. - * The PersistenceBuffer must be marked 'done' before it is handed to this method. - */ - public abstract void saveData (String directory, String fileName, PersistenceBuffer persistenceBuffer); - - /** - * Get an input stream for the file of the given type with the given name. - * TODO this is the only method currently using FileCategory to automatically create bucket names. Extend to other methods. - */ - public abstract InputStream getData (FileCategory category, String name); - - /** - * This should be called when the application is shutting down to perform any cleanup, await completion, - * shutdown async upload threads etc. - */ - public abstract void shutdown(); - -} diff --git a/src/main/java/com/conveyal/r5/analyst/PersistenceBuffer.java b/src/main/java/com/conveyal/r5/analyst/PersistenceBuffer.java index 484f4341f..ccbefa833 100644 --- a/src/main/java/com/conveyal/r5/analyst/PersistenceBuffer.java +++ b/src/main/java/com/conveyal/r5/analyst/PersistenceBuffer.java @@ -15,6 +15,10 @@ * by AWS S3. It also works around a limitation of ByteArrayOutputStream that requires the backing byte array to be * copied to create an InputStream. It uses the little-endian representation of integers so they can be read * straight out of a typed array in Javascript on any little-endian architecture (nearly all processors these days). + * This is an encapsulated replacement for a previous pattern where we'd chain a bunch of output streams, connect + * them to an input stream, then launch a short-lived thread to upload data pulled from the input stream. This was + * ugly and prone to stalling or locking up. This whole construct largely exists to allow creating lots of files in + * memory and uploading them without creating and deleting local files, a major part of TAUI site creation. */ public class PersistenceBuffer { @@ -73,18 +77,18 @@ public OutputStream getOutputStream() { */ public InputStream getInputStream () { if (!doneWriting) { - throw new RuntimeException("You must mark a PersistenceBuffer 'doneWriting' before reading its contents as an InputStream."); + throw new RuntimeException("You must call doneWriting()) before reading its contents as an InputStream."); } return buffer.getInputStream(); } /** - * @return the size of the underlying byte buffer. Because of compression this is not well-defined until writing is - * completed. Therefore a call to this method will automatically end writing. + * @return the size of the underlying byte buffer. 
Because of compression this size is not well-defined until + * writing is completed. Therefore a call to this method will fail if doneWriting() has not been called. */ public long getSize() { if (!doneWriting) { - throw new RuntimeException("You must mark a PersistenceBuffer 'doneWriting' before taking its size."); + throw new RuntimeException("You must call doneWriting()) before taking the size of a buffer."); } return buffer.size(); } @@ -102,7 +106,8 @@ public String getMimeType() { } /** - * Signal that writing to this buffer is complete. Certain operations will not be possible until writing is complete. + * Signal that writing to this buffer is complete. + * Certain operations will not be possible until writing is complete. */ public void doneWriting() { if (doneWriting) { diff --git a/src/main/java/com/conveyal/r5/analyst/PointSetCache.java b/src/main/java/com/conveyal/r5/analyst/PointSetCache.java index 4fe5d3a7e..816d5f0bc 100644 --- a/src/main/java/com/conveyal/r5/analyst/PointSetCache.java +++ b/src/main/java/com/conveyal/r5/analyst/PointSetCache.java @@ -1,5 +1,6 @@ package com.conveyal.r5.analyst; +import com.conveyal.file.FileCategory; import com.conveyal.file.FileStorage; import com.conveyal.file.FileStorageFormat; import com.conveyal.file.FileStorageKey; @@ -16,6 +17,8 @@ import java.util.concurrent.ExecutionException; import java.util.zip.GZIPInputStream; +import static com.conveyal.file.FileCategory.GRIDS; + /** * A local in-memory cache for PointSets, which are loaded from persistent storage on S3. * It will load either gridded or freeform pointsets, depending on the file extension of the S3 key. @@ -28,7 +31,6 @@ public class PointSetCache { private static final int CACHE_SIZE = 200; private final FileStorage fileStore; - private final String bucket; private LoadingCache cache = CacheBuilder.newBuilder() .maximumSize(CACHE_SIZE) @@ -40,13 +42,12 @@ public PointSet load(String s) throws Exception { } }); - public PointSetCache(FileStorage fileStore, String bucket) { - this.bucket = bucket; + public PointSetCache(FileStorage fileStore) { this.fileStore = fileStore; } private PointSet loadPointSet(String key) throws IOException { - File file = fileStore.getFile(new FileStorageKey(bucket, key)); + File file = fileStore.getFile(new FileStorageKey(GRIDS, key)); // If the object does not exist on S3, getObject will throw an exception which will be caught in the // PointSetCache.get method. Grids are gzipped on S3. InputStream is = new GZIPInputStream(FileUtils.getInputStream(file)); @@ -76,8 +77,8 @@ public PointSet get (String key) { // something like RegionalAnalysisController. 
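// A small sketch of the write-then-read contract that the clarified doneWriting() messages above
// describe, using only the PersistenceBuffer methods visible in this diff. The no-argument
// constructor and the assumption that doneWriting() finishes the underlying (gzip) stream are
// taken on faith here; both getSize() and getInputStream() throw until doneWriting() is called.
PersistenceBuffer sketchBuffer = new PersistenceBuffer();
OutputStream payloadOut = sketchBuffer.getOutputStream();
payloadOut.write(new byte[]{1, 2, 3});          // any serialized payload
sketchBuffer.doneWriting();                     // freeze contents; size is now well-defined
long contentLength = sketchBuffer.getSize();    // safe only after doneWriting()
InputStream uploadBody = sketchBuffer.getInputStream();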
private static PointSetCache instance; - public static void initializeStatically (FileStorage fileStorage, String gridBucket) { - instance = new PointSetCache(fileStorage, gridBucket); + public static void initializeStatically (FileStorage fileStorage) { + instance = new PointSetCache(fileStorage); } public static FreeFormPointSet readFreeFormFromFileStore (String key) { diff --git a/src/main/java/com/conveyal/r5/analyst/S3FilePersistence.java b/src/main/java/com/conveyal/r5/analyst/S3FilePersistence.java deleted file mode 100644 index 932fb21fd..000000000 --- a/src/main/java/com/conveyal/r5/analyst/S3FilePersistence.java +++ /dev/null @@ -1,165 +0,0 @@ -package com.conveyal.r5.analyst; - -import com.amazonaws.event.ProgressEvent; -import com.amazonaws.event.ProgressEventType; -import com.amazonaws.event.ProgressListener; -import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.AmazonS3ClientBuilder; -import com.amazonaws.services.s3.model.ObjectMetadata; -import com.amazonaws.services.s3.model.S3Object; -import com.amazonaws.services.s3.transfer.TransferManager; -import com.amazonaws.services.s3.transfer.TransferManagerBuilder; -import com.amazonaws.services.s3.transfer.TransferProgress; -import com.amazonaws.services.s3.transfer.Upload; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.InputStream; - -import static com.conveyal.r5.common.Util.human; - -/** - * An implementation of long-term file persistence using Amazon AWS S3. - * - * For any file whose length is not known in advance, the S3 client will buffer the whole thing in memory. - * This obviously dangerous because it can exhaust memory. However, when we are going to gzip files, we just don't - * know the size in advance. If the files are known to be reasonably small, we can do all the layout and compression - * in memory buffers then upload from the buffer, whose length is known. - * - * From docs: "callers must supply the size of options in the stream through the content length field in the - * ObjectMetadata parameter. If no content length is specified for the input stream, then TransferManager will attempt - * to buffer all the stream contents in memory and upload the options as a traditional, single part upload. - * Because the entire stream contents must be buffered in memory, this can be very expensive, and should be - * avoided whenever possible." - * - */ -public class S3FilePersistence extends FilePersistence { - - public static final Logger LOG = LoggerFactory.getLogger(S3FilePersistence.class); - - /** - * The common prefix of all accessed buckets. - * Example: if baseBucket is "analysis-staging", files of type POLYGON will be in bucket "analysis-staging-polygons". - * NOTE: For now this only applies to reading files using S3FilePersistence, not writing them. - * TODO if this works well, extend to all other bucket names and file loading / saving. - */ - private final String baseBucket; - - /** Manage transfers to S3 in the background, so we can continue calculating while uploading. 
*/ - private final TransferManager transferManager; - - // Low-level client, for now we're trying the high-level TransferManager TODO maybe use this so we don't have to shut down the transferManager - private final AmazonS3 amazonS3; - - public S3FilePersistence (String region, String baseBucket) { - - this.amazonS3 = AmazonS3ClientBuilder.standard() - // .enableAccelerateMode() // this fails looking up s3-accelerate.amazonaws.com - .withRegion(region) - .build(); - - this.transferManager = TransferManagerBuilder.standard() - .withS3Client(amazonS3) - .build(); - - this.baseBucket = baseBucket; - } - - @Override - public void saveData(String directory, String fileName, PersistenceBuffer persistenceBuffer) { - try { - ObjectMetadata metadata = new ObjectMetadata(); - // Set content encoding to gzip. This way browsers will decompress on download using native deflate code. - // http://www.rightbrainnetworks.com/blog/serving-compressed-gzipped-static-files-from-amazon-s3-or-cloudfront/ - metadata.setContentEncoding("gzip"); - metadata.setContentType(persistenceBuffer.getMimeType()); - // We must setContentLength or the S3 client will re-buffer the InputStream into another memory buffer. - metadata.setContentLength(persistenceBuffer.getSize()); -// amazonS3.putObject(directory, fileName, persistenceBuffer.getInputStream(), metadata); - final Upload upload = transferManager.upload(directory, fileName, persistenceBuffer.getInputStream(), metadata); - upload.addProgressListener(new UploadProgressLogger(upload)); - // Block until upload completes to avoid accumulating unlimited uploads in memory. - upload.waitForCompletion(); - } catch (Exception e) { - throw new RuntimeException(e); - } - } - - @Override - public InputStream getData(FileCategory category, String name) { - String fullBucketName = getFullBucketName(category); - LOG.info("Fetching {} with name {} from S3 bucket {}.", category, name, fullBucketName); - S3Object s3Object = amazonS3.getObject(fullBucketName, name); - // TODO progress reporting, cacheing in local file using getObject(GetObjectRequest getObjectRequest, File destinationFile) or TransferManager.download() - InputStream inputStream = s3Object.getObjectContent(); - return inputStream; - } - - /** - * Example: for baseBucket "analysis-staging" and file type POLYGON, returns "analysis-staging-polygons". 
- */ - private String getFullBucketName (FileCategory category) { - return baseBucket + "-" + category.name().toLowerCase() + "s"; - } - - @Override - public void shutdown() { - transferManager.shutdownNow(); - } - - // TODO wire this up to our r5/analysis progress and task system - private static class UploadProgressLogger implements ProgressListener { - - private static final int LOG_INTERVAL_SECONDS = 5; - - private final long beginTime; - - private long lastLogTime; - - private Upload upload; - - public UploadProgressLogger(Upload upload) { - this.upload = upload; - beginTime = System.currentTimeMillis(); - lastLogTime = beginTime; - } - - @Override - public void progressChanged(ProgressEvent progressEvent) { - final ProgressEventType eventType = progressEvent.getEventType(); - if (eventType == ProgressEventType.REQUEST_BYTE_TRANSFER_EVENT || - eventType == ProgressEventType.TRANSFER_COMPLETED_EVENT) { - long now = System.currentTimeMillis(); - if (now > lastLogTime + LOG_INTERVAL_SECONDS * 1000 || - eventType == ProgressEventType.TRANSFER_COMPLETED_EVENT) { - TransferProgress transferProgress = upload.getProgress(); - double durationSec = (now - beginTime) / 1000D; - LOG.info("{} transferred {} of {} ({} percent), duration {}, speed {})", - upload.getDescription(), - human(transferProgress.getBytesTransferred(), "B"), - human(transferProgress.getTotalBytesToTransfer(), "B"), - transferProgress.getPercentTransferred(), - human(durationSec, "s"), - human(transferProgress.getBytesTransferred() / durationSec, "B/sec")); - lastLogTime = now; - } - } else if (eventType == ProgressEventType.TRANSFER_FAILED_EVENT) { - LOG.error("{}: TRANSFER FAILED.", upload.getDescription()); - } - } - } -} - -/* Some old code that might be relevant if we need to set regions. - The client creation process can detect the region that EC2 instances are running in. - When testing locally setting the region is essential. - - // Clients for communicating with Amazon web services - // When creating the S3 and SQS clients use the default credentials chain. - // This will check environment variables and ~/.aws/credentials first, then fall back on - // the auto-assigned IAM role if this code is running on an EC2 instance. - // http://docs.aws.amazon.com/AWSSdkDocsJava/latest/DeveloperGuide/java-dg-roles.html - AmazonS3ClientBuilder builder = AmazonS3Client.builder(); - builder.setRegion(ec2info.region); - AmazonS3 s3 = builder.build(); -*/ diff --git a/src/main/java/com/conveyal/r5/analyst/SelectingGridReducer.java b/src/main/java/com/conveyal/r5/analyst/SelectingGridReducer.java deleted file mode 100644 index 35246203e..000000000 --- a/src/main/java/com/conveyal/r5/analyst/SelectingGridReducer.java +++ /dev/null @@ -1,95 +0,0 @@ -package com.conveyal.r5.analyst; - -import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.AmazonS3ClientBuilder; -import com.amazonaws.services.s3.model.S3Object; -import com.google.common.io.LittleEndianDataInputStream; - -import java.io.IOException; -import java.io.InputStream; -import java.util.zip.GZIPInputStream; - -/** - * Access grids are three-dimensional arrays, with the first two dimensions consisting of x and y coordinates of origins - * within the regional analysis, and the third dimension reflects multiple values of the indicator of interest. This could - * be instantaneous accessibility results for each Monte Carlo draw when computing average instantaneous accessibility (i.e. 
- * Owen-style accessibility), or it could be multiple bootstrap replications of the sampling distribution of accessibility - * given median travel time (see Conway, M. W., Byrd, A. and van Eggermond, M. "A Statistical Approach to Comparing - * Accessibility Results: Including Uncertainty in Public Transport Sketch Planning," paper presented at the 2017 World - * Symposium of Transport and Land Use Research, Brisbane, QLD, Australia, Jul 3-6.) - * - * A SelectingGridReducer simply grabs the value at a particular index within each origin. - * When storing bootstrap replications of travel time, we also store the point estimate (using all Monte Carlo draws - * equally weighted) as the first value, so a SelectingGridReducer(0) can be used to retrieve the point estimate. - * - * DEPRECATED because this has been copied into analysis-backend where it belongs. - */ -@Deprecated -public class SelectingGridReducer { - - private static final AmazonS3 s3 = AmazonS3ClientBuilder.defaultClient(); - - /** Version of the access grid format we read */ - private static final int ACCESS_GRID_VERSION = 0; - - public final int index; - - /** Initialize with the index to extract */ - public SelectingGridReducer(int index) { - this.index = index; - } - - public Grid compute(String resultsBucket, String key) throws IOException { - S3Object accessGrid = s3.getObject(resultsBucket, key); - - return compute(accessGrid.getObjectContent()); - } - - public Grid compute (InputStream rawInput) throws IOException { - LittleEndianDataInputStream input = new LittleEndianDataInputStream(new GZIPInputStream(rawInput)); - - char[] header = new char[8]; - for (int i = 0; i < 8; i++) { - header[i] = (char) input.readByte(); - } - - if (!"ACCESSGR".equals(new String(header))) { - throw new IllegalArgumentException("Input not in access grid format!"); - } - - int version = input.readInt(); - - if (version != ACCESS_GRID_VERSION) { - throw new IllegalArgumentException(String.format("Version mismatch of access grids, expected %s, found %s", ACCESS_GRID_VERSION, version)); - } - - int zoom = input.readInt(); - int west = input.readInt(); - int north = input.readInt(); - int width = input.readInt(); - int height = input.readInt(); - - // The number of samples stored at each origin; these could be instantaneous accessibility values for each - // Monte Carlo draw, or they could be bootstrap replications of a sampling distribution of accessibility given - // median travel time. 
- int nSamples = input.readInt(); - - Grid outputGrid = new Grid(zoom, width, height, north, west); - - int[] valuesThisOrigin = new int[nSamples]; - - for (int y = 0; y < height; y++) { - for (int x = 0; x < width; x++) { - // input values are delta-coded per origin, so use val to keep track of current value - for (int iteration = 0, val = 0; iteration < nSamples; iteration++) { - valuesThisOrigin[iteration] = (val += input.readInt()); - } - // compute percentiles - outputGrid.grid[x][y] = valuesThisOrigin[index]; - } - } - input.close(); - return outputGrid; - } - -} diff --git a/src/main/java/com/conveyal/r5/analyst/WebMercatorGridPointSet.java b/src/main/java/com/conveyal/r5/analyst/WebMercatorGridPointSet.java index 9f14ba3e4..c3b585812 100644 --- a/src/main/java/com/conveyal/r5/analyst/WebMercatorGridPointSet.java +++ b/src/main/java/com/conveyal/r5/analyst/WebMercatorGridPointSet.java @@ -22,6 +22,10 @@ public class WebMercatorGridPointSet extends PointSet implements Serializable { public static final Logger LOG = LoggerFactory.getLogger(WebMercatorGridPointSet.class); + /** + * Default Web Mercator zoom level for grids (origin/destination layers, aggregation area masks, etc.). + * Level 10 is probably ideal but will quadruple calculation relative to 9. + */ public static final int DEFAULT_ZOOM = 9; /** web mercator zoom level */ @@ -225,4 +229,12 @@ public WebMercatorExtents getWebMercatorExtents () { return new WebMercatorExtents(west, north, width, height, zoom); } + public static int parseZoom(String zoom) { + if (zoom != null) { + return Integer.parseInt(zoom); + } else { + return DEFAULT_ZOOM; + } + } + } diff --git a/src/main/java/com/conveyal/r5/analyst/cluster/AnalysisWorker.java b/src/main/java/com/conveyal/r5/analyst/cluster/AnalysisWorker.java index 240ba88ca..ee88a1272 100644 --- a/src/main/java/com/conveyal/r5/analyst/cluster/AnalysisWorker.java +++ b/src/main/java/com/conveyal/r5/analyst/cluster/AnalysisWorker.java @@ -1,23 +1,17 @@ package com.conveyal.r5.analyst.cluster; -import com.amazonaws.regions.Regions; -import com.conveyal.analysis.BackendVersion; +import com.conveyal.analysis.components.eventbus.EventBus; +import com.conveyal.analysis.components.eventbus.HandleRegionalEvent; +import com.conveyal.analysis.components.eventbus.HandleSinglePointEvent; import com.conveyal.file.FileStorage; -import com.conveyal.file.LocalFileStorage; -import com.conveyal.file.S3FileStorage; -import com.conveyal.gtfs.GTFSCache; import com.conveyal.r5.OneOriginResult; import com.conveyal.r5.analyst.AccessibilityResult; -import com.conveyal.r5.analyst.FilePersistence; import com.conveyal.r5.analyst.NetworkPreloader; import com.conveyal.r5.analyst.PersistenceBuffer; import com.conveyal.r5.analyst.PointSetCache; -import com.conveyal.r5.analyst.S3FilePersistence; import com.conveyal.r5.analyst.TravelTimeComputer; -import com.conveyal.r5.analyst.error.ScenarioApplicationException; import com.conveyal.r5.analyst.error.TaskError; import com.conveyal.r5.common.JsonUtilities; -import com.conveyal.r5.streets.OSMCache; import com.conveyal.r5.transit.TransportNetwork; import com.conveyal.r5.transit.TransportNetworkCache; import com.conveyal.r5.transitive.TransitiveNetwork; @@ -37,10 +31,7 @@ import org.slf4j.LoggerFactory; import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.FileInputStream; import java.io.IOException; -import java.io.InputStream; import java.io.OutputStream; import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; @@ -48,7 +39,6 @@ 
import java.util.Arrays; import java.util.Collections; import java.util.List; -import java.util.Properties; import java.util.Random; import java.util.UUID; import java.util.concurrent.BlockingQueue; @@ -63,19 +53,67 @@ import static com.conveyal.r5.profile.PerTargetPropagater.SECONDS_PER_MINUTE; import static com.google.common.base.Preconditions.checkElementIndex; import static com.google.common.base.Preconditions.checkNotNull; -import static com.google.common.base.Preconditions.checkState; /** - * This is a main class run by worker machines in our Analysis computation cluster. - * It polls a broker requesting work over HTTP, telling the broker what networks and scenarios it has loaded. - * When it receives some work from the broker it does the necessary work and returns the results back to the front - * end via the broker. - * - * The worker can poll for work over two different channels. One is for large asynchronous batch jobs, the other is - * intended for interactive single point requests that should return as fast as possible. + * This is a main class run by worker machines in our Analysis computation cluster. It polls a broker requesting work + * over HTTP, telling the broker what networks and scenarios it has loaded. When it receives some work from the broker + * it does the necessary work and returns the results back to the front end via the broker. + * The worker may also listen for interactive single point requests that should return as fast as possible. */ public class AnalysisWorker implements Runnable { + /** + * All parameters needed to configure an AnalysisWorker instance. + * This config interface is kind of huge and includes most things in the WorkerConfig. + * This implies too much functionality is concentrated in AnalysisWorker and should be compartmentalized. + */ + public interface Config { + + /** + * This worker will only listen for incoming single point requests if this field is true when run() is invoked. + * Setting this to false before running creates a regional-only cluster worker. + * This is useful in testing when running many workers on the same machine. + */ + boolean listenForSinglePoint(); + + /** + * If this is true, the worker will not actually do any work. It will just report all tasks as completed + * after a small delay, but will fail to do so on the given percentage of tasks. This is used in testing task + * re-delivery and overall broker sanity with multiple jobs and multiple failing workers. + */ + boolean testTaskRedelivery(); + String brokerAddress(); + String brokerPort(); + String initialGraphId(); + + } + + // CONSTANTS + + private static final Logger LOG = LoggerFactory.getLogger(AnalysisWorker.class); + + public static final int POLL_WAIT_SECONDS = 15; + public static final int POLL_MAX_RANDOM_WAIT = 5; + + /** + * This timeout should be longer than the longest expected worker calculation for a single-point request. + * Preparing networks or linking grids will take longer, but those cases are now handled with + * WorkerNotReadyException. + */ + private static final int HTTP_CLIENT_TIMEOUT_SEC = 55; + + /** The port on which the worker will listen for single point tasks forwarded from the backend. */ + public static final int WORKER_LISTEN_PORT = 7080; + + /** + * When testTaskRedelivery=true, how often the worker will fail to return a result for a task. + * TODO merge this with the boolean config parameter to enable intentional failure. 
+ */ + public static final int TESTING_FAILURE_RATE_PERCENT = 20; + + + // STATIC FIELDS + /** * Worker ID - just a random ID so we can differentiate machines used for computation. * Useful to isolate the logs from a particular machine, as well as to evaluate any @@ -92,96 +130,36 @@ public class AnalysisWorker implements Runnable { */ public static final String machineId = UUID.randomUUID().toString().replaceAll("-", ""); - private static final Logger LOG = LoggerFactory.getLogger(AnalysisWorker.class); - - private static final String DEFAULT_BROKER_ADDRESS = "localhost"; - - private static final String DEFAULT_BROKER_PORT = "7070"; - - public static final int POLL_WAIT_SECONDS = 15; - - public static final int POLL_MAX_RANDOM_WAIT = 5; - - /** The port on which the worker will listen for single point tasks forwarded from the backend. */ - public static final int WORKER_LISTEN_PORT = 7080; + // INSTANCE FIELDS - // TODO make non-static and make implementations swappable - // This is very ugly because it's static but initialized at class instantiation. - public static FilePersistence filePersistence; + /** Hold a reference to the config object to avoid copying the many config values. */ + private final Config config; /** Keeps some TransportNetworks around, lazy-loading or lazy-building them. */ public final NetworkPreloader networkPreloader; - /** - * If this is true, the worker will not actually do any work. It will just report all tasks as completed - * after a small delay, but will fail to do so on the given percentage of tasks. This is used in testing task - * re-delivery and overall broker sanity with multiple jobs and multiple failing workers. - */ - private final boolean testTaskRedelivery; - - /** In the type of tests described above, this is how often the worker will fail to return a result for a task. */ - public static final int TESTING_FAILURE_RATE_PERCENT = 20; - - /** The amount of time (in minutes) a worker will stay alive after starting certain work */ - static final int PRELOAD_KEEPALIVE_MINUTES = 90; - static final int REGIONAL_KEEPALIVE_MINUTES = 5; - static final int SINGLE_KEEPALIVE_MINUTES = 60; - - /** Clock time (milliseconds since epoch) at which the worker should be considered idle */ - long shutdownAfter; - boolean inPreloading; - - void adjustShutdownClock (int keepAliveMinutes) { - long t = System.currentTimeMillis() + keepAliveMinutes * 60 * 1000; - if (inPreloading) { - inPreloading = false; - shutdownAfter = t; - } else { - shutdownAfter = Math.max(shutdownAfter, t); - } - } - - /** Whether this worker should shut down automatically when idle. */ - public final boolean autoShutdown; - - public static final Random random = new Random(); + private final Random random = new Random(); /** The common root of all API URLs contacted by this worker, e.g. http://localhost:7070/api/ */ - protected String brokerBaseUrl; + protected final String brokerBaseUrl; /** The HTTP client the worker uses to contact the broker and fetch regional analysis tasks. */ - static final HttpClient httpClient = makeHttpClient(); - - /** - * This timeout should be longer than the longest expected worker calculation for a single-point request. - * Preparing networks or linking grids will take longer, but those cases are now handled with - * WorkerNotReadyException. - */ - private static final int HTTP_CLIENT_TIMEOUT_SEC = 55; + private final HttpClient httpClient = makeHttpClient(); /** * The results of finished work accumulate here, and will be sent in batches back to the broker. 
* All access to this field should be synchronized since it will is written to by multiple threads. * We don't want to just wrap it in a SynchronizedList because we need an atomic copy-and-empty operation. */ - private List workResults = new ArrayList<>(); + private final List workResults = new ArrayList<>(); /** The last time (in milliseconds since the epoch) that we polled for work. */ private long lastPollingTime; /** Keep track of how many tasks per minute this worker is processing, broken down by scenario ID. */ - ThroughputTracker throughputTracker = new ThroughputTracker(); - - /** - * This worker will only listen for incoming single point requests if this field is true when run() is invoked. - * Setting this to false before running creates a regional-only cluster worker. This is useful in testing when - * running many workers on the same machine. - */ - protected boolean listenForSinglePointRequests; + private final ThroughputTracker throughputTracker = new ThroughputTracker(); - /** - * This has been pulled out into a method so the broker can also make a similar http client. - */ + /** Convenience method allowing the backend broker and the worker to make similar HTTP clients. */ public static HttpClient makeHttpClient () { PoolingHttpClientConnectionManager mgr = new PoolingHttpClientConnectionManager(); mgr.setDefaultMaxPerRoute(20); @@ -195,137 +173,60 @@ public static HttpClient makeHttpClient () { .build(); } + private final FileStorage fileStorage; + /** * A loading cache of opportunity dataset grids (not grid pointsets or linkages). * TODO use the WebMercatorGridExtents in these Grids. */ - PointSetCache pointSetCache; - - /** The transport network this worker already has loaded, and therefore prefers to work on. */ - String networkId = null; + private final PointSetCache pointSetCache; /** Information about the EC2 instance (if any) this worker is running on. */ - EC2Info ec2info; + public EC2Info ec2info; - /** If true Analyst is running locally, do not use internet connection and remote services such as S3. */ - private boolean workOffline; + /** The transport network this worker already has loaded, and therefore prefers to work on. */ + protected String networkId = null; /** * A queue to hold a backlog of regional analysis tasks. * This avoids "slow joiner" syndrome where we wait to poll for more work until all N fetched tasks have finished, * but one of the tasks takes much longer than all the rest. * This should be long enough to hold all that have come in - we don't need to block on polling the manager. + * Can this be replaced with the general purpose TaskScheduler component? + * That will depend whether all TaskScheduler Tasks are tracked in a way intended to be visible to users. */ private ThreadPoolExecutor regionalTaskExecutor; - /** The HTTP server that receives single-point requests. */ + /** The HTTP server that receives single-point requests. TODO make this more consistent with the backend HTTP API components. */ private spark.Service sparkHttpService; - public static AnalysisWorker forConfig (Properties config) { - // FIXME why is there a separate configuration parsing section here? Why not always make the cache based on the configuration? - // FIXME why is some configuration done here and some in the constructor? 
- boolean workOffline = Boolean.parseBoolean(config.getProperty("work-offline", "false")); - String graphDirectory = config.getProperty("cache-dir", "cache/graphs"); - FileStorage fileStore; - if (workOffline) { - fileStore = new LocalFileStorage(graphDirectory); - } else { - fileStore = new S3FileStorage(config.getProperty("aws-region"), graphDirectory); - } - - // TODO worker config classes structured like BackendConfig - String graphsBucket = workOffline ? null : config.getProperty("graphs-bucket"); - OSMCache osmCache = new OSMCache(fileStore, () -> graphsBucket); - GTFSCache gtfsCache = new GTFSCache(fileStore, () -> graphsBucket); + private final EventBus eventBus; - TransportNetworkCache cache = new TransportNetworkCache(fileStore, gtfsCache, osmCache, graphsBucket); - return new AnalysisWorker(config, fileStore, cache); - } - - // TODO merge this constructor with the forConfig factory method, so we don't have different logic for local and cluster workers - public AnalysisWorker (Properties config, FileStorage fileStore, TransportNetworkCache transportNetworkCache) { - // print out date on startup so that CloudWatch logs has a unique fingerprint + /** Constructor that takes injected components. */ + public AnalysisWorker ( + FileStorage fileStorage, + TransportNetworkCache transportNetworkCache, + EventBus eventBus, + Config config + ) { + // Print out date on startup so that CloudWatch logs has a unique fingerprint LOG.info("Analyst worker {} starting at {}", machineId, LocalDateTime.now().format(DateTimeFormatter.ISO_DATE_TIME)); - // PARSE THE CONFIGURATION TODO move configuration parsing into a separate method. - - testTaskRedelivery = Boolean.parseBoolean(config.getProperty("test-task-redelivery", "false")); - - // Region region = Region.getRegion(Regions.fromName(config.getProperty("aws-region"))); - // TODO Eliminate this default base-bucket value "analysis-staging" and set it properly when the backend starts workers. - // It's currently harmless to hard-wire it because it only affects polygon downloads for experimental modifications. - filePersistence = new S3FilePersistence(config.getProperty("aws-region"), config.getProperty("base-bucket", "analysis-staging")); + this.config = config; + this.brokerBaseUrl = String.format("http://%s:%s/internal", config.brokerAddress(), config.brokerPort()); - // First, check whether we are running Analyst offline. - workOffline = Boolean.parseBoolean(config.getProperty("work-offline", "false")); - if (workOffline) { - LOG.info("Working offline. Avoiding internet connections and hosted services."); - } - - { - String brokerAddress = config.getProperty("broker-address", DEFAULT_BROKER_ADDRESS); - String brokerPort = config.getProperty("broker-port", DEFAULT_BROKER_PORT); - this.brokerBaseUrl = String.format("http://%s:%s/internal", brokerAddress, brokerPort); - } - - // set the initial graph affinity of this worker (if it is not in the config file it will be - // set to null, i.e. no graph affinity) - // we don't actually build the graph now; this is just a hint to the broker as to what - // graph this machine was intended to analyze. - this.networkId = config.getProperty("initial-graph-id"); - - this.pointSetCache = new PointSetCache(fileStore, config.getProperty("pointsets-bucket")); + // Set the initial graph affinity of this worker (which will be null in local operation). + // We don't actually build / load / process the TransportNetwork until we receive the first task. 
+ // This just provides a hint to the broker as to what network this machine was intended to analyze. + this.networkId = config.initialGraphId(); + this.fileStorage = fileStorage; + this.pointSetCache = new PointSetCache(fileStorage); // Make this cache a component? this.networkPreloader = new NetworkPreloader(transportNetworkCache); - this.autoShutdown = Boolean.parseBoolean(config.getProperty("auto-shutdown", "false")); - this.listenForSinglePointRequests = Boolean.parseBoolean(config.getProperty("listen-for-single-point", "true")); - - // Keep the worker alive for an initial window to prepare for analysis - inPreloading = true; - shutdownAfter = System.currentTimeMillis() + PRELOAD_KEEPALIVE_MINUTES * 60 * 1000; - - // Discover information about what EC2 instance / region we're running on, if any. - // If the worker isn't running in Amazon EC2, then region will be unknown so fall back on a default, because - // the new request signing v4 requires you to know the region where the S3 objects are. - ec2info = new EC2Info(); - if (!workOffline) { - ec2info.fetchMetadata(); - } - if (ec2info.region == null) { - // We're working offline and/or not running on EC2. Set a default region rather than detecting one. - ec2info.region = Regions.EU_WEST_1.getName(); - } - } - - /** - * Shut down if enough time has passed since certain events (startup or handling an analysis request). When EC2 - * billing was in hourly increments, the worker would only consider shutting down every 60 minutes. But EC2 - * billing is now by the second, so we check more frequently (during regular polling). - */ - public void considerShuttingDown() { - long now = System.currentTimeMillis(); - - if (now > shutdownAfter) { - LOG.info("Machine has been idle for at least {} minutes (single point) and {} minutes (regional), " + - "shutting down.", SINGLE_KEEPALIVE_MINUTES , REGIONAL_KEEPALIVE_MINUTES); - // Stop accepting any new single-point requests while shutdown is happening. - // TODO maybe actively tell the broker this worker is shutting down. - sparkHttpService.stop(); - try { - Process process = new ProcessBuilder("sudo", "/sbin/shutdown", "-h", "now").start(); - process.waitFor(); - } catch (Exception ex) { - LOG.error("Unable to terminate worker", ex); - // TODO email us or something - } finally { - System.exit(0); - } - } + this.eventBus = eventBus; } - /** - * This is the main worker event loop which fetches tasks from a broker and schedules them for execution. - */ + /** The main worker event loop which fetches tasks from a broker and schedules them for execution. */ @Override public void run() { @@ -333,10 +234,10 @@ public void run() { // The default task rejection policy is "Abort". // The executor's queue is rather long because some tasks complete very fast and we poll max once per second. 
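// A hedged sketch of a minimal implementation of the AnalysisWorker.Config interface introduced
// above, suitable for wiring up a local or test worker in the new constructor-injection style.
// The literal values mirror the defaults removed elsewhere in this diff ("localhost", "7070") and
// are illustrative, not read from any real configuration source.
AnalysisWorker.Config localTestConfig = new AnalysisWorker.Config() {
    @Override public boolean listenForSinglePoint () { return false; } // regional-only worker, useful in tests
    @Override public boolean testTaskRedelivery () { return false; }
    @Override public String brokerAddress () { return "localhost"; }
    @Override public String brokerPort () { return "7070"; }
    @Override public String initialGraphId () { return null; } // no initial network affinity
};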
int availableProcessors = Runtime.getRuntime().availableProcessors(); - LOG.info("Java reports the number of available processors is: {}", availableProcessors); + LOG.debug("Java reports the number of available processors is: {}", availableProcessors); int maxThreads = availableProcessors; int taskQueueLength = availableProcessors * 6; - LOG.info("Maximum number of regional processing threads is {}, length of task queue is {}.", maxThreads, taskQueueLength); + LOG.debug("Maximum number of regional processing threads is {}, length of task queue is {}.", maxThreads, taskQueueLength); BlockingQueue taskQueue = new LinkedBlockingQueue<>(taskQueueLength); regionalTaskExecutor = new ThreadPoolExecutor(1, maxThreads, 60, TimeUnit.SECONDS, taskQueue); @@ -351,7 +252,7 @@ public void run() { // "needed(acceptors=1 + selectors=8 + request=1)". Even worse, in container-based testing environments this // required number of threads is even higher and any value we specify can cause the server (and tests) to fail. // TODO find a more effective way to limit simultaneous computations, e.g. feed them through the regional thread pool. - if (listenForSinglePointRequests) { + if (config.listenForSinglePoint()) { // Use the newer non-static Spark framework syntax. sparkHttpService = spark.Service.ignite().port(WORKER_LISTEN_PORT); sparkHttpService.post("/single", new AnalysisWorkerController(this)::handleSinglePoint); @@ -372,23 +273,37 @@ public void run() { if (tasks == null || tasks.isEmpty()) { // Either there was no work, or some kind of error occurred. // Sleep for a while before polling again, adding a random component to spread out the polling load. - if (autoShutdown) {considerShuttingDown();} + // TODO only randomize delay on the first round, after that it's excessive. int randomWait = random.nextInt(POLL_MAX_RANDOM_WAIT); LOG.debug("Polling the broker did not yield any regional tasks. Sleeping {} + {} sec.", POLL_WAIT_SECONDS, randomWait); sleepSeconds(POLL_WAIT_SECONDS + randomWait); continue; } for (RegionalTask task : tasks) { + // Try to enqueue each task for execution, repeatedly failing until the queue is not full. + // The list of fetched tasks essentially serves as a secondary queue, which is awkward. This is using + // exceptions for normal flow control, which is nasty. We should do this differently (#596). while (true) { try { // TODO define non-anonymous runnable class to instantiate here, specifically for async regional tasks. - regionalTaskExecutor.execute(() -> this.handleOneRegionalTask(task)); + regionalTaskExecutor.execute(() -> { + try { + this.handleOneRegionalTask(task); + } catch (Throwable t) { + LOG.error( + "An error occurred while handling a regional task, reporting to backend. {}", + ExceptionUtils.stackTraceString(t) + ); + synchronized (workResults) { + workResults.add(new RegionalWorkResult(t, task)); + } + } + }); break; } catch (RejectedExecutionException e) { - // Queue is full, wait a bit and try to feed it more tasks. - // FIXME if we burn through the internal queue in less than 1 second this is a speed bottleneck. - // This happens with regions unconnected to transit and with very small travel time cutoffs. - // FIXME this is really using the list of fetched tasks as a secondary queue, it's awkward. + // Queue is full, wait a bit and try to feed it more tasks. If worker handles all tasks in its + // internal queue in less than 1 second, this is a speed bottleneck. This happens with regions + // unconnected to transit and with very small travel time cutoffs. 
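// One possible direction for the flow-control concern noted above (see #596), sketched rather than
// prescribed: constructing the executor with CallerRunsPolicy makes the polling thread run a task
// itself when the queue is full, providing back-pressure without using RejectedExecutionException
// for normal control flow. The trade-off is that polling pauses while that task executes.
ThreadPoolExecutor backpressureExecutor = new ThreadPoolExecutor(
        1, maxThreads, 60, TimeUnit.SECONDS,
        new LinkedBlockingQueue<>(taskQueueLength),
        new ThreadPoolExecutor.CallerRunsPolicy());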
sleepSeconds(1); } } @@ -396,19 +311,17 @@ public void run() { } } - /** - * Bypass idiotic java checked exceptions. - */ + /** Bypass idiotic java checked exceptions. */ public static void sleepSeconds (int seconds) { try { - Thread.sleep(seconds * 1000); + Thread.sleep(seconds * 1000L); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } } protected byte[] handleAndSerializeOneSinglePointTask (TravelTimeSurfaceTask task) throws IOException { - LOG.info("Handling single-point task {}", task.toString()); + LOG.debug("Handling single-point task {}", task.toString()); // Get all the data needed to run one analysis task, or at least begin preparing it. final AsyncLoader.LoaderState networkLoaderState = networkPreloader.preloadData(task); @@ -445,9 +358,9 @@ protected OneOriginResult handleOneSinglePointTask (TravelTimeSurfaceTask task, task.loadAndValidateDestinationPointSets(pointSetCache); } - // After the AsyncLoader has reported all required data are ready for analysis, advance the shutdown clock to - // reflect that the worker is performing single-point work. - adjustShutdownClock(SINGLE_KEEPALIVE_MINUTES); + // After the AsyncLoader has reported all required data are ready for analysis, + // signal that we will begin processing the task. + eventBus.send(new HandleSinglePointEvent()); // Perform the core travel time computations. TravelTimeComputer computer = new TravelTimeComputer(task, transportNetwork); @@ -496,15 +409,16 @@ private byte[] singlePointResultToBinary ( * Handle one task representing one of many origins within a regional analysis. * This method is generally being executed asynchronously, handling a large number of tasks on a pool of worker * threads. It stockpiles results as they are produced, so they can be returned to the backend in batches when the - * worker polls the backend. + * worker polls the backend. If any problem is encountered, the Throwable may be allowed to propagate up as all + * Throwables will be caught and reported to the backend, causing the regional job to end. */ - protected void handleOneRegionalTask(RegionalTask task) { + protected void handleOneRegionalTask (RegionalTask task) throws Throwable { - LOG.info("Handling regional task {}", task.toString()); + LOG.debug("Handling regional task {}", task.toString()); // If this worker is being used in a test of the task redelivery mechanism. Report most work as completed // without actually doing anything, but fail to report results a certain percentage of the time. - if (testTaskRedelivery) { + if (config.testTaskRedelivery()) { pretendToDoWork(task); return; } @@ -528,83 +442,78 @@ protected void handleOneRegionalTask(RegionalTask task) { maxTripDurationMinutes = 120; } task.maxTripDurationMinutes = maxTripDurationMinutes; - LOG.info("Maximum cutoff was {} minutes, limiting trip duration to {} minutes based on decay function {}.", + LOG.debug("Maximum cutoff was {} minutes, limiting trip duration to {} minutes based on decay function {}.", maxCutoffMinutes, maxTripDurationMinutes, task.decayFunction.getClass().getSimpleName()); } - try { - // TODO (re)validate multi-percentile and multi-cutoff parameters. Validation currently in TravelTimeReducer. - // This version should require both arrays to be present, and single values to be missing. - // Using a newer backend, the task should have been normalized to use arrays not single values. 
- checkNotNull(task.cutoffsMinutes, "This worker requires an array of cutoffs (rather than a single value)."); - checkNotNull(task.percentiles, "This worker requires an array of percentiles (rather than a single one)."); - checkElementIndex(0, task.cutoffsMinutes.length, "Regional task must specify at least one cutoff."); - checkElementIndex(0, task.percentiles.length, "Regional task must specify at least one percentile."); - - // Get the graph object for the ID given in the task, fetching inputs and building as needed. - // All requests handled together are for the same graph, and this call is synchronized so the graph will - // only be built once. - // Record the currently loaded network ID so we "stick" to this same graph on subsequent polls. - networkId = task.graphId; - // Note we're completely bypassing the async loader here and relying on the older nested LoadingCaches. - // If those are ever removed, the async loader will need a synchronous mode with per-path blocking (kind of - // reinventing the wheel of LoadingCache) or we'll need to make preparation for regional tasks async. - TransportNetwork transportNetwork = networkPreloader.transportNetworkCache.getNetworkForScenario(task - .graphId, task.scenarioId); - - // Static site tasks do not specify destinations, but all other regional tasks should. - // Load the PointSets based on the IDs (actually, full storage keys including IDs) in the task. - // The presence of these grids in the task will then trigger the computation of accessibility values. - if (!task.makeTauiSite) { - task.loadAndValidateDestinationPointSets(pointSetCache); - } + // TODO (re)validate multi-percentile and multi-cutoff parameters. Validation currently in TravelTimeReducer. + // This version should require both arrays to be present, and single values to be missing. + // Using a newer backend, the task should have been normalized to use arrays not single values. + checkNotNull(task.cutoffsMinutes, "This worker requires an array of cutoffs (rather than a single value)."); + checkNotNull(task.percentiles, "This worker requires an array of percentiles (rather than a single one)."); + checkElementIndex(0, task.cutoffsMinutes.length, "Regional task must specify at least one cutoff."); + checkElementIndex(0, task.percentiles.length, "Regional task must specify at least one percentile."); - // If we are generating a static site, there must be a single metadata file for an entire batch of results. - // Arbitrarily we create this metadata as part of the first task in the job. - if (task.makeTauiSite && task.taskId == 0) { - LOG.info("This is the first task in a job that will produce a static site. Writing shared metadata."); - saveStaticSiteMetadata(task, transportNetwork); - } + // Get the graph object for the ID given in the task, fetching inputs and building as needed. + // All requests handled together are for the same graph, and this call is synchronized so the graph will + // only be built once. + // Record the currently loaded network ID so we "stick" to this same graph on subsequent polls. + networkId = task.graphId; + // Note we're completely bypassing the async loader here and relying on the older nested LoadingCaches. + // If those are ever removed, the async loader will need a synchronous mode with per-path blocking (kind of + // reinventing the wheel of LoadingCache) or we'll need to make preparation for regional tasks async. 
+ TransportNetwork transportNetwork = networkPreloader.transportNetworkCache.getNetworkForScenario(task + .graphId, task.scenarioId); + + // Static site tasks do not specify destinations, but all other regional tasks should. + // Load the PointSets based on the IDs (actually, full storage keys including IDs) in the task. + // The presence of these grids in the task will then trigger the computation of accessibility values. + if (!task.makeTauiSite) { + task.loadAndValidateDestinationPointSets(pointSetCache); + } - // Advance the shutdown clock to reflect that the worker is performing regional work. - adjustShutdownClock(REGIONAL_KEEPALIVE_MINUTES); - - // Perform the core travel time and accessibility computations. - TravelTimeComputer computer = new TravelTimeComputer(task, transportNetwork); - OneOriginResult oneOriginResult = computer.computeTravelTimes(); - - if (task.makeTauiSite) { - // Unlike a normal regional task, this will write a time grid rather than an accessibility indicator - // value because we're generating a set of time grids for a static site. We only save a file if it has - // non-default contents, as a way to save storage and bandwidth. - // TODO eventually carry out actions based on what's present in the result, not on the request type. - if (oneOriginResult.travelTimes.anyCellReached()) { - TimeGridWriter timeGridWriter = new TimeGridWriter(oneOriginResult.travelTimes, task); - PersistenceBuffer persistenceBuffer = timeGridWriter.writeToPersistenceBuffer(); - String timesFileName = task.taskId + "_times.dat"; - filePersistence.saveStaticSiteData(task, timesFileName, persistenceBuffer); - } else { - LOG.info("No destination cells reached. Not saving static site file to reduce storage space."); - } - // Overwrite with an empty set of results to send back to the backend, allowing it to track job - // progress. This avoids crashing the backend by sending back massive 2 million element travel times - // that have already been written to S3, and throwing exceptions on old backends that can't deal with - // null AccessibilityResults. - oneOriginResult = new OneOriginResult(null, new AccessibilityResult(task), null); - } + // If we are generating a static site, there must be a single metadata file for an entire batch of results. + // Arbitrarily we create this metadata as part of the first task in the job. + if (task.makeTauiSite && task.taskId == 0) { + LOG.info("This is the first task in a job that will produce a static site. Writing shared metadata."); + saveTauiMetadata(task, transportNetwork); + } - // Accumulate accessibility results, which will be returned to the backend in batches. - // For most regional analyses, this is an accessibility indicator value for one of many origins, - // but for static sites the indicator value is not known, it is computed in the UI. We still want to return - // dummy (zero) accessibility results so the backend is aware of progress through the list of origins. - synchronized (workResults) { - workResults.add(new RegionalWorkResult(oneOriginResult, task)); + // After the TransportNetwork has been loaded, signal that we will begin processing the task. + eventBus.send(new HandleRegionalEvent()); + + // Perform the core travel time and accessibility computations. 
+ TravelTimeComputer computer = new TravelTimeComputer(task, transportNetwork); + OneOriginResult oneOriginResult = computer.computeTravelTimes(); + + if (task.makeTauiSite) { + // Unlike a normal regional task, this will write a time grid rather than an accessibility indicator + // value because we're generating a set of time grids for a static site. We only save a file if it has + // non-default contents, as a way to save storage and bandwidth. + // TODO eventually carry out actions based on what's present in the result, not on the request type. + if (oneOriginResult.travelTimes.anyCellReached()) { + TimeGridWriter timeGridWriter = new TimeGridWriter(oneOriginResult.travelTimes, task); + PersistenceBuffer persistenceBuffer = timeGridWriter.writeToPersistenceBuffer(); + String timesFileName = task.taskId + "_times.dat"; + fileStorage.saveTauiData(task, timesFileName, persistenceBuffer); + } else { + LOG.debug("No destination cells reached. Not saving static site file to reduce storage space."); } - throughputTracker.recordTaskCompletion(task.jobId); - } catch (Exception ex) { - LOG.error("An error occurred while handling a regional task: {}", ExceptionUtils.asString(ex)); - // TODO communicate regional analysis errors to the backend (in workResults) + // Overwrite with an empty set of results to send back to the backend, allowing it to track job + // progress. This avoids crashing the backend by sending back massive 2 million element travel times + // that have already been written to S3, and throwing exceptions on old backends that can't deal with + // null AccessibilityResults. + oneOriginResult = new OneOriginResult(null, new AccessibilityResult(task), null); + } + + // Accumulate accessibility results, which will be returned to the backend in batches. + // For most regional analyses, this is an accessibility indicator value for one of many origins, + // but for static sites the indicator value is not known, it is computed in the UI. We still want to return + // dummy (zero) accessibility results so the backend is aware of progress through the list of origins. + synchronized (workResults) { + workResults.add(new RegionalWorkResult(oneOriginResult, task)); } + throughputTracker.recordTaskCompletion(task.jobId); } /** @@ -684,11 +593,11 @@ public static void addJsonToGrid ( jsonBlock.accessibility = accessibilityResult.getIntValues(); } jsonBlock.pathSummaries = pathResult == null ? Collections.EMPTY_LIST : pathResult.getPathIterationsForDestination(); - LOG.info("Travel time surface written, appending {}.", jsonBlock); + LOG.debug("Travel time surface written, appending {}.", jsonBlock); // We could do this when setting up the Spark handler, supplying writeValue as the response transformer // But then you also have to handle the case where you are returning raw bytes. JsonUtilities.objectMapper.writeValue(outputStream, jsonBlock); - LOG.info("Done writing"); + LOG.debug("Done writing"); } /** @@ -737,7 +646,7 @@ public List getSomeWork () { // Non-200 response code or a null entity. Something is weird. LOG.error("Unsuccessful polling. HTTP response code: " + response.getStatusLine().getStatusCode()); } catch (Exception e) { - LOG.error("Exception while polling backend for work: {}",ExceptionUtils.asString(e)); + LOG.error("Exception while polling backend for work: {}",ExceptionUtils.stackTraceString(e)); } finally { // We have to properly close any streams so the HTTP connection is released back to the (finite) pool. 
EntityUtils.consumeQuietly(responseEntity); @@ -754,57 +663,25 @@ public List getSomeWork () { /** * Generate and write out metadata describing what's in a directory of static site output. */ - public static void saveStaticSiteMetadata (AnalysisWorkerTask analysisWorkerTask, TransportNetwork network) { + public void saveTauiMetadata (AnalysisWorkerTask analysisWorkerTask, TransportNetwork network) { try { // Save the regional analysis request, giving the UI some context to display the results. // This is the request object sent to the workers to generate these static site regional results. PersistenceBuffer buffer = PersistenceBuffer.serializeAsJson(analysisWorkerTask); - AnalysisWorker.filePersistence.saveStaticSiteData(analysisWorkerTask, "request.json", buffer); + fileStorage.saveTauiData(analysisWorkerTask, "request.json", buffer); // Save non-fatal warnings encountered applying the scenario to the network for this regional analysis. buffer = PersistenceBuffer.serializeAsJson(network.scenarioApplicationWarnings); - AnalysisWorker.filePersistence.saveStaticSiteData(analysisWorkerTask, "warnings.json", buffer); + fileStorage.saveTauiData(analysisWorkerTask, "warnings.json", buffer); // Save transit route data that allows rendering paths with the Transitive library in a separate file. TransitiveNetwork transitiveNetwork = new TransitiveNetwork(network.transitLayer); buffer = PersistenceBuffer.serializeAsJson(transitiveNetwork); - AnalysisWorker.filePersistence.saveStaticSiteData(analysisWorkerTask, "transitive.json", buffer); + fileStorage.saveTauiData(analysisWorkerTask, "transitive.json", buffer); } catch (Exception e) { - LOG.error("Exception saving static metadata: {}", ExceptionUtils.asString(e)); + LOG.error("Exception saving static metadata: {}", ExceptionUtils.stackTraceString(e)); throw new RuntimeException(e); } } - /** - * Requires a worker configuration, which is a Java Properties file with the following - * attributes. - * - * graphs-bucket S3 bucket in which graphs are stored. - * pointsets-bucket S3 bucket in which pointsets are stored - * auto-shutdown Should this worker shut down its machine if it is idle (e.g. on throwaway cloud instances) - * initial-graph-id The graph ID for this worker to load immediately upon startup - */ - public static void main (String[] args) { - LOG.info("Starting R5 Analyst Worker version {}", BackendVersion.instance.version); - LOG.info("R5 git commit is {}", BackendVersion.instance.commit); - LOG.info("R5 git branch is {}", BackendVersion.instance.branch); - - String configFileName = "worker.conf"; - if (args.length > 0) { - configFileName = args[0]; - } - Properties config = new Properties(); - try (InputStream configInputStream = new FileInputStream(new File(configFileName))) { - config.load(configInputStream); - } catch (Exception e) { - LOG.error("Error loading worker configuration, shutting down. " + ExceptionUtils.asString(e)); - return; - } - try { - AnalysisWorker.forConfig(config).run(); - } catch (Exception e) { - LOG.error("Unhandled error in analyst worker, shutting down. 
" + ExceptionUtils.asString(e)); - } - } - } diff --git a/src/main/java/com/conveyal/r5/analyst/cluster/AnalysisWorkerController.java b/src/main/java/com/conveyal/r5/analyst/cluster/AnalysisWorkerController.java index 2874b080e..f89255001 100644 --- a/src/main/java/com/conveyal/r5/analyst/cluster/AnalysisWorkerController.java +++ b/src/main/java/com/conveyal/r5/analyst/cluster/AnalysisWorkerController.java @@ -1,11 +1,9 @@ package com.conveyal.r5.analyst.cluster; -import com.conveyal.r5.OneOriginResult; import com.conveyal.r5.analyst.error.ScenarioApplicationException; import com.conveyal.r5.analyst.error.TaskError; import com.conveyal.r5.common.JsonUtilities; import com.conveyal.r5.util.ExceptionUtils; -import com.google.common.io.LittleEndianDataOutputStream; import org.eclipse.jetty.http.HttpStatus; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -44,39 +42,33 @@ public Object handleSinglePoint (Request request, Response response) throws IOEx TravelTimeSurfaceTask task = JsonUtilities.objectFromRequestBody(request, TravelTimeSurfaceTask.class); // TODO do not return raw binary data from method, return better typed response. // TODO possibly move data preloading to this point, to allow returning different HTTP status codes. - if (task.logRequest){ LOG.info(request.body()); } - try { - try { - byte[] binaryResult = analysisWorker.handleAndSerializeOneSinglePointTask(task); - response.status(HttpStatus.OK_200); - if (task.getFormat().equals(GEOTIFF)) { - response.header("Content-Type", "application/x-geotiff"); - } else { - response.header("Content-Type", "application/octet-stream"); - } - return binaryResult; - } catch (WorkerNotReadyException workerNotReadyException) { - // We're using exceptions for flow control here, which is kind of ugly. Define a ResultOrError class? - if (workerNotReadyException.isError()) { - if (workerNotReadyException.asyncLoaderState.exception instanceof ScenarioApplicationException) { - return reportTaskErrors(response, - ((ScenarioApplicationException)workerNotReadyException.asyncLoaderState.exception).taskErrors); - } else { - return jsonResponse(response, HttpStatus.BAD_REQUEST_400, - ExceptionUtils.asString(workerNotReadyException.asyncLoaderState.exception)); - } + byte[] binaryResult = analysisWorker.handleAndSerializeOneSinglePointTask(task); + response.status(HttpStatus.OK_200); + if (task.getFormat().equals(GEOTIFF)) { + response.header("Content-Type", "application/x-geotiff"); + } else { + response.header("Content-Type", "application/octet-stream"); + } + return binaryResult; + } catch (WorkerNotReadyException workerNotReadyException) { + // We're using exceptions for flow control here, which is kind of ugly. Define a ResultOrError class? + if (workerNotReadyException.isError()) { + Throwable t = workerNotReadyException.asyncLoaderState.throwable; + if (t instanceof ScenarioApplicationException) { + return reportTaskErrors(response, ((ScenarioApplicationException)t).taskErrors); } else { - return jsonResponse(response, HttpStatus.ACCEPTED_202, workerNotReadyException.asyncLoaderState.message); + return reportTaskErrors(response, List.of(new TaskError(t))); } + } else { + return jsonResponse(response, HttpStatus.ACCEPTED_202, workerNotReadyException.asyncLoaderState.message); } - } catch (Exception exception) { - // Handle any uncaught exceptions in any of the above code. - // TODO shouldn't some of these serious uncaught errors be 500s? 
- return jsonResponse(response, HttpStatus.BAD_REQUEST_400, ExceptionUtils.asString(exception)); + } catch (Throwable throwable) { + // Handle any uncaught exceptions in any of the above code. Should some serious uncaught errors be 500s? + return reportTaskErrors(response, List.of(new TaskError(throwable))); } } @@ -95,8 +87,6 @@ private static byte[] jsonResponse (Response response, int httpStatusCode, Strin public static byte[] reportTaskErrors(Response response, List taskErrors) throws IOException { response.status(HttpStatus.BAD_REQUEST_400); response.header("Content-Type", "application/json"); - // TODO expand task errors, this just logs the memory address of the list. - LOG.warn("Reporting errors in response to single-point request:\n" + taskErrors.toString()); ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); addJsonToGrid(byteArrayOutputStream, null, taskErrors, Collections.emptyList(), null); byteArrayOutputStream.close(); diff --git a/src/main/java/com/conveyal/r5/analyst/cluster/AnalysisWorkerTask.java b/src/main/java/com/conveyal/r5/analyst/cluster/AnalysisWorkerTask.java index 9aee862d1..ddfaf14e8 100644 --- a/src/main/java/com/conveyal/r5/analyst/cluster/AnalysisWorkerTask.java +++ b/src/main/java/com/conveyal/r5/analyst/cluster/AnalysisWorkerTask.java @@ -221,8 +221,8 @@ public void loadAndValidateDestinationPointSets (PointSetCache pointSetCache) { checkNotNull(destinationPointSetKeys); int nPointSets = destinationPointSetKeys.length; checkState( - nPointSets > 0 && nPointSets <= 10, - "You must specify at least 1 destination PointSet, but no more than 10." + nPointSets > 0 && nPointSets <= 12, + "You must specify at least 1 destination PointSet, but no more than 12." ); destinationPointSets = new PointSet[nPointSets]; for (int i = 0; i < nPointSets; i++) { diff --git a/src/main/java/com/conveyal/r5/analyst/cluster/EC2Info.java b/src/main/java/com/conveyal/r5/analyst/cluster/EC2Info.java index 5cf63f53a..d8453c838 100644 --- a/src/main/java/com/conveyal/r5/analyst/cluster/EC2Info.java +++ b/src/main/java/com/conveyal/r5/analyst/cluster/EC2Info.java @@ -1,19 +1,12 @@ package com.conveyal.r5.analyst.cluster; -import com.amazonaws.util.EC2MetadataUtils; -import org.apache.http.client.HttpClient; -import org.apache.http.client.config.RequestConfig; -import org.apache.http.client.methods.HttpUriRequest; -import org.apache.http.client.methods.RequestBuilder; -import org.apache.http.impl.client.HttpClients; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; - /** - * API data model class to hold information about the ec2 instance a worker is running on (if any). + * API data model class to hold information about the cloud compute instance a worker is running on (if any). */ +@Deprecated public class EC2Info { private static final Logger LOG = LoggerFactory.getLogger(EC2Info.class); @@ -24,28 +17,7 @@ public class EC2Info { public String machineImage; public String privateIp; - /** Empty constructor, which will be called during deserialization from JSON. */ + // No-arg constructor for deserialization. public EC2Info() { } - /** This will attempt to retrieve metadata about the instance this code is running on. 
*/ - public void fetchMetadata() { - HttpClient httpClient = HttpClients.createDefault(); - RequestConfig requestConfig = RequestConfig.custom().setConnectTimeout(1000).build(); - HttpUriRequest get = RequestBuilder.get().setUri(EC2MetadataUtils.getHostAddressForEC2MetadataService()) - .setConfig(requestConfig).build(); - try { - httpClient.execute(get); - machineImage = EC2MetadataUtils.getAmiId(); - instanceType = EC2MetadataUtils.getInstanceType(); - instanceId = EC2MetadataUtils.getInstanceId(); - // There is a method to get a Region object, but stick to strings for deserialization simplicity. - region = EC2MetadataUtils.getEC2InstanceRegion(); - // IP address fetching should really not be tied to EC2 but for now this lets us get a useable IP. - privateIp = EC2MetadataUtils.getPrivateIpAddress(); - //EC2MetadataUtils.getInstanceInfo(); - } catch (IOException ex) { - LOG.warn("Connection to metadata URL failed, probably not running on EC2."); - } - } - } diff --git a/src/main/java/com/conveyal/r5/analyst/cluster/PathResult.java b/src/main/java/com/conveyal/r5/analyst/cluster/PathResult.java index 783eb5c21..edf8ad057 100644 --- a/src/main/java/com/conveyal/r5/analyst/cluster/PathResult.java +++ b/src/main/java/com/conveyal/r5/analyst/cluster/PathResult.java @@ -155,13 +155,18 @@ public static class PathIterations { this.egress = pathTemplate.stopSequence.egress == null ? null : pathTemplate.stopSequence.egress.toString(); this.transitLegs = pathTemplate.transitLegs(transitLayer); this.iterations = iterations.stream().map(HumanReadableIteration::new).collect(Collectors.toList()); + iterations.forEach(pathTemplate.stopSequence::transferTime); // The transferTime method includes an + // assertion that the transfer time is non-negative, i.e. that the access + egress + wait + ride times of + // a specific iteration do not exceed the total travel time. Perform that sense check here, even though + // the transfer time is not reported to the front-end for the human-readable single-point responses. + // TODO add transferTime to HumanReadableIteration? } } /** * Returns human-readable details of path iterations, for JSON representation (e.g. in the UI console). 
*/ - List getPathIterationsForDestination() { + public List getPathIterationsForDestination() { checkState(iterationsForPathTemplates.length == 1, "Paths were stored for multiple " + "destinations, but only one is being requested"); List detailsForDestination = new ArrayList<>(); diff --git a/src/main/java/com/conveyal/r5/analyst/cluster/PathWriter.java b/src/main/java/com/conveyal/r5/analyst/cluster/PathWriter.java index 0ef751159..1e11c6f21 100644 --- a/src/main/java/com/conveyal/r5/analyst/cluster/PathWriter.java +++ b/src/main/java/com/conveyal/r5/analyst/cluster/PathWriter.java @@ -1,5 +1,6 @@ package com.conveyal.r5.analyst.cluster; +import com.conveyal.analysis.components.WorkerComponents; import com.conveyal.r5.analyst.PersistenceBuffer; import com.conveyal.r5.profile.StreetMode; import com.conveyal.r5.transit.path.Path; @@ -183,8 +184,7 @@ public void finishAndStorePaths () { throw new RuntimeException("IO exception while writing path grid.", e); } persistenceBuffer.doneWriting(); - String pathFileName = task.taskId + "_paths.dat"; - AnalysisWorker.filePersistence.saveStaticSiteData(task, pathFileName, persistenceBuffer); + WorkerComponents.fileStorage.saveTauiData(task, task.taskId + "_paths.dat", persistenceBuffer); } } diff --git a/src/main/java/com/conveyal/r5/analyst/cluster/RegionalWorkResult.java b/src/main/java/com/conveyal/r5/analyst/cluster/RegionalWorkResult.java index eb0134555..39a650d3e 100644 --- a/src/main/java/com/conveyal/r5/analyst/cluster/RegionalWorkResult.java +++ b/src/main/java/com/conveyal/r5/analyst/cluster/RegionalWorkResult.java @@ -1,6 +1,7 @@ package com.conveyal.r5.analyst.cluster; import com.conveyal.r5.OneOriginResult; +import com.conveyal.r5.util.ExceptionUtils; import java.util.ArrayList; @@ -36,6 +37,14 @@ public class RegionalWorkResult { */ public int[][][] accessibilityValues; + /** + * If this field is non-null, the worker is reporting an error that compromises the quality of the result at this + * origin point, and potentially for the entire regional analysis. Put into a Set on the backend since all workers + * will probably report the same problem, but we may want to tolerate errors on a small number of origin points to + * not waste computation. On the other hand, any error here implies incorrect inputs, configuration, or software. + */ + public String error; + /** Trivial no-arg constructor for deserialization. Private to prevent usage outside deserialization. */ private RegionalWorkResult() { } @@ -50,8 +59,14 @@ public RegionalWorkResult(OneOriginResult result, RegionalTask task) { this.travelTimeValues = result.travelTimes == null ? null : result.travelTimes.values; this.accessibilityValues = result.accessibility == null ? null : result.accessibility.getIntValues(); this.pathResult = result.paths == null ? null : result.paths.summarizeIterations(PathResult.Stat.MINIMUM); + // TODO checkTravelTimeInvariants, checkAccessibilityInvariants to verify that values are monotonically increasing } - // TODO checkTravelTimeInvariants, checkAccessibilityInvariants to verify that values are monotonically increasing + /** Constructor used when results for this origin are considered unusable due to an unhandled error.
*/ + public RegionalWorkResult(Throwable t, RegionalTask task) { + this.jobId = task.jobId; + this.taskId = task.taskId; + this.error = ExceptionUtils.shortAndLongString(t); + } } diff --git a/src/main/java/com/conveyal/r5/analyst/cluster/WorkerStatus.java b/src/main/java/com/conveyal/r5/analyst/cluster/WorkerStatus.java index a701f473d..7d002fa9a 100644 --- a/src/main/java/com/conveyal/r5/analyst/cluster/WorkerStatus.java +++ b/src/main/java/com/conveyal/r5/analyst/cluster/WorkerStatus.java @@ -1,6 +1,6 @@ package com.conveyal.r5.analyst.cluster; -import com.conveyal.analysis.BackendVersion; +import com.conveyal.r5.SoftwareVersion; import com.conveyal.r5.analyst.WorkerCategory; import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonUnwrapped; @@ -57,7 +57,7 @@ public WorkerStatus() { } public WorkerStatus (AnalysisWorker worker) { workerName = "R5"; - workerVersion = BackendVersion.instance.version; + workerVersion = SoftwareVersion.instance.version; workerId = worker.machineId; // TODO overwrite with cloud provider (EC2) machine ID in a generic way // Eventually we'll want to report all networks the worker has loaded, to give the backend hints about what kind @@ -92,8 +92,8 @@ public WorkerStatus (AnalysisWorker worker) { memoryTotal = runtime.totalMemory(); memoryFree = runtime.freeMemory(); - if (ec2.privateIp != null) { - // Give priority to the private IP address if running on EC2 + if (ec2 != null && ec2.privateIp != null) { + // Give priority to the private IP address if running in cloud compute environment. ipAddress = ec2.privateIp; } else { // Get whatever is the default IP address diff --git a/src/main/java/com/conveyal/r5/analyst/error/TaskError.java b/src/main/java/com/conveyal/r5/analyst/error/TaskError.java index c24a65ec1..4019fe68e 100644 --- a/src/main/java/com/conveyal/r5/analyst/error/TaskError.java +++ b/src/main/java/com/conveyal/r5/analyst/error/TaskError.java @@ -8,10 +8,10 @@ import java.util.List; /** - * This is an API model object for reporting a single error or warning that occurred on a worker back to the client via the broker. - * The most common errors a user will see are problems applying scenario modifications, so this provides some fields - * to clarify what modification caused the error. - * But it can also wrap any old Exception to report more unexpected kinds of errors. + * This is an API model object for reporting a single error or warning that occurred on a worker back to the UI via + * the backend. The most common errors a user will see are problems applying scenario modifications, so this provides + * some fields to clarify what modification caused the error, if any. But it can also contain messages from any old + * Exception (or other Throwable such as an Error) to report more unexpected kinds of errors. */ public class TaskError { @@ -20,10 +20,11 @@ public class TaskError { public final List messages = new ArrayList<>(); /** This constructor is used when an unexpected, unhandled error is encountered. 
*/ - public TaskError(Exception ex) { + public TaskError(Throwable throwable) { this.modificationId = null; - this.title = "Unhandled error: " + ex.toString(); - this.messages.add(ExceptionUtils.asString(ex)); + this.title = "Unhandled error: " + throwable.getClass().getSimpleName(); + this.messages.add(ExceptionUtils.shortCauseString(throwable)); + this.messages.add(ExceptionUtils.stackTraceString(throwable)); } /** diff --git a/src/main/java/com/conveyal/r5/analyst/fare/ParetoServer.java b/src/main/java/com/conveyal/r5/analyst/fare/ParetoServer.java index dca3b84ce..75d2e6d6a 100644 --- a/src/main/java/com/conveyal/r5/analyst/fare/ParetoServer.java +++ b/src/main/java/com/conveyal/r5/analyst/fare/ParetoServer.java @@ -1,6 +1,6 @@ package com.conveyal.r5.analyst.fare; -import com.conveyal.analysis.BackendVersion; +import com.conveyal.r5.SoftwareVersion; import com.conveyal.r5.api.util.LegMode; import com.conveyal.r5.common.GeometryUtils; import com.conveyal.r5.common.JsonUtilities; @@ -13,7 +13,6 @@ import gnu.trove.iterator.TIntObjectIterator; import gnu.trove.map.TIntIntMap; import org.locationtech.jts.geom.Coordinate; -import org.locationtech.jts.geom.CoordinateXY; import org.locationtech.jts.geom.LineString; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -21,7 +20,6 @@ import spark.Response; import java.io.IOException; -import java.time.LocalDate; import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; import java.util.*; @@ -125,7 +123,7 @@ public static final class ParetoReturn { public final Collection trips; public final long computeTimeMillis; /** save backend version in JSON output - useful for JSON that's being pushed to fareto-examples */ - public BackendVersion backendVersion = BackendVersion.instance; + public SoftwareVersion backendVersion = SoftwareVersion.instance; public String generationTime = LocalDateTime.now().format(DateTimeFormatter.ISO_DATE_TIME); public ParetoReturn(ProfileRequest request, Collection trips, long computeTimeMillis) { diff --git a/src/main/java/com/conveyal/r5/analyst/progress/ApiTask.java b/src/main/java/com/conveyal/r5/analyst/progress/ApiTask.java new file mode 100644 index 000000000..ef7934328 --- /dev/null +++ b/src/main/java/com/conveyal/r5/analyst/progress/ApiTask.java @@ -0,0 +1,15 @@ +package com.conveyal.r5.analyst.progress; + +import java.util.UUID; + +/** API model for tasks in an activity response. Times are durations rather than absolute to counter clock drift. 
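
The TaskError constructor above now records the Throwable's class name, a short cause summary, and the full stack trace; the RegionalWorkResult constructor a few hunks earlier does the analogous thing for regional origins. As a rough sketch of how the regional side might use it (a hypothetical catch block in the worker's regional task handler, not shown in this diff; only the RegionalWorkResult constructor and ExceptionUtils.shortAndLongString come from the changes above):

    try {
        // ... compute travel times and accessibility for this origin ...
    } catch (Throwable t) {
        // The error-carrying constructor copies jobId and taskId and stores
        // ExceptionUtils.shortAndLongString(t) in the new error field, so the backend can
        // mark this origin as failed while still tracking progress through the job.
        synchronized (workResults) {
            workResults.add(new RegionalWorkResult(t, task));
        }
    }
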
*/ +public class ApiTask { + public UUID id; + public String title; + public String detail; + public Task.State state; + public int percentComplete; + public int secondsActive; + public int secondsComplete; + public WorkProduct workProduct; +} diff --git a/src/main/java/com/conveyal/r5/analyst/progress/NetworkPreloaderProgressListener.java b/src/main/java/com/conveyal/r5/analyst/progress/NetworkPreloaderProgressListener.java index d2071d85a..243f06623 100644 --- a/src/main/java/com/conveyal/r5/analyst/progress/NetworkPreloaderProgressListener.java +++ b/src/main/java/com/conveyal/r5/analyst/progress/NetworkPreloaderProgressListener.java @@ -60,6 +60,12 @@ public synchronized void increment () { } } + @Override + public synchronized void increment (int n) { + throw new UnsupportedOperationException(); + // TODO combined progress updating with case where n == 1 + } + private int getPercentComplete () { return (currentElement * 100) / totalElements; } diff --git a/src/main/java/com/conveyal/r5/analyst/progress/NoopProgressListener.java b/src/main/java/com/conveyal/r5/analyst/progress/NoopProgressListener.java index 2075ebf4a..ea3956875 100644 --- a/src/main/java/com/conveyal/r5/analyst/progress/NoopProgressListener.java +++ b/src/main/java/com/conveyal/r5/analyst/progress/NoopProgressListener.java @@ -1,15 +1,18 @@ package com.conveyal.r5.analyst.progress; +/** + * For classes that support progress listeners but may not always use one, to avoid littering the code with null checks + * we have this trivial ProgressListener implementation that does nothing. + */ public class NoopProgressListener implements ProgressListener { @Override - public void beginTask(String description, int totalElements) { - - } + public void beginTask(String description, int totalElements) { } @Override - public void increment() { + public void increment() { } - } + @Override + public void increment (int n) { } } diff --git a/src/main/java/com/conveyal/r5/analyst/progress/ProgressInputStream.java b/src/main/java/com/conveyal/r5/analyst/progress/ProgressInputStream.java new file mode 100644 index 000000000..bc02502d2 --- /dev/null +++ b/src/main/java/com/conveyal/r5/analyst/progress/ProgressInputStream.java @@ -0,0 +1,57 @@ +package com.conveyal.r5.analyst.progress; + +import org.apache.commons.fileupload.FileItem; +import org.apache.commons.io.input.ProxyInputStream; + +import java.io.IOException; +import java.io.InputStream; + +import static com.google.common.base.Preconditions.checkArgument; +import static org.apache.commons.io.IOUtils.EOF; + +/** + * This will report progress as the total number of bytes that have passed through the stream, like CountingInputStream. + * This can exceed 100% of the file size if the caller uses mark and reset. The progressListener should be + * pre-configured with the total number of bytes expected and a detail message using ProgressListener::beginTask. + * The static method forFileItem() demonstrates usage when reading from a file of known length. 
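
Since the class comment above points to forFileItem() as the intended usage pattern, here is a minimal usage sketch for the general constructor (a hypothetical helper, not part of this change; only ProgressInputStream, ProgressListener.beginTask, and the per-read increment behavior come from the diff):

    /** Read an arbitrary file while reporting progress in bytes to the given listener. */
    static byte[] readWithProgress (File file, ProgressListener listener) throws IOException {
        listener.beginTask("Reading " + file.getName(), (int) file.length());
        try (InputStream in = new ProgressInputStream(listener, new FileInputStream(file))) {
            return in.readAllBytes(); // afterRead() increments the listener as each chunk passes through
        }
    }
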
+ */ +public class ProgressInputStream extends ProxyInputStream { + + private final ProgressListener progressListener; + + public ProgressInputStream (ProgressListener progressListener, InputStream proxy) { + super(proxy); + this.progressListener = progressListener; + } + + @Override + protected synchronized void afterRead (final int n) { + if (n != EOF) { + progressListener.increment(n); + } + } + + @Override + public synchronized long skip (final long length) throws IOException { + final long skippedBytes = super.skip(length); + progressListener.increment((int) skippedBytes); + return skippedBytes; + } + + /** + * Given an uploaded file, report progress on reading it. + * Incrementing the progress seems to introduce some inefficiency when performing unbuffered small reads, such as + * calls to InputStream.read() which are used by DataInputStream to read numbers. + * TODO wrap in buffered input stream to reduce small read calls, or tune to only report once per percentage? + */ + public static ProgressInputStream forFileItem (FileItem fileItem, ProgressListener progressListener) { + try { + checkArgument(fileItem.getSize() < Integer.MAX_VALUE); + progressListener.beginTask("Reading file " + fileItem.getName(), (int)(fileItem.getSize())); + return new ProgressInputStream(progressListener, fileItem.getInputStream()); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + +} diff --git a/src/main/java/com/conveyal/r5/analyst/progress/ProgressListener.java b/src/main/java/com/conveyal/r5/analyst/progress/ProgressListener.java index bbbd0dd15..7fbb67b10 100644 --- a/src/main/java/com/conveyal/r5/analyst/progress/ProgressListener.java +++ b/src/main/java/com/conveyal/r5/analyst/progress/ProgressListener.java @@ -2,6 +2,7 @@ /** * This interface provides simple callbacks to allow long running, asynchronous operations to report on their progress. + * Take care that all method implementations are very fast as the increment methods might be called in tight loops. */ public interface ProgressListener { @@ -16,9 +17,12 @@ public interface ProgressListener { */ void beginTask(String description, int totalElements); - /** - * Call this method to report that one unit of work has been performed. - */ - void increment(); + /** Call this method to report that N units of work have been performed. */ + void increment(int n); + + /** Call this method to report that one unit of work has been performed. */ + default void increment () { + increment(1); + } } diff --git a/src/main/java/com/conveyal/r5/analyst/progress/Task.java b/src/main/java/com/conveyal/r5/analyst/progress/Task.java index 1b913da6c..949ad2d22 100644 --- a/src/main/java/com/conveyal/r5/analyst/progress/Task.java +++ b/src/main/java/com/conveyal/r5/analyst/progress/Task.java @@ -1,13 +1,19 @@ package com.conveyal.r5.analyst.progress; -import com.fasterxml.jackson.annotation.JsonIgnore; +import com.conveyal.analysis.UserPermissions; +import com.conveyal.analysis.models.Model; +import com.conveyal.r5.util.ExceptionUtils; +import java.time.Duration; import java.time.Instant; import java.util.ArrayList; import java.util.List; +import java.util.UUID; /** - * This is a draft for a more advanced task progress system. It is not yet complete or functional. + * This is a draft for a more advanced task progress system. It is not yet complete. + * Task is intended for background tasks whose progress the end user should be aware of, such as file uploads. 
+ * It should not be used for automatic internal actions (such as Events) which would clutter a user's active task list. * * A Task (or some interface that it implements) could be used by the AsyncLoader to track progress. Together with some * AsyncLoader functionality it will be a bit like a Future with progress reporting. Use of AsyncLoader could then be * @@ -18,19 +24,26 @@ * Or an expected number of "relative work units" if new sub-tasks will be added on the fly, so absolutes are not known. * Tasks should implement ProgressListener functionality, and bubble up progress to parent tasks. * - * This class serves simultaneously as an internal domain object for tracking task execution, and an external API model - * object for communicating relevant information to the web UI (when serialized to JSON). + * TODO rename to BackgroundTask */ public class Task implements Runnable, ProgressListener { + public static enum State { + QUEUED, ACTIVE, DONE, ERROR + } + + /** Every Task has an ID so the UI can update tasks it already knows about with new information after polling. */ + public final UUID id = UUID.randomUUID(); + // User and group are only relevant on the backend. On workers, we want to show network or cost table build progress // to everyone in the organization, even if someone else's action kicked off the process. + // We could also store the full UserPermissions object instead of breaking it into fields. - private String user; + public String user; private String group; - // Timestamps to track execution time and wait time. + // Timestamps to track elapsed execution time and wait time. private Instant enqueued; @@ -42,12 +55,18 @@ public class Task implements Runnable, ProgressListener { private int logFrequency = 0; + /** An unchanging, human-readable name for this task. */ + public final String title; + + /** Text describing the current work, which changes over the course of task processing. */ public String description; public int totalWorkUnits; public int currentWorkUnit; + public State state; + /** To be on the safe side, all tasks are considered heavyweight unless we explicitly set them to be lightweight. */ private boolean isHeavy = true; @@ -59,14 +78,11 @@ public class Task implements Runnable, ProgressListener { // and only track progress on the slow ones. public int nWorkUnitsSkipped; - public boolean errored; - private Throwable throwable; /** How often (every how many work units) this task will log progress on the backend. */ private int loggingFrequency; - @JsonIgnore private Task parentTask; // Maybe TObjectIntMap to store subtask weights. @@ -76,39 +92,56 @@ public class Task implements Runnable, ProgressListener { // to prevent simultaneous execution of sequential tasks. public Task nextTask; + /** Private constructor to encourage use of fluent methods. */ + private Task (String title) { + this.title = title; + // It's not strictly accurate to mark the task enqueued before we finish constructing it and actually submit it, + // but this avoids needing to expose that state transition and enforce calling it later. + markEnqueued(); + } + public double getPercentComplete() { return (currentWorkUnit * 100D) / totalWorkUnits; } List subtasks = new ArrayList<>(); - /** - * Private constructor to encourage use of fluent methods.
- */ - private Task () {} + // TODO find a better way to set this than directly inside a closure + public WorkProduct workProduct; public void addSubtask (Task subtask) { } + private void markEnqueued () { + enqueued = Instant.now(); + description = "Waiting..."; + state = State.QUEUED; + } + + private void markActive () { + began = Instant.now(); + this.state = State.ACTIVE; + } + // Because of the subtask / next task mechanism, we don't let the actions mark tasks complete. // This is another reason to pass the actions only a limited progress reporting interface. private void markComplete () { - this.completed = Instant.now(); + completed = Instant.now(); + // Force progress bars to display 100% whenever an action completes successfully. + currentWorkUnit = totalWorkUnits; + description = "Completed."; + state = State.DONE; } - public boolean isHeavy () { - return this.isHeavy; + private void markError (Throwable throwable) { + completed = Instant.now(); + description = ExceptionUtils.shortCauseString(throwable); + state = State.ERROR; } - /** - * Abort the current task and cancel any subtasks or following tasks. - * @param throwable the reason for aborting this task. - */ - public void abort (Throwable throwable) { - this.throwable = throwable; - // LOG? - this.markComplete(); + public boolean isHeavy () { + return this.isHeavy; } protected void bubbleUpProgress() { @@ -142,53 +175,72 @@ public void run () { // The main action is run before the subtasks. It may not make sense progress reporting-wise for tasks to have // both their own actions and subtasks with their own actions. Perhaps series of tasks are a special kind of // action, which should encapsulate the bubble-up progress computation. - this.action.action(this); - for (Task subtask : subtasks) { - subtask.run(); + markActive(); + try { + this.action.action(this); + // for (Task subtask : subtasks) subtask.run(); + markComplete(); + } catch (Throwable t) { + // TODO Store error in work product and write product to Mongo uniformly + markError(t); } - this.markComplete(); - this.nextTask.run(); } @Override public void beginTask(String description, int totalElements) { - // Just using an existing interface that may eventually be modified to not include this method. - throw new UnsupportedOperationException(); + // In the absence of subtasks we can call this repeatedly on the same task, which will cause the UI progress + // bar to reset to zero at each stage, while keeping the same top level title. + this.description = description; + this.totalWorkUnits = totalElements; + this.currentWorkUnit = 0; } @Override - public void increment () { - this.currentWorkUnit += 1; - // Occasionally bubble up progress to parent tasks, log to console, etc. - if (this.bubbleUpdateFrequency > 0 && (currentWorkUnit % bubbleUpdateFrequency == 0)) { - parentTask.bubbleUpProgress(); - } - if (this.logFrequency > 0 && (currentWorkUnit % logFrequency == 0)) { - // LOG.info... + public void increment (int n) { + currentWorkUnit += n; + if (currentWorkUnit >= totalWorkUnits || currentWorkUnit < 0) { + currentWorkUnit = totalWorkUnits - 1; } } + // Methods for reporting elapsed times over API + + public Duration durationInQueue () { + Instant endTime = (began == null) ? Instant.now() : began; + return Duration.between(enqueued, endTime); + } + + public Duration durationExecuting () { + if (began == null) return Duration.ZERO; + Instant endTime = (completed == null) ? 
Instant.now() : completed; + return Duration.between(began, endTime); + } + + public Duration durationComplete () { + if (completed == null) return Duration.ZERO; + return Duration.between(completed, Instant.now()); + } + // FLUENT METHODS FOR CONFIGURING - // Really we should make a User object that combines user and group fields. - public Task forUser (String user, String group) { + /** Call this static factory to begin building a task. */ + public static Task create (String title) { + Task task = new Task(title); + return task; + } + + public Task forUser (String user) { this.user = user; - this.group = group; return this; } - public Task withDescription (String description) { - this.description = description; + public Task inGroup (String group) { + this.group = group; return this; } - /** - * We may actually want the TaskAction to set the total work units via its restricted ProgressReporter interface. - */ - public Task withTotalWorkUnits (int totalWorkUnits) { - this.totalWorkUnits = totalWorkUnits; - this.bubbleUpdateFrequency = totalWorkUnits / 100; - return this; + public Task forUser (UserPermissions userPermissions) { + return this.forUser(userPermissions.email).inGroup(userPermissions.accessGroup); } public Task withAction (TaskAction action) { @@ -196,13 +248,37 @@ public Task withAction (TaskAction action) { return this; } + // We can't return the WorkProduct from TaskAction, that would be disrupted by throwing exceptions. + // It is also awkward to make a method to set it on ProgressListener - it's not really progress. + // So we set it directly on the task before submitting it. Requires pre-setting (not necessarily storing) Model._id. + public Task withWorkProduct (Model model) { + this.workProduct = WorkProduct.forModel(model); + return this; + } + + /** Ideally we'd just pass in a Model, but currently we have two base classes, also see WorkProduct.forModel(). */ + public Task withWorkProduct (WorkProductType type, String id, String region) { + this.workProduct = new WorkProduct(type, id, region); + return this; + } + public Task setHeavy (boolean heavy) { this.isHeavy = heavy; return this; } - public static Task newTask () { - return new Task(); + /** Convert a single internal Task object to its representation for JSON serialization and return to the UI. */ + public ApiTask toApiTask () { + ApiTask apiTask = new ApiTask(); + apiTask.id = id; // This can be the same as the workProduct ID except for cases with no Mongo document + apiTask.title = title; + apiTask.detail = description; + apiTask.state = state; + apiTask.percentComplete = (int) getPercentComplete(); + apiTask.secondsActive = (int) durationExecuting().getSeconds(); + apiTask.secondsComplete = (int) durationComplete().getSeconds(); + apiTask.workProduct = workProduct; + return apiTask; } } diff --git a/src/main/java/com/conveyal/r5/analyst/progress/TaskAction.java b/src/main/java/com/conveyal/r5/analyst/progress/TaskAction.java index da576d525..9d0bb0d8c 100644 --- a/src/main/java/com/conveyal/r5/analyst/progress/TaskAction.java +++ b/src/main/java/com/conveyal/r5/analyst/progress/TaskAction.java @@ -13,6 +13,6 @@ public interface TaskAction { * The parameter is a simpler interface of Task that only allows progress reporting, to encapsulate actions and * prevent them from seeing or modifying the task hierarchy that triggers and manages them. 
*/ - public void action (ProgressListener progressListener); + public void action (ProgressListener progressListener) throws Exception; } diff --git a/src/main/java/com/conveyal/r5/analyst/progress/TaskExecutor.java b/src/main/java/com/conveyal/r5/analyst/progress/TaskExecutor.java deleted file mode 100644 index 7ccdc9aa9..000000000 --- a/src/main/java/com/conveyal/r5/analyst/progress/TaskExecutor.java +++ /dev/null @@ -1,66 +0,0 @@ -package com.conveyal.r5.analyst.progress; - -import com.google.common.collect.Lists; - -import java.util.List; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; - -/** - * This could replace our current bare ExecutorServices class. This can remain all-static for simplicity at first. - * Like many other things in Analysis it should probably become a singleton instance (though this is a pure question of - * style for the moment, as we never have more than one instance running in the same JVM). - * - * This executor is in R5 rather than analysis-backend, so does not now have access to AnalysisServerConfig.lightThreads - * and AnalysisServerConfig.heavyThreads config options. Pool sizes would need to be initialized manually at startup. - * There might not be any reason for it to be in R5 if it's entirely managing backend tasks. But we do expect to at - * least manage and report on progress building networks and distance tables, which needs to happen in a specific - * versioned worker instance. - * - * This is moving in the direction of having a single unified task management and reporting system across the backend - * and workers. It could be interesting to gather task status from the whole cluster of workers and merge them together - * into one view. This could conceivably even include regional analyses and chunks of work for regional analyses on - * workers. But even if such merging doesn't occur, it will be convenient to always report progress from backend and - * workers to the UI in the same data structures. - */ -public abstract class TaskExecutor { - - private static final ExecutorService light = Executors.newFixedThreadPool(10); - private static final ExecutorService heavy = Executors.newFixedThreadPool(2); - - public static void enqueue (Task task) { - task.validate(); - if (task.isHeavy()) { - heavy.submit(task); - } else { - light.submit(task); - } - } - - /** - * Just demonstrating how this would be used. - */ - public static void example () { - TaskExecutor.enqueue(Task.newTask() - .forUser("abyrd@conveyal.com", "conveyal") - .withDescription("Process some complicated things") - .withTotalWorkUnits(1024) - .withAction((progressListener -> { - double sum = 0; - for (int i = 0; i < 1024; i++) { - sum += Math.sqrt(i); - progressListener.increment(); - } - })) - ); - } - - /** - * @return a hierarchical structure of all currently executing tasks, for serialization and transmission to the UI. - */ - public static List getAllTasksForUI () { - // Hmm, we need to get all the tasks back out of the Executor once they're seen as Runnables... 
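
The deleted example above was written against the old newTask()/withDescription()/withTotalWorkUnits() chain. A roughly equivalent sketch using the new fluent API from Task.java (illustration only; how and where the task is submitted for execution is outside this diff):

    Task task = Task.create("Process some complicated things")
            .forUser("abyrd@conveyal.com")
            .inGroup("conveyal")
            .setHeavy(true)
            .withAction(progressListener -> {
                // beginTask() now sets the description and total work units from inside the action.
                progressListener.beginTask("Computing square roots", 1024);
                double sum = 0;
                for (int i = 0; i < 1024; i++) {
                    sum += Math.sqrt(i);
                    progressListener.increment();
                }
            });
    // Submission to a light or heavy executor is not shown here.
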
- return Lists.newArrayList(); - } - -} diff --git a/src/main/java/com/conveyal/r5/analyst/progress/WorkProduct.java b/src/main/java/com/conveyal/r5/analyst/progress/WorkProduct.java new file mode 100644 index 000000000..beff7e3e6 --- /dev/null +++ b/src/main/java/com/conveyal/r5/analyst/progress/WorkProduct.java @@ -0,0 +1,29 @@ +package com.conveyal.r5.analyst.progress; + +import com.conveyal.analysis.controllers.UserActivityController; +import com.conveyal.analysis.models.Model; + +/** + * A unique identifier for the final product of a single task action. Currently this serves as both an + * internal data structure and an API model class, which should be harmless as it's an immutable data class. + * The id is unique within the type, so the regionId is redundant information, but facilitates prefetching in the UI. + */ +public class WorkProduct { + + public final WorkProductType type; + public final String id; + public final String regionId; + + public WorkProduct (WorkProductType type, String id, String regionId) { + this.type = type; + this.id = id; + this.regionId = regionId; + } + + // FIXME Not all Models have a regionId. Rather than pass that in as a String, refine the programming API. + public static WorkProduct forModel (Model model) { + WorkProduct product = new WorkProduct(WorkProductType.forModel(model), model._id, null); + return product; + } + +} diff --git a/src/main/java/com/conveyal/r5/analyst/progress/WorkProductType.java b/src/main/java/com/conveyal/r5/analyst/progress/WorkProductType.java new file mode 100644 index 000000000..bb71ba955 --- /dev/null +++ b/src/main/java/com/conveyal/r5/analyst/progress/WorkProductType.java @@ -0,0 +1,25 @@ +package com.conveyal.r5.analyst.progress; + +import com.conveyal.analysis.controllers.UserActivityController; +import com.conveyal.analysis.models.Bundle; +import com.conveyal.analysis.models.Model; +import com.conveyal.analysis.models.OpportunityDataset; +import com.conveyal.analysis.models.RegionalAnalysis; + +/** + * There is some implicit and unenforced correspondence between these values and those in FileCategory, as well + * as the tables in Mongo. We should probably clearly state and enforce this parallelism. No background work is + * done creating regions, projects, or modifications so they don't need to be represented here.
+ */ +public enum WorkProductType { + + BUNDLE, REGIONAL_ANALYSIS, AGGREGATION_AREA, OPPORTUNITY_DATASET; + + // Currently we have two base classes for db objects so may need to use Object instead of BaseModel parameter + public static WorkProductType forModel (Model model) { + if (model instanceof Bundle) return BUNDLE; + if (model instanceof OpportunityDataset) return OPPORTUNITY_DATASET; + if (model instanceof RegionalAnalysis) return REGIONAL_ANALYSIS; + throw new IllegalArgumentException("Unrecognized work product type."); + } +} diff --git a/src/main/java/com/conveyal/r5/analyst/scenario/IndexedPolygonCollection.java b/src/main/java/com/conveyal/r5/analyst/scenario/IndexedPolygonCollection.java index 0e82bb542..339ef1149 100644 --- a/src/main/java/com/conveyal/r5/analyst/scenario/IndexedPolygonCollection.java +++ b/src/main/java/com/conveyal/r5/analyst/scenario/IndexedPolygonCollection.java @@ -1,7 +1,6 @@ package com.conveyal.r5.analyst.scenario; -import com.conveyal.r5.analyst.FileCategory; -import com.conveyal.r5.analyst.cluster.AnalysisWorker; +import com.conveyal.analysis.components.WorkerComponents; import org.geotools.feature.FeatureCollection; import org.geotools.feature.FeatureIterator; import org.geotools.geojson.feature.FeatureJSON; @@ -22,7 +21,8 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.zip.GZIPInputStream; + +import static com.conveyal.file.FileCategory.POLYGONS; /** * This is an abstraction for the polygons used to configure the road congestion modification type and the ride hailing @@ -104,14 +104,9 @@ public IndexedPolygonCollection ( } public void loadFromS3GeoJson() throws Exception { - InputStream s3InputStream = AnalysisWorker.filePersistence.getData(FileCategory.POLYGON, polygonLayer); - // To test on local files: - //InputStream s3InputStream = new FileInputStream("/Users/abyrd/" + polygonLayer); - if (polygonLayer.endsWith(".gz")) { - s3InputStream = new GZIPInputStream(s3InputStream); - } + InputStream polygonInputStream = WorkerComponents.fileStorage.getInputStream(POLYGONS, polygonLayer); FeatureJSON featureJSON = new FeatureJSON(); - FeatureCollection featureCollection = featureJSON.readFeatureCollection(s3InputStream); + FeatureCollection featureCollection = featureJSON.readFeatureCollection(polygonInputStream); LOG.info("Validating features and creating spatial index..."); FeatureType featureType = featureCollection.getSchema(); CoordinateReferenceSystem crs = featureType.getCoordinateReferenceSystem(); diff --git a/src/main/java/com/conveyal/r5/analyst/scenario/PickupDelay.java b/src/main/java/com/conveyal/r5/analyst/scenario/PickupDelay.java index 720165e39..d04169c26 100644 --- a/src/main/java/com/conveyal/r5/analyst/scenario/PickupDelay.java +++ b/src/main/java/com/conveyal/r5/analyst/scenario/PickupDelay.java @@ -171,7 +171,7 @@ public boolean resolve (TransportNetwork network) { } } catch (Exception e) { // Record any unexpected errors to bubble up to the UI. 
- errors.add(ExceptionUtils.asString(e)); + errors.add(ExceptionUtils.stackTraceString(e)); } return errors.size() > 0; } diff --git a/src/main/java/com/conveyal/r5/analyst/scenario/RoadCongestion.java b/src/main/java/com/conveyal/r5/analyst/scenario/RoadCongestion.java index ada0069aa..f113a0565 100644 --- a/src/main/java/com/conveyal/r5/analyst/scenario/RoadCongestion.java +++ b/src/main/java/com/conveyal/r5/analyst/scenario/RoadCongestion.java @@ -1,6 +1,7 @@ package com.conveyal.r5.analyst.scenario; -import com.conveyal.r5.analyst.FileCategory; +import com.conveyal.analysis.components.WorkerComponents; +import com.conveyal.file.FileCategory; import com.conveyal.r5.analyst.cluster.AnalysisWorker; import com.conveyal.r5.streets.EdgeStore; import com.conveyal.r5.transit.TransportNetwork; @@ -28,6 +29,8 @@ import java.util.List; import java.util.zip.GZIPInputStream; +import static com.conveyal.file.FileCategory.POLYGONS; + /** * To simulate traffic congestion, apply a slow-down (or speed-up) factor to roads, according to attributes of polygon * features. This replaces the array of edge speeds in a scenario copy of the street layer's edge store. @@ -108,16 +111,9 @@ public boolean resolve (TransportNetwork network) { // Polygon should only need to be fetched once when the scenario is applied, then the resulting network is cached. // this.features = polygonLayerCache.getPolygonFeatureCollection(this.polygonLayer); // Note: Newer JTS now has GeoJsonReader - try { - InputStream s3InputStream = AnalysisWorker.filePersistence.getData(FileCategory.POLYGON, polygonLayer); - // To test on local files: - //InputStream s3InputStream = new FileInputStream("/Users/abyrd/" + polygonLayer); - // TODO handle gzip decompression in FilePersistence base class. - if (polygonLayer.endsWith(".gz")) { - s3InputStream = new GZIPInputStream(s3InputStream); - } + try (InputStream inputStream = WorkerComponents.fileStorage.getInputStream(POLYGONS, polygonLayer)) { FeatureJSON featureJSON = new FeatureJSON(); - FeatureCollection featureCollection = featureJSON.readFeatureCollection(s3InputStream); + FeatureCollection featureCollection = featureJSON.readFeatureCollection(inputStream); LOG.info("Validating features and creating spatial index..."); polygonSpatialIndex = new STRtree(); FeatureType featureType = featureCollection.getSchema(); @@ -180,7 +176,7 @@ public boolean resolve (TransportNetwork network) { logUpdatedEdgeCounts = false; } } catch (Exception e) { - errors.add(ExceptionUtils.asString(e)); + errors.add(ExceptionUtils.stackTraceString(e)); } return errors.size() > 0; } diff --git a/src/main/java/com/conveyal/r5/kryo/KryoNetworkSerializer.java b/src/main/java/com/conveyal/r5/kryo/KryoNetworkSerializer.java index 0b0e33869..2f6b63d57 100644 --- a/src/main/java/com/conveyal/r5/kryo/KryoNetworkSerializer.java +++ b/src/main/java/com/conveyal/r5/kryo/KryoNetworkSerializer.java @@ -1,6 +1,6 @@ package com.conveyal.r5.kryo; -import com.conveyal.analysis.BackendVersion; +import com.conveyal.r5.SoftwareVersion; import com.conveyal.kryo.InstanceCountingClassResolver; import com.conveyal.kryo.TIntArrayListSerializer; import com.conveyal.kryo.TIntIntHashMapSerializer; @@ -38,6 +38,12 @@ public abstract class KryoNetworkSerializer { private static final Logger LOG = LoggerFactory.getLogger(KryoNetworkSerializer.class); + /** + * This string should be changed to a new value each time the network storage format changes. + * I considered using an ISO date string but that could get confusing when seen in filenames. 
+ */ + public static final String NETWORK_FORMAT_VERSION = "nv1"; + public static final byte[] HEADER = "R5NETWORK".getBytes(); /** Set this to true to count instances and print a report including which serializer is handling each class. */ @@ -92,8 +98,8 @@ public static void write (TransportNetwork network, File file) throws IOExceptio Output output = new Output(new FileOutputStream(file)); Kryo kryo = makeKryo(); output.write(HEADER); - kryo.writeObject(output, BackendVersion.instance.version); - kryo.writeObject(output, BackendVersion.instance.commit); + kryo.writeObject(output, NETWORK_FORMAT_VERSION); + kryo.writeObject(output, SoftwareVersion.instance.commit); kryo.writeObject(output, network); output.close(); LOG.info("Done writing."); @@ -115,12 +121,13 @@ public static TransportNetwork read (File file) throws Exception { if (!Arrays.equals(HEADER, header)) { throw new RuntimeException("Unrecognized file header. Is this an R5 Kryo network?"); } - String version = kryo.readObject(input, String.class); + String formatVersion = kryo.readObject(input, String.class); String commit = kryo.readObject(input, String.class); - LOG.info("Loading {} file saved by R5 version {} commit {}", new String(header), version, commit); - if (!BackendVersion.instance.version.equals(version)) { - throw new RuntimeException(String.format("File version %s is not compatible with this R5 version %s", - version, BackendVersion.instance.version)); + LOG.info("Loading network from file format version {}, written by R5 commit {}", formatVersion, commit); + if (!NETWORK_FORMAT_VERSION.equals(formatVersion)) { + throw new RuntimeException( + String.format("File format version is %s, this R5 requires %s", formatVersion, NETWORK_FORMAT_VERSION) + ); } TransportNetwork result = kryo.readObject(input, TransportNetwork.class); input.close(); diff --git a/src/main/java/com/conveyal/r5/profile/FastRaptorWorker.java b/src/main/java/com/conveyal/r5/profile/FastRaptorWorker.java index eeee044bc..833fbfcd1 100644 --- a/src/main/java/com/conveyal/r5/profile/FastRaptorWorker.java +++ b/src/main/java/com/conveyal/r5/profile/FastRaptorWorker.java @@ -1,9 +1,9 @@ package com.conveyal.r5.profile; import com.conveyal.r5.analyst.cluster.AnalysisWorkerTask; -import com.conveyal.r5.api.util.TransitModes; +import com.conveyal.r5.transit.FilteredPattern; +import com.conveyal.r5.transit.FilteredPatterns; import com.conveyal.r5.transit.PickDropType; -import com.conveyal.r5.transit.RouteInfo; import com.conveyal.r5.transit.TransitLayer; import com.conveyal.r5.transit.TripPattern; import com.conveyal.r5.transit.TripSchedule; @@ -24,14 +24,15 @@ import static com.conveyal.r5.profile.FastRaptorWorker.FrequencyBoardingMode.HALF_HEADWAY; import static com.conveyal.r5.profile.FastRaptorWorker.FrequencyBoardingMode.MONTE_CARLO; import static com.conveyal.r5.profile.FastRaptorWorker.FrequencyBoardingMode.UPPER_BOUND; +import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkState; /** - * FastRaptorWorker is faster than the old RaptorWorker and made to be more maintainable. - * It is simpler, as it only focuses on the transit network; see the Propagater class for the methods that extend - * the travel times from the final transit stop of a trip out to the individual targets. - * - * The algorithm used herein is described in + * FastRaptorWorker finds stop-to-stop paths through the transit network. 
+ * The PerTargetPropagater extends travel times from the final transit stop of trips out to the individual targets. + * This system also accounts for pure-frequency routes by using Monte Carlo methods (generating randomized schedules). + * There is support for saving paths, so we can report how to reach a destination rather than just how long it takes. + * The algorithm used herein is described in: * * Conway, Matthew Wigginton, Andrew Byrd, and Marco van der Linden. “Evidence-Based Transit and Land Use Sketch Planning * Using Interactive Accessibility Methods on Combined Schedule and Headway-Based Networks.” Transportation Research @@ -40,22 +41,15 @@ * Delling, Daniel, Thomas Pajor, and Renato Werneck. “Round-Based Public Transit Routing,” January 1, 2012. * http://research.microsoft.com/pubs/156567/raptor_alenex.pdf. * - * There is basic support for saving paths, so we can report how to reach a destination rather than just how long it takes. - * - * This class originated as a rewrite of our RAPTOR code that would use "thin workers", allowing computation by a - * generic function-execution service like AWS Lambda. The gains in efficiency were significant enough that this is now - * the way we do all analysis work. This system also accounts for pure-frequency routes by using Monte Carlo methods - * (generating randomized schedules). * - * TODO rename to remove "fast" and revise above comments, there is only one worker now. - * Maybe just call it TransitRouter. But then there's also McRaptor. + * TODO rename to remove "fast". Maybe just call it TransitRouter, but then there's also McRaptor. */ public class FastRaptorWorker { private static final Logger LOG = LoggerFactory.getLogger(FastRaptorWorker.class); /** - * This value essentially serves as Infinity for ints - it's bigger than every other number. + * This value is used as positive infinity for ints - it's bigger than every other number. * It is the travel time to a transit stop or a target before that stop or target is ever reached. * Be careful when propagating travel times from stops to targets, adding anything to UNREACHED will cause overflow. */ @@ -70,7 +64,7 @@ public class FastRaptorWorker { public static final int SECONDS_PER_MINUTE = 60; /** - * Step for departure times. Use caution when changing this as the functions request.getTimeWindowLengthMinutes + * Step for departure times. Use caution when changing this, as the functions request.getTimeWindowLengthMinutes * and request.getMonteCarloDrawsPerMinute below which assume this value is 1 minute. */ private static final int DEPARTURE_STEP_SEC = 60; @@ -82,6 +76,8 @@ public class FastRaptorWorker { private static final int MINIMUM_BOARD_WAIT_SEC = 60; // ENABLE_OPTIMIZATION_X flags enable code paths that should affect efficiency but have no effect on output. + // They may change results where our algorithm is not perfectly optimal, for example with respect to overtaking + // (see discussion at #708). public static final boolean ENABLE_OPTIMIZATION_RANGE_RAPTOR = true; public static final boolean ENABLE_OPTIMIZATION_FREQ_UPPER_BOUND = true; @@ -111,18 +107,15 @@ public class FastRaptorWorker { /** The routing parameters. */ private final AnalysisWorkerTask request; - /** The indexes of the trip patterns running on a given day with frequency-based trips of selected modes. */ - private final BitSet runningFrequencyPatterns = new BitSet(); - - /** The indexes of the trip patterns running on a given day with scheduled trips of selected modes. 
*/ - private final BitSet runningScheduledPatterns = new BitSet(); - /** Generates and stores departure time offsets for every frequency-based set of trips. */ private final FrequencyRandomOffsets offsets; - /** Services active on the date of the search */ + /** Services active on the date of the search. */ private final BitSet servicesActive; + /** TripPatterns that have been prefiltered for the specific search date and modes. */ + private FilteredPatterns filteredPatterns; + /** * The state resulting from the scheduled search at a particular departure minute. * This state is reused at each departure minute without re-initializing it (this is the range-raptor optimization). @@ -142,6 +135,10 @@ public class FastRaptorWorker { /** If we're going to store paths to every destination (e.g. for static sites) then they'll be retained here. */ public List pathsPerIteration; + /** + * Only fast initialization steps are performed in the constructor. + * All slower work is done in route() so timing information can be collected. + */ public FastRaptorWorker (TransitLayer transitLayer, AnalysisWorkerTask request, TIntIntMap accessStops) { this.transit = transitLayer; this.request = request; @@ -171,7 +168,9 @@ public FastRaptorWorker (TransitLayer transitLayer, AnalysisWorkerTask request, */ public int[][] route () { raptorTimer.fullSearch.start(); - prefilterPatterns(); + raptorTimer.patternFiltering.start(); + filteredPatterns = transit.filteredPatternCache.get(request.transitModes, servicesActive); + raptorTimer.patternFiltering.stop(); // Initialize result storage. Results are one arrival time at each stop, for every raptor iteration. final int nStops = transit.getStopCount(); final int nIterations = iterationsPerMinute * nMinutes; @@ -238,29 +237,6 @@ private void dumpAllTimesToFile(int[][] arrivalTimesAtStopsPerIteration, int max } } - /** - * Before routing, filter the set of patterns down to only the ones that are actually running on the search date. - * We can also filter down to only those modes enabled in the search request, because all trips in a pattern are - * defined to be on same route, and GTFS allows only one mode per route. - */ - private void prefilterPatterns () { - for (int patternIndex = 0; patternIndex < transit.tripPatterns.size(); patternIndex++) { - TripPattern pattern = transit.tripPatterns.get(patternIndex); - RouteInfo routeInfo = transit.routes.get(pattern.routeIndex); - TransitModes mode = TransitLayer.getTransitModes(routeInfo.route_type); - if (pattern.servicesActive.intersects(servicesActive) && request.transitModes.contains(mode)) { - // At least one trip on this pattern is relevant, based on the profile request's date and modes. - if (pattern.hasFrequencies) { - runningFrequencyPatterns.set(patternIndex); - } - // Schedule case is not an "else" clause because we support patterns with both frequency and schedule. - if (pattern.hasSchedules) { - runningScheduledPatterns.set(patternIndex); - } - } - } - } - /** * Create the initial array of states for the latest departure minute in the window, one state for each round. * One state for each allowed transit ride, plus a zeroth round containing the results of the access street search. @@ -289,7 +265,10 @@ private void initializeScheduleState (int departureTime) { /** * Set the departure time in the scheduled search to the given departure time, and prepare for the scheduled * search at the next-earlier minute. 
This is reusing results from one departure time as an upper bound on - * arrival times for an earlier departure time (i.e. range raptor). + * arrival times for an earlier departure time (i.e. range raptor). Note that this reuse can give riders + * "look-ahead" abilities about trips that will be overtaken, depending on the departure time window; they will + * not board the first feasible departure from a stop if a later one (that has been ridden in a later departure + * minute) will arrive at their destination stop earlier. */ private void advanceScheduledSearchToPreviousMinute (int nextMinuteDepartureTime) { for (RaptorState state : this.scheduleState) { @@ -470,6 +449,62 @@ private static Path[] pathToEachStop(RaptorState state) { return paths; } + + /** + * Starting from a trip we're already riding, step backward through the trips in the supplied filteredPattern to see + * if there is a usable one that departs earlier from the current stop position in the pattern, and if so return its + * index within the filtered pattern. This method assumes there is no overtaking in the FilteredPattern's schedules. + */ + private int checkEarlierScheduledDeparture ( + int departAfter, FilteredPattern filteredPattern, int stopInPattern, int currentTrip + ) { + checkArgument(filteredPattern.noScheduledOvertaking); + int bestTrip = currentTrip; + int candidateTrip = currentTrip; + while (--candidateTrip >= 0) { + // The tripSchedules in the supplied pattern are known to be sorted by departure time at all stops. + TripSchedule candidateSchedule = filteredPattern.runningScheduledTrips.get(candidateTrip); + final int candidateDeparture = candidateSchedule.departures[stopInPattern]; + if (candidateDeparture > departAfter) { + bestTrip = candidateTrip; + } else { + // We are confident of being on the earliest feasible departure. + break; + } + } + return bestTrip; + } + + + /** + * Perform a linear search through the trips in the supplied filteredPattern, finding the one that departs + * earliest from the given stop position in the pattern, and returning its index within the filtered pattern. + */ + private int findEarliestScheduledDeparture ( + int departAfter, FilteredPattern filteredPattern, int stopInPattern + ) { + // Trips are sorted in ascending order by time of departure from first stop + List trips = filteredPattern.runningScheduledTrips; + boolean noOvertaking = filteredPattern.noScheduledOvertaking; + int bestTrip = -1; + int bestDeparture = Integer.MAX_VALUE; + for (int t = 0; t < trips.size(); t++) { + TripSchedule ts = trips.get(t); + final int departure = ts.departures[stopInPattern]; + if (departure > departAfter && departure < bestDeparture) { + bestTrip = t; + bestDeparture = departure; + // No overtaking plus sorting by time of departure from first stop guarantees sorting by time of + // departure at this stop; so we know this is the earliest departure and can break early. + if (noOvertaking) break; + } + } + return bestTrip; + } + + // Chosen to be completely invalid as an array index or time in order to fail fast. + private static final int NONE = -1; + /** * A sub-step in the process of performing a RAPTOR search at one specific departure time (at one specific minute). * This method handles only the routes that have exact schedules. 
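Both departure searches above rely on the same invariant: when a FilteredPattern has no scheduled overtaking, trips sorted by departure from the first stop are also sorted by departure at every later stop. As a hedged aside (not part of the patch), that invariant would equally permit a binary search in place of the linear scan; the sketch below reuses the surrounding names (TripSchedule.departures, the NONE sentinel) purely for illustration.

    // Illustrative sketch only, not part of the patch. Valid only when
    // filteredPattern.noScheduledOvertaking is true, i.e. departures are sorted at every stop position.
    private static int binarySearchEarliestDeparture (List<TripSchedule> trips, int stopInPattern, int departAfter) {
        int lo = 0;
        int hi = trips.size();
        while (lo < hi) {
            int mid = (lo + hi) >>> 1; // unsigned shift avoids overflow of lo + hi
            if (trips.get(mid).departures[stopInPattern] > departAfter) {
                hi = mid;      // mid departs late enough; the earliest such trip is mid or earlier
            } else {
                lo = mid + 1;  // mid departs too early; the earliest boardable trip is after mid
            }
        }
        return (lo < trips.size()) ? lo : NONE; // NONE (-1) when no trip departs after departAfter
    }

The existing linear scan with its early break achieves the same effect, and is likely simpler for the short trip lists typical of a single filtered pattern.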
There is another method that handles only the @@ -477,105 +512,68 @@ private static Path[] pathToEachStop(RaptorState state) { */ private void doScheduledSearchForRound (RaptorState outputState) { final RaptorState inputState = outputState.previous; - BitSet patternsToExplore = patternsToExploreInNextRound(inputState, runningScheduledPatterns, true); + BitSet patternsToExplore = patternsToExploreInNextRound( + inputState, filteredPatterns.runningScheduledPatterns, true + ); for (int patternIndex = patternsToExplore.nextSetBit(0); patternIndex >= 0; patternIndex = patternsToExplore.nextSetBit(patternIndex + 1) ) { + FilteredPattern filteredPattern = filteredPatterns.patterns.get(patternIndex); TripPattern pattern = transit.tripPatterns.get(patternIndex); - int onTrip = -1; + // As we scan down the stops of the pattern, we may board a trip, and possibly re-board a different trip. + // Keep track of the index of the currently boarded trip within the list of filtered TripSchedules. + int onTrip = NONE; int waitTime = 0; - int boardTime = 0; - int boardStop = -1; + int boardTime = NONE; + int boardStop = NONE; TripSchedule schedule = null; - - for (int stopPositionInPattern = 0; stopPositionInPattern < pattern.stops.length; stopPositionInPattern++) { - int stop = pattern.stops[stopPositionInPattern]; - - // attempt to alight if we're on board and if drop off is allowed, done above the board search so - // that we don't check for alighting when boarding - if (onTrip > -1 && pattern.dropoffs[stopPositionInPattern] != PickDropType.NONE) { - int alightTime = schedule.arrivals[stopPositionInPattern]; + // Iterate over all stops in the current TripPattern ("scan" down the pattern) + for (int stopInPattern = 0; stopInPattern < pattern.stops.length; stopInPattern++) { + int stop = pattern.stops[stopInPattern]; + // Alight at the current stop in the pattern if drop-off is allowed and we're already on a trip. + // This block is above the boarding search so that we don't alight from the same stop where we boarded. + if (onTrip != NONE && pattern.dropoffs[stopInPattern] != PickDropType.NONE) { + int alightTime = schedule.arrivals[stopInPattern]; int inVehicleTime = alightTime - boardTime; - - // Use checkState instead? - if (waitTime + inVehicleTime + inputState.bestTimes[boardStop] > alightTime) { - LOG.error("Components of travel time are larger than travel time!"); - } - + checkState (alightTime == inputState.bestTimes[boardStop] + waitTime + inVehicleTime, + "Components of travel time are larger than travel time!"); outputState.setTimeAtStop(stop, alightTime, patternIndex, boardStop, waitTime, inVehicleTime, false); } - - // Don't attempt to board if this stop was not reached in the last round or if pick up is not allowed. - // Scheduled searches only care about updates within this departure minute, enabling range-raptor. + // If the current stop was reached in the previous round and allows pick-up, board or re-board a trip. + // Second parameter is true to only look at changes within this departure minute, enabling range-raptor. 
if (inputState.stopWasUpdated(stop, true) && - pattern.pickups[stopPositionInPattern] != PickDropType.NONE + pattern.pickups[stopInPattern] != PickDropType.NONE ) { int earliestBoardTime = inputState.bestTimes[stop] + MINIMUM_BOARD_WAIT_SEC; - if (onTrip == -1) { - int candidateTripIndex = -1; - for (TripSchedule candidateSchedule : pattern.tripSchedules) { - candidateTripIndex++; - if (!servicesActive.get(candidateSchedule.serviceCode) || candidateSchedule.headwaySeconds != null) { - // frequency trip or not running - continue; - } - if (earliestBoardTime < candidateSchedule.departures[stopPositionInPattern]) { - // board this trip (the earliest trip that can be boarded on this pattern at this stop) - onTrip = candidateTripIndex; - schedule = candidateSchedule; - boardTime = candidateSchedule.departures[stopPositionInPattern]; - waitTime = boardTime - inputState.bestTimes[stop]; - boardStop = stop; - break; - } - } + // Boarding/reboarding search is conditional on previous-round arrival at this stop earlier than the + // current trip in the current round. Otherwise the search is unnecessary and yields later trips. + if (schedule != null && (earliestBoardTime >= schedule.departures[stopInPattern])) { + continue; + } + int newTrip; + if (onTrip != NONE && filteredPattern.noScheduledOvertaking) { + // Optimized reboarding search: Already on a trip, trips known to be sorted by departure time. + newTrip = checkEarlierScheduledDeparture(earliestBoardTime, filteredPattern, stopInPattern, onTrip); } else { - // A specific trip on this pattern could be boarded at an upstream stop. If we are ready to - // depart from this stop before this trip does, it might be preferable to board at this stop - // instead. - if (earliestBoardTime < schedule.departures[stopPositionInPattern]) { - // First, it might be possible to board an earlier trip at this stop. - int earlierTripIdx = onTrip; - while (--earlierTripIdx >= 0) { - // The tripSchedules in a given pattern are sorted by time of departure from the first - // stop. So they are sorted by time of departure at this stop, if the possibility - // of overtaking is ignored. - TripSchedule earlierTripSchedule = pattern.tripSchedules.get(earlierTripIdx); - - if (earlierTripSchedule.headwaySeconds != null || !servicesActive.get(earlierTripSchedule.serviceCode)) { - // This is a frequency trip or it is not running on the day of the search. - continue; - } - // The assertion below is a sanity check, but not a complete check that all the - // tripSchedules are sorted, because later tripSchedules are not considered. - checkState(earlierTripSchedule.departures[0] <= schedule.departures[0], - "Trip schedules not sorted by departure time at first stop of pattern"); - - if (earliestBoardTime < earlierTripSchedule.departures[stopPositionInPattern]) { - // The trip under consideration can be boarded at this stop - onTrip = earlierTripIdx; - schedule = earlierTripSchedule; - boardTime = earlierTripSchedule.departures[stopPositionInPattern]; - waitTime = boardTime - inputState.bestTimes[stop]; - boardStop = stop; - } else { - // The trip under consideration arrives at this stop earlier than one could feasibly - // board. Stop searching, because trips are sorted by departure time within a pattern. - break; - } - } - // Second, if we care about paths or travel time components, check whether boarding at - // this stop instead of the upstream one would allow a shorter access/transfer leg. 
- // Doing so will not affect total travel time (as long as this is in a conditional - // ensuring we won't miss the trip we're on), but it will affect the breakdown of walk vs. - // wait time. - if (retainPaths && inputState.shorterAccessOrTransferLeg(stop, boardStop)) { - boardTime = schedule.departures[stopPositionInPattern]; - waitTime = boardTime - inputState.bestTimes[stop]; - boardStop = stop; - } - } + // General purpose departure search: not already on a trip or trips are not known to be sorted. + newTrip = findEarliestScheduledDeparture(earliestBoardTime, filteredPattern, stopInPattern); + } + // If we care about paths or travel time components, check whether boarding at this stop instead of + // the upstream one would allow a shorter access/transfer leg. Doing so will not affect total travel + // time (as long as this is in a conditional ensuring we won't miss the trip we're on), but it will + // affect the breakdown of walk vs. wait time. + final boolean reboardForPaths = retainPaths + && (onTrip != NONE) + && inputState.shorterAccessOrTransferLeg(stop, boardStop); + + if ((newTrip != onTrip) || reboardForPaths) { + checkState(newTrip != NONE); // Should never change from being on a trip to on no trip. + onTrip = newTrip; + schedule = filteredPattern.runningScheduledTrips.get(newTrip); + boardTime = schedule.departures[stopInPattern]; + waitTime = boardTime - inputState.bestTimes[stop]; + boardStop = stop; } } } @@ -616,21 +614,18 @@ private void doFrequencySearchForRound (RaptorState outputState, FrequencyBoardi // are applying randomized schedules that are not present in the accumulated range-raptor upper bound state. // Those randomized frequency routes may cascade improvements from updates made at previous departure minutes. final boolean withinMinute = (frequencyBoardingMode == UPPER_BOUND); - BitSet patternsToExplore = patternsToExploreInNextRound(inputState, runningFrequencyPatterns, withinMinute); + BitSet patternsToExplore = patternsToExploreInNextRound( + inputState, filteredPatterns.runningFrequencyPatterns, withinMinute + ); for (int patternIndex = patternsToExplore.nextSetBit(0); patternIndex >= 0; patternIndex = patternsToExplore.nextSetBit(patternIndex + 1) ) { + FilteredPattern filteredPattern = filteredPatterns.patterns.get(patternIndex); TripPattern pattern = transit.tripPatterns.get(patternIndex); - int tripScheduleIndex = -1; // First loop iteration will immediately increment to 0. - for (TripSchedule schedule : pattern.tripSchedules) { + for (TripSchedule schedule : filteredPattern.runningFrequencyTrips) { tripScheduleIndex++; - - // If this trip's service is inactive (it's not running) or it's a scheduled (non-freq) trip, skip it. - if (!servicesActive.get(schedule.serviceCode) || schedule.headwaySeconds == null) { - continue; - } // Loop through all the entries for this trip (time windows with service at a given frequency). 
for (int frequencyEntryIdx = 0; frequencyEntryIdx < schedule.headwaySeconds.length; diff --git a/src/main/java/com/conveyal/r5/profile/RaptorTimer.java b/src/main/java/com/conveyal/r5/profile/RaptorTimer.java index e4b4504e3..1a6954d91 100644 --- a/src/main/java/com/conveyal/r5/profile/RaptorTimer.java +++ b/src/main/java/com/conveyal/r5/profile/RaptorTimer.java @@ -7,6 +7,8 @@ public class RaptorTimer { public final ExecutionTimer fullSearch = new ExecutionTimer("Full range-Raptor search"); + public final ExecutionTimer patternFiltering = new ExecutionTimer(fullSearch, "Pattern filtering"); + public final ExecutionTimer scheduledSearch = new ExecutionTimer(fullSearch, "Scheduled/bounds search"); public final ExecutionTimer scheduledSearchTransit = new ExecutionTimer(scheduledSearch, "Scheduled search"); diff --git a/src/main/java/com/conveyal/r5/streets/OSMCache.java b/src/main/java/com/conveyal/r5/streets/OSMCache.java index 29430272a..35eddf764 100644 --- a/src/main/java/com/conveyal/r5/streets/OSMCache.java +++ b/src/main/java/com/conveyal/r5/streets/OSMCache.java @@ -1,36 +1,30 @@ package com.conveyal.r5.streets; +import com.conveyal.file.FileCategory; import com.conveyal.file.FileStorage; import com.conveyal.file.FileStorageKey; import com.conveyal.osmlib.OSM; +import com.conveyal.osmlib.OsmLibException; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; +import javax.annotation.Nonnull; import java.io.File; import java.util.concurrent.ExecutionException; /** - * TODO this should be moved out of osm-lib and R5 into Analysis, and better integrated with TransportNetworkCache - * so we don't need to have a dependency on AWS S3 SDK and multiple S3 clients. + * TODO this should be moved so we don't need to have a dependency on AWS S3 SDK and multiple S3 clients. * Maybe a general local+S3 object storage mechanism for externalizable objects, using the TransferManager. - * We are currently getting AWS SDK dependency transitively through gtfs-lib. Future versions of gtfs-lib will not - * have this functionality. */ public class OSMCache { - public final String bucket; private final FileStorage fileStorage; - public interface Config { - String bundleBucket (); - } - /** * Construct a new OSMCache. * If bucket is null, we will work offline (will not create an S3 client, avoiding need to set an AWS region). */ - public OSMCache (FileStorage fileStorage, Config config) { - this.bucket = config.bundleBucket(); + public OSMCache (FileStorage fileStorage) { this.fileStorage = fileStorage; } @@ -43,11 +37,13 @@ public String cleanId(String id) { } public FileStorageKey getKey (String id) { + // FIXME Transforming IDs each time they're used seems problematic. They should probably only be validated here. String cleanId = cleanId(id); - return new FileStorageKey(bucket, cleanId + ".pbf"); + return new FileStorageKey(FileCategory.BUNDLES, cleanId + ".pbf"); } - public OSM get (String id) { + /** This should always return an OSM object, not null. If something prevents that, it should throw an exception. 
*/ + public @Nonnull OSM get (String id) throws OsmLibException { try { return osmCache.get(id, () -> { File osmFile = fileStorage.getFile(getKey(id)); @@ -57,7 +53,7 @@ public OSM get (String id) { return ret; }); } catch (ExecutionException e) { - throw new RuntimeException(e); + throw new OsmLibException("Exception in OSM MapDB CacheLoader.", e.getCause()); } } } diff --git a/src/main/java/com/conveyal/r5/transit/FilteredPattern.java b/src/main/java/com/conveyal/r5/transit/FilteredPattern.java new file mode 100644 index 000000000..5ef398791 --- /dev/null +++ b/src/main/java/com/conveyal/r5/transit/FilteredPattern.java @@ -0,0 +1,65 @@ +package com.conveyal.r5.transit; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.BitSet; +import java.util.List; + +/** + * FilteredPatterns correspond to a single specific TripPattern, indicating all the trips running on a particular day. + * TripPatterns contain all the trips on a route that follow the same stop sequence. This often includes trips on + * different days of the week or special schedules where vehicles travel faster or slower. By filtering down to only + * those trips running on a particular day (a particular set of service codes), we usually get a smaller set of trips + * with no overtaking, which enables certain optimizations and is more efficient for routing. + */ +public class FilteredPattern { + + private static Logger LOG = LoggerFactory.getLogger(FilteredPattern.class); + + /** + * Schedule-based (i.e. not frequency-based) trips running in a particular set of GTFS services, sorted in + * ascending order by time of departure from first stop + */ + public List runningScheduledTrips = new ArrayList<>(); + + /** Frequency-based trips active in a particular set of GTFS services */ + public List runningFrequencyTrips = new ArrayList<>(); + + /** If no active schedule-based trip of this filtered pattern overtakes another. */ + public boolean noScheduledOvertaking; + + /** + * Filter the trips in a source TripPattern, excluding trips not active in the supplied set of services, and + * dividing them into separate scheduled and frequency trip lists. Check the runningScheduledTrips for overtaking. 
+ */ + public FilteredPattern (TripPattern source, BitSet servicesActive) { + for (TripSchedule schedule : source.tripSchedules) { + if (servicesActive.get(schedule.serviceCode)) { + if (schedule.headwaySeconds == null) { + runningScheduledTrips.add(schedule); + } else { + runningFrequencyTrips.add(schedule); + } + } + } + // Check whether any running trip on this pattern overtakes another + noScheduledOvertaking = true; + for (int i = 0; i < runningScheduledTrips.size() - 1; i++) { + if (overtakes(runningScheduledTrips.get(i), runningScheduledTrips.get(i + 1))) { + noScheduledOvertaking = false; + LOG.warn("Overtaking: route {} pattern {}", source.routeId, source.originalId); + break; + } + } + } + + private static boolean overtakes (TripSchedule a, TripSchedule b) { + for (int s = 0; s < a.departures.length; s++) { + if (a.departures[s] > b.departures[s]) return true; + } + return false; + } + +} diff --git a/src/main/java/com/conveyal/r5/transit/FilteredPatternCache.java b/src/main/java/com/conveyal/r5/transit/FilteredPatternCache.java new file mode 100644 index 000000000..712d1d6c3 --- /dev/null +++ b/src/main/java/com/conveyal/r5/transit/FilteredPatternCache.java @@ -0,0 +1,50 @@ +package com.conveyal.r5.transit; + +import com.conveyal.r5.api.util.TransitModes; +import com.conveyal.r5.util.Tuple2; +import com.github.benmanes.caffeine.cache.Caffeine; +import com.github.benmanes.caffeine.cache.LoadingCache; + +import java.util.BitSet; +import java.util.EnumSet; + +/** + * Stores the patterns and trips relevant for routing based on the transit modes and date in an analysis request. + * We can't just cache the single most recently used filtered patterns, because a worker might need to simultaneously + * handle two requests for the same scenario on different dates or with different modes. + * + * There are good reasons why this cache is specific to a single TransitLayer (representing one specific scenario). + * To create FilteredPatterns we need the source TransitLayer object. LoadingCaches must compute values based only on + * their keys. So a system-wide FilteredPatternCache would either need to recursively look up TransportNetworks in + * the TransportNetworkCache, or would need to have TransportNetwork or TransitLayer references in its keys. Neither + * of these seems desirable - the latter would impede garbage collection of evicted TransportNetworks. + */ +public class FilteredPatternCache { + + /** + * All FilteredPatterns stored in this cache will be derived from this single TransitLayer representing a single + * scenario, but for different unique combinations of (transitModes, services). 
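Referring back to FilteredPattern.overtakes above: comparing only adjacent trips is enough because the list is sorted by departure from the first stop, so any overtaking between non-adjacent trips forces at least one adjacent pair out of order at the same stop. A small, hedged illustration with invented departure arrays:

    // Illustrative only, not part of the patch; departure times are invented.
    // Sorted by first-stop departure, but tripA overtakes the non-adjacent tripC at stop 1.
    int[] tripA = { 100, 400 };
    int[] tripB = { 110, 300 };
    int[] tripC = { 120, 350 };
    // If both adjacent pairs were in order at stop 1 (A <= B and B <= C), transitivity would give A <= C,
    // contradicting A overtaking C. So some adjacent pair must be inverted, here A vs. B:
    assert tripA[1] > tripB[1]; // the pairwise loop in the constructor therefore flags this pattern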
+ */ + private final TransitLayer transitLayer; + + private final LoadingCache cache; + + public FilteredPatternCache (TransitLayer transitLayer) { + this.transitLayer = transitLayer; + this.cache = Caffeine.newBuilder().maximumSize(2).build(key -> { + return new FilteredPatterns(transitLayer, key.a, key.b); + }); + } + + // TODO replace all keys and tuples with Java 16/17 Records + private static class Key extends Tuple2, BitSet> { + public Key (EnumSet transitModes, BitSet servicesActive) { + super(transitModes, servicesActive); + } + } + + public FilteredPatterns get (EnumSet transitModes, BitSet servicesActive) { + return cache.get(new Key(transitModes, servicesActive)); + } + +} diff --git a/src/main/java/com/conveyal/r5/transit/FilteredPatterns.java b/src/main/java/com/conveyal/r5/transit/FilteredPatterns.java new file mode 100644 index 000000000..59173a0e3 --- /dev/null +++ b/src/main/java/com/conveyal/r5/transit/FilteredPatterns.java @@ -0,0 +1,63 @@ +package com.conveyal.r5.transit; + +import com.conveyal.r5.api.util.TransitModes; +import com.conveyal.r5.util.Tuple2; + +import java.util.ArrayList; +import java.util.BitSet; +import java.util.EnumSet; +import java.util.List; + +import static com.conveyal.r5.transit.TransitLayer.getTransitModes; + +/** + * Holds all the FilteredPatterns instances for a particular TransitLayer (scenario) given a particular set of + * filtering criteria (transit modes and active services). There is one FilteredPattern instance for each TripPattern + * that is present in the filtered TransitLayer. Many TripPatterns contain a mixture of trips from different days, + * and those trips appear to overtake one another if we do not filter them down. Filtering allows us to flag more + * effectively which patterns have no overtaking, which is useful because departure time searches can be then optimized + * for patterns with no overtaking. All trips in a TripPattern are defined to be on same route, and GTFS allows only one + * mode per route. + */ +public class FilteredPatterns { + + /** + * List with the same length and indexes as the unfiltered TripPatterns in the input TransitLayer. + * Patterns that do not meet the mode/services filtering criteria are recorded as null. + */ + public final List patterns; + + /** The indexes of the trip patterns running on a given day with frequency-based trips of selected modes. */ + public BitSet runningFrequencyPatterns = new BitSet(); + + /** The indexes of the trip patterns running on a given day with scheduled trips of selected modes. */ + public BitSet runningScheduledPatterns = new BitSet(); + + /** + * Construct FilteredPatterns from the given TransitLayer, filtering for the specified modes and active services. + * It's tempting to use List.of() or Collectors.toUnmodifiableList() but these cause an additional array copy. + */ + public FilteredPatterns (TransitLayer transitLayer, EnumSet modes, BitSet services) { + List sourcePatterns = transitLayer.tripPatterns; + patterns = new ArrayList<>(sourcePatterns.size()); + for (int patternIndex = 0; patternIndex < sourcePatterns.size(); patternIndex++) { + TripPattern pattern = sourcePatterns.get(patternIndex); + RouteInfo routeInfo = transitLayer.routes.get(pattern.routeIndex); + TransitModes mode = getTransitModes(routeInfo.route_type); + if (pattern.servicesActive.intersects(services) && modes.contains(mode)) { + patterns.add(new FilteredPattern(pattern, services)); + // At least one trip on this pattern is relevant, based on the profile request's date and modes. 
+ if (pattern.hasFrequencies) { + runningFrequencyPatterns.set(patternIndex); + } + // Schedule case is not an "else" clause because we support patterns with both frequency and schedule. + if (pattern.hasSchedules) { + runningScheduledPatterns.set(patternIndex); + } + } else { + patterns.add(null); + } + } + } + +} diff --git a/src/main/java/com/conveyal/r5/transit/TransitLayer.java b/src/main/java/com/conveyal/r5/transit/TransitLayer.java index dc53e7921..13eb357dd 100644 --- a/src/main/java/com/conveyal/r5/transit/TransitLayer.java +++ b/src/main/java/com/conveyal/r5/transit/TransitLayer.java @@ -46,6 +46,7 @@ import java.util.BitSet; import java.util.Collection; import java.util.Collections; +import java.util.EnumSet; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -104,6 +105,9 @@ public class TransitLayer implements Serializable, Cloneable { public List tripPatterns = new ArrayList<>(); + /** Stores the relevant patterns and trips based on the transit modes and date in an analysis request. */ + public transient FilteredPatternCache filteredPatternCache = new FilteredPatternCache(this); + // Maybe we need a StopStore that has (streetVertexForStop, transfers, flags, etc.) public TIntList streetVertexForStop = new TIntArrayList(); @@ -186,7 +190,8 @@ public void loadFromGtfs (GTFSFeed gtfs) throws DuplicateFeedException { /** * Load data from a GTFS feed. Call multiple times to load multiple feeds. - * The feed is not closed after being loaded. + * The supplied feed is treated as read-only, and is not closed after being loaded. + * This method requires findPatterns() to have been called on the feed before it's passed in. */ public void loadFromGtfs (GTFSFeed gtfs, LoadLevel level) throws DuplicateFeedException { if (feedChecksums.containsKey(gtfs.feedId)) { @@ -227,11 +232,6 @@ public void loadFromGtfs (GTFSFeed gtfs, LoadLevel level) throws DuplicateFeedEx LOG.debug("Service {} has ID {}", serviceIndex, serviceId); }); - // Group trips by stop pattern (including pickup/dropoff type) and fill stop times into patterns. - // Also group trips by the blockId they belong to, and chain them together if they allow riders to stay on board - // the vehicle from one trip to the next, even if it changes routes or directions. This is called "interlining". - gtfs.findPatterns(); - LOG.info("Creating trip patterns and schedules."); // These are temporary maps used only for grouping purposes. @@ -748,6 +748,7 @@ public TransitLayer scenarioCopy(TransportNetwork newScenarioNetwork, boolean wi // the scenario that modified it. If the scenario will not affect the contents of the layer, its // scenarioId remains unchanged as is done in StreetLayer. 
copy.scenarioId = newScenarioNetwork.scenarioId; + copy.filteredPatternCache = new FilteredPatternCache(copy); } return copy; } diff --git a/src/main/java/com/conveyal/r5/transit/TransportNetwork.java b/src/main/java/com/conveyal/r5/transit/TransportNetwork.java index d7895761d..358c24dc0 100644 --- a/src/main/java/com/conveyal/r5/transit/TransportNetwork.java +++ b/src/main/java/com/conveyal/r5/transit/TransportNetwork.java @@ -26,7 +26,6 @@ import java.io.Serializable; import java.time.ZoneId; import java.util.ArrayList; -import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -125,7 +124,7 @@ public static TransportNetwork fromFiles ( osm.intersectionDetection = true; osm.readFromFile(osmSourceFile); // Supply feeds with a stream so they do not sit open in memory while other feeds are being processed. - Stream feeds = gtfsSourceFiles.stream().map(GTFSFeed::fromFile); + Stream feeds = gtfsSourceFiles.stream().map(GTFSFeed::readOnlyTempFileFromGtfs); return fromInputs(tnBuilderConfig, osm, feeds); } diff --git a/src/main/java/com/conveyal/r5/transit/TransportNetworkCache.java b/src/main/java/com/conveyal/r5/transit/TransportNetworkCache.java index efc08bc28..b8587feaf 100644 --- a/src/main/java/com/conveyal/r5/transit/TransportNetworkCache.java +++ b/src/main/java/com/conveyal/r5/transit/TransportNetworkCache.java @@ -1,6 +1,5 @@ package com.conveyal.r5.transit; -import com.conveyal.analysis.BackendVersion; import com.conveyal.file.FileStorage; import com.conveyal.file.FileStorageKey; import com.conveyal.file.FileUtils; @@ -19,6 +18,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import javax.annotation.Nonnull; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; @@ -31,6 +31,8 @@ import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; +import static com.conveyal.file.FileCategory.BUNDLES; + /** * This holds one or more TransportNetworks keyed on unique strings. * Because (de)serialization is now much faster than building networks from scratch, built graphs are cached on the @@ -52,7 +54,6 @@ public class TransportNetworkCache { private final FileStorage fileStorage; private final GTFSCache gtfsCache; private final OSMCache osmCache; - private final String bucket; /** * A table of already seen scenarios, avoiding downloading them repeatedly from S3 and allowing us to replace @@ -61,22 +62,23 @@ public class TransportNetworkCache { private final ScenarioCache scenarioCache = new ScenarioCache(); /** Create a transport network cache. If source bucket is null, will work offline. */ - public TransportNetworkCache(FileStorage fileStorage, GTFSCache gtfsCache, OSMCache osmCache, String bucket) { + public TransportNetworkCache (FileStorage fileStorage, GTFSCache gtfsCache, OSMCache osmCache) { this.osmCache = osmCache; this.gtfsCache = gtfsCache; - this.bucket = bucket; this.cache = createCache(DEFAULT_CACHE_SIZE); this.fileStorage = fileStorage; } - /** Convenience method that returns transport network from cache. */ - public synchronized TransportNetwork getNetwork (String networkId) { + /** + * Find a transport network by ID, building or loading as needed from pre-existing OSM, GTFS, MapDB, or Kryo files. + * This should never return null. If a TransportNetwork can't be built or loaded, an exception will be thrown. 
+ */ + public synchronized @Nonnull + TransportNetwork getNetwork (String networkId) throws TransportNetworkException { try { return cache.get(networkId); } catch (Exception e) { - LOG.error("Exception while loading a transport network into the cache: {}", e.toString()); - e.printStackTrace(); - return null; + throw new TransportNetworkException("Could not load TransportNetwork into cache. ", e); } } @@ -85,7 +87,7 @@ public synchronized TransportNetwork getNetwork (String networkId) { */ public void rememberScenario (Scenario scenario) { if (scenario == null) { - throw new AssertionError("Expecting a scenario to be embedded in this task."); + throw new IllegalArgumentException("Expecting a scenario to be embedded in this task."); } else { scenarioCache.storeScenario(scenario); } @@ -119,7 +121,7 @@ public synchronized TransportNetwork getNetworkForScenario (String networkId, St TransportNetwork scenarioNetwork = baseNetwork.scenarios.get(scenarioId); if (scenarioNetwork == null) { // The network for this scenario was not found in the cache. Create that scenario network and cache it. - LOG.info("Applying scenario to base network..."); + LOG.debug("Applying scenario to base network..."); // Fetch the full scenario if an ID was specified. Scenario scenario = resolveScenario(networkId, scenarioId); // Apply any scenario modifications to the network before use, performing protective copies where necessary. @@ -128,10 +130,10 @@ public synchronized TransportNetwork getNetworkForScenario (String networkId, St // the InactiveTripsFilter. The solution may be to cache linked point sets based on scenario ID but always // apply scenarios every time. scenarioNetwork = scenario.applyToTransportNetwork(baseNetwork); - LOG.info("Done applying scenario. Caching the resulting network."); + LOG.debug("Done applying scenario. Caching the resulting network."); baseNetwork.scenarios.put(scenario.id, scenarioNetwork); } else { - LOG.info("Reusing cached TransportNetwork for scenario {}.", scenarioId); + LOG.debug("Reusing cached TransportNetwork for scenario {}.", scenarioId); } return scenarioNetwork; } @@ -140,39 +142,25 @@ private String getScenarioFilename(String networkId, String scenarioId) { return String.format("%s_%s.json", networkId, scenarioId); } - /** If this transport network is already built and cached, fetch it quick */ - private TransportNetwork checkCached (String networkId) { - FileStorageKey r5Key = getR5NetworkFileStorageKey(networkId); - if (fileStorage.exists(r5Key)) { - File r5Network = fileStorage.getFile(r5Key); - LOG.info("Loading cached transport network at {}", r5Network); - try { - return KryoNetworkSerializer.read(r5Network); - } catch (Exception e) { - LOG.error("Exception occurred retrieving cached transport network", e); - } - } else { - LOG.error("Could not find transport network " + networkId); - } - return null; - } - private String getR5NetworkFilename(String networkId) { - return networkId + "_" + BackendVersion.instance.version + ".dat"; + return String.format("%s_%s.dat", networkId, KryoNetworkSerializer.NETWORK_FORMAT_VERSION); } private FileStorageKey getR5NetworkFileStorageKey (String networkId) { - return new FileStorageKey(bucket, getR5NetworkFilename(networkId)); + return new FileStorageKey(BUNDLES, getR5NetworkFilename(networkId)); } - /** If we did not find a cached network, build one */ - public TransportNetwork buildNetwork (String networkId) { + /** + * If we did not find a cached network, build one from the input files. 
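Because getNetwork now throws rather than returning null, callers no longer need null checks. A hypothetical caller-side sketch of the new contract (only getNetwork, TransportNetworkException, and the ExceptionUtils helper come from this patch; the surrounding handler code is invented):

    // Hypothetical caller: the contract is now "a usable TransportNetwork or an exception, never null".
    try {
        TransportNetwork network = transportNetworkCache.getNetwork(networkId);
        // ... proceed with routing against the returned network ...
    } catch (TransportNetworkException e) {
        // TransportNetworkException is a RuntimeException, so this catch is optional; it shows where
        // build/load failures now surface, with the full cause chain preserved.
        LOG.error("Network {} unavailable: {}", networkId, ExceptionUtils.shortAndLongString(e));
    }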
Should throw an exception rather than + * returning null if for any reason it can't finish building one. + */ + private @Nonnull TransportNetwork buildNetwork (String networkId) { TransportNetwork network; - // check if we have a new-format bundle with a JSON manifest - FileStorageKey manifestFileKey = new FileStorageKey(bucket,GTFSCache.cleanId(networkId) + ".json"); + // Check if we have a new-format bundle with a JSON manifest. + FileStorageKey manifestFileKey = new FileStorageKey(BUNDLES, GTFSCache.cleanId(networkId) + ".json"); if (fileStorage.exists(manifestFileKey)) { - LOG.info("Detected new-format bundle with manifest."); + LOG.debug("Detected new-format bundle with manifest."); network = buildNetworkFromManifest(networkId); } else { LOG.warn("Detected old-format bundle stored as single ZIP file"); @@ -188,27 +176,24 @@ public TransportNetwork buildNetwork (String networkId) { network.transitLayer.buildDistanceTables(null); network.rebuildLinkedGridPointSet(StreetMode.WALK); - // Cache the serialized network on the local filesystem. - - + // Cache the serialized network on the local filesystem and mirror it to any remote storage. try { File cacheLocation = FileUtils.createScratchFile(); - // Serialize TransportNetwork to local cache on this worker KryoNetworkSerializer.write(network, cacheLocation); - // Store locally (and on S3) fileStorage.moveIntoStorage(getR5NetworkFileStorageKey(networkId), cacheLocation); } catch (Exception e) { - // Don't break here as we do have a network to return, we just couldn't cache it. - LOG.error("Error saving cached network", e); + // Tolerate exceptions here as we do have a network to return, we just failed to cache it. + LOG.error("Error saving cached network, returning the object anyway.", e); } return network; } - /** Build a transport network given a network ID, using a zip of all bundle files in S3 */ + /** Build a transport network given a network ID, using a zip of all bundle files in S3. */ + @Deprecated private TransportNetwork buildNetworkFromBundleZip (String networkId) { // The location of the inputs that will be used to build this graph File dataDirectory = FileUtils.createScratchDirectory(); - FileStorageKey zipKey = new FileStorageKey(bucket,networkId + ".zip"); + FileStorageKey zipKey = new FileStorageKey(BUNDLES, networkId + ".zip"); File zipFile = fileStorage.getFile(zipKey); try { @@ -229,7 +214,7 @@ private TransportNetwork buildNetworkFromBundleZip (String networkId) { zis.close(); } catch (Exception e) { // TODO delete cache dir which is probably corrupted. - LOG.info("Error retrieving transportation network input files", e); + LOG.warn("Error retrieving transportation network input files", e); return null; } @@ -254,7 +239,7 @@ private TransportNetwork buildNetworkFromBundleZip (String networkId) { * It contains the unique IDs of the GTFS feeds and OSM extract. */ private TransportNetwork buildNetworkFromManifest (String networkId) { - FileStorageKey manifestFileKey = new FileStorageKey(bucket, getManifestFilename(networkId)); + FileStorageKey manifestFileKey = new FileStorageKey(BUNDLES, getManifestFilename(networkId)); File manifestFile = fileStorage.getFile(manifestFileKey); BundleManifest manifest; @@ -305,24 +290,34 @@ private LoadingCache createCache(int size) { } /** - * Return the graph for the given unique identifier for graph builder inputs on S3. - * If this is the same as the last graph built, just return the pre-built graph. 
- * If not, build the graph from the inputs, fetching them from S3 to the local cache as needed. + * CacheLoader method, which should only be called by the LoadingCache. + * Return the graph for the given unique identifier. Load pre-built serialized networks from local or remote + * storage. If none is available for the given id, build the network from its inputs, fetching them from remote + * storage to local storage as needed. Note the cache size is currently hard-wired to 1, so series of calls with + * the same ID will return the same object, but calls with different IDs will cause it to be reloaded from files. + * This should always return a usable TransportNetwork not null, and should throw an exception whenever it can't. */ - private TransportNetwork loadNetwork(String networkId) { - LOG.info("Finding or building a TransportNetwork for ID {} and R5 version {}", networkId, BackendVersion.instance.version); - - TransportNetwork network = checkCached(networkId); - if (network == null) { - LOG.info("Cached transport network for id {} and R5 version {} was not found. Building the network from scratch.", - networkId, BackendVersion.instance.version); - network = buildNetwork(networkId); + private @Nonnull TransportNetwork loadNetwork(String networkId) throws TransportNetworkException { + LOG.debug( + "Finding or building a TransportNetwork for ID {} with file format version {}.", + networkId, KryoNetworkSerializer.NETWORK_FORMAT_VERSION + ); + try { + FileStorageKey r5Key = getR5NetworkFileStorageKey(networkId); + if (fileStorage.exists(r5Key)) { + File networkFile = fileStorage.getFile(r5Key); + LOG.debug("Loading cached transport network at {}", networkFile); + return KryoNetworkSerializer.read(networkFile); + } else { + LOG.debug( + "Cached transport network for ID {} with file format version {} was not found. Building from scratch.", + networkId, KryoNetworkSerializer.NETWORK_FORMAT_VERSION + ); + return buildNetwork(networkId); + } + } catch (Exception e) { + throw new TransportNetworkException("Exception occurred retrieving or building network.", e); } - - // TODO determine why we were manually inserting into the cache. - // It now results in concurrent modification deadlock because it's called inside a cacheloader. - // cache.put(networkId, network); - return network; } /** @@ -359,10 +354,10 @@ private Scenario resolveScenario (String networkId, String scenarioId) { // If a scenario ID is supplied, it overrides any supplied full scenario. // There is no intermediate cache here for the scenario objects - we read them from disk files. // This is not a problem, they're only read once before cacheing the resulting scenario-network. 
- FileStorageKey scenarioFileKey = new FileStorageKey(bucket, getScenarioFilename(networkId, scenarioId)); + FileStorageKey scenarioFileKey = new FileStorageKey(BUNDLES, getScenarioFilename(networkId, scenarioId)); try { File scenarioFile = fileStorage.getFile(scenarioFileKey); - LOG.info("Loading scenario from disk file {}", scenarioFile); + LOG.debug("Loading scenario from disk file {}", scenarioFile); return JsonUtilities.lenientObjectMapper.readValue(scenarioFile, Scenario.class); } catch (Exception e) { LOG.error("Could not fetch scenario {} or read it from from disk: {}", scenarioId, e.toString()); diff --git a/src/main/java/com/conveyal/r5/transit/TransportNetworkException.java b/src/main/java/com/conveyal/r5/transit/TransportNetworkException.java new file mode 100644 index 000000000..41b394954 --- /dev/null +++ b/src/main/java/com/conveyal/r5/transit/TransportNetworkException.java @@ -0,0 +1,14 @@ +package com.conveyal.r5.transit; + +/** Generic runtime exception for any problem encountered when building or loading a TransportNetwork. */ +public class TransportNetworkException extends RuntimeException { + + public TransportNetworkException (String message) { + super(message); + } + + public TransportNetworkException (String message, Throwable cause) { + super(message, cause); + } + +} diff --git a/src/main/java/com/conveyal/r5/transit/TripPattern.java b/src/main/java/com/conveyal/r5/transit/TripPattern.java index 88324db66..7c7e08224 100644 --- a/src/main/java/com/conveyal/r5/transit/TripPattern.java +++ b/src/main/java/com/conveyal/r5/transit/TripPattern.java @@ -22,8 +22,8 @@ import java.util.stream.StreamSupport; /** + * All the Trips on the same Route that have the same sequence of stops, with the same pickup/dropoff options. * This is like a Transmodel JourneyPattern. - * All the trips on the same Route that have the same sequence of stops, with the same pickup/dropoff options. */ public class TripPattern implements Serializable, Cloneable { @@ -33,6 +33,7 @@ public class TripPattern implements Serializable, Cloneable { * This is the ID of this trip pattern _in the original transport network_. This is important because if it were the * ID in this transport network the ID would depend on the order of application of scenarios, and because this ID is * used to map results back to the original network. + * TODO This concept of an "original" transport network may be obsolete, this field doesn't seem to be used anywhere. */ public int originalId; @@ -44,8 +45,7 @@ public class TripPattern implements Serializable, Cloneable { public PickDropType[] dropoffs; public BitSet wheelchairAccessible; // One bit per stop - /** TripSchedules for all trips following this pattern, sorted in ascending order by time of departure from first - * stop */ + /** TripSchedules for all trips in this pattern, sorted in ascending order by time of departure from first stop. */ public List tripSchedules = new ArrayList<>(); /** GTFS shape for this pattern. Should be left null in non-customer-facing applications */ @@ -67,8 +67,8 @@ public class TripPattern implements Serializable, Cloneable { public BitSet servicesActive = new BitSet(); /** - * index of this route in TransitLayer data. -1 if detailed route information has not been loaded - * TODO clarify what "this route" means. The route of this tripPattern? + * The index of this TripPatterns's route in the TransitLayer, or -1 if not yet loaded. + * Do we really want/need this redundant representation of routeId? 
*/ public int routeIndex = -1; @@ -132,6 +132,8 @@ public void setOrVerifyDirection (int directionId) { /** * Linear search. * @return null if no departure is possible. + * FIXME this is unused. And is active true by definition (this.servicesActive is a BitSet with serviceCode set for + * every one of this.tripSchedules)? */ TripSchedule findNextDeparture (int time, int stopOffset) { TripSchedule bestSchedule = null; @@ -177,9 +179,7 @@ public String toStringDetailed (TransitLayer transitLayer) { return sb.toString(); } - /** - * @return true when none of the supplied tripIds are on this pattern. - */ + /** @return true when none of the supplied tripIds are on this pattern. */ public boolean containsNoTrips(Set tripIds) { return this.tripSchedules.stream().noneMatch(ts -> tripIds.contains(ts.tripId)); } @@ -225,5 +225,4 @@ public List getHopGeometries(TransitLayer transitLayer) { } return geometries; } - } diff --git a/src/main/java/com/conveyal/r5/util/AsyncLoader.java b/src/main/java/com/conveyal/r5/util/AsyncLoader.java index 68d845908..4606803bb 100644 --- a/src/main/java/com/conveyal/r5/util/AsyncLoader.java +++ b/src/main/java/com/conveyal/r5/util/AsyncLoader.java @@ -46,6 +46,7 @@ public abstract class AsyncLoader { * Each cache has its own executor, so tiny things like loading grids are not held up by slower things like * linking or distance calculations. This also avoids deadlocks where one cache loads values from another cache * as part of its building process, but the sub-task can never happen because the parent task is hogging a thread. + * We may want to consider using the single TaskScheduler component which has separate heavy and light executors. */ private Executor executor = Executors.newFixedThreadPool(2); @@ -72,22 +73,22 @@ public static class LoaderState { public final String message; public final int percentComplete; public final V value; - public final Exception exception; + public final Throwable throwable; private LoaderState(Status status, String message, int percentComplete, V value) { this.status = status; this.message = message; this.percentComplete = percentComplete; this.value = value; - this.exception = null; + this.throwable = null; } - private LoaderState(Exception exception) { + private LoaderState(Throwable throwable) { this.status = Status.ERROR; - this.message = exception.toString(); + this.message = throwable.toString(); this.percentComplete = 0; this.value = null; - this.exception = exception; + this.throwable = throwable; } @Override @@ -113,7 +114,9 @@ public LoaderState get (K key) { enqueueLoadTask = true; } } - // TODO maybe use futures and get() with timeout, so fast scenario applications don't need to be retried + // Here we could potentially use futures and get() with timeout, so fast scenario applications don't require + // re-polling. We should probably unify the progress tracking, exception handling, and thread pool management + // with taskScheduler. Async loading could be kicked off once per key, then fail fast on subsequent requests. // Enqueue task outside the above block (synchronizing the fewest lines possible). if (enqueueLoadTask) { executor.execute(() -> { @@ -123,9 +126,11 @@ public LoaderState get (K key) { synchronized (map) { map.put(key, new LoaderState(Status.PRESENT, null, 100, value)); } - } catch (Exception ex) { - setError(key, ex); - LOG.error(ExceptionUtils.asString(ex)); + } catch (Throwable t) { + // It's essential to trap Throwable rather than just Exception. 
Otherwise the executor + // threads can be killed by any Error that happens, stalling the executor. + setError(key, t); + LOG.error("Async load failed: " + ExceptionUtils.stackTraceString(t)); } }); } @@ -151,12 +156,12 @@ public void setProgress(K key, int percentComplete, String message) { } /** - * Call this method inside the buildValue method to indicate progress. + * Call this method inside the buildValue method to indicate that an unrecoverable error has happened. * FIXME this will permanently associate an error with the key. No further attempt will ever be made to create the value. */ - protected void setError (K key, Exception exception) { + protected void setError (K key, Throwable throwable) { synchronized (map) { - map.put(key, new LoaderState(exception)); + map.put(key, new LoaderState(throwable)); } } } diff --git a/src/main/java/com/conveyal/r5/util/ExceptionUtils.java b/src/main/java/com/conveyal/r5/util/ExceptionUtils.java index 0221b0e98..901651e85 100644 --- a/src/main/java/com/conveyal/r5/util/ExceptionUtils.java +++ b/src/main/java/com/conveyal/r5/util/ExceptionUtils.java @@ -2,18 +2,52 @@ import java.io.PrintWriter; import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; /** * Convenience functions for working with exceptions (or more generally throwables). */ public abstract class ExceptionUtils { - public static String asString(Throwable throwable) { + /** + * Returns the output of Throwable.printStackTrace() in a String. + * This is the usual Java stack trace we're accustomed to seeing on the console. + * The throwable.printStackTrace method includes the class name and detail message, and will traverse the whole + * chain of causes showing multiple stack traces, avoiding any reference loops. The resulting string will contain + * linefeeds and tabs which must be properly handled when displaying (e.g. in HTML). + */ + public static String stackTraceString (Throwable throwable) { StringWriter sw = new StringWriter(); - sw.append(throwable.getMessage()); - sw.append("\n"); throwable.printStackTrace(new PrintWriter(sw)); return sw.toString(); } + /** + * Short-form exception summary that includes the chain of causality, reversed such that the root cause comes first. + * We might want to add line numbers of one stack frame with class simple names. 
+ */ + public static String shortCauseString (Throwable throwable) { + List items = new ArrayList<>(); + Set seen = new HashSet<>(); // Bail out if there are cycles in the cause chain + while (throwable != null && !seen.contains(throwable)) { + String item = throwable.getClass().getSimpleName(); + if (throwable.getMessage() != null) { + item += ": " + throwable.getMessage(); + } + items.add(item); + seen.add(throwable); + throwable = throwable.getCause(); + } + Collections.reverse(items); + return String.join(", caused ", items); + } + + public static String shortAndLongString (Throwable throwable) { + return shortCauseString(throwable) + "\n[detail follows]\n" + stackTraceString(throwable); + } + } diff --git a/src/main/java/com/conveyal/r5/util/S3Util.java b/src/main/java/com/conveyal/r5/util/S3Util.java deleted file mode 100644 index e888c8e4d..000000000 --- a/src/main/java/com/conveyal/r5/util/S3Util.java +++ /dev/null @@ -1,38 +0,0 @@ -package com.conveyal.r5.util; - -import com.amazonaws.services.s3.AmazonS3; -import com.amazonaws.services.s3.AmazonS3ClientBuilder; -import com.amazonaws.services.s3.model.ObjectMetadata; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.InputStream; -import java.util.concurrent.ArrayBlockingQueue; -import java.util.concurrent.ThreadPoolExecutor; -import java.util.concurrent.TimeUnit; - -/** - * Created by matthewc on 10/21/16. - */ -public class S3Util { - private static final Logger LOG = LoggerFactory.getLogger(S3Util.class); - public static final AmazonS3 s3 = AmazonS3ClientBuilder.defaultClient(); - - private static ThreadPoolExecutor executor = new ThreadPoolExecutor(10, 256, 60,TimeUnit.SECONDS, new ArrayBlockingQueue<>(255)); - // can't use CallerRunsPolicy as that would cause deadlocks, calling thread is writing to inputstream - static { - executor.setRejectedExecutionHandler(new ThreadPoolExecutor.AbortPolicy()); - } - - public static void streamToS3 (String bucket, String key, InputStream is, ObjectMetadata metadata) { - // write to S3 in a thread - executor.execute(() -> { - try { - s3.putObject(bucket, key, is, metadata); - is.close(); - } catch (Exception e) { - LOG.error("Exception writing to S3", e); - } - }); - } -} diff --git a/src/main/java/com/conveyal/r5/util/Tuple2.java b/src/main/java/com/conveyal/r5/util/Tuple2.java new file mode 100644 index 000000000..595e0f356 --- /dev/null +++ b/src/main/java/com/conveyal/r5/util/Tuple2.java @@ -0,0 +1,31 @@ +package com.conveyal.r5.util; + +import java.util.Objects; + +/** + * Generic logic for a 2-tuple of different types. + * Reduces high-maintenance boilerplate clutter when making map key types. + * TODO replace with Records in Java 16 or 17 + */ +public class Tuple2 { + public final A a; + public final B b; + + public Tuple2 (A a, B b) { + this.a = a; + this.b = b; + } + + @Override + public boolean equals (Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Tuple2 tuple2 = (Tuple2) o; + return Objects.equals(a, tuple2.a) && Objects.equals(b, tuple2.b); + } + + @Override + public int hashCode () { + return Objects.hash(a, b); + } +} diff --git a/src/main/resources/logback.xml b/src/main/resources/logback.xml index 9a893278c..955c22851 100644 --- a/src/main/resources/logback.xml +++ b/src/main/resources/logback.xml @@ -1,4 +1,4 @@ - +
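To close out the ExceptionUtils additions above, a hedged usage sketch showing the reversed cause chain that shortCauseString produces; the exception types and messages below are invented for illustration.

    // Illustrative only: the root cause is listed first, the outermost wrapper last.
    Throwable root = new java.io.IOException("connection reset");
    Throwable wrapper = new RuntimeException("failed to read OSM data", root);
    Throwable top = new TransportNetworkException("Exception occurred retrieving or building network.", wrapper);
    String summary = ExceptionUtils.shortCauseString(top);
    // summary reads roughly:
    // "IOException: connection reset, caused RuntimeException: failed to read OSM data,
    //  caused TransportNetworkException: Exception occurred retrieving or building network."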