Skip to content

Commit

Permalink
Update scalardb-analytics-spark-sample to support 3.14
Browse files Browse the repository at this point in the history
  • Loading branch information
choplin committed Nov 20, 2024
1 parent bdedc82 commit 990519e
Show file tree
Hide file tree
Showing 13 changed files with 139 additions and 148 deletions.
1 change: 1 addition & 0 deletions scalardb-analytics-spark-sample/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.scala_history
118 changes: 61 additions & 57 deletions scalardb-analytics-spark-sample/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,50 +1,52 @@
services:
spark-shell:
spark-sql:
build:
context: ./docker
dockerfile: Dockerfile.spark
volumes:
- ./scalardb.properties:/etc/scalardb.properties
- ./spark-defaults.conf:/opt/spark/conf/spark-defaults.conf
- ./cert.pem:/etc/cert.pem
- .scala_history_jline3:/root/.scala_history_jline3
- .scala_history:/root/.scala_history
- ~/.m2:/root/.m2 #ToDo
networks:
- scalar-network
profiles:
- dev
depends_on:
- backend-postgres
- backend-cassandra
- backend-dynamodb
- scalardb-cassandra
- scalardb-mysql
- postgres
command:
- "/opt/spark/bin/spark-shell"
- "/opt/spark/bin/spark-sql"
- "--packages"
- "com.scalar-labs:scalardb-analytics-spark-3.5_2.12:3.12.0"
- "com.scalar-labs:scalardb-analytics-spark-without-licensing-all-3.5_2.12:3.14.0,com.h2database:h2:2.2.224" #ToDo

backend-postgres:
image: postgres:15.1
ports:
- "5432"
sample-data-loader:
build:
context: sample-data-loader
dockerfile: Dockerfile
volumes:
- backend-postgres-data:/var/lib/postgresql/data
environment:
- POSTGRES_USER=postgres
- POSTGRES_PASSWORD=postgres
- POSTGRES_DB=test
- ./scalardb.properties:/etc/scalardb.properties
- ./schema.json:/etc/schema.json
- ./data:/data
working_dir: /sample-data-loader
networks:
- scalar-network
healthcheck:
test: ["CMD", "psql", "-U", "postgres", "-c", "select 1"]
interval: 1s
timeout: 1s
retries: 10
start_period: 1s
profiles:
- dev
depends_on:
- scalardb-cassandra
- scalardb-mysql
- postgres
command: ["java", "-jar", "/app.jar"]

backend-cassandra:
scalardb-cassandra:
image: cassandra:3.11
ports:
- "9042"
- 9042
volumes:
- backend-cassandra-data:/var/lib/cassandra
- scalardb-cassandra-data:/var/lib/cassandra
environment:
- CASSANDRA_DC=dc1
- CASSANDRA_ENDPOINT_SNITCH=GossipingPropertyFileSnitch
Expand All @@ -55,50 +57,52 @@ services:
interval: 1s
timeout: 1s
retries: 10
start_period: 5s
start_period: 10s

backend-dynamodb:
image: amazon/dynamodb-local:1.21.0
scalardb-mysql:
image: mysql:8.0.36
ports:
- "8000"
command:
[
"-jar",
"DynamoDBLocal.jar",
"-sharedDb",
"-dbPath",
"/home/dynamodblocal",
"-optimizeDbBeforeStartup",
]
- 3306
volumes:
- backend-dynamodb-data:/home/dynamodblocal
- scalardb-mysql-data:/var/lib/mysql
environment:
- MYSQL_ROOT_PASSWORD=mysql
- MYSQL_DATABASE=sampledb
networks:
- scalar-network
healthcheck:
test: ["CMD", "mysqladmin", "ping", "-h", "localhost", "-u", "root"]
interval: 1s
timeout: 1s
retries: 10
start_period: 5s

sample-data-loader:
build:
context: sample-data-loader
dockerfile: Dockerfile
postgres:
image: postgres:15.1
ports:
- 5432
volumes:
- ./scalardb.properties:/etc/scalardb.properties
- ./schema.json:/etc/schema.json
- ./data:/data
working_dir: /sample-data-loader
- postgres-data:/var/lib/postgresql/data
- ./data/customer.csv:/opt/customer.csv
- ./sql/postgres_copy.sql:/docker-entrypoint-initdb.d/postgres_copy.sql
environment:
- POSTGRES_USER=postgres
- POSTGRES_PASSWORD=postgres
- POSTGRES_DB=sampledb
networks:
- scalar-network
profiles:
- dev
depends_on:
- backend-postgres
- backend-cassandra
- backend-dynamodb
command: ["java", "-jar", "/app.jar"]
healthcheck:
test: ["CMD", "psql", "-U", "postgres", "-c", "select 1"]
interval: 1s
timeout: 1s
retries: 10
start_period: 5s

volumes:
analytics-data: {}
backend-postgres-data: {}
backend-cassandra-data: {}
backend-dynamodb-data: {}
scalardb-cassandra-data: {}
scalardb-mysql-data: {}
postgres-data: {}

networks:
scalar-network: {}
2 changes: 1 addition & 1 deletion scalardb-analytics-spark-sample/docker/Dockerfile.spark
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ FROM eclipse-temurin:17-jre-jammy

WORKDIR /work

ENV SPARK_VERSION 3.5.1
ENV SPARK_VERSION 3.5.3

RUN apt-get update && \
apt-get install -y --no-install-recommends \
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
plugins {
application
id("com.github.johnrengelman.shadow") version "7.1.2"
id("com.gradleup.shadow") version "8.3.5"
id("com.diffplug.spotless") version "6.24.0"
}

Expand All @@ -9,10 +9,11 @@ repositories {
}

dependencies {
implementation("com.scalar-labs:scalardb:3.12.1")
implementation("com.scalar-labs:scalardb-schema-loader:3.12.1")
implementation("com.scalar-labs:scalardb:3.14.0")
implementation("com.scalar-labs:scalardb-schema-loader:3.14.0")
implementation("org.apache.commons:commons-csv:1.10.0")


implementation("io.netty:netty-transport-native-epoll:4.1.99.Final:linux-x86_64")
implementation("io.netty:netty-transport-native-epoll:4.1.99.Final:linux-aarch_64")
implementation("io.netty:netty-transport-native-kqueue:4.1.99.Final:osx-x86_64")
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-8.5-bin.zip
distributionUrl=https\://services.gradle.org/distributions/gradle-8.10-bin.zip
networkTimeout=10000
validateDistributionUrl=true
zipStoreBase=GRADLE_USER_HOME
Expand Down
7 changes: 5 additions & 2 deletions scalardb-analytics-spark-sample/sample-data-loader/gradlew
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0
#

##############################################################################
#
Expand Down Expand Up @@ -55,7 +57,7 @@
# Darwin, MinGW, and NonStop.
#
# (3) This script is generated from the Groovy template
# https://github.com/gradle/gradle/blob/HEAD/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
# https://github.com/gradle/gradle/blob/HEAD/platforms/jvm/plugins-application/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
# within the Gradle project.
#
# You can find Gradle at https://github.com/gradle/gradle/.
Expand Down Expand Up @@ -84,7 +86,8 @@ done
# shellcheck disable=SC2034
APP_BASE_NAME=${0##*/}
# Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036)
APP_HOME=$( cd "${APP_HOME:-./}" > /dev/null && pwd -P ) || exit
APP_HOME=$( cd -P "${APP_HOME:-./}" > /dev/null && printf '%s\n' "$PWD" ) || exit

# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD=maximum
Expand Down
22 changes: 12 additions & 10 deletions scalardb-analytics-spark-sample/sample-data-loader/gradlew.bat
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
@rem See the License for the specific language governing permissions and
@rem limitations under the License.
@rem
@rem SPDX-License-Identifier: Apache-2.0
@rem

@if "%DEBUG%"=="" @echo off
@rem ##########################################################################
Expand Down Expand Up @@ -43,11 +45,11 @@ set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if %ERRORLEVEL% equ 0 goto execute

echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
echo. 1>&2
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 1>&2
echo. 1>&2
echo Please set the JAVA_HOME variable in your environment to match the 1>&2
echo location of your Java installation. 1>&2

goto fail

Expand All @@ -57,11 +59,11 @@ set JAVA_EXE=%JAVA_HOME%/bin/java.exe

if exist "%JAVA_EXE%" goto execute

echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
echo. 1>&2
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 1>&2
echo. 1>&2
echo Please set the JAVA_HOME variable in your environment to match the 1>&2
echo location of your Java installation. 1>&2

goto fail

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import com.scalar.db.api.DistributedTransaction;
import com.scalar.db.api.DistributedTransactionManager;
import com.scalar.db.api.Mutation;
import com.scalar.db.api.Put;
import com.scalar.db.exception.transaction.TransactionException;
import com.scalar.db.io.Key;
Expand All @@ -14,29 +15,18 @@
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;

public class Loader implements AutoCloseable {
private static final String CUSTOMER_DATA = "/data/customer.csv";
private static final String ORDERS_DATA = "/data/orders.csv";
private static final String LINEITEM_DATA = "/data/lineitem.csv";
private static final String CONFIG_FILE_PATH = "/etc/scalardb.properties";
private static final String SCHEMA_FILE_PATH = "/etc/schema.json";

private static final String[] CUSTOMER_COLUMNS = {
"c_custkey",
"c_name",
"c_address",
"c_nationkey",
"c_phone",
"c_acctbal",
"c_mktsegment",
"c_comment"
};

private static final String[] ORDERS_COLUMNS = {
"o_orderkey",
"o_custkey",
Expand Down Expand Up @@ -82,8 +72,6 @@ public void close() {
public void load() throws TransactionException, IOException, SchemaLoaderException {
loadSchema();

loadData(this.manager, CUSTOMER_DATA, CUSTOMER_COLUMNS, this::buildPutCustomer);

loadData(this.manager, ORDERS_DATA, ORDERS_COLUMNS, this::buildPutOrders);

loadData(this.manager, LINEITEM_DATA, LINEITEM_COLUMNS, this::buildPutLineitem);
Expand All @@ -101,25 +89,9 @@ private void loadSchema() throws SchemaLoaderException {
SchemaLoader.load(configFilePath, schemaFilePath, options, createCoordinatorTables);
}

private Put buildPutCustomer(CSVRecord record) {
return Put.newBuilder()
.namespace("dynamons")
.table("customer")
.partitionKey(Key.ofInt("c_custkey", intCol(record, "c_custkey")))
.textValue("c_name", stringCol(record, "c_name"))
.textValue("c_address", stringCol(record, "c_address"))
.intValue("c_nationkey", intCol(record, "c_nationkey"))
.textValue("c_phone", stringCol(record, "c_phone"))
.doubleValue("c_acctbal", doubleCol(record, "c_acctbal"))
.textValue("c_mktsegment", stringCol(record, "c_mktsegment"))
.textValue("c_comment", stringCol(record, "c_comment"))
.enableImplicitPreRead()
.build();
}

private Put buildPutOrders(CSVRecord record) {
return Put.newBuilder()
.namespace("postgresns")
.namespace("mysqlns")
.table("orders")
.partitionKey(Key.ofInt("o_orderkey", intCol(record, "o_orderkey")))
.intValue("o_custkey", intCol(record, "o_custkey"))
Expand Down Expand Up @@ -175,7 +147,8 @@ private void loadData(
transaction = manager.start();
for (CSVRecord record : records) {
Put put = putFunction.apply(record);
transaction.put(put);
List<Mutation> mutations = List.of(put);
transaction.mutate(mutations);
}
transaction.commit();
} catch (TransactionException e) {
Expand Down
25 changes: 7 additions & 18 deletions scalardb-analytics-spark-sample/scalardb.properties
Original file line number Diff line number Diff line change
@@ -1,29 +1,18 @@
scalar.db.storage=multi-storage
scalar.db.multi_storage.storages=cassandra,postgres,dynamodb
scalar.db.multi_storage.storages=cassandra,mysql

scalar.db.multi_storage.storages.cassandra.storage=cassandra
scalar.db.multi_storage.storages.cassandra.contact_points=backend-cassandra
scalar.db.multi_storage.storages.cassandra.contact_points=scalardb-cassandra
scalar.db.multi_storage.storages.cassandra.contact_port=9042
scalar.db.multi_storage.storages.cassandra.username=cassandra
scalar.db.multi_storage.storages.cassandra.password=cassandra

scalar.db.multi_storage.storages.postgres.storage=jdbc
scalar.db.multi_storage.storages.postgres.contact_points=jdbc:postgresql://backend-postgres:5432/test
scalar.db.multi_storage.storages.postgres.username=postgres
scalar.db.multi_storage.storages.postgres.password=postgres
scalar.db.multi_storage.storages.postgres.jdbc.connection_pool.min_idle=5
scalar.db.multi_storage.storages.postgres.jdbc.connection_pool.max_idle=10
scalar.db.multi_storage.storages.postgres.jdbc.connection_pool.max_total=25
scalar.db.multi_storage.storages.mysql.storage=jdbc
scalar.db.multi_storage.storages.mysql.contact_points=jdbc:mysql://scalardb-mysql:3306/sampledb
scalar.db.multi_storage.storages.mysql.username=root
scalar.db.multi_storage.storages.mysql.password=mysql

scalar.db.multi_storage.storages.dynamodb.contact_points=ap-northeast-1
scalar.db.multi_storage.storages.dynamodb.username=access_key_id
scalar.db.multi_storage.storages.dynamodb.password=secret_access_key
scalar.db.multi_storage.storages.dynamodb.storage=dynamo
scalar.db.multi_storage.storages.dynamodb.dynamo.endpoint_override=http://backend-dynamodb:8000
scalar.db.multi_storage.storages.dynamodb.dynamo.table_metadata.namespace=table_metadata
scalar.db.multi_storage.storages.dynamodb.dynamo.namespace.prefix=scalar_

scalar.db.multi_storage.namespace_mapping=cassandrans:cassandra,postgresns:postgres,dynamons:dynamodb
scalar.db.multi_storage.namespace_mapping=cassandrans:cassandra,mysqlns:mysql

scalar.db.multi_storage.default_storage=cassandra

Expand Down
Loading

0 comments on commit 990519e

Please sign in to comment.