Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move to SQL to store observations #339

Draft
wants to merge 15 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 60 additions & 49 deletions datahost-ld-openapi/deps.edn
Original file line number Diff line number Diff line change
@@ -1,56 +1,63 @@
{:paths ["resources" "src"]
:deps {datahost/shared {:local/root "../shared-deps"}

clj-http/clj-http {:mvn/version "3.12.3"}
io.github.swirrl/grafter.repository {:mvn/version "3.0.0"}
grafter/matcha.alpha {:mvn/version "0.4.0"}
dev.weavejester/medley {:mvn/version "1.7.0"}
clj-http/clj-http {:mvn/version "3.12.3"}
io.github.swirrl/grafter.repository {:mvn/version "3.0.0"}
grafter/matcha.alpha {:mvn/version "0.4.0"}
dev.weavejester/medley {:mvn/version "1.7.0"}

com.yetanalytics/flint {:mvn/version "0.2.1"
:exclusions [org.clojure/clojure
org.clojure/clojurescript]}
com.yetanalytics/flint {:mvn/version "0.2.1"
:exclusions [org.clojure/clojure
org.clojure/clojurescript]}

org.clojure/data.json {:mvn/version "2.4.0"}
metosin/malli {:mvn/version "0.11.0"}
metosin/reitit {:mvn/version "0.7.0-alpha4"}
metosin/ring-http-response {:mvn/version "0.9.3"}
ring-cors/ring-cors {:mvn/version "0.1.13"}
duratom/duratom {:mvn/version "0.5.8"}
camel-snake-kebab/camel-snake-kebab {:mvn/version "0.4.3"}
metosin/jsonista {:mvn/version "0.3.7"}
org.glassfish/jakarta.json {:mvn/version "2.0.1"}
com.apicatalog/titanium-json-ld {:mvn/version "1.3.2"}
scicloj/tablecloth {:mvn/version "7.000-beta-50"}
net.openhft/zero-allocation-hashing {:mvn/version "0.16"}
metrics-clojure/metrics-clojure {:mvn/version "2.10.0"}
com.google.cloud/google-cloud-secretmanager {:mvn/version "2.23.0"}}
org.clojure/data.json {:mvn/version "2.4.0"}
metosin/malli {:mvn/version "0.11.0"}
metosin/reitit {:mvn/version "0.7.0-alpha4"}
metosin/ring-http-response {:mvn/version "0.9.3"}
ring-cors/ring-cors {:mvn/version "0.1.13"}
duratom/duratom {:mvn/version "0.5.8"}
camel-snake-kebab/camel-snake-kebab {:mvn/version "0.4.3"}
metosin/jsonista {:mvn/version "0.3.7"}
org.glassfish/jakarta.json {:mvn/version "2.0.1"}
com.apicatalog/titanium-json-ld {:mvn/version "1.3.2"}
scicloj/tablecloth {:mvn/version "7.000-beta-50"}
net.openhft/zero-allocation-hashing {:mvn/version "0.16"}
metrics-clojure/metrics-clojure {:mvn/version "2.10.0"}
com.google.cloud/google-cloud-secretmanager {:mvn/version "2.23.0"}

com.github.seancorfield/next.jdbc {:mvn/version "1.3.894"}
com.layerware/hugsql-core {:mvn/version "0.5.3"}
com.layerware/hugsql-adapter-next-jdbc {:mvn/version "0.5.3"}

org.xerial/sqlite-jdbc {:mvn/version "3.44.1.0"}
com.h2database/h2 {:mvn/version "2.2.224"}}




:aliases
{:run {:main-opts ["-m" "tpximpact.datahost.ldapi"]}

:auth/basic {:extra-paths ["env/auth/resources"]}
:auth/basic {:extra-paths ["env/auth/resources"]}

:ldapi/docker {:extra-paths ["env/docker/resources" "env/auth/resources"]
:jvm-opts ["-Xmx4g"
"-Dcom.sun.management.jmxremote.ssl=false"
"-Dcom.sun.management.jmxremote.authenticate=false"
"-Dcom.sun.management.jmxremote.port=3007"
;;"-Dlog4j.configuration=log4j2-docker.xml"
;;"-Dlog4j2.debug=true"
]
:ldapi/docker {:extra-paths ["env/docker/resources" "env/auth/resources"]
:jvm-opts ["-Xmx4g"
"-Dcom.sun.management.jmxremote.ssl=false"
"-Dcom.sun.management.jmxremote.authenticate=false"
"-Dcom.sun.management.jmxremote.port=3007"
;;"-Dlog4j.configuration=log4j2-docker.xml"
;;"-Dlog4j2.debug=true"
]

:main-opts ["-m" "tpximpact.datahost.ldapi"]}
:main-opts ["-m" "tpximpact.datahost.ldapi"]}

:build {:deps {io.github.clojure/tools.build {:git/tag "v0.9.4" :git/sha "76b78fe"}
io.github.seancorfield/build-clj {:git/tag "v0.9.2" :git/sha "9c9f078"}
:build {:deps {io.github.clojure/tools.build {:git/tag "v0.9.4" :git/sha "76b78fe"}
io.github.seancorfield/build-clj {:git/tag "v0.9.2" :git/sha "9c9f078"}

io.github.juxt/pack.alpha {:git/sha "802b3d6347376db51093d122eb4b8cf8a7bbd7cf"}
com.google.cloud.tools/jib-core {:mvn/version "0.23.0"}
}
:ns-default build}
io.github.juxt/pack.alpha {:git/sha "802b3d6347376db51093d122eb4b8cf8a7bbd7cf"}
com.google.cloud.tools/jib-core {:mvn/version "0.23.0"}
}
:ns-default build}

:test {:extra-paths ["test" "env/test/resources"]
:extra-deps {lambdaisland/kaocha {:mvn/version "1.85.1342"}
Expand All @@ -67,19 +74,23 @@
:test-watch {:extra-deps {lambdaisland/kaocha {:mvn/version "1.85.1342"}}
:exec-fn kaocha.runner/exec-fn
:exec-args {:watch? true
:skip-meta [:pending]
:fail-fast? true}}
:skip-meta [:pending]
:fail-fast? true}}
:test/unit {:exec-args {:tests [{:id :unit
:skip-meta [:hurl]}]}}
:skip-meta [:hurl]}]}}
:test/integration {:exec-args {:tests [{:id :integration
:focus-meta [:hurl]}]
:plugins []
:reporter kaocha.report/documentation}}
:focus-meta [:hurl]}]
:plugins []
:reporter kaocha.report/documentation}}
:test/sql {:exec-args {:tests [{:id :sql
:focus-meta [:sql]}]
:plugins []
:reporter kaocha.report/documentation}}

:dev {:extra-paths ["env/dev/src" "env/test/resources" "test"]
:extra-deps {integrant/repl {:mvn/version "0.3.2"}
org.clojure/test.check {:mvn/version "1.1.1"}
grafter/vocabularies {:mvn/version "0.3.9"}
vvvvalvalval/scope-capture {:mvn/version "0.3.3"}
:dev {:extra-paths ["env/dev/src" "env/test/resources" "test"]
:extra-deps {integrant/repl {:mvn/version "0.3.2"}
org.clojure/test.check {:mvn/version "1.1.1"}
grafter/vocabularies {:mvn/version "0.3.9"}
vvvvalvalval/scope-capture {:mvn/version "0.3.3"}
org.clojure/data.csv {:mvn/version "1.0.1"}}
:jvm-opts ["-Dclojure.tools.logging.factory=clojure.tools.logging.impl/log4j2-factory"]}}}
:jvm-opts ["-Dclojure.tools.logging.factory=clojure.tools.logging.impl/log4j2-factory"]}}}
8 changes: 8 additions & 0 deletions datahost-ld-openapi/doc/decision-log.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,11 @@ The latest agreements about the entity/update model are these:
**Re: Fuseki**
- We'll stick with the RDF4J native store mounted on its own volume as the triplestore for now
- We realized fuseki/JENA likely won't be the correct solution for the full production system anyway, and the RDF4J native store can meet our needs for this phase, so there is no point in adding more complexity to our deployment and development by adding a separate db right now

## December 14, 2023

**Re: Using SQL for dataset storage**
- Tablecloth's in-memory data structures are convenient, but they obviously limit the size of the ingested data.
- We used generated data (see [issue #314](https://github.com/Swirrl/datahost-prototypes/issues/314)) to import up to 10 million rows
using H2 and SQLite databases in a spike (Tablecloth couldn't go beyond 3 million rows on an M2 MacBook Pro with 16GB RAM).

31 changes: 30 additions & 1 deletion datahost-ld-openapi/env/test/resources/test-system.edn
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
{:tpximpact.datahost.ldapi.jetty/http-port #int #or [#env "LD_API_HTTP_PORT" "3400"]
:tpximpact.datahost.ldapi.native-datastore.repo/data-directory #or [#env "LD_API_TEST_DATA_DIR" "./tmp/ld-test-db"]
:tpximpact.datahost.ldapi.store.sql/db-config {:spec "jdbc:sqlite::memory:"
:user "SA"
:password ""}

;; test rdf-base-uri intentionally uses a weird value that doesn't include `data`
;; in order to prove that RDF URIs are not necessarily tied to routing.
Expand All @@ -14,4 +17,30 @@
:tpximpact.datahost.ldapi.store.temp-file-store/store {}

:tpximpact.datahost.ldapi.router/handler
{:change-store #ig/ref :tpximpact.datahost.ldapi.store.temp-file-store/store}}
{:change-store #ig/ref :tpximpact.datahost.ldapi.store.temp-file-store/store}

:tpximpact.datahost.ldapi.test/sqlite-connection
{:db-config #ig/ref :tpximpact.datahost.ldapi.store.sql/db-config}

:tpximpact.datahost.ldapi.store.sql/store-factory
{:db-config #ig/ref :tpximpact.datahost.ldapi.store.sql/db-config
:connection #ig/ref :tpximpact.datahost.ldapi.test/sqlite-connection
:data-source #ig/ref :tpximpact.datahost.ldapi.test/data-source
:db-executor #ig/ref :tpximpact.datahost.ldapi.store.sql/executor}

:tpximpact.datahost.ldapi.test/data-source
{:connection #ig/ref :tpximpact.datahost.ldapi.test/sqlite-connection}

:tpximpact.datahost.ldapi.test/sql-db
{:db-config #ig/ref :tpximpact.datahost.ldapi.store.sql/db-config
:store-factory #ig/ref :tpximpact.datahost.ldapi.store.sql/store-factory
:connection #ig/ref :tpximpact.datahost.ldapi.test/sqlite-connection
:db-executor #ig/ref :tpximpact.datahost.ldapi.store.sql/executor
:data-source #ig/ref :tpximpact.datahost.ldapi.test/data-source}

;; method for this key will create the necessary tables
;; and make them available for test
:tpximpact.datahost.ldapi.models.release/common-tables
{:db {:db-config #ig/ref :tpximpact.datahost.ldapi.store.sql/db-config
:db-executor #ig/ref :tpximpact.datahost.ldapi.store.sql/executor
:data-source #ig/ref :tpximpact.datahost.ldapi.test/data-source}}}
20 changes: 20 additions & 0 deletions datahost-ld-openapi/hurl-scripts/common/schema-name-age-2.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"@type": "dh:TableSchema",
"appropriate-csvw:modeling-of-dialect": "UTF-8,RFC4180",
"dh:appliesToRelease": "http://localhost:3000/data/test-032/releases/release-1",
"dcterms:title": "Fun schema, different title",
"dh:columns": [
{
"@type": "dh:DimensionColumn",
"csvw:datatype": "string",
"csvw:name": "name",
"csvw:titles": "name"
},
{
"@type": "dh:MeasureColumn",
"csvw:datatype": "int",
"csvw:name": "age",
"csvw:titles": "age"
}
]
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
# Description: We try to append a data with incorrectly formatted 'double' value,
# and that should be rejected by the API.

PUT {{scheme}}://{{host_name}}/data/{{series}}
Content-Type: application/json
{"dcterms:description": "Crimes Data - Float coercion test",
Expand Down Expand Up @@ -63,23 +60,4 @@ HTTP 201
revision_url: header "Location"

GET {{scheme}}://{{host_name}}{{revision_url}}
Accept: application/json

HTTP 200

## One append

POST {{scheme}}://{{host_name}}{{revision_url}}/appends
Accept: application/json
Content-Type: text/csv
[QueryStringParams]
description: Add data for liverpool
format: text/csv
```
area,age_range,estimate
liverpool,16-20,103.5
manchester,16-20,invalid_double
leeds,16-20,99.2
```

HTTP 400
Accept: application/json
31 changes: 31 additions & 0 deletions datahost-ld-openapi/hurl-scripts/int-003.hurl
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Description: Basic smoke tests for release data.
#
# Flow of this script:
#   1. Request a release metadata document before this script has created
#      anything, and expect a 404 whose body contains "Not found".
#   2. Create the series with PUT and expect 201.
#   3. Create release "release-1" under that series with PUT and expect 201.


# NOTE(review): 'medatata' looks like a typo for 'metadata' -- confirm whether
# the misspelt path is intentional.
GET {{scheme}}://{{host_name}}/data/{{series}}/release/release-1-medatata.json

HTTP 404
[Asserts]
body matches /Not found/

# Create the series itself.
PUT {{scheme}}://{{host_name}}/data/{{series}}
Accept: application/json
Content-Type: application/json
{
"dcterms:title": "Test Series",
"dcterms:description": "A very simple series"
}

# The 'dh:baseEntity' value of the response is captured as `dataset`;
# it is not referenced again within this script.
HTTP 201
[Captures]
dataset: jsonpath "$['dh:baseEntity']"

# Create release 'release-1' under the series created above.
PUT {{scheme}}://{{host_name}}/data/{{series}}/release/release-1
Accept: application/json
Content-Type: application/json
{
"dcterms:title": "Test Release",
"dcterms:description": "A very simple Release"
}

HTTP 201


26 changes: 26 additions & 0 deletions datahost-ld-openapi/hurl-scripts/int-005/int-005.hurl
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Description: Ensure we accept 'double' values formatted without the decimal point.
#
# Flow of this script:
#   1. Append three CSV rows to revision 1; the 'manchester' row carries its
#      estimate as '100' (no decimal point). Expect 201.
#   2. Fetch revision 1 as CSV and check that the same value is rendered
#      as '100.0'.


POST {{scheme}}://{{host_name}}/data/{{series}}/release/{{release}}/revision/1/appends
Accept: application/json
Content-Type: text/csv
[QueryStringParams]
description: Add data for liverpool
format: text/csv
```
area,age_range,estimate
liverpool,16-20,103.5
manchester,16-20,100
leeds,16-20,99.2
```

HTTP 201

GET {{scheme}}://{{host_name}}/data/{{series}}/release/{{release}}/revision/1
Accept: text/csv

# Let's ensure the double '100' does not get turned into an integer

# The dot is escaped so that only a literal '100.0' satisfies the assertion;
# an unescaped '.' would match any character in that position.
HTTP 200
[Asserts]
body matches /manchester,16-20,100\.0/
1 change: 1 addition & 0 deletions datahost-ld-openapi/hurl-scripts/int-005/setup.ref
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
common/setup-area+age-range+estimate.hurl
Loading