From f82d8724164c530fdb80fdd447e185ac2a394876 Mon Sep 17 00:00:00 2001
From: raphaelauv
Date: Tue, 18 Jun 2024 18:28:35 +0200
Subject: [PATCH] doc: full iceberg-rest example

---
 README.md                     |  4 +++
 example/README.md             | 15 +++++++++
 example/docker-compose.yaml   | 58 +++++++++++++++++++++++++++++++++++
 example/iceberg_s3_example.py | 42 +++++++++++++++++++++++++
 4 files changed, 119 insertions(+)
 create mode 100644 example/README.md
 create mode 100644 example/docker-compose.yaml
 create mode 100644 example/iceberg_s3_example.py

diff --git a/README.md b/README.md
index a9d3599..1ddb515 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,10 @@ export CATALOG_CATALOG__IMPL=org.apache.iceberg.aws.glue.GlueCatalog
 java -jar ./build/libs/iceberg-rest-image-all.jar
 ```
 
+## Example
+
+How to run an iceberg-rest service using a centralized external database -> [example](./example/README.md)
+
 ## Browse
 
 To browse the catalog, you can use `pyiceberg`:
diff --git a/example/README.md b/example/README.md
new file mode 100644
index 0000000..995c520
--- /dev/null
+++ b/example/README.md
@@ -0,0 +1,15 @@
+# iceberg-rest example
+
+How to start the example stack:
+
+```shell
+docker compose up -d
+```
+
+Then install the client, download the sample data, and run the script:
+
+```shell
+pip install "pyiceberg[s3fs,pyarrow]"
+curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet -o /tmp/yellow_tripdata_2023-01.parquet
+python3 iceberg_s3_example.py
+```
\ No newline at end of file
diff --git a/example/docker-compose.yaml b/example/docker-compose.yaml
new file mode 100644
index 0000000..b04d71f
--- /dev/null
+++ b/example/docker-compose.yaml
@@ -0,0 +1,58 @@
+services:
+  minio:
+    image: minio/minio:RELEASE.2024-05-27T19-17-46Z
+    command: server /data --console-address ":9001"
+    ports:
+      - "9020:9000"
+      - "9021:9001"
+    environment:
+      MINIO_ROOT_USER: admin
+      MINIO_ROOT_PASSWORD: adminadmin
+      MINIO_SITE_REGION: eu-west-3
+
+  createbuckets:
+    image: minio/mc
+    depends_on:
+      - minio
+    entrypoint: >
+      /bin/sh -c "
+      echo sleep 10;
+      sleep 10;
+      /usr/bin/mc alias set myminio http://minio:9000 admin adminadmin;
+      /usr/bin/mc mb myminio/test-bucket;
+      exit 0;
+      "
+
+  iceberg_rest:
+    image: tabulario/iceberg-rest
+    depends_on:
+      iceberg_rest-db:
+        condition: service_healthy
+    ports:
+      - 8181:8181
+    environment:
+      - AWS_ACCESS_KEY_ID=admin
+      - AWS_SECRET_ACCESS_KEY=adminadmin
+      - AWS_REGION=eu-west-3
+      - CATALOG_WAREHOUSE=s3://test-bucket/
+      - CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO
+      - CATALOG_S3_ENDPOINT=http://minio:9000
+      - CATALOG_S3_PATH__STYLE__ACCESS=true
+      - CATALOG_CATALOG__IMPL=org.apache.iceberg.jdbc.JdbcCatalog
+      - CATALOG_URI=jdbc:postgresql://iceberg_rest-db:5432/iceberg_restdb
+      - CATALOG_JDBC_USER=iceberg_rest
+      - CATALOG_JDBC_PASSWORD=password
+
+  iceberg_rest-db:
+    image: postgres:15.2
+    init: true
+    environment:
+      POSTGRES_DB: iceberg_restdb
+      POSTGRES_USER: iceberg_rest
+      POSTGRES_PASSWORD: password
+    ports:
+      - "5436:5432"
+    healthcheck:
+      test: [ "CMD", "pg_isready", "-U", "iceberg_rest", "-d", "iceberg_restdb" ]
+      interval: 5s
+      retries: 5
diff --git a/example/iceberg_s3_example.py b/example/iceberg_s3_example.py
new file mode 100644
index 0000000..a48cc43
--- /dev/null
+++ b/example/iceberg_s3_example.py
@@ -0,0 +1,42 @@
+# pip install "pyiceberg[s3fs,pyarrow]"
+# curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet -o /tmp/yellow_tripdata_2023-01.parquet
+
+import os
+
+os.environ["AWS_DEFAULT_REGION"] = "eu-west-3"
+os.environ["AWS_REGION"] = "eu-west-3"
+os.environ["AWS_ACCESS_KEY_ID"] = "admin"
+os.environ["AWS_SECRET_ACCESS_KEY"] = "adminadmin"
+
+
+def run_iceberg():
+    """Load the sample parquet file into a new Iceberg table, then read it back."""
+    from pyiceberg.catalog.rest import RestCatalog
+
+    catalog = RestCatalog(
+        "default",
+        **{
+            "uri": "http://localhost:8181",
+            "warehouse": "s3://test-bucket/",
+            "s3.endpoint": "http://localhost:9020",
+        },
+    )
+    import pyarrow.parquet as pq
+
+    df = pq.read_table("/tmp/yellow_tripdata_2023-01.parquet")
+
+    catalog.create_namespace("default")
+    table = catalog.create_table(
+        "default.taxi_dataset",
+        schema=df.schema,
+    )
+
+    table.append(df)
+
+    table = catalog.load_table("default.taxi_dataset")
+    df = table.scan().to_arrow()
+    print(len(df))
+
+
+if __name__ == '__main__':
+    run_iceberg()