From d8f93607bc00aff851db240eb2aeee5bc798e9f1 Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Fri, 13 Sep 2024 08:31:17 -0400 Subject: [PATCH] Update all ingest tests to use api for partitioning --- .github/workflows/e2e.yml | 1 + .github/workflows/ingest-test-fixtures-update-pr.yml | 1 + test_e2e/src/airtable-diff.sh | 3 +++ test_e2e/src/airtable-large.sh | 3 +++ test_e2e/src/astradb.sh | 3 +++ test_e2e/src/azure.sh | 3 +++ test_e2e/src/biomed-api.sh | 3 +++ test_e2e/src/biomed-path.sh | 3 +++ test_e2e/src/box.sh | 3 +++ test_e2e/src/confluence-diff.sh | 3 +++ test_e2e/src/confluence-large.sh | 3 +++ test_e2e/src/couchbase.sh | 3 +++ test_e2e/src/delta-table.sh | 3 +++ test_e2e/src/discord.sh | 3 +++ test_e2e/src/dropbox.sh | 3 +++ test_e2e/src/elasticsearch.sh | 3 +++ test_e2e/src/gcs.sh | 3 +++ test_e2e/src/github.sh | 3 +++ test_e2e/src/gitlab.sh | 3 +++ test_e2e/src/google-drive.sh | 3 +++ test_e2e/src/hubspot.sh | 3 +++ test_e2e/src/jira.sh | 3 +++ test_e2e/src/kafka-local.sh | 3 +++ test_e2e/src/local-embed-bedrock.sh | 3 +++ test_e2e/src/local-embed-mixedbreadai.sh | 3 +++ test_e2e/src/local-embed-octoai.sh | 3 +++ test_e2e/src/local-embed-vertexai.sh | 3 +++ test_e2e/src/local-embed-voyageai.sh | 3 +++ test_e2e/src/local-embed.sh | 3 +++ test_e2e/src/local-failed-partition.sh | 3 +++ test_e2e/src/local-single-file-basic-chunking.sh | 3 +++ test_e2e/src/local-single-file-chunk-no-orig-elements.sh | 3 +++ test_e2e/src/local-single-file-with-encoding.sh | 3 +++ .../src/local-single-file-with-pdf-infer-table-structure.sh | 3 +++ test_e2e/src/local-single-file.sh | 3 +++ test_e2e/src/local.sh | 3 +++ test_e2e/src/mongodb.sh | 3 +++ test_e2e/src/notion.sh | 3 +++ test_e2e/src/onedrive.sh | 3 +++ test_e2e/src/opensearch.sh | 3 +++ test_e2e/src/outlook.sh | 3 +++ test_e2e/src/pdf-fast-reprocess.sh | 3 +++ test_e2e/src/s3-compression.sh | 3 +++ test_e2e/src/s3-filter.sh | 3 +++ test_e2e/src/s3-minio.sh | 3 +++ test_e2e/src/s3.sh | 3 +++ test_e2e/src/salesforce.sh | 3 +++ test_e2e/src/sftp.sh | 3 +++ test_e2e/src/sharepoint-with-permissions.sh | 3 +++ test_e2e/src/sharepoint.sh | 3 +++ test_e2e/src/slack.sh | 3 +++ test_e2e/src/wikipedia.sh | 3 +++ 52 files changed, 152 insertions(+) diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml index 6384401c0..621cce8d0 100644 --- a/.github/workflows/e2e.yml +++ b/.github/workflows/e2e.yml @@ -79,6 +79,7 @@ jobs: SHAREPOINT_PERMISSIONS_TENANT: ${{secrets.SHAREPOINT_PERMISSIONS_TENANT}} SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} UNS_API_KEY: ${{ secrets.UNS_API_KEY }} + UNS_PAID_API_KEY: ${{ secrets.UNS_PAID_API_KEY }} NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} diff --git a/.github/workflows/ingest-test-fixtures-update-pr.yml b/.github/workflows/ingest-test-fixtures-update-pr.yml index 1cae88d61..c6dddf995 100644 --- a/.github/workflows/ingest-test-fixtures-update-pr.yml +++ b/.github/workflows/ingest-test-fixtures-update-pr.yml @@ -71,6 +71,7 @@ jobs: SHAREPOINT_PERMISSIONS_TENANT: ${{secrets.SHAREPOINT_PERMISSIONS_TENANT}} SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} UNS_API_KEY: ${{ secrets.UNS_API_KEY }} + UNS_PAID_API_KEY: ${{ secrets.UNS_PAID_API_KEY }} NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} diff --git a/test_e2e/src/airtable-diff.sh b/test_e2e/src/airtable-diff.sh index 01701875b..173adebf8 100755 --- a/test_e2e/src/airtable-diff.sh +++ b/test_e2e/src/airtable-diff.sh @@ -38,6 +38,9 @@ fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ airtable \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --download-dir "$DOWNLOAD_DIR" \ --personal-access-token "$AIRTABLE_PERSONAL_ACCESS_TOKEN" \ --list-of-paths "$VARIED_DATA_BASE_ID,$VARIED_DATA_BASE_ID_2" \ diff --git a/test_e2e/src/airtable-large.sh b/test_e2e/src/airtable-large.sh index 79785ebd2..38027cf17 100755 --- a/test_e2e/src/airtable-large.sh +++ b/test_e2e/src/airtable-large.sh @@ -41,6 +41,9 @@ source "$SCRIPT_DIR"/env_setup/airtable/component_ids.sh RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ airtable \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --download-dir "$DOWNLOAD_DIR" \ --personal-access-token "$AIRTABLE_PERSONAL_ACCESS_TOKEN" \ --list-of-paths "$LARGE_TEST_LIST_OF_PATHS" \ diff --git a/test_e2e/src/astradb.sh b/test_e2e/src/astradb.sh index ab979f88a..628bf5737 100755 --- a/test_e2e/src/astradb.sh +++ b/test_e2e/src/astradb.sh @@ -24,6 +24,9 @@ COLLECTION_NAME="ingest_test_src" PYTHONPATH=. ./unstructured_ingest/main.py \ astradb \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --token "$ASTRA_DB_APPLICATION_TOKEN" \ --api-endpoint "$ASTRA_DB_API_ENDPOINT" \ --collection-name "$COLLECTION_NAME" \ diff --git a/test_e2e/src/azure.sh b/test_e2e/src/azure.sh index b50a5bb72..5726069bb 100755 --- a/test_e2e/src/azure.sh +++ b/test_e2e/src/azure.sh @@ -24,6 +24,9 @@ trap cleanup EXIT RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ azure \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --download-dir "$DOWNLOAD_DIR" \ --metadata-exclude coordinates,filename,file_directory,metadata.last_modified,metadata.data_source.date_processed,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ diff --git a/test_e2e/src/biomed-api.sh b/test_e2e/src/biomed-api.sh index 96eb317af..eff0ff10a 100755 --- a/test_e2e/src/biomed-api.sh +++ b/test_e2e/src/biomed-api.sh @@ -26,6 +26,9 @@ trap cleanup EXIT RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ biomed \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --download-dir "$DOWNLOAD_DIR" \ --metadata-exclude coordinates,filename,file_directory,metadata.last_modified,metadata.data_source.date_processed,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ diff --git a/test_e2e/src/biomed-path.sh b/test_e2e/src/biomed-path.sh index ceea9548a..f3b852622 100755 --- a/test_e2e/src/biomed-path.sh +++ b/test_e2e/src/biomed-path.sh @@ -26,6 +26,9 @@ trap cleanup EXIT RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ biomed \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --download-dir "$DOWNLOAD_DIR" \ --metadata-exclude coordinates,filename,file_directory,metadata.last_modified,metadata.data_source.date_processed,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ diff --git a/test_e2e/src/box.sh b/test_e2e/src/box.sh index 9fb30cc57..441d50218 100755 --- a/test_e2e/src/box.sh +++ b/test_e2e/src/box.sh @@ -46,6 +46,9 @@ jq 'keys' <"$BOX_APP_CONFIG_PATH" RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ box \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --download-dir "$DOWNLOAD_DIR" \ --box-app-config "$BOX_APP_CONFIG_PATH" \ --remote-url box://utic-test-ingest-fixtures \ diff --git a/test_e2e/src/confluence-diff.sh b/test_e2e/src/confluence-diff.sh index da55b469d..90d504086 100755 --- a/test_e2e/src/confluence-diff.sh +++ b/test_e2e/src/confluence-diff.sh @@ -34,6 +34,9 @@ fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ confluence \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --download-dir "$DOWNLOAD_DIR" \ --metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ diff --git a/test_e2e/src/confluence-large.sh b/test_e2e/src/confluence-large.sh index 98ba7698a..b053bf764 100755 --- a/test_e2e/src/confluence-large.sh +++ b/test_e2e/src/confluence-large.sh @@ -40,6 +40,9 @@ fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ confluence \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --download-dir "$DOWNLOAD_DIR" \ --metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ diff --git a/test_e2e/src/couchbase.sh b/test_e2e/src/couchbase.sh index b1034eee1..22e853048 100755 --- a/test_e2e/src/couchbase.sh +++ b/test_e2e/src/couchbase.sh @@ -56,6 +56,9 @@ wait PYTHONPATH=. ./unstructured_ingest/main.py \ couchbase \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --metadata-exclude file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.date_created,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ --download-dir "$DOWNLOAD_DIR" \ diff --git a/test_e2e/src/delta-table.sh b/test_e2e/src/delta-table.sh index 45e88a3a3..c97ace719 100755 --- a/test_e2e/src/delta-table.sh +++ b/test_e2e/src/delta-table.sh @@ -34,6 +34,9 @@ trap cleanup EXIT RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ delta-table \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --num-processes "$max_processes" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.data_source.date_created,metadata.last_modified,metadata.date_created,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --download-dir "$DOWNLOAD_DIR" \ diff --git a/test_e2e/src/discord.sh b/test_e2e/src/discord.sh index 2d5c75351..d239894b2 100755 --- a/test_e2e/src/discord.sh +++ b/test_e2e/src/discord.sh @@ -32,6 +32,9 @@ fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ discord \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --num-processes "$max_processes" \ --metadata-exclude coordinates,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --download-dir "$DOWNLOAD_DIR" \ diff --git a/test_e2e/src/dropbox.sh b/test_e2e/src/dropbox.sh index e0bb2048d..8ad447425 100755 --- a/test_e2e/src/dropbox.sh +++ b/test_e2e/src/dropbox.sh @@ -37,6 +37,9 @@ DROPBOX_ACCESS_TOKEN=$(jq -r '.access_token' <<<"$DROPBOX_RESPONSE") RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ dropbox \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --num-processes "$max_processes" \ --download-dir "$DOWNLOAD_DIR" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ diff --git a/test_e2e/src/elasticsearch.sh b/test_e2e/src/elasticsearch.sh index 92386844e..4d915ae9e 100755 --- a/test_e2e/src/elasticsearch.sh +++ b/test_e2e/src/elasticsearch.sh @@ -40,6 +40,9 @@ wait RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ elasticsearch \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --download-dir "$DOWNLOAD_DIR" \ --metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ diff --git a/test_e2e/src/gcs.sh b/test_e2e/src/gcs.sh index 940608f06..f8b9d9d62 100755 --- a/test_e2e/src/gcs.sh +++ b/test_e2e/src/gcs.sh @@ -37,6 +37,9 @@ echo "$GCP_INGEST_SERVICE_KEY" >"$GCP_INGEST_SERVICE_KEY_FILE" RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ gcs \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --num-processes "$max_processes" \ --download-dir "$DOWNLOAD_DIR" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ diff --git a/test_e2e/src/github.sh b/test_e2e/src/github.sh index 8629b9379..cf4d5cec3 100755 --- a/test_e2e/src/github.sh +++ b/test_e2e/src/github.sh @@ -41,6 +41,9 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} #shellcheck disable=SC2086 PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ github \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --num-processes "$max_processes" \ --download-dir "$DOWNLOAD_DIR" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ diff --git a/test_e2e/src/gitlab.sh b/test_e2e/src/gitlab.sh index add6be109..34ca1dcfd 100755 --- a/test_e2e/src/gitlab.sh +++ b/test_e2e/src/gitlab.sh @@ -27,6 +27,9 @@ trap cleanup EXIT RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ gitlab \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --num-processes "$max_processes" \ --download-dir "$DOWNLOAD_DIR" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.parent_id,metadata.category_depth \ diff --git a/test_e2e/src/google-drive.sh b/test_e2e/src/google-drive.sh index 18b42fb1f..1e97c0bd6 100755 --- a/test_e2e/src/google-drive.sh +++ b/test_e2e/src/google-drive.sh @@ -38,6 +38,9 @@ echo "$GCP_INGEST_SERVICE_KEY" >"$GCP_INGEST_SERVICE_KEY_FILE" RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ google-drive \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --download-dir "$DOWNLOAD_DIR" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth,metadata.data_source.version \ --num-processes "$max_processes" \ diff --git a/test_e2e/src/hubspot.sh b/test_e2e/src/hubspot.sh index bff2bc81a..08e7c89a8 100755 --- a/test_e2e/src/hubspot.sh +++ b/test_e2e/src/hubspot.sh @@ -41,6 +41,9 @@ fi PYTHONPATH=. ./unstructured_ingest/main.py \ hubspot \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --metadata-exclude file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.date_created,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ --download-dir "$DOWNLOAD_DIR" \ diff --git a/test_e2e/src/jira.sh b/test_e2e/src/jira.sh index 9dfcb56fa..53f3e4534 100755 --- a/test_e2e/src/jira.sh +++ b/test_e2e/src/jira.sh @@ -53,6 +53,9 @@ fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ jira \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --download-dir "$DOWNLOAD_DIR" \ --metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ diff --git a/test_e2e/src/kafka-local.sh b/test_e2e/src/kafka-local.sh index 9e30f7939..4ebd59187 100755 --- a/test_e2e/src/kafka-local.sh +++ b/test_e2e/src/kafka-local.sh @@ -60,6 +60,9 @@ python "$SCRIPT_DIR"/python/test-produce-kafka-message.py up \ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ kafka \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --bootstrap-server localhost \ --download-dir "$DOWNLOAD_DIR" \ --topic "$KAFKA_TOPIC" \ diff --git a/test_e2e/src/local-embed-bedrock.sh b/test_e2e/src/local-embed-bedrock.sh index a30c720e6..031d8b3b4 100755 --- a/test_e2e/src/local-embed-bedrock.sh +++ b/test_e2e/src/local-embed-bedrock.sh @@ -27,6 +27,9 @@ fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --num-processes "$max_processes" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --output-dir "$OUTPUT_DIR" \ diff --git a/test_e2e/src/local-embed-mixedbreadai.sh b/test_e2e/src/local-embed-mixedbreadai.sh index 4a4bf83d9..67f784bd2 100755 --- a/test_e2e/src/local-embed-mixedbreadai.sh +++ b/test_e2e/src/local-embed-mixedbreadai.sh @@ -32,6 +32,9 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} # Run the ingestion script with the specified parameters PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --num-processes "$max_processes" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.record_locator.path,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --output-dir "$OUTPUT_DIR" \ diff --git a/test_e2e/src/local-embed-octoai.sh b/test_e2e/src/local-embed-octoai.sh index 3249838ad..0946dde24 100755 --- a/test_e2e/src/local-embed-octoai.sh +++ b/test_e2e/src/local-embed-octoai.sh @@ -28,6 +28,9 @@ fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --num-processes "$max_processes" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --output-dir "$OUTPUT_DIR" \ diff --git a/test_e2e/src/local-embed-vertexai.sh b/test_e2e/src/local-embed-vertexai.sh index ef4a72b75..c760176be 100755 --- a/test_e2e/src/local-embed-vertexai.sh +++ b/test_e2e/src/local-embed-vertexai.sh @@ -28,6 +28,9 @@ fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --num-processes "$max_processes" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --output-dir "$OUTPUT_DIR" \ diff --git a/test_e2e/src/local-embed-voyageai.sh b/test_e2e/src/local-embed-voyageai.sh index e2b0743ca..873808579 100755 --- a/test_e2e/src/local-embed-voyageai.sh +++ b/test_e2e/src/local-embed-voyageai.sh @@ -28,6 +28,9 @@ fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --num-processes "$max_processes" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --output-dir "$OUTPUT_DIR" \ diff --git a/test_e2e/src/local-embed.sh b/test_e2e/src/local-embed.sh index 629e4519d..a45fbb10b 100755 --- a/test_e2e/src/local-embed.sh +++ b/test_e2e/src/local-embed.sh @@ -22,6 +22,9 @@ trap cleanup EXIT RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --num-processes "$max_processes" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --output-dir "$OUTPUT_DIR" \ diff --git a/test_e2e/src/local-failed-partition.sh b/test_e2e/src/local-failed-partition.sh index 49d116ab1..7fe87b9a9 100755 --- a/test_e2e/src/local-failed-partition.sh +++ b/test_e2e/src/local-failed-partition.sh @@ -41,6 +41,9 @@ function check() { RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --num-processes "$max_processes" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --strategy fast \ diff --git a/test_e2e/src/local-single-file-basic-chunking.sh b/test_e2e/src/local-single-file-basic-chunking.sh index b0dced9e7..e2cdaa804 100755 --- a/test_e2e/src/local-single-file-basic-chunking.sh +++ b/test_e2e/src/local-single-file-basic-chunking.sh @@ -26,6 +26,9 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --chunking-strategy basic \ --chunk-overlap 20 \ --chunk-max-characters 150 \ diff --git a/test_e2e/src/local-single-file-chunk-no-orig-elements.sh b/test_e2e/src/local-single-file-chunk-no-orig-elements.sh index ccd74e58b..aa368db8b 100755 --- a/test_e2e/src/local-single-file-chunk-no-orig-elements.sh +++ b/test_e2e/src/local-single-file-chunk-no-orig-elements.sh @@ -37,6 +37,9 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --chunking-strategy by_title \ --no-chunk-include-orig-elements \ --chunk-max-characters 2000 \ diff --git a/test_e2e/src/local-single-file-with-encoding.sh b/test_e2e/src/local-single-file-with-encoding.sh index a82ab68cc..c87a9bdd0 100755 --- a/test_e2e/src/local-single-file-with-encoding.sh +++ b/test_e2e/src/local-single-file-with-encoding.sh @@ -23,6 +23,9 @@ trap cleanup EXIT RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --num-processes "$max_processes" \ --metadata-exclude filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --output-dir "$OUTPUT_DIR" \ diff --git a/test_e2e/src/local-single-file-with-pdf-infer-table-structure.sh b/test_e2e/src/local-single-file-with-pdf-infer-table-structure.sh index c4ffec385..6b2a8baf0 100755 --- a/test_e2e/src/local-single-file-with-pdf-infer-table-structure.sh +++ b/test_e2e/src/local-single-file-with-pdf-infer-table-structure.sh @@ -23,6 +23,9 @@ trap cleanup EXIT RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --num-processes "$max_processes" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --output-dir "$OUTPUT_DIR" \ diff --git a/test_e2e/src/local-single-file.sh b/test_e2e/src/local-single-file.sh index afd4f13f8..118d3848c 100755 --- a/test_e2e/src/local-single-file.sh +++ b/test_e2e/src/local-single-file.sh @@ -25,6 +25,9 @@ trap cleanup EXIT RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --num-processes "$max_processes" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --output-dir "$OUTPUT_DIR" \ diff --git a/test_e2e/src/local.sh b/test_e2e/src/local.sh index e1aa14a90..ffd245643 100755 --- a/test_e2e/src/local.sh +++ b/test_e2e/src/local.sh @@ -22,6 +22,9 @@ trap cleanup EXIT RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --num-processes "$max_processes" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --strategy hi_res \ diff --git a/test_e2e/src/mongodb.sh b/test_e2e/src/mongodb.sh index f34b6ee76..b4fd14790 100755 --- a/test_e2e/src/mongodb.sh +++ b/test_e2e/src/mongodb.sh @@ -29,6 +29,9 @@ pip install -r requirements/connectors/mongodb.txt PYTHONPATH=. ./unstructured_ingest/main.py \ mongodb \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --metadata-exclude file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.date_created,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ --download-dir "$DOWNLOAD_DIR" \ diff --git a/test_e2e/src/notion.sh b/test_e2e/src/notion.sh index 4d3a2361c..ce96b058a 100755 --- a/test_e2e/src/notion.sh +++ b/test_e2e/src/notion.sh @@ -32,6 +32,9 @@ fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ notion \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --metadata-exclude coordinates,filename,file_directory,metadata.last_modified,metadata.data_source.date_processed,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --download-dir "$DOWNLOAD_DIR" \ --notion-api-key "$NOTION_API_KEY" \ diff --git a/test_e2e/src/onedrive.sh b/test_e2e/src/onedrive.sh index 1e651a1e4..da86d7a55 100755 --- a/test_e2e/src/onedrive.sh +++ b/test_e2e/src/onedrive.sh @@ -32,6 +32,9 @@ fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ onedrive \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --download-dir "$DOWNLOAD_DIR" \ --metadata-exclude file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ diff --git a/test_e2e/src/opensearch.sh b/test_e2e/src/opensearch.sh index af6c17460..d3d71fdf3 100755 --- a/test_e2e/src/opensearch.sh +++ b/test_e2e/src/opensearch.sh @@ -38,6 +38,9 @@ wait RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ opensearch \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --download-dir "$DOWNLOAD_DIR" \ --metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ diff --git a/test_e2e/src/outlook.sh b/test_e2e/src/outlook.sh index f4ed512bc..3a5d080c9 100755 --- a/test_e2e/src/outlook.sh +++ b/test_e2e/src/outlook.sh @@ -32,6 +32,9 @@ fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ outlook \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --download-dir "$DOWNLOAD_DIR" \ --metadata-exclude file_directory,metadata.data_source.date_processed,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ diff --git a/test_e2e/src/pdf-fast-reprocess.sh b/test_e2e/src/pdf-fast-reprocess.sh index 6812e28fd..cfe9b7376 100755 --- a/test_e2e/src/pdf-fast-reprocess.sh +++ b/test_e2e/src/pdf-fast-reprocess.sh @@ -31,6 +31,9 @@ ls "$INPUT_PATH" RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ --strategy fast \ diff --git a/test_e2e/src/s3-compression.sh b/test_e2e/src/s3-compression.sh index b4d3a3a4a..8d083f73b 100755 --- a/test_e2e/src/s3-compression.sh +++ b/test_e2e/src/s3-compression.sh @@ -23,6 +23,9 @@ trap cleanup EXIT RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ s3 \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --num-processes "$max_processes" \ --download-dir "$DOWNLOAD_DIR" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ diff --git a/test_e2e/src/s3-filter.sh b/test_e2e/src/s3-filter.sh index 5617e7c77..07998d885 100755 --- a/test_e2e/src/s3-filter.sh +++ b/test_e2e/src/s3-filter.sh @@ -24,6 +24,9 @@ trap cleanup EXIT RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ s3 \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --num-processes "$max_processes" \ --download-dir "$DOWNLOAD_DIR" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ diff --git a/test_e2e/src/s3-minio.sh b/test_e2e/src/s3-minio.sh index ef9deddaf..993ed5311 100755 --- a/test_e2e/src/s3-minio.sh +++ b/test_e2e/src/s3-minio.sh @@ -36,6 +36,9 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} AWS_SECRET_ACCESS_KEY=$secret_key AWS_ACCESS_KEY_ID=$access_key \ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ s3 \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --num-processes "$max_processes" \ --download-dir "$DOWNLOAD_DIR" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.data_source.date_modified,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth,metadata.data_source.date_created \ diff --git a/test_e2e/src/s3.sh b/test_e2e/src/s3.sh index 09f0c4c00..944b3b8c5 100755 --- a/test_e2e/src/s3.sh +++ b/test_e2e/src/s3.sh @@ -26,6 +26,9 @@ trap cleanup EXIT RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ s3 \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --num-processes "$max_processes" \ --download-dir "$DOWNLOAD_DIR" \ --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ diff --git a/test_e2e/src/salesforce.sh b/test_e2e/src/salesforce.sh index 0d786fe1e..76aeb920e 100755 --- a/test_e2e/src/salesforce.sh +++ b/test_e2e/src/salesforce.sh @@ -46,6 +46,9 @@ fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ salesforce \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --categories "EmailMessage,Campaign" \ --download-dir "$DOWNLOAD_DIR" \ --username "$SALESFORCE_USERNAME" \ diff --git a/test_e2e/src/sftp.sh b/test_e2e/src/sftp.sh index a558358fa..4bdc078b5 100755 --- a/test_e2e/src/sftp.sh +++ b/test_e2e/src/sftp.sh @@ -36,6 +36,9 @@ wait RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ sftp \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --num-processes "$max_processes" \ --download-dir "$DOWNLOAD_DIR" \ --metadata-exclude file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.data_source.version \ diff --git a/test_e2e/src/sharepoint-with-permissions.sh b/test_e2e/src/sharepoint-with-permissions.sh index 4808e7a6e..5bc7cda54 100755 --- a/test_e2e/src/sharepoint-with-permissions.sh +++ b/test_e2e/src/sharepoint-with-permissions.sh @@ -42,6 +42,9 @@ fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ sharepoint \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --download-dir "$DOWNLOAD_DIR" \ --metadata-exclude file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth,metadata.data_source.permissions_data \ --num-processes "$max_processes" \ diff --git a/test_e2e/src/sharepoint.sh b/test_e2e/src/sharepoint.sh index 29e131ee7..0d8d9713a 100755 --- a/test_e2e/src/sharepoint.sh +++ b/test_e2e/src/sharepoint.sh @@ -34,6 +34,9 @@ fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ sharepoint \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --download-dir "$DOWNLOAD_DIR" \ --metadata-exclude file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \ diff --git a/test_e2e/src/slack.sh b/test_e2e/src/slack.sh index c9c61b667..12d637cd1 100755 --- a/test_e2e/src/slack.sh +++ b/test_e2e/src/slack.sh @@ -32,6 +32,9 @@ fi RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ slack \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --num-processes "$max_processes" \ --download-dir "$DOWNLOAD_DIR" \ --metadata-exclude coordinates,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ diff --git a/test_e2e/src/wikipedia.sh b/test_e2e/src/wikipedia.sh index 25e46672e..be6806c61 100755 --- a/test_e2e/src/wikipedia.sh +++ b/test_e2e/src/wikipedia.sh @@ -27,6 +27,9 @@ trap cleanup EXIT RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ wikipedia \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ --download-dir "$DOWNLOAD_DIR" \ --metadata-exclude coordinates,filename,file_directory,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ --num-processes "$max_processes" \