Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Update all ingest tests to use api for partitioning
Browse files: browse the repository at this point in the history
rbiseck3 committed Sep 13, 2024
1 parent 251e9f9 commit d8f9360
Showing 52 changed files with 152 additions and 0 deletions.
1 change: 1 addition & 0 deletions .github/workflows/e2e.yml
Original file line number Diff line number Diff line change
@@ -79,6 +79,7 @@ jobs:
SHAREPOINT_PERMISSIONS_TENANT: ${{secrets.SHAREPOINT_PERMISSIONS_TENANT}}
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
UNS_PAID_API_KEY: ${{ secrets.UNS_PAID_API_KEY }}
NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
1 change: 1 addition & 0 deletions .github/workflows/ingest-test-fixtures-update-pr.yml
Original file line number Diff line number Diff line change
@@ -71,6 +71,7 @@ jobs:
SHAREPOINT_PERMISSIONS_TENANT: ${{secrets.SHAREPOINT_PERMISSIONS_TENANT}}
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
UNS_PAID_API_KEY: ${{ secrets.UNS_PAID_API_KEY }}
NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
3 changes: 3 additions & 0 deletions test_e2e/src/airtable-diff.sh
Original file line number Diff line number Diff line change
@@ -38,6 +38,9 @@ fi
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
airtable \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--download-dir "$DOWNLOAD_DIR" \
--personal-access-token "$AIRTABLE_PERSONAL_ACCESS_TOKEN" \
--list-of-paths "$VARIED_DATA_BASE_ID,$VARIED_DATA_BASE_ID_2" \
3 changes: 3 additions & 0 deletions test_e2e/src/airtable-large.sh
Original file line number Diff line number Diff line change
@@ -41,6 +41,9 @@ source "$SCRIPT_DIR"/env_setup/airtable/component_ids.sh
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
airtable \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--download-dir "$DOWNLOAD_DIR" \
--personal-access-token "$AIRTABLE_PERSONAL_ACCESS_TOKEN" \
--list-of-paths "$LARGE_TEST_LIST_OF_PATHS" \
3 changes: 3 additions & 0 deletions test_e2e/src/astradb.sh
Original file line number Diff line number Diff line change
@@ -24,6 +24,9 @@ COLLECTION_NAME="ingest_test_src"

PYTHONPATH=. ./unstructured_ingest/main.py \
astradb \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--token "$ASTRA_DB_APPLICATION_TOKEN" \
--api-endpoint "$ASTRA_DB_API_ENDPOINT" \
--collection-name "$COLLECTION_NAME" \
3 changes: 3 additions & 0 deletions test_e2e/src/azure.sh
Original file line number Diff line number Diff line change
@@ -24,6 +24,9 @@ trap cleanup EXIT
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
azure \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--download-dir "$DOWNLOAD_DIR" \
--metadata-exclude coordinates,filename,file_directory,metadata.last_modified,metadata.data_source.date_processed,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--num-processes "$max_processes" \
3 changes: 3 additions & 0 deletions test_e2e/src/biomed-api.sh
Original file line number Diff line number Diff line change
@@ -26,6 +26,9 @@ trap cleanup EXIT
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
biomed \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--download-dir "$DOWNLOAD_DIR" \
--metadata-exclude coordinates,filename,file_directory,metadata.last_modified,metadata.data_source.date_processed,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--num-processes "$max_processes" \
3 changes: 3 additions & 0 deletions test_e2e/src/biomed-path.sh
Original file line number Diff line number Diff line change
@@ -26,6 +26,9 @@ trap cleanup EXIT
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
biomed \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--download-dir "$DOWNLOAD_DIR" \
--metadata-exclude coordinates,filename,file_directory,metadata.last_modified,metadata.data_source.date_processed,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--num-processes "$max_processes" \
3 changes: 3 additions & 0 deletions test_e2e/src/box.sh
Original file line number Diff line number Diff line change
@@ -46,6 +46,9 @@ jq 'keys' <"$BOX_APP_CONFIG_PATH"
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
box \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--download-dir "$DOWNLOAD_DIR" \
--box-app-config "$BOX_APP_CONFIG_PATH" \
--remote-url box://utic-test-ingest-fixtures \
3 changes: 3 additions & 0 deletions test_e2e/src/confluence-diff.sh
Original file line number Diff line number Diff line change
@@ -34,6 +34,9 @@ fi
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
confluence \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--download-dir "$DOWNLOAD_DIR" \
--metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--num-processes "$max_processes" \
3 changes: 3 additions & 0 deletions test_e2e/src/confluence-large.sh
Original file line number Diff line number Diff line change
@@ -40,6 +40,9 @@ fi
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
confluence \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--download-dir "$DOWNLOAD_DIR" \
--metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--num-processes "$max_processes" \
3 changes: 3 additions & 0 deletions test_e2e/src/couchbase.sh
Original file line number Diff line number Diff line change
@@ -56,6 +56,9 @@ wait

PYTHONPATH=. ./unstructured_ingest/main.py \
couchbase \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--metadata-exclude file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.date_created,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--num-processes "$max_processes" \
--download-dir "$DOWNLOAD_DIR" \
3 changes: 3 additions & 0 deletions test_e2e/src/delta-table.sh
Original file line number Diff line number Diff line change
@@ -34,6 +34,9 @@ trap cleanup EXIT
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
delta-table \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--num-processes "$max_processes" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.data_source.date_created,metadata.last_modified,metadata.date_created,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--download-dir "$DOWNLOAD_DIR" \
3 changes: 3 additions & 0 deletions test_e2e/src/discord.sh
Original file line number Diff line number Diff line change
@@ -32,6 +32,9 @@ fi
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
discord \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--num-processes "$max_processes" \
--metadata-exclude coordinates,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--download-dir "$DOWNLOAD_DIR" \
3 changes: 3 additions & 0 deletions test_e2e/src/dropbox.sh
Original file line number Diff line number Diff line change
@@ -37,6 +37,9 @@ DROPBOX_ACCESS_TOKEN=$(jq -r '.access_token' <<<"$DROPBOX_RESPONSE")
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
dropbox \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--num-processes "$max_processes" \
--download-dir "$DOWNLOAD_DIR" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
3 changes: 3 additions & 0 deletions test_e2e/src/elasticsearch.sh
Original file line number Diff line number Diff line change
@@ -40,6 +40,9 @@ wait
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
elasticsearch \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--download-dir "$DOWNLOAD_DIR" \
--metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--num-processes "$max_processes" \
3 changes: 3 additions & 0 deletions test_e2e/src/gcs.sh
Original file line number Diff line number Diff line change
@@ -37,6 +37,9 @@ echo "$GCP_INGEST_SERVICE_KEY" >"$GCP_INGEST_SERVICE_KEY_FILE"
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
gcs \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--num-processes "$max_processes" \
--download-dir "$DOWNLOAD_DIR" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
3 changes: 3 additions & 0 deletions test_e2e/src/github.sh
Original file line number Diff line number Diff line change
@@ -41,6 +41,9 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
#shellcheck disable=SC2086
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
github \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--num-processes "$max_processes" \
--download-dir "$DOWNLOAD_DIR" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
3 changes: 3 additions & 0 deletions test_e2e/src/gitlab.sh
Original file line number Diff line number Diff line change
@@ -27,6 +27,9 @@ trap cleanup EXIT
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
gitlab \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--num-processes "$max_processes" \
--download-dir "$DOWNLOAD_DIR" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.parent_id,metadata.category_depth \
3 changes: 3 additions & 0 deletions test_e2e/src/google-drive.sh
Original file line number Diff line number Diff line change
@@ -38,6 +38,9 @@ echo "$GCP_INGEST_SERVICE_KEY" >"$GCP_INGEST_SERVICE_KEY_FILE"
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
google-drive \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--download-dir "$DOWNLOAD_DIR" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth,metadata.data_source.version \
--num-processes "$max_processes" \
3 changes: 3 additions & 0 deletions test_e2e/src/hubspot.sh
Original file line number Diff line number Diff line change
@@ -41,6 +41,9 @@ fi

PYTHONPATH=. ./unstructured_ingest/main.py \
hubspot \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--metadata-exclude file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.date_created,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--num-processes "$max_processes" \
--download-dir "$DOWNLOAD_DIR" \
3 changes: 3 additions & 0 deletions test_e2e/src/jira.sh
Original file line number Diff line number Diff line change
@@ -53,6 +53,9 @@ fi
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
jira \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--download-dir "$DOWNLOAD_DIR" \
--metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--num-processes "$max_processes" \
3 changes: 3 additions & 0 deletions test_e2e/src/kafka-local.sh
Original file line number Diff line number Diff line change
@@ -60,6 +60,9 @@ python "$SCRIPT_DIR"/python/test-produce-kafka-message.py up \
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
kafka \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--bootstrap-server localhost \
--download-dir "$DOWNLOAD_DIR" \
--topic "$KAFKA_TOPIC" \
3 changes: 3 additions & 0 deletions test_e2e/src/local-embed-bedrock.sh
Original file line number Diff line number Diff line change
@@ -27,6 +27,9 @@ fi
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
local \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--num-processes "$max_processes" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--output-dir "$OUTPUT_DIR" \
3 changes: 3 additions & 0 deletions test_e2e/src/local-embed-mixedbreadai.sh
Original file line number Diff line number Diff line change
@@ -32,6 +32,9 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
# Run the ingestion script with the specified parameters
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
local \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--num-processes "$max_processes" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.record_locator.path,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--output-dir "$OUTPUT_DIR" \
3 changes: 3 additions & 0 deletions test_e2e/src/local-embed-octoai.sh
Original file line number Diff line number Diff line change
@@ -28,6 +28,9 @@ fi
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
local \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--num-processes "$max_processes" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--output-dir "$OUTPUT_DIR" \
3 changes: 3 additions & 0 deletions test_e2e/src/local-embed-vertexai.sh
Original file line number Diff line number Diff line change
@@ -28,6 +28,9 @@ fi
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
local \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--num-processes "$max_processes" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--output-dir "$OUTPUT_DIR" \
3 changes: 3 additions & 0 deletions test_e2e/src/local-embed-voyageai.sh
Original file line number Diff line number Diff line change
@@ -28,6 +28,9 @@ fi
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
local \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--num-processes "$max_processes" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--output-dir "$OUTPUT_DIR" \
3 changes: 3 additions & 0 deletions test_e2e/src/local-embed.sh
Original file line number Diff line number Diff line change
@@ -22,6 +22,9 @@ trap cleanup EXIT
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
local \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--num-processes "$max_processes" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--output-dir "$OUTPUT_DIR" \
3 changes: 3 additions & 0 deletions test_e2e/src/local-failed-partition.sh
Original file line number Diff line number Diff line change
@@ -41,6 +41,9 @@ function check() {
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
local \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--num-processes "$max_processes" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--strategy fast \
3 changes: 3 additions & 0 deletions test_e2e/src/local-single-file-basic-chunking.sh
Original file line number Diff line number Diff line change
@@ -26,6 +26,9 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}

PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
local \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--chunking-strategy basic \
--chunk-overlap 20 \
--chunk-max-characters 150 \
3 changes: 3 additions & 0 deletions test_e2e/src/local-single-file-chunk-no-orig-elements.sh
Original file line number Diff line number Diff line change
@@ -37,6 +37,9 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}

PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
local \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--chunking-strategy by_title \
--no-chunk-include-orig-elements \
--chunk-max-characters 2000 \
3 changes: 3 additions & 0 deletions test_e2e/src/local-single-file-with-encoding.sh
Original file line number Diff line number Diff line change
@@ -23,6 +23,9 @@ trap cleanup EXIT
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
local \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--num-processes "$max_processes" \
--metadata-exclude filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--output-dir "$OUTPUT_DIR" \
Original file line number Diff line number Diff line change
@@ -23,6 +23,9 @@ trap cleanup EXIT
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
local \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--num-processes "$max_processes" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--output-dir "$OUTPUT_DIR" \
3 changes: 3 additions & 0 deletions test_e2e/src/local-single-file.sh
Original file line number Diff line number Diff line change
@@ -25,6 +25,9 @@ trap cleanup EXIT
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
local \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--num-processes "$max_processes" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--output-dir "$OUTPUT_DIR" \
3 changes: 3 additions & 0 deletions test_e2e/src/local.sh
Original file line number Diff line number Diff line change
@@ -22,6 +22,9 @@ trap cleanup EXIT
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
local \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--num-processes "$max_processes" \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--strategy hi_res \
3 changes: 3 additions & 0 deletions test_e2e/src/mongodb.sh
Original file line number Diff line number Diff line change
@@ -29,6 +29,9 @@ pip install -r requirements/connectors/mongodb.txt

PYTHONPATH=. ./unstructured_ingest/main.py \
mongodb \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--metadata-exclude file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.date_created,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--num-processes "$max_processes" \
--download-dir "$DOWNLOAD_DIR" \
3 changes: 3 additions & 0 deletions test_e2e/src/notion.sh
Original file line number Diff line number Diff line change
@@ -32,6 +32,9 @@ fi
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
notion \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--metadata-exclude coordinates,filename,file_directory,metadata.last_modified,metadata.data_source.date_processed,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--download-dir "$DOWNLOAD_DIR" \
--notion-api-key "$NOTION_API_KEY" \
3 changes: 3 additions & 0 deletions test_e2e/src/onedrive.sh
Original file line number Diff line number Diff line change
@@ -32,6 +32,9 @@ fi
RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
onedrive \
--api-key "$UNS_PAID_API_KEY" \
--partition-by-api \
--partition-endpoint "https://api.unstructuredapp.io" \
--download-dir "$DOWNLOAD_DIR" \
--metadata-exclude file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
--num-processes "$max_processes" \
Loading

0 comments on commit d8f9360

Please sign in to comment.