-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Charles Yuan
authored and
Charles Yuan
committed
Jan 2, 2025
1 parent
6e24a85
commit a70dc0c
Showing
5 changed files
with
416 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -167,4 +167,5 @@ cython_debug/ | |
|
||
# data/ | ||
*.xlsx | ||
*.csv | ||
*.csv | ||
*.jsonl |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,206 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"### Batch API Example: Fetching Folder Responses\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 15, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Install the libraries (ipython is used for displaying markdown in this demo)\n", | ||
"# !pip3 install --upgrade ipython\n", | ||
"# !pip3 install --upgrade any-parser" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 16, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import json\n", | ||
"import logging\n", | ||
"import os\n", | ||
"from concurrent.futures import ThreadPoolExecutor, as_completed\n", | ||
"\n", | ||
"from dotenv import load_dotenv\n", | ||
"\n", | ||
"from any_parser import AnyParser\n", | ||
"\n", | ||
"# Configure logging at INFO level; `logger` is used below to report retrieval errors\n", | ||
"logging.basicConfig(level=logging.INFO)\n", | ||
"logger = logging.getLogger(__name__)\n", | ||
"\n", | ||
"# Load environment variables from a .env file\n", | ||
"# (override=True lets .env values replace variables already set in the environment)\n", | ||
"load_dotenv(override=True)\n", | ||
"\n", | ||
"# Thread-pool size used when retrieving batch results concurrently\n", | ||
"MAX_WORKER = 10" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 17, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Get API key and create parser\n", | ||
"api_key = os.environ.get(\"CAMBIO_API_KEY\")\n", | ||
"if not api_key:\n", | ||
" raise ValueError(\"CAMBIO_API_KEY is not set\")\n", | ||
"ap = AnyParser(api_key)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Read responses from JSONL file" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 18, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Change to your real output json from parse_batch_upload.py\n", | ||
"response_file = \"./sample_data_20250102103047.jsonl\"\n", | ||
"with open(response_file, \"r\") as f:\n", | ||
" responses = [json.loads(line) for line in f]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 19, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Process responses concurrently\n", | ||
"def process_response(response):\n", | ||
" \"\"\"Process a single response by retrieving markdown content\"\"\"\n", | ||
" request_id = response[\"requestId\"]\n", | ||
" try:\n", | ||
" markdown = ap.batches.retrieve(request_id)\n", | ||
" if markdown:\n", | ||
" response[\"result\"] = [markdown.result[0] if markdown.result else \"\"]\n", | ||
" response[\"requestStatus\"] = \"COMPLETED\"\n", | ||
" response[\"completionTime\"] = markdown.completionTime\n", | ||
" except Exception as e:\n", | ||
" logger.error(f\"Error processing {request_id}: {str(e)}\")\n", | ||
" response[\"error\"] = [str(e)]\n", | ||
" return response" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 20, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Updated all responses in ./sample_data_20250102103047.jsonl with markdown content\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Process responses concurrently\n", | ||
"with ThreadPoolExecutor(max_workers=MAX_WORKER) as executor:\n", | ||
" future_to_response = {\n", | ||
" executor.submit(process_response, response): response\n", | ||
" for response in responses\n", | ||
" }\n", | ||
"\n", | ||
" updated_responses = []\n", | ||
" for future in as_completed(future_to_response):\n", | ||
" updated_response = future.result()\n", | ||
" updated_responses.append(updated_response)\n", | ||
"\n", | ||
"# Write all updated responses back to file\n", | ||
"with open(response_file, \"w\") as f:\n", | ||
" for response in updated_responses:\n", | ||
" f.write(json.dumps(response) + \"\\n\")\n", | ||
"\n", | ||
"print(f\"Updated all responses in {response_file} with markdown content\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Print out the first row from the updated file" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 21, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"First row from updated file:\n", | ||
"{\n", | ||
" \"fileName\": \"Earnings-Presentation-Q2-2024.pdf\",\n", | ||
" \"requestId\": \"cfb556cb-e5f9-4b6c-a2f7-6ba982858a92\",\n", | ||
" \"requestStatus\": \"COMPLETED\",\n", | ||
" \"result\": [\n", | ||
" \"## Meta Earnings Presentation\\n## Q2 2024\\n\\ninvestor.fb.com Meta logo, consisting of a stylized infinity symbol next to the text \\\"Meta\\\"\"\n", | ||
" ],\n", | ||
" \"completionTime\": \"2025-01-02T04:34:56.494827+00:00\"\n", | ||
"}\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Read and print first row from the updated file\n", | ||
"with open(response_file, \"r\") as f:\n", | ||
" first_row = json.loads(f.readline())\n", | ||
" print(\"First row from updated file:\")\n", | ||
" print(json.dumps(first_row, indent=2))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## End of the notebook\n", | ||
"\n", | ||
"Check out more [case studies](https://www.cambioml.com/blog) from CambioML!\n", | ||
"\n", | ||
"<a href=\"https://www.cambioml.com/\" title=\"Title\">\n", | ||
" <img src=\"./sample_data/cambioml_logo_large.png\" style=\"height: 100px; display: block; margin-left: auto; margin-right: auto;\"/>\n", | ||
"</a>" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "any-parse", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.15" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.