Skip to content

Commit

Permalink
add notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
Charles Yuan authored and Charles Yuan committed Jan 2, 2025
1 parent 6e24a85 commit a70dc0c
Show file tree
Hide file tree
Showing 5 changed files with 416 additions and 3 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -167,4 +167,5 @@ cython_debug/

# data/
*.xlsx
*.csv
*.csv
*.jsonl
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ Each response in the JSONL file contains:
- The filename
- A unique request ID
- Additional processing metadata

You can later use these request IDs to retrieve the extracted content for each file:

```python
Expand Down
206 changes: 206 additions & 0 deletions examples/parse_batch_fetch.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Batch API: Fetch Folder Batch Responses — Example\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Install the libraries (ipython is used for displaying markdown in this demo)\n",
"# !pip3 install --upgrade ipython\n",
"# !pip3 install --upgrade any-parser"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import logging\n",
"import os\n",
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
"\n",
"from dotenv import load_dotenv\n",
"\n",
"from any_parser import AnyParser\n",
"\n",
"# Pull environment variables (e.g. CAMBIO_API_KEY) from a local .env file\n",
"load_dotenv(override=True)\n",
"\n",
"# Basic logging so per-request failures are visible in the cells below\n",
"logging.basicConfig(level=logging.INFO)\n",
"logger = logging.getLogger(__name__)\n",
"\n",
"# Thread-pool size used for concurrent batch retrieval\n",
"MAX_WORKER = 10"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Read the API key from the environment and build the parser client.\n",
"# os.getenv is an alias of os.environ.get, so a missing variable yields None.\n",
"api_key = os.getenv(\"CAMBIO_API_KEY\")\n",
"if not api_key:\n",
"    raise ValueError(\"CAMBIO_API_KEY is not set\")\n",
"ap = AnyParser(api_key)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Read responses from JSONL file"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# Point this at the JSONL file produced by parse_batch_upload.py\n",
"response_file = \"./sample_data_20250102103047.jsonl\"\n",
"\n",
"# One JSON record per line\n",
"responses = []\n",
"with open(response_file) as f:\n",
"    for line in f:\n",
"        responses.append(json.loads(line))"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Worker used by the thread pool below\n",
"def process_response(response):\n",
"    \"\"\"Retrieve the extracted markdown for one batch request.\n",
"\n",
"    Mutates and returns `response`: on success adds `result`,\n",
"    `requestStatus`, and `completionTime`; on failure adds `error`.\n",
"    If no result is available yet, the record is left unchanged and a\n",
"    warning is logged instead of silently keeping a stale status.\n",
"    \"\"\"\n",
"    request_id = response[\"requestId\"]\n",
"    try:\n",
"        markdown = ap.batches.retrieve(request_id)\n",
"        if markdown:\n",
"            # retrieve() returns the full result list; keep only the first entry\n",
"            response[\"result\"] = [markdown.result[0] if markdown.result else \"\"]\n",
"            response[\"requestStatus\"] = \"COMPLETED\"\n",
"            response[\"completionTime\"] = markdown.completionTime\n",
"        else:\n",
"            # Nothing returned yet (request still in flight); make the\n",
"            # pending state visible rather than failing silently.\n",
"            logger.warning(f\"No result yet for {request_id}; leaving status unchanged\")\n",
"    except Exception as e:\n",
"        logger.error(f\"Error processing {request_id}: {str(e)}\")\n",
"        response[\"error\"] = [str(e)]\n",
"    return response"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Updated all responses in ./sample_data_20250102103047.jsonl with markdown content\n"
]
}
],
"source": [
"# Fetch every request's result in parallel; collection order follows\n",
"# completion order, not submission order.\n",
"with ThreadPoolExecutor(max_workers=MAX_WORKER) as executor:\n",
"    futures = [executor.submit(process_response, r) for r in responses]\n",
"    updated_responses = [future.result() for future in as_completed(futures)]\n",
"\n",
"# Persist the enriched records back to the same JSONL file\n",
"with open(response_file, \"w\") as f:\n",
"    for record in updated_responses:\n",
"        f.write(json.dumps(record) + \"\\n\")\n",
"\n",
"print(f\"Updated all responses in {response_file} with markdown content\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Print out the first row from the updated file"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"First row from updated file:\n",
"{\n",
" \"fileName\": \"Earnings-Presentation-Q2-2024.pdf\",\n",
" \"requestId\": \"cfb556cb-e5f9-4b6c-a2f7-6ba982858a92\",\n",
" \"requestStatus\": \"COMPLETED\",\n",
" \"result\": [\n",
" \"## Meta Earnings Presentation\\n## Q2 2024\\n\\ninvestor.fb.com Meta logo, consisting of a stylized infinity symbol next to the text \\\"Meta\\\"\"\n",
" ],\n",
" \"completionTime\": \"2025-01-02T04:34:56.494827+00:00\"\n",
"}\n"
]
}
],
"source": [
"# Sanity check: parse and pretty-print the first record of the rewritten file\n",
"with open(response_file) as f:\n",
"    first_row = json.loads(next(f))\n",
"\n",
"print(\"First row from updated file:\")\n",
"print(json.dumps(first_row, indent=2))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## End of the notebook\n",
"\n",
"Check more [case studies](https://www.cambioml.com/blog) of CambioML!\n",
"\n",
"<a href=\"https://www.cambioml.com/\" title=\"Title\">\n",
" <img src=\"./sample_data/cambioml_logo_large.png\" style=\"height: 100px; display: block; margin-left: auto; margin-right: auto;\"/>\n",
"</a>"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "any-parse",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
4 changes: 2 additions & 2 deletions examples/parse_batch_fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

# Read responses from JSONL file
# Change to your real output json from parse_batch_upload.py
response_file = "./sample_data_20241219190049.jsonl"
response_file = "./sample_data_20250102103047.jsonl"
with open(response_file, "r") as f:
responses = [json.loads(line) for line in f]

Expand All @@ -36,7 +36,7 @@ def process_response(response):
request_id = response["requestId"]
try:
markdown = ap.batches.retrieve(request_id)
if markdown:
if markdown: # TODO: add status check here
response["result"] = [markdown.result[0] if markdown.result else ""]
response["requestStatus"] = "COMPLETED"
response["completionTime"] = markdown.completionTime
Expand Down
Loading

0 comments on commit a70dc0c

Please sign in to comment.