diff --git a/.gitignore b/.gitignore
index 3ced46f..9da19ae 100644
--- a/.gitignore
+++ b/.gitignore
@@ -167,4 +167,5 @@ cython_debug/
 # data/
 *.xlsx
-*.csv
\ No newline at end of file
+*.csv
+*.jsonl
\ No newline at end of file
diff --git a/README.md b/README.md
index 3131c25..0cd4e83 100644
--- a/README.md
+++ b/README.md
@@ -88,6 +88,7 @@ Each response in the JSONL file contains:
 - The filename
 - A unique request ID
 - Additional processing metadata
+
 You can later use these request IDs to retrieve the extracted content for each file:
 
 ```python
diff --git a/examples/parse_batch_api.ipynb b/examples/parse_batch_api.ipynb
new file mode 100644
index 0000000..e5a83f7
--- /dev/null
+++ b/examples/parse_batch_api.ipynb
@@ -0,0 +1,360 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Anyparser Batch API Example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Install the libraries (ipython is used for displaying markdown in this demo)\n",
+    "# !pip3 install --upgrade ipython\n",
+    "# !pip3 install --upgrade any-parser"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Step 1: Batch API Folder Processing Upload"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import os\n",
+    "from datetime import datetime\n",
+    "\n",
+    "from dotenv import load_dotenv\n",
+    "\n",
+    "from any_parser import AnyParser"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load environment variables\n",
+    "load_dotenv(override=True)\n",
+    "\n",
+    "# Get API key and create parser\n",
+    "api_key = os.environ.get(\"CAMBIO_API_KEY\")\n",
+    "if not api_key:\n",
+    "    raise ValueError(\"CAMBIO_API_KEY is not set\")\n",
+    "ap = AnyParser(api_key)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create a batch request"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Upload responses saved to: ./sample_data_20250103003352.jsonl\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Upload folder for batch processing\n",
+    "WORKING_FOLDER = \"./sample_data\"\n",
+    "responses = ap.batches.create(WORKING_FOLDER)\n",
+    "\n",
+    "# Save responses to JSONL file with timestamp\n",
+    "timestamp = datetime.now().strftime(\"%Y%m%d%H%M%S\")\n",
+    "output_file = f\"./sample_data_{timestamp}.jsonl\"\n",
+    "\n",
+    "with open(output_file, \"w\") as f:\n",
+    "    for response in responses:\n",
+    "        f.write(json.dumps(response.model_dump()) + \"\\n\")\n",
+    "\n",
+    "print(f\"Upload responses saved to: {output_file}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Check the status of the first element in the JSONL file using its `requestId`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Checking status for file: Earnings-Presentation-Q2-2024.pdf\n",
+      "Content not yet available\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Get first response from the JSONL file\n",
+    "with open(output_file, \"r\") as f:\n",
+    "    first_response = json.loads(f.readline())\n",
+    "\n",
+    "request_id = first_response[\"requestId\"]\n",
+    "print(f\"Checking status for file: {first_response['fileName']}\")\n",
+    "\n",
+    "# Retrieve status using request ID\n",
+    "markdown = ap.batches.retrieve(request_id)\n",
+    "if markdown and markdown.result:\n",
+    "    print(\"Content retrieved successfully\")\n",
+    "else:\n",
+    "    print(\"Content not yet available\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note: Batch extraction is currently in beta testing. Processing may take up to 2 hours to complete."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once processing has completed (up to 2 hours later), you can check the content of the first file in the folder again"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Content retrieved successfully\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Retrieve status using request ID\n",
+    "markdown = ap.batches.retrieve(request_id)\n",
+    "if markdown and markdown.result:\n",
+    "    print(\"Content retrieved successfully\")\n",
+    "else:\n",
+    "    print(\"Content not yet available\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Step 2: Batch API Folder Fetch Response\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import logging\n",
+    "import os\n",
+    "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
+    "\n",
+    "from dotenv import load_dotenv\n",
+    "\n",
+    "from any_parser import AnyParser\n",
+    "\n",
+    "# Configure logging\n",
+    "logging.basicConfig(level=logging.INFO)\n",
+    "logger = logging.getLogger(__name__)\n",
+    "\n",
+    "# Load environment variables\n",
+    "load_dotenv(override=True)\n",
+    "\n",
+    "MAX_WORKER = 10"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get API key and create parser\n",
+    "api_key = os.environ.get(\"CAMBIO_API_KEY\")\n",
+    "if not api_key:\n",
+    "    raise ValueError(\"CAMBIO_API_KEY is not set\")\n",
+    "ap = AnyParser(api_key)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Read responses from the JSONL file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Change this to the real output JSONL file produced in Step 1\n",
+    "response_file = \"./sample_data_20250102103047.jsonl\"\n",
+    "with open(response_file, \"r\") as f:\n",
+    "    responses = [json.loads(line) for line in f]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define how a single response is processed\n",
+    "def process_response(response):\n",
+    "    \"\"\"Process a single response by retrieving markdown content\"\"\"\n",
+    "    request_id = response[\"requestId\"]\n",
+    "    try:\n",
+    "        markdown = ap.batches.retrieve(request_id)\n",
+    "        if markdown and markdown.result:\n",
+    "            response[\"result\"] = [markdown.result[0]]\n",
+    "            response[\"requestStatus\"] = \"COMPLETED\"\n",
+    "            response[\"completionTime\"] = markdown.completionTime\n",
+    "    except Exception as e:\n",
+    "        logger.error(f\"Error processing {request_id}: {str(e)}\")\n",
+    "        response[\"error\"] = [str(e)]\n",
+    "    return response"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Updated all responses in ./sample_data_20250102103047.jsonl with markdown content\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Process responses concurrently\n",
+    "with ThreadPoolExecutor(max_workers=MAX_WORKER) as executor:\n",
+    "    future_to_response = {\n",
+    "        executor.submit(process_response, response): response\n",
+    "        for response in responses\n",
+    "    }\n",
+    "\n",
+    "    updated_responses = []\n",
+    "    for future in as_completed(future_to_response):\n",
+    "        updated_response = future.result()\n",
+    "        updated_responses.append(updated_response)\n",
+    "\n",
+    "# Write all updated responses back to file\n",
+    "with open(response_file, \"w\") as f:\n",
+    "    for response in updated_responses:\n",
+    "        f.write(json.dumps(response) + \"\\n\")\n",
+    "\n",
+    "print(f\"Updated all responses in {response_file} with markdown content\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Print out the first row from the updated file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First row from updated file:\n",
+      "{\n",
+      "  \"fileName\": \"Earnings-Presentation-Q2-2024.pdf\",\n",
+      "  \"requestId\": \"cfb556cb-e5f9-4b6c-a2f7-6ba982858a92\",\n",
+      "  \"requestStatus\": \"COMPLETED\",\n",
+      "  \"result\": [\n",
+      "    \"## Meta Earnings Presentation\\n## Q2 2024\\n\\ninvestor.fb.com Meta logo, consisting of a stylized infinity symbol next to the text \\\"Meta\\\"\"\n",
+      "  ],\n",
+      "  \"completionTime\": \"2025-01-02T04:34:56.494827+00:00\"\n",
+      "}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Read and print first row from the updated file\n",
+    "with open(response_file, \"r\") as f:\n",
+    "    first_row = json.loads(f.readline())\n",
+    "    print(\"First row from updated file:\")\n",
+    "    print(json.dumps(first_row, indent=2))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## End of the notebook\n",
+    "\n",
+    "Check more [case studies](https://www.cambioml.com/blog) of CambioML!"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "any-parse",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/examples/parse_batch_fetch.py b/examples/parse_batch_fetch.py
deleted file mode 100644
index 4704c64..0000000
--- a/examples/parse_batch_fetch.py
+++ /dev/null
@@ -1,65 +0,0 @@
-"""Test batch API folder fetch response"""
-
-import json
-import logging
-import os
-from concurrent.futures import ThreadPoolExecutor, as_completed
-
-from dotenv import load_dotenv
-
-from any_parser import AnyParser
-
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-# Load environment variables
-load_dotenv(override=True)
-
-MAX_WORKER = 10
-
-# Get API key and create parser
-api_key = os.environ.get("CAMBIO_API_KEY")
-if not api_key:
-    raise ValueError("CAMBIO_API_KEY is not set")
-ap = AnyParser(api_key)
-
-# Read responses from JSONL file
-# Change to your real output json from parse_batch_upload.py
-response_file = "./sample_data_20241219190049.jsonl"
-with open(response_file, "r") as f:
-    responses = [json.loads(line) for line in f]
-
-
-def process_response(response):
-    """Process a single response by retrieving markdown content"""
-    request_id = response["requestId"]
-    try:
-        markdown = ap.batches.retrieve(request_id)
-        if markdown:
-            response["result"] = [markdown.result[0] if markdown.result else ""]
else ""] - response["requestStatus"] = "COMPLETED" - response["completionTime"] = markdown.completionTime - except Exception as e: - logger.error(f"Error processing {request_id}: {str(e)}") - response["error"] = [str(e)] - return response - - -# Process responses concurrently -with ThreadPoolExecutor(max_workers=MAX_WORKER) as executor: - future_to_response = { - executor.submit(process_response, response): response for response in responses - } - - updated_responses = [] - for future in as_completed(future_to_response): - updated_response = future.result() - updated_responses.append(updated_response) - -# Write all updated responses back to file -with open(response_file, "w") as f: - for response in updated_responses: - f.write(json.dumps(response) + "\n") - -print(f"Updated all responses in {response_file} with markdown content") diff --git a/examples/parse_batch_upload.py b/examples/parse_batch_upload.py deleted file mode 100644 index d9f4cc4..0000000 --- a/examples/parse_batch_upload.py +++ /dev/null @@ -1,32 +0,0 @@ -"""Batch API Folder Processing Upload Example""" - -import json -import os -from datetime import datetime - -from dotenv import load_dotenv - -from any_parser import AnyParser - -# Load environment variables -load_dotenv(override=True) - -# Get API key and create parser -api_key = os.environ.get("CAMBIO_API_KEY") -if not api_key: - raise ValueError("CAMBIO_API_KEY is not set") -ap = AnyParser(api_key) - -# Upload folder for batch processing -WORKING_FOLDER = "./sample_data" -responses = ap.batches.create(WORKING_FOLDER) - -# Save responses to JSONL file with timestamp -timestamp = datetime.now().strftime("%Y%m%d%H%M%S") -output_file = f"./sample_data_{timestamp}.jsonl" - -with open(output_file, "w") as f: - for response in responses: - f.write(json.dumps(response.model_dump()) + "\n") - -print(f"Upload responses saved to: {output_file}")