From a70dc0cfc56d709b756a32e03dbffd29dfd0af8c Mon Sep 17 00:00:00 2001 From: Charles Yuan Date: Thu, 2 Jan 2025 13:58:17 +0800 Subject: [PATCH] add notebook --- .gitignore | 3 +- README.md | 1 + examples/parse_batch_fetch.ipynb | 206 ++++++++++++++++++++++++++++++ examples/parse_batch_fetch.py | 4 +- examples/parse_batch_upload.ipynb | 205 +++++++++++++++++++++++++++++ 5 files changed, 416 insertions(+), 3 deletions(-) create mode 100644 examples/parse_batch_fetch.ipynb create mode 100644 examples/parse_batch_upload.ipynb diff --git a/.gitignore b/.gitignore index 3ced46f..9da19ae 100644 --- a/.gitignore +++ b/.gitignore @@ -167,4 +167,5 @@ cython_debug/ # data/ *.xlsx -*.csv \ No newline at end of file +*.csv +*.jsonl \ No newline at end of file diff --git a/README.md b/README.md index 3131c25..0cd4e83 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,7 @@ Each response in the JSONL file contains: - The filename - A unique request ID - Additional processing metadata + You can later use these request IDs to retrieve the extracted content for each file: ```python diff --git a/examples/parse_batch_fetch.ipynb b/examples/parse_batch_fetch.ipynb new file mode 100644 index 0000000..bfe0b35 --- /dev/null +++ b/examples/parse_batch_fetch.ipynb @@ -0,0 +1,206 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Batch API folder fetch response Example\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# Install the libraries (ipython is used for displaying markdown in this demo)\n", + "# !pip3 install --upgrade ipython\n", + "# !pip3 install --upgrade any-parser" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import logging\n", + "import os\n", + "from concurrent.futures import ThreadPoolExecutor, as_completed\n", + "\n", + "from dotenv import load_dotenv\n", + "\n", + "from any_parser import AnyParser\n", + "\n", + "# Configure logging\n", + "logging.basicConfig(level=logging.INFO)\n", + "logger = logging.getLogger(__name__)\n", + "\n", + "# Load environment variables\n", + "load_dotenv(override=True)\n", + "\n", + "MAX_WORKER = 10" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Get API key and create parser\n", + "api_key = os.environ.get(\"CAMBIO_API_KEY\")\n", + "if not api_key:\n", + " raise ValueError(\"CAMBIO_API_KEY is not set\")\n", + "ap = AnyParser(api_key)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Read responses from JSONL file" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# Change to your real output json from parse_batch_upload.py\n", + "response_file = \"./sample_data_20250102103047.jsonl\"\n", + "with open(response_file, \"r\") as f:\n", + " responses = [json.loads(line) for line in f]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# Process responses concurrently\n", + "def process_response(response):\n", + " \"\"\"Process a single response by retrieving markdown content\"\"\"\n", + " request_id = response[\"requestId\"]\n", + " try:\n", + " markdown = ap.batches.retrieve(request_id)\n", + " if markdown:\n", + " response[\"result\"] = [markdown.result[0] if markdown.result else \"\"]\n", + " response[\"requestStatus\"] = \"COMPLETED\"\n", + " 
response[\"completionTime\"] = markdown.completionTime\n", + " except Exception as e:\n", + " logger.error(f\"Error processing {request_id}: {str(e)}\")\n", + " response[\"error\"] = [str(e)]\n", + " return response" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Updated all responses in ./sample_data_20250102103047.jsonl with markdown content\n" + ] + } + ], + "source": [ + "# Process responses concurrently\n", + "with ThreadPoolExecutor(max_workers=MAX_WORKER) as executor:\n", + " future_to_response = {\n", + " executor.submit(process_response, response): response\n", + " for response in responses\n", + " }\n", + "\n", + " updated_responses = []\n", + " for future in as_completed(future_to_response):\n", + " updated_response = future.result()\n", + " updated_responses.append(updated_response)\n", + "\n", + "# Write all updated responses back to file\n", + "with open(response_file, \"w\") as f:\n", + " for response in updated_responses:\n", + " f.write(json.dumps(response) + \"\\n\")\n", + "\n", + "print(f\"Updated all responses in {response_file} with markdown content\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print out the first row from the updated file" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "First row from updated file:\n", + "{\n", + " \"fileName\": \"Earnings-Presentation-Q2-2024.pdf\",\n", + " \"requestId\": \"cfb556cb-e5f9-4b6c-a2f7-6ba982858a92\",\n", + " \"requestStatus\": \"COMPLETED\",\n", + " \"result\": [\n", + " \"## Meta Earnings Presentation\\n## Q2 2024\\n\\ninvestor.fb.com Meta logo, consisting of a stylized infinity symbol next to the text \\\"Meta\\\"\"\n", + " ],\n", + " \"completionTime\": \"2025-01-02T04:34:56.494827+00:00\"\n", + "}\n" + ] + } + ], + "source": [ + "# Read and print first row from the updated file\n", + "with open(response_file, \"r\") as f:\n", + " first_row = json.loads(f.readline())\n", + " print(\"First row from updated file:\")\n", + " print(json.dumps(first_row, indent=2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## End of the notebook\n", + "\n", + "Check more [case studies](https://www.cambioml.com/blog) of CambioML!\n", + "\n", + "\n", + " \n", + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "any-parse", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/parse_batch_fetch.py b/examples/parse_batch_fetch.py index 4704c64..0825009 100644 --- a/examples/parse_batch_fetch.py +++ b/examples/parse_batch_fetch.py @@ -26,7 +26,7 @@ # Read responses from JSONL file # Change to your real output json from parse_batch_upload.py -response_file = "./sample_data_20241219190049.jsonl" +response_file = "./sample_data_20250102103047.jsonl" with open(response_file, "r") as f: responses = [json.loads(line) for line in f] @@ -36,7 +36,7 @@ def process_response(response): request_id = response["requestId"] try: markdown = ap.batches.retrieve(request_id) - if markdown: + if markdown: # TODO: add status check here 
response["result"] = [markdown.result[0] if markdown.result else ""] response["requestStatus"] = "COMPLETED" response["completionTime"] = markdown.completionTime diff --git a/examples/parse_batch_upload.ipynb b/examples/parse_batch_upload.ipynb new file mode 100644 index 0000000..6e29234 --- /dev/null +++ b/examples/parse_batch_upload.ipynb @@ -0,0 +1,205 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Batch API Folder Processing Upload Example" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Install the libraries (ipython is used for displaying markdown in this demo)\n", + "# !pip3 install --upgrade ipython\n", + "# !pip3 install --upgrade any-parser" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import os\n", + "from datetime import datetime\n", + "\n", + "from dotenv import load_dotenv\n", + "\n", + "from any_parser import AnyParser" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Load environment variables\n", + "load_dotenv(override=True)\n", + "\n", + "# Get API key and create parser\n", + "api_key = os.environ.get(\"CAMBIO_API_KEY\")\n", + "if not api_key:\n", + " raise ValueError(\"CAMBIO_API_KEY is not set\")\n", + "ap = AnyParser(api_key)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create Batch Request" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Upload responses saved to: ./sample_data_20250102134950.jsonl\n" + ] + } + ], + "source": [ + "# Upload folder for batch processing\n", + "WORKING_FOLDER = \"./sample_data\"\n", + "responses = ap.batches.create(WORKING_FOLDER)\n", + "\n", + "# Save responses to JSONL file with timestamp\n", + "timestamp = datetime.now().strftime(\"%Y%m%d%H%M%S\")\n", + "output_file = f\"./sample_data_{timestamp}.jsonl\"\n", + "\n", + "with open(output_file, \"w\") as f:\n", + " for response in responses:\n", + " f.write(json.dumps(response.model_dump()) + \"\\n\")\n", + "\n", + "print(f\"Upload responses saved to: {output_file}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check the first element status in the jsonl using the requestId" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking status for file: test3.pdf\n", + "Content not yet available\n" + ] + } + ], + "source": [ + "# Get first response from the JSONL file\n", + "with open(output_file, \"r\") as f:\n", + " first_response = json.loads(f.readline())\n", + "\n", + "request_id = first_response[\"requestId\"]\n", + "print(f\"Checking status for file: {first_response['fileName']}\")\n", + "\n", + "# Retrieve status using request ID\n", + "markdown = ap.batches.retrieve(request_id)\n", + "if markdown and markdown.result:\n", + " print(\"Content retrieved successfully\")\n", + "else:\n", + " print(\"Content not yet available\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note: Batch extraction is currently in beta testing. Processing time may take up to 2 hours to complete." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After 2 hours, you can check the content of the first file in the folder again" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Content retrieved successfully\n" + ] + } + ], + "source": [ + "# Retrieve status using request ID\n", + "markdown = ap.batches.retrieve(request_id)\n", + "if markdown and markdown.result:\n", + " print(\"Content retrieved successfully\")\n", + "else:\n", + " print(\"Content not yet available\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the job is completed, refer to examples/parse_batch_fetch.ipynb to fetch all responses in the jsonl file:\n", + "\n", + "https://github.com/CambioML/any-parser/blob/main/examples/parse_batch_fetch.ipynb\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## End of the notebook\n", + "\n", + "Check more [case studies](https://www.cambioml.com/blog) of CambioML!\n", + "\n", + "\n", + " \n", + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}
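
For a quick reference outside the notebooks, the end-to-end flow they demonstrate (upload a folder with `batches.create`, save the request IDs to a JSONL file, then poll `batches.retrieve` until results are ready) can be condensed into a single script. The sketch below only uses calls that appear in the notebooks above (`AnyParser`, `ap.batches.create`, `ap.batches.retrieve`, `model_dump`, the `requestId` field, `markdown.result`); the 60-second polling interval and the hard-coded paths are illustrative assumptions, not part of the API.

```python
# Minimal sketch of the upload -> poll -> fetch flow shown in the notebooks.
# Assumes CAMBIO_API_KEY is set and ./sample_data contains the files to parse;
# the polling interval and file paths below are illustrative only.
import json
import os
import time
from datetime import datetime

from any_parser import AnyParser

ap = AnyParser(os.environ["CAMBIO_API_KEY"])

# 1. Upload the folder for batch processing and record each request.
responses = ap.batches.create("./sample_data")
output_file = f"./sample_data_{datetime.now().strftime('%Y%m%d%H%M%S')}.jsonl"
with open(output_file, "w") as f:
    for response in responses:
        f.write(json.dumps(response.model_dump()) + "\n")

# 2. Poll the first request until its markdown result is available
#    (batch extraction is in beta and may take up to 2 hours).
with open(output_file, "r") as f:
    request_id = json.loads(f.readline())["requestId"]

while True:
    markdown = ap.batches.retrieve(request_id)
    if markdown and markdown.result:
        print(markdown.result[0])
        break
    time.sleep(60)  # assumed polling interval
```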