From a70dc0cfc56d709b756a32e03dbffd29dfd0af8c Mon Sep 17 00:00:00 2001 From: Charles Yuan Date: Thu, 2 Jan 2025 13:58:17 +0800 Subject: [PATCH] add notebook --- .gitignore | 3 +- README.md | 1 + examples/parse_batch_fetch.ipynb | 206 ++++++++++++++++++++++++++++++ examples/parse_batch_fetch.py | 4 +- examples/parse_batch_upload.ipynb | 205 +++++++++++++++++++++++++++++ 5 files changed, 416 insertions(+), 3 deletions(-) create mode 100644 examples/parse_batch_fetch.ipynb create mode 100644 examples/parse_batch_upload.ipynb diff --git a/.gitignore b/.gitignore index 3ced46f..9da19ae 100644 --- a/.gitignore +++ b/.gitignore @@ -167,4 +167,5 @@ cython_debug/ # data/ *.xlsx -*.csv \ No newline at end of file +*.csv +*.jsonl \ No newline at end of file diff --git a/README.md b/README.md index 3131c25..0cd4e83 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,7 @@ Each response in the JSONL file contains: - The filename - A unique request ID - Additional processing metadata + You can later use these request IDs to retrieve the extracted content for each file: ```python diff --git a/examples/parse_batch_fetch.ipynb b/examples/parse_batch_fetch.ipynb new file mode 100644 index 0000000..bfe0b35 --- /dev/null +++ b/examples/parse_batch_fetch.ipynb @@ -0,0 +1,206 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Batch API folder fetch response Example\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# Install the libraries (ipython is used for displaying markdown in this demo)\n", + "# !pip3 install --upgrade ipython\n", + "# !pip3 install --upgrade any-parser" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import logging\n", + "import os\n", + "from concurrent.futures import ThreadPoolExecutor, as_completed\n", + "\n", + "from dotenv import load_dotenv\n", + "\n", + "from any_parser import AnyParser\n", + "\n", + "# Configure logging\n", + "logging.basicConfig(level=logging.INFO)\n", + "logger = logging.getLogger(__name__)\n", + "\n", + "# Load environment variables\n", + "load_dotenv(override=True)\n", + "\n", + "MAX_WORKER = 10" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Get API key and create parser\n", + "api_key = os.environ.get(\"CAMBIO_API_KEY\")\n", + "if not api_key:\n", + " raise ValueError(\"CAMBIO_API_KEY is not set\")\n", + "ap = AnyParser(api_key)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Read responses from JSONL file" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# Change to your real output json from parse_batch_upload.py\n", + "response_file = \"./sample_data_20250102103047.jsonl\"\n", + "with open(response_file, \"r\") as f:\n", + " responses = [json.loads(line) for line in f]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# Process responses concurrently\n", + "def process_response(response):\n", + " \"\"\"Process a single response by retrieving markdown content\"\"\"\n", + " request_id = response[\"requestId\"]\n", + " try:\n", + " markdown = ap.batches.retrieve(request_id)\n", + " if markdown:\n", + " response[\"result\"] = [markdown.result[0] if markdown.result else \"\"]\n", + " response[\"requestStatus\"] = \"COMPLETED\"\n", + " 
response[\"completionTime\"] = markdown.completionTime\n", + " except Exception as e:\n", + " logger.error(f\"Error processing {request_id}: {str(e)}\")\n", + " response[\"error\"] = [str(e)]\n", + " return response" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Updated all responses in ./sample_data_20250102103047.jsonl with markdown content\n" + ] + } + ], + "source": [ + "# Process responses concurrently\n", + "with ThreadPoolExecutor(max_workers=MAX_WORKER) as executor:\n", + " future_to_response = {\n", + " executor.submit(process_response, response): response\n", + " for response in responses\n", + " }\n", + "\n", + " updated_responses = []\n", + " for future in as_completed(future_to_response):\n", + " updated_response = future.result()\n", + " updated_responses.append(updated_response)\n", + "\n", + "# Write all updated responses back to file\n", + "with open(response_file, \"w\") as f:\n", + " for response in updated_responses:\n", + " f.write(json.dumps(response) + \"\\n\")\n", + "\n", + "print(f\"Updated all responses in {response_file} with markdown content\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print out the first row from the updated file" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "First row from updated file:\n", + "{\n", + " \"fileName\": \"Earnings-Presentation-Q2-2024.pdf\",\n", + " \"requestId\": \"cfb556cb-e5f9-4b6c-a2f7-6ba982858a92\",\n", + " \"requestStatus\": \"COMPLETED\",\n", + " \"result\": [\n", + " \"## Meta Earnings Presentation\\n## Q2 2024\\n\\ninvestor.fb.com Meta logo, consisting of a stylized infinity symbol next to the text \\\"Meta\\\"\"\n", + " ],\n", + " \"completionTime\": \"2025-01-02T04:34:56.494827+00:00\"\n", + "}\n" + ] + } + ], + "source": [ + "# Read and print first row from the updated file\n", + "with open(response_file, \"r\") as f:\n", + " first_row = json.loads(f.readline())\n", + " print(\"First row from updated file:\")\n", + " print(json.dumps(first_row, indent=2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## End of the notebook\n", + "\n", + "Check more [case studies](https://www.cambioml.com/blog) of CambioML!\n", + "\n", + "\n", + " \n", + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "any-parse", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/parse_batch_fetch.py b/examples/parse_batch_fetch.py index 4704c64..0825009 100644 --- a/examples/parse_batch_fetch.py +++ b/examples/parse_batch_fetch.py @@ -26,7 +26,7 @@ # Read responses from JSONL file # Change to your real output json from parse_batch_upload.py -response_file = "./sample_data_20241219190049.jsonl" +response_file = "./sample_data_20250102103047.jsonl" with open(response_file, "r") as f: responses = [json.loads(line) for line in f] @@ -36,7 +36,7 @@ def process_response(response): request_id = response["requestId"] try: markdown = ap.batches.retrieve(request_id) - if markdown: + if markdown: # TODO: add status check here 
response["result"] = [markdown.result[0] if markdown.result else ""] response["requestStatus"] = "COMPLETED" response["completionTime"] = markdown.completionTime diff --git a/examples/parse_batch_upload.ipynb b/examples/parse_batch_upload.ipynb new file mode 100644 index 0000000..6e29234 --- /dev/null +++ b/examples/parse_batch_upload.ipynb @@ -0,0 +1,205 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Batch API Folder Processing Upload Example" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Install the libraries (ipython is used for displaying markdown in this demo)\n", + "# !pip3 install --upgrade ipython\n", + "# !pip3 install --upgrade any-parser" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import os\n", + "from datetime import datetime\n", + "\n", + "from dotenv import load_dotenv\n", + "\n", + "from any_parser import AnyParser" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Load environment variables\n", + "load_dotenv(override=True)\n", + "\n", + "# Get API key and create parser\n", + "api_key = os.environ.get(\"CAMBIO_API_KEY\")\n", + "if not api_key:\n", + " raise ValueError(\"CAMBIO_API_KEY is not set\")\n", + "ap = AnyParser(api_key)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create Batch Request" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Upload responses saved to: ./sample_data_20250102134950.jsonl\n" + ] + } + ], + "source": [ + "# Upload folder for batch processing\n", + "WORKING_FOLDER = \"./sample_data\"\n", + "responses = ap.batches.create(WORKING_FOLDER)\n", + "\n", + "# Save responses to JSONL file with timestamp\n", + "timestamp = datetime.now().strftime(\"%Y%m%d%H%M%S\")\n", + "output_file = f\"./sample_data_{timestamp}.jsonl\"\n", + "\n", + "with open(output_file, \"w\") as f:\n", + " for response in responses:\n", + " f.write(json.dumps(response.model_dump()) + \"\\n\")\n", + "\n", + "print(f\"Upload responses saved to: {output_file}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check the first element status in the jsonl using the requestId" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking status for file: test3.pdf\n", + "Content not yet available\n" + ] + } + ], + "source": [ + "# Get first response from the JSONL file\n", + "with open(output_file, \"r\") as f:\n", + " first_response = json.loads(f.readline())\n", + "\n", + "request_id = first_response[\"requestId\"]\n", + "print(f\"Checking status for file: {first_response['fileName']}\")\n", + "\n", + "# Retrieve status using request ID\n", + "markdown = ap.batches.retrieve(request_id)\n", + "if markdown and markdown.result:\n", + " print(\"Content retrieved successfully\")\n", + "else:\n", + " print(\"Content not yet available\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note: Batch extraction is currently in beta testing. Processing time may take up to 2 hours to complete." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After 2 hours, you can check the content of the first file in the folder again" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Content retrieved successfully\n" + ] + } + ], + "source": [ + "# Retrieve status using request ID\n", + "markdown = ap.batches.retrieve(request_id)\n", + "if markdown and markdown.result:\n", + " print(\"Content retrieved successfully\")\n", + "else:\n", + " print(\"Content not yet available\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the job is completed, refer to examples/parse_batch_fetch.ipynb to fetch all responses in the jsonl file:\n", + "\n", + "https://github.com/CambioML/any-parser/blob/main/examples/parse_batch_fetch.ipynb\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## End of the notebook\n", + "\n", + "Check more [case studies](https://www.cambioml.com/blog) of CambioML!\n", + "\n", + "\n", + " \n", + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}
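
For a quick reference outside the notebooks, the end-to-end flow they demonstrate (upload a folder with `batches.create`, save the request IDs to a JSONL file, then poll `batches.retrieve` until results are ready) can be condensed into a single script. The sketch below only uses calls that appear in the notebooks above (`AnyParser`, `ap.batches.create`, `ap.batches.retrieve`, `model_dump`, the `requestId` field, `markdown.result`); the 60-second polling interval and the hard-coded paths are illustrative assumptions, not part of the API.

```python
# Minimal sketch of the upload -> poll -> fetch flow shown in the notebooks.
# Assumes CAMBIO_API_KEY is set and ./sample_data contains the files to parse;
# the polling interval and file paths below are illustrative only.
import json
import os
import time
from datetime import datetime

from any_parser import AnyParser

ap = AnyParser(os.environ["CAMBIO_API_KEY"])

# 1. Upload the folder for batch processing and record each request.
responses = ap.batches.create("./sample_data")
output_file = f"./sample_data_{datetime.now().strftime('%Y%m%d%H%M%S')}.jsonl"
with open(output_file, "w") as f:
    for response in responses:
        f.write(json.dumps(response.model_dump()) + "\n")

# 2. Poll the first request until its markdown result is available
#    (batch extraction is in beta and may take up to 2 hours).
with open(output_file, "r") as f:
    request_id = json.loads(f.readline())["requestId"]

while True:
    markdown = ap.batches.retrieve(request_id)
    if markdown and markdown.result:
        print(markdown.result[0])
        break
    time.sleep(60)  # assumed polling interval
```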