Skip to content

Commit

Permalink
add notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
Charles Yuan authored and Charles Yuan committed Jan 2, 2025
1 parent 6e24a85 commit a70dc0c
Show file tree
Hide file tree
Showing 5 changed files with 416 additions and 3 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -167,4 +167,5 @@ cython_debug/

# data/
*.xlsx
*.csv
*.csv
*.jsonl
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ Each response in the JSONL file contains:
- The filename
- A unique request ID
- Additional processing metadata

You can later use these request IDs to retrieve the extracted content for each file:

```python
Expand Down
206 changes: 206 additions & 0 deletions examples/parse_batch_fetch.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Batch API: Fetch Folder Batch Responses — Example\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Install the libraries (ipython is used for displaying markdown in this demo)\n",
"# !pip3 install --upgrade ipython\n",
"# !pip3 install --upgrade any-parser"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import logging\n",
"import os\n",
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
"\n",
"from dotenv import load_dotenv\n",
"\n",
"from any_parser import AnyParser\n",
"\n",
"# Pull environment variables (e.g. CAMBIO_API_KEY) from a local .env file\n",
"load_dotenv(override=True)\n",
"\n",
"# Basic logging so per-request failures are visible in the cells below\n",
"logging.basicConfig(level=logging.INFO)\n",
"logger = logging.getLogger(__name__)\n",
"\n",
"# Thread-pool size used for concurrent batch retrieval\n",
"MAX_WORKER = 10"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Read the API key from the environment and build the parser client.\n",
"# os.getenv is an alias of os.environ.get, so a missing variable yields None.\n",
"api_key = os.getenv(\"CAMBIO_API_KEY\")\n",
"if not api_key:\n",
"    raise ValueError(\"CAMBIO_API_KEY is not set\")\n",
"ap = AnyParser(api_key)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Read responses from JSONL file"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# Point this at the JSONL file produced by parse_batch_upload.py\n",
"response_file = \"./sample_data_20250102103047.jsonl\"\n",
"\n",
"# One JSON record per line\n",
"responses = []\n",
"with open(response_file) as f:\n",
"    for line in f:\n",
"        responses.append(json.loads(line))"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Worker used by the thread pool below\n",
"def process_response(response):\n",
"    \"\"\"Retrieve the extracted markdown for one batch request.\n",
"\n",
"    Mutates and returns `response`: on success adds `result`,\n",
"    `requestStatus`, and `completionTime`; on failure adds `error`.\n",
"    If no result is available yet, the record is left unchanged and a\n",
"    warning is logged instead of silently keeping a stale status.\n",
"    \"\"\"\n",
"    request_id = response[\"requestId\"]\n",
"    try:\n",
"        markdown = ap.batches.retrieve(request_id)\n",
"        if markdown:\n",
"            # retrieve() returns the full result list; keep only the first entry\n",
"            response[\"result\"] = [markdown.result[0] if markdown.result else \"\"]\n",
"            response[\"requestStatus\"] = \"COMPLETED\"\n",
"            response[\"completionTime\"] = markdown.completionTime\n",
"        else:\n",
"            # Nothing returned yet (request still in flight); make the\n",
"            # pending state visible rather than failing silently.\n",
"            logger.warning(f\"No result yet for {request_id}; leaving status unchanged\")\n",
"    except Exception as e:\n",
"        logger.error(f\"Error processing {request_id}: {str(e)}\")\n",
"        response[\"error\"] = [str(e)]\n",
"    return response"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Updated all responses in ./sample_data_20250102103047.jsonl with markdown content\n"
]
}
],
"source": [
"# Fetch every request's result in parallel; collection order follows\n",
"# completion order, not submission order.\n",
"with ThreadPoolExecutor(max_workers=MAX_WORKER) as executor:\n",
"    futures = [executor.submit(process_response, r) for r in responses]\n",
"    updated_responses = [future.result() for future in as_completed(futures)]\n",
"\n",
"# Persist the enriched records back to the same JSONL file\n",
"with open(response_file, \"w\") as f:\n",
"    for record in updated_responses:\n",
"        f.write(json.dumps(record) + \"\\n\")\n",
"\n",
"print(f\"Updated all responses in {response_file} with markdown content\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Print out the first row from the updated file"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"First row from updated file:\n",
"{\n",
" \"fileName\": \"Earnings-Presentation-Q2-2024.pdf\",\n",
" \"requestId\": \"cfb556cb-e5f9-4b6c-a2f7-6ba982858a92\",\n",
" \"requestStatus\": \"COMPLETED\",\n",
" \"result\": [\n",
" \"## Meta Earnings Presentation\\n## Q2 2024\\n\\ninvestor.fb.com Meta logo, consisting of a stylized infinity symbol next to the text \\\"Meta\\\"\"\n",
" ],\n",
" \"completionTime\": \"2025-01-02T04:34:56.494827+00:00\"\n",
"}\n"
]
}
],
"source": [
"# Sanity check: parse and pretty-print the first record of the rewritten file\n",
"with open(response_file) as f:\n",
"    first_row = json.loads(next(f))\n",
"\n",
"print(\"First row from updated file:\")\n",
"print(json.dumps(first_row, indent=2))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## End of the notebook\n",
"\n",
"Check more [case studies](https://www.cambioml.com/blog) of CambioML!\n",
"\n",
"<a href=\"https://www.cambioml.com/\" title=\"Title\">\n",
" <img src=\"./sample_data/cambioml_logo_large.png\" style=\"height: 100px; display: block; margin-left: auto; margin-right: auto;\"/>\n",
"</a>"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "any-parse",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
4 changes: 2 additions & 2 deletions examples/parse_batch_fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

# Read responses from JSONL file
# Change to your real output json from parse_batch_upload.py
response_file = "./sample_data_20241219190049.jsonl"
response_file = "./sample_data_20250102103047.jsonl"
with open(response_file, "r") as f:
responses = [json.loads(line) for line in f]

Expand All @@ -36,7 +36,7 @@ def process_response(response):
request_id = response["requestId"]
try:
markdown = ap.batches.retrieve(request_id)
if markdown:
if markdown: # TODO: add status check here
response["result"] = [markdown.result[0] if markdown.result else ""]
response["requestStatus"] = "COMPLETED"
response["completionTime"] = markdown.completionTime
Expand Down
Loading

0 comments on commit a70dc0c

Please sign in to comment.