Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: address issues in code review #72

Merged
merged 1 commit into from
Dec 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ markdown = ap.batches.retrieve(request_id)
```

> ⚠️ **Note:** Batch extraction is currently in beta testing. Processing may take up to 12 hours to complete.
>
> ⚠️ **Important:** API keys generated from cambioml.com do not automatically have batch processing permissions. Please contact [email protected] to request batch processing access for your API key.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice!


## :scroll: Examples
Check out these examples to see how you can utilize **AnyParser** to extract text, numbers, and symbols in fewer than 10 lines of code!
Expand Down
14 changes: 8 additions & 6 deletions any_parser/any_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,7 @@
from any_parser.utils import validate_file_inputs

PUBLIC_SHARED_BASE_URL = "https://public-api.cambio-ai.com"
# TODO: Update this to the correct batch endpoint
PUBLIC_BATCH_BASE_URL = (
"http://AnyPar-ApiCo-cuKOBXasmUF1-1986145995.us-west-2.elb.amazonaws.com"
)
PUBLIC_BATCH_BASE_URL = "http://batch-api.cambio-ai.com"
TIMEOUT = 60


Expand Down Expand Up @@ -123,7 +120,12 @@ class AnyParser:
extracting information from different types of files.
"""

def __init__(self, api_key: str, base_url: str = PUBLIC_SHARED_BASE_URL) -> None:
def __init__(
self,
api_key: str,
base_url: str = PUBLIC_SHARED_BASE_URL,
batch_url: str = PUBLIC_BATCH_BASE_URL,
) -> None:
"""Initialize AnyParser with API credentials.

Args:
Expand All @@ -138,7 +140,7 @@ def __init__(self, api_key: str, base_url: str = PUBLIC_SHARED_BASE_URL) -> None
)
self._sync_extract_pii = ExtractPIISyncParser(api_key, base_url)
self._sync_extract_tables = ExtractTablesSyncParser(api_key, base_url)
self.batches = BatchParser(api_key, PUBLIC_BATCH_BASE_URL)
self.batches = BatchParser(api_key, batch_url)

@handle_file_processing
def parse(
Expand Down
17 changes: 16 additions & 1 deletion any_parser/batch_parser.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Batch parser implementation."""

import os
from typing import List, Optional

import requests
Expand All @@ -11,17 +12,29 @@


class UploadResponse(BaseModel):
"""
Response from the batch upload endpoint.
"""

fileName: str
requestId: str
requestStatus: str


class UsageResponse(BaseModel):
"""
Response from the batch usage endpoint.
"""

pageLimit: int
pageRemaining: int


class FileStatusResponse(BaseModel):
"""
Response from the batch file status endpoint.
"""

fileName: str
fileType: str
requestId: str
Expand Down Expand Up @@ -51,6 +64,9 @@ def create(self, file_path: str) -> UploadResponse:
Returns:
UploadResponse object containing upload details
"""
if not os.path.isfile(file_path):
raise FileNotFoundError(f"The file path '{file_path}' does not exist.")

with open(file_path, "rb") as f:
files = {"file": f}
response = requests.post(
Expand All @@ -59,7 +75,6 @@ def create(self, file_path: str) -> UploadResponse:
files=files,
timeout=TIMEOUT,
)
print(response.json())

if response.status_code != 200:
raise Exception(f"Upload failed: {response.text}")
Expand Down
Loading