-
Notifications
You must be signed in to change notification settings - Fork 6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: add instruction extraction with sync and async example notebooks #51
Changes from 5 commits
e9168a6
582ddf6
6cb2fdb
9de5fd4
8a62433
53002f3
6d5347c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,7 @@ | ||
"""AnyParser module for parsing data.""" | ||
|
||
from any_parser.any_parser import ModelType # Import ModelType here | ||
from any_parser.any_parser import AnyParser | ||
from any_parser.any_parser import AnyParser, ModelType | ||
|
||
__all__ = ["AnyParser", "ModelType"] # Add ModelType to __all__ | ||
__all__ = ["AnyParser", "ModelType"] | ||
|
||
__version__ = "0.0.16" |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,12 +22,12 @@ | |
"png", | ||
"gif", | ||
] | ||
RESULT_TYPES = ["markdown", "json"] | ||
|
||
|
||
class ModelType(Enum): | ||
BASE = "base" | ||
PRO = "pro" | ||
ULTRA = "ultra" | ||
|
||
|
||
class ProcessType(Enum): | ||
|
@@ -50,8 +50,9 @@ def __init__(self, api_key: str, base_url: str = PUBLIC_SHARED_BASE_URL) -> None | |
Returns: | ||
None | ||
""" | ||
self._sync_url = f"{base_url}/extract" | ||
self._sync_refined_url = f"{base_url}/refined_extract" | ||
self._sync_extract_url = f"{base_url}/extract" | ||
self._sync_json_url = f"{base_url}/json/extract" | ||
self._sync_refined_url = f"{base_url}/refined_parse" | ||
self._async_upload_url = f"{base_url}/async/upload" | ||
self._async_fetch_url = f"{base_url}/async/fetch" | ||
self._api_key = api_key | ||
|
@@ -76,19 +77,15 @@ def extract( | |
""" | ||
file_extension = Path(file_path).suffix.lower().lstrip(".") | ||
|
||
# Check if the file exists | ||
if not Path(file_path).is_file(): | ||
return f"Error: File does not exist: {file_path}", None | ||
# Check if the file exists and file_type | ||
error = self._check_file_type_and_path(file_path, file_extension) | ||
|
||
# Check for valid file extension | ||
if file_extension not in SUPPORTED_FILE_EXTENSIONS: | ||
supported_types = ", ".join(SUPPORTED_FILE_EXTENSIONS) | ||
return ( | ||
f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}.", | ||
None, | ||
) | ||
if error: | ||
return error, None | ||
|
||
self._check_model(model) | ||
error = self._check_model(model) | ||
if error: | ||
return error, None | ||
|
||
# Encode the file content in base64 | ||
with open(file_path, "rb") as file: | ||
|
@@ -104,11 +101,13 @@ def extract( | |
payload["extract_args"] = extract_args | ||
|
||
if model == ModelType.BASE: | ||
url = self._sync_url | ||
elif model == ModelType.PRO or model == ModelType.ULTRA: | ||
url = self._sync_extract_url | ||
elif model == ModelType.PRO: | ||
url = self._sync_refined_url | ||
if model == ModelType.PRO: | ||
payload["quick_mode"] = True | ||
else: | ||
return "Error: Invalid model type", None | ||
|
||
# Send the POST request | ||
start_time = time.time() | ||
|
@@ -137,6 +136,62 @@ def extract( | |
else: | ||
return f"Error: {response.status_code} {response.text}", None | ||
|
||
def extract_json( | ||
self, | ||
file_path: str, | ||
extract_instruction: Dict, | ||
) -> Tuple[str, str]: | ||
"""Extract json in real-time. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This looks like a GenAI low quality dosstring, you should not just trust GenAI auto completely output, but given model models about extract what json based on what? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated with latest commit |
||
|
||
Args: | ||
file_path (str): The path to the file to be parsed. | ||
extract_instruction (Dict): A dictionary containing the keys to be extracted, | ||
with their values as the description of those keys. | ||
Returns: | ||
tuple(str, str): The extracted data and the time taken. | ||
""" | ||
file_extension = Path(file_path).suffix.lower().lstrip(".") | ||
|
||
# Check if the file exists and file_type | ||
error = self._check_file_type_and_path(file_path, file_extension) | ||
if error: | ||
return error, None | ||
|
||
# Encode the file content in base64 | ||
with open(file_path, "rb") as file: | ||
encoded_file = base64.b64encode(file.read()).decode("utf-8") | ||
|
||
# Create the JSON payload | ||
payload = { | ||
"file_content": encoded_file, | ||
"file_type": file_extension, | ||
"instruction_args": {"extract_instruction": extract_instruction}, | ||
} | ||
|
||
# Send the POST request | ||
start_time = time.time() | ||
response = requests.post( | ||
self._sync_json_url, | ||
headers=self._headers, | ||
data=json.dumps(payload), | ||
timeout=TIMEOUT, | ||
) | ||
end_time = time.time() | ||
|
||
# Check if the request was successful | ||
if response.status_code == 200: | ||
try: | ||
response_data = response.json() | ||
result = response_data["json"] | ||
return ( | ||
result, | ||
f"Time Elapsed: {end_time - start_time:.2f} seconds", | ||
) | ||
except json.JSONDecodeError: | ||
return f"Error: Invalid JSON response: {response.text}", None | ||
else: | ||
return f"Error: {response.status_code} {response.text}", None | ||
|
||
def async_extract( | ||
self, | ||
file_path: str, | ||
|
@@ -153,25 +208,24 @@ def async_extract( | |
""" | ||
file_extension = Path(file_path).suffix.lower().lstrip(".") | ||
|
||
# Check if the file exists | ||
if not Path(file_path).is_file(): | ||
return f"Error: File does not exist: {file_path}" | ||
# Check if the file exists and file_type | ||
error = self._check_file_type_and_path(file_path, file_extension) | ||
|
||
# Check for valid file extension | ||
if file_extension not in SUPPORTED_FILE_EXTENSIONS: | ||
supported_types = ", ".join(SUPPORTED_FILE_EXTENSIONS) | ||
return f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}." | ||
if error: | ||
return error, None | ||
|
||
self._check_model(model) | ||
error = self._check_model(model) | ||
if error: | ||
return error, None | ||
|
||
file_name = Path(file_path).name | ||
|
||
if model == ModelType.BASE: | ||
process_type = ProcessType.FILE | ||
elif model == ModelType.PRO: | ||
process_type = ProcessType.FILE_REFINED_QUICK | ||
elif model == ModelType.ULTRA: | ||
process_type = ProcessType.FILE_REFINED | ||
else: | ||
return "Error: Invalid model type", None | ||
|
||
# Create the JSON payload | ||
payload = { | ||
|
@@ -190,33 +244,58 @@ def async_extract( | |
timeout=TIMEOUT, | ||
) | ||
|
||
# Check if the request was successful | ||
if response.status_code == 200: | ||
try: | ||
file_id = response.json().get("fileId") | ||
presigned_url = response.json().get("presignedUrl") | ||
with open(file_path, "rb") as file_to_upload: | ||
files = {"file": (file_path, file_to_upload)} | ||
upload_resp = requests.post( | ||
presigned_url["url"], | ||
data=presigned_url["fields"], | ||
files=files, | ||
timeout=TIMEOUT, | ||
) | ||
if upload_resp.status_code != 204: | ||
return f"Error: {upload_resp.status_code} {upload_resp.text}" | ||
return file_id | ||
except json.JSONDecodeError: | ||
return "Error: Invalid JSON response" | ||
else: | ||
return f"Error: {response.status_code} {response.text}" | ||
# If response successful, upload the file | ||
return self._upload_file_to_presigned_url(file_path, response) | ||
|
||
def async_extract_json( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. updated to |
||
self, | ||
file_path: str, | ||
extract_instruction: Dict, | ||
) -> str: | ||
"""Extract data asynchronously. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same. This what does extract data means in the docstring. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated with latest commit |
||
|
||
Args: | ||
file_path (str): The path to the file to be parsed. | ||
extract_instruction (Dict): A dictionary containing the keys to be extracted, | ||
with their values as the description of those keys. | ||
Returns: | ||
str: The file id of the uploaded file. | ||
""" | ||
file_extension = Path(file_path).suffix.lower().lstrip(".") | ||
|
||
# Check if the file exists and file_type | ||
error = self._check_file_type_and_path(file_path, file_extension) | ||
|
||
if error: | ||
return error, None | ||
|
||
file_name = Path(file_path).name | ||
|
||
# Create the JSON payload | ||
payload = { | ||
"file_name": file_name, | ||
"process_type": "json", | ||
"extract_args": {"extract_instruction": extract_instruction}, | ||
} | ||
|
||
# Send the POST request | ||
response = requests.post( | ||
self._async_upload_url, | ||
headers=self._headers, | ||
data=json.dumps(payload), | ||
timeout=TIMEOUT, | ||
) | ||
|
||
# If response successful, upload the file | ||
return self._upload_file_to_presigned_url(file_path, response) | ||
|
||
def async_fetch( | ||
self, | ||
file_id: str, | ||
sync: bool = True, | ||
sync_timeout: int = 60, | ||
sync_interval: int = 5, | ||
result_type: str = "markdown", | ||
) -> str: | ||
"""Fetches extraction results asynchronously. | ||
|
||
|
@@ -225,11 +304,14 @@ def async_fetch( | |
sync (bool, optional): Whether to wait for the results synchronously. | ||
sync_timeout (int, optional): Maximum time to wait for results in seconds. Defaults to 60. | ||
sync_interval (int, optional): Time interval between polling attempts in seconds. Defaults to 5. | ||
result_type (string, optional): The type of result to fetch. Defaults to `markdown`. | ||
|
||
Returns: | ||
str: The extracted results as a markdown string. | ||
None: If the extraction is still in progress (when sync is False). | ||
""" | ||
self._check_result_type(result_type) | ||
|
||
response = None | ||
# Create the JSON payload | ||
payload = {"file_id": file_id} | ||
|
@@ -258,15 +340,53 @@ def async_fetch( | |
if response is None: | ||
return "Error: timeout, no response received" | ||
if response.status_code == 200: | ||
markdown_list = response.json()["markdown"] | ||
return "\n".join(markdown_list) | ||
if result_type == "json": | ||
return response.json()["json"] | ||
else: | ||
markdown_list = response.json()["markdown"] | ||
return "\n".join(markdown_list) | ||
if response.status_code == 202: | ||
return None | ||
return f"Error: {response.status_code} {response.text}" | ||
|
||
def _upload_file_to_presigned_url( | ||
self, file_path: str, response: requests.Response | ||
) -> str: | ||
if response.status_code == 200: | ||
try: | ||
file_id = response.json().get("fileId") | ||
presigned_url = response.json().get("presignedUrl") | ||
with open(file_path, "rb") as file_to_upload: | ||
files = {"file": (file_path, file_to_upload)} | ||
upload_resp = requests.post( | ||
presigned_url["url"], | ||
data=presigned_url["fields"], | ||
files=files, | ||
timeout=TIMEOUT, | ||
) | ||
if upload_resp.status_code != 204: | ||
return f"Error: {upload_resp.status_code} {upload_resp.text}" | ||
return file_id | ||
except json.JSONDecodeError: | ||
return "Error: Invalid JSON response" | ||
else: | ||
return f"Error: {response.status_code} {response.text}" | ||
|
||
def _check_model(self, model: ModelType) -> None: | ||
if model not in {ModelType.BASE, ModelType.PRO, ModelType.ULTRA}: | ||
if model not in {ModelType.BASE, ModelType.PRO}: | ||
valid_models = ", ".join(["`" + model.value + "`" for model in ModelType]) | ||
raise ValueError( | ||
f"Invalid model type: {model}. Supported `model` types include {valid_models}." | ||
) | ||
return f"Invalid model type: {model}. Supported `model` types include {valid_models}." | ||
|
||
def _check_file_type_and_path(self, file_path, file_extension): | ||
# Check if the file exists | ||
if not Path(file_path).is_file(): | ||
return f"Error: File does not exist: {file_path}" | ||
|
||
if file_extension not in SUPPORTED_FILE_EXTENSIONS: | ||
supported_types = ", ".join(SUPPORTED_FILE_EXTENSIONS) | ||
return f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}." | ||
|
||
def _check_result_type(self, result_type: str) -> None: | ||
if result_type not in RESULT_TYPES: | ||
valid_result_types = ", ".join(RESULT_TYPES) | ||
return f"Invalid result type: {result_type}. Supported `result_type` types include {valid_result_types}." | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Refactor these helpers into a utils.py to improve readability; then there is no need for the leading underscore. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. moved to utils.py in latest commit |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks to me like a very bad name. In particular, extract requires no prompt while extract_json requires a prompt; then, suddenly, you start to return JSON. Logically, I do not see why adding an extract_instruction turns this extract into extract_json. You should reconsider this name.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updated to
extract_key_value