Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add instruction extraction with sync and async example notebooks #51

Merged
merged 7 commits into from
Oct 22, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions any_parser/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
"""AnyParser module for parsing data."""

from any_parser.any_parser import ModelType # Import ModelType here
from any_parser.any_parser import AnyParser
from any_parser.any_parser import AnyParser, ModelType

__all__ = ["AnyParser", "ModelType"] # Add ModelType to __all__
__all__ = ["AnyParser", "ModelType"]

__version__ = "0.0.16"
216 changes: 170 additions & 46 deletions any_parser/any_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"png",
"gif",
]
RESULT_TYPES = ["markdown", "json"]


class ModelType(Enum):
Expand Down Expand Up @@ -50,7 +51,8 @@ def __init__(self, api_key: str, base_url: str = PUBLIC_SHARED_BASE_URL) -> None
Returns:
None
"""
self._sync_url = f"{base_url}/extract"
self._sync_extract_url = f"{base_url}/extract"
self._sync_json_url = f"{base_url}/json/extract"
self._sync_refined_url = f"{base_url}/refined_extract"
self._async_upload_url = f"{base_url}/async/upload"
self._async_fetch_url = f"{base_url}/async/fetch"
Expand All @@ -76,19 +78,15 @@ def extract(
"""
file_extension = Path(file_path).suffix.lower().lstrip(".")

# Check if the file exists
if not Path(file_path).is_file():
return f"Error: File does not exist: {file_path}", None
# Check if the file exists and file_type
error = self._check_file_type_and_path(file_path, file_extension)

# Check for valid file extension
if file_extension not in SUPPORTED_FILE_EXTENSIONS:
supported_types = ", ".join(SUPPORTED_FILE_EXTENSIONS)
return (
f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}.",
None,
)
if error:
return error, None

self._check_model(model)
error = self._check_model(model)
if error:
return error, None

# Encode the file content in base64
with open(file_path, "rb") as file:
Expand All @@ -104,11 +102,13 @@ def extract(
payload["extract_args"] = extract_args

if model == ModelType.BASE:
url = self._sync_url
url = self._sync_extract_url
elif model == ModelType.PRO or model == ModelType.ULTRA:
url = self._sync_refined_url
if model == ModelType.PRO:
payload["quick_mode"] = True
else:
return "Error: Invalid model type", None

# Send the POST request
start_time = time.time()
Expand Down Expand Up @@ -137,6 +137,62 @@ def extract(
else:
return f"Error: {response.status_code} {response.text}", None

def extract_json(
    self,
    file_path: str,
    extract_instruction: Dict,
) -> Tuple[str, str]:
    """Extract values for user-specified keys from a file in real time.

    The file is base64-encoded and posted to the synchronous JSON
    extraction endpoint together with ``extract_instruction``, which
    tells the service which keys to pull out of the document.

    Args:
        file_path (str): The path to the file to be parsed.
        extract_instruction (Dict): A dictionary containing the keys to be
            extracted, with their values as the description of those keys.

    Returns:
        tuple(str, str): The extracted data and the time taken on success;
            an error message and None on failure.
    """
    file_extension = Path(file_path).suffix.lower().lstrip(".")

    # Check if the file exists and the extension is supported.
    error = self._check_file_type_and_path(file_path, file_extension)
    if error:
        return error, None

    # Encode the file content in base64 so it can travel in a JSON payload.
    with open(file_path, "rb") as file:
        encoded_file = base64.b64encode(file.read()).decode("utf-8")

    # Create the JSON payload
    payload = {
        "file_content": encoded_file,
        "file_type": file_extension,
        "instruction_args": {"extract_instruction": extract_instruction},
    }

    # Send the POST request and time the round trip.
    start_time = time.time()
    response = requests.post(
        self._sync_json_url,
        headers=self._headers,
        data=json.dumps(payload),
        timeout=TIMEOUT,
    )
    end_time = time.time()

    # Check if the request was successful
    if response.status_code == 200:
        try:
            response_data = response.json()
            # Guard the field access: a 200 body without a "json" field
            # previously raised an uncaught KeyError.
            result = response_data["json"]
            return (
                result,
                f"Time Elapsed: {end_time - start_time:.2f} seconds",
            )
        except json.JSONDecodeError:
            return f"Error: Invalid JSON response: {response.text}", None
        except KeyError:
            return f"Error: Invalid JSON response: {response.text}", None
    else:
        return f"Error: {response.status_code} {response.text}", None

def async_extract(
self,
file_path: str,
Expand All @@ -153,16 +209,15 @@ def async_extract(
"""
file_extension = Path(file_path).suffix.lower().lstrip(".")

# Check if the file exists
if not Path(file_path).is_file():
return f"Error: File does not exist: {file_path}"
# Check if the file exists and file_type
error = self._check_file_type_and_path(file_path, file_extension)

# Check for valid file extension
if file_extension not in SUPPORTED_FILE_EXTENSIONS:
supported_types = ", ".join(SUPPORTED_FILE_EXTENSIONS)
return f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}."
if error:
return error, None

self._check_model(model)
error = self._check_model(model)
if error:
return error, None

file_name = Path(file_path).name

Expand All @@ -172,6 +227,8 @@ def async_extract(
process_type = ProcessType.FILE_REFINED_QUICK
elif model == ModelType.ULTRA:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you remove ULTRA?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed in latest commit

process_type = ProcessType.FILE_REFINED
else:
return "Error: Invalid model type", None

# Create the JSON payload
payload = {
Expand All @@ -190,33 +247,58 @@ def async_extract(
timeout=TIMEOUT,
)

# Check if the request was successful
if response.status_code == 200:
try:
file_id = response.json().get("fileId")
presigned_url = response.json().get("presignedUrl")
with open(file_path, "rb") as file_to_upload:
files = {"file": (file_path, file_to_upload)}
upload_resp = requests.post(
presigned_url["url"],
data=presigned_url["fields"],
files=files,
timeout=TIMEOUT,
)
if upload_resp.status_code != 204:
return f"Error: {upload_resp.status_code} {upload_resp.text}"
return file_id
except json.JSONDecodeError:
return "Error: Invalid JSON response"
else:
return f"Error: {response.status_code} {response.text}"
# If response successful, upload the file
return self._upload_file_to_presigned_url(file_path, response)

def async_extract_json(
    self,
    file_path: str,
    extract_instruction: Dict,
) -> str:
    """Asynchronously extract values for user-specified keys from a file.

    Requests an upload slot for asynchronous "json" processing and uploads
    the file to the returned presigned URL. The extraction result is
    retrieved later via ``async_fetch`` using the returned file id.

    Args:
        file_path (str): The path to the file to be parsed.
        extract_instruction (Dict): A dictionary containing the keys to be
            extracted, with their values as the description of those keys.

    Returns:
        str: The file id of the uploaded file, or an error message.
    """
    file_extension = Path(file_path).suffix.lower().lstrip(".")

    # Check if the file exists and the extension is supported.
    error = self._check_file_type_and_path(file_path, file_extension)
    if error:
        # Bug fix: this function returns a plain string, not a tuple.
        # The previous `return error, None` leaked a tuple to callers
        # that expect a file-id/error string.
        return error

    file_name = Path(file_path).name

    # Create the JSON payload
    payload = {
        "file_name": file_name,
        "process_type": "json",
        "extract_args": {"extract_instruction": extract_instruction},
    }

    # Send the POST request
    response = requests.post(
        self._async_upload_url,
        headers=self._headers,
        data=json.dumps(payload),
        timeout=TIMEOUT,
    )

    # If response successful, upload the file
    return self._upload_file_to_presigned_url(file_path, response)

def async_fetch(
self,
file_id: str,
sync: bool = True,
sync_timeout: int = 60,
sync_interval: int = 5,
result_type: str = "markdown",
) -> str:
"""Fetches extraction results asynchronously.

Expand All @@ -225,11 +307,14 @@ def async_fetch(
sync (bool, optional): Whether to wait for the results synchronously.
sync_timeout (int, optional): Maximum time to wait for results in seconds. Defaults to 60.
sync_interval (int, optional): Time interval between polling attempts in seconds. Defaults to 5.
result_type (string, optional): The type of result to fetch. Defaults to `markdown`.

Returns:
str: The extracted results as a markdown string.
None: If the extraction is still in progress (when sync is False).
"""
self._check_result_type(result_type)

response = None
# Create the JSON payload
payload = {"file_id": file_id}
Expand Down Expand Up @@ -258,15 +343,54 @@ def async_fetch(
if response is None:
return "Error: timeout, no response received"
if response.status_code == 200:
markdown_list = response.json()["markdown"]
return "\n".join(markdown_list)
if result_type == "json":
return response.json()["json"]
else:
print(response.json())
markdown_list = response.json()["markdown"]
return "\n".join(markdown_list)
if response.status_code == 202:
return None
return f"Error: {response.status_code} {response.text}"

def _upload_file_to_presigned_url(
    self, file_path: str, response: requests.Response
) -> str:
    """Upload a local file to the presigned URL returned by the upload endpoint.

    Args:
        file_path (str): Path of the file to upload.
        response (requests.Response): Response from the async upload request;
            expected to carry ``fileId`` and ``presignedUrl`` on success.

    Returns:
        str: The file id on success, or an error message string.
    """
    if response.status_code != 200:
        return f"Error: {response.status_code} {response.text}"
    try:
        # Parse the body once instead of calling response.json() per field.
        response_data = response.json()
    except json.JSONDecodeError:
        return "Error: Invalid JSON response"
    file_id = response_data.get("fileId")
    presigned_url = response_data.get("presignedUrl")
    if not file_id or not presigned_url:
        # Guard against a 200 response missing the expected fields, which
        # previously raised TypeError when subscripting None below.
        return "Error: Invalid JSON response"
    with open(file_path, "rb") as file_to_upload:
        files = {"file": (file_path, file_to_upload)}
        upload_resp = requests.post(
            presigned_url["url"],
            data=presigned_url["fields"],
            files=files,
            timeout=TIMEOUT,
        )
    if upload_resp.status_code != 204:
        return f"Error: {upload_resp.status_code} {upload_resp.text}"
    return file_id

def _check_model(self, model: ModelType) -> "str | None":
    """Validate that `model` is a supported ModelType.

    Args:
        model (ModelType): The model type supplied by the caller.

    Returns:
        An error message string if `model` is invalid, otherwise None.
    """
    if model not in {ModelType.BASE, ModelType.PRO, ModelType.ULTRA}:
        # Rename the loop variable: it previously shadowed the `model` parameter.
        valid_models = ", ".join(["`" + m.value + "`" for m in ModelType])
        return f"Invalid model type: {model}. Supported `model` types include {valid_models}."
    return None

def _check_file_type_and_path(self, file_path: str, file_extension: str) -> "str | None":
    """Validate that the file exists and that its extension is supported.

    Args:
        file_path: Path to the candidate file.
        file_extension: Lower-cased extension without the leading dot.

    Returns:
        An error message string on failure, otherwise None (implicit).
    """
    # Check if the file exists
    if not Path(file_path).is_file():
        return f"Error: File does not exist: {file_path}"

    # Reject extensions outside the module-level allow-list.
    if file_extension not in SUPPORTED_FILE_EXTENSIONS:
        supported_types = ", ".join(SUPPORTED_FILE_EXTENSIONS)
        return f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}."

def _check_result_type(self, result_type: str) -> "str | None":
    """Validate that `result_type` is one of the supported RESULT_TYPES.

    Args:
        result_type (str): Requested result format (e.g. "markdown" or "json").

    Returns:
        An error message string if `result_type` is invalid, otherwise None.
    """
    # Annotation fixed: this helper returns a string on failure, not None-only.
    if result_type not in RESULT_TYPES:
        valid_result_types = ", ".join(RESULT_TYPES)
        return f"Invalid result type: {result_type}. Supported `result_type` types include {valid_result_types}."
    return None
Loading
Loading