Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add instruction extraction with sync and async example notebooks #51

Merged
merged 7 commits into from
Oct 22, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions any_parser/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
"""AnyParser module for parsing data."""

from any_parser.any_parser import ModelType # Import ModelType here
from any_parser.any_parser import AnyParser
from any_parser.any_parser import AnyParser, ModelType

__all__ = ["AnyParser", "ModelType"] # Add ModelType to __all__
__all__ = ["AnyParser", "ModelType"]

__version__ = "0.0.16"
216 changes: 170 additions & 46 deletions any_parser/any_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"png",
"gif",
]
RESULT_TYPES = ["markdown", "json"]


class ModelType(Enum):
Expand Down Expand Up @@ -50,7 +51,8 @@ def __init__(self, api_key: str, base_url: str = PUBLIC_SHARED_BASE_URL) -> None
Returns:
None
"""
self._sync_url = f"{base_url}/extract"
self._sync_extract_url = f"{base_url}/extract"
self._sync_json_url = f"{base_url}/json/extract"
self._sync_refined_url = f"{base_url}/refined_extract"
self._async_upload_url = f"{base_url}/async/upload"
self._async_fetch_url = f"{base_url}/async/fetch"
Expand All @@ -76,19 +78,15 @@ def extract(
"""
file_extension = Path(file_path).suffix.lower().lstrip(".")

# Check if the file exists
if not Path(file_path).is_file():
return f"Error: File does not exist: {file_path}", None
# Check if the file exists and file_type
error = self._check_file_type_and_path(file_path, file_extension)

# Check for valid file extension
if file_extension not in SUPPORTED_FILE_EXTENSIONS:
supported_types = ", ".join(SUPPORTED_FILE_EXTENSIONS)
return (
f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}.",
None,
)
if error:
return error, None

self._check_model(model)
error = self._check_model(model)
if error:
return error, None

# Encode the file content in base64
with open(file_path, "rb") as file:
Expand All @@ -104,11 +102,13 @@ def extract(
payload["extract_args"] = extract_args

if model == ModelType.BASE:
url = self._sync_url
url = self._sync_extract_url
elif model == ModelType.PRO or model == ModelType.ULTRA:
url = self._sync_refined_url
if model == ModelType.PRO:
payload["quick_mode"] = True
else:
return "Error: Invalid model type", None

# Send the POST request
start_time = time.time()
Expand Down Expand Up @@ -137,6 +137,62 @@ def extract(
else:
return f"Error: {response.status_code} {response.text}", None

def extract_json(
    self,
    file_path: str,
    extract_instruction: Dict,
) -> Tuple[str, str]:
    """Extract values for user-specified keys from a file in real time.

    The file is base64-encoded and posted to the synchronous JSON
    extraction endpoint together with ``extract_instruction``, which
    tells the service which keys to pull out of the document.

    Args:
        file_path (str): The path to the file to be parsed.
        extract_instruction (Dict): A dictionary containing the keys to be
            extracted, with their values as the description of those keys.

    Returns:
        tuple(str, str): The extracted data and the time taken on success;
            an error message and None on failure.
    """
    file_extension = Path(file_path).suffix.lower().lstrip(".")

    # Check if the file exists and the extension is supported.
    error = self._check_file_type_and_path(file_path, file_extension)
    if error:
        return error, None

    # Encode the file content in base64 so it can travel in a JSON payload.
    with open(file_path, "rb") as file:
        encoded_file = base64.b64encode(file.read()).decode("utf-8")

    # Create the JSON payload
    payload = {
        "file_content": encoded_file,
        "file_type": file_extension,
        "instruction_args": {"extract_instruction": extract_instruction},
    }

    # Send the POST request and time the round trip.
    start_time = time.time()
    response = requests.post(
        self._sync_json_url,
        headers=self._headers,
        data=json.dumps(payload),
        timeout=TIMEOUT,
    )
    end_time = time.time()

    # Check if the request was successful
    if response.status_code == 200:
        try:
            response_data = response.json()
            # Guard the field access: a 200 body without a "json" field
            # previously raised an uncaught KeyError.
            result = response_data["json"]
            return (
                result,
                f"Time Elapsed: {end_time - start_time:.2f} seconds",
            )
        except json.JSONDecodeError:
            return f"Error: Invalid JSON response: {response.text}", None
        except KeyError:
            return f"Error: Invalid JSON response: {response.text}", None
    else:
        return f"Error: {response.status_code} {response.text}", None

def async_extract(
self,
file_path: str,
Expand All @@ -153,16 +209,15 @@ def async_extract(
"""
file_extension = Path(file_path).suffix.lower().lstrip(".")

# Check if the file exists
if not Path(file_path).is_file():
return f"Error: File does not exist: {file_path}"
# Check if the file exists and file_type
error = self._check_file_type_and_path(file_path, file_extension)

# Check for valid file extension
if file_extension not in SUPPORTED_FILE_EXTENSIONS:
supported_types = ", ".join(SUPPORTED_FILE_EXTENSIONS)
return f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}."
if error:
return error, None

self._check_model(model)
error = self._check_model(model)
if error:
return error, None

file_name = Path(file_path).name

Expand All @@ -172,6 +227,8 @@ def async_extract(
process_type = ProcessType.FILE_REFINED_QUICK
elif model == ModelType.ULTRA:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you remove ULTRA?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed in latest commit

process_type = ProcessType.FILE_REFINED
else:
return "Error: Invalid model type", None

# Create the JSON payload
payload = {
Expand All @@ -190,33 +247,58 @@ def async_extract(
timeout=TIMEOUT,
)

# Check if the request was successful
if response.status_code == 200:
try:
file_id = response.json().get("fileId")
presigned_url = response.json().get("presignedUrl")
with open(file_path, "rb") as file_to_upload:
files = {"file": (file_path, file_to_upload)}
upload_resp = requests.post(
presigned_url["url"],
data=presigned_url["fields"],
files=files,
timeout=TIMEOUT,
)
if upload_resp.status_code != 204:
return f"Error: {upload_resp.status_code} {upload_resp.text}"
return file_id
except json.JSONDecodeError:
return "Error: Invalid JSON response"
else:
return f"Error: {response.status_code} {response.text}"
# If response successful, upload the file
return self._upload_file_to_presigned_url(file_path, response)

def async_extract_json(
    self,
    file_path: str,
    extract_instruction: Dict,
) -> str:
    """Asynchronously extract values for user-specified keys from a file.

    Requests an upload slot for asynchronous "json" processing and uploads
    the file to the returned presigned URL. The extraction result is
    retrieved later via ``async_fetch`` using the returned file id.

    Args:
        file_path (str): The path to the file to be parsed.
        extract_instruction (Dict): A dictionary containing the keys to be
            extracted, with their values as the description of those keys.

    Returns:
        str: The file id of the uploaded file, or an error message.
    """
    file_extension = Path(file_path).suffix.lower().lstrip(".")

    # Check if the file exists and the extension is supported.
    error = self._check_file_type_and_path(file_path, file_extension)
    if error:
        # Bug fix: this function returns a plain string, not a tuple.
        # The previous `return error, None` leaked a tuple to callers
        # that expect a file-id/error string.
        return error

    file_name = Path(file_path).name

    # Create the JSON payload
    payload = {
        "file_name": file_name,
        "process_type": "json",
        "extract_args": {"extract_instruction": extract_instruction},
    }

    # Send the POST request
    response = requests.post(
        self._async_upload_url,
        headers=self._headers,
        data=json.dumps(payload),
        timeout=TIMEOUT,
    )

    # If response successful, upload the file
    return self._upload_file_to_presigned_url(file_path, response)

def async_fetch(
self,
file_id: str,
sync: bool = True,
sync_timeout: int = 60,
sync_interval: int = 5,
result_type: str = "markdown",
) -> str:
"""Fetches extraction results asynchronously.

Expand All @@ -225,11 +307,14 @@ def async_fetch(
sync (bool, optional): Whether to wait for the results synchronously.
sync_timeout (int, optional): Maximum time to wait for results in seconds. Defaults to 60.
sync_interval (int, optional): Time interval between polling attempts in seconds. Defaults to 5.
result_type (string, optional): The type of result to fetch. Defaults to `markdown`.

Returns:
str: The extracted results as a markdown string.
None: If the extraction is still in progress (when sync is False).
"""
self._check_result_type(result_type)

response = None
# Create the JSON payload
payload = {"file_id": file_id}
Expand Down Expand Up @@ -258,15 +343,54 @@ def async_fetch(
if response is None:
return "Error: timeout, no response received"
if response.status_code == 200:
markdown_list = response.json()["markdown"]
return "\n".join(markdown_list)
if result_type == "json":
return response.json()["json"]
else:
print(response.json())
markdown_list = response.json()["markdown"]
return "\n".join(markdown_list)
if response.status_code == 202:
return None
return f"Error: {response.status_code} {response.text}"

def _upload_file_to_presigned_url(
    self, file_path: str, response: requests.Response
) -> str:
    """Upload a local file to the presigned URL returned by the upload endpoint.

    Args:
        file_path (str): Path of the file to upload.
        response (requests.Response): Response from the async upload request;
            expected to carry ``fileId`` and ``presignedUrl`` on success.

    Returns:
        str: The file id on success, or an error message string.
    """
    if response.status_code != 200:
        return f"Error: {response.status_code} {response.text}"
    try:
        # Parse the body once instead of calling response.json() per field.
        response_data = response.json()
    except json.JSONDecodeError:
        return "Error: Invalid JSON response"
    file_id = response_data.get("fileId")
    presigned_url = response_data.get("presignedUrl")
    if not file_id or not presigned_url:
        # Guard against a 200 response missing the expected fields, which
        # previously raised TypeError when subscripting None below.
        return "Error: Invalid JSON response"
    with open(file_path, "rb") as file_to_upload:
        files = {"file": (file_path, file_to_upload)}
        upload_resp = requests.post(
            presigned_url["url"],
            data=presigned_url["fields"],
            files=files,
            timeout=TIMEOUT,
        )
    if upload_resp.status_code != 204:
        return f"Error: {upload_resp.status_code} {upload_resp.text}"
    return file_id

def _check_model(self, model: ModelType) -> "str | None":
    """Validate that `model` is a supported ModelType.

    Args:
        model (ModelType): The model type supplied by the caller.

    Returns:
        An error message string if `model` is invalid, otherwise None.
    """
    if model not in {ModelType.BASE, ModelType.PRO, ModelType.ULTRA}:
        # Rename the loop variable: it previously shadowed the `model` parameter.
        valid_models = ", ".join(["`" + m.value + "`" for m in ModelType])
        return f"Invalid model type: {model}. Supported `model` types include {valid_models}."
    return None

def _check_file_type_and_path(self, file_path: str, file_extension: str) -> "str | None":
    """Validate that the file exists and that its extension is supported.

    Args:
        file_path: Path to the candidate file.
        file_extension: Lower-cased extension without the leading dot.

    Returns:
        An error message string on failure, otherwise None (implicit).
    """
    # Check if the file exists
    if not Path(file_path).is_file():
        return f"Error: File does not exist: {file_path}"

    # Reject extensions outside the module-level allow-list.
    if file_extension not in SUPPORTED_FILE_EXTENSIONS:
        supported_types = ", ".join(SUPPORTED_FILE_EXTENSIONS)
        return f"Error: Unsupported file type: {file_extension}. Supported file types include {supported_types}."

def _check_result_type(self, result_type: str) -> "str | None":
    """Validate that `result_type` is one of the supported RESULT_TYPES.

    Args:
        result_type (str): Requested result format (e.g. "markdown" or "json").

    Returns:
        An error message string if `result_type` is invalid, otherwise None.
    """
    # Annotation fixed: this helper returns a string on failure, not None-only.
    if result_type not in RESULT_TYPES:
        valid_result_types = ", ".join(RESULT_TYPES)
        return f"Invalid result type: {result_type}. Supported `result_type` types include {valid_result_types}."
    return None
Loading
Loading