Add sync and async extract-json tests, add README

CambioML · Oct 21, 2024 · 6cb2fdb · 6cb2fdb
1 parent 582ddf6
commit 6cb2fdb
Show file tree

Hide file tree

Showing 8 changed files with 141 additions and 3 deletions.
diff --git a/any_parser/any_parser.py b/any_parser/any_parser.py
@@ -343,7 +343,6 @@ def async_fetch(
             if result_type == "json":
                 return response.json()["json"]
             else:
-                print(response.json())
                 markdown_list = response.json()["markdown"]
                 return "\n".join(markdown_list)
         if response.status_code == 202:

diff --git a/examples/sample_data/test_w2.docx b/examples/sample_data/test_w2.docx
diff --git a/examples/sample_data/test_w2.png b/examples/sample_data/test_w2.png
diff --git a/examples/sample_data/test_w2.pptx b/examples/sample_data/test_w2.pptx
diff --git a/run_tests.sh b/run_tests.sh
@@ -1,2 +1,2 @@
 #!/bin/sh
-python -m unittest discover tests
+python -m unittest discover tests -v
diff --git a/tests/README.md b/tests/README.md
@@ -0,0 +1,27 @@
+# Testing
+This document explains how to run the tests for the AnyParser SDK. Run them before submitting any pull request.
+
+These tests are written using the unittest framework in Python. The tests are located in the `tests/test.py` file. Test data is located in the `tests/test_data.py` file.
+
+## Setup
+1. Install the required packages by running the following command:
+```bash
+pip install Levenshtein
+```
+
+## Running Tests
+1. Make sure you are in the repository root folder.
+2. Run the following command:
+```bash
+./run_tests.sh
+```
+
+To run a single test from `tests/test.py`, select it by name with the `-k` option:
+```bash
+python -m unittest -k <test_name>
+```
+
+For example, to run only `test_pdf_sync_extract`:
+```bash
+python -m unittest -k test_pdf_sync_extract
+```
diff --git a/tests/test.py b/tests/test.py
@@ -2,11 +2,14 @@
 
 import os
 import sys
+import time
 import unittest
 
 import Levenshtein
 from dotenv import load_dotenv
 
+from tests.test_data import EXTRACT_JSON_TEST_DATA
+
 sys.path.append(".")
 load_dotenv(override=True)
 from any_parser import AnyParser  # noqa: E402
@@ -174,6 +177,37 @@ def test_image_async_extract_and_fetch(self):
             percentage, 90, f"Output similarity too low: {percentage:.2f}%"
         )
 
+    def test_sync_extract_json(self):
+        """Run extract_json on every EXTRACT_JSON_TEST_DATA entry, one subtest each."""
+        for data in EXTRACT_JSON_TEST_DATA:
+            with self.subTest(working_file=data["working_file"]):
+                # extract: the call returns the result plus a second value checked below
+                key_value_result, elapsed_time = self.ap.extract_json(
+                    data["working_file"], data["extract_instruction"]
+                )

+                # assertions: exact match on the result, "Time Elapsed" in the 2nd value
+                self.assertEqual(key_value_result, data["correct_output"])
+                self.assertIn("Time Elapsed", elapsed_time)
+
+    def test_async_extract_json_and_fetch(self):
+        """Submit each EXTRACT_JSON_TEST_DATA entry asynchronously, then fetch it."""
+        for data in EXTRACT_JSON_TEST_DATA:
+            with self.subTest(working_file=data["working_file"]):
+                # extract: submit the file; the returned value is used as file_id below
+                file_id = self.ap.async_extract_json(
+                    data["working_file"], data["extract_instruction"]
+                )
+                self.assertFalse(file_id.startswith("Error:"), file_id)
+                # fetch the JSON result for the file_id returned above
+                key_value_result = self.ap.async_fetch(
+                    file_id=file_id, result_type="json"
+                )
+                # assertions: fetched result must equal the expected output exactly
+                self.assertEqual(key_value_result, data["correct_output"])
+                # wait 1 s between requests (skipped when an assertion above fails)
+                time.sleep(1)
+
 
 if __name__ == "__main__":
-    unittest.main()
+    unittest.main(verbosity=2)
diff --git a/tests/test_data.py b/tests/test_data.py
@@ -0,0 +1,78 @@
+EXTRACT_JSON_TEST_DATA = [  # cases iterated by the extract-JSON tests in tests/test.py
+    {  # PDF input
+        "working_file": "./examples/sample_data/test1.pdf",
+        "extract_instruction": {
+            "social_security_number": "the social security number of the employee",
+            "ein": "the employer identification number",
+            "first_name": "the first name of the employee",
+            "last_name": "the last name of the employee",
+        },
+        "correct_output": [  # compared with assertEqual against the extraction result
+            [
+                {
+                    "social_security_number": "758-58-5787",
+                    "ein": "78-8778788",
+                    "first_name": "Jesan",
+                    "last_name": "Rahaman",
+                }
+            ]
+        ],
+    },
+    {  # PPTX input; same instruction and expected output as the PDF case
+        "working_file": "./examples/sample_data/test_w2.pptx",
+        "extract_instruction": {
+            "social_security_number": "the social security number of the employee",
+            "ein": "the employer identification number",
+            "first_name": "the first name of the employee",
+            "last_name": "the last name of the employee",
+        },
+        "correct_output": [
+            [
+                {
+                    "social_security_number": "758-58-5787",
+                    "ein": "78-8778788",
+                    "first_name": "Jesan",
+                    "last_name": "Rahaman",
+                }
+            ]
+        ],
+    },
+    {  # DOCX input; same instruction and expected output as the PDF case
+        "working_file": "./examples/sample_data/test_w2.docx",
+        "extract_instruction": {
+            "social_security_number": "the social security number of the employee",
+            "ein": "the employer identification number",
+            "first_name": "the first name of the employee",
+            "last_name": "the last name of the employee",
+        },
+        "correct_output": [
+            [
+                {
+                    "social_security_number": "758-58-5787",
+                    "ein": "78-8778788",
+                    "first_name": "Jesan",
+                    "last_name": "Rahaman",
+                }
+            ]
+        ],
+    },
+    {  # PNG input; same instruction and expected output as the PDF case
+        "working_file": "./examples/sample_data/test_w2.png",
+        "extract_instruction": {
+            "social_security_number": "the social security number of the employee",
+            "ein": "the employer identification number",
+            "first_name": "the first name of the employee",
+            "last_name": "the last name of the employee",
+        },
+        "correct_output": [
+            [
+                {
+                    "social_security_number": "758-58-5787",
+                    "ein": "78-8778788",
+                    "first_name": "Jesan",
+                    "last_name": "Rahaman",
+                }
+            ]
+        ],
+    },
+]