Eval app improvements

dataforgoodfr · Apr 30, 2024 · 29b141a
1 parent ef31063
commit 29b141a
Show file tree

Hide file tree

Showing 5 changed files with 199 additions and 157 deletions.
diff --git a/collecte/pdf_downloader.py b/collecte/pdf_downloader.py
@@ -72,7 +72,6 @@ def download_pdf(
     company_folder = download_folder / website_name
 
     Path.mkdir(company_folder, parents=True, exist_ok=True)
-    #local_filename = Path(company_folder) / url.split("/")[-1]
     local_filename = Path(company_folder) / (website_name + "___" + url.split("/")[-1])
 
     exception_status = None

diff --git a/configs/eval_table_extraction.yaml b/configs/eval_table_extraction.yaml
@@ -2,26 +2,21 @@ pagefilter:
   type: FromFilename
 
 table_extraction:
-  # - type: Camelot
-  #   params:
-  #     flavor: stream
-  # - type: Camelot
-  #   params:
-  #     flavor: lattice
-  - type: Unstructured
+  - type: FromCSV
     params:
-      hi_res_model_name: "yolox"
-  - type: Unstructured
+      csv_directory: "data/extractions/extracttable"
+  - type: LLamaParse
+  - type: UnstructuredAPI
     params:
       hi_res_model_name: "yolox"
-      pdf_image_dpi: 300
+  - type: UnstructuredAPI
+    params:
+      hi_res_model_name: "detectron2_onnx"
   - type: Unstructured
     params:
       hi_res_model_name: "yolox"
-      pdf_image_dpi: 500
-  - type: UnstructuredAPI
+      pdf_image_dpi: 400
+  - type: Unstructured
     params:
-      hi_res_model_name: "yolox"
-  - type: LLamaParse
-
-# table_cleaning:
+      hi_res_model_name: "detectron2_onnx"
+      pdf_image_dpi: 400
diff --git a/country_by_country/table_extraction/unstructured_api.py b/country_by_country/table_extraction/unstructured_api.py
@@ -71,11 +71,21 @@ def __call__(self, pdf_filepath: str) -> dict:
         except Exception as e:
             print(e)
         else:
-            tables_list = [
-                pd.read_html(StringIO(el["metadata"]["text_as_html"]))[0]
-                for el in resp.elements
-                if el["type"] == "Table"
-            ]
+            tables_list = []
+            for el in resp.elements:
+                if el["type"] == "Table":
+                    # Enclose in try block to ignore case when pandas can't read the table
+                    # Happens when the html is incorrectly formatted
+                    try:
+                        table = pd.read_html(StringIO(el["metadata"]["text_as_html"]))[
+                            0
+                        ]
+                    except Exception:
+                        logging.info(
+                            "Html table discarded. Pandas couldn't read the table.",
+                        )
+                    else:
+                        tables_list.append(table)
 
             # Create asset
             new_asset = {