dataforgoodfr · guillaume-millot · Apr 30, 2024 · Apr 30, 2024
diff --git a/configs/eval_table_extraction.yaml b/configs/eval_table_extraction.yaml
@@ -2,26 +2,23 @@ pagefilter:
   type: FromFilename
 
 table_extraction:
-  # - type: Camelot
-  #   params:
-  #     flavor: stream
-  # - type: Camelot
-  #   params:
-  #     flavor: lattice
-  - type: Unstructured
+  - type: FromCSV
     params:
-      hi_res_model_name: "yolox"
-  - type: Unstructured
+      csv_directory: "data/extractions/extracttable"
+  - type: LLamaParse
+  - type: UnstructuredAPI
     params:
       hi_res_model_name: "yolox"
-      pdf_image_dpi: 300
+  - type: UnstructuredAPI
+    params:
+      hi_res_model_name: "detectron2_onnx"
   - type: Unstructured
     params:
       hi_res_model_name: "yolox"
-      pdf_image_dpi: 500
-  - type: UnstructuredAPI
+      pdf_image_dpi: 400
+  - type: Unstructured
     params:
-      hi_res_model_name: "yolox"
-  - type: LLamaParse
+      hi_res_model_name: "detectron2_onnx"
+      pdf_image_dpi: 400
 
 # table_cleaning:
diff --git a/country_by_country/table_extraction/unstructured_api.py b/country_by_country/table_extraction/unstructured_api.py
@@ -71,11 +71,20 @@ def __call__(self, pdf_filepath: str) -> dict:
         except Exception as e:
             print(e)
         else:
-            tables_list = [
-                pd.read_html(StringIO(el["metadata"]["text_as_html"]))[0]
-                for el in resp.elements
-                if el["type"] == "Table"
-            ]
+            tables_list = []
+            for el in resp.elements:
+                if el["type"] == "Table":
+                    # Enclose in try block to ignore case where pandas can't read the table (html incorrectly formatted)
+                    try:
+                        table = pd.read_html(StringIO(el["metadata"]["text_as_html"]))[
+                            0
+                        ]
+                    except Exception:
+                        logging.info(
+                            "Html table discarded. Pandas couldn't read the table.",
+                        )
+                    else:
+                        tables_list.append(table)
 
             # Create asset
             new_asset = {

diff --git a/eval/eval_app.py b/eval/eval_app.py
@@ -21,6 +21,7 @@
 # SOFTWARE.
 
 # Standard imports
+import base64
 import json
 import pickle
 import sys
@@ -32,19 +33,23 @@
 from huggingface_hub import hf_hub_download
 from streamlit import session_state as ss
 from streamlit_option_menu import option_menu
-from streamlit_pdf_viewer import pdf_viewer
 from utils import append_count_to_duplicates, convert_to_str, reformat_str
 
 from country_by_country import pagefilter
+from country_by_country.utils.utils import keep_pages
 
 
 # Callbacks
-def on_pdf_selected() -> None:
-    ss["pdf_downloaded"] = hf_hub_download(
-        repo_id="DataForGood/taxobservatory_data",
-        filename=ss.pdf_selected,
-        repo_type="dataset",
-    )
+def download_pdf() -> None:
+    """Download the PDF named by ss.pdf_selected and store its local path in ss.pdf_downloaded."""
+    # Apostrophes are replaced by underscores; that same name is requested and reported on failure.
+    pdf_file = ss.pdf_selected.replace("'", "_")
+    repo_id = "DataForGood/taxobservatory_data"
+    try:
+        ss.pdf_downloaded = hf_hub_download(repo_id, f"pdf/{pdf_file}", repo_type="dataset")
+    except Exception:
+        ss.pdf_downloaded = None  # forget any earlier path so a stale PDF is not rendered
+        st.error("Couldn't download PDF: " + pdf_file)
 
 
 def on_table_selected(key: str) -> None:
@@ -73,25 +78,24 @@ def main(ref_data_file: str = None) -> None:
             ss.ref_uploaded = None
 
     # Display title
-    st.title("Table extraction benchmark")
-
-    st.markdown(
-        """Once you drag and drop a pickle file, you can select a PDF to display
-        its tables\nand visually compare the results. The cells in the tables
-        are colored :green[green] if they are present in the tables of the
-        reference extraction, and :red[red] otherwise. Note the color only
+    st.title(
+        "Table extraction benchmark",
+        help="""Drag and drop a pickle file containg evaluation results, select a PDF to see
+        the corresponding extracted tables and start comparing.
+        Cells in the tables are colored **:green[in green]** if they are present in the tables of the
+        reference extraction, and **:red[red]** otherwise. Note that the color only
         indicates if one extracted value is present in the reference
-        extraction, not if that value is at the right location in the table.""",
+        extraction, not if that value is at the right location in the table. Change the reference extraction via the select box in the left sidebar.""",
     )
 
     # Display sidebar
     pdf_file = None
     with st.sidebar:
         # Select pickle containing results
         uploaded_file = st.file_uploader(
-            "Select a pickle file to load results",
+            "Select a pickle file to load evaluations results",
             type="pkl",
-            help="Run eval_table_extraction.py to generate a picke file.",
+            help="Run _eval_table_extraction.py_ to generate a picke file containing extracted tables for multiple PDFs",
         )
 
         if uploaded_file:
@@ -104,9 +108,10 @@ def main(ref_data_file: str = None) -> None:
             # Select PDF to load results
             pdf_file = st.selectbox(
                 "Select a PDF file",
-                asset_dict.keys(),
-                on_change=on_pdf_selected,
+                sorted(asset_dict.keys()),
+                on_change=download_pdf,
                 key="pdf_selected",
+                help="The corresponding extracted tables will be displayed (both REF and extractions from the picke file)",
             )
 
     # Pull the extractions applied to the PDF
@@ -115,7 +120,7 @@ def main(ref_data_file: str = None) -> None:
 
 
 def process_pdf(pdf_file: str, asset_dict: dict) -> None:
-    # Append REF data matching the PDF to our assets
+    # Append REF data to extractions in assets
     if ss.ref_uploaded is not None:
         company = pdf_file.split("_")[0]
         year = pdf_file.split("_")[1]
@@ -126,20 +131,19 @@ def process_pdf(pdf_file: str, asset_dict: dict) -> None:
             .reset_index(drop=True)
             .dropna(axis="columns", how="all")
         )
-        asset_dict[pdf_file]["table_extractors"].append(
+        asset_dict[pdf_file]["table_extractors"].insert(
+            0,
             {
                 "type": "REF",
                 "params": {"src_file": ref_data_file},
                 "tables": [ref_df],
             },
         )
 
-    # Pull the extractions from the asssets
-    extractions = [
-        extractor["type"] for extractor in asset_dict[pdf_file]["table_extractors"]
-    ]
-    extractions = append_count_to_duplicates(extractions)
-    extractions.append("PDF")
+    # Pull extractions from the assets including REF data
+    extractions = append_count_to_duplicates(
+        [extractor["type"] for extractor in asset_dict[pdf_file]["table_extractors"]],
+    )
 
     # Select reference extraction for comparison (default to REF data)
     with st.sidebar:
@@ -148,17 +152,42 @@ def process_pdf(pdf_file: str, asset_dict: dict) -> None:
         except Exception:
             ref_idx = 0
         ref_extraction = st.selectbox(
-            "Select ref extraction for comparison",
-            extractions[:-1],
+            "Select reference extraction for comparison",
+            extractions,
             index=ref_idx,
         )
         if ref_extraction is not None:
             ss.ref_extraction = ref_extraction
 
-    # Display tabs (one per extraction + one to display PDF)
-    tabs = st.tabs(extractions)
+    # Display tabs (one to display PDF + one per extraction)
+    tabs = st.tabs(["PDF", *extractions])
+
+    # Tab to display PDF
+    with tabs[0]:
+        if not ss.pdf_downloaded:
+            download_pdf()
 
-    for idx, tab in enumerate(tabs[:-1]):
+        if ss.pdf_downloaded:
+            # Get pages to render
+            assets = {}
+            pagefilter.FromFilename()(ss.pdf_downloaded, assets=assets)
+            pages_to_render = list(assets["pagefilter"]["selected_pages"])
+
+            # Filter pages from PDF
+            pdf_fitered = keep_pages(ss.pdf_downloaded, pages_to_render)
+
+            # Get content of pages
+            with open(pdf_fitered, "rb") as f:
+                base64_pdf = base64.b64encode(f.read()).decode("utf-8")
+
+            # Embed content in HTML
+            pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="800" height="1000" type="application/pdf"></iframe>'
+
+            # Display content
+            st.markdown(pdf_display, unsafe_allow_html=True)
+
+    # Tabs to display extractions
+    for idx, tab in enumerate(tabs[1:]):
         with tab:
             # Display parameters of the extraction
             st.write(
@@ -171,86 +200,79 @@ def process_pdf(pdf_file: str, asset_dict: dict) -> None:
             dfs_str = ["Table " + str(i) for i in range(len(dfs))]
 
             # Select table to display
-            selected = option_menu(
-                None,
-                dfs_str,
-                menu_icon=None,
-                icons=None,
-                manual_select=min(ss.selected_idx, len(dfs_str) - 1),
-                orientation="horizontal",
-                key="tab_" + str(idx),
-                on_change=on_table_selected,
-                styles={
-                    "container": {
-                        "padding": "0!important",
-                        "margin": "0!important",
-                        "background-color": "#EFF2F6",
-                    },
-                    "nav-item": {
-                        "max-width": "100px",
-                        "color": "black",
-                        "font-size": "14px",
+            if len(dfs_str) == 0:
+                st.info("No table extracted.")
+            else:
+                selected = option_menu(
+                    None,
+                    dfs_str,
+                    menu_icon=None,
+                    icons=None,
+                    manual_select=min(ss.selected_idx, len(dfs_str) - 1),
+                    orientation="horizontal",
+                    key="tab_" + str(idx),
+                    on_change=on_table_selected,
+                    styles={
+                        "container": {
+                            "padding": "0!important",
+                            "margin": "0!important",
+                            "background-color": "#EFF2F6",
+                        },
+                        "nav-item": {
+                            "max-width": "100px",
+                            "color": "black",
+                            "font-size": "14px",
+                        },
+                        "icon": {"font-size": "0px"},
                     },
-                    "icon": {"font-size": "0px"},
-                },
-            )
-            selected_idx = dfs_str.index(selected)
-
-            # Display table
-            df = dfs[selected_idx]
-
-            # Check if values in table are in tables of reference extraction
-            refvalues = []
-            for dfref in asset_dict[pdf_file]["table_extractors"][
-                extractions.index(ref_extraction)
-            ]["tables"]:
-                refvalues.extend(dfref.map(reformat_str).to_numpy().flatten())
-            mask = df.map(reformat_str).isin(refvalues)
-
-            # Apply font color (green vs red) based on above check
-            def color_mask(val: bool) -> None:
-                return f'color: {"green" if val is True else "red"}'
-
-            dfst = df.style.apply(
-                lambda c, mask=mask: mask[c.name].apply(color_mask),
-            )
-
-            # Display table with appropriate font color
-            column_config = {}
-            for col in df.columns:
-                column_config[col] = st.column_config.Column(width="small")
-
-            st.dataframe(
-                dfst,
-                column_config=column_config,
-                use_container_width=False,
-                height=round(35.5 * (len(dfst.index) + 1)),
-            )
-
-    # Tab to display PDF
-    with tabs[-1]:
-
-        if not ss.pdf_downloaded:
-            ss["pdf_downloaded"] = hf_hub_download(
-                repo_id="DataForGood/taxobservatory_data",
-                filename=ss.pdf_selected,
-                repo_type="dataset",
-            )
-
-        if ss.pdf_downloaded:
-            # Get pages to render
-            assets = {}
-            pagefilter.FromFilename()(ss.pdf_downloaded, assets=assets)
-            pages_to_render = [
-                page + 1 for page in assets["pagefilter"]["selected_pages"]
-            ]
-
-            # Render pages from PDF
-            pdf_viewer(
-                input=ss.pdf_downloaded,
-                pages_to_render=pages_to_render,
-                width=1000,
-            )
+                )
+                selected_idx = dfs_str.index(selected) if selected in dfs_str else 0
+
+                # Display table
+                df = dfs[selected_idx]
+
+                # Rename duplicated empty headers to prevent error when calling st.dataframe()
+                if df.columns.duplicated().sum() > 0:
+                    cols = pd.Series(df.columns)
+                    for dup in set(df.columns[df.columns.duplicated()]):
+                        if dup == "":
+                            st.info(df.columns.get_loc(dup))
+                            cols[df.columns.get_loc(dup)] = [
+                                "COL" + str(idx)
+                                for idx, dup in enumerate(df.columns.get_loc(dup))
+                            ]
+                    df.columns = cols
+
+                # Check if values in table are in tables of reference extraction
+                refvalues = []
+                for dfref in asset_dict[pdf_file]["table_extractors"][
+                    extractions.index(ref_extraction)
+                ]["tables"]:
+                    refvalues.extend(dfref.map(reformat_str).to_numpy().flatten())
+                mask = df.map(reformat_str).isin(refvalues)
+
+                # Apply font color (green vs red) based on above check
+                def color_mask(val: bool) -> None:
+                    return f'color: {"green" if val is True else "red"}'
+
+                dfst = df.style.apply(
+                    lambda c, mask=mask: mask[c.name].apply(color_mask),
+                )
+
+                # Display table with appropriate font color
+                column_config = {}
+                for col in df.columns:
+                    column_config[col] = st.column_config.Column(width="small")
+
+                try:
+                    st.dataframe(
+                        dfst,
+                        column_config=column_config,
+                        use_container_width=False,
+                        height=round(35.5 * (len(dfst.index) + 1)),
+                    )
+                except Exception as error:
+                    st.error(error)
 
 
 if __name__ == "__main__":