From 29b141ad1b17cc37e13cd80f52eed46a3252d6de Mon Sep 17 00:00:00 2001
From: Guillaume Millot <guillaume@Guillaumes-MBP.lan>
Date: Tue, 30 Apr 2024 16:01:45 +0200
Subject: [PATCH] Eval app improvements

---
 collecte/pdf_downloader.py                    |   1 -
 configs/eval_table_extraction.yaml            |  27 +-
 .../table_extraction/unstructured_api.py      |  20 +-
 eval/eval_app.py                              | 295 ++++++++++--------
 eval/utils.py                                 |  13 +
 5 files changed, 199 insertions(+), 157 deletions(-)

diff --git a/collecte/pdf_downloader.py b/collecte/pdf_downloader.py
index 52637de..928484e 100644
--- a/collecte/pdf_downloader.py
+++ b/collecte/pdf_downloader.py
@@ -72,7 +72,6 @@ def download_pdf(
     company_folder = download_folder / website_name
 
     Path.mkdir(company_folder, parents=True, exist_ok=True)
-    #local_filename = Path(company_folder) / url.split("/")[-1]
     local_filename = Path(company_folder) / (website_name + "___" + url.split("/")[-1])
 
     exception_status = None
diff --git a/configs/eval_table_extraction.yaml b/configs/eval_table_extraction.yaml
index 966d364..30cc1c1 100644
--- a/configs/eval_table_extraction.yaml
+++ b/configs/eval_table_extraction.yaml
@@ -2,26 +2,21 @@ pagefilter:
   type: FromFilename
 
 table_extraction:
-  # - type: Camelot
-  #   params:
-  #     flavor: stream
-  # - type: Camelot
-  #   params:
-  #     flavor: lattice
-  - type: Unstructured
+  - type: FromCSV
     params:
-      hi_res_model_name: "yolox"
-  - type: Unstructured
+      csv_directory: "data/extractions/extracttable"
+  - type: LLamaParse
+  - type: UnstructuredAPI
     params:
       hi_res_model_name: "yolox"
-      pdf_image_dpi: 300
+  - type: UnstructuredAPI
+    params:
+      hi_res_model_name: "detectron2_onnx"
   - type: Unstructured
     params:
       hi_res_model_name: "yolox"
-      pdf_image_dpi: 500
-  - type: UnstructuredAPI
+      pdf_image_dpi: 400
+  - type: Unstructured
     params:
-      hi_res_model_name: "yolox"
-  - type: LLamaParse
-
-# table_cleaning:
+      hi_res_model_name: "detectron2_onnx"
+      pdf_image_dpi: 400
\ No newline at end of file
diff --git a/country_by_country/table_extraction/unstructured_api.py b/country_by_country/table_extraction/unstructured_api.py
index b5ed56c..9ea0353 100644
--- a/country_by_country/table_extraction/unstructured_api.py
+++ b/country_by_country/table_extraction/unstructured_api.py
@@ -71,11 +71,21 @@ def __call__(self, pdf_filepath: str) -> dict:
         except Exception as e:
             print(e)
         else:
-            tables_list = [
-                pd.read_html(StringIO(el["metadata"]["text_as_html"]))[0]
-                for el in resp.elements
-                if el["type"] == "Table"
-            ]
+            tables_list = []
+            for el in resp.elements:
+                if el["type"] == "Table":
+                    # Enclose in try block to ignore case when pandas can't read the table
+                    # Happens when the html is incorrectly formatted
+                    try:
+                        table = pd.read_html(StringIO(el["metadata"]["text_as_html"]))[
+                            0
+                        ]
+                    except Exception:
+                        logging.info(
+                            "Html table discarded. Pandas couldn't read the table.",
+                        )
+                    else:
+                        tables_list.append(table)
 
             # Create asset
             new_asset = {
diff --git a/eval/eval_app.py b/eval/eval_app.py
index f80d633..9445cce 100644
--- a/eval/eval_app.py
+++ b/eval/eval_app.py
@@ -21,6 +21,7 @@
 # SOFTWARE.
 
 # Standard imports
+import base64
 import json
 import pickle
 import sys
@@ -32,24 +33,27 @@
 from huggingface_hub import hf_hub_download
 from streamlit import session_state as ss
 from streamlit_option_menu import option_menu
-from streamlit_pdf_viewer import pdf_viewer
-from utils import append_count_to_duplicates, convert_to_str, reformat_str
+from utils import (
+    append_count_to_duplicates,
+    convert_to_str,
+    fill_df_empty_headers,
+    reformat_str,
+)
 
 from country_by_country import pagefilter
+from country_by_country.utils.utils import keep_pages
 
 
-# Callbacks
-def on_pdf_selected() -> None:
-    ss["pdf_downloaded"] = hf_hub_download(
-        repo_id="DataForGood/taxobservatory_data",
-        filename=ss.pdf_selected,
-        repo_type="dataset",
-    )
-
-
-def on_table_selected(key: str) -> None:
-    selected = ss[key]
-    ss.selected_idx = int(selected.split(" ", 1)[1])
+def download_pdf() -> None:
+    """Download the selected PDF into ss.pdf_downloaded (reset to None on failure)."""
+    repo_id = "DataForGood/taxobservatory_data"
+    filename = f"pdf/{ss.pdf_selected}"
+    try:
+        ss.pdf_downloaded = hf_hub_download(repo_id, filename, repo_type="dataset")
+    except Exception:
+        # Forget the previous PDF so the viewer cannot show it for this selection
+        ss.pdf_downloaded = None
+        st.error("Couldn't download PDF: " + ss.pdf_selected.replace("'", "_"))
 
 
 def main(ref_data_file: str = None) -> None:
@@ -73,15 +77,15 @@ def main(ref_data_file: str = None) -> None:
             ss.ref_uploaded = None
 
     # Display title
-    st.title("Table extraction benchmark")
-
-    st.markdown(
-        """Once you drag and drop a pickle file, you can select a PDF to display
-        its tables\nand visually compare the results. The cells in the tables
-        are colored :green[green] if they are present in the tables of the
-        reference extraction, and :red[red] otherwise. Note the color only
-        indicates if one extracted value is present in the reference
-        extraction, not if that value is at the right location in the table.""",
+    st.title(
+        "Table extraction benchmark",
+        help="""Drag and drop a pickle file containg evaluation results, select a PDF to see
+        the corresponding extracted tables and start comparing. Cells in the tables are
+        colored **:green[in green]** if they are present in the tables of the reference
+        extraction, and **:red[red]** otherwise. Note that the color only indicates if one
+        extracted value is present in the reference extraction, not if that value is at the
+        right location in the table. Change the reference extraction via the select box in the
+        left sidebar.""",
     )
 
     # Display sidebar
@@ -89,9 +93,10 @@ def main(ref_data_file: str = None) -> None:
     with st.sidebar:
         # Select pickle containing results
         uploaded_file = st.file_uploader(
-            "Select a pickle file to load results",
+            "Select a pickle file to load evaluations results",
             type="pkl",
-            help="Run eval_table_extraction.py to generate a picke file.",
+            help="""Run _eval_table_extraction.py_ to generate a picke file containing
+            extracted tables for multiple PDFs""",
         )
 
         if uploaded_file:
@@ -104,42 +109,52 @@ def main(ref_data_file: str = None) -> None:
             # Select PDF to load results
             pdf_file = st.selectbox(
                 "Select a PDF file",
-                asset_dict.keys(),
-                on_change=on_pdf_selected,
+                sorted(asset_dict.keys()),
+                on_change=download_pdf,
                 key="pdf_selected",
+                help="""The corresponding extracted tables will be displayed (both REF and
+                extractions from the picke file)""",
             )
 
-    # Pull the extractions applied to the PDF
+    # Display tabs containing PDF and extracted tables
     if pdf_file is not None:
         process_pdf(pdf_file, asset_dict)
 
 
+def append_ref_data(pdf_file: str, asset_dict: dict) -> None:
+    """Insert a "REF" extractor built from ss.ref_uploaded ahead of the others."""
+    # pdf_file is presumably named "<company>_<year>_..." (TODO confirm)
+    name_parts = pdf_file.split("_")
+    company, year = name_parts[0], name_parts[1]
+    # Positions of the reference columns kept: 2, 5 to 9 and 15 to 17
+    kept_cols = [2, *range(5, 10), *range(15, 18)]
+    # Reference rows of that company and year, minus the columns left entirely empty
+    matching = ss.ref_uploaded.query(f'company=="{company}" and year=={year}')
+    ref_df = matching.iloc[:, kept_cols].reset_index(drop=True)
+    ref_df = ref_df.dropna(axis="columns", how="all")
+    extractors = asset_dict[pdf_file]["table_extractors"]
+    ref_extractor = {
+        "type": "REF",
+        "params": {"src_file": ref_data_file},
+        "tables": [ref_df],
+    }
+    extractors.insert(0, ref_extractor)
+
+
+def select_table(key: str) -> None:
+    """Set ss.selected_idx to the integer following the first space in ss[key]."""
+    ss.selected_idx = int(ss[key].split(" ", 1)[1])
+
+
 def process_pdf(pdf_file: str, asset_dict: dict) -> None:
-    # Append REF data matching the PDF to our assets
+    # Append REF data to extractions in assets
     if ss.ref_uploaded is not None:
-        company = pdf_file.split("_")[0]
-        year = pdf_file.split("_")[1]
-        cols = [2, *list(range(5, 10)), *list(range(15, 18))]
-        ref_df = (
-            ss.ref_uploaded.query(f'company=="{company}" and year=={year}')
-            .iloc[:, cols]
-            .reset_index(drop=True)
-            .dropna(axis="columns", how="all")
-        )
-        asset_dict[pdf_file]["table_extractors"].append(
-            {
-                "type": "REF",
-                "params": {"src_file": ref_data_file},
-                "tables": [ref_df],
-            },
-        )
+        append_ref_data(pdf_file, asset_dict)
 
-    # Pull the extractions from the asssets
-    extractions = [
-        extractor["type"] for extractor in asset_dict[pdf_file]["table_extractors"]
-    ]
-    extractions = append_count_to_duplicates(extractions)
-    extractions.append("PDF")
+    # List all the extraction names including REF
+    extractions = append_count_to_duplicates(
+        [extractor["type"] for extractor in asset_dict[pdf_file]["table_extractors"]],
+    )
 
     # Select reference extraction for comparison (default to REF data)
     with st.sidebar:
@@ -148,17 +163,43 @@ def process_pdf(pdf_file: str, asset_dict: dict) -> None:
         except Exception:
             ref_idx = 0
         ref_extraction = st.selectbox(
-            "Select ref extraction for comparison",
-            extractions[:-1],
+            "Select reference extraction for comparison",
+            extractions,
             index=ref_idx,
         )
         if ref_extraction is not None:
             ss.ref_extraction = ref_extraction
 
-    # Display tabs (one per extraction + one to display PDF)
-    tabs = st.tabs(extractions)
+    # Display tabs (one to display PDF + one per extraction)
+    tabs = st.tabs(["PDF", *extractions])
+
+    # Tab to display PDF
+    with tabs[0]:
+        if not ss.pdf_downloaded:
+            download_pdf()
+
+        if ss.pdf_downloaded:
+            # Get pages to render
+            assets = {}
+            pagefilter.FromFilename()(ss.pdf_downloaded, assets=assets)
+            pages_to_render = list(assets["pagefilter"]["selected_pages"])
+
+            # Filter pages from PDF
+            pdf_fitered = keep_pages(ss.pdf_downloaded, pages_to_render)
+
+            # Get content of pages
+            with Path(pdf_fitered).open("rb") as f:
+                base64_pdf = base64.b64encode(f.read()).decode("utf-8")
+
+            # Embed content in HTML
+            pdf_display = f"""<iframe src="data:application/pdf;base64,{base64_pdf}"
+            width="800" height="1000" type="application/pdf"></iframe>"""
+
+            # Display content
+            st.markdown(pdf_display, unsafe_allow_html=True)
 
-    for idx, tab in enumerate(tabs[:-1]):
+    # Tabs to display extractions
+    for idx, tab in enumerate(tabs[1:]):
         with tab:
             # Display parameters of the extraction
             st.write(
@@ -171,86 +212,70 @@ def process_pdf(pdf_file: str, asset_dict: dict) -> None:
             dfs_str = ["Table " + str(i) for i in range(len(dfs))]
 
             # Select table to display
-            selected = option_menu(
-                None,
-                dfs_str,
-                menu_icon=None,
-                icons=None,
-                manual_select=min(ss.selected_idx, len(dfs_str) - 1),
-                orientation="horizontal",
-                key="tab_" + str(idx),
-                on_change=on_table_selected,
-                styles={
-                    "container": {
-                        "padding": "0!important",
-                        "margin": "0!important",
-                        "background-color": "#EFF2F6",
+            if len(dfs_str) == 0:
+                st.info("No table extracted.")
+            else:
+                selected = option_menu(
+                    None,
+                    dfs_str,
+                    menu_icon=None,
+                    icons=None,
+                    manual_select=min(ss.selected_idx, len(dfs_str) - 1),
+                    orientation="horizontal",
+                    key="tab_" + str(idx),
+                    on_change=select_table,
+                    styles={
+                        "container": {
+                            "padding": "0!important",
+                            "margin": "0!important",
+                            "background-color": "#EFF2F6",
+                        },
+                        "nav-item": {
+                            "max-width": "100px",
+                            "color": "black",
+                            "font-size": "14px",
+                        },
+                        "icon": {"font-size": "0px"},
                     },
-                    "nav-item": {
-                        "max-width": "100px",
-                        "color": "black",
-                        "font-size": "14px",
-                    },
-                    "icon": {"font-size": "0px"},
-                },
-            )
-            selected_idx = dfs_str.index(selected)
-
-            # Display table
-            df = dfs[selected_idx]
-
-            # Check if values in table are in tables of reference extraction
-            refvalues = []
-            for dfref in asset_dict[pdf_file]["table_extractors"][
-                extractions.index(ref_extraction)
-            ]["tables"]:
-                refvalues.extend(dfref.map(reformat_str).to_numpy().flatten())
-            mask = df.map(reformat_str).isin(refvalues)
-
-            # Apply font color (green vs red) based on above check
-            def color_mask(val: bool) -> None:
-                return f'color: {"green" if val is True else "red"}'
-
-            dfst = df.style.apply(
-                lambda c, mask=mask: mask[c.name].apply(color_mask),
-            )
-
-            # Display table with appropriate font color
-            column_config = {}
-            for col in df.columns:
-                column_config[col] = st.column_config.Column(width="small")
-
-            st.dataframe(
-                dfst,
-                column_config=column_config,
-                use_container_width=False,
-                height=round(35.5 * (len(dfst.index) + 1)),
-            )
-
-    # Tab to display PDF
-    with tabs[-1]:
-
-        if not ss.pdf_downloaded:
-            ss["pdf_downloaded"] = hf_hub_download(
-                repo_id="DataForGood/taxobservatory_data",
-                filename=f"pdf/{ss.pdf_selected}",
-                repo_type="dataset",
-            )
-
-        if ss.pdf_downloaded:
-            # Get pages to render
-            assets = {}
-            pagefilter.FromFilename()(ss.pdf_downloaded, assets=assets)
-            pages_to_render = [
-                page + 1 for page in assets["pagefilter"]["selected_pages"]
-            ]
-
-            # Render pages from PDF
-            pdf_viewer(
-                input=ss.pdf_downloaded,
-                pages_to_render=pages_to_render,
-                width=1000,
-            )
+                )
+                selected_idx = dfs_str.index(selected) if selected in dfs_str else 0
+
+                # Display table
+                df = dfs[selected_idx]
+
+                # Fill any empty headers to prevent error when calling st.dataframe()
+                fill_df_empty_headers(df)
+
+                # Check if values in table are in tables of reference extraction
+                refvalues = []
+                for dfref in asset_dict[pdf_file]["table_extractors"][
+                    extractions.index(ref_extraction)
+                ]["tables"]:
+                    refvalues.extend(dfref.map(reformat_str).to_numpy().flatten())
+                mask = df.map(reformat_str).isin(refvalues)
+
+                # Apply font color (green vs red) based on above check
+                def color_mask(val: bool) -> None:
+                    return f'color: {"green" if val is True else "red"}'
+
+                dfst = df.style.apply(
+                    lambda c, mask=mask: mask[c.name].apply(color_mask),
+                )
+
+                # Display table with appropriate font color
+                column_config = {}
+                for col in df.columns:
+                    column_config[col] = st.column_config.Column(width="small")
+
+                try:
+                    st.dataframe(
+                        dfst,
+                        column_config=column_config,
+                        use_container_width=False,
+                        height=round(35.5 * (len(dfst.index) + 1)),
+                    )
+                except Exception as error:
+                    st.error(error)
 
 
 if __name__ == "__main__":
diff --git a/eval/utils.py b/eval/utils.py
index cd15d2f..66866d5 100644
--- a/eval/utils.py
+++ b/eval/utils.py
@@ -24,6 +24,8 @@
 import contextlib
 import re
 
+import pandas as pd
+
 
 def append_count_to_duplicates(strings: list[str]) -> list[str]:
     """Append count to duplicate strings in array"""
@@ -52,3 +54,14 @@ def reformat_str(el: any) -> str:
     Output string."""
     el = convert_to_str(el).replace(",", "")
     return re.sub(r"\((\d+)\)", r"-\1", el)
+
+
+def fill_df_empty_headers(df: pd.DataFrame) -> None:
+    """Rename duplicated empty ("") headers of df, in place, to "COL<position>"."""
+    names = list(df.columns)
+    # Don't iterate Index.get_loc(): it is a slice for contiguous sorted duplicates
+    empty = [pos for pos, name in enumerate(names) if name == ""]
+    if len(empty) > 1:  # a lone empty header is not a duplicate, leave it as is
+        for pos in empty:
+            names[pos] = f"COL{pos}"
+        df.columns = pd.Index(names, name=df.columns.name)