From 29b141ad1b17cc37e13cd80f52eed46a3252d6de Mon Sep 17 00:00:00 2001 From: Guillaume Millot Date: Tue, 30 Apr 2024 16:01:45 +0200 Subject: [PATCH] Eval app improvements --- collecte/pdf_downloader.py | 1 - configs/eval_table_extraction.yaml | 27 +- .../table_extraction/unstructured_api.py | 20 +- eval/eval_app.py | 295 ++++++++++-------- eval/utils.py | 13 + 5 files changed, 199 insertions(+), 157 deletions(-) diff --git a/collecte/pdf_downloader.py b/collecte/pdf_downloader.py index 52637de..928484e 100644 --- a/collecte/pdf_downloader.py +++ b/collecte/pdf_downloader.py @@ -72,7 +72,6 @@ def download_pdf( company_folder = download_folder / website_name Path.mkdir(company_folder, parents=True, exist_ok=True) - #local_filename = Path(company_folder) / url.split("/")[-1] local_filename = Path(company_folder) / (website_name + "___" + url.split("/")[-1]) exception_status = None diff --git a/configs/eval_table_extraction.yaml b/configs/eval_table_extraction.yaml index 966d364..30cc1c1 100644 --- a/configs/eval_table_extraction.yaml +++ b/configs/eval_table_extraction.yaml @@ -2,26 +2,21 @@ pagefilter: type: FromFilename table_extraction: - # - type: Camelot - # params: - # flavor: stream - # - type: Camelot - # params: - # flavor: lattice - - type: Unstructured + - type: FromCSV params: - hi_res_model_name: "yolox" - - type: Unstructured + csv_directory: "data/extractions/extracttable" + - type: LLamaParse + - type: UnstructuredAPI params: hi_res_model_name: "yolox" - pdf_image_dpi: 300 + - type: UnstructuredAPI + params: + hi_res_model_name: "detectron2_onnx" - type: Unstructured params: hi_res_model_name: "yolox" - pdf_image_dpi: 500 - - type: UnstructuredAPI + pdf_image_dpi: 400 + - type: Unstructured params: - hi_res_model_name: "yolox" - - type: LLamaParse - -# table_cleaning: + hi_res_model_name: "detectron2_onnx" + pdf_image_dpi: 400 \ No newline at end of file diff --git a/country_by_country/table_extraction/unstructured_api.py b/country_by_country/table_extraction/unstructured_api.py index b5ed56c..9ea0353 100644 --- a/country_by_country/table_extraction/unstructured_api.py +++ b/country_by_country/table_extraction/unstructured_api.py @@ -71,11 +71,21 @@ def __call__(self, pdf_filepath: str) -> dict: except Exception as e: print(e) else: - tables_list = [ - pd.read_html(StringIO(el["metadata"]["text_as_html"]))[0] - for el in resp.elements - if el["type"] == "Table" - ] + tables_list = [] + for el in resp.elements: + if el["type"] == "Table": + # Enclose in try block to ignore case when pandas can't read the table + # Happens when the html is incorrectly formatted + try: + table = pd.read_html(StringIO(el["metadata"]["text_as_html"]))[ + 0 + ] + except Exception: + logging.info( + "Html table discarded. Pandas couldn't read the table.", + ) + else: + tables_list.append(table) # Create asset new_asset = { diff --git a/eval/eval_app.py b/eval/eval_app.py index f80d633..9445cce 100644 --- a/eval/eval_app.py +++ b/eval/eval_app.py @@ -21,6 +21,7 @@ # SOFTWARE. # Standard imports +import base64 import json import pickle import sys @@ -32,24 +33,27 @@ from huggingface_hub import hf_hub_download from streamlit import session_state as ss from streamlit_option_menu import option_menu -from streamlit_pdf_viewer import pdf_viewer -from utils import append_count_to_duplicates, convert_to_str, reformat_str +from utils import ( + append_count_to_duplicates, + convert_to_str, + fill_df_empty_headers, + reformat_str, +) from country_by_country import pagefilter +from country_by_country.utils.utils import keep_pages -# Callbacks -def on_pdf_selected() -> None: - ss["pdf_downloaded"] = hf_hub_download( - repo_id="DataForGood/taxobservatory_data", - filename=ss.pdf_selected, - repo_type="dataset", - ) - - -def on_table_selected(key: str) -> None: - selected = ss[key] - ss.selected_idx = int(selected.split(" ", 1)[1]) +def download_pdf() -> None: + pdf_file = ss.pdf_selected.replace("'", "_") + try: + ss.pdf_downloaded = hf_hub_download( + repo_id="DataForGood/taxobservatory_data", + filename=f"pdf/{ss.pdf_selected}", + repo_type="dataset", + ) + except Exception: + st.error("Couldn't download PDF: " + pdf_file) def main(ref_data_file: str = None) -> None: @@ -73,15 +77,15 @@ def main(ref_data_file: str = None) -> None: ss.ref_uploaded = None # Display title - st.title("Table extraction benchmark") - - st.markdown( - """Once you drag and drop a pickle file, you can select a PDF to display - its tables\nand visually compare the results. The cells in the tables - are colored :green[green] if they are present in the tables of the - reference extraction, and :red[red] otherwise. Note the color only - indicates if one extracted value is present in the reference - extraction, not if that value is at the right location in the table.""", + st.title( + "Table extraction benchmark", + help="""Drag and drop a pickle file containg evaluation results, select a PDF to see + the corresponding extracted tables and start comparing. Cells in the tables are + colored **:green[in green]** if they are present in the tables of the reference + extraction, and **:red[red]** otherwise. Note that the color only indicates if one + extracted value is present in the reference extraction, not if that value is at the + right location in the table. Change the reference extraction via the select box in the + left sidebar.""", ) # Display sidebar @@ -89,9 +93,10 @@ def main(ref_data_file: str = None) -> None: with st.sidebar: # Select pickle containing results uploaded_file = st.file_uploader( - "Select a pickle file to load results", + "Select a pickle file to load evaluations results", type="pkl", - help="Run eval_table_extraction.py to generate a picke file.", + help="""Run _eval_table_extraction.py_ to generate a picke file containing + extracted tables for multiple PDFs""", ) if uploaded_file: @@ -104,42 +109,52 @@ def main(ref_data_file: str = None) -> None: # Select PDF to load results pdf_file = st.selectbox( "Select a PDF file", - asset_dict.keys(), - on_change=on_pdf_selected, + sorted(asset_dict.keys()), + on_change=download_pdf, key="pdf_selected", + help="""The corresponding extracted tables will be displayed (both REF and + extractions from the picke file)""", ) - # Pull the extractions applied to the PDF + # Display tabs containing PDF and extracted tables if pdf_file is not None: process_pdf(pdf_file, asset_dict) +def append_ref_data(pdf_file: str, asset_dict: dict) -> None: + company = pdf_file.split("_")[0] + year = pdf_file.split("_")[1] + cols = [2, *list(range(5, 10)), *list(range(15, 18))] + ref_df = ( + ss.ref_uploaded.query(f'company=="{company}" and year=={year}') + .iloc[:, cols] + .reset_index(drop=True) + .dropna(axis="columns", how="all") + ) + asset_dict[pdf_file]["table_extractors"].insert( + 0, + { + "type": "REF", + "params": {"src_file": ref_data_file}, + "tables": [ref_df], + }, + ) + + +def select_table(key: str) -> None: + selected = ss[key] + ss.selected_idx = int(selected.split(" ", 1)[1]) + + def process_pdf(pdf_file: str, asset_dict: dict) -> None: - # Append REF data matching the PDF to our assets + # Append REF data to extractions in assets if ss.ref_uploaded is not None: - company = pdf_file.split("_")[0] - year = pdf_file.split("_")[1] - cols = [2, *list(range(5, 10)), *list(range(15, 18))] - ref_df = ( - ss.ref_uploaded.query(f'company=="{company}" and year=={year}') - .iloc[:, cols] - .reset_index(drop=True) - .dropna(axis="columns", how="all") - ) - asset_dict[pdf_file]["table_extractors"].append( - { - "type": "REF", - "params": {"src_file": ref_data_file}, - "tables": [ref_df], - }, - ) + append_ref_data(pdf_file, asset_dict) - # Pull the extractions from the asssets - extractions = [ - extractor["type"] for extractor in asset_dict[pdf_file]["table_extractors"] - ] - extractions = append_count_to_duplicates(extractions) - extractions.append("PDF") + # List all the extraction names including REF + extractions = append_count_to_duplicates( + [extractor["type"] for extractor in asset_dict[pdf_file]["table_extractors"]], + ) # Select reference extraction for comparison (default to REF data) with st.sidebar: @@ -148,17 +163,43 @@ def process_pdf(pdf_file: str, asset_dict: dict) -> None: except Exception: ref_idx = 0 ref_extraction = st.selectbox( - "Select ref extraction for comparison", - extractions[:-1], + "Select reference extraction for comparison", + extractions, index=ref_idx, ) if ref_extraction is not None: ss.ref_extraction = ref_extraction - # Display tabs (one per extraction + one to display PDF) - tabs = st.tabs(extractions) + # Display tabs (one to display PDF + one per extraction) + tabs = st.tabs(["PDF", *extractions]) + + # Tab to display PDF + with tabs[0]: + if not ss.pdf_downloaded: + download_pdf() + + if ss.pdf_downloaded: + # Get pages to render + assets = {} + pagefilter.FromFilename()(ss.pdf_downloaded, assets=assets) + pages_to_render = list(assets["pagefilter"]["selected_pages"]) + + # Filter pages from PDF + pdf_fitered = keep_pages(ss.pdf_downloaded, pages_to_render) + + # Get content of pages + with Path(pdf_fitered).open("rb") as f: + base64_pdf = base64.b64encode(f.read()).decode("utf-8") + + # Embed content in HTML + pdf_display = f"""""" + + # Display content + st.markdown(pdf_display, unsafe_allow_html=True) - for idx, tab in enumerate(tabs[:-1]): + # Tabs to display extractions + for idx, tab in enumerate(tabs[1:]): with tab: # Display parameters of the extraction st.write( @@ -171,86 +212,70 @@ def process_pdf(pdf_file: str, asset_dict: dict) -> None: dfs_str = ["Table " + str(i) for i in range(len(dfs))] # Select table to display - selected = option_menu( - None, - dfs_str, - menu_icon=None, - icons=None, - manual_select=min(ss.selected_idx, len(dfs_str) - 1), - orientation="horizontal", - key="tab_" + str(idx), - on_change=on_table_selected, - styles={ - "container": { - "padding": "0!important", - "margin": "0!important", - "background-color": "#EFF2F6", + if len(dfs_str) == 0: + st.info("No table extracted.") + else: + selected = option_menu( + None, + dfs_str, + menu_icon=None, + icons=None, + manual_select=min(ss.selected_idx, len(dfs_str) - 1), + orientation="horizontal", + key="tab_" + str(idx), + on_change=select_table, + styles={ + "container": { + "padding": "0!important", + "margin": "0!important", + "background-color": "#EFF2F6", + }, + "nav-item": { + "max-width": "100px", + "color": "black", + "font-size": "14px", + }, + "icon": {"font-size": "0px"}, }, - "nav-item": { - "max-width": "100px", - "color": "black", - "font-size": "14px", - }, - "icon": {"font-size": "0px"}, - }, - ) - selected_idx = dfs_str.index(selected) - - # Display table - df = dfs[selected_idx] - - # Check if values in table are in tables of reference extraction - refvalues = [] - for dfref in asset_dict[pdf_file]["table_extractors"][ - extractions.index(ref_extraction) - ]["tables"]: - refvalues.extend(dfref.map(reformat_str).to_numpy().flatten()) - mask = df.map(reformat_str).isin(refvalues) - - # Apply font color (green vs red) based on above check - def color_mask(val: bool) -> None: - return f'color: {"green" if val is True else "red"}' - - dfst = df.style.apply( - lambda c, mask=mask: mask[c.name].apply(color_mask), - ) - - # Display table with appropriate font color - column_config = {} - for col in df.columns: - column_config[col] = st.column_config.Column(width="small") - - st.dataframe( - dfst, - column_config=column_config, - use_container_width=False, - height=round(35.5 * (len(dfst.index) + 1)), - ) - - # Tab to display PDF - with tabs[-1]: - - if not ss.pdf_downloaded: - ss["pdf_downloaded"] = hf_hub_download( - repo_id="DataForGood/taxobservatory_data", - filename=f"pdf/{ss.pdf_selected}", - repo_type="dataset", - ) - - if ss.pdf_downloaded: - # Get pages to render - assets = {} - pagefilter.FromFilename()(ss.pdf_downloaded, assets=assets) - pages_to_render = [ - page + 1 for page in assets["pagefilter"]["selected_pages"] - ] - - # Render pages from PDF - pdf_viewer( - input=ss.pdf_downloaded, - pages_to_render=pages_to_render, - width=1000, - ) + ) + selected_idx = dfs_str.index(selected) if selected in dfs_str else 0 + + # Display table + df = dfs[selected_idx] + + # Fill any empty headers to prevent error when calling st.dataframe() + fill_df_empty_headers(df) + + # Check if values in table are in tables of reference extraction + refvalues = [] + for dfref in asset_dict[pdf_file]["table_extractors"][ + extractions.index(ref_extraction) + ]["tables"]: + refvalues.extend(dfref.map(reformat_str).to_numpy().flatten()) + mask = df.map(reformat_str).isin(refvalues) + + # Apply font color (green vs red) based on above check + def color_mask(val: bool) -> None: + return f'color: {"green" if val is True else "red"}' + + dfst = df.style.apply( + lambda c, mask=mask: mask[c.name].apply(color_mask), + ) + + # Display table with appropriate font color + column_config = {} + for col in df.columns: + column_config[col] = st.column_config.Column(width="small") + + try: + st.dataframe( + dfst, + column_config=column_config, + use_container_width=False, + height=round(35.5 * (len(dfst.index) + 1)), + ) + except Exception as error: + st.error(error) if __name__ == "__main__": diff --git a/eval/utils.py b/eval/utils.py index cd15d2f..66866d5 100644 --- a/eval/utils.py +++ b/eval/utils.py @@ -24,6 +24,8 @@ import contextlib import re +import pandas as pd + def append_count_to_duplicates(strings: list[str]) -> list[str]: """Append count to duplicate strings in array""" @@ -52,3 +54,14 @@ def reformat_str(el: any) -> str: Output string.""" el = convert_to_str(el).replace(",", "") return re.sub(r"\((\d+)\)", r"-\1", el) + + +def fill_df_empty_headers(df: pd) -> str: + if df.columns.duplicated().sum() > 0: + cols = pd.Series(df.columns) + for dup in set(df.columns[df.columns.duplicated()]): + if dup == "": + cols[df.columns.get_loc(dup)] = [ + "COL" + str(idx) for idx, dup in enumerate(df.columns.get_loc(dup)) + ] + df.columns = cols