Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Eval app updates #83

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 11 additions & 14 deletions configs/eval_table_extraction.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,23 @@ pagefilter:
type: FromFilename

table_extraction:
# - type: Camelot
# params:
# flavor: stream
# - type: Camelot
# params:
# flavor: lattice
- type: Unstructured
- type: FromCSV
params:
hi_res_model_name: "yolox"
- type: Unstructured
csv_directory: "data/extractions/extracttable"
- type: LLamaParse
- type: UnstructuredAPI
params:
hi_res_model_name: "yolox"
pdf_image_dpi: 300
- type: UnstructuredAPI
params:
hi_res_model_name: "detectron2_onnx"
- type: Unstructured
params:
hi_res_model_name: "yolox"
pdf_image_dpi: 500
- type: UnstructuredAPI
pdf_image_dpi: 400
- type: Unstructured
params:
hi_res_model_name: "yolox"
- type: LLamaParse
hi_res_model_name: "detectron2_onnx"
pdf_image_dpi: 400

# table_cleaning:
19 changes: 14 additions & 5 deletions country_by_country/table_extraction/unstructured_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,11 +71,20 @@ def __call__(self, pdf_filepath: str) -> dict:
except Exception as e:
print(e)
else:
tables_list = [
pd.read_html(StringIO(el["metadata"]["text_as_html"]))[0]
for el in resp.elements
if el["type"] == "Table"
]
tables_list = []
for el in resp.elements:
if el["type"] == "Table":
# Enclose in try block to ignore case where pandas can't read the table (html incorrectly formatted)
try:
table = pd.read_html(StringIO(el["metadata"]["text_as_html"]))[
0
]
except Exception:
logging.info(
"Html table discarded. Pandas couldn't read the table.",
)
else:
tables_list.append(table)

# Create asset
new_asset = {
Expand Down
244 changes: 133 additions & 111 deletions eval/eval_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# SOFTWARE.

# Standard imports
import base64
import json
import pickle
import sys
Expand All @@ -32,19 +33,23 @@
from huggingface_hub import hf_hub_download
from streamlit import session_state as ss
from streamlit_option_menu import option_menu
from streamlit_pdf_viewer import pdf_viewer
from utils import append_count_to_duplicates, convert_to_str, reformat_str

from country_by_country import pagefilter
from country_by_country.utils.utils import keep_pages


# Callbacks
def on_pdf_selected() -> None:
ss["pdf_downloaded"] = hf_hub_download(
repo_id="DataForGood/taxobservatory_data",
filename=ss.pdf_selected,
repo_type="dataset",
)
def download_pdf() -> None:
pdf_file = ss.pdf_selected.replace("'", "_")
try:
ss.pdf_downloaded = hf_hub_download(
repo_id="DataForGood/taxobservatory_data",
filename=f"pdf/{ss.pdf_selected}",
repo_type="dataset",
)
except Exception:
st.error("Couldn't download PDF: " + pdf_file)


def on_table_selected(key: str) -> None:
Expand Down Expand Up @@ -73,25 +78,24 @@ def main(ref_data_file: str = None) -> None:
ss.ref_uploaded = None

# Display title
st.title("Table extraction benchmark")

st.markdown(
"""Once you drag and drop a pickle file, you can select a PDF to display
its tables\nand visually compare the results. The cells in the tables
are colored :green[green] if they are present in the tables of the
reference extraction, and :red[red] otherwise. Note the color only
st.title(
"Table extraction benchmark",
help="""Drag and drop a pickle file containg evaluation results, select a PDF to see
the corresponding extracted tables and start comparing.
Cells in the tables are colored **:green[in green]** if they are present in the tables of the
reference extraction, and **:red[red]** otherwise. Note that the color only
indicates if one extracted value is present in the reference
extraction, not if that value is at the right location in the table.""",
extraction, not if that value is at the right location in the table. Change the reference extraction via the select box in the left sidebar.""",
)

# Display sidebar
pdf_file = None
with st.sidebar:
# Select pickle containing results
uploaded_file = st.file_uploader(
"Select a pickle file to load results",
"Select a pickle file to load evaluations results",
type="pkl",
help="Run eval_table_extraction.py to generate a picke file.",
help="Run _eval_table_extraction.py_ to generate a picke file containing extracted tables for multiple PDFs",
)

if uploaded_file:
Expand All @@ -104,9 +108,10 @@ def main(ref_data_file: str = None) -> None:
# Select PDF to load results
pdf_file = st.selectbox(
"Select a PDF file",
asset_dict.keys(),
on_change=on_pdf_selected,
sorted(asset_dict.keys()),
on_change=download_pdf,
key="pdf_selected",
help="The corresponding extracted tables will be displayed (both REF and extractions from the picke file)",
)

# Pull the extractions applied to the PDF
Expand All @@ -115,7 +120,7 @@ def main(ref_data_file: str = None) -> None:


def process_pdf(pdf_file: str, asset_dict: dict) -> None:
# Append REF data matching the PDF to our assets
# Append REF data to extractions in assets
if ss.ref_uploaded is not None:
company = pdf_file.split("_")[0]
year = pdf_file.split("_")[1]
Expand All @@ -126,20 +131,19 @@ def process_pdf(pdf_file: str, asset_dict: dict) -> None:
.reset_index(drop=True)
.dropna(axis="columns", how="all")
)
asset_dict[pdf_file]["table_extractors"].append(
asset_dict[pdf_file]["table_extractors"].insert(
0,
{
"type": "REF",
"params": {"src_file": ref_data_file},
"tables": [ref_df],
},
)

# Pull the extractions from the assets
extractions = [
extractor["type"] for extractor in asset_dict[pdf_file]["table_extractors"]
]
extractions = append_count_to_duplicates(extractions)
extractions.append("PDF")
# Pull extractions from the assets including REF data
extractions = append_count_to_duplicates(
[extractor["type"] for extractor in asset_dict[pdf_file]["table_extractors"]],
)

# Select reference extraction for comparison (default to REF data)
with st.sidebar:
Expand All @@ -148,17 +152,42 @@ def process_pdf(pdf_file: str, asset_dict: dict) -> None:
except Exception:
ref_idx = 0
ref_extraction = st.selectbox(
"Select ref extraction for comparison",
extractions[:-1],
"Select reference extraction for comparison",
extractions,
index=ref_idx,
)
if ref_extraction is not None:
ss.ref_extraction = ref_extraction

# Display tabs (one per extraction + one to display PDF)
tabs = st.tabs(extractions)
# Display tabs (one to display PDF + one per extraction)
tabs = st.tabs(["PDF", *extractions])

# Tab to display PDF
with tabs[0]:
if not ss.pdf_downloaded:
download_pdf()

for idx, tab in enumerate(tabs[:-1]):
if ss.pdf_downloaded:
# Get pages to render
assets = {}
pagefilter.FromFilename()(ss.pdf_downloaded, assets=assets)
pages_to_render = list(assets["pagefilter"]["selected_pages"])

# Filter pages from PDF
pdf_fitered = keep_pages(ss.pdf_downloaded, pages_to_render)

# Get content of pages
with open(pdf_fitered, "rb") as f:
base64_pdf = base64.b64encode(f.read()).decode("utf-8")

# Embed content in HTML
pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="800" height="1000" type="application/pdf"></iframe>'

# Display content
st.markdown(pdf_display, unsafe_allow_html=True)

# Tabs to display extractions
for idx, tab in enumerate(tabs[1:]):
with tab:
# Display parameters of the extraction
st.write(
Expand All @@ -171,86 +200,79 @@ def process_pdf(pdf_file: str, asset_dict: dict) -> None:
dfs_str = ["Table " + str(i) for i in range(len(dfs))]

# Select table to display
selected = option_menu(
None,
dfs_str,
menu_icon=None,
icons=None,
manual_select=min(ss.selected_idx, len(dfs_str) - 1),
orientation="horizontal",
key="tab_" + str(idx),
on_change=on_table_selected,
styles={
"container": {
"padding": "0!important",
"margin": "0!important",
"background-color": "#EFF2F6",
},
"nav-item": {
"max-width": "100px",
"color": "black",
"font-size": "14px",
if len(dfs_str) == 0:
st.info("No table extracted.")
else:
selected = option_menu(
None,
dfs_str,
menu_icon=None,
icons=None,
manual_select=min(ss.selected_idx, len(dfs_str) - 1),
orientation="horizontal",
key="tab_" + str(idx),
on_change=on_table_selected,
styles={
"container": {
"padding": "0!important",
"margin": "0!important",
"background-color": "#EFF2F6",
},
"nav-item": {
"max-width": "100px",
"color": "black",
"font-size": "14px",
},
"icon": {"font-size": "0px"},
},
"icon": {"font-size": "0px"},
},
)
selected_idx = dfs_str.index(selected)

# Display table
df = dfs[selected_idx]

# Check if values in table are in tables of reference extraction
refvalues = []
for dfref in asset_dict[pdf_file]["table_extractors"][
extractions.index(ref_extraction)
]["tables"]:
refvalues.extend(dfref.map(reformat_str).to_numpy().flatten())
mask = df.map(reformat_str).isin(refvalues)

# Apply font color (green vs red) based on above check
def color_mask(val: bool) -> None:
return f'color: {"green" if val is True else "red"}'

dfst = df.style.apply(
lambda c, mask=mask: mask[c.name].apply(color_mask),
)

# Display table with appropriate font color
column_config = {}
for col in df.columns:
column_config[col] = st.column_config.Column(width="small")

st.dataframe(
dfst,
column_config=column_config,
use_container_width=False,
height=round(35.5 * (len(dfst.index) + 1)),
)

# Tab to display PDF
with tabs[-1]:

if not ss.pdf_downloaded:
ss["pdf_downloaded"] = hf_hub_download(
repo_id="DataForGood/taxobservatory_data",
filename=ss.pdf_selected,
repo_type="dataset",
)

if ss.pdf_downloaded:
# Get pages to render
assets = {}
pagefilter.FromFilename()(ss.pdf_downloaded, assets=assets)
pages_to_render = [
page + 1 for page in assets["pagefilter"]["selected_pages"]
]

# Render pages from PDF
pdf_viewer(
input=ss.pdf_downloaded,
pages_to_render=pages_to_render,
width=1000,
)
)
selected_idx = dfs_str.index(selected) if selected in dfs_str else 0

# Display table
df = dfs[selected_idx]

# Fill any empty headers to prevent error when calling st.dataframe()
if df.columns.duplicated().sum() > 0:
cols = pd.Series(df.columns)
for dup in set(df.columns[df.columns.duplicated()]):
if dup == "":
st.info(df.columns.get_loc(dup))
cols[df.columns.get_loc(dup)] = [
"COL" + str(idx)
for idx, dup in enumerate(df.columns.get_loc(dup))
]
df.columns = cols

# Check if values in table are in tables of reference extraction
refvalues = []
for dfref in asset_dict[pdf_file]["table_extractors"][
extractions.index(ref_extraction)
]["tables"]:
refvalues.extend(dfref.map(reformat_str).to_numpy().flatten())
mask = df.map(reformat_str).isin(refvalues)

# Apply font color (green vs red) based on above check
def color_mask(val: bool) -> None:
return f'color: {"green" if val is True else "red"}'

dfst = df.style.apply(
lambda c, mask=mask: mask[c.name].apply(color_mask),
)

# Display table with appropriate font color
column_config = {}
for col in df.columns:
column_config[col] = st.column_config.Column(width="small")

try:
st.dataframe(
dfst,
column_config=column_config,
use_container_width=False,
height=round(35.5 * (len(dfst.index) + 1)),
)
except Exception as error:
st.error(error)


if __name__ == "__main__":
Expand Down