Skip to content

Commit

Permalink
Merge pull request #89 from dataforgoodfr/extract_refactor_upload_page
Browse files Browse the repository at this point in the history
Extract refactor upload page
  • Loading branch information
RonanMorgan authored May 6, 2024
2 parents 4e09a76 + 4d5eea3 commit 7f56fbf
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 61 deletions.
19 changes: 5 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,11 @@ To start the streamlit app and use the extractor streamlined version, start it l

`streamlit run app/index.py`

The app comes with page detection and parsers default config but you can change it by providing a yaml file following the config.yaml format below.
The app comes with page detection and parsers default config but you can change it by providing a yaml file following the config.yaml format below.

Below is an example of the pipeline running on one of the reports, parsing the tables with LlamaParse and Unstructured.

[PipelineDemonstration.webm](https://github.com/dataforgoodfr/12_taxobservatory/assets/1128418/f9c64e83-9c15-4de2-a512-4f4b25f2f3ae)


## Running the pipeline from the command line
Expand Down Expand Up @@ -104,20 +107,8 @@ This config file uses:
- LangChain with GPT-4-turbo-preview for querying the parsed tables to extract
and re-order the necessary information

## Running the pipeline with the streamlit app

You can also interact with the pipeline with a streamlit app :

```
streamlit run app/index.py
```

Below is an example of the pipeline running on one of the reports, parsing the tables with LlamaParse and Unstructured.

[PipelineDemonstration.webm](https://github.com/dataforgoodfr/12_taxobservatory/assets/1128418/f9c64e83-9c15-4de2-a512-4f4b25f2f3ae)


# Avaiable blocks
# Available blocks

## Page filter

Expand Down
124 changes: 78 additions & 46 deletions app/pages/0_Import_File.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import copy
from menu import display_pages_menu, display_config
from pypdf import PdfReader
from utils import get_pdf_iframe, set_state
from utils import get_pdf_iframe, set_state, generate_assets

from country_by_country.processor import ReportProcessor

Expand All @@ -18,6 +18,48 @@ def set_page_filter(value: dict):
set_state(["config", "pagefilter"], value)


def initiate_configuration() -> None:
    """Reset the working config from the initial config held in session state.

    Deep-copies ``st.session_state["initial_config"]`` into
    ``st.session_state["config"]``. When the ``pagefilter`` entry is a list,
    its first element becomes the active page filter. The ``type`` of the
    active page filter is then stored under ``selected_page_filter_name``.
    """
    config = copy.deepcopy(st.session_state["initial_config"])
    if isinstance(config["pagefilter"], list):
        # Take the first filter from the deep copy, not from initial_config:
        # otherwise the working config would share that dict with the
        # initial config and in-place edits would leak back into it.
        config["pagefilter"] = config["pagefilter"][0]
    st.session_state["config"] = config
    st.session_state["selected_page_filter_name"] = config["pagefilter"]["type"]


def on_pdf_file_upload() -> None:
    """Callback of the PDF uploader: store the file and prepare the next page.

    Copies the uploaded PDF into the module-level temporary file, records the
    temporary file and the original file name in the session state, generates
    the assets and requests a redirection to the page selection page.
    """
    uploaded_pdf = st.session_state.original_pdf
    if uploaded_pdf is None:
        # The callback also fires when the user clears the uploader; there is
        # nothing to process in that case.
        return

    # Change states related to the pdf file upload
    mytmpfile.write(uploaded_pdf.read())
    # Downstream code reopens the file through its name, so the buffered
    # content must be on disk before anything reads it.
    mytmpfile.flush()
    st.session_state["working_file_pdf"] = mytmpfile
    st.session_state["original_pdf_name"] = uploaded_pdf.name

    # Generate assets
    generate_assets()

    # Only record the target here; the switch itself is done outside of the
    # callback.
    st.session_state["page_redirection"] = "pages/1_Selected_Pages.py"


def on_config_file_upload() -> None:
    """Promote the uploaded config to initial config and rebuild the working config.

    NOTE(review): the value read from ``initial_uploaded_config`` is stored
    as-is, without any parsing or validation here - confirm this is what the
    callers expect. This callback also does not appear to be wired to a
    widget in the visible code - verify.
    """
    uploaded_config = st.session_state["initial_uploaded_config"]
    st.session_state["initial_config"] = uploaded_config
    initiate_configuration()


def on_change_page_filter(name_to_filter_dict: dict) -> None:
    """Callback of the page filter radio button.

    Args:
        name_to_filter_dict: mapping from a page filter name to its config.
    """
    # The radio value is copied under a separate key: this 'buffer' is needed
    # because selectors wipe their key on reload.
    chosen_filter_name = st.session_state["radio_button_filter_selection"]
    st.session_state["selected_page_filter_name"] = chosen_filter_name
    set_page_filter(name_to_filter_dict[chosen_filter_name])


# Honour a redirection requested by a callback.
# st.switch_page is not allowed in a callback function, so callbacks only
# record the target page and the actual switch is performed here.
page_to_redirect_to = st.session_state.get("page_redirection", False)
if page_to_redirect_to:
    # Clear the request first so it is consumed exactly once.
    st.session_state["page_redirection"] = False
    st.switch_page(page_to_redirect_to)

st.set_page_config(layout="wide", page_title="Accueil - upload de PDF")
st.title("Country by Country Tax Reporting analysis")
st.subheader(
Expand All @@ -27,20 +69,34 @@ def set_page_filter(value: dict):

mytmpfile = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)

# State initialization (runs once per session)
if "first_time" not in st.session_state:
    logging.info("State initialization...")

    logging.info("... loading default extract config")
    # Explicit encoding so the YAML file is decoded the same way on every
    # platform instead of depending on the locale default.
    with open("app/extract_config.yaml", encoding="utf-8") as f:
        st.session_state["initial_config"] = yaml.safe_load(f)
    initiate_configuration()

    logging.info("... initializing processor and assets")
    st.session_state["proc"] = ReportProcessor(st.session_state["config"])
    st.session_state["assets"] = {
        "pagefilter": {},
        "table_extractors": [],
    }

    # Mark the initialization as done only once every step above succeeded;
    # otherwise a failure would leave a half-initialized session that is
    # never retried on the next rerun.
    st.session_state["first_time"] = False

with st.sidebar:

st.markdown("# PDF Upload")

st.markdown("## PDF Report to process")
original_pdf = st.file_uploader(
"Upload a pdf document containing financial table : ",
key="original_pdf",
on_change=on_pdf_file_upload,
)

if original_pdf is not None:
mytmpfile.write(original_pdf.read())
st.session_state["working_file_pdf"] = mytmpfile
st.session_state["original_pdf_name"] = original_pdf.name

if "original_pdf_name" in st.session_state:
st.markdown(
"Already loaded file : " + st.session_state["original_pdf_name"],
Expand All @@ -50,7 +106,10 @@ def set_page_filter(value: dict):
# Upload personalized config if required
loaded_config = st.file_uploader(
"Upload a config if the default config doesn't suit you :",
key="initial_uploaded_config",
on_change=initiate_configuration,
)

if loaded_config is not None:
if not loaded_config.name.endswith(".yaml"):
st.error("Please upload a yaml file")
Expand All @@ -69,26 +128,28 @@ def set_page_filter(value: dict):
loaded_config = None

# Extract config
with open("app/extract_config.yaml", "r") as f:
default_config = f.read()

if not st.session_state.get("config_is_set", False):
st.session_state["initial_config"] = yaml.safe_load(default_config)
st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"])
st.session_state["config_is_set"] = True

if bool(loaded_config):
st.session_state["initial_config"] = loaded_config_dict
st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"])
st.session_state["config_is_set"] = True

# Set page filter
page_filter_radio_dict = {
page_filter_name_to_config_mapping = {
pagefilter["type"]: pagefilter
for pagefilter in st.session_state["initial_config"]["pagefilter"]
}
selected_page_filter = st.radio("Page filter", page_filter_radio_dict.keys())
set_page_filter(page_filter_radio_dict[selected_page_filter])
page_filter_list = list(page_filter_name_to_config_mapping.keys())
current_selected_page_filter_index = page_filter_list.index(
st.session_state["selected_page_filter_name"]
)
selected_page_filter_name = st.radio(
"Page filter",
page_filter_list,
index=current_selected_page_filter_index,
on_change=on_change_page_filter,
key="radio_button_filter_selection",
args=(page_filter_name_to_config_mapping,),
)

display_config()

Expand All @@ -103,32 +164,3 @@ def set_page_filter(value: dict):
get_pdf_iframe(st.session_state["working_file_pdf"].name),
unsafe_allow_html=True,
)

if "first_time" not in st.session_state:
st.session_state["first_time"] = False
logging.info("Loading config and pdf")
st.session_state["proc"] = ReportProcessor(st.session_state["config"])

logging.info("Config and pdf loaded")

assets = {
"pagefilter": {},
"table_extractors": [],
}

# Filtering the pages
st.session_state["proc"].page_filter(
st.session_state["working_file_pdf"].name,
assets,
)

logging.info(f"Assets : {assets}")

if len(assets["pagefilter"]["selected_pages"]) == 0:
# No page has been automatically selected by the page filter
# Hence, we display the full pdf, letting the user select the pages
pdfreader = PdfReader(st.session_state["working_file_pdf"])
number_pages = len(PdfReader(st.session_state["working_file_pdf"]).pages)
assets["pagefilter"]["selected_pages"] = list(range(number_pages))
st.session_state["assets"] = assets
st.switch_page("pages/1_Selected_Pages.py")
3 changes: 2 additions & 1 deletion app/pages/1_Selected_Pages.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import streamlit as st
from country_by_country.processor import ReportProcessor
from utils import get_pdf_iframe, set_state
from utils import get_pdf_iframe, set_state, generate_assets
from country_by_country.utils.utils import keep_pages
from pypdf import PdfReader
from menu import display_pages_menu, display_config
Expand Down Expand Up @@ -29,6 +29,7 @@ def set_extractors() -> None:
]
set_state(["config", "table_extraction"], selected_extractors_dict)
st.session_state["proc"] = ReportProcessor(st.session_state["config"])
generate_assets()


st.set_page_config(layout="wide", page_title="Pages selection") # page_icon="📈"
Expand Down
24 changes: 24 additions & 0 deletions app/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import base64
import logging
from pathlib import Path
from typing import Any

import pandas as pd
import streamlit as st
from pypdf import PdfReader


def get_pdf_iframe(pdf_to_process: str) -> str:
Expand Down Expand Up @@ -61,3 +63,25 @@ def set_state(key: Any, value: Any) -> None:
nested_value[key_list[-1]] = value
else:
st.session_state[key] = value


def generate_assets() -> None:
    """Run the page filter on the working PDF and store the resulting assets.

    Reads the processor (``proc``) and the working PDF file
    (``working_file_pdf``) from the session state, and writes the result to
    ``st.session_state["assets"]``.
    """
    processor = st.session_state["proc"]
    working_pdf = st.session_state["working_file_pdf"]

    assets = {"pagefilter": {}, "table_extractors": []}

    # Filtering the pages; the processor receives the assets dict to fill in
    processor.page_filter(working_pdf.name, assets)
    logging.info(f"Assets : {assets}")

    pagefilter_assets = assets["pagefilter"]
    if len(pagefilter_assets["selected_pages"]) == 0:
        # The page filter selected nothing automatically: fall back to every
        # page of the pdf so that the user can pick the pages manually.
        page_count = len(PdfReader(working_pdf).pages)
        pagefilter_assets["selected_pages"] = list(range(page_count))

    st.session_state["assets"] = assets

0 comments on commit 7f56fbf

Please sign in to comment.