Merge pull request #89 from dataforgoodfr/extract_refactor_upload_page

Extract refactor upload page
dataforgoodfr · May 6, 2024 · 7f56fbf · 7f56fbf
2 parents 4e09a76 + 4d5eea3
commit 7f56fbf
Show file tree

Hide file tree

Showing 4 changed files with 109 additions and 61 deletions.
diff --git a/README.md b/README.md
@@ -55,8 +55,11 @@ To start the streamlit app and use the extractor streamlined version, start it l
 
 `streamlit run app/index.py`
 
-The app comes with page detection and parsers default config but you can change it by providing a yaml file following the config.yaml format below. 
+The app comes with a default configuration for page detection and parsers, but you can change it by providing a YAML file that follows the config.yaml format below.
 
+Below is an example of the pipeline running on one of the reports, parsing the tables with LlamaParse and Unstructured.
+
+[PipelineDemonstration.webm](https://github.com/dataforgoodfr/12_taxobservatory/assets/1128418/f9c64e83-9c15-4de2-a512-4f4b25f2f3ae)
 
 
 ## Running the pipeline from the command line
@@ -104,20 +107,8 @@ This config file uses:
 - LangChain with GPT-4-turbo-preview for requesting the parsed tables to extract
   and re-order the necessary informations
 
-## Running the pipeline with the streamlit app
-
-You can also interact with the pipeline with a streamlit app :
-
-```
-streamlit run app/index.py
-```
-
-Below is an example of the pipeline running on one of the reports, parsing the tables with LlamaParse and Unstructured.
-
-[PipelineDemonstration.webm](https://github.com/dataforgoodfr/12_taxobservatory/assets/1128418/f9c64e83-9c15-4de2-a512-4f4b25f2f3ae)
-
 
-# Avaiable blocks
+# Available blocks
 
 ## Page filter
 

diff --git a/app/pages/0_Import_File.py b/app/pages/0_Import_File.py
@@ -7,7 +7,7 @@
 import copy
 from menu import display_pages_menu, display_config
 from pypdf import PdfReader
-from utils import get_pdf_iframe, set_state
+from utils import get_pdf_iframe, set_state, generate_assets
 
 from country_by_country.processor import ReportProcessor
 
@@ -18,6 +18,48 @@ def set_page_filter(value: dict):
     set_state(["config", "pagefilter"], value)
 
 
+def initiate_configuration() -> None:
+    st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"])
+    if isinstance(st.session_state["config"]["pagefilter"], list):
+        st.session_state["config"]["pagefilter"] = st.session_state["initial_config"][
+            "pagefilter"
+        ][0]
+    st.session_state["selected_page_filter_name"] = st.session_state["config"][
+        "pagefilter"
+    ]["type"]
+
+
+def on_pdf_file_upload() -> None:
+    # Change states related to the pdf file upload
+    mytmpfile.write(st.session_state.original_pdf.read())
+    st.session_state["working_file_pdf"] = mytmpfile
+    st.session_state["original_pdf_name"] = st.session_state.original_pdf.name
+
+    # Generate assets
+    generate_assets()
+
+    st.session_state["page_redirection"] = "pages/1_Selected_Pages.py"
+
+
+def on_config_file_upload() -> None:
+    st.session_state["initial_config"] = st.session_state["initial_uploaded_config"]
+    initiate_configuration()
+
+
+def on_change_page_filter(name_to_filter_dict: dict) -> None:
+    st.session_state["selected_page_filter_name"] = st.session_state[
+        "radio_button_filter_selection"
+    ]  # this 'buffer' is needed because selectors wipe their key on reload
+    set_page_filter(name_to_filter_dict[st.session_state["selected_page_filter_name"]])
+
+
+# Check if a redirection was requested
+# Workaround because st.switch_page is not allowed in a callback function
+if st.session_state.get("page_redirection", False):
+    page_to_redirect_to = st.session_state["page_redirection"]
+    st.session_state["page_redirection"] = False
+    st.switch_page(page_to_redirect_to)
+
 st.set_page_config(layout="wide", page_title="Accueil - upload de PDF")
 st.title("Country by Country Tax Reporting analysis")
 st.subheader(
@@ -27,20 +69,34 @@ def set_page_filter(value: dict):
 
 mytmpfile = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
 
+# State initialization
+if "first_time" not in st.session_state:
+    logging.info("State initialization...")
+    st.session_state["first_time"] = False
+
+    logging.info("... loading default extract config")
+    with open("app/extract_config.yaml", "r") as f:
+        st.session_state["initial_config"] = yaml.safe_load(f.read())
+    initiate_configuration()
+
+    logging.info("... initializing processor and assets")
+    st.session_state["proc"] = ReportProcessor(st.session_state["config"])
+    st.session_state["assets"] = {
+        "pagefilter": {},
+        "table_extractors": [],
+    }
+
 with st.sidebar:
 
     st.markdown("# PDF Upload")
 
     st.markdown("## PDF Report to process")
     original_pdf = st.file_uploader(
         "Upload a pdf document containing financial table : ",
+        key="original_pdf",
+        on_change=on_pdf_file_upload,
     )
 
-    if original_pdf is not None:
-        mytmpfile.write(original_pdf.read())
-        st.session_state["working_file_pdf"] = mytmpfile
-        st.session_state["original_pdf_name"] = original_pdf.name
-
     if "original_pdf_name" in st.session_state:
         st.markdown(
             "Already loaded file : " + st.session_state["original_pdf_name"],
@@ -50,7 +106,10 @@ def set_page_filter(value: dict):
     # Upload personalized config if required
     loaded_config = st.file_uploader(
         "Upload a config if the default config doesn't suit you :",
+        key="initial_uploaded_config",
+        on_change=initiate_configuration,
     )
+
     if loaded_config is not None:
         if not loaded_config.name.endswith(".yaml"):
             st.error("Please upload a yaml file")
@@ -69,26 +128,28 @@ def set_page_filter(value: dict):
             loaded_config = None
 
     # Extract config
-    with open("app/extract_config.yaml", "r") as f:
-        default_config = f.read()
-
-    if not st.session_state.get("config_is_set", False):
-        st.session_state["initial_config"] = yaml.safe_load(default_config)
-        st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"])
-        st.session_state["config_is_set"] = True
 
     if bool(loaded_config):
         st.session_state["initial_config"] = loaded_config_dict
         st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"])
-        st.session_state["config_is_set"] = True
 
     # Set page filter
-    page_filter_radio_dict = {
+    page_filter_name_to_config_mapping = {
         pagefilter["type"]: pagefilter
         for pagefilter in st.session_state["initial_config"]["pagefilter"]
     }
-    selected_page_filter = st.radio("Page filter", page_filter_radio_dict.keys())
-    set_page_filter(page_filter_radio_dict[selected_page_filter])
+    page_filter_list = list(page_filter_name_to_config_mapping.keys())
+    current_selected_page_filter_index = page_filter_list.index(
+        st.session_state["selected_page_filter_name"]
+    )
+    selected_page_filter_name = st.radio(
+        "Page filter",
+        page_filter_list,
+        index=current_selected_page_filter_index,
+        on_change=on_change_page_filter,
+        key="radio_button_filter_selection",
+        args=(page_filter_name_to_config_mapping,),
+    )
 
     display_config()
 
@@ -103,32 +164,3 @@ def set_page_filter(value: dict):
         get_pdf_iframe(st.session_state["working_file_pdf"].name),
         unsafe_allow_html=True,
     )
-
-    if "first_time" not in st.session_state:
-        st.session_state["first_time"] = False
-        logging.info("Loading config and pdf")
-        st.session_state["proc"] = ReportProcessor(st.session_state["config"])
-
-        logging.info("Config and pdf loaded")
-
-        assets = {
-            "pagefilter": {},
-            "table_extractors": [],
-        }
-
-        # Filtering the pages
-        st.session_state["proc"].page_filter(
-            st.session_state["working_file_pdf"].name,
-            assets,
-        )
-
-        logging.info(f"Assets : {assets}")
-
-        if len(assets["pagefilter"]["selected_pages"]) == 0:
-            # No page has been automatically selected by the page filter
-            # Hence, we display the full pdf, letting the user select the pages
-            pdfreader = PdfReader(st.session_state["working_file_pdf"])
-            number_pages = len(PdfReader(st.session_state["working_file_pdf"]).pages)
-            assets["pagefilter"]["selected_pages"] = list(range(number_pages))
-        st.session_state["assets"] = assets
-        st.switch_page("pages/1_Selected_Pages.py")
diff --git a/app/pages/1_Selected_Pages.py b/app/pages/1_Selected_Pages.py
@@ -1,6 +1,6 @@
 import streamlit as st
 from country_by_country.processor import ReportProcessor
-from utils import get_pdf_iframe, set_state
+from utils import get_pdf_iframe, set_state, generate_assets
 from country_by_country.utils.utils import keep_pages
 from pypdf import PdfReader
 from menu import display_pages_menu, display_config
@@ -29,6 +29,7 @@ def set_extractors() -> None:
     ]
     set_state(["config", "table_extraction"], selected_extractors_dict)
     st.session_state["proc"] = ReportProcessor(st.session_state["config"])
+    generate_assets()
 
 
 st.set_page_config(layout="wide", page_title="Pages selection")  # page_icon="📈"

diff --git a/app/utils.py b/app/utils.py
@@ -1,9 +1,11 @@
 import base64
+import logging
 from pathlib import Path
 from typing import Any
 
 import pandas as pd
 import streamlit as st
+from pypdf import PdfReader
 
 
 def get_pdf_iframe(pdf_to_process: str) -> str:
@@ -61,3 +63,25 @@ def set_state(key: Any, value: Any) -> None:
         nested_value[key_list[-1]] = value
     else:
         st.session_state[key] = value
+
+
+def generate_assets() -> None:
+    assets = {
+        "pagefilter": {},
+        "table_extractors": [],
+    }
+
+    # Filtering the pages
+    st.session_state["proc"].page_filter(
+        st.session_state["working_file_pdf"].name,
+        assets,
+    )
+
+    logging.info(f"Assets : {assets}")
+
+    if len(assets["pagefilter"]["selected_pages"]) == 0:
+        # No page has been automatically selected by the page filter
+        # Hence, we display the full pdf, letting the user select the pages
+        number_pages = len(PdfReader(st.session_state["working_file_pdf"]).pages)
+        assets["pagefilter"]["selected_pages"] = list(range(number_pages))
+    st.session_state["assets"] = assets