From df076048e831493f43f86f87c023c1c2b805db4c Mon Sep 17 00:00:00 2001 From: Quentin Jarry Date: Thu, 2 May 2024 13:58:50 +0200 Subject: [PATCH 01/12] fix & feat refacto import_file --- app/pages/0_Import_File.py | 135 +++++++++++++++++++++++++------------ 1 file changed, 91 insertions(+), 44 deletions(-) diff --git a/app/pages/0_Import_File.py b/app/pages/0_Import_File.py index 735eff9..7be8812 100644 --- a/app/pages/0_Import_File.py +++ b/app/pages/0_Import_File.py @@ -18,6 +18,61 @@ def set_page_filter(value: dict): set_state(["config", "pagefilter"], value) +def initiate_configuration() -> None: + st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"]) + if isinstance(st.session_state["config"]["pagefilter"], list): + st.session_state["config"]["pagefilter"] = st.session_state["initial_config"][ + "pagefilter" + ][0] + st.session_state["selected_page_filter_name"] = st.session_state["config"][ + "pagefilter" + ]["type"] + + +def generate_assets() -> None: + assets = { + "pagefilter": {}, + "table_extractors": [], + } + + # Filtering the pages + st.session_state["proc"].page_filter( + st.session_state["working_file_pdf"].name, + assets, + ) + + logging.info(f"Assets : {assets}") + + if len(assets["pagefilter"]["selected_pages"]) == 0: + # No page has been automatically selected by the page filter + # Hence, we display the full pdf, letting the user select the pages + pdfreader = PdfReader(st.session_state["working_file_pdf"]) + number_pages = len(PdfReader(st.session_state["working_file_pdf"]).pages) + assets["pagefilter"]["selected_pages"] = list(range(number_pages)) + st.session_state["assets"] = assets + + +def on_pdf_file_upload() -> None: + # Change states related to the pdf file upload + mytmpfile.write(st.session_state.original_pdf.read()) + st.session_state["working_file_pdf"] = mytmpfile + st.session_state["original_pdf_name"] = st.session_state.original_pdf.name + + # Generate assets + generate_assets() + + st.switch_page("pages/1_Selected_Pages.py") + + +def on_config_file_upload() -> None: + st.session_state["initial_config"] = st.session_state["initial_uploaded_config"] + initiate_configuration() + + +def on_change_page_filter(name_to_filter_dict: dict) -> None: + set_page_filter(name_to_filter_dict[st.session_state["selected_page_filter_name"]]) + + st.set_page_config(layout="wide", page_title="Accueil - upload de PDF") st.title("Country by Country Tax Reporting analysis") st.subheader( @@ -27,6 +82,23 @@ def set_page_filter(value: dict): mytmpfile = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) +# State initialization +if "first_time" not in st.session_state: + logging.info("State initialization...") + st.session_state["first_time"] = False + + logging.info("... loading default extract config") + with open("app/extract_config.yaml", "r") as f: + st.session_state["initial_config"] = yaml.safe_load(f.read()) + initiate_configuration() + + logging.info("... initializing processor and assets") + st.session_state["proc"] = ReportProcessor(st.session_state["config"]) + st.session_state["assets"] = { + "pagefilter": {}, + "table_extractors": [], + } + with st.sidebar: st.markdown("# PDF Upload") @@ -34,13 +106,10 @@ def set_page_filter(value: dict): st.markdown("## PDF Report to process") original_pdf = st.file_uploader( "Upload a pdf document containing financial table : ", + key="original_pdf", + on_change=on_pdf_file_upload, ) - if original_pdf is not None: - mytmpfile.write(original_pdf.read()) - st.session_state["working_file_pdf"] = mytmpfile - st.session_state["original_pdf_name"] = original_pdf.name - if "original_pdf_name" in st.session_state: st.markdown( "Already loaded file : " + st.session_state["original_pdf_name"], @@ -50,6 +119,8 @@ def set_page_filter(value: dict): # Upload personalized config if required loaded_config = st.file_uploader( "Upload a config if the default config doesn't suit you :", + key="initial_uploaded_config", + on_change=initiate_configuration, ) if loaded_config is not None: if not loaded_config.name.endswith(".yaml"): @@ -69,26 +140,28 @@ def set_page_filter(value: dict): loaded_config = None # Extract config - with open("app/extract_config.yaml", "r") as f: - default_config = f.read() - - if not st.session_state.get("config_is_set", False): - st.session_state["initial_config"] = yaml.safe_load(default_config) - st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"]) - st.session_state["config_is_set"] = True if bool(loaded_config): st.session_state["initial_config"] = loaded_config_dict st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"]) - st.session_state["config_is_set"] = True # Set page filter - page_filter_radio_dict = { + page_filter_name_to_config_mapping = { pagefilter["type"]: pagefilter for pagefilter in st.session_state["initial_config"]["pagefilter"] } - selected_page_filter = st.radio("Page filter", page_filter_radio_dict.keys()) - set_page_filter(page_filter_radio_dict[selected_page_filter]) + page_filter_list = list(page_filter_name_to_config_mapping.keys()) + current_selected_page_filter_index = page_filter_list.index( + st.session_state["selected_page_filter_name"] + ) + selected_page_filter_name = st.radio( + "Page filter", + page_filter_list, + index=current_selected_page_filter_index, + on_change=on_change_page_filter, + key="selected_page_filter_name", + args=(page_filter_name_to_config_mapping,), + ) display_config() @@ -104,31 +177,5 @@ def set_page_filter(value: dict): unsafe_allow_html=True, ) - if "first_time" not in st.session_state: - st.session_state["first_time"] = False - logging.info("Loading config and pdf") - st.session_state["proc"] = ReportProcessor(st.session_state["config"]) - - logging.info("Config and pdf loaded") - - assets = { - "pagefilter": {}, - "table_extractors": [], - } - - # Filtering the pages - st.session_state["proc"].page_filter( - st.session_state["working_file_pdf"].name, - assets, - ) - - logging.info(f"Assets : {assets}") - - if len(assets["pagefilter"]["selected_pages"]) == 0: - # No page has been automatically selected by the page filter - # Hence, we display the full pdf, letting the user select the pages - pdfreader = PdfReader(st.session_state["working_file_pdf"]) - number_pages = len(PdfReader(st.session_state["working_file_pdf"]).pages) - assets["pagefilter"]["selected_pages"] = list(range(number_pages)) - st.session_state["assets"] = assets - st.switch_page("pages/1_Selected_Pages.py") +# DEBUG +st.write(st.session_state["proc"]) From eeeb69c70a6d9b49c276968ee42a97e7e0be4890 Mon Sep 17 00:00:00 2001 From: Quentin Jarry Date: Thu, 2 May 2024 13:59:22 +0200 Subject: [PATCH 02/12] fix readme tremalit part --- README.md | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 39103bb..edef8c7 100644 --- a/README.md +++ b/README.md @@ -55,8 +55,11 @@ To start the streamlit app and use the extractor streamlined version, start it l `streamlit run app/index.py` -The app comes with page detection and parsers default config but you can change it by providing a yaml file following the config.yaml format below. +The app comes with page detection and parsers default config but you can change it by providing a yaml file following the config.yaml format below. +Below is an example of the pipeline running on one of the reports, parsing the tables with LlamaParse and Unstructured. + +[PipelineDemonstration.webm](https://github.com/dataforgoodfr/12_taxobservatory/assets/1128418/f9c64e83-9c15-4de2-a512-4f4b25f2f3ae) ## Running the pipeline from the command line @@ -104,20 +107,8 @@ This config file uses: - LangChain with GPT-4-turbo-preview for requesting the parsed tables to extract and re-order the necessary informations -## Running the pipeline with the streamlit app - -You can also interact with the pipeline with a streamlit app : - -``` -streamlit run app/index.py -``` - -Below is an example of the pipeline running on one of the reports, parsing the tables with LlamaParse and Unstructured. - -[PipelineDemonstration.webm](https://github.com/dataforgoodfr/12_taxobservatory/assets/1128418/f9c64e83-9c15-4de2-a512-4f4b25f2f3ae) - -# Avaiable blocks +# Available blocks ## Page filter From 35d4899a311d838938eebcdb7c9583c7f7f1b4a4 Mon Sep 17 00:00:00 2001 From: Quentin Jarry Date: Thu, 2 May 2024 14:40:08 +0200 Subject: [PATCH 03/12] fix remove debug msg --- app/pages/0_Import_File.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/app/pages/0_Import_File.py b/app/pages/0_Import_File.py index 7be8812..e871e44 100644 --- a/app/pages/0_Import_File.py +++ b/app/pages/0_Import_File.py @@ -176,6 +176,3 @@ def on_change_page_filter(name_to_filter_dict: dict) -> None: get_pdf_iframe(st.session_state["working_file_pdf"].name), unsafe_allow_html=True, ) - -# DEBUG -st.write(st.session_state["proc"]) From 7bb07f4f5daa438edf5350d964e9f4adffd3f1ef Mon Sep 17 00:00:00 2001 From: Quentin Jarry Date: Thu, 2 May 2024 16:17:37 +0200 Subject: [PATCH 04/12] fix callback switch page + selector wiping their key state --- app/pages/0_Import_File.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/app/pages/0_Import_File.py b/app/pages/0_Import_File.py index e871e44..4f9f860 100644 --- a/app/pages/0_Import_File.py +++ b/app/pages/0_Import_File.py @@ -17,7 +17,6 @@ def set_page_filter(value: dict): set_state(["config", "pagefilter"], value) - def initiate_configuration() -> None: st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"]) if isinstance(st.session_state["config"]["pagefilter"], list): @@ -27,7 +26,7 @@ def initiate_configuration() -> None: st.session_state["selected_page_filter_name"] = st.session_state["config"][ "pagefilter" ]["type"] - + #debug def generate_assets() -> None: assets = { @@ -51,7 +50,6 @@ def generate_assets() -> None: assets["pagefilter"]["selected_pages"] = list(range(number_pages)) st.session_state["assets"] = assets - def on_pdf_file_upload() -> None: # Change states related to the pdf file upload mytmpfile.write(st.session_state.original_pdf.read()) @@ -61,7 +59,8 @@ def on_pdf_file_upload() -> None: # Generate assets generate_assets() - st.switch_page("pages/1_Selected_Pages.py") + st.session_state["page_redirection"] = "pages/1_Selected_Pages.py" + #st.switch_page("pages/1_Selected_Pages.py") def on_config_file_upload() -> None: @@ -70,8 +69,15 @@ def on_config_file_upload() -> None: def on_change_page_filter(name_to_filter_dict: dict) -> None: + st.session_state["selected_page_filter_name"] = st.session_state["radio_button_filter_selection"] #this 'buffer' is needed because selectors wipe their key on reload set_page_filter(name_to_filter_dict[st.session_state["selected_page_filter_name"]]) +# Check if a redirection was requested +# Workaround because st.switch_page is not allowed in a callback function +if st.session_state.get("page_redirection", False): + page_to_redirect_to = st.session_state["page_redirection"] + st.session_state["page_redirection"] = False + st.switch_page(page_to_redirect_to) st.set_page_config(layout="wide", page_title="Accueil - upload de PDF") st.title("Country by Country Tax Reporting analysis") @@ -122,6 +128,7 @@ def on_change_page_filter(name_to_filter_dict: dict) -> None: key="initial_uploaded_config", on_change=initiate_configuration, ) + if loaded_config is not None: if not loaded_config.name.endswith(".yaml"): st.error("Please upload a yaml file") @@ -159,7 +166,7 @@ def on_change_page_filter(name_to_filter_dict: dict) -> None: page_filter_list, index=current_selected_page_filter_index, on_change=on_change_page_filter, - key="selected_page_filter_name", + key="radio_button_filter_selection", args=(page_filter_name_to_config_mapping,), ) @@ -175,4 +182,4 @@ def on_change_page_filter(name_to_filter_dict: dict) -> None: st.markdown( get_pdf_iframe(st.session_state["working_file_pdf"].name), unsafe_allow_html=True, - ) + ) \ No newline at end of file From 28603fd9fa62e03fbfb4a3fc459688e7da5c25bc Mon Sep 17 00:00:00 2001 From: Quentin Jarry Date: Sun, 5 May 2024 09:24:40 +0200 Subject: [PATCH 05/12] clean debug messages --- app/pages/0_Import_File.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/app/pages/0_Import_File.py b/app/pages/0_Import_File.py index 4f9f860..60fc6cf 100644 --- a/app/pages/0_Import_File.py +++ b/app/pages/0_Import_File.py @@ -26,7 +26,6 @@ def initiate_configuration() -> None: st.session_state["selected_page_filter_name"] = st.session_state["config"][ "pagefilter" ]["type"] - #debug def generate_assets() -> None: assets = { @@ -60,7 +59,6 @@ def on_pdf_file_upload() -> None: generate_assets() st.session_state["page_redirection"] = "pages/1_Selected_Pages.py" - #st.switch_page("pages/1_Selected_Pages.py") def on_config_file_upload() -> None: From f0c667ed8fa8dc7c99fdc456c6f0ecb2763335d6 Mon Sep 17 00:00:00 2001 From: Quentin Jarry Date: Thu, 2 May 2024 13:58:50 +0200 Subject: [PATCH 06/12] fix & feat refacto import_file --- app/pages/0_Import_File.py | 135 +++++++++++++++++++++++++------------ 1 file changed, 91 insertions(+), 44 deletions(-) diff --git a/app/pages/0_Import_File.py b/app/pages/0_Import_File.py index 735eff9..7be8812 100644 --- a/app/pages/0_Import_File.py +++ b/app/pages/0_Import_File.py @@ -18,6 +18,61 @@ def set_page_filter(value: dict): set_state(["config", "pagefilter"], value) +def initiate_configuration() -> None: + st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"]) + if isinstance(st.session_state["config"]["pagefilter"], list): + st.session_state["config"]["pagefilter"] = st.session_state["initial_config"][ + "pagefilter" + ][0] + st.session_state["selected_page_filter_name"] = st.session_state["config"][ + "pagefilter" + ]["type"] + + +def generate_assets() -> None: + assets = { + "pagefilter": {}, + "table_extractors": [], + } + + # Filtering the pages + st.session_state["proc"].page_filter( + st.session_state["working_file_pdf"].name, + assets, + ) + + logging.info(f"Assets : {assets}") + + if len(assets["pagefilter"]["selected_pages"]) == 0: + # No page has been automatically selected by the page filter + # Hence, we display the full pdf, letting the user select the pages + pdfreader = PdfReader(st.session_state["working_file_pdf"]) + number_pages = len(PdfReader(st.session_state["working_file_pdf"]).pages) + assets["pagefilter"]["selected_pages"] = list(range(number_pages)) + st.session_state["assets"] = assets + + +def on_pdf_file_upload() -> None: + # Change states related to the pdf file upload + mytmpfile.write(st.session_state.original_pdf.read()) + st.session_state["working_file_pdf"] = mytmpfile + st.session_state["original_pdf_name"] = st.session_state.original_pdf.name + + # Generate assets + generate_assets() + + st.switch_page("pages/1_Selected_Pages.py") + + +def on_config_file_upload() -> None: + st.session_state["initial_config"] = st.session_state["initial_uploaded_config"] + initiate_configuration() + + +def on_change_page_filter(name_to_filter_dict: dict) -> None: + set_page_filter(name_to_filter_dict[st.session_state["selected_page_filter_name"]]) + + st.set_page_config(layout="wide", page_title="Accueil - upload de PDF") st.title("Country by Country Tax Reporting analysis") st.subheader( @@ -27,6 +82,23 @@ def set_page_filter(value: dict): mytmpfile = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) +# State initialization +if "first_time" not in st.session_state: + logging.info("State initialization...") + st.session_state["first_time"] = False + + logging.info("... loading default extract config") + with open("app/extract_config.yaml", "r") as f: + st.session_state["initial_config"] = yaml.safe_load(f.read()) + initiate_configuration() + + logging.info("... initializing processor and assets") + st.session_state["proc"] = ReportProcessor(st.session_state["config"]) + st.session_state["assets"] = { + "pagefilter": {}, + "table_extractors": [], + } + with st.sidebar: st.markdown("# PDF Upload") @@ -34,13 +106,10 @@ def set_page_filter(value: dict): st.markdown("## PDF Report to process") original_pdf = st.file_uploader( "Upload a pdf document containing financial table : ", + key="original_pdf", + on_change=on_pdf_file_upload, ) - if original_pdf is not None: - mytmpfile.write(original_pdf.read()) - st.session_state["working_file_pdf"] = mytmpfile - st.session_state["original_pdf_name"] = original_pdf.name - if "original_pdf_name" in st.session_state: st.markdown( "Already loaded file : " + st.session_state["original_pdf_name"], @@ -50,6 +119,8 @@ def set_page_filter(value: dict): # Upload personalized config if required loaded_config = st.file_uploader( "Upload a config if the default config doesn't suit you :", + key="initial_uploaded_config", + on_change=initiate_configuration, ) if loaded_config is not None: if not loaded_config.name.endswith(".yaml"): @@ -69,26 +140,28 @@ def set_page_filter(value: dict): loaded_config = None # Extract config - with open("app/extract_config.yaml", "r") as f: - default_config = f.read() - - if not st.session_state.get("config_is_set", False): - st.session_state["initial_config"] = yaml.safe_load(default_config) - st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"]) - st.session_state["config_is_set"] = True if bool(loaded_config): st.session_state["initial_config"] = loaded_config_dict st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"]) - st.session_state["config_is_set"] = True # Set page filter - page_filter_radio_dict = { + page_filter_name_to_config_mapping = { pagefilter["type"]: pagefilter for pagefilter in st.session_state["initial_config"]["pagefilter"] } - selected_page_filter = st.radio("Page filter", page_filter_radio_dict.keys()) - set_page_filter(page_filter_radio_dict[selected_page_filter]) + page_filter_list = list(page_filter_name_to_config_mapping.keys()) + current_selected_page_filter_index = page_filter_list.index( + st.session_state["selected_page_filter_name"] + ) + selected_page_filter_name = st.radio( + "Page filter", + page_filter_list, + index=current_selected_page_filter_index, + on_change=on_change_page_filter, + key="selected_page_filter_name", + args=(page_filter_name_to_config_mapping,), + ) display_config() @@ -104,31 +177,5 @@ def set_page_filter(value: dict): unsafe_allow_html=True, ) - if "first_time" not in st.session_state: - st.session_state["first_time"] = False - logging.info("Loading config and pdf") - st.session_state["proc"] = ReportProcessor(st.session_state["config"]) - - logging.info("Config and pdf loaded") - - assets = { - "pagefilter": {}, - "table_extractors": [], - } - - # Filtering the pages - st.session_state["proc"].page_filter( - st.session_state["working_file_pdf"].name, - assets, - ) - - logging.info(f"Assets : {assets}") - - if len(assets["pagefilter"]["selected_pages"]) == 0: - # No page has been automatically selected by the page filter - # Hence, we display the full pdf, letting the user select the pages - pdfreader = PdfReader(st.session_state["working_file_pdf"]) - number_pages = len(PdfReader(st.session_state["working_file_pdf"]).pages) - assets["pagefilter"]["selected_pages"] = list(range(number_pages)) - st.session_state["assets"] = assets - st.switch_page("pages/1_Selected_Pages.py") +# DEBUG +st.write(st.session_state["proc"]) From a9f44b0f89457d713930e1248713f01b58335156 Mon Sep 17 00:00:00 2001 From: Quentin Jarry Date: Thu, 2 May 2024 13:59:22 +0200 Subject: [PATCH 07/12] fix readme tremalit part --- README.md | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 39103bb..edef8c7 100644 --- a/README.md +++ b/README.md @@ -55,8 +55,11 @@ To start the streamlit app and use the extractor streamlined version, start it l `streamlit run app/index.py` -The app comes with page detection and parsers default config but you can change it by providing a yaml file following the config.yaml format below. +The app comes with page detection and parsers default config but you can change it by providing a yaml file following the config.yaml format below. +Below is an example of the pipeline running on one of the reports, parsing the tables with LlamaParse and Unstructured. + +[PipelineDemonstration.webm](https://github.com/dataforgoodfr/12_taxobservatory/assets/1128418/f9c64e83-9c15-4de2-a512-4f4b25f2f3ae) ## Running the pipeline from the command line @@ -104,20 +107,8 @@ This config file uses: - LangChain with GPT-4-turbo-preview for requesting the parsed tables to extract and re-order the necessary informations -## Running the pipeline with the streamlit app - -You can also interact with the pipeline with a streamlit app : - -``` -streamlit run app/index.py -``` - -Below is an example of the pipeline running on one of the reports, parsing the tables with LlamaParse and Unstructured. - -[PipelineDemonstration.webm](https://github.com/dataforgoodfr/12_taxobservatory/assets/1128418/f9c64e83-9c15-4de2-a512-4f4b25f2f3ae) - -# Avaiable blocks +# Available blocks ## Page filter From 879f5f03e4c82d70fd493c4ae999f7b4ecabe548 Mon Sep 17 00:00:00 2001 From: Quentin Jarry Date: Thu, 2 May 2024 14:40:08 +0200 Subject: [PATCH 08/12] fix remove debug msg --- app/pages/0_Import_File.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/app/pages/0_Import_File.py b/app/pages/0_Import_File.py index 7be8812..e871e44 100644 --- a/app/pages/0_Import_File.py +++ b/app/pages/0_Import_File.py @@ -176,6 +176,3 @@ def on_change_page_filter(name_to_filter_dict: dict) -> None: get_pdf_iframe(st.session_state["working_file_pdf"].name), unsafe_allow_html=True, ) - -# DEBUG -st.write(st.session_state["proc"]) From 69f45d31ac0c649b3f5e84d1fa7643ff833af128 Mon Sep 17 00:00:00 2001 From: Quentin Jarry Date: Thu, 2 May 2024 16:17:37 +0200 Subject: [PATCH 09/12] fix callback switch page + selector wiping their key state --- app/pages/0_Import_File.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/app/pages/0_Import_File.py b/app/pages/0_Import_File.py index e871e44..4f9f860 100644 --- a/app/pages/0_Import_File.py +++ b/app/pages/0_Import_File.py @@ -17,7 +17,6 @@ def set_page_filter(value: dict): set_state(["config", "pagefilter"], value) - def initiate_configuration() -> None: st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"]) if isinstance(st.session_state["config"]["pagefilter"], list): @@ -27,7 +26,7 @@ def initiate_configuration() -> None: st.session_state["selected_page_filter_name"] = st.session_state["config"][ "pagefilter" ]["type"] - + #debug def generate_assets() -> None: assets = { @@ -51,7 +50,6 @@ def generate_assets() -> None: assets["pagefilter"]["selected_pages"] = list(range(number_pages)) st.session_state["assets"] = assets - def on_pdf_file_upload() -> None: # Change states related to the pdf file upload mytmpfile.write(st.session_state.original_pdf.read()) @@ -61,7 +59,8 @@ def on_pdf_file_upload() -> None: # Generate assets generate_assets() - st.switch_page("pages/1_Selected_Pages.py") + st.session_state["page_redirection"] = "pages/1_Selected_Pages.py" + #st.switch_page("pages/1_Selected_Pages.py") def on_config_file_upload() -> None: @@ -70,8 +69,15 @@ def on_config_file_upload() -> None: def on_change_page_filter(name_to_filter_dict: dict) -> None: + st.session_state["selected_page_filter_name"] = st.session_state["radio_button_filter_selection"] #this 'buffer' is needed because selectors wipe their key on reload set_page_filter(name_to_filter_dict[st.session_state["selected_page_filter_name"]]) +# Check if a redirection was requested +# Workaround because st.switch_page is not allowed in a callback function +if st.session_state.get("page_redirection", False): + page_to_redirect_to = st.session_state["page_redirection"] + st.session_state["page_redirection"] = False + st.switch_page(page_to_redirect_to) st.set_page_config(layout="wide", page_title="Accueil - upload de PDF") st.title("Country by Country Tax Reporting analysis") @@ -122,6 +128,7 @@ def on_change_page_filter(name_to_filter_dict: dict) -> None: key="initial_uploaded_config", on_change=initiate_configuration, ) + if loaded_config is not None: if not loaded_config.name.endswith(".yaml"): st.error("Please upload a yaml file") @@ -159,7 +166,7 @@ def on_change_page_filter(name_to_filter_dict: dict) -> None: page_filter_list, index=current_selected_page_filter_index, on_change=on_change_page_filter, - key="selected_page_filter_name", + key="radio_button_filter_selection", args=(page_filter_name_to_config_mapping,), ) @@ -175,4 +182,4 @@ def on_change_page_filter(name_to_filter_dict: dict) -> None: st.markdown( get_pdf_iframe(st.session_state["working_file_pdf"].name), unsafe_allow_html=True, - ) + ) \ No newline at end of file From 27c8fc369cb33b83896b2011726dffab4ffdd62c Mon Sep 17 00:00:00 2001 From: Quentin Jarry Date: Sun, 5 May 2024 09:24:40 +0200 Subject: [PATCH 10/12] clean debug messages --- app/pages/0_Import_File.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/app/pages/0_Import_File.py b/app/pages/0_Import_File.py index 4f9f860..60fc6cf 100644 --- a/app/pages/0_Import_File.py +++ b/app/pages/0_Import_File.py @@ -26,7 +26,6 @@ def initiate_configuration() -> None: st.session_state["selected_page_filter_name"] = st.session_state["config"][ "pagefilter" ]["type"] - #debug def generate_assets() -> None: assets = { @@ -60,7 +59,6 @@ def on_pdf_file_upload() -> None: generate_assets() st.session_state["page_redirection"] = "pages/1_Selected_Pages.py" - #st.switch_page("pages/1_Selected_Pages.py") def on_config_file_upload() -> None: From 69da052800a616bf814ea53e2dbf7e2591de4d01 Mon Sep 17 00:00:00 2001 From: Quentin Jarry Date: Sun, 5 May 2024 12:04:53 +0200 Subject: [PATCH 11/12] fix assets generation when selecting extractor --- app/pages/0_Import_File.py | 24 +----------------------- app/pages/1_Selected_Pages.py | 4 ++-- app/utils.py | 24 +++++++++++++++++++++++- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/app/pages/0_Import_File.py b/app/pages/0_Import_File.py index 60fc6cf..893e2d6 100644 --- a/app/pages/0_Import_File.py +++ b/app/pages/0_Import_File.py @@ -7,7 +7,7 @@ import copy from menu import display_pages_menu, display_config from pypdf import PdfReader -from utils import get_pdf_iframe, set_state +from utils import get_pdf_iframe, set_state, generate_assets from country_by_country.processor import ReportProcessor @@ -27,28 +27,6 @@ def initiate_configuration() -> None: "pagefilter" ]["type"] -def generate_assets() -> None: - assets = { - "pagefilter": {}, - "table_extractors": [], - } - - # Filtering the pages - st.session_state["proc"].page_filter( - st.session_state["working_file_pdf"].name, - assets, - ) - - logging.info(f"Assets : {assets}") - - if len(assets["pagefilter"]["selected_pages"]) == 0: - # No page has been automatically selected by the page filter - # Hence, we display the full pdf, letting the user select the pages - pdfreader = PdfReader(st.session_state["working_file_pdf"]) - number_pages = len(PdfReader(st.session_state["working_file_pdf"]).pages) - assets["pagefilter"]["selected_pages"] = list(range(number_pages)) - st.session_state["assets"] = assets - def on_pdf_file_upload() -> None: # Change states related to the pdf file upload mytmpfile.write(st.session_state.original_pdf.read()) diff --git a/app/pages/1_Selected_Pages.py b/app/pages/1_Selected_Pages.py index 9b6093f..6a352e9 100644 --- a/app/pages/1_Selected_Pages.py +++ b/app/pages/1_Selected_Pages.py @@ -1,6 +1,6 @@ import streamlit as st from country_by_country.processor import ReportProcessor -from utils import get_pdf_iframe, set_state +from utils import get_pdf_iframe, set_state, generate_assets from country_by_country.utils.utils import keep_pages from pypdf import PdfReader from menu import display_pages_menu, display_config @@ -29,7 +29,7 @@ def set_extractors() -> None: ] set_state(["config", "table_extraction"], selected_extractors_dict) st.session_state["proc"] = ReportProcessor(st.session_state["config"]) - + generate_assets() st.set_page_config(layout="wide", page_title="Pages selection") # page_icon="📈" st.title("Country by Country Tax Reporting analysis : Selected Pages") diff --git a/app/utils.py b/app/utils.py index 2c79b3f..b09d6aa 100644 --- a/app/utils.py +++ b/app/utils.py @@ -1,8 +1,10 @@ import base64 +import logging from pathlib import Path from typing import Any import pandas as pd +from pypdf import PdfReader import streamlit as st @@ -41,7 +43,6 @@ def to_csv_file(df: pd.DataFrame) -> bytes: return df.to_csv(index=False).encode("utf-8") - def set_state(key: Any, value: Any) -> None: """ Sets the session_state[key] to value. @@ -61,3 +62,24 @@ def set_state(key: Any, value: Any) -> None: nested_value[key_list[-1]] = value else: st.session_state[key] = value + +def generate_assets() -> None: + assets = { + "pagefilter": {}, + "table_extractors": [], + } + + # Filtering the pages + st.session_state["proc"].page_filter( + st.session_state["working_file_pdf"].name, + assets, + ) + + logging.info(f"Assets : {assets}") + + if len(assets["pagefilter"]["selected_pages"]) == 0: + # No page has been automatically selected by the page filter + # Hence, we display the full pdf, letting the user select the pages + number_pages = len(PdfReader(st.session_state["working_file_pdf"]).pages) + assets["pagefilter"]["selected_pages"] = list(range(number_pages)) + st.session_state["assets"] = assets \ No newline at end of file From 4d5eea38f24f3f9ccba9d25a2ef14327dcbff3f7 Mon Sep 17 00:00:00 2001 From: Quentin Jarry Date: Sun, 5 May 2024 12:29:20 +0200 Subject: [PATCH 12/12] precommit formatting --- app/pages/0_Import_File.py | 9 +++++++-- app/pages/1_Selected_Pages.py | 1 + app/utils.py | 6 ++++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/app/pages/0_Import_File.py b/app/pages/0_Import_File.py index 893e2d6..9585a87 100644 --- a/app/pages/0_Import_File.py +++ b/app/pages/0_Import_File.py @@ -17,6 +17,7 @@ def set_page_filter(value: dict): set_state(["config", "pagefilter"], value) + def initiate_configuration() -> None: st.session_state["config"] = copy.deepcopy(st.session_state["initial_config"]) if isinstance(st.session_state["config"]["pagefilter"], list): @@ -27,6 +28,7 @@ def initiate_configuration() -> None: "pagefilter" ]["type"] + def on_pdf_file_upload() -> None: # Change states related to the pdf file upload mytmpfile.write(st.session_state.original_pdf.read()) @@ -45,9 +47,12 @@ def on_config_file_upload() -> None: def on_change_page_filter(name_to_filter_dict: dict) -> None: - st.session_state["selected_page_filter_name"] = st.session_state["radio_button_filter_selection"] #this 'buffer' is needed because selectors wipe their key on reload + st.session_state["selected_page_filter_name"] = st.session_state[ + "radio_button_filter_selection" + ] # this 'buffer' is needed because selectors wipe their key on reload set_page_filter(name_to_filter_dict[st.session_state["selected_page_filter_name"]]) + # Check if a redirection was requested # Workaround because st.switch_page is not allowed in a callback function if st.session_state.get("page_redirection", False): @@ -158,4 +163,4 @@ def on_change_page_filter(name_to_filter_dict: dict) -> None: st.markdown( get_pdf_iframe(st.session_state["working_file_pdf"].name), unsafe_allow_html=True, - ) \ No newline at end of file + ) diff --git a/app/pages/1_Selected_Pages.py b/app/pages/1_Selected_Pages.py index 6a352e9..b02cd68 100644 --- a/app/pages/1_Selected_Pages.py +++ b/app/pages/1_Selected_Pages.py @@ -31,6 +31,7 @@ def set_extractors() -> None: st.session_state["proc"] = ReportProcessor(st.session_state["config"]) generate_assets() + st.set_page_config(layout="wide", page_title="Pages selection") # page_icon="📈" st.title("Country by Country Tax Reporting analysis : Selected Pages") st.subheader( diff --git a/app/utils.py b/app/utils.py index b09d6aa..e977c2a 100644 --- a/app/utils.py +++ b/app/utils.py @@ -4,8 +4,8 @@ from typing import Any import pandas as pd -from pypdf import PdfReader import streamlit as st +from pypdf import PdfReader def get_pdf_iframe(pdf_to_process: str) -> str: @@ -43,6 +43,7 @@ def to_csv_file(df: pd.DataFrame) -> bytes: return df.to_csv(index=False).encode("utf-8") + def set_state(key: Any, value: Any) -> None: """ Sets the session_state[key] to value. @@ -63,6 +64,7 @@ def set_state(key: Any, value: Any) -> None: else: st.session_state[key] = value + def generate_assets() -> None: assets = { "pagefilter": {}, @@ -82,4 +84,4 @@ def generate_assets() -> None: # Hence, we display the full pdf, letting the user select the pages number_pages = len(PdfReader(st.session_state["working_file_pdf"]).pages) assets["pagefilter"]["selected_pages"] = list(range(number_pages)) - st.session_state["assets"] = assets \ No newline at end of file + st.session_state["assets"] = assets