From 71b84918baf3722ed4d7be025e384d5ed120e756 Mon Sep 17 00:00:00 2001 From: Olexandr Balyk Date: Mon, 11 Nov 2024 11:41:21 -0500 Subject: [PATCH 1/6] Move _populate_defaults out of apply_fixups --- xl2times/transforms.py | 52 +++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/xl2times/transforms.py b/xl2times/transforms.py index f9e1328..cf08fd6 100644 --- a/xl2times/transforms.py +++ b/xl2times/transforms.py @@ -1208,6 +1208,29 @@ def capitalise_attributes_table(table: EmbeddedXlTable): return [capitalise_attributes_table(table) for table in tables] +def _populate_defaults(tag: Tag, dataframe: DataFrame, col_name: str, config: Config): + """Fill in some of the missing values based on defaults in place.""" + i_na = ( + dataframe["attribute"] + .str.upper() + .isin(config.veda_attr_defaults[col_name].keys()) + & dataframe[col_name].isna() + ) + if any(i_na): + for attr in dataframe[i_na]["attribute"].unique(): + i_attr = dataframe["attribute"] == attr + for default_value in config.veda_attr_defaults[col_name][attr.upper()]: + # Ensure that previously filled values are not overwritten + i_fill = i_na & i_attr & dataframe[col_name].isna() + if any(i_fill): + if default_value not in config.known_columns[tag]: + dataframe.loc[i_fill, [col_name]] = default_value + else: + dataframe.loc[i_fill, [col_name]] = dataframe[i_fill][ + default_value + ] + + def apply_fixups( config: Config, tables: list[EmbeddedXlTable], @@ -1224,33 +1247,9 @@ def apply_fixups_table(table: EmbeddedXlTable): if "year" in df.columns: df["year"] = pd.to_numeric(df["year"], errors="coerce") - def _populate_defaults(dataframe: DataFrame, col_name: str): - """Fill in some of the missing values based on defaults in place.""" - i_na = ( - dataframe["attribute"] - .str.upper() - .isin(config.veda_attr_defaults[col_name].keys()) - & dataframe[col_name].isna() - ) - if any(i_na): - for attr in dataframe[i_na]["attribute"].unique(): - i_attr = dataframe["attribute"] == attr - for default_value in config.veda_attr_defaults[col_name][ - attr.upper() - ]: - # Ensure that previously filled values are not overwritten - i_fill = i_na & i_attr & dataframe[col_name].isna() - if any(i_fill): - if default_value not in config.known_columns[tag]: - dataframe.loc[i_fill, [col_name]] = default_value - else: - dataframe.loc[i_fill, [col_name]] = dataframe[i_fill][ - default_value - ] - # Populate commodity and other_indexes based on defaults for col in ("commodity", "other_indexes"): - _populate_defaults(df, col) + _populate_defaults(tag, df, col, config) # Fill other indexes for some attributes # FLO_SHAR @@ -2709,7 +2708,7 @@ def convert_aliases( tables: dict[str, DataFrame], model: TimesModel, ) -> dict[str, DataFrame]: - # Ensure TIMES names for all attributes + """Ensure TIMES names for all attributes.""" replacement_dict = {} for k, v in config.veda_attr_defaults["aliases"].items(): for alias in v: @@ -2723,6 +2722,7 @@ def convert_aliases( # Drop duplicates generated due to renaming # TODO: Clear values in irrelevant columns before doing this # TODO: Do this comprehensively for all relevant tables + # TODO: Duplicates should only be removed if in the same file/module df = tables[Tag.fi_t] df = df.dropna(subset="value").drop_duplicates( subset=[col for col in df.columns if col != "value"], keep="last" From b3f510592e3ef88096c3e6c2d4acafa33affbf5d Mon Sep 17 00:00:00 2001 From: Olexandr Balyk Date: Mon, 11 Nov 2024 16:40:39 -0500 Subject: [PATCH 2/6] Create fill_defaults_in_transform_tables --- xl2times/transforms.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/xl2times/transforms.py b/xl2times/transforms.py index cf08fd6..5608f71 100644 --- a/xl2times/transforms.py +++ b/xl2times/transforms.py @@ -1231,6 +1231,24 @@ def _populate_defaults(tag: Tag, dataframe: DataFrame, col_name: str, config: Co ] +def fill_defaults_in_transform_tables( + config: Config, + tables: dict[str, DataFrame], + model: TimesModel, +) -> dict[str, DataFrame]: + """Fill in some of the missing values based on defaults in place.""" + tags = [Tag.tfm_ins] + + for tag in tags: + if tag in tables: + table = tables[tag] + # Populate other_indexes based on defaults + _populate_defaults(tag, table, "other_indexes", config) + tables[tag] = table + + return tables + + def apply_fixups( config: Config, tables: list[EmbeddedXlTable], From 3eea7911952002307e680da3ca32dcaebc4b3b1a Mon Sep 17 00:00:00 2001 From: Olexandr Balyk Date: Mon, 11 Nov 2024 16:42:14 -0500 Subject: [PATCH 3/6] Include fill_defaults_in_transform_tables in __main__.py --- xl2times/__main__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xl2times/__main__.py b/xl2times/__main__.py index da3f3d5..7b0e9e5 100644 --- a/xl2times/__main__.py +++ b/xl2times/__main__.py @@ -146,6 +146,7 @@ def convert_xl_to_times( transforms.process_units, transforms.complete_commodity_groups, transforms.process_wildcards, + transforms.fill_defaults_in_transform_tables, transforms.apply_transform_tables, transforms.explode_process_commodity_cols, transforms.apply_final_fixup, From eca92ebef0a0299059516ae83dc4c2372d74c1b4 Mon Sep 17 00:00:00 2001 From: Olexandr Balyk Date: Mon, 11 Nov 2024 19:30:19 -0500 Subject: [PATCH 4/6] Extend the list of transform tables --- xl2times/transforms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xl2times/transforms.py b/xl2times/transforms.py index 5608f71..ded4707 100644 --- a/xl2times/transforms.py +++ b/xl2times/transforms.py @@ -1237,7 +1237,7 @@ def fill_defaults_in_transform_tables( model: TimesModel, ) -> dict[str, DataFrame]: """Fill in some of the missing values based on defaults in place.""" - tags = [Tag.tfm_ins] + tags = [Tag.tfm_mig, Tag.tfm_ins, Tag.tfm_upd] for tag in tags: if tag in tables: From b4f75e90c79310eedf3d223594136ed7d2e3f02c Mon Sep 17 00:00:00 2001 From: Olexandr Balyk Date: Tue, 12 Nov 2024 09:28:15 -0500 Subject: [PATCH 5/6] Refactor some parts to expose the logic --- xl2times/transforms.py | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/xl2times/transforms.py b/xl2times/transforms.py index ded4707..165a636 100644 --- a/xl2times/transforms.py +++ b/xl2times/transforms.py @@ -1210,25 +1210,28 @@ def capitalise_attributes_table(table: EmbeddedXlTable): def _populate_defaults(tag: Tag, dataframe: DataFrame, col_name: str, config: Config): """Fill in some of the missing values based on defaults in place.""" - i_na = ( + starting_na = ( dataframe["attribute"] .str.upper() .isin(config.veda_attr_defaults[col_name].keys()) & dataframe[col_name].isna() ) - if any(i_na): - for attr in dataframe[i_na]["attribute"].unique(): + if any(starting_na): + attributes = dataframe[starting_na]["attribute"].unique() + for attr in attributes: i_attr = dataframe["attribute"] == attr - for default_value in config.veda_attr_defaults[col_name][attr.upper()]: + default_values = config.veda_attr_defaults[col_name][attr.upper()] + for default_value in default_values: # Ensure that previously filled values are not overwritten - i_fill = i_na & i_attr & dataframe[col_name].isna() - if any(i_fill): + current_na = dataframe[col_name].isna() + remaining_na = starting_na & i_attr & current_na + if any(remaining_na): if default_value not in config.known_columns[tag]: - dataframe.loc[i_fill, [col_name]] = default_value - else: - dataframe.loc[i_fill, [col_name]] = dataframe[i_fill][ - default_value - ] + dataframe.loc[remaining_na, [col_name]] = default_value + elif default_value in dataframe.columns: + dataframe.loc[remaining_na, [col_name]] = dataframe[ + remaining_na + ][default_value] def fill_defaults_in_transform_tables( @@ -1242,8 +1245,8 @@ def fill_defaults_in_transform_tables( for tag in tags: if tag in tables: table = tables[tag] - # Populate other_indexes based on defaults - _populate_defaults(tag, table, "other_indexes", config) + # Populate other_indexes based on defaults. Use known columns info from the fi_t tag. + _populate_defaults(Tag.fi_t, table, "other_indexes", config) tables[tag] = table return tables @@ -2741,10 +2744,9 @@ def convert_aliases( # TODO: Clear values in irrelevant columns before doing this # TODO: Do this comprehensively for all relevant tables # TODO: Duplicates should only be removed if in the same file/module - df = tables[Tag.fi_t] - df = df.dropna(subset="value").drop_duplicates( - subset=[col for col in df.columns if col != "value"], keep="last" - ) + df = tables[Tag.fi_t].dropna(subset="value") + cols = [col for col in df.columns if col != "value"] + df = df.drop_duplicates(subset=cols, keep="last") tables[Tag.fi_t] = df.reset_index(drop=True) return tables From b0864dcd0127addca562e8fa5a1fe1edd9d21515 Mon Sep 17 00:00:00 2001 From: Olexandr Balyk Date: Thu, 14 Nov 2024 07:13:43 -0500 Subject: [PATCH 6/6] Explode other_indexes as well --- xl2times/transforms.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/xl2times/transforms.py b/xl2times/transforms.py index 165a636..17cf7c9 100644 --- a/xl2times/transforms.py +++ b/xl2times/transforms.py @@ -2584,6 +2584,9 @@ def explode_process_commodity_cols( if "commodity" in df.columns: df = df.explode("commodity", ignore_index=True) + if "other_indexes" in df.columns: + df = df.explode("other_indexes", ignore_index=True) + tables[tag] = df return tables