Extend application of defaults beyond FI_T tables #240

Open: wants to merge 7 commits into base branch main.
1 change: 1 addition & 0 deletions xl2times/__main__.py
@@ -146,6 +146,7 @@ def convert_xl_to_times(
         transforms.process_units,
         transforms.complete_commodity_groups,
         transforms.process_wildcards,
+        transforms.fill_defaults_in_transform_tables,
         transforms.apply_transform_tables,
         transforms.explode_process_commodity_cols,
         transforms.apply_final_fixup,
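Note on where the new step sits: the transforms in this list take (config, tables, model) and return the updated tables, and the list is applied in order, so fill_defaults_in_transform_tables runs after process_wildcards and immediately before apply_transform_tables, i.e. the TFM rows already carry their default other_indexes by the time the transform tables are applied. A minimal sketch of that calling convention, assuming a hypothetical run_steps driver (the actual driver in __main__.py may differ, and earlier steps operate on a list of EmbeddedXlTable rather than a dict of DataFrames):

```python
# Sketch only: the (config, tables, model) -> tables convention matches the
# transform functions in this PR, but `run_steps` itself is hypothetical.
from typing import Any, Callable

import pandas as pd

TablesDict = dict[str, pd.DataFrame]
TransformStep = Callable[[Any, TablesDict, Any], TablesDict]


def run_steps(
    config: Any, tables: TablesDict, model: Any, steps: list[TransformStep]
) -> TablesDict:
    """Apply each transform in order; every step sees the previous step's output."""
    for step in steps:
        tables = step(config, tables, model)
    return tables
```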
83 changes: 53 additions & 30 deletions xl2times/transforms.py
@@ -1208,6 +1208,50 @@ def capitalise_attributes_table(table: EmbeddedXlTable):
     return [capitalise_attributes_table(table) for table in tables]
 
 
+def _populate_defaults(tag: Tag, dataframe: DataFrame, col_name: str, config: Config):
+    """Fill in some of the missing values based on defaults in place."""
+    starting_na = (
+        dataframe["attribute"]
+        .str.upper()
+        .isin(config.veda_attr_defaults[col_name].keys())
+        & dataframe[col_name].isna()
+    )
+    if any(starting_na):
+        attributes = dataframe[starting_na]["attribute"].unique()
+        for attr in attributes:
+            i_attr = dataframe["attribute"] == attr
+            default_values = config.veda_attr_defaults[col_name][attr.upper()]
+            for default_value in default_values:
+                # Ensure that previously filled values are not overwritten
+                current_na = dataframe[col_name].isna()
+                remaining_na = starting_na & i_attr & current_na
+                if any(remaining_na):
+                    if default_value not in config.known_columns[tag]:
+                        dataframe.loc[remaining_na, [col_name]] = default_value
+                    elif default_value in dataframe.columns:
+                        dataframe.loc[remaining_na, [col_name]] = dataframe[
+                            remaining_na
+                        ][default_value]
+
+
+def fill_defaults_in_transform_tables(
+    config: Config,
+    tables: dict[str, DataFrame],
+    model: TimesModel,
+) -> dict[str, DataFrame]:
+    """Fill in some of the missing values based on defaults in place."""
+    tags = [Tag.tfm_mig, Tag.tfm_ins, Tag.tfm_upd]
+
+    for tag in tags:
+        if tag in tables:
+            table = tables[tag]
+            # Populate other_indexes based on defaults. Use known columns info from the fi_t tag.
+            _populate_defaults(Tag.fi_t, table, "other_indexes", config)
+            tables[tag] = table
+
+    return tables
+
+
 def apply_fixups(
     config: Config,
     tables: list[EmbeddedXlTable],
@@ -1224,33 +1268,9 @@ def apply_fixups_table(table: EmbeddedXlTable):
         if "year" in df.columns:
             df["year"] = pd.to_numeric(df["year"], errors="coerce")
 
-        def _populate_defaults(dataframe: DataFrame, col_name: str):
-            """Fill in some of the missing values based on defaults in place."""
-            i_na = (
-                dataframe["attribute"]
-                .str.upper()
-                .isin(config.veda_attr_defaults[col_name].keys())
-                & dataframe[col_name].isna()
-            )
-            if any(i_na):
-                for attr in dataframe[i_na]["attribute"].unique():
-                    i_attr = dataframe["attribute"] == attr
-                    for default_value in config.veda_attr_defaults[col_name][
-                        attr.upper()
-                    ]:
-                        # Ensure that previously filled values are not overwritten
-                        i_fill = i_na & i_attr & dataframe[col_name].isna()
-                        if any(i_fill):
-                            if default_value not in config.known_columns[tag]:
-                                dataframe.loc[i_fill, [col_name]] = default_value
-                            else:
-                                dataframe.loc[i_fill, [col_name]] = dataframe[i_fill][
-                                    default_value
-                                ]
-
         # Populate commodity and other_indexes based on defaults
         for col in ("commodity", "other_indexes"):
-            _populate_defaults(df, col)
+            _populate_defaults(tag, df, col, config)
 
         # Fill other indexes for some attributes
         # FLO_SHAR
@@ -2564,6 +2584,9 @@ def explode_process_commodity_cols(
         if "commodity" in df.columns:
             df = df.explode("commodity", ignore_index=True)
 
+        if "other_indexes" in df.columns:
+            df = df.explode("other_indexes", ignore_index=True)
+
         tables[tag] = df
 
     return tables
@@ -2709,7 +2732,7 @@ def convert_aliases(
     tables: dict[str, DataFrame],
     model: TimesModel,
 ) -> dict[str, DataFrame]:
-    # Ensure TIMES names for all attributes
+    """Ensure TIMES names for all attributes."""
     replacement_dict = {}
     for k, v in config.veda_attr_defaults["aliases"].items():
         for alias in v:
@@ -2723,10 +2746,10 @@
     # Drop duplicates generated due to renaming
     # TODO: Clear values in irrelevant columns before doing this
    # TODO: Do this comprehensively for all relevant tables
-    df = tables[Tag.fi_t]
-    df = df.dropna(subset="value").drop_duplicates(
-        subset=[col for col in df.columns if col != "value"], keep="last"
-    )
+    # TODO: Duplicates should only be removed if in the same file/module
+    df = tables[Tag.fi_t].dropna(subset="value")
+    cols = [col for col in df.columns if col != "value"]
+    df = df.drop_duplicates(subset=cols, keep="last")
     tables[Tag.fi_t] = df.reset_index(drop=True)
     return tables
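To make the new default-filling behaviour concrete, here is a small self-contained illustration of the logic in _populate_defaults: for rows whose attribute has registered defaults and whose target column is still empty, it writes either a literal default value or, when the default names a known column present in the frame, the value copied from that column. The defaults mapping and column values below are invented for the example; in xl2times they come from config.veda_attr_defaults and config.known_columns.

```python
# Illustration of the fill logic in _populate_defaults (toy data, invented defaults).
import pandas as pd

# Invented stand-ins for config.veda_attr_defaults["other_indexes"] and config.known_columns[tag]
attr_defaults = {"FLO_SHAR": ["commodity"], "ACT_EFF": ["ACT"]}
known_columns = {"attribute", "process", "commodity", "other_indexes", "value"}

df = pd.DataFrame(
    {
        "attribute": ["FLO_SHAR", "ACT_EFF", "ACT_EFF"],
        "commodity": ["COAL", None, None],
        "other_indexes": [None, None, "NRG"],
        "value": [0.5, 0.9, 0.8],
    }
)

col = "other_indexes"
starting_na = df["attribute"].str.upper().isin(attr_defaults.keys()) & df[col].isna()
for attr in df.loc[starting_na, "attribute"].unique():
    i_attr = df["attribute"] == attr
    for default in attr_defaults[attr.upper()]:
        remaining_na = starting_na & i_attr & df[col].isna()
        if remaining_na.any():
            if default not in known_columns:
                df.loc[remaining_na, col] = default  # literal default, e.g. "ACT"
            elif default in df.columns:
                df.loc[remaining_na, col] = df.loc[remaining_na, default]  # copy from another column

print(df[col].tolist())  # ['COAL', 'ACT', 'NRG']
```

With this PR the helper is shared by apply_fixups (FI_T tables) and the new fill_defaults_in_transform_tables (TFM_INS, TFM_UPD and TFM_MIG tables), which is what extends the application of defaults beyond FI_T.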
