Extend application of defaults beyond FI_T tables #240

Open: wants to merge 7 commits into base branch main.
1 change: 1 addition & 0 deletions xl2times/__main__.py
@@ -146,6 +146,7 @@ def convert_xl_to_times(
         transforms.process_units,
         transforms.complete_commodity_groups,
         transforms.process_wildcards,
+        transforms.fill_defaults_in_transform_tables,
         transforms.apply_transform_tables,
         transforms.explode_process_commodity_cols,
         transforms.apply_final_fixup,
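Note on where the new step sits: the transforms in this list take (config, tables, model) and return the updated tables, and the list is applied in order, so fill_defaults_in_transform_tables runs after process_wildcards and immediately before apply_transform_tables, i.e. the TFM rows already carry their default other_indexes by the time the transform tables are applied. A minimal sketch of that calling convention, assuming a hypothetical run_steps driver (the actual driver in __main__.py may differ, and earlier steps operate on a list of EmbeddedXlTable rather than a dict of DataFrames):

```python
# Sketch only: the (config, tables, model) -> tables convention matches the
# transform functions in this PR, but `run_steps` itself is hypothetical.
from typing import Any, Callable

import pandas as pd

TablesDict = dict[str, pd.DataFrame]
TransformStep = Callable[[Any, TablesDict, Any], TablesDict]


def run_steps(
    config: Any, tables: TablesDict, model: Any, steps: list[TransformStep]
) -> TablesDict:
    """Apply each transform in order; every step sees the previous step's output."""
    for step in steps:
        tables = step(config, tables, model)
    return tables
```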
83 changes: 53 additions & 30 deletions xl2times/transforms.py
@@ -1208,6 +1208,50 @@ def capitalise_attributes_table(table: EmbeddedXlTable):
     return [capitalise_attributes_table(table) for table in tables]
 
 
+def _populate_defaults(tag: Tag, dataframe: DataFrame, col_name: str, config: Config):
+    """Fill in some of the missing values based on defaults in place."""
+    starting_na = (
+        dataframe["attribute"]
+        .str.upper()
+        .isin(config.veda_attr_defaults[col_name].keys())
+        & dataframe[col_name].isna()
+    )
+    if any(starting_na):
+        attributes = dataframe[starting_na]["attribute"].unique()
+        for attr in attributes:
+            i_attr = dataframe["attribute"] == attr
+            default_values = config.veda_attr_defaults[col_name][attr.upper()]
+            for default_value in default_values:
+                # Ensure that previously filled values are not overwritten
+                current_na = dataframe[col_name].isna()
+                remaining_na = starting_na & i_attr & current_na
+                if any(remaining_na):
+                    if default_value not in config.known_columns[tag]:
+                        dataframe.loc[remaining_na, [col_name]] = default_value
+                    elif default_value in dataframe.columns:
+                        dataframe.loc[remaining_na, [col_name]] = dataframe[
+                            remaining_na
+                        ][default_value]
+
+
+def fill_defaults_in_transform_tables(
+    config: Config,
+    tables: dict[str, DataFrame],
+    model: TimesModel,
+) -> dict[str, DataFrame]:
+    """Fill in some of the missing values based on defaults in place."""
+    tags = [Tag.tfm_mig, Tag.tfm_ins, Tag.tfm_upd]
+
+    for tag in tags:
+        if tag in tables:
+            table = tables[tag]
+            # Populate other_indexes based on defaults. Use known columns info from the fi_t tag.
+            _populate_defaults(Tag.fi_t, table, "other_indexes", config)
+            tables[tag] = table
+
+    return tables
+
+
 def apply_fixups(
     config: Config,
     tables: list[EmbeddedXlTable],
@@ -1224,33 +1268,9 @@ def apply_fixups_table(table: EmbeddedXlTable):
         if "year" in df.columns:
             df["year"] = pd.to_numeric(df["year"], errors="coerce")
 
-        def _populate_defaults(dataframe: DataFrame, col_name: str):
-            """Fill in some of the missing values based on defaults in place."""
-            i_na = (
-                dataframe["attribute"]
-                .str.upper()
-                .isin(config.veda_attr_defaults[col_name].keys())
-                & dataframe[col_name].isna()
-            )
-            if any(i_na):
-                for attr in dataframe[i_na]["attribute"].unique():
-                    i_attr = dataframe["attribute"] == attr
-                    for default_value in config.veda_attr_defaults[col_name][
-                        attr.upper()
-                    ]:
-                        # Ensure that previously filled values are not overwritten
-                        i_fill = i_na & i_attr & dataframe[col_name].isna()
-                        if any(i_fill):
-                            if default_value not in config.known_columns[tag]:
-                                dataframe.loc[i_fill, [col_name]] = default_value
-                            else:
-                                dataframe.loc[i_fill, [col_name]] = dataframe[i_fill][
-                                    default_value
-                                ]
-
         # Populate commodity and other_indexes based on defaults
         for col in ("commodity", "other_indexes"):
-            _populate_defaults(df, col)
+            _populate_defaults(tag, df, col, config)
 
         # Fill other indexes for some attributes
         # FLO_SHAR
@@ -2564,6 +2584,9 @@ def explode_process_commodity_cols(
         if "commodity" in df.columns:
             df = df.explode("commodity", ignore_index=True)
 
+        if "other_indexes" in df.columns:
+            df = df.explode("other_indexes", ignore_index=True)
+
         tables[tag] = df
 
     return tables
@@ -2709,7 +2732,7 @@ def convert_aliases(
     tables: dict[str, DataFrame],
     model: TimesModel,
 ) -> dict[str, DataFrame]:
-    # Ensure TIMES names for all attributes
+    """Ensure TIMES names for all attributes."""
     replacement_dict = {}
     for k, v in config.veda_attr_defaults["aliases"].items():
         for alias in v:
@@ -2723,10 +2746,10 @@
     # Drop duplicates generated due to renaming
     # TODO: Clear values in irrelevant columns before doing this
    # TODO: Do this comprehensively for all relevant tables
-    df = tables[Tag.fi_t]
-    df = df.dropna(subset="value").drop_duplicates(
-        subset=[col for col in df.columns if col != "value"], keep="last"
-    )
+    # TODO: Duplicates should only be removed if in the same file/module
+    df = tables[Tag.fi_t].dropna(subset="value")
+    cols = [col for col in df.columns if col != "value"]
+    df = df.drop_duplicates(subset=cols, keep="last")
     tables[Tag.fi_t] = df.reset_index(drop=True)
     return tables
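To make the new default-filling behaviour concrete, here is a small self-contained illustration of the logic in _populate_defaults: for rows whose attribute has registered defaults and whose target column is still empty, it writes either a literal default value or, when the default names a known column present in the frame, the value copied from that column. The defaults mapping and column values below are invented for the example; in xl2times they come from config.veda_attr_defaults and config.known_columns.

```python
# Illustration of the fill logic in _populate_defaults (toy data, invented defaults).
import pandas as pd

# Invented stand-ins for config.veda_attr_defaults["other_indexes"] and config.known_columns[tag]
attr_defaults = {"FLO_SHAR": ["commodity"], "ACT_EFF": ["ACT"]}
known_columns = {"attribute", "process", "commodity", "other_indexes", "value"}

df = pd.DataFrame(
    {
        "attribute": ["FLO_SHAR", "ACT_EFF", "ACT_EFF"],
        "commodity": ["COAL", None, None],
        "other_indexes": [None, None, "NRG"],
        "value": [0.5, 0.9, 0.8],
    }
)

col = "other_indexes"
starting_na = df["attribute"].str.upper().isin(attr_defaults.keys()) & df[col].isna()
for attr in df.loc[starting_na, "attribute"].unique():
    i_attr = df["attribute"] == attr
    for default in attr_defaults[attr.upper()]:
        remaining_na = starting_na & i_attr & df[col].isna()
        if remaining_na.any():
            if default not in known_columns:
                df.loc[remaining_na, col] = default  # literal default, e.g. "ACT"
            elif default in df.columns:
                df.loc[remaining_na, col] = df.loc[remaining_na, default]  # copy from another column

print(df[col].tolist())  # ['COAL', 'ACT', 'NRG']
```

With this PR the helper is shared by apply_fixups (FI_T tables) and the new fill_defaults_in_transform_tables (TFM_INS, TFM_UPD and TFM_MIG tables), which is what extends the application of defaults beyond FI_T.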
