Skip to content

Commit

Permalink
update filtering logic to match spc zones
Browse files Browse the repository at this point in the history
  • Loading branch information
Hussein-Mahfouz committed Oct 4, 2024
1 parent 5739d89 commit b4148cf
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 56 deletions.
38 changes: 27 additions & 11 deletions scripts/0_preprocess_inputs.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
import geopandas as gpd
import pandas as pd
from uatk_spc import Reader

import acbm
from acbm.cli import acbm_cli
from acbm.config import load_config
from acbm.logger_config import preprocessing_logger as logger
from acbm.preprocessing import edit_boundary_resolution, filter_boundaries
from acbm.preprocessing import edit_boundary_resolution


@acbm_cli
def main(config_file):
config = load_config(config_file)
config.init_rng()
region = config.region
# Pick a region with SPC output saved
spc_path = acbm.root_path / "data/external/spc_output/raw/"

# ----- BOUNDARIES
logger.info("Preprocessing Boundary Layer")
Expand All @@ -25,25 +30,36 @@ def main(config_file):

boundaries = boundaries.to_crs(epsg=4326)

## Dissolve boundaries if resolution is MSOA
## --- Dissolve boundaries if resolution is MSOA

boundary_geography = config.parameters.boundary_geography # can only be OA or MSOA
logger.info(f"2. Dissolving boundaries to {boundary_geography} level")

boundaries = edit_boundary_resolution(boundaries, boundary_geography)

## Filter to study area
## --- Filter to study area
# we filter using msoa21cd values, which exist regardless of the boundary resolution

logger.info("3. Filtering boundaries to specified study area")
# TODO get from config and log
# logger.info(f"3. Filtering boundaries to {config.parameters.boundary_filter_column} = {config.parameters.study_area}")

boundaries_filtered = filter_boundaries(
# boundaries=boundaries, column="LEP22NM1", values=["Leeds City Region"]
boundaries=boundaries,
column=config.parameters.boundary_filter_column,
values=config.parameters.boundary_filter_values,

# Step 1: Get zones from SPC (these will be 2011 MSOAs)
spc = Reader(spc_path, region, backend="pandas")
zones_in_region = list(spc.info_per_msoa.keys())

# Step 2: Filter boundaries to identified zones

# a) get MSOA11CD to MSOA21CD lookup
msoa_lookup = pd.read_csv(
acbm.root_path
/ "data/external/MSOA_2011_MSOA_2021_Lookup_for_England_and_Wales.csv"
)
# Filter msoa_lookup to include only rows where MSOA11CD is in zones_in_region
msoa_lookup_filtered = msoa_lookup[msoa_lookup["MSOA11CD"].isin(zones_in_region)]
# Extract the corresponding MSOA21CD values
msoa21cd_values = msoa_lookup_filtered["MSOA21CD"].tolist()

# b) filter boundaries to include only rows where MSOA21CD is in msoa21cd_values
boundaries_filtered = boundaries[boundaries["MSOA21CD"].isin(msoa21cd_values)]

## Save the output as parquet
logger.info(
Expand Down
45 changes: 0 additions & 45 deletions src/acbm/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,51 +54,6 @@ def edit_boundary_resolution(
return study_area


# TODO: create spatial filter option
def filter_boundaries(boundaries, column, values):
    """
    Filter the boundaries GeoDataFrame by the specified column and values.

    Parameters
    ----------
    boundaries: gpd.GeoDataFrame
        The GeoDataFrame containing the boundaries.
    column: str
        The column to filter by (e.g., 'LEP22NM1', 'LAD22NM', 'rgn22nm').
    values: list or str
        The list of values to keep in the specified column. A single string
        is treated as a one-element list.

    Returns
    -------
    gpd.GeoDataFrame
        The filtered GeoDataFrame (rows whose `column` entry is in `values`).

    Raises
    ------
    ValueError
        If the specified column does not exist in the GeoDataFrame.
        If any of the specified values are not present in the column.
    """
    # A bare string would otherwise be iterated character by character,
    # producing a misleading "values not present" error.
    if isinstance(values, str):
        values = [values]

    # Check if the column exists in the GeoDataFrame
    if column not in boundaries.columns:
        error_message = f"Column '{column}' does not exist in the GeoDataFrame."
        raise ValueError(error_message)

    # Check if all values are present in the specified column.
    # The set is built once so each membership test is a hash lookup;
    # the original array is kept for the error message.
    unique_values = boundaries[column].unique()
    known_values = set(unique_values)
    missing_values = [value for value in values if value not in known_values]
    if missing_values:
        error_message = (
            f"Values {missing_values} are not present in the column '{column}'. "
            f"Unique values in the column are: {unique_values}"
        )
        raise ValueError(error_message)

    # Filter boundaries layer by column = values
    return boundaries[boundaries[column].isin(values)]


# ----- MATCHING


Expand Down

0 comments on commit b4148cf

Please sign in to comment.