update filtering logic to match spc zones

Urban-Analytics-Technology-Platform · Oct 4, 2024 · b4148cf · b4148cf
1 parent 5739d89
commit b4148cf
Show file tree

Hide file tree

Showing 2 changed files with 27 additions and 56 deletions.
diff --git a/scripts/0_preprocess_inputs.py b/scripts/0_preprocess_inputs.py
@@ -1,16 +1,21 @@
 import geopandas as gpd
+import pandas as pd
+from uatk_spc import Reader
 
 import acbm
 from acbm.cli import acbm_cli
 from acbm.config import load_config
 from acbm.logger_config import preprocessing_logger as logger
-from acbm.preprocessing import edit_boundary_resolution, filter_boundaries
+from acbm.preprocessing import edit_boundary_resolution
 
 
 @acbm_cli
 def main(config_file):
     config = load_config(config_file)
     config.init_rng()
+    region = config.region
+    # NOTE: config.region must be a region whose SPC output is saved under spc_path
+    spc_path = acbm.root_path / "data/external/spc_output/raw/"
 
     # ----- BOUNDARIES
     logger.info("Preprocessing Boundary Layer")
@@ -25,25 +30,36 @@ def main(config_file):
 
     boundaries = boundaries.to_crs(epsg=4326)
 
-    ## Dissolve boundaries if resolution is MSOA
+    ## --- Dissolve boundaries if resolution is MSOA
 
     boundary_geography = config.parameters.boundary_geography  # can only be OA or MSOA
     logger.info(f"2. Dissolving boundaries to {boundary_geography} level")
 
     boundaries = edit_boundary_resolution(boundaries, boundary_geography)
 
-    ## Filter to study area
+    ## --- Filter to study area
+    # We filter on MSOA21CD values, which exist at both OA and MSOA boundary resolution
 
     logger.info("3. Filtering boundaries to specified study area")
-    # TODO get from config and log
-    # logger.info(f"3. Filtering boundaries to {config.parameters.boundary_filter_column} = {config.parameters.study_area}")
-
-    boundaries_filtered = filter_boundaries(
-        # boundaries=boundaries, column="LEP22NM1", values=["Leeds City Region"]
-        boundaries=boundaries,
-        column=config.parameters.boundary_filter_column,
-        values=config.parameters.boundary_filter_values,
+
+    # Step 1: Get zones from SPC (these will be 2011 MSOAs)
+    spc = Reader(spc_path, region, backend="pandas")
+    zones_in_region = list(spc.info_per_msoa.keys())
+
+    # Step 2: Filter boundaries to identified zones
+
+    # a) get MSOA11CD to MSOA21CD lookup
+    msoa_lookup = pd.read_csv(
+        acbm.root_path
+        / "data/external/MSOA_2011_MSOA_2021_Lookup_for_England_and_Wales.csv"
     )
+    # Filter msoa_lookup to include only rows where MSOA11CD is in zones_in_region
+    msoa_lookup_filtered = msoa_lookup[msoa_lookup["MSOA11CD"].isin(zones_in_region)]
+    # Extract the corresponding MSOA21CD values
+    msoa21cd_values = msoa_lookup_filtered["MSOA21CD"].tolist()
+
+    # b) filter boundaries to include only rows where MSOA21CD is in msoa21cd_values
+    boundaries_filtered = boundaries[boundaries["MSOA21CD"].isin(msoa21cd_values)]
 
     ## Save the output as parquet
     logger.info(

diff --git a/src/acbm/preprocessing.py b/src/acbm/preprocessing.py
@@ -54,51 +54,6 @@ def edit_boundary_resolution(
     return study_area
 
 
-# TODO: create spatial filter option
-def filter_boundaries(boundaries, column, values):
-    """
-    Filter the boundaries GeoDataFrame by the specified column and values.
-
-    Parameters
-    ----------
-
-    boundaries: gpd.GeoDataFrame): The GeoDataFrame containing the boundaries.
-    column: str
-        The column to filter by (e.g., 'LEP22NM1', 'LAD22NM', 'rgn22nm').
-    values: list
-        The list of values to keep in the specified column.
-
-    Returns
-    -------
-    gpd.GeoDataFrame
-        The filtered GeoDataFrame.
-
-    Raises
-    ------
-    ValueError
-        If the specified column does not exist in the GeoDataFrame.
-        If any of the specified values are not present in the column.
-    """
-
-    # Check if the column exists in the GeoDataFrame
-    if column not in boundaries.columns:
-        error_message = f"Column '{column}' does not exist in the GeoDataFrame."
-        raise ValueError(error_message)
-
-    # Check if all values are present in the specified column
-    unique_values = boundaries[column].unique()
-    missing_values = [value for value in values if value not in unique_values]
-    if missing_values:
-        error_message = (
-            f"Values {missing_values} are not present in the column '{column}'. "
-            f"Unique values in the column are: {unique_values}"
-        )
-        raise ValueError(error_message)
-
-    # Filter boundaries layer by column = values
-    return boundaries[boundaries[column].isin(values)]
-
-
 # ----- MATCHING