Skip to content

Commit

Permalink
boundary preprocessing wip
Browse files Browse the repository at this point in the history
  • Loading branch information
Hussein-Mahfouz committed Oct 2, 2024
1 parent 41a32b6 commit eb1801f
Show file tree
Hide file tree
Showing 7 changed files with 151 additions and 23 deletions.
47 changes: 47 additions & 0 deletions scripts/0_preprocess_inputs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Preprocess the boundary layer: read the England-wide OA boundaries, optionally
# dissolve them to MSOA level, filter them to the study area and save the result
# as `study_area_zones.geojson`.
import geopandas as gpd

import acbm

# NOTE: the logger defined in acbm.logger_config is `preprocessing_logger`
from acbm.logger_config import preprocessing_logger as logger
from acbm.preprocessing import edit_boundary_resolution, filter_boundaries

# ----- BOUNDARIES
logger.info("Preprocessing Boundary Layer")

## Read in the boundary layer for the whole of England

logger.info("1. Reading in the boundary layer for the whole of England")


boundaries = gpd.read_file(
    acbm.root_path / "data/external/boundaries/oa_england.geojson"
)

# Reproject to EPSG:4326 once here, before the layer is saved
boundaries = boundaries.to_crs(epsg=4326)

## Dissolve boundaries if resolution is MSOA

boundary_geography = "OA"  # can only be OA or MSOA
logger.info(f"2. Dissolving boundaries to {boundary_geography} level")

boundaries = edit_boundary_resolution(boundaries, boundary_geography)


## Filter to study area

logger.info("3. Filtering boundaries to specified study area")
# TODO get from config and log
# logger.info(f"3. Filtering boundaries to {config.parameters.boundary_filter_column} = {config.parameters.study_area}")

boundaries_filtered = filter_boundaries(
    boundaries=boundaries, column="LEP22NM1", values=["Leeds City Region"]
)

## Save the output as GeoJSON
logger.info(
    f"4. Saving the boundaries to {acbm.root_path / 'data/external/boundaries/'} path"
)

boundaries_filtered.to_file(
    acbm.root_path / "data/external/boundaries/study_area_zones.geojson",
    driver="GeoJSON",
)
7 changes: 1 addition & 6 deletions scripts/3.1_assign_primary_feasible_zones.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,11 @@ def main(config_file):
# --- Study area boundaries

logger.info("Loading study area boundaries")
where_clause = "MSOA21NM LIKE '%Leeds%'"

boundaries = gpd.read_file(
acbm.root_path / "data/external/boundaries/oa_england.geojson",
where=where_clause,
acbm.root_path / "data/external/boundaries/study_area_zones.geojson"
)

# convert boundaries to 4326
boundaries = boundaries.to_crs(epsg=4326)

logger.info("Study area boundaries loaded")

# --- Assign activity home locations to boundaries zoning system
Expand Down
7 changes: 3 additions & 4 deletions scripts/3.2.2_assign_primary_zone_work.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,13 @@ def main(config_file):

# --- boundaries

where_clause = "MSOA21NM LIKE '%Leeds%'"
logger.info("Loading study area boundaries")

boundaries = gpd.read_file(
acbm.root_path / "data/external/boundaries/oa_england.geojson",
where=where_clause,
acbm.root_path / "data/external/boundaries/study_area_zones.geojson"
)

boundaries = boundaries.to_crs(epsg=4326)
logger.info("Study area boundaries loaded")

# osm POI data

Expand Down
8 changes: 2 additions & 6 deletions scripts/3.2.3_assign_secondary_zone.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,11 @@ def main(config_file):

logger.info("Preprocessing: Adding OA21CD to the data")

where_clause = "MSOA21NM LIKE '%Leeds%'"

boundaries = gpd.read_file(
acbm.root_path / "data/external/boundaries/oa_england.geojson",
where=where_clause,
acbm.root_path / "data/external/boundaries/study_area_zones.geojson"
)

# convert boundaries to 4326
boundaries = boundaries.to_crs(epsg=4326)
logger.info("Study area boundaries loaded")

# --- Assign activity home locations to boundaries zoning system

Expand Down
10 changes: 4 additions & 6 deletions scripts/3.3_assign_facility_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,13 @@ def main(config_file):
)

# --- Load data: Boundaries
logger.info("Loading boundaries data")

where_clause = "MSOA21NM LIKE '%Leeds%'"
logger.info("Loading study area boundaries")

boundaries = gpd.read_file(
acbm.root_path / "data/external/boundaries/oa_england.geojson",
where=where_clause,
acbm.root_path / "data/external/boundaries/study_area_zones.geojson"
)
boundaries = boundaries.to_crs(epsg=4326)

logger.info("Study area boundaries loaded")

# --- Preprocess: add zone column to POI data
logger.info("Adding zone column to POI data")
Expand Down
1 change: 1 addition & 0 deletions src/acbm/logger_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def create_logger(name, log_file):


# Create loggers for different modules
preprocessing_logger = create_logger("preprocessing", "preprocessing.log")
matching_logger = create_logger("matching", "matching.log")
assigning_primary_feasible_logger = create_logger(
"assigning_primary_feasible", "assigning_primary_feasible.log"
Expand Down
94 changes: 93 additions & 1 deletion src/acbm/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,102 @@
import numpy as np
import pandas as pd
from pyproj import Transformer
from shapely import Point
from shapely.geometry import MultiPolygon, Point

import acbm

# ----- PREPROCESSING BOUNDARIES


def edit_boundary_resolution(
    study_area: gpd.GeoDataFrame, geography: str
) -> gpd.GeoDataFrame:
    """
    Return the study area boundaries at the requested geography resolution.

    When `geography` is "MSOA" the OA boundaries are dissolved into MSOA
    boundaries (grouped on the "MSOA21CD" column). For any other value the
    original OA boundaries are kept. Only "OA" and "MSOA" are supported.

    Parameters
    ----------
    study_area : gpd.GeoDataFrame
        A GeoDataFrame containing the study area boundaries at OA level.
    geography : str
        The geography resolution. It can be either "OA" or "MSOA".

    Returns
    -------
    gpd.GeoDataFrame
        A new GeoDataFrame with the boundaries at the specified geography and
        every geometry stored as a MultiPolygon. The input is not modified.
    """
    dissolve_to_msoa = geography == "MSOA"

    # Columns that are never needed in the output layer
    columns_to_drop = ["GlobalID", "LSOA21CD", "LSOA21NM"]
    # OA21CD identifies the zones at OA level, so it must survive when the OA
    # boundaries are kept. It is only meaningless once OAs are merged into MSOAs.
    if dissolve_to_msoa:
        columns_to_drop.append("OA21CD")

    # `drop` returns a new object, so the assignment to "geometry" further down
    # never touches the caller's GeoDataFrame
    study_area = study_area.drop(
        columns=[col for col in columns_to_drop if col in study_area.columns]
    )

    # Dissolve based on the specified geography
    if dissolve_to_msoa:
        print("converting from OA to MSOA")
        study_area = study_area.dissolve(by="MSOA21CD").reset_index()
    elif geography == "OA":
        print("keeping original OA boundaries")

    # Ensure all geometries are MultiPolygon so the layer has one geometry type
    study_area["geometry"] = study_area["geometry"].apply(
        lambda geom: MultiPolygon([geom]) if geom.geom_type == "Polygon" else geom
    )

    return study_area


# TODO: create spatial filter option
def filter_boundaries(boundaries, column, values):
    """
    Keep only the rows of a boundaries layer whose `column` entry is in `values`.

    Parameters
    ----------
    boundaries : gpd.GeoDataFrame
        The boundaries layer to filter.
    column : str
        Name of the column to filter on (e.g., 'LEP22NM1', 'LAD22NM', 'rgn22nm').
    values : list
        The values of `column` to keep.

    Returns
    -------
    gpd.GeoDataFrame
        The rows of `boundaries` whose `column` entry is one of `values`.

    Raises
    ------
    ValueError
        If `column` is not a column of `boundaries`, or if any entry of
        `values` does not occur in that column.
    """
    # Guard: the filter column has to exist
    if column not in boundaries.columns:
        error_message = f"Column '{column}' does not exist in the GeoDataFrame."
        raise ValueError(error_message)

    # Guard: every requested value has to occur at least once in the column,
    # otherwise the caller would silently get a smaller study area than asked for
    available = boundaries[column].unique()
    absent = [requested for requested in values if requested not in available]
    if absent:
        error_message = f"Values {absent} are not present in the column '{column}'."
        raise ValueError(error_message)

    # Boolean mask of the rows to keep
    keep_mask = boundaries[column].isin(values)
    return boundaries.loc[keep_mask]


# ----- MATCHING


def nts_filter_by_year(
data: pd.DataFrame, psu: pd.DataFrame, years: list
Expand Down

0 comments on commit eb1801f

Please sign in to comment.