Skip to content

Commit

Permalink
Merge pull request #48 from Urban-Analytics-Technology-Platform/time_…
Browse files Browse the repository at this point in the history
…estimates_workflow

Time estimates workflow
  • Loading branch information
Hussein-Mahfouz authored Oct 2, 2024
2 parents 4628798 + ba2ddab commit 41a32b6
Show file tree
Hide file tree
Showing 6 changed files with 516 additions and 196 deletions.
309 changes: 307 additions & 2 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ click = "^8.1.7"
tomlkit = "^0.13.0"
cml-pam = "0.3.2"
gdal = "<=3.8.4"
pandera = "^0.20.4"

[tool.poetry.dev-dependencies]
pytest = ">= 6"
Expand Down
80 changes: 29 additions & 51 deletions scripts/3.1_assign_primary_feasible_zones.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,69 +78,45 @@ def main(config_file):
# are compared to the travel times of the individual's actual trips from the nts
# (`tst`/`TripStart` and `tet`/`TripEnd`)

logger.info("Loading travel time matrix")

travel_times = pd.read_parquet(
acbm.root_path / "data/external/travel_times/oa/travel_time_matrix_acbm.parquet"
)

logger.info("Travel time matrix loaded")

logger.info("Merging travel time matrix with boundaries")

# convert from_id and to_id to int to match the boundaries data type
travel_times = travel_times.astype({"from_id": int, "to_id": int})

# merge travel_times with boundaries
travel_times = travel_times.merge(
boundaries[["OBJECTID", config.zone_id]],
left_on="from_id",
right_on="OBJECTID",
how="left",
)
travel_times = travel_times.drop(columns="OBJECTID")

travel_times = travel_times.merge(
boundaries[["OBJECTID", config.zone_id]],
left_on="to_id",
right_on="OBJECTID",
how="left",
suffixes=("_from", "_to"),
)
travel_times = travel_times.drop(columns="OBJECTID")

# #### Travel distance matrix
#
# Some areas aren't reachable by specific modes. We create a travel distance matrix
# to fall back on when there are no travel time calculations

logger.info("Creating travel time estimates")

travel_time_estimates = zones_to_time_matrix(
zones=boundaries, id_col=config.zone_id, to_dict=True
# TODO: move to config
travel_time_matrix_path = (
acbm.root_path / "data/external/travel_times/oa/travel_time_matrix.parquet"
)

with open(
acbm.root_path / "data/interim/assigning/travel_time_estimates.pkl", "wb"
) as f:
pkl.dump(travel_time_estimates, f)

logger.info("Travel time estimates created")
if config.parameters.travel_times:
logger.info("Loading travel time matrix")
try:
travel_times = pd.read_parquet(travel_time_matrix_path)
print("Travel time matrix loaded successfully.")
except Exception as e:
logger.info(
f"Failed to load travel time matrix: {e}. Check that you have a "
"travel_times matrix at {travel_time_matrix_path}. Otherwise set "
"travel_times to false in config"
)
raise e
else:
# If travel_times is not true or loading failed, create a new travel time matrix
logger.info("No travel time matrix found. Creating a new travel time matrix.")
# Create a new travel time matrix based on distances between zones
travel_times = zones_to_time_matrix(zones=boundaries, id_col="OA21CD")
logger.info("Travel time estimates created")

# --- Intrazonal trip times
#
# Intrazonal trips all have time = 0. Our `get_possible_zones` function finds zones
# that are within a specified % threshold from the reported time in the NTS.
# A threshold percentage from a non zero number never equals 0, so intrazonal trips
# are not found. The problem is also explained in this issue #30
#

# Below, we assign intrazonal trips a non-zero time based on the zone area

# get intrazone travel time estimates per mode

logger.info("Creating intrazonal travel time estimates")

intrazone_times = intrazone_time(boundaries.set_index("OBJECTID"))
# TODO: use config zone_id instead of OA21CD
intrazone_times = intrazone_time(zones=boundaries, key_column="OA21CD")

logger.info("Intrazonal travel time estimates created")

Expand All @@ -150,7 +126,7 @@ def main(config_file):
travel_times = replace_intrazonal_travel_time(
travel_times=travel_times,
intrazonal_estimates=intrazone_times,
column_to_replace="travel_time_p50",
column_to_replace="time",
)

logger.info("Intrazonal travel times replaced")
Expand Down Expand Up @@ -223,10 +199,11 @@ def main(config_file):
activity_chains=activity_chains_edu,
travel_times=travel_times,
activities_per_zone=activities_per_zone,
boundaries=boundaries,
key_col="id",
zone_id=config.zone_id,
filter_by_activity=True,
activity_col="education_type",
zone_id=config.zone_id,
time_tolerance=0.3,
)

Expand All @@ -249,10 +226,11 @@ def main(config_file):
activity_chains=activity_chains_work,
travel_times=travel_times,
activities_per_zone=activities_per_zone,
boundaries=boundaries,
key_col="id",
zone_id=config.zone_id,
filter_by_activity=True,
activity_col="dact",
zone_id=config.zone_id,
time_tolerance=0.3,
)

Expand Down
73 changes: 27 additions & 46 deletions scripts/3.2.3_assign_secondary_zone.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ def merge_columns_from_other(df: pd.DataFrame, other: pd.DataFrame) -> pd.DataFr
logger.info("Analysis (matrices): Step 1 - Loading travel time data")

travel_times = pd.read_parquet(
acbm.root_path / "data/external/travel_times/oa/travel_time_matrix_acbm.parquet"
acbm.root_path / "data/external/travel_times/oa/travel_time_matrix.parquet"
)

# Edit modes
Expand All @@ -323,44 +323,25 @@ def merge_columns_from_other(df: pd.DataFrame, other: pd.DataFrame) -> pd.DataFr

# I will do 2 for now

# keep only the rows that match specific "combination" values
modes_to_use = ["car", "walk", "cycle", "pt_wkday_morning"]

# Filter the DataFrame
travel_times = travel_times[travel_times["combination"].isin(modes_to_use)]

# Rename specific values in "combination" column
travel_times["combination"] = travel_times["combination"].replace(
{"cycle": "bike", "pt_wkday_morning": "pt"}
)

# Add OA21CD
# TODO: move this upstream and delete from here
logger.info("Analysis (matrices): Step 3 - Adding OA21CD to travel time data")

# convert from_id and to_id to int to match the boundaries data type
travel_times = travel_times.astype({"from_id": int, "to_id": int})

# merge travel_times with boundaries
travel_times = travel_times.merge(
boundaries[["OBJECTID", config.zone_id]],
left_on="from_id",
right_on="OBJECTID",
how="left",
)
travel_times = travel_times.drop(columns="OBJECTID")
# Check if 'time_of_day' column exists (this implies we have travel times for PT by time of day - ie travel times have not
# been generated by zones_to_time_matrix() function)
# TODO: just replace with time estimates from zones_to_time_matrix() function
if "time_of_day" in travel_times.columns:
# Apply filtering logic
travel_times = travel_times[
(travel_times["mode"] != "pt")
| (
(travel_times["mode"] == "pt")
& (travel_times["time_of_day"] == "morning")
& (travel_times["weekday"] == 1)
)
]

travel_times = travel_times.merge(
boundaries[["OBJECTID", config.zone_id]],
left_on="to_id",
right_on="OBJECTID",
how="left",
suffixes=("_from", "_to"),
)
travel_times = travel_times.drop(columns="OBJECTID")
# Rename specific values in "mode" column
travel_times["mode"] = travel_times["mode"].replace({"cycle": "bike"})

# --- Calculate OD probabilities (probabilities of choosing a destination zone for an activity, given the origin zone)
logger.info("Analysis (matrices): Step 4 - Calculating OD probabilities")
logger.info("Analysis (matrices): Step 3 - Calculating OD probabilities")

activities_per_zone = pd.read_parquet(
acbm.root_path / "data/interim/assigning/activities_per_zone.parquet"
Expand All @@ -385,14 +366,14 @@ def merge_columns_from_other(df: pd.DataFrame, other: pd.DataFrame) -> pd.DataFr

# Calculate the visit_probability: it is a function of floor_area and travel time
merged_df["visit_prob"] = np.where(
merged_df["travel_time_p50"] != 0, # avoid division by zero
round(merged_df["floor_area"] / np.sqrt(merged_df["travel_time_p50"])),
merged_df["time"] != 0, # avoid division by zero
round(merged_df["floor_area"] / np.sqrt(merged_df["time"])),
round(merged_df["floor_area"]),
)

# --- Create matrices for travel times and OD probabilities
logger.info(
"Analysis (matrices): Step 5 - Creating matrices for travel times and OD probabilities"
"Analysis (matrices): Step 4 - Creating matrices for travel times and OD probabilities"
)

# Get unique zone labels for matrix
Expand All @@ -409,8 +390,8 @@ def merge_columns_from_other(df: pd.DataFrame, other: pd.DataFrame) -> pd.DataFr

matrix_travel_times = create_od_matrices(
df=merged_df,
mode_column="combination",
value_column="travel_time_p50",
mode_column="mode",
value_column="time",
zone_labels=zone_labels,
fill_value=300, # replace missing travel times with 6 hours (they are unreachable)
zone_from=config.origin_zone_id(zone_id),
Expand All @@ -419,7 +400,7 @@ def merge_columns_from_other(df: pd.DataFrame, other: pd.DataFrame) -> pd.DataFr

matrix_od_probs = create_od_matrices(
df=merged_df,
mode_column="combination",
mode_column="mode",
value_column="visit_prob",
zone_labels=zone_labels,
# replace missing probabilities with 1. There are no activities so shouldn't be visited
Expand All @@ -431,9 +412,9 @@ def merge_columns_from_other(df: pd.DataFrame, other: pd.DataFrame) -> pd.DataFr
)

# Create ODMatrix objects
logger.info("Analysis (matrices): Step 6 - Creating ODMatrix objects")
logger.info("Analysis (matrices): Step 5 - Creating ODMatrix objects")

mode_types = travel_times["combination"].unique()
mode_types = travel_times["mode"].unique()

matrices_pam_travel_time = [
ODMatrix("time", mode, zone_labels, zone_labels, matrix_travel_times[mode])
Expand All @@ -449,7 +430,7 @@ def merge_columns_from_other(df: pd.DataFrame, other: pd.DataFrame) -> pd.DataFr
matrices_pam_all = matrices_pam_travel_time + matrices_pam_od_probs

# create ODFactory
logger.info("Analysis (matrices): Step 7 - Creating ODFactory object")
logger.info("Analysis (matrices): Step 6 - Creating ODFactory object")

od = ODFactory.from_matrices(matrices=matrices_pam_all)

Expand All @@ -459,7 +440,7 @@ def merge_columns_from_other(df: pd.DataFrame, other: pd.DataFrame) -> pd.DataFr
update_population_plans(population, od)

# --- Save
logger.info("Saving: Step 9 - Saving population")
logger.info("Saving: Step 7 - Saving population")

write.to_csv(population, dir=(acbm.root_path / "data/processed/activities_pam"))

Expand Down
Loading

0 comments on commit 41a32b6

Please sign in to comment.