Skip to content

Commit

Permalink
skip precommit hook
Browse files — browse the repository at this point in the history
  • Loading branch information
nlebovits committed Nov 21, 2024
1 parent cd87b34 commit b40c1cd
Show file tree
Hide file tree
Showing 10 changed files with 97 additions and 109 deletions.
20 changes: 19 additions & 1 deletion data/src/constants/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,25 @@
)

# Load data for complaints from L&I
COMPLAINTS_SQL_QUERY = f"SELECT address, service_request_id, subject, status, service_name, service_code, lat AS y, lon AS x FROM public_cases_fc WHERE requested_datetime >= '{one_year_ago}' AND lat IS NOT NULL"
COMPLAINTS_SQL_QUERY = f"""
SELECT address, service_request_id, subject, status, service_name, service_code, lat AS y, lon AS x
FROM public_cases_fc
WHERE requested_datetime >= '{one_year_ago}'
AND lat IS NOT NULL
AND (
subject ILIKE '%dumping%'
OR subject ILIKE '%blight%'
OR subject ILIKE '%rubbish%'
OR subject ILIKE '%weeds%'
OR subject ILIKE '%graffiti%'
OR subject ILIKE '%abandoned%'
OR subject ILIKE '%sanitation%'
OR subject ILIKE '%litter%'
OR subject ILIKE '%vacant%'
OR subject ILIKE '%trash%'
OR subject ILIKE '%unsafe%'
)
"""

# L&I code violations from the past year; rows without a geocode are excluded,
# and geocode_x/geocode_y are aliased to x/y for downstream point construction.
# NOTE(review): same `one_year_ago` interpolation pattern as the complaints query.
VIOLATIONS_SQL_QUERY = f"SELECT parcel_id_num, casenumber, casecreateddate, casetype, casestatus, violationnumber, violationcodetitle, violationstatus, opa_account_num, address, opa_owner, geocode_x AS x, geocode_y AS y FROM violations WHERE violationdate >= '{one_year_ago}' AND geocode_x IS NOT NULL"

Expand Down
6 changes: 4 additions & 2 deletions data/src/data_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .city_owned_properties import city_owned_properties
from .phs_properties import phs_properties
from .l_and_i import l_and_i
from .li_violations import li_violations
from .li_complaints import li_complaints
from .rco_geoms import rco_geoms
from .tree_canopy import tree_canopy
from .nbhoods import nbhoods
Expand All @@ -14,7 +15,8 @@
__all__ = [
"city_owned_properties",
"phs_properties",
"l_and_i",
"li_violations",
"li_complaints",
"rco_geoms",
"tree_canopy",
"nbhoods",
Expand Down
34 changes: 20 additions & 14 deletions data/src/data_utils/contig_neighbors.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,23 @@
import warnings
import networkx as nx
from libpysal.weights import Queen
import numpy as np


def contig_neighbors(primary_featurelayer):
# Filter the parcels to only consider vacant properties
parcels = primary_featurelayer.gdf[primary_featurelayer.gdf["vacant"] == 1]

if parcels.empty:
# Create a filtered dataframe with only vacant properties and polygon geometries
vacant_parcels = primary_featurelayer.gdf.loc[
(primary_featurelayer.gdf["vacant"]) &
(primary_featurelayer.gdf.geometry.type.isin(["Polygon", "MultiPolygon"])),
["opa_id", "geometry"]
]

if vacant_parcels.empty:
print("No vacant properties found in the dataset.")
primary_featurelayer.gdf["n_contiguous"] = 0
primary_featurelayer.gdf["n_contiguous"] = np.nan
return primary_featurelayer

print(f"Found {len(parcels)} vacant properties.")
print(f"Found {len(vacant_parcels)} vacant properties.")

with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=FutureWarning)
Expand All @@ -24,27 +29,28 @@ def contig_neighbors(primary_featurelayer):

# Create a spatial weights matrix for vacant parcels
print("Creating spatial weights matrix for vacant parcels...")
w = Queen.from_dataframe(parcels)
w = Queen.from_dataframe(vacant_parcels)

# Convert the spatial weights matrix to a NetworkX graph
print("Converting spatial weights matrix to NetworkX graph...")
g = w.to_networkx()

# Calculate the number of contiguous neighbors for each vacant property
# Calculate the number of contiguous vacant properties for each vacant parcel
print("Calculating number of contiguous vacant neighbors for each property...")
n_contiguous = {
node: len(nx.node_connected_component(g, node)) - 1 for node in g.nodes
}

# Assign the number of contiguous vacant neighbors to vacant properties
parcels["n_contiguous"] = parcels.index.map(n_contiguous).fillna(0).astype(int)
# Assign the contiguous neighbor count to the filtered vacant parcels
vacant_parcels["n_contiguous"] = vacant_parcels.index.map(n_contiguous)

print("Joining results back to primary feature layer...")
# Merge the results back to the primary feature layer
primary_featurelayer.gdf = primary_featurelayer.gdf.merge(
parcels[["opa_id", "n_contiguous"]], on="opa_id", how="left"
vacant_parcels[["opa_id", "n_contiguous"]], on="opa_id", how="left"
)

# For non-vacant properties, set the number of contiguous vacant neighbors to 0
primary_featurelayer.gdf["n_contiguous"].fillna(0, inplace=True)
# Assign NA for non-vacant properties
primary_featurelayer.gdf.loc[~primary_featurelayer.gdf["vacant"], "n_contiguous"] = np.nan

print("Process completed. Returning updated primary feature layer.")
return primary_featurelayer
8 changes: 8 additions & 0 deletions data/src/data_utils/li_complaints.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from constants.services import COMPLAINTS_SQL_QUERY


from data_utils.kde import apply_kde_to_primary


def li_complaints(primary_featurelayer):
    """Attach an L&I complaints kernel-density layer to the primary feature layer.

    Delegates entirely to ``apply_kde_to_primary``, which loads the complaint
    points selected by ``COMPLAINTS_SQL_QUERY`` and joins the resulting density
    surface onto ``primary_featurelayer``. Returns the updated feature layer.
    """
    return apply_kde_to_primary(
        primary_featurelayer,
        "L and I Complaints",
        COMPLAINTS_SQL_QUERY,
    )
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@
import geopandas as gpd
from typing import List
from classes.featurelayer import FeatureLayer
from constants.services import COMPLAINTS_SQL_QUERY, VIOLATIONS_SQL_QUERY
from constants.services import VIOLATIONS_SQL_QUERY


def l_and_i(primary_featurelayer: FeatureLayer) -> FeatureLayer:
def li_violations(primary_featurelayer: FeatureLayer) -> FeatureLayer:
"""
Process L&I (Licenses and Inspections) data for complaints and violations.
Process L&I (Licenses and Inspections) data for violations.
This function filters and processes L&I complaints and violations data,
This function filters and processes L&I violations data,
joining it with the primary feature layer based on spatial relationships
and OPA (Office of Property Assessment) identifiers.
Expand All @@ -33,35 +33,6 @@ def l_and_i(primary_featurelayer: FeatureLayer) -> FeatureLayer:
"unsafe",
]

# Load complaints data from L&I
l_and_i_complaints: FeatureLayer = FeatureLayer(
name="LI Complaints", carto_sql_queries=COMPLAINTS_SQL_QUERY
)

# Filter for rows where 'subject' contains any of the keywords
l_and_i_complaints.gdf = l_and_i_complaints.gdf[
l_and_i_complaints.gdf["subject"].str.lower().str.contains("|".join(keywords))
]

# Filter for only Status = 'Open'
l_and_i_complaints.gdf = l_and_i_complaints.gdf[
l_and_i_complaints.gdf["status"].str.lower() == "open"
]

# Group by geometry and concatenate the violationcodetitle values into a list with a semicolon separator
l_and_i_complaints.gdf = (
l_and_i_complaints.gdf.groupby("geometry")["service_name"]
.apply(lambda x: "; ".join([val for val in x if val is not None]))
.reset_index()
)

l_and_i_complaints.rebuild_gdf()

# rename the column to 'li_complaints'
l_and_i_complaints.gdf.rename(
columns={"service_name": "li_complaints"}, inplace=True
)

# Load data for violations from L&I
l_and_i_violations: FeatureLayer = FeatureLayer(
name="LI Violations", carto_sql_queries=VIOLATIONS_SQL_QUERY, from_xy=True
Expand Down Expand Up @@ -121,7 +92,6 @@ def l_and_i(primary_featurelayer: FeatureLayer) -> FeatureLayer:
.apply(lambda x: "; ".join([val for val in x if val is not None]))
.reset_index()
)
l_and_i_complaints.rebuild_gdf()

# rename the column to 'li_violations'
l_and_i_violations.gdf.rename(
Expand All @@ -134,19 +104,6 @@ def l_and_i(primary_featurelayer: FeatureLayer) -> FeatureLayer:
"opa_account_num",
)

# Complaints need a spatial join, but we need to take special care to merge on just the parcel geoms first to get opa_id
complaints_with_opa_id: gpd.GeoDataFrame = primary_featurelayer.gdf.sjoin(
l_and_i_complaints.gdf, how="left", predicate="contains"
)
complaints_with_opa_id.drop(columns=["index_right"], inplace=True)

# Concatenate the complaints values into a list with a semicolon separator by opa_id
complaints_with_opa_id = (
complaints_with_opa_id.groupby("opa_id")["li_complaints"]
.apply(lambda x: "; ".join([str(val) for val in x if val is not None]))
.reset_index()[["opa_id", "li_complaints"]]
)

# Clean up the NaN values in the li_complaints column
def remove_nan_strings(x: str) -> str | None:
"""
Expand All @@ -163,16 +120,6 @@ def remove_nan_strings(x: str) -> str | None:
else:
return x

complaints_with_opa_id["li_complaints"] = complaints_with_opa_id[
"li_complaints"
].apply(remove_nan_strings)

# Merge the complaints values back into the primary_featurelayer
primary_featurelayer.opa_join(
complaints_with_opa_id,
"opa_id",
)

primary_featurelayer.gdf[
["all_violations_past_year", "open_violations_past_year"]
] = (
Expand Down
31 changes: 14 additions & 17 deletions data/src/data_utils/negligent_devs.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import pandas as pd


def negligent_devs(primary_featurelayer):
devs = primary_featurelayer.gdf

Expand All @@ -11,29 +10,27 @@ def negligent_devs(primary_featurelayer):
devs[["opa_id", "city_owner_agency", "standardized_address", "vacant"]].head(10)
)

# Count observations where vacant == 1 by standardized_address
vacant_counts = (
devs[devs["vacant"] == 1]
.groupby("standardized_address")
.size()
.reset_index(name="vacant_property_count")
# Count total properties and vacant properties by standardized_address
property_counts = (
devs.groupby("standardized_address")
.agg(
n_total_properties_owned=("opa_id", "size"),
n_vacant_properties_owned=("vacant", "sum"),
)
.reset_index()
)

print("Head of resulting DataFrame with vacant counts:")
print(vacant_counts.head(10))
print("Head of resulting DataFrame with property counts:")
print(property_counts.head(10))

# Merge the vacant counts back to the main DataFrame
# Merge the property counts back to the main DataFrame
primary_featurelayer.gdf = primary_featurelayer.gdf.merge(
vacant_counts, on="standardized_address", how="left"
property_counts, on="standardized_address", how="left"
)

# Identify negligent developers: non-city owned entities owning 5+ vacant properties
primary_featurelayer.gdf["n_properties_owned"] = primary_featurelayer.gdf.groupby(
"opa_id"
)["vacant_property_count"].transform("sum")

primary_featurelayer.gdf["negligent_dev"] = (
primary_featurelayer.gdf["n_properties_owned"] >= 5
primary_featurelayer.gdf["n_vacant_properties_owned"] >= 5
) & (
primary_featurelayer.gdf["city_owner_agency"].isna()
| (primary_featurelayer.gdf["city_owner_agency"] == "")
Expand All @@ -42,7 +39,7 @@ def negligent_devs(primary_featurelayer):
print("Final feature layer data with negligent_dev flag:")
print(
primary_featurelayer.gdf[
["opa_id", "n_properties_owned", "negligent_dev"]
["opa_id", "n_total_properties_owned", "n_vacant_properties_owned", "negligent_dev"]
].head(10)
)

Expand Down
10 changes: 3 additions & 7 deletions data/src/data_utils/phs_properties.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from classes.featurelayer import FeatureLayer
from constants.services import PHS_LAYERS_TO_LOAD
import pandas as pd


def phs_properties(primary_featurelayer: FeatureLayer) -> FeatureLayer:
Expand All @@ -22,13 +23,8 @@ def phs_properties(primary_featurelayer: FeatureLayer) -> FeatureLayer:
# Perform spatial join between primary feature layer and PHS properties
primary_featurelayer.spatial_join(phs_properties)

# Initialize 'phs_care_program' column with default "no" for all rows
primary_featurelayer.gdf["phs_care_program"] = "No"

# Set 'phs_care_program' to "yes" for matched rows
primary_featurelayer.gdf.loc[
primary_featurelayer.gdf["program"].notna(), "phs_care_program"
] = "Yes"
# Create 'phs_care_program' column with values from 'program', drop 'program'
primary_featurelayer.gdf["phs_care_program"] = primary_featurelayer.gdf.pop("program")

# Rebuild the GeoDataFrame after updates
primary_featurelayer.rebuild_gdf()
Expand Down
6 changes: 4 additions & 2 deletions data/src/data_utils/priority_level.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
import pandas as pd

def priority_level(dataset):
priority_levels = []
for idx, row in dataset.gdf.iterrows():
priority_level = ""

# Decision Points
guncrime_density_percentile = row["gun_crimes_density_percentile"]
in_phs_landcare = row["phs_care_program"] == "yes"
in_phs_landcare = pd.notna(row["phs_care_program"])
has_li_complaint_or_violation = (
row["li_complaints"] is not None
and float(row["all_violations_past_year"]) > 0
)
) or (row["l_and_i_complaints_density_percentile"] > 50)
very_low_tree_canopy = row["tree_canopy_gap"] >= 0.3

# Updated logic based on percentile values
Expand Down
24 changes: 19 additions & 5 deletions data/src/data_utils/pwd_parcels.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from classes.featurelayer import FeatureLayer
from constants.services import PWD_PARCELS_QUERY
import geopandas as gpd


def pwd_parcels(primary_featurelayer: FeatureLayer) -> FeatureLayer:
Expand Down Expand Up @@ -52,6 +53,13 @@ def pwd_parcels(primary_featurelayer: FeatureLayer) -> FeatureLayer:
how="left",
)

# Coerce merged_gdf into a GeoDataFrame
merged_gdf = gpd.GeoDataFrame(
merged_gdf,
geometry="geometry",
crs=primary_featurelayer.gdf.crs, # Ensure the CRS matches the original
)

# Log observations with no polygon geometry
no_geometry_count = merged_gdf["geometry"].isnull().sum()
print("Number of observations with no polygon geometry:", no_geometry_count)
Expand All @@ -62,10 +70,16 @@ def pwd_parcels(primary_featurelayer: FeatureLayer) -> FeatureLayer:
)
print("Number of observations retaining point geometry:", no_geometry_count)

# Validate the merged GeoDataFrame
updated_gdf = FeatureLayer(
name=primary_featurelayer.name,
gdf=merged_gdf,
# Count observations with point geometry grouped by 'vacant'
point_geometry_counts = (
merged_gdf[merged_gdf["geometry"].geom_type == "Point"]
.groupby("vacant")
.size()
)

return updated_gdf
# Log the results
print("Counts of point geometry grouped by 'vacant':")
print(point_geometry_counts)

# Wrap the GeoDataFrame back into a FeatureLayer
return FeatureLayer(name=primary_featurelayer.name, gdf=merged_gdf)
6 changes: 2 additions & 4 deletions data/src/data_utils/vacant_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,10 +92,8 @@ def vacant_properties(primary_featurelayer) -> FeatureLayer:
# Final null value check before returning
check_null_percentage(df)

# Create vacant column in the primary feature layer based on opa_id match
primary_featurelayer.gdf["vacant"] = (
primary_featurelayer.gdf["opa_id"].isin(df["opa_id"]).astype(int)
)
# Create vacant column in the primary feature layer as True/False
primary_featurelayer.gdf["vacant"] = primary_featurelayer.gdf["opa_id"].isin(df["opa_id"])

print("Vacant column added based on opa_id match.")

Expand Down

0 comments on commit b40c1cd

Please sign in to comment.