Skip to content

Commit

Permalink
skip precommit hook
Browse files — browse the repository at this point in the history
  • Loading branch information
nlebovits committed Nov 21, 2024
1 parent cd87b34 commit b40c1cd
Show file tree
Hide file tree
Showing 10 changed files with 97 additions and 109 deletions.
20 changes: 19 additions & 1 deletion data/src/constants/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,25 @@
)

# Load data for complaints from L&I
COMPLAINTS_SQL_QUERY = f"SELECT address, service_request_id, subject, status, service_name, service_code, lat AS y, lon AS x FROM public_cases_fc WHERE requested_datetime >= '{one_year_ago}' AND lat IS NOT NULL"
COMPLAINTS_SQL_QUERY = f"""
SELECT address, service_request_id, subject, status, service_name, service_code, lat AS y, lon AS x
FROM public_cases_fc
WHERE requested_datetime >= '{one_year_ago}'
AND lat IS NOT NULL
AND (
subject ILIKE '%dumping%'
OR subject ILIKE '%blight%'
OR subject ILIKE '%rubbish%'
OR subject ILIKE '%weeds%'
OR subject ILIKE '%graffiti%'
OR subject ILIKE '%abandoned%'
OR subject ILIKE '%sanitation%'
OR subject ILIKE '%litter%'
OR subject ILIKE '%vacant%'
OR subject ILIKE '%trash%'
OR subject ILIKE '%unsafe%'
)
"""

# L&I code violations from the past year; rows without a geocode are excluded,
# and geocode_x/geocode_y are aliased to x/y for downstream point construction.
# NOTE(review): same `one_year_ago` interpolation pattern as the complaints query.
VIOLATIONS_SQL_QUERY = f"SELECT parcel_id_num, casenumber, casecreateddate, casetype, casestatus, violationnumber, violationcodetitle, violationstatus, opa_account_num, address, opa_owner, geocode_x AS x, geocode_y AS y FROM violations WHERE violationdate >= '{one_year_ago}' AND geocode_x IS NOT NULL"

Expand Down
6 changes: 4 additions & 2 deletions data/src/data_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .city_owned_properties import city_owned_properties
from .phs_properties import phs_properties
from .l_and_i import l_and_i
from .li_violations import li_violations
from .li_complaints import li_complaints
from .rco_geoms import rco_geoms
from .tree_canopy import tree_canopy
from .nbhoods import nbhoods
Expand All @@ -14,7 +15,8 @@
__all__ = [
"city_owned_properties",
"phs_properties",
"l_and_i",
"li_violations",
"li_complaints",
"rco_geoms",
"tree_canopy",
"nbhoods",
Expand Down
34 changes: 20 additions & 14 deletions data/src/data_utils/contig_neighbors.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,23 @@
import warnings
import networkx as nx
from libpysal.weights import Queen
import numpy as np


def contig_neighbors(primary_featurelayer):
# Filter the parcels to only consider vacant properties
parcels = primary_featurelayer.gdf[primary_featurelayer.gdf["vacant"] == 1]

if parcels.empty:
# Create a filtered dataframe with only vacant properties and polygon geometries
vacant_parcels = primary_featurelayer.gdf.loc[
(primary_featurelayer.gdf["vacant"]) &
(primary_featurelayer.gdf.geometry.type.isin(["Polygon", "MultiPolygon"])),
["opa_id", "geometry"]
]

if vacant_parcels.empty:
print("No vacant properties found in the dataset.")
primary_featurelayer.gdf["n_contiguous"] = 0
primary_featurelayer.gdf["n_contiguous"] = np.nan
return primary_featurelayer

print(f"Found {len(parcels)} vacant properties.")
print(f"Found {len(vacant_parcels)} vacant properties.")

with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=FutureWarning)
Expand All @@ -24,27 +29,28 @@ def contig_neighbors(primary_featurelayer):

# Create a spatial weights matrix for vacant parcels
print("Creating spatial weights matrix for vacant parcels...")
w = Queen.from_dataframe(parcels)
w = Queen.from_dataframe(vacant_parcels)

# Convert the spatial weights matrix to a NetworkX graph
print("Converting spatial weights matrix to NetworkX graph...")
g = w.to_networkx()

# Calculate the number of contiguous neighbors for each vacant property
# Calculate the number of contiguous vacant properties for each vacant parcel
print("Calculating number of contiguous vacant neighbors for each property...")
n_contiguous = {
node: len(nx.node_connected_component(g, node)) - 1 for node in g.nodes
}

# Assign the number of contiguous vacant neighbors to vacant properties
parcels["n_contiguous"] = parcels.index.map(n_contiguous).fillna(0).astype(int)
# Assign the contiguous neighbor count to the filtered vacant parcels
vacant_parcels["n_contiguous"] = vacant_parcels.index.map(n_contiguous)

print("Joining results back to primary feature layer...")
# Merge the results back to the primary feature layer
primary_featurelayer.gdf = primary_featurelayer.gdf.merge(
parcels[["opa_id", "n_contiguous"]], on="opa_id", how="left"
vacant_parcels[["opa_id", "n_contiguous"]], on="opa_id", how="left"
)

# For non-vacant properties, set the number of contiguous vacant neighbors to 0
primary_featurelayer.gdf["n_contiguous"].fillna(0, inplace=True)
# Assign NA for non-vacant properties
primary_featurelayer.gdf.loc[~primary_featurelayer.gdf["vacant"], "n_contiguous"] = np.nan

print("Process completed. Returning updated primary feature layer.")
return primary_featurelayer
8 changes: 8 additions & 0 deletions data/src/data_utils/li_complaints.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from constants.services import COMPLAINTS_SQL_QUERY


from data_utils.kde import apply_kde_to_primary


def li_complaints(primary_featurelayer):
    """Attach an L&I complaints kernel-density layer to the primary feature layer.

    Delegates entirely to ``apply_kde_to_primary``, which loads the complaint
    points selected by ``COMPLAINTS_SQL_QUERY`` and joins the resulting density
    surface onto ``primary_featurelayer``. Returns the updated feature layer.
    """
    return apply_kde_to_primary(
        primary_featurelayer,
        "L and I Complaints",
        COMPLAINTS_SQL_QUERY,
    )
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@
import geopandas as gpd
from typing import List
from classes.featurelayer import FeatureLayer
from constants.services import COMPLAINTS_SQL_QUERY, VIOLATIONS_SQL_QUERY
from constants.services import VIOLATIONS_SQL_QUERY


def l_and_i(primary_featurelayer: FeatureLayer) -> FeatureLayer:
def li_violations(primary_featurelayer: FeatureLayer) -> FeatureLayer:
"""
Process L&I (Licenses and Inspections) data for complaints and violations.
Process L&I (Licenses and Inspections) data for violations.
This function filters and processes L&I complaints and violations data,
This function filters and processes L&I violations data,
joining it with the primary feature layer based on spatial relationships
and OPA (Office of Property Assessment) identifiers.
Expand All @@ -33,35 +33,6 @@ def l_and_i(primary_featurelayer: FeatureLayer) -> FeatureLayer:
"unsafe",
]

# Load complaints data from L&I
l_and_i_complaints: FeatureLayer = FeatureLayer(
name="LI Complaints", carto_sql_queries=COMPLAINTS_SQL_QUERY
)

# Filter for rows where 'subject' contains any of the keywords
l_and_i_complaints.gdf = l_and_i_complaints.gdf[
l_and_i_complaints.gdf["subject"].str.lower().str.contains("|".join(keywords))
]

# Filter for only Status = 'Open'
l_and_i_complaints.gdf = l_and_i_complaints.gdf[
l_and_i_complaints.gdf["status"].str.lower() == "open"
]

# Group by geometry and concatenate the violationcodetitle values into a list with a semicolon separator
l_and_i_complaints.gdf = (
l_and_i_complaints.gdf.groupby("geometry")["service_name"]
.apply(lambda x: "; ".join([val for val in x if val is not None]))
.reset_index()
)

l_and_i_complaints.rebuild_gdf()

# rename the column to 'li_complaints'
l_and_i_complaints.gdf.rename(
columns={"service_name": "li_complaints"}, inplace=True
)

# Load data for violations from L&I
l_and_i_violations: FeatureLayer = FeatureLayer(
name="LI Violations", carto_sql_queries=VIOLATIONS_SQL_QUERY, from_xy=True
Expand Down Expand Up @@ -121,7 +92,6 @@ def l_and_i(primary_featurelayer: FeatureLayer) -> FeatureLayer:
.apply(lambda x: "; ".join([val for val in x if val is not None]))
.reset_index()
)
l_and_i_complaints.rebuild_gdf()

# rename the column to 'li_violations'
l_and_i_violations.gdf.rename(
Expand All @@ -134,19 +104,6 @@ def l_and_i(primary_featurelayer: FeatureLayer) -> FeatureLayer:
"opa_account_num",
)

# Complaints need a spatial join, but we need to take special care to merge on just the parcel geoms first to get opa_id
complaints_with_opa_id: gpd.GeoDataFrame = primary_featurelayer.gdf.sjoin(
l_and_i_complaints.gdf, how="left", predicate="contains"
)
complaints_with_opa_id.drop(columns=["index_right"], inplace=True)

# Concatenate the complaints values into a list with a semicolon separator by opa_id
complaints_with_opa_id = (
complaints_with_opa_id.groupby("opa_id")["li_complaints"]
.apply(lambda x: "; ".join([str(val) for val in x if val is not None]))
.reset_index()[["opa_id", "li_complaints"]]
)

# Clean up the NaN values in the li_complaints column
def remove_nan_strings(x: str) -> str | None:
"""
Expand All @@ -163,16 +120,6 @@ def remove_nan_strings(x: str) -> str | None:
else:
return x

complaints_with_opa_id["li_complaints"] = complaints_with_opa_id[
"li_complaints"
].apply(remove_nan_strings)

# Merge the complaints values back into the primary_featurelayer
primary_featurelayer.opa_join(
complaints_with_opa_id,
"opa_id",
)

primary_featurelayer.gdf[
["all_violations_past_year", "open_violations_past_year"]
] = (
Expand Down
31 changes: 14 additions & 17 deletions data/src/data_utils/negligent_devs.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import pandas as pd


def negligent_devs(primary_featurelayer):
devs = primary_featurelayer.gdf

Expand All @@ -11,29 +10,27 @@ def negligent_devs(primary_featurelayer):
devs[["opa_id", "city_owner_agency", "standardized_address", "vacant"]].head(10)
)

# Count observations where vacant == 1 by standardized_address
vacant_counts = (
devs[devs["vacant"] == 1]
.groupby("standardized_address")
.size()
.reset_index(name="vacant_property_count")
# Count total properties and vacant properties by standardized_address
property_counts = (
devs.groupby("standardized_address")
.agg(
n_total_properties_owned=("opa_id", "size"),
n_vacant_properties_owned=("vacant", "sum"),
)
.reset_index()
)

print("Head of resulting DataFrame with vacant counts:")
print(vacant_counts.head(10))
print("Head of resulting DataFrame with property counts:")
print(property_counts.head(10))

# Merge the vacant counts back to the main DataFrame
# Merge the property counts back to the main DataFrame
primary_featurelayer.gdf = primary_featurelayer.gdf.merge(
vacant_counts, on="standardized_address", how="left"
property_counts, on="standardized_address", how="left"
)

# Identify negligent developers: non-city owned entities owning 5+ vacant properties
primary_featurelayer.gdf["n_properties_owned"] = primary_featurelayer.gdf.groupby(
"opa_id"
)["vacant_property_count"].transform("sum")

primary_featurelayer.gdf["negligent_dev"] = (
primary_featurelayer.gdf["n_properties_owned"] >= 5
primary_featurelayer.gdf["n_vacant_properties_owned"] >= 5
) & (
primary_featurelayer.gdf["city_owner_agency"].isna()
| (primary_featurelayer.gdf["city_owner_agency"] == "")
Expand All @@ -42,7 +39,7 @@ def negligent_devs(primary_featurelayer):
print("Final feature layer data with negligent_dev flag:")
print(
primary_featurelayer.gdf[
["opa_id", "n_properties_owned", "negligent_dev"]
["opa_id", "n_total_properties_owned", "n_vacant_properties_owned", "negligent_dev"]
].head(10)
)

Expand Down
10 changes: 3 additions & 7 deletions data/src/data_utils/phs_properties.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from classes.featurelayer import FeatureLayer
from constants.services import PHS_LAYERS_TO_LOAD
import pandas as pd


def phs_properties(primary_featurelayer: FeatureLayer) -> FeatureLayer:
Expand All @@ -22,13 +23,8 @@ def phs_properties(primary_featurelayer: FeatureLayer) -> FeatureLayer:
# Perform spatial join between primary feature layer and PHS properties
primary_featurelayer.spatial_join(phs_properties)

# Initialize 'phs_care_program' column with default "no" for all rows
primary_featurelayer.gdf["phs_care_program"] = "No"

# Set 'phs_care_program' to "yes" for matched rows
primary_featurelayer.gdf.loc[
primary_featurelayer.gdf["program"].notna(), "phs_care_program"
] = "Yes"
# Create 'phs_care_program' column with values from 'program', drop 'program'
primary_featurelayer.gdf["phs_care_program"] = primary_featurelayer.gdf.pop("program")

# Rebuild the GeoDataFrame after updates
primary_featurelayer.rebuild_gdf()
Expand Down
6 changes: 4 additions & 2 deletions data/src/data_utils/priority_level.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
import pandas as pd

def priority_level(dataset):
priority_levels = []
for idx, row in dataset.gdf.iterrows():
priority_level = ""

# Decision Points
guncrime_density_percentile = row["gun_crimes_density_percentile"]
in_phs_landcare = row["phs_care_program"] == "yes"
in_phs_landcare = pd.notna(row["phs_care_program"])
has_li_complaint_or_violation = (
row["li_complaints"] is not None
and float(row["all_violations_past_year"]) > 0
)
) or (row["l_and_i_complaints_density_percentile"] > 50)
very_low_tree_canopy = row["tree_canopy_gap"] >= 0.3

# Updated logic based on percentile values
Expand Down
24 changes: 19 additions & 5 deletions data/src/data_utils/pwd_parcels.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from classes.featurelayer import FeatureLayer
from constants.services import PWD_PARCELS_QUERY
import geopandas as gpd


def pwd_parcels(primary_featurelayer: FeatureLayer) -> FeatureLayer:
Expand Down Expand Up @@ -52,6 +53,13 @@ def pwd_parcels(primary_featurelayer: FeatureLayer) -> FeatureLayer:
how="left",
)

# Coerce merged_gdf into a GeoDataFrame
merged_gdf = gpd.GeoDataFrame(
merged_gdf,
geometry="geometry",
crs=primary_featurelayer.gdf.crs, # Ensure the CRS matches the original
)

# Log observations with no polygon geometry
no_geometry_count = merged_gdf["geometry"].isnull().sum()
print("Number of observations with no polygon geometry:", no_geometry_count)
Expand All @@ -62,10 +70,16 @@ def pwd_parcels(primary_featurelayer: FeatureLayer) -> FeatureLayer:
)
print("Number of observations retaining point geometry:", no_geometry_count)

# Validate the merged GeoDataFrame
updated_gdf = FeatureLayer(
name=primary_featurelayer.name,
gdf=merged_gdf,
# Count observations with point geometry grouped by 'vacant'
point_geometry_counts = (
merged_gdf[merged_gdf["geometry"].geom_type == "Point"]
.groupby("vacant")
.size()
)

return updated_gdf
# Log the results
print("Counts of point geometry grouped by 'vacant':")
print(point_geometry_counts)

# Wrap the GeoDataFrame back into a FeatureLayer
return FeatureLayer(name=primary_featurelayer.name, gdf=merged_gdf)
6 changes: 2 additions & 4 deletions data/src/data_utils/vacant_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,10 +92,8 @@ def vacant_properties(primary_featurelayer) -> FeatureLayer:
# Final null value check before returning
check_null_percentage(df)

# Create vacant column in the primary feature layer based on opa_id match
primary_featurelayer.gdf["vacant"] = (
primary_featurelayer.gdf["opa_id"].isin(df["opa_id"]).astype(int)
)
# Create vacant column in the primary feature layer as True/False
primary_featurelayer.gdf["vacant"] = primary_featurelayer.gdf["opa_id"].isin(df["opa_id"])

print("Vacant column added based on opa_id match.")

Expand Down

0 comments on commit b40c1cd

Please sign in to comment.