From caedc95b910ce6b25d23d628a66feee951bd5d6e Mon Sep 17 00:00:00 2001 From: nlebovits Date: Fri, 25 Oct 2024 15:34:57 -0400 Subject: [PATCH 1/3] quick fix for labeling percentiles issue --- data/src/data_utils/kde.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/data/src/data_utils/kde.py b/data/src/data_utils/kde.py index 8741dacc..477cc306 100644 --- a/data/src/data_utils/kde.py +++ b/data/src/data_utils/kde.py @@ -147,11 +147,13 @@ def apply_kde_to_primary(primary_featurelayer, name, query, resolution=resolutio def label_percentile(value): - if value == 1: - return "1st Percentile" - elif value == 2: - return "2nd Percentile" - elif value == 3: - return "3rd Percentile" + if 10 <= value % 100 <= 13: + return f"{value}th Percentile" + elif value % 10 == 1: + return f"{value}st Percentile" + elif value % 10 == 2: + return f"{value}nd Percentile" + elif value % 10 == 3: + return f"{value}rd Percentile" else: return f"{value}th Percentile" From 0a87dcca7a7a677cb98c89f680411c4ea376ed27 Mon Sep 17 00:00:00 2001 From: nlebovits Date: Fri, 25 Oct 2024 16:02:55 -0400 Subject: [PATCH 2/3] fix issue with empty strings --- data/src/data_utils/negligent_devs.py | 48 +++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/data/src/data_utils/negligent_devs.py b/data/src/data_utils/negligent_devs.py index ba601cd0..aa95532c 100644 --- a/data/src/data_utils/negligent_devs.py +++ b/data/src/data_utils/negligent_devs.py @@ -58,18 +58,38 @@ def create_standardized_address(row): def negligent_devs(primary_featurelayer): devs = primary_featurelayer.gdf - city_owners = devs.loc[~devs["city_owner_agency"].isna()].copy() - non_city_owners = devs.loc[devs["city_owner_agency"].isna()].copy() + + print("Columns in 'devs' DataFrame:", devs.columns) + + print("Initial properties data:") + print(devs[['opa_id', 'city_owner_agency', 'mailing_street']].head(10)) + + city_owners = devs.loc[~devs["city_owner_agency"].isna() & (devs["city_owner_agency"] != "")].copy() + non_city_owners = devs.loc[devs["city_owner_agency"].isna() | (devs["city_owner_agency"] == "")].copy() + + print(f"City owners shape: {city_owners.shape}, Non-city owners shape: {non_city_owners.shape}") + + # Log before standardizing addresses + print("Non-city owners mailing streets before standardization:") + print(non_city_owners[['opa_id', 'mailing_street']].head(10)) non_city_owners.loc[:, "mailing_street"] = ( non_city_owners["mailing_street"].astype(str).apply(standardize_street) ) + print("Non-city owners mailing streets after standardization:") + print(non_city_owners[['opa_id', 'mailing_street']].head(10)) + for term in ["ST", "AVE", "RD", "BLVD"]: non_city_owners.loc[:, "mailing_street"] = non_city_owners[ "mailing_street" ].replace(regex={f"{term}.*": term}) + # Log after applying term replacement + print("Non-city owners mailing streets after term replacement:") + print(non_city_owners[['opa_id', 'mailing_street']].head(10)) + + # Fill missing address components non_city_owners.loc[:, "mailing_address_1"] = non_city_owners[ "mailing_address_1" ].fillna("") @@ -84,33 +104,52 @@ def negligent_devs(primary_featurelayer): ].fillna("") non_city_owners.loc[:, "mailing_zip"] = non_city_owners["mailing_zip"].fillna("") + # Log addresses before creating standardized address + print("Non-city owners mailing details before creating standardized address:") + print(non_city_owners[['opa_id', 'mailing_street', 'mailing_city_state', 'mailing_zip']].head(10)) + non_city_owners.loc[:, "standardized_address"] = non_city_owners.apply( create_standardized_address, axis=1 ) + # Log standardized addresses and counts + print("Standardized addresses with counts:") address_counts = ( non_city_owners.groupby("standardized_address") .size() .reset_index(name="property_count") ) + print(address_counts.head(10)) + sorted_address_counts = address_counts.sort_values( by="property_count", ascending=False ) + print("Top standardized addresses by property count:") + print(sorted_address_counts.head(10)) non_city_owners = non_city_owners.merge( sorted_address_counts, on="standardized_address", how="left" ) + # Log merged data for city owners city_owner_counts = ( city_owners.groupby("city_owner_agency") .size() .reset_index(name="property_count") ) + print("City owner counts:") + print(city_owner_counts.head(10)) + city_owners = city_owners.merge( city_owner_counts, on="city_owner_agency", how="left" ) devs_combined = pd.concat([city_owners, non_city_owners], axis=0) + + # Final check on the merged data before updating primary_featurelayer + print("Combined data with property counts:") + print(devs_combined[['opa_id', 'property_count']].head(10)) + primary_featurelayer.gdf = primary_featurelayer.gdf.merge( devs_combined[["opa_id", "property_count"]], on="opa_id", how="left" ) @@ -119,6 +158,9 @@ def negligent_devs(primary_featurelayer): ) primary_featurelayer.gdf.loc[:, "negligent_dev"] = ( primary_featurelayer.gdf["n_properties_owned"] > 5 - ) & (primary_featurelayer.gdf["city_owner_agency"].isna()) + ) & (primary_featurelayer.gdf["city_owner_agency"].isna() | (primary_featurelayer.gdf["city_owner_agency"] == "")) + + print("Final feature layer data with negligent_dev flag:") + print(primary_featurelayer.gdf[['opa_id', 'n_properties_owned', 'negligent_dev']].head(10)) return primary_featurelayer From ad413d4be5c180058ca5e44d242d9035c1f7a00f Mon Sep 17 00:00:00 2001 From: nlebovits Date: Fri, 25 Oct 2024 20:25:43 -0400 Subject: [PATCH 3/3] ignore report mvp docs for the moment --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 934cdf40..a4fe2518 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ data/src/tmp .DS_Store /data/src/local_outputs/ /data/notebooks/ +/data/reports/ ## App