diff --git a/data/src/classes/featurelayer.py b/data/src/classes/featurelayer.py
index 35251706..e704ce97 100644
--- a/data/src/classes/featurelayer.py
+++ b/data/src/classes/featurelayer.py
@@ -314,35 +314,7 @@ def build_and_publish(self, tiles_file_id_prefix: str) -> None:
         self.centroid_gdf["geometry"] = self.centroid_gdf["geometry"].centroid
         self.centroid_gdf = self.centroid_gdf.to_crs(epsg=4326)
         self.centroid_gdf.to_file(temp_geojson_points, driver="GeoJSON")
-
-        # Load the GeoJSON from the polygons, drop geometry, and save as Parquet
-        gdf_polygons = gpd.read_file(temp_geojson_polygons)
-        df_no_geom = gdf_polygons.drop(columns=["geometry"])
-
-        # Check if the DataFrame has fewer than 25,000 rows
-        num_rows, num_cols = df_no_geom.shape
-        if num_rows < 25000:
-            print(
-                f"Parquet file has {num_rows} rows, which is fewer than 25,000. Skipping upload."
-            )
-            return
-
-        # Save the DataFrame as Parquet
-        df_no_geom.to_parquet(temp_parquet)
-
-        # Upload Parquet to Google Cloud Storage
-        blob_parquet = bucket.blob(f"{tiles_file_id_prefix}.parquet")
-        try:
-            blob_parquet.upload_from_filename(temp_parquet)
-            parquet_size = os.stat(temp_parquet).st_size
-            parquet_size_mb = parquet_size / (1024 * 1024)
-            print(
-                f"Parquet upload successful! Size: {parquet_size} bytes ({parquet_size_mb:.2f} MB), Dimensions: {num_rows} rows, {num_cols} columns."
-            )
-        except Exception as e:
-            print(f"Parquet upload failed: {e}")
-            return
-
+
         # Command for generating PMTiles for points up to zoom level zoom_threshold
         points_command: list[str] = [
             "tippecanoe",
diff --git a/data/src/data_utils/access_process.py b/data/src/data_utils/access_process.py
index 039843f1..7c8e79de 100644
--- a/data/src/data_utils/access_process.py
+++ b/data/src/data_utils/access_process.py
@@ -39,10 +39,5 @@ def access_process(dataset: Any) -> Any:
         access_processes.append(access_process)
 
     dataset.gdf["access_process"] = access_processes
-
-    # Print the distribution of "access_process"
-    distribution = dataset.gdf["access_process"].value_counts()
-    print("Distribution of access process:")
-    print(distribution)
-
+
     return dataset
diff --git a/data/src/data_utils/phs_properties.py b/data/src/data_utils/phs_properties.py
index e5627850..c906c2d1 100644
--- a/data/src/data_utils/phs_properties.py
+++ b/data/src/data_utils/phs_properties.py
@@ -22,10 +22,10 @@ def phs_properties(primary_featurelayer: FeatureLayer) -> FeatureLayer:
     primary_featurelayer.spatial_join(phs_properties)
 
     # Initialize 'phs_care_program' column with default "no" for all rows
-    primary_featurelayer.gdf["phs_care_program"] = "no"
+    primary_featurelayer.gdf["phs_care_program"] = "No"
 
     # Set 'phs_care_program' to "yes" for matched rows
-    primary_featurelayer.gdf.loc[primary_featurelayer.gdf["phs_care_program"] != "no", "phs_care_program"] = "yes"
+    primary_featurelayer.gdf.loc[primary_featurelayer.gdf["program"].notna(), "phs_care_program"] = "Yes"
 
     # Rebuild the GeoDataFrame after updates
     primary_featurelayer.rebuild_gdf()
diff --git a/data/src/data_utils/priority_level.py b/data/src/data_utils/priority_level.py
index 613313f2..33097de3 100644
--- a/data/src/data_utils/priority_level.py
+++ b/data/src/data_utils/priority_level.py
@@ -47,4 +47,5 @@ def priority_level(dataset):
         priority_levels.append(priority_level)
 
     dataset.gdf["priority_level"] = priority_levels
+
     return dataset
diff --git a/data/src/data_utils/vacant_properties.py b/data/src/data_utils/vacant_properties.py
index 87a8b6f7..d6573218 100644
--- a/data/src/data_utils/vacant_properties.py
+++ b/data/src/data_utils/vacant_properties.py
@@ -145,13 +145,7 @@ def vacant_properties() -> FeatureLayer:
         vacant_properties.gdf, geometry="geometry"
     )
 
-    print(
-        f"Vacant properties data size before dropping NAs: {len(vacant_properties.gdf)} rows."
-    )
     vacant_properties.gdf.dropna(subset=["opa_id"], inplace=True)
-    print(
-        f"Vacant properties data size after dropping NAs: {len(vacant_properties.gdf)} rows."
-    )
 
     # Final null value check before returning
     check_null_percentage(vacant_properties.gdf)
@@ -184,4 +178,9 @@ def vacant_properties() -> FeatureLayer:
     # Ensure concatenated data is still a GeoDataFrame
     vacant_properties.gdf = gpd.GeoDataFrame(vacant_properties.gdf, geometry="geometry")
 
+    before_drop = vacant_properties.gdf.shape[0]
+    vacant_properties.gdf = vacant_properties.gdf.drop_duplicates(subset="opa_id")
+    after_drop = vacant_properties.gdf.shape[0]
+    print(f"Duplicate vacant properties dropped: {before_drop - after_drop}")
+
     return vacant_properties
diff --git a/data/src/script.py b/data/src/script.py
index 46e1db3b..78c5f90d 100644
--- a/data/src/script.py
+++ b/data/src/script.py
@@ -80,12 +80,32 @@
 for service in services:
     dataset = service(dataset)
 
+before_drop = dataset.gdf.shape[0]
+dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id")
+after_drop = dataset.gdf.shape[0]
+print(f"Duplicate dataset rows dropped after initial services: {before_drop - after_drop}")
+
 # Add Priority Level
 dataset = priority_level(dataset)
 
+# Print the distribution of "priority_level"
+distribution = dataset.gdf["priority_level"].value_counts()
+print("Distribution of priority level:")
+print(distribution)
+
 # Add Access Process
 dataset = access_process(dataset)
 
+# Print the distribution of "access_process"
+distribution = dataset.gdf["access_process"].value_counts()
+print("Distribution of access process:")
+print(distribution)
+
+before_drop = dataset.gdf.shape[0]
+dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id")
+after_drop = dataset.gdf.shape[0]
+print(f"Duplicate final dataset rows dropped: {before_drop - after_drop}")
+
 # back up old tiles file whether we are reloading data or not
 if backup is None:
     backup = BackupArchiveDatabase()