From f9fde5cd1d6eb451cb98303e68e4f885a3432b41 Mon Sep 17 00:00:00 2001 From: Joseph Xu Date: Thu, 12 Oct 2023 23:16:06 -0700 Subject: [PATCH] Propagate building properties to generated examples. This will enable the inference pipeline to propagate these same properties to the output file. This CL also simplifies the handling of building coordinates in the example generation pipeline as a bonus. PiperOrigin-RevId: 573114475 --- src/generate_examples_main.py | 40 +++--- src/skai/buildings.py | 124 ++++++++++++++----- src/skai/buildings_test.py | 125 +++++++++++++++++-- src/skai/earth_engine.py | 72 +++++------ src/skai/generate_examples.py | 161 ++++++++++++------------ src/skai/generate_examples_test.py | 190 +++++++++++++++++++++++------ src/skai/open_street_map.py | 20 +-- src/skai/read_raster.py | 27 ++-- src/skai/read_raster_test.py | 34 ++++-- src/skai/utils.py | 17 +-- 10 files changed, 541 insertions(+), 269 deletions(-) diff --git a/src/generate_examples_main.py b/src/generate_examples_main.py index 3e47109f..a87ef8ff 100644 --- a/src/generate_examples_main.py +++ b/src/generate_examples_main.py @@ -36,6 +36,7 @@ --cloud_region=us-west1 """ +import os import time from typing import List @@ -164,6 +165,8 @@ Polygon = shapely.geometry.polygon.Polygon +BUILDINGS_FILE_NAME = 'processed_buildings.parquet' + def _read_image_config(path: str) -> List[str]: with tf.io.gfile.GFile(path, 'r') as f: @@ -207,35 +210,32 @@ def main(args): raise ValueError('At least labels_file (for labeled examples extraction) ' 'or buildings_method != none (for unlabeled data) should ' 'be specified.') - if config.buildings_method != 'none': + + buildings_path = os.path.join(config.output_dir, BUILDINGS_FILE_NAME) + if config.labels_file: + generate_examples.read_labels_file( + config.labels_file, + config.label_property, + config.labels_to_classes, + config.num_keep_labeled_examples, + buildings_path + ) + buildings_labeled = True + else: if config.aoi_path: aois = buildings.read_aois(config.aoi_path) else: aois = [read_raster.get_raster_bounds(path, gdal_env) for path in after_image_patterns] try: - building_centroids = generate_examples.get_building_centroids( - config, aois + generate_examples.download_building_footprints( + config, aois, buildings_path ) except generate_examples.NotInitializedEarthEngineError: logging.fatal('Could not initialize Earth Engine.', exc_info=True) except generate_examples.NoBuildingFoundError: logging.fatal('No building is found.', exc_info=True) - logging.info('Found %d buildings in area of interest.', - len(building_centroids)) - else: - # Only if one wants to extract labeled examples and labels_file is provided. 
-    building_centroids = []
-
-  if config.labels_file:
-    labeled_coordinates = generate_examples.read_labels_file(
-        config.labels_file,
-        config.label_property,
-        config.labels_to_classes,
-        config.num_keep_labeled_examples,
-    )
-  else:
-    labeled_coordinates = []
+  buildings_labeled = False
 
   generate_examples.generate_examples_pipeline(
       before_image_patterns,
@@ -245,8 +245,8 @@ def main(args):
       config.resolution,
       config.output_dir,
       config.output_shards,
-      building_centroids,
-      labeled_coordinates,
+      buildings_path,
+      buildings_labeled,
       config.use_dataflow,
       gdal_env,
       timestamped_dataset,
diff --git a/src/skai/buildings.py b/src/skai/buildings.py
index 921b60a4..1841ab2c 100644
--- a/src/skai/buildings.py
+++ b/src/skai/buildings.py
@@ -14,7 +14,7 @@
 
 """Functions for reading building centroids from files."""
 
-from typing import List, Tuple
+import os
 import geopandas as gpd
 import pandas as pd
 import shapely.geometry
@@ -24,8 +24,8 @@
 Polygon = shapely.geometry.polygon.Polygon
 
 
-def _read_buildings_csv(path: str) -> List[Tuple[float, float]]:
-  """Reads (longitude, latitude) coordinates from a CSV file.
+def _read_buildings_csv(path: str) -> gpd.GeoDataFrame:
+  """Reads a CSV file containing building footprints into a GeoDataFrame.
 
-  The file should contain "longitude" and "latitude" columns.
+  The file should contain either a "geometry" or "wkt" column of WKT
+  strings, or "longitude" and "latitude" columns.
 
@@ -40,48 +40,48 @@ def _read_buildings_csv(path: str) -> List[Tuple[float, float]]:
   """
   with tf.io.gfile.GFile(path, 'r') as csv_file:
     df = pd.read_csv(csv_file)
-  if 'longitude' not in df.columns or 'latitude' not in df.columns:
-    raise ValueError(
-        f'Malformed CSV file "{path}". File does not contain "longitude" and '
-        '"latitude" columns')
-  return [(row.longitude, row.latitude) for _, row in df.iterrows()]
+  if 'geometry' in df.columns:
+    geometries = gpd.GeoSeries.from_wkt(df['geometry'])
+    df.drop(columns=['geometry'], inplace=True)
+  elif 'wkt' in df.columns:
+    geometries = gpd.GeoSeries.from_wkt(df['wkt'])
+    df.drop(columns=['wkt'], inplace=True)
+  elif 'longitude' in df.columns and 'latitude' in df.columns:
+    geometries = gpd.points_from_xy(df['longitude'], df['latitude'])
+    df.drop(columns=['longitude', 'latitude'], inplace=True)
+  else:
+    raise ValueError(f'No geometry information found in file "{path}"')
+
+  return gpd.GeoDataFrame(df, geometry=geometries, crs=4326)
 
 
-def read_buildings_file(path: str,
-                        regions: List[Polygon]) -> List[Tuple[float, float]]:
-  """Extracts building coordinates from a file.
+def convert_buildings_file(
+    path: str, regions: list[Polygon], output_path: str
+) -> None:
+  """Converts an input file encoding building footprints to the standard format.
 
-  Supported file formats are csv, shapefile, and geojson.
+  Also filters out any buildings that don't fall in one of the specified region
+  polygons.
+
+  Supported file formats are csv and anything that GeoPandas handles.
 
   Args:
     path: Path to buildings file.
-    regions: Regions to where building coordinates should come from.
+    regions: Regions that buildings must intersect in order to be kept.
-
-  Returns:
-    List of (longitude, latitude) building coordinates.
+    output_path: Path to write buildings GeoDataFrame to.
""" if path.lower().endswith('.csv'): - coords = _read_buildings_csv(path) + buildings_gdf = _read_buildings_csv(path) else: - coords = [] - df = gpd.read_file(path).to_crs(epsg=4326) - geometries = list(df.geometry.values) - for g in geometries: - centroid = g.centroid - coords.append((centroid.x, centroid.y)) - - filtered_coords = [] - for lon, lat in coords: - point = Point(lon, lat) - for region in regions: - if region.intersects(point): - filtered_coords.append((lon, lat)) - break + with tf.io.gfile.GFile(path, 'rb') as f: + buildings_gdf = gpd.read_file(f).to_crs(epsg=4326) - return filtered_coords + combined_regions = gpd.GeoSeries(regions).unary_union + in_regions = buildings_gdf.intersects(combined_regions) + write_buildings_file(buildings_gdf[in_regions], output_path) -def read_aois(path: str) -> List[Polygon]: +def read_aois(path: str) -> list[Polygon]: """Reads area of interest polygons from a file. Common file formats such as shapefile and GeoJSON are supported. However, the @@ -106,3 +106,63 @@ def read_aois(path: str) -> List[Polygon]: raise ValueError( f'Unexpected geometry for area of interest: "{g.geometryType()}"') return geometries + + +def write_buildings_file(gdf: gpd.GeoDataFrame, output_path: str) -> None: + """Writes a GeoDataFrame of building geometries to file. + + Serializes GeoDataFrame using Parquet file format to allow fast reading of + individual columns, such as longitude and latitude, in large datasets. + + Args: + gdf: GeoDataFrame of building geometries. + output_path: Output path. + """ + if 'longitude' not in gdf.columns and 'latitude' not in gdf.columns: + centroids = gdf.geometry.centroid + output_gdf = gdf.copy().to_crs(4326) + output_gdf['longitude'] = [c.x for c in centroids] + output_gdf['latitude'] = [c.y for c in centroids] + else: + output_gdf = gdf.to_crs(4326) + + output_dir = os.path.dirname(output_path) + if not tf.io.gfile.exists(output_dir): + tf.io.gfile.makedirs(output_dir) + with tf.io.gfile.GFile(output_path, 'wb') as f: + f.closed = False + output_gdf.to_parquet(f, index=False) + + +def read_buildings_file(path: str) -> gpd.GeoDataFrame: + """Reads a GeoDataFrame of building geometries from file. + + The GeoDataFrame must have been serialized by the write_buildings_file + function defined above. + + Args: + path: Path to serialized GeoDataFrame. + + Returns: + Buildings GeoDataFrame. + """ + with tf.io.gfile.GFile(path, 'rb') as f: + f.closed = False # Work-around for GFile issue. + return gpd.read_parquet(f).to_crs(4326) + + +def read_building_coordinates(path: str) -> pd.DataFrame: + """Reads only the longitude and latitude columns of a buildings file. + + The GeoDataFrame must have been serialized by the write_buildings_file + function defined above. + + Args: + path: Path to buildings file. Should be a GeoDataFrame in parquet format. + + Returns: + DataFrame (not GeoDataFrame) containing + """ + with tf.io.gfile.GFile(path, 'rb') as f: + f.closed = False # Work-around for GFile issue. 
+ return pd.read_parquet(f, columns=['longitude', 'latitude']) diff --git a/src/skai/buildings_test.py b/src/skai/buildings_test.py index 857e3c51..fa864e86 100644 --- a/src/skai/buildings_test.py +++ b/src/skai/buildings_test.py @@ -14,7 +14,12 @@ """Tests for buildings.py.""" import pathlib +import tempfile from absl.testing import absltest +import geopandas as gpd +import geopandas.testing +import numpy as np +import pandas as pd import shapely.geometry from skai import buildings @@ -34,29 +39,99 @@ def get_test_file_path(relative_test_data_path: str) -> str: return str(current_dir / relative_test_data_path) +def get_temp_file(suffix: str = '') -> str: + return tempfile.mkstemp(suffix=suffix, dir=absltest.TEST_TMPDIR.value)[1] + + +def create_test_buildings_gdf(num_buildings: int) -> gpd.GeoDataFrame: + longitudes = np.random.uniform(low=-180, high=180, size=(num_buildings,)) + latitudes = np.random.uniform(low=-90, high=90, size=(num_buildings,)) + centroids = gpd.points_from_xy(longitudes, latitudes) + ids = [str(i) for i in range(num_buildings)] + return gpd.GeoDataFrame({'id': ids}, geometry=centroids, crs=4326) + + class BuildingsTest(absltest.TestCase): - def testReadBuildingsFileGeoJSON(self): + def test_convert_buildings_file_from_geojson(self): path = get_test_file_path('test_data/building_centroids.geojson') + output_path = get_temp_file() regions = [Polygon.from_bounds(178.78737, -16.65851, 178.81098, -16.63617)] - coords = buildings.read_buildings_file(path, regions) + buildings.convert_buildings_file(path, regions, output_path) + buildings_gdf = buildings.read_buildings_file(output_path) # The input file, "building_centroids.geojson", has 102 centroids. However, # one centroid will be filtered out because it doesn't fall within the # region boundaries. Hence there should be 101 centroids. - self.assertLen(coords, 101) + self.assertLen(buildings_gdf, 101) - def testReadBuildingsFileCSV(self): + def test_convert_buildings_file_from_csv(self): path = get_test_file_path('test_data/building_centroids.csv') + output_path = get_temp_file() regions = [Polygon.from_bounds(178.78737, -16.65851, 178.81098, -16.63617)] - coords = buildings.read_buildings_file(path, regions) + buildings.convert_buildings_file(path, regions, output_path) + buildings_gdf = buildings.read_buildings_file(output_path) # The input file, "building_centroids.csv", has 102 centroids. However, # one centroid will be filtered out because it doesn't fall within the # region boundaries. Hence there should be 101 centroids. 
- self.assertLen(coords, 101) + self.assertLen(buildings_gdf, 101) - def testReadAOIs(self): + def test_convert_buildings_file_from_csv_with_wkt(self): + buildings_data = [ + { + 'latitude': 29.99202352, + 'longitude': -4.99707854, + 'area_in_meters': 26.1683, + 'confidence': 0.8192, + 'geometry': ( + 'POLYGON((-4.9970664523855 29.9919846910361, -4.99705832764922' + ' 29.9920597015168, -4.99709061807397 29.992062351673,' + ' -4.99709874278586 29.9919873411914, -4.9970664523855' + ' 29.9919846910361))' + ), + 'full_plus_code': '7CXQX2R3+R53V', + }, + { + 'latitude': 29.99214936, + 'longitude': -4.99758957, + 'area_in_meters': 31.8337, + 'confidence': 0.8393, + 'geometry': ( + 'POLYGON((-4.99758309542527 29.9921067182802, -4.99755994204994' + ' 29.9921837854051, -4.99759604572576 29.9921920041646,' + ' -4.99761919907427 29.9921149370345, -4.99758309542527' + ' 29.9921067182802))' + ), + 'full_plus_code': '7CXQX2R2+VX3R', + }, + ] + path = get_temp_file(suffix='.csv') + pd.DataFrame(buildings_data).to_csv(path, index=False) + regions = [Polygon.from_bounds(-180, -90, 180, 90)] + output_path = get_temp_file() + buildings.convert_buildings_file(path, regions, output_path) + buildings_gdf = buildings.read_buildings_file(output_path) + + self.assertLen(buildings_gdf, 2) + self.assertContainsSubset( + [ + 'longitude', + 'latitude', + 'area_in_meters', + 'confidence', + 'full_plus_code', + ], + buildings_gdf.columns, + ) + expected_geometries = gpd.GeoSeries.from_wkt( + [b['geometry'] for b in buildings_data], crs=4326 + ) + geopandas.testing.assert_geoseries_equal( + buildings_gdf.geometry, expected_geometries, check_less_precise=True + ) + + def test_read_aois(self): path = get_test_file_path('test_data/aoi.geojson') aois = buildings.read_aois(path) self.assertLen(aois, 2) @@ -75,6 +150,42 @@ def testReadAOIs(self): [178.820579612784513, -16.644811608514196], [178.823626242664915, -16.641714920481871]]))) + def test_read_write_building_files(self): + path = get_temp_file() + buildings_gdf = create_test_buildings_gdf(20) + buildings.write_buildings_file(buildings_gdf, path) + loaded_buildings_gdf = buildings.read_buildings_file(path) + geopandas.testing.assert_geoseries_equal( + buildings_gdf.geometry, loaded_buildings_gdf.geometry + ) + self.assertContainsSubset( + ['longitude', 'latitude', 'id'], loaded_buildings_gdf.columns + ) + np.testing.assert_allclose( + loaded_buildings_gdf.longitude.values, + [c.x for c in buildings_gdf.geometry], + ) + np.testing.assert_allclose( + loaded_buildings_gdf.latitude.values, + [c.y for c in buildings_gdf.geometry], + ) + + def test_read_write_building_coordinates(self): + path = get_temp_file() + buildings_gdf = create_test_buildings_gdf(20) + buildings.write_buildings_file(buildings_gdf, path) + coordinates_df = buildings.read_building_coordinates(path) + self.assertContainsSubset( + ['longitude', 'latitude'], coordinates_df.columns + ) + np.testing.assert_allclose( + coordinates_df.longitude.values, + [c.x for c in buildings_gdf.geometry], + ) + np.testing.assert_allclose( + coordinates_df.latitude.values, + [c.y for c in buildings_gdf.geometry], + ) if __name__ == '__main__': absltest.main() diff --git a/src/skai/earth_engine.py b/src/skai/earth_engine.py index 2a52c5a8..74c4a1e8 100644 --- a/src/skai/earth_engine.py +++ b/src/skai/earth_engine.py @@ -16,7 +16,6 @@ import json import shutil -from typing import List, Optional, Tuple import urllib.request from absl import logging @@ -24,6 +23,7 @@ import geopandas as gpd import pandas as pd import 
shapely.geometry
+from skai import buildings
 import tensorflow as tf
 
 ShapelyGeometry = shapely.geometry.base.BaseGeometry
@@ -43,17 +43,14 @@ def _get_open_building_feature_centroid(feature: ee.Feature) -> ee.Feature:
 
 
 def _download_feature_collection(
-    collection: ee.FeatureCollection, properties: List[str],
-    output_path: str) -> gpd.GeoDataFrame:
+    collection: ee.FeatureCollection, properties: list[str],
+    output_path: str) -> None:
-  """Downloads a FeatureCollection from Earth Engine as a GeoDataFrame.
+  """Downloads a FeatureCollection from Earth Engine to a buildings file.
 
   Args:
     collection: EE FeatureCollection to download.
     properties: List of properties to download.
-    output_path: Path to save CSV file to.
-
-  Returns:
-    FeatureCollection data as a GeoDataFrame.
+    output_path: Path to write the buildings file to (parquet format).
   """
   url = collection.getDownloadURL('csv', properties)
   with urllib.request.urlopen(url) as url_file, tf.io.gfile.GFile(
@@ -70,19 +67,20 @@
     properties = df.drop(columns=['.geo'])
   elif 'longitude' in df.columns and 'latitude' in df.columns:
     geometry = gpd.points_from_xy(df['longitude'], df['latitude'])
-    properties = df.drop(columns=['longitude', 'latitude'])
+    properties = df
   else:
-    geometry = None
-    properties = None
+    raise ValueError('No geometries found in feature collection.')
 
-  return gpd.GeoDataFrame(properties, geometry=geometry)
+  buildings.write_buildings_file(
+      gpd.GeoDataFrame(properties, geometry=geometry), output_path
+  )
 
 
-def get_open_buildings(regions: List[ShapelyGeometry],
+def get_open_buildings(regions: list[ShapelyGeometry],
                        collection: str,
                        confidence: float,
                        as_centroids: bool,
-                       output_path: str) -> gpd.GeoDataFrame:
+                       output_path: str) -> None:
   """Downloads Open Buildings footprints for the Area of Interest from EE.
 
   Args:
@@ -90,11 +88,7 @@
     collection: Name of Earth Engine FeatureCollection containing footprints.
     confidence: Confidence threshold for included buildings.
     as_centroids: If true, download centroids instead of full footprints.
-    output_path: Save footprints to this file in addition to returning the
-      GeoDataFrame.
-
-  Returns:
-    GeoDataFrame of building footprints.
+    output_path: Save footprints to this file in the standard buildings file
+      format (parquet).
   """
   bounds = ee.FeatureCollection([_shapely_to_ee_feature(r) for r in regions])
   open_buildings = ee.FeatureCollection(collection)
@@ -102,34 +96,26 @@
   aoi_buildings = aoi_buildings.filter(f'confidence >= {confidence}')
   if as_centroids:
     centroids = aoi_buildings.map(_get_open_building_feature_centroid)
-    return _download_feature_collection(centroids, ['longitude', 'latitude'],
-                                        output_path)
+    _download_feature_collection(
+        centroids,
+        [
+            'longitude',
+            'latitude',
+            'full_plus_code',
+            'confidence',
+            'area_in_meters',
+        ],
+        output_path,
+    )
   else:
-    return _download_feature_collection(aoi_buildings, ['.geo'], output_path)
-
-
-def get_open_buildings_centroids(
-    regions: List[ShapelyGeometry],
-    collection: str,
-    confidence: float,
-    output_path: str) -> List[Tuple[float, float]]:
-  """Downloads Open Buildings footprints as centroids of regions of interest.
-
-  Args:
-    regions: List of regions as shapely geometries.
-    collection: Name of Earth Engine FeatureCollection containing footprints.
-    confidence: Confidence threshold for included buildings.
-    output_path: Save footprints to this file in addition to returning the
-      GeoDataFrame.
-
-  Returns:
-    List of (longitude, latitude) building centroids.
-  """
-  gdf = get_open_buildings(regions, collection, confidence, True, output_path)
-  return list(zip(gdf['geometry'].x, gdf['geometry'].y))
+    _download_feature_collection(
+        aoi_buildings,
+        ['.geo', 'full_plus_code', 'confidence', 'area_in_meters'],
+        output_path,
+    )
 
 
-def initialize(service_account: str, private_key: Optional[str]) -> bool:
+def initialize(service_account: str, private_key: str | None) -> bool:
   """Initializes EE server connection.
 
   When not using a service account, this function assumes that the user has
diff --git a/src/skai/generate_examples.py b/src/skai/generate_examples.py
index 49c50fd2..b2c0e51f 100644
--- a/src/skai/generate_examples.py
+++ b/src/skai/generate_examples.py
@@ -203,9 +203,9 @@ def __init__(self):
     super().__init__('Earth Engine could not be initialized.')
 
 
-def get_building_centroids(
-    config, regions: List[Polygon]
-) -> List[Tuple[float, float]]:
+def download_building_footprints(
+    config, regions: list[Polygon], output_path: str
+) -> None:
-  """Finds building centroids based on flag settings.
+  """Downloads building footprints based on flag settings.
 
   This function is meant to be called from generate_examples_main.py.
 
@@ -214,21 +214,20 @@
     config: A configuration object that specifies how to get the building
      centroids.
     regions: List of polygons of regions to find buildings in.
-
-  Returns:
-    List of building centroids in (longitude, latitude) format.
+    output_path: Path to write buildings file to.
 
   Raises:
     ValueError: if buildings_method flag has unknown value.
     NotInitializedEarthEngineError: if Earth Engine could not be initialized.
     NoBuildingFoundError: if no building is found in the area of interest.
   """
-  centroids = None
   if config.buildings_method == 'file':
-    centroids = buildings.read_buildings_file(config.buildings_file, regions)
+    buildings.convert_buildings_file(
+        config.buildings_file, regions, output_path
+    )
   elif config.buildings_method == 'open_street_map':
-    centroids = open_street_map.get_building_centroids_in_regions(
-        regions, config.overpass_url
+    open_street_map.get_building_centroids_in_regions(
+        regions, config.overpass_url, output_path
     )
   elif config.buildings_method == 'open_buildings':
     if not earth_engine.initialize(
@@ -236,21 +235,15 @@
     ):
       raise NotInitializedEarthEngineError()
     logging.info('Querying Open Buildings centroids. This may take a while.')
-    output_path = os.path.join(
-        config.output_dir, 'open_buildings_centroids.csv'
+    earth_engine.get_open_buildings(
+        regions,
+        config.open_buildings_feature_collection,
+        config.open_buildings_confidence,
+        True,
+        output_path,
     )
-    centroids = earth_engine.get_open_buildings_centroids(
-        regions, config.open_buildings_feature_collection,
-        config.open_buildings_confidence, output_path
-    )
-    logging.info('Open Buildings centroids saved to %s', output_path)
   else:
     raise ValueError('Invalid value for "buildings_method" flag.')
-  if not centroids:
-    raise NoBuildingFoundError()
-  return list(set(centroids))  # Deduplicate.
- def _to_grayscale(image: np.ndarray) -> np.ndarray: return cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) @@ -443,10 +436,12 @@ def _create_example( longitude, latitude, before_image_id, after_image_id ) int64_id = _make_int64_id(example_id) - plus_code = openlocationcode.encode( - latitude=latitude, longitude=longitude, codeLength=_PLUS_CODE_LENGTH - ) - utils.add_bytes_feature('plus_code', plus_code.encode(), example) + if 'plus_code' not in scalar_features: + plus_code = openlocationcode.encode( + latitude=latitude, longitude=longitude, codeLength=_PLUS_CODE_LENGTH + ) + utils.add_bytes_feature('plus_code', plus_code.encode(), example) + utils.add_bytes_feature('example_id', example_id.encode(), example) utils.add_int64_feature('int64_id', int64_id, example) utils.add_bytes_feature( @@ -531,16 +526,33 @@ def process( yield example -def _coordinates_to_scalar_features(coordinates_path: str): - coordinates = utils.read_coordinates_file(coordinates_path) - for longitude, latitude, label, string_label in coordinates: +def _extract_scalar_features_from_buildings_file(buildings_path: str): + """Extracts scalar features of each example from buildings file. + + Args: + buildings_path: Path to serialized buildings file. + + Yields: + Tuple of (encoded coordinates, scalar features dictionary). + """ + buildings_gdf = buildings.read_buildings_file(buildings_path) + for _, row in buildings_gdf.iterrows(): + longitude = row['longitude'] + latitude = row['latitude'] + label = row['label'] if 'label' in row.index else -1.0 + string_label = row['string_label'] if 'string_label' in row.index else '' + encoded_coords = utils.encode_coordinates(longitude, latitude) - feature = _FeatureUnion(scalar_features={ + scalar_features = { 'coordinates': [longitude, latitude], 'label': [label], 'string_label': [string_label] - }) - yield (encoded_coords, feature) + } + if 'full_plus_code' in row.index: + scalar_features['plus_code'] = [row['full_plus_code']] + if 'area_in_meters' in row.index: + scalar_features['area_in_meters'] = [row['area_in_meters']] + yield (encoded_coords, _FeatureUnion(scalar_features=scalar_features)) def _remove_large_images(example: Example) -> Example: @@ -576,7 +588,7 @@ def _generate_examples( pipeline, before_image_patterns: List[str], after_image_patterns: List[str], - coordinates_path: str, + buildings_path: str, large_patch_size: int, example_patch_size: int, resolution: float, @@ -590,7 +602,7 @@ def _generate_examples( pipeline: Beam pipeline. before_image_patterns: List of before image path patterns. after_image_patterns: List of after image path patterns. - coordinates_path: Path to file containing building coordinates. + buildings_path: Path to serialized building footprints GeoDataFrame file. large_patch_size: Size in pixels of before and after image patches for labeling and alignment. Typically 256. example_patch_size: Size of patches to extract into examples. Typically 64. 
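Note: the plus-code handling above means a precomputed full_plus_code from the
buildings file takes precedence, and _create_example only derives a code from
the centroid when none was propagated. A minimal sketch of that fallback path
using the open-source openlocationcode package already imported by this module
(the codeLength of 14 is an assumption; the value of _PLUS_CODE_LENGTH is not
shown in this patch):

    # Sketch of the fallback only; not the patched function itself.
    from openlocationcode import openlocationcode

    scalar_features = {'coordinates': [178.482925, -16.632893]}
    if 'plus_code' not in scalar_features:  # No full_plus_code propagated.
      plus_code = openlocationcode.encode(
          latitude=-16.632893, longitude=178.482925, codeLength=14
      )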
@@ -604,10 +616,10 @@ def _generate_examples( """ scalar_features = ( pipeline - | stage_prefix + 'encode_coordinates_path' >> beam.Create( - [coordinates_path]) + | stage_prefix + 'encode_buildings_path' >> beam.Create( + [buildings_path]) | stage_prefix + 'create_scalar_features' >> beam.FlatMap( - _coordinates_to_scalar_features)) + _extract_scalar_features_from_buildings_file)) input_collections = [scalar_features] after_image_size = large_patch_size @@ -621,7 +633,7 @@ def _generate_examples( before_raster_paths = _expand_patterns(before_image_patterns) before_patches = read_raster.extract_patches_from_rasters( pipeline, - coordinates_path, + buildings_path, before_raster_paths, large_patch_size, resolution, @@ -637,7 +649,7 @@ def _generate_examples( after_raster_paths = _expand_patterns(after_image_patterns) after_patches = read_raster.extract_patches_from_rasters( pipeline, - coordinates_path, + buildings_path, after_raster_paths, after_image_size, resolution, @@ -676,10 +688,11 @@ def _generate_examples( def read_labels_file( path: str, label_property: str, - labels_to_classes: List[str] = None, - max_points: int = None -) -> List[Tuple[float, float, float, str]]: - """Reads labels from a GIS file. + labels_to_classes: List[str], + max_points: int, + output_path: str, +) -> None: + """Reads labels from a GIS file and writes to the standard buildings format. If the "label_property" is a string, then it is assumed to be the name of a class, e.g. "damaged". In labels_to_classes, user can specify the mapping of @@ -694,10 +707,8 @@ def read_labels_file( label_property: The property to use as the label, e.g. "string_label". labels_to_classes: List of string in "class=label" format, e.g. ["no_damage=0", "damaged=1", "destroyed=1"]. - max_points: Number of labeled examples to keep - - Returns: - List of tuples of the form (longitude, latitude, float label). + max_points: Number of labeled examples to keep. + output_path: Buildings file output path. 
""" # Parse labels_to_classes into dictionary format if specified if labels_to_classes: @@ -715,28 +726,30 @@ def read_labels_file( raise # Generate coordinates from label file - df = gpd.read_file(path).to_crs(epsg=4326) - coordinates = [] - for _, row in df.iterrows(): - centroid = row.geometry.centroid + gdf = gpd.read_file(path) + if max_points: + gdf = gdf.iloc[:max_points] + string_labels = [] + float_labels = [] + for _, row in gdf.iterrows(): label = row[label_property] if isinstance(label, str): try: - float_label = label_to_class_dict[label] - coordinates.append((centroid.x, centroid.y, float_label, label)) + string_labels.append(label) + float_labels.append(label_to_class_dict[label]) except KeyError: logging.warning('Label %s is not recognized.', label) elif isinstance(label, (int, float)): - float_label = float(label) - coordinates.append((centroid.x, centroid.y, float_label, str(label))) + string_labels.append(str(label)) + float_labels.append(float(label)) else: raise ValueError(f'Unrecognized label property type {type(label)}') - if max_points: - coordinates = coordinates[:max_points] - - # logging.info('Read %d labeled coordinates.', len(coordinates)) - return coordinates + output_gdf = gpd.GeoDataFrame( + {'string_label': string_labels, 'label': float_labels}, + geometry=gpd.geometry, + ) + buildings.write_buildings_file(output_gdf, output_path) def parse_gdal_env(settings: List[str]) -> Dict[str, str]: @@ -767,8 +780,8 @@ def generate_examples_pipeline( resolution: float, output_dir: str, num_output_shards: int, - unlabeled_coordinates: List[Tuple[float, float]], - labeled_coordinates: List[Tuple[float, float, float, str]], + buildings_path: str, + buildings_labeled: bool, use_dataflow: bool, gdal_env: Dict[str, str], dataflow_job_name: Optional[str], @@ -788,10 +801,8 @@ def generate_examples_pipeline( resolution: Desired resolution of image patches. output_dir: Parent output directory. num_output_shards: Number of output shards. - unlabeled_coordinates: List of coordinates (longitude, latitude) to extract - unlabeled examples for. - labeled_coordinates: List of coordinates (longitude, latitude, label, - string_label) to extract labeled examples for. + buildings_path: Path to file containing building footprints. + buildings_labeled: True if buildings have labels. use_dataflow: If true, run pipeline on GCP Dataflow. gdal_env: GDAL environment configuration. dataflow_job_name: Name of dataflow job. 
@@ -816,26 +827,20 @@ def generate_examples_pipeline( accelerator_count=0, ) - coordinates_path = os.path.join(temp_dir, 'coordinates') - if unlabeled_coordinates: - small_examples_output_prefix = ( - os.path.join(output_dir, 'examples', 'unlabeled', 'unlabeled')) - large_examples_output_prefix = ( - os.path.join(output_dir, 'examples', 'unlabeled-large', 'unlabeled')) - labeled_coordinates = [(longitude, latitude, -1.0, '') - for longitude, latitude in unlabeled_coordinates] - utils.write_coordinates_file(labeled_coordinates, coordinates_path) - - elif labeled_coordinates: + if buildings_labeled: small_examples_output_prefix = ( os.path.join(output_dir, 'examples', 'labeled', 'labeled')) large_examples_output_prefix = ( os.path.join(output_dir, 'examples', 'labeled-large', 'labeled')) - utils.write_coordinates_file(labeled_coordinates, coordinates_path) + else: + small_examples_output_prefix = ( + os.path.join(output_dir, 'examples', 'unlabeled', 'unlabeled')) + large_examples_output_prefix = ( + os.path.join(output_dir, 'examples', 'unlabeled-large', 'unlabeled')) with beam.Pipeline(options=pipeline_options) as pipeline: large_examples, small_examples = _generate_examples( - pipeline, before_image_patterns, after_image_patterns, coordinates_path, + pipeline, before_image_patterns, after_image_patterns, buildings_path, large_patch_size, example_patch_size, resolution, gdal_env, 'generate_examples', cloud_detector_model_path) diff --git a/src/skai/generate_examples_test.py b/src/skai/generate_examples_test.py index 99b5e5ea..324a4fd9 100644 --- a/src/skai/generate_examples_test.py +++ b/src/skai/generate_examples_test.py @@ -18,17 +18,17 @@ import os import pathlib import tempfile -from typing import Any, List, Tuple +from typing import Any from absl.testing import absltest from absl.testing import parameterized import apache_beam as beam from apache_beam.testing import test_pipeline -from apache_beam.testing.util import assert_that -from apache_beam.testing.util import equal_to +import apache_beam.testing.util as test_util +import geopandas as gpd import numpy as np +from skai import buildings from skai import generate_examples -from skai import utils import tensorflow as tf @@ -48,7 +48,7 @@ def _deserialize_image(serialized_image: bytes) -> np.ndarray: return tf.io.decode_image(serialized_image).numpy() -def _unordered_all_close(list1: List[Any], list2: List[Any]) -> bool: +def _unordered_all_close(list1: list[Any], list2: list[Any]) -> bool: """Return that two lists of coordinates are close to each other.""" if len(list1) != len(list2): return False @@ -58,14 +58,65 @@ def _unordered_all_close(list1: List[Any], list2: List[Any]) -> bool: return np.allclose(sorted_list1, sorted_list2) +def _create_buildings_file( + coordinates: list[tuple[float, float]], output_path: str +) -> gpd.GeoDataFrame: + longitudes = [c[0] for c in coordinates] + latitudes = [c[1] for c in coordinates] + gdf = gpd.GeoDataFrame( + { + 'area_in_meters': [0.0] * len(coordinates), + }, + geometry=gpd.points_from_xy(longitudes, latitudes), + crs=4326, + ) + buildings.write_buildings_file(gdf, output_path) + + +def _create_labeled_buildings_file( + coordinates: list[tuple[float, float, float, str]], output_path: str +) -> gpd.GeoDataFrame: + longitudes = [c[0] for c in coordinates] + latitudes = [c[1] for c in coordinates] + labels = [c[2] for c in coordinates] + string_labels = [c[3] for c in coordinates] + area_in_meters = [0.0] * len(coordinates) + gdf = gpd.GeoDataFrame( + { + 'label': labels, + 'string_label': 
string_labels, + 'area_in_meters': area_in_meters, + }, + geometry=gpd.points_from_xy(longitudes, latitudes), + crs=4326, + ) + buildings.write_buildings_file(gdf, output_path) + + +def _create_buildings_file_with_plus_code( + coordinates: list[tuple[float, float]], output_path: str +) -> gpd.GeoDataFrame: + longitudes = [c[0] for c in coordinates] + latitudes = [c[1] for c in coordinates] + gdf = gpd.GeoDataFrame( + { + 'area_in_meters': [0.0] * len(coordinates), + 'full_plus_code': ['abc'] * len(coordinates), + }, + geometry=gpd.points_from_xy(longitudes, latitudes), + crs=4326, + ) + buildings.write_buildings_file(gdf, output_path) + + def _check_examples( before_image_id: str, after_image_id: str, small_patch_size: int, large_patch_size: int, - expected_coordinates: List[Tuple[float, float, float]], - expected_string_labels: List[str], - expected_plus_codes: List[str], + expected_coordinates: list[tuple[float, float, float]], + expected_string_labels: list[str], + expected_plus_codes: list[str], expect_blank_before: bool, expect_large_patch: bool): """Validates examples generated from beam pipeline. @@ -106,7 +157,8 @@ def _check_examples_internal(actual_examples): 'example_id', 'int64_id', 'plus_code', - 'string_label' + 'string_label', + 'area_in_meters', ]) if expect_large_patch: expected_feature_names.update( @@ -192,7 +244,7 @@ def setUp(self): super().setUp() current_dir = pathlib.Path(__file__).parent self.test_image_path = str(current_dir / TEST_IMAGE_PATH) - self.coordinates_path = str(current_dir / 'coordinates') + self.buildings_path = str(current_dir / 'buildings') self.test_image_path_patterns = str(current_dir / 'test_data/country_*.tif') self.test_config_path = str(current_dir / TEST_CONFIG_PATH) self.test_missing_dataset_name_config_path = str( @@ -205,18 +257,19 @@ def setUp(self): def testGenerateExamplesFn(self): """Tests GenerateExamplesFn class.""" - unlabeled_coordinates = [(178.482925, -16.632893, -1.0, ''), - (178.482283, -16.632279, -1.0, '')] - utils.write_coordinates_file(unlabeled_coordinates, self.coordinates_path) + _create_buildings_file( + [(178.482925, -16.632893), (178.482283, -16.632279)], + self.buildings_path, + ) with test_pipeline.TestPipeline() as pipeline: large_examples, small_examples = generate_examples._generate_examples( pipeline, [self.test_image_path], [self.test_image_path], - self.coordinates_path, 62, 32, 0.5, {}, 'unlabeled') + self.buildings_path, 62, 32, 0.5, {}, 'unlabeled') # Example at second input coordinate should be dropped because its patch # falls mostly outside the before and after image bounds. 
- assert_that( + test_util.assert_that( large_examples, _check_examples( self.test_image_path, @@ -231,7 +284,7 @@ def testGenerateExamplesFn(self): ), label='assert_large_examples', ) - assert_that( + test_util.assert_that( small_examples, _check_examples( self.test_image_path, @@ -250,16 +303,20 @@ def testGenerateExamplesFn(self): def testGenerateExamplesFnLabeled(self): """Tests GenerateExamplesFn class.""" - labeled_coordinates = [(178.482925, -16.632893, 0, 'no_damage'), - (178.482924, -16.632894, 1, 'destroyed')] - utils.write_coordinates_file(labeled_coordinates, self.coordinates_path) + _create_labeled_buildings_file( + [ + (178.482925, -16.632893, 0, 'no_damage'), + (178.482924, -16.632894, 1, 'destroyed'), + ], + self.buildings_path, + ) with test_pipeline.TestPipeline() as pipeline: large_examples, small_examples = generate_examples._generate_examples( pipeline, [self.test_image_path], [self.test_image_path], - self.coordinates_path, 62, 32, 0.5, {}, 'labeled') + self.buildings_path, 62, 32, 0.5, {}, 'labeled') - assert_that( + test_util.assert_that( large_examples, _check_examples( self.test_image_path, @@ -275,7 +332,7 @@ def testGenerateExamplesFnLabeled(self): label='assert_large_examples', ) - assert_that( + test_util.assert_that( small_examples, _check_examples( self.test_image_path, @@ -291,21 +348,68 @@ def testGenerateExamplesFnLabeled(self): label='assert_small_examples', ) + def testGenerateExamplesFnWithPlusCodes(self): + """Tests GenerateExamplesFn class.""" + + _create_buildings_file_with_plus_code( + [(178.482925, -16.632893), (178.482283, -16.632279)], + self.buildings_path, + ) + + with test_pipeline.TestPipeline() as pipeline: + large_examples, small_examples = generate_examples._generate_examples( + pipeline, [self.test_image_path], [self.test_image_path], + self.buildings_path, 62, 32, 0.5, {}, 'unlabeled') + + # Example at second input coordinate should be dropped because its patch + # falls mostly outside the before and after image bounds. + test_util.assert_that( + large_examples, + _check_examples( + self.test_image_path, + self.test_image_path, + 32, + 62, + [(178.482925, -16.632893, -1.0)], + [''], + ['abc'], + False, + True, + ), + label='assert_large_examples', + ) + test_util.assert_that( + small_examples, + _check_examples( + self.test_image_path, + self.test_image_path, + 32, + 62, + [(178.482925, -16.632893, -1.0)], + [''], + ['abc'], + False, + False, + ), + label='assert_small_examples', + ) + def testGenerateExamplesFnNoBefore(self): """Tests GenerateExamplesFn class without before image.""" - coordinates = [(178.482925, -16.632893, -1.0, ''), - (178.482283, -16.632279, -1.0, '')] - utils.write_coordinates_file(coordinates, self.coordinates_path) + _create_buildings_file( + [(178.482925, -16.632893), (178.482283, -16.632279)], + self.buildings_path, + ) with test_pipeline.TestPipeline() as pipeline: large_examples, small_examples = generate_examples._generate_examples( - pipeline, [], [self.test_image_path], self.coordinates_path, 62, 32, + pipeline, [], [self.test_image_path], self.buildings_path, 62, 32, 0.5, {}, 'unlabeled') # Example at second input coordinate should be dropped because its patch # falls mostly outside the before and after image bounds. 
- assert_that( + test_util.assert_that( large_examples, _check_examples( '', @@ -321,7 +425,7 @@ def testGenerateExamplesFnNoBefore(self): label='assert_large_examples', ) - assert_that( + test_util.assert_that( small_examples, _check_examples( '', @@ -339,8 +443,7 @@ def testGenerateExamplesFnNoBefore(self): def testGenerateExampleFnPathPattern(self): """Test GenerateExampleFn class with a path pattern.""" - coordinates = [(178.482925, -16.632893, -1.0, '')] - utils.write_coordinates_file(coordinates, self.coordinates_path) + _create_buildings_file([(178.482925, -16.632893)], self.buildings_path) expected_before_image_ids = glob.glob(self.test_image_path_patterns) @@ -348,7 +451,7 @@ def testGenerateExampleFnPathPattern(self): # The path patterns specify two before images. large_examples, small_examples = generate_examples._generate_examples( pipeline, [self.test_image_path_patterns], - [self.test_image_path], self.coordinates_path, 62, 32, 0.5, + [self.test_image_path], self.buildings_path, 62, 32, 0.5, {}, 'unlabeled') small_examples_before_ids = ( @@ -358,17 +461,24 @@ def testGenerateExampleFnPathPattern(self): large_examples | 'Map large examples to before image ids' >> beam.Map(_get_before_image_id)) - assert_that(small_examples_before_ids, - equal_to(expected_before_image_ids), - 'check small examples before image ids') + test_util.assert_that( + small_examples_before_ids, + test_util.equal_to(expected_before_image_ids), + 'check small examples before image ids', + ) - assert_that(large_examples_before_ids, - equal_to(expected_before_image_ids), - 'check large examples before image ids') + test_util.assert_that( + large_examples_before_ids, + test_util.equal_to(expected_before_image_ids), + 'check large examples before image ids', + ) def testGenerateExamplesPipeline(self): output_dir = tempfile.mkdtemp(dir=absltest.TEST_TMPDIR.value) - coordinates = [(178.482925, -16.632893), (178.482283, -16.632279)] + _create_buildings_file( + [(178.482925, -16.632893), (178.482283, -16.632279)], + self.buildings_path, + ) generate_examples.generate_examples_pipeline( before_image_patterns=[self.test_image_path], after_image_patterns=[self.test_image_path], @@ -377,8 +487,8 @@ def testGenerateExamplesPipeline(self): resolution=0.5, output_dir=output_dir, num_output_shards=1, - unlabeled_coordinates=coordinates, - labeled_coordinates=[], + buildings_path=self.buildings_path, + buildings_labeled=False, use_dataflow=False, gdal_env={}, dataflow_job_name='test', diff --git a/src/skai/open_street_map.py b/src/skai/open_street_map.py index 7072e50a..9d1886da 100644 --- a/src/skai/open_street_map.py +++ b/src/skai/open_street_map.py @@ -16,11 +16,14 @@ Please see https://wiki.openstreetmap.org/wiki/Overpass_API for details. """ -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional import xml.etree.ElementTree as ET +import geopandas as gpd import requests import shapely.geometry +from skai import buildings + Polygon = shapely.geometry.polygon.Polygon Point = shapely.geometry.point.Point @@ -123,18 +126,17 @@ def get_buildings_in_region(region: Polygon, def get_building_centroids_in_regions( - regions: List[Polygon], overpass_url: str) -> List[Tuple[float, float]]: + regions: List[Polygon], overpass_url: str, output_path: str) -> None: """Queries OpenStreetMap Overpass API for all building centroids in a region. Args: regions: Regions of interest as polygons. Must be in EPSG:4326 (long/lat). overpass_url: Overpass URL e.g. https://lz4.overpass-api.de/api/interpreter. 
-
-  Returns:
-    A list of building centroids in all regions.
+    output_path: Save footprints to this path in the standard buildings file
+      format (parquet).
   """
-  centroids = []
+  polygons = []
   for region in regions:
-    polygons = get_buildings_in_region(region, overpass_url)
-    centroids.extend([(p.centroid.x, p.centroid.y) for p in polygons])
-  return centroids
+    polygons.extend(get_buildings_in_region(region, overpass_url))
+  buildings.write_buildings_file(
+      gpd.GeoDataFrame(geometry=polygons, crs=4326), output_path
+  )
diff --git a/src/skai/read_raster.py b/src/skai/read_raster.py
index fefbddfc..db65f72f 100644
--- a/src/skai/read_raster.py
+++ b/src/skai/read_raster.py
@@ -27,6 +27,7 @@
 import rtree
 import shapely.geometry
 
+from skai import buildings
 from skai import utils
 
 Metrics = beam.metrics.Metrics
@@ -241,17 +242,17 @@
       image, (patch_size, patch_size), interpolation=cv2.INTER_CUBIC)
 
 
-def _coordinates_to_groups(
+def _buildings_to_groups(
     raster_path: str,
-    coordinates_path: str,
+    buildings_path: str,
     patch_size: int,
     resolution: float,
     gdal_env: Dict[str, str]) -> Iterable[Tuple[str, _WindowGroup]]:
-  """Converts a list of coordinates into pixel windows and then groups them.
+  """Converts building centroids into pixel windows and then groups them.
 
   Args:
     raster_path: Path to raster image.
-    coordinates_path: Path to coordinates file.
+    buildings_path: Path to buildings file.
     patch_size: Size of patches to extract.
     resolution: Resolution of patches to extract.
     gdal_env: GDAL environment configuration.
 
   Yields:
     Tuples of raster path and grouped windows.
   """
-  coordinates = utils.read_coordinates_file(coordinates_path)
-  coords_with_ids = [(utils.encode_coordinates(longitude,
-                                               latitude), longitude, latitude)
-                     for longitude, latitude, _, _ in coordinates]
+  coords_df = buildings.read_building_coordinates(buildings_path)
+  coords_with_ids = [
+      (utils.encode_coordinates(lng, lat), lng, lat)
+      for lng, lat in zip(coords_df.longitude, coords_df.latitude)
+  ]
   with rasterio.Env(**gdal_env):
     raster = rasterio.open(raster_path)
     raster_res = _get_raster_resolution_in_meters(raster)
@@ -343,7 +345,7 @@
 
 def extract_patches_from_rasters(
     pipeline: beam.Pipeline,
-    coordinates_path: str,
+    buildings_path: str,
     raster_paths: List[str],
     patch_size: int,
     resolution: float,
@@ -353,8 +355,7 @@
 
   Args:
     pipeline: Beam pipeline.
-    coordinates_path: Path to CSV file containing the longitude, latitude
-      coordinates to extract patches for.
+    buildings_path: Path to building footprints file.
     raster_paths: Raster paths.
     patch_size: Desired size of output patches.
     resolution: Desired resolution of output patches.
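Note: _buildings_to_groups only needs the two coordinate columns, which is the
reason the buildings file uses parquet: column projection avoids deserializing
the footprint geometries of large files. An equivalent plain-pandas sketch (the
path is hypothetical):

    import pandas as pd

    # Reads just two columns, however many properties the file carries.
    coords = pd.read_parquet(
        'processed_buildings.parquet', columns=['longitude', 'latitude']
    )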
@@ -367,8 +368,8 @@
   return (pipeline
           | stage_prefix + '_encode_raster_paths' >> beam.Create(raster_paths)
           | stage_prefix + '_make_window_groups' >> beam.FlatMap(
-              _coordinates_to_groups,
-              coordinates_path=coordinates_path,
+              _buildings_to_groups,
+              buildings_path=buildings_path,
               patch_size=patch_size,
               resolution=resolution,
               gdal_env=gdal_env)
diff --git a/src/skai/read_raster_test.py b/src/skai/read_raster_test.py
index 1f77a9c0..68ca33c3 100644
--- a/src/skai/read_raster_test.py
+++ b/src/skai/read_raster_test.py
@@ -19,11 +19,12 @@
 
 import apache_beam as beam
 from apache_beam.testing import test_pipeline
-from apache_beam.testing.util import assert_that
+import apache_beam.testing.util as test_util
+import geopandas as gpd
 import rasterio
 
+from skai import buildings
 from skai import read_raster
-from skai import utils
 
 TEST_IMAGE_PATH = 'test_data/blank.tif'
 
@@ -31,6 +32,17 @@
 WindowGroup = read_raster._WindowGroup
 
 
+def _create_buildings_file(
+    coordinates: list[tuple[float, float]], output_path: str
+) -> None:
+  longitudes = [c[0] for c in coordinates]
+  latitudes = [c[1] for c in coordinates]
+  gdf = gpd.GeoDataFrame(
+      geometry=gpd.points_from_xy(longitudes, latitudes), crs=4326
+  )
+  buildings.write_buildings_file(gdf, output_path)
+
+
 class ReadRasterTest(absltest.TestCase):
 
   def setUp(self):
@@ -99,19 +111,19 @@
       assert patches[2][1][0] == expected_image_path
       assert patches[2][1][1].shape == (64, 64, 3)
 
-    assert_that(patches, _check_output_patches)
+    test_util.assert_that(patches, _check_output_patches)
 
-  def test_coordinates_to_groups(self):
+  def test_buildings_to_groups(self):
     coordinates = [
-        (178.482925, -16.632893, -1.0, 'no_damage'),
-        (178.482283, -16.632279, -1.0, 'no_damage'),
-        (178.482284, -16.632279, -1.0, 'no_damage')]
+        (178.482925, -16.632893),
+        (178.482283, -16.632279),
+        (178.482284, -16.632279)]
     with tempfile.NamedTemporaryFile(dir=absltest.TEST_TMPDIR.value) as f:
-      coordinates_path = f.name
-      utils.write_coordinates_file(coordinates, coordinates_path)
-      groups = list(read_raster._coordinates_to_groups(
-          self.test_image_path, coordinates_path, 32, 0.5, {}))
+      buildings_path = f.name
+      _create_buildings_file(coordinates, buildings_path)
+      groups = list(read_raster._buildings_to_groups(
+          self.test_image_path, buildings_path, 32, 0.5, {}))
       self.assertLen(groups, 2)
 
   def test_get_raster_bounds(self):
diff --git a/src/skai/utils.py b/src/skai/utils.py
index 811be540..9568ad7b 100644
--- a/src/skai/utils.py
+++ b/src/skai/utils.py
@@ -16,10 +16,8 @@
 
 import base64
 import io
-import os
-import pickle
 import struct
-from typing import Any, Iterable, List, Sequence, Tuple
+from typing import Iterable, List, Sequence, Tuple
 
 from absl import flags
 import PIL.Image
@@ -127,16 +125,3 @@ def encode_coordinates(longitude: float, latitude: float) -> str:
 
 
 def decode_coordinates(encoded_coords: str) -> Tuple[float, float]:
   buffer = base64.b16decode(encoded_coords.encode('ascii'))
   return struct.unpack('<dd', buffer)
-
-
-def write_coordinates_file(coordinates: List[Any], path: str) -> None:
-  output_dir = os.path.dirname(path)
-  if not tf.io.gfile.exists(output_dir):
-    tf.io.gfile.makedirs(output_dir)
-  with tf.io.gfile.GFile(path, 'wb') as f:
-    pickle.dump(coordinates, f)
-
-
-def read_coordinates_file(path: str) -> List[Any]:
-  with tf.io.gfile.GFile(path, 'rb') as f:
-    return pickle.load(f)
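Note: a hedged end-to-end sketch of the round trip through the new standard
buildings format introduced by this change (the values and the output path are
illustrative only):

    import geopandas as gpd
    from skai import buildings

    gdf = gpd.GeoDataFrame(
        {'area_in_meters': [26.2, 31.8]},
        geometry=gpd.points_from_xy([-4.997, -4.998], [29.992, 29.992]),
        crs=4326,
    )
    buildings.write_buildings_file(gdf, '/tmp/processed_buildings.parquet')

    full = buildings.read_buildings_file('/tmp/processed_buildings.parquet')
    coords = buildings.read_building_coordinates(
        '/tmp/processed_buildings.parquet'
    )
    # write_buildings_file added centroid columns, so both frames expose them
    # alongside the propagated properties.
    assert {'longitude', 'latitude', 'area_in_meters'} <= set(full.columns)
    assert list(coords.columns) == ['longitude', 'latitude']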