diff --git a/src/generate_examples_main.py b/src/generate_examples_main.py index 3e47109f..a87ef8ff 100644 --- a/src/generate_examples_main.py +++ b/src/generate_examples_main.py @@ -36,6 +36,7 @@ --cloud_region=us-west1 """ +import os import time from typing import List @@ -164,6 +165,8 @@ Polygon = shapely.geometry.polygon.Polygon +BUILDINGS_FILE_NAME = 'processed_buildings.parquet' + def _read_image_config(path: str) -> List[str]: with tf.io.gfile.GFile(path, 'r') as f: @@ -207,35 +210,32 @@ def main(args): raise ValueError('At least labels_file (for labeled examples extraction) ' 'or buildings_method != none (for unlabeled data) should ' 'be specified.') - if config.buildings_method != 'none': + + buildings_path = os.path.join(config.output_dir, BUILDINGS_FILE_NAME) + if config.labels_file: + generate_examples.read_labels_file( + config.labels_file, + config.label_property, + config.labels_to_classes, + config.num_keep_labeled_examples, + buildings_path + ) + buildings_labeled = True + else: if config.aoi_path: aois = buildings.read_aois(config.aoi_path) else: aois = [read_raster.get_raster_bounds(path, gdal_env) for path in after_image_patterns] try: - building_centroids = generate_examples.get_building_centroids( - config, aois + generate_examples.download_building_footprints( + config, aois, buildings_path ) except generate_examples.NotInitializedEarthEngineError: logging.fatal('Could not initialize Earth Engine.', exc_info=True) except generate_examples.NoBuildingFoundError: logging.fatal('No building is found.', exc_info=True) - logging.info('Found %d buildings in area of interest.', - len(building_centroids)) - else: - # Only if one wants to extract labeled examples and labels_file is provided. - building_centroids = [] - - if config.labels_file: - labeled_coordinates = generate_examples.read_labels_file( - config.labels_file, - config.label_property, - config.labels_to_classes, - config.num_keep_labeled_examples, - ) - else: - labeled_coordinates = [] + buildings_labeled = False generate_examples.generate_examples_pipeline( before_image_patterns, @@ -245,8 +245,8 @@ def main(args): config.resolution, config.output_dir, config.output_shards, - building_centroids, - labeled_coordinates, + buildings_path, + buildings_labeled, config.use_dataflow, gdal_env, timestamped_dataset, diff --git a/src/skai/buildings.py b/src/skai/buildings.py index 921b60a4..1841ab2c 100644 --- a/src/skai/buildings.py +++ b/src/skai/buildings.py @@ -14,7 +14,7 @@ """Functions for reading building centroids from files.""" -from typing import List, Tuple +import os import geopandas as gpd import pandas as pd import shapely.geometry @@ -24,8 +24,8 @@ Polygon = shapely.geometry.polygon.Polygon -def _read_buildings_csv(path: str) -> List[Tuple[float, float]]: - """Reads (longitude, latitude) coordinates from a CSV file. +def _read_buildings_csv(path: str) -> gpd.GeoDataFrame: + """Reads CSV file containing building footprints to GeoDataFrame. The file should contain "longitude" and "latitude" columns. @@ -40,48 +40,48 @@ def _read_buildings_csv(path: str) -> List[Tuple[float, float]]: """ with tf.io.gfile.GFile(path, 'r') as csv_file: df = pd.read_csv(csv_file) - if 'longitude' not in df.columns or 'latitude' not in df.columns: - raise ValueError( - f'Malformed CSV file "{path}". File does not contain "longitude" and ' - '"latitude" columns') - return [(row.longitude, row.latitude) for _, row in df.iterrows()] + if 'geometry' in df.columns: + geometries = gpd.GeoSeries.from_wkt(df['geometry']) + df.drop(columns=['geometry'], inplace=True) + elif 'wkt' in df.columns: + geometries = gpd.GeoSeries.from_wkt(df['wkt']) + df.drop(columns=['wkt'], inplace=True) + elif 'longitude' in df.columns and 'latitude' in df.columns: + geometries = gpd.points_from_xy(df['longitude'], df['latitude']) + df.drop(columns=['longitude', 'latitude'], inplace=True) + else: + raise ValueError(f'No geometry information found in file "{path}"') + + return gpd.GeoDataFrame(df, geometry=geometries, crs=4326) -def read_buildings_file(path: str, - regions: List[Polygon]) -> List[Tuple[float, float]]: - """Extracts building coordinates from a file. +def convert_buildings_file( + path: str, regions: list[Polygon], output_path: str +) -> None: + """Converts an input file encoding building footprints to the standard format. - Supported file formats are csv, shapefile, and geojson. + Also filters out any buildings that don't fall in one of the specified region + polygons. + + Supported file formats are csv and anything that GeoPandas handles. Args: path: Path to buildings file. regions: Regions to where building coordinates should come from. - - Returns: - List of (longitude, latitude) building coordinates. + output_path: Path to write buildings GeoDataFrame to. """ if path.lower().endswith('.csv'): - coords = _read_buildings_csv(path) + buildings_gdf = _read_buildings_csv(path) else: - coords = [] - df = gpd.read_file(path).to_crs(epsg=4326) - geometries = list(df.geometry.values) - for g in geometries: - centroid = g.centroid - coords.append((centroid.x, centroid.y)) - - filtered_coords = [] - for lon, lat in coords: - point = Point(lon, lat) - for region in regions: - if region.intersects(point): - filtered_coords.append((lon, lat)) - break + with tf.io.gfile.GFile(path, 'rb') as f: + buildings_gdf = gpd.read_file(f).to_crs(epsg=4326) - return filtered_coords + combined_regions = gpd.GeoSeries(regions).unary_union + in_regions = buildings_gdf.intersects(combined_regions) + write_buildings_file(buildings_gdf[in_regions], output_path) -def read_aois(path: str) -> List[Polygon]: +def read_aois(path: str) -> list[Polygon]: """Reads area of interest polygons from a file. Common file formats such as shapefile and GeoJSON are supported. However, the @@ -106,3 +106,63 @@ def read_aois(path: str) -> List[Polygon]: raise ValueError( f'Unexpected geometry for area of interest: "{g.geometryType()}"') return geometries + + +def write_buildings_file(gdf: gpd.GeoDataFrame, output_path: str) -> None: + """Writes a GeoDataFrame of building geometries to file. + + Serializes GeoDataFrame using Parquet file format to allow fast reading of + individual columns, such as longitude and latitude, in large datasets. + + Args: + gdf: GeoDataFrame of building geometries. + output_path: Output path. + """ + if 'longitude' not in gdf.columns and 'latitude' not in gdf.columns: + centroids = gdf.geometry.centroid + output_gdf = gdf.copy().to_crs(4326) + output_gdf['longitude'] = [c.x for c in centroids] + output_gdf['latitude'] = [c.y for c in centroids] + else: + output_gdf = gdf.to_crs(4326) + + output_dir = os.path.dirname(output_path) + if not tf.io.gfile.exists(output_dir): + tf.io.gfile.makedirs(output_dir) + with tf.io.gfile.GFile(output_path, 'wb') as f: + f.closed = False + output_gdf.to_parquet(f, index=False) + + +def read_buildings_file(path: str) -> gpd.GeoDataFrame: + """Reads a GeoDataFrame of building geometries from file. + + The GeoDataFrame must have been serialized by the write_buildings_file + function defined above. + + Args: + path: Path to serialized GeoDataFrame. + + Returns: + Buildings GeoDataFrame. + """ + with tf.io.gfile.GFile(path, 'rb') as f: + f.closed = False # Work-around for GFile issue. + return gpd.read_parquet(f).to_crs(4326) + + +def read_building_coordinates(path: str) -> pd.DataFrame: + """Reads only the longitude and latitude columns of a buildings file. + + The GeoDataFrame must have been serialized by the write_buildings_file + function defined above. + + Args: + path: Path to buildings file. Should be a GeoDataFrame in parquet format. + + Returns: + DataFrame (not GeoDataFrame) containing + """ + with tf.io.gfile.GFile(path, 'rb') as f: + f.closed = False # Work-around for GFile issue. + return pd.read_parquet(f, columns=['longitude', 'latitude']) diff --git a/src/skai/buildings_test.py b/src/skai/buildings_test.py index 857e3c51..fa864e86 100644 --- a/src/skai/buildings_test.py +++ b/src/skai/buildings_test.py @@ -14,7 +14,12 @@ """Tests for buildings.py.""" import pathlib +import tempfile from absl.testing import absltest +import geopandas as gpd +import geopandas.testing +import numpy as np +import pandas as pd import shapely.geometry from skai import buildings @@ -34,29 +39,99 @@ def get_test_file_path(relative_test_data_path: str) -> str: return str(current_dir / relative_test_data_path) +def get_temp_file(suffix: str = '') -> str: + return tempfile.mkstemp(suffix=suffix, dir=absltest.TEST_TMPDIR.value)[1] + + +def create_test_buildings_gdf(num_buildings: int) -> gpd.GeoDataFrame: + longitudes = np.random.uniform(low=-180, high=180, size=(num_buildings,)) + latitudes = np.random.uniform(low=-90, high=90, size=(num_buildings,)) + centroids = gpd.points_from_xy(longitudes, latitudes) + ids = [str(i) for i in range(num_buildings)] + return gpd.GeoDataFrame({'id': ids}, geometry=centroids, crs=4326) + + class BuildingsTest(absltest.TestCase): - def testReadBuildingsFileGeoJSON(self): + def test_convert_buildings_file_from_geojson(self): path = get_test_file_path('test_data/building_centroids.geojson') + output_path = get_temp_file() regions = [Polygon.from_bounds(178.78737, -16.65851, 178.81098, -16.63617)] - coords = buildings.read_buildings_file(path, regions) + buildings.convert_buildings_file(path, regions, output_path) + buildings_gdf = buildings.read_buildings_file(output_path) # The input file, "building_centroids.geojson", has 102 centroids. However, # one centroid will be filtered out because it doesn't fall within the # region boundaries. Hence there should be 101 centroids. - self.assertLen(coords, 101) + self.assertLen(buildings_gdf, 101) - def testReadBuildingsFileCSV(self): + def test_convert_buildings_file_from_csv(self): path = get_test_file_path('test_data/building_centroids.csv') + output_path = get_temp_file() regions = [Polygon.from_bounds(178.78737, -16.65851, 178.81098, -16.63617)] - coords = buildings.read_buildings_file(path, regions) + buildings.convert_buildings_file(path, regions, output_path) + buildings_gdf = buildings.read_buildings_file(output_path) # The input file, "building_centroids.csv", has 102 centroids. However, # one centroid will be filtered out because it doesn't fall within the # region boundaries. Hence there should be 101 centroids. - self.assertLen(coords, 101) + self.assertLen(buildings_gdf, 101) - def testReadAOIs(self): + def test_convert_buildings_file_from_csv_with_wkt(self): + buildings_data = [ + { + 'latitude': 29.99202352, + 'longitude': -4.99707854, + 'area_in_meters': 26.1683, + 'confidence': 0.8192, + 'geometry': ( + 'POLYGON((-4.9970664523855 29.9919846910361, -4.99705832764922' + ' 29.9920597015168, -4.99709061807397 29.992062351673,' + ' -4.99709874278586 29.9919873411914, -4.9970664523855' + ' 29.9919846910361))' + ), + 'full_plus_code': '7CXQX2R3+R53V', + }, + { + 'latitude': 29.99214936, + 'longitude': -4.99758957, + 'area_in_meters': 31.8337, + 'confidence': 0.8393, + 'geometry': ( + 'POLYGON((-4.99758309542527 29.9921067182802, -4.99755994204994' + ' 29.9921837854051, -4.99759604572576 29.9921920041646,' + ' -4.99761919907427 29.9921149370345, -4.99758309542527' + ' 29.9921067182802))' + ), + 'full_plus_code': '7CXQX2R2+VX3R', + }, + ] + path = get_temp_file(suffix='.csv') + pd.DataFrame(buildings_data).to_csv(path, index=False) + regions = [Polygon.from_bounds(-180, -90, 180, 90)] + output_path = get_temp_file() + buildings.convert_buildings_file(path, regions, output_path) + buildings_gdf = buildings.read_buildings_file(output_path) + + self.assertLen(buildings_gdf, 2) + self.assertContainsSubset( + [ + 'longitude', + 'latitude', + 'area_in_meters', + 'confidence', + 'full_plus_code', + ], + buildings_gdf.columns, + ) + expected_geometries = gpd.GeoSeries.from_wkt( + [b['geometry'] for b in buildings_data], crs=4326 + ) + geopandas.testing.assert_geoseries_equal( + buildings_gdf.geometry, expected_geometries, check_less_precise=True + ) + + def test_read_aois(self): path = get_test_file_path('test_data/aoi.geojson') aois = buildings.read_aois(path) self.assertLen(aois, 2) @@ -75,6 +150,42 @@ def testReadAOIs(self): [178.820579612784513, -16.644811608514196], [178.823626242664915, -16.641714920481871]]))) + def test_read_write_building_files(self): + path = get_temp_file() + buildings_gdf = create_test_buildings_gdf(20) + buildings.write_buildings_file(buildings_gdf, path) + loaded_buildings_gdf = buildings.read_buildings_file(path) + geopandas.testing.assert_geoseries_equal( + buildings_gdf.geometry, loaded_buildings_gdf.geometry + ) + self.assertContainsSubset( + ['longitude', 'latitude', 'id'], loaded_buildings_gdf.columns + ) + np.testing.assert_allclose( + loaded_buildings_gdf.longitude.values, + [c.x for c in buildings_gdf.geometry], + ) + np.testing.assert_allclose( + loaded_buildings_gdf.latitude.values, + [c.y for c in buildings_gdf.geometry], + ) + + def test_read_write_building_coordinates(self): + path = get_temp_file() + buildings_gdf = create_test_buildings_gdf(20) + buildings.write_buildings_file(buildings_gdf, path) + coordinates_df = buildings.read_building_coordinates(path) + self.assertContainsSubset( + ['longitude', 'latitude'], coordinates_df.columns + ) + np.testing.assert_allclose( + coordinates_df.longitude.values, + [c.x for c in buildings_gdf.geometry], + ) + np.testing.assert_allclose( + coordinates_df.latitude.values, + [c.y for c in buildings_gdf.geometry], + ) if __name__ == '__main__': absltest.main() diff --git a/src/skai/earth_engine.py b/src/skai/earth_engine.py index 2a52c5a8..74c4a1e8 100644 --- a/src/skai/earth_engine.py +++ b/src/skai/earth_engine.py @@ -16,7 +16,6 @@ import json import shutil -from typing import List, Optional, Tuple import urllib.request from absl import logging @@ -24,6 +23,7 @@ import geopandas as gpd import pandas as pd import shapely.geometry +from skai import buildings import tensorflow as tf ShapelyGeometry = shapely.geometry.base.BaseGeometry @@ -43,17 +43,14 @@ def _get_open_building_feature_centroid(feature: ee.Feature) -> ee.Feature: def _download_feature_collection( - collection: ee.FeatureCollection, properties: List[str], - output_path: str) -> gpd.GeoDataFrame: + collection: ee.FeatureCollection, properties: list[str], + output_path: str) -> None: """Downloads a FeatureCollection from Earth Engine as a GeoDataFrame. Args: collection: EE FeatureCollection to download. properties: List of properties to download. - output_path: Path to save CSV file to. - - Returns: - FeatureCollection data as a GeoDataFrame. + output_path: Path of GeoDataFrame feather output. """ url = collection.getDownloadURL('csv', properties) with urllib.request.urlopen(url) as url_file, tf.io.gfile.GFile( @@ -70,19 +67,20 @@ def _download_feature_collection( properties = df.drop(columns=['.geo']) elif 'longitude' in df.columns and 'latitude' in df.columns: geometry = gpd.points_from_xy(df['longitude'], df['latitude']) - properties = df.drop(columns=['longitude', 'latitude']) + properties = df else: - geometry = None - properties = None + raise ValueError('No geometries found in feature collection.') - return gpd.GeoDataFrame(properties, geometry=geometry) + buildings.write_buildings_file( + gpd.GeoDataFrame(properties, geometry=geometry), output_path + ) -def get_open_buildings(regions: List[ShapelyGeometry], +def get_open_buildings(regions: list[ShapelyGeometry], collection: str, confidence: float, as_centroids: bool, - output_path: str) -> gpd.GeoDataFrame: + output_path: str) -> None: """Downloads Open Buildings footprints for the Area of Interest from EE. Args: @@ -90,11 +88,7 @@ def get_open_buildings(regions: List[ShapelyGeometry], collection: Name of Earth Engine FeatureCollection containing footprints. confidence: Confidence threshold for included buildings. as_centroids: If true, download centroids instead of full footprints. - output_path: Save footprints to this file in addition to returning the - GeoDataFrame. - - Returns: - GeoDataFrame of building footprints. + output_path: Save footprints to this file as a GeoPackage. """ bounds = ee.FeatureCollection([_shapely_to_ee_feature(r) for r in regions]) open_buildings = ee.FeatureCollection(collection) @@ -102,34 +96,26 @@ def get_open_buildings(regions: List[ShapelyGeometry], aoi_buildings = aoi_buildings.filter(f'confidence >= {confidence}') if as_centroids: centroids = aoi_buildings.map(_get_open_building_feature_centroid) - return _download_feature_collection(centroids, ['longitude', 'latitude'], - output_path) + _download_feature_collection( + centroids, + [ + 'longitude', + 'latitude', + 'full_plus_code', + 'confidence', + 'area_in_meters', + ], + output_path, + ) else: - return _download_feature_collection(aoi_buildings, ['.geo'], output_path) - - -def get_open_buildings_centroids( - regions: List[ShapelyGeometry], - collection: str, - confidence: float, - output_path: str) -> List[Tuple[float, float]]: - """Downloads Open Buildings footprints as centroids of regions of interest. - - Args: - regions: List of regions as shapely geometries. - collection: Name of Earth Engine FeatureCollection containing footprints. - confidence: Confidence threshold for included buildings. - output_path: Save footprints to this file in addition to returning the - GeoDataFrame. - - Returns: - List of (longitude, latitude) building centroids. - """ - gdf = get_open_buildings(regions, collection, confidence, True, output_path) - return list(zip(gdf['geometry'].x, gdf['geometry'].y)) + _download_feature_collection( + aoi_buildings, + ['.geo', 'full_plus_code', 'confidence', 'area_in_meters'], + output_path, + ) -def initialize(service_account: str, private_key: Optional[str]) -> bool: +def initialize(service_account: str, private_key: str | None) -> bool: """Initializes EE server connection. When not using a service account, this function assumes that the user has diff --git a/src/skai/generate_examples.py b/src/skai/generate_examples.py index 49c50fd2..b2c0e51f 100644 --- a/src/skai/generate_examples.py +++ b/src/skai/generate_examples.py @@ -203,9 +203,9 @@ def __init__(self): super().__init__('Earth Engine could not be initialized.') -def get_building_centroids( - config, regions: List[Polygon] -) -> List[Tuple[float, float]]: +def download_building_footprints( + config, regions: list[Polygon], output_path: str +) -> None: """Finds building centroids based on flag settings. This function is meant to be called from generate_examples_main.py. @@ -214,21 +214,20 @@ def get_building_centroids( config: A configuration object that specify how to get the building centroids. regions: List of polygons of regions to find buildings in. - - Returns: - List of building centroids in (longitude, latitude) format. + output_path: Path to write buildings file to. Raises: ValueError: if buildings_method flag has unknown value. NotInitializedEarthEngineError: if earth couldnot be initialized. NoBuildingFoundError: if no building is found in the area of interest. """ - centroids = None if config.buildings_method == 'file': - centroids = buildings.read_buildings_file(config.buildings_file, regions) + buildings.convert_buildings_file( + config.buildings_file, regions, output_path + ) elif config.buildings_method == 'open_street_map': - centroids = open_street_map.get_building_centroids_in_regions( - regions, config.overpass_url + open_street_map.get_building_centroids_in_regions( + regions, config.overpass_url, output_path ) elif config.buildings_method == 'open_buildings': if not earth_engine.initialize( @@ -236,21 +235,15 @@ def get_building_centroids( ): raise NotInitializedEarthEngineError() logging.info('Querying Open Buildings centroids. This may take a while.') - output_path = os.path.join( - config.output_dir, 'open_buildings_centroids.csv' + earth_engine.get_open_buildings_centroids( + regions, + config.open_buildings_feature_collection, + config.open_buildings_confidence, + output_path, ) - centroids = earth_engine.get_open_buildings_centroids( - regions, config.open_buildings_feature_collection, - config.open_buildings_confidence, output_path - ) - logging.info('Open Buildings centroids saved to %s', output_path) else: raise ValueError('Invalid value for "buildings_method" flag.') - if not centroids: - raise NoBuildingFoundError() - return list(set(centroids)) # Deduplicate. - def _to_grayscale(image: np.ndarray) -> np.ndarray: return cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) @@ -443,10 +436,12 @@ def _create_example( longitude, latitude, before_image_id, after_image_id ) int64_id = _make_int64_id(example_id) - plus_code = openlocationcode.encode( - latitude=latitude, longitude=longitude, codeLength=_PLUS_CODE_LENGTH - ) - utils.add_bytes_feature('plus_code', plus_code.encode(), example) + if 'plus_code' not in scalar_features: + plus_code = openlocationcode.encode( + latitude=latitude, longitude=longitude, codeLength=_PLUS_CODE_LENGTH + ) + utils.add_bytes_feature('plus_code', plus_code.encode(), example) + utils.add_bytes_feature('example_id', example_id.encode(), example) utils.add_int64_feature('int64_id', int64_id, example) utils.add_bytes_feature( @@ -531,16 +526,33 @@ def process( yield example -def _coordinates_to_scalar_features(coordinates_path: str): - coordinates = utils.read_coordinates_file(coordinates_path) - for longitude, latitude, label, string_label in coordinates: +def _extract_scalar_features_from_buildings_file(buildings_path: str): + """Extracts scalar features of each example from buildings file. + + Args: + buildings_path: Path to serialized buildings file. + + Yields: + Tuple of (encoded coordinates, scalar features dictionary). + """ + buildings_gdf = buildings.read_buildings_file(buildings_path) + for _, row in buildings_gdf.iterrows(): + longitude = row['longitude'] + latitude = row['latitude'] + label = row['label'] if 'label' in row.index else -1.0 + string_label = row['string_label'] if 'string_label' in row.index else '' + encoded_coords = utils.encode_coordinates(longitude, latitude) - feature = _FeatureUnion(scalar_features={ + scalar_features = { 'coordinates': [longitude, latitude], 'label': [label], 'string_label': [string_label] - }) - yield (encoded_coords, feature) + } + if 'full_plus_code' in row.index: + scalar_features['plus_code'] = [row['full_plus_code']] + if 'area_in_meters' in row.index: + scalar_features['area_in_meters'] = [row['area_in_meters']] + yield (encoded_coords, _FeatureUnion(scalar_features=scalar_features)) def _remove_large_images(example: Example) -> Example: @@ -576,7 +588,7 @@ def _generate_examples( pipeline, before_image_patterns: List[str], after_image_patterns: List[str], - coordinates_path: str, + buildings_path: str, large_patch_size: int, example_patch_size: int, resolution: float, @@ -590,7 +602,7 @@ def _generate_examples( pipeline: Beam pipeline. before_image_patterns: List of before image path patterns. after_image_patterns: List of after image path patterns. - coordinates_path: Path to file containing building coordinates. + buildings_path: Path to serialized building footprints GeoDataFrame file. large_patch_size: Size in pixels of before and after image patches for labeling and alignment. Typically 256. example_patch_size: Size of patches to extract into examples. Typically 64. @@ -604,10 +616,10 @@ def _generate_examples( """ scalar_features = ( pipeline - | stage_prefix + 'encode_coordinates_path' >> beam.Create( - [coordinates_path]) + | stage_prefix + 'encode_buildings_path' >> beam.Create( + [buildings_path]) | stage_prefix + 'create_scalar_features' >> beam.FlatMap( - _coordinates_to_scalar_features)) + _extract_scalar_features_from_buildings_file)) input_collections = [scalar_features] after_image_size = large_patch_size @@ -621,7 +633,7 @@ def _generate_examples( before_raster_paths = _expand_patterns(before_image_patterns) before_patches = read_raster.extract_patches_from_rasters( pipeline, - coordinates_path, + buildings_path, before_raster_paths, large_patch_size, resolution, @@ -637,7 +649,7 @@ def _generate_examples( after_raster_paths = _expand_patterns(after_image_patterns) after_patches = read_raster.extract_patches_from_rasters( pipeline, - coordinates_path, + buildings_path, after_raster_paths, after_image_size, resolution, @@ -676,10 +688,11 @@ def _generate_examples( def read_labels_file( path: str, label_property: str, - labels_to_classes: List[str] = None, - max_points: int = None -) -> List[Tuple[float, float, float, str]]: - """Reads labels from a GIS file. + labels_to_classes: List[str], + max_points: int, + output_path: str, +) -> None: + """Reads labels from a GIS file and writes to the standard buildings format. If the "label_property" is a string, then it is assumed to be the name of a class, e.g. "damaged". In labels_to_classes, user can specify the mapping of @@ -694,10 +707,8 @@ def read_labels_file( label_property: The property to use as the label, e.g. "string_label". labels_to_classes: List of string in "class=label" format, e.g. ["no_damage=0", "damaged=1", "destroyed=1"]. - max_points: Number of labeled examples to keep - - Returns: - List of tuples of the form (longitude, latitude, float label). + max_points: Number of labeled examples to keep. + output_path: Buildings file output path. """ # Parse labels_to_classes into dictionary format if specified if labels_to_classes: @@ -715,28 +726,30 @@ def read_labels_file( raise # Generate coordinates from label file - df = gpd.read_file(path).to_crs(epsg=4326) - coordinates = [] - for _, row in df.iterrows(): - centroid = row.geometry.centroid + gdf = gpd.read_file(path) + if max_points: + gdf = gdf.iloc[:max_points] + string_labels = [] + float_labels = [] + for _, row in gdf.iterrows(): label = row[label_property] if isinstance(label, str): try: - float_label = label_to_class_dict[label] - coordinates.append((centroid.x, centroid.y, float_label, label)) + string_labels.append(label) + float_labels.append(label_to_class_dict[label]) except KeyError: logging.warning('Label %s is not recognized.', label) elif isinstance(label, (int, float)): - float_label = float(label) - coordinates.append((centroid.x, centroid.y, float_label, str(label))) + string_labels.append(str(label)) + float_labels.append(float(label)) else: raise ValueError(f'Unrecognized label property type {type(label)}') - if max_points: - coordinates = coordinates[:max_points] - - # logging.info('Read %d labeled coordinates.', len(coordinates)) - return coordinates + output_gdf = gpd.GeoDataFrame( + {'string_label': string_labels, 'label': float_labels}, + geometry=gpd.geometry, + ) + buildings.write_buildings_file(output_gdf, output_path) def parse_gdal_env(settings: List[str]) -> Dict[str, str]: @@ -767,8 +780,8 @@ def generate_examples_pipeline( resolution: float, output_dir: str, num_output_shards: int, - unlabeled_coordinates: List[Tuple[float, float]], - labeled_coordinates: List[Tuple[float, float, float, str]], + buildings_path: str, + buildings_labeled: bool, use_dataflow: bool, gdal_env: Dict[str, str], dataflow_job_name: Optional[str], @@ -788,10 +801,8 @@ def generate_examples_pipeline( resolution: Desired resolution of image patches. output_dir: Parent output directory. num_output_shards: Number of output shards. - unlabeled_coordinates: List of coordinates (longitude, latitude) to extract - unlabeled examples for. - labeled_coordinates: List of coordinates (longitude, latitude, label, - string_label) to extract labeled examples for. + buildings_path: Path to file containing building footprints. + buildings_labeled: True if buildings have labels. use_dataflow: If true, run pipeline on GCP Dataflow. gdal_env: GDAL environment configuration. dataflow_job_name: Name of dataflow job. @@ -816,26 +827,20 @@ def generate_examples_pipeline( accelerator_count=0, ) - coordinates_path = os.path.join(temp_dir, 'coordinates') - if unlabeled_coordinates: - small_examples_output_prefix = ( - os.path.join(output_dir, 'examples', 'unlabeled', 'unlabeled')) - large_examples_output_prefix = ( - os.path.join(output_dir, 'examples', 'unlabeled-large', 'unlabeled')) - labeled_coordinates = [(longitude, latitude, -1.0, '') - for longitude, latitude in unlabeled_coordinates] - utils.write_coordinates_file(labeled_coordinates, coordinates_path) - - elif labeled_coordinates: + if buildings_labeled: small_examples_output_prefix = ( os.path.join(output_dir, 'examples', 'labeled', 'labeled')) large_examples_output_prefix = ( os.path.join(output_dir, 'examples', 'labeled-large', 'labeled')) - utils.write_coordinates_file(labeled_coordinates, coordinates_path) + else: + small_examples_output_prefix = ( + os.path.join(output_dir, 'examples', 'unlabeled', 'unlabeled')) + large_examples_output_prefix = ( + os.path.join(output_dir, 'examples', 'unlabeled-large', 'unlabeled')) with beam.Pipeline(options=pipeline_options) as pipeline: large_examples, small_examples = _generate_examples( - pipeline, before_image_patterns, after_image_patterns, coordinates_path, + pipeline, before_image_patterns, after_image_patterns, buildings_path, large_patch_size, example_patch_size, resolution, gdal_env, 'generate_examples', cloud_detector_model_path) diff --git a/src/skai/generate_examples_test.py b/src/skai/generate_examples_test.py index 99b5e5ea..324a4fd9 100644 --- a/src/skai/generate_examples_test.py +++ b/src/skai/generate_examples_test.py @@ -18,17 +18,17 @@ import os import pathlib import tempfile -from typing import Any, List, Tuple +from typing import Any from absl.testing import absltest from absl.testing import parameterized import apache_beam as beam from apache_beam.testing import test_pipeline -from apache_beam.testing.util import assert_that -from apache_beam.testing.util import equal_to +import apache_beam.testing.util as test_util +import geopandas as gpd import numpy as np +from skai import buildings from skai import generate_examples -from skai import utils import tensorflow as tf @@ -48,7 +48,7 @@ def _deserialize_image(serialized_image: bytes) -> np.ndarray: return tf.io.decode_image(serialized_image).numpy() -def _unordered_all_close(list1: List[Any], list2: List[Any]) -> bool: +def _unordered_all_close(list1: list[Any], list2: list[Any]) -> bool: """Return that two lists of coordinates are close to each other.""" if len(list1) != len(list2): return False @@ -58,14 +58,65 @@ def _unordered_all_close(list1: List[Any], list2: List[Any]) -> bool: return np.allclose(sorted_list1, sorted_list2) +def _create_buildings_file( + coordinates: list[tuple[float, float]], output_path: str +) -> gpd.GeoDataFrame: + longitudes = [c[0] for c in coordinates] + latitudes = [c[1] for c in coordinates] + gdf = gpd.GeoDataFrame( + { + 'area_in_meters': [0.0] * len(coordinates), + }, + geometry=gpd.points_from_xy(longitudes, latitudes), + crs=4326, + ) + buildings.write_buildings_file(gdf, output_path) + + +def _create_labeled_buildings_file( + coordinates: list[tuple[float, float, float, str]], output_path: str +) -> gpd.GeoDataFrame: + longitudes = [c[0] for c in coordinates] + latitudes = [c[1] for c in coordinates] + labels = [c[2] for c in coordinates] + string_labels = [c[3] for c in coordinates] + area_in_meters = [0.0] * len(coordinates) + gdf = gpd.GeoDataFrame( + { + 'label': labels, + 'string_label': string_labels, + 'area_in_meters': area_in_meters, + }, + geometry=gpd.points_from_xy(longitudes, latitudes), + crs=4326, + ) + buildings.write_buildings_file(gdf, output_path) + + +def _create_buildings_file_with_plus_code( + coordinates: list[tuple[float, float]], output_path: str +) -> gpd.GeoDataFrame: + longitudes = [c[0] for c in coordinates] + latitudes = [c[1] for c in coordinates] + gdf = gpd.GeoDataFrame( + { + 'area_in_meters': [0.0] * len(coordinates), + 'full_plus_code': ['abc'] * len(coordinates), + }, + geometry=gpd.points_from_xy(longitudes, latitudes), + crs=4326, + ) + buildings.write_buildings_file(gdf, output_path) + + def _check_examples( before_image_id: str, after_image_id: str, small_patch_size: int, large_patch_size: int, - expected_coordinates: List[Tuple[float, float, float]], - expected_string_labels: List[str], - expected_plus_codes: List[str], + expected_coordinates: list[tuple[float, float, float]], + expected_string_labels: list[str], + expected_plus_codes: list[str], expect_blank_before: bool, expect_large_patch: bool): """Validates examples generated from beam pipeline. @@ -106,7 +157,8 @@ def _check_examples_internal(actual_examples): 'example_id', 'int64_id', 'plus_code', - 'string_label' + 'string_label', + 'area_in_meters', ]) if expect_large_patch: expected_feature_names.update( @@ -192,7 +244,7 @@ def setUp(self): super().setUp() current_dir = pathlib.Path(__file__).parent self.test_image_path = str(current_dir / TEST_IMAGE_PATH) - self.coordinates_path = str(current_dir / 'coordinates') + self.buildings_path = str(current_dir / 'buildings') self.test_image_path_patterns = str(current_dir / 'test_data/country_*.tif') self.test_config_path = str(current_dir / TEST_CONFIG_PATH) self.test_missing_dataset_name_config_path = str( @@ -205,18 +257,19 @@ def setUp(self): def testGenerateExamplesFn(self): """Tests GenerateExamplesFn class.""" - unlabeled_coordinates = [(178.482925, -16.632893, -1.0, ''), - (178.482283, -16.632279, -1.0, '')] - utils.write_coordinates_file(unlabeled_coordinates, self.coordinates_path) + _create_buildings_file( + [(178.482925, -16.632893), (178.482283, -16.632279)], + self.buildings_path, + ) with test_pipeline.TestPipeline() as pipeline: large_examples, small_examples = generate_examples._generate_examples( pipeline, [self.test_image_path], [self.test_image_path], - self.coordinates_path, 62, 32, 0.5, {}, 'unlabeled') + self.buildings_path, 62, 32, 0.5, {}, 'unlabeled') # Example at second input coordinate should be dropped because its patch # falls mostly outside the before and after image bounds. - assert_that( + test_util.assert_that( large_examples, _check_examples( self.test_image_path, @@ -231,7 +284,7 @@ def testGenerateExamplesFn(self): ), label='assert_large_examples', ) - assert_that( + test_util.assert_that( small_examples, _check_examples( self.test_image_path, @@ -250,16 +303,20 @@ def testGenerateExamplesFn(self): def testGenerateExamplesFnLabeled(self): """Tests GenerateExamplesFn class.""" - labeled_coordinates = [(178.482925, -16.632893, 0, 'no_damage'), - (178.482924, -16.632894, 1, 'destroyed')] - utils.write_coordinates_file(labeled_coordinates, self.coordinates_path) + _create_labeled_buildings_file( + [ + (178.482925, -16.632893, 0, 'no_damage'), + (178.482924, -16.632894, 1, 'destroyed'), + ], + self.buildings_path, + ) with test_pipeline.TestPipeline() as pipeline: large_examples, small_examples = generate_examples._generate_examples( pipeline, [self.test_image_path], [self.test_image_path], - self.coordinates_path, 62, 32, 0.5, {}, 'labeled') + self.buildings_path, 62, 32, 0.5, {}, 'labeled') - assert_that( + test_util.assert_that( large_examples, _check_examples( self.test_image_path, @@ -275,7 +332,7 @@ def testGenerateExamplesFnLabeled(self): label='assert_large_examples', ) - assert_that( + test_util.assert_that( small_examples, _check_examples( self.test_image_path, @@ -291,21 +348,68 @@ def testGenerateExamplesFnLabeled(self): label='assert_small_examples', ) + def testGenerateExamplesFnWithPlusCodes(self): + """Tests GenerateExamplesFn class.""" + + _create_buildings_file_with_plus_code( + [(178.482925, -16.632893), (178.482283, -16.632279)], + self.buildings_path, + ) + + with test_pipeline.TestPipeline() as pipeline: + large_examples, small_examples = generate_examples._generate_examples( + pipeline, [self.test_image_path], [self.test_image_path], + self.buildings_path, 62, 32, 0.5, {}, 'unlabeled') + + # Example at second input coordinate should be dropped because its patch + # falls mostly outside the before and after image bounds. + test_util.assert_that( + large_examples, + _check_examples( + self.test_image_path, + self.test_image_path, + 32, + 62, + [(178.482925, -16.632893, -1.0)], + [''], + ['abc'], + False, + True, + ), + label='assert_large_examples', + ) + test_util.assert_that( + small_examples, + _check_examples( + self.test_image_path, + self.test_image_path, + 32, + 62, + [(178.482925, -16.632893, -1.0)], + [''], + ['abc'], + False, + False, + ), + label='assert_small_examples', + ) + def testGenerateExamplesFnNoBefore(self): """Tests GenerateExamplesFn class without before image.""" - coordinates = [(178.482925, -16.632893, -1.0, ''), - (178.482283, -16.632279, -1.0, '')] - utils.write_coordinates_file(coordinates, self.coordinates_path) + _create_buildings_file( + [(178.482925, -16.632893), (178.482283, -16.632279)], + self.buildings_path, + ) with test_pipeline.TestPipeline() as pipeline: large_examples, small_examples = generate_examples._generate_examples( - pipeline, [], [self.test_image_path], self.coordinates_path, 62, 32, + pipeline, [], [self.test_image_path], self.buildings_path, 62, 32, 0.5, {}, 'unlabeled') # Example at second input coordinate should be dropped because its patch # falls mostly outside the before and after image bounds. - assert_that( + test_util.assert_that( large_examples, _check_examples( '', @@ -321,7 +425,7 @@ def testGenerateExamplesFnNoBefore(self): label='assert_large_examples', ) - assert_that( + test_util.assert_that( small_examples, _check_examples( '', @@ -339,8 +443,7 @@ def testGenerateExamplesFnNoBefore(self): def testGenerateExampleFnPathPattern(self): """Test GenerateExampleFn class with a path pattern.""" - coordinates = [(178.482925, -16.632893, -1.0, '')] - utils.write_coordinates_file(coordinates, self.coordinates_path) + _create_buildings_file([(178.482925, -16.632893)], self.buildings_path) expected_before_image_ids = glob.glob(self.test_image_path_patterns) @@ -348,7 +451,7 @@ def testGenerateExampleFnPathPattern(self): # The path patterns specify two before images. large_examples, small_examples = generate_examples._generate_examples( pipeline, [self.test_image_path_patterns], - [self.test_image_path], self.coordinates_path, 62, 32, 0.5, + [self.test_image_path], self.buildings_path, 62, 32, 0.5, {}, 'unlabeled') small_examples_before_ids = ( @@ -358,17 +461,24 @@ def testGenerateExampleFnPathPattern(self): large_examples | 'Map large examples to before image ids' >> beam.Map(_get_before_image_id)) - assert_that(small_examples_before_ids, - equal_to(expected_before_image_ids), - 'check small examples before image ids') + test_util.assert_that( + small_examples_before_ids, + test_util.equal_to(expected_before_image_ids), + 'check small examples before image ids', + ) - assert_that(large_examples_before_ids, - equal_to(expected_before_image_ids), - 'check large examples before image ids') + test_util.assert_that( + large_examples_before_ids, + test_util.equal_to(expected_before_image_ids), + 'check large examples before image ids', + ) def testGenerateExamplesPipeline(self): output_dir = tempfile.mkdtemp(dir=absltest.TEST_TMPDIR.value) - coordinates = [(178.482925, -16.632893), (178.482283, -16.632279)] + _create_buildings_file( + [(178.482925, -16.632893), (178.482283, -16.632279)], + self.buildings_path, + ) generate_examples.generate_examples_pipeline( before_image_patterns=[self.test_image_path], after_image_patterns=[self.test_image_path], @@ -377,8 +487,8 @@ def testGenerateExamplesPipeline(self): resolution=0.5, output_dir=output_dir, num_output_shards=1, - unlabeled_coordinates=coordinates, - labeled_coordinates=[], + buildings_path=self.buildings_path, + buildings_labeled=False, use_dataflow=False, gdal_env={}, dataflow_job_name='test', diff --git a/src/skai/open_street_map.py b/src/skai/open_street_map.py index 7072e50a..9d1886da 100644 --- a/src/skai/open_street_map.py +++ b/src/skai/open_street_map.py @@ -16,11 +16,14 @@ Please see https://wiki.openstreetmap.org/wiki/Overpass_API for details. """ -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional import xml.etree.ElementTree as ET +import geopandas as gpd import requests import shapely.geometry +from skai import buildings + Polygon = shapely.geometry.polygon.Polygon Point = shapely.geometry.point.Point @@ -123,18 +126,17 @@ def get_buildings_in_region(region: Polygon, def get_building_centroids_in_regions( - regions: List[Polygon], overpass_url: str) -> List[Tuple[float, float]]: + regions: List[Polygon], overpass_url: str, output_path: str) -> None: """Queries OpenStreetMap Overpass API for all building centroids in a region. Args: regions: Regions of interest as polygons. Must be in EPSG:4326 (long/lat). overpass_url: Overpass URL e.g. https://lz4.overpass-api.de/api/interpreter. - - Returns: - A list of building centroids in all regions. + output_path: Save footprints to this path as a GeoPackage. """ - centroids = [] + polygons = [] for region in regions: - polygons = get_buildings_in_region(region, overpass_url) - centroids.extend([(p.centroid.x, p.centroid.y) for p in polygons]) - return centroids + polygons.extend(get_buildings_in_region(region, overpass_url)) + buildings.write_buildings_file( + gpd.GeoDataFrame(geometry=polygons), output_path + ) diff --git a/src/skai/read_raster.py b/src/skai/read_raster.py index fefbddfc..db65f72f 100644 --- a/src/skai/read_raster.py +++ b/src/skai/read_raster.py @@ -27,6 +27,7 @@ import rtree import shapely.geometry +from skai import buildings from skai import utils Metrics = beam.metrics.Metrics @@ -241,17 +242,17 @@ def _resample_image(image: np.ndarray, patch_size: int) -> np.ndarray: image, (patch_size, patch_size), interpolation=cv2.INTER_CUBIC) -def _coordinates_to_groups( +def _buildings_to_groups( raster_path: str, - coordinates_path: str, + buildings_path: str, patch_size: int, resolution: float, gdal_env: Dict[str, str]) -> Iterable[Tuple[str, _WindowGroup]]: - """Converts a list of coordinates into pixel windows and then groups them. + """Converts building centroids into pixel windows and then groups them. Args: raster_path: Path to raster image. - coordinates_path: Path to coordinates file. + buildings_path: Path to buildings file. patch_size: Size of patches to extract. resolution: Resolution of patches to extract. gdal_env: GDAL environment configuration. @@ -259,10 +260,11 @@ def _coordinates_to_groups( Yields: Tuples of raster path and grouped windows. """ - coordinates = utils.read_coordinates_file(coordinates_path) - coords_with_ids = [(utils.encode_coordinates(longitude, - latitude), longitude, latitude) - for longitude, latitude, _, _ in coordinates] + coords_df = buildings.read_building_coordinates(buildings_path) + coords_with_ids = [ + (utils.encode_coordinates(lng, lat), lng, lat) + for lng, lat in zip(coords_df.longitude, coords_df.latitude) + ] with rasterio.Env(**gdal_env): raster = rasterio.open(raster_path) raster_res = _get_raster_resolution_in_meters(raster) @@ -343,7 +345,7 @@ def process( def extract_patches_from_rasters( pipeline: beam.Pipeline, - coordinates_path: str, + buildings_path: str, raster_paths: List[str], patch_size: int, resolution: float, @@ -353,8 +355,7 @@ def extract_patches_from_rasters( Args: pipeline: Beam pipeline. - coordinates_path: Path to CSV file containing the longitude, latitude - coordinates to extract patches for. + buildings_path: Path to building footprints file. raster_paths: Raster paths. patch_size: Desired size of output patches. resolution: Desired resolution of output patches. @@ -367,8 +368,8 @@ def extract_patches_from_rasters( return (pipeline | stage_prefix + '_encode_raster_paths' >> beam.Create(raster_paths) | stage_prefix + '_make_window_groups' >> beam.FlatMap( - _coordinates_to_groups, - coordinates_path=coordinates_path, + _buildings_to_groups, + buildings_path=buildings_path, patch_size=patch_size, resolution=resolution, gdal_env=gdal_env) diff --git a/src/skai/read_raster_test.py b/src/skai/read_raster_test.py index 1f77a9c0..68ca33c3 100644 --- a/src/skai/read_raster_test.py +++ b/src/skai/read_raster_test.py @@ -19,11 +19,12 @@ import apache_beam as beam from apache_beam.testing import test_pipeline -from apache_beam.testing.util import assert_that +import apache_beam.testing.util as test_util +import geopandas as gpd import rasterio +from skai import buildings from skai import read_raster -from skai import utils TEST_IMAGE_PATH = 'test_data/blank.tif' @@ -31,6 +32,17 @@ WindowGroup = read_raster._WindowGroup +def _create_buildings_file( + coordinates: list[tuple[float, float]], output_path: str +) -> gpd.GeoDataFrame: + longitudes = [c[0] for c in coordinates] + latitudes = [c[1] for c in coordinates] + gdf = gpd.GeoDataFrame( + geometry=gpd.points_from_xy(longitudes, latitudes), crs=4326 + ) + buildings.write_buildings_file(gdf, output_path) + + class ReadRasterTest(absltest.TestCase): def setUp(self): @@ -99,19 +111,19 @@ def _check_output_patches(patches): assert patches[2][1][0] == expected_image_path assert patches[2][1][1].shape == (64, 64, 3) - assert_that(patches, _check_output_patches) + test_util.assert_that(patches, _check_output_patches) - def test_coordinates_to_groups(self): + def test_buildings_to_groups(self): coordinates = [ - (178.482925, -16.632893, -1.0, 'no_damage'), - (178.482283, -16.632279, -1.0, 'no_damage'), - (178.482284, -16.632279, -1.0, 'no_damage')] + (178.482925, -16.632893), + (178.482283, -16.632279), + (178.482284, -16.632279)] with tempfile.NamedTemporaryFile(dir=absltest.TEST_TMPDIR.value) as f: - coordinates_path = f.name - utils.write_coordinates_file(coordinates, coordinates_path) - groups = list(read_raster._coordinates_to_groups( - self.test_image_path, coordinates_path, 32, 0.5, {})) + buildings_path = f.name + _create_buildings_file(coordinates, buildings_path) + groups = list(read_raster._buildings_to_groups( + self.test_image_path, buildings_path, 32, 0.5, {})) self.assertLen(groups, 2) def test_get_raster_bounds(self): diff --git a/src/skai/utils.py b/src/skai/utils.py index 811be540..9568ad7b 100644 --- a/src/skai/utils.py +++ b/src/skai/utils.py @@ -16,10 +16,8 @@ import base64 import io -import os -import pickle import struct -from typing import Any, Iterable, List, Sequence, Tuple +from typing import Iterable, List, Sequence, Tuple from absl import flags import PIL.Image @@ -127,16 +125,3 @@ def encode_coordinates(longitude: float, latitude: float) -> str: def decode_coordinates(encoded_coords: str) -> Tuple[float, float]: buffer = base64.b16decode(encoded_coords.encode('ascii')) return struct.unpack(' None: - output_dir = os.path.dirname(path) - if not tf.io.gfile.exists(output_dir): - tf.io.gfile.makedirs(output_dir) - with tf.io.gfile.GFile(path, 'wb') as f: - pickle.dump(coordinates, f) - - -def read_coordinates_file(path: str) -> List[Any]: - with tf.io.gfile.GFile(path, 'rb') as f: - return pickle.load(f)