From 867e70dea03365a5276f3a4fab093df4bdefd7f7 Mon Sep 17 00:00:00 2001
From: Dugal Harris <dugalh@gmail.com>
Date: Wed, 3 Jul 2024 19:11:14 +0200
Subject: [PATCH 01/16] cloud mask full image in S2 images with no QA60 or
 cloud probability data (#24)

---
 geedim/mask.py     |  68 +++++++++++++--------
 tests/test_mask.py | 148 +++++++++++++++++++++++++++++++--------------
 2 files changed, 148 insertions(+), 68 deletions(-)

diff --git a/geedim/mask.py b/geedim/mask.py
index f7bf5f7..d596ef5 100644
--- a/geedim/mask.py
+++ b/geedim/mask.py
@@ -13,10 +13,12 @@
     See the License for the specific language governing permissions and
     limitations under the License.
 """
+
 import logging
 from typing import Dict
 
 import ee
+
 import geedim.schema
 from geedim.download import BaseImage
 from geedim.enums import CloudMaskMethod
@@ -160,25 +162,23 @@ def _set_region_stats(self, region: Dict = None, scale: float = None):
             reducer="sum", geometry=region, crs=proj, scale=scale, bestEffort=True, maxPixels=1e6
         )
 
-        fill_portion = (
-            ee.Number(sums.get('FILL_MASK')).divide(ee.Number(sums.get('REGION_SUM'))).multiply(100)
-        )
+        fill_portion = ee.Number(sums.get('FILL_MASK')).divide(ee.Number(sums.get('REGION_SUM'))).multiply(100)
 
         # set the encapsulated image properties
         self.ee_image = self.ee_image.set('FILL_PORTION', fill_portion)
         # set CLOUDLESS_PORTION=100 for the generic case, where cloud/shadow masking is not supported
-        self.ee_image = self.ee_image.set('CLOUDLESS_PORTION', 100.)
+        self.ee_image = self.ee_image.set('CLOUDLESS_PORTION', 100.0)
 
     def mask_clouds(self):
-        """ Apply the cloud/shadow mask if supported, otherwise apply the fill mask. """
+        """Apply the cloud/shadow mask if supported, otherwise apply the fill mask."""
         self.ee_image = self.ee_image.updateMask(self.ee_image.select('FILL_MASK'))
 
 
 class CloudMaskedImage(MaskedImage):
-    """ A base class for encapsulating cloud/shadow masked images. """
+    """A base class for encapsulating cloud/shadow masked images."""
 
     def _cloud_dist(self, cloudless_mask: ee.Image = None, max_cloud_dist: float = 5000) -> ee.Image:
-        """ Find the cloud/shadow distance in units of 10m. """
+        """Find the cloud/shadow distance in units of 10m."""
         if not cloudless_mask:
             cloudless_mask = self.ee_image.select('CLOUDLESS_MASK')
         proj = get_projection(self.ee_image, min_scale=False)  # use maximum scale projection to save processing time
@@ -190,9 +190,11 @@ def _cloud_dist(self, cloudless_mask: ee.Image = None, max_cloud_dist: float = 5
         cloud_pix = ee.Number(max_cloud_dist).divide(proj.nominalScale()).round()  # cloud_dist in pixels
 
         # Find distance to nearest cloud/shadow (units of 10m).
-        cloud_dist = cloud_shadow_mask.fastDistanceTransform(
-            neighborhood=cloud_pix, units='pixels', metric='squared_euclidean'
-        ).sqrt().multiply(proj.nominalScale().divide(10))
+        cloud_dist = (
+            cloud_shadow_mask.fastDistanceTransform(neighborhood=cloud_pix, units='pixels', metric='squared_euclidean')
+            .sqrt()
+            .multiply(proj.nominalScale().divide(10))
+        )
 
         # Reproject to force calculation at correct scale.
         cloud_dist = cloud_dist.reproject(crs=proj, scale=proj.nominalScale()).rename('CLOUD_DIST')
@@ -218,7 +220,7 @@ def _set_region_stats(self, region: Dict = None, scale: float = None):
         if not region:
             region = self.ee_image.geometry()  # use the image footprint
 
-        proj = get_projection(self.ee_image, min_scale=False)  # get projection of minimum scale band
+        proj = get_projection(self.ee_image, min_scale=False)  # get projection of maximum scale band
         # If _proj_scale is set, use that as the scale, otherwise use the proj.nomimalScale().  For non-composite images
         # these should be the same value.  For composite images, there is no `fixed` projection, hence the
         # need for _proj_scale.
@@ -234,9 +236,7 @@ def _set_region_stats(self, region: Dict = None, scale: float = None):
         sums = stats_image.reduceRegion(
             reducer="sum", geometry=region, crs=proj, scale=scale, bestEffort=True, maxPixels=1e6
         ).rename(['FILL_MASK', 'CLOUDLESS_MASK'], ['FILL_PORTION', 'CLOUDLESS_PORTION'])
-        fill_portion = (
-            ee.Number(sums.get('FILL_PORTION')).divide(ee.Number(sums.get('REGION_SUM'))).multiply(100)
-        )
+        fill_portion = ee.Number(sums.get('FILL_PORTION')).divide(ee.Number(sums.get('REGION_SUM'))).multiply(100)
         cloudless_portion = (
             ee.Number(sums.get('CLOUDLESS_PORTION')).divide(ee.Number(sums.get('FILL_PORTION'))).multiply(100)
         )
@@ -255,7 +255,7 @@ def _aux_image(self, **kwargs) -> ee.Image:
         raise NotImplementedError('This virtual method should be overridden by derived classes.')
 
     def mask_clouds(self):
-        """ Apply the cloud/shadow mask. """
+        """Apply the cloud/shadow mask."""
         self.ee_image = self.ee_image.updateMask(self.ee_image.select('CLOUDLESS_MASK'))
 
 
@@ -315,12 +315,20 @@ def _aux_image(self, mask_shadows: bool = True, mask_cirrus: bool = True, max_cl
 
 
 class Sentinel2ClImage(CloudMaskedImage):
-    """ Base class for cloud/shadow masking of Sentinel-2 TOA and SR images. """
+    """Base class for cloud/shadow masking of Sentinel-2 TOA and SR images."""
 
     def _aux_image(
-        self, s2_toa: bool = False, mask_cirrus: bool = True, mask_shadows: bool = True,
-        mask_method: CloudMaskMethod = CloudMaskMethod.cloud_prob, prob: float = 60, dark: float = 0.15,
-        shadow_dist: int = 1000, buffer: int = 50, cdi_thresh: float = None, max_cloud_dist: int = 5000
+        self,
+        s2_toa: bool = False,
+        mask_cirrus: bool = True,
+        mask_shadows: bool = True,
+        mask_method: CloudMaskMethod = CloudMaskMethod.cloud_prob,
+        prob: float = 60,
+        dark: float = 0.15,
+        shadow_dist: int = 1000,
+        buffer: int = 50,
+        cdi_thresh: float = None,
+        max_cloud_dist: int = 5000,
     ) -> ee.Image:
         """
         Derive cloud, shadow and validity masks for the encapsulated image.
@@ -365,9 +373,17 @@ def _aux_image(
         mask_method = CloudMaskMethod(mask_method)
 
         def get_cloud_prob(ee_im):
-            """Get the cloud probability image from COPERNICUS/S2_CLOUD_PROBABILITY that corresponds to `ee_im`."""
+            """Get the cloud probability image from COPERNICUS/S2_CLOUD_PROBABILITY that corresponds to `ee_im`, if it
+            exists.  Otherwise, get a 100% probability.
+            """
+            # create a default 100% cloud probability image to revert to if the cloud probability image does not exist
+            # (use a 10m projection to prevent any issue with _set_region_stats() etc auto-scale)
+            default_cloud_prob = ee.Image(100).setDefaultProjection(ee_im.select('QA10').projection())
+
+            # get the cloud probability image if it exists, otherwise revert to default_cloud_prob
             filt = ee.Filter.eq('system:index', ee_im.get('system:index'))
             cloud_prob = ee.ImageCollection('COPERNICUS/S2_CLOUD_PROBABILITY').filter(filt).first()
+            cloud_prob = ee.Image(ee.List([cloud_prob, default_cloud_prob]).reduce(ee.Reducer.firstNonNull()))
             return cloud_prob.rename('CLOUD_PROB')
 
         def get_cloud_mask(ee_im, cloud_prob=None):
@@ -379,6 +395,9 @@ def get_cloud_mask(ee_im, cloud_prob=None):
             else:
                 qa = ee_im.select('QA60')
                 cloud_mask = qa.bitwiseAnd(1 << 10).neq(0)
+                # set cloud_mask to true for images post Feb 2022 when QA60 was no longer populated
+                qa_invalid = ee_im.date().difference(ee.Date('2022-02-01'), 'days').gt(0)
+                cloud_mask = cloud_mask.bitwiseOr(qa_invalid)
                 if mask_cirrus:
                     cloud_mask = cloud_mask.Or(qa.bitwiseAnd(1 << 11).neq(0))
             return cloud_mask.rename('CLOUD_MASK')
@@ -393,7 +412,7 @@ def get_cdi_cloud_mask(ee_im):
             else:
                 # get the Sentinel-2 TOA image that corresponds to ee_im
                 idx = ee_im.get('system:index')
-                s2_toa_image = (ee.ImageCollection('COPERNICUS/S2').filter(ee.Filter.eq('system:index', idx)).first())
+                s2_toa_image = ee.ImageCollection('COPERNICUS/S2').filter(ee.Filter.eq('system:index', idx)).first()
             cdi_image = ee.Algorithms.Sentinel2.CDI(s2_toa_image)
             return cdi_image.lt(cdi_thresh).rename('CDI_CLOUD_MASK')
 
@@ -401,6 +420,7 @@ def get_shadow_mask(ee_im, cloud_mask):
             """Given a cloud mask, get a shadow mask for ee_im."""
             dark_mask = ee_im.select('B8').lt(dark * 1e4)
             if not s2_toa:
+                # exclude water
                 dark_mask = ee_im.select('SCL').neq(6).And(dark_mask)
 
             proj = get_projection(ee_im, min_scale=False)
@@ -454,21 +474,21 @@ def get_shadow_mask(ee_im, cloud_mask):
 
 
 class Sentinel2SrClImage(Sentinel2ClImage):
-    """ Class for cloud/shadow masking of Sentinel-2 SR (COPERNICUS/S2_SR) images. """
+    """Class for cloud/shadow masking of Sentinel-2 SR (COPERNICUS/S2_SR) images."""
 
     def _aux_image(self, s2_toa: bool = False, **kwargs) -> ee.Image:
         return Sentinel2ClImage._aux_image(self, s2_toa=False, **kwargs)
 
 
 class Sentinel2ToaClImage(Sentinel2ClImage):
-    """ Class for cloud/shadow masking of Sentinel-2 TOA (COPERNICUS/S2) images. """
+    """Class for cloud/shadow masking of Sentinel-2 TOA (COPERNICUS/S2) images."""
 
     def _aux_image(self, s2_toa: bool = False, **kwargs) -> ee.Image:
         return Sentinel2ClImage._aux_image(self, s2_toa=True, **kwargs)
 
 
 def class_from_id(image_id: str) -> type:
-    """ Return the *Image class that corresponds to the provided Earth Engine image/collection ID. """
+    """Return the *Image class that corresponds to the provided Earth Engine image/collection ID."""
     ee_coll_name, _ = split_id(image_id)
     if image_id in geedim.schema.collection_schema:
         return geedim.schema.collection_schema[image_id]['image_type']
diff --git a/tests/test_mask.py b/tests/test_mask.py
index 84d30fd..a11f77b 100644
--- a/tests/test_mask.py
+++ b/tests/test_mask.py
@@ -13,17 +13,19 @@
     See the License for the specific language governing permissions and
     limitations under the License.
 """
+
 from typing import Dict
 
 import ee
 import numpy as np
 import pytest
 import rasterio as rio
-from geedim.mask import MaskedImage, get_projection, class_from_id
+
+from geedim.mask import MaskedImage, get_projection, class_from_id, Sentinel2SrClImage
 
 
 def test_class_from_id(landsat_image_ids, s2_sr_image_id, s2_toa_hm_image_id, generic_image_ids):
-    """  Test class_from_id(). """
+    """Test class_from_id()."""
     from geedim.mask import LandsatImage, Sentinel2SrClImage, Sentinel2ToaClImage
 
     assert all([class_from_id(im_id) == LandsatImage for im_id in landsat_image_ids])
@@ -33,33 +35,47 @@ def test_class_from_id(landsat_image_ids, s2_sr_image_id, s2_toa_hm_image_id, ge
 
 
 def test_from_id():
-    """ Test MaskedImage.from_id() sets _id. """
+    """Test MaskedImage.from_id() sets _id."""
     ee_id = 'MODIS/006/MCD43A4/2022_01_01'
     gd_image = MaskedImage.from_id(ee_id)
     assert gd_image._id == ee_id
 
 
 @pytest.mark.parametrize(
-    'masked_image', [
-        'user_masked_image', 'modis_nbar_masked_image', 'gch_masked_image', 's1_sar_masked_image',
-        'gedi_agb_masked_image', 'gedi_cth_masked_image', 'landsat_ndvi_masked_image'
-    ]
+    'masked_image',
+    [
+        'user_masked_image',
+        'modis_nbar_masked_image',
+        'gch_masked_image',
+        's1_sar_masked_image',
+        'gedi_agb_masked_image',
+        'gedi_cth_masked_image',
+        'landsat_ndvi_masked_image',
+    ],
 )
 def test_gen_aux_bands_exist(masked_image: str, request: pytest.FixtureRequest):
-    """ Test the presence of auxiliary band (i.e. FILL_MASK) in generic masked images. """
+    """Test the presence of auxiliary band (i.e. FILL_MASK) in generic masked images."""
     masked_image: MaskedImage = request.getfixturevalue(masked_image)
     band_names = masked_image.ee_image.bandNames().getInfo()
     assert 'FILL_MASK' in band_names
 
 
 @pytest.mark.parametrize(
-    'masked_image', [
-        's2_sr_masked_image', 's2_toa_masked_image', 's2_sr_hm_masked_image', 's2_toa_hm_masked_image',
-        'l9_masked_image', 'l8_masked_image', 'l7_masked_image', 'l5_masked_image', 'l4_masked_image'
-    ]
+    'masked_image',
+    [
+        's2_sr_masked_image',
+        's2_toa_masked_image',
+        's2_sr_hm_masked_image',
+        's2_toa_hm_masked_image',
+        'l9_masked_image',
+        'l8_masked_image',
+        'l7_masked_image',
+        'l5_masked_image',
+        'l4_masked_image',
+    ],
 )
 def test_cloud_mask_aux_bands_exist(masked_image: str, request: pytest.FixtureRequest):
-    """ Test the presence of auxiliary bands in cloud masked images. """
+    """Test the presence of auxiliary bands in cloud masked images."""
     masked_image: MaskedImage = request.getfixturevalue(masked_image)
     band_names = masked_image.ee_image.bandNames().getInfo()
     exp_band_names = ['CLOUD_MASK', 'SHADOW_MASK', 'FILL_MASK', 'CLOUDLESS_MASK', 'CLOUD_DIST']
@@ -68,12 +84,25 @@ def test_cloud_mask_aux_bands_exist(masked_image: str, request: pytest.FixtureRe
 
 
 @pytest.mark.parametrize(
-    'masked_image', [
-        's2_sr_masked_image', 's2_toa_masked_image', 's2_sr_hm_masked_image', 's2_toa_hm_masked_image',
-        'l9_masked_image', 'l8_masked_image', 'l7_masked_image', 'l5_masked_image', 'l4_masked_image',
-        'user_masked_image', 'modis_nbar_masked_image', 'gch_masked_image', 's1_sar_masked_image',
-        'gedi_agb_masked_image', 'gedi_cth_masked_image', 'landsat_ndvi_masked_image'
-    ]
+    'masked_image',
+    [
+        's2_sr_masked_image',
+        's2_toa_masked_image',
+        's2_sr_hm_masked_image',
+        's2_toa_hm_masked_image',
+        'l9_masked_image',
+        'l8_masked_image',
+        'l7_masked_image',
+        'l5_masked_image',
+        'l4_masked_image',
+        'user_masked_image',
+        'modis_nbar_masked_image',
+        'gch_masked_image',
+        's1_sar_masked_image',
+        'gedi_agb_masked_image',
+        'gedi_cth_masked_image',
+        'landsat_ndvi_masked_image',
+    ],
 )
 def test_set_region_stats(masked_image: str, region_100ha, request: pytest.FixtureRequest):
     """
@@ -88,7 +117,7 @@ def test_set_region_stats(masked_image: str, region_100ha, request: pytest.Fixtu
 
 @pytest.mark.parametrize('image_id', ['l9_image_id', 'l8_image_id', 'l7_image_id', 'l5_image_id', 'l4_image_id'])
 def test_landsat_cloudless_portion(image_id: str, request: pytest.FixtureRequest):
-    """ Test `geedim` CLOUDLESS_PORTION for the whole image against related Landsat CLOUD_COVER property. """
+    """Test `geedim` CLOUDLESS_PORTION for the whole image against related Landsat CLOUD_COVER property."""
     image_id: MaskedImage = request.getfixturevalue(image_id)
     masked_image = MaskedImage.from_id(image_id, mask_shadows=False, mask_cirrus=False)
     masked_image._set_region_stats()
@@ -101,7 +130,7 @@ def test_landsat_cloudless_portion(image_id: str, request: pytest.FixtureRequest
 
 @pytest.mark.parametrize('image_id', ['s2_toa_image_id', 's2_sr_image_id', 's2_toa_hm_image_id', 's2_sr_hm_image_id'])
 def test_s2_cloudless_portion(image_id: str, request: pytest.FixtureRequest):
-    """ Test `geedim` CLOUDLESS_PORTION for the whole image against CLOUDY_PIXEL_PERCENTAGE Sentinel-2 property. """
+    """Test `geedim` CLOUDLESS_PORTION for the whole image against CLOUDY_PIXEL_PERCENTAGE Sentinel-2 property."""
     image_id: str = request.getfixturevalue(image_id)
     masked_image = MaskedImage.from_id(image_id, mask_method='qa', mask_shadows=False, mask_cirrus=False)
     masked_image._set_region_stats()
@@ -113,7 +142,7 @@ def test_s2_cloudless_portion(image_id: str, request: pytest.FixtureRequest):
 
 @pytest.mark.parametrize('image_id', ['l9_image_id'])
 def test_landsat_cloudmask_params(image_id: str, request: pytest.FixtureRequest):
-    """ Test Landsat cloud/shadow masking `mask_shadows` and `mask_cirrus` parameters. """
+    """Test Landsat cloud/shadow masking `mask_shadows` and `mask_cirrus` parameters."""
     image_id: str = request.getfixturevalue(image_id)
     masked_image = MaskedImage.from_id(image_id, mask_shadows=False, mask_cirrus=False)
     masked_image._set_region_stats()
@@ -135,7 +164,7 @@ def test_landsat_cloudmask_params(image_id: str, request: pytest.FixtureRequest)
 
 @pytest.mark.parametrize('image_id', ['s2_sr_image_id', 's2_toa_image_id'])
 def test_s2_cloudmask_mask_shadows(image_id: str, region_10000ha: Dict, request: pytest.FixtureRequest):
-    """ Test S2 cloud/shadow masking `mask_shadows` parameter. """
+    """Test S2 cloud/shadow masking `mask_shadows` parameter."""
     image_id: str = request.getfixturevalue(image_id)
     masked_image = MaskedImage.from_id(image_id, mask_shadows=False)
     masked_image._set_region_stats(region_10000ha)
@@ -150,7 +179,7 @@ def test_s2_cloudmask_mask_shadows(image_id: str, region_10000ha: Dict, request:
 
 @pytest.mark.parametrize('image_id', ['s2_sr_image_id', 's2_toa_image_id'])
 def test_s2_cloudmask_prob(image_id: str, region_10000ha: Dict, request: pytest.FixtureRequest):
-    """ Test S2 cloud/shadow masking `prob` parameter. """
+    """Test S2 cloud/shadow masking `prob` parameter."""
     image_id: str = request.getfixturevalue(image_id)
     masked_image = MaskedImage.from_id(image_id, mask_shadows=True, prob=80)
     masked_image._set_region_stats(region_10000ha)
@@ -159,12 +188,12 @@ def test_s2_cloudmask_prob(image_id: str, region_10000ha: Dict, request: pytest.
     masked_image._set_region_stats(region_10000ha)
     prob40_portion = 100 * masked_image.properties['CLOUDLESS_PORTION'] / masked_image.properties['FILL_PORTION']
     # test there is more cloud (less CLOUDLESS_PORTION) with prob=40 as compared to prob=80
-    assert prob80_portion > prob40_portion
+    assert prob80_portion > prob40_portion > 0
 
 
 @pytest.mark.parametrize('image_id', ['s2_sr_image_id', 's2_toa_image_id'])
 def test_s2_cloudmask_method(image_id: str, region_10000ha: Dict, request: pytest.FixtureRequest):
-    """ Test S2 cloud/shadow masking `mask_method` parameter. """
+    """Test S2 cloud/shadow masking `mask_method` parameter."""
     image_id: str = request.getfixturevalue(image_id)
     masked_image = MaskedImage.from_id(image_id, mask_method='cloud-prob')
     masked_image._set_region_stats(region_10000ha)
@@ -175,11 +204,12 @@ def test_s2_cloudmask_method(image_id: str, region_10000ha: Dict, request: pytes
     # test `mask_method` changes CLOUDLESS_PORTION but not by too much
     assert cloud_prob_portion != qa_portion
     assert cloud_prob_portion == pytest.approx(qa_portion, abs=10)
+    assert (cloud_prob_portion != pytest.approx(0, abs=1)) and (qa_portion != pytest.approx(0, abs=1))
 
 
 @pytest.mark.parametrize('image_id', ['s2_sr_image_id', 's2_toa_image_id'])
 def test_s2_cloudmask_mask_cirrus(image_id: str, region_10000ha: Dict, request: pytest.FixtureRequest):
-    """ Test S2 cloud/shadow masking `mask_cirrus` parameter. """
+    """Test S2 cloud/shadow masking `mask_cirrus` parameter."""
     image_id: str = request.getfixturevalue(image_id)
     masked_image = MaskedImage.from_id(image_id, mask_method='qa', mask_cirrus=False)
     # cloud and shadow free portion
@@ -194,7 +224,7 @@ def test_s2_cloudmask_mask_cirrus(image_id: str, region_10000ha: Dict, request:
 
 @pytest.mark.parametrize('image_id', ['s2_sr_image_id', 's2_toa_image_id'])
 def test_s2_cloudmask_dark(image_id: str, region_10000ha: Dict, request: pytest.FixtureRequest):
-    """ Test S2 cloud/shadow masking `dark` parameter. """
+    """Test S2 cloud/shadow masking `dark` parameter."""
     image_id: str = request.getfixturevalue(image_id)
     masked_image = MaskedImage.from_id(image_id, dark=0.5)
     masked_image._set_region_stats(region_10000ha)
@@ -209,7 +239,7 @@ def test_s2_cloudmask_dark(image_id: str, region_10000ha: Dict, request: pytest.
 
 @pytest.mark.parametrize('image_id', ['s2_sr_image_id', 's2_toa_image_id'])
 def test_s2_cloudmask_shadow_dist(image_id: str, region_10000ha: Dict, request: pytest.FixtureRequest):
-    """ Test S2 cloud/shadow masking `shadow_dist` parameter. """
+    """Test S2 cloud/shadow masking `shadow_dist` parameter."""
     image_id: str = request.getfixturevalue(image_id)
     masked_image = MaskedImage.from_id(image_id, shadow_dist=200)
     masked_image._set_region_stats(region_10000ha)
@@ -224,12 +254,12 @@ def test_s2_cloudmask_shadow_dist(image_id: str, region_10000ha: Dict, request:
 
 @pytest.mark.parametrize('image_id', ['s2_sr_image_id', 's2_toa_image_id'])
 def test_s2_cloudmask_cdi_thresh(image_id: str, region_10000ha: Dict, request: pytest.FixtureRequest):
-    """ Test S2 cloud/shadow masking `cdi_thresh` parameter. """
+    """Test S2 cloud/shadow masking `cdi_thresh` parameter."""
     image_id: str = request.getfixturevalue(image_id)
-    masked_image = MaskedImage.from_id(image_id, cdi_thresh=.5)
+    masked_image = MaskedImage.from_id(image_id, cdi_thresh=0.5)
     masked_image._set_region_stats(region_10000ha)
     cdi_pt5_portion = 100 * masked_image.properties['CLOUDLESS_PORTION'] / masked_image.properties['FILL_PORTION']
-    masked_image = MaskedImage.from_id(image_id, cdi_thresh=-.5)
+    masked_image = MaskedImage.from_id(image_id, cdi_thresh=-0.5)
     masked_image._set_region_stats(region_10000ha)
     cdi_negpt5_portion = 100 * masked_image.properties['CLOUDLESS_PORTION'] / masked_image.properties['FILL_PORTION']
     # test that increasing `cdi_thresh` results in an increase in detected cloud and corresponding decrease in
@@ -239,10 +269,10 @@ def test_s2_cloudmask_cdi_thresh(image_id: str, region_10000ha: Dict, request: p
 
 @pytest.mark.parametrize('image_id, max_cloud_dist', [('s2_sr_image_id', 100), ('s2_sr_hm_image_id', 500)])
 def test_s2_clouddist_max(image_id: str, max_cloud_dist: int, region_10000ha: Dict, request: pytest.FixtureRequest):
-    """ Test S2 cloud distance `max_cloud_dist` parameter. """
+    """Test S2 cloud distance `max_cloud_dist` parameter."""
 
     def get_max_cloud_dist(cloud_dist: ee.Image):
-        """ Get the maximum of `cloud_dist` over region_10000ha. """
+        """Get the maximum of `cloud_dist` over region_10000ha."""
         mcd = cloud_dist.reduceRegion(reducer='max', geometry=region_10000ha, bestEffort=True, maxPixels=1e4)
         return mcd.get('CLOUD_DIST').getInfo() * 10
 
@@ -253,8 +283,33 @@ def get_max_cloud_dist(cloud_dist: ee.Image):
     assert meas_max_cloud_dist == pytest.approx(max_cloud_dist, rel=0.1)
 
 
+def test_s2_no_cloud_prob(s2_sr_hm_image_id: str, region_100ha: dict):
+    """Test S2 cloud probability masking without corresponding 'COPERNICUS/S2_CLOUD_PROBABILITY' image gives 100% cloud."""
+    ee_image = ee.Image(s2_sr_hm_image_id)
+    ee_image = ee_image.set('system:index', 'unknown')  # prevent linking to cloud probability
+    masked_image = Sentinel2SrClImage(ee_image, mask_method='cloud-prob')
+    masked_image._set_region_stats(region_100ha)
+
+    assert masked_image.properties is not None
+    assert masked_image.properties['CLOUDLESS_PORTION'] == pytest.approx(0, abs=1)
+    assert masked_image.properties['FILL_PORTION'] == pytest.approx(100, abs=1)
+
+
+def test_s2_no_qa(s2_sr_hm_image_id: str, region_100ha: dict):
+    """Test S2 QA masking with empty QA60 band (i.e. all S2 images post Feb 2022) gives 100% cloud."""
+    # 2024 image that fills region_100ha and has some cloud
+    masked_image = MaskedImage.from_id(
+        'COPERNICUS/S2_SR_HARMONIZED/20240426T080609_20240426T083054_T34HEJ', mask_method='qa'
+    )
+    masked_image._set_region_stats(region_100ha)
+
+    assert masked_image.properties is not None
+    assert masked_image.properties['CLOUDLESS_PORTION'] == pytest.approx(0, abs=1)
+    assert masked_image.properties['FILL_PORTION'] == pytest.approx(100, abs=1)
+
+
 def get_mask_stats(masked_image: MaskedImage, region: Dict):
-    """  Get cloud/shadow etc aux band statistics for a given MaskedImage and region. """
+    """Get cloud/shadow etc aux band statistics for a given MaskedImage and region."""
     ee_image = masked_image.ee_image
     pan = ee_image.select([0, 1, 2]).reduce(ee.Reducer.mean())
     cdist = ee_image.select('CLOUD_DIST')
@@ -262,7 +317,7 @@ def get_mask_stats(masked_image: MaskedImage, region: Dict):
     shadow_mask = ee_image.select('SHADOW_MASK')
     cloudless_mask = ee_image.select('CLOUDLESS_MASK')
     pan_cloud = pan.updateMask(cloud_mask).rename('PAN_CLOUD')
-    pan_shadow = pan.mask(shadow_mask).rename('PAN_SHADOW')
+    pan_shadow = pan.updateMask(shadow_mask).rename('PAN_SHADOW')
     pan_cloudless = pan.updateMask(cloudless_mask).rename('PAN_CLOUDLESS')
     cdist_cloud = cdist.updateMask(cloud_mask).rename('CDIST_CLOUD')
     cdist_cloudless = cdist.updateMask(cloudless_mask).rename('CDIST_CLOUDLESS')
@@ -282,31 +337,38 @@ def get_mask_stats(masked_image: MaskedImage, region: Dict):
     'masked_image', ['l9_masked_image', 'l8_masked_image', 'l7_masked_image', 'l5_masked_image', 'l4_masked_image']
 )
 def test_landsat_aux_bands(masked_image: str, region_10000ha: Dict, request: pytest.FixtureRequest):
-    """ Test Landsat auxiliary band values. """
+    """Test Landsat auxiliary band values for sanity."""
     masked_image: MaskedImage = request.getfixturevalue(masked_image)
     stats = get_mask_stats(masked_image, region_10000ha)
+    # cloud is brighter than cloudless
     assert stats['PAN_CLOUD'] > stats['PAN_CLOUDLESS']
+    # cloudless is brighter than shadow
     assert stats['PAN_CLOUDLESS'] > stats['PAN_SHADOW']
+    # cloudless areas have greater distance to cloud than cloudy areas
     assert stats['CDIST_CLOUDLESS'] > stats['CDIST_CLOUD']
 
 
-@pytest.mark.parametrize('masked_image',
-    ['s2_sr_masked_image', 's2_toa_masked_image', 's2_sr_hm_masked_image', 's2_toa_hm_masked_image']
+@pytest.mark.parametrize(
+    'masked_image', ['s2_sr_masked_image', 's2_toa_masked_image', 's2_sr_hm_masked_image', 's2_toa_hm_masked_image']
 )  # yapf: disable
 def test_s2_aux_bands(masked_image: str, region_10000ha: Dict, request: pytest.FixtureRequest):
-    """ Test Sentinel-2 auxiliary band values. """
+    """Test Sentinel-2 auxiliary band values for sanity."""
     masked_image: MaskedImage = request.getfixturevalue(masked_image)
     stats = get_mask_stats(masked_image, region_10000ha)
+    # cloud is brighter than cloudless
     assert stats['PAN_CLOUD'] > stats['PAN_CLOUDLESS']
+    # cloudless is brighter than shadow
     assert stats['PAN_CLOUDLESS'] > stats['PAN_SHADOW']
+    # cloudless areas have greater distance to cloud than cloudy areas
     assert stats['CDIST_CLOUDLESS'] > stats['CDIST_CLOUD']
     proj_scale = get_projection(masked_image.ee_image, min_scale=False).nominalScale().getInfo()
+    # min distance to cloud is pixel size
     assert stats['CDIST_MIN'] * 10 == proj_scale
 
 
 @pytest.mark.parametrize('masked_image', ['s2_sr_masked_image', 'l9_masked_image'])
 def test_mask_clouds(masked_image: str, region_100ha: Dict, tmp_path, request: pytest.FixtureRequest):
-    """ Test MaskedImage.mask_clouds() by downloading and examining dataset masks. """
+    """Test MaskedImage.mask_clouds() by downloading and examining dataset masks."""
     masked_image: MaskedImage = request.getfixturevalue(masked_image)
     filename = tmp_path.joinpath(f'test_image.tif')
     masked_image.mask_clouds()
@@ -324,11 +386,9 @@ def test_mask_clouds(masked_image: str, region_100ha: Dict, tmp_path, request: p
 
 
 def test_skysat_region_stats():
-    """ Test _set_region_stats() works on SKYSAT image with no region. """
+    """Test _set_region_stats() works on SKYSAT image with no region."""
     ee_image = ee.Image('SKYSAT/GEN-A/PUBLIC/ORTHO/RGB/s02_20141004T074858Z')
     masked_image = MaskedImage(ee_image)
     masked_image._set_region_stats()
     assert 'FILL_PORTION' in masked_image.properties
     assert masked_image.properties['FILL_PORTION'] > 0.8
-##
-

From 12b2e104f2bd64a35c788e8925237b2ca59ac6ce Mon Sep 17 00:00:00 2001
From: Dugal Harris <dugalh@gmail.com>
Date: Wed, 3 Jul 2024 19:45:27 +0200
Subject: [PATCH 02/16] update rio pin for nodata=-inf compatibility

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index aa48457..2344c30 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ readme = 'README.rst'
 requires-python = '>=3.6'
 dependencies = [
     'numpy>=1.19',
-    'rasterio>=1.1',
+    'rasterio>=1.3',
     'click>=8',
     'tqdm>=4.6',
     'earthengine-api>=0.1.2',

From ce8df7a9aad0ab5823d8e97604270993a0dce11c Mon Sep 17 00:00:00 2001
From: Dugal Harris <dugalh@gmail.com>
Date: Wed, 3 Jul 2024 20:02:11 +0200
Subject: [PATCH 03/16] change float* nodata value to -inf

---
 geedim/download.py     | 5 ++++-
 tests/test_download.py | 9 +++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/geedim/download.py b/geedim/download.py
index c145e1b..4d68a59 100644
--- a/geedim/download.py
+++ b/geedim/download.py
@@ -52,7 +52,7 @@
 
 
 class BaseImage:
-    _float_nodata = float('nan')
+    _float_nodata = float('-inf')
     _desc_width = 50
     _default_resampling = ResamplingMethod.near
     _ee_max_tile_size = 32
@@ -984,6 +984,9 @@ def download(
 
             def download_tile(tile):
                 """Download a tile and write into the destination GeoTIFF."""
+                # Note: due to an Earth Engine bug (https://issuetracker.google.com/issues/350528377), tile_array is
+                # float64 for uint32 and int64 exp_image types.  The tile_array nodata value is as it should be
+                # though, so conversion to the exp_image / GeoTIFF type happens ok below.
                 tile_array = tile.download(session=session, bar=bar)
                 with out_lock:
                     out_ds.write(tile_array, window=tile.window)
diff --git a/tests/test_download.py b/tests/test_download.py
index 6516212..9aa3c25 100644
--- a/tests/test_download.py
+++ b/tests/test_download.py
@@ -466,18 +466,15 @@ def test_prepare_for_download(base_image: str, request: pytest.FixtureRequest):
         ('int16', -(2**15)),
         ('uint32', 0),
         ('int32', -(2**31)),
-        ('float32', float('nan')),
-        ('float64', float('nan')),
+        ('float32', float('-inf')),
+        ('float64', float('-inf')),
     ],
 )
 def test_prepare_nodata(user_fix_base_image: BaseImage, region_25ha: Dict, dtype: str, exp_nodata: float):
     """Test BaseImage._prepare_for_download() sets rasterio profile nodata correctly for different dtypes."""
     exp_image, exp_profile = user_fix_base_image._prepare_for_download(region=region_25ha, scale=30, dtype=dtype)
     assert exp_image.dtype == dtype
-    if np.isnan(exp_profile['nodata']):
-        assert np.isnan(exp_nodata)
-    else:
-        assert exp_profile['nodata'] == exp_nodata
+    assert exp_profile['nodata'] == exp_nodata
 
 
 @pytest.mark.parametrize(

From 59089326cca6bb3bfa7ac5938636dc47fee40e9f Mon Sep 17 00:00:00 2001
From: Dugal Harris <dugalh@gmail.com>
Date: Wed, 3 Jul 2024 21:03:13 +0200
Subject: [PATCH 04/16] leave float* nodata on EE default (-inf)

---
 geedim/tile.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/geedim/tile.py b/geedim/tile.py
index d683ad1..39e98d8 100644
--- a/geedim/tile.py
+++ b/geedim/tile.py
@@ -127,11 +127,6 @@ def _download_to_array(self, url: str, session: requests.Session = None, bar: tq
         with utils.suppress_rio_logs(), env, MemoryFile(gtiff_bytes) as mem_file:
             with mem_file.open() as ds:
                 array = ds.read()
-                if (array.dtype == np.dtype('float32')) or (array.dtype == np.dtype('float64')):
-                    # GEE sets nodata to -inf for float data types, (but does not populate the nodata field).
-                    # rasterio won't allow nodata=-inf, so this is a workaround to change nodata to nan at
-                    # source.
-                    array[np.isinf(array)] = np.nan
         return array
 
     def download(

From 7d86a34925738233ffdeb005e0179786cd5a4e56 Mon Sep 17 00:00:00 2001
From: Dugal Harris <dugalh@gmail.com>
Date: Wed, 3 Jul 2024 21:07:52 +0200
Subject: [PATCH 05/16] specify _set_region_stats scale for problem images

---
 tests/test_mask.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_mask.py b/tests/test_mask.py
index a11f77b..a9bfef2 100644
--- a/tests/test_mask.py
+++ b/tests/test_mask.py
@@ -288,7 +288,7 @@ def test_s2_no_cloud_prob(s2_sr_hm_image_id: str, region_100ha: dict):
     ee_image = ee.Image(s2_sr_hm_image_id)
     ee_image = ee_image.set('system:index', 'unknown')  # prevent linking to cloud probability
     masked_image = Sentinel2SrClImage(ee_image, mask_method='cloud-prob')
-    masked_image._set_region_stats(region_100ha)
+    masked_image._set_region_stats(region_100ha, scale=60)
 
     assert masked_image.properties is not None
     assert masked_image.properties['CLOUDLESS_PORTION'] == pytest.approx(0, abs=1)
@@ -301,7 +301,7 @@ def test_s2_no_qa(s2_sr_hm_image_id: str, region_100ha: dict):
     masked_image = MaskedImage.from_id(
         'COPERNICUS/S2_SR_HARMONIZED/20240426T080609_20240426T083054_T34HEJ', mask_method='qa'
     )
-    masked_image._set_region_stats(region_100ha)
+    masked_image._set_region_stats(region_100ha, scale=60)
 
     assert masked_image.properties is not None
     assert masked_image.properties['CLOUDLESS_PORTION'] == pytest.approx(0, abs=1)

From b5bfd5bc13e309e0d2f2dddc040a934cec574e0a Mon Sep 17 00:00:00 2001
From: Dugal Harris <dugalh@gmail.com>
Date: Thu, 4 Jul 2024 17:11:30 +0200
Subject: [PATCH 06/16] add test for EE GeoTIFF nodata value

---
 tests/integration.py | 104 ++++++++++++++++++++++++++++++-------------
 1 file changed, 73 insertions(+), 31 deletions(-)

diff --git a/tests/integration.py b/tests/integration.py
index aae678d..883b27b 100644
--- a/tests/integration.py
+++ b/tests/integration.py
@@ -13,31 +13,35 @@
     See the License for the specific language governing permissions and
     limitations under the License.
 """
-import ee
-from pathlib import Path
-from httplib2 import Http
+
 import json
+from pathlib import Path
+
+import ee
 import numpy as np
+import pytest
 import rasterio as rio
+from click.testing import CliRunner
+from httplib2 import Http
+from rasterio.coords import BoundingBox
 from rasterio.crs import CRS
 from rasterio.features import bounds
 from rasterio.warp import transform_geom
-from rasterio.coords import BoundingBox
-from click.testing import CliRunner
-import pytest
+
 import geedim as gd
-from geedim import utils, cli
+from geedim import cli, utils
+from geedim.download import BaseImage
 
 
 @pytest.fixture(scope='session', autouse=True)
 def ee_init():
-    """ Override the ee_init fixture, so that we only initialise as geemap does, below. """
+    """Override the ee_init fixture, so that we only initialise as geemap does, below."""
     return
 
 
 def test_geemap_integration(tmp_path: Path):
-    """ Simulate the geemap download example. """
-    gd.Initialize(opt_url=None, http_transport=Http())    # a replica of geemap Initialize
+    """Simulate the geemap download example."""
+    gd.Initialize(opt_url=None, http_transport=Http())  # a replica of geemap Initialize
     ee_image = ee.ImageCollection("LANDSAT/LC08/C02/T1_TOA").first()
     gd_image = gd.download.BaseImage(ee_image)
     out_file = tmp_path.joinpath('landsat.tif')
@@ -47,27 +51,29 @@ def test_geemap_integration(tmp_path: Path):
 
 
 def test_geeml_integration(tmp_path: Path):
-    """ Test the geeml `user memory limit exceeded` example. """
+    """Test the geeml `user memory limit exceeded` example."""
     gd.Initialize()
     region = {
         'geodesic': False,
         'crs': {'type': 'name', 'properties': {'name': 'EPSG:4326'}},
         'type': 'Polygon',
-        'coordinates': [[
-            [6.030749828407996, 53.66867883985145],
-            [6.114742307473171, 53.66867883985145],
-            [6.114742307473171, 53.76381042843971],
-            [6.030749828407996, 53.76381042843971],
-            [6.030749828407996, 53.66867883985145]
-        ]]
+        'coordinates': [
+            [
+                [6.030749828407996, 53.66867883985145],
+                [6.114742307473171, 53.66867883985145],
+                [6.114742307473171, 53.76381042843971],
+                [6.030749828407996, 53.76381042843971],
+                [6.030749828407996, 53.66867883985145],
+            ]
+        ],
     }  # yapf: disable
 
     ee_image = (
-        ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED').
-        filterDate('2019-01-01', '2020-01-01').
-        filterBounds(region).
-        select(['B4', 'B3', 'B2', 'B8']).
-        reduce(ee.Reducer.percentile([35]))
+        ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
+        .filterDate('2019-01-01', '2020-01-01')
+        .filterBounds(region)
+        .select(['B4', 'B3', 'B2', 'B8'])
+        .reduce(ee.Reducer.percentile([35]))
     )  # yapf: disable
 
     gd_image = gd.download.BaseImage(ee_image)
@@ -82,7 +88,13 @@ def test_geeml_integration(tmp_path: Path):
 
     # test we can download the image with a max_tile_size of 16 MB
     gd_image.download(
-        out_file, crs='EPSG:4326', region=region, scale=10, dtype='float64', overwrite=True, max_tile_size=16,
+        out_file,
+        crs='EPSG:4326',
+        region=region,
+        scale=10,
+        dtype='float64',
+        overwrite=True,
+        max_tile_size=16,
     )
     assert out_file.exists()
     with rio.open(out_file, 'r') as ds:
@@ -94,14 +106,14 @@ def test_geeml_integration(tmp_path: Path):
         # sometimes the top/bottom bounds of the dataset are swapped, so extract and compare UL and BR corners
         print(f'region_bounds: {region_bounds}')
         print(f'ds.bounds: {ds.bounds}')
-        ds_ul = np.array([min(ds.bounds.left,ds.bounds.right), min(ds.bounds.top,ds.bounds.bottom)])
-        ds_lr = np.array([max(ds.bounds.left,ds.bounds.right), max(ds.bounds.top,ds.bounds.bottom)])
+        ds_ul = np.array([min(ds.bounds.left, ds.bounds.right), min(ds.bounds.top, ds.bounds.bottom)])
+        ds_lr = np.array([max(ds.bounds.left, ds.bounds.right), max(ds.bounds.top, ds.bounds.bottom)])
         assert region_cnrs.min(axis=0) == pytest.approx(ds_ul, abs=1e-3)
         assert region_cnrs.max(axis=0) == pytest.approx(ds_lr, abs=1e-3)
 
 
 def test_cli_asset_export(l8_image_id, region_25ha_file: Path, runner: CliRunner, tmp_path: Path):
-    """  Export a test image to an asset using the CLI. """
+    """Export a test image to an asset using the CLI."""
     # create a randomly named folder to allow parallel tests without overwriting the same asset
     gd.Initialize()
     folder = f'geedim/int_test_asset_export_{np.random.randint(1 << 31)}'
@@ -118,7 +130,7 @@ def test_cli_asset_export(l8_image_id, region_25ha_file: Path, runner: CliRunner
             f'--dtype uint16 --mask --resampling bilinear --wait --type asset'
         )
         result = runner.invoke(cli.cli, cli_str.split())
-        assert (result.exit_code == 0)
+        assert result.exit_code == 0
         assert ee.data.getAsset(test_asset_id) is not None
 
         # download the asset image
@@ -139,13 +151,43 @@ def test_cli_asset_export(l8_image_id, region_25ha_file: Path, runner: CliRunner
     with open(region_25ha_file) as f:
         region = json.load(f)
     with rio.open(download_filename, 'r') as im:
-        im : rio.DatasetReader
+        im: rio.DatasetReader
         exp_region = transform_geom('EPSG:4326', im.crs, region)
         exp_bounds = BoundingBox(*bounds(exp_region))
         assert im.crs == CRS.from_string(crs)
         assert im.transform[0] == scale
         assert im.count > 1
         assert (
-            (im.bounds[0] <= exp_bounds[0]) and (im.bounds[1] <= exp_bounds[1]) and
-            (im.bounds[2] >= exp_bounds[2]) and (im.bounds[3] >= exp_bounds[3])
+            (im.bounds[0] <= exp_bounds[0])
+            and (im.bounds[1] <= exp_bounds[1])
+            and (im.bounds[2] >= exp_bounds[2])
+            and (im.bounds[3] >= exp_bounds[3])
         )
+
+
+@pytest.mark.parametrize('dtype', ['float32', 'float64', 'uint8', 'int8', 'uint16', 'int16', 'uint32', 'int32'])
+def test_ee_geotiff_nodata(dtype: str, l9_image_id: str):
+    """Test the nodata value of the Earth Engine GeoTIFF returned by ``ee.data.computePixels()`` or
+    ``ee.Image.getDownloadUrl()`` equals the geedim expected value (see
+    https://issuetracker.google.com/issues/350528377 for context).
+    """
+    # use geedim to prepare an image for downloading as dtype
+    gd.Initialize()
+    masked_image = gd.MaskedImage.from_id(l9_image_id)
+    shape = (10, 10)
+    exp_image, profile = masked_image._prepare_for_download(shape=shape, dtype=dtype)
+
+    # download a small tile with ee.data.computePixels
+    request = {
+        'expression': exp_image.ee_image,
+        'bandIds': ['SR_B3'],
+        'grid': {'dimensions': {'width': shape[1], 'height': shape[0]}},
+        'fileFormat': 'GEO_TIFF',
+    }
+    im_bytes = ee.data.computePixels(request)
+
+    # test nodata with rasterio
+    with rio.MemoryFile(im_bytes) as mf, mf.open() as ds:
+        assert ds.nodata == profile['nodata']
+        # test the EE dtype is not lower precision compared to expected dtype
+        assert np.promote_types(profile['dtype'], ds.dtypes[0]) == ds.dtypes[0]

From dbc8fa409b771ceb0b471be1b709c6887aeca18a Mon Sep 17 00:00:00 2001
From: Dugal Harris <dugalh@gmail.com>
Date: Thu, 4 Jul 2024 21:44:26 +0200
Subject: [PATCH 07/16] simplify get_projection

---
 geedim/utils.py | 24 ++++++------------------
 1 file changed, 6 insertions(+), 18 deletions(-)

diff --git a/geedim/utils.py b/geedim/utils.py
index c5f00c9..275526f 100644
--- a/geedim/utils.py
+++ b/geedim/utils.py
@@ -23,7 +23,7 @@
 import time
 from contextlib import contextmanager
 from threading import Thread
-from typing import Tuple, Optional
+from typing import Optional, Tuple
 
 import ee
 import numpy as np
@@ -164,7 +164,7 @@ def get_bounds(filename: pathlib.Path, expand: float = 5):
     return src_bbox_wgs84
 
 
-def get_projection(image, min_scale=True):
+def get_projection(image: ee.Image, min_scale: bool = True) -> ee.Projection:
     """
     Get the min/max scale projection of image bands.  Server side - no calls to getInfo().
     Adapted from from https://github.com/gee-community/gee_tools, MIT license.
@@ -186,23 +186,11 @@ def get_projection(image, min_scale=True):
         raise TypeError('image is not an instance of ee.Image')
 
     bands = image.bandNames()
+    scales = bands.map(lambda band: image.select(ee.String(band)).projection().nominalScale())
+    projs = bands.map(lambda band: image.select(ee.String(band)).projection())
+    projs = projs.sort(scales)
 
-    compare = ee.Number.lte if min_scale else ee.Number.gte
-    init_proj = image.select(0).projection()
-
-    def compare_scale(name, prev_proj):
-        """Server side comparison of band scales"""
-        prev_proj = ee.Projection(prev_proj)
-        prev_scale = prev_proj.nominalScale()
-
-        curr_proj = image.select([name]).projection()
-        curr_scale = ee.Number(curr_proj.nominalScale())
-
-        condition = compare(curr_scale, prev_scale)
-        comp_proj = ee.Algorithms.If(condition, curr_proj, prev_proj)
-        return ee.Projection(comp_proj)
-
-    return ee.Projection(bands.iterate(compare_scale, init_proj))
+    return ee.Projection(projs.get(0) if min_scale else projs.get(-1))
 
 
 class Spinner(Thread):

From c67afe804620a2ee9beb0365f06f3ec00cf0e897 Mon Sep 17 00:00:00 2001
From: Dugal Harris <dugalh@gmail.com>
Date: Thu, 4 Jul 2024 21:55:17 +0200
Subject: [PATCH 08/16] add fixtures for S2 images missing QA/cloud data

---
 tests/conftest.py | 160 +++++++++++++++++++++++++++++-----------------
 1 file changed, 101 insertions(+), 59 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 6d23ed4..4055aaf 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -13,13 +13,16 @@
     See the License for the specific language governing permissions and
     limitations under the License.
 """
+
 import pathlib
 from typing import Dict, List
 
 import ee
 import pytest
 from click.testing import CliRunner
+
 from geedim import Initialize, MaskedImage
+from geedim.mask import Sentinel2SrClImage
 from geedim.utils import root_path
 
 
@@ -31,31 +34,34 @@ def ee_init():
 
 @pytest.fixture(scope='session')
 def region_25ha() -> Dict:
-    """ A geojson polygon defining a 500x500m region. """
+    """A geojson polygon defining a 500x500m region."""
     return {
         "type": "Polygon",
-        "coordinates":
-            [[[21.6389, -33.4520], [21.6389, -33.4474], [21.6442, -33.4474], [21.6442, -33.4520], [21.6389, -33.4520]]]
+        "coordinates": [
+            [[21.6389, -33.4520], [21.6389, -33.4474], [21.6442, -33.4474], [21.6442, -33.4520], [21.6389, -33.4520]]
+        ],
     }
 
 
 @pytest.fixture(scope='session')
 def region_100ha() -> Dict:
-    """ A geojson polygon defining a 1x1km region. """
+    """A geojson polygon defining a 1x1km region."""
     return {
         "type": "Polygon",
-        "coordinates":
-            [[[21.6374, -33.4547], [21.6374, -33.4455], [21.6480, -33.4455], [21.6480, -33.4547], [21.6374, -33.4547]]]
+        "coordinates": [
+            [[21.6374, -33.4547], [21.6374, -33.4455], [21.6480, -33.4455], [21.6480, -33.4547], [21.6374, -33.4547]]
+        ],
     }
 
 
 @pytest.fixture(scope='session')
 def region_10000ha() -> Dict:
-    """ A geojson polygon defining a 10x10km region. """
+    """A geojson polygon defining a 10x10km region."""
     return {
         "type": "Polygon",
-        "coordinates":
-            [[[21.5893, -33.4964], [21.5893, -33.4038], [21.6960, -33.4038], [21.6960, -33.4964], [21.5893, -33.4964]]]
+        "coordinates": [
+            [[21.5893, -33.4964], [21.5893, -33.4038], [21.6960, -33.4038], [21.6960, -33.4964], [21.5893, -33.4964]]
+        ],
     }
 
 
@@ -66,75 +72,83 @@ def const_image_25ha_file() -> pathlib.Path:
 
 @pytest.fixture(scope='session')
 def l4_image_id() -> str:
-    """ Landsat-4 EE ID for image that covers `region_*ha`, with partial cloud cover only for `region10000ha`.  """
+    """Landsat-4 EE ID for image that covers `region_*ha`, with partial cloud cover only for `region10000ha`."""
     return 'LANDSAT/LT04/C02/T1_L2/LT04_173083_19880310'
 
 
 @pytest.fixture(scope='session')
 def l5_image_id() -> str:
-    """ Landsat-5 EE ID for image that covers `region_*ha` with partial cloud cover.  """
+    """Landsat-5 EE ID for image that covers `region_*ha` with partial cloud cover."""
     return 'LANDSAT/LT05/C02/T1_L2/LT05_173083_20051112'  # 'LANDSAT/LT05/C02/T1_L2/LT05_173083_20070307'
 
 
 @pytest.fixture(scope='session')
 def l7_image_id() -> str:
-    """ Landsat-7 EE ID for image that covers `region_*ha` with partial cloud cover.  """
+    """Landsat-7 EE ID for image that covers `region_*ha` with partial cloud cover."""
     return 'LANDSAT/LE07/C02/T1_L2/LE07_173083_20220119'  # 'LANDSAT/LE07/C02/T1_L2/LE07_173083_20200521'
 
 
 @pytest.fixture(scope='session')
 def l8_image_id() -> str:
-    """ Landsat-8 EE ID for image that covers `region_*ha` with partial cloud cover.  """
+    """Landsat-8 EE ID for image that covers `region_*ha` with partial cloud cover."""
     return 'LANDSAT/LC08/C02/T1_L2/LC08_173083_20180217'  # 'LANDSAT/LC08/C02/T1_L2/LC08_173083_20171113'
 
 
 @pytest.fixture(scope='session')
 def l9_image_id() -> str:
-    """ Landsat-9 EE ID for image that covers `region_*ha` with partial cloud cover. """
+    """Landsat-9 EE ID for image that covers `region_*ha` with partial cloud cover."""
     return 'LANDSAT/LC09/C02/T1_L2/LC09_173083_20220308'  # 'LANDSAT/LC09/C02/T1_L2/LC09_173083_20220103'
 
 
 @pytest.fixture(scope='session')
 def landsat_image_ids(l4_image_id, l5_image_id, l7_image_id, l8_image_id, l9_image_id) -> List[str]:
-    """ Landsat4-9 EE IDs for images that covers `region_*ha` with partial cloud cover. """
+    """Landsat4-9 EE IDs for images that covers `region_*ha` with partial cloud cover."""
     return [l4_image_id, l5_image_id, l7_image_id, l8_image_id, l9_image_id]
 
 
 @pytest.fixture(scope='session')
 def s2_sr_image_id() -> str:
-    """ Sentinel-2 SR EE ID for image that covers `region_*ha` with partial cloud cover. """
+    """Sentinel-2 SR EE ID for image that covers `region_*ha` with partial cloud cover."""
     # 'COPERNICUS/S2/20220107T081229_20220107T083059_T34HEJ'
     return 'COPERNICUS/S2_SR/20211004T080801_20211004T083709_T34HEJ'
 
 
 @pytest.fixture(scope='session')
 def s2_toa_image_id() -> str:
-    """ Sentinel-2 TOA EE ID for image that covers `region_*ha` with partial cloud cover. """
+    """Sentinel-2 TOA EE ID for image that covers `region_*ha` with partial cloud cover."""
     return 'COPERNICUS/S2/20220107T081229_20220107T083059_T34HEJ'
 
 
 @pytest.fixture(scope='session')
 def s2_sr_hm_image_id() -> str:
-    """ Harmonised Sentinel-2 SR EE ID for image that covers `region_*ha` with partial cloud cover. """
+    """Harmonised Sentinel-2 SR EE ID for image that covers `region_*ha` with partial cloud cover."""
     # 'COPERNICUS/S2/20220107T081229_20220107T083059_T34HEJ'
     return 'COPERNICUS/S2_SR_HARMONIZED/20211004T080801_20211004T083709_T34HEJ'
 
 
+@pytest.fixture(scope='session')
+def s2_sr_hm_noqa_image_id() -> str:
+    """Harmonised Sentinel-2 SR EE ID for image with no QA* data, that covers `region_*ha` with partial cloud
+    cover.
+    """
+    return 'COPERNICUS/S2_SR_HARMONIZED/20240426T080609_20240426T083054_T34HEJ'
+
+
 @pytest.fixture(scope='session')
 def s2_toa_hm_image_id() -> str:
-    """ Harmonised Sentinel-2 TOA EE ID for image that covers `region_*ha` with partial cloud cover. """
+    """Harmonised Sentinel-2 TOA EE ID for image that covers `region_*ha` with partial cloud cover."""
     return 'COPERNICUS/S2_HARMONIZED/20220107T081229_20220107T083059_T34HEJ'
 
 
 @pytest.fixture(scope='session')
 def s2_image_ids(s2_sr_image_id, s2_toa_image_id, s2_sr_hm_image_id, s2_toa_hm_image_id) -> List[str]:
-    """ Sentinel-2 TOA/SR EE IDs for images that covers `region_*ha` with partial cloud cover. """
+    """Sentinel-2 TOA/SR EE IDs for images that covers `region_*ha` with partial cloud cover."""
     return [s2_sr_image_id, s2_toa_image_id, s2_sr_hm_image_id, s2_toa_hm_image_id]
 
 
 @pytest.fixture(scope='session')
 def modis_nbar_image_id() -> str:
-    """ Global MODIS NBAR image ID.  WGS84 @ 500m. """
+    """Global MODIS NBAR image ID.  WGS84 @ 500m."""
     return 'MODIS/006/MCD43A4/2022_01_01'
 
 
@@ -149,25 +163,25 @@ def gch_image_id() -> str:
 
 @pytest.fixture(scope='session')
 def s1_sar_image_id() -> str:
-    """ Sentinel-1 SAR GRD EE image ID.  10m. """
+    """Sentinel-1 SAR GRD EE image ID.  10m."""
     return 'COPERNICUS/S1_GRD/S1A_IW_GRDH_1SDV_20220112T171750_20220112T171815_041430_04ED28_0A04'
 
 
 @pytest.fixture(scope='session')
 def gedi_agb_image_id() -> str:
-    """ GEDI aboveground biomass density EE image ID.  1km."""
+    """GEDI aboveground biomass density EE image ID.  1km."""
     return 'LARSE/GEDI/GEDI04_B_002'
 
 
 @pytest.fixture(scope='session')
 def gedi_cth_image_id() -> str:
-    """ GEDI canopy top height EE image ID.  25m."""
+    """GEDI canopy top height EE image ID.  25m."""
     return 'LARSE/GEDI/GEDI02_A_002_MONTHLY/202009_018E_036S'
 
 
 @pytest.fixture(scope='session')
 def landsat_ndvi_image_id() -> str:
-    """ Landsat 8-day NDVI composite EE image iD.  Composite in WGS84 with underlying 30m scale."""
+    """Landsat 8-day NDVI composite EE image iD.  Composite in WGS84 with underlying 30m scale."""
     return 'LANDSAT/LC08/C01/T1_8DAY_NDVI/20211219'
 
 
@@ -175,157 +189,186 @@ def landsat_ndvi_image_id() -> str:
 def generic_image_ids(
     modis_nbar_image_id, gch_image_id, s1_sar_image_id, gedi_agb_image_id, gedi_cth_image_id, landsat_ndvi_image_id
 ) -> List[str]:
-    """ A list of various EE image IDs for non-cloud/shadow masked images. """
+    """A list of various EE image IDs for non-cloud/shadow masked images."""
     return [
-        modis_nbar_image_id, gch_image_id, s1_sar_image_id, gedi_agb_image_id, gedi_cth_image_id, landsat_ndvi_image_id
+        modis_nbar_image_id,
+        gch_image_id,
+        s1_sar_image_id,
+        gedi_agb_image_id,
+        gedi_cth_image_id,
+        landsat_ndvi_image_id,
     ]
 
 
 @pytest.fixture(scope='session')
 def l4_masked_image(l4_image_id) -> MaskedImage:
-    """ Landsat-4 MaskedImage that covers `region_*ha`, with partial cloud cover only for `region10000ha`. """
+    """Landsat-4 MaskedImage that covers `region_*ha`, with partial cloud cover only for `region10000ha`."""
     return MaskedImage.from_id(l4_image_id)
 
 
 @pytest.fixture(scope='session')
 def l5_masked_image(l5_image_id) -> MaskedImage:
-    """ Landsat-5 MaskedImage that covers `region_*ha` with partial cloud cover. """
+    """Landsat-5 MaskedImage that covers `region_*ha` with partial cloud cover."""
     return MaskedImage.from_id(l5_image_id)
 
 
 @pytest.fixture(scope='session')
 def l7_masked_image(l7_image_id) -> MaskedImage:
-    """ Landsat-7 MaskedImage that covers `region_*ha` with partial cloud cover. """
+    """Landsat-7 MaskedImage that covers `region_*ha` with partial cloud cover."""
     return MaskedImage.from_id(l7_image_id)
 
 
 @pytest.fixture(scope='session')
 def l8_masked_image(l8_image_id) -> MaskedImage:
-    """ Landsat-8 MaskedImage that cover `region_*ha` with partial cloud cover. """
+    """Landsat-8 MaskedImage that cover `region_*ha` with partial cloud cover."""
     return MaskedImage.from_id(l8_image_id)
 
 
 @pytest.fixture(scope='session')
 def l9_masked_image(l9_image_id) -> MaskedImage:
-    """ Landsat-9 MaskedImage that covers `region_*ha` with partial cloud cover. """
+    """Landsat-9 MaskedImage that covers `region_*ha` with partial cloud cover."""
     return MaskedImage.from_id(l9_image_id)
 
 
 @pytest.fixture(scope='session')
 def landsat_masked_images(
-    l4_masked_image, l5_masked_image, l7_masked_image, l8_masked_image,
-    l9_masked_image
-) -> List[MaskedImage]: # yapf: disable
-    """ Landsat4-9 MaskedImage's that cover `region_*ha` with partial cloud cover. """
+    l4_masked_image, l5_masked_image, l7_masked_image, l8_masked_image, l9_masked_image
+) -> List[MaskedImage]:  # yapf: disable
+    """Landsat4-9 MaskedImage's that cover `region_*ha` with partial cloud cover."""
     return [l4_masked_image, l5_masked_image, l7_masked_image, l8_masked_image, l9_masked_image]
 
 
 @pytest.fixture(scope='session')
 def s2_sr_masked_image(s2_sr_image_id) -> MaskedImage:
-    """ Sentinel-2 SR MaskedImage that covers `region_*ha` with partial cloud cover. """
+    """Sentinel-2 SR MaskedImage that covers `region_*ha` with partial cloud cover."""
     return MaskedImage.from_id(s2_sr_image_id)
 
 
 @pytest.fixture(scope='session')
 def s2_toa_masked_image(s2_toa_image_id) -> MaskedImage:
-    """ Sentinel-2 TOA MaskedImage that covers `region_*ha` with partial cloud cover. """
+    """Sentinel-2 TOA MaskedImage that covers `region_*ha` with partial cloud cover."""
     return MaskedImage.from_id(s2_toa_image_id)
 
 
 @pytest.fixture(scope='session')
 def s2_sr_hm_masked_image(s2_sr_hm_image_id) -> MaskedImage:
-    """ Harmonised Sentinel-2 SR MaskedImage that covers `region_*ha` with partial cloud cover. """
+    """Harmonised Sentinel-2 SR MaskedImage that covers `region_*ha` with partial cloud cover."""
     return MaskedImage.from_id(s2_sr_hm_image_id)
 
 
+@pytest.fixture(scope='session')
+def s2_sr_hm_nocp_masked_image(s2_sr_hm_image_id) -> MaskedImage:
+    """Harmonised Sentinel-2 SR MaskedImage with no corresponding cloud probability, that covers `region_*ha` with
+    partial cloud cover.
+    """
+    ee_image = ee.Image(s2_sr_hm_image_id)
+    ee_image = ee_image.set('system:index', 'unknown')  # prevent linking to cloud probability
+    return Sentinel2SrClImage(ee_image, mask_method='cloud-prob')
+
+
+@pytest.fixture(scope='session')
+def s2_sr_hm_noqa_masked_image(s2_sr_hm_noqa_image_id) -> MaskedImage:
+    """Harmonised Sentinel-2 SR MaskedImage with empty QA* bands, that covers `region_*ha` with partial cloud cover."""
+    return MaskedImage.from_id(s2_sr_hm_noqa_image_id, mask_method='qa')
+
+
 @pytest.fixture(scope='session')
 def s2_toa_hm_masked_image(s2_toa_hm_image_id) -> MaskedImage:
-    """ Harmonised Sentinel-2 TOA MaskedImage that covers `region_*ha` with partial cloud cover. """
+    """Harmonised Sentinel-2 TOA MaskedImage that covers `region_*ha` with partial cloud cover."""
     return MaskedImage.from_id(s2_toa_hm_image_id)
 
 
 @pytest.fixture(scope='session')
 def s2_masked_images(
     s2_sr_masked_image, s2_toa_masked_image, s2_sr_hm_masked_image, s2_toa_hm_masked_image
-) -> List[MaskedImage]: # yapf: disable
-    """ Sentinel-2 TOA and SR MaskedImage's that cover `region_*ha` with partial cloud cover. """
+) -> List[MaskedImage]:  # yapf: disable
+    """Sentinel-2 TOA and SR MaskedImage's that cover `region_*ha` with partial cloud cover."""
     return [s2_sr_masked_image, s2_toa_masked_image, s2_sr_hm_masked_image, s2_toa_hm_masked_image]
 
 
 @pytest.fixture(scope='session')
 def user_masked_image() -> MaskedImage:
-    """ A MaskedImage instance where the encapsulated image has no fixed projection or ID.  """
+    """A MaskedImage instance where the encapsulated image has no fixed projection or ID."""
     return MaskedImage(ee.Image([1, 2, 3]))
 
 
 @pytest.fixture(scope='session')
 def modis_nbar_masked_image(modis_nbar_image_id) -> MaskedImage:
-    """ MODIS NBAR MaskedImage with global coverage.  """
+    """MODIS NBAR MaskedImage with global coverage."""
     return MaskedImage.from_id(modis_nbar_image_id)
 
 
 @pytest.fixture(scope='session')
 def gch_masked_image(gch_image_id) -> MaskedImage:
-    """ Global Canopy Height (10m) MaskedImage. """
+    """Global Canopy Height (10m) MaskedImage."""
     return MaskedImage.from_id(gch_image_id)
 
 
 @pytest.fixture(scope='session')
 def s1_sar_masked_image(s1_sar_image_id) -> MaskedImage:
-    """ Sentinel-1 SAR GRD MaskedImage.  10m. """
+    """Sentinel-1 SAR GRD MaskedImage.  10m."""
     return MaskedImage.from_id(s1_sar_image_id)
 
 
 @pytest.fixture(scope='session')
 def gedi_agb_masked_image(gedi_agb_image_id) -> MaskedImage:
-    """ GEDI aboveground biomass density MaskedImage.  1km."""
+    """GEDI aboveground biomass density MaskedImage.  1km."""
     return MaskedImage.from_id(gedi_agb_image_id)
 
 
 @pytest.fixture(scope='session')
 def gedi_cth_masked_image(gedi_cth_image_id) -> MaskedImage:
-    """ GEDI canopy top height MaskedImage.  25m."""
+    """GEDI canopy top height MaskedImage.  25m."""
     return MaskedImage.from_id(gedi_cth_image_id)
 
 
 @pytest.fixture(scope='session')
 def landsat_ndvi_masked_image(landsat_ndvi_image_id) -> MaskedImage:
-    """ Landsat 8-day NDVI composite MaskedImage.  Composite in WGS84 with underlying 30m scale."""
+    """Landsat 8-day NDVI composite MaskedImage.  Composite in WGS84 with underlying 30m scale."""
     return MaskedImage.from_id(landsat_ndvi_image_id)
 
 
 @pytest.fixture(scope='session')
 def generic_masked_images(
-    modis_nbar_masked_image, gch_masked_image, s1_sar_masked_image, gedi_agb_masked_image, gedi_cth_masked_image,
-    landsat_ndvi_masked_image
+    modis_nbar_masked_image,
+    gch_masked_image,
+    s1_sar_masked_image,
+    gedi_agb_masked_image,
+    gedi_cth_masked_image,
+    landsat_ndvi_masked_image,
 ) -> List[MaskedImage]:
-    """ A list of various non-cloud/shadow MaskedImage's. """
+    """A list of various non-cloud/shadow MaskedImage's."""
     return [
-        modis_nbar_masked_image, gch_masked_image, s1_sar_masked_image, gedi_agb_masked_image, gedi_cth_masked_image,
-        landsat_ndvi_masked_image
+        modis_nbar_masked_image,
+        gch_masked_image,
+        s1_sar_masked_image,
+        gedi_agb_masked_image,
+        gedi_cth_masked_image,
+        landsat_ndvi_masked_image,
     ]
 
+
 @pytest.fixture
 def runner():
-    """ click runner for command line execution. """
+    """click runner for command line execution."""
     return CliRunner()
 
 
 @pytest.fixture
 def region_25ha_file() -> pathlib.Path:
-    """ Path to region_25ha geojson file. """
+    """Path to region_25ha geojson file."""
     return root_path.joinpath('tests/data/region_25ha.geojson')
 
 
 @pytest.fixture
 def region_100ha_file() -> pathlib.Path:
-    """ Path to region_100ha geojson file. """
+    """Path to region_100ha geojson file."""
     return root_path.joinpath('tests/data/region_100ha.geojson')
 
 
 @pytest.fixture
 def region_10000ha_file() -> pathlib.Path:
-    """ Path to region_10000ha geojson file. """
+    """Path to region_10000ha geojson file."""
     return root_path.joinpath('tests/data/region_10000ha.geojson')
 
 
@@ -343,4 +386,3 @@ def get_image_std(ee_image: ee.Image, region: Dict, std_scale: float):
         reducer='mean', geometry=region, crs=proj.crs(), scale=std_scale, bestEffort=True, maxPixels=1e6
     )
     return mean_std_image.get('TEST').getInfo()
-

From ecc6618b44d0a938b832303c8bf7793e6c8b2a1b Mon Sep 17 00:00:00 2001
From: Dugal Harris <dugalh@gmail.com>
Date: Thu, 4 Jul 2024 22:01:25 +0200
Subject: [PATCH 09/16] Add and improve S2 cloud masking tests

Adjust existing tests to test both S2 cloud mask methods.
Test new _ee_proj property and add tests for masking S2 images with missing data.
---
 tests/test_mask.py | 143 +++++++++++++++++++++++++++------------------
 1 file changed, 85 insertions(+), 58 deletions(-)

diff --git a/tests/test_mask.py b/tests/test_mask.py
index a9bfef2..072eab7 100644
--- a/tests/test_mask.py
+++ b/tests/test_mask.py
@@ -21,7 +21,8 @@
 import pytest
 import rasterio as rio
 
-from geedim.mask import MaskedImage, get_projection, class_from_id, Sentinel2SrClImage
+from geedim.enums import CloudMaskMethod
+from geedim.mask import class_from_id, MaskedImage, Sentinel2SrClImage
 
 
 def test_class_from_id(landsat_image_ids, s2_sr_image_id, s2_toa_hm_image_id, generic_image_ids):
@@ -115,6 +116,27 @@ def test_set_region_stats(masked_image: str, region_100ha, request: pytest.Fixtu
         assert masked_image.properties[stat_name] >= 0 and masked_image.properties[stat_name] <= 100
 
 
+@pytest.mark.parametrize(
+    'masked_image, exp_scale',
+    [
+        ('s2_sr_hm_masked_image', 60),
+        ('s2_sr_hm_noqa_masked_image', 60),
+        ('s2_sr_hm_nocp_masked_image', 60),
+        ('l9_masked_image', 30),
+        ('l4_masked_image', 30),
+        ('s1_sar_masked_image', 10),
+        ('gedi_agb_masked_image', 1000),
+    ],
+)
+def test_ee_proj(masked_image: str, exp_scale: float, request: pytest.FixtureRequest):
+    """Test MaskedImage._ee_proj has the expected scale and is not WGS84."""
+    masked_image: MaskedImage = request.getfixturevalue(masked_image)
+    proj = masked_image._ee_proj.getInfo()
+
+    assert np.abs(proj['transform'][0]) == pytest.approx(exp_scale, rel=1e-3)
+    assert proj.get('crs', 'wkt') != 'EPSG:4326'
+
+
 @pytest.mark.parametrize('image_id', ['l9_image_id', 'l8_image_id', 'l7_image_id', 'l5_image_id', 'l4_image_id'])
 def test_landsat_cloudless_portion(image_id: str, request: pytest.FixtureRequest):
     """Test `geedim` CLOUDLESS_PORTION for the whole image against related Landsat CLOUD_COVER property."""
@@ -195,16 +217,16 @@ def test_s2_cloudmask_prob(image_id: str, region_10000ha: Dict, request: pytest.
 def test_s2_cloudmask_method(image_id: str, region_10000ha: Dict, request: pytest.FixtureRequest):
     """Test S2 cloud/shadow masking `mask_method` parameter."""
     image_id: str = request.getfixturevalue(image_id)
-    masked_image = MaskedImage.from_id(image_id, mask_method='cloud-prob')
-    masked_image._set_region_stats(region_10000ha)
-    cloud_prob_portion = 100 * masked_image.properties['CLOUDLESS_PORTION'] / masked_image.properties['FILL_PORTION']
-    masked_image = MaskedImage.from_id(image_id, mask_method='qa')
-    masked_image._set_region_stats(region_10000ha)
-    qa_portion = 100 * masked_image.properties['CLOUDLESS_PORTION'] / masked_image.properties['FILL_PORTION']
+    cl_portions = []
+    for mask_method in CloudMaskMethod:
+        masked_image = MaskedImage.from_id(image_id, mask_method=mask_method)
+        masked_image._set_region_stats(region_10000ha)
+        cl_portions.append(100 * masked_image.properties['CLOUDLESS_PORTION'])
+
     # test `mask_method` changes CLOUDLESS_PORTION but not by too much
-    assert cloud_prob_portion != qa_portion
-    assert cloud_prob_portion == pytest.approx(qa_portion, abs=10)
-    assert (cloud_prob_portion != pytest.approx(0, abs=1)) and (qa_portion != pytest.approx(0, abs=1))
+    assert len(set(cl_portions)) == len(cl_portions)
+    assert all([cl_portions[0] != pytest.approx(cp, abs=10) for cp in cl_portions[1:]])
+    assert all([cp != pytest.approx(0, abs=1) for cp in cl_portions])
 
 
 @pytest.mark.parametrize('image_id', ['s2_sr_image_id', 's2_toa_image_id'])
@@ -283,24 +305,10 @@ def get_max_cloud_dist(cloud_dist: ee.Image):
     assert meas_max_cloud_dist == pytest.approx(max_cloud_dist, rel=0.1)
 
 
-def test_s2_no_cloud_prob(s2_sr_hm_image_id: str, region_100ha: dict):
-    """Test S2 cloud probability masking without corresponding 'COPERNICUS/S2_CLOUD_PROBABILITY' image gives 100% cloud."""
-    ee_image = ee.Image(s2_sr_hm_image_id)
-    ee_image = ee_image.set('system:index', 'unknown')  # prevent linking to cloud probability
-    masked_image = Sentinel2SrClImage(ee_image, mask_method='cloud-prob')
-    masked_image._set_region_stats(region_100ha, scale=60)
-
-    assert masked_image.properties is not None
-    assert masked_image.properties['CLOUDLESS_PORTION'] == pytest.approx(0, abs=1)
-    assert masked_image.properties['FILL_PORTION'] == pytest.approx(100, abs=1)
-
-
-def test_s2_no_qa(s2_sr_hm_image_id: str, region_100ha: dict):
-    """Test S2 QA masking with empty QA60 band (i.e. all S2 images post Feb 2022) gives 100% cloud."""
-    # 2024 image that fills region_100ha and has some cloud
-    masked_image = MaskedImage.from_id(
-        'COPERNICUS/S2_SR_HARMONIZED/20240426T080609_20240426T083054_T34HEJ', mask_method='qa'
-    )
+@pytest.mark.parametrize('masked_image', ['s2_sr_hm_noqa_masked_image', 's2_sr_hm_nocp_masked_image'])
+def test_s2_region_stats_missing_data(masked_image: str, region_100ha: dict, request: pytest.FixtureRequest):
+    """Test region stats are 0% cloudless for S2 images masked with missing QA* or cloud probability data."""
+    masked_image: MaskedImage = request.getfixturevalue(masked_image)
     masked_image._set_region_stats(region_100ha, scale=60)
 
     assert masked_image.properties is not None
@@ -308,29 +316,45 @@ def test_s2_no_qa(s2_sr_hm_image_id: str, region_100ha: dict):
     assert masked_image.properties['FILL_PORTION'] == pytest.approx(100, abs=1)
 
 
-def get_mask_stats(masked_image: MaskedImage, region: Dict):
-    """Get cloud/shadow etc aux band statistics for a given MaskedImage and region."""
+def _test_mask_stats(masked_image: MaskedImage, region: Dict):
+    """Sanity tests on cloud/shadow etc. aux bands for a given MaskedImage and region."""
+    # create a pan image for testing brightnesses
     ee_image = masked_image.ee_image
     pan = ee_image.select([0, 1, 2]).reduce(ee.Reducer.mean())
-    cdist = ee_image.select('CLOUD_DIST')
+
+    # mask the pan image with cloud, shadow & cloudless masks
     cloud_mask = ee_image.select('CLOUD_MASK')
     shadow_mask = ee_image.select('SHADOW_MASK')
     cloudless_mask = ee_image.select('CLOUDLESS_MASK')
     pan_cloud = pan.updateMask(cloud_mask).rename('PAN_CLOUD')
     pan_shadow = pan.updateMask(shadow_mask).rename('PAN_SHADOW')
     pan_cloudless = pan.updateMask(cloudless_mask).rename('PAN_CLOUDLESS')
+
+    # mask the cloud distance image with cloud & cloudless masks
+    cdist = ee_image.select('CLOUD_DIST')
     cdist_cloud = cdist.updateMask(cloud_mask).rename('CDIST_CLOUD')
     cdist_cloudless = cdist.updateMask(cloudless_mask).rename('CDIST_CLOUDLESS')
+
+    # find mean stats of all masked images, and min of cloud distance where it is >0
     stats_image = ee.Image([pan_cloud, pan_shadow, pan_cloudless, cdist_cloud, cdist_cloudless])
-    proj = get_projection(ee_image, min_scale=False)
-    means = stats_image.reduceRegion(
+    proj = masked_image._ee_proj
+    stats = stats_image.reduceRegion(
         reducer='mean', geometry=region, crs=proj.crs(), scale=proj.nominalScale(), bestEffort=True, maxPixels=1e8
     )
     cdist_min = cdist.updateMask(cdist).reduceRegion(
         reducer='min', geometry=region, crs=proj.crs(), scale=proj.nominalScale(), bestEffort=True, maxPixels=1e8
     )
-    means = means.set('CDIST_MIN', cdist_min.get('CLOUD_DIST'))
-    return means.getInfo()
+    stats = stats.set('CDIST_MIN', cdist_min.get('CLOUD_DIST'))
+    stats = stats.getInfo()
+
+    # test cloud is brighter than cloudless
+    assert stats['PAN_CLOUD'] > stats['PAN_CLOUDLESS']
+    # test cloudless is brighter than shadow
+    assert stats['PAN_CLOUDLESS'] > stats['PAN_SHADOW']
+    # test cloudless areas have greater distance to cloud than cloudy areas
+    assert stats['CDIST_CLOUDLESS'] > stats['CDIST_CLOUD']
+    # test min distance to cloud is pixel size
+    assert stats['CDIST_MIN'] * 10 == masked_image._ee_proj.nominalScale().getInfo()
 
 
 @pytest.mark.parametrize(
@@ -339,31 +363,34 @@ def get_mask_stats(masked_image: MaskedImage, region: Dict):
 def test_landsat_aux_bands(masked_image: str, region_10000ha: Dict, request: pytest.FixtureRequest):
     """Test Landsat auxiliary band values for sanity."""
     masked_image: MaskedImage = request.getfixturevalue(masked_image)
-    stats = get_mask_stats(masked_image, region_10000ha)
-    # cloud is brighter than cloudless
-    assert stats['PAN_CLOUD'] > stats['PAN_CLOUDLESS']
-    # cloudless is brighter than shadow
-    assert stats['PAN_CLOUDLESS'] > stats['PAN_SHADOW']
-    # cloudless areas have greater distance to cloud than cloudy areas
-    assert stats['CDIST_CLOUDLESS'] > stats['CDIST_CLOUD']
+    _test_mask_stats(masked_image, region_10000ha)
 
 
 @pytest.mark.parametrize(
-    'masked_image', ['s2_sr_masked_image', 's2_toa_masked_image', 's2_sr_hm_masked_image', 's2_toa_hm_masked_image']
-)  # yapf: disable
-def test_s2_aux_bands(masked_image: str, region_10000ha: Dict, request: pytest.FixtureRequest):
+    'image_id',
+    ['s2_sr_image_id', 's2_toa_image_id', 's2_sr_hm_image_id', 's2_toa_hm_image_id'],
+)
+def test_s2_aux_bands(image_id: str, region_10000ha: Dict, request: pytest.FixtureRequest):
     """Test Sentinel-2 auxiliary band values for sanity."""
-    masked_image: MaskedImage = request.getfixturevalue(masked_image)
-    stats = get_mask_stats(masked_image, region_10000ha)
-    # cloud is brighter than cloudless
-    assert stats['PAN_CLOUD'] > stats['PAN_CLOUDLESS']
-    # cloudless is brighter than shadow
-    assert stats['PAN_CLOUDLESS'] > stats['PAN_SHADOW']
-    # cloudless areas have greater distance to cloud than cloudy areas
-    assert stats['CDIST_CLOUDLESS'] > stats['CDIST_CLOUD']
-    proj_scale = get_projection(masked_image.ee_image, min_scale=False).nominalScale().getInfo()
-    # min distance to cloud is pixel size
-    assert stats['CDIST_MIN'] * 10 == proj_scale
+    image_id: str = request.getfixturevalue(image_id)
+    for mask_method in CloudMaskMethod:
+        masked_image = MaskedImage.from_id(image_id, mask_method=mask_method)
+        _test_mask_stats(masked_image, region_10000ha)
+
+
+@pytest.mark.parametrize(
+    'image_id, unlink, mask_method',
+    [('s2_sr_hm_image_id', True, 'qa'), ('s2_sr_hm_noqa_image_id', False, 'cloud-prob')],
+)
+def test_s2_aux_bands_missing_data(
+    image_id: str, unlink: bool, mask_method: str, region_10000ha: Dict, request: pytest.FixtureRequest
+):
+    """Test Sentinel-2 auxiliary band values for sanity on images that are missing QA* or cloud probability data."""
+    image_id: str = request.getfixturevalue(image_id)
+    ee_image = ee.Image(image_id)
+    ee_image = ee_image.set('system:index', 'unknown') if unlink else ee_image
+    masked_image = Sentinel2SrClImage(ee_image, mask_method=mask_method)
+    _test_mask_stats(masked_image, region_10000ha)
 
 
 @pytest.mark.parametrize('masked_image', ['s2_sr_masked_image', 'l9_masked_image'])
@@ -372,7 +399,7 @@ def test_mask_clouds(masked_image: str, region_100ha: Dict, tmp_path, request: p
     masked_image: MaskedImage = request.getfixturevalue(masked_image)
     filename = tmp_path.joinpath(f'test_image.tif')
     masked_image.mask_clouds()
-    proj_scale = get_projection(masked_image.ee_image, min_scale=False).nominalScale()
+    proj_scale = masked_image._ee_proj.nominalScale()
     masked_image.download(filename, region=region_100ha, dtype='int32', scale=proj_scale)
     assert filename.exists()
     with rio.open(filename, 'r') as ds:
@@ -380,7 +407,7 @@ def test_mask_clouds(masked_image: str, region_100ha: Dict, tmp_path, request: p
         cloudless_mask = ds.read(ds.descriptions.index('CLOUDLESS_MASK') + 1, masked=True)
         assert np.all(cloudless_mask == 1)  # all cloud/shadow areas should be masked (0)
         cloudless_mask = cloudless_mask.filled(0).astype('bool')  # fill nodata with 0 and cast to bool
-        # test that cloudless_mask is the same as the nodata/dataset mask for each bands
+        # test that cloudless_mask is the same as the nodata/dataset mask for each band
         ds_masks = ds.read_masks().astype('bool')
         assert np.all(cloudless_mask == ds_masks)
 

From 89cd96992d6bdfb7ad24b742468442640aad7d70 Mon Sep 17 00:00:00 2001
From: Dugal Harris <dugalh@gmail.com>
Date: Thu, 4 Jul 2024 23:07:49 +0200
Subject: [PATCH 10/16] Refactor stats and aux band projections.

Add _ee_proj property with overrides to simplify scale / reprojection, and
avoid issues with non-fixed projection bands.
---
 geedim/mask.py | 115 +++++++++++++++++++++++++++++--------------------
 1 file changed, 69 insertions(+), 46 deletions(-)

diff --git a/geedim/mask.py b/geedim/mask.py
index d596ef5..f6af6f6 100644
--- a/geedim/mask.py
+++ b/geedim/mask.py
@@ -22,7 +22,7 @@
 import geedim.schema
 from geedim.download import BaseImage
 from geedim.enums import CloudMaskMethod
-from geedim.utils import split_id, get_projection
+from geedim.utils import get_projection, split_id
 
 logger = logging.getLogger(__name__)
 
@@ -76,12 +76,22 @@ def __init__(self, ee_image: ee.Image, mask: bool = _default_mask, region: dict
         # TODO: consider adding proj_scale parameter here, rather than in _set_region_stats, then it can be re-used in
         #  S2 cloud masking and distance
         BaseImage.__init__(self, ee_image)
+        self._ee_projection = None
         self._add_aux_bands(**kwargs)  # add any mask and cloud distance bands
         if region:
             self._set_region_stats(region)
         if mask:
             self.mask_clouds()
 
+    @property
+    def _ee_proj(self) -> ee.Projection:
+        """Projection to use for mask calculations and statistics."""
+        if self._ee_projection is None:
+            # use the minimum scale projection for the base class / generic case (excludes non-fixed projections with
+            # 1 deg. scales)
+            self._ee_projection = get_projection(self._ee_image, min_scale=True)
+        return self._ee_projection
+
     @staticmethod
     def from_id(image_id: str, **kwargs) -> 'MaskedImage':
         """
@@ -145,21 +155,16 @@ def _set_region_stats(self, region: Dict = None, scale: float = None):
         if not region:
             region = self.ee_image.geometry()  # use the image footprint
 
-        proj = get_projection(self.ee_image, min_scale=False)  # get projection of minimum scale band
-        # If _proj_scale is set, use that as the scale, otherwise use the proj.nomimalScale().  For non-composite images
-        # these should be the same value.  For composite images, there is no `fixed` projection, hence the
-        # need for _proj_scale.
-        scale = scale or proj.nominalScale()
+        # composite images have no fixed projection (scale = 1deg) and need the scale kwarg
+        scale = scale or self._ee_proj.nominalScale()
 
         # Find the fill portion as the (sum over the region of FILL_MASK) divided by (sum over the region of a constant
-        # image (==1)).  We take this approach rather than using a mean reducer, as this does not find the mean over
-        # the region, but the mean over the part of the region covered by the image.
-        stats_image = ee.Image(
-            [self.ee_image.select('FILL_MASK').unmask(), ee.Image(1).rename('REGION_SUM')]
-        )  # yapf: disable
+        # image (==1)).  Note that a mean reducer, does not find the mean over the region, but the mean over the part
+        # of the region covered by the image.
+        stats_image = ee.Image([self.ee_image.select('FILL_MASK').unmask(), ee.Image(1).rename('REGION_SUM')])
         # Note: sometimes proj has no EPSG in crs(), hence use crs=proj and not crs=proj.crs() below
         sums = stats_image.reduceRegion(
-            reducer="sum", geometry=region, crs=proj, scale=scale, bestEffort=True, maxPixels=1e6
+            reducer="sum", geometry=region, crs=self._ee_proj, scale=scale, bestEffort=True, maxPixels=1e6
         )
 
         fill_portion = ee.Number(sums.get('FILL_MASK')).divide(ee.Number(sums.get('REGION_SUM'))).multiply(100)
@@ -181,23 +186,23 @@ def _cloud_dist(self, cloudless_mask: ee.Image = None, max_cloud_dist: float = 5
         """Find the cloud/shadow distance in units of 10m."""
         if not cloudless_mask:
             cloudless_mask = self.ee_image.select('CLOUDLESS_MASK')
-        proj = get_projection(self.ee_image, min_scale=False)  # use maximum scale projection to save processing time
 
-        # Note that initial *MASK bands before any call to mask_clouds(), are themselves masked, so this cloud/shadow
-        # mask excludes (i.e. masks) already masked pixels.  This avoids finding distance to e.g. scanline errors in
-        # Landsat-7.
         cloud_shadow_mask = cloudless_mask.Not()
-        cloud_pix = ee.Number(max_cloud_dist).divide(proj.nominalScale()).round()  # cloud_dist in pixels
+        cloud_pix = ee.Number(max_cloud_dist).divide(self._ee_proj.nominalScale()).round()  # cloud_dist in pixels
 
-        # Find distance to nearest cloud/shadow (units of 10m).
+        # Find distance to nearest cloud/shadow (units of 10m).  cloudless_mask is itself masked for validity, so
+        # finding distances to invalid areas is avoided.
         cloud_dist = (
             cloud_shadow_mask.fastDistanceTransform(neighborhood=cloud_pix, units='pixels', metric='squared_euclidean')
             .sqrt()
-            .multiply(proj.nominalScale().divide(10))
+            .multiply(self._ee_proj.nominalScale().divide(10))
         )
 
         # Reproject to force calculation at correct scale.
-        cloud_dist = cloud_dist.reproject(crs=proj, scale=proj.nominalScale()).rename('CLOUD_DIST')
+        cloud_dist = cloud_dist.reproject(
+            crs=self._ee_proj,
+            scale=self._ee_proj.nominalScale(),
+        ).rename('CLOUD_DIST')
 
         # Clip cloud_dist to max_cloud_dist.
         cloud_dist = cloud_dist.where(cloud_dist.gt(ee.Image(max_cloud_dist / 10)), max_cloud_dist / 10)
@@ -220,21 +225,18 @@ def _set_region_stats(self, region: Dict = None, scale: float = None):
         if not region:
             region = self.ee_image.geometry()  # use the image footprint
 
-        proj = get_projection(self.ee_image, min_scale=False)  # get projection of maximum scale band
-        # If _proj_scale is set, use that as the scale, otherwise use the proj.nomimalScale().  For non-composite images
-        # these should be the same value.  For composite images, there is no `fixed` projection, hence the
-        # need for _proj_scale.
-        scale = scale or proj.nominalScale()
+        # composite images have no fixed projection (scale = 1deg) and need the scale kwarg
+        scale = scale or self._ee_proj.nominalScale()
 
-        # Find the fill portion as the (sum over the region of FILL_MASK) divided by (sum over the region of a constant
-        # image (==1)).  We take this approach rather than using a mean reducer, as this does not find the mean over
-        # the region, but the mean over the part of the region covered by the image.  Then we find cloudless portion
-        # as portion of fill that is cloudless.
+        # Find the fill portion as the (sum over the region of FILL_MASK) divided by (sum over the region of a
+        # constant image (==1)).  Then find cloudless portion as portion of fill that is cloudless.  Note that a mean
+        # reducer, does not find the mean over the region, but the mean over the part of the region covered by the
+        # image.
         stats_image = ee.Image(
             [self.ee_image.select(['FILL_MASK', 'CLOUDLESS_MASK']).unmask(), ee.Image(1).rename('REGION_SUM')]
-        )  # yapf: disable
+        )
         sums = stats_image.reduceRegion(
-            reducer="sum", geometry=region, crs=proj, scale=scale, bestEffort=True, maxPixels=1e6
+            reducer="sum", geometry=region, crs=self._ee_proj, scale=scale, bestEffort=True, maxPixels=1e6
         ).rename(['FILL_MASK', 'CLOUDLESS_MASK'], ['FILL_PORTION', 'CLOUDLESS_PORTION'])
         fill_portion = ee.Number(sums.get('FILL_PORTION')).divide(ee.Number(sums.get('REGION_SUM'))).multiply(100)
         cloudless_portion = (
@@ -271,6 +273,13 @@ class LandsatImage(CloudMaskedImage):
     * LANDSAT/LC09/C02/T1_L2
     """
 
+    @property
+    def _ee_proj(self) -> ee.Projection:
+        if self._ee_projection is None:
+            # use the default projection (all landsat bands have same scale)
+            self._ee_projection = self._ee_image.projection()
+        return self._ee_projection
+
     def _aux_image(self, mask_shadows: bool = True, mask_cirrus: bool = True, max_cloud_dist: int = 5000) -> ee.Image:
         """
         Retrieve the auxiliary image containing cloud/shadow masks and cloud distance.
@@ -317,6 +326,14 @@ def _aux_image(self, mask_shadows: bool = True, mask_cirrus: bool = True, max_cl
 class Sentinel2ClImage(CloudMaskedImage):
     """Base class for cloud/shadow masking of Sentinel-2 TOA and SR images."""
 
+    @property
+    def _ee_proj(self) -> ee.Projection:
+        if self._ee_projection is None:
+            # use the B1 projection with maximum scale (60m) to reduce processing times (some S2 images have empty QA
+            # bands with no fixed projection, so utils.get_projection(min_scale=False) should not be used here).
+            self._ee_projection = self._ee_image.select(0).projection()
+        return self._ee_projection
+
     def _aux_image(
         self,
         s2_toa: bool = False,
@@ -376,14 +393,14 @@ def get_cloud_prob(ee_im):
             """Get the cloud probability image from COPERNICUS/S2_CLOUD_PROBABILITY that corresponds to `ee_im`, if it
             exists.  Otherwise, get a 100% probability.
             """
-            # create a default 100% cloud probability image to revert to if the cloud probability image does not exist
-            # (use a 10m projection to prevent any issue with _set_region_stats() etc auto-scale)
-            default_cloud_prob = ee.Image(100).setDefaultProjection(ee_im.select('QA10').projection())
+            # default 100% cloud probability image
+            default_cloud_prob = ee.Image(100)
 
             # get the cloud probability image if it exists, otherwise revert to default_cloud_prob
             filt = ee.Filter.eq('system:index', ee_im.get('system:index'))
             cloud_prob = ee.ImageCollection('COPERNICUS/S2_CLOUD_PROBABILITY').filter(filt).first()
             cloud_prob = ee.Image(ee.List([cloud_prob, default_cloud_prob]).reduce(ee.Reducer.firstNonNull()))
+
             return cloud_prob.rename('CLOUD_PROB')
 
         def get_cloud_mask(ee_im, cloud_prob=None):
@@ -393,13 +410,14 @@ def get_cloud_mask(ee_im, cloud_prob=None):
                     cloud_prob = get_cloud_prob(ee_im)
                 cloud_mask = cloud_prob.gte(prob)
             else:
+                # find cloud mask, setting to true for images post Feb 2022 when QA60 was no longer populated
+                qa_invalid = ee.Number(ee_im.date().difference(ee.Date('2022-02-01'), 'days').gt(0))
                 qa = ee_im.select('QA60')
-                cloud_mask = qa.bitwiseAnd(1 << 10).neq(0)
-                # set cloud_mask to true for images post Feb 2022 when QA60 was no longer populated
-                qa_invalid = ee_im.date().difference(ee.Date('2022-02-01'), 'days').gt(0)
-                cloud_mask = cloud_mask.bitwiseOr(qa_invalid)
+                cloud_mask = qa.bitwiseAnd(1 << 10).neq(0).bitwiseOr(qa_invalid)
+
                 if mask_cirrus:
                     cloud_mask = cloud_mask.Or(qa.bitwiseAnd(1 << 11).neq(0))
+
             return cloud_mask.rename('CLOUD_MASK')
 
         def get_cdi_cloud_mask(ee_im):
@@ -423,27 +441,30 @@ def get_shadow_mask(ee_im, cloud_mask):
                 # exclude water
                 dark_mask = ee_im.select('SCL').neq(6).And(dark_mask)
 
-            proj = get_projection(ee_im, min_scale=False)
             # Note:
             # S2 MEAN_SOLAR_AZIMUTH_ANGLE (SAA) appears to be measured clockwise with 0 at N (i.e. shadow goes in the
             # opposite direction), directionalDistanceTransform() angle appears to be measured clockwise with 0 at W.
             # So we need to add/subtract 180 to SAA to get shadow angle in S2 convention, then add 90 to get
-            # directionalDistanceTransform() convention i.e. we need to add -180 + 90 = -90 to the SAA.  This is not
+            # directionalDistanceTransform() convention i.e. we need to add -180 + 90 = -90 to SAA.  This is not
             # the same as in the EE tutorial which is 90-SAA
             # (https://developers.google.com/earth-engine/tutorials/community/sentinel-2-s2cloudless).
             shadow_azimuth = ee.Number(-90).add(ee.Number(ee_im.get('MEAN_SOLAR_AZIMUTH_ANGLE')))
-            proj_pixels = ee.Number(shadow_dist).divide(proj.nominalScale()).round()
+            proj_pixels = ee.Number(shadow_dist).divide(self._ee_proj.nominalScale()).round()
+
             # Project the cloud mask in the direction of the shadows it will cast.
             cloud_cast_proj = cloud_mask.directionalDistanceTransform(shadow_azimuth, proj_pixels).select('distance')
-            # The reproject is necessary to force calculation at the correct scale - the coarse proj scale is used to
-            # improve processing times.
-            cloud_cast_mask = cloud_cast_proj.mask().reproject(crs=proj.crs(), scale=proj.nominalScale())
+
+            # reproject to force calculation at the correct scale
+            cloud_cast_mask = cloud_cast_proj.mask().reproject(
+                crs=self._ee_proj.crs(), scale=self._ee_proj.nominalScale()
+            )
             return cloud_cast_mask.And(dark_mask).rename('SHADOW_MASK')
 
-        # gather and combine the various masks
+        # combine the various masks
         ee_image = self.ee_image
         cloud_prob = get_cloud_prob(ee_image) if mask_method == CloudMaskMethod.cloud_prob else None
         cloud_mask = get_cloud_mask(ee_image, cloud_prob=cloud_prob)
+
         if cdi_thresh is not None:
             cloud_mask = cloud_mask.And(get_cdi_cloud_mask(ee_image))
         if mask_shadows:
@@ -454,9 +475,11 @@ def get_shadow_mask(ee_im, cloud_mask):
 
         # do a morphological opening type operation that removes small (20m) blobs from the mask and then dilates
         cloud_shadow_mask = cloud_shadow_mask.focal_min(20, units='meters').focal_max(buffer, units='meters')
+
         # derive a fill mask from the Earth Engine mask for the surface reflectance bands
         fill_mask = ee_image.select('B.*').mask().reduce(ee.Reducer.allNonZero())
-        # Clip this mask to the image footprint.  (Without this step we get memory limit errors on download.)
+
+        # clip this mask to the image footprint (without this step we get memory limit errors on download)
         fill_mask = fill_mask.clip(ee_image.geometry()).rename('FILL_MASK')
 
         # combine all masks into cloudless_mask

From 210e86645e1cc46b000422dd32eb274df03b5a75 Mon Sep 17 00:00:00 2001
From: Dugal Harris <dugalh@gmail.com>
Date: Thu, 4 Jul 2024 23:25:44 +0200
Subject: [PATCH 11/16] update nodata test for EE's -inf

---
 tests/integration.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration.py b/tests/integration.py
index 883b27b..ffdff3d 100644
--- a/tests/integration.py
+++ b/tests/integration.py
@@ -100,7 +100,7 @@ def test_geeml_integration(tmp_path: Path):
     with rio.open(out_file, 'r') as ds:
         assert ds.count == 4
         assert ds.dtypes[0] == 'float64'
-        assert np.isnan(ds.nodata)
+        assert np.isinf(ds.nodata)
         region_cnrs = np.array(region['coordinates'][0])
         region_bounds = rio.coords.BoundingBox(*region_cnrs.min(axis=0), *region_cnrs.max(axis=0))
         # sometimes the top/bottom bounds of the dataset are swapped, so extract and compare UL and BR corners

From a19ce8792a83d7378ae5de4907d6a519ca063d3d Mon Sep 17 00:00:00 2001
From: Dugal Harris <dugalh@gmail.com>
Date: Thu, 11 Jul 2024 17:44:52 +0200
Subject: [PATCH 12/16] refine tests and fixtures for missing cloud data

---
 tests/conftest.py  |  27 +++++++--
 tests/test_mask.py | 135 +++++++++++++++++++++++++++++++--------------
 2 files changed, 116 insertions(+), 46 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 4055aaf..887034e 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -127,13 +127,21 @@ def s2_sr_hm_image_id() -> str:
 
 
 @pytest.fixture(scope='session')
-def s2_sr_hm_noqa_image_id() -> str:
-    """Harmonised Sentinel-2 SR EE ID for image with no QA* data, that covers `region_*ha` with partial cloud
+def s2_sr_hm_qa_mask_image_id() -> str:
+    """Harmonised Sentinel-2 SR EE ID for image with masked QA* data, that covers `region_*ha` with partial cloud
     cover.
     """
     return 'COPERNICUS/S2_SR_HARMONIZED/20240426T080609_20240426T083054_T34HEJ'
 
 
+@pytest.fixture(scope='session')
+def s2_sr_hm_qa_zero_image_id() -> str:
+    """Harmonised Sentinel-2 SR EE ID for image with zero QA* data, that covers `region_*ha` with partial cloud
+    cover.
+    """
+    return 'COPERNICUS/S2_SR_HARMONIZED/20230407T080611_20230407T083613_T34HEJ'
+
+
 @pytest.fixture(scope='session')
 def s2_toa_hm_image_id() -> str:
     """Harmonised Sentinel-2 TOA EE ID for image that covers `region_*ha` with partial cloud cover."""
@@ -261,15 +269,22 @@ def s2_sr_hm_nocp_masked_image(s2_sr_hm_image_id) -> MaskedImage:
     """Harmonised Sentinel-2 SR MaskedImage with no corresponding cloud probability, that covers `region_*ha` with
     partial cloud cover.
     """
+    # create an image with unknown id to prevent linking to cloud probability
     ee_image = ee.Image(s2_sr_hm_image_id)
-    ee_image = ee_image.set('system:index', 'unknown')  # prevent linking to cloud probability
+    ee_image = ee_image.set('system:index', 'COPERNICUS/S2_HARMONIZED/unknown')
     return Sentinel2SrClImage(ee_image, mask_method='cloud-prob')
 
 
 @pytest.fixture(scope='session')
-def s2_sr_hm_noqa_masked_image(s2_sr_hm_noqa_image_id) -> MaskedImage:
-    """Harmonised Sentinel-2 SR MaskedImage with empty QA* bands, that covers `region_*ha` with partial cloud cover."""
-    return MaskedImage.from_id(s2_sr_hm_noqa_image_id, mask_method='qa')
+def s2_sr_hm_qa_mask_masked_image(s2_sr_hm_qa_mask_image_id: str) -> MaskedImage:
+    """Harmonised Sentinel-2 SR MaskedImage with masked QA* bands, that covers `region_*ha` with partial cloud cover."""
+    return MaskedImage.from_id(s2_sr_hm_qa_mask_image_id, mask_method='qa')
+
+
+@pytest.fixture(scope='session')
+def s2_sr_hm_qa_zero_masked_image(s2_sr_hm_qa_zero_image_id: str) -> MaskedImage:
+    """Harmonised Sentinel-2 SR MaskedImage with zero QA* bands, that covers `region_*ha` with partial cloud cover."""
+    return MaskedImage.from_id(s2_sr_hm_qa_zero_image_id, mask_method='qa')
 
 
 @pytest.fixture(scope='session')
diff --git a/tests/test_mask.py b/tests/test_mask.py
index 072eab7..6f530ef 100644
--- a/tests/test_mask.py
+++ b/tests/test_mask.py
@@ -14,7 +14,7 @@
     limitations under the License.
 """
 
-from typing import Dict
+from typing import Dict, Iterable
 
 import ee
 import numpy as np
@@ -22,7 +22,7 @@
 import rasterio as rio
 
 from geedim.enums import CloudMaskMethod
-from geedim.mask import class_from_id, MaskedImage, Sentinel2SrClImage
+from geedim.mask import class_from_id, CloudMaskedImage, MaskedImage, Sentinel2SrClImage
 
 
 def test_class_from_id(landsat_image_ids, s2_sr_image_id, s2_toa_hm_image_id, generic_image_ids):
@@ -110,7 +110,7 @@ def test_set_region_stats(masked_image: str, region_100ha, request: pytest.Fixtu
     Test MaskedImage._set_region_stats() generates the expected properties and that these are in the valid range.
     """
     masked_image: MaskedImage = request.getfixturevalue(masked_image)
-    masked_image._set_region_stats(region_100ha)
+    masked_image._set_region_stats(region_100ha, scale=masked_image._ee_proj.nominalScale())
     for stat_name in ['FILL_PORTION', 'CLOUDLESS_PORTION']:
         assert stat_name in masked_image.properties
         assert masked_image.properties[stat_name] >= 0 and masked_image.properties[stat_name] <= 100
@@ -120,16 +120,18 @@ def test_set_region_stats(masked_image: str, region_100ha, request: pytest.Fixtu
     'masked_image, exp_scale',
     [
         ('s2_sr_hm_masked_image', 60),
-        ('s2_sr_hm_noqa_masked_image', 60),
-        ('s2_sr_hm_nocp_masked_image', 60),
         ('l9_masked_image', 30),
         ('l4_masked_image', 30),
         ('s1_sar_masked_image', 10),
         ('gedi_agb_masked_image', 1000),
+        # include fixtures with bands that have no fixed projection
+        ('s2_sr_hm_qa_mask_masked_image', 60),
+        ('s2_sr_hm_qa_zero_masked_image', 60),
+        ('s2_sr_hm_nocp_masked_image', 60),
     ],
 )
 def test_ee_proj(masked_image: str, exp_scale: float, request: pytest.FixtureRequest):
-    """Test MaskedImage._ee_proj has the expected scale and is not WGS84."""
+    """Test MaskedImage._ee_proj has the correct scale and CRS."""
     masked_image: MaskedImage = request.getfixturevalue(masked_image)
     proj = masked_image._ee_proj.getInfo()
 
@@ -305,18 +307,20 @@ def get_max_cloud_dist(cloud_dist: ee.Image):
     assert meas_max_cloud_dist == pytest.approx(max_cloud_dist, rel=0.1)
 
 
-@pytest.mark.parametrize('masked_image', ['s2_sr_hm_noqa_masked_image', 's2_sr_hm_nocp_masked_image'])
-def test_s2_region_stats_missing_data(masked_image: str, region_100ha: dict, request: pytest.FixtureRequest):
-    """Test region stats are 0% cloudless for S2 images masked with missing QA* or cloud probability data."""
+@pytest.mark.parametrize(
+    'masked_image', ['s2_sr_hm_qa_mask_masked_image', 's2_sr_hm_qa_zero_masked_image', 's2_sr_hm_nocp_masked_image']
+)
+def test_s2_region_stats_missing_data(masked_image: str, region_10000ha: dict, request: pytest.FixtureRequest):
+    """Test S2 region stats for unmasked images missing required cloud data."""
     masked_image: MaskedImage = request.getfixturevalue(masked_image)
-    masked_image._set_region_stats(region_100ha, scale=60)
+    masked_image._set_region_stats(region_10000ha, scale=60)
 
     assert masked_image.properties is not None
     assert masked_image.properties['CLOUDLESS_PORTION'] == pytest.approx(0, abs=1)
     assert masked_image.properties['FILL_PORTION'] == pytest.approx(100, abs=1)
 
 
-def _test_mask_stats(masked_image: MaskedImage, region: Dict):
+def _test_aux_stats(masked_image: MaskedImage, region: Dict):
     """Sanity tests on cloud/shadow etc. aux bands for a given MaskedImage and region."""
     # create a pan image for testing brightnesses
     ee_image = masked_image.ee_image
@@ -339,10 +343,10 @@ def _test_mask_stats(masked_image: MaskedImage, region: Dict):
     stats_image = ee.Image([pan_cloud, pan_shadow, pan_cloudless, cdist_cloud, cdist_cloudless])
     proj = masked_image._ee_proj
     stats = stats_image.reduceRegion(
-        reducer='mean', geometry=region, crs=proj.crs(), scale=proj.nominalScale(), bestEffort=True, maxPixels=1e8
+        reducer='mean', geometry=region, crs=proj, scale=proj.nominalScale(), bestEffort=True, maxPixels=1e8
     )
     cdist_min = cdist.updateMask(cdist).reduceRegion(
-        reducer='min', geometry=region, crs=proj.crs(), scale=proj.nominalScale(), bestEffort=True, maxPixels=1e8
+        reducer='min', geometry=region, crs=proj, scale=proj.nominalScale(), bestEffort=True, maxPixels=1e8
     )
     stats = stats.set('CDIST_MIN', cdist_min.get('CLOUD_DIST'))
     stats = stats.getInfo()
@@ -363,53 +367,104 @@ def _test_mask_stats(masked_image: MaskedImage, region: Dict):
 def test_landsat_aux_bands(masked_image: str, region_10000ha: Dict, request: pytest.FixtureRequest):
     """Test Landsat auxiliary band values for sanity."""
     masked_image: MaskedImage = request.getfixturevalue(masked_image)
-    _test_mask_stats(masked_image, region_10000ha)
+    _test_aux_stats(masked_image, region_10000ha)
 
 
 @pytest.mark.parametrize(
-    'image_id',
-    ['s2_sr_image_id', 's2_toa_image_id', 's2_sr_hm_image_id', 's2_toa_hm_image_id'],
+    'image_id, mask_methods',
+    [
+        ('s2_sr_image_id', CloudMaskMethod),
+        ('s2_toa_image_id', CloudMaskMethod),
+        ('s2_sr_hm_image_id', CloudMaskMethod),
+        ('s2_toa_hm_image_id', CloudMaskMethod),
+        # images missing QA60 so do cloud-prob method only
+        ('s2_sr_hm_qa_mask_image_id', ['cloud-prob']),
+        ('s2_sr_hm_qa_zero_image_id', ['cloud-prob']),
+    ],
 )
-def test_s2_aux_bands(image_id: str, region_10000ha: Dict, request: pytest.FixtureRequest):
-    """Test Sentinel-2 auxiliary band values for sanity."""
+def test_s2_aux_bands(image_id: str, mask_methods: Iterable, region_10000ha: Dict, request: pytest.FixtureRequest):
+    """Test Sentinel-2 auxiliary band values for sanity with all masking methods."""
     image_id: str = request.getfixturevalue(image_id)
-    for mask_method in CloudMaskMethod:
+    for mask_method in mask_methods:
         masked_image = MaskedImage.from_id(image_id, mask_method=mask_method)
-        _test_mask_stats(masked_image, region_10000ha)
+        _test_aux_stats(masked_image, region_10000ha)
+
+
+def test_s2_aux_bands_unlink(s2_sr_hm_image_id: str, region_10000ha: Dict):
+    """Test Sentinel-2 auxiliary band values for sanity on an image without linked cloud data."""
+    # TODO: include the cloud score+ method in this test when that method is added
+    # create an image with unknown id to prevent linking to cloud data
+    ee_image = ee.Image(s2_sr_hm_image_id)
+    ee_image = ee_image.set('system:index', 'COPERNICUS/S2_HARMONIZED/unknown')
+
+    for mask_method in ['qa']:
+        masked_image = Sentinel2SrClImage(ee_image, mask_method=mask_method)
+        _test_aux_stats(masked_image, region_10000ha)
 
 
 @pytest.mark.parametrize(
-    'image_id, unlink, mask_method',
-    [('s2_sr_hm_image_id', True, 'qa'), ('s2_sr_hm_noqa_image_id', False, 'cloud-prob')],
+    'masked_image', ['s2_sr_hm_nocp_masked_image', 's2_sr_hm_qa_mask_masked_image', 's2_sr_hm_qa_zero_masked_image']
 )
-def test_s2_aux_bands_missing_data(
-    image_id: str, unlink: bool, mask_method: str, region_10000ha: Dict, request: pytest.FixtureRequest
-):
-    """Test Sentinel-2 auxiliary band values for sanity on images that are missing QA* or cloud probability data."""
-    image_id: str = request.getfixturevalue(image_id)
-    ee_image = ee.Image(image_id)
-    ee_image = ee_image.set('system:index', 'unknown') if unlink else ee_image
-    masked_image = Sentinel2SrClImage(ee_image, mask_method=mask_method)
-    _test_mask_stats(masked_image, region_10000ha)
+def test_s2_aux_bands_missing_data(masked_image: str, region_10000ha: Dict, request: pytest.FixtureRequest):
+    """Test Sentinel-2 auxiliary band masking / transparency for unmasked images missing required cloud data."""
+    masked_image: MaskedImage = request.getfixturevalue(masked_image)
+
+    # get region sums of the auxiliary masks
+    proj = masked_image._ee_proj
+    aux_bands = masked_image.ee_image.select('.*MASK|CLOUD_DIST')
+    aux_mask = aux_bands.mask()
+    stats = aux_mask.reduceRegion(
+        reducer='sum', geometry=region_10000ha, crs=proj, scale=proj.nominalScale(), bestEffort=True, maxPixels=1e8
+    )
+    stats = stats.getInfo()
+
+    # test auxiliary masks are transparent
+    assert stats['FILL_MASK'] > 0
+    for band_name in ['CLOUDLESS_MASK', 'CLOUD_MASK', 'SHADOW_MASK', 'CLOUD_DIST']:
+        assert stats[band_name] == 0
 
 
-@pytest.mark.parametrize('masked_image', ['s2_sr_masked_image', 'l9_masked_image'])
+@pytest.mark.parametrize('masked_image', ['gedi_cth_masked_image', 's2_sr_masked_image', 'l9_masked_image'])
 def test_mask_clouds(masked_image: str, region_100ha: Dict, tmp_path, request: pytest.FixtureRequest):
-    """Test MaskedImage.mask_clouds() by downloading and examining dataset masks."""
+    """Test MaskedImage.mask_clouds() masks the fill or cloudless portion by downloading and examining dataset masks."""
     masked_image: MaskedImage = request.getfixturevalue(masked_image)
     filename = tmp_path.joinpath(f'test_image.tif')
     masked_image.mask_clouds()
     proj_scale = masked_image._ee_proj.nominalScale()
-    masked_image.download(filename, region=region_100ha, dtype='int32', scale=proj_scale)
+    masked_image.download(filename, region=region_100ha, dtype='float32', scale=proj_scale)
     assert filename.exists()
+
+    with rio.open(filename, 'r') as ds:
+        mask_name = 'CLOUDLESS_MASK' if isinstance(masked_image, CloudMaskedImage) else 'FILL_MASK'
+        mask = ds.read(ds.descriptions.index(mask_name) + 1, masked=True)
+
+        # test areas outside CLOUDLESS_MASK / FILL_MASK are masked
+        assert np.all(mask == 1)
+
+        # test CLOUDLESS_MASK / FILL_MASK matches the nodata mask for each band
+        mask = mask.filled(0).astype('bool')
+        ds_masks = ds.read_masks().astype('bool')
+        assert np.all(mask == ds_masks)
+
+
+@pytest.mark.parametrize(
+    'masked_image', ['s2_sr_hm_nocp_masked_image', 's2_sr_hm_qa_mask_masked_image', 's2_sr_hm_qa_zero_masked_image']
+)
+def test_s2_mask_clouds_missing_data(masked_image: str, region_100ha: Dict, tmp_path, request: pytest.FixtureRequest):
+    """Test Sentinel2SrClImage.mask_clouds() masks the entire image when it is missing required cloud data. Downloads
+    and examines dataset masks.
+    """
+    masked_image: MaskedImage = request.getfixturevalue(masked_image)
+    filename = tmp_path.joinpath(f'test_image.tif')
+    masked_image.mask_clouds()
+    proj_scale = masked_image._ee_proj.nominalScale()
+    masked_image.download(filename, region=region_100ha, dtype='float32', scale=proj_scale)
+    assert filename.exists()
+
+    # test all data is masked / nodata
     with rio.open(filename, 'r') as ds:
-        ds: rio.DatasetReader = ds
-        cloudless_mask = ds.read(ds.descriptions.index('CLOUDLESS_MASK') + 1, masked=True)
-        assert np.all(cloudless_mask == 1)  # all cloud/shadow areas should be masked (0)
-        cloudless_mask = cloudless_mask.filled(0).astype('bool')  # fill nodata with 0 and cast to bool
-        # test that cloudless_mask is the same as the nodata/dataset mask for each band
         ds_masks = ds.read_masks().astype('bool')
-        assert np.all(cloudless_mask == ds_masks)
+        assert not np.any(ds_masks)
 
 
 def test_skysat_region_stats():

From 69975aacc3517c6f16bc74d8edcd427ee48e54c6 Mon Sep 17 00:00:00 2001
From: Dugal Harris <dugalh@gmail.com>
Date: Thu, 11 Jul 2024 18:41:57 +0200
Subject: [PATCH 13/16] mask aux bands for images missing cloud data

---
 geedim/mask.py | 56 +++++++++++++++++++++++++++++---------------------
 1 file changed, 33 insertions(+), 23 deletions(-)

diff --git a/geedim/mask.py b/geedim/mask.py
index f6af6f6..18c8cad 100644
--- a/geedim/mask.py
+++ b/geedim/mask.py
@@ -73,8 +73,6 @@ def __init__(self, ee_image: ee.Image, mask: bool = _default_mask, region: dict
             Maximum distance (m) to look for clouds when forming the 'cloud distance' band.  Valid for
             Sentinel-2 images.
         """
-        # TODO: consider adding proj_scale parameter here, rather than in _set_region_stats, then it can be re-used in
-        #  S2 cloud masking and distance
         BaseImage.__init__(self, ee_image)
         self._ee_projection = None
         self._add_aux_bands(**kwargs)  # add any mask and cloud distance bands
@@ -109,7 +107,6 @@ def from_id(image_id: str, **kwargs) -> 'MaskedImage':
         MaskedImage
             A MaskedImage, or sub-class instance.
         """
-
         cls = class_from_id(image_id)
         ee_image = ee.Image(image_id)
         gd_image = cls(ee_image, **kwargs)
@@ -150,7 +147,9 @@ def _set_region_stats(self, region: Dict = None, scale: float = None):
         region : dict, ee.Geometry, optional
             Region inside of which to find statistics.  If not specified, the image footprint is used.
         scale: float, optional
-            Re-project to this scale when finding statistics.
+            Re-project to this scale when finding statistics.  Defaults to the scale of
+            :attr:`~MaskedImage._ee_proj`.  Should be provided if the encapsulated image is a composite / without a
+            fixed projection.
         """
         if not region:
             region = self.ee_image.geometry()  # use the image footprint
@@ -190,8 +189,8 @@ def _cloud_dist(self, cloudless_mask: ee.Image = None, max_cloud_dist: float = 5
         cloud_shadow_mask = cloudless_mask.Not()
         cloud_pix = ee.Number(max_cloud_dist).divide(self._ee_proj.nominalScale()).round()  # cloud_dist in pixels
 
-        # Find distance to nearest cloud/shadow (units of 10m).  cloudless_mask is itself masked for validity, so
-        # finding distances to invalid areas is avoided.
+        # Find distance to nearest cloud/shadow (units of 10m).  Distances are found for all pixels, including masked /
+        # invalid pixels, which are treated as 0 (non cloud/shadow).
         cloud_dist = (
             cloud_shadow_mask.fastDistanceTransform(neighborhood=cloud_pix, units='pixels', metric='squared_euclidean')
             .sqrt()
@@ -199,10 +198,10 @@ def _cloud_dist(self, cloudless_mask: ee.Image = None, max_cloud_dist: float = 5
         )
 
         # Reproject to force calculation at correct scale.
-        cloud_dist = cloud_dist.reproject(
-            crs=self._ee_proj,
-            scale=self._ee_proj.nominalScale(),
-        ).rename('CLOUD_DIST')
+        cloud_dist = cloud_dist.reproject(crs=self._ee_proj, scale=self._ee_proj.nominalScale()).rename('CLOUD_DIST')
+
+        # Prevent use of invalid pixels.
+        cloud_dist = cloud_dist.updateMask(cloudless_mask.mask())
 
         # Clip cloud_dist to max_cloud_dist.
         cloud_dist = cloud_dist.where(cloud_dist.gt(ee.Image(max_cloud_dist / 10)), max_cloud_dist / 10)
@@ -220,7 +219,9 @@ def _set_region_stats(self, region: Dict = None, scale: float = None):
         region : dict, ee.Geometry, optional
             Region inside of which to find statistics.  If not specified, the image footprint is used.
         scale: float, optional
-            Re-project to this scale when finding statistics.
+            Re-project to this scale when finding statistics.  Defaults to the scale of
+            :attr:`~MaskedImage._ee_proj`.  Should be provided if the encapsulated image is a composite / without a
+            fixed projection.
         """
         if not region:
             region = self.ee_image.geometry()  # use the image footprint
@@ -393,10 +394,12 @@ def get_cloud_prob(ee_im):
             """Get the cloud probability image from COPERNICUS/S2_CLOUD_PROBABILITY that corresponds to `ee_im`, if it
             exists.  Otherwise, get a 100% probability.
             """
-            # default 100% cloud probability image
-            default_cloud_prob = ee.Image(100)
+            # default masked cloud probability image
+            default_cloud_prob = ee.Image().updateMask(0)
 
             # get the cloud probability image if it exists, otherwise revert to default_cloud_prob
+            # (ee.Image.linkCollection() has a similar functionality but the masked image it returns when cloud
+            # probability doesn't exist is 0 rather than nodata on download)
             filt = ee.Filter.eq('system:index', ee_im.get('system:index'))
             cloud_prob = ee.ImageCollection('COPERNICUS/S2_CLOUD_PROBABILITY').filter(filt).first()
             cloud_prob = ee.Image(ee.List([cloud_prob, default_cloud_prob]).reduce(ee.Reducer.firstNonNull()))
@@ -410,11 +413,12 @@ def get_cloud_mask(ee_im, cloud_prob=None):
                     cloud_prob = get_cloud_prob(ee_im)
                 cloud_mask = cloud_prob.gte(prob)
             else:
-                # find cloud mask, setting to true for images post Feb 2022 when QA60 was no longer populated
-                qa_invalid = ee.Number(ee_im.date().difference(ee.Date('2022-02-01'), 'days').gt(0))
-                qa = ee_im.select('QA60')
-                cloud_mask = qa.bitwiseAnd(1 << 10).neq(0).bitwiseOr(qa_invalid)
+                # mask QA60 if it is post Feb 2022 to invalidate the cloud mask (post Feb 2022 QA60 is no longer
+                # populated and may be masked / transparent or zero).
+                qa_valid = ee.Number(ee_im.date().difference(ee.Date('2022-02-01'), 'days').lt(0))
+                qa = ee_im.select('QA60').updateMask(qa_valid)
 
+                cloud_mask = qa.bitwiseAnd(1 << 10).neq(0)
                 if mask_cirrus:
                     cloud_mask = cloud_mask.Or(qa.bitwiseAnd(1 << 11).neq(0))
 
@@ -451,14 +455,20 @@ def get_shadow_mask(ee_im, cloud_mask):
             shadow_azimuth = ee.Number(-90).add(ee.Number(ee_im.get('MEAN_SOLAR_AZIMUTH_ANGLE')))
             proj_pixels = ee.Number(shadow_dist).divide(self._ee_proj.nominalScale()).round()
 
-            # Project the cloud mask in the direction of the shadows it will cast.
+            # Project the cloud mask in the direction of the shadows it will cast (can project the mask into invalid
+            # areas).
             cloud_cast_proj = cloud_mask.directionalDistanceTransform(shadow_azimuth, proj_pixels).select('distance')
 
-            # reproject to force calculation at the correct scale
-            cloud_cast_mask = cloud_cast_proj.mask().reproject(
-                crs=self._ee_proj.crs(), scale=self._ee_proj.nominalScale()
-            )
-            return cloud_cast_mask.And(dark_mask).rename('SHADOW_MASK')
+            # Reproject to force calculation at the correct scale.
+            cloud_cast_mask = cloud_cast_proj.mask().reproject(crs=self._ee_proj, scale=self._ee_proj.nominalScale())
+
+            # Remove any projections in invalid areas.
+            cloud_cast_mask = cloud_cast_mask.updateMask(cloud_mask.mask())
+
+            # Find shadow mask as intersection between projected clouds and dark areas.
+            shadow_mask = cloud_cast_mask.And(dark_mask)
+
+            return shadow_mask.rename('SHADOW_MASK')
 
         # combine the various masks
         ee_image = self.ee_image

From 98081e5b7b211263deec3d80dd4e75c7f0ddb89f Mon Sep 17 00:00:00 2001
From: Dugal Harris <dugalh@gmail.com>
Date: Fri, 12 Jul 2024 13:38:29 +0200
Subject: [PATCH 14/16] format updates

---
 tests/test_cli.py      | 283 +++++++++++++++++++++++++----------------
 tests/test_download.py |   1 -
 tests/test_tile.py     |   2 +-
 3 files changed, 175 insertions(+), 111 deletions(-)

diff --git a/tests/test_cli.py b/tests/test_cli.py
index 66fa3c1..6a14773 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -13,62 +13,72 @@
     See the License for the specific language governing permissions and
     limitations under the License.
 """
+
 import json
 import pathlib
 from datetime import datetime
 from glob import glob
-from typing import List, Dict, Tuple
+from typing import Dict, List, Tuple
 
 import ee
 import numpy as np
 import pytest
 import rasterio as rio
 from click.testing import CliRunner
-from geedim.cli import cli
-from geedim.utils import root_path, asset_id
 from rasterio.coords import BoundingBox
 from rasterio.crs import CRS
 from rasterio.features import bounds
-from rasterio.warp import transform_geom
 from rasterio.transform import Affine
+from rasterio.warp import transform_geom
+
+from geedim.cli import cli
+from geedim.utils import asset_id
 
 
 @pytest.fixture()
 def l4_5_image_id_list(l4_image_id, l5_image_id) -> List[str]:
-    """ A list of landsat 4 & 5 image ID's. """
+    """A list of landsat 4 & 5 image ID's."""
     return [l4_image_id, l5_image_id]
 
 
 @pytest.fixture()
 def l8_9_image_id_list(l8_image_id, l9_image_id) -> List[str]:
-    """ A list of landsat 8 & 9 image ID's. """
+    """A list of landsat 8 & 9 image ID's."""
     return [l8_image_id, l9_image_id]
 
 
 @pytest.fixture()
 def s2_sr_image_id_list() -> List[str]:
-    """ A list of Sentinel-2 SR image IDs. """
+    """A list of Sentinel-2 SR image IDs."""
     return [
         'COPERNICUS/S2_SR/20211004T080801_20211004T083709_T34HEJ',
         'COPERNICUS/S2_SR/20211123T081241_20211123T083704_T34HEJ',
-        'COPERNICUS/S2_SR/20220107T081229_20220107T083059_T34HEJ'
+        'COPERNICUS/S2_SR/20220107T081229_20220107T083059_T34HEJ',
     ]
 
 
 @pytest.fixture()
 def gedi_image_id_list() -> List[str]:
-    """ A list of GEDI canopy top height ID's. """
+    """A list of GEDI canopy top height ID's."""
     return [
-        'LARSE/GEDI/GEDI02_A_002_MONTHLY/202008_018E_036S', 'LARSE/GEDI/GEDI02_A_002_MONTHLY/202009_018E_036S',
-        'LARSE/GEDI/GEDI02_A_002_MONTHLY/202005_018E_036S'
+        'LARSE/GEDI/GEDI02_A_002_MONTHLY/202008_018E_036S',
+        'LARSE/GEDI/GEDI02_A_002_MONTHLY/202009_018E_036S',
+        'LARSE/GEDI/GEDI02_A_002_MONTHLY/202005_018E_036S',
     ]
 
 
 def _test_downloaded_file(
-    filename: pathlib.Path, region: Dict = None, crs: str = None, scale: float = None, dtype: str = None,
-    bands: List[str] = None, scale_offset: bool = None, transform: Affine = None, shape: Tuple[int, int] = None
+    filename: pathlib.Path,
+    region: Dict = None,
+    crs: str = None,
+    scale: float = None,
+    dtype: str = None,
+    bands: List[str] = None,
+    scale_offset: bool = None,
+    transform: Affine = None,
+    shape: Tuple[int, int] = None,
 ):
-    """ Helper function to test image file format against given parameters. """
+    """Helper function to test image file format against given parameters."""
     with rio.open(filename, 'r') as ds:
         ds: rio.DatasetReader = ds
         assert ds.nodata is not None
@@ -79,8 +89,10 @@ def _test_downloaded_file(
             exp_region = transform_geom('EPSG:4326', ds.crs, region)
             exp_bounds = BoundingBox(*bounds(exp_region))
             assert (
-                (ds.bounds[0] <= exp_bounds[0]) and (ds.bounds[1] <= exp_bounds[1]) and
-                (ds.bounds[2] >= exp_bounds[2]) and (ds.bounds[3] >= exp_bounds[3])
+                (ds.bounds[0] <= exp_bounds[0])
+                and (ds.bounds[1] <= exp_bounds[1])
+                and (ds.bounds[2] >= exp_bounds[2])
+                and (ds.bounds[3] >= exp_bounds[3])
             )
         if crs:
             assert CRS(ds.crs) == CRS.from_string(crs)
@@ -90,7 +102,8 @@ def _test_downloaded_file(
             assert ds.dtypes[0] == dtype
         if scale_offset:
             refl_bands = [
-                i for i in range(1, ds.count + 1)
+                i
+                for i in range(1, ds.count + 1)
                 if ('center_wavelength' in ds.tags(i)) and (float(ds.tags(i)['center_wavelength']) < 1)
             ]
             array = ds.read(refl_bands, masked=True)
@@ -102,11 +115,12 @@ def _test_downloaded_file(
             assert ds.shape == tuple(shape)
         if bands:
             assert set(bands) == set(ds.descriptions)
-            assert set(bands) ==  set([ds.tags(bi)['name'] for bi in range(1, ds.count + 1)])
+            assert set(bands) == set([ds.tags(bi)['name'] for bi in range(1, ds.count + 1)])
 
 
 @pytest.mark.parametrize(
-    'name, start_date, end_date, region, fill_portion, cloudless_portion, is_csmask', [
+    'name, start_date, end_date, region, fill_portion, cloudless_portion, is_csmask',
+    [
         ('LANDSAT/LC09/C02/T1_L2', '2022-01-01', '2022-02-01', 'region_100ha_file', 10, 50, True),
         ('LANDSAT/LE07/C02/T1_L2', '2022-01-01', '2022-02-01', 'region_100ha_file', 0, 0, True),
         ('LANDSAT/LT05/C02/T1_L2', '2005-01-01', '2006-02-01', 'region_100ha_file', 40, 50, True),
@@ -114,12 +128,20 @@ def _test_downloaded_file(
         ('COPERNICUS/S2', '2022-01-01', '2022-01-15', 'region_100ha_file', 50, 40, True),
         ('COPERNICUS/S2_SR_HARMONIZED', '2022-01-01', '2022-01-15', 'region_100ha_file', 0, 50, True),
         ('COPERNICUS/S2_HARMONIZED', '2022-01-01', '2022-01-15', 'region_100ha_file', 50, 40, True),
-        ('LARSE/GEDI/GEDI02_A_002_MONTHLY', '2021-11-01', '2022-01-01', 'region_100ha_file', 1, 0, False)
-    ]
+        ('LARSE/GEDI/GEDI02_A_002_MONTHLY', '2021-11-01', '2022-01-01', 'region_100ha_file', 1, 0, False),
+    ],
 )
 def test_search(
-    name, start_date: str, end_date: str, region: str, fill_portion: float, cloudless_portion: float, is_csmask: bool,
-    tmp_path: pathlib.Path, runner: CliRunner, request: pytest.FixtureRequest
+    name,
+    start_date: str,
+    end_date: str,
+    region: str,
+    fill_portion: float,
+    cloudless_portion: float,
+    is_csmask: bool,
+    tmp_path: pathlib.Path,
+    runner: CliRunner,
+    request: pytest.FixtureRequest,
 ):
     """
     Test search command gives valid results for different cloud/shadow maskable, and generic collections.
@@ -131,8 +153,8 @@ def test_search(
         f'-cp {cloudless_portion} -op {results_file}'
     )
     result = runner.invoke(cli, cli_str.split())
-    assert (result.exit_code == 0)
-    assert (results_file.exists())
+    assert result.exit_code == 0
+    assert results_file.exists()
     with open(results_file, 'r') as f:
         properties = json.load(f)
 
@@ -155,7 +177,7 @@ def test_search(
 
 
 def test_config_search_s2(region_10000ha_file: pathlib.Path, runner: CliRunner, tmp_path: pathlib.Path):
-    """ Test `config` sub-command chained with `search` of Sentinel-2 affects CLOUDLESS_PORTION as expected. """
+    """Test `config` sub-command chained with `search` of Sentinel-2 affects CLOUDLESS_PORTION as expected."""
     results_file = tmp_path.joinpath('search_results.json')
     name = 'COPERNICUS/S2_SR'
     cl_portion_list = []
@@ -165,8 +187,8 @@ def test_config_search_s2(region_10000ha_file: pathlib.Path, runner: CliRunner,
             f'{results_file}'
         )
         result = runner.invoke(cli, cli_str.split())
-        assert (result.exit_code == 0)
-        assert (results_file.exists())
+        assert result.exit_code == 0
+        assert results_file.exists()
         with open(results_file, 'r') as f:
             properties = json.load(f)
         cl_portion_list.append(np.array([prop_dict['CLOUDLESS_PORTION'] for prop_dict in properties.values()]))
@@ -176,7 +198,7 @@ def test_config_search_s2(region_10000ha_file: pathlib.Path, runner: CliRunner,
 
 
 def test_config_search_l9(region_10000ha_file: pathlib.Path, runner: CliRunner, tmp_path: pathlib.Path):
-    """ Test `config` sub-command chained with `search` of Landsat-9 affects CLOUDLESS_PORTION as expected. """
+    """Test `config` sub-command chained with `search` of Landsat-9 affects CLOUDLESS_PORTION as expected."""
     results_file = tmp_path.joinpath('search_results.json')
     name = 'LANDSAT/LC09/C02/T1_L2'
     cl_portion_list = []
@@ -186,8 +208,8 @@ def test_config_search_l9(region_10000ha_file: pathlib.Path, runner: CliRunner,
             f' {results_file}'
         )
         result = runner.invoke(cli, cli_str.split())
-        assert (result.exit_code == 0)
-        assert (results_file.exists())
+        assert result.exit_code == 0
+        assert results_file.exists()
         with open(results_file, 'r') as f:
             properties = json.load(f)
         cl_portion_list.append(np.array([prop_dict['CLOUDLESS_PORTION'] for prop_dict in properties.values()]))
@@ -197,7 +219,7 @@ def test_config_search_l9(region_10000ha_file: pathlib.Path, runner: CliRunner,
 
 
 def test_region_bbox_search(region_100ha_file: pathlib.Path, runner: CliRunner, tmp_path: pathlib.Path):
-    """ Test --bbox gives same search results as --region <geojson file>. """
+    """Test --bbox gives same search results as --region <geojson file>."""
 
     results_file = tmp_path.joinpath('search_results.json')
     with open(region_100ha_file, 'r') as f:
@@ -206,14 +228,14 @@ def test_region_bbox_search(region_100ha_file: pathlib.Path, runner: CliRunner,
     bbox_str = ' '.join([str(b) for b in bbox])
     cli_strs = [
         f'search -c LANDSAT/LC09/C02/T1_L2 -s 2022-01-01 -e 2022-02-01 -r {region_100ha_file} -op {results_file}',
-        f'search -c LANDSAT/LC09/C02/T1_L2 -s 2022-01-01 -e 2022-02-01 -b {bbox_str} -op {results_file}'
+        f'search -c LANDSAT/LC09/C02/T1_L2 -s 2022-01-01 -e 2022-02-01 -b {bbox_str} -op {results_file}',
     ]
 
     props_list = []
     for cli_str in cli_strs:
         result = runner.invoke(cli, cli_str.split())
-        assert (result.exit_code == 0)
-        assert (results_file.exists())
+        assert result.exit_code == 0
+        assert results_file.exists()
         with open(results_file, 'r') as f:
             properties = json.load(f)
         props_list.append(properties)
@@ -222,19 +244,19 @@ def test_region_bbox_search(region_100ha_file: pathlib.Path, runner: CliRunner,
 
 
 def test_raster_region_search(const_image_25ha_file, region_25ha_file, runner: CliRunner, tmp_path: pathlib.Path):
-    """ Test --region works with a raster file. """
+    """Test --region works with a raster file."""
 
     results_file = tmp_path.joinpath('search_results.json')
     cli_strs = [
         f'search -c LANDSAT/LC09/C02/T1_L2 -s 2022-01-01 -e 2022-02-01 -r {region_25ha_file} -op {results_file}',
-        f'search -c LANDSAT/LC09/C02/T1_L2 -s 2022-01-01 -e 2022-02-01 -r {const_image_25ha_file} -op {results_file}'
+        f'search -c LANDSAT/LC09/C02/T1_L2 -s 2022-01-01 -e 2022-02-01 -r {const_image_25ha_file} -op {results_file}',
     ]
 
     props_list = []
     for cli_str in cli_strs:
         result = runner.invoke(cli, cli_str.split())
-        assert (result.exit_code == 0)
-        assert (results_file.exists())
+        assert result.exit_code == 0
+        assert results_file.exists()
         with open(results_file, 'r') as f:
             properties = json.load(f)
         props_list.append(properties)
@@ -243,17 +265,15 @@ def test_raster_region_search(const_image_25ha_file, region_25ha_file, runner: C
 
 
 def test_search_add_props_l9(region_25ha_file: pathlib.Path, runner: CliRunner, tmp_path: pathlib.Path):
-    """ Test --add-property generates results with the additional property keys. """
+    """Test --add-property generates results with the additional property keys."""
     results_file = tmp_path.joinpath('search_results.json')
     name = 'LANDSAT/LC09/C02/T1_L2'
     add_props = ['CLOUD_COVER', 'GEOMETRIC_RMSE_VERIFY']
     add_props_str = ''.join([f' -ap {add_prop} ' for add_prop in add_props])
-    cli_str = (
-        f'search -c {name} -s 2022-01-01 -e 2022-02-01 -r {region_25ha_file} {add_props_str} -op {results_file}'
-    )
+    cli_str = f'search -c {name} -s 2022-01-01 -e 2022-02-01 -r {region_25ha_file} {add_props_str} -op {results_file}'
     result = runner.invoke(cli, cli_str.split())
-    assert (result.exit_code == 0)
-    assert (results_file.exists())
+    assert result.exit_code == 0
+    assert results_file.exists()
     with open(results_file, 'r') as f:
         properties = json.load(f)
     prop_keys = list(properties.values())[0].keys()
@@ -262,7 +282,7 @@ def test_search_add_props_l9(region_25ha_file: pathlib.Path, runner: CliRunner,
 
 
 def test_search_custom_filter_l9(region_25ha_file: pathlib.Path, runner: CliRunner, tmp_path: pathlib.Path):
-    """ Test --custom-filter filters the search results as specified. """
+    """Test --custom-filter filters the search results as specified."""
     results_file = tmp_path.joinpath('search_results.json')
     name = 'LANDSAT/LC09/C02/T1_L2'
     cc_thresh = 30
@@ -272,8 +292,8 @@ def test_search_custom_filter_l9(region_25ha_file: pathlib.Path, runner: CliRunn
         f'-cf {add_prop}>{cc_thresh} -op {results_file}'
     )
     result = runner.invoke(cli, cli_str.split())
-    assert (result.exit_code == 0)
-    assert (results_file.exists())
+    assert result.exit_code == 0
+    assert results_file.exists()
     with open(results_file, 'r') as f:
         properties = json.load(f)
     prop_keys = list(properties.values())[0].keys()
@@ -281,20 +301,16 @@ def test_search_custom_filter_l9(region_25ha_file: pathlib.Path, runner: CliRunn
     assert all([prop[add_prop] > cc_thresh for prop in properties.values()])
 
 
-def test_search_cloudless_portion_no_value(
-    region_25ha_file: pathlib.Path, runner: CliRunner, tmp_path: pathlib.Path
-):
-    """ Test `search --cloudless-portion` gives the same results as `search --cloudless-portion 0`. """
+def test_search_cloudless_portion_no_value(region_25ha_file: pathlib.Path, runner: CliRunner, tmp_path: pathlib.Path):
+    """Test `search --cloudless-portion` gives the same results as `search --cloudless-portion 0`."""
     results_file = tmp_path.joinpath(f'search_results.json')
     name = 'LANDSAT/LC09/C02/T1_L2'
     clp_list = []
     for post_fix, cp_spec in zip(['no_val', 'zero_val'], ['-cp', '-cp 0']):
-        cli_str = (
-            f'search -c {name} -s 2022-01-01 -e 2022-04-01 -r {region_25ha_file} {cp_spec} -op {results_file}'
-        )
+        cli_str = f'search -c {name} -s 2022-01-01 -e 2022-04-01 -r {region_25ha_file} {cp_spec} -op {results_file}'
         result = runner.invoke(cli, cli_str.split())
-        assert (result.exit_code == 0)
-        assert (results_file.exists())
+        assert result.exit_code == 0
+        assert results_file.exists()
         with open(results_file, 'r') as f:
             props = json.load(f)
         prop_keys = list(props.values())[0].keys()
@@ -306,25 +322,26 @@ def test_search_cloudless_portion_no_value(
 
 
 @pytest.mark.parametrize(
-    'image_id, region_file', [
+    'image_id, region_file',
+    [
         ('l8_image_id', 'region_25ha_file'),
         ('s2_sr_hm_image_id', 'region_25ha_file'),
         ('gedi_cth_image_id', 'region_25ha_file'),
         ('modis_nbar_image_id', 'region_25ha_file'),
-    ]
+    ],
 )  # yapf: disable
 def test_download_region_defaults(
     image_id: str, region_file: pathlib.Path, tmp_path: pathlib.Path, runner: CliRunner, request
 ):
-    """ Test image download with default crs, scale, dtype etc.  """
+    """Test image download with default crs, scale, dtype etc."""
     image_id = request.getfixturevalue(image_id)
     region_file = request.getfixturevalue(region_file)
     out_file = tmp_path.joinpath(image_id.replace('/', '-') + '.tif')
 
     cli_str = f'download -i {image_id} -r {region_file} -dd {tmp_path}'
     result = runner.invoke(cli, cli_str.split())
-    assert (result.exit_code == 0)
-    assert (out_file.exists())
+    assert result.exit_code == 0
+    assert out_file.exists()
 
     # test downloaded file readability and format
     with open(region_file) as f:
@@ -333,16 +350,17 @@ def test_download_region_defaults(
 
 
 @pytest.mark.parametrize(
-    'image_id, region_file', [
+    'image_id, region_file',
+    [
         ('l8_image_id', 'region_25ha_file'),
         ('s2_sr_hm_image_id', 'region_25ha_file'),
         ('gedi_cth_image_id', 'region_25ha_file'),
-    ]
+    ],
 )  # yapf: disable
 def test_download_crs_transform(
     image_id: str, region_file: pathlib.Path, tmp_path: pathlib.Path, runner: CliRunner, request
 ):
-    """ Test image download with crs, crs_transform, & shape specified. """
+    """Test image download with crs, crs_transform, & shape specified."""
     image_id = request.getfixturevalue(image_id)
     region_file = request.getfixturevalue(region_file)
     out_file = tmp_path.joinpath(image_id.replace('/', '-') + '.tif')
@@ -360,8 +378,8 @@ def test_download_crs_transform(
     # run the download
     cli_str = f'download -i {image_id} -c {crs} -ct {crs_transform_str} -sh {shape_str} -dd {tmp_path}'
     result = runner.invoke(cli, cli_str.split())
-    assert (result.exit_code == 0)
-    assert (out_file.exists())
+    assert result.exit_code == 0
+    assert out_file.exists()
 
     # test downloaded file readability and format
     _test_downloaded_file(out_file, crs=crs, region=region, transform=crs_transform, shape=shape)
@@ -370,21 +388,22 @@ def test_download_crs_transform(
 def test_download_like(
     l8_image_id: str, s2_sr_image_id: str, region_25ha_file: pathlib.Path, tmp_path: pathlib.Path, runner: CliRunner
 ):
-    """ Test image download using --like. """
+    """Test image download using --like."""
     l8_file = tmp_path.joinpath(l8_image_id.replace('/', '-') + '.tif')
     s2_file = tmp_path.joinpath(s2_sr_image_id.replace('/', '-') + '.tif')
 
     # download the landsat 8 image to be the template
+    # TODO: make a synthetic template to save time
     l8_cli_str = f'download -i {l8_image_id} -r {region_25ha_file} -dd {tmp_path}'
     result = runner.invoke(cli, l8_cli_str.split())
-    assert (result.exit_code == 0)
-    assert (l8_file.exists())
+    assert result.exit_code == 0
+    assert l8_file.exists()
 
     # download the sentinel 2 image like the landsat 8 image
     s2_cli_str = f'download -i {s2_sr_image_id} --like {l8_file} -dd {tmp_path}'
     result = runner.invoke(cli, s2_cli_str.split())
-    assert (result.exit_code == 0)
-    assert (s2_file.exists())
+    assert result.exit_code == 0
+    assert s2_file.exists()
 
     # test the landsat 8 image is 'like' the sentinel 2 image
     with rio.open(l8_file) as l8_im, rio.open(s2_file, 'r') as s2_im:
@@ -394,27 +413,57 @@ def test_download_like(
 
 
 @pytest.mark.parametrize(
-    'image_id, region_file, crs, scale, dtype, bands, mask, resampling, scale_offset, max_tile_size, max_tile_dim', [
+    'image_id, region_file, crs, scale, dtype, bands, mask, resampling, scale_offset, max_tile_size, max_tile_dim',
+    [
         ('l5_image_id', 'region_25ha_file', 'EPSG:3857', 30, 'uint16', None, False, 'near', False, 16, 10000),
         ('l9_image_id', 'region_25ha_file', 'EPSG:3857', 30, 'float32', None, False, 'near', True, 32, 10000),
         (
-            's2_toa_image_id', 'region_25ha_file', 'EPSG:3857', 10, 'float64', ['B5', 'B9'], True, 'bilinear', True, 32,
-            10000
+            's2_toa_image_id',
+            'region_25ha_file',
+            'EPSG:3857',
+            10,
+            'float64',
+            ['B5', 'B9'],
+            True,
+            'bilinear',
+            True,
+            32,
+            10000,
         ),
         ('modis_nbar_image_id', 'region_100ha_file', 'EPSG:3857', 500, 'int32', None, False, 'bicubic', False, 4, 100),
         (
-            'gedi_cth_image_id', 'region_25ha_file', 'EPSG:3857', 10, 'float32', ['rh99'], True, 'bilinear', False, 32,
-            10000
+            'gedi_cth_image_id',
+            'region_25ha_file',
+            'EPSG:3857',
+            10,
+            'float32',
+            ['rh99'],
+            True,
+            'bilinear',
+            False,
+            32,
+            10000,
         ),
         ('landsat_ndvi_image_id', 'region_25ha_file', 'EPSG:3857', 30, 'float64', None, True, 'near', False, 32, 10000),
-    ]
-) # yapf: disable
+    ],
+)  # yapf: disable
 def test_download_params(
-    image_id: str, region_file: str, crs: str, scale: float, dtype: str, bands: List[str], mask: bool, resampling: str,
-    scale_offset: bool, max_tile_size: float, max_tile_dim: int, tmp_path: pathlib.Path, runner: CliRunner,
-    request: pytest.FixtureRequest
+    image_id: str,
+    region_file: str,
+    crs: str,
+    scale: float,
+    dtype: str,
+    bands: List[str],
+    mask: bool,
+    resampling: str,
+    scale_offset: bool,
+    max_tile_size: float,
+    max_tile_dim: int,
+    tmp_path: pathlib.Path,
+    runner: CliRunner,
+    request: pytest.FixtureRequest,
 ):
-    """ Test image download, specifying all cli params except crs_transform and shape. """
+    """Test image download, specifying all cli params except crs_transform and shape."""
     image_id = request.getfixturevalue(image_id)
     region_file = request.getfixturevalue(region_file)
     out_file = tmp_path.joinpath(image_id.replace('/', '-') + '.tif')
@@ -427,8 +476,8 @@ def test_download_params(
     cli_str += ' --scale-offset' if scale_offset else ' --no-scale-offset'
     cli_str += ''.join([f' --band-name {bn}' for bn in bands]) if bands else ''
     result = runner.invoke(cli, cli_str.split())
-    assert (result.exit_code == 0)
-    assert (out_file.exists())
+    assert result.exit_code == 0
+    assert out_file.exists()
 
     with open(region_file) as f:
         region = json.load(f)
@@ -441,10 +490,10 @@ def test_download_params(
 def test_max_tile_size_error(
     s2_sr_image_id: str, region_100ha_file: pathlib.Path, tmp_path: pathlib.Path, runner: CliRunner, request
 ):
-    """ Test image download with max_tile_size > EE limit raises an EE error.  """
+    """Test image download with max_tile_size > EE limit raises an EE error."""
     cli_str = f'download -i {s2_sr_image_id} -r {region_100ha_file} -dd {tmp_path} -mts 100'
     result = runner.invoke(cli, cli_str.split())
-    assert (result.exit_code != 0)
+    assert result.exit_code != 0
     assert isinstance(result.exception, ValueError)
     assert 'download size limit' in str(result.exception)
 
@@ -452,26 +501,26 @@ def test_max_tile_size_error(
 def test_max_tile_dim_error(
     s2_sr_image_id: str, region_100ha_file: pathlib.Path, tmp_path: pathlib.Path, runner: CliRunner, request
 ):
-    """ Test image download with max_tile_dim > EE limit raises an EE error.  """
+    """Test image download with max_tile_dim > EE limit raises an EE error."""
     cli_str = f'download -i {s2_sr_image_id} -r {region_100ha_file} -dd {tmp_path} -mtd 100000'
     result = runner.invoke(cli, cli_str.split())
-    assert (result.exit_code != 0)
+    assert result.exit_code != 0
     assert isinstance(result.exception, ValueError)
     assert 'download limit' in str(result.exception)
 
 
 def test_export_drive_params(l8_image_id: str, region_25ha_file: pathlib.Path, runner: CliRunner):
-    """ Test export to google drive starts ok, specifying all cli params"""
+    """Test export to google drive starts ok, specifying all cli params"""
     cli_str = (
         f'export -i {l8_image_id} -r {region_25ha_file} -df geedim/test --crs EPSG:3857 --scale 30 --dtype uint16 '
         f'--mask --resampling bilinear --no-wait --band-name SR_B4 --band-name SR_B3 --band-name SR_B2'
     )
     result = runner.invoke(cli, cli_str.split())
-    assert (result.exit_code == 0)
+    assert result.exit_code == 0
 
 
 def test_export_asset_params(l8_image_id: str, region_25ha_file: pathlib.Path, runner: CliRunner):
-    """ Test export to asset starts ok, specifying all cli params"""
+    """Test export to asset starts ok, specifying all cli params"""
     # Note when e.g. github runs this test in parallel, it could run into problems trying to overwrite an existing
     # asset.  The overwrite error won't be raised with --no-wait though.  So this test serves at least to check the
     # CLI export options work, and won't fail if run in parallel, even if it runs into overwrite problems.
@@ -487,11 +536,11 @@ def test_export_asset_params(l8_image_id: str, region_25ha_file: pathlib.Path, r
         f'--mask --resampling bilinear --no-wait --type asset --band-name SR_B4 --band-name SR_B3 --band-name SR_B2'
     )
     result = runner.invoke(cli, cli_str.split())
-    assert (result.exit_code == 0)
+    assert result.exit_code == 0
 
 
 def test_export_asset_no_folder_error(l8_image_id: str, region_25ha_file: pathlib.Path, runner: CliRunner):
-    """ Test export to asset raises an error when no folder is specified. """
+    """Test export to asset raises an error when no folder is specified."""
     cli_str = (
         f'export -i {l8_image_id} -r {region_25ha_file} --crs EPSG:3857 --scale 30 '
         f'--dtype uint16 --mask --resampling bilinear --no-wait --type asset'
@@ -503,16 +552,22 @@ def test_export_asset_no_folder_error(l8_image_id: str, region_25ha_file: pathli
 
 @pytest.mark.parametrize('image_list, scale', [('s2_sr_image_id_list', 10), ('l8_9_image_id_list', 30)])
 def test_composite_defaults(
-    image_list: str, scale: float, region_25ha_file: pathlib.Path, runner: CliRunner, tmp_path: pathlib.Path,
-    request: pytest.FixtureRequest
+    image_list: str,
+    scale: float,
+    region_25ha_file: pathlib.Path,
+    runner: CliRunner,
+    tmp_path: pathlib.Path,
+    request: pytest.FixtureRequest,
 ):
-    """ Test composite with default CLI parameters.  """
+    """Test composite with default CLI parameters."""
     image_list = request.getfixturevalue(image_list)
     image_ids_str = ' -i '.join(image_list)
-    cli_str = f'composite -i {image_ids_str} download --crs EPSG:3857 --scale {scale} -r {region_25ha_file} -dd' \
-              f' {tmp_path}'
+    cli_str = (
+        f'composite -i {image_ids_str} download --crs EPSG:3857 --scale {scale} -r {region_25ha_file} -dd'
+        f' {tmp_path}'
+    )
     result = runner.invoke(cli, cli_str.split())
-    assert (result.exit_code == 0)
+    assert result.exit_code == 0
 
     # test downloaded file exists
     out_files = glob(str(tmp_path.joinpath(f'*COMP*.tif')))
@@ -525,18 +580,28 @@ def test_composite_defaults(
 
 
 @pytest.mark.parametrize(
-    'image_list, method, region_file, date, mask, resampling, download_scale', [
+    'image_list, method, region_file, date, mask, resampling, download_scale',
+    [
         ('s2_sr_image_id_list', 'mosaic', None, '2021-10-01', True, 'near', 10),
         ('l8_9_image_id_list', 'q-mosaic', 'region_25ha_file', None, True, 'bilinear', 30),
         ('l8_9_image_id_list', 'medoid', 'region_25ha_file', None, True, 'near', 30),
         ('gedi_image_id_list', 'medoid', None, None, True, 'bilinear', 25),
-    ]
+    ],
 )
 def test_composite_params(
-    image_list: str, method: str, region_file: str, date: str, mask: bool, resampling: str, download_scale: float,
-    region_25ha_file, runner: CliRunner, tmp_path: pathlib.Path, request: pytest.FixtureRequest
+    image_list: str,
+    method: str,
+    region_file: str,
+    date: str,
+    mask: bool,
+    resampling: str,
+    download_scale: float,
+    region_25ha_file,
+    runner: CliRunner,
+    tmp_path: pathlib.Path,
+    request: pytest.FixtureRequest,
 ):
-    """ Test composite with default CLI parameters. """
+    """Test composite with default CLI parameters."""
     image_list = request.getfixturevalue(image_list)
     region_file = request.getfixturevalue(region_file) if region_file else None
     image_ids_str = ' -i '.join(image_list)
@@ -547,7 +612,7 @@ def test_composite_params(
     cli_download_str = f'download -r {region_25ha_file} --crs EPSG:3857 --scale {download_scale} -dd {tmp_path}'
     cli_str = cli_comp_str + ' ' + cli_download_str
     result = runner.invoke(cli, cli_str.split())
-    assert (result.exit_code == 0)
+    assert result.exit_code == 0
 
     # test downloaded file exists
     out_files = glob(str(tmp_path.joinpath(f'*COMP*.tif')))
@@ -560,14 +625,14 @@ def test_composite_params(
 
 
 def test_search_composite_download(region_25ha_file, runner: CliRunner, tmp_path: pathlib.Path):
-    """ Test chaining of `search`, `composite` and `download`. """
+    """Test chaining of `search`, `composite` and `download`."""
 
     cli_search_str = f'search -c COPERNICUS/S1_GRD -s 2022-01-01 -e 2022-02-01 -r {region_25ha_file}'
     cli_comp_str = f'composite --mask'
     cli_download_str = f'download --crs EPSG:3857 --scale 10 -dd {tmp_path}'
     cli_str = cli_search_str + ' ' + cli_comp_str + ' ' + cli_download_str
     result = runner.invoke(cli, cli_str.split())
-    assert (result.exit_code == 0)
+    assert result.exit_code == 0
 
     # test downloaded file exists
     out_files = glob(str(tmp_path.joinpath(f'*COMP*.tif')))
@@ -591,7 +656,7 @@ def test_search_composite_x2_download(region_25ha_file, runner: CliRunner, tmp_p
     cli_download_str = f'download --crs EPSG:3857 --scale 30 -dd {tmp_path}'
     cli_str = cli_search_str + ' ' + cli_comp1_str + ' ' + cli_comp2_str + ' ' + cli_download_str
     result = runner.invoke(cli, cli_str.split())
-    assert (result.exit_code == 0)
+    assert result.exit_code == 0
 
     # test downloaded file exists
     out_files = glob(str(tmp_path.joinpath(f'*COMP*.tif')))
diff --git a/tests/test_download.py b/tests/test_download.py
index 9aa3c25..d01dd42 100644
--- a/tests/test_download.py
+++ b/tests/test_download.py
@@ -127,7 +127,6 @@ def _test_export_profile(exp_profile, tgt_profile, transform_shape=False):
         assert exp_profile[key] == tgt_profile[key]
 
     assert utils.rio_crs(exp_profile['crs']) == utils.rio_crs(tgt_profile['crs'])
-
     assert exp_profile['transform'][0] == tgt_profile['transform'][0]
 
     if transform_shape:
diff --git a/tests/test_tile.py b/tests/test_tile.py
index 7739c96..3fb49d7 100644
--- a/tests/test_tile.py
+++ b/tests/test_tile.py
@@ -74,7 +74,7 @@ def zipped_gtiff_bytes(mock_base_image: BaseImageLike) -> bytes:
             **rio.default_gtiff_profile,
             width=mock_base_image.shape[1],
             height=mock_base_image.shape[0],
-            count=mock_base_image.count
+            count=mock_base_image.count,
         ) as ds:
             ds.write(array)
         with zipfile.ZipFile(zip_buffer, 'a', zipfile.ZIP_DEFLATED, False) as zf:

From d5b3032403f6e97a7cff3b6e25cb14b2e40ebd62 Mon Sep 17 00:00:00 2001
From: Dugal Harris <dugalh@gmail.com>
Date: Fri, 12 Jul 2024 13:39:22 +0200
Subject: [PATCH 15/16] change and add tests for masking with missing cloud
 data

---
 tests/test_collection.py | 270 ++++++++++++++++++++++++++-------------
 1 file changed, 183 insertions(+), 87 deletions(-)

diff --git a/tests/test_collection.py b/tests/test_collection.py
index 7b04ee0..2071bde 100644
--- a/tests/test_collection.py
+++ b/tests/test_collection.py
@@ -13,55 +13,57 @@
     See the License for the specific language governing permissions and
     limitations under the License.
 """
+
 from datetime import datetime
-from typing import List, Union, Dict
+from typing import Dict, List, Union
 
 import ee
 import numpy as np
 import pytest
+
 from geedim import schema
 from geedim.collection import MaskedCollection
 from geedim.enums import CompositeMethod, ResamplingMethod
-from geedim.errors import UnfilteredError, InputImageError
+from geedim.errors import InputImageError, UnfilteredError
 from geedim.mask import MaskedImage
-from geedim.utils import split_id, get_projection
-
+from geedim.utils import get_projection, split_id
 from .conftest import get_image_std
 
 
 @pytest.fixture()
 def l4_5_image_list(l4_image_id, l5_masked_image) -> List[Union[str, MaskedImage]]:
-    """ A list of landsat 4 & 5 image ID / MaskedImage's """
+    """A list of landsat 4 & 5 image ID / MaskedImage's"""
     return [l4_image_id, l5_masked_image]
 
 
 @pytest.fixture()
 def l8_9_image_list(l8_image_id, l9_masked_image) -> List[Union[str, MaskedImage]]:
-    """ A list of landsat 8 & 9 image IDs/ MaskedImage's """
+    """A list of landsat 8 & 9 image IDs/ MaskedImage's"""
     return [l8_image_id, l9_masked_image]
 
 
 @pytest.fixture()
 def s2_sr_image_list() -> List[Union[str, MaskedImage]]:
-    """ A list of Sentinel-2 SR image IDs/ MaskedImage's """
+    """A list of Sentinel-2 SR image IDs/ MaskedImage's"""
     return [
         'COPERNICUS/S2_SR/20211004T080801_20211004T083709_T34HEJ',
         'COPERNICUS/S2_SR/20211123T081241_20211123T083704_T34HEJ',
-        MaskedImage.from_id('COPERNICUS/S2_SR/20220107T081229_20220107T083059_T34HEJ')
+        MaskedImage.from_id('COPERNICUS/S2_SR/20220107T081229_20220107T083059_T34HEJ'),
     ]
 
 
 @pytest.fixture()
 def gedi_image_list() -> List[Union[str, MaskedImage]]:
-    """ A list of GEDI canopy top height IDs/ MaskedImage's """
+    """A list of GEDI canopy top height IDs/ MaskedImage's"""
     return [
-        'LARSE/GEDI/GEDI02_A_002_MONTHLY/202008_018E_036S', 'LARSE/GEDI/GEDI02_A_002_MONTHLY/202009_018E_036S',
-        MaskedImage.from_id('LARSE/GEDI/GEDI02_A_002_MONTHLY/202005_018E_036S')
+        'LARSE/GEDI/GEDI02_A_002_MONTHLY/202008_018E_036S',
+        'LARSE/GEDI/GEDI02_A_002_MONTHLY/202009_018E_036S',
+        MaskedImage.from_id('LARSE/GEDI/GEDI02_A_002_MONTHLY/202005_018E_036S'),
     ]
 
 
 def test_split_id():
-    """ Test split_id(). """
+    """Test split_id()."""
     coll_name, im_id = split_id('A/B/C')
     assert coll_name == 'A/B'
     assert im_id == 'C'
@@ -72,7 +74,7 @@ def test_split_id():
 
 @pytest.mark.parametrize('name', ['l9_image_id', 'gch_image_id'])
 def test_from_name(name: str, request):
-    """ Test MaskedCollection.from_name() for non Sentinel-2 collections. """
+    """Test MaskedCollection.from_name() for non Sentinel-2 collections."""
     name = request.getfixturevalue(name)
     name, _ = split_id(name)
     gd_collection = MaskedCollection.from_name(name)
@@ -105,7 +107,7 @@ def test_from_name_s2(name: str, request):
 
 
 def test_unfiltered_error(s2_sr_image_id):
-    """ Test UnfilteredError is raised when calling `properties` or `composite` on an unfiltered collection. """
+    """Test UnfilteredError is raised when calling `properties` or `composite` on an unfiltered collection."""
     gd_collection = MaskedCollection.from_name(split_id(s2_sr_image_id)[0])
     with pytest.raises(UnfilteredError):
         _ = gd_collection.properties
@@ -113,8 +115,8 @@ def test_unfiltered_error(s2_sr_image_id):
         _ = gd_collection.composite()
 
 
-def test_from_list_errors(landsat_image_ids, s2_image_ids, user_masked_image):
-    """ Test MaskedCollection.from_list() various error cases. """
+def test_from_list_errors(landsat_image_ids, user_masked_image):
+    """Test MaskedCollection.from_list() various error cases."""
     with pytest.raises(InputImageError):
         # test an error is raised when an image has no 'id'/'system:time_start' property
         MaskedCollection.from_list([landsat_image_ids[0], user_masked_image])
@@ -166,7 +168,7 @@ def test_from_list(image_list: str, request):
 
 @pytest.mark.parametrize('image_list', ['s2_sr_image_list', 'gedi_image_list'])
 def test_from_list_order(image_list: str, request):
-    """ Test MaskedCollection.from_list() maintains the order of the provided image list. """
+    """Test MaskedCollection.from_list() maintains the order of the provided image list."""
     image_list: List = request.getfixturevalue(image_list)[::-1]
     image_ids = [im_obj if isinstance(im_obj, str) else im_obj.id for im_obj in image_list]
     gd_collection = MaskedCollection.from_list(image_list)
@@ -174,19 +176,21 @@ def test_from_list_order(image_list: str, request):
 
 
 def test_from_list_ee_image(gedi_image_list: List):
-    """ Test MaskedCollection.from_list() with an ee.Image in the list. """
+    """Test MaskedCollection.from_list() with an ee.Image in the list."""
     image_ids = [im_obj if isinstance(im_obj, str) else im_obj.id for im_obj in gedi_image_list]
     image_list = gedi_image_list
     image_list[1] = ee.Image(image_list[1])
     gd_collection = MaskedCollection.from_list(image_list)
     assert list(gd_collection.properties.keys()) == image_ids
 
+
 @pytest.mark.parametrize(
-    'image_list, add_props', [
+    'image_list, add_props',
+    [
         ('s2_sr_image_list', ['AOT_RETRIEVAL_ACCURACY', 'CLOUDY_PIXEL_PERCENTAGE']),
-        ('l8_9_image_list', ['CLOUD_COVER', 'GEOMETRIC_RMSE_VERIFY'])
-    ]
-)  # yapf: disable
+        ('l8_9_image_list', ['CLOUD_COVER', 'GEOMETRIC_RMSE_VERIFY']),
+    ],
+)
 def test_from_list_add_props(image_list: str, add_props: List, request: pytest.FixtureRequest):
     """
     Test MaskedCollection.from_list(add_props=...) contains the add_props in properties and schema.
@@ -204,14 +208,15 @@ def test_from_list_add_props(image_list: str, add_props: List, request: pytest.F
 
 
 @pytest.mark.parametrize(
-    'name, start_date, end_date, region, fill_portion, cloudless_portion, is_csmask', [
+    'name, start_date, end_date, region, fill_portion, cloudless_portion, is_csmask',
+    [
         ('LANDSAT/LC09/C02/T1_L2', '2022-01-01', '2022-02-01', 'region_100ha', 0, 50, True),
         ('LANDSAT/LE07/C02/T1_L2', '2022-01-01', '2022-02-01', 'region_100ha', 0, 0, True),
         ('LANDSAT/LT05/C02/T1_L2', '2005-01-01', '2006-02-01', 'region_100ha', 40, 50, True),
         ('COPERNICUS/S2_SR', '2022-01-01', '2022-01-15', 'region_100ha', 0, 50, True),
         ('COPERNICUS/S2_HARMONIZED', '2022-01-01', '2022-01-15', 'region_100ha', 50, 40, True),
-        ('LARSE/GEDI/GEDI02_A_002_MONTHLY', '2021-08-01', '2021-09-01', 'region_100ha', .1, 0, False)
-    ]
+        ('LARSE/GEDI/GEDI02_A_002_MONTHLY', '2021-08-01', '2021-09-01', 'region_100ha', 0.1, 0, False),
+    ],
 )
 def test_search(
     name, start_date: str, end_date: str, region: str, fill_portion: float, cloudless_portion: float, is_csmask, request
@@ -247,7 +252,7 @@ def test_search(
 
 
 def test_empty_search(region_100ha):
-    """ Test MaskedCollection.search() for empty search results. """
+    """Test MaskedCollection.search() for empty search results."""
     gd_collection = MaskedCollection.from_name('LANDSAT/LC09/C02/T1_L2')
     searched_collection = gd_collection.search('2022-01-01', '2022-01-02', region_100ha, cloudless_portion=100)
     assert searched_collection.properties is not None
@@ -256,7 +261,7 @@ def test_empty_search(region_100ha):
 
 
 def test_search_no_end_date(region_100ha):
-    """ Test MaskedCollection.search() with ``end_date=None`` searches for a day from ``start_date`. """
+    """Test MaskedCollection.search() with ``end_date=None`` searches for a day from ``start_date``."""
     gd_collection = MaskedCollection.from_name('LANDSAT/LC09/C02/T1_L2')
     searched_collection = gd_collection.search('2022-01-03', None, region_100ha)
 
@@ -271,7 +276,7 @@ def test_search_no_end_date(region_100ha):
 
 
 def test_search_date_error(region_100ha):
-    """ Test MaskedCollection.search() raises an error when end date is on or before start date. """
+    """Test MaskedCollection.search() raises an error when end date is on or before start date."""
     gd_collection = MaskedCollection.from_name('LANDSAT/LC09/C02/T1_L2')
     with pytest.raises(ValueError):
         _ = gd_collection.search('2022-01-02', '2022-01-01', region_100ha)
@@ -328,11 +333,12 @@ def test_search_add_props(region_25ha):
 
 
 @pytest.mark.parametrize(
-    'name, start_date, end_date, region', [
+    'name, start_date, end_date, region',
+    [
         ('LANDSAT/LC09/C02/T1_L2', '2022-01-01', '2022-02-01', 'region_100ha'),
         ('COPERNICUS/S2_HARMONIZED', '2022-01-01', '2022-01-15', 'region_100ha'),
-        ('LARSE/GEDI/GEDI02_A_002_MONTHLY', '2021-08-01', '2021-09-01', 'region_100ha')
-    ]
+        ('LARSE/GEDI/GEDI02_A_002_MONTHLY', '2021-08-01', '2021-09-01', 'region_100ha'),
+    ],
 )
 def test_search_no_fill_or_cloudless_portion(
     name: str, start_date: str, end_date: str, region: str, request: pytest.FixtureRequest
@@ -363,14 +369,15 @@ def test_search_no_fill_or_cloudless_portion(
 
 
 @pytest.mark.parametrize(
-    'image_list, method, region, date', [
+    'image_list, method, region, date',
+    [
         ('s2_sr_image_list', CompositeMethod.q_mosaic, 'region_10000ha', None),
         ('s2_sr_image_list', CompositeMethod.q_mosaic, None, '2021-10-01'),
         ('gedi_image_list', CompositeMethod.mosaic, 'region_10000ha', None),
         ('gedi_image_list', CompositeMethod.mosaic, None, '2020-09-01'),
         ('l8_9_image_list', CompositeMethod.medoid, 'region_10000ha', None),
         ('l8_9_image_list', CompositeMethod.medoid, None, '2021-10-01'),
-    ]
+    ],
 )
 def test_composite_region_date_ordering(image_list, method, region, date, request):
     """
@@ -383,13 +390,15 @@ def test_composite_region_date_ordering(image_list, method, region, date, reques
     ee_collection = gd_collection._prepare_for_composite(method=method, date=date, region=region)
     properties = gd_collection._get_properties(ee_collection)
     assert len(properties) == len(image_list)
+
     if region:
-        # test images are ordered by CLOUDLESS/FILL_PORTION
+        # test images are ordered by CLOUDLESS/FILL_PORTION, and that portions are not the same
         schema_keys = list(gd_collection.schema.keys())
-        # CLOUDLESS_PORTION is not in MaskedCollection.schema for generic images, so use FILL_PORTION instead
         portion_key = 'CLOUDLESS_PORTION' if 'CLOUDLESS_PORTION' in schema_keys else 'FILL_PORTION'
         im_portions = [im_props[portion_key] for im_props in properties.values()]
         assert sorted(im_portions) == im_portions
+        assert len(set(im_portions)) == len(im_portions)
+
     elif date:
         # test images are ordered by time difference with `date`
         im_dates = np.array(
@@ -400,60 +409,134 @@ def test_composite_region_date_ordering(image_list, method, region, date, reques
         assert all(sorted(im_date_diff, reverse=True) == im_date_diff)
 
 
+def _get_masked_portion(
+    ee_image: ee.Image,
+    proj: ee.Projection = None,
+    region: dict = None,
+) -> ee.Number:
+    """Return the valid portion of the ``ee_image`` inside ``region``.  Assumes the ``region`` is completely covered
+    by ``ee_image``.
+    """
+    proj = proj or get_projection(ee_image)
+    ee_mask = ee_image.select('SR_B.*|B.*|rh.*').mask().reduce(ee.Reducer.allNonZero()).rename('EE_MASK')
+    mean = ee_mask.reduceRegion(reducer='mean', crs=proj, scale=proj.nominalScale(), geometry=region)
+    return ee.Number(mean.get('EE_MASK')).multiply(100)
+
+
 @pytest.mark.parametrize(
-    'image_list, method, mask', [
-        ('s2_sr_image_list', CompositeMethod.q_mosaic, True), ('s2_sr_image_list', CompositeMethod.mosaic, False),
-        ('l8_9_image_list', CompositeMethod.medoid, True), ('l8_9_image_list', CompositeMethod.median, False),
-        ('s2_sr_image_list', CompositeMethod.medoid, True), ('s2_sr_image_list', CompositeMethod.medoid, False),
+    'image_list, method, mask',
+    [
+        ('s2_sr_image_list', CompositeMethod.q_mosaic, True),
+        ('s2_sr_image_list', CompositeMethod.mosaic, False),
+        ('l8_9_image_list', CompositeMethod.medoid, True),
+        ('l8_9_image_list', CompositeMethod.median, False),
+        ('s2_sr_image_list', CompositeMethod.medoid, True),
+        ('s2_sr_image_list', CompositeMethod.medoid, False),
         ('l8_9_image_list', CompositeMethod.medoid, False),
-    ]
+    ],
 )
 def test_composite_mask(image_list, method, mask, region_100ha, request):
-    """
-    Test the MaskedImage.composite() `mask` parameter results in a masked composite image, comprised of masked
-    component images.
-    """
-
+    """In MaskedImage.composite(), test masking of component and composite images with the `mask` parameter."""
+    # form the composite collection and image
     image_list: List = request.getfixturevalue(image_list)
     gd_collection = MaskedCollection.from_list(image_list)
-    proj = get_projection(gd_collection.ee_collection.first(), min_scale=True)
     ee_collection = gd_collection._prepare_for_composite(method=method, mask=mask)
+    composite_im = gd_collection.composite(method=method, mask=mask)
     properties = gd_collection._get_properties(ee_collection)
+    proj = get_projection(ee_collection.first(), min_scale=True)
     assert len(properties) == len(image_list)
 
-    def count_masked_pixels(ee_image: ee.Image, count_list: ee.List):
-        """ Return the pixel count (area) of the EE mask / valid image area.  """
-        ee_mask = ee_image.select('SR_B.*|B.*|rh.*').mask().reduce(ee.Reducer.allNonZero()).rename('EE_MASK')
-        count = ee_mask.reduceRegion(reducer='sum', crs=proj.crs(), scale=proj.nominalScale(), geometry=region_100ha)
-        return ee.List(count_list).add(count.get('EE_MASK'))
+    def get_masked_portions(ee_image: ee.Image, portions: ee.List) -> ee.List:
+        """Append the valid portion of ``ee_image`` to  ``portions``."""
+        portion = _get_masked_portion(ee_image, proj=proj, region=region_100ha)
+        return ee.List(portions).add(portion)
 
-    # get the mask pixel counts for the component images
-    component_mask_counts = ee_collection.iterate(count_masked_pixels, ee.List([])).getInfo()
-    # get the mask pixel count for the composite image
-    composite_im = gd_collection.composite(method=method, mask=mask)
-    composite_mask_count = count_masked_pixels(composite_im.ee_image, []).get(0).getInfo()
+    # get the mask portions for the component and composite images
+    component_portions = np.array(ee_collection.iterate(get_masked_portions, ee.List([])).getInfo())
+    composite_portion = _get_masked_portion(composite_im.ee_image, proj=proj, region=region_100ha).getInfo()
+
+    # test masking of components and composite image
     if mask:
-        # test the composite mask (valid area) is smaller than combined area of the component image masks
-        assert composite_mask_count <= np.sum(component_mask_counts)
-        # test the composite mask is larger than the smallest of the component image masks
-        assert composite_mask_count >= np.min(component_mask_counts)
+        assert np.all(component_portions > 0) and np.all(component_portions < 100)
+        assert composite_portion <= np.sum(component_portions)
+        assert composite_portion >= np.min(component_portions)
     else:
-        # test that the component mask areas are ~ equal to composite mask area (i.e. the image area)
-        assert np.array(component_mask_counts) == pytest.approx(composite_mask_count, rel=.01)
+        assert component_portions == pytest.approx(100, abs=1)
+        assert composite_portion >= component_portions.max()
 
 
 @pytest.mark.parametrize(
-    'image_list, resampling, std_scale', [
-        ('s2_sr_image_list', ResamplingMethod.bilinear, 60), ('s2_sr_image_list', ResamplingMethod.bicubic, 60),
-        ('s2_sr_image_list', ResamplingMethod.average, 120), ('l8_9_image_list', ResamplingMethod.bilinear, 30),
-        ('l8_9_image_list', ResamplingMethod.bicubic, 30), ('l8_9_image_list', ResamplingMethod.average, 120),
-    ]
+    'masked_image, method, mask_kwargs',
+    [
+        ('s2_sr_hm_nocp_masked_image', 'q-mosaic', dict(mask_method='cloud-prob')),
+        ('s2_sr_hm_qa_mask_masked_image', 'mosaic', dict(mask_method='qa')),
+        ('s2_sr_hm_qa_zero_masked_image', 'medoid', dict(mask_method='qa')),
+    ],
+)
+def test_s2_composite_mask_missing_data(masked_image: str, method: str, mask_kwargs: dict, region_100ha, request):
+    """In MaskedImage.composite(), test when an S2 component image is fully masked due to missing cloud data,
+    the composite image is also fully masked.
+    """
+    # form the composite collection and image
+    masked_image: List = request.getfixturevalue(masked_image)
+    image_list = [masked_image]
+    gd_collection = MaskedCollection.from_list(image_list)
+
+    ee_collection = gd_collection._prepare_for_composite(method=method, mask=True, **mask_kwargs)
+    composite_im = gd_collection.composite(method=method, mask=True, **mask_kwargs)
+    properties = gd_collection._get_properties(ee_collection)
+    proj = get_projection(ee_collection.first(), min_scale=True)
+    assert len(properties) == len(image_list)
+
+    # get the mask portions for the component and composite images
+    component_portion = _get_masked_portion(ee_collection.first(), proj=proj, region=region_100ha).getInfo()
+    composite_portion = _get_masked_portion(composite_im.ee_image, proj=proj, region=region_100ha).getInfo()
+
+    # test component and composite images are fully masked
+    assert component_portion == composite_portion == 0
+
+
+def test_s2_composite_q_mosaic_missing_data(s2_sr_hm_nocp_masked_image: MaskedImage, region_100ha: dict):
+    """In MaskedImage.composite(), test when an S2 component image is unmasked, but has masked CLOUD_DIST band due to
+    missing cloud data, the composite image is fully masked with 'q-mosaic' method.
+    """
+    # form the composite collection and image
+    image_list = [s2_sr_hm_nocp_masked_image]
+    gd_collection = MaskedCollection.from_list(image_list)
+
+    kwargs = dict(method='q-mosaic', mask_method='cloud-prob', mask=False)
+    ee_collection = gd_collection._prepare_for_composite(**kwargs)
+    composite_im = gd_collection.composite(**kwargs)
+    properties = gd_collection._get_properties(ee_collection)
+    proj = get_projection(ee_collection.first(), min_scale=True)
+    assert len(properties) == len(image_list)
+
+    # get and test the mask portions for the component and composite images
+    component_portion = _get_masked_portion(ee_collection.first(), proj=proj, region=region_100ha).getInfo()
+    composite_portion = _get_masked_portion(composite_im.ee_image, proj=proj, region=region_100ha).getInfo()
+    assert component_portion == pytest.approx(100, abs=1)
+    assert composite_portion == 0
+
+
+@pytest.mark.parametrize(
+    'image_list, resampling, std_scale',
+    [
+        ('s2_sr_image_list', ResamplingMethod.bilinear, 60),
+        ('s2_sr_image_list', ResamplingMethod.bicubic, 60),
+        ('s2_sr_image_list', ResamplingMethod.average, 120),
+        ('l8_9_image_list', ResamplingMethod.bilinear, 30),
+        ('l8_9_image_list', ResamplingMethod.bicubic, 30),
+        ('l8_9_image_list', ResamplingMethod.average, 120),
+    ],
 )
 def test_composite_resampling(
-    image_list: str, resampling: ResamplingMethod, std_scale: float, region_10000ha: Dict,
-    request: pytest.FixtureRequest
+    image_list: str,
+    resampling: ResamplingMethod,
+    std_scale: float,
+    region_10000ha: Dict,
+    request: pytest.FixtureRequest,
 ):
-    """ Test that resampling smooths the composite image. """
+    """Test that resampling smooths the composite image."""
 
     image_list: List = request.getfixturevalue(image_list)
     gd_collection = MaskedCollection.from_list(image_list)
@@ -497,16 +580,28 @@ def test_composite_landsat_cloud_mask_params(l8_9_image_list, region_10000ha):
 
 
 @pytest.mark.parametrize(
-    'image_list, method, mask, region, date, cloud_kwargs', [
+    'image_list, method, mask, region, date, cloud_kwargs',
+    [
         ('s2_sr_image_list', CompositeMethod.q_mosaic, True, 'region_100ha', None, {}),
         ('s2_sr_image_list', CompositeMethod.mosaic, True, None, '2021-10-01', {}),
         ('s2_sr_image_list', CompositeMethod.medoid, False, None, None, {}),
         (
-            's2_sr_image_list', CompositeMethod.median, True, None, None,
+            's2_sr_image_list',
+            CompositeMethod.median,
+            True,
+            None,
+            None,
             dict(
-                mask_method='qa', mask_cirrus=False, mask_shadows=False, prob=60, dark=0.2, shadow_dist=500, buffer=500,
-                cdi_thresh=None, max_cloud_dist=2000
-            )
+                mask_method='qa',
+                mask_cirrus=False,
+                mask_shadows=False,
+                prob=60,
+                dark=0.2,
+                shadow_dist=500,
+                buffer=500,
+                cdi_thresh=None,
+                max_cloud_dist=2000,
+            ),
         ),
         ('l8_9_image_list', CompositeMethod.q_mosaic, True, 'region_100ha', None, {}),
         ('l8_9_image_list', CompositeMethod.mosaic, False, None, '2022-03-01', {}),
@@ -514,18 +609,22 @@ def test_composite_landsat_cloud_mask_params(l8_9_image_list, region_10000ha):
         ('l8_9_image_list', CompositeMethod.median, True, None, None, {}),
         ('l4_5_image_list', CompositeMethod.q_mosaic, False, 'region_100ha', None, {}),
         (
-            'l4_5_image_list', CompositeMethod.mosaic, True, None, '1988-01-01',
-            dict(mask_cirrus=False, mask_shadows=False)
+            'l4_5_image_list',
+            CompositeMethod.mosaic,
+            True,
+            None,
+            '1988-01-01',
+            dict(mask_cirrus=False, mask_shadows=False),
         ),
         ('l4_5_image_list', CompositeMethod.medoid, True, None, None, {}),
         ('l4_5_image_list', CompositeMethod.median, True, None, None, {}),
         ('gedi_image_list', CompositeMethod.mosaic, True, 'region_100ha', None, {}),
         ('gedi_image_list', CompositeMethod.mosaic, True, None, '2020-09-01', {}),
         ('gedi_image_list', CompositeMethod.medoid, True, None, None, {}),
-    ]
-)  # yapf: disable
+    ],
+)
 def test_composite(image_list, method, mask, region, date, cloud_kwargs, request):
-    """ Test MaskedCollection.composite() runs successfully with a variety of `method` and other parameters. """
+    """Test MaskedCollection.composite() runs successfully with a variety of `method` and other parameters."""
     image_list: List = request.getfixturevalue(image_list)
     region: Dict = request.getfixturevalue(region) if region else None
     gd_collection = MaskedCollection.from_list(image_list)
@@ -535,7 +634,7 @@ def test_composite(image_list, method, mask, region, date, cloud_kwargs, request
 
 
 def test_composite_errors(gedi_image_list, region_100ha):
-    """ Test MaskedCollection.composite() error conditions. """
+    """Test MaskedCollection.composite() error conditions."""
     gedi_collection = MaskedCollection.from_list(gedi_image_list)
     with pytest.raises(ValueError):
         # q-mosaic is only supported on cloud/shadow maskable images
@@ -554,14 +653,14 @@ def test_composite_errors(gedi_image_list, region_100ha):
 
 @pytest.mark.parametrize('image_list', ['s2_sr_image_list', 'l8_9_image_list'])
 def test_composite_date(image_list: str, request: pytest.FixtureRequest):
-    """ Test the composite date is the same as the first input image date. """
+    """Test the composite date is the same as the first input image date."""
 
     image_list: List = request.getfixturevalue(image_list)
     gd_collection = MaskedCollection.from_list(image_list)
     # assumes the image_list's are in date order
     first_date = datetime.utcfromtimestamp(
         gd_collection.ee_collection.first().get('system:time_start').getInfo() / 1000
-    ) # yapf: disable
+    )
     comp_im = gd_collection.composite()
     assert comp_im.date == first_date
 
@@ -585,9 +684,7 @@ def test_composite_mult_kwargs(region_100ha):
     assert cp_prob80 != pytest.approx(cp_prob40, abs=1e-1)
 
 
-@pytest.mark.parametrize(
-    'name', ['FAO/WAPOR/2/L1_RET_E', 'MODIS/006/MCD43A4']
-)
+@pytest.mark.parametrize('name', ['FAO/WAPOR/2/L1_RET_E', 'MODIS/006/MCD43A4'])
 def test_unbounded_search_no_region(name):
     """
     Test searching an unbounded collection without a region raises an exception.
@@ -612,4 +709,3 @@ def test_unknown_collection_error(region_25ha):
         gd_collection = gd_collection.search(start_date, end_date, region_25ha)
         _ = gd_collection.properties
     assert 'not found' in str(ex)
-

From 7fc00ed530ec7b2e5da3ac6b8fa84d3a036b05a9 Mon Sep 17 00:00:00 2001
From: Dugal Harris <dugalh@gmail.com>
Date: Fri, 12 Jul 2024 17:54:48 +0200
Subject: [PATCH 16/16] add to do

---
 geedim/mask.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/geedim/mask.py b/geedim/mask.py
index 18c8cad..20344e7 100644
--- a/geedim/mask.py
+++ b/geedim/mask.py
@@ -388,6 +388,7 @@ def _aux_image(
         ee.Image
             An Earth Engine image containing *_MASK and CLOUD_DIST bands.
         """
+        # TODO: add warning about post Feb 2022 validity when qa method is used
         mask_method = CloudMaskMethod(mask_method)
 
         def get_cloud_prob(ee_im):