Merging main branch into staging branch

chanzuckerberg · Aug 15, 2024 · 0c386de · 0c386de
2 parents 804d469 + 05066e6
commit 0c386de
Show file tree

Hide file tree

Showing 3 changed files with 82 additions and 19 deletions.
diff --git a/backend/layers/processing/process_seurat.py b/backend/layers/processing/process_seurat.py
@@ -13,6 +13,7 @@
 )
 from backend.layers.processing.logger import logit
 from backend.layers.processing.process_logic import ProcessingLogic
+from backend.layers.processing.utils.matrix_utils import enforce_canonical_format
 from backend.layers.processing.utils.rds_citation_from_h5ad import rds_citation_from_h5ad
 from backend.layers.thirdparty.s3_provider import S3ProviderInterface
 from backend.layers.thirdparty.uri_provider import UriProviderInterface
@@ -74,6 +75,14 @@ def process(self, dataset_version_id: DatasetVersionId, artifact_bucket: str, da
         adata = anndata.read_h5ad(labeled_h5ad_filename)
         if "citation" in adata.uns:
             adata.uns["citation"] = rds_citation_from_h5ad(adata.uns["citation"])
+
+        # Enforce canonical format on X (and raw.X, if present) before the h5ad is written back out
+        logger.info("enforce canonical format in X")
+        enforce_canonical_format(adata)
+        if adata.raw:
+            logger.info("enforce canonical format in raw.X")
+            enforce_canonical_format(adata.raw)
+
         adata.write_h5ad(labeled_h5ad_filename)
 
         # Use Seurat to convert to RDS

diff --git a/backend/layers/processing/utils/matrix_utils.py b/backend/layers/processing/utils/matrix_utils.py
@@ -2,6 +2,8 @@
 
 import numpy as np
 
+logger: logging.Logger = logging.getLogger("matrix_utils")
+
 
 def is_matrix_sparse(matrix: np.ndarray, sparse_threshold):
     """
@@ -57,3 +59,14 @@ def is_matrix_sparse(matrix: np.ndarray, sparse_threshold):
 
     is_sparse = (100.0 * number_of_non_zero_elements / total_number_of_matrix_elements) < sparse_threshold
     return is_sparse
+
+
+def enforce_canonical_format(adata):
+    """
+    Enforce canonical format on the X matrix of an AnnData (or AnnData.raw) object, if it is not already
+    canonical. X is modified in place via sum_duplicates(); an X without has_canonical_format is left as is.
+    """
+    X = adata.X
+    if hasattr(X, "has_canonical_format") and not X.has_canonical_format:
+        logger.warning("noncanonical data found in X; converting to canonical format using sum_duplicates.")
+        X.sum_duplicates()
diff --git a/tests/unit/processing/test_matrix_utils.py b/tests/unit/processing/test_matrix_utils.py
@@ -1,48 +1,89 @@
-import unittest
+import logging
+from unittest.mock import Mock
 
 import numpy as np
+import pytest
+from anndata import AnnData
+from scipy.sparse import coo_matrix
 
-from backend.layers.processing.utils.matrix_utils import is_matrix_sparse
+from backend.layers.processing.utils.matrix_utils import enforce_canonical_format, is_matrix_sparse
 
+LOGGER = logging.getLogger("matrix_utils")
+LOGGER.propagate = True
 
-class TestMatrixUtils(unittest.TestCase):
+
+class TestMatrixUtils:
     def test__is_matrix_sparse__zero_and_one_hundred_percent_threshold(self):
         matrix = np.array([1, 2, 3])
 
-        self.assertFalse(is_matrix_sparse(matrix, 0))
-        self.assertTrue(is_matrix_sparse(matrix, 100))
+        assert not is_matrix_sparse(matrix, 0)
+        assert is_matrix_sparse(matrix, 100)
 
     def test__is_matrix_sparse__partially_populated_sparse_matrix_returns_true(self):
         matrix = np.zeros([3, 4])
         matrix[2][3] = 1.0
         matrix[1][1] = 2.2
 
-        self.assertTrue(is_matrix_sparse(matrix, 50))
+        assert is_matrix_sparse(matrix, 50)
 
     def test__is_matrix_sparse__partially_populated_dense_matrix_returns_false(self):
         matrix = np.zeros([2, 2])
         matrix[0][0] = 1.0
         matrix[0][1] = 2.2
         matrix[1][1] = 3.7
 
-        self.assertFalse(is_matrix_sparse(matrix, 50))
+        assert not is_matrix_sparse(matrix, 50)
 
-    def test__is_matrix_sparse__giant_matrix_returns_false_early(self):
+    def test__is_matrix_sparse__giant_matrix_returns_false_early(self, caplog):
+        caplog.set_level(logging.INFO)
         matrix = np.ones([20000, 20])
 
-        with self.assertLogs(level="INFO") as logger:
-            self.assertFalse(is_matrix_sparse(matrix, 1))
+        assert not is_matrix_sparse(matrix, 1)
 
-            # Because the function returns early a log will output the _estimate_ instead of the _exact_ percentage of
-            # non-zero elements in the matrix.
-            self.assertIn("Percentage of non-zero elements (estimate)", logger.output[0])
+        # Because the function returns early, the log reports the _estimate_ rather than the _exact_ percentage
+        # of non-zero elements in the matrix.
+        assert "Percentage of non-zero elements (estimate)" in caplog.text
 
-    def test__is_matrix_sparse_with_column_shift_encoding__giant_matrix_returns_false_early(self):
+    def test__is_matrix_sparse_with_column_shift_encoding__giant_matrix_returns_false_early(self, caplog):
+        caplog.set_level(logging.INFO)
         matrix = np.random.rand(20000, 20)
 
-        with self.assertLogs(level="INFO") as logger:
-            self.assertFalse(is_matrix_sparse(matrix, 1))
+        assert not is_matrix_sparse(matrix, 1)
+
+        # Because the function returns early, the log reports the _estimate_ rather than the _exact_ percentage
+        # of non-zero elements in the matrix.
+        assert "Percentage of non-zero elements (estimate)" in caplog.text
+
+
+@pytest.fixture
+def noncanonical_matrix():
+    array = np.array([[1, 0, 1], [3, 2, 3], [4, 5, 4]])
+    return coo_matrix((array[0], (array[1], array[2])))
+
+
+@pytest.fixture
+def canonical_adata():
+    return Mock(X=Mock(has_canonical_format=True))
+
+
+class TestEnforceCanonical:
+    def test_adata_with_noncanonical_X_and_raw_X(self, noncanonical_matrix, caplog):
+        assert noncanonical_matrix.has_canonical_format is False
+        adata = AnnData(noncanonical_matrix)
+        enforce_canonical_format(adata)
+        assert adata.X.has_canonical_format is True
+        assert "noncanonical data found in X; converting to canonical format using sum_duplicates." in caplog.text
+
+    def test_adata_with_noncanonical_raw_X(self, noncanonical_matrix, caplog):
+        caplog.set_level(logging.WARNING)
+        assert noncanonical_matrix.has_canonical_format is False
+        adata = AnnData(raw=AnnData(noncanonical_matrix))
+        enforce_canonical_format(adata.raw)
+        assert adata.raw.X.has_canonical_format is True
+        assert "noncanonical data found in X; converting to canonical format using sum_duplicates." in caplog.text
 
-            # Because the function returns early a log will output the _estimate_ instead of the _exact_ percentage of
-            # non-zero elements in the matrix.
-            self.assertIn("Percentage of non-zero elements (estimate)", logger.output[0])
+    def test_adata_with_canonical_X(self, canonical_adata, caplog):
+        caplog.set_level(logging.WARNING)
+        enforce_canonical_format(canonical_adata)
+        assert canonical_adata.X.has_canonical_format is True
+        assert "noncanonical data found in X; converting to canonical format using sum_duplicates." not in caplog.text