Skip to content

Commit

Permalink
Merging main branch into staging branch
Browse files Browse the repository at this point in the history
  • Loading branch information
actions-user committed Aug 15, 2024
2 parents 804d469 + 05066e6 commit 0c386de
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 19 deletions.
9 changes: 9 additions & 0 deletions backend/layers/processing/process_seurat.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
)
from backend.layers.processing.logger import logit
from backend.layers.processing.process_logic import ProcessingLogic
from backend.layers.processing.utils.matrix_utils import enforce_canonical_format
from backend.layers.processing.utils.rds_citation_from_h5ad import rds_citation_from_h5ad
from backend.layers.thirdparty.s3_provider import S3ProviderInterface
from backend.layers.thirdparty.uri_provider import UriProviderInterface
Expand Down Expand Up @@ -74,6 +75,14 @@ def process(self, dataset_version_id: DatasetVersionId, artifact_bucket: str, da
adata = anndata.read_h5ad(labeled_h5ad_filename)
if "citation" in adata.uns:
adata.uns["citation"] = rds_citation_from_h5ad(adata.uns["citation"])

# enforce canonical format on X (and raw.X, when present) before rewriting the h5ad
logger.info("enforce canonical format in X")
enforce_canonical_format(adata)
if adata.raw:
logger.info("enforce canonical format in raw.X")
enforce_canonical_format(adata.raw)

adata.write_h5ad(labeled_h5ad_filename)

# Use Seurat to convert to RDS
Expand Down
13 changes: 13 additions & 0 deletions backend/layers/processing/utils/matrix_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import numpy as np

logger: logging.Logger = logging.getLogger("matrix_utils")


def is_matrix_sparse(matrix: np.ndarray, sparse_threshold):
"""
Expand Down Expand Up @@ -57,3 +59,14 @@ def is_matrix_sparse(matrix: np.ndarray, sparse_threshold):

is_sparse = (100.0 * number_of_non_zero_elements / total_number_of_matrix_elements) < sparse_threshold
return is_sparse


def enforce_canonical_format(adata):
    """
    Bring the matrix stored in ``adata.X`` into canonical format, mutating it in place.

    Nothing happens when ``adata.X`` does not expose a ``has_canonical_format`` attribute, or when
    that attribute is already truthy. Otherwise a warning is logged and ``sum_duplicates()`` is
    called on the matrix itself, so no copy is made and nothing is returned.

    :param adata: any object exposing its matrix through an ``X`` attribute
    """
    matrix = adata.X

    # Matrices that do not carry the flag at all are left untouched.
    if not hasattr(matrix, "has_canonical_format"):
        return

    # Already canonical: nothing to fix and nothing to log.
    if matrix.has_canonical_format:
        return

    logger.warning("noncanonical data found in X; converting to canonical format using sum_duplicates.")
    matrix.sum_duplicates()
79 changes: 60 additions & 19 deletions tests/unit/processing/test_matrix_utils.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,89 @@
import unittest
import logging
from unittest.mock import Mock

import numpy as np
import pytest
from anndata import AnnData
from scipy.sparse import coo_matrix

from backend.layers.processing.utils.matrix_utils import is_matrix_sparse
from backend.layers.processing.utils.matrix_utils import enforce_canonical_format, is_matrix_sparse

LOGGER = logging.getLogger("matrix_utils")
LOGGER.propagate = True

class TestMatrixUtils(unittest.TestCase):

class TestMatrixUtils:
def test__is_matrix_sparse__zero_and_one_hundred_percent_threshold(self):
matrix = np.array([1, 2, 3])

self.assertFalse(is_matrix_sparse(matrix, 0))
self.assertTrue(is_matrix_sparse(matrix, 100))
assert not is_matrix_sparse(matrix, 0)
assert is_matrix_sparse(matrix, 100)

def test__is_matrix_sparse__partially_populated_sparse_matrix_returns_true(self):
matrix = np.zeros([3, 4])
matrix[2][3] = 1.0
matrix[1][1] = 2.2

self.assertTrue(is_matrix_sparse(matrix, 50))
assert is_matrix_sparse(matrix, 50)

def test__is_matrix_sparse__partially_populated_dense_matrix_returns_false(self):
matrix = np.zeros([2, 2])
matrix[0][0] = 1.0
matrix[0][1] = 2.2
matrix[1][1] = 3.7

self.assertFalse(is_matrix_sparse(matrix, 50))
assert not is_matrix_sparse(matrix, 50)

def test__is_matrix_sparse__giant_matrix_returns_false_early(self):
def test__is_matrix_sparse__giant_matrix_returns_false_early(self, caplog):
caplog.set_level(logging.INFO)
matrix = np.ones([20000, 20])

with self.assertLogs(level="INFO") as logger:
self.assertFalse(is_matrix_sparse(matrix, 1))
assert not is_matrix_sparse(matrix, 1)

# Because the function returns early a log will output the _estimate_ instead of the _exact_ percentage of
# non-zero elements in the matrix.
self.assertIn("Percentage of non-zero elements (estimate)", logger.output[0])
# Because the function returns early a log will output the _estimate_ instead of the _exact_ percentage of
# non-zero elements in the matrix.
assert "Percentage of non-zero elements (estimate)" in caplog.text

def test__is_matrix_sparse_with_column_shift_encoding__giant_matrix_returns_false_early(self):
def test__is_matrix_sparse_with_column_shift_encoding__giant_matrix_returns_false_early(self, caplog):
caplog.set_level(logging.INFO)
matrix = np.random.rand(20000, 20)

with self.assertLogs(level="INFO") as logger:
self.assertFalse(is_matrix_sparse(matrix, 1))
assert not is_matrix_sparse(matrix, 1)

# Because the function returns early a log will output the _estimate_ instead of the _exact_ percentage of
# non-zero elements in the matrix.
assert "Percentage of non-zero elements (estimate)" in caplog.text


@pytest.fixture
def noncanonical_matrix():
    """
    COO matrix built from (value, row, column) triplets in which coordinate (3, 4) appears twice,
    so the matrix holds a duplicate entry.
    """
    values, row_indices, column_indices = np.array([[1, 0, 1], [3, 2, 3], [4, 5, 4]])
    return coo_matrix((values, (row_indices, column_indices)))


@pytest.fixture
def canonical_adata():
    """Stand-in AnnData whose ``X`` reports ``has_canonical_format`` as True."""
    canonical_x = Mock(has_canonical_format=True)
    return Mock(X=canonical_x)


class TestEnforceCanonical:
def test_adata_with_noncanonical_X_and_raw_X(self, noncanonical_matrix, caplog):
    """
    A noncanonical ``X`` is converted in place and the conversion warning is logged.

    NOTE(review): despite the name, only ``adata.X`` is exercised here; ``raw`` is never set.
    """
    expected_warning = "noncanonical data found in X; converting to canonical format using sum_duplicates."
    assert noncanonical_matrix.has_canonical_format is False

    adata = AnnData(noncanonical_matrix)
    enforce_canonical_format(adata)

    assert adata.X.has_canonical_format is True
    assert expected_warning in caplog.text

def test_adata_with_noncanonical_raw_X(self, noncanonical_matrix, caplog):
    """Passing ``adata.raw`` converts ``raw.X`` in place and logs the conversion warning."""
    caplog.set_level(logging.WARNING)
    expected_warning = "noncanonical data found in X; converting to canonical format using sum_duplicates."
    assert noncanonical_matrix.has_canonical_format is False

    raw_source = AnnData(noncanonical_matrix)
    adata = AnnData(raw=raw_source)
    enforce_canonical_format(adata.raw)

    assert adata.raw.X.has_canonical_format is True
    assert expected_warning in caplog.text

# Because the function returns early a log will output the _estimate_ instead of the _exact_ percentage of
# non-zero elements in the matrix.
self.assertIn("Percentage of non-zero elements (estimate)", logger.output[0])
def test_adata_with_canonical_X(self, canonical_adata, caplog):
    """An ``X`` that already reports canonical format stays canonical and logs no warning."""
    unexpected_warning = "noncanonical data found in X; converting to canonical format using sum_duplicates."
    caplog.set_level(logging.WARNING)

    enforce_canonical_format(canonical_adata)

    assert canonical_adata.X.has_canonical_format is True
    assert unexpected_warning not in caplog.text

0 comments on commit 0c386de

Please sign in to comment.