Skip to content

Commit

Permalink
[c++,python] Fix default reads on unwritten dense arrays (#3136)
Browse files Browse the repository at this point in the history
* Found issue while working on spatial's `MultiscaleImage` and erroring
out when applying `read_spatial_region` to a level that had not been
written to yet
* Add new `non_empty_domain_slot_opt` alongside `non_empty_domain_slot`
that returns `std::nullopt` if it is empty rather than (0, 0)
* Fix logic in C++ `ManagedQuery::setup_read` to check if the array has
been written to yet. If it hasn't, set the subarray to use soma_dim_0's
full domain. Otherwise, use the non-empty domain as before
* Also fix logic in Python `DenseNDArray.read` to use the new
`non_empty_domain_slot_opt` and correctly set the `data_shape` to the
full domain if the array has not been written to yet
* Add unit test in pytest
* Update all tests to reflect fix
  • Loading branch information
nguyenv authored Oct 7, 2024
1 parent 7018462 commit 39e9ab3
Show file tree
Hide file tree
Showing 6 changed files with 150 additions and 25 deletions.
16 changes: 12 additions & 4 deletions apis/python/src/tiledbsoma/_dense_nd_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,10 +189,18 @@ def read(
# all, in which case the best we can do is use the schema shape.
handle: clib.SOMADenseNDArray = self._handle._handle

data_shape = handle.shape
ned = self.non_empty_domain()
if ned is not None:
data_shape = tuple(slot[1] + 1 for slot in ned)
ned = []
for dim_name in handle.dimension_names:
dtype = np.dtype(self.schema.field(dim_name).type.to_pandas_dtype())
slot = handle.non_empty_domain_slot_opt(dim_name, dtype)
if slot is None:
use_shape = True
break
ned.append(slot[1] + 1)
else:
use_shape = False

data_shape = tuple(handle.shape if use_shape else ned)
target_shape = dense_indices_to_shape(coords, data_shape, result_order)

context = handle.context()
Expand Down
56 changes: 56 additions & 0 deletions apis/python/src/tiledbsoma/soma_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -780,6 +780,62 @@ void load_soma_array(py::module& m) {
}
})

.def(
"non_empty_domain_slot_opt",
[](SOMAArray& array, std::string name, py::dtype dtype) {
switch (np_to_tdb_dtype(dtype)) {
case TILEDB_UINT64:
return py::cast(
array.non_empty_domain_slot_opt<uint64_t>(name));
case TILEDB_DATETIME_YEAR:
case TILEDB_DATETIME_MONTH:
case TILEDB_DATETIME_WEEK:
case TILEDB_DATETIME_DAY:
case TILEDB_DATETIME_HR:
case TILEDB_DATETIME_MIN:
case TILEDB_DATETIME_SEC:
case TILEDB_DATETIME_MS:
case TILEDB_DATETIME_US:
case TILEDB_DATETIME_NS:
case TILEDB_DATETIME_PS:
case TILEDB_DATETIME_FS:
case TILEDB_DATETIME_AS:
case TILEDB_INT64:
return py::cast(
array.non_empty_domain_slot_opt<int64_t>(name));
case TILEDB_UINT32:
return py::cast(
array.non_empty_domain_slot_opt<uint32_t>(name));
case TILEDB_INT32:
return py::cast(
array.non_empty_domain_slot_opt<int32_t>(name));
case TILEDB_UINT16:
return py::cast(
array.non_empty_domain_slot_opt<uint16_t>(name));
case TILEDB_INT16:
return py::cast(
array.non_empty_domain_slot_opt<int16_t>(name));
case TILEDB_UINT8:
return py::cast(
array.non_empty_domain_slot_opt<uint8_t>(name));
case TILEDB_INT8:
return py::cast(
array.non_empty_domain_slot_opt<int8_t>(name));
case TILEDB_FLOAT64:
return py::cast(
array.non_empty_domain_slot_opt<double>(name));
case TILEDB_FLOAT32:
return py::cast(
array.non_empty_domain_slot_opt<float>(name));
case TILEDB_STRING_UTF8:
case TILEDB_STRING_ASCII:
return py::cast(array.non_empty_domain_slot_var(name));
default:
throw TileDBSOMAError(
"Unsupported dtype for nonempty domain.");
}
})

.def(
"soma_domain_slot",
[](SOMAArray& array, std::string name, py::dtype dtype) {
Expand Down
8 changes: 2 additions & 6 deletions apis/python/tests/test_basic_anndata_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,12 +176,8 @@ def test_import_anndata(conftest_pbmc_small, ingest_modes, X_kind):
assert X.used_shape() == tuple([(0, e - 1) for e in orig.shape])
else:
assert X.metadata.get(metakey) == "SOMADenseNDArray"
if have_ingested:
matrix = X.read(coords=all2d)
assert matrix.size == orig.X.size
else:
with pytest.raises(ValueError):
X.read(coords=all2d)
matrix = X.read(coords=all2d)
assert matrix.size == orig.X.size

# Check raw/X/data (sparse)
assert exp.ms["raw"].X["data"].metadata.get(metakey) == "SOMASparseNDArray"
Expand Down
15 changes: 15 additions & 0 deletions apis/python/tests/test_dense_nd_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,3 +491,18 @@ def test_fixed_timestamp(tmp_path: pathlib.Path):

with pytest.raises(soma.SOMAError):
soma.open(tmp_path.as_posix(), context=fixed_time, tiledb_timestamp=111)


@pytest.mark.parametrize("shape", [(10,), (10, 20), (10, 20, 2), (2, 4, 6, 8)])
def test_read_to_unwritten_array(tmp_path, shape):
uri = tmp_path.as_posix()

soma.DenseNDArray.create(uri, type=pa.uint8(), shape=shape)

with tiledb.open(uri, "r") as A:
expected = A[:]["soma_data"]

with soma.DenseNDArray.open(uri, "r") as A:
actual = A.read().to_numpy()

assert np.array_equal(expected, actual)
42 changes: 28 additions & 14 deletions libtiledbsoma/src/soma/managed_query.cc
Original file line number Diff line number Diff line change
Expand Up @@ -100,22 +100,36 @@ void ManagedQuery::setup_read() {
return;
}

auto schema = array_->schema();

// If the query is uninitialized, set the subarray for the query
if (status == Query::Status::UNINITIALIZED) {
// Dense array must have a subarray set. If the array is dense and no
// ranges have been set, add a range for the array's entire non-empty
// domain on dimension 0.
if (array_->schema().array_type() == TILEDB_DENSE &&
!subarray_range_set_) {
auto non_empty_domain = array_->non_empty_domain<int64_t>(0);
subarray_->add_range(
0, non_empty_domain.first, non_empty_domain.second);
// domain on dimension 0. In the case that the non-empty domain does not
// exist (when the array has not been written to yet), use dimension 0's
// full domain
if (schema.array_type() == TILEDB_DENSE && !subarray_range_set_) {
// Check if the array has been written to by using the C API as
// there is no way to to check for an empty domain using the current
// CPP API
int32_t is_empty;
int64_t ned[2];
ctx_->handle_error(tiledb_array_get_non_empty_domain_from_index(
ctx_->ptr().get(), array_->ptr().get(), 0, &ned, &is_empty));

std::pair<int64_t, int64_t> array_shape;
if (is_empty == 1) {
array_shape = schema.domain().dimension(0).domain<int64_t>();
} else {
array_shape = std::make_pair(ned[0], ned[1]);
}

subarray_->add_range(0, array_shape.first, array_shape.second);
LOG_DEBUG(fmt::format(
"[ManagedQuery] Add full NED range to dense subarray = (0, {}, "
"{})",
non_empty_domain.first,
non_empty_domain.second));
"[ManagedQuery] Add full range to dense subarray = (0, {}, {})",
array_shape.first,
array_shape.second));
}

// Set the subarray for range slicing
Expand All @@ -125,14 +139,14 @@ void ManagedQuery::setup_read() {
// If no columns were selected, select all columns.
// Add dims and attrs in the same order as specified in the schema
if (columns_.empty()) {
if (array_->schema().array_type() == TILEDB_SPARSE) {
for (const auto& dim : array_->schema().domain().dimensions()) {
if (schema.array_type() == TILEDB_SPARSE) {
for (const auto& dim : schema.domain().dimensions()) {
columns_.push_back(dim.name());
}
}
int attribute_num = array_->schema().attribute_num();
int attribute_num = schema.attribute_num();
for (int i = 0; i < attribute_num; i++) {
columns_.push_back(array_->schema().attribute(i).name());
columns_.push_back(schema.attribute(i).name());
}
}

Expand Down
38 changes: 37 additions & 1 deletion libtiledbsoma/src/soma/soma_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -722,7 +722,8 @@ class SOMAArray : public SOMAObject {

/**
* Retrieves the non-empty domain from the array. This is the union of the
* non-empty domains of the array fragments.
* non-empty domains of the array fragments. Returns (0, 0) for empty
* domains.
*/
template <typename T>
std::pair<T, T> non_empty_domain_slot(const std::string& name) const {
Expand All @@ -733,6 +734,41 @@ class SOMAArray : public SOMAObject {
}
}

/**
* Retrieves the non-empty domain from the array. This is the union of the
* non-empty domains of the array fragments. Return std::nullopt for empty
* domains.
*/
template <typename T>
std::optional<std::pair<T, T>> non_empty_domain_slot_opt(
const std::string& name) const {
try {
int32_t is_empty;
T ned[2];

// TODO currently we need to use the TileDB C API in order to check
// if the domain is empty or not. The C++ API returns (0, 0)
// currently which could also represent a single point at coordinate
// 0. Replace this when the C++ API supports correct checking for
// empty domains
ctx_->tiledb_ctx()->handle_error(
tiledb_array_get_non_empty_domain_from_name(
ctx_->tiledb_ctx()->ptr().get(),
arr_->ptr().get(),
name.c_str(),
&ned,
&is_empty));

if (is_empty == 1) {
return std::nullopt;
} else {
return std::make_pair(ned[0], ned[1]);
}
} catch (const std::exception& e) {
throw TileDBSOMAError(e.what());
}
}

/**
* Retrieves the non-empty domain from the array on the given dimension.
* This is the union of the non-empty domains of the array fragments.
Expand Down

0 comments on commit 39e9ab3

Please sign in to comment.