Skip to content

Commit

Permalink
[r] Push schema accessor down to libtiledbsoma (#3079)
Browse files Browse the repository at this point in the history
* [r] Push `schema` accessor down to `libtiledbsoma`

* debug [skip ci]

* unit-testing

* close some dangling-open unit-test handles

* fix unit-test failures

* code-review feedback

Co-authored-by: Paul Hoffman <[email protected]>

* DESCRIPTION and NEWS.md [skip ci]

* one more spot

* rebase

---------

Co-authored-by: Paul Hoffman <[email protected]>
  • Loading branch information
johnkerl and mojaveazure authored Oct 3, 2024
1 parent 3bc119e commit c50cb98
Show file tree
Hide file tree
Showing 9 changed files with 98 additions and 19 deletions.
2 changes: 1 addition & 1 deletion apis/r/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Description: Interface for working with 'TileDB'-based Stack of Matrices,
like those commonly used for single cell data analysis. It is documented at
<https://github.com/single-cell-data>; a formal specification available is at
<https://github.com/single-cell-data/SOMA/blob/main/abstract_specification.md>.
Version: 1.14.99.4
Version: 1.14.99.5
Authors@R: c(
person(given = "Aaron", family = "Wolen",
role = c("cre", "aut"), email = "[email protected]",
Expand Down
1 change: 1 addition & 0 deletions apis/r/NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

## Changes

* Push `schema` accessor down to `libtiledbsoma` [#3079](https://github.com/single-cell-data/TileDB-SOMA/pull/3079)
* Push `attrnames` down to C++ [#3121](https://github.com/single-cell-data/TileDB-SOMA/pull/3121)
* Use `libtiledbsoma` for R schema evolution [#3100](https://github.com/single-cell-data/TileDB-SOMA/pull/3100)
* Implement missing `domain` argument to `SOMADataFrame` `create` [#3032](https://github.com/single-cell-data/TileDB-SOMA/pull/3032)
Expand Down
4 changes: 4 additions & 0 deletions apis/r/R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,10 @@ c_attrnames <- function(uri, ctxxp) {
.Call(`_tiledbsoma_c_attrnames`, uri, ctxxp)
}

c_schema <- function(uri, ctxxp) {
.Call(`_tiledbsoma_c_schema`, uri, ctxxp)
}

resize <- function(uri, new_shape, ctxxp) {
invisible(.Call(`_tiledbsoma_resize`, uri, new_shape, ctxxp))
}
Expand Down
5 changes: 5 additions & 0 deletions apis/r/R/SOMADataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,11 @@ SOMADataFrame <- R6::R6Class(
is.data.frame(values) || is_arrow_table(values) || is_arrow_record_batch(values)
)

# Leave state unmodified
# TODO: this issue will automatically go away on https://github.com/single-cell-data/TileDB-SOMA/issues/3059
omode <- self$mode()
on.exit(self$reopen(mode = omode))

if (is.data.frame(values)) {
if (!is.null(row_index_name)) {
stopifnot(
Expand Down
2 changes: 1 addition & 1 deletion apis/r/R/SOMADenseNDArray.R
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ SOMADenseNDArray <- R6::R6Class(
"'coords' must be a list of integer vectors" =
is.list(coords) && all(vapply_lgl(coords, is.integer)),
"length of 'coords' must match number of dimensions" =
length(coords) == length(self$dimensions())
length(coords) == self$ndim()
)

## the 'soma_data' data type may not have been cached, and if so we need to fetch it
Expand Down
3 changes: 2 additions & 1 deletion apis/r/R/TileDBArray.R
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,8 @@ TileDBArray <- R6::R6Class(
#' @description Retrieve the array schema as an Arrow schema (lifecycle: maturing)
#' @return A [`arrow::schema`] object
schema = function() {
arrow_schema_from_tiledb_schema(tiledb::schema(self$object))
arrow::as_schema(
c_schema(self$uri, private$.soma_context));
},

#' @description Retrieve the array schema as TileDB schema (lifecycle: maturing)
Expand Down
13 changes: 13 additions & 0 deletions apis/r/src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,18 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// c_schema
SEXP c_schema(const std::string& uri, Rcpp::XPtr<somactx_wrap_t> ctxxp);
RcppExport SEXP _tiledbsoma_c_schema(SEXP uriSEXP, SEXP ctxxpSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< const std::string& >::type uri(uriSEXP);
Rcpp::traits::input_parameter< Rcpp::XPtr<somactx_wrap_t> >::type ctxxp(ctxxpSEXP);
rcpp_result_gen = Rcpp::wrap(c_schema(uri, ctxxp));
return rcpp_result_gen;
END_RCPP
}
// resize
void resize(const std::string& uri, Rcpp::NumericVector new_shape, Rcpp::XPtr<somactx_wrap_t> ctxxp);
RcppExport SEXP _tiledbsoma_resize(SEXP uriSEXP, SEXP new_shapeSEXP, SEXP ctxxpSEXP) {
Expand Down Expand Up @@ -747,6 +759,7 @@ static const R_CallMethodDef CallEntries[] = {
{"_tiledbsoma_ndim", (DL_FUNC) &_tiledbsoma_ndim, 2},
{"_tiledbsoma_c_dimnames", (DL_FUNC) &_tiledbsoma_c_dimnames, 2},
{"_tiledbsoma_c_attrnames", (DL_FUNC) &_tiledbsoma_c_attrnames, 2},
{"_tiledbsoma_c_schema", (DL_FUNC) &_tiledbsoma_c_schema, 2},
{"_tiledbsoma_resize", (DL_FUNC) &_tiledbsoma_resize, 3},
{"_tiledbsoma_resize_soma_joinid", (DL_FUNC) &_tiledbsoma_resize_soma_joinid, 3},
{"_tiledbsoma_tiledbsoma_upgrade_shape", (DL_FUNC) &_tiledbsoma_tiledbsoma_upgrade_shape, 3},
Expand Down
28 changes: 28 additions & 0 deletions apis/r/src/rinterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,34 @@ Rcpp::CharacterVector c_attrnames(
return retval;
}

// [[Rcpp::export]]
SEXP c_schema(const std::string& uri, Rcpp::XPtr<somactx_wrap_t> ctxxp) {
auto sr = tdbs::SOMAArray::open(OpenMode::read, uri, ctxxp->ctxptr);
std::unique_ptr<ArrowSchema> lib_retval = sr->arrow_schema();
sr->close();

auto schemaxp = nanoarrow_schema_owning_xptr();
auto sch = nanoarrow_output_schema_from_xptr(schemaxp);
exitIfError(
ArrowSchemaInitFromType(sch, NANOARROW_TYPE_STRUCT), "Bad schema init");
exitIfError(ArrowSchemaSetName(sch, ""), "Bad schema name");
exitIfError(
ArrowSchemaAllocateChildren(sch, lib_retval->n_children),
"Bad schema children alloc");

for (size_t i = 0; i < lib_retval->n_children; i++) {
spdl::info(
"[c_schema] Accessing name '{}' format '{}' at position {}",
std::string(lib_retval->children[i]->name),
std::string(lib_retval->children[i]->format),
i);

ArrowSchemaMove(lib_retval->children[i], sch->children[i]);
}

return schemaxp;
}

// [[Rcpp::export]]
void resize(
const std::string& uri,
Expand Down
59 changes: 43 additions & 16 deletions apis/r/tests/testthat/test-SOMADataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -617,33 +617,47 @@ test_that("SOMADataFrame can be updated", {
uri <- withr::local_tempdir("soma-dataframe-update")
if (dir.exists(uri)) unlink(uri, recursive=TRUE)
sdf <- create_and_populate_soma_dataframe(uri, nrows = 10L)
sdf$close();

# Retrieve the table from disk
tbl0 <- SOMADataFrameOpen(uri, "READ")$read()$concat()
sdf <- SOMADataFrameOpen(uri, "READ")
tbl0 <- sdf$read()$concat()
sdf$close()

# Remove a column and update
tbl0$float_column <- NULL
sdf <- SOMADataFrameOpen(uri, "WRITE")$update(tbl0)
sdf <- SOMADataFrameOpen(uri, "WRITE")
sdf$update(tbl0)
sdf$close()

# Verify attribute was removed on disk
tbl1 <- SOMADataFrameOpen(uri, "READ")$read()$concat()
sdf <- SOMADataFrameOpen(uri, "READ")
tbl1 <- sdf$read()$concat()
expect_true(tbl1$Equals(tbl0))
sdf$close()

# # Add a new column and update
tbl0$float_column <- sample(c(TRUE, FALSE), nrow(tbl0), replace = TRUE)
sdf <- SOMADataFrameOpen(uri, mode = "WRITE")$update(tbl0)
sdf <- SOMADataFrameOpen(uri, "WRITE")
sdf$update(tbl0)
sdf$close()

# Verify attribute was added on disk
tbl1 <- SOMADataFrameOpen(uri, mode = "READ")$read()$concat()
sdf <- SOMADataFrameOpen(uri, "READ")
tbl1 <- sdf$read()$concat()
expect_true(tbl1$Equals(tbl0))
sdf$close()

# Add a new enum and update
tbl0$frobo <- factor(sample(letters[1:3], nrow(tbl0), replace = TRUE))
expect_no_condition(sdf <- SOMADataFrameOpen(uri, mode = "WRITE")$update(tbl0))
sdf <- SOMADataFrameOpen(uri, "WRITE")
expect_no_condition(sdf$update(tbl0))
sdf$close()

# Verify enum was added on disk
sdf <- SOMADataFrameOpen(uri, "READ")
expect_s3_class(
tbl1 <- SOMADataFrameOpen(uri, mode = "READ")$read()$concat(),
tbl1 <- sdf$read()$concat(),
"Table"
)
expect_identical(as.data.frame(tbl1), as.data.frame(tbl0))
Expand All @@ -652,6 +666,7 @@ test_that("SOMADataFrame can be updated", {
"factor",
exact = TRUE
)
sdf$close()

# Add a new enum where levels aren't in appearance- or alphabetical-order
tbl0 <- tbl1
Expand All @@ -663,11 +678,14 @@ test_that("SOMADataFrame can be updated", {
levels(tbl0$GetColumnByName("rlvl")$as_vector()),
c("green", "red", "blue")
)
expect_no_condition(sdf <- SOMADataFrameOpen(uri, mode = "WRITE")$update(tbl0))
sdf <- SOMADataFrameOpen(uri, "WRITE")
expect_no_condition(sdf$update(tbl0))
sdf$close()

# Verify unordered enum was added on disk
sdf <- SOMADataFrameOpen(uri, "READ")
expect_s3_class(
tbl1 <- SOMADataFrameOpen(uri, mode = "READ")$read()$concat(),
tbl1 <- sdf$read()$concat(),
"Table"
)
expect_identical(as.data.frame(tbl1), as.data.frame(tbl0))
Expand All @@ -684,26 +702,31 @@ test_that("SOMADataFrame can be updated", {

# Verify queryability
expect_s3_class(
tblq <- SOMADataFrameOpen(uri, mode = "READ")$read(value_filter = 'rlvl == "green"')$concat(),
tblq <- sdf$read(value_filter = 'rlvl == "green"')$concat(),
"Table"
)
expect_length(tblq[["rlvl"]], 3)
expect_s3_class(
tblq <- SOMADataFrameOpen(uri, mode = "READ")$read(value_filter = 'rlvl %in% c("blue", "green")')$concat(),
tblq <- sdf$read(value_filter = 'rlvl %in% c("blue", "green")')$concat(),
"Table"
)
expect_length(tblq[["rlvl"]], 6)
sdf$close()

# Add a new ordered and update
tbl0 <- tbl1
tbl0$ord <- ordered(sample(c("g1", "g2", "g3"), nrow(tbl0), replace = TRUE))
expect_no_condition(sdf <- SOMADataFrameOpen(uri, mode = "WRITE")$update(tbl0))
sdf <- SOMADataFrameOpen(uri, "WRITE")
expect_no_condition(sdf$update(tbl0))
sdf$close()

# Verify ordered was added on disk
sdf <- SOMADataFrameOpen(uri, "READ")
expect_s3_class(
tbl1 <- SOMADataFrameOpen(uri, mode = "READ")$read()$concat(),
tbl1 <- sdf$read()$concat(),
"Table"
)
sdf$close()

# Read ordered enums
expect_identical(as.data.frame(tbl1), as.data.frame(tbl0))
Expand All @@ -715,26 +738,30 @@ test_that("SOMADataFrame can be updated", {

# Error if attempting to drop an array dimension
tbl0$int_column <- NULL # drop the indexed dimension
sdf <- SOMADataFrameOpen(uri, "WRITE")
expect_error(
SOMADataFrameOpen(uri, mode = "WRITE")$update(tbl0),
sdf$update(tbl0),
"The following indexed field does not exist"
)
tbl0 <- tbl1


# Error on incompatible schema updates
tbl0$string_column <- tbl0$string_column$cast(target_type = arrow::int32()) # string to int
expect_error(
SOMADataFrameOpen(uri, mode = "WRITE")$update(tbl0),
sdf$update(tbl0),
"Schemas are incompatible"
)
tbl0 <- tbl1

# Error if the number of rows changes
tbl0 <- tbl0$Slice(offset = 1, length = tbl0$num_rows - 1)
expect_error(
SOMADataFrameOpen(uri, mode = "WRITE")$update(tbl0),
sdf$update(tbl0),
"Number of rows in 'values' must match number of rows in array"
)

sdf$close()
})

test_that("SOMADataFrame can be updated from a data frame", {
Expand Down

0 comments on commit c50cb98

Please sign in to comment.