From 664689527e2e840ee560038d63f765b91ffbe425 Mon Sep 17 00:00:00 2001 From: Stephanie Hazlitt Date: Thu, 9 Jan 2025 17:19:40 -0800 Subject: [PATCH 01/10] add facet.limit to pull all facet values --- R/bcdc_search.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/bcdc_search.R b/R/bcdc_search.R index aaee28f2..1d5c79b9 100644 --- a/R/bcdc_search.R +++ b/R/bcdc_search.R @@ -40,7 +40,7 @@ bcdc_search_facets <- function(facet = c("license_id", "download_audience", cli <- bcdc_catalogue_client("action/package_search") - r <- cli$get(query = list(facet.field = query, rows = 0)) + r <- cli$get(query = list(facet.field = query, rows = 0, facet.limit=1000)) r$raise_for_status() res <- jsonlite::fromJSON(r$parse("UTF-8")) From 324662f07d6da72afc30ae6b8250f8bdd7305de3 Mon Sep 17 00:00:00 2001 From: Stephanie Hazlitt Date: Thu, 9 Jan 2025 17:49:00 -0800 Subject: [PATCH 02/10] use group_package_show endpoint --- R/bcdc_search.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/bcdc_search.R b/R/bcdc_search.R index 1d5c79b9..f07ddabe 100644 --- a/R/bcdc_search.R +++ b/R/bcdc_search.R @@ -80,9 +80,9 @@ bcdc_list_groups <- function() bcdc_search_facets("groups") bcdc_list_group_records <- function(group) { if(!has_internet()) stop("No access to internet", call. = FALSE) # nocov - cli <- bcdc_catalogue_client("action/group_show") + cli <- bcdc_catalogue_client("action/group_package_show") - r <- cli$get(query = list(id = group, include_datasets = 'true')) + r <- cli$get(query = list(id = group, limit = 1000)) if (r$status_code == 404){ stop("404: URL not found - you may have specified an invalid group?", call. = FALSE) @@ -93,7 +93,7 @@ bcdc_list_group_records <- function(group) { res <- jsonlite::fromJSON(r$parse("UTF-8")) stopifnot(res$success) - d <- tibble::as_tibble(res$result$packages) + d <- tibble::as_tibble(res$result) as.bcdc_group(d, description = res$result$description) } From a9e6465e330d367463cff581f5323149095f2e0a Mon Sep 17 00:00:00 2001 From: Stephanie Hazlitt Date: Mon, 13 Jan 2025 17:18:28 -0800 Subject: [PATCH 03/10] use package_search endpoint --- R/bcdc_search.R | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/R/bcdc_search.R b/R/bcdc_search.R index f07ddabe..45aeec81 100644 --- a/R/bcdc_search.R +++ b/R/bcdc_search.R @@ -40,7 +40,7 @@ bcdc_search_facets <- function(facet = c("license_id", "download_audience", cli <- bcdc_catalogue_client("action/package_search") - r <- cli$get(query = list(facet.field = query, rows = 0, facet.limit=1000)) + r <- cli$get(query = list(facet.field = query, rows = 0, facet.limit = 1000)) r$raise_for_status() res <- jsonlite::fromJSON(r$parse("UTF-8")) @@ -120,9 +120,14 @@ bcdc_list_organizations <- function() bcdc_search_facets("organization") bcdc_list_organization_records <- function(organization) { if(!has_internet()) stop("No access to internet", call. = FALSE) # nocov - cli <- bcdc_catalogue_client("action/organization_show") + query <- paste0("organization:", organization) - r <- cli$get(query = list(id = organization, include_datasets = 'true')) + cli <- bcdc_catalogue_client("action/package_search") + + r <- cli$get(query = list( + fq = query, # filter query for the organization + rows = 1000 # number of datasets to retrieve (adjust as needed) + )) if (r$status_code == 404){ stop("404: URL not found - you may have specified an invalid organization?", call. = FALSE) @@ -133,7 +138,7 @@ bcdc_list_organization_records <- function(organization) { res <- jsonlite::fromJSON(r$parse("UTF-8")) stopifnot(res$success) - d <- tibble::as_tibble(res$result$packages) + d <- tibble::as_tibble(res$result$results) as.bcdc_organization(d, description = res$result$description) } From 26985cb890aadb9282b05896d870a1246c35e229 Mon Sep 17 00:00:00 2001 From: Stephanie Hazlitt Date: Sat, 18 Jan 2025 21:13:01 -0800 Subject: [PATCH 04/10] set limits using options --- R/bcdc_search.R | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/R/bcdc_search.R b/R/bcdc_search.R index 45aeec81..263ec52d 100644 --- a/R/bcdc_search.R +++ b/R/bcdc_search.R @@ -40,7 +40,9 @@ bcdc_search_facets <- function(facet = c("license_id", "download_audience", cli <- bcdc_catalogue_client("action/package_search") - r <- cli$get(query = list(facet.field = query, rows = 0, facet.limit = 1000)) + option_facet_limit <- getOption("bcdata.max_package_search_facet_limit", 1000) + + r <- cli$get(query = list(facet.field = query, rows = 0, facet.limit = option_facet_limit)) r$raise_for_status() res <- jsonlite::fromJSON(r$parse("UTF-8")) @@ -82,7 +84,9 @@ bcdc_list_group_records <- function(group) { cli <- bcdc_catalogue_client("action/group_package_show") - r <- cli$get(query = list(id = group, limit = 1000)) + option_group_limit <- getOption("bcdata.max_group_package_show_limit", 1000) + + r <- cli$get(query = list(id = group, limit = option_group_limit)) if (r$status_code == 404){ stop("404: URL not found - you may have specified an invalid group?", call. = FALSE) @@ -120,13 +124,13 @@ bcdc_list_organizations <- function() bcdc_search_facets("organization") bcdc_list_organization_records <- function(organization) { if(!has_internet()) stop("No access to internet", call. = FALSE) # nocov - query <- paste0("organization:", organization) + option_package_limit <- getOption("bcdata.max_package_search_limit", 1000) cli <- bcdc_catalogue_client("action/package_search") r <- cli$get(query = list( - fq = query, # filter query for the organization - rows = 1000 # number of datasets to retrieve (adjust as needed) + fq = paste0("organization:", organization), # filter query for the organization + rows = option_package_limit )) if (r$status_code == 404){ From 69c772cfab229e5b133adcd329f31b8418597aaa Mon Sep 17 00:00:00 2001 From: Stephanie Hazlitt Date: Sat, 18 Jan 2025 21:13:23 -0800 Subject: [PATCH 05/10] add new options --- R/bcdc_options.R | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/R/bcdc_options.R b/R/bcdc_options.R index ebaa7b6e..1a456fa9 100644 --- a/R/bcdc_options.R +++ b/R/bcdc_options.R @@ -34,6 +34,21 @@ #' using this option you can set the size of the chunk requested. On slower #' connections, or when having problems, it may help to lower the chunk limit. #' +#' `bcdata.max_package_search_limit` is an option for setting the maximum number of +#' datasets returned when querying by organization with the package_search API endpoint. The +#' default limit (1000) is purposely set high to return all datasets for a +#' given organization. +#' +#' `bcdata.max_package_search_facet_limit` is an option for setting the maximum number of +#' values returned when querying facet fields with the package_search API endpoint. The +#' default limit (1000) is purposely set high to return all values for each facet field +#' ("license_id", "download_audience", "res_format", "publish_state", "organization", "groups"). +#' +#' `bcdata.max_group_package_show_limit` is an option for setting the maximum number of +#' datasets returned when querying by group with the group_package_show API endpoint. The +#' default limit (1000) is purposely set high to return all datasets for a +#' given group. +#' #' `bcdata.single_download_limit` *Deprecated*. This is the maximum number of #' records an object can be before forcing a paginated download; it is set by #' querying the server capabilities. This option is deprecated and will be @@ -81,7 +96,10 @@ bcdc_options <- function() { "bcdata.max_geom_pred_size", null_to_na(getOption("bcdata.max_geom_pred_size")), 5E5, "bcdata.chunk_limit", null_to_na(getOption("bcdata.chunk_limit")), server_single_download_limit, "bcdata.single_download_limit", - null_to_na(deprecate_single_download_limit_option()), server_single_download_limit + null_to_na(deprecate_single_download_limit_option()), server_single_download_limit, + "bcdata.max_package_search_limit", null_to_na(getOption("bcdata.max_package_search_limit")), 1000, + "bcdata.max_package_search_facet_limit", null_to_na(getOption("bcdata.max_package_search_facet_limit")), 1000, + "bcdata.max_group_package_show_limit", null_to_na(getOption("bcdata.max_group_package_show_limit")), 1000 ) } From fe2c1eac7b528eed1a7a1d457566dc6f423f7735 Mon Sep 17 00:00:00 2001 From: Stephanie Hazlitt Date: Sat, 18 Jan 2025 21:13:51 -0800 Subject: [PATCH 06/10] add tests --- tests/testthat/test-search.R | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/testthat/test-search.R b/tests/testthat/test-search.R index 9f9ee3c4..41b493a7 100644 --- a/tests/testthat/test-search.R +++ b/tests/testthat/test-search.R @@ -37,3 +37,33 @@ test_that("bcdc_search works with zero results", { expect_output(print(res), "returned no results") }) + +test_that('bcdc_search_facets works', { + skip_on_cran() + skip_if_net_down() + orgs <- bcdc_search_facets("organization") + grps <- bcdc_search_facets("groups") + licences <- bcdc_search_facets("license_id") + expect_s3_class(orgs, "data.frame") + expect_s3_class(grps, "data.frame") + expect_s3_class(licences, "data.frame") + expect_gt(nrow(orgs), 160) + expect_gt(nrow(grps), 20) + expect_gte(nrow(licences), 15) +}) + +test_that('bcdc_list_group_records works', { + skip_on_cran() + skip_if_net_down() + census <- bcdc_list_group_records("census-profiles") + expect_s3_class(census, "data.frame") + expect_gte(nrow(census), 30) +}) + +test_that('bcdc_list_organization_records works', { + skip_on_cran() + skip_if_net_down() + bcstats <- bcdc_list_organization_records("bc-stats") + expect_s3_class(bcstats, "data.frame") + expect_gt(nrow(bcstats), 70) +}) From 9f6f597548c5d97f78c5200d009ba03d5aa62e50 Mon Sep 17 00:00:00 2001 From: Stephanie Hazlitt Date: Sat, 18 Jan 2025 21:14:00 -0800 Subject: [PATCH 07/10] document --- man/bcdc_options.Rd | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/man/bcdc_options.Rd b/man/bcdc_options.Rd index 2ef8aef1..85156e32 100644 --- a/man/bcdc_options.Rd +++ b/man/bcdc_options.Rd @@ -29,6 +29,21 @@ downloaded. This is called "pagination". bcdata does this all for you, however b using this option you can set the size of the chunk requested. On slower connections, or when having problems, it may help to lower the chunk limit. +\code{bcdata.max_package_search_limit} is an option for setting the maximum number of +datasets returned when querying by organization with the package_search API endpoint. The +default limit (1000) is purposely set high to return all datasets for a +given organization. + +\code{bcdata.max_package_search_facet_limit} is an option for setting the maximum number of +values returned when querying facet fields with the package_search API endpoint. The +default limit (1000) is purposely set high to return all values for each facet field +("license_id", "download_audience", "res_format", "publish_state", "organization", "groups"). + +\code{bcdata.max_group_package_show_limit} is an option for setting the maximum number of +datasets returned when querying by group with the group_package_show API endpoint. The +default limit (1000) is purposely set high to return all datasets for a +given group. + \code{bcdata.single_download_limit} \emph{Deprecated}. This is the maximum number of records an object can be before forcing a paginated download; it is set by querying the server capabilities. This option is deprecated and will be From 7093193c59534559bda4047b54949aa4b9a6ec90 Mon Sep 17 00:00:00 2001 From: Stephanie Hazlitt Date: Sat, 18 Jan 2025 21:14:09 -0800 Subject: [PATCH 08/10] update news --- NEWS.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/NEWS.md b/NEWS.md index a9dfad30..7b8e4f7f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,8 @@ # bcdata (development version) +* Fix bugs where `bcdc_search_facets()`, `bcdc_list_group_records()` and + `bcdc_list_organization_records()` were not returning all records after + the upgrade to CKAN 2.9 (#353). Includes adding tests and bcdata-specific + options for these queries, see `bcdc_options()`. # bcdata 0.5.0 From 24caceec8c7be68b4e6dc13836d0d4485a2e859c Mon Sep 17 00:00:00 2001 From: Stephanie Hazlitt Date: Mon, 20 Jan 2025 20:02:54 -0800 Subject: [PATCH 09/10] improve tests --- tests/testthat/test-search.R | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/testthat/test-search.R b/tests/testthat/test-search.R index 41b493a7..86191e4e 100644 --- a/tests/testthat/test-search.R +++ b/tests/testthat/test-search.R @@ -47,23 +47,29 @@ test_that('bcdc_search_facets works', { expect_s3_class(orgs, "data.frame") expect_s3_class(grps, "data.frame") expect_s3_class(licences, "data.frame") - expect_gt(nrow(orgs), 160) - expect_gt(nrow(grps), 20) - expect_gte(nrow(licences), 15) + expect_gte(nrow(orgs), 1) + expect_gte(nrow(grps), 1) + expect_gte(nrow(licences), 1) }) test_that('bcdc_list_group_records works', { skip_on_cran() skip_if_net_down() census <- bcdc_list_group_records("census-profiles") + grps <- bcdc_search_facets("groups") + census_count <- grps |> dplyr::filter(name == "census-profiles") |> dplyr::pull(count) expect_s3_class(census, "data.frame") - expect_gte(nrow(census), 30) + expect_gte(nrow(census), 1) + expect_equal(nrow(census), census_count) }) test_that('bcdc_list_organization_records works', { skip_on_cran() skip_if_net_down() bcstats <- bcdc_list_organization_records("bc-stats") + orgs <- bcdc_search_facets("organization") + bcstats_count <- orgs |> dplyr::filter(name == "bc-stats") |> dplyr::pull(count) expect_s3_class(bcstats, "data.frame") - expect_gt(nrow(bcstats), 70) + expect_gte(nrow(bcstats), 1) + expect_equal(nrow(bcstats), bcstats_count) }) From f0165f5e90d1d830b0867301d1a239cabd33b0fb Mon Sep 17 00:00:00 2001 From: Stephanie Hazlitt Date: Mon, 20 Jan 2025 20:03:07 -0800 Subject: [PATCH 10/10] add tests for new options --- tests/testthat/test-options.R | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-options.R b/tests/testthat/test-options.R index a4982bfd..72007c1b 100644 --- a/tests/testthat/test-options.R +++ b/tests/testthat/test-options.R @@ -17,7 +17,6 @@ test_that("bcdc_options() returns a tibble",{ expect_s3_class(opts, "tbl_df") }) - test_that("bcdata.chunk_limit",{ skip_if_net_down() skip_on_cran() @@ -30,6 +29,33 @@ test_that("bcdata.chunk_limit",{ }) }) +test_that("bcdata.max_package_search_limit works",{ + skip_if_net_down() + skip_on_cran() + withr::with_options(list(bcdata.max_package_search_limit = 10), { + bcstats <- bcdc_list_organization_records("bc-stats") + expect_lte(nrow(bcstats), 10) + }) +}) + +test_that("bcdata.max_package_search_facet_limit works",{ + skip_if_net_down() + skip_on_cran() + withr::with_options(list(bcdata.max_package_search_facet_limit = 10), { + orgs <- bcdc_search_facets("organization") + expect_lte(nrow(orgs), 10) + }) +}) + +test_that("bcdata.max_group_package_show_limit works",{ + skip_if_net_down() + skip_on_cran() + withr::with_options(list(bcdata.max_group_package_show_limit = 10), { + census <- bcdc_list_group_records("census-profiles") + expect_lte(nrow(census), 10) + }) +}) + test_that("bcdata.single_download_limit is deprecated but works", { # This can be removed when bcdata.single_download_limit is removed skip_if_net_down() @@ -37,7 +63,7 @@ test_that("bcdata.single_download_limit is deprecated but works", { withr::local_options(list(bcdata.single_download_limit = 1)) withr::local_envvar(list(BCDC_KEY = NULL)) # so snapshot not affected by message expect_s3_class( - bcdc_query_geodata(record = '76b1b7a3-2112-4444-857a-afccf7b20da8'), + bcdc_query_geodata(record = '76b1b7a3-2112-4444-857a-afccf7b20da8'), "bcdc_promise" ) })