diff --git a/NEWS.md b/NEWS.md index 4d32c59..15cc2ae 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,14 @@ +# openalexR 2.0.0 +* Breaking changes in column names in the output of `oa_fetch`: + * `so` is now `source_display_name` + * `so_id` is now `source_id` + * `ab` is now `abstract` + * `url` is now `landing_page_url` + * the nested columns under `authors` no longer have the `au` prefix + +* Deprecated `oa2bibliometrix()`. Use `bibliometrix::convert2df()` +(from the **bibliometrix** R package) instead. + # openalexR 1.4.0 * "topics" are now a valid entity in oa_fetch * The column "topics" replaces concepts in most entities' returned dataframes diff --git a/R/oa2bibliometrix.R b/R/oa2bibliometrix.R index 71d81f4..afa02e0 100644 --- a/R/oa2bibliometrix.R +++ b/R/oa2bibliometrix.R @@ -6,7 +6,7 @@ #' #' @param df is bibliographic collection of works donwloaded from OpenALex. #' @return a data.frame with class "bibliometrix". -#' +#' @details Use \code{bibliometrix::convert2df()} (bibliometrix R package) instead. #' #' @examples #' \dontrun{ @@ -22,13 +22,10 @@ #' # Results have to be sorted by relevance score in a descending order. #' #' query <- oa_query( -#' identifier = NULL, #' entity = "works", #' cites = "W2755950973", -#' from_publication_date = "2021-01-01", -#' to_publication_date = "2021-12-31", -#' search = NULL, -#' endpoint = "https://api.openalex.org" +#' from_publication_date = "2021-10-01", +#' to_publication_date = "2021-12-31" #' ) #' #' res <- oa_request( @@ -45,6 +42,8 @@ #' @export #' oa2bibliometrix <- function(df) { + .Deprecated(msg = "oa2bibliometrix() is deprecated. 
Please use bibliometrix::convert2df() instead.") + df$id_oa <- shorten_oaid(df$id) names(df)[names(df) == "id"] <- "id_url" @@ -57,18 +56,17 @@ oa2bibliometrix <- function(df) { countrycode$Country <- as.character(countrycode$Country) # Authors - AU_info <- lapply(df$author, function(l) { + AU_info <- lapply(df$author, function(l) { if (length(l) == 0 || (length(l) == 1 && is.na(l))){ return(empty_list( c("AU", "RP", "C1", "AU_UN", "AU_CO") )) } else { - l$institution_country_code[is.na(l$institution_country_code)] <- "Not available" - AU <- au_collapse(l$au_display_name) - C1 <- au_collapse(l$au_affiliation_raw) - RP <- au_collapse(l$au_affiliation_raw[1]) - AU_UN <- au_collapse(l$institution_display_name) - AU_CO <- au_collapse(countrycode[l$institution_country_code, 1]) + AU <- au_collapse(l$display_name) + C1 <- au_collapse(l$affiliation_raw) + RP <- au_collapse(l$affiliation_raw[1]) + AU_UN <- au_collapse(lapply(l$affiliations, function(x) x$display_name)) + AU_CO <- au_collapse(countrycode[unlist(lapply(l$affiliations, function(x) x$country_code)), 1]) list(AU = AU, RP = RP, C1 = C1, AU_UN = AU_UN, AU_CO = AU_CO) } }) @@ -91,11 +89,11 @@ oa2bibliometrix <- function(df) { df <- cbind(AU_info, ID, df) df$TI <- toupper(df$display_name) - df$AB <- toupper(df$ab) - df$SO <- toupper(df$so) + df$AB <- toupper(df$abstract) + df$SO <- toupper(df$source_display_name) df$DT <- toupper(df$type) df$DB <- "OPENALEX" - df$JI <- shorten_oaid(df$so_id) + df$JI <- shorten_oaid(df$source_id) df$J9 <- df$JI df$PY <- df$publication_year df$TC <- df$cited_by_count diff --git a/R/oa2df.R b/R/oa2df.R index 86a4d45..12386a4 100644 --- a/R/oa2df.R +++ b/R/oa2df.R @@ -137,8 +137,8 @@ oa2df <- function(data, entity, options = NULL, count_only = FALSE, group_by = N works2df <- function(data, abstract = TRUE, verbose = TRUE, pb = if (verbose) oa_progress(length(data)) else NULL) { col_order <- c( - "id", "title", "display_name", "author", "ab", "publication_date", "relevance_score", - 
"so", "so_id", "host_organization", "issn_l", "url", "pdf_url", + "id", "title", "display_name", "author", "abstract", "publication_date", "relevance_score", + "source_display_name", "source_id", "issn_l", "landing_page_url", "pdf_url", "license", "version", "first_page", "last_page", "volume", "issue", "is_oa", "is_oa_anywhere", "oa_status", "oa_url", "any_repository_has_fulltext", "language", "grants", "cited_by_count", "counts_by_year", @@ -171,21 +171,11 @@ works2df <- function(data, abstract = TRUE, verbose = TRUE, "flat", "ids" ) - venue_cols <- c( - url = "landing_page_url", - pdf_url = "pdf_url", - is_oa = "is_oa", - license = "license", - version = "version" - ) so_cols <- c( - so_id = "id", - so = "display_name", - issn_l = "issn_l", - host_organization = "host_organization_name" + source_id = "id", + source_display_name = "display_name", + issn_l = "issn_l" ) - inst_cols <- c("id", "display_name", "ror", "country_code", "type", "lineage") - empty_inst <- empty_list(inst_cols) n <- length(data) list_df <- vector(mode = "list", length = n) @@ -205,71 +195,33 @@ works2df <- function(data, abstract = TRUE, verbose = TRUE, if (!is.null(sim_fields$publication_date)) { sim_fields$publication_date <- as.Date(sim_fields$publication_date) } - - author <- venue <- ab <- apc <- NULL - - if (!is.null(paper$primary_location)) { - so_info <- paper$primary_location["source"] - so_info <- if (length(so_info[[1]]) == 0) NA else so_info[[1]] - venue_info <- replace_w_na(paper$primary_location[venue_cols]) - venue <- setNames( - c(venue_info, so_info[so_cols]), - c(names(venue_cols), names(so_cols)) - ) - } - - # authorships and affilitation - if (!is.null(paper$authorships)) { - author <- subs_na( - lapply(paper$authorships, function(l) { - l_inst <- l$institutions - inst_idx <- lengths(l_inst) > 0 - if (length(inst_idx) > 0 && any(inst_idx)) { - first_inst <- l_inst[inst_idx][[1]] - first_inst$lineage <- paste(first_inst$lineage, collapse = ", ") - } else { - first_inst 
<- empty_inst - } - first_inst <- prepend(first_inst, "institution") - aff_raw <- list( - au_affiliation_raw = - if (length(l$raw_affiliation_strings)) { - l$raw_affiliation_strings[[1]] - } else { - NA_character_ - } - ) - l_author <- if (length(l$author) > 0) { - prepend(replace_w_na(l$author), "au") - } else { - empty_list(c("au_id", "au_display_name", "au_orcid")) - } - c(l_author, l[c("author_position", "is_corresponding")], aff_raw, first_inst) - }), "rbind_df" - ) - } - - # Abstract - if (!is.null(paper$abstract_inverted_index) && abstract) { - ab <- abstract_build(paper$abstract_inverted_index) - } + author <- process_paper_authors(paper$authorships) + ab <- abstract_build(paper$abstract_inverted_index, abstract) paper_biblio <- replace_w_na(paper$biblio) open_access <- replace_w_na(paper$open_access) if (length(open_access) > 0) { names(open_access)[[1]] <- "is_oa_anywhere" } + so_info <- paper$primary_location + venue <- so_info[names(so_info) != "source"] + source <- so_info$source + if (!is.null(source)){ + source <- setNames(source[so_cols], names(so_cols)) + } + # Process APC + apc <- NULL if (any(lengths(paper[c("apc_list", "apc_paid")]) > 0)) { apc_fields <- list(value = NA, currency = NA, value_usd = NA, provenance = NA) apc <- list(rbind.data.frame( - c(type = "list", modifyList(apc_fields, as.list(paper$apc_list))), - c(type = "paid", modifyList(apc_fields, as.list(paper$apc_paid))) + c(type = "list", utils::modifyList(apc_fields, as.list(paper$apc_list))), + c(type = "paid", utils::modifyList(apc_fields, as.list(paper$apc_paid))) )) } topics <- process_topics(paper, "score") - out_ls <- c(sim_fields, venue, open_access, paper_biblio, - list(author = author, ab = ab, apc = apc), topics) + out_ls <- c(sim_fields, venue, source, open_access, paper_biblio, + list(author = author, abstract = ab, apc = apc), topics) out_ls[sapply(out_ls, is.null)] <- NULL list_df[[i]] <- out_ls } @@ -278,9 +230,16 @@ works2df <- function(data, abstract = TRUE, verbose 
= TRUE, out_df[, intersect(col_order, names(out_df))] } -abstract_build <- function(ab) { - if (is.null(ab)) { - return(NA) +#' Build abstract from inverted index +#' +#' @param ab List. Inverted index of abstract. +#' @param build Logical. If TRUE, build the abstract. +#' +#' @return Character string. The abstract of the paper, or NULL if ab is NULL or build is FALSE. +#' @keywords internal +abstract_build <- function(ab, build = TRUE) { + if (is.null(ab) || !build) { + return(NULL) } w <- rep(names(ab), lengths(ab)) ind <- unlist(ab) @@ -291,6 +250,62 @@ abstract_build <- function(ab) { paste(w[order(ind)], collapse = " ", sep = "") } +#' Process paper authorships +#' +#' @param authorships List. Authorships element of paper. +#' +#' @return List. A list of one dataframe with the processed authors: +#' id, display_name, orcid, author_position, is_corresponding, affiliations, affiliation_raw +#' @keywords internal +process_paper_authors <- function(authorships){ + if (is.null(authorships)) { + return(NULL) + } + authors_ls <- lapply(authorships, function(l) { + l_author <- if (length(l$author)) { + replace_w_na(l$author) + } else { + empty_list(c("id", "display_name", "orcid")) + } + + affiliation_raw <- if (length(l$raw_affiliation_strings)) { + l$raw_affiliation_strings[[1]] + } else { + NA_character_ + } + + affs <- list( + affiliations = process_affil(l$institutions), + affiliation_raw = affiliation_raw + ) + + c(l_author, l[c("author_position", "is_corresponding")], affs) + }) + + list(rbind_oa_ls(authors_ls)) +} + + +#' Process affiliations +#' +#' @param l_institution List. 
Nested elements include +#' id, display_name, ror, country_code, type, lineage +#' +#' @return Dataframe with the following columns: +#' id, display_name, ror, country_code, type, lineage +#' @keywords internal +process_affil <- function(l_institution){ + if (!length(l_institution)){ + return(list(empty_df())) + } + l_inst <- lapply(l_institution, function(x) { + x$lineage <- paste(x$lineage, collapse = ", ") + x + }) + subs_na(l_inst, "rbind_df") +} + + #' Convert OpenAlex collection of authors' records from list format to data frame #' @@ -313,17 +328,11 @@ abstract_build <- function(ab) { #' # University of Naples Federico II is associated to the OpenAlex id I71267560. #' #' -#' query_author <- oa_query( -#' identifier = NULL, +#' res <- oa_fetch( #' entity = "authors", #' last_known_institutions.id = "I71267560", -#' works_count = ">500" -#' ) -#' -#' res <- oa_request( -#' query_url = query_author, -#' count_only = FALSE, -#' verbose = FALSE +#' works_count = ">700", +#' output = "list" #' ) #' #' df <- oa2df(res, entity = "authors") @@ -339,7 +348,6 @@ authors2df <- function(data, verbose = TRUE, inst_cols <- c("id", "display_name", "ror", "country_code", "type", "lineage") empty_inst <- empty_list(inst_cols) - empty_inst$affiliations_other <- list(NULL) author_process <- tibble::tribble( ~type, ~field, @@ -367,6 +375,8 @@ authors2df <- function(data, verbose = TRUE, fields$type, SIMPLIFY = FALSE ) + + # current affiliation sub_affiliation <- item$last_known_institutions if (!is.null(sub_affiliation) && length(sub_affiliation)) { sub_affiliation <- sub_affiliation[[1]] @@ -378,15 +388,16 @@ authors2df <- function(data, verbose = TRUE, } sub_affiliation <- replace_w_na(sub_affiliation) + # all affiliations if (!is.null(item$affiliations)) { - affiliations_other <- sapply(item$affiliations, function(x) x$institution$id) - if (!is.null(sub_affiliation$affiliation_id)) { - affiliations_other <- affiliations_other[affiliations_other != 
sub_affiliation$affiliation_id] - } - sub_affiliation$affiliations_other <- list(affiliations_other) + l_inst <- lapply(item$affiliations, function(x) x$institution) + affs <- list(affiliations = process_affil(l_inst)) + } else { + affs <- NULL } + topics <- process_topics(item, "count") - list_df[[i]] <- c(sim_fields, sub_affiliation, topics) + list_df[[i]] <- c(sim_fields, sub_affiliation, affs, topics) } col_order <- c( @@ -394,8 +405,7 @@ authors2df <- function(data, verbose = TRUE, "ids", "orcid", "works_count", "cited_by_count", "counts_by_year", "affiliation_display_name", "affiliation_id", "affiliation_ror", "affiliation_country_code", "affiliation_type", "affiliation_lineage", - "affiliations_other", - "topics", "works_api_url" + "affiliations", "topics", "works_api_url" ) out_df <- rbind_oa_ls(list_df) @@ -420,16 +430,11 @@ authors2df <- function(data, verbose = TRUE, #' #' # Query to search information about all Italian educational institutions #' -#' query_inst <- oa_query( +#' res <- oa_fetch( #' entity = "institutions", #' country_code = "it", -#' type = "education" -#' ) -#' -#' res <- oa_request( -#' query_url = query_inst, -#' count_only = FALSE, -#' verbose = FALSE +#' type = "education", +#' output = "list" #' ) #' #' oa2df(res, entity = "institutions") @@ -522,15 +527,10 @@ institutions2df <- function(data, verbose = TRUE, #' # Query to search information about all Italian educational institutions #' #' -#' query_inst <- oa_query( +#' res <- oa_fetch( #' entity = "concepts", -#' display_name.search = "electrodynamics" -#' ) -#' -#' res <- oa_request( -#' query_url = query_inst, -#' count_only = FALSE, -#' verbose = FALSE +#' display_name.search = "electrodynamics", +#' output = "list" #' ) #' #' df <- oa2df(res, entity = "concepts") @@ -850,15 +850,10 @@ publishers2df <- function(data, verbose = TRUE, #' # Query to search information about all Italian educational institutions #' #' -#' query_inst <- oa_query( +#' res <- oa_fetch( #' entity = 
"topics", -#' display_name.search = "electrodynamics" -#' ) -#' -#' res <- oa_request( -#' query_url = query_inst, -#' count_only = FALSE, -#' verbose = FALSE +#' display_name.search = "electrodynamics", +#' output = "list" #' ) #' #' df <- oa2df(res, entity = "topics") diff --git a/R/simplify.R b/R/simplify.R index 6f90857..b3ed4c5 100644 --- a/R/simplify.R +++ b/R/simplify.R @@ -115,7 +115,7 @@ get_auth_position <- function(y, position = "first") { if (length(y) == 1 && is.na(y)) { return(NA_character_) } - last <- y[y$author_position == position, "au_display_name"] + last <- y[y$author_position == position, "display_name", drop = TRUE] if (length(last) == 0) { return(NA_character_) } diff --git a/R/utils.R b/R/utils.R index 44f87fb..d5baf61 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,7 +1,7 @@ `%||%` <- function(x, y) if (is.null(x)) y else x -replace_w_na <- function(x, y = NA){ +replace_w_na <- function(x, y = NA) { lapply(x, `%||%`, y = y) } @@ -36,6 +36,13 @@ empty_list <- function(vars) { setNames(as.list(rep(NA, length(vars))), vars) } +empty_df <- function(column_names = c("id", "display_name", "ror", "country_code", "type", "lineage")) { + setNames(data.frame( + lapply(column_names, function(x) character(0)), + stringsAsFactors = FALSE + ), column_names) +} + isValidEmail <- function(x) { grepl("\\<[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,}\\>", as.character(x), ignore.case = TRUE) } @@ -65,7 +72,7 @@ id_type <- function(identifier) { oa_print <- function() { p <- as.integer(Sys.getenv("openalexR.print")) - if (is.na(p)){ + if (is.na(p)) { return(NULL) } p diff --git a/man/abstract_build.Rd b/man/abstract_build.Rd new file mode 100644 index 0000000..c3ea912 --- /dev/null +++ b/man/abstract_build.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/oa2df.R +\name{abstract_build} +\alias{abstract_build} +\title{Build abstract from inverted index} +\usage{ +abstract_build(ab, build = TRUE) +} +\arguments{ 
+\item{ab}{List. Inverted index of abstract.} + +\item{build}{Logical. If TRUE, build the abstract.} +} +\value{ +Character string. The abstract of the paper, or NULL if ab is NULL or build is FALSE. +} +\description{ +Build abstract from inverted index +} +\keyword{internal} diff --git a/man/authors2df.Rd b/man/authors2df.Rd index de43e5a..4aeb5d0 100644 --- a/man/authors2df.Rd +++ b/man/authors2df.Rd @@ -38,17 +38,11 @@ The function converts a list of authors' records obtained using \code{oa_request # University of Naples Federico II is associated to the OpenAlex id I71267560. -query_author <- oa_query( - identifier = NULL, +res <- oa_fetch( entity = "authors", last_known_institutions.id = "I71267560", - works_count = ">500" -) - -res <- oa_request( - query_url = query_author, - count_only = FALSE, - verbose = FALSE + works_count = ">700", + output = "list" ) df <- oa2df(res, entity = "authors") diff --git a/man/concepts2df.Rd b/man/concepts2df.Rd index 7d4999e..81c2a57 100644 --- a/man/concepts2df.Rd +++ b/man/concepts2df.Rd @@ -35,15 +35,10 @@ The function converts a list of concepts' records obtained using \code{oa_reques # Query to search information about all Italian educational institutions -query_inst <- oa_query( +res <- oa_fetch( entity = "concepts", - display_name.search = "electrodynamics" -) - -res <- oa_request( - query_url = query_inst, - count_only = FALSE, - verbose = FALSE + display_name.search = "electrodynamics", + output = "list" ) df <- oa2df(res, entity = "concepts") diff --git a/man/institutions2df.Rd b/man/institutions2df.Rd index dde3696..280c7c7 100644 --- a/man/institutions2df.Rd +++ b/man/institutions2df.Rd @@ -34,16 +34,11 @@ The function converts a list of institutions' records obtained using \code{oa_re # Query to search information about all Italian educational institutions -query_inst <- oa_query( +res <- oa_fetch( entity = "institutions", country_code = "it", - type = "education" -) - -res <- oa_request( - query_url = query_inst, - count_only = FALSE, - verbose = FALSE 
+ type = "education", + output = "list" ) oa2df(res, entity = "institutions") diff --git a/man/oa2bibliometrix.Rd b/man/oa2bibliometrix.Rd index ae1ba93..2a995a5 100644 --- a/man/oa2bibliometrix.Rd +++ b/man/oa2bibliometrix.Rd @@ -17,6 +17,9 @@ It converts bibliographic collections gathered from OpenAlex database \href{http bibliometrix data frame (\href{https://bibliometrix.org/}{https://bibliometrix.org/}) Column names follow https://images.webofknowledge.com/images/help/WOS/hs_wos_fieldtags.html. } +\details{ +Use \code{bibliometrix::convert2df()} (bibliometrix R package) instead. +} \examples{ \dontrun{ @@ -31,13 +34,10 @@ Column names follow https://images.webofknowledge.com/images/help/WOS/hs_wos_fie # Results have to be sorted by relevance score in a descending order. query <- oa_query( - identifier = NULL, entity = "works", cites = "W2755950973", - from_publication_date = "2021-01-01", - to_publication_date = "2021-12-31", - search = NULL, - endpoint = "https://api.openalex.org" + from_publication_date = "2021-10-01", + to_publication_date = "2021-12-31" ) res <- oa_request( diff --git a/man/process_affil.Rd b/man/process_affil.Rd new file mode 100644 index 0000000..55bdedd --- /dev/null +++ b/man/process_affil.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/oa2df.R +\name{process_affil} +\alias{process_affil} +\title{Process affiliations} +\usage{ +process_affil(l_institution) +} +\arguments{ +\item{l_institution}{List. 
Nested elements include +id, display_name, ror, country_code, type, lineage} +} +\value{ +Dataframe with the following columns: +id, display_name, ror, country_code, type, lineage +} +\description{ +Process affiliations +} +\keyword{internal} diff --git a/man/process_paper_authors.Rd b/man/process_paper_authors.Rd new file mode 100644 index 0000000..2ddac0a --- /dev/null +++ b/man/process_paper_authors.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/oa2df.R +\name{process_paper_authors} +\alias{process_paper_authors} +\title{Process paper authorships} +\usage{ +process_paper_authors(authorships) +} +\arguments{ +\item{authorships}{List. Authorships element of paper.} +} +\value{ +List. A list of one dataframe with the processed authors: +id, display_name, orcid, author_position, is_corresponding, affiliations, affiliation_raw +} +\description{ +Process paper authorships +} +\keyword{internal} diff --git a/man/topics2df.Rd b/man/topics2df.Rd index 7694c03..7ad5580 100644 --- a/man/topics2df.Rd +++ b/man/topics2df.Rd @@ -35,15 +35,10 @@ The function converts a list of topics' records obtained using \code{oa_request} # Query to search information about all Italian educational institutions -query_inst <- oa_query( +res <- oa_fetch( entity = "topics", - display_name.search = "electrodynamics" -) - -res <- oa_request( - query_url = query_inst, - count_only = FALSE, - verbose = FALSE + display_name.search = "electrodynamics", + output = "list" ) df <- oa2df(res, entity = "topics") diff --git a/tests/testthat/test-oa2bibliometrix.R b/tests/testthat/test-oa2bibliometrix.R index 2b36896..681a6ed 100644 --- a/tests/testthat/test-oa2bibliometrix.R +++ b/tests/testthat/test-oa2bibliometrix.R @@ -8,7 +8,7 @@ test_that("oa2bibliometrix works", { to_publication_date = "2021-01-31" ) - aria_bibli <- oa2bibliometrix(aria_citations) + expect_warning(aria_bibli <- oa2bibliometrix(aria_citations)) expect_s3_class(aria_bibli, 
"data.frame") expect_equal(nrow(aria_citations), nrow(aria_bibli)) }) diff --git a/tests/testthat/test-oa_fetch.R b/tests/testthat/test-oa_fetch.R index 347ec44..a29db50 100644 --- a/tests/testthat/test-oa_fetch.R +++ b/tests/testthat/test-oa_fetch.R @@ -47,7 +47,7 @@ test_that("oa_fetch works", { paste0("https://openalex.org/", sort(work_ids)) ) - expect_true("au_affiliation_raw" %in% names(multi_works$author[[1]])) + expect_true("affiliation_raw" %in% names(multi_works$author[[1]])) Sys.sleep(1 / 10) # warn about truncated authors @@ -372,7 +372,7 @@ test_that("oa_fetch works with 1 identifier", { expect_s3_class(s, "data.frame") expect_s3_class(co, "data.frame") - expect_equal(dim(w), c(1, 39)) + expect_equal(dim(w), c(1, 38)) expect_equal(dim(a), c(1, 17)) expect_equal(dim(i), c(1, 21)) expect_equal(dim(f), c(1, 17)) diff --git a/tests/testthat/test-oa_snowball.R b/tests/testthat/test-oa_snowball.R index bbb4f79..8991d09 100644 --- a/tests/testthat/test-oa_snowball.R +++ b/tests/testthat/test-oa_snowball.R @@ -68,7 +68,7 @@ test_that("oa_snowball works for author orcids", { ) nodes <- snowball_orcid$nodes - orcids_in <- lapply(nodes$author[nodes$oa_input], function(x) x$au_orcid) + orcids_in <- lapply(nodes$author[nodes$oa_input], function(x) x$orcid) either_orcid <- paste(orcids, collapse = "|") expect_true(is.list(snowball_orcid))