Refactor remote_to_local() fix #403 (#404)
* Refactor the parsing of Google urls ref #403

* Refactor

* Finalize

* Correct
chainsawriot authored Apr 30, 2024
1 parent c054e80 commit c529994
Showing 4 changed files with 68 additions and 78 deletions.
11 changes: 0 additions & 11 deletions R/convert_google_url.R

This file was deleted.

113 changes: 56 additions & 57 deletions R/remote_to_local.R
@@ -1,69 +1,68 @@
remote_to_local <- function(file, format) {
if (missing(format)) {
# handle google sheets urls
if (grepl("docs\\.google\\.com/spreadsheets", file)) {
file <- convert_google_url(file, export_as = "csv")
if (grepl("docs\\.google\\.com/spreadsheets", file)) {
if (missing(format) || (!missing(format) && !format %in% c("csv", "tsv", "xlsx", "ods"))) {
format <- "csv"
} else {
# try to extract format from URL
format <- try(get_info(file)$format, silent = TRUE)
if (inherits(format, "try-error")) {
format <- "TMP"
}
}
file <- .convert_google_url(file, export_as = format)
}
if (missing(format)) {
## try to extract format from URL, see below
format <- .get_ext_temp(file)
} else {
# handle google sheets urls
if (grepl("docs\\.google\\.com/spreadsheets", file)) {
format <- .standardize_format(format)
if (format %in% c("csv", "tsv", "xlsx", "ods")) {
file <- convert_google_url(file, export_as = format)
format <- format
} else {
file <- convert_google_url(file, export_as = "csv")
format <- "csv"
}
} else {
format <- .standardize_format(format)
}
format <- .standardize_format(format)
}
# save file locally
temp_file <- tempfile(fileext = paste0(".", format))
u <- curl::curl_fetch_memory(file)
writeBin(object = u$content, con = temp_file)

if (format == "TMP") {
# try to extract format from curl's final URL
format <- try(get_info(u$url)$format, silent = TRUE)
if (inherits(format, "try-error")) {
# try to extract format from headers
h1 <- curl::parse_headers(u$headers)
# check `Content-Disposition` header
if (any(grepl("^Content-Disposition", h1))) {
h <- h1[grep("filename", h1)]
if (length(h)) {
f <- regmatches(h, regexpr("(?<=\")(.*)(?<!\")", h, perl = TRUE))
if (!length(f)) {
f <- regmatches(h, regexpr("(?<=filename=)(.*)", h, perl = TRUE))
}
f <- paste0(dirname(temp_file), "/", f)
file.copy(from = temp_file, to = f)
unlink(temp_file)
temp_file <- f
}
} else {
stop("Unrecognized file format. Try specifying with the format argument.")
}
# check `Content-Type` header
# if (any(grepl("^Content-Type", h1))) {
# h <- h1[grep("^Content-Type", h1)]
# ## PARSE MIME TYPE
# }
} else {
f <- sub("TMP$", format, temp_file)
file.copy(from = temp_file, to = f)
unlink(temp_file)
temp_file <- f
if (format != "TMP") { ## the happiest path
return(temp_file)
}
## format = "TMP": try to extract format from curl's final URL
format <- .get_ext_temp(u$url)
if (format != "TMP") { ## contain a file extension, also happy
renamed_file <- sub("TMP$", format, temp_file)
file.copy(from = temp_file, to = renamed_file)
unlink(temp_file)
return(renamed_file)
}
## try to extract format from headers: read #403 about whether this code is doing anything
h1 <- curl::parse_headers(u$headers)
## check `Content-Disposition` header
if (!any(grepl("^Content-Disposition", h1))) {
stop("Unrecognized file format. Try specifying with the format argument.")
}
h <- h1[grep("filename", h1)]
if (length(h)) {
f <- regmatches(h, regexpr("(?<=\")(.*)(?<!\")", h, perl = TRUE))
if (!length(f)) {
f <- regmatches(h, regexpr("(?<=filename=)(.*)", h, perl = TRUE))
}
f <- paste0(dirname(temp_file), "/", f)
file.copy(from = temp_file, to = f)
unlink(temp_file)
return(f)
}
}

.convert_google_url <- function(url, export_as = "csv") {
## convert a google sheets url to google csv export URL
## extract the doc-id and append /export?format = csv to it. (default)
google_key <- regmatches(url, regexpr("[[:alnum:]_-]{30,}", url))
if (grepl("gid=[[:digit:]]+", url)) {
gidpart <- paste0(regmatches(url, regexpr("gid=[[:digit:]]+", url)))
} else {
gidpart <- "gid=0"
}
return(paste0("https://docs.google.com/spreadsheets/d/", google_key, "/export?", gidpart, "&format=", export_as))
}

.get_ext_temp <- function(file, temp_format = "TMP") {
## This is a version of get_ext for internal usage
## When file can't be queried, return `temp_format` instead of error
format <- try(get_info(file)$format, silent = TRUE)
if (inherits(format, "try-error")) {
return(temp_format)
}
return(temp_file)
return(format)
}
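
For orientation, a minimal usage sketch of the refactored Google Sheets path (not part of the diff). It reuses the sheet URL from the tests below; `.convert_google_url()` is unexported, so the `:::` access is purely illustrative, and the printed URL is inferred from the code above rather than taken from a recorded run.

library(rio)

## a Google Sheets "edit" URL, as a user would copy it from the browser
googleurl <- "https://docs.google.com/spreadsheets/d/1I9mJsS5QnXF2TNNntTy-HrcdHmIF9wJ8ONYvEJTXSNo/edit#gid=0"

## the helper extracts the document key and the gid part, then rewrites the
## URL to the export endpoint; "csv" is the default export format
rio:::.convert_google_url(googleurl)
#> [1] "https://docs.google.com/spreadsheets/d/1I9mJsS5QnXF2TNNntTy-HrcdHmIF9wJ8ONYvEJTXSNo/export?gid=0&format=csv"

## import() goes through remote_to_local(): with no format it exports as csv;
## a supported format ("csv", "tsv", "xlsx", "ods") is passed to the export URL,
## anything else falls back to csv
x <- import(googleurl)
y <- import(googleurl, format = "xlsx")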
1 change: 0 additions & 1 deletion R/utils.R
@@ -44,7 +44,6 @@ get_ext <- function(file) {
get_info(file)$input
}


.query_format <- function(input, file) {
unique_rio_formats <- unique(rio_formats[, colnames(rio_formats) != "note"])
if (file == "clipboard") {
21 changes: 12 additions & 9 deletions tests/testthat/test_remote.R
@@ -1,3 +1,5 @@
skip_on_cran()

test_that("Import Remote Stata File", {
f <- try(import("http://www.stata-press.com/data/r13/auto.dta"))
if (!inherits(f, "try-error")) {
@@ -6,7 +8,7 @@ test_that("Import Remote Stata File", {
})

test_that("Import Remote GitHub File", {
rfile <- "https://raw.githubusercontent.com/leeper/rio/master/inst/examples/no_header.csv"
rfile <- "https://raw.githubusercontent.com/gesistsa/rio/main/tests/testdata/noheader.csv"
rfile_imported1 <- try(import(rfile))
if (!inherits(rfile_imported1, "try-error")) {
expect_true(inherits(rfile_imported1, "data.frame"), label = "Import remote file (implied format)")
@@ -21,16 +23,17 @@ test_that("Import Remote GitHub File", {
expect_true(file.exists(lfile), label = "Remote file copied successfully")
expect_true(inherits(import(lfile), "data.frame"), label = "Import local copy successfully")
}
## short url
payload <- try(import("https://is.gd/NLAxtg"))
if (!inherits(payload, "try-error")) {
expect_true(inherits(payload, "data.frame"), label = "Import remote file from shorten url (implied format)")
}
## no extension
noextension_url <- "https://github.com/gesistsa/rio/raw/main/tests/testdata/iris_no_extension_xls"
expect_error(import(noextension_url))
expect_error(import(noextension_url, format = "xls"), NA)
})

## test_that("Import Remote File from Shortened URL", {
## skip_if_not_installed(pkg = "data.table")
## shorturl <- try(import("https://raw.githubusercontent.com/gesistsa/rio/main/tests/testdata/example.csvy"))
## if (!inherits(shorturl, "try-error")) {
## expect_true(inherits(shorturl, "data.frame"), label = "Import remote file")
## }
## })

test_that("Import from Google Sheets", {
googleurl1 <- "https://docs.google.com/spreadsheets/d/1I9mJsS5QnXF2TNNntTy-HrcdHmIF9wJ8ONYvEJTXSNo/edit#gid=0"
expect_true(inherits(import(googleurl1), "data.frame"), label = "Import google sheets (specified sheet)")
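
A user-facing sketch of the no-extension behavior exercised above (the URL is the test fixture; the expected outcomes follow from the tests, not from a recorded run):

library(rio)

## the URL carries no file extension, so the format cannot be inferred
noext <- "https://github.com/gesistsa/rio/raw/main/tests/testdata/iris_no_extension_xls"

## import(noext)                    # errors (see the test above)
x <- import(noext, format = "xls")  # succeeds once the format is stated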
