vincentarelbundock · vincentarelbundock · Sep 6, 2023 · Sep 4, 2023 · Sep 4, 2023 · Sep 4, 2023
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -18,6 +18,10 @@ Authors@R:
              role = "ctb",
              email = "[email protected]",
              comment = c(ORCID = "0000-0001-7195-4117")),
+      person("Etienne", "Bacher",
+             email = "[email protected]",
+             role = "ctb",
+             comment = c(ORCID = "0000-0002-9271-5075")),
       person(given = "Samuel",
              family = "Meichtry",
              role = "ctb",

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,6 @@
 # countrycode 1.5.0.9000
 
+* Important speed-up for detection of country names using regular expressions (Thanks to Etienne Bacher).
 * `countryname` gets the `nomatch` argument.
 * `countryname` returns NA when the code does not support a given country. (Issue #336)
 * Improved regex for Italy

diff --git a/R/countrycode.R b/R/countrycode.R
@@ -19,12 +19,12 @@
 #' [`codelist_panel`][codelist_panel] data.frame as a base into which they can
 #' merge their other data. This data.frame includes most relevant code, and is
 #' already "reconciled" to ensure that each political unit is only represented
 #' by one row in any given year. From there, it is just a matter of using [merge()]
 #' to combine different datasets which use different codes.
 #'
 #' @param sourcevar Vector which contains the codes or country names to be
 #'   converted (character or factor)
 #' @param origin A string which identifies the coding scheme of origin (e.g., `"iso3c"`). See
 #'   [`codelist`][codelist] for a list of available codes.
 #' @param destination A string or vector of strings which identify the coding
 #'   scheme of destination (e.g., `"iso3c"` or `c("cowc", "iso3c")`). See
@@ -32,7 +32,7 @@
 #'   vector of destination codes, they are used sequentially to fill in
 #'   missing values not covered by the previous destination code in the
 #'   vector.
 #' @param warn Prints unique elements from sourcevar for which no match was found
 #' @param nomatch When countrycode fails to find a match for the code of
 #'   origin, it fills-in the destination vector with `nomatch`. The default
 #'   behavior is to fill non-matching codes with `NA`. If `nomatch = NULL`,
@@ -80,7 +80,7 @@
 #' countrycode(c('United States', 'Algeria'), 'country.name', 'iso3c')
 #' countrycode(c('United States', 'Algeria'), 'country.name', 'iso3c',
 #'             custom_match = c('Algeria' = 'ALG'))
-#' 
+#'
 #' x <- c("canada", "antarctica")
 #' countryname(x)
 #' countryname(x, destination = "cowc", warn = FALSE)
@@ -90,7 +90,7 @@
 #'  # Download the dictionary of US states from Github
 #'  state_dict <- "https://bit.ly/2ToSrFv"
 #'  state_dict <- read.csv(state_dict)
-#' 
+#'
 #'  # The "state.regex" column includes regular expressions, so we set an attribute.
 #'  attr(state_dict, "origin_regex") <- "state.regex"
 #
@@ -121,7 +121,7 @@
     }
 
     # default country names (only for default dictionary)
-    if (is.null(custom_dict)) { 
+    if (is.null(custom_dict)) {
         if (origin == 'country.name') {
             origin <- 'country.name.en'
         }
@@ -204,25 +204,41 @@
 
     out <- rep(NA, length(sourcevar))
     for (dest in destination) {
-        out <- ifelse(is.na(out),
-                      countrycode_convert(
-                          ## user-supplied arguments
-                          sourcevar = sourcevar,
-                          origin = origin,
-                          destination = dest,
-                          warn = warn,
-                          nomatch = nomatch,
-                          custom_dict = custom_dict,
-                          custom_match = custom_match,
-                          origin_regex = origin_regex,
-                          ## countrycode-supplied arguments
-                          origin_vector = origin_vector,
-                          dictionary = dictionary),
-                      out)
+        if (length(destination) == 1) {
+            out <- countrycode_convert(
+                ## user-supplied arguments
+                sourcevar = sourcevar,
+                origin = origin,
+                destination = dest,
+                warn = warn,
+                nomatch = nomatch,
+                custom_dict = custom_dict,
+                custom_match = custom_match,
+                origin_regex = origin_regex,
+                ## countrycode-supplied arguments
+                origin_vector = origin_vector,
+                dictionary = dictionary)
+        } else {
+            out <- ifelse(is.na(out),
+                          countrycode_convert(
+                              ## user-supplied arguments
+                              sourcevar = sourcevar,
+                              origin = origin,
+                              destination = dest,
+                              warn = warn,
+                              nomatch = nomatch,
+                              custom_dict = custom_dict,
+                              custom_match = custom_match,
+                              origin_regex = origin_regex,
+                              ## countrycode-supplied arguments
+                              origin_vector = origin_vector,
+                              dictionary = dictionary),
+                          out)
+        }
     }
     return(out)
 }
-                        
+
 
 #' internal function called by `countrycode()`
 #'
@@ -248,13 +264,24 @@
         dict <- stats::na.omit(dictionary[, c(origin, destination)])
         sourcefctr <- factor(origin_vector)
 
-        # match levels of sourcefctr
-        matches <-
-          sapply(c(levels(sourcefctr), NA), function(x) { # add NA so there's at least one item
-            x <- tryCatch(trimws(x), error = function(e) x) # sometimes an error is triggered by encoding issues
-            matchidx <- sapply(dict[[origin]], function(y) grepl(y, x, perl = TRUE, ignore.case = TRUE))
-            dict[matchidx, destination]
-          })
+        # possibilities (add NA so there's at least one item)
+        choices <- c(levels(sourcefctr), NA)
+        # sometimes an error is triggered by encoding issues
+        choices <- tryCatch(trimws(choices), error = function(e) choices)
+
+        # Apply all regexes on all inputs. This gives a matrix where rows
+        # are the inputs and columns are the regexes.
+        # For each row, the `TRUE` values indicate the matches.
+        matchidx <- sapply(dict[[origin]], grepl, x = choices,
+                           perl = TRUE, ignore.case = TRUE)
+        if (all(is.na(choices))) {
+            matches <- vector("list", length = length(choices))
+        } else {
+            out <- apply(matchidx, 1, which, simplify = FALSE)
+            names(out) <- choices
+            matches <- lapply(out, function(x) dict[x, destination])
+        }
+
 
         # fill elements that have zero matches with the appropriate NA
         matches[sapply(matches, length) == 0] <- `class<-`(NA, class(dict[[destination]]))