From d91ba40f586a118dda78198e5a7375f95ab62c87 Mon Sep 17 00:00:00 2001 From: Matthias Ollech <99362508+MEO265@users.noreply.github.com> Date: Sun, 5 May 2024 15:35:40 +0200 Subject: [PATCH 01/17] feat: Add example json_split in cpp --- NAMESPACE | 1 + R/loggit.R | 3 +++ src/.gitignore | 3 +++ src/split_json.cpp | 43 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 50 insertions(+) create mode 100644 src/.gitignore create mode 100644 src/split_json.cpp diff --git a/NAMESPACE b/NAMESPACE index d09e32c..578ec68 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,3 +13,4 @@ export(stop) export(stopifnot) export(warning) export(with_loggit) +useDynLib(loggit2, .registration=TRUE) diff --git a/R/loggit.R b/R/loggit.R index 042ac3b..ed2931b 100644 --- a/R/loggit.R +++ b/R/loggit.R @@ -1,3 +1,6 @@ +#' @useDynLib loggit2, .registration=TRUE +NULL + #' Log entries to file #' #' Log entries to a [ndjson](https://github.com/ndjson) log file, defined by [set_logfile()]. diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000..22034c4 --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,3 @@ +*.o +*.so +*.dll diff --git a/src/split_json.cpp b/src/split_json.cpp new file mode 100644 index 0000000..7104748 --- /dev/null +++ b/src/split_json.cpp @@ -0,0 +1,43 @@ +#include +#include +#include +#include + +extern "C" SEXP splitString(SEXP strSEXP) { + const char* str = CHAR(STRING_ELT(strSEXP, 0)); + std::vector result; + std::string token; + bool inQuotes = false; + int backslashCount = 0; + + for (int i = 0; str[i] != '\0'; ++i) { + char ch = str[i]; + if (ch == '\\' && !inQuotes) { + backslashCount++; + token += ch; + } else if (ch == '"' && (backslashCount % 2 == 0)) { + inQuotes = !inQuotes; + token += ch; + backslashCount = 0; + } else if ((ch == ',' || ch == ':') && !inQuotes) { + result.push_back(token); + token.clear(); + backslashCount = 0; + } else { + token += ch; + backslashCount = 0; + } + } + + if (!token.empty()) { + result.push_back(token); + } + + SEXP ans = PROTECT(allocVector(STRSXP, result.size())); + for (size_t j = 0; j < result.size(); ++j) { + SET_STRING_ELT(ans, j, mkChar(result[j].c_str())); + } + + UNPROTECT(1); + return ans; +} From de318f580e887c57563de8f4b414818c877df9f7 Mon Sep 17 00:00:00 2001 From: Matthias Ollech <99362508+MEO265@users.noreply.github.com> Date: Tue, 7 May 2024 00:50:50 +0200 Subject: [PATCH 02/17] feat: Allow NA --- src/split_json.cpp | 66 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 56 insertions(+), 10 deletions(-) diff --git a/src/split_json.cpp b/src/split_json.cpp index 7104748..42cf97c 100644 --- a/src/split_json.cpp +++ b/src/split_json.cpp @@ -3,15 +3,32 @@ #include #include -extern "C" SEXP splitString(SEXP strSEXP) { +extern "C" SEXP split_json(SEXP strSEXP) { const char* str = CHAR(STRING_ELT(strSEXP, 0)); - std::vector result; + std::vector keys; + std::vector values; + std::vector keysNA; + std::vector valuesNA; std::string token; bool inQuotes = false; int backslashCount = 0; + bool isKey = true; - for (int i = 0; str[i] != '\0'; ++i) { + int start = 0; + int end = strlen(str) - 1; + + // Ignore leading and trailing whitespaces + while (str[start] == ' ') start++; + while (str[end] == ' ') end--; + + // Check for '{' and '}' and adjust the boundaries + if (str[start] == '{') start++; + if (str[end] == '}') end--; + + for (int i = start; i <= end; ++i) { char ch = str[i]; + if (ch == ' ' && !inQuotes) continue; // Skip spaces outside quotes + if (ch == '\\' && !inQuotes) { backslashCount++; token += ch; @@ -20,7 +37,19 @@ extern "C" SEXP splitString(SEXP strSEXP) { token += ch; backslashCount = 0; } else if ((ch == ',' || ch == ':') && !inQuotes) { - result.push_back(token); + bool isNA = token == "null"; + if (token.front() == '"' && token.back() == '"' && token.size() > 1) { + token = token.substr(1, token.size() - 2); // Remove quotes + } + if (isKey) { + keys.push_back(token); + keysNA.push_back(isNA); + isKey = false; // Next token will be a value + } else { + values.push_back(token); + valuesNA.push_back(isNA); + isKey = true; // Next token will be a key + } token.clear(); backslashCount = 0; } else { @@ -29,15 +58,32 @@ extern "C" SEXP splitString(SEXP strSEXP) { } } - if (!token.empty()) { - result.push_back(token); + // Check the last token + if (!token.empty() && token != "null") { + if (token.front() == '"' && token.back() == '"' && token.size() > 1) { + token = token.substr(1, token.size() - 2); // Remove quotes + } + values.push_back(token); + valuesNA.push_back(token == "null"); + } else { + values.push_back(""); + valuesNA.push_back(true); } - SEXP ans = PROTECT(allocVector(STRSXP, result.size())); - for (size_t j = 0; j < result.size(); ++j) { - SET_STRING_ELT(ans, j, mkChar(result[j].c_str())); + SEXP ans = PROTECT(allocVector(VECSXP, 2)); // List with two elements + SEXP keysR = PROTECT(allocVector(STRSXP, keys.size())); + SEXP valuesR = PROTECT(allocVector(STRSXP, values.size())); + + for (size_t j = 0; j < keys.size(); ++j) { + SET_STRING_ELT(keysR, j, keysNA[j] ? NA_STRING : mkChar(keys[j].c_str())); } + for (size_t j = 0; j < values.size(); ++j) { + SET_STRING_ELT(valuesR, j, valuesNA[j] ? NA_STRING : mkChar(values[j].c_str())); + } + + SET_VECTOR_ELT(ans, 0, keysR); + SET_VECTOR_ELT(ans, 1, valuesR); - UNPROTECT(1); + UNPROTECT(3); return ans; } From 39bab8de30fff1136230cf3d3aa9f20bf5a47d72 Mon Sep 17 00:00:00 2001 From: MEO265 <99362508+MEO265@users.noreply.github.com> Date: Tue, 7 May 2024 06:12:46 +0200 Subject: [PATCH 03/17] Later R Import --- src/split_json.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/split_json.cpp b/src/split_json.cpp index 42cf97c..ef6a697 100644 --- a/src/split_json.cpp +++ b/src/split_json.cpp @@ -1,7 +1,8 @@ -#include -#include #include #include +#include +#include + extern "C" SEXP split_json(SEXP strSEXP) { const char* str = CHAR(STRING_ELT(strSEXP, 0)); From 57185b3f01fee5c9bac13efbc736cd1ab6c87af9 Mon Sep 17 00:00:00 2001 From: MEO265 <99362508+MEO265@users.noreply.github.com> Date: Tue, 7 May 2024 06:33:08 +0200 Subject: [PATCH 04/17] Rf_ --- src/split_json.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/split_json.cpp b/src/split_json.cpp index ef6a697..7473725 100644 --- a/src/split_json.cpp +++ b/src/split_json.cpp @@ -71,9 +71,9 @@ extern "C" SEXP split_json(SEXP strSEXP) { valuesNA.push_back(true); } - SEXP ans = PROTECT(allocVector(VECSXP, 2)); // List with two elements - SEXP keysR = PROTECT(allocVector(STRSXP, keys.size())); - SEXP valuesR = PROTECT(allocVector(STRSXP, values.size())); + SEXP ans = PROTECT(Rf_allocVector(VECSXP, 2)); // List with two elements + SEXP keysR = PROTECT(Rf_allocVector(STRSXP, keys.size())); + SEXP valuesR = PROTECT(Rf_allocVector(STRSXP, values.size())); for (size_t j = 0; j < keys.size(); ++j) { SET_STRING_ELT(keysR, j, keysNA[j] ? NA_STRING : mkChar(keys[j].c_str())); From baba4bc3bf7cb4df523aacbb4c54b377d7be7d56 Mon Sep 17 00:00:00 2001 From: MEO265 <99362508+MEO265@users.noreply.github.com> Date: Tue, 7 May 2024 06:46:28 +0200 Subject: [PATCH 05/17] Rf_ 2 --- src/split_json.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/split_json.cpp b/src/split_json.cpp index 7473725..89381dc 100644 --- a/src/split_json.cpp +++ b/src/split_json.cpp @@ -76,10 +76,10 @@ extern "C" SEXP split_json(SEXP strSEXP) { SEXP valuesR = PROTECT(Rf_allocVector(STRSXP, values.size())); for (size_t j = 0; j < keys.size(); ++j) { - SET_STRING_ELT(keysR, j, keysNA[j] ? NA_STRING : mkChar(keys[j].c_str())); + SET_STRING_ELT(keysR, j, keysNA[j] ? NA_STRING : Rf_mkChar(keys[j].c_str())); } for (size_t j = 0; j < values.size(); ++j) { - SET_STRING_ELT(valuesR, j, valuesNA[j] ? NA_STRING : mkChar(values[j].c_str())); + SET_STRING_ELT(valuesR, j, valuesNA[j] ? NA_STRING : Rf_mkChar(values[j].c_str())); } SET_VECTOR_ELT(ans, 0, keysR); From ccbf405db6265d9fada503c484e2eebfdc3ab85e Mon Sep 17 00:00:00 2001 From: Matthias Ollech <99362508+MEO265@users.noreply.github.com> Date: Tue, 7 May 2024 17:45:17 +0200 Subject: [PATCH 06/17] feat: Add `split_ndjson` --- R/json.R | 24 +----------------------- R/split_json.R | 7 +++++++ src/split_json.cpp | 26 +++++++++++++++++++++++++- 3 files changed, 33 insertions(+), 24 deletions(-) create mode 100644 R/split_json.R diff --git a/R/json.R b/R/json.R index 875164c..e1b4a7d 100644 --- a/R/json.R +++ b/R/json.R @@ -121,28 +121,8 @@ read_ndjson <- function(logfile, unsanitize = TRUE) { # Read in lines of log data logdata <- readLines(logfile) - # List first; easier to add to dynamically - log_df <- data.frame() - - # Split out the log data into individual pieces, which will include JSON keys AND values - logdata <- substring(logdata, first = 3L, last = nchar(logdata) - 2L) - logdata <- strsplit(logdata, '", "', fixed = TRUE) - log_kvs <- lapply(logdata, FUN = function(x) strsplit(x, '": "', fixed = FALSE)) - for (kvs in seq_along(log_kvs)) { - missing_key <- which(lengths(log_kvs[[kvs]]) == 1L) - for (mk in missing_key) { - log_kvs[[kvs]][[mk]] <- c(log_kvs[[kvs]][[mk]], "") - } - } + log_kvs <- split_ndjson(logdata) - key_value_split <- function(x) { - x <- unlist(x, use.names = FALSE) - keys <- x[c(TRUE, FALSE)] - values <- x[c(FALSE, TRUE)] - list(keys = keys, values = values) - } - - log_kvs <- lapply(log_kvs, key_value_split) rowcount <- length(log_kvs) all_keys <- unique(unlist(lapply(log_kvs, FUN = function(x) x[["keys"]]))) @@ -158,8 +138,6 @@ read_ndjson <- function(logfile, unsanitize = TRUE) { } } - if (unsanitize) log_df <- lapply(log_df, FUN = default_ndjson_unsanitizer) - log_df <- as.data.frame(log_df) log_df diff --git a/R/split_json.R b/R/split_json.R new file mode 100644 index 0000000..4305b42 --- /dev/null +++ b/R/split_json.R @@ -0,0 +1,7 @@ +split_json <- function(x) { + .Call("split_json", x) +} + +split_ndjson <- function(x) { + .Call("split_ndjson", x) +} \ No newline at end of file diff --git a/src/split_json.cpp b/src/split_json.cpp index 89381dc..7f7607b 100644 --- a/src/split_json.cpp +++ b/src/split_json.cpp @@ -85,6 +85,30 @@ extern "C" SEXP split_json(SEXP strSEXP) { SET_VECTOR_ELT(ans, 0, keysR); SET_VECTOR_ELT(ans, 1, valuesR); - UNPROTECT(3); + SEXP names = PROTECT(Rf_allocVector(STRSXP, 2)); + SET_STRING_ELT(names, 0, Rf_mkChar("keys")); + SET_STRING_ELT(names, 1, Rf_mkChar("values")); + Rf_setAttrib(ans, R_NamesSymbol, names); + + UNPROTECT(4); return ans; } + +extern "C" SEXP split_ndjson(SEXP strVecSEXP) { + if (!isString(strVecSEXP)) { + error("Input must be a character vector."); + } + + int n = length(strVecSEXP); + SEXP result = PROTECT(Rf_allocVector(VECSXP, n)); + + for (int i = 0; i < n; i++) { + SEXP strSEXP = PROTECT(Rf_mkString(CHAR(STRING_ELT(strVecSEXP, i)))); + SET_VECTOR_ELT(result, i, split_json(strSEXP)); + UNPROTECT(1); // Entferne den Schutz für strSEXP nach dessen Verwendung + } + + UNPROTECT(1); // Entferne den Schutz für result + return result; +} + From 9e201da0ed10b96fa4524676449bd722c2bc4a11 Mon Sep 17 00:00:00 2001 From: Matthias Ollech <99362508+MEO265@users.noreply.github.com> Date: Tue, 7 May 2024 18:53:59 +0200 Subject: [PATCH 07/17] fix: Reduced sanitizer --- R/json.R | 54 +++++++++++++++++++----------------------------------- R/utils.R | 4 ++-- 2 files changed, 21 insertions(+), 37 deletions(-) diff --git a/R/json.R b/R/json.R index e1b4a7d..49e643f 100644 --- a/R/json.R +++ b/R/json.R @@ -1,11 +1,8 @@ sanitizer_map <- list( - "{" = "__LEFTBRACE__", - "}" = "__RIGHTBRACE__", - '"' = "__DBLQUOTE__", - "," = "__COMMA__", - "\r" = "__CR__", - "\n" = "__LF__" -) + "\\" = "\\\\", + '"' = "\"", + "\r" = "\\r", + "\n" = "\\n") #' Sanitization for ndJSON. @@ -20,13 +17,10 @@ sanitizer_map <- list( #' The default sanatizer and unsanatizer are based on the following mapping: #' #' | Character | Replacement | -#' |:--------- | :---------------------- | -#' | `{` | `__LEFTBRACE__` | -#' | `}` | `__RIGHTBRACE__` | -#' | `"` | `__DBLQUOTE__` | -#' | `,` | `__COMMA__` | -#' | `\r` | `__CR__` | -#' | `\n` | `__LF__` | +#' |:--------- | :-----------| +#' | `"` | `\"` | +#' | `\r` | `\\r` | +#' | `\n` | `\\n` | #' #' This type of function is needed because because some characters in a JSON cannot appear unescaped and #' since `loggit2` reimplements its own very simple string-based JSON parser. @@ -47,20 +41,15 @@ default_ndjson_sanitizer <- function(string) { string <- gsub(pattern = k, replacement = sanitizer_map[[k]], string, fixed = TRUE) } - # Explicit NAs must be marked so that no new ones are inserted when rotating the log - string[is.na(string)] <- "__NA__" - string } #' @rdname sanitizers default_ndjson_unsanitizer <- function(string) { - for (k in names(sanitizer_map)) { + for (k in rev(names(sanitizer_map))) { string <- gsub(pattern = sanitizer_map[[k]], replacement = k, string, fixed = TRUE) } - string[string == "__NA__"] <- NA_character_ - string } @@ -74,31 +63,25 @@ default_ndjson_unsanitizer <- function(string) { #' @param echo Echo the `ndjson` entry to the R console? Defaults to `TRUE`. #' @param overwrite Overwrite previous log file data? Defaults to `FALSE`, and #' so will append new log entries to the log file. -#' @param sanitizer Should the log data be sanitized before writing to json? #' #' @keywords internal -write_ndjson <- function(log_df, logfile = get_logfile(), echo = TRUE, overwrite = FALSE, sanitize = TRUE) { - - if (sanitize) { - for (field in colnames(log_df)) { - log_df[, field] <- default_ndjson_sanitizer(log_df[, field]) - } - } +write_ndjson <- function(log_df, logfile = get_logfile(), echo = TRUE, overwrite = FALSE) { # logdata will be built into a character vector where each element is a valid # JSON object, constructed from each row of the log data frame. logdata <- character(nrow(log_df)) - field_names <- paste0("\"", colnames(log_df), "\"") + log_df <- as.data.frame(lapply(log_df, function (x) default_ndjson_sanitizer(as.character(x)))) + + row_names <- paste0("\"", colnames(log_df), "\"") for (row in seq_len(nrow(log_df))) { row_data <- as.character(log_df[row,]) - na_entries <- is.na(row_data) - row_data <- row_data[!na_entries] - row_names <- field_names[!na_entries] - row_data <- paste0("\"", row_data, "\"") + na_entries <- is.na(row_data) + row_data[!na_entries] <- paste0("\"", row_data[!na_entries], "\"") + row_data[na_entries] <- "null" row_data <- paste(row_names, row_data, sep = ": ", collapse = ", ") logdata[row] <- paste0("{", row_data, "}") } @@ -111,12 +94,11 @@ write_ndjson <- function(log_df, logfile = get_logfile(), echo = TRUE, overwrite #' Read ndJSON-formatted log file #' #' @param logfile Log file to read from, and convert to a `data.frame`. -#' @param unsanitize Should the log data be unsanitized after reading from json? #' #' @keywords internal #' #' @return A `data.frame` -read_ndjson <- function(logfile, unsanitize = TRUE) { +read_ndjson <- function(logfile) { # Read in lines of log data logdata <- readLines(logfile) @@ -138,6 +120,8 @@ read_ndjson <- function(logfile, unsanitize = TRUE) { } } + log_df <- lapply(log_df, default_ndjson_unsanitizer) + log_df <- as.data.frame(log_df) log_df diff --git a/R/utils.R b/R/utils.R index 1a3a0fa..3ba6a54 100644 --- a/R/utils.R +++ b/R/utils.R @@ -56,12 +56,12 @@ rotate_logs <- function(rotate_lines = 100000L, logfile = get_logfile()) { cat(NULL, file = logfile) return(invisible(NULL)) } - log_df <- read_ndjson(logfile, unsanitize = FALSE) + log_df <- read_ndjson(logfile) if (nrow(log_df) <= rotate_lines) { return(invisible(NULL)) } log_df <- log_df[seq.int(from = nrow(log_df) - rotate_lines + 1L, length.out = rotate_lines),] - write_ndjson(log_df, logfile, echo = FALSE, overwrite = TRUE, sanitize = FALSE) + write_ndjson(log_df, logfile, echo = FALSE, overwrite = TRUE) } #' Find the Call of a Parent Function in the Call Hierarchy From 342efe6d17c90fee54af8dda189920bb110a2c6f Mon Sep 17 00:00:00 2001 From: Matthias Ollech <99362508+MEO265@users.noreply.github.com> Date: Tue, 7 May 2024 19:36:55 +0200 Subject: [PATCH 08/17] fix: Escaped " --- R/json.R | 2 +- src/split_json.cpp | 2 +- tests/testthat/test-json.R | 24 ++---------------------- 3 files changed, 4 insertions(+), 24 deletions(-) diff --git a/R/json.R b/R/json.R index 49e643f..463621b 100644 --- a/R/json.R +++ b/R/json.R @@ -1,6 +1,6 @@ sanitizer_map <- list( "\\" = "\\\\", - '"' = "\"", + '"' = '\\\"', "\r" = "\\r", "\n" = "\\n") diff --git a/src/split_json.cpp b/src/split_json.cpp index 7f7607b..5993d87 100644 --- a/src/split_json.cpp +++ b/src/split_json.cpp @@ -30,7 +30,7 @@ extern "C" SEXP split_json(SEXP strSEXP) { char ch = str[i]; if (ch == ' ' && !inQuotes) continue; // Skip spaces outside quotes - if (ch == '\\' && !inQuotes) { + if (ch == '\\') { backslashCount++; token += ch; } else if (ch == '"' && (backslashCount % 2 == 0)) { diff --git a/tests/testthat/test-json.R b/tests/testthat/test-json.R index d50cfd0..661e25d 100644 --- a/tests/testthat/test-json.R +++ b/tests/testthat/test-json.R @@ -24,26 +24,6 @@ test_that("write_logs() and read_logs() work in tandem", { }) cleanup() - -test_that("write_logs() and read_logs() work with disallowed JSON characters via santizers", { - loggit("INFO", 'default { } , " \r \n sanitizer', echo = FALSE) - - log_df_want <- data.frame( - log_lvl = "INFO", - log_msg = "default __LEFTBRACE__ __RIGHTBRACE__ __COMMA__ __DBLQUOTE__ __CR__ __LF__ sanitizer", - stringsAsFactors = FALSE - ) - - # Rreturn the sanitized strings as-is - log_df_got <- read_ndjson(logfile = get_logfile(), unsanitize = FALSE) - - log_df_got$timestamp <- NULL - - expect_identical(log_df_want, log_df_got) -}) -cleanup() - - test_that("read_logs() works with unsanitizers", { loggit("INFO", 'default { } , " \r \n unsanitizer', echo = FALSE) @@ -55,8 +35,8 @@ cleanup() test_that("write_logs() special cases", { # Ignore NAs log_data <- data.frame(log_lvl = c("foo", "bar"), log_msg = NA_character_) - write_ndjson(log_data, echo = FALSE, sanitize = FALSE) + write_ndjson(log_data, echo = FALSE) log_file <- read_logs() - expect_identical(log_file, data.frame(log_lvl = c("foo", "bar"))) + expect_identical(log_file, data.frame(log_lvl = c("foo", "bar"), log_msg = NA_character_)) }) cleanup() From 512e67690253691478ff529982da6b1d106c9cbd Mon Sep 17 00:00:00 2001 From: Matthias Ollech <99362508+MEO265@users.noreply.github.com> Date: Tue, 7 May 2024 19:51:29 +0200 Subject: [PATCH 09/17] feat: Rotate efficent --- R/utils.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/utils.R b/R/utils.R index 3ba6a54..039215f 100644 --- a/R/utils.R +++ b/R/utils.R @@ -56,12 +56,12 @@ rotate_logs <- function(rotate_lines = 100000L, logfile = get_logfile()) { cat(NULL, file = logfile) return(invisible(NULL)) } - log_df <- read_ndjson(logfile) - if (nrow(log_df) <= rotate_lines) { + log_df <- readLines(logfile) + if (length(log_df) <= rotate_lines) { return(invisible(NULL)) } - log_df <- log_df[seq.int(from = nrow(log_df) - rotate_lines + 1L, length.out = rotate_lines),] - write_ndjson(log_df, logfile, echo = FALSE, overwrite = TRUE) + log_df <- log_df[seq.int(from = length(log_df) - rotate_lines + 1L, length.out = rotate_lines)] + write(log_df, logfile, append = FALSE) } #' Find the Call of a Parent Function in the Call Hierarchy From 341cc5a5c4689f8153cd27d1fd0106b11d43b03c Mon Sep 17 00:00:00 2001 From: Matthias Ollech <99362508+MEO265@users.noreply.github.com> Date: Tue, 7 May 2024 20:07:16 +0200 Subject: [PATCH 10/17] fix: Escaped csv --- R/json.R | 5 +++-- R/utils.R | 15 +++++---------- tests/testthat/_snaps/utils/test.csv | 8 +++----- tests/testthat/_snaps/utils/test.loggit | 2 +- tests/testthat/test-utils.R | 2 +- tests/testthat/testdata/test.loggit | 6 +++--- 6 files changed, 16 insertions(+), 22 deletions(-) diff --git a/R/json.R b/R/json.R index 463621b..3e040ff 100644 --- a/R/json.R +++ b/R/json.R @@ -94,11 +94,12 @@ write_ndjson <- function(log_df, logfile = get_logfile(), echo = TRUE, overwrite #' Read ndJSON-formatted log file #' #' @param logfile Log file to read from, and convert to a `data.frame`. +#' @param unsanitize Should the log data be unsanitized? #' #' @keywords internal #' #' @return A `data.frame` -read_ndjson <- function(logfile) { +read_ndjson <- function(logfile, unsanitize = TRUE) { # Read in lines of log data logdata <- readLines(logfile) @@ -120,7 +121,7 @@ read_ndjson <- function(logfile) { } } - log_df <- lapply(log_df, default_ndjson_unsanitizer) + if(unsanitize) log_df <- lapply(log_df, default_ndjson_unsanitizer) log_df <- as.data.frame(log_df) diff --git a/R/utils.R b/R/utils.R index 039215f..0a0a689 100644 --- a/R/utils.R +++ b/R/utils.R @@ -3,6 +3,7 @@ #' This function returns a `data.frame` containing all the logs in the provided `ndjson` log file. #' #' @param logfile Path to log file. +#' @param unsanitize Should the log messages be unsanitized? #' #' @return A `data.frame`. #' @@ -15,11 +16,11 @@ #' read_logs() #' #' @export -read_logs <- function(logfile = get_logfile()) { +read_logs <- function(logfile = get_logfile(), unsanitize = TRUE) { base::stopifnot("Log file does not exist" = file.exists(logfile)) - log <- read_ndjson(logfile) + log <- read_ndjson(logfile, unsanitize = unsanitize) if (nrow(log) == 0L) log <- data.frame(timestamp = character(), log_lvl = character(), log_msg = character()) @@ -94,14 +95,8 @@ find_call <- function() { #' @return Invisible `NULL`. #' #' @export -convert_to_csv <- function(file, logfile = get_logfile(), remove_message_lf = TRUE, ...) { - log <- read_logs(logfile = logfile) - - if (remove_message_lf) { - msg_flag <- log$log_lvl == "INFO" - msg <- log$log_msg[msg_flag] - log$log_msg[msg_flag] <- gsub("\n$", "", msg) - } +convert_to_csv <- function(file, logfile = get_logfile(), unsanitize = FALSE, ...) { + log <- read_logs(logfile = logfile, unsanitize = unsanitize) write.table(log, file = file, ...) diff --git a/tests/testthat/_snaps/utils/test.csv b/tests/testthat/_snaps/utils/test.csv index 2256981..0ff1d4f 100644 --- a/tests/testthat/_snaps/utils/test.csv +++ b/tests/testthat/_snaps/utils/test.csv @@ -1,9 +1,7 @@ "timestamp" "log_lvl" "log_msg" "extra" -"1" "2024-05-01T20:09:27+0200" "INFO" "log_1" "extra -" -"2" "2024-05-01T20:09:27+0200" "WARN" "log_96 -" NA +"1" "2024-05-01T20:09:27+0200" "INFO" "log_1" "extra\n" +"2" "2024-05-01T20:09:27+0200" "WARN" "log_96 \n" NA "3" "2024-05-01T20:09:27+0200" "INFO" "log_97" NA "4" "2024-05-01T20:09:27+0200" "INFO" "log_98" NA "5" "2024-05-01T20:09:27+0200" "INFO" "log_99" NA -"6" "2024-05-01T20:09:27+0200" "INFO" "log_100 " "extra" +"6" "2024-05-01T20:09:27+0200" "INFO" "log_100 \n" "extra" diff --git a/tests/testthat/_snaps/utils/test.loggit b/tests/testthat/_snaps/utils/test.loggit index f4cf55b..8fa5c84 100644 --- a/tests/testthat/_snaps/utils/test.loggit +++ b/tests/testthat/_snaps/utils/test.loggit @@ -1,3 +1,3 @@ {"timestamp": "2024-05-01T20:09:27+0200", "log_lvl": "INFO", "log_msg": "log_98"} {"timestamp": "2024-05-01T20:09:27+0200", "log_lvl": "INFO", "log_msg": "log_99"} -{"timestamp": "2024-05-01T20:09:27+0200", "log_lvl": "INFO", "log_msg": "log_100 __LF__", "extra": "extra"} +{"timestamp": "2024-05-01T20:09:27+0200", "log_lvl": "INFO", "log_msg": "log_100 \n", "extra": "extra"} diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index faff3ad..b796d6a 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -82,7 +82,7 @@ test_that("convert_to_csv", { tmp_dir <- tempdir() convert_to_csv(file = file.path(tmp_dir, "test.csv"), logfile = "testdata/test.loggit") convert_to_csv( - file = file.path(tmp_dir, "test_with_lf.csv"), remove_message_lf = FALSE, logfile = "testdata/test.loggit" + file = file.path(tmp_dir, "test_with_lf.csv"), unsanitize = TRUE, logfile = "testdata/test.loggit" ) expect_snapshot_file(file.path(tmp_dir, "test.csv")) diff --git a/tests/testthat/testdata/test.loggit b/tests/testthat/testdata/test.loggit index 971da15..78cd9a9 100644 --- a/tests/testthat/testdata/test.loggit +++ b/tests/testthat/testdata/test.loggit @@ -1,6 +1,6 @@ -{"timestamp": "2024-05-01T20:09:27+0200", "log_lvl": "INFO", "log_msg": "log_1", "extra": "extra__LF__"} -{"timestamp": "2024-05-01T20:09:27+0200", "log_lvl": "WARN", "log_msg": "log_96 __LF__"} +{"timestamp": "2024-05-01T20:09:27+0200", "log_lvl": "INFO", "log_msg": "log_1", "extra": "extra\n"} +{"timestamp": "2024-05-01T20:09:27+0200", "log_lvl": "WARN", "log_msg": "log_96 \n"} {"timestamp": "2024-05-01T20:09:27+0200", "log_lvl": "INFO", "log_msg": "log_97"} {"timestamp": "2024-05-01T20:09:27+0200", "log_lvl": "INFO", "log_msg": "log_98"} {"timestamp": "2024-05-01T20:09:27+0200", "log_lvl": "INFO", "log_msg": "log_99"} -{"timestamp": "2024-05-01T20:09:27+0200", "log_lvl": "INFO", "log_msg": "log_100 __LF__", "extra": "extra"} +{"timestamp": "2024-05-01T20:09:27+0200", "log_lvl": "INFO", "log_msg": "log_100 \n", "extra": "extra"} From 3b0e112f4bafa789a60ddfd87a2bad0fe3f13b98 Mon Sep 17 00:00:00 2001 From: Matthias Ollech <99362508+MEO265@users.noreply.github.com> Date: Tue, 7 May 2024 20:10:25 +0200 Subject: [PATCH 11/17] fix: Explicit Rf_ --- src/split_json.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/split_json.cpp b/src/split_json.cpp index 5993d87..9ed3db4 100644 --- a/src/split_json.cpp +++ b/src/split_json.cpp @@ -95,11 +95,11 @@ extern "C" SEXP split_json(SEXP strSEXP) { } extern "C" SEXP split_ndjson(SEXP strVecSEXP) { - if (!isString(strVecSEXP)) { - error("Input must be a character vector."); + if (!Rf_isString(strVecSEXP)) { + Rf_error("Input must be a character vector."); } - int n = length(strVecSEXP); + int n = Rf_length(strVecSEXP); SEXP result = PROTECT(Rf_allocVector(VECSXP, n)); for (int i = 0; i < n; i++) { From 95cec09776f038350c8157da89b48eb446f66e49 Mon Sep 17 00:00:00 2001 From: Matthias Ollech <99362508+MEO265@users.noreply.github.com> Date: Tue, 7 May 2024 20:14:50 +0200 Subject: [PATCH 12/17] doc: Update doc --- R/utils.R | 2 +- man/convert_to_csv.Rd | 4 ++-- man/message.Rd | 2 +- man/read_logs.Rd | 4 +++- man/read_ndjson.Rd | 2 +- man/sanitizers.Rd | 9 +++------ man/warning.Rd | 4 ++-- man/write_ndjson.Rd | 10 +--------- 8 files changed, 14 insertions(+), 23 deletions(-) diff --git a/R/utils.R b/R/utils.R index 0a0a689..b8d09d0 100644 --- a/R/utils.R +++ b/R/utils.R @@ -89,7 +89,7 @@ find_call <- function() { #' #' @param file Path to write csv file to #' @param logfile Path to log file to read from -#' @param remove_message_lf Should the line breaks at the end of messages be removed? +#' @param unsanitize Should the line breaks at the end of messages be not escaped? #' @param ... Additional arguments to pass to `utils::write.table()` #' #' @return Invisible `NULL`. diff --git a/man/convert_to_csv.Rd b/man/convert_to_csv.Rd index dd2bf2d..76b2f30 100644 --- a/man/convert_to_csv.Rd +++ b/man/convert_to_csv.Rd @@ -4,14 +4,14 @@ \alias{convert_to_csv} \title{Write log to csv file} \usage{ -convert_to_csv(file, logfile = get_logfile(), remove_message_lf = TRUE, ...) +convert_to_csv(file, logfile = get_logfile(), unsanitize = FALSE, ...) } \arguments{ \item{file}{Path to write csv file to} \item{logfile}{Path to log file to read from} -\item{remove_message_lf}{Should the line breaks at the end of messages be removed?} +\item{unsanitize}{Should the line breaks at the end of messages be not escaped?} \item{...}{Additional arguments to pass to \code{utils::write.table()}} } diff --git a/man/message.Rd b/man/message.Rd index a90753e..27f2557 100644 --- a/man/message.Rd +++ b/man/message.Rd @@ -34,8 +34,8 @@ but it includes logging of the exception message via \code{loggit()}. } \seealso{ Other handlers: -\code{\link{stopifnot}()}, \code{\link{stop}()}, +\code{\link{stopifnot}()}, \code{\link{warning}()} } \concept{handlers} diff --git a/man/read_logs.Rd b/man/read_logs.Rd index 69b0085..afae899 100644 --- a/man/read_logs.Rd +++ b/man/read_logs.Rd @@ -4,10 +4,12 @@ \alias{read_logs} \title{Return log file as an R data frame} \usage{ -read_logs(logfile = get_logfile()) +read_logs(logfile = get_logfile(), unsanitize = TRUE) } \arguments{ \item{logfile}{Path to log file.} + +\item{unsanitize}{Should the log messages be unsanitized?} } \value{ A \code{data.frame}. diff --git a/man/read_ndjson.Rd b/man/read_ndjson.Rd index 79539ec..d659783 100644 --- a/man/read_ndjson.Rd +++ b/man/read_ndjson.Rd @@ -9,7 +9,7 @@ read_ndjson(logfile, unsanitize = TRUE) \arguments{ \item{logfile}{Log file to read from, and convert to a \code{data.frame}.} -\item{unsanitize}{Should the log data be unsanitized after reading from json?} +\item{unsanitize}{Should the log data be unsanitized?} } \value{ A \code{data.frame} diff --git a/man/sanitizers.Rd b/man/sanitizers.Rd index 82f27f9..b418c38 100644 --- a/man/sanitizers.Rd +++ b/man/sanitizers.Rd @@ -25,12 +25,9 @@ Associated \emph{sanitizer} and \emph{unsanitizer} should be constructed in such \details{ The default sanatizer and unsanatizer are based on the following mapping:\tabular{ll}{ Character \tab Replacement \cr - \verb{\{} \tab \verb{__LEFTBRACE__} \cr - \verb{\}} \tab \verb{__RIGHTBRACE__} \cr - \verb{"} \tab \verb{__DBLQUOTE__} \cr - \verb{,} \tab \verb{__COMMA__} \cr - \verb{\\r} \tab \verb{__CR__} \cr - \verb{\\n} \tab \verb{__LF__} \cr + \verb{"} \tab \verb{\\"} \cr + \verb{\\r} \tab \verb{\\\\r} \cr + \verb{\\n} \tab \verb{\\\\n} \cr } diff --git a/man/warning.Rd b/man/warning.Rd index 8c37d63..f0fd0fb 100644 --- a/man/warning.Rd +++ b/man/warning.Rd @@ -49,7 +49,7 @@ but it includes logging of the exception message via \code{loggit()}. \seealso{ Other handlers: \code{\link{message}()}, -\code{\link{stopifnot}()}, -\code{\link{stop}()} +\code{\link{stop}()}, +\code{\link{stopifnot}()} } \concept{handlers} diff --git a/man/write_ndjson.Rd b/man/write_ndjson.Rd index 6d6223b..798b7ce 100644 --- a/man/write_ndjson.Rd +++ b/man/write_ndjson.Rd @@ -4,13 +4,7 @@ \alias{write_ndjson} \title{Write ndJSON-formatted log file} \usage{ -write_ndjson( - log_df, - logfile = get_logfile(), - echo = TRUE, - overwrite = FALSE, - sanitize = TRUE -) +write_ndjson(log_df, logfile = get_logfile(), echo = TRUE, overwrite = FALSE) } \arguments{ \item{log_df}{Data frame of log data. Rows are converted to \code{ndjson} entries, @@ -23,8 +17,6 @@ file.} \item{overwrite}{Overwrite previous log file data? Defaults to \code{FALSE}, and so will append new log entries to the log file.} - -\item{sanitizer}{Should the log data be sanitized before writing to json?} } \description{ Write ndJSON-formatted log file From 0b71dffc0fea94595198ece21ed7b4f9327b166c Mon Sep 17 00:00:00 2001 From: MEO265 <99362508+MEO265@users.noreply.github.com> Date: Tue, 7 May 2024 22:16:59 +0200 Subject: [PATCH 13/17] mnt: Suggest `utils` --- DESCRIPTION | 3 ++- R/utils.R | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d465b0c..2eddc20 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -17,7 +17,8 @@ Depends: R (>= 3.4.0) Suggests: knitr (>= 1.19), rmarkdown (>= 1.8), - testthat (>= 3.0.0) + testthat (>= 3.0.0), + utils URL: https://github.com/MEO265/loggit2 BugReports: https://github.com/MEO265/loggit2/issues RoxygenNote: 7.3.0 diff --git a/R/utils.R b/R/utils.R index b8d09d0..79bc7b3 100644 --- a/R/utils.R +++ b/R/utils.R @@ -96,9 +96,13 @@ find_call <- function() { #' #' @export convert_to_csv <- function(file, logfile = get_logfile(), unsanitize = FALSE, ...) { + if (!requireNamespace(package = "utils", quietly = TRUE)) { + stop("Package 'utils' is not available. Please install it, if you want to use this function.") + } + log <- read_logs(logfile = logfile, unsanitize = unsanitize) - write.table(log, file = file, ...) + utils::write.table(log, file = file, ...) return(invisible(NULL)) } \ No newline at end of file From 560b676a9ce6669ffc134d61878c94e3b2ba3aa1 Mon Sep 17 00:00:00 2001 From: MEO265 <99362508+MEO265@users.noreply.github.com> Date: Tue, 7 May 2024 22:29:23 +0200 Subject: [PATCH 14/17] mnt: Register on c level --- R/split_json.R | 4 ---- src/loggit.h | 7 +++++++ src/register.cpp | 14 ++++++++++++++ src/split_json.cpp | 1 + 4 files changed, 22 insertions(+), 4 deletions(-) create mode 100644 src/loggit.h create mode 100644 src/register.cpp diff --git a/R/split_json.R b/R/split_json.R index 4305b42..aed238d 100644 --- a/R/split_json.R +++ b/R/split_json.R @@ -1,7 +1,3 @@ -split_json <- function(x) { - .Call("split_json", x) -} - split_ndjson <- function(x) { .Call("split_ndjson", x) } \ No newline at end of file diff --git a/src/loggit.h b/src/loggit.h new file mode 100644 index 0000000..1ef2653 --- /dev/null +++ b/src/loggit.h @@ -0,0 +1,7 @@ +// meineFunktionen.h +#ifndef LOGGIT +#define LOGGIT + +extern "C" SEXP split_ndjson(SEXP strVecSEXP); + +#endif diff --git a/src/register.cpp b/src/register.cpp new file mode 100644 index 0000000..3491431 --- /dev/null +++ b/src/register.cpp @@ -0,0 +1,14 @@ +#include +#include +#include +#include "loggit.h" + +static const R_CallMethodDef CallEntries[] = { + {"split_ndjson", (DL_FUNC) &split_ndjson, 1}, + {NULL, NULL, 0} +}; + +void R_init_loggit2(DllInfo *dll) { + R_useDynamicSymbols(dll, FALSE); + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); +} diff --git a/src/split_json.cpp b/src/split_json.cpp index 9ed3db4..f1141e2 100644 --- a/src/split_json.cpp +++ b/src/split_json.cpp @@ -2,6 +2,7 @@ #include #include #include +#include "loggit.h" extern "C" SEXP split_json(SEXP strSEXP) { From b7ebb4dc8442c419c042d98c5afb188bdd9429bd Mon Sep 17 00:00:00 2001 From: MEO265 <99362508+MEO265@users.noreply.github.com> Date: Tue, 7 May 2024 22:46:31 +0200 Subject: [PATCH 15/17] mnt: Update codecov settings --- .covrignore | 1 + R/utils.R | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.covrignore b/.covrignore index 855dd6b..8649dac 100644 --- a/.covrignore +++ b/.covrignore @@ -1 +1,2 @@ R/zzz.R +src/register.cpp \ No newline at end of file diff --git a/R/utils.R b/R/utils.R index 79bc7b3..6b7ddea 100644 --- a/R/utils.R +++ b/R/utils.R @@ -97,7 +97,7 @@ find_call <- function() { #' @export convert_to_csv <- function(file, logfile = get_logfile(), unsanitize = FALSE, ...) { if (!requireNamespace(package = "utils", quietly = TRUE)) { - stop("Package 'utils' is not available. Please install it, if you want to use this function.") + stop("Package 'utils' is not available. Please install it, if you want to use this function.") # nocov } log <- read_logs(logfile = logfile, unsanitize = unsanitize) From ef0c03612ca3b86910d2a4f0e368127b055227fb Mon Sep 17 00:00:00 2001 From: MEO265 <99362508+MEO265@users.noreply.github.com> Date: Wed, 8 May 2024 18:21:58 +0200 Subject: [PATCH 16/17] test: Add test for ndjson error --- tests/testthat/test-handlers.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/testthat/test-handlers.R b/tests/testthat/test-handlers.R index 97b6703..2977c02 100644 --- a/tests/testthat/test-handlers.R +++ b/tests/testthat/test-handlers.R @@ -137,3 +137,7 @@ test_that("stopifnot", { }) cleanup() + +test_that("split_ndjson error", { + expect_error(split_ndjson(3L), "Input must be a character vector.") +}) From 997bf9939a7863260730abe941c78357abfa18b7 Mon Sep 17 00:00:00 2001 From: MEO265 <99362508+MEO265@users.noreply.github.com> Date: Wed, 8 May 2024 18:37:26 +0200 Subject: [PATCH 17/17] mnt: Add NEWS bullet --- NEWS.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index d38ff0e..7f7c1b9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,13 +1,17 @@ # loggit2 DEV ## Breaking changes -* Custom `sanitizer`s and `unsanitizer`s are no longer supported. This decision was made because no active user is known and this functionality severely limits further development. +* Custom `sanitizer`s and `unsanitizer`s are no longer supported. This decision was made because no active + user is known and this functionality severely limits further development. If custom `sanitizer`s were previously used, they could simply be executed before or after instead in `loggit()` or `read_logs()`. If custom sanitizer has been used to get around bugs, please report them so that they can be fixed. +* Special characters are no longer escaped by replacement, but rather by "\". ## New features * Add `convert_to_csv()` to convert log files to CSV format. * Add `with_loggit()` to log third-party code or to easily use different `loggit()`-parameters for a chunk of code. +* `NA`s are now stored as `null` in the json log. And `read_logs()` also restores these as `NA`. + This was previously (unintentionally) guaranteed by replacing the `NA` with `"__NA__"`. ## Bugfixes * `read_logs()` now correctly reads empty character values `""`, as in `{"key": ""}`, as such. @@ -17,12 +21,14 @@ ## Minor changes * `read_logs()` now returns a `data.frame` with the empty character columns "timestamp", "log_lvl" and "log_msg" instead of an empty (0x0) `data.frame` if the log file has no entries. +* The Json reading functions are more tolerant of manual changes to the log. ## Internals * `write_ndjson` no longer warns if the log contains unsanitized line breaks. This warning could only be generated by package-internal errors (therefore nonsensical in the cran package) or by a custom `sanitizer`, but in this case only this one character was specifically tested and thus provides a false sense of security. +* The package now requires compilation. This is necessary because the JSON parser was written in C++ for faster reading. # loggit2 2.2.2