Skip to content

Commit

Permalink
Merge pull request #45 from OHDSI/ignore_convention
Browse files Browse the repository at this point in the history
Add ignore capabilities and helper functions
  • Loading branch information
fdefalco authored Dec 20, 2023
2 parents e953a7e + 19104a6 commit d98d737
Show file tree
Hide file tree
Showing 10 changed files with 310 additions and 126 deletions.
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ export(buildNetworkIndex)
export(buildNetworkPerformanceIndex)
export(buildNetworkUnmappedSourceCodeIndex)
export(computeCharacterizationDifference)
export(getIgnoredReleases)
export(getIgnoredSources)
export(exportToParquet)
export(getSourceReleaseKey)
import(data.table)
Expand Down
18 changes: 12 additions & 6 deletions R/BuildDataQualityHistoryIndex.R
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,18 @@ buildDataQualityHistoryIndex <-

for (directory in directories) {
resultFile <- file.path(directory, "dq-result.json")
if (file.exists(resultFile)) {
writeLines(paste("processing", resultFile))
fileContents <- readLines(resultFile, warn = FALSE)
resultJson <- jsonlite::fromJSON(fileContents)
addResultsToIndex(resultJson)
} else {
if (file.exists(resultFile) && directory %in% AresIndexer::getIgnoredReleases(sourceFolder)) {
writeLines(paste("AresIndexIgnore file present, skipping release dq file: ", resultFile))
}
else if (file.exists(resultFile)) {

writeLines(paste("processing", resultFile))
fileContents <- readLines(resultFile, warn = FALSE)
resultJson <- jsonlite::fromJSON(fileContents)
addResultsToIndex(resultJson)

}
else {
writeLines(paste("missing", resultFile))
}
}
Expand Down
78 changes: 48 additions & 30 deletions R/BuildDataQualityIndex.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,42 +38,60 @@ buildDataQualityIndex <- function(sourceFolders, outputFolder) {

# iterate on sources
for (sourceFolder in sourceFolders) {
historicalIndex <- AresIndexer::buildDataQualityHistoryIndex(sourceFolder)
historicalFile <- file.path(sourceFolder, "data-quality-index.json")
write(jsonlite::toJSON(historicalIndex),historicalFile)

# skip index for source if ignore file present
if (file.exists(file.path(sourceFolder,".aresIndexIgnore"))){

releaseFolders <- list.dirs(sourceFolder, recursive = F)
# iterate on source releases and process all failed results
for (releaseFolder in releaseFolders) {
dataQualityResultsFile <- file.path(releaseFolder, "dq-result.json")
writeLines(paste("AresIndexIgnore file present, skipping source folder: ", sourceFolder))
}else
{
historicalIndex <- AresIndexer::buildDataQualityHistoryIndex(sourceFolder)
historicalFile <- file.path(sourceFolder, "data-quality-index.json")
write(jsonlite::toJSON(historicalIndex),historicalFile)

# process each data quality result file
if (file.exists(dataQualityResultsFile)) {
dataQualityResults <- jsonlite::fromJSON(dataQualityResultsFile)
results <- dataQualityResults$CheckResults

# for each release, generate a summary of failures by cdm_table_name
domainAggregates <- results %>% filter(failed==1) %>% count(tolower(cdmTableName))
names(domainAggregates) <- c("cdm_table_name", "count_failed")
data.table::fwrite(domainAggregates, file.path(releaseFolder,"domain-issues.csv"))
releaseFolders <- list.dirs(sourceFolder, recursive = F)
# iterate on source releases and process all failed results
for (releaseFolder in releaseFolders) {

# collect all failures from this result file for network analysis
outColNames <- c("checkName", "checkLevel", "cdmTableName", "category", "subcategory", "context", "cdmFieldName", "conceptId", "unitConceptId")
missingColNames <- setdiff(outColNames, names(results))
for (colName in missingColNames) {
writeLines(paste0("Expected column is missing in DQD results. Adding column with NA values: ", colName))
results[,colName] <- NA
# skip index for release if ignore file present

if (releaseFolder %in% AresIndexer::getIgnoredReleases(sourceFolder)){

writeLines(paste("AresIndexIgnore file present, skipping release folder: ", releaseFolder))

}else {

dataQualityResultsFile <- file.path(releaseFolder, "dq-result.json")

# process each data quality result file
if (file.exists(dataQualityResultsFile)) {
dataQualityResults <- jsonlite::fromJSON(dataQualityResultsFile)
results <- dataQualityResults$CheckResults

# for each release, generate a summary of failures by cdm_table_name
domainAggregates <- results %>% filter(failed==1) %>% count(tolower(cdmTableName))
names(domainAggregates) <- c("cdm_table_name", "count_failed")
data.table::fwrite(domainAggregates, file.path(releaseFolder,"domain-issues.csv"))

# collect all failures from this result file for network analysis
outColNames <- c("checkName", "checkLevel", "cdmTableName", "category", "subcategory", "context", "cdmFieldName", "conceptId", "unitConceptId")
missingColNames <- setdiff(outColNames, names(results))
for (colName in missingColNames) {
writeLines(paste0("Expected column is missing in DQD results. Adding column with NA values: ", colName))
results[,colName] <- NA
}
sourceFailures <- results[results[,"failed"]==1,outColNames]
sourceFailures$CDM_SOURCE_NAME <- dataQualityResults$Metadata$cdmSourceName
sourceFailures$CDM_SOURCE_ABBREVIATION <- dataQualityResults$Metadata$cdmSourceAbbreviation
sourceFailures$CDM_SOURCE_KEY <- gsub(" ","_",dataQualityResults$Metadata$cdmSourceAbbreviation)
sourceFailures$RELEASE_NAME <- format(lubridate::ymd(dataQualityResults$Metadata$cdmReleaseDate),"%Y-%m-%d")
sourceFailures$RELEASE_ID <- format(lubridate::ymd(dataQualityResults$Metadata$cdmReleaseDate),"%Y%m%d")
networkIndex <- rbind(networkIndex, sourceFailures)
} else {
writeLines(paste("missing data quality result file ",dataQualityResultsFile))
}
}
sourceFailures <- results[results[,"failed"]==1,outColNames]
sourceFailures$CDM_SOURCE_NAME <- dataQualityResults$Metadata$cdmSourceName
sourceFailures$CDM_SOURCE_ABBREVIATION <- dataQualityResults$Metadata$cdmSourceAbbreviation
sourceFailures$CDM_SOURCE_KEY <- gsub(" ","_",dataQualityResults$Metadata$cdmSourceAbbreviation)
sourceFailures$RELEASE_NAME <- format(lubridate::ymd(dataQualityResults$Metadata$cdmReleaseDate),"%Y-%m-%d")
sourceFailures$RELEASE_ID <- format(lubridate::ymd(dataQualityResults$Metadata$cdmReleaseDate),"%Y%m%d")
networkIndex <- rbind(networkIndex, sourceFailures)
} else {
writeLines(paste("missing data quality result file ",dataQualityResultsFile))
}
}
}
Expand Down
160 changes: 89 additions & 71 deletions R/BuildNetworkIndex.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,14 @@ buildNetworkIndex <- function(sourceFolders, outputFolder) {
# iterate on sources
for (sourceFolder in sourceFolders) {
writeLines(paste("processing source folder: ", sourceFolder))

# skip index for source if ignore file present
if (file.exists(file.path(sourceFolder,".aresIndexIgnore"))){

writeLines(paste("AresIndexIgnore file present, skipping source folder: ", sourceFolder))
}else
{

source <- {}

sourceCount <- sourceCount + 1
Expand All @@ -69,78 +77,86 @@ buildNetworkIndex <- function(sourceFolders, outputFolder) {
# iterate on source releases
for (releaseFolder in releaseFolders) {
writeLines(paste("processing release folder: ", releaseFolder))
dataQualityResultsFile <- file.path(releaseFolder, "dq-result.json")
personResultsFile <- file.path(releaseFolder, "person.json")
observationPeriodResultsFile <- file.path(releaseFolder, "observationperiod.json")

# add data quality details
if (file.exists(dataQualityResultsFile)) {
dataQualityResults <- jsonlite::fromJSON(dataQualityResultsFile)

# add person results
if (file.exists(personResultsFile)) {
personResults <- jsonlite::fromJSON(personResultsFile)
count_person <- sum(personResults$BIRTH_YEAR_DATA$COUNT_PERSON)
} else {
writeLines(paste("missing person results file ", personResultsFile))
}

# add observation period results
if (file.exists(observationPeriodResultsFile)) {
observationPeriodResults <- jsonlite::fromJSON(observationPeriodResultsFile)
obs_period_start <- min(observationPeriodResults$OBSERVED_BY_MONTH$MONTH_YEAR)
obs_period_end <- max(observationPeriodResults$OBSERVED_BY_MONTH$MONTH_YEAR)
} else {
writeLines(paste("missing observation period results file ", observationPeriodResultsFile))
}

source$cdm_source_name <- dataQualityResults$Metadata$cdmSourceName
source$cdm_source_abbreviation <- dataQualityResults$Metadata$cdmSourceAbbreviation
source$cdm_source_key <- gsub(" ", "_", source$cdm_source_abbreviation)
source$cdm_holder <- dataQualityResults$Metadata$cdmHolder
source$source_description <- dataQualityResults$Metadata$sourceDescription

source$releases <- rbind(
source$releases,
list(
release_name = format(lubridate::ymd(dataQualityResults$Metadata$cdmReleaseDate),"%Y-%m-%d"),
release_id = format(lubridate::ymd(dataQualityResults$Metadata$cdmReleaseDate),"%Y%m%d"),
cdm_version = dataQualityResults$Metadata$cdmVersion,
vocabulary_version = dataQualityResults$Metadata$vocabularyVersion,
dqd_version = dataQualityResults$Metadata$dqdVersion,
count_data_quality_issues = dataQualityResults$Overview$countOverallFailed,
count_data_quality_checks = dataQualityResults$Overview$countTotal,
dqd_execution_date = format(lubridate::ymd_hms(dataQualityResults$endTimestamp),"%Y-%m-%d"),
count_person = count_person,
obs_period_start = format(lubridate::ym(obs_period_start),"%Y-%m"),
obs_period_end = format(lubridate::ym(obs_period_end),"%Y-%m")

# skip index for release if ignore file present
if (releaseFolder %in% AresIndexer::getIgnoredReleases(sourceFolder)){

writeLines(paste("AresIndexIgnore file present, skipping release folder: ", releaseFolder))

}else {

dataQualityResultsFile <- file.path(releaseFolder, "dq-result.json")
personResultsFile <- file.path(releaseFolder, "person.json")
observationPeriodResultsFile <- file.path(releaseFolder, "observationperiod.json")

# add data quality details
if (file.exists(dataQualityResultsFile)) {
dataQualityResults <- jsonlite::fromJSON(dataQualityResultsFile)

# add person results
if (file.exists(personResultsFile)) {
personResults <- jsonlite::fromJSON(personResultsFile)
count_person <- sum(personResults$BIRTH_YEAR_DATA$COUNT_PERSON)
} else {
writeLines(paste("missing person results file ", personResultsFile))
}

# add observation period results
if (file.exists(observationPeriodResultsFile)) {
observationPeriodResults <- jsonlite::fromJSON(observationPeriodResultsFile)
obs_period_start <- min(observationPeriodResults$OBSERVED_BY_MONTH$MONTH_YEAR)
obs_period_end <- max(observationPeriodResults$OBSERVED_BY_MONTH$MONTH_YEAR)
} else {
writeLines(paste("missing observation period results file ", observationPeriodResultsFile))
}

source$cdm_source_name <- dataQualityResults$Metadata$cdmSourceName
source$cdm_source_abbreviation <- dataQualityResults$Metadata$cdmSourceAbbreviation
source$cdm_source_key <- gsub(" ", "_", source$cdm_source_abbreviation)
source$cdm_holder <- dataQualityResults$Metadata$cdmHolder
source$source_description <- dataQualityResults$Metadata$sourceDescription

source$releases <- rbind(
source$releases,
list(
release_name = format(lubridate::ymd(dataQualityResults$Metadata$cdmReleaseDate),"%Y-%m-%d"),
release_id = format(lubridate::ymd(dataQualityResults$Metadata$cdmReleaseDate),"%Y%m%d"),
cdm_version = dataQualityResults$Metadata$cdmVersion,
vocabulary_version = dataQualityResults$Metadata$vocabularyVersion,
dqd_version = dataQualityResults$Metadata$dqdVersion,
count_data_quality_issues = dataQualityResults$Overview$countOverallFailed,
count_data_quality_checks = dataQualityResults$Overview$countTotal,
dqd_execution_date = format(lubridate::ymd_hms(dataQualityResults$endTimestamp),"%Y-%m-%d"),
count_person = count_person,
obs_period_start = format(lubridate::ym(obs_period_start),"%Y-%m"),
obs_period_end = format(lubridate::ym(obs_period_end),"%Y-%m")
)
)
)
} else {
writeLines(paste("missing data quality result file ",dataQualityResultsFile))
}

cdmSourceFile <- file.path(releaseFolder, "cdmsource.csv")
if (file.exists(cdmSourceFile)) {
cdmSourceData <- read.csv(cdmSourceFile)
releaseIntervalData <- rbind(releaseIntervalData, cdmSourceData)
}
}

averageUpdateIntervalDays <- "n/a"
if (nrow(releaseIntervalData) > 1 ) {
processedIndex <- releaseIntervalData %>%
mutate(DAYS_ELAPSED = as.Date(CDM_RELEASE_DATE) - lag(as.Date(CDM_RELEASE_DATE))) %>%
filter(!is.na(DAYS_ELAPSED))

averageUpdateIntervalDays <- round(as.numeric(abs(mean(processedIndex$DAYS_ELAPSED)), units="days"))
}

source$releases <- source$releases[order(-dqd_execution_date)]
source$count_releases <- nrow(source$releases)
source$average_update_interval_days <- averageUpdateIntervalDays
index$sources[[sourceCount]] <- source
}
} else {
writeLines(paste("missing data quality result file ",dataQualityResultsFile))
}

cdmSourceFile <- file.path(releaseFolder, "cdmsource.csv")
if (file.exists(cdmSourceFile)) {
cdmSourceData <- read.csv(cdmSourceFile)
releaseIntervalData <- rbind(releaseIntervalData, cdmSourceData)
}

averageUpdateIntervalDays <- "n/a"
if (nrow(releaseIntervalData) > 1 ) {
processedIndex <- releaseIntervalData %>%
mutate(DAYS_ELAPSED = as.Date(CDM_RELEASE_DATE) - lag(as.Date(CDM_RELEASE_DATE))) %>%
filter(!is.na(DAYS_ELAPSED))

averageUpdateIntervalDays <- round(as.numeric(abs(mean(processedIndex$DAYS_ELAPSED)), units="days"))
}

source$releases <- source$releases[order(-dqd_execution_date)]
source$count_releases <- nrow(source$releases)
source$average_update_interval_days <- averageUpdateIntervalDays
index$sources[[sourceCount]] <- source
}
}

indexJson <- jsonlite::toJSON(index,auto_unbox = T)
write(indexJson, file.path(outputFolder,"index.json"))
Expand All @@ -149,4 +165,6 @@ buildNetworkIndex <- function(sourceFolders, outputFolder) {
}
data.table::fwrite(networkPerformanceIndex, file=paste0(outputFolder, "/network-performance.csv"))
invisible(indexJson)
}
}
}
4 changes: 2 additions & 2 deletions R/BuildNetworkPerformanceIndex.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ buildNetworkPerformanceIndex <-

options(dplyr.summarise.inform = FALSE)
networkIndex <- data.frame()
analysisDetails <- dplyr::select(Achilles::getAnalysisDetails(), c("analysis_id", "category")) %>%
rename(TASK = analysis_id)
analysisDetails <- dplyr::select(Achilles::getAnalysisDetails(), c("ANALYSIS_ID", "CATEGORY")) %>%
rename(TASK = ANALYSIS_ID)
releaseFolders <- list.dirs(sourceFolder, recursive = F)
latestRelease <- max(releaseFolders)

Expand Down
47 changes: 30 additions & 17 deletions R/BuildNetworkUnmappedSourceCodeIndex.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,23 +45,36 @@ buildNetworkUnmappedSourceCodeIndex <-
# iterate on sources
networkIndex <- data.frame()
for (sourceFolder in sourceFolders) {
# find the latest release in the source folder
releaseFolders <- list.dirs(sourceFolder, recursive = F)
releaseFolders <- sort(releaseFolders, decreasing = T)
if (length(releaseFolders) > 0) {
latestReleaseFolder <- releaseFolders[1]
completenessFile <-
file.path(latestReleaseFolder, "quality-completeness.csv")
cdmSourceFile <-
file.path(latestReleaseFolder, "cdmsource.csv")
if (file.exists(cdmSourceFile)) {
if (file.exists(completenessFile)) {
cdmSourceData <- read.csv(cdmSourceFile)
completenessData <- read.csv(completenessFile)
withSourceValue <- dplyr::filter(completenessData, nchar(stringr::str_trim(completenessData$SOURCE_VALUE))>0)
if (nrow(withSourceValue >0)) {
withSourceValue$DATA_SOURCE <- cdmSourceData$CDM_SOURCE_ABBREVIATION
networkIndex <- dplyr::bind_rows(networkIndex, withSourceValue)

# skip index for source if ignore file present

if (file.exists(file.path(sourceFolder,".aresIndexIgnore"))){

writeLines(paste("AresIndexIngore file present, skipping source folder: ", sourceFolder))
}else
{
# skip for releases where ignore file present
releaseFolders<- releaseFolders[!releaseFolders %in% AresIndexer::getIgnoredReleases(sourceFolder)]

# find the latest release in the source folder
releaseFolders <- list.dirs(sourceFolder, recursive = F)
releaseFolders <- sort(releaseFolders, decreasing = T)

if (length(releaseFolders) > 0) {
latestReleaseFolder <- releaseFolders[1]
completenessFile <-
file.path(latestReleaseFolder, "quality-completeness.csv")
cdmSourceFile <-
file.path(latestReleaseFolder, "cdmsource.csv")
if (file.exists(cdmSourceFile)) {
if (file.exists(completenessFile)) {
cdmSourceData <- read.csv(cdmSourceFile)
completenessData <- read.csv(completenessFile)
withSourceValue <- dplyr::filter(completenessData, nchar(stringr::str_trim(completenessData$SOURCE_VALUE))>0)
if (nrow(withSourceValue >0)) {
withSourceValue$DATA_SOURCE <- cdmSourceData$CDM_SOURCE_ABBREVIATION
networkIndex <- dplyr::bind_rows(networkIndex, withSourceValue)
}
}
}
}
Expand Down
Loading

0 comments on commit d98d737

Please sign in to comment.