Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GO profiles Fixes #292

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,6 @@ tool_test_output.html
tool_test_output.json
.vscode
bioconda_recipes
.gitignore
.gitignore
.Rproj.user
.Rhistory
11 changes: 1 addition & 10 deletions tools/proteore_goprofiles/.shed.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,4 @@ categories: [Proteomics]
description: ProteoRE goProfiles
long_description: Statistical analysis of functional profiles
name: proteore_goprofiles
owner: proteore
include:
- README.rst
- goprofiles.R
- goprofiles.xml
- test-data/ID_Converted_FKW_Lacombe_et_al_2017_OK.txt
- test-data/GO_Profile_text_output.txt
- test-data/profile.BP.pdf
- test-data/profile.CC.pdf
- test-data/profile.MF.pdf
owner: galaxyp
247 changes: 133 additions & 114 deletions tools/proteore_goprofiles/goprofiles.R
Original file line number Diff line number Diff line change
@@ -1,40 +1,45 @@
options(warn=-1) #TURN OFF WARNINGS !!!!!!
options(warn = -1) #TURN OFF WARNINGS !!!!!!

# Load necessary libraries
suppressMessages(library(goProfiles,quietly = TRUE))
suppressMessages(library(goProfiles, quietly = TRUE))

# Read file and return file content as data.frame
read_file <- function(path,header){
file <- try(read.csv(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="\"", check.names = F),silent=TRUE)
if (inherits(file,"try-error")){
read_file <- function(path, header) {
file <- try(read.csv(path, header = header,
sep = "\t", stringsAsFactors = FALSE,
quote = "\"", check.names = F), silent = TRUE)
if (inherits(file, "try-error")) {
stop("File not found !")
}else{
return(file)
}
}

#convert a string to boolean
str2bool <- function(x){
if (any(is.element(c("t","true"),tolower(x)))){
return (TRUE)
}else if (any(is.element(c("f","false"),tolower(x)))){
return (FALSE)
str2bool <- function(x) {
if (any(is.element(c("t", "true"), tolower(x)))) {
return(TRUE)
}else if (any(is.element(c("f", "false"), tolower(x)))) {
return(FALSE)
}else{
return(NULL)
}
}

check_ids <- function(vector,type) {
uniprot_pattern = "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$"
entrez_id = "^([0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$"
if (type == "Entrez"){
return(grepl(entrez_id,vector))
check_ids <- function(vector, type) {
# nolint start
uniprot_pattern <-
"^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$"
entrez_id <- "^([0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$"
# nolint end
if (type == "Entrez") {
return(grepl(entrez_id, vector))
} else if (type == "UniProt") {
return(grepl(uniprot_pattern,vector))
return(grepl(uniprot_pattern, vector))
}
}

getprofile = function(ids, id_type, level, duplicate,species) {
getprofile <- function(ids, id_type, level, duplicate, species) {
####################################################################
# Arguments
# - ids: list of input IDs
Expand All @@ -43,89 +48,100 @@ getprofile = function(ids, id_type, level, duplicate,species) {
# - duplicate: if the duplicated IDs should be removed or not (TRUE/FALSE)
# - species
####################################################################

library(species, character.only = TRUE, quietly = TRUE)
if (species=="org.Hs.eg.db"){
package=org.Hs.eg.db
} else if (species=="org.Mm.eg.db"){
package=org.Mm.eg.db
} else if (species=="org.Rn.eg.db"){
package=org.Rn.eg.db

if (species == "org.Hs.eg.db") {
package <- org.Hs.eg.db #nolint
} else if (species == "org.Mm.eg.db") {
package <- org.Mm.eg.db #nolint
} else if (species == "org.Rn.eg.db") {
package <- org.Rn.eg.db #nolint
}

# Check if level is number
if (! as.numeric(level) %% 1 == 0) {
stop("Please enter an integer for level")
} else {
level = as.numeric(level)
level <- as.numeric(level)
}
#genes = as.vector(file[,ncol])


# Extract Gene Entrez ID
if (id_type == "Entrez") {
id = select(package, ids, "ENTREZID", multiVals = "first")
id <- select(package, ids, "ENTREZID", multiVals = "first") #nolint
} else {
id = select(package, ids, "ENTREZID", "UNIPROT", multiVals = "first")
id <- select(package, ids, "ENTREZID", "UNIPROT", multiVals = "first") #nolint
}
if (duplicate) {
id <- unique(id)
}
if (duplicate) { id = unique(id) }
genes_ids = id$ENTREZID[which( ! is.na(id$ENTREZID))]
NAs = id$UNIPROT[which(is.na(id$ENTREZID))] # IDs that have NA ENTREZID

genes_ids <- id$ENTREZID[which(!is.na(id$ENTREZID))]
nas <- id$UNIPROT[which(is.na(id$ENTREZID))] # IDs that have NA ENTREZID #nolint

# Create basic profiles
profile.CC = basicProfile(genes_ids, onto='CC', level=level, orgPackage=species, empty.cats=F, ord=T, na.rm=T)
profile.BP = basicProfile(genes_ids, onto='BP', level=level, orgPackage=species, empty.cats=F, ord=T, na.rm=T)
profile.MF = basicProfile(genes_ids, onto='MF', level=level, orgPackage=species, empty.cats=F, ord=T, na.rm=T)
profile.ALL = basicProfile(genes_ids, onto='ANY', level=level, orgPackage=species, empty.cats=F, ord=T, na.rm=T)
# Print profile
# printProfiles(profile)

profile.CC <-
basicProfile(genes_ids, onto = "CC", level = level,
orgPackage = species, empty.cats = F, ord = T, na.rm = T)
profile.BP <-
basicProfile(genes_ids, onto = "BP", level = level,
orgPackage = species, empty.cats = F, ord = T, na.rm = T)
profile.MF <-
basicProfile(genes_ids, onto = "MF", level = level,
orgPackage = species, empty.cats = F, ord = T, na.rm = T)
profile.ALL <-
basicProfile(genes_ids, onto = "ANY", level = level,
orgPackage = species, empty.cats = F, ord = T, na.rm = T)

return(c(profile.CC, profile.MF, profile.BP, profile.ALL))
}

#return height and width of plot in inches from profile
plot_size_from_nb_onto <- function(profile){
width=10
range = seq(25, 2000, by=25)
names(range) = seq(5,242, by=3)
nb_onto = round(nrow(profile[[1]])/25)*25
if (nb_onto < 25) {nb_onto = 25}
if (nb_onto <= 2000) {
height= as.integer(names(which(range==nb_onto)))
} else {
height=250
}
return (c(width,height))
plot_size_from_nb_onto <- function(profile) {
width <- 10
range <- seq(25, 2000, by = 25)
names(range) <- seq(5, 242, by = 3)
nb_onto <- round(nrow(profile[[1]]) / 25) * 25
if (nb_onto < 25) {
nb_onto <- 25
}
if (nb_onto <= 2000) {
height <- as.integer(names(which(range == nb_onto)))
} else {
height <- 250
}
return(c(width, height))
}

make_plot <- function(profile,percent,title,onto,plot_opt){
tmp <- plot_size_from_nb_onto (profile)
make_plot <- function(profile, percent, title, onto, plot_opt) {

tmp <- plot_size_from_nb_onto(profile)
width <- tmp[1]
height <- tmp[2]

if (plot_opt == "PDF") {
file_name=paste("profile_",onto,".pdf",collapse="",sep="")
pdf(file_name, width=width, heigh=height)
} else if (plot_opt == "JPEG"){
file_name=paste("profile_",onto,".jpeg",collapse="",sep="")
jpeg(file_name,width=width, height=height, units = "in", res=100)
} else if (plot_opt == "PNG"){
file_name=paste("profile_",onto,".png",collapse="",sep="")
png(file_name,width=width, height=height, units = "in", res=100)
file_name <- paste("profile_", onto, ".pdf", collapse = "", sep = "")
pdf(file_name, width = width, height = height)
} else if (plot_opt == "JPEG") {
file_name <- paste("profile_", onto, ".jpeg", collapse = "", sep = "")
jpeg(file_name, width = width, height = height, units = "in", res = 100)
} else if (plot_opt == "PNG") {
file_name <- paste("profile_", onto, ".png", collapse = "", sep = "")
png(file_name, width = width, height = height, units = "in", res = 100)
}
plotProfiles(profile, percentage=percent, multiplePlots=FALSE, aTitle=title)
plotProfiles(profile, percentage = percent,
multiplePlots = FALSE, aTitle = title)
dev.off()
}

goprofiles = function() {
goprofiles <- function() {
args <- commandArgs(TRUE)
if(length(args)<1) {
if (length(args) < 1) {
args <- c("--help")
}

# Help section
if("--help" %in% args) {
if ("--help" %in% args) {
cat("Selection and Annotation HPA
Arguments:
--input_type: type of input (list of id or filename)
Expand All @@ -141,63 +157,66 @@ goprofiles = function() {
--duplicate: remove dupliate input IDs (true/false)
--text_output: text output filename \n
--species")
q(save="no")
q(save = "no")
}

# Parse arguments
parseArgs <- function(x) strsplit(sub("^--", "", x), "=")
argsDF <- as.data.frame(do.call("rbind", parseArgs(args)))
args <- as.list(as.character(argsDF$V2))
names(args) <- argsDF$V1

#save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/goprofiles/args.Rda")
#load("/home/dchristiany/proteore_project/ProteoRE/tools/goprofiles/args.Rda")

id_type = args$id_type
input_type = args$input_type
parse_args <- function(x) strsplit(sub("^--", "", x), "=")
args_df <- as.data.frame(do.call("rbind", parse_args(args)))
args <- as.list(as.character(args_df$V2))
names(args) <- args_df$V1


id_type <- args$id_type
input_type <- args$input_type
if (input_type == "text") {
input = unlist(strsplit(strsplit(args$input, "[ \t\n]+")[[1]],";"))
input <- unlist(strsplit(strsplit(args$input, "[ \t\n]+")[[1]], ";"))
} else if (input_type == "file") {
filename = args$input
ncol = args$ncol
filename <- args$input
ncol <- args$ncol
# Check ncol
if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) {
stop("Please enter an integer for level")
} else {
ncol = as.numeric(gsub("c", "", ncol))
ncol <- as.numeric(gsub("c", "", ncol))
}
header = str2bool(args$header)
header <- str2bool(args$header)
# Get file content
file = read_file(filename, header)
file <- read_file(filename, header)
# Extract Protein IDs list
input = unlist(strsplit(as.character(file[,ncol]),";"))
input <- unlist(strsplit(as.character(file[, ncol]), ";"))
}
input = input [which(!is.na(gsub("NA",NA,input)))]

if (! any(check_ids(input,id_type))){
stop(paste(id_type,"not found in your ids list, please check your IDs in input or the selected column of your input file"))
input <- input [which(!is.na(gsub("NA", NA, input)))]

if (! any(check_ids(input, id_type))) {
stop(paste(id_type,
"not found in your ids list, please check your IDs in input
or the selected column of your input file"))
}
ontoopt = strsplit(args$onto_opt, ",")[[1]]
onto_pos = as.integer(gsub("BP",3,gsub("MF",2,gsub("CC",1,ontoopt))))
plotopt = args$plot_opt
level = args$level
per = as.logical(args$per)
title = args$title
duplicate = str2bool(args$duplicate)
text_output = args$text_output
species=args$species

profiles = getprofile(input, id_type, level, duplicate,species)

ontoopt <- strsplit(args$onto_opt, ",")[[1]]
onto_pos <- as.integer(gsub("BP", 3, gsub("MF", 2, gsub("CC", 1, ontoopt))))
plotopt <- args$plot_opt
level <- args$level
per <- as.logical(args$per)
title <- args$title
duplicate <- str2bool(args$duplicate)
text_output <- args$text_output
species <- args$species

profiles <- getprofile(input, id_type, level, duplicate, species)

for (index in onto_pos) {
onto = names(profiles[index])
profile=profiles[index]
make_plot(profile,per,title,onto,plotopt)
text_output=paste("goProfiles_",onto,"_",title,".tsv",sep="",collapse="")
profile = as.data.frame(profile)
profile <- as.data.frame(apply(profile, c(1,2), function(x) gsub("^$|^ $", NA, x))) #convert "" and " " to NA
write.table(profile, text_output, sep="\t", row.names = FALSE, quote=FALSE, col.names = T)
onto <- names(profiles[index])
profile <- profiles[index]
make_plot(profile, per, title, onto, plotopt)
text_output <- paste("goProfiles_", onto, "_",
title, ".tsv", sep = "", collapse = "")
profile <- as.data.frame(profile)
profile <- as.data.frame(apply(profile, c(1, 2),
function(x) gsub("^$|^ $", NA, x))) #convert "" and " " to NA
write.table(profile, text_output, sep = "\t",
row.names = FALSE, quote = FALSE, col.names = T)
}
}

Expand Down
Loading