From e8ce898d4154881400d3e0368b01fe2dd5f323a4 Mon Sep 17 00:00:00 2001
From: combesf <florence.combes@cea.fr>
Date: Tue, 6 Apr 2021 14:32:47 +0200
Subject: [PATCH 1/2] R code and wrapper version

R code syntax correction
and wrapper version update
---
 .../add_protein_features.R                    | 258 ++++++++++--------
 .../add_protein_features.xml                  |   2 +-
 2 files changed, 143 insertions(+), 117 deletions(-)

diff --git a/tools/proteore_prot_features/add_protein_features.R b/tools/proteore_prot_features/add_protein_features.R
index 41df2a4..01e38e7 100644
--- a/tools/proteore_prot_features/add_protein_features.R
+++ b/tools/proteore_prot_features/add_protein_features.R
@@ -1,195 +1,221 @@
 # Read file and return file content as data.frame
-read_file <- function(path,header){
-  file <- try(read.csv(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="", check.names = F),silent=TRUE)
-  if (inherits(file,"try-error")){
+read_file <- function(path, header) {
+  file <- try(read.csv(path, header = header, sep = "\t", quote = "",
+                       stringsAsFactors = FALSE, check.names = FALSE),
+              silent = TRUE)
+  if (inherits(file, "try-error")) {
     stop("File not found !")
   }else{
-    file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE]
+    file <- file[!apply(is.na(file) | file == "", 1, all), , drop = FALSE]
     return(file)
   }
 }
 
-order_columns <- function (df,ncol,id_type,file){
-  if (id_type=="Uniprot_AC"){ncol=ncol(file)}
-  if (ncol==1){ #already at the right position
-    return (df)
+order_columns <- function(df, ncol, id_type, file) {
+  if (id_type == "Uniprot_AC") {
+    ncol <- ncol(file)
+  }
+  if (ncol == 1) { #already at the right position
+    return(df)
   } else {
-    df = df[,c(2:ncol,1,(ncol+1):dim.data.frame(df)[2])]
+    df <- df[, c(2:ncol, 1, seq_len(ncol(df) - ncol) + ncol)]
   }
-  return (df)
+  return(df)
 }
 
-get_list_from_cp <-function(list){
-  list = gsub(";","\t",list)
-  list = strsplit(list, "[ \t\n]+")[[1]]
-  list = gsub("NA","",list)
-  list = list[list != ""]    #remove empty entry
-  list = gsub("-.+", "", list)  #Remove isoform accession number (e.g. "-2")
+get_list_from_cp <- function(list) {
+  list <- gsub(";", "\t", list)
+  list <- strsplit(list, "[ \t\n]+")[[1]]
+  list <- gsub("^NA$", "", list)  #blank whole "NA" entries, keep ids with "NA"
+  list <- list[list != ""]    #remove empty entry
+  list <- gsub("-.+", "", list)  #Remove isoform accession number (e.g. "-2")
   return(list)
 }
 
-get_args <- function(){
-  
+get_args <- function() {
   ## Collect arguments
   args <- commandArgs(TRUE)
-  
+
   ## Default setting when no arguments passed
-  if(length(args) < 1) {
+  if (length(args) < 1) {
     args <- c("--help")
   }
-  
+
   ## Help section
-  if("--help" %in% args) {
+  if ("--help" %in% args) {
     cat("Selection and Annotation HPA
         Arguments:
-          --inputtype: type of input (list of id or filename)
+        --inputtype: type of input (list of id or filename)
         --input: input
         --nextprot: path to nextprot information file
         --column: the column number which you would like to apply...
         --header: true/false if your file contains a header
         --type: the type of input IDs (Uniprot_AC/EntrezID)
-        --pc_features: IsoPoint,SeqLength,MW,Chr,SubcellLocations,Diseases,protein_name,function,post_trans_mod,protein_family,pathway
+        --pc_features: IsoPoint,SeqLength,MW,Chr,SubcellLocations,Diseases,
+        protein_name,function,post_trans_mod,protein_family,pathway
         --output: text output filename \n")
-    
-    q(save="no")
+
+  q(save = "no")
   }
-  
-  parseArgs <- function(x) strsplit(sub("^--", "", x), "=")
-  argsDF <- as.data.frame(do.call("rbind", parseArgs(args)))
-  args <- as.list(as.character(argsDF$V2))
-  names(args) <- argsDF$V1
-  
+
+  parseargs <- function(x) strsplit(sub("^--", "", x), "=")
+  argsdf <- as.data.frame(do.call("rbind", parseargs(args)))
+  args <- as.list(as.character(argsdf$V2))
+  names(args) <- argsdf$V1
+
   return(args)
 }
 
-str2bool <- function(x){
-  if (any(is.element(c("t","true"),tolower(x)))){
-    return (TRUE)
-  }else if (any(is.element(c("f","false"),tolower(x)))){
-    return (FALSE)
+str2bool <- function(x) {
+  if (any(is.element(c("t", "true"), tolower(x)))) {
+    return(TRUE)
+  }else if (any(is.element(c("f", "false"), tolower(x)))) {
+    return(FALSE)
   }else{
     return(NULL)
   }
 }
 
 #take data frame, return  data frame
-split_ids_per_line <- function(line,ncol){
-  
+split_ids_per_line <- function(line, ncol) {
   #print (line)
-  header = colnames(line)
-  line[ncol] = gsub("[[:blank:]]|\u00A0","",line[ncol])
-  
-  if (length(unlist(strsplit(as.character(line[ncol]),";")))>1) {
-    if (length(line)==1 ) {
-      lines = as.data.frame(unlist(strsplit(as.character(line[ncol]),";")),stringsAsFactors = F)
+  header <- colnames(line)
+  line[ncol] <- gsub("[[:blank:]]|\u00A0", "", line[ncol])
+
+  if (length(unlist(strsplit(as.character(line[ncol]), ";"))) > 1) {
+    if (length(line) == 1) {
+      lines <- as.data.frame(unlist(strsplit(as.character(line[ncol]), ";")),
+                             stringsAsFactors = FALSE)
     } else {
-      if (ncol==1) {                                #first column
-        lines = suppressWarnings(cbind(unlist(strsplit(as.character(line[ncol]),";")), line[2:length(line)]))
-      } else if (ncol==length(line)) {                 #last column
-        lines = suppressWarnings(cbind(line[1:ncol-1],unlist(strsplit(as.character(line[ncol]),";"))))
+      if (ncol == 1) {                                #first column
+        lines <- suppressWarnings(cbind(unlist(strsplit(
+          as.character(line[ncol]), ";")), line[2:length(line)]))
+      } else if (ncol == length(line)) {                 #last column
+        lines <- suppressWarnings(cbind(line[seq_len(ncol - 1)], unlist(
+          strsplit(as.character(line[ncol]), ";"))))
       } else {
-        lines = suppressWarnings(cbind(line[1:ncol-1], unlist(strsplit(as.character(line[ncol]),";"),use.names = F), line[(ncol+1):length(line)]))
+        lines <- suppressWarnings(cbind(line[seq_len(ncol - 1)], unlist(
+          strsplit(as.character(line[ncol]), ";"), use.names = FALSE),
+          line[(ncol + 1):length(line)]))
       }
     }
-    colnames(lines)=header
+    colnames(lines) <- header
     return(lines)
   } else {
     return(line)
   }
 }
 
-#create new lines if there's more than one id per cell in the columns in order to have only one id per line
-one_id_one_line <-function(tab,ncol){
-  
-  if (ncol(tab)>1){
-    
-    tab[,ncol] = sapply(tab[,ncol],function(x) gsub("[[:blank:]]","",x))
-    header=colnames(tab)
-    res=as.data.frame(matrix(ncol=ncol(tab),nrow=0))
-    for (i in 1:nrow(tab) ) {
-      lines = split_ids_per_line(tab[i,],ncol)
-      res = rbind(res,lines)
+#create new lines if there's more than one id per cell in the columns
+#in order to have only one id per line (empty and NA ids are dropped)
+one_id_one_line <- function(tab, ncol) {
+
+  if (ncol(tab) > 1) {
+
+    tab[, ncol] <- gsub("[[:blank:]]", "", tab[, ncol])
+    header <- colnames(tab)
+    res <- as.data.frame(matrix(ncol = ncol(tab), nrow = 0))
+    for (i in seq_len(nrow(tab))) {
+      lines <- split_ids_per_line(tab[i, ], ncol)
+      res <- rbind(res, lines)
     }
   }else {
-    res = unlist(sapply(tab[,1],function(x) strsplit(x,";")),use.names = F)
-    res = data.frame(res[which(!is.na(res[res!=""]))],stringsAsFactors = F)
-    colnames(res)=colnames(tab)
+    res <- unlist(strsplit(tab[, 1], ";"), use.names = FALSE)
+    res <- data.frame(res[!is.na(res) & res != ""], stringsAsFactors = FALSE)
+    colnames(res) <- colnames(tab)
   }
   return(res)
 }
 
 # Get information from neXtProt
-get_nextprot_info <- function(nextprot,input,pc_features,localization,diseases_info){
-  cols = c("NextprotID",pc_features)
-  cols=cols[cols!="None"]
-  info = nextprot[match(input,nextprot$NextprotID),intersect(colnames(nextprot),cols)]
+get_nextprot_info <- function(nextprot, input, pc_features, localization,
+                              diseases_info) {
+  cols <- c("NextprotID", pc_features)
+  cols <- cols[cols != "None"]
+  info <- nextprot[match(input, nextprot$NextprotID),
+                   intersect(colnames(nextprot), cols)]
   return(info)
 }
 
-protein_features = function() {
+protein_features <- function() {
+
+  args <- get_args()
+
+  #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/
+  #add_protein_features/args.rda")
+  #load("/home/dchristiany/proteore_project/ProteoRE/tools/add_protein_features
+  #/args.rda")
 
-  args <- get_args()  
-  
-  #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/add_protein_features/args.rda")
-  #load("/home/dchristiany/proteore_project/ProteoRE/tools/add_protein_features/args.rda")
-  
   #setting variables
-  inputtype = args$inputtype
+  inputtype <- args$inputtype
   if (inputtype == "copy_paste") {
-    input = get_list_from_cp(args$input)
-    file = data.frame(input,stringsAsFactors = F)
-    ncol=1
+    input <- get_list_from_cp(args$input)
+    file <- data.frame(input, stringsAsFactors = FALSE)
+    ncol <- 1
   } else if (inputtype == "file") {
-    filename = args$input
-    ncol = args$column
+    filename <- args$input
+    ncol <- args$column
     # Check ncol
     if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) {
       stop("Please enter an integer for level")
     } else {
-      ncol = as.numeric(gsub("c", "", ncol))
+      ncol <- as.numeric(gsub("c", "", ncol))
     }
-    
-    header = str2bool(args$header)
-    file = read_file(filename, header)                                                    # Get file content
-    if (any(grep(";",file[,ncol]))) {file = one_id_one_line(file,ncol)}
-    if (args$type == "NextprotID" && ! "NextprotID" %in% colnames(file)) { colnames(file)[ncol] <- "NextprotID" 
-    } else if (args$type == "NextprotID" && "NextprotID" %in% colnames(file) && match("NextprotID",colnames(file))!=ncol ) { 
-      colnames(file)[match("NextprotID",colnames(file))] <- "old_NextprotID" 
-      colnames(file)[ncol] = "NextprotID"
+
+    header <- str2bool(args$header)
+    file <- read_file(filename, header)  # Get file content
+    if (any(grep(";", file[, ncol]))) {
+      file <- one_id_one_line(file, ncol)
+    }
+    if (args$type == "NextprotID" && ! "NextprotID" %in% colnames(file)) {
+      colnames(file)[ncol] <- "NextprotID"
+    } else if (args$type == "NextprotID" && "NextprotID" %in% colnames(file)
+               && match("NextprotID", colnames(file)) != ncol) {
+      colnames(file)[match("NextprotID", colnames(file))] <- "old_NextprotID"
+      colnames(file)[ncol] <- "NextprotID"
     }
   }
 
   # Read reference file
-  nextprot = read_file(args$nextprot,T)
-  
+  nextprot <- read_file(args$nextprot, TRUE)
+
   # Parse arguments
-  id_type = args$type
-  pc_features = strsplit(args$pc_features, ",")[[1]]
-  output = args$output
-
-  # Change the sample ids if they are Uniprot_AC ids to be able to match them with
-  # Nextprot data
-  if (id_type=="Uniprot_AC"){
-    NextprotID = gsub("^NX_$","",gsub("^","NX_",file[,ncol]))
-    file = cbind(file,NextprotID)
-    if (inputtype=="copy_paste") {colnames(file)[1]="Uniprot-AC"}
-    ncol=ncol(file)
+  id_type <- args$type
+  pc_features <- strsplit(args$pc_features, ",")[[1]]
+  output <- args$output
+
+  # Change the sample ids if they are Uniprot_AC ids to be able to match
+  # them with Nextprot data
+  if (id_type == "Uniprot_AC") {
+    nextprotid <- gsub("^NX_$", "", gsub("^", "NX_", file[, ncol]))
+    file <- cbind(file, nextprotid)
+    if (inputtype == "copy_paste") {
+      colnames(file)[1] <- "Uniprot-AC"
+    }
+    ncol <- ncol(file)
   }
-  NextprotID = file[,ncol]
+  nextprotid <- file[, ncol]
 
   #Select user input protein ids in nextprot
-  #NextprotID = unique(NextprotID[which(!is.na(NextprotID[NextprotID!=""]))])
-  if (all(!NextprotID %in% nextprot[,1])){
-    write.table("None of the input ids can be found in Nextprot",file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE)
+  #nextprotid = unique(nextprotid[which(!is.na(nextprotid[nextprotid!=""]))])
+  if (all(!nextprotid %in% nextprot[, 1])) {
+    write.table("None of the input ids can be found in Nextprot", file = output,
+                sep = "\t", quote = FALSE, col.names = TRUE, row.names = FALSE)
   } else {
-    res <- get_nextprot_info(nextprot,NextprotID,pc_features)
-    res = res[!duplicated(res$NextprotID),]
-    output_content = merge(file, res,by.x=ncol,by.y="NextprotID",incomparables = NA,all.x=T)
-    output_content = order_columns(output_content,ncol,id_type,file)
-    if (id_type=="Uniprot_AC"){output_content = output_content[,-which(colnames(output_content)=="NextprotID")]}      #remove nextprotID column
-    output_content <- as.data.frame(apply(output_content, c(1,2), function(x) gsub("^$|^ $", NA, x)))  #convert "" et " " to NA
-    write.table(output_content, output, row.names = FALSE, sep = "\t", quote = FALSE)
-  } 
-  
+    res <- get_nextprot_info(nextprot, nextprotid, pc_features)
+    res <- res[!duplicated(res$NextprotID), ]
+    output_content <- merge(file, res, by.x = ncol, by.y = "NextprotID",
+                            incomparables = NA, all.x = TRUE)
+    output_content <- order_columns(output_content, ncol, id_type, file)
+    if (id_type == "Uniprot_AC") {
+      output_content <- output_content[, -which(colnames(output_content) ==
+                              "nextprotid")]
+    } #remove the helper nextprotid column added for Uniprot_AC input
+    output_content <- as.data.frame(apply(output_content, c(1, 2),
+                function(x) gsub("^$|^ $", NA, x)))  #convert "" and " " to NA
+    write.table(output_content, output, row.names = FALSE, sep = "\t",
+                quote = FALSE)
+  }
+
 }
 protein_features()
diff --git a/tools/proteore_prot_features/add_protein_features.xml b/tools/proteore_prot_features/add_protein_features.xml
index fb35775..0786d70 100644
--- a/tools/proteore_prot_features/add_protein_features.xml
+++ b/tools/proteore_prot_features/add_protein_features.xml
@@ -1,4 +1,4 @@
-<tool id="prot_features" name="Add protein features" version="2020.08.17">
+<tool id="prot_features" name="Add protein features" version="2021.04.06">
 <description>[neXtProt, Human]
 </description>
 <requirements>

From 9095ceda4f503470596c877539abe9c640a7dd3b Mon Sep 17 00:00:00 2001
From: combesf <florence.combes@cea.fr>
Date: Tue, 6 Apr 2021 14:37:41 +0200
Subject: [PATCH 2/2] Update add_protein_features.R

syntax again
---
 tools/proteore_prot_features/add_protein_features.R | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tools/proteore_prot_features/add_protein_features.R b/tools/proteore_prot_features/add_protein_features.R
index 01e38e7..7485749 100644
--- a/tools/proteore_prot_features/add_protein_features.R
+++ b/tools/proteore_prot_features/add_protein_features.R
@@ -196,8 +196,6 @@ protein_features <- function() {
   }
   nextprotid <- file[, ncol]
 
-  #Select user input protein ids in nextprot
-  #nextprotid = unique(nextprotid[which(!is.na(nextprotid[nextprotid!=""]))])
   if (all(!nextprotid %in% nextprot[, 1])) {
     write.table("None of the input ids can be found in Nextprot", file = output,
                 sep = "\t", quote = FALSE, col.names = TRUE, row.names = FALSE)