-
Notifications
You must be signed in to change notification settings - Fork 0
/
2_cgh_dictionary_cytoband.R
executable file
·199 lines (161 loc) · 7 KB
/
2_cgh_dictionary_cytoband.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
#################################################
# USAGE
#
# Given a dataset, this code will give the 0-based start and stop indices for
# the sections in the chromosome arms. The argument "subdir" allows you to create
#subdirectories to run modifications of the original directories if needed.
#
# Input: aCGH file with no missing values formatted as specified in TAaCGH_Manual
# If the name of the dataset is "set", the name of the file should end with "data_full.txt"
# that is set_data_full.txt
#
# Output: Tab delimited file named set_dict_cyto.txt with the following columns:
# "Chrom", "Arm", "Beg", "End", "Length", "Segment", "bpStart", "bpEnd", "CytoStart", "CytoEnd"
# where "bpStart", "bpEnd" are the start-index and stop-index repectively for the section.
# The index is 0-based, so downstream analysis with python or other 0-start languages
# can use the dictionary as is. Otherwise, in 1-start languages (R, etc.), the code will
# need to account for this.
#################################################
# COMMAND LINE ARGUMENTS
# 4. dataSet: short name for dataSet (e.g. set)
# 5. numParts: number of parts to split the dictionary. Usually 8
# 6. action: arms, sections
# 7. segLength: Section size (Best: 20 to 50). In the case of arms it will take the full arm, but
# segLength will be the minimum number of probes to run a specific arm.
# 8. subdir a directory within /dataSet dir to read the dictionaries
# use arms if action=arms or sect if action=sections
# you can use a different directory if running subsets of the original dictionaries
# EXAMPLE
# Note: use vanilla when testing and slave when ready to avoid the log
# R --vanilla --args set 8 arms 20 arms < 2_cgh_dictionary_cytoband.R
# R --vanilla --args horlings 8 sections 20 sect < 2_cgh_dictionary_cytoband.R
#library(gtools) #it looks it doesn't need this library
# Get the command line arguments
args = commandArgs();
dataSet <- args[4];
numParts <- as.integer(args[5]);
action <- args[6];
segLength <- as.integer(args[7]);
subdir <- args[8];
# # to run locally
# dataSet <- "kwek8p11q";
# numParts <- 1;
# action <- "sections";
# segLength <- 20;
# subdir <- "sect";
###############################
# FUNCTIONS NEEDED
slice <- function(x,n) {
N <- length(x);
lapply(seq(1, N, n), function(i) x[i:min(i+n-1,N)]);
}
#begPath <- "~/Research";
begPath <- "..";
DictPath <- paste(begPath, 'Data', dataSet,subdir, sep='/');
if(!file.exists(DictPath)) {
dir.create(DictPath)
}
###############################
# READ aCGH FILE
dataFile <- paste(dataSet, "data", "full.txt", sep="_");
dataPath <- paste(begPath, "Data", dataSet, dataFile, sep="/");
# path to create a duplicate
dupFile <- paste(dataSet, "data", "orig.txt", sep="_");
dupPath <- paste(begPath, "Data", dataSet, dupFile, sep="/");
print(dataPath);
data <- read.table(dataPath, header=T, sep='\t', comment.char="", stringsAsFactors = FALSE);
# Making sure the dataset was in the right order. Will replace original file
# saving a copy of the original file under set_data_orig.txt
# if Chrom has characters (like "X") will read as character variable and
# will have produce a weird order (I believe it doesn't matter)
#TODO will be nice to have a proper routine to check for format complience
if(!file.exists(dupPath)) {
write.table(data,dupPath,sep='\t',row.names = FALSE)
}
data<-data[order(data$Chrom, data$Arm, data$bp),]
write.table(data,dataPath,sep='\t',row.names = FALSE)
###############################
# BEGIN PROGRAM
# Strip out the full list of chromosomes and arms
chromList <- data$Chrom;
armList <- data$Arm;
# Determine the unique chromosomes
chroms <- unique(chromList);
arms <- c("p", "q");
dict <- c();
for(chrom in chroms) {
for(arm in arms) {
# Collect the indices
indices <- intersect(which(data$Chrom == chrom), which(data$Arm == arm));
if(length(indices) != 0) {
# The -1s make it so the indices are 0-based for later use
# in the python scripts hom_stats.py and net_stats.py
beg <- indices[1] - 1;
end <- indices[length(indices)] - 1;
armLength <- end - beg;
#if(armLength > minProbes) {
if(armLength > segLength) {
if(action == "arms") {
dict <- rbind(dict, c(chrom, arm, beg, end, end-beg, 1));
} else if(action == "sections") {
# Initialize variables to track location of segment for while loop
segNum <- 1;
#segLength <- 20;
segAdvance <- round(segLength/2);
segBegIndex <- beg;
segEndIndex <- beg + segLength - 1;
while(segEndIndex <= end) {
# If there is a buffer from the end bigger than 15, add a non-last segment
#if(abs(end - segEndIndex) > 9) {
if(abs(end - segEndIndex) > segAdvance-1) {
# These indices are getting things from R array so need to be base 1.
bpStart <- data$bp[segBegIndex+1];
bpEnd <- data$bp[segEndIndex+1];
cytoStart <- toString(data$Cytoband[segBegIndex+1]);
cytoEnd <- toString(data$Cytoband[segEndIndex+1]);
dict <- rbind(dict, c(chrom, arm, segBegIndex, segEndIndex, segEndIndex-segBegIndex, segNum, bpStart, bpEnd, cytoStart, cytoEnd));
# dict <- rbind(dict, c(chrom, arm, segBegIndex, segEndIndex, segEndIndex-segBegIndex, segNum));
# Update stuff
segNum <- segNum + 1;
segBegIndex <- segBegIndex + segAdvance;
segEndIndex <- segBegIndex + segLength - 1;
} else {
# These indices are getting things from R array so need to be base 1.
bpStart <- data$bp[segBegIndex+1];
bpEnd <- data$bp[end+1];
cytoStart <- toString(data$Cytoband[segBegIndex+1]);
cytoEnd <- toString(data$Cytoband[end+1]);
# Otherwise, this should be the last segment
dict <- rbind(dict, c(chrom, arm, segBegIndex, end, end-segBegIndex, segNum, bpStart, bpEnd, cytoStart, cytoEnd));
# dict <- rbind(dict, c(chrom, arm, segBegIndex, end, end-segBegIndex, segNum));
segBegIndex <- segBegIndex + segAdvance;
segEndIndex <- segEndIndex + segLength - 1;
}
}
}
}
}
}
}
if(action == "arms") {
# Change column names
colnames(dict) <- c("Chrom", "Arm", "Beg", "End", "Length","OneSeg");
} else if(action == "sections") {
# Change column names
colnames(dict) <- c("Chrom", "Arm", "Beg", "End", "Length", "Segment", "bpStart", "bpEnd", "CytoStart", "CytoEnd");
# colnames(dict) <- c("Chrom", "Arm", "Beg", "End", "Length", "Segment");
}
# Split up the dictionary
indices <- c(1:nrow(dict));
partsList <- slice(indices, ceiling(nrow(dict) / numParts));
for(i in c(1:length(partsList)))
{
partDictFile <- paste(dataSet, '_',subdir,'_dict_', i, '.txt', sep='');
partDictPath <- paste(begPath, 'Data', dataSet,subdir, partDictFile, sep='/');
print(dict[partsList[[i]], ]);
write.table(dict[partsList[[i]], ], partDictPath, sep='\t', row.names=FALSE);
}
# Setup and write the dictionary file
dictFile <- paste(dataSet,subdir, "dict_cyto.txt", sep="_");
dictPath <- paste(begPath, "Data", dataSet, subdir, dictFile, sep="/");
write.table(dict, dictPath, sep='\t', row.names=FALSE);