-
Notifications
You must be signed in to change notification settings - Fork 3
/
06_sf2.Rmd
280 lines (211 loc) · 12.3 KB
/
06_sf2.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
---
title: "Interpretation of SweepFinder results"
output:
github_document:
pandoc_args: --webtex
bibliography: bibliography.bib
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(fig.width=12, fig.height=8,
echo=FALSE, warning=FALSE, message=FALSE)
library(tidyverse)
library(ggpubr)
source("R/constants.R")
```
Due to the extreme demographic history of Magnetic Island we found that it was not possible to distinguish selective sweeps from demographic effects in that location. Consequently our interpretation of SweepFinder results is restricted to Northern reefs. We examined loci with significant sweep scores in two ways. Firstly we looked at the entire Northern population as a whole for which sweeps could be interpreted as being due to adaptations required across all inshore sites. Secondly we looked at the difference between Marine (Fitzroy Island, Pelorus Island) and Plume (Pandora Reef, Dunk Island) sites. Sweeps designated as `Marine Only` or `Plume Only` were identified using bedtools (see [11_marine_vs_plume.sh](hpc/SF2/11_marine_vs_plume.sh) ). This produced the following files;
- `nomi_10_sweeps.gff` containing contiguous regions with SweepFinder scores > 10 using pooled allele frequencies across all northern reefs.
- `marine_only.gff` containing contiguous regions with SweepFinder scores > 10 that were in marine sites and not in plume
- `plume_only.gff` containing contiguous regions with SweepFinder scores > 10 that were in plume sites and not in marine
```{r}
scaff_sizes <- read_tsv("hpc/SF2/aten_chromsizes.txt",col_names = c("scaffold","scaffold_length"))
scaff_offset <- scaff_sizes %>%
dplyr::mutate(offset = dplyr::order_by(dplyr::desc(scaffold_length),cumsum(scaffold_length))) %>%
dplyr::mutate(scaff_type = as.integer(row_number() %% 2) )
gff_cols <- c('scaffold','source','feature','start','end','score','strand','phase','attributes')
sweeps <- read_tsv("hpc/SF2/nomi_10_sweeps.gff",col_names = gff_cols,col_types = cols()) %>%
mutate(length = end-start) %>%
left_join(scaff_offset) %>%
mutate(xpos = start + offset) %>%
unite(col = 'sweep_id',scaffold,start,sep = "_",remove = FALSE)
mo_sweeps <- read_tsv("hpc/SF2/marine_only.gff",col_names = gff_cols[c(1,4,5,6)],col_types = cols()) %>%
mutate(length = end-start) %>%
left_join(scaff_offset) %>%
mutate(xpos = start + offset) %>%
unite(col = 'sweep_id', scaffold, start, sep = "_", remove = FALSE)
po_sweeps <- read_tsv("hpc/SF2/plume_only.gff",col_names = gff_cols[c(1,4,5,6)],col_types = cols()) %>%
mutate(length = end-start) %>%
left_join(scaff_offset) %>%
mutate(xpos = start + offset) %>%
unite(col = 'sweep_id', scaffold, start, sep = "_", remove = FALSE)
```
Using the best fitting dadi model (`isolation_asym_mig`) as neutral background a CLR threshold of 100 gives an FDR of approximately 10%. The Manhattan plot below shows that these sites are distributed across the genome.
```{r}
ggplot(sweeps,aes(x=xpos)) +
geom_point(aes(y=score,color=scaff_type),size=0.1) +
geom_point(data=sweeps %>% filter(score>100),aes(y=score),color="red") +
theme_pubr() +
theme(legend.position = "none") + xlab("Genome Position") + ylab("SweepFinder 2 CLR Statistic")
```
```{r}
all_significant <- sweeps %>% filter(score>100)
```
```{r}
# Load table of overlaps between sweeps and genes at the chosen threshold
bedtools_cols <- paste('gene',c('scaffold','source','feature','start','end','score','strand','phase','attributes'),sep = "_")
bedtools_cols <- c(bedtools_cols,paste('sweep',c('scaffold','source','feature','start','end','score','strand','phase','attributes'),sep = "_"))
nomi_sweep_genes <- read_table2("hpc/SF2/nomi_10_sweep_genes.gff",col_names = bedtools_cols,col_types = cols()) %>%
dplyr::arrange(dplyr::desc(sweep_score)) %>%
dplyr::mutate(geneid = str_match(gene_attributes,pattern = "ID=([^;]+)")[,2]) %>%
dplyr::left_join(scaff_offset, by=c("sweep_scaffold"="scaffold")) %>%
dplyr::mutate(xpos = sweep_start + offset) %>%
dplyr::rename(score=sweep_score) %>%
tidyr::unite(col = 'sweep_id',sweep_scaffold,sweep_start,sep = "_")
```
```{r}
# Attach popgen stats to sweeps
#
# Popgen stat
tajima_stats <- read_tsv("hpc/angsd/north_Tajima_sweeps.tsv",col_names = c("scaffold","start","sweep_score","WinCenter","Tajima")) %>%
unite(col='sweep_id',scaffold,start,sep="_",remove = TRUE) %>% dplyr::select(-WinCenter,-sweep_score) %>%
group_by(sweep_id) %>%
summarise(Tajima=mean(Tajima))
fst_stats <- read_tsv("hpc/angsd/MI.north_Fst_sweeps.tsv",col_names = c("scaffold","start","sweep_score","WinCenter","Fst")) %>%
unite(col='sweep_id',scaffold,start,sep="_",remove = TRUE) %>% dplyr::select(-WinCenter,-sweep_score) %>%
group_by(sweep_id) %>%
summarise(Fst=mean(Fst))
nomi_sweep_genes_td <- nomi_sweep_genes %>%
left_join(tajima_stats, by='sweep_id') %>%
left_join(fst_stats, by="sweep_id")
```
```{r}
# Read in annotation information
# Gene annotations
annotations <- read_tsv("raw_data/annotation_table.tsv", col_types = cols()) %>%
mutate(gene_id = str_remove(aten_id,".m1$"))
# Attach annotations to sweep genes
annotated_sweep_genes <- nomi_sweep_genes_td %>% left_join(annotations, by=c('geneid'='gene_id')) %>%
group_by(geneid) %>%
top_n(1,score)
```
```{r}
# Sort and print key columns
sweep_genes_summary_all <- annotated_sweep_genes %>%
dplyr::select(sweep_id,geneid,score,Tajima,Fst,evalue,saccver,method,kegg,protein_name=`Protein names`,gene_name=`Gene names`,GO=ipr_go) %>%
arrange(desc(score))
sweep_genes_summary <- sweep_genes_summary_all %>% filter(score>100)
# This forms the basis of the initial gene list prior to filtering for Fst
#
#write_tsv(sweep_genes_summary,path = "results/sweep_genes_summary.tsv")
#write_tsv(sweep_genes_summary_all,path = "results/sweep_genes_summary_all.tsv")
```
Genes associated with these significant loci were identified using [bedtools window](https://bedtools.readthedocs.io/en/latest/content/tools/window.html). This reports all overlaps between sweep loci (encoded as `nomi_10_sweeps.gff` and gene models). (see [07_genes_in_sweeps.sh](hpc/SF2/07_genes_in_sweeps.sh) for details). A hand annotated version of this table is included as supplementary information with the paper.
```{r}
library(topGO)
gostring2vector <- function(gostring){
str_split(gostring,";")[[1]] %>% str_trim()
}
gene_ids <- annotations$gene_id
gostrings <- annotations$ipr_go
geneID2GO <- lapply(gostrings,gostring2vector)
names(geneID2GO) <- gene_ids
```
```{r}
get_enrichment <- function(onto,thresh=100){
sweep_genes <- annotated_sweep_genes %>%
filter(score>thresh) %>% pull(geneid) %>% unique()
target_list_membership <- factor(as.integer(gene_ids %in% sweep_genes))
names(target_list_membership) <- gene_ids
GOdata <- new("topGOdata",
ontology = onto,
allGenes = target_list_membership,
annot = annFUN.gene2GO,
gene2GO = geneID2GO,
nodeSize = 5)
# This runs the test to see if there are significantly enriched GO terms
resultFis <- runTest(GOdata, algorithm = "weight01", statistic = "fisher")
gt <- GenTable(GOdata, classic = resultFis,orderBy = "weight", ranksOf = "classic", topNodes = 50)
list(godata = GOdata, result = resultFis, table = gt)
}
```
```{r}
# This can be BP (Biological Process), MF (Molecular Function), CC (Cellular Compartment)
bp_topgo <- get_enrichment("BP")
mf_topgo <- get_enrichment("MF")
cc_topgo <- get_enrichment("CC")
bp_table <- bp_topgo$table
mf_table <- mf_topgo$table
cc_table <- cc_topgo$table
all_ontologies <- rbind(bp_table %>% add_column(ontology="BP"),
mf_table %>% add_column(ontology="MF"),
cc_table %>% add_column(ontology="CC")) %>%
filter(Significant>1)
#write_tsv(all_ontologies,"results/significant_go_terms.tsv")
```
Gene Ontology annotations were obtained for these genes through GO terms assigned to conserved domains (via Interproscan) and the results were used to search for terms that might be enriched in the sweep set compared to background. GO term enrichment analysis was done using the R package topGO version 2.36.0 [@Alexa2006-wf] using genes associated with sweeps (scores > 100) as the target set and all other annotated genes as background. topGO uses a weighting scheme (we used the weight01 scheme) to downweight genes that are also attached to related terms in the GO graph. Significance testing was performed using Fisher's exact test based on weighted gene counts. As outlined [in the topGO manual](https://bioconductor.org/packages/release/bioc/vignettes/topGO/inst/doc/topGO.pdf)) there is no clear way to apply a formal multiple-testing corrections for this p-value.
A single GO term, `GO:0005509 calcium ion binding` was significantly enriched among sweep genes. This term was associated with SOMPs as well as EGF domain containing genes, both of which were abundant in the target set.
```{r}
all_ontologies_sig <- all_ontologies %>% filter(as.numeric(classic)<0.01)
knitr::kable(all_ontologies_sig)
```
```{r}
gene_go_map <- sweep_genes_summary %>% dplyr::select(geneid,GO) %>% splitstackshape::cSplit("GO",sep=";",direction = "long") %>% mutate(GO=str_trim(GO))
significant_go_terms <- all_ontologies %>% filter(as.numeric(classic)<0.01) %>% filter(Significant>1) %>% pull(GO.ID)
genes_in_significant_go_terms <- gene_go_map %>% filter(GO %in% significant_go_terms) %>% pull(geneid) %>% unique()
annotated_go_significant_genes <- annotated_sweep_genes %>% filter(geneid %in% genes_in_significant_go_terms) %>%
dplyr::select(sweep_id,geneid,score,evalue,saccver,method,kegg,protein_name=`Protein names`,gene_name=`Gene names`,GO=`Gene ontology IDs`) %>%
arrange(desc(score))
#write_tsv(annotated_go_significant_genes,path = "results/annotated_go_significant_genes.tsv")
```
Genes annotated with the GO term `GO:0005509 calcium ion binding`.
```{r}
display_genes <- annotated_go_significant_genes %>% dplyr::select(geneid,CLR=score,UniprotID = saccver,`Protein Name`=protein_name)
knitr::kable(display_genes)
```
## Marine vs Plume
Sweep loci unique to either Marine or Plume were used to extract overlapping genes. A list of these genes is shown below.
```{r}
# Load table of overlaps between sweeps and genes at the chosen threshold
bedtools_mvsp_cols <- c(paste('gene',gff_cols,sep="_"),
paste('sweep',gff_cols[c(1,4,5,6)],sep="_"))
mo_sweep_genes <- read_table2("hpc/SF2/marine_only_sweep_genes.gff",col_names = bedtools_mvsp_cols,col_types = cols()) %>%
dplyr::arrange(dplyr::desc(sweep_score)) %>%
dplyr::mutate(geneid = str_match(gene_attributes,pattern = "ID=([^;]+)")[,2]) %>%
dplyr::left_join(scaff_offset, by=c("sweep_scaffold"="scaffold")) %>%
dplyr::mutate(xpos = sweep_start + offset) %>%
dplyr::rename(score=sweep_score) %>%
tidyr::unite(col = 'sweep_id',sweep_scaffold,sweep_start,sep = "_") %>%
add_column(wq = "marine")
po_sweep_genes <- read_table2("hpc/SF2/plume_only_sweep_genes.gff",col_names = bedtools_mvsp_cols,col_types = cols()) %>%
dplyr::arrange(dplyr::desc(sweep_score)) %>%
dplyr::mutate(geneid = str_match(gene_attributes,pattern = "ID=([^;]+)")[,2]) %>%
dplyr::left_join(scaff_offset, by=c("sweep_scaffold"="scaffold")) %>%
dplyr::mutate(xpos = sweep_start + offset) %>%
dplyr::rename(score=sweep_score) %>%
tidyr::unite(col = 'sweep_id',sweep_scaffold,sweep_start,sep = "_") %>%
add_column(wq = "plume")
mvsp_sweep_genes <- rbind(mo_sweep_genes,po_sweep_genes)
# Gene annotations
annotations <- read_tsv("raw_data/annotation_table.tsv", col_types = cols()) %>%
mutate(gene_id = str_remove(aten_id,".m1$"))
#Attach annotations to sweep genes
annotated_mvsp_sweep_genes <- mvsp_sweep_genes %>% left_join(annotations, by=c('geneid'='gene_id')) %>%
group_by(geneid) %>%
top_n(1,score)
```
```{r}
collapse_uniq <- function(vec){
vec_nona <- vec[!is.na(vec)]
if(length(vec_nona)<1){
return("")
}
paste(unique(vec_nona),collapse = ";")
}
# Sort and print key columns
mvsp_sweep_genes_summary <- annotated_mvsp_sweep_genes %>%
group_by(score) %>%
summarise(geneid = collapse_uniq(geneid), wq = collapse_uniq(wq), UniprotID = collapse_uniq(saccver), protein_name = collapse_uniq(`Protein names`)) %>%
mutate(CLR=as.numeric(score)) %>%
dplyr::arrange(dplyr::desc(CLR)) %>%
dplyr::filter(CLR>50)
knitr::kable(mvsp_sweep_genes_summary)
```