-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 6f3d455
Showing
27 changed files
with
2,646 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
.ipynb_checkpoints | ||
env | ||
uniref90_fungal.fasta.gz | ||
clusters.tsv | ||
effector_homologues.fasta | ||
effector_homologues.tsv | ||
localised.tsv | ||
proteomes.tsv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
#!/usr/bin/env bash
#
# Search the fungal UniRef90 set for homologues of the known effectors.
#
# Inputs:
#   raw/fungal_effectors.tsv      tab-separated table with a header row;
#                                 column 5 is used as the sequence id,
#                                 column 20 as the sequence, and rows whose
#                                 column 6 is "no" are skipped.
#   raw/uniref90_fungal.fasta.gz  gzipped FASTA searched against.
#
# Outputs:
#   processed/effector_homologues.fasta  sequences of every matched target.
#   processed/effector_homologues.tsv    one row per alignment: target id,
#                                        the remaining convertalis columns,
#                                        then the target sequence.
#
# Requires mmseqs on PATH plus bin/tsv_to_fasta.sh and bin/fasta_to_tsv.sh,
# and must be run from the directory that contains raw/ and bin/.

set -euo pipefail

EFFECTORS="raw/fungal_effectors.tsv"
UNIREF="raw/uniref90_fungal.fasta.gz"

OUT_TABFILE="processed/effector_homologues.tsv"
OUT_FASTAFILE="processed/effector_homologues.fasta"

# Scratch space lives in the working directory. It is deliberately not called
# TMPDIR: assigning to that name would redirect the temp files of every child
# process (sort, mmseqs, ...) whenever TMPDIR is exported by the caller.
WORK_DIR="tmp$$"
TAB=$'\t'

# Remove the scratch directory on every exit path, including failures.
cleanup() {
  rm -rf -- "${WORK_DIR:?}"
}
trap cleanup EXIT

mkdir -p "${WORK_DIR}"
mkdir -p "$(dirname "${OUT_TABFILE}")"
mkdir -p "$(dirname "${OUT_FASTAFILE}")"

# Build the query FASTA: skip the header row, keep rows whose column 6 is not
# "no", and strip a trailing "*" (plus trailing whitespace) from the sequence.
awk -F'\t' 'NR > 1 && $6 != "no" {print $5"\t"$20}' "${EFFECTORS}" \
  | sed 's/\*[[:space:]]*$//' \
  | bin/tsv_to_fasta.sh \
  > "${WORK_DIR}/effectors.fasta"

# A stale directory from an earlier run that happened to have the same PID
# could still be around, so start from empty database directories.
rm -rf -- "${WORK_DIR}/query" "${WORK_DIR}/target" "${WORK_DIR}/results" "${WORK_DIR}/tmp"
mkdir "${WORK_DIR}/query" "${WORK_DIR}/target" "${WORK_DIR}/results" "${WORK_DIR}/tmp"

mmseqs createdb "${WORK_DIR}/effectors.fasta" "${WORK_DIR}/query/db"
mmseqs createdb "${UNIREF}" "${WORK_DIR}/target/db"
mmseqs createindex "${WORK_DIR}/target/db" "${WORK_DIR}/tmp"

# Iterative-sensitivity search (3 steps from 3 up to 7.0), keeping hits with
# e-value <= 1e-5 and coverage >= 0.7 under coverage mode 0.
mmseqs search \
  "${WORK_DIR}/query/db" \
  "${WORK_DIR}/target/db" \
  "${WORK_DIR}/results/db" \
  "${WORK_DIR}/tmp" \
  -e 0.00001 \
  --start-sens 3 \
  -s 7.0 \
  --sens-steps 3 \
  --cov-mode 0 \
  -c 0.7

mmseqs convertalis \
  "${WORK_DIR}/query/db" \
  "${WORK_DIR}/target/db" \
  "${WORK_DIR}/results/db" \
  "${WORK_DIR}/target_matches.tsv" \
  --format-output "query,target,evalue,pident,bits,qstart,qend,qlen,tstart,tend,tlen,theader"

# Distinct target ids. The same target can be hit by several queries, so the
# duplicates are not adjacent and a bare `uniq` would keep them.
cut -f2 "${WORK_DIR}/target_matches.tsv" \
  | sort -u \
  > "${WORK_DIR}/target_matches_unique.tsv"

# Keep only the database records whose id (column 1) is one of the matched
# targets. Matching the whole id instead of a fixed substring avoids pulling
# in records whose id or sequence merely contains a matched id, and yields an
# empty result (rather than a grep exit status of 1) when nothing matched.
# The join below already relies on column 1 being the target id.
zcat "${UNIREF}" \
  | bin/fasta_to_tsv.sh \
  | awk -F'\t' 'NR == FNR { wanted[$1]; next } $1 in wanted' \
      "${WORK_DIR}/target_matches_unique.tsv" - \
  | sort -t "${TAB}" -k 1,1 \
  > "${WORK_DIR}/target_match_sequences.tsv"

bin/tsv_to_fasta.sh "${WORK_DIR}/target_match_sequences.tsv" > "${OUT_FASTAFILE}"

# Attach the target sequence to every alignment row. Both inputs are sorted on
# the join field with an explicit tab delimiter because the theader column
# contains spaces.
join -1 2 -2 1 -t "${TAB}" \
  <(sort -t "${TAB}" -k 2,2 "${WORK_DIR}/target_matches.tsv") \
  "${WORK_DIR}/target_match_sequences.tsv" \
  > "${OUT_TABFILE}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
#!/usr/bin/env bash
#
# Build processed/localised.tsv from the UniProt fungal FASTA downloads.
#
# Each output row is a label ("secreted" or "non_secreted"), a tab, and one
# row produced by bin/fasta_to_tsv.sh. Duplicate rows are removed within each
# label. Must be run from the directory that contains raw/ and bin/.

set -euo pipefail

OUTFILE="processed/localised.tsv"
UNIPROT_DIR="raw/uniprot"

mkdir -p "$(dirname "${OUTFILE}")"

# Secreted proteins; this first write truncates any previous output file.
zcat "${UNIPROT_DIR}/uniprot_fungal_secreted.fasta.gz" \
  | bin/fasta_to_tsv.sh \
  | awk '{print "secreted\t"$0}' \
  | sort -u \
  > "${OUTFILE}"

# Everything treated as non-secreted. The files are listed one per line
# instead of through a brace expansion, which previously carried a stray "/"
# in front of the GPI file name.
zcat \
  "${UNIPROT_DIR}/uniprot_fungal_non_secreted.fasta.gz" \
  "${UNIPROT_DIR}/uniprot_fungal_membrane.fasta.gz" \
  "${UNIPROT_DIR}/uniprot_fungal_er.fasta.gz" \
  "${UNIPROT_DIR}/uniprot_fungal_golgi.fasta.gz" \
  "${UNIPROT_DIR}/uniprot_fungal_gpi.fasta.gz" \
  | bin/fasta_to_tsv.sh \
  | sort -u \
  | awk '{print "non_secreted\t"$0}' \
  >> "${OUTFILE}"

# Stop here: anything that follows in this file is never executed.
exit 0
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
# NOTE(review): everything from here on is unreachable, because the script | ||
# exits unconditionally with `exit 0` before this point. It reads from data/ | ||
# rather than the raw/uniprot/ paths used by the live part of the script, so | ||
# it looks like an earlier version kept for reference - confirm before reuse. | ||
# | ||
# EFFECTORS is taken from the first argument but is not referenced again in | ||
# this section; the effector table path is hard-coded below instead. | ||
EFFECTORS="$1" | ||
|
||
|
||
# Build secreted.fasta: columns 9 and 16 of the effector rows whose column 3 | ||
# is not "no", minus the first output line, with a trailing "*" and trailing | ||
# whitespace stripped, followed by the UniProt secreted set. | ||
awk -F'\t' '$3 != "no" {print $9"\t"$16}' data/fungal_effectors.tsv \ | ||
| tail -n+2 \ | ||
| sed 's/\*[[:space:]]*$//' \ | ||
| bin/tsv_to_fasta.sh \ | ||
> secreted.fasta | ||
|
||
zcat data/uniprot_fungal_secreted.fasta.gz >> secreted.fasta | ||
|
||
# Presumably writes reduced_secreted.fasta plus the reduced_secreted.fasta.tsv | ||
# read further down; the helper is not visible here - TODO confirm. | ||
bin/reduce_homologous.sh reduced_secreted.fasta secreted.fasta | ||
|
||
|
||
# Build non_secreted.fasta from the five non-secreted UniProt sets, with | ||
# duplicate rows removed while in tabular form. | ||
zcat data/uniprot_fungal_non_secreted.fasta.gz data/uniprot_fungal_membrane.fasta.gz \ | ||
data/uniprot_fungal_er.fasta.gz data/uniprot_fungal_golgi.fasta.gz \ | ||
data/uniprot_fungal_gpi.fasta.gz \ | ||
| bin/fasta_to_tsv.sh \ | ||
| sort -u \ | ||
| bin/tsv_to_fasta.sh \ | ||
> non_secreted.fasta | ||
|
||
# A different helper ("_remote") is used here than for the secreted set; the | ||
# difference between the two is not visible in this file. | ||
bin/reduce_homologous_remote.sh reduced_non_secreted.fasta non_secreted.fasta | ||
|
||
# Remove any remaining secreted from non-secreted | ||
bin/subset_by_search.sh subset_non_secreted.fasta reduced_non_secreted.fasta secreted.fasta | ||
|
||
|
||
# Secreted train/test split. | ||
|
||
# Column 9 of the effector rows marked "test" / "train" in column 1 | ||
# (rows whose column 3 is "no" are excluded). | ||
awk -F'\t' '$3 != "no" && $1 == "test" {print $9}' data/fungal_effectors.tsv > test_targets.txt | ||
awk -F'\t' '$3 != "no" && $1 == "train" {print $9}' data/fungal_effectors.tsv > train_targets.txt | ||
|
||
# First field of every row that contains a test target as a fixed substring. | ||
grep -f test_targets.txt -F reduced_secreted.fasta.tsv | awk '{print $1}' > test_effector.txt | ||
|
||
# 100 randomly chosen rows that contain no *train* target. shuf is unseeded, | ||
# so the selection differs between runs. | ||
grep -f train_targets.txt -vF reduced_secreted.fasta.tsv \ | ||
| awk '{print $1}' \ | ||
| shuf -n 100 \ | ||
> test_non_effector.txt | ||
|
||
# Test set: rows matching either id list; train set: all remaining rows. | ||
bin/fasta_to_tsv.sh reduced_secreted.fasta \ | ||
| grep -F -f <(cat test_effector.txt test_non_effector.txt) \ | ||
| bin/tsv_to_fasta.sh \ | ||
> secreted_test.fasta | ||
|
||
bin/fasta_to_tsv.sh reduced_secreted.fasta \ | ||
| grep -vF -f <(cat test_effector.txt test_non_effector.txt) \ | ||
| bin/tsv_to_fasta.sh \ | ||
> secreted_train.fasta | ||
|
||
rm -f test_effector.txt test_non_effector.txt train_targets.txt test_targets.txt | ||
|
||
|
||
# Non-secreted train/test split: 1000 random rows of the subset file form the | ||
# test set (unseeded shuf again). | ||
bin/fasta_to_tsv.sh subset_non_secreted.fasta \ | ||
| shuf -n 1000 \ | ||
> non_secreted_test.tsv | ||
|
||
bin/tsv_to_fasta.sh non_secreted_test.tsv > non_secreted_test.fasta | ||
|
||
|
||
# NOTE(review): the train set is drawn from reduced_non_secreted.fasta, not | ||
# from subset_non_secreted.fasta that the test set was sampled from - confirm | ||
# this is intended. | ||
bin/fasta_to_tsv.sh reduced_non_secreted.fasta \ | ||
| grep -vF -f <(awk -F '\t' '{ print $1 }' non_secreted_test.tsv) \ | ||
| bin/tsv_to_fasta.sh \ | ||
> non_secreted_train.fasta | ||
|
||
# Intermediate-file cleanup. non_secreted_train.tsv is removed although no | ||
# visible statement creates it; rm -f makes that harmless. | ||
rm -f non_secreted_train.tsv non_secreted_test.tsv | ||
rm -f secreted.fasta non_secreted.fasta reduced_non_secreted.fasta* reduced_secreted.fasta* | ||
rm -f subset_non_secreted.fasta |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
#!/usr/bin/env bash
#
# Merge every proteome FASTA in raw/proteomes into processed/proteomes.tsv.
#
# Each sequence header is prefixed with "<isolate>_", where <isolate> is the
# FASTA file name without its .fasta extension. Every output row is the label
# "proteome", a tab, and one row produced by bin/fasta_to_tsv.sh.
# Must be run from the directory that contains raw/ and bin/.

set -euo pipefail

INDIR="raw/proteomes"
OUTFILE="processed/proteomes.tsv"
mkdir -p "$(dirname "${OUTFILE}")"

# Drop any table left over from a previous run so a failed run cannot be
# mistaken for a successful one.
rm -f -- "${OUTFILE}"

# Expand the glob into an array so an empty input directory is reported
# clearly instead of surfacing as a "no such file" error on the literal
# pattern.
shopt -s nullglob
FASTAS=("${INDIR}"/*.fasta)
shopt -u nullglob

if (( ${#FASTAS[@]} == 0 )); then
  printf 'error: no .fasta files found in %s\n' "${INDIR}" >&2
  exit 1
fi

for f in "${FASTAS[@]}"; do
  ISOLATE="$(basename -- "${f}" .fasta)"

  # The isolate name reaches awk through the environment, so characters that
  # are special in a sed replacement ("&", "\") are inserted literally.
  ISOLATE="${ISOLATE}" awk '
    /^>/ { print ">" ENVIRON["ISOLATE"] "_" substr($0, 2); next }
    { print }
  ' "${f}" \
    | bin/fasta_to_tsv.sh \
    | awk -F'\t' '{print "proteome\t"$0}'
done > "${OUTFILE}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
#!/usr/bin/env bash
#
# Cluster effectors, localised proteins, proteomes and effector homologues
# together with MMseqs2 and write the cluster membership table.
#
# Inputs:
#   raw/fungal_effectors.tsv             header row; column 5 = id,
#                                        column 20 = sequence.
#   processed/localised.tsv              label, id, sequence.
#   processed/proteomes.tsv              label, id, sequence.
#   processed/effector_homologues.fasta  FASTA, appended unchanged.
#
# Output:
#   processed/clusters.tsv               as written by `mmseqs createtsv`.
#
# Requires mmseqs on PATH and bin/tsv_to_fasta.sh, and must be run from the
# directory that contains raw/, processed/ and bin/.

set -euo pipefail

# Scratch space lives in the working directory. It is deliberately not called
# TMPDIR: assigning to that name would redirect the temp files of every child
# process whenever TMPDIR is exported by the caller.
WORK_DIR="tmp$$"

EFFECTORS="raw/fungal_effectors.tsv"
LOCALIZED="processed/localised.tsv"
PROTEOMES="processed/proteomes.tsv"
HOMOLOGS="processed/effector_homologues.fasta"

OUTCLUSTERS="processed/clusters.tsv"

# Remove the scratch directory on every exit path; nothing cleaned it up
# before, so every run left a tmp<pid> directory behind.
cleanup() {
  rm -rf -- "${WORK_DIR:?}"
}
trap cleanup EXIT

mkdir -p "${WORK_DIR}"
mkdir -p "$(dirname "${OUTCLUSTERS}")"

# Assemble all sequences into a single FASTA.
{
  # Effectors: skip the header row and strip a trailing "*" (plus trailing
  # whitespace) from the sequence.
  awk -F'\t' 'NR > 1 {print $5"\t"$20}' "${EFFECTORS}" \
    | sed 's/\*[[:space:]]*$//' \
    | bin/tsv_to_fasta.sh

  # Localised and proteome tables: drop the leading label column.
  awk -F'\t' 'BEGIN {OFS="\t"} {print $2, $3}' "${LOCALIZED}" | bin/tsv_to_fasta.sh
  awk -F'\t' 'BEGIN {OFS="\t"} {print $2, $3}' "${PROTEOMES}" | bin/tsv_to_fasta.sh

  cat "${HOMOLOGS}"
} > "${WORK_DIR}/combined.fasta"

# A stale directory from an earlier run that happened to have the same PID
# could still be around, so start from empty database directories.
rm -rf -- "${WORK_DIR}/seqs" "${WORK_DIR}/clu" "${WORK_DIR}/tmp"
mkdir -p "${WORK_DIR}/seqs" "${WORK_DIR}/clu" "${WORK_DIR}/tmp"

mmseqs createdb "${WORK_DIR}/combined.fasta" "${WORK_DIR}/seqs/db"

# Cluster at >= 30% sequence identity and >= 0.7 coverage (coverage mode 0),
# using cluster mode 0.
mmseqs cluster \
  "${WORK_DIR}/seqs/db" \
  "${WORK_DIR}/clu/db" \
  "${WORK_DIR}/tmp" \
  --min-seq-id 0.3 \
  --cov-mode 0 \
  -c 0.7 \
  --cluster-mode 0

# The sequence database is passed as both query and target because the
# clustering was done within a single database.
mmseqs createtsv \
  "${WORK_DIR}/seqs/db" \
  "${WORK_DIR}/seqs/db" \
  "${WORK_DIR}/clu/db" \
  "${OUTCLUSTERS}"
Oops, something went wrong.