Skip to content

Commit

Permalink
dataset for v1
Browse files Browse the repository at this point in the history
  • Loading branch information
darcyabjones committed Jun 16, 2020
0 parents commit 6f3d455
Show file tree
Hide file tree
Showing 27 changed files with 2,646 additions and 0 deletions.
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
.ipynb_checkpoints
env
uniref90_fungal.fasta.gz
clusters.tsv
effector_homologues.fasta
effector_homologues.tsv
localised.tsv
proteomes.tsv
70 changes: 70 additions & 0 deletions 01-enrich_effectors.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!/usr/bin/env bash
#
# 01-enrich_effectors.sh
#
# Searches the curated effector sequences against the fungal UniRef90 set
# with MMseqs2 and writes the matched UniRef90 entries out twice:
#   * OUT_FASTAFILE - the matched target records, converted back to FASTA
#   * OUT_TABFILE   - the alignment table joined (on target id) to the
#                     matched target records
#
# Requires `mmseqs` on PATH plus bin/tsv_to_fasta.sh and bin/fasta_to_tsv.sh.
# All paths are relative, so run it from the repository root.

set -euo pipefail


EFFECTORS="raw/fungal_effectors.tsv"
UNIREF="raw/uniref90_fungal.fasta.gz"

OUT_TABFILE="processed/effector_homologues.tsv"
OUT_FASTAFILE="processed/effector_homologues.fasta"

# Every intermediate file is tab separated; name the delimiter once so that
# sort and join are guaranteed to agree on it.
TAB=$'\t'

# Scratch directory, created in the working directory. It is deliberately not
# called TMPDIR: that name is read by sort, mktemp and other tools, and
# reassigning it would silently redirect their temporary files.
WORKDIR="tmp$$"

# Remove the scratch directory on every exit path, not only on success.
# The script's exit status is unaffected by the trap.
cleanup() {
  rm -rf -- "${WORKDIR:?}"
}
trap cleanup EXIT

mkdir -p "${WORKDIR}"
mkdir -p "$(dirname "${OUT_TABFILE}")"
mkdir -p "$(dirname "${OUT_FASTAFILE}")"

# Keep rows whose 6th column is not "no" and emit columns 5 and 20
# (presumably id and sequence - confirm against the table header).
# tail drops the header row, sed strips a trailing "*" from each line.
awk -F'\t' '$6 != "no" {print $5"\t"$20}' "${EFFECTORS}" \
  | tail -n+2 \
  | sed 's/\*[[:space:]]*$//' \
  | bin/tsv_to_fasta.sh \
  > "${WORKDIR}/effectors.fasta"

rm -rf -- "${WORKDIR}/query" "${WORKDIR}/target" "${WORKDIR}/results" "${WORKDIR}/tmp"
mkdir "${WORKDIR}/query" "${WORKDIR}/target" "${WORKDIR}/results" "${WORKDIR}/tmp"

mmseqs createdb "${WORKDIR}/effectors.fasta" "${WORKDIR}/query/db"
mmseqs createdb "${UNIREF}" "${WORKDIR}/target/db"
mmseqs createindex "${WORKDIR}/target/db" "${WORKDIR}/tmp"

# Iterative search (sensitivity 3 -> 7 in 3 steps), E-value <= 1e-5,
# coverage threshold 0.7 under coverage mode 0.
mmseqs search \
  "${WORKDIR}/query/db" \
  "${WORKDIR}/target/db" \
  "${WORKDIR}/results/db" \
  "${WORKDIR}/tmp" \
  -e 0.00001 \
  --start-sens 3 \
  -s 7.0 \
  --sens-steps 3 \
  --cov-mode 0 \
  -c 0.7

mmseqs convertalis \
  "${WORKDIR}/query/db" \
  "${WORKDIR}/target/db" \
  "${WORKDIR}/results/db" \
  "${WORKDIR}/target_matches.tsv" \
  --format-output "query,target,evalue,pident,bits,qstart,qend,qlen,tstart,tend,tlen,theader"

# Column 2 is the target id. The table is not ordered by target, so plain
# `uniq` would leave non-adjacent duplicates behind; sort -u removes them all.
cut -f2 "${WORKDIR}/target_matches.tsv" \
  | sort -u \
  > "${WORKDIR}/target_matches_unique.tsv"

# Pull the matched records out of UniRef90 by fixed-string match on the ids.
# grep exits non-zero when nothing matches, so under pipefail the script
# stops here if the search produced no hits.
zcat "${UNIREF}" \
  | bin/fasta_to_tsv.sh \
  | grep -F -f "${WORKDIR}/target_matches_unique.tsv" \
  | sort -t "${TAB}" -k 1,1 \
  > "${WORKDIR}/target_match_sequences.tsv"

bin/tsv_to_fasta.sh "${WORKDIR}/target_match_sequences.tsv" > "${OUT_FASTAFILE}"

# Join alignments (field 2 = target id) to the target records (field 1).
# Both inputs are TSV, so the join delimiter must be a tab, and both inputs
# must be sorted on the join field using that same delimiter.
join -1 2 -2 1 -t "${TAB}" \
  <(sort -t "${TAB}" -k 2,2 "${WORKDIR}/target_matches.tsv") \
  "${WORKDIR}/target_match_sequences.tsv" \
  > "${OUT_TABFILE}"
104 changes: 104 additions & 0 deletions 02-process_secretome.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#!/usr/bin/env bash
#
# 02-process_secretome.sh
#
# Builds processed/localised.tsv from the UniProt fungal FASTA downloads.
# Each output line is a label ("secreted" or "non_secreted"), a tab, and the
# record as emitted by bin/fasta_to_tsv.sh. Duplicate records within each
# label are removed.
#
# All paths are relative, so run it from the repository root.
#
# NOTE: the script stops at the `exit 0` below; everything after it in this
# file is never executed.

set -euo pipefail

OUTFILE="processed/localised.tsv"
UNIPROT_DIR="raw/uniprot"

# No scratch directory is needed here. In particular TMPDIR must not be
# reassigned: if it is exported in the caller's environment, `sort` would
# inherit the new value and fail as soon as it needs a temporary file in a
# directory that was never created.

# Everything that is not the secreted set is labelled "non_secreted".
NON_SECRETED_FILES=(
  "${UNIPROT_DIR}/uniprot_fungal_non_secreted.fasta.gz"
  "${UNIPROT_DIR}/uniprot_fungal_membrane.fasta.gz"
  "${UNIPROT_DIR}/uniprot_fungal_er.fasta.gz"
  "${UNIPROT_DIR}/uniprot_fungal_golgi.fasta.gz"
  "${UNIPROT_DIR}/uniprot_fungal_gpi.fasta.gz"
)

mkdir -p "$(dirname "${OUTFILE}")"


# Secreted records: this pipeline creates (truncates) the output file.
zcat "${UNIPROT_DIR}/uniprot_fungal_secreted.fasta.gz" \
  | bin/fasta_to_tsv.sh \
  | awk '{print "secreted\t"$0}' \
  | sort -u \
  > "${OUTFILE}"

# Non-secreted records: appended after the secreted ones.
zcat "${NON_SECRETED_FILES[@]}" \
  | bin/fasta_to_tsv.sh \
  | sort -u \
  | awk '{print "non_secreted\t"$0}' \
  >> "${OUTFILE}"



exit 0








# ---------------------------------------------------------------------------
# UNREACHABLE: the script calls `exit 0` before this point, so nothing below
# is ever executed. It reads from data/ rather than raw/ and writes its
# outputs into the working directory.
# ---------------------------------------------------------------------------

# NOTE(review): EFFECTORS is assigned from the first argument but never used
# below; the effector table path is hard-coded as data/fungal_effectors.tsv.
EFFECTORS="$1"


# Homology reduce secreted
# Rows whose 3rd column is not "no": emit columns 9 and 16, drop the header
# row, strip a trailing "*", and convert to FASTA.
awk -F'\t' '$3 != "no" {print $9"\t"$16}' data/fungal_effectors.tsv \
| tail -n+2 \
| sed 's/\*[[:space:]]*$//' \
| bin/tsv_to_fasta.sh \
> secreted.fasta

# Append the UniProt secreted set to the effector sequences.
zcat data/uniprot_fungal_secreted.fasta.gz >> secreted.fasta

# Arguments are given as output first, then input (inferred from the names;
# the helper is not shown here - TODO confirm).
bin/reduce_homologous.sh reduced_secreted.fasta secreted.fasta


# Homology reduce non-secreted
# Concatenate the five non-secreted sets and de-duplicate identical records.
zcat data/uniprot_fungal_non_secreted.fasta.gz data/uniprot_fungal_membrane.fasta.gz \
data/uniprot_fungal_er.fasta.gz data/uniprot_fungal_golgi.fasta.gz \
data/uniprot_fungal_gpi.fasta.gz \
| bin/fasta_to_tsv.sh \
| sort -u \
| bin/tsv_to_fasta.sh \
> non_secreted.fasta

bin/reduce_homologous_remote.sh reduced_non_secreted.fasta non_secreted.fasta

# Remove any remaining secreted from non-secreted
bin/subset_by_search.sh subset_non_secreted.fasta reduced_non_secreted.fasta secreted.fasta


# Generate secreted train_test_split

# Column 1 of the effector table holds the "test"/"train" assignment and
# column 9 the value written to the target lists.
awk -F'\t' '$3 != "no" && $1 == "test" {print $9}' data/fungal_effectors.tsv > test_targets.txt
awk -F'\t' '$3 != "no" && $1 == "train" {print $9}' data/fungal_effectors.tsv > train_targets.txt

# reduced_secreted.fasta.tsv is not created in this file; presumably it is
# written by bin/reduce_homologous.sh - TODO confirm.
# First field of every line that contains any test target as a substring.
grep -f test_targets.txt -F reduced_secreted.fasta.tsv | awk '{print $1}' > test_effector.txt

# 100 random first fields from lines that contain no *train* target.
# NOTE(review): lines matching test targets are not excluded here, so the
# same entry can end up in both test_effector.txt and test_non_effector.txt.
grep -f train_targets.txt -vF reduced_secreted.fasta.tsv \
| awk '{print $1}' \
| shuf -n 100 \
> test_non_effector.txt

# Test set: records matching any selected id (fixed-string match).
bin/fasta_to_tsv.sh reduced_secreted.fasta \
| grep -F -f <(cat test_effector.txt test_non_effector.txt) \
| bin/tsv_to_fasta.sh \
> secreted_test.fasta

# Train set: the complement of the test set.
bin/fasta_to_tsv.sh reduced_secreted.fasta \
| grep -vF -f <(cat test_effector.txt test_non_effector.txt) \
| bin/tsv_to_fasta.sh \
> secreted_train.fasta

rm -f test_effector.txt test_non_effector.txt train_targets.txt test_targets.txt


# Generate nonsecreted train test split
# 1000 random records from the searched subset become the test set.
bin/fasta_to_tsv.sh subset_non_secreted.fasta \
| shuf -n 1000 \
> non_secreted_test.tsv

bin/tsv_to_fasta.sh non_secreted_test.tsv > non_secreted_test.fasta


# NOTE(review): the train set is taken from reduced_non_secreted.fasta,
# whereas the test set was sampled from subset_non_secreted.fasta.
bin/fasta_to_tsv.sh reduced_non_secreted.fasta \
| grep -vF -f <(awk -F '\t' '{ print $1 }' non_secreted_test.tsv) \
| bin/tsv_to_fasta.sh \
> non_secreted_train.fasta

# non_secreted_train.tsv is never created above; -f keeps the rm harmless.
rm -f non_secreted_train.tsv non_secreted_test.tsv
rm -f secreted.fasta non_secreted.fasta reduced_non_secreted.fasta* reduced_secreted.fasta*
rm -f subset_non_secreted.fasta
19 changes: 19 additions & 0 deletions 03-process_proteomes.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env bash
#
# 03-process_proteomes.sh
#
# Flattens every *.fasta file in raw/proteomes into processed/proteomes.tsv.
# Each FASTA header is prefixed with "<file name without .fasta>_" so ids
# stay distinguishable across files, and each output line starts with the
# label "proteome" followed by a tab and the record from bin/fasta_to_tsv.sh.

set -euo pipefail

INDIR="raw/proteomes"
OUTFILE="processed/proteomes.tsv"

mkdir -p "$(dirname "${OUTFILE}")"

# Discard any previous output; the redirect on the loop recreates the file,
# so it exists (possibly empty) even if the loop fails part-way.
rm -f "${OUTFILE}"

for fasta in "${INDIR}"/*.fasta; do
  # Isolate name = file name with directory and ".fasta" suffix removed.
  isolate="${fasta##*/}"
  isolate="${isolate%.fasta}"

  sed "s/^>/>${isolate}_/" "${fasta}" \
    | bin/fasta_to_tsv.sh \
    | awk '{ print "proteome\t" $0 }'
done > "${OUTFILE}"
46 changes: 46 additions & 0 deletions 04-reduce_homology.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env bash
#
# 04-reduce_homology.sh
#
# Pools the sequences from the effector table, processed/localised.tsv,
# processed/proteomes.tsv and processed/effector_homologues.fasta into one
# FASTA file, clusters them with MMseqs2, and writes the cluster membership
# table to processed/clusters.tsv.
#
# Requires `mmseqs` on PATH plus bin/tsv_to_fasta.sh, and the outputs of
# scripts 01-03. All paths are relative, so run it from the repository root.

set -euo pipefail

# Scratch directory, created in the working directory. It is deliberately not
# called TMPDIR: that name is read by sort, mktemp and other tools, and
# reassigning it would silently redirect their temporary files.
WORKDIR="tmp$$"

EFFECTORS="raw/fungal_effectors.tsv"
LOCALIZED="processed/localised.tsv"
PROTEOMES="processed/proteomes.tsv"
HOMOLOGS="processed/effector_homologues.fasta"

OUTCLUSTERS="processed/clusters.tsv"

# The MMseqs2 databases in the scratch directory are only intermediates.
# Remove them on every exit path so repeated runs do not pile up tmp<pid>
# directories. The script's exit status is unaffected by the trap.
cleanup() {
  rm -rf -- "${WORKDIR:?}"
}
trap cleanup EXIT

mkdir -p "${WORKDIR}"
mkdir -p "$(dirname "${OUTCLUSTERS}")"

# Effector table: emit columns 5 and 20 of every row (no filtering here),
# drop the header row, strip a trailing "*", and convert to FASTA.
awk -F'\t' '{print $5"\t"$20}' "${EFFECTORS}" \
  | tail -n+2 \
  | sed 's/\*[[:space:]]*$//' \
  | bin/tsv_to_fasta.sh \
  > "${WORKDIR}/combined.fasta"

# Column 1 of these tables is the label added by scripts 02/03; columns 2
# and 3 are passed on to the FASTA converter.
awk -F'\t' 'BEGIN {OFS="\t"} {print $2, $3}' "${LOCALIZED}" \
  | bin/tsv_to_fasta.sh >> "${WORKDIR}/combined.fasta"
awk -F'\t' 'BEGIN {OFS="\t"} {print $2, $3}' "${PROTEOMES}" \
  | bin/tsv_to_fasta.sh >> "${WORKDIR}/combined.fasta"

cat "${HOMOLOGS}" >> "${WORKDIR}/combined.fasta"

rm -rf -- "${WORKDIR}/seqs" "${WORKDIR}/clu" "${WORKDIR}/tmp"
mkdir -p "${WORKDIR}/seqs" "${WORKDIR}/clu" "${WORKDIR}/tmp"

mmseqs createdb "${WORKDIR}/combined.fasta" "${WORKDIR}/seqs/db"

# Cluster at >= 30% sequence identity with coverage threshold 0.7
# (coverage mode 0, cluster mode 0).
mmseqs cluster \
  "${WORKDIR}/seqs/db" \
  "${WORKDIR}/clu/db" \
  "${WORKDIR}/tmp" \
  --min-seq-id 0.3 \
  --cov-mode 0 \
  -c 0.7 \
  --cluster-mode 0

# The sequence db is passed as both query and target because the clustering
# was run on a single database.
mmseqs createtsv \
  "${WORKDIR}/seqs/db" \
  "${WORKDIR}/seqs/db" \
  "${WORKDIR}/clu/db" \
  "${OUTCLUSTERS}"
Loading

0 comments on commit 6f3d455

Please sign in to comment.