Skip to content

Commit

Permalink
modified: functions/results.sh
Browse files Browse the repository at this point in the history
  • Loading branch information
mattilalab committed Jan 24, 2022
1 parent a10dc24 commit 6e2565c
Show file tree
Hide file tree
Showing 6 changed files with 140 additions and 117 deletions.
188 changes: 101 additions & 87 deletions functions/results.sh
Original file line number Diff line number Diff line change
Expand Up @@ -209,115 +209,123 @@ results_cleanup() {
# column statistics
extract_columns_stats(){
local coevPair="${1}"
cd $coevPair

# Run Luqman's script for Bonferroni & co correction of p-values
echo -e "Running R for Bonferroni correction for $coevPair"
Rscript $CWD/R/AdjPval.R bothWays.tsv bothWays-corrected.tsv
if [ -d "$coevPair" ]; then
cd $coevPair

# Run Luqman's script for Bonferroni & co correction of p-values
echo -e "Running R for Bonferroni correction for $coevPair"
Rscript $CWD/R/AdjPval.R bothWays.tsv bothWays-corrected.tsv

sed 1d bothWays-corrected.tsv | while read -r msa1 msa2 colA realA colB realB corr boot p_value bonferroni holm bh hochberg hommel by fdr; do
sed 1d bothWays-corrected.tsv | while read -r msa1 msa2 colA realA colB realB corr boot p_value bonferroni holm bh hochberg hommel by fdr; do

# make sure the amino acid position exists in the reference organism
if (( $(echo "$realA > 0" |bc -l) && $(echo "$realB > 0" |bc -l) )); then
# make sure the amino acid position exists in the reference organism
if (( $(echo "$realA > 0" |bc -l) && $(echo "$realB > 0" |bc -l) )); then

# get amino acid and Gblocks score
seq1=$( sed "${realA}q;d" $TMP/$MSA/$GUIDANCEMSA-$GUIDANCECUT-$MSAMETHOD/${msa1}.fa.${ORGANISM}.ref | awk '{print $1$2}' )
seq2=$( sed "${realB}q;d" $TMP/$MSA/$GUIDANCEMSA-$GUIDANCECUT-$MSAMETHOD/${msa2}.fa.${ORGANISM}.ref | awk '{print $1$2}' )
flt1=$( sed "${realA}q;d" $TMP/$MSA/$GUIDANCEMSA-$GUIDANCECUT-$MSAMETHOD/${msa1}.fa.${ORGANISM}.ref | awk '{print $3}' )
flt2=$( sed "${realB}q;d" $TMP/$MSA/$GUIDANCEMSA-$GUIDANCECUT-$MSAMETHOD/${msa2}.fa.${ORGANISM}.ref | awk '{print $3}' )

# Convert Gblocks scores to numbers (. = 0 ; # = 1)
if [ "$flt1" = "#" ]; then
gblscore1="1"
elif [ "$flt1" = "." ]; then
gblscore1="0"
else
echo "Check Gblocks scores!"
fi
if [ "$flt2" = "#" ]; then
gblscore2="1"
elif [ "$flt2" = "." ]; then
gblscore2="0"
else
echo "Check Gblocks scores!"
fi
# get amino acid and Gblocks score
seq1=$( sed "${realA}q;d" $TMP/$MSA/$GUIDANCEMSA-$GUIDANCECUT-$MSAMETHOD/${msa1}.fa.${ORGANISM}.ref | awk '{print $1$2}' )
seq2=$( sed "${realB}q;d" $TMP/$MSA/$GUIDANCEMSA-$GUIDANCECUT-$MSAMETHOD/${msa2}.fa.${ORGANISM}.ref | awk '{print $1$2}' )
flt1=$( sed "${realA}q;d" $TMP/$MSA/$GUIDANCEMSA-$GUIDANCECUT-$MSAMETHOD/${msa1}.fa.${ORGANISM}.ref | awk '{print $3}' )
flt2=$( sed "${realB}q;d" $TMP/$MSA/$GUIDANCEMSA-$GUIDANCECUT-$MSAMETHOD/${msa2}.fa.${ORGANISM}.ref | awk '{print $3}' )

# Convert Gblocks scores to numbers (. = 0 ; # = 1)
if [ "$flt1" = "#" ]; then
gblscore1="1"
elif [ "$flt1" = "." ]; then
gblscore1="0"
else
echo "Check Gblocks scores!"
fi
if [ "$flt2" = "#" ]; then
gblscore2="1"
elif [ "$flt2" = "." ]; then
gblscore2="0"
else
echo "Check Gblocks scores!"
fi

### Calculate mean Gblocks score. Do we really need this?
#gblscore=$(printf "${gblscore1}\n $gblscore2" | datamash mean 1)
### Calculate mean Gblocks score. Do we really need this?
#gblscore=$(printf "${gblscore1}\n $gblscore2" | datamash mean 1)

# Extract MSA columns for the co-evolving amino acids
echo -e "Extracting $colA-$colB"
cat ./msa/$msa1.fa | seqkit subseq -r ${colA}:${colA} | seqkit sort -o columnStats/$msa1-$colA.txt
cat ./msa/$msa2.fa | seqkit subseq -r ${colB}:${colB} | seqkit sort -o columnStats/$msa2-$colB.txt

columnTotalA=$(sed '/^>/d' columnStats/$msa1-$colA.txt | wc -l)
columnGapsA=$(sed '/^>/d' columnStats/$msa1-$colA.txt | grep "-" | wc -l)
roundPerGapA=$(printf "%1.5f" `echo "1 - ${columnGapsA}/${columnTotalA}" | bc -l`)

columnNoGapsA=$(sed '/^>/d' columnStats/$msa1-$colA.txt | grep -v "-" | wc -l)
columnUniqueA=$(sed '/^>/d' columnStats/$msa1-$colA.txt | grep -v "-" | sort | uniq -c | sort -n -r | sed -n 1p | awk '{ print $1 }')
roundDivResA=$(printf "%1.5f" `echo "1 - ${columnUniqueA}/${columnNoGapsA}" | bc -l`)

columnTotalB=$(sed '/^>/d' columnStats/$msa2-$colB.txt | wc -l)
columnGapsB=$(sed '/^>/d' columnStats/$msa2-$colB.txt | grep "-" | wc -l)
roundPerGapB=$(printf "%1.5f" `echo "1 - ${columnGapsB}/${columnTotalB}" | bc -l`)

columnNoGapsB=$(sed '/^>/d' columnStats/$msa2-$colB.txt | grep -v "-" | wc -l)
columnUniqueB=$(sed '/^>/d' columnStats/$msa2-$colB.txt | grep -v "-" | sort | uniq -c | sort -n -r | sed -n 1p | awk '{ print $1 }')
roundDivResB=$(printf "%1.5f" `echo "1 - ${columnUniqueB}/${columnNoGapsB}" | bc -l`)

# Calculate gaps and diversity
GapsAB=$(printf "${roundPerGapA}\n $roundPerGapB" | datamash mean 1 )
DivsAB=$(printf "${roundDivResA}\n $roundDivResB" | datamash mean 1 )
# Extract MSA columns for the co-evolving amino acids
echo -e "Extracting $colA-$colB"
cat ./msa/$msa1.fa | seqkit subseq -r ${colA}:${colA} | seqkit sort -o columnStats/$msa1-$colA.txt
cat ./msa/$msa2.fa | seqkit subseq -r ${colB}:${colB} | seqkit sort -o columnStats/$msa2-$colB.txt

columnTotalA=$(sed '/^>/d' columnStats/$msa1-$colA.txt | wc -l)
columnGapsA=$(sed '/^>/d' columnStats/$msa1-$colA.txt | grep "-" | wc -l)
roundPerGapA=$(printf "%1.5f" `echo "1 - ${columnGapsA}/${columnTotalA}" | bc -l`)

columnNoGapsA=$(sed '/^>/d' columnStats/$msa1-$colA.txt | grep -v "-" | wc -l)
columnUniqueA=$(sed '/^>/d' columnStats/$msa1-$colA.txt | grep -v "-" | sort | uniq -c | sort -n -r | sed -n 1p | awk '{ print $1 }')
roundDivResA=$(printf "%1.5f" `echo "1 - ${columnUniqueA}/${columnNoGapsA}" | bc -l`)

columnTotalB=$(sed '/^>/d' columnStats/$msa2-$colB.txt | wc -l)
columnGapsB=$(sed '/^>/d' columnStats/$msa2-$colB.txt | grep "-" | wc -l)
roundPerGapB=$(printf "%1.5f" `echo "1 - ${columnGapsB}/${columnTotalB}" | bc -l`)

columnNoGapsB=$(sed '/^>/d' columnStats/$msa2-$colB.txt | grep -v "-" | wc -l)
columnUniqueB=$(sed '/^>/d' columnStats/$msa2-$colB.txt | grep -v "-" | sort | uniq -c | sort -n -r | sed -n 1p | awk '{ print $1 }')
roundDivResB=$(printf "%1.5f" `echo "1 - ${columnUniqueB}/${columnNoGapsB}" | bc -l`)

# Calculate gaps and diversity
GapsAB=$(printf "${roundPerGapA}\n $roundPerGapB" | datamash mean 1 )
DivsAB=$(printf "${roundDivResA}\n $roundDivResB" | datamash mean 1 )

# Include correlation threshold as well. Just in case, make sure value is not in scientific format
corrT1=$(printf "%1.10f" `sed -n '2p' ${msa1}.fa_${msa2}.fa-coev_inter.csv | awk '{print $6}'`)
corrT2=$(printf "%1.10f" `sed -n '2p' ${msa2}.fa_${msa1}.fa-coev_inter.csv | awk '{print $6}'`)
corrT=$(printf "${corrT1}\n $corrT2" | datamash mean 1)
# Include correlation threshold as well. Just in case, make sure value is not in scientific format
corrT1=$(printf "%1.10f" `sed -n '2p' ${msa1}.fa_${msa2}.fa-coev_inter.csv | awk '{print $6}'`)
corrT2=$(printf "%1.10f" `sed -n '2p' ${msa2}.fa_${msa1}.fa-coev_inter.csv | awk '{print $6}'`)
corrT=$(printf "${corrT1}\n $corrT2" | datamash mean 1)

# Calculate coevolutionary correlation, normalized to threshold. For normalization:
# https://www.mathworks.com/matlabcentral/answers/322438-normalize-data-with-a-threshold
# Thanks to Dian Dimitrov.
normC=$(printf "%1.10f" `echo "($corr - $corrT)/(1 - $corrT)" |bc -l`)
# Calculate coevolutionary correlation, normalized to threshold. For normalization:
# https://www.mathworks.com/matlabcentral/answers/322438-normalize-data-with-a-threshold
# Thanks to Dian Dimitrov.
normC=$(printf "%1.10f" `echo "($corr - $corrT)/(1 - $corrT)" |bc -l`)
else
# Make it echo sth else...
# Make it echo sth else...
echo "No amino acid pairs passed the Bonferroni correction for $msa1 $msa2!"
fi
echo "$msa1 $msa2 $colA $realA $colB $realB $seq1 $seq2 $gblscore1 $gblscore2 $GapsAB $DivsAB $corrT $corr $normC $boot $p_value $bonferroni $holm $bh $hochberg $hommel $by $fdr" >> bothWays-corrected-columns.tsv
echo "$msa1 $msa2 $colA $realA $colB $realB $seq1 $seq2 $gblscore1 $gblscore2 $GapsAB $DivsAB $corrT $corr $normC $boot $p_value $bonferroni $holm $bh $hochberg $hommel $by $fdr" >> $TMP/$RESULTS/allResidues.tsv
done
sed -i "1i msa1 msa2 colA realA colB realB seq1 seq2 gblscore1 gblscore2 GapsAB DivsAB corrT corr normC boot p_value bonferroni holm bh hochberg hommel by fdr" bothWays-corrected-columns.tsv
done
sed -i "1i msa1 msa2 colA realA colB realB seq1 seq2 gblscore1 gblscore2 GapsAB DivsAB corrT corr normC boot p_value bonferroni holm bh hochberg hommel by fdr" bothWays-corrected-columns.tsv
elif [ ! -d "$coevPair" ]; then
echo "No protein pairs"
else
echo "Something went wrong!"
fi
cd ..
}

# Export column statistics
protein_pairs_stats() {
local coevPair="${1}"
cd $coevPair
if [ -d "$coevPair" ]; then
cd $coevPair

# Define UniProt numbers of the two proteins
msa_1=$(sed -n '2p' bothWays-corrected-columns.tsv | awk '{print $1}')
msa_2=$(sed -n '2p' bothWays-corrected-columns.tsv | awk '{print $2}')
# Define UniProt numbers of the two proteins
msa_1=$(sed -n '2p' bothWays-corrected-columns.tsv | awk '{print $1}')
msa_2=$(sed -n '2p' bothWays-corrected-columns.tsv | awk '{print $2}')

### Count the number of co-evolving sites from each protein. We have headers, so skip them...
sitesCountA=$(sed 1d bothWays-corrected-columns.tsv | awk '{print $4}' | datamash countunique 1)
sitesCountB=$(sed 1d bothWays-corrected-columns.tsv | awk '{print $6}' | datamash countunique 1)
### Count the number of co-evolving sites from each protein. We have headers, so skip them...
sitesCountA=$(sed 1d bothWays-corrected-columns.tsv | awk '{print $4}' | datamash countunique 1)
sitesCountB=$(sed 1d bothWays-corrected-columns.tsv | awk '{print $6}' | datamash countunique 1)

### Report total comparisons (MSA1*MSA2). They are the same FWD and REV.
totCompar=$(sed -n '2p' ${msa_1}.fa_${msa_2}.fa-coev_inter.csv | awk '{print $4}')
### Report total comparisons (MSA1*MSA2). They are the same FWD and REV.
totCompar=$(sed -n '2p' ${msa_1}.fa_${msa_2}.fa-coev_inter.csv | awk '{print $4}')

### Report correlation threshold
coevThr=$(sed -n '2p' bothWays-corrected-columns.tsv | awk '{print $13}')
### Report correlation threshold
coevThr=$(sed -n '2p' bothWays-corrected-columns.tsv | awk '{print $13}')

### Include average correlation
avgRa=$(printf "%1.10f" `sed -n '2p' ${msa_1}.fa_${msa_2}.fa-coev_inter.csv | awk '{print $7}'`)
avgRb=$(printf "%1.10f" `sed -n '2p' ${msa_2}.fa_${msa_1}.fa-coev_inter.csv | awk '{print $7}'`)
averR=$(printf "${avgRa}\n $avgRb" | datamash mean 1)
### Include average correlation
avgRa=$(printf "%1.10f" `sed -n '2p' ${msa_1}.fa_${msa_2}.fa-coev_inter.csv | awk '{print $7}'`)
avgRb=$(printf "%1.10f" `sed -n '2p' ${msa_2}.fa_${msa_1}.fa-coev_inter.csv | awk '{print $7}'`)
averR=$(printf "${avgRa}\n $avgRb" | datamash mean 1)

### Include average significant correlation
avgSigRa=$(printf "%1.10f" `sed -n '2p' ${msa_1}.fa_${msa_2}.fa-coev_inter.csv | awk '{print $8}'`)
avgSigRb=$(printf "%1.10f" `sed -n '2p' ${msa_2}.fa_${msa_1}.fa-coev_inter.csv | awk '{print $8}'`)
averSigR=$(printf "${avgSigRa}\n $avgSigRb" | datamash mean 1)
### Include average significant correlation
avgSigRa=$(printf "%1.10f" `sed -n '2p' ${msa_1}.fa_${msa_2}.fa-coev_inter.csv | awk '{print $8}'`)
avgSigRb=$(printf "%1.10f" `sed -n '2p' ${msa_2}.fa_${msa_1}.fa-coev_inter.csv | awk '{print $8}'`)
averSigR=$(printf "${avgSigRa}\n $avgSigRb" | datamash mean 1)

#coevNumber=$(awk '{print $3}' $coevPair/summary-${PVALUE}-${BONFERRONI}.tsv | datamash count 1)

Expand Down Expand Up @@ -376,9 +384,15 @@ protein_pairs_stats() {

# Collect data
#echo "$msa_1 $msa_2 $coevThr $avgCor $avgSignCor $totCompar $coevNumAll $coevMIN $coevMAX $coevMEAN $coevSDEV $coevNumber $cCoevMIN $cCoevMAX $cCoevMEAN $cCoevSDEV $bootMIN $bootMAX $bootMEAN $bootSDEV $pMeanMIN $pMeanMAX $pMeanMEAN $pMeanSDEV $BonferroniMIN $BonferroniMAX $BonferroniMEAN $BonferroniSDEV $gblocksMIN $gblocksMAX $gblocksMEAN $gblocksSDEV $chiboth $chiboth_fin" >> $EOUT
echo "$msa_1 $msa_2 $coevThr $averR $averSigR $totCompar $sitesCountA $sitesCountB $GapsMIN $GapsMAX $GapsMEAN $DivsMIN $DivsMAX $DivsMEAN $cCoevMIN $cCoevMAX $cCoevMEAN $bootMIN $bootMAX $bootMEAN $pMeanMIN $pMeanMAX $pMeanMEAN $BonferroniMIN $BonferroniMAX $BonferroniMEAN $chiboth_fin" >> $EOUT
echo "${msa_1} ${msa_2} added"
cd ..
echo "$msa_1 $msa_2 $coevThr $averR $averSigR $totCompar $sitesCountA $sitesCountB $GapsMIN $GapsMAX $GapsMEAN $DivsMIN $DivsMAX $DivsMEAN $cCoevMIN $cCoevMAX $cCoevMEAN $bootMIN $bootMAX $bootMEAN $pMeanMIN $pMeanMAX $pMeanMEAN $BonferroniMIN $BonferroniMAX $BonferroniMEAN $chiboth_fin" >> $EOUT
echo "${msa_1} ${msa_2} added"
cd ..
elif [ ! -d "$coevPair" ]; then
echo "No protein pairs"
else
echo "Something went wrong!"
fi

}

# Clean up the results, add column headers
Expand Down
12 changes: 0 additions & 12 deletions proteins/17S-DNA-pol.mmus.1001

This file was deleted.

8 changes: 0 additions & 8 deletions proteins/CCT.mmus.CORUM-51.tsv

This file was deleted.

38 changes: 38 additions & 0 deletions proteins/NADH.mmus
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
P03888 Mtnd1 NADH-dehydrogenase
P03899 Mtnd3 NADH-dehydrogenase
P03911 Mtnd4 NADH-dehydrogenase
O35683 Ndufa1 NADH-dehydrogenase
Q99LC3 Ndufa10 NADH-dehydrogenase
Q7TMF3 Ndufa12 NADH-dehydrogenase
Q9ERS2 Ndufa13 NADH-dehydrogenase
Q9CQ75 Ndufa2 NADH-dehydrogenase
Q9CQ91 Ndufa3 NADH-dehydrogenase
Q62425 Ndufa4 NADH-dehydrogenase
Q9CPP6 Ndufa5 NADH-dehydrogenase
Q9CQZ5 Ndufa6 NADH-dehydrogenase
Q9Z1P6 Ndufa7 NADH-dehydrogenase
Q9DCJ5 Ndufa8 NADH-dehydrogenase
Q9DC69 Ndufa9 NADH-dehydrogenase
Q9CR21 Ndufab1 NADH-dehydrogenase
Q9DCS9 Ndufb10 NADH-dehydrogenase
O09111 Ndufb11 NADH-dehydrogenase
Q9CPU2 Ndufb2 NADH-dehydrogenase
Q9CQZ6 Ndufb3 NADH-dehydrogenase
Q9CQC7 Ndufb4 NADH-dehydrogenase
Q9CQH3 Ndufb5 NADH-dehydrogenase
Q3UIU2 Ndufb6 NADH-dehydrogenase
Q9CR61 Ndufb7 NADH-dehydrogenase
Q9D6J5 Ndufb8 NADH-dehydrogenase
Q9CQJ8 Ndufb9 NADH-dehydrogenase
Q9CQ54 Ndufc2 NADH-dehydrogenase
Q91VD9 Ndufs1 NADH-dehydrogenase
Q91WD5 Ndufs2 NADH-dehydrogenase
Q9DCT2 Ndufs3 NADH-dehydrogenase
Q9CXZ1 Ndufs4 NADH-dehydrogenase
Q99LY9 Ndufs5 NADH-dehydrogenase
P52503 Ndufs6 NADH-dehydrogenase
Q9DC70 Ndufs7 NADH-dehydrogenase
Q8K3J1 Ndufs8 NADH-dehydrogenase
Q91YT0 Ndufv1 NADH-dehydrogenase
Q9D6J6 Ndufv2 NADH-dehydrogenase
Q9MD82 mt-Nd5 NADH-dehydrogenase
9 changes: 0 additions & 9 deletions proteins/mRNA.mmus.537

This file was deleted.

2 changes: 1 addition & 1 deletion settings.conf
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ ORGANISM="10090" # Taxid of the reference organism (e.g. "10090" for M. musculus
LEVEL="40674" # Level at which to search for orthologues; 2759 (Eukaryota); 33208 (Metazoa); 7742 (Vertebrata); 32523 (Tetrapoda); 40674 (Mammalia)

## WORKING AND DATABASE DIRS
TMP="/var/tmp/autocoev2" # Working folder
TMP="/var/tmp/autocoev3" # Working folder
DTB="/var/tmp/DB10v1" # Folder where databases are unpacked

## THREADS UTILIZATION
Expand Down

0 comments on commit 6e2565c

Please sign in to comment.