Skip to content

Commit

Permalink
Refactoring, arg list handling, add two parameters (#11)
Browse files Browse the repository at this point in the history
* (1) Fix handling of ks_peaks.tsv; (2) shorten 19_pan_aug_leftover_merged_prot to 19_palm to reduce length of arglist; (3) move small families to the side in alignment step; (4) correct printing of header for counts per accession

* MANY changes in the course of moving alignment & modeling steps to -common.sh. Likely breakages. Incomplete.

* Various fixes in alignment and modeling steps, after moving this code to -common

* Handle argument list too long errors around augment_cluster_sets.awk

* Add Wm82.gnm6 annotation to Glycine pangene config & make file

* Remove 04_dag at start of filter, to avoid problems of leftover files from previous runs

* Minor cosmetic change in stats/ks_histplots.tsv

* Add two params, min_align_count min_annots_in_align, to the reporting in the summarize step

* Fix listing of pandagma_conf_params reported in summarize step, and other minor cleanup

* Add two alignment parameters to config files; and minor changes to pandagma-fsup.sh to work with the pandagma driver script and pandagma-common.sh

* Changes in step xfr_aligns_trees to handle output from fsup

* Remove optional steps align_cds, align_protein, model_and_trim, calc_trees, and xfr_aligns_trees from commandlist in -pan and -fam workflows

* Bug fix & documentation tweaks in pandagma-fsup.sh

* Minor change to handling of directory/file transfer in pandagma-fsup.sh

* Minor change in stats reporting

* Restructure ks_filter step so that ks_block_wgd_cutoff and max_pair_ks are used if stats/ks_histplots.tsv isn't provided

* Handle transfer of 19_pan_aug_leftover_merged_prot in -common rather than in -fam

* Tweak output of counts-per-accession header in stats.txt file
  • Loading branch information
StevenCannon-USDA authored Mar 1, 2024
1 parent 80556c0 commit 030b8ad
Show file tree
Hide file tree
Showing 30 changed files with 796 additions and 246 deletions.
2 changes: 1 addition & 1 deletion batch_fam_example_singularity.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ singularity exec $IMAGE pandagma fam -c $CONFIG
#singularity exec $IMAGE pandagma fam -c $CONFIG -s consense
#singularity exec $IMAGE pandagma fam -c $CONFIG -s cluster_rest
#singularity exec $IMAGE pandagma fam -c $CONFIG -s add_extra
#singularity exec $IMAGE pandagma fam -c $CONFIG -s align
#singularity exec $IMAGE pandagma fam -c $CONFIG -s align_protein
#singularity exec $IMAGE pandagma fam -c $CONFIG -s model_and_trim
#singularity exec $IMAGE pandagma fam -c $CONFIG -s calc_trees
#singularity exec $IMAGE pandagma fam -c $CONFIG -s summarize
Expand Down
75 changes: 75 additions & 0 deletions batch_fam_prod.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/bin/bash
#SBATCH -A m4440
#SBATCH -q regular
#SBATCH -N 1
#SBATCH -n 30 # number of cores/tasks in this job
#SBATCH -t 23:00:00
#SBATCH -C cpu
#SBATCH -J pand-fam2
#SBATCH -o %x_%j.out
#SBATCH -e %x_%j.err

set -o errexit
set -o nounset
set -o xtrace

date # print timestamp

# If using conda environment for dependencies:
module load conda
conda activate pandagma

PDGPATH=$PWD
CONFIG=$PWD/config/family3_22_3.conf

echo "Config: $CONFIG"

export PATH=$PATH:$PDGPATH/bin
echo "PATH: $PATH"

##########
# Test PATH
which pandagma
which calc_ks_from_dag.pl

##########
## Fetch relevant data files; e.g.
#mkdir -p data
#make -C data -f $PWD/get_data/family3_22_3.mk

##########
## Filter transposable elements
#pandagma TEfilter -c $CONFIG

##########
## Run all main steps, assuming input data files exist in ./data
## Work directory will be ./work_pandagma
## Output will go to ./out_pandagma
#pandagma fam -c $CONFIG -d data_TEfilter

##########
## Run individual steps
#pandagma fam -c $CONFIG -s ingest -d data_TEfilter
#pandagma fam -c $CONFIG -s mmseqs
#pandagma fam -c $CONFIG -s filter
#pandagma fam -c $CONFIG -s dagchainer
#pandagma fam -c $CONFIG -s ks_calc
#pandagma fam -c $CONFIG -s ks_filter
#pandagma fam -c $CONFIG -s mcl
#pandagma fam -c $CONFIG -s consense
#pandagma fam -c $CONFIG -s cluster_rest
#pandagma fam -c $CONFIG -s add_extra
#pandagma fam -c $CONFIG -s tabularize
#pandagma fam -c $CONFIG -s align_protein
#pandagma fam -c $CONFIG -s model_and_trim
#pandagma fam -c $CONFIG -s calc_trees
pandagma fam -c $CONFIG -s xfr_aligns_trees
pandagma fam -c $CONFIG -s summarize

##########
## Optional work-directory cleanup steps
#pandagma fam -c $CONFIG -s clean
#rm -rf ./work_pandagma

date # print timestamp

3 changes: 2 additions & 1 deletion batch_pan_example_conda.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ which calc_ks_from_dag.pl

##########
# Optional alignment and tree-construction steps
#pandagma pan -c $CONFIG -s align
#pandagma pan -c $CONFIG -s align_cds
#pandagma pan -c $CONFIG -s align_protein
#pandagma pan -c $CONFIG -s model_and_trim
#pandagma pan -c $CONFIG -s calc_trees
pandagma pan -c $CONFIG -s xfr_aligns_trees
Expand Down
7 changes: 5 additions & 2 deletions bin/fetch-datastore.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@ readonly DATAFILE=${1}

# adjust URL for collections that are located in the annex
case ${DATAFILE} in
acacr.Acra3RX.gnm1.ann1.6C0V.*|\
arahy.Tifrunner.gnm1.ann2.TN8K.*|\
arath.Col0.gnm9.ann11.KH24.*|\
bauva.BV-YZ2020.gnm2.ann1.RJ1G.*|\
chafa.ISC494698.gnm1.ann1.G7XW.*|\
dalod.SKLTGB.gnm1.ann1.R67B.*|\
phach.longxuteng.gnm1.ann1.KGX9.*|\
prupe.Lovell.gnm2.ann1.S2ZZ.*|\
quisa.S10.gnm1.ann1.RQ4J.*|\
sento.Myeongyun.gnm1.ann1.5WXB.*|\
Expand All @@ -30,6 +32,7 @@ collection_type=annotations

case ${genspa} in
[A-Z]*) genus=${genspa} species=GENUS collection_type=pangenes collection=${1%.*.*.*} ;;
acacr) genus=Acacia species=crassicarpa ;;
aesev) genus=Aeschynomene species=evenia ;;
aradu) genus=Arachis species=duranensis ;;
arahy) genus=Arachis species=hypogaea ;;
Expand All @@ -52,13 +55,15 @@ case ${genspa} in
glyso) genus=Glycine species=soja ;;
glyst) genus=Glycine species=stenophita ;;
glysy) genus=Glycine species=syndetika ;;
labpu) genus=Lablab species=purpureus ;;
legume) genus=LEGUMES species=Fabaceae ;;
lencu) genus=Lens species=culinaris ;;
lotja) genus=Lotus species=japonicus ;;
lupal) genus=Lupinus species=albus ;;
medsa) genus=Medicago species=sativa ;;
medtr) genus=Medicago species=truncatula ;;
phaac) genus=Phaseolus species=acutifolius ;;
phach) genus=Phanera species=championii ;;
phalu) genus=Phaseolus species=lunatus ;;
phavu) genus=Phaseolus species=vulgaris ;;
pissa) genus=Pisum species=sativum ;;
Expand Down Expand Up @@ -88,6 +93,4 @@ if [[ "$collection" == *"XinJiangDaYe"* ]]; then
collection="XinJiangDaYe.gnm1.ann1.RKB9"
fi

#echo "${DATASTORE}/${genus}/${species}/${collection_type}/${collection}/${DATAFILE}"

curl --no-progress-meter --fail "${DATASTORE}/${genus}/${species}/${collection_type}/${collection}/${DATAFILE}"
204 changes: 185 additions & 19 deletions bin/pandagma-common.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env bash

version="2023-01-28"
version="2023-02-29"
set -o errexit -o errtrace -o nounset -o pipefail -o posix

trap 'echo ${0##*/}:${LINENO} ERROR executing command: ${BASH_COMMAND}' ERR
Expand Down Expand Up @@ -52,44 +52,210 @@ calc_seq_stats () {
}'
}

##########
run_align_cds() {
cd "${WORK_DIR}" || exit

echo "== Align CDS sequences =="

# If not already present, retrieve sequences for each family, preparatory to aligning them
# 19_pan_aug_leftover_merged_cds
if [[ -d 19_palmc ]]; then
: # do nothing; the directory and file(s) exist
else
mkdir -p 19_palmc
echo " For each pan-gene set, retrieve sequences into a multifasta file."
get_fasta_from_family_file.pl "${cds_files[@]}" "${cds_files_extra_constr[@]}" "${cds_files_extra_free[@]}" \
-fam 18_syn_pan_aug_extra.clust.tsv -out 19_palmc
fi

echo; echo "== Move small families to the side =="
mkdir -p 19_palmc_small
# Below, count_seqs = number of sequences in the alignment; count_annots = number of unique annotation groups.
min_annots_in_align=2 # require at least this many distinct annotation groups in an alignment to retain it.
for filepath in 19_palmc/*; do
file=$(basename "$filepath")
count_seqs=$(awk '$1!~/>/ {print FILENAME "\t" $1}' "$filepath" | wc -l);
count_annots=$( perl -lane 'BEGIN{$REX=qr($ENV{"ANN_REX"})}; if($F[0]=~/$REX/){ print $1 }' $filepath | sort -u | wc -l)
if [[ $count_seqs -lt $min_align_count ]] || [[ $count_annots -lt $min_annots_in_align ]]; then
echo "Set aside small family $file; $count_seqs sequences, $count_annots distinct annotations";
mv "$filepath" 19_palmc_small/
fi;
done

echo; echo "== Align nucleotide sequence for each gene family =="
mkdir -p 20_aligns_cds
for filepath in 19_palmc/*; do
file=$(basename "$filepath");
echo " Computing alignment, using program famsa, for file $file"
famsa -t 2 19_palmc/"$file" 20_aligns_cds/"$file" 1>/dev/null &
if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi
done
wait
}

##########
run_align_protein() {
cd "${WORK_DIR}" || exit

echo "== Align protein sequences =="

# If not already present, retrieve sequences for each family, preparatory to aligning them
if [[ -d 19_palmp ]]; then
: # do nothing; the directory and file(s) exist
else
mkdir -p 19_palmp
echo " For each pan-gene set, retrieve sequences into a multifasta file."
get_fasta_from_family_file.pl "${protein_files[@]}" "${protein_files_extra[@]}" \
"${protein_files_extra_constr[@]}" "${protein_files_extra_free[@]}" \
-fam 18_syn_pan_aug_extra.clust.tsv -out 19_palmp
fi

echo; echo "== Move small families to the side =="
mkdir -p 19_palmp_small
# Below, count_seqs = number of unique sequences in the alignment; count_annots = # of unique annotation groups.
min_annots_in_align=2 # require at least this many distinct annotation groups in an alignment to retain it.
for filepath in 19_palmp/*; do
file=$(basename "$filepath")
count_seqs=$(awk '$1!~/>/ {print FILENAME "\t" $1}' "$filepath" | sort -u | wc -l);
count_annots=$( perl -lane 'BEGIN{$REX=qr($ENV{"ANN_REX"})}; if($F[0]=~/$REX/){ print $1 }' $filepath | sort -u | wc -l)
if [[ $count_seqs -lt $min_align_count ]] || [[ $count_annots -lt $min_annots_in_align ]]; then
echo "Set aside small family $file; $count_seqs sequences, $count_annots annotations";
mv "$filepath" 19_palmp_small/
fi;
done

echo; echo "== Align protein sequence for the each gene family =="
mkdir -p 20_aligns_prot
for filepath in 19_palmp/*; do
file=$(basename "$filepath");
# echo " Computing alignment, using program famsa, for file $file"
famsa -t 2 19_palmp/"$file" 20_aligns_prot/"$file" 1>/dev/null &
if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi
done
wait
}

##########
run_model_and_trim() {
echo; echo "== Build HMMs =="
cd "${WORK_DIR}" || exit
mkdir -p 21_hmm
for filepath in 20_aligns_prot/*; do
file=$(basename "$filepath");
hmmbuild -n "$file" 21_hmm/"$file" "$filepath" 1>/dev/null &
if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi
done
wait

echo; echo "== Realign to HMMs =="
mkdir -p 22_hmmalign
for filepath in 21_hmm/*; do
file=$(basename "$filepath");
# printf "%s " "$file"
hmmalign --trim --outformat A2M -o 22_hmmalign/"$file" 21_hmm/"$file" 19_palmp/"$file" &
if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi
done
wait
echo

echo; echo "== Trim HMM alignments to match-states =="
mkdir -p 23_hmmalign_trim1
for filepath in 22_hmmalign/*; do
file=$(basename "$filepath");
# printf "%s " "$file"
perl -ne 'if ($_ =~ />/) {print $_} else {$line = $_; $line =~ s/[a-z]//g; print $line}' "$filepath" |
sed '/^$/d' > 23_hmmalign_trim1/"$file" &
if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi
done
wait
echo

echo; echo "== Filter alignments prior to tree calculation =="
mkdir -p 23_hmmalign_trim2 23_hmmalign_trim2_log
min_depth=3
min_pct_depth=20
min_pct_aligned=20
for filepath in 23_hmmalign_trim1/*; do
file=$(basename "$filepath")
# printf "%s " "$file"
filter_align.pl -in "$filepath" -out 23_hmmalign_trim2/"$file" -log 23_hmmalign_trim2_log/"$file" \
-depth $min_depth -pct_depth $min_pct_depth -min_pct_aligned $min_pct_aligned &
if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi
done
wait
echo
}

run_calc_trees() {
case $scriptname in
'pandagma pan') fasttree_opts='-nt'; dir_prefix=2 ;;
'pandagma fam') fasttree_opts=''; dir_prefix=2 ;;
'pandagma fsup') fasttree_opts=''; dir_prefix=4
'pandagma pan') dir_prefix=2 ;;
'pandagma fam') dir_prefix=2 ;;
'pandagma fsup') dir_prefix=4
esac
echo; echo "== Calculate trees =="
cd "${WORK_DIR}"

mkdir -p "${dir_prefix}4_trees"

echo; echo "== Move small (<4) and very low-entropy families (sequences are all identical) to the side =="
mkdir -p "${dir_prefix}3_pan_aug_small_or_identical"
min_seq_count=4
# Below, "count" is the number of unique sequences in the alignment.
for filepath in "${dir_prefix}3_hmmalign_trim2"/*; do
file=$(basename "$filepath")
count=$(awk '$1!~/>/ {print FILENAME "\t" $1}' "$filepath" | sort -u | wc -l);
if [[ $count -lt $min_seq_count ]]; then
echo "Set aside small or low-entropy family $file";
mv "$filepath" "${dir_prefix}3_pan_aug_small_or_identical"/
fi;
done

# By default, FastTreeMP uses all available threads.
# It is more efficient to run more jobs on one core each by setting an environment variable.
OMP_NUM_THREADS=1
export OMP_NUM_THREADS
for filepath in "${dir_prefix}3_hmmalign_trim2"/*; do
file=$(basename "$filepath")
echo " Calculating tree for $file"
fasttree "${fasttree_opts}" -quiet "$filepath" > "${dir_prefix}4_trees/$file" &
if [[ "$scriptname" =~ "pandagma pan" ]]; then # pan; calculate from nucleotide alignments
echo "fasttree -nt -quiet $filepath > ${dir_prefix}4_trees/$file"
fasttree -quiet -nt "$filepath" > "${dir_prefix}4_trees/$file"
else # fam or fsup; calculate from protein alignments
echo "fasttree -quiet $filepath > ${dir_prefix}4_trees/$file"
fasttree -quiet "$filepath" > "${dir_prefix}4_trees/$file" &
fi
# allow to execute up to $NPROC concurrent asynchronous processes
if [[ $(jobs -r -p | wc -l) -ge ${NPROC} ]]; then wait -n; fi
done
wait
}

run_xfr_aligns_trees() {
echo; echo "Copy alignment and tree results into output directory"

full_out_dir="${out_dir}"
cd "${submit_dir}" || exit

if [ ! -d "$full_out_dir" ]; then
echo "creating output directory \"${full_out_dir}/\""
mkdir -p "$full_out_dir"
fi

if [[ "$scriptname" =~ "pandagma pan" ]] || [[ "$scriptname" =~ "pandagma fam" ]]; then
for dir in 19_pan_aug_leftover_merged_prot 20_align* 21_hmm 22_hmmalign 23_hmmalign_trim2 24_trees; do
if [ -d "${WORK_DIR}/$dir" ]; then
echo "Copying directory $dir to output directory"
cp -r "${WORK_DIR}/$dir" "${full_out_dir}"/
else
echo "Warning: couldn't find dir ${WORK_DIR}/$dir; skipping"
fi
done
elif [[ "$scriptname" =~ "pandagma fsup" ]]; then
for dir in 35_sup_in_fams_prot 42_hmmalign 43_hmmalign_trim2 44_trees; do
if [ -d "${WORK_DIR}/$dir" ]; then
echo "Copying directory $dir to output directory"
cp -r "${WORK_DIR}/$dir" "${full_out_dir}"/
else
echo "Warning: couldn't find dir ${WORK_DIR}/$dir; skipping"
fi
done
else
echo "Exiting, as $scriptname is not one of pan, fam, or fsup"
exit 1
fi

# Remove zero-length files in output directory
find "${full_out_dir}" -size 0 -print -delete
}

main_pan_fam() {
if [ "$#" -eq 0 ]; then
echo >&2 "$HELP_DOC" && exit 0;
Expand Down Expand Up @@ -134,7 +300,7 @@ main_pan_fam() {
export fam_dir

case ${step} in
all|summarize|xfr_aligns_trees) : "${out_dir:=out_pandagma}" ;; # default to ./pandagma_out
all|summarize|xfr_aligns_trees) : "${out_dir:=out_pandagma}" ;; # default to ./out_pandagma
*) if [ "${out_dir:-}" ]; then
printf '\noption -o OUT_DIR not applicable to step %s\n' "$step" >&2
exit 1
Expand Down
Loading

0 comments on commit 030b8ad

Please sign in to comment.