Refactoring, arg list handling, add two parameters (#11)

* (1) Fix handling of ks_peaks.tsv; (2) shorten 19_pan_aug_leftover_merged_prot to 19_palm to reduce length of arglist; (3) move small families to the side in alignment step; (4) correct printing of header for counts per accession * MANY changes in the course of moving alignment & modeling steps to -common.sh. Likely breakages. Incomplete. * Various fixes in alignment and modeling steps, after moving this code to -common * Handle argument list too long errors around augment_cluster_sets.awk * Add Wm82.gnm6 annotation to Glycine pangene config & make file * Remove 04_dag at start of filter, to avoid problems of leftover files from previous runs * Minor cosmetic change in stats/ks_histplots.tsv * Add two params, min_align_count min_annots_in_align, to the reporting in the summarize step * Fix listing of pandagma_conf_params reported in summarize step, and other minor cleanup * Add two alignment parameters to config files; and minor changes to pandagma-fsup.sh to work with the pandagma driver script and pandagma-common.sh * Changes in step xfr_aligns_trees to handle output from fsup * Remove optional steps align_cds, align_protein, model_and_trim, calc_trees, and xfr_aligns_trees from commandlist in -pan and -fam workflows * Bug fix & documentation tweaks in pandagma-fsup.sh * Minor change to handling of directory/file transfer in pandagma-fsup.sh * Minor change in stats reporting * Restructure ks_filter step so that ks_block_wgd_cutoff and max_pair_ks are used if stats/ks_histplots.tsv isn't provided * Handle transfer of 19_pan_aug_leftover_merged_prot in -common rather than in -fam * Tweak output of counts-per-accession header in stats.txt file
legumeinfo · Mar 1, 2024 · 030b8ad · 030b8ad
1 parent 80556c0
commit 030b8ad
Show file tree

Hide file tree

Showing 30 changed files with 796 additions and 246 deletions.
diff --git a/batch_fam_example_singularity.sh b/batch_fam_example_singularity.sh
@@ -39,7 +39,7 @@ singularity exec $IMAGE pandagma fam -c $CONFIG
 #singularity exec $IMAGE pandagma fam -c $CONFIG -s consense
 #singularity exec $IMAGE pandagma fam -c $CONFIG -s cluster_rest
 #singularity exec $IMAGE pandagma fam -c $CONFIG -s add_extra
-#singularity exec $IMAGE pandagma fam -c $CONFIG -s align
+#singularity exec $IMAGE pandagma fam -c $CONFIG -s align_protein
 #singularity exec $IMAGE pandagma fam -c $CONFIG -s model_and_trim
 #singularity exec $IMAGE pandagma fam -c $CONFIG -s calc_trees
 #singularity exec $IMAGE pandagma fam -c $CONFIG -s summarize

diff --git a/batch_fam_prod.sh b/batch_fam_prod.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+#SBATCH -A m4440
+#SBATCH -q regular
+#SBATCH -N 1
+#SBATCH -n 30    #  number of cores/tasks in this job
+#SBATCH -t 23:00:00
+#SBATCH -C cpu
+#SBATCH -J pand-fam2
+#SBATCH -o %x_%j.out
+#SBATCH -e %x_%j.err
+
+set -o errexit
+set -o nounset
+set -o xtrace
+
+date   # print timestamp
+
+# If using conda environment for dependencies:
+module load conda
+conda activate pandagma
+
+PDGPATH=$PWD
+CONFIG=$PWD/config/family3_22_3.conf
+
+echo "Config: $CONFIG"
+
+export PATH=$PATH:$PDGPATH/bin
+echo "PATH: $PATH"
+
+##########
+# Test PATH
+which pandagma
+which calc_ks_from_dag.pl
+
+##########
+## Fetch relevant data files; e.g.
+#mkdir -p data
+#make -C data -f $PWD/get_data/family3_22_3.mk
+
+##########
+## Filter transposable elements
+#pandagma TEfilter -c $CONFIG
+
+##########
+## Run all main steps, assuming input data files exist in ./data
+## Work directory will be ./work_pandagma
+## Output will go to ./out_pandagma
+#pandagma fam -c $CONFIG -d data_TEfilter
+
+##########
+## Run individual steps
+#pandagma fam -c $CONFIG -s ingest -d data_TEfilter
+#pandagma fam -c $CONFIG -s mmseqs
+#pandagma fam -c $CONFIG -s filter
+#pandagma fam -c $CONFIG -s dagchainer
+#pandagma fam -c $CONFIG -s ks_calc
+#pandagma fam -c $CONFIG -s ks_filter
+#pandagma fam -c $CONFIG -s mcl
+#pandagma fam -c $CONFIG -s consense
+#pandagma fam -c $CONFIG -s cluster_rest
+#pandagma fam -c $CONFIG -s add_extra
+#pandagma fam -c $CONFIG -s tabularize
+#pandagma fam -c $CONFIG -s align_protein
+#pandagma fam -c $CONFIG -s model_and_trim
+#pandagma fam -c $CONFIG -s calc_trees
+pandagma fam -c $CONFIG -s xfr_aligns_trees
+pandagma fam -c $CONFIG -s summarize
+
+##########
+## Optional work-directory cleanup steps
+#pandagma fam -c $CONFIG -s clean
+#rm -rf ./work_pandagma
+
+date   # print timestamp
+
diff --git a/batch_pan_example_conda.sh b/batch_pan_example_conda.sh
@@ -49,7 +49,8 @@ which calc_ks_from_dag.pl
 
 ##########
 # Optional alignment and tree-construction steps
-#pandagma pan -c $CONFIG -s align
+#pandagma pan -c $CONFIG -s align_cds
+#pandagma pan -c $CONFIG -s align_protein
 #pandagma pan -c $CONFIG -s model_and_trim
 #pandagma pan -c $CONFIG -s calc_trees
 pandagma pan -c $CONFIG -s xfr_aligns_trees

diff --git a/bin/fetch-datastore.sh b/bin/fetch-datastore.sh
@@ -8,11 +8,13 @@ readonly DATAFILE=${1}
 
 # adjust URL for collections that are located in the annex
 case ${DATAFILE} in
+  acacr.Acra3RX.gnm1.ann1.6C0V.*|\
   arahy.Tifrunner.gnm1.ann2.TN8K.*|\
   arath.Col0.gnm9.ann11.KH24.*|\
   bauva.BV-YZ2020.gnm2.ann1.RJ1G.*|\
   chafa.ISC494698.gnm1.ann1.G7XW.*|\
   dalod.SKLTGB.gnm1.ann1.R67B.*|\
+  phach.longxuteng.gnm1.ann1.KGX9.*|\
   prupe.Lovell.gnm2.ann1.S2ZZ.*|\
   quisa.S10.gnm1.ann1.RQ4J.*|\
   sento.Myeongyun.gnm1.ann1.5WXB.*|\
@@ -30,6 +32,7 @@ collection_type=annotations
 
 case ${genspa} in
   [A-Z]*) genus=${genspa} species=GENUS collection_type=pangenes collection=${1%.*.*.*} ;;
+  acacr) genus=Acacia species=crassicarpa ;;
   aesev) genus=Aeschynomene species=evenia ;;
   aradu) genus=Arachis species=duranensis ;;
   arahy) genus=Arachis species=hypogaea ;;
@@ -52,13 +55,15 @@ case ${genspa} in
   glyso) genus=Glycine species=soja ;;
   glyst) genus=Glycine species=stenophita ;;
   glysy) genus=Glycine species=syndetika ;;
+  labpu) genus=Lablab species=purpureus ;;
   legume) genus=LEGUMES species=Fabaceae ;;
   lencu) genus=Lens species=culinaris ;;
   lotja) genus=Lotus species=japonicus ;;
   lupal) genus=Lupinus species=albus ;;
   medsa) genus=Medicago species=sativa ;;
   medtr) genus=Medicago species=truncatula ;;
   phaac) genus=Phaseolus species=acutifolius ;;
+  phach) genus=Phanera species=championii ;;
   phalu) genus=Phaseolus species=lunatus ;;
   phavu) genus=Phaseolus species=vulgaris ;;
   pissa) genus=Pisum species=sativum ;;
@@ -88,6 +93,4 @@ if [[ "$collection" == *"XinJiangDaYe"* ]]; then
   collection="XinJiangDaYe.gnm1.ann1.RKB9"
 fi
 
-#echo "${DATASTORE}/${genus}/${species}/${collection_type}/${collection}/${DATAFILE}"
-
 curl --no-progress-meter --fail "${DATASTORE}/${genus}/${species}/${collection_type}/${collection}/${DATAFILE}"
diff --git a/bin/pandagma-common.sh b/bin/pandagma-common.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-version="2023-01-28"
+version="2023-02-29"
 set -o errexit -o errtrace -o nounset -o pipefail -o posix
 
 trap 'echo ${0##*/}:${LINENO} ERROR executing command: ${BASH_COMMAND}' ERR
@@ -52,44 +52,210 @@ calc_seq_stats () {
           }'
 }
 
+##########
+run_align_cds() {
+  cd "${WORK_DIR}" || exit
+
+  echo "== Align CDS sequences =="
+
+  # If not already present, retrieve sequences for each family, preparatory to aligning them
+  # 19_pan_aug_leftover_merged_cds
+  if [[ -d 19_palmc ]]; then
+    : # do nothing; the directory and file(s) exist
+  else
+    mkdir -p 19_palmc
+    echo "  For each pan-gene set, retrieve sequences into a multifasta file."
+    get_fasta_from_family_file.pl "${cds_files[@]}" "${cds_files_extra_constr[@]}" "${cds_files_extra_free[@]}" \
+      -fam 18_syn_pan_aug_extra.clust.tsv -out 19_palmc
+  fi
+
+  echo; echo "== Move small families to the side =="
+  mkdir -p 19_palmc_small
+  # Below, count_seqs = number of sequences in the alignment; count_annots = number of unique annotation groups.
+  min_annots_in_align=2 # require at least this many distinct annotation groups in an alignment to retain it.
+  for filepath in 19_palmc/*; do
+    file=$(basename "$filepath")
+    count_seqs=$(awk '$1!~/>/ {print FILENAME "\t" $1}' "$filepath" | wc -l);
+    count_annots=$( perl -lane 'BEGIN{$REX=qr($ENV{"ANN_REX"})}; if($F[0]=~/$REX/){ print $1 }' $filepath | sort -u | wc -l)
+    if [[ $count_seqs -lt $min_align_count ]] || [[ $count_annots -lt $min_annots_in_align ]]; then
+      echo "Set aside small family $file; $count_seqs sequences, $count_annots distinct annotations";
+      mv "$filepath" 19_palmc_small/
+    fi;
+  done
+
+  echo; echo "== Align nucleotide sequence for each gene family =="
+  mkdir -p 20_aligns_cds
+  for filepath in 19_palmc/*; do
+    file=$(basename "$filepath");
+    echo "  Computing alignment, using program famsa, for file $file"
+    famsa -t 2 19_palmc/"$file" 20_aligns_cds/"$file" 1>/dev/null &
+    if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi
+  done
+  wait
+}
+
+##########
+run_align_protein() {
+  cd "${WORK_DIR}" || exit
+
+  echo "== Align protein sequences =="
+
+  # If not already present, retrieve sequences for each family, preparatory to aligning them
+  if [[ -d 19_palmp ]]; then
+    : # do nothing; the directory and file(s) exist
+  else
+    mkdir -p 19_palmp
+    echo "  For each pan-gene set, retrieve sequences into a multifasta file."
+    get_fasta_from_family_file.pl "${protein_files[@]}" "${protein_files_extra[@]}" \
+      "${protein_files_extra_constr[@]}" "${protein_files_extra_free[@]}" \
+      -fam 18_syn_pan_aug_extra.clust.tsv -out 19_palmp
+  fi
+
+  echo; echo "== Move small families to the side =="
+  mkdir -p 19_palmp_small
+  # Below, count_seqs = number of unique sequences in the alignment; count_annots = # of unique annotation groups.
+  min_annots_in_align=2 # require at least this many distinct annotation groups in an alignment to retain it.
+  for filepath in 19_palmp/*; do
+    file=$(basename "$filepath")
+    count_seqs=$(awk '$1!~/>/ {print FILENAME "\t" $1}' "$filepath" | sort -u | wc -l);
+    count_annots=$( perl -lane 'BEGIN{$REX=qr($ENV{"ANN_REX"})}; if($F[0]=~/$REX/){ print $1 }' $filepath | sort -u | wc -l)
+    if [[ $count_seqs -lt $min_align_count ]] || [[ $count_annots -lt $min_annots_in_align ]]; then
+      echo "Set aside small family $file; $count_seqs sequences, $count_annots annotations";
+      mv "$filepath" 19_palmp_small/
+    fi;
+  done
+
+  echo; echo "== Align protein sequence for the each gene family =="
+  mkdir -p 20_aligns_prot
+  for filepath in 19_palmp/*; do
+    file=$(basename "$filepath");
+    # echo "  Computing alignment, using program famsa, for file $file"
+    famsa -t 2 19_palmp/"$file" 20_aligns_prot/"$file" 1>/dev/null &
+    if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi
+  done
+  wait
+}
+
+##########
+run_model_and_trim() {
+  echo; echo "== Build HMMs =="
+  cd "${WORK_DIR}" || exit
+  mkdir -p 21_hmm
+  for filepath in 20_aligns_prot/*; do
+    file=$(basename "$filepath");
+    hmmbuild -n "$file" 21_hmm/"$file" "$filepath" 1>/dev/null &
+    if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi
+  done
+  wait
+
+  echo; echo "== Realign to HMMs =="
+  mkdir -p 22_hmmalign
+  for filepath in 21_hmm/*; do
+    file=$(basename "$filepath");
+    # printf "%s " "$file"
+    hmmalign --trim --outformat A2M -o 22_hmmalign/"$file" 21_hmm/"$file" 19_palmp/"$file" &
+    if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi
+  done
+  wait
+  echo
+
+  echo; echo "== Trim HMM alignments to match-states =="
+  mkdir -p 23_hmmalign_trim1
+  for filepath in 22_hmmalign/*; do
+    file=$(basename "$filepath");
+    # printf "%s " "$file"
+    perl -ne 'if ($_ =~ />/) {print $_} else {$line = $_; $line =~ s/[a-z]//g; print $line}' "$filepath" |
+      sed '/^$/d' > 23_hmmalign_trim1/"$file" &
+    if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi
+  done
+  wait
+  echo
+
+  echo; echo "== Filter alignments prior to tree calculation =="
+  mkdir -p 23_hmmalign_trim2 23_hmmalign_trim2_log
+  min_depth=3
+  min_pct_depth=20
+  min_pct_aligned=20
+  for filepath in 23_hmmalign_trim1/*; do
+    file=$(basename "$filepath")
+    # printf "%s " "$file"
+    filter_align.pl -in "$filepath" -out 23_hmmalign_trim2/"$file" -log 23_hmmalign_trim2_log/"$file" \
+                    -depth $min_depth -pct_depth $min_pct_depth -min_pct_aligned $min_pct_aligned &
+    if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi
+  done
+  wait
+  echo
+}
+
 run_calc_trees() {
   case $scriptname in
-    'pandagma pan') fasttree_opts='-nt'; dir_prefix=2 ;;
-    'pandagma fam') fasttree_opts=''; dir_prefix=2 ;;
-    'pandagma fsup') fasttree_opts=''; dir_prefix=4
+    'pandagma pan')  dir_prefix=2 ;;
+    'pandagma fam')  dir_prefix=2 ;;
+    'pandagma fsup') dir_prefix=4
   esac
   echo; echo "== Calculate trees =="
   cd "${WORK_DIR}"
 
   mkdir -p "${dir_prefix}4_trees"
 
-  echo; echo "== Move small (<4) and very low-entropy families (sequences are all identical) to the side =="
-  mkdir -p "${dir_prefix}3_pan_aug_small_or_identical"
-  min_seq_count=4
-  # Below, "count" is the number of unique sequences in the alignment.
-  for filepath in "${dir_prefix}3_hmmalign_trim2"/*; do
-    file=$(basename "$filepath")
-    count=$(awk '$1!~/>/ {print FILENAME "\t" $1}' "$filepath" | sort -u | wc -l);
-    if [[ $count -lt $min_seq_count ]]; then
-      echo "Set aside small or low-entropy family $file";
-      mv "$filepath" "${dir_prefix}3_pan_aug_small_or_identical"/
-    fi;
-  done
-
   # By default, FastTreeMP uses all available threads.
   # It is more efficient to run more jobs on one core each by setting an environment variable.
   OMP_NUM_THREADS=1
   export OMP_NUM_THREADS
   for filepath in "${dir_prefix}3_hmmalign_trim2"/*; do
     file=$(basename "$filepath")
     echo "  Calculating tree for $file"
-    fasttree "${fasttree_opts}" -quiet "$filepath" > "${dir_prefix}4_trees/$file" &
+    if [[ "$scriptname" =~ "pandagma pan" ]]; then  # pan; calculate from nucleotide alignments
+      echo "fasttree -nt -quiet $filepath > ${dir_prefix}4_trees/$file"
+      fasttree -quiet -nt "$filepath" > "${dir_prefix}4_trees/$file"
+    else # fam or fsup; calculate from protein alignments
+      echo "fasttree -quiet $filepath > ${dir_prefix}4_trees/$file"
+      fasttree -quiet "$filepath" > "${dir_prefix}4_trees/$file" &
+    fi
     # allow to execute up to $NPROC concurrent asynchronous processes
     if [[ $(jobs -r -p | wc -l) -ge ${NPROC} ]]; then wait -n; fi
   done
   wait
 }
 
+run_xfr_aligns_trees() {
+  echo; echo "Copy alignment and tree results into output directory"
+
+  full_out_dir="${out_dir}"
+  cd "${submit_dir}" || exit
+
+  if [ ! -d "$full_out_dir" ]; then
+      echo "creating output directory \"${full_out_dir}/\""
+      mkdir -p "$full_out_dir"
+  fi
+
+  if [[ "$scriptname" =~ "pandagma pan" ]] || [[ "$scriptname" =~ "pandagma fam" ]]; then
+    for dir in 19_pan_aug_leftover_merged_prot 20_align* 21_hmm 22_hmmalign 23_hmmalign_trim2 24_trees; do
+      if [ -d "${WORK_DIR}/$dir" ]; then
+        echo "Copying directory $dir to output directory"
+        cp -r "${WORK_DIR}/$dir" "${full_out_dir}"/
+      else
+        echo "Warning: couldn't find dir ${WORK_DIR}/$dir; skipping"
+      fi
+    done
+  elif [[ "$scriptname" =~ "pandagma fsup" ]]; then
+    for dir in 35_sup_in_fams_prot 42_hmmalign 43_hmmalign_trim2 44_trees; do
+      if [ -d "${WORK_DIR}/$dir" ]; then
+        echo "Copying directory $dir to output directory"
+        cp -r "${WORK_DIR}/$dir" "${full_out_dir}"/
+      else
+        echo "Warning: couldn't find dir ${WORK_DIR}/$dir; skipping"
+      fi
+    done
+  else 
+    echo "Exiting, as $scriptname is not one of pan, fam, or fsup"
+    exit 1
+  fi
+
+  # Remove zero-length files in output directory
+    find "${full_out_dir}" -size 0 -print -delete
+}
+
 main_pan_fam() {
   if [ "$#" -eq 0 ]; then
     echo >&2 "$HELP_DOC" && exit 0;
@@ -134,7 +300,7 @@ main_pan_fam() {
   export fam_dir
 
   case ${step} in 
-    all|summarize|xfr_aligns_trees) : "${out_dir:=out_pandagma}" ;; # default to ./pandagma_out 
+    all|summarize|xfr_aligns_trees) : "${out_dir:=out_pandagma}" ;; # default to ./out_pandagma 
     *) if [ "${out_dir:-}" ]; then
          printf '\noption -o OUT_DIR not applicable to step %s\n' "$step" >&2
          exit 1