From d16b7255a4fc82f1398d2305969140d44bf8d938 Mon Sep 17 00:00:00 2001 From: Steven Cannon Date: Wed, 21 Feb 2024 06:00:09 -0800 Subject: [PATCH 01/18] (1) Fix handling of ks_peaks.tsv; (2) shorten 19_pan_aug_leftover_merged_prot to 19_palm to reduce length of arglist; (3) move small families to the side in alignment step; (4) correct printing of header for counts per accession --- README.md | 2 + bin/fetch-datastore.sh | 7 +- bin/pandagma-common.sh | 5 +- bin/pandagma-fam.sh | 69 +++++++++----- bin/pandagma-pan.sh | 13 +++ config/family3_22_3.conf | 147 ++++++++++++++++++++++++++++++ get_data/family3_22_3.mk | 80 ++++++++++++++++ manifests/MANIFEST_output_fam.yml | 2 +- 8 files changed, 295 insertions(+), 30 deletions(-) create mode 100644 config/family3_22_3.conf create mode 100644 get_data/family3_22_3.mk diff --git a/README.md b/README.md index ba68377..cd7928a 100644 --- a/README.md +++ b/README.md @@ -236,6 +236,7 @@ Variables in the config file for the **pangene workflow**, `pandagma pan`: This is used for picking representative IDs+sequence from an orthogroup, when this annotation is among those with the median length for the orthogroup. Otherwise, one is selected at random from those with median length. + min_align_count - Minimum number of sequences in a family to trigger alignments, modeling, and trees ``` Variables in the config file for the **family workflow**, `pandagma fam`: @@ -262,6 +263,7 @@ ks_block_wgd_cutoff - Fallback, if a ks_peaks.tsv file is not provided. [1.75] This is used for picking representative IDs+sequence from an orthogroup, when this annotation is among those with the median length for the orthogroup. Otherwise, one is selected at random from those with median length. + min_align_count - Minimum number of sequences in a family to trigger alignments, modeling, and trees ``` ## Example run for the pangene workflow diff --git a/bin/fetch-datastore.sh b/bin/fetch-datastore.sh index b75d169..673f817 100755 --- a/bin/fetch-datastore.sh +++ b/bin/fetch-datastore.sh @@ -8,11 +8,13 @@ readonly DATAFILE=${1} # adjust URL for collections that are located in the annex case ${DATAFILE} in + acacr.Acra3RX.gnm1.ann1.6C0V.*|\ arahy.Tifrunner.gnm1.ann2.TN8K.*|\ arath.Col0.gnm9.ann11.KH24.*|\ bauva.BV-YZ2020.gnm2.ann1.RJ1G.*|\ chafa.ISC494698.gnm1.ann1.G7XW.*|\ dalod.SKLTGB.gnm1.ann1.R67B.*|\ + phach.longxuteng.gnm1.ann1.KGX9.*|\ prupe.Lovell.gnm2.ann1.S2ZZ.*|\ quisa.S10.gnm1.ann1.RQ4J.*|\ sento.Myeongyun.gnm1.ann1.5WXB.*|\ @@ -30,6 +32,7 @@ collection_type=annotations case ${genspa} in [A-Z]*) genus=${genspa} species=GENUS collection_type=pangenes collection=${1%.*.*.*} ;; + acacr) genus=Acacia species=crassicarpa ;; aesev) genus=Aeschynomene species=evenia ;; aradu) genus=Arachis species=duranensis ;; arahy) genus=Arachis species=hypogaea ;; @@ -52,6 +55,7 @@ case ${genspa} in glyso) genus=Glycine species=soja ;; glyst) genus=Glycine species=stenophita ;; glysy) genus=Glycine species=syndetika ;; + labpu) genus=Lablab species=purpureus ;; legume) genus=LEGUMES species=Fabaceae ;; lencu) genus=Lens species=culinaris ;; lotja) genus=Lotus species=japonicus ;; @@ -59,6 +63,7 @@ case ${genspa} in medsa) genus=Medicago species=sativa ;; medtr) genus=Medicago species=truncatula ;; phaac) genus=Phaseolus species=acutifolius ;; + phach) genus=Phanera species=championii ;; phalu) genus=Phaseolus species=lunatus ;; phavu) genus=Phaseolus species=vulgaris ;; pissa) genus=Pisum species=sativum ;; @@ -88,6 +93,4 @@ if [[ "$collection" == *"XinJiangDaYe"* ]]; then collection="XinJiangDaYe.gnm1.ann1.RKB9" fi -#echo "${DATASTORE}/${genus}/${species}/${collection_type}/${collection}/${DATAFILE}" - curl --no-progress-meter --fail "${DATASTORE}/${genus}/${species}/${collection_type}/${collection}/${DATAFILE}" diff --git a/bin/pandagma-common.sh b/bin/pandagma-common.sh index a918569..fb98480 100755 --- a/bin/pandagma-common.sh +++ b/bin/pandagma-common.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -version="2023-01-28" +version="2023-02-19" set -o errexit -o errtrace -o nounset -o pipefail -o posix trap 'echo ${0##*/}:${LINENO} ERROR executing command: ${BASH_COMMAND}' ERR @@ -65,12 +65,11 @@ run_calc_trees() { echo; echo "== Move small (<4) and very low-entropy families (sequences are all identical) to the side ==" mkdir -p "${dir_prefix}3_pan_aug_small_or_identical" - min_seq_count=4 # Below, "count" is the number of unique sequences in the alignment. for filepath in "${dir_prefix}3_hmmalign_trim2"/*; do file=$(basename "$filepath") count=$(awk '$1!~/>/ {print FILENAME "\t" $1}' "$filepath" | sort -u | wc -l); - if [[ $count -lt $min_seq_count ]]; then + if [[ $count -lt $min_align_count ]]; then echo "Set aside small or low-entropy family $file"; mv "$filepath" "${dir_prefix}3_pan_aug_small_or_identical"/ fi; diff --git a/bin/pandagma-fam.sh b/bin/pandagma-fam.sh index b176208..15cbb73 100755 --- a/bin/pandagma-fam.sh +++ b/bin/pandagma-fam.sh @@ -110,6 +110,7 @@ Variables in pandagma config file (Set the config with the CONF environment vari ks_binsize - For calculating and displaying histograms. [0.05] ks_block_wgd_cutoff - Fallback, if a ks_peaks.tsv file is not provided. [1.75] max_pair_ks - Fallback value for excluding gene pairs, if a ks_peaks.tsv file is not provided. [4.0] + min_align_count - Minimum number of sequences to trigger alignments, modeling, and trees consen_prefix - Prefix to use in orthogroup names annot_str_regex - Regular expression for capturing annotation name from gene ID, e.g. @@ -459,16 +460,14 @@ run_ks_filter() { cd "${WORK_DIR}" || exit if [ -d 05_kaksout_ks_filtered ]; then rm -rf 05_kaksout_ks_filtered ; fi mkdir -p 05_kaksout_ks_filtered - if [[ -f ks_peaks.tsv ]]; then # filter based on list of Ks values - if [[ -f ks_peaks.tsv ]]; then - echo "Filtering on quotas from expected_quotas and ks_pair_cutoff values provided in ks_peaks.tsv" - ks_peaks=ks_peaks.tsv - fi + if [[ -f stats/ks_peaks.tsv ]]; then # filter based on list of Ks values + echo "Filtering on quotas from expected_quotas and ks_pair_cutoff values provided in stats/ks_peaks.tsv" + ks_peaks=stats/ks_peaks.tsv for ks_path in 05_kaksout/*.rptout; do outfilebase=$(basename "$ks_path" .rptout) echo " Filtering $ks_path based on expected block-median Ks values" < "${ks_path}" filter_mmseqs_by_ks.pl \ - -ks_peak ${ks_peaks} -annot_regex "$annot_str_regex" -max_pair_ks "$max_pair_ks" | + -ks_peaks ${ks_peaks} -annot_regex "$annot_str_regex" -max_pair_ks "$max_pair_ks" | awk 'NF==7' > 05_kaksout_ks_filtered/"${outfilebase}".rptout done else # don't filter, since ks_peaks.tsv file isn't provided @@ -478,7 +477,6 @@ run_ks_filter() { echo "or revising those values based on interpretation of stats/ks_histplots.tsv." echo "If stats/ks_histplots.tsv is not provided, then Ks filtering will be done using values provided" echo "in the config file for ks_block_wgd_cutoff and max_pair_ks." - echo "$*" 1>&2 ; exit 1; fi } @@ -490,7 +488,8 @@ run_mcl() { printf "\nPreparatory to clustering, combine the homology data into a file with gene pairs to be clustered.\n" - if [ -d 05_kaksout_ks_filtered ]; then # From step ks_filter; supersedes all other conditions + files=$(shopt -s nullglob dotglob; echo 05_kaksout_ks_filtered/*) + if [ -d 05_kaksout_ks_filtered ] && (( ${#files} )); then # From step ks_filter; supersedes all other conditions cat 05_kaksout_ks_filtered/*.rptout | awk 'NF==7 {print $1 "\t" $2}' | sort -u > 05_filtered_pairs.tsv else if [ "$strict_synt" -eq 1 ]; then @@ -693,19 +692,24 @@ run_add_extra() { perl -lane 'for $i (1..scalar(@F)-1){print $F[0], "\t", $F[$i]}' 18_syn_pan_aug_extra.clust.tsv \ > 18_syn_pan_aug_extra.hsh.tsv - echo " For each family set, retrieve sequences into a multifasta file." + echo " For each family set, retrieve sequences into a multifasta file 19_pan_aug_leftover_merged_prot (abbreviated 19_palm)" printf " Fasta file: %s %s\n" "${protein_files[@]}" "${protein_files_extra[@]}" - if [ -d 19_pan_aug_leftover_merged_prot ]; then rm -rf 19_pan_aug_leftover_merged_prot ; fi - mkdir -p 19_pan_aug_leftover_merged_prot + if [ -d 19_palm ]; then rm -rf 19_palm ; fi + mkdir -p 19_palm get_fasta_from_family_file.pl "${protein_files[@]}" "${protein_files_extra[@]}" \ - -fam 18_syn_pan_aug_extra.clust.tsv -out 19_pan_aug_leftover_merged_prot + -fam 18_syn_pan_aug_extra.clust.tsv -out 19_palm - echo " Merge fasta files from 19_pan_aug_leftover_merged_prot, prefixing IDs with panID__" - merge_files_to_pan_fasta.awk 19_pan_aug_leftover_merged_prot/* > 19_pan_aug_leftover_merged_prot.faa + echo " Merge fasta files from 19_palm, prefixing IDs with panID__" + merge_files_to_pan_fasta.awk 19_palm/* > 19_palm.faa + #for path in 19_palm/*; do + # pan_file=`basename $path` + # cat $path | awk -v panID=$pan_file ' $1~/^>/ {print ">" panID "__" substr($0,2) } + # $1!~/^>/ {print $1} ' >> 19_palm.faa + #done else echo "== No annotations were designated as \"extra\", so just promote the syn_pan_aug files as syn_pan_aug_extra. ==" - cp 07_pan_fasta_prot.faa 19_pan_aug_leftover_merged_prot.faa + cp 07_pan_fasta_prot.faa 19_palm.faa cp 12_syn_pan_aug.clust.tsv 18_syn_pan_aug_extra.clust.tsv cp 12_syn_pan_aug.hsh.tsv 18_syn_pan_aug_extra.hsh.tsv fi @@ -733,21 +737,33 @@ run_tabularize() { run_align() { echo; echo "== Retrieve sequences for each family, preparatory to aligning them ==" cd "${WORK_DIR}" || exit - if [[ -d 19_pan_aug_leftover_merged_prot ]] && [[ -f 19_pan_aug_leftover_merged_prot/${consen_prefix}00001 ]]; then + if [[ -d 19_palm ]] && [[ -f 19_palm/${consen_prefix}00001 ]]; then : # do nothing; the directory and file(s) exist else - mkdir -p 19_pan_aug_leftover_merged_prot + mkdir -p 19_palm echo " For each pan-gene set, retrieve sequences into a multifasta file." get_fasta_from_family_file.pl "${protein_files[@]}" "${protein_files_extra[@]}" \ - -fam 18_syn_pan_aug_extra.clust.tsv -out 19_pan_aug_leftover_merged_prot + -fam 18_syn_pan_aug_extra.clust.tsv -out 19_palm fi + echo; echo "== Move small families to the side ==" + mkdir -p 19_pan_aug_leftover_merged_small + # Below, "count" is the number of unique sequences in the alignment. + for filepath in 19_palm/*; do + file=$(basename "$filepath") + count=$(awk '$1!~/>/ {print FILENAME "\t" $1}' "$filepath" | sort -u | wc -l); + if [[ $count -lt $min_align_count ]]; then + echo "Set aside small family $file"; + mv "$filepath" 19_pan_aug_leftover_merged_small/ + fi; + done + echo; echo "== Align the gene families ==" mkdir -p 20_aligns - for filepath in 19_pan_aug_leftover_merged_prot/*; do + for filepath in 19_palm/*; do file=$(basename "$filepath"); echo " Computing alignment, using program famsa, for file $file" - famsa -t 2 19_pan_aug_leftover_merged_prot/"$file" 20_aligns/"$file" 1>/dev/null & + famsa -t 2 19_palm/"$file" 20_aligns/"$file" 1>/dev/null & if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi done wait @@ -757,6 +773,7 @@ run_align() { run_model_and_trim() { echo; echo "== Build HMMs ==" cd "${WORK_DIR}" || exit + mkdir -p 21_hmm for filepath in 20_aligns/*; do file=$(basename "$filepath"); @@ -770,7 +787,7 @@ run_model_and_trim() { for filepath in 21_hmm/*; do file=$(basename "$filepath"); printf "%s " "$file" - hmmalign --trim --outformat A2M --amino -o 22_hmmalign/"$file" 21_hmm/"$file" 19_pan_aug_leftover_merged_prot/"$file" & + hmmalign --trim --outformat A2M --amino -o 22_hmmalign/"$file" 21_hmm/"$file" 19_palm/"$file" & if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi done wait @@ -842,7 +859,7 @@ run_summarize() { echo "Couldn't find file manifests/MANIFEST_output_fam.yml" fi - for dir in 19_pan_aug_leftover_merged_prot 21_hmm 22_hmmalign 23_hmmalign_trim2 24_trees; do + for dir in 19_palm 21_hmm 22_hmmalign 23_hmmalign_trim2 24_trees; do if [ -d "${WORK_DIR}"/$dir ]; then echo "Copying directory $dir to output directory" cp -r "${WORK_DIR}"/$dir "${full_out_dir}"/ @@ -901,15 +918,19 @@ run_summarize() { { printf " %i\t%i\t%2.1f\t%s\n", $2, $4, 100*($4/$2), $1 }' >> "${stats_file}" echo " Print counts per accession" - if [ -f "${full_out_dir}"/18_syn_pan_aug_extra.counts.tsv ]; then + { printf "\n== For all annotation sets, counts of genes-in-orthogroups and counts of orthogroups-with-genes:\n" printf " gns-in-OGs OGs-w-gns OGs-w-gns/gns pct-non-null-OGs pct-null-OGs annot-set\n" + } >> "${stats_file}" + if [ -f "${full_out_dir}"/18_syn_pan_aug_extra.counts.tsv ]; then + { transpose.pl "${full_out_dir}"/18_syn_pan_aug_extra.counts.tsv | perl -lane 'next if ($.<=3); $ct=0; $sum=0; $nulls=0; $OGs=0; for $i (@F[1..(@F-1)]){ $OGs++; if ($i>0){$ct++; $sum+=$i} if ($i==0){$nulls++} }; printf(" %d\t%d\t%.2f\t%.2f\t%.2f\t%s\n", $sum, $ct, 100*$ct/$sum, 100*($OGs-$nulls)/$OGs, 100*$nulls/$OGs, $F[0])' \ >> "${stats_file}" + } fi echo " Print histograms" @@ -939,7 +960,7 @@ run_clean() { echo " work_dir: $PWD" if [ -d MMTEMP ]; then rm -rf MMTEMP/*; fi - for dir in 11_pan_leftovers 13_extra_out_dir 16_pan_leftovers_extra 19_pan_aug_leftover_merged_prot; do + for dir in 11_pan_leftovers 13_extra_out_dir 16_pan_leftovers_extra 19_palm; do if [ -d $dir ]; then echo " Removing directory $dir"; rm -rf $dir & fi done diff --git a/bin/pandagma-pan.sh b/bin/pandagma-pan.sh index 56ac609..a00a119 100755 --- a/bin/pandagma-pan.sh +++ b/bin/pandagma-pan.sh @@ -114,6 +114,7 @@ Variables in pandagma config file (Set the config with the CONF environment vari this annotation is among those with the median length for the orthogroup. Otherwise, one is selected at random from those with median length. order_method - Method to determine consensus panID order. reference or alignment [default reference] + min_align_count - Minimum number of sequences in a family to trigger alignments, modeling, and trees '' EOS @@ -931,6 +932,18 @@ run_align_protein() { -fam 18_syn_pan_aug_extra.clust.tsv -out 19_pan_aug_leftover_merged_prot fi + echo; echo "== Move small families to the side ==" + mkdir -p 19_pan_aug_leftover_merged_small + # Below, "count" is the number of unique sequences in the alignment. + for filepath in 19_pan_aug_leftover_merged_prot/*; do + file=$(basename "$filepath") + count=$(awk '$1!~/>/ {print FILENAME "\t" $1}' "$filepath" | sort -u | wc -l); + if [[ $count -lt $min_align_count ]]; then + echo "Set aside small family $file"; + mv "$filepath" 19_pan_aug_leftover_merged_small/ + fi; + done + echo; echo "== Align protein sequence for the each gene family ==" mkdir -p 20_aligns_prot for filepath in 19_pan_aug_leftover_merged_prot/*; do diff --git a/config/family3_22_3.conf b/config/family3_22_3.conf new file mode 100644 index 0000000..dd65ae9 --- /dev/null +++ b/config/family3_22_3.conf @@ -0,0 +1,147 @@ +clust_iden='0.40' +clust_cov="0.40" +extra_iden='0.30' +TE_match_iden='0.40' +mcl_inflation='1.6' +strict_synt="1" +ks_low_cutoff="0.50" # For inferring Ks peak per species pair. Don't consider Ks block-median values less than this. +ks_hi_cutoff="2.0" # For inferring Ks peak per species pair. Don't consider Ks block-median values greater than this. +ks_binsize="0.05" # For calculating and displaying histograms. Default 0.05 +ks_block_wgd_cutoff="1.75" # Fallback, if a ks_cutoffs file is not provided. +max_pair_ks="4.0" # Fallback value for excluding gene pairs, if a ks_cutoffs file is not provided. +consen_prefix='Legume.fam3.' +annot_str_regex='([^.]+\.[^.]+)\..+' +min_align_count="6" # Minimum number of sequences in a family to trigger alignments, modeling, and trees + +expected_quotas=( + acacr 2 + aesev 2 + arath 2 + Arachis 4 + cerca 1 + chafa 2 + Cicer 2 + dalod 2 + Glycine 4 + labpu 2 + lencu 2 + lotja 2 + lupal 6 + Medicago 2 + phach 2 + Phaseolus 2 + prupe 1 + pissa 2 + quisa 2 + sento 2 + singl 2 + tripr 2 + vicfa 2 + Vigna 2 + vitvi 1 +) + +##### (required) list of BED & FASTA file paths +# Uncomment add file paths to the the annotation_files and cds_files arrays. +# The nth listed BED file corresponds to the nth listed FASTA file. + +annotation_files=( + acacr.Acra3RX.gnm1.ann1.6C0V.gene_models_main.bed.gz + aesev.CIAT22838.gnm1.ann1.ZM3R.gene_models_main.bed.gz + Arachis.pan3.BYQ9.pctl30_named_protein.bed.gz + cerca.ISC453364.gnm3.ann1.3N1M.gene_models_main.bed.gz + chafa.ISC494698.gnm1.ann1.G7XW.gene_models_main.bed.gz + Cicer.pan3.JVTK.pctl25_named_protein.bed.gz + dalod.SKLTGB.gnm1.ann1.R67B.gene_models_main.bed.gz + Glycine.pan5.MKRS.pctl25_named_protein.bed.gz + labpu.Highworth.gnm1.ann1.HJ3B.gene_models_main.bed.gz + lencu.CDC_Redberry.gnm2.ann1.5FB4.gene_models_main.bed.gz + lotja.MG20.gnm3.ann1.WF9B.gene_models_main.bed.gz + lupal.Amiga.gnm1.ann1.3GKS.gene_models_main.bed.gz + Medicago.pan3.9X6B.pctl25_named_protein.bed.gz + phach.longxuteng.gnm1.ann1.KGX9.gene_models_main.bed.gz + Phaseolus.pan3.LXKV.pctl25_named_protein.bed.gz + pissa.Cameor.gnm1.ann1.7SZR.gene_models_main.bed.gz + quisa.S10.gnm1.ann1.RQ4J.gene_models_main.bed.gz + sento.Myeongyun.gnm1.ann1.5WXB.gene_models_main.bed.gz + singl.CAF01.gnm1.ann1.WFKC.gene_models_main.bed.gz + tripr.MilvusB.gnm2.ann1.DFgp.gene_models_main.bed.gz + vicfa.Hedin2.gnm1.ann1.PTNK.gene_models_main.bed.gz + Vigna.pan3.V294.pctl25_named_protein.bed.gz +) + +cds_files=( + acacr.Acra3RX.gnm1.ann1.6C0V.cds_primary.fna.gz + aesev.CIAT22838.gnm1.ann1.ZM3R.cds_primary.fna.gz + Arachis.pan3.BYQ9.pctl30_named_cds.fna.gz + cerca.ISC453364.gnm3.ann1.3N1M.cds_primary.fna.gz + chafa.ISC494698.gnm1.ann1.G7XW.cds_primary.fna.gz + Cicer.pan3.JVTK.pctl25_named_cds.fna.gz + dalod.SKLTGB.gnm1.ann1.R67B.cds.fna.gz + Glycine.pan5.MKRS.pctl25_named_cds.fna.gz + labpu.Highworth.gnm1.ann1.HJ3B.cds_primary.fna.gz + lencu.CDC_Redberry.gnm2.ann1.5FB4.cds.fna.gz + lotja.MG20.gnm3.ann1.WF9B.cds_primary.fna.gz + lupal.Amiga.gnm1.ann1.3GKS.cds.fna.gz + Medicago.pan3.9X6B.pctl25_named_cds.fna.gz + phach.longxuteng.gnm1.ann1.KGX9.cds_primary.fna.gz + Phaseolus.pan3.LXKV.pctl25_named_cds.fna.gz + pissa.Cameor.gnm1.ann1.7SZR.cds_primary.fna.gz + quisa.S10.gnm1.ann1.RQ4J.cds_primary.fna.gz + sento.Myeongyun.gnm1.ann1.5WXB.cds.fna.gz + singl.CAF01.gnm1.ann1.WFKC.cds.fna.gz + tripr.MilvusB.gnm2.ann1.DFgp.cds_primary.fna.gz + vicfa.Hedin2.gnm1.ann1.PTNK.cds_primary.fna.gz + Vigna.pan3.V294.pctl25_named_cds.fna.gz +) + +protein_files=( + acacr.Acra3RX.gnm1.ann1.6C0V.protein_primary.faa.gz + aesev.CIAT22838.gnm1.ann1.ZM3R.protein_primary.faa.gz + Arachis.pan3.BYQ9.pctl30_named_protein.faa.gz + cerca.ISC453364.gnm3.ann1.3N1M.protein_primary.faa.gz + chafa.ISC494698.gnm1.ann1.G7XW.protein_primary.faa.gz + Cicer.pan3.JVTK.pctl25_named_protein.faa.gz + dalod.SKLTGB.gnm1.ann1.R67B.protein.faa.gz + Glycine.pan5.MKRS.pctl25_named_protein.faa.gz + labpu.Highworth.gnm1.ann1.HJ3B.protein_primary.faa.gz + lencu.CDC_Redberry.gnm2.ann1.5FB4.protein.faa.gz + lotja.MG20.gnm3.ann1.WF9B.protein_primary.faa.gz + lupal.Amiga.gnm1.ann1.3GKS.protein.faa.gz + Medicago.pan3.9X6B.pctl25_named_protein.faa.gz + phach.longxuteng.gnm1.ann1.KGX9.protein_primary.faa.gz + Phaseolus.pan3.LXKV.pctl25_named_protein.faa.gz + pissa.Cameor.gnm1.ann1.7SZR.protein_primary.faa.gz + quisa.S10.gnm1.ann1.RQ4J.protein_primary.faa.gz + sento.Myeongyun.gnm1.ann1.5WXB.protein.faa.gz + singl.CAF01.gnm1.ann1.WFKC.protein.faa.gz + tripr.MilvusB.gnm2.ann1.DFgp.protein_primary.faa.gz + vicfa.Hedin2.gnm1.ann1.PTNK.protein_primary.faa.gz + Vigna.pan3.V294.pctl25_named_protein.faa.gz +) + +### (optional) Extra BED & FASTA files +annotation_files_extra=( + arath.Col0.gnm9.ann11.KH24.gene_models_main.bed.gz + prupe.Lovell.gnm2.ann1.S2ZZ.gene_models_main.bed.gz + vitvi.PN40024.gnm2.ann1.V31M.gene_models_main.bed.gz +) + +cds_files_extra=( + arath.Col0.gnm9.ann11.KH24.cds_primary.fna.gz + prupe.Lovell.gnm2.ann1.S2ZZ.cds_primary.fna.gz + vitvi.PN40024.gnm2.ann1.V31M.cds_primary.fna.gz +) + +protein_files_extra=( + arath.Col0.gnm9.ann11.KH24.protein_primary.faa.gz + prupe.Lovell.gnm2.ann1.S2ZZ.protein_primary.faa.gz + vitvi.PN40024.gnm2.ann1.V31M.protein_primary.faa.gz +) + +### (optional) Fasta file of sequences to use exclude from annotations, e.g. transposable elements or plastid seuqences. +### If provided, sequences in this file will be used to exclude CDS sequences if any matches are >= TE_match_iden +exclude_TE_match_file=( + legume.TE_lib_2024.rpt.6WVT.fna.gz +) + diff --git a/get_data/family3_22_3.mk b/get_data/family3_22_3.mk new file mode 100644 index 0000000..1d76515 --- /dev/null +++ b/get_data/family3_22_3.mk @@ -0,0 +1,80 @@ +FILES ::= \ +acacr.Acra3RX.gnm1.ann1.6C0V.cds_primary.fna.gz \ +acacr.Acra3RX.gnm1.ann1.6C0V.gene_models_main.bed.gz \ +acacr.Acra3RX.gnm1.ann1.6C0V.protein_primary.faa.gz \ +aesev.CIAT22838.gnm1.ann1.ZM3R.cds_primary.fna.gz \ +aesev.CIAT22838.gnm1.ann1.ZM3R.gene_models_main.bed.gz \ +aesev.CIAT22838.gnm1.ann1.ZM3R.protein_primary.faa.gz \ +Arachis.pan3.BYQ9.pctl30_named_cds.fna.gz \ +Arachis.pan3.BYQ9.pctl30_named_protein.bed.gz \ +Arachis.pan3.BYQ9.pctl30_named_protein.faa.gz \ +arath.Col0.gnm9.ann11.KH24.cds_primary.fna.gz \ +arath.Col0.gnm9.ann11.KH24.gene_models_main.bed.gz \ +arath.Col0.gnm9.ann11.KH24.protein_primary.faa.gz \ +cerca.ISC453364.gnm3.ann1.3N1M.cds_primary.fna.gz \ +cerca.ISC453364.gnm3.ann1.3N1M.gene_models_main.bed.gz \ +cerca.ISC453364.gnm3.ann1.3N1M.protein_primary.faa.gz \ +chafa.ISC494698.gnm1.ann1.G7XW.cds_primary.fna.gz \ +chafa.ISC494698.gnm1.ann1.G7XW.gene_models_main.bed.gz \ +chafa.ISC494698.gnm1.ann1.G7XW.protein_primary.faa.gz \ +Cicer.pan3.JVTK.pctl25_named_cds.fna.gz \ +Cicer.pan3.JVTK.pctl25_named_protein.bed.gz \ +Cicer.pan3.JVTK.pctl25_named_protein.faa.gz \ +dalod.SKLTGB.gnm1.ann1.R67B.cds.fna.gz \ +dalod.SKLTGB.gnm1.ann1.R67B.gene_models_main.bed.gz \ +dalod.SKLTGB.gnm1.ann1.R67B.protein.faa.gz \ +Glycine.pan5.MKRS.pctl25_named_cds.fna.gz \ +Glycine.pan5.MKRS.pctl25_named_protein.bed.gz \ +Glycine.pan5.MKRS.pctl25_named_protein.faa.gz \ +labpu.Highworth.gnm1.ann1.HJ3B.cds_primary.fna.gz \ +labpu.Highworth.gnm1.ann1.HJ3B.gene_models_main.bed.gz \ +labpu.Highworth.gnm1.ann1.HJ3B.protein_primary.faa.gz \ +lencu.CDC_Redberry.gnm2.ann1.5FB4.cds.fna.gz \ +lencu.CDC_Redberry.gnm2.ann1.5FB4.gene_models_main.bed.gz \ +lencu.CDC_Redberry.gnm2.ann1.5FB4.protein.faa.gz \ +lotja.MG20.gnm3.ann1.WF9B.cds_primary.fna.gz \ +lotja.MG20.gnm3.ann1.WF9B.gene_models_main.bed.gz \ +lotja.MG20.gnm3.ann1.WF9B.protein_primary.faa.gz \ +lupal.Amiga.gnm1.ann1.3GKS.cds.fna.gz \ +lupal.Amiga.gnm1.ann1.3GKS.gene_models_main.bed.gz \ +lupal.Amiga.gnm1.ann1.3GKS.protein.faa.gz \ +Medicago.pan3.9X6B.pctl25_named_cds.fna.gz \ +Medicago.pan3.9X6B.pctl25_named_protein.bed.gz \ +Medicago.pan3.9X6B.pctl25_named_protein.faa.gz \ +Phaseolus.pan3.LXKV.pctl25_named_cds.fna.gz \ +Phaseolus.pan3.LXKV.pctl25_named_protein.bed.gz \ +Phaseolus.pan3.LXKV.pctl25_named_protein.faa.gz \ +pissa.Cameor.gnm1.ann1.7SZR.cds_primary.fna.gz \ +pissa.Cameor.gnm1.ann1.7SZR.gene_models_main.bed.gz \ +pissa.Cameor.gnm1.ann1.7SZR.protein_primary.faa.gz \ +phach.longxuteng.gnm1.ann1.KGX9.cds_primary.fna.gz \ +phach.longxuteng.gnm1.ann1.KGX9.gene_models_main.bed.gz \ +phach.longxuteng.gnm1.ann1.KGX9.protein_primary.faa.gz \ +prupe.Lovell.gnm2.ann1.S2ZZ.cds_primary.fna.gz \ +prupe.Lovell.gnm2.ann1.S2ZZ.gene_models_main.bed.gz \ +prupe.Lovell.gnm2.ann1.S2ZZ.protein_primary.faa.gz \ +quisa.S10.gnm1.ann1.RQ4J.cds_primary.fna.gz \ +quisa.S10.gnm1.ann1.RQ4J.gene_models_main.bed.gz \ +quisa.S10.gnm1.ann1.RQ4J.protein_primary.faa.gz \ +sento.Myeongyun.gnm1.ann1.5WXB.cds.fna.gz \ +sento.Myeongyun.gnm1.ann1.5WXB.gene_models_main.bed.gz \ +sento.Myeongyun.gnm1.ann1.5WXB.protein.faa.gz \ +singl.CAF01.gnm1.ann1.WFKC.cds.fna.gz \ +singl.CAF01.gnm1.ann1.WFKC.gene_models_main.bed.gz \ +singl.CAF01.gnm1.ann1.WFKC.protein.faa.gz \ +tripr.MilvusB.gnm2.ann1.DFgp.cds_primary.fna.gz \ +tripr.MilvusB.gnm2.ann1.DFgp.gene_models_main.bed.gz \ +tripr.MilvusB.gnm2.ann1.DFgp.protein_primary.faa.gz \ +vicfa.Hedin2.gnm1.ann1.PTNK.cds_primary.fna.gz \ +vicfa.Hedin2.gnm1.ann1.PTNK.gene_models_main.bed.gz \ +vicfa.Hedin2.gnm1.ann1.PTNK.protein_primary.faa.gz \ +Vigna.pan3.V294.pctl25_named_cds.fna.gz \ +Vigna.pan3.V294.pctl25_named_protein.bed.gz \ +Vigna.pan3.V294.pctl25_named_protein.faa.gz \ +vitvi.PN40024.gnm2.ann1.V31M.cds_primary.fna.gz \ +vitvi.PN40024.gnm2.ann1.V31M.gene_models_main.bed.gz \ +vitvi.PN40024.gnm2.ann1.V31M.protein_primary.faa.gz \ +legume.TE_lib_2024.rpt.6WVT.fna.gz + +include $(dir $(realpath $(lastword $(MAKEFILE_LIST))))/common.mk + diff --git a/manifests/MANIFEST_output_fam.yml b/manifests/MANIFEST_output_fam.yml index ada241e..65b1270 100644 --- a/manifests/MANIFEST_output_fam.yml +++ b/manifests/MANIFEST_output_fam.yml @@ -17,7 +17,7 @@ the set ID in the first column and genes in the second. 18_syn_pan_aug_extra.table.tsv: Tabular format of all gene families, in columns by species. Multiple paralogs from a species in a gene family are comma-separated within the column of the species. -19_pan_aug_leftover_merged_prot: Directory of fasta files for all families corresponding with 18_syn_pan_aug_extra.clust.tsv +19_palm: (short for pan_aug_leftover_merged) - directory of fasta files for all families corresponding with 18_syn_pan_aug_extra.clust.tsv stats.family_4_3.txt: Descriptive statistics about program parameters, input sequences, and Gene family products. From 5ae31a74beafd48a860c6ee82d2441df1cb2fea4 Mon Sep 17 00:00:00 2001 From: Steven Cannon Date: Fri, 23 Feb 2024 09:06:46 -0800 Subject: [PATCH 02/18] MANY changes in the course of moving alignment & modeling steps to -common.sh. Likely breakages. Incomplete. --- README.md | 6 +- batch_fam_example_singularity.sh | 2 +- batch_fam_prod.sh | 75 +++++++++++++ batch_pan_example_conda.sh | 3 +- bin/pandagma-common.sh | 186 +++++++++++++++++++++++++++++-- bin/pandagma-fam.sh | 122 ++++---------------- bin/pandagma-fsup.sh | 2 + bin/pandagma-pan.sh | 184 ++++-------------------------- config/family3_18_3.conf | 2 + config/family3_22_3.conf | 1 + config/family3_4_3.conf | 2 + config/family3_sup.conf | 2 + config/family3_sup_ur.conf | 2 + config/family3_ur.conf | 2 + config/family_7_3.conf | 2 + config/family_ur.conf | 2 + 16 files changed, 318 insertions(+), 277 deletions(-) create mode 100755 batch_fam_prod.sh diff --git a/README.md b/README.md index cd7928a..25aaf58 100644 --- a/README.md +++ b/README.md @@ -236,7 +236,8 @@ Variables in the config file for the **pangene workflow**, `pandagma pan`: This is used for picking representative IDs+sequence from an orthogroup, when this annotation is among those with the median length for the orthogroup. Otherwise, one is selected at random from those with median length. - min_align_count - Minimum number of sequences in a family to trigger alignments, modeling, and trees + min_align_count - Minimum number of sequences in a family to trigger alignments, modeling, and trees [4] +min_annots_in_align - Minimum number of distinct annotation groups in an alignment to retain it [2] ``` Variables in the config file for the **family workflow**, `pandagma fam`: @@ -263,7 +264,8 @@ ks_block_wgd_cutoff - Fallback, if a ks_peaks.tsv file is not provided. [1.75] This is used for picking representative IDs+sequence from an orthogroup, when this annotation is among those with the median length for the orthogroup. Otherwise, one is selected at random from those with median length. - min_align_count - Minimum number of sequences in a family to trigger alignments, modeling, and trees + min_align_count - Minimum number of sequences in a family to trigger alignments, modeling, and trees [4] +min_annots_in_align - Minimum number of distinct annotation groups in an alignment to retain it [2] ``` ## Example run for the pangene workflow diff --git a/batch_fam_example_singularity.sh b/batch_fam_example_singularity.sh index 5015d6d..40f1f0b 100755 --- a/batch_fam_example_singularity.sh +++ b/batch_fam_example_singularity.sh @@ -39,7 +39,7 @@ singularity exec $IMAGE pandagma fam -c $CONFIG #singularity exec $IMAGE pandagma fam -c $CONFIG -s consense #singularity exec $IMAGE pandagma fam -c $CONFIG -s cluster_rest #singularity exec $IMAGE pandagma fam -c $CONFIG -s add_extra -#singularity exec $IMAGE pandagma fam -c $CONFIG -s align +#singularity exec $IMAGE pandagma fam -c $CONFIG -s align_protein #singularity exec $IMAGE pandagma fam -c $CONFIG -s model_and_trim #singularity exec $IMAGE pandagma fam -c $CONFIG -s calc_trees #singularity exec $IMAGE pandagma fam -c $CONFIG -s summarize diff --git a/batch_fam_prod.sh b/batch_fam_prod.sh new file mode 100755 index 0000000..8700076 --- /dev/null +++ b/batch_fam_prod.sh @@ -0,0 +1,75 @@ +#!/bin/bash +#SBATCH -A m4440 +#SBATCH -q regular +#SBATCH -N 1 +#SBATCH -n 30 # number of cores/tasks in this job +#SBATCH -t 23:00:00 +#SBATCH -C cpu +#SBATCH -J pand-fam2 +#SBATCH -o %x_%j.out +#SBATCH -e %x_%j.err + +set -o errexit +set -o nounset +set -o xtrace + +date # print timestamp + +# If using conda environment for dependencies: +module load conda +conda activate pandagma + +PDGPATH=$PWD +CONFIG=$PWD/config/family3_22_3.conf + +echo "Config: $CONFIG" + +export PATH=$PATH:$PDGPATH/bin +echo "PATH: $PATH" + +########## +# Test PATH +which pandagma +which calc_ks_from_dag.pl + +########## +## Fetch relevant data files; e.g. +#mkdir -p data +#make -C data -f $PWD/get_data/family3_22_3.mk + +########## +## Filter transposable elements +#pandagma TEfilter -c $CONFIG + +########## +## Run all main steps, assuming input data files exist in ./data +## Work directory will be ./work_pandagma +## Output will go to ./out_pandagma +#pandagma fam -c $CONFIG -d data_TEfilter + +########## +## Run individual steps +#pandagma fam -c $CONFIG -s ingest -d data_TEfilter +#pandagma fam -c $CONFIG -s mmseqs +#pandagma fam -c $CONFIG -s filter +#pandagma fam -c $CONFIG -s dagchainer +#pandagma fam -c $CONFIG -s ks_calc +#pandagma fam -c $CONFIG -s ks_filter +#pandagma fam -c $CONFIG -s mcl +#pandagma fam -c $CONFIG -s consense +#pandagma fam -c $CONFIG -s cluster_rest +#pandagma fam -c $CONFIG -s add_extra +#pandagma fam -c $CONFIG -s tabularize +#pandagma fam -c $CONFIG -s align_protein +#pandagma fam -c $CONFIG -s model_and_trim +#pandagma fam -c $CONFIG -s calc_trees +pandagma fam -c $CONFIG -s xfr_aligns_trees +pandagma fam -c $CONFIG -s summarize + +########## +## Optional work-directory cleanup steps +#pandagma fam -c $CONFIG -s clean +#rm -rf ./work_pandagma + +date # print timestamp + diff --git a/batch_pan_example_conda.sh b/batch_pan_example_conda.sh index 1747241..486a0e7 100755 --- a/batch_pan_example_conda.sh +++ b/batch_pan_example_conda.sh @@ -49,7 +49,8 @@ which calc_ks_from_dag.pl ########## # Optional alignment and tree-construction steps -#pandagma pan -c $CONFIG -s align +#pandagma pan -c $CONFIG -s align_cds +#pandagma pan -c $CONFIG -s align_protein #pandagma pan -c $CONFIG -s model_and_trim #pandagma pan -c $CONFIG -s calc_trees pandagma pan -c $CONFIG -s xfr_aligns_trees diff --git a/bin/pandagma-common.sh b/bin/pandagma-common.sh index fb98480..025e892 100755 --- a/bin/pandagma-common.sh +++ b/bin/pandagma-common.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -version="2023-02-19" +version="2023-02-23" set -o errexit -o errtrace -o nounset -o pipefail -o posix trap 'echo ${0##*/}:${LINENO} ERROR executing command: ${BASH_COMMAND}' ERR @@ -52,11 +52,147 @@ calc_seq_stats () { }' } + +########## +run_align_cds() { + cd "${WORK_DIR}" || exit + + echo "== Align CDS sequences ==" + + # If not already present, retrieve sequences for each family, preparatory to aligning them + # 19_pan_aug_leftover_merged_cds + if [[ -d 19_palmc ]]; then + : # do nothing; the directory and file(s) exist + else + mkdir -p 19_palmc + echo " For each pan-gene set, retrieve sequences into a multifasta file." + get_fasta_from_family_file.pl "${cds_files[@]}" "${cds_files_extra_constr[@]}" "${cds_files_extra_free[@]}" \ + -fam 18_syn_pan_aug_extra.clust.tsv -out 19_palmc + fi + + echo; echo "== Move small families to the side ==" + mkdir -p 19_palmc_small + # Below, count_seqs = number of unique sequences in the alignment; count_annots = # of unique annotation groups. + min_annots_in_align=2 # require at least this many distinct annotation groups in an alignment to retain it. + for filepath in 19_palmc/*; do + file=$(basename "$filepath") + count_seqs=$(awk '$1!~/>/ {print FILENAME "\t" $1}' "$filepath" | sort -u | wc -l); + count_annots=$( perl -lane 'BEGIN{$REX=qr($ENV{"ANN_REX"})}; if($F[0]=~/$REX/){ print $1 }' $filepath | sort -u | wc -l) + if [[ $count_seqs -lt $min_align_count ]] || [[ $count_annots -lt $min_annots_in_align ]]; then + echo "Set aside small family $file; $count_seqs sequences, $count_annots annotations"; + mv "$filepath" 19_palmc_small/ + fi; + done + + echo; echo "== Align nucleotide sequence for each gene family ==" + mkdir -p 20_aligns_cds + for filepath in 19_palmc/*; do + file=$(basename "$filepath"); + echo " Computing alignment, using program famsa, for file $file" + famsa -t 2 19_palmc/"$file" 20_aligns_cds/"$file" 1>/dev/null & + if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi + done + wait +} + +########## +run_align_protein() { + cd "${WORK_DIR}" || exit + + echo "== Align protein sequences ==" + + # If not already present, retrieve sequences for each family, preparatory to aligning them + if [[ -d 19_palmp ]]; then + : # do nothing; the directory and file(s) exist + else + mkdir -p 19_palmp + echo " For each pan-gene set, retrieve sequences into a multifasta file." + get_fasta_from_family_file.pl "${protein_files[@]}" "${protein_files_extra[@]}" \ + "${protein_files_extra_constr[@]}" "${protein_files_extra_free[@]}" \ + -fam 18_syn_pan_aug_extra.clust.tsv -out 19_palmp + fi + + echo; echo "== Move small families to the side ==" + mkdir -p 19_palmp_small + # Below, count_seqs = number of unique sequences in the alignment; count_annots = # of unique annotation groups. + min_annots_in_align=2 # require at least this many distinct annotation groups in an alignment to retain it. + for filepath in 19_palmp/*; do + file=$(basename "$filepath") + count_seqs=$(awk '$1!~/>/ {print FILENAME "\t" $1}' "$filepath" | sort -u | wc -l); + count_annots=$( perl -lane 'BEGIN{$REX=qr($ENV{"ANN_REX"})}; if($F[0]=~/$REX/){ print $1 }' $filepath | sort -u | wc -l) + if [[ $count_seqs -lt $min_align_count ]] || [[ $count_annots -lt $min_annots_in_align ]]; then + echo "Set aside small family $file; $count_seqs sequences, $count_annots annotations"; + mv "$filepath" 19_palmp_small/ + fi; + done + + echo; echo "== Align protein sequence for the each gene family ==" + mkdir -p 20_aligns_prot + for filepath in 19_palmp/*; do + file=$(basename "$filepath"); + echo " Computing alignment, using program famsa, for file $file" + famsa -t 2 19_palmp/"$file" 20_aligns_prot/"$file" 1>/dev/null & + if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi + done + wait +} + +########## +run_model_and_trim() { + echo; echo "== Build HMMs ==" + cd "${WORK_DIR}" || exit + mkdir -p 21_hmm + for filepath in 20_aligns_cds/*; do + file=$(basename "$filepath"); + hmmbuild -n "$file" 21_hmm/"$file" "$filepath" 1>/dev/null & + if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi + done + wait + + echo; echo "== Realign to HMMs ==" + mkdir -p 22_hmmalign + for filepath in 21_hmm/*; do + file=$(basename "$filepath"); + printf "%f " "$file" + hmmalign --trim --outformat A2M -o 22_hmmalign/"$file" 21_hmm/"$file" 19_palmc/"$file" & + if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi + done + wait + echo + + echo; echo "== Trim HMM alignments to match-states ==" + mkdir -p 23_hmmalign_trim1 + for filepath in 22_hmmalign/*; do + file=$(basename "$filepath"); + printf "%s " "$file" + perl -ne 'if ($_ =~ />/) {print $_} else {$line = $_; $line =~ s/[a-z]//g; print $line}' "$filepath" | + sed '/^$/d' > 23_hmmalign_trim1/"$file" & + if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi + done + wait + echo + + echo; echo "== Filter alignments prior to tree calculation ==" + mkdir -p 23_hmmalign_trim2 23_hmmalign_trim2_log + min_depth=3 + min_pct_depth=20 + min_pct_aligned=20 + for filepath in 23_hmmalign_trim1/*; do + file=$(basename "$filepath") + printf "%f " "$file" + filter_align.pl -in "$filepath" -out 23_hmmalign_trim2/"$file" -log 23_hmmalign_trim2_log/"$file" \ + -depth $min_depth -pct_depth $min_pct_depth -min_pct_aligned $min_pct_aligned & + if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi + done + wait + echo +} + run_calc_trees() { case $scriptname in - 'pandagma pan') fasttree_opts='-nt'; dir_prefix=2 ;; - 'pandagma fam') fasttree_opts=''; dir_prefix=2 ;; - 'pandagma fsup') fasttree_opts=''; dir_prefix=4 + 'pandagma pan') dir_prefix=2 ;; + 'pandagma fam') dir_prefix=2 ;; + 'pandagma fsup') dir_prefix=4 esac echo; echo "== Calculate trees ==" cd "${WORK_DIR}" @@ -65,12 +201,14 @@ run_calc_trees() { echo; echo "== Move small (<4) and very low-entropy families (sequences are all identical) to the side ==" mkdir -p "${dir_prefix}3_pan_aug_small_or_identical" - # Below, "count" is the number of unique sequences in the alignment. + # Below, count_seqs = number of unique sequences in the alignment; count_annots = # of unique annotation groups. + min_annots_in_align=2 # require at least this many distinct annotation groups in an alignment to retain it. for filepath in "${dir_prefix}3_hmmalign_trim2"/*; do file=$(basename "$filepath") - count=$(awk '$1!~/>/ {print FILENAME "\t" $1}' "$filepath" | sort -u | wc -l); - if [[ $count -lt $min_align_count ]]; then - echo "Set aside small or low-entropy family $file"; + count_seqs=$(awk '$1!~/>/ {print FILENAME "\t" $1}' "$filepath" | sort -u | wc -l); + count_annots=$( perl -lane 'BEGIN{$REX=qr($ENV{"ANN_REX"})}; if($F[0]=~/$REX/){ print $1 }' $filepath | sort -u | wc -l) + if [[ $count_seqs -lt $min_align_count ]] || [[ $count_annots -lt $min_annots_in_align ]]; then + echo "Set aside small or low-entropy family $file; $count_seqs sequences, $count_annots annotations"; mv "$filepath" "${dir_prefix}3_pan_aug_small_or_identical"/ fi; done @@ -82,13 +220,41 @@ run_calc_trees() { for filepath in "${dir_prefix}3_hmmalign_trim2"/*; do file=$(basename "$filepath") echo " Calculating tree for $file" - fasttree "${fasttree_opts}" -quiet "$filepath" > "${dir_prefix}4_trees/$file" & + if [[ "$scriptname" =~ "pandagma pan" ]]; then # pan; calculate from nucleotide alignments + echo "fasttree -nt -quiet $filepath > ${dir_prefix}4_trees/$file" + fasttree -quiet -nt "$filepath" > "${dir_prefix}4_trees/$file" + else # fam or fsup; calculate from protein alignments + echo "fasttree -quiet $filepath > ${dir_prefix}4_trees/$file" + fasttree -quiet "$filepath" > "${dir_prefix}4_trees/$file" & + fi # allow to execute up to $NPROC concurrent asynchronous processes if [[ $(jobs -r -p | wc -l) -ge ${NPROC} ]]; then wait -n; fi done wait } +run_xfr_aligns_trees() { + echo; echo "Copy alignment and tree results into output directory" + + full_out_dir="${out_dir}" + + cd "${submit_dir}" || exit + + if [ ! -d "$full_out_dir" ]; then + echo "creating output directory \"${full_out_dir}/\"" + mkdir -p "$full_out_dir" + fi + + for dir in 20_align* 21_hmm 22_hmmalign 23_hmmalign_trim2 24_trees; do + if [ -d "${WORK_DIR}"/$dir ]; then + echo "Copying directory $dir to output directory" + cp -r "${WORK_DIR}"/$dir "${full_out_dir}"/ + else + echo "Warning: couldn't find dir ${WORK_DIR}/$dir; skipping" + fi + done +} + main_pan_fam() { if [ "$#" -eq 0 ]; then echo >&2 "$HELP_DOC" && exit 0; @@ -133,7 +299,7 @@ main_pan_fam() { export fam_dir case ${step} in - all|summarize|xfr_aligns_trees) : "${out_dir:=out_pandagma}" ;; # default to ./pandagma_out + all|summarize|xfr_aligns_trees) : "${out_dir:=out_pandagma}" ;; # default to ./out_pandagma *) if [ "${out_dir:-}" ]; then printf '\noption -o OUT_DIR not applicable to step %s\n' "$step" >&2 exit 1 diff --git a/bin/pandagma-fam.sh b/bin/pandagma-fam.sh index 15cbb73..f37fc18 100755 --- a/bin/pandagma-fam.sh +++ b/bin/pandagma-fam.sh @@ -110,7 +110,8 @@ Variables in pandagma config file (Set the config with the CONF environment vari ks_binsize - For calculating and displaying histograms. [0.05] ks_block_wgd_cutoff - Fallback, if a ks_peaks.tsv file is not provided. [1.75] max_pair_ks - Fallback value for excluding gene pairs, if a ks_peaks.tsv file is not provided. [4.0] - min_align_count - Minimum number of sequences to trigger alignments, modeling, and trees + min_align_count - Minimum number of sequences to trigger alignments, modeling, and trees [4] +min_annots_in_align - Minimum number of distinct annotation groups in an alignment to retain it [2] consen_prefix - Prefix to use in orthogroup names annot_str_regex - Regular expression for capturing annotation name from gene ID, e.g. @@ -692,24 +693,24 @@ run_add_extra() { perl -lane 'for $i (1..scalar(@F)-1){print $F[0], "\t", $F[$i]}' 18_syn_pan_aug_extra.clust.tsv \ > 18_syn_pan_aug_extra.hsh.tsv - echo " For each family set, retrieve sequences into a multifasta file 19_pan_aug_leftover_merged_prot (abbreviated 19_palm)" + echo " For each family set, retrieve sequences into a multifasta file 19_pan_aug_leftover_merged_prot (abbreviated 19_palmp)" printf " Fasta file: %s %s\n" "${protein_files[@]}" "${protein_files_extra[@]}" - if [ -d 19_palm ]; then rm -rf 19_palm ; fi - mkdir -p 19_palm + if [ -d 19_palmp ]; then rm -rf 19_palmp ; fi + mkdir -p 19_palmp get_fasta_from_family_file.pl "${protein_files[@]}" "${protein_files_extra[@]}" \ - -fam 18_syn_pan_aug_extra.clust.tsv -out 19_palm + -fam 18_syn_pan_aug_extra.clust.tsv -out 19_palmp - echo " Merge fasta files from 19_palm, prefixing IDs with panID__" - merge_files_to_pan_fasta.awk 19_palm/* > 19_palm.faa - #for path in 19_palm/*; do + echo " Merge fasta files from 19_palmp, prefixing IDs with panID__" + merge_files_to_pan_fasta.awk 19_palmp/* > 19_palmp.faa + #for path in 19_palmp/*; do # pan_file=`basename $path` # cat $path | awk -v panID=$pan_file ' $1~/^>/ {print ">" panID "__" substr($0,2) } - # $1!~/^>/ {print $1} ' >> 19_palm.faa + # $1!~/^>/ {print $1} ' >> 19_palmp.faa #done else echo "== No annotations were designated as \"extra\", so just promote the syn_pan_aug files as syn_pan_aug_extra. ==" - cp 07_pan_fasta_prot.faa 19_palm.faa + cp 07_pan_fasta_prot.faa 19_palmp.faa cp 12_syn_pan_aug.clust.tsv 18_syn_pan_aug_extra.clust.tsv cp 12_syn_pan_aug.hsh.tsv 18_syn_pan_aug_extra.hsh.tsv fi @@ -733,94 +734,6 @@ run_tabularize() { rm tmp.table_header } -########## -run_align() { - echo; echo "== Retrieve sequences for each family, preparatory to aligning them ==" - cd "${WORK_DIR}" || exit - if [[ -d 19_palm ]] && [[ -f 19_palm/${consen_prefix}00001 ]]; then - : # do nothing; the directory and file(s) exist - else - mkdir -p 19_palm - echo " For each pan-gene set, retrieve sequences into a multifasta file." - get_fasta_from_family_file.pl "${protein_files[@]}" "${protein_files_extra[@]}" \ - -fam 18_syn_pan_aug_extra.clust.tsv -out 19_palm - fi - - echo; echo "== Move small families to the side ==" - mkdir -p 19_pan_aug_leftover_merged_small - # Below, "count" is the number of unique sequences in the alignment. - for filepath in 19_palm/*; do - file=$(basename "$filepath") - count=$(awk '$1!~/>/ {print FILENAME "\t" $1}' "$filepath" | sort -u | wc -l); - if [[ $count -lt $min_align_count ]]; then - echo "Set aside small family $file"; - mv "$filepath" 19_pan_aug_leftover_merged_small/ - fi; - done - - echo; echo "== Align the gene families ==" - mkdir -p 20_aligns - for filepath in 19_palm/*; do - file=$(basename "$filepath"); - echo " Computing alignment, using program famsa, for file $file" - famsa -t 2 19_palm/"$file" 20_aligns/"$file" 1>/dev/null & - if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi - done - wait -} - -########## -run_model_and_trim() { - echo; echo "== Build HMMs ==" - cd "${WORK_DIR}" || exit - - mkdir -p 21_hmm - for filepath in 20_aligns/*; do - file=$(basename "$filepath"); - hmmbuild -n "$file" 21_hmm/"$file" "$filepath" 1>/dev/null & - if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi - done - wait - - echo; echo "== Realign to HMMs ==" - mkdir -p 22_hmmalign - for filepath in 21_hmm/*; do - file=$(basename "$filepath"); - printf "%s " "$file" - hmmalign --trim --outformat A2M --amino -o 22_hmmalign/"$file" 21_hmm/"$file" 19_palm/"$file" & - if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi - done - wait - echo - - echo; echo "== Trim HMM alignments to match-states ==" - mkdir -p 23_hmmalign_trim1 - for filepath in 22_hmmalign/*; do - file=$(basename "$filepath"); - printf "%s " "$file" - < "$filepath" perl -ne 'if ($_ =~ />/) {print $_} else {$line = $_; $line =~ s/[a-z]//g; print $line}' | - sed '/^$/d' > 23_hmmalign_trim1/"$file" & - if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi - done - wait - echo - - echo; echo "== Filter alignments prior to tree calculation ==" - mkdir -p 23_hmmalign_trim2 23_hmmalign_trim2_log - min_depth=3 - min_pct_depth=20 - min_pct_aligned=20 - for filepath in 23_hmmalign_trim1/*; do - file=$(basename "$filepath") - printf "%s " "$file" - filter_align.pl -in "$filepath" -out 23_hmmalign_trim2/"$file" -log 23_hmmalign_trim2_log/"$file" \ - -depth $min_depth -pct_depth $min_pct_depth -min_pct_aligned $min_pct_aligned & - if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi - done - wait - echo -} - ########## run_summarize() { echo; echo "Summarize: Move results into output directory, and report some summary statistics" @@ -859,15 +772,18 @@ run_summarize() { echo "Couldn't find file manifests/MANIFEST_output_fam.yml" fi - for dir in 19_palm 21_hmm 22_hmmalign 23_hmmalign_trim2 24_trees; do - if [ -d "${WORK_DIR}"/$dir ]; then + # Copy directory of final fasta files - abbreviated 19_palmp but copied to 19_pan_aug_leftover_merged_prot + for dir in 19_palmp; do + if [ -d "${WORK_DIR}"/19_palmp ]; then echo "Copying directory $dir to output directory" - cp -r "${WORK_DIR}"/$dir "${full_out_dir}"/ + cp -r "${WORK_DIR}"/$dir "${full_out_dir}"/19_pan_aug_leftover_merged_prot/ else echo "Warning: couldn't find dir ${WORK_DIR}/$dir; skipping" fi done + # 21_hmm 22_hmmalign 23_hmmalign_trim2 24_trees are transferred with -s xfr_aligns_trees + printf "Run of program %s, version %s\n" "$scriptname" "$version" > "${stats_file}" end_time=$(date) @@ -960,7 +876,7 @@ run_clean() { echo " work_dir: $PWD" if [ -d MMTEMP ]; then rm -rf MMTEMP/*; fi - for dir in 11_pan_leftovers 13_extra_out_dir 16_pan_leftovers_extra 19_palm; do + for dir in 11_pan_leftovers 13_extra_out_dir 16_pan_leftovers_extra 19_palmp; do if [ -d $dir ]; then echo " Removing directory $dir"; rm -rf $dir & fi done @@ -983,7 +899,7 @@ consen_prefix annot_str_regex' export commandlist="ingest mmseqs filter dagchainer \ ks_calc ks_filter \ mcl consense cluster_rest add_extra tabularize \ - align model_and_trim calc_trees summarize" + align model_and_trim calc_trees xfr_aligns_trees summarize" export dependencies='dagchainer mcl cons famsa run_DAG_chainer.pl' diff --git a/bin/pandagma-fsup.sh b/bin/pandagma-fsup.sh index 9a8a621..f6901e3 100755 --- a/bin/pandagma-fsup.sh +++ b/bin/pandagma-fsup.sh @@ -59,6 +59,8 @@ Variables in pandagma config file (Set the config with the CONF environment vari \"([^.]+\.[^.]+)\..+\" for two dot-separated fields, e.g. vigan.Shumari or \"(\D+\d+\D+)\d+.+\" for Zea assembly+annot string, e.g. Zm00032ab + min_align_count - Minimum number of sequences in a family to trigger alignments, modeling, and trees [4] +min_annots_in_align - Minimum number of distinct annotation groups in an alignment to retain it [2] File sets (arrays): protein_files diff --git a/bin/pandagma-pan.sh b/bin/pandagma-pan.sh index a00a119..88351c3 100755 --- a/bin/pandagma-pan.sh +++ b/bin/pandagma-pan.sh @@ -114,7 +114,8 @@ Variables in pandagma config file (Set the config with the CONF environment vari this annotation is among those with the median length for the orthogroup. Otherwise, one is selected at random from those with median length. order_method - Method to determine consensus panID order. reference or alignment [default reference] - min_align_count - Minimum number of sequences in a family to trigger alignments, modeling, and trees + min_align_count - Minimum number of sequences in a family to trigger alignments, modeling, and trees [4] +min_annots_in_align - Minimum number of distinct annotation groups in an alignment to retain it [2] '' EOS @@ -653,20 +654,21 @@ run_add_extra() { hash_to_rows_by_1st_col.awk 18_syn_pan_aug_extra.hsh.tsv > 18_syn_pan_aug_extra.clust.tsv echo " For each pan-gene set, retrieve sequences into a multifasta file." - echo " Fasta file:" "${protein_files[@]}" "${protein_files_extra_constr[@]}" "${protein_files_extra_free[@]}" - if [ -d 19_pan_aug_leftover_merged_cds ]; then rm -rf 19_pan_aug_leftover_merged_cds; fi - mkdir -p 19_pan_aug_leftover_merged_cds + echo " The directory and multifasta file are called 19_pan_aug_leftover_merged_cds (abbreviated 19_palmc)" + echo " Fasta file:" "${cds_files[@]}" "${cds_files_extra_constr[@]}" "${cds_files_extra_free[@]}" + if [ -d 19_palmc ]; then rm -rf 19_palmc; fi + mkdir -p 19_palmc get_fasta_from_family_file.pl "${cds_files[@]}" "${cds_files_extra_constr[@]}" "${cds_files_extra_free[@]}" \ - -fam 18_syn_pan_aug_extra.clust.tsv -out 19_pan_aug_leftover_merged_cds + -fam 18_syn_pan_aug_extra.clust.tsv -out 19_palmc - echo " Merge files in 19_pan_aug_leftover_merged_cds, prefixing IDs with panID__" - find 19_pan_aug_leftover_merged_cds -maxdepth 1 -mindepth 1 | + echo " Merge files in 19_palmc, prefixing IDs with panID__" + find 19_palmc -maxdepth 1 -mindepth 1 | sort | - xargs awk '/^>/ {printf(">%s__%s\n", substr(FILENAME,index(FILENAME,"/")+1), substr($0,2)); next} {print}' > 19_pan_aug_leftover_merged_cds.fna + xargs awk '/^>/ {printf(">%s__%s\n", substr(FILENAME,index(FILENAME,"/")+1), substr($0,2)); next} {print}' > 19_palmc.fna else echo "== No annotations were designated as \"extra\", so just promote the syn_pan_aug files as syn_pan_aug_extra. ==" - cp 07_pan_fasta_cds.fna 19_pan_aug_leftover_merged_cds.fna + cp 07_pan_fasta_cds.fna 19_palmc.fna cp 12_syn_pan_aug.clust.tsv 18_syn_pan_aug_extra.clust.tsv cp 12_syn_pan_aug.hsh.tsv 18_syn_pan_aug_extra.hsh.tsv fi @@ -684,24 +686,25 @@ run_pick_exemplars() { done echo " Get protein sequences into pan-gene sets, corresponding with 18_syn_pan_aug_extra.clust.tsv" - if [ -d 19_pan_aug_leftover_merged_prot ]; then rm -rf 19_pan_aug_leftover_merged_prot; fi - mkdir -p 19_pan_aug_leftover_merged_prot + echo " The directory and multifasta file are called 19_pan_aug_leftover_merged_prot (abbreviated 19_palmp)" + if [ -d 19_palmp ]; then rm -rf 19_palmp; fi + mkdir -p 19_palmp get_fasta_from_family_file.pl 20_pan_fasta_prot.faa \ - -fam 18_syn_pan_aug_extra.clust.tsv -out 19_pan_aug_leftover_merged_prot + -fam 18_syn_pan_aug_extra.clust.tsv -out 19_palmp - echo " Get all protein sequences from files in 19_pan_aug_leftover_merged_prot" - find 19_pan_aug_leftover_merged_prot -maxdepth 1 -mindepth 1 | + echo " Get all protein sequences from files in 19_palmp" + find 19_palmp -maxdepth 1 -mindepth 1 | sort | - xargs awk '/^>/ {printf(">%s__%s\n", substr(FILENAME,index(FILENAME,"/")+1), substr($0,2)); next} {print}' > 19_pan_aug_leftover_merged_prot.faa + xargs awk '/^>/ {printf(">%s__%s\n", substr(FILENAME,index(FILENAME,"/")+1), substr($0,2)); next} {print}' > 19_palmp.faa echo " Pick a representative sequence for each pangene set - as a sequence with the median length for that set." echo " == first proteins:" pick_family_rep.pl -nostop -prefer "$preferred_annot" \ - -out 21_pan_fasta_clust_rep_prot.faa < 19_pan_aug_leftover_merged_prot.faa + -out 21_pan_fasta_clust_rep_prot.faa < 19_palmp.faa echo " == then CDS sequences, corresponding with 21_pan_fasta_clust_rep_prot.faa" awk '$1~/^>/ {print substr($1,2)}' 21_pan_fasta_clust_rep_prot.faa > lists/lis.21_pan_fasta_clust_rep - get_fasta_subset.pl -in 19_pan_aug_leftover_merged_cds.fna -list lists/lis.21_pan_fasta_clust_rep \ + get_fasta_subset.pl -in 19_palmc.fna -list lists/lis.21_pan_fasta_clust_rep \ -clobber -out 21_pan_fasta_clust_rep_cds.fna perl -pi -e 's/__/ /' 21_pan_fasta_clust_rep_cds.fna @@ -889,153 +892,13 @@ run_order_and_name() { perl -pe 's/__/ /g' > 23_syn_pan_pctl"${pctl_low}"_posn_prot.faa } -########## -run_align_cds() { - cd "${WORK_DIR}" || exit - - echo "== Align CDS sequences ==" - - # If not already present, retrieve sequences for each family, preparatory to aligning them - if [[ -d 19_pan_aug_leftover_merged_cds ]]; then - : # do nothing; the directory and file(s) exist - else - mkdir -p 19_pan_aug_leftover_merged_cds - echo " For each pan-gene set, retrieve sequences into a multifasta file." - get_fasta_from_family_file.pl "${cds_files[@]}" "${cds_files_extra_constr[@]}" "${cds_files_extra_free[@]}" \ - -fam 18_syn_pan_aug_extra.clust.tsv -out 19_pan_aug_leftover_merged_cds - fi - - echo; echo "== Align nucleotide sequence for each gene family ==" - mkdir -p 20_aligns_cds - for filepath in 19_pan_aug_leftover_merged_cds/*; do - file=$(basename "$filepath"); - echo " Computing alignment, using program famsa, for file $file" - famsa -t 2 19_pan_aug_leftover_merged_cds/"$file" 20_aligns_cds/"$file" 1>/dev/null & - if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi - done - wait -} - -########## -run_align_protein() { - cd "${WORK_DIR}" || exit - - echo "== Align protein sequences ==" - - # If not already present, retrieve sequences for each family, preparatory to aligning them - if [[ -d 19_pan_aug_leftover_merged_prot ]]; then - : # do nothing; the directory and file(s) exist - else - mkdir -p 19_pan_aug_leftover_merged_prot - echo " For each pan-gene set, retrieve sequences into a multifasta file." - get_fasta_from_family_file.pl "${protein_files[@]}" "${protein_files_extra_constr[@]}" "${protein_files_extra_free[@]}" \ - -fam 18_syn_pan_aug_extra.clust.tsv -out 19_pan_aug_leftover_merged_prot - fi - - echo; echo "== Move small families to the side ==" - mkdir -p 19_pan_aug_leftover_merged_small - # Below, "count" is the number of unique sequences in the alignment. - for filepath in 19_pan_aug_leftover_merged_prot/*; do - file=$(basename "$filepath") - count=$(awk '$1!~/>/ {print FILENAME "\t" $1}' "$filepath" | sort -u | wc -l); - if [[ $count -lt $min_align_count ]]; then - echo "Set aside small family $file"; - mv "$filepath" 19_pan_aug_leftover_merged_small/ - fi; - done - - echo; echo "== Align protein sequence for the each gene family ==" - mkdir -p 20_aligns_prot - for filepath in 19_pan_aug_leftover_merged_prot/*; do - file=$(basename "$filepath"); - echo " Computing alignment, using program famsa, for file $file" - famsa -t 2 19_pan_aug_leftover_merged_prot/"$file" 20_aligns_prot/"$file" 1>/dev/null & - if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi - done - wait -} - -########## -run_model_and_trim() { - echo; echo "== Build HMMs ==" - cd "${WORK_DIR}" || exit - mkdir -p 21_hmm - for filepath in 20_aligns_cds/*; do - file=$(basename "$filepath"); - hmmbuild -n "$file" 21_hmm/"$file" "$filepath" 1>/dev/null & - if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi - done - wait - - echo; echo "== Realign to HMMs ==" - mkdir -p 22_hmmalign - for filepath in 21_hmm/*; do - file=$(basename "$filepath"); - printf "%f " "$file" - hmmalign --trim --outformat A2M -o 22_hmmalign/"$file" 21_hmm/"$file" 19_pan_aug_leftover_merged_cds/"$file" & - if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi - done - wait - echo - - echo; echo "== Trim HMM alignments to match-states ==" - mkdir -p 23_hmmalign_trim1 - for filepath in 22_hmmalign/*; do - file=$(basename "$filepath"); - printf "%s " "$file" - perl -ne 'if ($_ =~ />/) {print $_} else {$line = $_; $line =~ s/[a-z]//g; print $line}' "$filepath" | - sed '/^$/d' > 23_hmmalign_trim1/"$file" & - if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi - done - wait - echo - - echo; echo "== Filter alignments prior to tree calculation ==" - mkdir -p 23_hmmalign_trim2 23_hmmalign_trim2_log - min_depth=3 - min_pct_depth=20 - min_pct_aligned=20 - for filepath in 23_hmmalign_trim1/*; do - file=$(basename "$filepath") - printf "%f " "$file" - filter_align.pl -in "$filepath" -out 23_hmmalign_trim2/"$file" -log 23_hmmalign_trim2_log/"$file" \ - -depth $min_depth -pct_depth $min_pct_depth -min_pct_aligned $min_pct_aligned & - if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi - done - wait - echo -} - -########## -run_xfr_aligns_trees() { - echo; echo "Copy alignment and tree results into output directory" - - full_out_dir="${out_dir}" - - cd "${submit_dir}" || exit - - if [ ! -d "$full_out_dir" ]; then - echo "creating output directory \"${full_out_dir}/\"" - mkdir -p "$full_out_dir" - fi - - for dir in 20_aligns_cds 20_aligns_prot 21_hmm 22_hmmalign 23_hmmalign_trim2 24_trees; do - if [ -d "${WORK_DIR}"/$dir ]; then - echo "Copying directory $dir to output directory" - cp -r "${WORK_DIR}"/$dir "${full_out_dir}"/ - else - echo "Warning: couldn't find dir ${WORK_DIR}/$dir; skipping" - fi - done -} - ########## run_calc_chr_pairs() { cd "${WORK_DIR}" || exit echo "Generate a report of observed chromosome pairs" echo " Identify gene pairs, using mmseqs --easy_cluster" - mmseqs easy-cluster 19_pan_aug_leftover_merged_cds.fna 24_pan_fasta_clust 03_mmseqs_tmp \ + mmseqs easy-cluster 19_palmc.fna 24_pan_fasta_clust 03_mmseqs_tmp \ --min-seq-id "$clust_iden" -c "$clust_cov" --cov-mode 0 --cluster-reassign 1>/dev/null echo " Extract chromosome-chromosome correspondences" @@ -1273,7 +1136,7 @@ run_clean() { echo " work_dir: $PWD" if [ -d MMTEMP ]; then rm -rf MMTEMP/*; fi - for dir in 11_pan_leftovers 13_extra_out_dir 16_pan_leftovers_extra 19_pan_aug_leftover_merged_prot \ + for dir in 11_pan_leftovers 13_extra_out_dir 16_pan_leftovers_extra 19_palmp \ 22_syn_pan_aug_extra_pctl${pctl_low}; do if [ -d "$dir" ]; then echo " Removing directory $dir"; rm -rf "$dir" & fi @@ -1296,7 +1159,8 @@ pandagma_conf_params='clust_iden clust_cov extra_iden mcl_inflation # The steps align_cds, align_protein, model_and_trim, calc_trees, and xfr_aligns_trees may be run separately. export commandlist="ingest mmseqs filter dagchainer mcl consense cluster_rest add_extra \ - pick_exemplars filter_to_pctile tabularize order_and_name calc_chr_pairs summarize" + pick_exemplars filter_to_pctile tabularize order_and_name \ + align model_and_trim calc_trees xfr_aligns_trees calc_chr_pairs summarize" export dependencies='mmseqs dagchainer mcl cons famsa hmmalign hmmbuild run_DAG_chainer.pl' diff --git a/config/family3_18_3.conf b/config/family3_18_3.conf index b5ad541..7529591 100644 --- a/config/family3_18_3.conf +++ b/config/family3_18_3.conf @@ -11,6 +11,8 @@ ks_block_wgd_cutoff="1.75" # Fallback, if a ks_cutoffs file is not provided. max_pair_ks="4.0" # Fallback value for excluding gene pairs, if a ks_cutoffs file is not provided. consen_prefix='Legume.fam3.' annot_str_regex='([^.]+\.[^.]+)\..+' +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it expected_quotas=( aesev 2 diff --git a/config/family3_22_3.conf b/config/family3_22_3.conf index dd65ae9..11a0e36 100644 --- a/config/family3_22_3.conf +++ b/config/family3_22_3.conf @@ -12,6 +12,7 @@ max_pair_ks="4.0" # Fallback value for excluding gene pairs, if a ks_cutoffs consen_prefix='Legume.fam3.' annot_str_regex='([^.]+\.[^.]+)\..+' min_align_count="6" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it expected_quotas=( acacr 2 diff --git a/config/family3_4_3.conf b/config/family3_4_3.conf index a6820ce..051d088 100644 --- a/config/family3_4_3.conf +++ b/config/family3_4_3.conf @@ -11,6 +11,8 @@ ks_block_wgd_cutoff="1.75" # Fallback, if a ks_cutoffs file is not provided. max_pair_ks="4.0" # Fallback value for excluding gene pairs, if a ks_cutoffs file is not provided. consen_prefix='Legume.fam3.' annot_str_regex='([^.]+\.[^.]+)\..+' +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it expected_quotas=( arath 2 diff --git a/config/family3_sup.conf b/config/family3_sup.conf index b149a15..c0170bd 100644 --- a/config/family3_sup.conf +++ b/config/family3_sup.conf @@ -3,6 +3,8 @@ clust_cov='0.40' TE_match_iden='0.40' consen_method='align_sample' # align_sample >> cons > hmmemit annot_str_regex='([^.]+\.[^.]+)\..+' +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it ##### (required) list of FASTA file paths # List CDS files in the "cds_files" array diff --git a/config/family3_sup_ur.conf b/config/family3_sup_ur.conf index 7b45dca..11fba3e 100644 --- a/config/family3_sup_ur.conf +++ b/config/family3_sup_ur.conf @@ -3,6 +3,8 @@ clust_cov='0.40' TE_match_iden='0.40' consen_method='align_sample' # align_sample >> cons > hmmemit annot_str_regex='([^.]+\.[^.]+)\..+' +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it ##### (required) list of FASTA file paths # List CDS files in the "cds_files" array diff --git a/config/family3_ur.conf b/config/family3_ur.conf index 7b45dca..11fba3e 100644 --- a/config/family3_ur.conf +++ b/config/family3_ur.conf @@ -3,6 +3,8 @@ clust_cov='0.40' TE_match_iden='0.40' consen_method='align_sample' # align_sample >> cons > hmmemit annot_str_regex='([^.]+\.[^.]+)\..+' +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it ##### (required) list of FASTA file paths # List CDS files in the "cds_files" array diff --git a/config/family_7_3.conf b/config/family_7_3.conf index c78d43a..9de8302 100644 --- a/config/family_7_3.conf +++ b/config/family_7_3.conf @@ -11,6 +11,8 @@ ks_block_wgd_cutoff="1.75" # Fallback, if a ks_cutoffs file is not provided. max_pair_ks="4.0" # Fallback value for excluding gene pairs, if a ks_cutoffs file is not provided. consen_prefix='Legume.fam3.' annot_str_regex='([^.]+\.[^.]+)\..+' +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it expected_quotas=( arath 2 diff --git a/config/family_ur.conf b/config/family_ur.conf index bf39910..d16020b 100644 --- a/config/family_ur.conf +++ b/config/family_ur.conf @@ -11,6 +11,8 @@ ks_block_wgd_cutoff="1.75" # Fallback, if a ks_cutoffs file is not provided. max_pair_ks="4.0" # Fallback value for excluding gene pairs, if a ks_cutoffs file is not provided. consen_prefix='Legume.fam3.' annot_str_regex='([^.]+\.[^.]+)\..+' +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it expected_quotas=( arath 2 From b8ababa8243a0b6ad83d0c82f79107457c82f918 Mon Sep 17 00:00:00 2001 From: Steven Cannon Date: Fri, 23 Feb 2024 13:55:40 -0800 Subject: [PATCH 03/18] Various fixes in alignment and modeling steps, after moving this code to -common --- bin/pandagma-common.sh | 32 +++++++++----------------------- bin/pandagma-fam.sh | 7 ++++--- bin/pandagma-pan.sh | 3 ++- 3 files changed, 15 insertions(+), 27 deletions(-) diff --git a/bin/pandagma-common.sh b/bin/pandagma-common.sh index 025e892..cc65c28 100755 --- a/bin/pandagma-common.sh +++ b/bin/pandagma-common.sh @@ -72,14 +72,14 @@ run_align_cds() { echo; echo "== Move small families to the side ==" mkdir -p 19_palmc_small - # Below, count_seqs = number of unique sequences in the alignment; count_annots = # of unique annotation groups. + # Below, count_seqs = number of sequences in the alignment; count_annots = number of unique annotation groups. min_annots_in_align=2 # require at least this many distinct annotation groups in an alignment to retain it. for filepath in 19_palmc/*; do file=$(basename "$filepath") - count_seqs=$(awk '$1!~/>/ {print FILENAME "\t" $1}' "$filepath" | sort -u | wc -l); + count_seqs=$(awk '$1!~/>/ {print FILENAME "\t" $1}' "$filepath" | wc -l); count_annots=$( perl -lane 'BEGIN{$REX=qr($ENV{"ANN_REX"})}; if($F[0]=~/$REX/){ print $1 }' $filepath | sort -u | wc -l) if [[ $count_seqs -lt $min_align_count ]] || [[ $count_annots -lt $min_annots_in_align ]]; then - echo "Set aside small family $file; $count_seqs sequences, $count_annots annotations"; + echo "Set aside small family $file; $count_seqs sequences, $count_annots distinct annotations"; mv "$filepath" 19_palmc_small/ fi; done @@ -130,7 +130,7 @@ run_align_protein() { mkdir -p 20_aligns_prot for filepath in 19_palmp/*; do file=$(basename "$filepath"); - echo " Computing alignment, using program famsa, for file $file" + # echo " Computing alignment, using program famsa, for file $file" famsa -t 2 19_palmp/"$file" 20_aligns_prot/"$file" 1>/dev/null & if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi done @@ -142,7 +142,7 @@ run_model_and_trim() { echo; echo "== Build HMMs ==" cd "${WORK_DIR}" || exit mkdir -p 21_hmm - for filepath in 20_aligns_cds/*; do + for filepath in 20_aligns_prot/*; do file=$(basename "$filepath"); hmmbuild -n "$file" 21_hmm/"$file" "$filepath" 1>/dev/null & if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi @@ -153,8 +153,8 @@ run_model_and_trim() { mkdir -p 22_hmmalign for filepath in 21_hmm/*; do file=$(basename "$filepath"); - printf "%f " "$file" - hmmalign --trim --outformat A2M -o 22_hmmalign/"$file" 21_hmm/"$file" 19_palmc/"$file" & + # printf "%s " "$file" + hmmalign --trim --outformat A2M -o 22_hmmalign/"$file" 21_hmm/"$file" 19_palmp/"$file" & if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi done wait @@ -164,7 +164,7 @@ run_model_and_trim() { mkdir -p 23_hmmalign_trim1 for filepath in 22_hmmalign/*; do file=$(basename "$filepath"); - printf "%s " "$file" + # printf "%s " "$file" perl -ne 'if ($_ =~ />/) {print $_} else {$line = $_; $line =~ s/[a-z]//g; print $line}' "$filepath" | sed '/^$/d' > 23_hmmalign_trim1/"$file" & if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi @@ -179,7 +179,7 @@ run_model_and_trim() { min_pct_aligned=20 for filepath in 23_hmmalign_trim1/*; do file=$(basename "$filepath") - printf "%f " "$file" + # printf "%s " "$file" filter_align.pl -in "$filepath" -out 23_hmmalign_trim2/"$file" -log 23_hmmalign_trim2_log/"$file" \ -depth $min_depth -pct_depth $min_pct_depth -min_pct_aligned $min_pct_aligned & if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi @@ -199,20 +199,6 @@ run_calc_trees() { mkdir -p "${dir_prefix}4_trees" - echo; echo "== Move small (<4) and very low-entropy families (sequences are all identical) to the side ==" - mkdir -p "${dir_prefix}3_pan_aug_small_or_identical" - # Below, count_seqs = number of unique sequences in the alignment; count_annots = # of unique annotation groups. - min_annots_in_align=2 # require at least this many distinct annotation groups in an alignment to retain it. - for filepath in "${dir_prefix}3_hmmalign_trim2"/*; do - file=$(basename "$filepath") - count_seqs=$(awk '$1!~/>/ {print FILENAME "\t" $1}' "$filepath" | sort -u | wc -l); - count_annots=$( perl -lane 'BEGIN{$REX=qr($ENV{"ANN_REX"})}; if($F[0]=~/$REX/){ print $1 }' $filepath | sort -u | wc -l) - if [[ $count_seqs -lt $min_align_count ]] || [[ $count_annots -lt $min_annots_in_align ]]; then - echo "Set aside small or low-entropy family $file; $count_seqs sequences, $count_annots annotations"; - mv "$filepath" "${dir_prefix}3_pan_aug_small_or_identical"/ - fi; - done - # By default, FastTreeMP uses all available threads. # It is more efficient to run more jobs on one core each by setting an environment variable. OMP_NUM_THREADS=1 diff --git a/bin/pandagma-fam.sh b/bin/pandagma-fam.sh index f37fc18..8d75a59 100755 --- a/bin/pandagma-fam.sh +++ b/bin/pandagma-fam.sh @@ -896,10 +896,11 @@ pandagma_conf_params='clust_iden clust_cov extra_iden mcl_inflation strict_synt ks_low_cutoff ks_hi_cutoff ks_binsize ks_block_wgd_cutoff max_pair_ks consen_prefix annot_str_regex' -export commandlist="ingest mmseqs filter dagchainer \ - ks_calc ks_filter \ +# The steps align_cds, align_protein, model_and_trim, calc_trees, and xfr_aligns_trees may be run separately. +# Those steps (functions) are in pandagma-common.sh +export commandlist="ingest mmseqs filter dagchainer ks_calc ks_filter \ mcl consense cluster_rest add_extra tabularize \ - align model_and_trim calc_trees xfr_aligns_trees summarize" + align_protein model_and_trim calc_trees xfr_aligns_trees summarize" export dependencies='dagchainer mcl cons famsa run_DAG_chainer.pl' diff --git a/bin/pandagma-pan.sh b/bin/pandagma-pan.sh index 88351c3..991e0cd 100755 --- a/bin/pandagma-pan.sh +++ b/bin/pandagma-pan.sh @@ -1158,9 +1158,10 @@ pandagma_conf_params='clust_iden clust_cov extra_iden mcl_inflation consen_prefix annot_str_regex order_method preferred_annot' # The steps align_cds, align_protein, model_and_trim, calc_trees, and xfr_aligns_trees may be run separately. +# Those steps (functions) are in pandagma-common.sh export commandlist="ingest mmseqs filter dagchainer mcl consense cluster_rest add_extra \ pick_exemplars filter_to_pctile tabularize order_and_name \ - align model_and_trim calc_trees xfr_aligns_trees calc_chr_pairs summarize" + align_cds align_protein model_and_trim calc_trees xfr_aligns_trees calc_chr_pairs summarize" export dependencies='mmseqs dagchainer mcl cons famsa hmmalign hmmbuild run_DAG_chainer.pl' From 403dd1d0fb68e73ae8825b4430b66ab207d30b56 Mon Sep 17 00:00:00 2001 From: Steven Cannon Date: Mon, 26 Feb 2024 07:26:30 -0800 Subject: [PATCH 04/18] Handle argument list too long errors around augment_cluster_sets.awk --- bin/pandagma-pan.sh | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/bin/pandagma-pan.sh b/bin/pandagma-pan.sh index 991e0cd..be1df20 100755 --- a/bin/pandagma-pan.sh +++ b/bin/pandagma-pan.sh @@ -405,7 +405,9 @@ run_consense() { get_fasta_from_family_file.pl "${cds_files[@]}" -fam 06_syn_pan.clust.tsv -out 07_pan_fasta echo " Merge fasta files in 07_pan_fasta, prefixing IDs with panID" - merge_files_to_pan_fasta.awk 07_pan_fasta/* > 07_pan_fasta_cds.fna + pushd 07_pan_fasta + merge_files_to_pan_fasta.awk * > ../07_pan_fasta_cds.fna + popd echo " Pick a representative seq. for each orthogroup - as a sequence with the median length for that OG." echo " < 07_pan_fasta_cds.fna pick_family_rep.pl -prefer $preferred_annot -out 08_pan_fasta_clust_rep_cds.fna" @@ -415,7 +417,9 @@ run_consense() { cat_or_zcat "${cds_files[@]}" | awk '/^>/ {print substr($1,2)}' | sort > lists/09_all_genes echo " Get sorted list of all clustered genes" - awk '$1~/^>/ {print $1}' 07_pan_fasta/* | sed 's/>//' | sort > lists/09_all_clustered_genes + pushd 07_pan_fasta + awk '$1~/^>/ {print $1}' * | sed 's/>//' | sort > ../lists/09_all_clustered_genes + popd echo " Get list of genes not in clusters" comm -13 lists/09_all_clustered_genes lists/09_all_genes > lists/09_genes_not_in_clusters @@ -474,9 +478,11 @@ run_consense() { echo " Make augmented cluster sets. Uniqify on the back end, converting from mcl clust format to hsh (clust_ID gene)." cat /dev/null > 12_syn_pan_aug_pre.hsh.tsv - augment_cluster_sets.awk leftovers_dir=11_pan_leftovers 07_pan_fasta/* | - perl -lane 'for $i (1..scalar(@F)-1){print $F[0], "\t", $F[$i]}' | - sort -k1,1 -k2,2 | uniq > 12_syn_pan_aug_pre.hsh.tsv + pushd 07_pan_fasta + augment_cluster_sets.awk leftovers_dir=../11_pan_leftovers * | + perl -lane 'for $i (1..scalar(@F)-1){print $F[0], "\t", $F[$i]}' | + sort -k1,1 -k2,2 | uniq > ../12_syn_pan_aug_pre.hsh.tsv + popd echo " Reshape from mcl output format (clustered IDs on one line) to a hash format (clust_ID gene)" hash_to_rows_by_1st_col.awk < 12_syn_pan_aug_pre.hsh.tsv > 12_syn_pan_aug_pre.clust.tsv @@ -551,7 +557,9 @@ run_add_extra() { get_fasta_from_family_file.pl "${cds_files[@]}" -fam 12_syn_pan_aug.clust.tsv -out 13_pan_aug_fasta echo " Merge fasta files in 13_pan_aug_fasta, prefixing IDs with panID__" - merge_files_to_pan_fasta.awk 13_pan_aug_fasta/* > 13_pan_aug_fasta.fna + pushd 13_pan_aug_fasta + merge_files_to_pan_fasta.awk * > ../13_pan_aug_fasta.fna + popd echo " Store the panID - geneID in a hash to be retrieved later, after filtering by chromosome" awk '$1~/^>/ {print substr($1,2)}' 13_pan_aug_fasta.fna | @@ -646,9 +654,11 @@ run_add_extra() { -fam 14_syn_pan_extra.clust.tsv -out 16_pan_leftovers_extra/ echo " Make augmented cluster sets. Uniqify on the back end, converting from mcl clust format to hsh (clust_ID gene)." - augment_cluster_sets.awk leftovers_dir=16_pan_leftovers_extra 13_pan_aug_fasta/* | - perl -lane 'for $i (1..scalar(@F)-1){print $F[0], "\t", $F[$i]}' | - sort -k1,1 -k2,2 | uniq > 18_syn_pan_aug_extra.hsh.tsv + pushd 13_pan_aug_fasta + augment_cluster_sets.awk leftovers_dir=../16_pan_leftovers_extra * | + perl -lane 'for $i (1..scalar(@F)-1){print $F[0], "\t", $F[$i]}' | + sort -k1,1 -k2,2 | uniq > ../18_syn_pan_aug_extra.hsh.tsv + popd echo " Reshape from hash to mcl output format (clustered IDs on one line)" hash_to_rows_by_1st_col.awk 18_syn_pan_aug_extra.hsh.tsv > 18_syn_pan_aug_extra.clust.tsv From 2d4500c809e41ab8c967d0eef490eb8b755e78ef Mon Sep 17 00:00:00 2001 From: Steven Cannon Date: Mon, 26 Feb 2024 07:27:59 -0800 Subject: [PATCH 05/18] Add Wm82.gnm6 annotation to Glycine pangene config & make file --- ...ne5_21_29_6.conf => Glycine5_21_30_6.conf} | 23 +++++++++++-------- ...lycine5_21_29_6.mk => Glycine5_21_30_6.mk} | 3 +++ 2 files changed, 16 insertions(+), 10 deletions(-) rename config/{Glycine5_21_29_6.conf => Glycine5_21_30_6.conf} (98%) rename get_data/{Glycine5_21_29_6.mk => Glycine5_21_30_6.mk} (98%) diff --git a/config/Glycine5_21_29_6.conf b/config/Glycine5_21_30_6.conf similarity index 98% rename from config/Glycine5_21_29_6.conf rename to config/Glycine5_21_30_6.conf index 4a528bb..022031b 100644 --- a/config/Glycine5_21_29_6.conf +++ b/config/Glycine5_21_30_6.conf @@ -9,7 +9,7 @@ pctl_med="50" pctl_hi="75" consen_prefix='Glycine.pan5' annot_str_regex='([^.]+\.[^.]+\.[^.]+\.[^.]+)\..+' -preferred_annot='Wm82_ISU01.gnm2.ann1' +preferred_annot='Wm82.gnm6.ann1' order_method="alignment" ##### (required) list of GFF & FASTA file paths @@ -26,12 +26,12 @@ annotation_files=( glyma.Lee.gnm2.ann1.1FNT.gene_models_main.bed.gz glyma.Lee.gnm3.ann1.ZYY3.gene_models_main.bed.gz glyma.Wenfeng7_IGA1001.gnm1.ann1.ZK5W.gene_models_main.bed.gz - glyma.Wm82_IGA1008.gnm1.ann1.FGN6.gene_models_main.bed.gz - glyma.Wm82_ISU01.gnm2.ann1.FGFB.gene_models_main.bed.gz - glyma.Wm82_NJAU.gnm1.ann1.KM71.gene_models_main.bed.gz glyma.Wm82.gnm2.ann1.RVB6.gene_models_main.bed.gz glyma.Wm82.gnm4.ann1.T8TQ.gene_models_main.bed.gz glyma.Wm82.gnm5.ann1.J7HW.gene_models_main.bed.gz + glyma.Wm82.gnm6.ann1.PKSW.gene_models_main.bed.gz + glyma.Wm82_IGA1008.gnm1.ann1.FGN6.gene_models_main.bed.gz + glyma.Wm82_NJAU.gnm1.ann1.KM71.gene_models_main.bed.gz glyma.Zh13_IGA1005.gnm1.ann1.87Z5.gene_models_main.bed.gz glyma.Zh13.gnm1.ann1.8VV3.gene_models_main.bed.gz glyma.Zh35_IGA1004.gnm1.ann1.RGN6.gene_models_main.bed.gz @@ -50,12 +50,12 @@ cds_files=( glyma.Lee.gnm2.ann1.1FNT.cds_primary.fna.gz glyma.Lee.gnm3.ann1.ZYY3.cds.fna.gz glyma.Wenfeng7_IGA1001.gnm1.ann1.ZK5W.cds.fna.gz - glyma.Wm82_IGA1008.gnm1.ann1.FGN6.cds.fna.gz - glyma.Wm82_ISU01.gnm2.ann1.FGFB.cds_primary.fna.gz - glyma.Wm82_NJAU.gnm1.ann1.KM71.cds_primary.fna.gz glyma.Wm82.gnm2.ann1.RVB6.cds_primary.fna.gz glyma.Wm82.gnm4.ann1.T8TQ.cds_primary.fna.gz glyma.Wm82.gnm5.ann1.J7HW.cds.fna.gz + glyma.Wm82.gnm6.ann1.PKSW.cds_primary.fna.gz + glyma.Wm82_IGA1008.gnm1.ann1.FGN6.cds.fna.gz + glyma.Wm82_NJAU.gnm1.ann1.KM71.cds_primary.fna.gz glyma.Zh13_IGA1005.gnm1.ann1.87Z5.cds.fna.gz glyma.Zh13.gnm1.ann1.8VV3.cds_primary.fna.gz glyma.Zh35_IGA1004.gnm1.ann1.RGN6.cds.fna.gz @@ -74,12 +74,12 @@ protein_files=( glyma.Lee.gnm2.ann1.1FNT.protein_primary.faa.gz glyma.Lee.gnm3.ann1.ZYY3.protein.faa.gz glyma.Wenfeng7_IGA1001.gnm1.ann1.ZK5W.protein.faa.gz - glyma.Wm82_IGA1008.gnm1.ann1.FGN6.protein.faa.gz - glyma.Wm82_ISU01.gnm2.ann1.FGFB.protein_primary.faa.gz - glyma.Wm82_NJAU.gnm1.ann1.KM71.protein_primary.faa.gz glyma.Wm82.gnm2.ann1.RVB6.protein_primary.faa.gz glyma.Wm82.gnm4.ann1.T8TQ.protein_primary.faa.gz glyma.Wm82.gnm5.ann1.J7HW.protein.faa.gz + glyma.Wm82.gnm6.ann1.PKSW.protein_primary.faa.gz + glyma.Wm82_IGA1008.gnm1.ann1.FGN6.protein.faa.gz + glyma.Wm82_NJAU.gnm1.ann1.KM71.protein_primary.faa.gz glyma.Zh13_IGA1005.gnm1.ann1.87Z5.protein.faa.gz glyma.Zh13.gnm1.ann1.8VV3.protein_primary.faa.gz glyma.Zh35_IGA1004.gnm1.ann1.RGN6.protein.faa.gz @@ -110,6 +110,7 @@ annotation_files_extra_constr=( glyma.TongShanTianEDan.gnm1.ann1.56XW.gene_models_main.bed.gz glyma.WanDouNo_28.gnm1.ann1.NLYP.gene_models_main.bed.gz glyma.Wm82.gnm1.ann1.DvBy.gene_models_main.bed.gz + glyma.Wm82_ISU01.gnm2.ann1.FGFB.gene_models_main.bed.gz glyma.XuDouNo_1.gnm1.ann1.G2T7.gene_models_main.bed.gz glyma.YuDouNo_22.gnm1.ann1.HCQ1.gene_models_main.bed.gz glyma.Zh13.gnm2.ann1.FJ3G.gene_models_main.bed.gz @@ -142,6 +143,7 @@ cds_files_extra_constr=( glyma.TongShanTianEDan.gnm1.ann1.56XW.cds_primary.fna.gz glyma.WanDouNo_28.gnm1.ann1.NLYP.cds_primary.fna.gz glyma.Wm82.gnm1.ann1.DvBy.cds_primary.fna.gz + glyma.Wm82_ISU01.gnm2.ann1.FGFB.cds_primary.fna.gz glyma.XuDouNo_1.gnm1.ann1.G2T7.cds_primary.fna.gz glyma.YuDouNo_22.gnm1.ann1.HCQ1.cds_primary.fna.gz glyma.Zh13.gnm2.ann1.FJ3G.cds_primary.fna.gz @@ -174,6 +176,7 @@ protein_files_extra_constr=( glyma.TongShanTianEDan.gnm1.ann1.56XW.protein_primary.faa.gz glyma.WanDouNo_28.gnm1.ann1.NLYP.protein_primary.faa.gz glyma.Wm82.gnm1.ann1.DvBy.protein_primary.faa.gz + glyma.Wm82_ISU01.gnm2.ann1.FGFB.protein_primary.faa.gz glyma.XuDouNo_1.gnm1.ann1.G2T7.protein_primary.faa.gz glyma.YuDouNo_22.gnm1.ann1.HCQ1.protein_primary.faa.gz glyma.Zh13.gnm2.ann1.FJ3G.protein_primary.faa.gz diff --git a/get_data/Glycine5_21_29_6.mk b/get_data/Glycine5_21_30_6.mk similarity index 98% rename from get_data/Glycine5_21_29_6.mk rename to get_data/Glycine5_21_30_6.mk index b42a58f..75f1fff 100644 --- a/get_data/Glycine5_21_29_6.mk +++ b/get_data/Glycine5_21_30_6.mk @@ -107,6 +107,9 @@ glyma.Wm82.gnm4.ann1.T8TQ.protein_primary.faa.gz \ glyma.Wm82.gnm5.ann1.J7HW.cds.fna.gz \ glyma.Wm82.gnm5.ann1.J7HW.gene_models_main.bed.gz \ glyma.Wm82.gnm5.ann1.J7HW.protein.faa.gz \ +glyma.Wm82.gnm6.ann1.PKSW.cds_primary.fna.gz \ +glyma.Wm82.gnm6.ann1.PKSW.gene_models_main.bed.gz \ +glyma.Wm82.gnm6.ann1.PKSW.protein_primary.faa.gz \ glyma.Wm82_IGA1008.gnm1.ann1.FGN6.cds.fna.gz \ glyma.Wm82_IGA1008.gnm1.ann1.FGN6.gene_models_main.bed.gz \ glyma.Wm82_IGA1008.gnm1.ann1.FGN6.protein.faa.gz \ From 38840b2bc993f11c36e4fcc93aadc4a23bc3c7e5 Mon Sep 17 00:00:00 2001 From: Steven Cannon Date: Mon, 26 Feb 2024 09:42:36 -0800 Subject: [PATCH 06/18] Remove 04_dag at start of filter, to avoid problems of leftover files from previous runs --- bin/pandagma-pan.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/pandagma-pan.sh b/bin/pandagma-pan.sh index be1df20..6d1c37b 100755 --- a/bin/pandagma-pan.sh +++ b/bin/pandagma-pan.sh @@ -299,6 +299,7 @@ run_mmseqs() { run_filter() { echo; echo "From mmseqs cluster output, split out the following fields: molecule, gene, start, stop." cd "${WORK_DIR}" || exit + if [ -d 04_dag ]; then rm -rf 04_dag; fi mkdir -p 04_dag if [[ -v expected_chr_matches ]]; then # filter based on list of expected chromosome pairings if provided echo "Filtering on chromosome patterns defined in expected_chr_matches" From 947861fb7b0e823a6b8b8b7b0b42fcc6e265809f Mon Sep 17 00:00:00 2001 From: Steven Cannon Date: Tue, 27 Feb 2024 10:03:15 -0800 Subject: [PATCH 07/18] Minor cosmetic change in stats/ks_histplots.tsv --- bin/pandagma-fam.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/pandagma-fam.sh b/bin/pandagma-fam.sh index 8d75a59..04f2e1c 100755 --- a/bin/pandagma-fam.sh +++ b/bin/pandagma-fam.sh @@ -447,7 +447,7 @@ run_ks_calc() { echo "# $filebase" echo "# Normalized relative to Ks peak inferred at bin $ks_bin, with amplitude $ks_amplitude_pct" awk 'NF==7 && $7<=3 {print $7}' "$ks_path" | histogram.pl -z -n -s "$ks_binsize" | - histplot.pl -d "$ks_amplitude_pct" + histplot.pl -d "$ks_amplitude_pct" | cut -b1-150 echo "" } >> stats/ks_histplots.tsv done From 1b2337b1d18abb298408b12c743b48d91663f02c Mon Sep 17 00:00:00 2001 From: Steven Cannon Date: Tue, 27 Feb 2024 13:24:47 -0800 Subject: [PATCH 08/18] Add two params, min_align_count min_annots_in_align, to the reporting in the summarize step --- bin/pandagma-fam.sh | 7 ++++--- config/family3_22_3.conf | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/bin/pandagma-fam.sh b/bin/pandagma-fam.sh index 04f2e1c..d14a4a2 100755 --- a/bin/pandagma-fam.sh +++ b/bin/pandagma-fam.sh @@ -892,9 +892,10 @@ run_clean() { ######################################## # Main program -pandagma_conf_params='clust_iden clust_cov extra_iden mcl_inflation strict_synt -ks_low_cutoff ks_hi_cutoff ks_binsize ks_block_wgd_cutoff max_pair_ks -consen_prefix annot_str_regex' +pandagma_conf_params='clust_iden clust_cov extra_iden mcl_inflation strict_synt +ks_low_cutoff ks_hi_cutoff ks_binsize ks_block_wgd_cutoff +max_pair_ks min_align_count min_annots_in_align +consen_prefix annot_str_regex preferred_annot expected_quotas' # The steps align_cds, align_protein, model_and_trim, calc_trees, and xfr_aligns_trees may be run separately. # Those steps (functions) are in pandagma-common.sh diff --git a/config/family3_22_3.conf b/config/family3_22_3.conf index 11a0e36..e52c3d6 100644 --- a/config/family3_22_3.conf +++ b/config/family3_22_3.conf @@ -11,7 +11,7 @@ ks_block_wgd_cutoff="1.75" # Fallback, if a ks_cutoffs file is not provided. max_pair_ks="4.0" # Fallback value for excluding gene pairs, if a ks_cutoffs file is not provided. consen_prefix='Legume.fam3.' annot_str_regex='([^.]+\.[^.]+)\..+' -min_align_count="6" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it expected_quotas=( From d592aa08539f68fa464be652570e3b73451fcdf6 Mon Sep 17 00:00:00 2001 From: Steven Cannon Date: Tue, 27 Feb 2024 18:46:11 -0800 Subject: [PATCH 09/18] Fix listing of pandagma_conf_params reported in summarize step, and other minor cleanup --- bin/pandagma-fam.sh | 41 ++++++++++++++++------------------------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/bin/pandagma-fam.sh b/bin/pandagma-fam.sh index d14a4a2..d61a841 100755 --- a/bin/pandagma-fam.sh +++ b/bin/pandagma-fam.sh @@ -102,6 +102,7 @@ Variables in pandagma config file (Set the config with the CONF environment vari clust_iden - Minimum identity threshold for mmseqs clustering [0.40] clust_cov - Minimum coverage for mmseqs clustering [0.40] extra_iden - Minimum identity threshold for mmseqs addition of \"extra\" annotations [0.30] + TE_match_iden - Minimum identity threshold for excluding match to transposable element file - for "TESearch" mcl_inflation - Inflation parameter, for Markov clustering [1.6] strict_synt - For clustering of the \"main\" annotations, use only syntenic pairs [1] The alternative (0) is to use all homologous pairs that satisfy expected_quotas @@ -110,18 +111,14 @@ Variables in pandagma config file (Set the config with the CONF environment vari ks_binsize - For calculating and displaying histograms. [0.05] ks_block_wgd_cutoff - Fallback, if a ks_peaks.tsv file is not provided. [1.75] max_pair_ks - Fallback value for excluding gene pairs, if a ks_peaks.tsv file is not provided. [4.0] - min_align_count - Minimum number of sequences to trigger alignments, modeling, and trees [4] -min_annots_in_align - Minimum number of distinct annotation groups in an alignment to retain it [2] - consen_prefix - Prefix to use in orthogroup names annot_str_regex - Regular expression for capturing annotation name from gene ID, e.g. \"([^.]+\.[^.]+)\..+\" for two dot-separated fields, e.g. vigan.Shumari or \"(\D+\d+\D+)\d+.+\" for Zea assembly+annot string, e.g. Zm00032ab - preferred_annot - String to match and select an annotation set, from a gene ID. - This is used for picking representative IDs+sequence from an orthogroup, when - this annotation is among those with the median length for the orthogroup. - Otherwise, one is selected at random from those with median length. + min_align_count - Minimum number of sequences to trigger alignments, modeling, and trees [4] +min_annots_in_align - Minimum number of distinct annotation groups in an alignment to retain it [2] + expected_quotas - (Optional) array of seqid prefixes & expected number of paralogs for the species identified by the prefix; e.g.: expected_quotas=(glyma 4 medtr 2) @@ -701,12 +698,9 @@ run_add_extra() { -fam 18_syn_pan_aug_extra.clust.tsv -out 19_palmp echo " Merge fasta files from 19_palmp, prefixing IDs with panID__" - merge_files_to_pan_fasta.awk 19_palmp/* > 19_palmp.faa - #for path in 19_palmp/*; do - # pan_file=`basename $path` - # cat $path | awk -v panID=$pan_file ' $1~/^>/ {print ">" panID "__" substr($0,2) } - # $1!~/^>/ {print $1} ' >> 19_palmp.faa - #done + pushd 19_palmp + merge_files_to_pan_fasta.awk 19_palmp/* > ../19_palmp.faa + popd else echo "== No annotations were designated as \"extra\", so just promote the syn_pan_aug files as syn_pan_aug_extra. ==" @@ -773,14 +767,12 @@ run_summarize() { fi # Copy directory of final fasta files - abbreviated 19_palmp but copied to 19_pan_aug_leftover_merged_prot - for dir in 19_palmp; do - if [ -d "${WORK_DIR}"/19_palmp ]; then - echo "Copying directory $dir to output directory" - cp -r "${WORK_DIR}"/$dir "${full_out_dir}"/19_pan_aug_leftover_merged_prot/ - else - echo "Warning: couldn't find dir ${WORK_DIR}/$dir; skipping" - fi - done + if [ -d "${WORK_DIR}"/19_palmp ]; then + echo "Copying directory $dir to output directory" + cp -r "${WORK_DIR}"/$dir "${full_out_dir}"/19_pan_aug_leftover_merged_prot + else + echo "Warning: couldn't find dir ${WORK_DIR}/$dir; skipping" + fi # 21_hmm 22_hmmalign 23_hmmalign_trim2 24_trees are transferred with -s xfr_aligns_trees @@ -892,10 +884,9 @@ run_clean() { ######################################## # Main program -pandagma_conf_params='clust_iden clust_cov extra_iden mcl_inflation strict_synt -ks_low_cutoff ks_hi_cutoff ks_binsize ks_block_wgd_cutoff -max_pair_ks min_align_count min_annots_in_align -consen_prefix annot_str_regex preferred_annot expected_quotas' +pandagma_conf_params='clust_iden clust_cov extra_iden TE_match_iden mcl_inflation strict_synt +ks_low_cutoff ks_hi_cutoff ks_binsize ks_block_wgd_cutoff max_pair_ks consen_prefix +annot_str_regex min_align_count min_annots_in_align' # The steps align_cds, align_protein, model_and_trim, calc_trees, and xfr_aligns_trees may be run separately. # Those steps (functions) are in pandagma-common.sh From ecc31ffe0d7de1de9d4f62038a45d28a9ecd3bea Mon Sep 17 00:00:00 2001 From: Steven Cannon Date: Wed, 28 Feb 2024 13:19:26 -0800 Subject: [PATCH 10/18] Add two alignment parameters to config files; and minor changes to pandagma-fsup.sh to work with the pandagma driver script and pandagma-common.sh --- bin/pandagma-fsup.sh | 6 ++- config/Arachis3_5_2_0.conf | 2 + config/Cicer3_4_2_0.conf | 2 + config/Glycine5_21_30_6.conf | 2 + config/Glycine_7_3_2.conf | 2 + config/Medicago3_8_0_15.conf | 2 + config/Phaseolus3_7_1_0.conf | 2 + config/Vigna4_7_1_4.conf | 2 + config/Zea_7_2_0.conf | 2 + config/family3_22_3.conf | 2 +- config/family3_sup.conf | 33 ++++++++++++++-- config/family3_sup_chafa.conf | 71 +++++++++++++++++++++++++++++++++++ config/family3_sup_ur.conf | 20 ++++++++-- get_data/family3_sup.mk | 38 +++++++++++++++++++ get_data/family3_sup_ur.mk | 10 +++++ 15 files changed, 185 insertions(+), 11 deletions(-) create mode 100644 config/family3_sup_chafa.conf diff --git a/bin/pandagma-fsup.sh b/bin/pandagma-fsup.sh index f6901e3..3675943 100755 --- a/bin/pandagma-fsup.sh +++ b/bin/pandagma-fsup.sh @@ -63,8 +63,9 @@ Variables in pandagma config file (Set the config with the CONF environment vari min_annots_in_align - Minimum number of distinct annotation groups in an alignment to retain it [2] File sets (arrays): - protein_files + annotation_files cds_files + protein_files EOS ######################################## @@ -80,6 +81,8 @@ canonicalize_paths() { cd "${DATA_DIR}" || exit + export ANN_REX=${annot_str_regex} + mapfile -t cds_files < <(realpath --canonicalize-existing "${cds_files[@]}") mapfile -t protein_files < <(realpath --canonicalize-existing "${protein_files[@]}") @@ -436,7 +439,6 @@ pandagma_conf_params='consen_iden clust_cov consen_method annot_str_regex' export commandlist="ingest fam_consen search_families realign_and_trim calc_trees summarize" export dependencies='hmmscan famsa' -export ANN_REX=${annot_str_regex} declare out_dir version consen_iden clust_cov consen_method annot_str_regex diff --git a/config/Arachis3_5_2_0.conf b/config/Arachis3_5_2_0.conf index fb6efb0..959f223 100644 --- a/config/Arachis3_5_2_0.conf +++ b/config/Arachis3_5_2_0.conf @@ -11,6 +11,8 @@ consen_prefix='Arachis.pan2' annot_str_regex='([^.]+\.[^.]+\.[^.]+\.[^.]+)\..+' preferred_annot='Tifrunner.gnm2.ann2' order_method="reference" +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it ##### (required) list of GFF & FASTA file paths # Uncomment add file paths to the the annotation_files and fasta_files arrays. diff --git a/config/Cicer3_4_2_0.conf b/config/Cicer3_4_2_0.conf index 5486968..35de454 100644 --- a/config/Cicer3_4_2_0.conf +++ b/config/Cicer3_4_2_0.conf @@ -11,6 +11,8 @@ consen_prefix='Cicer.pan2' annot_str_regex='([^.]+\.[^.]+\.[^.]+\.[^.]+)\..+' preferred_annot='CDCFrontier.gnm3.ann1' order_method="reference" +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it ##### (required) list of GFF & FASTA file paths # Uncomment add file paths to the the annotation_files and cds_files arrays. diff --git a/config/Glycine5_21_30_6.conf b/config/Glycine5_21_30_6.conf index 022031b..06f2412 100644 --- a/config/Glycine5_21_30_6.conf +++ b/config/Glycine5_21_30_6.conf @@ -11,6 +11,8 @@ consen_prefix='Glycine.pan5' annot_str_regex='([^.]+\.[^.]+\.[^.]+\.[^.]+)\..+' preferred_annot='Wm82.gnm6.ann1' order_method="alignment" +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it ##### (required) list of GFF & FASTA file paths # Uncomment add file paths to the the annotation_files and fasta_files arrays. diff --git a/config/Glycine_7_3_2.conf b/config/Glycine_7_3_2.conf index 2c26083..3d06719 100644 --- a/config/Glycine_7_3_2.conf +++ b/config/Glycine_7_3_2.conf @@ -11,6 +11,8 @@ consen_prefix='Glycine.pan3' annot_str_regex='([^.]+\.[^.]+\.[^.]+\.[^.]+)\..+' preferred_annot='Wm82.gnm4.ann1' order_method="alignment" +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it ##### (required) list of GFF & FASTA file paths, in the "main" set, for all-by-all comparisons # Uncomment add file paths to the the annotation_files and cds_files arrays. diff --git a/config/Medicago3_8_0_15.conf b/config/Medicago3_8_0_15.conf index 6977320..cdcd72e 100644 --- a/config/Medicago3_8_0_15.conf +++ b/config/Medicago3_8_0_15.conf @@ -11,6 +11,8 @@ consen_prefix='Medicago.pan2' annot_str_regex='([^.]+\.[^.]+\.[^.]+\.[^.]+)\..+' preferred_annot="A17.gnm5.ann1_6" order_method="reference" +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it ##### (required) list of GFF & FASTA file paths # Uncomment add file paths to the the annotation_files and cds_files arrays. diff --git a/config/Phaseolus3_7_1_0.conf b/config/Phaseolus3_7_1_0.conf index 699e9ba..a2dbc41 100644 --- a/config/Phaseolus3_7_1_0.conf +++ b/config/Phaseolus3_7_1_0.conf @@ -11,6 +11,8 @@ consen_prefix='Phaseolus.pan2' annot_str_regex='([^.]+\.[^.]+\.[^.]+\.[^.]+)\..+' preferred_annot='G19833.gnm2.ann1' order_method="alignment" +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it ##### (required) list of GFF & FASTA file paths # Uncomment add file paths to the the annotation_files and cds_files arrays. diff --git a/config/Vigna4_7_1_4.conf b/config/Vigna4_7_1_4.conf index 4429ba4..3d1311d 100644 --- a/config/Vigna4_7_1_4.conf +++ b/config/Vigna4_7_1_4.conf @@ -11,6 +11,8 @@ consen_prefix='Vigna.pan3' annot_str_regex='([^.]+\.[^.]+\.[^.]+\.[^.]+)\..+' preferred_annot='IT97K-499-35.gnm1.ann2' order_method="alignment" +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it ##### (required) list of GFF & FASTA file paths # Uncomment add file paths to the the annotation_files and cds_files arrays. diff --git a/config/Zea_7_2_0.conf b/config/Zea_7_2_0.conf index 5b165b0..5af763c 100644 --- a/config/Zea_7_2_0.conf +++ b/config/Zea_7_2_0.conf @@ -11,6 +11,8 @@ consen_prefix='Zea.pan' annot_str_regex='(\D+\d+\D+)\d+.+' preferred_annot='Zm00001eb' order_method="reference" +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it ##### (required) list of GFF & FASTA file paths # Uncomment add file paths to the the annotation_files and cds_files arrays. diff --git a/config/family3_22_3.conf b/config/family3_22_3.conf index e52c3d6..8e20d73 100644 --- a/config/family3_22_3.conf +++ b/config/family3_22_3.conf @@ -43,7 +43,7 @@ expected_quotas=( ) ##### (required) list of BED & FASTA file paths -# Uncomment add file paths to the the annotation_files and cds_files arrays. +# add file paths to the the annotation_files, cds_files, and protein_files arrays. # The nth listed BED file corresponds to the nth listed FASTA file. annotation_files=( diff --git a/config/family3_sup.conf b/config/family3_sup.conf index c0170bd..65a8f19 100644 --- a/config/family3_sup.conf +++ b/config/family3_sup.conf @@ -6,10 +6,35 @@ annot_str_regex='([^.]+\.[^.]+)\..+' min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it -##### (required) list of FASTA file paths -# List CDS files in the "cds_files" array -# and protein files in the "protein_files" array. -# The files should be available at that location, in the "data_sup/" or other designated directory. +##### (required) list of BED & FASTA file paths +# add file paths to the the annotation_files, cds_files, and protein_files arrays. +# The nth listed BED file corresponds to the nth listed FASTA file. + +annotation_files=( + arath.Col0.gnm9.ann11.KH24.gene_models_main.bed.gz + arahy.Tifrunner.gnm2.ann2.PVFB.gene_models_main.bed.gz + bauva.BV-YZ2020.gnm2.ann1.RJ1G.gene_models_main.bed.gz + cajca.ICPL87119.gnm2.ann1.L3ZH.gene_models_main.bed.gz + cerca.ISC453364.gnm3.ann1.3N1M.gene_models_main.bed.gz + chafa.ISC494698.gnm1.ann1.G7XW.gene_models_main.bed.gz + cicar.CDCFrontier.gnm3.ann1.NPD7.gene_models_main.bed.gz + glyma.Wm82.gnm4.ann1.T8TQ.gene_models_main.bed.gz + lencu.CDC_Redberry.gnm2.ann1.5FB4.gene_models_main.bed.gz + lotja.MG20.gnm3.ann1.WF9B.gene_models_main.bed.gz + lupal.Amiga.gnm1.ann1.3GKS.gene_models_main.bed.gz + medtr.A17_HM341.gnm4.ann2.G3ZY.gene_models_main.bed.gz + phalu.G27455.gnm1.ann1.JD7C.gene_models_main.bed.gz + phavu.G19833.gnm2.ann1.PB8d.gene_models_main.bed.gz + pissa.Cameor.gnm1.ann1.7SZR.gene_models_main.bed.gz + quisa.S10.gnm1.ann1.RQ4J.gene_models_main.bed.gz + sento.Myeongyun.gnm1.ann1.5WXB.gene_models_main.bed.gz + singl.CAF01.gnm1.ann1.WFKC.gene_models_main.bed.gz + tripr.MilvusB.gnm2.ann1.DFgp.gene_models_main.bed.gz + vicfa.Hedin2.gnm1.ann1.PTNK.gene_models_main.bed.gz + vigan.Gyeongwon.gnm3.ann1.3Nz5.gene_models_main.bed.gz + vigra.VC1973A.gnm7.ann1.RWBG.gene_models_main.bed.gz + vigun.IT97K-499-35.gnm1.ann2.FD7K.gene_models_main.bed.gz +) cds_files=( arath.Col0.gnm9.ann11.KH24.cds_primary.fna.gz diff --git a/config/family3_sup_chafa.conf b/config/family3_sup_chafa.conf new file mode 100644 index 0000000..70063b9 --- /dev/null +++ b/config/family3_sup_chafa.conf @@ -0,0 +1,71 @@ +consen_iden='0.30' +clust_cov='0.40' +TE_match_iden='0.40' +consen_method='align_sample' # align_sample >> cons > hmmemit +annot_str_regex='([^.]+\.[^.]+)\..+' +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it + +##### (required) list of BED & FASTA file paths +# add file paths to the the annotation_files, cds_files, and protein_files arrays. +# The nth listed BED file corresponds to the nth listed FASTA file. + +annotation_files=( + acacr.Acra3RX.gnm1.ann1.6C0V.gene_models_main.bed.gz + aesev.CIAT22838.gnm1.ann1.ZM3R.gene_models_main.bed.gz + arath.Col0.gnm9.ann11.KH24.gene_models_main.bed.gz + bauva.BV-YZ2020.gnm2.ann1.RJ1G.gene_models_main.bed.gz + cerca.ISC453364.gnm3.ann1.3N1M.gene_models_main.bed.gz + chafa.ISC494698.gnm1.ann1.G7XW.gene_models_main.bed.gz + lotja.MG20.gnm3.ann1.WF9B.gene_models_main.bed.gz + medtr.A17_HM341.gnm4.ann2.G3ZY.gene_models_main.bed.gz + phach.longxuteng.gnm1.ann1.KGX9.gene_models_main.bed.gz + phavu.G19833.gnm2.ann1.PB8d.gene_models_main.bed.gz + prupe.Lovell.gnm2.ann1.S2ZZ.gene_models_main.bed.gz + quisa.S10.gnm1.ann1.RQ4J.gene_models_main.bed.gz + sento.Myeongyun.gnm1.ann1.5WXB.gene_models_main.bed.gz + singl.CAF01.gnm1.ann1.WFKC.gene_models_main.bed.gz + vitvi.PN40024.gnm2.ann1.V31M.gene_models_main.bed.gz +) + +cds_files=( + acacr.Acra3RX.gnm1.ann1.6C0V.cds_primary.fna.gz + aesev.CIAT22838.gnm1.ann1.ZM3R.cds_primary.fna.gz + arath.Col0.gnm9.ann11.KH24.cds_primary.fna.gz + bauva.BV-YZ2020.gnm2.ann1.RJ1G.cds_primary.fna.gz + cerca.ISC453364.gnm3.ann1.3N1M.cds_primary.fna.gz + chafa.ISC494698.gnm1.ann1.G7XW.cds_primary.fna.gz + lotja.MG20.gnm3.ann1.WF9B.cds_primary.fna.gz + medtr.A17_HM341.gnm4.ann2.G3ZY.cds_primary.fna.gz + phach.longxuteng.gnm1.ann1.KGX9.cds_primary.fna.gz + phavu.G19833.gnm2.ann1.PB8d.cds_primary.fna.gz + prupe.Lovell.gnm2.ann1.S2ZZ.cds_primary.fna.gz + quisa.S10.gnm1.ann1.RQ4J.cds_primary.fna.gz + sento.Myeongyun.gnm1.ann1.5WXB.cds.fna.gz + singl.CAF01.gnm1.ann1.WFKC.cds.fna.gz + vitvi.PN40024.gnm2.ann1.V31M.cds_primary.fna.gz +) + +protein_files=( + acacr.Acra3RX.gnm1.ann1.6C0V.protein_primary.faa.gz + aesev.CIAT22838.gnm1.ann1.ZM3R.protein_primary.faa.gz + arath.Col0.gnm9.ann11.KH24.protein_primary.faa.gz + bauva.BV-YZ2020.gnm2.ann1.RJ1G.protein_primary.faa.gz + cerca.ISC453364.gnm3.ann1.3N1M.protein_primary.faa.gz + chafa.ISC494698.gnm1.ann1.G7XW.protein_primary.faa.gz + medtr.A17_HM341.gnm4.ann2.G3ZY.protein_primary.faa.gz + lotja.MG20.gnm3.ann1.WF9B.protein_primary.faa.gz + phavu.G19833.gnm2.ann1.PB8d.protein_primary.faa.gz + phach.longxuteng.gnm1.ann1.KGX9.protein_primary.faa.gz + prupe.Lovell.gnm2.ann1.S2ZZ.protein_primary.faa.gz + quisa.S10.gnm1.ann1.RQ4J.protein_primary.faa.gz + sento.Myeongyun.gnm1.ann1.5WXB.protein.faa.gz + singl.CAF01.gnm1.ann1.WFKC.protein.faa.gz + vitvi.PN40024.gnm2.ann1.V31M.protein_primary.faa.gz +) + +### (optional) Fasta file of sequences to use exclude from annotations, e.g. transposable elements or plastid seuqences. +### If provided, sequences in this file will be used to exclude CDS sequences if any matches are >= TE_match_iden +exclude_TE_match_file=( + legume.TE_lib_2024.rpt.6WVT.fna.gz +) diff --git a/config/family3_sup_ur.conf b/config/family3_sup_ur.conf index 11fba3e..0802e6e 100644 --- a/config/family3_sup_ur.conf +++ b/config/family3_sup_ur.conf @@ -6,10 +6,22 @@ annot_str_regex='([^.]+\.[^.]+)\..+' min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it -##### (required) list of FASTA file paths -# List CDS files in the "cds_files" array -# and protein files in the "protein_files" array. -# The files should be available at that location, in the "data_sup/" or other designated directory. +##### (required) list of BED & FASTA file paths +# add file paths to the the annotation_files, cds_files, and protein_files arrays. +# The nth listed BED file corresponds to the nth listed FASTA file. + +annotation_files=( + arath.Col0.gnm9.ann11.KH24.gene_models_main.bed.gz + bauva.BV-YZ2020.gnm2.ann1.RJ1G.gene_models_main.bed.gz + cerca.ISC453364.gnm3.ann1.3N1M.gene_models_main.bed.gz + medtr.A17_HM341.gnm4.ann2.G3ZY.gene_models_main.bed.gz + phavu.G19833.gnm2.ann1.PB8d.gene_models_main.bed.gz + prupe.Lovell.gnm2.ann1.S2ZZ.gene_models_main.bed.gz + quisa.S10.gnm1.ann1.RQ4J.gene_models_main.bed.gz + sento.Myeongyun.gnm1.ann1.5WXB.gene_models_main.bed.gz + singl.CAF01.gnm1.ann1.WFKC.gene_models_main.bed.gz + vitvi.PN40024.gnm2.ann1.V31M.gene_models_main.bed.gz +) cds_files=( arath.Col0.gnm9.ann11.KH24.cds_primary.fna.gz diff --git a/get_data/family3_sup.mk b/get_data/family3_sup.mk index 3d8ca8f..7af6074 100644 --- a/get_data/family3_sup.mk +++ b/get_data/family3_sup.mk @@ -1,50 +1,88 @@ FILES ::= \ +acacr.Acra3RX.gnm1.ann1.6C0V.cds_primary.fna.gz \ +acacr.Acra3RX.gnm1.ann1.6C0V.gene_models_main.bed.gz \ +acacr.Acra3RX.gnm1.ann1.6C0V.protein_primary.faa.gz \ +aesev.CIAT22838.gnm1.ann1.ZM3R.cds_primary.fna.gz \ +aesev.CIAT22838.gnm1.ann1.ZM3R.gene_models_main.bed.gz \ +aesev.CIAT22838.gnm1.ann1.ZM3R.protein_primary.faa.gz \ arahy.Tifrunner.gnm2.ann2.PVFB.cds.fna.gz \ +arahy.Tifrunner.gnm2.ann2.PVFB.gene_models_main.bed.gz \ arahy.Tifrunner.gnm2.ann2.PVFB.protein.faa.gz \ arath.Col0.gnm9.ann11.KH24.cds_primary.fna.gz \ +arath.Col0.gnm9.ann11.KH24.gene_models_main.bed.gz \ arath.Col0.gnm9.ann11.KH24.protein_primary.faa.gz \ bauva.BV-YZ2020.gnm2.ann1.RJ1G.cds_primary.fna.gz \ +bauva.BV-YZ2020.gnm2.ann1.RJ1G.gene_models_main.bed.gz \ bauva.BV-YZ2020.gnm2.ann1.RJ1G.protein_primary.faa.gz \ cajca.ICPL87119.gnm2.ann1.L3ZH.cds_primary.fna.gz \ +cajca.ICPL87119.gnm2.ann1.L3ZH.gene_models_main.bed.gz \ cajca.ICPL87119.gnm2.ann1.L3ZH.protein_primary.faa.gz \ cerca.ISC453364.gnm3.ann1.3N1M.cds_primary.fna.gz \ +cerca.ISC453364.gnm3.ann1.3N1M.gene_models_main.bed.gz \ cerca.ISC453364.gnm3.ann1.3N1M.protein_primary.faa.gz \ chafa.ISC494698.gnm1.ann1.G7XW.cds_primary.fna.gz \ +chafa.ISC494698.gnm1.ann1.G7XW.gene_models_main.bed.gz \ chafa.ISC494698.gnm1.ann1.G7XW.protein_primary.faa.gz \ cicar.CDCFrontier.gnm3.ann1.NPD7.cds.fna.gz \ +cicar.CDCFrontier.gnm3.ann1.NPD7.gene_models_main.bed.gz \ cicar.CDCFrontier.gnm3.ann1.NPD7.protein.faa.gz \ glyma.Wm82.gnm4.ann1.T8TQ.cds_primary.fna.gz \ +glyma.Wm82.gnm4.ann1.T8TQ.gene_models_main.bed.gz \ glyma.Wm82.gnm4.ann1.T8TQ.protein_primary.faa.gz \ lencu.CDC_Redberry.gnm2.ann1.5FB4.cds.fna.gz \ +lencu.CDC_Redberry.gnm2.ann1.5FB4.gene_models_main.bed.gz \ lencu.CDC_Redberry.gnm2.ann1.5FB4.protein.faa.gz \ lotja.MG20.gnm3.ann1.WF9B.cds_primary.fna.gz \ +lotja.MG20.gnm3.ann1.WF9B.gene_models_main.bed.gz \ lotja.MG20.gnm3.ann1.WF9B.protein_primary.faa.gz \ lupal.Amiga.gnm1.ann1.3GKS.cds.fna.gz \ +lupal.Amiga.gnm1.ann1.3GKS.gene_models_main.bed.gz \ lupal.Amiga.gnm1.ann1.3GKS.protein.faa.gz \ medtr.A17_HM341.gnm4.ann2.G3ZY.cds_primary.fna.gz \ +medtr.A17_HM341.gnm4.ann2.G3ZY.gene_models_main.bed.gz \ medtr.A17_HM341.gnm4.ann2.G3ZY.protein_primary.faa.gz \ +phach.longxuteng.gnm1.ann1.KGX9.cds_primary.fna.gz \ +phach.longxuteng.gnm1.ann1.KGX9.gene_models_main.bed.gz \ +phach.longxuteng.gnm1.ann1.KGX9.protein_primary.faa.gz \ phalu.G27455.gnm1.ann1.JD7C.cds_primary.fna.gz \ +phalu.G27455.gnm1.ann1.JD7C.gene_models_main.bed.gz \ phalu.G27455.gnm1.ann1.JD7C.protein_primary.faa.gz \ phavu.G19833.gnm2.ann1.PB8d.cds_primary.fna.gz \ +phavu.G19833.gnm2.ann1.PB8d.gene_models_main.bed.gz \ phavu.G19833.gnm2.ann1.PB8d.protein_primary.faa.gz \ pissa.Cameor.gnm1.ann1.7SZR.cds_primary.fna.gz \ +pissa.Cameor.gnm1.ann1.7SZR.gene_models_main.bed.gz \ pissa.Cameor.gnm1.ann1.7SZR.protein_primary.faa.gz \ +prupe.Lovell.gnm2.ann1.S2ZZ.cds_primary.fna.gz \ +prupe.Lovell.gnm2.ann1.S2ZZ.gene_models_main.bed.gz \ +prupe.Lovell.gnm2.ann1.S2ZZ.protein_primary.faa.gz \ quisa.S10.gnm1.ann1.RQ4J.cds_primary.fna.gz \ +quisa.S10.gnm1.ann1.RQ4J.gene_models_main.bed.gz \ quisa.S10.gnm1.ann1.RQ4J.protein_primary.faa.gz \ sento.Myeongyun.gnm1.ann1.5WXB.cds.fna.gz \ +sento.Myeongyun.gnm1.ann1.5WXB.gene_models_main.bed.gz \ sento.Myeongyun.gnm1.ann1.5WXB.protein.faa.gz \ singl.CAF01.gnm1.ann1.WFKC.cds.fna.gz \ +singl.CAF01.gnm1.ann1.WFKC.gene_models_main.bed.gz \ singl.CAF01.gnm1.ann1.WFKC.protein.faa.gz \ tripr.MilvusB.gnm2.ann1.DFgp.cds_primary.fna.gz \ +tripr.MilvusB.gnm2.ann1.DFgp.gene_models_main.bed.gz \ tripr.MilvusB.gnm2.ann1.DFgp.protein_primary.faa.gz \ vicfa.Hedin2.gnm1.ann1.PTNK.cds_primary.fna.gz \ +vicfa.Hedin2.gnm1.ann1.PTNK.gene_models_main.bed.gz \ vicfa.Hedin2.gnm1.ann1.PTNK.protein_primary.faa.gz \ vigan.Gyeongwon.gnm3.ann1.3Nz5.cds_primary.fna.gz \ +vigan.Gyeongwon.gnm3.ann1.3Nz5.gene_models_main.bed.gz \ vigan.Gyeongwon.gnm3.ann1.3Nz5.protein_primary.faa.gz \ vigra.VC1973A.gnm7.ann1.RWBG.cds.fna.gz \ +vigra.VC1973A.gnm7.ann1.RWBG.gene_models_main.bed.gz \ vigra.VC1973A.gnm7.ann1.RWBG.protein.faa.gz \ vigun.IT97K-499-35.gnm1.ann2.FD7K.cds_primary.fna.gz \ +vigun.IT97K-499-35.gnm1.ann2.FD7K.gene_models_main.bed.gz \ vigun.IT97K-499-35.gnm1.ann2.FD7K.protein_primary.faa.gz \ +vitvi.PN40024.gnm2.ann1.V31M.cds_primary.fna.gz \ +vitvi.PN40024.gnm2.ann1.V31M.gene_models_main.bed.gz \ +vitvi.PN40024.gnm2.ann1.V31M.protein_primary.faa.gz \ legume.TE_lib_2024.rpt.6WVT.fna.gz include $(dir $(realpath $(lastword $(MAKEFILE_LIST))))/common.mk diff --git a/get_data/family3_sup_ur.mk b/get_data/family3_sup_ur.mk index 9b0449d..4ecfa85 100644 --- a/get_data/family3_sup_ur.mk +++ b/get_data/family3_sup_ur.mk @@ -1,22 +1,32 @@ FILES ::= \ +arath.Col0.gnm9.ann11.KH24.gene_models_main.bed.gz \ arath.Col0.gnm9.ann11.KH24.cds_primary.fna.gz \ arath.Col0.gnm9.ann11.KH24.protein_primary.faa.gz \ +bauva.BV-YZ2020.gnm2.ann1.RJ1G.gene_models_main.bed.gz \ bauva.BV-YZ2020.gnm2.ann1.RJ1G.cds_primary.fna.gz \ bauva.BV-YZ2020.gnm2.ann1.RJ1G.protein_primary.faa.gz \ +cerca.ISC453364.gnm3.ann1.3N1M.gene_models_main.bed.gz \ cerca.ISC453364.gnm3.ann1.3N1M.cds_primary.fna.gz \ cerca.ISC453364.gnm3.ann1.3N1M.protein_primary.faa.gz \ +medtr.A17_HM341.gnm4.ann2.G3ZY.gene_models_main.bed.gz \ medtr.A17_HM341.gnm4.ann2.G3ZY.cds_primary.fna.gz \ medtr.A17_HM341.gnm4.ann2.G3ZY.protein_primary.faa.gz \ +phavu.G19833.gnm2.ann1.PB8d.gene_models_main.bed.gz \ phavu.G19833.gnm2.ann1.PB8d.cds_primary.fna.gz \ phavu.G19833.gnm2.ann1.PB8d.protein_primary.faa.gz \ +prupe.Lovell.gnm2.ann1.S2ZZ.gene_models_main.bed.gz \ prupe.Lovell.gnm2.ann1.S2ZZ.cds_primary.fna.gz \ prupe.Lovell.gnm2.ann1.S2ZZ.protein_primary.faa.gz \ +quisa.S10.gnm1.ann1.RQ4J.gene_models_main.bed.gz \ quisa.S10.gnm1.ann1.RQ4J.cds_primary.fna.gz \ quisa.S10.gnm1.ann1.RQ4J.protein_primary.faa.gz \ +sento.Myeongyun.gnm1.ann1.5WXB.gene_models_main.bed.gz \ sento.Myeongyun.gnm1.ann1.5WXB.cds.fna.gz \ sento.Myeongyun.gnm1.ann1.5WXB.protein.faa.gz \ +singl.CAF01.gnm1.ann1.WFKC.gene_models_main.bed.gz \ singl.CAF01.gnm1.ann1.WFKC.cds.fna.gz \ singl.CAF01.gnm1.ann1.WFKC.protein.faa.gz \ +vitvi.PN40024.gnm2.ann1.V31M.gene_models_main.bed.gz \ vitvi.PN40024.gnm2.ann1.V31M.cds_primary.fna.gz \ vitvi.PN40024.gnm2.ann1.V31M.protein_primary.faa.gz \ legume.TE_lib_2024.rpt.6WVT.fna.gz From 17df08ddcd285b3d402bbfd8700e4c58adb0ef7b Mon Sep 17 00:00:00 2001 From: Steven Cannon Date: Thu, 29 Feb 2024 06:12:44 -0800 Subject: [PATCH 11/18] Changes in step xfr_aligns_trees to handle output from fsup --- bin/pandagma-common.sh | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/bin/pandagma-common.sh b/bin/pandagma-common.sh index cc65c28..dd37eac 100755 --- a/bin/pandagma-common.sh +++ b/bin/pandagma-common.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -version="2023-02-23" +version="2023-02-29" set -o errexit -o errtrace -o nounset -o pipefail -o posix trap 'echo ${0##*/}:${LINENO} ERROR executing command: ${BASH_COMMAND}' ERR @@ -52,7 +52,6 @@ calc_seq_stats () { }' } - ########## run_align_cds() { cd "${WORK_DIR}" || exit @@ -223,7 +222,6 @@ run_xfr_aligns_trees() { echo; echo "Copy alignment and tree results into output directory" full_out_dir="${out_dir}" - cd "${submit_dir}" || exit if [ ! -d "$full_out_dir" ]; then @@ -231,14 +229,31 @@ run_xfr_aligns_trees() { mkdir -p "$full_out_dir" fi - for dir in 20_align* 21_hmm 22_hmmalign 23_hmmalign_trim2 24_trees; do - if [ -d "${WORK_DIR}"/$dir ]; then - echo "Copying directory $dir to output directory" - cp -r "${WORK_DIR}"/$dir "${full_out_dir}"/ - else - echo "Warning: couldn't find dir ${WORK_DIR}/$dir; skipping" - fi - done + if [[ "$scriptname" =~ "pandagma pan" ]] || [[ "$scriptname" =~ "pandagma fam" ]]; then + for dir in 20_align* 21_hmm 22_hmmalign 23_hmmalign_trim2 24_trees; do + if [ -d "${WORK_DIR}/$dir" ]; then + echo "Copying directory $dir to output directory" + cp -r "${WORK_DIR}/$dir" "${full_out_dir}"/ + else + echo "Warning: couldn't find dir ${WORK_DIR}/$dir; skipping" + fi + done + elif [[ "$scriptname" =~ "pandagma fsup" ]]; then + for dir in 35_sup_in_fams_prot 42_hmmalign 43_hmmalign_trim2 44_trees; do + if [ -d "${WORK_DIR}/$dir" ]; then + echo "Copying directory $dir to output directory" + cp -r "${WORK_DIR}/$dir" "${full_out_dir}"/ + else + echo "Warning: couldn't find dir ${WORK_DIR}/$dir; skipping" + fi + done + else + echo "Exiting, as $scriptname is not one of pan, fam, or fsup" + exit 1 + fi + + # Remove zero-length files in output directory + find "${full_out_dir}" -size 0 -print -delete } main_pan_fam() { From 438b78fd8c9c4dd1dfacfe4f07949df392a19c6f Mon Sep 17 00:00:00 2001 From: Steven Cannon Date: Thu, 29 Feb 2024 06:15:18 -0800 Subject: [PATCH 12/18] Remove optional steps align_cds, align_protein, model_and_trim, calc_trees, and xfr_aligns_trees from commandlist in -pan and -fam workflows --- bin/pandagma-fam.sh | 5 ++--- bin/pandagma-pan.sh | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/bin/pandagma-fam.sh b/bin/pandagma-fam.sh index d61a841..7aac56c 100755 --- a/bin/pandagma-fam.sh +++ b/bin/pandagma-fam.sh @@ -888,11 +888,10 @@ pandagma_conf_params='clust_iden clust_cov extra_iden TE_match_iden mcl_inflatio ks_low_cutoff ks_hi_cutoff ks_binsize ks_block_wgd_cutoff max_pair_ks consen_prefix annot_str_regex min_align_count min_annots_in_align' -# The steps align_cds, align_protein, model_and_trim, calc_trees, and xfr_aligns_trees may be run separately. +# The steps align_protein, model_and_trim, calc_trees, and xfr_aligns_trees may be run separately. # Those steps (functions) are in pandagma-common.sh export commandlist="ingest mmseqs filter dagchainer ks_calc ks_filter \ - mcl consense cluster_rest add_extra tabularize \ - align_protein model_and_trim calc_trees xfr_aligns_trees summarize" + mcl consense cluster_rest add_extra tabularize summarize" export dependencies='dagchainer mcl cons famsa run_DAG_chainer.pl' diff --git a/bin/pandagma-pan.sh b/bin/pandagma-pan.sh index 6d1c37b..c3953eb 100755 --- a/bin/pandagma-pan.sh +++ b/bin/pandagma-pan.sh @@ -1172,7 +1172,7 @@ pandagma_conf_params='clust_iden clust_cov extra_iden mcl_inflation # Those steps (functions) are in pandagma-common.sh export commandlist="ingest mmseqs filter dagchainer mcl consense cluster_rest add_extra \ pick_exemplars filter_to_pctile tabularize order_and_name \ - align_cds align_protein model_and_trim calc_trees xfr_aligns_trees calc_chr_pairs summarize" + calc_chr_pairs summarize" export dependencies='mmseqs dagchainer mcl cons famsa hmmalign hmmbuild run_DAG_chainer.pl' From 4437eb4a6bbab0f35d3b6f9e447a5b1a14ac3d7b Mon Sep 17 00:00:00 2001 From: Steven Cannon Date: Thu, 29 Feb 2024 06:18:01 -0800 Subject: [PATCH 13/18] Bug fix & documentation tweaks in pandagma-fsup.sh --- bin/pandagma-fsup.sh | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/bin/pandagma-fsup.sh b/bin/pandagma-fsup.sh index 3675943..b4b61fd 100755 --- a/bin/pandagma-fsup.sh +++ b/bin/pandagma-fsup.sh @@ -41,10 +41,10 @@ Subcommands (in order they are usually run): all - All of the steps below (Or equivalently: omit the -s flag; \"all\" is default). ingest - Prepare the assembly and annotation files for analysis. - fam_consen - - search_families - - realign_and_trim - - calc_trees - + fam_consen - Generate a consensus sequence for each family. + search_families - Search provided annotation sets (protein) against family consensus sequences. + realign_and_trim - Build HMMs and trim the alignments, preparatory to calculating trees. + calc_trees - Calculate gene trees. summarize - Move results into output directory, and report summary statistics. EOS @@ -280,7 +280,6 @@ run_tabularize() { rm tmp.table_header } - ########## run_realign_and_trim() { echo; echo "== Retrieve sequences for each family, preparatory to aligning them ==" @@ -294,7 +293,7 @@ run_realign_and_trim() { mkdir -p 42_hmmalign for filepath in 35_sup_in_fams_prot/*; do file=$(basename "$filepath"); - hmmalign --trim --outformat A2M --amino -o 42_hmmalign/"$file" "{fam_dir}/21_hmm/$file" 35_sup_in_fams_prot/"$file" & + hmmalign --trim --outformat A2M --amino -o 42_hmmalign/"$file" "${fam_dir}/21_hmm/$file" 35_sup_in_fams_prot/"$file" & if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi done wait @@ -435,8 +434,10 @@ run_summarize() { pandagma_conf_params='consen_iden clust_cov consen_method annot_str_regex' -# Run all specified steps -export commandlist="ingest fam_consen search_families realign_and_trim calc_trees summarize" +# Run all specified steps. +# The steps calc_trees and xfr_aligns_trees may be run separately. +# Those steps (functions) are in pandagma-common.sh +export commandlist="ingest fam_consen search_families realign_and_trim calc_trees xfr_aligns_trees summarize" export dependencies='hmmscan famsa' From 215ff4c89fe944112a4378874d835049bfa29b2f Mon Sep 17 00:00:00 2001 From: Steven Cannon Date: Thu, 29 Feb 2024 08:32:36 -0800 Subject: [PATCH 14/18] Minor change to handling of directory/file transfer in pandagma-fsup.sh --- bin/pandagma-fsup.sh | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/bin/pandagma-fsup.sh b/bin/pandagma-fsup.sh index b4b61fd..c41f68d 100755 --- a/bin/pandagma-fsup.sh +++ b/bin/pandagma-fsup.sh @@ -350,14 +350,7 @@ run_summarize() { fi done - for dir in 35_sup_in_fams_prot 43_hmmalign_trim2 44_trees; do - if [ -d "${WORK_DIR}"/$dir ]; then - echo "Copying directory $dir to output directory" - cp -r "${WORK_DIR}"/$dir "${full_out_dir}"/ - else - echo "Warning: couldn't find dir ${WORK_DIR}/$dir; skipping" - fi - done + # Directories 35_sup_in_fams_prot 43_hmmalign_trim2 44_trees are transferred in xfr_aligns_trees (in pandagma-common.sh) end_time=$(date) @@ -435,8 +428,7 @@ run_summarize() { pandagma_conf_params='consen_iden clust_cov consen_method annot_str_regex' # Run all specified steps. -# The steps calc_trees and xfr_aligns_trees may be run separately. -# Those steps (functions) are in pandagma-common.sh +# Those steps. Steps realign_and_trim calc_trees xfr_aligns_trees are in pandagma-common.sh export commandlist="ingest fam_consen search_families realign_and_trim calc_trees xfr_aligns_trees summarize" export dependencies='hmmscan famsa' From 8797872b4867779d709f865a131d22c326640229 Mon Sep 17 00:00:00 2001 From: Steven Cannon Date: Thu, 29 Feb 2024 08:37:48 -0800 Subject: [PATCH 15/18] Minor change in stats reporting --- bin/pandagma-fsup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/pandagma-fsup.sh b/bin/pandagma-fsup.sh index c41f68d..f789fc6 100755 --- a/bin/pandagma-fsup.sh +++ b/bin/pandagma-fsup.sh @@ -124,7 +124,7 @@ run_ingest() { # calc basic sequence stats annot_name=$(basename 02_fasta_prot/"$file_base" | perl -pe '$ann_rex=qr($ENV{"ANN_REX"}); s/$ann_rex/$1/' ) echo " CHECK: report annot_name to stats file? [$annot_name]" - printf " Added with hmmsearch: " >> stats/tmp.fasta_seqstats_sup + printf " Added with $consen_method: " >> stats/tmp.fasta_seqstats_sup cat_or_zcat "${protein_files[file_num]}" | calc_seq_stats >> stats/tmp.fasta_seqstats_sup done From fdf5414ac98ec134df80aa15b58d24053aeb9629 Mon Sep 17 00:00:00 2001 From: Steven Cannon Date: Thu, 29 Feb 2024 11:50:39 -0800 Subject: [PATCH 16/18] Restructure ks_filter step so that ks_block_wgd_cutoff and max_pair_ks are used if stats/ks_histplots.tsv isn't provided --- README.md | 15 ++-- bin/pandagma-fam.sh | 200 +++++++++++++++++++++++++++++--------------- 2 files changed, 139 insertions(+), 76 deletions(-) diff --git a/README.md b/README.md index 25aaf58..788b6ea 100644 --- a/README.md +++ b/README.md @@ -185,7 +185,8 @@ Subcommands for the **pangene** workflow, `pandagma pan`, in order they are usua Subcommands for the **gene family** workflow, `pandagma fam`, in order they are usually run: ``` - Run these first (if using ks_calc) + Run these first (if using the ks_peaks.tsv file; otherwise, run all main steps and + ks filtering will be done using parameters ks_block_wgd_cutoff and max_pair_ks) all - All of the steps below, except for ks_filter and clean (Or equivalently: omit the -s flag; \"all\" is default). ingest - Prepare the assembly and annotation files for analysis. @@ -195,7 +196,7 @@ Subcommands for the **gene family** workflow, `pandagma fam`, in order they are ks_calc - Calculation of Ks values on gene pairs from DAGchainer output. Evaluate the stats/ks_histplots.tsv and stats/ks_peaks_auto.tsv files and - put ks_peaks.tsv into the work directory, then run the following commands: + put ks_peaks.tsv into the \${WORK_DIR}/stats directory, then run the following commands: ks_filter - Filtering based on provided ks_peaks.tsv file (assumes prior ks_calc step) mcl - Derive clusters, with Markov clustering. consense - Calculate a consensus sequences from each pan-gene set, @@ -236,8 +237,6 @@ Variables in the config file for the **pangene workflow**, `pandagma pan`: This is used for picking representative IDs+sequence from an orthogroup, when this annotation is among those with the median length for the orthogroup. Otherwise, one is selected at random from those with median length. - min_align_count - Minimum number of sequences in a family to trigger alignments, modeling, and trees [4] -min_annots_in_align - Minimum number of distinct annotation groups in an alignment to retain it [2] ``` Variables in the config file for the **family workflow**, `pandagma fam`: @@ -264,8 +263,6 @@ ks_block_wgd_cutoff - Fallback, if a ks_peaks.tsv file is not provided. [1.75] This is used for picking representative IDs+sequence from an orthogroup, when this annotation is among those with the median length for the orthogroup. Otherwise, one is selected at random from those with median length. - min_align_count - Minimum number of sequences in a family to trigger alignments, modeling, and trees [4] -min_annots_in_align - Minimum number of distinct annotation groups in an alignment to retain it [2] ``` ## Example run for the pangene workflow @@ -373,11 +370,11 @@ min_annots_in_align - Minimum number of distinct annotation groups in an alignme remaining steps (see **8** below). An intermediate output file, `stats/ks_peaks_auto.tsv`, is written to the work directory - This should be examined for biological plausibility (look at Ks peak values in column 3), - along with the other Ks results (histograms) in the work_pandagma/stats subdirectory. + This should be examined for biological plausibility, along with the other + Ks results (histograms) in the work_pandagma/stats subdirectory. The `ks_peaks_auto.tsv` file can be examined and used to create a file named `ks_peaks.tsv` with changes relative to `ks_peaks_auto.tsv` if necessary to reflect known or suspected WGD histories. - If stats/ks_histplots.tsv is not provided, then Ks filtering will be done using values provided + If stats/ks_peaks.tsv is not provided, then Ks filtering will be done using values provided in the config file for ks_block_wgd_cutoff and max_pair_ks. 4. Run steps `ks_filter` through `summarize`. diff --git a/bin/pandagma-fam.sh b/bin/pandagma-fam.sh index 7aac56c..3a18eed 100755 --- a/bin/pandagma-fam.sh +++ b/bin/pandagma-fam.sh @@ -39,7 +39,8 @@ Primary protein sequences and annotation (GFF3 or BED) files must be listed in t config file, in the arrays annotation_files and protein_files. See example files. Subcommands (in order they are usually run): - Run these first (if using ks_calc) + Run these first (if using the ks_peaks.tsv file; otherwise, run all main steps and + ks filtering will be done using parameters ks_block_wgd_cutoff and max_pair_ks) all - All of the steps below, except for ks_filter, clean (Or equivalently: omit the -s flag; \"all\" is default). ingest - Prepare the assembly and annotation files for analysis. @@ -49,7 +50,7 @@ Subcommands (in order they are usually run): ks_calc - Calculation of Ks values on gene pairs from DAGchainer output. Evaluate the stats/ks_histplots.tsv and stats/ks_peaks_auto.tsv files and - put ks_peaks.tsv into the original data directory, then run the following commands: + put ks_peaks.tsv into the \${WORK_DIR}/stats directory, then run the following commands: ks_filter - Filtering based on provided ks_peaks.tsv file (assumes prior ks_calc step) mcl - Derive clusters, with Markov clustering. consense - Calculate a consensus sequences from each pan-gene set, @@ -102,7 +103,6 @@ Variables in pandagma config file (Set the config with the CONF environment vari clust_iden - Minimum identity threshold for mmseqs clustering [0.40] clust_cov - Minimum coverage for mmseqs clustering [0.40] extra_iden - Minimum identity threshold for mmseqs addition of \"extra\" annotations [0.30] - TE_match_iden - Minimum identity threshold for excluding match to transposable element file - for "TESearch" mcl_inflation - Inflation parameter, for Markov clustering [1.6] strict_synt - For clustering of the \"main\" annotations, use only syntenic pairs [1] The alternative (0) is to use all homologous pairs that satisfy expected_quotas @@ -111,14 +111,16 @@ Variables in pandagma config file (Set the config with the CONF environment vari ks_binsize - For calculating and displaying histograms. [0.05] ks_block_wgd_cutoff - Fallback, if a ks_peaks.tsv file is not provided. [1.75] max_pair_ks - Fallback value for excluding gene pairs, if a ks_peaks.tsv file is not provided. [4.0] + consen_prefix - Prefix to use in orthogroup names annot_str_regex - Regular expression for capturing annotation name from gene ID, e.g. \"([^.]+\.[^.]+)\..+\" for two dot-separated fields, e.g. vigan.Shumari or \"(\D+\d+\D+)\d+.+\" for Zea assembly+annot string, e.g. Zm00032ab - min_align_count - Minimum number of sequences to trigger alignments, modeling, and trees [4] -min_annots_in_align - Minimum number of distinct annotation groups in an alignment to retain it [2] - + preferred_annot - String to match and select an annotation set, from a gene ID. + This is used for picking representative IDs+sequence from an orthogroup, when + this annotation is among those with the median length for the orthogroup. + Otherwise, one is selected at random from those with median length. expected_quotas - (Optional) array of seqid prefixes & expected number of paralogs for the species identified by the prefix; e.g.: expected_quotas=(glyma 4 medtr 2) @@ -444,7 +446,7 @@ run_ks_calc() { echo "# $filebase" echo "# Normalized relative to Ks peak inferred at bin $ks_bin, with amplitude $ks_amplitude_pct" awk 'NF==7 && $7<=3 {print $7}' "$ks_path" | histogram.pl -z -n -s "$ks_binsize" | - histplot.pl -d "$ks_amplitude_pct" | cut -b1-150 + histplot.pl -d "$ks_amplitude_pct" echo "" } >> stats/ks_histplots.tsv done @@ -452,55 +454,44 @@ run_ks_calc() { ########## run_ks_filter() { - echo; echo "From optional provided ks_peaks.tsv file and from processed DAGChainer output, (with gene-pair and " - echo "block-median Ks values added by calc_ks_from_dag.pl), filter on block-median Ks values." + echo; + echo "From optional provided ${WORK_DIR}/stats/ks_peaks.tsv file and from processed DAGChainer output,i " + echo "(with gene-pair and block-median Ks values added by calc_ks_from_dag.pl), filter on " + echo "block-median Ks values, and combine the homology data into a file with gene pairs to be clustered." + echo; cd "${WORK_DIR}" || exit - if [ -d 05_kaksout_ks_filtered ]; then rm -rf 05_kaksout_ks_filtered ; fi - mkdir -p 05_kaksout_ks_filtered - if [[ -f stats/ks_peaks.tsv ]]; then # filter based on list of Ks values - echo "Filtering on quotas from expected_quotas and ks_pair_cutoff values provided in stats/ks_peaks.tsv" + if [ -f stats/ks_peaks.tsv ]; then # filter based on list of Ks values + echo "Filtering on quotas from expected_quotas and ks_pair_cutoff values provided in ks_peaks.tsv" + if [ -d 05_kaksout_ks_filtered ]; then rm -rf 05_kaksout_ks_filtered ; fi + mkdir -p 05_kaksout_ks_filtered ks_peaks=stats/ks_peaks.tsv for ks_path in 05_kaksout/*.rptout; do outfilebase=$(basename "$ks_path" .rptout) echo " Filtering $ks_path based on expected block-median Ks values" < "${ks_path}" filter_mmseqs_by_ks.pl \ - -ks_peaks ${ks_peaks} -annot_regex "$annot_str_regex" -max_pair_ks "$max_pair_ks" | + -ks_peak "${ks_peaks}" -annot_regex "$annot_str_regex" -max_pair_ks "$max_pair_ks" | awk 'NF==7' > 05_kaksout_ks_filtered/"${outfilebase}".rptout done - else # don't filter, since ks_peaks.tsv file isn't provided - echo "No ks_peaks.tsv file was provided. It is recommended to review the provisional stats/ks_peaks_auto.tsv" + cat 05_kaksout_ks_filtered/*.rptout | awk 'NF==7 {print $1 "\t" $2}' | sort -u > 05_filtered_pairs.tsv + else # ks_peaks.tsv file isn't provided. Warn but proceed with a single Ks value: ks_hi_cutoff + echo "INFO: No ks_peaks.tsv file was provided. It is recommended to review the provisional stats/ks_peaks_auto.tsv" echo "and stats/ks_histplots.tsv files and provide a file stats/ks_peaks.tsv -- either simply copying" echo "ks_peaks_auto.tsv to ks_peaks.tsv if the reported Ks peak values (column 3) are acceptable," echo "or revising those values based on interpretation of stats/ks_histplots.tsv." echo "If stats/ks_histplots.tsv is not provided, then Ks filtering will be done using values provided" - echo "in the config file for ks_block_wgd_cutoff and max_pair_ks." - fi -} - -########## -run_mcl() { - # Calculate clusters using Markov clustering - cd "${WORK_DIR}" || exit - mkdir -p lists + echo "in the config file for ks_block_wgd_cutoff and max_pair_ks."; echo - printf "\nPreparatory to clustering, combine the homology data into a file with gene pairs to be clustered.\n" - - files=$(shopt -s nullglob dotglob; echo 05_kaksout_ks_filtered/*) - if [ -d 05_kaksout_ks_filtered ] && (( ${#files} )); then # From step ks_filter; supersedes all other conditions - cat 05_kaksout_ks_filtered/*.rptout | awk 'NF==7 {print $1 "\t" $2}' | sort -u > 05_filtered_pairs.tsv - else if [ "$strict_synt" -eq 1 ]; then # https://stackoverflow.com/questions/3601515/how-to-check-if-a-variable-is-set-in-bash - # https://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_06_02 if [ -z ${ks_block_wgd_cutoff+x} ] || [ -z ${max_pair_ks+x} ] || [ ! -d "${WORK_DIR}/05_kaksout" ] && [ ! "$(ls -A 05_kaksout/*.rptout)" ]; then - echo "## One or both of ks_block_wgd_cutoff and max_pair_ks are unset or 05_kaksout doesn't exist; no Ks filtering will be done."; - echo "## Combine the DAGChainer synteny pairs into a file to be clustered." + echo "Variables ks_block_wgd_cutoff and/or max_pair_ks are unset or 05_kaksout doesn't exist; no Ks filtering will be done."; + echo "Combine the DAGChainer synteny pairs into a file to be clustered." cat 04_dag/*.aligncoords | awk '$1!~/^#/ {print $2 "\t" $6}' | awk 'NF==2' | sort -u > 05_filtered_pairs.tsv else - echo "## The ks_block_wgd_cutoff is set to '$ks_block_wgd_cutoff' and max_pair_ks is set to '$max_pair_ks'"; - echo "## Combine the DAGChainer synteny pairs, with additional filtering by pairwise and block Ks thresholds, into a file to be clustered." + echo "The ks_block_wgd_cutoff is set to '$ks_block_wgd_cutoff' and max_pair_ks is set to '$max_pair_ks'"; + echo "Combine DAGChainer synteny pairs, filtering by pairwise and block Ks thresholds, into a file to be clustered." # Combine the synteny pairs into a file to be clustered cat 05_kaksout/*.rptout | awk -v PAIR_CUTOFF="$max_pair_ks" -v BLOCK_CUTOFF="$ks_block_wgd_cutoff" ' @@ -513,6 +504,12 @@ run_mcl() { awk 'NF==2' | sort -u > 05_filtered_pairs.tsv fi fi +} + +########## +run_mcl() { + # Calculate clusters using Markov clustering + cd "${WORK_DIR}" || exit printf "\nDo Markov clustering with inflation parameter %f and %d threads\n" "$mcl_inflation" "$NPROC" echo "MCL COMMAND: mcl 05_filtered_pairs.tsv -I $mcl_inflation -te ${NPROC} --abc -o tmp.syn_pan.clust.tsv" @@ -525,6 +522,7 @@ run_mcl() { rm tmp.syn_pan.clust.tsv echo " Move singleton and doubleton clusters into a list of leftovers" + mkdir -p lists awk 'NF==2 {print $2} NF==3 {print $2; print $3}' 06_syn_pan.clust.tsv > lists/lis.06_syn_pan_1s_2s awk 'NF>3 {print $0}' 06_syn_pan.clust.tsv > 06_syn_pan_ge3.clust.tsv @@ -690,21 +688,19 @@ run_add_extra() { perl -lane 'for $i (1..scalar(@F)-1){print $F[0], "\t", $F[$i]}' 18_syn_pan_aug_extra.clust.tsv \ > 18_syn_pan_aug_extra.hsh.tsv - echo " For each family set, retrieve sequences into a multifasta file 19_pan_aug_leftover_merged_prot (abbreviated 19_palmp)" + echo " For each family set, retrieve sequences into a multifasta file." printf " Fasta file: %s %s\n" "${protein_files[@]}" "${protein_files_extra[@]}" - if [ -d 19_palmp ]; then rm -rf 19_palmp ; fi - mkdir -p 19_palmp + if [ -d 19_pan_aug_leftover_merged_prot ]; then rm -rf 19_pan_aug_leftover_merged_prot ; fi + mkdir -p 19_pan_aug_leftover_merged_prot get_fasta_from_family_file.pl "${protein_files[@]}" "${protein_files_extra[@]}" \ - -fam 18_syn_pan_aug_extra.clust.tsv -out 19_palmp + -fam 18_syn_pan_aug_extra.clust.tsv -out 19_pan_aug_leftover_merged_prot - echo " Merge fasta files from 19_palmp, prefixing IDs with panID__" - pushd 19_palmp - merge_files_to_pan_fasta.awk 19_palmp/* > ../19_palmp.faa - popd + echo " Merge fasta files from 19_pan_aug_leftover_merged_prot, prefixing IDs with panID__" + merge_files_to_pan_fasta.awk 19_pan_aug_leftover_merged_prot/* > 19_pan_aug_leftover_merged_prot.faa else echo "== No annotations were designated as \"extra\", so just promote the syn_pan_aug files as syn_pan_aug_extra. ==" - cp 07_pan_fasta_prot.faa 19_palmp.faa + cp 07_pan_fasta_prot.faa 19_pan_aug_leftover_merged_prot.faa cp 12_syn_pan_aug.clust.tsv 18_syn_pan_aug_extra.clust.tsv cp 12_syn_pan_aug.hsh.tsv 18_syn_pan_aug_extra.hsh.tsv fi @@ -728,6 +724,81 @@ run_tabularize() { rm tmp.table_header } +########## +run_align() { + echo; echo "== Retrieve sequences for each family, preparatory to aligning them ==" + cd "${WORK_DIR}" || exit + if [[ -d 19_pan_aug_leftover_merged_prot ]] && [[ -f 19_pan_aug_leftover_merged_prot/${consen_prefix}00001 ]]; then + : # do nothing; the directory and file(s) exist + else + mkdir -p 19_pan_aug_leftover_merged_prot + echo " For each pan-gene set, retrieve sequences into a multifasta file." + get_fasta_from_family_file.pl "${protein_files[@]}" "${protein_files_extra[@]}" \ + -fam 18_syn_pan_aug_extra.clust.tsv -out 19_pan_aug_leftover_merged_prot + fi + + echo; echo "== Align the gene families ==" + mkdir -p 20_aligns + for filepath in 19_pan_aug_leftover_merged_prot/*; do + file=$(basename "$filepath"); + echo " Computing alignment, using program famsa, for file $file" + famsa -t 2 19_pan_aug_leftover_merged_prot/"$file" 20_aligns/"$file" 1>/dev/null & + if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi + done + wait +} + +########## +run_model_and_trim() { + echo; echo "== Build HMMs ==" + cd "${WORK_DIR}" || exit + mkdir -p 21_hmm + for filepath in 20_aligns/*; do + file=$(basename "$filepath"); + hmmbuild -n "$file" 21_hmm/"$file" "$filepath" 1>/dev/null & + if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi + done + wait + + echo; echo "== Realign to HMMs ==" + mkdir -p 22_hmmalign + for filepath in 21_hmm/*; do + file=$(basename "$filepath"); + printf "%s " "$file" + hmmalign --trim --outformat A2M --amino -o 22_hmmalign/"$file" 21_hmm/"$file" 19_pan_aug_leftover_merged_prot/"$file" & + if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi + done + wait + echo + + echo; echo "== Trim HMM alignments to match-states ==" + mkdir -p 23_hmmalign_trim1 + for filepath in 22_hmmalign/*; do + file=$(basename "$filepath"); + printf "%s " "$file" + < "$filepath" perl -ne 'if ($_ =~ />/) {print $_} else {$line = $_; $line =~ s/[a-z]//g; print $line}' | + sed '/^$/d' > 23_hmmalign_trim1/"$file" & + if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi + done + wait + echo + + echo; echo "== Filter alignments prior to tree calculation ==" + mkdir -p 23_hmmalign_trim2 23_hmmalign_trim2_log + min_depth=3 + min_pct_depth=20 + min_pct_aligned=20 + for filepath in 23_hmmalign_trim1/*; do + file=$(basename "$filepath") + printf "%s " "$file" + filter_align.pl -in "$filepath" -out 23_hmmalign_trim2/"$file" -log 23_hmmalign_trim2_log/"$file" \ + -depth $min_depth -pct_depth $min_pct_depth -min_pct_aligned $min_pct_aligned & + if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi + done + wait + echo +} + ########## run_summarize() { echo; echo "Summarize: Move results into output directory, and report some summary statistics" @@ -766,15 +837,14 @@ run_summarize() { echo "Couldn't find file manifests/MANIFEST_output_fam.yml" fi - # Copy directory of final fasta files - abbreviated 19_palmp but copied to 19_pan_aug_leftover_merged_prot - if [ -d "${WORK_DIR}"/19_palmp ]; then - echo "Copying directory $dir to output directory" - cp -r "${WORK_DIR}"/$dir "${full_out_dir}"/19_pan_aug_leftover_merged_prot - else - echo "Warning: couldn't find dir ${WORK_DIR}/$dir; skipping" - fi - - # 21_hmm 22_hmmalign 23_hmmalign_trim2 24_trees are transferred with -s xfr_aligns_trees + for dir in 19_pan_aug_leftover_merged_prot 21_hmm 22_hmmalign 23_hmmalign_trim2 24_trees; do + if [ -d "${WORK_DIR}"/$dir ]; then + echo "Copying directory $dir to output directory" + cp -r "${WORK_DIR}"/$dir "${full_out_dir}"/ + else + echo "Warning: couldn't find dir ${WORK_DIR}/$dir; skipping" + fi + done printf "Run of program %s, version %s\n" "$scriptname" "$version" > "${stats_file}" @@ -826,19 +896,15 @@ run_summarize() { { printf " %i\t%i\t%2.1f\t%s\n", $2, $4, 100*($4/$2), $1 }' >> "${stats_file}" echo " Print counts per accession" - { + if [ -f "${full_out_dir}"/18_syn_pan_aug_extra.counts.tsv ]; then printf "\n== For all annotation sets, counts of genes-in-orthogroups and counts of orthogroups-with-genes:\n" printf " gns-in-OGs OGs-w-gns OGs-w-gns/gns pct-non-null-OGs pct-null-OGs annot-set\n" - } >> "${stats_file}" - if [ -f "${full_out_dir}"/18_syn_pan_aug_extra.counts.tsv ]; then - { transpose.pl "${full_out_dir}"/18_syn_pan_aug_extra.counts.tsv | perl -lane 'next if ($.<=3); $ct=0; $sum=0; $nulls=0; $OGs=0; for $i (@F[1..(@F-1)]){ $OGs++; if ($i>0){$ct++; $sum+=$i} if ($i==0){$nulls++} }; printf(" %d\t%d\t%.2f\t%.2f\t%.2f\t%s\n", $sum, $ct, 100*$ct/$sum, 100*($OGs-$nulls)/$OGs, 100*$nulls/$OGs, $F[0])' \ >> "${stats_file}" - } fi echo " Print histograms" @@ -868,7 +934,7 @@ run_clean() { echo " work_dir: $PWD" if [ -d MMTEMP ]; then rm -rf MMTEMP/*; fi - for dir in 11_pan_leftovers 13_extra_out_dir 16_pan_leftovers_extra 19_palmp; do + for dir in 11_pan_leftovers 13_extra_out_dir 16_pan_leftovers_extra 19_pan_aug_leftover_merged_prot; do if [ -d $dir ]; then echo " Removing directory $dir"; rm -rf $dir & fi done @@ -884,14 +950,14 @@ run_clean() { ######################################## # Main program -pandagma_conf_params='clust_iden clust_cov extra_iden TE_match_iden mcl_inflation strict_synt -ks_low_cutoff ks_hi_cutoff ks_binsize ks_block_wgd_cutoff max_pair_ks consen_prefix -annot_str_regex min_align_count min_annots_in_align' +pandagma_conf_params='clust_iden clust_cov extra_iden mcl_inflation strict_synt +ks_low_cutoff ks_hi_cutoff ks_binsize ks_block_wgd_cutoff max_pair_ks +consen_prefix annot_str_regex' -# The steps align_protein, model_and_trim, calc_trees, and xfr_aligns_trees may be run separately. -# Those steps (functions) are in pandagma-common.sh -export commandlist="ingest mmseqs filter dagchainer ks_calc ks_filter \ - mcl consense cluster_rest add_extra tabularize summarize" +export commandlist="ingest mmseqs filter dagchainer \ + ks_calc ks_filter \ + mcl consense cluster_rest add_extra tabularize \ + align model_and_trim calc_trees summarize" export dependencies='dagchainer mcl cons famsa run_DAG_chainer.pl' From 0d10f5db360a45238868ed64ec4f5f8d226e98e5 Mon Sep 17 00:00:00 2001 From: Steven Cannon Date: Thu, 29 Feb 2024 11:56:14 -0800 Subject: [PATCH 17/18] Handle transfer of 19_pan_aug_leftover_merged_prot in -common rather than in -fam --- bin/pandagma-common.sh | 2 +- bin/pandagma-fam.sh | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/bin/pandagma-common.sh b/bin/pandagma-common.sh index dd37eac..1e8ba99 100755 --- a/bin/pandagma-common.sh +++ b/bin/pandagma-common.sh @@ -230,7 +230,7 @@ run_xfr_aligns_trees() { fi if [[ "$scriptname" =~ "pandagma pan" ]] || [[ "$scriptname" =~ "pandagma fam" ]]; then - for dir in 20_align* 21_hmm 22_hmmalign 23_hmmalign_trim2 24_trees; do + for dir in 19_pan_aug_leftover_merged_prot 20_align* 21_hmm 22_hmmalign 23_hmmalign_trim2 24_trees; do if [ -d "${WORK_DIR}/$dir" ]; then echo "Copying directory $dir to output directory" cp -r "${WORK_DIR}/$dir" "${full_out_dir}"/ diff --git a/bin/pandagma-fam.sh b/bin/pandagma-fam.sh index 3a18eed..a7c38cc 100755 --- a/bin/pandagma-fam.sh +++ b/bin/pandagma-fam.sh @@ -837,15 +837,6 @@ run_summarize() { echo "Couldn't find file manifests/MANIFEST_output_fam.yml" fi - for dir in 19_pan_aug_leftover_merged_prot 21_hmm 22_hmmalign 23_hmmalign_trim2 24_trees; do - if [ -d "${WORK_DIR}"/$dir ]; then - echo "Copying directory $dir to output directory" - cp -r "${WORK_DIR}"/$dir "${full_out_dir}"/ - else - echo "Warning: couldn't find dir ${WORK_DIR}/$dir; skipping" - fi - done - printf "Run of program %s, version %s\n" "$scriptname" "$version" > "${stats_file}" end_time=$(date) From cef3b01418340eae24c77650af9b673f94457532 Mon Sep 17 00:00:00 2001 From: Steven Cannon Date: Thu, 29 Feb 2024 13:52:38 -0800 Subject: [PATCH 18/18] Tweak output of counts-per-accession header in stats.txt file --- bin/pandagma-fam.sh | 23 +++++++++++++++-------- bin/pandagma-pan.sh | 25 +++++++++++++------------ 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/bin/pandagma-fam.sh b/bin/pandagma-fam.sh index a7c38cc..281b1b8 100755 --- a/bin/pandagma-fam.sh +++ b/bin/pandagma-fam.sh @@ -888,14 +888,21 @@ run_summarize() { echo " Print counts per accession" if [ -f "${full_out_dir}"/18_syn_pan_aug_extra.counts.tsv ]; then - printf "\n== For all annotation sets, counts of genes-in-orthogroups and counts of orthogroups-with-genes:\n" - printf " gns-in-OGs OGs-w-gns OGs-w-gns/gns pct-non-null-OGs pct-null-OGs annot-set\n" - transpose.pl "${full_out_dir}"/18_syn_pan_aug_extra.counts.tsv | - perl -lane 'next if ($.<=3); - $ct=0; $sum=0; $nulls=0; $OGs=0; - for $i (@F[1..(@F-1)]){ $OGs++; if ($i>0){$ct++; $sum+=$i} if ($i==0){$nulls++} }; - printf(" %d\t%d\t%.2f\t%.2f\t%.2f\t%s\n", $sum, $ct, 100*$ct/$sum, 100*($OGs-$nulls)/$OGs, 100*$nulls/$OGs, $F[0])' \ - >> "${stats_file}" + { + printf "\n== For all annotation sets, counts of genes-in-orthogroups and counts of orthogroups-with-genes:\n" + printf " gns-in-OGs OGs-w-gns OGs-w-gns/gns pct-non-null-OGs pct-null-OGs annot-set\n" + } >> "${stats_file}" + { + transpose.pl "${full_out_dir}"/18_syn_pan_aug_extra.counts.tsv | + perl -lane 'next if ($.<=3); + $ct=0; $sum=0; $nulls=0; $OGs=0; + for $i (@F[1..(@F-1)]){ + $OGs++; + if ($i>0){$ct++; $sum+=$i} + if ($i==0){$nulls++} + }; + printf(" %d\t%d\t%.2f\t%.2f\t%.2f\t%s\n", $sum, $ct, 100*$ct/$sum, 100*($OGs-$nulls)/$OGs, 100*$nulls/$OGs, $F[0])' + } >> "${stats_file}" fi echo " Print histograms" diff --git a/bin/pandagma-pan.sh b/bin/pandagma-pan.sh index c3953eb..c46b2f8 100755 --- a/bin/pandagma-pan.sh +++ b/bin/pandagma-pan.sh @@ -1105,18 +1105,19 @@ run_summarize() { echo " Print counts per accession" if [ -f "${full_out_dir}"/18_syn_pan_aug_extra.counts.tsv ]; then { - printf "\n== For all annotation sets, counts of genes-in-orthogroups and counts of orthogroups-with-genes:\n" - printf " gns-in-OGs OGs-w-gns OGs-w-gns/gns pct-non-null-OGs pct-null-OGs annot-set\n" - transpose.pl "${full_out_dir}"/18_syn_pan_aug_extra.counts.tsv | - perl -lane 'next if ($.<=3); - $ct=0; $sum=0; $nulls=0; $OGs=0; - for $i (@F[1..(@F-1)]){ - $OGs++; - if ($i>0){$ct++; $sum+=$i} - if ($i==0){$nulls++} - }; - printf(" %d\t%d\t%.2f\t%.2f\t%.2f\t%s\n", $sum, $ct, 100*$ct/$sum, 100*($OGs-$nulls)/$OGs, 100*$nulls/$OGs, $F[0])' \ - + printf "\n== For all annotation sets, counts of genes-in-orthogroups and counts of orthogroups-with-genes:\n" + printf " gns-in-OGs OGs-w-gns OGs-w-gns/gns pct-non-null-OGs pct-null-OGs annot-set\n" + } >> "${stats_file}" + { + transpose.pl "${full_out_dir}"/18_syn_pan_aug_extra.counts.tsv | + perl -lane 'next if ($.<=3); + $ct=0; $sum=0; $nulls=0; $OGs=0; + for $i (@F[1..(@F-1)]){ + $OGs++; + if ($i>0){$ct++; $sum+=$i} + if ($i==0){$nulls++} + }; + printf(" %d\t%d\t%.2f\t%.2f\t%.2f\t%s\n", $sum, $ct, 100*$ct/$sum, 100*($OGs-$nulls)/$OGs, 100*$nulls/$OGs, $F[0])' } >> "${stats_file}" fi