From 030b8ade357b03e86494eb6e11f160ed5b47b095 Mon Sep 17 00:00:00 2001 From: Steven Cannon Date: Fri, 1 Mar 2024 15:20:33 -0600 Subject: [PATCH] Refactoring, arg list handling, add two parameters (#11) * (1) Fix handling of ks_peaks.tsv; (2) shorten 19_pan_aug_leftover_merged_prot to 19_palm to reduce length of arglist; (3) move small families to the side in alignment step; (4) correct printing of header for counts per accession * MANY changes in the course of moving alignment & modeling steps to -common.sh. Likely breakages. Incomplete. * Various fixes in alignment and modeling steps, after moving this code to -common * Handle argument list too long errors around augment_cluster_sets.awk * Add Wm82.gnm6 annotation to Glycine pangene config & make file * Remove 04_dag at start of filter, to avoid problems of leftover files from previous runs * Minor cosmetic change in stats/ks_histplots.tsv * Add two params, min_align_count min_annots_in_align, to the reporting in the summarize step * Fix listing of pandagma_conf_params reported in summarize step, and other minor cleanup * Add two alignment parameters to config files; and minor changes to pandagma-fsup.sh to work with the pandagma driver script and pandagma-common.sh * Changes in step xfr_aligns_trees to handle output from fsup * Remove optional steps align_cds, align_protein, model_and_trim, calc_trees, and xfr_aligns_trees from commandlist in -pan and -fam workflows * Bug fix & documentation tweaks in pandagma-fsup.sh * Minor change to handling of directory/file transfer in pandagma-fsup.sh * Minor change in stats reporting * Restructure ks_filter step so that ks_block_wgd_cutoff and max_pair_ks are used if stats/ks_histplots.tsv isn't provided * Handle transfer of 19_pan_aug_leftover_merged_prot in -common rather than in -fam * Tweak output of counts-per-accession header in stats.txt file --- batch_fam_example_singularity.sh | 2 +- batch_fam_prod.sh | 75 ++++++ batch_pan_example_conda.sh | 3 +- bin/fetch-datastore.sh | 7 +- bin/pandagma-common.sh | 204 ++++++++++++++-- bin/pandagma-fam.sh | 32 ++- bin/pandagma-fsup.sh | 35 ++- bin/pandagma-pan.sh | 226 +++++------------- config/Arachis3_5_2_0.conf | 2 + config/Cicer3_4_2_0.conf | 2 + ...ne5_21_29_6.conf => Glycine5_21_30_6.conf} | 25 +- config/Glycine_7_3_2.conf | 2 + config/Medicago3_8_0_15.conf | 2 + config/Phaseolus3_7_1_0.conf | 2 + config/Vigna4_7_1_4.conf | 2 + config/Zea_7_2_0.conf | 2 + config/family3_18_3.conf | 2 + config/family3_22_3.conf | 148 ++++++++++++ config/family3_4_3.conf | 2 + config/family3_sup.conf | 35 ++- config/family3_sup_chafa.conf | 71 ++++++ config/family3_sup_ur.conf | 22 +- config/family3_ur.conf | 2 + config/family_7_3.conf | 2 + config/family_ur.conf | 2 + ...lycine5_21_29_6.mk => Glycine5_21_30_6.mk} | 3 + get_data/family3_22_3.mk | 80 +++++++ get_data/family3_sup.mk | 38 +++ get_data/family3_sup_ur.mk | 10 + manifests/MANIFEST_output_fam.yml | 2 +- 30 files changed, 796 insertions(+), 246 deletions(-) create mode 100755 batch_fam_prod.sh rename config/{Glycine5_21_29_6.conf => Glycine5_21_30_6.conf} (96%) create mode 100644 config/family3_22_3.conf create mode 100644 config/family3_sup_chafa.conf rename get_data/{Glycine5_21_29_6.mk => Glycine5_21_30_6.mk} (98%) create mode 100644 get_data/family3_22_3.mk diff --git a/batch_fam_example_singularity.sh b/batch_fam_example_singularity.sh index 5015d6d..40f1f0b 100755 --- a/batch_fam_example_singularity.sh +++ b/batch_fam_example_singularity.sh @@ -39,7 +39,7 @@ singularity exec $IMAGE pandagma fam -c $CONFIG #singularity exec $IMAGE pandagma fam -c $CONFIG -s consense #singularity exec $IMAGE pandagma fam -c $CONFIG -s cluster_rest #singularity exec $IMAGE pandagma fam -c $CONFIG -s add_extra -#singularity exec $IMAGE pandagma fam -c $CONFIG -s align +#singularity exec $IMAGE pandagma fam -c $CONFIG -s align_protein #singularity exec $IMAGE pandagma fam -c $CONFIG -s model_and_trim #singularity exec $IMAGE pandagma fam -c $CONFIG -s calc_trees #singularity exec $IMAGE pandagma fam -c $CONFIG -s summarize diff --git a/batch_fam_prod.sh b/batch_fam_prod.sh new file mode 100755 index 0000000..8700076 --- /dev/null +++ b/batch_fam_prod.sh @@ -0,0 +1,75 @@ +#!/bin/bash +#SBATCH -A m4440 +#SBATCH -q regular +#SBATCH -N 1 +#SBATCH -n 30 # number of cores/tasks in this job +#SBATCH -t 23:00:00 +#SBATCH -C cpu +#SBATCH -J pand-fam2 +#SBATCH -o %x_%j.out +#SBATCH -e %x_%j.err + +set -o errexit +set -o nounset +set -o xtrace + +date # print timestamp + +# If using conda environment for dependencies: +module load conda +conda activate pandagma + +PDGPATH=$PWD +CONFIG=$PWD/config/family3_22_3.conf + +echo "Config: $CONFIG" + +export PATH=$PATH:$PDGPATH/bin +echo "PATH: $PATH" + +########## +# Test PATH +which pandagma +which calc_ks_from_dag.pl + +########## +## Fetch relevant data files; e.g. +#mkdir -p data +#make -C data -f $PWD/get_data/family3_22_3.mk + +########## +## Filter transposable elements +#pandagma TEfilter -c $CONFIG + +########## +## Run all main steps, assuming input data files exist in ./data +## Work directory will be ./work_pandagma +## Output will go to ./out_pandagma +#pandagma fam -c $CONFIG -d data_TEfilter + +########## +## Run individual steps +#pandagma fam -c $CONFIG -s ingest -d data_TEfilter +#pandagma fam -c $CONFIG -s mmseqs +#pandagma fam -c $CONFIG -s filter +#pandagma fam -c $CONFIG -s dagchainer +#pandagma fam -c $CONFIG -s ks_calc +#pandagma fam -c $CONFIG -s ks_filter +#pandagma fam -c $CONFIG -s mcl +#pandagma fam -c $CONFIG -s consense +#pandagma fam -c $CONFIG -s cluster_rest +#pandagma fam -c $CONFIG -s add_extra +#pandagma fam -c $CONFIG -s tabularize +#pandagma fam -c $CONFIG -s align_protein +#pandagma fam -c $CONFIG -s model_and_trim +#pandagma fam -c $CONFIG -s calc_trees +pandagma fam -c $CONFIG -s xfr_aligns_trees +pandagma fam -c $CONFIG -s summarize + +########## +## Optional work-directory cleanup steps +#pandagma fam -c $CONFIG -s clean +#rm -rf ./work_pandagma + +date # print timestamp + diff --git a/batch_pan_example_conda.sh b/batch_pan_example_conda.sh index 1747241..486a0e7 100755 --- a/batch_pan_example_conda.sh +++ b/batch_pan_example_conda.sh @@ -49,7 +49,8 @@ which calc_ks_from_dag.pl ########## # Optional alignment and tree-construction steps -#pandagma pan -c $CONFIG -s align +#pandagma pan -c $CONFIG -s align_cds +#pandagma pan -c $CONFIG -s align_protein #pandagma pan -c $CONFIG -s model_and_trim #pandagma pan -c $CONFIG -s calc_trees pandagma pan -c $CONFIG -s xfr_aligns_trees diff --git a/bin/fetch-datastore.sh b/bin/fetch-datastore.sh index b75d169..673f817 100755 --- a/bin/fetch-datastore.sh +++ b/bin/fetch-datastore.sh @@ -8,11 +8,13 @@ readonly DATAFILE=${1} # adjust URL for collections that are located in the annex case ${DATAFILE} in + acacr.Acra3RX.gnm1.ann1.6C0V.*|\ arahy.Tifrunner.gnm1.ann2.TN8K.*|\ arath.Col0.gnm9.ann11.KH24.*|\ bauva.BV-YZ2020.gnm2.ann1.RJ1G.*|\ chafa.ISC494698.gnm1.ann1.G7XW.*|\ dalod.SKLTGB.gnm1.ann1.R67B.*|\ + phach.longxuteng.gnm1.ann1.KGX9.*|\ prupe.Lovell.gnm2.ann1.S2ZZ.*|\ quisa.S10.gnm1.ann1.RQ4J.*|\ sento.Myeongyun.gnm1.ann1.5WXB.*|\ @@ -30,6 +32,7 @@ collection_type=annotations case ${genspa} in [A-Z]*) genus=${genspa} species=GENUS collection_type=pangenes collection=${1%.*.*.*} ;; + acacr) genus=Acacia species=crassicarpa ;; aesev) genus=Aeschynomene species=evenia ;; aradu) genus=Arachis species=duranensis ;; arahy) genus=Arachis species=hypogaea ;; @@ -52,6 +55,7 @@ case ${genspa} in glyso) genus=Glycine species=soja ;; glyst) genus=Glycine species=stenophita ;; glysy) genus=Glycine species=syndetika ;; + labpu) genus=Lablab species=purpureus ;; legume) genus=LEGUMES species=Fabaceae ;; lencu) genus=Lens species=culinaris ;; lotja) genus=Lotus species=japonicus ;; @@ -59,6 +63,7 @@ case ${genspa} in medsa) genus=Medicago species=sativa ;; medtr) genus=Medicago species=truncatula ;; phaac) genus=Phaseolus species=acutifolius ;; + phach) genus=Phanera species=championii ;; phalu) genus=Phaseolus species=lunatus ;; phavu) genus=Phaseolus species=vulgaris ;; pissa) genus=Pisum species=sativum ;; @@ -88,6 +93,4 @@ if [[ "$collection" == *"XinJiangDaYe"* ]]; then collection="XinJiangDaYe.gnm1.ann1.RKB9" fi -#echo "${DATASTORE}/${genus}/${species}/${collection_type}/${collection}/${DATAFILE}" - curl --no-progress-meter --fail "${DATASTORE}/${genus}/${species}/${collection_type}/${collection}/${DATAFILE}" diff --git a/bin/pandagma-common.sh b/bin/pandagma-common.sh index a918569..1e8ba99 100755 --- a/bin/pandagma-common.sh +++ b/bin/pandagma-common.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -version="2023-01-28" +version="2023-02-29" set -o errexit -o errtrace -o nounset -o pipefail -o posix trap 'echo ${0##*/}:${LINENO} ERROR executing command: ${BASH_COMMAND}' ERR @@ -52,30 +52,152 @@ calc_seq_stats () { }' } +########## +run_align_cds() { + cd "${WORK_DIR}" || exit + + echo "== Align CDS sequences ==" + + # If not already present, retrieve sequences for each family, preparatory to aligning them + # 19_pan_aug_leftover_merged_cds + if [[ -d 19_palmc ]]; then + : # do nothing; the directory and file(s) exist + else + mkdir -p 19_palmc + echo " For each pan-gene set, retrieve sequences into a multifasta file." + get_fasta_from_family_file.pl "${cds_files[@]}" "${cds_files_extra_constr[@]}" "${cds_files_extra_free[@]}" \ + -fam 18_syn_pan_aug_extra.clust.tsv -out 19_palmc + fi + + echo; echo "== Move small families to the side ==" + mkdir -p 19_palmc_small + # Below, count_seqs = number of sequences in the alignment; count_annots = number of unique annotation groups. + min_annots_in_align=2 # require at least this many distinct annotation groups in an alignment to retain it. + for filepath in 19_palmc/*; do + file=$(basename "$filepath") + count_seqs=$(awk '$1!~/>/ {print FILENAME "\t" $1}' "$filepath" | wc -l); + count_annots=$( perl -lane 'BEGIN{$REX=qr($ENV{"ANN_REX"})}; if($F[0]=~/$REX/){ print $1 }' $filepath | sort -u | wc -l) + if [[ $count_seqs -lt $min_align_count ]] || [[ $count_annots -lt $min_annots_in_align ]]; then + echo "Set aside small family $file; $count_seqs sequences, $count_annots distinct annotations"; + mv "$filepath" 19_palmc_small/ + fi; + done + + echo; echo "== Align nucleotide sequence for each gene family ==" + mkdir -p 20_aligns_cds + for filepath in 19_palmc/*; do + file=$(basename "$filepath"); + echo " Computing alignment, using program famsa, for file $file" + famsa -t 2 19_palmc/"$file" 20_aligns_cds/"$file" 1>/dev/null & + if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi + done + wait +} + +########## +run_align_protein() { + cd "${WORK_DIR}" || exit + + echo "== Align protein sequences ==" + + # If not already present, retrieve sequences for each family, preparatory to aligning them + if [[ -d 19_palmp ]]; then + : # do nothing; the directory and file(s) exist + else + mkdir -p 19_palmp + echo " For each pan-gene set, retrieve sequences into a multifasta file." + get_fasta_from_family_file.pl "${protein_files[@]}" "${protein_files_extra[@]}" \ + "${protein_files_extra_constr[@]}" "${protein_files_extra_free[@]}" \ + -fam 18_syn_pan_aug_extra.clust.tsv -out 19_palmp + fi + + echo; echo "== Move small families to the side ==" + mkdir -p 19_palmp_small + # Below, count_seqs = number of unique sequences in the alignment; count_annots = # of unique annotation groups. + min_annots_in_align=2 # require at least this many distinct annotation groups in an alignment to retain it. + for filepath in 19_palmp/*; do + file=$(basename "$filepath") + count_seqs=$(awk '$1!~/>/ {print FILENAME "\t" $1}' "$filepath" | sort -u | wc -l); + count_annots=$( perl -lane 'BEGIN{$REX=qr($ENV{"ANN_REX"})}; if($F[0]=~/$REX/){ print $1 }' $filepath | sort -u | wc -l) + if [[ $count_seqs -lt $min_align_count ]] || [[ $count_annots -lt $min_annots_in_align ]]; then + echo "Set aside small family $file; $count_seqs sequences, $count_annots annotations"; + mv "$filepath" 19_palmp_small/ + fi; + done + + echo; echo "== Align protein sequence for the each gene family ==" + mkdir -p 20_aligns_prot + for filepath in 19_palmp/*; do + file=$(basename "$filepath"); + # echo " Computing alignment, using program famsa, for file $file" + famsa -t 2 19_palmp/"$file" 20_aligns_prot/"$file" 1>/dev/null & + if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi + done + wait +} + +########## +run_model_and_trim() { + echo; echo "== Build HMMs ==" + cd "${WORK_DIR}" || exit + mkdir -p 21_hmm + for filepath in 20_aligns_prot/*; do + file=$(basename "$filepath"); + hmmbuild -n "$file" 21_hmm/"$file" "$filepath" 1>/dev/null & + if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi + done + wait + + echo; echo "== Realign to HMMs ==" + mkdir -p 22_hmmalign + for filepath in 21_hmm/*; do + file=$(basename "$filepath"); + # printf "%s " "$file" + hmmalign --trim --outformat A2M -o 22_hmmalign/"$file" 21_hmm/"$file" 19_palmp/"$file" & + if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi + done + wait + echo + + echo; echo "== Trim HMM alignments to match-states ==" + mkdir -p 23_hmmalign_trim1 + for filepath in 22_hmmalign/*; do + file=$(basename "$filepath"); + # printf "%s " "$file" + perl -ne 'if ($_ =~ />/) {print $_} else {$line = $_; $line =~ s/[a-z]//g; print $line}' "$filepath" | + sed '/^$/d' > 23_hmmalign_trim1/"$file" & + if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi + done + wait + echo + + echo; echo "== Filter alignments prior to tree calculation ==" + mkdir -p 23_hmmalign_trim2 23_hmmalign_trim2_log + min_depth=3 + min_pct_depth=20 + min_pct_aligned=20 + for filepath in 23_hmmalign_trim1/*; do + file=$(basename "$filepath") + # printf "%s " "$file" + filter_align.pl -in "$filepath" -out 23_hmmalign_trim2/"$file" -log 23_hmmalign_trim2_log/"$file" \ + -depth $min_depth -pct_depth $min_pct_depth -min_pct_aligned $min_pct_aligned & + if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi + done + wait + echo +} + run_calc_trees() { case $scriptname in - 'pandagma pan') fasttree_opts='-nt'; dir_prefix=2 ;; - 'pandagma fam') fasttree_opts=''; dir_prefix=2 ;; - 'pandagma fsup') fasttree_opts=''; dir_prefix=4 + 'pandagma pan') dir_prefix=2 ;; + 'pandagma fam') dir_prefix=2 ;; + 'pandagma fsup') dir_prefix=4 esac echo; echo "== Calculate trees ==" cd "${WORK_DIR}" mkdir -p "${dir_prefix}4_trees" - echo; echo "== Move small (<4) and very low-entropy families (sequences are all identical) to the side ==" - mkdir -p "${dir_prefix}3_pan_aug_small_or_identical" - min_seq_count=4 - # Below, "count" is the number of unique sequences in the alignment. - for filepath in "${dir_prefix}3_hmmalign_trim2"/*; do - file=$(basename "$filepath") - count=$(awk '$1!~/>/ {print FILENAME "\t" $1}' "$filepath" | sort -u | wc -l); - if [[ $count -lt $min_seq_count ]]; then - echo "Set aside small or low-entropy family $file"; - mv "$filepath" "${dir_prefix}3_pan_aug_small_or_identical"/ - fi; - done - # By default, FastTreeMP uses all available threads. # It is more efficient to run more jobs on one core each by setting an environment variable. OMP_NUM_THREADS=1 @@ -83,13 +205,57 @@ run_calc_trees() { for filepath in "${dir_prefix}3_hmmalign_trim2"/*; do file=$(basename "$filepath") echo " Calculating tree for $file" - fasttree "${fasttree_opts}" -quiet "$filepath" > "${dir_prefix}4_trees/$file" & + if [[ "$scriptname" =~ "pandagma pan" ]]; then # pan; calculate from nucleotide alignments + echo "fasttree -nt -quiet $filepath > ${dir_prefix}4_trees/$file" + fasttree -quiet -nt "$filepath" > "${dir_prefix}4_trees/$file" + else # fam or fsup; calculate from protein alignments + echo "fasttree -quiet $filepath > ${dir_prefix}4_trees/$file" + fasttree -quiet "$filepath" > "${dir_prefix}4_trees/$file" & + fi # allow to execute up to $NPROC concurrent asynchronous processes if [[ $(jobs -r -p | wc -l) -ge ${NPROC} ]]; then wait -n; fi done wait } +run_xfr_aligns_trees() { + echo; echo "Copy alignment and tree results into output directory" + + full_out_dir="${out_dir}" + cd "${submit_dir}" || exit + + if [ ! -d "$full_out_dir" ]; then + echo "creating output directory \"${full_out_dir}/\"" + mkdir -p "$full_out_dir" + fi + + if [[ "$scriptname" =~ "pandagma pan" ]] || [[ "$scriptname" =~ "pandagma fam" ]]; then + for dir in 19_pan_aug_leftover_merged_prot 20_align* 21_hmm 22_hmmalign 23_hmmalign_trim2 24_trees; do + if [ -d "${WORK_DIR}/$dir" ]; then + echo "Copying directory $dir to output directory" + cp -r "${WORK_DIR}/$dir" "${full_out_dir}"/ + else + echo "Warning: couldn't find dir ${WORK_DIR}/$dir; skipping" + fi + done + elif [[ "$scriptname" =~ "pandagma fsup" ]]; then + for dir in 35_sup_in_fams_prot 42_hmmalign 43_hmmalign_trim2 44_trees; do + if [ -d "${WORK_DIR}/$dir" ]; then + echo "Copying directory $dir to output directory" + cp -r "${WORK_DIR}/$dir" "${full_out_dir}"/ + else + echo "Warning: couldn't find dir ${WORK_DIR}/$dir; skipping" + fi + done + else + echo "Exiting, as $scriptname is not one of pan, fam, or fsup" + exit 1 + fi + + # Remove zero-length files in output directory + find "${full_out_dir}" -size 0 -print -delete +} + main_pan_fam() { if [ "$#" -eq 0 ]; then echo >&2 "$HELP_DOC" && exit 0; @@ -134,7 +300,7 @@ main_pan_fam() { export fam_dir case ${step} in - all|summarize|xfr_aligns_trees) : "${out_dir:=out_pandagma}" ;; # default to ./pandagma_out + all|summarize|xfr_aligns_trees) : "${out_dir:=out_pandagma}" ;; # default to ./out_pandagma *) if [ "${out_dir:-}" ]; then printf '\noption -o OUT_DIR not applicable to step %s\n' "$step" >&2 exit 1 diff --git a/bin/pandagma-fam.sh b/bin/pandagma-fam.sh index 3a18eed..281b1b8 100755 --- a/bin/pandagma-fam.sh +++ b/bin/pandagma-fam.sh @@ -837,15 +837,6 @@ run_summarize() { echo "Couldn't find file manifests/MANIFEST_output_fam.yml" fi - for dir in 19_pan_aug_leftover_merged_prot 21_hmm 22_hmmalign 23_hmmalign_trim2 24_trees; do - if [ -d "${WORK_DIR}"/$dir ]; then - echo "Copying directory $dir to output directory" - cp -r "${WORK_DIR}"/$dir "${full_out_dir}"/ - else - echo "Warning: couldn't find dir ${WORK_DIR}/$dir; skipping" - fi - done - printf "Run of program %s, version %s\n" "$scriptname" "$version" > "${stats_file}" end_time=$(date) @@ -897,14 +888,21 @@ run_summarize() { echo " Print counts per accession" if [ -f "${full_out_dir}"/18_syn_pan_aug_extra.counts.tsv ]; then - printf "\n== For all annotation sets, counts of genes-in-orthogroups and counts of orthogroups-with-genes:\n" - printf " gns-in-OGs OGs-w-gns OGs-w-gns/gns pct-non-null-OGs pct-null-OGs annot-set\n" - transpose.pl "${full_out_dir}"/18_syn_pan_aug_extra.counts.tsv | - perl -lane 'next if ($.<=3); - $ct=0; $sum=0; $nulls=0; $OGs=0; - for $i (@F[1..(@F-1)]){ $OGs++; if ($i>0){$ct++; $sum+=$i} if ($i==0){$nulls++} }; - printf(" %d\t%d\t%.2f\t%.2f\t%.2f\t%s\n", $sum, $ct, 100*$ct/$sum, 100*($OGs-$nulls)/$OGs, 100*$nulls/$OGs, $F[0])' \ - >> "${stats_file}" + { + printf "\n== For all annotation sets, counts of genes-in-orthogroups and counts of orthogroups-with-genes:\n" + printf " gns-in-OGs OGs-w-gns OGs-w-gns/gns pct-non-null-OGs pct-null-OGs annot-set\n" + } >> "${stats_file}" + { + transpose.pl "${full_out_dir}"/18_syn_pan_aug_extra.counts.tsv | + perl -lane 'next if ($.<=3); + $ct=0; $sum=0; $nulls=0; $OGs=0; + for $i (@F[1..(@F-1)]){ + $OGs++; + if ($i>0){$ct++; $sum+=$i} + if ($i==0){$nulls++} + }; + printf(" %d\t%d\t%.2f\t%.2f\t%.2f\t%s\n", $sum, $ct, 100*$ct/$sum, 100*($OGs-$nulls)/$OGs, 100*$nulls/$OGs, $F[0])' + } >> "${stats_file}" fi echo " Print histograms" diff --git a/bin/pandagma-fsup.sh b/bin/pandagma-fsup.sh index 9a8a621..f789fc6 100755 --- a/bin/pandagma-fsup.sh +++ b/bin/pandagma-fsup.sh @@ -41,10 +41,10 @@ Subcommands (in order they are usually run): all - All of the steps below (Or equivalently: omit the -s flag; \"all\" is default). ingest - Prepare the assembly and annotation files for analysis. - fam_consen - - search_families - - realign_and_trim - - calc_trees - + fam_consen - Generate a consensus sequence for each family. + search_families - Search provided annotation sets (protein) against family consensus sequences. + realign_and_trim - Build HMMs and trim the alignments, preparatory to calculating trees. + calc_trees - Calculate gene trees. summarize - Move results into output directory, and report summary statistics. EOS @@ -59,10 +59,13 @@ Variables in pandagma config file (Set the config with the CONF environment vari \"([^.]+\.[^.]+)\..+\" for two dot-separated fields, e.g. vigan.Shumari or \"(\D+\d+\D+)\d+.+\" for Zea assembly+annot string, e.g. Zm00032ab + min_align_count - Minimum number of sequences in a family to trigger alignments, modeling, and trees [4] +min_annots_in_align - Minimum number of distinct annotation groups in an alignment to retain it [2] File sets (arrays): - protein_files + annotation_files cds_files + protein_files EOS ######################################## @@ -78,6 +81,8 @@ canonicalize_paths() { cd "${DATA_DIR}" || exit + export ANN_REX=${annot_str_regex} + mapfile -t cds_files < <(realpath --canonicalize-existing "${cds_files[@]}") mapfile -t protein_files < <(realpath --canonicalize-existing "${protein_files[@]}") @@ -119,7 +124,7 @@ run_ingest() { # calc basic sequence stats annot_name=$(basename 02_fasta_prot/"$file_base" | perl -pe '$ann_rex=qr($ENV{"ANN_REX"}); s/$ann_rex/$1/' ) echo " CHECK: report annot_name to stats file? [$annot_name]" - printf " Added with hmmsearch: " >> stats/tmp.fasta_seqstats_sup + printf " Added with $consen_method: " >> stats/tmp.fasta_seqstats_sup cat_or_zcat "${protein_files[file_num]}" | calc_seq_stats >> stats/tmp.fasta_seqstats_sup done @@ -275,7 +280,6 @@ run_tabularize() { rm tmp.table_header } - ########## run_realign_and_trim() { echo; echo "== Retrieve sequences for each family, preparatory to aligning them ==" @@ -289,7 +293,7 @@ run_realign_and_trim() { mkdir -p 42_hmmalign for filepath in 35_sup_in_fams_prot/*; do file=$(basename "$filepath"); - hmmalign --trim --outformat A2M --amino -o 42_hmmalign/"$file" "{fam_dir}/21_hmm/$file" 35_sup_in_fams_prot/"$file" & + hmmalign --trim --outformat A2M --amino -o 42_hmmalign/"$file" "${fam_dir}/21_hmm/$file" 35_sup_in_fams_prot/"$file" & if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi done wait @@ -346,14 +350,7 @@ run_summarize() { fi done - for dir in 35_sup_in_fams_prot 43_hmmalign_trim2 44_trees; do - if [ -d "${WORK_DIR}"/$dir ]; then - echo "Copying directory $dir to output directory" - cp -r "${WORK_DIR}"/$dir "${full_out_dir}"/ - else - echo "Warning: couldn't find dir ${WORK_DIR}/$dir; skipping" - fi - done + # Directories 35_sup_in_fams_prot 43_hmmalign_trim2 44_trees are transferred in xfr_aligns_trees (in pandagma-common.sh) end_time=$(date) @@ -430,11 +427,11 @@ run_summarize() { pandagma_conf_params='consen_iden clust_cov consen_method annot_str_regex' -# Run all specified steps -export commandlist="ingest fam_consen search_families realign_and_trim calc_trees summarize" +# Run all specified steps. +# Those steps. Steps realign_and_trim calc_trees xfr_aligns_trees are in pandagma-common.sh +export commandlist="ingest fam_consen search_families realign_and_trim calc_trees xfr_aligns_trees summarize" export dependencies='hmmscan famsa' -export ANN_REX=${annot_str_regex} declare out_dir version consen_iden clust_cov consen_method annot_str_regex diff --git a/bin/pandagma-pan.sh b/bin/pandagma-pan.sh index 56ac609..c46b2f8 100755 --- a/bin/pandagma-pan.sh +++ b/bin/pandagma-pan.sh @@ -114,6 +114,8 @@ Variables in pandagma config file (Set the config with the CONF environment vari this annotation is among those with the median length for the orthogroup. Otherwise, one is selected at random from those with median length. order_method - Method to determine consensus panID order. reference or alignment [default reference] + min_align_count - Minimum number of sequences in a family to trigger alignments, modeling, and trees [4] +min_annots_in_align - Minimum number of distinct annotation groups in an alignment to retain it [2] '' EOS @@ -297,6 +299,7 @@ run_mmseqs() { run_filter() { echo; echo "From mmseqs cluster output, split out the following fields: molecule, gene, start, stop." cd "${WORK_DIR}" || exit + if [ -d 04_dag ]; then rm -rf 04_dag; fi mkdir -p 04_dag if [[ -v expected_chr_matches ]]; then # filter based on list of expected chromosome pairings if provided echo "Filtering on chromosome patterns defined in expected_chr_matches" @@ -403,7 +406,9 @@ run_consense() { get_fasta_from_family_file.pl "${cds_files[@]}" -fam 06_syn_pan.clust.tsv -out 07_pan_fasta echo " Merge fasta files in 07_pan_fasta, prefixing IDs with panID" - merge_files_to_pan_fasta.awk 07_pan_fasta/* > 07_pan_fasta_cds.fna + pushd 07_pan_fasta + merge_files_to_pan_fasta.awk * > ../07_pan_fasta_cds.fna + popd echo " Pick a representative seq. for each orthogroup - as a sequence with the median length for that OG." echo " < 07_pan_fasta_cds.fna pick_family_rep.pl -prefer $preferred_annot -out 08_pan_fasta_clust_rep_cds.fna" @@ -413,7 +418,9 @@ run_consense() { cat_or_zcat "${cds_files[@]}" | awk '/^>/ {print substr($1,2)}' | sort > lists/09_all_genes echo " Get sorted list of all clustered genes" - awk '$1~/^>/ {print $1}' 07_pan_fasta/* | sed 's/>//' | sort > lists/09_all_clustered_genes + pushd 07_pan_fasta + awk '$1~/^>/ {print $1}' * | sed 's/>//' | sort > ../lists/09_all_clustered_genes + popd echo " Get list of genes not in clusters" comm -13 lists/09_all_clustered_genes lists/09_all_genes > lists/09_genes_not_in_clusters @@ -472,9 +479,11 @@ run_consense() { echo " Make augmented cluster sets. Uniqify on the back end, converting from mcl clust format to hsh (clust_ID gene)." cat /dev/null > 12_syn_pan_aug_pre.hsh.tsv - augment_cluster_sets.awk leftovers_dir=11_pan_leftovers 07_pan_fasta/* | - perl -lane 'for $i (1..scalar(@F)-1){print $F[0], "\t", $F[$i]}' | - sort -k1,1 -k2,2 | uniq > 12_syn_pan_aug_pre.hsh.tsv + pushd 07_pan_fasta + augment_cluster_sets.awk leftovers_dir=../11_pan_leftovers * | + perl -lane 'for $i (1..scalar(@F)-1){print $F[0], "\t", $F[$i]}' | + sort -k1,1 -k2,2 | uniq > ../12_syn_pan_aug_pre.hsh.tsv + popd echo " Reshape from mcl output format (clustered IDs on one line) to a hash format (clust_ID gene)" hash_to_rows_by_1st_col.awk < 12_syn_pan_aug_pre.hsh.tsv > 12_syn_pan_aug_pre.clust.tsv @@ -549,7 +558,9 @@ run_add_extra() { get_fasta_from_family_file.pl "${cds_files[@]}" -fam 12_syn_pan_aug.clust.tsv -out 13_pan_aug_fasta echo " Merge fasta files in 13_pan_aug_fasta, prefixing IDs with panID__" - merge_files_to_pan_fasta.awk 13_pan_aug_fasta/* > 13_pan_aug_fasta.fna + pushd 13_pan_aug_fasta + merge_files_to_pan_fasta.awk * > ../13_pan_aug_fasta.fna + popd echo " Store the panID - geneID in a hash to be retrieved later, after filtering by chromosome" awk '$1~/^>/ {print substr($1,2)}' 13_pan_aug_fasta.fna | @@ -644,28 +655,31 @@ run_add_extra() { -fam 14_syn_pan_extra.clust.tsv -out 16_pan_leftovers_extra/ echo " Make augmented cluster sets. Uniqify on the back end, converting from mcl clust format to hsh (clust_ID gene)." - augment_cluster_sets.awk leftovers_dir=16_pan_leftovers_extra 13_pan_aug_fasta/* | - perl -lane 'for $i (1..scalar(@F)-1){print $F[0], "\t", $F[$i]}' | - sort -k1,1 -k2,2 | uniq > 18_syn_pan_aug_extra.hsh.tsv + pushd 13_pan_aug_fasta + augment_cluster_sets.awk leftovers_dir=../16_pan_leftovers_extra * | + perl -lane 'for $i (1..scalar(@F)-1){print $F[0], "\t", $F[$i]}' | + sort -k1,1 -k2,2 | uniq > ../18_syn_pan_aug_extra.hsh.tsv + popd echo " Reshape from hash to mcl output format (clustered IDs on one line)" hash_to_rows_by_1st_col.awk 18_syn_pan_aug_extra.hsh.tsv > 18_syn_pan_aug_extra.clust.tsv echo " For each pan-gene set, retrieve sequences into a multifasta file." - echo " Fasta file:" "${protein_files[@]}" "${protein_files_extra_constr[@]}" "${protein_files_extra_free[@]}" - if [ -d 19_pan_aug_leftover_merged_cds ]; then rm -rf 19_pan_aug_leftover_merged_cds; fi - mkdir -p 19_pan_aug_leftover_merged_cds + echo " The directory and multifasta file are called 19_pan_aug_leftover_merged_cds (abbreviated 19_palmc)" + echo " Fasta file:" "${cds_files[@]}" "${cds_files_extra_constr[@]}" "${cds_files_extra_free[@]}" + if [ -d 19_palmc ]; then rm -rf 19_palmc; fi + mkdir -p 19_palmc get_fasta_from_family_file.pl "${cds_files[@]}" "${cds_files_extra_constr[@]}" "${cds_files_extra_free[@]}" \ - -fam 18_syn_pan_aug_extra.clust.tsv -out 19_pan_aug_leftover_merged_cds + -fam 18_syn_pan_aug_extra.clust.tsv -out 19_palmc - echo " Merge files in 19_pan_aug_leftover_merged_cds, prefixing IDs with panID__" - find 19_pan_aug_leftover_merged_cds -maxdepth 1 -mindepth 1 | + echo " Merge files in 19_palmc, prefixing IDs with panID__" + find 19_palmc -maxdepth 1 -mindepth 1 | sort | - xargs awk '/^>/ {printf(">%s__%s\n", substr(FILENAME,index(FILENAME,"/")+1), substr($0,2)); next} {print}' > 19_pan_aug_leftover_merged_cds.fna + xargs awk '/^>/ {printf(">%s__%s\n", substr(FILENAME,index(FILENAME,"/")+1), substr($0,2)); next} {print}' > 19_palmc.fna else echo "== No annotations were designated as \"extra\", so just promote the syn_pan_aug files as syn_pan_aug_extra. ==" - cp 07_pan_fasta_cds.fna 19_pan_aug_leftover_merged_cds.fna + cp 07_pan_fasta_cds.fna 19_palmc.fna cp 12_syn_pan_aug.clust.tsv 18_syn_pan_aug_extra.clust.tsv cp 12_syn_pan_aug.hsh.tsv 18_syn_pan_aug_extra.hsh.tsv fi @@ -683,24 +697,25 @@ run_pick_exemplars() { done echo " Get protein sequences into pan-gene sets, corresponding with 18_syn_pan_aug_extra.clust.tsv" - if [ -d 19_pan_aug_leftover_merged_prot ]; then rm -rf 19_pan_aug_leftover_merged_prot; fi - mkdir -p 19_pan_aug_leftover_merged_prot + echo " The directory and multifasta file are called 19_pan_aug_leftover_merged_prot (abbreviated 19_palmp)" + if [ -d 19_palmp ]; then rm -rf 19_palmp; fi + mkdir -p 19_palmp get_fasta_from_family_file.pl 20_pan_fasta_prot.faa \ - -fam 18_syn_pan_aug_extra.clust.tsv -out 19_pan_aug_leftover_merged_prot + -fam 18_syn_pan_aug_extra.clust.tsv -out 19_palmp - echo " Get all protein sequences from files in 19_pan_aug_leftover_merged_prot" - find 19_pan_aug_leftover_merged_prot -maxdepth 1 -mindepth 1 | + echo " Get all protein sequences from files in 19_palmp" + find 19_palmp -maxdepth 1 -mindepth 1 | sort | - xargs awk '/^>/ {printf(">%s__%s\n", substr(FILENAME,index(FILENAME,"/")+1), substr($0,2)); next} {print}' > 19_pan_aug_leftover_merged_prot.faa + xargs awk '/^>/ {printf(">%s__%s\n", substr(FILENAME,index(FILENAME,"/")+1), substr($0,2)); next} {print}' > 19_palmp.faa echo " Pick a representative sequence for each pangene set - as a sequence with the median length for that set." echo " == first proteins:" pick_family_rep.pl -nostop -prefer "$preferred_annot" \ - -out 21_pan_fasta_clust_rep_prot.faa < 19_pan_aug_leftover_merged_prot.faa + -out 21_pan_fasta_clust_rep_prot.faa < 19_palmp.faa echo " == then CDS sequences, corresponding with 21_pan_fasta_clust_rep_prot.faa" awk '$1~/^>/ {print substr($1,2)}' 21_pan_fasta_clust_rep_prot.faa > lists/lis.21_pan_fasta_clust_rep - get_fasta_subset.pl -in 19_pan_aug_leftover_merged_cds.fna -list lists/lis.21_pan_fasta_clust_rep \ + get_fasta_subset.pl -in 19_palmc.fna -list lists/lis.21_pan_fasta_clust_rep \ -clobber -out 21_pan_fasta_clust_rep_cds.fna perl -pi -e 's/__/ /' 21_pan_fasta_clust_rep_cds.fna @@ -888,141 +903,13 @@ run_order_and_name() { perl -pe 's/__/ /g' > 23_syn_pan_pctl"${pctl_low}"_posn_prot.faa } -########## -run_align_cds() { - cd "${WORK_DIR}" || exit - - echo "== Align CDS sequences ==" - - # If not already present, retrieve sequences for each family, preparatory to aligning them - if [[ -d 19_pan_aug_leftover_merged_cds ]]; then - : # do nothing; the directory and file(s) exist - else - mkdir -p 19_pan_aug_leftover_merged_cds - echo " For each pan-gene set, retrieve sequences into a multifasta file." - get_fasta_from_family_file.pl "${cds_files[@]}" "${cds_files_extra_constr[@]}" "${cds_files_extra_free[@]}" \ - -fam 18_syn_pan_aug_extra.clust.tsv -out 19_pan_aug_leftover_merged_cds - fi - - echo; echo "== Align nucleotide sequence for each gene family ==" - mkdir -p 20_aligns_cds - for filepath in 19_pan_aug_leftover_merged_cds/*; do - file=$(basename "$filepath"); - echo " Computing alignment, using program famsa, for file $file" - famsa -t 2 19_pan_aug_leftover_merged_cds/"$file" 20_aligns_cds/"$file" 1>/dev/null & - if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi - done - wait -} - -########## -run_align_protein() { - cd "${WORK_DIR}" || exit - - echo "== Align protein sequences ==" - - # If not already present, retrieve sequences for each family, preparatory to aligning them - if [[ -d 19_pan_aug_leftover_merged_prot ]]; then - : # do nothing; the directory and file(s) exist - else - mkdir -p 19_pan_aug_leftover_merged_prot - echo " For each pan-gene set, retrieve sequences into a multifasta file." - get_fasta_from_family_file.pl "${protein_files[@]}" "${protein_files_extra_constr[@]}" "${protein_files_extra_free[@]}" \ - -fam 18_syn_pan_aug_extra.clust.tsv -out 19_pan_aug_leftover_merged_prot - fi - - echo; echo "== Align protein sequence for the each gene family ==" - mkdir -p 20_aligns_prot - for filepath in 19_pan_aug_leftover_merged_prot/*; do - file=$(basename "$filepath"); - echo " Computing alignment, using program famsa, for file $file" - famsa -t 2 19_pan_aug_leftover_merged_prot/"$file" 20_aligns_prot/"$file" 1>/dev/null & - if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi - done - wait -} - -########## -run_model_and_trim() { - echo; echo "== Build HMMs ==" - cd "${WORK_DIR}" || exit - mkdir -p 21_hmm - for filepath in 20_aligns_cds/*; do - file=$(basename "$filepath"); - hmmbuild -n "$file" 21_hmm/"$file" "$filepath" 1>/dev/null & - if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi - done - wait - - echo; echo "== Realign to HMMs ==" - mkdir -p 22_hmmalign - for filepath in 21_hmm/*; do - file=$(basename "$filepath"); - printf "%f " "$file" - hmmalign --trim --outformat A2M -o 22_hmmalign/"$file" 21_hmm/"$file" 19_pan_aug_leftover_merged_cds/"$file" & - if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi - done - wait - echo - - echo; echo "== Trim HMM alignments to match-states ==" - mkdir -p 23_hmmalign_trim1 - for filepath in 22_hmmalign/*; do - file=$(basename "$filepath"); - printf "%s " "$file" - perl -ne 'if ($_ =~ />/) {print $_} else {$line = $_; $line =~ s/[a-z]//g; print $line}' "$filepath" | - sed '/^$/d' > 23_hmmalign_trim1/"$file" & - if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi - done - wait - echo - - echo; echo "== Filter alignments prior to tree calculation ==" - mkdir -p 23_hmmalign_trim2 23_hmmalign_trim2_log - min_depth=3 - min_pct_depth=20 - min_pct_aligned=20 - for filepath in 23_hmmalign_trim1/*; do - file=$(basename "$filepath") - printf "%f " "$file" - filter_align.pl -in "$filepath" -out 23_hmmalign_trim2/"$file" -log 23_hmmalign_trim2_log/"$file" \ - -depth $min_depth -pct_depth $min_pct_depth -min_pct_aligned $min_pct_aligned & - if [[ $(jobs -r -p | wc -l) -ge $((NPROC/2)) ]]; then wait -n; fi - done - wait - echo -} - -########## -run_xfr_aligns_trees() { - echo; echo "Copy alignment and tree results into output directory" - - full_out_dir="${out_dir}" - - cd "${submit_dir}" || exit - - if [ ! -d "$full_out_dir" ]; then - echo "creating output directory \"${full_out_dir}/\"" - mkdir -p "$full_out_dir" - fi - - for dir in 20_aligns_cds 20_aligns_prot 21_hmm 22_hmmalign 23_hmmalign_trim2 24_trees; do - if [ -d "${WORK_DIR}"/$dir ]; then - echo "Copying directory $dir to output directory" - cp -r "${WORK_DIR}"/$dir "${full_out_dir}"/ - else - echo "Warning: couldn't find dir ${WORK_DIR}/$dir; skipping" - fi - done -} - ########## run_calc_chr_pairs() { cd "${WORK_DIR}" || exit echo "Generate a report of observed chromosome pairs" echo " Identify gene pairs, using mmseqs --easy_cluster" - mmseqs easy-cluster 19_pan_aug_leftover_merged_cds.fna 24_pan_fasta_clust 03_mmseqs_tmp \ + mmseqs easy-cluster 19_palmc.fna 24_pan_fasta_clust 03_mmseqs_tmp \ --min-seq-id "$clust_iden" -c "$clust_cov" --cov-mode 0 --cluster-reassign 1>/dev/null echo " Extract chromosome-chromosome correspondences" @@ -1218,18 +1105,19 @@ run_summarize() { echo " Print counts per accession" if [ -f "${full_out_dir}"/18_syn_pan_aug_extra.counts.tsv ]; then { - printf "\n== For all annotation sets, counts of genes-in-orthogroups and counts of orthogroups-with-genes:\n" - printf " gns-in-OGs OGs-w-gns OGs-w-gns/gns pct-non-null-OGs pct-null-OGs annot-set\n" - transpose.pl "${full_out_dir}"/18_syn_pan_aug_extra.counts.tsv | - perl -lane 'next if ($.<=3); - $ct=0; $sum=0; $nulls=0; $OGs=0; - for $i (@F[1..(@F-1)]){ - $OGs++; - if ($i>0){$ct++; $sum+=$i} - if ($i==0){$nulls++} - }; - printf(" %d\t%d\t%.2f\t%.2f\t%.2f\t%s\n", $sum, $ct, 100*$ct/$sum, 100*($OGs-$nulls)/$OGs, 100*$nulls/$OGs, $F[0])' \ - + printf "\n== For all annotation sets, counts of genes-in-orthogroups and counts of orthogroups-with-genes:\n" + printf " gns-in-OGs OGs-w-gns OGs-w-gns/gns pct-non-null-OGs pct-null-OGs annot-set\n" + } >> "${stats_file}" + { + transpose.pl "${full_out_dir}"/18_syn_pan_aug_extra.counts.tsv | + perl -lane 'next if ($.<=3); + $ct=0; $sum=0; $nulls=0; $OGs=0; + for $i (@F[1..(@F-1)]){ + $OGs++; + if ($i>0){$ct++; $sum+=$i} + if ($i==0){$nulls++} + }; + printf(" %d\t%d\t%.2f\t%.2f\t%.2f\t%s\n", $sum, $ct, 100*$ct/$sum, 100*($OGs-$nulls)/$OGs, 100*$nulls/$OGs, $F[0])' } >> "${stats_file}" fi @@ -1260,7 +1148,7 @@ run_clean() { echo " work_dir: $PWD" if [ -d MMTEMP ]; then rm -rf MMTEMP/*; fi - for dir in 11_pan_leftovers 13_extra_out_dir 16_pan_leftovers_extra 19_pan_aug_leftover_merged_prot \ + for dir in 11_pan_leftovers 13_extra_out_dir 16_pan_leftovers_extra 19_palmp \ 22_syn_pan_aug_extra_pctl${pctl_low}; do if [ -d "$dir" ]; then echo " Removing directory $dir"; rm -rf "$dir" & fi @@ -1282,8 +1170,10 @@ pandagma_conf_params='clust_iden clust_cov extra_iden mcl_inflation consen_prefix annot_str_regex order_method preferred_annot' # The steps align_cds, align_protein, model_and_trim, calc_trees, and xfr_aligns_trees may be run separately. +# Those steps (functions) are in pandagma-common.sh export commandlist="ingest mmseqs filter dagchainer mcl consense cluster_rest add_extra \ - pick_exemplars filter_to_pctile tabularize order_and_name calc_chr_pairs summarize" + pick_exemplars filter_to_pctile tabularize order_and_name \ + calc_chr_pairs summarize" export dependencies='mmseqs dagchainer mcl cons famsa hmmalign hmmbuild run_DAG_chainer.pl' diff --git a/config/Arachis3_5_2_0.conf b/config/Arachis3_5_2_0.conf index fb6efb0..959f223 100644 --- a/config/Arachis3_5_2_0.conf +++ b/config/Arachis3_5_2_0.conf @@ -11,6 +11,8 @@ consen_prefix='Arachis.pan2' annot_str_regex='([^.]+\.[^.]+\.[^.]+\.[^.]+)\..+' preferred_annot='Tifrunner.gnm2.ann2' order_method="reference" +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it ##### (required) list of GFF & FASTA file paths # Uncomment add file paths to the the annotation_files and fasta_files arrays. diff --git a/config/Cicer3_4_2_0.conf b/config/Cicer3_4_2_0.conf index 5486968..35de454 100644 --- a/config/Cicer3_4_2_0.conf +++ b/config/Cicer3_4_2_0.conf @@ -11,6 +11,8 @@ consen_prefix='Cicer.pan2' annot_str_regex='([^.]+\.[^.]+\.[^.]+\.[^.]+)\..+' preferred_annot='CDCFrontier.gnm3.ann1' order_method="reference" +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it ##### (required) list of GFF & FASTA file paths # Uncomment add file paths to the the annotation_files and cds_files arrays. diff --git a/config/Glycine5_21_29_6.conf b/config/Glycine5_21_30_6.conf similarity index 96% rename from config/Glycine5_21_29_6.conf rename to config/Glycine5_21_30_6.conf index 4a528bb..06f2412 100644 --- a/config/Glycine5_21_29_6.conf +++ b/config/Glycine5_21_30_6.conf @@ -9,8 +9,10 @@ pctl_med="50" pctl_hi="75" consen_prefix='Glycine.pan5' annot_str_regex='([^.]+\.[^.]+\.[^.]+\.[^.]+)\..+' -preferred_annot='Wm82_ISU01.gnm2.ann1' +preferred_annot='Wm82.gnm6.ann1' order_method="alignment" +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it ##### (required) list of GFF & FASTA file paths # Uncomment add file paths to the the annotation_files and fasta_files arrays. @@ -26,12 +28,12 @@ annotation_files=( glyma.Lee.gnm2.ann1.1FNT.gene_models_main.bed.gz glyma.Lee.gnm3.ann1.ZYY3.gene_models_main.bed.gz glyma.Wenfeng7_IGA1001.gnm1.ann1.ZK5W.gene_models_main.bed.gz - glyma.Wm82_IGA1008.gnm1.ann1.FGN6.gene_models_main.bed.gz - glyma.Wm82_ISU01.gnm2.ann1.FGFB.gene_models_main.bed.gz - glyma.Wm82_NJAU.gnm1.ann1.KM71.gene_models_main.bed.gz glyma.Wm82.gnm2.ann1.RVB6.gene_models_main.bed.gz glyma.Wm82.gnm4.ann1.T8TQ.gene_models_main.bed.gz glyma.Wm82.gnm5.ann1.J7HW.gene_models_main.bed.gz + glyma.Wm82.gnm6.ann1.PKSW.gene_models_main.bed.gz + glyma.Wm82_IGA1008.gnm1.ann1.FGN6.gene_models_main.bed.gz + glyma.Wm82_NJAU.gnm1.ann1.KM71.gene_models_main.bed.gz glyma.Zh13_IGA1005.gnm1.ann1.87Z5.gene_models_main.bed.gz glyma.Zh13.gnm1.ann1.8VV3.gene_models_main.bed.gz glyma.Zh35_IGA1004.gnm1.ann1.RGN6.gene_models_main.bed.gz @@ -50,12 +52,12 @@ cds_files=( glyma.Lee.gnm2.ann1.1FNT.cds_primary.fna.gz glyma.Lee.gnm3.ann1.ZYY3.cds.fna.gz glyma.Wenfeng7_IGA1001.gnm1.ann1.ZK5W.cds.fna.gz - glyma.Wm82_IGA1008.gnm1.ann1.FGN6.cds.fna.gz - glyma.Wm82_ISU01.gnm2.ann1.FGFB.cds_primary.fna.gz - glyma.Wm82_NJAU.gnm1.ann1.KM71.cds_primary.fna.gz glyma.Wm82.gnm2.ann1.RVB6.cds_primary.fna.gz glyma.Wm82.gnm4.ann1.T8TQ.cds_primary.fna.gz glyma.Wm82.gnm5.ann1.J7HW.cds.fna.gz + glyma.Wm82.gnm6.ann1.PKSW.cds_primary.fna.gz + glyma.Wm82_IGA1008.gnm1.ann1.FGN6.cds.fna.gz + glyma.Wm82_NJAU.gnm1.ann1.KM71.cds_primary.fna.gz glyma.Zh13_IGA1005.gnm1.ann1.87Z5.cds.fna.gz glyma.Zh13.gnm1.ann1.8VV3.cds_primary.fna.gz glyma.Zh35_IGA1004.gnm1.ann1.RGN6.cds.fna.gz @@ -74,12 +76,12 @@ protein_files=( glyma.Lee.gnm2.ann1.1FNT.protein_primary.faa.gz glyma.Lee.gnm3.ann1.ZYY3.protein.faa.gz glyma.Wenfeng7_IGA1001.gnm1.ann1.ZK5W.protein.faa.gz - glyma.Wm82_IGA1008.gnm1.ann1.FGN6.protein.faa.gz - glyma.Wm82_ISU01.gnm2.ann1.FGFB.protein_primary.faa.gz - glyma.Wm82_NJAU.gnm1.ann1.KM71.protein_primary.faa.gz glyma.Wm82.gnm2.ann1.RVB6.protein_primary.faa.gz glyma.Wm82.gnm4.ann1.T8TQ.protein_primary.faa.gz glyma.Wm82.gnm5.ann1.J7HW.protein.faa.gz + glyma.Wm82.gnm6.ann1.PKSW.protein_primary.faa.gz + glyma.Wm82_IGA1008.gnm1.ann1.FGN6.protein.faa.gz + glyma.Wm82_NJAU.gnm1.ann1.KM71.protein_primary.faa.gz glyma.Zh13_IGA1005.gnm1.ann1.87Z5.protein.faa.gz glyma.Zh13.gnm1.ann1.8VV3.protein_primary.faa.gz glyma.Zh35_IGA1004.gnm1.ann1.RGN6.protein.faa.gz @@ -110,6 +112,7 @@ annotation_files_extra_constr=( glyma.TongShanTianEDan.gnm1.ann1.56XW.gene_models_main.bed.gz glyma.WanDouNo_28.gnm1.ann1.NLYP.gene_models_main.bed.gz glyma.Wm82.gnm1.ann1.DvBy.gene_models_main.bed.gz + glyma.Wm82_ISU01.gnm2.ann1.FGFB.gene_models_main.bed.gz glyma.XuDouNo_1.gnm1.ann1.G2T7.gene_models_main.bed.gz glyma.YuDouNo_22.gnm1.ann1.HCQ1.gene_models_main.bed.gz glyma.Zh13.gnm2.ann1.FJ3G.gene_models_main.bed.gz @@ -142,6 +145,7 @@ cds_files_extra_constr=( glyma.TongShanTianEDan.gnm1.ann1.56XW.cds_primary.fna.gz glyma.WanDouNo_28.gnm1.ann1.NLYP.cds_primary.fna.gz glyma.Wm82.gnm1.ann1.DvBy.cds_primary.fna.gz + glyma.Wm82_ISU01.gnm2.ann1.FGFB.cds_primary.fna.gz glyma.XuDouNo_1.gnm1.ann1.G2T7.cds_primary.fna.gz glyma.YuDouNo_22.gnm1.ann1.HCQ1.cds_primary.fna.gz glyma.Zh13.gnm2.ann1.FJ3G.cds_primary.fna.gz @@ -174,6 +178,7 @@ protein_files_extra_constr=( glyma.TongShanTianEDan.gnm1.ann1.56XW.protein_primary.faa.gz glyma.WanDouNo_28.gnm1.ann1.NLYP.protein_primary.faa.gz glyma.Wm82.gnm1.ann1.DvBy.protein_primary.faa.gz + glyma.Wm82_ISU01.gnm2.ann1.FGFB.protein_primary.faa.gz glyma.XuDouNo_1.gnm1.ann1.G2T7.protein_primary.faa.gz glyma.YuDouNo_22.gnm1.ann1.HCQ1.protein_primary.faa.gz glyma.Zh13.gnm2.ann1.FJ3G.protein_primary.faa.gz diff --git a/config/Glycine_7_3_2.conf b/config/Glycine_7_3_2.conf index 2c26083..3d06719 100644 --- a/config/Glycine_7_3_2.conf +++ b/config/Glycine_7_3_2.conf @@ -11,6 +11,8 @@ consen_prefix='Glycine.pan3' annot_str_regex='([^.]+\.[^.]+\.[^.]+\.[^.]+)\..+' preferred_annot='Wm82.gnm4.ann1' order_method="alignment" +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it ##### (required) list of GFF & FASTA file paths, in the "main" set, for all-by-all comparisons # Uncomment add file paths to the the annotation_files and cds_files arrays. diff --git a/config/Medicago3_8_0_15.conf b/config/Medicago3_8_0_15.conf index 6977320..cdcd72e 100644 --- a/config/Medicago3_8_0_15.conf +++ b/config/Medicago3_8_0_15.conf @@ -11,6 +11,8 @@ consen_prefix='Medicago.pan2' annot_str_regex='([^.]+\.[^.]+\.[^.]+\.[^.]+)\..+' preferred_annot="A17.gnm5.ann1_6" order_method="reference" +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it ##### (required) list of GFF & FASTA file paths # Uncomment add file paths to the the annotation_files and cds_files arrays. diff --git a/config/Phaseolus3_7_1_0.conf b/config/Phaseolus3_7_1_0.conf index 699e9ba..a2dbc41 100644 --- a/config/Phaseolus3_7_1_0.conf +++ b/config/Phaseolus3_7_1_0.conf @@ -11,6 +11,8 @@ consen_prefix='Phaseolus.pan2' annot_str_regex='([^.]+\.[^.]+\.[^.]+\.[^.]+)\..+' preferred_annot='G19833.gnm2.ann1' order_method="alignment" +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it ##### (required) list of GFF & FASTA file paths # Uncomment add file paths to the the annotation_files and cds_files arrays. diff --git a/config/Vigna4_7_1_4.conf b/config/Vigna4_7_1_4.conf index 4429ba4..3d1311d 100644 --- a/config/Vigna4_7_1_4.conf +++ b/config/Vigna4_7_1_4.conf @@ -11,6 +11,8 @@ consen_prefix='Vigna.pan3' annot_str_regex='([^.]+\.[^.]+\.[^.]+\.[^.]+)\..+' preferred_annot='IT97K-499-35.gnm1.ann2' order_method="alignment" +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it ##### (required) list of GFF & FASTA file paths # Uncomment add file paths to the the annotation_files and cds_files arrays. diff --git a/config/Zea_7_2_0.conf b/config/Zea_7_2_0.conf index 5b165b0..5af763c 100644 --- a/config/Zea_7_2_0.conf +++ b/config/Zea_7_2_0.conf @@ -11,6 +11,8 @@ consen_prefix='Zea.pan' annot_str_regex='(\D+\d+\D+)\d+.+' preferred_annot='Zm00001eb' order_method="reference" +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it ##### (required) list of GFF & FASTA file paths # Uncomment add file paths to the the annotation_files and cds_files arrays. diff --git a/config/family3_18_3.conf b/config/family3_18_3.conf index b5ad541..7529591 100644 --- a/config/family3_18_3.conf +++ b/config/family3_18_3.conf @@ -11,6 +11,8 @@ ks_block_wgd_cutoff="1.75" # Fallback, if a ks_cutoffs file is not provided. max_pair_ks="4.0" # Fallback value for excluding gene pairs, if a ks_cutoffs file is not provided. consen_prefix='Legume.fam3.' annot_str_regex='([^.]+\.[^.]+)\..+' +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it expected_quotas=( aesev 2 diff --git a/config/family3_22_3.conf b/config/family3_22_3.conf new file mode 100644 index 0000000..8e20d73 --- /dev/null +++ b/config/family3_22_3.conf @@ -0,0 +1,148 @@ +clust_iden='0.40' +clust_cov="0.40" +extra_iden='0.30' +TE_match_iden='0.40' +mcl_inflation='1.6' +strict_synt="1" +ks_low_cutoff="0.50" # For inferring Ks peak per species pair. Don't consider Ks block-median values less than this. +ks_hi_cutoff="2.0" # For inferring Ks peak per species pair. Don't consider Ks block-median values greater than this. +ks_binsize="0.05" # For calculating and displaying histograms. Default 0.05 +ks_block_wgd_cutoff="1.75" # Fallback, if a ks_cutoffs file is not provided. +max_pair_ks="4.0" # Fallback value for excluding gene pairs, if a ks_cutoffs file is not provided. +consen_prefix='Legume.fam3.' +annot_str_regex='([^.]+\.[^.]+)\..+' +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it + +expected_quotas=( + acacr 2 + aesev 2 + arath 2 + Arachis 4 + cerca 1 + chafa 2 + Cicer 2 + dalod 2 + Glycine 4 + labpu 2 + lencu 2 + lotja 2 + lupal 6 + Medicago 2 + phach 2 + Phaseolus 2 + prupe 1 + pissa 2 + quisa 2 + sento 2 + singl 2 + tripr 2 + vicfa 2 + Vigna 2 + vitvi 1 +) + +##### (required) list of BED & FASTA file paths +# add file paths to the the annotation_files, cds_files, and protein_files arrays. +# The nth listed BED file corresponds to the nth listed FASTA file. + +annotation_files=( + acacr.Acra3RX.gnm1.ann1.6C0V.gene_models_main.bed.gz + aesev.CIAT22838.gnm1.ann1.ZM3R.gene_models_main.bed.gz + Arachis.pan3.BYQ9.pctl30_named_protein.bed.gz + cerca.ISC453364.gnm3.ann1.3N1M.gene_models_main.bed.gz + chafa.ISC494698.gnm1.ann1.G7XW.gene_models_main.bed.gz + Cicer.pan3.JVTK.pctl25_named_protein.bed.gz + dalod.SKLTGB.gnm1.ann1.R67B.gene_models_main.bed.gz + Glycine.pan5.MKRS.pctl25_named_protein.bed.gz + labpu.Highworth.gnm1.ann1.HJ3B.gene_models_main.bed.gz + lencu.CDC_Redberry.gnm2.ann1.5FB4.gene_models_main.bed.gz + lotja.MG20.gnm3.ann1.WF9B.gene_models_main.bed.gz + lupal.Amiga.gnm1.ann1.3GKS.gene_models_main.bed.gz + Medicago.pan3.9X6B.pctl25_named_protein.bed.gz + phach.longxuteng.gnm1.ann1.KGX9.gene_models_main.bed.gz + Phaseolus.pan3.LXKV.pctl25_named_protein.bed.gz + pissa.Cameor.gnm1.ann1.7SZR.gene_models_main.bed.gz + quisa.S10.gnm1.ann1.RQ4J.gene_models_main.bed.gz + sento.Myeongyun.gnm1.ann1.5WXB.gene_models_main.bed.gz + singl.CAF01.gnm1.ann1.WFKC.gene_models_main.bed.gz + tripr.MilvusB.gnm2.ann1.DFgp.gene_models_main.bed.gz + vicfa.Hedin2.gnm1.ann1.PTNK.gene_models_main.bed.gz + Vigna.pan3.V294.pctl25_named_protein.bed.gz +) + +cds_files=( + acacr.Acra3RX.gnm1.ann1.6C0V.cds_primary.fna.gz + aesev.CIAT22838.gnm1.ann1.ZM3R.cds_primary.fna.gz + Arachis.pan3.BYQ9.pctl30_named_cds.fna.gz + cerca.ISC453364.gnm3.ann1.3N1M.cds_primary.fna.gz + chafa.ISC494698.gnm1.ann1.G7XW.cds_primary.fna.gz + Cicer.pan3.JVTK.pctl25_named_cds.fna.gz + dalod.SKLTGB.gnm1.ann1.R67B.cds.fna.gz + Glycine.pan5.MKRS.pctl25_named_cds.fna.gz + labpu.Highworth.gnm1.ann1.HJ3B.cds_primary.fna.gz + lencu.CDC_Redberry.gnm2.ann1.5FB4.cds.fna.gz + lotja.MG20.gnm3.ann1.WF9B.cds_primary.fna.gz + lupal.Amiga.gnm1.ann1.3GKS.cds.fna.gz + Medicago.pan3.9X6B.pctl25_named_cds.fna.gz + phach.longxuteng.gnm1.ann1.KGX9.cds_primary.fna.gz + Phaseolus.pan3.LXKV.pctl25_named_cds.fna.gz + pissa.Cameor.gnm1.ann1.7SZR.cds_primary.fna.gz + quisa.S10.gnm1.ann1.RQ4J.cds_primary.fna.gz + sento.Myeongyun.gnm1.ann1.5WXB.cds.fna.gz + singl.CAF01.gnm1.ann1.WFKC.cds.fna.gz + tripr.MilvusB.gnm2.ann1.DFgp.cds_primary.fna.gz + vicfa.Hedin2.gnm1.ann1.PTNK.cds_primary.fna.gz + Vigna.pan3.V294.pctl25_named_cds.fna.gz +) + +protein_files=( + acacr.Acra3RX.gnm1.ann1.6C0V.protein_primary.faa.gz + aesev.CIAT22838.gnm1.ann1.ZM3R.protein_primary.faa.gz + Arachis.pan3.BYQ9.pctl30_named_protein.faa.gz + cerca.ISC453364.gnm3.ann1.3N1M.protein_primary.faa.gz + chafa.ISC494698.gnm1.ann1.G7XW.protein_primary.faa.gz + Cicer.pan3.JVTK.pctl25_named_protein.faa.gz + dalod.SKLTGB.gnm1.ann1.R67B.protein.faa.gz + Glycine.pan5.MKRS.pctl25_named_protein.faa.gz + labpu.Highworth.gnm1.ann1.HJ3B.protein_primary.faa.gz + lencu.CDC_Redberry.gnm2.ann1.5FB4.protein.faa.gz + lotja.MG20.gnm3.ann1.WF9B.protein_primary.faa.gz + lupal.Amiga.gnm1.ann1.3GKS.protein.faa.gz + Medicago.pan3.9X6B.pctl25_named_protein.faa.gz + phach.longxuteng.gnm1.ann1.KGX9.protein_primary.faa.gz + Phaseolus.pan3.LXKV.pctl25_named_protein.faa.gz + pissa.Cameor.gnm1.ann1.7SZR.protein_primary.faa.gz + quisa.S10.gnm1.ann1.RQ4J.protein_primary.faa.gz + sento.Myeongyun.gnm1.ann1.5WXB.protein.faa.gz + singl.CAF01.gnm1.ann1.WFKC.protein.faa.gz + tripr.MilvusB.gnm2.ann1.DFgp.protein_primary.faa.gz + vicfa.Hedin2.gnm1.ann1.PTNK.protein_primary.faa.gz + Vigna.pan3.V294.pctl25_named_protein.faa.gz +) + +### (optional) Extra BED & FASTA files +annotation_files_extra=( + arath.Col0.gnm9.ann11.KH24.gene_models_main.bed.gz + prupe.Lovell.gnm2.ann1.S2ZZ.gene_models_main.bed.gz + vitvi.PN40024.gnm2.ann1.V31M.gene_models_main.bed.gz +) + +cds_files_extra=( + arath.Col0.gnm9.ann11.KH24.cds_primary.fna.gz + prupe.Lovell.gnm2.ann1.S2ZZ.cds_primary.fna.gz + vitvi.PN40024.gnm2.ann1.V31M.cds_primary.fna.gz +) + +protein_files_extra=( + arath.Col0.gnm9.ann11.KH24.protein_primary.faa.gz + prupe.Lovell.gnm2.ann1.S2ZZ.protein_primary.faa.gz + vitvi.PN40024.gnm2.ann1.V31M.protein_primary.faa.gz +) + +### (optional) Fasta file of sequences to use exclude from annotations, e.g. transposable elements or plastid seuqences. +### If provided, sequences in this file will be used to exclude CDS sequences if any matches are >= TE_match_iden +exclude_TE_match_file=( + legume.TE_lib_2024.rpt.6WVT.fna.gz +) + diff --git a/config/family3_4_3.conf b/config/family3_4_3.conf index a6820ce..051d088 100644 --- a/config/family3_4_3.conf +++ b/config/family3_4_3.conf @@ -11,6 +11,8 @@ ks_block_wgd_cutoff="1.75" # Fallback, if a ks_cutoffs file is not provided. max_pair_ks="4.0" # Fallback value for excluding gene pairs, if a ks_cutoffs file is not provided. consen_prefix='Legume.fam3.' annot_str_regex='([^.]+\.[^.]+)\..+' +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it expected_quotas=( arath 2 diff --git a/config/family3_sup.conf b/config/family3_sup.conf index b149a15..65a8f19 100644 --- a/config/family3_sup.conf +++ b/config/family3_sup.conf @@ -3,11 +3,38 @@ clust_cov='0.40' TE_match_iden='0.40' consen_method='align_sample' # align_sample >> cons > hmmemit annot_str_regex='([^.]+\.[^.]+)\..+' +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it -##### (required) list of FASTA file paths -# List CDS files in the "cds_files" array -# and protein files in the "protein_files" array. -# The files should be available at that location, in the "data_sup/" or other designated directory. +##### (required) list of BED & FASTA file paths +# add file paths to the the annotation_files, cds_files, and protein_files arrays. +# The nth listed BED file corresponds to the nth listed FASTA file. + +annotation_files=( + arath.Col0.gnm9.ann11.KH24.gene_models_main.bed.gz + arahy.Tifrunner.gnm2.ann2.PVFB.gene_models_main.bed.gz + bauva.BV-YZ2020.gnm2.ann1.RJ1G.gene_models_main.bed.gz + cajca.ICPL87119.gnm2.ann1.L3ZH.gene_models_main.bed.gz + cerca.ISC453364.gnm3.ann1.3N1M.gene_models_main.bed.gz + chafa.ISC494698.gnm1.ann1.G7XW.gene_models_main.bed.gz + cicar.CDCFrontier.gnm3.ann1.NPD7.gene_models_main.bed.gz + glyma.Wm82.gnm4.ann1.T8TQ.gene_models_main.bed.gz + lencu.CDC_Redberry.gnm2.ann1.5FB4.gene_models_main.bed.gz + lotja.MG20.gnm3.ann1.WF9B.gene_models_main.bed.gz + lupal.Amiga.gnm1.ann1.3GKS.gene_models_main.bed.gz + medtr.A17_HM341.gnm4.ann2.G3ZY.gene_models_main.bed.gz + phalu.G27455.gnm1.ann1.JD7C.gene_models_main.bed.gz + phavu.G19833.gnm2.ann1.PB8d.gene_models_main.bed.gz + pissa.Cameor.gnm1.ann1.7SZR.gene_models_main.bed.gz + quisa.S10.gnm1.ann1.RQ4J.gene_models_main.bed.gz + sento.Myeongyun.gnm1.ann1.5WXB.gene_models_main.bed.gz + singl.CAF01.gnm1.ann1.WFKC.gene_models_main.bed.gz + tripr.MilvusB.gnm2.ann1.DFgp.gene_models_main.bed.gz + vicfa.Hedin2.gnm1.ann1.PTNK.gene_models_main.bed.gz + vigan.Gyeongwon.gnm3.ann1.3Nz5.gene_models_main.bed.gz + vigra.VC1973A.gnm7.ann1.RWBG.gene_models_main.bed.gz + vigun.IT97K-499-35.gnm1.ann2.FD7K.gene_models_main.bed.gz +) cds_files=( arath.Col0.gnm9.ann11.KH24.cds_primary.fna.gz diff --git a/config/family3_sup_chafa.conf b/config/family3_sup_chafa.conf new file mode 100644 index 0000000..70063b9 --- /dev/null +++ b/config/family3_sup_chafa.conf @@ -0,0 +1,71 @@ +consen_iden='0.30' +clust_cov='0.40' +TE_match_iden='0.40' +consen_method='align_sample' # align_sample >> cons > hmmemit +annot_str_regex='([^.]+\.[^.]+)\..+' +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it + +##### (required) list of BED & FASTA file paths +# add file paths to the the annotation_files, cds_files, and protein_files arrays. +# The nth listed BED file corresponds to the nth listed FASTA file. + +annotation_files=( + acacr.Acra3RX.gnm1.ann1.6C0V.gene_models_main.bed.gz + aesev.CIAT22838.gnm1.ann1.ZM3R.gene_models_main.bed.gz + arath.Col0.gnm9.ann11.KH24.gene_models_main.bed.gz + bauva.BV-YZ2020.gnm2.ann1.RJ1G.gene_models_main.bed.gz + cerca.ISC453364.gnm3.ann1.3N1M.gene_models_main.bed.gz + chafa.ISC494698.gnm1.ann1.G7XW.gene_models_main.bed.gz + lotja.MG20.gnm3.ann1.WF9B.gene_models_main.bed.gz + medtr.A17_HM341.gnm4.ann2.G3ZY.gene_models_main.bed.gz + phach.longxuteng.gnm1.ann1.KGX9.gene_models_main.bed.gz + phavu.G19833.gnm2.ann1.PB8d.gene_models_main.bed.gz + prupe.Lovell.gnm2.ann1.S2ZZ.gene_models_main.bed.gz + quisa.S10.gnm1.ann1.RQ4J.gene_models_main.bed.gz + sento.Myeongyun.gnm1.ann1.5WXB.gene_models_main.bed.gz + singl.CAF01.gnm1.ann1.WFKC.gene_models_main.bed.gz + vitvi.PN40024.gnm2.ann1.V31M.gene_models_main.bed.gz +) + +cds_files=( + acacr.Acra3RX.gnm1.ann1.6C0V.cds_primary.fna.gz + aesev.CIAT22838.gnm1.ann1.ZM3R.cds_primary.fna.gz + arath.Col0.gnm9.ann11.KH24.cds_primary.fna.gz + bauva.BV-YZ2020.gnm2.ann1.RJ1G.cds_primary.fna.gz + cerca.ISC453364.gnm3.ann1.3N1M.cds_primary.fna.gz + chafa.ISC494698.gnm1.ann1.G7XW.cds_primary.fna.gz + lotja.MG20.gnm3.ann1.WF9B.cds_primary.fna.gz + medtr.A17_HM341.gnm4.ann2.G3ZY.cds_primary.fna.gz + phach.longxuteng.gnm1.ann1.KGX9.cds_primary.fna.gz + phavu.G19833.gnm2.ann1.PB8d.cds_primary.fna.gz + prupe.Lovell.gnm2.ann1.S2ZZ.cds_primary.fna.gz + quisa.S10.gnm1.ann1.RQ4J.cds_primary.fna.gz + sento.Myeongyun.gnm1.ann1.5WXB.cds.fna.gz + singl.CAF01.gnm1.ann1.WFKC.cds.fna.gz + vitvi.PN40024.gnm2.ann1.V31M.cds_primary.fna.gz +) + +protein_files=( + acacr.Acra3RX.gnm1.ann1.6C0V.protein_primary.faa.gz + aesev.CIAT22838.gnm1.ann1.ZM3R.protein_primary.faa.gz + arath.Col0.gnm9.ann11.KH24.protein_primary.faa.gz + bauva.BV-YZ2020.gnm2.ann1.RJ1G.protein_primary.faa.gz + cerca.ISC453364.gnm3.ann1.3N1M.protein_primary.faa.gz + chafa.ISC494698.gnm1.ann1.G7XW.protein_primary.faa.gz + medtr.A17_HM341.gnm4.ann2.G3ZY.protein_primary.faa.gz + lotja.MG20.gnm3.ann1.WF9B.protein_primary.faa.gz + phavu.G19833.gnm2.ann1.PB8d.protein_primary.faa.gz + phach.longxuteng.gnm1.ann1.KGX9.protein_primary.faa.gz + prupe.Lovell.gnm2.ann1.S2ZZ.protein_primary.faa.gz + quisa.S10.gnm1.ann1.RQ4J.protein_primary.faa.gz + sento.Myeongyun.gnm1.ann1.5WXB.protein.faa.gz + singl.CAF01.gnm1.ann1.WFKC.protein.faa.gz + vitvi.PN40024.gnm2.ann1.V31M.protein_primary.faa.gz +) + +### (optional) Fasta file of sequences to use exclude from annotations, e.g. transposable elements or plastid seuqences. +### If provided, sequences in this file will be used to exclude CDS sequences if any matches are >= TE_match_iden +exclude_TE_match_file=( + legume.TE_lib_2024.rpt.6WVT.fna.gz +) diff --git a/config/family3_sup_ur.conf b/config/family3_sup_ur.conf index 7b45dca..0802e6e 100644 --- a/config/family3_sup_ur.conf +++ b/config/family3_sup_ur.conf @@ -3,11 +3,25 @@ clust_cov='0.40' TE_match_iden='0.40' consen_method='align_sample' # align_sample >> cons > hmmemit annot_str_regex='([^.]+\.[^.]+)\..+' +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it -##### (required) list of FASTA file paths -# List CDS files in the "cds_files" array -# and protein files in the "protein_files" array. -# The files should be available at that location, in the "data_sup/" or other designated directory. +##### (required) list of BED & FASTA file paths +# add file paths to the the annotation_files, cds_files, and protein_files arrays. +# The nth listed BED file corresponds to the nth listed FASTA file. + +annotation_files=( + arath.Col0.gnm9.ann11.KH24.gene_models_main.bed.gz + bauva.BV-YZ2020.gnm2.ann1.RJ1G.gene_models_main.bed.gz + cerca.ISC453364.gnm3.ann1.3N1M.gene_models_main.bed.gz + medtr.A17_HM341.gnm4.ann2.G3ZY.gene_models_main.bed.gz + phavu.G19833.gnm2.ann1.PB8d.gene_models_main.bed.gz + prupe.Lovell.gnm2.ann1.S2ZZ.gene_models_main.bed.gz + quisa.S10.gnm1.ann1.RQ4J.gene_models_main.bed.gz + sento.Myeongyun.gnm1.ann1.5WXB.gene_models_main.bed.gz + singl.CAF01.gnm1.ann1.WFKC.gene_models_main.bed.gz + vitvi.PN40024.gnm2.ann1.V31M.gene_models_main.bed.gz +) cds_files=( arath.Col0.gnm9.ann11.KH24.cds_primary.fna.gz diff --git a/config/family3_ur.conf b/config/family3_ur.conf index 7b45dca..11fba3e 100644 --- a/config/family3_ur.conf +++ b/config/family3_ur.conf @@ -3,6 +3,8 @@ clust_cov='0.40' TE_match_iden='0.40' consen_method='align_sample' # align_sample >> cons > hmmemit annot_str_regex='([^.]+\.[^.]+)\..+' +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it ##### (required) list of FASTA file paths # List CDS files in the "cds_files" array diff --git a/config/family_7_3.conf b/config/family_7_3.conf index c78d43a..9de8302 100644 --- a/config/family_7_3.conf +++ b/config/family_7_3.conf @@ -11,6 +11,8 @@ ks_block_wgd_cutoff="1.75" # Fallback, if a ks_cutoffs file is not provided. max_pair_ks="4.0" # Fallback value for excluding gene pairs, if a ks_cutoffs file is not provided. consen_prefix='Legume.fam3.' annot_str_regex='([^.]+\.[^.]+)\..+' +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it expected_quotas=( arath 2 diff --git a/config/family_ur.conf b/config/family_ur.conf index bf39910..d16020b 100644 --- a/config/family_ur.conf +++ b/config/family_ur.conf @@ -11,6 +11,8 @@ ks_block_wgd_cutoff="1.75" # Fallback, if a ks_cutoffs file is not provided. max_pair_ks="4.0" # Fallback value for excluding gene pairs, if a ks_cutoffs file is not provided. consen_prefix='Legume.fam3.' annot_str_regex='([^.]+\.[^.]+)\..+' +min_align_count="4" # Minimum number of sequences in a family to trigger alignments, modeling, and trees +min_annots_in_align="2" # Minimum number of distinct annotation groups in an alignment to retain it expected_quotas=( arath 2 diff --git a/get_data/Glycine5_21_29_6.mk b/get_data/Glycine5_21_30_6.mk similarity index 98% rename from get_data/Glycine5_21_29_6.mk rename to get_data/Glycine5_21_30_6.mk index b42a58f..75f1fff 100644 --- a/get_data/Glycine5_21_29_6.mk +++ b/get_data/Glycine5_21_30_6.mk @@ -107,6 +107,9 @@ glyma.Wm82.gnm4.ann1.T8TQ.protein_primary.faa.gz \ glyma.Wm82.gnm5.ann1.J7HW.cds.fna.gz \ glyma.Wm82.gnm5.ann1.J7HW.gene_models_main.bed.gz \ glyma.Wm82.gnm5.ann1.J7HW.protein.faa.gz \ +glyma.Wm82.gnm6.ann1.PKSW.cds_primary.fna.gz \ +glyma.Wm82.gnm6.ann1.PKSW.gene_models_main.bed.gz \ +glyma.Wm82.gnm6.ann1.PKSW.protein_primary.faa.gz \ glyma.Wm82_IGA1008.gnm1.ann1.FGN6.cds.fna.gz \ glyma.Wm82_IGA1008.gnm1.ann1.FGN6.gene_models_main.bed.gz \ glyma.Wm82_IGA1008.gnm1.ann1.FGN6.protein.faa.gz \ diff --git a/get_data/family3_22_3.mk b/get_data/family3_22_3.mk new file mode 100644 index 0000000..1d76515 --- /dev/null +++ b/get_data/family3_22_3.mk @@ -0,0 +1,80 @@ +FILES ::= \ +acacr.Acra3RX.gnm1.ann1.6C0V.cds_primary.fna.gz \ +acacr.Acra3RX.gnm1.ann1.6C0V.gene_models_main.bed.gz \ +acacr.Acra3RX.gnm1.ann1.6C0V.protein_primary.faa.gz \ +aesev.CIAT22838.gnm1.ann1.ZM3R.cds_primary.fna.gz \ +aesev.CIAT22838.gnm1.ann1.ZM3R.gene_models_main.bed.gz \ +aesev.CIAT22838.gnm1.ann1.ZM3R.protein_primary.faa.gz \ +Arachis.pan3.BYQ9.pctl30_named_cds.fna.gz \ +Arachis.pan3.BYQ9.pctl30_named_protein.bed.gz \ +Arachis.pan3.BYQ9.pctl30_named_protein.faa.gz \ +arath.Col0.gnm9.ann11.KH24.cds_primary.fna.gz \ +arath.Col0.gnm9.ann11.KH24.gene_models_main.bed.gz \ +arath.Col0.gnm9.ann11.KH24.protein_primary.faa.gz \ +cerca.ISC453364.gnm3.ann1.3N1M.cds_primary.fna.gz \ +cerca.ISC453364.gnm3.ann1.3N1M.gene_models_main.bed.gz \ +cerca.ISC453364.gnm3.ann1.3N1M.protein_primary.faa.gz \ +chafa.ISC494698.gnm1.ann1.G7XW.cds_primary.fna.gz \ +chafa.ISC494698.gnm1.ann1.G7XW.gene_models_main.bed.gz \ +chafa.ISC494698.gnm1.ann1.G7XW.protein_primary.faa.gz \ +Cicer.pan3.JVTK.pctl25_named_cds.fna.gz \ +Cicer.pan3.JVTK.pctl25_named_protein.bed.gz \ +Cicer.pan3.JVTK.pctl25_named_protein.faa.gz \ +dalod.SKLTGB.gnm1.ann1.R67B.cds.fna.gz \ +dalod.SKLTGB.gnm1.ann1.R67B.gene_models_main.bed.gz \ +dalod.SKLTGB.gnm1.ann1.R67B.protein.faa.gz \ +Glycine.pan5.MKRS.pctl25_named_cds.fna.gz \ +Glycine.pan5.MKRS.pctl25_named_protein.bed.gz \ +Glycine.pan5.MKRS.pctl25_named_protein.faa.gz \ +labpu.Highworth.gnm1.ann1.HJ3B.cds_primary.fna.gz \ +labpu.Highworth.gnm1.ann1.HJ3B.gene_models_main.bed.gz \ +labpu.Highworth.gnm1.ann1.HJ3B.protein_primary.faa.gz \ +lencu.CDC_Redberry.gnm2.ann1.5FB4.cds.fna.gz \ +lencu.CDC_Redberry.gnm2.ann1.5FB4.gene_models_main.bed.gz \ +lencu.CDC_Redberry.gnm2.ann1.5FB4.protein.faa.gz \ +lotja.MG20.gnm3.ann1.WF9B.cds_primary.fna.gz \ +lotja.MG20.gnm3.ann1.WF9B.gene_models_main.bed.gz \ +lotja.MG20.gnm3.ann1.WF9B.protein_primary.faa.gz \ +lupal.Amiga.gnm1.ann1.3GKS.cds.fna.gz \ +lupal.Amiga.gnm1.ann1.3GKS.gene_models_main.bed.gz \ +lupal.Amiga.gnm1.ann1.3GKS.protein.faa.gz \ +Medicago.pan3.9X6B.pctl25_named_cds.fna.gz \ +Medicago.pan3.9X6B.pctl25_named_protein.bed.gz \ +Medicago.pan3.9X6B.pctl25_named_protein.faa.gz \ +Phaseolus.pan3.LXKV.pctl25_named_cds.fna.gz \ +Phaseolus.pan3.LXKV.pctl25_named_protein.bed.gz \ +Phaseolus.pan3.LXKV.pctl25_named_protein.faa.gz \ +pissa.Cameor.gnm1.ann1.7SZR.cds_primary.fna.gz \ +pissa.Cameor.gnm1.ann1.7SZR.gene_models_main.bed.gz \ +pissa.Cameor.gnm1.ann1.7SZR.protein_primary.faa.gz \ +phach.longxuteng.gnm1.ann1.KGX9.cds_primary.fna.gz \ +phach.longxuteng.gnm1.ann1.KGX9.gene_models_main.bed.gz \ +phach.longxuteng.gnm1.ann1.KGX9.protein_primary.faa.gz \ +prupe.Lovell.gnm2.ann1.S2ZZ.cds_primary.fna.gz \ +prupe.Lovell.gnm2.ann1.S2ZZ.gene_models_main.bed.gz \ +prupe.Lovell.gnm2.ann1.S2ZZ.protein_primary.faa.gz \ +quisa.S10.gnm1.ann1.RQ4J.cds_primary.fna.gz \ +quisa.S10.gnm1.ann1.RQ4J.gene_models_main.bed.gz \ +quisa.S10.gnm1.ann1.RQ4J.protein_primary.faa.gz \ +sento.Myeongyun.gnm1.ann1.5WXB.cds.fna.gz \ +sento.Myeongyun.gnm1.ann1.5WXB.gene_models_main.bed.gz \ +sento.Myeongyun.gnm1.ann1.5WXB.protein.faa.gz \ +singl.CAF01.gnm1.ann1.WFKC.cds.fna.gz \ +singl.CAF01.gnm1.ann1.WFKC.gene_models_main.bed.gz \ +singl.CAF01.gnm1.ann1.WFKC.protein.faa.gz \ +tripr.MilvusB.gnm2.ann1.DFgp.cds_primary.fna.gz \ +tripr.MilvusB.gnm2.ann1.DFgp.gene_models_main.bed.gz \ +tripr.MilvusB.gnm2.ann1.DFgp.protein_primary.faa.gz \ +vicfa.Hedin2.gnm1.ann1.PTNK.cds_primary.fna.gz \ +vicfa.Hedin2.gnm1.ann1.PTNK.gene_models_main.bed.gz \ +vicfa.Hedin2.gnm1.ann1.PTNK.protein_primary.faa.gz \ +Vigna.pan3.V294.pctl25_named_cds.fna.gz \ +Vigna.pan3.V294.pctl25_named_protein.bed.gz \ +Vigna.pan3.V294.pctl25_named_protein.faa.gz \ +vitvi.PN40024.gnm2.ann1.V31M.cds_primary.fna.gz \ +vitvi.PN40024.gnm2.ann1.V31M.gene_models_main.bed.gz \ +vitvi.PN40024.gnm2.ann1.V31M.protein_primary.faa.gz \ +legume.TE_lib_2024.rpt.6WVT.fna.gz + +include $(dir $(realpath $(lastword $(MAKEFILE_LIST))))/common.mk + diff --git a/get_data/family3_sup.mk b/get_data/family3_sup.mk index 3d8ca8f..7af6074 100644 --- a/get_data/family3_sup.mk +++ b/get_data/family3_sup.mk @@ -1,50 +1,88 @@ FILES ::= \ +acacr.Acra3RX.gnm1.ann1.6C0V.cds_primary.fna.gz \ +acacr.Acra3RX.gnm1.ann1.6C0V.gene_models_main.bed.gz \ +acacr.Acra3RX.gnm1.ann1.6C0V.protein_primary.faa.gz \ +aesev.CIAT22838.gnm1.ann1.ZM3R.cds_primary.fna.gz \ +aesev.CIAT22838.gnm1.ann1.ZM3R.gene_models_main.bed.gz \ +aesev.CIAT22838.gnm1.ann1.ZM3R.protein_primary.faa.gz \ arahy.Tifrunner.gnm2.ann2.PVFB.cds.fna.gz \ +arahy.Tifrunner.gnm2.ann2.PVFB.gene_models_main.bed.gz \ arahy.Tifrunner.gnm2.ann2.PVFB.protein.faa.gz \ arath.Col0.gnm9.ann11.KH24.cds_primary.fna.gz \ +arath.Col0.gnm9.ann11.KH24.gene_models_main.bed.gz \ arath.Col0.gnm9.ann11.KH24.protein_primary.faa.gz \ bauva.BV-YZ2020.gnm2.ann1.RJ1G.cds_primary.fna.gz \ +bauva.BV-YZ2020.gnm2.ann1.RJ1G.gene_models_main.bed.gz \ bauva.BV-YZ2020.gnm2.ann1.RJ1G.protein_primary.faa.gz \ cajca.ICPL87119.gnm2.ann1.L3ZH.cds_primary.fna.gz \ +cajca.ICPL87119.gnm2.ann1.L3ZH.gene_models_main.bed.gz \ cajca.ICPL87119.gnm2.ann1.L3ZH.protein_primary.faa.gz \ cerca.ISC453364.gnm3.ann1.3N1M.cds_primary.fna.gz \ +cerca.ISC453364.gnm3.ann1.3N1M.gene_models_main.bed.gz \ cerca.ISC453364.gnm3.ann1.3N1M.protein_primary.faa.gz \ chafa.ISC494698.gnm1.ann1.G7XW.cds_primary.fna.gz \ +chafa.ISC494698.gnm1.ann1.G7XW.gene_models_main.bed.gz \ chafa.ISC494698.gnm1.ann1.G7XW.protein_primary.faa.gz \ cicar.CDCFrontier.gnm3.ann1.NPD7.cds.fna.gz \ +cicar.CDCFrontier.gnm3.ann1.NPD7.gene_models_main.bed.gz \ cicar.CDCFrontier.gnm3.ann1.NPD7.protein.faa.gz \ glyma.Wm82.gnm4.ann1.T8TQ.cds_primary.fna.gz \ +glyma.Wm82.gnm4.ann1.T8TQ.gene_models_main.bed.gz \ glyma.Wm82.gnm4.ann1.T8TQ.protein_primary.faa.gz \ lencu.CDC_Redberry.gnm2.ann1.5FB4.cds.fna.gz \ +lencu.CDC_Redberry.gnm2.ann1.5FB4.gene_models_main.bed.gz \ lencu.CDC_Redberry.gnm2.ann1.5FB4.protein.faa.gz \ lotja.MG20.gnm3.ann1.WF9B.cds_primary.fna.gz \ +lotja.MG20.gnm3.ann1.WF9B.gene_models_main.bed.gz \ lotja.MG20.gnm3.ann1.WF9B.protein_primary.faa.gz \ lupal.Amiga.gnm1.ann1.3GKS.cds.fna.gz \ +lupal.Amiga.gnm1.ann1.3GKS.gene_models_main.bed.gz \ lupal.Amiga.gnm1.ann1.3GKS.protein.faa.gz \ medtr.A17_HM341.gnm4.ann2.G3ZY.cds_primary.fna.gz \ +medtr.A17_HM341.gnm4.ann2.G3ZY.gene_models_main.bed.gz \ medtr.A17_HM341.gnm4.ann2.G3ZY.protein_primary.faa.gz \ +phach.longxuteng.gnm1.ann1.KGX9.cds_primary.fna.gz \ +phach.longxuteng.gnm1.ann1.KGX9.gene_models_main.bed.gz \ +phach.longxuteng.gnm1.ann1.KGX9.protein_primary.faa.gz \ phalu.G27455.gnm1.ann1.JD7C.cds_primary.fna.gz \ +phalu.G27455.gnm1.ann1.JD7C.gene_models_main.bed.gz \ phalu.G27455.gnm1.ann1.JD7C.protein_primary.faa.gz \ phavu.G19833.gnm2.ann1.PB8d.cds_primary.fna.gz \ +phavu.G19833.gnm2.ann1.PB8d.gene_models_main.bed.gz \ phavu.G19833.gnm2.ann1.PB8d.protein_primary.faa.gz \ pissa.Cameor.gnm1.ann1.7SZR.cds_primary.fna.gz \ +pissa.Cameor.gnm1.ann1.7SZR.gene_models_main.bed.gz \ pissa.Cameor.gnm1.ann1.7SZR.protein_primary.faa.gz \ +prupe.Lovell.gnm2.ann1.S2ZZ.cds_primary.fna.gz \ +prupe.Lovell.gnm2.ann1.S2ZZ.gene_models_main.bed.gz \ +prupe.Lovell.gnm2.ann1.S2ZZ.protein_primary.faa.gz \ quisa.S10.gnm1.ann1.RQ4J.cds_primary.fna.gz \ +quisa.S10.gnm1.ann1.RQ4J.gene_models_main.bed.gz \ quisa.S10.gnm1.ann1.RQ4J.protein_primary.faa.gz \ sento.Myeongyun.gnm1.ann1.5WXB.cds.fna.gz \ +sento.Myeongyun.gnm1.ann1.5WXB.gene_models_main.bed.gz \ sento.Myeongyun.gnm1.ann1.5WXB.protein.faa.gz \ singl.CAF01.gnm1.ann1.WFKC.cds.fna.gz \ +singl.CAF01.gnm1.ann1.WFKC.gene_models_main.bed.gz \ singl.CAF01.gnm1.ann1.WFKC.protein.faa.gz \ tripr.MilvusB.gnm2.ann1.DFgp.cds_primary.fna.gz \ +tripr.MilvusB.gnm2.ann1.DFgp.gene_models_main.bed.gz \ tripr.MilvusB.gnm2.ann1.DFgp.protein_primary.faa.gz \ vicfa.Hedin2.gnm1.ann1.PTNK.cds_primary.fna.gz \ +vicfa.Hedin2.gnm1.ann1.PTNK.gene_models_main.bed.gz \ vicfa.Hedin2.gnm1.ann1.PTNK.protein_primary.faa.gz \ vigan.Gyeongwon.gnm3.ann1.3Nz5.cds_primary.fna.gz \ +vigan.Gyeongwon.gnm3.ann1.3Nz5.gene_models_main.bed.gz \ vigan.Gyeongwon.gnm3.ann1.3Nz5.protein_primary.faa.gz \ vigra.VC1973A.gnm7.ann1.RWBG.cds.fna.gz \ +vigra.VC1973A.gnm7.ann1.RWBG.gene_models_main.bed.gz \ vigra.VC1973A.gnm7.ann1.RWBG.protein.faa.gz \ vigun.IT97K-499-35.gnm1.ann2.FD7K.cds_primary.fna.gz \ +vigun.IT97K-499-35.gnm1.ann2.FD7K.gene_models_main.bed.gz \ vigun.IT97K-499-35.gnm1.ann2.FD7K.protein_primary.faa.gz \ +vitvi.PN40024.gnm2.ann1.V31M.cds_primary.fna.gz \ +vitvi.PN40024.gnm2.ann1.V31M.gene_models_main.bed.gz \ +vitvi.PN40024.gnm2.ann1.V31M.protein_primary.faa.gz \ legume.TE_lib_2024.rpt.6WVT.fna.gz include $(dir $(realpath $(lastword $(MAKEFILE_LIST))))/common.mk diff --git a/get_data/family3_sup_ur.mk b/get_data/family3_sup_ur.mk index 9b0449d..4ecfa85 100644 --- a/get_data/family3_sup_ur.mk +++ b/get_data/family3_sup_ur.mk @@ -1,22 +1,32 @@ FILES ::= \ +arath.Col0.gnm9.ann11.KH24.gene_models_main.bed.gz \ arath.Col0.gnm9.ann11.KH24.cds_primary.fna.gz \ arath.Col0.gnm9.ann11.KH24.protein_primary.faa.gz \ +bauva.BV-YZ2020.gnm2.ann1.RJ1G.gene_models_main.bed.gz \ bauva.BV-YZ2020.gnm2.ann1.RJ1G.cds_primary.fna.gz \ bauva.BV-YZ2020.gnm2.ann1.RJ1G.protein_primary.faa.gz \ +cerca.ISC453364.gnm3.ann1.3N1M.gene_models_main.bed.gz \ cerca.ISC453364.gnm3.ann1.3N1M.cds_primary.fna.gz \ cerca.ISC453364.gnm3.ann1.3N1M.protein_primary.faa.gz \ +medtr.A17_HM341.gnm4.ann2.G3ZY.gene_models_main.bed.gz \ medtr.A17_HM341.gnm4.ann2.G3ZY.cds_primary.fna.gz \ medtr.A17_HM341.gnm4.ann2.G3ZY.protein_primary.faa.gz \ +phavu.G19833.gnm2.ann1.PB8d.gene_models_main.bed.gz \ phavu.G19833.gnm2.ann1.PB8d.cds_primary.fna.gz \ phavu.G19833.gnm2.ann1.PB8d.protein_primary.faa.gz \ +prupe.Lovell.gnm2.ann1.S2ZZ.gene_models_main.bed.gz \ prupe.Lovell.gnm2.ann1.S2ZZ.cds_primary.fna.gz \ prupe.Lovell.gnm2.ann1.S2ZZ.protein_primary.faa.gz \ +quisa.S10.gnm1.ann1.RQ4J.gene_models_main.bed.gz \ quisa.S10.gnm1.ann1.RQ4J.cds_primary.fna.gz \ quisa.S10.gnm1.ann1.RQ4J.protein_primary.faa.gz \ +sento.Myeongyun.gnm1.ann1.5WXB.gene_models_main.bed.gz \ sento.Myeongyun.gnm1.ann1.5WXB.cds.fna.gz \ sento.Myeongyun.gnm1.ann1.5WXB.protein.faa.gz \ +singl.CAF01.gnm1.ann1.WFKC.gene_models_main.bed.gz \ singl.CAF01.gnm1.ann1.WFKC.cds.fna.gz \ singl.CAF01.gnm1.ann1.WFKC.protein.faa.gz \ +vitvi.PN40024.gnm2.ann1.V31M.gene_models_main.bed.gz \ vitvi.PN40024.gnm2.ann1.V31M.cds_primary.fna.gz \ vitvi.PN40024.gnm2.ann1.V31M.protein_primary.faa.gz \ legume.TE_lib_2024.rpt.6WVT.fna.gz diff --git a/manifests/MANIFEST_output_fam.yml b/manifests/MANIFEST_output_fam.yml index ada241e..65b1270 100644 --- a/manifests/MANIFEST_output_fam.yml +++ b/manifests/MANIFEST_output_fam.yml @@ -17,7 +17,7 @@ the set ID in the first column and genes in the second. 18_syn_pan_aug_extra.table.tsv: Tabular format of all gene families, in columns by species. Multiple paralogs from a species in a gene family are comma-separated within the column of the species. -19_pan_aug_leftover_merged_prot: Directory of fasta files for all families corresponding with 18_syn_pan_aug_extra.clust.tsv +19_palm: (short for pan_aug_leftover_merged) - directory of fasta files for all families corresponding with 18_syn_pan_aug_extra.clust.tsv stats.family_4_3.txt: Descriptive statistics about program parameters, input sequences, and Gene family products.