legumeinfo · nathanweeks · Mar 1, 2024 · Feb 21, 2024 · Feb 23, 2024 · Feb 23, 2024
diff --git a/README.md b/README.md
@@ -185,7 +185,8 @@ Subcommands for the **pangene** workflow, `pandagma pan`, in order they are usua
 Subcommands for the **gene family** workflow, `pandagma fam`, in order they are usually run:
 
 ```
-  Run these first (if using ks_calc)
+  Run these first if you will provide a ks_peaks.tsv file. (Otherwise, run all main steps;
+  Ks filtering will then use the parameters ks_block_wgd_cutoff and max_pair_ks.)
                 all - All of the steps below, except for ks_filter and clean
                         (Or equivalently: omit the -s flag; \"all\" is default).
              ingest - Prepare the assembly and annotation files for analysis.
@@ -195,7 +196,7 @@ Subcommands for the **gene family** workflow, `pandagma fam`, in order they are
             ks_calc - Calculation of Ks values on gene pairs from DAGchainer output.
 
   Evaluate the stats/ks_histplots.tsv and stats/ks_peaks_auto.tsv files and
-  put ks_peaks.tsv into the work directory, then run the following commands:
+  put ks_peaks.tsv into the ${WORK_DIR}/stats directory, then run the following commands:
           ks_filter - Filtering based on provided ks_peaks.tsv file (assumes prior ks_calc step)
                 mcl - Derive clusters, with Markov clustering.
            consense - Calculate a consensus sequences from each pan-gene set,
@@ -369,11 +370,11 @@ ks_block_wgd_cutoff - Fallback, if a ks_peaks.tsv file is not provided. [1.75]
    remaining steps (see **8** below).
 
     An intermediate output file, `stats/ks_peaks_auto.tsv`, is written to the work directory
-    This should be examined for biological plausibility (look at Ks peak values in column 3),
-    along with the other Ks results (histograms) in the work_pandagma/stats subdirectory.
+    This should be examined for biological plausibility, along with the other 
+    Ks results (histograms) in the work_pandagma/stats subdirectory.
     The `ks_peaks_auto.tsv` file can be examined and used to create a file named `ks_peaks.tsv`
     with changes relative to `ks_peaks_auto.tsv` if necessary to reflect known or suspected WGD histories. 
-    If stats/ks_histplots.tsv is not provided, then Ks filtering will be done using values provided
+    If stats/ks_peaks.tsv is not provided, then Ks filtering will be done using values provided
     in the config file for ks_block_wgd_cutoff and max_pair_ks.
 
 4. Run steps `ks_filter` through `summarize`.

diff --git a/batch_fam_example_singularity.sh b/batch_fam_example_singularity.sh
@@ -39,7 +39,7 @@ singularity exec $IMAGE pandagma fam -c $CONFIG
 #singularity exec $IMAGE pandagma fam -c $CONFIG -s consense
 #singularity exec $IMAGE pandagma fam -c $CONFIG -s cluster_rest
 #singularity exec $IMAGE pandagma fam -c $CONFIG -s add_extra
-#singularity exec $IMAGE pandagma fam -c $CONFIG -s align
+#singularity exec $IMAGE pandagma fam -c $CONFIG -s align_protein
 #singularity exec $IMAGE pandagma fam -c $CONFIG -s model_and_trim
 #singularity exec $IMAGE pandagma fam -c $CONFIG -s calc_trees
 #singularity exec $IMAGE pandagma fam -c $CONFIG -s summarize

diff --git a/batch_fam_prod.sh b/batch_fam_prod.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+#SBATCH -A m4440
+#SBATCH -q regular
+#SBATCH -N 1
+#SBATCH -n 30    #  number of cores/tasks in this job
+#SBATCH -t 23:00:00
+#SBATCH -C cpu
+#SBATCH -J pand-fam2
+#SBATCH -o %x_%j.out
+#SBATCH -e %x_%j.err
+
+# Slurm batch script for the pandagma gene-family workflow ("pandagma fam").
+# Run it from the directory that contains bin/ and config/: $PWD/bin is appended
+# to PATH and the configuration is read from $PWD/config/family3_22_3.conf.
+# Uncomment the steps to run; as written, only xfr_aligns_trees and summarize run.
+
+set -o errexit
+set -o nounset
+set -o xtrace
+
+date   # print timestamp
+
+# If using conda environment for dependencies:
+module load conda
+conda activate pandagma
+
+PDGPATH=$PWD
+CONFIG=$PWD/config/family3_22_3.conf
+
+echo "Config: $CONFIG"
+
+export PATH="$PATH:$PDGPATH/bin"
+echo "PATH: $PATH"
+
+##########
+# Test PATH; with errexit set, the job stops here if either program is not found
+command -v pandagma
+command -v calc_ks_from_dag.pl
+
+##########
+## Fetch relevant data files; e.g.
+#mkdir -p data
+#make -C data -f "$PWD/get_data/family3_22_3.mk"
+
+##########
+## Filter transposable elements
+#pandagma TEfilter -c "$CONFIG"
+
+##########
+## Run all main steps, assuming input data files exist in ./data
+## Work directory will be ./work_pandagma
+## Output will go to ./out_pandagma
+#pandagma fam -c "$CONFIG" -d data_TEfilter
+
+##########
+## Run individual steps
+#pandagma fam -c "$CONFIG" -s ingest -d data_TEfilter
+#pandagma fam -c "$CONFIG" -s mmseqs
+#pandagma fam -c "$CONFIG" -s filter
+#pandagma fam -c "$CONFIG" -s dagchainer
+#pandagma fam -c "$CONFIG" -s ks_calc
+#pandagma fam -c "$CONFIG" -s ks_filter
+#pandagma fam -c "$CONFIG" -s mcl
+#pandagma fam -c "$CONFIG" -s consense
+#pandagma fam -c "$CONFIG" -s cluster_rest
+#pandagma fam -c "$CONFIG" -s add_extra
+#pandagma fam -c "$CONFIG" -s tabularize
+#pandagma fam -c "$CONFIG" -s align_protein
+#pandagma fam -c "$CONFIG" -s model_and_trim
+#pandagma fam -c "$CONFIG" -s calc_trees
+pandagma fam -c "$CONFIG" -s xfr_aligns_trees
+pandagma fam -c "$CONFIG" -s summarize
+
+##########
+## Optional work-directory cleanup steps
+#pandagma fam -c "$CONFIG" -s clean
+#rm -rf ./work_pandagma
+
+date   # print timestamp
+
diff --git a/batch_pan_example_conda.sh b/batch_pan_example_conda.sh
@@ -49,7 +49,8 @@ which calc_ks_from_dag.pl
 
 ##########
 # Optional alignment and tree-construction steps
-#pandagma pan -c $CONFIG -s align
+#pandagma pan -c $CONFIG -s align_cds
+#pandagma pan -c $CONFIG -s align_protein
 #pandagma pan -c $CONFIG -s model_and_trim
 #pandagma pan -c $CONFIG -s calc_trees
 pandagma pan -c $CONFIG -s xfr_aligns_trees

diff --git a/bin/fetch-datastore.sh b/bin/fetch-datastore.sh
@@ -8,11 +8,13 @@ readonly DATAFILE=${1}
 
 # adjust URL for collections that are located in the annex
 case ${DATAFILE} in
+  acacr.Acra3RX.gnm1.ann1.6C0V.*|\
   arahy.Tifrunner.gnm1.ann2.TN8K.*|\
   arath.Col0.gnm9.ann11.KH24.*|\
   bauva.BV-YZ2020.gnm2.ann1.RJ1G.*|\
   chafa.ISC494698.gnm1.ann1.G7XW.*|\
   dalod.SKLTGB.gnm1.ann1.R67B.*|\
+  phach.longxuteng.gnm1.ann1.KGX9.*|\
   prupe.Lovell.gnm2.ann1.S2ZZ.*|\
   quisa.S10.gnm1.ann1.RQ4J.*|\
   sento.Myeongyun.gnm1.ann1.5WXB.*|\
@@ -30,6 +32,7 @@ collection_type=annotations
 
 case ${genspa} in
   [A-Z]*) genus=${genspa} species=GENUS collection_type=pangenes collection=${1%.*.*.*} ;;
+  acacr) genus=Acacia species=crassicarpa ;;
   aesev) genus=Aeschynomene species=evenia ;;
   aradu) genus=Arachis species=duranensis ;;
   arahy) genus=Arachis species=hypogaea ;;
@@ -52,13 +55,15 @@ case ${genspa} in
   glyso) genus=Glycine species=soja ;;
   glyst) genus=Glycine species=stenophita ;;
   glysy) genus=Glycine species=syndetika ;;
+  labpu) genus=Lablab species=purpureus ;;
   legume) genus=LEGUMES species=Fabaceae ;;
   lencu) genus=Lens species=culinaris ;;
   lotja) genus=Lotus species=japonicus ;;
   lupal) genus=Lupinus species=albus ;;
   medsa) genus=Medicago species=sativa ;;
   medtr) genus=Medicago species=truncatula ;;
   phaac) genus=Phaseolus species=acutifolius ;;
+  phach) genus=Phanera species=championii ;;
   phalu) genus=Phaseolus species=lunatus ;;
   phavu) genus=Phaseolus species=vulgaris ;;
   pissa) genus=Pisum species=sativum ;;
@@ -88,6 +93,4 @@ if [[ "$collection" == *"XinJiangDaYe"* ]]; then
   collection="XinJiangDaYe.gnm1.ann1.RKB9"
 fi
 
-#echo "${DATASTORE}/${genus}/${species}/${collection_type}/${collection}/${DATAFILE}"
-
 curl --no-progress-meter --fail "${DATASTORE}/${genus}/${species}/${collection_type}/${collection}/${DATAFILE}"