Merge pull request #50 from bedapub/report

Report
bedapub · Feb 6, 2024 · 45b85a7 · 45b85a7
2 parents 32c8aa5 + c9365bf
commit 45b85a7
Show file tree

Hide file tree

Showing 9 changed files with 457 additions and 34 deletions.
diff --git a/splicekit/__init__.py b/splicekit/__init__.py
@@ -43,6 +43,8 @@
 import splicekit.core.delta_dar
 print("splicekit | loading splicekit.clusterlogfc")
 import splicekit.clusterlogfc
+print("splicekit | loading splicekit.report")
+import splicekit.report
 
 # initialization (triggered once on "import splicekit")
 splicekit.core.annotation.compounds = {}
@@ -137,7 +139,6 @@ def genes():
     splicekit.core.features.load_genes()
     splicekit.core.features.read_genes()
     os.system("rm -f data/comparison_genes_data/*.tab.gz > /dev/null 2>&1")
-    #splicekit.core.features.save_comps_feature_data("genes")
     splicekit.core.features.make_counts_table("genes")
 
 def features():
@@ -187,7 +188,7 @@ def edgeR(run=None):
         if splicekit.config.platform=="SLURM":
             os.system('jobs=( $(ls jobs/edgeR/junctions/*.job) ); g=10; for((i=0; i < ${#jobs[@]}; i+=g)); do part=( "${jobs[@]:i:g}" ); job_ids=(); for job_fname in "${part[@]}"; do echo "[edgeR.junctions] submitted $job_fname"; job_id=$(sbatch --mem=' + config.edgeR_memory + ' --parsable ${job_fname}); job_ids+=($job_id); done; for job_id in "${job_ids[@]}"; do scontrol show job $job_id | grep -q "JobState=COMPLETED" || scontrol wait job $job_id; done; echo "[edgeR.junctions] processing next 10"; done; echo "[edgeR.junctions] processing complete"')
         splicekit.core.report.edgeR_feature('junctions')
-        splicekit.core.patterns.process() # adds donor patterns
+        splicekit.core.patterns.process() # adds patterns
 
     if run=="exons" or run==None:
         os.system(f"rm -f results/edgeR/exons/*.tab.gz > /dev/null 2>&1")

diff --git a/splicekit/config/__init__.py b/splicekit/config/__init__.py
@@ -5,6 +5,8 @@
 import splicekit.core as core
 import socket
 
+module_desc = "splicekit | config |"
+
 if os.path.exists("splicekit.config"):
     config_lines = open("splicekit.config").readlines()
     for cline in config_lines:
@@ -26,8 +28,7 @@ def jbrowse2_config(hostname):
         hostname=socket.gethostname()
     ip_addr=socket.gethostbyname(hostname)
     jbrowse2_url = f"http://{ip_addr}:{port}/jbrowse2/?config=splicekit_data/config.json"
-
-jbrowse2_config(None)
+    print(f"{module_desc} setting JBrowse2 URL to {jbrowse2_url}")
 
 # read in location of gtf and fasta files
 if genome_version!=None:

diff --git a/splicekit/core/annotation.py b/splicekit/core/annotation.py
@@ -92,7 +92,10 @@ def make_comparisons():
     f.close()
     # sort by sample ID
     for treatment, data in annotation.treatments.items():
-        annotation.treatments[treatment] = sort_readout_id(data)
+        try:
+            annotation.treatments[treatment] = sort_readout_id(data)
+        except:
+            pass
     dmso_hash = {}
     dmso_letter = "A"
     # sometimes the separates set was reshufled
@@ -132,7 +135,10 @@ def make_comparisons():
                 annotation.comparisons.append((short_names(comparison_name), temp_control, temp_test, short_names(control_group_id), short_names(test_group_id)))
     # sort and cast sample_ids to string
     annotation.samples = list(samples)
-    annotation.samples = sort_readout_id(annotation.samples)
+    try:
+        annotation.samples = sort_readout_id(annotation.samples)
+    except:
+        pass
     annotation.samples = [str(el) for el in annotation.samples]
 
 def write_comparisons():
@@ -178,8 +184,14 @@ def write_comparisons():
             control_ids.append(sample_id)
         for (sample_id, compound, rep, _) in test_set:
             test_ids.append(sample_id)
-        control_ids = sort_readout_id(control_ids)
-        test_ids = sort_readout_id(test_ids)
+        try:
+            control_ids = sort_readout_id(control_ids)
+        except:
+            pass
+        try:
+            test_ids = sort_readout_id(test_ids)
+        except:
+            pass
 
         # write rMATS {comparison_name}_control.tab, {comparison_name}_test.tab and {comparison_name}_run.sh
         for rtype, rfile in [("control", control_ids), ("test", test_ids)]:
@@ -199,6 +211,7 @@ def write_comparisons():
         comps_table.write("\t".join(row) + "\n") 
     comps_table.close()
     write_edgeR_jobs()
+    write_mds_job()
 
 def write_edgeR_jobs():
     if config.platform == 'SLURM':
@@ -264,8 +277,14 @@ def write_edgeR_jobs():
             control_ids.append(sample_id)
         for (sample_id, compound, rep, _) in test_set:
             test_ids.append(sample_id)
-        control_ids = sort_readout_id(control_ids)
-        test_ids = sort_readout_id(test_ids)
+        try:
+            control_ids = sort_readout_id(control_ids)
+        except:
+            pass
+        try:
+            test_ids = sort_readout_id(test_ids)
+        except:
+            pass
 
         try:
             filter_low = splicekit.config.filter_low
@@ -312,6 +331,65 @@ def write_edgeR_jobs():
     fsh_donor_anchors.close()
     fsh_acceptor_anchors.close()
 
+def write_mds_job():
+    if config.platform == 'SLURM':
+            job_mds="""
+#!/bin/bash
+#SBATCH --job-name=edgeR_mds                                      # Job name
+#SBATCH --ntasks=1                                                # Number of tasks
+#SBATCH --nodes=1                                                 # All tasks on one node
+#SBATCH --mem=8G                                                  # Allocate memory
+#SBATCH --partition=short                                         # Select queue
+#SBATCH --output=logs/edgeR/mds/mds.out         # Output file
+#SBATCH --error=logs/edgeR/mds/mds.err          # Error file
+
+module load R
+{container} R --no-save --args {input_folder} {sample_membership} {filter_low} < {core_path}/comps_edgeR_mds.R
+    """
+
+    else:
+
+        job_mds="""
+#!/bin/bash
+#BSUB -J mds                    # job name
+#BSUB -n 1                      # number of tasks
+#BSUB -R "span[hosts=1]"        # allocate hosts
+#BSUB -M {memory}               # allocate memory
+#BSUB -q {queue}                # select queue
+#BSUB -o logs/edgeR/mds/mds.out # output file
+#BSUB -e logs/edgeR/mds/mds.err # error file
+
+ml R
+{container} R --no-save --args {input_folder} {sample_membership} {filter_low} < {core_path}/comps_edgeR_mds.R
+    """
+
+    job_sh_mds="""{container} R --no-save --args {input_folder} {sample_membership} {filter_low} < {core_path}/comps_edgeR_mds.R"""
+    fsh_mds = open(f"jobs/edgeR/mds/process.sh", "wt")
+    fout_mds = open(f"jobs/edgeR/mds/mds.job", "wt")
+    sample_membership = {}
+    for (comparison_name, control_set, test_set, control_group_id, test_group_id) in annotation.comparisons:
+        for (sample_id, _, _, _) in control_set:
+            # in some rare cases, the same sample can be part of diverse control groups
+            #assert(sample_membership.get(sample_id, None) in [None, control_group_id])
+            sample_membership[sample_id] = control_group_id
+        for (sample_id, _, _, _) in test_set:
+            # in some rare cases, the same sample can be part of diverse test groups
+            #assert(sample_membership.get(sample_id, None) in [None, test_group_id])
+            sample_membership[sample_id] = test_group_id
+    sample_membership = [sample_membership[sample_id] for sample_id in annotation.samples]
+    try:
+        filter_low = splicekit.config.filter_low
+    except:
+        filter_low = "filter_low"
+
+    # mds
+    job = job_mds.format(queue=config.cluster_queue, memory=config.edgeR_memory, filter_low=filter_low, container=splicekit.config.container, core_path=os.path.dirname(core.__file__), input_folder=os.getcwd(), sample_membership=",".join(str(el) for el in sample_membership))
+    fout_mds.write(job)
+    fout_mds.close()        
+    job = job_sh_mds.format(filter_low=filter_low, container=splicekit.config.container, input_folder=os.getcwd(), sample_membership=",".join(str(el) for el in sample_membership))
+    fsh_mds.write(job+"\n")
+    fsh_mds.close()
+
 def make_design_contrast():
     # write design matrix
     f = open("annotation/design.tab", "wt")

diff --git a/splicekit/core/jbrowse2.py b/splicekit/core/jbrowse2.py
@@ -36,7 +36,10 @@ def setup():
 # for every bam file create bigwig and cram file plus a bed file that shows the junctions
 bam_dir = config.bam_path
 container = config.container
-bam_files = [fi for fi in os.listdir(bam_dir) if fi.endswith('.bam')]
+try:
+    bam_files = [fi for fi in os.listdir(bam_dir) if fi.endswith('.bam')]
+except:
+    print(f"{module_desc} no bam files found in provided bam folder")
 genome_fa = config.fasta_path
 gff_fname =  config.gff3_path
 dirs_to_check = ['logs/logs_jbrowse/', 'jobs/jobs_jbrowse/','results/results_jbrowse/'] 
@@ -53,7 +56,7 @@ def write_sample_jobs(force_samples):
     os.system("rm -r jobs/jobs_jbrowse/* >/dev/null 2>&1") # clean up previous jobs
 
     # create bigwig and then cram files
-    if config.platforn == 'SLURM':
+    if config.platform == 'SLURM':
         job_bw="""
 #!/bin/bash
 #SBATCH --job-name={sample}_jbrowse               # Job name
@@ -129,7 +132,7 @@ def create_jbrowse_config(force_annotation):
         for sample in bam_files:
             os.system(f"rm logs/logs_jbrowse/{sample}.out >/dev/null 2>&1")
             os.system(f"rm logs/logs_jbrowse/{sample}.err >/dev/null 2>&1")
-        print('{module_desc} creating config.json in jbrowse2/splicekit_data')
+        print(f'{module_desc} creating config.json in jbrowse2/splicekit_data')
         if os.path.isabs(genome_fa): # check if genome_fa path is absolute or relative
             os.system(f"cd jbrowse2/splicekit_data; {container} jbrowse add-assembly {genome_fa} --name GenomeSequence --load symlink") # initialize config.json with genome sequence
         else:
@@ -144,15 +147,15 @@ def create_jbrowse_config(force_annotation):
             new_gff_fname = new_gff_fname
         else:
             new_gff_fname = "../../" + new_gff_fname
-        print("{module_desc} start annotation parsing")
+        print(f"{module_desc} start annotation parsing")
         os.system(f'cd jbrowse2/splicekit_data; grep -v "^#" {new_gff_fname} | sed "s/gene/AAA/; s/mRNA/BBB/" | sort -k1,1 -k4,4n | sed "s/AAA/gene/; s/BBB/mRNA/" >| {new_gff_fname}.sorted.gff'); print('[jbrowse2] annotation sorted') 
         os.system(f'cd jbrowse2/splicekit_data; {container} bgzip {new_gff_fname}.sorted.gff -f' ); print('[jbrowse2] annotation compressed')  
         os.system(f'cd jbrowse2/splicekit_data; {container} tabix {new_gff_fname}.sorted.gff.gz'); print('[jbrowse2] annotation indexed')  
         os.system(f'cd jbrowse2/splicekit_data; {container} jbrowse add-track {new_gff_fname}.sorted.gff.gz --trackId annotation_track --name "Gene Annotations" --category "Annotations" --load symlink' ) 
         # indexes all assemblies that it can find in the current directory's config.json
-        print("{module_desc} adding indices for text searching")
+        print(f"{module_desc} adding indices for text searching")
         os.system(f"cd jbrowse2/splicekit_data; {container} jbrowse text-index --out .")
-        print("{module_desc} annotation fully parsed")
+        print(f"{module_desc} annotation fully parsed")
 
         # now we add each sample's files in different track categories  --------------------------------------------------------------------------------------------------------------------------
         for sample in bam_files:
@@ -162,7 +165,7 @@ def create_jbrowse_config(force_annotation):
             os.system(f'cd jbrowse2/splicekit_data; {container} jbrowse add-track ../../{bigwig_fname} --name {sample_id}_bw --trackId {sample_id}_bw  --category "Coverage" --load symlink')
             os.system(f'cd jbrowse2/splicekit_data; {container} jbrowse add-track ../../{cram_fname} --name {sample_id}_cram --trackId {sample_id}_cram --category "Reads" --load symlink')
     else:
-        print("{module_desc} config.json already existing in jbrowse2/splicekit_data --> use 'splicekit jbrowse2 process -force' to overwrite")
+        print(f"{module_desc} config.json already existing in jbrowse2/splicekit_data --> use 'splicekit jbrowse2 process -force' to overwrite")
 
 def process(force_samples=False, force_annotation=False):
     check_genome()    

diff --git a/splicekit/core/motifs.py b/splicekit/core/motifs.py
@@ -656,7 +656,7 @@ def cluster(cutoff=9):
             r = f.readline()
         f.close()
 
-        if len(cmd)==0:
+        if len(cdm)==0:
             continue
 
         Z = linkage(cdm, 'ward')

diff --git a/splicekit/folders.setup b/splicekit/folders.setup
@@ -12,11 +12,13 @@ logs/edgeR/exons
 logs/edgeR/genes
 logs/edgeR/donor_anchors
 logs/edgeR/acceptor_anchors
+logs/edgeR/mds
 jobs/edgeR/junctions
 jobs/edgeR/exons
 jobs/edgeR/donor_anchors
 jobs/edgeR/acceptor_anchors
 jobs/edgeR/genes
+jobs/edgeR/mds
 jobs
 jobs/count_donor_anchors
 jobs/count_acceptor_anchors
@@ -58,4 +60,6 @@ results/edgeR/genes
 results/edgeR/donor_anchors
 results/edgeR/acceptor_anchors
 results/edgeR/dispersion
-results/clusterlogfc
+results/edgeR/mds
+results/clusterlogfc
+report
diff --git a/splicekit/judge/__init__.py b/splicekit/judge/__init__.py
@@ -86,7 +86,7 @@ def plot(comp_name):
         fig.update_layout(title_font_size=11)
         fig.update_layout(font={"size":11})
         fig.update_traces(marker={"size":4})
-        fig.update_traces(customdata=[df["gene_name"].to_numpy()], hovertemplate="%{z} (gene)")
+        fig.update_traces(customdata=df["gene_name"].to_numpy(), hovertemplate="%{customdata} (gene)")
         fig.write_html("results/judge/plots/{comp_name}.html".format(comp_name=comp_name), full_html=False, include_plotlyjs="cdn")
         return True