Merge pull request #89 from josiahseaman/python-master

2.4.5
josiahseaman · Oct 28, 2019 · bc366b7 · bc366b7
2 parents 482e0d6 + 1e1d36f
commit bc366b7
Show file tree

Hide file tree

Showing 19 changed files with 1,227 additions and 726 deletions.
diff --git a/DDV/ChainParser.py b/DDV/ChainParser.py
@@ -3,6 +3,7 @@
 
 import os
 import sys
+import traceback
 from collections import namedtuple
 
 try:
@@ -594,7 +595,13 @@ def parse_chain(self, chromosomes):# -> list:
 
         batches = []
         for chromosome in chromosomes:
-            batches.append(self._parse_chromosome_in_chain(chromosome))
+            try:
+                result = self._parse_chromosome_in_chain(chromosome)
+                batches.append(result)
+            except BaseException as e:
+                print("Encountered exception while parsing chromosome alignment: ")
+                traceback.print_exc()
+                print("Continuing to next chromosome.")
         return batches
         # workers = multiprocessing.Pool(6)  # number of simultaneous processes. Watch your RAM usage
         # workers.map(self._parse_chromosome_in_chain, chromosomes)

diff --git a/DDV/DDVUtils.py b/DDV/DDVUtils.py
@@ -45,9 +45,18 @@ def pretty_contig_name(contig_name, title_width, title_lines):
 
 def filter_by_contigs(unfiltered, extract_contigs):
     if extract_contigs is not None:  # winnow down to only extracted contigs
-        filtered_contigs = [c for c in unfiltered if c.name.split()[0] in set(extract_contigs)]
-        if filtered_contigs:
-            return filtered_contigs
+        entry_found = False
+        contig_dict = {name: None for name in extract_contigs}
+        for c in unfiltered:
+            if c.name.split()[0] in set(extract_contigs):
+                contig_dict[c.name.split()[0]] = c
+                entry_found = True
+        ordered_contigs = [contig_dict[c] for c in extract_contigs if contig_dict[c] is not None]
+        if entry_found:
+            if len(ordered_contigs) != len(extract_contigs):
+                found = {c.name.split()[0] for c in ordered_contigs}
+                print("Some entries had no match:", {n for n in extract_contigs if n not in found})
+            return ordered_contigs
         else:
             print("Warning: No matching contigs were found, so the whole file is being used:",
                   extract_contigs, file=sys.stderr)

diff --git a/DDV/MultipleAlignmentLayout.py b/DDV/MultipleAlignmentLayout.py
@@ -2,12 +2,15 @@
     with_statement, generators, nested_scopes
 
 import os
+import shutil
 import traceback
 from datetime import datetime
+
+from DNASkittleUtils.Contigs import read_contigs
 from PIL import Image, ImageDraw
 
 import math
-from DDV.TileLayout import hex_to_rgb, TileLayout
+from DDV.TileLayout import hex_to_rgb, TileLayout, is_protein_sequence
 from natsort import natsorted
 
 from DDV.DDVUtils import make_output_directory
@@ -32,7 +35,7 @@ def __init__(self, sort_contigs=False, **kwargs):
         self.all_contents = {}  # (filename: contigs) output_fasta() determines order of fasta_sources
         self.current_column_height = 20
         self.next_origin = [self.border_width, 30] # margin for titles, incremented each MSA
-        self.protein_palette = True
+        self.single_file = False  # flag to detect a single, long MSA
         self.sort_contigs = sort_contigs
         self.title_height_px = 10
         self.x_pad = 20  # whitespace between MSA blocks
@@ -105,16 +108,24 @@ def process_all_alignments(self, input_fasta_folder, output_folder, output_file_
             # self.read_contigs_and_calc_padding(single_MSA, None)
             try:  # These try catch statements ensure we get at least some output.  These jobs can take hours
                 self.draw_nucleotides()
-                if self.use_titles:
+                if self.use_titles and not self.single_file:
                     self.draw_titles()
             except Exception as e:
                 print('Encountered exception while drawing nucleotides:', '\n')
                 traceback.print_exc()
             input_path = os.path.join(input_fasta_folder, single_MSA)
-            self.output_fasta(output_folder, input_path, False, None, False, append_fasta_sources=False)
+            self.output_fasta(output_folder, input_path, False, None, False,
+                              append_fasta_sources=False, create_source_download=False)
         print("\nDrew Nucleotides:", datetime.now() - start_time)
         self.output_image(output_folder, output_file_name, False)
         print("Output Image in:", datetime.now() - start_time)
+        target_folder = os.path.join(output_folder, 'sources', os.path.basename(input_fasta_folder))
+        if not os.path.exists(target_folder):
+            print("Copying entire sources directory:", input_fasta_folder)
+            shutil.copytree(input_fasta_folder,
+                            target_folder,
+                            ignore=lambda src, names: [n for n in names if '.fa' not in n],
+                            symlinks=True, )
 
 
     def draw_nucleotides(self, verbose=False):
@@ -129,18 +140,19 @@ def calc_all_padding(self):
         seq_start, title_length = 0, 0
         widest_sequence = 0
         for i, contig in enumerate(self.contigs):  # Type: class DNASkittleUtils.Contigs.Contig
-            length = len(contig.seq)
-            widest_sequence = max(widest_sequence, length)
-            contig.consensus_width = widest_sequence
             contig.reset_padding = 0
-            #First contig of each MSA has a 10px tall title
-            contig.title_padding = 0 if i != 0 else widest_sequence * self.title_height_px
             contig.tail_padding = 0
+            widest_sequence = max(widest_sequence, len(contig.seq))
+            contig.consensus_width = widest_sequence
+            #First contig of each MSA has a 10px tall title
+            contig.title_padding = 0
+            if i == 0 and not self.single_file:
+                contig.title_padding = widest_sequence * self.title_height_px
             contig.nuc_title_start = seq_start
             contig.nuc_seq_start = seq_start + title_length
             #at the moment these values are the same but they have different meanings
-            total_progress += length + contig.title_padding # pointer in image
-            seq_start += title_length + length  # pointer in text
+            total_progress += len(contig.seq) + contig.title_padding  # pointer in image
+            seq_start += title_length + len(contig.seq)  # pointer in text
         return total_progress
 
 
@@ -161,10 +173,12 @@ def guess_image_dimensions(self):
         areas = []
         for source in self.all_contents.values():
             areas.append((source[-1].consensus_width + self.x_pad) * (len(source) + self.y_pad))
-        area = sum(areas)
+        area = sum(areas) if not self.single_file else len(self.contigs[0].seq) * len(self.all_contents) *1.2
         self.image_length = int(area * 1.2)
         square_dim = int(math.sqrt(self.image_length))
-        image_wh = [max(max_w, 4 * square_dim //3), max(max_h, 2 * square_dim // 3)]
+        desired_width = 5 * square_dim // 3
+        # TODO still not a great algorithm
+        image_wh = [desired_width, max(max_h, self.image_length // desired_width)]
         return image_wh
 
     def calculate_mixed_layout(self):
@@ -181,48 +195,81 @@ def calculate_mixed_layout(self):
         image_wh = self.guess_image_dimensions()
 
         #unsorted, largest height per row, tends to be less dense
-        self.each_layout = []  # delete old defaul layout
+        self.each_layout = []  # delete old default layout
         for filename in self.fasta_sources:
             source = self.all_contents[filename]
             height = len(source) + self.title_height_px
             width = source[0].consensus_width
-            source[0].title_padding = self.title_height_px * width  # add
-            self.layout_based_on_repeat_size(width, height, image_wh[0])
+            self.layout_based_on_repeat_size(width, height, image_wh[0], source)
         self.i_layout = 0 # drawing starts at the beginning
 
         adjusted_height = self.next_origin[1] + self.current_column_height + self.y_pad  #could extend image
+        if self.single_file:
+            adjusted_height = image_wh[1]
         self.prepare_image(0, image_wh[0], adjusted_height)
 
 
 
     def preview_all_files(self, input_fasta_folder):
         """Populates fasta_sources with files from a directory"""
-        for single_MSA in fastas_in_folder(input_fasta_folder):
-            self.read_contigs_and_calc_padding(single_MSA, None)
-            fasta_name = os.path.basename(single_MSA)
-            self.fasta_sources.append(fasta_name)
-            self.all_contents[fasta_name] = self.contigs  # store contigs so the can be wiped
-        if self.sort_contigs:  # do this before self.each_layout is created in order
-            heights = [(len(self.all_contents[fasta_name]), fasta_name) for fasta_name in self.fasta_sources]
-            heights.sort(key=lambda pair: -pair[0])  # largest number of sequences first
-            self.fasta_sources = [pair[1] for pair in heights]  # override old ordering
-
-
-    def layout_based_on_repeat_size(self, width, height, max_width):
+        files = fastas_in_folder(input_fasta_folder)
+        self.single_file = len(files) == 1
+        if self.single_file:
+            self.spread_large_MSA_source(files[0])
+        else:
+            for single_MSA in files:
+                self.read_contigs_and_calc_padding(single_MSA, None)
+                fasta_name = os.path.basename(single_MSA)
+                self.fasta_sources.append(fasta_name)
+                self.all_contents[fasta_name] = self.contigs  # store contigs so they can be wiped
+            if self.sort_contigs:  # do this before self.each_layout is created in order
+                heights = [(len(self.all_contents[fasta_name]), fasta_name) for fasta_name in self.fasta_sources]
+                heights.sort(key=lambda pair: -pair[0])  # largest number of sequences first
+                self.fasta_sources = [pair[1] for pair in heights]  # override old ordering
+
+
+    def layout_based_on_repeat_size(self, width, height, max_width, contigs):
         """change layout to match dimensions of the repeat
         """
+        total_width = width + self.x_pad
+        max_rows = 1000
+        if height > max_rows :
+            columns = math.ceil(height / max_rows)
+            total_width = (width + self.x_pad) * columns
+            height = min(max_rows, height)
+        if self.single_file:
+            self.layout_phased_file(width, height, max_width)
+        else:  # Typical case with many small MSA
+            # skip to next mega row
+            if self.next_origin[0] + total_width - self.x_pad + 1 >= max_width:
+                self.next_origin[0] = self.border_width
+                self.next_origin[1] += self.current_column_height + self.y_pad
+                self.current_column_height = 1  # reset
+
+            self.current_column_height = max(height, self.current_column_height)
+            modulos = [width, height, 9999, 9999]
+            padding = [0, 0, self.x_pad, self.x_pad * 3]
+            self.each_layout.append(level_layout_factory(modulos, padding, self.next_origin))
+            self.next_origin[0] += total_width  # scoot next_origin by width we just used up
+        self.i_layout = len(self.each_layout) - 1  # select current layout
+
 
-        # skip to next mega row
-        if self.next_origin[0] + width + 1 >= max_width:
+    def layout_phased_file(self, width, height, max_width):
+        self.each_layout = []
+        usable_width = min(width, max_width - (self.border_width * 2))
+        # TODO more than one large MSA
+        height = len(self.fasta_sources)  # number of individuals
+        padding_between_mega_rows = 1
+        n_rows = math.ceil(len(self.contigs[0].seq) / usable_width)
+        modulos = [usable_width, 1, 1, n_rows]
+        padding = [0, height + padding_between_mega_rows, 0, height + padding_between_mega_rows]
+        for y, row in enumerate(self.fasta_sources):  # one layout for mouse over of each individual
             self.next_origin[0] = self.border_width
-            self.next_origin[1] += self.current_column_height + self.y_pad
-            self.current_column_height = 1  # reset
-
-        self.current_column_height = max(height, self.current_column_height)
-        modulos = [width, height, 9999, 9999]
-        padding = [0, 0, 20, 20 * 3]
-        self.each_layout.append(level_layout_factory(modulos, padding, self.next_origin))
-        self.next_origin[0] += width + self.x_pad  # scoot next_origin by width we just used up
+            self.next_origin[1] = 30 + y
+            self.each_layout.append(level_layout_factory(modulos, padding, self.next_origin))
+        # move origin to bottom of image
+        self.next_origin[1] += n_rows * \
+                               (height + padding_between_mega_rows)
         self.i_layout = len(self.each_layout) - 1  # select current layout
 
 
@@ -240,26 +287,19 @@ def draw_titles(self):
         self.write_title(contig_name, self.levels.base_width, self.title_height_px, font_size,
                          title_lines, title_width, upper_left, False, self.image)
 
-    def legend(self):
-        return "<strong>Legend:</strong>"+\
-                self.legend_line('Alanine (A)', 'A') +\
-                self.legend_line('Cysteine (C)', 'C') +\
-                self.legend_line('Aspartic acid (D)', 'D') +\
-                self.legend_line('Glutamic acid (E)', 'E') +\
-                self.legend_line('Phenylalanine (F)', 'F') +\
-                self.legend_line('Glycine (G)', 'G') +\
-                self.legend_line('Histidine (H)', 'H') +\
-                self.legend_line('Isoleucine (I)', 'I') +\
-                self.legend_line('Lysine (K)', 'K') +\
-                self.legend_line('Leucine (L)', 'L') +\
-                self.legend_line('Methionine (M)', 'M') +\
-                self.legend_line('Asparagine (N)', 'N') +\
-                self.legend_line('Proline (P)', 'P') +\
-                self.legend_line('Glutamine (Q)', 'Q') +\
-                self.legend_line('Arginine (R)', 'R') +\
-                self.legend_line('Serine (S)', 'S') +\
-                self.legend_line('Threonine (T)', 'T') +\
-                self.legend_line('Valine (V)', 'V') +\
-                self.legend_line('Tryptophan (W)', 'W') +\
-                self.legend_line('Tyrosine (Y)', 'Y')+ \
-                self.legend_line('Any (X)', 'X')
+    def spread_large_MSA_source(self, fasta_path):
+        individuals = read_contigs(fasta_path)
+        self.contigs = individuals
+        self.fasta_sources = [os.path.basename(fasta_path) + str(i) for i in range(len(individuals))]
+        self.all_contents = {source: [individuals[i]] for i, source in enumerate(self.fasta_sources)}
+        self.protein_palette = is_protein_sequence(self.contigs[0])
+
+        # Zero padding
+        for name, container in self.all_contents.items():
+            contig = container[0]
+            contig.reset_padding = 0
+            contig.title_padding = 0
+            contig.tail_padding = 0
+            contig.nuc_title_start = 0
+            contig.nuc_seq_start = 0
+            contig.consensus_width = len(contig.seq)
diff --git a/DDV/ParallelGenomeLayout.py b/DDV/ParallelGenomeLayout.py
@@ -30,7 +30,7 @@ def __init__(self, n_genomes, low_contrast=False, base_width=100, column_widths=
         self.each_layout = []  # one layout per data source assumed same order as self.fasta_sources
         # I found that less padding is better for keeping visual patterns coherent over clusters
         # of columns.  The white space has a disproportionate effect if you space it out too much.
-        p = 1  # padding_between_layouts
+        p = 1 if not self.use_border_boxes else 6  # padding_between_layouts
         cluster_width = sum(column_widths) + p * n_genomes  # total thickness of data and padding
         cluster_width += p * 2  # double up on padding between super columns
         column_clusters_per_mega_row = 10600 // cluster_width  # 10600
@@ -115,7 +115,9 @@ def draw_border_boxes(self, fasta_files):
         column_size = self.levels[2].chunk_size
         margin = 6
         color = hex_to_rgb('#c9c9c9')
-        for column_progress in range(0, self.image_length, column_size):
+        main_contig = self.contigs[0]
+        for column_progress in range(main_contig.title_padding,
+                                     len(main_contig.seq) + main_contig.title_padding + column_size, column_size):
             left, top = self.each_layout[0].position_on_screen(column_progress)
             left, top = max(0, left - margin), max(0, top - margin - self.header_height)
             # column_progress only works when first and last columns have the same width
@@ -130,9 +132,9 @@ def draw_border_boxes(self, fasta_files):
             #TODO: could be optimized by caching the text image
             for i, layout in enumerate(self.each_layout):
                 left, ignore = layout.position_on_screen(column_progress)
-                text = pp(column_progress) #+ ' ' + just_the_name(fasta_files[i]) # cluttered
-                self.write_title(text, self.base_width, self.header_height, 11, 1, 30,
-                                 (left, top + 3),
+                text = pp(column_progress - main_contig.title_padding) #+ ' ' + just_the_name(fasta_files[i]) # cluttered
+                self.write_title(text, self.base_width, self.header_height + 2, 11, 1, 30,
+                                 (left, top + 0),
                                  False, self.image, color=hex_to_rgb('#606060'))
         self.genome_processed = 0