Skip to content

Commit

Permalink
implement issue #55
Browse files Browse the repository at this point in the history
add output file that contains the boundaries of genes in the concatenated MSA.
  • Loading branch information
alpae committed Feb 2, 2024
1 parent ed6d7eb commit 167c12a
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 12 deletions.
27 changes: 17 additions & 10 deletions read2tree/Aligner.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,7 @@ def concat_alignment(self):
use_alignments = self.alignments
alignments_aa = []
alignments_dna = []
grp_names = []

def sorter_groups(grp):
if grp.startswith('OG') and grp[2:].isdigit():
Expand All @@ -384,22 +385,28 @@ def sorter_groups(grp):
value = use_alignments[key]
alignments_aa.append(value.aa)
alignments_dna.append(value.dna)
grp_names.append(key)
concatination_aa = concatenate(alignments_aa)
concatination_dna = concatenate(alignments_dna)

if concatination_aa:
align_output = open(os.path.join(self.args.output_path,
"concat_" + self._species_name + "_aa.phy"), "w")
AlignIO.write(concatination_aa, align_output, "phylip-relaxed")
align_output.close()
outfn = os.path.join(self.args.output_path, "concat_" + self._species_name + "_aa.phy")
self._write_concation(outfn, concatination_aa, grp_names)

if concatination_dna:
align_output = open(os.path.join(self.args.output_path,
"concat_" + self._species_name + "_dna.phy"), "w")
AlignIO.write(concatination_dna, align_output, "phylip-relaxed")
align_output.close()

return (concatination_aa, concatination_dna)
outfn = os.path.join(self.args.output_path,"concat_" + self._species_name + "_dna.phy")
self._write_concation(outfn, concatination_dna, grp_names)
return concatination_aa, concatination_dna

def _write_concation(self, outfn, msa, group_names):
with open(outfn, "w") as align_output:
AlignIO.write(msa, align_output, "phylip-relaxed")
if 'boundaries' in msa.annotations:
with open(outfn.replace(".phy", ".group_boundaries"), "w") as fh:
start_pos = 0
for name, end_pos in zip(group_names, msa.annotations['boundaries']):
fh.write(f"{name} = {start_pos + 1}-{end_pos + 1}\n")
start_pos = end_pos

def add_to_alignment(self, mapper):
"""
Expand Down
7 changes: 5 additions & 2 deletions read2tree/utils/seq_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ def concatenate(alignments):
else:
unknown_char = '?'

boundaries, cumlength = [], 0
for aln in alignments:
length = aln.get_alignment_length()

Expand All @@ -137,8 +138,10 @@ def concatenate(alignments):
# else stuff the string representation into the tmp dict
for rec in aln:
tmp[rec.id].append(str(rec.seq))
cumlength += length
boundaries.append(cumlength)

# Stitch all the substrings together using join (most efficient way),
# and build the Biopython data structures Seq, SeqRecord and MultipleSeqAlignment
return MultipleSeqAlignment(SeqRecord(Seq(''.join(v)), id=k)
for (k, v) in tmp.items())
return MultipleSeqAlignment([SeqRecord(Seq(''.join(v)), id=k) for (k, v) in tmp.items()],
annotations={'boundaries': boundaries})

0 comments on commit 167c12a

Please sign in to comment.