Skip to content

Commit

Permalink
Merge pull request #303 from csbrasnett/fasta-reading
Browse files Browse the repository at this point in the history
added AA parsing to fasta file reading
  • Loading branch information
fgrunewald authored Mar 3, 2023
2 parents 652dd5c + b33cdbf commit d465317
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 22 deletions.
62 changes: 47 additions & 15 deletions polyply/src/simple_seq_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,28 @@
"G": "G",
"T": "U"}

ONE_LETTER_AA = {"G": "GLY",
"A": "ALA",
"V": "VAL",
"C": "CYS",
"P": "PRO",
"L": "LEU",
"I": "ILE",
"M": "MET",
"W": "TRP",
"F": "PHE",
"S": "SER",
"T": "THR",
"Y": "TYR",
"N": "ASN",
"Q": "GLN",
"K": "LYS",
"R": "ARG",
"H": "HIS",
"D": "ASP",
"E": "GLU",
}

class FileFormatError(Exception):
"""Raised when a parser fails due to invalid file format."""

Expand Down Expand Up @@ -79,7 +101,7 @@ def _parse_plain_delimited(filepath, delimiter=" "):

parse_txt = _parse_plain_delimited

def _parse_plain(lines, DNA=False, RNA=False):
def _parse_plain(lines, DNA=False, RNA=False, AA=False):
"""
Parse a plain one letter sequence block either for DNA, RNA,
or amino-acids. Lines can be a list of strings or a string.
Expand All @@ -98,6 +120,8 @@ def _parse_plain(lines, DNA=False, RNA=False):
if the sequence matches DNA
RNA: bool
if the sequence matches RNA
AA: bool
if the sequence matches AA
Returns
-------
Expand All @@ -118,23 +142,26 @@ def _parse_plain(lines, DNA=False, RNA=False):
resname = ONE_LETTER_DNA[token]
elif token in ONE_LETTER_RNA and RNA:
resname = ONE_LETTER_RNA[token]
elif token in ONE_LETTER_AA and AA:
resname = ONE_LETTER_AA[token]
else:
msg = f"Cannot find one letter residue match for {token}"
raise IOError(msg)

monomers.append(resname)

# make sure to set the defaults for the DNA and RNA terminals
monomers[0] = monomers[0] + "5"
monomers[-1] = monomers[-1] + "3"
if RNA or DNA:
monomers[0] = monomers[0] + "5"
monomers[-1] = monomers[-1] + "3"

seq_graph = _monomers_to_linear_nx_graph(monomers)
return seq_graph

def _identify_nucleotypes(comments):
def _identify_residues(comments):
"""
From a comment found in the ig or fasta file, identify if
the sequence is RNA or DNA sequence by checking if these
the sequence is RNA, DNA, or AA sequence by checking if these
keywords are in the comment lines. Raise an error if
none or conflicting information are found.
Expand All @@ -146,30 +173,35 @@ def _identify_nucleotypes(comments):
Returns
-------
bool, bool
is it DNA, RNA
is it DNA, RNA, AA
Raises
------
FileFormatError
neither RNA nor DNA keywords are found
neither RNA nor DNA nor AA keywords are found
both RNA and DNA are found
"""
RNA = False
DNA = False
AA = False

for comment in comments:
if "DNA" in comment:
DNA = True

if "RNA" in comment:
RNA = True

if "PROTEIN" in comment:
AA = True

if RNA and DNA:
raise FileFormatError("Found both RNA and DNA keyword in comment. Choose one.")

if not RNA and not DNA:
raise FileFormatError("Cannot identify if sequence is RNA or DNA from comment.")
if not RNA and not DNA and not AA:
raise FileFormatError("Cannot identify if sequence is RNA, DNA, or PROTEIN, from comment.")

return DNA, RNA
return DNA, RNA, AA

def parse_ig(filepath):
"""
Expand Down Expand Up @@ -220,8 +252,8 @@ def parse_ig(filepath):
msg = "The sequence is not complete, it does not end with 1 or 2."
raise FileFormatError(msg)

DNA, RNA = _identify_nucleotypes(comments)
seq_graph = _parse_plain(clean_lines[1:], DNA=DNA, RNA=RNA)
DNA, RNA, AA = _identify_residues(comments)
seq_graph = _parse_plain(clean_lines[1:], DNA=DNA, RNA=RNA, AA=AA)

if ter_char == '2':
nnodes = len(seq_graph.nodes)
Expand All @@ -237,7 +269,7 @@ def parse_ig(filepath):

def parse_fasta(filepath):
"""
Read fasta sequence of DNA/RNA.
Read fasta sequence of DNA/RNA/PROTEIN.
The parser automatically translates the one letter code to the
double letter nucleobase resnames, sets special residue names
Expand Down Expand Up @@ -265,7 +297,7 @@ def parse_fasta(filepath):

clean_lines = []
# first line must be a comment line
DNA, RNA =_identify_nucleotypes([lines[0]])
DNA, RNA, AA =_identify_residues([lines[0]])

for line in lines[1:]:
if '>' in line:
Expand All @@ -274,7 +306,7 @@ def parse_fasta(filepath):

clean_lines.append(line)

seq_graph = _parse_plain(clean_lines, RNA=RNA, DNA=DNA)
seq_graph = _parse_plain(clean_lines, RNA=RNA, DNA=DNA, AA=AA)
return seq_graph

def parse_json(filepath):
Expand Down
2 changes: 2 additions & 0 deletions polyply/tests/test_data/simple_seq_files/test_protein.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
> PROTEIN
GAKWNVFPS
38 changes: 31 additions & 7 deletions polyply/tests/test_simple_seq_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,37 +20,54 @@
from polyply import TEST_DATA
from polyply.src.meta_molecule import MetaMolecule
from .example_fixtures import example_meta_molecule
from polyply.src.simple_seq_parsers import (_identify_nucleotypes,
from polyply.src.simple_seq_parsers import (_identify_residues,
_monomers_to_linear_nx_graph,
_parse_plain,
FileFormatError)

@pytest.mark.parametrize('comments, DNA, RNA', (
@pytest.mark.parametrize('comments, DNA, RNA, AA', (
# single DNA comment
(["DNA lorem ipsum"],
True,
False,
False
),
# single RNA comment
(["RNA lorem ipsum"],
False,
True
True,
False
),
# single DNA comment multiple lines
(["lorem ipsum", "random line DNA", "DNA another line"],
True,
False,
False
),
# single RNA comment multiple lines
(["lorem ipsum", "random line RNA", "RNA another line"],
False,
True,
False
),
# single AA comment
(['lorem ipsum PROTEIN'],
False,
False,
True
),
))
def test_identify_nucleotypes(comments, DNA, RNA):
out_DNA, out_RNA = _identify_nucleotypes(comments)
# singe AA comment multiple lines
(["lorem ipsum", "random line PROTEIN", "PROTEIN another line"],
False,
False,
True
),
))
def test_identify_nucleotypes(comments, DNA, RNA, AA):
out_DNA, out_RNA, out_AA = _identify_residues(comments)
assert out_DNA == DNA
assert out_RNA == RNA
assert out_AA == AA

@pytest.mark.parametrize('comments', (
# both DNA and RNA are defined
Expand All @@ -60,7 +77,7 @@ def test_identify_nucleotypes(comments, DNA, RNA):
))
def test_identify_nucleotypes_fail(comments):
with pytest.raises(FileFormatError):
_identify_nucleotypes(comments)
_identify_residues(comments)

def _node_match(nodeA, nodeB):
resname = nodeA["resname"] == nodeB["resname"]
Expand Down Expand Up @@ -111,6 +128,13 @@ def test_sequence_parses_RNA(extension):
ref_graph = _monomers_to_linear_nx_graph(monomers)
assert nx.is_isomorphic(seq_graph, ref_graph, node_match=_node_match)

def test_sequence_parses_PROTEIN():
filepath = Path(TEST_DATA + "/simple_seq_files/test_protein.fasta")
seq_graph = MetaMolecule.parsers["fasta"](filepath)
monomers = ["GLY", "ALA", "LYS", "TRP", "ASN", "VAL", "PHE", "PRO", "SER"]
ref_graph = _monomers_to_linear_nx_graph(monomers)
assert nx.is_isomorphic(seq_graph, ref_graph, node_match=_node_match)

def test_unkown_nucleotype_error():
with pytest.raises(IOError):
lines = ["AABBBCCTG"]
Expand Down

0 comments on commit d465317

Please sign in to comment.