forked from kyclark/biofx_python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsolution1_kmers_functional.py
executable file
·97 lines (67 loc) · 2.58 KB
/
solution1_kmers_functional.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python3
""" Longest Common Substring """
import argparse
import random
import sys
from itertools import chain
from collections import Counter
from typing import List, NamedTuple, TextIO
from Bio import SeqIO
class Args(NamedTuple):
    """ Parsed command-line arguments for the program """
    # Handle to the input FASTA file, opened by argparse in get_args()
    file: TextIO
# --------------------------------------------------
def get_args() -> Args:
    """ Parse the command line and return the arguments as an Args tuple """

    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Longest Common Substring')

    # The single positional argument is opened for reading in text mode
    cli.add_argument('file',
                     type=argparse.FileType('rt'),
                     metavar='FILE',
                     help='Input FASTA')

    parsed = cli.parse_args()

    return Args(file=parsed.file)
# --------------------------------------------------
def main() -> None:
    """ Print one of the longest substrings shared by every input sequence

    Reads all records from the FASTA file given on the command line, then
    tries k-mer sizes from the length of the shortest sequence down to 1.
    For the first size that yields k-mers common to all sequences, one of
    them is chosen at random and printed, and the program exits with
    status 0. When no size yields a shared k-mer (including an input that
    contains no sequences), 'No common subsequence.' is printed instead.
    """

    args = get_args()

    # Get a list of the sequences as strings
    seqs = [str(rec.seq) for rec in SeqIO.parse(args.file, 'fasta')]

    # A common substring cannot be longer than the shortest sequence.
    # default=0 keeps an input with no records from raising ValueError in
    # min(); the loop below is then skipped and the message is printed.
    shortest = min(map(len, seqs), default=0)

    # Work from longest to shortest so the first hit is a longest match
    for k in range(shortest, 0, -1):
        if kmers := common_kmers(seqs, k):
            print(random.choice(kmers))
            sys.exit(0)

    print('No common subsequence.')
# --------------------------------------------------
def common_kmers(seqs: List[str], k: int) -> List[str]:
    """ Return the k-mers of length k that occur in every sequence """

    total = len(seqs)
    tally: Counter = Counter()

    for seq in seqs:
        # Use a set so a k-mer is counted at most once per sequence
        tally.update(set(find_kmers(seq, k)))

    # A k-mer seen once in each sequence has a count equal to len(seqs)
    return [kmer for kmer, seen in tally.items() if seen == total]
# --------------------------------------------------
def test_common_kmers() -> None:
    """ Check common_kmers with and without a shared k-mer """

    seqs = ['GATTACA', 'TAGACCA', 'ATACA']

    # No 5-mer is present in all three sequences
    shared_5mers = common_kmers(seqs, 5)
    assert shared_5mers == []

    # Result order is not fixed, so compare the sorted list
    shared_2mers = common_kmers(seqs, 2)
    assert sorted(shared_2mers) == ['AC', 'CA', 'TA']
# --------------------------------------------------
def find_kmers(seq: str, k: int) -> List[str]:
    """ Find k-mers in string

    Returns every overlapping substring of length k in seq, in left-to-right
    order. Returns an empty list when k is less than 1 or longer than seq.
    """

    # A non-positive k has no meaningful k-mers. Without this guard, k=0
    # would yield len(seq) + 1 empty strings and a negative k would yield
    # slices that are not k-mers at all.
    if k < 1:
        return []

    # Number of start positions; range() is empty when this is below 1
    n = len(seq) - k + 1
    return [seq[i:i + k] for i in range(n)]
# --------------------------------------------------
def test_find_kmers() -> None:
    """ Check find_kmers on empty input and on each k for a short string """

    # (sequence, k, expected k-mers)
    cases = [
        ('', 1, []),
        ('ACTG', 1, ['A', 'C', 'T', 'G']),
        ('ACTG', 2, ['AC', 'CT', 'TG']),
        ('ACTG', 3, ['ACT', 'CTG']),
        ('ACTG', 4, ['ACTG']),
        ('ACTG', 5, []),
    ]

    for seq, k, expected in cases:
        assert find_kmers(seq, k) == expected
# --------------------------------------------------
# Run the program only when executed as a script, not when imported
if __name__ == '__main__':
    main()