-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPatternMatching.py
119 lines (89 loc) · 3.74 KB
/
PatternMatching.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import time
from collections import defaultdict
from typing import List
class PatternMatching:
_preprocess_proposed_match_data = defaultdict(set)
def __init__(self, text: str = None, pattern: str = None) -> None:
self.text = text
self.pattern = pattern
def naive_match(self, text: str = None, pattern: str = None) -> List[int]:
"""Implementation of brute-force algorithm
Args:
text (str, optional): Main sequence. Defaults to None.
pattern (str, optional): pattern to be matched in sequence. Defaults to None.
Returns:
List[int]: List of integer indices where match occurs, if any.
"""
if text is None:
text = self.text
if pattern is None:
pattern = self.pattern
n = len(text)
k = len(pattern)
indices = []
for i in range(n - k + 1):
curr_kmer = text[i : i + k]
if pattern == curr_kmer:
indices.append(i)
return indices
@classmethod
def _preprocess_proposed_match(cls, text: str) -> dict[str : List[int]]:
for i in range(len(text) - 1):
cls._preprocess_proposed_match_data[text[i : i + 2]].add(i)
return cls._preprocess_proposed_match_data
def proposed_match(self, text: str = None, pattern: str = None) -> List[int]:
"""Implementation of the proposed algorithm
Args:
text (str, optional): Main sequence. Defaults to None.
pattern (str, optional): pattern to be matched in sequence. Defaults to None.
Returns:
List[int]: List of integer indices where match occurs, if any.
"""
if text is None:
text = self.text
if pattern is None:
pattern = self.pattern
k = len(pattern)
indices = []
idx_table_pat = defaultdict(list)
# creates index table for the text
if not self._preprocess_proposed_match_data:
idx_table_text = self._preprocess_proposed_match(text)
else:
idx_table_text = self._preprocess_proposed_match_data
# creates index table for the pattern
for i in range(0, k - 1, 2):
idx_table_pat[pattern[i : i + 2]].append(i)
# creates sorted array of pairs in pattern
sorted_pat_pair = sorted(idx_table_pat, key=lambda x: len(idx_table_pat[x]))
# creates sorted array of indexes of least occuring
# pair in pattern present in text
match_idx = sorted(idx_table_text[sorted_pat_pair[0]])
# stores index of first least occuring pair in pattern
align_idx = idx_table_pat[sorted_pat_pair[0]][0]
for idx_text in match_idx:
# Starting index of pattern in sequence
curr_align = idx_text - align_idx
cont = True
for pair in sorted_pat_pair:
if not cont:
break
for idx_pat in idx_table_pat[pair]:
if idx_pat + curr_align not in idx_table_text[pair]:
cont = False
break
# To handle odd length pattern
if len(pattern) % 2 != 0:
if cont and text[curr_align + k - 1] == pattern[-1]:
indices.append(curr_align)
elif cont:
indices.append(curr_align)
return indices
def performance(self) -> None:
"""Compares the runtime of naive algorithm and the proposed algorithm"""
funcs = [self.naive_match, self.proposed_match]
for func in funcs:
tic = time.perf_counter()
func()
toc = time.perf_counter()
print(f"{func.__name__} took {(toc - tic):.4f}s to run.")