-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathanalysis.py
84 lines (63 loc) · 2.45 KB
/
analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
"""
A module for analyzing text tagged in the following format: 'text.POS'.
"""
def count_words(text: str) -> int:
    """
    Given a tagged text, count all the words (tokens other than punctuation)
    in it.

    Tokens are separated by whitespace and are expected to look like
    'text.POS'. The tag is whatever follows the LAST '.', so a token whose
    text itself contains a '.' (for example the full stop '..PUNCT') is
    still read correctly.

    Args:
        text: The tagged text to analyse.

    Returns:
        The number of tokens whose tag is not 'PUNCT'. Tokens that carry no
        tag at all are counted as words, since nothing marks them as
        punctuation. Empty input yields 0.
    """
    counter = 0
    # split() with no argument drops the empty tokens that repeated spaces
    # (or an empty text) would otherwise produce.
    for token in text.split():
        _, sep, tag = token.rpartition(".")
        # An empty separator means the token has no '.', i.e. it is untagged.
        if not sep or tag != "PUNCT":
            counter += 1
    return counter
def count_pos(text: str, word: str, pos: str = None) -> int:
    """
    Given a tagged text, a word, and optionally a part of speech,
    count all matching tokens. If no part of speech is provided, count
    every occurrence of the word.

    Tokens are separated by whitespace and are expected to look like
    'text.POS'. The tag is whatever follows the LAST '.', so words that
    contain a '.' themselves ('..PUNCT', '3.14.NUM') can be matched.

    Args:
        text: The tagged text to search.
        word: The word to look for; compared case-insensitively.
        pos: Optional tag the word must carry; compared case-sensitively.
            A falsy value (None or '') disables the tag filter.

    Returns:
        The number of matching tokens. A token without any '.' is untagged:
        it can match when no part of speech is requested, but never when
        one is.
    """
    # Token words are lowercased below, so the query must be lowercased too;
    # otherwise a capitalised query could never match anything.
    target = word.lower()
    counter = 0
    for token in text.split():
        head, sep, tail = token.rpartition(".")
        if sep:
            current_word, current_pos = head.lower(), tail
        else:
            # Untagged token: the whole token is the word.
            current_word, current_pos = token.lower(), None
        if current_word != target:
            continue
        if not pos or pos == current_pos:
            counter += 1
    return counter
def count_name(text: str, name: str) -> int:
    """
    Given a tagged text and a whitespace-separated name,
    count all occurrences of that name.

    A name occurs wherever its parts appear as consecutive tokens of the
    text. Only the portion of each token before its first '.' is compared,
    so both 'smith.PROPN' and a sentence-final 'Smith.' match the name part
    'Smith'. Comparison is case-insensitive.

    Args:
        text: The tagged text to search; tokens are separated by whitespace.
        name: The name to look for, e.g. 'John Smith'.

    Returns:
        The number of positions at which the full name occurs. An empty or
        whitespace-only name occurs 0 times.
    """
    name_parts = [part.lower() for part in name.split()]
    if not name_parts:
        # Nothing to look for; without this guard an empty name would crash.
        return 0

    # Normalise every token once instead of re-splitting inside the scan.
    words = [token.partition(".")[0].lower() for token in text.split()]
    width = len(name_parts)

    # Slide a window of len(name_parts) over the words; the range is empty
    # when the text is shorter than the name.
    return sum(
        words[start:start + width] == name_parts
        for start in range(len(words) - width + 1)
    )
if __name__ == "__main__":
    # Manual smoke check: print how many times the name occurs in a sample
    # that mixes tagged, untagged and differently-cased tokens.
    sample_text = "John.blah smith.blah blah blah blah john smith blah. blah blah blah blah. John Smith blah a b blah john Smith."
    sample_name = "John Smith"
    print(count_name(sample_text, sample_name))