-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathal_extraction.py
73 lines (55 loc) · 1.95 KB
/
al_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
def count_adp(text, adp):
    """Count the ADP-tagged occurrences of *adp* in a tagged text.

    *text* is split on whitespace and every token is split on ".".  A token
    counts when its second dot-separated field is exactly "ADP" and its
    first field, lowercased, equals *adp*.  Only the token side is
    lowercased, so *adp* has to be given in lowercase to match anything.
    """
    matches = 0
    for word in text.split():
        fields = word.split(".")
        # Skip untagged tokens and tokens carrying any other tag.
        if len(fields) < 2 or fields[1] != "ADP":
            continue
        if fields[0].lower() == adp:
            matches += 1
    return matches
def detect_a_or_al(token):
    """Return True if *token* is an ADP-tagged "a" or "al".

    The token is split on "."; the second field must be exactly "ADP" and
    the first field is compared case-insensitively.  Tokens without a tag
    yield False.
    """
    fields = token.split(".")
    if len(fields) < 2 or fields[1] != "ADP":
        return False
    return fields[0].lower() in ("a", "al")
def is_verb(token):
    """Return True if the second dot-separated field of *token* is "VERB"."""
    fields = token.split(".")
    if len(fields) < 2:
        # No tag attached to the token.
        return False
    return fields[1] == "VERB"
def is_noun(token):
    """Return True if *token* is tagged "NOUN", "PRON" or "PRO".

    Despite the name, the two pronoun tags are accepted as well.  The tag
    is the second dot-separated field; untagged tokens yield False.
    """
    fields = token.split(".")
    if len(fields) < 2:
        return False
    return fields[1] in ("NOUN", "PRON", "PRO")
def get_text(token):
    """Return the part of *token* before its first ".".

    A token without a "." is returned unchanged.
    """
    head, _, _ = token.partition(".")
    return head
def get_pos(token):
    """Return the second dot-separated field of *token*, or "" if absent."""
    try:
        return token.split(".")[1]
    except IndexError:
        # The token contains no "." and therefore carries no tag.
        return ""
def extract_als(text):
    """Extract the phrase around every ADP-tagged "a"/"al" in a tagged text.

    *text* is split on whitespace into tokens.  For each token accepted by
    ``detect_a_or_al`` the phrase covers:

    * leftwards, every token back to and including the nearest token for
      which ``is_verb`` is true, or back to the first token of the text
      when no verb precedes the adposition;
    * the adposition itself;
    * rightwards, every token up to and including the first token for
      which ``is_noun`` is true, or up to the end of the text when no such
      token follows.

    Returns a list with one string per detected adposition, in order of
    appearance.  Each string holds the tag-less token texts (``get_text``)
    joined by single spaces, with no trailing space even when the phrase
    is not closed by a noun.
    """
    tokens = text.split()
    results = []
    for i, token in enumerate(tokens):
        if not detect_a_or_al(token):
            continue

        # Move left to the nearest verb.  Stopping at index 0 regardless of
        # its tag keeps the whole prefix when no verb is found.
        start = i
        while start > 0 and not is_verb(tokens[start]):
            start -= 1

        # Move right to the first noun/pronoun; `end` equals len(tokens)
        # when none follows, which the slice below tolerates.
        end = i + 1
        while end < len(tokens) and not is_noun(tokens[end]):
            end += 1

        # Join once instead of growing the string token by token; this is
        # also what guarantees the absence of a trailing separator.
        phrase = tokens[start:end + 1]
        results.append(" ".join(get_text(part) for part in phrase))
    return results
if __name__ == "__main__":
    # Manual smoke run: load one tagged transcript through the project's
    # reader module and print every extracted "a"/"al" phrase.
    import tagged_cha_reader

    sample_path = "input/103_spanish.cha"
    tagged_text = tagged_cha_reader.get_text(sample_path)
    print(extract_als(tagged_text))