# CliticAnalyzer.py
import sys
import os
import csv

# Add the directory containing tagged_cha_reader to the Python path
module_path = '/Users/danieltregubenko/Desktop/tagged-transcript-processing-main'
if module_path not in sys.path:
    sys.path.append(module_path)

# Now the tagged_cha_reader module can be imported
import tagged_cha_reader

def contains_clitic_pronoun(tokens):
    """
    Checks whether the list of tagged tokens contains a clitic pronoun, either as a
    free pronoun token (proclitic) or attached to the end of a verb (enclitic).
    Returns a tuple (found, word, position).
    """
    clitic_pronouns = {"me", "te", "se", "lo", "la", "le", "les"}
    # Combined clitic clusters such as "melo" or "sela"
    combined_clitics = {f'{cl1}{cl2}' for cl1 in clitic_pronouns for cl2 in clitic_pronouns}
    for token in tokens:
        # Tokens have the form "word.POS"; split on the last dot to be safe
        word, pos = token.rsplit('.', 1)
        if pos == 'PRON' and word in clitic_pronouns:
            return True, word, "proclitic"
        if pos == 'VERB':
            # Check if a clitic (or clitic cluster) is attached to the end of the verb
            for clitic in clitic_pronouns.union(combined_clitics):
                if word.endswith(clitic):
                    return True, word, "enclitic"
    return False, None, None

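# Illustrative examples (not from the corpus), using the "word.POS" token format
# assumed throughout this script:
#   contains_clitic_pronoun(["lo.PRON", "quiere.VERB", "ver.VERB"])  -> (True, "lo", "proclitic")
#   contains_clitic_pronoun(["quiere.VERB", "verlo.VERB"])           -> (True, "verlo", "enclitic")
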
def has_two_verbs_with_gap(tokens):
    """
    Checks whether two verbs occur separated only by specific allowed intervening words
    (a gap of one or two tokens). Directly adjacent verb pairs are handled separately
    by analyze_adjacent_verbs.
    """
    # Allowed intervening material, keyed by the number of tokens between the verbs
    allowed_gaps = {
        1: {"a", "que", "dando"},
        2: {"a a", "de que"}
    }
    verb_positions = [i for i, token in enumerate(tokens) if token.split('.')[-1] == 'VERB']
    for i in range(len(verb_positions) - 1):
        gap_size = verb_positions[i + 1] - verb_positions[i] - 1
        if gap_size in allowed_gaps:
            gap_words = " ".join(tokens[j].split('.')[0]
                                 for j in range(verb_positions[i] + 1, verb_positions[i + 1]))
            if gap_words in allowed_gaps[gap_size]:
                return True
    return False

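# Illustrative examples (not from the corpus):
#   has_two_verbs_with_gap(["quiere.VERB", "comer.VERB"])                        -> False (adjacent; gap of 0 is not listed)
#   has_two_verbs_with_gap(["va.VERB", "a.ADP", "comer.VERB"])                   -> True  (the gap word "a" is allowed)
#   has_two_verbs_with_gap(["tiene.VERB", "ganas.NOUN", "de.ADP", "comer.VERB"]) -> False ("ganas de" is not an allowed gap)
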
def untag_sentence(sentence):
    """
    Removes the part-of-speech tags from a tagged sentence string.
    """
    return " ".join([word.split('.')[0] for word in sentence.split()])

def analyze_variable_clitics(input_dir, output_file):
    """
    Analyzes chat files for sentences with variable clitics and writes the results to a CSV.
    """
    filenames = [f for f in os.listdir(input_dir) if f.endswith('.cha') or f.endswith('.txt')]
    print(f"Found {len(filenames)} files in the directory.")
    results = []
    for file_name in filenames:
        print(f"Processing file: {file_name}")
        file_prefix = os.path.splitext(file_name)[0]
        file_path = os.path.join(input_dir, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
        # Keep only the %pos: tier, stripping the tier label
        sentences = [line[5:].strip() for line in lines if line.startswith('%pos:')]
        for sentence in sentences:
            if sentence.strip() == "":
                continue
            tokens = sentence.split()
            if has_two_verbs_with_gap(tokens):
                contains_clitic, clitic_word, clitic_position = contains_clitic_pronoun(tokens)
                if contains_clitic:
                    untagged_sentence = untag_sentence(sentence)
                    print(f"Untagged Sentence: {untagged_sentence}\nClitic Word: {clitic_word}")
                    # Record file_prefix, untagged_sentence, clitic_word, and clitic_position
                    results.append([file_prefix, untagged_sentence, clitic_word, clitic_position])
    print(f"Found {len(results)} sentences with variable clitics.")
    # Write results to a CSV in the same directory as the input
    output_path = os.path.join(input_dir, output_file)
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        # Write header with 'ID', 'Sentence', 'Clitic', and 'Clitic Position'
        writer.writerow(['ID', 'Sentence', 'Clitic', 'Clitic Position'])
        writer.writerows(results)
    print(f"Results written to {output_path}")

def analyze_adjacent_verbs(input_dir, output_file):
    """
    Analyzes chat files specifically for sentences where two verbs are directly adjacent.
    """
    clitic_pronouns = {"me", "te", "se", "lo", "la", "le", "les"}
    # Combined clitic clusters such as "melo" or "sela"; defined once here so both the
    # proclitic and enclitic checks below can use them
    combined_clitics = {f'{cl1}{cl2}' for cl1 in clitic_pronouns for cl2 in clitic_pronouns}
    filenames = [f for f in os.listdir(input_dir) if f.endswith('.cha') or f.endswith('.txt')]
    print(f"Found {len(filenames)} files in the directory for adjacent verbs analysis.")
    results = []
    for file_name in filenames:
        print(f"Processing file: {file_name}")
        file_prefix = os.path.splitext(file_name)[0]
        file_path = os.path.join(input_dir, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
        # Keep only the %pos: tier, stripping the tier label
        sentences = [line[5:].strip() for line in lines if line.startswith('%pos:')]
        for sentence in sentences:
            if sentence.strip() == "":
                continue
            tokens = sentence.split()
            verb_positions = [i for i, token in enumerate(tokens) if token.split('.')[-1] == 'VERB']
            adjacent_pairs = []
            for i in range(len(verb_positions) - 1):
                if verb_positions[i + 1] - verb_positions[i] == 1:
                    adjacent_pairs.append((verb_positions[i], verb_positions[i + 1]))
                    print(f"Adjacent Verbs Found: {tokens[verb_positions[i]].split('.')[0]}, "
                          f"{tokens[verb_positions[i + 1]].split('.')[0]}")
            for first_verb_idx, second_verb_idx in adjacent_pairs:
                # Check if there is a free clitic pronoun right before the first verb (proclitic)
                if first_verb_idx > 0:
                    clitic_word, clitic_pos = tokens[first_verb_idx - 1].rsplit('.', 1)
                    if clitic_pos == 'PRON' and clitic_word in clitic_pronouns:
                        untagged_sentence = untag_sentence(sentence)
                        print(f"Untagged Sentence: {untagged_sentence}\nClitic Word: {clitic_word}")
                        results.append([file_prefix, untagged_sentence, clitic_word, "proclitic"])
                # Check if the second verb has a clitic (or clitic cluster) attached (enclitic)
                verb2_word = tokens[second_verb_idx].rsplit('.', 1)[0]
                for clitic in clitic_pronouns.union(combined_clitics):
                    if verb2_word.endswith(clitic):
                        untagged_sentence = untag_sentence(sentence)
                        print(f"Untagged Sentence: {untagged_sentence}\nClitic Word: {verb2_word}")
                        results.append([file_prefix, untagged_sentence, verb2_word, "enclitic"])
                        break
    print(f"Found {len(results)} sentences with adjacent verbs having variable clitics.")
    # Append results to the CSV created by analyze_variable_clitics
    output_path = os.path.join(input_dir, output_file)
    with open(output_path, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(results)
    print(f"Results written to {output_path}")

if __name__ == '__main__':
    # Update this path to point at the directory containing the tagged chat files
    input_dir = '/Users/danieltregubenko/Desktop/Clitic'
    output_file = 'variable_clitic_results.csv'
    # First pass: verb pairs separated by an allowed gap
    analyze_variable_clitics(input_dir, output_file)
    # Second pass: directly adjacent verb pairs
    analyze_adjacent_verbs(input_dir, output_file)