-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
194 lines (156 loc) · 5.64 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
import re
import json
import numpy as np
def get_hparams_from_file(config_path):
with open(config_path, "r", encoding="utf-8") as f:
data = f.read()
config = json.loads(data)
hparams = HParams(**config)
return hparams
class HParams:
def __init__(self, **kwargs):
for k, v in kwargs.items():
if type(v) == dict:
v = HParams(**v)
self[k] = v
def keys(self):
return self.__dict__.keys()
def items(self):
return self.__dict__.items()
def values(self):
return self.__dict__.values()
def __len__(self):
return len(self.__dict__)
def __getitem__(self, key):
return getattr(self, key)
def __setitem__(self, key, value):
return setattr(self, key, value)
def __contains__(self, key):
return key in self.__dict__
def __repr__(self):
return self.__dict__.__repr__()
def string_to_bits(string, pad_len=8):
# Convert each character to its ASCII value
ascii_values = [ord(char) for char in string]
# Convert ASCII values to binary representation
binary_values = [bin(value)[2:].zfill(8) for value in ascii_values]
# Convert binary strings to integer arrays
bit_arrays = [[int(bit) for bit in binary] for binary in binary_values]
# Convert list of arrays to NumPy array
numpy_array = np.array(bit_arrays)
numpy_array_full = np.zeros((pad_len, 8), dtype=numpy_array.dtype)
numpy_array_full[:, 2] = 1
max_len = min(pad_len, len(numpy_array))
numpy_array_full[:max_len] = numpy_array[:max_len]
return numpy_array_full
def bits_to_string(bits_array):
# Convert each row of the array to a binary string
binary_values = [''.join(str(bit) for bit in row) for row in bits_array]
# Convert binary strings to ASCII values
ascii_values = [int(binary, 2) for binary in binary_values]
# Convert ASCII values to characters
output_string = ''.join(chr(value) for value in ascii_values)
return output_string
def split_sentence(text, min_len=10, language_str='[EN]'):
if language_str in ['EN']:
sentences = split_sentences_latin(text, min_len=min_len)
else:
sentences = split_sentences_zh(text, min_len=min_len)
return sentences
def split_sentences_latin(text, min_len=10):
"""Split Long sentences into list of short ones
Args:
str: Input sentences.
Returns:
List[str]: list of output sentences.
"""
# deal with dirty sentences
text = re.sub('[。!?;]', '.', text)
text = re.sub('[,]', ',', text)
text = re.sub('[“”]', '"', text)
text = re.sub('[‘’]', "'", text)
text = re.sub(r"[\<\>\(\)\[\]\"\«\»]+", "", text)
text = re.sub('[\n\t ]+', ' ', text)
text = re.sub('([,.!?;])', r'\1 $#!', text)
# split
sentences = [s.strip() for s in text.split('$#!')]
if len(sentences[-1]) == 0: del sentences[-1]
new_sentences = []
new_sent = []
count_len = 0
for ind, sent in enumerate(sentences):
# print(sent)
new_sent.append(sent)
count_len += len(sent.split(" "))
if count_len > min_len or ind == len(sentences) - 1:
count_len = 0
new_sentences.append(' '.join(new_sent))
new_sent = []
return merge_short_sentences_latin(new_sentences)
def merge_short_sentences_latin(sens):
"""Avoid short sentences by merging them with the following sentence.
Args:
List[str]: list of input sentences.
Returns:
List[str]: list of output sentences.
"""
sens_out = []
for s in sens:
# If the previous sentence is too short, merge them with
# the current sentence.
if len(sens_out) > 0 and len(sens_out[-1].split(" ")) <= 2:
sens_out[-1] = sens_out[-1] + " " + s
else:
sens_out.append(s)
try:
if len(sens_out[-1].split(" ")) <= 2:
sens_out[-2] = sens_out[-2] + " " + sens_out[-1]
sens_out.pop(-1)
except:
pass
return sens_out
def split_sentences_zh(text, min_len=10):
text = re.sub('[。!?;]', '.', text)
text = re.sub('[,]', ',', text)
# 将文本中的换行符、空格和制表符替换为空格
text = re.sub('[\n\t ]+', ' ', text)
# 在标点符号后添加一个空格
text = re.sub('([,.!?;])', r'\1 $#!', text)
# 分隔句子并去除前后空格
# sentences = [s.strip() for s in re.split('(。|!|?|;)', text)]
sentences = [s.strip() for s in text.split('$#!')]
if len(sentences[-1]) == 0: del sentences[-1]
new_sentences = []
new_sent = []
count_len = 0
for ind, sent in enumerate(sentences):
new_sent.append(sent)
count_len += len(sent)
if count_len > min_len or ind == len(sentences) - 1:
count_len = 0
new_sentences.append(' '.join(new_sent))
new_sent = []
return merge_short_sentences_zh(new_sentences)
def merge_short_sentences_zh(sens):
# return sens
"""Avoid short sentences by merging them with the following sentence.
Args:
List[str]: list of input sentences.
Returns:
List[str]: list of output sentences.
"""
sens_out = []
for s in sens:
# If the previous sentense is too short, merge them with
# the current sentence.
if len(sens_out) > 0 and len(sens_out[-1]) <= 2:
sens_out[-1] = sens_out[-1] + " " + s
else:
sens_out.append(s)
try:
if len(sens_out[-1]) <= 2:
sens_out[-2] = sens_out[-2] + " " + sens_out[-1]
sens_out.pop(-1)
except:
pass
return sens_out