-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkana.rb
executable file
·242 lines (205 loc) · 5.67 KB
/
kana.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
#!/usr/bin/ruby
# coding: utf-8
#
# == NAME
# kana.rb
#
# == USAGE
# require_relative 'kana'
# puts PhoneticSentence.new('彼はいちごケーキが大好きです。').kana
#
# == DESCRIPTION
# A script that makes phonetic readings of Japanese sentences.
# This is a library. Do not call it from the command line.
#
# == AUTHOR
# Douglas Perkins - https://dperkins.org - https://microca.st/dper
# Default location of the MeCab shared library. Only applied when the caller
# has not already set MECAB_PATH, so an existing environment setting wins.
ENV['MECAB_PATH'] ||= '/usr/lib/x86_64-linux-gnu/libmecab.so.2'
require 'natto'
require 'nkf'
# A Japanese sentence and its phonetic reading.
#
# The sentence is split into tokens by MeCab (through Natto), each token is
# converted to hiragana where a reading is available, and spaces are inserted
# between tokens according to simple part-of-speech rules.
class PhoneticSentence
  # A parsed sentence produces tokens, and each token has a character type.
  # Reference: https://bitbucket.org/buruzaemon/natto/wiki/Node-Parsing-char_type
  # Reference: http://d.hatena.ne.jp/NE555/20120107
  CHAR_TYPE_DEFAULT = 0
  CHAR_TYPE_SPACE = 1
  CHAR_TYPE_KANJI = 2
  CHAR_TYPE_SYMBOL = 3
  CHAR_TYPE_NUMERIC = 4
  CHAR_TYPE_ALPHA = 5
  CHAR_TYPE_HIRAGANA = 6
  CHAR_TYPE_KATAKANA = 7
  CHAR_TYPE_KANJINUMERIC = 8
  CHAR_TYPE_GREEK = 9
  CHAR_TYPE_CYRILLIC = 10

  # NKF options: katakana -> hiragana (-h1), UTF-8 input (-W), UTF-8 output (-w).
  # The input encoding is stated explicitly so that very short readings are
  # not subject to NKF's encoding guesswork.
  NKF_TO_HIRAGANA = '-h1 -W -w'

  # Placeholder found in the reading field when no reading is available.
  UNKNOWN_READING = '*'

  # Trailing characters that make a numeric token take its dictionary reading.
  NUMERIC_COUNTERS = ['番', '月'].freeze

  attr_accessor :japanese # The Japanese sentence.
  attr_accessor :kana # The phonetic reading.

  # Returns the token's grammatical part of speech, i.e. the first
  # comma-separated field of its feature string.
  def pos(token)
    token.feature.split(',').first
  end

  # Returns the token's detailed part of speech, i.e. the second
  # comma-separated field of its feature string (nil if absent).
  def detail(token)
    token.feature.split(',')[1]
  end

  # Returns the token to the left of the argument, or nil if there is none
  # (the token is first, or is not part of this sentence).
  def find_left(token)
    index = token_index(token)
    return nil if index.nil? || index.zero?

    @tokens[index - 1]
  end

  # Returns the token to the right of the argument, or nil if there is none
  # (the token is last, or is not part of this sentence).
  def find_right(token)
    index = token_index(token)
    return nil if index.nil?

    # Indexing one past the end of the array yields nil.
    @tokens[index + 1]
  end

  # Returns true iff a blank space should go before the token.
  def pad(token)
    left = find_left(token)

    # A leading token needs no lead spacing.
    return false if left.nil?

    left_pos = pos(left)

    # No space is needed after punctuation.
    return false if left_pos == '記号'

    sub_type = detail(token)

    # Consider what part of speech it and the preceding token are.
    case pos(token)
    when '名詞'
      # Consecutive numerals and noun suffixes attach to what precedes them.
      return false if sub_type == '数' && detail(left) == '数'
      return false if sub_type == '接尾'
    when '動詞'
      return false if sub_type == '非自立'
      return false if left_pos == '動詞'
    when '助動詞'
      return false if ['動詞', '助動詞'].include?(left_pos)
    when '助詞'
      # Consecutive particles stay together. This depends only on the token
      # to the left, so it also applies to a sentence-final particle.
      return false if left_pos == '助詞'
      return false if sub_type == '接続助詞' && left_pos == '動詞'
    when '記号'
      return false
    end

    true
  end

  # Returns kana for a given token, preceded by a space when #pad says so.
  # Tokens without a usable reading are passed through as their surface text.
  def token_to_kana(token)
    surface = token.surface

    text =
      case token.char_type
      when CHAR_TYPE_KANJI, CHAR_TYPE_KANJINUMERIC, CHAR_TYPE_HIRAGANA
        reading_or_surface(token)
      when CHAR_TYPE_NUMERIC
        # Numbers stay as digits unless the token ends with a known counter.
        NUMERIC_COUNTERS.include?(surface[-1]) ? reading_or_surface(token) : surface
      else
        surface
      end

    # Put space between some tokens.
    pad(token) ? " #{text}" : text
  end

  # Makes kana for the Japanese sentence, stores it in @kana and returns it.
  def make_kana
    @kana = @tokens.map { |token| token_to_kana(token) }.join
  end

  # Parses the Japanese into @tokens, leaving out the BOS/EOS marker tokens.
  def parse
    tokens = []
    Natto::MeCab.new.parse(@japanese) do |token|
      tokens << token unless pos(token) == 'BOS/EOS'
    end
    @tokens = tokens
  end

  # Makes a PhoneticSentence from a copy of the given string; the sentence
  # is parsed and its kana reading generated immediately.
  def initialize(japanese)
    @japanese = japanese.dup
    parse
    make_kana
  end

  private

  # Position of this exact token object in the sentence, or nil if absent.
  # Identity is used so that equal-looking tokens are never confused.
  def token_index(token)
    @tokens.index { |candidate| candidate.equal?(token) }
  end

  # Hiragana for the token's reading (second-to-last feature field), or the
  # token's surface when that field is missing or holds the '*' placeholder.
  def reading_or_surface(token)
    katakana = token.feature.split(',')[-2]
    return token.surface if katakana.nil? || katakana == UNKNOWN_READING

    NKF.nkf(NKF_TO_HIRAGANA, katakana)
  end
end
# Manual smoke test: prints each sample sentence, then the kana reading
# produced for it by PhoneticSentence, then a blank line. Nothing is asserted;
# the output is meant to be inspected by eye.
def testPhoneticSentence
  samples = [
    '彼はいちごケーキが大好きです。',
    'どうぞよろしくお願いします。',
    'あなたは猫を飼っているよね。',
    '今更どうしようもない事だ。',
    '私は1982年に生まれました。',
    '私は昨夜、遅くまで起きていた。',
    '私は1982年に生まれました。',
    '「トムとメアリーが離婚するって聞いたよ。」「それは噂だよ。」',
    '損害は千ドルと見積もりしています。',
    'なんでにゃんにゃん言ってるの?',
    '彼女は2人姉妹がいます。',
    'この顔にピンときたら110番!',
    'この顔にピンときたら110番!',
    '努力したが何の成果も得られなかった。',
    '黄色いレインコートを着ている女の子はだれですか。',
    'こんな暖かい陽気は2月にしては異常だ。',
    'こんな暖かい陽気は2月にしては異常だ。'
  ]

  samples.each do |original|
    # The input line is printed before parsing, so it is visible even if
    # building the reading fails.
    puts "漢字: #{original}"
    phonetic = PhoneticSentence.new(original)
    puts "かな: #{phonetic.kana}"
    puts ''
  end
end
# Uncomment this line for testing.
#testPhoneticSentence