-
Notifications
You must be signed in to change notification settings - Fork 301
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #38 from maxhawkins/dupe
clean up language model generation
- Loading branch information
Showing 5 changed files with 3,247 additions and 3,293 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,52 +1,84 @@ | ||
import logging | ||
import math | ||
import os | ||
import shutil | ||
import subprocess | ||
import sys | ||
import tempfile | ||
|
||
from paths import get_binary | ||
from generate_wp import language_model_from_word_sequence | ||
from metasentence import MetaSentence | ||
|
||
MKGRAPH_PATH = get_binary("mkgraph") | ||
|
||
def get_language_model(kaldi_seq, proto_langdir='PROTO_LANGDIR'): | ||
"""Generates a language model to fit the text | ||
def make_bigram_lm_fst(word_sequence):
    '''
    Use the given token sequence to make a bigram language model
    in OpenFST plain text format.

    `word_sequence` is a list of word tokens. It is not modified.

    Returns a string with one line per arc, formatted as
    "<from_id> <to_id> <word> <word> <weight>", followed by a single
    final-state line "<id> 0". Node ids start at 1 and are handed out
    in the order words are first encountered while emitting arcs.
    '''
    # Pad with out-of-vocabulary markers so the model can enter from,
    # stay in, and exit to '[oov]'.
    padded = ['[oov]', '[oov]'] + word_sequence + ['[oov]']

    # Map each word to the set of distinct words that follow it.
    bigrams = {}
    for prev_word, word in zip(padded, padded[1:]):
        bigrams.setdefault(prev_word, set()).add(word)

    node_ids = {}

    def get_node_id(word):
        # The default is evaluated before insertion, so a new word gets
        # the next unused id (1-based).
        return node_ids.setdefault(word, len(node_ids) + 1)

    # Collect lines and join once instead of growing a string with `+=`.
    lines = []
    for from_word in sorted(bigrams):
        from_id = get_node_id(from_word)

        # Every key in `bigrams` has at least one successor, so the
        # division is always safe. The cost is the negative log of a
        # uniform probability over the distinct successors.
        successors = sorted(bigrams[from_word])
        weight = -math.log(1.0 / len(successors))

        for to_word in successors:
            to_id = get_node_id(to_word)
            lines.append('%d %d %s %s %f' % (from_id, to_id, to_word, to_word, weight))

    # NOTE(review): the final state is the highest node id allocated, which
    # is not necessarily the node of the last input word -- confirm intent.
    lines.append("%d 0" % (len(node_ids)))

    return "\n".join(lines) + "\n"
|
||
def make_bigram_language_model(kaldi_seq, proto_langdir='PROTO_LANGDIR'):
    """Generates a language model to fit the text.

    Returns the filename of the generated language model FST.
    The caller is responsible for removing the generated file.

    `proto_langdir` is a path to a directory containing prototype model data
    `kaldi_seq` is a list of words within kaldi's vocabulary.

    Any exception raised while running the mkgraph binary (for example
    `subprocess.CalledProcessError` on a non-zero exit status) is
    propagated after the temporary files have been removed.
    """

    # Generate a textual FST
    txt_fst = make_bigram_lm_fst(kaldi_seq)

    # Text mode so that writing a `str` works on both Python 2 and 3.
    txt_fst_file = tempfile.NamedTemporaryFile(mode='w', delete=False)
    hclg_filename = None
    try:
        with txt_fst_file:
            txt_fst_file.write(txt_fst)

        # mkstemp creates the output file atomically, avoiding the race
        # inherent in mktemp; only the path is needed, so close the
        # descriptor right away.
        # NOTE(review): assumes mkgraph overwrites an existing output file.
        fd, hclg_filename = tempfile.mkstemp(suffix='_HCLG.fst')
        os.close(fd)

        subprocess.check_output([MKGRAPH_PATH,
                                 proto_langdir,
                                 txt_fst_file.name,
                                 hclg_filename])
    except Exception:
        # Do not leave a partial or empty graph behind. Use a bare `raise`
        # so the original traceback is kept.
        if hclg_filename is not None:
            os.unlink(hclg_filename)
        raise
    finally:
        # The textual FST is only an intermediate input for mkgraph.
        os.unlink(txt_fst_file.name)

    return hclg_filename
|
||
if __name__=='__main__':
    import sys

    # The model builder expects a list of words, not the raw file contents,
    # so split the transcript on whitespace.
    with open(sys.argv[1]) as transcript:
        words = transcript.read().split()

    # Print the path of the generated FST so the caller can find it (and
    # remove it); the file is not deleted automatically.
    print(make_bigram_language_model(words))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.