-
Notifications
You must be signed in to change notification settings - Fork 2
/
train_srilm_langmodel.sh
89 lines (74 loc) · 1.79 KB
/
train_srilm_langmodel.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/env bash
# Default configuration for the SRILM language-model training script.
#
# bash is required: BASEDIR below is derived from ${BASH_SOURCE[0]}, which is
# a bash-only array and a syntax error ("Bad substitution") in plain POSIX sh.
#
# Environment (both optional; a default is used when unset or empty):
#   CARMEL_DIR  prefix under which bin/carmel is looked up
#   SRILM_DIR   root of the SRILM installation
set -e

ORDER=2          # value later passed to ngram-count via -order
INPUT="file.pos" # training input file
TAGSET="ud"      # tagset code [ud,ut]
LAN_CODE="en"    # language code; used to name data and model files

# Absolute path of the directory that contains this script.
BASEDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
DATADIR="$BASEDIR/lm_data"
CODE_DIR="$BASEDIR/src/code"

# Apply defaults only when the caller did not provide a non-empty value.
# (The previous `[ -z "$CARMEL_DIR"]` lacked the space before `]`, so the
# test always errored out and the Carmel default was never applied.)
: "${CARMEL_DIR:=/usr/local}"
: "${SRILM_DIR:=$HOME/srilm-1.7.2}"
#######################################
# Parse command-line options into the configuration variables.
# Every recognized option takes exactly one value:
#   -i,      --input   FILE   -> INPUT
#   -ts,     --tagset  CODE   -> TAGSET
#   -ord,    --order   N      -> ORDER
#   -l,      --lang    CODE   -> LAN_CODE
#   -sri,    --sridir  DIR    -> SRILM_DIR
#   -carmel, --carmel  DIR    -> CARMEL_DIR
# Unrecognized arguments, and a final option that has no value, are still
# skipped (as before) but are now reported on stderr instead of vanishing
# silently, so a mistyped flag no longer falls back to a default unnoticed.
# Globals:   writes the variables listed above; also sets `key`
# Arguments: the script's command-line arguments
# Outputs:   warnings on stderr
# Returns:   0
#######################################
parse_args() {
  while [ $# -gt 0 ]; do
    key="$1"

    # First pass: classify the argument and make sure a value is available.
    case "$key" in
      -i|--input|-ts|--tagset|-ord|--order|-l|--lang|-sri|--sridir|-carmel|--carmel)
        if [ $# -lt 2 ]; then
          printf '%s: option %s needs a value; ignoring it\n' "$0" "$key" >&2
          return 0
        fi
        ;;
      *)
        printf '%s: ignoring unrecognized argument: %s\n' "$0" "$key" >&2
        shift
        continue
        ;;
    esac

    # Second pass: store the value in the matching variable.
    case "$key" in
      -i|--input)       INPUT="$2" ;;
      -ts|--tagset)     TAGSET="$2" ;;
      -ord|--order)     ORDER="$2" ;;
      -l|--lang)        LAN_CODE="$2" ;;
      -sri|--sridir)    SRILM_DIR="$2" ;;
      -carmel|--carmel) CARMEL_DIR="$2" ;;
    esac
    shift 2 # past option and its value
  done
}

parse_args "$@"
# Build the helper binaries, train the n-gram language model with SRILM and
# convert it into Carmel automata.
#
# Reads:  INPUT, TAGSET, ORDER, LAN_CODE, BASEDIR, DATADIR, CODE_DIR,
#         SRILM_DIR, CARMEL_DIR
# Writes: $DATADIR/$LAN_CODE/train.upos{,.ch} and
#         $BASEDIR/lms/$LAN_CODE.$ORDER.{lm,lm.norm,fsa.noe}

# Make the tool locations visible to child processes.
export CARMEL_DIR SRILM_DIR

# Resolve a relative input path against the invocation directory *before*
# changing directory; otherwise it would be looked up inside $CODE_DIR.
case "$INPUT" in
  /*) input_path="$INPUT" ;;
  *)  input_path="$PWD/$INPUT" ;;
esac

lm_dir="$BASEDIR/lms"
lm_file="$lm_dir/$LAN_CODE.$ORDER.lm"
train_dir="$DATADIR/$LAN_CODE"

# Compile the helpers in place; the rest of the script keeps running from
# $CODE_DIR, as it always did.
cd "$CODE_DIR" || { printf 'cannot cd to %s\n' "$CODE_DIR" >&2; exit 1; }
g++ makelmfsa.cpp -o makelmfsa
g++ makelmfsa_x.cpp -o makelmfsa_x

echo ""
echo "Lang: $LAN_CODE"

mkdir -p "$lm_dir" "$train_dir"
cp "$input_path" "$train_dir/train.upos"
# pos2char.py is addressed through $CODE_DIR: the former relative path
# `src/code/pos2char.py` was resolved from inside $CODE_DIR (= .../src/code)
# and therefore pointed at .../src/code/src/code/pos2char.py.
python3 "$CODE_DIR/pos2char.py" -ts "$TAGSET" < "$input_path" > "$train_dir/train.upos.ch"

# run LM
# (alternative smoothing option noted by the original author: -kn)
# NOTE(review): the machine-type directory i686-m64 is hard-coded; confirm it
# matches the local SRILM build.
"$SRILM_DIR/bin/i686-m64/ngram-count" -text "$train_dir/train.upos.ch" -order "$ORDER" \
  -addsmooth 1 \
  -lm "$lm_file"

# Strip empty lines from the model file. A plain BRE is enough here, so the
# GNU-only -P flag is not needed; the scratch file sits next to the model
# instead of a fixed name `temp` in the source directory.
grep -v '^$' < "$lm_file" > "$lm_file.tmp"
mv "$lm_file.tmp" "$lm_file"

# create fsa/fst
# makelmfsa is expected to leave its result in "$lm_file.wfsa", which is what
# both carmel invocations below read.
./makelmfsa "$lm_file"
"$CARMEL_DIR/bin/carmel" -n "$lm_file.wfsa" \
  > "$lm_file.norm"

# prepare Viterbi decoding
"$CARMEL_DIR/bin/carmel" --project-right --project-identity-fsa -HJ "$lm_file.wfsa" \
  > "$lm_dir/$LAN_CODE.$ORDER.fsa.noe"