#!/bin/bash
# Usage: ./eval STEP_SIZE EM_ITER OUTPUT [EN|SANS]
# where OUTPUT is the destination file for the scores,
# STEP_SIZE is the number of lines of extra unlabeled data added at each step
# (e.g. with STEP_SIZE=100 the tagger is run on 0,100,200,300,...,39672 lines),
# and EM_ITER is the number of iterations of EM to perform.
#
# Runs on English by default; pass SANS as the fourth argument for Sanskrit.
#
# Watch the tagger output in real-time with: tail -f debug/semisupervised_stdout.txt
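#
# Example invocation (the output filename here is only illustrative):
#   ./eval 100 1 en_scores.txt
# re-trains the semi-supervised English tagger every 100 lines of unlabeled data
# with 1 iteration of EM and appends one "<unlabeled lines> <score>" row to
# en_scores.txt per run.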
if [ $# -ne "3" ] && [ $# -ne "4" ]; then
    echo "usage: eval STEP_SIZE EM_ITER OUTPUT (EN|SANS)"
    echo "Evaluate accuracy of semi-supervised HMM as a function of unlabeled corpus size."
    echo ""
    echo "Arguments:"
    echo "  STEP_SIZE: Num. of lines data is increased by on each iteration."
    echo "  EM_ITER:   Num. of iterations of EM"
    echo "  OUTPUT:    Path to output table of corpus size vs. accuracy"
    echo "  (EN|SANS): Optional. Either EN (English) or SANS (Sanskrit). Defaults to EN."
    exit 1
fi
# parameters to this script:
step_size=$1
em_iter=$2
output=$3
OUTPUT_DIR="debug" # where should the output from this program go?
# parameters to the tagger:
TRAIN="data/en/wsj2-21-trunc.txt" # labeled training corpus
TEST="data/en/wsj22-trunc.txt" # labeled test corpus
NOTAGS="data/en/wsj2-21-notags.txt" # unlabeled training corpus
OUT="data/output.txt" # tagged output file (temporary)
EXTRA="data/tmp.txt" # corpus of determined length (temporary)
iter=$em_iter
lang="EN"
# if a 4th arg is passed
if [ $# -eq "4" ]; then
    if [[ "${4}" == "SANS" ]]; then # check if we should be in Sanskrit mode
        TRAIN="data/sans/train.txt"
        TEST="data/sans/test.txt"
        NOTAGS="data/sans/GRETILNoTagsTrain.txt"
        lang="SANS"
    elif [[ "${4}" != "EN" ]]; then
        echo "$4 is not a valid language option."
        exit 1
    fi
fi
tsize=( $( wc -l ${TRAIN}) ) # get size of training data
testsize=( $( wc -l ${TEST}) ) # get size of test data
len=( $( wc -l ${NOTAGS}) ) # make into array, use only first word (the length)
# make sure debug directory exists, create it if not
if [ ! -d "${OUTPUT_DIR}" ]; then
    mkdir "${OUTPUT_DIR}"
fi
# remove old output file (check for confirm):
if [ -e ${output} ]; then
    rm -i ${output}
    if [ $? -ne "0" ]; then
        exit 1 # exit on error
    fi
fi
# remove old tmp file
if [ -e ${EXTRA} ]; then
    rm -f ${EXTRA}
fi
trap "kill -9 $$" SIGINT SIGTERM # catch these signals to kill tagger and this script
echo "Running ${lang} tagger based on:"
printf "\t• ${tsize[0]} lines labeled training data, ${testsize[0]} lines of test data.\n"
printf "\t• Up to ${len[0]} lines unlabeled training data, incrementing by ${step_size} lines.\n"
printf "\t• ${iter} iterations of EM.\n"
printf "\nSupervised baseline accuracy : "
./tagger.py --lang ${lang} --model super --train ${TRAIN} --test ${TEST} --output ${OUT} >"${OUTPUT_DIR}/supervised_stdout.txt" 2>"${OUTPUT_DIR}/supervised_stderr.txt"
printf "$( ./score.py --lang ${lang} ${TEST} ${OUT} )\n"
printf "\nSemi-supervised accuracy:\n"
# i is the size of the data/tmp.txt file (in no. lines)
for i in $( seq 0 ${step_size} ${len[0]} ); do
    printf " ${i} unlabeled lines: "
    if [ $i -eq 0 ]; then
        touch ${EXTRA}
    else
        head -n $i ${NOTAGS} > ${EXTRA}
    fi
    ./tagger.py --lang ${lang} --model semisuper --iter $iter --train ${TRAIN} --test ${TEST} --output ${OUT} --extra ${EXTRA} >"${OUTPUT_DIR}/semisupervised_stdout.txt" 2>"${OUTPUT_DIR}/semisupervised_stderr.txt"
    score=$( ./score.py --lang ${lang} ${TEST} ${OUT} )
    printf "${score}\n"
    # log to file:
    echo "${i} ${score}" >> ${output}
done
# cleanup:
rm -f ${OUT} ${EXTRA}
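
# A quick way to inspect the resulting OUTPUT table (one "<unlabeled lines> <score>"
# row per run) is, for example:
#   column -t en_scores.txt
# where en_scores.txt stands in for whatever OUTPUT path was passed above.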