# config-sample.cfg

#################################################
#
#
#  Configuration file for mycoSORT
#
#
##################################################
########################### DIRECTORIES ##########
# project home
HOME_DIR=/home/usr/mycosort-pck-version/
#
# corpus directory
CORPUS_DIR=corpus/
#
# duplicate documents directory 
DUP_DIR=test/
#
# positive instances directory
POS_DIR=positives/
#
# negative instances directory 
NEG_DIR=negatives/
#
# train directory
TRAIN_DIR=train/
#
# test directory
TEST_DIR=test/
#
# feature directory
FEATURE_DIR=features/
#
# output directory for arff files
OUTPUT_MODEL=arff/
#
#################################################
###################### CORPUS SAMPLING ##########
# true if training set must be sampled
SAMPLE_TRAIN=false
#
# true if test set must be sampled
SAMPLE_TEST=false
#
# size of the test corpus, as a percentage of the whole collection
PERCT_TEST=15
#
# percentage of positive instances in the training set
PERCT_POS_TRAIN=50
#
# percentage of positive instances in the test set
PERCT_POS_TEST=10
#
#################################################
########################## INPUT FILES ##########
# training file
TRAINING_FILE=/triagecorpus_train.xml
#
# test file
TEST_FILE=/triagecorpus_test.xml
#
# arff training file
ARFF_TRAIN=triage0.arff
#
# arff testing file
ARFF_TEST=triage1.arff
#
# stopwords list
STOP_LIST=stopList.txt
#
##################################################
########################## OUTPUT FILES ##########
# EC numbers feature list
ECNUM_FEATURES=ecnumbers.txt
#
# Journal title feature list
JOURNAL_TITLE_FEATURES=journaltitles.txt
#
# Abstract annotations feature list
ANNOTATION_FEATURES=annotations.txt
#
# Paper title annotations feature list
TITLE_FEATURES=titleAnnotations.txt
#
# Abstract ngrams feature list
NGRAM_FEATURES=ngrams_features.txt
#
# Paper title n-grams feature list
TITLE_NGRAMS=titleGrams.txt
#
# Paper ID and class
DOC_IDS=docIDs.txt
#
###################################################
########################## FEATURE SETUP ##########
# Extract size of abstract and title 
USE_TEXT_SIZE=false
#
# Extract Journal of publication 
USE_JOURNAL_TITLE_FEATURE=false
#
# Extract EC Numbers
USE_ECNUM_FEATURE=true
#
# minimum frequency to consider a feature
FEATURE_MIN_FREQ=2
#
# minimum length (in chars) to consider a feature
FEATURE_MIN_LENGTH=3
#
# extract document IDs
USE_DOC_ID=true
#
#############################
######### ANNOTATIONS #######
# Extract annotation content
USE_ANNOTATION_FEATURE=true
#
# Extract annotation entities
USE_ANNOTATION_TYPE=true
#
# Extract annotations from title separately
USE_TITLE_FEATURE=false
#
#############################
########## N-GRAMS ##########
# Extract ngrams 
USE_NGRAM_FEATURE=false
#
# Extract ngrams from title separately
USE_TITLE_NGRAMS=false
#
# Use the stopwords list when extracting n-grams
NGRAM_STOP=true
#
# Define size of extracted n-grams
NGRAM_SIZE=1
#
# Apply weights to ngrams
#USE_WEIGHTED_NGRAM=false
#
# Define weight of features
#WEIGHT=3
#
###################################################
########################## FEATURE SELECTION SETUP ##########
# Enable Odds Ratio (OR) filtering
USE_ODDS_RATIO=false
#
# Define minimum OR threshold to keep attribute 
OR_THRESHOLD=1
#
# Enable inverse document frequency (IDF) filtering
USE_IDF=false
#
# Define minimum IDF threshold to keep attribute
IDF_THRESHOLD=1
#
#################################################
########################### TASK SETUP ##########
# experiment type: 0 = train, 1 = test
EXP_TYPE=0
#
# limit the number of parameters: a positive value keeps only that many (top) entries, -1 uses the whole file
NB_PARAMS=-1