forked from TsangLab/mycoSORT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
config-sample.cfg
164 lines (164 loc) · 3.46 KB
/
config-sample.cfg
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#################################################
#
#
# Configuration file for mycoSORT
#
#
##################################################
########################### DIRECTORIES ##########
# project home
HOME_DIR=/home/usr/mycosort-pck-version/
#
# corpus directory
CORPUS_DIR=corpus/
#
# duplicate documents directory
DUP_DIR=test/
#
# positive instances directory
POS_DIR=positives/
#
# negative instances directory
NEG_DIR=negatives/
#
# train directory
TRAIN_DIR=train/
#
# test directory
TEST_DIR=test/
#
# feature directory
FEATURE_DIR=features/
#
# output directory for arff files
OUTPUT_MODEL=arff/
#
#################################################
###################### CORPUS SAMPLING ##########
# true if training set must be sampled
SAMPLE_TRAIN=false
#
# true if test set must be sampled
SAMPLE_TEST=false
#
# percentage of the whole collection to use as the test corpus
PERCT_TEST=15
#
# percentage of positive instances in the training set
PERCT_POS_TRAIN=50
#
# percentage of positive instances in the test set
PERCT_POS_TEST=10
#
#################################################
########################## INPUT FILES ##########
# training file
TRAINING_FILE=/triagecorpus_train.xml
#
# test file
TEST_FILE=/triagecorpus_test.xml
#
# arff training file
ARFF_TRAIN=triage0.arff
#
# arff testing file
ARFF_TEST=triage1.arff
#
# stopwords list
STOP_LIST=stopList.txt
#
##################################################
########################## OUTPUT FILES ##########
# EC numbers feature list
ECNUM_FEATURES=ecnumbers.txt
#
# Journal title feature list
JOURNAL_TITLE_FEATURES=journaltitles.txt
#
# Abstract annotations feature list
ANNOTATION_FEATURES=annotations.txt
#
# Paper title annotations feature list
TITLE_FEATURES=titleAnnotations.txt
#
# Abstract ngrams feature list
NGRAM_FEATURES=ngrams_features.txt
#
# Paper title n-grams feature list
TITLE_NGRAMS=titleGrams.txt
#
# Paper ID and class
DOC_IDS=docIDs.txt
#
###################################################
########################## FEATURE SETUP ##########
# Extract size of abstract and title
USE_TEXT_SIZE=false
#
# Extract Journal of publication
USE_JOURNAL_TITLE_FEATURE=false
#
# Extract EC Numbers
USE_ECNUM_FEATURE=true
#
# minimum frequency to consider a feature
FEATURE_MIN_FREQ=2
#
# minimum length (in chars) to consider a feature
FEATURE_MIN_LENGTH=3
#
# extract document IDs
USE_DOC_ID=true
#
#############################
######### ANNOTATIONS #######
# Extract annotation content
USE_ANNOTATION_FEATURE=true
#
# Extract annotation entities
USE_ANNOTATION_TYPE=true
#
# Extract annotations from title separately
USE_TITLE_FEATURE=false
#
#############################
########## N-GRAMS ##########
# Extract ngrams
USE_NGRAM_FEATURE=false
#
# Extract ngrams from title separately
USE_TITLE_NGRAMS=false
#
# Apply the stopwords list to ngrams
NGRAM_STOP=true
#
# Define size of extracted n-grams
NGRAM_SIZE=1
#
# Apply weights to ngrams
#USE_WEIGHTED_NGRAM=false
#
# Define weight of features
#WEIGHT=3
#
###################################################
########################## FEATURE SELECTION SETUP ##########
# Enable Odds Ratio (OR) filtering
USE_ODDS_RATIO=false
#
# Define minimum OR threshold to keep attribute
OR_THRESHOLD=1
#
# Enable inverse document frequency (IDF) filtering
USE_IDF=false
#
# Define minimum IDF threshold to keep attribute
IDF_THRESHOLD=1
#
#################################################
########################### TASK SETUP ##########
# experiment type : train = 0 / test = 1
EXP_TYPE=0
#
# limit the number of parameters: a positive value keeps that many (top) parameters, -1 uses the whole file
NB_PARAMS=-1