Skip to content

Commit

Permalink
Added features for normalization of Wopr scores
Browse files Browse the repository at this point in the history
  • Loading branch information
mhkuu committed Jun 6, 2018
1 parent e6c42dc commit d67ca97
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 60 deletions.
32 changes: 26 additions & 6 deletions include/tscan/stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -366,12 +366,22 @@ struct structStats: public basicStats {
lemma_freq_log_n(NAN),
lemma_freq_log_strict(NAN),
lemma_freq_log_n_strict(NAN),
avg_prob10_fwd(NAN),
avg_prob10_bwd(NAN),
entropy_fwd(NAN),
entropy_bwd(NAN),
perplexity_fwd(NAN),
perplexity_bwd(NAN),
avg_prob10_fwd(0),
avg_prob10_fwd_content(0),
avg_prob10_fwd_ex_names(0),
avg_prob10_fwd_content_ex_names(0),
avg_prob10_bwd(0),
avg_prob10_bwd_content(0),
avg_prob10_bwd_ex_names(0),
avg_prob10_bwd_content_ex_names(0),
entropy_fwd(0),
entropy_fwd_norm(0),
entropy_bwd(0),
entropy_bwd_norm(0),
perplexity_fwd(0),
perplexity_fwd_norm(0),
perplexity_bwd(0),
perplexity_bwd_norm(0),
al_gem(NAN),
al_max(NAN),
intensCnt(0),
Expand Down Expand Up @@ -690,11 +700,21 @@ struct structStats: public basicStats {
double lemma_freq_log_strict;
double lemma_freq_log_n_strict;
double avg_prob10_fwd;
double avg_prob10_fwd_content;
double avg_prob10_fwd_ex_names;
double avg_prob10_fwd_content_ex_names;
double avg_prob10_bwd;
double avg_prob10_bwd_content;
double avg_prob10_bwd_ex_names;
double avg_prob10_bwd_content_ex_names;
double entropy_fwd;
double entropy_fwd_norm;
double entropy_bwd;
double entropy_bwd_norm;
double perplexity_fwd;
double perplexity_fwd_norm;
double perplexity_bwd;
double perplexity_bwd_norm;
double al_gem;
double al_max;
int intensCnt;
Expand Down
68 changes: 30 additions & 38 deletions src/structstats.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -917,18 +917,30 @@ void structStats::intensToCSV( ostream& os ) const {
}

// Write the CSV column names for the Wopr ("misc") statistics to 'os'.
//
// The names are emitted in exactly the order in which miscToCSV() writes the
// corresponding values, so that header and data rows line up:
//   - forward direction: four log-probability columns (positionally paired
//     with avg_prob10_fwd, _content, _ex_names and _content_ex_names),
//     then entropy, normalized entropy, perplexity, normalized perplexity;
//   - backward direction: the same eight columns with the "bwd" suffix;
//   - finally the classification column.
// No trailing comma or newline is written after the last name.
void structStats::miscHeader( ostream& os ) const {
  // The former three-columns-per-direction header
  // ("Log_prob_*,Entropie_*,Perplexiteit_*") is intentionally not emitted
  // anymore: keeping it next to the extended header would produce six
  // duplicate names and 23 header columns for the 17 values of miscToCSV().
  os << "Log_prob_fwd,Log_prob_fwd_inhwrd,Log_prob_fwd_zn,Log_prob_fwd_inhwrd_zn,";
  os << "Entropie_fwd,Entropie_fwd_norm,Perplexiteit_fwd,Perplexiteit_fwd_norm,";
  os << "Log_prob_bwd,Log_prob_bwd_inhwrd,Log_prob_bwd_zn,Log_prob_bwd_inhwrd_zn,";
  os << "Entropie_bwd,Entropie_bwd_norm,Perplexiteit_bwd,Perplexiteit_bwd_norm,";
  os << "Eigen_classificatie";
}

// Write the Wopr ("misc") statistics of this object as CSV fields to 'os'.
//
// Sixteen numeric fields are written, each followed by a comma: for the
// forward and then the backward direction the four log-probability values,
// entropy, normalized entropy, perplexity and normalized perplexity. Every
// value is passed through proportion() together with sentCnt before it is
// streamed (presumably turning the accumulated sums into per-sentence
// averages -- confirm against the definition of proportion()).
// The last field is the classification counter, rendered by
// toStringCounter(), escaped by escape_quotes() and wrapped in double
// quotes. No trailing comma or newline is written.
void structStats::miscToCSV( ostream& os ) const {
  // Keep this table in the same order as the column names in miscHeader().
  const double wopr_totals[] = {
    // forward direction
    avg_prob10_fwd,
    avg_prob10_fwd_content,
    avg_prob10_fwd_ex_names,
    avg_prob10_fwd_content_ex_names,
    entropy_fwd,
    entropy_fwd_norm,
    perplexity_fwd,
    perplexity_fwd_norm,
    // backward direction
    avg_prob10_bwd,
    avg_prob10_bwd_content,
    avg_prob10_bwd_ex_names,
    avg_prob10_bwd_content_ex_names,
    entropy_bwd,
    entropy_bwd_norm,
    perplexity_bwd,
    perplexity_bwd_norm
  };

  for ( const double total : wopr_totals ) {
    os << proportion( total, sentCnt ) << ",";
  }

  os << "\"" << escape_quotes(toStringCounter(my_classification)) << "\"";
}
Expand Down Expand Up @@ -1370,44 +1382,24 @@ void structStats::merge( structStats *ss ){
lemma_freq_n_strict += ss->lemma_freq_n_strict;

// Wopr forwards probabilities
if ( !std::isnan(ss->avg_prob10_fwd) ){
if ( std::isnan(avg_prob10_fwd) )
avg_prob10_fwd = ss->avg_prob10_fwd;
else
avg_prob10_fwd += ss->avg_prob10_fwd;
}
if ( !std::isnan(ss->entropy_fwd) ){
if ( std::isnan(entropy_fwd) )
entropy_fwd = ss->entropy_fwd;
else
entropy_fwd += ss->entropy_fwd;
}
if ( !std::isnan(ss->perplexity_fwd) ){
if ( std::isnan(perplexity_fwd) )
perplexity_fwd = ss->perplexity_fwd;
else
perplexity_fwd += ss->perplexity_fwd;
}
avg_prob10_fwd += ss->avg_prob10_fwd;
avg_prob10_fwd_content += ss->avg_prob10_fwd_content;
avg_prob10_fwd_ex_names += ss->avg_prob10_fwd_ex_names;
avg_prob10_fwd_content_ex_names += ss->avg_prob10_fwd_content_ex_names;
entropy_fwd += ss->entropy_fwd;
entropy_fwd_norm += ss->entropy_fwd_norm;
perplexity_fwd += ss->perplexity_fwd;
perplexity_fwd_norm += ss->perplexity_fwd_norm;

// Wopr backwards probabilities
if ( !std::isnan(ss->avg_prob10_bwd) ){
if ( std::isnan(avg_prob10_bwd) )
avg_prob10_bwd = ss->avg_prob10_bwd;
else
avg_prob10_bwd += ss->avg_prob10_bwd;
}
if ( !std::isnan(ss->entropy_bwd) ){
if ( std::isnan(entropy_bwd) )
entropy_bwd = ss->entropy_bwd;
else
entropy_bwd += ss->entropy_bwd;
}
if ( !std::isnan(ss->perplexity_bwd) ){
if ( std::isnan(perplexity_bwd) )
perplexity_bwd = ss->perplexity_bwd;
else
perplexity_bwd += ss->perplexity_bwd;
}
avg_prob10_bwd += ss->avg_prob10_bwd;
avg_prob10_bwd_content += ss->avg_prob10_bwd_content;
avg_prob10_bwd_ex_names += ss->avg_prob10_bwd_ex_names;
avg_prob10_bwd_content_ex_names += ss->avg_prob10_bwd_content_ex_names;
entropy_bwd += ss->entropy_bwd;
entropy_bwd_norm += ss->entropy_bwd_norm;
perplexity_bwd += ss->perplexity_bwd;
perplexity_bwd_norm += ss->perplexity_bwd_norm;

intensCnt += ss->intensCnt;
intensBvnwCnt += ss->intensBvnwCnt;
Expand Down
49 changes: 33 additions & 16 deletions src/tscan.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -1739,7 +1739,7 @@ void orderWopr( const string& type, const string& txt, vector<double>& wordProbs
cerr << "No usable FoLia date retrieved from Wopr. Got '"
<< result << "'" << endl;
}
cerr << "Done with Wopr" << endl;
cerr << "done with Wopr" << endl;
}

xmlDoc *AlpinoServerParse( folia::Sentence *);
Expand Down Expand Up @@ -1857,22 +1857,8 @@ sentStats::sentStats( int index, folia::Sentence *s, const sentStats* pred ):
} // omp section
} // omp sections

// if ( parseFailCnt == 1 ){
// // glorious fail
// return;
// }
sentCnt = 1; // so only count the sentence when not failed
if ( sentProb_fwd != -99 ){
avg_prob10_fwd = sentProb_fwd;
}
if ( sentProb_bwd != -99 ){
avg_prob10_bwd = sentProb_bwd;
}
entropy_fwd = sentEntropy_fwd;
entropy_bwd = sentEntropy_bwd;
perplexity_fwd = sentPerplexity_fwd;
perplexity_bwd = sentPerplexity_bwd;
// cerr << "PUNCTS " << puncts << endl;

bool question = false;
vector<string> wordbuffer;
vector<string> lemmabuffer;
Expand Down Expand Up @@ -1958,9 +1944,13 @@ sentStats::sentStats( int index, folia::Sentence *s, const sentStats* pred ):
if (ws->isContent) {
word_freq += ws->word_freq_log;
lemma_freq += ws->lemma_freq_log;
avg_prob10_fwd_content += ws->logprob10_fwd;
avg_prob10_bwd_content += ws->logprob10_bwd;
if (ws->prop != CGN::ISNAME) {
word_freq_n += ws->word_freq_log;
lemma_freq_n += ws->lemma_freq_log;
avg_prob10_fwd_content_ex_names += ws->logprob10_fwd;
avg_prob10_bwd_content_ex_names += ws->logprob10_bwd;
}
}
if (ws->isContentStrict) {
Expand All @@ -1971,6 +1961,10 @@ sentStats::sentStats( int index, folia::Sentence *s, const sentStats* pred ):
lemma_freq_n_strict += ws->lemma_freq_log;
}
}
if (ws->prop != CGN::ISNAME) {
avg_prob10_fwd_ex_names += ws->logprob10_fwd;
avg_prob10_bwd_ex_names += ws->logprob10_bwd;
}

if (ws->isNominal) nominalCnt++;

Expand Down Expand Up @@ -2439,6 +2433,29 @@ sentStats::sentStats( int index, folia::Sentence *s, const sentStats* pred ):
np_length( s, npCnt, indefNpCnt, npSize );
rarityLevel = settings.rarityLevel;
overlapSize = settings.overlapSize;

// Assign and normalize the values from Wopr
if ( sentProb_fwd != -99 ){
avg_prob10_fwd = sentProb_fwd;
}
if ( sentProb_bwd != -99 ){
avg_prob10_bwd = sentProb_bwd;
}
entropy_fwd = sentEntropy_fwd;
entropy_bwd = sentEntropy_bwd;
perplexity_fwd = sentPerplexity_fwd;
perplexity_bwd = sentPerplexity_bwd;

avg_prob10_fwd_content = proportion(avg_prob10_fwd_content, contentCnt).p;
avg_prob10_fwd_ex_names = proportion(avg_prob10_fwd_ex_names, wordCnt - nameCnt).p;
avg_prob10_fwd_content_ex_names = proportion(avg_prob10_fwd_content_ex_names, contentCnt - nameCnt).p;
avg_prob10_bwd_content = proportion(avg_prob10_bwd_content, contentCnt).p;
avg_prob10_bwd_ex_names = proportion(avg_prob10_bwd_ex_names, wordCnt - nameCnt).p;
avg_prob10_bwd_content_ex_names = proportion(avg_prob10_bwd_content_ex_names, contentCnt - nameCnt).p;
entropy_fwd_norm = proportion(entropy_fwd, w.size()).p;
entropy_bwd_norm = proportion(entropy_bwd, w.size()).p;
perplexity_fwd_norm = proportion(perplexity_fwd, pow(w.size(), 2)).p;
perplexity_bwd_norm = proportion(perplexity_bwd, pow(w.size(), 2)).p;
}

Conn::Type sentStats::checkMultiConnectives( const string& mword ){
Expand Down

0 comments on commit d67ca97

Please sign in to comment.