oaqa · kuoliu · Oct 31, 2013 · Nov 3, 2013 · Nov 4, 2013 · Nov 4, 2013
diff --git a/qa4mre-alzheimer-task/.classpath b/qa4mre-alzheimer-task/.classpath
@@ -17,11 +17,7 @@
 			<attribute name="maven.pomderived" value="true"/>
 		</attributes>
 	</classpathentry>
-	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6">
-		<attributes>
-			<attribute name="maven.pomderived" value="true"/>
-		</attributes>
-	</classpathentry>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
 	<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
 		<attributes>
 			<attribute name="maven.pomderived" value="true"/>

diff --git a/qa4mre-alzheimer-task/XMIs/12-test-alzheimer/Biomedical_about_Alzheimer_Sample_GS.xml_1.xml b/qa4mre-alzheimer-task/XMIs/12-test-alzheimer/Biomedical_about_Alzheimer_Sample_GS.xml_1.xml
diff --git a/qa4mre-alzheimer-task/XMIs/12-test-alzheimer/QA4MRE-2012_BIOMEDICAL_GS.xml_1.xmi b/qa4mre-alzheimer-task/XMIs/12-test-alzheimer/QA4MRE-2012_BIOMEDICAL_GS.xml_1.xmi
diff --git a/qa4mre-alzheimer-task/XMIs/12-test-alzheimer/QA4MRE-2012_BIOMEDICAL_GS.xml_2.xmi b/qa4mre-alzheimer-task/XMIs/12-test-alzheimer/QA4MRE-2012_BIOMEDICAL_GS.xml_2.xmi
diff --git a/qa4mre-alzheimer-task/XMIs/12-test-alzheimer/QA4MRE-2012_BIOMEDICAL_GS.xml_3.xmi b/qa4mre-alzheimer-task/XMIs/12-test-alzheimer/QA4MRE-2012_BIOMEDICAL_GS.xml_3.xmi
diff --git a/qa4mre-alzheimer-task/XMIs/12-test-alzheimer/QA4MRE-2012_BIOMEDICAL_GS.xml_4.xmi b/qa4mre-alzheimer-task/XMIs/12-test-alzheimer/QA4MRE-2012_BIOMEDICAL_GS.xml_4.xmi
diff --git a/qa4mre-alzheimer-task/data/13-test-alzheimer/QA4MRE-2013-EN_test.xml b/qa4mre-alzheimer-task/data/13-test-alzheimer/QA4MRE-2013-EN_test.xml
@@ -74,8 +74,7 @@ Of mice and men: an Alzheimer’s cure for our murine brethren. Alzheimer's Dise
 					<answer a_id='1'>Alzheimer's treatment</answer>
 					<answer a_id='2'>nest making</answer>
 					<answer a_id='3'>restoring smell</answer>
-					<answer a_id='4'>neurodegeneration
-</answer>
+					<answer a_id='4'>neurodegeneration</answer>
 					<answer a_id='5'>None of the above</answer>
 				</q>
 				<q  q_id="10" >
@@ -247,8 +246,7 @@ Fighting Alzheimer’s disease? Get the immune system on board. James Fuller Jam
 					<q_str>Name a similarity between AD and TB.</q_str>
 					<answer a_id='1'>the body is slowly destroying the brain</answer>
 					<answer a_id='2'>the vaccine that teaches the immune system to fight off the infection</answer>
-					<answer a_id='3'>drugs and therapy
-</answer>
+					<answer a_id='3'>drugs and therapy</answer>
 					<answer a_id='4'>side effects of the vaccines</answer>
 					<answer a_id='5'>None of the above</answer>
 				</q>
@@ -291,8 +289,7 @@ Alanna Shaikh: How I'm preparing to get Alzheimer's. I'd like to talk about my d
 				</q>
 				<q  q_id="5" >
 					<q_str>What is Alanna's aim when building her physical strength?</q_str>
-					<answer a_id='1'>to become a better person
-</answer>
+					<answer a_id='1'>to become a better person</answer>
 					<answer a_id='2'>to win a tai chi medal</answer>
 					<answer a_id='3'>to fill out forms</answer>
 					<answer a_id='4'>to have the ability to knit a sweater</answer>
@@ -434,8 +431,7 @@ Financial challenges faced by person with dementia. The idea: A person with deme
 				<q  q_id="7" >
 					<q_str>All but one of the following are reasons why a total of $100,000 for the common funds of a family seeking assistance is not enough. Which one is that?</q_str>
 					<answer a_id='1'>The needs of the family continue.</answer>
-					<answer a_id='2'>The cost of living constantly increases.
-</answer>
+					<answer a_id='2'>The cost of living constantly increases.</answer>
 					<answer a_id='3'>The costs incident to the disease constantly increase.</answer>
 					<answer a_id='4'>Health care covers assisted living.</answer>
 					<answer a_id='5'>None of the above</answer>

diff --git a/qa4mre-alzheimer-task/data/survey.txt b/qa4mre-alzheimer-task/data/survey.txt
@@ -0,0 +1,13 @@
+In 12 test data, we have 4 documents, each with 10 questions.
+In 13 sample data, we have 1 document with 10 questions.
+In 13 test data, we have 16 documents, each with 15-20 questions.
+
+Each question in 12 test data and 13 sample data has 5 options and an implicit option of choosing nothing.
+But questions in 13 test data have 5 options, one of which is an explicit "None of the above" option.
+
+The types of the questions are mainly factoid.
+  The first type asks about a specific part of a fact, like what, where, how, why, who, aim, purpose.
+  The second type asks about some numeric feature, like how many or how old.
+  The third type asks us to name 2-3 examples, like "Name 2 ways to do ...".
+There is a special type of question in 13 test data, which asks about the degree of a certain fact. The options are like "Absolutely yes", "Probably yes", "Probably not" and "Absolutely not".
+We definitely need a special pipeline for the last type of questions.
diff --git a/qa4mre-alzheimer-task/documentation/Milestone 1 Report.pdf b/qa4mre-alzheimer-task/documentation/Milestone 1 Report.pdf
diff --git a/qa4mre-alzheimer-task/documentation/Milestone 2 Report.pdf b/qa4mre-alzheimer-task/documentation/Milestone 2 Report.pdf
diff --git a/qa4mre-alzheimer-task/documentation/Working Notes Paper.pdf b/qa4mre-alzheimer-task/documentation/Working Notes Paper.pdf
diff --git a/qa4mre-alzheimer-task/documentation/outputXML.zip b/qa4mre-alzheimer-task/documentation/outputXML.zip
diff --git a/qa4mre-alzheimer-task/solr/apache-solr-3.6.1.zip b/qa4mre-alzheimer-task/solr/apache-solr-3.6.1.zip
diff --git a/qa4mre-alzheimer-task/src/main/java/edu/cmu/lti/deiis/hw5/annotators/AcronymAnnotator.java b/qa4mre-alzheimer-task/src/main/java/edu/cmu/lti/deiis/hw5/annotators/AcronymAnnotator.java
@@ -0,0 +1,240 @@
+package edu.cmu.lti.deiis.hw5.annotators;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.ListIterator;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.collections.iterators.ArrayListIterator;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSList;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.resource.ResourceInitializationException;
+
+import edu.cmu.lti.qalab.types.Answer;
+import edu.cmu.lti.qalab.types.Sentence;
+import edu.cmu.lti.qalab.types.Synonym;
+import edu.cmu.lti.qalab.types.Token;
+import edu.cmu.lti.qalab.utils.Utils;
+
+/**
+ * Finds Token instances that appear to be acronyms and records likely expansion
+ * words for them as synonyms (ex: for "AD" in "Alzheimer's Disease (AD)", the words
+ * "Alzheimer's" and "Disease" are candidates).
+ *
+ * <p>For every token of the test document, the source document and the answers:
+ * <ol>
+ *   <li>if the token text was already resolved, the cached synonyms are attached;</li>
+ *   <li>otherwise, if the text looks like an acronym (see the patterns below), the
+ *       surrounding tokens are searched for words whose first letter occurs in the
+ *       acronym. Each such word becomes one Synonym (the words are stored
+ *       individually, not joined into a phrase).</li>
+ * </ol>
+ * The resulting synonyms are merged into the token's existing Token.synonyms FSList.
+ */
+public class AcronymAnnotator extends JCasAnnotator_ImplBase{
+
+  /** Maximum number of tokens inspected on each side of a candidate acronym. */
+  private static final int CONTEXT_WINDOW = 5;
+
+  // Regex patterns (to find acronyms)
+  // A) all UPPERCASE*            (ex: "IDE")
+  // B) lowercase with no vowels* (ex: "sst")
+  // C) MixedCase with uppercase in middle/end (ex: "LoB")
+  //         *with/without numbers
+  // The sharp s (U+00DF, as in "A-beta" written with a sharp s) is given as a regex
+  // escape so the patterns do not depend on the encoding of this source file.
+  // NOTE(review): pattern B also accepts digit-only tokens such as "12" - confirm intended.
+  Pattern uppercasePattern=Pattern.compile("^[A-Z\\u00DF]{2,6}[\\d]?$");
+  Pattern lowercasePattern=Pattern.compile("^[bcdfghjklmnpqrstvwxz\\d]{2,4}$");
+  Pattern mixedcasePattern=Pattern.compile("^[A-Z\\u00DF]{1,6}[\\d]?[a-z]{0,3}[\\d]?[A-Z\\u00DF]{1,6}[\\d]?[a-z\\d]{0,3}[\\d]?$");
+
+  // Cache: acronym text -> synonyms found for it the first time it was resolved.
+  // NOTE(review): this map lives as long as the annotator instance, so Synonym objects
+  // created for one JCas may be attached to tokens of a later JCas - confirm that the
+  // pipeline tolerates this, or clear the map per process() call.
+  HashMap<String, ArrayList<Synonym>> acronymSynonymMap = new HashMap<String, ArrayList<Synonym>>();
+
+  /**
+   * Annotates acronyms in the test document sentences, the source document
+   * sentences and all answers of the given CAS.
+   *
+   * @param jCas the CAS whose tokens are inspected and updated
+   * @throws AnalysisEngineProcessException declared by the UIMA contract
+   */
+  @Override
+  public void process(JCas jCas) throws AnalysisEngineProcessException {
+
+    // Tokens from test doc
+    ArrayList<Sentence> testSentences = Utils.getSentenceListFromTestDocCAS(jCas);
+    for (Sentence s : testSentences) {
+      annotateAcronyms(Utils.getTokenListFromSentenceList(s), jCas);
+    }
+
+    // Tokens from source doc
+    ArrayList<Sentence> sourceSentences = Utils.getSentenceListFromSourceDocCAS(jCas);
+    for (Sentence s : sourceSentences) {
+      annotateAcronyms(Utils.getTokenListFromSentenceList(s), jCas);
+    }
+
+    // Tokens from answers
+    ArrayList<ArrayList<Answer>> answers = Utils.getAnswerListFromTestDocCAS(jCas);
+    for (ArrayList<Answer> aList : answers) {
+      for (Answer a : aList) {
+        annotateAcronyms(Utils.getTokenListFromAnswer(a), jCas);
+      }
+    }
+  }
+
+  /**
+   * Walks the token list once; attaches cached synonyms to known acronyms and tries
+   * to resolve tokens that look like a new acronym from their neighbours.
+   *
+   * @param tokens tokens of one sentence or answer; may be null or empty (no-op)
+   * @param jCas   the CAS in which new annotations are created
+   */
+  private void annotateAcronyms(ArrayList<Token> tokens, JCas jCas) {
+
+    // An empty list previously produced a negative window size and crashed.
+    if (tokens == null || tokens.isEmpty()) {
+      return;
+    }
+
+    int size = tokens.size();
+    for (int i = 0; i < size; i++) {
+      Token t = tokens.get(i);
+      String text = t.getText();
+      if (text == null) {
+        continue;
+      }
+
+      // If the token already exists in the acronym hashmap, retrieve synonyms from it.
+      ArrayList<Synonym> existingSynonyms = this.acronymSynonymMap.get(text);
+      if (existingSynonyms != null) {
+        addUpdateTokenSynonyms(t, existingSynonyms, jCas);
+      } else if (looksLikeAcronym(text)) {
+        // Context windows are computed directly from the index: up to CONTEXT_WINDOW
+        // real neighbours on each side, never the token itself, and correct at both
+        // ends of the list.
+        ArrayList<Token> prevTokens =
+            new ArrayList<Token>(tokens.subList(Math.max(0, i - CONTEXT_WINDOW), i));
+        ArrayList<Token> nextTokens =
+            new ArrayList<Token>(tokens.subList(i + 1, Math.min(size, i + 1 + CONTEXT_WINDOW)));
+        // Called once per token even when several patterns match (e.g. "AD" matches
+        // both the uppercase and the mixed-case pattern).
+        confirmMatch(t, prevTokens, nextTokens, jCas);
+      }
+    }
+  }
+
+  /** Returns true when the text matches any of the three acronym patterns. */
+  private boolean looksLikeAcronym(String text) {
+    return uppercasePattern.matcher(text).find()
+        || lowercasePattern.matcher(text).find()
+        || mixedcasePattern.matcher(text).find();
+  }
+
+  /**
+   * Looks for expansion words of a candidate acronym in its context and, if any are
+   * found, stores them as synonyms on the token and in the cache.
+   *
+   * <p>If the acronym sits inside parentheses ("(" among the previous tokens and ")"
+   * among the next tokens) the previous tokens are searched; otherwise the next
+   * tokens are searched. A context word qualifies when its first letter equals,
+   * ignoring case, any letter of the acronym.
+   *
+   * @param t        the candidate acronym token
+   * @param prevToks tokens preceding the candidate, nearest last
+   * @param nextToks tokens following the candidate, nearest first
+   * @param jCas     the CAS in which Synonym annotations are created
+   */
+  private void confirmMatch(Token t, ArrayList<Token> prevToks, ArrayList<Token> nextToks, JCas jCas) {
+
+    String acronym = t.getText();
+
+    // Determine if acronym is contained within parens
+    boolean insideParens = containsText(prevToks, "(") && containsText(nextToks, ")");
+
+    // Inside parens: the expansion precedes the acronym. Otherwise: look to the right.
+    ArrayList<Token> context = insideParens ? prevToks : nextToks;
+
+    // Distinct qualifying words, in order of appearance.
+    ArrayList<String> expansionWords = new ArrayList<String>();
+    for (Token candidate : context) {
+      String word = candidate.getText();
+      if (word == null || word.length() == 0) {
+        continue; // nothing to compare; charAt(0) would throw
+      }
+      if (sharesInitial(acronym, word) && !expansionWords.contains(word)) {
+        expansionWords.add(word);
+      }
+    }
+
+    if (expansionWords.isEmpty()) {
+      return;
+    }
+
+    // Store each expansion word as a synonym
+    ArrayList<Synonym> newSynonyms = new ArrayList<Synonym>();
+    for (String word : expansionWords) {
+      Synonym newSynonym = new Synonym(jCas);
+      newSynonym.setText(word);
+      newSynonym.addToIndexes(jCas);
+      newSynonyms.add(newSynonym);
+    }
+    // The cache keeps only the expansion words; addUpdateTokenSynonyms works on a copy.
+    this.acronymSynonymMap.put(acronym, newSynonyms);
+    addUpdateTokenSynonyms(t, newSynonyms, jCas);
+  }
+
+  /** Returns true when any token in the list has exactly the given text. */
+  private boolean containsText(ArrayList<Token> toks, String wanted) {
+    for (Token tok : toks) {
+      if (wanted.equals(tok.getText())) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /** Returns true when the first letter of word equals, ignoring case, any letter of acronym. */
+  private boolean sharesInitial(String acronym, String word) {
+    char initial = Character.toLowerCase(word.charAt(0));
+    for (int j = 0; j < acronym.length(); j++) {
+      if (Character.toLowerCase(acronym.charAt(j)) == initial) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Sets the token's synonym list to the given synonyms followed by those of the
+   * token's previous synonyms whose text is not already present (case-insensitive).
+   * The list passed in is not modified, so a cached list can be handed in safely.
+   *
+   * @param t           the token to update
+   * @param synonymList synonyms to put first; left untouched
+   * @param jCas        the CAS in which the new FSList is created
+   */
+  private void addUpdateTokenSynonyms(Token t, ArrayList<Synonym> synonymList, JCas jCas) {
+
+    // Work on a copy: synonymList may be the shared entry of acronymSynonymMap.
+    ArrayList<Synonym> merged = new ArrayList<Synonym>(synonymList);
+
+    FSList prevSynonyms = t.getSynonyms();
+    if (prevSynonyms != null) {
+      try {
+        ArrayList<Synonym> previous = Utils.fromFSListToCollection(prevSynonyms, Synonym.class);
+        for (Synonym old : previous) {
+          // Duplicate check is evaluated per previous synonym.
+          if (!containsSynonymText(merged, old.getText())) {
+            merged.add(old);
+          }
+        }
+      } catch (NullPointerException e) {
+        // Some tokens seemed to not exist (caused null pointer exceptions). In this case, do not try to update them.
+        return;
+      }
+    }
+
+    // Set the merged list as the new FSList<Synonym> for the token
+    FSList updatedSynonyms = Utils.fromCollectionToFSList(jCas, merged);
+    updatedSynonyms.addToIndexes(jCas);
+    t.setSynonyms(updatedSynonyms);
+    t.addToIndexes();
+  }
+
+  /** Returns true when some synonym in the list has the given text, ignoring case. */
+  private boolean containsSynonymText(ArrayList<Synonym> synonyms, String text) {
+    if (text == null) {
+      return false;
+    }
+    for (Synonym s : synonyms) {
+      if (text.equalsIgnoreCase(s.getText())) {
+        return true;
+      }
+    }
+    return false;
+  }
+}