This repository has been archived by the owner on May 29, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 49
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Draft of actor based system for using Slabs.
- Loading branch information
1 parent
a400ed8
commit 856c767
Showing
5 changed files
with
277 additions
and
50 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
// NOTE(review): superseded draft of a string-keyed, Map-based actor pipeline
// for Slabs, kept commented out for reference; the typed Slab version in this
// commit replaces it. Known rough edges if this draft is ever revived:
//   - RegexSentenceDetector calls enderIndices.last, which throws when the
//     content contains no '.', '?' or '!' (the buffer is then empty).
//   - Annotation layers are addressed by raw string keys ("sentences",
//     "tokens", "entities"), so a mistyped key fails only at runtime.
//
//package chalk.slab
//
//import akka.actor.{Actor,ActorLogging,ActorSystem,Props}
//import akka.pattern.ask
//import akka.util.Timeout
//import scala.collection.mutable.ListBuffer
//import scala.concurrent.duration._
//
//// An immutable text document plus named layers of Span annotations.
//case class ActorSlab(
//  content: String,
//  annotations: Map[String,Seq[Span]] = Map[String, Seq[Span]]()
//) {
//
//  override def toString = {
//    val annotationString = (for ((attr, spans) <- annotations) yield {
//      " " + attr + ": " + spans.map(s=>"["+content.substring(s.start,s.end)+"]").mkString(" ")
//    }).mkString("\n\n")
//    s"$content\n\n$annotationString"
//  }
//}
//
//object ActorSlab {
//
//  // Returns a copy of the slab with spansForKey stored (or replaced) under key.
//  def update(slab: ActorSlab, key: String, spansForKey: Seq[Span]) =
//    ActorSlab(slab.content, slab.annotations ++ Map(key -> spansForKey.toSeq))
//
//}
//
//// Character offsets [start, end) into a slab's content.
//case class Span(start: Int, end: Int)
//
//trait AnalysisComponent extends Actor with ActorLogging
//
//object AnalysisComponent {
//  case class Process(slab: ActorSlab)
//}
//
//// Splits content on '.', '?', '!' and replies with a "sentences" layer added.
//class RegexSentenceDetector extends AnalysisComponent {
//
//  import AnalysisComponent._
//
//  def receive = {
//
//    case Process(slab) =>
//      val highestIndex = slab.content.length
//      val matches = """[.?!]""".r.findAllMatchIn(slab.content)
//      // NOTE(review): .last below throws on content with no [.?!] character.
//      val enderIndices = ListBuffer(matches.map(_.end).toList: _*)
//      if (enderIndices.last < highestIndex)
//        enderIndices += highestIndex
//
//      val paired = enderIndices.toList.flatMap(i=> List(i,i+1))
//      val spans = for (List(start,end) <- (0 :: paired).grouped(2)) yield Span(start,end)
//      sender ! ActorSlab(slab.content, slab.annotations ++ Map("sentences" -> spans.toSeq))
//  }
//
//}
//
//// Splits each sentence on whitespace and replies with a "tokens" layer added.
//class WhitespaceTokenizer extends AnalysisComponent {
//
//  import AnalysisComponent._
//  import scala.util.matching.Regex.Match
//
//  def receive = {
//    case Process(slab) =>
//      val highestIndex = slab.content.length
//      val tokenSpans = for {
//        sentenceSpan <- slab.annotations("sentences")
//        sentence = slab.content.substring(sentenceSpan.start,sentenceSpan.end)
//        wsMatches = "\\s+".r.findAllMatchIn(sentence)
//        span <- gappedSpans(wsMatches.toSeq, sentenceSpan.start, sentence.length)
//      } yield {
//        span
//      }
//      sender ! ActorSlab(slab.content, slab.annotations ++ Map("tokens" -> tokenSpans.toSeq))
//  }
//
//  // Inverts whitespace matches into the token spans between them, shifted by offset.
//  private def gappedSpans(foundMatches: Seq[Match], offset: Int, highestIndex: Int) = {
//    val flattenedMatches = ListBuffer(foundMatches.flatMap(m => Seq(m.start,m.end)): _*)
//    val allSpans = for {
//      List(start,end) <- (0 +: flattenedMatches :+ highestIndex).toList.grouped(2)
//    } yield {
//      Span(start+offset,end+offset)
//    }
//    allSpans
//  }
//
//}
//
//// Marks capitalized tokens as entities ("dumb" named-entity recognition).
//class DumbNer extends AnalysisComponent {
//
//  import AnalysisComponent._
//
//  def receive = {
//
//    case Process(slab) =>
//      val dumbNerSpans = for {
//        tokenSpan <- slab.annotations("tokens")
//        token = slab.content.substring(tokenSpan.start, tokenSpan.end)
//        if (token.head.isUpper)
//      } yield {
//        tokenSpan
//      }
//      sender ! ActorSlab.update(slab, "entities", dumbNerSpans)
//  }
//
//}
//
//
//// Demo: run detector -> tokenizer -> NER over a sample text, then print.
//object AnalysisEngine {
//
//  import AnalysisComponent._
//
//  def main(args: Array[String]) {
//    val text = "Here is an example text. It has four sentences and it mentions Jimi Hendrix and Austin, Texas! In this third sentence, it also brings up Led Zeppelin and Radiohead, but does it ask a question? It also has a straggler sentence that doesn't end with punctuation"
//
//    val slab = ActorSlab(text)
//    val system = ActorSystem("ChalkSystem")
//
//    implicit val ec = system.dispatcher
//    implicit val timeout = Timeout(10 seconds)
//
//    val sentenceDetector = system.actorOf(Props[RegexSentenceDetector])
//    val tokenizer = system.actorOf(Props[WhitespaceTokenizer])
//    val ner = system.actorOf(Props[DumbNer])
//
//    for {
//      slab1 <- (sentenceDetector ? Process(slab)).mapTo[ActorSlab]
//      slab2 <- (tokenizer ? Process(slab1)).mapTo[ActorSlab]
//      slab3 <- (ner ? Process(slab2)).mapTo[ActorSlab]
//    } {
//      println(slab3)
//      system.shutdown
//    }
//
//  }
//
//}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
package chalk.slab | ||
|
||
import akka.actor.{Actor,ActorLogging,ActorSystem,Props} | ||
import akka.pattern.ask | ||
import akka.util.Timeout | ||
import scala.collection.mutable.ListBuffer | ||
import scala.concurrent.duration._ | ||
|
||
/** An actor that enriches a [[Slab]] with one additional layer of annotations.
  *
  * @tparam X the slab's content type
  * @tparam Y the upper bound of all annotation types in the slab
  * @tparam Z the annotation types this component consumes (contravariant)
  * @tparam W the annotation types this component produces (covariant)
  */
trait AnalysisComponent[X, Y, -Z <: Y, +W <: Y] extends Actor with ActorLogging {

  import AnalysisComponent._

  /** Return the slab enriched with this component's W annotations. */
  def process(slab: Slab[X, Y, Z]): Slab[X, Y, Y with W]

  def receive = {
    case Process(incoming) =>
      // The message's type parameters are erased at the pattern match, so the
      // expected slab type is recovered with a cast before delegating.
      val typedSlab = incoming.asInstanceOf[Slab[X, Y, Z]]
      sender ! process(typedSlab)
  }
}
|
||
/** Message protocol shared by all analysis components. */
object AnalysisComponent {
  /** Ask the receiving component to annotate `slab` and reply with the
    * enriched slab.
    */
  case class Process[X, Y, Z <: Y](slab: Slab[X, Y, Z])
}
|
||
/** Splits the slab content into [[Sentence]] spans with a regex heuristic.
  *
  * A sentence starts at a character that is neither whitespace nor an EOS
  * mark, runs up to the next '.', '!' or '?', and includes that mark when
  * present. The EOS mark is optional ([.!?]?) so that a trailing sentence
  * without terminal punctuation is still captured — the previous pattern
  * required it and silently dropped such stragglers.
  */
class SentenceSegmenterActor[AnnotationTypes <: StringAnnotation]
    extends AnalysisComponent[String, StringAnnotation, AnnotationTypes, Sentence] {

  def process(slab: Slab[String, StringAnnotation, AnnotationTypes]) =
    slab ++ """[^\s.!?]+[^.!?]*[.!?]?""".r.findAllMatchIn(slab.content).map(m => Sentence(m.start, m.end))
}
|
||
/** Adds [[Token]] spans inside every [[Sentence]] annotation of the slab.
  *
  * A token is a maximal run of letters, punctuation, or digits (Unicode
  * classes L, P, N). Match offsets are relative to the sentence text, so the
  * sentence's begin offset is added to make them absolute in the slab.
  */
class TokenizerActor[AnnotationTypes <: Sentence]
    extends AnalysisComponent[String, StringAnnotation, AnnotationTypes, Token] {

  def process(slab: Slab[String, StringAnnotation, AnnotationTypes]) = {
    // Compile the pattern once per call instead of once per sentence.
    val tokenPattern = "\\p{L}+|\\p{P}+|\\p{N}+".r
    val tokens = for {
      sentence <- slab.iterator[Sentence]
      m <- tokenPattern.findAllMatchIn(sentence.in(slab).content)
    } yield Token(sentence.begin + m.start, sentence.begin + m.end)
    slab ++ tokens
  }
}
|
||
|
||
/** Demo entry point: wires the sentence segmenter and tokenizer actors into
  * an ask-based pipeline over a sample text and prints the resulting
  * sentences and tokens.
  */
object AnalysisEngine {

  import AnalysisComponent._
  import StringAnnotation._

  // Explicit `: Unit =` — procedure syntax is deprecated.
  def main(args: Array[String]): Unit = {
    val text = "Here is an example text. It has four sentences and it mentions Jimi Hendrix and Austin, Texas! In this third sentence, it also brings up Led Zeppelin and Radiohead, but does it ask a question? It also has a straggler sentence that doesn't end with punctuation"

    val slab = Slab(text)
    val system = ActorSystem("ChalkSystem")

    implicit val ec = system.dispatcher
    // Upper bound on each ask (?) below; replies later than this fail the
    // future. Dotted 10.seconds avoids the postfix-operator syntax.
    implicit val timeout = Timeout(10.seconds)

    val sentenceSegmenter = system.actorOf(Props[SentenceSegmenterActor[StringAnnotation]])
    val tokenizer = system.actorOf(Props[TokenizerActor[Sentence]])

    // NOTE(review): a failed ask (e.g. timeout) silently skips this body —
    // consider a recover/onFailure handler so errors are at least logged.
    for {
      slab1 <- (sentenceSegmenter ? Process(slab)).mapTo[Slab[String,StringAnnotation,Sentence]]
      slab2 <- (tokenizer ? Process(slab1)).mapTo[Slab[String,StringAnnotation,Sentence with Token]]
    } {
      // NOTE(review): depending on the segmenter's regex, a trailing sentence
      // lacking an EOS character may be dropped — verify against the segmenter.
      val sentences = slab2.iterator[Sentence].toList
      println("\nSENTENCES\n\n" + sentences.map(_.in(slab2).content).mkString("\n"))

      val tokens = slab2.iterator[Token].toList
      println("\nTOKENS\n\n" + tokens.map(_.in(slab2).content).mkString("\n"))

      // Stop the actor system once the final result arrives, otherwise its
      // threads keep the JVM alive. Parentheses mark the side effect.
      system.shutdown()
    }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Perhaps instead of defining a `process` method, we could make AnalysisComponent extend `(Slab[X,Y,Z] => Slab[X,Y,Y with W])` and call the method `apply`? That would allow users to easily switch back and forth between Akka Actors and traditional function composition.