Skip to content
This repository has been archived by the owner on May 29, 2020. It is now read-only.

Commit

Permalink
Draft of actor based system for using Slabs.
Browse files Browse the repository at this point in the history
  • Loading branch information
jasonbaldridge committed Jul 21, 2013
1 parent a400ed8 commit 856c767
Show file tree
Hide file tree
Showing 5 changed files with 277 additions and 50 deletions.
2 changes: 2 additions & 0 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ resolvers ++= Seq(
)

libraryDependencies ++= Seq(
"com.typesafe.akka" %% "akka-actor" % "2.2.0",
"com.typesafe.akka" %% "akka-agent" % "2.2.0",
"org.scalatest" % "scalatest_2.10" % "1.9.1" % "test",
"com.novocode" % "junit-interface" % "0.8" % "test->default",
"org.scalacheck" %% "scalacheck" % "1.10.0" % "test",
Expand Down
136 changes: 136 additions & 0 deletions src/main/scala/chalk/slab/ActorSlab.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
//package chalk.slab
//
//import akka.actor.{Actor,ActorLogging,ActorSystem,Props}
//import akka.pattern.ask
//import akka.util.Timeout
//import scala.collection.mutable.ListBuffer
//import scala.concurrent.duration._
//
//case class ActorSlab(
// content: String,
// annotations: Map[String,Seq[Span]] = Map[String, Seq[Span]]()
//) {
//
// override def toString = {
// val annotationString = (for ((attr, spans) <- annotations) yield {
// " " + attr + ": " + spans.map(s=>"["+content.substring(s.start,s.end)+"]").mkString(" ")
// }).mkString("\n\n")
// s"$content\n\n$annotationString"
// }
//}
//
//object ActorSlab {
//
// def update(slab: ActorSlab, key: String, spansForKey: Seq[Span]) =
// ActorSlab(slab.content, slab.annotations ++ Map(key -> spansForKey.toSeq))
//
//}
//
//case class Span(start: Int, end: Int)
//
//trait AnalysisComponent extends Actor with ActorLogging
//
//object AnalysisComponent {
// case class Process(slab: ActorSlab)
//}
//
//class RegexSentenceDetector extends AnalysisComponent {
//
// import AnalysisComponent._
//
// def receive = {
//
// case Process(slab) =>
// val highestIndex = slab.content.length
// val matches = """[.?!]""".r.findAllMatchIn(slab.content)
// val enderIndices = ListBuffer(matches.map(_.end).toList: _*)
// if (enderIndices.last < highestIndex)
// enderIndices += highestIndex
//
// val paired = enderIndices.toList.flatMap(i=> List(i,i+1))
// val spans = for (List(start,end) <- (0 :: paired).grouped(2)) yield Span(start,end)
// sender ! ActorSlab(slab.content, slab.annotations ++ Map("sentences" -> spans.toSeq))
// }
//
//}
//
//class WhitespaceTokenizer extends AnalysisComponent {
//
// import AnalysisComponent._
// import scala.util.matching.Regex.Match
//
// def receive = {
// case Process(slab) =>
// val highestIndex = slab.content.length
// val tokenSpans = for {
// sentenceSpan <- slab.annotations("sentences")
// sentence = slab.content.substring(sentenceSpan.start,sentenceSpan.end)
// wsMatches = "\\s+".r.findAllMatchIn(sentence)
// span <- gappedSpans(wsMatches.toSeq, sentenceSpan.start, sentence.length)
// } yield {
// span
// }
// sender ! ActorSlab(slab.content, slab.annotations ++ Map("tokens" -> tokenSpans.toSeq))
// }
//
// private def gappedSpans(foundMatches: Seq[Match], offset: Int, highestIndex: Int) = {
// val flattenedMatches = ListBuffer(foundMatches.flatMap(m => Seq(m.start,m.end)): _*)
// val allSpans = for {
// List(start,end) <- (0 +: flattenedMatches :+ highestIndex).toList.grouped(2)
// } yield {
// Span(start+offset,end+offset)
// }
// allSpans
// }
//
//}
//
//class DumbNer extends AnalysisComponent {
//
// import AnalysisComponent._
//
// def receive = {
//
// case Process(slab) =>
// val dumbNerSpans = for {
// tokenSpan <- slab.annotations("tokens")
// token = slab.content.substring(tokenSpan.start, tokenSpan.end)
// if (token.head.isUpper)
// } yield {
// tokenSpan
// }
// sender ! ActorSlab.update(slab, "entities", dumbNerSpans)
// }
//
//}
//
//
//object AnalysisEngine {
//
// import AnalysisComponent._
//
// def main(args: Array[String]) {
// val text = "Here is an example text. It has four sentences and it mentions Jimi Hendrix and Austin, Texas! In this third sentence, it also brings up Led Zeppelin and Radiohead, but does it ask a question? It also has a straggler sentence that doesn't end with punctuation"
//
// val slab = ActorSlab(text)
// val system = ActorSystem("ChalkSystem")
//
// implicit val ec = system.dispatcher
// implicit val timeout = Timeout(10 seconds)
//
// val sentenceDetector = system.actorOf(Props[RegexSentenceDetector])
// val tokenizer = system.actorOf(Props[WhitespaceTokenizer])
// val ner = system.actorOf(Props[DumbNer])
//
// for {
// slab1 <- (sentenceDetector ? Process(slab)).mapTo[ActorSlab]
// slab2 <- (tokenizer ? Process(slab1)).mapTo[ActorSlab]
// slab3 <- (ner ? Process(slab2)).mapTo[ActorSlab]
// } {
// println(slab3)
// system.shutdown
// }
//
// }
//
//}
79 changes: 79 additions & 0 deletions src/main/scala/chalk/slab/AnalysisEngine.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
package chalk.slab

import akka.actor.{Actor,ActorLogging,ActorSystem,Props}
import akka.pattern.ask
import akka.util.Timeout
import scala.collection.mutable.ListBuffer
import scala.concurrent.duration._

trait AnalysisComponent[X,Y,-Z<:Y,+W<:Y] extends Actor with ActorLogging {

import AnalysisComponent._

/** Transforms an incoming slab into the slab that `receive` sends back to the requester.
  *
  * @param slab the slab to analyze, carrying annotation types `Z`
  * @return a slab typed as carrying annotation types `Y with W`
  */
def process(slab: Slab[X,Y,Z]): Slab[X,Y,Y with W]

This comment has been minimized.

Copy link
@bethard

bethard Jul 21, 2013

Contributor

Perhaps instead of defining a process method, we could make AnalysisComponent extend (Slab[X,Y,Z] => Slab[X,Y,Y with W]) and call the method apply? That would allow users to easily switch back and forth between Akka Actors and traditional function composition.

This comment has been minimized.

Copy link
@jasonbaldridge

jasonbaldridge via email Jul 21, 2013

Author Member

/** Handles `Process` messages by running `process` on the carried slab and replying to the sender. */
def receive = {
  case Process(incoming) =>
    // The message's type parameters are not visible here, so the slab is cast
    // to the type this component expects before it is processed.
    val typedSlab = incoming.asInstanceOf[Slab[X,Y,Z]]
    sender ! process(typedSlab)
}

}

/** Companion holding the message type understood by `AnalysisComponent` actors. */
object AnalysisComponent {
/** Message wrapping the slab that a component should process.
  *
  * @param slab the slab to be processed
  */
case class Process[X,Y,Z<:Y](slab: Slab[X,Y,Z])
}

/** Analysis component that adds [[Sentence]] annotations to a string slab.
  *
  * A sentence is matched as a run of text that starts at a character which is
  * neither whitespace nor one of `.`, `!`, `?`, and that ends at the next `.`,
  * `!` or `?`. Trailing text with no terminating punctuation is not matched,
  * so it produces no [[Sentence]].
  */
class SentenceSegmenterActor[AnnotationTypes <: StringAnnotation]
    extends AnalysisComponent[String, StringAnnotation, AnnotationTypes, Sentence] {

  // Compiled once per actor instance instead of once per processed slab.
  private val sentencePattern = "[^\\s.!?]+[^.!?]+[.!?]".r

  /** Returns `slab` extended with one [[Sentence]] per match of the sentence pattern in its content.
    *
    * @param slab the slab whose content is segmented
    */
  def process(slab: Slab[String, StringAnnotation, AnnotationTypes]) = {
    val sentences =
      sentencePattern.findAllMatchIn(slab.content).map(m => Sentence(m.start, m.end))
    slab ++ sentences
  }
}

/** Analysis component that adds [[Token]] annotations inside each [[Sentence]] of a string slab.
  *
  * A token is a maximal run of letters, of punctuation characters, or of
  * digits (Unicode classes `L`, `P` and `N`); anything else separates tokens.
  */
class TokenizerActor[AnnotationTypes <: Sentence]
    extends AnalysisComponent[String, StringAnnotation, AnnotationTypes, Token] {

  // Compiled once per actor instance instead of once per sentence.
  private val tokenPattern = "\\p{L}+|\\p{P}+|\\p{N}+".r

  /** Returns `slab` extended with the tokens found in every sentence it already contains.
    *
    * @param slab a slab that already carries [[Sentence]] annotations
    */
  def process(slab: Slab[String, StringAnnotation, AnnotationTypes]) = {
    val tokens = for {
      sentence <- slab.iterator[Sentence]
      m <- tokenPattern.findAllMatchIn(sentence.in(slab).content)
    } yield {
      // Matches are relative to the sentence text, so shift them by the
      // sentence's start offset to obtain offsets into the slab content.
      Token(sentence.begin + m.start, sentence.begin + m.end)
    }
    slab ++ tokens
  }

}


/** Demo entry point that runs a sentence segmenter and a tokenizer, each as an actor, over a sample text. */
object AnalysisEngine {

  import AnalysisComponent._
  import StringAnnotation._
  import scala.util.{Failure, Success}

  /** Builds the actor pipeline, prints the sentences and tokens it finds, and shuts the actor system down.
    *
    * The actor system is shut down whether the pipeline succeeds or fails
    * (for example on an ask timeout), so the program always terminates.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val text = "Here is an example text. It has four sentences and it mentions Jimi Hendrix and Austin, Texas! In this third sentence, it also brings up Led Zeppelin and Radiohead, but does it ask a question? It also has a straggler sentence that doesn't end with punctuation"

    val slab = Slab(text)
    val system = ActorSystem("ChalkSystem")

    implicit val ec = system.dispatcher
    implicit val timeout = Timeout(10.seconds)

    val sentenceSegmenter = system.actorOf(Props[SentenceSegmenterActor[StringAnnotation]])
    val tokenizer = system.actorOf(Props[TokenizerActor[Sentence]])

    // Each stage is asked in turn; the tokenizer needs the segmenter's output.
    val pipeline = for {
      slab1 <- (sentenceSegmenter ? Process(slab)).mapTo[Slab[String,StringAnnotation,Sentence]]
      slab2 <- (tokenizer ? Process(slab1)).mapTo[Slab[String,StringAnnotation,Sentence with Token]]
    } yield slab2

    pipeline.onComplete { outcome =>
      try {
        outcome match {
          case Success(annotated) =>
            // Notice that the last sentence (lacking EOS char) is missing.
            val sentences = annotated.iterator[Sentence].toList
            println("\nSENTENCES\n\n" + sentences.map(_.in(annotated).content).mkString("\n"))

            val tokens = annotated.iterator[Token].toList
            println("\nTOKENS\n\n" + tokens.map(_.in(annotated).content).mkString("\n"))

          case Failure(cause) =>
            System.err.println("Analysis pipeline failed: " + cause)
        }
      } finally {
        // Without this the non-daemon actor threads keep the JVM alive.
        system.shutdown()
      }
    }

  }

}
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@ package chalk.slab

import scala.reflect.ClassTag

trait SLAB[ContentType, BaseAnnotationType, +AnnotationTypes <: BaseAnnotationType] {
trait Slab[ContentType, BaseAnnotationType, +AnnotationTypes <: BaseAnnotationType] {

val content: ContentType

def ++[A <: BaseAnnotationType](annotations: Iterator[A]): SLAB[ContentType, BaseAnnotationType, AnnotationTypes with A]
def ++[A <: BaseAnnotationType](annotations: Iterator[A]): Slab[ContentType, BaseAnnotationType, AnnotationTypes with A]

def iterator[A >: AnnotationTypes <: BaseAnnotationType: ClassTag]: Iterator[A]

Expand All @@ -15,11 +15,16 @@ trait SLAB[ContentType, BaseAnnotationType, +AnnotationTypes <: BaseAnnotationTy
def preceding[A >: AnnotationTypes <: BaseAnnotationType: ClassTag](annotation: BaseAnnotationType): Iterator[A]

def following[A >: AnnotationTypes <: BaseAnnotationType: ClassTag](annotation: BaseAnnotationType): Iterator[A]

/** Renders the annotations of type `A` returned by `iterator`, separated by newlines. */
def stringRep[A >: AnnotationTypes <: BaseAnnotationType: ClassTag] =
  iterator[A].mkString("\n")

}

abstract class SLABAnnotationOps[ContentType, BaseAnnotationType, AnnotationType >: AnnotationTypes <: BaseAnnotationType: ClassTag, AnnotationTypes <: BaseAnnotationType](
abstract class SlabAnnotationOps[ContentType, BaseAnnotationType, AnnotationType >: AnnotationTypes <: BaseAnnotationType: ClassTag, AnnotationTypes <: BaseAnnotationType](
val annotation: AnnotationType,
val slab: SLAB[ContentType, BaseAnnotationType, AnnotationTypes]) {
val slab: Slab[ContentType, BaseAnnotationType, AnnotationTypes]) {

def content: ContentType

Expand All @@ -30,9 +35,39 @@ abstract class SLABAnnotationOps[ContentType, BaseAnnotationType, AnnotationType
def following[A >: AnnotationTypes <: BaseAnnotationType: ClassTag] = this.slab.following[A](this.annotation)
}

object SLAB {
def apply[ContentType, BaseAnnotationType: HasBounds](content: ContentType): SLAB[ContentType, BaseAnnotationType, BaseAnnotationType] =
new HorribleInefficientSLAB(content)
// =========================
// Annotation infrastructure
// =========================
/** Base type for annotations over a `String`, described by a pair of integer offsets. */
trait StringAnnotation {
// Offset used as the first argument to `substring` when this annotation's text is extracted.
val begin: Int
// Offset used as the second argument to `substring` when this annotation's text is extracted.
val end: Int
/** Pairs this annotation with a slab so its covered text can be read.
  *
  * @param slab the string slab this annotation refers to
  * @return annotation ops whose `content` is the substring of the slab's
  *         content between `begin` and `end`
  */
def in[AnnotationTypes <: StringAnnotation](slab: Slab[String, StringAnnotation, AnnotationTypes]) =
new SlabAnnotationOps(this, slab) {
// Inside this anonymous subclass `this` is the ops object, hence `this.slab` and `this.annotation`.
def content = this.slab.content.substring(this.annotation.begin, this.annotation.end)
}
}

/** Companion providing the implicit bounds comparisons for [[StringAnnotation]]. */
object StringAnnotation {

  /** Implicit `Slab.HasBounds` instance that compares annotations by their `begin`/`end` offsets. */
  implicit object StringAnnotationHasBounds extends Slab.HasBounds[StringAnnotation] {

    /** True when `annotation2` lies within `annotation1`; shared endpoints count as covered. */
    def covers(annotation1: StringAnnotation, annotation2: StringAnnotation): Boolean =
      annotation2.begin >= annotation1.begin && annotation1.end >= annotation2.end

    /** True when `annotation1` begins at or after the end of `annotation2`. */
    def follows(annotation1: StringAnnotation, annotation2: StringAnnotation): Boolean =
      annotation1.begin >= annotation2.end

    /** True when `annotation1` ends at or before the beginning of `annotation2`. */
    def precedes(annotation1: StringAnnotation, annotation2: StringAnnotation): Boolean =
      annotation2.begin >= annotation1.end
  }
}

// ===========
// Annotations
// ===========
/** A sentence annotation identified by its `begin` and `end` offsets. */
case class Sentence(begin: Int, end: Int) extends StringAnnotation
/** A token annotation identified by its `begin` and `end` offsets. */
case class Token(begin: Int, end: Int) extends StringAnnotation


object Slab {
/** Creates a slab over `content`, passing no annotations to the underlying implementation.
  *
  * Requires an implicit `HasBounds` instance for the base annotation type.
  *
  * @param content the content the slab wraps
  */
def apply[ContentType, BaseAnnotationType: HasBounds](content: ContentType): Slab[ContentType, BaseAnnotationType, BaseAnnotationType] =
new HorribleInefficientSlab(content)

/**
* This trait has the minimum necessary for the implementation below.
Expand All @@ -45,14 +80,14 @@ object SLAB {
def follows(annotation1: AnnotationType, annotation2: AnnotationType): Boolean
}

private[slab] class HorribleInefficientSLAB[ContentType, BaseAnnotationType, AnnotationTypes <: BaseAnnotationType](
private[slab] class HorribleInefficientSlab[ContentType, BaseAnnotationType, AnnotationTypes <: BaseAnnotationType](
val content: ContentType,
val _annotations: Seq[Any] = Seq.empty)(
implicit hasBounds: HasBounds[BaseAnnotationType])
extends SLAB[ContentType, BaseAnnotationType, AnnotationTypes] {
extends Slab[ContentType, BaseAnnotationType, AnnotationTypes] {

def ++[AnnotationType](annotations: Iterator[AnnotationType]): SLAB[ContentType, BaseAnnotationType, AnnotationTypes with AnnotationType] =
new HorribleInefficientSLAB(this.content, this._annotations ++ annotations)
def ++[AnnotationType](annotations: Iterator[AnnotationType]): Slab[ContentType, BaseAnnotationType, AnnotationTypes with AnnotationType] =
new HorribleInefficientSlab(this.content, this._annotations ++ annotations)

def iterator[A >: AnnotationTypes <: BaseAnnotationType: ClassTag]: Iterator[A] =
this._annotations.iterator.collect {
Expand All @@ -67,5 +102,6 @@ object SLAB {

/** Returns, in reverse iteration order, the annotations of type `A` that `hasBounds` reports as preceding `annotation`. */
def preceding[A >: AnnotationTypes <: BaseAnnotationType: ClassTag](annotation: BaseAnnotationType): Iterator[A] = {
  val earlier = this.iterator[A].filter(candidate => hasBounds.precedes(candidate, annotation))
  // Materialized so the matches can be walked back to front.
  earlier.toSeq.reverseIterator
}

}
}
Loading

0 comments on commit 856c767

Please sign in to comment.