diff --git a/src/main/scala/chalk/slab/AnalysisEngine.scala b/src/main/scala/chalk/slab/AnalysisEngine.scala index 1c8b849..5efafd1 100644 --- a/src/main/scala/chalk/slab/AnalysisEngine.scala +++ b/src/main/scala/chalk/slab/AnalysisEngine.scala @@ -6,41 +6,68 @@ import akka.util.Timeout import scala.collection.mutable.ListBuffer import scala.concurrent.duration._ -trait AnalysisComponent[X,Y,-Z<:Y,+W<:Y] extends Actor with ActorLogging { - +/** + * An analysis function that takes a Slab with declared annotation types in it and outputs + * a new Slab with additional annotations of a new type. + * + * Documentation for the type variables: + * C = Content type + * B = Base annotation type + * I = Input annotation type + * O = Output annotation type + */ +trait AnalysisFunction[C,B,-I<:B,+O<:B] extends (Slab[C,B,I] => Slab[C,B,B with O]) + +/** + * An actor that mixes-in an AnalysisFunction and hands Slabs contained in Process messages over + * to the function. + */ +trait AnalysisComponent[C,B,-I<:B,+O<:B] extends Actor with ActorLogging with AnalysisFunction[C,B,I,O] { import AnalysisComponent._ - - def process(slab: Slab[X,Y,Z]): Slab[X,Y,Y with W] - def receive = { - case Process(slab) => - sender ! process(slab.asInstanceOf[Slab[X,Y,Z]]) + case Process(slab) => sender ! apply(slab.asInstanceOf[Slab[C,B,I]]) } - } +/** + * Companion object, e.g. to hold messages that can be processed by an AnalysisComponent actor. + */ object AnalysisComponent { - case class Process[X,Y,Z<:Y](slab: Slab[X,Y,Z]) + case class Process[C,B,I<:B](slab: Slab[C,B,I]) } -class SentenceSegmenterActor[AnnotationTypes <: StringAnnotation] - extends AnalysisComponent[String, StringAnnotation, AnnotationTypes, Sentence] { - - def process(slab: Slab[String, StringAnnotation, AnnotationTypes]) = +/** + * A simple regex sentence segmenter. 
+ */ +trait SentenceSegmenter extends AnalysisFunction[String, StringAnnotation, StringAnnotation, Sentence] { + def apply(slab: Slab[String, StringAnnotation, StringAnnotation]) = slab ++ "[^\\s.!?]+[^.!?]+[.!?]".r.findAllMatchIn(slab.content).map(m => Sentence(m.start, m.end)) } -class TokenizerActor[AnnotationTypes <: Sentence] - extends AnalysisComponent[String, StringAnnotation, AnnotationTypes, Token] { - - def process(slab: Slab[String, StringAnnotation, AnnotationTypes]) = +/** + * An actor that uses SentenceSegmenter. + */ +class SentenceSegmenterActor extends SentenceSegmenter + with AnalysisComponent[String,StringAnnotation,StringAnnotation,Sentence] + +/** + * A simple regex tokenizer. + */ +trait Tokenizer extends AnalysisFunction[String, StringAnnotation, Sentence, Token] { + def apply(slab: Slab[String, StringAnnotation, Sentence]) = slab ++ slab.iterator[Sentence].flatMap(sentence => "\\p{L}+|\\p{P}+|\\p{N}+".r.findAllMatchIn(sentence.in(slab).content).map(m => Token(sentence.begin + m.start, sentence.begin + m.end))) - } +/** + * An actor that uses Tokenizer. + */ +class TokenizerActor extends AnalysisComponent[String, StringAnnotation, Sentence, Token] with Tokenizer +/** + * Example application doing actor based Slab processing. + */ object AnalysisEngine { import AnalysisComponent._ @@ -56,8 +83,8 @@ object AnalysisEngine { implicit val timeout = Timeout(10 seconds) - val sentenceSegmenter = system.actorOf(Props[SentenceSegmenterActor[StringAnnotation]]) - val tokenizer = system.actorOf(Props[TokenizerActor[Sentence]]) + val sentenceSegmenter = system.actorOf(Props[SentenceSegmenterActor]) + val tokenizer = system.actorOf(Props[TokenizerActor]) for { slab1 <- (sentenceSegmenter ? Process(slab)).mapTo[Slab[String,StringAnnotation,Sentence]]