diff --git a/src/main/scala/chalk/slab/AnalysisEngine.scala b/src/main/scala/chalk/slab/AnalysisEngine.scala
index 5efafd1..eb8d92a 100644
--- a/src/main/scala/chalk/slab/AnalysisEngine.scala
+++ b/src/main/scala/chalk/slab/AnalysisEngine.scala
@@ -6,23 +6,11 @@ import akka.util.Timeout
 import scala.collection.mutable.ListBuffer
 import scala.concurrent.duration._
 
-/**
- * An analysis function that takes a Slab with declared annotation types in it and outputs
- * a new Slab with additional annotations of a new type.
- *
- * Documentation for the type variables:
- * C = Content type
- * B = Base annonation type
- * I = Input annotation type
- * O = Output annotation type
- */
-trait AnalysisFunction[C,B,-I<:B,+O<:B] extends (Slab[C,B,I] => Slab[C,B,B with O])
-
 /**
  * An actor that mixes-in an AnalysisFunction and hands Slabs contained in Process messages over
  * to the function.
  */
-trait AnalysisComponent[C,B,-I<:B,+O<:B] extends Actor with ActorLogging with AnalysisFunction[C,B,I,O] {
+trait AnalysisComponent[C,B,I<:B,O<:B] extends Actor with ActorLogging with AnalysisFunction[C,B,I,O] {
   import AnalysisComponent._
   def receive = {
     case Process(slab) => sender ! apply(slab.asInstanceOf[Slab[C,B,I]])
@@ -36,30 +24,12 @@ object AnalysisComponent {
   case class Process[C,B,I<:B](slab: Slab[C,B,I])
 }
 
-/**
- * A simple regex sentence segmenter.
- */
-trait SentenceSegmenter extends AnalysisFunction[String, StringAnnotation, StringAnnotation, Sentence] {
-  def apply(slab: Slab[String, StringAnnotation, StringAnnotation]) =
-    slab ++ "[^\\s.!?]+[^.!?]+[.!?]".r.findAllMatchIn(slab.content).map(m => Sentence(m.start, m.end))
-}
-
 /**
  * An actor that uses SentenceSegmenter.
  */
 class SentenceSegmenterActor extends SentenceSegmenter
   with AnalysisComponent[String,StringAnnotation,StringAnnotation,Sentence]
 
-/**
- * A simple regex tokenizer.
- */
-trait Tokenizer extends AnalysisFunction[String, StringAnnotation, Sentence, Token] {
-  def apply(slab: Slab[String, StringAnnotation, Sentence]) =
-    slab ++ slab.iterator[Sentence].flatMap(sentence =>
-      "\\p{L}+|\\p{P}+|\\p{N}+".r.findAllMatchIn(sentence.in(slab).content).map(m =>
-        Token(sentence.begin + m.start, sentence.begin + m.end)))
-}
-
 /**
  * An actor that uses Tokenizer.
  */
@@ -73,8 +43,9 @@ object AnalysisEngine {
   import AnalysisComponent._
   import StringAnnotation._
 
+  val text = "Here is an example text. It has four sentences and it mentions Jimi Hendrix and Austin, Texas! In this third sentence, it also brings up Led Zeppelin and Radiohead, but does it ask a question? It also has a straggler sentence that doesn't end with punctuation"
+
   def main(args: Array[String]) {
-    val text = "Here is an example text. It has four sentences and it mentions Jimi Hendrix and Austin, Texas! In this third sentence, it also brings up Led Zeppelin and Radiohead, but does it ask a question? It also has a straggler sentence that doesn't end with punctuation"
     val slab = Slab(text)
     val system = ActorSystem("ChalkSystem")
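
The AnalysisComponent actors above are driven with Akka's ask pattern: a Process message carries a Slab in, and the enriched Slab comes back as the reply. Below is a minimal sketch of that interaction, assuming the actors and messages defined in this patch; the TokenizerActor name (its class line falls outside the hunk context above), the timeout value, and the for-comprehension shape are illustrative, not the exact body of the truncated main method:

    import akka.actor.{ActorSystem, Props}
    import akka.pattern.ask
    import akka.util.Timeout
    import scala.concurrent.duration._
    import AnalysisComponent._
    import StringAnnotation._

    implicit val timeout = Timeout(10.seconds)
    val system = ActorSystem("ChalkSystem")
    import system.dispatcher // ExecutionContext for the Future combinators

    val sentenceActor = system.actorOf(Props[SentenceSegmenterActor])
    val tokenActor = system.actorOf(Props[TokenizerActor]) // hypothetical actor name

    val slab = Slab(AnalysisEngine.text)
    for {
      // Each ask sends a Process message and receives the enriched Slab back.
      slab1 <- (sentenceActor ? Process(slab)).mapTo[Slab[String, StringAnnotation, StringAnnotation with Sentence]]
      slab2 <- (tokenActor ? Process(slab1)).mapTo[Slab[String, StringAnnotation, StringAnnotation with Sentence with Token]]
    } {
      slab2.iterator[Token].foreach(token => println(token.in(slab2).content))
      system.shutdown() // Akka 2.2-era shutdown
    }
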
diff --git a/src/main/scala/chalk/slab/AnalysisFunction.scala b/src/main/scala/chalk/slab/AnalysisFunction.scala
new file mode 100644
index 0000000..950a3df
--- /dev/null
+++ b/src/main/scala/chalk/slab/AnalysisFunction.scala
@@ -0,0 +1,56 @@
+package chalk.slab
+
+/**
+ * An analysis function that takes a Slab with declared annotation types in it and outputs
+ * a new Slab with additional annotations of a new type.
+ *
+ * Documentation for the type variables:
+ * C = Content type
+ * B = Base annotation type
+ * I = Input annotation type
+ * O = Output annotation type
+ */
+trait AnalysisFunction[C,B,I<:B,O<:B] extends (Slab[C,B,I] => Slab[C,B,B with I with O])
+
+object StringIdentityAnalyzer extends AnalysisFunction[String, StringAnnotation, StringAnnotation, StringAnnotation] {
+  def apply(slab: Slab[String, StringAnnotation, StringAnnotation]) = slab
+}
+
+/**
+ * A simple regex sentence segmenter.
+ */
+trait SentenceSegmenter extends AnalysisFunction[String, StringAnnotation, StringAnnotation, Sentence] {
+  def apply(slab: Slab[String, StringAnnotation, StringAnnotation]) =
+    slab ++ "[^\\s.!?]+[^.!?]+[.!?]".r.findAllMatchIn(slab.content).map(m => Sentence(m.start, m.end))
+}
+
+/**
+ * A simple regex tokenizer.
+ */
+trait Tokenizer extends AnalysisFunction[String, StringAnnotation, Sentence, Token] {
+  def apply(slab: Slab[String, StringAnnotation, Sentence]) =
+    slab ++ slab.iterator[Sentence].flatMap(sentence =>
+      "\\p{L}+|\\p{P}+|\\p{N}+".r.findAllMatchIn(sentence.in(slab).content).map(m =>
+        Token(sentence.begin + m.start, sentence.begin + m.end)))
+}
+
+
+object AnalysisPipeline {
+  import StringAnnotation._
+
+  def main(args: Array[String]) {
+    val sentenceSegmenter = new SentenceSegmenter {}
+    val tokenizer = new Tokenizer {}
+    val pipeline = StringIdentityAnalyzer andThen sentenceSegmenter andThen tokenizer
+    val slab = pipeline(Slab(AnalysisEngine.text))
+    // Notice that the last sentence (lacking an EOS character) is missing.
+    val sentences = slab.iterator[Sentence].toList
+    println("\nSENTENCES\n\n" + sentences.map(_.in(slab).content).mkString("\n"))
+
+    val tokens = slab.iterator[Token].toList
+    println("\nTOKENS\n\n" + tokens.map(_.in(slab).content).mkString("\n"))
+
+  }
+
+
+}
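
Because the new AnalysisFunction trait is just a Slab => Slab function, further stages compose with andThen exactly as SentenceSegmenter and Tokenizer do above. Below is a minimal sketch of a third stage following the same pattern; the NamedEntity annotation and the hard-coded name set are hypothetical and not part of this patch:

    import chalk.slab._
    import StringAnnotation._

    // Hypothetical annotation type (not in this patch); assumes it mirrors
    // Sentence and Token as a StringAnnotation with begin/end offsets.
    case class NamedEntity(begin: Int, end: Int) extends StringAnnotation

    // A gazetteer-style matcher: consumes Token annotations and adds
    // NamedEntity annotations, just as Tokenizer consumes Sentences.
    trait NamedEntityMatcher extends AnalysisFunction[String, StringAnnotation, Token, NamedEntity] {
      val names = Set("Jimi", "Hendrix", "Led", "Zeppelin", "Radiohead")
      def apply(slab: Slab[String, StringAnnotation, Token]) =
        slab ++ slab.iterator[Token].collect {
          case token if names(token.in(slab).content) =>
            NamedEntity(token.begin, token.end)
        }
    }

    // Composes like any other stage:
    // val pipeline = StringIdentityAnalyzer andThen (new SentenceSegmenter {}) andThen
    //   (new Tokenizer {}) andThen (new NamedEntityMatcher {})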