This repository has been archived by the owner on May 29, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 49
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Draft of actor based system for using Slabs.
- Loading branch information
1 parent
a400ed8
commit 856c767
Showing
5 changed files
with
277 additions
and
50 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
// NOTE(review): superseded draft of a string-keyed, Map-based actor pipeline
// for Slabs, kept commented out for reference; the typed Slab version in this
// commit replaces it. Known rough edges if this draft is ever revived:
//   - RegexSentenceDetector calls enderIndices.last, which throws when the
//     content contains no '.', '?' or '!' (the buffer is then empty).
//   - Annotation layers are addressed by raw string keys ("sentences",
//     "tokens", "entities"), so a mistyped key fails only at runtime.
//
//package chalk.slab
//
//import akka.actor.{Actor,ActorLogging,ActorSystem,Props}
//import akka.pattern.ask
//import akka.util.Timeout
//import scala.collection.mutable.ListBuffer
//import scala.concurrent.duration._
//
//// An immutable text document plus named layers of Span annotations.
//case class ActorSlab(
//  content: String,
//  annotations: Map[String,Seq[Span]] = Map[String, Seq[Span]]()
//) {
//
//  override def toString = {
//    val annotationString = (for ((attr, spans) <- annotations) yield {
//      " " + attr + ": " + spans.map(s=>"["+content.substring(s.start,s.end)+"]").mkString(" ")
//    }).mkString("\n\n")
//    s"$content\n\n$annotationString"
//  }
//}
//
//object ActorSlab {
//
//  // Returns a copy of the slab with spansForKey stored (or replaced) under key.
//  def update(slab: ActorSlab, key: String, spansForKey: Seq[Span]) =
//    ActorSlab(slab.content, slab.annotations ++ Map(key -> spansForKey.toSeq))
//
//}
//
//// Character offsets [start, end) into a slab's content.
//case class Span(start: Int, end: Int)
//
//trait AnalysisComponent extends Actor with ActorLogging
//
//object AnalysisComponent {
//  case class Process(slab: ActorSlab)
//}
//
//// Splits content on '.', '?', '!' and replies with a "sentences" layer added.
//class RegexSentenceDetector extends AnalysisComponent {
//
//  import AnalysisComponent._
//
//  def receive = {
//
//    case Process(slab) =>
//      val highestIndex = slab.content.length
//      val matches = """[.?!]""".r.findAllMatchIn(slab.content)
//      // NOTE(review): .last below throws on content with no [.?!] character.
//      val enderIndices = ListBuffer(matches.map(_.end).toList: _*)
//      if (enderIndices.last < highestIndex)
//        enderIndices += highestIndex
//
//      val paired = enderIndices.toList.flatMap(i=> List(i,i+1))
//      val spans = for (List(start,end) <- (0 :: paired).grouped(2)) yield Span(start,end)
//      sender ! ActorSlab(slab.content, slab.annotations ++ Map("sentences" -> spans.toSeq))
//  }
//
//}
//
//// Splits each sentence on whitespace and replies with a "tokens" layer added.
//class WhitespaceTokenizer extends AnalysisComponent {
//
//  import AnalysisComponent._
//  import scala.util.matching.Regex.Match
//
//  def receive = {
//    case Process(slab) =>
//      val highestIndex = slab.content.length
//      val tokenSpans = for {
//        sentenceSpan <- slab.annotations("sentences")
//        sentence = slab.content.substring(sentenceSpan.start,sentenceSpan.end)
//        wsMatches = "\\s+".r.findAllMatchIn(sentence)
//        span <- gappedSpans(wsMatches.toSeq, sentenceSpan.start, sentence.length)
//      } yield {
//        span
//      }
//      sender ! ActorSlab(slab.content, slab.annotations ++ Map("tokens" -> tokenSpans.toSeq))
//  }
//
//  // Inverts whitespace matches into the token spans between them, shifted by offset.
//  private def gappedSpans(foundMatches: Seq[Match], offset: Int, highestIndex: Int) = {
//    val flattenedMatches = ListBuffer(foundMatches.flatMap(m => Seq(m.start,m.end)): _*)
//    val allSpans = for {
//      List(start,end) <- (0 +: flattenedMatches :+ highestIndex).toList.grouped(2)
//    } yield {
//      Span(start+offset,end+offset)
//    }
//    allSpans
//  }
//
//}
//
//// Marks capitalized tokens as entities ("dumb" named-entity recognition).
//class DumbNer extends AnalysisComponent {
//
//  import AnalysisComponent._
//
//  def receive = {
//
//    case Process(slab) =>
//      val dumbNerSpans = for {
//        tokenSpan <- slab.annotations("tokens")
//        token = slab.content.substring(tokenSpan.start, tokenSpan.end)
//        if (token.head.isUpper)
//      } yield {
//        tokenSpan
//      }
//      sender ! ActorSlab.update(slab, "entities", dumbNerSpans)
//  }
//
//}
//
//
//// Demo: run detector -> tokenizer -> NER over a sample text, then print.
//object AnalysisEngine {
//
//  import AnalysisComponent._
//
//  def main(args: Array[String]) {
//    val text = "Here is an example text. It has four sentences and it mentions Jimi Hendrix and Austin, Texas! In this third sentence, it also brings up Led Zeppelin and Radiohead, but does it ask a question? It also has a straggler sentence that doesn't end with punctuation"
//
//    val slab = ActorSlab(text)
//    val system = ActorSystem("ChalkSystem")
//
//    implicit val ec = system.dispatcher
//    implicit val timeout = Timeout(10 seconds)
//
//    val sentenceDetector = system.actorOf(Props[RegexSentenceDetector])
//    val tokenizer = system.actorOf(Props[WhitespaceTokenizer])
//    val ner = system.actorOf(Props[DumbNer])
//
//    for {
//      slab1 <- (sentenceDetector ? Process(slab)).mapTo[ActorSlab]
//      slab2 <- (tokenizer ? Process(slab1)).mapTo[ActorSlab]
//      slab3 <- (ner ? Process(slab2)).mapTo[ActorSlab]
//    } {
//      println(slab3)
//      system.shutdown
//    }
//
//  }
//
//}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
package chalk.slab | ||
|
||
import akka.actor.{Actor,ActorLogging,ActorSystem,Props} | ||
import akka.pattern.ask | ||
import akka.util.Timeout | ||
import scala.collection.mutable.ListBuffer | ||
import scala.concurrent.duration._ | ||
|
||
/** An actor that enriches a [[Slab]] with one additional layer of annotations.
  *
  * @tparam X the slab's content type
  * @tparam Y the upper bound of all annotation types in the slab
  * @tparam Z the annotation types this component consumes (contravariant)
  * @tparam W the annotation types this component produces (covariant)
  */
trait AnalysisComponent[X, Y, -Z <: Y, +W <: Y] extends Actor with ActorLogging {

  import AnalysisComponent._

  /** Return the slab enriched with this component's W annotations. */
  def process(slab: Slab[X, Y, Z]): Slab[X, Y, Y with W]

  def receive = {
    case Process(incoming) =>
      // The message's type parameters are erased at the pattern match, so the
      // expected slab type is recovered with a cast before delegating.
      val typedSlab = incoming.asInstanceOf[Slab[X, Y, Z]]
      sender ! process(typedSlab)
  }
}
|
||
/** Message protocol shared by all analysis components. */
object AnalysisComponent {
  /** Ask the receiving component to annotate `slab` and reply with the
    * enriched slab.
    */
  case class Process[X, Y, Z <: Y](slab: Slab[X, Y, Z])
}
|
||
/** Splits the slab content into [[Sentence]] spans with a regex heuristic.
  *
  * A sentence starts at a character that is neither whitespace nor an EOS
  * mark, runs up to the next '.', '!' or '?', and includes that mark when
  * present. The EOS mark is optional ([.!?]?) so that a trailing sentence
  * without terminal punctuation is still captured — the previous pattern
  * required it and silently dropped such stragglers.
  */
class SentenceSegmenterActor[AnnotationTypes <: StringAnnotation]
    extends AnalysisComponent[String, StringAnnotation, AnnotationTypes, Sentence] {

  def process(slab: Slab[String, StringAnnotation, AnnotationTypes]) =
    slab ++ """[^\s.!?]+[^.!?]*[.!?]?""".r.findAllMatchIn(slab.content).map(m => Sentence(m.start, m.end))
}
|
||
/** Adds [[Token]] spans inside every [[Sentence]] annotation of the slab.
  *
  * A token is a maximal run of letters, punctuation, or digits (Unicode
  * classes L, P, N). Match offsets are relative to the sentence text, so the
  * sentence's begin offset is added to make them absolute in the slab.
  */
class TokenizerActor[AnnotationTypes <: Sentence]
    extends AnalysisComponent[String, StringAnnotation, AnnotationTypes, Token] {

  def process(slab: Slab[String, StringAnnotation, AnnotationTypes]) = {
    // Compile the pattern once per call instead of once per sentence.
    val tokenPattern = "\\p{L}+|\\p{P}+|\\p{N}+".r
    val tokens = for {
      sentence <- slab.iterator[Sentence]
      m <- tokenPattern.findAllMatchIn(sentence.in(slab).content)
    } yield Token(sentence.begin + m.start, sentence.begin + m.end)
    slab ++ tokens
  }
}
|
||
|
||
/** Demo entry point: wires the sentence segmenter and tokenizer actors into
  * an ask-based pipeline over a sample text and prints the resulting
  * sentences and tokens.
  */
object AnalysisEngine {

  import AnalysisComponent._
  import StringAnnotation._

  // Explicit `: Unit =` — procedure syntax is deprecated.
  def main(args: Array[String]): Unit = {
    val text = "Here is an example text. It has four sentences and it mentions Jimi Hendrix and Austin, Texas! In this third sentence, it also brings up Led Zeppelin and Radiohead, but does it ask a question? It also has a straggler sentence that doesn't end with punctuation"

    val slab = Slab(text)
    val system = ActorSystem("ChalkSystem")

    implicit val ec = system.dispatcher
    // Upper bound on each ask (?) below; replies later than this fail the
    // future. Dotted 10.seconds avoids the postfix-operator syntax.
    implicit val timeout = Timeout(10.seconds)

    val sentenceSegmenter = system.actorOf(Props[SentenceSegmenterActor[StringAnnotation]])
    val tokenizer = system.actorOf(Props[TokenizerActor[Sentence]])

    // NOTE(review): a failed ask (e.g. timeout) silently skips this body —
    // consider a recover/onFailure handler so errors are at least logged.
    for {
      slab1 <- (sentenceSegmenter ? Process(slab)).mapTo[Slab[String,StringAnnotation,Sentence]]
      slab2 <- (tokenizer ? Process(slab1)).mapTo[Slab[String,StringAnnotation,Sentence with Token]]
    } {
      // NOTE(review): depending on the segmenter's regex, a trailing sentence
      // lacking an EOS character may be dropped — verify against the segmenter.
      val sentences = slab2.iterator[Sentence].toList
      println("\nSENTENCES\n\n" + sentences.map(_.in(slab2).content).mkString("\n"))

      val tokens = slab2.iterator[Token].toList
      println("\nTOKENS\n\n" + tokens.map(_.in(slab2).content).mkString("\n"))

      // Stop the actor system once the final result arrives, otherwise its
      // threads keep the JVM alive. Parentheses mark the side effect.
      system.shutdown()
    }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Perhaps instead of defining a `process` method, we could make AnalysisComponent extend `(Slab[X,Y,Z] => Slab[X,Y,Y with W])` and call the method `apply`? That would allow users to easily switch back and forth between Akka Actors and traditional function composition.