This repository has been archived by the owner on May 29, 2020. It is now read-only.

Commit a441814
Merge pull request #20 from bethard/master
Adds an input type parameter to SentenceSegmenter and Tokenizer
jasonbaldridge committed Jul 22, 2013
2 parents: 8dd6388 + a86a1ee
Showing 2 changed files with 23 additions and 12 deletions.
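In short, both analyzer traits gain a type parameter for the annotation type they accept as input, instead of hard-coding it. The before-and-after signatures, copied from the diff below:

```scala
// Before: the input annotation type is fixed.
trait SentenceSegmenter extends AnalysisFunction[String, StringAnnotation, StringAnnotation, Sentence]
trait Tokenizer extends AnalysisFunction[String, StringAnnotation, Sentence, Token]

// After: the caller chooses the input type I, bounded by what each analyzer needs.
trait SentenceSegmenter[I <: StringAnnotation] extends AnalysisFunction[String, StringAnnotation, I, Sentence]
trait Tokenizer[I <: Sentence] extends AnalysisFunction[String, StringAnnotation, I, Token]
```

An analyzer can now follow any pipeline stage whose output annotation type satisfies its bound, rather than requiring an exact type match.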
src/main/scala/chalk/slab/AnalysisEngine.scala (2 additions, 2 deletions)
@@ -28,13 +28,13 @@ object AnalysisComponent {
 /**
  * An actor that uses SentenceSegmenter.
  */
-class SentenceSegmenterActor extends SentenceSegmenter
+class SentenceSegmenterActor extends SentenceSegmenter[StringAnnotation]
   with AnalysisComponent[String,StringAnnotation,StringAnnotation,Sentence]

 /**
  * An actor that uses Tokenizer.
  */
-class TokenizerActor extends AnalysisComponent[String, StringAnnotation, Sentence, Token] with Tokenizer
+class TokenizerActor extends AnalysisComponent[String, StringAnnotation, Sentence, Token] with Tokenizer[Sentence]


 /**
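The actor classes above now pin the new parameters to concrete types, since each actor serves a single slab type. Used without actors, the parameterized traits compose directly; the following is a hedged sketch against this repo's API (the sample text and value names are invented, and the exact inferred Slab types may differ):

```scala
import chalk.slab._

// Instantiate the traits with explicit input annotation types, mirroring
// the actor declarations above.
val segmenter = new SentenceSegmenter[StringAnnotation] {}
val tokenizer = new Tokenizer[Sentence] {}

val slab = Slab("Hello there. How are you?") // a fresh slab over raw text
val withSentences = segmenter(slab)          // adds Sentence annotations
val withTokens = tokenizer(withSentences)    // adds Token annotations inside each Sentence
```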
src/main/scala/chalk/slab/AnalysisFunction.scala (21 additions, 10 deletions)
@@ -19,30 +19,41 @@ object StringIdentityAnalyzer extends AnalysisFunction[String, StringAnnotation,
 /**
  * A simple regex sentence segmenter.
  */
-trait SentenceSegmenter extends AnalysisFunction[String, StringAnnotation, StringAnnotation, Sentence] {
-  def apply(slab: Slab[String, StringAnnotation, StringAnnotation]) =
-    slab ++ "[^\\s.!?]+[^.!?]+[.!?]".r.findAllMatchIn(slab.content).map(m => Sentence(m.start, m.end))
+trait SentenceSegmenter[I <: StringAnnotation] extends AnalysisFunction[String, StringAnnotation, I, Sentence] {
+  def apply(slab: Slab[String, StringAnnotation, I]) =
+    // the [Sentence] is required because of https://issues.scala-lang.org/browse/SI-7647
+    slab.++[Sentence]("[^\\s.!?]+[^.!?]+[.!?]".r.findAllMatchIn(slab.content).map(m => Sentence(m.start, m.end)))
 }

 /**
  * A simple regex tokenizer.
  */
-trait Tokenizer extends AnalysisFunction[String, StringAnnotation, Sentence, Token] {
-  def apply(slab: Slab[String, StringAnnotation, Sentence]) =
-    slab ++ slab.iterator[Sentence].flatMap(sentence =>
+trait Tokenizer[I <: Sentence] extends AnalysisFunction[String, StringAnnotation, I, Token] {
+  def apply(slab: Slab[String, StringAnnotation, I]) =
+    // the [Token] is required because of https://issues.scala-lang.org/browse/SI-7647
+    slab.++[Token](slab.iterator[Sentence].flatMap(sentence =>
       "\\p{L}+|\\p{P}+|\\p{N}+".r.findAllMatchIn(sentence.in(slab).content).map(m =>
-        Token(sentence.begin + m.start, sentence.begin + m.end)))
+        Token(sentence.begin + m.start, sentence.begin + m.end))))
 }


 object AnalysisPipeline {
   import StringAnnotation._

+  // added only to demonstrate the necessity of the [I] parameter on analyzers
+  private[AnalysisPipeline] case class Document(val begin: Int, val end: Int) extends StringAnnotation
+  private[AnalysisPipeline] def documentAdder(slab: Slab[String, StringAnnotation, StringAnnotation]) =
+    slab ++ Iterator(Document(0, slab.content.length))
+
   def main (args: Array[String]) {
-    val sentenceSegmenter = new SentenceSegmenter{}
-    val tokenizer = new Tokenizer {}
-    val pipeline = StringIdentityAnalyzer andThen sentenceSegmenter andThen tokenizer
+    def sentenceSegmenter[I <: StringAnnotation] = new SentenceSegmenter[I]{}
+    def tokenizer[I <: Sentence] = new Tokenizer[I]{}
+    val pipeline = StringIdentityAnalyzer andThen documentAdder andThen sentenceSegmenter andThen tokenizer
     val slab = pipeline(Slab(AnalysisEngine.text1))

+    // added only to demonstrate the necessity of the [I] parameter on analyzers
+    val paragraphs = slab.iterator[Document].toList
+
     // Notice that the last sentence (lacking EOS char) is missing.
     val sentences = slab.iterator[Sentence].toList
     println("\nSENTENCES\n\n" + sentences.map(_.in(slab).content).mkString("\n"))
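The documentAdder and paragraphs lines above exist, per their comments, only to show why the [I] parameter is needed: documentAdder leaves the slab typed to its own annotation type, and a segmenter whose input is fixed to exactly StringAnnotation could not accept that slab. Below is a self-contained sketch of the underlying variance problem; the Slab here is a simplified stand-in, assuming (as the fix implies) that the real Slab is invariant in its annotation type:

```scala
object WhyTheTypeParameter {
  trait StringAnnotation { def begin: Int; def end: Int }
  case class Document(begin: Int, end: Int) extends StringAnnotation
  case class Sentence(begin: Int, end: Int) extends StringAnnotation

  // Simplified stand-in for chalk's Slab, invariant in A.
  final case class Slab[A <: StringAnnotation](content: String, annotations: Vector[A])

  // Fixed input type: accepts only Slab[StringAnnotation], nothing narrower.
  def segmentFixed(slab: Slab[StringAnnotation]): Vector[Sentence] =
    "[^\\s.!?]+[^.!?]+[.!?]".r.findAllMatchIn(slab.content)
      .map(m => Sentence(m.start, m.end)).toVector

  // Parameterized input type: accepts a slab of any StringAnnotation subtype.
  def segment[I <: StringAnnotation](slab: Slab[I]): Vector[Sentence] =
    "[^\\s.!?]+[^.!?]+[.!?]".r.findAllMatchIn(slab.content)
      .map(m => Sentence(m.start, m.end)).toVector

  def main(args: Array[String]): Unit = {
    val text = "Hello there. How are you?"
    // Analogous to documentAdder: the slab is now typed to Document.
    val docSlab = Slab(text, Vector(Document(0, text.length)))

    // segmentFixed(docSlab)  // does not compile: Slab[Document] is not a Slab[StringAnnotation]
    println(segment(docSlab)) // compiles: I is inferred as Document
  }
}
```

With the parameter in place, documentAdder can sit ahead of the segmenter in the pipeline, and the Document annotations remain queryable afterward via slab.iterator[Document].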
