Skip to content
This repository has been archived by the owner on May 29, 2020. It is now read-only.

Commit

Permalink
Merge pull request #22 from bethard/masc-slab
Browse files Browse the repository at this point in the history
Adds support for loading MASC annotations into a Slab (issue #19)
  • Loading branch information
jasonbaldridge committed Jul 28, 2013
2 parents b3d3421 + bb5116c commit f073dd1
Show file tree
Hide file tree
Showing 19 changed files with 3,671 additions and 4 deletions.
120 changes: 118 additions & 2 deletions src/main/scala/chalk/corpora/MascUtil.scala
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,14 @@ package chalk.corpora
import scala.xml._
import java.io._
import io.Codec
import java.net.URL
import chalk.slab.Slab
import chalk.slab.Span
import chalk.slab.Source
import chalk.slab.Sentence
import chalk.slab.Segment
import chalk.slab.PartOfSpeech
import chalk.slab.EntityMention

/**
 * A node with an id and the ids it targets.
 *
 * In `MascSlab.penn` the targets are looked up as Segment ids.
 */
case class MNode(id: String, targets: Seq[String])
/**
 * An annotation as built by `MascUtil.getAnnotations`: the element's id, its `label` and `ref`
 * attributes, and its features as a name -> value map.
 */
case class MAnnotation(id: String, label: String, ref: String, features: Map[String,String])
Expand Down Expand Up @@ -271,8 +279,12 @@ object MascUtil {
.map(exml => MEdge(xmlId(exml), (exml \ "@from").toString, (exml \ "@to").toString))

/**
 * Collect every `a` element of the document as an MAnnotation.
 *
 * Each `f` descendant of an `a` element contributes one feature, keyed by its `name` attribute.
 * The feature value is the `value` attribute when that is non-empty; otherwise it falls back to
 * the string form of the element's children.
 *
 * @param doc The parsed XML document to search.
 * @return One MAnnotation per `a` element, in document order.
 */
def getAnnotations(doc: Elem) = (doc \\ "a").toSeq.map { axml =>
  val features = (axml \\ "f").toSeq.map { fnode =>
    val name = (fnode \ "@name").toString
    val value = (fnode \ "@value").toString
    // NOTE(review): `fnode.child.toString` is the toString of the child sequence, not the
    // concatenated text of the children -- confirm consumers expect that (fnode.text may be intended).
    name -> (if (!value.isEmpty) value else fnode.child.toString)
  }.toMap
  MAnnotation(xmlId(axml), (axml \ "@label").toString, (axml \ "@ref").toString, features)
}

Expand All @@ -285,3 +297,107 @@ object MascUtil {
}

}

object MascSlab {

/**
 * Create a Slab from a MASC .txt file.
 *
 * The file is read in full as UTF-8 and the underlying stream is closed before returning.
 *
 * @param textFileUrl The URL of the MASC .txt (plain text) file.
 * @return A Slab of the text, with the URL saved as a Source annotation spanning the whole text.
 */
def apply(textFileUrl: URL): Slab.StringSlab[Source] = {
  val textSource = io.Source.fromURL(textFileUrl)(Codec.UTF8)
  // Close the stream even when reading fails, so the URL connection is not leaked.
  val text = try textSource.mkString finally textSource.close()
  val slab = Slab[String, Span](text)
  slab ++ Iterator(Source(0, text.length, textFileUrl))
}

/**
 * Annotate a MASC Slab with the sentences listed in the MASC -s.xml file.
 *
 * The -s.xml location is derived from the URL of the Source annotation by replacing the
 * trailing ".txt" with "-s.xml". Exactly one Source annotation is expected; any other number
 * fails the pattern match on the first line of the body.
 *
 * @param slab The Slab containing the text and source URL
 * @return The Slab with one Sentence annotation added per region of the MASC -s.xml file.
 */
def s[I <: Source](slab: Slab.StringSlab[I]) = {
  val List(textSource) = slab.iterator[Source].toList
  val sentenceUrl = textSource.url.toString().replaceAll("[.]txt$", "-s.xml")
  val sentenceAnnotations = MascUtil.getRegions(XML.load(sentenceUrl)).map { region =>
    Sentence(region.start, region.end, Some(region.id))
  }
  slab ++ sentenceAnnotations.iterator
}

/**
 * Annotate a MASC Slab with the segments listed in the MASC -seg.xml file.
 *
 * The -seg.xml location is derived from the URL of the Source annotation by replacing the
 * trailing ".txt" with "-seg.xml". Exactly one Source annotation is expected; any other number
 * fails the pattern match on the first line of the body.
 *
 * @param slab The Slab containing the text and source URL
 * @return The Slab with one Segment annotation added per region of the MASC -seg.xml file.
 */
def seg[I <: Source](slab: Slab.StringSlab[I]) = {
  val List(textSource) = slab.iterator[Source].toList
  val segmentUrl = textSource.url.toString().replaceAll("[.]txt$", "-seg.xml")
  val segmentAnnotations = MascUtil.getRegions(XML.load(segmentUrl)).map { region =>
    Segment(region.start, region.end, Some(region.id))
  }
  slab ++ segmentAnnotations.iterator
}

/**
 * Adds Penn PartOfSpeech tags using the MASC -penn.xml file.
 *
 * Assumes there will be exactly one Source annotation, providing the URL of the MASC .txt file,
 * and that every node target in the -penn.xml file is the id of a Segment already in the slab.
 *
 * @param slab The Slab containing the text, the source URL and the Segment annotations.
 * @return The Slab with added PartOfSpeech annotations as read from the MASC -penn.xml file.
 */
def penn[I <: Source with Segment](slab: Slab.StringSlab[I]) = {
  val List(source) = slab.iterator[Source].toList
  val pennXml = XML.load(source.url.toString().replaceAll("[.]txt$", "-penn.xml"))

  // Only Segments that carry an id can be referenced from the -penn.xml nodes.
  val idToSegment = (for (s <- slab.iterator[Segment]; id <- s.id.iterator) yield id -> s).toMap
  val idToPosRegion = MascUtil.getNodes(pennXml).map(node => {
    val segments = node.targets.map(idToSegment).sortBy(_.begin)
    // The region must cover every target segment: smallest begin to largest end.
    // Taking the end of the last segment by begin offset is wrong when segments nest or overlap.
    val begin = segments.head.begin
    val end = segments.map(_.end).max
    node.id -> MRegion(node.id, begin, end)
  }).toMap

  val partOfSpeechTags = for (annotation <- MascUtil.getAnnotations(pennXml)) yield {
    val region = idToPosRegion(annotation.ref)
    val tag = MascUtil.getPos(annotation)
    PartOfSpeech(region.start, region.end, tag, Some(region.id))
  }
  // TODO: should probably create Stem annotations too, available as the MASC "base" feature

  // FIXME: should not be necessary to sort, but Slab needs better implementation
  slab ++ partOfSpeechTags.sortBy(p => p.begin -> -p.end).iterator
}

/**
 * Adds EntityMention annotations using the MASC -ne.xml file.
 *
 * Assumes there will be exactly one Source annotation, providing the URL of the MASC .txt file.
 * Each edge of the -ne.xml file is read as linking an entity node id (`from`) to the id of a
 * PartOfSpeech annotation (`to`) that must already be present in the slab.
 *
 * @param slab The Slab containing the text, the source URL and PartOfSpeech annotations.
 * @return The Slab with added EntityMention annotations as read from the MASC -ne.xml file.
 */
def ne[I <: Source with PartOfSpeech](slab: Slab.StringSlab[I]) = {
  val List(source) = slab.iterator[Source].toList
  val neXml = XML.load(source.url.toString().replaceAll("[.]txt$", "-ne.xml"))

  // Only PartOfSpeech annotations that carry an id can be referenced from the -ne.xml edges.
  val idToPos = (for (p <- slab.iterator[PartOfSpeech]; id <- p.id.iterator) yield id -> p).toMap
  val neIdPosIdTuples = MascUtil.getEdges(neXml).map(e => (e.from -> e.to))
  // Build a strict map; mapValues would return a lazy view that is recomputed on every lookup.
  val neIdToPosIds = neIdPosIdTuples.groupBy(_._1).map { case (neId, tuples) =>
    neId -> tuples.map(_._2)
  }

  val entityMentions = for (annotation <- MascUtil.getAnnotations(neXml)) yield {
    val posTags = neIdToPosIds(annotation.ref).map(idToPos).sortBy(_.begin)
    // The mention must cover every linked tag: smallest begin to largest end.
    // Taking the end of the last tag by begin offset is wrong when tags nest or overlap.
    val begin = posTags.head.begin
    val end = posTags.map(_.end).max
    EntityMention(begin, end, annotation.label, Some(annotation.ref))
  }

  slab ++ entityMentions.iterator
}
}
10 changes: 8 additions & 2 deletions src/main/scala/chalk/slab/Slab.scala
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package chalk.slab

import scala.reflect.ClassTag
import java.net.URL

trait Slab[ContentType, BaseAnnotationType, +AnnotationTypes <: BaseAnnotationType] {

Expand Down Expand Up @@ -69,8 +70,12 @@ object Span {
// ===========
// Annotations
// ===========
// Every annotation below is a Span over [begin, end) offsets of the slab content.
// The optional `id` defaults to None, so the two-argument constructors keep working.

/** Where the content came from: the span it covers and the URL it was loaded from. */
case class Source(begin: Int, end: Int, url: URL) extends Span

/** A sentence span, optionally carrying an identifier. */
case class Sentence(begin: Int, end: Int, id: Option[String] = None) extends Span

/** A segment span, optionally carrying an identifier. */
case class Segment(begin: Int, end: Int, id: Option[String] = None) extends Span

/** A token span, optionally carrying an identifier. */
case class Token(begin: Int, end: Int, id: Option[String] = None) extends Span

/** A span labelled with a part-of-speech `tag`, optionally carrying an identifier. */
case class PartOfSpeech(begin: Int, end: Int, tag: String, id: Option[String] = None) extends Span

/** A span labelled with an `entityType`, optionally carrying an identifier. */
case class EntityMention(begin: Int, end: Int, entityType: String, id: Option[String] = None) extends Span


object Slab {
Expand All @@ -97,6 +102,7 @@ object Slab {
extends Slab[ContentType, BaseAnnotationType, AnnotationTypes] {

/**
 * Returns a new slab with the same content and the given annotations appended to the existing ones.
 *
 * The result type records the added annotation type alongside the existing annotation types.
 *
 * @param annotations The annotations to add.
 */
def ++[AnnotationType](annotations: Iterator[AnnotationType]): Slab[ContentType, BaseAnnotationType, AnnotationTypes with AnnotationType] =
// FIXME: this should keep the annotations sorted by offset
// (for now the new annotations are simply appended after the existing ones)
new HorribleInefficientSlab(this.content, this._annotations ++ annotations)

def iterator[A >: AnnotationTypes <: BaseAnnotationType: ClassTag]: Iterator[A] =
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
<?xml version="1.0" encoding="UTF-8"?><graph xmlns="http://www.xces.org/ns/GrAF/1.0/" xmlns:graf="http://www.xces.org/ns/GrAF/1.0/">
<graphHeader>
<labelsDecl>
<labelUsage label="Non-Committed Belief" occurs="2"/>
<labelUsage label="Not Applicable" occurs="2"/>
<labelUsage label="Committed Belief" occurs="3"/>
</labelsDecl>
<annotationSpaces>
<annotationSpace as.id="cb"/>
</annotationSpaces>
</graphHeader>
<region xml:id="cb-r0" anchors="62 67"/>
<node xml:id="cb-n0">
<link targets="cb-r0"/>
</node>
<a xml:id="cb-N65577" label="Committed Belief" ref="cb-n0" as="cb"/>
<region xml:id="cb-r1" anchors="87 95"/>
<node xml:id="cb-n1">
<link targets="cb-r1"/>
</node>
<a xml:id="cb-N65593" label="Not Applicable" ref="cb-n1" as="cb"/>
<region xml:id="cb-r2" anchors="104 108"/>
<node xml:id="cb-n2">
<link targets="cb-r2"/>
</node>
<a xml:id="cb-N65609" label="Not Applicable" ref="cb-n2" as="cb"/>
<region xml:id="cb-r3" anchors="164 172"/>
<node xml:id="cb-n3">
<link targets="cb-r3"/>
</node>
<a xml:id="cb-N65625" label="Committed Belief" ref="cb-n3" as="cb"/>
<region xml:id="cb-r4" anchors="198 202"/>
<node xml:id="cb-n4">
<link targets="cb-r4"/>
</node>
<a xml:id="cb-N65641" label="Committed Belief" ref="cb-n4" as="cb"/>
<region xml:id="cb-r5" anchors="208 226"/>
<node xml:id="cb-n5">
<link targets="cb-r5"/>
</node>
<a xml:id="cb-N65657" label="Non-Committed Belief" ref="cb-n5" as="cb"/>
<region xml:id="cb-r6" anchors="232 236"/>
<node xml:id="cb-n6">
<link targets="cb-r6"/>
</node>
<a xml:id="cb-N65673" label="Non-Committed Belief" ref="cb-n6" as="cb"/>
</graph>
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
<graph xmlns="http://www.xces.org/ns/GrAF/1.0/" xmlns:graf="http://www.xces.org/ns/GrAF/1.0/">
<graphHeader>
<labelsDecl>
<labelUsage label="Urging" occurs="1"/>
<labelUsage label="Saying" occurs="1"/>
<labelUsage label="Hurting" occurs="1"/>
<labelUsage label="Flowing" occurs="1"/>
<labelUsage label="Allowing" occurs="1"/>
</labelsDecl>
<annotationSpaces>
<annotationSpace as.id="event"/>
</annotationSpaces>
</graphHeader>
<region xml:id="event-r0" anchors="62 67"/>
<region xml:id="event-r1" anchors="87 95"/>
<region xml:id="event-r2" anchors="104 108"/>
<region xml:id="event-r3" anchors="198 202"/>
<region xml:id="event-r4" anchors="232 236"/>
<node xml:id="event-n0">
<link targets="event-r0"/>
</node>
<a xml:id="event-N65601" label="Urging" ref="event-n0" as="event">
<fs>
<f name="theme" value="United Nations"/>
<f name="agent" value="Tony Hall"/>
</fs>
</a>
<node xml:id="event-n1">
<link targets="event-r1"/>
</node>
<a xml:id="event-N65625" label="Allowing" ref="event-n1" as="event">
<fs>
<f name="theme" value="Flowing"/>
<f name="agent" value="United Nations"/>
</fs>
</a>
<node xml:id="event-n2">
<link targets="event-r2"/>
</node>
<a xml:id="event-N65649" label="Flowing" ref="event-n2" as="event">
<fs>
<f name="theme" value="food &amp; medicine"/>
<f name="goal" value="Iraq"/>
</fs>
</a>
<node xml:id="event-n3">
<link targets="event-r3"/>
</node>
<a xml:id="event-N65673" label="Saying" ref="event-n3" as="event">
<fs>
<f name="theme" value="Hurting"/>
<f name="agent" value="Hall"/>
</fs>
</a>
<node xml:id="event-n4">
<link targets="event-r4"/>
</node>
<a xml:id="event-N65697" label="Hurting" ref="event-n4" as="event">
<fs>
<f name="cause" value="UN economic sanctions"/>
<f name="location" value="there (Iraq)"/>
<f name="theme" value="millions of civilians"/>
</fs>
</a>
</graph>
Loading

0 comments on commit f073dd1

Please sign in to comment.