-
Notifications
You must be signed in to change notification settings - Fork 66
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Separate reader/writer for the moment, but doesn't mean it has to stay this way - the code is highly identical!
- Loading branch information
Showing
8 changed files
with
2,298 additions
and
0 deletions.
There are no files selected for viewing
732 changes: 732 additions & 0 deletions
732
dkpro-core-io-tei-asl/src/main/java/org/dkpro/core/io/tei/TeiP4Reader.java
Large diffs are not rendered by default.
Oops, something went wrong.
334 changes: 334 additions & 0 deletions
334
dkpro-core-io-tei-asl/src/main/java/org/dkpro/core/io/tei/TeiP4Writer.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,334 @@ | ||
/* | ||
* Copyright 2017 | ||
* Ubiquitous Knowledge Processing (UKP) Lab | ||
* Technische Universität Darmstadt | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.dkpro.core.io.tei; | ||
|
||
import static org.apache.commons.io.IOUtils.closeQuietly; | ||
import static org.dkpro.core.io.tei.internal.TeiP4Constants.ATTR_FUNCTION; | ||
import static org.dkpro.core.io.tei.internal.TeiP4Constants.ATTR_LEMMA; | ||
import static org.dkpro.core.io.tei.internal.TeiP4Constants.ATTR_TYPE; | ||
import static org.dkpro.core.io.tei.internal.TeiP4Constants.E_TEI_BODY; | ||
import static org.dkpro.core.io.tei.internal.TeiP4Constants.E_TEI_FILE_DESC; | ||
import static org.dkpro.core.io.tei.internal.TeiP4Constants.E_TEI_HEADER; | ||
import static org.dkpro.core.io.tei.internal.TeiP4Constants.E_TEI_TEI; | ||
import static org.dkpro.core.io.tei.internal.TeiP4Constants.E_TEI_TEXT; | ||
import static org.dkpro.core.io.tei.internal.TeiP4Constants.E_TEI_TITLE; | ||
import static org.dkpro.core.io.tei.internal.TeiP4Constants.E_TEI_TITLE_STMT; | ||
import static org.dkpro.core.io.tei.internal.TeiP4Constants.TAG_CHARACTER; | ||
import static org.dkpro.core.io.tei.internal.TeiP4Constants.TAG_PARAGRAPH; | ||
import static org.dkpro.core.io.tei.internal.TeiP4Constants.TAG_PHRASE; | ||
import static org.dkpro.core.io.tei.internal.TeiP4Constants.TAG_RS; | ||
import static org.dkpro.core.io.tei.internal.TeiP4Constants.TAG_SUNIT; | ||
import static org.dkpro.core.io.tei.internal.TeiP4Constants.TAG_WORD; | ||
|
||
import java.io.OutputStream; | ||
import java.util.ArrayList; | ||
import java.util.Iterator; | ||
import java.util.List; | ||
import java.util.Optional; | ||
import java.util.Stack; | ||
import java.util.regex.Pattern; | ||
|
||
import javax.xml.namespace.QName; | ||
import javax.xml.stream.XMLEventFactory; | ||
import javax.xml.stream.XMLEventWriter; | ||
import javax.xml.stream.XMLOutputFactory; | ||
import javax.xml.stream.XMLStreamException; | ||
import javax.xml.stream.events.Attribute; | ||
|
||
import org.apache.uima.analysis_engine.AnalysisEngineProcessException; | ||
import org.apache.uima.cas.FSIterator; | ||
import org.apache.uima.fit.descriptor.ConfigurationParameter; | ||
import org.apache.uima.fit.descriptor.MimeTypeCapability; | ||
import org.apache.uima.fit.descriptor.ResourceMetaData; | ||
import org.apache.uima.fit.descriptor.TypeCapability; | ||
import org.apache.uima.jcas.JCas; | ||
import org.apache.uima.jcas.tcas.Annotation; | ||
import org.dkpro.core.api.io.JCasFileWriter_ImplBase; | ||
import org.dkpro.core.api.parameter.ComponentParameters; | ||
import org.dkpro.core.api.parameter.MimeTypes; | ||
|
||
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; | ||
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; | ||
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; | ||
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; | ||
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; | ||
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; | ||
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT; | ||
import eu.openminted.share.annotations.api.DocumentationResource; | ||
import javanet.staxutils.IndentingXMLEventWriter; | ||
|
||
/** | ||
* UIMA CAS consumer writing the CAS document text in TEI format. | ||
*/ | ||
@ResourceMetaData(name = "TEI P4 XML Writer") | ||
@DocumentationResource("${docbase}/format-reference.html#format-${command}") | ||
@MimeTypeCapability({MimeTypes.APPLICATION_TEI_XML}) | ||
@TypeCapability( | ||
inputs = { | ||
"de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", | ||
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph", | ||
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", | ||
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", | ||
"de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", | ||
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", | ||
"de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent", | ||
"de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity"}) | ||
public class TeiP4Writer | ||
extends JCasFileWriter_ImplBase | ||
{ | ||
/** | ||
* Specify the suffix of output files. Default value <code>.xml</code>. If the suffix is not | ||
* needed, provide an empty string as value. | ||
*/ | ||
public static final String PARAM_FILENAME_EXTENSION = | ||
ComponentParameters.PARAM_FILENAME_EXTENSION; | ||
@ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".xml") | ||
private String filenameSuffix; | ||
|
||
/** | ||
* A token matching this pattern is rendered as a TEI "c" element instead of a "w" element. | ||
*/ | ||
public static final String PARAM_C_TEXT_PATTERN = "cTextPattern"; | ||
@ConfigurationParameter(name = PARAM_C_TEXT_PATTERN, mandatory = true, defaultValue = "[,.:;()]|(``)|('')|(--)") | ||
private Pattern cTextPattern; | ||
|
||
/** | ||
* Write constituent annotations to the CAS. Disabled by default because it requires type | ||
* priorities to be set up (Constituents must have a higher prio than Tokens). | ||
*/ | ||
public static final String PARAM_WRITE_CONSTITUENT = | ||
ComponentParameters.PARAM_WRITE_CONSTITUENT; | ||
@ConfigurationParameter(name = PARAM_WRITE_CONSTITUENT, mandatory = true, defaultValue = "false") | ||
private boolean writeConstituent; | ||
|
||
/** | ||
* Write named entity annotations to the CAS. Overlapping named entities are not supported. | ||
*/ | ||
public static final String PARAM_WRITE_NAMED_ENTITY = | ||
ComponentParameters.PARAM_WRITE_NAMED_ENTITY; | ||
@ConfigurationParameter(name = PARAM_WRITE_NAMED_ENTITY, mandatory = true, defaultValue = "true") | ||
private boolean writeNamedEntity; | ||
|
||
/** | ||
* Indent the XML. | ||
*/ | ||
public static final String PARAM_INDENT = "indent"; | ||
@ConfigurationParameter(name = PARAM_INDENT, mandatory = true, defaultValue = "false") | ||
private boolean indent; | ||
|
||
private final XMLEventFactory xmlef = XMLEventFactory.newInstance(); | ||
|
||
@Override | ||
public void process(JCas aJCas) | ||
throws AnalysisEngineProcessException | ||
{ | ||
String text = aJCas.getDocumentText(); | ||
|
||
OutputStream docOS = null; | ||
XMLEventWriter xmlEventWriter = null; | ||
try { | ||
docOS = getOutputStream(aJCas, filenameSuffix); | ||
|
||
XMLOutputFactory xmlOutputFactory = XMLOutputFactory.newInstance(); | ||
xmlOutputFactory.setProperty(XMLOutputFactory.IS_REPAIRING_NAMESPACES, true); | ||
|
||
xmlEventWriter = xmlOutputFactory.createXMLEventWriter(docOS, "UTF-8"); | ||
if (indent) { | ||
xmlEventWriter = new IndentingXMLEventWriter(xmlEventWriter); | ||
} | ||
|
||
xmlEventWriter.add(xmlef.createStartDocument()); | ||
xmlEventWriter.add(xmlef.createStartElement(E_TEI_TEI, null, null)); | ||
|
||
// Render header | ||
DocumentMetaData meta = DocumentMetaData.get(aJCas); | ||
xmlEventWriter.add(xmlef.createStartElement(E_TEI_HEADER, null, null)); | ||
xmlEventWriter.add(xmlef.createStartElement(E_TEI_FILE_DESC, null, null)); | ||
xmlEventWriter.add(xmlef.createStartElement(E_TEI_TITLE_STMT, null, null)); | ||
xmlEventWriter.add(xmlef.createStartElement(E_TEI_TITLE, null, null)); | ||
xmlEventWriter.add(xmlef.createCharacters(meta.getDocumentTitle())); | ||
xmlEventWriter.add(xmlef.createEndElement(E_TEI_TITLE, null)); | ||
xmlEventWriter.add(xmlef.createEndElement(E_TEI_TITLE_STMT, null)); | ||
xmlEventWriter.add(xmlef.createEndElement(E_TEI_FILE_DESC, null)); | ||
xmlEventWriter.add(xmlef.createEndElement(E_TEI_HEADER, null)); | ||
|
||
// Render text | ||
xmlEventWriter.add(xmlef.createStartElement(E_TEI_TEXT, null, null)); | ||
xmlEventWriter.add(xmlef.createStartElement(E_TEI_BODY, null, null)); | ||
|
||
FSIterator<Annotation> iterator = aJCas.getAnnotationIndex().iterator(); | ||
|
||
Stack<Annotation> stack = new Stack<Annotation>(); | ||
int pos = 0; | ||
Annotation cur = null; | ||
|
||
while (iterator.isValid()) { | ||
Annotation nextAnnot = iterator.get(); | ||
|
||
// Ignore unmapped elements | ||
Optional<String> teiElement = getTeiTag(nextAnnot); | ||
if (!teiElement.isPresent()) { | ||
iterator.moveToNext(); | ||
continue; | ||
} | ||
|
||
// Check if next annotation is potentially nested | ||
if (cur == null || nextAnnot.getBegin() < cur.getEnd()) { | ||
// Check if next annotation is fully nested | ||
if (cur == null || nextAnnot.getEnd() <= cur.getEnd()) { | ||
// Text between current and next annotation | ||
xmlEventWriter.add(xmlef.createCharacters(text.substring(pos, | ||
nextAnnot.getBegin()))); | ||
// Next annotation | ||
xmlEventWriter | ||
.add(xmlef.createStartElement(new QName(teiElement.get()), | ||
getAttributes(nextAnnot), null)); | ||
|
||
stack.push(cur); | ||
cur = nextAnnot; | ||
pos = nextAnnot.getBegin(); | ||
} | ||
else { | ||
// Overlapping annotations are ignored | ||
getLogger().debug("Unable to render overlapping annotation"); | ||
} | ||
iterator.moveToNext(); | ||
} | ||
// Next annotation is following, not nested | ||
else { | ||
// Text between current and next annotation | ||
xmlEventWriter.add(xmlef.createCharacters(text.substring(pos, cur.getEnd()))); | ||
xmlEventWriter | ||
.add(xmlef.createEndElement(new QName(teiElement.get()), null)); | ||
|
||
pos = cur.getEnd(); | ||
cur = stack.pop(); | ||
} | ||
} | ||
|
||
// End of text, end all elements that are still on the stack | ||
if (cur != null) { | ||
xmlEventWriter.add(xmlef.createCharacters(text.substring(pos, cur.getEnd()))); | ||
pos = cur.getEnd(); | ||
xmlEventWriter | ||
.add(xmlef.createEndElement(new QName(getTeiTag(cur).get()), null)); | ||
|
||
while (!stack.isEmpty()) { | ||
cur = stack.pop(); | ||
if (cur == null) { | ||
break; | ||
} | ||
xmlEventWriter.add(xmlef.createCharacters(text.substring(pos, cur.getEnd()))); | ||
pos = cur.getEnd(); | ||
xmlEventWriter.add( | ||
xmlef.createEndElement(new QName(getTeiTag(cur).get()), null)); | ||
} | ||
} | ||
|
||
if (pos < text.length()) { | ||
xmlEventWriter.add(xmlef.createCharacters(text.substring(pos, text.length()))); | ||
} | ||
|
||
xmlEventWriter.add(xmlef.createEndElement(E_TEI_BODY, null)); | ||
xmlEventWriter.add(xmlef.createEndElement(E_TEI_TEXT, null)); | ||
xmlEventWriter.add(xmlef.createEndElement(E_TEI_TEI, null)); | ||
xmlEventWriter.add(xmlef.createEndDocument()); | ||
} | ||
catch (Exception e) { | ||
throw new AnalysisEngineProcessException(e); | ||
} | ||
finally { | ||
if (xmlEventWriter != null) { | ||
try { | ||
xmlEventWriter.close(); | ||
} | ||
catch (XMLStreamException e) { | ||
getLogger().warn("Error closing the XML event writer", e); | ||
} | ||
} | ||
|
||
closeQuietly(docOS); | ||
} | ||
} | ||
|
||
private Iterator<Attribute> getAttributes(Annotation aAnnotation) { | ||
List<Attribute> attributes = new ArrayList<Attribute>(); | ||
if (aAnnotation instanceof Token) { | ||
Token t = (Token) aAnnotation; | ||
if (t.getPos() != null) { | ||
attributes.add(xmlef.createAttribute(ATTR_TYPE, t.getPos().getPosValue())); | ||
} | ||
if (t.getLemma() != null) { | ||
attributes.add(xmlef.createAttribute(ATTR_LEMMA, t.getLemma().getValue())); | ||
} | ||
} | ||
else if (aAnnotation instanceof NamedEntity) { | ||
NamedEntity ne = (NamedEntity) aAnnotation; | ||
attributes.add(xmlef.createAttribute(ATTR_TYPE, ne.getValue())); | ||
} | ||
else if (aAnnotation instanceof Constituent) { | ||
Constituent c = (Constituent) aAnnotation; | ||
if ("ROOT".equals(c.getConstituentType())) { | ||
System.out.println(); | ||
} | ||
if (c.getConstituentType() != null) { | ||
attributes.add(xmlef.createAttribute(ATTR_TYPE, c.getConstituentType())); | ||
} | ||
if (c.getSyntacticFunction() != null) { | ||
attributes.add(xmlef.createAttribute(ATTR_FUNCTION, c.getSyntacticFunction())); | ||
} | ||
} | ||
return attributes.iterator(); | ||
} | ||
|
||
private Optional<String> getTeiTag(Annotation aAnnotation) | ||
{ | ||
if (aAnnotation instanceof Constituent) { | ||
Constituent c = (Constituent) aAnnotation; | ||
if ("ROOT".equals(c.getConstituentType())) { | ||
System.out.println(); | ||
} | ||
} | ||
|
||
if (aAnnotation.getTypeIndexID() == Token.type) { | ||
if (cTextPattern.matcher(aAnnotation.getCoveredText()).matches()) { | ||
return Optional.of(TAG_CHARACTER); | ||
} | ||
return Optional.of(TAG_WORD); | ||
} | ||
else if (aAnnotation.getTypeIndexID() == Sentence.type) { | ||
return Optional.of(TAG_SUNIT); | ||
} | ||
else if (aAnnotation.getTypeIndexID() == Paragraph.type) { | ||
return Optional.of(TAG_PARAGRAPH); | ||
} | ||
else if (writeConstituent && (aAnnotation instanceof ROOT)) { | ||
// We do not render ROOT nodes | ||
return Optional.empty(); | ||
} | ||
else if (writeConstituent && (aAnnotation instanceof Constituent)) { | ||
return Optional.of(TAG_PHRASE); | ||
} | ||
else if (writeNamedEntity && (aAnnotation instanceof NamedEntity)) { | ||
return Optional.of(TAG_RS); | ||
} | ||
else { | ||
return Optional.empty(); | ||
} | ||
} | ||
} |
Oops, something went wrong.