-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathtreetagger-wrapper.py
executable file
·26 lines (19 loc) · 1 KB
/
treetagger-wrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright © 2011 University of Zürich
# Author: Rico Sennrich <[email protected]>
# Simple wrapper that deals with the TreeTagger's way of denoting sentence boundaries.
# Requires TreeTagger to be installed and TREETAGGER_BIN and TREETAGGER_MODEL to be set properly.
import sys
from subprocess import Popen, PIPE

# Paths to the TreeTagger executable and to the parameter (model) file.
# Both must be set to non-empty strings before the script can run.
TREETAGGER_BIN = ""
TREETAGGER_MODEL = ""
# Example configuration:
#TREETAGGER_BIN = "/opt/tagger/treetagger/treetagger-3.2/bin/tree-tagger"
#TREETAGGER_MODEL = "/opt/tagger/treetagger/treetagger-3.2/lib/german-utf8.par"


def main():
    """Pipe stdin through TreeTagger, translating sentence boundaries.

    The pipeline has three stages:

    1. ``sed`` replaces every empty input line with ``</s>``.
    2. TreeTagger is run with ``-token -sgml -eos-tag '</s>'`` and the
       configured model, reading the output of stage 1.
    3. ``sed`` replaces every output line that is exactly ``</s>`` with an
       empty line; this stage writes to the script's own stdout.

    Returns:
        0 if every stage exited with status 0, otherwise 1. Also returns 1
        (after writing a message to stderr) when TREETAGGER_BIN or
        TREETAGGER_MODEL is unset.
    """
    if not TREETAGGER_BIN or not TREETAGGER_MODEL:
        sys.stderr.write('ERROR: set paths TREETAGGER_BIN and TREETAGGER_MODEL in ' + sys.argv[0] + '\n')
        # Signal the misconfiguration to the caller with a non-zero status.
        return 1

    convert_to_sgml = Popen(['sed', r's/^$/<\/s>/'], stdin=sys.stdin, stdout=PIPE)
    tag = Popen(
        [TREETAGGER_BIN, '-token', '-sgml', '-eos-tag', '</s>', TREETAGGER_MODEL],
        stdin=convert_to_sgml.stdout,
        stdout=PIPE,
    )
    # Drop the parent's copy of the pipe's read end so that the upstream
    # process receives SIGPIPE if the downstream one exits early.
    convert_to_sgml.stdout.close()

    convert_to_blank_line = Popen(['sed', r's/^<\/s>$//'], stdin=tag.stdout)
    tag.stdout.close()

    # Wait for (and reap) every stage, not just the last one, so that a
    # failure anywhere in the pipeline is reflected in the exit status.
    statuses = [
        convert_to_blank_line.wait(),
        tag.wait(),
        convert_to_sgml.wait(),
    ]
    return 1 if any(statuses) else 0


if __name__ == '__main__':
    sys.exit(main())