-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparseXML_Twig_test1.pl
executable file
·102 lines (67 loc) · 2.26 KB
/
parseXML_Twig_test1.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/perl
# use module
use strict;
use XML::Twig;
use utf8;
use open ":encoding(utf8)";
binmode(STDOUT, ":utf8");
# create object
#$xml = new XML::Simple;
# read XML file
#my $file ="CARDS5080.xml";
# Command-line arguments: the XML file to parse and the directory that
# receives the per-category text files.
my $file = $ARGV[0];
my $outdir = $ARGV[1];

# Without both arguments the output paths below would be built from an
# undefined directory (e.g. "/ps_text_body.txt"), so stop early instead.
die "usage: $0 <xml-file> <output-dir>\n"
    unless defined $file && defined $outdir;

# Open one output file per sentence category, in append mode so repeated
# runs accumulate text. The three-argument form keeps the mode separate
# from the path, and $! reports why an open failed. The handles stay
# package-level barewords because do_sentence() and the closing statements
# at the end of the script refer to them by name.
open(OUT_PER, '>>', "$outdir/ps_text_peroration.txt")
    or die "cant open $outdir/ps_text_peroration.txt: $!";
open(OUT_HAR, '>>', "$outdir/ps_text_harangue.txt")
    or die "cant open $outdir/ps_text_harangue.txt: $!";
open(OUT_NON, '>>', "$outdir/ps_text_nonnar.txt")
    or die "cant open $outdir/ps_text_nonnar.txt: $!";
open(OUT_BODY, '>>', "$outdir/ps_text_body.txt")
    or die "cant open $outdir/ps_text_body.txt: $!";
open(OUT_OPEN, '>>', "$outdir/ps_text_opener.txt")
    or die "cant open $outdir/ps_text_opener.txt: $!";
open(OUT_CLO, '>>', "$outdir/ps_text_closer.txt")
    or die "cant open $outdir/ps_text_closer.txt: $!";

# Parse the input; each <s> element is written out by do_sentence().
get_XML_content($file);
# Parse one XML file and hand every <s> element to do_sentence().
#
# Parameters:
#   $_[0] - path of the XML file to parse.
#
# Returns whatever XML::Twig's parsefile() returns (it is the last
# statement evaluated).
sub get_XML_content {
    my ($file) = @_;

    # Only take the 's' sentence elements that are inside <body>:
    # twig_roots restricts processing to <body>, and the handler is
    # registered for 's'.
    my $twig = XML::Twig->new(
        twig_roots    => { 'body' => 1 },
        twig_handlers => { 's'    => \&do_sentence },
    );

    $twig->parsefile($file);
}
# Twig handler for <s> elements: append the element's text to the output
# file that matches each of its attribute values.
#
# Parameters:
#   $twig - the twig object passed to every handler (unused here).
#   $elem - the <s> element; must provide atts() (hash ref of attributes)
#           and text().
#
# The text is written once per attribute. A value of "peroration",
# "opener", "closer", "harangue" or "non-narration" selects the matching
# handle; any other value goes to OUT_BODY.
# NOTE(review): an element without attributes writes nothing, and an
# element with several attributes is written several times -- confirm
# that both are intended.
sub do_sentence {
    my ($twig, $elem) = @_;    # handler params

    # Attribute value => output handle. The handles are the package-level
    # barewords opened by the main script.
    my %handle_for = (
        'peroration'    => \*OUT_PER,
        'opener'        => \*OUT_OPEN,
        'closer'        => \*OUT_CLO,
        'harangue'      => \*OUT_HAR,
        'non-narration' => \*OUT_NON,
    );

    # The text does not depend on the attribute, so fetch it once.
    my $string = $elem->text;

    # "|| {}" guards against atts() yielding no hash at all.
    foreach my $attribute_value (values %{ $elem->atts || {} }) {
        my $out = $handle_for{$attribute_value} || \*OUT_BODY;
        print {$out} "$string\n";
    }
}
# Close every output file opened at the top of the script, including the
# opener and closer files. Buffered write errors only surface at close
# time, so each result is checked.
foreach my $output (
    [ \*OUT_HAR,  'ps_text_harangue.txt' ],
    [ \*OUT_NON,  'ps_text_nonnar.txt' ],
    [ \*OUT_PER,  'ps_text_peroration.txt' ],
    [ \*OUT_BODY, 'ps_text_body.txt' ],
    [ \*OUT_OPEN, 'ps_text_opener.txt' ],
    [ \*OUT_CLO,  'ps_text_closer.txt' ],
) {
    my ($handle, $name) = @{$output};
    close($handle) or die "cant close $name: $!";
}