-
Notifications
You must be signed in to change notification settings - Fork 3
/
tokenize.pl
executable file
·48 lines (45 loc) · 1.18 KB
/
tokenize.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/usr/bin/perl
# tokenize.pl - split UTF-8 text into lowercase tokens, one token per line.
#
# Usage: tokenize.pl [FILE ...]
# Reads the named files, or STDIN when no file is given, and writes the
# tokens to STDOUT.
use strict;
use warnings;
use utf8;    # the patterns below contain literal non-ASCII characters

# Decode STDIN and every file opened through <>, and encode STDOUT.
# binmode on STDIN alone would leave files named on the command line as
# raw bytes, and the non-ASCII characters in the patterns would then never
# match.
use open qw(:std :utf8);

# Single characters that separate tokens. The separator is captured, so
# split() returns it as a token of its own. Inside a bracketed class "+"
# and "|" are ordinary characters, so both of them are separators as well.
my $DELIMITER = qr{([\s+|,.:;\[\]()?~“”"¡–¿!/%'=…—一*•])};

# Tokens that are dropped: the empty string, whitespace, or a run of one
# repeated punctuation mark from the list below. Separators that are not in
# this list (for example "?", "%" and "'") are kept and printed as tokens.
my $NOISE = qr{
    \A
    (?: \s*
      | ([.,:;\-()/“”\[\]¡!→=…—一*•]) \1*
    )
    \z
}x;

# tokenize_line($line)
#   $line   - one line of decoded text (a trailing newline is fine).
#   Returns - the list of tokens found in $line, in order of appearance.
#
# Each returned token is lowercased, has the typographic apostrophe and the
# backtick replaced by an ASCII apostrophe, and has its hyphens deleted.
sub tokenize_line {
    my ($line) = @_;

    my @tokens;
    for my $piece (split $DELIMITER, $line) {
        next if $piece =~ $NOISE;

        # Any token containing a digit is dropped, not only pure numbers.
        next if $piece =~ /\d/;

        my $token = lc $piece;
        $token =~ tr/’`/'/;    # unify apostrophe variants
        $token =~ tr/-//d;     # delete hyphens: "well-known" -> "wellknown"
        push @tokens, $token;
    }

    return @tokens;
}

# main()
#   Tokenizes every input line and prints one token per line.
sub main {
    while (my $line = <>) {
        print "$_\n" for tokenize_line($line);
    }
    return;
}

# Run only when executed as a script, so the file can also be loaded with
# require in order to call tokenize_line() directly.
main() unless caller;