Skip to content

Commit

Permalink
Add PICA Import format parser (#129)
Browse files Browse the repository at this point in the history
  • Loading branch information
nichtich committed Aug 9, 2023
1 parent 114e44a commit 2e23d57
Show file tree
Hide file tree
Showing 8 changed files with 143 additions and 38 deletions.
1 change: 1 addition & 0 deletions Changes
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
Revision history for PICA::Data

{{$NEXT}}
- Add PICA Import format parser (#129)
- Add parser counter (method: count)

2.09 2023-04-14T07:03:05Z
Expand Down
1 change: 1 addition & 0 deletions lib/PICA/Data.pm
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,7 @@ use PICA::Patch;
use PICA::Parser::XML;
use PICA::Parser::Plus;
use PICA::Parser::Plain;
use PICA::Parser::Import;
use PICA::Parser::Binary;
use PICA::Parser::PPXML;
use PICA::Parser::PIXML;
Expand Down
81 changes: 81 additions & 0 deletions lib/PICA/Parser/Import.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
package PICA::Parser::Import;
use v5.14.1;
use utf8;

our $VERSION = '2.09';

use charnames ':full';
use Carp qw(carp croak);

use parent 'PICA::Parser::Base';

sub _next_record {
my ($self) = @_;

my $reader = $self->{reader};
my $line;

# All data before the first record separator is ignored
if (!$self->count) {
do {
$line = readline($reader);
return unless defined $line;
} while ($line =~ /^\x1D$/);
}

my @record;
while (1) {
$line = readline($reader);
if (!defined $line) {
@record ? last : return;
}

next if $line =~ /^#|^\s*$/; # ignore empty or comment lines
last if $line =~ /^\x1D$/;
chomp $line;

if ($line
=~ m/^\x1E([012][0-9][0-9][A-Z@])(\/(\d{2,3}))?\s((\x1F[^\x1F]+)+)$/
)
{
my $tag = $1;
my $occ = $3 > 0 ? $3 : '';
my @sf = split /\x1F/, $4;
shift @sf;
push @record, [$tag, $occ, map {split //, $_, 2} @sf];
}
elsif ($self->{strict}) {
croak "ERROR: invalid PICA field structure \"$line\"";
}
else {
carp
"WARNING: invalid PICA field structure \"$line\". Skipped field";
next;
}

}

return \@record;
}

1;
__END__
=encoding UTF-8
=head1 NAME
PICA::Parser::Import - PICA Import format parser
=head1 DESCRIPTION
Parses PICA+ records in PICA Import format (also known as "normalized title
format", see L<https://format.gbv.de/pica/import>).
Fields or subfields spread over several lines are not supported!
See L<PICA::Parser::Base> for synopsis and configuration.
The counterpart of this module is L<PICA::Writer::Import>.
=cut
16 changes: 7 additions & 9 deletions lib/PICA/Parser/Plain.pm
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,13 @@ sub _next_record {
$occ = $3;
$data = $4;
}
elsif ($self->{strict}) {
croak "ERROR: invalid PICA field structure \"$field\"";
}
else {
if ($self->{strict}) {
croak " ERROR : no valid PICA field structure \"$field\"";
}
else {
carp
"WARNING: no valid PICA field structure \"$field\". Skipped field";
next
}
carp
"WARNING: invalid PICA field structure \"$field\". Skipped field";
next;
}

if (!$self->{strict} && $data =~ //) {
Expand All @@ -83,7 +81,7 @@ sub _next_record {
}
else {
$tokens[-1] .= $code;
$code = substr $value, 0, 1;
$code = substr $value, 0, 1;
$value = substr $value, 1;
}
}
Expand Down
28 changes: 12 additions & 16 deletions lib/PICA/Parser/Plus.pm
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use v5.14.1;
our $VERSION = '2.09';

use charnames qw(:full);
use Carp qw(carp croak);
use Carp qw(carp croak);

use parent 'PICA::Parser::Base';

Expand Down Expand Up @@ -32,24 +32,20 @@ sub _next_record {
foreach my $field (@fields) {
my ($tag, $occ, $data);
if ($field =~ m/^(\d{3}[A-Z@])(\/(\d{2,3}))?\s(.+)/) {
$tag = $1;
$occ = $3;
$data = $4;
my $tag = $1;
my $occ = $3;
my @subfields = map {split //, $_, 2}
split($self->SUBFIELD_INDICATOR, substr($4, 1));
push @record, [$tag, $occ > 0 ? $occ : '', @subfields];
}
elsif ($self->{strict}) {
croak "ERROR: invalid PICA field structure \"$field\"";
}
else {
if ($self->{strict}) {
croak "ERROR: no valid PICA field structure \"$field\"";
}
else {
carp
"WARNING: no valid PICA field structure \"$field\". Skipped field";
next;
}
carp
"WARNING: invalid PICA field structure \"$field\". Skipped field";
next;
}

my @subfields = map {substr($_, 0, 1), substr($_, 1)}
split($self->SUBFIELD_INDICATOR, substr($data, 1));
push @record, [$tag, $occ > 0 ? $occ : '', @subfields];
}

return \@record;
Expand Down
6 changes: 3 additions & 3 deletions lib/PICA/Writer/Import.pm
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ PICA::Writer::Import - PICA Import format serializer
=head2 DESCRIPTION
Serializes PICA+ records in PICA Import format (based on CBS normalized file
format, see L<https://format.gbv.de/pica/import>).
Serializes PICA+ records in PICA Import format (also known as "normalized title
format", see L<https://format.gbv.de/pica/import>).
Parser of PICA Import format has not been implemented yet.
See L<PICA::Parser::Import> for corresponding parser.
=cut
22 changes: 12 additions & 10 deletions t/20-parser.t
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ is $first->{record}->[6]->[7], '柳经纬主编;', 'Unicode';
is_deeply $first->{record}->[11],
['145Z', '40', 'a', '$', 'b', 'test$', 'c', '...'], 'sub field with $';

foreach my $type (qw(Plain Plus JSON Binary XML PPXML PIXML)) {
foreach my $type (qw(Plain Plus JSON Binary XML PPXML PIXML Import)) {
my $module = "PICA::Parser::$type";
my $file = 't/files/pica.' . lc($type);

Expand All @@ -26,12 +26,14 @@ foreach my $type (qw(Plain Plus JSON Binary XML PPXML PIXML)) {
is ref($parser), "PICA::Parser::$type", "parser from file";

my $record = $parser->next;
is_deeply $record, $first;
is $parser->count, 1;
is_deeply $record, $first, 'first record';
is $parser->count, 1, 'count';

#note explain $record if $type eq 'Import';

ok $parser->next()->{_id} eq '67890', 'next record';
ok !$parser->next, 'parsed all records';
is $parser->count, 2;
is $parser->count, 2, 'count';

foreach my $mode ('<', '<:utf8') {
next
Expand Down Expand Up @@ -89,18 +91,18 @@ note 'error handling';
my $plus
= "X01A \x{1F}01\x{1E}001A/0 \x{1F}01\x{1E}001A/AB \x{1F}01\x{1E}";
warnings_exist {PICA::Parser::Plus->new(\$plus)->next}[
qr{no valid PICA field structure},
qr{no valid PICA field structure},
qr{no valid PICA field structure}
qr{invalid PICA field structure},
qr{invalid PICA field structure},
qr{invalid PICA field structure}
],
'skip faulty fields with warnings';
dies_ok {PICA::Parser::Plus->new(\$plus, strict => 1)->next}
'die on faulty fields with option strict';
my $plain = "X01@ \$01\n\n001@/0 \$01\n\n001@/AB \$01";
warnings_exist {PICA::Parser::Plain->new(\$plain)->next}[
qr{no valid PICA field structure},
qr{no valid PICA field structure},
qr{no valid PICA field structure}
qr{invalid PICA field structure},
qr{invalid PICA field structure},
qr{invalid PICA field structure}
],
'skip faulty fields with warnings';
dies_ok {PICA::Parser::Plain->new(\$plain, strict => 1)->next}
Expand Down
26 changes: 26 additions & 0 deletions t/files/pica.import
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
All data before the first record separator (ASCII 29) is ignored.


# comment

002@ 0Aan
003@ 012345
010@ achi
011@ a2004n2004.01
012X 00xy
019@ aXB-CN
021A a我国民事立法的回顾与展望fHistory and perspective of China civil lawh柳经纬主编;
101@ a120cPICAdOldenburg, IBIT Universität Oldenburg <715>
101B 015-01-09t15:32:38.000
101D 015-01-09b9330a0715
101U 0utf8
145Z/40 a$btest$c...
201B/01 015-01-09t15:32:38.000
201D/01 015-01-09b9330a0715
201U/01 0utf8
203@/01 0917400194xy
208@/01 a15-01-09bz
209A/01 f108a401-06356/09dgx00

## Another comment
003@ 067890

0 comments on commit 2e23d57

Please sign in to comment.