Skip to content

Commit

Permalink
Add explicit PICA Patch parser and writer
Browse files Browse the repository at this point in the history
  • Loading branch information
nichtich committed Aug 28, 2023
1 parent 3ee299e commit 42a7df1
Show file tree
Hide file tree
Showing 12 changed files with 178 additions and 49 deletions.
2 changes: 2 additions & 0 deletions Changes
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
Revision history for PICA::Data

{{$NEXT}}
- Add explicit PICA Patch parser and writer
- Fix parsing plain $$ (#136)
- Avoid circular dependency

2.11 2023-08-18T05:19:26Z
- Add parser method: all
Expand Down
1 change: 1 addition & 0 deletions lib/App/picadata.pm
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ my %TYPES = (
pixml => 'PIXML',
json => 'JSON',
ndjson => 'JSON',
patch => 'Patch',
);

my %COLORS = (
Expand Down
13 changes: 13 additions & 0 deletions lib/PICA/Data.pm
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,7 @@ use PICA::Patch;
use PICA::Parser::XML;
use PICA::Parser::Plus;
use PICA::Parser::Plain;
use PICA::Parser::Patch;
use PICA::Parser::Import;
use PICA::Parser::Binary;
use PICA::Parser::PPXML;
Expand All @@ -404,6 +405,7 @@ use PICA::Parser::JSON;
use PICA::Writer::XML;
use PICA::Writer::Plus;
use PICA::Writer::Plain;
use PICA::Writer::Patch;
use PICA::Writer::Import;
use PICA::Writer::Binary;
use PICA::Writer::PPXML;
Expand Down Expand Up @@ -466,6 +468,9 @@ sub _pica_module {
elsif ($type =~ /^(pica)?plain$/) {
"${base}::Plain"->new(@_);
}
elsif ($type =~ /^patch$/) {
"${base}::Patch"->new(@_);
}
elsif ($type eq 'import') {
"${base}::Import"->new(@_);
}
Expand Down Expand Up @@ -676,6 +681,10 @@ L<PICA::Parser::PPXML> for type C<ppxml> (PicaPlus-XML)
L<PICA::Parser::PIXML> for type C<pixml> (PICA FOLIO Import XML)
=item
L<PICA::Parser::Patch> for type C<patch> (PICA Patch format)
=back
=head2 pica_guess( $data )
Expand Down Expand Up @@ -730,6 +739,10 @@ L<PICA::Writer::PPXML> for type C<ppxml> (PicaPlus-XML)
L<PICA::Writer::PIXML> for type C<pixml> (PICA FOLIO Import XML)
=item
L<PICA::Writer::Patch> for type C<patch> (PICA Patch format)
=back
=head2 pica_string( $record [, $type [, @options] ] )
Expand Down
5 changes: 4 additions & 1 deletion lib/PICA/Parser/Base.pm
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,8 @@ Use one of the following subclasses instead:
=item L<PICA::Parser::JSON>
=item L<PICA::Parser::Patch>
=back
=head2 CONFIGURATION
Expand All @@ -151,7 +153,8 @@ them fatal by setting the I<strict> parameter to 1.
=item annotate
By default some parsers also support annotated PICA. Set to true to enforce
field annotations or to false to forbid them.
field annotations or to false to forbid them. Default value (undefined or empty
string) allows annotations.
=back
Expand Down
43 changes: 43 additions & 0 deletions lib/PICA/Parser/Patch.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package PICA::Parser::Patch;
use v5.14.1;

our $VERSION = '2.11';

use parent 'PICA::Parser::Plain';

use Carp qw(croak);

sub _new {
my $self = PICA::Parser::Base::_new(@_);
$self->annotated = undef;
$self->strict = 1;
$self;
}

sub parse_field {
my ($self, $field) = @_;
$field = $self->SUPER::parse_field($field);

return [@$field, " "] unless @$field % 2;

my $char = $field->[$#$field];
croak "Invalid annotation: '$char'" if $char !~ /^[ +-]$/;

return $field;
}

1;
__END__
=head1 NAME
PICA::Parser::Plain - Plain PICA+ format serializer
=head2 DESCRIPTION
This is basically L<PICA::Parser::Plain> with option C<strict> enabled and required
or empty annotation character C<+>, C<-> or space for each field.
The counterpart of this module is L<PICA::Writer::Patch>.
=cut
89 changes: 48 additions & 41 deletions lib/PICA/Parser/Plain.pm
Original file line number Diff line number Diff line change
Expand Up @@ -33,51 +33,59 @@ sub _next_record {
my @record;

for my $field (@fields) {
my ($annotation, $tag, $occ, $data);

unless (defined $self->{annotate} && !$self->{annotate}) {
if ($field =~ s/^([^a-z0-9]) (.+)/\2/) {
$annotation = $1;
}
elsif ($self->{annotate}) {
croak "ERROR: expected field annotation at field \"$field\"";
}
}
$field = $self->parse_field($field);
push @record, $field if $field;
}

if ($field =~ m/^(\d{3}[A-Z@])(\/(\d{2,3}))?\s(.+)/) {
$tag = $1;
$occ = $3;
$data = $4;
}
return \@record;
}

if (!$self->{strict} && $data =~ //) {
$data =~ s/\$/\$\$/g;
$data =~ s/ƒ/\$/g;
}
sub parse_field {
my ($self, $field) = @_;

if ($data !~ /^(\$[^\$]([^\$]|\$\$)*)+$/) {
if ($self->{strict}) {
croak "ERROR: invalid PICA field structure \"$field\"";
}
else {
carp
"WARNING: invalid PICA field structure \"$field\". Skipped field";
next;
}
}
my ($annotation, $tag, $occ, $data);

my @subfields;
while ($data =~ m/\G\$([^\$])(([^\$]|\$\$)*)/g) {
my ($code, $value) = ($1, $2);
$value =~ s/\$\$/\$/g;
push @subfields, $code, $value;
unless (defined $self->{annotate} && !$self->{annotate}) {
if ($field =~ s/^([^a-z0-9]) (.+)/\2/) {
$annotation = $1;
}
elsif ($self->{annotate}) {
croak "ERROR: expected field annotation at field \"$field\"";
}
}

push @subfields, $annotation if defined $annotation;
if ($field =~ m/^(\d{3}[A-Z@])(\/(\d{2,3}))?\s(.+)/) {
$tag = $1;
$occ = $3;
$data = $4;
}

push @record, [$tag, $occ > 0 ? $occ : '', @subfields];
if (!$self->{strict} && $data =~ //) {
$data =~ s/\$/\$\$/g;
$data =~ s/ƒ/\$/g;
}
return \@record;

if ($data !~ /^(\$[^\$]([^\$]|\$\$)*)+$/) {
if ($self->{strict}) {
croak "ERROR: invalid PICA field structure \"$field\"";
}
else {
carp
"WARNING: invalid PICA field structure \"$field\". Skipped field";
return;
}
}

my @subfields;
while ($data =~ m/\G\$([^\$])(([^\$]|\$\$)*)/g) {
my ($code, $value) = ($1, $2);
$value =~ s/\$\$/\$/g;
push @subfields, $code, $value;
}

push @subfields, $annotation if defined $annotation;

return [$tag, $occ > 0 ? $occ : '', @subfields];
}

1;
Expand All @@ -91,13 +99,12 @@ PICA::Parser::Plain - Plain PICA format parser
=head1 DESCRIPTION
This parser can parse both PICA Plain and annotated PICA. Option C<annotation>
can be used to enforce or forbid annotations.
See L<PICA::Parser::Base> for synopsis and configuration.
This parser can parse both PICA Plain and annotated PICA. See L<PICA::Parser::Base> for synopsis and configuration.
In addition to the C<$> this parser also allows C<ƒ> as subfield indicator and it skips lines with WinIBW download messages, unless option C<strict> is enabled.
The counterpart of this module is L<PICA::Writer::Plain>.
This parser can parse PICA Patch format but L<PICA::Parser::Patch> should be used instead to ensure every field is annotated with C<+>, C<-> or space.
=cut
6 changes: 5 additions & 1 deletion lib/PICA/Writer/Base.pm
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use Scalar::Util qw(blessed openhandle reftype);
use PICA::Schema qw(clean_pica);
use Term::ANSIColor;
use Encode qw(decode);
use Carp qw(croak);
use Carp qw(croak);

sub new {
my $class = shift;
Expand Down Expand Up @@ -154,6 +154,8 @@ Use one of the following subclasses instead:
=item L<PICA::Writer::Generic>
=item L<PICA::Writer::Patch>
=back
=head1 METHODS
Expand Down Expand Up @@ -203,6 +205,8 @@ empty space if missing) or to false to ignore them.
=head1 SEE ALSO
See L<PICA::Parser::Base> for corresponding parser modules.
See L<Catmandu::Exporter::PICA> for usage of this module within the L<Catmandu>
framework (recommended).
Expand Down
38 changes: 38 additions & 0 deletions lib/PICA/Writer/Patch.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package PICA::Writer::Patch;
use v5.14.1;

our $VERSION = '2.11';

use Carp qw(croak);

use parent 'PICA::Writer::Plain';

sub write_field {
my ($self, $field) = @_;

if (@$field % 2) {
my $char = $field->[$#$field];
croak "Invalid annotation: '$char'" if $char !~ /^[ +-]$/;
}
else {
$field = [@$field, " "];
}
PICA::Writer::Base::write_field($self, $field);
}

1;
__END__
=head1 NAME
PICA::Writer::Plain - Plain PICA+ format serializer
=head2 DESCRIPTION
This is basically L<PICA::Writer::Plain> with option C<annotate> enabled. In
addition writing an annotated field with result in an error if the field is
annotated with another character but C<+>, C<-> or space.
The counterpart of this module is L<PICA::Parser::Patch>.
=cut
2 changes: 2 additions & 0 deletions lib/PICA/Writer/Plain.pm
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ PICA::Writer::Plain - Plain PICA+ format serializer
See L<PICA::Writer::Base> for synopsis and details.
This writer can be used to write PICA Patch format but L<PICA::Writer::Patch> should be used to ensure all fields are strictly annotated.
The counterpart of this module is L<PICA::Parser::Plain>.
=cut
2 changes: 1 addition & 1 deletion script/picadata
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ Build an Avram schema from input data, optionally based on an existing schema
=head2 --from, -f

PICA serialization type (plain, plus/normalized, binary, import, XML, ppxml,
pixml) with Plain as default. Guessed from first input filename unless
pixml, patch) with Plain as default. Guessed from first input filename unless
specified. See format documentation at L<http://format.gbv.de/pica>.

=head2 --to, -t
Expand Down
6 changes: 6 additions & 0 deletions t/20-parser.t
Original file line number Diff line number Diff line change
Expand Up @@ -161,4 +161,10 @@ my $annotated = "";
}
}

{
my $patch = pica_parser(patch => \"003@ \$01\n+ 021A \$ax")->next;
is_deeply $patch->{record}, [['003@','','0','1',' '],['021A','','a','x','+']], 'patch parser';
throws_ok { pica_parser(patch => \"? 003@ \$01")->next } qr/Invalid annotation/;
}

done_testing;
20 changes: 15 additions & 5 deletions t/30-writer.t
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use warnings;
use utf8;

use Test::More;
use Test::Exception;
use Test::XML;

use PICA::Data qw(pica_writer pica_parser pica_string);
Expand All @@ -13,6 +14,7 @@ use PICA::Writer::XML;
use PICA::Writer::PPXML;
use PICA::Parser::PPXML;
use PICA::Writer::JSON;
use PICA::Writer::Patch;
use PICA::Writer::Generic;
use PICA::Schema;

Expand Down Expand Up @@ -213,6 +215,17 @@ note 'PICA::Writer::JSON';
like $out, qr/^\[\n\s+\[/m, 'JSON (pretty)';
}

note 'PICA::Writer::Patch';
{
my $out = "";
my $writer = PICA::Writer::Patch->new(fh => \$out);
my $record = [['003@','','0','1'],['021A','','a','x','+']];
$writer->write($record);
$writer->end;
is $out, " 003@ \$01\n+ 021A \$ax\n\n", "patch format";
throws_ok { $writer->write([['021A','','a','x','?']]) } qr{Invalid annotation};
}

note 'PICA::Data';

{
Expand Down Expand Up @@ -240,11 +253,8 @@ PLAIN
note 'Exeptions';

{
eval {pica_writer('plain', fh => '')};
ok $@, 'invalid filename';

eval {pica_writer('plain', fh => {})};
ok $@, 'invalid handle';
dies_ok {pica_writer('plain', fh => '')} 'invalid filename';
dies_ok {pica_writer('plain', fh => {})} 'invalid handle';
}

note 'undefined occurrence';
Expand Down

0 comments on commit 42a7df1

Please sign in to comment.