diff --git a/lib/PICA/Parser/Import.pm b/lib/PICA/Parser/Import.pm new file mode 100644 index 0000000..7749f97 --- /dev/null +++ b/lib/PICA/Parser/Import.pm @@ -0,0 +1,81 @@ +package PICA::Parser::Import; +use v5.14.1; +use utf8; + +our $VERSION = '2.09'; + +use charnames ':full'; +use Carp qw(carp croak); + +use parent 'PICA::Parser::Base'; + +sub _next_record { + my ($self) = @_; + + my $reader = $self->{reader}; + my $line; + + # All data before the first record separator is ignored + if (!$self->count) { + do { + $line = readline($reader); + return unless defined $line; + } while ($line =~ /^\x1D$/); + } + + my @record; + while (1) { + $line = readline($reader); + if (!defined $line) { + @record ? last : return; + } + + next if $line =~ /^#|^\s*$/; # ignore empty or comment lines + last if $line =~ /^\x1D$/; + chomp $line; + + if ($line + =~ m/^\x1E([012][0-9][0-9][A-Z@])(\/(\d{2,3}))?\s((\x1F[^\x1F]+)+)$/ + ) + { + my $tag = $1; + my $occ = $3 > 0 ? $3 : ''; + my @sf = split /\x1F/, $4; + shift @sf; + push @record, [$tag, $occ, map {split //, $_, 2} @sf]; + } + elsif ($self->{strict}) { + croak "ERROR: invalid PICA field structure \"$line\""; + } + else { + carp + "WARNING: invalid PICA field structure \"$line\". Skipped field"; + next; + } + + } + + return \@record; +} + +1; +__END__ + +=encoding UTF-8 + +=head1 NAME + +PICA::Parser::Import - PICA Import format parser + +=head1 DESCRIPTION + +Parses PICA+ records in PICA Import format (also known as "normalized title +format", see L). + +Fields or subfields spread over several lines are not supported! + +See L for synopsis and configuration. + +The counterpart of this module is L. + +=cut diff --git a/lib/PICA/Parser/Plain.pm b/lib/PICA/Parser/Plain.pm index 64e840c..9219f34 100644 --- a/lib/PICA/Parser/Plain.pm +++ b/lib/PICA/Parser/Plain.pm @@ -49,15 +49,13 @@ sub _next_record { $occ = $3; $data = $4; } + elsif ($self->{strict}) { + croak "ERROR: invalid PICA field structure \"$field\""; + } else { - if ($self->{strict}) { - croak " ERROR : no valid PICA field structure \"$field\""; - } - else { - carp - "WARNING: no valid PICA field structure \"$field\". Skipped field"; - next - } + carp + "WARNING: invalid PICA field structure \"$field\". Skipped field"; + next; } if (!$self->{strict} && $data =~ /^ƒ/) { @@ -83,7 +81,7 @@ sub _next_record { } else { $tokens[-1] .= $code; - $code = substr $value, 0, 1; + $code = substr $value, 0, 1; $value = substr $value, 1; } } diff --git a/lib/PICA/Parser/Plus.pm b/lib/PICA/Parser/Plus.pm index 3ff8cd9..ce75f9e 100644 --- a/lib/PICA/Parser/Plus.pm +++ b/lib/PICA/Parser/Plus.pm @@ -4,7 +4,7 @@ use v5.14.1; our $VERSION = '2.09'; use charnames qw(:full); -use Carp qw(carp croak); +use Carp qw(carp croak); use parent 'PICA::Parser::Base'; @@ -38,11 +38,11 @@ sub _next_record { } else { if ($self->{strict}) { - croak "ERROR: no valid PICA field structure \"$field\""; + croak "ERROR: invalid PICA field structure \"$field\""; } else { carp - "WARNING: no valid PICA field structure \"$field\". Skipped field"; + "WARNING: invalid PICA field structure \"$field\". Skipped field"; next; } } diff --git a/lib/PICA/Writer/Import.pm b/lib/PICA/Writer/Import.pm index c71d0dc..ffe5546 100644 --- a/lib/PICA/Writer/Import.pm +++ b/lib/PICA/Writer/Import.pm @@ -48,9 +48,9 @@ PICA::Writer::Import - PICA Import format serializer =head2 DESCRIPTION -Serializes PICA+ records in PICA Import format (based on CBS normalized file -format, see L). +Serializes PICA+ records in PICA Import format (also known as "normalized title +format", see L). -Parser of PICA Import format has not been implemented yet. +See L for corresponding parser. =cut diff --git a/t/20-parser.t b/t/20-parser.t index b42b91f..a07e65f 100644 --- a/t/20-parser.t +++ b/t/20-parser.t @@ -89,18 +89,18 @@ note 'error handling'; my $plus = "X01A \x{1F}01\x{1E}001A/0 \x{1F}01\x{1E}001A/AB \x{1F}01\x{1E}"; warnings_exist {PICA::Parser::Plus->new(\$plus)->next}[ - qr{no valid PICA field structure}, - qr{no valid PICA field structure}, - qr{no valid PICA field structure} + qr{invalid PICA field structure}, + qr{invalid PICA field structure}, + qr{invalid PICA field structure} ], 'skip faulty fields with warnings'; dies_ok {PICA::Parser::Plus->new(\$plus, strict => 1)->next} 'die on faulty fields with option strict'; my $plain = "X01@ \$01\n\n001@/0 \$01\n\n001@/AB \$01"; warnings_exist {PICA::Parser::Plain->new(\$plain)->next}[ - qr{no valid PICA field structure}, - qr{no valid PICA field structure}, - qr{no valid PICA field structure} + qr{invalid PICA field structure}, + qr{invalid PICA field structure}, + qr{invalid PICA field structure} ], 'skip faulty fields with warnings'; dies_ok {PICA::Parser::Plain->new(\$plain, strict => 1)->next} diff --git a/t/files/pica.import b/t/files/pica.import new file mode 100644 index 0000000..f52282b --- /dev/null +++ b/t/files/pica.import @@ -0,0 +1,26 @@ +All data before the first record separator (ASCII 29) is ignored. + + +# comment + +002@ 0Aan +003@ 012345 +010@ achi +011@ a2004n2004.01 +012X 00xy +019@ aXB-CN +021A a我国民事立法的回顾与展望fHistory and perspective of China civil lawh柳经纬主编; +101@ a120cPICAdOldenburg, IBIT Universität Oldenburg <715> +101B 015-01-09t15:32:38.000 +101D 015-01-09b9330a0715 +101U 0utf8 +145Z/40 a$btest$c... +201B/01 015-01-09t15:32:38.000 +201D/01 015-01-09b9330a0715 +201U/01 0utf8 +203@/01 0917400194xy +208@/01 a15-01-09bz +209A/01 f108a401-06356/09dgx00 + +## Another comment +003@ 067890