Add PICA Import format parser (#129)

gbv · Aug 9, 2023 · 2e23d57 · 2e23d57
1 parent 114e44a
commit 2e23d57
Show file tree

Hide file tree

Showing 8 changed files with 143 additions and 38 deletions.
diff --git a/Changes b/Changes
@@ -1,6 +1,7 @@
 Revision history for PICA::Data
 
 {{$NEXT}}
+    - Add PICA Import format parser (#129)
     - Add parser counter (method: count)
 
 2.09 2023-04-14T07:03:05Z

diff --git a/lib/PICA/Data.pm b/lib/PICA/Data.pm
@@ -396,6 +396,7 @@ use PICA::Patch;
 use PICA::Parser::XML;
 use PICA::Parser::Plus;
 use PICA::Parser::Plain;
+use PICA::Parser::Import;
 use PICA::Parser::Binary;
 use PICA::Parser::PPXML;
 use PICA::Parser::PIXML;

diff --git a/lib/PICA/Parser/Import.pm b/lib/PICA/Parser/Import.pm
@@ -0,0 +1,81 @@
+package PICA::Parser::Import;
+use v5.14.1;
+use utf8;
+
+our $VERSION = '2.09';
+
+use charnames ':full';
+use Carp qw(carp croak);
+
+use parent 'PICA::Parser::Base';
+
+sub _next_record {
+    my ($self) = @_;
+
+    my $reader = $self->{reader};
+    my $line;
+
+    # All data before the first record separator is ignored
+    if (!$self->count) {
+        do {
+            $line = readline($reader);
+            return unless defined $line;
+        } while ($line =~ /^\x1D$/);
+    }
+
+    my @record;
+    while (1) {
+        $line = readline($reader);
+        if (!defined $line) {
+            @record ? last : return;
+        }
+
+        next if $line =~ /^#|^\s*$/;    # ignore empty or comment lines
+        last if $line =~ /^\x1D$/;
+        chomp $line;
+
+        if ($line
+            =~ m/^\x1E([012][0-9][0-9][A-Z@])(\/(\d{2,3}))?\s((\x1F[^\x1F]+)+)$/
+            )
+        {
+            my $tag = $1;
+            my $occ = $3 > 0 ? $3 : '';
+            my @sf  = split /\x1F/, $4;
+            shift @sf;
+            push @record, [$tag, $occ, map {split //, $_, 2} @sf];
+        }
+        elsif ($self->{strict}) {
+            croak "ERROR: invalid PICA field structure \"$line\"";
+        }
+        else {
+            carp
+                "WARNING: invalid PICA field structure \"$line\". Skipped field";
+            next;
+        }
+
+    }
+
+    return \@record;
+}
+
+1;
+__END__
+
+=encoding UTF-8
+
+=head1 NAME
+
+PICA::Parser::Import - PICA Import format parser
+
+=head1 DESCRIPTION
+
+Parses PICA+ records in PICA Import format (also known as "normalized title
+format", see L<https://format.gbv.de/pica/import>).
+
+Fields or subfields spread over several lines are not supported!
+
+See L<PICA::Parser::Base> for synopsis and configuration.
+
+The counterpart of this module is L<PICA::Writer::Import>.
+
+=cut
diff --git a/lib/PICA/Parser/Plain.pm b/lib/PICA/Parser/Plain.pm
@@ -49,15 +49,13 @@ sub _next_record {
             $occ  = $3;
             $data = $4;
         }
+        elsif ($self->{strict}) {
+            croak "ERROR: invalid PICA field structure \"$field\"";
+        }
         else {
-            if ($self->{strict}) {
-                croak " ERROR : no valid PICA field structure \"$field\"";
-            }
-            else {
-                carp
-                    "WARNING: no valid PICA field structure \"$field\". Skipped field";
-                next
-            }
+            carp
+                "WARNING: invalid PICA field structure \"$field\". Skipped field";
+            next;
         }
 
         if (!$self->{strict} && $data =~ /^ƒ/) {
@@ -83,7 +81,7 @@ sub _next_record {
                     }
                     else {
                         $tokens[-1] .= $code;
-                        $code = substr $value, 0, 1;
+                        $code  = substr $value, 0, 1;
                         $value = substr $value, 1;
                     }
                 }

diff --git a/lib/PICA/Parser/Plus.pm b/lib/PICA/Parser/Plus.pm
@@ -4,7 +4,7 @@ use v5.14.1;
 our $VERSION = '2.09';
 
 use charnames qw(:full);
-use Carp qw(carp croak);
+use Carp      qw(carp croak);
 
 use parent 'PICA::Parser::Base';
 
@@ -32,24 +32,20 @@ sub _next_record {
     foreach my $field (@fields) {
         my ($tag, $occ, $data);
         if ($field =~ m/^(\d{3}[A-Z@])(\/(\d{2,3}))?\s(.+)/) {
-            $tag  = $1;
-            $occ  = $3;
-            $data = $4;
+            my $tag       = $1;
+            my $occ       = $3;
+            my @subfields = map {split //, $_, 2}
+                split($self->SUBFIELD_INDICATOR, substr($4, 1));
+            push @record, [$tag, $occ > 0 ? $occ : '', @subfields];
+        }
+        elsif ($self->{strict}) {
+            croak "ERROR: invalid PICA field structure \"$field\"";
         }
         else {
-            if ($self->{strict}) {
-                croak "ERROR: no valid PICA field structure \"$field\"";
-            }
-            else {
-                carp
-                    "WARNING: no valid PICA field structure \"$field\". Skipped field";
-                next;
-            }
+            carp
+                "WARNING: invalid PICA field structure \"$field\". Skipped field";
+            next;
         }
-
-        my @subfields = map {substr($_, 0, 1), substr($_, 1)}
-            split($self->SUBFIELD_INDICATOR, substr($data, 1));
-        push @record, [$tag, $occ > 0 ? $occ : '', @subfields];
     }
 
     return \@record;

diff --git a/lib/PICA/Writer/Import.pm b/lib/PICA/Writer/Import.pm
@@ -48,9 +48,9 @@ PICA::Writer::Import - PICA Import format serializer
 
 =head2 DESCRIPTION
 
-Serializes PICA+ records in PICA Import format (based on CBS normalized file
-format, see L<https://format.gbv.de/pica/import>).
+Serializes PICA+ records in PICA Import format (also known as "normalized title
+format", see L<https://format.gbv.de/pica/import>).
 
-Parser of PICA Import format has not been implemented yet.
+See L<PICA::Parser::Import> for corresponding parser.
 
 =cut
diff --git a/t/20-parser.t b/t/20-parser.t
@@ -17,7 +17,7 @@ is $first->{record}->[6]->[7], '柳经纬主编;', 'Unicode';
 is_deeply $first->{record}->[11],
     ['145Z', '40', 'a', '$', 'b', 'test$', 'c', '...'], 'sub field with $';
 
-foreach my $type (qw(Plain Plus JSON Binary XML PPXML PIXML)) {
+foreach my $type (qw(Plain Plus JSON Binary XML PPXML PIXML Import)) {
     my $module = "PICA::Parser::$type";
     my $file   = 't/files/pica.' . lc($type);
 
@@ -26,12 +26,14 @@ foreach my $type (qw(Plain Plus JSON Binary XML PPXML PIXML)) {
     is ref($parser), "PICA::Parser::$type", "parser from file";
 
     my $record = $parser->next;
-    is_deeply $record, $first;
-    is $parser->count, 1;
+    is_deeply $record, $first, 'first record';
+    is $parser->count, 1, 'count';
+
+    #note explain $record if $type eq 'Import';
 
     ok $parser->next()->{_id} eq '67890', 'next record';
     ok !$parser->next, 'parsed all records';
-    is $parser->count, 2;
+    is $parser->count, 2, 'count';
 
     foreach my $mode ('<', '<:utf8') {
         next
@@ -89,18 +91,18 @@ note 'error handling';
     my $plus
         = "X01A \x{1F}01\x{1E}001A/0 \x{1F}01\x{1E}001A/AB \x{1F}01\x{1E}";
     warnings_exist {PICA::Parser::Plus->new(\$plus)->next}[
-        qr{no valid PICA field structure},
-        qr{no valid PICA field structure},
-        qr{no valid PICA field structure}
+        qr{invalid PICA field structure},
+        qr{invalid PICA field structure},
+        qr{invalid PICA field structure}
     ],
         'skip faulty fields with warnings';
     dies_ok {PICA::Parser::Plus->new(\$plus, strict => 1)->next}
     'die on faulty fields with option strict';
     my $plain = "X01@ \$01\n\n001@/0 \$01\n\n001@/AB \$01";
     warnings_exist {PICA::Parser::Plain->new(\$plain)->next}[
-        qr{no valid PICA field structure},
-        qr{no valid PICA field structure},
-        qr{no valid PICA field structure}
+        qr{invalid PICA field structure},
+        qr{invalid PICA field structure},
+        qr{invalid PICA field structure}
     ],
         'skip faulty fields with warnings';
     dies_ok {PICA::Parser::Plain->new(\$plain, strict => 1)->next}

diff --git a/t/files/pica.import b/t/files/pica.import
@@ -0,0 +1,26 @@
+All data before the first record separator (ASCII 29) is ignored.
+
+
+# comment
+
+002@ 0Aan
+003@ 012345
+010@ achi
+011@ a2004n2004.01
+012X 00xy
+019@ aXB-CN
+021A a我国民事立法的回顾与展望fHistory and perspective of China civil lawh柳经纬主编;
+101@ a120cPICAdOldenburg, IBIT Universität Oldenburg <715>
+101B 015-01-09t15:32:38.000
+101D 015-01-09b9330a0715
+101U 0utf8
+145Z/40 a$btest$c...
+201B/01 015-01-09t15:32:38.000
+201D/01 015-01-09b9330a0715
+201U/01 0utf8
+203@/01 0917400194xy
+208@/01 a15-01-09bz
+209A/01 f108a401-06356/09dgx00
+
+## Another comment
+003@ 067890