From 82e44c3eaf3cbde6e9a95d693f76bf21c4ba520b Mon Sep 17 00:00:00 2001
From: Graham Knop
Date: Mon, 8 Apr 2024 12:07:39 +0200
Subject: [PATCH] author import: handle duplicate ids

When reading 00whois.xml, there can be cpanid elements containing
duplicate id element values. This can happen when there is both a list
and author entry for the same name. In these cases, the author entry is
extraneous.

Rather than relying on XML::Simple's built-in array to hash
transformation (using id values), do the work ourselves so we can handle
the duplicate id entries. Entries without a usable id are skipped, and a
missing type no longer triggers uninitialized-value warnings.

Also configure XML::Simple to be more strict about how it parses.
---
Note: this patch was re-flowed from a whitespace-collapsed copy, so the
indentation of all lines (including context lines) is reconstructed;
applying it may require `git apply --ignore-whitespace`.

 lib/MetaCPAN/Script/Author.pm | 59 ++++++++++++++++++++++++++++++++++-
 1 file changed, 58 insertions(+), 1 deletion(-)

diff --git a/lib/MetaCPAN/Script/Author.pm b/lib/MetaCPAN/Script/Author.pm
index a59df5b01..0c996dc37 100644
--- a/lib/MetaCPAN/Script/Author.pm
+++ b/lib/MetaCPAN/Script/Author.pm
@@ -78,9 +78,66 @@ my @compare_fields = do {
     sort grep !$seen{$_}++, @cpan_fields, @author_config_fields;
 };
 
+has whois_data => (
+    is      => 'ro',
+    traits  => ['NoGetopt'],
+    lazy    => 1,
+    builder => '_build_whois_data',
+);
+
+# Build a hash ref of 00whois.xml entries keyed by PAUSE id. Each value holds
+# the entry's child elements that occur exactly once with plain text content.
+# For a duplicated id, a 'list' entry replaces an 'author' entry; otherwise
+# the first entry seen is kept.
+sub _build_whois_data {
+    my $self = shift;
+
+    # ForceArray turns every child element into an array ref and KeyAttr => []
+    # turns off XML::Simple's folding of repeated elements into a hash keyed
+    # by id, so duplicate ids are still visible in the loop below.
+    my $xml = XMLin(
+        $self->author_fh,
+        ForceArray    => 1,
+        SuppressEmpty => '',
+        NoAttr        => 1,
+        KeyAttr       => [],
+    );
+
+    my $whois_data = {};
+
+    for my $entry ( @{ $xml->{cpanid} || [] } ) {
+
+        # Keep only the fields that appear once and hold a plain string.
+        my $data = {
+            map {
+                my $content = $entry->{$_};
+                ref($content) eq 'ARRAY'
+                    && @$content == 1
+                    && !ref $content->[0] ? ( $_ => $content->[0] ) : ();
+            } keys %$entry
+        };
+
+        # An entry without a usable id cannot be keyed, so skip it instead
+        # of filing it under an undefined key.
+        my $pauseid = $data->{id};
+        next unless defined $pauseid && length $pauseid;
+
+        # A missing type is treated as '' so the comparison never warns.
+        my $existing = $whois_data->{$pauseid};
+        if (  !$existing
+            || (   ( $existing->{type} // '' ) eq 'author'
+                && ( $data->{type} // '' ) eq 'list' ) )
+        {
+            $whois_data->{$pauseid} = $data;
+        }
+    }
+
+    return $whois_data;
+}
+
 sub index_authors {
     my $self = shift;
-    my $authors = XMLin( $self->author_fh )->{cpanid};
+    my $authors = $self->whois_data;
 
     if ( $self->pauseid ) {
         log_info {"Indexing 1 author"};