From 12ed9b1054d8882f8a82c2a165c04ed53fdcf01f Mon Sep 17 00:00:00 2001 From: Graham Knop Date: Thu, 21 Nov 2024 23:43:26 +0100 Subject: [PATCH 1/2] be more resilient to bad data when updating contributors When trying to update all contributor data, it will fail because there are some broken releases in the index. While those should be fixed, they shouldn't cause the contributor script to fail. --- lib/MetaCPAN/Script/Role/Contributor.pm | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/lib/MetaCPAN/Script/Role/Contributor.pm b/lib/MetaCPAN/Script/Role/Contributor.pm index 819412ba2..63a54e682 100644 --- a/lib/MetaCPAN/Script/Role/Contributor.pm +++ b/lib/MetaCPAN/Script/Role/Contributor.pm @@ -25,9 +25,28 @@ sub update_contributors { }, ); - my $bulk = $self->es->bulk_helper( es_doc_path('contributor') ); + my $report = sub { + my ( $action, $result, $i ) = @_; + if ( $i == 0 ) { + log_info {'flushing contributor updates'}; + } + }; + + my $bulk = $self->es->bulk_helper( + es_doc_path('contributor'), + on_success => $report, + on_error => $report, + ); + + log_info { 'updating contributors for ' . $scroll->total . ' releases' }; while ( my $release = $scroll->next ) { + my $source = $release->{_source}; + my $name = $source->{name}; + if ( !( $name && $source->{author} && $source->{distribution} ) ) { + Dlog_warn {"found broken release: $_"} $release; + next; + } log_debug { 'updating contributors for ' . $release->{_source}{name} }; my $actions = $self->release_contributor_update_actions( $release->{_source} ); From bea9c1fa3a2372dbd3895542c20113127cd7b785 Mon Sep 17 00:00:00 2001 From: Graham Knop Date: Thu, 21 Nov 2024 23:45:46 +0100 Subject: [PATCH 2/2] cache the email to pauseid mapping Rather than needing a query for every release to find pause IDs, store the email to pauseid mapping between releases. This should speed up running the contributor script with --all. --- lib/MetaCPAN/Script/Role/Contributor.pm | 51 ++++++++++++++++--------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/lib/MetaCPAN/Script/Role/Contributor.pm b/lib/MetaCPAN/Script/Role/Contributor.pm index 63a54e682..101578be4 100644 --- a/lib/MetaCPAN/Script/Role/Contributor.pm +++ b/lib/MetaCPAN/Script/Role/Contributor.pm @@ -97,6 +97,11 @@ sub release_contributor_update_actions { return \@actions; } +has email_mapping => ( + is => 'ro', + default => sub { {} }, +); + sub get_contributors { my ( $self, $release ) = @_; @@ -183,24 +188,34 @@ sub get_contributors { } if (%want_email) { - my $check_author = $self->es->search( - es_doc_path('author'), - body => { - query => { terms => { email => [ sort keys %want_email ] } }, - _source => [ 'email', 'pauseid' ], - size => 100, - }, - ); - - for my $author ( @{ $check_author->{hits}{hits} } ) { - my $emails = $author->{_source}{email}; - $emails = [$emails] - if !ref $emails; - my $pauseid = uc $author->{_source}{pauseid}; - for my $email (@$emails) { - for my $contrib ( @{ $want_email{$email} } ) { - $contrib->{pauseid} = $pauseid; - } + my $email_mapping = $self->email_mapping; + + my @fetch_email = grep !exists $email_mapping->{$_}, + sort keys %want_email; + + if (@fetch_email) { + my $check_author = $self->es->search( + es_doc_path('author'), + body => { + query => { terms => { email => \@fetch_email } }, + _source => [ 'email', 'pauseid' ], + size => 100, + }, + ); + + for my $author ( @{ $check_author->{hits}{hits} } ) { + my $pauseid = uc $author->{_source}{pauseid}; + my $emails = $author->{_source}{email}; + $email_mapping->{$_} //= $pauseid + for ref $emails ? @$emails : $emails; + } + } + + for my $email ( keys %want_email ) { + my $pauseid = $email_mapping->{$email} + or next; + for my $contrib ( @{ $want_email{$email} } ) { + $contrib->{pauseid} = $pauseid; } } }