Skip to content

Commit

Permalink
Merge pull request #1322 from metacpan/haarg/index-contrib-name-email
Browse files Browse the repository at this point in the history
include name and email in contributor index
  • Loading branch information
haarg authored Nov 19, 2024
2 parents 2656fc0 + 70ca750 commit 232b751
Show file tree
Hide file tree
Showing 6 changed files with 227 additions and 85 deletions.
10 changes: 10 additions & 0 deletions es/contributor/mapping.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,16 @@
"index": "not_analyzed",
"type": "string"
},
"name": {
"ignore_above": 2048,
"index": "not_analyzed",
"type": "string"
},
"email": {
"ignore_above": 2048,
"index": "not_analyzed",
"type": "string"
},
"release_author": {
"ignore_above": 2048,
"index": "not_analyzed",
Expand Down
17 changes: 13 additions & 4 deletions lib/MetaCPAN/Document/Contributor.pm
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package MetaCPAN::Document::Contributor;
use MetaCPAN::Moose;

use ElasticSearchX::Model::Document;
use MetaCPAN::Types::TypeTiny qw( Str );
use MetaCPAN::Types::TypeTiny qw( ArrayRef Str );

has distribution => (
is => 'ro',
Expand All @@ -24,9 +24,18 @@ has release_name => (
);

has pauseid => (
is => 'ro',
isa => Str,
required => 1,
is => 'ro',
isa => Str,
);

has name => (
is => 'ro',
isa => Str,
);

has email => (
is => 'ro',
isa => ArrayRef [Str],
);

__PACKAGE__->meta->make_immutable;
Expand Down
27 changes: 20 additions & 7 deletions lib/MetaCPAN/Query/Contributor.pm
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,24 @@ sub find_release_contributors {
my $query = +{
bool => {
must => [
{ term => { release_author => $author } },
{ term => { release_name => $name } },
{ term => { release_author => $author } },
{ term => { release_name => $name } },
{ exists => { field => 'pauseid' } },
]
}
};

my $res = $self->es->search(
es_doc_path('contributor'),
body => {
query => $query,
size => 999,
query => $query,
size => 999,
_source => [ qw(
distribution
pauseid
release_author
release_name
) ],
}
);
hit_total($res) or return {};
Expand All @@ -40,11 +47,17 @@ sub find_author_contributions {
my $res = $self->es->search(
es_doc_path('contributor'),
body => {
query => $query,
size => 999,
query => $query,
size => 999,
_source => [ qw(
distribution
pauseid
release_author
release_name
) ],
}
);
$res->{hits}{total} or return {};
hit_total($res) or return {};

return +{
contributors => [ map { $_->{_source} } @{ $res->{hits}{hits} } ] };
Expand Down
27 changes: 1 addition & 26 deletions lib/MetaCPAN/Script/Contributor.pm
Original file line number Diff line number Diff line change
Expand Up @@ -80,32 +80,7 @@ sub run {
? { range => { date => { gte => sprintf( 'now-%dd', $self->age ) } } }
: return;

my $timeout = $self->all ? '720m' : '5m';

my $scroll = $self->es->scroll_helper(
scroll => $timeout,
es_doc_path('release'),
body => {
query => $query,
size => 500,
_source => [qw( author distribution name )],
},
);

my @data;

while ( my $r = $scroll->next ) {
my $contrib_data = $self->get_cpan_author_contributors(
$r->{_source}{author},
$r->{_source}{name},
$r->{_source}{distribution},
);
next unless is_arrayref($contrib_data);
log_debug { 'adding release ' . $r->{_source}{name} };
push @data => @{$contrib_data};
}

$self->update_release_contirbutors( \@data, $timeout );
$self->update_contributors($query);
}

__PACKAGE__->meta->make_immutable;
Expand Down
11 changes: 8 additions & 3 deletions lib/MetaCPAN/Script/Release.pm
Original file line number Diff line number Diff line change
Expand Up @@ -383,9 +383,14 @@ sub import_archive {
MetaCPAN::Script::Runner->run;
}

my $contrib_data = $self->get_cpan_author_contributors( $document->author,
$document->name, $document->distribution );
$self->update_release_contirbutors($contrib_data);
$self->update_contributors( {
bool => {
must => [
{ term => { author => $document->author } },
{ term => { name => $document->name } },
],
},
} );
}

sub detect_status {
Expand Down
220 changes: 175 additions & 45 deletions lib/MetaCPAN/Script/Role/Contributor.pm
Original file line number Diff line number Diff line change
Expand Up @@ -2,61 +2,191 @@ package MetaCPAN::Script::Role::Contributor;

use Moose::Role;

use Log::Contextual qw( :log );
use MetaCPAN::ESConfig qw( es_doc_path );
use MetaCPAN::Util qw( digest true false );
use MetaCPAN::Util qw( true false );
use Ref::Util qw( is_arrayref );

sub get_cpan_author_contributors {
my ( $self, $author, $release, $distribution ) = @_;
my @ret;
my $es = $self->es;

my $type = $self->model->doc('release');
my $data;
eval {
$data = $type->get_contributors( $author, $release );
1;
} or return [];

for my $d ( @{ $data->{contributors} } ) {
next unless exists $d->{pauseid};

# skip existing records
my $id = digest( $d->{pauseid}, $release );
my $exists = $es->exists( es_doc_path('contributor'), id => $id, );
next if $exists;

$d->{release_author} = $author;
$d->{release_name} = $release;
$d->{distribution} = $distribution;
push @ret, $d;
# Rebuild the contributor documents for every release matched by $query.
#
# $query is an Elasticsearch query body that is run against the release
# index. Each matching release is turned into a list of bulk actions by
# release_contributor_update_actions() and queued on a bulk helper that
# targets the contributor index.
#
# Returns whatever the bulk helper's flush() returns.
sub update_contributors {
    my ( $self, $query ) = @_;

    # Only the fields needed to derive contributors are fetched. Sorting by
    # _doc asks for index order, since no particular order is needed here.
    my $scroll = $self->es->scroll_helper(
        es_doc_path('release'),
        body => {
            query   => $query,
            sort    => ['_doc'],
            _source => [ qw<
                name
                author
                distribution
                metadata.author
                metadata.x_contributors
            > ],
        },
    );

    my $bulk = $self->es->bulk_helper( es_doc_path('contributor') );

    while ( my $release = $scroll->next ) {
        log_debug { 'updating contributors for ' . $release->{_source}{name} };
        my $actions = $self->release_contributor_update_actions(
            $release->{_source} );
        for my $action (@$actions) {
            $bulk->add_action(%$action);
        }
    }

    # Must be reached on every path: anything still buffered in the helper
    # is only sent to Elasticsearch here.
    $bulk->flush;
}

sub update_release_contirbutors {
my ( $self, $data, $timeout ) = @_;
return unless $data and is_arrayref($data);

my $bulk = $self->es->bulk_helper( es_doc_path('contributor'),
timeout => $timeout || '5m', );

for my $d ( @{$data} ) {
my $id = digest( $d->{pauseid}, $d->{release_name} );
$bulk->update( {
id => $id,
doc => {
pauseid => $d->{pauseid},
release_name => $d->{release_name},
release_author => $d->{release_author},
distribution => $d->{distribution},
# Compute the bulk actions that replace the contributor documents of one
# release.
#
# $release is the _source hash of a release document; the keys read here are
# name, author and distribution (get_contributors() reads more).
#
# Returns an array reference of single-key hashes suitable for
# $bulk->add_action(%$action): first one "delete" per contributor document
# currently indexed for this release, then one "create" per contributor
# reported by get_contributors().
sub release_contributor_update_actions {
    my ( $self, $release ) = @_;
    my @actions;

    # Find the contributor documents already stored for this release. Only
    # the ids are needed, so _source is switched off.
    # NOTE(review): at most 500 hits are requested, so a release with more
    # existing contributor documents than that would keep the surplus.
    my $res = $self->es->search(
        es_doc_path('contributor'),
        body => {
            query => {
                bool => {
                    must => [
                        { term => { release_name   => $release->{name} } },
                        { term => { release_author => $release->{author} } },
                    ],
                }
            },
            sort    => ['_doc'],
            size    => 500,
            _source => false,
        },
    );

    for my $hit ( @{ $res->{hits}{hits} } ) {
        push @actions, { delete => { id => $hit->{_id} } };
    }

    for my $contrib ( @{ $self->get_contributors($release) } ) {
        my %doc = (
            release_name   => $release->{name},
            release_author => $release->{author},
            distribution   => $release->{distribution},
        );

        # Leave unknown fields out of the document instead of indexing
        # explicit nulls.
        for my $field (qw(pauseid name email)) {
            $doc{$field} = $contrib->{$field}
                if defined $contrib->{$field};
        }

        push @actions, { create => { _source => \%doc } };
    }

    return \@actions;
}

# Derive the contributors of a release from its metadata.
#
# $release is the _source hash of a release document; the keys read are
# author, metadata.author and metadata.x_contributors. The latter two are
# expected to hold "Name <email>" strings (a lone string is accepted too).
#
# Returns an array reference of hashes with the keys name, email (array
# reference) and, when one could be determined, pauseid. Entries whose name
# or email matches the release author are not returned. An empty array
# reference is returned when the author document cannot be fetched.
sub get_contributors {
    my ( $self, $release ) = @_;

    my $author_name = $release->{author};
    my $contribs    = $release->{metadata}{x_contributors} || [];
    my $authors     = $release->{metadata}{author}         || [];

    for my $list ( \( $contribs, $authors ) ) {

        # If a sole contributor is a string upgrade it to an array...
        $$list = [$$list]
            if !ref $$list;

        # but if it's any other kind of value don't die trying to parse it.
        $$list = []
            unless Ref::Util::is_arrayref($$list);

        # Entries are parsed as plain strings below; undefined or nested
        # values cannot be parsed and would only produce bogus contributors.
        $$list = [ grep { defined($_) && !ref($_) } @{$$list} ];
    }
    $authors = [ grep { $_ ne 'unknown' } @$authors ];

    my $author = eval {
        $self->es->get_source( es_doc_path('author'), id => $author_name );
    }
        or return [];

    # The author document may carry one address, a list of addresses, or
    # none at all. Drop missing/empty values so they never become the ''
    # key of %seen, which would make every nameless contributor look like
    # the release author.
    my $author_email  = $author->{email};
    my @author_emails = grep { defined($_) && length($_) } (
        Ref::Util::is_arrayref($author_email)
        ? @{$author_email}
        : $author_email
    );

    my $author_info = {
        email => [ lc "$author_name\@cpan.org", @author_emails ],
        name  => $author_name,
    };

    # Index of every name and address seen so far, pre-seeded with the
    # release author so that the author is never listed as a contributor.
    my %seen = map { ( $_ => $author_info ) }
        ( @{ $author_info->{email} }, $author_info->{name} );

    my @contribs = map {
        my $name = $_;
        my $email;

        # Split "Name <address>" into its two parts.
        if ( $name =~ s/\s*<([^<>]+@[^<>]+)>// ) {
            $email = $1;
        }
        my $info;
        my $dupe;
        if ( $email and $info = $seen{$email} ) {
            $dupe = 1;
        }
        elsif ( $info = $seen{$name} ) {
            $dupe = 1;
        }
        else {
            $info = {
                name  => $name,
                email => [],
            };
        }
        $seen{$name} ||= $info;
        if ($email) {
            push @{ $info->{email} }, $email
                unless grep { $_ eq $email } @{ $info->{email} };
            $seen{$email} ||= $info;
        }

        # A duplicate only enriches the record it matched.
        $dupe ? () : $info;
    } ( @$authors, @$contribs );

    my %want_email;
    for my $contrib (@contribs) {

        # heuristic to autofill pause accounts
        if ( !$contrib->{pauseid} ) {
            my ($pauseid)
                = map { /^(.*)\@cpan\.org$/ ? $1 : () }
                @{ $contrib->{email} };
            $contrib->{pauseid} = uc $pauseid
                if $pauseid;

        }

        push @{ $want_email{$_} }, $contrib for @{ $contrib->{email} };
    }

    if (%want_email) {

        # Look up author documents that list any of the collected addresses
        # and copy their PAUSE id onto the matching contributors.
        my $check_author = $self->es->search(
            es_doc_path('author'),
            body => {
                query   => { terms => { email => [ sort keys %want_email ] } },
                _source => [ 'email', 'pauseid' ],
                size    => 100,
            },
        );

        for my $author ( @{ $check_author->{hits}{hits} } ) {
            my $emails = $author->{_source}{email};
            $emails = [$emails]
                if !ref $emails;
            my $pauseid = uc $author->{_source}{pauseid};
            for my $email (@$emails) {

                # An author may list addresses nobody asked about; skip
                # them without creating empty %want_email entries.
                next
                    unless defined $email && exists $want_email{$email};
                for my $contrib ( @{ $want_email{$email} } ) {
                    $contrib->{pauseid} = $pauseid;
                }
            }
        }
    }

    return \@contribs;
}

no Moose::Role;
Expand Down

0 comments on commit 232b751

Please sign in to comment.