Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

rewrite author update script to only purge on updates #1166

Merged
merged 1 commit into from
Feb 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
272 changes: 169 additions & 103 deletions lib/MetaCPAN/Script/Author.pm
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,16 @@ use warnings;
use Moose;
with 'MooseX::Getopt', 'MetaCPAN::Role::Script';

use DateTime::Format::ISO8601 ();
use DateTime ();
use Email::Valid ();
use Encode ();
use File::stat ();
use Cpanel::JSON::XS qw( decode_json );
use Log::Contextual qw( :log );
use Log::Contextual qw( :log :dlog );
use MetaCPAN::Document::Author ();
use URI ();
use XML::Simple qw( XMLin );
use MetaCPAN::Types::TypeTiny qw( Str );
use MetaCPAN::Util qw(diff_struct);

=head1 SYNOPSIS

Expand Down Expand Up @@ -48,29 +48,51 @@ sub run {
$self->index->refresh;
}

my @author_config_fields = qw(
name
asciiname
profile
blog
perlmongers
donation
email
website
city
region
country
location
extra
);

my @cpan_fields = qw(
pauseid
name
email
website
asciiname
is_pause_custodial_account
);

my @compare_fields = do {
my %seen;
sort grep !$seen{$_}++, @cpan_fields, @author_config_fields;
};

sub index_authors {
my $self = shift;
my $type = $self->index->type('author');
my $authors = XMLin( $self->author_fh )->{cpanid};

if ( $self->pauseid ) {
log_info {"Indexing 1 author"};
$authors = { $self->pauseid => $authors->{ $self->pauseid } };
}
else {
my $count = keys %$authors;
log_debug {"Counting author"};
log_info {"Indexing $count authors"};
}

log_debug {"Getting last update dates"};
my $dates
= $type->raw->filter( { exists => { field => 'updated' } } )
->size(10000)->all;
$dates = {
map {
$_->{pauseid} =>
DateTime::Format::ISO8601->parse_datetime( $_->{updated} )
} map { $_->{_source} } @{ $dates->{hits}->{hits} }
};
my @author_ids_to_purge;

my $bulk = $self->es->bulk_helper(
index => $self->index->name,
Expand All @@ -79,131 +101,175 @@ sub index_authors {
timeout => '25m',
);

my @author_ids_to_purge;
my $scroll = $self->es->scroll_helper(
index => $self->index->name,
search_type => 'scan',
size => 500,
body => {
query => {
$self->pauseid
? (
term => {
pauseid => $self->pauseid,
},
)
: ( match_all => {} ),
},
_source => [@compare_fields],
},
);

# update authors
while ( my $doc = $scroll->next ) {
my $pauseid = $doc->{_id};
my $whois_data = delete $authors->{$pauseid} || next;
$self->update_author( $bulk, $pauseid, $whois_data, $doc->{_source} );
}

# new authors
for my $pauseid ( keys %$authors ) {
next if ( $self->pauseid and $self->pauseid ne $pauseid );
my $data = $authors->{$pauseid};
my ( $name, $email, $homepage, $asciiname )
= ( @$data{qw(fullname email homepage asciiname)} );
$name = undef if ( ref $name );
$asciiname = q{} unless defined $asciiname;
$email = lc($pauseid) . '@cpan.org'
unless ( $email && Email::Valid->address($email) );
my $is_pause_custodial_account
= ( $name && $name =~ /\(PAUSE Custodial Account\)/ );
log_debug {
Encode::encode_utf8(
sprintf( "Indexing %s: %s <%s>", $pauseid, $name, $email ) );
};
my $conf = $self->author_config( $pauseid, $dates );
next unless ( $conf or $is_pause_custodial_account );
$conf ||= {};
my $put = {
pauseid => $pauseid,
name => $name,
asciiname => ref $asciiname ? undef : $asciiname,
email => $email,
website => $homepage,
map { $_ => $conf->{$_} }
grep { defined $conf->{$_} } keys %$conf
};
$put->{website} = [ $put->{website} ]
unless ( ref $put->{website} eq 'ARRAY' );
$put->{website} = [

# normalize www.homepage.com to http://www.homepage.com
map { $_->scheme ? $_->as_string : 'http://' . $_->as_string }
map { URI->new($_)->canonical }
grep {$_} @{ $put->{website} }
];

$put->{is_pause_custodial_account} = 1 if $is_pause_custodial_account;

# Now check the format we have is actually correct
my @errors = MetaCPAN::Document::Author->validate($put);
next if scalar @errors;

my $author = $type->new_document($put);
$author->gravatar_url; # build gravatar_url

# Do not import lat / lon's in the wrong order, or just invalid
if ( my $loc = $author->{location} ) {
my $whois_data = delete $authors->{$pauseid} || next;
$self->update_author( $bulk, $pauseid, $whois_data );
}

$bulk->flush;
$self->index->refresh;

$self->perform_purges;

log_info {"done"};
}

sub author_data_from_cpan {
my $self = shift;
my ( $pauseid, $whois_data ) = @_;

my $author_config = $self->author_config($pauseid) || {};

my $data = {
pauseid => $pauseid,
name => $whois_data->{fullname},
email => $whois_data->{email},
website => $whois_data->{homepage},
asciiname => $whois_data->{asciiname},
%$author_config,
is_pause_custodial_account => (
$whois_data->{fullname} =~ /\(PAUSE Custodial Account\)/ ? 1 : 0
),
};

undef $data->{name}
if ref $data->{name};

$data->{asciiname} = q{}
if !defined $data->{asciiname};

$data->{email} = lc($pauseid) . '@cpan.org'
unless $data->{email} && Email::Valid->address( $data->{email} );

$data->{website} = [

# normalize www.homepage.com to http://www.homepage.com
map +( $_->scheme ? '' : 'http://' ) . $_->as_string,
map URI->new($_)->canonical,
grep $_,
map +( ref eq 'ARRAY' ? @$_ : $_ ),
$data->{website}
];

# Do not import lat / lon's in the wrong order, or just invalid
if ( my $loc = $data->{location} ) {
if ( ref $loc ne 'ARRAY' || @$loc != 2 ) {
delete $data->{location};
}
else {
my $lat = $loc->[1];
my $lon = $loc->[0];

if ( $lat > 90 or $lat < -90 ) {
if ( !defined $lat or $lat > 90 or $lat < -90 ) {

# Invalid latitude
delete $author->{location};
delete $data->{location};
}
elsif ( $lon > 180 or $lon < -180 ) {
elsif ( !defined $lon or $lon > 180 or $lon < -180 ) {

# Invalid longitude
delete $author->{location};
delete $data->{location};
}
}
}

push @author_ids_to_purge, $put->{pauseid};
return $data;
}

# Only try put if this is a valid format
$bulk->update( {
id => $pauseid,
doc => $put,
doc_as_upsert => 1,
} );
sub update_author {
my $self = shift;
my ( $bulk, $pauseid, $whois_data, $current_data ) = @_;

my $data = $self->author_data_from_cpan( $pauseid, $whois_data );

log_debug {
Encode::encode_utf8( sprintf(
"Indexing %s: %s <%s>",
$pauseid, $data->{name}, $data->{email}
) );
};

# Now check the format we have is actually correct
if ( my @errors = MetaCPAN::Document::Author->validate($data) ) {
Dlog_error {
"Invalid data for $pauseid: $_"
}
\@errors;
return;
}

$bulk->flush;
$self->index->refresh;
return
unless diff_struct( $current_data, $data );

$self->purge_author_key(@author_ids_to_purge);
$self->perform_purges;
$data->{updated} = DateTime->now( time_zone => 'UTC' )->iso8601;

log_info {"done"};
$bulk->update( {
id => $pauseid,
doc => $data,
doc_as_upsert => 1,
} );

$self->purge_author_key($pauseid);
}

sub author_config {
my ( $self, $pauseid, $dates ) = @_;

my $fallback = $dates->{$pauseid} ? undef : {};
my ( $self, $pauseid ) = @_;

my $dir = $self->cpan->child( 'authors',
MetaCPAN::Util::author_dir($pauseid) );

my @files;
opendir( my $dh, $dir ) || return $fallback;
return undef
unless $dir->is_dir;

# Get the most recent version
my ($file)
= sort { $dir->child($b)->stat->mtime <=> $dir->child($a)->stat->mtime }
grep {m/author-.*?\.json/} readdir($dh);
return $fallback unless ($file);
$file = $dir->child($file);
return $fallback if !-e $file;
my ($file) = map $_->[0], sort { $a->[1] <=> $b->[1] }
map [ $_ => $_->stat->mtime ],
$dir->children(qr/\Aauthor-.*\.json\z/);

my $mtime = DateTime->from_epoch( epoch => $file->stat->mtime );

if ( $dates->{$pauseid} && $dates->{$pauseid} > $mtime ) {
log_debug {"Skipping $pauseid (newer version in index)"};
return undef;
}
return undef
unless $file;

my $author;
eval {
$author = decode_json( $file->slurp );
$author = decode_json( $file->slurp_raw );
1;
} or do {
log_warn {"$file is broken: $@"};
return $fallback;
return undef;
};

return {
map {
my $value = $author->{$_};
defined $value ? ( $_ => $value ) : ()
} @author_config_fields
};
$author
= { map { $_ => $author->{$_} }
qw(name asciiname profile blog perlmongers donation email website city region country location extra)
};
$author->{updated} = $mtime->iso8601;
return $author;
}

__PACKAGE__->meta->make_immutable;
Expand Down
Loading