Skip to content

Commit

Permalink
feat: new [taxonomy].extended.json with extended synonyms (#10744)
Browse files Browse the repository at this point in the history
This creates a new .extended.json file for all taxonomies, that contain
extended synonyms.
Fixes #10742

Notes:
- synonyms_extended entries are normalized (no accents, capitals, all
non letter chars turned to dashes etc.)
- synonyms are not normalized (as in the normal export)
- properties, parents, childrens are not included (they are in the .full
export)
- the .full export is not changed (it does not contain the extended
synonyms)
  • Loading branch information
stephanegigandet authored Aug 28, 2024
1 parent dc71d0d commit e1a485e
Showing 1 changed file with 32 additions and 21 deletions.
53 changes: 32 additions & 21 deletions lib/ProductOpener/Tags.pm
Original file line number Diff line number Diff line change
Expand Up @@ -1098,8 +1098,7 @@ sub get_file_from_cache ($source, $target) {
# e.g. if the taxonomy building algorithm or configuration has changed
# This needs to be done also when the unaccenting parameters for languages set in Config.pm are changed

my $BUILD_TAGS_VERSION
= "20240403 - fix issue with additives.properties.txt not loaded + circular_parent check + moved canonicalization of properties to linter";
my $BUILD_TAGS_VERSION = "20240828 - new [tagtype].extended.json format with normalized extended synonyms";

sub get_from_cache ($tagtype, @files) {
# If the full set of cached files can't be found then returns the hash to be used
Expand Down Expand Up @@ -2092,6 +2091,7 @@ sub build_tags_taxonomy ($tagtype, $publish) {
# data structure to export the taxonomy to json format
my %taxonomy_json = ();
my %taxonomy_full_json = (); # including wikipedia abstracts
my %taxonomy_extended_json = (); # with extended synonyms

foreach my $lc (sort keys %{$stopwords{$tagtype}}) {
next if $lc =~ /\./; # .orig or .strings
Expand All @@ -2107,6 +2107,7 @@ sub build_tags_taxonomy ($tagtype, $publish) {

$taxonomy_json{$tagid} = {name => {}};
$taxonomy_full_json{$tagid} = {name => {}};
$taxonomy_extended_json{$tagid} = {name => {}};

# print "taxonomy - compute all children - $tagid - level: $level{$tagtype}{$tagid} - longest: $longest_parent{$tagid} - syn: $just_synonyms{$tagtype}{$tagid} - sort_key: $sort_key_parents{$tagid} \n";
if (defined $direct_parents{$tagtype}{$tagid}) {
Expand Down Expand Up @@ -2160,6 +2161,7 @@ sub build_tags_taxonomy ($tagtype, $publish) {

$taxonomy_json{$tagid}{name}{$lc} = $translations_to{$tagtype}{$tagid}{$lc};
$taxonomy_full_json{$tagid}{name}{$lc} = $translations_to{$tagtype}{$tagid}{$lc};
$taxonomy_extended_json{$tagid}{name}{$lc} = $translations_to{$tagtype}{$tagid}{$lc};

my $lc_tagid = get_string_id_for_lang($lc, $translations_to{$tagtype}{$tagid}{$lc});

Expand All @@ -2175,12 +2177,24 @@ sub build_tags_taxonomy ($tagtype, $publish) {
{
$taxonomy_json{$tagid}{name}{$lc} .= " - " . $synonyms_for{$tagtype}{$lc}{$lc_tagid}[1];
$taxonomy_full_json{$tagid}{name}{$lc} .= " - " . $synonyms_for{$tagtype}{$lc}{$lc_tagid}[1];
$taxonomy_extended_json{$tagid}{name}{$lc}
.= " - " . $synonyms_for{$tagtype}{$lc}{$lc_tagid}[1];
}

# add synonyms to the full taxonomy
if (defined $synonyms_for{$tagtype}{$lc}{$lc_tagid}) {
(defined $taxonomy_full_json{$tagid}{synonyms}) or $taxonomy_full_json{$tagid}{synonyms} = {};
$taxonomy_full_json{$tagid}{synonyms}{$lc} = $synonyms_for{$tagtype}{$lc}{$lc_tagid};
$taxonomy_extended_json{$tagid}{normalized_synonyms}{$lc}
= [map {get_string_id_for_lang($lc, $_)} @{$synonyms_for{$tagtype}{$lc}{$lc_tagid}}];
}

# add extended synonyms to the extended taxonomy
if (defined $synonyms_for_extended{$tagtype}{$lc}{$lc_tagid}) {
(defined $taxonomy_extended_json{$tagid}{synonyms_extended})
or $taxonomy_extended_json{$tagid}{synonyms_extended} = {};
$taxonomy_extended_json{$tagid}{normalized_synonyms_extended}{$lc}
= [sort keys %{$synonyms_for_extended{$tagtype}{$lc}{$lc_tagid}}];
}
}
}
Expand Down Expand Up @@ -2235,27 +2249,24 @@ sub build_tags_taxonomy ($tagtype, $publish) {

{
binmode STDOUT, ":encoding(UTF-8)";
if (open(my $OUT_JSON, ">", "$BASE_DIRS{PUBLIC_DATA}/taxonomies/$tagtype.json")) {
print $OUT_JSON encode_json(\%taxonomy_json);
close($OUT_JSON);
}
else {
print
"Cannot open $BASE_DIRS{PUBLIC_DATA}/taxonomies/$tagtype.json, skipping writing taxonomy to file.\n";
}

if (open(my $OUT_JSON_FULL, ">", "$BASE_DIRS{PUBLIC_DATA}/taxonomies/$tagtype.full.json")) {
print $OUT_JSON_FULL encode_json(\%taxonomy_full_json);
close($OUT_JSON_FULL);
}
else {
print
"Cannot open $BASE_DIRS{PUBLIC_DATA}/taxonomies/$tagtype.full.json, skipping writing taxonomy to file.\n";
foreach my $extension_taxonomy_ref (
["", \%taxonomy_json],
[".full", \%taxonomy_full_json],
[".extended", \%taxonomy_extended_json]
)
{
my ($extension, $taxonomy_ref) = @$extension_taxonomy_ref;
# Output the taxonomy in JSON format
if (open(my $OUT_JSON, ">", "$BASE_DIRS{PUBLIC_DATA}/taxonomies/$tagtype$extension.json")) {
print $OUT_JSON encode_json($taxonomy_ref);
close($OUT_JSON);
}
else {
print
"Cannot open $BASE_DIRS{PUBLIC_DATA}/taxonomies/$tagtype$extension.json, skipping writing taxonomy to file.\n";
}
}
# to serve pre-compressed files from Apache
# nginx : needs nginx_static module
# system("cp $BASE_DIRS{PUBLIC_DATA}/taxonomies/$tagtype.json $BASE_DIRS{PUBLIC_DATA}/taxonomies/$tagtype.json.json");
# system("gzip $BASE_DIRS{PUBLIC_DATA}/taxonomies/$tagtype.json");
}

$log->error("taxonomy errors", {errors => \@taxonomy_errors}) if $log->is_error();
Expand Down

0 comments on commit e1a485e

Please sign in to comment.