Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: improve ingredients extraction #8942

Merged
merged 8 commits into from
Sep 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ unit_test:
@echo "🥫 unit tests success"

integration_test:
@echo "🥫 Running unit tests …"
@echo "🥫 Running integration tests …"
# we launch the server and run tests within same container
# we also need dynamicfront for some assets to exists
# this is the place where variables are important
Expand Down
22 changes: 16 additions & 6 deletions lib/ProductOpener/Ingredients.pm
Original file line number Diff line number Diff line change
Expand Up @@ -3955,7 +3955,9 @@ my %phrases_after_ingredients_list = (
],

en => [
'adds a trivial amount', # e.g. adds a trivial amount of added sugars per serving
'after opening',
#'Best before',
'nutrition(al)? (as sold|facts|information|typical|value[s]?)',
# "nutrition advice" seems to appear before ingredients rather than after.
# "nutritional" on its own would match the ingredient "nutritional yeast" etc.
Expand All @@ -3966,7 +3968,6 @@ my %phrases_after_ingredients_list = (
'once opened[,]? (consume|keep|refrigerate|store|use)',
'(Storage( instructions)?[: ]+)?Store in a cool[,]? dry place',
'(dist(\.)?|distributed|sold)(\&|and|sold| )* (by|exclusively)',
#'Best before',
#'See bottom of tin',
],

Expand Down Expand Up @@ -4214,10 +4215,10 @@ my %prefixes_before_dash = (fr => ['demi', 'saint',],);
my %ignore_phrases = (
de => [
'\d\d?\s?%\sFett\si(\.|,)\s?Tr(\.|,)?', # 45 % Fett i.Tr.
"inklusive",
'inklusive',
],
en => ["na|n/a|not applicable",],
benbenben2 marked this conversation as resolved.
Show resolved Hide resolved
fr => ["non applicable|non concerné",],
en => ['not applicable',],
fr => ['non applicable|non concerné',],

);

Expand Down Expand Up @@ -4422,7 +4423,7 @@ sub cut_ingredients_text_for_lang ($text, $language) {
if (defined $phrases_after_ingredients_list{$language}) {

foreach my $regexp (@{$phrases_after_ingredients_list{$language}}) {
if ($text =~ /\s*\b$regexp\b(.*)$/is) {
if ($text =~ /\*?\s*\b$regexp\b(.*)$/is) {
$text = $`;
$log->debug("removed phrases_after_ingredients_list", {removed => $1, kept => $text, regexp => $regexp})
if $log->is_debug();
Expand All @@ -4437,7 +4438,16 @@ sub cut_ingredients_text_for_lang ($text, $language) {
if (defined $ignore_phrases{$language}) {

foreach my $regexp (@{$ignore_phrases{$language}}) {
$text =~ s/^\s*($regexp)(\.)?\s*$//is;
# subtract regexp (replace every match of the phrase with a single space)
$text =~ s/\s*\b(?:$regexp)\s*/ /gi;
# rm opened-closed parenthesis
$text =~ s/\(\s?\)//g;
# rm double commas
$text =~ s/\s?,\s?,/,/g;
# rm double spaces
$text =~ s/\s+/ /g;
# rm space before comma
$text =~ s/\s,\s?/, /g;
}
}

Expand Down
3 changes: 3 additions & 0 deletions taxonomies/ingredients.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16565,6 +16565,9 @@ ciqual_food_name:en:Soy oil
ciqual_food_name:fr:Huile de soja
# ingredient/soya-oil has 41836 products 1in 28 languages @2021-08-16

<en:soya oil
en:non-gmo soybean oil

<en:soya oil
en:refined soya oil
pl:rafinowany olej sojowy
Expand Down
2 changes: 1 addition & 1 deletion taxonomies/labels.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4376,7 +4376,7 @@ nl:Niet geschikt voor kinderen onder 1 jaar
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#en:description:Labels used for identifying products (and ingredients) that have been grown organically.

en:Organic, organically grown, organically produced, ingredient produced organically, from organic farming, From Organic Agriculture
en:Organic, organically grown, organically produced, ingredient produced organically, from organic farming, From Organic Agriculture, organic ingredients
bg:Био, биологично земеделие, биологично
ca:Orgànic,de cultiu ecologic
cs:Bio
Expand Down
13 changes: 4 additions & 9 deletions tests/unit/ingredients_clean.t
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ my @tests = (
[
"fr",
"lait 98 % ,sel,ferments lactiques,coagulant Valeurs nutritionnelles Pour 100 g 1225 kj 295 kcal pour 22g 270 kJ 65 kcal Matières grasses dont acides gras saturés pour 100g 23g/ 15,5g pour 22g 5,1g/ 3,4g Glucides dont sucres traces Protéines pour 100g 22 g pour 22g 4,8 g Sel pour 100g 1,8 g pour 22g 0,40g Calcium pour 100g 680 mg(85 % ) pour 22g 150 mg(19 % ) Afin d'éviter les risques d'étouffement pour les enfants de moins de 4 ans, coupez en petites bouchées. AQR: Apports Quotidiens de Référence А conserver au froid après achat.",
"lait 98 % ,sel,ferments lactiques,coagulant"
"lait 98 %, sel,ferments lactiques,coagulant"
],

[
Expand Down Expand Up @@ -63,14 +63,9 @@ my @tests = (
],

[
"fr", "Ingrédients :
Pulpe de tomate 41% (tomate pelée 24.6%, jus de tomate 16.4%, acidifiant : acide citrique), purée de tomate 25%, eau, oignon,
crème fraîche
5%, lait de coco déshydraté 2,5% (contient des protéines de lait), curry 2%, sucre, amidon modifié de maïs, poivron vert, poivron rouge, sel, noix de coco râpée 1%, arôme naturel de curry 0,25%, acidifiant : acide lactique. Peut contenir des traces de céleri et de moutarde.
",
"Pulpe de tomate 41% (tomate pelée 24.6%, jus de tomate 16.4%, acidifiant : acide citrique), purée de tomate 25%, eau, oignon,
crème fraîche
5%, lait de coco déshydraté 2,5% (contient des protéines de lait), curry 2%, sucre, amidon modifié de maïs, poivron vert, poivron rouge, sel, noix de coco râpée 1%, arôme naturel de curry 0,25%, acidifiant : acide lactique. Peut contenir des traces de céleri et de moutarde."
"fr",
"Ingrédients : Pulpe de tomate 41% (tomate pelée 24.6%, jus de tomate 16.4%, acidifiant : acide citrique), purée de tomate 25%, eau, oignon, crème fraîche 5%, lait de coco déshydraté 2,5% (contient des protéines de lait), curry 2%, sucre, amidon modifié de maïs, poivron vert, poivron rouge, sel, noix de coco râpée 1%, arôme naturel de curry 0,25%, acidifiant : acide lactique. Peut contenir des traces de céleri et de moutarde.",
"Pulpe de tomate 41% (tomate pelée 24.6%, jus de tomate 16.4%, acidifiant : acide citrique), purée de tomate 25%, eau, oignon, crème fraîche 5%, lait de coco déshydraté 2,5% (contient des protéines de lait), curry 2%, sucre, amidon modifié de maïs, poivron vert, poivron rouge, sel, noix de coco râpée 1%, arôme naturel de curry 0,25%, acidifiant : acide lactique. Peut contenir des traces de céleri et de moutarde."
],

[
Expand Down
42 changes: 42 additions & 0 deletions tests/unit/ingredients_extract.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/perl -w

# Tests of Ingredients::cut_ingredients_text_for_lang()

use Modern::Perl '2017';
use utf8;

use Test::More;
use Log::Any::Adapter 'TAP';

use ProductOpener::Products qw/:all/;
use ProductOpener::Tags qw/:all/;
use ProductOpener::TagsEntries qw/:all/;
use ProductOpener::Ingredients qw/:all/;

# Test cases for cut_ingredients_text_for_lang().
# Each entry is an array ref of three strings:
#   [0] language code of the ingredients text,
#   [1] input text (as extracted from the picture),
#   [2] text expected back from cut_ingredients_text_for_lang().
my @lists = (
	# en phrases_after_ingredients_list:
	# the footnote starting with "*adds a trivial amount" must be cut off
	[
		"en",
		"carrots, green peas, corn, scallion. *adds a trivial amount of added sugars per serving.",
		"carrots, green peas, corn, scallion.",
	],
	# en ignore_phrases:
	# "not applicable" and the parentheses left empty around it must be removed
	[
		"en",
		"Egg White, Xanthan Gum (not applicable), Salt, Glucono-delta-lactone.",
		"Egg White, Xanthan Gum, Salt, Glucono-delta-lactone.",
	],
);

# Run cut_ingredients_text_for_lang() on every case in @lists and compare the
# result with the expected text. The comparison is case-insensitive: both
# sides are lower-cased before being handed to is().
foreach my $test_ref (@lists) {
	# Entry layout: [language code, input text, expected output]
	my ($lc, $ingredients_text_from_image, $expected) = @{$test_ref};
	my $cut_ingredients_text_from_image = cut_ingredients_text_for_lang($ingredients_text_from_image, $lc);

	# note() keeps the trace inside the TAP stream (shown in verbose mode only)
	# instead of writing unconditionally to STDERR
	note("input from the picture extraction (ingredients list ($lc)): $ingredients_text_from_image");
	note("cut_ingredients_text_from_image (result from sub routine): $cut_ingredients_text_from_image");

	# Give each assertion a description so a failing case can be identified,
	# and report the original input through diag() only when it fails
	is(lc($cut_ingredients_text_from_image), lc($expected), "cut_ingredients_text_for_lang ($lc): $expected")
		or diag("Original ingredients: $ingredients_text_from_image ($lc)");
}

done_testing();
Loading