From 8e50420254f8c97847a50c591aaa876c786791ac Mon Sep 17 00:00:00 2001
From: kreativmonkey
Date: Thu, 13 Aug 2020 09:14:14 +0200
Subject: [PATCH 01/13] add first version of mainz parser

---
 config.py        |   1 +
 parsers/mainz.py | 201 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 202 insertions(+)
 create mode 100644 parsers/mainz.py

diff --git a/config.py b/config.py
index f7c281de..374f3abb 100644
--- a/config.py
+++ b/config.py
@@ -16,6 +16,7 @@
     'karlsruhe',
     'leipzig',
     'magdeburg',
+    'mainz',
     'muenchen',
     'marburg',
     'niederbayern_oberpfalz',
diff --git a/parsers/mainz.py b/parsers/mainz.py
new file mode 100644
index 00000000..22236183
--- /dev/null
+++ b/parsers/mainz.py
@@ -0,0 +1,201 @@
+from urllib.request import urlopen
+from bs4 import BeautifulSoup as parse
+from bs4.element import Tag
+import re
+import datetime
+import os.path as path
+
+from utils import Parser
+from pyopenmensa.feed import OpenMensaCanteen
+
+
+day_regex = re.compile('(?P<date>\d{4}-\d{2}-\d{2})')
+price_regex = re.compile('(?P<price>\d+[,.]\d{2}) ?€')
+notes_regex = re.compile('\[(?:(([A-Za-z0-9]+),?)+)\]$')
+extract_legend = re.compile('\((\w+,?)+\)')
+extract_legend_notes = re.compile('(?:([A-Za-z0-9]+))')
+extract_notes_regex = re.compile('(?:([A-Za-z0-9]+)[,|\)])')
+
+canteenLegend = {
+    # API Extraction: https://github.com/kreativmonkey/jgu-mainz-openmensa/issues/1
+    '0' : 'all',
+    '1' : 'Zentralmensa',
+    '2' : 'Mensa-Georg-Foster',
+    '3' : 'Cafe-Rewi',
+    '4' : 'Mensa-Bingen',
+    '5' : 'Mensa-K3',
+    '6' : 'Mensa-Holzstraße',
+    '7' : 'Mensarium',
+    '8' : 'Cafe-Bingen-Rochusberg',
+    '9' : 'Mensablitz'
+}
+
+display = {
+    '2' : 'Aktuelle Woche',
+    '3' : 'Nächste Woche'
+}
+
+roles = ('student', 'other', 'employee')
+
+extraLegend = {
+    # Source: https://www.studierendenwerk-mainz.de/essentrinken/speiseplan/
+    '1': 'mit Farbstoff',
+    '2': 'mit Konservierungsstoff',
+    '3': 'mit Antioxidationsmittel',
+    '4': 'mit Geschmacksverstärker',
+    '5': 'geschwefelt',
+    '6': 'geschwärzt',
+    '7': 'gewachst',
+    '8': 'Phosphat',
+    '9': 'mit Süßungsmitteln',
+    '10': 'enthält eine Phenylalaninquelle',
+    'S' : 'Schweinefleisch',
+    'G' : 'Geflügelfleisch',
+    'R' : 'Rindfleisch',
+    'Gl' : 'Gluten',
+    'We' : 'Weizen (inkl. Dinkel)',
+    'Ro' : 'Roggen',
+    'Ge' : 'Gerste',
+    'Haf': 'Hafer',
+    'Kr' : 'Krebstiere und Krebstiererzeugnisse',
+    'Ei' : 'Eier und Eiererzeugnisse',
+    'Fi' : 'Fisch und Fischerzeugnisse',
+    'En' : 'Erdnüsse und Erdnusserzeugnisse',
+    'So' : 'Soja und Sojaerzeugnisse',
+    'La' : 'Milch und Milcherzeugnisse',
+    'Sl' : 'Sellerie und Sellerieerzeugnisse',
+    'Sf' : 'Senf und Senferzeugnisse',
+    'Se' : 'Sesamsamen und Sesamsamenerzeugnisse',
+    'Sw' : 'Schwefeldioxid und Sulfite > 10mg/kg',
+    'Lu' : 'Lupine und Lupinerzeugnisse',
+    'Wt' : 'Weichtiere und Weichtiererzeugnisse',
+    'Nu' : 'Schalenfrüchte',
+    'Man': 'Mandel',
+    'Has': 'Haselnüsse',
+    'Wa' : 'Walnüsse',
+    'Ka' : 'Kaschunüsse',
+    'Pe' : 'Pecanüsse',
+    'Pa' : 'Paranüsse',
+    'Pi' : 'Pistatien',
+    'Mac': 'Macadamianüsse',
+    'icon:S.png' : 'Scheinefleisch',
+    'icon:R.png' : 'Rindfleisch',
+    'icon:Fi.png' : 'Fisch',
+    'icon:Gl.png' : 'Glutenfrei',
+    'icon:La.png' : 'Lactosefrei',
+    'icon:Vegan.png' : 'Vegan',
+    'icon:Veggi.png' : 'Vegetarisch'
+
+}
+
+def build_meal_name(meal):
+    # Name des Gerichts
+    name = str(meal).strip()
+    # Remove the notes from Mealname and delete unnecessary spaces
+    name = ' '.join(re.sub(r'\((\w+,?)+\)', '', name).split())
+    if len(name) > 250:
+        name = name[:245] + '...'
+
+    return name
+
+def extract_meal_notes(meal):
+    # extracting the legend
+    legpart = extract_legend.findall(str(meal).strip())
+    legend = []
+    for l in legpart:
+        legend.extend(extract_legend_notes.findall(l))
+
+    notes = set()
+    for l in legend:
+        if extraLegend[l]:
+            notes.add(extraLegend[l])
+
+    return notes
+
+def build_meal_notes(meal):
+    notes = set()
+
+    for icon in meal.find_all('img'):
+        #
+        #
+        if "icon:"+path.basename(icon['src']) in extraLegend:
+            notes.add(extraLegend["icon:"+path.basename(icon['src'])])
+
+    # extracting the legend
+    notes.update(extract_meal_notes(meal.find('div', class_="speiseplanname").string))
+
+    return list(notes)
+
+def build_meal_price(meal):
+    # Preis aus v extrahieren
+    # 3,40 € / 5,65 €
+    meal_prices = {}
+
+    prices = price_regex.findall(str(meal))
+    # s = student
+    # g = other
+    # m = employee
+    meal_prices["student"] = prices[0].replace(',', '.')
+    meal_prices["employee"] = prices[1].replace(',', '.')
+    meal_prices["other"] = prices[1].replace(',', '.')
+
+    return meal_prices
+
+def build_meal_date(meal):
+    # Print the String of Date
+    # Format: Montag, 12. August 2020
+    # Output: 12. August 2020
+
+    return meal
+
+
+def parse_data(canteen, data):
+    for v in data.find_all('div'):
+        if not v.has_attr('class'):
+            continue
+
+        if v['class'][0] == 'speiseplan_date':
+            date = build_meal_date(str(v.string).strip())
+
+        if v['class'][0] == 'speiseplan_bldngall_name':
+            # Get Mensa Name
+            canteen_name = str(v.string).strip()
+
+        if v['class'][0] == 'speiseplancounter':
+            # Get Counter
+            counter_name = str(v.string).strip()
+
+        if v['class'][0] == 'menuspeise':
+            # Name des Gerichts
+            meal_name = build_meal_name(v.find('div', class_="speiseplanname").string)
+            meal_notes = build_meal_notes(v)
+            meal_prices = build_meal_price(v)
+
+            canteen.addMeal(date, counter_name,
+                            meal_name, meal_notes, meal_prices)
+
+    return canteen
+
+
+def parse_url(url, today=False):
+    #base_data = load_base_data()
+
+    canteen = OpenMensaCanteen()
+
+    for d in display:
+        with urlopen(url + '&display_type=' + d) as resp:
+            resp = parse(resp.read().decode('utf-8', errors='ignore'), features='lxml')
+            speiseplan = resp.find('div', class_='speiseplan')
+
+            canteen = parse_data(canteen, speiseplan)
+
+
+    return canteen.toXMLFeed()
+
+parser = Parser('mainz',
+                handler=parse_url,
+                shared_prefix='https://www.studierendenwerk-mainz.de/speiseplan/frontend/index.php')
+
+for canteen in canteenLegend:
+    parser.define(canteenLegend[canteen], suffix='?building_id='+canteen)
+

From 6f4a3a8e5c6d5da7bb41aa0d509e9c31646bcede Mon Sep 17 00:00:00 2001
From: kreativmonkey
Date: Thu, 13 Aug 2020 09:17:50 +0200
Subject: [PATCH 02/13] change canteenLegend names to lowercase

---
 parsers/mainz.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/parsers/mainz.py b/parsers/mainz.py
index 22236183..5770e5d2 100644
--- a/parsers/mainz.py
+++ b/parsers/mainz.py
@@ -1,6 +1,5 @@
 from urllib.request import urlopen
 from bs4 import BeautifulSoup as parse
-from bs4.element import Tag
 import re
 import datetime
 import os.path as path
@@ -19,15 +18,15 @@
 canteenLegend = {
     # API Extraction: https://github.com/kreativmonkey/jgu-mainz-openmensa/issues/1
     '0' : 'all',
-    '1' : 'Zentralmensa',
-    '2' : 'Mensa-Georg-Foster',
-    '3' : 'Cafe-Rewi',
-    '4' : 'Mensa-Bingen',
-    '5' : 'Mensa-K3',
-    '6' : 'Mensa-Holzstraße',
-    '7' : 'Mensarium',
-    '8' : 'Cafe-Bingen-Rochusberg',
-    '9' : 'Mensablitz'
+    '1' : 'zentralmensa',
+    '2' : 'mensa-georg-foster',
+    '3' : 'cafe-rewi',
+    '4' : 'mensa-bingen',
+    '5' : 'mensa-K3',
+    '6' : 'mensa-holzstraße',
+    '7' : 'mensarium',
+    '8' : 'cafe-bingen-rochusberg',
+    '9' : 'mensablitz'
 }
 
 display = {

From db748c0b6b342ed66bc3de80cbafd3f552367b1f Mon Sep 17 00:00:00 2001
From: Kreativmonkey
Date: Thu, 13 Aug 2020 15:32:13 +0200
Subject: [PATCH 03/13] direct extraction from the legend at mealname

---
 parsers/mainz.py | 65 ++++++++++++++++++++----------------------------
 1 file changed, 27 insertions(+), 38 deletions(-)

diff --git a/parsers/mainz.py b/parsers/mainz.py
index 5770e5d2..688cf4e6 100644
--- a/parsers/mainz.py
+++ b/parsers/mainz.py
@@ -12,8 +12,7 @@
 price_regex = re.compile('(?P<price>\d+[,.]\d{2}) ?€')
 notes_regex = re.compile('\[(?:(([A-Za-z0-9]+),?)+)\]$')
 extract_legend = re.compile('\((\w+,?)+\)')
-extract_legend_notes = re.compile('(?:([A-Za-z0-9]+))')
-extract_notes_regex = re.compile('(?:([A-Za-z0-9]+)[,|\)])')
+extract_legend_notes = re.compile('(?<=[\(,])(\w{1,2})')
 
 canteenLegend = {
     # API Extraction: https://github.com/kreativmonkey/jgu-mainz-openmensa/issues/1
     '0' : 'all',
@@ -87,53 +86,43 @@
 }
 
-def build_meal_name(meal):
-    # Name des Gerichts
-    name = str(meal).strip()
-    # Remove the notes from Mealname and delete unnecessary spaces
-    name = ' '.join(re.sub(r'\((\w+,?)+\)', '', name).split())
+def build_meal_name(meal_name):
+    # There are the extras of the meal inside the meal name
+    # This will remove the extras and the unnecessary spaces
+    # Example: 6 gebackene Fischstäbchen (Gl,Fi,We) mit Reis und veganem Joghurt-Kräuter-Dip (3,Gl,So,Sf,Ge)
+    # Output: 6 gebackene Fischstäbchen mit Reis und veganem Joghurt-Kräuter-Dip
+    name = ' '.join(re.sub(r'\((\w+,?)+\)', '', str(meal_name)).split())
+
+    # Shorten the meal name to 250 characters
     if len(name) > 250:
         name = name[:245] + '...'
 
     return name
 
-def extract_meal_notes(meal):
-    # extracting the legend
-    legpart = extract_legend.findall(str(meal).strip())
-    legend = []
-    for l in legpart:
-        legend.extend(extract_legend_notes.findall(l))
-
-    notes = set()
-    for l in legend:
-        if extraLegend[l]:
-            notes.add(extraLegend[l])
-
-    return notes
-
 def build_meal_notes(meal):
-    notes = set()
-
-    for icon in meal.find_all('img'):
-        #
-        #
-        if "icon:"+path.basename(icon['src']) in extraLegend:
-            notes.add(extraLegend["icon:"+path.basename(icon['src'])])
-
-    # extracting the legend
-    notes.update(extract_meal_notes(meal.find('div', class_="speiseplanname").string))
-
-    return list(notes)
+    meal_name = str(meal.find('div', class_="speiseplanname").string).strip()
+    images = meal.find_all('img')
+
+    # Use a set for easy elimination of duplicates
+    notes = set()
+
+    # extracting the icons with spezial informations about the meal
+    # Example:
+    for icon in images:
+        if "icon:"+path.basename(icon['src']) in extraLegend:
+            notes.add(extraLegend["icon:"+path.basename(icon['src'])])
+
+    for l in extract_legend_notes.findall(meal_name):
+        if extraLegend[l]:
+            notes.add(extraLegend[l])
+
+    return list(notes)
 
 def build_meal_price(meal):
-    # Preis aus v extrahieren
-    # 3,40 € / 5,65 €
     meal_prices = {}
 
     prices = price_regex.findall(str(meal))
-    # s = student
-    # g = other
-    # m = employee
+    # The pricing for employee and others are the same!
     meal_prices["student"] = prices[0].replace(',', '.')
     meal_prices["employee"] = prices[1].replace(',', '.')
     meal_prices["other"] = prices[1].replace(',', '.')
 
     return meal_prices

From 1e941ef1579b581d9ccfd3bb9527fa04e7ebf0a6 Mon Sep 17 00:00:00 2001
From: Kreativmonkey
Date: Thu, 13 Aug 2020 15:46:09 +0200
Subject: [PATCH 04/13] code cleanup

---
 parsers/mainz.py | 32 ++++++++++----------------------
 1 file changed, 10 insertions(+), 22 deletions(-)

diff --git a/parsers/mainz.py b/parsers/mainz.py
index 688cf4e6..93528325 100644
--- a/parsers/mainz.py
+++ b/parsers/mainz.py
@@ -8,13 +8,13 @@
 from pyopenmensa.feed import OpenMensaCanteen
 
 
-day_regex = re.compile('(?P<date>\d{4}-\d{2}-\d{2})')
+day_regex = re.compile('(\d{2}.\s\w+\s\d{4})')
 price_regex = re.compile('(?P<price>\d+[,.]\d{2}) ?€')
 notes_regex = re.compile('\[(?:(([A-Za-z0-9]+),?)+)\]$')
 extract_legend = re.compile('\((\w+,?)+\)')
 extract_legend_notes = re.compile('(?<=[\(,])(\w{1,2})')
 
-canteenLegend = {
+canteens = {
     # API Extraction: https://github.com/kreativmonkey/jgu-mainz-openmensa/issues/1
     '0' : 'all',
     '1' : 'zentralmensa',
@@ -86,14 +86,14 @@
 }
 
-def build_meal_name(meal_name):
+def build_meal_name(meal):
     # There are the extras of the meal inside the meal name
     # This will remove the extras and the unnecessary spaces
     # Example: 6 gebackene Fischstäbchen (Gl,Fi,We) mit Reis und veganem Joghurt-Kräuter-Dip (3,Gl,So,Sf,Ge)
     # Output: 6 gebackene Fischstäbchen mit Reis und veganem Joghurt-Kräuter-Dip
-    name = ' '.join(re.sub(r'\((\w+,?)+\)', '', str(meal_name)).split())
+    name = ' '.join(re.sub(r'\((\w+,?)+\)', '', str(meal)).split())
 
-    # Shorten the meal name to 250 characters
+    # Shorten the meal name to 250 characters like the api specification: https://doc.openmensa.org/feed/v2/#name
     if len(name) > 250:
         name = name[:245] + '...'
 
     return name
@@ -128,14 +128,6 @@
     meal_prices["other"] = prices[1].replace(',', '.')
 
     return meal_prices
-
-def build_meal_date(meal):
-    # Print the String of Date
-    # Format: Montag, 12. August 2020
-    # Output: 12. August 2020
-
-    return meal
-
 
 def parse_data(canteen, data):
     for v in data.find_all('div'):
@@ -143,18 +135,15 @@
             continue
 
         if v['class'][0] == 'speiseplan_date':
-            date = build_meal_date(str(v.string).strip())
+            date = day_regex.findall(str(v.string).strip())[0]
 
         if v['class'][0] == 'speiseplan_bldngall_name':
-            # Get Mensa Name
             canteen_name = str(v.string).strip()
 
         if v['class'][0] == 'speiseplancounter':
-            # Get Counter
             counter_name = str(v.string).strip()
 
         if v['class'][0] == 'menuspeise':
-            # Name des Gerichts
             meal_name = build_meal_name(v.find('div', class_="speiseplanname").string)
             meal_notes = build_meal_notes(v)
             meal_prices = build_meal_price(v)
@@ -166,10 +155,9 @@
 
 def parse_url(url, today=False):
-    #base_data = load_base_data()
-
-    canteen = OpenMensaCanteen()
-
+    canteen = OpenMensaCanteen()
+
+    # There are two displays one for the current and one for the next week
     for d in display:
         with urlopen(url + '&display_type=' + d) as resp:
             resp = parse(resp.read().decode('utf-8', errors='ignore'), features='lxml')
@@ -184,6 +172,6 @@
                 handler=parse_url,
                 shared_prefix='https://www.studierendenwerk-mainz.de/speiseplan/frontend/index.php')
 
-for canteen in canteenLegend:
-    parser.define(canteenLegend[canteen], suffix='?building_id='+canteen)
+for canteen in canteens:
+    parser.define(canteens[canteen], suffix='?building_id='+canteen)

From 5a158212773285a269ba6aea706d54dcc7b368c9 Mon Sep 17 00:00:00 2001
From: Kreativmonkey
Date: Thu, 13 Aug 2020 16:03:43 +0200
Subject: [PATCH 05/13] prevent for legend not in list

---
 parsers/mainz.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/parsers/mainz.py b/parsers/mainz.py
index 93528325..0b3baa06 100644
--- a/parsers/mainz.py
+++ b/parsers/mainz.py
@@ -113,7 +113,7 @@ def build_meal_notes(meal):
             notes.add(extraLegend["icon:"+path.basename(icon['src'])])
 
     for l in extract_legend_notes.findall(meal_name):
-        if extraLegend[l]:
+        if l in extraLegend:
             notes.add(extraLegend[l])

From 9193625922d65b2d597f6cd0a6a36063a2786b50 Mon Sep 17 00:00:00 2001
From: Kreativmonkey
Date: Sat, 15 Aug 2020 22:34:04 +0200
Subject: [PATCH 06/13] updated to Y0hy0h suggestions and code linting

---
 parsers/mainz.py | 179 ++++++++++++++++++++++++-----------------------
 1 file changed, 92 insertions(+), 87 deletions(-)

diff --git a/parsers/mainz.py b/parsers/mainz.py
index 0b3baa06..d1276dd4 100644
--- a/parsers/mainz.py
+++ b/parsers/mainz.py
@@ -8,11 +8,11 @@
 from pyopenmensa.feed import OpenMensaCanteen
 
 
-day_regex = re.compile('(\d{2}.\s\w+\s\d{4})')
-price_regex = re.compile('(?P<price>\d+[,.]\d{2}) ?€')
-notes_regex = re.compile('\[(?:(([A-Za-z0-9]+),?)+)\]$')
-extract_legend = re.compile('\((\w+,?)+\)')
-extract_legend_notes = re.compile('(?<=[\(,])(\w{1,2})')
+day_regex = re.compile(r'(\d{2}.\s\w+\s\d{4})')
+price_regex = re.compile(r'(?P<price>\d+[,.]\d{2}) ?€')
+notes_regex = re.compile(r'\[(?:(([A-Za-z0-9]+),?)+)\]$')
+extract_legend = re.compile(r'\((\w+,?)+\)')
+extract_legend_notes = re.compile(r'(?<=[\(,])(\w{1,2})')
 
 canteens = {
     # API Extraction: https://github.com/kreativmonkey/jgu-mainz-openmensa/issues/1
     '0' : 'all',
@@ -36,54 +36,56 @@
 roles = ('student', 'other', 'employee')
 
 extraLegend = {
-    # Source: https://www.studierendenwerk-mainz.de/essentrinken/speiseplan/
+    # Source: https://www.studierendenwerk-mainz.de/essen-trinken/speiseplan
+    '1': 'mit Farbstoff',
+    '2': 'mit Konservierungsstoff',
+    '3': 'mit Antioxidationsmittel',
+    '4': 'mit Geschmacksverstärker',
+    '5': 'geschwefelt',
+    '6': 'geschwärzt',
+    '7': 'gewachst',
+    '8': 'Phosphat',
+    '9': 'mit Süßungsmitteln',
+    '10': 'enthält eine Phenylalaninquelle',
+    'S' : 'Schweinefleisch',
+    'G' : 'Geflügelfleisch',
+    'R' : 'Rindfleisch',
+    'Gl' : 'Gluten',
+    'We' : 'Weizen (inkl. Dinkel)',
+    'Ro' : 'Roggen',
+    'Ge' : 'Gerste',
+    'Haf': 'Hafer',
+    'Kr' : 'Krebstiere und Krebstiererzeugnisse',
+    'Ei' : 'Eier und Eiererzeugnisse',
+    'Fi' : 'Fisch und Fischerzeugnisse',
+    'En' : 'Erdnüsse und Erdnusserzeugnisse',
+    'So' : 'Soja und Sojaerzeugnisse',
+    'La' : 'Milch und Milcherzeugnisse',
+    'Sl' : 'Sellerie und Sellerieerzeugnisse',
+    'Sf' : 'Senf und Senferzeugnisse',
+    'Se' : 'Sesamsamen und Sesamsamenerzeugnisse',
+    'Sw' : 'Schwefeldioxid und Sulfite > 10mg/kg',
+    'Lu' : 'Lupine und Lupinerzeugnisse',
+    'Wt' : 'Weichtiere und Weichtiererzeugnisse',
+    'Nu' : 'Schalenfrüchte',
+    'Man': 'Mandel',
+    'Has': 'Haselnüsse',
+    'Wa' : 'Walnüsse',
+    'Ka' : 'Kaschunüsse',
+    'Pe' : 'Pecanüsse',
+    'Pa' : 'Paranüsse',
+    'Pi' : 'Pistatien',
+    'Mac': 'Macadamianüsse',
+}
+
+iconLegend = {
+    'icon:S.png' : 'Scheinefleisch',
+    'icon:R.png' : 'Rindfleisch',
+    'icon:Fi.png' : 'Fisch',
+    'icon:Gl.png' : 'Glutenfrei',
+    'icon:La.png' : 'Lactosefrei',
+    'icon:Vegan.png' : 'Vegan',
+    'icon:Veggi.png' : 'Vegetarisch'
 }
 
 def build_meal_name(meal):
     # There are the extras of the meal inside the meal name
     # This will remove the extras and the unnecessary spaces
     # Example: 6 gebackene Fischstäbchen (Gl,Fi,We) mit Reis und veganem Joghurt-Kräuter-Dip (3,Gl,So,Sf,Ge)
     # Output: 6 gebackene Fischstäbchen mit Reis und veganem Joghurt-Kräuter-Dip
-    name = ' '.join(re.sub(r'\((\w+,?)+\)', '', str(meal)).split())
-
+    name = ' '.join(re.sub(r'\((\w+,?)+\)', '', str(meal)).split())
+
-    # Shorten the meal name to 250 characters like the api specification: https://doc.openmensa.org/feed/v2/#name
-    if len(name) > 250:
-        name = name[:245] + '...'
+    # Shorten the meal name to 250 characters like the api specification: https://doc.openmensa.org/feed/v2/#name
+    if len(name) > 250:
+        name = name[:245] + '...'
+
+    return name
 
 def build_meal_notes(meal):
     meal_name = str(meal.find('div', class_="speiseplanname").string).strip()
     images = meal.find_all('img')
 
     # Use a set for easy elimination of duplicates
     notes = set()
 
-    # extracting the icons with spezial informations about the meal
+    # Extracting the icons with special informations about the meal
     # Example:
     for icon in images:
-        if "icon:"+path.basename(icon['src']) in extraLegend:
-            notes.add(extraLegend["icon:"+path.basename(icon['src'])])
+        icon_name = path.basename(icon['src'])
+        if icon_name in iconLegend:
+            notes.add(iconLegend[icon_name])
 
-    for l in extract_legend_notes.findall(meal_name):
-        if l in extraLegend:
-            notes.add(extraLegend[l])
+    for extra in extract_legend_notes.findall(meal_name):
+        if extra in extraLegend:
+            notes.add(extraLegend[extra])
 
     return list(notes)
 
 def build_meal_price(meal):
     meal_prices = {}
 
     prices = price_regex.findall(str(meal))
     # The pricing for employee and others are the same!
     meal_prices["student"] = prices[0].replace(',', '.')
     meal_prices["employee"] = prices[1].replace(',', '.')
     meal_prices["other"] = prices[1].replace(',', '.')
 
     return meal_prices
-
+    
 def parse_data(canteen, data):
-    for v in data.find_all('div'):
-        if not v.has_attr('class'):
-            continue
-
-        if v['class'][0] == 'speiseplan_date':
-            date = day_regex.findall(str(v.string).strip())[0]
-
-        if v['class'][0] == 'speiseplan_bldngall_name':
-            canteen_name = str(v.string).strip()
-
-        if v['class'][0] == 'speiseplancounter':
-            counter_name = str(v.string).strip()
-
-        if v['class'][0] == 'menuspeise':
-            meal_name = build_meal_name(v.find('div', class_="speiseplanname").string)
-            meal_notes = build_meal_notes(v)
-            meal_prices = build_meal_price(v)
-
-            canteen.addMeal(date, counter_name,
-                            meal_name, meal_notes, meal_prices)
+    # We assume that the `div`s appear in a certain order and will associate each meal to the previously encountered date and category.
+    for v in data.find_all('div'):
+        if not v.has_attr('class'):
+            continue
+
+        if v['class'][0] == 'speiseplan_date':
+            date = day_regex.findall(str(v.string).strip())[0]
+
+        if v['class'][0] == 'speiseplan_bldngall_name':
+            canteen_name = str(v.string).strip()
+
+        if v['class'][0] == 'speiseplancounter':
+            # Save the countername as category to list meals by counter
+            category = str(v.string).strip()
+
+        if v['class'][0] == 'menuspeise':
+            meal_name = build_meal_name(v.find('div', class_="speiseplanname").string)
+            meal_notes = build_meal_notes(v)
+            meal_prices = build_meal_price(v)
+
+            canteen.addMeal(date, category,
+                            meal_name, meal_notes, meal_prices)
 
-    return canteen
+    return canteen

From 0e66bc274d429f70f3d8acf431e67b68eac7ce94 Mon Sep 17 00:00:00 2001
From: Kreativmonkey
Date: Sat, 15 Aug 2020 22:39:07 +0200
Subject: [PATCH 07/13] fixing iconLegend icon: prefix

---
 parsers/mainz.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/parsers/mainz.py b/parsers/mainz.py
index d1276dd4..6765f3a7 100644
--- a/parsers/mainz.py
+++ b/parsers/mainz.py
@@ -79,13 +79,13 @@
 }
 
 iconLegend = {
-    'icon:S.png' : 'Scheinefleisch',
-    'icon:R.png' : 'Rindfleisch',
-    'icon:Fi.png' : 'Fisch',
-    'icon:Gl.png' : 'Glutenfrei',
-    'icon:La.png' : 'Lactosefrei',
-    'icon:Vegan.png' : 'Vegan',
-    'icon:Veggi.png' : 'Vegetarisch'
+    'S.png' : 'Scheinefleisch',
+    'R.png' : 'Rindfleisch',
+    'Fi.png' : 'Fisch',
+    'Gl.png' : 'Glutenfrei',
+    'La.png' : 'Lactosefrei',
+    'Vegan.png' : 'Vegan',
+    'Veggi.png' : 'Vegetarisch'
 }

From 741788824529802cfcef9a19fdbfb0119703f166 Mon Sep 17 00:00:00 2001
From: Kreativmonkey
Date: Fri, 21 Aug 2020 08:03:00 +0200
Subject: [PATCH 08/13] update to klemens suggestions

---
 parsers/mainz.py | 152 +++++++++++++++++++++--------------------------
 1 file changed, 68 insertions(+), 84 deletions(-)

diff --git a/parsers/mainz.py b/parsers/mainz.py
index 6765f3a7..5e25c1c4 100644
--- a/parsers/mainz.py
+++ b/parsers/mainz.py
@@ -1,22 +1,16 @@
-from urllib.request import urlopen
-from bs4 import BeautifulSoup as parse
 import re
-import datetime
 import os.path as path
+from urllib.request import urlopen
 
-from utils import Parser
-from pyopenmensa.feed import OpenMensaCanteen
+from bs4 import BeautifulSoup as parse
 
+from utils import Parser
+from pyopenmensa.feed import LazyBuilder, extractNotes
 
-day_regex = re.compile(r'(\d{2}.\s\w+\s\d{4})')
 price_regex = re.compile(r'(?P<price>\d+[,.]\d{2}) ?€')
-notes_regex = re.compile(r'\[(?:(([A-Za-z0-9]+),?)+)\]$')
-extract_legend = re.compile(r'\((\w+,?)+\)')
-extract_legend_notes = re.compile(r'(?<=[\(,])(\w{1,2})')
 
 canteens = {
     # API Extraction: https://github.com/kreativmonkey/jgu-mainz-openmensa/issues/1
-    '0' : 'all',
     '1' : 'zentralmensa',
     '2' : 'mensa-georg-foster',
     '3' : 'cafe-rewi',
     '4' : 'mensa-bingen',
     '5' : 'mensa-K3',
     '6' : 'mensa-holzstraße',
     '7' : 'mensarium',
     '8' : 'cafe-bingen-rochusberg',
     '9' : 'mensablitz'
-}
+    }
 
 display = {
     '2' : 'Aktuelle Woche',
     '3' : 'Nächste Woche'
-}
-
-roles = ('student', 'other', 'employee')
+    }
 
 extraLegend = {
     # Source: https://www.studierendenwerk-mainz.de/essen-trinken/speiseplan
@@ -76,107 +68,99 @@
     'Pa' : 'Paranüsse',
     'Pi' : 'Pistatien',
     'Mac': 'Macadamianüsse',
-}
+    }
 
 iconLegend = {
-    'S.png' : 'Scheinefleisch',
-    'R.png' : 'Rindfleisch',
-    'Fi.png' : 'Fisch',
+    # Removed parts are in the extrasLegend!
+    #'S.png' : 'Scheinefleisch',
+    #'R.png' : 'Rindfleisch',
+    #'Fi.png' : 'Fisch',
     'Gl.png' : 'Glutenfrei',
     'La.png' : 'Lactosefrei',
     'Vegan.png' : 'Vegan',
     'Veggi.png' : 'Vegetarisch'
-}
-
-def build_meal_name(meal):
-    # There are the extras of the meal inside the meal name
-    # This will remove the extras and the unnecessary spaces
-    # Example: 6 gebackene Fischstäbchen (Gl,Fi,We) mit Reis und veganem Joghurt-Kräuter-Dip (3,Gl,So,Sf,Ge)
-    # Output: 6 gebackene Fischstäbchen mit Reis und veganem Joghurt-Kräuter-Dip
-    name = ' '.join(re.sub(r'\((\w+,?)+\)', '', str(meal)).split())
-
-    # Shorten the meal name to 250 characters like the api specification: https://doc.openmensa.org/feed/v2/#name
-    if len(name) > 250:
-        name = name[:245] + '...'
-
-    return name
-
-def build_meal_notes(meal):
-    meal_name = str(meal.find('div', class_="speiseplanname").string).strip()
+    }
+
+def get_icon_notes(meal):
     images = meal.find_all('img')
 
-    # Use a set for easy elimination of duplicates
-    notes = set()
+    notes = []
 
     # Extracting the icons with special informations about the meal
     # Example:
     for icon in images:
         icon_name = path.basename(icon['src'])
         if icon_name in iconLegend:
-            notes.add(iconLegend[icon_name])
+            notes.append(iconLegend[icon_name])
 
-    for extra in extract_legend_notes.findall(meal_name):
-        if extra in extraLegend:
-            notes.add(extraLegend[extra])
+    return notes
 
-    return list(notes)
-
 def build_meal_price(meal):
-    meal_prices = {}
-
-    prices = price_regex.findall(str(meal))
+    meal_prices = {}
+
+    prices = price_regex.findall(str(meal))
     # The pricing for employee and others are the same!
-    meal_prices["student"] = prices[0].replace(',', '.')
-    meal_prices["employee"] = prices[1].replace(',', '.')
-    meal_prices["other"] = prices[1].replace(',', '.')
-
-    return meal_prices
-
-def parse_data(canteen, data):
+    meal_prices["student"] = prices[0].replace(',', '.')
+    meal_prices["employee"] = prices[1].replace(',', '.')
+    meal_prices["other"] = prices[1].replace(',', '.')
+
+    return meal_prices
+
+def parse_data(canteen, data):
+    date = None
+    category = None
+
     # We assume that the `div`s appear in a certain order and will associate each meal to the previously encountered date and category.
     for v in data.find_all('div'):
         if not v.has_attr('class'):
             continue
 
-        if v['class'][0] == 'speiseplan_date':
-            date = day_regex.findall(str(v.string).strip())[0]
+        if 'speiseplan_date' in v['class']:
+            date = str(v.string).strip()
 
-        if v['class'][0] == 'speiseplan_bldngall_name':
+        if 'speiseplan_bldngall_name' in v['class']:
+            # used for the display type 'all'
             canteen_name = str(v.string).strip()
 
-        if v['class'][0] == 'speiseplancounter':
+        if 'speiseplancounter' in v['class']:
             # Save the countername as category to list meals by counter
             category = str(v.string).strip()
 
-        if v['class'][0] == 'menuspeise':
-            meal_name = build_meal_name(v.find('div', class_="speiseplanname").string)
-            meal_notes = build_meal_notes(v)
+        if 'menuspeise' in v['class']:
+            meal_name = str(v.find('div', class_="speiseplanname").string).strip()
+            meal_notes = [canteen_name] + get_icon_notes(v)
             meal_prices = build_meal_price(v)
 
-            canteen.addMeal(date, category,
-                            meal_name, meal_notes, meal_prices)
-
-    return canteen
-
-
-def parse_url(url, today=False):
-    canteen = OpenMensaCanteen()
-
-    # There are two displays one for the current and one for the next week
-    for d in display:
-        with urlopen(url + '&display_type=' + d) as resp:
-            resp = parse(resp.read().decode('utf-8', errors='ignore'), features='lxml')
-            speiseplan = resp.find('div', class_='speiseplan')
-
-            canteen = parse_data(canteen, speiseplan)
-
-
-    return canteen.toXMLFeed()
-
+            if date and category:
+                canteen.addMeal(date, category,
+                                meal_name, meal_notes, meal_prices)
+
+def parse_url(url, today):
+    canteen = LazyBuilder()
+    canteen.setLegendData(extraLegend)
+
+    # For today display:
+    if today:
+        with urlopen(url + '&display_type=1') as resp:
+            resp = parse(resp.read().decode('utf-8', errors='ignore'), features='lxml')
+            speiseplan = resp.find('div', class_='speiseplan')
+
+            parse_data(canteen, speiseplan)
+    # For week display:
+    else:
+        for d in display:
+            with urlopen(url + '&display_type=' + d) as resp:
+                resp = parse(resp.read().decode('utf-8', errors='ignore'), features='lxml')
+                speiseplan = resp.find('div', class_='speiseplan')
+
+                parse_data(canteen, speiseplan)
+
+    return canteen.toXMLFeed()
+
+# The shared_prefix is the page where the Website it self gets its data from.
 parser = Parser('mainz',
-    handler=parse_url,
-    shared_prefix='https://www.studierendenwerk-mainz.de/speiseplan/frontend/index.php')
+                handler=parse_url,
+                shared_prefix='https://www.studierendenwerk-mainz.de/speiseplan/frontend/index.php')
 
 for canteen in canteens:
-    parser.define(canteens[canteen], suffix='?building_id='+canteen)
-
+    parser.define(canteens[canteen], suffix='?building_id='+canteen)
\ No newline at end of file

From 52e5b9cf7c6c60bc30b2af16bbe4b89bf70aa271 Mon Sep 17 00:00:00 2001
From: Kreativmonkey
Date: Fri, 21 Aug 2020 10:28:13 +0200
Subject: [PATCH 09/13] add mainz to test url

---
 build_scripts/test-urls.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/build_scripts/test-urls.txt b/build_scripts/test-urls.txt
index 6c0bfe1f..0a4c8f83 100644
--- a/build_scripts/test-urls.txt
+++ b/build_scripts/test-urls.txt
@@ -8,6 +8,7 @@
 http://localhost:9090/thueringen/il-ehrenberg/full.xml
 http://localhost:9090/karlsruhe/adenauerring/full.xml
 http://localhost:9090/leipzig/liebigstrasse/full.xml
 http://localhost:9090/magdeburg/ovgu-unten/full.xml
+http://localhost:9090/mainz/mensa-georg-foster/full.xml
 http://localhost:9090/marburg/erlenring/full.xml
 http://localhost:9090/muenchen/leopoldstrasse/full.xml
 http://localhost:9090/niederbayern_oberpfalz/uni-regensburg/full.xml

From 0f09fcfb53f6849f13a8ef4e0892c2935533fa4b Mon Sep 17 00:00:00 2001
From: Kreativmonkey
Date: Sat, 22 Aug 2020 00:42:23 +0200
Subject: [PATCH 10/13] remove wrong commend

---
 parsers/mainz.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/parsers/mainz.py b/parsers/mainz.py
index 5e25c1c4..11ffe65e 100644
--- a/parsers/mainz.py
+++ b/parsers/mainz.py
@@ -8,6 +8,7 @@
 from pyopenmensa.feed import LazyBuilder, extractNotes
 
 price_regex = re.compile(r'(?P<price>\d+[,.]\d{2}) ?€')
+legend_regex = re.compile(r'(?P<name>(\w{1,3}))\s=\s(?P<value>\w+((\s+\w+)*))$')
 
 canteens = {
     # API Extraction: https://github.com/kreativmonkey/jgu-mainz-openmensa/issues/1
@@ -74,16 +75,18 @@
     # Removed parts are in the extrasLegend!
     #'S.png' : 'Scheinefleisch',
     #'R.png' : 'Rindfleisch',
-    'Fi.png' : 'Fisch',
+    'Fi.png' : 'Fisch',
+    'Lamm.png' : 'Enthält Lammfleisch',
+    'W.png' : 'Wildgericht',
     'Gl.png' : 'Glutenfrei',
     'La.png' : 'Lactosefrei',
     'Vegan.png' : 'Vegan',
-    'Veggi.png' : 'Vegetarisch'
+    'Veggi.png' : 'Vegetarisch',
+    'mensa-vital-small-2.png' : 'Mensa Vital'
     }
 
 def get_icon_notes(meal):
     images = meal.find_all('img')
-
     notes = []
 
     # Extracting the icons with special informations about the meal
@@ -119,7 +122,6 @@ def parse_data(canteen, data):
         if 'speiseplan_date' in v['class']:
             date = str(v.string).strip()
 
         if 'speiseplan_bldngall_name' in v['class']:
-            # used for the display type 'all'
            canteen_name = str(v.string).strip()
@@ -160,7 +162,7 @@
 # The shared_prefix is the page where the Website it self gets its data from.
 parser = Parser('mainz',
                 handler=parse_url,
-                shared_prefix='https://www.studierendenwerk-mainz.de/speiseplan/frontend/index.php')
+                shared_prefix='https://www.studierendenwerk-mainz.de/essen-trinken/speiseplan')
 
 for canteen in canteens:
     parser.define(canteens[canteen], suffix='?building_id='+canteen)
\ No newline at end of file

From 57b014c24b66953ddd3711ef946f69c1e86727cd Mon Sep 17 00:00:00 2001
From: Sebastian Preisner
Date: Wed, 25 Nov 2020 10:32:50 +0100
Subject: [PATCH 11/13] Resolving last issues

---
 parsers/mainz.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/parsers/mainz.py b/parsers/mainz.py
index 11ffe65e..80c91782 100644
--- a/parsers/mainz.py
+++ b/parsers/mainz.py
@@ -124,13 +124,10 @@ def parse_data(canteen, data):
 
         if 'speiseplan_bldngall_name' in v['class']:
             canteen_name = str(v.string).strip()
 
-        if 'speiseplancounter' in v['class']:
-            # Save the countername as category to list meals by counter
-            category = str(v.string).strip()
 
         if 'menuspeise' in v['class']:
             meal_name = str(v.find('div', class_="speiseplanname").string).strip()
-            meal_notes = [canteen_name] + get_icon_notes(v)
+            meal_notes = get_icon_notes(v)
             meal_prices = build_meal_price(v)
 
             if date and category:
@@ -165,4 +162,4 @@
 
 for canteen in canteens:
-    parser.define(canteens[canteen], suffix='?building_id='+canteen)
\ No newline at end of file
+    parser.define(canteens[canteen], suffix='?building_id='+canteen)

From 6af1617933789f9688e6ca0bd1fb494f4571ac52 Mon Sep 17 00:00:00 2001
From: Sebastian Preisner
Date: Wed, 2 Dec 2020 18:50:54 +0100
Subject: [PATCH 12/13] fix wrong deleted lines

---
 parsers/mainz.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/parsers/mainz.py b/parsers/mainz.py
index 80c91782..e9410af5 100644
--- a/parsers/mainz.py
+++ b/parsers/mainz.py
@@ -120,10 +120,10 @@ def parse_data(canteen, data):
 
         if 'speiseplan_date' in v['class']:
             date = str(v.string).strip()
-
-        if 'speiseplan_bldngall_name' in v['class']:
-            canteen_name = str(v.string).strip()
+        if 'speiseplancounter' in v['class']:
+            # Save the countername as category to list meals by counter
+            category = str(v.string).strip()
 
         if 'menuspeise' in v['class']:
             meal_name = str(v.find('div', class_="speiseplanname").string).strip()

From 70b1ff4f78b9b02a76eef2fa3ff565d52b9f87dc Mon Sep 17 00:00:00 2001
From: Sebastian Preisner
Date: Wed, 2 Dec 2020 18:52:43 +0100
Subject: [PATCH 13/13] fix indent

---
 parsers/mainz.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/parsers/mainz.py b/parsers/mainz.py
index e9410af5..395cff21 100644
--- a/parsers/mainz.py
+++ b/parsers/mainz.py
@@ -122,8 +122,8 @@ def parse_data(canteen, data):
             date = str(v.string).strip()
 
         if 'speiseplancounter' in v['class']:
-            # Save the countername as category to list meals by counter
-            category = str(v.string).strip()
+            # Save the countername as category to list meals by counter
+            category = str(v.string).strip()
 
         if 'menuspeise' in v['class']:
             meal_name = str(v.find('div', class_="speiseplanname").string).strip()