From 94b261701954004667518637afad65405d7a65bb Mon Sep 17 00:00:00 2001
From: James Addison <55152140+jayaddison@users.noreply.github.com>
Date: Mon, 6 May 2024 11:08:14 +0100
Subject: [PATCH] MarleySpoon: add precautionary check for unexpected API URLs.
(#1069)
---
recipe_scrapers/marleyspoon.py | 24 +++++++++++-
tests/legacy/test_data/faulty.testhtml | 6 +++
tests/legacy/test_data/relative_url.testhtml | 6 +++
tests/legacy/test_marleyspoon_invalid.py | 41 ++++++++++++++++++++
4 files changed, 75 insertions(+), 2 deletions(-)
create mode 100644 tests/legacy/test_data/faulty.testhtml
create mode 100644 tests/legacy/test_data/relative_url.testhtml
create mode 100644 tests/legacy/test_marleyspoon_invalid.py
diff --git a/recipe_scrapers/marleyspoon.py b/recipe_scrapers/marleyspoon.py
index 1325a201b..d0f44ca14 100644
--- a/recipe_scrapers/marleyspoon.py
+++ b/recipe_scrapers/marleyspoon.py
@@ -1,12 +1,13 @@
# mypy: disallow_untyped_defs=False
import json
import re
+from urllib.parse import urljoin
import requests
from ._abstract import HEADERS, AbstractScraper
-from ._exceptions import ElementNotFoundInHtml
-from ._utils import normalize_string
+from ._exceptions import ElementNotFoundInHtml, RecipeScrapersExceptions
+from ._utils import get_host_name, normalize_string
ID_PATTERN = re.compile(r"/(\d+)-")
SCRIPT_PATTERN = re.compile(
@@ -65,6 +66,25 @@ def _get_json_params(self):
if api_url is None or api_token is None:
raise ElementNotFoundInHtml("Required script not found.")
+ from . import SCRAPERS
+
+ scraper_name = self.__class__.__name__
+ try:
+ next_url = urljoin(self.url, api_url)
+ host_name = get_host_name(next_url)
+ next_scraper = type(None)
+ # check: api.foo.xx.example, foo.xx.example, xx.example
+ while host_name and host_name.count("."):
+ next_scraper = SCRAPERS.get(host_name)
+ if next_scraper:
+ break
+ _, host_name = host_name.split(".", 1)
+ if not isinstance(self, next_scraper):
+ msg = f"Attempted to scrape using {next_scraper} from {scraper_name}"
+ raise ValueError(msg)
+ except Exception as e:
+ raise RecipeScrapersExceptions(f"Unexpected API URL: {api_url}") from e
+
return api_url, api_token
@classmethod
diff --git a/tests/legacy/test_data/faulty.testhtml b/tests/legacy/test_data/faulty.testhtml
new file mode 100644
index 000000000..b0c4f2998
--- /dev/null
+++ b/tests/legacy/test_data/faulty.testhtml
@@ -0,0 +1,6 @@
+
+
+
+
diff --git a/tests/legacy/test_data/relative_url.testhtml b/tests/legacy/test_data/relative_url.testhtml
new file mode 100644
index 000000000..16650db42
--- /dev/null
+++ b/tests/legacy/test_data/relative_url.testhtml
@@ -0,0 +1,6 @@
+
+
+
+
diff --git a/tests/legacy/test_marleyspoon_invalid.py b/tests/legacy/test_marleyspoon_invalid.py
new file mode 100644
index 000000000..a8bde3b42
--- /dev/null
+++ b/tests/legacy/test_marleyspoon_invalid.py
@@ -0,0 +1,41 @@
+import unittest
+
+import responses
+
+from recipe_scrapers._exceptions import RecipeScrapersExceptions
+from recipe_scrapers.marleyspoon import MarleySpoon
+
+
+class TestFaultyAPIURLResponse(unittest.TestCase):
+    """Check how MarleySpoon reacts to pages with unusable API URLs."""
+
+    URL = "https://marleyspoon.de/menu/113813-glasierte-veggie-burger-mit-roestkartoffeln-und-apfel-gurken-salat"
+
+    def _mock_page(self, fixture):
+        """Register the named fixture as the mocked GET response for URL."""
+        path = f"tests/legacy/test_data/{fixture}"
+        # Explicit encoding: the default depends on the platform locale.
+        with open(path, encoding="utf-8") as fixture_file:
+            body = fixture_file.read()
+
+        responses.add(
+            method=responses.GET,
+            url=self.URL,
+            body=body,
+        )
+
+    @responses.activate
+    def test_faulty_response(self):
+        """The faulty fixture must be rejected with RecipeScrapersExceptions."""
+        self._mock_page("faulty.testhtml")
+        with self.assertRaises(RecipeScrapersExceptions):
+            MarleySpoon(url=self.URL)
+
+    @responses.activate
+    def test_relative_api_url(self):
+        """The relative-URL fixture must not scrape successfully."""
+        self._mock_page("relative_url.testhtml")
+        # Currently this raises a requests.exceptions.MissingSchema exception,
+        # so only a generic Exception is asserted for now.
+        with self.assertRaises(Exception):
+            MarleySpoon(url=self.URL)