diff --git a/README.md b/README.md index cdfea03..7347c61 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,15 @@ # Python Project Template +``` +make virtualenv +source .venv/bin/activate +``` + +``` +make fmt +``` + A low dependency and really simple to start project template for Python Projects. See also diff --git a/neptun_webscraper/spiders/dockerhub.py b/neptun_webscraper/spiders/dockerhub.py index 654dd18..523213b 100644 --- a/neptun_webscraper/spiders/dockerhub.py +++ b/neptun_webscraper/spiders/dockerhub.py @@ -8,7 +8,8 @@ from scrapy.selector import Selector from scrapy_playwright.page import PageMethod -# playwright (needed for JS support, if websites load content dynamically lazy) Settings +# playwright Settings +# needed for JS support, if websites load content dynamically lazy # npm i playwright@1.44.1 --global, playwright install SCRAPY_SETTINGS = { @@ -77,10 +78,14 @@ def start_requests(self): playwright_include_page=True, playwright_page_methods=[ PageMethod("wait_for_selector", "div#searchResults"), - # PageMethod("click", selector="button#onetrust-reject-all-handler", ), # timeouts, if more than one page to crawl + # timeouts, if more than one page to crawl + # PageMethod( + # "click", + # selector="button#onetrust-reject-all-handler", + # ), PageMethod( "screenshot", - path=f"{self.output_dir}/screenshot_page_{index}.png", + path=f"{self.output_dir}/screenshot_page_{index}.png", # noqa: E501 full_page=True, ), ], @@ -123,7 +128,6 @@ def parse(self, response): yield {"page_number": page_number, "items": items} def parse_result(self, result): - # extracts name, description, uploader, chips, downloads, stars, last update, pulls last week item = {} # Extract name @@ -148,7 +152,7 @@ def parse_result(self, result): if update_elem: item["last_update"] = parse_update_string(update_elem.strip()) desc_elem = result.xpath( - './/span[contains(text(), "Updated")]/ancestor::div[1]/following-sibling::p[1]/text()' + './/span[contains(text(), 
"Updated")]/ancestor::div[1]/following-sibling::p[1]/text()' # noqa: E501 ).get() item["description"] = desc_elem.strip() if desc_elem else None else: @@ -179,7 +183,7 @@ def parse_result(self, result): # Extract stars stars_elem = result.xpath( - '//svg[@data-testid="StarOutlineIcon"]/following-sibling::span/strong/text()' + '//svg[@data-testid="StarOutlineIcon"]/following-sibling::span/strong/text()' # noqa: E501 ).get() item["stars"] = stars_elem.strip() if stars_elem else None @@ -192,210 +196,5 @@ def close(self, reason): with open(filename, "w") as f: json.dump(items, f, indent=2) self.logger.info( - f"Items from page {page_number} have been written to {filename}" + f"Items from page {page_number} have been written to {filename}" # noqa: E501 ) - - -""" -SEARCH RESULT FORMAT (30.6.2024): - -
1B+
-Python is an interpreted, interactive, object-oriented, open-source programming language.
-100M+
-1B+
+Python is an interpreted, + interactive, object-oriented, open-source programming language.
+100M+
+