From 55093f1ee27029fbdba5b7b92ba7c8b1e955d217 Mon Sep 17 00:00:00 2001 From: Renne Rocha Date: Mon, 20 Nov 2023 21:42:00 -0300 Subject: [PATCH] =?UTF-8?q?Habilita=20Zyte=20Smart=20Proxy=20em=20Florian?= =?UTF-8?q?=C3=B3polis-SC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Spider funciona localmente, mas não funciona na Scrapy Cloud. Habilitando o Smart Proxy para evitar problemas de geolocalização - Ajuste de URL inicial para usar HTTPS ao invés de HTTP - Substituir métodos antigos (extract() e extract_first()) por get() e getall(), considerados o padrão do Scrapy --- .../gazette/spiders/sc/sc_florianopolis.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/data_collection/gazette/spiders/sc/sc_florianopolis.py b/data_collection/gazette/spiders/sc/sc_florianopolis.py index a47cf56f3..6dd188553 100644 --- a/data_collection/gazette/spiders/sc/sc_florianopolis.py +++ b/data_collection/gazette/spiders/sc/sc_florianopolis.py @@ -12,7 +12,6 @@ class ScFlorianopolisSpider(BaseGazetteSpider): name = "sc_florianopolis" TERRITORY_ID = "4205407" - start_date = date(2009, 6, 1) def start_requests(self): @@ -25,7 +24,7 @@ def start_requests(self): for year, month in periods_of_interest: data = dict(ano=str(year), mes=str(month), passo="1", enviar="") yield FormRequest( - "http://www.pmf.sc.gov.br/governo/index.php?pagina=govdiariooficial", + "https://www.pmf.sc.gov.br/governo/index.php?pagina=govdiariooficial", formdata=data, ) @@ -42,14 +41,16 @@ def parse(self, response): yield Gazette( date=gazette_date, edition_number=gazette_edition_number, - file_urls=(url,), + file_urls=[ + url, + ], is_extra_edition=self.is_extra(link), power="executive_legislative", ) @staticmethod def get_pdf_url(response, link): - relative_url = link.css("::attr(href)").extract_first() + relative_url = link.css("::attr(href)").get() if not relative_url.lower().endswith(".pdf"): return None @@ -57,7 +58,7 @@ def 
get_pdf_url(response, link): @staticmethod def get_date(link): - text = " ".join(link.css("::text").extract()) + text = " ".join(link.css("::text").getall()) pattern = r"\d{1,2}\s+de\s+\w+\s+de\s+\d{4}" match = re.search(pattern, text) if not match: @@ -67,5 +68,5 @@ def get_date(link): @staticmethod def is_extra(link): - text = " ".join(link.css("::text").extract()) + text = " ".join(link.css("::text").getall()) return "extra" in text.lower()