From 55093f1ee27029fbdba5b7b92ba7c8b1e955d217 Mon Sep 17 00:00:00 2001 From: Renne Rocha Date: Mon, 20 Nov 2023 21:42:00 -0300 Subject: [PATCH] =?UTF-8?q?Habilita=20Zyte=20Smart=20Proxy=20em=20Florian?= =?UTF-8?q?=C3=B3polis-SC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Spider funciona localmente, mas não funciona na Scrapy Cloud. Habilitando o Smart Proxy para evitar problemas de geolocalização - Ajuste de URL inicial para usar HTTPS ao invés de HTTP - Substituir métodos antigos (extract() e extract_first()) por get() e getall(), considerados o padrão do Scrapy --- .../gazette/spiders/sc/sc_florianopolis.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/data_collection/gazette/spiders/sc/sc_florianopolis.py b/data_collection/gazette/spiders/sc/sc_florianopolis.py index a47cf56f3..6dd188553 100644 --- a/data_collection/gazette/spiders/sc/sc_florianopolis.py +++ b/data_collection/gazette/spiders/sc/sc_florianopolis.py @@ -12,7 +12,6 @@ class ScFlorianopolisSpider(BaseGazetteSpider): name = "sc_florianopolis" TERRITORY_ID = "4205407" - start_date = date(2009, 6, 1) def start_requests(self): @@ -25,7 +24,7 @@ def start_requests(self): for year, month in periods_of_interest: data = dict(ano=str(year), mes=str(month), passo="1", enviar="") yield FormRequest( - "http://www.pmf.sc.gov.br/governo/index.php?pagina=govdiariooficial", + "https://www.pmf.sc.gov.br/governo/index.php?pagina=govdiariooficial", formdata=data, ) @@ -42,14 +41,16 @@ def parse(self, response): yield Gazette( date=gazette_date, edition_number=gazette_edition_number, - file_urls=(url,), + file_urls=[ + url, + ], is_extra_edition=self.is_extra(link), power="executive_legislative", ) @staticmethod def get_pdf_url(response, link): - relative_url = link.css("::attr(href)").extract_first() + relative_url = link.css("::attr(href)").get() if not relative_url.lower().endswith(".pdf"): return None @@ -57,7 +58,7 @@ def 
get_pdf_url(response, link): @staticmethod def get_date(link): - text = " ".join(link.css("::text").extract()) + text = " ".join(link.css("::text").getall()) pattern = r"\d{1,2}\s+de\s+\w+\s+de\s+\d{4}" match = re.search(pattern, text) if not match: @@ -67,5 +68,5 @@ def get_date(link): @staticmethod def is_extra(link): - text = " ".join(link.css("::text").extract()) + text = " ".join(link.css("::text").getall()) return "extra" in text.lower()