From e961b66eabe010ef17514b6fc2cebbb8cee96fa1 Mon Sep 17 00:00:00 2001
From: trevineju
Date: Wed, 29 Jan 2025 10:50:07 -0300
Subject: [PATCH] =?UTF-8?q?Adapta=20base=20diof=20para=20acomodar=20outro?=
 =?UTF-8?q?=20padr=C3=A3o=20de=20URL?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 data_collection/gazette/spiders/base/diof.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/data_collection/gazette/spiders/base/diof.py b/data_collection/gazette/spiders/base/diof.py
index 2abacde37..96e668e9b 100644
--- a/data_collection/gazette/spiders/base/diof.py
+++ b/data_collection/gazette/spiders/base/diof.py
@@ -25,6 +25,7 @@ class BaseDiofSpider(BaseGazetteSpider):
     e.g:
     - https://diario.igaci.al.gov.br
     - https://sai.io.org.br/ba/abare/site/diariooficial
+    - https://dom.imap.org.br/sitesMunicipios/imprensaOficial.cfm?varCodigo=219
     """
 
     custom_settings = {"DOWNLOAD_DELAY": 0.5}
@@ -35,7 +36,7 @@ class BaseDiofSpider(BaseGazetteSpider):
 
     def start_requests(self):
         self._set_allowed_domains()
-        if "sai.io.org.br" in self.website:
+        if "sai.io" in self.website or "dom.imap" in self.website:
             yield Request(
                 self.website,
                 callback=self.interval_request,
@@ -126,14 +127,21 @@ def collect_gazette(self, response, metadata, optional_url):
         yield Gazette(**metadata)
 
     def _set_allowed_domains(self):
-        domains = {"sai.io.org.br", "diof.io.org.br", urlparse(self.website).netloc}
+        domains = {
+            "sai.io.org.br",
+            "dom.imap.org.br",
+            "diof.io.org.br",
+            urlparse(self.website).netloc,
+        }
         self.allowed_domains = list(domains)
 
     def _get_client_id(self, response):
-        if "sai.io.org.br" in response.url:
+        if "sai.io" in response.url:
             self.client_id = re.search(
                 r"\d+", response.css("iframe").attrib["src"]
             ).group()
+        elif "dom.imap" in response.url:
+            self.client_id = re.search(r"varCodigo=(\d+)", response.url).group(1)
         else:
             self.client_id = response.json()["cod_cliente"]
 