Skip to content

Commit

Permalink
Adapta base diof para acomodar outro padrão de URL
Browse files Browse the repository at this point in the history
  • Loading branch information
trevineju committed Jan 29, 2025
1 parent 1494375 commit e961b66
Showing 1 changed file with 11 additions and 3 deletions.
14 changes: 11 additions & 3 deletions data_collection/gazette/spiders/base/diof.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class BaseDiofSpider(BaseGazetteSpider):
e.g:
- https://diario.igaci.al.gov.br
- https://sai.io.org.br/ba/abare/site/diariooficial
- https://dom.imap.org.br/sitesMunicipios/imprensaOficial.cfm?varCodigo=219
"""

custom_settings = {"DOWNLOAD_DELAY": 0.5}
Expand All @@ -35,7 +36,7 @@ class BaseDiofSpider(BaseGazetteSpider):
def start_requests(self):
self._set_allowed_domains()

if "sai.io.org.br" in self.website:
if "sai.io" in self.website or "dom.imap" in self.website:
yield Request(
self.website,
callback=self.interval_request,
Expand Down Expand Up @@ -126,14 +127,21 @@ def collect_gazette(self, response, metadata, optional_url):
yield Gazette(**metadata)

def _set_allowed_domains(self):
    """Populate ``self.allowed_domains`` for this spider.

    The list always contains the three shared hosts used by this base
    spider (``sai.io.org.br``, ``dom.imap.org.br``, ``diof.io.org.br``)
    plus the network location parsed from ``self.website``.
    """
    # A set is used so that a website hosted on one of the shared hosts
    # does not produce a duplicate entry.
    domains = {
        "sai.io.org.br",
        "dom.imap.org.br",
        "diof.io.org.br",
        urlparse(self.website).netloc,
    }
    # Sorted so the resulting list is the same on every run; plain
    # ``list(set)`` ordering varies with string hash randomization.
    self.allowed_domains = sorted(domains)

def _get_client_id(self, response):
    """Extract the client id from ``response`` and store it on ``self.client_id``.

    Three URL patterns are handled, selected by substring match on
    ``response.url``:

    - ``sai.io``: the id is the first run of digits in the ``src``
      attribute of the page's ``iframe``.
    - ``dom.imap``: the id is the value of the ``varCodigo`` query
      parameter of the response URL itself.
    - anything else: the response body is parsed as JSON and the id is
      read from its ``cod_cliente`` key.

    Raises:
        AttributeError: if the expected digits / ``varCodigo`` parameter
            are not found (``re.search`` returns ``None``).
        KeyError: if the ``iframe`` has no ``src`` attribute or the JSON
            payload has no ``cod_cliente`` key.
    """
    if "sai.io" in response.url:
        iframe_src = response.css("iframe").attrib["src"]
        self.client_id = re.search(r"\d+", iframe_src).group()
    elif "dom.imap" in response.url:
        self.client_id = re.search(r"varCodigo=(\d+)", response.url).group(1)
    else:
        self.client_id = response.json()["cod_cliente"]

Expand Down

0 comments on commit e961b66

Please sign in to comment.