Skip to content

Commit

Permalink
Atualiza raspadores para Correntina-BA (#1359)
Browse files Browse the repository at this point in the history
  • Loading branch information
trevineju authored Jan 29, 2025
2 parents 999dd8f + e961b66 commit ea4fc4f
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 14 deletions.
11 changes: 0 additions & 11 deletions data_collection/gazette/spiders/ba/ba_correntina.py

This file was deleted.

14 changes: 14 additions & 0 deletions data_collection/gazette/spiders/ba/ba_correntina_2007.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from datetime import date

from gazette.spiders.base.diof import BaseDiofSpider


class BaCorrentinaSpider(BaseDiofSpider):
    """Gazette spider for Correntina-BA, built on ``BaseDiofSpider``.

    Restricted to the window from 2007-11-30 through 2024-12-31 by
    ``start_date`` and ``end_date``.
    """

    name = "ba_correntina_2007"
    TERRITORY_ID = "2909307"
    power = "executive"

    # Collection window handled by this spider.
    start_date = date(2007, 11, 30)
    end_date = date(2024, 12, 31)

    # Entry page; the ``varCodigo`` query parameter carries the client code.
    website = (
        "https://dom.imap.org.br/sitesMunicipios/imprensaOficial.cfm?varCodigo=219"
    )
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/ba/ba_correntina_2025.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.doem import BaseDoemSpider


class BaCorrentinaSpider(BaseDoemSpider):
    """Gazette spider for Correntina-BA, built on ``BaseDoemSpider``.

    Starts collecting at 2025-01-01; no ``end_date`` is set in this class.
    """

    name = "ba_correntina_2025"
    TERRITORY_ID = "2909307"
    start_date = date(2025, 1, 1)

    # State/city path fragment consumed by the base spider.
    state_city_url_part = "ba/correntina"
14 changes: 11 additions & 3 deletions data_collection/gazette/spiders/base/diof.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class BaseDiofSpider(BaseGazetteSpider):
e.g:
- https://diario.igaci.al.gov.br
- https://sai.io.org.br/ba/abare/site/diariooficial
- https://dom.imap.org.br/sitesMunicipios/imprensaOficial.cfm?varCodigo=219
"""

custom_settings = {"DOWNLOAD_DELAY": 0.5}
Expand All @@ -35,7 +36,7 @@ class BaseDiofSpider(BaseGazetteSpider):
def start_requests(self):
self._set_allowed_domains()

if "sai.io.org.br" in self.website:
if "sai.io" or "dom.imap" in self.website:
yield Request(
self.website,
callback=self.interval_request,
Expand Down Expand Up @@ -126,14 +127,21 @@ def collect_gazette(self, response, metadata, optional_url):
yield Gazette(**metadata)

def _set_allowed_domains(self):
domains = {"sai.io.org.br", "diof.io.org.br", urlparse(self.website).netloc}
domains = {
"sai.io.org.br",
"dom.imap.org.br",
"diof.io.org.br",
urlparse(self.website).netloc,
}
self.allowed_domains = list(domains)

def _get_client_id(self, response):
if "sai.io.org.br" in response.url:
if "sai.io" in response.url:
self.client_id = re.search(
r"\d+", response.css("iframe").attrib["src"]
).group()
elif "dom.imap" in response.url:
self.client_id = re.search(r"varCodigo=(\d+)", response.url).group(1)
else:
self.client_id = response.json()["cod_cliente"]

Expand Down

0 comments on commit ea4fc4f

Please sign in to comment.