From 62ad42504930f391e50ec2c38f151d1167a8e4d9 Mon Sep 17 00:00:00 2001
From: braykuka
Date: Thu, 31 Oct 2024 20:21:22 +0100
Subject: [PATCH 1/4] fix: typo issue

---
 scrapers/in/events.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/scrapers/in/events.py b/scrapers/in/events.py
index 060bf8b9d7..904779cb12 100644
--- a/scrapers/in/events.py
+++ b/scrapers/in/events.py
@@ -1,5 +1,4 @@
 import json
-import logging
 import re
 from datetime import date
 from urllib.parse import urljoin
@@ -12,7 +11,6 @@
 from openstates.exceptions import EmptyScrape
 
 
-log = logging.getLogger(__name__)
 PROXY_BASE_URL = "https://in-proxy.openstates.org/"
 
 
@@ -96,7 +94,7 @@ def scrape(self):
             )
             event.add_participant(committee_name, type="committee", note="host")
             event.add_document(
-                "Meeting Agenda", document_url, media_type="applicaiton/pdf"
+                "Meeting Agenda", document_url, media_type="application/pdf"
             )
             event.add_media_link("Video of Hearing", video_url, media_type="text/html")

From c99b717dd74af53435a9bddc6cedc447dda4efe2 Mon Sep 17 00:00:00 2001
From: braykuka
Date: Thu, 31 Oct 2024 21:03:15 +0100
Subject: [PATCH 2/4] clear code format

---
 scrapers/in/bills.py  | 141 ++++++++++++++++++++++--------------------
 scrapers/in/events.py |  54 +++++++---------
 2 files changed, 97 insertions(+), 98 deletions(-)

diff --git a/scrapers/in/bills.py b/scrapers/in/bills.py
index 58db3eb0df..14f79f28ef 100644
--- a/scrapers/in/bills.py
+++ b/scrapers/in/bills.py
@@ -71,14 +71,12 @@ def _process_votes(self, rollcalls, bill_id, original_chamber, session):
         for r in rollcalls:
             proxy_link = PROXY_BASE_URL + r["link"]
-
             try:
-                (path, resp) = self.urlretrieve(proxy_link)
+                path, _ = self.urlretrieve(proxy_link)
             except scrapelib.HTTPError as e:
-                self.warning(e)
-                self.warning(
+                self.logger.warning(
                     "Unable to contact openstates proxy, skipping vote {}".format(
-                        r["link"]
+                        proxy_link
                     )
                 )
                 continue
@@ -275,23 +273,22 @@ def scrape(self, session=None):
             try:
                 bill_json = client.get("bill", session=session, bill_link=bill_link)
+                # vehicle bill
+                if not bill_json:
+                    self.logger.warning("Vehicle Bill: {}".format(bill_id))
+                    continue
             except scrapelib.HTTPError:
                 self.logger.warning("Bill could not be accessed. Skipping.")
                 continue
 
-            # vehicle bill
-            if len(list(bill_json.keys())) == 0:
-                self.logger.warning("Vehicle Bill: {}".format(bill_id))
-                continue
 
-            # sometimes description is blank
-            # if that's the case, we can check to see if
-            # the latest version has a short description
             title = bill_json["description"]
+            # Check if the title is "NoneNone" (indicating a placeholder) and set it to None
             if "NoneNone" in title:
                 title = None
+            # If the title is still empty or None, try to get the short description from the latest version
             if not title:
-                title = bill_json["latestVersion"]["shortDescription"]
-            # and if that doesn't work, use the bill_id but throw a warning
+                title = bill_json["latestVersion"].get("shortDescription")
+            # If the title is still not available, use the bill ID and log a warning
             if not title:
                 title = bill_id
                 self.logger.warning("Bill is missing a title, using bill id instead.")
@@ -314,19 +311,15 @@ def scrape(self, session=None):
             bill.add_source(api_source, note="API details")
 
             # sponsors
-            for s in bill_json["authors"]:
-                self._add_sponsor_if_not_blank(bill, s, classification="author")
-            for s in bill_json["coauthors"]:
-                self._add_sponsor_if_not_blank(bill, s, classification="coauthor")
-            for s in bill_json["sponsors"]:
-                self._add_sponsor_if_not_blank(bill, s, classification="sponsor")
-            for s in bill_json["cosponsors"]:
-                self._add_sponsor_if_not_blank(bill, s, classification="cosponsor")
+            for category in ["authors", "coauthors", "sponsors", "cosponsors"]:
+                for sponsor in bill_json.get(category, []):
+                    self._add_sponsor_if_not_blank(
+                        bill, sponsor, classification=category[:-1]
+                    )
 
             # actions
             action_link = bill_json["actions"]["link"]
             api_source = urljoin(api_base_url, action_link)
-
             try:
                 actions = client.get(
                     "bill_actions", session=session, action_link=action_link
@@ -336,75 +329,84 @@ def scrape(self, session=None):
                 self.logger.warning("Could not find bill actions page")
                 actions = []
 
-            for a in actions:
-                action_desc = a["description"]
+            for action in actions:
+                action_desc = action["description"]
+
+                # Determine action chamber
                 if "governor" in action_desc.lower():
                     action_chamber = "executive"
-                elif a["chamber"]["name"].lower() == "house":
+                elif action["chamber"]["name"].lower() == "house":
                     action_chamber = "lower"
                 else:
                     action_chamber = "upper"
 
-                date = a["date"]
+                # Process action date
+                date = action.get("date")
                 if not date:
                     self.logger.warning("Action has no date, skipping")
                     continue
 
-                # convert time to pupa fuzzy time
-                date = date.replace("T", " ")
-                # TODO: if we update pupa to accept datetimes we can drop this line
-                date = date.split()[0]
+                # Convert date to pupa fuzzy time format
+                date = date.replace("T", " ").split()[0]  # Extract date part only
 
-                d = action_desc.lower()
+                action_desc_lower = action_desc.lower()
                 committee = None
-                reading = False
-                attrs = self.categorizer.categorize(action_desc)
-                action_type = attrs["classification"]
-
-                if "first reading" in d:
-                    reading = True
-
-                if "second reading" in d or "reread second time" in d:
-                    reading = True
-
-                if "third reading" in d or "reread third time" in d:
-                    action_type.append("reading-3")
+                action_type = self.categorizer.categorize(action_desc)["classification"]
+
+                # Identify reading actions
+                if any(
+                    phase in action_desc_lower
+                    for phase in [
+                        "first reading",
+                        "second reading",
+                        "third reading",
+                        "reread second time",
+                        "reread third time",
+                    ]
+                ):
                     reading = True
-
-                if "adopted" in d and reading:
+                    if (
+                        "third reading" in action_desc_lower
+                        or "reread third time" in action_desc_lower
+                    ):
+                        action_type.append("reading-3")
+
+                # Mark passage if adopted during reading
+                if "adopted" in action_desc_lower and reading:
                     action_type.append("passage")
 
-                if (
-                    "referred" in d
-                    and "committee on" in d
-                    or "reassigned" in d
-                    and "committee on" in d
-                ):
-                    committee = d.split("committee on")[-1].strip()
+                # Identify related committee
+                if "committee on" in action_desc_lower:
+                    committee = action_desc_lower.split("committee on")[-1].strip()
 
-                a = bill.add_action(
+                # Add action to bill
+                action_instance = bill.add_action(
                     chamber=action_chamber,
                     description=action_desc,
                     date=date,
                     classification=action_type,
                 )
+
+                # Add committee as related entity if present
                 if committee:
-                    a.add_related_entity(committee, entity_type="organization")
-
-            # subjects
-            subjects = [s["entry"] for s in bill_json["latestVersion"]["subjects"]]
-            for subject in subjects:
-                subject = (
-                    subject
-                    if not subject.startswith("PENSIONS AND RETIREMENT BENEFITS")
-                    else "PENSIONS AND RETIREMENT BENEFITS; Public Retirement System (INPRS)"
-                )
+                    action_instance.add_related_entity(
+                        committee, entity_type="organization"
+                    )
+
+            # Extract subjects from the latest version of the bill
+            latest_subjects = bill_json["latestVersion"]["subjects"]
+            for subject_entry in latest_subjects:
+                subject = subject_entry["entry"]
+                if subject.startswith("PENSIONS AND RETIREMENT BENEFITS"):
+                    subject = "PENSIONS AND RETIREMENT BENEFITS; Public Retirement System (INPRS)"
+                # Add the processed subject to the bill
                 bill.add_subject(subject)
 
             # Abstract
-            if bill_json["latestVersion"]["digest"]:
-                bill.add_abstract(bill_json["latestVersion"]["digest"], note="Digest")
+            digest = bill_json["latestVersion"]["digest"]
+            if digest:
+                bill.add_abstract(digest, note="Digest")
 
             # votes
             yield from self._process_votes(
@@ -415,10 +417,13 @@ def scrape(self, session=None):
             )
 
             for v in bill_json["versions"]:
-                # note there are a number of links in the API response that won't work with just a browser, they need an api key
                 # https://iga.in.gov/pdf-documents/123/2024/house/resolutions/HC0001/HC0001.01.INTR.pdf
                 category = "resolutions" if "resolution" in bill_type else "bills"
-                url = f"https://iga.in.gov/pdf-documents/{self.session_no}/{bill_json['year']}/{bill_json['originChamber']}/{category}/{v['billName']}/{v['printVersionName']}.pdf"
+                url = (
+                    f"https://iga.in.gov/pdf-documents/{self.session_no}/"
+                    f"{bill_json['year']}/{bill_json['originChamber']}/"
+                    f"{category}/{v['billName']}/{v['printVersionName']}.pdf"
+                )
                 # PROXY URL
                 # url = urljoin(PROXY_BASE_URL, v['link'])
                 bill.add_version_link(
diff --git a/scrapers/in/events.py b/scrapers/in/events.py
index 904779cb12..09c6551fc8 100644
--- a/scrapers/in/events.py
+++ b/scrapers/in/events.py
@@ -11,9 +11,6 @@
 from openstates.exceptions import EmptyScrape
 
 
-PROXY_BASE_URL = "https://in-proxy.openstates.org/"
-
-
 class INEventScraper(Scraper):
     _tz = pytz.timezone("America/Indianapolis")
     base_url = "https://beta-api.iga.in.gov"
@@ -26,9 +23,10 @@ def __init__(self, *args, **kwargs):
     def scrape(self):
         session_no = self.apiclient.get_session_no(self.session)
         response = self.apiclient.get("meetings", session=self.session)
+
         meetings = response["meetings"]
-        if len(meetings["items"]) == 0:
-            raise EmptyScrape
+        if not meetings["items"]:
+            raise EmptyScrape("No meetings found in the response.")
 
         for item in meetings["items"]:
             meeting = self.apiclient.get(
@@ -39,9 +37,6 @@ def scrape(self):
                 continue
 
             committee = meeting["committee"]
-
-            link = urljoin(self.base_url, meeting["link"])
-            _id = link.split("/")[-1]
             committee_name = (
                 committee["name"]
                 .replace(",", "")
@@ -56,19 +51,25 @@ def scrape(self):
             committee_chamber = (
                 committee["chamber"].lower() if committee["chamber"] else "universal"
             )
-            date = meeting["meetingdate"].replace(" ", "")
-            time = meeting["starttime"]
-            if time:
-                time = time.replace(" ", "")
-                when = dateutil.parser.parse(f"{date} {time}")
+
+            link = urljoin(self.base_url, meeting["link"])
+            _id = link.split("/")[-1]
+
+            date_str = meeting["meetingdate"].replace(" ", "")
+            time_str = meeting["starttime"]
+            # Determine the 'when' variable based on the presence of time
+            if time_str:
+                time_str = time_str.replace(
+                    " ", ""
+                )  # Clean up any spaces in the time string
+                when = dateutil.parser.parse(f"{date_str} {time_str}")
                 when = self._tz.localize(when)
                 all_day = False
             else:
-                when = dateutil.parser.parse(date).date()
+                when = dateutil.parser.parse(date_str).date()
                 all_day = True
             location = meeting["location"] or "See Agenda"
-
             video_url = (
                 f"https://iga.in.gov/legislative/{self.session}/meeting/watchlive/{_id}"
             )
@@ -83,7 +84,7 @@ def scrape(self):
             )
             event.dedupe_key = event_name
             event.add_source(link, note="API details")
-            name_slug = committee_name.lower().replace(" ", "-")
+            name_slug = re.sub("[^a-zA-Z0-9]+", "-", committee_name.lower())
 
             document_url = f"https://iga.in.gov/pdf-documents/{session_no}/{self.session}/{committee_chamber}/committees/{committee_type}/{name_slug}/{_id}/meeting.pdf"
@@ -100,11 +101,9 @@ def scrape(self):
             event.add_media_link("Video of Hearing", video_url, media_type="text/html")
 
             agendas = meeting["agenda"]
-            if type(agendas) is str:
-                agendas = json.loads(meeting["agenda"])
-            if agendas:
-                agenda = event.add_agenda_item("Bills under consideration")
-
+            if isinstance(agendas, str):
+                agendas = json.loads(agendas)
+            agenda = event.add_agenda_item("Bills under consideration")
             for agenda_item in agendas:
                 if agenda_item.get("bill", None):
                     bill_id = agenda_item["bill"].get("billName")
@@ -115,12 +114,9 @@ def scrape(self):
 
         for exhibit in meeting.get("exhibits"):
             # Original URL
-            # exhibit_pdf_url = self.apiclient.get_document_url(
-            #     exhibit["pdfDownloadLink"]
-            # )
-            # Proxy URL
-            exhibit_pdf_url = urljoin(PROXY_BASE_URL, exhibit["pdfDownloadLink"])
-            self.logger.info(exhibit_pdf_url)
+            exhibit_pdf_url = self.apiclient.get_document_url(
+                exhibit["pdfDownloadLink"]
+            )
             if exhibit_pdf_url:
                 event.add_document(
                     exhibit["description"],
@@ -131,9 +127,7 @@ def scrape(self):
         for minute in meeting.get("minutes"):
             if minute["link"]:
                 # Original URL
-                # minute_pdf_url = f"https://iga.in.gov/pdf-documents/{session_no}/{self.session}/{committee_chamber}/committees/{committee_type}/{name_slug}/{_id}/{_id}_minutes.pdf"
-                # Proxy URL
-                minute_pdf_url = urljoin(PROXY_BASE_URL, minute["link"])
+                minute_pdf_url = f"https://iga.in.gov/pdf-documents/{session_no}/{self.session}/{committee_chamber}/committees/{committee_type}/{name_slug}/{_id}/{_id}_minutes.pdf"
                 event.add_document(
                     "Meeting Minutes",
                     minute_pdf_url,

From 20209dd0776a2eeb4ab176883d4a71ac3b2ba630 Mon Sep 17 00:00:00 2001
From: braykuka
Date: Thu, 31 Oct 2024 21:07:31 +0100
Subject: [PATCH 3/4] fix lint issue

---
 scrapers/in/bills.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scrapers/in/bills.py b/scrapers/in/bills.py
index 14f79f28ef..2ac13189b3 100644
--- a/scrapers/in/bills.py
+++ b/scrapers/in/bills.py
@@ -73,7 +73,7 @@ def _process_votes(self, rollcalls, bill_id, original_chamber, session):
             proxy_link = PROXY_BASE_URL + r["link"]
             try:
                 path, _ = self.urlretrieve(proxy_link)
-            except scrapelib.HTTPError as e:
+            except scrapelib.HTTPError:
                 self.logger.warning(
                     "Unable to contact openstates proxy, skipping vote {}".format(
                         proxy_link

From 71b88edb025e1b79b70f20f41424330d549a6c3d Mon Sep 17 00:00:00 2001
From: braykuka
Date: Thu, 31 Oct 2024 21:12:55 +0100
Subject: [PATCH 4/4] fix conflict

---
 scrapers/in/events.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/scrapers/in/events.py b/scrapers/in/events.py
index c14c82b145..8bbe227647 100644
--- a/scrapers/in/events.py
+++ b/scrapers/in/events.py
@@ -84,7 +84,6 @@ def scrape(self):
             )
             event.dedupe_key = event_name
             event.add_source(link, note="API details")
-
             name_slug = re.sub("[^a-zA-Z0-9]+", "-", committee_name.lower())
 
             document_url = f"https://iga.in.gov/pdf-documents/{session_no}/{self.session}/{committee_chamber}/committees/{committee_type}/{name_slug}/{_id}/meeting.pdf"
@@ -112,8 +111,9 @@ def scrape(self):
                     agenda.add_subject(agenda_item["description"])
 
         for exhibit in meeting.get("exhibits"):
-            # Original URL
-
+            exhibit_pdf_url = self.apiclient.get_document_url(
+                exhibit["pdfDownloadLink"]
+            )
             if exhibit_pdf_url:
                 event.add_document(
                     exhibit["description"],
@@ -123,8 +123,7 @@ def scrape(self):
 
         for minute in meeting.get("minutes"):
             if minute["link"]:
-                # Original URL
-
+                minute_pdf_url = f"https://iga.in.gov/pdf-documents/{session_no}/{self.session}/{committee_chamber}/committees/{committee_type}/{name_slug}/{_id}/{_id}_minutes.pdf"
                 event.add_document(
                     "Meeting Minutes",
                     minute_pdf_url,