diff --git a/scrapers/ar/bills.py b/scrapers/ar/bills.py
index 2c5c6a94d6..55c400b8ff 100644
--- a/scrapers/ar/bills.py
+++ b/scrapers/ar/bills.py
@@ -16,6 +16,11 @@ from .common import get_slug_for_session, get_biennium_year
 
 TIMEZONE = pytz.timezone("US/Central")
 
+_AR_ORGANIZATION_ENTITY_NAME_KEYWORDS = [
+    "Committee",
+    "House Management",
+    "Senate Efficiency",
+]
 
 
 # Needed because they're using a port python doesn't expect
@@ -209,7 +214,7 @@ def scrape_actions(self):
 
     def get_entity_name(self, link):
         entity_type = "person"
-        if "Committees" in link:
+        if any(keyword in link for keyword in _AR_ORGANIZATION_ENTITY_NAME_KEYWORDS):
             entity_type = "organization"
         return entity_type
 
diff --git a/scrapers/fl/bills.py b/scrapers/fl/bills.py
index 00e7885a0b..f4e878ae2b 100644
--- a/scrapers/fl/bills.py
+++ b/scrapers/fl/bills.py
@@ -14,6 +14,25 @@
 SPONSOR_RE = re.compile(
     r"by\s+(?P<sponsors>[^(]+)(\(CO-INTRODUCERS\)\s+(?P<cosponsors>[\s\S]+))?"
 )
+FL_ORGANIZATION_ENTITY_NAME_KEYWORDS = [
+    "affairs",
+    "agriculture",
+    "appropriations",
+    "banking and insurance",
+    "committee",
+    "commerce and tourism",
+    "criminal justice",
+    "education",
+    "ethics and elections",
+    "environment and natural resources",
+    "fiscal policy",
+    "finance and tax",
+    "governmental oversight",
+    "health policy",
+    "regulated industries",
+    "rules",
+    "transportation",
+]
 
 requests.packages.urllib3.disable_warnings()
 requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += ":HIGH:!DH:!aNULL"
@@ -157,7 +176,12 @@ def process_sponsors(self):
         for sp in sponsors.split("; "):
             sp = sp.strip()
             if sp:
-                sp_type = "organization" if "committee" in sp.lower() else "person"
+                sp_type = "person"
+                if any(
+                    keyword in sp.lower()
+                    for keyword in FL_ORGANIZATION_ENTITY_NAME_KEYWORDS
+                ):
+                    sp_type = "organization"
                 self.input.add_sponsorship(sp, "primary", sp_type, True)
 
         cosponsors = match.groupdict()["cosponsors"]
diff --git a/scrapers/ia/bills.py b/scrapers/ia/bills.py
index 8c9a5fff70..cf81e1cf4b 100644
--- a/scrapers/ia/bills.py
+++ b/scrapers/ia/bills.py
@@ -6,6 +6,8 @@
 from openstates.scrape import Scraper, Bill
 from .actions import Categorizer
 
+_IA_ORGANIZATION_ENTITY_NAME_KEYWORDS = ["COMMITTEE", "RULES AND ADMINISTRATION"]
+
 
 class IABillScraper(Scraper):
     categorizer = Categorizer()
@@ -237,10 +239,15 @@ def scrape_bill(
 
         sponsor_array = sponsors.replace("and", ",").split(",")
         for sponsor in sponsor_array:
+            entity_type = "person"
+            if any(
+                keyword in sponsor for keyword in _IA_ORGANIZATION_ENTITY_NAME_KEYWORDS
+            ):
+                entity_type = "organization"
             bill.add_sponsorship(
                 name=sponsor.strip(),
                 classification="primary",
-                entity_type="organization" if "COMMITTEE ON" in sponsor else "person",
+                entity_type=entity_type,
                 primary=True,
             )
 
diff --git a/scrapers/id/bills.py b/scrapers/id/bills.py
index 11dd7250dd..b7eaf877ce 100644
--- a/scrapers/id/bills.py
+++ b/scrapers/id/bills.py
@@ -169,6 +169,8 @@ def _split(string):
         # sponsors range from a committee to one legislator to a group of legs
         sponsor_lists = bill_tables[0].text_content().split("by")
        if len(sponsor_lists) > 1:
+            # Adding chamber to further filter search results for committees.
+            # This assumes a House bill can only be sponsored by a House committee, and so on.
             for sponsors in sponsor_lists[1:]:
                 if "COMMITTEE" in sponsors.upper():
                     bill.add_sponsorship(
@@ -176,6 +178,7 @@
                         entity_type="organization",
                         primary=True,
                         classification="primary",
+                        chamber=chamber,
                     )
                 else:
                     for person in _split(sponsors):
diff --git a/scrapers/ks/bills.py b/scrapers/ks/bills.py
index afedffca5f..53321a5f14 100644
--- a/scrapers/ks/bills.py
+++ b/scrapers/ks/bills.py
@@ -71,30 +71,37 @@ def scrape_bill_from_api(self, session, bill_id, bill_url):
         bill.add_source(api_url)
         bill.add_source(bill_url)
 
-        # An "original sponsor" is the API's expression of "primary sponsor"
         for primary_sponsor in bill_data["ORIGINAL_SPONSOR"]:
-            primary_sponsor = self.clean_sponsor_name(primary_sponsor)
+            primary_sponsor, sponsor_chamber = self.clean_sponsor_name(primary_sponsor)
             if primary_sponsor:
                 bill.add_sponsorship(
                     name=primary_sponsor,
-                    entity_type="organization"
-                    if "committee" in primary_sponsor.lower()
-                    else "person",
+                    entity_type=(
+                        "organization"
+                        if "committee" in primary_sponsor.lower()
+                        else "person"
+                    ),
                     primary=True,
                     classification="primary",
+                    # Using the global "chamber" here because we assume the primary
+                    # sponsor, i.e. bill_data["ORIGINAL_SPONSOR"], will be a committee
+                    # from the chamber of bill origin. Not confident enough to do the
+                    # same for bill_data["SPONSOR_NAMES"].
+                    chamber=sponsor_chamber or chamber,
                 )
 
         for sponsor in bill_data["SPONSOR_NAMES"]:
             if sponsor in bill_data["ORIGINAL_SPONSOR"]:
                 continue
-            sponsor = self.clean_sponsor_name(sponsor)
+            sponsor, sponsor_chamber = self.clean_sponsor_name(sponsor)
             bill.add_sponsorship(
                 name=sponsor,
-                entity_type="organization"
-                if "committee" in sponsor.lower()
-                else "person",
+                entity_type=(
+                    "organization" if "committee" in sponsor.lower() else "person"
+                ),
                 primary=False,
                 classification="cosponsor",
+                chamber=sponsor_chamber,
             )
 
         # history is backwards
@@ -142,6 +149,8 @@ def classify_chamber(self, bill_id):
         return "upper" if (bill_id[0] == "S") else "lower"
 
     def clean_sponsor_name(self, sponsor):
+        sp_chamber = None
         if sponsor and sponsor.split()[0] in ["Representative", "Senator"]:
+            sp_chamber = "upper" if sponsor.split()[0] == "Senator" else "lower"
             sponsor = "".join(sponsor.split()[1:])
-        return sponsor
+        return sponsor, sp_chamber
diff --git a/scrapers/ma/bills.py b/scrapers/ma/bills.py
index 2c73cfb1ec..420171c0db 100644
--- a/scrapers/ma/bills.py
+++ b/scrapers/ma/bills.py
@@ -189,6 +189,16 @@ def scrape_bill(self, session, bill_meta, chamber):
             '//dt[text()="Sponsor:" or text()="Presenter:"]/'
             "following-sibling::dd/descendant-or-self::*/text()[normalize-space()]"
         )
+        # Sponsors always have a link that follows a set pattern, e.g. "Jeffrey N. Roy".
+        # If the sponsor is a person (a legislator), "legislators" will appear in sponsor_href.
+        sponsor_href = page.xpath(
+            '//dt[text()="Sponsor:" or text()="Presenter:"]/following-sibling::dd//a/@href'
+        )
+        sponsor_href = sponsor_href[0] if sponsor_href else ""
+        entity_type = (
+            "person" if "legislators/" in sponsor_href.lower() else "organization"
+        )
+
         if sponsor:
             sponsor = (
                 sponsor[0]
@@ -198,7 +208,7 @@
                 .strip()
             )
             bill.add_sponsorship(
-                sponsor, classification="primary", primary=True, entity_type="person"
+                sponsor, classification="primary", primary=True, entity_type=entity_type
             )
 
         self.scrape_cosponsors(bill, bill_url)
diff --git a/scrapers/nc/bills.py b/scrapers/nc/bills.py
index 5361f7c592..1bbcc43b6b 100644
--- a/scrapers/nc/bills.py
+++ b/scrapers/nc/bills.py
@@ -131,10 +131,14 @@ def scrape_bill(self, chamber, session, bill_id, bill_type, bill_title):
                     spon_type = "cosponsor"
                 if not name:
                     continue
+                entity_type = "person"
+                if "rules, calendar, and operations of the house" in name.lower():
+                    name = name.replace(")", "")
+                    entity_type = "organization"
                 bill.add_sponsorship(
                     name,
                     classification=spon_type,
-                    entity_type="person",
+                    entity_type=entity_type,
                     primary=(spon_type == "primary"),
                 )
         except IndexError:
diff --git a/scrapers/ne/bills.py b/scrapers/ne/bills.py
index 7d7c73db08..a165d9b8cc 100644
--- a/scrapers/ne/bills.py
+++ b/scrapers/ne/bills.py
@@ -142,9 +142,12 @@ def bill_info(self, bill_link, session, main_url):
         introduced_by = introduced_by.split("Introduced By:")[1].strip()
 
         introduced_by = introduced_by.strip()
+        entity_type = "person"
+        if "committee" in introduced_by.lower():
+            entity_type = "organization"
         bill.add_sponsorship(
             name=introduced_by,
-            entity_type="person",
+            entity_type=entity_type,
             primary=True,
             classification="primary",
         )
@@ -165,9 +168,12 @@
             # NE legislature site does not list cosponsors, so we grab it from action statements
             if "name added" in action:
                 cosponsor_name = action.split("name added")[0].strip()
+                entity_type = "person"
+                if "committee" in cosponsor_name.lower():
+                    entity_type = "organization"
                 bill.add_sponsorship(
                     cosponsor_name,
-                    entity_type="person",
+                    entity_type=entity_type,
                     classification="cosponsor",
                     primary=False,
                 )
diff --git a/scrapers/nv/bills.py b/scrapers/nv/bills.py
index b67bbf5650..43f124932c 100644
--- a/scrapers/nv/bills.py
+++ b/scrapers/nv/bills.py
@@ -208,16 +208,25 @@ def add_sponsors(self, bill, sponsor_links, primary):
             if "Sponsors" in name or name == "":
                 continue
             # Removes leg position from name
+            # Use position to determine chamber
             # Example: Assemblywoman Alexis Hansen
+            # Also check if sponsor is an organization or person
+            # Example: "Assembly Committee on Government Affairs" is an organization
+            chamber = None
+            entity_type = "person"
+            if "committee" in name.lower():
+                entity_type = "organization"
             if name.split()[0] in ["Assemblywoman", "Assemblyman", "Senator"]:
+                chamber = "lower" if "Assembly" in name.split()[0] else "upper"
                 name = " ".join(name.split()[1:]).strip()
             if name not in seen:
                 seen.add(name)
                 bill.add_sponsorship(
                     name=name,
                     classification="sponsor" if primary else "cosponsor",
-                    entity_type="person",
+                    entity_type=entity_type,
                     primary=primary,
+                    chamber=chamber,
                 )
 
     def add_actions(self, bill, chamber):
diff --git a/scrapers/sc/bills.py b/scrapers/sc/bills.py
index c48fb49c1d..b789345ec5 100644
--- a/scrapers/sc/bills.py
+++ b/scrapers/sc/bills.py
@@ -200,7 +200,7 @@ def scrape_subjects(self, session):
         try:
             self.info(url)
             data = urllib.request.urlopen(url).read()
-        except (http.client.IncompleteRead) as e:
+        except http.client.IncompleteRead as e:
             self.warning("Client IncompleteRead error on {}".format(url))
             data = e.partial
 
@@ -394,24 +394,37 @@ def scrape_details(self, bill_detail_url, session, chamber, bill_id):
         subjects = list(self._subjects[bill_id])
 
+        def _get_sponsor_chamber(url):
+            url = url.get("href")
+            return (
+                "upper"
+                if "chamber=S" in url
+                else ("lower" if "chamber=H" in url else None)
+            )
+
         for subject in subjects:
             bill.add_subject(subject)
 
         # sponsors
-        for sponsor in doc.xpath('//a[contains(@href, "member.php")]/text()'):
+        for sponsor in doc.xpath('//a[contains(@href, "member.php")]'):
+            sp_chamber = _get_sponsor_chamber(sponsor)
+            sponsor = sponsor.text.strip()
             bill.add_sponsorship(
                 name=sponsor,
                 classification="primary",
                 primary=True,
                 entity_type="person",
+                chamber=sp_chamber,
             )
-        for sponsor in doc.xpath('//a[contains(@href, "committee.php")]/text()'):
+        for sponsor in doc.xpath('//a[contains(@href, "committee.php")]'):
+            sp_chamber = _get_sponsor_chamber(sponsor)
             sponsor = sponsor.replace("\xa0", " ").strip()
             bill.add_sponsorship(
                 name=sponsor,
                 classification="primary",
                 primary=True,
                 entity_type="organization",
+                chamber=sp_chamber,
             )
 
         # find versions
diff --git a/scrapers/sd/bills.py b/scrapers/sd/bills.py
index 3716835f11..f2bc5c8d12 100644
--- a/scrapers/sd/bills.py
+++ b/scrapers/sd/bills.py
@@ -16,6 +16,10 @@
     "2023": "68",
     "2024": "69",
 }
+_CHAMBER_MAP = {
+    "H": "lower",
+    "S": "upper",
+}
 
 
 class SDBillScraper(Scraper, LXMLMixin):
@@ -100,15 +104,22 @@ def scrape_bill(self, chamber, session, bill_id, title, url):
                 classification="primary",
                 primary=True,
                 entity_type=sponsor_type,
+                chamber=_CHAMBER_MAP.get(sponsor["MemberType"], None),
             )
         else:
             sponsor_type = "organization"
             committee_sponsor = re.search(r">(.*)", page["BillCommitteeSponsor"])[1]
+            csp_chamber = (
+                "upper"
+                if "Senate" in committee_sponsor
+                else ("lower" if "House" in committee_sponsor else None)
+            )
             bill.add_sponsorship(
                 committee_sponsor,
                 classification="primary",
                 primary=True,
                 entity_type=sponsor_type,
+                chamber=csp_chamber or chamber,
             )
 
         for keyword in page["Keywords"]:
diff --git a/scrapers/vt/bills.py b/scrapers/vt/bills.py
index 583c43ec8a..7ee2215111 100644
--- a/scrapers/vt/bills.py
+++ b/scrapers/vt/bills.py
@@ -123,18 +123,27 @@ def scrape(self, session=None):
                     sponsor_type = "cosponsor"
                     continue
 
-                sponsor_name = (
-                    sponsor.xpath("a/text()")[0]
-                    .replace("Rep.", "")
-                    .replace("Sen.", "")
-                    .strip()
+                chamber = None
+                sponsor_name = sponsor.xpath("a/text()")[0]
+
+                if sponsor_name.startswith("Rep.") or sponsor_name.startswith("House"):
+                    chamber = "lower"
+                    sponsor_name = sponsor_name.replace("Rep.", "").strip()
+                elif sponsor_name.startswith("Sen.") or sponsor_name.startswith(
+                    "Senate"
+                ):
+                    chamber = "upper"
+                    sponsor_name = sponsor_name.replace("Sen.", "").strip()
+                entity_type = (
+                    "organization" if "committee" in sponsor_name else "person"
                 )
                 if sponsor_name and sponsor_name != "Less…":
                     bill.add_sponsorship(
                         name=sponsor_name,
                         classification=sponsor_type,
-                        entity_type="person",
+                        entity_type=entity_type,
                         primary=(sponsor_type == "primary"),
+                        chamber=chamber,
                     )
 
         version_links = doc.xpath("//ul[contains(@class,'bill-path')]/li/div/a")
diff --git a/scrapers/wi/bills.py b/scrapers/wi/bills.py
index e2d47e5b20..75e6e34234 100644
--- a/scrapers/wi/bills.py
+++ b/scrapers/wi/bills.py
@@ -288,6 +288,7 @@ def parse_sponsors(self, bill, action, chamber):
             elif type == "Cosponsored":
                 sponsor_type = "cosponsor"
 
+            entity_type = "person"
             if title == "Senator":
                 sponsor_chamber = "upper"
             elif title == "Representative":
@@ -298,6 +299,7 @@
             elif title in ("Joint Legislative Council", "Law Revision Committee"):
                 sponsor_chamber = chamber
                 people = title
+                entity_type = "organization"
 
             for r in re.split(r"\sand\s|\,", people):
                 if r.strip():
@@ -306,7 +308,7 @@
                         chamber=sponsor_chamber,
                         classification=sponsor_type,
                         primary=sponsor_type == "primary",
-                        entity_type="person",
+                        entity_type=entity_type,
                     )
 
     def add_vote(self, bill, chamber, date, text, url):
diff --git a/scrapers/wy/bills.py b/scrapers/wy/bills.py
index 9bfdd9c103..71140024c5 100644
--- a/scrapers/wy/bills.py
+++ b/scrapers/wy/bills.py
@@ -10,6 +10,23 @@
 
 TIMEZONE = pytz.timezone("US/Mountain")
 
+_COMMITTEE_NAME_MAPPING = {
+    "Agriculture": "Agriculture, State and Public Lands and Water Resources",
+    "BlockChain/Technology": "Blockchain, Financial Technology and Digital Innovation Technology Select",
+    "Cap Fin & Inv": "Capital Financing and Investments Select",
+    "Corporations": "Corporations, Elections and Political Subdivisions",
+    "Fed Nat Res": "Federal Natural Resource Management Select",
+    "Labor": "Labor, Health and Social Services",
+    "Mgt Audit": "Management Audit",
+    "Mgt Council": "Management Council",
+    "Minerals": "Minerals, Business and Economic Development",
+    "Nat Res Fund": "Natural Resource Funding Select",
+    "Sel Sch Fac": "School Facilities Select",
+    "Transportation": "Transportation, Highways and Military Affairs",
+    "Travel": "Travel, Recreation, Wildlife and Cultural Resource",
+    "Tribal Relations": "Select Committee on Tribal Relations",
+    "Water": "Water Select",
+}
 
 
 class WYBillScraper(Scraper, LXMLMixin):
@@ -198,26 +215,21 @@ def scrape_bill(self, bill_num, session):
         for sponsor in bill_json["sponsors"]:
             status = "primary" if sponsor["primarySponsor"] else "cosponsor"
             sponsor_type = "person" if sponsor["sponsorTitle"] else "organization"
-            chamber = (
+            sp_chamber = (
                 "lower"
                 if sponsor["house"] == "H"
                 else ("upper" if sponsor["house"] == "S" else None)
             )
-            if chamber:
-                bill.add_sponsorship(
-                    name=sponsor["name"],
-                    classification=status,
-                    entity_type=sponsor_type,
-                    primary=sponsor["primarySponsor"],
-                    chamber=chamber,
-                )
-            else:
-                bill.add_sponsorship(
-                    name=sponsor["name"],
-                    classification=status,
-                    entity_type=sponsor_type,
-                    primary=sponsor["primarySponsor"],
-                )
+            sponsor_name = sponsor["name"]
+            if sponsor_type == "organization":
+                sponsor_name = _COMMITTEE_NAME_MAPPING.get(sponsor_name, sponsor_name)
+            bill.add_sponsorship(
+                name=sponsor_name,
+                classification=status,
+                entity_type=sponsor_type,
+                primary=sponsor["primarySponsor"],
+                chamber=sp_chamber,
+            )
 
         if bill_json["summary"]:
             bill.add_abstract(note="summary", abstract=bill_json["summary"])