Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add entity types multi jurisdictions #5085

Merged
7 changes: 6 additions & 1 deletion scrapers/ar/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
from .common import get_slug_for_session, get_biennium_year

TIMEZONE = pytz.timezone("US/Central")
# Substrings that mark an AR sponsor link as an organization (committee-style
# body) rather than an individual legislator; see get_entity_name below.
# Matching is case-sensitive substring containment, so "Committee" also
# matches "Committees".
_AR_ORGANIZATION_ENTITY_NAME_KEYWORDS = [
"Committee",
"House Management",
"Senate Efficiency",
]


# Needed because they're using a port python doesn't expect
Expand Down Expand Up @@ -209,7 +214,7 @@ def scrape_actions(self):

def get_entity_name(self, link):
    """Classify a sponsor link string as a "person" or an "organization".

    Returns "organization" when the link contains any of the known AR
    organization keywords (case-sensitive substring match, so "Committee"
    also covers "Committees"); otherwise "person".
    """
    entity_type = "person"
    if any(keyword in link for keyword in _AR_ORGANIZATION_ENTITY_NAME_KEYWORDS):
        entity_type = "organization"
    return entity_type

Expand Down
26 changes: 25 additions & 1 deletion scrapers/fl/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,25 @@
SPONSOR_RE = re.compile(
r"by\s+(?P<sponsors>[^(]+)(\(CO-INTRODUCERS\)\s+(?P<cosponsors>[\s\S]+))?"
)
# Lowercase substrings identifying FL sponsors that are organizations
# (committees/subject-area bodies) rather than individual legislators.
# Compared against sp.lower() in process_sponsors, so entries must be
# lowercase.
FL_ORGANIZATION_ENTITY_NAME_KEYWORDS = [
"affairs",
"agriculture",
"appropriations",
"banking and insurance",
"committee",
"commerce and tourism",
"criminal justice",
"education",
"ethics and elections",
"environment and natural resources",
"fiscal policy",
"finance and tax",
"governmental oversight",
"health policy",
"regulated industries",
"rules",
"transportation",
]

requests.packages.urllib3.disable_warnings()
requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += ":HIGH:!DH:!aNULL"
Expand Down Expand Up @@ -157,7 +176,12 @@ def process_sponsors(self):
for sp in sponsors.split("; "):
sp = sp.strip()
if sp:
sp_type = "organization" if "committee" in sp.lower() else "person"
sp_type = "person"
if any(
keyword in sp.lower()
for keyword in FL_ORGANIZATION_ENTITY_NAME_KEYWORDS
):
sp_type = "organization"
self.input.add_sponsorship(sp, "primary", sp_type, True)

cosponsors = match.groupdict()["cosponsors"]
Expand Down
9 changes: 8 additions & 1 deletion scrapers/ia/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from openstates.scrape import Scraper, Bill
from .actions import Categorizer

# Substrings marking an IA sponsor as an organization rather than a person.
# NOTE(review): matching is case-sensitive — assumes the site renders these
# sponsor names in all caps; confirm against scraped data.
_IA_ORGANIZATION_ENTITY_NAME_KEYWORDS = ["COMMITTEE", "RULES AND ADMINISTRATION"]


class IABillScraper(Scraper):
categorizer = Categorizer()
Expand Down Expand Up @@ -237,10 +239,15 @@ def scrape_bill(
sponsor_array = sponsors.replace("and", ",").split(",")

for sponsor in sponsor_array:
entity_type = "person"
if any(
keyword in sponsor for keyword in _IA_ORGANIZATION_ENTITY_NAME_KEYWORDS
):
entity_type = "organization"
bill.add_sponsorship(
name=sponsor.strip(),
classification="primary",
entity_type="organization" if "COMMITTEE ON" in sponsor else "person",
entity_type=entity_type,
primary=True,
)

Expand Down
3 changes: 3 additions & 0 deletions scrapers/id/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,13 +169,16 @@ def _split(string):
# sponsors range from a committee to one legislator to a group of legs
sponsor_lists = bill_tables[0].text_content().split("by")
if len(sponsor_lists) > 1:
# Adding chamber to further filter search results for committee
# This is based on the assumption that a House Bill can only be sponsored by a House Committee and so on
for sponsors in sponsor_lists[1:]:
if "COMMITTEE" in sponsors.upper():
bill.add_sponsorship(
name=sponsors.strip(),
entity_type="organization",
primary=True,
classification="primary",
chamber=chamber,
)
else:
for person in _split(sponsors):
Expand Down
29 changes: 19 additions & 10 deletions scrapers/ks/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,30 +71,37 @@ def scrape_bill_from_api(self, session, bill_id, bill_url):

bill.add_source(api_url)
bill.add_source(bill_url)

# An "original sponsor" is the API's expression of "primary sponsor"
for primary_sponsor in bill_data["ORIGINAL_SPONSOR"]:
primary_sponsor = self.clean_sponsor_name(primary_sponsor)
primary_sponsor, sponsor_chamber = self.clean_sponsor_name(primary_sponsor)
if primary_sponsor:
bill.add_sponsorship(
name=primary_sponsor,
entity_type="organization"
if "committee" in primary_sponsor.lower()
else "person",
entity_type=(
"organization"
if "committee" in primary_sponsor.lower()
else "person"
),
primary=True,
classification="primary",
# Using global "chamber" here because we assume
# the primary sponsor i.e. bill_data["ORIGINAL_SPONSOR"]
# will be a committee from the chamber of bill origin
# Not confident enough to do the same for bill_data["SPONSOR_NAMES"].
chamber=sponsor_chamber or chamber,
)
for sponsor in bill_data["SPONSOR_NAMES"]:
if sponsor in bill_data["ORIGINAL_SPONSOR"]:
continue
sponsor = self.clean_sponsor_name(sponsor)
sponsor, sponsor_chamber = self.clean_sponsor_name(sponsor)
bill.add_sponsorship(
name=sponsor,
entity_type="organization"
if "committee" in sponsor.lower()
else "person",
entity_type=(
"organization" if "committee" in sponsor.lower() else "person"
),
primary=False,
classification="cosponsor",
chamber=sponsor_chamber,
)

# history is backwards
Expand Down Expand Up @@ -142,6 +149,8 @@ def classify_chamber(self, bill_id):
return "upper" if (bill_id[0] == "S") else "lower"

def clean_sponsor_name(self, sponsor):
    """Strip a leading "Representative"/"Senator" title from a sponsor name.

    Returns a ``(name, chamber)`` tuple: ``chamber`` is "upper" when the
    title was "Senator", "lower" when "Representative", and ``None`` when
    no title was present (e.g. a committee name or empty string).
    """
    sp_chamber = None
    if sponsor and sponsor.split()[0] in ["Representative", "Senator"]:
        sp_chamber = "upper" if sponsor.split()[0] == "Senator" else "lower"
        # NOTE(review): "".join concatenates the remaining words with no
        # separator ("Representative John Doe" -> "JohnDoe"); if multi-word
        # names occur this likely should be " ".join — confirm KS format.
        sponsor = "".join(sponsor.split()[1:])
    return sponsor, sp_chamber
12 changes: 11 additions & 1 deletion scrapers/ma/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,16 @@ def scrape_bill(self, session, bill_meta, chamber):
'//dt[text()="Sponsor:" or text()="Presenter:"]/'
"following-sibling::dd/descendant-or-self::*/text()[normalize-space()]"
)
# Sponsors always have link that follows pattern <a href="/Legislators/Profile/JNR1/193">Jeffrey N. Roy</a>
# If this is a person i.e. "legislators" it will show in sponsor_href.
sponsor_href = page.xpath(
'//dt[text()="Sponsor:" or text()="Presenter:"]/following-sibling::dd//a/@href'
)
sponsor_href = sponsor_href[0] if sponsor_href else ""
entity_type = (
"person" if "legislators/" in sponsor_href.lower() else "organization"
)

if sponsor:
sponsor = (
sponsor[0]
Expand All @@ -198,7 +208,7 @@ def scrape_bill(self, session, bill_meta, chamber):
.strip()
)
bill.add_sponsorship(
sponsor, classification="primary", primary=True, entity_type="person"
sponsor, classification="primary", primary=True, entity_type=entity_type
)

self.scrape_cosponsors(bill, bill_url)
Expand Down
6 changes: 5 additions & 1 deletion scrapers/nc/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,10 +131,14 @@ def scrape_bill(self, chamber, session, bill_id, bill_type, bill_title):
spon_type = "cosponsor"
if not name:
continue
entity_type = "person"
if "rules, calendar, and operations of the house" in name.lower():
name = name.replace(")", "")
entity_type = "organization"
bill.add_sponsorship(
name,
classification=spon_type,
entity_type="person",
entity_type=entity_type,
primary=(spon_type == "primary"),
)
except IndexError:
Expand Down
10 changes: 8 additions & 2 deletions scrapers/ne/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,9 +142,12 @@ def bill_info(self, bill_link, session, main_url):
introduced_by = introduced_by.split("Introduced By:")[1].strip()

introduced_by = introduced_by.strip()
entity_type = "person"
if "committee" in introduced_by.lower():
entity_type = "organization"
bill.add_sponsorship(
name=introduced_by,
entity_type="person",
entity_type=entity_type,
primary=True,
classification="primary",
)
Expand All @@ -165,9 +168,12 @@ def bill_info(self, bill_link, session, main_url):
# NE legislature site does not list cosponsors, so we grab it from action statements
if "name added" in action:
cosponsor_name = action.split("name added")[0].strip()
entity_type = "person"
if "committee" in cosponsor_name.lower():
entity_type = "organization"
bill.add_sponsorship(
cosponsor_name,
entity_type="person",
entity_type=entity_type,
classification="cosponsor",
primary=False,
)
Expand Down
11 changes: 10 additions & 1 deletion scrapers/nv/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,16 +208,25 @@ def add_sponsors(self, bill, sponsor_links, primary):
if "Sponsors" in name or name == "":
continue
# Removes leg position from name
# Use position to determine chamber
# Example: Assemblywoman Alexis Hansen
# Also check if sponsor is an organization or person
# Example: "Assembly Committee on Government Affairs" is an organization
chamber = None
entity_type = "person"
if "committee" in name.lower():
entity_type = "organization"
if name.split()[0] in ["Assemblywoman", "Assemblyman", "Senator"]:
chamber = "lower" if "Assembly" in name.split()[0] else "upper"
name = " ".join(name.split()[1:]).strip()
if name not in seen:
seen.add(name)
bill.add_sponsorship(
name=name,
classification="sponsor" if primary else "cosponsor",
entity_type="person",
entity_type=entity_type,
primary=primary,
chamber=chamber,
)

def add_actions(self, bill, chamber):
Expand Down
19 changes: 16 additions & 3 deletions scrapers/sc/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ def scrape_subjects(self, session):
try:
self.info(url)
data = urllib.request.urlopen(url).read()
except (http.client.IncompleteRead) as e:
except http.client.IncompleteRead as e:
self.warning("Client IncompleteRead error on {}".format(url))
data = e.partial

Expand Down Expand Up @@ -394,24 +394,37 @@ def scrape_details(self, bill_detail_url, session, chamber, bill_id):

subjects = list(self._subjects[bill_id])

def _get_sponsor_chamber(url):
url = url.get("href")
return (
"upper"
if "chamber=S" in url
else ("lower" if "chamber=H" in url else None)
)

for subject in subjects:
bill.add_subject(subject)

# sponsors
for sponsor in doc.xpath('//a[contains(@href, "member.php")]/text()'):
for sponsor in doc.xpath('//a[contains(@href, "member.php")]'):
sp_chamber = _get_sponsor_chamber(sponsor)
sponsor = sponsor.text.strip()
bill.add_sponsorship(
name=sponsor,
classification="primary",
primary=True,
entity_type="person",
chamber=sp_chamber,
)
for sponsor in doc.xpath('//a[contains(@href, "committee.php")]/text()'):
for sponsor in doc.xpath('//a[contains(@href, "committee.php")]'):
sp_chamber = _get_sponsor_chamber(sponsor)
sponsor = sponsor.replace("\xa0", " ").strip()
bill.add_sponsorship(
name=sponsor,
classification="primary",
primary=True,
entity_type="organization",
chamber=sp_chamber,
)

# find versions
Expand Down
11 changes: 11 additions & 0 deletions scrapers/sd/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@
"2023": "68",
"2024": "69",
}
# Map the SD API's single-letter chamber codes ("H"ouse / "S"enate, as
# seen in sponsor "MemberType" values) to chamber names.
_CHAMBER_MAP = {
"H": "lower",
"S": "upper",
}


class SDBillScraper(Scraper, LXMLMixin):
Expand Down Expand Up @@ -100,15 +104,22 @@ def scrape_bill(self, chamber, session, bill_id, title, url):
classification="primary",
primary=True,
entity_type=sponsor_type,
chamber=_CHAMBER_MAP.get(sponsor["MemberType"], None),
)
else:
sponsor_type = "organization"
committee_sponsor = re.search(r">(.*)</a>", page["BillCommitteeSponsor"])[1]
csp_chamber = (
"upper"
if "Senate" in committee_sponsor
else ("lower" if "House" in committee_sponsor else None)
)
bill.add_sponsorship(
committee_sponsor,
classification="primary",
primary=True,
entity_type=sponsor_type,
chamber=csp_chamber or chamber,
)

for keyword in page["Keywords"]:
Expand Down
21 changes: 15 additions & 6 deletions scrapers/vt/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,18 +123,27 @@ def scrape(self, session=None):
sponsor_type = "cosponsor"
continue

sponsor_name = (
sponsor.xpath("a/text()")[0]
.replace("Rep.", "")
.replace("Sen.", "")
.strip()
chamber = None
sponsor_name = sponsor.xpath("a/text()")[0]

if sponsor_name.startswith("Rep.") or sponsor_name.startswith("House"):
chamber = "lower"
sponsor_name = sponsor_name.replace("Rep.", "").strip()
elif sponsor_name.startswith("Sen.") or sponsor_name.startswith(
"Senate"
):
chamber = "upper"
sponsor_name = sponsor_name.replace("Sen.", "").strip()
entity_type = (
"organization" if "committee" in sponsor_name else "person"
)
if sponsor_name and sponsor_name != "Less…":
bill.add_sponsorship(
name=sponsor_name,
classification=sponsor_type,
entity_type="person",
entity_type=entity_type,
primary=(sponsor_type == "primary"),
chamber=chamber,
)

version_links = doc.xpath("//ul[contains(@class,'bill-path')]/li/div/a")
Expand Down
4 changes: 3 additions & 1 deletion scrapers/wi/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,7 @@ def parse_sponsors(self, bill, action, chamber):
elif type == "Cosponsored":
sponsor_type = "cosponsor"

entity_type = "person"
if title == "Senator":
sponsor_chamber = "upper"
elif title == "Representative":
Expand All @@ -298,6 +299,7 @@ def parse_sponsors(self, bill, action, chamber):
elif title in ("Joint Legislative Council", "Law Revision Committee"):
sponsor_chamber = chamber
people = title
entity_type = "organization"

for r in re.split(r"\sand\s|\,", people):
if r.strip():
Expand All @@ -306,7 +308,7 @@ def parse_sponsors(self, bill, action, chamber):
chamber=sponsor_chamber,
classification=sponsor_type,
primary=sponsor_type == "primary",
entity_type="person",
entity_type=entity_type,
)

def add_vote(self, bill, chamber, date, text, url):
Expand Down
Loading
Loading