diff --git a/pyproject.toml b/pyproject.toml index 9c0fa23eda..281e4fc3ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ build-backend = "poetry.core.masonry.api" [tool.ruff] -extend-exclude = ["migrations", "commonlib", "scripts"] +extend-exclude = ["migrations", "commonlib", "scripts/historic"] [tool.ruff.lint] select = [ diff --git a/scripts/division_io.py b/scripts/division_io.py index bc2e0d8228..93ef91328a 100644 --- a/scripts/division_io.py +++ b/scripts/division_io.py @@ -7,8 +7,6 @@ """ -import re -import sys from enum import Enum from pathlib import Path from typing import cast @@ -17,9 +15,9 @@ import MySQLdb import pandas as pd import rich_click as click +from pylib.mysociety import config from rich import print from rich.prompt import Prompt -from pylib.mysociety import config repository_path = Path(__file__).parent.parent @@ -40,7 +38,6 @@ class TitlePriority(str, Enum): @classmethod def get_priority(cls, priority: str) -> int: - lookup = { cls.ORIGINAL_HEADER: 1, cls.PARLIAMENT_DESCRIBED: 5, @@ -104,7 +101,7 @@ def df_to_db(df: pd.DataFrame, *, new_priority: TitlePriority, verbose: bool = F # get all divisions with a title_priority below or equal to current priority existing_df = pd.read_sql( - f"SELECT division_id, title_priority FROM divisions", + "SELECT division_id, title_priority FROM divisions", db_connection, ) existing_df["int_title_priority"] = existing_df["title_priority"].apply( diff --git a/scripts/download_parliament_portraits.py b/scripts/download_parliament_portraits.py index 43c4c5e873..f9e0e40d6a 100644 --- a/scripts/download_parliament_portraits.py +++ b/scripts/download_parliament_portraits.py @@ -1,4 +1,4 @@ -''' +""" Python 3 Downloads thumbnails @@ -9,15 +9,14 @@ Pillow everypolitician-popolo -''' +""" import os -import csv +from os.path import exists from tempfile import gettempdir -from PIL import Image from urllib.request import urlretrieve -from os.path import exists +from PIL import Image from popolo_data.importer import Popolo small_image_folder = r"..\www\docs\images\mps" @@ -25,64 +24,71 @@ def get_id_lookup(): - """ - create id lookup from popolo file - convert datadotparl_id to parlparse - """ - people_url = "https://github.com/mysociety/parlparse/raw/master/members/people.json" - pop = Popolo.from_url(people_url) - count = 0 - lookup = {} - print ("Creating id lookup") - for p in pop.persons: - id = p.id - datadotparl = p.identifier_value("datadotparl_id") - if datadotparl: - lookup[datadotparl] = id[-5:] - count += 1 - print (count, len(pop.persons)) - return lookup - -image_format = "https://members-api.parliament.uk/api/Members/{0}/Portrait?CropType=ThreeFour" + """ + create id lookup from popolo file + convert datadotparl_id to parlparse + """ + people_url = "https://github.com/mysociety/parlparse/raw/master/members/people.json" + pop = Popolo.from_url(people_url) + count = 0 + lookup = {} + print("Creating id lookup") + for p in pop.persons: + id = p.id + datadotparl = p.identifier_value("datadotparl_id") + if datadotparl: + lookup[datadotparl] = id[-5:] + count += 1 + print(count, len(pop.persons)) + return lookup + + +image_format = ( + "https://members-api.parliament.uk/api/Members/{0}/Portrait?CropType=ThreeFour" +) + + def get_image_url(id): - return image_format.format(id) + return image_format.format(id) + def download_and_resize(mp_id, parlparse): - filename = "{0}.jpg".format(parlparse) - alt_filename = "{0}.jpeg".format(parlparse) - small_path = os.path.join(small_image_folder, filename) - small_path_alt = os.path.join(small_image_folder, alt_filename) - large_path = os.path.join(large_image_folder, filename) - temp_path = os.path.join(gettempdir(),"{0}.jpg".format(mp_id)) - image_url = get_image_url(mp_id) - try: - urlretrieve(image_url, temp_path) - except Exception: - return None - print ("downloaded: {0}".format(image_url)) - image = Image.open(temp_path) - if exists(large_path) is False: - image.thumbnail((120, 160)) - image.save(large_path, quality=95) - if not exists(small_path) and not exists(small_path_alt): - image.thumbnail((60, 80)) - image.save(small_path, quality=95) - image.close() - os.remove(temp_path) + filename = "{0}.jpg".format(parlparse) + alt_filename = "{0}.jpeg".format(parlparse) + small_path = os.path.join(small_image_folder, filename) + small_path_alt = os.path.join(small_image_folder, alt_filename) + large_path = os.path.join(large_image_folder, filename) + temp_path = os.path.join(gettempdir(), "{0}.jpg".format(mp_id)) + image_url = get_image_url(mp_id) + try: + urlretrieve(image_url, temp_path) + except Exception: + return None + print("downloaded: {0}".format(image_url)) + image = Image.open(temp_path) + if exists(large_path) is False: + image.thumbnail((120, 160)) + image.save(large_path, quality=95) + if not exists(small_path) and not exists(small_path_alt): + image.thumbnail((60, 80)) + image.save(small_path, quality=95) + image.close() + os.remove(temp_path) + def get_images(): - """ - fetch image if available - """ - lookup = get_id_lookup() - - for datadotparl, parlparse in lookup.items(): - - filename = "{0}.jpg".format(parlparse) - small_path = os.path.join(small_image_folder, filename) - large_path = os.path.join(large_image_folder, filename) - if exists(large_path) is False or exists(small_path) is False: - download_and_resize(datadotparl, parlparse) + """ + fetch image if available + """ + lookup = get_id_lookup() + + for datadotparl, parlparse in lookup.items(): + filename = "{0}.jpg".format(parlparse) + small_path = os.path.join(small_image_folder, filename) + large_path = os.path.join(large_image_folder, filename) + if exists(large_path) is False or exists(small_path) is False: + download_and_resize(datadotparl, parlparse) + if __name__ == "__main__": - get_images() + get_images() diff --git a/scripts/future-fetch.py b/scripts/future-fetch.py index 9bc099bfa1..5f90fab3a6 100644 --- a/scripts/future-fetch.py +++ b/scripts/future-fetch.py @@ -1,48 +1,51 @@ #!/usr/bin/env python3 # encoding: utf-8 +import datetime import json import os -import sys import re +import sys import urllib.request -import MySQLdb -import datetime +import MySQLdb # Set up commonlib pylib package_dir = os.path.abspath(os.path.split(__file__)[0]) sys.path.append(os.path.normpath(package_dir + "/../commonlib/pylib")) # And from that, get the config -from mysociety import config +from mysociety import config # noqa:E402 + config.set_file(os.path.abspath(package_dir + "/../conf/general")) # And now we have config, find parlparse -sys.path.append(os.path.normpath(config.get('PWMEMBERS') + '../pyscraper')) +sys.path.append(os.path.normpath(config.get("PWMEMBERS") + "../pyscraper")) # This name matching could be done a lot better -from resolvemembernames import memberList -from lords.resolvenames import lordsList +from lords.resolvenames import lordsList # noqa:E402 +from resolvemembernames import memberList # noqa:E402 -CALENDAR_LINK = 'https://whatson.parliament.uk/%(place)s/%(iso)s/' -CALENDAR_BASE = 'https://whatson-api.parliament.uk/calendar/events/list.json?queryParameters.startDate=%(date)s' +CALENDAR_LINK = "https://whatson.parliament.uk/%(place)s/%(iso)s/" +CALENDAR_BASE = "https://whatson-api.parliament.uk/calendar/events/list.json?queryParameters.startDate=%(date)s" positions = {} def fetch_url(date): - data = CALENDAR_BASE % {'date': date} + data = CALENDAR_BASE % {"date": date} data = urllib.request.urlopen(data) data = json.load(data) return data + def get_calendar_events(): date = datetime.date.today() data = fetch_url(date) - data = sorted(data, key=lambda x: x['StartDate'] + x['StartTime']) + data = sorted(data, key=lambda x: x["StartDate"] + x["StartTime"]) for event in data: yield Entry(event) + def make_time(t): if t and len(t) == 5: t += ":00" @@ -54,109 +57,120 @@ class Entry(object): modified = None deleted = 0 link_calendar = None - link_external = '' - body = 'uk' + link_external = "" + body = "uk" chamber = None event_date = None time_start = None time_end = None - committee_name = '' - debate_type = '' - title = '' + committee_name = "" + debate_type = "" + title = "" witnesses = None - witnesses_str = '' - location = '' + witnesses_str = "" + location = "" def __init__(self, event): - house = event['House'] # Lords / Commons / Joint - chamber = event['Type'] # Select & Joint Committees, General Committee, Grand Committee, Main Chamber, Westminster Hall - - if chamber == 'Select & Joint Committees': - house_url = 'committees' - if house == 'Joint': - self.chamber = 'Joint Committee' + house = event["House"] # Lords / Commons / Joint + chamber = event[ + "Type" + ] # Select & Joint Committees, General Committee, Grand Committee, Main Chamber, Westminster Hall + + if chamber == "Select & Joint Committees": + house_url = "committees" + if house == "Joint": + self.chamber = "Joint Committee" else: - self.chamber = '%s: Select Committee' % house + self.chamber = "%s: Select Committee" % house else: - self.chamber = '%s: %s' % (house, chamber) + self.chamber = "%s: %s" % (house, chamber) house_url = house.lower() # Two separate ID flows, for committees and not, it appears # We only have the one primary key - self.id = event['Id'] - if house_url == 'committees': + self.id = event["Id"] + if house_url == "committees": self.id += 1000000 - self.event_date = event['StartDate'][0:10] - self.time_start = make_time(event['StartTime']) - self.time_end = make_time(event['EndTime']) - self.link_calendar = CALENDAR_LINK % {'place': house_url, 'iso': self.event_date} - - if event['Category'] == "Prime Minister's Question Time": - self.debate_type = 'Oral questions' - self.title = event['Category'] + self.event_date = event["StartDate"][0:10] + self.time_start = make_time(event["StartTime"]) + self.time_end = make_time(event["EndTime"]) + self.link_calendar = CALENDAR_LINK % { + "place": house_url, + "iso": self.event_date, + } + + if event["Category"] == "Prime Minister's Question Time": + self.debate_type = "Oral questions" + self.title = event["Category"] else: - self.debate_type = event['Category'] - self.title = event['Description'] or '' + self.debate_type = event["Category"] + self.title = event["Description"] or "" - committee = event['Committee'] + committee = event["Committee"] if committee: - self.committee_name = committee['Description'] or '' - subject = (committee['Inquiries'] or [{}])[0].get('Description') + self.committee_name = committee["Description"] or "" + subject = (committee["Inquiries"] or [{}])[0].get("Description") if subject and self.title: - self.title += ': ' + subject + self.title += ": " + subject elif subject: self.title = subject self.people = [] - for member in event['Members']: - id = str(member['Id']) + for member in event["Members"]: + id = str(member["Id"]) match = memberList.match_by_mnis(id, self.event_date) if not match: match = lordsList.match_by_mnis(id, self.event_date) if match: self.people.append( - int(match['id'].replace('uk.org.publicwhip/person/', '')) + int(match["id"].replace("uk.org.publicwhip/person/", "")) ) self.witnesses = [] witnesses_str = [] - for activity in event['EventActivities'] or []: - for attendee in activity['Attendees']: - m = re.match(r'\b(\w+ \w+ MP)', attendee) + for activity in event["EventActivities"] or []: + for attendee in activity["Attendees"]: + m = re.match(r"\b(\w+ \w+ MP)", attendee) if m: mp = m.group(1) id, name, cons = memberList.matchfullnamecons( mp, None, self.event_date ) if id: - pid = int(id.replace('uk.org.publicwhip/person/', '')) + pid = int(id.replace("uk.org.publicwhip/person/", "")) mp_link = '%s' % (pid, mp) self.witnesses.append(pid) witnesses_str.append(attendee.replace(mp, mp_link)) continue witnesses_str.append(attendee) - self.witnesses_str = '\n'.join(witnesses_str) + self.witnesses_str = "\n".join(witnesses_str) - self.location = event['Location'] or '' + self.location = event["Location"] or "" def get_tuple(self): return ( - self.id, self.deleted, - self.link_calendar, self.link_external, - self.body, self.chamber, - self.event_date, self.time_start, self.time_end, + self.id, + self.deleted, + self.link_calendar, + self.link_external, + self.body, + self.chamber, + self.event_date, + self.time_start, + self.time_end, self.committee_name, self.debate_type, self.title, self.witnesses_str, self.location, - ) + ) def add(self): # TODO This function needs to insert into Xapian as well, or store to # insert in one go at the end - db_cursor.execute("""INSERT INTO future ( + db_cursor.execute( + """INSERT INTO future ( id, modified, deleted, link_calendar, link_external, body, chamber, @@ -170,8 +184,9 @@ def add(self): %s, %s, %s, %s, %s, %s, %s, %s - )""", self.get_tuple() - ) + )""", + self.get_tuple(), + ) self.update_people(delete_old=False) @@ -181,14 +196,14 @@ def update_people(self, delete_old=True): if delete_old: db_cursor.execute( - 'DELETE FROM future_people where calendar_id = %s', - (self.id,)) + "DELETE FROM future_people where calendar_id = %s", (self.id,) + ) db_cursor.executemany( - '''INSERT INTO future_people(calendar_id, person_id, witness) - VALUES (%s, %s, %s)''', - new_people + new_witnesses - ) + """INSERT INTO future_people(calendar_id, person_id, witness) + VALUES (%s, %s, %s)""", + new_people + new_witnesses, + ) def update(self): event_tuple = self.get_tuple() @@ -213,27 +228,27 @@ def update(self): WHERE id = %s """, - event_tuple[1:] + (self.id,) - ) + event_tuple[1:] + (self.id,), + ) self.update_people() + db_connection = MySQLdb.connect( - host=config.get('TWFY_DB_HOST'), - db=config.get('TWFY_DB_NAME'), - user=config.get('TWFY_DB_USER'), - passwd=config.get('TWFY_DB_PASS'), - charset='utf8', - ) + host=config.get("TWFY_DB_HOST"), + db=config.get("TWFY_DB_NAME"), + user=config.get("TWFY_DB_USER"), + passwd=config.get("TWFY_DB_PASS"), + charset="utf8", +) db_cursor = db_connection.cursor() - # Get the id's of entries from the future as the database sees it. # We'll delete ids from here as we go, and what's left will be things # which are no longer in Future Business. -db_cursor.execute('select id from future where event_date > CURRENT_DATE()') +db_cursor.execute("select id from future where event_date > CURRENT_DATE()") old_entries = set(db_cursor.fetchall()) for new_entry in get_calendar_events(): @@ -242,16 +257,16 @@ def update(self): positions[event_date] = positions.setdefault(event_date, 0) + 1 row_count = db_cursor.execute( - '''SELECT id, deleted, + """SELECT id, deleted, link_calendar, link_external, body, chamber, event_date, time_start, time_end, committee_name, debate_type, title, witnesses, location FROM future - WHERE id=%s''', - (id,) - ) + WHERE id=%s""", + (id,), + ) if row_count: # We have seen this event before. TODO Compare with current entry, @@ -260,12 +275,25 @@ def update(self): # For some reason the time fields come out as timedelta rather that # time, so need converting. - old_tuple = \ - old_row[0:6] + \ - (old_row[6].isoformat(), ) + \ - ((datetime.datetime.min + old_row[7]).time().isoformat() if old_row[7] is not None else None,) + \ - ((datetime.datetime.min + old_row[8]).time().isoformat() if old_row[8] is not None else None,) + \ - old_row[9:] + old_tuple = ( + old_row[0:6] + + (old_row[6].isoformat(),) + + ( + ( + (datetime.datetime.min + old_row[7]).time().isoformat() + if old_row[7] is not None + else None + ), + ) + + ( + ( + (datetime.datetime.min + old_row[8]).time().isoformat() + if old_row[8] is not None + else None + ), + ) + + old_row[9:] + ) new_tuple = new_entry.get_tuple() @@ -277,11 +305,9 @@ def update(self): new_entry.add() db_cursor.execute( - 'UPDATE future SET pos=%s WHERE id=%s', (positions[event_date], id) - ) - -db_cursor.executemany( - 'UPDATE future SET deleted=1 WHERE id=%s', tuple(old_entries) + "UPDATE future SET pos=%s WHERE id=%s", (positions[event_date], id) ) +db_cursor.executemany("UPDATE future SET deleted=1 WHERE id=%s", tuple(old_entries)) + db_connection.commit() diff --git a/scripts/photo-attribution-import.py b/scripts/photo-attribution-import.py index c967e06e86..bd21754342 100644 --- a/scripts/photo-attribution-import.py +++ b/scripts/photo-attribution-import.py @@ -4,7 +4,7 @@ import json import os import sys -import re + import MySQLdb # Set up commonlib pylib @@ -12,10 +12,13 @@ sys.path.append(os.path.normpath(package_dir + "/../commonlib/pylib")) # And from that, get the config -from mysociety import config +from mysociety import config # noqa:E402 + config.set_file(os.path.abspath(package_dir + "/../conf/general")) -filename = os.path.normpath(config.get('BASEDIR') + config.get('IMAGEPATH') + 'attribution.json') +filename = os.path.normpath( + config.get("BASEDIR") + config.get("IMAGEPATH") + "attribution.json" +) try: data = json.load(open(filename)) except OSError: @@ -23,23 +26,29 @@ sys.exit(0) db_connection = MySQLdb.connect( - host=config.get('TWFY_DB_HOST'), - db=config.get('TWFY_DB_NAME'), - user=config.get('TWFY_DB_USER'), - passwd=config.get('TWFY_DB_PASS'), - charset='utf8', + host=config.get("TWFY_DB_HOST"), + db=config.get("TWFY_DB_NAME"), + user=config.get("TWFY_DB_USER"), + passwd=config.get("TWFY_DB_PASS"), + charset="utf8", ) db_cursor = db_connection.cursor() data_blank = [r for r in data if not r["data_value"]] data_blank = [(r["person_id"], r["data_key"]) for r in data_blank] -db_cursor.executemany("""DELETE FROM personinfo - WHERE person_id=%s AND data_key=%s""", data_blank) +db_cursor.executemany( + """DELETE FROM personinfo + WHERE person_id=%s AND data_key=%s""", + data_blank, +) data = [r for r in data if r["data_value"]] data = [(r["person_id"], r["data_key"], r["data_value"]) for r in data] -db_cursor.executemany("""INSERT INTO personinfo +db_cursor.executemany( + """INSERT INTO personinfo (person_id, data_key, data_value) VALUES (%s, %s, %s) - ON DUPLICATE KEY UPDATE data_value = VALUES(data_value)""", data) + ON DUPLICATE KEY UPDATE data_value = VALUES(data_value)""", + data, +) db_connection.commit()