diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 55c4809f..3a2c2a9e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -48,8 +48,8 @@ jobs: python -m pip install --upgrade pip pip install -e .[dev] - - name: Generate databases - run: python cities/utils/csv_to_db_pipeline.py + # - name: Generate databases + # run: python cities/utils/csv_to_db_pipeline.py - name: Test diff --git a/.gitignore b/.gitignore index 0cbf1626..df8277e4 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 +30,12 @@ tests/.coverage .vscode/launch.json data/sql/counties_database.db data/sql/msa_database.db +docs/experimental_notebooks/zoning/interactions_preds.dill +docs/experimental_notebooks/zoning/population_preds.dill +docs/experimental_notebooks/zoning/waic_dict_7.pkl +docs/experimental_notebooks/zoning/waic_dict_13.pkl +docs/experimental_notebooks/zoning/waic_dict_14.pkl + .Rproj.user **/*.RData @@ -42,6 +48,8 @@ data/minneapolis/sourced/demographic/** data/minneapolis/preds/** data/minneapolis/sourced/parcel_to_census_tract_mappings/** data/minneapolis/sourced/parcel_to_parking_info_mappings/** + +data/minneapolis/.pgpass cities/deployment/tracts_minneapolis/tracts_model_guide.pkl cities/deployment/tracts_minneapolis/tracts_model_params.pth build/cities/deployment/tracts_minneapolis/tracts_model_guide.pkl diff --git a/Makefile b/Makefile index 62670a78..3fff98ae 100755 --- a/Makefile +++ b/Makefile @@ -1,10 +1,20 @@ format: FORCE ./scripts/clean.sh + +path ?= . 
+ +format_path: FORCE + ./scripts/clean_path.sh $(path) + lint: FORCE ./scripts/lint.sh test: FORCE + ./scripts/test.sh + +test_all: FORCE + ./scripts/clean.sh ./scripts/lint.sh ./scripts/test.sh ./scripts/test_notebooks.sh diff --git a/build/.env b/build/.env new file mode 100644 index 00000000..c1e54d7a --- /dev/null +++ b/build/.env @@ -0,0 +1,10 @@ +GOOGLE_CLOUD_PROJECT=cities-429602 +GOOGLE_CLOUD_BUCKET=minneapolis-basis + +ENV=dev +INSTANCE_CONNECTION_NAME=cities-429602:us-central1:cities-devel +DB_SEARCH_PATH=dev,public +HOST=34.123.100.76 +SCHEMA=minneapolis +DATABASE=cities +DB_USERNAME=postgres diff --git a/build/Dockerfile b/build/Dockerfile new file mode 100644 index 00000000..cb1144de --- /dev/null +++ b/build/Dockerfile @@ -0,0 +1,10 @@ +FROM python:3 + +WORKDIR /usr/src/app + +COPY requirements.txt ./ +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . + +CMD [ "python", "main.py" ] diff --git a/build/api/Dockerfile b/build/api/Dockerfile new file mode 100644 index 00000000..cb1144de --- /dev/null +++ b/build/api/Dockerfile @@ -0,0 +1,10 @@ +FROM python:3 + +WORKDIR /usr/src/app + +COPY requirements.txt ./ +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . 
+ +CMD [ "python", "main.py" ] diff --git a/build/api/main.py b/build/api/main.py new file mode 100644 index 00000000..fbfcea0b --- /dev/null +++ b/build/api/main.py @@ -0,0 +1,235 @@ +import os + +from typing import Annotated + +from dotenv import load_dotenv +from fastapi import FastAPI, Depends, Query +from fastapi.middleware.gzip import GZipMiddleware +import uvicorn + +import psycopg2 +from psycopg2.pool import ThreadedConnectionPool + +load_dotenv() + +ENV = os.getenv("ENV") +USERNAME = os.getenv("DB_USERNAME") +PASSWORD = os.getenv("PASSWORD") +HOST = os.getenv("HOST") +DATABASE = os.getenv("DATABASE") +DB_SEARCH_PATH = os.getenv("DB_SEARCH_PATH") +INSTANCE_CONNECTION_NAME = os.getenv("INSTANCE_CONNECTION_NAME") + +app = FastAPI() + +if ENV == "dev": + from fastapi.middleware.cors import CORSMiddleware + + origins = [ + "http://localhost", + "http://localhost:5000", + ] + app.add_middleware(CORSMiddleware, allow_origins=origins, allow_credentials=True) + +app.add_middleware(GZipMiddleware, minimum_size=1000, compresslevel=5) + + +if ENV == "dev": + host = HOST +else: + host = f"/cloudsql/{INSTANCE_CONNECTION_NAME}" + +pool = ThreadedConnectionPool( + 1, + 10, + user=USERNAME, + password=PASSWORD, + host=host,  # resolved above: TCP address in dev, Cloud SQL unix socket otherwise + database=DATABASE, + options=f"-csearch_path={DB_SEARCH_PATH}", +) + + +def get_db() -> psycopg2.extensions.connection: + db = pool.getconn() + try: + yield db + finally: + pool.putconn(db) + + +predictor = None + + +def get_predictor(db: psycopg2.extensions.connection = Depends(get_db)): + from cities.deployment.tracts_minneapolis.predict import TractsModelPredictor + + global predictor + if predictor is None: + predictor = TractsModelPredictor(db) + return predictor + + +Limit = Annotated[float, Query(ge=0, le=1)] +Radius = Annotated[float, Query(ge=0)] +Year = Annotated[int, Query(ge=2000, le=2030)] + + +@app.middleware("http") +async def add_cache_control_header(request, call_next): + response = await call_next(request) +
response.headers["Cache-Control"] = "public, max-age=300" + return response + + +if ENV == "dev": + + @app.middleware("http") + async def add_access_control_header(request, call_next): + response = await call_next(request) + response.headers["Access-Control-Allow-Origin"] = "*" + return response + + +@app.get("/demographics") +async def read_demographics( + category: Annotated[str, Query(max_length=100)], db=Depends(get_db) +): + with db.cursor() as cur: + cur.execute( + """ + select tract_id, "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022" + from api__demographics where description = %s + """, + (category,), + ) + return [[desc[0] for desc in cur.description]] + cur.fetchall() + + +@app.get("/census-tracts") +async def read_census_tracts(year: Year, db=Depends(get_db)): + with db.cursor() as cur: + cur.execute("select * from api__census_tracts where year_ = %s", (year,)) + row = cur.fetchone() + + return row[1] if row is not None else None + + +@app.get("/high-frequency-transit-lines") +async def read_high_frequency_transit_lines(year: Year, db=Depends(get_db)): + with db.cursor() as cur: + cur.execute( + """ + select line_geom_json + from api__high_frequency_transit_lines + where make_date(%s, 1, 1) <@ valid + """, + (year,), + ) + row = cur.fetchone() + + return row[0] if row is not None else None + + +@app.get("/high-frequency-transit-stops") +async def read_high_frequency_transit_stops(year: Year, db=Depends(get_db)): + with db.cursor() as cur: + cur.execute( + """ + select stop_geom_json + from api__high_frequency_transit_lines + where make_date(%s, 1, 1) <@ valid + """, + (year,), + ) + row = cur.fetchone() + + return row[0] if row is not None else None + + +@app.get("/yellow-zone") +async def read_yellow_zone( + year: Year, line_radius: Radius, stop_radius: Radius, db=Depends(get_db) +): + with db.cursor() as cur: + cur.execute( + """ + select + st_asgeojson(st_transform(st_union(st_buffer(line_geom, %s,
'quad_segs=4'), st_buffer(stop_geom, %s, 'quad_segs=4')), 4269))::json + from api__high_frequency_transit_lines + where make_date(%s, 1, 1) <@ valid + """, + (line_radius, stop_radius, year), + ) + row = cur.fetchone() + + if row is None: + return None + + return { + "type": "FeatureCollection", + "features": [ + {"type": "Feature", "properties": {"id": "0"}, "geometry": row[0]} + ], + } + + +@app.get("/blue-zone") +async def read_blue_zone(year: Year, radius: Radius, db=Depends(get_db)): + with db.cursor() as cur: + cur.execute( + """ + select st_asgeojson(st_transform(st_buffer(line_geom, %s, 'quad_segs=4'), 4269))::json + from api__high_frequency_transit_lines + where make_date(%s, 1, 1) <@ valid + """, + (radius, year), + ) + row = cur.fetchone() + + if row is None: + return None + + return { + "type": "FeatureCollection", + "features": [ + {"type": "Feature", "properties": {"id": "0"}, "geometry": row[0]} + ], + } + + +@app.get("/predict") +async def read_predict( + blue_zone_radius: Radius, + yellow_zone_line_radius: Radius, + yellow_zone_stop_radius: Radius, + blue_zone_limit: Limit, + yellow_zone_limit: Limit, + year: Year, + db=Depends(get_db), + predictor=Depends(get_predictor), +): + result = predictor.predict_cumulative( + db, + intervention=( + { + "radius_blue": blue_zone_radius, + "limit_blue": blue_zone_limit, + "radius_yellow_line": yellow_zone_line_radius, + "radius_yellow_stop": yellow_zone_stop_radius, + "limit_yellow": yellow_zone_limit, + "reform_year": year, + } + ), + ) + return { + "census_tracts": [str(t) for t in result["census_tracts"]], + "housing_units_factual": [t.item() for t in result["housing_units_factual"]], + "housing_units_counterfactual": [ + t.tolist() for t in result["housing_units_counterfactual"] + ], + } + + +if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 8000))) diff --git a/build/api/postgrest.conf b/build/api/postgrest.conf new file mode 100644 index 00000000..ddb71965 --- /dev/null
+++ b/build/api/postgrest.conf @@ -0,0 +1,107 @@ +## Admin server used for checks. It's disabled by default unless a port is specified. +# admin-server-port = 3001 + +## The database role to use when no client authentication is provided +db-anon-role = "web_anon" + +## Notification channel for reloading the schema cache +db-channel = "pgrst" + +## Enable or disable the notification channel +db-channel-enabled = true + +## Enable in-database configuration +db-config = true + +## Function for in-database configuration +## db-pre-config = "postgrest.pre_config" + +## Extra schemas to add to the search_path of every request +db-extra-search-path = "public" + +## Limit rows in response +# db-max-rows = 1000 + +## Allow getting the EXPLAIN plan through the `Accept: application/vnd.pgrst.plan` header +# db-plan-enabled = false + +## Number of open connections in the pool +db-pool = 10 + +## Time in seconds to wait to acquire a slot from the connection pool +# db-pool-acquisition-timeout = 10 + +## Time in seconds after which to recycle pool connections +# db-pool-max-lifetime = 1800 + +## Time in seconds after which to recycle unused pool connections +# db-pool-max-idletime = 30 + +## Allow automatic database connection retrying +# db-pool-automatic-recovery = true + +## Stored proc to exec immediately after auth +# db-pre-request = "stored_proc_name" + +## Enable or disable prepared statements. disabling is only necessary when behind a connection pooler. +## When disabled, statements will be parametrized but won't be prepared. 
+db-prepared-statements = true + +## The name of which database schema to expose to REST clients +db-schemas = "api" + +## How to terminate database transactions +## Possible values are: +## commit (default) +## Transaction is always committed, this cannot be overridden +## commit-allow-override +## Transaction is committed, but can be overridden with Prefer tx=rollback header +## rollback +## Transaction is always rolled back, this cannot be overridden +## rollback-allow-override +## Transaction is rolled back, but can be overridden with Prefer tx=commit header +db-tx-end = "commit" + +## The standard connection URI format, documented at +## https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING +db-uri = "postgresql://postgres@34.123.100.76:5432/cities" + +# jwt-aud = "your_audience_claim" + +## Jspath to the role claim key +jwt-role-claim-key = ".role" + +## Choose a secret, JSON Web Key (or set) to enable JWT auth +## (use "@filename" to load from separate file) +# jwt-secret = "secret_with_at_least_32_characters" +jwt-secret-is-base64 = false + +## Enables and sets JWT Cache max lifetime, disables caching with 0 +# jwt-cache-max-lifetime = 0 + +## Logging level, the admitted values are: crit, error, warn, info and debug. +log-level = "error" + +## Determine if the OpenAPI output should follow or ignore role privileges or be disabled entirely.
+## Admitted values: follow-privileges, ignore-privileges, disabled +openapi-mode = "follow-privileges" + +## Base url for the OpenAPI output +openapi-server-proxy-uri = "" + +## Configurable CORS origins +# server-cors-allowed-origins = "" + +server-host = "!4" +server-port = 3001 + +## Allow getting the request-response timing information through the `Server-Timing` header +server-timing-enabled = true + +## Unix socket location +## if specified it takes precedence over server-port +# server-unix-socket = "/tmp/pgrst.sock" + +## Unix socket file mode +## When none is provided, 660 is applied by default +# server-unix-socket-mode = "660" diff --git a/build/api/requirements.txt b/build/api/requirements.txt new file mode 100644 index 00000000..95cd7505 --- /dev/null +++ b/build/api/requirements.txt @@ -0,0 +1,183 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile --extra=api --output-file=api/requirements.txt +# +annotated-types==0.7.0 + # via pydantic +anyio==4.4.0 + # via + # httpx + # starlette + # watchfiles +certifi==2024.8.30 + # via + # httpcore + # httpx +chirho @ git+https://github.com/BasisResearch/chirho.git + # via cities (setup.py) +click==8.1.7 + # via + # typer + # uvicorn +contourpy==1.3.0 + # via matplotlib +cycler==0.12.1 + # via matplotlib +dill==0.3.8 + # via cities (setup.py) +dnspython==2.6.1 + # via email-validator +email-validator==2.2.0 + # via fastapi +fastapi[standard]==0.114.0 + # via cities (setup.py) +fastapi-cli[standard]==0.0.5 + # via fastapi +filelock==3.16.0 + # via torch +fonttools==4.53.1 + # via matplotlib +fsspec==2024.9.0 + # via torch +h11==0.14.0 + # via + # httpcore + # uvicorn +httpcore==1.0.5 + # via httpx +httptools==0.6.1 + # via uvicorn +httpx==0.27.2 + # via fastapi +idna==3.8 + # via + # anyio + # email-validator + # httpx +jinja2==3.1.4 + # via + # fastapi + # torch +joblib==1.4.2 + # via scikit-learn +kiwisolver==1.4.7 + # via matplotlib 
+markdown-it-py==3.0.0 + # via rich +markupsafe==2.1.5 + # via jinja2 +matplotlib==3.9.2 + # via cities (setup.py) +mdurl==0.1.2 + # via markdown-it-py +mpmath==1.3.0 + # via sympy +networkx==3.3 + # via torch +numpy==2.1.1 + # via + # cities (setup.py) + # contourpy + # matplotlib + # opt-einsum + # pandas + # pyro-ppl + # scikit-learn + # scipy +opt-einsum==3.3.0 + # via pyro-ppl +packaging==24.1 + # via + # matplotlib + # plotly +pandas==2.2.2 + # via cities (setup.py) +pillow==10.4.0 + # via matplotlib +plotly==5.24.0 + # via cities (setup.py) +psycopg2==2.9.9 + # via cities (setup.py) +pydantic==2.9.1 + # via fastapi +pydantic-core==2.23.3 + # via pydantic +pygments==2.18.0 + # via rich +pyparsing==3.1.4 + # via matplotlib +pyro-api==0.1.2 + # via pyro-ppl +pyro-ppl==1.8.6 + # via + # chirho + # cities (setup.py) +python-dateutil==2.9.0.post0 + # via + # matplotlib + # pandas +python-dotenv==1.0.1 + # via uvicorn +python-multipart==0.0.9 + # via fastapi +pytz==2024.1 + # via pandas +pyyaml==6.0.2 + # via uvicorn +rich==13.8.0 + # via typer +scikit-learn==1.5.1 + # via cities (setup.py) +scipy==1.14.1 + # via scikit-learn +shellingham==1.5.4 + # via typer +six==1.16.0 + # via python-dateutil +sniffio==1.3.1 + # via + # anyio + # httpx +sqlalchemy==2.0.34 + # via cities (setup.py) +starlette==0.38.5 + # via fastapi +sympy==1.13.2 + # via torch +tenacity==9.0.0 + # via plotly +threadpoolctl==3.5.0 + # via scikit-learn +torch==2.4.1 + # via + # cities (setup.py) + # pyro-ppl +tqdm==4.66.5 + # via pyro-ppl +typer==0.12.5 + # via fastapi-cli +typing-extensions==4.12.2 + # via + # fastapi + # pydantic + # pydantic-core + # sqlalchemy + # torch + # typer +tzdata==2024.1 + # via pandas +uvicorn[standard]==0.30.6 + # via + # fastapi + # fastapi-cli +uvloop==0.20.0 + # via uvicorn +watchfiles==0.24.0 + # via uvicorn +websockets==13.0.1 + # via uvicorn + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git 
a/build/api/schema.sql b/build/api/schema.sql new file mode 100644 index 00000000..2285c2b7 --- /dev/null +++ b/build/api/schema.sql @@ -0,0 +1,67 @@ +begin; +drop schema if exists api cascade; + +create schema api; + +create view api.demographics as ( + select * from api__demographics +); + +create view api.census_tracts as ( + select * from api__census_tracts +); + +create function api.high_frequency_transit_lines() returns setof dev.api__high_frequency_transit_lines as $$ + select * from dev.api__high_frequency_transit_lines +$$ language sql; + +create function api.high_frequency_transit_lines( + blue_zone_radius double precision, + yellow_zone_line_radius double precision, + yellow_zone_stop_radius double precision +) returns table ( + valid daterange, + geom geometry(LineString, 4269), + blue_zone_geom geometry(LineString, 4269), + yellow_zone_geom geometry(Geometry, 4269) +) as $$ + with + lines as (select * from dev.stg_high_frequency_transit_lines_union), + stops as (select * from dev.high_frequency_transit_stops), + lines_and_stops as ( + select + lines.valid * stops.valid as valid, + lines.geom as line_geom, + stops.geom as stop_geom + from lines inner join stops on lines.valid && stops.valid + ) + select + valid, + st_transform(line_geom, 4269) as geom, + st_transform(st_buffer(line_geom, blue_zone_radius), 4269) as blue_zone_geom, + st_transform(st_union(st_buffer(line_geom, yellow_zone_line_radius), st_buffer(stop_geom, yellow_zone_stop_radius)), 4269) as yellow_zone_geom + from lines_and_stops +$$ language sql; + +do $$ +begin +create role web_anon nologin; +exception when duplicate_object then raise notice '%, skipping', sqlerrm using errcode = sqlstate; +end +$$; + +grant all on schema public to web_anon; +grant all on schema dev to web_anon; +grant select on table public.spatial_ref_sys TO web_anon; +grant usage on schema api to web_anon; +grant all on all tables in schema api to web_anon; +grant all on all functions in schema api to web_anon; 
+grant all on schema api to web_anon; +GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA dev TO web_anon; +GRANT ALL PRIVILEGES ON ALL functions IN SCHEMA dev TO web_anon; +GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA api TO web_anon; +GRANT ALL PRIVILEGES ON ALL functions IN SCHEMA api TO web_anon; +GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO web_anon; +GRANT ALL PRIVILEGES ON ALL functions IN SCHEMA public TO web_anon; +grant web_anon to postgres; +commit; diff --git a/build/cities/__init__.py b/build/cities/__init__.py new file mode 100644 index 00000000..f993e182 --- /dev/null +++ b/build/cities/__init__.py @@ -0,0 +1,6 @@ +"""**cities** + +Project short description. +""" + +__version__ = "0.0.1" diff --git a/build/cities/deployment/tracts_minneapolis/.gitignore b/build/cities/deployment/tracts_minneapolis/.gitignore new file mode 100644 index 00000000..5304474d --- /dev/null +++ b/build/cities/deployment/tracts_minneapolis/.gitignore @@ -0,0 +1,2 @@ +*.pth +*.pkl \ No newline at end of file diff --git a/build/cities/deployment/tracts_minneapolis/__init__.py b/build/cities/deployment/tracts_minneapolis/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/build/cities/deployment/tracts_minneapolis/generate_torch_loader.py b/build/cities/deployment/tracts_minneapolis/generate_torch_loader.py new file mode 100644 index 00000000..c07d8107 --- /dev/null +++ b/build/cities/deployment/tracts_minneapolis/generate_torch_loader.py @@ -0,0 +1,87 @@ +import os +import time + +import sqlalchemy +import torch +from dotenv import load_dotenv + +from cities.utils.data_grabber import find_repo_root +from cities.utils.data_loader import ZoningDataset, select_from_sql + +load_dotenv() + +local_user = os.getenv("USER") +if local_user == "rafal": + load_dotenv(os.path.expanduser("~/.env_pw")) +# local torch loader is needed for subsampling in evaluation, comparison to the previous dataset and useful for ED +DB_USERNAME = os.getenv("DB_USERNAME") +HOST = 
os.getenv("HOST") +DATABASE = os.getenv("DATABASE") +PASSWORD = os.getenv("PASSWORD") + + +##################### +# data load and prep +##################### + +kwargs = { + "categorical": ["year", "census_tract"], + "continuous": { + "housing_units", + "housing_units_original", + "total_value", + "total_value_original", + "median_value", + "mean_limit_original", + "median_distance", + "income", + "segregation_original", + "white_original", + "parcel_sqm", + }, + "outcome": "housing_units", +} + +load_start = time.time() +with sqlalchemy.create_engine( + f"postgresql://{DB_USERNAME}:{PASSWORD}@{HOST}/{DATABASE}" +).connect() as conn: + subset = select_from_sql( + "select * from dev.tracts_model__census_tracts order by census_tract, year", + conn, + kwargs, + ) +load_end = time.time() +print(f"Data loaded in {load_end - load_start} seconds") + + +columns_to_standardize = [ + "housing_units_original", + "total_value_original", +] + +new_standardization_dict = {} + +for column in columns_to_standardize: + new_standardization_dict[column] = { + "mean": subset["continuous"][column].mean(), + "std": subset["continuous"][column].std(), + } + + +assert "parcel_sqm" in subset["continuous"].keys() + +root = find_repo_root() + +pg_census_tracts_dataset = ZoningDataset( + subset["categorical"], + subset["continuous"], + standardization_dictionary=new_standardization_dict, +) +assert "parcel_sqm" in subset["continuous"].keys() + +pg_census_tracts_data_path = os.path.join( + root, "data/minneapolis/processed/pg_census_tracts_dataset.pt" +) + +torch.save(pg_census_tracts_dataset, pg_census_tracts_data_path) diff --git a/build/cities/deployment/tracts_minneapolis/predict.py b/build/cities/deployment/tracts_minneapolis/predict.py new file mode 100644 index 00000000..8ae4ac43 --- /dev/null +++ b/build/cities/deployment/tracts_minneapolis/predict.py @@ -0,0 +1,343 @@ +import copy +import os + +import dill +import pandas as pd +import pyro +import torch +from 
chirho.counterfactual.handlers import MultiWorldCounterfactual +from chirho.indexed.ops import IndexSet, gather +from chirho.interventional.handlers import do +from dotenv import load_dotenv +from pyro.infer import Predictive + +# from cities.modeling.zoning_models.zoning_tracts_sqm_model import ( +# TractsModelSqm as TractsModel, +# ) + +from cities.modeling.zoning_models.zoning_tracts_continuous_interactions_model import ( + TractsModelContinuousInteractions as TractsModel, +) +from cities.utils.data_grabber import find_repo_root +from cities.utils.data_loader import select_from_data, select_from_sql + +load_dotenv() + +local_user = os.getenv("USER") +if local_user == "rafal": + load_dotenv(os.path.expanduser("~/.env_pw")) + + +class TractsModelPredictor: + kwargs = { + "categorical": ["year", "year_original", "census_tract",], + "continuous": { + "housing_units", + "housing_units_original", + "total_value", + "median_value", + "mean_limit_original", + "median_distance", + "income", + "segregation_original", + "white_original", + "parcel_sqm", + 'downtown_overlap', + 'university_overlap', + }, + "outcome": "housing_units", + } + + kwargs_subset = { + "categorical": ["year", "year_original", "census_tract"], + "continuous": { + "housing_units", + "total_value", + "median_value", + "mean_limit_original", + "median_distance", + "income", + "segregation_original", + "white_original", + "parcel_sqm", + 'downtown_overlap', + 'university_overlap', + }, + "outcome": "housing_units", + } + + + + parcel_intervention_sql = """ + select + census_tract, + year_, + case + when downtown_yn then 0 + when not downtown_yn + and year_ >= %(reform_year)s + and distance_to_transit <= %(radius_blue)s + then %(limit_blue)s + when not downtown_yn + and year_ >= %(reform_year)s + and distance_to_transit > %(radius_blue)s + and (distance_to_transit_line <= %(radius_yellow_line)s + or distance_to_transit_stop <= %(radius_yellow_stop)s) + then %(limit_yellow)s + when not downtown_yn + and 
year_ >= %(reform_year)s + and distance_to_transit_line > %(radius_yellow_line)s + and distance_to_transit_stop > %(radius_yellow_stop)s + then 1 + else limit_con + end as intervention + from tracts_model__parcels + """ + + tracts_intervention_sql = f""" + with parcel_interventions as ({parcel_intervention_sql}) + select + census_tract, + year_, + avg(intervention) as intervention + from parcel_interventions + group by census_tract, year_ + order by census_tract, year_ + """ + + def __init__(self, conn): + self.conn = conn + + root = find_repo_root() + deploy_path = os.path.join(root, "cities/deployment/tracts_minneapolis") + + guide_path = os.path.join(deploy_path, "tracts_model_guide.pkl") + self.param_path = os.path.join(deploy_path, "tracts_model_params.pth") + + need_to_train_flag = False + if not os.path.isfile(guide_path): + need_to_train_flag = True + print(f"Warning: '{guide_path}' does not exist.") + if not os.path.isfile(self.param_path): + need_to_train_flag = True + print(f"Warning: '{self.param_path}' does not exist.") + + if need_to_train_flag: + print("Please run 'train_model.py' to generate the required files.") + + with open(guide_path, "rb") as file: + guide = dill.load(file) + + self.data = select_from_sql( + "select * from tracts_model__census_tracts order by census_tract, year", + conn, + TractsModelPredictor.kwargs, + ) + + + # set to zero whenever the university overlap is above 1 + # TODO this should be handled at the data processing stage + self.data['continuous']['mean_limit_original'] = torch.where(self.data['continuous']['university_overlap'] > 1, + torch.zeros_like(self.data['continuous']['mean_limit_original']), + self.data['continuous']['mean_limit_original']) + + + self.subset = select_from_data(self.data, TractsModelPredictor.kwargs_subset) + + + self.years = self.data["categorical"]["year_original"] + self.year_ids = self.data['categorical']["year"] + self.tracts = self.data["categorical"]["census_tract"] + + + categorical_levels 
= { + "year": torch.unique(self.subset["categorical"]["year"]), + "year_original": torch.unique(self.subset["categorical"]["year_original"]), + "census_tract": torch.unique(self.subset["categorical"]["census_tract"]), + } + + self.housing_units_std = self.data["continuous"]["housing_units_original"].std() + self.housing_units_mean = self.data["continuous"][ + "housing_units_original" + ].mean() + + #interaction_pairs + ins = [ + ("university_overlap", "limit"), + ("downtown_overlap", "limit"), + ("distance", "downtown_overlap"), + ("distance", "university_overlap"), + ("distance", "limit"), + ("median_value", "segregation"), + ("distance", "segregation"), + ("limit", "sqm"), + ("segregation", "sqm"), + ("distance", "white"), + ("income", "limit"), + ("downtown_overlap", "median_value"), + ("downtown_overlap", "segregation"), + ("median_value", "white"), + ("distance", "income"), + ] + + + model = TractsModel(**self.subset, categorical_levels=categorical_levels, + housing_units_continuous_interaction_pairs=ins) + + self.predictive = Predictive(model=model, guide=guide, num_samples=100) + + # these are at the tracts level + def _tracts_intervention( + self, + conn, + radius_blue, + limit_blue, + radius_yellow_line, + radius_yellow_stop, + limit_yellow, + reform_year, + ): + params = { + "reform_year": reform_year, + "radius_blue": radius_blue, + "limit_blue": limit_blue, + "radius_yellow_line": radius_yellow_line, + "radius_yellow_stop": radius_yellow_stop, + "limit_yellow": limit_yellow, + } + df = pd.read_sql( + TractsModelPredictor.tracts_intervention_sql, conn, params=params + ) + return torch.tensor(df["intervention"].values, dtype=torch.float32) + + def predict_cumulative(self, conn, intervention): + """Predict the total number of housing units built from 2011-2020 under intervention. 
+ + Returns a dictionary with keys: + - 'census_tracts': the tracts considered + - 'housing_units_factual': total housing units built according to real housing data + - 'housing_units_counterfactual': samples from prediction of total housing units built + """ + pyro.clear_param_store() + pyro.get_param_store().load(self.param_path) + + subset_for_preds = copy.deepcopy(self.subset) + subset_for_preds["continuous"]["housing_units"] = None + + limit_intervention = self._tracts_intervention(conn, **intervention) + + limit_intervention = torch.where(self.data['continuous']['university_overlap'] > 2, + torch.zeros_like(limit_intervention), + limit_intervention) + + limit_intervention = torch.where(self.data['continuous']['downtown_overlap'] > 1, + torch.zeros_like(limit_intervention), + limit_intervention) + + with MultiWorldCounterfactual() as mwc: + with do(actions={"limit": limit_intervention}): + result_all = self.predictive(**subset_for_preds)["housing_units"] + with mwc: + result_f = gather( + result_all, IndexSet(**{"limit": {0}}), event_dims=0 + ).squeeze() + result_cf = gather( + result_all, IndexSet(**{"limit": {1}}), event_dims=0 + ).squeeze() + + obs_housing_units = self.data["continuous"]["housing_units_original"] + f_housing_units = (result_f * self.housing_units_std + self.housing_units_mean)#.clamp(min = 0) + cf_housing_units = (result_cf * self.housing_units_std + self.housing_units_mean)#.clamp(min = 0) + + + # calculate cumulative housing units (factual) + obs_cumsums = {} + f_cumsums = {} + cf_cumsums = {} + for key in self.tracts.unique(): + obs_units = [] + f_units = [] + cf_units = [] + for year in self.years.unique(): + obs_units.append(obs_housing_units[(self.tracts == key) & (self.years == year)]) + f_units.append(f_housing_units[:,(self.tracts == key) & (self.years == year)]) + cf_units.append(cf_housing_units[:,(self.tracts == key) & (self.years == year)]) + + obs_cumsum = torch.cumsum(torch.stack(obs_units), dim = 0).flatten() + f_cumsum = 
torch.cumsum(torch.stack(f_units), dim = 0).squeeze() + cf_cumsum = torch.cumsum(torch.stack(cf_units), dim = 0).squeeze() + + obs_cumsums[key] = obs_cumsum + f_cumsums[key] = f_cumsum + cf_cumsums[key] = cf_cumsum + + + # presumably outdated + + tracts = self.data["categorical"]["census_tract"] + + # calculate cumulative housing units (factual) + f_totals = {} + for i in range(tracts.shape[0]): + key = tracts[i].item() + if key not in f_totals: + f_totals[key] = 0 + f_totals[key] += obs_housing_units[i] + + # calculate cumulative housing units (counterfactual) + cf_totals = {} + for i in range(tracts.shape[0]): + year = self.years[i].item() + key = tracts[i].item() + if key not in cf_totals: + cf_totals[key] = 0 + if year < intervention["reform_year"]: + cf_totals[key] += obs_housing_units[i] + else: + cf_totals[key] = cf_totals[key] + cf_housing_units[:, i] + cf_totals = {k: torch.clamp(v, 0) for k, v in cf_totals.items()} + + census_tracts = list(cf_totals.keys()) + f_housing_units = [f_totals[k] for k in census_tracts] + cf_housing_units = [cf_totals[k] for k in census_tracts] + + + + return {"obs_cumsums": obs_cumsums, "f_cumsums": f_cumsums, "cf_cumsums": cf_cumsums, + "limit_intervention": limit_intervention, + # presumably outdated + "census_tracts": census_tracts, + "housing_units_factual": f_housing_units, + "housing_units_counterfactual": cf_housing_units,} + + + # return { + # "census_tracts": census_tracts, + # "housing_units_factual": f_housing_units, + # "housing_units_counterfactual": cf_housing_units, + # "limit_intervention": limit_intervention, + # } + + +if __name__ == "__main__": + import time + + from cities.utils.data_loader import db_connection + + with db_connection() as conn: + predictor = TractsModelPredictor(conn) + start = time.time() + + result = predictor.predict_cumulative( + conn, + intervention={ + "radius_blue": 106.7, + "limit_blue": 0, + "radius_yellow_line": 402.3, + "radius_yellow_stop": 804.7, + "limit_yellow": 0.5, + 
"reform_year": 2015, + }, + ) + end = time.time() + print(f"Counterfactual in {end - start} seconds") diff --git a/build/cities/deployment/tracts_minneapolis/tracts_model_overview/tracts_dag_plot_high_density.png b/build/cities/deployment/tracts_minneapolis/tracts_model_overview/tracts_dag_plot_high_density.png new file mode 100644 index 00000000..e6e5f6cc Binary files /dev/null and b/build/cities/deployment/tracts_minneapolis/tracts_model_overview/tracts_dag_plot_high_density.png differ diff --git a/build/cities/deployment/tracts_minneapolis/tracts_model_overview/tracts_model_overview.ipynb b/build/cities/deployment/tracts_minneapolis/tracts_model_overview/tracts_model_overview.ipynb new file mode 100644 index 00000000..18b68ce0 --- /dev/null +++ b/build/cities/deployment/tracts_minneapolis/tracts_model_overview/tracts_model_overview.ipynb @@ -0,0 +1,86 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "## What is this project about?\n", + "\n", + "We use state-of-the-art Bayesian causal modeling tools ([ChiRho](https://github.com/BasisResearch/chirho)) to investigate the role of parking zoning reform in Minneapolis on the development of new housing units, at a relatively fine-grained level of census tracts. Minneapolis is an example of a city which somewhat sucessfuly navigates the housing crisis, and a parking zoning reform has been claimed to be connected to this outcome (see for example [here](https://reason.com/2024/02/27/fear-loathing-and-zoning-reform-in-minnesota/) and [here](https://www.strongtowns.org/journal/2023/9/15/ending-minimum-parking-requirements-was-a-policy-win-for-the-twin-cities)).\n", + "\n", + "\n", + "%TODO Someone should perhaps check if there are better links to include here\n", + "\n", + "Whether this is so, to what extent and with what uncertainty has been unclear. Yes, the number of housing units in the city increased faster after the reform. 
But it is not obvious whether this isn't a mere correlation arising from other variables being causally responsible, or random variation. We decided to take a deep dive and connect detailed census tracts data with demographic variables within a carefully devised causal model to investigate. Due to data availability limitations, we start at year 2010. Since a major world-wide event changed too many things in 2020, this is where our data collection stops, to be able to separate the zoning concerns from the complex and unprecedented events that follow. It turns out that even with 10 years of data only, causal modelling allows us to offer some (admittedly, uncertain) answers." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Why this is not a typical machine learning project\n", + "\n", + "A typical predictive project in machine learning tends to use as much data as possible and algorithms to identify patterns, focusing only on predictive accuracy. While such an approach is useful, the key limitation is that such models have a hard time distinguishing accidental correlations from causal connections, and therefore are not reliable guides to counterfactual predictions and causal effect estimation. Moreover, a typical model often disregards information that humans use heavily: temporal, spatial or causal structures, which are needed to generalize well outside the training data.\n", + "\n", + "Instead, we use our core open source technology, [ChiRho](https://github.com/BasisResearch/chirho) to build **Bayesian causal models** using hand-picked relevant variables. This way, we can work with humans in the loop. The fact that we use Bayesian methods allows for the injection of human understanding of the causal dependencies, which are then made to work in symbiosis with the data, even if the latter is somewhat limited, and for honest assessment of the resulting uncertainties. 
The fact that the model is causal gives us a chance to address counterfactual queries involving alternative interventions.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Why care about different types of questions?\n", + "\n", + "Once we start thinking in causal terms, there are **multiple types of queries** that we can distinguish and answer using the model, and such questions typically have different answers. While associative information is often useful or revealing, equally often we want to be able to evaluate potential consequences of acting one way or another, and in this mode of reflection, we rather turn to thinking in terms of interventions and counterfactuals.\n", + "\n", + "- *Association*. Example: Is there a correlation between increased green spaces and decreased crime rate in an area? Perhaps, areas with more green spaces do tend to have lower crime rates for various reasons.\n", + "\n", + "- *Intervention*. Example: If the city implements a zoning change to create more green spaces, how would this impact the crime rate in the area? The answer might differ here: factors other than the policy change probably influence crime rates to a large extent.\n", + "\n", + "- *Counterfactual*. Example: Suppose you did create more green spaces and the crime rate in the area did go down. Are you to be thanked? This depends on whether the crime rate would have gone down had you not created more green space in the area. Would it?\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Counterfactual modeling of the zoning reform\n", + "\n", + "In the case at hand, we allow you, the user, to investigate predicted counterfactual outcomes of a zoning reform, specified in terms of where the two zones start, what parking limits are to be imposed in different zones, and what year the reform has been introduced. 
From among the available variables we hand-picked the ones that are most useful and meaningfully causally connected. The model simultaneously learns the strengths of over 30 causal connections and uses this information to inform its counterfactual predictions. The structural assumptions we have made at a high level can be described by the diagram below. However, a moderately competent user can use our [open source codebase](https://github.com/BasisResearch/cities) to tweak or modify these assumptions and investigate the consequences of doing so.\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"DAG\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How does the model perform?\n", + "\n", + "The causal layer, nevertheless, should not come at the cost of predictive power. The models went through a battery of tests on split data, each time being able to account for around 25-30% of the variation in the data (which for such noisy problems is fairly decent performance), effectively on average improving predictions of new housing units appearing in each census tract in each given year by a count of 35-40 over a null model. A detailed notebook with model testing is also available at our open source codebase. 
" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/build/cities/deployment/tracts_minneapolis/train_model.py b/build/cities/deployment/tracts_minneapolis/train_model.py new file mode 100644 index 00000000..82a95eab --- /dev/null +++ b/build/cities/deployment/tracts_minneapolis/train_model.py @@ -0,0 +1,114 @@ +import os +import time + +import dill +import pyro +import torch +from dotenv import load_dotenv + +from cities.modeling.svi_inference import run_svi_inference +from cities.modeling.zoning_models.zoning_tracts_continuous_interactions_model import ( + TractsModelContinuousInteractions as TractsModel, +) +from cities.utils.data_grabber import find_repo_root +from cities.utils.data_loader import db_connection, select_from_sql + +# from cities.modeling.zoning_models.zoning_tracts_model import TractsModel +# from cities.modeling.zoning_models.zoning_tracts_sqm_model import ( +# TractsModelSqm as TractsModel, +# ) + + +n_steps = 2000 + +load_dotenv() + +local_user = os.getenv("USER") +if local_user == "rafal": + load_dotenv(os.path.expanduser("~/.env_pw")) + +##################### +# data load and prep +##################### + +kwargs = { + "categorical": ["year", "census_tract"], + "continuous": { + "housing_units", + "housing_units_original", + "total_value", + "median_value", + "mean_limit_original", + "median_distance", + "income", + "segregation_original", + "white_original", + "parcel_sqm", + "downtown_overlap", + "university_overlap", + }, + "outcome": "housing_units", +} + +load_start = time.time() +with db_connection() as conn: + subset = select_from_sql( + "select * from dev.tracts_model__census_tracts order by census_tract, year", + conn, + kwargs, + ) +load_end = time.time() +print(f"Data loaded in {load_end - load_start} seconds") + +############################# +# instantiate and train model +############################# + +# interaction terms +ins = [ + 
("university_overlap", "limit"), + ("downtown_overlap", "limit"), + ("distance", "downtown_overlap"), + ("distance", "university_overlap"), + ("distance", "limit"), + ("median_value", "segregation"), + ("distance", "segregation"), + ("limit", "sqm"), + ("segregation", "sqm"), + ("distance", "white"), + ("income", "limit"), + ("downtown_overlap", "median_value"), + ("downtown_overlap", "segregation"), + ("median_value", "white"), + ("distance", "income"), +] + +# model +tracts_model = TractsModel( + **subset, + categorical_levels={ + "year": torch.unique(subset["categorical"]["year"]), + "census_tract": torch.unique(subset["categorical"]["census_tract"]), + }, + housing_units_continuous_interaction_pairs=ins, +) + +pyro.clear_param_store() + +guide = run_svi_inference(tracts_model, n_steps=n_steps, lr=0.03, plot=False, **subset) + +########################################## +# save guide and params in the same folder +########################################## +root = find_repo_root() + +deploy_path = os.path.join(root, "cities/deployment/tracts_minneapolis") +guide_path = os.path.join(deploy_path, "tracts_model_guide.pkl") +param_path = os.path.join(deploy_path, "tracts_model_params.pth") + +serialized_guide = dill.dumps(guide) +with open(guide_path, "wb") as file: + file.write(serialized_guide) + +with open(param_path, "wb") as file: + pyro.get_param_store().save(param_path) diff --git a/build/cities/modeling/__init__.py b/build/cities/modeling/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/build/cities/modeling/evaluation.py b/build/cities/modeling/evaluation.py new file mode 100644 index 00000000..5613ca54 --- /dev/null +++ b/build/cities/modeling/evaluation.py @@ -0,0 +1,300 @@ +import copy +import os +from typing import Any, Callable, Dict, Optional, Tuple, Union + +import matplotlib.pyplot as plt +import pyro +import seaborn as sns +import torch +from pyro.infer import Predictive +from torch.utils.data import DataLoader, random_split + 
+from cities.modeling.svi_inference import run_svi_inference +from cities.utils.data_grabber import find_repo_root +from cities.utils.data_loader import select_from_data + +root = find_repo_root() + + +def prep_data_for_test( + data_path: Optional[str] = None, train_size: float = 0.8 +) -> Tuple[DataLoader, DataLoader, list]: + + if data_path is None: + data_path = os.path.join(root, "data/minneapolis/processed/zoning_dataset.pt") + zoning_dataset_read = torch.load(data_path) + + train_size = int(train_size * len(zoning_dataset_read)) + test_size = len(zoning_dataset_read) - train_size + + train_dataset, test_dataset = random_split( + zoning_dataset_read, [train_size, test_size] + ) + + train_loader = DataLoader(train_dataset, batch_size=train_size, shuffle=True) + test_loader = DataLoader(test_dataset, batch_size=test_size, shuffle=False) + + categorical_levels = zoning_dataset_read.categorical_levels + + return train_loader, test_loader, categorical_levels + + +def recode_categorical( + kwarg_names: Dict[str, Any], train_loader: DataLoader, test_loader: DataLoader +) -> Tuple[Dict[str, Dict[str, torch.Tensor]], Dict[str, Dict[str, torch.Tensor]]]: + + assert all( + item in kwarg_names.keys() for item in ["categorical", "continuous", "outcome"] + ) + + train_data = next(iter(train_loader)) + test_data = next(iter(test_loader)) + + _train_data = select_from_data(train_data, kwarg_names) + _test_data = select_from_data(test_data, kwarg_names) + + #################################################### + # eliminate test categories not in the training data + #################################################### + def apply_mask(data, mask): + return {key: val[mask] for key, val in data.items()} + + mask = torch.ones(len(_test_data["outcome"]), dtype=torch.bool) + for key, value in _test_data["categorical"].items(): + mask = mask * torch.isin( + _test_data["categorical"][key], (_train_data["categorical"][key].unique()) + ) + + _test_data["categorical"] = 
apply_mask(_test_data["categorical"], mask) + _test_data["continuous"] = apply_mask(_test_data["continuous"], mask) + _test_data["outcome"] = _test_data["outcome"][mask] + + for key in _test_data["categorical"].keys(): + assert _test_data["categorical"][key].shape[0] == mask.sum() + for key in _test_data["continuous"].keys(): + assert _test_data["continuous"][key].shape[0] == mask.sum() + + # raise error if sum(mask) < .5 * len(test_data['outcome']) + if sum(mask) < 0.5 * len(_test_data["outcome"]): + raise ValueError( + "Sampled test data has too many new categorical levels, consider decreasing train size" + ) + + # #################################### + # recode categorical variables to have + # no index gaps in the training data + # #################################### + + mappings = {} + for name in _train_data["categorical"].keys(): + unique_train = torch.unique(_train_data["categorical"][name]) + mappings[name] = {v.item(): i for i, v in enumerate(unique_train)} + _train_data["categorical"][name] = torch.tensor( + [mappings[name][x.item()] for x in _train_data["categorical"][name]] + ) + _test_data["categorical"][name] = torch.tensor( + [mappings[name][x.item()] for x in _test_data["categorical"][name]] + ) + + return _train_data, _test_data + + +def test_performance( + model_or_class: Union[Callable[..., Any], Any], + kwarg_names: Dict[str, Any], + train_loader: DataLoader, + test_loader: DataLoader, + categorical_levels: Dict[str, torch.Tensor], + outcome_type: str = "outcome", + outcome_name: str = "outcome", + n_steps: int = 600, + plot: bool = True, + lim: Optional[Tuple[float, float]] = None, + is_class: bool = True, +) -> Dict[str, float]: + + _train_data, _test_data = recode_categorical(kwarg_names, train_loader, test_loader) + + pyro.clear_param_store() + + ###################### + # train and test + ###################### + + if is_class: + model = model_or_class(**_train_data) + + else: + model = model_or_class + + guide = run_svi_inference( + 
model, n_steps=n_steps, lr=0.01, verbose=True, **_train_data + ) + + predictive = Predictive(model, guide=guide, num_samples=1000) + + categorical_levels = model.categorical_levels + + _train_data_for_preds = copy.deepcopy(_train_data) + _test_data_for_preds = copy.deepcopy(_test_data) + + if outcome_type != "outcome": + _train_data_for_preds[outcome_type][outcome_name] = None # type: ignore + _test_data_for_preds[outcome_type][outcome_name] = None # type: ignore + + else: + _train_data_for_preds[outcome_type] = None # type: ignore + + samples_train = predictive( + **_train_data_for_preds, + categorical_levels=categorical_levels, + ) + + samples_test = predictive( + **_test_data_for_preds, + categorical_levels=categorical_levels, + ) + + train_predicted_mean = samples_train[outcome_name].squeeze().mean(dim=0) + train_predicted_lower = samples_train[outcome_name].squeeze().quantile(0.05, dim=0) + train_predicted_upper = samples_train[outcome_name].squeeze().quantile(0.95, dim=0) + + coverage_training = ( + _train_data[outcome_type][outcome_name] + .squeeze() + .gt(train_predicted_lower) + .float() + * _train_data[outcome_type][outcome_name] + .squeeze() + .lt(train_predicted_upper) + .float() + ) + + null_residuals_train = ( + _train_data[outcome_type][outcome_name].squeeze() + - _train_data[outcome_type][outcome_name].squeeze().mean() + ) + + null_mae_train = torch.abs(null_residuals_train).mean().item() + + residuals_train = ( + _train_data[outcome_type][outcome_name].squeeze() - train_predicted_mean + ) + mae_train = torch.abs(residuals_train).mean().item() + + rsquared_train = ( + 1 + - residuals_train.var() + / _train_data[outcome_type][outcome_name].squeeze().var() + ) + + test_predicted_mean = samples_test[outcome_name].squeeze().mean(dim=0) + test_predicted_lower = samples_test[outcome_name].squeeze().quantile(0.05, dim=0) + test_predicted_upper = samples_test[outcome_name].squeeze().quantile(0.95, dim=0) + + coverage_test = ( + 
_test_data[outcome_type][outcome_name] + .squeeze() + .gt(test_predicted_lower) + .float() + * _test_data[outcome_type][outcome_name] + .squeeze() + .lt(test_predicted_upper) + .float() + ) + + null_residuals_test = ( + _test_data[outcome_type][outcome_name].squeeze() + - _test_data[outcome_type][outcome_name].squeeze().mean() + ) + + null_mae_test = torch.abs(null_residuals_test).mean().item() + + residuals_test = ( + _test_data[outcome_type][outcome_name].squeeze() - test_predicted_mean + ) + mae_test = torch.abs(residuals_test).mean().item() + + rsquared_test = ( + 1 + - residuals_test.var() / _test_data[outcome_type][outcome_name].squeeze().var() + ) + + print(rsquared_train, rsquared_test) + + if plot: + fig, axs = plt.subplots(2, 2, figsize=(14, 10)) + + axs[0, 0].scatter( + x=_train_data[outcome_type][outcome_name], + y=train_predicted_mean, + s=6, + alpha=0.5, + ) + axs[0, 0].set_title( + "Training data, ratio of outcomes within 95% CI: {:.2f}".format( + coverage_training.mean().item() + ) + ) + + if lim is not None: + axs[0, 0].set_xlim(lim) + axs[0, 0].set_ylim(lim) + axs[0, 0].set_xlabel("observed values") + axs[0, 0].set_ylabel("mean predicted values") + + axs[0, 1].hist(residuals_train, bins=50) + + axs[0, 1].set_title( + "Training set residuals, MAE (null): {:.2f} ({:.2f}), Rsquared: {:.2f}".format( + mae_train, null_mae_train, rsquared_train.item() + ) + ) + axs[0, 1].set_xlabel("residuals") + axs[0, 1].set_ylabel("frequency") + + axs[1, 0].scatter( + x=_test_data[outcome_type][outcome_name], + y=test_predicted_mean, + s=6, + alpha=0.5, + ) + axs[1, 0].set_title( + "Test data, ratio of outcomes within 95% CI: {:.2f}".format( + coverage_test.mean().item() + ) + ) + axs[1, 0].set_xlabel("true values") + axs[1, 0].set_ylabel("mean predicted values") + if lim is not None: + axs[1, 0].set_xlim(lim) + axs[1, 0].set_ylim(lim) + + axs[1, 1].hist(residuals_test, bins=50) + + axs[1, 1].set_title( + "Test set residuals, MAE (null): {:.2f} ({:.2f}), Rsquared: 
{:.2f}".format( + mae_test, null_mae_test, rsquared_test.item() + ) + ) + + axs[1, 1].set_xlabel("residuals") + axs[1, 1].set_ylabel("frequency") + + plt.tight_layout(rect=(0, 0, 1, 0.96)) + sns.despine() + + fig.suptitle("Model evaluation", fontsize=16) + + plt.show() + + return { + "mae_null_train": null_mae_train, + "mae_null_test": null_mae_test, + "mae_train": mae_train, + "mae_test": mae_test, + "rsquared_train": rsquared_train, + "rsquared_test": rsquared_test, + "coverage_train": coverage_training.mean().item(), + "coverage_test": coverage_test.mean().item(), + } diff --git a/build/cities/modeling/model_components.py b/build/cities/modeling/model_components.py new file mode 100644 index 00000000..c914bb41 --- /dev/null +++ b/build/cities/modeling/model_components.py @@ -0,0 +1,351 @@ +from typing import Dict, List, Optional, Tuple + +import pyro +import pyro.distributions as dist +import torch + + +def get_n(categorical: Dict[str, torch.Tensor], continuous: Dict[str, torch.Tensor]): + N_categorical = len(categorical) + N_continuous = len(continuous) + + # a but convoluted, but groups might be missing and sometimes + # vars are allowed to be None + n_cat = None + if N_categorical > 0: + for value in categorical.values(): + if value is not None: + n_cat = value.shape[0] + break + + n_con = None + if N_continuous > 0: + for value in continuous.values(): + if value is not None: + n_con = value.shape[0] + break + + if N_categorical > 0 and N_continuous > 0: + if n_cat != n_con: + raise ValueError( + "The number of categorical and continuous data points must be the same" + ) + + n = n_cat if n_cat is not None else n_con + + if n is None: + raise ValueError("Both categorical and continuous dictionaries are empty.") + + return N_categorical, N_continuous, n + + +def check_categorical_is_subset_of_levels(categorical, categorical_levels): + + assert set(categorical.keys()).issubset(set(categorical_levels.keys())) + + # # TODO should these be subsets or can we only 
check lengths? + + return True + + +def get_categorical_levels(categorical): + """ + Assumes that no levels are missing from the categorical data, and constructs the levels from the unique values. + This should only be used with supersets of all data (so that every data subset will have its levels represented + in the levels returned here. + """ + return {name: torch.unique(categorical[name]) for name in categorical.keys()} + + +def categorical_contribution( + categorical: Dict[str, torch.Tensor], + child_name: str, + leeway: float, + categorical_levels: Dict[str, torch.Tensor], +) -> torch.Tensor: + + check_categorical_is_subset_of_levels(categorical, categorical_levels) + + categorical_names = list(categorical.keys()) + + weights_categorical_outcome = {} + objects_cat_weighted = {} + + for name in categorical_names: + weights_categorical_outcome[name] = pyro.sample( + f"weights_categorical_{name}_{child_name}", + dist.Normal(0.0, leeway).expand(categorical_levels[name].shape).to_event(1), + ) + + if len(weights_categorical_outcome[name].shape) > 1: + weights_categorical_outcome[name] = weights_categorical_outcome[ + name + ].squeeze(-2) + + final_nonevent_shape = torch.broadcast_shapes( + categorical[name].shape[:-1], weights_categorical_outcome[name].shape[:-1] + ) + expanded_weight_indices = categorical[name].expand(*final_nonevent_shape, -1) + expanded_weights = weights_categorical_outcome[name].expand( + *final_nonevent_shape, -1 + ) + + objects_cat_weighted[name] = torch.gather( + expanded_weights, dim=-1, index=expanded_weight_indices + ) + + # weight_indices = categorical[name].expand( + # *weights_categorical_outcome[name].shape[:-1], -1 + # ) + + # objects_cat_weighted[name] = torch.gather( + # weights_categorical_outcome[name], dim=-1, index=weight_indices + # ) + + values = list(objects_cat_weighted.values()) + + categorical_contribution_outcome = torch.stack(values, dim=0).sum(dim=0) + + return categorical_contribution_outcome + + +def 
continuous_contribution( + continuous: Dict[str, torch.Tensor], + child_name: str, + leeway: float, +) -> torch.Tensor: + + contributions = torch.zeros(1) + + bias_continuous = pyro.sample( + f"bias_continuous_{child_name}", + dist.Normal(0.0, leeway), + ) + + for key, value in continuous.items(): + + weight_continuous = pyro.sample( + f"weight_continuous_{key}_to_{child_name}", + dist.Normal(0.0, leeway), + ) + + contribution = weight_continuous * value + contributions = contribution + contributions + + contributions = bias_continuous + contributions + + return contributions + + +def add_linear_component( + child_name: str, + child_continuous_parents: Dict[str, torch.Tensor], + child_categorical_parents: Dict[str, torch.Tensor], + leeway: float, + data_plate, + categorical_levels: Dict[str, torch.Tensor], + observations: Optional[torch.Tensor] = None, +) -> torch.Tensor: + + sigma_child = pyro.sample( + f"sigma_{child_name}", dist.Exponential(1.0) + ) # type: ignore + + continuous_contribution_to_child = continuous_contribution( + child_continuous_parents, child_name, leeway=leeway + ) + + categorical_contribution_to_child = categorical_contribution( + child_categorical_parents, + child_name, + leeway, + categorical_levels=categorical_levels, + ) + + with data_plate: + + mean_prediction_child = pyro.deterministic( # type: ignore + f"mean_outcome_prediction_{child_name}", + continuous_contribution_to_child + categorical_contribution_to_child, + event_dim=0, + ) + + child_observed = pyro.sample( # type: ignore + f"{child_name}", + dist.Normal(mean_prediction_child, sigma_child), + obs=observations, + ) + + return child_observed + + +def add_linear_component_continuous_interactions( + child_name: str, + child_continuous_parents: Dict[str, torch.Tensor], + child_categorical_parents: Dict[str, torch.Tensor], + continous_interaction_pairs: List[Tuple[str, str]], + leeway: float, + data_plate, + categorical_levels: Dict[str, torch.Tensor], + observations: 
Optional[torch.Tensor] = None, +) -> torch.Tensor: + + if continous_interaction_pairs == [("all", "all")]: + continous_interaction_pairs = [ + (key1, key2) + for key1 in child_continuous_parents.keys() + for key2 in child_continuous_parents.keys() + if key1 != key2 + ] + + for interaction_pair in continous_interaction_pairs: + assert interaction_pair[0] in child_continuous_parents.keys() + assert interaction_pair[1] in child_continuous_parents.keys() + + interaction_name = f"{interaction_pair[0]}_x_{interaction_pair[1]}_to_{child_name}" + + with data_plate: + child_continuous_parents[interaction_name] = pyro.deterministic( + interaction_name, + child_continuous_parents[interaction_pair[0]] + * child_continuous_parents[interaction_pair[1]], + event_dim=0, + ) + + child_observed = add_linear_component( + child_name=child_name, + child_continuous_parents=child_continuous_parents, + child_categorical_parents=child_categorical_parents, + leeway=leeway, + data_plate=data_plate, + categorical_levels=categorical_levels, + observations=observations, + ) + + return child_observed + + +def add_logistic_component( + child_name: str, + child_continuous_parents: Dict[str, torch.Tensor], + child_categorical_parents: Dict[str, torch.Tensor], + leeway: float, + data_plate, + categorical_levels: Dict[str, torch.Tensor], + observations: Optional[torch.Tensor] = None, +) -> torch.Tensor: + + continuous_contribution_to_child = continuous_contribution( + child_continuous_parents, child_name, leeway + ) + + categorical_contribution_to_child = categorical_contribution( + child_categorical_parents, + child_name, + leeway, + categorical_levels=categorical_levels, + ) + + with data_plate: + + mean_prediction_child = pyro.deterministic( # type: ignore + f"mean_outcome_prediction_{child_name}", + categorical_contribution_to_child + continuous_contribution_to_child, + event_dim=0, + ) + + child_probs = pyro.deterministic( + f"child_probs_{child_name}", + torch.sigmoid(mean_prediction_child), + 
event_dim=0, + ) + + child_observed = pyro.sample( + f"{child_name}", + dist.Bernoulli(child_probs), + obs=observations, + ) + + return child_observed + + +def add_ratio_component( + child_name: str, + child_continuous_parents: Dict[str, torch.Tensor], + child_categorical_parents: Dict[str, torch.Tensor], + leeway: float, + data_plate, + categorical_levels: Dict[str, torch.Tensor], + observations: Optional[torch.Tensor] = None, +) -> torch.Tensor: + + continuous_contribution_to_child = continuous_contribution( + child_continuous_parents, child_name, leeway + ) + + categorical_contribution_to_child = categorical_contribution( + child_categorical_parents, + child_name, + leeway, + categorical_levels=categorical_levels, + ) + + sigma_child = pyro.sample(f"sigma_{child_name}", dist.Exponential(40.0)) + + with data_plate: + + mean_prediction_child = pyro.deterministic( # type: ignore + f"mean_outcome_prediction_{child_name}", + categorical_contribution_to_child + continuous_contribution_to_child, + event_dim=0, + ) + + child_probs = pyro.deterministic( + f"child_probs_{child_name}", + torch.sigmoid(mean_prediction_child), + event_dim=0, + ) + + child_observed = pyro.sample( + child_name, dist.Normal(child_probs, sigma_child), obs=observations + ) + + return child_observed + + +def add_ratio_component_continuous_interactions( + child_name: str, + child_continuous_parents: Dict[str, torch.Tensor], + child_categorical_parents: Dict[str, torch.Tensor], + continous_interaction_pairs: List[Tuple[str, str]], + leeway: float, + data_plate, + categorical_levels: Dict[str, torch.Tensor], + observations: Optional[torch.Tensor] = None, +) -> torch.Tensor: + + for interaction_pair in continous_interaction_pairs: + assert interaction_pair[0] in child_continuous_parents.keys() + assert interaction_pair[1] in child_continuous_parents.keys() + + interaction_name = f"{interaction_pair[0]}_x_{interaction_pair[1]}_to_{child_name}" + + with data_plate: + 
child_continuous_parents[interaction_name] = pyro.deterministic( + interaction_name, + child_continuous_parents[interaction_pair[0]] + * child_continuous_parents[interaction_pair[1]], + event_dim=0, + ) + + child_observed = add_ratio_component( + child_name=child_name, + child_continuous_parents=child_continuous_parents, + child_categorical_parents=child_categorical_parents, + leeway=leeway, + data_plate=data_plate, + categorical_levels=categorical_levels, + observations=observations, + ) + + return child_observed diff --git a/build/cities/modeling/model_interactions.py b/build/cities/modeling/model_interactions.py new file mode 100644 index 00000000..2446d6d5 --- /dev/null +++ b/build/cities/modeling/model_interactions.py @@ -0,0 +1,181 @@ +import logging +import os +from typing import Optional + +import dill +import pyro +import pyro.distributions as dist +import torch + +from cities.modeling.modeling_utils import ( + prep_wide_data_for_inference, + train_interactions_model, +) +from cities.utils.data_grabber import DataGrabber, find_repo_root + + +class InteractionsModel: + def __init__( + self, + outcome_dataset: str, + intervention_dataset: str, + intervention_variable: Optional[str] = None, + forward_shift: int = 2, + num_iterations: int = 1500, + num_samples: int = 1000, + plot_loss: bool = False, + ): + self.outcome_dataset = outcome_dataset + self.intervention_dataset = intervention_dataset + self.forward_shift = forward_shift + self.num_iterations = num_iterations + self.num_samples = num_samples + self.plot_loss = plot_loss + self.root = find_repo_root() + + if intervention_variable: + self.intervention_variable = intervention_variable + else: + _dg = DataGrabber() + _dg.get_features_std_long([intervention_dataset]) + self.intervention_variable = _dg.std_long[intervention_dataset].columns[-1] + + self.data = prep_wide_data_for_inference( + outcome_dataset=self.outcome_dataset, + intervention_dataset=self.intervention_dataset, + 
forward_shift=self.forward_shift, + ) + + self.model = model_cities_interaction + + self.model_args = self.data["model_args"] + + self.model_conditioned = pyro.condition( # type: ignore + self.model, + data={"T": self.data["t"], "Y": self.data["y"], "X": self.data["x"]}, + ) + + self.model_rendering = pyro.render_model( # type: ignore + self.model, model_args=self.model_args, render_distributions=True + ) + + def train_interactions_model(self): + self.guide = train_interactions_model( + conditioned_model=self.model_conditioned, + model_args=self.model_args, + num_iterations=self.num_iterations, + plot_loss=self.plot_loss, + ) + + def sample_from_guide(self): + predictive = pyro.infer.Predictive( + model=self.model, + guide=self.guide, + num_samples=self.num_samples, + parallel=False, + ) + self.samples = predictive(*self.model_args) + + def save_guide(self): + guide_name = ( + f"{self.intervention_dataset}_{self.outcome_dataset}_{self.forward_shift}" + ) + serialized_guide = dill.dumps(self.guide) + file_path = os.path.join( + self.root, "data/model_guides", f"{guide_name}_guide.pkl" + ) + with open(file_path, "wb") as file: + file.write(serialized_guide) + param_path = os.path.join( + self.root, "data/model_guides", f"{guide_name}_params.pth" + ) + pyro.get_param_store().save(param_path) + + logging.info( + f"Guide and params for {self.intervention_dataset}", + f"{self.outcome_dataset} with shift {self.forward_shift}", + "has been saved.", + ) + + +def model_cities_interaction( + N_t, + N_cov, + N_s, + N_u, + state_index, + unit_index, + leeway=0.9, +): + bias_Y = pyro.sample("bias_Y", dist.Normal(0, leeway)) + bias_T = pyro.sample("bias_T", dist.Normal(0, leeway)) + + weight_TY = pyro.sample("weight_TY", dist.Normal(0, leeway)) + + sigma_T = pyro.sample("sigma_T", dist.Exponential(1)) + sigma_Y = pyro.sample("sigma_Y", dist.Exponential(1)) + + counties_plate = pyro.plate("counties_plate", N_u, dim=-1) + states_plate = pyro.plate("states_plate", N_s, dim=-2) + 
covariates_plate = pyro.plate("covariates_plate", N_cov, dim=-3) + time_plate = pyro.plate("time_plate", N_t, dim=-4) + + with covariates_plate: + bias_X = pyro.sample("bias_X", dist.Normal(0, leeway)) + sigma_X = pyro.sample("sigma_X", dist.Exponential(1)) + weight_XT = pyro.sample("weight_XT", dist.Normal(0, leeway)) + weight_XY = pyro.sample("weight_XY", dist.Normal(0, leeway)) + + with states_plate: + bias_stateT = pyro.sample("bias_stateT", dist.Normal(0, leeway)) + bias_stateY = pyro.sample("bias_stateY", dist.Normal(0, leeway)) + + with covariates_plate: + bias_stateX = pyro.sample("bias_stateX", dist.Normal(0, leeway)) + + with time_plate: + bias_timeT = pyro.sample("bias_timeT", dist.Normal(0, leeway)) + bias_timeY = pyro.sample("bias_timeY", dist.Normal(0, leeway)) + + with counties_plate: + with covariates_plate: + mean_X = pyro.deterministic( + "mean_X", + torch.einsum( + "...xdd,...xcd->...xdc", bias_X, bias_stateX[..., state_index, :] + ), + ) + + X = pyro.sample("X", dist.Normal(mean_X[..., unit_index], sigma_X)) + + XT_weighted = torch.einsum( + "...xdc, ...xdd -> ...dc", X, weight_XT + ).unsqueeze(-2) + XY_weighted = torch.einsum( + "...xdc, ...xdd -> ...dc", X, weight_XY + ).unsqueeze(-2) + + with time_plate: + bias_stateT_tiled = pyro.deterministic( + "bias_stateT_tiled", + torch.einsum("...cd -> ...dc", bias_stateT[..., state_index, :]), + ) + + mean_T = pyro.deterministic( + "mean_T", bias_T + bias_timeT + bias_stateT_tiled + XT_weighted + ) + + T = pyro.sample("T", dist.Normal(mean_T, sigma_T)) + + bias_stateY_tiled = pyro.deterministic( + "bias_stateY_tiled", + torch.einsum("...cd -> ...dc", bias_stateY[..., state_index, :]), + ) + + mean_Y = pyro.deterministic( + "mean_Y", + bias_Y + bias_timeY + bias_stateY_tiled + XY_weighted + weight_TY * T, + ) + Y = pyro.sample("Y", dist.Normal(mean_Y, sigma_Y)) + + return Y diff --git a/build/cities/modeling/modeling_utils.py b/build/cities/modeling/modeling_utils.py new file mode 100644 index 
00000000..55aaccc6 --- /dev/null +++ b/build/cities/modeling/modeling_utils.py @@ -0,0 +1,403 @@ +from typing import Callable + +import matplotlib.pyplot as plt +import pandas as pd +import pyro +import torch +from pyro.infer import SVI, Trace_ELBO +from pyro.infer.autoguide import AutoNormal +from pyro.optim import Adam # type: ignore +from scipy.stats import spearmanr + +from cities.utils.data_grabber import ( + DataGrabber, + list_available_features, + list_tensed_features, +) + + +def drop_high_correlation(df, threshold=0.85): + df_var = df.iloc[:, 2:].copy() + correlation_matrix, _ = spearmanr(df_var) + + high_correlation_pairs = [ + (df_var.columns[i], df_var.columns[j]) + for i in range(df_var.shape[1]) + for j in range(i + 1, df_var.shape[1]) + if abs(correlation_matrix[i, j]) > threshold + and abs(correlation_matrix[i, j]) < 1.0 + ] + high_correlation_pairs = [ + (var1, var2) for var1, var2 in high_correlation_pairs if var1 != var2 + ] + + removed = set() + print( + f"Highly correlated pairs: {high_correlation_pairs}, second elements will be dropped" + ) + for var1, var2 in high_correlation_pairs: + assert var2 in df_var.columns + for var1, var2 in high_correlation_pairs: + if var2 in df_var.columns: + removed.add(var2) + df_var.drop(var2, axis=1, inplace=True) + + result = pd.concat([df.iloc[:, :2], df_var], axis=1) + print(f"Removed {removed} due to correlation > {threshold}") + return result + + +def prep_wide_data_for_inference( + outcome_dataset: str, intervention_dataset: str, forward_shift: int +): + """ + Prepares wide-format data for causal inference modeling. + + Parameters: + - outcome_dataset (str): Name of the outcome variable. + - intervention_dataset (str): Name of the intervention variable. + - forward_shift (int): Number of time steps to shift the outcome variable for prediction. + + Returns: + dict: A dictionary containing the necessary inputs for causal inference modeling. + + The function performs the following steps: + 1. 
Identifies available device (GPU if available, otherwise CPU), to be used with tensors. + 2. Uses a DataGrabber class to obtain standardized wide-format data. + 3. Separates covariate datasets into time series (tensed) and fixed covariates. + 4. Loads the required transformed features. + 5. Merges fixed covariates into a joint dataframe based on a common ID column. + 6. Ensures that the GeoFIPS (geographical identifier) is consistent across datasets. + 7. Extracts common years for which both intervention and outcome data are available. + 8. Shifts the outcome variable forward by the specified number of time steps. + 9. Prepares tensors for input features (x), interventions (t), and outcomes (y). + 10. Creates indices for states and units, preparing them as tensors. + 11. Validates the shapes of the tensors. + 12. Constructs a dictionary containing model arguments and prepared tensors. + + Example usage: + prep_data = prep_wide_data_for_inference("outcome_data", "intervention_data", 2) + """ + if torch.cuda.is_available(): + device = torch.device("cuda") + else: + device = torch.device("cpu") + + dg = DataGrabber() + + tensed_covariates_datasets = [ + var + for var in list_tensed_features() + if var not in [outcome_dataset, intervention_dataset] + ] + fixed_covariates_datasets = [ + var + for var in list_available_features() + if var + not in tensed_covariates_datasets + [outcome_dataset, intervention_dataset] + ] + + features_needed = [ + outcome_dataset, + intervention_dataset, + ] + fixed_covariates_datasets + + dg.get_features_std_wide(features_needed) + + intervention = dg.std_wide[intervention_dataset] + outcome = dg.std_wide[outcome_dataset] + + # put covariates in one df as columns, dropping repeated ID columns + f_covariates = { + dataset: dg.std_wide[dataset] for dataset in fixed_covariates_datasets + } + f_covariates_joint = f_covariates[fixed_covariates_datasets[0]] + for dataset in f_covariates.keys(): + if dataset != fixed_covariates_datasets[0]: + if 
"GeoName" in f_covariates[dataset].columns: + f_covariates[dataset] = f_covariates[dataset].drop(columns=["GeoName"]) + f_covariates_joint = f_covariates_joint.merge( + f_covariates[dataset], on=["GeoFIPS"] + ) + + f_covariates_joint = drop_high_correlation(f_covariates_joint) + + assert f_covariates_joint["GeoFIPS"].equals(intervention["GeoFIPS"]) + + # extract data for which intervention and outcome overlap + year_min = max( + intervention.columns[2:].astype(int).min(), + outcome.columns[2:].astype(int).min(), + ) + + year_max = min( + intervention.columns[2:].astype(int).max(), + outcome.columns[2:].astype(int).max(), + ) + + assert all(intervention["GeoFIPS"] == outcome["GeoFIPS"]) + + outcome_years_to_keep = [ + year + for year in outcome.columns[2:] + if year_min <= int(year) <= year_max + forward_shift + ] + + outcome_years_to_keep = [ + year for year in outcome_years_to_keep if year in intervention.columns[2:] + ] + + outcome = outcome[outcome_years_to_keep] + + # shift outcome `forward_shift` steps ahead + # for the prediction task + outcome_shifted = outcome.copy() + + for i in range(len(outcome_years_to_keep) - forward_shift): + outcome_shifted.iloc[:, i] = outcome_shifted.iloc[:, i + forward_shift] + + years_to_drop = [ + f"{year}" for year in range(year_max - forward_shift + 1, year_max + 1) + ] + outcome_shifted.drop(columns=years_to_drop, inplace=True) + + intervention.drop(columns=["GeoFIPS", "GeoName"], inplace=True) + intervention = intervention[outcome_shifted.columns] + + assert intervention.shape == outcome_shifted.shape + + years_available = outcome_shifted.columns.astype(int).values + + unit_index = pd.factorize(f_covariates_joint["GeoFIPS"].values)[0] + state_index = pd.factorize(f_covariates_joint["GeoFIPS"].values // 1000)[0] + + # prepare tensors + x = torch.tensor( + f_covariates_joint.iloc[:, 2:].values, dtype=torch.float32, device=device + ) + x = x.unsqueeze(1).unsqueeze(1).permute(2, 3, 1, 0) + + t = torch.tensor(intervention.values, 
dtype=torch.float32, device=device) + t = t.unsqueeze(1).unsqueeze(1).permute(3, 1, 2, 0) + + y = torch.tensor(outcome_shifted.values, dtype=torch.float32, device=device) + y = y.unsqueeze(1).unsqueeze(1).permute(3, 1, 2, 0) + + state_index = torch.tensor(state_index, dtype=torch.int, device=device) + unit_index = torch.tensor(unit_index, dtype=torch.int, device=device) + + N_t = y.shape[0] + N_cov = x.shape[1] + N_s = state_index.unique().shape[0] + N_u = unit_index.unique().shape[0] + + assert x.shape == (1, N_cov, 1, N_u) + assert y.shape == (N_t, 1, 1, N_u) + assert t.shape == (N_t, 1, 1, N_u) + + model_args = (N_t, N_cov, N_s, N_u, state_index, unit_index) + + return { + "model_args": model_args, + "x": x, + "t": t, + "y": y, + "years_available": years_available, + "outcome_years": outcome_years_to_keep, + "covariates_df": f_covariates_joint, + } + + +def train_interactions_model( + conditioned_model: Callable, + model_args, + num_iterations: int = 1000, + plot_loss: bool = True, + print_interval: int = 100, + lr: float = 0.01, +): + guide = None + pyro.clear_param_store() # type: ignore + + guide = AutoNormal(conditioned_model) + + svi = SVI( + model=conditioned_model, guide=guide, optim=Adam({"lr": lr}), loss=Trace_ELBO() + ) + + losses = [] + for step in range(num_iterations): + loss = svi.step(*model_args) + losses.append(loss) + if step % print_interval == 0: + print("[iteration %04d] loss: %.4f" % (step + 1, loss)) + + if plot_loss: + plt.plot(range(num_iterations), losses, label="Loss") + plt.show() + + return guide + + +def prep_data_for_interaction_inference( + outcome_dataset, intervention_dataset, intervention_variable, forward_shift +): + dg = DataGrabber() + + tensed_covariates_datasets = [ + var + for var in list_tensed_features() + if var not in [outcome_dataset, intervention_dataset] + ] + fixed_covariates_datasets = [ + var + for var in list_available_features() + if var + not in tensed_covariates_datasets + [outcome_dataset, 
intervention_dataset] + ] + + dg.get_features_std_long(list_available_features()) + dg.get_features_std_wide(list_available_features()) + + year_min = max( + dg.std_long[intervention_dataset]["Year"].min(), + dg.std_long[outcome_dataset]["Year"].min(), + ) + year_max = min( + dg.std_long[intervention_dataset]["Year"].max(), + dg.std_long[outcome_dataset]["Year"].max(), + ) + outcome_df = dg.std_long[outcome_dataset].sort_values(by=["GeoFIPS", "Year"]) + + # now we adding forward shift to the outcome + # cleaning up and puting intervention/outcome in one df + # and fixed covariates in another + + outcome_df[f"{outcome_dataset}_shifted_by_{forward_shift}"] = None + + geo_subsets = [] + for geo_fips in outcome_df["GeoFIPS"].unique(): + geo_subset = outcome_df[outcome_df["GeoFIPS"] == geo_fips].copy() + # Shift the 'Value' column `forward_shift` in a new column + geo_subset[f"{outcome_dataset}_shifted_by_{forward_shift}"] = geo_subset[ + "Value" + ].shift(-forward_shift) + geo_subsets.append(geo_subset) + + outcome_df = pd.concat(geo_subsets).reset_index(drop=True) + + outcome = outcome_df[ + (outcome_df["Year"] >= year_min) + & (outcome_df["Year"] <= year_max + forward_shift) + ] + + intervention = dg.std_long[intervention_dataset][ + (dg.std_long[intervention_dataset]["Year"] >= year_min) + & (dg.std_long[intervention_dataset]["Year"] <= year_max) + ] + f_covariates = { + dataset: dg.std_wide[dataset] for dataset in fixed_covariates_datasets + } + f_covariates_joint = f_covariates[fixed_covariates_datasets[0]] + for dataset in f_covariates.keys(): + if dataset != fixed_covariates_datasets[0]: + if "GeoName" in f_covariates[dataset].columns: + f_covariates[dataset] = f_covariates[dataset].drop(columns=["GeoName"]) + f_covariates_joint = f_covariates_joint.merge( + f_covariates[dataset], on=["GeoFIPS"] + ) + + i_o_data = pd.merge(outcome, intervention, on=["GeoFIPS", "Year"]) + + if "GeoName_x" in i_o_data.columns: + i_o_data.rename(columns={"GeoName_x": "GeoName"}, 
inplace=True) + columns_to_drop = i_o_data.filter(regex=r"^GeoName_[a-zA-Z]$") + i_o_data.drop(columns=columns_to_drop.columns, inplace=True) + + i_o_data.rename(columns={"Value": outcome_dataset}, inplace=True) + + i_o_data["state"] = [code // 1000 for code in i_o_data["GeoFIPS"]] + + N_s = len(i_o_data["state"].unique()) # number of states + i_o_data.dropna(inplace=True) + + i_o_data["unit_index"] = pd.factorize(i_o_data["GeoFIPS"].values)[0] + i_o_data["state_index"] = pd.factorize(i_o_data["state"].values)[0] + i_o_data["time_index"] = pd.factorize(i_o_data["Year"].values)[0] + + assert i_o_data["GeoFIPS"].isin(f_covariates_joint["GeoFIPS"]).all() + + f_covariates_joint.drop(columns=["GeoName"], inplace=True) + data = i_o_data.merge(f_covariates_joint, on="GeoFIPS", how="left") + + assert not data.isna().any().any() + + time_index_idx = data.columns.get_loc("time_index") + covariates_df = data.iloc[:, time_index_idx + 1 :].copy() + covariates_df_sparse = covariates_df.copy() + covariates_df_sparse["unit_index"] = data["unit_index"] + covariates_df_sparse["state_index"] = data["state_index"] + covariates_df_sparse.drop_duplicates(inplace=True) + assert set(covariates_df_sparse["unit_index"]) == set(data["unit_index"]) + + # get tensors + + if torch.cuda.is_available(): + device = torch.device("cuda") + else: + device = torch.device("cpu") + + y = data[f"{outcome_dataset}_shifted_by_{forward_shift}"] + y = torch.tensor(y, dtype=torch.float32, device=device) + + unit_index = torch.tensor(data["unit_index"], dtype=torch.int, device=device) + unit_index_sparse = torch.tensor( + covariates_df_sparse["unit_index"], dtype=torch.int + ) + + state_index = torch.tensor(data["state_index"], dtype=torch.int, device=device) + state_index_sparse = torch.tensor( + covariates_df_sparse["state_index"], dtype=torch.int + ) + + time_index = torch.tensor(data["time_index"], dtype=torch.int, device=device) + intervention = torch.tensor( + data[intervention_variable], 
dtype=torch.float32, device=device + ) + + covariates = torch.tensor(covariates_df.values, dtype=torch.float32, device=device) + + covariates_df_sparse.drop(columns=["unit_index", "state_index"], inplace=True) + covariates_sparse = torch.tensor( + covariates_df_sparse.values, dtype=torch.float32, device=device + ) + + N_cov = covariates.shape[1] # number of covariates + N_u = covariates_sparse.shape[0] # number of units (counties) + N_obs = len(y) # number of observations + N_t = len(time_index.unique()) # number of time points + N_s = len(state_index.unique()) # number of states + + assert len(intervention) == len(y) + assert len(unit_index) == len(y) + assert len(state_index) == len(unit_index) + assert len(time_index) == len(unit_index) + assert covariates.shape[1] == covariates_sparse.shape[1] + assert len(unit_index_sparse) == N_u + + return { + "N_t": N_t, + "N_cov": N_cov, + "N_s": N_s, + "N_u": N_u, + "N_obs": N_obs, + "unit_index": unit_index, + "state_index": state_index, + "time_index": time_index, + "unit_index_sparse": unit_index_sparse, + "state_index_sparse": state_index_sparse, + "covariates": covariates, + "covariates_sparse": covariates_sparse, + "intervention": intervention, + "y": y, + } diff --git a/build/cities/modeling/svi_inference.py b/build/cities/modeling/svi_inference.py new file mode 100644 index 00000000..8ccef03c --- /dev/null +++ b/build/cities/modeling/svi_inference.py @@ -0,0 +1,44 @@ +import matplotlib.pyplot as plt +import pyro +import torch +from pyro.infer.autoguide import AutoMultivariateNormal, init_to_mean + + +def run_svi_inference( + model, + verbose=True, + lr=0.03, + vi_family=AutoMultivariateNormal, + guide=None, + hide=[], + n_steps=500, + ylim=None, + plot=True, + **model_kwargs +): + losses = [] + if guide is None: + guide = vi_family( + pyro.poutine.block(model, hide=hide), init_loc_fn=init_to_mean + ) + elbo = pyro.infer.Trace_ELBO()(model, guide) + + elbo(**model_kwargs) + adam = 
torch.optim.Adam(elbo.parameters(), lr=lr) + + for step in range(1, n_steps + 1): + adam.zero_grad() + loss = elbo(**model_kwargs) + loss.backward() + losses.append(loss.item()) + adam.step() + if (step % 50 == 0) or (step == 1) & verbose: + print("[iteration %04d] loss: %.4f" % (step, loss)) + + if plot: + plt.plot(losses) + if ylim: + plt.ylim(ylim) + plt.show() + + return guide diff --git a/build/cities/modeling/tau_caching_pipeline.py b/build/cities/modeling/tau_caching_pipeline.py new file mode 100644 index 00000000..b517d522 --- /dev/null +++ b/build/cities/modeling/tau_caching_pipeline.py @@ -0,0 +1,88 @@ +import logging +import os +import time + +from cities.queries.causal_insight import CausalInsight +from cities.utils.data_grabber import ( + DataGrabber, + find_repo_root, + list_interventions, + list_outcomes, +) + +root = find_repo_root() +log_dir = os.path.join(root, "data", "tau_samples") +log_file_path = os.path.join(log_dir, ".sampling.log") +os.makedirs(log_dir, exist_ok=True) + +logging.basicConfig( + filename=log_file_path, + filemode="w", + format="%(asctime)s → %(name)s → %(levelname)s: %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO, +) + + +session_start = time.time() + + +num_samples = 1000 + +data = DataGrabber() + +interventions = list_interventions() +outcomes = list_outcomes() + + +N_combinations_samples = len(interventions) * len(outcomes) + + +files = [f for f in os.listdir(log_dir) if os.path.isfile(os.path.join(log_dir, f))] +num_files = len(files) + +logging.info( + f"{(num_files-2)} sample dictionaries already exist. " + f"Starting to obtain {N_combinations_samples - (num_files -2)}" + f" out of {N_combinations_samples} sample dictionaries needed." 
+) +remaining = N_combinations_samples - (num_files - 2) +for intervention in interventions: + for outcome in outcomes: + tau_samples_path = os.path.join( + root, + "data/tau_samples", + f"{intervention}_{outcome}_{num_samples}_tau.pkl", + ) + + if not os.path.exists(tau_samples_path): + start_time = time.time() + logging.info(f"Sampling {outcome}/{intervention} pair now.") + ci = CausalInsight( + outcome_dataset=outcome, + intervention_dataset=intervention, + num_samples=num_samples, + ) + + ci.generate_tensed_samples() + end_time = time.time() + duration = end_time - start_time + files = [ + f + for f in os.listdir(log_dir) + if os.path.isfile(os.path.join(log_dir, f)) + ] + num_files = len(files) + remaining -= 1 + logging.info( + f"Done sampling {outcome}/{intervention} pair, completed in {duration:.2f} seconds." + f" {remaining} out of {N_combinations_samples} samples remain." + ) + + +session_ends = time.time() + +logging.info( + f"All samples are now available." + f"Sampling took {session_ends - session_start:.2f} seconds, or {(session_ends - session_start)/60:.2f} minutes." 
+) diff --git a/build/cities/modeling/training_pipeline.py b/build/cities/modeling/training_pipeline.py new file mode 100644 index 00000000..3f4ebc72 --- /dev/null +++ b/build/cities/modeling/training_pipeline.py @@ -0,0 +1,90 @@ +import logging +import os +import sys +import time + +from cities.modeling.model_interactions import InteractionsModel +from cities.utils.data_grabber import find_repo_root, list_interventions, list_outcomes + +if __name__ != "__main__": + sys.exit() + +root = find_repo_root() +log_dir = os.path.join(root, "data", "model_guides") +log_file_path = os.path.join(log_dir, ".training.log") +os.makedirs(log_dir, exist_ok=True) + +logging.basicConfig( + filename=log_file_path, + filemode="w", + format="%(asctime)s → %(name)s → %(levelname)s: %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO, +) + + +# if you need to train from scratch +# clean data/model_guides folder manually +# automatic fresh start is not implemented +# for security reasons + +num_iterations = 4000 + +interventions = list_interventions() +outcomes = list_outcomes() +shifts = [1, 2, 3] + + +N_combinations = len(interventions) * len(outcomes) * len(shifts) + +files = [f for f in os.listdir(log_dir) if os.path.isfile(os.path.join(log_dir, f))] +num_files = len(files) + + +logging.info( + f"{(num_files-2)/2} guides already exist. " + f"Starting to train {N_combinations - (num_files -2)/2} out of {N_combinations} guides needed." 
+) + +remaining = N_combinations - (num_files - 2) / 2 +for intervention_dataset in interventions: + for outcome_dataset in outcomes: + for forward_shift in shifts: + # check if the corresponding guide already exists + # existing_guides = 0 seems rendundant, remove if all works + guide_name = f"{intervention_dataset}_{outcome_dataset}_{forward_shift}" + guide_path = os.path.join( + root, "data/model_guides", f"{guide_name}_guide.pkl" + ) + if not os.path.exists(guide_path): + # existing_guides += 1 seems redundat remove if all works + + logging.info(f"Training {guide_name} for {num_iterations} iterations.") + + start_time = time.time() + model = InteractionsModel( + outcome_dataset=outcome_dataset, + intervention_dataset=intervention_dataset, + forward_shift=forward_shift, + num_iterations=num_iterations, + plot_loss=False, + ) + + model.train_interactions_model() + model.save_guide() + + end_time = time.time() + duration = end_time - start_time + files = [ + f + for f in os.listdir(log_dir) + if os.path.isfile(os.path.join(log_dir, f)) + ] + num_files = len(files) + remaining -= 1 + logging.info( + f"Training of {guide_name} completed in {duration:.2f} seconds. " + f"{int(remaining)} out of {N_combinations} guides remain to be trained." 
+ ) + +logging.info("All guides are now available.") diff --git a/build/cities/modeling/waic.py b/build/cities/modeling/waic.py new file mode 100644 index 00000000..cf388ee5 --- /dev/null +++ b/build/cities/modeling/waic.py @@ -0,0 +1,69 @@ +from typing import Any, Callable, Dict, Optional + +import pyro +import torch +from pyro.infer.enum import get_importance_trace + + +def compute_waic( + model: Callable[..., Any], + guide: Callable[..., Any], + num_particles: int, + max_plate_nesting: int, + sites: Optional[list[str]] = None, + *args: Any, + **kwargs: Any +) -> Dict[str, Any]: + + def vectorize(fn: Callable[..., Any]) -> Callable[..., Any]: + def _fn(*args: Any, **kwargs: Any) -> Any: + with pyro.plate( + "num_particles_vectorized", num_particles, dim=-max_plate_nesting + ): + return fn(*args, **kwargs) + + return _fn + + model_trace, _ = get_importance_trace( + "flat", max_plate_nesting, vectorize(model), vectorize(guide), args, kwargs + ) + + def site_filter_is_observed(site_name: str) -> bool: + return model_trace.nodes[site_name]["is_observed"] + + def site_filter_in_sites(site_name: str) -> bool: + return sites is not None and site_name in sites + + if sites is None: + site_filter = site_filter_is_observed + else: + site_filter = site_filter_in_sites + + observed_nodes = { + name: node for name, node in model_trace.nodes.items() if site_filter(name) + } + + log_p_post = { + key: observed_nodes[key]["log_prob"].mean(dim=0) # sum(axis = 0)/num_particles + for key in observed_nodes.keys() + } + + lppd = torch.stack([log_p_post[key] for key in log_p_post.keys()]).sum() + + var_log_p_post = { + key: (observed_nodes[key]["log_prob"]).var(axis=0) + for key in observed_nodes.keys() + } + + p_waic = torch.stack([var_log_p_post[key] for key in var_log_p_post.keys()]).sum() + + waic = -2 * (lppd - p_waic) + + return { + "waic": waic, + "nodes": observed_nodes, + "log_p_post": log_p_post, + "var_log_p_post": var_log_p_post, + "lppd": lppd, + "p_waic": p_waic, + } diff 
--git a/build/cities/modeling/zoning_models/distance_causal_model.py b/build/cities/modeling/zoning_models/distance_causal_model.py new file mode 100644 index 00000000..57f4fc31 --- /dev/null +++ b/build/cities/modeling/zoning_models/distance_causal_model.py @@ -0,0 +1,202 @@ +from typing import Any, Dict, Optional + +import pyro +import pyro.distributions as dist +import torch + +from cities.modeling.zoning_models.units_causal_model import add_linear_component, get_n + + +class DistanceCausalModel(pyro.nn.PyroModule): + def __init__( + self, + categorical: Dict[str, torch.Tensor], + continuous: Dict[str, torch.Tensor], + outcome: Optional[ + torch.Tensor + ] = None, # init args kept for uniformity, consider deleting + categorical_levels: Optional[Dict[str, Any]] = None, + leeway=0.9, + ): + super().__init__() + + self.leeway = leeway + + self.N_categorical, self.N_continuous, n = get_n(categorical, continuous) + + # you might need and pass further the original + # categorical levels of the training data + if self.N_categorical > 0 and categorical_levels is None: + self.categorical_levels = dict() + for name in categorical.keys(): + self.categorical_levels[name] = torch.unique(categorical[name]) + else: + self.categorical_levels = categorical_levels # type: ignore + + def forward( + self, + categorical: Dict[str, torch.Tensor], + continuous: Dict[str, torch.Tensor], + outcome: Optional[torch.Tensor] = None, + categorical_levels: Optional[Dict[str, torch.Tensor]] = None, + leeway=0.9, + ): + if categorical_levels is None: + categorical_levels = self.categorical_levels + + _N_categorical, _N_continuous, n = get_n(categorical, continuous) + + data_plate = pyro.plate("data", size=n, dim=-1) + + ################# + # register + ################# + with data_plate: + + year = pyro.sample( + "year", + dist.Categorical(torch.ones(len(categorical_levels["year"]))), + obs=categorical["year"], + ) + + month = pyro.sample( + "month", + 
dist.Categorical(torch.ones(len(categorical_levels["month"]))), + obs=categorical["month"], + ) + + zone_id = pyro.sample( + "zone_id", + dist.Categorical(torch.ones(len(categorical_levels["zone_id"]))), + obs=categorical["zone_id"], + ) + + neighborhood_id = pyro.sample( + "neighborhood_id", + dist.Categorical( + torch.ones(len(categorical_levels["neighborhood_id"])) + ), + obs=categorical["neighborhood_id"], + ) + + ward_id = pyro.sample( + "ward_id", + dist.Categorical(torch.ones(len(categorical_levels["ward_id"]))), + obs=categorical["ward_id"], + ) + + past_reform = pyro.sample( + "past_reform", dist.Normal(0, 1), obs=categorical["past_reform"] + ) + + # past_reform_by_zone = pyro.deterministic( + # "past_reform_by_zone", + # categorical_interaction_variable([past_reform, zone_id])[0], + # ) + # categorical_levels["past_reform_by_zone"] = torch.unique( + # past_reform_by_zone + # ) + + # ___________________________________ + # deterministic def of actual limits + # ___________________________________ + + with data_plate: + limit_con = pyro.deterministic( + "limit_con", + torch.where( + zone_id == 0, + torch.tensor(0.0), + torch.where( + zone_id == 1, + 1.0 - past_reform, + torch.where( + zone_id == 2, 1.0 - 0.5 * past_reform, torch.tensor(1.0) + ), + ), + ), + event_dim=0, + ) + + # __________________________________ + # regression for distance to transit + # __________________________________ + + distance_to_transit_continuous_parents = {} # type: ignore + distance_to_transit_categorical_parents = { + "zone_id": zone_id, + } + distance_to_transit = add_linear_component( + child_name="distance_to_transit", + child_continuous_parents=distance_to_transit_continuous_parents, + child_categorical_parents=distance_to_transit_categorical_parents, + leeway=leeway, + data_plate=data_plate, + observations=continuous["distance_to_transit"], + categorical_levels=categorical_levels, + ) + + # ___________________________ + # regression for parcel area + # 
___________________________ + parcel_area_continuous_parents = {"distance_to_transit": distance_to_transit} # type: ignore + parcel_are_categorical_parents = { + "zone_id": zone_id, + "neighborhood_id": neighborhood_id, + } + parcel_area = add_linear_component( + child_name="parcel_area", + child_continuous_parents=parcel_area_continuous_parents, + child_categorical_parents=parcel_are_categorical_parents, + leeway=leeway, + data_plate=data_plate, + observations=continuous["parcel_area"], + categorical_levels=categorical_levels, + ) + + # ___________________________ + # regression for limit suspended in light of pyro.deterministic + # ___________________________ + + # limit_con_categorical_parents = {"past_reform_by_zone": past_reform_by_zone} + + # # TODO consider using a `pyro.deterministic` statement if safe to assume what the + # # rules are and hard code them + # limit_con = add_linear_component( + # child_name="limit_con", + # child_continuous_parents={}, + # child_categorical_parents=limit_con_categorical_parents, + # leeway=leeway, + # data_plate=data_plate, + # observations=continuous["limit_con"], + # categorical_levels=categorical_levels, + # ) + + # _____________________________ + # regression for housing units + # _____________________________ + + housing_units_continuous_parents = { + "limit_con": limit_con, + "parcel_area": parcel_area, + "distance_to_transit": distance_to_transit, + } + housing_units_categorical_parents = { + "year": year, + "month": month, + "zone_id": zone_id, + "neighborhood_id": neighborhood_id, + "ward_id": ward_id, + } + + housing_units = add_linear_component( + child_name="housing_units", + child_continuous_parents=housing_units_continuous_parents, + child_categorical_parents=housing_units_categorical_parents, + leeway=leeway, + data_plate=data_plate, + observations=outcome, + categorical_levels=categorical_levels, + ) + + return housing_units diff --git a/build/cities/modeling/zoning_models/missingness_only_model.py 
b/build/cities/modeling/zoning_models/missingness_only_model.py new file mode 100644 index 00000000..76fd7e82 --- /dev/null +++ b/build/cities/modeling/zoning_models/missingness_only_model.py @@ -0,0 +1,173 @@ +from typing import Any, Dict, Optional + +import pyro +import pyro.distributions as dist +import torch + +from cities.modeling.zoning_models.units_causal_model import ( + categorical_contribution, + continuous_contribution, + get_n, +) + +# see A WEAKLY INFORMATIVE DEFAULT PRIOR DISTRIBUTION FOR +# LOGISTIC AND OTHER REGRESSION MODELS +# B Y A NDREW G ELMAN , A LEKS JAKULIN , M ARIA G RAZIA +# P ITTAU AND Y U -S UNG S +# they recommed Cauchy with 2.5 scale for coefficient priors + +# see also zoning_missingness_only.ipynb for a normal approximation + + +def add_logistic_component( + child_name: "str", + child_continuous_parents, + child_categorical_parents, + leeway, + data_plate, + observations=None, + categorical_levels=None, +): + + continuous_contribution_to_child = continuous_contribution( + child_continuous_parents, child_name, leeway + ) + + categorical_contribution_to_child = categorical_contribution( + child_categorical_parents, + child_name, + leeway, + categorical_levels=categorical_levels, + ) + + with data_plate: + + mean_prediction_child = pyro.deterministic( # type: ignore + f"mean_outcome_prediction_{child_name}", + categorical_contribution_to_child + continuous_contribution_to_child, + event_dim=0, + ) + + child_probs = pyro.deterministic( + f"child_probs_{child_name}_{child_name}", + torch.sigmoid(mean_prediction_child), + event_dim=0, + ) + + child_observed = pyro.sample( + f"{child_name}", + dist.Bernoulli(child_probs), + obs=observations, + ) + + # TODO consider a gamma-like distro here + + return child_observed + + +class MissingnessOnlyModel(pyro.nn.PyroModule): + def __init__( + self, + categorical: Dict[str, torch.Tensor], + continuous: Dict[str, torch.Tensor], + outcome: Optional[ + torch.Tensor + ] = None, # init args kept for 
uniformity, consider deleting + categorical_levels: Optional[Dict[str, Any]] = None, + leeway=0.9, + ): + super().__init__() + + self.leeway = leeway + + self.N_categorical, self.N_continuous, n = get_n(categorical, continuous) + + # you might need and pass further the original + # categorical levels of the training data + if self.N_categorical > 0 and categorical_levels is None: + self.categorical_levels = dict() + for name in categorical.keys(): + self.categorical_levels[name] = torch.unique(categorical[name]) + else: + self.categorical_levels = categorical_levels # type: ignore + + def forward( + self, + categorical: Dict[str, torch.Tensor], + continuous: Dict[str, torch.Tensor], + outcome: Optional[torch.Tensor] = None, + categorical_levels: Optional[Dict[str, torch.Tensor]] = None, + leeway=0.9, + ): + if categorical_levels is None: + categorical_levels = self.categorical_levels + + _N_categorical, _N_continuous, n = get_n(categorical, continuous) + + data_plate = pyro.plate("data", size=n, dim=-1) + + ################# + # register + ################# + with data_plate: + + year = pyro.sample( + "year", + dist.Categorical(torch.ones(len(categorical_levels["year"]))), + obs=categorical["year"], + ) + + value = pyro.sample("value", dist.Normal(0, 1), obs=continuous["value"]) + + # month = pyro.sample( + # "month", + # dist.Categorical(torch.ones(len(categorical_levels["month"]))), + # obs=categorical["month"], + # ) + + # zone_id = pyro.sample( + # "zone_id", + # dist.Categorical(torch.ones(len(categorical_levels["zone_id"]))), + # obs=categorical["zone_id"], + # ) + + # neighborhood_id = pyro.sample( + # "neighborhood_id", + # dist.Categorical( + # torch.ones(len(categorical_levels["neighborhood_id"])) + # ), + # obs=categorical["neighborhood_id"], + # ) + + # ward_id = pyro.sample( + # "ward_id", + # dist.Categorical(torch.ones(len(categorical_levels["ward_id"]))), + # obs=categorical["ward_id"], + # ) + + # past_reform = pyro.sample( + # "past_reform", 
dist.Normal(0, 1), obs=categorical["past_reform"] + # ) + + # ___________________________ + # logistic regression for applied + # ___________________________ + + applied_continuous_parents = { + "value": value, + } + applied_categorical_parents = { + "year": year, + } + + applied = add_logistic_component( + child_name="applied", + child_continuous_parents=applied_continuous_parents, + child_categorical_parents=applied_categorical_parents, + leeway=11.57, + data_plate=data_plate, + observations=categorical["applied"], + categorical_levels=categorical_levels, + ) + + return applied diff --git a/build/cities/modeling/zoning_models/tracts_model.py b/build/cities/modeling/zoning_models/tracts_model.py new file mode 100644 index 00000000..f4f8dc45 --- /dev/null +++ b/build/cities/modeling/zoning_models/tracts_model.py @@ -0,0 +1,703 @@ +from typing import Any, Dict, Optional + +import pyro +import pyro.distributions as dist +import torch + +from cities.modeling.zoning_models.units_causal_model import ( + add_linear_component, + categorical_contribution, + continuous_contribution, + get_n, +) + + +def add_ratio_component( + child_name: "str", + child_continuous_parents, + child_categorical_parents, + leeway, + data_plate, + observations=None, + categorical_levels=None, +): + + continuous_contribution_to_child = continuous_contribution( + child_continuous_parents, child_name, leeway + ) + + categorical_contribution_to_child = categorical_contribution( + child_categorical_parents, + child_name, + leeway, + categorical_levels=categorical_levels, + ) + + sigma_child = pyro.sample(f"sigma_{child_name}", dist.Exponential(40.0)) + + with data_plate: + + mean_prediction_child = pyro.deterministic( # type: ignore + f"mean_outcome_prediction_{child_name}", + categorical_contribution_to_child + continuous_contribution_to_child, + event_dim=0, + ) + + child_probs = pyro.deterministic( + f"child_probs_{child_name}_{child_name}", + torch.sigmoid(mean_prediction_child), + event_dim=0, + 
) + + child_observed = pyro.sample( + child_name, dist.Normal(child_probs, sigma_child), obs=observations + ) + + return child_observed + + +def add_poisson_component( + child_name: str, + child_continuous_parents: Dict[str, torch.Tensor], + child_categorical_parents: Dict[str, torch.Tensor], + leeway: float, + data_plate, + observations: Optional[torch.Tensor] = None, + categorical_levels: Optional[Dict[str, torch.Tensor]] = None, +) -> torch.Tensor: + + continuous_contribution_to_child = continuous_contribution( + child_continuous_parents, child_name, leeway + ) + + categorical_contribution_to_child = categorical_contribution( + child_categorical_parents, + child_name, + leeway, + categorical_levels=categorical_levels, + ) + + with data_plate: + + mean_prediction_child = pyro.deterministic( + f"mean_outcome_prediction_{child_name}", + torch.exp( + categorical_contribution_to_child + continuous_contribution_to_child + ), + event_dim=0, + ) + + child_observed = pyro.sample( + child_name, dist.Poisson(mean_prediction_child), obs=observations + ) + + return child_observed + + +class TractsModelNoRatios(pyro.nn.PyroModule): + def __init__( + self, + categorical: Dict[str, torch.Tensor], + continuous: Dict[str, torch.Tensor], + outcome: Optional[ + torch.Tensor + ] = None, # init args kept for uniformity, consider deleting + categorical_levels: Optional[Dict[str, Any]] = None, + leeway=0.9, + ): + super().__init__() + + self.leeway = leeway + + self.N_categorical, self.N_continuous, n = get_n(categorical, continuous) + + # you might need and pass further the original + # categorical levels of the training data + if self.N_categorical > 0 and categorical_levels is None: + self.categorical_levels = dict() + for name in categorical.keys(): + self.categorical_levels[name] = torch.unique(categorical[name]) + else: + self.categorical_levels = categorical_levels # type: ignore + + def forward( + self, + categorical: Dict[str, torch.Tensor], + continuous: Dict[str, 
torch.Tensor], + outcome: Optional[torch.Tensor] = None, + categorical_levels: Optional[Dict[str, torch.Tensor]] = None, + leeway=0.9, + ): + if categorical_levels is None: + categorical_levels = self.categorical_levels + + _N_categorical, _N_continuous, n = get_n(categorical, continuous) + + data_plate = pyro.plate("data", size=n, dim=-1) + + # ################# + # # register + # ################# + with data_plate: + + year = pyro.sample( + "year", + dist.Categorical(torch.ones(len(categorical_levels["year"]))), + obs=categorical["year"], + ) + + distance = pyro.sample( + "distance", dist.Normal(0, 1), obs=continuous["median_distance"] + ) + + # past_reform = pyro.sample( + # "past_reform", + # dist.Categorical(torch.ones(len(categorical_levels["past_reform"]))), + # obs=categorical["past_reform"], + # ) + + # ___________________________ + # regression for white + # ___________________________ + + white_continuous_parents = { + "distance": distance, + } + + white_categorical_parents = { + "year": year, + } + + white = add_linear_component( + child_name="white", + child_continuous_parents=white_continuous_parents, + child_categorical_parents=white_categorical_parents, + leeway=0.9, + data_plate=data_plate, + observations=continuous["white"], + ) + + # ___________________________ + # regression for segregation + # ___________________________ + + segregation_continuous_parents = { + "distance": distance, + "white": white, + } + + segregation_categorical_parents = { + "year": year, + } + + segregation = add_linear_component( + child_name="segregation", + child_continuous_parents=segregation_continuous_parents, + child_categorical_parents=segregation_categorical_parents, + leeway=0.9, + data_plate=data_plate, + observations=continuous["segregation"], + ) + + # ___________________________ + # regression for income + # ___________________________ + + income_continuous_parents = { + "distance": distance, + "white": white, + "segregation": segregation, + } + + 
income_categorical_parents = { + "year": year, + } + + income = add_linear_component( + child_name="income", + child_continuous_parents=income_continuous_parents, + child_categorical_parents=income_categorical_parents, + leeway=0.9, + data_plate=data_plate, + observations=continuous["income"], + ) + + # _____________________________ + # regression for limit + # _____________________________ + + limit_continuous_parents = { + "distance": distance, + } + + limit_categorical_parents = { + "year": year, + } + + limit = add_linear_component( + child_name="limit", + child_continuous_parents=limit_continuous_parents, + child_categorical_parents=limit_categorical_parents, + leeway=0.9, + data_plate=data_plate, + observations=continuous["mean_limit"], + ) + + # _____________________________ + # regression for median value + # _____________________________ + + value_continuous_parents = { + "distance": distance, + "limit": limit, + "income": income, + "white": white, + "segregation": segregation, + } + + value_categorical_parents = { + "year": year, + } + + median_value = add_linear_component( + child_name="median_value", + child_continuous_parents=value_continuous_parents, + child_categorical_parents=value_categorical_parents, + leeway=0.9, + data_plate=data_plate, + observations=continuous["median_value"], + ) + + # ___________________________ + # regression for housing units + # ___________________________ + + housing_units_continuous_parents = { + "median_value": median_value, + "distance": distance, + "income": income, + "white": white, + "limit": limit, + "segregation": segregation, + } + + housing_units_categorical_parents = { + "year": year, + } + + housing_units = add_linear_component( + child_name="housing_units", + child_continuous_parents=housing_units_continuous_parents, + child_categorical_parents=housing_units_categorical_parents, + leeway=0.9, + data_plate=data_plate, + observations=continuous["housing_units"], + ) + + return housing_units + + +class 
TractsModel(pyro.nn.PyroModule): + def __init__( + self, + categorical: Dict[str, torch.Tensor], + continuous: Dict[str, torch.Tensor], + outcome: Optional[ + torch.Tensor + ] = None, # init args kept for uniformity, consider deleting + categorical_levels: Optional[Dict[str, Any]] = None, + leeway=0.9, + ): + super().__init__() + + self.leeway = leeway + + self.N_categorical, self.N_continuous, n = get_n(categorical, continuous) + + # you might need and pass further the original + # categorical levels of the training data + if self.N_categorical > 0 and categorical_levels is None: + self.categorical_levels = dict() + for name in categorical.keys(): + self.categorical_levels[name] = torch.unique(categorical[name]) + else: + self.categorical_levels = categorical_levels # type: ignore + + def forward( + self, + categorical: Dict[str, torch.Tensor], + continuous: Dict[str, torch.Tensor], + outcome: Optional[torch.Tensor] = None, + categorical_levels: Optional[Dict[str, torch.Tensor]] = None, + leeway=0.9, + ): + if categorical_levels is None: + categorical_levels = self.categorical_levels + + _N_categorical, _N_continuous, n = get_n(categorical, continuous) + + data_plate = pyro.plate("data", size=n, dim=-1) + + # ################# + # # register + # ################# + with data_plate: + + year = pyro.sample( + "year", + dist.Categorical(torch.ones(len(categorical_levels["year"]))), + obs=categorical["year"], + ) + + distance = pyro.sample( + "distance", dist.Normal(0, 1), obs=continuous["median_distance"] + ) + + # past_reform = pyro.sample( + # "past_reform", + # dist.Categorical(torch.ones(len(categorical_levels["past_reform"]))), + # obs=categorical["past_reform"], + # ) + + # ___________________________ + # regression for white + # ___________________________ + + white_continuous_parents = { + "distance": distance, + } + + white_categorical_parents = { + "year": year, + } + + white = add_ratio_component( + child_name="white", + 
child_continuous_parents=white_continuous_parents, + child_categorical_parents=white_categorical_parents, + leeway=11.57, + data_plate=data_plate, + observations=continuous["white_original"], + ) + + # ___________________________ + # regression for segregation + # ___________________________ + + segregation_continuous_parents = { + "distance": distance, + "white": white, + } + + segregation_categorical_parents = { + "year": year, + } + + segregation = add_ratio_component( + child_name="segregation", + child_continuous_parents=segregation_continuous_parents, + child_categorical_parents=segregation_categorical_parents, + leeway=11.57, + data_plate=data_plate, + observations=continuous["segregation_original"], + ) + + # ___________________________ + # regression for income + # ___________________________ + + income_continuous_parents = { + "distance": distance, + "white": white, + "segregation": segregation, + } + + income_categorical_parents = { + "year": year, + } + + income = add_linear_component( + child_name="income", + child_continuous_parents=income_continuous_parents, + child_categorical_parents=income_categorical_parents, + leeway=0.9, + data_plate=data_plate, + observations=continuous["income"], + ) + + # _____________________________ + # regression for limit + # _____________________________ + + limit_continuous_parents = { + "distance": distance, + } + + limit_categorical_parents = { + "year": year, + } + + limit = add_ratio_component( + child_name="limit", + child_continuous_parents=limit_continuous_parents, + child_categorical_parents=limit_categorical_parents, + leeway=11.57, + data_plate=data_plate, + observations=continuous["mean_limit_original"], + ) + + # _____________________________ + # regression for median value + # _____________________________ + + value_continuous_parents = { + "distance": distance, + "limit": limit, + "income": income, + "white": white, + "segregation": segregation, + } + + value_categorical_parents = { + "year": year, + } + 
+ median_value = add_linear_component( + child_name="median_value", + child_continuous_parents=value_continuous_parents, + child_categorical_parents=value_categorical_parents, + leeway=0.9, + data_plate=data_plate, + observations=continuous["median_value"], + ) + + # ___________________________ + # regression for housing units + # ___________________________ + + housing_units_continuous_parents = { + "median_value": median_value, + "distance": distance, + "income": income, + "white": white, + "limit": limit, + "segregation": segregation, + } + + housing_units_categorical_parents = { + "year": year, + } + + housing_units = add_linear_component( + child_name="housing_units", + child_continuous_parents=housing_units_continuous_parents, + child_categorical_parents=housing_units_categorical_parents, + leeway=0.9, + data_plate=data_plate, + observations=continuous["housing_units"], + ) + + return housing_units + + +class TractsModelPoisson(pyro.nn.PyroModule): + def __init__( + self, + categorical: Dict[str, torch.Tensor], + continuous: Dict[str, torch.Tensor], + outcome: Optional[ + torch.Tensor + ] = None, # init args kept for uniformity, consider deleting + categorical_levels: Optional[Dict[str, Any]] = None, + leeway=0.9, + ): + super().__init__() + + self.leeway = leeway + + self.N_categorical, self.N_continuous, n = get_n(categorical, continuous) + + # you might need and pass further the original + # categorical levels of the training data + if self.N_categorical > 0 and categorical_levels is None: + self.categorical_levels = dict() + for name in categorical.keys(): + self.categorical_levels[name] = torch.unique(categorical[name]) + else: + self.categorical_levels = categorical_levels # type: ignore + + def forward( + self, + categorical: Dict[str, torch.Tensor], + continuous: Dict[str, torch.Tensor], + outcome: Optional[torch.Tensor] = None, + categorical_levels: Optional[Dict[str, torch.Tensor]] = None, + leeway=0.9, + ): + if categorical_levels is None: + 
categorical_levels = self.categorical_levels + + _N_categorical, _N_continuous, n = get_n(categorical, continuous) + + data_plate = pyro.plate("data", size=n, dim=-1) + + # ################# + # # register + # ################# + with data_plate: + + year = pyro.sample( + "year", + dist.Categorical(torch.ones(len(categorical_levels["year"]))), + obs=categorical["year"], + ) + + distance = pyro.sample( + "distance", dist.Normal(0, 1), obs=continuous["median_distance"] + ) + + # past_reform = pyro.sample( + # "past_reform", + # dist.Categorical(torch.ones(len(categorical_levels["past_reform"]))), + # obs=categorical["past_reform"], + # ) + + # ___________________________ + # regression for white + # ___________________________ + + white_continuous_parents = { + "distance": distance, + } + + white_categorical_parents = { + "year": year, + } + + white = add_ratio_component( + child_name="white", + child_continuous_parents=white_continuous_parents, + child_categorical_parents=white_categorical_parents, + leeway=11.57, + data_plate=data_plate, + observations=continuous["white_original"], + ) + + # ___________________________ + # regression for segregation + # ___________________________ + + segregation_continuous_parents = { + "distance": distance, + "white": white, + } + + segregation_categorical_parents = { + "year": year, + } + + segregation = add_ratio_component( + child_name="segregation", + child_continuous_parents=segregation_continuous_parents, + child_categorical_parents=segregation_categorical_parents, + leeway=11.57, + data_plate=data_plate, + observations=continuous["segregation_original"], + ) + + # ___________________________ + # regression for income + # ___________________________ + + income_continuous_parents = { + "distance": distance, + "white": white, + "segregation": segregation, + } + + income_categorical_parents = { + "year": year, + } + + income = add_linear_component( + child_name="income", + child_continuous_parents=income_continuous_parents, + 
child_categorical_parents=income_categorical_parents, + leeway=0.9, + data_plate=data_plate, + observations=continuous["income"], + ) + + # #_____________________________ + # # regression for limit + # #_____________________________ + + limit_continuous_parents = { + "distance": distance, + } + + limit_categorical_parents = { + "year": year, + } + + limit = add_ratio_component( + child_name="limit", + child_continuous_parents=limit_continuous_parents, + child_categorical_parents=limit_categorical_parents, + leeway=11.57, + data_plate=data_plate, + observations=continuous["mean_limit_original"], + ) + + # # _____________________________ + # # regression for median value + # # _____________________________ + + value_continuous_parents = { + "distance": distance, + "limit": limit, + "income": income, + "white": white, + "segregation": segregation, + } + + value_categorical_parents = { + "year": year, + } + + median_value = add_linear_component( + child_name="median_value", + child_continuous_parents=value_continuous_parents, + child_categorical_parents=value_categorical_parents, + leeway=0.9, + data_plate=data_plate, + observations=continuous["median_value"], + ) + + # # ___________________________ + # # regression for housing units + # # ___________________________ + + housing_units_continuous_parents = { + "median_value": median_value, + "distance": distance, + "income": income, + "white": white, + "limit": limit, + "segregation": segregation, + } + + housing_units_categorical_parents = { + "year": year, + } + + housing_units = add_poisson_component( + child_name="housing_units_original", + child_continuous_parents=housing_units_continuous_parents, + child_categorical_parents=housing_units_categorical_parents, + leeway=11.57, + data_plate=data_plate, + observations=continuous["housing_units_original"], + ) + + return housing_units diff --git a/build/cities/modeling/zoning_models/units_causal_model.py b/build/cities/modeling/zoning_models/units_causal_model.py new 
def get_n(categorical: Dict[str, torch.Tensor], continuous: Dict[str, torch.Tensor]):
    """Count the variables of each kind and the number of observations.

    :param categorical: mapping from variable name to its column of values.
    :param continuous: mapping from variable name to its column of values.
    :return: ``(N_categorical, N_continuous, n)``; ``n`` is the length of the
        first categorical column, or of the first continuous column when
        there are no categorical variables.
    :raises ValueError: if both mappings are empty, because the number of
        observations cannot be determined.
    """
    N_categorical = len(categorical)
    N_continuous = len(continuous)

    if N_categorical > 0:
        n = len(next(iter(categorical.values())))
    elif N_continuous > 0:
        n = len(next(iter(continuous.values())))
    else:
        # Previously `n` was simply left unbound here, which surfaced as an
        # opaque UnboundLocalError at the return statement.
        raise ValueError(
            "Cannot infer the number of observations: both `categorical` "
            "and `continuous` are empty."
        )

    return N_categorical, N_continuous, n


def categorical_contribution(categorical, child_name, leeway, categorical_levels=None):
    """Sum the per-level weights selected by each categorical parent.

    For every parent a weight vector with one entry per level is sampled at
    the site ``weights_categorical_{name}_{child_name}`` from
    ``Normal(0, leeway)`` and indexed with the parent's values.

    :param categorical: mapping from parent name to its tensor of values;
        the values are used directly as indices into the weight vector
        (assumes integer codes within the range of levels; confirm with
        the caller).
    :param child_name: name of the child variable, used in site names.
    :param leeway: standard deviation of the Normal prior on the weights.
    :param categorical_levels: mapping from parent name to a tensor of its
        levels; computed with ``torch.unique`` from ``categorical`` if omitted.
    :return: element-wise sum of the selected weights over all parents, or
        ``torch.zeros(1)`` when there are no categorical parents.
    """
    if not categorical:
        # Mirrors continuous_contribution, which also yields zeros(1) for an
        # empty parent set; torch.stack would otherwise fail on an empty list.
        return torch.zeros(1)

    if categorical_levels is None:
        categorical_levels = {
            name: torch.unique(column) for name, column in categorical.items()
        }

    selected_weights = []
    for name, column in categorical.items():
        level_weights = pyro.sample(
            f"weights_categorical_{name}_{child_name}",
            dist.Normal(0.0, leeway).expand(categorical_levels[name].shape).to_event(1),
        )
        selected_weights.append(level_weights[..., column])

    # Bring every term to the shape of the first one so they can be stacked.
    reference_shape = selected_weights[0].shape
    aligned = [selected_weights[0]] + [
        term.view(reference_shape) for term in selected_weights[1:]
    ]

    return torch.stack(aligned, dim=0).sum(dim=0)


def continuous_contribution(continuous, child_name, leeway):
    """Sum a separately parameterised linear term for each continuous parent.

    Each parent gets its own bias (site ``bias_continuous_{key}_{child_name}``)
    and weight (site ``weight_continuous_{key}_{child_name}``), both drawn
    from ``Normal(0, leeway)``.

    :param continuous: mapping from parent name to its tensor of values.
    :param child_name: name of the child variable, used in site names.
    :param leeway: standard deviation of the Normal priors.
    :return: sum over parents of ``bias + weight * value``; ``torch.zeros(1)``
        when there are no continuous parents.
    """
    total = torch.zeros(1)

    for key, value in continuous.items():
        bias_continuous = pyro.sample(
            f"bias_continuous_{key}_{child_name}",
            dist.Normal(0.0, leeway),
        )
        weight_continuous = pyro.sample(
            f"weight_continuous_{key}_{child_name}",
            dist.Normal(0.0, leeway),
        )
        total = (bias_continuous + weight_continuous * value) + total

    return total
def add_linear_component(
    child_name: "str",
    child_continuous_parents,
    child_categorical_parents,
    leeway,
    data_plate,
    observations=None,
    categorical_levels=None,
):
    """Add a Normal regression node for ``child_name`` to the model.

    The mean is the sum of the categorical and continuous parent
    contributions; the noise scale is sampled at ``sigma_{child_name}`` from
    ``Exponential(1.0)``.

    :param child_name: name of the sample site for the child variable.
    :param child_continuous_parents: mapping name -> tensor of continuous parents.
    :param child_categorical_parents: mapping name -> tensor of categorical parents.
    :param leeway: prior scale forwarded to the contribution helpers.
    :param data_plate: plate inside which the mean and the child are registered.
    :param observations: observed values of the child, or ``None`` to sample it.
    :param categorical_levels: levels forwarded to ``categorical_contribution``.
    :return: the value of the ``child_name`` sample site.
    """
    sigma_child = pyro.sample(  # type: ignore
        f"sigma_{child_name}", dist.Exponential(1.0)
    )

    continuous_contribution_to_child = continuous_contribution(
        child_continuous_parents, child_name, leeway
    )

    categorical_contribution_to_child = categorical_contribution(
        child_categorical_parents,
        child_name,
        leeway,
        categorical_levels=categorical_levels,
    )

    with data_plate:

        # Registered as a deterministic site so the mean shows up in traces.
        mean_prediction_child = pyro.deterministic(  # type: ignore
            f"mean_outcome_prediction_{child_name}",
            categorical_contribution_to_child + continuous_contribution_to_child,
            event_dim=0,
        )

        child_observed = pyro.sample(  # type: ignore
            f"{child_name}",
            dist.Normal(mean_prediction_child, sigma_child),
            obs=observations,
        )

        # TODO consider a gamma-like distro here

    return child_observed


def categorical_interaction_variable(interaction_list: List[torch.Tensor]):
    """Encode the joint value of several categorical tensors as a single code.

    :param interaction_list: at least two tensors of identical shape
        (NOTE(review): the row-wise ``torch.unique(..., dim=0)`` below looks
        like it assumes 1-D inputs; confirm against callers).
    :return: ``(codes, indexing_dictionary)``, where ``codes`` has the shape
        of the inputs and holds the index of each distinct combination, and
        ``indexing_dictionary`` maps each combination (as a tuple) to that
        index.
    :raises AssertionError: if fewer than two tensors are given or their
        shapes differ.
    """
    assert len(interaction_list) > 1

    expected_shape = interaction_list[0].shape
    for tensor in interaction_list:
        assert tensor.shape == expected_shape

    # One row per observation, one column per interacting variable.
    stacked_tensor = torch.stack(interaction_list, dim=-1)

    unique_pairs, inverse_indices = torch.unique(
        stacked_tensor, return_inverse=True, dim=0
    )

    unique_combined_tensor = inverse_indices.reshape(expected_shape)

    indexing_dictionary = {
        tuple(pair.tolist()): i for i, pair in enumerate(unique_pairs)
    }

    return unique_combined_tensor, indexing_dictionary


class UnitsCausalModel(pyro.nn.PyroModule):
    """Pyro model chaining regressions for parcel area, limit and housing units."""

    def __init__(
        self,
        categorical: Dict[str, torch.Tensor],
        continuous: Dict[str, torch.Tensor],
        outcome: Optional[
            torch.Tensor
        ] = None,  # init args kept for uniformity, consider deleting
        categorical_levels: Optional[Dict[str, Any]] = None,
        leeway=0.9,
    ):
        """Store the prior scale and the categorical levels.

        :param categorical: dict of categorical data.
        :param continuous: dict of continuous data.
        :param outcome: unused here; kept for a uniform constructor signature.
        :param categorical_levels: levels per categorical variable; when
            omitted and categorical data is present, they are computed with
            ``torch.unique`` from ``categorical``.
        :param leeway: stored on the instance as ``self.leeway``.
        """
        super().__init__()

        self.leeway = leeway

        self.N_categorical, self.N_continuous, _ = get_n(categorical, continuous)

        # you might need and pass further the original
        # categorical levels of the training data
        if self.N_categorical > 0 and categorical_levels is None:
            self.categorical_levels = {
                name: torch.unique(values) for name, values in categorical.items()
            }
        else:
            self.categorical_levels = categorical_levels  # type: ignore

    def forward(
        self,
        categorical: Dict[str, torch.Tensor],
        continuous: Dict[str, torch.Tensor],
        outcome: Optional[torch.Tensor] = None,
        categorical_levels: Optional[Dict[str, torch.Tensor]] = None,
        leeway=0.9,
    ):
        """Run the model on one dataset.

        :param categorical: must provide ``year``, ``month``, ``zone_id``,
            ``neighborhood_id``, ``ward_id`` and ``past_reform``.
        :param continuous: must provide ``parcel_area`` and ``limit_con``.
        :param outcome: observed housing units, or ``None`` to sample them.
        :param categorical_levels: levels per categorical variable; defaults
            to the levels stored at construction time.
        :param leeway: prior scale passed to every regression component.
        :return: the value of the ``housing_units`` site.
        """
        if categorical_levels is None:
            categorical_levels = self.categorical_levels

        # Work on a shallow copy: the interaction levels are added below and
        # must not leak into the caller's dict or into self.categorical_levels.
        categorical_levels = dict(categorical_levels)

        _, _, n = get_n(categorical, continuous)

        data_plate = pyro.plate("data", size=n, dim=-1)

        #################
        # register
        #################
        with data_plate:

            year = pyro.sample(
                "year",
                dist.Categorical(torch.ones(len(categorical_levels["year"]))),
                obs=categorical["year"],
            )

            month = pyro.sample(
                "month",
                dist.Categorical(torch.ones(len(categorical_levels["month"]))),
                obs=categorical["month"],
            )

            zone_id = pyro.sample(
                "zone_id",
                dist.Categorical(torch.ones(len(categorical_levels["zone_id"]))),
                obs=categorical["zone_id"],
            )

            neighborhood_id = pyro.sample(
                "neighborhood_id",
                dist.Categorical(
                    torch.ones(len(categorical_levels["neighborhood_id"]))
                ),
                obs=categorical["neighborhood_id"],
            )

            ward_id = pyro.sample(
                "ward_id",
                dist.Categorical(torch.ones(len(categorical_levels["ward_id"]))),
                obs=categorical["ward_id"],
            )

            past_reform = pyro.sample(
                "past_reform", dist.Normal(0, 1), obs=categorical["past_reform"]
            )

            # Joint code for each (past_reform, zone_id) combination.
            past_reform_by_zone = pyro.deterministic(
                "past_reform_by_zone",
                categorical_interaction_variable([past_reform, zone_id])[0],
            )
            categorical_levels["past_reform_by_zone"] = torch.unique(
                past_reform_by_zone
            )

        # ___________________________
        # regression for parcel area
        # ___________________________
        parcel_area_continuous_parents = {}  # type: ignore
        parcel_area_categorical_parents = {
            "zone_id": zone_id,
            "neighborhood_id": neighborhood_id,
        }
        parcel_area = add_linear_component(
            child_name="parcel_area",
            child_continuous_parents=parcel_area_continuous_parents,
            child_categorical_parents=parcel_area_categorical_parents,
            leeway=leeway,
            data_plate=data_plate,
            observations=continuous["parcel_area"],
            categorical_levels=categorical_levels,
        )

        # ___________________________
        # regression for limit
        # ___________________________

        limit_con_categorical_parents = {"past_reform_by_zone": past_reform_by_zone}

        # TODO consider using a `pyro.deterministic` statement if safe to assume what the
        # rules are and hard code them
        limit_con = add_linear_component(
            child_name="limit_con",
            child_continuous_parents={},
            child_categorical_parents=limit_con_categorical_parents,
            leeway=leeway,
            data_plate=data_plate,
            observations=continuous["limit_con"],
            categorical_levels=categorical_levels,
        )

        # _____________________________
        # regression for housing units
        # _____________________________

        housing_units_continuous_parents = {
            "limit_con": limit_con,
            "parcel_area": parcel_area,
        }
        housing_units_categorical_parents = {
            "year": year,
            "month": month,
            "zone_id": zone_id,
            "neighborhood_id": neighborhood_id,
            "ward_id": ward_id,
        }

        housing_units = add_linear_component(
            child_name="housing_units",
            child_continuous_parents=housing_units_continuous_parents,
            child_categorical_parents=housing_units_categorical_parents,
            leeway=leeway,
            data_plate=data_plate,
            observations=outcome,
            categorical_levels=categorical_levels,
        )

        return housing_units
b/build/cities/modeling/zoning_models/zoning_tracts_continuous_interactions_model.py new file mode 100644 index 00000000..69bd017a --- /dev/null +++ b/build/cities/modeling/zoning_models/zoning_tracts_continuous_interactions_model.py @@ -0,0 +1,301 @@ +import warnings +from typing import Any, Dict, Optional + +import pyro +import pyro.distributions as dist +import torch + +from cities.modeling.model_components import ( + add_linear_component, + add_linear_component_continuous_interactions, + add_ratio_component_continuous_interactions, + add_ratio_component, + check_categorical_is_subset_of_levels, + get_categorical_levels, + get_n, +) + + +class TractsModelContinuousInteractions(pyro.nn.PyroModule): + def __init__( + self, + categorical: Dict[str, torch.Tensor], + continuous: Dict[str, torch.Tensor], + outcome: Optional[ + torch.Tensor + ] = None, # init args kept for uniformity, consider deleting + categorical_levels: Optional[Dict[str, Any]] = None, + leeway=0.9, + housing_units_continuous_interaction_pairs=[], + limit_continuous_interaction_pairs=[], + ): + """ + + :param categorical: dict of categorical data + :param continuous: dict of continuous data + :param outcome: outcome data (unused, todo remove) + :param categorical_levels: dict of unique categorical values. If this is not passed, it will be computed from + the provided categorical data. Importantly, if categorical is a subset of the full dataset, this automated + computation may omit categorical levels that are present in the full dataset but not in the subset. 
+ """ + super().__init__() + + self.leeway = leeway + self.housing_units_continuous_interaction_pairs = ( + housing_units_continuous_interaction_pairs + ) + self.limit_continuous_interaction_pairs = limit_continuous_interaction_pairs + + self.N_categorical, self.N_continuous, n = get_n(categorical, continuous) + + if self.N_categorical > 0 and categorical_levels is None: + self.categorical_levels = get_categorical_levels(categorical) + else: + self.categorical_levels = categorical_levels + + def forward( + self, + categorical: Dict[str, torch.Tensor], + continuous: Dict[str, torch.Tensor], + outcome: Optional[torch.Tensor] = None, + leeway=0.9, + categorical_levels=None, + n=None, + ): + if categorical_levels is not None: + warnings.warn( + "Passed categorical_levels will no longer override the levels passed to or computed during" + " model initialization. The argument will be ignored." + ) + + categorical_levels = self.categorical_levels + assert check_categorical_is_subset_of_levels(categorical, categorical_levels) + + if n is None: + _, _, n = get_n(categorical, continuous) + + data_plate = pyro.plate("data", size=n, dim=-1) + + # _________ + # register + # _________ + + with data_plate: + + year = pyro.sample( + "year", + dist.Categorical(torch.ones(len(categorical_levels["year"]))), + obs=categorical["year"], + ) + + distance = pyro.sample( + "distance", dist.Normal(0, 1), obs=continuous["median_distance"] + ) + + downtown_overlap = pyro.sample( + "downtown_overlap", + dist.Normal(0, 1), + obs=continuous["downtown_overlap"], + ) + + university_overlap = pyro.sample( + "university_overlap", + dist.Normal(0, 1), + obs=continuous["university_overlap"], + ) + + # ______________________ + # regression for sqm + # ______________________ + + sqm_continuous_parents = { + "distance": distance, + } + + sqm_categorical_parents = { + "year": year, + } + + sqm = add_linear_component( + child_name="sqm", + child_continuous_parents=sqm_continuous_parents, + 
child_categorical_parents=sqm_categorical_parents, + leeway=0.5, + data_plate=data_plate, + observations=continuous["parcel_sqm"], + categorical_levels=self.categorical_levels, + ) + + # _______________________ + # regression for limit + # _______________________ + + limit_continuous_parents = { + "distance": distance, + "downtown_overlap": downtown_overlap, + "university_overlap": university_overlap, + } + + limit_categorical_parents = { + "year": year, + } + + + + limit = add_ratio_component( + child_name="limit", + child_continuous_parents=limit_continuous_parents, + child_categorical_parents=limit_categorical_parents, + leeway=8, # , + data_plate=data_plate, + observations=continuous["mean_limit_original"], + categorical_levels=self.categorical_levels, + ) + + + # limit = add_ratio_component( + # child_name="limit", + # child_continuous_parents=limit_continuous_parents, + # child_categorical_parents=limit_categorical_parents, + # leeway=8, # , + # data_plate=data_plate, + # observations=continuous["mean_limit_original"], + # categorical_levels=self.categorical_levels, + # ) + + # _____________________ + # regression for white + # _____________________ + + white_continuous_parents = { + "distance": distance, + "sqm": sqm, + "limit": limit, + } + + white_categorical_parents = { + "year": year, + } + + white = add_ratio_component( + child_name="white", + child_continuous_parents=white_continuous_parents, + child_categorical_parents=white_categorical_parents, + leeway=8, # 11.57, + data_plate=data_plate, + observations=continuous["white_original"], + categorical_levels=self.categorical_levels, + ) + + # ___________________________ + # regression for segregation + # ___________________________ + + segregation_continuous_parents = { + "distance": distance, + "white": white, + "sqm": sqm, + "limit": limit, + } + + segregation_categorical_parents = { + "year": year, + } + + segregation = add_ratio_component( + child_name="segregation", + 
child_continuous_parents=segregation_continuous_parents, + child_categorical_parents=segregation_categorical_parents, + leeway=8, # 11.57, + data_plate=data_plate, + observations=continuous["segregation_original"], + categorical_levels=self.categorical_levels, + ) + + # ______________________ + # regression for income + # ______________________ + + income_continuous_parents = { + "distance": distance, + "white": white, + "segregation": segregation, + "sqm": sqm, + "limit": limit, + } + + income_categorical_parents = { + "year": year, + } + + income = add_linear_component( + child_name="income", + child_continuous_parents=income_continuous_parents, + child_categorical_parents=income_categorical_parents, + leeway=0.5, + data_plate=data_plate, + observations=continuous["income"], + categorical_levels=self.categorical_levels, + ) + + # _____________________________ + # regression for median value + # _____________________________ + + value_continuous_parents = { + "distance": distance, + "income": income, + "white": white, + "segregation": segregation, + "sqm": sqm, + "limit": limit, + } + + value_categorical_parents = { + "year": year, + } + + median_value = add_linear_component( + child_name="median_value", + child_continuous_parents=value_continuous_parents, + child_categorical_parents=value_categorical_parents, + leeway=0.5, + data_plate=data_plate, + observations=continuous["median_value"], + categorical_levels=self.categorical_levels, + ) + + # ______________________________ + # regression for housing units + # ______________________________ + + housing_units_continuous_parents = { + "median_value": median_value, + "distance": distance, + "income": income, + "white": white, + "limit": limit, + "segregation": segregation, + "sqm": sqm, + "downtown_overlap": downtown_overlap, + "university_overlap": university_overlap, + } + + housing_units_categorical_parents = { + "year": year, + # "university_index": university_index, + # "downtown_index": downtown_index, + } + 
+ housing_units = add_linear_component_continuous_interactions( + child_name="housing_units", + child_continuous_parents=housing_units_continuous_parents, + child_categorical_parents=housing_units_categorical_parents, + continous_interaction_pairs=self.housing_units_continuous_interaction_pairs, + leeway=0.5, + data_plate=data_plate, + observations=continuous["housing_units"], + categorical_levels=self.categorical_levels, + ) + + return housing_units diff --git a/build/cities/modeling/zoning_models/zoning_tracts_model.py b/build/cities/modeling/zoning_models/zoning_tracts_model.py new file mode 100644 index 00000000..0357bdc4 --- /dev/null +++ b/build/cities/modeling/zoning_models/zoning_tracts_model.py @@ -0,0 +1,234 @@ +import warnings +from typing import Any, Dict, Optional + +import pyro +import pyro.distributions as dist +import torch + +from cities.modeling.model_components import ( + add_linear_component, + add_ratio_component, + check_categorical_is_subset_of_levels, + get_categorical_levels, + get_n, +) + + +class TractsModel(pyro.nn.PyroModule): + + def __init__( + self, + categorical: Dict[str, torch.Tensor], + continuous: Dict[str, torch.Tensor], + outcome: Optional[ + torch.Tensor + ] = None, # init args kept for uniformity, consider deleting + categorical_levels: Optional[Dict[str, Any]] = None, + leeway=0.9, + ): + """ + + :param categorical: dict of categorical data + :param continuous: dict of continuous data + :param outcome: outcome data (unused, todo remove) + :param categorical_levels: dict of unique categorical values. If this is not passed, it will be computed from + the provided categorical data. Importantly, if categorical is a subset of the full dataset, this automated + computation may omit categorical levels that are present in the full dataset but not in the subset. 
+ """ + super().__init__() + + self.leeway = leeway + + self.N_categorical, self.N_continuous, n = get_n(categorical, continuous) + + # you might need and pass further the original + # categorical levels of the training data + if self.N_categorical > 0 and categorical_levels is None: + self.categorical_levels = get_categorical_levels(categorical) + else: + self.categorical_levels = categorical_levels # type: ignore + + def forward( + self, + categorical: Dict[str, torch.Tensor], + continuous: Dict[str, torch.Tensor], + outcome: Optional[torch.Tensor] = None, + leeway=0.9, + categorical_levels=None, + n=None, + ): + if categorical_levels is not None: + warnings.warn( + "Passed categorical_levels will no longer override the levels passed to or computed during" + " model initialization. The argument will be ignored." + ) + + categorical_levels = self.categorical_levels + assert check_categorical_is_subset_of_levels(categorical, categorical_levels) + + if n is None: + _, _, n = get_n(categorical, continuous) + + data_plate = pyro.plate("data", size=n, dim=-1) + + # _________ + # register + # _________ + + with data_plate: + + year = pyro.sample( + "year", + dist.Categorical(torch.ones(len(categorical_levels["year"]))), + obs=categorical["year"], + ) + + distance = pyro.sample( + "distance", dist.Normal(0, 1), obs=continuous["median_distance"] + ) + + # _____________________ + # regression for white + # _____________________ + + white_continuous_parents = { + "distance": distance, + } + + white_categorical_parents = { + "year": year, + } + + white = add_ratio_component( + child_name="white", + child_continuous_parents=white_continuous_parents, + child_categorical_parents=white_categorical_parents, + leeway=11.57, + data_plate=data_plate, + observations=continuous["white_original"], + categorical_levels=self.categorical_levels, + ) + + # ___________________________ + # regression for segregation + # ___________________________ + + segregation_continuous_parents = { + 
"distance": distance, + "white": white, + } + + segregation_categorical_parents = { + "year": year, + } + + segregation = add_ratio_component( + child_name="segregation", + child_continuous_parents=segregation_continuous_parents, + child_categorical_parents=segregation_categorical_parents, + leeway=11.57, + data_plate=data_plate, + observations=continuous["segregation_original"], + categorical_levels=self.categorical_levels, + ) + + # ______________________ + # regression for income + # ______________________ + + income_continuous_parents = { + "distance": distance, + "white": white, + "segregation": segregation, + } + + income_categorical_parents = { + "year": year, + } + + income = add_linear_component( + child_name="income", + child_continuous_parents=income_continuous_parents, + child_categorical_parents=income_categorical_parents, + leeway=0.9, + data_plate=data_plate, + observations=continuous["income"], + categorical_levels=self.categorical_levels, + ) + + # _______________________ + # regression for limit + # _______________________ + + limit_continuous_parents = { + "distance": distance, + } + + limit_categorical_parents = { + "year": year, + } + + limit = add_ratio_component( + child_name="limit", + child_continuous_parents=limit_continuous_parents, + child_categorical_parents=limit_categorical_parents, + leeway=11.57, + data_plate=data_plate, + observations=continuous["mean_limit_original"], + categorical_levels=self.categorical_levels, + ) + + # _____________________________ + # regression for median value + # _____________________________ + + value_continuous_parents = { + "distance": distance, + "limit": limit, + "income": income, + "white": white, + "segregation": segregation, + } + + value_categorical_parents = { + "year": year, + } + + median_value = add_linear_component( + child_name="median_value", + child_continuous_parents=value_continuous_parents, + child_categorical_parents=value_categorical_parents, + leeway=0.9, + data_plate=data_plate, + 
observations=continuous["median_value"], + categorical_levels=self.categorical_levels, + ) + + # ______________________________ + # regression for housing units + # ______________________________ + + housing_units_continuous_parents = { + "median_value": median_value, + "distance": distance, + "income": income, + "white": white, + "limit": limit, + "segregation": segregation, + } + + housing_units_categorical_parents = { + "year": year, + } + + housing_units = add_linear_component( + child_name="housing_units", + child_continuous_parents=housing_units_continuous_parents, + child_categorical_parents=housing_units_categorical_parents, + leeway=0.9, + data_plate=data_plate, + observations=continuous["housing_units"], + categorical_levels=self.categorical_levels, + ) + + return housing_units diff --git a/build/cities/modeling/zoning_models/zoning_tracts_sqm_model.py b/build/cities/modeling/zoning_models/zoning_tracts_sqm_model.py new file mode 100644 index 00000000..5634a52f --- /dev/null +++ b/build/cities/modeling/zoning_models/zoning_tracts_sqm_model.py @@ -0,0 +1,261 @@ +import warnings +from typing import Any, Dict, Optional + +import pyro +import pyro.distributions as dist +import torch + +from cities.modeling.model_components import ( + add_linear_component, + add_ratio_component, + check_categorical_is_subset_of_levels, + get_categorical_levels, + get_n, +) + + +class TractsModelSqm(pyro.nn.PyroModule): + def __init__( + self, + categorical: Dict[str, torch.Tensor], + continuous: Dict[str, torch.Tensor], + outcome: Optional[ + torch.Tensor + ] = None, # init args kept for uniformity, consider deleting + categorical_levels: Optional[Dict[str, Any]] = None, + leeway=0.9, + ): + """ + + :param categorical: dict of categorical data + :param continuous: dict of continuous data + :param outcome: outcome data (unused, todo remove) + :param categorical_levels: dict of unique categorical values. 
If this is not passed, it will be computed from + the provided categorical data. Importantly, if categorical is a subset of the full dataset, this automated + computation may omit categorical levels that are present in the full dataset but not in the subset. + """ + super().__init__() + + self.leeway = leeway + + self.N_categorical, self.N_continuous, n = get_n(categorical, continuous) + + if self.N_categorical > 0 and categorical_levels is None: + self.categorical_levels = get_categorical_levels(categorical) + else: + self.categorical_levels = categorical_levels + + def forward( + self, + categorical: Dict[str, torch.Tensor], + continuous: Dict[str, torch.Tensor], + outcome: Optional[torch.Tensor] = None, + leeway=0.9, + categorical_levels=None, + n=None, + ): + if categorical_levels is not None: + warnings.warn( + "Passed categorical_levels will no longer override the levels passed to or computed during" + " model initialization. The argument will be ignored." + ) + + categorical_levels = self.categorical_levels + assert check_categorical_is_subset_of_levels(categorical, categorical_levels) + + if n is None: + _, _, n = get_n(categorical, continuous) + + data_plate = pyro.plate("data", size=n, dim=-1) + + # _________ + # register + # _________ + + with data_plate: + + year = pyro.sample( + "year", + dist.Categorical(torch.ones(len(categorical_levels["year"]))), + obs=categorical["year"], + ) + + distance = pyro.sample( + "distance", dist.Normal(0, 1), obs=continuous["median_distance"] + ) + + # ______________________ + # regression for sqm + # ______________________ + + sqm_continuous_parents = { + "distance": distance, + } + + sqm_categorical_parents = { + "year": year, + } + + sqm = add_linear_component( + child_name="sqm", + child_continuous_parents=sqm_continuous_parents, + child_categorical_parents=sqm_categorical_parents, + leeway=0.5, + data_plate=data_plate, + observations=continuous["parcel_sqm"], + categorical_levels=self.categorical_levels, + ) + + # 
_______________________ + # regression for limit + # _______________________ + + limit_continuous_parents = { + "distance": distance, + } + + limit_categorical_parents = { + "year": year, + } + + limit = add_ratio_component( + child_name="limit", + child_continuous_parents=limit_continuous_parents, + child_categorical_parents=limit_categorical_parents, + leeway=8, # , + data_plate=data_plate, + observations=continuous["mean_limit_original"], + categorical_levels=self.categorical_levels, + ) + + # _____________________ + # regression for white + # _____________________ + + white_continuous_parents = { + "distance": distance, + "sqm": sqm, + "limit": limit, + } + + white_categorical_parents = { + "year": year, + } + + white = add_ratio_component( + child_name="white", + child_continuous_parents=white_continuous_parents, + child_categorical_parents=white_categorical_parents, + leeway=8, # 11.57, + data_plate=data_plate, + observations=continuous["white_original"], + categorical_levels=self.categorical_levels, + ) + + # ___________________________ + # regression for segregation + # ___________________________ + + segregation_continuous_parents = { + "distance": distance, + "white": white, + "sqm": sqm, + "limit": limit, + } + + segregation_categorical_parents = { + "year": year, + } + + segregation = add_ratio_component( + child_name="segregation", + child_continuous_parents=segregation_continuous_parents, + child_categorical_parents=segregation_categorical_parents, + leeway=8, # 11.57, + data_plate=data_plate, + observations=continuous["segregation_original"], + categorical_levels=self.categorical_levels, + ) + + # ______________________ + # regression for income + # ______________________ + + income_continuous_parents = { + "distance": distance, + "white": white, + "segregation": segregation, + "sqm": sqm, + "limit": limit, + } + + income_categorical_parents = { + "year": year, + } + + income = add_linear_component( + child_name="income", + 
child_continuous_parents=income_continuous_parents, + child_categorical_parents=income_categorical_parents, + leeway=0.5, + data_plate=data_plate, + observations=continuous["income"], + categorical_levels=self.categorical_levels, + ) + + # _____________________________ + # regression for median value + # _____________________________ + + value_continuous_parents = { + "distance": distance, + "income": income, + "white": white, + "segregation": segregation, + "sqm": sqm, + "limit": limit, + } + + value_categorical_parents = { + "year": year, + } + + median_value = add_linear_component( + child_name="median_value", + child_continuous_parents=value_continuous_parents, + child_categorical_parents=value_categorical_parents, + leeway=0.5, + data_plate=data_plate, + observations=continuous["median_value"], + categorical_levels=self.categorical_levels, + ) + + # ______________________________ + # regression for housing units + # ______________________________ + + housing_units_continuous_parents = { + "median_value": median_value, + "distance": distance, + "income": income, + "white": white, + "limit": limit, + "segregation": segregation, + "sqm": sqm, + } + + housing_units_categorical_parents = { + "year": year, + } + + housing_units = add_linear_component( + child_name="housing_units", + child_continuous_parents=housing_units_continuous_parents, + child_categorical_parents=housing_units_categorical_parents, + leeway=0.5, + data_plate=data_plate, + observations=continuous["housing_units"], + categorical_levels=self.categorical_levels, + ) + + return housing_units diff --git a/build/cities/queries/__init__.py b/build/cities/queries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/build/cities/queries/causal_insight.py b/build/cities/queries/causal_insight.py new file mode 100644 index 00000000..7a7a7e98 --- /dev/null +++ b/build/cities/queries/causal_insight.py @@ -0,0 +1,585 @@ +import os + +import dill +import matplotlib.pyplot as plt +import numpy 
as np +import pandas as pd +import plotly.graph_objects as go +import pyro +import torch +from sklearn.preprocessing import StandardScaler + +from cities.modeling.model_interactions import model_cities_interaction +from cities.modeling.modeling_utils import prep_wide_data_for_inference +from cities.utils.cleaning_utils import ( + revert_prediction_df, + revert_standardize_and_scale_scaler, + sigmoid, +) +from cities.utils.data_grabber import DataGrabber, find_repo_root +from cities.utils.percentiles import transformed_intervention_from_percentile + + +class CausalInsight: + def __init__( + self, + outcome_dataset, + intervention_dataset, + num_samples=1000, + sites=None, + smoke_test=None, + ): + self.outcome_dataset = outcome_dataset + self.intervention_dataset = intervention_dataset + self.root = find_repo_root() + self.num_samples = num_samples + self.data = None + self.smoke_test = smoke_test + + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + self.tau_samples_path = os.path.join( + self.root, + "data/tau_samples", + f"{self.intervention_dataset}_{self.outcome_dataset}_{self.num_samples}_tau.pkl", + ) + + # these are loaded/computed as need be + + self.guide = None + self.data = None + self.fips_id = None + self.name = None + self.model = None + self.model_args = None + self.predictive = None + self.samples = None + self.tensed_samples = None + self.tensed_tau_samples = None + + self.intervened_value = None # possibly in the transformed scale + self.intervention_is_percentile = None # flag for the sort of input + self.intervened_percentile = None # possible passed at input + self.intervened_value_percentile = ( + None # calculated if input was on the transformed scale + ) + self.intervened_value_original = None # in the original scale + self.observed_intervention = None # in the transformed scale + self.observed_intervention_original = None # in the original scale + self.observed_intervention_percentile = None # calculated if 
input + # was on the transformed scale + self.observed_outcomes = None + self.intervention_diff = ( + None # difference between observed and counterfactual value of the + ) + # intervention variable + self.intervention_impact = None # dictionary with preds for each shift + self.predictions = None # df with preds, can be passed to plotting + self.prediction_original = None # df with preds on the original scale + # can be passed to plotting + self.fips_observed_data = None # to be used for plotting in + # contrast with the counterfactual prediction + self.year_id = None # year of intervention as index in the outcome years + self.prediction_years = None + + # these are used in posterior predictive checks + self.average_predictions = None + self.r_squared = None + + def load_guide(self, forward_shift): + pyro.clear_param_store() + guide_name = ( + f"{self.intervention_dataset}_{self.outcome_dataset}_{forward_shift}" + ) + guide_path = os.path.join( + self.root, "data/model_guides", f"{guide_name}_guide.pkl" + ) + + with open(guide_path, "rb") as file: + self.guide = dill.load(file) + param_path = os.path.join( + self.root, "data/model_guides", f"{guide_name}_params.pth" + ) + + pyro.get_param_store().load(param_path) + + self.forward_shift = forward_shift + + def generate_samples(self): + self.data = prep_wide_data_for_inference( + outcome_dataset=self.outcome_dataset, + intervention_dataset=self.intervention_dataset, + forward_shift=self.forward_shift, + ) + self.model = model_cities_interaction + + self.model_args = self.data["model_args"] + + self.predictive = pyro.infer.Predictive( + model=self.model, + guide=self.guide, + num_samples=self.num_samples, + parallel=True, + # return_sites=self.sites, + ) + self.samples = self.predictive(*self.model_args) + + # idexing and gathering with mwc in this context + # seems to fail, calculating the expected diff made by the intervention manually + # wrt to actual observed outcomes rather than predicting outcomes themselves + 
# effectively keeping the noise fixed and focusing on a counterfactual claim + + # TODO possible delete in the current strategy deemed uncontroversial + # else: + # if not isinstance(intervened_value, torch.Tensor): + # intervened_value = torch.tensor(intervened_value, device=self.device) + # intervened_expanded = intervened_value.expand_as(self.data['t']) + + # with MultiWorldCounterfactual(first_available_dim=-6) as mwc: + # with do(actions = dict(T = intervened_expanded)): + # self.predictive = pyro.infer.Predictive(model=self.model, guide=self.guide, + # num_samples=self.num_samples, parallel=True) + # self.samples = self.predictive(*self.model_args) + # self.mwc = mwc + + def generate_tensed_samples(self): + self.tensed_samples = {} + self.tensed_tau_samples = {} + + for shift in [1, 2, 3]: + self.load_guide(shift) + self.generate_samples() + self.tensed_samples[shift] = self.samples + self.tensed_tau_samples[shift] = ( + self.samples["weight_TY"].squeeze().detach().numpy() + ) + + if not self.smoke_test: + if not os.path.exists(self.tau_samples_path): + with open(self.tau_samples_path, "wb") as file: + dill.dump(self.tensed_tau_samples, file) + + def get_tau_samples(self): + if os.path.exists(self.tau_samples_path): + with open(self.tau_samples_path, "rb") as file: + self.tensed_tau_samples = dill.load(file) + else: + raise ValueError("No tau samples found. 
Run generate_tensed_samples first.") + + """Returns the intervened and observed value, in the original scale""" + + def slider_values_to_interventions(self, intervened_percent, year): + try: + original_column = dg.wide[self.intervention_dataset][ + str(year) + ].values.reshape(-1, 1) + except NameError: + dg = DataGrabber() + dg.get_features_wide([self.intervention_dataset]) + original_column = dg.wide[self.intervention_dataset][ + str(year) + ].values.reshape(-1, 1) + + max = original_column.max() + + intervened_original = intervened_percent * max / 100 + + scaler = StandardScaler() + scaler.fit(original_column) + + intervened_scaled = scaler.transform(intervened_original.reshape(-1, 1)) + intervened_transformed = sigmoid(intervened_scaled, scale=1 / 3) + + # TODO this output is a bit verbose + # consider deleting what ends up not needed in the frontend + percent_calcs = { + "max": max, + "intervened_percent": intervened_percent, + "intervened_original": intervened_original, + "intervened_scaled": intervened_scaled[0, 0], + "intervened_transformed": intervened_transformed[0, 0], + } + + return percent_calcs + + def get_intervened_and_observed_values_original_scale( + self, fips, intervened_value, year + ): + dg = DataGrabber() + dg.get_features_std_wide([self.intervention_dataset, self.outcome_dataset]) + dg.get_features_wide([self.intervention_dataset]) + + # intervened value, in the original scale + intervened_original_scale = revert_standardize_and_scale_scaler( + intervened_value, year, self.intervention_dataset + ) + + fips_id = ( + dg.std_wide[self.intervention_dataset] + .loc[dg.std_wide[self.intervention_dataset]["GeoFIPS"] == fips] + .index[0] + ) + + # observed value, in the original scale + observed_original_scale = dg.wide[self.intervention_dataset].iloc[fips_id][ + str(year) + ] + + return (intervened_original_scale[0], observed_original_scale) + + def get_fips_predictions( + self, fips, intervened_value, year=None, intervention_is_percentile=False + 
): + self.fips = fips + + if self.data is None: + self.data = prep_wide_data_for_inference( + outcome_dataset=self.outcome_dataset, + intervention_dataset=self.intervention_dataset, + forward_shift=3, # shift doesn't matter here, as long as data exists + ) + + # start with the latest year possible by default + if year is None: + year = self.data["years_available"][-1] + assert year in self.data["years_available"] + + self.year = year + + if intervention_is_percentile: + self.intervened_percentile = intervened_value + intervened_value = transformed_intervention_from_percentile( + self.intervention_dataset, year, intervened_value + ) + + self.intervened_value = intervened_value + + # find years for prediction + outcome_years = self.data["outcome_years"] + year_id = [int(x) for x in outcome_years].index(year) + self.year_id = year_id + + self.prediction_years = outcome_years[(year_id) : (year_id + 4)] + + # find fips unit index + dg = DataGrabber() + dg.get_features_std_wide([self.intervention_dataset, self.outcome_dataset]) + dg.get_features_wide([self.intervention_dataset]) + interventions_this_year_original = dg.wide[self.intervention_dataset][str(year)] + + self.intervened_value_original = revert_standardize_and_scale_scaler( + self.intervened_value, self.year, self.intervention_dataset + ) + + self.intervened_value_percentile = round( + ( + np.mean( + interventions_this_year_original.values + <= self.intervened_value_original + ) + * 100 + ), + 3, + ) + + self.fips_id = ( + dg.std_wide[self.intervention_dataset] + .loc[dg.std_wide[self.intervention_dataset]["GeoFIPS"] == fips] + .index[0] + ) + + self.name = dg.std_wide[self.intervention_dataset]["GeoName"].iloc[self.fips_id] + + # get observed values at the prediction times + self.observed_intervention = dg.std_wide[self.intervention_dataset].iloc[ + self.fips_id + ][str(year)] + + self.observed_intervention_original = dg.wide[self.intervention_dataset].iloc[ + self.fips_id + ][str(year)] + + if 
intervention_is_percentile: + self.observed_intervention_percentile = round( + ( + np.mean( + interventions_this_year_original.values + <= self.observed_intervention_original + ) + * 100 + ), + 1, + ) + + self.observed_outcomes = dg.std_wide[self.outcome_dataset].iloc[self.fips_id][ + outcome_years[year_id : (year_id + 4)] + ] + self.intervention_diff = self.intervened_value - self.observed_intervention + + self.intervention_impact = {} + self.intervention_impact_mean = [] + self.intervention_impact_low = [] + self.intervention_impact_high = [] + for shift in [1, 2, 3]: + self.intervention_impact[shift] = ( + self.tensed_tau_samples[shift] * self.intervention_diff + ) + self.intervention_impact_mean.append( + np.mean(self.intervention_impact[shift]) + ) + self.intervention_impact_low.append( + np.percentile(self.intervention_impact[shift], 2.5) + ) + self.intervention_impact_high.append( + np.percentile(self.intervention_impact[shift], 97.5) + ) + + predicted_mean = [self.observed_outcomes.iloc[0]] + ( + self.intervention_impact_mean + self.observed_outcomes.iloc[1:] + ).tolist() + predicted_low = [self.observed_outcomes.iloc[0]] + ( + self.intervention_impact_low + self.observed_outcomes.iloc[1:] + ).tolist() + predicted_high = [self.observed_outcomes.iloc[0]] + ( + self.intervention_impact_high + self.observed_outcomes.iloc[1:] + ).tolist() + + self.predictions = pd.DataFrame( + { + "year": self.prediction_years, + "observed": self.observed_outcomes, + "mean": predicted_mean, + "low": predicted_low, + "high": predicted_high, + } + ) + + self.predictions_original = revert_prediction_df( + self.predictions, self.outcome_dataset + ) + + # TODO for some reason indexing using gather doesn't pick the right indices + # look into this some time, do this by hand for now + # with self.mwc: + # self.tau_samples = self.samples['weight_TY'].squeeze().detach().numpy() + # self.tensed_observed_samples[shift] = self.tensed_intervened_samples[shift] = gather( + # 
self.samples['Y'], IndexSet(**{"T": {0}}), + # event_dim=0,).squeeze() + # self.tensed_intervened_samples[shift] = gather( + # self.samples['Y'], IndexSet(**{"T": {1}}), + # event_dim=0,).squeeze()#[:,self.fips_id] + + # self.tensed_outcome_difference[shift] = ( + # self.tensed_intervened_samples[shift] - self.tensed_observed_samples[shift] + # ) + return + + def plot_predictions( + self, range_multiplier=1.5, show_figure=True, scaling="transformed" + ): + assert scaling in ["transformed", "original"] + + dg = DataGrabber() + + if scaling == "transformed": + dg.get_features_std_long([self.outcome_dataset]) + plot_data = dg.std_long[self.outcome_dataset] + self.fips_observed_data = plot_data[ + plot_data["GeoFIPS"] == self.fips + ].copy() + + y_min = ( + min( + self.fips_observed_data["Value"].min(), + self.predictions["low"].min(), + ) + - 0.05 + ) + y_max = ( + max( + self.fips_observed_data["Value"].max(), + self.predictions["high"].max(), + ) + + 0.05 + ) + else: + dg.get_features_long([self.outcome_dataset]) + plot_data = dg.long[self.outcome_dataset] + + self.fips_observed_data = plot_data[ + plot_data["GeoFIPS"] == self.fips + ].copy() + + y_min = 0.8 * min( + self.fips_observed_data["Value"].min(), + self.predictions_original["low"].min(), + ) + y_max = 1.3 * max( + self.fips_observed_data["Value"].max(), + self.predictions_original["high"].max(), + ) + + fig = go.Figure() + + fig.add_trace( + go.Scatter( + x=self.fips_observed_data["Year"], + y=self.fips_observed_data["Value"], + mode="lines+markers", + name=self.fips_observed_data["GeoName"].iloc[0], + line=dict(color="darkred", width=3), + text=self.fips_observed_data["GeoName"].iloc[0], + textposition="top right", + showlegend=False, + ) + ) + + if scaling == "transformed": + fig.add_trace( + go.Scatter( + x=self.predictions["year"], + y=self.predictions["mean"], + mode="lines", + line=dict(color="blue", width=2), + name="mean prediction", + text=self.predictions["mean"], + ) + ) + + 
credible_interval_trace = go.Scatter( + x=pd.concat([self.predictions["year"], self.predictions["year"][::-1]]), + y=pd.concat([self.predictions["high"], self.predictions["low"][::-1]]), + fill="toself", + fillcolor="rgba(0,100,80,0.2)", + line=dict(color="rgba(255,255,255,0)"), + name="95% credible interval around mean", + ) + + else: + fig.add_trace( + go.Scatter( + x=self.predictions_original["year"], + y=self.predictions_original["mean"], + mode="lines", + line=dict(color="blue", width=2), + name="mean prediction", + text=self.predictions_original["mean"], + ) + ) + + credible_interval_trace = go.Scatter( + x=pd.concat( + [ + self.predictions_original["year"], + self.predictions_original["year"][::-1], + ] + ), + y=pd.concat( + [ + self.predictions_original["high"], + self.predictions_original["low"][::-1], + ] + ), + fill="toself", + fillcolor="rgba(255, 255, 255, 0.31)", + line=dict(color="rgba(255,255,255,0)"), + name="95% credible interval around mean", + ) + + fig.add_trace(credible_interval_trace) + + if hasattr(self, "intervened_percentile"): + intervened_value = self.intervened_percentile + observed_intervention = self.observed_intervention_percentile + + else: + intervened_value = round(self.intervened_value, 3) + observed_intervention = round(self.observed_intervention, 3) + + if scaling == "transformed": + title = ( + f"Predicted {self.outcome_dataset} in {self.name} under intervention {intervened_value} " + f"in year {self.year}
" + f"compared to the observed values under observed intervention " + f"{observed_intervention}." + ) + + else: + title = ( + f"Predicted {self.outcome_dataset} in {self.name}
" + f"under intervention {self.intervened_value_original}" + f" in year {self.year}
" + f"{self.intervened_value_percentile}% of counties received a lower intervention
" + f"observed intervention: {self.observed_intervention_original}" + ) + + fig.update_yaxes(range=[y_min, y_max]) + + fig.update_layout( + title=title, + title_font=dict(size=12), + xaxis_title="Year", + yaxis_title="Value", + template="simple_white", + legend=dict(x=0.05, y=1, traceorder="normal", orientation="h"), + ) + + self.predictions_plot = fig + + if show_figure: + fig.show() + else: + return fig + + def plot_residuals(self): + predictions = self.samples["Y"].squeeze() + self.average_predictions = torch.mean(predictions, dim=0) + plt.hist(self.average_predictions - self.data["y"].squeeze(), bins=70) + plt.xlabel("residuals") + plt.ylabel("counts") + plt.text( + 0.7, + -0.1, + "(colored by year)", + ha="left", + va="bottom", + transform=plt.gca().transAxes, + ) + plt.show() + + def predictive_check(self): + y_flat = self.data["y"].view(-1) + observed_mean = torch.mean(y_flat) + tss = torch.sum((y_flat - observed_mean) ** 2) + average_predictions_flat = self.average_predictions.view(-1) + rss = torch.sum((y_flat - average_predictions_flat) ** 2) + r_squared = 1 - (rss / tss) + self.r_squared = r_squared + rounded_r_squared = np.round(r_squared.item(), 2) + plt.scatter(y=average_predictions_flat, x=y_flat) + plt.title( + f"{self.intervention_dataset}, {self.outcome_dataset}, " + f"R2={rounded_r_squared}" + ) + plt.ylabel("average prediction") + plt.xlabel("observed outcome") + plt.show + + def estimate_ATE(self): + tau_samples = self.samples["weight_TY"].squeeze().detach().numpy() + plt.hist(tau_samples, bins=70) + plt.axvline( + x=tau_samples.mean(), + color="red", + linestyle="dashed", + linewidth=2, + label=f"mean = {tau_samples.mean():.3f}", + ) + plt.title( + f"ATE for {self.intervention_dataset} and {self.outcome_dataset} " + f"with forward shift = {self.forward_shift}" + ) + plt.ylabel("counts") + plt.xlabel("ATE") + plt.legend() + plt.show() diff --git a/build/cities/queries/causal_insight_slim.py b/build/cities/queries/causal_insight_slim.py new file 
class CausalInsightSlim:
    """
    Predict an outcome variable under a hypothetical intervention, using
    pre-computed samples of the causal effect ("tau samples").

    The tau samples are read from a pickled file (see ``get_tau_samples``).
    For each of the three years following the intervention year, the samples
    for that shift are multiplied by the difference between the intervened and
    the observed intervention value, and the result is added to the observed
    outcome. Means and 2.5/97.5 percentiles of these sums are reported.

    Typical use::

        insight = CausalInsightSlim(outcome_dataset, intervention_dataset)
        insight.get_tau_samples()
        insight.get_fips_predictions(fips, intervened_value)
        insight.plot_predictions()
    """

    def __init__(
        self,
        outcome_dataset,
        intervention_dataset,
        num_samples=1000,
        sites=None,
        smoke_test=None,
    ):
        """
        Store the configuration and derive the path of the tau samples file.

        :param outcome_dataset: name of the outcome dataset.
        :param intervention_dataset: name of the intervention dataset.
        :param num_samples: number of samples; part of the tau samples file name.
        :param sites: accepted for interface compatibility; not used by this class.
        :param smoke_test: stored on the instance; not otherwise used by this class.
        """
        self.outcome_dataset = outcome_dataset
        self.intervention_dataset = intervention_dataset
        self.root = find_repo_root()
        self.num_samples = num_samples
        self.smoke_test = smoke_test
        # lazily filled with the "years available" dictionary on first prediction
        self.data = None

        self.tau_samples_path = os.path.join(
            self.root,
            "data/tau_samples",
            f"{self.intervention_dataset}_{self.outcome_dataset}_{self.num_samples}_tau.pkl",
        )

    def get_tau_samples(self):
        """
        Load the tau samples from ``self.tau_samples_path`` into
        ``self.tensed_tau_samples``.

        :raises ValueError: if the samples file does not exist.
        """
        if not os.path.exists(self.tau_samples_path):
            raise ValueError(
                "No tau samples found. Run generate_tensed_samples first."
            )

        # NOTE: dill.load can execute arbitrary code embedded in the file;
        # only load sample files generated by this project.
        with open(self.tau_samples_path, "rb") as file:
            self.tensed_tau_samples = dill.load(file)

    def slider_values_to_interventions(self, intervened_percent, year):
        """
        Convert a slider position (percent of the yearly maximum) into
        intervention values on the original, standardized and transformed scales.

        :param intervened_percent: percent (0-100) of the maximum observed
            intervention value in ``year``.
        :param year: year whose intervention column is used as reference.
        :return: dictionary with keys ``max``, ``intervened_percent``,
            ``intervened_original``, ``intervened_scaled`` and
            ``intervened_transformed``.
        """
        dg = DataGrabber()
        dg.get_features_wide([self.intervention_dataset])
        original_column = dg.wide[self.intervention_dataset][
            str(year)
        ].values.reshape(-1, 1)

        max_value = original_column.max()
        intervened_original = intervened_percent * max_value / 100

        # standardize with respect to the observed column, then squash
        scaler = StandardScaler()
        scaler.fit(original_column)
        intervened_scaled = scaler.transform(
            np.asarray(intervened_original).reshape(-1, 1)
        )
        intervened_transformed = sigmoid(intervened_scaled, scale=1 / 3)

        # TODO this output is a bit verbose
        # consider deleting what ends up not needed in the frontend
        percent_calcs = {
            "max": max_value,
            "intervened_percent": intervened_percent,
            "intervened_original": intervened_original,
            "intervened_scaled": intervened_scaled[0, 0],
            "intervened_transformed": intervened_transformed[0, 0],
        }

        return percent_calcs

    def get_intervened_and_observed_values_original_scale(
        self, fips, intervened_value, year
    ):
        """
        Express an intervention value and the observed intervention of one
        location on the original scale.

        :param fips: FIPS code of the location.
        :param intervened_value: intervention value on the transformed scale.
        :param year: year of the intervention.
        :return: tuple ``(intervened value, observed value)``, both on the
            original scale.
        """
        dg = DataGrabber()
        dg.get_features_std_wide([self.intervention_dataset, self.outcome_dataset])
        dg.get_features_wide([self.intervention_dataset])

        # intervened value, in the original scale
        intervened_original_scale = revert_standardize_and_scale_scaler(
            intervened_value, year, self.intervention_dataset
        )

        std_interventions = dg.std_wide[self.intervention_dataset]
        fips_id = std_interventions.loc[std_interventions["GeoFIPS"] == fips].index[0]

        # observed value, in the original scale
        observed_original_scale = dg.wide[self.intervention_dataset].iloc[fips_id][
            str(year)
        ]

        return (intervened_original_scale[0], observed_original_scale)

    def _ensure_tau_samples(self):
        """Load the tau samples if a caller has not done so already."""
        if not hasattr(self, "tensed_tau_samples"):
            self.get_tau_samples()

    def _load_years_data(self):
        """Load (once) the dictionary of available years for this dataset pair."""
        if self.data is None:
            file_path = os.path.join(
                self.root,
                "data/years_available",
                f"{self.intervention_dataset}_{self.outcome_dataset}.pkl",
            )
            with open(file_path, "rb") as file:
                self.data = dill.load(file)

    def _prepare_intervention(self, intervened_value, year, intervention_is_percentile):
        """
        Run the setup shared by group and single-location predictions.

        Resolves the year, converts a percentile intervention to the
        transformed scale, records the prediction years and computes the
        intervened value on the original scale together with its percentile.

        :return: tuple ``(dg, interventions_this_year_original)`` where ``dg``
            is a ``DataGrabber`` holding both datasets in wide and std-wide
            form, and the second item is the original-scale intervention
            column for the resolved year.
        """
        self._load_years_data()

        # start with the latest year possible by default
        if year is None:
            year = self.data["years_available"][-1]
        assert year in self.data["years_available"]
        self.year = year

        if intervention_is_percentile:
            self.intervened_percentile = intervened_value
            intervened_value = transformed_intervention_from_percentile(
                self.intervention_dataset, year, intervened_value
            )
        else:
            # plot_predictions decides what to print based on the presence of
            # this attribute, so a value left from an earlier call must not leak
            self.__dict__.pop("intervened_percentile", None)

        self.intervened_value = intervened_value

        # find years for prediction: the intervention year and three more
        outcome_years = self.data["outcome_years"]
        year_id = [int(x) for x in outcome_years].index(year)
        self.year_id = year_id
        self.prediction_years = outcome_years[year_id : (year_id + 4)]

        dg = DataGrabber()
        dg.get_features_std_wide([self.intervention_dataset, self.outcome_dataset])
        dg.get_features_wide([self.intervention_dataset, self.outcome_dataset])
        interventions_this_year_original = dg.wide[self.intervention_dataset][str(year)]

        self.intervened_value_original = revert_standardize_and_scale_scaler(
            self.intervened_value, self.year, self.intervention_dataset
        )

        # share of locations whose intervention is at or below the intervened value
        self.intervened_value_percentile = round(
            (
                np.mean(
                    interventions_this_year_original.values
                    <= self.intervened_value_original
                )
                * 100
            ),
            3,
        )

        return dg, interventions_this_year_original

    def get_group_predictions(
        self,
        group,
        intervened_value,
        year=None,
        intervention_is_percentile=False,
        produce_original=True,
    ):
        """
        Predict outcomes for a group of locations under one intervention value.

        Results are stored in ``self.group_predictions`` (transformed scale)
        and, if requested, ``self.group_predictions_original``; both are
        dictionaries mapping a FIPS code to a data frame with columns
        ``year``, ``observed``, ``mean``, ``low`` and ``high``.

        :param group: iterable of FIPS codes (duplicates are dropped).
        :param intervened_value: intervention on the transformed scale, or a
            percentile if ``intervention_is_percentile`` is true.
        :param year: intervention year (defaults to the latest available year).
        :param intervention_is_percentile: interpret ``intervened_value`` as a
            percentile.
        :param produce_original: also produce predictions on the original scale.
        """
        self.group_clean = sorted(set(group))
        self.produce_original = produce_original

        self._ensure_tau_samples()
        dg, interventions_this_year_original = self._prepare_intervention(
            intervened_value, year, intervention_is_percentile
        )
        year = self.year

        std_interventions = dg.std_wide[self.intervention_dataset]

        # note: ids will be increasingly sorted; pairing them positionally with
        # the sorted group below presumably relies on the data being ordered by
        # GeoFIPS -- TODO confirm
        self.fips_ids = std_interventions.loc[
            std_interventions["GeoFIPS"].isin(self.group_clean)
        ].index.tolist()

        assert len(self.fips_ids) == len(self.group_clean)
        assert set(std_interventions["GeoFIPS"].iloc[self.fips_ids]) == set(
            self.group_clean
        )

        self.names = std_interventions["GeoName"].iloc[self.fips_ids]

        self.observed_interventions = std_interventions.iloc[self.fips_ids][str(year)]

        self.observed_interventions_original = (
            dg.wide[self.intervention_dataset].iloc[self.fips_ids][str(year)].copy()
        )

        if intervention_is_percentile:
            self.observed_interventions_percentile = (
                np.round(
                    [
                        np.mean(interventions_this_year_original.values <= obs)
                        for obs in self.observed_interventions_original
                    ],
                    3,
                )
                * 100
            )

        self.observed_outcomes = dg.std_wide[self.outcome_dataset].iloc[self.fips_ids][
            self.prediction_years
        ]

        self.observed_outcomes_original = dg.wide[self.outcome_dataset].iloc[
            self.fips_ids
        ][self.prediction_years]

        self.intervention_diffs = self.intervened_value - self.observed_interventions

        self.intervention_impacts = {}
        self.intervention_impacts_means = []
        self.intervention_impacts_lows = []
        self.intervention_impacts_highs = []
        for shift in [1, 2, 3]:
            # rows: tau samples, columns: locations
            impacts = np.outer(self.tensed_tau_samples[shift], self.intervention_diffs)
            self.intervention_impacts[shift] = impacts
            self.intervention_impacts_means.append(np.mean(impacts, axis=0))
            self.intervention_impacts_lows.append(
                np.percentile(impacts, axis=0, q=2.5)
            )
            self.intervention_impacts_highs.append(
                np.percentile(impacts, axis=0, q=97.5)
            )

        first_year_observed = self.observed_outcomes.iloc[:, 0]
        later_years_observed = self.observed_outcomes.iloc[:, 1:]

        def _stack_predictions(impacts_per_shift):
            # the intervention year keeps its observed value; the following
            # years are observed values shifted by the estimated impact
            impacts_array = np.column_stack(impacts_per_shift)
            return np.column_stack(
                [first_year_observed, later_years_observed + impacts_array]
            )

        predicted_means = _stack_predictions(self.intervention_impacts_means)
        predicted_lows = _stack_predictions(self.intervention_impacts_lows)
        predicted_highs = _stack_predictions(self.intervention_impacts_highs)

        if self.produce_original:
            pred_means_reverted = []
            pred_lows_reverted = []
            pred_highs_reverted = []
            obs_out_reverted = []
            for i, y in enumerate(self.prediction_years):
                obs_out_reverted.append(
                    revert_standardize_and_scale_scaler(
                        self.observed_outcomes.iloc[:, i], y, self.outcome_dataset
                    )
                )
                pred_means_reverted.append(
                    revert_standardize_and_scale_scaler(
                        predicted_means[:, i], y, self.outcome_dataset
                    )
                )
                pred_lows_reverted.append(
                    revert_standardize_and_scale_scaler(
                        predicted_lows[:, i], y, self.outcome_dataset
                    )
                )
                pred_highs_reverted.append(
                    revert_standardize_and_scale_scaler(
                        predicted_highs[:, i], y, self.outcome_dataset
                    )
                )

            # the reverse transformation is not exact; shift everything by the
            # discrepancy between reverted and truly observed outcomes
            obs_out_reverted = np.column_stack(obs_out_reverted)
            diff = np.array(obs_out_reverted - self.observed_outcomes_original)
            obs_out_corrected = obs_out_reverted - diff
            pred_means_corrected = np.column_stack(pred_means_reverted) - diff
            pred_lows_corrected = np.column_stack(pred_lows_reverted) - diff
            pred_highs_corrected = np.column_stack(pred_highs_reverted) - diff

            self.observed_outcomes_corrected = pd.DataFrame(obs_out_corrected)
            self.observed_outcomes_corrected.index = self.observed_outcomes.index

            assert predicted_means.shape == pred_means_corrected.shape
            assert predicted_lows.shape == pred_lows_corrected.shape
            assert predicted_highs.shape == pred_highs_corrected.shape

        assert int(predicted_means.shape[0]) == len(self.group_clean)
        assert int(predicted_means.shape[1]) == 4
        assert int(predicted_lows.shape[0]) == len(self.group_clean)
        assert int(predicted_lows.shape[1]) == 4
        assert int(predicted_highs.shape[0]) == len(self.group_clean)
        assert int(predicted_highs.shape[1]) == 4

        self.group_predictions = {
            self.group_clean[i]: pd.DataFrame(
                {
                    "year": self.prediction_years,
                    "observed": self.observed_outcomes.loc[self.fips_ids[i]],
                    "mean": predicted_means[i],
                    "low": predicted_lows[i],
                    "high": predicted_highs[i],
                }
            )
            for i in range(len(self.group_clean))
        }

        if self.produce_original:
            self.group_predictions_original = {
                self.group_clean[i]: pd.DataFrame(
                    {
                        "year": self.prediction_years,
                        "observed": self.observed_outcomes_corrected.loc[
                            self.fips_ids[i]
                        ],
                        "mean": pred_means_corrected[i],
                        "low": pred_lows_corrected[i],
                        "high": pred_highs_corrected[i],
                    }
                )
                for i in range(len(self.group_clean))
            }

    def get_fips_predictions(
        self, fips, intervened_value, year=None, intervention_is_percentile=False
    ):
        """
        Predict outcomes for a single location under an intervention value.

        Results are stored in ``self.predictions`` (transformed scale) and
        ``self.predictions_original`` (original scale), data frames with
        columns ``year``, ``observed``, ``mean``, ``low`` and ``high``.

        :param fips: FIPS code of the location.
        :param intervened_value: intervention on the transformed scale, or a
            percentile if ``intervention_is_percentile`` is true.
        :param year: intervention year (defaults to the latest available year).
        :param intervention_is_percentile: interpret ``intervened_value`` as a
            percentile.
        """
        self.fips = fips

        self._ensure_tau_samples()
        dg, interventions_this_year_original = self._prepare_intervention(
            intervened_value, year, intervention_is_percentile
        )
        year = self.year

        std_interventions = dg.std_wide[self.intervention_dataset]
        self.fips_id = std_interventions.loc[
            std_interventions["GeoFIPS"] == fips
        ].index[0]

        self.name = std_interventions["GeoName"].iloc[self.fips_id]

        # get observed values at the prediction times
        self.observed_intervention = std_interventions.iloc[self.fips_id][str(year)]

        self.observed_intervention_original = dg.wide[self.intervention_dataset].iloc[
            self.fips_id
        ][str(year)]

        if intervention_is_percentile:
            self.observed_intervention_percentile = round(
                (
                    np.mean(
                        interventions_this_year_original.values
                        <= self.observed_intervention_original
                    )
                    * 100
                ),
                1,
            )

        self.observed_outcomes = dg.std_wide[self.outcome_dataset].iloc[self.fips_id][
            self.prediction_years
        ]

        self.observed_outcomes_original = dg.wide[self.outcome_dataset].iloc[
            self.fips_id
        ][self.prediction_years]

        self.intervention_diff = self.intervened_value - self.observed_intervention

        self.intervention_impact = {}
        self.intervention_impact_mean = []
        self.intervention_impact_low = []
        self.intervention_impact_high = []
        for shift in [1, 2, 3]:
            impact = self.tensed_tau_samples[shift] * self.intervention_diff
            self.intervention_impact[shift] = impact
            self.intervention_impact_mean.append(np.mean(impact))
            self.intervention_impact_low.append(np.percentile(impact, 2.5))
            self.intervention_impact_high.append(np.percentile(impact, 97.5))

        # the intervention year keeps its observed value; list + Series adds
        # the impacts element-wise to the three following observed outcomes
        first_observed = [self.observed_outcomes.iloc[0]]
        later_observed = self.observed_outcomes.iloc[1:]
        predicted_mean = (
            first_observed + (self.intervention_impact_mean + later_observed).tolist()
        )
        predicted_low = (
            first_observed + (self.intervention_impact_low + later_observed).tolist()
        )
        predicted_high = (
            first_observed + (self.intervention_impact_high + later_observed).tolist()
        )

        self.predictions = pd.DataFrame(
            {
                "year": self.prediction_years,
                "observed": self.observed_outcomes,
                "mean": predicted_mean,
                "low": predicted_low,
                "high": predicted_high,
            }
        )

        self.predictions_original = revert_prediction_df(
            self.predictions, self.outcome_dataset
        )

        # this corrects for reverse transformation perturbations
        difference = (
            self.predictions_original["observed"] - self.observed_outcomes_original
        )
        value_columns = ["observed", "mean", "low", "high"]
        self.predictions_original[value_columns] = self.predictions_original[
            value_columns
        ].sub(difference, axis=0)

    def plot_predictions(
        self, range_multiplier=1.5, show_figure=True, scaling="transformed", fips=None
    ):
        """
        Plot the observed outcome series together with the predicted mean and
        its 95% credible interval.

        The figure is also stored in ``self.predictions_plot``.

        :param range_multiplier: accepted for interface compatibility; not used.
        :param show_figure: if true, show the figure and return ``None``;
            otherwise return the figure.
        :param scaling: ``"transformed"`` or ``"original"``.
        :param fips: FIPS code to plot when predictions were obtained with
            ``get_group_predictions``; leave as ``None`` after
            ``get_fips_predictions``.
        :return: the plotly figure if ``show_figure`` is false, else ``None``.
        """
        assert scaling in ["transformed", "original"]

        # you need to pass fips
        # and grab the appropriate predictions
        # if you started with group predictions
        if fips is not None:
            self.fips = fips
            self.predictions = self.group_predictions[fips]
            self.predictions_original = self.group_predictions_original[fips]

            position = self.group_clean.index(fips)
            row_label = self.fips_ids[position]

            self.observed_intervention = self.observed_interventions[row_label]
            self.observed_intervention_original = self.observed_interventions_original[
                row_label
            ]
            # the title below reads the single-location percentile attribute,
            # which the group predictions only store as an array
            if hasattr(self, "intervened_percentile") and hasattr(
                self, "observed_interventions_percentile"
            ):
                self.observed_intervention_percentile = (
                    self.observed_interventions_percentile[position]
                )

            self.name = self.names[row_label]

        dg = DataGrabber()

        if scaling == "transformed":
            dg.get_features_std_long([self.outcome_dataset])
            plot_data = dg.std_long[self.outcome_dataset]
            self.fips_observed_data = plot_data[
                plot_data["GeoFIPS"] == self.fips
            ].copy()

            y_min = (
                min(
                    self.fips_observed_data["Value"].min(),
                    self.predictions["low"].min(),
                )
                - 0.05
            )
            y_max = (
                max(
                    self.fips_observed_data["Value"].max(),
                    self.predictions["high"].max(),
                )
                + 0.05
            )
            shown_predictions = self.predictions
            interval_fill = "rgba(0,100,80,0.2)"
        else:
            dg.get_features_long([self.outcome_dataset])
            plot_data = dg.long[self.outcome_dataset]
            self.fips_observed_data = plot_data[
                plot_data["GeoFIPS"] == self.fips
            ].copy()

            y_min = 0.8 * min(
                self.fips_observed_data["Value"].min(),
                self.predictions_original["low"].min(),
            )
            y_max = 1.3 * max(
                self.fips_observed_data["Value"].max(),
                self.predictions_original["high"].max(),
            )
            shown_predictions = self.predictions_original
            interval_fill = "rgba(255, 255, 255, 0.31)"

        fig = go.Figure()

        fig.add_trace(
            go.Scatter(
                x=self.fips_observed_data["Year"],
                y=self.fips_observed_data["Value"],
                mode="lines+markers",
                name=self.fips_observed_data["GeoName"].iloc[0],
                line=dict(color="darkred", width=3),
                text=self.fips_observed_data["GeoName"].iloc[0],
                textposition="top right",
                showlegend=False,
            )
        )

        fig.add_trace(
            go.Scatter(
                x=shown_predictions["year"],
                y=shown_predictions["mean"],
                mode="lines",
                line=dict(color="blue", width=2),
                name="mean prediction",
                text=shown_predictions["mean"],
            )
        )

        # closed polygon: upper bound left-to-right, lower bound right-to-left
        credible_interval_trace = go.Scatter(
            x=pd.concat([shown_predictions["year"], shown_predictions["year"][::-1]]),
            y=pd.concat([shown_predictions["high"], shown_predictions["low"][::-1]]),
            fill="toself",
            fillcolor=interval_fill,
            line=dict(color="rgba(255,255,255,0)"),
            name="95% credible interval around mean",
        )

        fig.add_trace(credible_interval_trace)

        if hasattr(self, "intervened_percentile"):
            intervened_value = self.intervened_percentile
            observed_intervention = self.observed_intervention_percentile
        else:
            intervened_value = round(self.intervened_value, 3)
            observed_intervention = round(self.observed_intervention, 3)

        if scaling == "transformed":
            title = (
                f"Predicted {self.outcome_dataset} in {self.name} under intervention {intervened_value} "
                f"in year {self.year}<br>"
                f"compared to the observed values under observed intervention "
                f"{observed_intervention}."
            )
        else:
            title = (
                f"Predicted {self.outcome_dataset} in {self.name}<br>"
                f"under intervention {self.intervened_value_original}"
                f" in year {self.year}<br>"
                f"{self.intervened_value_percentile}% of counties received a lower intervention<br>"
                f"observed intervention: {self.observed_intervention_original}"
            )

        fig.update_yaxes(range=[y_min, y_max])

        fig.update_layout(
            title=title,
            title_font=dict(size=12),
            xaxis_title="Year",
            yaxis_title="Value",
            template="simple_white",
            legend=dict(x=0.05, y=1, traceorder="normal", orientation="h"),
        )

        self.predictions_plot = fig

        if show_figure:
            fig.show()
        else:
            return fig
+ :param time_decay: adjusts the weight decay over time in the generalized Euclidean distance calculation + (default is 1.08, giving somewhat more weight to more recent data). + :param outcome_comparison_period: specifies the years to consider for the outcome comparison, + can be used only when lag=0 (defaults to None). + :param outcome_percentile_range: percentile range for filtering locations based on the most recent value + of the outcome variable (defaults to None). + """ + + if feature_groups_with_weights is None and outcome_var: + feature_groups_with_weights = {outcome_var: 4} + + if outcome_var: + outcome_var_dict = { + outcome_var: feature_groups_with_weights.pop(outcome_var) + } + outcome_var_dict.update(feature_groups_with_weights) + feature_groups_with_weights = outcome_var_dict + + assert not ( + lag > 0 and outcome_var is None + ), "lag will be idle with no outcome variable" + + assert not ( + lag > 0 and outcome_comparison_period is not None + ), "outcome_comparison_period is only used when lag = 0" + + assert not ( + outcome_var is None and outcome_comparison_period is not None + ), "outcome_comparison_period requires an outcome variable" + + assert not ( + outcome_var is None and outcome_percentile_range is not None + ), "outcome_percentile_range requires an outcome variable" + + self.all_available_features = list_available_features() + + feature_groups = list(feature_groups_with_weights.keys()) + + assert feature_groups, "You need to specify at least one feature group" + + assert all( + isinstance(value, int) and -4 <= value <= 4 + for value in feature_groups_with_weights.values() + ), "Feature weights need to be integers between -4 and 4" + + self.feature_groups_with_weights = feature_groups_with_weights + self.feature_groups = feature_groups + self.data = DataGrabber() + self.repo_root = self.data.repo_root + self.fips = fips + self.lag = lag + self.top = top + self.gdp_var = "gdp" + + # it's fine if they're None (by default) + self.outcome_var = 
outcome_var + self.outcome_comparison_period = outcome_comparison_period + + self.time_decay = time_decay + + if self.gdp_var not in self.feature_groups: + self.all_features = [self.gdp_var] + feature_groups + else: + self.all_features = feature_groups + + self.data.get_features_std_wide(self.all_features) + self.data.get_features_wide(self.all_features) + + assert ( + fips in self.data.std_wide[self.gdp_var]["GeoFIPS"].values + ), "FIPS not found in the data set." + self.name = self.data.std_wide[self.gdp_var]["GeoName"][ + self.data.std_wide[self.gdp_var]["GeoFIPS"] == self.fips + ].values[0] + + assert ( + self.lag >= 0 and self.lag < 6 and isinstance(self.lag, int) + ), "lag must be an iteger between 0 and 5" + assert ( + self.top > 0 + and isinstance(self.top, int) + and self.top + < 2800 # TODO Make sure the number makes sense once we add all datasets we need + ), "top must be a positive integer smaller than the number of locations in the dataset" + + if outcome_var: + assert check_if_tensed( + self.data.std_wide[self.outcome_var] + ), "Outcome needs to be a time series." + + self.outcome_with_percentiles = self.data.std_wide[self.outcome_var].copy() + most_recent_outcome = self.data.wide[self.outcome_var].iloc[:, -1].values + self.outcome_with_percentiles["percentile"] = ( + most_recent_outcome < most_recent_outcome[:, np.newaxis] + ).sum(axis=1) / most_recent_outcome.shape[0] + self.outcome_with_percentiles["percentile"] = round( + self.outcome_with_percentiles["percentile"] * 100, 2 + ) + self.outcome_percentile_range = outcome_percentile_range + + def compare_my_outcome_to_others(self, range_multiplier=2, sample_size=250): + """ + Compare the outcome of the selected location to a sample of other locations. + + This method generates a plot comparing the outcome of the current location to a + random sample of other locations. The plot creates a line for the current location + and lines for the sampled locations, providing a visual comparison. 
+ It also marks the precentile at which the current location falls among *all* locations. + + :param range_multiplier: multiplier for adjusting the y-axis range (defaults to 2). + :param sample_size: random sample size of other locations (defaults to 250). + """ + + assert self.outcome_var, "Outcome comparison requires an outcome variable." + + self.data.get_features_long([self.outcome_var]) + plot_data = self.data.long[self.outcome_var] + my_plot_data = plot_data[plot_data["GeoFIPS"] == self.fips].copy() + my_percentile = self.outcome_with_percentiles["percentile"][ + self.outcome_with_percentiles["GeoFIPS"] == self.fips + ].values[0] + + others_plot_data = plot_data[plot_data["GeoFIPS"] != self.fips] + + fips = others_plot_data["GeoFIPS"].unique() + sampled_fips = np.random.choice(fips, sample_size, replace=False) + others_sampled_plot_data = plot_data[plot_data["GeoFIPS"].isin(sampled_fips)] + + y_min = my_plot_data["Value"].mean() - ( + range_multiplier * my_plot_data["Value"].std() + ) + y_max = my_plot_data["Value"].mean() + ( + range_multiplier * my_plot_data["Value"].std() + ) + + fig = go.Figure(layout_yaxis_range=[y_min, y_max]) + + for i, geoname in enumerate(others_sampled_plot_data["GeoName"].unique()): + subset = others_plot_data[others_plot_data["GeoName"] == geoname] + # line_color = shades_of_grey[i % len(shades_of_grey)] + # line_color = pastel_colors[i % len(pastel_colors)] + line_color = "lightgray" + fig.add_trace( + go.Scatter( + x=subset["Year"], + y=subset["Value"], + mode="lines", + name=subset["GeoName"].iloc[0], + line_color=line_color, + text=subset["GeoName"].iloc[0], + textposition="top right", + showlegend=False, + opacity=0.4, + ) + ) + + fig.add_trace( + go.Scatter( + x=my_plot_data["Year"], + y=my_plot_data["Value"], + mode="lines", + name=my_plot_data["GeoName"].iloc[0], + line=dict(color="darkred", width=3), + text=my_plot_data["GeoName"].iloc[0], + textposition="top right", + showlegend=False, + ) + ) + + label_x = 
my_plot_data["Year"].iloc[-1] - 2 + label_y = my_plot_data["Value"].iloc[-1] * 1.2 + fig.add_annotation( + text=f"Location recent percentile: {my_percentile}%", + x=label_x, + y=label_y, + showarrow=False, + font=dict(size=12, color="darkred"), + ) + + title = f"{self.outcome_var} of {self.name}, compared to {sample_size} random other locations" + fig.update_layout( + title=title, + xaxis_title="Year", + yaxis_title=f"{self.outcome_var}", + template="simple_white", + ) + + fig.show() + + def find_euclidean_kins(self): + """ + Find Euclidean kin locations based on the specified features, weights and outcome variable. + + This method calculates the Euclidean distance between the specified location and other + locations in the dataset based on the selected feature groups and outcome variable. It + adds information about the distance and the percentiles of the outcome variable to the + resulting dataframe, allowing for the identification of similar locations. + """ + + # cut the relevant years from the outcome variable + if self.outcome_comparison_period and self.outcome_var: + start_year, end_year = self.outcome_comparison_period + + outcome_df = self.data.std_wide[self.outcome_var].copy() + + condition = (outcome_df.columns[2:].copy().astype(int) >= start_year) & ( + outcome_df.columns[2:].copy().astype(int) <= end_year + ) + selected_columns = outcome_df.columns[2:][condition].copy() + filtered_dataframe = outcome_df[selected_columns] + + restricted_df = pd.concat( + [outcome_df.iloc[:, :2].copy(), filtered_dataframe], axis=1 + ) + + elif self.outcome_var: + restricted_df = self.data.std_wide[self.outcome_var].copy() + + if self.outcome_var: + self.restricted_outcome_df = restricted_df + + # apply lag in different directions to you and other locations + # to the outcome variable + if self.outcome_var: + self.outcome_slices = slice_with_lag(restricted_df, self.fips, self.lag) + + self.my_array = np.array(self.outcome_slices["my_array"]) + self.other_arrays = 
np.array(self.outcome_slices["other_arrays"]) + + assert self.my_array.shape[1] == self.other_arrays.shape[1] + + self.my_df = self.data.wide[self.outcome_var][ + self.data.wide[self.outcome_var]["GeoFIPS"] == self.fips + ].copy() + + self.other_df = self.outcome_slices["other_df"] + self.other_df = self.data.wide[self.outcome_var][ + self.data.wide[self.outcome_var]["GeoFIPS"] != self.fips + ].copy() + else: + self.my_df = pd.DataFrame( + self.data.wide[self.gdp_var][ + self.data.wide[self.gdp_var]["GeoFIPS"] == self.fips + ].iloc[:, :2] + ) + self.other_df = pd.DataFrame( + self.data.wide[self.gdp_var][ + self.data.wide[self.gdp_var]["GeoFIPS"] != self.fips + ].iloc[:, :2] + ) + + # add data on other features to the arrays + # prior to distance computation + + if self.outcome_var: + before_shape = self.other_df.shape + + my_features_arrays = np.array([]) + others_features_arrays = np.array([]) + feature_column_count = 0 + for feature in self.feature_groups: + if feature != self.outcome_var: + _extracted_df = self.data.wide[feature].copy() + feature_column_count += _extracted_df.shape[1] - 2 + _extracted_my_df = _extracted_df[_extracted_df["GeoFIPS"] == self.fips] + _extracted_other_df = _extracted_df[ + _extracted_df["GeoFIPS"] != self.fips + ] + + _extracted_other_df.columns = [ + f"{col}_{feature}" if col not in ["GeoFIPS", "GeoName"] else col + for col in _extracted_other_df.columns + ] + + _extracted_my_df.columns = [ + f"{col}_{feature}" if col not in ["GeoFIPS", "GeoName"] else col + for col in _extracted_my_df.columns + ] + + assert ( + _extracted_df.shape[1] + == _extracted_my_df.shape[1] + == _extracted_other_df.shape[1] + ) + + self.my_df = pd.concat( + (self.my_df, _extracted_my_df.iloc[:, 2:]), axis=1 + ) + + self.other_df = pd.concat( + (self.other_df, _extracted_other_df.iloc[:, 2:]), axis=1 + ) + + if self.outcome_var is None: + assert ( + self.my_df.shape[1] + == self.other_df.shape[1] + == feature_column_count + 2 + ) + + if self.outcome_var: + 
after_shape = self.other_df.shape + assert ( + before_shape[0] == after_shape[0] + ), "Feature merging went wrong!" + + _extracted_df_std = self.data.std_wide[feature].copy() + _extracted_other_array = np.array( + _extracted_df_std[_extracted_df_std["GeoFIPS"] != self.fips].iloc[ + :, 2: + ] + ) + _extracted_my_array = np.array( + _extracted_df_std[_extracted_df_std["GeoFIPS"] == self.fips].iloc[ + :, 2: + ] + ) + + if my_features_arrays.size == 0: + my_features_arrays = _extracted_my_array + else: + my_features_arrays = np.hstack( + (my_features_arrays, _extracted_my_array) + ) + + if others_features_arrays.size == 0: + others_features_arrays = _extracted_other_array + else: + others_features_arrays = np.hstack( + (others_features_arrays, _extracted_other_array) + ) + + if len(self.feature_groups) > 1 and self.outcome_var: + self.my_array = np.hstack((self.my_array, my_features_arrays)) + self.other_arrays = np.hstack((self.other_arrays, others_features_arrays)) + elif self.outcome_var is None: + self.my_array = my_features_arrays.copy() + self.other_arrays = others_features_arrays.copy() + + if self.outcome_var is None: + assert ( + feature_column_count + == self.my_array.shape[1] + == self.other_arrays.shape[1] + ) + assert my_features_arrays.shape == self.my_array.shape + assert others_features_arrays.shape == self.other_arrays.shape + + compute_weight_array(self, self.time_decay) + + diff = self.all_weights.shape[0] - self.other_arrays.shape[1] + self.all_weights = self.all_weights[diff:] + + # if self.outcome_var: + assert ( + self.other_arrays.shape[1] == self.all_weights.shape[0] + ), "Weights and arrays are misaligned" + + distances = [] + featurewise_contributions = [] + for vector in self.other_arrays: + _ge = generalized_euclidean_distance( + np.squeeze(self.my_array), vector, self.all_weights + ) + distances.append(_ge["distance"]) + featurewise_contributions.append(_ge["featurewise_contributions"]) + + # keep weighted distance contribution of each 
individual feature + featurewise_contributions_array = np.vstack(featurewise_contributions) + + assert featurewise_contributions_array.shape[1] == len(self.all_weights) + + # turn into df, add ID columns and sort by distance + featurewise_contributions_df = pd.DataFrame( + featurewise_contributions_array, columns=self.all_columns + ) + featurewise_contributions_df[f"distance to {self.fips}"] = distances + featurewise_contributions_df = pd.concat( + [self.other_df[["GeoFIPS", "GeoName"]], featurewise_contributions_df], + axis=1, + ) + featurewise_contributions_df.sort_values( + by=featurewise_contributions_df.columns[-1], inplace=True + ) + + # isolate ID columns with distance, tensed columns, atemporal columns + tensed_column_names = [ + col for col in featurewise_contributions_df.columns if col[:4].isdigit() + ] + atemporal_column_names = [ + col for col in featurewise_contributions_df.columns if not col[:4].isdigit() + ] + id_column_names = atemporal_column_names[0:2] + [atemporal_column_names[-1]] + atemporal_column_names = [ + col for col in atemporal_column_names if col not in id_column_names + ] + + id_df = featurewise_contributions_df[id_column_names] + tensed_featurewise_contributions_df = featurewise_contributions_df[ + tensed_column_names + ] + atemporal_featurewise_contributions_df = featurewise_contributions_df[ + atemporal_column_names + ] + + # aggregate tensed features (sum across years) + aggregated_tensed_featurewise_contributions_df = ( + tensed_featurewise_contributions_df.T.groupby( + tensed_featurewise_contributions_df.columns.str[5:] + ) + .sum() + .T + ) + + # aggregate atemporal features (sum across official feature list) + atemporal_aggregated_dict = {} + for feature in list(self.all_available_features): + _selected = [ + col + for col in atemporal_featurewise_contributions_df.columns + if col.endswith(feature) + ] + if _selected: + atemporal_aggregated_dict[feature] = ( + atemporal_featurewise_contributions_df[_selected].sum(axis=1) + ) + 
+ aggregated_atemporal_featurewise_contributions_df = pd.DataFrame( + atemporal_aggregated_dict + ) + + self.featurewise_contributions = featurewise_contributions_df + + # put together the aggregated featurewise contributions + # and normalize row-wise + # numbers now mean: "percentage of contribution to the distance" + self.aggregated_featurewise_contributions = pd.concat( + [ + id_df, + aggregated_tensed_featurewise_contributions_df, + aggregated_atemporal_featurewise_contributions_df, + ], + axis=1, + ) + columns_to_normalize = self.aggregated_featurewise_contributions.iloc[:, 3:] + self.aggregated_featurewise_contributions.iloc[:, 3:] = ( + columns_to_normalize.div(columns_to_normalize.sum(axis=1), axis=0) + ) + + # some sanity checks + count = sum([1 for distance in distances if distance == 0]) + + assert ( + len(distances) == self.other_arrays.shape[0] + ), "Distances and arrays are misaligned" + assert ( + len(distances) == self.other_df.shape[0] + ), "Distances and df are misaligned" + + # #self.other_df[f"distance to {self.fips}"] = distances #remove soon if no errors + self.other_df.loc[:, f"distance to {self.fips}"] = distances + + count_zeros = (self.other_df[f"distance to {self.fips}"] == 0).sum() + assert count_zeros == count, "f{count_zeros} zeros in alien distances!" 
+ + # sort and put together euclidean kins + self.other_df.sort_values(by=self.other_df.columns[-1], inplace=True) + + self.my_df[f"distance to {self.fips}"] = 0 + + self.euclidean_kins = pd.concat((self.my_df, self.other_df), axis=0) + + if self.outcome_var: + self.euclidean_kins = self.euclidean_kins.merge( + self.outcome_with_percentiles[["GeoFIPS", "percentile"]], + on="GeoFIPS", + how="left", + ) + + if self.outcome_var and self.outcome_percentile_range is not None: + myself = self.euclidean_kins.iloc[:1] + self.euclidean_kins = self.euclidean_kins[ + self.euclidean_kins["percentile"] >= self.outcome_percentile_range[0] + ] + self.euclidean_kins = self.euclidean_kins[ + self.euclidean_kins["percentile"] <= self.outcome_percentile_range[1] + ] + self.euclidean_kins = pd.concat([myself, self.euclidean_kins]) + + def plot_weights(self): + """ + This method calls the external function `plot_weights` to visualize the feature weights. + + """ + plot_weights(self) + + def plot_kins_other_var(self, var, fips_top_custom=None): + """ + For a specified variable plot the time series for the current location and its Euclidean kin locations. + + Parameters: + - var (str): The variable for which the time series will be plotted. + - fips_top_custom (list or None): Custom list of FIPS codes to use instead of the top Euclidean kin locations. + + Returns: + - fig: Plotly figure object. + + Note: + - The method requires running `find_euclidean_kins` first. 
+ """ + + # assert self.outcome_var, "Outcome comparison requires an outcome variable" + assert hasattr(self, "euclidean_kins"), "Run `find_euclidean_kins` first" + + self.data.get_features_long([var]) + plot_data = self.data.long[var] + my_plot_data = plot_data[plot_data["GeoFIPS"] == self.fips].copy() + + if fips_top_custom is None: + fips_top = self.euclidean_kins["GeoFIPS"].iloc[1 : (self.top + 1)].values + else: + fips_top = fips_top_custom + + others_plot_data = plot_data[plot_data["GeoFIPS"].isin(fips_top)] + + value_column_name = my_plot_data.columns[-1] + fig = go.Figure() + fig.add_trace( + go.Scatter( + x=my_plot_data["Year"], + y=my_plot_data[value_column_name], + mode="lines", + name=my_plot_data["GeoName"].iloc[0], + line=dict(color="darkred", width=3), + text=my_plot_data["GeoName"].iloc[0], + textposition="top right", + ) + ) + + pastel_colors = ["#FFC0CB", "#A9A9A9", "#87CEFA", "#FFD700", "#98FB98"][ + : self.top + ] + + for i, fips in enumerate(fips_top): + subset = others_plot_data[others_plot_data["GeoFIPS"] == fips] + line_color = pastel_colors[i % len(pastel_colors)] + fig.add_trace( + go.Scatter( + x=subset["Year"] + self.lag, + y=subset[value_column_name], + mode="lines", + name=subset["GeoName"].iloc[0], + line_color=line_color, + text=subset["GeoName"].iloc[0], + textposition="top right", + ) + ) + + if self.lag > 0: + fig.update_layout( + shapes=[ + dict( + type="line", + x0=2021, + x1=2021, + y0=0, + y1=1, + xref="x", + yref="paper", + line=dict(color="darkgray", width=2), + ) + ] + ) + + fig.add_annotation( + text=f"their year {2021 - self.lag}", + x=2021.0, + y=1.05, + xref="x", + yref="paper", + showarrow=False, + font=dict(color="darkgray"), + ) + + top = self.top + lag = self.lag + title_1 = title = f"Top {self.top} locations matching your search" + title_2 = ( + f"Top {self.top} locations matching your search (lag of {self.lag} years)" + ) + + if not self.feature_groups: + if self.lag == 0: + title = title_1 + else: + title = 
class MSAFipsQuery(FipsQuery):
    """FipsQuery variant that loads its data through ``MSADataGrabber``.

    The constructor validates the query settings, loads the wide and
    standardized-wide tables for the requested feature groups (always
    including ``"gdp_ma"``) and, when an outcome variable is given, attaches
    a percentile rank of the most recent outcome value to every location.

    NOTE(review): this constructor sets up all of its state itself and does
    not call ``FipsQuery.__init__``; presumably the parent would load a
    different data set -- confirm before adding a ``super().__init__`` call.
    """

    def __init__(
        self,
        fips,
        outcome_var=None,
        feature_groups_with_weights=None,
        lag=0,
        top=5,
        time_decay=1.08,
        outcome_comparison_period=None,
        outcome_percentile_range=None,
    ):
        """
        Validate the query parameters and load the data needed for the search.

        Parameters:
        - fips: GeoFIPS code of the location of interest; must be present in
          the standardized ``gdp_ma`` table.
        - outcome_var (str or None): outcome feature; must be a time series.
        - feature_groups_with_weights (dict or None): feature group -> integer
          weight in [-4, 4]. Defaults to ``{outcome_var: 4}`` when an outcome
          is given. The mapping passed in is copied, never modified.
        - lag (int): integer between 0 and 5; requires an outcome variable
          when positive.
        - top (int): positive integer smaller than 100.
        - time_decay (float): stored on the instance as given.
        - outcome_comparison_period: stored on the instance; only allowed
          with an outcome variable and ``lag == 0``.
        - outcome_percentile_range: stored on the instance; only allowed with
          an outcome variable.

        Raises:
        - AssertionError: if any of the checks above fails, including the
          case where no feature group is specified at all.
        - KeyError: if ``outcome_var`` is given together with a weights
          mapping that does not contain it.
        """
        # Work on a private copy so that the caller's mapping is not emptied
        # by the ``pop`` below. An absent mapping becomes an empty dict, so
        # that the "at least one feature group" assertion further down is the
        # error the user sees (instead of an AttributeError on ``None``).
        if feature_groups_with_weights is None:
            feature_groups_with_weights = {outcome_var: 4} if outcome_var else {}
        else:
            feature_groups_with_weights = dict(feature_groups_with_weights)

        if outcome_var:
            # Re-order so that the outcome variable is the first feature group.
            outcome_weight = feature_groups_with_weights.pop(outcome_var)
            feature_groups_with_weights = {
                outcome_var: outcome_weight,
                **feature_groups_with_weights,
            }

        assert not (
            lag > 0 and outcome_var is None
        ), "Lag will be idle with no outcome variable"

        assert not (
            lag > 0 and outcome_comparison_period is not None
        ), "outcome_comparison_period is only used when lag = 0"

        assert not (
            outcome_var is None and outcome_comparison_period is not None
        ), "outcome_comparison_period requires an outcome variable"

        assert not (
            outcome_var is None and outcome_percentile_range is not None
        ), "outcome_percentile_range requires an outcome variable"

        self.all_available_features = list_available_features("msa")

        feature_groups = list(feature_groups_with_weights.keys())

        assert feature_groups, "You need to specify at least one feature group"

        assert all(
            isinstance(value, int) and -4 <= value <= 4
            for value in feature_groups_with_weights.values()
        ), "Feature weights need to be integers between -4 and 4"

        self.feature_groups_with_weights = feature_groups_with_weights
        self.feature_groups = feature_groups
        self.data = MSADataGrabber()
        self.repo_root = self.data.repo_root
        self.fips = fips
        self.lag = lag
        self.top = top
        self.gdp_var = "gdp_ma"

        # it's fine if they're None (by default)
        self.outcome_var = outcome_var
        self.outcome_comparison_period = outcome_comparison_period
        # Always defined, so later code can test it without hasattr checks;
        # the assertion above guarantees it is None when there is no outcome.
        self.outcome_percentile_range = outcome_percentile_range

        self.time_decay = time_decay

        # The GDP table is always loaded: it is used below to validate the
        # FIPS code and to look up the location's name.
        if self.gdp_var not in self.feature_groups:
            self.all_features = [self.gdp_var] + feature_groups
        else:
            self.all_features = feature_groups

        self.data.get_features_std_wide(self.all_features)
        self.data.get_features_wide(self.all_features)

        gdp_std_wide = self.data.std_wide[self.gdp_var]

        assert (
            fips in gdp_std_wide["GeoFIPS"].values
        ), "FIPS not found in the data set."

        self.name = gdp_std_wide["GeoName"][
            gdp_std_wide["GeoFIPS"] == self.fips
        ].values[0]

        assert (
            self.lag >= 0 and self.lag < 6 and isinstance(self.lag, int)
        ), "lag must be an iteger between 0 and 5"
        assert (
            self.top > 0
            and isinstance(self.top, int)
            and self.top
            < 100  # TODO Make sure the number makes sense once we add all datasets we need
        ), "top must be a positive integer smaller than the number of locations in the dataset"

        if outcome_var:
            assert check_if_tensed(
                self.data.std_wide[self.outcome_var]
            ), "Outcome needs to be a time series."

            self.outcome_with_percentiles = self.data.std_wide[self.outcome_var].copy()
            most_recent_outcome = self.data.wide[self.outcome_var].iloc[:, -1].values
            # For every location: share of locations whose most recent outcome
            # value is strictly smaller, expressed in percent (2 decimals).
            self.outcome_with_percentiles["percentile"] = (
                most_recent_outcome < most_recent_outcome[:, np.newaxis]
            ).sum(axis=1) / most_recent_outcome.shape[0]
            self.outcome_with_percentiles["percentile"] = round(
                self.outcome_with_percentiles["percentile"] * 100, 2
            )
gdp["GeoFIPS"] = gdp["GeoFIPS"].str.strip(' "').astype(int) + + # remove large regions + gdp = gdp[gdp["GeoFIPS"] % 1000 != 0] + + # focus on chain-type GDP + mask = gdp["Description"].str.startswith("Chain") + gdp = gdp[mask] + + # drop Region number, Tablename, LineCode, IndustryClassification columns (the last one is empty anyway) + gdp = gdp.drop(gdp.columns[2:8], axis=1) + + # 2012 makes no sense, it's 100 throughout + gdp = gdp.drop("2012", axis=1) + + gdp.replace("(NA)", np.nan, inplace=True) + gdp.replace("(NM)", np.nan, inplace=True) + + # nan_rows = gdp[gdp.isna().any(axis=1)] # if inspection is needed + + gdp.dropna(axis=0, inplace=True) + + for column in gdp.columns[2:]: + gdp[column] = gdp[column].astype(float) + + assert gdp["GeoName"].is_unique + + # subsetting GeoFIPS to values in exclusions.csv + + exclusions_df = pd.read_csv(f"{root}/data/raw/exclusions.csv") + gdp = gdp[~gdp["GeoFIPS"].isin(exclusions_df["exclusions"])] + + assert len(gdp) == len(gdp["GeoFIPS"].unique()) + assert len(gdp) > 2800, "The number of records is lower than 2800" + + patState = r", [A-Z]{2}(\*{1,2})?$" + GeoNameError = "Wrong Geoname value!" 
class VariableCleaner:
    """Turn a raw per-location CSV into the four processed CSV files.

    The pipeline (``clean_variable``) loads the raw file, drops rows with
    missing values, aligns the locations with the GDP data set (extending
    ``data/raw/exclusions.csv`` and re-cleaning GDP when GDP contains
    locations the variable lacks) and writes wide, long, standardized wide
    and standardized long versions under ``data/<folder>/``.
    """

    def __init__(
        self,
        variable_name: str,
        path_to_raw_csv: str,
        year_or_category: str = "Year",  # Year or Category
    ):
        """
        Parameters:
        - variable_name (str): used as the file-name prefix of the outputs and
          as the ``dataset`` label of newly added exclusions.
        - path_to_raw_csv (str): path of the raw CSV; it must contain a
          ``GeoFIPS`` column convertible to int.
        - year_or_category (str): name given to the melted column in the long
          outputs ("Year" or "Category").
        """
        self.variable_name = variable_name
        self.path_to_raw_csv = path_to_raw_csv
        self.year_or_category = year_or_category
        self.root = find_repo_root()
        self.data_grabber = DataGrabber()
        self.folder = "processed"
        self.gdp = None
        self.variable_df = None

    def clean_variable(self):
        """Run the whole cleaning pipeline and write the output files."""
        self.load_raw_csv()
        self.drop_nans()
        self.load_gdp_data()
        self.check_exclusions()
        self.restrict_common_fips()
        self.save_csv_files(self.folder)

    def load_raw_csv(self):
        """Read the raw CSV into ``self.variable_df`` with integer GeoFIPS."""
        self.variable_df = pd.read_csv(self.path_to_raw_csv)
        self.variable_df["GeoFIPS"] = self.variable_df["GeoFIPS"].astype(int)

    def drop_nans(self):
        """Drop every row of ``self.variable_df`` that has a missing value."""
        self.variable_df = self.variable_df.dropna()

    def load_gdp_data(self):
        """Load the wide GDP table into ``self.gdp`` via the data grabber."""
        self.data_grabber.get_features_wide(["gdp"])
        self.gdp = self.data_grabber.wide["gdp"]

    def _fips_missing_from_variable(self):
        """Return the GeoFIPS codes present in GDP but absent from the variable."""
        return np.setdiff1d(
            self.gdp["GeoFIPS"].unique(), self.variable_df["GeoFIPS"].unique()
        )

    def add_new_exclusions(self, common_fips):
        """Append the locations missing from the variable to exclusions.csv.

        Parameters:
        - common_fips: unused; kept so existing callers keep working.
        """
        new_exclusions = self._fips_missing_from_variable()
        print("Adding new exclusions to exclusions.csv: " + str(new_exclusions))

        exclusions_path = f"{self.root}/data/raw/exclusions.csv"
        exclusions = pd.read_csv(exclusions_path)
        new_rows = pd.DataFrame(
            {
                "dataset": [self.variable_name] * len(new_exclusions),
                "exclusions": new_exclusions,
            }
        )
        exclusions = pd.concat([exclusions, new_rows], ignore_index=True)
        exclusions = exclusions.drop_duplicates()
        exclusions = exclusions.sort_values(by=["dataset", "exclusions"]).reset_index(
            drop=True
        )
        exclusions.to_csv(exclusions_path, index=False)
        print("Rerunning gdp cleaning with new exclusions")

    def check_exclusions(self):
        """Exclude from GDP the locations for which the variable has no data.

        When GDP contains locations the variable lacks, they are recorded in
        exclusions.csv, GDP is cleaned again and then reloaded, so that the
        remaining pipeline steps see the reduced GDP table.
        """
        if len(self._fips_missing_from_variable()) == 0:
            return

        common_fips = np.intersect1d(
            self.gdp["GeoFIPS"].unique(), self.variable_df["GeoFIPS"].unique()
        )
        self.add_new_exclusions(common_fips)
        clean_gdp()
        # Only the GDP table changed, so reloading it is enough. Re-entering
        # clean_variable() here would run (and write) the whole pipeline twice
        # and could recurse without bound if the exclusions did not take.
        self.load_gdp_data()

    def restrict_common_fips(self):
        """Keep only locations shared with GDP, sort, and cast values to float."""
        common_fips = np.intersect1d(
            self.gdp["GeoFIPS"].unique(), self.variable_df["GeoFIPS"].unique()
        )
        self.variable_df = self.variable_df[
            self.variable_df["GeoFIPS"].isin(common_fips)
        ]
        self.variable_df = self.variable_df.merge(
            self.gdp[["GeoFIPS", "GeoName"]], on=["GeoFIPS", "GeoName"], how="left"
        )
        self.variable_df = self.variable_df.sort_values(by=["GeoFIPS", "GeoName"])
        for column in self.variable_df.columns:
            if column not in ["GeoFIPS", "GeoName"]:
                self.variable_df[column] = self.variable_df[column].astype(float)

    def save_csv_files(self, folder):
        """Write the wide/long/std_wide/std_long CSVs to ``data/<folder>/``."""
        # it would be great to make sure that a db is wide, if not make it wide
        id_columns = ["GeoFIPS", "GeoName"]

        wide = self.variable_df.copy()
        long = pd.melt(
            self.variable_df,
            id_vars=id_columns,
            var_name=self.year_or_category,
            value_name="Value",
        )
        std_wide = standardize_and_scale(self.variable_df)
        std_long = pd.melt(
            std_wide.copy(),
            id_vars=id_columns,
            var_name=self.year_or_category,
            value_name="Value",
        )

        outputs = {
            "wide": wide,
            "long": long,
            "std_wide": std_wide,
            "std_long": std_long,
        }
        for suffix, frame in outputs.items():
            frame.to_csv(
                f"{self.root}/data/{folder}/{self.variable_name}_{suffix}.csv",
                index=False,
            )
def weighted_mean(group, column):
    """Return the population-weighted mean of ``column`` within ``group``.

    Rows whose value in ``column`` is NaN are ignored. The weights are taken
    from the ``"Total population"`` column.

    Parameters:
    - group: DataFrame holding ``column`` and ``"Total population"``.
    - column (str): name of the numeric column to average.

    Returns:
    - The weighted mean, or NaN when every value is NaN or when the weights
      of the non-NaN rows sum to zero.
    """
    values = group[column]
    weights = group["Total population"]

    valid = ~np.isnan(values)
    if not np.any(valid):
        return np.nan

    total_weight = np.sum(weights[valid])
    if total_weight == 0:
        return np.nan

    return np.sum(values[valid] * weights[valid]) / total_weight


def communities_tracts_to_counties(
    data, list_variables
) -> pd.DataFrame:  # using the weighted mean function for total population
    """Aggregate rows to one row per ``GeoFIPS`` using population weights.

    Parameters:
    - data: DataFrame with ``"GeoFIPS"``, ``"Total population"`` and every
      column named in ``list_variables``.
    - list_variables: names of the columns to aggregate.

    Returns:
    - DataFrame with a ``GeoFIPS`` column followed by one column per variable
      holding ``weighted_mean`` of that variable within the GeoFIPS group
      (NaN where the group has no usable value). An empty DataFrame is
      returned when ``list_variables`` is empty.
    """
    all_results = pd.DataFrame()
    grouped = data.groupby("GeoFIPS")

    for variable in list_variables:
        # Hand only the columns weighted_mean reads to apply(): this keeps the
        # grouping column out of the applied frames. dict.fromkeys drops the
        # duplicate when the variable is "Total population" itself.
        needed = list(dict.fromkeys([variable, "Total population"]))
        result_df = grouped[needed].apply(weighted_mean, column=variable).reset_index()
        result_df.columns = ["GeoFIPS", variable]
        # weighted_mean already yields NaN for groups without any non-NaN
        # value, so no separate all-NaN mask is needed.

        if "GeoFIPS" not in all_results.columns:
            all_results = result_df
        else:
            all_results = pd.merge(all_results, result_df, on="GeoFIPS", how="left")

    return all_results
def clean_age_first():
    """Convert the raw age table into percentages of the total population.

    Reads ``data/raw/age.csv``, divides every column from the third one
    onwards by ``total_pop`` and multiplies by 100, drops ``total_pop`` and
    writes the result to ``data/raw/age_percentages.csv``.
    """
    age = pd.read_csv(f"{root}/data/raw/age.csv")

    # Replace the columns by label instead of writing through ``iloc``: the
    # percentages are floats, and writing floats into existing columns via an
    # ``iloc`` slice relies on silent upcasting when those columns were read
    # as integers.
    value_columns = age.columns[2:]
    age[value_columns] = age[value_columns].div(age["total_pop"], axis=0) * 100
    age = age.drop(columns="total_pop")

    age.to_csv(f"{root}/data/raw/age_percentages.csv", index=False)
def clean_ethnic_composition():
    """Build the processed ethnic-composition tables from the raw ACS file.

    Reads ``data/raw/ACSDP5Y2021_DP05_Race.csv``, keeps the locations that
    also appear in the wide GDP table, renames the selected ``DP05_*E``
    columns, merges "other race" with "two or more", checks that the
    categories add up to the total population, writes a nominal copy to
    ``data/raw/ethnic_composition_nominal.csv`` and finally writes the wide,
    long, standardized wide and standardized long tables to
    ``data/processed/``.

    Raises:
    - AssertionError: if the categories do not sum to ``total_pop`` for every
      location.
    """
    data = DataGrabber()
    data.get_features_wide(["gdp"])
    gdp = data.wide["gdp"]

    ethnic_composition = pd.read_csv(f"{root}/data/raw/ACSDP5Y2021_DP05_Race.csv")

    # The first data row is skipped (presumably a descriptive header row of
    # the ACS export -- confirm against the raw file).
    ethnic_composition = ethnic_composition.iloc[1:]

    # GEO_ID looks like "<prefix>US<fips>"; keep the numeric part after "US".
    ethnic_composition["GEO_ID"] = ethnic_composition["GEO_ID"].str.split("US").str[1]
    ethnic_composition["GEO_ID"] = ethnic_composition["GEO_ID"].astype("int64")
    ethnic_composition = ethnic_composition.rename(columns={"GEO_ID": "GeoFIPS"})

    # Keep the columns ending in "E"; "NAME" also ends in "E", so drop it.
    ethnic_composition = ethnic_composition[
        ["GeoFIPS"] + [col for col in ethnic_composition.columns if col.endswith("E")]
    ]
    ethnic_composition = ethnic_composition.drop(columns=["NAME"])

    common_fips = np.intersect1d(
        gdp["GeoFIPS"].unique(), ethnic_composition["GeoFIPS"].unique()
    )

    ethnic_composition = ethnic_composition[
        ethnic_composition["GeoFIPS"].isin(common_fips)
    ]

    ethnic_composition = ethnic_composition.merge(
        gdp[["GeoFIPS", "GeoName"]], on="GeoFIPS", how="left"
    )

    ethnic_composition = ethnic_composition[
        [
            "GeoFIPS",
            "GeoName",
            "DP05_0070E",
            "DP05_0072E",
            "DP05_0073E",
            "DP05_0074E",
            "DP05_0075E",
            "DP05_0077E",
            "DP05_0078E",
            "DP05_0079E",
            "DP05_0080E",
            "DP05_0081E",
            "DP05_0082E",
            "DP05_0083E",
        ]
    ]

    ethnic_composition.columns = [
        "GeoFIPS",
        "GeoName",
        "total_pop",
        "mexican",
        "puerto_rican",
        "cuban",
        "other_hispanic_latino",
        "white",
        "black_african_american",
        "american_indian_alaska_native",
        "asian",
        "native_hawaiian_other_pacific_islander",
        "other_race",
        "two_or_more_sum",
    ]
    ethnic_composition = ethnic_composition.sort_values(by=["GeoFIPS", "GeoName"])

    # Convert every value column to float in one label-based step; values
    # that cannot be parsed become NaN.
    value_columns = ethnic_composition.columns[2:]
    ethnic_composition[value_columns] = (
        ethnic_composition[value_columns]
        .apply(pd.to_numeric, errors="coerce")
        .astype(float)
    )

    ethnic_composition["other_race_races"] = (
        ethnic_composition["other_race"] + ethnic_composition["two_or_more_sum"]
    )
    ethnic_composition = ethnic_composition.drop(
        ["other_race", "two_or_more_sum"], axis=1
    )

    # Sanity check: the categories (everything after total_pop) add up to the
    # reported total population.
    ethnic_composition["totalALT"] = ethnic_composition.iloc[:, 3:].sum(axis=1)
    assert (ethnic_composition["totalALT"] == ethnic_composition["total_pop"]).all()
    ethnic_composition = ethnic_composition.drop("totalALT", axis=1)

    # copy with nominal values
    ethnic_composition.to_csv(
        f"{root}/data/raw/ethnic_composition_nominal.csv", index=False
    )

    # NOTE(review): row_sums is taken from column 2 onwards and therefore
    # includes total_pop, which (per the assertion above) equals the sum of
    # the categories -- so the resulting shares add up to 0.5 per row, not 1.
    # Kept unchanged because existing processed data was produced this way;
    # confirm whether ``iloc[:, 3:]`` was intended.
    row_sums = ethnic_composition.iloc[:, 2:].sum(axis=1)
    category_columns = ethnic_composition.columns[3:]
    ethnic_composition[category_columns] = ethnic_composition[category_columns].div(
        row_sums, axis=0
    )

    ethnic_composition = ethnic_composition.drop(["total_pop"], axis=1)

    ethnic_composition_wide = ethnic_composition.copy()

    ethnic_composition_long = pd.melt(
        ethnic_composition,
        id_vars=["GeoFIPS", "GeoName"],
        var_name="Category",
        value_name="Value",
    )

    ethnic_composition_std_wide = standardize_and_scale(ethnic_composition)

    ethnic_composition_std_long = pd.melt(
        ethnic_composition_std_wide.copy(),
        id_vars=["GeoFIPS", "GeoName"],
        var_name="Category",
        value_name="Value",
    )

    ethnic_composition_wide.to_csv(
        f"{root}/data/processed/ethnic_composition_wide.csv", index=False
    )
    ethnic_composition_long.to_csv(
        f"{root}/data/processed/ethnic_composition_long.csv", index=False
    )
    ethnic_composition_std_wide.to_csv(
        f"{root}/data/processed/ethnic_composition_std_wide.csv", index=False
    )
    ethnic_composition_std_long.to_csv(
        f"{root}/data/processed/ethnic_composition_std_long.csv", index=False
    )
def clean_ethnic_composition_ma():
    """Produce the MSA-level ethnic-composition files.

    First regenerates ``data/raw/ethnic_composition_ma.csv`` through
    ``clean_ethnic_initially``, then runs ``VariableCleanerMSA`` on that file
    with "Category" as the name of the melted column.
    """
    clean_ethnic_initially()

    raw_csv = f"{root}/data/raw/ethnic_composition_ma.csv"
    VariableCleanerMSA(
        variable_name="ethnic_composition_ma",
        path_to_raw_csv=raw_csv,
        year_or_category="Category",
    ).clean_variable()
def clean_hazard_first():
    """Write one raw CSV per hazard variable, aggregated per GeoFIPS.

    Reads ``data/raw/communities_raw.csv``, aggregates the six hazard columns
    with ``communities_tracts_to_counties``, drops rows with missing values,
    keeps the locations that also appear in the module-level ``gdp`` table,
    attaches ``GeoName``, renames the columns to their short names and writes
    ``data/raw/<variable>.csv`` for every entry of ``variables_hazard``.
    """
    # raw column name -> short column name, in output order
    short_names = {
        "Expected agricultural loss rate (Natural Hazards Risk Index)": "expected_agricultural_loss_rate",
        "Expected building loss rate (Natural Hazards Risk Index)": "expected_building_loss_rate",
        "Expected population loss rate (Natural Hazards Risk Index)": "expected_population_loss_rate",
        "Diesel particulate matter exposure": "diesel_matter_exposure",
        "Proximity to hazardous waste sites": "proximity_to_hazardous_waste_sites",
        "Proximity to Risk Management Plan (RMP) facilities": "proximity_to_risk_management_plan_facilities",
    }
    raw_columns = list(short_names)
    value_columns = list(short_names.values())
    id_columns = ["GeoFIPS", "GeoName"]

    raw = pd.read_csv(f"{root}/data/raw/communities_raw.csv")
    hazard = communities_tracts_to_counties(raw, raw_columns)
    hazard = hazard.dropna()

    hazard["GeoFIPS"] = hazard["GeoFIPS"].astype(np.int64)

    # keep only locations that the GDP table knows, and borrow their names
    shared_fips = np.intersect1d(hazard["GeoFIPS"].unique(), gdp["GeoFIPS"].unique())
    hazard = hazard[hazard["GeoFIPS"].isin(shared_fips)]
    hazard = hazard.merge(gdp[id_columns], on="GeoFIPS", how="left")

    hazard = hazard[id_columns + raw_columns]
    hazard.columns = id_columns + value_columns

    hazard[value_columns] = hazard[value_columns].astype("float64")

    for variable in variables_hazard:
        hazard[["GeoFIPS", "GeoName", variable]].to_csv(
            f"{root}/data/raw/{variable}.csv", index=False
        )
or equal to 18 years", + "Coronary heart disease among adults aged greater than or equal to 18 years", + ] + + health = communities_tracts_to_counties(health, list_variables) + + health.dropna(inplace=True) + + health["GeoFIPS"] = health["GeoFIPS"].astype(np.int64) + + common_fips = np.intersect1d(health["GeoFIPS"].unique(), gdp["GeoFIPS"].unique()) + health = health[health["GeoFIPS"].isin(common_fips)] + health = health.merge(gdp[["GeoFIPS", "GeoName"]], on="GeoFIPS", how="left") + + health = health[ + [ + "GeoFIPS", + "GeoName", + "Life expectancy (years)", + "Current asthma among adults aged greater than or equal to 18 years", + "Diagnosed diabetes among adults aged greater than or equal to 18 years", + "Coronary heart disease among adults aged greater than or equal to 18 years", + ] + ] + + health.columns = [ + "GeoFIPS", + "GeoName", + "LifeExpectancy", + "Asthma", + "Diabetes", + "HeartDisease", + ] + + columns_to_round = health.columns[-3:] + health[columns_to_round] = health[columns_to_round].round(0).astype("float64") + health["LifeExpectancy"] = health["LifeExpectancy"].round(2).astype("float64") + + val_list = ["Asthma", "Diabetes", "HeartDisease"] + + for val in val_list: # dealing with weird format of percentages + health[val] = health[val] / 100 + + health.to_csv(f"{root}/data/raw/health_raw.csv", index=False) + + +def clean_health(): + clean_health_first() + + cleaner = VariableCleaner( + variable_name="health", + path_to_raw_csv=f"{root}/data/raw/health_raw.csv", + year_or_category="Category", + ) + cleaner.clean_variable() diff --git a/build/cities/utils/cleaning_scripts/clean_homeownership.py b/build/cities/utils/cleaning_scripts/clean_homeownership.py new file mode 100644 index 00000000..832836db --- /dev/null +++ b/build/cities/utils/cleaning_scripts/clean_homeownership.py @@ -0,0 +1,20 @@ +from cities.utils.clean_variable import VariableCleaner +from cities.utils.data_grabber import find_repo_root + +root = find_repo_root() + + +def 
clean_homeownership(): + variables = [ + "median_owner_occupied_home_value", + "median_rent", + "homeownership_rate", + ] + + for variable in variables: + cleaner = VariableCleaner( + variable_name=variable, + path_to_raw_csv=f"{root}/data/raw/{variable}.csv", + year_or_category="Category", + ) + cleaner.clean_variable() diff --git a/build/cities/utils/cleaning_scripts/clean_income_distribution.py b/build/cities/utils/cleaning_scripts/clean_income_distribution.py new file mode 100644 index 00000000..6525078a --- /dev/null +++ b/build/cities/utils/cleaning_scripts/clean_income_distribution.py @@ -0,0 +1,13 @@ +from cities.utils.clean_variable import VariableCleaner +from cities.utils.data_grabber import find_repo_root + +root = find_repo_root() + + +def clean_income_distribution(): + cleaner = VariableCleaner( + variable_name="income_distribution", + path_to_raw_csv=f"{root}/data/raw/income_distribution.csv", + year_or_category="Category", + ) + cleaner.clean_variable() diff --git a/build/cities/utils/cleaning_scripts/clean_industry.py b/build/cities/utils/cleaning_scripts/clean_industry.py new file mode 100644 index 00000000..41571fb2 --- /dev/null +++ b/build/cities/utils/cleaning_scripts/clean_industry.py @@ -0,0 +1,118 @@ +from pathlib import Path + +import numpy as np +import pandas as pd + +from cities.utils.clean_variable import VariableCleaner +from cities.utils.cleaning_utils import standardize_and_scale +from cities.utils.data_grabber import DataGrabber, find_repo_root + +root = find_repo_root() + +path = Path(__file__).parent.absolute() + + +def clean_industry_step_one(): + data = DataGrabber() + data.get_features_wide(["gdp"]) + gdp = data.wide["gdp"] + + industry = pd.read_csv(f"{root}/data/raw/ACSDP5Y2021_DP03_industry.csv") + + industry["GEO_ID"] = industry["GEO_ID"].str.split("US").str[1] + industry["GEO_ID"] = industry["GEO_ID"].astype("int64") + industry = industry.rename(columns={"GEO_ID": "GeoFIPS"}) + + common_fips = 
np.intersect1d(gdp["GeoFIPS"].unique(), industry["GeoFIPS"].unique()) + + industry = industry[industry["GeoFIPS"].isin(common_fips)] + + industry = industry.merge(gdp[["GeoFIPS", "GeoName"]], on="GeoFIPS", how="left") + + industry = industry[ + [ + "GeoFIPS", + "GeoName", + "DP03_0004E", + "DP03_0033E", + "DP03_0034E", + "DP03_0035E", + "DP03_0036E", + "DP03_0037E", + "DP03_0038E", + "DP03_0039E", + "DP03_0040E", + "DP03_0041E", + "DP03_0042E", + "DP03_0043E", + "DP03_0044E", + "DP03_0045E", + ] + ] + + column_name_mapping = { + "DP03_0004E": "employed_sum", + "DP03_0033E": "agri_forestry_mining", + "DP03_0034E": "construction", + "DP03_0035E": "manufacturing", + "DP03_0036E": "wholesale_trade", + "DP03_0037E": "retail_trade", + "DP03_0038E": "transport_utilities", + "DP03_0039E": "information", + "DP03_0040E": "finance_real_estate", + "DP03_0041E": "prof_sci_mgmt_admin", + "DP03_0042E": "education_health", + "DP03_0043E": "arts_entertainment", + "DP03_0044E": "other_services", + "DP03_0045E": "public_admin", + } + + industry.rename(columns=column_name_mapping, inplace=True) + + industry = industry.sort_values(by=["GeoFIPS", "GeoName"]) + + industry.to_csv(f"{root}/data/raw/industry_absolute.csv", index=False) + + row_sums = industry.iloc[:, 3:].sum(axis=1) + + industry.iloc[:, 3:] = industry.iloc[:, 3:].div(row_sums, axis=0) + industry = industry.drop(["employed_sum"], axis=1) + + industry.to_csv(f"{root}/data/raw/industry_percent.csv", index=False) + + industry_wide = industry.copy() + + industry_long = pd.melt( + industry, + id_vars=["GeoFIPS", "GeoName"], + var_name="Category", + value_name="Value", + ) + + industry_std_wide = standardize_and_scale(industry) + + industry_std_long = pd.melt( + industry_std_wide.copy(), + id_vars=["GeoFIPS", "GeoName"], + var_name="Category", + value_name="Value", + ) + + industry_wide.to_csv(f"{root}/data/processed/industry_wide.csv", index=False) + industry_long.to_csv(f"{root}/data/processed/industry_long.csv", index=False) + 
industry_std_wide.to_csv( + f"{root}/data/processed/industry_std_wide.csv", index=False + ) + industry_std_long.to_csv( + f"{root}/data/processed/industry_std_long.csv", index=False + ) + + +def clean_industry(): + clean_industry_step_one() + + cleaner = VariableCleaner( + variable_name="industry", + path_to_raw_csv=f"{root}/data/raw/industry_percent.csv", + ) + cleaner.clean_variable() diff --git a/build/cities/utils/cleaning_scripts/clean_industry_ma.py b/build/cities/utils/cleaning_scripts/clean_industry_ma.py new file mode 100644 index 00000000..f95a4c92 --- /dev/null +++ b/build/cities/utils/cleaning_scripts/clean_industry_ma.py @@ -0,0 +1,13 @@ +from cities.utils.clean_variable import VariableCleanerMSA +from cities.utils.data_grabber import find_repo_root + +root = find_repo_root() + + +def clean_industry_ma(): + cleaner = VariableCleanerMSA( + variable_name="industry_ma", + path_to_raw_csv=f"{root}/data/raw/industry_ma.csv", + year_or_category="Category", + ) + cleaner.clean_variable() diff --git a/build/cities/utils/cleaning_scripts/clean_industry_ts.py b/build/cities/utils/cleaning_scripts/clean_industry_ts.py new file mode 100644 index 00000000..b16daee7 --- /dev/null +++ b/build/cities/utils/cleaning_scripts/clean_industry_ts.py @@ -0,0 +1,124 @@ +import numpy as np +import pandas as pd + +from cities.utils.cleaning_utils import standardize_and_scale +from cities.utils.data_grabber import DataGrabber, find_repo_root + +root = find_repo_root() + + +def clean_industry_ts(): + data = DataGrabber() + data.get_features_wide(["gdp"]) + gdp = data.wide["gdp"] + + industry_ts = pd.read_csv(f"{root}/data/raw/industry_time_series_people.csv") + + industry_ts["GEO_ID"] = industry_ts["GEO_ID"].str.split("US").str[1] + industry_ts["GEO_ID"] = industry_ts["GEO_ID"].astype("int64") + industry_ts = industry_ts.rename(columns={"GEO_ID": "GeoFIPS"}) + + common_fips = np.intersect1d( + gdp["GeoFIPS"].unique(), industry_ts["GeoFIPS"].unique() + ) + + industry_ts = 
industry_ts[industry_ts["GeoFIPS"].isin(common_fips)] + + years = industry_ts["Year"].unique() + + for year in years: + year_df = industry_ts[industry_ts["Year"] == year] + missing_fips = set(common_fips) - set(year_df["GeoFIPS"]) + + if missing_fips: + missing_data = { + "Year": [year] * len(missing_fips), + "GeoFIPS": list(missing_fips), + } + + # Fill all columns from the fourth column (index 3) onward with 0 + for col in industry_ts.columns[2:]: + missing_data[col] = 0 + + missing_df = pd.DataFrame(missing_data) + industry_ts = pd.concat([industry_ts, missing_df], ignore_index=True) + + industry_ts = industry_ts.merge( + gdp[["GeoFIPS", "GeoName"]], on="GeoFIPS", how="left" + ) + + industry_ts = industry_ts[ + [ + "GeoFIPS", + "GeoName", + "Year", + "agriculture_total", + "mining_total", + "construction_total", + "manufacturing_total", + "wholesale_trade_total", + "retail_trade_total", + "transportation_warehousing_total", + "utilities_total", + "information_total", + "finance_insurance_total", + "real_estate_total", + "professional_services_total", + "management_enterprises_total", + "admin_support_services_total", + "educational_services_total", + "healthcare_social_services_total", + "arts_recreation_total", + "accommodation_food_services_total", + "other_services_total", + "public_administration_total", + ] + ] + + industry_ts = industry_ts.sort_values(by=["GeoFIPS", "GeoName", "Year"]) + + industry_ts.fillna(0, inplace=True) + + columns_to_save = industry_ts.columns[industry_ts.columns.get_loc("Year") + 1 :] + + for column in columns_to_save: + selected_columns = ["GeoFIPS", "GeoName", "Year", column] + subsetindustry_ts = industry_ts[selected_columns] + + subsetindustry_ts.rename(columns={column: "Value"}, inplace=True) + + subsetindustry_ts_long = subsetindustry_ts.copy() + + file_name_long = f"industry_{column}_long.csv" + subsetindustry_ts_long.to_csv( + f"{root}/data/processed/{file_name_long}", index=False + ) + + subsetindustry_ts_std_long = 
standardize_and_scale(subsetindustry_ts) + + file_name_std = f"industry_{column}_std_long.csv" + subsetindustry_ts_std_long.to_csv( + f"{root}/data/processed/{file_name_std}", index=False + ) + + subsetindustry_ts_wide = subsetindustry_ts.pivot_table( + index=["GeoFIPS", "GeoName"], columns="Year", values="Value" + ) + subsetindustry_ts_wide.reset_index(inplace=True) + subsetindustry_ts_wide.columns.name = None + + file_name_wide = f"industry_{column}_wide.csv" + subsetindustry_ts_wide.to_csv( + f"{root}/data/processed/{file_name_wide}", index=False + ) + + subsetindustry_ts_std_wide = subsetindustry_ts_std_long.pivot_table( + index=["GeoFIPS", "GeoName"], columns="Year", values="Value" + ) + subsetindustry_ts_std_wide.reset_index(inplace=True) + subsetindustry_ts_std_wide.columns.name = None + + file_name_std_wide = f"industry_{column}_std_wide.csv" + subsetindustry_ts_std_wide.to_csv( + f"{root}/data/processed/{file_name_std_wide}", index=False + ) diff --git a/build/cities/utils/cleaning_scripts/clean_population.py b/build/cities/utils/cleaning_scripts/clean_population.py new file mode 100644 index 00000000..3c4d0ead --- /dev/null +++ b/build/cities/utils/cleaning_scripts/clean_population.py @@ -0,0 +1,84 @@ +import numpy as np +import pandas as pd + +from cities.utils.cleaning_utils import standardize_and_scale +from cities.utils.data_grabber import DataGrabber, find_repo_root + +root = find_repo_root() + + +def clean_population(): + data = DataGrabber() + data.get_features_wide(["gdp"]) + gdp = data.wide["gdp"] + + cainc30 = pd.read_csv( + f"{root}/data/raw/CAINC30_1969_2021.csv", encoding="ISO-8859-1" + ) + + population = cainc30[cainc30["Description"] == " Population (persons) 3/"].copy() + + population["GeoFIPS"] = population["GeoFIPS"].fillna("").astype(str) + population["GeoFIPS"] = population["GeoFIPS"].str.strip(' "').astype(int) + + population = population[population["GeoFIPS"] % 1000 != 0] + + common_fips = np.intersect1d( + 
population["GeoFIPS"].unique(), gdp["GeoFIPS"].unique() + ) + assert len(common_fips) == len(gdp["GeoFIPS"].unique()) + + population = population[population["GeoFIPS"].isin(common_fips)] + assert population.shape[0] == gdp.shape[0] + + order = gdp["GeoFIPS"].tolist() + population = population.set_index("GeoFIPS").reindex(order).reset_index() + + # align with gdp + assert population["GeoFIPS"].tolist() == gdp["GeoFIPS"].tolist() + assert population["GeoName"].is_unique + + population = population.drop(population.columns[2:8], axis=1) + assert population.shape[0] == gdp.shape[0] + + # 243 NAs prior to 1993 + # na_counts = (population == '(NA)').sum().sum() + # print(na_counts) + + population.replace("(NA)", np.nan, inplace=True) + population.replace("(NM)", np.nan, inplace=True) + + # removed years prior to 1993, missigness, long time ago + population = population.drop(population.columns[2:26], axis=1) + + assert population.isna().sum().sum() == 0 + assert population.shape[0] == gdp.shape[0] + + for column in population.columns[2:]: + population[column] = population[column].astype(float) + + assert population.shape[0] == gdp.shape[0] + + population_long = pd.melt( + population.copy(), + id_vars=["GeoFIPS", "GeoName"], + var_name="Year", + value_name="Value", + ) + + population_std_wide = standardize_and_scale(population) + population_std_long = pd.melt( + population_std_wide.copy(), + id_vars=["GeoFIPS", "GeoName"], + var_name="Year", + value_name="Value", + ) + + population.to_csv(f"{root}/data/processed/population_wide.csv", index=False) + population_long.to_csv(f"{root}/data/processed/population_long.csv", index=False) + population_std_wide.to_csv( + f"{root}/data/processed/population_std_wide.csv", index=False + ) + population_std_long.to_csv( + f"{root}/data/processed/population_std_long.csv", index=False + ) diff --git a/build/cities/utils/cleaning_scripts/clean_population_density.py b/build/cities/utils/cleaning_scripts/clean_population_density.py new file 
mode 100644 index 00000000..ce429f8a --- /dev/null +++ b/build/cities/utils/cleaning_scripts/clean_population_density.py @@ -0,0 +1,12 @@ +from cities.utils.clean_variable import VariableCleaner +from cities.utils.data_grabber import find_repo_root + +root = find_repo_root() + + +def clean_population_density(): + cleaner = VariableCleaner( + variable_name="population_density", + path_to_raw_csv=f"{root}/data/raw/population_density.csv", + ) + cleaner.clean_variable() diff --git a/build/cities/utils/cleaning_scripts/clean_population_ma.py b/build/cities/utils/cleaning_scripts/clean_population_ma.py new file mode 100644 index 00000000..21d9ee3c --- /dev/null +++ b/build/cities/utils/cleaning_scripts/clean_population_ma.py @@ -0,0 +1,13 @@ +from cities.utils.clean_variable import VariableCleanerMSA +from cities.utils.data_grabber import find_repo_root + +root = find_repo_root() + + +def clean_population_ma(): + cleaner = VariableCleanerMSA( + variable_name="population_ma", + path_to_raw_csv=f"{root}/data/raw/population_ma.csv", + year_or_category="Year", + ) + cleaner.clean_variable() diff --git a/build/cities/utils/cleaning_scripts/clean_spending_HHS.py b/build/cities/utils/cleaning_scripts/clean_spending_HHS.py new file mode 100644 index 00000000..6db55e06 --- /dev/null +++ b/build/cities/utils/cleaning_scripts/clean_spending_HHS.py @@ -0,0 +1,142 @@ +import numpy as np +import pandas as pd + +from cities.utils.cleaning_utils import standardize_and_scale +from cities.utils.data_grabber import DataGrabber, find_repo_root + +root = find_repo_root() + + +def clean_spending_HHS(): + data = DataGrabber() + data.get_features_wide(["gdp"]) + gdp = data.wide + gdp = gdp.get("gdp") + + spending_HHS = pd.read_csv(f"{root}/data/raw/spending_HHS.csv") + + transportUnwanted = spending_HHS[ + ( + pd.isna(spending_HHS["total_obligated_amount"]) + | (spending_HHS["total_obligated_amount"] == 1) + | (spending_HHS["total_obligated_amount"] == 0) + ) + ] + + exclude_mask = 
spending_HHS["total_obligated_amount"].isin( + transportUnwanted["total_obligated_amount"] + ) + spending_HHS = spending_HHS[~exclude_mask] # 95 observations dleted + + assert spending_HHS.isna().sum().sum() == 0, "Na values detected" + + # loading names and repearing fips of value 3 and shorter + + names_HHS = pd.read_csv(f"{root}/data/raw/spending_HHS_names.csv") + + spending_only_fips = np.setdiff1d(spending_HHS["GeoFIPS"], gdp["GeoFIPS"]) + + fips4_to_repair = [fip for fip in spending_only_fips if (fip < 10000 and fip > 999)] + short4_fips = spending_HHS[spending_HHS["GeoFIPS"].isin(fips4_to_repair)] + + full_geofipsLIST = [fip for fip in spending_only_fips if fip > 9999] + full_geofips = spending_HHS[spending_HHS["GeoFIPS"].isin(full_geofipsLIST)] + + cleaningLIST = [full_geofips, short4_fips] # no 3digit FIPS + + # replacing damaged FIPS + + for badFIPS in cleaningLIST: + geofips_to_geonamealt = dict(zip(names_HHS["GeoFIPS"], names_HHS["GeoNameALT"])) + + badFIPS["GeoNameALT"] = badFIPS["GeoFIPS"].map(geofips_to_geonamealt) + badFIPS = badFIPS.rename(columns={"GeoFIPS": "damagedFIPS"}) + + badFIPSmapping_dict = dict(zip(gdp["GeoName"], gdp["GeoFIPS"])) + + badFIPS["repairedFIPS"] = badFIPS["GeoNameALT"].apply( + lambda x: badFIPSmapping_dict.get(x) + ) + repaired_geofips = badFIPS[badFIPS["repairedFIPS"].notna()] + + repair_ratio = repaired_geofips.shape[0] / badFIPS.shape[0] + print(f"Ratio of repaired FIPS: {round(repair_ratio, 2)}") + + # assert repair_ratio > 0.9, f'Less than 0.9 of FIPS were successfully repaired!' 
+ + spending_HHS["GeoFIPS"] = spending_HHS[ + "GeoFIPS" + ].replace( # no FIPS were repaired actually + dict(zip(repaired_geofips["damagedFIPS"], repaired_geofips["repairedFIPS"])) + ) + + common_fips = np.intersect1d( + gdp["GeoFIPS"].unique(), spending_HHS["GeoFIPS"].unique() + ) + + all_FIPS_spending_HHS = spending_HHS.copy() + + spending_HHS = spending_HHS[ + spending_HHS["GeoFIPS"].isin(common_fips) + ] # 99 FIPS deleted + assert ( + spending_HHS.shape[0] / all_FIPS_spending_HHS.shape[0] > 0.9 + ), "Less than 0.9 of FIPS are common!" + + # grouping duplicate fips for years + # (they appeared because we have repaired some of them and now they match with number that is already present) + + spending_HHS = ( + spending_HHS.groupby(["GeoFIPS", "year"])["total_obligated_amount"] + .sum() + .reset_index() + ) + spending_HHS.reset_index(drop=True, inplace=True) + + # adding GeoNames + spending_HHS = spending_HHS.merge( + gdp[["GeoFIPS", "GeoName"]], on="GeoFIPS", how="left" + )[["GeoFIPS", "GeoName", "year", "total_obligated_amount"]] + + unique_gdp = gdp[["GeoFIPS", "GeoName"]].drop_duplicates( + subset=["GeoFIPS", "GeoName"], keep="first" + ) + exclude_geofips = set(spending_HHS["GeoFIPS"]) + unique_gdp = unique_gdp[~unique_gdp["GeoFIPS"].isin(exclude_geofips)] + + unique_gdp["year"] = np.repeat(2018, unique_gdp.shape[0]) + unique_gdp["total_obligated_amount"] = np.repeat(0, unique_gdp.shape[0]) + spending_HHS = pd.concat([spending_HHS, unique_gdp], ignore_index=True) + spending_HHS = spending_HHS.sort_values(by=["GeoFIPS", "GeoName", "year"]) + + assert spending_HHS["GeoFIPS"].nunique() == spending_HHS["GeoName"].nunique() + assert spending_HHS["GeoFIPS"].nunique() == gdp["GeoFIPS"].nunique() + + # Assuming you have a DataFrame named 'your_dataframe' + spending_HHS = spending_HHS.rename(columns={"year": "Year"}) + + # standardizing and saving + spending_HHS_long = spending_HHS.copy() + + spending_HHS_wide = spending_HHS.pivot_table( + index=["GeoFIPS", "GeoName"], 
columns="Year", values="total_obligated_amount" + ) + spending_HHS_wide.reset_index(inplace=True) + spending_HHS_wide.columns.name = None + spending_HHS_wide = spending_HHS_wide.fillna(0) + + spending_HHS_std_long = standardize_and_scale(spending_HHS) + spending_HHS_std_wide = standardize_and_scale(spending_HHS_wide) + + spending_HHS_wide.to_csv( + f"{root}/data/processed/spending_HHS_wide.csv", index=False + ) + spending_HHS_long.to_csv( + f"{root}/data/processed/spending_HHS_long.csv", index=False + ) + spending_HHS_std_wide.to_csv( + f"{root}/data/processed/spending_HHS_std_wide.csv", index=False + ) + spending_HHS_std_long.to_csv( + f"{root}/data/processed/spending_HHS_std_long.csv", index=False + ) diff --git a/build/cities/utils/cleaning_scripts/clean_spending_commerce.py b/build/cities/utils/cleaning_scripts/clean_spending_commerce.py new file mode 100644 index 00000000..2463bffa --- /dev/null +++ b/build/cities/utils/cleaning_scripts/clean_spending_commerce.py @@ -0,0 +1,147 @@ +import numpy as np +import pandas as pd + +from cities.utils.cleaning_utils import standardize_and_scale +from cities.utils.data_grabber import DataGrabber, find_repo_root + +root = find_repo_root() + + +def clean_spending_commerce(): + data = DataGrabber() + data.get_features_wide(["gdp"]) + gdp = data.wide + gdp = gdp.get("gdp") + + spending_commerce = pd.read_csv(f"{root}/data/raw/spending_commerce.csv") + + transportUnwanted = spending_commerce[ + ( + pd.isna(spending_commerce["total_obligated_amount"]) + | (spending_commerce["total_obligated_amount"] == 1) + | (spending_commerce["total_obligated_amount"] == 0) + ) + ] + + exclude_mask = spending_commerce["total_obligated_amount"].isin( + transportUnwanted["total_obligated_amount"] + ) + spending_commerce = spending_commerce[~exclude_mask] # 24 values lost + + assert spending_commerce.isna().sum().sum() == 0, "Na values detected" + + # loading names and repearing fips of value 3 and shorter + + names_commerce = 
pd.read_csv(f"{root}/data/raw/spending_commerce_names.csv") + + spending_only_fips = np.setdiff1d(spending_commerce["GeoFIPS"], gdp["GeoFIPS"]) + + fips4_to_repair = [fip for fip in spending_only_fips if (fip < 10000 and fip > 999)] + short4_fips = spending_commerce[spending_commerce["GeoFIPS"].isin(fips4_to_repair)] + + full_geofipsLIST = [fip for fip in spending_only_fips if fip > 9999] + full_geofips = spending_commerce[ + spending_commerce["GeoFIPS"].isin(full_geofipsLIST) + ] + + cleaningLIST = [full_geofips, short4_fips] # no small fips + + # replacing damaged FIPS + + for badFIPS in cleaningLIST: + geofips_to_geonamealt = dict( + zip(names_commerce["GeoFIPS"], names_commerce["GeoNameALT"]) + ) + + badFIPS["GeoNameALT"] = badFIPS["GeoFIPS"].map(geofips_to_geonamealt) + badFIPS = badFIPS.rename(columns={"GeoFIPS": "damagedFIPS"}) + + badFIPSmapping_dict = dict(zip(gdp["GeoName"], gdp["GeoFIPS"])) + + badFIPS["repairedFIPS"] = badFIPS["GeoNameALT"].apply( + lambda x: badFIPSmapping_dict.get(x) + ) + repaired_geofips = badFIPS[badFIPS["repairedFIPS"].notna()] + + repair_ratio = repaired_geofips.shape[0] / badFIPS.shape[0] + print(f"Ratio of repaired FIPS: {round(repair_ratio, 2)}") + + # assert repair_ratio > 0.9, f'Less than 0.9 of FIPS were successfully repaired!' + + spending_commerce["GeoFIPS"] = spending_commerce["GeoFIPS"].replace( + dict(zip(repaired_geofips["damagedFIPS"], repaired_geofips["repairedFIPS"])) + ) + + # deleting short FIPS codes + + common_fips = np.intersect1d( + gdp["GeoFIPS"].unique(), spending_commerce["GeoFIPS"].unique() + ) + + all_FIPS_spending_commerce = spending_commerce.copy() + + spending_commerce = spending_commerce[ + spending_commerce["GeoFIPS"].isin(common_fips) + ] # 67 FIPS deleted + assert ( + spending_commerce.shape[0] / all_FIPS_spending_commerce.shape[0] > 0.9 + ), "Less than 0.9 of FIPS are common!" 
+ + # grouping duplicate fips for years + # (they appeared because we have repaired some of them and now they match with number that is already present) + + spending_commerce = ( + spending_commerce.groupby(["GeoFIPS", "year"])["total_obligated_amount"] + .sum() + .reset_index() + ) + spending_commerce.reset_index(drop=True, inplace=True) + + # adding GeoNames + spending_commerce = spending_commerce.merge( + gdp[["GeoFIPS", "GeoName"]], on="GeoFIPS", how="left" + )[["GeoFIPS", "GeoName", "year", "total_obligated_amount"]] + + unique_gdp = gdp[["GeoFIPS", "GeoName"]].drop_duplicates( + subset=["GeoFIPS", "GeoName"], keep="first" + ) + exclude_geofips = set(spending_commerce["GeoFIPS"]) + unique_gdp = unique_gdp[~unique_gdp["GeoFIPS"].isin(exclude_geofips)] + + unique_gdp["year"] = np.repeat(2018, unique_gdp.shape[0]) + unique_gdp["total_obligated_amount"] = np.repeat(0, unique_gdp.shape[0]) + spending_commerce = pd.concat([spending_commerce, unique_gdp], ignore_index=True) + spending_commerce = spending_commerce.sort_values(by=["GeoFIPS", "GeoName", "year"]) + + assert ( + spending_commerce["GeoFIPS"].nunique() == spending_commerce["GeoName"].nunique() + ) + assert spending_commerce["GeoFIPS"].nunique() == gdp["GeoFIPS"].nunique() + + spending_commerce = spending_commerce.rename(columns={"year": "Year"}) + + # standardizing and saving + spending_commerce_long = spending_commerce.copy() + + spending_commerce_wide = spending_commerce.pivot_table( + index=["GeoFIPS", "GeoName"], columns="Year", values="total_obligated_amount" + ) + spending_commerce_wide.reset_index(inplace=True) + spending_commerce_wide.columns.name = None + spending_commerce_wide = spending_commerce_wide.fillna(0) + + spending_commerce_std_long = standardize_and_scale(spending_commerce) + spending_commerce_std_wide = standardize_and_scale(spending_commerce_wide) + + spending_commerce_wide.to_csv( + f"{root}/data/processed/spending_commerce_wide.csv", index=False + ) + spending_commerce_long.to_csv( + 
f"{root}/data/processed/spending_commerce_long.csv", index=False + ) + spending_commerce_std_wide.to_csv( + f"{root}/data/processed/spending_commerce_std_wide.csv", index=False + ) + spending_commerce_std_long.to_csv( + f"{root}/data/processed/spending_commerce_std_long.csv", index=False + ) diff --git a/build/cities/utils/cleaning_scripts/clean_spending_transportation.py b/build/cities/utils/cleaning_scripts/clean_spending_transportation.py new file mode 100644 index 00000000..0ff49927 --- /dev/null +++ b/build/cities/utils/cleaning_scripts/clean_spending_transportation.py @@ -0,0 +1,183 @@ +import numpy as np +import pandas as pd + +from cities.utils.cleaning_utils import standardize_and_scale +from cities.utils.data_grabber import DataGrabber, find_repo_root + +root = find_repo_root() + + +def clean_spending_transportation(): + data = DataGrabber() + data.get_features_wide(["gdp"]) + gdp = data.wide + gdp = gdp.get("gdp") + + spending_transportation = pd.read_csv( + f"{root}/data/raw/spending_transportation.csv" + ) + + transportUnwanted = spending_transportation[ + ( + pd.isna(spending_transportation["total_obligated_amount"]) + | (spending_transportation["total_obligated_amount"] == 1) + | (spending_transportation["total_obligated_amount"] == 0) + ) + ] + + exclude_mask = spending_transportation["total_obligated_amount"].isin( + transportUnwanted["total_obligated_amount"] + ) + spending_transportation = spending_transportation[ + ~exclude_mask + ] # 66 values removed + + assert spending_transportation.isna().sum().sum() == 0, "Na values detected" + + # loading names and repearing fips of value 3 and shorter + + names_transportation = pd.read_csv( + f"{root}/data/raw/spending_transportation_names.csv" + ) + + short_geofips = spending_transportation[ + spending_transportation["GeoFIPS"].astype(str).str.len().between(1, 3) + ] + + spending_only_fips = np.setdiff1d( + spending_transportation["GeoFIPS"], gdp["GeoFIPS"] + ) + + fips4_to_repeair = [ + fip for fip in 
spending_only_fips if (fip < 10000 and fip > 999) + ] + short4_fips = spending_transportation[ + spending_transportation["GeoFIPS"].isin(fips4_to_repeair) + ] + + full_geofipsLIST = [fip for fip in spending_only_fips if fip > 9999] + full_geofips = spending_transportation[ + spending_transportation["GeoFIPS"].isin(full_geofipsLIST) + ] + + cleaningLIST = [full_geofips, short4_fips, short_geofips] + + for badFIPS in cleaningLIST: + geofips_to_geonamealt = dict( + zip(names_transportation["GeoFIPS"], names_transportation["GeoNameALT"]) + ) + + badFIPS["GeoNameALT"] = badFIPS["GeoFIPS"].map(geofips_to_geonamealt) + badFIPS = badFIPS.rename(columns={"GeoFIPS": "damagedFIPS"}) + + badFIPSmapping_dict = dict(zip(gdp["GeoName"], gdp["GeoFIPS"])) + + badFIPS["repairedFIPS"] = badFIPS["GeoNameALT"].apply( + lambda x: badFIPSmapping_dict.get(x) + ) + repaired_geofips = badFIPS[badFIPS["repairedFIPS"].notna()] + + repair_ratio = repaired_geofips.shape[0] / badFIPS.shape[0] + print(f"Ratio of repaired FIPS: {round(repair_ratio, 2)}") + + # assert repair_ratio > 0.9, f'Less than 0.9 of FIPS were successfully repaired!' + + spending_transportation["GeoFIPS"] = spending_transportation["GeoFIPS"].replace( + dict(zip(repaired_geofips["damagedFIPS"], repaired_geofips["repairedFIPS"])) + ) + + # deleting short FIPS codes + count_short_geofips = spending_transportation[ + spending_transportation["GeoFIPS"] <= 999 + ]["GeoFIPS"].count() + assert ( + count_short_geofips / spending_transportation.shape[0] < 0.05 + ), "More than 0.05 of FIPS are short and will be deleted!" 
+ + spending_transportation = spending_transportation[ + spending_transportation["GeoFIPS"] > 999 + ] + + common_fips = np.intersect1d( + gdp["GeoFIPS"].unique(), spending_transportation["GeoFIPS"].unique() + ) + + all_FIPS_spending_transportation = spending_transportation.copy() + + spending_transportation = spending_transportation[ + spending_transportation["GeoFIPS"].isin(common_fips) + ] # 0.96 of FIPS are common + assert ( + spending_transportation.shape[0] / all_FIPS_spending_transportation.shape[0] + > 0.9 + ), "Less than 0.9 of FIPS are common!" + + # grouping duplicate fips for years + # (they appeared because we have repaired some of them and now they match with number that is already present) + + spending_transportation = ( + spending_transportation.groupby(["GeoFIPS", "year"])["total_obligated_amount"] + .sum() + .reset_index() + ) + spending_transportation.reset_index(drop=True, inplace=True) + + # adding GeoNames + spending_transportation = spending_transportation.merge( + gdp[["GeoFIPS", "GeoName"]], on="GeoFIPS", how="left" + )[["GeoFIPS", "GeoName", "year", "total_obligated_amount"]] + + # adding missing FIPS with 0 values in total_obligated_amount column, and 2018 year (as a dummy variable) + + unique_gdp = gdp[["GeoFIPS", "GeoName"]].drop_duplicates( + subset=["GeoFIPS", "GeoName"], keep="first" + ) + exclude_geofips = set(spending_transportation["GeoFIPS"]) + unique_gdp = unique_gdp[~unique_gdp["GeoFIPS"].isin(exclude_geofips)] + + unique_gdp["year"] = np.repeat(2018, unique_gdp.shape[0]) + unique_gdp["total_obligated_amount"] = np.repeat(0, unique_gdp.shape[0]) + spending_transportation = pd.concat( + [spending_transportation, unique_gdp], ignore_index=True + ) + spending_transportation = spending_transportation.sort_values( + by=["GeoFIPS", "GeoName", "year"] + ) + + assert ( + spending_transportation["GeoFIPS"].nunique() + == spending_transportation["GeoName"].nunique() + ) + assert spending_transportation["GeoFIPS"].nunique() == 
gdp["GeoFIPS"].nunique() + + spending_transportation = spending_transportation.rename(columns={"year": "Year"}) + + # standardizing and saving + spending_transportation_long = spending_transportation.copy() + + spending_transportation_wide = spending_transportation.pivot_table( + index=["GeoFIPS", "GeoName"], columns="Year", values="total_obligated_amount" + ) + spending_transportation_wide.reset_index(inplace=True) + spending_transportation_wide.columns.name = None + spending_transportation_wide = spending_transportation_wide.fillna(0) + + spending_transportation_std_long = standardize_and_scale( + spending_transportation_long + ) + spending_transportation_std_wide = standardize_and_scale( + spending_transportation_wide + ) + + spending_transportation_wide.to_csv( + f"{root}/data/processed/spending_transportation_wide.csv", index=False + ) + spending_transportation_long.to_csv( + f"{root}/data/processed/spending_transportation_long.csv", index=False + ) + spending_transportation_std_wide.to_csv( + f"{root}/data/processed/spending_transportation_std_wide.csv", index=False + ) + spending_transportation_std_long.to_csv( + f"{root}/data/processed/spending_transportation_std_long.csv", index=False + ) diff --git a/build/cities/utils/cleaning_scripts/clean_transport.py b/build/cities/utils/cleaning_scripts/clean_transport.py new file mode 100644 index 00000000..df789ecb --- /dev/null +++ b/build/cities/utils/cleaning_scripts/clean_transport.py @@ -0,0 +1,93 @@ +import numpy as np +import pandas as pd + +from cities.utils.cleaning_utils import standardize_and_scale +from cities.utils.data_grabber import DataGrabber, find_repo_root + +root = find_repo_root() + + +def clean_transport(): + data = DataGrabber() + data.get_features_wide(["gdp"]) + gdp = data.wide + gdp = gdp.get("gdp") + + # grabbing gdp for comparison + + transport = pd.read_csv(f"{root}/data/raw/smartLocationSmall.csv") + + # choosing transport variables + transport = transport[["GeoFIPS", "D3A", 
"WeightAvgNatWalkInd"]] + + # list of GeoFips with Na values + transportUnwanted = transport[ + ( + pd.isna(transport["WeightAvgNatWalkInd"]) + | (transport["WeightAvgNatWalkInd"] == 1) + ) + | (transport["D3A"] == 0) + | (transport["D3A"] == 1) + ] + + exclude_mask = transport["GeoFIPS"].isin(transportUnwanted["GeoFIPS"]) + transport = transport[~exclude_mask] + + # the step above deleted 10 records with NAs, + # no loss on a dataset because they were not common with gdp anyway + + assert transport.isna().sum().sum() == 0, "Na values detected" + assert transport["GeoFIPS"].is_unique + + # subsetting to common FIPS numbers + + common_fips = np.intersect1d(gdp["GeoFIPS"].unique(), transport["GeoFIPS"].unique()) + transport = transport[transport["GeoFIPS"].isin(common_fips)] + + assert len(common_fips) == len(transport["GeoFIPS"].unique()) + assert len(transport) > 2800, "The number of records is lower than 2800" + + # adding geoname column + transport = transport.merge(gdp[["GeoFIPS", "GeoName"]], on="GeoFIPS", how="left")[ + ["GeoFIPS", "GeoName", "D3A", "WeightAvgNatWalkInd"] + ] + + # renaming D3A to roadDenisty + transport.rename(columns={"D3A": "roadDensity"}, inplace=True) + + patState = r", [A-Z]{2}(\*{1,2})?$" + GeoNameError = "Wrong GeoName value!" 
+ assert transport["GeoName"].str.contains(patState, regex=True).all(), GeoNameError + assert sum(transport["GeoName"].str.count(", ")) == transport.shape[0], GeoNameError + + # changing values to floats + + for column in transport.columns[2:]: + transport[column] = transport[column].astype(float) + + # Standardizing, formatting, saving + + transport_wide = transport.copy() + transport_std_wide = standardize_and_scale(transport) + + transport_long = pd.melt( + transport, + id_vars=["GeoFIPS", "GeoName"], + var_name="Category", + value_name="Value", + ) + transport_std_long = pd.melt( + transport_std_wide.copy(), + id_vars=["GeoFIPS", "GeoName"], + var_name="Category", + value_name="Value", + ) + + transport_wide.to_csv(f"{root}/data/processed/transport_wide.csv", index=False) + transport_long.to_csv(f"{root}/data/processed/transport_long.csv", index=False) + transport_std_wide.to_csv( + f"{root}/data/processed/transport_std_wide.csv", index=False + ) + transport_std_long.to_csv( + f"{root}/data/processed/transport_std_long.csv", index=False + ) diff --git a/build/cities/utils/cleaning_scripts/clean_unemployment.py b/build/cities/utils/cleaning_scripts/clean_unemployment.py new file mode 100644 index 00000000..4e25369c --- /dev/null +++ b/build/cities/utils/cleaning_scripts/clean_unemployment.py @@ -0,0 +1,12 @@ +from cities.utils.clean_variable import VariableCleaner +from cities.utils.data_grabber import find_repo_root + +root = find_repo_root() + + +def clean_unemployment(): + cleaner = VariableCleaner( + variable_name="unemployment_rate", + path_to_raw_csv=f"{root}/data/raw/unemployment_rate_wide_withNA.csv", + ) + cleaner.clean_variable() diff --git a/build/cities/utils/cleaning_scripts/clean_urbanicity_ma.py b/build/cities/utils/cleaning_scripts/clean_urbanicity_ma.py new file mode 100644 index 00000000..710c8533 --- /dev/null +++ b/build/cities/utils/cleaning_scripts/clean_urbanicity_ma.py @@ -0,0 +1,118 @@ +import numpy as np +import pandas as pd + +from 
cities.utils.clean_variable import VariableCleanerMSA +from cities.utils.data_grabber import find_repo_root + +root = find_repo_root() + + +def clean_urbanicity_initially(): + population_urban = pd.read_csv( + f"{root}/data/raw/DECENNIALDHC2020.P2-2023-12-25T165149.csv" + ) + + population_urban.set_index("Label (Grouping)", inplace=True) + transposed_df = population_urban.transpose() + transposed_df.reset_index(inplace=True) + df_population_urban = transposed_df.copy() + + filtered_df = pd.DataFrame( + df_population_urban[df_population_urban["index"].str.endswith("Metro Area")] + ) + + filtered_df = filtered_df.rename(columns={"index": "MetroName"}) + + filtered_df.columns = filtered_df.columns.str.replace("Total:", "total_pop") + filtered_df.columns = filtered_df.columns.str.replace("Urban", "urban_pop") + filtered_df.columns = filtered_df.columns.str.replace("Rural", "rural_pop") + filtered_df = filtered_df.iloc[:, :-1].reset_index(drop=True) + + population_urban = filtered_df.copy() + + housing_urban = pd.read_csv( + f"{root}/data/raw/DECENNIALDHC2020.H2-2023-12-25T174403.csv" + ) + + housing_urban.set_index("Label (Grouping)", inplace=True) + transposed_df = housing_urban.transpose() + transposed_df.reset_index(inplace=True) + housing_urban = transposed_df.copy() + + filtered_df = pd.DataFrame( + housing_urban[housing_urban["index"].str.endswith("Metro Area")] + ) + + filtered_df = filtered_df.rename(columns={"index": "MetroName"}) + + filtered_df.columns = filtered_df.columns.str.replace("Total:", "total_housing") + filtered_df.columns = filtered_df.columns.str.replace("Urban", "urban_housing") + filtered_df.columns = filtered_df.columns.str.replace("Rural", "rural_housing") + filtered_df = filtered_df.iloc[:, :-1].reset_index(drop=True) + housing_urban = filtered_df.copy() + + metrolist = pd.read_csv(f"{root}/data/raw/metrolist.csv") + + merged_df = housing_urban.merge(population_urban, on="MetroName") + + merged_df["MetroName"] = 
merged_df["MetroName"].str.replace("Metro Area", "(MA)") + + df1_subset = metrolist[["GeoFIPS", "GeoName"]].drop_duplicates() + + merged_df = pd.merge( + merged_df, df1_subset, left_on=["MetroName"], right_on=["GeoName"], how="left" + ) + + merged_df = merged_df.drop(columns=["GeoName"]) + merged_df.dropna(inplace=True) + + merged_df.columns = merged_df.columns.str.strip() + ordered_columns = [ + "GeoFIPS", + "MetroName", + "total_housing", + "urban_housing", + "rural_housing", + "total_pop", + "urban_pop", + "rural_pop", + ] + ordered_df = merged_df[ordered_columns] + + ordered_df = ordered_df.rename(columns={"MetroName": "GeoName"}) + + numeric_columns = [ + "total_housing", + "urban_housing", + "rural_housing", + "total_pop", + "urban_pop", + "rural_pop", + ] + ordered_df[numeric_columns] = ( + ordered_df[numeric_columns].replace({",": ""}, regex=True).astype(float) + ) + + ordered_df["GeoFIPS"] = ordered_df["GeoFIPS"].astype(np.int64) + + ordered_df["rural_pop_prct"] = ordered_df["rural_pop"] / ordered_df["total_pop"] + ordered_df["rural_housing_prct"] = ( + ordered_df["rural_housing"] / ordered_df["total_housing"] + ) + + ordered_df.drop(["total_pop", "total_housing"], axis=1, inplace=True) + + ordered_df.reset_index(drop=True, inplace=True) + + ordered_df.to_csv(f"{root}/data/raw/urbanicity_ma.csv", index=False) + + +def clean_urbanicity_ma(): + clean_urbanicity_initially() + + cleaner = VariableCleanerMSA( + variable_name="urbanicity_ma", + path_to_raw_csv=f"{root}/data/raw/urbanicity_ma.csv", + year_or_category="Category", + ) + cleaner.clean_variable() diff --git a/build/cities/utils/cleaning_scripts/clean_urbanization.py b/build/cities/utils/cleaning_scripts/clean_urbanization.py new file mode 100644 index 00000000..db199e2b --- /dev/null +++ b/build/cities/utils/cleaning_scripts/clean_urbanization.py @@ -0,0 +1,78 @@ +import numpy as np +import pandas as pd + +from cities.utils.cleaning_utils import standardize_and_scale +from cities.utils.data_grabber 
import DataGrabber, find_repo_root + +root = find_repo_root() + + +def clean_urbanization(): + data = DataGrabber() + data.get_features_wide(["gdp"]) + gdp = data.wide["gdp"] + + dtype_mapping = {"STATE": str, "COUNTY": str} + urbanization = pd.read_csv( + f"{root}/data/raw/2020_UA_COUNTY.csv", dtype=dtype_mapping + ) + + urbanization["GeoFIPS"] = urbanization["STATE"].astype(str) + urbanization[ + "COUNTY" + ].astype(str) + urbanization["GeoFIPS"] = urbanization["GeoFIPS"].astype(int) + + common_fips = np.intersect1d( + gdp["GeoFIPS"].unique(), urbanization["GeoFIPS"].unique() + ) + + urbanization = urbanization[urbanization["GeoFIPS"].isin(common_fips)] + + urbanization = urbanization.merge( + gdp[["GeoFIPS", "GeoName"]], on="GeoFIPS", how="left" + ) + + urbanization = urbanization[ + [ + "GeoFIPS", + "GeoName", + "POPDEN_RUR", + "POPDEN_URB", + "HOUDEN_COU", + "HOUDEN_RUR", + "ALAND_PCT_RUR", + ] + ] + + urbanization = urbanization.sort_values(by=["GeoFIPS", "GeoName"]) + + urbanization_wide = urbanization.copy() + + urbanization_long = pd.melt( + urbanization, + id_vars=["GeoFIPS", "GeoName"], + var_name="Category", + value_name="Value", + ) + + urbanization_std_wide = standardize_and_scale(urbanization) + + urbanization_std_long = pd.melt( + urbanization_std_wide.copy(), + id_vars=["GeoFIPS", "GeoName"], + var_name="Category", + value_name="Value", + ) + + urbanization_wide.to_csv( + f"{root}/data/processed/urbanization_wide.csv", index=False + ) + urbanization_long.to_csv( + f"{root}/data/processed/urbanization_long.csv", index=False + ) + urbanization_std_wide.to_csv( + f"{root}/data/processed/urbanization_std_wide.csv", index=False + ) + urbanization_std_long.to_csv( + f"{root}/data/processed/urbanization_std_long.csv", index=False + ) diff --git a/build/cities/utils/cleaning_scripts/cleaning_pipeline.py b/build/cities/utils/cleaning_scripts/cleaning_pipeline.py new file mode 100644 index 00000000..542836de --- /dev/null +++ 
b/build/cities/utils/cleaning_scripts/cleaning_pipeline.py @@ -0,0 +1,74 @@ +from cities.utils.clean_age_composition import clean_age_composition +from cities.utils.clean_burdens import clean_burdens +from cities.utils.clean_ethnic_composition import clean_ethnic_composition +from cities.utils.clean_ethnic_composition_ma import clean_ethnic_composition_ma +from cities.utils.clean_gdp import clean_gdp +from cities.utils.clean_gdp_ma import clean_gdp_ma +from cities.utils.clean_hazard import clean_hazard +from cities.utils.clean_homeownership import clean_homeownership +from cities.utils.clean_income_distribution import clean_income_distribution +from cities.utils.clean_industry import clean_industry +from cities.utils.clean_industry_ma import clean_industry_ma +from cities.utils.clean_industry_ts import clean_industry_ts +from cities.utils.clean_population import clean_population +from cities.utils.clean_population_density import clean_population_density +from cities.utils.clean_population_ma import clean_population_ma +from cities.utils.clean_spending_commerce import clean_spending_commerce +from cities.utils.clean_spending_HHS import clean_spending_HHS +from cities.utils.clean_spending_transportation import clean_spending_transportation +from cities.utils.clean_transport import clean_transport +from cities.utils.clean_unemployment import clean_unemployment +from cities.utils.clean_urbanicity_ma import clean_urbanicity_ma +from cities.utils.clean_urbanization import clean_urbanization +from cities.utils.cleaning_poverty import clean_poverty + +# from cities.utils.clean_health import clean_health + + +# clean_health() lost of another 15-ish fips + +clean_population_density() + +clean_homeownership() + +clean_income_distribution() + +clean_hazard() + +clean_burdens() + +clean_age_composition() + +clean_gdp_ma() + +clean_industry_ma() + +clean_urbanicity_ma() + +clean_ethnic_composition_ma() + +clean_population_ma() + +clean_poverty() + +clean_unemployment() + 
+clean_gdp() + +clean_population() + +clean_transport() + +clean_spending_transportation() + +clean_spending_commerce() + +clean_spending_HHS() + +clean_ethnic_composition() + +clean_industry() + +clean_urbanization() + +clean_industry_ts() diff --git a/build/cities/utils/cleaning_scripts/cleaning_poverty.py b/build/cities/utils/cleaning_scripts/cleaning_poverty.py new file mode 100644 index 00000000..83d9d7e2 --- /dev/null +++ b/build/cities/utils/cleaning_scripts/cleaning_poverty.py @@ -0,0 +1,23 @@ +from cities.utils.clean_variable import VariableCleaner +from cities.utils.data_grabber import find_repo_root + +root = find_repo_root() + + +poverty_variables = [ + "povertyAll", + "povertyAllprct", + "povertyUnder18", + "povertyUnder18prct", + "medianHouseholdIncome", +] + + +def clean_poverty(): + for variable_name in poverty_variables: + cleaner = VariableCleaner( + variable_name, + path_to_raw_csv=f"{root}/data/raw/{variable_name}_wide.csv", + year_or_category="Year", + ) + cleaner.clean_variable() diff --git a/build/cities/utils/cleaning_utils.py b/build/cities/utils/cleaning_utils.py new file mode 100644 index 00000000..fa15818d --- /dev/null +++ b/build/cities/utils/cleaning_utils.py @@ -0,0 +1,83 @@ +from typing import List, Union + +import numpy as np +import pandas as pd +from sklearn.preprocessing import StandardScaler + +from cities.utils.data_grabber import DataGrabber + + +def sigmoid(x, scale=1 / 3): + range_0_1 = 1 / (1 + np.exp(-x * scale)) + range_minus1_1 = 2 * range_0_1 - 1 + return range_minus1_1 + + +def standardize_and_scale(data: pd.DataFrame) -> pd.DataFrame: + """ + Standardizes and scales float columns in a DataFrame to [-1,1], copying other columns. Returns a new DataFrame. 
+ """ + standard_scaler = StandardScaler() # Standardize to mean 0, std 1 + + # Copy all columns first + new_data = data.copy() + + # Select float columns + float_cols = data.select_dtypes(include=["float64"]) + + # Standardize float columns to mean 0, std 1 + standardized_floats = standard_scaler.fit_transform(float_cols) + + # Apply sigmoid transformation, [-3std, 3std] to [-1, 1] + new_data[float_cols.columns] = sigmoid(standardized_floats, scale=1 / 3) + + return new_data + + +def revert_standardize_and_scale_scaler( + transformed_values: Union[np.ndarray, List, pd.Series, float], + year: int, + variable_name: str, +) -> List: + if not isinstance(transformed_values, np.ndarray): + transformed_values = np.array(transformed_values) + + def inverse_sigmoid(y, scale=1 / 3): + return -np.log((2 / (y + 1)) - 1) / scale + + # needed to avoid lint issues + dg: DataGrabber + + # normally this will be deployed in a context in which dg already exists + # and we want to avoid wasting time by reloading the data + try: + original_column = dg.wide[variable_name][str(year)].values + except NameError: + dg = DataGrabber() + dg.get_features_wide([variable_name]) + original_column = dg.wide[variable_name][str(year)].values.reshape(-1, 1) + + # dg = DataGrabber() + # dg.get_features_wide([variable_name]) + + # original_column = dg.wide[variable_name][str(year)].values.reshape(-1, 1) + scaler = StandardScaler() + scaler.fit(original_column) + + inverted_values_sigmoid = inverse_sigmoid(transformed_values) + inverted_values = scaler.inverse_transform( + inverted_values_sigmoid.reshape(-1, 1) + ).flatten() + + return inverted_values + + +def revert_prediction_df(df: pd.DataFrame, variable_name: str) -> pd.DataFrame: + df_copy = df.copy() + + for i in range(len(df)): + df_copy.iloc[i, 1:] = revert_standardize_and_scale_scaler( + df.iloc[i, 1:].tolist(), df.iloc[i, 0], variable_name + ) + + return df_copy diff --git a/build/cities/utils/data_grabber.py 
b/build/cities/utils/data_grabber.py new file mode 100644 index 00000000..ba6ee5e6 --- /dev/null +++ b/build/cities/utils/data_grabber.py @@ -0,0 +1,119 @@ +import os +import re +import sys +from pathlib import Path +from typing import List + +import pandas as pd + + +def find_repo_root() -> Path: + return Path(__file__).parent.parent.parent + + +def check_if_tensed(df): + years_to_check = ["2015", "2018", "2019", "2020"] + check = df.columns[2:].isin(years_to_check).any().any() + return check + + +class DataGrabber: + def __init__(self): + self.repo_root = find_repo_root() + self.data_path = os.path.join(self.repo_root, "data/processed") + self.wide = {} + self.std_wide = {} + self.long = {} + self.std_long = {} + + def get_features_wide(self, features: List[str]) -> None: + for feature in features: + file_path = os.path.join(self.data_path, f"{feature}_wide.csv") + self.wide[feature] = pd.read_csv(file_path) + + def get_features_std_wide(self, features: List[str]) -> None: + for feature in features: + file_path = os.path.join(self.data_path, f"{feature}_std_wide.csv") + self.std_wide[feature] = pd.read_csv(file_path) + + def get_features_long(self, features: List[str]) -> None: + for feature in features: + file_path = os.path.join(self.data_path, f"{feature}_long.csv") + self.long[feature] = pd.read_csv(file_path) + + def get_features_std_long(self, features: List[str]) -> None: + for feature in features: + file_path = os.path.join(self.data_path, f"{feature}_std_long.csv") + self.std_long[feature] = pd.read_csv(file_path) + + +class MSADataGrabber(DataGrabber): + def __init__(self): + super().__init__() + self.repo_root = find_repo_root() + self.data_path = os.path.join(self.repo_root, "data/MSA_level") + sys.path.insert(0, self.data_path) + + +def list_available_features(level="county"): + root = find_repo_root() + + if level == "county": + folder_path = f"{root}/data/processed" + elif level == "msa": + folder_path = f"{root}/data/MSA_level" + else: + raise 
ValueError("Invalid level. Please choose 'county' or 'msa'.") + + file_names = [f for f in os.listdir(folder_path) if f != ".gitkeep"] + processed_file_names = [] + + for file_name in file_names: + # Use regular expressions to find the patterns and split accordingly + matches = re.split(r"_wide|_long|_std", file_name) + if matches: + processed_file_names.append(matches[0]) + + feature_names = list(set(processed_file_names)) + + return sorted(feature_names) + + +def list_tensed_features(level="county"): + if level == "county": + data = DataGrabber() + all_features = list_available_features(level="county") + + elif level == "msa": + data = MSADataGrabber() + all_features = list_available_features(level="msa") + + else: + raise ValueError("Invalid level. Please choose 'county' or 'msa'.") + + data.get_features_wide(all_features) + + tensed_features = [] + for feature in all_features: + if check_if_tensed(data.wide[feature]): + tensed_features.append(feature) + + return sorted(tensed_features) + + +# TODO this only will pick up spending-based interventions +# needs to be modified/expanded when we add other types of interventions +def list_interventions(): + interventions = [ + feature for feature in list_tensed_features() if feature.startswith("spending_") + ] + return sorted(interventions) + + +def list_outcomes(): + outcomes = [ + feature + for feature in list_tensed_features() + if feature not in list_interventions() + ] + return sorted(outcomes) diff --git a/build/cities/utils/data_loader.py b/build/cities/utils/data_loader.py new file mode 100644 index 00000000..db3a13e3 --- /dev/null +++ b/build/cities/utils/data_loader.py @@ -0,0 +1,89 @@ +import os +from typing import Dict, List + +import pandas as pd +import sqlalchemy +import torch +from torch.utils.data import Dataset + + +class ZoningDataset(Dataset): + def __init__( + self, + categorical, + continuous, + standardization_dictionary=None, + ): + self.categorical = categorical + self.continuous = continuous + 
+ self.standardization_dictionary = standardization_dictionary + + if self.categorical: + self.categorical_levels = dict() + for name in self.categorical.keys(): + self.categorical_levels[name] = torch.unique(categorical[name]) + + N_categorical = len(categorical.keys()) + N_continuous = len(continuous.keys()) + + if N_categorical > 0: + self.n = len(next(iter(categorical.values()))) + elif N_continuous > 0: + self.n = len(next(iter(continuous.values()))) + + def __len__(self): + return self.n + + def __getitem__(self, idx): + cat_data = {key: val[idx] for key, val in self.categorical.items()} + cont_data = {key: val[idx] for key, val in self.continuous.items()} + return { + "categorical": cat_data, + "continuous": cont_data, + } + + +def select_from_data(data, kwarg_names: Dict[str, List[str]]): + _data = {} + _data["outcome"] = data["continuous"][kwarg_names["outcome"]] + _data["categorical"] = { + key: val + for key, val in data["categorical"].items() + if key in kwarg_names["categorical"] + } + _data["continuous"] = { + key: val + for key, val in data["continuous"].items() + if key in kwarg_names["continuous"] + } + + return _data + + +def db_connection(): + DB_USERNAME = os.getenv("DB_USERNAME") + HOST = os.getenv("HOST") + DATABASE = os.getenv("DATABASE") + PASSWORD = os.getenv("PASSWORD") + DB_SEARCH_PATH = os.getenv("DB_SEARCH_PATH") + + return sqlalchemy.create_engine( + f"postgresql://{DB_USERNAME}:{PASSWORD}@{HOST}/{DATABASE}", + connect_args={"options": f"-csearch-path={DB_SEARCH_PATH}"}, + ).connect() + + +def select_from_sql(sql, conn, kwargs, params=None): + df = pd.read_sql(sql, conn, params=params) + return { + "outcome": df[kwargs["outcome"]], + "categorical": { + key: torch.tensor(df[key].values, dtype=torch.int64) + for key in kwargs["categorical"] + }, + "continuous": { + key: torch.tensor(df[key], dtype=torch.float32) + for key in kwargs["continuous"] + }, + } diff --git a/build/cities/utils/percentiles.py b/build/cities/utils/percentiles.py 
new file mode 100644 index 00000000..c4837a53 --- /dev/null +++ b/build/cities/utils/percentiles.py @@ -0,0 +1,64 @@ +import os + +import dill as dill +import numpy as np + +from cities.utils.data_grabber import DataGrabber, find_repo_root, list_interventions + + +def export_sorted_interventions(): + root = find_repo_root() + + interventions = list_interventions() + dg = DataGrabber() + + dg.get_features_std_wide(interventions) + + interventions_sorted = {} + for intervention in interventions: + intervention_frame = dg.std_wide[intervention].copy().iloc[:, 2:] + intervention_frame = intervention_frame.apply( + lambda col: col.sort_values().values + ) + assert ( + all(np.diff(intervention_frame[col]) >= 0) + for col in intervention_frame.columns + ), "A column is not increasing." + interventions_sorted[intervention] = intervention_frame + + with open( + os.path.join(root, "data/sorted_interventions", "interventions_sorted.pkl"), + "wb", + ) as f: + dill.dump(interventions_sorted, f) + + +def transformed_intervention_from_percentile(intervention, year, percentile): + root = find_repo_root() + + with open( + os.path.join(root, "data/sorted_interventions", "interventions_sorted.pkl"), + "rb", + ) as f: + interventions_sorted = dill.load(f) + intervention_frame = interventions_sorted[intervention] + + if str(year) not in intervention_frame.columns: + raise ValueError("Year not in intervention frame.") + + sorted_var = intervention_frame[str(year)] + n = len(sorted_var) + index = percentile * (n - 1) / 100 + + lower_index = int(index) + upper_index = lower_index + 1 + + if lower_index == n - 1: + return sorted_var[lower_index] + + interpolation_factor = index - lower_index + interpolated_value = (1 - interpolation_factor) * sorted_var[ + lower_index + ] + interpolation_factor * sorted_var[upper_index] + + return interpolated_value diff --git a/build/cities/utils/similarity_utils.py b/build/cities/utils/similarity_utils.py new file mode 100644 index 00000000..1db37327 --- 
/dev/null +++ b/build/cities/utils/similarity_utils.py @@ -0,0 +1,172 @@ +from typing import Dict + +import numpy as np +import pandas as pd +from plotly import graph_objs as go + +from cities.utils.data_grabber import check_if_tensed + + +def slice_with_lag(df: pd.DataFrame, fips: int, lag: int) -> Dict[str, np.ndarray]: + """ + Takes a pandas dataframe, a location FIPS and a lag (years), + returns a dictionary with two numpy arrays: + - my_array: the array of features for the location with the given FIPS + - other_arrays: the array of features for all other locations + if lag>0, drops first lag columns from my_array and last lag columns from other_arrays. + Meant to be used prior to calculating similarity. + """ + original_length = df.shape[0] + original_array_width = df.shape[1] - 2 + + # assert error if lag > original array width + assert ( + lag <= original_array_width + ), "Lag is greater than the number of years in the dataframe" + assert lag >= 0, "Lag must be a positive integer" + + # this assumes input df has two columns of metadata, then the rest are features + # obey this convention with other datasets! 
+ + my_row = df.loc[df["GeoFIPS"] == fips].copy() + my_id = my_row[["GeoFIPS", "GeoName"]] + my_values = my_row.iloc[:, 2 + lag :] + + my_df = pd.concat([my_id, my_values], axis=1) + + my_df = pd.DataFrame( + {**my_id.to_dict(orient="list"), **my_values.to_dict(orient="list")} + ) + + assert fips in df["GeoFIPS"].values, "FIPS not found in the dataframe" + other_df = df[df["GeoFIPS"] != fips].copy() + + my_array = np.array(my_values) + + if lag > 0: + other_df = df[df["GeoFIPS"] != fips].iloc[:, :-lag] + + assert fips not in other_df["GeoFIPS"].values, "FIPS found in the other dataframe" + other_arrays = np.array(other_df.iloc[:, 2:]) + + assert other_arrays.shape[0] + 1 == original_length, "Dataset sizes don't match" + assert other_arrays.shape[1] == my_array.shape[1], "Lengths don't match" + + return { + "my_array": my_array, + "other_arrays": other_arrays, + "my_df": my_df, + "other_df": other_df, + } + + +def generalized_euclidean_distance(u, v, weights): + featurewise_squared_contributions = ( + abs(weights) + * ((weights >= 0) * abs(u - v) + (weights < 0) * (-abs(u - v) + 2)) ** 2 + ) + + featurewise_contributions = featurewise_squared_contributions ** (1 / 2) + + distance = sum(featurewise_squared_contributions) ** (1 / 2) + return { + "distance": distance, + "featurewise_contributions": featurewise_contributions, + } + + +def divide_exponentially(group_weight, number_of_features, rate): + """ + Returns a list of `number_of_features` weights that sum to `group_weight` and are distributed + exponentially. Intended for time series feature groups. + If `rate` is 1, all weights are equal. If `rate` is greater than 1, weights + prefer more recent events. 
+ """ + result = [] + denominator = sum([rate**j for j in range(number_of_features)]) + for i in range(number_of_features): + value = group_weight * (rate**i) / denominator + result.append(value) + return result + + +def compute_weight_array(query_object, rate=1.08): + assert ( + sum( + abs(value) + for key, value in query_object.feature_groups_with_weights.items() + ) + != 0 + ), "At least one weight has to be other than 0" + + max_other_scores = sum( + abs(value) + for key, value in query_object.feature_groups_with_weights.items() + if key != query_object.outcome_var + ) + + if ( + query_object.outcome_var + and query_object.feature_groups_with_weights[query_object.outcome_var] != 0 + ): + weight_outcome_joint = max_other_scores if max_other_scores > 0 else 1 + query_object.feature_groups_with_weights[query_object.outcome_var] = ( + weight_outcome_joint + * query_object.feature_groups_with_weights[query_object.outcome_var] + ) + + tensed_status = {} + columns = {} + column_counts = {} + weight_lists = {} + all_columns = [] + for feature in query_object.feature_groups: + tensed_status[feature] = check_if_tensed(query_object.data.std_wide[feature]) + + if feature == query_object.outcome_var: + columns[feature] = query_object.restricted_outcome_df.columns[2:] + else: + columns[feature] = query_object.data.std_wide[feature].columns[2:] + + # TODO remove if all tests passed before merging + # column_counts[feature] = len(query_object.data.std_wide[feature].columns) - 2 + + column_counts[feature] = len(columns[feature]) + + if feature == query_object.outcome_var and query_object.lag > 0: + column_counts[feature] -= query_object.lag + + all_columns.extend([f"{column}_{feature}" for column in columns[feature]]) + + # TODO: remove if tests passed + # column_tags.extend([feature] * column_counts[feature]) + if tensed_status[feature]: + weight_lists[feature] = divide_exponentially( + query_object.feature_groups_with_weights[feature], + column_counts[feature], + rate, + ) + 
else: + weight_lists[feature] = [ + query_object.feature_groups_with_weights[feature] + / column_counts[feature] + ] * column_counts[feature] + + query_object.all_columns = all_columns[query_object.lag :] + query_object.all_weights = np.concatenate(list(weight_lists.values())) + + +def plot_weights(query_object): + fig = go.Figure() + + fig.add_trace(go.Bar(x=query_object.all_columns, y=query_object.all_weights)) + + fig.update_layout( + xaxis_title="columns", + yaxis_title="weights", + title="Weights of columns", + template="plotly_white", + ) + + query_object.weigth_plot = fig + query_object.weigth_plot.show() diff --git a/build/cities/utils/years_available_pipeline.py b/build/cities/utils/years_available_pipeline.py new file mode 100644 index 00000000..37ea85fe --- /dev/null +++ b/build/cities/utils/years_available_pipeline.py @@ -0,0 +1,31 @@ +import os + +import dill + +from cities.modeling.modeling_utils import prep_wide_data_for_inference +from cities.utils.data_grabber import find_repo_root, list_interventions, list_outcomes + +root = find_repo_root() +interventions = list_interventions() +outcomes = list_outcomes() + + +for intervention in interventions: + for outcome in outcomes: + # intervention = "spending_HHS" + # outcome = "gdp" + data = prep_wide_data_for_inference( + outcome_dataset=outcome, + intervention_dataset=intervention, + forward_shift=3, # shift doesn't matter here, as long as data exists + ) + data_slim = {key: data[key] for key in ["years_available", "outcome_years"]} + + assert len(data_slim["years_available"]) > 2 + file_path = os.path.join( + root, "data/years_available", f"{intervention}_{outcome}.pkl" + ) + print(file_path) + if not os.path.exists(file_path): + with open(file_path, "wb") as f: + dill.dump(data_slim, f) diff --git a/build/main.py b/build/main.py new file mode 100644 index 00000000..fbfcea0b --- /dev/null +++ b/build/main.py @@ -0,0 +1,235 @@ +import os + +from typing import Annotated + +from dotenv import load_dotenv 
+from fastapi import FastAPI, Depends, Query +from fastapi.middleware.gzip import GZipMiddleware +import uvicorn + +import psycopg2 +from psycopg2.pool import ThreadedConnectionPool + +load_dotenv() + +ENV = os.getenv("ENV") +USERNAME = os.getenv("DB_USERNAME") +PASSWORD = os.getenv("PASSWORD") +HOST = os.getenv("HOST") +DATABASE = os.getenv("DATABASE") +DB_SEARCH_PATH = os.getenv("DB_SEARCH_PATH") +INSTANCE_CONNECTION_NAME = os.getenv("INSTANCE_CONNECTION_NAME") + +app = FastAPI() + +if ENV == "dev": + from fastapi.middleware.cors import CORSMiddleware + + origins = [ + "http://localhost", + "http://localhost:5000", + ] + app.add_middleware(CORSMiddleware, allow_origins=origins, allow_credentials=True) + +app.add_middleware(GZipMiddleware, minimum_size=1000, compresslevel=5) + + +if ENV == "dev": + host = HOST +else: + host = f"/cloudsql/{INSTANCE_CONNECTION_NAME}" + +pool = ThreadedConnectionPool( + 1, + 10, + user=USERNAME, + password=PASSWORD, + host=HOST, + database=DATABASE, + options=f"-csearch_path={DB_SEARCH_PATH}", +) + + +def get_db() -> psycopg2.extensions.connection: + db = pool.getconn() + try: + yield db + finally: + pool.putconn(db) + + +predictor = None + + +def get_predictor(db: psycopg2.extensions.connection = Depends(get_db)): + from cities.deployment.tracts_minneapolis.predict import TractsModelPredictor + + global predictor + if predictor is None: + predictor = TractsModelPredictor(db) + return predictor + + +Limit = Annotated[float, Query(ge=0, le=1)] +Radius = Annotated[float, Query(ge=0)] +Year = Annotated[int, Query(ge=2000, le=2030)] + + +@app.middleware("http") +async def add_cache_control_header(request, call_next): + response = await call_next(request) + response.headers["Cache-Control"] = "public, max-age=300" + return response + + +if ENV == "dev": + + @app.middleware("http") + async def add_acess_control_header(request, call_next): + response = await call_next(request) + response.headers["Access-Control-Allow-Origin"] = "*" + return 
response + + +@app.get("/demographics") +async def read_demographics( + category: Annotated[str, Query(max_length=100)], db=Depends(get_db) +): + with db.cursor() as cur: + cur.execute( + """ + select tract_id, "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022" + from api__demographics where description = %s + """, + (category,), + ) + return [[desc[0] for desc in cur.description]] + cur.fetchall() + + +@app.get("/census-tracts") +async def read_census_tracts(year: Year, db=Depends(get_db)): + with db.cursor() as cur: + cur.execute("select * from api__census_tracts where year_ = %s", (year,)) + row = cur.fetchone() + + return row[1] if row is not None else None + + +@app.get("/high-frequency-transit-lines") +async def read_high_frequency_transit_lines(year: Year, db=Depends(get_db)): + with db.cursor() as cur: + cur.execute( + """ + select line_geom_json + from api__high_frequency_transit_lines + where '%s-01-01'::date <@ valid + """, + (year,), + ) + row = cur.fetchone() + + return row[0] if row is not None else None + + +@app.get("/high-frequency-transit-stops") +async def read_high_frequency_transit_stops(year: Year, db=Depends(get_db)): + with db.cursor() as cur: + cur.execute( + """ + select stop_geom_json + from api__high_frequency_transit_lines + where '%s-01-01'::date <@ valid + """, + (year,), + ) + row = cur.fetchone() + + return row[0] if row is not None else None + + +@app.get("/yellow-zone") +async def read_yellow_zone( + year: Year, line_radius: Radius, stop_radius: Radius, db=Depends(get_db) +): + with db.cursor() as cur: + cur.execute( + """ + select + st_asgeojson(st_transform(st_union(st_buffer(line_geom, %s, 'quad_segs=4'), st_buffer(stop_geom, %s, 'quad_segs=4')), 4269))::json + from api__high_frequency_transit_lines + where '%s-01-01'::date <@ valid + """, + (line_radius, stop_radius, year), + ) + row = cur.fetchone() + + if row is None: + return None + + return { + "type": "FeatureCollection", + 
"features": [ + {"type": "Feature", "properties": {"id": "0"}, "geometry": row[0]} + ], + } + + +@app.get("/blue-zone") +async def read_blue_zone(year: Year, radius: Radius, db=Depends(get_db)): + with db.cursor() as cur: + cur.execute( + """ + select st_asgeojson(st_transform(st_buffer(line_geom, %s, 'quad_segs=4'), 4269))::json + from api__high_frequency_transit_lines + where '%s-01-01'::date <@ valid + """, + (radius, year), + ) + row = cur.fetchone() + + if row is None: + return None + + return { + "type": "FeatureCollection", + "features": [ + {"type": "Feature", "properties": {"id": "0"}, "geometry": row[0]} + ], + } + + +@app.get("/predict") +async def read_predict( + blue_zone_radius: Radius, + yellow_zone_line_radius: Radius, + yellow_zone_stop_radius: Radius, + blue_zone_limit: Limit, + yellow_zone_limit: Limit, + year: Year, + db=Depends(get_db), + predictor=Depends(get_predictor), +): + result = predictor.predict_cumulative( + db, + intervention=( + { + "radius_blue": blue_zone_radius, + "limit_blue": blue_zone_limit, + "radius_yellow_line": yellow_zone_line_radius, + "radius_yellow_stop": yellow_zone_stop_radius, + "limit_yellow": yellow_zone_limit, + "reform_year": year, + } + ), + ) + return { + "census_tracts": [str(t) for t in result["census_tracts"]], + "housing_units_factual": [t.item() for t in result["housing_units_factual"]], + "housing_units_counterfactual": [ + t.tolist() for t in result["housing_units_counterfactual"] + ], + } + + +if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 8000))) diff --git a/build/postgrest.conf b/build/postgrest.conf new file mode 100644 index 00000000..ddb71965 --- /dev/null +++ b/build/postgrest.conf @@ -0,0 +1,107 @@ +## Admin server used for checks. It's disabled by default unless a port is specified. 
+# admin-server-port = 3001 + +## The database role to use when no client authentication is provided +db-anon-role = "web_anon" + +## Notification channel for reloading the schema cache +db-channel = "pgrst" + +## Enable or disable the notification channel +db-channel-enabled = true + +## Enable in-database configuration +db-config = true + +## Function for in-database configuration +## db-pre-config = "postgrest.pre_config" + +## Extra schemas to add to the search_path of every request +db-extra-search-path = "public" + +## Limit rows in response +# db-max-rows = 1000 + +## Allow getting the EXPLAIN plan through the `Accept: application/vnd.pgrst.plan` header +# db-plan-enabled = false + +## Number of open connections in the pool +db-pool = 10 + +## Time in seconds to wait to acquire a slot from the connection pool +# db-pool-acquisition-timeout = 10 + +## Time in seconds after which to recycle pool connections +# db-pool-max-lifetime = 1800 + +## Time in seconds after which to recycle unused pool connections +# db-pool-max-idletime = 30 + +## Allow automatic database connection retrying +# db-pool-automatic-recovery = true + +## Stored proc to exec immediately after auth +# db-pre-request = "stored_proc_name" + +## Enable or disable prepared statements. disabling is only necessary when behind a connection pooler. +## When disabled, statements will be parametrized but won't be prepared. 
+db-prepared-statements = true + +## The name of which database schema to expose to REST clients +db-schemas = "api" + +## How to terminate database transactions +## Possible values are: +## commit (default) +## Transaction is always committed, this cannot be overridden +## commit-allow-override +## Transaction is committed, but can be overridden with Prefer tx=rollback header +## rollback +## Transaction is always rolled back, this cannot be overridden +## rollback-allow-override +## Transaction is rolled back, but can be overridden with Prefer tx=commit header +db-tx-end = "commit" + +## The standard connection URI format, documented at +## https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING +db-uri = "postgresql://postgres@34.123.100.76:5432/cities" + +# jwt-aud = "your_audience_claim" + +## JSPath to the role claim key +jwt-role-claim-key = ".role" + +## Choose a secret, JSON Web Key (or set) to enable JWT auth +## (use "@filename" to load from separate file) +# jwt-secret = "secret_with_at_least_32_characters" +jwt-secret-is-base64 = false + +## Enables and sets JWT Cache max lifetime, disables caching with 0 +# jwt-cache-max-lifetime = 0 + +## Logging level, the admitted values are: crit, error, warn, info and debug. +log-level = "error" + +## Determine if the OpenAPI output should follow or ignore role privileges or be disabled entirely. 
+## Admitted values: follow-privileges, ignore-privileges, disabled +openapi-mode = "follow-privileges" + +## Base url for the OpenAPI output +openapi-server-proxy-uri = "" + +## Configurable CORS origins +# server-cors-allowed-origins = "" + +server-host = "!4" +server-port = 3001 + +## Allow getting the request-response timing information through the `Server-Timing` header +server-timing-enabled = true + +## Unix socket location +## if specified it takes precedence over server-port +# server-unix-socket = "/tmp/pgrst.sock" + +## Unix socket file mode +## When none is provided, 660 is applied by default +# server-unix-socket-mode = "660" diff --git a/build/requirements.txt b/build/requirements.txt new file mode 100644 index 00000000..15840bbf --- /dev/null +++ b/build/requirements.txt @@ -0,0 +1,184 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile --extra=api --output-file=api/requirements.txt +# +annotated-types==0.7.0 + # via pydantic +anyio==4.4.0 + # via + # httpx + # starlette + # watchfiles +certifi==2024.8.30 + # via + # httpcore + # httpx +chirho @ git+https://github.com/BasisResearch/chirho.git + # via cities (setup.py) +click==8.1.7 + # via + # typer + # uvicorn +contourpy==1.3.0 + # via matplotlib +cycler==0.12.1 + # via matplotlib +dill==0.3.8 + # via cities (setup.py) +dnspython==2.6.1 + # via email-validator +email-validator==2.2.0 + # via fastapi +fastapi[standard]==0.114.0 + # via cities (setup.py) +fastapi-cli[standard]==0.0.5 + # via fastapi +filelock==3.16.0 + # via torch +fonttools==4.53.1 + # via matplotlib +fsspec==2024.9.0 + # via torch +h11==0.14.0 + # via + # httpcore + # uvicorn +httpcore==1.0.5 + # via httpx +httptools==0.6.1 + # via uvicorn +httpx==0.27.2 + # via fastapi +idna==3.8 + # via + # anyio + # email-validator + # httpx +jinja2==3.1.4 + # via + # fastapi + # torch +joblib==1.4.2 + # via scikit-learn +kiwisolver==1.4.7 + # via matplotlib +markdown-it-py==3.0.0 + # 
via rich +markupsafe==2.1.5 + # via jinja2 +matplotlib==3.9.2 + # via cities (setup.py) +mdurl==0.1.2 + # via markdown-it-py +mpmath==1.3.0 + # via sympy +networkx==3.3 + # via torch +numpy==2.1.1 + # via + # cities (setup.py) + # contourpy + # matplotlib + # opt-einsum + # pandas + # pyro-ppl + # scikit-learn + # scipy +opt-einsum==3.3.0 + # via pyro-ppl +packaging==24.1 + # via + # matplotlib + # plotly +pandas==2.2.2 + # via cities (setup.py) +pillow==10.4.0 + # via matplotlib +plotly==5.24.0 + # via cities (setup.py) +psycopg2==2.9.9 + # via cities (setup.py) +pydantic==2.9.1 + # via fastapi +pydantic-core==2.23.3 + # via pydantic +pygments==2.18.0 + # via rich +pyparsing==3.1.4 + # via matplotlib +pyro-api==0.1.2 + # via pyro-ppl +pyro-ppl==1.8.6 + # via + # chirho + # cities (setup.py) +python-dateutil==2.9.0.post0 + # via + # matplotlib + # pandas +python-dotenv==1.0.1 + # via uvicorn +python-multipart==0.0.9 + # via fastapi +pytz==2024.1 + # via pandas +pyyaml==6.0.2 + # via uvicorn +rich==13.8.0 + # via typer +scikit-learn==1.5.1 + # via cities (setup.py) +scipy==1.14.1 + # via scikit-learn +shellingham==1.5.4 + # via typer +six==1.16.0 + # via python-dateutil +sniffio==1.3.1 + # via + # anyio + # httpx +sqlalchemy==2.0.34 + # via cities (setup.py) +starlette==0.38.5 + # via fastapi +sympy==1.13.2 + # via torch +tenacity==9.0.0 + # via plotly +threadpoolctl==3.5.0 + # via scikit-learn +#torch==2.4.1 +torch @ https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.4.1%2Bcpu.cxx11.abi-cp312-cp312-linux_x86_64.whl + # via + # cities (setup.py) + # pyro-ppl +tqdm==4.66.5 + # via pyro-ppl +typer==0.12.5 + # via fastapi-cli +typing-extensions==4.12.2 + # via + # fastapi + # pydantic + # pydantic-core + # sqlalchemy + # torch + # typer +tzdata==2024.1 + # via pandas +uvicorn[standard]==0.30.6 + # via + # fastapi + # fastapi-cli +uvloop==0.20.0 + # via uvicorn +watchfiles==0.24.0 + # via uvicorn +websockets==13.0.1 + # via uvicorn + +# The following packages are 
considered to be unsafe in a requirements file: +# setuptools diff --git a/build/schema.sql b/build/schema.sql new file mode 100644 index 00000000..2285c2b7 --- /dev/null +++ b/build/schema.sql @@ -0,0 +1,67 @@ +begin; +drop schema if exists api cascade; + +create schema api; + +create view api.demographics as ( + select * from api__demographics +); + +create view api.census_tracts as ( + select * from api__census_tracts +); + +create function api.high_frequency_transit_lines() returns setof dev.api__high_frequency_transit_lines as $$ + select * from dev.api__high_frequency_transit_lines +$$ language sql; + +create function api.high_frequency_transit_lines( + blue_zone_radius double precision, + yellow_zone_line_radius double precision, + yellow_zone_stop_radius double precision +) returns table ( + valid daterange, + geom geometry(LineString, 4269), + blue_zone_geom geometry(LineString, 4269), + yellow_zone_geom geometry(Geometry, 4269) +) as $$ + with + lines as (select * from dev.stg_high_frequency_transit_lines_union), + stops as (select * from dev.high_frequency_transit_stops), + lines_and_stops as ( + select + lines.valid * stops.valid as valid, + lines.geom as line_geom, + stops.geom as stop_geom + from lines inner join stops on lines.valid && stops.valid + ) + select + valid, + st_transform(line_geom, 4269) as geom, + st_transform(st_buffer(line_geom, blue_zone_radius), 4269) as blue_zone_geom, + st_transform(st_union(st_buffer(line_geom, yellow_zone_line_radius), st_buffer(stop_geom, yellow_zone_stop_radius)), 4269) as yellow_zone_geom + from lines_and_stops +$$ language sql; + +do $$ +begin +create role web_anon nologin; +exception when duplicate_object then raise notice '%, skipping', sqlerrm using errcode = sqlstate; +end +$$; + +grant all on schema public to web_anon; +grant all on schema dev to web_anon; +grant select on table public.spatial_ref_sys TO web_anon; +grant usage on schema api to web_anon; +grant all on all tables in schema api to 
web_anon; +grant all on all functions in schema api to web_anon; +grant all on schema api to web_anon; +GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA dev TO web_anon; +GRANT ALL PRIVILEGES ON ALL functions IN SCHEMA dev TO web_anon; +GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA api TO web_anon; +GRANT ALL PRIVILEGES ON ALL functions IN SCHEMA api TO web_anon; +GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO web_anon; +GRANT ALL PRIVILEGES ON ALL functions IN SCHEMA public TO web_anon; +grant web_anon to postgres; +commit; diff --git a/cities/deployment/tracts_minneapolis/tracts_model_guide.pkl b/cities/deployment/tracts_minneapolis/tracts_model_guide.pkl new file mode 100644 index 00000000..b99e3a1d Binary files /dev/null and b/cities/deployment/tracts_minneapolis/tracts_model_guide.pkl differ diff --git a/cities/deployment/tracts_minneapolis/tracts_model_params.pth b/cities/deployment/tracts_minneapolis/tracts_model_params.pth new file mode 100644 index 00000000..07942006 Binary files /dev/null and b/cities/deployment/tracts_minneapolis/tracts_model_params.pth differ diff --git a/cities/utils/data_grabber.py b/cities/utils/data_grabber.py index bdd48780..3cb3736b 100644 --- a/cities/utils/data_grabber.py +++ b/cities/utils/data_grabber.py @@ -294,15 +294,15 @@ def get_features_std_long(self, features: List[str]) -> None: self._get_features(features, "std_long") -DataGrabber = DataGrabberDB - +# DataGrabber = DataGrabberDB +# DataGrabber = DataGrabberCSV -def MSADataGrabberFactory(): - return DataGrabberDB(level="msa") +# def MSADataGrabberFactory(): +# return DataGrabberDB(level="msa") -MSADataGrabber = MSADataGrabberFactory +# MSADataGrabber = MSADataGrabberFactory # this reverts to csvs -# DataGrabber = DataGrabberCSV -# MSADataGrabber = MSADataGrabberCSV +DataGrabber = DataGrabberCSV +MSADataGrabber = MSADataGrabberCSV diff --git a/data/minneapolis/.pgpass b/data/minneapolis/.pgpass new file mode 100644 index 00000000..ec7b3921 --- /dev/null +++ 
b/data/minneapolis/.pgpass @@ -0,0 +1 @@ +34.123.100.76:5432:cities:postgres:VA.TlSR#Z%mu**Q9 \ No newline at end of file diff --git a/data/minneapolis/sourced/demographic/ar-two-ts-one-predictor.ipynb b/data/minneapolis/sourced/demographic/ar-two-ts-one-predictor.ipynb new file mode 100644 index 00000000..0d6c2cb9 --- /dev/null +++ b/data/minneapolis/sourced/demographic/ar-two-ts-one-predictor.ipynb @@ -0,0 +1,836 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from typing import Dict, List\n", + "import math\n", + "import torch\n", + "import pyro\n", + "import pyro.distributions as dist\n", + "from ts_plots import plot_ts\n", + "import pyro.optim as optim\n", + "import pyro.infer as infer\n", + "import seaborn as sns \n", + "import matplotlib.pyplot as plt\n", + "import pyro\n", + "import torch\n", + "from chirho.indexed.ops import IndexSet, gather, indices_of\n", + "from pyro.infer.autoguide import (AutoMultivariateNormal, init_to_mean, AutoNormal,\n", + " AutoLowRankMultivariateNormal, AutoGaussian,)\n", + "import copy\n", + "\n", + "# import condition from chirho\n", + "from chirho.observational.handlers import condition\n", + "\n", + "\n", + "from torch.utils.data import DataLoader\n", + "\n", + "\n", + "\n", + "smoke_test = 'CI' in os.environ\n", + "\n", + "n_samples = 10 if smoke_test else 1000\n", + "n_steps = 10 if smoke_test else 500\n", + "n_series = 2 if smoke_test else 8 #TODO upgarde to 5" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "census_tracts_data_path = \"pg_census_tracts_dataset.pt\"\n", + "\n", + "def select_from_data(data, kwarg_names: Dict[str, List[str]]):\n", + " _data = {}\n", + " _data[\"outcome\"] = data[\"continuous\"][kwarg_names[\"outcome\"]]\n", + " _data[\"categorical\"] = {\n", + " key: val\n", + " for key, val in data[\"categorical\"].items()\n", + " if key in 
kwarg_names[\"categorical\"]\n", + " }\n", + " _data[\"continuous\"] = {\n", + " key: val\n", + " for key, val in data[\"continuous\"].items()\n", + " if key in kwarg_names[\"continuous\"]\n", + " }\n", + "\n", + " return _data\n", + "\n", + "ct_dataset_read = torch.load(census_tracts_data_path, weights_only=False)\n", + "ct_loader = DataLoader(ct_dataset_read, batch_size=len(ct_dataset_read), shuffle=True)\n", + "data = next(iter(ct_loader))\n", + "\n", + "kwargs = {\n", + " \"categorical\": [\"year\", \"census_tract\", 'university_index', 'downtown_index'],\n", + " \"continuous\": {\n", + " \"housing_units\",\n", + " \"housing_units_original\"\n", + " \"total_value\",\n", + " \"median_value\",\n", + " \"mean_limit_original\",\n", + " \"median_distance\",\n", + " \"income\",\n", + " 'limit',\n", + " \"segregation_original\",\n", + " \"white_original\",\n", + " \"parcel_sqm\",\n", + " 'downtown_overlap', \n", + " 'university_overlap',\n", + " },\n", + " \"outcome\": \"housing_units\",\n", + "}\n", + "\n", + "subset = select_from_data(data, kwargs)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "outcome_obs = copy.deepcopy(subset['outcome'])\n", + "\n", + "series_idx = copy.deepcopy(subset['categorical']['census_tract'])\n", + "time_idx = copy.deepcopy(subset['categorical']['year'])\n", + "\n", + "\n", + "unique_series = torch.unique(series_idx)\n", + "unique_times = torch.unique(time_idx)\n", + "\n", + "num_series = unique_series.size(0)\n", + "time_steps = unique_times.size(0)\n", + "\n", + "reshaped_outcome = torch.empty((num_series, time_steps), dtype=outcome_obs.dtype)\n", + "reshaped_outcome[...,:] = torch.nan \n", + "\n", + "def reshape_into_time_series(variable, series_idx, time_idx):\n", + " \n", + " # raise value eror if they are not of the same shape\n", + " if variable.shape[0] != series_idx.shape[0] or variable.shape[0] != time_idx.shape[0]:\n", + " raise ValueError(\"The shapes of 
variable, series_idx, and time_idx must match.\")\n", + " \n", + " unique_series = torch.unique(series_idx)\n", + " unique_times = torch.unique(time_idx)\n", + "\n", + " num_series = unique_series.size(0)\n", + " time_steps = unique_times.size(0)\n", + "\n", + " reshaped_variable= torch.empty((num_series, time_steps), dtype=variable.dtype)\n", + " reshaped_variable[...,:] = torch.nan\n", + "\n", + " for i, series in enumerate(unique_series):\n", + " for j, time in enumerate(unique_times):\n", + " mask = (series_idx == series) & (time_idx == time)\n", + " index = torch.where(mask)[0]\n", + " if index.numel() > 0:\n", + " reshaped_variable[i, j] = variable[index]\n", + " \n", + " for i, series_id in enumerate(unique_series):\n", + " sorted_times, sorted_indices = torch.sort(time_idx[series_idx == series_id])\n", + " sorted_outcomes = outcome_obs[series_idx == series_id][sorted_indices]\n", + " assert torch.all(reshaped_variable[i,:] == sorted_outcomes)\n", + "\n", + " return { \"reshaped_variable\": reshaped_variable, \"unique_series\": unique_series, \"unique_times\": unique_times }\n", + "\n", + "reshaped_outcome_obs = reshape_into_time_series(outcome_obs, series_idx, time_idx) \n", + "outcome_obs_ts = reshaped_outcome_obs[\"reshaped_variable\"]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([27053000101, 27053000102, 27053000300, 27053000601, 27053000603,\n", + " 27053001100, 27053001700, 27053002200, 27053002400, 27053002700,\n", + " 27053003200, 27053003300, 27053003800, 27053005901, 27053005902,\n", + " 27053006800, 27053007700, 27053007801, 27053008100, 27053008200,\n", + " 27053008300, 27053008400, 27053008500, 27053009500, 27053009600,\n", + " 27053010600, 27053010700, 27053011000, 27053011703, 27053011800,\n", + " 27053011998, 27053012001, 27053012003, 27053100200, 27053100400,\n", + " 27053100500, 27053100700, 27053100800, 27053100900, 
27053101200,\n", + " 27053101300, 27053101600, 27053101800, 27053101900, 27053102000,\n", + " 27053102100, 27053102300, 27053102500, 27053102600, 27053102800,\n", + " 27053102900, 27053103000, 27053103100, 27053103400, 27053103600,\n", + " 27053103700, 27053103900, 27053104000, 27053104100, 27053104400,\n", + " 27053104800, 27053104900, 27053105100, 27053105201, 27053105204,\n", + " 27053105400, 27053105500, 27053105600, 27053105700, 27053106000,\n", + " 27053106200, 27053106400, 27053106500, 27053106600, 27053106700,\n", + " 27053106900, 27053107000, 27053107400, 27053107500, 27053107600,\n", + " 27053108000, 27053108600, 27053108700, 27053108800, 27053108900,\n", + " 27053109000, 27053109100, 27053109200, 27053109300, 27053109400,\n", + " 27053109700, 27053109800, 27053109900, 27053110000, 27053110100,\n", + " 27053110200, 27053110400, 27053110500, 27053110800, 27053110900,\n", + " 27053111100, 27053111200, 27053111300, 27053111400, 27053111500,\n", + " 27053111600, 27053125600, 27053125700, 27053125800, 27053125900,\n", + " 27053126000, 27053126100, 27053126200])\n", + "torch.Size([1130])\n" + ] + } + ], + "source": [ + "print(unique_series)\n", + "print(series_position.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([-0.3007, -0.3340, -0.3229, -0.3007, -0.3340, -0.3007, -0.3229, -0.3451,\n", + " -0.3007, -0.3340])\n" + ] + }, + { + "data": { + "text/plain": [ + "tensor([27053102800, 27053105600, 27053102000, ..., 27053108600,\n", + " 27053100200, 27053100700])" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(outcome_obs[series_idx == 27053102800])\n", + "\n", + "series_idx" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([8, 1])\n", + "predictor 
tensor([[ 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,\n", + " 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,\n", + " 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,\n", + " 1.0000, -2.0000, -2.0000, -2.0000, -2.0000, -2.0000, -2.0000, -2.0000,\n", + " -2.0000, -2.0000, -2.0000, -2.0000, -2.0000, -2.0000, -2.0000, -2.0000,\n", + " -2.0000, -2.0000, -2.0000, -2.0000, -2.0000, -2.0000, -2.0000, -2.0000,\n", + " -2.0000, -2.0000],\n", + " [ 1.1000, 1.1000, 1.1000, 1.1000, 1.1000, 1.1000, 1.1000, 1.1000,\n", + " 1.1000, 1.1000, 1.1000, 1.1000, 1.1000, 1.1000, 1.1000, 1.1000,\n", + " 1.1000, 1.1000, 1.1000, 1.1000, 1.1000, 1.1000, 1.1000, 1.1000,\n", + " 1.1000, -1.8000, -1.8000, -1.8000, -1.8000, -1.8000, -1.8000, -1.8000,\n", + " -1.8000, -1.8000, -1.8000, -1.8000, -1.8000, -1.8000, -1.8000, -1.8000,\n", + " -1.8000, -1.8000, -1.8000, -1.8000, -1.8000, -1.8000, -1.8000, -1.8000,\n", + " -1.8000, -1.8000],\n", + " [ 1.2000, 1.2000, 1.2000, 1.2000, 1.2000, 1.2000, 1.2000, 1.2000,\n", + " 1.2000, 1.2000, 1.2000, 1.2000, 1.2000, 1.2000, 1.2000, 1.2000,\n", + " 1.2000, 1.2000, 1.2000, 1.2000, 1.2000, 1.2000, 1.2000, 1.2000,\n", + " 1.2000, -1.6000, -1.6000, -1.6000, -1.6000, -1.6000, -1.6000, -1.6000,\n", + " -1.6000, -1.6000, -1.6000, -1.6000, -1.6000, -1.6000, -1.6000, -1.6000,\n", + " -1.6000, -1.6000, -1.6000, -1.6000, -1.6000, -1.6000, -1.6000, -1.6000,\n", + " -1.6000, -1.6000],\n", + " [ 1.3000, 1.3000, 1.3000, 1.3000, 1.3000, 1.3000, 1.3000, 1.3000,\n", + " 1.3000, 1.3000, 1.3000, 1.3000, 1.3000, 1.3000, 1.3000, 1.3000,\n", + " 1.3000, 1.3000, 1.3000, 1.3000, 1.3000, 1.3000, 1.3000, 1.3000,\n", + " 1.3000, -1.4000, -1.4000, -1.4000, -1.4000, -1.4000, -1.4000, -1.4000,\n", + " -1.4000, -1.4000, -1.4000, -1.4000, -1.4000, -1.4000, -1.4000, -1.4000,\n", + " -1.4000, -1.4000, -1.4000, -1.4000, -1.4000, -1.4000, -1.4000, -1.4000,\n", + " -1.4000, -1.4000],\n", + " [ 3.4000, 3.4000, 3.4000, 3.4000, 3.4000, 3.4000, 
3.4000, 3.4000,\n", + " 3.4000, 3.4000, 3.4000, 3.4000, 3.4000, 3.4000, 3.4000, 3.4000,\n", + " 3.4000, 3.4000, 3.4000, 3.4000, 3.4000, 3.4000, 3.4000, 3.4000,\n", + " 3.4000, 4.8000, 4.8000, 4.8000, 4.8000, 4.8000, 4.8000, 4.8000,\n", + " 4.8000, 4.8000, 4.8000, 4.8000, 4.8000, 4.8000, 4.8000, 4.8000,\n", + " 4.8000, 4.8000, 4.8000, 4.8000, 4.8000, 4.8000, 4.8000, 4.8000,\n", + " 4.8000, 4.8000],\n", + " [ 3.5000, 3.5000, 3.5000, 3.5000, 3.5000, 3.5000, 3.5000, 3.5000,\n", + " 3.5000, 3.5000, 3.5000, 3.5000, 3.5000, 3.5000, 3.5000, 3.5000,\n", + " 3.5000, 3.5000, 3.5000, 3.5000, 3.5000, 3.5000, 3.5000, 3.5000,\n", + " 3.5000, 5.0000, 5.0000, 5.0000, 5.0000, 5.0000, 5.0000, 5.0000,\n", + " 5.0000, 5.0000, 5.0000, 5.0000, 5.0000, 5.0000, 5.0000, 5.0000,\n", + " 5.0000, 5.0000, 5.0000, 5.0000, 5.0000, 5.0000, 5.0000, 5.0000,\n", + " 5.0000, 5.0000],\n", + " [ 3.6000, 3.6000, 3.6000, 3.6000, 3.6000, 3.6000, 3.6000, 3.6000,\n", + " 3.6000, 3.6000, 3.6000, 3.6000, 3.6000, 3.6000, 3.6000, 3.6000,\n", + " 3.6000, 3.6000, 3.6000, 3.6000, 3.6000, 3.6000, 3.6000, 3.6000,\n", + " 3.6000, 5.2000, 5.2000, 5.2000, 5.2000, 5.2000, 5.2000, 5.2000,\n", + " 5.2000, 5.2000, 5.2000, 5.2000, 5.2000, 5.2000, 5.2000, 5.2000,\n", + " 5.2000, 5.2000, 5.2000, 5.2000, 5.2000, 5.2000, 5.2000, 5.2000,\n", + " 5.2000, 5.2000],\n", + " [ 3.7000, 3.7000, 3.7000, 3.7000, 3.7000, 3.7000, 3.7000, 3.7000,\n", + " 3.7000, 3.7000, 3.7000, 3.7000, 3.7000, 3.7000, 3.7000, 3.7000,\n", + " 3.7000, 3.7000, 3.7000, 3.7000, 3.7000, 3.7000, 3.7000, 3.7000,\n", + " 3.7000, 5.4000, 5.4000, 5.4000, 5.4000, 5.4000, 5.4000, 5.4000,\n", + " 5.4000, 5.4000, 5.4000, 5.4000, 5.4000, 5.4000, 5.4000, 5.4000,\n", + " 5.4000, 5.4000, 5.4000, 5.4000, 5.4000, 5.4000, 5.4000, 5.4000,\n", + " 5.4000, 5.4000]])\n", + "init tensor([[ 1.6614],\n", + " [ 1.2669],\n", + " [ 1.0617],\n", + " [ 1.6213],\n", + " [ 0.5481],\n", + " [ 0.8339],\n", + " [-0.5228],\n", + " [ 1.3817]]) torch.Size([8, 1])\n", + "ytrue tensor([0., 0., 0., 0., 
0., 0., 0., 0.]) torch.Size([8])\n", + "init tensor([[ 1.6614],\n", + " [ 1.2669],\n", + " [ 1.0617],\n", + " [ 1.6213],\n", + " [ 0.5481],\n", + " [ 0.8339],\n", + " [-0.5228],\n", + " [ 1.3817]]) torch.Size([8, 1])\n", + "sampling tensor([1.1645, 1.0568, 1.0247, 1.2985, 1.9192, 2.0835, 1.5909, 2.4027]) tensor([0.9590, 0.9442, 0.8462, 1.2869, 1.8801, 1.8904, 1.6754, 2.4561]) 0.2\n", + "sampling tensor([0.8836, 0.9277, 0.9385, 1.1648, 2.4521, 2.5062, 2.4702, 2.8325]) tensor([0.7994, 0.8255, 0.6240, 1.1401, 3.1695, 2.1399, 2.7899, 2.5771]) 0.2\n", + "sampling tensor([0.8197, 0.8802, 0.8496, 1.1060, 2.9678, 2.6060, 2.9160, 2.8808]) tensor([0.8848, 0.7844, 1.1254, 1.6118, 3.0499, 2.4084, 2.7343, 2.9893]) 0.2\n", + "sampling tensor([0.8539, 0.8638, 1.0502, 1.2947, 2.9200, 2.7133, 2.8937, 3.0457]) tensor([0.8760, 0.4120, 1.1715, 1.2670, 3.0862, 2.6638, 2.7331, 3.0930]) 0.2\n", + "sampling tensor([0.8504, 0.7148, 1.0686, 1.1568, 2.9345, 2.8155, 2.8933, 3.0872]) tensor([0.9075, 0.8527, 0.9420, 1.3327, 2.7976, 2.9062, 2.9515, 2.9209]) 0.2\n", + "sampling tensor([0.8630, 0.8911, 0.9768, 1.1831, 2.8191, 2.9125, 2.9806, 3.0183]) tensor([0.7525, 1.0182, 0.8974, 1.0517, 2.4905, 3.1085, 2.9722, 2.8542]) 0.2\n", + "sampling tensor([0.8010, 0.9573, 0.9590, 1.0707, 2.6962, 2.9934, 2.9889, 2.9917]) tensor([0.8637, 0.7302, 1.0344, 1.0142, 2.1829, 2.7073, 3.0891, 3.1004]) 0.2\n", + "sampling tensor([0.8455, 0.8421, 1.0138, 1.0557, 2.5731, 2.8329, 3.0356, 3.0902]) tensor([0.7643, 1.0689, 0.7915, 1.1257, 2.4191, 2.8035, 3.1611, 3.3089]) 0.2\n", + "sampling tensor([0.8057, 0.9776, 0.9166, 1.1003, 2.6676, 2.8714, 3.0644, 3.1735]) tensor([0.8245, 1.2252, 0.6474, 1.2027, 2.5290, 2.8380, 2.8644, 2.8440]) 0.2\n", + "sampling tensor([0.8298, 1.0401, 0.8590, 1.1311, 2.7116, 2.8852, 2.9458, 2.9876]) tensor([0.9918, 1.0512, 1.0858, 1.0245, 2.8434, 2.5659, 2.8704, 2.3672]) 0.2\n", + "sampling tensor([0.8967, 0.9705, 1.0343, 1.0598, 2.8374, 2.7764, 2.9482, 2.7969]) tensor([0.8768, 0.8262, 1.2885, 
1.0594, 2.6183, 2.8967, 3.0878, 2.6368]) 0.2\n", + "sampling tensor([0.8507, 0.8805, 1.1154, 1.0738, 2.7473, 2.9087, 3.0351, 2.9047]) tensor([1.1584, 1.1739, 1.4344, 0.7682, 2.9505, 2.8683, 2.7778, 3.0693]) 0.2\n", + "sampling tensor([0.9633, 1.0196, 1.1738, 0.9573, 2.8802, 2.8973, 2.9111, 3.0777]) tensor([0.8413, 0.7604, 0.9851, 1.0909, 3.1127, 2.8327, 3.2868, 2.9644]) 0.2\n", + "sampling tensor([0.8365, 0.8542, 0.9940, 1.0864, 2.9451, 2.8831, 3.1147, 3.0358]) tensor([0.9169, 0.8311, 1.0574, 1.1989, 3.1183, 2.8125, 3.1844, 3.2632]) 0.2\n", + "sampling tensor([0.8667, 0.8824, 1.0230, 1.1296, 2.9473, 2.8750, 3.0737, 3.1553]) tensor([0.8000, 0.5880, 1.1689, 1.1033, 2.8200, 3.0836, 3.1718, 3.3616]) 0.2\n", + "sampling tensor([0.8200, 0.7852, 1.0676, 1.0913, 2.8280, 2.9834, 3.0687, 3.1947]) tensor([0.7002, 1.1055, 0.8529, 0.8479, 2.9574, 2.9752, 3.0332, 3.0946]) 0.2\n", + "sampling tensor([0.7801, 0.9922, 0.9411, 0.9891, 2.8830, 2.9401, 3.0133, 3.0879]) tensor([0.9535, 0.9376, 0.8490, 0.9693, 2.9775, 3.1411, 2.9559, 2.8555]) 0.2\n", + "sampling tensor([0.8814, 0.9250, 0.9396, 1.0377, 2.8910, 3.0064, 2.9823, 2.9922]) tensor([0.8869, 1.0381, 0.9373, 1.1719, 2.7924, 3.3074, 2.5171, 3.3156]) 0.2\n", + "sampling tensor([0.8548, 0.9652, 0.9749, 1.1187, 2.8170, 3.0730, 2.8068, 3.1762]) tensor([0.6742, 1.0000, 0.9904, 0.9320, 2.8353, 3.3518, 2.6693, 3.0751]) 0.2\n", + "sampling tensor([0.7697, 0.9500, 0.9961, 1.0228, 2.8341, 3.0907, 2.8677, 3.0800]) tensor([0.8082, 1.0932, 0.8338, 0.7305, 2.8807, 3.1286, 2.8236, 3.1098]) 0.2\n", + "sampling tensor([0.8233, 0.9873, 0.9335, 0.9422, 2.8523, 3.0014, 2.9295, 3.0939]) tensor([0.8253, 0.9624, 1.2274, 0.8632, 2.7502, 3.2247, 2.8109, 3.2757]) 0.2\n", + "sampling tensor([0.8301, 0.9350, 1.0910, 0.9953, 2.8001, 3.0399, 2.9244, 3.1603]) tensor([0.6143, 0.7999, 1.3726, 0.6261, 2.6870, 2.8580, 2.8266, 3.0368]) 0.2\n", + "sampling tensor([0.7457, 0.8700, 1.1491, 0.9005, 2.7748, 2.8932, 2.9306, 3.0647]) tensor([0.8224, 0.9470, 1.3068, 1.1277, 
2.7519, 2.8896, 2.9404, 3.1502]) 0.2\n", + "sampling tensor([0.8290, 0.9288, 1.1227, 1.1011, 2.8007, 2.9058, 2.9762, 3.1101]) tensor([0.6490, 1.0351, 1.2034, 1.3915, 2.3171, 2.6677, 3.1155, 3.3360]) 0.2\n", + "sampling tensor([-0.7404, -0.4860, -0.3186, -0.1434, 3.3268, 3.5671, 3.8462, 4.0344]) tensor([-0.6961, -0.4971, -0.0775, 0.2463, 3.3115, 3.3958, 3.6888, 3.8712]) 0.2\n", + "sampling tensor([-1.2784, -1.0989, -0.8310, -0.6015, 3.7246, 3.8583, 4.0755, 4.2485]) tensor([-1.1690, -1.3330, -0.9250, -0.9269, 3.7021, 4.1779, 3.9066, 4.0387]) 0.2\n", + "sampling tensor([-1.4676, -1.4332, -1.1700, -1.0708, 3.8808, 4.1712, 4.1626, 4.3155]) tensor([-1.2799, -1.3256, -0.8626, -1.2096, 3.9243, 4.1113, 4.0877, 4.6889]) 0.2\n", + "sampling tensor([-1.5120, -1.4303, -1.1450, -1.1838, 3.9697, 4.1445, 4.2351, 4.5756]) tensor([-1.3311, -1.4066, -0.7662, -1.2684, 4.1184, 4.1596, 4.4624, 4.1900]) 0.2\n", + "sampling tensor([-1.5324, -1.4627, -1.1065, -1.2074, 4.0473, 4.1639, 4.3850, 4.3760]) tensor([-1.6437, -1.4913, -0.5709, -1.0727, 4.1306, 3.9616, 4.4953, 4.5742]) 0.2\n", + "sampling tensor([-1.6575, -1.4965, -1.0283, -1.1291, 4.0522, 4.0847, 4.3981, 4.5297]) tensor([-1.8631, -1.6930, -1.4019, -1.0118, 3.9594, 3.8904, 4.1861, 4.6715]) 0.2\n", + "sampling tensor([-1.7452, -1.5772, -1.3608, -1.1047, 3.9838, 4.0561, 4.2744, 4.5686]) tensor([-1.7631, -1.7439, -1.2705, -1.2866, 3.9983, 4.2274, 4.0546, 4.6271]) 0.2\n", + "sampling tensor([-1.7053, -1.5976, -1.3082, -1.2146, 3.9993, 4.1910, 4.2219, 4.5508]) tensor([-1.5366, -1.5879, -1.0659, -1.5067, 4.3592, 4.2945, 3.9881, 4.7027]) 0.2\n", + "sampling tensor([-1.6146, -1.5351, -1.2264, -1.3027, 4.1437, 4.2178, 4.1953, 4.5811]) tensor([-1.9884, -1.4237, -1.1947, -1.4989, 3.9685, 4.1652, 4.1970, 4.5946]) 0.2\n", + "sampling tensor([-1.7954, -1.4695, -1.2779, -1.2996, 3.9874, 4.1661, 4.2788, 4.5379]) tensor([-1.8362, -1.5743, -1.3397, -1.2107, 3.7901, 3.8011, 4.3986, 4.7661]) 0.2\n", + "sampling tensor([-1.7345, -1.5297, -1.3359, 
-1.1843, 3.9161, 4.0204, 4.3594, 4.6065]) tensor([-1.7281, -1.4548, -1.3361, -0.9866, 4.1320, 3.7112, 4.3819, 4.2332]) 0.2\n", + "sampling tensor([-1.6912, -1.4819, -1.3344, -1.0946, 4.0528, 3.9845, 4.3528, 4.3933]) tensor([-1.9101, -1.4205, -1.0189, -1.2292, 4.3946, 4.0302, 4.4955, 4.1212]) 0.2\n", + "sampling tensor([-1.7641, -1.4682, -1.2076, -1.1917, 4.1578, 4.1121, 4.3982, 4.3485]) tensor([-1.6857, -1.4738, -1.0328, -1.6376, 3.9828, 3.6963, 4.5198, 4.2196]) 0.2\n", + "sampling tensor([-1.6743, -1.4895, -1.2131, -1.3550, 3.9931, 3.9785, 4.4079, 4.3878]) tensor([-1.7335, -1.5964, -0.8609, -1.5713, 3.7540, 4.0331, 4.0207, 4.1290]) 0.2\n", + "sampling tensor([-1.6934, -1.5386, -1.1444, -1.3285, 3.9016, 4.1132, 4.2083, 4.3516]) tensor([-1.7766, -1.5831, -1.1011, -1.2238, 4.0234, 4.2672, 4.3486, 4.3273]) 0.2\n", + "sampling tensor([-1.7106, -1.5333, -1.2404, -1.1895, 4.0094, 4.2069, 4.3395, 4.4309]) tensor([-2.0009, -1.9794, -1.5991, -1.3017, 3.8831, 4.2267, 4.2345, 4.2356]) 0.2\n", + "sampling tensor([-1.8004, -1.6917, -1.4397, -1.2207, 3.9532, 4.1907, 4.2938, 4.3942]) tensor([-1.5994, -1.4702, -1.3363, -1.2015, 3.9650, 3.9079, 4.2204, 4.7107]) 0.2\n", + "sampling tensor([-1.6397, -1.4881, -1.3345, -1.1806, 3.9860, 4.0632, 4.2882, 4.5843]) tensor([-1.7306, -1.1702, -1.0202, -1.2503, 4.0385, 3.9392, 4.1451, 4.6010]) 0.2\n", + "sampling tensor([-1.6923, -1.3681, -1.2081, -1.2001, 4.0154, 4.0757, 4.2580, 4.5404]) tensor([-1.6326, -0.9675, -1.0959, -1.5258, 3.7411, 3.8427, 4.0080, 4.6235]) 0.2\n", + "sampling tensor([-1.6531, -1.2870, -1.2384, -1.3103, 3.8964, 4.0371, 4.2032, 4.5494]) tensor([-1.5052, -1.4606, -1.1210, -1.3427, 3.6279, 4.1991, 4.4115, 4.6487]) 0.2\n", + "sampling tensor([-1.6021, -1.4842, -1.2484, -1.2371, 3.8512, 4.1796, 4.3646, 4.5595]) tensor([-1.2590, -1.7062, -1.1769, -1.3045, 3.8121, 4.5182, 4.1822, 4.5201]) 0.2\n", + "sampling tensor([-1.5036, -1.5825, -1.2708, -1.2218, 3.9249, 4.3073, 4.2729, 4.5080]) tensor([-1.3670, -1.4537, -1.5777, -1.1912, 
3.8517, 4.3122, 4.2566, 4.8096]) 0.2\n", + "sampling tensor([-1.5468, -1.4815, -1.4311, -1.1765, 3.9407, 4.2249, 4.3026, 4.6238]) tensor([-1.3126, -1.3427, -1.2613, -1.1219, 4.0166, 4.2344, 4.5727, 4.6434]) 0.2\n", + "sampling tensor([-1.5250, -1.4371, -1.3045, -1.1488, 4.0066, 4.1938, 4.4291, 4.5574]) tensor([-1.8126, -1.0757, -1.5558, -0.8167, 3.8255, 4.0765, 4.6362, 4.6970]) 0.2\n", + "sampling tensor([-1.7250, -1.3303, -1.4223, -1.0267, 3.9302, 4.1306, 4.4545, 4.5788]) tensor([-1.4974, -1.4293, -1.4973, -0.9230, 3.7944, 4.4144, 4.6211, 4.2113]) 0.2\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_259967/3164560226.py:57: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown\n", + " fig.show()\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# true ar1 process\n", + "T = 50\n", + "n_series = n_series\n", + "\n", + "true_phi = .4\n", + "true_sigma = .2\n", + "\n", + "true_contribution = 0.5\n", + "\n", + " # 2D tensor of shape (n_series, 1) with random values\n", + " #set seed\n", + "torch.manual_seed(1)\n", + "\n", + "with pyro.plate(\"series\", n_series, dim = -2):\n", + " init = pyro.sample(\"init\", dist.Normal(1, 1))\n", + " \n", + "mid = T//2\n", + "\n", + "predictor = torch.zeros((n_series,T))\n", + "for i in range(n_series//2):\n", + " predictor[i,:mid] = 1. + i/10\n", + " predictor[i, mid:] = -2. + 2*i/10\n", + "\n", + "for i in range(n_series//2, n_series):\n", + " predictor[i,:mid] = 3. + i/10\n", + " predictor[i, mid:] = 4. + 2*i/10\n", + "\n", + "\n", + "print(\"predictor\", predictor)\n", + "\n", + "print(\"init\", init, init.shape)\n", + "y_true = torch.zeros((n_series,T))\n", + "y_exp_true = torch.zeros( (n_series,T))\n", + "y_prev_true = torch.zeros((n_series,T))\n", + "\n", + "\n", + "y_exp_true[:,0] = true_contribution * predictor[:,0]\n", + "print(\"ytrue\", y_true[:,0], y_true[:,0].shape)\n", + "print(\"init\", init, init.shape)\n", + "y_true[:,0] = init.squeeze()\n", + "\n", + "\n", + "for t in range(1, T):\n", + " \n", + " y_prev_true[:,t] = y_true[:,t-1]\n", + " y_exp_true[:,t] = true_phi * y_prev_true[...,:,t] + true_contribution * predictor[...,:,t] \n", + " \n", + " y_true[:,t] = pyro.sample(f\"y_{t}\", dist.Normal(y_exp_true[:,t], true_sigma))\n", + " print(\"sampling\", y_exp_true[:,t], y_true[:,t], true_sigma)\n", + " \n", + "\n", + "fig, ax= plot_ts(y_true, title=f\"{n_series} true AR(1) processes\", xlabel=\"t\", ylabel=\"y\",)\n", + "fig.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[ 1.6614],\n", + " [ 1.2669],\n", + " [ 1.0617],\n", + " [ 1.6213],\n", + 
" [ 0.5481],\n", + " [ 0.8339],\n", + " [-0.5228],\n", + " [ 1.3817]])\n" + ] + } + ], + "source": [ + "class AR1model(pyro.nn.PyroModule):\n", + " def __init__(self):\n", + " super().__init__()\n", + "\n", + " def forward(self, outcome_obs = None, predictor_obs = None,\n", + " initial_obs = None, T = None, no_series = n_series):\n", + "\n", + " if outcome_obs is not None:\n", + " T = outcome_obs.shape[-1]\n", + "\n", + "\n", + " phi = pyro.sample(\"phi\", dist.Normal(1., 0.4)) \n", + " sigma = pyro.sample(\"sigma\", dist.Uniform(0.001, 1.0))\n", + " \n", + "\n", + " contribution = pyro.sample(\"contribution\", dist.Normal(0.02, 1.))\n", + "\n", + " series_plate = pyro.plate(\"series\", no_series, dim = -2)\n", + "\n", + " time_plate = pyro.plate(\"time\", T, dim=-1)\n", + "\n", + " with series_plate:\n", + " \n", + " with time_plate:\n", + " predictor = pyro.sample( \"predictor\", dist.Normal(0.0, 1.0), obs=predictor_obs)\n", + "\n", + " \n", + " y_ts = {}\n", + " y_exp = {}\n", + " y_prev = {}\n", + "\n", + " \n", + " y_prev[0] = torch.zeros_like(predictor[...,:,0].unsqueeze(-1))\n", + "\n", + "\n", + " with series_plate:\n", + "\n", + " y_exp[0] = contribution * predictor[...,:,0].unsqueeze(-1)\n", + " \n", + " y_ts[0]= pyro.sample(\"y_0\", dist.Normal(y_exp[0], sigma), obs=initial_obs)\n", + "\n", + "\n", + " for t in range(1, T):\n", + " \n", + " with series_plate:\n", + " y_prev[t] = y_ts[t-1]\n", + " pred_slice = predictor[...,:,t].unsqueeze(-1)\n", + " y_exp[t] = pyro.deterministic(f\"y_exp_{t}\", phi * y_prev[t] + contribution * pred_slice)\n", + " \n", + " y_ts[t] = pyro.sample(f\"y_{t}\", dist.Normal(y_exp[t], sigma), \n", + " obs=outcome_obs[:,t].unsqueeze(-1) if outcome_obs is not None else None)\n", + " \n", + " y_ts_stacked = pyro.deterministic(\"y_stacked\", torch.cat(list(y_ts.values()), dim=1))\n", + " \n", + " return y_ts, y_ts_stacked\n", + "\n", + "ar1_model = AR1model()\n", + "\n", + "print(init)\n", + "\n", + "with condition(data = {\"phi\": 
true_phi, \"sigma\": true_sigma, \"contribution\": true_contribution}):\n", + " with pyro.poutine.trace() as tr:\n", + " _, y_intermediate = ar1_model(outcome_obs=None, initial_obs=init, \n", + " predictor_obs=predictor, T=T, no_series=n_series)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "odict_keys(['phi', 'sigma', 'contribution', 'series', 'time', 'init', 'predictor', 'y_0', 'y_exp_1', 'y_1', 'y_exp_2', 'y_2', 'y_exp_3', 'y_3', 'y_exp_4', 'y_4', 'y_exp_5', 'y_5', 'y_exp_6', 'y_6', 'y_exp_7', 'y_7', 'y_exp_8', 'y_8', 'y_exp_9', 'y_9', 'y_exp_10', 'y_10', 'y_exp_11', 'y_11', 'y_exp_12', 'y_12', 'y_exp_13', 'y_13', 'y_exp_14', 'y_14', 'y_exp_15', 'y_15', 'y_exp_16', 'y_16', 'y_exp_17', 'y_17', 'y_exp_18', 'y_18', 'y_exp_19', 'y_19', 'y_exp_20', 'y_20', 'y_exp_21', 'y_21', 'y_exp_22', 'y_22', 'y_exp_23', 'y_23', 'y_exp_24', 'y_24', 'y_exp_25', 'y_25', 'y_exp_26', 'y_26', 'y_exp_27', 'y_27', 'y_exp_28', 'y_28', 'y_exp_29', 'y_29', 'y_exp_30', 'y_30', 'y_exp_31', 'y_31', 'y_exp_32', 'y_32', 'y_exp_33', 'y_33', 'y_exp_34', 'y_34', 'y_exp_35', 'y_35', 'y_exp_36', 'y_36', 'y_exp_37', 'y_37', 'y_exp_38', 'y_38', 'y_exp_39', 'y_39', 'y_exp_40', 'y_40', 'y_exp_41', 'y_41', 'y_exp_42', 'y_42', 'y_exp_43', 'y_43', 'y_exp_44', 'y_44', 'y_exp_45', 'y_45', 'y_exp_46', 'y_46', 'y_exp_47', 'y_47', 'y_exp_48', 'y_48', 'y_exp_49', 'y_49', 'y_prev_stacked', 'y_exp_stacked', 'y_stacked'])\n", + "tensor([0.5000, 0.5500, 0.6000, 0.6500, 1.7000, 1.7500, 1.8000, 1.8500]) tensor([0.5000, 0.5500, 0.6000, 0.6500, 1.7000, 1.7500, 1.8000, 1.8500])\n", + "tensor([1.1645, 1.0568, 1.0247, 1.2985, 1.9192, 2.0835, 1.5909, 2.4027]) tensor([1.1645, 1.0568, 1.0247, 1.2985, 1.9192, 2.0835, 1.5909, 2.4027])\n", + "tensor([0.8836, 0.9277, 0.9385, 1.1648, 2.4521, 2.5062, 2.4702, 2.8325]) tensor([1.0502, 0.9273, 1.0581, 1.3108, 2.4210, 2.6779, 2.2307, 2.8185])\n", + "tensor([0.8197, 0.8802, 
0.8496, 1.1060, 2.9678, 2.6060, 2.9160, 2.8808]) tensor([1.0123, 0.8076, 0.9897, 1.4341, 2.6439, 2.8260, 2.7146, 3.0477])\n", + "tensor([0.8539, 0.8638, 1.0502, 1.2947, 2.9200, 2.7133, 2.8937, 3.0457]) tensor([0.8916, 0.9109, 0.9697, 1.2109, 2.7454, 2.7820, 2.7742, 3.0364])\n", + "tensor([0.8504, 0.7148, 1.0686, 1.1568, 2.9345, 2.8155, 2.8933, 3.0872]) tensor([0.8743, 0.9516, 1.1500, 1.0170, 2.7246, 2.7563, 2.7727, 3.0428])\n", + "tensor([0.8630, 0.8911, 0.9768, 1.1831, 2.8191, 2.9125, 2.9806, 3.0183]) tensor([0.9627, 1.0049, 1.0865, 1.0314, 2.8402, 2.6880, 2.9638, 3.0245])\n", + "tensor([0.8010, 0.9573, 0.9590, 1.0707, 2.6962, 2.9934, 2.9889, 2.9917]) tensor([0.8729, 0.8960, 1.0738, 1.0944, 2.8961, 2.8426, 3.0439, 3.1306])\n", + "tensor([0.8455, 0.8421, 1.0138, 1.0557, 2.5731, 2.8329, 3.0356, 3.0902]) tensor([0.8652, 0.9206, 1.0096, 1.2486, 2.9276, 2.9358, 2.9242, 3.1912])\n", + "tensor([0.8057, 0.9776, 0.9166, 1.1003, 2.6676, 2.8714, 3.0644, 3.1735]) tensor([0.8726, 1.1580, 0.9862, 1.1014, 2.8368, 2.8784, 2.9254, 3.0789])\n", + "tensor([0.8298, 1.0401, 0.8590, 1.1311, 2.7116, 2.8852, 2.9458, 2.9876]) tensor([0.8158, 1.0143, 1.0263, 1.0887, 2.8517, 2.8923, 2.9601, 3.2067])\n", + "tensor([0.8967, 0.9705, 1.0343, 1.0598, 2.8374, 2.7764, 2.9482, 2.7969]) tensor([0.8760, 0.9548, 1.0525, 1.0189, 2.9125, 3.0329, 3.0194, 3.1012])\n", + "tensor([0.8507, 0.8805, 1.1154, 1.0738, 2.7473, 2.9087, 3.0351, 2.9047]) tensor([0.8096, 0.8901, 1.0790, 0.9685, 2.7349, 3.0110, 3.0408, 3.1209])\n", + "tensor([0.9633, 1.0196, 1.1738, 0.9573, 2.8802, 2.8973, 2.9111, 3.0777]) tensor([0.8745, 1.0499, 0.9375, 1.1455, 2.6899, 3.0113, 2.9424, 3.1455])\n", + "tensor([0.8365, 0.8542, 0.9940, 1.0864, 2.9451, 2.8831, 3.1147, 3.0358]) tensor([0.8359, 0.9203, 0.8611, 1.0428, 2.8268, 3.0387, 3.0766, 2.9582])\n", + "tensor([0.8667, 0.8824, 1.0230, 1.1296, 2.9473, 2.8750, 3.0737, 3.1553]) tensor([0.8258, 0.9340, 0.9788, 1.0379, 2.7741, 2.9866, 3.1595, 3.0339])\n", + "tensor([0.8200, 0.7852, 1.0676, 
1.0913, 2.8280, 2.9834, 3.0687, 3.1947]) tensor([0.7733, 0.8297, 1.0582, 1.1030, 2.9505, 2.8937, 3.0980, 2.9975])\n", + "tensor([0.7801, 0.9922, 0.9411, 0.9891, 2.8830, 2.9401, 3.0133, 3.0879]) tensor([0.8556, 0.8375, 1.0603, 1.0193, 2.9080, 2.8303, 2.9387, 2.9179])\n", + "tensor([0.8814, 0.9250, 0.9396, 1.0377, 2.8910, 3.0064, 2.9823, 2.9922]) tensor([0.9416, 1.0208, 1.0046, 1.0556, 2.8464, 3.0221, 3.0801, 3.0434])\n", + "tensor([0.8548, 0.9652, 0.9749, 1.1187, 2.8170, 3.0730, 2.8068, 3.1762]) tensor([0.7694, 0.9122, 1.1662, 0.9579, 2.8433, 2.9017, 3.0328, 2.9312])\n", + "tensor([0.7697, 0.9500, 0.9961, 1.0228, 2.8341, 3.0907, 2.8677, 3.0800]) tensor([0.8695, 0.8874, 1.1264, 1.0153, 2.8599, 2.9131, 2.9261, 2.8543])\n", + "tensor([0.8233, 0.9873, 0.9335, 0.9422, 2.8523, 3.0014, 2.9295, 3.0939]) tensor([1.0712, 0.9299, 1.0466, 1.0515, 2.8432, 2.9389, 2.9508, 3.1740])\n", + "tensor([0.8301, 0.9350, 1.0910, 0.9953, 2.8001, 3.0399, 2.9244, 3.1603]) tensor([0.9582, 0.9080, 1.1097, 0.9560, 2.8583, 3.0293, 3.0668, 3.1522])\n", + "tensor([0.7457, 0.8700, 1.1491, 0.9005, 2.7748, 2.8932, 2.9306, 3.0647]) tensor([0.8080, 0.8076, 1.1719, 1.1376, 2.7944, 2.8864, 3.0133, 2.9750])\n", + "tensor([0.8290, 0.9288, 1.1227, 1.1011, 2.8007, 2.9058, 2.9762, 3.1101]) tensor([0.6574, 0.9978, 1.0282, 0.9728, 2.8105, 2.8191, 2.9675, 3.0369])\n", + "tensor([-0.7404, -0.4860, -0.3186, -0.1434, 3.3268, 3.5671, 3.8462, 4.0344]) tensor([-0.7375, -0.5097, -0.3774, -0.2913, 3.4742, 3.5643, 3.8861, 3.9855])\n", + "tensor([-1.2784, -1.0989, -0.8310, -0.6015, 3.7246, 3.8583, 4.0755, 4.2485]) tensor([-1.3301, -1.1797, -1.0378, -0.8174, 3.8348, 3.8820, 4.1075, 4.2840])\n", + "tensor([-1.4676, -1.4332, -1.1700, -1.0708, 3.8808, 4.1712, 4.1626, 4.3155]) tensor([-1.7230, -1.3804, -1.1281, -1.0717, 3.9592, 4.1742, 4.2534, 4.3393])\n", + "tensor([-1.5120, -1.4303, -1.1450, -1.1838, 3.9697, 4.1445, 4.2351, 4.5756]) tensor([-1.7691, -1.5664, -1.3102, -1.1311, 3.9939, 4.1693, 4.1149, 4.4613])\n", + 
"tensor([-1.5324, -1.4627, -1.1065, -1.2074, 4.0473, 4.1639, 4.3850, 4.3760]) tensor([-1.8056, -1.3707, -1.1896, -1.0990, 3.8874, 4.1342, 4.1281, 4.4463])\n", + "tensor([-1.6575, -1.4965, -1.0283, -1.1291, 4.0522, 4.0847, 4.3981, 4.5297]) tensor([-1.6782, -1.5440, -1.2719, -1.2843, 4.1154, 4.2316, 4.2942, 4.5359])\n", + "tensor([-1.7452, -1.5772, -1.3608, -1.1047, 3.9838, 4.0561, 4.2744, 4.5686]) tensor([-1.5704, -1.5229, -1.1246, -1.3332, 3.9616, 4.0951, 4.3739, 4.6010])\n", + "tensor([-1.7053, -1.5976, -1.3082, -1.2146, 3.9993, 4.1910, 4.2219, 4.5508]) tensor([-1.5132, -1.4700, -1.1067, -1.1595, 4.0973, 4.1015, 4.3584, 4.5211])\n", + "tensor([-1.6146, -1.5351, -1.2264, -1.3027, 4.1437, 4.2178, 4.1953, 4.5811]) tensor([-1.5861, -1.3731, -1.2480, -1.0940, 4.0933, 4.1043, 4.3262, 4.3598])\n", + "tensor([-1.7954, -1.4695, -1.2779, -1.2996, 3.9874, 4.1661, 4.2788, 4.5379]) tensor([-1.6687, -1.2991, -1.2302, -1.1314, 4.1098, 4.2381, 4.4349, 4.4850])\n", + "tensor([-1.7345, -1.5297, -1.3359, -1.1843, 3.9161, 4.0204, 4.3594, 4.6065]) tensor([-1.7697, -1.5314, -1.3945, -1.1723, 4.0106, 4.2833, 4.3849, 4.3810])\n", + "tensor([-1.6912, -1.4819, -1.3344, -1.0946, 4.0528, 3.9845, 4.3528, 4.3933]) tensor([-1.6579, -1.6825, -1.3609, -1.1451, 3.8681, 4.1977, 4.2091, 4.2931])\n", + "tensor([-1.7641, -1.4682, -1.2076, -1.1917, 4.1578, 4.1121, 4.3982, 4.3485]) tensor([-1.6826, -1.6192, -1.2888, -1.2383, 3.9444, 4.0677, 4.2555, 4.5724])\n", + "tensor([-1.6743, -1.4895, -1.2131, -1.3550, 3.9931, 3.9785, 4.4079, 4.3878]) tensor([-1.6393, -1.5431, -1.4845, -1.0267, 4.0036, 4.0963, 4.2806, 4.5675])\n", + "tensor([-1.6934, -1.5386, -1.1444, -1.3285, 3.9016, 4.1132, 4.2083, 4.3516]) tensor([-1.6855, -1.5984, -1.4361, -1.0818, 3.9685, 4.1836, 4.3519, 4.6738])\n", + "tensor([-1.7106, -1.5333, -1.2404, -1.1895, 4.0094, 4.2069, 4.3395, 4.4309]) tensor([-1.7210, -1.6293, -1.3275, -1.1264, 4.0605, 4.1694, 4.1924, 4.4684])\n", + "tensor([-1.8004, -1.6917, -1.4397, -1.2207, 3.9532, 4.1907, 
4.2938, 4.3942]) tensor([-1.7798, -1.6624, -1.2498, -1.2893, 3.8786, 4.1006, 4.2881, 4.4843])\n", + "tensor([-1.6397, -1.4881, -1.3345, -1.1806, 3.9860, 4.0632, 4.2882, 4.5843]) tensor([-1.7135, -1.6286, -1.2257, -1.2059, 3.9408, 4.1294, 4.3063, 4.6547])\n", + "tensor([-1.6923, -1.3681, -1.2081, -1.2001, 4.0154, 4.0757, 4.2580, 4.5404]) tensor([-1.5569, -1.4990, -1.3459, -1.2117, 3.9104, 4.1156, 4.3978, 4.6076])\n", + "tensor([-1.6531, -1.2870, -1.2384, -1.3103, 3.8964, 4.0371, 4.2032, 4.5494]) tensor([-1.5990, -1.3875, -1.3124, -1.1304, 3.9873, 4.1873, 4.3362, 4.4175])\n", + "tensor([-1.6021, -1.4842, -1.2484, -1.2371, 3.8512, 4.1796, 4.3646, 4.5595]) tensor([-1.7088, -1.2711, -1.3150, -1.2897, 4.0126, 4.0010, 4.3126, 4.4318])\n", + "tensor([-1.5036, -1.5825, -1.2708, -1.2218, 3.9249, 4.3073, 4.2729, 4.5080]) tensor([-1.6650, -1.2157, -1.3652, -1.3055, 4.0069, 4.2310, 4.4414, 4.5414])\n", + "tensor([-1.5468, -1.4815, -1.4311, -1.1765, 3.9407, 4.2249, 4.3026, 4.6238]) tensor([-1.6460, -1.3221, -1.5031, -1.2773, 3.8542, 4.1358, 4.3750, 4.5518])\n", + "tensor([-1.5250, -1.4371, -1.3045, -1.1488, 4.0066, 4.1938, 4.4291, 4.5574]) tensor([-1.6855, -1.3387, -1.5088, -1.3580, 4.0307, 4.1735, 4.3126, 4.4787])\n", + "tensor([-1.7250, -1.3303, -1.4223, -1.0267, 3.9302, 4.1306, 4.4545, 4.5788]) tensor([-1.5945, -1.3868, -1.3389, -1.2973, 3.9685, 4.2125, 4.3691, 4.4954])\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "print(tr.trace.nodes.keys())\n", + "\n", + "assert tr.trace.nodes['phi']['value']== true_phi\n", + "assert tr.trace.nodes['sigma']['value']== true_sigma\n", + "assert tr.trace.nodes['contribution']['value']== true_contribution\n", + "assert torch.equal(tr.trace.nodes['predictor']['value'], predictor) \n", + "\n", + "\n", + "max_graph = 1.2 * torch.max(y_exp_true)\n", + "min_graph = 1.2 * torch.min(y_exp_true)\n", + "plt.scatter( y_exp_true.flatten(), tr.trace.nodes[\"y_exp_stacked\"]['value'].flatten())\n", + "plt.plot([min_graph, max_graph], [min_graph, max_graph], linestyle='--', color='red')\n", + "\n", + "plt.xlabel(\"True y_exp\")\n", + "plt.ylabel(\"Estimated y_exp\")\n", + "plt.title(\"Estimated vs True y_exp\")\n", + "plt.show()\n", + "\n", + "assert torch.allclose(y_intermediate, y_true, atol= true_sigma * 8)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([8, 50]) torch.Size([8, 50]) torch.Size([8, 1])\n", + "[iteration 0001] loss: 2812.8303\n", + "[iteration 0100] loss: 2630.1743\n", + "[iteration 0200] loss: 2557.0708\n", + "[iteration 0300] loss: 2576.9009\n", + "[iteration 0400] loss: 2599.6711\n", + "[iteration 0500] loss: 2560.7441\n", + "[iteration 0600] loss: 2564.7576\n", + "[iteration 0700] loss: 2572.7246\n", + "[iteration 0800] loss: 2556.7070\n", + "[iteration 0900] loss: 2558.2761\n", + "[iteration 1000] loss: 2556.4187\n", + "[iteration 1100] loss: 2561.4504\n", + "[iteration 1200] loss: 2557.6111\n", + "[iteration 1300] loss: 2557.6667\n", + "[iteration 1400] loss: 2571.2625\n", + "[iteration 1500] loss: 2557.4009\n", + "[iteration 1600] loss: 2560.4299\n", + "[iteration 1700] loss: 2555.8784\n", + "[iteration 1800] loss: 2559.1411\n", + "[iteration 1900] loss: 2556.8948\n", + "[iteration 2000] loss: 2571.1294\n" + ] + }, + { + 
"data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def run_svi_inference(\n", + " model,\n", + " n_steps=500,\n", + " verbose=True,\n", + " lr=0.03,\n", + " vi_family=AutoMultivariateNormal,\n", + " guide=None,\n", + " **model_kwargs,\n", + "):\n", + " losses = []\n", + " if guide is None:\n", + " guide = vi_family(model, init_loc_fn=init_to_mean)\n", + " elbo = pyro.infer.Trace_ELBO()(model, guide)\n", + " # initialize parameters\n", + " elbo(**model_kwargs)\n", + " adam = torch.optim.Adam(elbo.parameters(), lr=lr)\n", + " # Do gradient steps\n", + " for step in range(1, n_steps + 1):\n", + " adam.zero_grad()\n", + " loss = elbo(**model_kwargs)\n", + " loss.backward()\n", + " losses.append(loss.item())\n", + " adam.step()\n", + " if (step % 100 == 0) or (step == 1) & verbose:\n", + " print(\"[iteration %04d] loss: %.4f\" % (step, loss))\n", + "\n", + " plt.plot(losses)\n", + "\n", + " return guide\n", + "\n", + "pyro.clear_param_store()\n", + "\n", + "print(y_true.shape, predictor.shape, init.shape)\n", + "guide = run_svi_inference(ar1_model, n_steps=2000, lr=0.03, \n", + " vi_family= AutoMultivariateNormal,\n", + " outcome_obs = y_intermediate, predictor_obs = predictor, \n", + " initial_obs = init, no_series = n_series)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "predictive = pyro.infer.Predictive(ar1_model, guide=guide, num_samples = n_samples)\n", + "samples = predictive(initial_obs=init, predictor_obs = predictor, T = T)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "phi, sigma, contribution = samples[\"phi\"].flatten(), samples[\"sigma\"].flatten(), samples[\"contribution\"].flatten()\n", + "phi, sigma = samples[\"phi\"].flatten(), samples[\"sigma\"].flatten()#, samples[\"contribution\"].flatten()\n", + "phi_color = 'blue'\n", + "sigma_color = 'green'\n", + "contribution_color = 'purple'\n", + "\n", + "sites = [phi, sigma, contribution]\n", + "names = ['phi', 'sigma', 'contribution']\n", + "colors = [phi_color, sigma_color, contribution_color]\n", + "\n", + "for i in range(len(sites)):\n", + " plt.hist(sites[i].numpy(), bins=30, alpha=0.5)\n", + " plt.axvline(sites[i].mean().item(), color=colors[i], linestyle='dashed', linewidth=1, label=f'mean {names[i]}')\n", + " plt.axvline(eval(f'true_{names[i]}'), color=colors[i], linewidth=1, label=f'true {names[i]}')\n", + " plt.title(f\"Posterior distribution of {names[i]}\")\n", + " plt.xlim(0, 1)\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "n_rows = math.ceil(n_series / 2)\n", + "\n", + "fig, axs = plt.subplots(n_rows, 2, figsize=(10, n_rows * 3))\n", + "\n", + "for series in range(n_series):\n", + "\n", + " ax = axs[series // 2, series % 2] if n_rows > 1 else axs[series % 2]\n", + " \n", + " ax.plot(y_true[series, :].detach().numpy(), label=\"true\")\n", + " mean_pred = samples['y_stacked'][...,series,:].mean(dim = -4).squeeze()\n", + " low_pred = samples['y_stacked'][...,series,:].quantile(0.05, dim=-4).squeeze()\n", + " high_pred = samples['y_stacked'][...,series,:].quantile(0.95, dim=-4).squeeze()\n", + "\n", + " overall_mean = y_true[series,:].mean()\n", + " null_residuals = y_true[series,:] - overall_mean\n", + " model_residuals = y_true[series,:] - mean_pred\n", + " # r^2 \n", + " r2 = 1 - (model_residuals.var() / null_residuals.var())\n", + "\n", + "\n", + " ax.plot(mean_pred.detach().numpy(), label=\"mean prediction\")\n", + "\n", + " ax.fill_between(range(T), low_pred.detach().numpy(),\n", + " high_pred.detach().numpy(), alpha=0.5, label=\"95% credible interval\")\n", + "\n", + "\n", + " ax.set_title(f\"Series {series + 1}, $R^2$ = {r2:.2f}\")\n", + "\n", + " if series == 0:\n", + " ax.legend()\n", + " \n", + "sns.despine()\n", + "plt.tight_layout()\n", + "plt.show() \n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "polis-dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/experimental_notebooks/.Rhistory b/docs/experimental_notebooks/.Rhistory new file mode 100644 index 00000000..e86675ec --- /dev/null +++ b/docs/experimental_notebooks/.Rhistory @@ -0,0 +1,87 
@@ +require(dagitty) +# with zones +zones_dag <- dagitty('dag { +year [pos="0,2"] +month [pos="1,2"] +limit_con [pos="2,1"] +parcel_area [pos="2,0"] +ward_id [pos="3.5,0.2"] +zone_id [pos = "1, 0"] +neighborhood_id [pos = "4, 0"] +housing_units [pos = "5,1"] +past_reform [pos ="0, .5"] +past_reform_by_zone [pos = "0,1"] +year -> housing_units +month -> housing_units +limit_con -> housing_units +parcel_area -> housing_units +ward_id -> housing_units +zone_id -> housing_units +neighborhood_id -> housing_units +neighborhood_id -> parcel_area +zone_id -> past_reform_by_zone +zone_id -> parcel_area +past_reform -> past_reform_by_zone +past_reform_by_zone -> limit_con +}') +plot(zones_dag) +paths(zones_dag,"limit_con","housing_units") +adjustmentSets(zones_dag,"limit_con","housing_units",type = "all") +impliedConditionalIndependencies(zones_dag) +#--------------------------------------------------------------------------------------- +# with distances +zones_dag <- dagitty('dag { +year [pos="0,2"] +month [pos="1,2"] +limit_con [pos="2,1"] +parcel_area [pos="2,0"] +ward_id [pos="3.5,0.2"] +zone_id [pos = "1, 0"] +neighborhood_id [pos = "4, 0"] +housing_units [pos = "5,1"] +past_reform [pos ="0, .5"] +past_reform_by_zone [pos = "0,1"] +year -> housing_units +month -> housing_units +limit_con -> housing_units +parcel_area -> housing_units +ward_id -> housing_units +zone_id -> housing_units +neighborhood_id -> housing_units +neighborhood_id -> parcel_area +zone_id -> past_reform_by_zone +zone_id -> parcel_area +past_reform -> past_reform_by_zone +past_reform_by_zone -> limit_con +}') +plot(zones_dag) +paths(zones_dag,"limit_con","housing_units") +adjustmentSets(zones_dag,"limit_con","housing_units",type = "all") +impliedConditionalIndependencies(zones_dag) +require(dagitty) +# with zones +tracts_dag <- dagitty('dag { +year [pos="0,2"] +distance [pos = "0,0"] +total_value [pos = "1,0"] +median_value [pos = "1.2,0.3"] +limit [pos="1,1"] +units [pos = "2,1"] +distance -> limit 
+distance -> total_value +distance -> median_value +distance -> units +year -> limit +year -> total_value +year -> median_value +year -> units +total_value -> units +median_value -> units +limit -> total_value +limit -> median_value +limit -> units +}') +plot(tracts_dag) +paths(tracts_dag,"limit","units") +adjustmentSets(tracts_dag,"limit","units", type = "all") +impliedConditionalIndependencies(tracts_dag) diff --git a/docs/experimental_notebooks/.Rproj.user/735F60EC/pcs/files-pane.pper b/docs/experimental_notebooks/.Rproj.user/735F60EC/pcs/files-pane.pper new file mode 100644 index 00000000..257a7704 --- /dev/null +++ b/docs/experimental_notebooks/.Rproj.user/735F60EC/pcs/files-pane.pper @@ -0,0 +1,9 @@ +{ + "sortOrder": [ + { + "columnIndex": 2, + "ascending": true + } + ], + "path": "~/s78projects/cities/docs/experimental_notebooks/zoning" +} \ No newline at end of file diff --git a/docs/experimental_notebooks/.Rproj.user/735F60EC/pcs/source-pane.pper b/docs/experimental_notebooks/.Rproj.user/735F60EC/pcs/source-pane.pper new file mode 100644 index 00000000..b074a4fe --- /dev/null +++ b/docs/experimental_notebooks/.Rproj.user/735F60EC/pcs/source-pane.pper @@ -0,0 +1,3 @@ +{ + "activeTab": 1 +} \ No newline at end of file diff --git a/docs/experimental_notebooks/.Rproj.user/735F60EC/pcs/windowlayoutstate.pper b/docs/experimental_notebooks/.Rproj.user/735F60EC/pcs/windowlayoutstate.pper new file mode 100644 index 00000000..5563078a --- /dev/null +++ b/docs/experimental_notebooks/.Rproj.user/735F60EC/pcs/windowlayoutstate.pper @@ -0,0 +1,14 @@ +{ + "left": { + "splitterpos": 463, + "topwindowstate": "NORMAL", + "panelheight": 1120, + "windowheight": 1158 + }, + "right": { + "splitterpos": 694, + "topwindowstate": "NORMAL", + "panelheight": 1120, + "windowheight": 1158 + } +} \ No newline at end of file diff --git a/docs/experimental_notebooks/.Rproj.user/735F60EC/pcs/workbench-pane.pper b/docs/experimental_notebooks/.Rproj.user/735F60EC/pcs/workbench-pane.pper new 
file mode 100644 index 00000000..d3c76342 --- /dev/null +++ b/docs/experimental_notebooks/.Rproj.user/735F60EC/pcs/workbench-pane.pper @@ -0,0 +1,5 @@ +{ + "TabSet1": 0, + "TabSet2": 1, + "TabZoom": {} +} \ No newline at end of file diff --git a/docs/experimental_notebooks/.Rproj.user/735F60EC/rmd-outputs b/docs/experimental_notebooks/.Rproj.user/735F60EC/rmd-outputs new file mode 100644 index 00000000..3f2ff2d6 --- /dev/null +++ b/docs/experimental_notebooks/.Rproj.user/735F60EC/rmd-outputs @@ -0,0 +1,5 @@ + + + + + diff --git a/docs/experimental_notebooks/.Rproj.user/735F60EC/saved_source_markers b/docs/experimental_notebooks/.Rproj.user/735F60EC/saved_source_markers new file mode 100644 index 00000000..2b1bef11 --- /dev/null +++ b/docs/experimental_notebooks/.Rproj.user/735F60EC/saved_source_markers @@ -0,0 +1 @@ +{"active_set":"","sets":[]} \ No newline at end of file diff --git a/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/per/t/4EAFCA4F b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/per/t/4EAFCA4F new file mode 100644 index 00000000..fc4c1eb3 --- /dev/null +++ b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/per/t/4EAFCA4F @@ -0,0 +1,26 @@ +{ + "id": "4EAFCA4F", + "path": "~/s78projects/cities/docs/experimental_notebooks/zoning/tracts_dags.R", + "project_path": "zoning/tracts_dags.R", + "type": "r_source", + "hash": "2157559845", + "contents": "", + "dirty": true, + "created": 1724161369201.0, + "source_on_save": false, + "relative_order": 2, + "properties": { + "source_window_id": "", + "Source": "Source", + "cursorPosition": "21,24", + "scrollLine": "4" + }, + "folds": "", + "lastKnownWriteTime": 1724161086, + "encoding": "UTF-8", + "collab_server": "", + "source_window": "", + "last_content_update": 1724161086, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git a/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/per/t/4EAFCA4F-contents 
b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/per/t/4EAFCA4F-contents new file mode 100644 index 00000000..56daab70 --- /dev/null +++ b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/per/t/4EAFCA4F-contents @@ -0,0 +1,37 @@ +require(dagitty) + +# with zones +tracts_dag <- dagitty('dag { + year [pos="0,2"] + distance [pos = "0,0"] + total_value [pos = "1,0"] + median_value [pos = "1.2,0.3"] + limit [pos="1,1"] + units [pos = "2,1"] + + distance -> limit + distance -> total_value + distance -> median_value + distance -> units + + year -> limit + year -> total_value + year -> median_value + year -> units + + total_value -> units + median_value -> units + + + limit -> total_value + limit -> median_value + limit -> units + + }') + + + +plot(tracts_dag) +paths(tracts_dag,"limit","units") +adjustmentSets(tracts_dag,"limit","units", type = "all") +impliedConditionalIndependencies(tracts_dag) diff --git a/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/per/t/FFB1EEE4 b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/per/t/FFB1EEE4 new file mode 100644 index 00000000..1ab31a63 --- /dev/null +++ b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/per/t/FFB1EEE4 @@ -0,0 +1,26 @@ +{ + "id": "FFB1EEE4", + "path": "~/s78projects/cities/docs/experimental_notebooks/zoning/zoning_dags.R", + "project_path": "zoning/zoning_dags.R", + "type": "r_source", + "hash": "150867923", + "contents": "", + "dirty": false, + "created": 1724161317981.0, + "source_on_save": false, + "relative_order": 1, + "properties": { + "source_window_id": "", + "Source": "Source", + "cursorPosition": "2,2", + "scrollLine": "0" + }, + "folds": "", + "lastKnownWriteTime": 1724161086, + "encoding": "UTF-8", + "collab_server": "", + "source_window": "", + "last_content_update": 1724161086, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git a/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/per/t/FFB1EEE4-contents 
b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/per/t/FFB1EEE4-contents new file mode 100644 index 00000000..eef85bf9 --- /dev/null +++ b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/per/t/FFB1EEE4-contents @@ -0,0 +1,79 @@ +require(dagitty) + +# with zones +zones_dag <- dagitty('dag { + year [pos="0,2"] + month [pos="1,2"] + limit_con [pos="2,1"] + parcel_area [pos="2,0"] + ward_id [pos="3.5,0.2"] + zone_id [pos = "1, 0"] + neighborhood_id [pos = "4, 0"] + housing_units [pos = "5,1"] + past_reform [pos ="0, .5"] + past_reform_by_zone [pos = "0,1"] + + + year -> housing_units + month -> housing_units + limit_con -> housing_units + parcel_area -> housing_units + ward_id -> housing_units + zone_id -> housing_units + neighborhood_id -> housing_units + + neighborhood_id -> parcel_area + zone_id -> past_reform_by_zone + zone_id -> parcel_area + past_reform -> past_reform_by_zone + past_reform_by_zone -> limit_con + +}') + + + +plot(zones_dag) +paths(zones_dag,"limit_con","housing_units") +adjustmentSets(zones_dag,"limit_con","housing_units",type = "all") +impliedConditionalIndependencies(zones_dag) + + + + +#--------------------------------------------------------------------------------------- +# with distances +zones_dag <- dagitty('dag { + year [pos="0,2"] + month [pos="1,2"] + limit_con [pos="2,1"] + parcel_area [pos="2,0"] + ward_id [pos="3.5,0.2"] + zone_id [pos = "1, 0"] + neighborhood_id [pos = "4, 0"] + housing_units [pos = "5,1"] + past_reform [pos ="0, .5"] + past_reform_by_zone [pos = "0,1"] + + + year -> housing_units + month -> housing_units + limit_con -> housing_units + parcel_area -> housing_units + ward_id -> housing_units + zone_id -> housing_units + neighborhood_id -> housing_units + + neighborhood_id -> parcel_area + zone_id -> past_reform_by_zone + zone_id -> parcel_area + past_reform -> past_reform_by_zone + past_reform_by_zone -> limit_con + +}') + + + +plot(zones_dag) +paths(zones_dag,"limit_con","housing_units") 
+adjustmentSets(zones_dag,"limit_con","housing_units",type = "all") +impliedConditionalIndependencies(zones_dag) diff --git a/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/prop/4057158B b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/prop/4057158B new file mode 100644 index 00000000..e1be0f7d --- /dev/null +++ b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/prop/4057158B @@ -0,0 +1,7 @@ +{ + "tempName": "Untitled1", + "source_window_id": "", + "Source": "Source", + "cursorPosition": "33,40", + "scrollLine": "18" +} \ No newline at end of file diff --git a/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/prop/AE428842 b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/prop/AE428842 new file mode 100644 index 00000000..c01e52d7 --- /dev/null +++ b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/prop/AE428842 @@ -0,0 +1,6 @@ +{ + "source_window_id": "", + "Source": "Source", + "cursorPosition": "95,0", + "scrollLine": "0" +} \ No newline at end of file diff --git a/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/prop/B2BF0B6B b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/prop/B2BF0B6B new file mode 100644 index 00000000..b67e905d --- /dev/null +++ b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/prop/B2BF0B6B @@ -0,0 +1,6 @@ +{ + "source_window_id": "", + "Source": "Source", + "cursorPosition": "2,2", + "scrollLine": "0" +} \ No newline at end of file diff --git a/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/prop/CEE077AB b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/prop/CEE077AB new file mode 100644 index 00000000..ece09bb4 --- /dev/null +++ b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/prop/CEE077AB @@ -0,0 +1,7 @@ +{ + "tempName": "Untitled1", + "source_window_id": "", + "Source": "Source", + "cursorPosition": "57,4", + "scrollLine": "56" +} \ No newline at end of file diff --git 
a/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/prop/D2FE184D b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/prop/D2FE184D new file mode 100644 index 00000000..4102d7f5 --- /dev/null +++ b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/prop/D2FE184D @@ -0,0 +1,6 @@ +{ + "source_window_id": "", + "Source": "Source", + "cursorPosition": "21,24", + "scrollLine": "4" +} \ No newline at end of file diff --git a/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/prop/INDEX b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/prop/INDEX new file mode 100644 index 00000000..f24983a6 --- /dev/null +++ b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/prop/INDEX @@ -0,0 +1,2 @@ +~%2Fs78projects%2Fcities%2Fdocs%2Fexperimental_notebooks%2Fzoning%2Ftracts_dags.R="D2FE184D" +~%2Fs78projects%2Fcities%2Fdocs%2Fexperimental_notebooks%2Fzoning%2Fzoning_dags.R="B2BF0B6B" diff --git a/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/session-fb0f82db/644C3C4D b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/session-fb0f82db/644C3C4D new file mode 100644 index 00000000..45bda793 --- /dev/null +++ b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/session-fb0f82db/644C3C4D @@ -0,0 +1,27 @@ +{ + "id": "644C3C4D", + "path": "~/s78projects/cities/docs/experimental_notebooks/zoning/tracts_population_dag.R", + "project_path": "zoning/tracts_population_dag.R", + "type": "r_source", + "hash": "0", + "contents": "", + "dirty": false, + "created": 1727800740083.0, + "source_on_save": false, + "relative_order": 2, + "properties": { + "tempName": "Untitled1", + "source_window_id": "", + "Source": "Source", + "cursorPosition": "33,40", + "scrollLine": "18" + }, + "folds": "", + "lastKnownWriteTime": 1728657199, + "encoding": "UTF-8", + "collab_server": "", + "source_window": "", + "last_content_update": 1728657199368, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git 
a/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/session-fb0f82db/644C3C4D-contents b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/session-fb0f82db/644C3C4D-contents new file mode 100644 index 00000000..0b5a1dcc --- /dev/null +++ b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/session-fb0f82db/644C3C4D-contents @@ -0,0 +1,108 @@ +require(dagitty) + +if (rstudioapi::isAvailable()) { + # Set working directory to the script's location + setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) +} + +print(getwd()) + +# with zones +tracts_dag <- dagitty('dag { + year [pos="0,2"] + distance [pos = "0,0"] + university_overlap [pos = "0,.8"] + downtown_overlap [pos = "0, 1.2"] + + total_population [pos = "0.1, .2"] + + square_meters [pos = "0.2,.4"] + limit [pos = "0.2, 1.6"] + + white [pos = "0.4,1.8"] + segregation [pos = "0.6,0.2"] + + income [pos = "0.7, .8"] + + median_value [pos = "0.9,1.4"] + housing_units [pos = "1.,.6"] + + + distance -> total_population + year -> total_population + university_overlap -> total_popoulation + downtown_overlap -> total_population + + distance -> square_meters + year -> square_meters + + distance -> limit + year -> limit + + distance -> white + square_meters -> white + limit -> white + + distance -> segregation + year -> segregation + limit -> segregation + square_meters -> segregation + white -> segregation + + distance -> income + white -> income + segregation -> income + square_meters -> income + limit -> income + year -> income + + distance -> median_value + income -> median_value + white -> median_value + segregation -> median_value + square_meters -> median_value + limit -> median_value + year -> median_value + + + university_overlap -> housing_units + downtown_overlap -> housing_units + median_value -> housing_units + distance -> housing_units + income -> housing_units + white -> housing_units + limit -> housing_units + segregation -> housing_units + square_meters -> housing_units + year -> 
housing_units + + + + + }') + +plot(tracts_dag) + + +png("tracts_dag_plot_high_density.png", + width = 2000, + height = 1600, + res = 300 +) +plot(tracts_dag) +dev.off() + +pdf("tracts_dag_plot.pdf", + width = 10, + height = 8, + pointsize = 18, + paper = "special", + useDingbats = FALSE, + compress = FALSE) +plot(tracts_dag) +dev.off() + +plot(tracts_dag) +paths(tracts_dag,"limit","housing_units") +adjustmentSets(tracts_dag,"limit","housing_units", type = "all") +impliedConditionalIndependencies(tracts_dag) diff --git a/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/session-fb0f82db/A2603F02 b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/session-fb0f82db/A2603F02 new file mode 100644 index 00000000..784022b7 --- /dev/null +++ b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/session-fb0f82db/A2603F02 @@ -0,0 +1,26 @@ +{ + "id": "A2603F02", + "path": "~/s78projects/cities/docs/experimental_notebooks/zoning/tracts_dags.R", + "project_path": "zoning/tracts_dags.R", + "type": "r_source", + "hash": "0", + "contents": "", + "dirty": false, + "created": 1727800733002.0, + "source_on_save": false, + "relative_order": 1, + "properties": { + "source_window_id": "", + "Source": "Source", + "cursorPosition": "95,0", + "scrollLine": "0" + }, + "folds": "", + "lastKnownWriteTime": 1727798984, + "encoding": "UTF-8", + "collab_server": "", + "source_window": "", + "last_content_update": 1727798984, + "read_only": false, + "read_only_alternatives": [] +} \ No newline at end of file diff --git a/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/session-fb0f82db/A2603F02-contents b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/session-fb0f82db/A2603F02-contents new file mode 100644 index 00000000..104e3944 --- /dev/null +++ b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/session-fb0f82db/A2603F02-contents @@ -0,0 +1,95 @@ +require(dagitty) + +if (rstudioapi::isAvailable()) { + # Set working directory to the script's 
location + setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) +} + +print(getwd()) + +# with zones +tracts_dag <- dagitty('dag { + year [pos="0,2"] + distance [pos = "0,0"] + + square_meters [pos = "0.2,.4"] + limit [pos = "0.2, 1.6"] + + white [pos = "0.4,1.8"] + segregation [pos = "0.6,0.2"] + + income [pos = "0.7, .8"] + + median_value [pos = "0.9,1.4"] + housing_units [pos = "1.,.6"] + + distance -> square_meters + year -> square_meters + + distance -> limit + year -> limit + + distance -> white + square_meters -> white + limit -> white + + distance -> segregation + year -> segregation + limit -> segregation + square_meters -> segregation + white -> segregation + + distance -> income + white -> income + segregation -> income + square_meters -> income + limit -> income + year -> income + + distance -> median_value + income -> median_value + white -> median_value + segregation -> median_value + square_meters -> median_value + limit -> median_value + year -> median_value + + median_value -> housing_units + distance -> housing_units + income -> housing_units + white -> housing_units + limit -> housing_units + segregation -> housing_units + square_meters -> housing_units + year -> housing_units + + + + + }') + +plot(tracts_dag) + + +png("tracts_dag_plot_high_density.png", + width = 2000, + height = 1600, + res = 300 +) +plot(tracts_dag) +dev.off() + +pdf("tracts_dag_plot.pdf", + width = 10, + height = 8, + pointsize = 18, + paper = "special", + useDingbats = FALSE, + compress = FALSE) +plot(tracts_dag) +dev.off() + +plot(tracts_dag) +paths(tracts_dag,"limit","housing_units") +adjustmentSets(tracts_dag,"limit","housing_units", type = "all") +impliedConditionalIndependencies(tracts_dag) diff --git a/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/session-fb0f82db/lock_file b/docs/experimental_notebooks/.Rproj.user/735F60EC/sources/session-fb0f82db/lock_file new file mode 100644 index 00000000..e69de29b diff --git 
a/docs/experimental_notebooks/.Rproj.user/shared/notebooks/patch-chunk-names b/docs/experimental_notebooks/.Rproj.user/shared/notebooks/patch-chunk-names new file mode 100644 index 00000000..e69de29b diff --git a/docs/experimental_notebooks/.Rproj.user/shared/notebooks/paths b/docs/experimental_notebooks/.Rproj.user/shared/notebooks/paths new file mode 100644 index 00000000..6c1f37fe --- /dev/null +++ b/docs/experimental_notebooks/.Rproj.user/shared/notebooks/paths @@ -0,0 +1,2 @@ +/home/rafal/s78projects/cities/docs/experimental_notebooks/zoning/tracts_interactions_dag.R="37338FC3" +/home/rafal/s78projects/cities/docs/experimental_notebooks/zoning/tracts_population_dag.R="69D3F49D" diff --git a/docs/experimental_notebooks/zoning/.RData b/docs/experimental_notebooks/zoning/.RData new file mode 100644 index 00000000..dab7dc96 Binary files /dev/null and b/docs/experimental_notebooks/zoning/.RData differ diff --git a/docs/experimental_notebooks/zoning/.Rhistory b/docs/experimental_notebooks/zoning/.Rhistory new file mode 100644 index 00000000..3c6261fb --- /dev/null +++ b/docs/experimental_notebooks/zoning/.Rhistory @@ -0,0 +1,403 @@ +require(dagitty) +# with zones +tracts_dag <- dagitty('dag { +year [pos="0,2"] +distance [pos = "0,0"] +white [pos = ".2,1"] +segregation [pos = ".6,1"] +income [pos = ".9,.8"] +median_value [pos = "1.2,0.2"] +limit [pos=".7,1.8"] +units [pos = "1.5,.8"] +sqm [pos = ".2,.4"] +distance -> sqm +year -> sqm +year -> limit +distance -> limit +distance -> white +year -> white +sqm -> white +limit -> white +sqm -> segregation +distance -> segregation +white -> segregation +year -> segregation +limit -> segregation +sqm -> income +distance -> income +white -> income +segregation -> income +year -> income +limit -> income +sqm -> median_value +distance -> median_value +limit -> median_value +income -> median_value +white -> median_value +segregation -> median_value +year -> median_value +sqm -> units +median_value -> units +distance -> units 
+income -> units +white -> units +limit -> units +segregation -> units +year -> units +}') +plot(tracts_dag) +paths(tracts_dag,"limit","units") +adjustmentSets(tracts_dag,"limit","units", type = "all") +impliedConditionalIndependencies(tracts_dag) +require(dagitty) +if (rstudioapi::isAvailable()) { +# Set working directory to the script's location +setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) +} +print(getwd()) +# with zones +tracts_dag <- dagitty('dag { +year [pos="0,2"] +distance [pos = "0,0"] +sqm [pos = "0.2,.4"] +limit [pos = "0.2, 1.6"] +white [pos = "0.4,1.8"] +segregation [pos = "0.6,0.2"] +income [pos = "0.7, .8"] +median_value [pos = "0.9,1.4"] +housing_units [pos = "1.,.6"] +distance -> sqm +year -> sqm +distance -> limit +year -> limit +distance -> white +sqm -> white +limit -> white +distance -> segregation +year -> segregation +limit -> segregation +sqm -> segregation +white -> segregation +distance -> income +white -> income +segregation -> income +sqm -> income +limit -> income +year -> income +distance -> median_value +income -> median_value +white -> median_value +segregation -> median_value +sqm -> median_value +limit -> median_value +year -> median_value +median_value -> housing_units +distance -> housing_units +income -> housing_units +white -> housing_units +limit -> housing_units +segregation -> housing_units +sqm -> housing_units +year -> housing_units +}') +plot(tracts_dag) +png("tracts_dag_plot_high_density.png", +width = 2000, +height = 1600, +res = 300 +) +plot(tracts_dag) +dev.off() +pdf("tracts_dag_plot.pdf", +width = 10, +height = 8, +pointsize = 18, +paper = "special", +useDingbats = FALSE, +compress = FALSE) +plot(tracts_dag) +dev.off() +plot(tracts_dag) +paths(tracts_dag,"limit","units") +require(dagitty) +if (rstudioapi::isAvailable()) { +# Set working directory to the script's location +setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) +} +print(getwd()) +# with zones +tracts_dag <- dagitty('dag { +year 
[pos="0,2"] +distance [pos = "0,0"] +square_meters [pos = "0.2,.4"] +limit [pos = "0.2, 1.6"] +white [pos = "0.4,1.8"] +segregation [pos = "0.6,0.2"] +income [pos = "0.7, .8"] +median_value [pos = "0.9,1.4"] +housing_units [pos = "1.,.6"] +distance -> sqm +year -> sqm +distance -> limit +year -> limit +distance -> white +square_meters -> white +limit -> white +distance -> segregation +year -> segregation +limit -> segregation +square_meters -> segregation +white -> segregation +distance -> income +white -> income +segregation -> income +square_meters -> income +limit -> income +year -> income +distance -> median_value +income -> median_value +white -> median_value +segregation -> median_value +square_meters -> median_value +limit -> median_value +year -> median_value +median_value -> housing_units +distance -> housing_units +income -> housing_units +white -> housing_units +limit -> housing_units +segregation -> housing_units +square_meters -> housing_units +year -> housing_units +}') +plot(tracts_dag) +png("tracts_dag_plot_high_density.png", +width = 2000, +height = 1600, +res = 300 +) +plot(tracts_dag) +dev.off() +pdf("tracts_dag_plot.pdf", +width = 10, +height = 8, +pointsize = 18, +paper = "special", +useDingbats = FALSE, +compress = FALSE) +plot(tracts_dag) +dev.off() +plot(tracts_dag) +paths(tracts_dag,"limit","units") +require(dagitty) +if (rstudioapi::isAvailable()) { +# Set working directory to the script's location +setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) +} +print(getwd()) +# with zones +tracts_dag <- dagitty('dag { +year [pos="0,2"] +distance [pos = "0,0"] +square_meters [pos = "0.2,.4"] +limit [pos = "0.2, 1.6"] +white [pos = "0.4,1.8"] +segregation [pos = "0.6,0.2"] +income [pos = "0.7, .8"] +median_value [pos = "0.9,1.4"] +housing_units [pos = "1.,.6"] +distance -> square_meters +year -> square_meters +distance -> limit +year -> limit +distance -> white +square_meters -> white +limit -> white +distance -> segregation +year -> 
segregation +limit -> segregation +square_meters -> segregation +white -> segregation +distance -> income +white -> income +segregation -> income +square_meters -> income +limit -> income +year -> income +distance -> median_value +income -> median_value +white -> median_value +segregation -> median_value +square_meters -> median_value +limit -> median_value +year -> median_value +median_value -> housing_units +distance -> housing_units +income -> housing_units +white -> housing_units +limit -> housing_units +segregation -> housing_units +square_meters -> housing_units +year -> housing_units +}') +plot(tracts_dag) +png("tracts_dag_plot_high_density.png", +width = 2000, +height = 1600, +res = 300 +) +plot(tracts_dag) +dev.off() +pdf("tracts_dag_plot.pdf", +width = 10, +height = 8, +pointsize = 18, +paper = "special", +useDingbats = FALSE, +compress = FALSE) +plot(tracts_dag) +dev.off() +plot(tracts_dag) +paths(tracts_dag,"limit","units") +require(dagitty) +if (rstudioapi::isAvailable()) { +# Set working directory to the script's location +setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) +} +print(getwd()) +# with zones +tracts_dag <- dagitty('dag { +year [pos="0,2"] +distance [pos = "0,0"] +square_meters [pos = "0.2,.4"] +limit [pos = "0.2, 1.6"] +white [pos = "0.4,1.8"] +segregation [pos = "0.6,0.2"] +income [pos = "0.7, .8"] +median_value [pos = "0.9,1.4"] +housing_units [pos = "1.,.6"] +distance -> square_meters +year -> square_meters +distance -> limit +year -> limit +distance -> white +square_meters -> white +limit -> white +distance -> segregation +year -> segregation +limit -> segregation +square_meters -> segregation +white -> segregation +distance -> income +white -> income +segregation -> income +square_meters -> income +limit -> income +year -> income +distance -> median_value +income -> median_value +white -> median_value +segregation -> median_value +square_meters -> median_value +limit -> median_value +year -> median_value +median_value -> 
housing_units +distance -> housing_units +income -> housing_units +white -> housing_units +limit -> housing_units +segregation -> housing_units +square_meters -> housing_units +year -> housing_units +}') +plot(tracts_dag) +png("tracts_dag_plot_high_density.png", +width = 2000, +height = 1600, +res = 300 +) +plot(tracts_dag) +dev.off() +pdf("tracts_dag_plot.pdf", +width = 10, +height = 8, +pointsize = 18, +paper = "special", +useDingbats = FALSE, +compress = FALSE) +plot(tracts_dag) +dev.off() +plot(tracts_dag) +paths(tracts_dag,"limit","units") +require(dagitty) +if (rstudioapi::isAvailable()) { +# Set working directory to the script's location +setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) +} +print(getwd()) +# with zones +tracts_dag <- dagitty('dag { +year [pos="0,2"] +distance [pos = "0,0"] +square_meters [pos = "0.2,.4"] +limit [pos = "0.2, 1.6"] +white [pos = "0.4,1.8"] +segregation [pos = "0.6,0.2"] +income [pos = "0.7, .8"] +median_value [pos = "0.9,1.4"] +housing_units [pos = "1.,.6"] +distance -> square_meters +year -> square_meters +distance -> limit +year -> limit +distance -> white +square_meters -> white +limit -> white +distance -> segregation +year -> segregation +limit -> segregation +square_meters -> segregation +white -> segregation +distance -> income +white -> income +segregation -> income +square_meters -> income +limit -> income +year -> income +distance -> median_value +income -> median_value +white -> median_value +segregation -> median_value +square_meters -> median_value +limit -> median_value +year -> median_value +median_value -> housing_units +distance -> housing_units +income -> housing_units +white -> housing_units +limit -> housing_units +segregation -> housing_units +square_meters -> housing_units +year -> housing_units +}') +plot(tracts_dag) +png("tracts_dag_plot_high_density.png", +width = 2000, +height = 1600, +res = 300 +) +plot(tracts_dag) +dev.off() +pdf("tracts_dag_plot.pdf", +width = 10, +height = 8, +pointsize 
= 18, +paper = "special", +useDingbats = FALSE, +compress = FALSE) +plot(tracts_dag) +dev.off() +plot(tracts_dag) +paths(tracts_dag,"limit","housing_units") +adjustmentSets(tracts_dag,"limit","housing_units", type = "all") +impliedConditionalIndependencies(tracts_dag) diff --git a/docs/testing_notebooks/causal_insights_demo.ipynb b/docs/testing_notebooks/causal_insights_demo.ipynb index 61c1d424..a104e02f 100644 --- a/docs/testing_notebooks/causal_insights_demo.ipynb +++ b/docs/testing_notebooks/causal_insights_demo.ipynb @@ -10,8 +10,7 @@ "import random\n", "\n", "from cities.queries.causal_insight import CausalInsight\n", - "from cities.utils.data_grabber import (DataGrabber, list_interventions,\n", - " list_outcomes)\n", + "from cities.utils.data_grabber import DataGrabber, list_interventions, list_outcomes\n", "\n", "smoke_test = \"CI\" in os.environ\n", "num_samples = 10 if smoke_test else 1000" diff --git a/docs/testing_notebooks/slider.ipynb b/docs/testing_notebooks/slider.ipynb index 90691e87..db8a6f79 100644 --- a/docs/testing_notebooks/slider.ipynb +++ b/docs/testing_notebooks/slider.ipynb @@ -6,12 +6,14 @@ "metadata": {}, "outputs": [], "source": [ - "import os \n", + "import os\n", + "\n", + "import numpy as np\n", + "\n", "from cities.queries.causal_insight import CausalInsight as CausalInsight\n", "from cities.queries.causal_insight_slim import CausalInsightSlim as CausalInsightSlim\n", "from cities.utils.data_grabber import DataGrabber\n", "\n", - "import numpy as np\n", "smoke_test = \"CI\" in os.environ\n", "num_samples = 10 if smoke_test else 1000" ] @@ -53,7 +55,7 @@ "intervention = \"spending_commerce\"\n", "year = 2018\n", "\n", - "#the object instantiation doesn't change\n", + "# the object instantiation doesn't change\n", "ci = CausalInsight(\n", " outcome_dataset=outcome,\n", " intervention_dataset=intervention,\n", @@ -61,8 +63,7 @@ ")\n", "\n", "\n", - "\n", - "percent_calc = ci.slider_values_to_interventions(intervened_percent=50, year = 
year)\n", + "percent_calc = ci.slider_values_to_interventions(intervened_percent=50, year=year)\n", "\n", "display(percent_calc)\n", "\n", @@ -70,12 +71,15 @@ "\n", "ci.get_tau_samples()\n", "\n", - "ci.get_fips_predictions(intervened_value= percent_calc['intervened_transformed'],\n", - " fips=fips, year = year)\n", + "ci.get_fips_predictions(\n", + " intervened_value=percent_calc[\"intervened_transformed\"], fips=fips, year=year\n", + ")\n", "\n", "ci.plot_predictions(range_multiplier=1)\n", "\n", - "assert np.allclose(ci.intervened_value_original, percent_calc['intervened_original'], rtol = 0.01)\n" + "assert np.allclose(\n", + " ci.intervened_value_original, percent_calc[\"intervened_original\"], rtol=0.01\n", + ")" ] }, { @@ -2161,7 +2165,6 @@ } ], "source": [ - "\n", "ci_slim = CausalInsightSlim(\n", " outcome_dataset=outcome,\n", " intervention_dataset=intervention,\n", @@ -2170,17 +2173,21 @@ "\n", "\n", "ci_slim.get_tau_samples()\n", - "ci_slim.get_fips_predictions(intervened_value= percent_calc['intervened_transformed'],\n", - " fips=fips, year = year)\n", + "ci_slim.get_fips_predictions(\n", + " intervened_value=percent_calc[\"intervened_transformed\"], fips=fips, year=year\n", + ")\n", "\n", "ci_slim.plot_predictions(range_multiplier=1)\n", "\n", - "ci_slim.plot_predictions(range_multiplier =1, scaling = \"original\")\n", + "ci_slim.plot_predictions(range_multiplier=1, scaling=\"original\")\n", "\n", - "percent_calc_slim = ci.slider_values_to_interventions(intervened_percent=50, year = year)\n", + "percent_calc_slim = ci.slider_values_to_interventions(intervened_percent=50, year=year)\n", "\n", - "assert np.allclose(ci_slim.intervened_value_original, percent_calc_slim['intervened_original'], rtol = 0.01)\n", - " " + "assert np.allclose(\n", + " ci_slim.intervened_value_original,\n", + " percent_calc_slim[\"intervened_original\"],\n", + " rtol=0.01,\n", + ")" ] } ], diff --git a/docs/testing_notebooks/slim_insight.ipynb 
b/docs/testing_notebooks/slim_insight.ipynb index 0ad30bd8..0b4e6aea 100644 --- a/docs/testing_notebooks/slim_insight.ipynb +++ b/docs/testing_notebooks/slim_insight.ipynb @@ -6,12 +6,11 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", "import cProfile\n", + "import os\n", "\n", "from cities.queries.causal_insight import CausalInsight\n", - "from cities.utils.data_grabber import (DataGrabber, list_interventions,\n", - " list_outcomes)\n", + "from cities.utils.data_grabber import DataGrabber, list_interventions, list_outcomes\n", "\n", "smoke_test = \"CI\" in os.environ\n", "num_samples = 10 if smoke_test else 1000" @@ -36,8 +35,8 @@ "data = DataGrabber()\n", "data.get_features_wide([\"gdp\"])\n", "gdp = data.wide[\"gdp\"]\n", - "fips = gdp[\"GeoFIPS\"][5] #1011\n", - "fips2 = gdp[\"GeoFIPS\"][6] #1013" + "fips = gdp[\"GeoFIPS\"][5] # 1011\n", + "fips2 = gdp[\"GeoFIPS\"][6] # 1013" ] }, { @@ -4777,9 +4776,9 @@ "\n", "# confirming that the time consuming moves involve the model\n", "# gettign samples, etc.\n", - "profiler_basic.print_stats(sort='cumulative')\n", + "profiler_basic.print_stats(sort=\"cumulative\")\n", "\n", - "# note: restricting sites to tau drops time from 8s to 5s. " + "# note: restricting sites to tau drops time from 8s to 5s." 
] }, { @@ -4800,11 +4799,11 @@ "# this is prep pipeline\n", "print(smoke_test)\n", "ci = CausalInsight(\n", - " outcome_dataset=outcome,\n", - " intervention_dataset=intervention2,\n", - " num_samples=num_samples,\n", - " smoke_test=smoke_test\n", - " )\n", + " outcome_dataset=outcome,\n", + " intervention_dataset=intervention2,\n", + " num_samples=num_samples,\n", + " smoke_test=smoke_test,\n", + ")\n", "ci.generate_tensed_samples()" ] }, @@ -7368,7 +7367,7 @@ } ], "source": [ - "# this is the slim execution \n", + "# this is the slim execution\n", "def slim_run():\n", " ci = CausalInsight(\n", " outcome_dataset=outcome,\n", @@ -7388,7 +7387,7 @@ "profiler_slim.disable()\n", "\n", "\n", - "profiler_slim.print_stats(sort='cumulative')" + "profiler_slim.print_stats(sort=\"cumulative\")" ] } ], diff --git a/scripts/clean.sh b/scripts/clean.sh index 6918545f..a7ec44ae 100755 --- a/scripts/clean.sh +++ b/scripts/clean.sh @@ -1,22 +1,12 @@ #!/bin/bash set -euxo pipefail -# isort suspended as conflicting with black -# nbqa isort docs/guides/ - - -# this sometimes conflicts with black but does some -# preliminary import sorting -# and is then overriden by black -isort cities/ tests/ - -black ./cities/ ./tests/ ./docs/guides/ - -black docs/guides/ +isort --profile="black" cities/ tests/ +black cities/ tests/ autoflake --remove-all-unused-imports --in-place --recursive ./cities ./tests -nbqa autoflake --nbqa-shell --remove-all-unused-imports --recursive --in-place docs/guides/ - -#nbqa black docs/guides/ +nbqa --nbqa-shell autoflake --remove-all-unused-imports --recursive --in-place docs/guides/ docs/testing_notebooks +nbqa --nbqa-shell isort --profile="black" docs/guides/ docs/testing_notebooks +black docs/guides/ docs/testing_notebooks diff --git a/scripts/clean_path.sh b/scripts/clean_path.sh new file mode 100755 index 00000000..4cbc324f --- /dev/null +++ b/scripts/clean_path.sh @@ -0,0 +1,15 @@ +#!/bin/bash +set -euxo pipefail + + +path="${1:-.}" + +isort 
--profile="black" "$path" +black "$path" +autoflake --remove-all-unused-imports --in-place --recursive "$path" + +if [[ -d "$path" ]]; then + nbqa --nbqa-shell autoflake --remove-all-unused-imports --recursive --in-place "$path" + nbqa --nbqa-shell isort --profile="black" "$path" + black "$path" +fi diff --git a/scripts/lint.sh b/scripts/lint.sh index 5e5b9abe..aa432ab9 100755 --- a/scripts/lint.sh +++ b/scripts/lint.sh @@ -2,11 +2,12 @@ set -euxo pipefail mypy --ignore-missing-imports cities/ -#isort --check --diff cities/ tests/ -black --check cities/ tests/ docs/guides/ -flake8 cities/ tests/ --ignore=E203,W503 --max-line-length=127 +isort --profile="black" --check --diff cities/ tests/ +black --check cities/ tests/ +flake8 cities/ tests/ --ignore=E203,W503 --max-line-length=127 -nbqa autoflake --nbqa-shell -v --recursive --check docs/guides/ -#nbqa isort --check docs/guides/ +nbqa --nbqa-shell autoflake -v --recursive --check docs/guides/ +nbqa --nbqa-shell isort --profile="black" --check docs/guides/ +black --check docs/guides/ diff --git a/tests/test_data_grabber_sql.py b/tests/test_data_grabber_sql.py index d1c9d708..6b2d2097 100644 --- a/tests/test_data_grabber_sql.py +++ b/tests/test_data_grabber_sql.py @@ -13,6 +13,14 @@ list_csvs, ) +smoke_test = "CI" in os.environ + +if smoke_test: + pytest.skip( + "Skipping all tests in this file during smoke tests", allow_module_level=True + ) + + root = find_repo_root() data_dirs = { diff --git a/tests/test_sorted_interventions.py b/tests/test_sorted_interventions.py index 40c85355..1914d801 100644 --- a/tests/test_sorted_interventions.py +++ b/tests/test_sorted_interventions.py @@ -2,10 +2,18 @@ import dill import numpy as np +import pytest from cities.utils.data_grabber import DataGrabber, find_repo_root, list_interventions from cities.utils.percentiles import transformed_intervention_from_percentile +smoke_test = "CI" in os.environ + +if smoke_test: + pytest.skip( + "Skipping all tests in this file during smoke 
tests", allow_module_level=True + ) + def test_sorted_interventions_present(): root = find_repo_root()