Skip to content

Commit

Permalink
Merge pull request #43 from BasisResearch/nl-add-industry
Browse files Browse the repository at this point in the history
adding industry and urbanization datasets
  • Loading branch information
rfl-urbaniak authored Oct 26, 2023
2 parents 9040095 + d0b2d71 commit d206598
Show file tree
Hide file tree
Showing 16 changed files with 133,056 additions and 7,584 deletions.
97 changes: 97 additions & 0 deletions cities/utils/clean_industry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import numpy as np
import pandas as pd

from cities.utils.cleaning_utils import standardize_and_scale
from cities.utils.data_grabber import DataGrabber


def clean_industry():
    """Clean the ACS DP03 industry table and write the derived CSV files.

    The function:

    1. loads the ``gdp`` feature through ``DataGrabber`` and uses its
       ``GeoFIPS``/``GeoName`` columns as the reference list of locations;
    2. reads ``ACSDP5Y2021_DP03_industry.csv``, turns ``GEO_ID`` (the part
       after ``"US"``) into an integer ``GeoFIPS`` and keeps only the FIPS
       codes that also occur in the GDP table;
    3. keeps the ``DP03_0004E`` and ``DP03_0033E``..``DP03_0045E`` columns
       under readable names and writes the absolute values to
       ``../data/raw/industry_absolute.csv``;
    4. divides every industry column by the row-wise sum of the industry
       columns (``employed_sum`` is not part of that sum and is dropped
       afterwards), so each row holds shares;
    5. writes wide/long and standardized wide/long versions to
       ``../data/processed/``.

    All paths are relative to the current working directory. Nothing is
    returned; the results are the CSV files written to disk.

    Raises:
        ValueError: if any ``GEO_ID`` in the raw file is missing.
    """
    data = DataGrabber()
    data.get_features_wide(["gdp"])
    gdp = data.wide["gdp"]

    industry = pd.read_csv("../data/raw/ACSDP5Y2021_DP03_industry.csv")

    # A Series cannot be used directly as a truth value (pandas raises
    # "truth value of a Series is ambiguous"), and ``assert`` is stripped
    # under ``python -O``; reduce with ``any()`` and raise explicitly.
    if industry["GEO_ID"].isna().any():
        raise ValueError(
            "ACSDP5Y2021_DP03_industry.csv contains missing GEO_ID values"
        )

    # GEO_ID -> integer FIPS: keep what follows "US" and cast to int64.
    industry["GEO_ID"] = industry["GEO_ID"].str.split("US").str[1]
    industry["GEO_ID"] = industry["GEO_ID"].astype("int64")
    industry = industry.rename(columns={"GEO_ID": "GeoFIPS"})

    # Restrict to locations present in both the GDP table and this dataset.
    common_fips = np.intersect1d(
        gdp["GeoFIPS"].unique(), industry["GeoFIPS"].unique()
    )
    industry = industry[industry["GeoFIPS"].isin(common_fips)]

    # Attach the location names used by the GDP table.
    industry = industry.merge(
        gdp[["GeoFIPS", "GeoName"]], on="GeoFIPS", how="left"
    )

    column_name_mapping = {
        "DP03_0004E": "employed_sum",
        "DP03_0033E": "agri_forestry_mining",
        "DP03_0034E": "construction",
        "DP03_0035E": "manufacturing",
        "DP03_0036E": "wholesale_trade",
        "DP03_0037E": "retail_trade",
        "DP03_0038E": "transport_utilities",
        "DP03_0039E": "information",
        "DP03_0040E": "finance_real_estate",
        "DP03_0041E": "prof_sci_mgmt_admin",
        "DP03_0042E": "education_health",
        "DP03_0043E": "arts_entertainment",
        "DP03_0044E": "other_services",
        "DP03_0045E": "public_admin",
    }

    # Select the identifier columns plus the mapped ACS columns (the dict
    # preserves the intended column order), then give them readable names.
    industry = industry[["GeoFIPS", "GeoName", *column_name_mapping]]
    industry = industry.rename(columns=column_name_mapping)

    industry = industry.sort_values(by=["GeoFIPS", "GeoName"])

    # Absolute values (including employed_sum) are saved before normalizing.
    industry.to_csv("../data/raw/industry_absolute.csv", index=False)

    # Address the industry columns by name rather than by position so the
    # normalization does not silently depend on column order.
    share_columns = [
        name for name in column_name_mapping.values() if name != "employed_sum"
    ]
    row_sums = industry[share_columns].sum(axis=1)

    # Whole-column assignment replaces the columns with the float shares
    # instead of writing floats into the existing columns in place.
    industry[share_columns] = industry[share_columns].div(row_sums, axis=0)
    industry = industry.drop(["employed_sum"], axis=1)

    # Snapshot the unstandardized frames before handing ``industry`` to
    # standardize_and_scale.
    industry_wide = industry.copy()

    industry_long = pd.melt(
        industry,
        id_vars=["GeoFIPS", "GeoName"],
        var_name="Category",
        value_name="Value",
    )

    industry_std_wide = standardize_and_scale(industry)

    industry_std_long = pd.melt(
        industry_std_wide.copy(),
        id_vars=["GeoFIPS", "GeoName"],
        var_name="Category",
        value_name="Value",
    )

    industry_wide.to_csv("../data/processed/industry_wide.csv", index=False)
    industry_long.to_csv("../data/processed/industry_long.csv", index=False)
    industry_std_wide.to_csv(
        "../data/processed/industry_std_wide.csv", index=False
    )
    industry_std_long.to_csv(
        "../data/processed/industry_std_long.csv", index=False
    )
70 changes: 70 additions & 0 deletions cities/utils/clean_urbanization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import numpy as np
import pandas as pd

from cities.utils.cleaning_utils import standardize_and_scale
from cities.utils.data_grabber import DataGrabber


def clean_urbanization():
    """Clean the 2020 urban-area county table and write the derived CSV files.

    Reads ``../data/raw/2020_UA_COUNTY.csv``, builds an integer ``GeoFIPS``
    from the ``STATE`` and ``COUNTY`` columns (read as strings and
    concatenated), keeps the FIPS codes that also occur in the ``gdp``
    feature loaded through ``DataGrabber``, attaches ``GeoName`` from that
    GDP table and keeps five density/land-share columns. Wide, long,
    standardized-wide and standardized-long versions are written to
    ``../data/processed/``. Paths are relative to the current working
    directory; nothing is returned.
    """
    grabber = DataGrabber()
    grabber.get_features_wide(["gdp"])
    gdp = grabber.wide["gdp"]

    # STATE and COUNTY are read as strings so they can be concatenated
    # as text before the cast to int.
    raw = pd.read_csv(
        "../data/raw/2020_UA_COUNTY.csv",
        dtype={"STATE": str, "COUNTY": str},
    )
    fips_text = raw["STATE"].astype(str) + raw["COUNTY"].astype(str)
    raw["GeoFIPS"] = fips_text.astype(int)

    # Keep only locations present in both the GDP table and this dataset.
    shared_fips = np.intersect1d(
        gdp["GeoFIPS"].unique(), raw["GeoFIPS"].unique()
    )
    matched = raw[raw["GeoFIPS"].isin(shared_fips)]

    # Attach the location names used by the GDP table.
    named = matched.merge(
        gdp[["GeoFIPS", "GeoName"]], on="GeoFIPS", how="left"
    )

    id_columns = ["GeoFIPS", "GeoName"]
    kept_columns = id_columns + [
        "POPDEN_RUR",
        "POPDEN_URB",
        "HOUDEN_COU",
        "HOUDEN_RUR",
        "ALAND_PCT_RUR",
    ]
    urbanization = named[kept_columns].sort_values(by=id_columns)

    melt_options = {
        "id_vars": id_columns,
        "var_name": "Category",
        "value_name": "Value",
    }

    # The unstandardized outputs are captured before the frame is handed to
    # standardize_and_scale, exactly as in the original ordering.
    wide = urbanization.copy()
    long = pd.melt(urbanization, **melt_options)

    std_wide = standardize_and_scale(urbanization)
    std_long = pd.melt(std_wide.copy(), **melt_options)

    outputs = (
        (wide, "../data/processed/urbanization_wide.csv"),
        (long, "../data/processed/urbanization_long.csv"),
        (std_wide, "../data/processed/urbanization_std_wide.csv"),
        (std_long, "../data/processed/urbanization_std_long.csv"),
    )
    for frame, destination in outputs:
        frame.to_csv(destination, index=False)
6 changes: 6 additions & 0 deletions cities/utils/cleaning_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from cities.utils.clean_ethnic_composition import clean_ethnic_composition
from cities.utils.clean_gdp import clean_gdp
from cities.utils.clean_industry import clean_industry
from cities.utils.clean_population import clean_population
from cities.utils.clean_spending_commerce import clean_spending_commerce
from cities.utils.clean_spending_HHS import clean_spending_HHS
from cities.utils.clean_spending_transportation import clean_spending_transportation
from cities.utils.clean_transport import clean_transport
from cities.utils.clean_urbanization import clean_urbanization

# GDP is cleaned first. clean_industry() and clean_urbanization() load the
# "gdp" feature through DataGrabber — presumably the output of this step;
# confirm against DataGrabber before reordering.
clean_gdp()

Expand All @@ -19,3 +21,7 @@
clean_spending_HHS()

clean_ethnic_composition()

# Industry composition dataset: reads the ACS DP03 raw CSV and writes
# industry_{wide,long,std_wide,std_long}.csv under ../data/processed/.
clean_industry()

# Urbanization dataset: reads 2020_UA_COUNTY.csv and writes
# urbanization_{wide,long,std_wide,std_long}.csv under ../data/processed/.
clean_urbanization()
Loading

0 comments on commit d206598

Please sign in to comment.