# -*- coding: utf-8 -*-
"""
Build a georeferenced global industrial plant database (steel, cement, oil
refineries, pulp & paper) and write it to ``data/industrial_database.csv``.

Every per-sector builder returns a DataFrame with the same columns::

    country, y, x, location, technology, capacity, unit, quality, ID

where ``y``/``x`` are latitude/longitude, ``country`` is an ISO2 code and
``quality`` flags how trustworthy the capacity figure is.

NOTE(review): this file reconstructs a whitespace-mangled git patch.  Besides
this new script the original patch also contained three one-line hunks that
cannot be re-emitted as a byte-faithful diff from the mangled source:

* ``.gitignore``: add ``/data/demand/unsd/data`` and
  ``/data/industrial_database.csv``.
* ``Snakefile``: new rule ``build_industrial_database`` with output
  ``industrial_database="data/industrial_database.csv"`` running
  ``scripts/build_industrial_database.py``.
* ``scripts/build_industrial_distribution_key.py``: add
  ``geo_locs.capacity = pd.to_numeric(geo_locs.capacity)`` after reading the
  database.
"""

import json
import math
import os
import pprint
from pathlib import Path

import country_converter as coco
import matplotlib.pyplot as plt
import numpy as np
import openpyxl  # required by pandas.read_excel for .xlsx files
import pandas as pd
import pycountry
import requests
import seaborn as sns
from geopy.geocoders import Nominatim


def get_cocode_from_name(df, country_column_name):
    """Map full country names to ISO2 codes into a new ``country`` column.

    NOTE: this matches against ``pycountry`` *official* names only, so
    colloquial spellings (e.g. "Russia" vs "Russian Federation") map to NaN;
    such rows are geocoded afterwards via get_cocode_from_coords().
    """
    country_codes = {country.name: country.alpha_2 for country in pycountry.countries}
    df["country"] = df[country_column_name].map(country_codes)
    return df


def get_cocode_from_coords(df):
    """Fill a ``country`` (ISO2) column by reverse-geocoding Latitude/Longitude.

    Uses Nominatim, one request per row — slow and rate-limited, so it is only
    applied to the rows that name matching could not resolve.
    """
    geolocator = Nominatim(user_agent="geoapi")

    country_codes = []
    for index, row in df.iterrows():
        location = geolocator.reverse(
            (row["Latitude"], row["Longitude"]), exactly_one=True
        )
        if location and location.raw.get("address", {}).get("country_code"):
            country_codes.append(location.raw["address"]["country_code"].upper())
        else:
            # Geocoder could not resolve the point; leave the row unresolved.
            country_codes.append(None)

    df["country"] = country_codes
    return df


def create_steel_db():
    """Build the steel/iron plant table from the Global Steel Plant Tracker.

    Source: Global Energy Monitor, 2023-03 release (~1433 plants).
    Mixed BF+DRI integrated plants are split into two pseudo-plants whose
    crude-steel capacities are apportioned by their iron-making capacities.
    """
    fn = "https://globalenergymonitor.org/wp-content/uploads/2023/03/Global-Steel-Plant-Tracker-2023-03.xlsx"
    storage_options = {"User-Agent": "Mozilla/5.0"}
    steel_orig = pd.read_excel(
        fn,
        index_col=0,
        storage_options=storage_options,
        sheet_name="Steel Plants",
        header=0,
    )

    df_steel = steel_orig.copy()
    df_steel = df_steel[
        [
            "Plant name (English)",
            "Country",
            "Coordinates",
            "Coordinate accuracy",
            "Status",
            "Start date",
            "Plant age (years)",
            "Nominal crude steel capacity (ttpa)",
            "Nominal BOF steel capacity (ttpa)",
            "Nominal EAF steel capacity (ttpa)",
            "Nominal OHF steel capacity (ttpa)",
            "Nominal iron capacity (ttpa)",
            "Nominal BF capacity (ttpa)",
            "Nominal DRI capacity (ttpa)",
            "Ferronickel capacity (ttpa)",
            "Sinter plant capacity (ttpa)",
            "Coking plant capacity (ttpa)",
            "Pelletizing plant capacity (ttpa)",
            "Category steel product",
            "Main production process",
            "Municipality",
        ]
    ]

    # Keep only operating steel plants
    df_steel = df_steel.loc[df_steel["Status"] == "operating"]

    # Create a column with iso2 country code
    cc = coco.CountryConverter()
    Country = pd.Series(df_steel["Country"])
    df_steel["country"] = cc.pandas_convert(series=Country, to="ISO2")

    # Split "Coordinates" ("lat, lon") into y and x columns
    df_steel[["y", "x"]] = df_steel["Coordinates"].str.split(",", expand=True)

    # Drop Coordinates column as it contains a ',' and is not needed anymore
    df_steel = df_steel.drop(columns="Coordinates", axis=1)

    # Fetch steel plants that use both DRI and BF techs and drop them from main df
    mixed_steel_plants = df_steel[
        df_steel["Main production process"] == "integrated (BF and DRI)"
    ].copy()
    df_steel = df_steel.drop(mixed_steel_plants.index)

    # Separate the two techs in two dataframes
    DRI_share = mixed_steel_plants.copy()
    BF_share = mixed_steel_plants.copy()
    BF_share["Main production process"] = "integrated (BF)"
    DRI_share["Main production process"] = "integrated (DRI)"

    # Apportion crude-steel capacity by iron-making capacities.
    # NOTE(review): assumes "Nominal iron capacity (ttpa)" is non-zero for
    # every mixed plant — a zero would propagate inf/NaN; verify upstream data.
    BF_share["Nominal crude steel capacity (ttpa)"] = BF_share[
        "Nominal crude steel capacity (ttpa)"
    ] * mixed_steel_plants.apply(
        lambda x: x["Nominal BF capacity (ttpa)"] / x["Nominal iron capacity (ttpa)"],
        axis=1,
    )
    DRI_share["Nominal crude steel capacity (ttpa)"] = (
        mixed_steel_plants["Nominal crude steel capacity (ttpa)"]
        - BF_share["Nominal crude steel capacity (ttpa)"]
    )

    # Add suffix to the index to differentiate between them in the main df
    DRI_share.index += "_DRI"
    BF_share.index += "_BF"

    # Merge them back to the main df
    df_steel = pd.concat([df_steel, BF_share, DRI_share])

    # Remove plants with unknown production technology
    unknown_ind = df_steel[
        df_steel["Main production process"].str.contains("unknown")
    ].index
    df_steel = df_steel.drop(unknown_ind)
    if len(unknown_ind) > 0:
        print(
            "dropped {0} steel/iron plants with unknown production technology of total {1} plants".format(
                len(unknown_ind), len(df_steel)
            )
        )

    # Dict to map the technology names of the source to that expected in the workflow
    iron_techs = {
        "electric": "Electric arc",
        "integrated (BF)": "Integrated steelworks",
        "integrated (DRI)": "DRI + Electric arc",
        "ironmaking (BF)": "Integrated steelworks",
        "ironmaking (DRI)": "DRI + Electric arc",
        "oxygen": "Integrated steelworks",
        "electric, oxygen": "Electric arc",
    }

    # Iron-making-only plants report no crude-steel capacity: use iron capacity
    iron_making = df_steel[
        df_steel["Main production process"].str.contains("ironmaking")
    ].index
    df_steel.loc[iron_making, "Nominal crude steel capacity (ttpa)"] = df_steel.loc[
        iron_making, "Nominal iron capacity (ttpa)"
    ]
    df_steel["unit"] = "kt/yr"
    df_steel["quality"] = "exact"
    df_steel = df_steel.reset_index()
    df_steel = df_steel.rename(
        columns={
            "Nominal crude steel capacity (ttpa)": "capacity",
            "Municipality": "location",
            "Plant ID": "ID",
        }
    )
    df_steel.capacity = pd.to_numeric(df_steel.capacity)
    df_steel["technology"] = df_steel["Main production process"].apply(
        lambda x: iron_techs[x]
    )
    # FIX: parse coordinate strings with float() instead of eval() —
    # equivalent for numeric strings and not an arbitrary-code-execution risk.
    df_steel.x = df_steel.x.astype(float)
    df_steel.y = df_steel.y.astype(float)

    return df_steel[
        [
            "country",
            "y",
            "x",
            "location",
            "technology",
            "capacity",
            "unit",
            "quality",
            "ID",
        ]
    ].dropna()


def create_cement_db():
    """Build the cement plant table from the SFI Global Cement Database.

    Source: https://www.cgfi.ac.uk/spatial-finance-initiative/geoasset-project/cement/
    (~3117 plants).  Plants with unknown capacity get their country average.
    """
    fn = "https://www.cgfi.ac.uk/wp-content/uploads/2021/08/SFI-Global-Cement-Database-July-2021.xlsx"
    storage_options = {"User-Agent": "Mozilla/5.0"}
    cement_orig = pd.read_excel(
        fn,
        index_col=0,
        storage_options=storage_options,
        sheet_name="SFI_ALD_Cement_Database",
        header=0,
    )

    df_cement = cement_orig.copy()
    df_cement = df_cement[
        [
            "country",
            "iso3",
            "latitude",
            "longitude",
            "status",
            "plant_type",
            "capacity",
            "year",
            "city",
        ]
    ]
    df_cement = df_cement.rename(
        columns={
            "country": "Country",
            "latitude": "y",
            "longitude": "x",
            "city": "location",
        }
    )
    df_cement["unit"] = "Kt/yr"
    df_cement["technology"] = "Cement"
    df_cement["capacity"] = df_cement["capacity"] * 1000  # Mt/yr -> Kt/yr

    # Keep only operating cement plants
    df_cement = df_cement.loc[df_cement["status"] == "Operating"]

    # Create a column with iso2 country code
    cc = coco.CountryConverter()
    iso3 = pd.Series(df_cement["iso3"])
    df_cement["country"] = cc.pandas_convert(series=iso3, to="ISO2")

    # Plants with unknown capacity are kept, set to their country average
    na_index = df_cement[df_cement.capacity.isna()].index
    print(
        "There are {} out of {} total cement plants with unknown capacities, setting value to country average".format(
            len(na_index), len(df_cement)
        )
    )
    avg_c_cap = df_cement.groupby(df_cement.country)["capacity"].mean()
    df_cement["capacity"] = df_cement.apply(
        lambda x: avg_c_cap[x["country"]]
        if math.isnan(x["capacity"])
        else x["capacity"],
        axis=1,
    )

    df_cement["quality"] = "actual"
    # TODO(review): rows filled with the country average should probably be
    # flagged differently (e.g. "average"); kept "actual" to preserve current
    # downstream behaviour — confirm before changing.
    df_cement.loc[na_index, "quality"] = "actual"

    df_cement = df_cement.reset_index()
    df_cement = df_cement.rename(columns={"uid": "ID"})
    df_cement.capacity = pd.to_numeric(df_cement.capacity)

    return df_cement[
        [
            "country",
            "y",
            "x",
            "location",
            "technology",
            "capacity",
            "unit",
            "quality",
            "ID",
        ]
    ]


def create_refineries_df():
    """Build the oil refinery table from the public ArcGIS feature service.

    Sources:
    https://www.arcgis.com/home/item.html?id=a6979b6bccbf4e719de3f703ea799259&sublayer=0#data
    https://www.arcgis.com/home/item.html?id=a917ac2766bc47e1877071f0201b6280
    (~536 refineries, capacities in barrels per day).
    """
    base_url = "https://services.arcgis.com"
    facts = "/jDGuO8tYggdCCnUJ/arcgis/rest/services/Global_Oil_Refinery_Complex_and_Daily_Capacity/FeatureServer/0/query?f=json&where=1%3D1&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*&orderByFields=FID%20ASC&resultOffset=0&resultRecordCount=537&cacheHint=true&quantizationParameters=%7B%22mode%22%3A%22edit%22%7D"

    first_response = requests.get(base_url + facts)
    response_list = first_response.json()

    # Flatten the ArcGIS attribute records into plain dicts
    fields = [
        "FID_",
        "Company",
        "Name",
        "City",
        "Facility",
        "Prov_State",
        "Country",
        "Address",
        "Zip",
        "County",
        "PADD",
        "Capacity",
        "Longitude",
        "Latitude",
        "Markets",
        "CORPORATIO",
    ]
    data = [
        {field: response["attributes"].get(field) for field in fields}
        for response in response_list["features"]
    ]

    df = pd.DataFrame(data)

    # Resolve country codes by name first, then geocode the leftovers
    df = get_cocode_from_name(df, "Country")
    df_nans = df[df.country.isna()]
    df = df.dropna(axis=0)
    df_bylocation = get_cocode_from_coords(df_nans)
    df_refineries = pd.concat([df, df_bylocation])

    df_refineries["unit"] = "bpd"
    df_refineries["quality"] = "exact"
    # NOTE(review): refineries are labelled "HVC" (high-value chemicals) to
    # match the workflow's industry technology naming — confirm intent.
    df_refineries["technology"] = "HVC"

    df_refineries = df_refineries.rename(
        columns={
            "Capacity": "capacity",
            "Prov_State": "location",
            "Latitude": "y",
            "Longitude": "x",
            "FID_": "ID",
        }
    )
    df_refineries = df_refineries.reset_index()
    df_refineries.capacity = pd.to_numeric(df_refineries.capacity)

    return df_refineries[
        [
            "country",
            "y",
            "x",
            "location",
            "technology",
            "capacity",
            "unit",
            "quality",
            "ID",
        ]
    ]


def create_paper_df():
    """Build the pulp & paper plant table from the SFI Latin-America sample.

    Source: https://www.cgfi.ac.uk/spatial-finance-initiative/geoasset-project/
    Plants with unknown capacity get their country average; plants whose
    country has no data at all get a placeholder capacity of 1.0.
    """
    fn = "https://www.cgfi.ac.uk/wp-content/uploads/2023/03/SFI_ALD_Pulp_Paper_Sample_LatAm_Jan_2023.xlsx"
    storage_options = {"User-Agent": "Mozilla/5.0"}
    paper_orig = pd.read_excel(
        fn,
        index_col=0,
        storage_options=storage_options,
        sheet_name="SFI_ALD_PPM_LatAm",
        header=0,
    )

    df_paper = paper_orig.copy()
    df_paper = df_paper[
        [
            "country",
            "iso3",
            "latitude",
            "longitude",
            "status",
            "primary_product",
            "capacity_paper",
            "city",
        ]
    ]

    df_paper = df_paper.rename(
        columns={
            "country": "Country",
            "latitude": "y",
            "longitude": "x",
            "city": "location",
            "capacity_paper": "capacity",
        }
    )
    df_paper["unit"] = "10kt/yr"
    df_paper["technology"] = "Paper"

    # FIX: the original predicate `type(x) == int or type(x) == int == float`
    # never matched floats (the chained comparison is always False), silently
    # discarding every float capacity.  Accept any real number, NaN the rest
    # (e.g. "No information" strings).
    df_paper.capacity = df_paper.capacity.apply(
        lambda x: x if isinstance(x, (int, float)) else np.nan
    )

    # Create a column with iso2 country code
    cc = coco.CountryConverter()
    iso3 = pd.Series(df_paper["iso3"])
    df_paper["country"] = cc.pandas_convert(series=iso3, to="ISO2")

    # Plants with unknown capacity are set to their country average
    na_index = df_paper[df_paper.capacity.isna()].index
    print(
        "There are {} out of {} total paper plants with unknown capacities, setting value to country average".format(
            len(na_index), len(df_paper)
        )
    )
    avg_c_cap = df_paper.groupby(df_paper.country)["capacity"].mean()

    df_paper["capacity"] = df_paper.apply(
        lambda x: avg_c_cap[x["country"]]
        if math.isnan(x["capacity"])
        else x["capacity"],
        axis=1,
    )

    df_paper["quality"] = "actual"
    # TODO(review): as for cement, averaged rows arguably deserve a different
    # quality flag; kept "actual" to preserve behaviour.
    df_paper.loc[na_index, "quality"] = "actual"
    df_paper.capacity = pd.to_numeric(df_paper.capacity)

    df_paper = df_paper.reset_index()
    df_paper = df_paper.rename(columns={"uid": "ID"})

    industrial_database_paper = df_paper[
        [
            "country",
            "y",
            "x",
            "location",
            "technology",
            "capacity",
            "unit",
            "quality",
            "ID",
        ]
    ]

    # Rows without coordinates cannot be placed on the map: drop them, and
    # give remaining capacity gaps (countries with no data) a 1.0 placeholder.
    no_info_index = industrial_database_paper[
        industrial_database_paper.y == "No information"
    ].index
    print(
        "Setting {} plants of countries with no capacity data to 1.0".format(
            len(na_index)
        )
    )
    industrial_database_paper = industrial_database_paper.drop(no_info_index)
    industrial_database_paper.capacity = industrial_database_paper.capacity.fillna(1)

    return industrial_database_paper


if __name__ == "__main__":
    if "snakemake" not in globals():
        from helpers import mock_snakemake, sets_path_to_root

        os.chdir(os.path.dirname(os.path.abspath(__file__)))

        snakemake = mock_snakemake(
            "build_industrial_database",
            simpl="",
            clusters="10",
            ll="c1.0",
            opts="Co2L",
            planning_horizons="2030",
            sopts="144H",
            discountrate="0.071",
            demand="DF",
        )

    industrial_database_steel = create_steel_db()
    industrial_database_cement = create_cement_db()
    industrial_database_refineries = create_refineries_df()
    industrial_database_paper = create_paper_df()

    industrial_database = pd.concat(
        [
            industrial_database_steel,
            industrial_database_cement,
            industrial_database_refineries,
            industrial_database_paper,
        ]
    )

    industrial_database.to_csv(
        snakemake.output["industrial_database"], header=True, index=False
    )