# -*- coding: utf-8 -*-
"""
Build a georeferenced global industrial plant database (steel, cement, oil
refineries, pulp & paper) and write it to ``data/industrial_database.csv``.

Every per-sector builder returns a DataFrame with the same columns::

    country, y, x, location, technology, capacity, unit, quality, ID

where ``y``/``x`` are latitude/longitude, ``country`` is an ISO2 code and
``quality`` flags how trustworthy the capacity figure is.

NOTE(review): this file reconstructs a whitespace-mangled git patch.  Besides
this new script the original patch also contained three one-line hunks that
cannot be re-emitted as a byte-faithful diff from the mangled source:

* ``.gitignore``: add ``/data/demand/unsd/data`` and
  ``/data/industrial_database.csv``.
* ``Snakefile``: new rule ``build_industrial_database`` with output
  ``industrial_database="data/industrial_database.csv"`` running
  ``scripts/build_industrial_database.py``.
* ``scripts/build_industrial_distribution_key.py``: add
  ``geo_locs.capacity = pd.to_numeric(geo_locs.capacity)`` after reading the
  database.
"""

import json
import math
import os
import pprint
from pathlib import Path

import country_converter as coco
import matplotlib.pyplot as plt
import numpy as np
import openpyxl  # required by pandas.read_excel for .xlsx files
import pandas as pd
import pycountry
import requests
import seaborn as sns
from geopy.geocoders import Nominatim


def get_cocode_from_name(df, country_column_name):
    """Map full country names to ISO2 codes into a new ``country`` column.

    NOTE: this matches against ``pycountry`` *official* names only, so
    colloquial spellings (e.g. "Russia" vs "Russian Federation") map to NaN;
    such rows are geocoded afterwards via get_cocode_from_coords().
    """
    country_codes = {country.name: country.alpha_2 for country in pycountry.countries}
    df["country"] = df[country_column_name].map(country_codes)
    return df


def get_cocode_from_coords(df):
    """Fill a ``country`` (ISO2) column by reverse-geocoding Latitude/Longitude.

    Uses Nominatim, one request per row — slow and rate-limited, so it is only
    applied to the rows that name matching could not resolve.
    """
    geolocator = Nominatim(user_agent="geoapi")

    country_codes = []
    for index, row in df.iterrows():
        location = geolocator.reverse(
            (row["Latitude"], row["Longitude"]), exactly_one=True
        )
        if location and location.raw.get("address", {}).get("country_code"):
            country_codes.append(location.raw["address"]["country_code"].upper())
        else:
            # Geocoder could not resolve the point; leave the row unresolved.
            country_codes.append(None)

    df["country"] = country_codes
    return df


def create_steel_db():
    """Build the steel/iron plant table from the Global Steel Plant Tracker.

    Source: Global Energy Monitor, 2023-03 release (~1433 plants).
    Mixed BF+DRI integrated plants are split into two pseudo-plants whose
    crude-steel capacities are apportioned by their iron-making capacities.
    """
    fn = "https://globalenergymonitor.org/wp-content/uploads/2023/03/Global-Steel-Plant-Tracker-2023-03.xlsx"
    storage_options = {"User-Agent": "Mozilla/5.0"}
    steel_orig = pd.read_excel(
        fn,
        index_col=0,
        storage_options=storage_options,
        sheet_name="Steel Plants",
        header=0,
    )

    df_steel = steel_orig.copy()
    df_steel = df_steel[
        [
            "Plant name (English)",
            "Country",
            "Coordinates",
            "Coordinate accuracy",
            "Status",
            "Start date",
            "Plant age (years)",
            "Nominal crude steel capacity (ttpa)",
            "Nominal BOF steel capacity (ttpa)",
            "Nominal EAF steel capacity (ttpa)",
            "Nominal OHF steel capacity (ttpa)",
            "Nominal iron capacity (ttpa)",
            "Nominal BF capacity (ttpa)",
            "Nominal DRI capacity (ttpa)",
            "Ferronickel capacity (ttpa)",
            "Sinter plant capacity (ttpa)",
            "Coking plant capacity (ttpa)",
            "Pelletizing plant capacity (ttpa)",
            "Category steel product",
            "Main production process",
            "Municipality",
        ]
    ]

    # Keep only operating steel plants
    df_steel = df_steel.loc[df_steel["Status"] == "operating"]

    # Create a column with iso2 country code
    cc = coco.CountryConverter()
    Country = pd.Series(df_steel["Country"])
    df_steel["country"] = cc.pandas_convert(series=Country, to="ISO2")

    # Split "Coordinates" ("lat, lon") into y and x columns
    df_steel[["y", "x"]] = df_steel["Coordinates"].str.split(",", expand=True)

    # Drop Coordinates column as it contains a ',' and is not needed anymore
    df_steel = df_steel.drop(columns="Coordinates", axis=1)

    # Fetch steel plants that use both DRI and BF techs and drop them from main df
    mixed_steel_plants = df_steel[
        df_steel["Main production process"] == "integrated (BF and DRI)"
    ].copy()
    df_steel = df_steel.drop(mixed_steel_plants.index)

    # Separate the two techs in two dataframes
    DRI_share = mixed_steel_plants.copy()
    BF_share = mixed_steel_plants.copy()
    BF_share["Main production process"] = "integrated (BF)"
    DRI_share["Main production process"] = "integrated (DRI)"

    # Apportion crude-steel capacity by iron-making capacities.
    # NOTE(review): assumes "Nominal iron capacity (ttpa)" is non-zero for
    # every mixed plant — a zero would propagate inf/NaN; verify upstream data.
    BF_share["Nominal crude steel capacity (ttpa)"] = BF_share[
        "Nominal crude steel capacity (ttpa)"
    ] * mixed_steel_plants.apply(
        lambda x: x["Nominal BF capacity (ttpa)"] / x["Nominal iron capacity (ttpa)"],
        axis=1,
    )
    DRI_share["Nominal crude steel capacity (ttpa)"] = (
        mixed_steel_plants["Nominal crude steel capacity (ttpa)"]
        - BF_share["Nominal crude steel capacity (ttpa)"]
    )

    # Add suffix to the index to differentiate between them in the main df
    DRI_share.index += "_DRI"
    BF_share.index += "_BF"

    # Merge them back to the main df
    df_steel = pd.concat([df_steel, BF_share, DRI_share])

    # Remove plants with unknown production technology
    unknown_ind = df_steel[
        df_steel["Main production process"].str.contains("unknown")
    ].index
    df_steel = df_steel.drop(unknown_ind)
    if len(unknown_ind) > 0:
        print(
            "dropped {0} steel/iron plants with unknown production technology of total {1} plants".format(
                len(unknown_ind), len(df_steel)
            )
        )

    # Dict to map the technology names of the source to that expected in the workflow
    iron_techs = {
        "electric": "Electric arc",
        "integrated (BF)": "Integrated steelworks",
        "integrated (DRI)": "DRI + Electric arc",
        "ironmaking (BF)": "Integrated steelworks",
        "ironmaking (DRI)": "DRI + Electric arc",
        "oxygen": "Integrated steelworks",
        "electric, oxygen": "Electric arc",
    }

    # Iron-making-only plants report no crude-steel capacity: use iron capacity
    iron_making = df_steel[
        df_steel["Main production process"].str.contains("ironmaking")
    ].index
    df_steel.loc[iron_making, "Nominal crude steel capacity (ttpa)"] = df_steel.loc[
        iron_making, "Nominal iron capacity (ttpa)"
    ]
    df_steel["unit"] = "kt/yr"
    df_steel["quality"] = "exact"
    df_steel = df_steel.reset_index()
    df_steel = df_steel.rename(
        columns={
            "Nominal crude steel capacity (ttpa)": "capacity",
            "Municipality": "location",
            "Plant ID": "ID",
        }
    )
    df_steel.capacity = pd.to_numeric(df_steel.capacity)
    df_steel["technology"] = df_steel["Main production process"].apply(
        lambda x: iron_techs[x]
    )
    # FIX: parse coordinate strings with float() instead of eval() —
    # equivalent for numeric strings and not an arbitrary-code-execution risk.
    df_steel.x = df_steel.x.astype(float)
    df_steel.y = df_steel.y.astype(float)

    return df_steel[
        [
            "country",
            "y",
            "x",
            "location",
            "technology",
            "capacity",
            "unit",
            "quality",
            "ID",
        ]
    ].dropna()


def create_cement_db():
    """Build the cement plant table from the SFI Global Cement Database.

    Source: https://www.cgfi.ac.uk/spatial-finance-initiative/geoasset-project/cement/
    (~3117 plants).  Plants with unknown capacity get their country average.
    """
    fn = "https://www.cgfi.ac.uk/wp-content/uploads/2021/08/SFI-Global-Cement-Database-July-2021.xlsx"
    storage_options = {"User-Agent": "Mozilla/5.0"}
    cement_orig = pd.read_excel(
        fn,
        index_col=0,
        storage_options=storage_options,
        sheet_name="SFI_ALD_Cement_Database",
        header=0,
    )

    df_cement = cement_orig.copy()
    df_cement = df_cement[
        [
            "country",
            "iso3",
            "latitude",
            "longitude",
            "status",
            "plant_type",
            "capacity",
            "year",
            "city",
        ]
    ]
    df_cement = df_cement.rename(
        columns={
            "country": "Country",
            "latitude": "y",
            "longitude": "x",
            "city": "location",
        }
    )
    df_cement["unit"] = "Kt/yr"
    df_cement["technology"] = "Cement"
    df_cement["capacity"] = df_cement["capacity"] * 1000  # Mt/yr -> Kt/yr

    # Keep only operating cement plants
    df_cement = df_cement.loc[df_cement["status"] == "Operating"]

    # Create a column with iso2 country code
    cc = coco.CountryConverter()
    iso3 = pd.Series(df_cement["iso3"])
    df_cement["country"] = cc.pandas_convert(series=iso3, to="ISO2")

    # Plants with unknown capacity are kept, set to their country average
    na_index = df_cement[df_cement.capacity.isna()].index
    print(
        "There are {} out of {} total cement plants with unknown capacities, setting value to country average".format(
            len(na_index), len(df_cement)
        )
    )
    avg_c_cap = df_cement.groupby(df_cement.country)["capacity"].mean()
    df_cement["capacity"] = df_cement.apply(
        lambda x: avg_c_cap[x["country"]]
        if math.isnan(x["capacity"])
        else x["capacity"],
        axis=1,
    )

    df_cement["quality"] = "actual"
    # TODO(review): rows filled with the country average should probably be
    # flagged differently (e.g. "average"); kept "actual" to preserve current
    # downstream behaviour — confirm before changing.
    df_cement.loc[na_index, "quality"] = "actual"

    df_cement = df_cement.reset_index()
    df_cement = df_cement.rename(columns={"uid": "ID"})
    df_cement.capacity = pd.to_numeric(df_cement.capacity)

    return df_cement[
        [
            "country",
            "y",
            "x",
            "location",
            "technology",
            "capacity",
            "unit",
            "quality",
            "ID",
        ]
    ]


def create_refineries_df():
    """Build the oil refinery table from the public ArcGIS feature service.

    Sources:
    https://www.arcgis.com/home/item.html?id=a6979b6bccbf4e719de3f703ea799259&sublayer=0#data
    https://www.arcgis.com/home/item.html?id=a917ac2766bc47e1877071f0201b6280
    (~536 refineries, capacities in barrels per day).
    """
    base_url = "https://services.arcgis.com"
    facts = "/jDGuO8tYggdCCnUJ/arcgis/rest/services/Global_Oil_Refinery_Complex_and_Daily_Capacity/FeatureServer/0/query?f=json&where=1%3D1&returnGeometry=false&spatialRel=esriSpatialRelIntersects&outFields=*&orderByFields=FID%20ASC&resultOffset=0&resultRecordCount=537&cacheHint=true&quantizationParameters=%7B%22mode%22%3A%22edit%22%7D"

    first_response = requests.get(base_url + facts)
    response_list = first_response.json()

    # Flatten the ArcGIS attribute records into plain dicts
    fields = [
        "FID_",
        "Company",
        "Name",
        "City",
        "Facility",
        "Prov_State",
        "Country",
        "Address",
        "Zip",
        "County",
        "PADD",
        "Capacity",
        "Longitude",
        "Latitude",
        "Markets",
        "CORPORATIO",
    ]
    data = [
        {field: response["attributes"].get(field) for field in fields}
        for response in response_list["features"]
    ]

    df = pd.DataFrame(data)

    # Resolve country codes by name first, then geocode the leftovers
    df = get_cocode_from_name(df, "Country")
    df_nans = df[df.country.isna()]
    df = df.dropna(axis=0)
    df_bylocation = get_cocode_from_coords(df_nans)
    df_refineries = pd.concat([df, df_bylocation])

    df_refineries["unit"] = "bpd"
    df_refineries["quality"] = "exact"
    # NOTE(review): refineries are labelled "HVC" (high-value chemicals) to
    # match the workflow's industry technology naming — confirm intent.
    df_refineries["technology"] = "HVC"

    df_refineries = df_refineries.rename(
        columns={
            "Capacity": "capacity",
            "Prov_State": "location",
            "Latitude": "y",
            "Longitude": "x",
            "FID_": "ID",
        }
    )
    df_refineries = df_refineries.reset_index()
    df_refineries.capacity = pd.to_numeric(df_refineries.capacity)

    return df_refineries[
        [
            "country",
            "y",
            "x",
            "location",
            "technology",
            "capacity",
            "unit",
            "quality",
            "ID",
        ]
    ]


def create_paper_df():
    """Build the pulp & paper plant table from the SFI Latin-America sample.

    Source: https://www.cgfi.ac.uk/spatial-finance-initiative/geoasset-project/
    Plants with unknown capacity get their country average; plants whose
    country has no data at all get a placeholder capacity of 1.0.
    """
    fn = "https://www.cgfi.ac.uk/wp-content/uploads/2023/03/SFI_ALD_Pulp_Paper_Sample_LatAm_Jan_2023.xlsx"
    storage_options = {"User-Agent": "Mozilla/5.0"}
    paper_orig = pd.read_excel(
        fn,
        index_col=0,
        storage_options=storage_options,
        sheet_name="SFI_ALD_PPM_LatAm",
        header=0,
    )

    df_paper = paper_orig.copy()
    df_paper = df_paper[
        [
            "country",
            "iso3",
            "latitude",
            "longitude",
            "status",
            "primary_product",
            "capacity_paper",
            "city",
        ]
    ]

    df_paper = df_paper.rename(
        columns={
            "country": "Country",
            "latitude": "y",
            "longitude": "x",
            "city": "location",
            "capacity_paper": "capacity",
        }
    )
    df_paper["unit"] = "10kt/yr"
    df_paper["technology"] = "Paper"

    # FIX: the original predicate `type(x) == int or type(x) == int == float`
    # never matched floats (the chained comparison is always False), silently
    # discarding every float capacity.  Accept any real number, NaN the rest
    # (e.g. "No information" strings).
    df_paper.capacity = df_paper.capacity.apply(
        lambda x: x if isinstance(x, (int, float)) else np.nan
    )

    # Create a column with iso2 country code
    cc = coco.CountryConverter()
    iso3 = pd.Series(df_paper["iso3"])
    df_paper["country"] = cc.pandas_convert(series=iso3, to="ISO2")

    # Plants with unknown capacity are set to their country average
    na_index = df_paper[df_paper.capacity.isna()].index
    print(
        "There are {} out of {} total paper plants with unknown capacities, setting value to country average".format(
            len(na_index), len(df_paper)
        )
    )
    avg_c_cap = df_paper.groupby(df_paper.country)["capacity"].mean()

    df_paper["capacity"] = df_paper.apply(
        lambda x: avg_c_cap[x["country"]]
        if math.isnan(x["capacity"])
        else x["capacity"],
        axis=1,
    )

    df_paper["quality"] = "actual"
    # TODO(review): as for cement, averaged rows arguably deserve a different
    # quality flag; kept "actual" to preserve behaviour.
    df_paper.loc[na_index, "quality"] = "actual"
    df_paper.capacity = pd.to_numeric(df_paper.capacity)

    df_paper = df_paper.reset_index()
    df_paper = df_paper.rename(columns={"uid": "ID"})

    industrial_database_paper = df_paper[
        [
            "country",
            "y",
            "x",
            "location",
            "technology",
            "capacity",
            "unit",
            "quality",
            "ID",
        ]
    ]

    # Rows without coordinates cannot be placed on the map: drop them, and
    # give remaining capacity gaps (countries with no data) a 1.0 placeholder.
    no_info_index = industrial_database_paper[
        industrial_database_paper.y == "No information"
    ].index
    print(
        "Setting {} plants of countries with no capacity data to 1.0".format(
            len(na_index)
        )
    )
    industrial_database_paper = industrial_database_paper.drop(no_info_index)
    industrial_database_paper.capacity = industrial_database_paper.capacity.fillna(1)

    return industrial_database_paper


if __name__ == "__main__":
    if "snakemake" not in globals():
        from helpers import mock_snakemake, sets_path_to_root

        os.chdir(os.path.dirname(os.path.abspath(__file__)))

        snakemake = mock_snakemake(
            "build_industrial_database",
            simpl="",
            clusters="10",
            ll="c1.0",
            opts="Co2L",
            planning_horizons="2030",
            sopts="144H",
            discountrate="0.071",
            demand="DF",
        )

    industrial_database_steel = create_steel_db()
    industrial_database_cement = create_cement_db()
    industrial_database_refineries = create_refineries_df()
    industrial_database_paper = create_paper_df()

    industrial_database = pd.concat(
        [
            industrial_database_steel,
            industrial_database_cement,
            industrial_database_refineries,
            industrial_database_paper,
        ]
    )

    industrial_database.to_csv(
        snakemake.output["industrial_database"], header=True, index=False
    )