diff --git a/tabular-playground-feb2021.ipynb b/tabular-playground-feb2021.ipynb new file mode 100644 index 0000000..24ca3bf --- /dev/null +++ b/tabular-playground-feb2021.ipynb @@ -0,0 +1,3528 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\nimport os\\nimport sys\\nfrom tempfile import NamedTemporaryFile\\nfrom urllib.request import urlopen\\nfrom urllib.parse import unquote\\nfrom urllib.error import HTTPError\\nfrom zipfile import ZipFile\\n\\nCHUNK_SIZE = 40960 \\nDATASET_MAPPING = \\'tabular-playground-series-feb-2021:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F25225%2F1923495%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20210208%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20210208T152806Z%26X-Goog-Expires%3D259199%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Daa7afbe87c577f16206b41ef193717b54b23ff6ba665e6f18f4d03f107f08c34d3ad6f5afa301d1b0f289b6d82a9101c38ab20a8fa96346fefb5446bd298f44daac2c5b38eaaa52644635d0dcea55f6beb935a1f575478f425a15e0cf0dd9434ff9962355512a40b5504a12e5a70d636276a611c8ee194987c6e8ded3b36877f6c0875afd75333b0df4d1fdf6051eb1711f7779f4f6542360cb860b6e6e9afa88a51c27cabf129a460f6f1a72850e889232370ef0ee1f2513cb44d47cc87faa6204be8ddc5b4e2d9fd28b5dcf704ef5e65bdd969dc18d0079586d8c79c12e65057a49827978d009ed6797670d2a28f2174aba5fcbfdf9db7306d59f50d11da8d\\'\\nKAGGLE_INPUT_PATH=\\'/home/kaggle/input\\'\\nKAGGLE_INPUT_SYMLINK=\\'/kaggle\\'\\n\\nos.makedirs(KAGGLE_INPUT_PATH, 777)\\nos.symlink(KAGGLE_INPUT_PATH, os.path.join(\\'..\\', \\'input\\'), target_is_directory=True)\\nos.makedirs(KAGGLE_INPUT_SYMLINK)\\nos.symlink(KAGGLE_INPUT_PATH, os.path.join(KAGGLE_INPUT_SYMLINK, \\'input\\'), target_is_directory=True)\\n\\nfor dataset_mapping in DATASET_MAPPING.split(\\',\\'):\\n directory, download_url_encoded = 
dataset_mapping.split(\\':\\')\\n download_url = unquote(download_url_encoded)\\n destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)\\n try:\\n with urlopen(download_url) as zipfileres, NamedTemporaryFile() as tfile:\\n total_length = zipfileres.headers[\\'content-length\\']\\n print(f\\'Downloading {directory}, {total_length} bytes zipped\\')\\n dl = 0\\n data = zipfileres.read(CHUNK_SIZE)\\n while len(data) > 0:\\n dl += len(data)\\n tfile.write(data)\\n done = int(50 * dl / int(total_length))\\n sys.stdout.write(f\"\\r[{\\'=\\' * done}{\\' \\' * (50-done)}] {dl} bytes downloaded\")\\n sys.stdout.flush()\\n data = zipfileres.read(CHUNK_SIZE)\\n print(f\\'\\nUnzipping {directory}\\')\\n with ZipFile(tfile) as zfile:\\n zfile.extractall(destination_path)\\n except HTTPError as e:\\n print(f\\'Failed to load (likely expired) {download_url} to path {destination_path}\\')\\n continue\\n except OSError as e:\\n print(f\\'Failed to load {download_url} to path {destination_path}\\')\\n continue\\nprint(\\'Dataset import complete.\\')\\n'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATASETS\n", + "# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,\n", + "# THEN FEEL FREE TO DELETE CELL.\n", + "'''\n", + "import os\n", + "import sys\n", + "from tempfile import NamedTemporaryFile\n", + "from urllib.request import urlopen\n", + "from urllib.parse import unquote\n", + "from urllib.error import HTTPError\n", + "from zipfile import ZipFile\n", + "\n", + "CHUNK_SIZE = 40960 \n", + "DATASET_MAPPING = 
'tabular-playground-series-feb-2021:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F25225%2F1923495%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20210208%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20210208T152806Z%26X-Goog-Expires%3D259199%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Daa7afbe87c577f16206b41ef193717b54b23ff6ba665e6f18f4d03f107f08c34d3ad6f5afa301d1b0f289b6d82a9101c38ab20a8fa96346fefb5446bd298f44daac2c5b38eaaa52644635d0dcea55f6beb935a1f575478f425a15e0cf0dd9434ff9962355512a40b5504a12e5a70d636276a611c8ee194987c6e8ded3b36877f6c0875afd75333b0df4d1fdf6051eb1711f7779f4f6542360cb860b6e6e9afa88a51c27cabf129a460f6f1a72850e889232370ef0ee1f2513cb44d47cc87faa6204be8ddc5b4e2d9fd28b5dcf704ef5e65bdd969dc18d0079586d8c79c12e65057a49827978d009ed6797670d2a28f2174aba5fcbfdf9db7306d59f50d11da8d'\n", + "KAGGLE_INPUT_PATH='/home/kaggle/input'\n", + "KAGGLE_INPUT_SYMLINK='/kaggle'\n", + "\n", + "os.makedirs(KAGGLE_INPUT_PATH, 777)\n", + "os.symlink(KAGGLE_INPUT_PATH, os.path.join('..', 'input'), target_is_directory=True)\n", + "os.makedirs(KAGGLE_INPUT_SYMLINK)\n", + "os.symlink(KAGGLE_INPUT_PATH, os.path.join(KAGGLE_INPUT_SYMLINK, 'input'), target_is_directory=True)\n", + "\n", + "for dataset_mapping in DATASET_MAPPING.split(','):\n", + " directory, download_url_encoded = dataset_mapping.split(':')\n", + " download_url = unquote(download_url_encoded)\n", + " destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)\n", + " try:\n", + " with urlopen(download_url) as zipfileres, NamedTemporaryFile() as tfile:\n", + " total_length = zipfileres.headers['content-length']\n", + " print(f'Downloading {directory}, {total_length} bytes zipped')\n", + " dl = 0\n", + " data = zipfileres.read(CHUNK_SIZE)\n", + " while len(data) > 0:\n", + " dl += len(data)\n", + " tfile.write(data)\n", + " done = int(50 * dl / int(total_length))\n", + " 
sys.stdout.write(f\"\\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded\")\n", + " sys.stdout.flush()\n", + " data = zipfileres.read(CHUNK_SIZE)\n", + " print(f'\\nUnzipping {directory}')\n", + " with ZipFile(tfile) as zfile:\n", + " zfile.extractall(destination_path)\n", + " except HTTPError as e:\n", + " print(f'Failed to load (likely expired) {download_url} to path {destination_path}')\n", + " continue\n", + " except OSError as e:\n", + " print(f'Failed to load {download_url} to path {destination_path}')\n", + " continue\n", + "print('Dataset import complete.')\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", + "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '../input/tabular-playground-series-feb-2021/train.csv'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtrain_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'../input/tabular-playground-series-feb-2021/train.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mtest_df\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'../input/tabular-playground-series-feb-2021/test.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.8/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)\u001b[0m\n\u001b[1;32m 684\u001b[0m )\n\u001b[1;32m 685\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 686\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 687\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 688\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.8/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 450\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 451\u001b[0m \u001b[0;31m# Create the parser.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 452\u001b[0;31m \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfp_or_buf\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 453\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 454\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mchunksize\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0miterator\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.8/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 944\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"has_index_names\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"has_index_names\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 945\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 946\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 947\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 948\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.8/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m_make_engine\u001b[0;34m(self, engine)\u001b[0m\n\u001b[1;32m 1176\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mengine\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"c\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1177\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"c\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1178\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mCParserWrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1179\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1180\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"python\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.8/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, src, **kwds)\u001b[0m\n\u001b[1;32m 2006\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"usecols\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0musecols\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2007\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2008\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparsers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTextReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2009\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munnamed_cols\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munnamed_cols\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2010\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.__cinit__\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._setup_parser_source\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../input/tabular-playground-series-feb-2021/train.csv'" + ] + } + ], + "source": [ + "train_df = pd.read_csv('../input/tabular-playground-series-feb-2021/train.csv')\n", + "test_df = pd.read_csv('../input/tabular-playground-series-feb-2021/test.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcat0cat1cat2cat3cat4cat5cat6cat7cat8...cont5cont6cont7cont8cont9cont10cont11cont12cont13target
01ABAABDAEC...0.8811220.4216500.7414130.8957990.8024610.7244170.7019150.8776180.7199036.994023
12BAAABBAEA...0.4400110.3462300.2784950.5934130.5460560.6132520.7412890.3266790.8084648.071256
23AAACBDABC...0.9141550.3696020.8325640.8656200.8252510.2641040.6955610.8691330.8283525.760456
34AAACBDAEG...0.9341380.5789300.4073130.8680990.7944020.4942690.6981250.8097990.6147667.806457
46ABAABBAEC...0.3826000.7059400.3251930.4409670.4621460.7244470.6830730.3434570.2977436.868974
\n", + "

5 rows × 26 columns

\n", + "
" + ], + "text/plain": [ + " id cat0 cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 ... cont5 cont6 \\\n", + "0 1 A B A A B D A E C ... 0.881122 0.421650 \n", + "1 2 B A A A B B A E A ... 0.440011 0.346230 \n", + "2 3 A A A C B D A B C ... 0.914155 0.369602 \n", + "3 4 A A A C B D A E G ... 0.934138 0.578930 \n", + "4 6 A B A A B B A E C ... 0.382600 0.705940 \n", + "\n", + " cont7 cont8 cont9 cont10 cont11 cont12 cont13 \\\n", + "0 0.741413 0.895799 0.802461 0.724417 0.701915 0.877618 0.719903 \n", + "1 0.278495 0.593413 0.546056 0.613252 0.741289 0.326679 0.808464 \n", + "2 0.832564 0.865620 0.825251 0.264104 0.695561 0.869133 0.828352 \n", + "3 0.407313 0.868099 0.794402 0.494269 0.698125 0.809799 0.614766 \n", + "4 0.325193 0.440967 0.462146 0.724447 0.683073 0.343457 0.297743 \n", + "\n", + " target \n", + "0 6.994023 \n", + "1 8.071256 \n", + "2 5.760456 \n", + "3 7.806457 \n", + "4 6.868974 \n", + "\n", + "[5 rows x 26 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcont0cont1cont2cont3cont4cont5cont6cont7cont8cont9cont10cont11cont12cont13target
count300000.000000300000.000000300000.000000300000.000000300000.000000300000.000000300000.000000300000.000000300000.000000300000.000000300000.000000300000.000000300000.000000300000.000000300000.000000300000.000000
mean250018.5769470.5246340.5066490.4441150.4462140.4554710.5083370.4783450.4559040.4593210.5268990.5049430.5299380.5245490.5033497.456260
std144450.1500100.2048750.2352690.2000890.2386690.2006950.2316120.1924320.2044930.2206420.2040250.2015490.2308600.2208920.2252180.887295
min1.000000-0.093505-0.055105-0.0602740.1347600.189216-0.0872470.0439530.2087030.0040410.0730400.0596440.064161-0.0056000.1581210.000000
25%124772.5000000.3704510.3523070.3141210.2145720.2798530.3387470.3398960.2780410.3086550.3619570.3388980.3166620.3321430.2912896.798341
50%250002.5000000.4922080.6151560.4572710.3778230.4113510.4413840.4100900.3607360.4258010.4888670.5198550.5588270.4073650.4339097.496503
75%375226.5000000.6547930.6881500.5548350.7197580.6218080.7095150.6042460.6393880.5415250.7527650.6728090.7203810.7324310.7308708.161166
max499999.0000001.0526660.8517461.0176891.0064690.9940501.0444331.0933121.0365411.0141560.9720911.0297731.0380490.9613700.87357910.309208
\n", + "
" + ], + "text/plain": [ + " id cont0 cont1 cont2 \\\n", + "count 300000.000000 300000.000000 300000.000000 300000.000000 \n", + "mean 250018.576947 0.524634 0.506649 0.444115 \n", + "std 144450.150010 0.204875 0.235269 0.200089 \n", + "min 1.000000 -0.093505 -0.055105 -0.060274 \n", + "25% 124772.500000 0.370451 0.352307 0.314121 \n", + "50% 250002.500000 0.492208 0.615156 0.457271 \n", + "75% 375226.500000 0.654793 0.688150 0.554835 \n", + "max 499999.000000 1.052666 0.851746 1.017689 \n", + "\n", + " cont3 cont4 cont5 cont6 \\\n", + "count 300000.000000 300000.000000 300000.000000 300000.000000 \n", + "mean 0.446214 0.455471 0.508337 0.478345 \n", + "std 0.238669 0.200695 0.231612 0.192432 \n", + "min 0.134760 0.189216 -0.087247 0.043953 \n", + "25% 0.214572 0.279853 0.338747 0.339896 \n", + "50% 0.377823 0.411351 0.441384 0.410090 \n", + "75% 0.719758 0.621808 0.709515 0.604246 \n", + "max 1.006469 0.994050 1.044433 1.093312 \n", + "\n", + " cont7 cont8 cont9 cont10 \\\n", + "count 300000.000000 300000.000000 300000.000000 300000.000000 \n", + "mean 0.455904 0.459321 0.526899 0.504943 \n", + "std 0.204493 0.220642 0.204025 0.201549 \n", + "min 0.208703 0.004041 0.073040 0.059644 \n", + "25% 0.278041 0.308655 0.361957 0.338898 \n", + "50% 0.360736 0.425801 0.488867 0.519855 \n", + "75% 0.639388 0.541525 0.752765 0.672809 \n", + "max 1.036541 1.014156 0.972091 1.029773 \n", + "\n", + " cont11 cont12 cont13 target \n", + "count 300000.000000 300000.000000 300000.000000 300000.000000 \n", + "mean 0.529938 0.524549 0.503349 7.456260 \n", + "std 0.230860 0.220892 0.225218 0.887295 \n", + "min 0.064161 -0.005600 0.158121 0.000000 \n", + "25% 0.316662 0.332143 0.291289 6.798341 \n", + "50% 0.558827 0.407365 0.433909 7.496503 \n", + "75% 0.720381 0.732431 0.730870 8.161166 \n", + "max 1.038049 0.961370 0.873579 10.309208 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df.describe()" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "# Checking Missing Values" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0\n", + "cat0 0\n", + "cat1 0\n", + "cat2 0\n", + "cat3 0\n", + "cat4 0\n", + "cat5 0\n", + "cat6 0\n", + "cat7 0\n", + "cat8 0\n", + "cat9 0\n", + "cont0 0\n", + "cont1 0\n", + "cont2 0\n", + "cont3 0\n", + "cont4 0\n", + "cont5 0\n", + "cont6 0\n", + "cont7 0\n", + "cont8 0\n", + "cont9 0\n", + "cont10 0\n", + "cont11 0\n", + "cont12 0\n", + "cont13 0\n", + "target 0\n", + "dtype: int64" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df.isnull().sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Unique values for Categorical Columns" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "def get_unique_val(df, col):\n", + " return df[col].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Unique values of cat0: \n", + "['A' 'B']\n", + "Unique values of cat1: \n", + "['B' 'A']\n", + "Unique values of cat2: \n", + "['A' 'B']\n", + "Unique values of cat3: \n", + "['A' 'C' 'D' 'B']\n", + "Unique values of cat4: \n", + "['B' 'C' 'A' 'D']\n", + "Unique values of cat5: \n", + "['D' 'B' 'A' 'C']\n", + "Unique values of cat6: \n", + "['A' 'B' 'D' 'C' 'E' 'I' 'G' 'H']\n", + "Unique values of cat7: \n", + "['E' 'B' 'D' 'G' 'F' 'A' 'C' 'I']\n", + "Unique values of cat8: \n", + "['C' 'A' 'G' 'E' 'D' 'F' 'B']\n", + "Unique values of cat9: \n", + "['I' 'F' 'N' 'K' 'B' 'L' 'G' 'H' 'O' 'A' 'J' 'M' 'C' 'D' 'E']\n" + ] + } + ], + "source": [ + "# Training data\n", + "for i in range(0,10):\n", + " print('Unique values of cat'+ str(i) + ': ')\n", + " print(get_unique_val(train_df, 'cat'+ 
str(i)))" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Unique values of cat0: \n", + "['A' 'B']\n", + "Unique values of cat1: \n", + "['B' 'A']\n", + "Unique values of cat2: \n", + "['A' 'B']\n", + "Unique values of cat3: \n", + "['C' 'A' 'D' 'B']\n", + "Unique values of cat4: \n", + "['B' 'A' 'C' 'D']\n", + "Unique values of cat5: \n", + "['D' 'B' 'A' 'C']\n", + "Unique values of cat6: \n", + "['A' 'B' 'C' 'D' 'E' 'I' 'H']\n", + "Unique values of cat7: \n", + "['E' 'D' 'B' 'G' 'F' 'I' 'A' 'C']\n", + "Unique values of cat8: \n", + "['E' 'C' 'D' 'G' 'A' 'F' 'B']\n", + "Unique values of cat9: \n", + "['G' 'L' 'F' 'I' 'A' 'K' 'M' 'O' 'N' 'H' 'B' 'J' 'C' 'E' 'D']\n" + ] + } + ], + "source": [ + "# Test Data\n", + "for i in range(0,10):\n", + " print('Unique values of cat'+ str(i) + ': ')\n", + " print(get_unique_val(test_df, 'cat'+ str(i)))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-----Train - Test-----\n", + "set()\n", + "-----Test - Train-----\n", + "set()\n", + "-----Train - Test-----\n", + "set()\n", + "-----Test - Train-----\n", + "set()\n", + "-----Train - Test-----\n", + "set()\n", + "-----Test - Train-----\n", + "set()\n", + "-----Train - Test-----\n", + "set()\n", + "-----Test - Train-----\n", + "set()\n", + "-----Train - Test-----\n", + "set()\n", + "-----Test - Train-----\n", + "set()\n", + "-----Train - Test-----\n", + "set()\n", + "-----Test - Train-----\n", + "set()\n", + "-----Train - Test-----\n", + "{'G'}\n", + "-----Test - Train-----\n", + "set()\n", + "-----Train - Test-----\n", + "set()\n", + "-----Test - Train-----\n", + "set()\n", + "-----Train - Test-----\n", + "set()\n", + "-----Test - Train-----\n", + "set()\n", + "-----Train - Test-----\n", + "set()\n", + "-----Test - Train-----\n", + "set()\n" + ] + } + ], + 
"source": [ + "for i in range(0,10):\n", + " train_minus_test = set(get_unique_val(train_df, 'cat'+ str(i))).difference(set(get_unique_val(test_df, 'cat'+ str(i))))\n", + " test_minus_train = set(get_unique_val(test_df, 'cat'+ str(i))).difference(set(get_unique_val(train_df, 'cat'+ str(i))))\n", + " print('-----Train - Test-----')\n", + " print(train_minus_test)\n", + " print('-----Test - Train-----')\n", + " print(test_minus_train)\n", + " if len(train_minus_test) > 0:\n", + " for item in train_minus_test:\n", + " train_df = train_df[train_df['cat'+ str(i)] != item]\n", + " \n", + " if len(test_minus_train) > 0:\n", + " for item in test_minus_train:\n", + " test_df = test_df[test_df['cat'+ str(i)] != item]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Unique values of cat0: \n", + "['A' 'B']\n", + "Unique values of cat1: \n", + "['B' 'A']\n", + "Unique values of cat2: \n", + "['A' 'B']\n", + "Unique values of cat3: \n", + "['A' 'C' 'D' 'B']\n", + "Unique values of cat4: \n", + "['B' 'C' 'A' 'D']\n", + "Unique values of cat5: \n", + "['D' 'B' 'A' 'C']\n", + "Unique values of cat6: \n", + "['A' 'B' 'D' 'C' 'E' 'I' 'H']\n", + "Unique values of cat7: \n", + "['E' 'B' 'D' 'G' 'F' 'A' 'C' 'I']\n", + "Unique values of cat8: \n", + "['C' 'A' 'G' 'E' 'D' 'F' 'B']\n", + "Unique values of cat9: \n", + "['I' 'F' 'N' 'K' 'B' 'L' 'G' 'H' 'O' 'A' 'J' 'M' 'C' 'D' 'E']\n" + ] + } + ], + "source": [ + "# Training data after dropping rows whose category value appears in only one of the training and test sets\n", + "for i in range(0,10):\n", + " print('Unique values of cat'+ str(i) + ': ')\n", + " print(get_unique_val(train_df, 'cat'+ str(i)))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Unique values of cat0: \n", + "['A' 
'B']\n", + "Unique values of cat1: \n", + "['B' 'A']\n", + "Unique values of cat2: \n", + "['A' 'B']\n", + "Unique values of cat3: \n", + "['C' 'A' 'D' 'B']\n", + "Unique values of cat4: \n", + "['B' 'A' 'C' 'D']\n", + "Unique values of cat5: \n", + "['D' 'B' 'A' 'C']\n", + "Unique values of cat6: \n", + "['A' 'B' 'C' 'D' 'E' 'I' 'H']\n", + "Unique values of cat7: \n", + "['E' 'D' 'B' 'G' 'F' 'I' 'A' 'C']\n", + "Unique values of cat8: \n", + "['E' 'C' 'D' 'G' 'A' 'F' 'B']\n", + "Unique values of cat9: \n", + "['G' 'L' 'F' 'I' 'A' 'K' 'M' 'O' 'N' 'H' 'B' 'J' 'C' 'E' 'D']\n" + ] + } + ], + "source": [ + "# Test data after dropping rows whose category value appears in only one of the training and test sets\n", + "for i in range(0,10):\n", + " print('Unique values of cat'+ str(i) + ': ')\n", + " print(get_unique_val(test_df, 'cat'+ str(i)))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 299997\n", + "cat0 299997\n", + "cat1 299997\n", + "cat2 299997\n", + "cat3 299997\n", + "cat4 299997\n", + "cat5 299997\n", + "cat6 299997\n", + "cat7 299997\n", + "cat8 299997\n", + "cat9 299997\n", + "cont0 299997\n", + "cont1 299997\n", + "cont2 299997\n", + "cont3 299997\n", + "cont4 299997\n", + "cont5 299997\n", + "cont6 299997\n", + "cont7 299997\n", + "cont8 299997\n", + "cont9 299997\n", + "cont10 299997\n", + "cont11 299997\n", + "cont12 299997\n", + "cont13 299997\n", + "target 299997\n", + "dtype: int64" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 200000\n", + "cat0 200000\n", + "cat1 200000\n", + "cat2 200000\n", + "cat3 200000\n", + "cat4 200000\n", + "cat5 200000\n", + "cat6 200000\n", + "cat7 200000\n", + "cat8 200000\n", + "cat9 200000\n", 
+ "cont0 200000\n", + "cont1 200000\n", + "cont2 200000\n", + "cont3 200000\n", + "cont4 200000\n", + "cont5 200000\n", + "cont6 200000\n", + "cont7 200000\n", + "cont8 200000\n", + "cont9 200000\n", + "cont10 200000\n", + "cont11 200000\n", + "cont12 200000\n", + "cont13 200000\n", + "dtype: int64" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_df.count()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Creating a dataframe of categorical columns only" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cat0cat1cat2cat3cat4cat5cat6cat7cat8cat9
0ABAABDAECI
1BAAABBAEAF
2AAACBDABCN
3AAACBDAEGK
4ABAABBAECF
\n", + "
" + ], + "text/plain": [ + " cat0 cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9\n", + "0 A B A A B D A E C I\n", + "1 B A A A B B A E A F\n", + "2 A A A C B D A B C N\n", + "3 A A A C B D A E G K\n", + "4 A B A A B B A E C F" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "object_cols = train_df.select_dtypes(include=['object'])\n", + "object_cols.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# One-hot encoding\n", + "One-hot encoding does not work well when a categorical column has many distinct values, because every distinct value becomes a new column and the data grows accordingly. Hence, we one-hot encode only the columns with fewer than 10 distinct values; the remaining high-cardinality columns will be dropped." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8']" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "low_cardinality_cols = [col for col in object_cols if train_df[col].nunique()<10]\n", + "low_cardinality_cols" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['cat9']" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "high_cardinality_cols = list(set(object_cols) - set(low_cardinality_cols))\n", + "high_cardinality_cols" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import OneHotEncoder\n", + "one_hot_encoder = OneHotEncoder(handle_unknown = 'ignore', sparse = False)\n", + "one_hot_encoded_cols = pd.DataFrame(one_hot_encoder.fit_transform(train_df[low_cardinality_cols]))\n", + "one_hot_encoded_cols.index = train_df.index" + ] 
+ }, + { + "cell_type": "code", + 
"execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...30313233343536373839
01.00.00.01.01.00.01.00.00.00.0...0.00.00.00.00.01.00.00.00.00.0
10.01.01.00.01.00.01.00.00.00.0...0.00.00.01.00.00.00.00.00.00.0
21.00.01.00.01.00.00.00.01.00.0...0.00.00.00.00.01.00.00.00.00.0
31.00.01.00.01.00.00.00.01.00.0...0.00.00.00.00.00.00.00.00.01.0
41.00.00.01.01.00.01.00.00.00.0...0.00.00.00.00.01.00.00.00.00.0
\n", + "

5 rows × 40 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 9 ... 30 31 32 33 \\\n", + "0 1.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 \n", + "1 0.0 1.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 \n", + "2 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 \n", + "3 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 \n", + "4 1.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 \n", + "\n", + " 34 35 36 37 38 39 \n", + "0 0.0 1.0 0.0 0.0 0.0 0.0 \n", + "1 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "2 0.0 1.0 0.0 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 0.0 0.0 1.0 \n", + "4 0.0 1.0 0.0 0.0 0.0 0.0 \n", + "\n", + "[5 rows x 40 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "one_hot_encoded_cols.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "one_hot_encoded_test_cols = pd.DataFrame(one_hot_encoder.fit_transform(test_df[low_cardinality_cols]))\n", + "one_hot_encoded_test_cols.index = test_df.index" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...30313233343536373839
01.00.00.01.01.00.00.00.01.00.0...0.00.00.00.00.00.00.01.00.00.0
11.00.00.01.01.00.00.00.01.00.0...0.00.00.00.00.01.00.00.00.00.0
21.00.00.01.01.00.00.00.01.00.0...0.00.00.00.00.01.00.00.00.00.0
31.00.01.00.00.01.01.00.00.00.0...0.00.00.00.00.00.00.01.00.00.0
41.00.00.01.01.00.01.00.00.00.0...0.00.00.00.00.00.00.01.00.00.0
\n", + "

5 rows × 40 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 9 ... 30 31 32 33 \\\n", + "0 1.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 \n", + "1 1.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 \n", + "2 1.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 \n", + "3 1.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 \n", + "4 1.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 \n", + "\n", + " 34 35 36 37 38 39 \n", + "0 0.0 0.0 0.0 1.0 0.0 0.0 \n", + "1 0.0 1.0 0.0 0.0 0.0 0.0 \n", + "2 0.0 1.0 0.0 0.0 0.0 0.0 \n", + "3 0.0 0.0 0.0 1.0 0.0 0.0 \n", + "4 0.0 0.0 0.0 1.0 0.0 0.0 \n", + "\n", + "[5 rows x 40 columns]" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "one_hot_encoded_test_cols.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Separating numeric columns" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "numeric_train_df = train_df.drop(object_cols, axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcont0cont1cont2cont3cont4cont5cont6cont7cont8cont9cont10cont11cont12cont13target
010.9231910.6849680.1244540.2178860.2814210.8811220.4216500.7414130.8957990.8024610.7244170.7019150.8776180.7199036.994023
120.4376270.0142130.3574380.8461270.2823540.4400110.3462300.2784950.5934130.5460560.6132520.7412890.3266790.8084648.071256
230.7322090.7601220.4546440.8129900.2937560.9141550.3696020.8325640.8656200.8252510.2641040.6955610.8691330.8283525.760456
340.7051420.7716780.1537350.7328930.7697850.9341380.5789300.4073130.8680990.7944020.4942690.6981250.8097990.6147667.806457
460.4860630.6393490.4962120.3541860.2791050.3826000.7059400.3251930.4409670.4621460.7244470.6830730.3434570.2977436.868974
\n", + "
" + ], + "text/plain": [ + " id cont0 cont1 cont2 cont3 cont4 cont5 cont6 \\\n", + "0 1 0.923191 0.684968 0.124454 0.217886 0.281421 0.881122 0.421650 \n", + "1 2 0.437627 0.014213 0.357438 0.846127 0.282354 0.440011 0.346230 \n", + "2 3 0.732209 0.760122 0.454644 0.812990 0.293756 0.914155 0.369602 \n", + "3 4 0.705142 0.771678 0.153735 0.732893 0.769785 0.934138 0.578930 \n", + "4 6 0.486063 0.639349 0.496212 0.354186 0.279105 0.382600 0.705940 \n", + "\n", + " cont7 cont8 cont9 cont10 cont11 cont12 cont13 \\\n", + "0 0.741413 0.895799 0.802461 0.724417 0.701915 0.877618 0.719903 \n", + "1 0.278495 0.593413 0.546056 0.613252 0.741289 0.326679 0.808464 \n", + "2 0.832564 0.865620 0.825251 0.264104 0.695561 0.869133 0.828352 \n", + "3 0.407313 0.868099 0.794402 0.494269 0.698125 0.809799 0.614766 \n", + "4 0.325193 0.440967 0.462146 0.724447 0.683073 0.343457 0.297743 \n", + "\n", + " target \n", + "0 6.994023 \n", + "1 8.071256 \n", + "2 5.760456 \n", + "3 7.806457 \n", + "4 6.868974 " + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numeric_train_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "numeric_test_df = test_df.drop(object_cols, axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcont0cont1cont2cont3cont4cont5cont6cont7cont8cont9cont10cont11cont12cont13
000.3312610.5511310.4862840.1560580.7016790.5955070.2869120.2798840.2022340.2426540.2851470.2643080.6536540.302448
150.4024040.6175460.2968520.1874400.2774800.4795520.3974360.4767420.8570730.5163930.5620650.7305420.3184920.736251
2150.5306770.6181260.7608650.1998720.2795080.6763950.6952840.2533160.5869340.5485550.8361930.7597880.3335720.273905
3160.4396530.4874030.7157860.1928080.4795030.7598750.2400490.2980740.4424750.5967460.4141310.2553820.5890800.311625
4170.3898440.7527600.3152690.1917340.7578450.2102320.3298510.6166630.1704750.2632350.7109610.2240450.2858600.794931
\n", + "
" + ], + "text/plain": [ + " id cont0 cont1 cont2 cont3 cont4 cont5 cont6 \\\n", + "0 0 0.331261 0.551131 0.486284 0.156058 0.701679 0.595507 0.286912 \n", + "1 5 0.402404 0.617546 0.296852 0.187440 0.277480 0.479552 0.397436 \n", + "2 15 0.530677 0.618126 0.760865 0.199872 0.279508 0.676395 0.695284 \n", + "3 16 0.439653 0.487403 0.715786 0.192808 0.479503 0.759875 0.240049 \n", + "4 17 0.389844 0.752760 0.315269 0.191734 0.757845 0.210232 0.329851 \n", + "\n", + " cont7 cont8 cont9 cont10 cont11 cont12 cont13 \n", + "0 0.279884 0.202234 0.242654 0.285147 0.264308 0.653654 0.302448 \n", + "1 0.476742 0.857073 0.516393 0.562065 0.730542 0.318492 0.736251 \n", + "2 0.253316 0.586934 0.548555 0.836193 0.759788 0.333572 0.273905 \n", + "3 0.298074 0.442475 0.596746 0.414131 0.255382 0.589080 0.311625 \n", + "4 0.616663 0.170475 0.263235 0.710961 0.224045 0.285860 0.794931 " + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numeric_test_df.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Combining one hot encoded categorical and numerical data" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "final_train_df = pd.concat([one_hot_encoded_cols, numeric_train_df], axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...cont5cont6cont7cont8cont9cont10cont11cont12cont13target
01.00.00.01.01.00.01.00.00.00.0...0.8811220.4216500.7414130.8957990.8024610.7244170.7019150.8776180.7199036.994023
10.01.01.00.01.00.01.00.00.00.0...0.4400110.3462300.2784950.5934130.5460560.6132520.7412890.3266790.8084648.071256
21.00.01.00.01.00.00.00.01.00.0...0.9141550.3696020.8325640.8656200.8252510.2641040.6955610.8691330.8283525.760456
31.00.01.00.01.00.00.00.01.00.0...0.9341380.5789300.4073130.8680990.7944020.4942690.6981250.8097990.6147667.806457
41.00.00.01.01.00.01.00.00.00.0...0.3826000.7059400.3251930.4409670.4621460.7244470.6830730.3434570.2977436.868974
\n", + "

5 rows × 56 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 9 ... cont5 cont6 \\\n", + "0 1.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 ... 0.881122 0.421650 \n", + "1 0.0 1.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 ... 0.440011 0.346230 \n", + "2 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.914155 0.369602 \n", + "3 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.934138 0.578930 \n", + "4 1.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 ... 0.382600 0.705940 \n", + "\n", + " cont7 cont8 cont9 cont10 cont11 cont12 cont13 \\\n", + "0 0.741413 0.895799 0.802461 0.724417 0.701915 0.877618 0.719903 \n", + "1 0.278495 0.593413 0.546056 0.613252 0.741289 0.326679 0.808464 \n", + "2 0.832564 0.865620 0.825251 0.264104 0.695561 0.869133 0.828352 \n", + "3 0.407313 0.868099 0.794402 0.494269 0.698125 0.809799 0.614766 \n", + "4 0.325193 0.440967 0.462146 0.724447 0.683073 0.343457 0.297743 \n", + "\n", + " target \n", + "0 6.994023 \n", + "1 8.071256 \n", + "2 5.760456 \n", + "3 7.806457 \n", + "4 6.868974 \n", + "\n", + "[5 rows x 56 columns]" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_train_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "final_test_df = pd.concat([one_hot_encoded_test_cols, numeric_test_df], axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...cont4cont5cont6cont7cont8cont9cont10cont11cont12cont13
01.00.00.01.01.00.00.00.01.00.0...0.7016790.5955070.2869120.2798840.2022340.2426540.2851470.2643080.6536540.302448
11.00.00.01.01.00.00.00.01.00.0...0.2774800.4795520.3974360.4767420.8570730.5163930.5620650.7305420.3184920.736251
21.00.00.01.01.00.00.00.01.00.0...0.2795080.6763950.6952840.2533160.5869340.5485550.8361930.7597880.3335720.273905
31.00.01.00.00.01.01.00.00.00.0...0.4795030.7598750.2400490.2980740.4424750.5967460.4141310.2553820.5890800.311625
41.00.00.01.01.00.01.00.00.00.0...0.7578450.2102320.3298510.6166630.1704750.2632350.7109610.2240450.2858600.794931
\n", + "

5 rows × 55 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 9 ... cont4 cont5 \\\n", + "0 1.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.701679 0.595507 \n", + "1 1.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.277480 0.479552 \n", + "2 1.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.279508 0.676395 \n", + "3 1.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 ... 0.479503 0.759875 \n", + "4 1.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 ... 0.757845 0.210232 \n", + "\n", + " cont6 cont7 cont8 cont9 cont10 cont11 cont12 \\\n", + "0 0.286912 0.279884 0.202234 0.242654 0.285147 0.264308 0.653654 \n", + "1 0.397436 0.476742 0.857073 0.516393 0.562065 0.730542 0.318492 \n", + "2 0.695284 0.253316 0.586934 0.548555 0.836193 0.759788 0.333572 \n", + "3 0.240049 0.298074 0.442475 0.596746 0.414131 0.255382 0.589080 \n", + "4 0.329851 0.616663 0.170475 0.263235 0.710961 0.224045 0.285860 \n", + "\n", + " cont13 \n", + "0 0.302448 \n", + "1 0.736251 \n", + "2 0.273905 \n", + "3 0.311625 \n", + "4 0.794931 \n", + "\n", + "[5 rows x 55 columns]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_test_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index([ 0, 1, 2, 3, 4, 5, 6,\n", + " 7, 8, 9, 10, 11, 12, 13,\n", + " 14, 15, 16, 17, 18, 19, 20,\n", + " 21, 22, 23, 24, 25, 26, 27,\n", + " 28, 29, 30, 31, 32, 33, 34,\n", + " 35, 36, 37, 38, 39, 'id', 'cont0',\n", + " 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',\n", + " 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13', 'target'],\n", + " dtype='object')" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_train_df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index([ 0, 1, 2, 3, 4, 5, 6,\n", + " 7, 8, 9, 10, 11, 12, 13,\n", 
+ " 14, 15, 16, 17, 18, 19, 20,\n", + " 21, 22, 23, 24, 25, 26, 27,\n", + " 28, 29, 30, 31, 32, 33, 34,\n", + " 35, 36, 37, 38, 39, 'id', 'cont0',\n", + " 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',\n", + " 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13'],\n", + " dtype='object')" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_test_df.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Splitting of training data into training and validation data" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 6.994023\n", + "1 8.071256\n", + "2 5.760456\n", + "3 7.806457\n", + "4 6.868974\n", + "Name: target, dtype: float64" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y = final_train_df.pop('target')\n", + "y.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...cont4cont5cont6cont7cont8cont9cont10cont11cont12cont13
01.00.00.01.01.00.01.00.00.00.0...0.2814210.8811220.4216500.7414130.8957990.8024610.7244170.7019150.8776180.719903
10.01.01.00.01.00.01.00.00.00.0...0.2823540.4400110.3462300.2784950.5934130.5460560.6132520.7412890.3266790.808464
21.00.01.00.01.00.00.00.01.00.0...0.2937560.9141550.3696020.8325640.8656200.8252510.2641040.6955610.8691330.828352
31.00.01.00.01.00.00.00.01.00.0...0.7697850.9341380.5789300.4073130.8680990.7944020.4942690.6981250.8097990.614766
41.00.00.01.01.00.01.00.00.00.0...0.2791050.3826000.7059400.3251930.4409670.4621460.7244470.6830730.3434570.297743
\n", + "

5 rows × 55 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 9 ... cont4 cont5 \\\n", + "0 1.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 ... 0.281421 0.881122 \n", + "1 0.0 1.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 ... 0.282354 0.440011 \n", + "2 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.293756 0.914155 \n", + "3 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.769785 0.934138 \n", + "4 1.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 ... 0.279105 0.382600 \n", + "\n", + " cont6 cont7 cont8 cont9 cont10 cont11 cont12 \\\n", + "0 0.421650 0.741413 0.895799 0.802461 0.724417 0.701915 0.877618 \n", + "1 0.346230 0.278495 0.593413 0.546056 0.613252 0.741289 0.326679 \n", + "2 0.369602 0.832564 0.865620 0.825251 0.264104 0.695561 0.869133 \n", + "3 0.578930 0.407313 0.868099 0.794402 0.494269 0.698125 0.809799 \n", + "4 0.705940 0.325193 0.440967 0.462146 0.724447 0.683073 0.343457 \n", + "\n", + " cont13 \n", + "0 0.719903 \n", + "1 0.808464 \n", + "2 0.828352 \n", + "3 0.614766 \n", + "4 0.297743 \n", + "\n", + "[5 rows x 55 columns]" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X = final_train_df\n", + "X.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "x_train, x_val, y_train, y_val = train_test_split(X, y, random_state = 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...cont4cont5cont6cont7cont8cont9cont10cont11cont12cont13
3061.00.01.00.01.00.00.00.01.00.0...0.5495970.5260690.8365950.2309230.3492580.5015290.3770140.6986040.3257500.353437
486121.00.01.00.01.00.00.00.01.00.0...0.3056440.4596220.5406110.3244450.4210980.8499330.3725420.2085880.7076920.194632
872671.00.00.01.01.00.00.00.00.01.0...0.2774910.2960800.3460070.2949800.2861440.5654610.7527700.3090880.7252240.374624
191671.00.01.00.01.00.00.00.01.00.0...0.3381120.1839480.3717490.4531020.1395950.3032550.2974990.2506100.3752460.725660
313221.00.00.01.01.00.00.00.01.00.0...0.2799160.2335380.3363930.5965830.3015050.3228280.3531560.3766400.4426990.833210
\n", + "

5 rows × 55 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 9 ... cont4 \\\n", + "306 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.549597 \n", + "48612 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.305644 \n", + "87267 1.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0 ... 0.277491 \n", + "19167 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.338112 \n", + "31322 1.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.279916 \n", + "\n", + " cont5 cont6 cont7 cont8 cont9 cont10 cont11 \\\n", + "306 0.526069 0.836595 0.230923 0.349258 0.501529 0.377014 0.698604 \n", + "48612 0.459622 0.540611 0.324445 0.421098 0.849933 0.372542 0.208588 \n", + "87267 0.296080 0.346007 0.294980 0.286144 0.565461 0.752770 0.309088 \n", + "19167 0.183948 0.371749 0.453102 0.139595 0.303255 0.297499 0.250610 \n", + "31322 0.233538 0.336393 0.596583 0.301505 0.322828 0.353156 0.376640 \n", + "\n", + " cont12 cont13 \n", + "306 0.325750 0.353437 \n", + "48612 0.707692 0.194632 \n", + "87267 0.725224 0.374624 \n", + "19167 0.375246 0.725660 \n", + "31322 0.442699 0.833210 \n", + "\n", + "[5 rows x 55 columns]" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x_train.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...cont4cont5cont6cont7cont8cont9cont10cont11cont12cont13
956291.00.01.00.01.00.00.00.01.00.0...0.2796950.5841950.5377940.3213650.4215840.6452320.6950900.8163760.6396130.779314
90880.01.00.01.01.00.00.00.01.00.0...0.2779900.9352450.5990430.6221500.7826380.8385410.5535320.7661890.8238160.230092
516261.00.01.00.01.00.01.00.00.00.0...0.2820390.7283010.4165190.6861440.4414440.8341800.5663470.7227090.6340510.362356
2446731.00.01.00.01.00.00.00.00.01.0...0.3054890.4158470.3669290.6296510.3719330.3941250.4729860.3968070.2446060.226515
208011.00.01.00.01.00.00.00.01.00.0...0.7464880.9221640.2191820.7909050.8908580.8039710.4327470.4492710.8210530.357360
\n", + "

5 rows × 55 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 9 ... cont4 \\\n", + "95629 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.279695 \n", + "9088 0.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.277990 \n", + "51626 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 ... 0.282039 \n", + "244673 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 ... 0.305489 \n", + "20801 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.746488 \n", + "\n", + " cont5 cont6 cont7 cont8 cont9 cont10 cont11 \\\n", + "95629 0.584195 0.537794 0.321365 0.421584 0.645232 0.695090 0.816376 \n", + "9088 0.935245 0.599043 0.622150 0.782638 0.838541 0.553532 0.766189 \n", + "51626 0.728301 0.416519 0.686144 0.441444 0.834180 0.566347 0.722709 \n", + "244673 0.415847 0.366929 0.629651 0.371933 0.394125 0.472986 0.396807 \n", + "20801 0.922164 0.219182 0.790905 0.890858 0.803971 0.432747 0.449271 \n", + "\n", + " cont12 cont13 \n", + "95629 0.639613 0.779314 \n", + "9088 0.823816 0.230092 \n", + "51626 0.634051 0.362356 \n", + "244673 0.244606 0.226515 \n", + "20801 0.821053 0.357360 \n", + "\n", + "[5 rows x 55 columns]" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x_val.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Model and training" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8629479507070059" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.metrics import mean_squared_error\n", + "\n", + "model = RandomForestRegressor(n_estimators = 100, random_state = 0)\n", + "model.fit(x_train, y_train)\n", + "pred = model.predict(x_val)\n", + "rmse = mean_squared_error(y_val, pred, squared = False)\n", + "rmse" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prediction" + ] + }, + { + 
"cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...cont4cont5cont6cont7cont8cont9cont10cont11cont12cont13
01.00.00.01.01.00.00.00.01.00.0...0.7016790.5955070.2869120.2798840.2022340.2426540.2851470.2643080.6536540.302448
11.00.00.01.01.00.00.00.01.00.0...0.2774800.4795520.3974360.4767420.8570730.5163930.5620650.7305420.3184920.736251
21.00.00.01.01.00.00.00.01.00.0...0.2795080.6763950.6952840.2533160.5869340.5485550.8361930.7597880.3335720.273905
31.00.01.00.00.01.01.00.00.00.0...0.4795030.7598750.2400490.2980740.4424750.5967460.4141310.2553820.5890800.311625
41.00.00.01.01.00.01.00.00.00.0...0.7578450.2102320.3298510.6166630.1704750.2632350.7109610.2240450.2858600.794931
\n", + "

5 rows × 55 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 9 ... cont4 cont5 \\\n", + "0 1.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.701679 0.595507 \n", + "1 1.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.277480 0.479552 \n", + "2 1.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 ... 0.279508 0.676395 \n", + "3 1.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 ... 0.479503 0.759875 \n", + "4 1.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 0.0 0.0 ... 0.757845 0.210232 \n", + "\n", + " cont6 cont7 cont8 cont9 cont10 cont11 cont12 \\\n", + "0 0.286912 0.279884 0.202234 0.242654 0.285147 0.264308 0.653654 \n", + "1 0.397436 0.476742 0.857073 0.516393 0.562065 0.730542 0.318492 \n", + "2 0.695284 0.253316 0.586934 0.548555 0.836193 0.759788 0.333572 \n", + "3 0.240049 0.298074 0.442475 0.596746 0.414131 0.255382 0.589080 \n", + "4 0.329851 0.616663 0.170475 0.263235 0.710961 0.224045 0.285860 \n", + "\n", + " cont13 \n", + "0 0.302448 \n", + "1 0.736251 \n", + "2 0.273905 \n", + "3 0.311625 \n", + "4 0.794931 \n", + "\n", + "[5 rows x 55 columns]" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "final_test_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([7.67545224, 7.70351859, 7.57978512, ..., 7.37587732, 7.79308431,\n", + " 7.55770055])" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predictions = model.predict(final_test_df)\n", + "predictions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Submission" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtarget
007.675452
157.703519
2157.579785
3167.224458
4177.432672
\n", + "
" + ], + "text/plain": [ + " id target\n", + "0 0 7.675452\n", + "1 5 7.703519\n", + "2 15 7.579785\n", + "3 16 7.224458\n", + "4 17 7.432672" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = pd.DataFrame()\n", + "result['id'] = final_test_df['id']\n", + "result['target'] = predictions\n", + "result.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "result.to_csv('../output/submission.csv', index = False)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Traceback (most recent call last):\n", + " File \"/opt/conda/bin/kaggle\", line 5, in \n", + " from kaggle.cli import main\n", + " File \"/opt/conda/lib/python3.7/site-packages/kaggle/__init__.py\", line 23, in \n", + " api.authenticate()\n", + " File \"/opt/conda/lib/python3.7/site-packages/kaggle/api/kaggle_api_extended.py\", line 166, in authenticate\n", + " self.config_file, self.config_dir))\n", + "OSError: Could not find kaggle.json. Make sure it's located in /root/.kaggle. Or use the environment method.\n" + ] + } + ], + "source": [ + "!kaggle competitions submit -c tabular-playground-series-feb-2021 -f submission.csv -m \"Message\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}