diff --git a/Prediction Models/Crab_Age_Prediction/Readme.md b/Prediction Models/Crab_Age_Prediction/Readme.md new file mode 100644 index 00000000..9dbb028e --- /dev/null +++ b/Prediction Models/Crab_Age_Prediction/Readme.md @@ -0,0 +1,42 @@ +# Crab Age Prediction Model + +This repository contains a machine learning model that predicts the age of crabs based on various biological measurements. The project involves Exploratory Data Analysis (EDA), feature engineering, and multiple machine learning models to determine which factors most accurately predict crab age. + +## Table of Contents +- [Introduction](#introduction) +- [Problem Statement](#problem-statement) +- [Solution Overview](#solution-overview) +- [Data](#data) + + +## Introduction + +Determining the age of marine species such as crabs is essential for studying population dynamics and ecological impacts. This project focuses on developing a machine learning model to predict crab age based on various biological characteristics, like size, weight, and shell dimensions. The model aims to help biologists and ecologists with accurate age estimations, facilitating better research and conservation efforts. + +## Problem Statement + +Age prediction in crabs is complex due to several challenges: +- **Biological Variability**: Differences in growth rates across individual crabs due to genetics and environmental factors. +- **Measurement Limitations**: Variability in available biological measurements. +- **Feature Selection**: Identifying which measurements contribute most effectively to accurate age prediction. + +This project aims to address these challenges by leveraging machine learning techniques to create a predictive model for crab age. + +## Solution Overview + +The model uses various machine learning algorithms, including linear regression, decision trees, and ensemble methods. Steps taken include: +1. **Exploratory Data Analysis (EDA)**: Identifying patterns, outliers, and relationships within the data. +2. **Feature Engineering**: Selecting and transforming features to improve model accuracy. +3. **Model Selection and Training**: Comparing multiple models to determine the best predictor of crab age. + +Key features may include measurements such as carapace length, width, weight, and other morphological characteristics. + +## Data + +The dataset contains various biological measurements for crabs, including: +- **Carapace Dimensions**: Length, width, and height. +- **Weight Measurements**: Including whole weight, shell weight, etc. +- **Other Characteristics**: Information about species, habitat, or other ecological factors, if available. + +The dataset should be placed in the `data/` folder in CSV format. + diff --git a/Prediction Models/Crab_Age_Prediction/crab-age-predictions-eda-f-e-modeling-10th.ipynb b/Prediction Models/Crab_Age_Prediction/crab-age-predictions-eda-f-e-modeling-10th.ipynb new file mode 100644 index 00000000..67ada5b5 --- /dev/null +++ b/Prediction Models/Crab_Age_Prediction/crab-age-predictions-eda-f-e-modeling-10th.ipynb @@ -0,0 +1,1898 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "64fad5f4", + "metadata": { + "papermill": { + "duration": 0.023427, + "end_time": "2023-06-13T09:19:59.138539", + "exception": false, + "start_time": "2023-06-13T09:19:59.115112", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#
Import Libraries
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0cf32024", + "metadata": { + "papermill": { + "duration": 20.705581, + "end_time": "2023-06-13T09:20:19.910608", + "exception": false, + "start_time": "2023-06-13T09:19:59.205027", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!pip install sklego\n", + "\n", + "import numpy as np # linear algebra\n", + "import pandas as pd # data processing\n", + "from pandas.api.types import is_numeric_dtype\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "import optuna\n", + "\n", + "from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler\n", + "from sklearn.metrics import mean_absolute_error\n", + "from sklearn.model_selection import KFold, train_test_split, GridSearchCV\n", + "\n", + "\n", + "# Models\n", + "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, ExtraTreesRegressor, BaggingRegressor, StackingRegressor\n", + "from lightgbm import LGBMRegressor\n", + "from xgboost import XGBRegressor\n", + "from sklego.linear_model import LADRegression\n", + "from catboost import CatBoostRegressor\n", + "\n", + "\n", + "# Ignore warnings ;)\n", + "import warnings\n", + "warnings.simplefilter(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "id": "5660f80e", + "metadata": { + "id": "9iEKB2Oh3uNF", + "papermill": { + "duration": 0.024267, + "end_time": "2023-06-13T09:20:19.959155", + "exception": false, + "start_time": "2023-06-13T09:20:19.934888", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#Import the data
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd92fa96", + "metadata": { + "id": "Y-gW90p23uNH", + "papermill": { + "duration": 0.680544, + "end_time": "2023-06-13T09:20:20.663008", + "exception": false, + "start_time": "2023-06-13T09:20:19.982464", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# files path\n", + "train_path = \"/kaggle/input/playground-series-s3e16/train.csv\"\n", + "test_path = \"/kaggle/input/playground-series-s3e16/test.csv\"\n", + "original_path = \"/kaggle/input/crab-age-prediction/CrabAgePrediction.csv\"\n", + "synthetic_path = \"/kaggle/input/ps-s3-e16-synthetic-train-data/train_synthetic.csv\"\n", + "\n", + "# function to import our dataset \n", + "def import_data(train_path, test_path, original_path, synthetic_path):\n", + " train = pd.read_csv(train_path)\n", + " test = pd.read_csv(test_path)\n", + " original = pd.read_csv(original_path)\n", + " synthetic = pd.read_csv(synthetic_path)\n", + " \n", + " return train, test, original, synthetic\n", + "\n", + "train, test, original, synthetic = import_data(train_path, test_path, original_path, synthetic_path)" + ] + }, + { + "cell_type": "markdown", + "id": "729aaacc", + "metadata": { + "papermill": { + "duration": 0.023237, + "end_time": "2023-06-13T09:20:20.709912", + "exception": false, + "start_time": "2023-06-13T09:20:20.686675", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "The train dataset is a synthetic dataset generated from the [Crab Age Prediction](https://www.kaggle.com/datasets/sidhus/crab-age-prediction) dataset(original). These are the descriptions of the variables in this dataset:\n", + "\n", + "Exploratory Data Analysis - EDA
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc00863c", + "metadata": { + "id": "DyWZdMJG3uNJ", + "outputId": "8a5b1171-c6ac-4b79-e967-f61c067dd810", + "papermill": { + "duration": 0.063962, + "end_time": "2023-06-13T09:20:20.893200", + "exception": false, + "start_time": "2023-06-13T09:20:20.829238", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "train.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9662dcaf", + "metadata": { + "papermill": { + "duration": 0.045347, + "end_time": "2023-06-13T09:20:20.962317", + "exception": false, + "start_time": "2023-06-13T09:20:20.916970", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "original.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "add29001", + "metadata": { + "papermill": { + "duration": 0.049808, + "end_time": "2023-06-13T09:20:21.036607", + "exception": false, + "start_time": "2023-06-13T09:20:20.986799", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "synthetic.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "699852c3", + "metadata": { + "papermill": { + "duration": 0.045925, + "end_time": "2023-06-13T09:20:21.108412", + "exception": false, + "start_time": "2023-06-13T09:20:21.062487", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test.head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "cd570494", + "metadata": { + "papermill": { + "duration": 0.025364, + "end_time": "2023-06-13T09:20:21.159190", + "exception": false, + "start_time": "2023-06-13T09:20:21.133826", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Now, we'll try to use some descriptive statistics" + ] + }, + { + "cell_type": "markdown", + "id": "d1737796", + "metadata": { + "papermill": { + "duration": 0.023672, + "end_time": "2023-06-13T09:20:21.207598", + "exception": false, + "start_time": "2023-06-13T09:20:21.183926", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#1. Univariate Statistics
" + ] + }, + { + "cell_type": "markdown", + "id": "cd37335d", + "metadata": { + "papermill": { + "duration": 0.02352, + "end_time": "2023-06-13T09:20:21.255280", + "exception": false, + "start_time": "2023-06-13T09:20:21.231760", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "We can use the .describe() method from pandas to see basic stats like count, mean, standard deviation, minimum, maximum, quantiles..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b540ab7", + "metadata": { + "papermill": { + "duration": 0.105598, + "end_time": "2023-06-13T09:20:21.384989", + "exception": false, + "start_time": "2023-06-13T09:20:21.279391", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "train.describe().T" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5824f34", + "metadata": { + "papermill": { + "duration": 0.041004, + "end_time": "2023-06-13T09:20:21.500177", + "exception": false, + "start_time": "2023-06-13T09:20:21.459173", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# fonction to calculate univariate stats like pandas describe method\n", + "def univariate_stats(df):\n", + " #df.drop('id', axis=1, inplace=True)\n", + " output_df = pd.DataFrame(columns=['Count', 'Missing', 'Unique', 'Dtype', 'IsNumeric', 'Mode', 'Mean', 'Min', '25%', 'Median', '75%', 'Max', 'Std', 'Skew', 'Kurt'])\n", + " \n", + " for col in df:\n", + " if is_numeric_dtype(df[col]):\n", + " output_df.loc[col] = [df[col].count(), df[col].isnull().sum(), df[col].nunique(), df[col].dtype, is_numeric_dtype(df[col]), df[col].mode().values[0], df[col].mean(), df[col].min(), df[col].quantile(.25), df[col].median(), df[col].quantile(.75), df[col].max(), df[col].std(), df[col].skew(), df[col].kurt() ]\n", + " else:\n", + " output_df.loc[col] = [df[col].count(), df[col].isnull().sum(), df[col].nunique(), df[col].dtype, is_numeric_dtype(df[col]), df[col].mode().values[0], '-', '-', '-', '-', '-', '-', '-', '-', '-' ]\n", + "\n", + " return output_df.sort_values(by=['IsNumeric', 'Unique'], ascending=False)\n", + "\n", + "\n", + "pd.set_option('display.max_rows', 100)\n", + "pd.set_option('display.max_columns', 100)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68343761", + "metadata": { + "papermill": { + "duration": 0.255555, + "end_time": "2023-06-13T09:20:21.780149", + "exception": false, + "start_time": "2023-06-13T09:20:21.524594", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Call the function on train\n", + "univariate_stats(train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b04ec10e", + "metadata": { + "papermill": { + "duration": 0.13138, + "end_time": "2023-06-13T09:20:21.938161", + "exception": false, + "start_time": "2023-06-13T09:20:21.806781", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Call the function to check univariate stats on the original dataset\n", + "univariate_stats(original)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bcc9292b", + "metadata": { + "papermill": { + "duration": 0.194384, + "end_time": "2023-06-13T09:20:22.159895", + "exception": false, + "start_time": "2023-06-13T09:20:21.965511", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Call the function to check univariate stats on test dataset\n", + "univariate_stats(test)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "695693f3", + "metadata": { + "papermill": { + "duration": 0.313853, + "end_time": "2023-06-13T09:20:22.499909", + "exception": false, + "start_time": "2023-06-13T09:20:22.186056", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Call the function to check univariate stats on synthetic dataset\n", + "univariate_stats(synthetic)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e17abf6d", + "metadata": { + "papermill": { + "duration": 0.041634, + "end_time": "2023-06-13T09:20:22.568187", + "exception": false, + "start_time": "2023-06-13T09:20:22.526553", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# List of numerical columns and categorical columns\n", + "\n", + "numeric_cols = train.select_dtypes(include=['float64']).columns.tolist()\n", + "categ_cols = train.select_dtypes(include=['object']).columns.tolist()\n", + "target = 'Age'\n", + "numeric_cols\n", + "categ_cols" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf027b93", + "metadata": { + "papermill": { + "duration": 16.48609, + "end_time": "2023-06-13T09:20:39.132428", + "exception": false, + "start_time": "2023-06-13T09:20:22.646338", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def plot_histograms(df_train, df_test, original, synthetic,target_col, n_cols=3):\n", + " n_rows = (len(df_train.columns) - 1) // n_cols + 1\n", + "\n", + " fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(18, 4*n_rows))\n", + " axes = axes.flatten()\n", + "\n", + " for i, var_name in enumerate(df_train.columns.tolist()):\n", + " ax = axes[i]\n", + " sns.distplot(df_train[var_name], kde=True, ax=ax, label='Train') # plot train data\n", + " sns.distplot(original[var_name], kde=True, ax=ax, label='Original') # plot original data\n", + " sns.distplot(synthetic[var_name], kde=True, ax=ax, label='Synthetic') # plot original data\n", + " if var_name != target_col:\n", + " sns.distplot(df_test[var_name], kde=True, ax=ax, label='Test') # plot test data\n", + " \n", + " ax.set_title(f'{var_name} Distribution (Train vs Test)')\n", + " ax.legend()\n", + "\n", + " plt.tight_layout()\n", + " plt.show()\n", + " \n", + "plot_histograms(train[numeric_cols], test[numeric_cols], synthetic, original[numeric_cols], target, n_cols=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6021903a", + "metadata": { + "papermill": { + "duration": 78.610486, + "end_time": "2023-06-13T09:21:57.773692", + "exception": false, + "start_time": "2023-06-13T09:20:39.163206", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def plot_distribution(df, hue, title='', drop_cols=[]):\n", + " sns.set_style('whitegrid')\n", + "\n", + " cols = df.columns.drop([hue] + drop_cols)\n", + " n_cols = 2\n", + " n_rows = (len(cols) - 1) // n_cols + 1\n", + "\n", + " fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(14, 4*n_rows))\n", + "\n", + " for i, var_name in enumerate(cols):\n", + " row = i // n_cols\n", + " col = i % n_cols\n", + "\n", + " ax = axes[row, col]\n", + " sns.histplot(data=df, x=var_name, kde=True, ax=ax, hue=hue) # sns.distplot(df_train[var_name], kde=True, ax=ax, label='Train')\n", + " ax.set_title(f'{var_name} Distribution')\n", + "\n", + " fig.suptitle(f'{title} Distribution Plot by {hue}', fontweight='bold', fontsize=16)\n", + " plt.tight_layout()\n", + " plt.show()\n", + " \n", + "plot_distribution(train, hue='Sex', 
title='Train data')\n", + "plot_distribution(test, hue='Sex', title='Test data')\n", + "plot_distribution(original, hue='Sex', title='Original data')\n", + "plot_distribution(synthetic, hue='Sex', title= 'Synthetic data')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10726c06", + "metadata": { + "papermill": { + "duration": 8.927933, + "end_time": "2023-06-13T09:22:06.762168", + "exception": false, + "start_time": "2023-06-13T09:21:57.834235", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def plot_boxplot(df, hue, title='', drop_cols=[], n_cols=3):\n", + " sns.set_style('whitegrid')\n", + "\n", + " cols = df.columns.drop([hue] + drop_cols)\n", + " n_rows = (len(cols) - 1) // n_cols + 1\n", + "\n", + " fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(14, 4*n_rows))\n", + "\n", + " for i, var_name in enumerate(cols):\n", + " row = i // n_cols\n", + " col = i % n_cols\n", + "\n", + " ax = axes[row, col]\n", + " sns.boxplot(data=df, x=hue, y=var_name, ax=ax, showmeans=True, \n", + " meanprops={\"marker\":\"s\",\"markerfacecolor\":\"white\", \"markeredgecolor\":\"blue\", \"markersize\":\"5\"})\n", + " ax.set_title(f'{var_name} by {hue}')\n", + " ax.set_xlabel('')\n", + "\n", + " fig.suptitle(f'{title} Boxplot by {hue}', fontweight='bold', fontsize=16)\n", + " plt.tight_layout()\n", + " plt.show()\n", + " \n", + "plot_boxplot(train, hue='Sex', title='Train data', n_cols=2)\n", + "plot_boxplot(original, hue='Sex', title='Original data', n_cols=2)\n", + "plot_boxplot(test, hue='Sex', title='Test data', n_cols=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01100a7c", + "metadata": { + "papermill": { + "duration": 7.745215, + "end_time": "2023-06-13T09:22:14.579464", + "exception": false, + "start_time": "2023-06-13T09:22:06.834249", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def plot_violinplot(df, hue, title='', drop_cols=[], n_cols=2):\n", + " sns.set_style('whitegrid')\n", + "\n", + " cols = df.columns.drop([hue] + drop_cols)\n", + " n_rows = (len(cols) - 1) // n_cols + 1\n", + "\n", + " fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(18, 4*n_rows))\n", + "\n", + " for i, var_name in enumerate(cols):\n", + " row = i // n_cols\n", + " col = i % n_cols\n", + "\n", + " ax = axes[row, col]\n", + " sns.violinplot(data=df, x=hue, y=var_name, ax=ax, inner='quartile')\n", + " ax.set_title(f'{var_name} Distribution')\n", + "\n", + " fig.suptitle(f'{title} Violin Plot by {hue}', fontweight='bold', fontsize=16)\n", + " plt.tight_layout()\n", + " plt.show()\n", + " \n", + "plot_violinplot(train, hue='Sex', title='Train data', n_cols=2)" + ] + }, + { + "cell_type": "markdown", + "id": "3460ae32", + "metadata": { + "papermill": { + "duration": 0.074544, + "end_time": "2023-06-13T09:22:14.732933", + "exception": false, + "start_time": "2023-06-13T09:22:14.658389", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#2. Bivariate Statistics
" + ] + }, + { + "cell_type": "markdown", + "id": "131690c8", + "metadata": { + "papermill": { + "duration": 0.076052, + "end_time": "2023-06-13T09:22:14.884777", + "exception": false, + "start_time": "2023-06-13T09:22:14.808725", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "\n",
+ " 💡 Recall: There are three types of bivariate analysis.
\n",
+ "
\n",
+ " 💡 \n",
+ "Since the features in our datasets are strongly correlated to each others, applying PCA could be a good idea.
\n",
+ " Let's visualize some graphics to gain more insights.\n",
+ "\n",
+ "
Feature Engineering
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "419bd785", + "metadata": { + "id": "aerc4iNU3uNP", + "outputId": "f87502be-953c-449d-ff85-c3e2177f82b6", + "papermill": { + "duration": 0.306708, + "end_time": "2023-06-13T09:23:53.252712", + "exception": false, + "start_time": "2023-06-13T09:23:52.946004", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "train[\"Data Type\"] = 0\n", + "test[\"Data Type\"] = 1\n", + "original[\"Data Type\"] = 2\n", + "synthetic[\"Data Type\"] = 3\n", + "\n", + "ids = []\n", + "for i in range(len(original)):\n", + " ids.append(i + 123419)\n", + "\n", + "original[\"id\"] = ids\n", + "synthetic[\"id\"] += 127312\n", + "\n", + "# concatenate datasets\n", + "df_concat = pd.concat([train, original, synthetic], ignore_index=True)\n", + "df_concat = df_concat.drop_duplicates()\n", + "df_all = pd.concat([df_concat, test], ignore_index=True)\n", + "df_all" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a67cf11", + "metadata": { + "papermill": { + "duration": 0.247528, + "end_time": "2023-06-13T09:23:53.953216", + "exception": false, + "start_time": "2023-06-13T09:23:53.705688", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "df_all = pd.get_dummies(df_all)\n", + "df_all" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f6cf7ec", + "metadata": { + "id": "Vx8-kBZS3uNP", + "outputId": "1089b557-3522-4fb7-cacb-277a62f6a359", + "papermill": { + "duration": 0.167946, + "end_time": "2023-06-13T09:23:54.573599", + "exception": false, + "start_time": "2023-06-13T09:23:54.405653", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "df_all[df_all['Height'] == 0]['Height']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71cf996e", + "metadata": { + "id": "kSbwgc8D3uNP", + "papermill": { + "duration": 70.98887, + "end_time": "2023-06-13T09:25:05.711089", + "exception": false, + "start_time": "2023-06-13T09:23:54.722219", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "h1 = df_all[df_all[\"Height\"] != 0]\n", + "h0 = df_all[df_all[\"Height\"] == 0]\n", + "print(h1.shape, h0.shape)\n", + "\n", + "# prediction of Height by Random Forest Regressor\n", + "\n", + "x_h1 = h1.drop(columns=[\"Height\", \"Age\", \"Data Type\"], axis=1)\n", + "y_h1 = h1[\"Height\"]\n", + "x_h0 = h0.drop(columns=[\"Height\", \"Age\", \"Data Type\"], axis=1)\n", + "\n", + "rfr = RandomForestRegressor(n_jobs=-1, random_state=42)\n", + "rfr.fit(x_h1, y_h1)\n", + "preds_height = rfr.predict(x_h0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79b22f98", + "metadata": { + "id": "9wixBHMD3uNR", + "outputId": "01dd0f7b-325b-4a31-df00-0368432e7668", + "papermill": { + "duration": 0.1627, + "end_time": "2023-06-13T09:25:06.028867", + "exception": false, + "start_time": "2023-06-13T09:25:05.866167", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "len(preds_height)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "455dd321", + "metadata": { + "id": "as5IcoJ_3uNR", + "outputId": "3aa30034-814c-429b-a86e-9178468fe428", + "papermill": { + "duration": 4.516241, + "end_time": "2023-06-13T09:25:10.696350", + "exception": false, + "start_time": "2023-06-13T09:25:06.180109", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "cnt = 0\n", + "for i in range(len(df_all)):\n", + " if df_all.loc[i, 
\"Height\"] == 0:\n", + " df_all.loc[i, \"Height\"] = preds_height[cnt]\n", + " cnt += 1\n", + "\n", + "df_all[\"Height\"].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0979a994", + "metadata": { + "id": "sj8GP2ij3uNS", + "outputId": "69b570f6-837b-4807-9c8d-44fc4b0a57b1", + "papermill": { + "duration": 0.173823, + "end_time": "2023-06-13T09:25:11.023660", + "exception": false, + "start_time": "2023-06-13T09:25:10.849837", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "df_all[df_all['Height'] == 0]['Height']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7081dc91", + "metadata": { + "id": "U0xgnS_C3uNS", + "outputId": "4e09ab9c-a465-4383-8230-c36300b688c4", + "papermill": { + "duration": 0.216196, + "end_time": "2023-06-13T09:25:11.392374", + "exception": false, + "start_time": "2023-06-13T09:25:11.176178", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Prepare our final dataset for train\n", + "\n", + "train = df_all[df_all[\"Data Type\"] != 1]\n", + "train.sort_values(\"id\", inplace=True)\n", + "train.reset_index(drop=True, inplace=True)\n", + "\n", + "train = train.drop(columns=[\"id\", \"Data Type\"], axis=1)\n", + "train" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aced0b22", + "metadata": { + "id": "Sd6DE_Ik3uNT", + "outputId": "5fdc0c86-7828-41de-d6f2-8ebf4dd8a426", + "papermill": { + "duration": 0.186995, + "end_time": "2023-06-13T09:25:11.731465", + "exception": false, + "start_time": "2023-06-13T09:25:11.544470", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# dataset for test\n", + "test = df_all[df_all[\"Data Type\"] == 1]\n", + "test.sort_values(\"id\", inplace=True)\n", + "test.reset_index(drop=True, inplace=True)\n", + "test.drop(columns=[\"id\", \"Age\", \"Data Type\"], inplace=True)\n", + "test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37ed6563", + "metadata": { + "papermill": { + "duration": 0.167326, + "end_time": "2023-06-13T09:25:12.353391", + "exception": false, + "start_time": "2023-06-13T09:25:12.186065", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# function for PCA features but I'll keep this as a last resort\n", + "def add_pca_features(X_train, X_test): \n", + " \n", + " # Select the columns for PCA\n", + " pca_features = X_train.select_dtypes(include=['float64']).columns.tolist()\n", + " n_components = 4 # len(pca_features)\n", + "\n", + " # Create the pipeline\n", + " pipeline = make_pipeline(StandardScaler(), PCA(n_components=n_components))\n", + " \n", + " # Perform PCA\n", + " pipeline.fit(X_train[pca_features])\n", + "\n", + " # Create column names for PCA features\n", + " pca_columns = [f'PCA_{i}' for i in range(n_components)]\n", + "\n", + " # Add PCA features to the dataframe\n", + " X_train[pca_columns] = pipeline.transform(X_train[pca_features])\n", + " X_test[pca_columns] = pipeline.transform(X_test[pca_features])\n", + "\n", + " return X_train, X_test" + ] + }, + { + "cell_type": "markdown", + "id": "676c8e97", + "metadata": { + "papermill": { + "duration": 0.150509, + "end_time": "2023-06-13T09:25:12.655229", + "exception": false, + "start_time": "2023-06-13T09:25:12.504720", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "\n", + " 💡 We expose here many techniques and features that we can use in our dataset but we'll use only few of them.\n", + " \n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "885c3075", + "metadata": { + "id": "iKC0agEF3uNT", + "papermill": { + "duration": 0.167826, + "end_time": "2023-06-13T09:25:12.973874", + "exception": false, + "start_time": "2023-06-13T09:25:12.806048", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# function to add more features to the dataset\n", + "def feature_engineering(df): \n", + " \n", + " # Clean the weights by capping the over weights with total body weights\n", + " \"\"\"df['Shell Weight']=np.where(df['Shell Weight']>df['Weight'],df['Weight'],df['Shell Weight'])\n", + " df['Viscera Weight']=np.where(df['Viscera Weight']>df['Weight'],df['Weight'],df['Viscera Weight'])\n", + " df['Shucked Weight']=np.where(df['Shucked Weight']>df['Weight'],df['Weight'],df['Shucked Weight'])\"\"\"\n", + " \n", + " # Adding brand news features\n", + " df['Shucked Weight ratio'] = df['Shucked Weight'] / df['Weight']\n", + " #df['Viscera Weight ratio'] = df['Viscera Weight'] / df['Weight'] # dropped due to low correlation with our target\n", + " df['Shell Weight ratio'] = df['Shell Weight'] / df['Weight']\n", + " \n", + " df['Volume'] = df['Length'] * df['Diameter'] * df['Height']\n", + " \n", + " #df['Meat Yield'] = df['Shucked Weight'] / (df['Weight'] + df['Shell Weight'])\n", + " #df['Weight_to_Shucked_Weight'] = df['Weight'] / df['Shucked Weight']\n", + " \n", + " \"\"\"df['dim1'] = df['Length'] * df['Diameter']\n", + " df['dim2'] = df['Length'] * df['Height']\n", + " df['dim3'] = df['Height'] * df['Diameter']\"\"\"\n", + " \n", + " # Crab BMI\n", + " df['bmi']=df['Weight']/(df['Height']**2)\n", + " \n", + " # Water Loss during experiment\n", + " df[\"water_loss\"]=df[\"Weight\"]-df[\"Shucked Weight\"]-df['Viscera Weight']-df['Shell Weight']\n", + " df[\"water_loss\"]=np.where( df[\"water_loss\"]<0,\n", + " min(df[\"Shucked Weight\"].min(), df[\"Viscera Weight\"].min(), df[\"Shell Weight\"].min()),\n", + " df[\"water_loss\"]\n", + " )\n", + " \n", + " # Crab density approx\n", + " df['density'] = df['Weight']/(df['Volume'])\n", + " df['BSA'] = np.sqrt( (df['Weight']* 0.0283) * (df['Height']*30.48) / 3600 )\n", + " \n", + " news_cols = ['Shucked Weight ratio', 'Shell Weight ratio', 'Volume', 'bmi', \"water_loss\", 'density', 'BSA']\n", + " \n", + " return df, news_cols" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53f5a626", + "metadata": { + "id": "bSFlzo4d3uNT", + "outputId": "13e22886-8a14-49fc-ff0d-54fa69bc42f6", + "papermill": { + "duration": 0.212599, + "end_time": "2023-06-13T09:25:13.338714", + "exception": false, + "start_time": "2023-06-13T09:25:13.126115", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "train_eng, news_cols = feature_engineering(train)\n", + "test_eng, news_cols = feature_engineering(test)\n", + "train_eng" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97add2e6", + "metadata": { + "id": "qTAmXVm-3uNU", + "outputId": "ebf0cc6e-3948-48fd-b204-a91998586bc1", + "papermill": { + "duration": 1.850481, + "end_time": "2023-06-13T09:25:15.341659", + "exception": false, + "start_time": "2023-06-13T09:25:13.491178", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "#plot_heatmap(train_eng[numeric_cols + news_cols + [target]], title='Train_eng data')\n", + "\n", + "\n", + "corr_mat_data = train_eng.corr()\n", + "data_mask = np.triu(np.ones_like(corr_mat_data, dtype = bool))\n", + "cmap = 
sns.diverging_palette(100, 7, s = 75, l = 40, n = 20, center = 'light', as_cmap = True)\n", + "\n", + "#fig, axes = plt.subplots(1, 1, figsize = (25, 10))\n", + "plt.figure(figsize=(12, 12))\n", + "sns.heatmap(corr_mat_data, annot = True, cmap = cmap, fmt = '.2f', center = 0,\n", + " annot_kws = {'size': 12}, mask = data_mask).set_title('Correlations train features');" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09561f35", + "metadata": { + "id": "rOwaIXf-3uNU", + "outputId": "6f18fb67-6897-4eb6-b6a3-d9f169923688", + "papermill": { + "duration": 0.193425, + "end_time": "2023-06-13T09:25:15.690191", + "exception": false, + "start_time": "2023-06-13T09:25:15.496766", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "columns_to_drop = ['Sex_I', 'Age']\n", + "X = train_eng.drop(columns=columns_to_drop, axis=1)\n", + "test_eng = test_eng.drop('Sex_I', axis=1)\n", + "Y = train_eng['Age']\n", + "Y" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b348132", + "metadata": { + "id": "cAkXmNP73uNU", + "outputId": "e39dbc99-f845-4a86-eef8-c7c53aa0348e", + "papermill": { + "duration": 0.186123, + "end_time": "2023-06-13T09:25:16.030862", + "exception": false, + "start_time": "2023-06-13T09:25:15.844739", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "X" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1fa9d09", + "metadata": { + "papermill": { + "duration": 0.175092, + "end_time": "2023-06-13T09:25:16.373122", + "exception": false, + "start_time": "2023-06-13T09:25:16.198030", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "\"\"\"#scaling the data\n", + "scaler = StandardScaler()\n", + "\n", + "X = pd.DataFrame(scaler.fit_transform(X))\n", + "test_eng = pd.DataFrame(scaler.transform(test_eng))\n", + "test_eng\"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "18ff45ff", + "metadata": { + "papermill": { + "duration": 0.180147, + "end_time": "2023-06-13T09:25:16.710939", + "exception": false, + "start_time": "2023-06-13T09:25:16.530792", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "\n",
+ " 💡 Scaling :
\n",
+ " As we'll use Gradient Boosting Decision Trees(GBDT) models, so scaling the data might not be necessary.\n",
+ "\n",
+ "\n",
+ "
Models building
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42296034", + "metadata": { + "id": "tph2yAK3R6vy", + "outputId": "aa9f08b2-c18b-4fdc-da40-e5989cb7fa36", + "papermill": { + "duration": 5323.666964, + "end_time": "2023-06-13T10:54:01.263305", + "exception": false, + "start_time": "2023-06-13T09:25:17.596341", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "hist_cv_scores, hist_preds = list(), list()\n", + "lgb_cv_scores, lgb_preds = list(), list()\n", + "xgb_cv_scores, xgb_preds = list(), list()\n", + "cat_cv_scores, cat_preds = list(), list()\n", + "\n", + "ens_cv_scores, ens_preds = list(), list()\n", + "\n", + "\n", + "\n", + "imp_hist = pd.DataFrame()\n", + "#imp_cat = pd.DataFrame()\n", + "imp_xgb = pd.DataFrame()\n", + "imp_ens = pd.DataFrame()\n", + "imp = pd.DataFrame()\n", + "\n", + "skf = KFold(n_splits = 10, random_state = 42, shuffle = True)\n", + " \n", + "for i, (train_ix, test_ix) in enumerate(skf.split(X, Y)):\n", + " \n", + " X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]\n", + " Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]\n", + " \n", + " print('=====================================================================')\n", + " \n", + "\n", + " #==================================================== LightGBM ==========================================================#\n", + " \n", + " lgb_params = {\n", + " \"objective\": \"regression_l1\", # =\"mae\"\n", + " \"metric\": \"mae\",\n", + " \"learning_rate\": 0.03, # 0.01\n", + " \"n_estimators\": 5000,\n", + " \"max_depth\": 10,\n", + " \"num_leaves\": 255,\n", + " \"reg_alpha\": 0.1, \n", + " \"reg_lambda\": 0.1, \n", + " \"subsample\": 0.4 \n", + " }\n", + " \n", + " lgb_md = LGBMRegressor(**lgb_params).fit(X_train, Y_train)\n", + "\n", + " # Validation\n", + " lgb_pred_1 = lgb_md.predict(X_test)\n", + " lgb_score_fold = mean_absolute_error(Y_test, lgb_pred_1) \n", + " lgb_cv_scores.append(lgb_score_fold)\n", + " \n", + " # Prediction\n", + " lgb_pred_2 = lgb_md.predict(test_eng)\n", + " lgb_preds.append(lgb_pred_2)\n", + " \n", + " # Importance\n", + " _imp = pd.DataFrame({\"features\": X.columns, \"importance\": lgb_md.feature_importances_})\n", + " imp = pd.concat([imp, _imp], axis=0, ignore_index=True)\n", + "\n", + " print('Fold N°', i, '==> LightGBM - MAE: ====>', lgb_score_fold)\n", + "\n", + " \n", + " #==================================================== HistGradientBoosting ====================================================#\n", + " \n", + " \n", + " hist_md = HistGradientBoostingRegressor(loss = 'absolute_error',\n", + " l2_regularization = 0.01,\n", + " early_stopping = False,\n", + " learning_rate = 0.01,\n", + " max_iter = 1000,\n", + " max_depth = 15,\n", + " max_bins = 255,\n", + " min_samples_leaf = 30,\n", + " max_leaf_nodes = 30).fit(X_train, Y_train)\n", + " \n", + " # Validation\n", + " hist_pred_1 = hist_md.predict(X_test)\n", + " hist_score_fold = mean_absolute_error(Y_test, hist_pred_1)\n", + " hist_cv_scores.append(hist_score_fold)\n", + "\n", + " # Prediction\n", + " hist_pred_2 = hist_md.predict(test_eng)\n", + " hist_preds.append(hist_pred_2)\n", + " \n", + " print('Fold N°', i, '==> HistGradient - MAE: ====>', hist_score_fold)\n", + "\n", + " \n", + " #======================================================== XGBoost ============================================================#\n", + " \n", + "\n", + " xgb_md = XGBRegressor(objective = 'reg:pseudohubererror',\n", + " tree_method = 'hist',\n", + " colsample_bytree = 0.9, \n", + " 
gamma = 0.65, \n", + " learning_rate = 0.01, \n", + " max_depth = 7, \n", + " min_child_weight = 20, \n", + " n_estimators = 5000,\n", + " subsample = 0.7,\n", + " random_state = 42).fit(X_train, Y_train,\n", + " #eval_set = [(X_train, Y_train), (X_test, Y_test)],\n", + " verbose=0\n", + " )\n", + " # Validation\n", + " xgb_pred_1 = xgb_md.predict(X_test)\n", + " xgb_score_fold = mean_absolute_error(Y_test, xgb_pred_1) \n", + " xgb_cv_scores.append(xgb_score_fold)\n", + "\n", + " # Prediction\n", + " xgb_pred_2 = xgb_md.predict(test_eng)\n", + " xgb_preds.append(xgb_pred_2)\n", + " \n", + " print('Fold N°', i, '==> XGBoost - MAE: ====>', xgb_score_fold)\n", + "\n", + " \n", + " #========================================================= CatBoost ========================================================#\n", + " \n", + " \n", + " cat_md = CatBoostRegressor(loss_function = 'MAE',\n", + " iterations = 1000,\n", + " learning_rate = 0.03,\n", + " depth = 10, \n", + " random_strength = 0.2,\n", + " bagging_temperature = 0.7,\n", + " border_count = 254,\n", + " l2_leaf_reg = 0.001,\n", + " verbose = False,\n", + " grow_policy = 'Lossguide',\n", + " task_type = 'CPU',\n", + " random_state = 42).fit(X_train, Y_train)\n", + " \n", + " # Validation\n", + " cat_pred_1 = cat_md.predict(X_test)\n", + " cat_score_fold = mean_absolute_error(Y_test, cat_pred_1) \n", + " cat_cv_scores.append(cat_score_fold)\n", + " \n", + " # Prediction\n", + " cat_pred_2 = cat_md.predict(test_eng)\n", + " cat_preds.append(cat_pred_2)\n", + " \n", + " print('Fold N°', i, '==> CatBoost - MAE: ====>', cat_score_fold)\n", + "\n", + " \n", + " #========================================================= LAD Ensemble =========================================================#\n", + " \n", + " x = pd.DataFrame({'hist': np.round(hist_pred_1.tolist()), \n", + " 'lgb': np.round(lgb_pred_1.tolist()),\n", + " 'xgb': np.round(xgb_pred_1.tolist()), \n", + " 'cat': np.round(cat_pred_1.tolist())}\n", + " )\n", + " y = Y_test\n", + " \n", + " x_test = pd.DataFrame({'hist': np.round(hist_pred_2.tolist()), \n", + " 'lgb': np.round(lgb_pred_2.tolist()),\n", + " 'xgb': np.round(xgb_pred_2.tolist()), \n", + " 'cat': np.round(cat_pred_2.tolist())}\n", + " )\n", + " \n", + " lad_md = LADRegression().fit(x, y)\n", + " \n", + " # Validation\n", + " lad_pred = lad_md.predict(x) \n", + " ens_score = mean_absolute_error(y, lad_pred)\n", + " ens_cv_scores.append(ens_score)\n", + " \n", + " #Predictions\n", + " lad_pred_test = lad_md.predict(x_test)\n", + " ens_preds.append(lad_pred_test)\n", + " \n", + " print('Fold N°', i, '==> LAD Model 1 ensemble - MAE: ====>', ens_score)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "466ab7f8", + "metadata": { + "id": "4gb-E3Wbmzcy", + "outputId": "9dbc7f8e-3477-437a-bc1b-3abe72649255", + "papermill": { + "duration": 0.198329, + "end_time": "2023-06-13T10:54:01.629060", + "exception": false, + "start_time": "2023-06-13T10:54:01.430731", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Display important features for LGBMRegressor\n", + "imp = imp.groupby(\"features\")[\"importance\"].agg([\"mean\", \"std\"])\n", + "imp.columns = [\"importance\", \"importance_std\"]\n", + "imp[\"importance_cov\"] = imp[\"importance_std\"] / imp[\"importance\"]\n", + "imp = imp.reset_index(drop=False)\n", + "display(imp.sort_values(\"importance\", ascending=False, ignore_index=True))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72c264f3", + "metadata": 
{ + "id": "6kzgu7PBmcEr", + "outputId": "9624c4d3-3ea9-42e7-bd04-d294a81ed0e0", + "papermill": { + "duration": 17.121349, + "end_time": "2023-06-13T10:54:18.918304", + "exception": false, + "start_time": "2023-06-13T10:54:01.796955", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "lgb_cv_score = np.mean(lgb_cv_scores)\n", + "print(f\"Score on CV test data ======> {lgb_cv_score}\")\n", + "preds_val = lgb_md.predict(X_val)\n", + "preds_val_score = mean_absolute_error(Y_val, preds_val)\n", + "print(f\"Score on Valid data (unseen data) ======> {preds_val_score}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2568184", + "metadata": { + "id": "ynkBFBVwR6zn", + "outputId": "11853749-7499-4a95-bd19-50abc4c0abd7", + "papermill": { + "duration": 2.178501, + "end_time": "2023-06-13T10:54:21.261710", + "exception": false, + "start_time": "2023-06-13T10:54:19.083209", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# HistGB scores\n", + "\n", + "hist_cv_score = np.mean(hist_cv_scores)\n", + "print(f\"Score on CV test data ======> {hist_cv_score}\")\n", + "\n", + "hist_preds_val = hist_md.predict(X_val)\n", + "hist_preds_val_score = mean_absolute_error(Y_val, hist_preds_val)\n", + "print(f\"Score on Valid data (unseen data) ======> {hist_preds_val_score}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "600a6742", + "metadata": { + "id": "dsQ1mx-59bSx", + "outputId": "bd0e9dd0-15b7-416a-ebf2-8fcf19cf4894", + "papermill": { + "duration": 4.355499, + "end_time": "2023-06-13T10:54:25.783086", + "exception": false, + "start_time": "2023-06-13T10:54:21.427587", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# XGBoost scores\n", + "\n", + "xgb_cv_score = np.mean(xgb_cv_scores)\n", + "print(f\"Score on CV test data ======> {xgb_cv_score}\")\n", + "\n", + "xgb_preds_val = xgb_md.predict(X_val)\n", + "xgb_preds_val_score = mean_absolute_error(Y_val, xgb_preds_val)\n", + "print(f\"Score on Valid data (unseen data) ======> {xgb_preds_val_score}\")" + ] + }, + { + "cell_type": "markdown", + "id": "58eaf3d2", + "metadata": { + "papermill": { + "duration": 0.165481, + "end_time": "2023-06-13T10:54:26.116452", + "exception": false, + "start_time": "2023-06-13T10:54:25.950971", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "\n",
+ " 💡 Saving the models :
\n",
+ " We can use joblib to save the models we trained so we can use them later if needed.\n",
+ "\n",
+ "\n",
+ "
Submissions
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e262a2a1", + "metadata": { + "id": "4rLMnbSr3uNX", + "papermill": { + "duration": 0.213049, + "end_time": "2023-06-13T10:54:27.171465", + "exception": false, + "start_time": "2023-06-13T10:54:26.958416", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "submission = pd.read_csv(\"/kaggle/input/playground-series-s3e16/sample_submission.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f8bb52e", + "metadata": { + "papermill": { + "duration": 0.622624, + "end_time": "2023-06-13T10:54:27.959578", + "exception": false, + "start_time": "2023-06-13T10:54:27.336954", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Visualize models performances\n", + "hist_cv_score = np.mean(hist_cv_scores)\n", + "lgb_cv_score = np.mean(lgb_cv_scores)\n", + "xgb_cv_score = np.mean(xgb_cv_scores)\n", + "cat_cv_score = np.mean(cat_cv_scores)\n", + "ens_cv_score = np.mean(ens_cv_scores)\n", + "\n", + "\n", + "model_perf = pd.DataFrame({'Models': [ 'HistGradient' ,'LightGBM', 'XGBoost', 'CatBoost', 'LAD Model'],\n", + " 'CV-scores': [ hist_cv_score, lgb_cv_score, xgb_cv_score, cat_cv_score, ens_cv_score]\n", + " })\n", + "\n", + "plt.figure(figsize = (8, 8))\n", + "ax = sns.barplot(y = 'Models', x = 'CV-scores', data = model_perf)\n", + "ax.bar_label(ax.containers[0]);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "effa2d26", + "metadata": { + "id": "Y6Cts_Tp3uNd", + "papermill": { + "duration": 0.185627, + "end_time": "2023-06-13T10:54:28.640810", + "exception": false, + "start_time": "2023-06-13T10:54:28.455183", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "unique_targets = np.unique(train['Age'])\n", + "def mattop_post_process(preds):\n", + " return np.array([min(unique_targets, key = lambda x: abs(x - pred)) for pred in preds])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91142f31", + "metadata": { + "papermill": { + "duration": 8.398007, + "end_time": "2023-06-13T10:54:37.205473", + "exception": false, + "start_time": "2023-06-13T10:54:28.807466", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "ens_preds_test = mattop_post_process(pd.DataFrame(ens_preds).apply(np.mean, axis = 0))\n", + "\n", + "submission['Age'] = ens_preds_test.astype(int)\n", + "submission.to_csv('LAD_model.csv', index = False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + }, + "papermill": { + "default_parameters": {}, + "duration": 5695.682242, + "end_time": "2023-06-13T10:54:41.703074", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2023-06-13T09:19:46.020832", + "version": "2.4.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}