diff --git a/Prediction Models/Crab_Age_Prediction/Readme.md b/Prediction Models/Crab_Age_Prediction/Readme.md new file mode 100644 index 00000000..9dbb028e --- /dev/null +++ b/Prediction Models/Crab_Age_Prediction/Readme.md @@ -0,0 +1,42 @@ +# Crab Age Prediction Model + +This repository contains a machine learning model that predicts the age of crabs based on various biological measurements. The project involves Exploratory Data Analysis (EDA), feature engineering, and multiple machine learning models to determine which factors most accurately predict crab age. + +## Table of Contents +- [Introduction](#introduction) +- [Problem Statement](#problem-statement) +- [Solution Overview](#solution-overview) +- [Data](#data) + + +## Introduction + +Determining the age of marine species such as crabs is essential for studying population dynamics and ecological impacts. This project focuses on developing a machine learning model to predict crab age based on various biological characteristics, like size, weight, and shell dimensions. The model aims to help biologists and ecologists with accurate age estimations, facilitating better research and conservation efforts. + +## Problem Statement + +Age prediction in crabs is complex due to several challenges: +- **Biological Variability**: Differences in growth rates across individual crabs due to genetics and environmental factors. +- **Measurement Limitations**: Variability in available biological measurements. +- **Feature Selection**: Identifying which measurements contribute most effectively to accurate age prediction. + +This project aims to address these challenges by leveraging machine learning techniques to create a predictive model for crab age. + +## Solution Overview + +The model uses various machine learning algorithms, including linear regression, decision trees, and ensemble methods. Steps taken include: +1. **Exploratory Data Analysis (EDA)**: Identifying patterns, outliers, and relationships within the data. +2. **Feature Engineering**: Selecting and transforming features to improve model accuracy. +3. **Model Selection and Training**: Comparing multiple models to determine the best predictor of crab age. + +Key features may include measurements such as carapace length, width, weight, and other morphological characteristics. + +## Data + +The dataset contains various biological measurements for crabs, including: +- **Carapace Dimensions**: Length, width, and height. +- **Weight Measurements**: Including whole weight, shell weight, etc. +- **Other Characteristics**: Information about species, habitat, or other ecological factors, if available. + +The dataset should be placed in the `data/` folder in CSV format. + diff --git a/Prediction Models/Crab_Age_Prediction/crab-age-predictions-eda-f-e-modeling-10th.ipynb b/Prediction Models/Crab_Age_Prediction/crab-age-predictions-eda-f-e-modeling-10th.ipynb new file mode 100644 index 00000000..67ada5b5 --- /dev/null +++ b/Prediction Models/Crab_Age_Prediction/crab-age-predictions-eda-f-e-modeling-10th.ipynb @@ -0,0 +1,1898 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "64fad5f4", + "metadata": { + "papermill": { + "duration": 0.023427, + "end_time": "2023-06-13T09:19:59.138539", + "exception": false, + "start_time": "2023-06-13T09:19:59.115112", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#
Import Libraries
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0cf32024", + "metadata": { + "papermill": { + "duration": 20.705581, + "end_time": "2023-06-13T09:20:19.910608", + "exception": false, + "start_time": "2023-06-13T09:19:59.205027", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!pip install sklego\n", + "\n", + "import numpy as np # linear algebra\n", + "import pandas as pd # data processing\n", + "from pandas.api.types import is_numeric_dtype\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "import optuna\n", + "\n", + "from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler\n", + "from sklearn.metrics import mean_absolute_error\n", + "from sklearn.model_selection import KFold, train_test_split, GridSearchCV\n", + "\n", + "\n", + "# Models\n", + "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, ExtraTreesRegressor, BaggingRegressor, StackingRegressor\n", + "from lightgbm import LGBMRegressor\n", + "from xgboost import XGBRegressor\n", + "from sklego.linear_model import LADRegression\n", + "from catboost import CatBoostRegressor\n", + "\n", + "\n", + "# Ignore warnings ;)\n", + "import warnings\n", + "warnings.simplefilter(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "id": "5660f80e", + "metadata": { + "id": "9iEKB2Oh3uNF", + "papermill": { + "duration": 0.024267, + "end_time": "2023-06-13T09:20:19.959155", + "exception": false, + "start_time": "2023-06-13T09:20:19.934888", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#Import the data
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd92fa96", + "metadata": { + "id": "Y-gW90p23uNH", + "papermill": { + "duration": 0.680544, + "end_time": "2023-06-13T09:20:20.663008", + "exception": false, + "start_time": "2023-06-13T09:20:19.982464", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# files path\n", + "train_path = \"/kaggle/input/playground-series-s3e16/train.csv\"\n", + "test_path = \"/kaggle/input/playground-series-s3e16/test.csv\"\n", + "original_path = \"/kaggle/input/crab-age-prediction/CrabAgePrediction.csv\"\n", + "synthetic_path = \"/kaggle/input/ps-s3-e16-synthetic-train-data/train_synthetic.csv\"\n", + "\n", + "# function to import our dataset \n", + "def import_data(train_path, test_path, original_path, synthetic_path):\n", + " train = pd.read_csv(train_path)\n", + " test = pd.read_csv(test_path)\n", + " original = pd.read_csv(original_path)\n", + " synthetic = pd.read_csv(synthetic_path)\n", + " \n", + " return train, test, original, synthetic\n", + "\n", + "train, test, original, synthetic = import_data(train_path, test_path, original_path, synthetic_path)" + ] + }, + { + "cell_type": "markdown", + "id": "729aaacc", + "metadata": { + "papermill": { + "duration": 0.023237, + "end_time": "2023-06-13T09:20:20.709912", + "exception": false, + "start_time": "2023-06-13T09:20:20.686675", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "The train dataset is a synthetic dataset generated from the [Crab Age Prediction](https://www.kaggle.com/datasets/sidhus/crab-age-prediction) dataset(original). These are the descriptions of the variables in this dataset:\n", + "\n", + "Exploratory Data Analysis - EDA
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc00863c", + "metadata": { + "id": "DyWZdMJG3uNJ", + "outputId": "8a5b1171-c6ac-4b79-e967-f61c067dd810", + "papermill": { + "duration": 0.063962, + "end_time": "2023-06-13T09:20:20.893200", + "exception": false, + "start_time": "2023-06-13T09:20:20.829238", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "train.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9662dcaf", + "metadata": { + "papermill": { + "duration": 0.045347, + "end_time": "2023-06-13T09:20:20.962317", + "exception": false, + "start_time": "2023-06-13T09:20:20.916970", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "original.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "add29001", + "metadata": { + "papermill": { + "duration": 0.049808, + "end_time": "2023-06-13T09:20:21.036607", + "exception": false, + "start_time": "2023-06-13T09:20:20.986799", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "synthetic.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "699852c3", + "metadata": { + "papermill": { + "duration": 0.045925, + "end_time": "2023-06-13T09:20:21.108412", + "exception": false, + "start_time": "2023-06-13T09:20:21.062487", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test.head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "cd570494", + "metadata": { + "papermill": { + "duration": 0.025364, + "end_time": "2023-06-13T09:20:21.159190", + "exception": false, + "start_time": "2023-06-13T09:20:21.133826", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Now, we'll try to use some descriptive statistics" + ] + }, + { + "cell_type": "markdown", + "id": "d1737796", + "metadata": { + "papermill": { + "duration": 0.023672, + "end_time": "2023-06-13T09:20:21.207598", + "exception": false, + "start_time": "2023-06-13T09:20:21.183926", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#1. Univariate Statistics
" + ] + }, + { + "cell_type": "markdown", + "id": "cd37335d", + "metadata": { + "papermill": { + "duration": 0.02352, + "end_time": "2023-06-13T09:20:21.255280", + "exception": false, + "start_time": "2023-06-13T09:20:21.231760", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "We can use the .describe() method from pandas to see basic stats like count, mean, standard deviation, minimum, maximum, quantiles..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b540ab7", + "metadata": { + "papermill": { + "duration": 0.105598, + "end_time": "2023-06-13T09:20:21.384989", + "exception": false, + "start_time": "2023-06-13T09:20:21.279391", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "train.describe().T" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5824f34", + "metadata": { + "papermill": { + "duration": 0.041004, + "end_time": "2023-06-13T09:20:21.500177", + "exception": false, + "start_time": "2023-06-13T09:20:21.459173", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# fonction to calculate univariate stats like pandas describe method\n", + "def univariate_stats(df):\n", + " #df.drop('id', axis=1, inplace=True)\n", + " output_df = pd.DataFrame(columns=['Count', 'Missing', 'Unique', 'Dtype', 'IsNumeric', 'Mode', 'Mean', 'Min', '25%', 'Median', '75%', 'Max', 'Std', 'Skew', 'Kurt'])\n", + " \n", + " for col in df:\n", + " if is_numeric_dtype(df[col]):\n", + " output_df.loc[col] = [df[col].count(), df[col].isnull().sum(), df[col].nunique(), df[col].dtype, is_numeric_dtype(df[col]), df[col].mode().values[0], df[col].mean(), df[col].min(), df[col].quantile(.25), df[col].median(), df[col].quantile(.75), df[col].max(), df[col].std(), df[col].skew(), df[col].kurt() ]\n", + " else:\n", + " output_df.loc[col] = [df[col].count(), df[col].isnull().sum(), df[col].nunique(), df[col].dtype, is_numeric_dtype(df[col]), df[col].mode().values[0], '-', '-', '-', '-', '-', '-', '-', '-', '-' ]\n", + "\n", + " return output_df.sort_values(by=['IsNumeric', 'Unique'], ascending=False)\n", + "\n", + "\n", + "pd.set_option('display.max_rows', 100)\n", + "pd.set_option('display.max_columns', 100)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68343761", + "metadata": { + "papermill": { + "duration": 0.255555, + "end_time": "2023-06-13T09:20:21.780149", + "exception": false, + "start_time": "2023-06-13T09:20:21.524594", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Call the function on train\n", + "univariate_stats(train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b04ec10e", + "metadata": { + "papermill": { + "duration": 0.13138, + "end_time": "2023-06-13T09:20:21.938161", + "exception": false, + "start_time": "2023-06-13T09:20:21.806781", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Call the function to check univariate stats on the original dataset\n", + "univariate_stats(original)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bcc9292b", + "metadata": { + "papermill": { + "duration": 0.194384, + "end_time": "2023-06-13T09:20:22.159895", + "exception": false, + "start_time": "2023-06-13T09:20:21.965511", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Call the function to check univariate stats on test dataset\n", + "univariate_stats(test)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "695693f3", + "metadata": { + "papermill": { + "duration": 0.313853, + "end_time": "2023-06-13T09:20:22.499909", + "exception": false, + "start_time": "2023-06-13T09:20:22.186056", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Call the function to check univariate stats on synthetic dataset\n", + "univariate_stats(synthetic)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e17abf6d", + "metadata": { + "papermill": { + "duration": 0.041634, + "end_time": "2023-06-13T09:20:22.568187", + "exception": false, + "start_time": "2023-06-13T09:20:22.526553", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# List of numerical columns and categorical columns\n", + "\n", + "numeric_cols = train.select_dtypes(include=['float64']).columns.tolist()\n", + "categ_cols = train.select_dtypes(include=['object']).columns.tolist()\n", + "target = 'Age'\n", + "numeric_cols\n", + "categ_cols" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf027b93", + "metadata": { + "papermill": { + "duration": 16.48609, + "end_time": "2023-06-13T09:20:39.132428", + "exception": false, + "start_time": "2023-06-13T09:20:22.646338", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def plot_histograms(df_train, df_test, original, synthetic,target_col, n_cols=3):\n", + " n_rows = (len(df_train.columns) - 1) // n_cols + 1\n", + "\n", + " fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(18, 4*n_rows))\n", + " axes = axes.flatten()\n", + "\n", + " for i, var_name in enumerate(df_train.columns.tolist()):\n", + " ax = axes[i]\n", + " sns.distplot(df_train[var_name], kde=True, ax=ax, label='Train') # plot train data\n", + " sns.distplot(original[var_name], kde=True, ax=ax, label='Original') # plot original data\n", + " sns.distplot(synthetic[var_name], kde=True, ax=ax, label='Synthetic') # plot original data\n", + " if var_name != target_col:\n", + " sns.distplot(df_test[var_name], kde=True, ax=ax, label='Test') # plot test data\n", + " \n", + " ax.set_title(f'{var_name} Distribution (Train vs Test)')\n", + " ax.legend()\n", + "\n", + " plt.tight_layout()\n", + " plt.show()\n", + " \n", + "plot_histograms(train[numeric_cols], test[numeric_cols], synthetic, original[numeric_cols], target, n_cols=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6021903a", + "metadata": { + "papermill": { + "duration": 78.610486, + "end_time": "2023-06-13T09:21:57.773692", + "exception": false, + "start_time": "2023-06-13T09:20:39.163206", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def plot_distribution(df, hue, title='', drop_cols=[]):\n", + " sns.set_style('whitegrid')\n", + "\n", + " cols = df.columns.drop([hue] + drop_cols)\n", + " n_cols = 2\n", + " n_rows = (len(cols) - 1) // n_cols + 1\n", + "\n", + " fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(14, 4*n_rows))\n", + "\n", + " for i, var_name in enumerate(cols):\n", + " row = i // n_cols\n", + " col = i % n_cols\n", + "\n", + " ax = axes[row, col]\n", + " sns.histplot(data=df, x=var_name, kde=True, ax=ax, hue=hue) # sns.distplot(df_train[var_name], kde=True, ax=ax, label='Train')\n", + " ax.set_title(f'{var_name} Distribution')\n", + "\n", + " fig.suptitle(f'{title} Distribution Plot by {hue}', fontweight='bold', fontsize=16)\n", + " plt.tight_layout()\n", + " plt.show()\n", + " \n", + "plot_distribution(train, hue='Sex', 
title='Train data')\n", + "plot_distribution(test, hue='Sex', title='Test data')\n", + "plot_distribution(original, hue='Sex', title='Original data')\n", + "plot_distribution(synthetic, hue='Sex', title= 'Synthetic data')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10726c06", + "metadata": { + "papermill": { + "duration": 8.927933, + "end_time": "2023-06-13T09:22:06.762168", + "exception": false, + "start_time": "2023-06-13T09:21:57.834235", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def plot_boxplot(df, hue, title='', drop_cols=[], n_cols=3):\n", + " sns.set_style('whitegrid')\n", + "\n", + " cols = df.columns.drop([hue] + drop_cols)\n", + " n_rows = (len(cols) - 1) // n_cols + 1\n", + "\n", + " fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(14, 4*n_rows))\n", + "\n", + " for i, var_name in enumerate(cols):\n", + " row = i // n_cols\n", + " col = i % n_cols\n", + "\n", + " ax = axes[row, col]\n", + " sns.boxplot(data=df, x=hue, y=var_name, ax=ax, showmeans=True, \n", + " meanprops={\"marker\":\"s\",\"markerfacecolor\":\"white\", \"markeredgecolor\":\"blue\", \"markersize\":\"5\"})\n", + " ax.set_title(f'{var_name} by {hue}')\n", + " ax.set_xlabel('')\n", + "\n", + " fig.suptitle(f'{title} Boxplot by {hue}', fontweight='bold', fontsize=16)\n", + " plt.tight_layout()\n", + " plt.show()\n", + " \n", + "plot_boxplot(train, hue='Sex', title='Train data', n_cols=2)\n", + "plot_boxplot(original, hue='Sex', title='Original data', n_cols=2)\n", + "plot_boxplot(test, hue='Sex', title='Test data', n_cols=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01100a7c", + "metadata": { + "papermill": { + "duration": 7.745215, + "end_time": "2023-06-13T09:22:14.579464", + "exception": false, + "start_time": "2023-06-13T09:22:06.834249", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def plot_violinplot(df, hue, title='', drop_cols=[], n_cols=2):\n", + " sns.set_style('whitegrid')\n", + "\n", + " cols = df.columns.drop([hue] + drop_cols)\n", + " n_rows = (len(cols) - 1) // n_cols + 1\n", + "\n", + " fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(18, 4*n_rows))\n", + "\n", + " for i, var_name in enumerate(cols):\n", + " row = i // n_cols\n", + " col = i % n_cols\n", + "\n", + " ax = axes[row, col]\n", + " sns.violinplot(data=df, x=hue, y=var_name, ax=ax, inner='quartile')\n", + " ax.set_title(f'{var_name} Distribution')\n", + "\n", + " fig.suptitle(f'{title} Violin Plot by {hue}', fontweight='bold', fontsize=16)\n", + " plt.tight_layout()\n", + " plt.show()\n", + " \n", + "plot_violinplot(train, hue='Sex', title='Train data', n_cols=2)" + ] + }, + { + "cell_type": "markdown", + "id": "3460ae32", + "metadata": { + "papermill": { + "duration": 0.074544, + "end_time": "2023-06-13T09:22:14.732933", + "exception": false, + "start_time": "2023-06-13T09:22:14.658389", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "#2. Bivariate Statistics
" + ] + }, + { + "cell_type": "markdown", + "id": "131690c8", + "metadata": { + "papermill": { + "duration": 0.076052, + "end_time": "2023-06-13T09:22:14.884777", + "exception": false, + "start_time": "2023-06-13T09:22:14.808725", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "\n",
+ " 💡 Recall: There are three types of bivariate analysis.
\n",
+ "
\n",
+ " 💡 \n",
+ "Since the features in our datasets are strongly correlated to each others, applying PCA could be a good idea.
\n",
+ " Let's visualize some graphics to gain more insights.\n",
+ "\n",
+ "
Feature Engineering
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "419bd785", + "metadata": { + "id": "aerc4iNU3uNP", + "outputId": "f87502be-953c-449d-ff85-c3e2177f82b6", + "papermill": { + "duration": 0.306708, + "end_time": "2023-06-13T09:23:53.252712", + "exception": false, + "start_time": "2023-06-13T09:23:52.946004", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "train[\"Data Type\"] = 0\n", + "test[\"Data Type\"] = 1\n", + "original[\"Data Type\"] = 2\n", + "synthetic[\"Data Type\"] = 3\n", + "\n", + "ids = []\n", + "for i in range(len(original)):\n", + " ids.append(i + 123419)\n", + "\n", + "original[\"id\"] = ids\n", + "synthetic[\"id\"] += 127312\n", + "\n", + "# concatenate datasets\n", + "df_concat = pd.concat([train, original, synthetic], ignore_index=True)\n", + "df_concat = df_concat.drop_duplicates()\n", + "df_all = pd.concat([df_concat, test], ignore_index=True)\n", + "df_all" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a67cf11", + "metadata": { + "papermill": { + "duration": 0.247528, + "end_time": "2023-06-13T09:23:53.953216", + "exception": false, + "start_time": "2023-06-13T09:23:53.705688", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "df_all = pd.get_dummies(df_all)\n", + "df_all" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f6cf7ec", + "metadata": { + "id": "Vx8-kBZS3uNP", + "outputId": "1089b557-3522-4fb7-cacb-277a62f6a359", + "papermill": { + "duration": 0.167946, + "end_time": "2023-06-13T09:23:54.573599", + "exception": false, + "start_time": "2023-06-13T09:23:54.405653", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "df_all[df_all['Height'] == 0]['Height']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71cf996e", + "metadata": { + "id": "kSbwgc8D3uNP", + "papermill": { + "duration": 70.98887, + "end_time": "2023-06-13T09:25:05.711089", + "exception": false, + "start_time": "2023-06-13T09:23:54.722219", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "h1 = df_all[df_all[\"Height\"] != 0]\n", + "h0 = df_all[df_all[\"Height\"] == 0]\n", + "print(h1.shape, h0.shape)\n", + "\n", + "# prediction of Height by Random Forest Regressor\n", + "\n", + "x_h1 = h1.drop(columns=[\"Height\", \"Age\", \"Data Type\"], axis=1)\n", + "y_h1 = h1[\"Height\"]\n", + "x_h0 = h0.drop(columns=[\"Height\", \"Age\", \"Data Type\"], axis=1)\n", + "\n", + "rfr = RandomForestRegressor(n_jobs=-1, random_state=42)\n", + "rfr.fit(x_h1, y_h1)\n", + "preds_height = rfr.predict(x_h0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79b22f98", + "metadata": { + "id": "9wixBHMD3uNR", + "outputId": "01dd0f7b-325b-4a31-df00-0368432e7668", + "papermill": { + "duration": 0.1627, + "end_time": "2023-06-13T09:25:06.028867", + "exception": false, + "start_time": "2023-06-13T09:25:05.866167", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "len(preds_height)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "455dd321", + "metadata": { + "id": "as5IcoJ_3uNR", + "outputId": "3aa30034-814c-429b-a86e-9178468fe428", + "papermill": { + "duration": 4.516241, + "end_time": "2023-06-13T09:25:10.696350", + "exception": false, + "start_time": "2023-06-13T09:25:06.180109", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "cnt = 0\n", + "for i in range(len(df_all)):\n", + " if df_all.loc[i, 
\"Height\"] == 0:\n", + " df_all.loc[i, \"Height\"] = preds_height[cnt]\n", + " cnt += 1\n", + "\n", + "df_all[\"Height\"].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0979a994", + "metadata": { + "id": "sj8GP2ij3uNS", + "outputId": "69b570f6-837b-4807-9c8d-44fc4b0a57b1", + "papermill": { + "duration": 0.173823, + "end_time": "2023-06-13T09:25:11.023660", + "exception": false, + "start_time": "2023-06-13T09:25:10.849837", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "df_all[df_all['Height'] == 0]['Height']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7081dc91", + "metadata": { + "id": "U0xgnS_C3uNS", + "outputId": "4e09ab9c-a465-4383-8230-c36300b688c4", + "papermill": { + "duration": 0.216196, + "end_time": "2023-06-13T09:25:11.392374", + "exception": false, + "start_time": "2023-06-13T09:25:11.176178", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Prepare our final dataset for train\n", + "\n", + "train = df_all[df_all[\"Data Type\"] != 1]\n", + "train.sort_values(\"id\", inplace=True)\n", + "train.reset_index(drop=True, inplace=True)\n", + "\n", + "train = train.drop(columns=[\"id\", \"Data Type\"], axis=1)\n", + "train" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aced0b22", + "metadata": { + "id": "Sd6DE_Ik3uNT", + "outputId": "5fdc0c86-7828-41de-d6f2-8ebf4dd8a426", + "papermill": { + "duration": 0.186995, + "end_time": "2023-06-13T09:25:11.731465", + "exception": false, + "start_time": "2023-06-13T09:25:11.544470", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# dataset for test\n", + "test = df_all[df_all[\"Data Type\"] == 1]\n", + "test.sort_values(\"id\", inplace=True)\n", + "test.reset_index(drop=True, inplace=True)\n", + "test.drop(columns=[\"id\", \"Age\", \"Data Type\"], inplace=True)\n", + "test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37ed6563", + "metadata": { + "papermill": { + "duration": 0.167326, + "end_time": "2023-06-13T09:25:12.353391", + "exception": false, + "start_time": "2023-06-13T09:25:12.186065", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# function for PCA features but I'll keep this as a last resort\n", + "def add_pca_features(X_train, X_test): \n", + " \n", + " # Select the columns for PCA\n", + " pca_features = X_train.select_dtypes(include=['float64']).columns.tolist()\n", + " n_components = 4 # len(pca_features)\n", + "\n", + " # Create the pipeline\n", + " pipeline = make_pipeline(StandardScaler(), PCA(n_components=n_components))\n", + " \n", + " # Perform PCA\n", + " pipeline.fit(X_train[pca_features])\n", + "\n", + " # Create column names for PCA features\n", + " pca_columns = [f'PCA_{i}' for i in range(n_components)]\n", + "\n", + " # Add PCA features to the dataframe\n", + " X_train[pca_columns] = pipeline.transform(X_train[pca_features])\n", + " X_test[pca_columns] = pipeline.transform(X_test[pca_features])\n", + "\n", + " return X_train, X_test" + ] + }, + { + "cell_type": "markdown", + "id": "676c8e97", + "metadata": { + "papermill": { + "duration": 0.150509, + "end_time": "2023-06-13T09:25:12.655229", + "exception": false, + "start_time": "2023-06-13T09:25:12.504720", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "\n", + " 💡 We expose here many techniques and features that we can use in our dataset but we'll use only few of them.\n", + " \n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "885c3075", + "metadata": { + "id": "iKC0agEF3uNT", + "papermill": { + "duration": 0.167826, + "end_time": "2023-06-13T09:25:12.973874", + "exception": false, + "start_time": "2023-06-13T09:25:12.806048", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# function to add more features to the dataset\n", + "def feature_engineering(df): \n", + " \n", + " # Clean the weights by capping the over weights with total body weights\n", + " \"\"\"df['Shell Weight']=np.where(df['Shell Weight']>df['Weight'],df['Weight'],df['Shell Weight'])\n", + " df['Viscera Weight']=np.where(df['Viscera Weight']>df['Weight'],df['Weight'],df['Viscera Weight'])\n", + " df['Shucked Weight']=np.where(df['Shucked Weight']>df['Weight'],df['Weight'],df['Shucked Weight'])\"\"\"\n", + " \n", + " # Adding brand news features\n", + " df['Shucked Weight ratio'] = df['Shucked Weight'] / df['Weight']\n", + " #df['Viscera Weight ratio'] = df['Viscera Weight'] / df['Weight'] # dropped due to low correlation with our target\n", + " df['Shell Weight ratio'] = df['Shell Weight'] / df['Weight']\n", + " \n", + " df['Volume'] = df['Length'] * df['Diameter'] * df['Height']\n", + " \n", + " #df['Meat Yield'] = df['Shucked Weight'] / (df['Weight'] + df['Shell Weight'])\n", + " #df['Weight_to_Shucked_Weight'] = df['Weight'] / df['Shucked Weight']\n", + " \n", + " \"\"\"df['dim1'] = df['Length'] * df['Diameter']\n", + " df['dim2'] = df['Length'] * df['Height']\n", + " df['dim3'] = df['Height'] * df['Diameter']\"\"\"\n", + " \n", + " # Crab BMI\n", + " df['bmi']=df['Weight']/(df['Height']**2)\n", + " \n", + " # Water Loss during experiment\n", + " df[\"water_loss\"]=df[\"Weight\"]-df[\"Shucked Weight\"]-df['Viscera Weight']-df['Shell Weight']\n", + " df[\"water_loss\"]=np.where( df[\"water_loss\"]<0,\n", + " min(df[\"Shucked Weight\"].min(), df[\"Viscera Weight\"].min(), df[\"Shell Weight\"].min()),\n", + " df[\"water_loss\"]\n", + " )\n", + " \n", + " # Crab density approx\n", + " df['density'] = df['Weight']/(df['Volume'])\n", + " df['BSA'] = np.sqrt( (df['Weight']* 0.0283) * (df['Height']*30.48) / 3600 )\n", + " \n", + " news_cols = ['Shucked Weight ratio', 'Shell Weight ratio', 'Volume', 'bmi', \"water_loss\", 'density', 'BSA']\n", + " \n", + " return df, news_cols" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53f5a626", + "metadata": { + "id": "bSFlzo4d3uNT", + "outputId": "13e22886-8a14-49fc-ff0d-54fa69bc42f6", + "papermill": { + "duration": 0.212599, + "end_time": "2023-06-13T09:25:13.338714", + "exception": false, + "start_time": "2023-06-13T09:25:13.126115", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "train_eng, news_cols = feature_engineering(train)\n", + "test_eng, news_cols = feature_engineering(test)\n", + "train_eng" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97add2e6", + "metadata": { + "id": "qTAmXVm-3uNU", + "outputId": "ebf0cc6e-3948-48fd-b204-a91998586bc1", + "papermill": { + "duration": 1.850481, + "end_time": "2023-06-13T09:25:15.341659", + "exception": false, + "start_time": "2023-06-13T09:25:13.491178", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "#plot_heatmap(train_eng[numeric_cols + news_cols + [target]], title='Train_eng data')\n", + "\n", + "\n", + "corr_mat_data = train_eng.corr()\n", + "data_mask = np.triu(np.ones_like(corr_mat_data, dtype = bool))\n", + "cmap = 
sns.diverging_palette(100, 7, s = 75, l = 40, n = 20, center = 'light', as_cmap = True)\n", + "\n", + "#fig, axes = plt.subplots(1, 1, figsize = (25, 10))\n", + "plt.figure(figsize=(12, 12))\n", + "sns.heatmap(corr_mat_data, annot = True, cmap = cmap, fmt = '.2f', center = 0,\n", + " annot_kws = {'size': 12}, mask = data_mask).set_title('Correlations train features');" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09561f35", + "metadata": { + "id": "rOwaIXf-3uNU", + "outputId": "6f18fb67-6897-4eb6-b6a3-d9f169923688", + "papermill": { + "duration": 0.193425, + "end_time": "2023-06-13T09:25:15.690191", + "exception": false, + "start_time": "2023-06-13T09:25:15.496766", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "columns_to_drop = ['Sex_I', 'Age']\n", + "X = train_eng.drop(columns=columns_to_drop, axis=1)\n", + "test_eng = test_eng.drop('Sex_I', axis=1)\n", + "Y = train_eng['Age']\n", + "Y" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b348132", + "metadata": { + "id": "cAkXmNP73uNU", + "outputId": "e39dbc99-f845-4a86-eef8-c7c53aa0348e", + "papermill": { + "duration": 0.186123, + "end_time": "2023-06-13T09:25:16.030862", + "exception": false, + "start_time": "2023-06-13T09:25:15.844739", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "X" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1fa9d09", + "metadata": { + "papermill": { + "duration": 0.175092, + "end_time": "2023-06-13T09:25:16.373122", + "exception": false, + "start_time": "2023-06-13T09:25:16.198030", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "\"\"\"#scaling the data\n", + "scaler = StandardScaler()\n", + "\n", + "X = pd.DataFrame(scaler.fit_transform(X))\n", + "test_eng = pd.DataFrame(scaler.transform(test_eng))\n", + "test_eng\"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "18ff45ff", + "metadata": { + "papermill": { + "duration": 0.180147, + "end_time": "2023-06-13T09:25:16.710939", + "exception": false, + "start_time": "2023-06-13T09:25:16.530792", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "\n",
+ " 💡 Scaling :
\n",
+ " As we'll use Gradient Boosting Decision Trees(GBDT) models, so scaling the data might not be necessary.\n",
+ "\n",
+ "\n",
+ "
Models building
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42296034", + "metadata": { + "id": "tph2yAK3R6vy", + "outputId": "aa9f08b2-c18b-4fdc-da40-e5989cb7fa36", + "papermill": { + "duration": 5323.666964, + "end_time": "2023-06-13T10:54:01.263305", + "exception": false, + "start_time": "2023-06-13T09:25:17.596341", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "hist_cv_scores, hist_preds = list(), list()\n", + "lgb_cv_scores, lgb_preds = list(), list()\n", + "xgb_cv_scores, xgb_preds = list(), list()\n", + "cat_cv_scores, cat_preds = list(), list()\n", + "\n", + "ens_cv_scores, ens_preds = list(), list()\n", + "\n", + "\n", + "\n", + "imp_hist = pd.DataFrame()\n", + "#imp_cat = pd.DataFrame()\n", + "imp_xgb = pd.DataFrame()\n", + "imp_ens = pd.DataFrame()\n", + "imp = pd.DataFrame()\n", + "\n", + "skf = KFold(n_splits = 10, random_state = 42, shuffle = True)\n", + " \n", + "for i, (train_ix, test_ix) in enumerate(skf.split(X, Y)):\n", + " \n", + " X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]\n", + " Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]\n", + " \n", + " print('=====================================================================')\n", + " \n", + "\n", + " #==================================================== LightGBM ==========================================================#\n", + " \n", + " lgb_params = {\n", + " \"objective\": \"regression_l1\", # =\"mae\"\n", + " \"metric\": \"mae\",\n", + " \"learning_rate\": 0.03, # 0.01\n", + " \"n_estimators\": 5000,\n", + " \"max_depth\": 10,\n", + " \"num_leaves\": 255,\n", + " \"reg_alpha\": 0.1, \n", + " \"reg_lambda\": 0.1, \n", + " \"subsample\": 0.4 \n", + " }\n", + " \n", + " lgb_md = LGBMRegressor(**lgb_params).fit(X_train, Y_train)\n", + "\n", + " # Validation\n", + " lgb_pred_1 = lgb_md.predict(X_test)\n", + " lgb_score_fold = mean_absolute_error(Y_test, lgb_pred_1) \n", + " lgb_cv_scores.append(lgb_score_fold)\n", + " \n", + " # Prediction\n", + " lgb_pred_2 = lgb_md.predict(test_eng)\n", + " lgb_preds.append(lgb_pred_2)\n", + " \n", + " # Importance\n", + " _imp = pd.DataFrame({\"features\": X.columns, \"importance\": lgb_md.feature_importances_})\n", + " imp = pd.concat([imp, _imp], axis=0, ignore_index=True)\n", + "\n", + " print('Fold N°', i, '==> LightGBM - MAE: ====>', lgb_score_fold)\n", + "\n", + " \n", + " #==================================================== HistGradientBoosting ====================================================#\n", + " \n", + " \n", + " hist_md = HistGradientBoostingRegressor(loss = 'absolute_error',\n", + " l2_regularization = 0.01,\n", + " early_stopping = False,\n", + " learning_rate = 0.01,\n", + " max_iter = 1000,\n", + " max_depth = 15,\n", + " max_bins = 255,\n", + " min_samples_leaf = 30,\n", + " max_leaf_nodes = 30).fit(X_train, Y_train)\n", + " \n", + " # Validation\n", + " hist_pred_1 = hist_md.predict(X_test)\n", + " hist_score_fold = mean_absolute_error(Y_test, hist_pred_1)\n", + " hist_cv_scores.append(hist_score_fold)\n", + "\n", + " # Prediction\n", + " hist_pred_2 = hist_md.predict(test_eng)\n", + " hist_preds.append(hist_pred_2)\n", + " \n", + " print('Fold N°', i, '==> HistGradient - MAE: ====>', hist_score_fold)\n", + "\n", + " \n", + " #======================================================== XGBoost ============================================================#\n", + " \n", + "\n", + " xgb_md = XGBRegressor(objective = 'reg:pseudohubererror',\n", + " tree_method = 'hist',\n", + " colsample_bytree = 0.9, \n", + " 
gamma = 0.65, \n", + " learning_rate = 0.01, \n", + " max_depth = 7, \n", + " min_child_weight = 20, \n", + " n_estimators = 5000,\n", + " subsample = 0.7,\n", + " random_state = 42).fit(X_train, Y_train,\n", + " #eval_set = [(X_train, Y_train), (X_test, Y_test)],\n", + " verbose=0\n", + " )\n", + " # Validation\n", + " xgb_pred_1 = xgb_md.predict(X_test)\n", + " xgb_score_fold = mean_absolute_error(Y_test, xgb_pred_1) \n", + " xgb_cv_scores.append(xgb_score_fold)\n", + "\n", + " # Prediction\n", + " xgb_pred_2 = xgb_md.predict(test_eng)\n", + " xgb_preds.append(xgb_pred_2)\n", + " \n", + " print('Fold N°', i, '==> XGBoost - MAE: ====>', xgb_score_fold)\n", + "\n", + " \n", + " #========================================================= CatBoost ========================================================#\n", + " \n", + " \n", + " cat_md = CatBoostRegressor(loss_function = 'MAE',\n", + " iterations = 1000,\n", + " learning_rate = 0.03,\n", + " depth = 10, \n", + " random_strength = 0.2,\n", + " bagging_temperature = 0.7,\n", + " border_count = 254,\n", + " l2_leaf_reg = 0.001,\n", + " verbose = False,\n", + " grow_policy = 'Lossguide',\n", + " task_type = 'CPU',\n", + " random_state = 42).fit(X_train, Y_train)\n", + " \n", + " # Validation\n", + " cat_pred_1 = cat_md.predict(X_test)\n", + " cat_score_fold = mean_absolute_error(Y_test, cat_pred_1) \n", + " cat_cv_scores.append(cat_score_fold)\n", + " \n", + " # Prediction\n", + " cat_pred_2 = cat_md.predict(test_eng)\n", + " cat_preds.append(cat_pred_2)\n", + " \n", + " print('Fold N°', i, '==> CatBoost - MAE: ====>', cat_score_fold)\n", + "\n", + " \n", + " #========================================================= LAD Ensemble =========================================================#\n", + " \n", + " x = pd.DataFrame({'hist': np.round(hist_pred_1.tolist()), \n", + " 'lgb': np.round(lgb_pred_1.tolist()),\n", + " 'xgb': np.round(xgb_pred_1.tolist()), \n", + " 'cat': np.round(cat_pred_1.tolist())}\n", + " )\n", + " y = Y_test\n", + " \n", + " x_test = pd.DataFrame({'hist': np.round(hist_pred_2.tolist()), \n", + " 'lgb': np.round(lgb_pred_2.tolist()),\n", + " 'xgb': np.round(xgb_pred_2.tolist()), \n", + " 'cat': np.round(cat_pred_2.tolist())}\n", + " )\n", + " \n", + " lad_md = LADRegression().fit(x, y)\n", + " \n", + " # Validation\n", + " lad_pred = lad_md.predict(x) \n", + " ens_score = mean_absolute_error(y, lad_pred)\n", + " ens_cv_scores.append(ens_score)\n", + " \n", + " #Predictions\n", + " lad_pred_test = lad_md.predict(x_test)\n", + " ens_preds.append(lad_pred_test)\n", + " \n", + " print('Fold N°', i, '==> LAD Model 1 ensemble - MAE: ====>', ens_score)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "466ab7f8", + "metadata": { + "id": "4gb-E3Wbmzcy", + "outputId": "9dbc7f8e-3477-437a-bc1b-3abe72649255", + "papermill": { + "duration": 0.198329, + "end_time": "2023-06-13T10:54:01.629060", + "exception": false, + "start_time": "2023-06-13T10:54:01.430731", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Display important features for LGBMRegressor\n", + "imp = imp.groupby(\"features\")[\"importance\"].agg([\"mean\", \"std\"])\n", + "imp.columns = [\"importance\", \"importance_std\"]\n", + "imp[\"importance_cov\"] = imp[\"importance_std\"] / imp[\"importance\"]\n", + "imp = imp.reset_index(drop=False)\n", + "display(imp.sort_values(\"importance\", ascending=False, ignore_index=True))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72c264f3", + "metadata": 
{ + "id": "6kzgu7PBmcEr", + "outputId": "9624c4d3-3ea9-42e7-bd04-d294a81ed0e0", + "papermill": { + "duration": 17.121349, + "end_time": "2023-06-13T10:54:18.918304", + "exception": false, + "start_time": "2023-06-13T10:54:01.796955", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "lgb_cv_score = np.mean(lgb_cv_scores)\n", + "print(f\"Score on CV test data ======> {lgb_cv_score}\")\n", + "preds_val = lgb_md.predict(X_val)\n", + "preds_val_score = mean_absolute_error(Y_val, preds_val)\n", + "print(f\"Score on Valid data (unseen data) ======> {preds_val_score}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2568184", + "metadata": { + "id": "ynkBFBVwR6zn", + "outputId": "11853749-7499-4a95-bd19-50abc4c0abd7", + "papermill": { + "duration": 2.178501, + "end_time": "2023-06-13T10:54:21.261710", + "exception": false, + "start_time": "2023-06-13T10:54:19.083209", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# HistGB scores\n", + "\n", + "hist_cv_score = np.mean(hist_cv_scores)\n", + "print(f\"Score on CV test data ======> {hist_cv_score}\")\n", + "\n", + "hist_preds_val = hist_md.predict(X_val)\n", + "hist_preds_val_score = mean_absolute_error(Y_val, hist_preds_val)\n", + "print(f\"Score on Valid data (unseen data) ======> {hist_preds_val_score}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "600a6742", + "metadata": { + "id": "dsQ1mx-59bSx", + "outputId": "bd0e9dd0-15b7-416a-ebf2-8fcf19cf4894", + "papermill": { + "duration": 4.355499, + "end_time": "2023-06-13T10:54:25.783086", + "exception": false, + "start_time": "2023-06-13T10:54:21.427587", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# XGBoost scores\n", + "\n", + "xgb_cv_score = np.mean(xgb_cv_scores)\n", + "print(f\"Score on CV test data ======> {xgb_cv_score}\")\n", + "\n", + "xgb_preds_val = xgb_md.predict(X_val)\n", + "xgb_preds_val_score = mean_absolute_error(Y_val, xgb_preds_val)\n", + "print(f\"Score on Valid data (unseen data) ======> {xgb_preds_val_score}\")" + ] + }, + { + "cell_type": "markdown", + "id": "58eaf3d2", + "metadata": { + "papermill": { + "duration": 0.165481, + "end_time": "2023-06-13T10:54:26.116452", + "exception": false, + "start_time": "2023-06-13T10:54:25.950971", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "\n",
+ " 💡 Saving the models :
\n",
+ " We can use joblib to save the models we trained so we can use them later if needed.\n",
+ "\n",
+ "\n",
+ "
Submissions
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e262a2a1", + "metadata": { + "id": "4rLMnbSr3uNX", + "papermill": { + "duration": 0.213049, + "end_time": "2023-06-13T10:54:27.171465", + "exception": false, + "start_time": "2023-06-13T10:54:26.958416", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "submission = pd.read_csv(\"/kaggle/input/playground-series-s3e16/sample_submission.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f8bb52e", + "metadata": { + "papermill": { + "duration": 0.622624, + "end_time": "2023-06-13T10:54:27.959578", + "exception": false, + "start_time": "2023-06-13T10:54:27.336954", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Visualize models performances\n", + "hist_cv_score = np.mean(hist_cv_scores)\n", + "lgb_cv_score = np.mean(lgb_cv_scores)\n", + "xgb_cv_score = np.mean(xgb_cv_scores)\n", + "cat_cv_score = np.mean(cat_cv_scores)\n", + "ens_cv_score = np.mean(ens_cv_scores)\n", + "\n", + "\n", + "model_perf = pd.DataFrame({'Models': [ 'HistGradient' ,'LightGBM', 'XGBoost', 'CatBoost', 'LAD Model'],\n", + " 'CV-scores': [ hist_cv_score, lgb_cv_score, xgb_cv_score, cat_cv_score, ens_cv_score]\n", + " })\n", + "\n", + "plt.figure(figsize = (8, 8))\n", + "ax = sns.barplot(y = 'Models', x = 'CV-scores', data = model_perf)\n", + "ax.bar_label(ax.containers[0]);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "effa2d26", + "metadata": { + "id": "Y6Cts_Tp3uNd", + "papermill": { + "duration": 0.185627, + "end_time": "2023-06-13T10:54:28.640810", + "exception": false, + "start_time": "2023-06-13T10:54:28.455183", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "unique_targets = np.unique(train['Age'])\n", + "def mattop_post_process(preds):\n", + " return np.array([min(unique_targets, key = lambda x: abs(x - pred)) for pred in preds])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91142f31", + "metadata": { + "papermill": { + "duration": 8.398007, + "end_time": "2023-06-13T10:54:37.205473", + "exception": false, + "start_time": "2023-06-13T10:54:28.807466", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "ens_preds_test = mattop_post_process(pd.DataFrame(ens_preds).apply(np.mean, axis = 0))\n", + "\n", + "submission['Age'] = ens_preds_test.astype(int)\n", + "submission.to_csv('LAD_model.csv', index = False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + }, + "papermill": { + "default_parameters": {}, + "duration": 5695.682242, + "end_time": "2023-06-13T10:54:41.703074", + "environment_variables": {}, + "exception": null, + "input_path": "__notebook__.ipynb", + "output_path": "__notebook__.ipynb", + "parameters": {}, + "start_time": "2023-06-13T09:19:46.020832", + "version": "2.4.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}